diff --git a/BUILDING.md b/BUILDING.md index 64e693ac9..7be4232ca 100644 --- a/BUILDING.md +++ b/BUILDING.md @@ -19,7 +19,7 @@ Follow these steps: 1. Fork and clone the GitHub [coremltools repository](https://github.com/apple/coremltools). 2. Run the [build.sh](scripts/build.sh) script to build `coremltools`. - * By default this script uses Python 3.7, but you can include `--python=3.8` (or `3.9`, `3.10`) as a argument to change the Python version. + * By default this script uses Python 3.7, but you can include `--python=3.8` (or `3.9`, `3.10`, `3.11`) as an argument to change the Python version. * The script creates a new `build` folder with the coremltools distribution, and a `dist` folder with Python wheel files. 3. Run the [test.sh](scripts/test.sh) script to test the build. @@ -45,7 +45,7 @@ The following build targets help you configure the development environment. If y * `test_slow` | Run all non-fast tests. * `wheel` | Build wheels in release mode. -The script uses Python 3.7, but you can include `--python=3.8` (or `3.9`, `3.10`) as a argument to change the Python version. +The script uses Python 3.7, but you can include `--python=3.8` (or `3.9`, `3.10`, `3.11`) as an argument to change the Python version. ## Resources diff --git a/CMakeLists.txt b/CMakeLists.txt index b65719798..e64d03104 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -191,6 +191,19 @@ else() message(STATUS "CoreML.framework and dependent frameworks not found. Skipping libcoremlpython build.") endif() +# Build kmeans-1d +set(KMEANS_DIR "${PROJECT_SOURCE_DIR}/deps/kmeans1d") +execute_process( + COMMAND python3 setup.py build_ext --inplace + WORKING_DIRECTORY ${KMEANS_DIR} +) + +# Copy kmeans-1d to Python deps folder +execute_process( + COMMAND cp -r kmeans1d ../../coremltools/_deps + WORKING_DIRECTORY ${KMEANS_DIR} +) + set(PYTHON_TAG "cp${PYTHON_VERSION_MAJOR}${PYTHON_VERSION_MINOR}") if(APPLE) execute_process(COMMAND uname -m OUTPUT_VARIABLE HARDWARE_NAME OUTPUT_STRIP_TRAILING_WHITESPACE) diff --git a/LICENSE.txt b/LICENSE.txt index 78a5fe85d..1bcd2655e 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,4 +1,4 @@ -Copyright (c) 2020, Apple Inc. All rights reserved. +Copyright © 2020-2023, Apple Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/NOTICE.txt b/NOTICE.txt new file mode 100644 index 000000000..ad9356512 --- /dev/null +++ b/NOTICE.txt @@ -0,0 +1,25 @@ +Copyright © 2020-2023, Apple Inc. All rights reserved. + +This project contains content adapted from kmeans1d (https://github.com/dstein64/kmeans1d), the license for which follows: + +MIT License + +Copyright (c) 2019 Daniel Steinberg + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/coremltools/__init__.py b/coremltools/__init__.py index d741975e4..84821cea1 100644 --- a/coremltools/__init__.py +++ b/coremltools/__init__.py @@ -60,6 +60,10 @@ # New versions for iOS 16.0 _SPECIFICATION_VERSION_IOS_16 = 7 +# New versions for iOS 17.0 +_SPECIFICATION_VERSION_IOS_17 = 8 + + class ComputeUnit(_Enum): ''' The set of processing-unit configurations the model can use to make predictions. @@ -76,6 +80,7 @@ class ComputeUnit(_Enum): _SPECIFICATION_VERSION_IOS_14: "CoreML4", _SPECIFICATION_VERSION_IOS_15: "CoreML5", _SPECIFICATION_VERSION_IOS_16: "CoreML6", + _SPECIFICATION_VERSION_IOS_17: "CoreML7", } # Default specification version for each backend @@ -84,7 +89,7 @@ class ComputeUnit(_Enum): # expose sub packages as directories -from . import converters, models, proto +from . import converters, models, optimize, proto # expose unified converter in coremltools package level from .converters import ClassifierConfig diff --git a/coremltools/_deps/.gitignore b/coremltools/_deps/.gitignore new file mode 100644 index 000000000..12796295b --- /dev/null +++ b/coremltools/_deps/.gitignore @@ -0,0 +1 @@ +kmeans1d/ diff --git a/coremltools/_deps/__init__.py b/coremltools/_deps/__init__.py index 9d59acbe5..01e1f3eb4 100644 --- a/coremltools/_deps/__init__.py +++ b/coremltools/_deps/__init__.py @@ -16,6 +16,13 @@ from coremltools import _logger as logger +_HAS_KMEANS1D = True +try: + from . import kmeans1d as _kmeans1d +except: + _kmeans1d = None + _HAS_KMEANS1D = False + def _get_version(version): # matching 1.6.1, and 1.6.1rc, 1.6.1.dev @@ -156,6 +163,14 @@ def __get_sklearn_version(version): MSG_TORCH_NOT_FOUND = "PyTorch not found." +_HAS_TORCH_VISION = True +try: + import torchvision +except: + _HAS_TORCH_VISION = False +MSG_TORCH_VISION_NOT_FOUND = "TorchVision not found." 
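# A minimal sketch (editor's illustration, not part of this patch): the optional-dependency
# flags and messages defined above are typically consumed as pytest skip guards, mirroring
# the existing `@pytest.mark.skipif(not _HAS_TORCH, ...)` usage elsewhere in this diff.
# The test name below is hypothetical.
import pytest

from coremltools._deps import _HAS_TORCH_VISION, MSG_TORCH_VISION_NOT_FOUND

@pytest.mark.skipif(not _HAS_TORCH_VISION, reason=MSG_TORCH_VISION_NOT_FOUND)
def test_feature_requiring_torchvision():
    import torchvision  # safe to import here; the skipif guard above ensures availability

    assert torchvision.__version__ is not None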
+ + # --------------------------------------------------------------------------------------- try: import scipy diff --git a/coremltools/converters/_converters_entry.py b/coremltools/converters/_converters_entry.py index bc588de1b..d0d914d3f 100644 --- a/coremltools/converters/_converters_entry.py +++ b/coremltools/converters/_converters_entry.py @@ -6,7 +6,7 @@ import collections import gc import os -from typing import Optional, Text, Union +from typing import List, Optional, Text, Union from coremltools import ( _LOWEST_ALLOWED_SPECIFICATION_VERSION_FOR_MILPROGRAM, @@ -23,8 +23,11 @@ from coremltools.converters.mil.converter import mil_convert from coremltools.converters.mil.input_types import ( ClassifierConfig, + EnumeratedShapes, ImageType, InputType, + RangeDim, + Shape, TensorType, ) from coremltools.converters.mil.mil import Program, types @@ -395,7 +398,7 @@ def skip_real_div_ops(op): pipeline = ct.PassPipeline() pipeline.remove_passes({"common::fuse_conv_batchnorm"}) - ct.convert(model, pass_pipeline=pipeline) + mlmodel = ct.convert(model, pass_pipeline=pipeline) * To avoid folding too-large ``const`` ops that lead to a large model, set pass option as shown in the following example: @@ -404,7 +407,34 @@ def skip_real_div_ops(op): pipeline = ct.PassPipeline() pipeline.set_options("common::const_elimination", {"skip_const_by_size": "1e6"}) - ct.convert(model, pass_pipeline=pipeline) + mlmodel = ct.convert(model, pass_pipeline=pipeline) + + We also provide a set of predefined pass pipelines that you can directly call. + + * To avoid running all graph pass, you can use: + + .. sourcecode:: python + + mlmodel = ct.convert(model, pass_pipeline=ct.PassPipeline.EMPTY) + + * To only run the cleanup graph passes, like constant_elimination, dead_code_elimination, etc. + You can use: + + .. sourcecode:: python + + mlmodel = ct.convert(model, pass_pipeline=ct.PassPipeline.CLEANUP) + + * To convert a source model with sparse weights to a sparse format Core ML model, you can use: + + .. sourcecode:: python + + mlmodel = ct.convert(model, pass_pipeline=ct.PassPipeline.DEFAULT_PRUNING) + + * To convert a source model with palettized weights to a compressed format Core ML model, you can use: + + .. sourcecode:: python + + mlmodel = ct.convert(model, pass_pipeline=ct.PassPipeline.DEFAULT_PALETTIZATION) Returns ------- @@ -463,9 +493,17 @@ def skip_real_div_ops(op): outputs_as_tensor_or_image_types, outputs) exact_target = _determine_target(convert_to, minimum_deployment_target) - _validate_conversion_arguments(model, exact_source, inputs, outputs_as_tensor_or_image_types, - classifier_config, compute_precision, - exact_target, minimum_deployment_target) + _validate_conversion_arguments( + model, + exact_source, + exact_target, + inputs, + outputs_as_tensor_or_image_types, + classifier_config, + compute_precision, + exact_target, + minimum_deployment_target, + ) if pass_pipeline is None: pass_pipeline = PassPipeline() @@ -504,6 +542,12 @@ def skip_real_div_ops(op): main_pipeline=pass_pipeline, ) + if exact_target == "mlprogram" and mlmodel._input_has_infinite_upper_bound(): + raise ValueError( + "For mlprogram, inputs with infinite upper_bound is not allowed. Please set upper_bound" + ' to a positive value in "RangeDim()" for the "inputs" param in ct.convert().' 
+ ) + if exact_target == 'milinternal': return mlmodel # Returns the MIL program @@ -539,7 +583,7 @@ def _need_fp16_cast_pass( raise ValueError(f"Invalid value of the argument 'compute_precision': {compute_precision}") -def _set_default_specification_version(target): +def _set_default_specification_version(target) -> Optional[AvailableTarget]: if target == "neuralnetwork": return _LOWEST_ALLOWED_SPECIFICATION_VERSION_FOR_NEURALNETWORK elif target == "mlprogram": @@ -625,18 +669,20 @@ def _validate_outputs_argument(outputs): return output_names, outputs -def _validate_conversion_arguments(model, - exact_source, - inputs, - outputs, - classifier_config, - compute_precision, - convert_to, - minimum_deployment_target, - ): +def _validate_conversion_arguments( + model, + exact_source, + exact_target, + inputs, + outputs, + classifier_config, + compute_precision, + convert_to, + minimum_deployment_target, +): """ Validate and process model, inputs, classifier_config based on - `exact_source` (which cannot be `auto`) + `exact_source` (which cannot be `auto`) and `exact_target`. """ def raise_if_duplicated(input_list): @@ -672,10 +718,10 @@ def _flatten_list(_inputs): # get flattened inputs flat_inputs = _flatten_list(inputs) - for t in flat_inputs: - if not isinstance(t, InputType): + for flat_input in flat_inputs: + if not isinstance(flat_input, InputType): raise ValueError("inputs must be a list of type ct.TensorType or ct.ImageType") - if t.dtype == types.fp16: + if flat_input.dtype == types.fp16: if not ( minimum_deployment_target is not None and minimum_deployment_target >= AvailableTarget.iOS16 @@ -685,6 +731,24 @@ def _flatten_list(_inputs): "target >= iOS16/macOS13/watchOS9/tvOS16" ) + if exact_target == "mlprogram": + err_msg_infinite_bound = ( + "For mlprogram, inputs with infinite upper_bound is not allowed. Please set upper_bound" + ' to a positive value in "RangeDim()" for the "inputs" param in ct.convert().' 
+ ) + if inputs is not None: + for flat_input in _flatten_list(inputs): + tensor_shapes: List[Optional[Shape]] = ( + flat_input.shape.shapes + if isinstance(flat_input.shape, EnumeratedShapes) + else [flat_input.shape] + ) + for tensor_shape in tensor_shapes: + if tensor_shape is not None: + for shape in tensor_shape.shape: + if isinstance(shape, RangeDim) and shape.upper_bound < 0: + raise ValueError(err_msg_infinite_bound) + if outputs is not None: for t in outputs: if t.dtype == types.fp16: diff --git a/coremltools/converters/mil/_deployment_compatibility.py b/coremltools/converters/mil/_deployment_compatibility.py index e3a8f498b..d5e5bc6e0 100644 --- a/coremltools/converters/mil/_deployment_compatibility.py +++ b/coremltools/converters/mil/_deployment_compatibility.py @@ -5,10 +5,13 @@ from enum import IntEnum -from coremltools import (_SPECIFICATION_VERSION_IOS_13, - _SPECIFICATION_VERSION_IOS_14, - _SPECIFICATION_VERSION_IOS_15, - _SPECIFICATION_VERSION_IOS_16) +from coremltools import ( + _SPECIFICATION_VERSION_IOS_13, + _SPECIFICATION_VERSION_IOS_14, + _SPECIFICATION_VERSION_IOS_15, + _SPECIFICATION_VERSION_IOS_16, + _SPECIFICATION_VERSION_IOS_17, +) class AvailableTarget(IntEnum): @@ -17,6 +20,7 @@ class AvailableTarget(IntEnum): iOS14 = _SPECIFICATION_VERSION_IOS_14 iOS15 = _SPECIFICATION_VERSION_IOS_15 iOS16 = _SPECIFICATION_VERSION_IOS_16 + iOS17 = _SPECIFICATION_VERSION_IOS_17 # macOS versions (aliases of iOS versions) macOS15 = _SPECIFICATION_VERSION_IOS_13 @@ -26,19 +30,22 @@ class AvailableTarget(IntEnum): macOS11 = _SPECIFICATION_VERSION_IOS_14 macOS12 = _SPECIFICATION_VERSION_IOS_15 macOS13 = _SPECIFICATION_VERSION_IOS_16 + macOS14 = _SPECIFICATION_VERSION_IOS_17 # watchOS versions (aliases of iOS versions) watchOS6 = _SPECIFICATION_VERSION_IOS_13 watchOS7 = _SPECIFICATION_VERSION_IOS_14 watchOS8 = _SPECIFICATION_VERSION_IOS_15 watchOS9 = _SPECIFICATION_VERSION_IOS_16 + watchOS10 = _SPECIFICATION_VERSION_IOS_17 # tvOS versions (aliases of iOS versions) tvOS13 = _SPECIFICATION_VERSION_IOS_13 tvOS14 = _SPECIFICATION_VERSION_IOS_14 tvOS15 = _SPECIFICATION_VERSION_IOS_15 tvOS16 = _SPECIFICATION_VERSION_IOS_16 - + tvOS17 = _SPECIFICATION_VERSION_IOS_17 + # customized __str__ def __str__(self): original_str = super().__str__() diff --git a/coremltools/converters/mil/backend/mil/helper.py b/coremltools/converters/mil/backend/mil/helper.py index b03708cd4..078a662a1 100644 --- a/coremltools/converters/mil/backend/mil/helper.py +++ b/coremltools/converters/mil/backend/mil/helper.py @@ -177,7 +177,14 @@ def create_scalar_value(py_scalar): # Set the tensor value t_field = _tensor_field_by_type(t_val, builtin_type) - if builtin_type in (types.fp16, types.int8, types.uint8, types.uint32): + if builtin_type in ( + types.fp16, + types.int8, + types.uint8, + types.int16, + types.uint16, + types.uint32, + ): val.immediateValue.tensor.bytes.values = np_val_to_py_type(py_scalar) else: if builtin_type == types.str: @@ -284,14 +291,22 @@ def types_to_proto(valuetype): def create_file_value(output_var, blob_writer): if output_var.val.dtype.kind == 'f' and output_var.val.dtype.itemsize == 4: - offset = blob_writer.write_float_data(output_var.val.flatten()) - elif output_var.val.dtype.kind == 'f' and output_var.val.dtype.itemsize == 2: - output_var_fp16_to_bytes_to_uint16 = np.frombuffer(output_var.val.flatten().tobytes(), np.uint16) - offset = blob_writer.write_fp16_data(output_var_fp16_to_bytes_to_uint16) + offset = blob_writer.write_float_data(np.ascontiguousarray(output_var.val.flatten())) 
+ elif output_var.val.dtype.kind == "f" and output_var.val.dtype.itemsize == 2: + output_var_fp16_to_bytes_to_uint16 = np.frombuffer( + output_var.val.flatten().tobytes(), np.uint16 + ) + offset = blob_writer.write_fp16_data( + np.ascontiguousarray(output_var_fp16_to_bytes_to_uint16) + ) elif output_var.val.dtype.kind == "u" and output_var.val.dtype.itemsize == 1: - offset = blob_writer.write_uint8_data(output_var.val.flatten()) + offset = blob_writer.write_uint8_data(np.ascontiguousarray(output_var.val.flatten())) elif output_var.val.dtype.kind == "i" and output_var.val.dtype.itemsize == 1: - offset = blob_writer.write_int8_data(output_var.val.flatten()) + offset = blob_writer.write_int8_data(np.ascontiguousarray(output_var.val.flatten())) + elif output_var.val.dtype.kind == "u" and output_var.val.dtype.itemsize == 2: + offset = blob_writer.write_uint16_data(np.ascontiguousarray(output_var.val.flatten())) + elif output_var.val.dtype.kind == "i" and output_var.val.dtype.itemsize == 2: + offset = blob_writer.write_int16_data(np.ascontiguousarray(output_var.val.flatten())) else: raise TypeError("Unsupported type, {}, for net buffer serialization.".format(output_var.val.dtype)) diff --git a/coremltools/converters/mil/backend/mil/load.py b/coremltools/converters/mil/backend/mil/load.py index e682e871b..246179e01 100644 --- a/coremltools/converters/mil/backend/mil/load.py +++ b/coremltools/converters/mil/backend/mil/load.py @@ -4,6 +4,7 @@ # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause import os +import warnings import numpy as np @@ -146,6 +147,8 @@ def translate_generic_op(op, parameters, blob_writer, literal_params=[]): attr_dict["weights"] = create_list_scalarvalue(weights, str) attr_dict["description"] = create_scalar_value(description) + attr_dict["name"] = create_scalar_value(op.name) + return pm.Operation( type=op_type, blocks=blocks, @@ -329,8 +332,14 @@ def load(prog, weights_dir, resume_on_errors=False, specification_version=_SPECI image_input_names[input_type.name] = input_type # error checking for input(s) marked as images if input_type.name not in list(prog.functions["main"].inputs.keys()): - msg = "Provided image input '{}' is not one of the inputs of the MIL program" - raise ValueError(msg.format(input_type.name)) + raise ValueError( + f"Provided image input '{input_type.name}' is not one of the inputs of the MIL program" + ) + if input_type.name is None: + raise ValueError( + 'Fail to auto-determine the input name. Please specify the "name" ' + 'parameter when use "inputs" in ct.convert().' 
+ ) input_shape_map[input_type.name] = input_type for name, var in prog.functions["main"].inputs.items(): @@ -457,6 +466,11 @@ def load(prog, weights_dir, resume_on_errors=False, specification_version=_SPECI model.mlProgram.CopyFrom(proto) # Set symbolic shapes + default_lower_bound = 1 + default_upper_bound = ( + default_lower_bound + 1 if kwargs.get("convert_to", None) == "mlprogram" else -1 + ) + default_bound_used = False for input_name in symbolic_inputs: input_type = input_shape_map.get(input_name, None) @@ -480,13 +494,15 @@ def load(prog, weights_dir, resume_on_errors=False, specification_version=_SPECI if isinstance(H, RangeDim): img_range.add_height_range((H.lower_bound, H.upper_bound)) elif is_symbolic(H): - img_range.add_height_range((1, -1)) + img_range.add_height_range((default_lower_bound, default_upper_bound)) + default_bound_used = True else: img_range.add_height_range((H, H)) if isinstance(W, RangeDim): img_range.add_width_range((W.lower_bound, W.upper_bound)) elif is_symbolic(W): - img_range.add_width_range((1, -1)) + img_range.add_width_range((default_lower_bound, default_upper_bound)) + default_bound_used = True else: img_range.add_width_range((W, W)) @@ -506,8 +522,9 @@ def load(prog, weights_dir, resume_on_errors=False, specification_version=_SPECI lb.append(s.lower_bound) ub.append(s.upper_bound) elif is_symbolic(s): - lb.append(1) - ub.append(-1) + lb.append(default_lower_bound) + ub.append(default_upper_bound) + default_bound_used = True else: lb.append(s) ub.append(s) @@ -520,8 +537,9 @@ def load(prog, weights_dir, resume_on_errors=False, specification_version=_SPECI ub = [] for s in sym_type.get_shape(): if is_symbolic(s): - lb.append(1) - ub.append(-1) + lb.append(default_lower_bound) + ub.append(default_upper_bound) + default_bound_used = True else: lb.append(s) ub.append(s) @@ -529,6 +547,22 @@ def load(prog, weights_dir, resume_on_errors=False, specification_version=_SPECI model, input_name, lower_bounds=lb, upper_bounds=ub ) + if default_bound_used and kwargs.get("convert_to", None) == "mlprogram": + warnings.warn( + "Some dimensions in the input shape are unknown, hence they are set to flexible ranges " + f"with lower bound and default value = {default_lower_bound}, and upper bound = " + f"{default_upper_bound}. To set different values for the default shape and upper bound, " + "please use the ct.RangeDim() method as described here: " + "https://coremltools.readme.io/docs/flexible-inputs#set-the-range-for-each-dimension.", + UserWarning, + ) + convert_from = kwargs.get("convert_from", None) + if convert_from is not None and convert_from.startswith("tensorflow"): + warnings.warn( + 'There is "None" dim in TF input placeholder. Please consider specifying ' + 'input shapes by using the "inputs" param in ct.convert().' 
+ ) + # Set optional inputs _set_optional_inputs(model, input_types) diff --git a/coremltools/converters/mil/backend/mil/passes/adjust_io_to_supported_types.py b/coremltools/converters/mil/backend/mil/passes/adjust_io_to_supported_types.py index dd77dfedd..afacdf024 100644 --- a/coremltools/converters/mil/backend/mil/passes/adjust_io_to_supported_types.py +++ b/coremltools/converters/mil/backend/mil/passes/adjust_io_to_supported_types.py @@ -3,9 +3,10 @@ # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause +from typing import Set + from coremltools import _logger as logger -from coremltools.converters.mil._deployment_compatibility import \ - AvailableTarget as target +from coremltools.converters.mil._deployment_compatibility import AvailableTarget as target from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import types as types from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass @@ -16,16 +17,16 @@ @register_pass(namespace="mil_backend") class adjust_io_to_supported_types(AbstractGraphPass): """ - Converts all dtypes to types that are supported by the CoreML runtime. - The runtime supports only fp16, fp32, int32, str, and bool variables. + Converts all dtypes to types that are supported by the Core ML runtime. + The runtime supports fp16, fp32, int16, uint16, int32, str, and bool variables. General rules: - * Integer vars that are not 32 bit are replaced with int32 types. + * Integer vars with unsupported types are replaced with int32 types. * All other types not in the list of runtime supported types are replaced with the fp32 dtype. No casts are inserted; the previous type is replaced. The assumption is that all remaining types are numerical and can be reasonably replaced with 32 bit float types. - The "main" function has additional rules since its I/O is mapped to CoreML model I/O: + The "main" function has additional rules since its I/O is mapped to Core ML model I/O: * if function.opset_version < coremltools.target.iOS16, then: * Fp16 I/O is replaced with fp32 I/O. Casts (fp32 input -> fp16) are inserted at the beginning of the program to preserve 16 bit inputs. @@ -37,7 +38,7 @@ class adjust_io_to_supported_types(AbstractGraphPass): The assumption is that all remaining types are numerical and it is valid to cast them to/from fp32. * The only exception: Int64 outputs are allowed for the classifier op. This is to keep consistency with - the CoreML API, which uses 64 bit integers to represent classifier labels. + the Core ML API, which uses 64 bit integers to represent classifier labels. 
------ @@ -67,135 +68,149 @@ def apply(self, prog): is_main_funtion = name == "main" _adjust_io_to_supported_types(func, is_main_funtion) -__RUNTIME_SUPPORTED_TYPES = [types.fp16, types.fp32, types.int32, types.str, types.bool] -##### -# Main Function -##### def _adjust_var_dtype_helper(var, dtype): - if (types.is_scalar(var.sym_type)): + if types.is_scalar(var.sym_type): var._sym_type = dtype else: var._sym_type = types.tensor(dtype, var.sym_type.get_shape()) + +def _get_io_supported_types(opset_version: target) -> Set[type]: + """Get Core ML I/O supported data types based on opset version.""" + supported_types = {types.fp32, types.int32} + if opset_version >= target.iOS16: + supported_types.add(types.fp16) + return supported_types + + +def _get_runtime_supported_types(opset_version: target) -> Set[type]: + """Get Core ML Runtime supported data types based on opset version.""" + supported_types = {types.fp16, types.fp32, types.int32, types.str, types.bool} + if opset_version >= target.iOS17: + supported_types.update({types.int16, types.uint16}) + return supported_types + + @block_context_manager def _adjust_main_inputs(func): - first_op = func.operations[0] if len(func.operations) > 0 else None + """ + Adjust the inputs in main func. + + If the input's dtype is not in Core ML I/O supported types, we do following steps: + 1. Change the input's dtype to int32 or fp32 based on original dtype. + 2. If the original dtype is supported in Core ML Runtime, we insert a cast op to cast the + input from the changed dtype to the original dtype. + """ + _IO_SUPPORTED_TYPES = _get_io_supported_types(func.opset_version) + _RUNTIME_SUPPORTED_TYPES = _get_runtime_supported_types(func.opset_version) + for input_name, input_var in func.inputs.items(): - if (types.is_tensor(input_var.sym_type) or types.is_scalar(input_var.sym_type)) \ - and input_var.dtype != types.fp32 \ - and input_var.dtype != types.int32: + if ( + types.is_tensor(input_var.sym_type) or types.is_scalar(input_var.sym_type) + ) and input_var.dtype not in _IO_SUPPORTED_TYPES: input_dtype_str = types.builtin_to_string(input_var.dtype) - if types.is_int(input_var.dtype): - # Replace non-int32 input type with int32. - logger.warning("Input" + input_var.name + " is of dtype " + input_dtype_str +\ - ". Only integer variables of bit width 32 are supported by the CoreML runtime. " +\ - "This input will be assigned a dtype of int32. " +\ - "No cast will be inserted; the previous dtype will be replaced.") - _adjust_var_dtype_helper(input_var, types.int32) - elif input_var.dtype == types.fp64: - # Replace float64 input type with fp32. - logger.warning("Input '" + input_var.name + "' is of dtype fp64. 64 bit float inputs are " +\ - "not supported by ML program models. This input will be assigned a dtype " +\ - "of fp32. No cast will be inserted; the previous dtype will be replaced.") - _adjust_var_dtype_helper(input_var, types.fp32) - elif input_var.dtype == types.fp16 \ - and func.opset_version >= target.iOS16: - pass # do nothing, since fp16 is a valid input type for CoreML - else: - # This is some other dtype. Change the type to fp32 and add a cast. - # This is only a limitation of main--other functions do not represent CoreML model inputs - # and do not have the same limitation on input types. - supported_dtypes = "{int32, fp32}" if func.opset_version < target.iOS16 else \ - "{int32, fp16, fp32}" - msg = "\nInput '{}' is of dtype {}. The " +\ - "CoreML runtime does not support inputs with this dtype " +\ - "(supported dtypes are: {}). 
This input will be assigned a dtype of " +\ - "fp32. A cast will be inserted at the beginning of the program to " +\ - "convert the input to the originally defined dtype.\n" - if input_var.dtype == types.fp16: - msg += "fp16 dtype input is supported if the function.opset_version is chosen to be at least " \ - "iOS16/macOS13.\n" - logger.warning(msg.format( - input_var.name, - input_dtype_str, - supported_dtypes)) - + convert_to_dtype = types.int32 if types.is_int(input_var.dtype) else types.fp32 + convert_to_dtype_str = types.builtin_to_string(convert_to_dtype) + should_insert_cast = input_var.dtype in _RUNTIME_SUPPORTED_TYPES + _adjust_var_dtype_helper(input_var, convert_to_dtype) + logger.warning( + f"\nInput '{input_var.name}' is of dtype {input_dtype_str}. The Core ML I/O does " + f"not support this dtype (supported dtypes are: {_IO_SUPPORTED_TYPES}). Consider " + f"setting `minimum_deployment_target` to a higher IOS version for more supported " + f"dtypes. This input is changed to {convert_to_dtype_str}.\n" + ) + + if not should_insert_cast: + logger.warning( + f"The original input dtype {input_dtype_str} is not supported in " + f"Core ML Runtime (supported dtypes are: {_RUNTIME_SUPPORTED_TYPES}). Consider " + f"setting `minimum_deployment_target` to a higher IOS version for more " + f"supported dtypes. We just changed the dtype and won't insert any cast op." + ) + continue + + logger.warning( + f"Trying to insert a cast op at the beginning of the program to convert " + f"the input to the originally defined dtype ({input_dtype_str}).\n" + ) + try: + first_op = func.operations[0] if len(func.operations) > 0 else None casted_input_var = mb.cast(x=input_var, dtype=input_dtype_str, before_op=first_op) - func.replace_uses_of_var_after_op(anchor_op=casted_input_var.op, old_var=input_var, new_var=casted_input_var) - _adjust_var_dtype_helper(input_var, types.fp32) + # Use force replace as the `input_var.dtype` could be not subtype of the + # `convert_to_dtype`. For example, int16 cast to int32. As it's only for input + # dtype cast, this replace should be safe. + func.replace_uses_of_var_after_op( + anchor_op=casted_input_var.op, + old_var=input_var, + new_var=casted_input_var, + force_replace=True, + no_check_var_types=True, + ) + except Exception as e: + logger.warning( + f"Failed to insert the cast op.\n{e}\nThe dtype of the input " + f"'{input_var.name}' is changed to {convert_to_dtype_str} without " + f"inserting any cast op." + ) + @block_context_manager def _adjust_main_outputs(func): + """Adjust the outputs in the main func to make sure they have Core ML I/O supported types.""" + _IO_SUPPORTED_TYPES = _get_io_supported_types(func.opset_version) + new_outputs = [] for output_var in func.outputs: output_type = output_var.sym_type - if (types.is_tensor(output_type) or types.is_scalar(output_type)) \ - and output_var.dtype != types.fp32 \ - and output_var.dtype != types.int32 \ - and (func.opset_version < target.iOS16 or output_var.dtype != types.fp16): - # since fp16 is a valid output type for coreml from ios16 spec onwards, no need to cast + if ( + types.is_tensor(output_type) or types.is_scalar(output_type) + ) and output_var.dtype not in _IO_SUPPORTED_TYPES: output_dtype_str = types.builtin_to_string(output_var.dtype) - supported_dtypes = "{int32, fp32}" if func.opset_version < target.iOS16 else \ - "{int32, fp16, fp32}" - msg = "\nOutput '{}' is of dtype {}. The " +\ - "CoreML runtime does not support outputs with this dtype " +\ - "(supported dtypes are: {}). 
This output will be assigned a dtype " +\ - "of fp32. A cast will be inserted at the end of the program to convert" +\ - "the original output dtype to the dtype supported by the CoreML runtime.\n" + target_dtype = "int32" if types.is_int(output_var.dtype) else "fp32" + logger.warning( + f"\nOutput '{output_var.name}' is of dtype {output_dtype_str}. The " + f"Core ML runtime does not support outputs with this dtype (supported " + f"dtypes are: {_IO_SUPPORTED_TYPES}). This output will changed to " + f"{target_dtype} by adding a cast op at the end of the program.\n" + ) if output_var.dtype == types.fp16: - msg += "fp16 dtype output is supported if function.opset_version is chosen to be at least " \ - "iOS16/macOS13.\n" - logger.warning(msg.format( - output_var.name, - output_dtype_str, - supported_dtypes, - )) + logger.warning( + "fp16 dtype output is supported if function.opset_version is chosen to be at " + "least iOS16/macOS13.\n" + ) output_var_name = output_var.name - output_var.set_name(output_var_name + "__pre__output__fp32__cast") - # Convert the output to fp32, and add a cast. - output_var = mb.cast(x=output_var, dtype="fp32") + output_var.set_name(f"{output_var_name}__pre__output__{target_dtype}__cast") + output_var = mb.cast(x=output_var, dtype=target_dtype) output_var.set_name(output_var_name) new_outputs.append(output_var) func.set_outputs(new_outputs) -##### -# General Functions and Blocks -##### -def _adjust_var(var): +def _adjust_func_inputs(func): """ Changes the dtype of the provided variable according to the rules outlined in the top level pass comment (see adjust_io_to_supported_types). """ - if (types.is_tensor(var.sym_type) or types.is_scalar(var.sym_type)) \ - and var.dtype not in __RUNTIME_SUPPORTED_TYPES: - dtype_str = types.builtin_to_string(var.dtype) - if types.is_int(var.dtype): - # Replace non-int32 input type with int32. - logger.warning("Input '" + var.name + "' is of dtype " + dtype_str +\ - ". Only integer variables of bit width 32 are supported by the CoreML runtime. " +\ - "This input will be assigned a dtype of int32. " +\ - "No cast will be inserted; the previous dtype will be replaced.") - _adjust_var_dtype_helper(var, types.int32) - else: - # This is some other unsupported dtype. Change the input type to fp32. - logger.warning("Var " + var.name + " is of dtype " + dtype_str + ". The CoreML runtime " +\ - "does not support this dtype (only fp16, fp32, bool, and int32 are supported). " +\ - "This input will be assigned a dtype of fp32. No cast will be inserted; " +\ - "the previous dtype will be replaced.") - _adjust_var_dtype_helper(var, types.fp32) - + _RUNTIME_SUPPORTED_TYPES = _get_runtime_supported_types(func.opset_version) -def _adjust_func_inputs(func): for input_name, input_var in func.inputs.items(): - _adjust_var(input_var) + if ( + types.is_tensor(input_var.sym_type) or types.is_scalar(input_var.sym_type) + ) and input_var.dtype not in _RUNTIME_SUPPORTED_TYPES: + dtype_str = types.builtin_to_string(input_var.dtype) + convert_to_dtype = types.int32 if types.is_int(input_var.dtype) else types.fp32 + convert_to_dtype_str = types.builtin_to_string(convert_to_dtype) + _adjust_var_dtype_helper(input_var, convert_to_dtype) + logger.warning( + f"Input '{input_var.name}' is of dtype {dtype_str}, which is not" + f"supported by the Core ML runtime. This input will be changed to " + f"{convert_to_dtype_str}. No cast will be inserted." 
+ ) + -##### -# The Pass -##### def _adjust_io_to_supported_types(func, is_main): if is_main: _adjust_main_inputs(func) diff --git a/coremltools/converters/mil/backend/mil/passes/test_passes.py b/coremltools/converters/mil/backend/mil/passes/test_passes.py index 6d82ff380..4168fb775 100644 --- a/coremltools/converters/mil/backend/mil/passes/test_passes.py +++ b/coremltools/converters/mil/backend/mil/passes/test_passes.py @@ -9,9 +9,7 @@ import numpy as np import pytest -# import mil internal ops to add it to the builder import coremltools as ct -# Set the testing backend from coremltools.converters.mil._deployment_compatibility import \ AvailableTarget as target from coremltools.converters.mil.mil import Builder as mb @@ -193,7 +191,7 @@ def prog(x): assert get_op_types_in_program(prog) == ['relu'] assert inputs[0][1].dtype == types.fp16 assert block.outputs[0].dtype == types.fp16 - + def test_float16_input_output_with_opset_version_inference(self): """ Input graph: @@ -203,14 +201,14 @@ def test_float16_input_output_with_opset_version_inference(self): %pixel_unshuffle_0: (1, 4, 2, 2, fp16)(Tensor) = pixel_unshuffle(x=%x, downscale_factor=2, name="pixel_unshuffle_0") } -> (%pixel_unshuffle_0) } - + This function would be inferred as an iOS16 function, and the graph pass should behave properly """ @mb.program(input_specs=[mb.TensorSpec(shape=(1, 1, 4, 4), dtype=types.fp16)]) def prog(x): x = mb.pixel_unshuffle(x=x, downscale_factor=np.uint32(2)) return x - + prev_prog, prev_block, block = apply_pass_and_basic_check( prog, "mil_backend::adjust_io_to_supported_types" ) @@ -250,6 +248,65 @@ def prog(x): assert prev_inputs[0][1].name == inputs[0][1].name assert inputs[0][1].dtype == types.int32 + @pytest.mark.parametrize( + "opset_version", + [None, target.iOS17], + ) + def test_int16_input(self, opset_version): + """ + Input graph: + func main(int16 x) { + .... + } -> (x) + + Before IOS17, it becomes + func main(int32 x) { + .... + } -> (x) + + In IOS17+, it becomes + func main(int32 x) { + %cast_0: (1, 1, 1, 1, int16)(Tensor) = cast(x=%x, dtype="int16", name="cast_0") + .... + %cast_1: (1, 1, 1, 1, int32)(Tensor) = cast(x=%x, dtype="int32", name="cast_1") + } -> (cast_1) + because IOS17+ supports int16 in Runtime (but doesn't support int16 for I/O). + """ + + @mb.program( + input_specs=[mb.TensorSpec(shape=(1, 1, 1, 1), dtype=types.int16)], + opset_version=opset_version, + ) + def prog(x): + return x + + prev_prog, prev_block, block = apply_pass_and_basic_check( + prog, "mil_backend::adjust_io_to_supported_types" + ) + + prev_inputs = list(prev_block.inputs.items()) + inputs = list(block.inputs.items()) + prev_outputs = prev_block.outputs + outputs = block.outputs + assert prev_inputs[0][1].dtype == types.int16 + assert prev_outputs[0].dtype == types.int16 + assert inputs[0][1].dtype == types.int32 + assert outputs[0].dtype == types.int32 + assert prev_inputs[0][1].name == inputs[0][1].name + assert outputs[0].name == prev_outputs[0].name + if opset_version and opset_version >= target.iOS17: + assert get_op_types_in_program(prog) == ["cast", "cast"] + cast_ops = [op for op in prog["main"].operations if op.op_type != "const"] + # The first cast is for int32 to int16. + assert cast_ops[0].x.dtype == types.int32 + assert cast_ops[0].outputs[0].dtype == types.int16 + # The second cast is for int16 to int32. + assert cast_ops[1].x.dtype == types.int16 + assert cast_ops[1].outputs[0].dtype == types.int32 + else: + # Before IOS17, the int16 is not supported in Runtime, so there is no cast inserted. 
+ assert get_op_types_in_program(prog) == [] + def test_subblock(self): """ Input graph: @@ -885,4 +942,3 @@ def program(x): backend=("mlprogram", "fp32"), expected_output_shapes={block.outputs[0].name: tuple(x_shape)}, ) - diff --git a/coremltools/converters/mil/backend/nn/op_mapping.py b/coremltools/converters/mil/backend/nn/op_mapping.py index 457775517..012c845a0 100644 --- a/coremltools/converters/mil/backend/nn/op_mapping.py +++ b/coremltools/converters/mil/backend/nn/op_mapping.py @@ -1708,13 +1708,15 @@ def lstm(const_context, builder, op): if direction in {"forward", "reverse"}: # Expand initial_h and initial_c, - # from shape (B, H) to shape (1, Batch, H, 1, 1) - _expand_dim(builder, initial_h + "_expanded", initial_h, [0, 3, 4]) - initial_h += "_expanded" - # initial_h may have the same name as initial_c (e.g., same Var). - # Append a different string to avoid conflict - _expand_dim(builder, initial_c + "_expanded2", initial_c, [0, 3, 4]) - initial_c += "_expanded2" + # from shape (B, H) to shape (1, Batch, H, 1, 1). + # Since initial_h and initial_c may get used in multiple places, + # prepend input_name to avoid conflict + _expand_dim(builder, input_name + initial_h + "_expanded", initial_h, [0, 3, 4]) + initial_h = input_name + initial_h + "_expanded" + # initial_c may have the same name as initial_h (e.g., same Var). + # Append a different string to initial_c to avoid conflict + _expand_dim(builder, input_name + initial_c + "_expanded2", initial_c, [0, 3, 4]) + initial_c = input_name + initial_c + "_expanded2" # w_x: [H*I, H*I, H*I, H*I] # w_h: [H*H, H*H, H*H, H*H] diff --git a/coremltools/converters/mil/converter.py b/coremltools/converters/mil/converter.py index 6642e200e..9242d4354 100644 --- a/coremltools/converters/mil/converter.py +++ b/coremltools/converters/mil/converter.py @@ -15,7 +15,7 @@ from coremltools.models.model import _create_mlpackage from . import ImageType, InputType -from .mil.passes.pass_pipeline import PassPipeline, PipelineManager +from .mil.passes.pass_pipeline import PassPipeline, PassPipelineManager class ConverterRegistry: @@ -47,9 +47,9 @@ def __call__(self, model, *args, **kwargs): max_opset_version, op = model._get_max_opset_version_and_op() if max_opset_version > specification_version: msg = ( - "Please update the minimum_deployment_target to {!s}," - " since op {} is only available in opset {!s} or newer." - ).format(max_opset_version, op.op_type, max_opset_version) + "Please update the minimum_deployment_target to coremltools.target.{}," + " since op {} is only available in opset coremltools.target.{} or newer." 
+ ).format(max_opset_version.name, op.op_type, max_opset_version.name) raise ValueError(msg) if "inputs" in kwargs and kwargs["inputs"] is not None: @@ -269,6 +269,7 @@ def mil_convert_to_proto( f"one of: {list(converter_registry.frontends.keys())}" ) + kwargs.setdefault("convert_from", convert_from) kwargs.setdefault("convert_to", convert_to) if main_pipeline is None: @@ -283,16 +284,16 @@ def mil_convert_to_proto( frontend_converter = frontend_converter_type() prog = frontend_converter(model, **kwargs) - PipelineManager.apply_pipeline(prog, frontend_pipeline) + PassPipelineManager.apply_pipeline(prog, frontend_pipeline) - PipelineManager.apply_pipeline(prog, main_pipeline) + PassPipelineManager.apply_pipeline(prog, main_pipeline) prog._check_invalid_tensor_rank() if convert_to == 'milinternal': return None, prog - PipelineManager.apply_pipeline(prog, backend_pipeline) + PassPipelineManager.apply_pipeline(prog, backend_pipeline) backend_converter_type = converter_registry.backends.get(convert_to.lower()) if not backend_converter_type: raise NotImplementedError( diff --git a/coremltools/converters/mil/experimental/passes/generic_pass_infrastructure.py b/coremltools/converters/mil/experimental/passes/generic_pass_infrastructure.py index 9ebb1b2d5..7ad6a3b7a 100644 --- a/coremltools/converters/mil/experimental/passes/generic_pass_infrastructure.py +++ b/coremltools/converters/mil/experimental/passes/generic_pass_infrastructure.py @@ -4,6 +4,7 @@ # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause import itertools +import warnings from functools import partial from coremltools.converters.mil.mil.passes.helper import block_context_manager @@ -109,7 +110,11 @@ def _pattern_detected(pattern, program_op, pattern_op, program_root_var, pattern # Last op in the pattern if len(pattern_child_op_list) == 0: if pattern.final_op is not None and pattern.final_op != program_op: - raise ValueError("User defined pattern has more than one final operation") + warnings.warn( + "User defined pattern matched to more than one final operation. " + "Skipped the pattern matching." + ) + return False pattern.set_final_op(pattern_op.name, program_op) return True @@ -218,4 +223,3 @@ def register_generic_pass(ops_arrangement, var_constraints, transform_pattern, p pass_registry.PASS_REGISTRY.passes[pass_id] = PassContainer(pass_name) pass_registry.PASS_REGISTRY[pass_id].add(pass_function) - diff --git a/coremltools/converters/mil/frontend/_utils.py b/coremltools/converters/mil/frontend/_utils.py index 8e39fbe50..48913427d 100644 --- a/coremltools/converters/mil/frontend/_utils.py +++ b/coremltools/converters/mil/frontend/_utils.py @@ -12,7 +12,7 @@ from coremltools.converters.mil.mil.types.symbolic import any_symbolic, is_symbolic -def value_at(x: Var, idx: int, name=None): +def value_at(x: Var, idx: int, name=None, before_op=None): """ input x: 1D tensor (vector). return value at index idx. x[idx]. 
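# A minimal sketch (editor's illustration, not part of this patch): typical use of `value_at`
# inside a builder context to read a single element of a shape vector. The `before_op`
# argument added in this patch lets callers place the underlying slice_by_index op ahead of
# an existing op when the value is needed earlier in the block. The program below is hypothetical.
from coremltools.converters.mil.frontend._utils import value_at
from coremltools.converters.mil.mil import Builder as mb

@mb.program(input_specs=[mb.TensorSpec(shape=(2, 3, 4))])
def prog(x):
    shape = mb.shape(x=x)                     # 1-D int32 tensor holding [2, 3, 4]
    batch = value_at(shape, 0, name="batch")  # scalar int32 equal to shape[0], i.e. 2
    return batch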
@@ -27,6 +27,8 @@ def value_at(x: Var, idx: int, name=None): } if name is not None: args["name"] = name + if before_op is not None: + args["before_op"] = before_op return mb.slice_by_index(**args) diff --git a/coremltools/converters/mil/frontend/milproto/load.py b/coremltools/converters/mil/frontend/milproto/load.py index 1761e31b3..f7c3508b4 100644 --- a/coremltools/converters/mil/frontend/milproto/load.py +++ b/coremltools/converters/mil/frontend/milproto/load.py @@ -107,14 +107,18 @@ def _load_file_value(context, filevalue_spec, dtype): context.blob_reader_from_filename[filename] = blob_reader if dtype == types.uint8: - np_value = np.array(blob_reader.read_uint8_data(offset), np.uint8) + np_value = blob_reader.read_uint8_data(offset) elif dtype == types.int8: - np_value = np.array(blob_reader.read_int8_data(offset), np.int8) + np_value = blob_reader.read_int8_data(offset) + elif dtype == types.uint16: + np_value = blob_reader.read_uint16_data(offset) + elif dtype == types.int16: + np_value = blob_reader.read_int16_data(offset) elif dtype == types.fp16: - np_value_uint16 = np.array(blob_reader.read_fp16_data(offset), np.uint16) + np_value_uint16 = blob_reader.read_fp16_data(offset) np_value = np.frombuffer(np_value_uint16.tobytes(), np.float16) elif dtype == types.fp32: - np_value = np.array(blob_reader.read_float_data(offset), np.float32) + np_value = blob_reader.read_float_data(offset) else: raise ValueError("Invalid dtype for blob file value type") @@ -263,9 +267,6 @@ def _load_operation(context, op_spec): "Loading Custom Layer operation not yet implemented" ) - if op_spec.attributes: - raise ValueError("Attributes on operation not supported") - # The conversion steps of an operation proto -> PyMIL operation are as following: # (i) Convert the input arguments: @@ -289,7 +290,12 @@ def _load_operation(context, op_spec): # (iv) Set the outer_op for control flow # Once the operation is created, we replace the dummy outer_op with the legit one, to make it a valid PyMIL program - inputs = {} + attrs = list(op_spec.attributes.items()) + if len(attrs) > 0: + if len(attrs) != 1 or attrs[0][0] != "name": + raise ValueError("\"name\" is the only supported attribute for operation") + inputs = {k: _load_value(context, v) for k, v in op_spec.attributes.items()} + for param_name, argument in op_spec.inputs.items(): vars = [] for binding in argument.arguments: diff --git a/coremltools/converters/mil/frontend/milproto/test_load.py b/coremltools/converters/mil/frontend/milproto/test_load.py index cb45d13b6..69a90e8ca 100644 --- a/coremltools/converters/mil/frontend/milproto/test_load.py +++ b/coremltools/converters/mil/frontend/milproto/test_load.py @@ -20,7 +20,10 @@ run_compare_tf from coremltools.converters.mil.mil.ops.tests.testing_utils import \ compare_backend -from coremltools.converters.mil.testing_utils import get_op_types_in_program +from coremltools.converters.mil.testing_utils import ( + get_op_names_in_program, + get_op_types_in_program +) if _HAS_TORCH: import torch @@ -34,7 +37,7 @@ def get_pymil_prog_from_mlmodel(mlmodel): model_spec=model_spec, specification_version=model_spec.specificationVersion, file_weights_dir=mlmodel.weights_dir, - ) + ) def get_roundtrip_mlmodel(mlmodel): """ @@ -81,7 +84,7 @@ def prog(x): return x # Convert it to MIL proto backed MLModel - mlmodel = ct.convert(prog, convert_to="mlprogram") + mlmodel = ct.convert(prog, convert_to="mlprogram", compute_units=ct.ComputeUnit.CPU_ONLY) # Load MLModel back to PyMIL loaded_pymil_prog = 
get_pymil_prog_from_mlmodel(mlmodel) @@ -92,14 +95,19 @@ def prog(x): def test_mil_proto_to_pymil_with_version_handling(self): # This test makes sure the correct version of the op is picked up during mil_proto -> pymil conversion - + # iOS15 version program with iOS13 version topk @mb.program(input_specs=[mb.TensorSpec(shape=(1, 1, 4, 4))], opset_version=ct.target.iOS15) def prog(x): x = mb.topk(x=x, k=1, axis=-1, ascending=True) return x - iOS15_mlmodel = ct.convert(prog, convert_to="mlprogram", minimum_deployment_target=ct.target.iOS15) + iOS15_mlmodel = ct.convert( + prog, + convert_to="mlprogram", + minimum_deployment_target=ct.target.iOS15, + compute_units=ct.ComputeUnit.CPU_ONLY, + ) iOS15_pymil_prog = get_pymil_prog_from_mlmodel(iOS15_mlmodel) topk_op = iOS15_pymil_prog.functions["main"].find_ops(op_type="topk")[0] assert not hasattr(topk_op, "sort") @@ -110,11 +118,38 @@ def prog(x): x = mb.topk(x=x, k=1, axis=-1, ascending=True) return x - iOS16_mlmodel = ct.convert(prog, convert_to="mlprogram", minimum_deployment_target=ct.target.iOS16) + iOS16_mlmodel = ct.convert( + prog, + convert_to="mlprogram", + minimum_deployment_target=ct.target.iOS16, + compute_units=ct.ComputeUnit.CPU_ONLY, + ) iOS16_pymil_prog = get_pymil_prog_from_mlmodel(iOS16_mlmodel) topk_op = iOS16_pymil_prog.functions["main"].find_ops(op_type="topk")[0] assert hasattr(topk_op, "sort") + def test_mil_proto_preserving_ops_name(self): + # This test is checking the route source_model -> MIL -> mil_prot -> pymil is preserving the op name + # Define a PyMIL program + @mb.program(input_specs=[mb.TensorSpec(shape=(1, 3, 100, 100)), ]) + def prog(x): + # MIL operation takes named inputs (instead of positional inputs). + # Here `name` argument is optional. + x = mb.relu(x=x, name='i_am_relu') + x = mb.conv(x=x, weight=np.random.rand(10, 3, 2, 2), name="i_am_conv") + x = mb.transpose(x=x, perm=[0, 3, 1, 2], name='i_am_transpose') + x = mb.reduce_mean(x=x, axes=[2, 3], keep_dims=False, name='i_am_reduce_mean') + x = mb.log(x=x, name='i_am_log') + return x + + mlmodel = ct.convert(prog, convert_to="mlprogram", compute_units=ct.ComputeUnit.CPU_ONLY) + op_names = get_op_names_in_program(mlmodel._mil_program, skip_const_ops=False) + + prog = get_pymil_prog_from_mlmodel(mlmodel) + new_op_names = get_op_names_in_program(prog, skip_const_ops=False) + + assert op_names == new_op_names + @pytest.mark.skipif(ct.utils._macos_version() < (12, 0), reason="mlprogram predict available only on macOS12+") class TestE2ENumericalCorrectness: @pytest.mark.skipif(not _HAS_TORCH, reason="requires torch") @@ -191,7 +226,7 @@ def test_list(self): input_dict = dict(zip(inputs, input_values)) _, mlmodel, _, _ = run_compare_tf( model, - input_dict, + input_dict, outputs, compute_unit=ct.ComputeUnit.CPU_ONLY, backend=("mlprogram", "fp16") diff --git a/coremltools/converters/mil/frontend/tensorflow/ops.py b/coremltools/converters/mil/frontend/tensorflow/ops.py index 78b775b2b..1fb7abbd2 100644 --- a/coremltools/converters/mil/frontend/tensorflow/ops.py +++ b/coremltools/converters/mil/frontend/tensorflow/ops.py @@ -4,19 +4,16 @@ # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause import numpy as _np +import numpy as np from coremltools import _logger as logger -from coremltools.converters.mil._deployment_compatibility import \ - AvailableTarget as target +from coremltools.converters.mil._deployment_compatibility import AvailableTarget as target from coremltools.converters.mil.mil import Builder as mb from 
coremltools.converters.mil.mil import types -from coremltools.converters.mil.mil.block import \ - is_current_opset_version_compatible_with -from coremltools.converters.mil.mil.ops.defs._utils import ( - broadcast_shapes, promote_input_dtypes) +from coremltools.converters.mil.mil.block import is_current_opset_version_compatible_with +from coremltools.converters.mil.mil.ops.defs._utils import broadcast_shapes, promote_input_dtypes from coremltools.converters.mil.mil.types import builtin_to_string -from coremltools.converters.mil.mil.types.symbolic import (any_symbolic, - is_symbolic) +from coremltools.converters.mil.mil.types.symbolic import is_symbolic from .._utils import build_einsum_mil from .convert_utils import convert_graph @@ -620,22 +617,32 @@ def ExtractImagePatches(context, node): box_indices = _np.transpose(box_indices) box_indices = box_indices.reshape(-1, 1) boxes = _np.tile(boxes, (batch, 1)) - boxes = _np.concatenate([box_indices, boxes], axis=1) - boxes = boxes.reshape(boxes.shape[0], 1, boxes.shape[1], 1, 1) - # use crop_and_resize x = _transpose_NHWC_to_NCHW(x) - x = mb.crop_resize( - x=x, - roi=boxes, - target_height=sizes[1], - target_width=sizes[2], - normalized_coordinates=False, - spatial_scale=1.0, - box_coordinate_mode="CORNERS_HEIGHT_FIRST", - sampling_mode="ALIGN_CORNERS", - ) - x = mb.squeeze(x=x, axes=[1]) + crop_resize_args = { + "x": x, + "target_height": sizes[1], + "target_width": sizes[2], + "normalized_coordinates": False, + "spatial_scale": 1.0, + "box_coordinate_mode": "CORNERS_HEIGHT_FIRST", + "sampling_mode": "ALIGN_CORNERS", + } + if not is_current_opset_version_compatible_with(target.iOS17): + # Before IOS17, boxes need to be shape [N,1,4,1,1] or [N,1,5,1,1]. + boxes = _np.concatenate([box_indices, boxes], axis=1) + boxes = boxes.reshape(boxes.shape[0], 1, boxes.shape[1], 1, 1) + # Before IOS17, the input param is `roi` instead of `boxes`. + crop_resize_args["roi"] = boxes + x = mb.crop_resize(**crop_resize_args) + # Before IOS17, the output has an extra dim at axis 1. + x = mb.squeeze(x=x, axes=[1]) + else: + # At this point `boxes` has shape [N, 4], which is good enough for IOS17+. + crop_resize_args["boxes"] = boxes + box_indices = np.squeeze(box_indices, axis=-1) + crop_resize_args["box_indices"] = box_indices + x = mb.crop_resize(**crop_resize_args) x = _transpose_NCHW_to_NHWC(x, node_name=node.name + "_transpose_to_nhwc") x = mb.reshape(x=x, shape=(batch, len(h_index), len(w_index), -1), name=node.name) context.add(node.name, x) @@ -2257,6 +2264,7 @@ def Gather(context, node): x = mb.gather(x=x, indices=indices, axis=axis, name=node.name) context.add(node.name, x) + def _perform_gather_with_batch_dims(x, indices, batch_dims, gather_func, func_args, name): """ An utility function to compute gather and gather_nd with batch_dims @@ -3011,16 +3019,18 @@ def ZerosLike(context, node): @register_tf_op def IsFinite(context, node): x = context[node.inputs[0]] - if any_symbolic(x.shape): - x_shape = mb.shape(x=x) - else: - x_shape = [1] if x.shape == () else x.shape - max_tensor = mb.fill(shape=x_shape, value=_np.finfo(_np.float32).max) - min_tensor = mb.fill(shape=x_shape, value=_np.finfo(_np.float32).min) - less_then = mb.less_equal(x=x, y=max_tensor) - greater_than = mb.greater_equal(x=x, y=min_tensor) - x = mb.logical_and(x=less_then, y=greater_than, name=node.name) - context.add(node.name, x) + + # In floating-point arithmetic, symbolically, inf + anything = inf, + # so we can detect if x is finite by x + y != x + # + # To avoid false alarm, i.e. 
x + y = x due to rounding error for small y, + # here we use the fp16 max as y + dtype = types.nptype_from_builtin(x.sym_type.get_primitive()) + y_add = dtype(_np.finfo(_np.float16).max) + x_plus = mb.add(x=x, y=y_add) + result = mb.not_equal(x=x, y=x_plus, name=node.name) + + context.add(node.name, result) @register_tf_op @@ -3045,20 +3055,22 @@ def CropAndResize(context, node): if const_box_info: boxes = context[node.inputs[1]].val box_indices = context[node.inputs[2]].val - box_indices = _np.expand_dims(box_indices, axis=1) - boxes = _np.concatenate([box_indices, boxes], axis=1) - # CoreML expects boxes/ROI in - # [N, 1, 5, 1, 1] format - boxes = boxes.reshape(boxes.shape[0], 1, boxes.shape[1], 1, 1) + if not is_current_opset_version_compatible_with(target.iOS17): + # Before IOS17, CoreML expects boxes/ROI in [N, 1, 5, 1, 1] shape. + box_indices = _np.expand_dims(box_indices, axis=1) + boxes = _np.concatenate([box_indices, boxes], axis=1) + boxes = boxes.reshape(boxes.shape[0], 1, boxes.shape[1], 1, 1) else: box_indices = context[node.inputs[2]] boxes = context[node.inputs[1]] - box_indices = mb.expand_dims(x=box_indices, axes=[1]) - if box_indices.dtype != boxes.dtype: - box_indices = mb.cast(x=box_indices, dtype=types.builtin_to_string(boxes.dtype)) - boxes = mb.concat(values=(box_indices, boxes), axis=1) - # TODO: Dynamic rank: Use GetShape and select indices dynamically - boxes = mb.reshape(x=boxes, shape=[boxes.shape[0], 1, boxes.shape[1], 1, 1]) + if not is_current_opset_version_compatible_with(target.iOS17): + # Before IOS17, CoreML expects ROI in [N, 1, 5, 1, 1] shape. + if box_indices.dtype != boxes.dtype: + box_indices = mb.cast(x=box_indices, dtype=types.builtin_to_string(boxes.dtype)) + box_indices = mb.expand_dims(x=box_indices, axes=[1]) + boxes = mb.concat(values=(box_indices, boxes), axis=1) + # TODO: Dynamic rank: Use GetShape and select indices dynamically + boxes = mb.reshape(x=boxes, shape=[boxes.shape[0], 1, boxes.shape[1], 1, 1]) # Get Height and Width of crop h_out, w_out = crop_size[0], crop_size[1] @@ -3078,9 +3090,8 @@ def CropAndResize(context, node): x = _transpose_NHWC_to_NCHW(x) # Crop Resize - args = { + crop_resize_args = { "x": x, - "roi": boxes, "target_height": h_out, "target_width": w_out, "normalized_coordinates": True, @@ -3089,19 +3100,26 @@ def CropAndResize(context, node): "sampling_mode": method, } if is_current_opset_version_compatible_with(target.iOS16): - args["pad_value"] = pad_value + crop_resize_args["pad_value"] = pad_value else: if pad_value != 0.0: - msg = ( - "For iOS15 or older, only extrapolation_value=0.0 is supported or the tf CropAndResize op. " - "Got {}" - ).format(pad_value) - raise ValueError(msg) - x = mb.crop_resize(**args) + raise ValueError( + f"For iOS15 or older, only extrapolation_value=0.0 is supported or the tf CropAndResize op. Got {pad_value}" + ) + if not is_current_opset_version_compatible_with(target.iOS17): + # Before IOS17, the input param is `roi` instead of `boxes`. + crop_resize_args["roi"] = boxes + else: + crop_resize_args["boxes"] = boxes + crop_resize_args["box_indices"] = box_indices + + x = mb.crop_resize(**crop_resize_args) - # CoreML output format: [N, 1, C, h_out, w_out] - # TF output format: [N, h_out, w_out, C] - x = mb.squeeze(x=x, axes=[1]) + if not is_current_opset_version_compatible_with(target.iOS17): + # Before IOS17, the output has an extra dim at axis 1. 
+ # CoreML output format: [N, 1, C, h_out, w_out] + # TF output format: [N, h_out, w_out, C] + x = mb.squeeze(x=x, axes=[1]) x = _transpose_NCHW_to_NHWC(x, node.name) context.add(node.name, x) diff --git a/coremltools/converters/mil/frontend/tensorflow/test/test_ops.py b/coremltools/converters/mil/frontend/tensorflow/test/test_ops.py index 1cdcfbdd3..8cc3d9517 100644 --- a/coremltools/converters/mil/frontend/tensorflow/test/test_ops.py +++ b/coremltools/converters/mil/frontend/tensorflow/test/test_ops.py @@ -6,9 +6,11 @@ import itertools import math import os +import platform import shutil import tempfile from distutils.version import StrictVersion +from typing import Optional import numpy as np import pytest @@ -23,6 +25,7 @@ load_tf_pb, make_tf_graph, ) +from coremltools.converters.mil.mil import Operation, Program, types from coremltools.converters.mil.testing_reqs import backends, compute_units from coremltools.converters.mil.testing_utils import ( einsum_equations, @@ -35,6 +38,7 @@ PREBUILT_TF1_WHEEL_VERSION = "1.15.5" + @pytest.mark.skipif(not _HAS_TF_1, reason=MSG_TF1_NOT_FOUND) class TestContribResampler(TensorFlowBaseTest): @pytest.mark.parametrize( @@ -251,16 +255,40 @@ def build_model(x): ) -class TestActivationElu(TensorFlowBaseTest): - @pytest.mark.parametrize( - "compute_unit, backend, rank", +class TestActivation(TensorFlowBaseTest): + @staticmethod + def run_compare_tf(model, input_dict, outputs, target_op: Optional[str] = None, **kwargs): + """Override compare method for Activation ops tests, as we want to verify the mixed + precision support for alpha/beta in IOS17 Activation Ops.""" + results = TensorFlowBaseTest.run_compare_tf(model, input_dict, outputs, **kwargs) + + if target_op and kwargs.get("backend", (None, None))[1] == "fp16": + prog: Program = results[1]._mil_program + activation_op: Operation = prog.find_ops(op_type=target_op, exactly_one=True)[0] + assert activation_op.x.dtype == types.fp16 + + # Before IOS17, both alpha and input/output are converted to fp16. + # After IOS17, alpha is kept as fp32 because it supports mixed precision. 
+ expected_alpha_beta_dtype = types.fp16 + if kwargs.get("minimum_deployment_target", None) == ct.target.iOS17: + expected_alpha_beta_dtype = types.fp32 + if hasattr(activation_op, "alpha"): + assert activation_op.alpha.dtype == expected_alpha_beta_dtype + if hasattr(activation_op, "beta"): + assert activation_op.beta.dtype == expected_alpha_beta_dtype + + return results + + @pytest.mark.parametrize( + "compute_unit, backend, rank, minimum_deployment_target", itertools.product( compute_units, backends, - [rank for rank in range(1, 6)] + [rank for rank in range(1, 6)], + [None, ct.target.iOS17], ), ) - def test(self, compute_unit, backend, rank): + def test_elu(self, compute_unit, backend, rank, minimum_deployment_target): input_shape = np.random.randint(low=1, high=4, size=rank) @make_tf_graph([input_shape]) @@ -271,63 +299,62 @@ def build_model(x): input_values = [random_gen(input_shape, -1, 1)] input_dict = dict(zip(inputs, input_values)) - TensorFlowBaseTest.run_compare_tf( + self.run_compare_tf( model, input_dict, outputs, compute_unit=compute_unit, backend=backend, + minimum_deployment_target=minimum_deployment_target, + target_op="elu", ) - -class TestAddN(TensorFlowBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, rank, num_inputs", + "compute_unit, backend, rank, minimum_deployment_target", itertools.product( compute_units, backends, - list(range(6)), - [1, 3, 9], + [rank for rank in range(1, 6)], + [None, ct.target.iOS17], ), ) - def test(self, compute_unit, backend, rank, num_inputs): - if rank == 0: - pytest.skip('Rank 0 not supported by CoreML runtime') - + def test_leaky_relu(self, compute_unit, backend, rank, minimum_deployment_target): input_shape = np.random.randint(low=1, high=4, size=rank) - input_shapes = [input_shape[:] for _ in range(num_inputs)] - @make_tf_graph(input_shapes) - def build_model(*inputs): - return tf.raw_ops.AddN(inputs=inputs) + @make_tf_graph([input_shape]) + def build_model(x): + return tf.nn.leaky_relu(x, 0.2) model, inputs, outputs = build_model - input_values = [random_gen(shape, -1, 1) for shape in input_shapes] + + input_values = [random_gen(input_shape, -1, 1)] input_dict = dict(zip(inputs, input_values)) - TensorFlowBaseTest.run_compare_tf( + self.run_compare_tf( model, input_dict, outputs, compute_unit=compute_unit, backend=backend, + minimum_deployment_target=minimum_deployment_target, + target_op="leaky_relu", ) - -class TestAddOrdering(TensorFlowBaseTest): @pytest.mark.parametrize( - "compute_unit, backend", - itertools.product(compute_units, backends), + "compute_unit, backend, rank", + itertools.product(compute_units, backends, [rank for rank in range(1, 6)]), ) - def test(self, compute_unit, backend): - @make_tf_graph([(2, 3, 4), (2, 3, 4)]) - def build_model(x, y): - return tf.math.add(x, y) + def test_relu(self, compute_unit, backend, rank): + input_shape = np.random.randint(low=1, high=4, size=rank) + + @make_tf_graph([input_shape]) + def build_model(x): + return tf.nn.relu(x) model, inputs, outputs = build_model - input_values = [random_gen((2, 3, 4), -1, 1)] * 2 - input_dict = dict(zip(inputs, input_values)) - spec, _, _, _, _, _ = TensorFlowBaseTest.run_compare_tf( + input_values = [random_gen(input_shape, -10.0, 10)] + input_dict = dict(zip(inputs, input_values)) + self.run_compare_tf( model, input_dict, outputs, @@ -335,38 +362,22 @@ def build_model(x, y): backend=backend, ) - if backend[0] == "neuralnetwork": - nn_spec = spec.neuralNetwork - if _HAS_TF_1: - input_names = ["Placeholder", "Placeholder_1"] - elif 
_HAS_TF_2: - input_names = ["args_0", "args_1"] - - assert nn_spec.layers[0].input[0] == input_names[0] - assert nn_spec.layers[0].input[1] == input_names[1] - - -class TestActivationLeakyReLU(TensorFlowBaseTest): @pytest.mark.parametrize( "compute_unit, backend, rank", - itertools.product( - compute_units, - backends, - [rank for rank in range(1, 6)] - ), + itertools.product(compute_units, backends, [rank for rank in range(1, 6)]), ) - def test(self, compute_unit, backend, rank): + def test_relu6(self, compute_unit, backend, rank): input_shape = np.random.randint(low=1, high=4, size=rank) @make_tf_graph([input_shape]) def build_model(x): - return tf.nn.leaky_relu(x, 0.2) + return tf.nn.relu6(x) model, inputs, outputs = build_model input_values = [random_gen(input_shape, -1, 1)] input_dict = dict(zip(inputs, input_values)) - TensorFlowBaseTest.run_compare_tf( + self.run_compare_tf( model, input_dict, outputs, @@ -374,28 +385,22 @@ def build_model(x): backend=backend, ) - -class TestActivationReLU(TensorFlowBaseTest): @pytest.mark.parametrize( "compute_unit, backend, rank", - itertools.product( - compute_units, - backends, - [rank for rank in range(1, 6)] - ), + itertools.product(compute_units, backends, [rank for rank in range(1, 6)]), ) - def test(self, compute_unit, backend, rank): + def test_sigmoid(self, compute_unit, backend, rank): input_shape = np.random.randint(low=1, high=4, size=rank) @make_tf_graph([input_shape]) def build_model(x): - return tf.nn.relu(x) + return tf.math.sigmoid(x) model, inputs, outputs = build_model - input_values = [random_gen(input_shape, -10.0, 10)] + input_values = [random_gen(input_shape, -1, 1)] input_dict = dict(zip(inputs, input_values)) - TensorFlowBaseTest.run_compare_tf( + self.run_compare_tf( model, input_dict, outputs, @@ -403,28 +408,22 @@ def build_model(x): backend=backend, ) - -class TestActivationReLU6(TensorFlowBaseTest): @pytest.mark.parametrize( "compute_unit, backend, rank", - itertools.product( - compute_units, - backends, - [rank for rank in range(1, 6)] - ), + itertools.product(compute_units, backends, [rank for rank in range(1, 6)]), ) - def test(self, compute_unit, backend, rank): + def test_softplus(self, compute_unit, backend, rank): input_shape = np.random.randint(low=1, high=4, size=rank) @make_tf_graph([input_shape]) def build_model(x): - return tf.nn.relu6(x) + return tf.math.softplus(x) model, inputs, outputs = build_model input_values = [random_gen(input_shape, -1, 1)] input_dict = dict(zip(inputs, input_values)) - TensorFlowBaseTest.run_compare_tf( + self.run_compare_tf( model, input_dict, outputs, @@ -432,88 +431,50 @@ def build_model(x): backend=backend, ) - -class TestGelu(TensorFlowBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, rank, mode", + "compute_unit, backend, rank_and_axes", itertools.product( compute_units, backends, - [rank for rank in range(2, 3)], - ("tanh_approx", "exact_1", "exact_2", "exact_3") + [(rank, axis) for rank in range(1, 6) for axis in range(-1, rank)], ), ) - def test(self, compute_unit, backend, rank, mode): + def test_softmax(self, compute_unit, backend, rank_and_axes): + rank, axis = rank_and_axes input_shape = np.random.randint(low=1, high=4, size=rank) @make_tf_graph([input_shape]) - def build_model_tanh_approx(x): - a = 0.5 * ( - 1.0 + tf.tanh((math.sqrt(2 / math.pi) * (x + 0.044715 * tf.pow(x, 3)))) - ) - return a * x - - @make_tf_graph([input_shape]) - def build_model_exact_1(x): - return x * (0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0)))) - - 
@make_tf_graph([input_shape]) - def build_model_exact_2(x): - return 0.5 * (x * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0)))) - - @make_tf_graph([input_shape]) - def build_model_exact_3(x): - return (x * 0.5) * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0))) - - if mode == "tanh_approx": - build_model = build_model_tanh_approx - elif mode == "exact_1": - build_model = build_model_exact_1 - elif mode == "exact_2": - build_model = build_model_exact_2 - elif mode == "exact_3": - build_model = build_model_exact_3 - else: - raise ValueError("Unexpected mode for Gelu layer") + def build_model(x): + return tf.nn.softmax(x, axis=axis) model, inputs, outputs = build_model - input_values = [random_gen(input_shape, -5, 5)] + input_values = [random_gen(input_shape, -1, 1)] input_dict = dict(zip(inputs, input_values)) - spec, mlmodel, _, _, _, _ = TensorFlowBaseTest.run_compare_tf( + self.run_compare_tf( model, input_dict, outputs, compute_unit=compute_unit, backend=backend, ) - assert TestGelu._op_count_in_mil_program(mlmodel, "gelu") == 1 - assert TestGelu._op_count_in_mil_program(mlmodel, "erf") == 0 - assert TestGelu._op_count_in_mil_program(mlmodel, "pow") == 0 - assert TestGelu._op_count_in_mil_program(mlmodel, "tanh") == 0 - -class TestActivationSigmoid(TensorFlowBaseTest): @pytest.mark.parametrize( "compute_unit, backend, rank", - itertools.product( - compute_units, - backends, - [rank for rank in range(1, 6)] - ), + itertools.product(compute_units, backends, [rank for rank in range(1, 6)]), ) - def test(self, compute_unit, backend, rank): + def test_softsign(self, compute_unit, backend, rank): input_shape = np.random.randint(low=1, high=4, size=rank) @make_tf_graph([input_shape]) def build_model(x): - return tf.math.sigmoid(x) + return tf.math.softsign(x) model, inputs, outputs = build_model input_values = [random_gen(input_shape, -1, 1)] input_dict = dict(zip(inputs, input_values)) - TensorFlowBaseTest.run_compare_tf( + self.run_compare_tf( model, input_dict, outputs, @@ -521,56 +482,60 @@ def build_model(x): backend=backend, ) - -class TestActivationSoftPlus(TensorFlowBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, rank", + "compute_unit, backend, rank, minimum_deployment_target", itertools.product( compute_units, backends, - [rank for rank in range(1, 6)] + [rank for rank in range(1, 6)], + [None, ct.target.iOS17], ), ) - def test(self, compute_unit, backend, rank): + def test_selu(self, compute_unit, backend, rank, minimum_deployment_target): input_shape = np.random.randint(low=1, high=4, size=rank) @make_tf_graph([input_shape]) def build_model(x): - return tf.math.softplus(x) + return tf.nn.selu(x) model, inputs, outputs = build_model - input_values = [random_gen(input_shape, -1, 1)] + input_values = [random_gen(input_shape, -1.0, 1.0)] input_dict = dict(zip(inputs, input_values)) - TensorFlowBaseTest.run_compare_tf( + self.run_compare_tf( model, input_dict, outputs, compute_unit=compute_unit, backend=backend, + minimum_deployment_target=minimum_deployment_target, + target_op="elu", ) -class TestActivationSoftmax(TensorFlowBaseTest): +class TestAddN(TensorFlowBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, rank_and_axes", + "compute_unit, backend, rank, num_inputs", itertools.product( compute_units, backends, - [(rank, axis) for rank in range(1, 6) for axis in range(-1, rank)], + list(range(6)), + [1, 3, 9], ), ) - def test(self, compute_unit, backend, rank_and_axes): - rank, axis = rank_and_axes + def test(self, compute_unit, backend, rank, num_inputs): + if rank 
== 0: + pytest.skip('Rank 0 not supported by CoreML runtime') + input_shape = np.random.randint(low=1, high=4, size=rank) + input_shapes = [input_shape[:] for _ in range(num_inputs)] - @make_tf_graph([input_shape]) - def build_model(x): - return tf.nn.softmax(x, axis=axis) + @make_tf_graph(input_shapes) + def build_model(*inputs): + return tf.raw_ops.AddN(inputs=inputs) model, inputs, outputs = build_model - - input_values = [random_gen(input_shape, -1, 1)] + input_values = [random_gen(shape, -1, 1) for shape in input_shapes] input_dict = dict(zip(inputs, input_values)) TensorFlowBaseTest.run_compare_tf( model, @@ -581,27 +546,21 @@ def build_model(x): ) -class TestActivationSoftSign(TensorFlowBaseTest): +class TestAddOrdering(TensorFlowBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, rank", - itertools.product( - compute_units, - backends, - [rank for rank in range(1, 6)] - ), + "compute_unit, backend", + itertools.product(compute_units, backends), ) - def test(self, compute_unit, backend, rank): - input_shape = np.random.randint(low=1, high=4, size=rank) - - @make_tf_graph([input_shape]) - def build_model(x): - return tf.math.softsign(x) + def test(self, compute_unit, backend): + @make_tf_graph([(2, 3, 4), (2, 3, 4)]) + def build_model(x, y): + return tf.math.add(x, y) model, inputs, outputs = build_model - - input_values = [random_gen(input_shape, -1, 1)] + input_values = [random_gen((2, 3, 4), -1, 1)] * 2 input_dict = dict(zip(inputs, input_values)) - TensorFlowBaseTest.run_compare_tf( + + spec, _, _, _, _, _ = TensorFlowBaseTest.run_compare_tf( model, input_dict, outputs, @@ -609,34 +568,74 @@ def build_model(x): backend=backend, ) + if backend[0] == "neuralnetwork": + nn_spec = spec.neuralNetwork + if _HAS_TF_1: + input_names = ["Placeholder", "Placeholder_1"] + elif _HAS_TF_2: + input_names = ["args_0", "args_1"] + + assert nn_spec.layers[0].input[0] == input_names[0] + assert nn_spec.layers[0].input[1] == input_names[1] -class TestActivationSelu(TensorFlowBaseTest): +class TestGelu(TensorFlowBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, rank", + "compute_unit, backend, rank, mode", itertools.product( compute_units, backends, - [rank for rank in range(1, 6)] + [rank for rank in range(2, 3)], + ("tanh_approx", "exact_1", "exact_2", "exact_3") ), ) - def test(self, compute_unit, backend, rank): + def test(self, compute_unit, backend, rank, mode): input_shape = np.random.randint(low=1, high=4, size=rank) @make_tf_graph([input_shape]) - def build_model(x): - return tf.nn.selu(x) + def build_model_tanh_approx(x): + a = 0.5 * ( + 1.0 + tf.tanh((math.sqrt(2 / math.pi) * (x + 0.044715 * tf.pow(x, 3)))) + ) + return a * x + + @make_tf_graph([input_shape]) + def build_model_exact_1(x): + return x * (0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0)))) + + @make_tf_graph([input_shape]) + def build_model_exact_2(x): + return 0.5 * (x * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0)))) + + @make_tf_graph([input_shape]) + def build_model_exact_3(x): + return (x * 0.5) * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0))) + + if mode == "tanh_approx": + build_model = build_model_tanh_approx + elif mode == "exact_1": + build_model = build_model_exact_1 + elif mode == "exact_2": + build_model = build_model_exact_2 + elif mode == "exact_3": + build_model = build_model_exact_3 + else: + raise ValueError("Unexpected mode for Gelu layer") model, inputs, outputs = build_model - input_values = [random_gen(input_shape, -1.0, 1.0)] + input_values = [random_gen(input_shape, -5, 5)] input_dict = 
dict(zip(inputs, input_values)) - TensorFlowBaseTest.run_compare_tf( + spec, mlmodel, _, _, _, _ = TensorFlowBaseTest.run_compare_tf( model, input_dict, outputs, compute_unit=compute_unit, backend=backend, ) + assert TestGelu._op_count_in_mil_program(mlmodel, "gelu") == 1 + assert TestGelu._op_count_in_mil_program(mlmodel, "erf") == 0 + assert TestGelu._op_count_in_mil_program(mlmodel, "pow") == 0 + assert TestGelu._op_count_in_mil_program(mlmodel, "tanh") == 0 class Testlog1p(TensorFlowBaseTest): @@ -2210,7 +2209,7 @@ class TestEinsum(TensorFlowBaseTest): ) ) def test(self, compute_unit, backend, equation, reverse_input_order): - input_shapes, _ = gen_input_shapes_einsum(equation, False) + input_shapes, _ = gen_input_shapes_einsum(equation, False, backend) if _HAS_TF_1: if len(set(input_shapes[0])) < len(input_shapes[0]) or len(set(input_shapes[1])) < len(input_shapes[1]): pytest.skip("tf1 does not support diagonal cases") @@ -2606,7 +2605,8 @@ def build_model(x): assert len(layer.upsample.fractionalScalingFactor) == 0 @pytest.mark.parametrize( - "compute_unit, backend, input_shape, num_of_crops, crop_size, method, dynamic, extrapolation_value", + "compute_unit, backend, input_shape, num_of_crops, crop_size, method, dynamic, " + "extrapolation_value, minimum_deployment_target", itertools.product( compute_units, backends, @@ -2616,6 +2616,7 @@ def build_model(x): ["bilinear"], [False, True], [0.0, 1.0], + [None, ct.target.iOS17], ), ) def test_crop_and_resize( @@ -2628,20 +2629,14 @@ def test_crop_and_resize( method, dynamic, extrapolation_value, + minimum_deployment_target, ): - if backend[0] == "mlprogram" and compute_unit != ct.ComputeUnit.CPU_ONLY and crop_size == (1, 1): - # in this case, there is a numerical mismatch on the GPU MIL backend. The GPU runtime tests are - # tracked seprately. - return - if extrapolation_value != 0.0: - if backend[0] == "neuralnetwork": - pytest.xfail("pad_value not availabe in neural network backend.") - if ct.utils._macos_version() < (13, 0): - pytest.skip("pad_value not supported in macOS12 or older.") - minimum_deployment_target = ct.target.iOS16 - else: - minimum_deployment_target = None + if minimum_deployment_target is None or minimum_deployment_target < ct.target.iOS16: + pytest.skip( + "extrapolation_value (corresponds to `pad_value` in MIL crop_resize op) only " + "supported in IOS16+." + ) # rdar://98749492 (crop_resize is unstable for cropping out of bound setting in fp16) if backend[0] == "mlprogram": @@ -2706,7 +2701,7 @@ def build_model(x, boxes_pl, box_indices_pl): test_dynamic() if dynamic else test_static() @pytest.mark.parametrize( - "compute_unit, backend, width, height, strides, sizes, padding,", + "compute_unit, backend, width, height, strides, sizes, padding, minimum_deployment_target", itertools.product( compute_units, backends, @@ -2715,10 +2710,19 @@ def build_model(x, boxes_pl, box_indices_pl): [(1, 1), (2, 1), (3, 5)], [(1, 1), (1, 2), (5, 4)], ["VALID", "SAME"], + [None, ct.target.iOS17], ), ) def test_extract_patches( - self, compute_unit, backend, width, height, strides, sizes, padding + self, + compute_unit, + backend, + width, + height, + strides, + sizes, + padding, + minimum_deployment_target, ): # TODO: theoritically, the current extractpatches code handle batch size rather than 1, # but there seems to have a bug in crop_resize when using GPU and batch_size > 1. 
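Aside on the TestGelu checks above: all four TF formulations are expected to collapse into a single MIL `gelu` op, which works because the tanh approximation and the exact erf form are numerically interchangeable. A minimal standalone sketch (plain NumPy/math, outside the test harness) illustrating that agreement:

    import math

    import numpy as np

    x = np.linspace(-3.0, 3.0, 101)

    # Exact GELU, as in build_model_exact_1/2/3: x * 0.5 * (1 + erf(x / sqrt(2)))
    erf = np.vectorize(math.erf)
    gelu_exact = x * 0.5 * (1.0 + erf(x / math.sqrt(2.0)))

    # Tanh approximation, as in build_model_tanh_approx:
    # 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
    gelu_tanh = 0.5 * x * (1.0 + np.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * np.power(x, 3))))

    # The two agree to within ~1e-3 on this range, so fusing either pattern into one gelu op is safe.
    assert np.max(np.abs(gelu_exact - gelu_tanh)) < 1e-3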
@@ -2750,6 +2754,7 @@ def build_model(x): outputs, compute_unit=compute_unit, backend=backend, + minimum_deployment_target=minimum_deployment_target, ) @@ -2963,6 +2968,11 @@ class TestNormalization(TensorFlowBaseTest): ), ) def test_fused_batch_norm(self, compute_unit, backend, epsilon): + if backend[0] == "neuralnetwork" and epsilon == 1e-10 and platform.machine() == "x86_64": + pytest.xfail( + "rdar://108739991 ([CI][TF] re-enable batch norm unittest failing in Intel machines)" + ) + # TensorFlow's FusedBatchNorm is only for 4D inputs input_shape = np.random.randint(low=1, high=4, size=4) attr_shape = [list(input_shape)[-1]] @@ -3666,6 +3676,7 @@ def build_model(x): else: test_tf_reduction() + class TestGather(TensorFlowBaseTest): @pytest.mark.parametrize( "compute_unit, backend, rankX_rankIndices_axis, mode", @@ -3719,6 +3730,57 @@ def build_model(x, indices): backend=backend, ) + @pytest.mark.parametrize( + "compute_unit, backend, mode", + itertools.product( + compute_units, + backends, + ["Gather", "GatherV2", "gather"], + ), + ) + def test_gather_invalid_indices(self, compute_unit, backend, mode): + """ + This test is to verify that TensorFlow Gather op doesn't allow negative nor out-of-range + indices, so don't need mb.select for IOS17 mb.gather when lowering TensorFlow gather op. + Use TensorFlowBaseTest.run_compare_tf to make this test compatible with both TF1 and TF2. + """ + + @make_tf_graph([[4, tf.int32]]) + def build_model(indices): + params = tf.constant([0.0, 1.0, 2.0, 3.0, 4.0, 5.0]) + if mode == "Gather": + res = tf.raw_ops.Gather(params=params, indices=indices) + elif mode == "GatherV2": + res = tf.raw_ops.GatherV2(params=params, indices=indices, axis=0) + elif mode == "gather": + res = tf.gather(params, indices) + else: + raise ValueError(f"Unsupported mode: {mode}") + return res + + model, inputs, outputs = build_model + + with pytest.raises(tf.errors.InvalidArgumentError, match="-1 is not in \[0, 6\)"): + # Negative indices will error out. + input_dict = dict(zip(inputs, [np.array([2, 0, -1, 5], dtype=np.int32)])) + TensorFlowBaseTest.run_compare_tf( + model, + input_dict, + outputs, + compute_unit=compute_unit, + backend=backend, + ) + with pytest.raises(tf.errors.InvalidArgumentError, match="6 is not in \[0, 6\)"): + # Out-of-range indices will error out. + input_dict = dict(zip(inputs, [np.array([2, 0, 1, 6], dtype=np.int32)])) + TensorFlowBaseTest.run_compare_tf( + model, + input_dict, + outputs, + compute_unit=compute_unit, + backend=backend, + ) + @pytest.mark.parametrize( "compute_unit, backend, rankX_rankIndices_axis_batchdims, mode", itertools.product( @@ -3871,21 +3933,68 @@ def build_model(x, indices): minimum_deployment_target=ct.target.iOS16 if backend[0] == "mlprogram" else None ) + @pytest.mark.parametrize( + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), + ) + def test_gather_nd_invalid_indices(self, compute_unit, backend): + """ + This test is to verify that TensorFlow GatherNd op doesn't allow negative nor out-of-range + indices, so don't need mb.select for IOS17 mb.gather when lowering TensorFlow GatherNd op. + Use TensorFlowBaseTest.run_compare_tf to make this test compatible with both TF1 and TF2. 
+ """ + + @make_tf_graph([[2, 2, tf.int32]]) + def build_model(indices): + params = tf.constant([[0.0, 1.0], [2.0, 3.0]]) + return tf.gather_nd(params, indices) + + model, inputs, outputs = build_model + + with pytest.raises( + tf.errors.InvalidArgumentError, + match="\[1, -1\] does not index into param shape \[2,2\]", + ): + # Negative indices will error out. + input_dict = dict(zip(inputs, [np.array([[0, 0], [1, -1]], dtype=np.int32)])) + TensorFlowBaseTest.run_compare_tf( + model, + input_dict, + outputs, + compute_unit=compute_unit, + backend=backend, + ) + with pytest.raises( + tf.errors.InvalidArgumentError, match="\[2, 0\] does not index into param shape \[2,2\]" + ): + # Out-of-range indices will error out. + input_dict = dict(zip(inputs, [np.array([[2, 0], [1, 1]], dtype=np.int32)])) + TensorFlowBaseTest.run_compare_tf( + model, + input_dict, + outputs, + compute_unit=compute_unit, + backend=backend, + ) + class TestScatter(TensorFlowBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, data_rank, indices_rank", + "compute_unit, backend, data_rank, indices_rank, minimum_deployment_target", itertools.product( compute_units, backends, list(range(1, 4)), list(range(2, 4)), + [None, ct.target.iOS17], ), ) def test_scatter_nd_with_zeros( - self, compute_unit, backend, data_rank, indices_rank + self, compute_unit, backend, data_rank, indices_rank, minimum_deployment_target ): - shape = np.random.randint(low=2, high=4, size=data_rank).astype(np.int32) indices_shape = np.random.randint(low=2, high=4, size=indices_rank) indices_shape[-1] = np.random.randint(low=1, high=data_rank + 1) @@ -3914,12 +4023,67 @@ def build_model(indices, updates, shape): outputs, compute_unit=compute_unit, backend=backend, + minimum_deployment_target=minimum_deployment_target, ) + @pytest.mark.parametrize( + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), + ) + def test_scatter_nd_with_invalid_indices(self, compute_unit, backend): + shape = np.random.randint(low=2, high=4, size=3).astype(np.int32) + indices_shape = np.random.randint(low=2, high=4, size=3) + indices_shape[-1] = np.random.randint(low=1, high=4) + updates_shape = list(indices_shape[:-1]) + list(shape[indices_shape[-1] :]) + + updates = np.random.rand(*updates_shape).astype(np.int32) + neg_indices_list = [] + for i in range(indices_shape[-1]): + neg_indices_list.append(np.random.randint(-shape[i], 0, size=indices_shape[:-1])) + indices = np.stack(neg_indices_list, axis=-1).astype(np.int32) + + @make_tf_graph( + [list(indices.shape) + [tf.int32], updates_shape + [tf.int32], [3, tf.int32]] + ) + def build_model(indices, updates, shape): + return tf.raw_ops.ScatterNd(indices=indices, updates=updates, shape=shape) + + model, inputs, outputs = build_model + + # TensorFlow ScatterNd doesn't support negative indices. + with pytest.raises(tf.errors.InvalidArgumentError, match="does not index into shape"): + TensorFlowBaseTest.run_compare_tf( + model, + dict(zip(inputs, [indices, updates, shape])), + outputs, + compute_unit=compute_unit, + backend=backend, + ) + + out_of_range_indices_list = [] + for i in range(indices_shape[-1]): + out_of_range_indices_list.append( + np.random.randint(shape[i], shape[i] * 2, size=indices_shape[:-1]) + ) + indices = np.stack(out_of_range_indices_list, axis=-1).astype(np.int32) + + # TensorFlow ScatterNd doesn't support out of range indices. 
+ with pytest.raises(tf.errors.InvalidArgumentError, match="does not index into shape"): + TensorFlowBaseTest.run_compare_tf( + model, + dict(zip(inputs, [indices, updates, shape])), + outputs, + compute_unit=compute_unit, + backend=backend, + ) + class TestTensorScatterAdd(TensorFlowBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, tensor_rank, indices_rank", + "compute_unit, backend, tensor_rank, indices_rank, minimum_deployment_target", itertools.product( compute_units, backends, @@ -3927,10 +4091,11 @@ class TestTensorScatterAdd(TensorFlowBaseTest): # and Core ML only supports updates_rank < 6, # so we constrain tensor_rank + indices_rank - 2 < 6 [tensor_rank for tensor_rank in range(1, 5)], - [indices_rank for indices_rank in range(2, 4)] + [indices_rank for indices_rank in range(2, 4)], + [None, ct.target.iOS17], ), ) - def test(self, compute_unit, backend, tensor_rank, indices_rank): + def test_scatter_add(self, compute_unit, backend, tensor_rank, indices_rank, minimum_deployment_target): # To avoid indexing out of bound: # tensor size for each dimension >= MIN_TENSOR_SIZE # index for each dimension < MIN_TENSOR_SIZE @@ -3968,8 +4133,73 @@ def build_model(tensor, indices, updates): outputs, compute_unit=compute_unit, backend=backend, + minimum_deployment_target=minimum_deployment_target, ) + @pytest.mark.parametrize( + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), + ) + def test_scatter_add_invalid_indices(self, compute_unit, backend): + # To avoid indexing out of bound: + # tensor size for each dimension >= MIN_TENSOR_SIZE + # index for each dimension < MIN_TENSOR_SIZE + MIN_TENSOR_SIZE = 3 + + tensor_rank = 3 + indices_rank = 3 + tensor_shape = np.random.randint(low=MIN_TENSOR_SIZE, high=9, size=tensor_rank) + # indices shape constraint: 0 < indices_shape[-1] <= tensor_rank + indices_shape = np.random.randint(low=1, high=tensor_rank + 1, size=indices_rank) + + updates_shape = [] + for i in range(indices_rank - 1): + updates_shape.append(indices_shape[i]) + for i in range(indices_shape[-1], tensor_rank): + updates_shape.append(tensor_shape[i]) + updates_shape = np.array(updates_shape) + + @make_tf_graph([tensor_shape, list(indices_shape) + [tf.int32], updates_shape]) + def build_model(tensor, indices, updates): + return tf.tensor_scatter_nd_add(tensor, indices, updates) + + model, inputs, outputs = build_model + + # TensorFlow tensor_scatter_nd_add doesn't support negative indices. + neg_indices = random_gen(indices_shape, rand_min=-3, rand_max=-1, dtype=np.int32) + input_values = [ + random_gen(tensor_shape, rand_min=-1.0, rand_max=1.0), + neg_indices, + random_gen(updates_shape, rand_min=-1.0, rand_max=1.0), + ] + with pytest.raises(tf.errors.InvalidArgumentError, match="does not index into shape"): + TensorFlowBaseTest.run_compare_tf( + model, + dict(zip(inputs, input_values)), + outputs, + compute_unit=compute_unit, + backend=backend, + ) + + # TensorFlow tensor_scatter_nd_add doesn't support out of range indices. 
+ out_of_range_indices = random_gen(indices_shape, rand_min=10, rand_max=20, dtype=np.int32) + input_values = [ + random_gen(tensor_shape, rand_min=-1.0, rand_max=1.0), + out_of_range_indices, + random_gen(updates_shape, rand_min=-1.0, rand_max=1.0), + ] + with pytest.raises(tf.errors.InvalidArgumentError, match="does not index into shape"): + TensorFlowBaseTest.run_compare_tf( + model, + dict(zip(inputs, input_values)), + outputs, + compute_unit=compute_unit, + backend=backend, + ) + class TestSliceByIndex(TensorFlowBaseTest): @pytest.mark.parametrize( @@ -3982,6 +4212,10 @@ class TestSliceByIndex(TensorFlowBaseTest): ), ) def test_slice_by_index_simple(self, compute_unit, backend, rank, masking_type): + if backend[0] == "mlprogram": + pytest.xfail( + "rdar://109854221 ([Bug][Regression] slice_by_index is throwing expection through E5ML - Follow up radar)" + ) input_shape = np.random.randint(low=2, high=4, size=rank) begin_val = np.array( [ @@ -4698,6 +4932,11 @@ def test_non_max_suppression( "number of boxes is large)" ) + if backend[0] == "mlprogram": + # force we are using fp16 for mlprogram, until this radar is fix: + # rdar://109871491 ([Bug][CI][Regression] Numerical regression on E5ML for nms layers) + backend = ("mlprogram", "fp32") + boxes_val = random_gen(shape=(num_boxes, 4), rand_min=0, rand_max=32) # When the input score is too close, the returned index order is not guaranteed. # So instead of generating random scores by rand, use shuffle. @@ -5119,6 +5358,32 @@ def build_model(x): backend=backend, ) + @pytest.mark.parametrize( + "compute_unit, backend", + itertools.product(compute_units, backends), + ) + def test_tile_invalid(self, compute_unit, backend): + """TF doesn't support tile where `multiples` have different length than x's rank.""" + x_shape = (2, 3, 4) + + with pytest.raises(ValueError, match="Shape must be rank 3 but is rank 2"): + + @make_tf_graph([x_shape]) + def build_model(x): + return tf.tile(x, multiples=[1, 2]) + + model, inputs, outputs = build_model + input_values = [random_gen(x_shape)] + input_dict = dict(zip(inputs, input_values)) + TensorFlowBaseTest.run_compare_tf( + model, + input_dict, + outputs, + compute_unit=compute_unit, + backend=backend, + ) + + class TestDynamicTile(TensorFlowBaseTest): @pytest.mark.parametrize( "compute_unit, backend, rank", @@ -5568,11 +5833,13 @@ def build_model(x): backend=backend, ) + class TestReshape(TensorFlowBaseTest): @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend, minimum_deployment_target", + itertools.product(compute_units, backends, [None, ct.target.iOS17]), ) - def test_flatten(self, compute_unit, backend): + def test_flatten(self, compute_unit, backend, minimum_deployment_target): shapes = [[2, 2], [3, 2, 1, 2], [2, 1, 4, 3]] for input_shape in shapes: @@ -5591,10 +5858,11 @@ def build_model(x): outputs, compute_unit=compute_unit, backend=backend, + minimum_deployment_target=minimum_deployment_target, ) @pytest.mark.parametrize( - "compute_unit, backend, input_shape", + "compute_unit, backend, input_shape, minimum_deployment_target", itertools.product( compute_units, backends, @@ -5603,9 +5871,10 @@ def build_model(x): ([3, 4, 5, 6], [4, 5, 3, 6]), ([4, 4, 5, 6], [2, 2, -1]), ], + [None, ct.target.iOS17], ), ) - def test_reshape_static(self, compute_unit, backend, input_shape): + def test_reshape_static(self, compute_unit, backend, input_shape, minimum_deployment_target): @make_tf_graph([input_shape[0]]) def build_model(x): return 
tf.reshape(x, shape=input_shape[1]) @@ -5620,10 +5889,11 @@ def build_model(x): outputs, compute_unit=compute_unit, backend=backend, + minimum_deployment_target=minimum_deployment_target, ) @pytest.mark.parametrize( - "compute_unit, backend, input_shape", + "compute_unit, backend, input_shape, minimum_deployment_target", itertools.product( compute_units, backends, @@ -5633,9 +5903,10 @@ def build_model(x): ([4, 4, 5, 6], [2, 2, -1]), ([2, 3, 5, 3], [2, -1]), ], + [None, ct.target.iOS17], ), ) - def test_reshape_dynamic(self, compute_unit, backend, input_shape): + def test_reshape_dynamic(self, compute_unit, backend, input_shape, minimum_deployment_target): @make_tf_graph([input_shape[0], (len(input_shape[1]), tf.int32)]) def build_model(x, y): return tf.reshape(x, shape=y) @@ -5656,14 +5927,12 @@ def build_model(x, y): ) @pytest.mark.parametrize( - "compute_unit, backend, shape", + "compute_unit, backend, shape, minimum_deployment_target", itertools.product( - compute_units, - backends, - [[1], [1, 1], [1, 1, -1], []], + compute_units, backends, [[1], [1, 1], [1, 1, -1], []], [None, ct.target.iOS17] ), ) - def test_reshape_scalar(self, compute_unit, backend, shape): + def test_reshape_scalar(self, compute_unit, backend, shape, minimum_deployment_target): pytest.skip('Rank 0 not supported by CoreML runtime') input_shape = () @@ -5682,8 +5951,10 @@ def build_model(x): outputs, compute_unit=compute_unit, backend=backend, + minimum_deployment_target=minimum_deployment_target, ) + class TestShape(TensorFlowBaseTest): @pytest.mark.parametrize( "compute_unit, backend, rank", @@ -6285,7 +6556,12 @@ def build_model(x): # Before rdar://93071454 (batch_to_space is error out in espresso for dynamic inputs cormel model) is fixed, # we need to specify the default shape for the dynamic model by setting inputs_for_conversion if dynamic: - shape = tuple([RangeDim(default=dim) for dim in input_shape]) + shape = tuple( + [ + RangeDim(default=dim, upper_bound=dim if backend[0] == "mlprogram" else -1) + for dim in input_shape + ] + ) inputs_for_conversion = [TensorType(shape=shape, dtype=np.float32)] else: inputs_for_conversion = None @@ -6330,7 +6606,12 @@ def build_model(x): # Before rdar://93071454 (batch_to_space is error out in espresso for dynamic inputs cormel model) is fixed, # we need to specify the default shape for the dynamic model by setting inputs_for_conversion if dynamic: - shape = tuple([RangeDim(default=dim) for dim in input_shape]) + shape = tuple( + [ + RangeDim(default=dim, upper_bound=dim if backend[0] == "mlprogram" else -1) + for dim in input_shape + ] + ) inputs_for_conversion = [TensorType(shape=shape, dtype=np.float32)] else: inputs_for_conversion = None @@ -6769,17 +7050,9 @@ def build_model(x): class TestIsFinite(TensorFlowBaseTest): @pytest.mark.parametrize( "compute_unit, backend, rank, dynamic", - itertools.product( - compute_units, - backends, - [rank for rank in range(5)], - [True, False] - ), + itertools.product(compute_units, backends, [rank for rank in range(1, 5)], [True, False]), ) def test(self, compute_unit, backend, rank, dynamic): - if rank == 0: - pytest.skip('Rank 0 not supported by CoreML runtime') - def _generate_num_with_inf(input_shape): res = random_gen(input_shape, rand_min=-1, rand_max=1) random_map = np.random.choice([np.inf, -np.inf, 0], size=input_shape) @@ -6885,15 +7158,16 @@ def build_model(x): class TestClipByValue(TensorFlowBaseTest): @pytest.mark.parametrize( - 'compute_unit, backend, rank, min_and_max', - itertools.product( - compute_units, - 
backends, - [rank for rank in range(5)], - [(-1, 1), (-1, -1), (1, 2), (-3, -2)], - ), + "compute_unit, backend, rank, min_and_max, minimum_deployment_target", + itertools.product( + compute_units, + backends, + [rank for rank in range(5)], + [(-1, 1), (-1, -1), (1, 2), (-3, -2)], + [None, ct.target.iOS17], + ), ) - def test(self, compute_unit, backend, rank, min_and_max): + def test(self, compute_unit, backend, rank, min_and_max, minimum_deployment_target): if rank == 0: pytest.skip('Rank 0 not supported by CoreML runtime') @@ -6913,7 +7187,8 @@ def build_model(x): input_dict, outputs, compute_unit=compute_unit, - backend=backend + backend=backend, + minimum_deployment_target=minimum_deployment_target, ) @@ -6954,11 +7229,7 @@ def build_model(x): input_values = [input_value] input_dict = dict(zip(inputs, input_values)) TensorFlowBaseTest.run_compare_tf( - model, - input_dict, - outputs, - compute_unit=compute_unit, - backend=backend + model, input_dict, outputs, compute_unit=compute_unit, backend=backend ) class TestAudioSpectrogram(TensorFlowBaseTest): diff --git a/coremltools/converters/mil/frontend/tensorflow/test/test_tf_conversion_api.py b/coremltools/converters/mil/frontend/tensorflow/test/test_tf_conversion_api.py index 3c820eb2a..9f4866f71 100644 --- a/coremltools/converters/mil/frontend/tensorflow/test/test_tf_conversion_api.py +++ b/coremltools/converters/mil/frontend/tensorflow/test/test_tf_conversion_api.py @@ -3,6 +3,7 @@ # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause +import itertools import os import tempfile @@ -11,11 +12,19 @@ import coremltools as ct from coremltools._deps import _HAS_TF_1, _HAS_TF_2, MSG_TF1_NOT_FOUND +from coremltools.converters.mil.testing_reqs import backends, compute_units from coremltools.converters.mil.testing_utils import ( - assert_cast_ops_count, assert_input_dtype, assert_ops_in_mil_program, - assert_output_dtype, assert_prog_input_type, assert_prog_output_type, - assert_spec_input_image_type, assert_spec_output_image_type, - get_op_types_in_program, verify_prediction) + assert_cast_ops_count, + assert_input_dtype, + assert_ops_in_mil_program, + assert_output_dtype, + assert_prog_input_type, + assert_prog_output_type, + assert_spec_input_image_type, + assert_spec_output_image_type, + get_op_types_in_program, + verify_prediction, +) from coremltools.proto import FeatureTypes_pb2 as ft from coremltools.test.api.test_api_examples import TestInputs as _TestInputs @@ -246,6 +255,53 @@ def test_input_wrongname(): expected_error = "Input ({}) provided is not found in given tensorflow graph. 
Placeholders in graph are: {}".format("wrong_input", ["input", "input_1"]) assert expected_error == str(e.value) + @pytest.mark.parametrize( + "backend, compute_unit", + itertools.product( + backends, + compute_units, + ), + ) + def test_input_dynamic_without_inputs_param(self, backend, compute_unit): + """The `inputs` param is not provided for a dynamic input (shape has `None`).""" + with tf.Graph().as_default() as graph: + x = tf.placeholder(tf.float32, shape=(None, None, 3), name="input") + x1 = tf.placeholder(tf.float32, shape=(1, 2, 3), name="input_1") + y = tf.nn.relu(x, name="output") + y1 = tf.nn.relu(x1, name="output_1") + + convert_to = backend[0] + if convert_to == "mlprogram": + with pytest.warns( + UserWarning, + match="Some dimensions in the input shape are unknown, hence they are set to " + "flexible ranges with lower bound and default value = 1, and upper bound = 2. " + "To set different values for the default shape and upper bound, please use " + "the ct.RangeDim.*", + ): + mlmodel = ct.convert( + graph, + convert_to=convert_to, + compute_units=compute_unit, + ) + else: + mlmodel = ct.convert( + graph, + convert_to=convert_to, + compute_units=compute_unit, + ) + + spec = mlmodel.get_spec() + assert list(spec.description.input[0].type.multiArrayType.shape) == [1, 1, 3] + assert ( + spec.description.input[0].type.multiArrayType.shapeRange.sizeRanges[1].lowerBound == 1 + ) + assert ( + spec.description.input[0].type.multiArrayType.shapeRange.sizeRanges[1].upperBound == -1 + if convert_to == "neuralnetwork" + else 2 + ) + @staticmethod @pytest.mark.skipif(not ct.utils._is_macos(), reason="test needs predictions") def test_tf_predict_input(): diff --git a/coremltools/converters/mil/frontend/tensorflow/test/testing_utils.py b/coremltools/converters/mil/frontend/tensorflow/test/testing_utils.py index 08dc9d9b6..5e0ff7511 100644 --- a/coremltools/converters/mil/frontend/tensorflow/test/testing_utils.py +++ b/coremltools/converters/mil/frontend/tensorflow/test/testing_utils.py @@ -12,8 +12,11 @@ import coremltools.models.utils as coremltoolsutils from coremltools._deps import _HAS_TF_2 from coremltools.converters.mil.testing_reqs import ct -from coremltools.converters.mil.testing_utils import (compare_backend, - ct_convert) +from coremltools.converters.mil.testing_utils import ( + compare_backend, + ct_convert, + validate_minimum_deployment_target, +) tf = pytest.importorskip("tensorflow", minversion="1.15.0") @@ -147,11 +150,31 @@ def tf_graph_to_mlmodel( input_names = get_tf_node_names(list(feed_dict.keys()), mode="inputs") output_names = get_tf_node_names(output_nodes, mode="outputs") input_values = {name: val for name, val in zip(input_names, feed_dict.values())} - - inputs = inputs_for_conversion if inputs_for_conversion is not None else None + + if inputs_for_conversion is None and backend[0] == "mlprogram": + # As mlprogram by default use a small upper-bound for dynamic shapes, set a larger one here + # to avoid test failures. 
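+        # Every placeholder dimension that is None is mapped to ct.RangeDim(upper_bound=64) below.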
+ has_dynamic_shape = False + input_types = [] + for input_placeholder in list(feed_dict.keys()): + input_shape = [ + ct.RangeDim(upper_bound=64) if dim.value is None else dim.value + for dim in input_placeholder.shape + ] + input_types.append( + ct.TensorType(name=input_placeholder.name.split(":")[0], shape=input_shape) + ) + if any([dim.value is None for dim in input_placeholder.shape]): + has_dynamic_shape = True + if has_dynamic_shape: + inputs_for_conversion = input_types mlmodel = ct_convert( - graph, inputs=inputs, outputs=output_names, source=frontend, convert_to=backend, + graph, + inputs=inputs_for_conversion, + outputs=output_names, + source=frontend, + convert_to=backend, compute_units=compute_unit, minimum_deployment_target=minimum_deployment_target, ) @@ -287,13 +310,13 @@ def run_compare_tf( pred = None if not coremltoolsutils._has_custom_layer(mlmodel._spec): pred = compare_backend( - mlmodel, - input_key_values, - expected_outputs, - atol=atol, - rtol=rtol, - also_compare_shapes=True, - dtype=backend[1], + mlmodel, + input_key_values, + expected_outputs, + atol=atol, + rtol=rtol, + also_compare_shapes=True, + dtype=backend[1], ) else: print('Skipping model prediction as it has a custom nn layer!') @@ -334,6 +357,8 @@ def run_compare_tf(graph, feed_dict, output_nodes, backend=("neuralnetwork", "fp32"), atol=1e-04, rtol=1e-05, freeze_graph=False, tf_outputs=None, minimum_deployment_target=None): + if minimum_deployment_target is not None: + validate_minimum_deployment_target(minimum_deployment_target, backend) res = run_compare_tf(graph, feed_dict, @@ -348,7 +373,7 @@ def run_compare_tf(graph, feed_dict, output_nodes, tf_outputs=tf_outputs, minimum_deployment_target=minimum_deployment_target ) - + alist = [] if res is not None: alist = list(res) @@ -361,13 +386,11 @@ def run_compare_tf(graph, feed_dict, output_nodes, def _op_count_in_mil_program(mlmodel, op_type): prog = mlmodel._mil_program return len(prog.find_ops(op_type=op_type)) - - + + if _HAS_TF_2: from coremltools.converters.mil.frontend.tensorflow2.test.testing_utils import ( TensorFlow2BaseTest, make_tf2_graph) - from coremltools.converters.mil.frontend.tensorflow.test.testing_utils import \ - TensorFlowBaseTest + from coremltools.converters.mil.frontend.tensorflow.test.testing_utils import TensorFlowBaseTest TensorFlowBaseTest.run_compare_tf = TensorFlow2BaseTest.run_compare_tf2 make_tf_graph = make_tf2_graph - diff --git a/coremltools/converters/mil/frontend/tensorflow2/load.py b/coremltools/converters/mil/frontend/tensorflow2/load.py index e7f2504bd..624475c0d 100644 --- a/coremltools/converters/mil/frontend/tensorflow2/load.py +++ b/coremltools/converters/mil/frontend/tensorflow2/load.py @@ -90,39 +90,38 @@ def __init__(self, model, debug=False, **kwargs): ] def _get_concrete_functions_and_graph_def(self): - msg = ( - "Expected model format: [SavedModel | [concrete_function] | " - "tf.keras.Model | .h5 | GraphDef], got {}" - ) - if ( - isinstance(self.model, list) - or isinstance(self.model, _tf.keras.Model) - or isinstance(self.model, str) - or isinstance(self.model, _tf.compat.v1.GraphDef) - ): - cfs = [] - if isinstance(self.model, list): - cfs = self.model - if isinstance(self.model, _tf.keras.Model): - cfs = self._concrete_fn_from_tf_keras_or_h5(self.model) - elif isinstance(self.model, _tf.compat.v1.GraphDef): - return None, self.model - elif isinstance(self.model, str): - if not _os_path.exists(self.model): - raise ValueError( - 'Input model "{}" does not exist'.format(self.model) - ) - elif 
_os_path.isfile(self.model) \ - and (self.model.endswith(".h5") or self.model.endswith(".hdf5")): - cfs = self._concrete_fn_from_tf_keras_or_h5(self.model) - elif _os_path.isdir(self.model): - saved_model = _tf.saved_model.load(self.model) - sv = saved_model.signatures.values() - cfs = sv if isinstance(sv, list) else list(sv) - else: - raise NotImplementedError(msg.format(self.model)) - else: - raise NotImplementedError(msg.format(self.model)) + if not isinstance(self.model, (list, str, _tf.keras.Model, _tf.compat.v1.GraphDef)): + raise NotImplementedError( + f"Expected model format: [SavedModel | concrete_function | " + f"tf.keras.Model | .h5 | GraphDef], got {self.model}" + ) + + cfs = [] + if isinstance(self.model, list): + cfs = self.model + if isinstance(self.model, _tf.keras.Model): + cfs = self._concrete_fn_from_tf_keras(self.model) + elif isinstance(self.model, _tf.compat.v1.GraphDef): + return None, self.model + elif isinstance(self.model, str): + if not _os_path.exists(self.model): + raise ValueError(f'Input model "{self.model}" does not exist') + elif _os_path.isfile(self.model) and ( + self.model.endswith(".h5") or self.model.endswith(".hdf5") + ): + # Keep a reference to loaded model, or it errors out due to variables deletion, see + # https://github.com/tensorflow/tensorflow/issues/37615#issuecomment-1552237114. + keras_model = _tf.keras.models.load_model(self.model) + cfs = self._concrete_fn_from_tf_keras(keras_model) + elif _os_path.isdir(self.model): + saved_model = _tf.saved_model.load(self.model) + sv = saved_model.signatures.values() + cfs = sv if isinstance(sv, list) else list(sv) + else: + raise ValueError( + f"Input model path should be .h5/.hdf5 file or a directory, but " + f"got {self.model}" + ) graph_def = self._graph_def_from_concrete_fn(cfs) @@ -311,9 +310,7 @@ def _dict_from_graph_def(graph, fn_name="main", sg_input_shapes=None): return graph_dict, graph_inputs, graph_outputs, graph_ret @staticmethod - def _concrete_fn_from_tf_keras_or_h5(keras_model): - if not isinstance(keras_model, _tf.keras.Model): - keras_model = _tf.keras.models.load_model(keras_model) + def _concrete_fn_from_tf_keras(keras_model: _tf.keras.Model): input_signature = _saving_utils.model_input_signature( keras_model, keep_original_batch_size=True ) diff --git a/coremltools/converters/mil/frontend/tensorflow2/test/test_tf2_conversion_api.py b/coremltools/converters/mil/frontend/tensorflow2/test/test_tf2_conversion_api.py index a004ed89a..71c9a3f89 100644 --- a/coremltools/converters/mil/frontend/tensorflow2/test/test_tf2_conversion_api.py +++ b/coremltools/converters/mil/frontend/tensorflow2/test/test_tf2_conversion_api.py @@ -5,16 +5,12 @@ import os import platform -import urllib -from io import BytesIO from os import chdir, getcwd from shutil import rmtree from tempfile import mkdtemp import numpy as np import pytest -import requests -from PIL import Image import coremltools as ct from coremltools.converters.mil.mil import types @@ -141,12 +137,12 @@ def teardown_class(self): chdir(self._cwd) if os.path.exists(self._temp_dir): rmtree(self._temp_dir) - + @staticmethod def test_convert_tf_keras_h5_file(): if platform.machine() == "arm64": pytest.xfail("rdar://101162740 ([CI] [TF] The tf_keras_h5_file API testing is failing on M1 with new OS)") - + for file_extension in ("h5", "hdf5"): x = tf.keras.Input(shape=(32,), name="input") y = tf.keras.layers.Dense(16, activation="softmax")(x) diff --git a/coremltools/converters/mil/frontend/tensorflow2/test/test_v2_load.py 
b/coremltools/converters/mil/frontend/tensorflow2/test/test_v2_load.py index 7e05b4ca5..14877a904 100644 --- a/coremltools/converters/mil/frontend/tensorflow2/test/test_v2_load.py +++ b/coremltools/converters/mil/frontend/tensorflow2/test/test_v2_load.py @@ -70,7 +70,7 @@ def test_keras_h5_file(self): source=frontend, ) assert mlmodel is not None - + def test_keras_hdf5_file(self): keras_model = tf.keras.Sequential( [tf.keras.layers.ReLU(input_shape=(4, 5), batch_size=3)] @@ -133,7 +133,7 @@ def __call__(self, x): [concrete_func], outputs=["Identity"], source=frontend ) assert mlmodel is not None - + def test_graphdef_from_tf_function(self): class build_model(tf.Module): def __init__(self): @@ -177,17 +177,16 @@ def test_model_metadata(self): assert "tensorflow==2." in metadata_keys["com.github.apple.coremltools.source"] def test_invalid_format_none(self): - with pytest.raises(NotImplementedError) as e: + with pytest.raises(NotImplementedError, match="Expected model format: .* .h5"): converter.convert(None, source=frontend) - e.match(r"Expected model format: .* .h5") def test_invalid_format_invalid_extension(self): - _, invalid_filename = tempfile.mkstemp( - suffix=".invalid", prefix=self.saved_model_dir - ) - with pytest.raises(NotImplementedError) as e: + _, invalid_filename = tempfile.mkstemp(suffix=".invalid", prefix=self.saved_model_dir) + with pytest.raises( + ValueError, + match="Input model path should be .h5/.hdf5 file or a directory, but got .*.invalid", + ): converter.convert(invalid_filename, source=frontend) - e.match(r"Expected model format: .* .h5") def test_invalid_format_multiple_concrete_functions(self): class build_model(tf.Module): @@ -199,9 +198,10 @@ def __call__(self, x): model = build_model() cf = model.__call__.get_concrete_function() - with pytest.raises(NotImplementedError) as e: + with pytest.raises( + NotImplementedError, match="Only a single concrete function is supported" + ): converter.convert([cf, cf, cf], source=frontend) - e.match(r"Only a single concrete function is supported") def test_invalid_converter_type(self): keras_model = tf.keras.Sequential( diff --git a/coremltools/converters/mil/frontend/tensorflow2/test/test_v2_ops_tf_keras.py b/coremltools/converters/mil/frontend/tensorflow2/test/test_v2_ops_tf_keras.py index 7dd167eb8..23e74dd68 100644 --- a/coremltools/converters/mil/frontend/tensorflow2/test/test_v2_ops_tf_keras.py +++ b/coremltools/converters/mil/frontend/tensorflow2/test/test_v2_ops_tf_keras.py @@ -527,6 +527,11 @@ def test_conv2d_padding_dynamic_input( TensorFlowBaseTest.run_compare_tf_keras( model, [random_gen((1, 80, 40, 1), rand_min=-10, rand_max=10)], + inputs_for_conversion=[ + ct.TensorType( + shape=(1, ct.RangeDim(upper_bound=80), ct.RangeDim(upper_bound=80), 1) + ) + ], compute_unit=compute_unit, backend=backend, ) @@ -1038,9 +1043,14 @@ class TestNormalization(TensorFlowBaseTest): def test_layer_normalization(self, compute_unit, backend, rank, axis, epsilon, dynamic): shape = np.random.randint(low=2, high=4, size=rank) keras_shape = shape.tolist() + inputs_for_conversion = None if dynamic: keras_shape[0] = None + if backend[0] == "mlprogram": + inputs_for_conversion = [ + ct.TensorType(shape=[ct.RangeDim(upper_bound=4)] + keras_shape[1:]) + ] model = tf.keras.Sequential( [ @@ -1052,6 +1062,7 @@ def test_layer_normalization(self, compute_unit, backend, rank, axis, epsilon, d TensorFlowBaseTest.run_compare_tf_keras( model, [random_gen(shape, rand_min=-100, rand_max=100)], + inputs_for_conversion=inputs_for_conversion, 
compute_unit=compute_unit, backend=backend, ) @@ -1690,9 +1701,16 @@ def test( kwargs = {"data_format": data_format} shape = np.random.randint(low=2, high=4, size=5) keras_shape = np.copy(shape).tolist() - # not support upsampling3D with dynamic input shape, since 6D tensors are produced in that case if dynamic: - return + pytest.skip( + "upsampling3D with dynamic input shape is not supported, since 6D tensors are produced in that case" + ) + + inputs_for_conversion = None + if backend[0] == "mlprogram" and dynamic: + inputs_for_conversion = [ + ct.TensorType(shape=[dim or ct.RangeDim(upper_bound=10) for dim in keras_shape]) + ] model = tf.keras.Sequential( [op(batch_input_shape=keras_shape, size=upsample_factor, **kwargs)] @@ -1700,6 +1718,7 @@ def test( spec = TensorFlowBaseTest.run_compare_tf_keras( model, [random_gen(shape, rand_min=-10, rand_max=10)], + inputs_for_conversion=inputs_for_conversion, compute_unit=compute_unit, backend=backend, )[0] diff --git a/coremltools/converters/mil/frontend/tensorflow2/test/testing_utils.py b/coremltools/converters/mil/frontend/tensorflow2/test/testing_utils.py index b80e5df1a..b922e4130 100644 --- a/coremltools/converters/mil/frontend/tensorflow2/test/testing_utils.py +++ b/coremltools/converters/mil/frontend/tensorflow2/test/testing_utils.py @@ -13,10 +13,15 @@ import coremltools as ct import coremltools.models.utils as coremltoolsutils from coremltools.converters.mil.frontend.tensorflow.test.testing_utils import ( - TensorFlowBaseTest, get_tf_node_names) + TensorFlowBaseTest, + get_tf_node_names, +) from coremltools.converters.mil.input_types import RangeDim, TensorType -from coremltools.converters.mil.testing_utils import (compare_backend, - ct_convert) +from coremltools.converters.mil.testing_utils import ( + compare_backend, + ct_convert, + validate_minimum_deployment_target, +) from coremltools.models.utils import _macos_version @@ -104,18 +109,22 @@ def run_compare_tf2( minimum_deployment_target: coremltools.target enumeration The spec version for the mlmodel """ + # Infinite upper-bound not allowed in mlprogram. 
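+    # upper_bound=-1 keeps the RangeDim unbounded for the neuralnetwork backend.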
+ symbolic_upper_bound = 20 if backend[0] == "mlprogram" else -1 + inputs = [] if inputs_for_conversion is None: cf_inputs = [t for t in model[0].inputs if t.dtype != dtypes.resource] for t in cf_inputs: name = get_tf_node_names(t.name)[0] - shape = [RangeDim() if s is None or s == -1 else s \ - for s in list(t.get_shape())] - inputs.append(TensorType(name=name, shape=shape, - dtype=t.dtype.as_numpy_dtype)) + shape = [ + RangeDim(upper_bound=symbolic_upper_bound) if s is None or s == -1 else s + for s in list(t.get_shape()) + ] + inputs.append(TensorType(name=name, shape=shape, dtype=t.dtype.as_numpy_dtype)) else: inputs = inputs_for_conversion - + outputs = [] for t in output_names: name = get_tf_node_names(t)[0] @@ -129,7 +138,7 @@ def run_compare_tf2( else: ref = [tf_outputs.numpy()] expected_outputs = {n: v for n, v in zip(outputs, ref)} - + mlmodel = ct_convert( model, source=frontend, @@ -152,13 +161,13 @@ def run_compare_tf2( pred = None if not coremltoolsutils._has_custom_layer(mlmodel._spec): pred = compare_backend( - mlmodel, - input_dict, - expected_outputs, - atol=atol, - rtol=rtol, - also_compare_shapes=True, - dtype=backend[1], + mlmodel, + input_dict, + expected_outputs, + atol=atol, + rtol=rtol, + also_compare_shapes=True, + dtype=backend[1], ) else: print('Skipping model prediction as it has a custom nn layer!') @@ -221,13 +230,13 @@ def run_compare_tf_keras( pred = None if not coremltoolsutils._has_custom_layer(proto): pred = compare_backend( - mlmodel, - input_key_values, - expected_outputs, - atol=atol, - rtol=rtol, - also_compare_shapes=True, - dtype=backend[1] + mlmodel, + input_key_values, + expected_outputs, + atol=atol, + rtol=rtol, + also_compare_shapes=True, + dtype=backend[1], ) else: print('Skipping model prediction as it has a custom nn layer!') @@ -237,30 +246,37 @@ def run_compare_tf_keras( class TensorFlow2BaseTest(TensorFlowBaseTest): @staticmethod - def run_compare_tf2(model, - input_dict, - output_names, - inputs_for_conversion=None, - compute_unit=ct.ComputeUnit.CPU_ONLY, - frontend_only=False, - frontend="tensorflow", - backend=("neuralnetwork", "fp32"), - debug=False, - atol=1e-04, - rtol=1e-05, - minimum_deployment_target=None,): - res = run_compare_tf2(model, - input_dict, - output_names, - inputs_for_conversion=inputs_for_conversion, - compute_unit=compute_unit, - frontend_only=frontend_only, - frontend=frontend, - backend=backend, - debug=debug, - atol=atol, - rtol=rtol, - minimum_deployment_target=minimum_deployment_target,) + def run_compare_tf2( + model, + input_dict, + output_names, + inputs_for_conversion=None, + compute_unit=ct.ComputeUnit.CPU_ONLY, + frontend_only=False, + frontend="tensorflow", + backend=("neuralnetwork", "fp32"), + debug=False, + atol=1e-04, + rtol=1e-05, + minimum_deployment_target=None, + ): + if minimum_deployment_target is not None: + validate_minimum_deployment_target(minimum_deployment_target, backend) + + res = run_compare_tf2( + model, + input_dict, + output_names, + inputs_for_conversion=inputs_for_conversion, + compute_unit=compute_unit, + frontend_only=frontend_only, + frontend=frontend, + backend=backend, + debug=debug, + atol=atol, + rtol=rtol, + minimum_deployment_target=minimum_deployment_target, + ) alist = list(res) alist.append(TensorFlow2BaseTest.testclassname) alist.append(TensorFlow2BaseTest.testmodelname) diff --git a/coremltools/converters/mil/frontend/torch/__init__.py b/coremltools/converters/mil/frontend/torch/__init__.py index 521d2e466..22897457e 100644 --- 
a/coremltools/converters/mil/frontend/torch/__init__.py +++ b/coremltools/converters/mil/frontend/torch/__init__.py @@ -8,6 +8,7 @@ register_torch_op = None if _HAS_TORCH: + from . import ops, quantization_ops from .dialect_ops import (torch_tensor_assign, torch_upsample_bilinear, torch_upsample_nearest_neighbor) from .torch_op_registry import register_torch_op diff --git a/coremltools/converters/mil/frontend/torch/converter.py b/coremltools/converters/mil/frontend/torch/converter.py index a7cdff804..f2c97e75e 100644 --- a/coremltools/converters/mil/frontend/torch/converter.py +++ b/coremltools/converters/mil/frontend/torch/converter.py @@ -14,6 +14,7 @@ from coremltools.converters.mil.input_types import ImageType from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import Function, Program, types +from coremltools.converters.mil.mil.types import is_float from .._utils import get_output_names from .internal_graph import InternalTorchIRGraph, InternalTorchIRNode @@ -40,6 +41,137 @@ mil_to_torch_types = {v: k for k, v in torch_to_mil_types.items()} +class QuantizationContext: + """ + Utilities to manage information pertaining to quantization of tensors in a PyTorch graph. + """ + + def __init__(self, context): + self._context = context + + # Maps var name to tuple of (torch dtype, scale, zero_point) + # zero_point is in a NumPy dtype corresponding to torch one (for e.g. np.uint8 for torch.quint8). + self._quant_param_map = {} + # In MIL Programs, if a MIL op doesn't support quantized I/O but the PyTorch ops do, + # we just use floating-point tensors after dequantization. This means that information about + # what dtype (int8/uint8) quantized tensors had in the PyTorch graph is not carried into + # in the MIL graph. + # To simplify, we only support a single dtype for activation quantizations throughout the + # incoming graph. + # The other option is to remember dtypes across ops, including MIL ones that don't support + # quantized I/O. We will need to be careful about edge cases like conflicting dtypes, etc. + self._quant_dtype = None + + def add_quantization_info(self, name, torch_dtype, scale, zero_point, axis=None): + """ + Stores the quantization parameters (torch dtype, scale, zero_point) corresponding to a named + var in the graph. + zero_point should be in a NumPy dtype corresponding to torch one (for e.g. np.uint8 for torch.quint8). + """ + self._quant_param_map[name] = (torch_dtype, scale, zero_point, axis) + + def get_quantization_info(self, name): + """ + Retrieves the information added via add_quantization_info, if applicable. + Returns None if quantization parameters could not be found. + """ + if name not in self._quant_param_map: + return None + return self._quant_param_map[name] + + def maybe_handle_quantized_inputs(self, node: InternalTorchIRNode): + """ + If a node's op doesn't support quantized inputs but gets one, this will wire it to + receive a dequantized version of it. + """ + + op_type = node.kind + if op_type in {"quantize_per_tensor", "dequantize"} or "quantized::" in op_type: + # Op can handle quantized inputs. Nothing to do here. + return + + for input_name in node.inputs: + if self.get_quantization_info(input_name) is None: + # Not a quantized tensor + continue + + # We need a dequantized version of the input to feed to the op. 
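+            # get_dequantized_var emits a MIL dequantize op, or returns the var unchanged if it is already float.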
+ dequantized_var, _ = self.get_dequantized_var(input_name) + node.replace_name(input_name, dequantized_var.name) + + def get_quantized_per_tensor(self, name, torch_dtype, scale, zero_point, quantized_name): + """ + Quantizes the provided named var as per quantization params. + zero_point will be cast to the appropriate dtype based on torch_dtype. + """ + if self._quant_dtype is None: + self._quant_dtype = torch_dtype + elif self._quant_dtype != torch_dtype: + raise NotImplementedError( + "Currently we only support a single activation dtype throughout the model" + ) + + if torch_dtype == torch.quint8: + zero_point = np.uint8(zero_point) + output_dtype = "uint8" + elif torch_dtype == torch.qint8: + zero_point = np.int8(zero_point) + output_dtype = "int8" + else: + raise ValueError(f"Invalid torch dtype for quantization: {torch_dtype}") + if np.isscalar(zero_point): + # MIL allows skipping zero_point if its zero. + if zero_point == 0: + zero_point = None + # TODO (rdar://107718371): skip 128 for uint8 by switching to int8 + + result = mb.quantize( + input=self._context[name], zero_point=zero_point, scale=scale, output_dtype=output_dtype + ) + self._context.add(result, quantized_name) + self._context.quant_context.add_quantization_info( + quantized_name, torch_dtype, scale, zero_point + ) + return result + + def get_dequantized_var(self, name: str, dequantized_name: str = None): + """ + Returns dequantized var & torch dtype corresponding to the named var. + """ + + original_var = self._context[name] + if is_float(original_var.dtype): + # Input doesn't need dequantization. + # This might happen if in the PyTorch graph the upstream nodes supported quantized inputs, + # but MIL does not. In that case, we already dequantized the vars before feeding them to + # the MIL op. + if dequantized_name is not None: + self._context.add(original_var, dequantized_name) + if self._quant_dtype is None: + raise AssertionError("Trying to dequantize without quantization info") + return original_var, self._quant_dtype + + quant_params = self.get_quantization_info(name) + if quant_params is None: + raise ValueError( + f"Could not find quantization parameters for quantized var {original_var.name}" + ) + torch_dtype, scale, zero_point, axis = quant_params + + # We add a new var corresponding to each dequantized value. + # This ensures the atomicity of quantized op patterns in MIL. + dequantized_var = mb.dequantize( + input=original_var, scale=scale, zero_point=zero_point, axis=axis + ) + if dequantized_name is not None: + dequantized_var_name = dequantized_name + else: + dequantized_var_name = dequantized_var.name + self._context.add(dequantized_var, dequantized_var_name) + + return dequantized_var, torch_dtype + + class TranscriptionContext: """ Maintains a map from torch operations to their MIL values @@ -51,13 +183,29 @@ class TranscriptionContext: def __init__(self, name=None): self.name = name if name else "" self._current_graph = [{}] + self._torch_graph = None + self._quant_context = QuantizationContext(self) + + @property + def torch_graph(self): + if self._torch_graph is None: + raise ValueError("InternalTorchIRGraph not set yet on context") + return self._torch_graph + + @property + def quant_context(self): + return self._quant_context + + @torch_graph.setter + def torch_graph(self, graph: InternalTorchIRGraph): + self._torch_graph = graph def prepare_for_conversion(self, node: InternalTorchIRNode): """ Perform any preparation necessary before node-specific frontend conversion is invoked. 
""" - pass + self.quant_context.maybe_handle_quantized_inputs(node) def add(self, ssa_var, torch_name=None): """ @@ -69,7 +217,7 @@ def add(self, ssa_var, torch_name=None): if torch_name is None: torch_name = ssa_var.name if torch_name in self._current_graph[-1]: - print("Torch var {} is added again.".format(torch_name)) + print(f"Torch var {torch_name} is added again.") return self._current_graph[-1][torch_name] = ssa_var @@ -83,9 +231,7 @@ def __getitem__(self, torch_name): current_graph = self._current_graph[idx] if torch_name in current_graph: return self._current_graph[idx][torch_name] - raise ValueError( - "Torch var {} not found in context {}".format(torch_name, self.name) - ) + raise ValueError(f"Torch var {torch_name} not found in context {self.name}") def __contains__(self, torch_name): """Returns whether or not the torch var exist in context.""" @@ -121,7 +267,7 @@ def __str__(self): shape_str = v.sym_shape() else: shape_str = "None" - __str += "%{} : {}\n".format(k, shape_str) + __str += f"%{k} : {shape_str}\n" _str += __str + "\n" return _str @@ -174,6 +320,7 @@ def __init__( self.graph = InternalTorchIRGraph( raw_graph, params_dict, self.inputs, cut_at_symbols ) + self.context.torch_graph = self.graph # TODO (rdar://106161395): Register Torch IR passes and unify them into the pass pipeline. # Apply Torch IR passes @@ -231,8 +378,16 @@ def check_ops(self): def convert_const(self): for name, val in self.graph.params.items(): - if not isinstance(val, np.ndarray): - raise ValueError("unsupported class for {} in PyTorch graph: {}".format(name, type(val))) + if isinstance(val, torch._C.ScriptObject): + logger.info(f"Encountered constant {name} of type _torch._C.ScriptObject") + continue + elif not isinstance(val, np.ndarray): + raise ValueError(f"unsupported class for {name} in PyTorch graph: {type(val)}") + # TODO (rdar://107718371): support uint8 quantization + # Some torch models store indices with uint8, which are unrelated to quantization and + # need to be cast to int32 since Core ML does not support int8. + # We need a way to distinguish whether an uint8 is quantization (so should be kept) + # or not (so should be cast to int32). if val.dtype == np.uint8: val = val.astype(np.int32) const = mb.const(val=val, name=name) @@ -289,18 +444,15 @@ def convert(self): # in Fairseq MT. for g in graph_outputs: if g is None: - msg = "Droping output {} which is None" - logger.warning(msg.format(g)) + logger.warning(f"Droping output {g} which is None") graph_outputs = [g for g in graph_outputs if g is not None] # Output renaming occurs if self.outputs is not None: if len(self.outputs) != len(graph_outputs): - msg = "Number of outputs provided, {}, do not match the number of outputs detected in the model, {}." - raise ValueError(msg.format( - len(self.outputs), - len(graph_outputs), - )) + raise ValueError( + f"Number of outputs provided, {len(self.outputs)}, do not match the number of outputs detected in the model, {len(graph_outputs)}." 
+ ) if self.output_names: for index, var in enumerate(graph_outputs): if self.output_names[index] is not None: @@ -384,7 +536,7 @@ def _check_is_tensor(node, module): if not isinstance(module, torch.Tensor): return False if str(node.output().type()) not in ("Tensor", "Optional[Tensor]"): - raise TypeError("Type \"{}\" not supported".format(node.output().type())) + raise TypeError(f'Type "{node.output().type()}" not supported') return True def _check_is_quantized_tensor(node, module): diff --git a/coremltools/converters/mil/frontend/torch/ops.py b/coremltools/converters/mil/frontend/torch/ops.py index e4cc47d89..91a257fcf 100644 --- a/coremltools/converters/mil/frontend/torch/ops.py +++ b/coremltools/converters/mil/frontend/torch/ops.py @@ -6,33 +6,30 @@ import builtins import math as _math import numbers +import re from collections.abc import Iterable from typing import List, Optional import numpy as _np +import numpy as np import torch from tqdm import tqdm as _tqdm from coremltools import _logger as logger -from coremltools.converters.mil._deployment_compatibility import ( - AvailableTarget as target, -) +from coremltools.converters.mil._deployment_compatibility import AvailableTarget as target from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import Symbol, types -from coremltools.converters.mil.mil.block import ( - is_current_opset_version_compatible_with, -) +from coremltools.converters.mil.mil.block import is_current_opset_version_compatible_with from coremltools.converters.mil.mil.ops.defs._utils import ( - MAX_SIZE_CONSTANT_FOLDING, promote_input_dtypes, - solve_slice_by_index_shape) -from coremltools.converters.mil.mil.types import is_bool, nptype_from_builtin -from coremltools.converters.mil.mil.types.symbolic import ( - any_symbolic, - is_symbolic, + MAX_SIZE_CONSTANT_FOLDING, + promote_input_dtypes, + solve_slice_by_index_shape, ) +from coremltools.converters.mil.mil.types import is_bool, nptype_from_builtin +from coremltools.converters.mil.mil.types.symbolic import any_symbolic, is_symbolic from coremltools.converters.mil.mil.var import ListVar, Var -from .._utils import value_at, build_einsum_mil +from .._utils import build_einsum_mil, value_at from .torch_op_registry import _TORCH_OPS_REGISTRY, register_torch_op # The pytorch args for many of the below ops were sourced from @@ -80,9 +77,16 @@ def convert_nodes(context, graph): logger.info("Converting op {} : {}".format(node.name, node.kind)) if add_op is None: - raise RuntimeError( - "PyTorch convert function for op '{}' not implemented.".format(node.kind) - ) + if re.match(r".*_dynamic", node.kind): + raise RuntimeError( + f"PyTorch convert function for op '{node.kind}' not implemented.\n" + "Dynamic quantized models are not supported by Core ML.\n" + "Please use static quantization or the APIs in coremltools.optimize to quantize/compress models." + ) + else: + raise RuntimeError( + f"PyTorch convert function for op '{node.kind}' not implemented." 
+ ) context.prepare_for_conversion(node) add_op(context, node) @@ -136,6 +140,7 @@ def convert_block(context, block, inputs): 11: torch.bool, 12: torch.qint8, 13: torch.quint8, + 14: torch.qint32, } NUMPY_DTYPE_TO_TORCH_NUM = { @@ -163,6 +168,7 @@ def convert_block(context, block, inputs): } NUM_TO_DTYPE_STRING = { + 2: "int16", 3: "int32", 4: "int32", 5: "fp16", @@ -215,6 +221,13 @@ def _list_select(shape_var, index): if shape_var.can_be_folded_to_const(): res = mb.const(val=shape_var.val[index]) else: + if is_current_opset_version_compatible_with(target.iOS17): + # IOS17 `gather` requires non-negative indices. + index = mb.select( + cond=mb.greater_equal(x=index, y=0), + a=index, + b=mb.add(x=index, y=value_at(mb.shape(x=shape_var), 0)), + ) res = mb.gather(x=shape_var, indices=index) return res @@ -1884,6 +1897,24 @@ def stack(context, node): context.add(res) +@register_torch_op +def tile(context, node): + x, dims = _get_inputs(context, node, expected=2) + + # The torch.tile only supports tuple of ints for "dims", not Tensor. So it will not be dynamic. + if dims is None or dims.val is None: + raise ValueError("The `dims` input for torch.tile must be static (tuple of ints).") + + dims_num = dims.shape[0] + if dims_num < x.rank: + # When the number of elements in dims is smaller than rank of x, ones are prepended. + prepend_ones = np.array([1] * (x.rank - dims_num)) + dims = mb.concat(values=(prepend_ones, dims), axis=0) + + res = mb.tile(x=x, reps=dims, name=node.name) + context.add(res) + + @register_torch_op def item(context, node): inputs = _get_inputs(context, node, expected=1) @@ -3362,13 +3393,15 @@ def index_put(context, node): if types.is_bool(indices_type): assert len(indices) == 1, "Unsupported index_put_ usage." indices = indices[0] - assert indices.shape == x.shape, "indices shape must equal to input shape for index put operation." + assert ( + indices.shape == x.shape + ), "indices shape must equal to input shape for index put operation." indices = mb.cast(x=indices, dtype="int32") indices = mb.non_zero(x=indices) if types.is_int(indices_type): if len(indices) > 1: - indices = mb.stack(values=indices, axis=rank - 1) + indices = mb.stack(values=indices, axis=indices[0].rank) else: indices = mb.expand_dims(x=indices[0], axes=[-1]) @@ -3380,6 +3413,19 @@ def index_put(context, node): reps = mb.expand_dims(x=reps, axes=[0]) values = mb.tile(x=values, reps=reps) + if is_current_opset_version_compatible_with(target.iOS17): + # IOS17 `scatter_nd` behaviour is undefined for negative indices. + cond = mb.greater_equal(x=indices, y=0) + x_shape = mb.shape(x=x) + indices_shape = mb.shape(x=indices) + indices_last_dim = value_at(indices_shape, indices.rank - 1) + indices_last_dim_expand = mb.expand_dims(x=indices_last_dim, axes=[0]) + slice_shape = mb.slice_by_size(x=x_shape, begin=[0], size=indices_last_dim_expand) + indices = mb.select( + cond=cond, + a=indices, + b=mb.add(x=indices, y=slice_shape), + ) result = mb.scatter_nd(data=x, indices=indices, updates=values, mode=mode, name=node.name) context.add(result) @@ -3514,7 +3560,15 @@ def index(context, node): # For the single index axis case, we can use mb.gather directly if len(indices_axes) == 1: axis = indices_axes[0] - x = mb.gather(x=x, indices=valid_indices[0], axis=axis, name=node.name) + indices = valid_indices[0] + if is_current_opset_version_compatible_with(target.iOS17): + # IOS17 `gather` behaviour is undefined for negative indices. 
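+ # A negative index i is remapped to i + x.shape[axis], so mb.gather only ever receives non-negative indices.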
+ indices = mb.select( + cond=mb.greater_equal(x=indices, y=0), + a=indices, + b=mb.add(x=indices, y=value_at(mb.shape(x=x), axis)), + ) + x = mb.gather(x=x, indices=indices, axis=axis, name=node.name) context.add(x) return @@ -3543,6 +3597,20 @@ def index(context, node): name = node.name + "_transpose" if is_connected else node.name perm = indices_axes + [axis for axis in range(x.rank) if axis not in indices_axes] x = mb.transpose(x=x, perm=perm) + + if is_current_opset_version_compatible_with(target.iOS17): + # IOS17 `gather_nd` behaviour is undefined for negative indices. + cond = mb.greater_equal(x=indices, y=0) + x_shape = mb.shape(x=x) + indices_shape = mb.shape(x=indices) + indices_last_dim = value_at(indices_shape, indices.rank - 1) + indices_last_dim_expand = mb.expand_dims(x=indices_last_dim, axes=[0]) + slice_shape = mb.slice_by_size(x=x_shape, begin=[0], size=indices_last_dim_expand) + indices = mb.select( + cond=cond, + a=indices, + b=mb.add(x=indices, y=slice_shape), + ) x = mb.gather_nd(x=x, indices=indices, name=name) # if the index axes are connect, we need to transpose it back @@ -4140,20 +4208,11 @@ def masked_fill(context, node): x = inputs[0] mask = inputs[1] value = inputs[2] - # @mb.select does not properly broadcast scalar input, so as a workaround - # we create a full sized tensor. - - if types.is_int(value.dtype): - # @mb.fill cannot handle value with dtype integer - # so we cast the value. - value = mb.cast(x=value, dtype="fp32") if not types.is_bool(mask.dtype): # cond must be bool type mask = mb.cast(x=mask, dtype="bool") - shape = mb.shape(x=x, name=node.name + "_shape") - value = mb.fill(shape=shape, value=value, name=node.name + "_value") res = mb.select(cond=mask, a=value, b=x, name=node.name) context.add(res) @@ -5731,9 +5790,7 @@ def torchvision_nms(context, node): iou_threshold = inputs[2].val # Use float min to avoid boxes being pruned by scores in MIL NMS op. score_threshold = ( - _np.finfo(_np.float16).min - if boxes.dtype._width == 16 - else _np.finfo(_np.float32).min + _np.finfo(_np.float16).min if boxes.dtype._width == 16 else _np.finfo(_np.float32).min ) box_num = boxes.shape[0] @@ -5758,21 +5815,46 @@ def torchvision_nms(context, node): boxes = mb.expand_dims(x=boxes, axes=[0]) scores = mb.expand_dims(x=scores, axes=[0, -1]) - _, _, indices, valid_outputs = mb.non_maximum_suppression( - boxes=boxes, - scores=scores, - max_boxes=box_num, - iou_threshold=iou_threshold, - score_threshold=score_threshold, - ) + if not is_current_opset_version_compatible_with(target.iOS17): + _, _, indices, valid_outputs = mb.non_maximum_suppression( + boxes=boxes, + scores=scores, + max_boxes=box_num, + iou_threshold=iou_threshold, + score_threshold=score_threshold, + ) + + indices = mb.squeeze(x=indices, axes=[0]) + valid_outputs = mb.squeeze(x=valid_outputs, axes=[0]) + range = mb.range_1d(end=valid_outputs, start=0, step=1) + indices = mb.cast(x=indices, dtype="fp32") + valid_indices = mb.gather(x=indices, indices=range, axis=0) + valid_indices = mb.cast(x=valid_indices, dtype="int32", name=node.name) + context.add(valid_indices) + else: + # In IOS17, the MIL NMS op's inputs are ordered with number of boxes in the last dimension. + boxes = mb.transpose(x=boxes, perm=[0, 2, 1]) + scores = mb.transpose(x=scores, perm=[0, 2, 1]) + + # In IOS17, the MIL NMS op's last output (number of valid boxes in each batch) gets removed. 
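+ # Invalid slots in the returned indices are padded with -1; the number of valid boxes is recovered further down by counting indices greater than -1 and slicing.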
+ _, _, indices = mb.non_maximum_suppression( + boxes=boxes, + scores=scores, + max_boxes=box_num, + iou_threshold=iou_threshold, + ) - indices = mb.squeeze(x=indices, axes=[0]) - valid_outputs = mb.squeeze(x=valid_outputs, axes=[0]) - range = mb.range_1d(end=valid_outputs, start=0, step=1) - indices = mb.cast(x=indices, dtype="fp32") - valid_indices = mb.gather(x=indices, indices=range, axis=0) - valid_indices = mb.cast(x=valid_indices, dtype="int32", name=node.name) - context.add(valid_indices) + # Remove invalid indices (the padded -1 indices). + valid_outputs = mb.reduce_sum( + x=mb.cast(x=mb.greater(x=indices, y=-1), dtype="int32"), axes=[-1] + ) + valid_indices = mb.slice_by_size( + x=mb.squeeze(x=indices, axes=[0]), + begin=mb.fill_like(ref_tensor=valid_outputs, value=0), + size=valid_outputs, + name=node.name, + ) + context.add(valid_indices) @register_torch_op diff --git a/coremltools/converters/mil/frontend/torch/quantization_ops.py b/coremltools/converters/mil/frontend/torch/quantization_ops.py new file mode 100644 index 000000000..a59e54b3f --- /dev/null +++ b/coremltools/converters/mil/frontend/torch/quantization_ops.py @@ -0,0 +1,315 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import numpy as _np +import torch as _torch + +from coremltools import _logger as logger +from coremltools.converters.mil.mil import Builder as mb +from coremltools.converters.mil.mil import Var + +from .ops import NUM_TO_TORCH_DTYPE, _get_inputs +from .torch_op_registry import register_torch_op + +TORCH_QTYPE_TO_NP_TYPE = {_torch.qint8: _np.int8, _torch.quint8: _np.uint8} + +TORCH_QTYPE_TO_STR = {_torch.qint8: "int8", _torch.quint8: "uint8"} + + +def _quantize_general( + context, + node, + input: Var, + scale_var: Var, + zero_point_var: Var, + torch_dtype_var: Var, + axis: int = None, +): + scale = scale_var.val + if scale is None: + raise ValueError("quantization scale must be const at compile time") + + zero_point = zero_point_var.val + if zero_point is None: + raise ValueError("quantization zero point must be const at compile time") + + torch_dtype = NUM_TO_TORCH_DTYPE.get(torch_dtype_var.val) + if torch_dtype is None: + raise ValueError("quantization dtype must be const at compile time") + dtype = TORCH_QTYPE_TO_STR.get(torch_dtype) + # pytorch quantization dtype can be int32, which is not supported in MIL + if dtype is None: + raise ValueError("MIL quantization dtype must be int8 or uint8") + + # perf: all 0 zero point can be no zero point in MIL + if zero_point is not None and _np.all(zero_point == 0): + zero_point = None + + # make sure zero point dtype is consistent with quantization dtype, + # since torch may provide int32 zero point + if zero_point is not None: + if dtype == "int8" and _np.all(-128 <= zero_point) and _np.all(zero_point < 128): + zero_point = zero_point.astype(_np.int8) + elif dtype == "uint8" and _np.all(0 <= zero_point) and _np.all(zero_point < 256): + zero_point = zero_point.astype(_np.uint8) + else: + raise ValueError("cannot fit zero point into quantization dtype") + + result = mb.quantize( + input=input, + zero_point=zero_point, + scale=scale, + output_dtype=dtype, + axis=axis, + ) + context.add(result, node.name) + context.quant_context.add_quantization_info(node.name, torch_dtype, scale, zero_point, axis) + + +@register_torch_op +def quantize_per_tensor(context, node): + input, scale, 
zero_point, torch_dtype = _get_inputs(context, node, expected=[4]) + + _quantize_general(context, node, input, scale, zero_point, torch_dtype) + + +@register_torch_op +def quantize_per_channel(context, node): + input, scale, zero_point, axis, torch_dtype = _get_inputs(context, node, expected=[5]) + + if axis.val is None: + raise ValueError("quantization axis must be const at compile time") + + _quantize_general(context, node, input, scale, zero_point, torch_dtype, axis.val) + + +@register_torch_op +def dequantize(context, node): + context.quant_context.get_dequantized_var(node.inputs[0], node.name) + + +def _dequantized_weight(qweight): + """ + Given the first output (qweight) of torch.ops.quantized.conv2d/linear_unpack, + this returns a dequantized version of the tensor to be added to the context. + """ + if qweight.qscheme() == _torch.per_tensor_affine: + quant_dtype_np = TORCH_QTYPE_TO_NP_TYPE[qweight.dtype] + scale = _np.float32(qweight.q_scale()) + zero_point = quant_dtype_np(qweight.q_zero_point()) + quantized_weights = _torch.int_repr(qweight).numpy() + # Axis doesn't matter for per-tensor quantization. + axis = _np.int32(0) + dequant_weights = mb.constexpr_affine_dequantize( + quantized_data=quantized_weights, + zero_point=zero_point, + scale=scale, + axis=axis, + ) + # per_channel_affine_float_qparams is same as per_channel_affine except that it + # expects both scale and zero point to be floating point values. + elif qweight.qscheme() in {_torch.per_channel_affine, _torch.per_channel_affine_float_qparams}: + quant_dtype_np = TORCH_QTYPE_TO_NP_TYPE[qweight.dtype] + # TODO: How do we set the appropriate dtype here (fp16/fp32)? + scale = qweight.q_per_channel_scales().numpy() + zero_point = quant_dtype_np(qweight.q_per_channel_zero_points().numpy()) + if qweight.qscheme() == _torch.per_channel_affine: + zero_point = quant_dtype_np(qweight.q_per_channel_zero_points().numpy()) + else: + logger.warning( + "Found per_channel_affine_float_qparams qscheme, which isn't directly " + "supported by coremltools. Casting zero-points to quantized type loses some " + "precision." + ) + dtype_info = _np.iinfo(quant_dtype_np) + val = _np.clip( + _np.around(qweight.q_per_channel_zero_points().numpy()), + dtype_info.min, + dtype_info.max, + ) + zero_point = quant_dtype_np(val) + quantized_weights = _torch.int_repr(qweight).numpy() + # Axis doesn't matter for per-tensor quantization. + axis = _np.int32(0) + dequant_weights = mb.constexpr_affine_dequantize( + quantized_data=quantized_weights, + zero_point=zero_point, + scale=scale, + axis=axis, + ) + else: + raise ValueError(f'Unsupported quant scheme "{qweight.qscheme()}"') + return dequant_weights + + +def _process_conv(context, node, add_relu=False): + # Node has 4 inputs: + # 1. The input activations + # 2. The packed weights/biases (need to get from context.torch_graph) + # 3. output scale + # 4. output zero-point + + # Unpack weights/bias & dequantize weights. + packed_params = context.torch_graph.params[node.inputs[1]] + qweight, bias = _torch.ops.quantized.conv2d_unpack(packed_params) + dequant_weights = _dequantized_weight(qweight) + context.add(dequant_weights) + # Bias can be fed as-is. + bias = bias.detach().numpy() + + # Convolution Parameters. + x, x_dtype = context.quant_context.get_dequantized_var(node.inputs[0]) + raw_params = tuple(list(packed_params.__getstate__())[:-1]) + conv_attr_raw = raw_params[0][1][0].detach().numpy().astype(_np.int32) + # Stride + strides = conv_attr_raw[1:3] + # Padding. 
torch.nn.quantized.Conv2d & its variants only support 'zeros' mode. + pad = conv_attr_raw[3:5] + assert conv_attr_raw[8] == 0 + if len(dequant_weights.shape) in (3, 4): + # 1D and 2D: Need to explicitly state L-R, T-B pad + pad = _np.repeat(pad, 2) + else: + raise ValueError("Invalid weight dimension. Must be 4 for 2D convolution.") + # Dilation. + dilations = conv_attr_raw[5:7] + # Group. + group = conv_attr_raw[9] + kwargs = { + "x": x, + "weight": dequant_weights, + "bias": bias, + "strides": strides, + "pad_type": "custom", + "pad": pad, + "dilations": dilations, + } + if group > 0: + kwargs["groups"] = group + + res = mb.conv(**kwargs) + if add_relu: + res = mb.relu(x=res) + context.add(res) + + out_scale = context[node.inputs[2]] + out_zero_point = context[node.inputs[3]].val + _ = context.quant_context.get_quantized_per_tensor( + res.name, x_dtype, out_scale, out_zero_point, node.name + ) + + +def _process_linear(context, node, add_relu=False): + # Node has 4 inputs: + # 1. The input activations + # 2. The packed weights/biases (need to get from context.torch_graph) + # 3. output scale + # 4. output zero-point + + # Unpack PyTorch's packed params. + packed_params = context.torch_graph.params[node.inputs[1]] + qweight, bias = _torch.ops.quantized.linear_unpack(packed_params) + dequant_weights = _dequantized_weight(qweight) + context.add(dequant_weights) + # Bias can be fed as-is. + bias = bias.detach().numpy() + + x, x_dtype = context.quant_context.get_dequantized_var(node.inputs[0]) + res = mb.linear(x=x, weight=dequant_weights, bias=bias) + if add_relu: + res = mb.relu(x=res) + context.add(res) + + out_scale = context[node.inputs[2]] + out_zero_point = context[node.inputs[3]].val + _ = context.quant_context.get_quantized_per_tensor( + res.name, x_dtype, out_scale, out_zero_point, node.name + ) + + +def _process_binary(context, node, binary_op, add_relu=False): + # Node has 4 inputs: + # 1. LHS + # 2. RHS + # 3. output scale + # 4. 
output zero-point + + assert len(node.inputs) == 4 + assert len(node.outputs) == 1 + + lhs, lhs_dtype = context.quant_context.get_dequantized_var(node.inputs[0]) + rhs, rhs_dtype = context.quant_context.get_dequantized_var(node.inputs[1]) + assert lhs_dtype == rhs_dtype + + res = binary_op(x=lhs, y=rhs) + if add_relu: + res = mb.relu(x=res) + context.add(res) + + out_scale = context[node.inputs[2]] + out_zero_point = context[node.inputs[3]].val + _ = context.quant_context.get_quantized_per_tensor( + res.name, lhs_dtype, out_scale, out_zero_point, node.name + ) + + +@register_torch_op(torch_alias=["quantized::linear"]) +def quantized_linear(context, node): + _process_linear(context, node) + + +@register_torch_op(torch_alias=["quantized::linear_relu"]) +def quantized_linear_relu(context, node): + _process_linear(context, node, add_relu=True) + + +@register_torch_op(torch_alias=["quantized::conv2d_relu"]) +def quantized_conv2d_relu(context, node): + _process_conv(context, node, add_relu=True) + + +@register_torch_op(torch_alias=["quantized::conv2d"]) +def quantized_conv2d(context, node): + _process_conv(context, node) + + +@register_torch_op(torch_alias=["quantized::add"]) +def quantized_add(context, node): + _process_binary(context, node, mb.add) + + +@register_torch_op(torch_alias=["quantized::add_relu"]) +def quantized_add_relu(context, node): + _process_binary(context, node, mb.add, add_relu=True) + + +@register_torch_op(torch_alias=["quantized::mul"]) +def quantized_mul(context, node): + _process_binary(context, node, mb.mul) + + +@register_torch_op(torch_alias=["quantized::embedding_byte"]) +def quantized_embedding(context, node): + packed_params = context.torch_graph.params[node.inputs[0]] + qweight = _torch.ops.quantized.embedding_bag_unpack(packed_params) + dequant_weights = _dequantized_weight(qweight) + indices = context[node.inputs[1]] + + if len(node.inputs) >= 3: + logger.warning( + "Core ML quantized embedding (gather) layer does not support any " + "inputs besides the weights and indices. Those given " + "will be ignored." + ) + + if isinstance(indices, tuple): + # Sometimes inputs will be a tuple, so handle that correctly. 
+ assert len(indices) == 1 + indices = indices[0] + indices = mb.cast(x=indices, dtype="int32") + + # Changing the axis from 0 is not an option in torch, so we don't expose it + gather = mb.gather(x=dequant_weights, indices=indices, name=node.name) + context.add(gather) diff --git a/coremltools/converters/mil/frontend/torch/test/test_torch_conversion_api.py b/coremltools/converters/mil/frontend/torch/test/test_torch_conversion_api.py index bd7d1ce15..775f7c5b2 100644 --- a/coremltools/converters/mil/frontend/torch/test/test_torch_conversion_api.py +++ b/coremltools/converters/mil/frontend/torch/test/test_torch_conversion_api.py @@ -12,13 +12,19 @@ import coremltools as ct from coremltools._deps import _HAS_TORCH, MSG_TORCH_NOT_FOUND -from coremltools.converters.mil.frontend.torch.test.testing_utils import \ - _copy_input_data +from coremltools.converters.mil.frontend.torch.test.testing_utils import _copy_input_data from coremltools.converters.mil.testing_utils import ( - assert_cast_ops_count, assert_input_dtype, assert_ops_in_mil_program, - assert_output_dtype, assert_prog_input_type, assert_prog_output_type, - assert_spec_input_image_type, assert_spec_output_image_type, - verify_prediction) + assert_cast_ops_count, + assert_input_dtype, + assert_ops_in_mil_program, + assert_output_dtype, + assert_prog_input_type, + assert_prog_output_type, + assert_spec_input_image_type, + assert_spec_output_image_type, + get_op_types_in_program, + verify_prediction, +) from coremltools.proto import FeatureTypes_pb2 as ft from coremltools.test.api.test_api_examples import TestInputs as _TestInputs @@ -26,6 +32,8 @@ import torch import torchvision + torch.manual_seed(1818) + ################################################################################# # Note: all tests are also used as examples in https://coremltools.readme.io/docs # as a reference. 
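For readers unfamiliar with PyTorch quantized tensors, the snippet below is an illustrative sketch (not part of the patch) of the attributes that `_dequantized_weight` in `quantization_ops.py` reads from an unpacked weight; the tensor and its scale/zero point are arbitrary example values.

```python
import torch

# Quantize a float weight per-tensor; scale and zero_point are made-up example values.
w = torch.randn(2, 3)
qw = torch.quantize_per_tensor(w, scale=0.1, zero_point=2, dtype=torch.quint8)

print(qw.qscheme())        # torch.per_tensor_affine
print(qw.q_scale())        # 0.1
print(qw.q_zero_point())   # 2
print(torch.int_repr(qw))  # raw uint8 storage, fed to constexpr_affine_dequantize in the converter
print(qw.dequantize())     # (int_repr - zero_point) * scale, approximately w
```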
@@ -98,7 +106,7 @@ def test_convert_torch_vision_mobilenet_v2(tmpdir): if ct.utils._is_macos(): results = mlmodel.predict({"input": example_input.numpy()}) assert isinstance(results, dict) - + @staticmethod def test_convert_torch_traced_model_to_milinternal(tmpdir): from torch import nn @@ -127,7 +135,7 @@ def forward(self, x): convert_to='milinternal' ) assert isinstance(model, ct.converters.mil.Program) - + @staticmethod def test_torch_classifier(): class Net(torch.nn.Module): @@ -178,7 +186,7 @@ def _test_classifier(traced_model, example_input, class_type, backend): _test_classifier(traced_model, example_input, class_type, "neuralnetwork") if ct.utils._macos_version() >= (12, 0): _test_classifier(traced_model, example_input, class_type, "mlprogram") - + @staticmethod @pytest.mark.parametrize("convert_to", ['neuralnetwork', 'mlprogram']) def test_convert_to_argument_with_torch_model(tmpdir, convert_to): @@ -207,7 +215,7 @@ def forward(self, x): assert spec.WhichOneof('Type') == 'mlProgram' else: assert spec.WhichOneof('Type') == 'neuralNetwork' - + @staticmethod def test_deployment_target_argument_with_torch_model(): class Network(torch.nn.Module): @@ -266,7 +274,7 @@ def forward(self, x): expected_error = "When 'convert_to' is mlprogram, the minimum deployment target " \ "must be at least iOS15/macOS12/watchOS8/tvOS15" assert expected_error == str(e.value) - + @staticmethod def test_get_milprogram_method_with_torch_model(): class Network(torch.nn.Module): @@ -290,7 +298,7 @@ def forward(self, x): convert_to='mlprogram' ) assert isinstance(model._get_mil_internal(), ct.converters.mil.Program) - + @staticmethod @pytest.mark.skipif(ct.utils._macos_version() < (12, 0), reason='Model produces specification 6.') @pytest.mark.parametrize( @@ -342,7 +350,7 @@ class TestTorchInputs(_TestInputs): @pytest.mark.skipif(not ct.utils._is_macos(), reason="test needs predictions") def test_torch_predict_input(): TestTorchInputs._test_variant_input_type_prediction(torch.tensor) - + @staticmethod def test_int64_inputs(): @@ -416,7 +424,7 @@ def forward(self, x): ], outputs=["output"], ) - + @staticmethod def test_fully_dynamic_inputs(): """ @@ -441,7 +449,7 @@ def forward(self, x, y): scripted_model, inputs=[ ct.TensorType("x", shape=(ct.RangeDim(), ct.RangeDim())), - ct.TensorType("y", shape=(ct.RangeDim(), ct.RangeDim())) + ct.TensorType("y", shape=(ct.RangeDim(), ct.RangeDim())), ], ) @@ -462,7 +470,7 @@ def forward(self, x, y): "y": y.cpu().detach().numpy()}) for i, name in enumerate(mlmodel.output_description): np.testing.assert_allclose(torch_res[i], results[name]) - + @staticmethod def test_rank0_inputs_torch(): """Similar to TestPyTorchConverterExamples::test_int64_inputs but @@ -497,7 +505,7 @@ def forward(self, x): ) ], ) - + @staticmethod @pytest.mark.parametrize("variable_length", [True, False]) def test_torch_range_dim_lstm(variable_length): @@ -538,8 +546,7 @@ def forward(self, x, hidden_state, cell_state): # ct.RangeDim() tells coremltools that this dimension can change for # each inference example (aka "runtime-determined"). If the sequence # length is always the same (e.g., 2 step LSTM would have seq_len == 2) - # Note that fixed-length models usually run slightly faster - # than variable length models. + # Note that fixed-length models usually run slightly faster than variable length models. 
ct_seq_len = ct.RangeDim() if variable_length else seq_len seq_input = ct.TensorType(shape=(ct_seq_len, batch, input_size), name="seq_input") @@ -747,7 +754,8 @@ def forward(self, x, y): traced_model = torch.jit.trace(model, example_input) required_input = ct.TensorType( - name="required_input", shape=(ct.RangeDim(),), dtype=np.int64) + name="required_input", shape=(ct.RangeDim(),), dtype=np.int64 + ) default_value = np.array([3]).astype(np.float32) optional_input = ct.TensorType(name="optional_input", shape=(1,), default_value=default_value) @@ -1399,3 +1407,153 @@ def test_grayscale_fp16_output_image(self, rank4_grayscale_input_model): reference_output = rank4_grayscale_input_model(torch.from_numpy(sample_input)).detach().numpy() reference_output = np.squeeze(reference_output) np.testing.assert_allclose(reference_output, model_output_as_numpy, rtol=1e-2, atol=1e-2) + + +@pytest.mark.skipif( + ct.utils._macos_version() < (14, 0), reason="Tests are for deployment target ios17/macos14" +) +class TestQuantizationConversionAPI: + def test_dynamic_quantization(self): + torch.backends.quantized.engine = "qnnpack" + + class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.fc = torch.nn.Linear(3, 2) + + def forward(self, x): + x = self.fc(x) + return x + + SHAPE = (4, 3) + x = torch.randn(SHAPE) + + model_fp32 = Model() + model_int8 = torch.ao.quantization.quantize_dynamic( + model_fp32, + {torch.nn.Linear}, # a set of layers to dynamically quantize + dtype=torch.qint8, + ) + model_int8.eval() + + traced_model = torch.jit.trace(model_int8, x) + + with pytest.raises( + RuntimeError, + match=( + r"PyTorch convert function for op '.*_dynamic' not implemented\.\n" + r"Dynamic quantized models are not supported by Core ML.\n" + r"Please use static quantization or the APIs in coremltools.optimize to quantize/compress models." 
+ ), + ): + ct.convert(traced_model, inputs=[ct.TensorType(shape=SHAPE)]) + + def test_static_quantization_as_activation_quantization(self): + torch.backends.quantized.engine = "qnnpack" + + class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.quant = torch.ao.quantization.QuantStub() + self.conv = torch.nn.Conv2d(3, 2, 5) + self.relu = torch.nn.ReLU() + self.dequant = torch.ao.quantization.DeQuantStub() + + def forward(self, x): + x = self.quant(x) + x = self.conv(x) + x = self.relu(x) + x = self.dequant(x) + return x + + SHAPE = (4, 3, 8, 16) + x = torch.randn(SHAPE) + + model_fp32 = Model() + model_fp32.eval() + + model_fp32.qconfig = torch.ao.quantization.get_default_qconfig("qnnpack") + model_fp32_fused = torch.ao.quantization.fuse_modules(model_fp32, [["conv", "relu"]]) + model_fp32_prepared = torch.ao.quantization.prepare(model_fp32_fused) + model_fp32_prepared(x) + model_int8 = torch.ao.quantization.convert(model_fp32_prepared) + + traced_model = torch.jit.trace(model_int8, x) + coreml_model = ct.convert( + traced_model, + inputs=[ct.TensorType(name="x", shape=SHAPE)], + outputs=[ct.TensorType(name="y")], + minimum_deployment_target=ct.target.iOS17, + ) + + ops = get_op_types_in_program(coreml_model._mil_program) + # constexpr_affine_dequantize and cast -> quantize can have arbitrary order + assert ops[:3] == ["cast", "quantize", "constexpr_affine_dequantize"] or ops[:3] == [ + "constexpr_affine_dequantize", + "cast", + "quantize", + ] + # these ops have well-defined order + assert ops[3:] == [ + # quantized ConvRelu op + "dequantize", + "conv", + "relu", + "quantize", + # dequantize and output + "dequantize", + "cast", + ] + + output = traced_model(x) + coreml_output = coreml_model.predict({"x": x})["y"] + np.testing.assert_allclose(output, coreml_output, rtol=1e-2, atol=2e-2) + + def test_static_quantization_as_weight_compression(self): + torch.backends.quantized.engine = "qnnpack" + + weight = torch.rand(5, 3, 2, 4) + + class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.quant = torch.ao.quantization.QuantStub() + self.dequant = torch.ao.quantization.DeQuantStub() + + def forward(self, x): + quantized_weight = self.quant(weight) + dequantized_weight = self.dequant(quantized_weight) + y = torch.nn.functional.conv2d(x, dequantized_weight) + return y + + SHAPE = (4, 3, 16, 32) + x = torch.randn(SHAPE) + + model_fp32 = Model() + model_fp32.eval() + + model_fp32.qconfig = torch.ao.quantization.get_default_qconfig("qnnpack") + model_fp32_prepared = torch.ao.quantization.prepare(model_fp32) + model_fp32_prepared(x) + model_int8 = torch.ao.quantization.convert(model_fp32_prepared) + + traced_model = torch.jit.trace(model_int8, x) + coreml_model = ct.convert( + traced_model, + inputs=[ct.TensorType(name="x", shape=SHAPE)], + outputs=[ct.TensorType(name="y")], + minimum_deployment_target=ct.target.iOS17, + ) + + ops = get_op_types_in_program(coreml_model._mil_program) + # constexpr_affine_dequantize and cast can have arbitrary order + assert ops[:2] == ["cast", "constexpr_affine_dequantize"] or ops[:2] == [ + "constexpr_affine_dequantize", + "cast", + ] + # these ops have well-defined order + assert ops[2:] == ["conv", "cast"] + + output = traced_model(x) + coreml_output = coreml_model.predict({"x": x})["y"] + np.testing.assert_allclose(output, coreml_output, rtol=1e-2, atol=2e-2) diff --git a/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py b/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py index 
dd3108fee..6555a3e24 100644 --- a/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py +++ b/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py @@ -5,7 +5,7 @@ import itertools import platform -from typing import List, Tuple +from typing import List, Optional, Tuple from unittest.mock import patch import numpy as np @@ -18,6 +18,11 @@ from coremltools import RangeDim, Shape, TensorType from coremltools._deps import version_lt from coremltools.converters.mil import testing_reqs +from coremltools.converters.mil.frontend.torch.ops import ( + NUM_TO_TORCH_DTYPE, + NUMPY_DTYPE_TO_TORCH_NUM, +) +from coremltools.converters.mil.mil import Operation, Program, types from coremltools.converters.mil.mil.var import Var from coremltools.converters.mil.testing_utils import einsum_equations, gen_input_shapes_einsum from coremltools.models.utils import _macos_version, _python_version @@ -232,8 +237,19 @@ def forward(self, x): model = Model() shape = (1, 3, 256, 256) + upper_bound = 512 if backend[0] == "mlprogram" else -1 converter_input_type = [ - TensorType(shape=Shape(shape=[1, 3, RangeDim(), RangeDim()], default=shape)) + TensorType( + shape=Shape( + shape=[ + 1, + 3, + RangeDim(upper_bound=upper_bound), + RangeDim(upper_bound=upper_bound), + ], + default=shape, + ) + ) ] self.run_compare_torch( @@ -258,7 +274,12 @@ def forward(self, x): x = torch.rand(1, 3, 256, 256) traced_model = torch.jit.trace(model, x) input_x = ct.TensorType( - shape=(1, 3, ct.RangeDim(default=256), ct.RangeDim(default=256)), + shape=( + 1, + 3, + ct.RangeDim(upper_bound=512, default=256), + ct.RangeDim(upper_bound=512, default=256), + ), name="input", ) cml = ct.convert( @@ -1819,9 +1840,10 @@ def test_convolution_transpose2d_dynamic_input( input_shape = (1, in_channels, in_height, in_width) if dynamic_input: + upper_bound = 4096 if backend[0] == "mlprogram" else -1 converter_input_type = [ TensorType( - shape=(1, in_channels, RangeDim(256, -1), RangeDim(256, -1)), + shape=(1, in_channels, RangeDim(256, upper_bound), RangeDim(256, upper_bound)), dtype=np.float32, ) ] @@ -2091,7 +2113,14 @@ def test_upsample_linear1d_with_scales_dynamic( }, ) converter_input_type = [ - TensorType(shape=(1, 3, RangeDim(default=22)), dtype=np.float32) + TensorType( + shape=( + 1, + 3, + RangeDim(default=22, upper_bound=22 if backend[0] == "mlprogram" else -1), + ), + dtype=np.float32, + ) ] mlmodel = self.run_compare_torch( input_shape, @@ -2167,7 +2196,12 @@ def test_upsample_nearest1d_with_scales_dynamic( "recompute_scale_factor": True, }, ) - converter_input_type = [TensorType(shape=(1, 3, RangeDim()), dtype=np.float32)] + converter_input_type = [ + TensorType( + shape=(1, 3, RangeDim(upper_bound=10 if backend[0] == "mlprogram" else -1)), + dtype=np.float32, + ) + ] mlmodel = self.run_compare_torch( input_shape, model, @@ -2328,8 +2362,12 @@ def test_upsample_nearest2d_with_scales_dynamic( "recompute_scale_factor": True, }, ) + upper_bound = 10 if backend[0] == "mlprogram" else -1 converter_input_type = [ - TensorType(shape=(1, 3, RangeDim(), RangeDim()), dtype=np.float32) + TensorType( + shape=(1, 3, RangeDim(upper_bound=upper_bound), RangeDim(upper_bound=upper_bound)), + dtype=np.float32, + ) ] mlmodel = self.run_compare_torch( input_shape, @@ -2386,9 +2424,15 @@ def test_upsample_bilinear2d_with_scales_dynamic( "recompute_scale_factor": recompute_scale_factor, }, ) + dim_upper_bound = 30 if backend[0] == "mlprogram" else -1 converter_input_type = [ TensorType( - shape=(1, 3, RangeDim(default=9), 
RangeDim(default=22)), + shape=( + 1, + 3, + RangeDim(default=9, upper_bound=dim_upper_bound), + RangeDim(default=22, upper_bound=dim_upper_bound), + ), dtype=np.float32, ) ] @@ -2898,7 +2942,11 @@ def forward(self, input): class TestPoolSymbolicInput(TorchBaseTest): - def test_max_pool(self): + @pytest.mark.parametrize( + "compute_unit, backend", + itertools.product(compute_units, backends), + ) + def test_max_pool(self, compute_unit, backend): model = nn.MaxPool2d( kernel_size=1, stride=2, @@ -2907,17 +2955,26 @@ def test_max_pool(self): ceil_mode=True, ) input_shape = (1, 1, 11, 11) + upper_bound = 20 if backend[0] == "mlprogram" else -1 converter_input_type = [ - TensorType(shape=(1, 1, RangeDim(), RangeDim()), dtype=np.float32) + TensorType( + shape=(1, 1, RangeDim(upper_bound=upper_bound), RangeDim(upper_bound=upper_bound)), + dtype=np.float32, + ) ] self.run_compare_torch( input_shape, model, - backend=backends[0], + backend=backend, + compute_unit=compute_unit, converter_input_type=converter_input_type, ) - def test_avg_pool(self): + @pytest.mark.parametrize( + "compute_unit, backend", + itertools.product(compute_units, backends), + ) + def test_avg_pool(self, compute_unit, backend): model = nn.AvgPool2d( kernel_size=2, stride=2, @@ -2926,13 +2983,18 @@ def test_avg_pool(self): ceil_mode=True, ) input_shape = (1, 2, 15, 15) + upper_bound = 20 if backend[0] == "mlprogram" else -1 converter_input_type = [ - TensorType(shape=(1, 2, RangeDim(), RangeDim()), dtype=np.float32) + TensorType( + shape=(1, 2, RangeDim(upper_bound=upper_bound), RangeDim(upper_bound=upper_bound)), + dtype=np.float32, + ) ] self.run_compare_torch( input_shape, model, - backend=backends[0], + backend=backend, + compute_unit=compute_unit, converter_input_type=converter_input_type, ) @@ -3195,6 +3257,10 @@ def test_lstm( LSTM_batch_first, pad_value, ): + if backend[0] == "mlprogram": + pytest.xfail( + "rdar://109081548 ([Bug] TestLSTMWithPackedSequence is failing through E5ML)" + ) from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence input_size = 4 @@ -3447,6 +3513,28 @@ def forward(self, x): ) +class TestTile(TorchBaseTest): + @pytest.mark.parametrize( + "compute_unit, backend, dims", + itertools.product( + compute_units, + backends, + [(1, 2, 4), (3, 2), (2,)], + ), + ) + def test_tile(self, compute_unit, backend, dims): + class TestModel(nn.Module): + def forward(self, x): + return torch.tile(x, dims) + + self.run_compare_torch( + (2, 3, 5), + TestModel(), + backend=backend, + compute_unit=compute_unit, + ) + + class TestBitwiseNot(TorchBaseTest): @pytest.mark.parametrize( "compute_unit, backend, input_type", @@ -4073,13 +4161,14 @@ def forward(self, x): ) @pytest.mark.parametrize( - "compute_unit, backend", + "compute_unit, backend, minimum_deployment_target", itertools.product( compute_units, backends, + [None, ct.target.iOS17], ), ) - def test_expand_dynamic_shape0(self, compute_unit, backend): + def test_expand_dynamic_shape0(self, compute_unit, backend, minimum_deployment_target): class TestModel(nn.Module): def forward(self, x): return x.expand(x.shape[1], x.shape[1]) @@ -4088,9 +4177,14 @@ def forward(self, x): torch.arange(20).reshape((1, 20)), TestModel(), input_as_shape=False, - converter_input_type=[TensorType(shape=[1, ct.RangeDim()])], + converter_input_type=[ + TensorType( + shape=[1, ct.RangeDim(upper_bound=20 if backend[0] == "mlprogram" else -1)] + ) + ], backend=backend, compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, ) 
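As context for the repeated `upper_bound` edits in these tests, here is a minimal standalone sketch (not part of the patch) of declaring a flexible input with an explicit upper bound, as the updated tests do whenever the backend is `mlprogram`; the toy module, the input name, and the bound of 512 are placeholders.

```python
import coremltools as ct
import torch

class AddOne(torch.nn.Module):
    def forward(self, x):
        return x + 1.0

traced = torch.jit.trace(AddOne().eval(), torch.rand(1, 3, 256, 256))

# RangeDim takes both a default size and an explicit upper_bound for the flexible dimensions.
flexible = ct.TensorType(
    name="x",
    shape=(
        1,
        3,
        ct.RangeDim(default=256, upper_bound=512),
        ct.RangeDim(default=256, upper_bound=512),
    ),
)
mlmodel = ct.convert(traced, inputs=[flexible], convert_to="mlprogram")
```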
@pytest.mark.parametrize( @@ -4105,11 +4199,19 @@ class TestModel(nn.Module): def forward(self, x): return x.expand(x.shape[0], 1, x.shape[-1], x.shape[-1]) + upper_bound = 20 if backend[0] == "mlprogram" else -1 self.run_compare_torch( torch.arange(20).reshape((1, 20)), TestModel(), input_as_shape=False, - converter_input_type=[TensorType(shape=[ct.RangeDim(), ct.RangeDim()])], + converter_input_type=[ + TensorType( + shape=[ + ct.RangeDim(upper_bound=upper_bound), + ct.RangeDim(upper_bound=upper_bound), + ] + ) + ], backend=backend, compute_unit=compute_unit, ) @@ -4126,11 +4228,12 @@ class TestModel(nn.Module): def forward(self, x): return x.expand(x.shape[-1], 1, x.shape[-1], x.shape[-1]) + upper_bound = 20 if backend[0] == "mlprogram" else -1 self.run_compare_torch( torch.arange(20).reshape((1, 20)), TestModel(), input_as_shape=False, - converter_input_type=[TensorType(shape=[1, ct.RangeDim()])], + converter_input_type=[TensorType(shape=[1, ct.RangeDim(upper_bound=upper_bound)])], backend=backend, compute_unit=compute_unit, ) @@ -4147,11 +4250,19 @@ class TestModel(nn.Module): def forward(self, x): return x.expand(x.shape[0], 10) + upper_bound = 20 if backend[0] == "mlprogram" else -1 self.run_compare_torch( torch.arange(20).reshape((20, 1)), TestModel(), input_as_shape=False, - converter_input_type=[TensorType(shape=[ct.RangeDim(), ct.RangeDim()])], + converter_input_type=[ + TensorType( + shape=[ + ct.RangeDim(upper_bound=upper_bound), + ct.RangeDim(upper_bound=upper_bound), + ] + ) + ], backend=backend, compute_unit=compute_unit, ) @@ -4172,7 +4283,11 @@ def forward(self, x, y): [torch.arange(20).reshape((20, 1)), torch.Tensor([20, 20])], TestModel(), input_as_shape=False, - converter_input_type=[TensorType(shape=[ct.RangeDim(), 1])], + converter_input_type=[ + TensorType( + shape=[ct.RangeDim(upper_bound=20 if backend[0] == "mlprogram" else -1), 1] + ) + ], backend=backend, compute_unit=compute_unit, ) @@ -4397,6 +4512,7 @@ def forward(self, x): input_as_shape=False, ) + class TestEinsum(TorchBaseTest): @pytest.mark.parametrize( "compute_unit, backend, equation, reverse_input_order, dynamic", @@ -4412,36 +4528,8 @@ def test_einsum(self, compute_unit, backend, equation, reverse_input_order, dyna class TestEinsum(nn.Module): def forward(self, x, y): return torch.einsum(equation, x, y) - if backend == ("mlprogram", "fp16"): - if equation in [ - "abc,cde->abde", - "abcd,cde->abe", - "iji,ji->j", - "jii,ijk->jk", - "ija,la->ijal", - "ia,ia->a", - "ai,ia->a", - "abi,abi->ab", - "iab,iab->ab", - "abi,bai->ba", - "ij,j->i", - "i,ij->j", - "ai,ija->aj", - "aibj,bi->jba", - "ij,jk->ik", - "abij,abjk->abik", - "aijb,bajk->abik", - "aij,aij->a", - "ija,ija->a", - "ija,jia->a", - "aijb,ajbi->ab", - "aibj,cdij->cadb", - "ijk,lmj->iklm", - "ijak,akl->aijl", - ] and dynamic: - pytest.xfail("rdar://106631543 ([Infra]Re-enable the unittests for torch einsum ops)") - - input_shapes, converter_input_type = gen_input_shapes_einsum(equation, dynamic) + + input_shapes, converter_input_type = gen_input_shapes_einsum(equation, dynamic, backend) if reverse_input_order: input_output_strings = equation.split("->") @@ -4464,7 +4552,7 @@ def forward(self, x, y): backend=backend, compute_unit=compute_unit, input_as_shape=True, - converter_input_type=converter_input_type + converter_input_type=converter_input_type, ) @pytest.mark.parametrize( @@ -4551,7 +4639,7 @@ def test_cumsum(self, compute_unit, backend, axis): class TestReshape(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, 
output_shape", + "compute_unit, backend, output_shape, minimum_deployment_target", itertools.product( compute_units, backends, @@ -4560,13 +4648,18 @@ class TestReshape(TorchBaseTest): (2, -1), (2, 1, 1, 3), ], + [None, ct.target.iOS17], ), ) - def test_reshape(self, compute_unit, backend, output_shape): + def test_reshape(self, compute_unit, backend, output_shape, minimum_deployment_target): input_shape = (2, 3) model = ModuleWrapper(function=torch.reshape, kwargs={"shape": output_shape}) self.run_compare_torch( - input_shape, model, backend=backend, compute_unit=compute_unit + input_shape, + model, + backend=backend, + compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, ) @@ -4606,9 +4699,15 @@ def test_flatten(self, compute_unit, backend, start_dim, end_dim, is_dynamic): input_shape = (2, 3, 4, 5) converter_input_type = None if is_dynamic: + dim_upper_bound = 8 if backend[0] == "mlprogram" else -1 converter_input_type = [ TensorType( - shape=(2, 3, RangeDim(default=4), RangeDim(default=5)), + shape=( + 2, + 3, + RangeDim(default=4, upper_bound=dim_upper_bound), + RangeDim(default=5, upper_bound=dim_upper_bound), + ), dtype=np.float32, ) ] @@ -4646,8 +4745,40 @@ def test_gather_along_axis(self, compute_unit, backend, rank_and_axis): [params_shape], model, backend=backend, compute_unit=compute_unit ) + def test_gather_along_axis_invalid_indices(self): + """This test is to verify that PyTorch gather op doesn't allow negative and out-of-range + indices, so we don't need to add mb.select for IOS17 mb.gather op when lowering torch.gather.""" + data = torch.tensor([[1, 2], [3, 4]]) + with pytest.raises(RuntimeError, match="index -1 is out of bounds"): + torch.gather(data, 1, torch.tensor([[-1, 0], [1, 0]])) + with pytest.raises(RuntimeError, match="index 2 is out of bounds"): + torch.gather(data, 1, torch.tensor([[0, 0], [2, 0]])) + class TestActivation(TorchBaseTest): + @staticmethod + def run_compare_torch(input_data, model, target_op: Optional[str] = None, **kwargs): + """Override compare method for Activation ops tests, as we want to verify the mixed + precision support for alpha/beta in IOS17 Activation Ops.""" + results = TorchBaseTest.run_compare_torch(input_data, model, **kwargs) + + if target_op and kwargs.get("backend", (None, None))[1] == "fp16": + prog: Program = results[1]._mil_program + activation_op: Operation = prog.find_ops(op_type=target_op, exactly_one=True)[0] + assert activation_op.x.dtype == types.fp16 + + # Before IOS17, both alpha and input/output are converted to fp16. + # After IOS17, alpha is kept as fp32 because it supports mixed precision. 
+ expected_alpha_beta_dtype = types.fp16 + if kwargs.get("minimum_deployment_target", None) == ct.target.iOS17: + expected_alpha_beta_dtype = types.fp32 + if hasattr(activation_op, "alpha"): + assert activation_op.alpha.dtype == expected_alpha_beta_dtype + if hasattr(activation_op, "beta"): + assert activation_op.beta.dtype == expected_alpha_beta_dtype + + return results + @pytest.mark.parametrize( "compute_unit, backend, shape", itertools.product(compute_units, backends, COMMON_SHAPES_ALL), @@ -4672,16 +4803,19 @@ def test_relu6(self, compute_unit, backend, shape): self.run_compare_torch(shape, model, backend=backend, compute_unit=compute_unit) @pytest.mark.parametrize( - "compute_unit, backend, alpha, shape, single_alpha", + "compute_unit, backend, alpha, shape, single_alpha, minimum_deployment_target", itertools.product( compute_units, backends, [0.25, 2.0], [(3,), (2, 6), (2, 3, 4), (2, 5, 6, 7), (2, 3, 4, 5, 6)], [True, False], + [None, ct.target.iOS17], ), ) - def test_prelu(self, compute_unit, backend, alpha, shape, single_alpha): + def test_prelu( + self, compute_unit, backend, alpha, shape, single_alpha, minimum_deployment_target + ): if backend[0] == "mlprogram" and backend[1] == "fp16" or (len(shape) == 5): pytest.xfail( "rdar://92175249 ([MIL] TestActivation::test_prelu[backend=(mlprogram, fp16)] CI failure)" @@ -4692,7 +4826,12 @@ def test_prelu(self, compute_unit, backend, alpha, shape, single_alpha): num_parameters = 1 model = nn.PReLU(num_parameters, alpha).eval() mlmodel = self.run_compare_torch( - input_shape, model, backend=backend, compute_unit=compute_unit + input_shape, + model, + backend=backend, + compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, + target_op="leaky_relu", # prelu got fused to lrelu ) prog = mlmodel[1]._mil_program # Unfortunately since all these tests result in a prelu with a common leakage factor, the @@ -4701,19 +4840,30 @@ def test_prelu(self, compute_unit, backend, alpha, shape, single_alpha): assert len(prog.find_ops(op_type="prelu")) == 0 @pytest.mark.parametrize( - "compute_unit, backend, shape, alpha", - itertools.product(compute_units, backends, COMMON_SHAPES_ALL, [0.1, 2.0, 1.4]), + "compute_unit, backend, shape, alpha, minimum_deployment_target", + itertools.product( + compute_units, backends, COMMON_SHAPES_ALL, [0.1, 2.0], [None, ct.target.iOS17] + ), ) - def test_leaky_relu(self, compute_unit, backend, shape, alpha): + def test_leaky_relu(self, compute_unit, backend, shape, alpha, minimum_deployment_target): model = nn.LeakyReLU(negative_slope=alpha).eval() self.run_compare_torch( shape, model, backend=backend, + minimum_deployment_target=minimum_deployment_target, + target_op="leaky_relu", ) model = ModuleWrapper(nn.functional.leaky_relu_, {"negative_slope": alpha}) - self.run_compare_torch(shape, model, backend=backend, compute_unit=compute_unit) + self.run_compare_torch( + shape, + model, + backend=backend, + compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, + target_op="leaky_relu", + ) @pytest.mark.parametrize( "compute_unit, backend, shape", @@ -4756,12 +4906,36 @@ def test_hardtanh(self, compute_unit, backend, range_val): ) @pytest.mark.parametrize( - "compute_unit, backend, shape, alpha", - itertools.product(compute_units, backends, COMMON_SHAPES_ALL, [0.1, 2.0, 1.4]), + "compute_unit, backend, shape, alpha, minimum_deployment_target", + itertools.product( + compute_units, backends, COMMON_SHAPES_ALL, [0.1, 2.0], [None, ct.target.iOS17] + ), ) - def test_elu(self, 
compute_unit, backend, shape, alpha): + def test_elu(self, compute_unit, backend, shape, alpha, minimum_deployment_target): model = nn.ELU(alpha).eval() - self.run_compare_torch(shape, model, backend=backend, compute_unit=compute_unit) + self.run_compare_torch( + shape, + model, + backend=backend, + compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, + target_op="elu", + ) + + @pytest.mark.parametrize( + "compute_unit, backend, shape, minimum_deployment_target", + itertools.product(compute_units, backends, COMMON_SHAPES_ALL, [None, ct.target.iOS17]), + ) + def test_hardswish(self, compute_unit, backend, shape, minimum_deployment_target): + model = nn.Hardswish().eval() + self.run_compare_torch( + shape, + model, + backend=backend, + compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, + target_op="thresholded_relu", + ) @pytest.mark.parametrize( "compute_unit, backend, shape", @@ -4794,26 +4968,38 @@ def test_sigmoid(self, compute_unit, backend, shape): self.run_compare_torch(shape, model, backend=backend, compute_unit=compute_unit) @pytest.mark.parametrize( - "compute_unit, backend, shape", - itertools.product(compute_units, backends, COMMON_SHAPES_ALL), + "compute_unit, backend, shape, minimum_deployment_target", + itertools.product(compute_units, backends, COMMON_SHAPES_ALL, [None, ct.target.iOS17]), ) - def test_sigmoid_hard(self, compute_unit, backend, shape): + def test_sigmoid_hard(self, compute_unit, backend, shape, minimum_deployment_target): model = nn.Hardsigmoid().eval() - self.run_compare_torch(shape, model, backend=backend, compute_unit=compute_unit) + self.run_compare_torch( + shape, + model, + backend=backend, + compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, + target_op="sigmoid_hard", + ) @pytest.mark.parametrize( - "compute_unit, backend, beta, threshold", - itertools.product(compute_units, backends, [1, 2, 5], [5, 10, 20]), + "compute_unit, backend, beta, threshold, minimum_deployment_target", + itertools.product(compute_units, backends, [1, 2, 5], [5, 10, 20], [None, ct.target.iOS17]), ) @pytest.mark.skipif( _macos_version() <= (10, 15), reason="Parametric SoftPlus segfaults on macOS 10.15 and below.", ) - def test_softplus(self, compute_unit, backend, beta, threshold): + def test_softplus(self, compute_unit, backend, beta, threshold, minimum_deployment_target): input_shape = (1, 10, 5, 15) model = nn.Softplus(beta, threshold).eval() self.run_compare_torch( - input_shape, model, backend=backend, compute_unit=compute_unit + input_shape, + model, + backend=backend, + compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, + target_op="softplus_parametric", ) @pytest.mark.parametrize( @@ -4913,7 +5099,7 @@ def test_elementwise_no_params(self, compute_unit, backend, shape, op_string): self.run_compare_torch(shape, model, backend=backend, compute_unit=compute_unit) @pytest.mark.parametrize( - "compute_unit, backend, shape, clamp_range", + "compute_unit, backend, shape, clamp_range, minimum_deployment_target", itertools.product( compute_units, backends, @@ -4928,9 +5114,10 @@ def test_elementwise_no_params(self, compute_unit, backend, shape, op_string): (1, 3.5), (1, -1), ], + [None, ct.target.iOS17], ), ) - def test_clamp(self, compute_unit, backend, shape, clamp_range): + def test_clamp(self, compute_unit, backend, shape, clamp_range, minimum_deployment_target): params_dict = {} if clamp_range[0] is not None: params_dict["min"] = clamp_range[0] @@ -4939,7 
+5126,12 @@ def test_clamp(self, compute_unit, backend, shape, clamp_range): model = ModuleWrapper(torch.clamp, params_dict) self.run_compare_torch( - shape, model, backend=backend, compute_unit=compute_unit, rand_range=(-5, 5) + shape, + model, + backend=backend, + compute_unit=compute_unit, + rand_range=(-5, 5), + minimum_deployment_target=minimum_deployment_target, ) @pytest.mark.parametrize( @@ -4963,15 +5155,16 @@ def test_clamp_int_input(self, compute_unit, backend): ) @pytest.mark.parametrize( - "compute_unit, backend, shape, threshold", + "compute_unit, backend, shape, threshold, minimum_deployment_target", itertools.product( compute_units, backends, [(1, 3, 5, 8)], [(0.0, 0.0), (0.5, 0.5), (0.5, 10), (0.9, 0.0)], + [None, ct.target.iOS17], ), ) - def test_threshold(self, compute_unit, backend, shape, threshold): + def test_threshold(self, compute_unit, backend, shape, threshold, minimum_deployment_target): model = torch.nn.Threshold(threshold[0], threshold[1]).eval() input_value = torch.rand(np.prod(shape)) # make sure the values are not too close to the threshold @@ -4985,6 +5178,7 @@ def test_threshold(self, compute_unit, backend, shape, threshold): backend=backend, compute_unit=compute_unit, input_as_shape=False, + minimum_deployment_target=minimum_deployment_target, ) @pytest.mark.parametrize( @@ -5197,6 +5391,48 @@ def test_split_with_sizes(self, compute_unit, backend, split_sizes, dim): input_shape, model, backend=backend, compute_unit=compute_unit ) + @pytest.mark.parametrize( + "compute_unit, backend, dim", + itertools.product(compute_units, backends, [-1]), + ) + def test_split_with_dynamic_sizes(self, compute_unit, backend, dim): + class TestModel(torch.nn.Module): + def forward(self, x): + size = x[0] + return torch.split(x, size, dim=dim) + + input_shape = np.random.randint(low=2, high=6, size=20) + torch_in = torch.tensor(input_shape) + model = TestModel() + torch_out = model(torch_in) + self.run_compare_torch( + torch_in, + model, + expected_results=torch_out, + input_as_shape=False, + backend=backend, + compute_unit=compute_unit, + ) + + if backends[0] == "mlprogram": + with patch.object(Var, "_is_nonreplaceable_var") as mocked_is_nonreplaceable_var: + # Mock that shape op is non-replaceable, so the gather op will be kept. + mocked_is_nonreplaceable_var.side_effect = ( + lambda var: var.op and "shape" in var.op.op_type + ) + with pytest.raises( + RuntimeError, + match="in operation of type split: Param 'split_sizes' must be const", + ): + self.run_compare_torch( + torch_in, + model, + expected_results=torch_out, + input_as_shape=False, + backend=backend, + compute_unit=compute_unit, + ) + class TestUnbind(TorchBaseTest): @pytest.mark.parametrize( @@ -5511,12 +5747,18 @@ def forward(self, x, y): module = Model() inputs = [torch.tensor([[1], [2]]), torch.tensor([2])] + upper_bound = 10 if backend[0] == "mlprogram" else -1 self.run_compare_torch( inputs, module, input_as_shape=False, converter_input_type=[ - ct.TensorType(shape=(ct.RangeDim(), ct.RangeDim())), + ct.TensorType( + shape=( + ct.RangeDim(upper_bound=upper_bound), + ct.RangeDim(upper_bound=upper_bound), + ) + ), ct.TensorType(shape=(1,)), ], backend=backend, @@ -5710,21 +5952,6 @@ def forward(self, x): # The empty_like op is folded to const, so there is no fill nor fill_like op. assert len(prog.find_ops(op_type="fill")) + len(prog.find_ops(op_type="fill_like")) == 0 - with patch.object(Var, '_is_nonreplaceable_var') as mocked_is_nonreplaceable_var: - # Mock that only shape op is not replaceable. 
- mocked_is_nonreplaceable_var.side_effect = ( - lambda var: var.op and var.op.op_type == "shape" - ) - mlmodel = self.run_compare_torch( - [(1, 2, 3)], - model, - backend=backend, - compute_unit=compute_unit - ) - prog = mlmodel[1]._mil_program - # The shape op is not folded to const. - assert len(prog.find_ops(op_type="fill")) + len(prog.find_ops(op_type="fill_like")) == 1 - @pytest.mark.parametrize( "compute_unit, backend, rank", itertools.product( @@ -5831,7 +6058,7 @@ class TestTopk(TorchBaseTest): [((4, 6, 7, 3), -1, 2), ((10, 3, 4), 2, 2), ((5,), 0, 2)], ), ) - def test_topk(self, compute_unit, backend, largest, sort, shape_dim_k, dynamic): + def test_topk(self, compute_unit, backend, largest, sort, dynamic, shape_dim_k): if not sort and backend[0] == "neuralnetwork": pytest.xfail("iOS16 version topk needed for sort = False") if not sort and _macos_version() < (13, 0): @@ -5868,6 +6095,49 @@ def forward(self, x, y): minimum_deployment_target=ct.target.iOS16 if not sort else None, ) + @pytest.mark.parametrize( + "compute_unit, backend, x_dtype", + itertools.product( + compute_units, + [("mlprogram", "fp16")], + [np.float32, np.float16, np.int32, np.int16, np.uint16], + ), + ) + def test_topk_ios17(self, compute_unit, backend, x_dtype): + if x_dtype == np.float16: + pytest.skip("PyTorch doesn't support fp16 topk.") + if x_dtype == np.uint16: + pytest.skip("PyTorch doesn't have uint16 data type.") + + x_torch_dtype = NUM_TO_TORCH_DTYPE[NUMPY_DTYPE_TO_TORCH_NUM[x_dtype]] + + class TopkModel(nn.Module): + def forward(self, x, y): + topk = torch.topk(x.to(x_torch_dtype), k=2, dim=-1, largest=True, sorted=True) + return topk.values + y + + input_data_x = torch.randint(low=0, high=100, size=(2, 3, 4)) + input_data_y = torch.randint(low=0, high=100, size=(1,)) + + model = TopkModel() + expected_results = model(input_data_x, input_data_y) + mlmodel = self.run_compare_torch( + [input_data_x, input_data_y], + model, + expected_results=expected_results, + input_as_shape=False, + backend=backend, + compute_unit=compute_unit, + minimum_deployment_target=ct.target.iOS17, + ) + prog = mlmodel[1]._mil_program + topk_op = prog.find_ops(op_type="topk", exactly_one=True)[0] + expected_topk_x_dtype = types.type_mapping.numpy_type_to_builtin_type(x_dtype) + if backend[1] == "fp16" and x_dtype == np.float32: + # For fp16 precision the fp32 input/output will be cast to fp16. 
+ expected_topk_x_dtype = types.fp16 + assert topk_op.x.dtype == expected_topk_x_dtype + class TestLog10(TorchBaseTest): @pytest.mark.parametrize( @@ -6491,7 +6761,16 @@ def forward(self, x): shape = (2, 10, 3) model = TensorAssignModel() if dynamic: - converter_input_type = [ct.TensorType(shape=(ct.RangeDim(), ct.RangeDim(), ct.RangeDim()))] + upper_bound = 10 if backend[0] == "mlprogram" else -1 + converter_input_type = [ + ct.TensorType( + shape=( + ct.RangeDim(upper_bound=upper_bound), + ct.RangeDim(upper_bound=upper_bound), + ct.RangeDim(upper_bound=upper_bound), + ) + ) + ] else: converter_input_type = None self.run_compare_torch( @@ -6521,8 +6800,15 @@ def forward(self, x, begin_0, begin_1, end_1): shape = (2, 10, 3) model = TensorAssignModel() if dynamic: + upper_bound = 10 if backend[0] == "mlprogram" else -1 converter_input_type = [ - ct.TensorType(shape=(ct.RangeDim(), ct.RangeDim(), ct.RangeDim())), + ct.TensorType( + shape=( + ct.RangeDim(upper_bound=upper_bound), + ct.RangeDim(upper_bound=upper_bound), + ct.RangeDim(upper_bound=upper_bound), + ) + ), ct.TensorType(shape=(1,), dtype=np.int32), ct.TensorType(shape=(1,), dtype=np.int32), ct.TensorType(shape=(1,), dtype=np.int32), @@ -6570,13 +6856,14 @@ def forward(self, x): class TestIndexPut(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend", + "compute_unit, backend, minimum_deployment_target", itertools.product( compute_units, backends, + [None, ct.target.iOS17], ), ) - def test_index_put_case_1(self, compute_unit, backend): + def test_index_put_case_1(self, compute_unit, backend, minimum_deployment_target): class IndexPutModel(torch.nn.Module): def forward(self, x, y): y = x + 1 @@ -6585,20 +6872,24 @@ def forward(self, x, y): return x shape = (3, 2) - model = IndexPutModel() self.run_compare_torch( - [shape, shape], model, backend=backend, compute_unit=compute_unit + [shape, shape], + IndexPutModel(), + backend=backend, + compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, ) @pytest.mark.parametrize( - "compute_unit, backend, rank", + "compute_unit, backend, rank, minimum_deployment_target", itertools.product( compute_units, backends, [0, 1], + [None, ct.target.iOS17], ), ) - def test_index_put_case_2(self, compute_unit, backend, rank): + def test_index_put_case_2(self, compute_unit, backend, rank, minimum_deployment_target): class IndexPutModel(torch.nn.Module): def forward(self, x): mask = torch.tensor([True, False, False, False, True, True]).view(3, 2) @@ -6610,16 +6901,18 @@ def forward(self, x): shape = (3, 2) model = IndexPutModel() - self.run_compare_torch(shape, model, backend=backend, compute_unit=compute_unit) + self.run_compare_torch(shape, model, backend=backend, compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target) @pytest.mark.parametrize( - "compute_unit, backend", + "compute_unit, backend, minimum_deployment_target", itertools.product( compute_units, backends, + [None, ct.target.iOS17], ), ) - def test_index_put_case_3(self, compute_unit, backend): + def test_index_put_case_3(self, compute_unit, backend, minimum_deployment_target): if _macos_version() < (13, 0): pytest.skip("Issue fixed in iOS16/macOS13") @@ -6640,13 +6933,14 @@ def forward(self, x, y): backend=backend, compute_unit=compute_unit, input_as_shape=False, + minimum_deployment_target=minimum_deployment_target, ) @pytest.mark.parametrize( - "compute_unit, backend, rank, accumulate", - itertools.product(compute_units, backends, [1, 2], [True, False]), + "compute_unit, 
backend, rank, accumulate, minimum_deployment_target", + itertools.product(compute_units, backends, [3], [True, False], [None, ct.target.iOS17]), ) - def test_index_put_case_4(self, compute_unit, backend, rank, accumulate): + def test_index_put_case_4(self, compute_unit, backend, rank, accumulate, minimum_deployment_target): class IndexPutModel(torch.nn.Module): def forward(self, x, indices, values): x.index_put_(tuple(indices.t()), values, accumulate=accumulate) @@ -6664,6 +6958,12 @@ def forward(self, x, indices, values): torch.LongTensor([[0, 1], [1, 2], [2, 2]]), torch.Tensor([1.0, 5.0, 8.0]), ] + elif rank == 3: + inputs = [ + torch.ones([2, 3, 4]), + torch.LongTensor([[0, 1], [1, 1], [0, 0]]), + torch.tensor([[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0], [9.0, 6.0, 2.0, 1.0]]), + ] model = IndexPutModel() self.run_compare_torch( @@ -6672,12 +6972,80 @@ def forward(self, x, indices, values): backend=backend, compute_unit=compute_unit, input_as_shape=False, + minimum_deployment_target=minimum_deployment_target, + ) + + @pytest.mark.parametrize( + "compute_unit, backend, accumulate, minimum_deployment_target", + itertools.product(compute_units, backends, [True, False], [None, ct.target.iOS17]), + ) + def test_index_put_negative_indices_case_1( + self, compute_unit, backend, accumulate, minimum_deployment_target + ): + class IndexPutModel(torch.nn.Module): + def forward(self, x): + x.index_put_( + indices=(torch.LongTensor([0, -1]), torch.LongTensor([-2, 1])), + values=torch.Tensor([1.0, 5.0]), + accumulate=accumulate, + ) + return x + + self.run_compare_torch( + (3, 4), + IndexPutModel(), + backend=backend, + compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, + ) + + @pytest.mark.parametrize( + "compute_unit, backend, rank, accumulate, minimum_deployment_target", + itertools.product( + compute_units, backends, [1, 2, 3], [True, False], [None, ct.target.iOS17] + ), + ) + def test_index_put_negative_indices_case_2( + self, compute_unit, backend, rank, accumulate, minimum_deployment_target + ): + class IndexPutModel(torch.nn.Module): + def forward(self, x, indices, values): + x.index_put_(tuple(indices.t()), values, accumulate=accumulate) + return x + + if rank == 1: + inputs = [ + torch.Tensor([1.0, 2.0, 3.0, 4.0, 5.0, 6]), + torch.LongTensor([[-1], [-4]]), + torch.Tensor([3.0, 7.0]), + ] + elif rank == 2: + inputs = [ + torch.ones([3, 4]), + torch.LongTensor([[-2, -1], [-2, 0], [-1, 1]]), + torch.Tensor([1.0, 5.0, 8.0]), + ] + elif rank == 3: + inputs = [ + torch.ones([2, 3, 4]), + torch.LongTensor([[-1, -1], [-2, 0], [0, 1]]), + torch.tensor([[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0], [9.0, 6.0, 2.0, 1.0]]), + ] + + model = IndexPutModel() + self.run_compare_torch( + inputs, + model, + backend=backend, + compute_unit=compute_unit, + input_as_shape=False, + minimum_deployment_target=minimum_deployment_target, ) class TestIndex(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, shape", + "compute_unit, backend, shape, minimum_deployment_target", itertools.product( compute_units, backends, @@ -6685,9 +7053,10 @@ class TestIndex(TorchBaseTest): (10,), (3, 4, 5, 6), ], + [None, ct.target.iOS17], ), ) - def test_index_bool_indices(self, compute_unit, backend, shape): + def test_index_bool_indices(self, compute_unit, backend, shape, minimum_deployment_target): rank = len(shape) class IndexModel(torch.nn.Module): def __init__(self, axis): @@ -6722,10 +7091,11 @@ def forward(self, x, y): backend=backend, compute_unit=compute_unit, 
input_as_shape=False, + minimum_deployment_target=minimum_deployment_target, ) @pytest.mark.parametrize( - "compute_unit, backend, shape", + "compute_unit, backend, shape, minimum_deployment_target", itertools.product( compute_units, backends, @@ -6733,9 +7103,10 @@ def forward(self, x, y): (1, 2), (3, 4, 5, 6), ], + [None, ct.target.iOS17], ), ) - def test_index_int_index_case_1(self, compute_unit, backend, shape): + def test_index_int_index_case_1(self, compute_unit, backend, shape, minimum_deployment_target): # all elements are selected class IndexModel(torch.nn.Module): def forward(self, x): @@ -6745,10 +7116,16 @@ def forward(self, x): return x[:] model = IndexModel() - self.run_compare_torch(shape, model, backend=backend, compute_unit=compute_unit) + self.run_compare_torch( + shape, + model, + backend=backend, + compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, + ) @pytest.mark.parametrize( - "compute_unit, backend, shape", + "compute_unit, backend, shape, minimum_deployment_target", itertools.product( compute_units, backends, @@ -6756,24 +7133,31 @@ def forward(self, x): (1, 2), (3, 4, 5, 6), ], + [None, ct.target.iOS17], ), ) - def test_index_int_index_case_2(self, compute_unit, backend, shape): - # only one axis is sliced + def test_index_int_index_case_2(self, compute_unit, backend, shape, minimum_deployment_target): + """Only one axis is sliced.""" class IndexModel(torch.nn.Module): def forward(self, x): if len(shape) == 2: index = torch.tensor([0]) return x[index, :] elif len(shape) == 4: - index = torch.tensor([1, 2]) + index = torch.tensor([1, -2]) return x[:, :, index] model = IndexModel() - self.run_compare_torch(shape, model, backend=backend, compute_unit=compute_unit) + self.run_compare_torch( + shape, + model, + backend=backend, + compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, + ) @pytest.mark.parametrize( - "compute_unit, backend, shape", + "compute_unit, backend, shape, minimum_deployment_target", itertools.product( compute_units, backends, @@ -6781,10 +7165,11 @@ def forward(self, x): (1, 2, 3), (2, 3, 4, 5), ], + [None, ct.target.iOS17], ), ) - def test_index_int_index_case_3(self, compute_unit, backend, shape): - # only two axes are sliced, and connected + def test_index_int_index_case_3(self, compute_unit, backend, shape, minimum_deployment_target): + """Only two axes are sliced, and connected.""" class IndexModel(torch.nn.Module): def forward(self, x): if len(shape) == 3: @@ -6798,10 +7183,16 @@ def forward(self, x): return x[:, index_1, index_2, :] model = IndexModel() - self.run_compare_torch(shape, model, backend=backend, compute_unit=compute_unit) + self.run_compare_torch( + shape, + model, + backend=backend, + compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, + ) @pytest.mark.parametrize( - "compute_unit, backend, shape", + "compute_unit, backend, shape, minimum_deployment_target", itertools.product( compute_units, backends, @@ -6809,10 +7200,11 @@ def forward(self, x): (1, 2, 3), (2, 3, 4, 5), ], + [None, ct.target.iOS17], ), ) - def test_index_int_index_case_4(self, compute_unit, backend, shape): - # only two axes are sliced, and not connected + def test_index_int_index_case_4(self, compute_unit, backend, shape, minimum_deployment_target): + """Only two axes are sliced, and not connected.""" class IndexModel(torch.nn.Module): def forward(self, x): if len(shape) == 3: @@ -6826,10 +7218,16 @@ def forward(self, x): return x[index_1, :, :, index_2] model = 
IndexModel() - self.run_compare_torch(shape, model, backend=backend, compute_unit=compute_unit) + self.run_compare_torch( + shape, + model, + backend=backend, + compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, + ) @pytest.mark.parametrize( - "compute_unit, backend, shape", + "compute_unit, backend, shape, minimum_deployment_target", itertools.product( compute_units, backends, @@ -6837,30 +7235,37 @@ def forward(self, x): (1, 2, 3), (2, 3, 4, 5), ], + [None, ct.target.iOS17], ), ) - def test_index_int_index_case_5(self, compute_unit, backend, shape): - # all axes are sliced + def test_index_int_index_case_5(self, compute_unit, backend, shape, minimum_deployment_target): + """All axes are sliced.""" class IndexModel(torch.nn.Module): def forward(self, x): if len(shape) == 3: index_1 = torch.tensor([0]) index_2 = torch.tensor([1]) - index_3 = torch.tensor([2]) + index_3 = torch.tensor([-1]) # Test negative indices. return x[index_1, index_2, index_3] elif len(shape) == 4: index_1 = torch.tensor([0, 1, 1, 0, 0]) index_2 = torch.tensor([1, 2, 0, 0, 0]) - index_3 = torch.tensor([0, 1, 2, 3, 3]) + index_3 = torch.tensor([0, 1, -2, 3, 3]) # Test negative indices. index_4 = torch.tensor([2, 1, 0, 4, 4]) return x[index_1, index_2, index_3, index_4] model = IndexModel() - self.run_compare_torch(shape, model, backend=backend, compute_unit=compute_unit) + self.run_compare_torch( + shape, + model, + backend=backend, + compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, + ) @pytest.mark.parametrize( - "compute_unit, backend, shape", + "compute_unit, backend, shape, minimum_deployment_target", itertools.product( compute_units, backends, @@ -6868,10 +7273,11 @@ def forward(self, x): (1, 2), (3, 4, 5, 6), ], + [None, ct.target.iOS17], ), ) - def test_index_int_index_case_6(self, compute_unit, backend, shape): - # only one axis is sliced + nd mode + def test_index_int_index_case_6(self, compute_unit, backend, shape, minimum_deployment_target): + """Only one axis is sliced + nd mode.""" class IndexModel(torch.nn.Module): def forward(self, x): if len(shape) == 2: @@ -6884,10 +7290,16 @@ def forward(self, x): return x[:, index] model = IndexModel() - self.run_compare_torch(shape, model, backend=backend, compute_unit=compute_unit) + self.run_compare_torch( + shape, + model, + backend=backend, + compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, + ) @pytest.mark.parametrize( - "compute_unit, backend, shape", + "compute_unit, backend, shape, minimum_deployment_target", itertools.product( compute_units, backends, @@ -6895,10 +7307,11 @@ def forward(self, x): (1, 2, 3), (2, 3, 4, 5), ], + [None, ct.target.iOS17], ), ) - def test_index_int_index_case_7(self, compute_unit, backend, shape): - # two axes are sliced, and connected + nd mode + def test_index_int_index_case_7(self, compute_unit, backend, shape, minimum_deployment_target): + """Two axes are sliced, and connected + nd mode.""" class IndexModel(torch.nn.Module): def forward(self, x): if len(shape) == 3: @@ -6912,10 +7325,16 @@ def forward(self, x): return x[:, index_1, index_2, :] model = IndexModel() - self.run_compare_torch(shape, model, backend=backend, compute_unit=compute_unit) + self.run_compare_torch( + shape, + model, + backend=backend, + compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, + ) @pytest.mark.parametrize( - "compute_unit, backend, shape", + "compute_unit, backend, shape, minimum_deployment_target", 
itertools.product( compute_units, backends, @@ -6923,10 +7342,11 @@ def forward(self, x): (1, 2, 3), (2, 3, 4, 5), ], + [None, ct.target.iOS17], ), ) - def test_index_int_index_case_8(self, compute_unit, backend, shape): - # two axes are sliced, and not connected + nd mode + def test_index_int_index_case_8(self, compute_unit, backend, shape, minimum_deployment_target): + """Two axes are sliced, and not connected + nd mode.""" class IndexModel(torch.nn.Module): def forward(self, x): if len(shape) == 3: @@ -6940,10 +7360,16 @@ def forward(self, x): return x[index_1, :, :, index_2] model = IndexModel() - self.run_compare_torch(shape, model, backend=backend, compute_unit=compute_unit) + self.run_compare_torch( + shape, + model, + backend=backend, + compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, + ) @pytest.mark.parametrize( - "compute_unit, backend, shape", + "compute_unit, backend, shape, minimum_deployment_target", itertools.product( compute_units, backends, @@ -6951,10 +7377,11 @@ def forward(self, x): (1, 2, 3), (2, 3, 4, 5), ], + [None, ct.target.iOS17], ), ) - def test_index_int_index_case_9(self, compute_unit, backend, shape): - # one axis is sliced through bool mask + def test_index_int_index_case_9(self, compute_unit, backend, shape, minimum_deployment_target): + """One axis is sliced through bool mask.""" class IndexModel(torch.nn.Module): def forward(self, x): if len(shape) == 3: @@ -6964,10 +7391,16 @@ def forward(self, x): return x[[True, False], :, :, :] model = IndexModel() - self.run_compare_torch(shape, model, backend=backend, compute_unit=compute_unit) + self.run_compare_torch( + shape, + model, + backend=backend, + compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, + ) @pytest.mark.parametrize( - "compute_unit, backend, shape", + "compute_unit, backend, shape, minimum_deployment_target", itertools.product( compute_units, backends, @@ -6975,10 +7408,11 @@ def forward(self, x): (1, 2, 3), (2, 3, 4, 5), ], + [None, ct.target.iOS17], ), ) - def test_index_int_index_case_10(self, compute_unit, backend, shape): - # multiple axes are sliced through bool masks with possible broadcasting + def test_index_int_index_case_10(self, compute_unit, backend, shape, minimum_deployment_target): + """Multiple axes are sliced through bool masks with possible broadcasting.""" class IndexModel(torch.nn.Module): def forward(self, x): if len(shape) == 3: @@ -7003,10 +7437,16 @@ def forward(self, x): return output_1, output_2 model = IndexModel() - self.run_compare_torch(shape, model, backend=backend, compute_unit=compute_unit) + self.run_compare_torch( + shape, + model, + backend=backend, + compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, + ) @pytest.mark.parametrize( - "compute_unit, backend, shape", + "compute_unit, backend, shape, minimum_deployment_target", itertools.product( compute_units, backends, @@ -7014,10 +7454,11 @@ def forward(self, x): (3, 4), (3, 4, 5, 6) ], + [None, ct.target.iOS17], ), ) - def test_index_int_index_case_11(self, compute_unit, backend, shape): - # broadcasable indices + def test_index_int_index_case_11(self, compute_unit, backend, shape, minimum_deployment_target): + """Broadcastable indices.""" class IndexModel(torch.nn.Module): def forward(self, x): if len(shape) == 2: @@ -7032,10 +7473,16 @@ def forward(self, x): return x[index_1, :, index_3, index_2] model = IndexModel() - self.run_compare_torch(shape, model, backend=backend, compute_unit=compute_unit) + 
self.run_compare_torch( + shape, + model, + backend=backend, + compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, + ) @pytest.mark.parametrize( - "compute_unit, backend, shape", + "compute_unit, backend, shape, minimum_deployment_target", itertools.product( compute_units, backends, @@ -7043,10 +7490,11 @@ def forward(self, x): (1, 2, 3), (2, 3, 4, 5), ], + [None, ct.target.iOS17], ), ) - def test_index_int_index_case_12(self, compute_unit, backend, shape): - # Another broadcastable indices test case + def test_index_int_index_case_12(self, compute_unit, backend, shape, minimum_deployment_target): + """Another broadcastable indices test case.""" class IndexModel(torch.nn.Module): def forward(self, x): index_1 = torch.tensor([0, 1]) @@ -7058,9 +7506,67 @@ def forward(self, x): ) self.run_compare_torch( - shape, IndexModel(), backend=backend, compute_unit=compute_unit + shape, + IndexModel(), + backend=backend, + compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, + ) + + @pytest.mark.parametrize( + "compute_unit, backend, shape, minimum_deployment_target", + itertools.product( + compute_units, + backends, + [ + (1, 2, 3), + (2, 3, 4, 5), + ], + [None, ct.target.iOS17], + ), + ) + def test_index_int_index_case_13(self, compute_unit, backend, shape, minimum_deployment_target): + """Another broadcastable indices (negative) test case.""" + + class IndexModel(torch.nn.Module): + def forward(self, x): + index_1 = torch.tensor([-1, 1]) + index_2 = torch.tensor([-1]) + return x[:, index_1, index_2] if len(shape) == 3 else x[:, index_1, index_2, :] + + self.run_compare_torch( + shape, + IndexModel(), + backend=backend, + compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, ) + +class TestIndexSelect(TorchBaseTest): + @pytest.mark.parametrize( + "compute_unit, backend, dim", + itertools.product(compute_units, backends, [0, -1]), + ) + def test_index_select(self, compute_unit, backend, dim): + class TestModel(torch.nn.Module): + def forward(self, x): + indices = torch.tensor([0, 2]) + return torch.index_select(x, dim, indices) + + self.run_compare_torch((3, 4), TestModel(), backend=backend, compute_unit=compute_unit) + + def test_index_select_invalid_indices(self): + """This test is to verify that PyTorch index_select op doesn't allow negative nor + out-of-range indices, so we don't need to add mb.select for IOS17 mb.gather when lowering + PyTorch index_select op.""" + x = torch.randn(3, 4) + with pytest.raises(IndexError, match="index out of range"): + torch.index_select(x, 0, torch.tensor([0, -1])) + with pytest.raises(IndexError, match="index out of range"): + torch.index_select(x, 0, torch.tensor([0, 3])) + + class TestLoss(TorchBaseTest): @pytest.mark.parametrize( "compute_unit, backend, rank, reduction", @@ -7174,6 +7680,26 @@ def test_constant_pad_3d(self, compute_unit, backend): ) +class TestMaskedFill(TorchBaseTest): + @pytest.mark.parametrize( + "compute_unit, backend", + itertools.product(compute_units, backends), + ) + def test_masked_fill(self, compute_unit, backend): + SHAPE = (2, 3) + MASK = torch.bernoulli(torch.rand(SHAPE[-1])).to(torch.bool) + VALUE = 10.0 + + model = ModuleWrapper(torch.masked_fill, {"mask": MASK, "value": VALUE}) + + TorchBaseTest.run_compare_torch( + SHAPE, + model, + backend=backend, + compute_unit=compute_unit, + ) + + class TestMeshgrid(TorchBaseTest): @pytest.mark.parametrize( "compute_unit, backend, x, y, z, dtype, inp_mode, indexing", @@ -7227,7 +7753,7 @@ def 
forward(self, x, y, z): class TestScatter(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, shapes_dims", + "compute_unit, backend, shapes_dims, minimum_deployment_target", itertools.product( compute_units, backends, @@ -7236,9 +7762,10 @@ class TestScatter(TorchBaseTest): [(2, 3), (1, -1)], [(2, 3, 4, 5), (0, -2)], ], + [None, ct.target.iOS17], ), ) - def test_scatter(self, compute_unit, backend, shapes_dims): + def test_scatter(self, compute_unit, backend, shapes_dims, minimum_deployment_target): class TestModel(nn.Module): def __init__(self, dim, shapes): super(TestModel, self).__init__() @@ -7251,13 +7778,14 @@ def forward(self, x): shapes, dims = shapes_dims for dim in dims: - m = TestModel(0, shapes) + m = TestModel(dim, shapes) self.run_compare_torch( - shapes, m, backend=backend, compute_unit=compute_unit + shapes, m, backend=backend, compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, ) @pytest.mark.parametrize( - "compute_unit, backend, shapes_dims", + "compute_unit, backend, shapes_dims, minimum_deployment_target", itertools.product( compute_units, backends, @@ -7266,9 +7794,10 @@ def forward(self, x): [(2, 3), (1, -1)], [(2, 3, 4, 5), (0, -2)], ], + [None, ct.target.iOS17], ), ) - def test_scatter_with_scalar_source(self, compute_unit, backend, shapes_dims): + def test_scatter_with_scalar_source(self, compute_unit, backend, shapes_dims, minimum_deployment_target): class TestModel(nn.Module): def __init__(self, dim, shapes): super(TestModel, self).__init__() @@ -7281,13 +7810,14 @@ def forward(self, x): shapes, dims = shapes_dims for dim in dims: - m = TestModel(0, shapes) + m = TestModel(dim, shapes) self.run_compare_torch( - shapes, m, backend=backend, compute_unit=compute_unit + shapes, m, backend=backend, compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, ) @pytest.mark.parametrize( - "compute_unit, backend, shapes_dims, mode", + "compute_unit, backend, shapes_dims, mode, minimum_deployment_target", itertools.product( compute_units, backends, @@ -7297,9 +7827,10 @@ def forward(self, x): [(2, 3, 4, 5), (0, -2)], ], ["add", "multiply"], + [None, ct.target.iOS17], ), ) - def test_scatter_with_reduce(self, compute_unit, backend, shapes_dims, mode): + def test_scatter_with_reduce(self, compute_unit, backend, shapes_dims, mode, minimum_deployment_target): class TestModel(nn.Module): def __init__(self, dim, shapes, mode): super(TestModel, self).__init__() @@ -7313,13 +7844,14 @@ def forward(self, x): shapes, dims = shapes_dims for dim in dims: - m = TestModel(0, shapes, mode) + m = TestModel(dim, shapes, mode) self.run_compare_torch( - shapes, m, backend=backend, compute_unit=compute_unit + shapes, m, backend=backend, compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, ) @pytest.mark.parametrize( - "compute_unit, backend, shapes_dims", + "compute_unit, backend, shapes_dims, minimum_deployment_target", itertools.product( compute_units, backends, @@ -7328,9 +7860,10 @@ def forward(self, x): [(2, 3), (1, -1)], [(2, 3, 4, 5), (0, -2)], ], + [None, ct.target.iOS17], ), ) - def test_scatter_add(self, compute_unit, backend, shapes_dims): + def test_scatter_add(self, compute_unit, backend, shapes_dims, minimum_deployment_target): class TestModel(nn.Module): def __init__(self, dim, shapes): super(TestModel, self).__init__() @@ -7345,7 +7878,49 @@ def forward(self, x): for dim in dims: m = TestModel(dim, shapes) self.run_compare_torch( - shapes, m, backend=backend, 
compute_unit=compute_unit + shapes, m, backend=backend, compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, + ) + + @pytest.mark.parametrize( + "compute_unit, backend", + itertools.product( + compute_units, + [("mlprogram", "fp16")], + ), + ) + def test_scatter_with_invalid_indices(self, compute_unit, backend): + """ + As PyTorch's `scatter_` and `scatter_add_` do verify indices and error out for negative + and out-of-bound indices, it doesn't involve the PyMIL validation. + """ + + class ScatterModel(nn.Module): + def forward(self, x): + index = torch.tensor([[-1, 1, 2, 0]]) + return torch.zeros(1, 4, dtype=x.dtype).scatter_(1, index, x) + + class ScatterAddModel(nn.Module): + def forward(self, x): + index = torch.tensor([[0, 5, 2, 0]]) + return torch.zeros(1, 4, dtype=x.dtype).scatter_add_(1, index, x) + + with pytest.raises(RuntimeError, match="index -1 is out of bounds for dimension 1"): + self.run_compare_torch( + (1, 4), + ScatterModel(), + backend=backend, + compute_unit=compute_unit, + minimum_deployment_target=ct.target.iOS17, + ) + + with pytest.raises(RuntimeError, match="index 5 is out of bounds for dimension 1"): + self.run_compare_torch( + (1, 4), + ScatterAddModel(), + backend=backend, + compute_unit=compute_unit, + minimum_deployment_target=ct.target.iOS17, ) @@ -7452,6 +8027,15 @@ def forward(self, x): converter_input_type=converter_input_type, ) + def test_embedding_invalid_indices(self): + """This test is to verify that PyTorch embedding op doesn't allow negative and out-of-range + indices, so we don't need to add mb.select for IOS17 mb.gather op.""" + embedding_matrix = torch.rand(10, 3) + with pytest.raises(IndexError, match="index out of range"): + torch.nn.functional.embedding(torch.tensor([[-1, 2], [4, 3]]), embedding_matrix) + with pytest.raises(IndexError, match="index out of range"): + torch.nn.functional.embedding(torch.tensor([[1, 2], [4, 10]]), embedding_matrix) + class TestDuplicateOutputTensors(TorchBaseTest): @pytest.mark.parametrize( @@ -7944,7 +8528,7 @@ def forward(self, x): @pytest.mark.parametrize( "compute_unit, backend", itertools.product( - compute_units, + compute_units, backends, ) ) @@ -7953,7 +8537,7 @@ class AbsModel(torch.nn.Module): def forward(self, x): x = torch.complex(x, x) return torch.abs(x) - + TorchBaseTest.run_compare_torch( (1, 16), AbsModel(), @@ -8188,7 +8772,7 @@ class TestSTFT(TorchBaseTest): @pytest.mark.parametrize( "compute_unit, backend, input_shape, complex, n_fft, hop_length, win_length, window, center, pad_mode, normalized, onesided", itertools.product( - compute_units, + compute_units, backends, [(1, 32), (32,), (3, 32)], # input shape [False, True], # complex @@ -8211,19 +8795,19 @@ def forward(self, x): applied_window = window(win_length) if window and win_length else None x = torch.complex(x, x) if complex else x x = torch.stft( - x, - n_fft=n_fft, - hop_length=hop_length, + x, + n_fft=n_fft, + hop_length=hop_length, win_length=win_length, window=applied_window, - center=center, + center=center, pad_mode=pad_mode, normalized=normalized, onesided=onesided, return_complex=True) x = torch.stack([torch.real(x), torch.imag(x)], dim=0) return x - + TorchBaseTest.run_compare_torch( input_shape, STFTModel(), @@ -8235,7 +8819,7 @@ class TestSpectrogram(TorchBaseTest): @pytest.mark.parametrize( "compute_unit, backend, input_shape, spec, power", itertools.product( - compute_units, + compute_units, backends, [(1, 1000), (1000,), (3, 1000)], # input shape [torchaudio.transforms.Spectrogram, 
torchaudio.transforms.MelSpectrogram], @@ -8255,14 +8839,14 @@ def __init__(self) -> None: # the other spectrogram options are passed through to stft # and are tested in TestSTFT self.spec = spec(power=power, n_fft=128) - + def forward(self, x): x = self.spec(x) if power is None: # complex: stack them x = torch.stack([torch.real(x), torch.imag(x)], dim=0) return x - + TorchBaseTest.run_compare_torch( input_shape, SpectrogramModel(), @@ -8274,13 +8858,14 @@ def forward(self, x): class TestNms(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, box_num, iou_threshold, dynamic_input", + "compute_unit, backend, box_num, iou_threshold, dynamic_input, minimum_deployment_target", itertools.product( compute_units, backends, [1, 5, 20, 1000], [0.0, 0.2, 0.8], [True, False], + [None, ct.target.iOS17], ), ) def test_nms( @@ -8290,6 +8875,7 @@ def test_nms( box_num: int, iou_threshold: float, dynamic_input: bool, + minimum_deployment_target: ct.target, ): if box_num >= 1000 and backend == ("mlprogram", "fp16"): pytest.xfail( @@ -8323,9 +8909,10 @@ def forward(self, boxes, scores): input_scores = torch.tensor(input_scores, dtype=torch.float32) if dynamic_input: + upper_bound = 4096 if backend[0] == "mlprogram" else -1 converter_input_type = [ - ct.TensorType(shape=(RangeDim(1, -1), 4)), - ct.TensorType(shape=(RangeDim(1, -1),)), + ct.TensorType(shape=(RangeDim(1, upper_bound), 4)), + ct.TensorType(shape=(RangeDim(1, upper_bound),)), ] else: converter_input_type = [ @@ -8344,19 +8931,22 @@ def forward(self, boxes, scores): backend=backend, converter_input_type=converter_input_type, compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, ) @pytest.mark.parametrize( - "compute_unit, backend", + "compute_unit, backend, minimum_deployment_target", itertools.product( compute_units, backends, + [None, ct.target.iOS17], ), ) def test_nms_corner_case_iou_equal_threshold( self, compute_unit, backend: Tuple[str, str], + minimum_deployment_target: ct.target, ): class NmsModel(torch.nn.Module): def forward(self, boxes, scores): @@ -8376,12 +8966,38 @@ def forward(self, boxes, scores): nms_model = NmsModel() nms_model.eval() expected_results = nms_model(input_boxes, input_scores) - with pytest.raises(AssertionError, match="Items are not equal"): - # TODO: rdar://104966206 ([PyTorch] Re-enable NMS Corner Case Tests After PyTorch Fixes Bugs). - # This is because the IOU between the last box ([1., 1., 2., 3.]) and the second box ([0., 0., 2., 2.]) is - # exactly 0.2 (IOU threshold), which leads to a corner case that PyTorch will remove the second box while - # CoreML keeps it. According to PyTorch's doc, only boxes with `greater than iou_threshold` should be - # removed, so it's a bug in PyTorch's side. + + if backend[1] == "fp32" and minimum_deployment_target != ct.target.iOS17: + with pytest.raises(AssertionError, match="Items are not equal"): + # TODO: rdar://104966206 ([PyTorch] Re-enable NMS Corner Case Tests After PyTorch Fixes Bugs). + # This is because the IOU between the last box ([1., 1., 2., 3.]) and the 2nd box ([0., 0., 2., 2.]) is + # exactly 0.2 (IOU threshold), which leads to a corner case that PyTorch will remove the second box while + # CoreML keeps it. According to PyTorch's doc, only boxes with `greater than iou_threshold` should be + # removed, so it's a bug in PyTorch's side. 
+ # + # The reason for the PyTorch bug is: + # They always use fp64 for the IOU threshold in their C++ backend, + # even if the boxes and the scores can be fp32, + # so the IOU threshold (fp64 0.2) rounds to 0.20000000000000001 and + # the IOU between the last and the 2nd boxes (fp32 0.2) rounds to 0.20000000298023224, + # leading to fp32 0.2 > fp64 0.2 and the removal happens + TorchBaseTest.run_compare_torch( + [input_boxes, input_scores], + nms_model, + expected_results=expected_results, + input_as_shape=False, + backend=backend, + converter_input_type=converter_input_type, + compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, + ) + else: + # In fp16, the IOU threshold (fp16 0.2) rounds to 0.199951171875. + # On CPU, espresso computes everything in fp32, so the IOU between + # the last and the 2nd boxes (fp32 0.2) rounds to 0.20000000298023224, + # leading to fp32 0.2 > fp16 0.2 and the removal happens + # + # In iOS17, CoreML and PyTorch have the same results for the corner case. TorchBaseTest.run_compare_torch( [input_boxes, input_scores], nms_model, @@ -8390,10 +9006,11 @@ def forward(self, boxes, scores): backend=backend, converter_input_type=converter_input_type, compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, ) # Change the last input box to make IOU slightly larger than 0.2, the output of CoreML will match PyTorch. - input_boxes[-1][-1] = 2.999 + input_boxes[-1][-1] = 2.997 expected_results = nms_model(input_boxes, input_scores) TorchBaseTest.run_compare_torch( [input_boxes, input_scores], @@ -8406,7 +9023,7 @@ def forward(self, boxes, scores): ) # Change the last input box to make IOU slightly smaller than 0.2, the output of CoreML will match PyTorch. - input_boxes[-1][-1] = 3.0001 + input_boxes[-1][-1] = 3.003 expected_results = nms_model(input_boxes, input_scores) TorchBaseTest.run_compare_torch( [input_boxes, input_scores], @@ -8436,48 +9053,36 @@ def forward(self, x): [(1, 2, 3)], TestModel(), backend=backend, - compute_unit=compute_unit + compute_unit=compute_unit, ) @pytest.mark.parametrize( - "compute_unit, backend", + "compute_unit, backend, dim, minimum_deployment_target", itertools.product( compute_units, [('mlprogram', "fp16")], + [2, -1], + [None, ct.target.iOS17], ) ) - def test_tensor_size_with_dim(self, compute_unit: ct.ComputeUnit.CPU_ONLY, - backend: List[Tuple[str]]): + def test_tensor_size_with_dim( + self, + compute_unit: ct.ComputeUnit.CPU_ONLY, + backend: List[Tuple[str]], + dim: int, + minimum_deployment_target: ct.target, + ): class TestModel(torch.nn.Module): def forward(self, x): - return x.size(dim=-1) - - model = TestModel() + return x.size(dim=dim) - mlmodel = self.run_compare_torch( + self.run_compare_torch( [(1, 2, 3)], - model, + TestModel(), backend=backend, - compute_unit=compute_unit + compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, ) - prog = mlmodel[1]._mil_program - # The shape op is folded to const. - assert len(prog.find_ops(op_type="shape")) == 0 - - with patch.object(Var, '_is_nonreplaceable_var') as mocked_is_nonreplaceable_var: - # Mock that shape op is non-replaceable. - mocked_is_nonreplaceable_var.side_effect = ( - lambda var: var.op and "shape" in var.op.op_type - ) - mlmodel = self.run_compare_torch( - [(1, 2, 3)], - model, - backend=backend, - compute_unit=compute_unit - ) - prog = mlmodel[1]._mil_program - # The shape op is not folded to const.
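# [Editorial aside] The NMS corner-case comments above turn on how 0.2 rounds at
# different float precisions. A minimal, self-contained sketch (numpy only, not
# part of the patch or the test suite) that reproduces the quoted values:
import numpy as np

print(f"{np.float64(0.2):.17f}")          # 0.20000000000000001
print(f"{float(np.float32(0.2)):.17f}")   # 0.20000000298023224
print(f"{float(np.float16(0.2)):.12f}")   # 0.199951171875

# An fp32 IOU of "exactly" 0.2 therefore compares greater than both the fp64 and
# the fp16 threshold, which is why a box sitting exactly at the threshold is
# removed at one precision and kept at another.
assert np.float32(0.2) > np.float64(0.2)
assert np.float32(0.2) > np.float16(0.2)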
- assert len(prog.find_ops(op_type="shape")) == 1 class TestBitwiseAnd(TorchBaseTest): diff --git a/coremltools/converters/mil/frontend/torch/test/test_torch_quantization_ops.py b/coremltools/converters/mil/frontend/torch/test/test_torch_quantization_ops.py new file mode 100644 index 000000000..91f2a5798 --- /dev/null +++ b/coremltools/converters/mil/frontend/torch/test/test_torch_quantization_ops.py @@ -0,0 +1,309 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import itertools + +import numpy as np +import pytest +import torch +import torchvision + +import coremltools as ct +from coremltools import TensorType +from coremltools._deps import ( + _HAS_TORCH, + _HAS_TORCH_VISION, + MSG_TORCH_NOT_FOUND, + MSG_TORCH_VISION_NOT_FOUND, +) +from .testing_utils import TorchBaseTest + +pytest.mark.skipif(not _HAS_TORCH, reason=MSG_TORCH_NOT_FOUND) + +torch.manual_seed(30) +np.random.seed(30) +torch.backends.quantized.engine = "qnnpack" + + +class TorchQuantizationBaseTest(TorchBaseTest): + @staticmethod + def run_compare_torch( + input_data, + model, + atol=1e-04, + rtol=1e-05, + input_as_shape=True, + ): + # TODO(rdar://108472419): properly design a random input + if input_as_shape: + input_data = torch.ones(*input_data) + + return TorchBaseTest.run_compare_torch( + input_data, + model, + atol=atol, + rtol=rtol, + input_as_shape=False, + backend=("mlprogram", "fp32"), + use_scripting=False, + compute_unit=ct.ComputeUnit.CPU_ONLY, + minimum_deployment_target=ct.target.iOS17, + ) + + +# TODO(rdar://107430678): test stand-alone quantize and dequantize when cast is ready +class TestPyTorchQuantizationOps(TorchQuantizationBaseTest): + @pytest.mark.parametrize( + "quant_dtype, input_rank, is_zp_present, zp_dtype", + itertools.product( + (torch.qint8, torch.quint8, torch.qint32), + (1, 3, 5), + (True, False), + (np.int8, np.uint8, np.int32), + ), + ) + def test_quantize_dequantize_per_tensor(self, quant_dtype, input_rank, is_zp_present, zp_dtype): + input_shape = [*np.random.randint(low=1, high=5, size=(input_rank,))] + scale = np.random.rand() + zero_point = 0 + if is_zp_present: + low = 0 if quant_dtype == torch.quint8 or zp_dtype == np.uint8 else -128 + high = 128 if quant_dtype == torch.qint8 or zp_dtype == np.int8 else 256 + zero_point = np.random.randint(low, high, dtype=zp_dtype) + + class Model(torch.nn.Module): + def forward(self, x): + quantized = torch.quantize_per_tensor(x, scale, zero_point, quant_dtype) + dequantized = torch.dequantize(quantized) + return dequantized + + model = Model() + if quant_dtype == torch.qint32: + with pytest.raises( + ValueError, + match=r"MIL quantization dtype must be int8 or uint8", + ): + self.run_compare_torch(input_shape, model) + else: + self.run_compare_torch(input_shape, model, atol=5e-4, rtol=5e-4) + + @pytest.mark.parametrize( + "quant_dtype, input_rank, is_zp_present, zp_dtype", + itertools.product( + (torch.qint8, torch.quint8, torch.qint32), + (1, 4, 5), + (True, False), + (torch.int8, torch.uint8, torch.int32), + ), + ) + def test_quantize_dequantize_per_channel( + self, quant_dtype, input_rank, is_zp_present, zp_dtype + ): + input_shape = [*np.random.randint(low=1, high=5, size=(input_rank,))] + axis = np.random.randint(low=0, high=input_rank) + scale = torch.rand(input_shape[axis]) + zero_point = torch.zeros(input_shape[axis], dtype=zp_dtype) + if is_zp_present: + low = 0 if 
quant_dtype == torch.quint8 or zp_dtype == torch.uint8 else -128 + high = 128 if quant_dtype == torch.qint8 or zp_dtype == torch.int8 else 256 + zero_point = torch.randint(low, high, (input_shape[axis],), dtype=zp_dtype) + + class Model(torch.nn.Module): + def forward(self, x): + quantized = torch.quantize_per_channel(x, scale, zero_point, axis, quant_dtype) + dequantized = torch.dequantize(quantized) + return dequantized + + model = Model() + if quant_dtype == torch.qint32: + with pytest.raises( + ValueError, + match=r"MIL quantization dtype must be int8 or uint8", + ): + self.run_compare_torch(input_shape, model) + else: + self.run_compare_torch(input_shape, model, atol=5e-4, rtol=5e-4) + + +# TODO(rdar://108463675): refactor torch op tests later to parametrize quantized vs standard ops +class TestPytorchQuantizedOps(TorchQuantizationBaseTest): + # PyTorch quantized_linear kernel only supports rank >= 2 + @pytest.mark.parametrize( + "use_relu, input_rank, quant_dtype", + itertools.product([True, False], [2, 3, 4], [torch.quint8, torch.qint8]), + ) + def test_quantized_linear(self, use_relu, input_rank, quant_dtype): + class Model(torch.nn.Module): + def __init__(self): + super().__init__() + if use_relu: + linear = torch.nn.intrinsic.quantized.LinearReLU + else: + linear = torch.nn.quantized.Linear + self.quant_linear = linear(5, 4) + + def forward(self, x): + x = torch.quantize_per_tensor(x, scale=1.0, zero_point=0, dtype=quant_dtype) + x = self.quant_linear(x) + return torch.dequantize(x) + + model = Model() + + if input_rank == 2: + input_shape = (3, 5) + elif input_rank == 3: + input_shape = (1, 3, 5) + elif input_rank == 4: + input_shape = (1, 2, 3, 5) + self.run_compare_torch(input_shape, model) + + @pytest.mark.parametrize( + ",".join( + [ + "use_relu", + "quant_dtype", + "padding", + "stride", + "height", + "width", + "in_channels", + "out_channels", + "kernel_size", + "dilation", + "bias", + ] + ), + [ + (use_relu, quant_dtype, padding, stride, *param) + for use_relu, quant_dtype, padding, stride, param in itertools.product( + [True, False], + [torch.quint8, torch.qint8], + [1, 0], + [1, 2, 3], + [ + (5, 3, 1, 1, 1, 1, True), + (3, 3, 1, 1, 1, 3, False), + (4, 3, 3, 3, 2, 1, True), + (7, 3, 3, 3, 1, 1, False), + ], + ) + ], + ) + def test_quantized_conv2d( + self, + use_relu, + quant_dtype, + padding, + stride, + height, + width, + in_channels, + out_channels, + kernel_size, + dilation, + bias, + ): + if padding == "same" and stride != 1: + return + + class Model(torch.nn.Module): + def __init__(self): + super().__init__() + if use_relu: + conv = torch.nn.intrinsic.quantized.ConvReLU2d + else: + conv = torch.nn.quantized.Conv2d + self.quant_conv = conv( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + bias=bias, + dtype=quant_dtype, + ) + + def forward(self, x): + x = torch.quantize_per_tensor(x, scale=1.0, zero_point=0, dtype=quant_dtype) + x = self.quant_conv(x) + return torch.dequantize(x) + + model = Model() + + self.run_compare_torch( + (1, in_channels, height, width), + model, + ) + + @pytest.mark.parametrize( + "input_dtype", + (np.int32, np.float32), + ) + def test_quantized_embedding(self, input_dtype): + pytest.xfail("rdar://106152706 gather: Required param 'validate_indices' is missing") + + num_embeddings = 4 + embedding_size = 10 + B = 2 + dim = 5 + converter_input_type = TensorType(shape=(B, dim), dtype=input_dtype) + + # input shape: (B, dim) + # output shape : (B, dim, 
embedding_size) + # shape of weights : (num_embeddings, embedding_size) + class EmbeddingModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.embedding = torch.nn.quantized.Embedding(num_embeddings, embedding_size) + + def forward(self, x): + return self.embedding(x) + + input_data = np.random.randint(low=0, high=num_embeddings, size=(B, dim)) + input_data = torch.from_numpy(input_data) + model = EmbeddingModel() + self.run_compare_torch( + input_data, model, input_as_shape=False, converter_input_type=converter_input_type + ) + + # Tests for add, add_relu, mul + # See: https://pytorch.org/docs/stable/generated/torch.ao.nn.quantized.QFunctional.html + @pytest.mark.parametrize( + "quant_dtype, qfunc_name", + itertools.product( + [torch.quint8, torch.qint8], + ["add", "add_relu", "mul"], + ), + ) + def test_qfunc_binary_ops(self, quant_dtype, qfunc_name): + class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.qfunc = torch.nn.quantized.QFunctional() + + def forward(self, x): + x = torch.quantize_per_tensor(x, scale=1.0, zero_point=0, dtype=quant_dtype) + x = getattr(self.qfunc, qfunc_name)(x, x) + return torch.dequantize(x) + + model = Model() + + self.run_compare_torch((2, 3), model) + + +@pytest.mark.skipif(not _HAS_TORCH_VISION, reason=MSG_TORCH_VISION_NOT_FOUND) +class TestTorchvisionQuantizedModels(TorchQuantizationBaseTest): + # TODO (rdar://107444188): add other torchvision quantized models + # As of torchvision 0.13.1, there are 5 quantized models: + # googlenet, inception, mobilenet, resnet, shufflenet + # Unfortunately, only mobilenet is working. Others would have + # RuntimeError: Quantized backend not supported + # Presumably because they need `fbgemm`, which does not support macOS + # We should add them to our end-to-end test once torchvision fix their macOS + + def test_quantized_mobilenetv2(self): + model = torchvision.models.quantization.mobilenet_v2(pretrained=True, quantize=True) + self.run_compare_torch((1, 3, 224, 224), model, atol=1.0) diff --git a/coremltools/converters/mil/frontend/torch/test/testing_utils.py b/coremltools/converters/mil/frontend/torch/test/testing_utils.py index f50a587da..19cc9ffab 100644 --- a/coremltools/converters/mil/frontend/torch/test/testing_utils.py +++ b/coremltools/converters/mil/frontend/torch/test/testing_utils.py @@ -12,9 +12,8 @@ import coremltools.models.utils as coremltoolsutils from coremltools import RangeDim, TensorType from coremltools._deps import _IS_MACOS -from coremltools.converters.mil.mil.types.type_mapping import \ - nptype_from_builtin -from coremltools.converters.mil.testing_utils import ct_convert +from coremltools.converters.mil.mil.types.type_mapping import nptype_from_builtin +from coremltools.converters.mil.testing_utils import ct_convert, validate_minimum_deployment_target from ..converter import torch_to_mil_types @@ -148,7 +147,7 @@ def convert_and_compare( backend=("neuralnetwork", "fp32"), converter_input_type=None, compute_unit=ct.ComputeUnit.CPU_ONLY, - minimum_deployment_target=None + minimum_deployment_target=None, ): """ If expected results is not set, it will by default @@ -188,11 +187,8 @@ def convert_and_compare( if not coremltoolsutils._has_custom_layer(mlmodel._spec): coreml_preds = mlmodel.predict(coreml_inputs) coreml_outputs = mlmodel._spec.description.output - coreml_results = [ - coreml_preds[output.name] for output in coreml_outputs - ] - for torch_result, coreml_result in zip(expected_results, - coreml_results): + coreml_results = 
[coreml_preds[output.name] for output in coreml_outputs] + for torch_result, coreml_result in zip(expected_results, coreml_results): if torch_result.shape == (): torch_result = np.array([torch_result]) @@ -212,18 +208,18 @@ def store_testname_with_args(self, request): @staticmethod def run_compare_torch( - input_data, - model, - expected_results=None, - atol=1e-04, - rtol=1e-05, - input_as_shape=True, - backend=("neuralnetwork", "fp32"), - rand_range=(-1.0, 1.0), - use_scripting=False, - converter_input_type=None, - compute_unit=ct.ComputeUnit.CPU_ONLY, - minimum_deployment_target=None, + input_data, + model, + expected_results=None, + atol=1e-04, + rtol=1e-05, + input_as_shape=True, + backend=("neuralnetwork", "fp32"), + rand_range=(-1.0, 1.0), + use_scripting=False, + converter_input_type=None, + compute_unit=ct.ComputeUnit.CPU_ONLY, + minimum_deployment_target=None, ): """ Traces a model and runs a numerical test. @@ -233,6 +229,9 @@ def run_compare_torch( converter_input_type: If not None, then pass it to the "inputs" argument to the ct.convert() call. """ + if minimum_deployment_target is not None: + validate_minimum_deployment_target(minimum_deployment_target, backend) + model.eval() if input_as_shape: input_data = generate_input_data(input_data, rand_range) @@ -242,18 +241,17 @@ def run_compare_torch( else: model_spec = trace_model(model, _copy_input_data(input_data)) - model_spec, mlmodel, coreml_inputs, coreml_results = \ - convert_and_compare( - input_data, - model_spec, - expected_results=expected_results, - atol=atol, - rtol=rtol, - backend=backend, - converter_input_type=converter_input_type, - compute_unit=compute_unit, - minimum_deployment_target=minimum_deployment_target, - ) + model_spec, mlmodel, coreml_inputs, coreml_results = convert_and_compare( + input_data, + model_spec, + expected_results=expected_results, + atol=atol, + rtol=rtol, + backend=backend, + converter_input_type=converter_input_type, + compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, + ) return model_spec, mlmodel, coreml_inputs, coreml_results, \ TorchBaseTest.testclassname, TorchBaseTest.testmodelname diff --git a/coremltools/converters/mil/input_types.py b/coremltools/converters/mil/input_types.py index 0a06d4d76..3e0bd5457 100644 --- a/coremltools/converters/mil/input_types.py +++ b/coremltools/converters/mil/input_types.py @@ -4,14 +4,13 @@ # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause from enum import Enum +from typing import Optional import numpy as np from coremltools.converters.mil.mil import types from coremltools.converters.mil.mil.types.symbolic import is_symbolic -from coremltools.converters.mil.mil.types.type_mapping import ( - is_builtin, numpy_type_to_builtin_type) - +from coremltools.converters.mil.mil.types.type_mapping import is_builtin, numpy_type_to_builtin_type class ColorLayout(Enum): @@ -106,7 +105,7 @@ def __init__( color_layout: string or enumeration of type ``ct.colorlayout`` Color layout of the image. Valid values are as follows: - + Enumeration (recommended): * ``ct.colorlayout.RGB`` * ``ct.colorlayout.BGR`` @@ -264,29 +263,36 @@ def __str__(self): class RangeDim: - def __init__(self, lower_bound=1, upper_bound=-1, default=None, - symbol=None): + def __init__( + self, + lower_bound: int = 1, + upper_bound: int = -1, + default: Optional[int] = None, + symbol: Optional[str] = None, + ): """ A class for providing a range of accepted shapes. 
Parameters ---------- - lower_bound: (int) + lower_bound: The minimum valid value for the shape. - upper_bound: (int) + upper_bound: The maximum valid value for the shape. - Set to ``-1`` if there is no upper limit. + Set to ``-1`` if there is no upper limit (only works if backend is set to "neuralnetwork"). + When backend is set to "mlprogram" during conversion, -1 is not allowed. A finite + positive upper bound must be provided. - default: (int) or None - The default value that is used for initiating the model, and set in the input shape field of the model file. + default: + The default value that is used for initiating the model, and set in the input shape + field of the model file. If set to ``None``, ``lower_bound`` would be used as default. - symbol: (str) - Optional symbol name for the dim. Autogenerate a symbol name if - not specified. + symbol: + Optional symbol name for the dim. Autogenerate a symbol name if not specified. """ if symbol is None: from coremltools.converters.mil.mil import get_new_symbol @@ -296,20 +302,17 @@ def __init__(self, lower_bound=1, upper_bound=-1, default=None, self.symbol = Symbol(symbol) self.lower_bound = lower_bound self.upper_bound = upper_bound + if default is None: self.default = lower_bound else: if default < lower_bound: raise ValueError( - "Default value {} is less than minimum value ({}) for range".format( - default, lower_bound - ) + f"Default value {default} is less than minimum value ({lower_bound}) for range" ) - if upper_bound > 0 and default > upper_bound: + if default > upper_bound > 0: raise ValueError( - "Default value {} is greater than maximum value ({}) for range".format( - default, upper_bound - ) + f"Default value {default} is greater than maximum value ({upper_bound}) for range" ) self.default = default @@ -330,11 +333,11 @@ def __init__(self, shape, default=None): ---------- shape: list of (int), symbolic values, RangeDim object The valid shape of the input. - + default: tuple of int or None The default shape that is used for initiating the model, and set in the metadata of the model file. - + If None, then ``shape`` is used. """ from coremltools.converters.mil.mil import get_new_symbol diff --git a/coremltools/converters/mil/mil/block.py b/coremltools/converters/mil/mil/block.py index b9759959d..0002c5cf0 100644 --- a/coremltools/converters/mil/mil/block.py +++ b/coremltools/converters/mil/mil/block.py @@ -452,6 +452,7 @@ def try_replace_uses_of_var_after_op( anchor_op, old_var, new_var, + end_op=None, no_check_var_types=False, no_check_var_visibility=False, ): @@ -459,6 +460,7 @@ def try_replace_uses_of_var_after_op( :param anchor_op: Operation :param old_var: Var :param new_var: Var + :param end_op: Operation :param no_check_var_types: bool :param no_check_var_visibility: bool :return: True if the old_var can be replaced by new_var. False otherwsie. @@ -474,6 +476,7 @@ def try_replace_uses_of_var_after_op( self.replace_uses_of_var_after_op( anchor_op=anchor_op, + end_op=end_op, old_var=old_var, new_var=new_var, no_check_var_types=no_check_var_types, diff --git a/coremltools/converters/mil/mil/operation.py b/coremltools/converters/mil/mil/operation.py index 8b8888e27..19157b035 100644 --- a/coremltools/converters/mil/mil/operation.py +++ b/coremltools/converters/mil/mil/operation.py @@ -485,12 +485,11 @@ def _validate_and_set_inputs(self, input_kvs, no_check_var_types=False): def check_and_detach(v_new, v_old, op, no_check_var_types): # Check new var's sym_type is compatible with the # existing's sym_type. 
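# [Editorial aside] A minimal usage sketch of the bounded RangeDim behaviour
# documented in the input_types.py docstring above (illustrative only; the
# TinyModel module and the concrete shape values are made-up placeholders):
import torch
import coremltools as ct

class TinyModel(torch.nn.Module):
    def forward(self, x):
        return x + 1.0

traced = torch.jit.trace(TinyModel().eval(), torch.rand(2, 3))

# "mlprogram" requires a finite positive upper bound for every RangeDim;
# upper_bound=-1 (unbounded) is only accepted by the "neuralnetwork" backend.
mlmodel = ct.convert(
    traced,
    inputs=[ct.TensorType(shape=(ct.RangeDim(lower_bound=1, upper_bound=10, default=2), 3))],
    convert_to="mlprogram",
)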
- if ( - not is_compatible_type(v_new.sym_type, v_old.sym_type) - and not no_check_var_types - ): - msg = "New var type {} not a subtype of " + "existing var type {}" - raise ValueError(msg.format(v_new.sym_type, v_old.sym_type)) + if not is_compatible_type(v_new.sym_type, v_old.sym_type) and not no_check_var_types: + raise ValueError( + f"New var type `{types.builtin_to_string(v_new.sym_type)}` not a " + f"subtype of existing var type `{types.builtin_to_string(v_old.sym_type)}`." + ) v_old.remove_child_op(op, no_check_var_types) self.input_spec.validate_inputs(self.name, self.op_type, input_kvs) diff --git a/coremltools/converters/mil/mil/ops/defs/__init__.py b/coremltools/converters/mil/mil/ops/defs/__init__.py index f62e29b9a..edde1bdfe 100644 --- a/coremltools/converters/mil/mil/ops/defs/__init__.py +++ b/coremltools/converters/mil/mil/ops/defs/__init__.py @@ -3,4 +3,4 @@ # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -from . import complex_dialect_ops, iOS15, iOS16 +from . import complex_dialect_ops, iOS15, iOS16, iOS17 diff --git a/coremltools/converters/mil/mil/ops/defs/_utils.py b/coremltools/converters/mil/mil/ops/defs/_utils.py index 67f41e77c..f5ffead0c 100644 --- a/coremltools/converters/mil/mil/ops/defs/_utils.py +++ b/coremltools/converters/mil/mil/ops/defs/_utils.py @@ -72,6 +72,30 @@ def raise_incompatible_dim_exception(): return tuple(ret_shapes) +def infer_type_with_broadcast(typea, typeb, primitive_type): + """ + Given 2 primitive types `typea` and `typeb`, and their promotion `primitive_type`, + return the type after broadcast + """ + + # broadcast + if not types.is_tensor(typea) and not types.is_tensor(typeb): + # both typea and typeb are not tensors + return primitive_type + if types.is_tensor(typea) and not types.is_tensor(typeb): + # a is tensor, b is not + return types.tensor(primitive_type, typea.get_shape()) + if not types.is_tensor(typea) and types.is_tensor(typeb): + # a is not tensor, b is + return types.tensor(primitive_type, typeb.get_shape()) + + # both a, b are tensors + shapea = list(typea.get_shape()) + shapeb = list(typeb.get_shape()) + ret_shape = broadcast_shapes(shapea, shapeb) + return types.tensor(primitive_type, ret_shape) + + def promoted_primitive_type(type1, type2): """ Given a pair of tensor or primitive types, find the smallest type that can store an instance diff --git a/coremltools/converters/mil/mil/ops/defs/iOS15/activation.py b/coremltools/converters/mil/mil/ops/defs/iOS15/activation.py index 0df819f4a..887cd3f14 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS15/activation.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS15/activation.py @@ -246,9 +246,10 @@ class prelu(activation_with_alpha): Parameters ---------- x: tensor<[B, C, 1..3], T> (Required) - * x must have rank 4 or rank 3 or rank 5, i.e. a shape of (B,C,H) or (B,C,H,W) or (B,C,D,H,W) + * ``x`` must have rank 4, rank 3, or rank 5; that is, a shape of + ``(B,C,H)``, ``(B,C,H,W)``, or ``(B,C,D,H,W)``. alpha: const tensor<[C], T>, (Required) - * The length of alpha must match the second dimension of x (channel dimension) + * The length of ``alpha`` must match the second dimension of ``x`` (channel dimension). Returns ------- @@ -262,9 +263,11 @@ class prelu(activation_with_alpha): @precondition(allow=VALUE) def value_inference(self): + # Expends alpha on all dims besides the channel (2nd) dim. 
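# Illustrative sketch (not part of the diff) of the shape-level rule behind
# infer_type_with_broadcast above: scalar/tensor mixes keep the tensor's shape,
# and two tensors use the standard NumPy-style broadcast of their shapes; the
# promoted primitive type is handled separately by promoted_primitive_type.
import numpy as np

def broadcast_result_shape(shape_a, shape_b):
    # None stands in for "not a tensor", i.e. a scalar type.
    if shape_a is None and shape_b is None:
        return None
    if shape_a is None:
        return tuple(shape_b)
    if shape_b is None:
        return tuple(shape_a)
    return np.broadcast_shapes(tuple(shape_a), tuple(shape_b))

assert broadcast_result_shape((2, 1, 4), (3, 1)) == (2, 3, 4)
assert broadcast_result_shape(None, (5,)) == (5,)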
alpha_br = self.alpha.val - for i in range(1, len(self.x.shape)): - alpha_br = np.expand_dims(alpha_br, i) + for i in range(len(self.x.shape)): + if i != 1: + alpha_br = np.expand_dims(alpha_br, i) x_pos = np.maximum(self.x.val, 0) b = np.minimum(self.x.val, 0) return x_pos + b * alpha_br @@ -280,8 +283,8 @@ def type_inference(self): raise ValueError("alpha should be rank 1") if self.x.shape[1] != self.alpha.val.shape[0]: raise ValueError( - "Size of dimension 1 of alpha should be the same as " - + "the size of dimension 1 of x." + f"Size of dimension 0 of alpha ({self.alpha.val.shape[0]}) should be " + f"the same as the size of dimension 1 of x ({self.x.shape[1]})." ) if self.x.rank in (3, 5): # check whether all alpha values are the same or not @@ -493,9 +496,11 @@ class softplus_parametric(activation_with_alpha_and_beta): def value_inference(self): alpha_br = np.copy(self.alpha.val) beta_br = np.copy(self.beta.val) - for i in range(1, len(self.x.val.shape)): - alpha_br = np.expand_dims(alpha_br, i) - beta_br = np.expand_dims(beta_br, i) + # Expends alpha and beta on all dims besides the channel (2nd) dim. + for i in range(len(self.x.val.shape)): + if i != 1: + alpha_br = np.expand_dims(alpha_br, i) + beta_br = np.expand_dims(beta_br, i) return alpha_br * np.log(1 + np.exp(self.x.val * beta_br)) def type_inference(self): diff --git a/coremltools/converters/mil/mil/ops/defs/iOS15/control_flow.py b/coremltools/converters/mil/mil/ops/defs/iOS15/control_flow.py index 621ddf059..8b31ecaab 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS15/control_flow.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS15/control_flow.py @@ -8,23 +8,37 @@ import numpy as np from coremltools import _logger as logger -from coremltools.converters.mil.mil import (Block, get_existing_symbol, - get_new_symbol, types) -from coremltools.converters.mil.mil.input_type import (DefaultInputs, - InputSpec, - InternalInputType, - ListInputType, - PyFunctionInputType, - TensorInputType, - TupleInputType) -from coremltools.converters.mil.mil.operation import (NONE, SYMBOL, VALUE, - Operation, mil_list, - precondition) +from coremltools.converters.mil.mil import Block, get_existing_symbol, get_new_symbol, types +from coremltools.converters.mil.mil.input_type import ( + DefaultInputs, + InputSpec, + InternalInputType, + ListInputType, + PyFunctionInputType, + TensorInputType, + TupleInputType, +) +from coremltools.converters.mil.mil.operation import ( + NONE, + SYMBOL, + VALUE, + Operation, + mil_list, + precondition, +) from coremltools.converters.mil.mil.ops.defs._op_reqs import register_op +from coremltools.converters.mil.mil.ops.defs._utils import ( + infer_type_with_broadcast, + promoted_primitive_type, +) from coremltools.converters.mil.mil.types import is_compatible_type +from coremltools.converters.mil.mil.types.type_list import list as types_list from coremltools.converters.mil.mil.types.type_mapping import ( - builtin_to_string, is_subtype, numpy_type_to_builtin_type, - numpy_val_to_builtin_val) + builtin_to_string, + is_subtype, + numpy_type_to_builtin_type, + numpy_val_to_builtin_val, +) @register_op @@ -192,8 +206,6 @@ def _get_type_val(self, value): # mil_list is a special case that we want to preserve the int64 element type if isinstance(list_value[0], np.int64): builtin_elem_type = types.int64 - from coremltools.converters.mil.mil.types.type_list import \ - list as types_list builtin_type = types_list(builtin_elem_type, init_length=len(list_value), dynamic_length=False) return builtin_type, value @@ 
-276,23 +288,21 @@ class select(Operation): } def type_inference(self): - a_type = self.a.sym_type - b_type = self.b.sym_type - if all([a_type, b_type]): - compatible, ret_type = types.is_tensor_and_is_compatible_general_shape( - a_type, b_type - ) - if compatible: - return ret_type - elif a_type == b_type: - return a_type - else: - raise ValueError("Type mismatch {} vs. {}".format(a_type, b_type)) - return a_type if a_type is not None else b_type + typea = self.a.sym_type + typeb = self.b.sym_type + primitive_type = promoted_primitive_type(typea, typeb) + if primitive_type is None: + raise ValueError("Incompatible primitive types in broadcast operation") + + return infer_type_with_broadcast(typea, typeb, primitive_type) @precondition(allow=VALUE) def value_inference(self): - return np.where(self.cond.val, self.a.val, self.b.val) + res = np.where(self.cond.val, self.a.val, self.b.val) + sym_type = self.type_inference() + if types.is_scalar(sym_type) and not np.isscalar(res): + res = getattr(np, str(res.dtype))(res.item()) + return res @register_op @@ -528,7 +538,7 @@ class make_list(Operation): ``init_length`` is the fixed length of the list throughout runtime. dynamic_length: (Optional, Default is True) - + elem_shape: Tuple[const] (Required) * 1-D vector denoting the shape of elements. * If ``T = int32``, the element shape is known at compile time. @@ -544,7 +554,7 @@ class make_list(Operation): Returns ------- List[*] - + Attributes ---------- T: i32, string diff --git a/coremltools/converters/mil/mil/ops/defs/iOS15/elementwise_binary.py b/coremltools/converters/mil/mil/ops/defs/iOS15/elementwise_binary.py index 1f89facbc..ac02e6ed4 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS15/elementwise_binary.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS15/elementwise_binary.py @@ -6,13 +6,19 @@ import numpy as np -from coremltools.converters.mil.mil import (InputSpec, Operation, - TensorInputType, precondition, - types) +from coremltools.converters.mil.mil import ( + InputSpec, + Operation, + TensorInputType, + precondition, + types, +) from coremltools.converters.mil.mil.operation import VALUE from coremltools.converters.mil.mil.ops.defs._op_reqs import register_op from coremltools.converters.mil.mil.ops.defs._utils import ( - broadcast_shapes, promoted_primitive_type) + infer_type_with_broadcast, + promoted_primitive_type, +) class elementwise_binary(Operation): @@ -36,22 +42,7 @@ def type_inference(self): raise ValueError("Incompatible primitive types in broadcast operation") primitive_type = self.get_dtype(primitive_type) - # broadcast - if not types.is_tensor(typea) and not types.is_tensor(typeb): - # both typea and typeb are not tensors - return primitive_type - if types.is_tensor(typea) and not types.is_tensor(typeb): - # a is tensor, b is not - return types.tensor(primitive_type, typea.get_shape()) - if not types.is_tensor(typea) and types.is_tensor(typeb): - # a is not tensor, b is - return types.tensor(primitive_type, typeb.get_shape()) - - # both a, b are tensors - shapea = list(typea.get_shape()) - shapeb = list(typeb.get_shape()) - ret_shape = broadcast_shapes(shapea, shapeb) - return types.tensor(primitive_type, ret_shape) + return infer_type_with_broadcast(typea, typeb, primitive_type) @precondition(allow=VALUE) def value_inference(self): diff --git a/coremltools/converters/mil/mil/ops/defs/iOS15/elementwise_unary.py b/coremltools/converters/mil/mil/ops/defs/iOS15/elementwise_unary.py index 1ef87516e..4ed065631 100644 --- 
a/coremltools/converters/mil/mil/ops/defs/iOS15/elementwise_unary.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS15/elementwise_unary.py @@ -818,11 +818,14 @@ class cast(Operation): """ Cast the input ``x`` to the new type ``dtype``. + Notice that the underlying Core MIL op doesn't support int64 and fp64. We support them in PyMIL + by mapping int64 to int32, and mapping fp64 to fp32. + Parameters ---------- x: tensor<[\*d], T> (Required) dtype: const str (Required) - * Can be one of the following types: ``int32``, ``int64``, ``fp32``, ``fp64``. + * Can be one of the following types: ``int32``, ``int64``, ``fp32``, ``fp64``, ``bool``. Returns ------- @@ -843,56 +846,65 @@ class cast(Operation): "T": (types.fp16, types.fp32, types.fp64, types.int32, types.int64, types.bool), } + str_to_types_map = { + "int32": types.int32, + "int64": types.int32, + "fp16": types.fp16, + "fp32": types.fp32, + "fp64": types.fp32, + "bool": types.bool, + } + + str_to_numpy_type_map = { + "int32": np.int32, + "int64": np.int32, + "fp16": np.float16, + "fp32": np.float32, + "fp64": np.float32, + "bool": bool, + } + def type_inference(self): - type_map = { - "int32": types.int32, - "int64": types.int32, - "fp16": types.fp16, - "fp32": types.fp32, - "fp64": types.fp32, - "bool": types.bool, - } - - if self.dtype.val not in type_map.keys(): + if self.dtype.val not in self.str_to_types_map.keys(): raise NotImplementedError( "Parameter dtype of the cast operation can be one of the {}. " - "Provided {}".format(type_map.keys(), self.dtype.val) + "Provided {}".format(self.str_to_types_map.keys(), self.dtype.val) ) if not types.is_tensor(self.x.sym_type): - return type_map[self.dtype.val] + return self.str_to_types_map[self.dtype.val] ret_shape = self.x.shape - return types.tensor(type_map[self.dtype.val], ret_shape) + return types.tensor(self.str_to_types_map[self.dtype.val], ret_shape) @precondition(allow=VALUE | SYMBOL) def value_inference(self): return self.get_cast_value(self.x, self.dtype.val) - @staticmethod - def get_cast_value(input_var, dtype_val): - type_map = { - "int32": np.int32, - "int64": np.int32, - "fp16": np.float16, - "fp32": np.float32, - "fp64": np.float32, - "bool": bool, - } - - if dtype_val not in type_map.keys(): + @classmethod + def get_cast_value(cls, input_var, dtype_val): + if dtype_val not in cls.str_to_numpy_type_map.keys(): raise NotImplementedError( "Parameter dtype of the cast operation can be one of the {}. 
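# Illustrative numpy-level sketch (not part of the diff) of the dtype mapping
# documented above: requests for int64/fp64 are narrowed to int32/fp32 because
# the MIL runtime has no 64-bit tensor types.
import numpy as np

_STR_TO_NUMPY = {
    "int32": np.int32, "int64": np.int32,
    "fp16": np.float16, "fp32": np.float32, "fp64": np.float32,
    "bool": bool,
}

def cast_like_mil(values, dtype_str):
    # Mirrors what cast.get_cast_value does for a plain materialized value.
    return np.asarray(values).astype(_STR_TO_NUMPY[dtype_str])

assert cast_like_mil([1, 2, 3], "int64").dtype == np.int32
assert cast_like_mil([1.5, 2.5], "fp64").dtype == np.float32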
" - "Provided {}".format(type_map.keys(), dtype_val) + "Provided {}".format(cls.str_to_numpy_type_map.keys(), dtype_val) ) if input_var.val is None: - if input_var.sym_val is not None and not is_symbolic(input_var.sym_val) and len(input_var.sym_val.shape) == 1: - result = [np.array(val).astype(dtype=type_map[dtype_val]).item() if not is_symbolic(val) else val for val in input_var.sym_val] + if ( + input_var.sym_val is not None + and not is_symbolic(input_var.sym_val) + and len(input_var.sym_val.shape) == 1 + ): + result = [ + np.array(val).astype(dtype=cls.str_to_numpy_type_map[dtype_val]).item() + if not is_symbolic(val) + else val + for val in input_var.sym_val + ] return np.array(result) return None if not types.is_tensor(input_var.sym_type): - return input_var.val.astype(dtype=type_map[dtype_val]) + return input_var.val.astype(dtype=cls.str_to_numpy_type_map[dtype_val]) else: - return np.array(input_var.val).astype(dtype=type_map[dtype_val]) + return np.array(input_var.val).astype(dtype=cls.str_to_numpy_type_map[dtype_val]) diff --git a/coremltools/converters/mil/mil/ops/defs/iOS15/image_resizing.py b/coremltools/converters/mil/mil/ops/defs/iOS15/image_resizing.py index 3186ead7e..3e5c4431e 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS15/image_resizing.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS15/image_resizing.py @@ -56,7 +56,7 @@ class upsample_nearest_neighbor(Operation): type_domain="U" ), ) - + type_domains = { "T": (types.fp16, types.fp32), "U": (types.fp32, types.int32), @@ -122,7 +122,7 @@ class resize_nearest_neighbor(Operation): target_size_height=TensorInputType(const=True, type_domain=types.int32), target_size_width=TensorInputType(const=True, type_domain=types.int32), ) - + type_domains = { "T": (types.fp16, types.fp32), } @@ -144,10 +144,10 @@ class upsample_bilinear(Operation): """ Upsample the spatial dimensions (last two dimensions) of the input by scale factors using bilinear interpolation. - The upsample_bilinear operation in MIL corresponds to the recompute_scale_factor=True + The upsample_bilinear operation in MIL corresponds to the ``recompute_scale_factor=True`` mode in the pyorch bilinear interpolation op. That is, the scale factor is recomputed by the output size. - Note that when the scale_factor_height and scale_factor_width are floating point, this + Note that when the ``scale_factor_height`` and ``scale_factor_width`` are floating point, this could result in a different scale factor due to rounding. Parameters @@ -236,7 +236,7 @@ class upsample_bilinear(Operation): optional=True, type_domain=types.bool), ) - + type_domains = { "T": (types.fp16, types.fp32), "U": (types.int32, types.fp32), @@ -375,7 +375,7 @@ class resize_bilinear(Operation): type_domain=types.str ), ) - + type_domains = { "T": (types.fp16, types.fp32), } @@ -445,10 +445,10 @@ class crop_resize(Operation): * Target width for resizing each patch. normalized_coordinates : const (Optional, default=False) - * If true, the bounding box coordinates must be in the + * If ``True``, the bounding box coordinates must be in the interval ``[0, 1]``. Scaling is based on the input spatial dimensions: ``(H_in - 1)`` for height and ``(W_in - 1)`` for width. - * If false, the bounding box coordinates must be in the interval + * If ``False``, the bounding box coordinates must be in the interval ``[0, H_in - 1]`` for height dimensions and ``[0, W_in - 1]`` for width dimensions. 
@@ -510,11 +510,25 @@ class crop_resize(Operation): box_coordinate_mode=TensorInputType(const=True, optional=True, type_domain=types.str), sampling_mode=TensorInputType(const=True, optional=True, type_domain=types.str), ) - + type_domains = { "T": (types.fp16, types.fp32), } + _VALID_SAMPLING_MODES = { + "STRICT_ALIGN_CORNERS", + "ALIGN_CORNERS", + "UNALIGN_CORNERS", + "DEFAULT", + "OFFSET_CORNERS", + } + _VALID_BOX_COORDINATE_MODES = { + "CORNERS_HEIGHT_FIRST", + "CORNERS_WIDTH_FIRST", + "CENTER_SIZE_HEIGHT_FIRST", + "CENTER_SIZE_WIDTH_FIRST", + } + def default_inputs(self): return DefaultInputs( target_height=1, @@ -525,34 +539,26 @@ def default_inputs(self): sampling_mode="DEFAULT", ) - def type_inference(self): + def _validate_input(self): if self.x.rank != 4: raise ValueError( - 'input to the "crop_resize" op must be of rank 4. Provided {}'.format( - self.x.rank - ) + f'input to the "crop_resize" op must be of rank 4. Provided {self.x.rank}' ) - if self.roi.rank != 5: raise ValueError( - 'ROI input to the "crop_resize" op must be of rank 5, provided {}'.format( - self.roi.rank - ) + f'ROI input to the "crop_resize" op must be of rank 5, provided {self.roi.rank}' ) - - if self.sampling_mode.val not in { - "STRICT_ALIGN_CORNERS", - "ALIGN_CORNERS", - "UNALIGN_CORNERS", - "DEFAULT", - "OFFSET_CORNERS", - }: + if self.box_coordinate_mode.val not in self._VALID_BOX_COORDINATE_MODES: raise ValueError( - '"crop_resize" op: unrecognized sampling mode "{}"'.format( - self.sampling_mode - ) + f'"crop_resize" op: unrecognized box_coordinate_mode "{self.box_coordinate_mode.val}"' + ) + if self.sampling_mode.val not in self._VALID_SAMPLING_MODES: + raise ValueError( + f'"crop_resize" op: unrecognized sampling mode "{self.sampling_mode.val}"' ) + def type_inference(self): + self._validate_input() # ret_shape: [N] + [B, C, h_out, w_out] N, B, C = self.roi.shape[0], self.x.shape[0], self.x.shape[1] ret_shape = [N, B, C, self.target_height.val, self.target_width.val] @@ -592,7 +598,7 @@ class crop(Operation): crop_height=TensorInputType(const=True, type_domain=types.int32), crop_width=TensorInputType(const=True, type_domain=types.int32), ) - + type_domains = { "T": (types.fp16, types.fp32), } @@ -709,7 +715,7 @@ class affine(Operation): coordinates_mode=TensorInputType(const=True, type_domain=types.str), align_corners=TensorInputType(const=True, type_domain=types.bool), ) - + type_domains = { "T": (types.fp16, types.fp32), } diff --git a/coremltools/converters/mil/mil/ops/defs/iOS15/recurrent.py b/coremltools/converters/mil/mil/ops/defs/iOS15/recurrent.py index b6d5ee4ed..b3b5d25a6 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS15/recurrent.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS15/recurrent.py @@ -3,12 +3,8 @@ # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -from coremltools.converters.mil.mil import Operation, types -from coremltools.converters.mil.mil.input_type import ( - DefaultInputs, - InputSpec, - TensorInputType -) +from coremltools.converters.mil.mil import Operation, Var, types +from coremltools.converters.mil.mil.input_type import DefaultInputs, InputSpec, TensorInputType from coremltools.converters.mil.mil.ops.defs._op_reqs import register_op @@ -291,12 +287,15 @@ class lstm(Operation): recurrent_activation: const (Optional) [Default=sigmoid] * Activation applied on input, forget, and output gates. 
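# Illustrative sketch (not part of the diff) of the iOS15 crop_resize shape rule
# used in type_inference above: x has rank 4 (B, C, H, W), roi has rank 5 with
# leading dim N, and the output keeps both dims:
# (N, B, C, target_height, target_width).
def crop_resize_ios15_out_shape(x_shape, n_rois, target_height, target_width):
    B, C = x_shape[0], x_shape[1]
    return (n_rois, B, C, target_height, target_width)

assert crop_resize_ios15_out_shape((2, 3, 32, 32), 5, 7, 7) == (5, 2, 3, 7, 7)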
+ * Supported values: ``hard_sigmoid``, ``linear``, ``relu``, ``scaled_tanh``, ``sigmoid``, ``tanh`` cell_activation: const (Optional) [Default=tanh] * Activation applied on cell gate. + * Supported values: ``hard_sigmoid``, ``linear``, ``relu``, ``scaled_tanh``, ``sigmoid``, ``tanh`` activation: const (Optional) [Default=tanh] * Activation applied on output gate. + * Supported values: ``hard_sigmoid``, ``linear``, ``relu``, ``scaled_tanh``, ``sigmoid``, ``tanh`` clip: const (optional) [Default=None] * Cell gate is clipped to ``[-clip, +clip]``. @@ -353,57 +352,67 @@ def default_inputs(self): clip=None) def type_inference(self): - if self.x.rank != 3: - raise ValueError( - "Invalid input shape. Expecting Rank 3 input, got {}".format( - len(self.x.rank) - ) - ) + self._validate_inputs() + sequence_length, batch_size, input_size = self.x.shape + hidden_dim, hidden_size = self.weight_hh.shape + dim_factor = 8 if self.direction.val == "bidirectional" else 4 + out_seq_len = sequence_length if self.output_sequence.val else 1 + num_directions = dim_factor // 4 + output_shape = [out_seq_len, batch_size, num_directions * hidden_size] + output_h_shape = [batch_size, num_directions * hidden_size] + output_c_shape = [batch_size, num_directions * hidden_size] + return ( + types.tensor(self.x.dtype, tuple(output_shape)), + types.tensor(self.x.dtype, tuple(output_h_shape)), + types.tensor(self.x.dtype, tuple(output_c_shape)), + ) - def weight_shape_check(wt_ih, wt_hh): - if wt_ih.rank != 2 or wt_hh.rank != 2: - raise ValueError( - "Expecting Rank 2 input, got weight_ih rank: {}, weight_hh rank: {}".format( - wt_ih.rank, wt_hh.rank - ) - ) + def _validate_inputs(self): + _ALLOWED_DIRECTIONS = {"forward", "reverse", "bidirectional"} + _ALLOWED_ACTIVATIONS = {"tanh", "scaled_tanh", "sigmoid", "hard_sigmoid", "relu", "linear"} - hidden_size = wt_hh.shape[1] - if wt_hh.shape[0] // hidden_size != 4 or wt_ih.shape[0] // hidden_size != 4: + def check_activation(activation: str): + if activation.lower() not in _ALLOWED_ACTIVATIONS: raise ValueError( - "Incorrect weight matrix: hidden dim size mismatch. \ - Provided weight_ih {}, weight_hh {}. Expecting <4*H, H>".format( - wt_ih.shape, wt_hh.shape - ) + f"Activation `{activation}` not supported. Supported activations: {_ALLOWED_ACTIVATIONS}" ) + if self.x.rank != 3: + raise ValueError(f"Invalid input shape. Expecting Rank 3 input, got {len(self.x.rank)}") + direction = self.direction.val - valid_directions = {"forward", "reverse", "bidirectional"} - if direction not in valid_directions: + if direction not in _ALLOWED_DIRECTIONS: raise ValueError( - "Direction {} not supported. Supported directions: {}".format( - direction, valid_directions - ) + f"Direction {direction} not supported. Supported directions: {_ALLOWED_DIRECTIONS}" ) - weight_shape_check(self.weight_ih, self.weight_hh) + self._weight_shape_check(self.weight_ih, self.weight_hh) if direction == "bidirectional": - weight_shape_check(self.weight_ih_back, self.weight_hh_back) + if self.weight_ih_back is None or self.weight_hh_back is None: + raise ValueError( + "For bidirectional LSTM, the `weight_ih_back` and `weight_hh_back`" + " must be provided." 
+ ) + self._weight_shape_check(self.weight_ih_back, self.weight_hh_back) - hidden_dim, hidden_size = self.weight_hh.shape + check_activation(self.recurrent_activation.val) + check_activation(self.cell_activation.val) + check_activation(self.activation.val) - dim_factor = 8 if direction == "bidirectional" else 4 - out_seq_len = sequence_length if self.output_sequence.val else 1 - num_directions = dim_factor // 4 - output_shape = [out_seq_len, batch_size, num_directions * hidden_size] - output_h_shape = [batch_size, num_directions * hidden_size] - output_c_shape = [batch_size, num_directions * hidden_size] - return ( - types.tensor(self.x.dtype, tuple(output_shape)), - types.tensor(self.x.dtype, tuple(output_h_shape)), - types.tensor(self.x.dtype, tuple(output_c_shape)), - ) + @staticmethod + def _weight_shape_check(wt_ih: Var, wt_hh: Var): + if wt_ih.rank != 2 or wt_hh.rank != 2: + raise ValueError( + f"Expecting Rank 2 input, got weight_ih rank: {wt_ih.rank}, " + f"weight_hh rank: {wt_hh.rank}" + ) + hidden_size = wt_hh.shape[1] + if wt_hh.shape[0] // hidden_size != 4 or wt_ih.shape[0] // hidden_size != 4: + raise ValueError( + f"Incorrect weight matrix: hidden dim size mismatch. Provided " + f"weight_ih {wt_ih.shape}, weight_hh {wt_hh.shape}. Expecting <4*H, H>" + ) @register_op diff --git a/coremltools/converters/mil/mil/ops/defs/iOS15/reduction.py b/coremltools/converters/mil/mil/ops/defs/iOS15/reduction.py index ce9343037..7748904f5 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS15/reduction.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS15/reduction.py @@ -21,7 +21,7 @@ class ReductionAxes(Operation): axes=TensorInputType(const=True, optional=True, type_domain=types.int32), keep_dims=TensorInputType(const=True, optional=True, type_domain=types.bool), ) - + type_domains = { "T": (types.fp16, types.fp32, types.int32), } @@ -69,7 +69,7 @@ class ReductionAxis(Operation): axis=TensorInputType(const=True, optional=True, type_domain=types.int32), keep_dims=TensorInputType(const=True, optional=True, type_domain=types.bool), ) - + type_domains = { "T": (types.fp16, types.fp32, types.int32), } @@ -114,16 +114,7 @@ def __init__(self, **kwargs): super().__init__(**kwargs) def type_inference(self): - x_shape = self.x.shape - axis = self.axis.val - - reduced_shape = list(x_shape) - axis = axis if axis >= 0 else axis + len(reduced_shape) - if self.keep_dims.val: - reduced_shape[axis] = 1 - else: - reduced_shape.pop(axis) - + reduced_shape = self._find_reduced_shape() return types.tensor(types.int32, tuple(reduced_shape)) @@ -206,32 +197,32 @@ def get_operator(self): class reduce_l1_norm(ReductionAxes): """ Computes the L1 normalization of elements across given dimensions of the input tensor. - + Parameters ---------- x: <\*,T> (Required) * Must be 1-dimensional or higher. - + axes: const (Optional, default="None", reduce on all axes.) * The dimensions to reduce. - + keep_dims: const (Optional, default=False) * If ``False``, the rank is reduced by ``1`` for each entry in ``axes``, otherwise retain reduced axes with length ``1``. - + Returns ------- <\*,T> * Scalar or tensor: The reduced tensor. - + Attributes ---------- T: i32, fp16, fp32 - + References ---------- See `reduce_mean `_. - + """ def get_operator(self): @@ -245,24 +236,24 @@ def l1_norm(x, axis=None, keepdims=False): class reduce_l2_norm(ReductionAxes): """ Computes the L2 normalization of elements across given dimensions of the input tensor. - + Parameters ---------- x: <\*,T> (Required) * Must be 1-dimensional or higher. 
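# Illustrative sketch (not part of the diff) of the LSTM output-shape computation
# used in type_inference above: a bidirectional direction doubles the effective
# hidden size, and output_sequence controls whether the whole sequence or only
# the last step is emitted.
def lstm_output_shapes(seq_len, batch, hidden_size, direction, output_sequence):
    num_directions = 2 if direction == "bidirectional" else 1
    out_seq_len = seq_len if output_sequence else 1
    output = (out_seq_len, batch, num_directions * hidden_size)
    output_h = (batch, num_directions * hidden_size)
    output_c = (batch, num_directions * hidden_size)
    return output, output_h, output_c

assert lstm_output_shapes(10, 2, 64, "bidirectional", True)[0] == (10, 2, 128)
assert lstm_output_shapes(10, 2, 64, "forward", False)[0] == (1, 2, 64)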
- + axes: const (Optional, default="None", reduce on all axes.) * The dimensions to reduce. - + keep_dims: const (Optional, default=False) * If ``False``, the rank is reduced by ``1`` for each entry in ``axes``, otherwise retain reduced axes with length ``1``. - + Returns ------- <\*,T> * Scalar or tensor: The reduced tensor. - + Attributes ---------- T: i32, fp16, fp32 @@ -280,24 +271,24 @@ class reduce_log_sum(ReductionAxes): """ Computes the natural logarithm of the sum of all the elements across given dimensions of the input tensor. - + Parameters ---------- x: <\*,T> (Required) * Must be 1-dimensional or higher. - + axes: const (Optional, default="None", reduce on all axes.) * The dimensions to reduce. - + keep_dims: const (Optional, default=False) * If ``False``, the rank is reduced by ``1`` for each entry in ``axes``, otherwise retain reduced axes with length ``1``. - + Returns ------- <\*,T> * Scalar or tensor: The reduced tensor. - + Attributes ---------- T: i32, fp16, fp32 @@ -318,32 +309,32 @@ class reduce_log_sum_exp(ReductionAxes): function, more numerically stable than ``log(sum(exp(input)))``. It avoids overflows caused by taking the ``exp`` of large inputs and underflows caused by taking the ``log`` of small inputs. - + Parameters ---------- x: <\*,T> (Required) * Must be 1-dimensional or higher. - + axes: const (Optional, default="None", reduce on all axes.) * The dimensions to reduce. - + keep_dims: const (Optional, default=False) * If ``False``, the rank is reduced by ``1`` for each entry in ``axes``, otherwise retain reduced axes with length ``1``. - + Returns ------- <\*,T> * Scalar or tensor: The reduced tensor. - + Attributes ---------- T: i32, fp16, fp32 - + References ---------- See `tf.math.reduce_logsumexp `_. - + """ def get_operator(self): @@ -365,29 +356,29 @@ def operator(a, axis=None, keepdims=False): class reduce_max(ReductionAxes): """ Computes the maximum of elements across given dimensions of the input tensor. - + Parameters ---------- x: <\*,T> (Required) * Must be 1-dimensional or higher. - + axes: const (Optional, default="None", reduce on all axes.) * The dimensions to reduce. - + keep_dims: const (Optional, default=False) * If ``False``, the rank is reduced by ``1`` for each entry in ``axes``, otherwise retain reduced axes with length ``1``. - + Returns ------- <\*,T> * Scalar or tensor: The reduced tensor. - + Attributes ---------- T: i32, fp16, fp32 """ - + def __init__(self, **kwargs): super().__init__(**kwargs) @@ -399,28 +390,28 @@ def get_operator(self): class reduce_mean(ReductionAxes): """ Computes the mean of elements across given dimensions of the input tensor. - + Parameters ---------- x: <\*,T> (Required) * Must be 1-dimensional or higher. - + axes: const (Optional, default="None", reduce on all axes.) * The dimensions to reduce. - + keep_dims: const (Optional, default=False) * If ``False``, the rank is reduced by ``1`` for each entry in ``axes``, otherwise retain reduced axes with length ``1``. - + Returns ------- <\*,T> * Scalar or tensor: The reduced tensor. - + Attributes ---------- T: i32, fp16, fp32 - + References ---------- For an example, see `tf.math.reduce_mean `_. @@ -434,24 +425,24 @@ def get_operator(self): class reduce_min(ReductionAxes): """ Computes the minimum of elements across given dimensions of the input tensor. - + Parameters ---------- x: <\*,T> (Required) * Must be 1-dimensional or higher. - + axes: const (Optional, default="None", reduce on all axes.) * The dimensions to reduce. 
- + keep_dims: const (Optional, default=False) * If ``False``, the rank is reduced by ``1`` for each entry in ``axes``, otherwise retain reduced axes with length ``1``. - + Returns ------- <\*,T> * Scalar or tensor: The reduced tensor. - + Attributes ---------- T: i32, fp16, fp32 diff --git a/coremltools/converters/mil/mil/ops/defs/iOS15/tensor_operation.py b/coremltools/converters/mil/mil/ops/defs/iOS15/tensor_operation.py index 89873486e..d80022226 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS15/tensor_operation.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS15/tensor_operation.py @@ -1284,6 +1284,9 @@ def type_inference(self): axis = self.axis.val if axis < 0: axis += (self.values[0].rank + 1) + rank = self.values[0].rank + if axis > rank: + raise ValueError(f"axis must in range [{-rank}, {rank}). Got {axis}") ret_shape = list(t_shape) ret_shape.insert(axis, num_tensors) return types.tensor(self.values[0].dtype, ret_shape) diff --git a/coremltools/converters/mil/mil/ops/defs/iOS15/tensor_transformation.py b/coremltools/converters/mil/mil/ops/defs/iOS15/tensor_transformation.py index fe2480f44..a7924d100 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS15/tensor_transformation.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS15/tensor_transformation.py @@ -3,24 +3,29 @@ # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause +from typing import List + import numpy as np import sympy as sm from coremltools import _logger as logger -from coremltools.converters.mil.mil import (Operation, get_new_symbol, - get_new_variadic_symbol, - precondition, types) -from coremltools.converters.mil.mil.input_type import (DefaultInputs, - InputSpec, - TensorInputType) +from coremltools.converters.mil.mil import ( + Operation, + get_new_symbol, + get_new_variadic_symbol, + precondition, + types, +) +from coremltools.converters.mil.mil.input_type import DefaultInputs, InputSpec, TensorInputType from coremltools.converters.mil.mil.operation import SYMBOL, VALUE from coremltools.converters.mil.mil.ops.defs._op_reqs import register_op -from coremltools.converters.mil.mil.ops.defs._utils import \ - solve_slice_by_index_shape -from coremltools.converters.mil.mil.types.symbolic import (any_symbolic, - any_variadic, - is_symbolic, - isscalar) +from coremltools.converters.mil.mil.ops.defs._utils import solve_slice_by_index_shape +from coremltools.converters.mil.mil.types.symbolic import ( + any_symbolic, + any_variadic, + is_symbolic, + isscalar, +) @register_op @@ -214,9 +219,61 @@ def value_inference(self): return val def _get_type_val(self): - x_type = self.x.dtype - x_shape = self.x.shape - x_vol = np.prod(x_shape) + count_neg_one = np.count_nonzero(self.shape.sym_val == -1) + if count_neg_one > 1: + raise ValueError( + f"Reshape op supports only one dimension to be -1, " + f"but got {count_neg_one} dimensions be -1." 
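# Illustrative sketch (not part of the diff) of the static reshape rules
# introduced in this change: a 0 in the target shape copies the corresponding
# source dim (ranks must then match), and a single -1 is inferred from the
# remaining volume.
def resolve_target_shape(from_shape, to_shape):
    if 0 in to_shape:
        assert len(from_shape) == len(to_shape), "rank must match when 0 is used"
        to_shape = [s if s != 0 else from_shape[i] for i, s in enumerate(to_shape)]
    else:
        to_shape = list(to_shape)
    if -1 in to_shape:
        idx = to_shape.index(-1)
        known = 1
        for i, d in enumerate(to_shape):
            if i != idx:
                known *= d
        total = 1
        for d in from_shape:
            total *= d
        to_shape[idx] = total // known
    return to_shape

assert resolve_target_shape([2, 3, 4], [0, -1, 2]) == [2, 6, 2]
assert resolve_target_shape([6, 4], [3, -1]) == [3, 8]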
+ ) + + if not any_symbolic(self.x.shape) and self.shape.val is not None: + ret_shape = self._infer_shape_static() + else: + ret_shape = self._infer_shape_dynamic() + + ret_val = None + if self.x.val is not None and all(isscalar(a) and not is_symbolic(a) for a in ret_shape): + ret_val = reshape_with_symbol(self.x.val, ret_shape) + return types.tensor(self.x.dtype, tuple(ret_shape)), ret_val + + @staticmethod + def replace_zeros_in_shape(from_shape: List[int], to_shape: List[int]) -> List[int]: + """Replaces 0s in `to_shape` by the corresponding dims in `from_shape`.""" + if to_shape.count(0): + if len(from_shape) != len(to_shape): + raise ValueError( + f"When there is 0 in shape, the rank of x ({len(from_shape)}) " + f"must equal to the target shape len ({len(to_shape)})." + ) + to_shape = [s if s != 0 else from_shape[dim] for dim, s in enumerate(to_shape)] + return to_shape + + @staticmethod + def replace_neg_one_in_shape(from_shape: List[int], to_shape: List[int]) -> List[int]: + """Replaces -1 in `to_shape` by the corresponding dims in `from_shape`.""" + if to_shape.count(-1): + neg_one_idx = to_shape.index(-1) + total_element_num = np.prod(from_shape) + remain_element_num = np.prod( + [dim for idx, dim in enumerate(to_shape) if idx != neg_one_idx] + ) + infer_dim = total_element_num // remain_element_num + to_shape[neg_one_idx] = infer_dim + return to_shape + + def _infer_shape_static(self): + from_shape = list(self.x.shape) + to_shape = list(self.shape.val) + to_shape = self.replace_zeros_in_shape(from_shape, to_shape) + to_shape = self.replace_neg_one_in_shape(from_shape, to_shape) + if np.prod(from_shape) != np.prod(to_shape): + raise ValueError( + f"Invalid target shape in `reshape` op ({from_shape} to {list(self.shape.val)})." + ) + return to_shape + + def _infer_shape_dynamic(self): + x_vol = np.prod(self.x.shape) # shape is const, and thus sym_val is not None sym_shape = self.shape.sym_val sym_shape = [get_new_symbol() if d == -1 else d for d in sym_shape] @@ -224,10 +281,7 @@ def _get_type_val(self): ret_shape = reshape.enforce_volumetric_constraint(x_vol, sym_shape) except: ret_shape = sym_shape - ret_val = None - if self.x.val is not None and all(isscalar(a) and not is_symbolic(a) for a in ret_shape): - ret_val = reshape_with_symbol(self.x.val, ret_shape) - return types.tensor(x_type, tuple(ret_shape)), ret_val + return ret_shape @staticmethod def enforce_volumetric_constraint(left_volume, inshape): @@ -240,13 +294,6 @@ def enforce_volumetric_constraint(left_volume, inshape): # Handling when reshape is given 0 instead of actual input # input tensor shape: [4, 3, 2], reshape:[0, -1], output tensor shape: [4, 6] - if shape.count(-1) > 1: - raise ValueError( - "Reshape op supports only one dimension to be -1. 
Given {}".format( - shape.count(-1) - ) - ) - infer_dim_index = shape.index(-1) if -1 in shape else None right_volume = 1 for i in shape: diff --git a/coremltools/converters/mil/mil/ops/defs/iOS16/__init__.py b/coremltools/converters/mil/mil/ops/defs/iOS16/__init__.py index e83fcb3b0..542cfa144 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS16/__init__.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS16/__init__.py @@ -2,6 +2,7 @@ # # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + from coremltools.converters.mil._deployment_compatibility import \ AvailableTarget as target diff --git a/coremltools/converters/mil/mil/ops/defs/iOS16/constexpr_ops.py b/coremltools/converters/mil/mil/ops/defs/iOS16/constexpr_ops.py index 5306bbf31..e0fa58329 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS16/constexpr_ops.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS16/constexpr_ops.py @@ -1,4 +1,8 @@ -# Copyright (c) 2022, Apple Inc. All rights reserved. +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + import numpy as np from coremltools.converters.mil.mil import types diff --git a/coremltools/converters/mil/mil/ops/defs/iOS16/image_resizing.py b/coremltools/converters/mil/mil/ops/defs/iOS16/image_resizing.py index da1f5dfb2..ac53dff64 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS16/image_resizing.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS16/image_resizing.py @@ -21,10 +21,10 @@ @register_op(opset_version=_IOS16_TARGET) class resample(_resample_iOS15): """ - The iOS 16 version of ``resample`` supports float 16 coordinates. - - For the complete documentation, see the - `iOS 15 version <#module-coremltools.converters.mil.mil.ops.defs.iOS15.image_resizing>`_. + This version of ``resample`` supports float 16 coordinates. + + For complete documentation, see the + iOS 15 :py:class:`~.iOS15.image_resizing.resample`. """ input_spec = InputSpec( x=TensorInputType(type_domain="T"), @@ -47,12 +47,14 @@ def type_inference(self): @register_op(opset_version=_IOS16_TARGET) class upsample_bilinear(_upsample_bilinear_iOS15): """ - iOS16 version of upsample_bilinear supports half_pixel_centers + This version of ``upsample_bilinear`` supports ``half_pixel_centers``. + For complete documentation, see the + iOS 15 :py:class:`~.iOS15.image_resizing.upsample_bilinear`. - Additional Parameters + Parameters ---------- half_pixel_centers: const (Optional) - * Default to !align_corners if not provided + * Defaults to ``!align_corners`` if not provided. """ input_spec = _upsample_bilinear_iOS15.input_spec + InputSpec( @@ -65,14 +67,15 @@ def default_inputs(self): @register_op(opset_version=_IOS16_TARGET) class crop_resize(_crop_resize_iOS15): """ - iOS16 version of crop_resize, which supports ``pad_value`` - - Additional Parameters + This version differs from the iOS 15 :py:class:`~.iOS15.image_resizing.crop_resize` + by supporting ``pad_value`` as an additional parameter. + + Parameters ---------- - pad_value : const (Optional, default=1.0) - * If the box indexes go beyond the input boundary, the input image is padded with pad_value. - * Defaults to 0. - * It is the same as extrapolation_value in tf.image.crop_and_resize. 
+ pad_value : const (Optional, default=0.0) + * If the box indexes go beyond the input boundary, the input image is padded with ``pad_value``. + * Defaults to ``0``. + * It is the same as ``extrapolation_value`` in `tf.image.crop_and_resize `_. Attributes ---------- @@ -83,4 +86,4 @@ class crop_resize(_crop_resize_iOS15): ) def default_inputs(self): - return super().default_inputs() + DefaultInputs(pad_value=1.0) + return super().default_inputs() + DefaultInputs(pad_value=0.0) diff --git a/coremltools/converters/mil/mil/ops/defs/iOS16/scatter_gather.py b/coremltools/converters/mil/mil/ops/defs/iOS16/scatter_gather.py index 82653b0fb..1e3e88c61 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS16/scatter_gather.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS16/scatter_gather.py @@ -17,10 +17,11 @@ @register_op(opset_version=_IOS16_TARGET) class gather(Operation): """ - An iOS16 version of gather + The iOS16 version. + This section documents only the differences between this version and the + iOS 15 :py:class:`~.iOS15.scatter_gather.gather`. - The new gather op supports `batch_dims` - similar to `tf.gather `_. + This version supports ``batch_dims``, similar to `tf.gather `_. Parameters ---------- @@ -30,7 +31,7 @@ class gather(Operation): axis: const i32 (Optional. Default=``0``) * Negative axis is supported. batch_dims: const i32 (Optional. Default=``0``) - * The number of batch dimensions + * The number of batch dimensions. Returns ------- @@ -117,15 +118,18 @@ def type_inference(self): @register_op(opset_version=_IOS16_TARGET) class gather_nd(Operation): """ - An iOS16 version of gather_nd - The new gather_nd op supports `batch_dims` + The iOS16 version. + This section documents only the differences between this version and the + iOS 15 :py:class:`~.iOS15.scatter_gather.gather_nd`. + + This version supports ``batch_dims``. Parameters ---------- x: tensor<\*D, T> (Required) indices: tensor<\*K, i32> (Required) batch_dims: const i32 (Optional. Default=``0``) - * The number of batch dimensions + * The number of batch dimensions. Returns ------- diff --git a/coremltools/converters/mil/mil/ops/defs/iOS16/tensor_transformation.py b/coremltools/converters/mil/mil/ops/defs/iOS16/tensor_transformation.py index 473b7c683..8cec7eea2 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS16/tensor_transformation.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS16/tensor_transformation.py @@ -144,8 +144,8 @@ def type_inference(self): class pixel_unshuffle(Operation): """ Rearrange elements in a tensor from spatial dimensions into depth (channel). - It is basically the inverse operation of `pixel_shuffle <#coremltools.converters.mil.mil.ops.defs.iOS15.tensor_transformation.pixel_shuffle>`_. - Equivalent to PyTorch's ``PixelUnshuffle``. + It is basically the inverse operation of :py:class:`~.iOS15.tensor_transformation.pixel_shuffle`. + Equivalent to `PyTorch PixelUnshuffle `_. Parameters ---------- @@ -158,7 +158,7 @@ class pixel_unshuffle(Operation): Returns ------- tensor<[n, C * f^2, H, W], T> - * Where ``f`` is the downscale factor. + * In which ``f`` is the downscale factor. Attributes ---------- diff --git a/coremltools/converters/mil/mil/ops/defs/iOS17/__init__.py b/coremltools/converters/mil/mil/ops/defs/iOS17/__init__.py new file mode 100644 index 000000000..123b06775 --- /dev/null +++ b/coremltools/converters/mil/mil/ops/defs/iOS17/__init__.py @@ -0,0 +1,34 @@ +# Copyright (c) 2022, Apple Inc. All rights reserved. 
+# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +from coremltools.converters.mil._deployment_compatibility import AvailableTarget as target + +_IOS17_TARGET = target.iOS17 + +from .activation import ( + clamped_relu, + elu, + leaky_relu, + linear_activation, + prelu, + scaled_tanh, + sigmoid_hard, + softplus_parametric, + thresholded_relu, +) +from .elementwise_unary import cast, clip +from .image_resizing import crop_resize +from .quantization_ops import dequantize, quantize +from .reduction import reduce_argmax, reduce_argmin +from .scatter_gather import ( + gather, + gather_along_axis, + gather_nd, + scatter, + scatter_along_axis, + scatter_nd, +) +from .tensor_operation import non_maximum_suppression, topk +from .tensor_transformation import reshape diff --git a/coremltools/converters/mil/mil/ops/defs/iOS17/activation.py b/coremltools/converters/mil/mil/ops/defs/iOS17/activation.py new file mode 100644 index 000000000..92fbbdaa2 --- /dev/null +++ b/coremltools/converters/mil/mil/ops/defs/iOS17/activation.py @@ -0,0 +1,357 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +from coremltools.converters.mil.mil import types +from coremltools.converters.mil.mil.input_type import InputSpec, TensorInputType +from coremltools.converters.mil.mil.ops.defs._op_reqs import register_op +from coremltools.converters.mil.mil.ops.defs.iOS15.activation import ( + clamped_relu as _clamped_relu_iOS15, +) +from coremltools.converters.mil.mil.ops.defs.iOS15.activation import elu as _elu_iOS15 +from coremltools.converters.mil.mil.ops.defs.iOS15.activation import leaky_relu as _leaky_relu_iOS15 +from coremltools.converters.mil.mil.ops.defs.iOS15.activation import ( + linear_activation as _linear_activation_iOS15, +) +from coremltools.converters.mil.mil.ops.defs.iOS15.activation import prelu as _prelu_iOS15 +from coremltools.converters.mil.mil.ops.defs.iOS15.activation import ( + scaled_tanh as _scaled_tanh_iOS15, +) +from coremltools.converters.mil.mil.ops.defs.iOS15.activation import ( + sigmoid_hard as _sigmoid_hard_iOS15, +) +from coremltools.converters.mil.mil.ops.defs.iOS15.activation import ( + softplus_parametric as _softplus_parametric_iOS15, +) +from coremltools.converters.mil.mil.ops.defs.iOS15.activation import ( + thresholded_relu as _thresholded_relu_iOS15, +) +from coremltools.converters.mil.mil.ops.defs.iOS17 import _IOS17_TARGET + + +@register_op(opset_version=_IOS17_TARGET) +class clamped_relu(_clamped_relu_iOS15): + """ + If ``x >= 0`` return elementwise ``min(beta, x)``, otherwise return + ``min(beta, alpha * x)``. + + The major difference between this version and the iOS 15 :py:class:`~.iOS15.activation.clamped_relu` + is that the ``alpha`` and ``beta`` may have a different dtype than the input/output. + + Parameters + ---------- + x: tensor<\*?, T> (Required) + alpha: const U (Required) + beta: const U (Required) + + Returns + ------- + tensor<\*?, T> + * A tensor of the same type and shape as ``x``. 
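# Illustrative reference (not part of the diff) for the clamped_relu definition
# above, with the input in one precision (T=fp16) and the alpha/beta constants
# in another (U=fp32), which is exactly the relaxation this iOS17 variant adds.
import numpy as np

def clamped_relu_ref(x, alpha, beta):
    # min(beta, x) where x >= 0, min(beta, alpha * x) elsewhere.
    return np.minimum(beta, np.where(x >= 0, x, alpha * x))

x = np.array([-2.0, -0.5, 0.0, 3.0], dtype=np.float16)
print(clamped_relu_ref(x, alpha=np.float32(0.1), beta=np.float32(1.0)))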
+ + Attributes + ---------- + T: fp16, fp32 + U: fp16, fp32 + """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + alpha=TensorInputType(const=True, type_domain="U"), + beta=TensorInputType(const=True, type_domain="U"), + ) + + type_domains = { + "T": (types.fp16, types.fp32), + "U": (types.fp16, types.fp32), + } + + +@register_op(opset_version=_IOS17_TARGET) +class elu(_elu_iOS15): + """ + If ``x > 0`` return elementwise ``x``, otherwise return ``alpha * (e^x - 1)``. + + The major difference between this version and the iOS 15 :py:class:`~.iOS15.activation.elu` + is that the ``alpha`` may have a different dtype than the input/output. + + Parameters + ---------- + x: tensor<\*?, T> (Required) + alpha: const U (Required) + + Returns + ------- + tensor<\*?, T> + * A tensor of the same shape and type as ``x``. + + Attributes + ---------- + T: fp16, fp32 + U: fp16, fp32 + """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + alpha=TensorInputType(const=True, type_domain="U"), + ) + + type_domains = { + "T": (types.fp16, types.fp32), + "U": (types.fp16, types.fp32), + } + + +@register_op(opset_version=_IOS17_TARGET) +class leaky_relu(_leaky_relu_iOS15): + """ + If ``x >= 0`` apply ``x`` elementwise, otherwise apply ``alpha * x`` elementwise. + + The major difference between this version and the iOS 15 :py:class:`~.iOS15.activation.leaky_relu` + is that the ``alpha`` may have a different dtype than the input/output. + + Parameters + ---------- + x: <*?, T> (Required) + alpha: const U (Required) + + Returns + ------- + tensor<\*?, T> + * A tensor of the same shape and type as ``x``. + + Attributes + ---------- + T: fp16, fp32 + U: fp16, fp32 + """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + alpha=TensorInputType(const=True, type_domain="U"), + ) + + type_domains = { + "T": (types.fp16, types.fp32), + "U": (types.fp16, types.fp32), + } + + +@register_op(opset_version=_IOS17_TARGET) +class linear_activation(_linear_activation_iOS15): + """ + Apply elementwise ``x * alpha + beta``. + + The major difference between this version and the iOS 15 :py:class:`~.iOS15.activation.linear_activation` + is that the ``alpha`` and ``beta`` may have a different dtype than the input/output. + + Parameters + ---------- + x: tensor<\*?, T> (Required) + alpha: const U (Required) + beta: const U (Required) + + Returns + ------- + tensor<\*?, T> + * A tensor of the same shape and type as ``x``. + + Attributes + ---------- + T: fp16, fp32 + U: fp16, fp32 + """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + alpha=TensorInputType(const=True, type_domain="U"), + beta=TensorInputType(const=True, type_domain="U"), + ) + + type_domains = { + "T": (types.fp16, types.fp32), + "U": (types.fp16, types.fp32), + } + + +@register_op(opset_version=_IOS17_TARGET) +class prelu(_prelu_iOS15): + """ + Where ``i = 1 ... C``, if ``x_i > 0``, return ``x_i`` , otherwise return ``alpha_i * x_i``. + + The major difference between this version and the iOS 15 :py:class:`~.iOS15.activation.prelu` + is that the ``alpha`` may have a different dtype than the input/output. + + Parameters + ---------- + x: tensor<[B, C, 1..3], T> (Required) + * ``x`` must have rank 4, rank 3, or rank 5; that is, a shape of + ``(B,C,H)``, ``(B,C,H,W)``, or ``(B,C,D,H,W)``. + alpha: const tensor<[C], U>, (Required) + * The length of ``alpha`` must match the second dimension of ``x`` (channel dimension). + + Returns + ------- + tensor<[B, C, 1..3], T> + * A tensor of the same shape as ``x``. 
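# Illustrative reference (not part of the diff) for the prelu definition above,
# mirroring the corrected value-inference broadcast: the length-C alpha is
# expanded on every axis except the channel axis (axis 1) before it scales the
# negative part of x.
import numpy as np

def prelu_ref(x, alpha):
    alpha_br = alpha
    for i in range(x.ndim):
        if i != 1:
            alpha_br = np.expand_dims(alpha_br, i)
    return np.maximum(x, 0) + np.minimum(x, 0) * alpha_br

x = np.random.randn(2, 3, 4, 5).astype(np.float16)   # (B, C, H, W)
alpha = np.array([0.1, 0.2, 0.3], dtype=np.float32)  # per-channel, length C
assert prelu_ref(x, alpha).shape == x.shape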
+ + Attributes + ---------- + T: fp32, fp16 + U: fp16, fp32 + """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + alpha=TensorInputType(const=True, type_domain="U"), + ) + + type_domains = { + "T": (types.fp16, types.fp32), + "U": (types.fp16, types.fp32), + } + + +@register_op(opset_version=_IOS17_TARGET) +class scaled_tanh(_scaled_tanh_iOS15): + """ + Return ``alpha * tanh(beta * x)`` elementwise. + + The major difference between this version and the iOS 15 :py:class:`~.iOS15.activation.scaled_tanh` + is that the ``alpha`` and ``beta`` may have a different dtype than the input/output. + + Parameters + ---------- + x: tensor<\*?, T> (Required) + * Input range is ``(-inf, inf)``. + alpha: const U (Required) + beta: const U (Required) + + Returns + ------- + tensor<\*?, T> + * A tensor of the same shape and type as ``x``. + + Attributes + ---------- + T: fp16, fp32 + U: fp16, fp32 + """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + alpha=TensorInputType(const=True, type_domain="U"), + beta=TensorInputType(const=True, type_domain="U"), + ) + + type_domains = { + "T": (types.fp16, types.fp32), + "U": (types.fp16, types.fp32), + } + + +@register_op(opset_version=_IOS17_TARGET) +class sigmoid_hard(_sigmoid_hard_iOS15): + """ + Return ``min( max( alpha * x + beta, 0 ), 1 )`` elementwise. + + The major difference between this version and the iOS 15 :py:class:`~.iOS15.activation.sigmoid_hard` + is that the ``alpha`` and ``beta`` may have a different dtype than the input/output. + + Parameters + ---------- + x: tensor<\*?, T> (Required) + alpha: const U (Required) + beta: const U (Required) + + Returns + ------- + tensor<\*?, T> + * A tensor of the same shape and type as ``x``. + + Attributes + ---------- + T: fp16, fp32 + U: fp16, fp32 + """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + alpha=TensorInputType(const=True, type_domain="U"), + beta=TensorInputType(const=True, type_domain="U"), + ) + + type_domains = { + "T": (types.fp16, types.fp32), + "U": (types.fp16, types.fp32), + } + + +@register_op(opset_version=_IOS17_TARGET) +class softplus_parametric(_softplus_parametric_iOS15): + """ + Return ``alpha_i * log( 1 + e^( beta_i * x_i ) )``, where ``i = 1 ... C``. + + Parameters + ---------- + x: tensor<[b, C, n, m], T> (Required) + alpha: const tensor<[C], U> (Required) + beta: const tensor<[C], U> (Required) + + Returns + ------- + tensor<[b, C, n, m], T> + * A tensor of the same shape as ``x``. + + Attributes + ---------- + T: fp16, fp32 + U: fp16, fp32 + """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + alpha=TensorInputType(const=True, type_domain="U"), + beta=TensorInputType(const=True, type_domain="U"), + ) + + type_domains = { + "T": (types.fp16, types.fp32), + "U": (types.fp16, types.fp32), + } + + +@register_op(opset_version=_IOS17_TARGET) +class thresholded_relu(_thresholded_relu_iOS15): + """ + Return ``x`` if ``x >= alpha``, otherwise return ``0``. + + The major difference between this version and the iOS 15 :py:class:`~.iOS15.activation.thresholded_relu` + is that the ``alpha`` may have a different dtype than the input/output. + + Parameters + ---------- + x: tensor<\*?, T> (Required) + alpha: const U (Required) + + Returns + ------- + tensor<\*, T> + * A tensor of the same shape and type as ``x``. 
+ + Attributes + ---------- + T: fp16, fp32 + U: fp16, fp32 + """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + alpha=TensorInputType(const=True, type_domain="U"), + ) + + type_domains = { + "T": (types.fp16, types.fp32), + "U": (types.fp16, types.fp32), + } diff --git a/coremltools/converters/mil/mil/ops/defs/iOS17/elementwise_unary.py b/coremltools/converters/mil/mil/ops/defs/iOS17/elementwise_unary.py new file mode 100644 index 000000000..f17c51631 --- /dev/null +++ b/coremltools/converters/mil/mil/ops/defs/iOS17/elementwise_unary.py @@ -0,0 +1,112 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import numpy as np + +from coremltools.converters.mil.mil import types +from coremltools.converters.mil.mil.input_type import InputSpec, TensorInputType +from coremltools.converters.mil.mil.ops.defs._op_reqs import register_op +from coremltools.converters.mil.mil.ops.defs.iOS15.elementwise_unary import cast as _cast_iOS15 +from coremltools.converters.mil.mil.ops.defs.iOS15.elementwise_unary import clip as _clip_iOS15 +from coremltools.converters.mil.mil.ops.defs.iOS17 import _IOS17_TARGET + + +@register_op(opset_version=_IOS17_TARGET) +class cast(_cast_iOS15): + """ + Cast the input ``x`` to the new type ``dtype``. + The only difference between this version and the iOS 15 :py:class:`~.iOS15.elementwise_unary.cast` + is that it supports int16 and uint16. + + Parameters + ---------- + x: tensor<[\*d], T> (Required) + dtype: const str (Required) + * Can be one of the following types: ``int16``, ``uint16``, ``int32``, ``int64``, ``fp16``, + ``fp32``, ``fp64``, or ``bool``. + + Returns + ------- + tensor<[\*d], dtype> + * A tensor of the same shape as ``x``, with type ``dtype``. + + Attributes + ---------- + T: i16, ui16, i32, i64, fp16, fp32, fp64, bool. + """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), dtype=TensorInputType(const=True, type_domain=types.str) + ) + + type_domains = { + "T": ( + types.fp16, + types.fp32, + types.fp64, + types.int16, + types.uint16, + types.int32, + types.int64, + types.bool, + ), + } + + str_to_types_map = { + "int16": types.int16, + "uint16": types.uint16, + "int32": types.int32, + "int64": types.int32, + "fp16": types.fp16, + "fp32": types.fp32, + "fp64": types.fp32, + "bool": types.bool, + } + + str_to_numpy_type_map = { + "int16": np.int16, + "uint16": np.uint16, + "int32": np.int32, + "int64": np.int32, + "fp16": np.float16, + "fp32": np.float32, + "fp64": np.float32, + "bool": bool, + } + + +@register_op(opset_version=_IOS17_TARGET) +class clip(_clip_iOS15): + """ + Clip the values in the input ``x`` to ``[alpha, beta]``, element-wise. + Any values less than ``alpha`` are set to ``alpha``, and any values greater + than ``beta`` are set to ``beta``. + + The major difference between this version and the iOS 15 :py:class:`~.iOS15.elementwise_unary.clip` + is that it uses strict validation to ensure that ``alpha < beta``. + + Parameters + ---------- + x: tensor<[\*d], T> (Required) + alpha: const T (Required) + beta: const T (Required) + + Returns + ------- + tensor<[\*d], T> + * A tensor of the same shape as ``x``. 
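# Illustrative reference (not part of the diff) for the stricter iOS17 clip
# contract described above: alpha must be strictly smaller than beta, and
# values are clamped to [alpha, beta] element-wise.
import numpy as np

def clip_ref(x, alpha, beta):
    if alpha >= beta:
        raise ValueError(f"`alpha` ({alpha}) must be smaller than `beta` ({beta}).")
    return np.minimum(np.maximum(x, alpha), beta)

print(clip_ref(np.array([-3.0, 0.5, 7.0], dtype=np.float16), alpha=0.0, beta=1.0))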
+ + Attributes + ---------- + T: fp16, fp32 + """ + + def type_inference(self): + if self.alpha.val >= self.beta.val: + raise ValueError( + f"The `alpha` value ({self.alpha.val}) should be smaller than `beta` value " + f"({self.beta.val}) in `clip` op." + ) + return self.x.sym_type diff --git a/coremltools/converters/mil/mil/ops/defs/iOS17/image_resizing.py b/coremltools/converters/mil/mil/ops/defs/iOS17/image_resizing.py new file mode 100644 index 000000000..efb91e37b --- /dev/null +++ b/coremltools/converters/mil/mil/ops/defs/iOS17/image_resizing.py @@ -0,0 +1,185 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import numpy as np + +from coremltools.converters.mil.mil import types +from coremltools.converters.mil.mil.input_type import DefaultInputs, InputSpec, TensorInputType +from coremltools.converters.mil.mil.ops.defs._op_reqs import register_op +from coremltools.converters.mil.mil.ops.defs.iOS16.image_resizing import ( + crop_resize as _crop_resize_iOS16, +) +from coremltools.converters.mil.mil.ops.defs.iOS17 import _IOS17_TARGET + + +@register_op(opset_version=_IOS17_TARGET) +class crop_resize(_crop_resize_iOS16): + """ + The major differences between this version and the iOS 16 :py:class:`~.iOS16.image_resizing.crop_resize` + are as follows: + + - The input ``ROI`` is replaced by ``boxes`` and ``box_indices``. + - The dtype domain of input ``x``, ``boxes``, and ``box_indices`` are independent. + - The output no longer has the ``B`` dim. The output is ``[N, C, target_height, target_width]`` + rather than the ``[N, B, C, target_height, target_width]`` in iOS 16. + + Parameters + ---------- + x: tensor<[B, C, H, W], T> (Required) + * The input, from which patches (regions of interest) are extracted + and resized using bilinear interpolation. + * Rank ``4``. + + boxes: tensor<[N, 4], BOX_T> (Required) + * Coordinates of ``N`` boxes. + * The convention to express coordinates depends on the value of ``box_coordinate_mode``. + * If ``normalized_coordinates`` is True, only fp16 and fp32 dtypes are allowed. + + box_indices: tensor<[N], BOX_INDEX_T> (Optional) + * Default is ``arange(N)``, or ``[0, 1, ..., N-1]``. + * If ``box_indices[i]=j``, this means that ``boxes[i]`` will be applied to the ``j``-th image. + Therefore, it is invalid for ``box_indices[i]`` to be greater than ``B``. + + target_height: const (Optional, Default=1) + * Target height for resizing each patch. + + target_width: const (Optional, Default=1) + * Target width for resizing each patch. + + normalized_coordinates : const (Optional, default=False) + * If ``True``, the bounding box coordinates must be in the + interval ``[0, 1]``. Scaling is based on the input spatial + dimensions: ``(H_in - 1)`` for height and ``(W_in - 1)`` for width. + * If ``False``, the bounding box coordinates must be in the interval + ``[0, H_in - 1]`` for height dimensions and ``[0, W_in - 1]`` for + width dimensions. + + spatial_scale : const (Optional, default=1.0) + * Additional spatial scale that multiplies the bounding box coordinates. + * You would use this to implement the RoI Align layer, which typically + uses unnormalized RoI coordinates along with a spatial scale that is + less than or equal to ``1``. 
+ + box_coordinate_mode: const (Optional, default="CORNERS_HEIGHT_FIRST") + * Specifies the convention for specifying the four bounding box + coordinates for an image of size ``(Height, Width)``. The ``(0,0)`` + coordinate corresponds to the top-left corner of the image. + * This parameter can take one of four values: + + ``"CORNERS_HEIGHT_FIRST"``: ``[h_start, w_start, h_end, w_end]`` + + ``"CORNERS_WIDTH_FIRST"``: ``[w_start, h_start, w_end, h_end]`` + + ``"CENTER_SIZE_HEIGHT_FIRST"``: ``[h_center, w_center, box_height, box_width]`` + + ``"CENTER_SIZE_WIDTH_FIRST"``: ``[w_center, h_center, box_width, box_height]`` + + sampling_mode : const (Optional, default="DEFAULT") + * This parameter can take ``"STRICT_ALIGN_CORNERS"``, + ``"ALIGN_CORNERS"``, ``"DEFAULT"``, ``"OFFSET_CORNERS"`` or + ``UNALIGN_CORNERS`` as values. + * This is the same convention used by the :py:class:`~.iOS15.image_resizing.resize_bilinear` op. + + pad_value : const (Optional, default=0.0) + * If the box indexes go beyond the input boundary, the input image is padded with ``pad_value``. + * Defaults to ``0``. + * It is the same as ``extrapolation_value`` in `tf.image.crop_and_resize `_. + + Returns + ------- + tensor<[N, C, target_height, target_width], T> + + Attributes + ---------- + T: fp16, fp32 + BOX_T: fp16, fp32, uint16 + BOX_INDEX_T: uint16, int32 + """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + boxes=TensorInputType(type_domain="BOX_T"), + box_indices=TensorInputType(optional=True, type_domain="BOX_INDEX_T"), + target_height=TensorInputType(const=True, optional=True, type_domain=types.int32), + target_width=TensorInputType(const=True, optional=True, type_domain=types.int32), + normalized_coordinates=TensorInputType(const=True, optional=True, type_domain=types.bool), + spatial_scale=TensorInputType(const=True, optional=True, type_domain=types.fp32), + box_coordinate_mode=TensorInputType(const=True, optional=True, type_domain=types.str), + sampling_mode=TensorInputType(const=True, optional=True, type_domain=types.str), + pad_value=TensorInputType(const=True, optional=True, type_domain="T"), + ) + + type_domains = { + "T": (types.fp16, types.fp32), + "BOX_T": (types.fp16, types.fp32, types.uint16), + "BOX_INDEX_T": (types.uint16, types.int32), + } + + def default_inputs(self): + if self.box_indices is None and self.boxes.shape[0] > self.x.shape[0]: + # The default box indices is [0, 1, ..., N-1], which is out-of-range for N>B. + raise ValueError( + f'"crop_resize" op: N dimension of "boxes" ({self.boxes.shape[0]}) ' + f'should not be greater than the B dimension of "x" ({self.x.shape[0]}) ' + f'when "box_indices" is not specified, otherwise "box_indices" would ' + f'point outside of "x" bounds.' 
+ ) + + return DefaultInputs( + box_indices=list(range(self.boxes.shape[0])), + target_height=1, + target_width=1, + normalized_coordinates=False, + spatial_scale=1.0, + box_coordinate_mode="CORNERS_HEIGHT_FIRST", + sampling_mode="DEFAULT", + pad_value=0.0, + ) + + def _validate_input(self): + if self.x.rank != 4: + raise ValueError( + f'input to the "crop_resize" op must be of rank 4, but got {self.x.rank}' + ) + if self.boxes.rank != 2 or self.boxes.shape[1] != 4: + raise ValueError( + f'"crop_resize" op: input "boxes" must have shape [N, 4], but got {self.boxes.shape}' + ) + if self.box_indices.rank != 1 or self.box_indices.shape[0] != self.boxes.shape[0]: + raise ValueError( + f'"crop_resize" op: input "box_indices" must have shape [{self.boxes.shape[0]}], ' + f"but got {self.box_indices.shape}" + ) + if self.box_indices.val is not None and np.any(self.box_indices.val >= self.x.shape[0]): + raise ValueError( + f'"crop_resize" op: input "box_indices" should not have values >= B dimension of x ' + f"({self.x.shape[0]}), but got {self.box_indices.val}" + ) + if self.box_coordinate_mode.val not in self._VALID_BOX_COORDINATE_MODES: + raise ValueError( + f'"crop_resize" op: unrecognized box_coordinate_mode "{self.box_coordinate_mode.val}"' + ) + if self.sampling_mode.val not in self._VALID_SAMPLING_MODES: + raise ValueError( + f'"crop_resize" op: unrecognized sampling mode "{self.sampling_mode.val}"' + ) + if self.normalized_coordinates.val: + if self.boxes.dtype not in {types.fp16, types.fp32}: + raise ValueError( + f'"crop_resize" op: When normalized_coordinates is set, the ' + f'"boxes" must have fp16 or fp32 dtype, but got ' + f"{types.builtin_to_string(self.boxes.dtype)}" + ) + + def type_inference(self): + self._validate_input() + # Output shape is [N, C, h_out, w_out]. + ret_shape = [ + self.boxes.shape[0], + self.x.shape[1], + self.target_height.val, + self.target_width.val, + ] + return types.tensor(self.x.dtype, ret_shape) diff --git a/coremltools/converters/mil/mil/ops/defs/iOS17/quantization_ops.py b/coremltools/converters/mil/mil/ops/defs/iOS17/quantization_ops.py new file mode 100644 index 000000000..0e635cea9 --- /dev/null +++ b/coremltools/converters/mil/mil/ops/defs/iOS17/quantization_ops.py @@ -0,0 +1,242 @@ +# Copyright (c) 2022, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import numpy as np + +from coremltools.converters.mil.mil import types +from coremltools.converters.mil.mil.input_type import InputSpec, TensorInputType +from coremltools.converters.mil.mil.operation import VALUE, Operation, precondition +from coremltools.converters.mil.mil.ops.defs._op_reqs import register_op +from coremltools.converters.mil.mil.ops.defs.iOS17 import _IOS17_TARGET + + +def _rank_promoted_to_same_as_data(data, axis, param): + """ + Reshapes `param` to be the same shape as `data`.
+ """ + if axis is not None: + axis = axis if axis >= 0 else axis + len(data.shape) + if len(param.shape) == 0: + return np.reshape(param, np.ones(len(data.shape), np.int32)) + else: + axes = [i for i in range(len(data.shape)) if i != axis] + return np.expand_dims(param, axis=tuple(axes)) + + +def _check_scale_zp_shapes(input_data, scale, zero_point, axis): + def assert_vector_size_same_as_axial_dimension(param, axis_dim_size, name): + if param.rank == 1 and param.shape[0] != axis_dim_size: + raise ValueError( + "Parameter {}, if vector, needs to have same size as the dimension size along the parameter input".format( + name + ) + ) + + if scale.rank == 0: + # ios17.dequantize doesn't want axis defined for scalar quant params. + if axis is not None: + raise ValueError("axis should not be provided to quantize if scale/zp are scalars") + if zero_point is not None and zero_point.rank != 0: + raise ValueError("zero_point should be a scalar if scale is a scalar") + elif scale.rank == 1: + if axis is None or axis.val is None: + raise ValueError("axis should be provided to quantize if scale/zp are not scalars") + if axis.val < -input_data.rank or axis.val >= input_data.rank: + raise ValueError( + "Parameter axis needs to be in the range -input.rank <= axis < input.rank" + ) + + input_axis_dim_size = input_data.shape[axis.val] + assert_vector_size_same_as_axial_dimension(scale, input_axis_dim_size, "scale") + if zero_point is not None: + if zero_point.rank != 1: + raise ValueError("zero_point should be a vector if scale is a vector") + assert_vector_size_same_as_axial_dimension( + zero_point, input_axis_dim_size, "zero_point" + ) + else: + raise ValueError("Params scale & zero_point should both be scalars or vectors") + + +@register_op(opset_version=_IOS17_TARGET) +class quantize(Operation): + """ + Performs affine/linear quantization on an input tensor. + + The original data comes from the first "input". + The other parameters -- ``scale``, ``zero_point``, and ``axis`` -- describe how + quantization should occur:: + + quantized_data = clip(round(input / scale) + zero_point) + + Parameters + ---------- + input: tensor (Required) + + zero_point: const tensor (Optional) + * The ``zero_point`` can be either a scalar or a vector. If not provided, it is + assumed to be ``0``. + * The ``zero_point`` follows similar broadcasting rules and size constraints as ``scale``. + + scale: const tensor (Required) + * The ``scale`` can be either a scalar or a vector. + * If ``scale`` is a vector, for implementation, it is broadcasted to the following shape: + - The rank of ``scale`` becomes the same as the rank of the input. + - Constraint: ``size(scale-vector) == input.shape[axis]``. + - For ``i == axis``, ``scale.shape[i] == input.shape[i]``. + - For ``i != axis``, ``scale.shape == 1``. + - For example: + - Assume ``input.shape = (2, 3, 4, 5)`` and ``axis = 1``. + - If ``scale`` is a vector, then ``scale.size`` needs to be equal to + ``input.shape[axis]``; that is, equal to ``3``. + - This is broadcasted to ``(1, 3, 1, 1)``. + + axis: const tensor (Optional) + + output_dtype: const tensor (Required) + * This parameter can take ``"uint8"``, ``"int8"`` as values. + * The ``output_dtype`` value must match the ``zero_point`` dtype. 
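A plain-NumPy sketch of the broadcasting rule spelled out above (not the op implementation itself): with ``input.shape = (2, 3, 4, 5)`` and ``axis = 1``, a length-3 ``scale``/``zero_point`` is rank-promoted to ``(1, 3, 1, 1)`` before the clip/round formula is applied.

```python
import numpy as np

x = np.random.rand(2, 3, 4, 5).astype(np.float32)
scale = np.array([0.1, 0.2, 0.3], dtype=np.float32)  # size == x.shape[axis]
zero_point = np.array([0, 1, 2], dtype=np.int8)
axis = 1

# Rank-promote the per-channel parameters so they broadcast along `axis`:
# shape (3,) -> (1, 3, 1, 1).
bshape = [1] * x.ndim
bshape[axis] = -1
sc = scale.reshape(bshape)
zp = zero_point.reshape(bshape).astype(np.float32)

# quantized_data = clip(round(input / scale) + zero_point)
info = np.iinfo(np.int8)
quantized = np.clip(np.round(x / sc) + zp, info.min, info.max).astype(np.int8)
print(quantized.shape)  # (2, 3, 4, 5)
```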
+ + Returns + ------- + tensor + + Attributes + ---------- + SrcT: fp16, fp32 + DstT: uint8, int8 + """ + + input_spec = InputSpec( + input=TensorInputType(type_domain="SrcT"), + zero_point=TensorInputType(const=True, optional=True, type_domain="DstT"), + scale=TensorInputType(const=True, type_domain="SrcT"), + axis=TensorInputType(const=True, optional=True, type_domain=types.int32), + output_dtype=TensorInputType(const=True, type_domain=types.str), + ) + + type_domains = { + "SrcT": (types.fp16, types.fp32), + "DstT": (types.uint8, types.int8), + } + + def type_inference(self): + out_dtype = types.string_to_builtin(self.output_dtype.val) + if out_dtype not in {types.int8, types.uint8}: + raise ValueError( + '"quantize" op: unrecognized output dtype "{}"'.format(self.output_dtype.val) + ) + + if self.zero_point is not None: + if out_dtype != self.zero_point.dtype: + raise ValueError( + "output_dtype & zero_point dtype mismatch: {}, {}".format( + self.output_dtype.val, types.builtin_to_string(self.zero_point.dtype) + ) + ) + + _check_scale_zp_shapes(self.input, self.scale, self.zero_point, self.axis) + + return types.tensor(out_dtype, self.input.shape) + + @precondition(allow=VALUE) + def value_inference(self): + original_data = self.input.val + if self.zero_point is not None: + zero_point = self.zero_point.val + else: + zero_point = np.int8(0) if self.output_dtype.val == "int8" else np.uint8(0) + scale = self.scale.val + axis = None + if self.axis is not None: + axis = self.axis.val + dtype_info = np.iinfo(zero_point.dtype) + + sc = _rank_promoted_to_same_as_data(original_data, axis, scale) + zp = _rank_promoted_to_same_as_data(original_data, axis, zero_point) + val = np.clip( + np.around(original_data / sc) + zp.astype(np.float32), dtype_info.min, dtype_info.max + ) + return val.astype(zero_point.dtype) + + +@register_op(opset_version=_IOS17_TARGET) +class dequantize(Operation): + """ + Performs dequantization on an input tensor with affine/linear quantization. + + The quantized data comes from the first "input". + The other parameters -- ``scale``, ``zero_point``, and ``axis`` -- describe how + unquantized values can be extracted from it, + using the following equation for affine/linear quantization:: + + unquantized_data = scale * (input - zero_point) + + Parameters + ---------- + input: tensor (Required) + + zero_point: const tensor (Optional) + * The ``zero_point`` can be either a scalar or a vector. If not provided, + it is assumed to be ``0``. + * The ``zero_point`` follows similar broadcasting rules and size constraints as ``scale``. + + scale: const tensor (Required) + * The ``scale`` can be either a scalar or a vector. + * If ``scale`` is a vector, for implementation, it is broadcasted to the following shape: + - The rank of ``scale`` becomes the same as the rank of the input. + - Constraint: ``size(scale-vector) == input.shape[axis]``. + - For ``i == axis``, ``scale.shape[i] == input.shape[i]``. + - For ``i != axis``, ``scale.shape == 1``. + - For example: + - Assume ``input.shape = (2, 3, 4, 5)`` and ``axis = 1``. + - If ``scale`` is a vector, then ``scale.size`` needs to be equal to + ``input.shape[axis]``; that is, equal to ``3``. + - This is broadcasted to ``(1, 3, 1, 1)``. 
+ + axis: const tensor (Optional) + + Returns + ------- + tensor + + Attributes + ---------- + SrcT: uint8, int8 + DstT: fp16, fp32 + """ + + input_spec = InputSpec( + input=TensorInputType(type_domain="SrcT"), + zero_point=TensorInputType(const=True, optional=True, type_domain="SrcT"), + scale=TensorInputType(const=True, type_domain="DstT"), + axis=TensorInputType(const=True, optional=True, type_domain=types.int32), + ) + + type_domains = { + "DstT": (types.fp16, types.fp32), + "SrcT": (types.uint8, types.int8), + } + + def type_inference(self): + _check_scale_zp_shapes(self.input, self.scale, self.zero_point, self.axis) + return types.tensor(self.scale.dtype, self.input.shape) + + @precondition(allow=VALUE) + def value_inference(self): + quantized_data = self.input.val + if self.zero_point is not None: + zero_point = self.zero_point.val + else: + zero_point = np.int8(0) if self.input.dtype == types.int8 else np.uint8(0) + scale = self.scale.val + axis = None + if self.axis is not None: + axis = self.axis.val + + sc = _rank_promoted_to_same_as_data(quantized_data, axis, scale) + zp = _rank_promoted_to_same_as_data(quantized_data, axis, zero_point) + val = sc * (quantized_data.astype(np.float32) - zp.astype(np.float32)) + return val.astype(scale.dtype) diff --git a/coremltools/converters/mil/mil/ops/defs/iOS17/reduction.py b/coremltools/converters/mil/mil/ops/defs/iOS17/reduction.py new file mode 100644 index 000000000..f8d5d03fd --- /dev/null +++ b/coremltools/converters/mil/mil/ops/defs/iOS17/reduction.py @@ -0,0 +1,124 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import numpy as np + +from coremltools.converters.mil.mil import types +from coremltools.converters.mil.mil.input_type import DefaultInputs, InputSpec, TensorInputType +from coremltools.converters.mil.mil.ops.defs._op_reqs import register_op +from coremltools.converters.mil.mil.ops.defs.iOS15.reduction import reduce_arg as _reduce_arg_iOS15 +from coremltools.converters.mil.mil.ops.defs.iOS17 import _IOS17_TARGET + + +class reduce_arg(_reduce_arg_iOS15): + _VALID_OUTPUT_DTYPES = ("int32", "uint16") + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + axis=TensorInputType(const=True, optional=True, type_domain=types.int32), + keep_dims=TensorInputType(const=True, optional=True, type_domain=types.bool), + output_dtype=TensorInputType(const=True, optional=True, type_domain=types.str), + ) + + type_domains = { + "T": (types.fp16, types.fp32, types.int32), + } + + def default_inputs(self): + return DefaultInputs( + axis=-1, + keep_dims=False, + output_dtype="int32", + ) + + def type_inference(self): + reduced_shape = self._find_reduced_shape() + output_dtype = self.output_dtype.val.lower() + if output_dtype not in self._VALID_OUTPUT_DTYPES: + raise ValueError( + f'Invalid "output_dtype" {output_dtype}. Only support {self._VALID_OUTPUT_DTYPES}' + ) + return types.tensor(types.string_to_builtin(output_dtype), tuple(reduced_shape)) + + +@register_op(opset_version=_IOS17_TARGET) +class reduce_argmax(reduce_arg): + """ + Computes the indices of the maximum value across dimensions of a tensor. + In case of ties, the identity of the return value is not guaranteed. + The differences between this version and the iOS 15 :py:class:`~.iOS15.reduction.reduce_argmax`: + - The output supports uint16 dtype. + - New optional input ``output_dtype``. 
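A short builder-level sketch of the new ``output_dtype`` input mentioned above (hypothetical values; the program-construction style follows the tests later in this diff):

```python
import numpy as np
import coremltools as ct
from coremltools.converters.mil.mil import Builder as mb

x_val = np.array([[1.0, 5.0, 2.0], [7.0, 3.0, 6.0]], dtype=np.float32)

# Target iOS17 so the reduce_argmax variant with `output_dtype` is used.
@mb.program(input_specs=[], opset_version=ct.target.iOS17)
def prog():
    return mb.reduce_argmax(x=x_val, axis=-1, keep_dims=False, output_dtype="uint16")

argmax_op = prog.functions["main"].find_ops(op_type="reduce_argmax")[0]
print(argmax_op.outputs[0].dtype)  # uint16, per the type inference above
```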
+ + Parameters + ---------- + x: <\*,T> (Required) + * Must be 1-dimensional or higher. + + axis: const (Optional) + * The dimension to reduce. Default is ``-1``. + + keep_dims: const (Optional, default=False) + * If ``False``, the rank is reduced by ``1`` by removing the dimension + specified in ``axis``. + * If ``True``, retain reduced axis with length ``1``. + + output_dtype: const (Optional) + * Possible values: ``uint16``, ``int32``. + * If set, then value type inference will output using that dtype. + * Default is ``int32``. + + Returns + ------- + <\*, U> + + Attributes + ---------- + T: fp16, fp32, i32 + U: int32, uint16 + """ + + def get_operator(self): + return np.argmax + + +@register_op(opset_version=_IOS17_TARGET) +class reduce_argmin(reduce_arg): + """ + Computes the indices of the minimum value across dimensions of a tensor. + In case of ties, the identity of the return value is not guaranteed. + The differences between this version and the iOS 15 :py:class:`~.iOS15.reduction.reduce_argmin`: + - The output supports uint16 dtype. + - New optional input ``output_dtype``. + + Parameters + ---------- + x: <\*,T> (Required) + * Must be 1-dimensional or higher. + + axis: const (Optional) + * The dimension to reduce. Default is ``-1``. + + keep_dims: const (Optional, default=False) + * If ``False``, the rank is reduced by ``1`` by removing the dimension specified + in ``axis``, otherwise retain reduced axis with length ``1``. + + output_dtype: const (Optional) + * Possible values: ``uint16``, ``int32``. + * If set, then value type inference will output using that dtype. + * Default is ``int32``. + + Returns + ------- + <\*, U> + + Attributes + ---------- + T: fp16, fp32, i32 + U: int32, uint16 + """ + + def get_operator(self): + return np.argmin diff --git a/coremltools/converters/mil/mil/ops/defs/iOS17/scatter_gather.py b/coremltools/converters/mil/mil/ops/defs/iOS17/scatter_gather.py new file mode 100644 index 000000000..95898c616 --- /dev/null +++ b/coremltools/converters/mil/mil/ops/defs/iOS17/scatter_gather.py @@ -0,0 +1,430 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import numpy as np + +from coremltools.converters.mil.mil import types +from coremltools.converters.mil.mil.input_type import DefaultInputs, InputSpec, TensorInputType +from coremltools.converters.mil.mil.ops.defs._op_reqs import register_op +from coremltools.converters.mil.mil.ops.defs.iOS15.scatter_gather import ( + gather_along_axis as _gather_along_axis_iOS15, +) +from coremltools.converters.mil.mil.ops.defs.iOS15.scatter_gather import scatter as _scatter_iOS15 +from coremltools.converters.mil.mil.ops.defs.iOS15.scatter_gather import ( + scatter_along_axis as _scatter_along_axis_iOS15, +) +from coremltools.converters.mil.mil.ops.defs.iOS15.scatter_gather import ( + scatter_nd as _scatter_nd_iOS15, +) +from coremltools.converters.mil.mil.ops.defs.iOS16.scatter_gather import gather as _gather_iOS16 +from coremltools.converters.mil.mil.ops.defs.iOS16.scatter_gather import ( + gather_nd as _gather_nd_iOS16, +) +from coremltools.converters.mil.mil.ops.defs.iOS17 import _IOS17_TARGET + + +@register_op(opset_version=_IOS17_TARGET) +class scatter(_scatter_iOS15): + """ + Scatter ``updates`` to ``data`` at locations ``indices`` at dimension ``axis`` + by the operation ``mode``. 
+ + This section documents only the differences between this version and the + iOS 15 :py:class:`~.iOS15.scatter_gather.scatter`. The major differences are as follows: + + - Input parameter ``indices`` now supports only positive values -- negative values + are considered out-of-bound. If support for negative indices is required, they must be + explicitly converted to positive values using the following:: + + index = iOS17.select(index >= 0, index, index + max_index) + + - New input parameter called ``validate_indices`` has been added to all scatter ops. + Its behavior is as follows: + - If ``True``, it raises a runtime (possibly also a compile-time) exception for out-of-bound values of + the ``indices`` parameter. + - If ``False``, absolutely no checking is performed for out-of-bound values of ``indices`` + either at compile or runtime. Behavior for out-of-bound indices is undefined but memory safe. + + Parameters + ---------- + data: tensor<\*D, T> (Required) + indices: tensor<[C], i32> (Required) + * 1-D tensor. + updates: tensor<\*K, T> (Required) + * ``K = data.shape[:axis] + [len(indices)] + data.shape[axis+1:]``. + axis: const i32 (Optional) + * Default to ``0``. + mode: const string (Optional) + * Can be the following modes: ``add``, ``div``, ``max``, ``min``, ``mul``, ``sub``, ``update``. + * Default value is ``update``. + validate_indices: const bool (Optional) + * If ``True``, it raises a runtime (possibly also a compile-time) exception for out-of-bound values of + the ``indices`` parameter. + * If ``False``, absolutely no checking is performed for out-of-bound values of ``indices`` + either at compile or runtime. Behavior for out-of-bound indices is undefined but memory safe. + * Default value is ``False``. + + Returns + ------- + tensor<\*D, T> + * With the same type and shape as input ``x``. + + Attributes + ---------- + T: fp16, fp32, i32 + """ + + input_spec = InputSpec( + data=TensorInputType(type_domain="T"), + indices=TensorInputType(type_domain=types.int32), + updates=TensorInputType(type_domain="T"), + axis=TensorInputType(const=True, optional=True, type_domain=types.int32), + mode=TensorInputType(const=True, optional=True, type_domain=types.str), + validate_indices=TensorInputType(const=True, optional=True, type_domain=types.bool), + ) + + def default_inputs(self): + return DefaultInputs( + axis=0, + mode="add", + validate_indices=False, + ) + + def type_inference(self): + result = super().type_inference() + if self.validate_indices.val: + indices = self.indices.val + if indices is not None: + if np.count_nonzero( + np.logical_or(indices < 0, indices >= self.data.shape[self.axis.val]) + ): + raise IndexError( + f"Indices is out of bounds for `{self.op_type}` node {self.name}. " + f"Expected indices between [0, {self.data.shape[self.axis.val]}), but got {indices}." + ) + return result + + +@register_op(opset_version=_IOS17_TARGET) +class scatter_along_axis(_scatter_along_axis_iOS15): + """ + Scatter ``updates`` to ``data`` at locations ``indices`` along ``axis`` dimension + using the ``mode`` operation. + + The major differences from the previous version are illustrated in :py:class:`scatter`. + For more information, see the iOS 15 :py:class:`~.iOS15.scatter_gather.scatter_along_axis`. + + Parameters + ---------- + data: tensor<\*D, T> (Required) + indices: tensor<\*K, i32> (Required) + * ``rank(indices) == rank(data)``. + updates: tensor<\*K, T> (Required) + * Must be the same shape as ``indices``. + axis: const i32 (Optional) + * Default to ``0``. 
+ mode: const string (Optional) + * Default to ``add``. + * Can be the following modes: ``add``, ``div``, ``max``, ``min``, ``mul``, ``sub``, ``update``. + validate_indices: const bool (Optional) + * If ``True``, it raises a runtime (possibly also a compile-time) exception for out-of-bound values of + the ``indices`` parameter. + * If ``False``, absolutely no checking is performed for out-of-bound values of ``indices`` + either at compile or runtime. Behavior for out-of-bound indices is undefined but memory safe. + * Default value is ``False``. + + Returns + ------- + tensor<\*D, T> + * With the same type and shape as input ``x``. + + Attributes + ---------- + T: fp16, fp32, i32 + """ + + input_spec = InputSpec( + data=TensorInputType(type_domain="T"), + indices=TensorInputType(type_domain=types.int32), + updates=TensorInputType(type_domain="T"), + axis=TensorInputType(const=True, optional=True, type_domain=types.int32), + mode=TensorInputType(const=True, optional=True, type_domain=types.str), + validate_indices=TensorInputType(const=True, optional=True, type_domain=types.bool), + ) + + def default_inputs(self): + return DefaultInputs( + axis=0, + mode="add", + validate_indices=False, + ) + + def type_inference(self): + result = super().type_inference() + if self.validate_indices.val: + indices = self.indices.val + if indices is not None: + if np.count_nonzero( + np.logical_or(indices < 0, indices >= self.data.shape[self.axis.val]) + ): + raise IndexError( + f"Indices is out of bounds for `{self.op_type}` node {self.name}. " + f"Expected indices between [0, {self.data.shape[self.axis.val]}), but got {indices}." + ) + return result + + +@register_op(opset_version=_IOS17_TARGET) +class scatter_nd(_scatter_nd_iOS15): + """ + Scatter ``updates`` to ``data`` at locations ``indices``. + + The major differences from the previous version are illustrated in :py:class:`scatter`. + For more information, see the iOS 15 :py:class:`~.iOS15.scatter_gather.scatter_nd`. + + Parameters + ---------- + data: tensor<\*D, T> (Required) + indices: tensor<\*K, i32> (Required) + updates: tensor<\*K, T> (Required) + * Must be the shape as ``K[:-1]+data.shape[K[-1]:]``. + mode: const string (Optional) + * Default to ``add``. + * Can be the following modes: ``add``, ``div``, ``max``, ``min``, ``mul``, ``sub``, ``update``. + validate_indices: const bool (Optional) + * If ``True``, it raises a runtime (possibly also a compile-time) exception for out-of-bound values of + the ``indices`` parameter. + * If ``False``, absolutely no checking is performed for out-of-bound values of ``indices`` + either at compile or runtime. Behavior for out-of-bound indices is undefined but memory safe. + * Default value is ``False``. + + Returns + ------- + tensor<\*D, T> + * A tensor with the same shape and type as ``data``. 
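A sketch of how the ``validate_indices`` flag described above surfaces errors for the scatter family: with constant, out-of-bound indices the check fires during type inference, i.e. already at program-construction time (hypothetical values, illustrative only):

```python
import numpy as np
import coremltools as ct
from coremltools.converters.mil.mil import Builder as mb

data = np.zeros((3, 4), dtype=np.float32)
updates = np.ones((2, 4), dtype=np.float32)

try:
    @mb.program(input_specs=[], opset_version=ct.target.iOS17)
    def bad_prog():
        # Index 3 is out of bounds for axis 0 of a (3, 4) tensor, and iOS17
        # treats negative indices as out-of-bound as well.
        return mb.scatter(
            data=data,
            indices=np.array([0, 3], dtype=np.int32),
            updates=updates,
            axis=0,
            mode="update",
            validate_indices=True,
        )
except IndexError as err:
    print(err)
```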
+ + Attributes + ---------- + T: fp16, fp32, i32 + """ + + input_spec = InputSpec( + data=TensorInputType(type_domain="T"), + indices=TensorInputType(type_domain=types.int32), + updates=TensorInputType(type_domain="T"), + mode=TensorInputType(const=True, optional=True, type_domain=types.str), + validate_indices=TensorInputType(const=True, optional=True, type_domain=types.bool), + ) + + def default_inputs(self): + return DefaultInputs( + mode="add", + validate_indices=False, + ) + + def type_inference(self): + result = super().type_inference() + if self.validate_indices.val: + indices = self.indices.val + upper_bound = self.data.shape + if indices is not None: + if np.count_nonzero(np.logical_or(indices < 0, indices >= upper_bound)): + raise IndexError( + f"Indices is out of bounds for `{self.op_type}` node {self.name}. " + f"Expected indices between [0, {upper_bound}), but got {indices}." + ) + return result + + +@register_op(opset_version=_IOS17_TARGET) +class gather(_gather_iOS16): + """ + Gather slices from input ``x`` along dimension ``axis`` according to ``indices``, + similar to `tf.gather_nd `_. + + This section documents only the differences between this version and the + iOS 16 :py:class:`~.iOS16.scatter_gather.gather`. The major differences are as follows: + + - Input parameter ``indices`` now supports only positive values -- negative values + are considered out-of-bound. If support for negative indices is required, they must be + explicitly converted to positive values, using the following:: + + index = iOS17.select(index >= 0, index, index + max_index) + + - New input parameter called ``validate_indices`` has been added to all gather ops. + Its behavior is as follows: + - If ``True``, it raises a runtime (possibly also a compile-time) exception for + out-of-bound values of the ``indices`` parameter. + - If ``False``, absolutely no checking is performed for out-of-bound values of ``indices`` + either at compile or runtime. Behavior for out-of-bound indices is undefined but memory safe. + + Parameters + ---------- + x: tensor<\*D, U> (Required) + indices: tensor<\*N, I> (Required) + * Indices values may be negative. More precisely, ``-D[axis]<= v < D[axis]`` for ``v`` in ``indices``. + axis: const i32 (Optional. Default=``0``) + * Negative axis is supported. + batch_dims: const i32 (Optional. Default=``0``) + * The number of batch dimensions. + validate_indices: const bool (Optional) + * If ``True``, it raises a runtime (possibly also a compile-time) exception for out-of-bound values of + the ``indices`` parameter. + * If ``False``, absolutely no checking is performed for out-of-bound values of ``indices`` + either at compile or runtime. Behavior for out-of-bound indices is undefined but memory safe. + * Default value is ``False``. + + Returns + ------- + tensor<\*K, T> + * Where ``K = D[:axis] + N[batch_dims:] + D[axis+1:]``. 
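The negative-index recipe quoted above (``index = iOS17.select(index >= 0, index, index + max_index)``) could be expressed at the builder level roughly as follows. This is a sketch with hypothetical values, not code from this patch; it folds negative indices into range before gathering.

```python
import numpy as np
import coremltools as ct
from coremltools.converters.mil.mil import Builder as mb

x_val = np.arange(12, dtype=np.float32).reshape(3, 4)
raw_indices = np.array([-1, 0, 1], dtype=np.int32)  # -1 is out-of-bound for iOS17 gather

# Fold negative indices into [0, dim) before gathering, then let
# validate_indices=True confirm they are in range.
@mb.program(input_specs=[], opset_version=ct.target.iOS17)
def prog():
    indices = mb.select(
        cond=raw_indices >= 0,
        a=raw_indices,
        b=raw_indices + np.int32(x_val.shape[0]),
    )
    return mb.gather(x=x_val, indices=indices, axis=0, validate_indices=True)
```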
+ + Attributes + ---------- + T: fp16, fp32, i32 + I: uint16, int16, int32 + """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="U"), + indices=TensorInputType(type_domain="I"), + axis=TensorInputType(const=True, optional=True, type_domain=types.int32), + batch_dims=TensorInputType(const=True, optional=True, type_domain=types.int32), + validate_indices=TensorInputType(const=True, optional=True, type_domain=types.bool), + ) + + def default_inputs(self): + return DefaultInputs(axis=0, batch_dims=0, validate_indices=False) + + def type_inference(self): + result = super().type_inference() + if self.validate_indices.val: + indices = self.indices.val + if indices is not None: + if np.count_nonzero( + np.logical_or(indices < 0, indices >= self.x.shape[self.axis.val]) + ): + raise IndexError( + f"Indices is out of bounds for `{self.op_type}` node {self.name}. " + f"Expected indices between [0, {self.x.shape[self.axis.val]}), but got {indices}." + ) + return result + + +@register_op(opset_version=_IOS17_TARGET) +class gather_along_axis(_gather_along_axis_iOS15): + """ + Take the values along ``axis`` at locations ``indices``. + + The major differences from the previous version are illustrated in :py:class:`gather`. + For more information, see the iOS 15 :py:class:`~.iOS15.scatter_gather.gather_along_axis`. + + Parameters + ---------- + x: tensor<\*D, T> (Required) + indices: tensor<\*K, i32> (Required) + * ``rank(indices) == rank(x)``. + axis: const i32 (Optional): + * Default to ``0``. + validate_indices: const bool (Optional) + * If ``True``, it raises a runtime (possibly also a compile-time) exception for out-of-bound values of + the ``indices`` parameter. + * If ``False``, absolutely no checking is performed for out-of-bound values of ``indices`` + either at compile or runtime. Behavior for out-of-bound indices is undefined but memory safe. + * Default value is ``False``. + + Returns + ------- + tensor<\*D, T>: + * Output tensor has the same shape as ``indices``. + + Attributes + ---------- + T: fp16, fp32, i32 + """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + indices=TensorInputType(type_domain=types.int32), + axis=TensorInputType(const=True, optional=True, type_domain=types.int32), + validate_indices=TensorInputType(const=True, optional=True, type_domain=types.bool), + ) + + def default_inputs(self): + return DefaultInputs( + axis=0, + validate_indices=False, + ) + + def type_inference(self): + result = super().type_inference() + if self.validate_indices.val: + indices = self.indices.val + if indices is not None: + upper_bound = self.x.shape[self.axis.val] + if np.count_nonzero(np.logical_or(indices < 0, indices >= upper_bound)): + raise IndexError( + f"Indices is out of bounds for `{self.op_type}` node {self.name}. " + f"Expected indices between [0, {upper_bound}), but got {indices}." + ) + return result + + +@register_op(opset_version=_IOS17_TARGET) +class gather_nd(_gather_nd_iOS16): + """ + Gather slices from ``x`` according to ``indices``, similar to `tf.gather_nd`. + + The major differences from the previous version are illustrated in :py:class:`gather`. + For more information, see the iOS 16 :py:class:`~.iOS16.scatter_gather.gather_nd`. + + Parameters + ---------- + x: tensor<\*D, T> (Required) + indices: tensor<\*K, i32> (Required) + batch_dims: const i32 (Optional. Default=``0``) + * The number of batch dimensions. 
+ validate_indices: const bool (Optional) + * If ``True``, it raises a runtime (possibly also a compile-time) exception for out-of-bound values of + the ``indices`` parameter. + * If ``False``, absolutely no checking is performed for out-of-bound values of ``indices`` + either at compile or runtime. Behavior for out-of-bound indices is undefined but memory safe. + * Default value is ``False``. + + Returns + ------- + tensor<\*V, T> + * ``V = K[:-1] + D[batch_dims + K[-1]:]``, where ``D = x.shape`` and ``K = indices.shape``. + + Attributes + ---------- + T: fp16, fp32, i32 + """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="U"), + indices=TensorInputType(type_domain="I"), + batch_dims=TensorInputType(const=True, optional=True, type_domain=types.int32), + validate_indices=TensorInputType(const=True, optional=True, type_domain=types.bool), + ) + + def default_inputs(self): + return DefaultInputs( + batch_dims=0, + validate_indices=False, + ) + + def type_inference(self): + result = super().type_inference() + if self.validate_indices.val: + indices = self.indices.val + upper_bound = self.x.shape + if indices is not None: + if np.count_nonzero(np.logical_or(indices < 0, indices >= upper_bound)): + raise IndexError( + f"Indices is out of bounds for `{self.op_type}` node {self.name}. " + f"Expected indices between [0, {upper_bound}), but got {indices}." + ) + return result diff --git a/coremltools/converters/mil/mil/ops/defs/iOS17/tensor_operation.py b/coremltools/converters/mil/mil/ops/defs/iOS17/tensor_operation.py new file mode 100644 index 000000000..da74582ef --- /dev/null +++ b/coremltools/converters/mil/mil/ops/defs/iOS17/tensor_operation.py @@ -0,0 +1,148 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +from coremltools.converters.mil.mil import types +from coremltools.converters.mil.mil.input_type import InputSpec, TensorInputType +from coremltools.converters.mil.mil.ops.defs._op_reqs import register_op +from coremltools.converters.mil.mil.ops.defs.iOS15.tensor_operation import ( + non_maximum_suppression as _nms_iOS15, +) +from coremltools.converters.mil.mil.ops.defs.iOS16.tensor_operation import topk as _topk_iOS16 +from coremltools.converters.mil.mil.ops.defs.iOS17 import _IOS17_TARGET + + +@register_op(opset_version=_IOS17_TARGET) +class non_maximum_suppression(_nms_iOS15): + """ + Performs non-maximum suppression (NMS) on the boxes according to their intersection-over-union (IoU). + + NMS iteratively removes lower-scoring boxes which have an IoU greater than ``iou_threshold`` with + another (higher-scoring) box. + + The major differences between this version and the iOS 15 :py:class:`~.iOS15.tensor_operation.non_maximum_suppression` + are as follows: + + - The input parameter ``score_threshold`` has been removed. + - The inputs ``boxes`` and ``scores`` are ordered with number of boxes in the last dimension. + - The fourth output containing number of boxes for each batch has been removed. + + Parameters + ---------- + + boxes: tensor<[n, 4, B], T> (Required) + * Box coordinates on which to perform NMS. The coordinates are expected in + ``CENTER_SIZE_WIDTH_FIRST`` format ``(x, y, width, height)``, in which ``(x, y)`` is the center. + scores: tensor<[n, K, B], T> (Required) + * Scores for each one of the boxes. ``K`` is the number of classes. 
+ iou_threshold: const (Required) + * The intersection over union (IoU) threshold over which boxes are + suppressed. NMS remove all overlapping boxes with ``IoU > iou_threshold``. + max_boxes: const (Required) + * Maximum number of boxes to select. If the number of surviving boxes are + less, the output is padded up to this number. + per_class_suppression: const (Optional) + * Defaults to ``False``. + * If ``True``, suppression is performed independently within boxes of each class. + + Returns + ------- + tensor<[n, 4, max_boxes], T> + * Coordinates of selected boxes. + tensor<[n, K, max_boxes], T> + * Scores of selected boxes. + tensor<[n, max_boxes], i32> + * Indices of selected boxes. + + Attributes + ---------- + T: fp16, fp32 + """ + + input_spec = InputSpec( + boxes=TensorInputType(type_domain="T"), + scores=TensorInputType(type_domain="T"), + iou_threshold=TensorInputType(const=True, type_domain="T"), + max_boxes=TensorInputType(const=True, type_domain=types.int32), + per_class_suppression=TensorInputType(const=True, optional=True, type_domain=types.bool), + ) + + def type_inference(self): + boxes_dtype = self.boxes.dtype + scores_dtype = self.scores.dtype + n_batch, n_score_class, _ = self.scores.shape + max_boxes = self.max_boxes.val + + return ( + types.tensor(boxes_dtype, (n_batch, 4, max_boxes)), + types.tensor(scores_dtype, (n_batch, n_score_class, max_boxes)), + types.tensor(types.int32, (n_batch, max_boxes)), + ) + + +@register_op(opset_version=_IOS17_TARGET) +class topk(_topk_iOS16): + """ + A version of ``topk`` for iOS 17+. The only difference between this version and the + iOS 16 :py:class:`~.iOS16.tensor_operation.topk` is the data type support. + The newly added data type is: + - int16, unint16 for ``x`` and output. + - int16 for ``k``. + + Parameters + ---------- + x: <\*?, T> (Required) + * Input tensor. + k: const (Optional) + * Defaults to ``1``. + * Number of values/indices to be computed along each axis. + * Set to ``-1`` to select all elements. + axis: const (Optional) + * Defaults to ``-1`` (last dimension). + * Axis to perform the operation. + ascending: const (Optional) + * Defaults to ``False``, sort in descending order. + * ``True`` to sort in ascending order. + sort: const (Optional) + * Defaults to ``True``. + * If ``True``, ``top-k`` elements are themselves sorted. + Otherwise, no particular ordering is guaranteed. + return_indices: const (Optional) + * Defaults to ``True``. + * If ``True``, returns both values and indices. Otherwise, returns only the ``top-k`` values. + + Returns + ------- + tensor<\*?, T> + * Values of top/bottom ``k`` elements. + + tensor<\*?, int32> + * Only returned when ``return_indices = True`` + * Indices of the top/bottom ``k`` elements along axis. 
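To illustrate the widened dtype support in the iOS 17 ``topk`` described above, a small builder-level sketch with an int16 input and an int16 ``k`` (hypothetical values, illustrative only):

```python
import numpy as np
import coremltools as ct
from coremltools.converters.mil.mil import Builder as mb

x_val = np.array([[4, 1, 3], [2, 6, 5]], dtype=np.int16)

# Target iOS17 so the int16/uint16-capable variant is chosen.
@mb.program(input_specs=[], opset_version=ct.target.iOS17)
def prog():
    return mb.topk(x=x_val, k=np.int16(2), axis=-1, ascending=False)

topk_op = prog.functions["main"].find_ops(op_type="topk")[0]
print(topk_op.outputs[0].val)  # value inference should yield the two largest values per row
```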
+ + Attributes + ---------- + T: fp16, fp32, int16, int32, uint16 + K: int16, int32 + """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + k=TensorInputType(const=True, optional=True, type_domain="K"), + axis=TensorInputType(const=True, optional=True, type_domain=types.int32), + ascending=TensorInputType(const=True, optional=True, type_domain=types.bool), + sort=TensorInputType(const=True, optional=True, type_domain=types.bool), + return_indices=TensorInputType(const=True, optional=True, type_domain=types.bool), + ) + + type_domains = { + "T": ( + types.fp16, + types.fp32, + types.int16, + types.int32, + types.uint16, + ), + "K": (types.int16, types.int32), + } diff --git a/coremltools/converters/mil/mil/ops/defs/iOS17/tensor_transformation.py b/coremltools/converters/mil/mil/ops/defs/iOS17/tensor_transformation.py new file mode 100644 index 000000000..c2052aa51 --- /dev/null +++ b/coremltools/converters/mil/mil/ops/defs/iOS17/tensor_transformation.py @@ -0,0 +1,87 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +from typing import List + +from coremltools.converters.mil.mil.ops.defs._op_reqs import register_op +from coremltools.converters.mil.mil.ops.defs.iOS15.tensor_transformation import ( + reshape as _reshape_iOS15, +) +from coremltools.converters.mil.mil.ops.defs.iOS17 import _IOS17_TARGET + + +@register_op(opset_version=_IOS17_TARGET) +class reshape(_reshape_iOS15): + """ + Return a tensor that has the same values as ``x`` with shape ``shape``. + ``shape`` must have the same volume (number of elements) as ``x``. + + The major difference between this version and the iOS 15 :py:class:`~.iOS15.tensor_transformation.reshape` is as follows: + + When the ``shape`` contains ``0``, + the restriction about ``K == rank(x)`` is no longer enforced. Each ``0`` in ``shape`` will match the + corresponding dimension in ``x.shape``, counting from the rightmost element. So ``shape[i]`` + matches ``input[j]`` if ``length(shape)-i == rank(input)-j``. If a ``0`` is out of range, assign ``1`` + (equivalent to ``expand_dims`` for ``x.shape``). + + More specifically, when ``x.shape`` is ``[2, 50]`` and ``shape`` is ``[1, 0, -1, 0]``, it will error out + in iOS 15 or iOS 16 because ``x`` has rank ``2`` while the ``len`` of ``shape`` is ``4``. In iOS 17, the result will + have ``shape`` ``[1, 1, 2, 50]``, because the rightmost ``0`` will be changed to the rightmost dim of + ``x.shape``, which is ``50``. There is no other ``0`` that has a corresponding dim in ``x.shape``, so it is set + as ``1``. Finally, the ``-1`` is calculated based on knowing dimensions that produce ``2``. + + Parameters + ---------- + x: tensor<\*?, T> (Required) + + * An ``n-D`` tensor or a scalar. + * If ``x`` has a fixed rank (and possibly contains symbolic dimension), + ``shape`` may contain elements that are not positive integers (see below). + * If ``x`` has a variadic rank, ``shape`` can only contain positive integers. + + shape: tensor<[K], i32> (Required) + + A 1-D tensor, with elements from the following: + + * Positive integers. + * Symbols: All but one symbol in ``shape`` must be present in ``x.shape``. + The new symbol that is not present in ``x.shape`` represents a dimension + such that the total size remains constant. Symbol is illegal + if ``x`` has a variadic rank. + * ``-1``: ``-1`` introduces a new symbol (see Symbols). 
Therefore, ``-1`` is + allowed if all symbols in the ``shape`` appear in ``x.shape``. ``-1`` is illegal + if ``x`` has a variadic rank. + * ``0``: It will match the corresponding dimension in ``x.shape``. See the previous + description of different behaviors with iOS 17. + + Returns + ------- + tensor<\*?, T> + * Tensor with shape determined by the input shape. + + Attributes + ---------- + T: fp16, fp32, i32, bool + """ + + @staticmethod + def replace_zeros_in_shape(from_shape: List[int], to_shape: List[int]) -> List[int]: + """ + Replaces 0s in `to_shape` by the corresponding dims in `from_shape`. + + Overrides IOS15's method to demonstrate IOS17's different behaviours. + """ + if to_shape.count(0): + # To do right alignment, we reverse the input and do left alignment instead. + from_shape_reversed = from_shape[::-1] + to_shape_reversed = to_shape[::-1] + for idx, to_element in enumerate(to_shape_reversed): + if to_element == 0: + to_shape_reversed[idx] = ( + from_shape_reversed[idx] if idx < len(from_shape_reversed) else 1 + ) + # Reverse the result back to make the right alignment. + to_shape = to_shape_reversed[::-1] + return to_shape diff --git a/coremltools/converters/mil/mil/ops/helper.py b/coremltools/converters/mil/mil/ops/helper.py index fc699b636..ea65697f4 100644 --- a/coremltools/converters/mil/mil/ops/helper.py +++ b/coremltools/converters/mil/mil/ops/helper.py @@ -3,6 +3,7 @@ # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + def _get_version_of_op(op_variants, opset_version): """ A utility function that retrieves an op cls given a dictionary of op variants and target version @@ -12,17 +13,13 @@ def _get_version_of_op(op_variants, opset_version): opset_versions.sort() if opset_version is None: op_cls = op_variants[opset_versions[0]] - elif opset_version > opset_versions[-1]: - # TODO(rdar://103267345): Remove when no longer required. - # MIL opsets inherit ops from previous ones by default. - op_cls = op_variants[opset_versions[-1]] else: if opset_version not in op_variants: op_type = list(op_variants.values())[0].__name__ msg = ( - "No available version for {} in the {!s} opset. Please update the " - "minimum_deployment_target to at least {!s}" - ).format(op_type, opset_version, opset_versions[0]) - raise ValueError(msg) + "No available version for {} in the coremltools.target.{} opset. 
Please update the " + "minimum_deployment_target to at least coremltools.target.{}" + ).format(op_type, opset_version.name, opset_versions[0].name) + raise ValueError(msg) op_cls = op_variants[opset_version] return op_cls diff --git a/coremltools/converters/mil/mil/ops/registry.py b/coremltools/converters/mil/mil/ops/registry.py index d24f5f4e8..49946796c 100644 --- a/coremltools/converters/mil/mil/ops/registry.py +++ b/coremltools/converters/mil/mil/ops/registry.py @@ -56,7 +56,8 @@ class SSAOpRegistry: target.iOS13, target.iOS14, target.iOS15, - target.iOS16 + target.iOS16, + target.iOS17, ) core_ops = defaultdict(dict) dialect_ops = {} diff --git a/coremltools/converters/mil/mil/ops/tests/test_activation.py b/coremltools/converters/mil/mil/ops/tests/test_activation.py index 2daf5ad8f..515208b28 100644 --- a/coremltools/converters/mil/mil/ops/tests/test_activation.py +++ b/coremltools/converters/mil/mil/ops/tests/test_activation.py @@ -9,6 +9,7 @@ import pytest import scipy +import coremltools as ct from coremltools.converters.mil import testing_reqs from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import types @@ -392,14 +393,11 @@ def test_builder_eval(self): v = mb.prelu(x=x_val, alpha=alpha) alpha_br = alpha - - for i in range(1, len(x_val.shape)): - alpha_br = np.expand_dims(alpha_br, i) - - x_pos = np.maximum(x_val, 0) - b = np.minimum(x_val, 0) - - np.testing.assert_allclose(x_pos + b * alpha_br, v.val, atol=1e-04, rtol=1e-05) + for i in range(len(x_val.shape)): + if i != 1: + alpha_br = np.expand_dims(alpha_br, i) + expected_res = np.maximum(x_val, 0) + np.minimum(x_val, 0) * alpha_br + np.testing.assert_allclose(expected_res, v.val, atol=1e-04, rtol=1e-05) @ssa_fn def test_builder_eval1(self): @@ -822,12 +820,13 @@ def test_builder_eval(self): alpha_br = np.array([1, 2, 3], dtype=np.float32) beta_br = np.array([4, 5, 6], dtype=np.float32) - for i in range(1, len(x_val.shape)): - alpha_br = np.expand_dims(alpha_br, i) - beta_br = np.expand_dims(beta_br, i) - out = alpha_br * np.log(np.exp(x_val * beta_br) + 1) + for i in range(len(x_val.shape)): + if i != 1: + alpha_br = np.expand_dims(alpha_br, i) + beta_br = np.expand_dims(beta_br, i) + expected_res = alpha_br * np.log(np.exp(x_val * beta_br) + 1) - np.testing.assert_allclose(out, v.val, atol=1e-04, rtol=1e-05) + np.testing.assert_allclose(expected_res, v.val, atol=1e-04, rtol=1e-05) @ssa_fn def test_builder_eval2(self): @@ -1078,3 +1077,173 @@ def build(x): compute_unit=compute_unit, backend=backend, ) + + +class TestInputWeightDifferentDtypes: + """ + Starting from IOS17 the alpha/beta can have different dtypes from the input/output, so this + test class is mainly to verify the behaviour of those alpha/beta related activations. + """ + + @pytest.mark.parametrize( + "opset_version, different_dtype, op_name", + itertools.product( + [None, ct.target.iOS17], + [True, False], + ["elu", "leaky_relu", "prelu", "thresholded_relu"], + ), + ) + def test_builder_eval_alpha(self, opset_version, different_dtype, op_name): + x = np.array([[[-1, 2, -3], [4, -5, 6]]], dtype=np.float32) + alpha = np.float16(2.0) if different_dtype else np.float32(2.0) + if op_name == "prelu": + alpha = np.array([2.0, 2.0], dtype=alpha.dtype) # prelu requires alpha to be rank 1. + + def prog(): + return getattr(mb, op_name)(x=x, alpha=alpha) + + if different_dtype and opset_version != ct.target.iOS17: + # Before iOS17 it should raise error when alpha has different dtype than input/output. 
+ with pytest.raises(ValueError, match="must have the same data type"): + mb.program(input_specs=[], opset_version=opset_version)(prog) + else: + mb.program(input_specs=[], opset_version=opset_version)(prog) + + @pytest.mark.parametrize( + "opset_version, different_dtype, op_name", + itertools.product( + [None, ct.target.iOS17], + [True, False], + [ + "clamped_relu", + "linear_activation", + "scaled_tanh", + "sigmoid_hard", + "softplus_parametric", + ], + ), + ) + def test_builder_eval_alpha_beta(self, opset_version, different_dtype, op_name): + x = np.array([[[-1, 2, -3], [4, -5, 6]]], dtype=np.float32) + alpha = np.float16(2.0) if different_dtype else np.float32(2.0) + beta = np.float16(1.0) if different_dtype else np.float32(1.0) + if op_name == "softplus_parametric": + alpha = np.array([2.0, 2.0], dtype=alpha.dtype) + beta = np.array([1.0, 1.0], dtype=beta.dtype) + + def prog(): + return getattr(mb, op_name)(x=x, alpha=alpha, beta=beta) + + if different_dtype and opset_version != ct.target.iOS17: + with pytest.raises(ValueError, match="must have the same data type"): + mb.program(input_specs=[], opset_version=opset_version)(prog) + else: + mb.program(input_specs=[], opset_version=opset_version)(prog) + + @pytest.mark.parametrize( + "compute_unit, different_dtype, op_name", + itertools.product( + compute_units, [True, False], ["elu", "leaky_relu", "prelu", "thresholded_relu"] + ), + ) + def test_builder_to_backend_numerical_alpha(self, compute_unit, different_dtype, op_name): + x = np.array([[[-1, 2, -3], [4, -5, 6]]], dtype=np.float32) + alpha = np.float16(2.0) if different_dtype else np.float32(2.0) + if op_name == "prelu": + alpha = np.array([2.0, 2.0], dtype=alpha.dtype) + + def calculate_by_np(): + if op_name == "elu": + res = np.copy(x) + res[res < 0] = alpha * (np.exp(res[res < 0]) - 1) + return res + elif op_name == "leaky_relu": + res = np.copy(x) + res[res < 0] *= 2.0 + return res + elif op_name == "prelu": + alpha_br = np.copy(alpha) + for i in range(len(x.shape)): + if i != 1: + alpha_br = np.expand_dims(alpha_br, i) + res = np.maximum(x, 0) + np.minimum(x, 0) * alpha_br + return res + elif op_name == "thresholded_relu": + res = np.copy(x) + res[res < alpha] = 0.0 + return res + else: + raise ValueError(f"Invalid op_name: {op_name}") + + def build(x): + return getattr(mb, op_name)(x=x, alpha=alpha) + + run_compare_builder( + build, + input_placeholders={"x": mb.placeholder(shape=x.shape)}, + input_values={"x": x}, + expected_output_types=x.shape + (types.fp32,), + expected_outputs=calculate_by_np(), + compute_unit=compute_unit, + backend=("mlprogram", "fp16"), + minimum_deployment_target=ct.target.iOS17, + ) + + @pytest.mark.parametrize( + "compute_unit, different_dtype, op_name", + itertools.product( + compute_units, + [True, False], + [ + "clamped_relu", + "linear_activation", + "scaled_tanh", + "sigmoid_hard", + "softplus_parametric", + ], + ), + ) + def test_builder_to_backend_numerical_alpha_beta(self, compute_unit, different_dtype, op_name): + x = np.array([[[-1, 2, -3], [4, -5, 6]]], dtype=np.float32) + alpha = np.float16(2.0) if different_dtype else np.float32(2.0) + beta = np.float16(1.0) if different_dtype else np.float32(1.0) + if op_name == "softplus_parametric": + alpha = np.array([2.0, 2.0], dtype=alpha.dtype) + beta = np.array([1.0, 1.0], dtype=beta.dtype) + + def calculate_by_np(): + if op_name == "clamped_relu": + return np.minimum(np.maximum(x, 0), beta) + np.minimum( + np.minimum(x, 0) * alpha, beta + ) + elif op_name == "linear_activation": + return x * 
alpha + beta + elif op_name == "scaled_tanh": + return alpha * np.tanh(x * beta) + elif op_name == "sigmoid_hard": + return np.minimum(np.maximum((alpha * x) + beta, 0), 1) + elif op_name == "softplus_parametric": + alpha_br = alpha + beta_br = beta + for i in range(len(x.shape)): + if i != 1: + alpha_br = np.expand_dims(alpha_br, i) + beta_br = np.expand_dims(beta_br, i) + res = alpha_br * np.log(np.exp(x * beta_br) + 1) + return res + else: + raise ValueError(f"Invalid op_name: {op_name}") + + def build(x): + return getattr(mb, op_name)(x=x, alpha=alpha, beta=beta) + + run_compare_builder( + build, + input_placeholders={"x": mb.placeholder(shape=x.shape)}, + input_values={"x": x}, + expected_output_types=x.shape + (types.fp32,), + expected_outputs=calculate_by_np(), + compute_unit=compute_unit, + backend=("mlprogram", "fp16"), + minimum_deployment_target=ct.target.iOS17, + ) diff --git a/coremltools/converters/mil/mil/ops/tests/test_control_flow.py b/coremltools/converters/mil/mil/ops/tests/test_control_flow.py index f541f1ca7..11711058a 100644 --- a/coremltools/converters/mil/mil/ops/tests/test_control_flow.py +++ b/coremltools/converters/mil/mil/ops/tests/test_control_flow.py @@ -9,11 +9,11 @@ import pytest from coremltools.converters.mil.mil import Builder as mb -from coremltools.converters.mil.mil import types +from coremltools.converters.mil.mil import get_new_symbol, types from coremltools.converters.mil.testing_reqs import backends, compute_units from coremltools.converters.mil.testing_utils import random_gen, ssa_fn -from .testing_utils import UNK_SYM, run_compare_builder +from .testing_utils import UNK_SYM, construct_inputs_from_placeholders, run_compare_builder class TestSelect: @@ -54,11 +54,12 @@ def build(cond, a, b): ) @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends) + "compute_unit, backend", + itertools.product(compute_units, backends), ) def test_builder_to_backend_smoke_broadcast(self, compute_unit, backend): cond_val = np.array([[1], [0], [2]], dtype=np.float32) - a_val = np.array([[3, 1, 1], [1, 4, 1], [5, 6, 1]], dtype=np.float32) + a_val = np.array([1, 7, 8], dtype=np.float32) b_val = np.array([[3, 2, 2], [2, 4, 2], [5, 6, 2]], dtype=np.float32) input_placeholders = { "cond": mb.placeholder(shape=cond_val.shape), @@ -74,9 +75,41 @@ def build(cond, a, b): expected_output_types = [(3, 3, types.fp32)] expected_outputs = [ - np.array( - [[3.0, 1.0, 1.0], [2.0, 4.0, 2.0], [5.0, 6.0, 1.0]], dtype=np.float32 - ) + np.array([[1.0, 7.0, 8.0], [2.0, 4.0, 2.0], [1.0, 7.0, 8.0]], dtype=np.float32) + ] + + run_compare_builder( + build, + input_placeholders, + input_values, + expected_output_types, + expected_outputs, + compute_unit=compute_unit, + backend=backend, + ) + + @pytest.mark.parametrize( + "compute_unit, backend", + itertools.product(compute_units, backends), + ) + def test_builder_to_backend_smoke_scalar_and_tensor(self, compute_unit, backend): + cond_val = np.array([[1], [0], [2]], dtype=np.float32) + a_val = np.float32(1.0) + b_val = np.array([[3, 2, 2], [2, 4, 2], [5, 6, 2]], dtype=np.float32) + input_placeholders = { + "cond": mb.placeholder(shape=cond_val.shape), + "b": mb.placeholder(shape=b_val.shape), + } + input_values = {"cond": cond_val, "b": b_val} + + def build(cond, b): + if not types.is_bool(cond.dtype): + cond = mb.cast(x=cond, dtype="bool") + return [mb.select(cond=cond, a=a_val, b=b)] + + expected_output_types = [(3, 3, types.fp32)] + expected_outputs = [ + np.array([[1.0, 1.0, 1.0], [2.0, 4.0, 2.0], 
[1.0, 1.0, 1.0]], dtype=np.float32) ] run_compare_builder( @@ -89,6 +122,34 @@ def build(cond, a, b): backend=backend, ) + @pytest.mark.parametrize( + "compute_unit, backend", + itertools.product(compute_units, backends), + ) + def test_builder_to_backend_smoke_symbolic(self, compute_unit, backend): + SYMBOLIC_SHAPE = tuple([get_new_symbol() for _ in range(5)]) + VALUE = 100.0 + + input_placeholders = {"a": mb.placeholder(shape=SYMBOLIC_SHAPE)} + + def build(a): + return [mb.select(cond=False, a=a, b=np.float32(VALUE))] + + shape = tuple(np.random.randint(1, 5, size=len(SYMBOLIC_SHAPE))) + a = np.random.rand(*shape) + input_values = {"a": a} + + run_compare_builder( + build, + input_placeholders, + input_values, + expected_output_types=[SYMBOLIC_SHAPE + (types.fp32,)], + expected_outputs=[VALUE], + inputs=construct_inputs_from_placeholders(input_placeholders, upper_bound=10), + compute_unit=compute_unit, + backend=backend, + ) + @ssa_fn def test_builder_eval(self): cond = np.random.randint(low=0, high=2, size=(6, 1, 7)).astype(bool) @@ -101,10 +162,17 @@ def test_builder_eval(self): def test_builder_eval_broadcast(self): cond = np.array([[True], [False], [True]]) a = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float32) - b = np.array([[7, 8], [9, 10], [11, 12]], dtype=np.float32) + b = np.array([7, 8], dtype=np.float32) res = mb.select(cond=cond, a=a, b=b) - np.testing.assert_allclose(np.array([[1, 2], [9, 10], [5, 6]], dtype=np.float32), res.val, atol=1e-04, rtol=1e-05) + np.testing.assert_allclose( + np.array([[1, 2], [7, 8], [5, 6]], dtype=np.float32), res.val, atol=1e-04, rtol=1e-05 + ) + @ssa_fn + def test_builder_eval_scalar(self): + res = mb.select(cond=True, a=np.float32(1), b=np.float32(2)) + assert isinstance(res.val, np.float32) + np.testing.assert_allclose(np.float32(1), res.val) class TestCond: @pytest.mark.parametrize( diff --git a/coremltools/converters/mil/mil/ops/tests/test_elementwise_unary.py b/coremltools/converters/mil/mil/ops/tests/test_elementwise_unary.py index f1b0640a8..aa91f944c 100644 --- a/coremltools/converters/mil/mil/ops/tests/test_elementwise_unary.py +++ b/coremltools/converters/mil/mil/ops/tests/test_elementwise_unary.py @@ -4,17 +4,19 @@ # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause import itertools +from unittest.mock import patch import numpy as np import pytest import scipy +import coremltools as ct from coremltools.converters.mil import testing_reqs from coremltools.converters.mil.mil import Builder as mb -from coremltools.converters.mil.mil import (Function, get_new_symbol, - types) -from coremltools.converters.mil.mil.types.symbolic import \ - is_compatible_symbolic_vector +from coremltools.converters.mil.mil import Function, get_new_symbol, types +from coremltools.converters.mil.mil.passes.pass_pipeline import PassPipeline +from coremltools.converters.mil.mil.types.symbolic import is_compatible_symbolic_vector +from coremltools.converters.mil.mil.var import Var from coremltools.converters.mil.testing_utils import ssa_fn from .testing_utils import run_compare_builder @@ -671,6 +673,87 @@ def build(x): backend=backend, ) + @pytest.mark.parametrize( + "compute_unit, backend, src_dtype, dst_dtype", + itertools.product( + compute_units, + [("mlprogram", "fp16")], + [np.float16, np.float32, np.float64, np.int64, np.int32, np.int16, np.uint16], + [np.float16, np.float32, np.float64, np.int64, np.int32, np.int16, np.uint16], + ), + ) + def test_builder_eval_cast_ios17(self, compute_unit, backend, src_dtype, dst_dtype): 
+ x = np.array([[1, 2, 3], [4, 5, 6]], dtype=src_dtype) + dst_dtype_str = types.builtin_to_string( + types.type_mapping.numpy_type_to_builtin_type(dst_dtype) + ) + expected_res = x.astype(dtype=np.float16) + + @mb.program(input_specs=[], opset_version=ct.target.iOS17) + def prog(): + return mb.cast(x=x, dtype=dst_dtype_str) + + main_func = prog.functions["main"] + cast_op = main_func.find_ops(op_type="cast")[0] + np.testing.assert_allclose(expected_res, cast_op.outputs[0].val, atol=1e-04, rtol=1e-05) + + @pytest.mark.parametrize( + "compute_unit, backend, src_dtype, dst_dtype", + itertools.product( + compute_units, + [("mlprogram", "fp16")], + [np.float16, np.float32, np.int16, np.int32, np.uint16], + [np.float16, np.float32, np.int16, np.int32, np.uint16], + ), + ) + def test_builder_to_backend_cast_ios17(self, compute_unit, backend, src_dtype, dst_dtype): + _SUPPORTED_IO_DTYPES = {types.fp16, types.fp32, types.int32} + x = np.array([[1, 2, 3], [4, 5, 6]], dtype=src_dtype) + src_builtin_dtype = types.type_mapping.numpy_type_to_builtin_type(src_dtype) + dst_builtin_dtype = types.type_mapping.numpy_type_to_builtin_type(dst_dtype) + expected_res = x.astype(dtype=np.float16) + + expected_cast_num = 1 + if src_builtin_dtype not in _SUPPORTED_IO_DTYPES: + # A cast will be inserted for unsupported dtypes inputs. + expected_cast_num += 1 + + # As CoreML IO only allows fp16/32 and int32, the output will be further cast. + expected_res_builtin_dtype = dst_builtin_dtype + if dst_builtin_dtype not in _SUPPORTED_IO_DTYPES: + expected_res_builtin_dtype = ( + types.int32 if types.is_int(dst_builtin_dtype) else types.fp32 + ) + expected_cast_num += 1 + + def build(x): + return mb.cast(x=x, dtype=types.builtin_to_string(dst_builtin_dtype)) + + with patch.object(Var, "_is_nonreplaceable_var") as mocked_is_nonreplaceable_var: + # Mock that the cast is non-replaceable, to make sure it's kept in the graph. + mocked_is_nonreplaceable_var.side_effect = ( + lambda var: var.op and var.op.op_type == "cast" + ) + # Remove the cast optimization pass to make sure all cast are kept in the graph. 
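+            # Keeping every cast in the program lets the assertion at the end of this test
+            # count the inserted casts exactly.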
+ pass_pipeline: PassPipeline = PassPipeline.DEFAULT + pass_pipeline.remove_passes( + ["common::cast_optimization", "common::topological_reorder"] + ) + mlmodel = run_compare_builder( + build, + {"x": mb.placeholder(shape=x.shape, dtype=src_builtin_dtype)}, + input_values={"x": x}, + expected_output_types=x.shape + (expected_res_builtin_dtype,), + expected_outputs=expected_res, + compute_unit=compute_unit, + backend=backend, + minimum_deployment_target=ct.target.iOS17, + pass_pipeline=pass_pipeline, + ) + prog = mlmodel._mil_program + cast_ops = prog["main"].find_ops(op_type="cast") + assert len(cast_ops) == expected_cast_num + def test_erf_value_inference(self): INPUT_SIZE=(2, 3, 4) rs = np.random.RandomState(1234) diff --git a/coremltools/converters/mil/mil/ops/tests/test_image_resizing.py b/coremltools/converters/mil/mil/ops/tests/test_image_resizing.py index ab0e542ee..92439617d 100644 --- a/coremltools/converters/mil/mil/ops/tests/test_image_resizing.py +++ b/coremltools/converters/mil/mil/ops/tests/test_image_resizing.py @@ -261,7 +261,7 @@ class TestUpsampleNearestNeighborFractionalScales: def test_builder_to_backend_smoke(self, compute_unit, backend): if backend[0] == "neuralnetwork": pytest.skip("nn backend not supported") - + if backend[0] == "mlprogram" and compute_unit != ct.ComputeUnit.CPU_ONLY: pytest.xfail("rdar://97398448 (TestUpsampleNearestNeighborFractionalScales failing on GPU)") @@ -490,7 +490,7 @@ def build_upsample_bilinear(x): ) expected_output_type = (1, 1, 2, 6, types.fp32) - + if align_corners and not half_pixel_centers: expected_output = [1., 1.2, 1.4, 1.6, 1.8, 2., 1., 1.2, 1.4, 1.6, 1.8, 2.] elif not align_corners and half_pixel_centers: @@ -703,16 +703,11 @@ def build(x): class TestCropResize: @pytest.mark.parametrize( - "compute_unit, backend", - itertools.product(compute_units, backends), + "compute_unit, backend, pad_value", + itertools.product(compute_units, backends, [0.0, 1.0, 10.0]), ) - def test_builder_to_backend_smoke_pad_value(self, compute_unit, backend): - if backend[0] == "neuralnetwork": - pytest.skip("pad_mode only supported on iOS16 or above") - - if ct.utils._macos_version() < (13, 0): - pytest.skip("pad_value not supported in macOS12 or older.") - + def test_builder_to_backend_ios16(self, compute_unit, backend, pad_value): + """For iOS16+ the crop_resize op supports pad_value.""" x = np.array( [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]], dtype=np.float32, @@ -723,24 +718,40 @@ def test_builder_to_backend_smoke_pad_value(self, compute_unit, backend): [0, 0.5, 1.8, 1., 0.3], [0, 0.0, 0.4, 0.6, 0.7], ], dtype=np.float32).reshape(3, 1, 5, 1, 1) - + def build(x): return mb.crop_resize( - x=x, - roi=roi, - target_width=2, - target_height=2, - normalized_coordinates=True, - box_coordinate_mode="CORNERS_HEIGHT_FIRST", - sampling_mode="ALIGN_CORNERS", - pad_value=10.0, + x=x, + roi=roi, + target_width=2, + target_height=2, + normalized_coordinates=True, + box_coordinate_mode="CORNERS_HEIGHT_FIRST", + sampling_mode="ALIGN_CORNERS", + pad_value=pad_value, ) - + expected_output_type = [ (3, 1, 1, 2, 2, types.fp32), ] expected_output = [ - np.array([ 3.1, 5.2, 10, 10, 10, 7.899, 10, 13.9, 2.2, 3.1, 9.4, 10.3], dtype=np.float32).reshape(3, 1, 1, 2, 2), + np.array( + [ + 3.1, + 5.2, + pad_value, + pad_value, + pad_value, + 7.899, + pad_value, + 13.9, + 2.2, + 3.1, + 9.4, + 10.3, + ], + dtype=np.float32, + ).reshape(3, 1, 1, 2, 2), ] input_placeholder_dict = {"x": mb.placeholder(shape=(1, 1, 4, 4))} @@ -756,11 +767,10 @@ def build(x): 
backend=backend, minimum_deployment_target=ct.target.iOS16, ) - - + @pytest.mark.parametrize( "compute_unit, backend, is_symbolic", - itertools.product(compute_units, backends, compute_units), + itertools.product(compute_units, backends, [True, False]), ) def test_builder_to_backend_smoke(self, compute_unit, backend, is_symbolic): if backend[0] == "mlprogram" and compute_unit != ct.ComputeUnit.CPU_ONLY: @@ -921,14 +931,119 @@ def build(x, mode=0): ] for mode in range(6): - # nn-proto does not support UNALIGN_CORNERS - if not (backend[0] == 'neuralnetwork' and mode == 5): - run_compare_builder( - functools.partial(build, mode=mode), - input_placeholder_dict, - input_value_dict, - expected_output_type[mode], - expected_output[mode], - compute_unit=compute_unit, - backend=backend, - ) + if backend[0] == "neuralnetwork" and mode == 5: + pytest.skip("nn-proto does not support UNALIGN_CORNERS") + run_compare_builder( + functools.partial(build, mode=mode), + input_placeholder_dict, + input_value_dict, + expected_output_type[mode], + expected_output[mode], + compute_unit=compute_unit, + backend=backend, + ) + + @pytest.mark.parametrize( + "compute_unit, backend, N", + itertools.product(compute_units, backends, [1, 3]), + ) + def test_builder_to_backend_ios17(self, compute_unit, backend, N): + """For iOS17+ the `roi` input is replaced by `boxes` and `box_indices`.""" + x = np.arange(1, 17, dtype=np.float32).reshape(1, 1, 4, 4) + boxes = np.array([1, 1, 2, 2], dtype=np.float32).reshape(1, 4) + box_indices = None + normalized_coordinates = False + if N == 3: + boxes = np.array( + [ + [0.1, 0.3, 1.3, 1.0], + [0.5, 1.8, 1.0, 0.3], + [0.0, 0.4, 0.6, 0.7], + ], + dtype=np.float32, + ) + box_indices = np.array([0] * 3, dtype=np.int32) + normalized_coordinates = True + + def build(x): + return mb.crop_resize( + x=x, + boxes=boxes, + box_indices=box_indices, + target_width=2, + target_height=2, + normalized_coordinates=normalized_coordinates, + box_coordinate_mode="CORNERS_HEIGHT_FIRST", + sampling_mode="ALIGN_CORNERS", + pad_value=10.0, + ) + + expected_outputs = [np.array([6, 7, 10, 11], dtype=np.float32).reshape(1, 1, 2, 2)] + if N == 3: + expected_outputs = [ + np.array( + [3.1, 5.2, 10.0, 10.0, 10.0, 7.899, 10.0, 13.9, 2.2, 3.1, 9.4, 10.3], + dtype=np.float32, + ).reshape(3, 1, 2, 2) + ] + + run_compare_builder( + build, + input_placeholders={"x": mb.placeholder(shape=(1, 1, 4, 4))}, + input_values={"x": x}, + expected_output_types=[(N, 1, 2, 2, types.fp32)], + expected_outputs=expected_outputs, + compute_unit=compute_unit, + backend=backend, + minimum_deployment_target=ct.target.iOS17, + ) + + def test_builder_eval_ios17_invalid(self): + x = np.arange(1, 17, dtype=np.float32).reshape(1, 1, 4, 4) + three_boxes = np.array( + [ + [0.1, 0.3, 1.3, 1.0], + [0.5, 1.8, 1.0, 0.3], + [0.0, 0.4, 0.6, 0.7], + ], + dtype=np.float32, + ) + with pytest.raises( + ValueError, + match='N dimension of "boxes" \(3\) should not be greater ' + 'than the B dimension of "x" \(1\)', + ): + + @mb.program(input_specs=[], opset_version=ct.target.iOS17) + def prog(): + return mb.crop_resize(x=x, boxes=three_boxes) + + one_box = np.array([1, 1, 2, 2], dtype=np.float32).reshape(1, 4) + indices_out_of_bound = np.array([10], dtype=np.int32) + with pytest.raises( + ValueError, + match='input "box_indices" should not have values >= B ' + "dimension of x \(1\), but got \[10\]", + ): + + @mb.program(input_specs=[], opset_version=ct.target.iOS17) + def prog(): + return mb.crop_resize(x=x, boxes=one_box, box_indices=indices_out_of_bound) + 
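+        # "box_indices" must be a rank-1 tensor with one index per box, so a 2-D tensor is rejected.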
+ indices_two_dim = np.array([[0]], dtype=np.int32) + with pytest.raises( + ValueError, match='input "box_indices" must has shape \[1\], but got \(1, 1\)' + ): + + @mb.program(input_specs=[], opset_version=ct.target.iOS17) + def prog(): + return mb.crop_resize(x=x, boxes=one_box, box_indices=indices_two_dim) + + x_rank5 = np.arange(1, 17, dtype=np.float32).reshape(1, 1, 4, 4, 1) + with pytest.raises( + ValueError, match='input to the "crop_resize" op must be of rank 4, but got 5' + ): + + @mb.program(input_specs=[], opset_version=ct.target.iOS17) + def prog(): + return mb.crop_resize(x=x_rank5, boxes=one_box) diff --git a/coremltools/converters/mil/mil/ops/tests/test_normalization.py b/coremltools/converters/mil/mil/ops/tests/test_normalization.py index abff161b8..cc3568df7 100644 --- a/coremltools/converters/mil/mil/ops/tests/test_normalization.py +++ b/coremltools/converters/mil/mil/ops/tests/test_normalization.py @@ -10,14 +10,13 @@ import pytest import coremltools as ct -from coremltools._deps import (_HAS_TF_2, _HAS_TORCH, MSG_TF2_NOT_FOUND, - MSG_TORCH_NOT_FOUND) +from coremltools._deps import _HAS_TF_2, _HAS_TORCH, MSG_TF2_NOT_FOUND, MSG_TORCH_NOT_FOUND from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import Function, get_new_symbol, types from coremltools.converters.mil.testing_reqs import backends, compute_units from coremltools.converters.mil.testing_utils import random_gen -from .testing_utils import UNK_SYM, run_compare_builder +from .testing_utils import UNK_SYM, construct_inputs_from_placeholders, run_compare_builder if _HAS_TORCH: import torch @@ -526,6 +525,9 @@ def build(x): input_values, expected_output_types, expected_outputs, + inputs=construct_inputs_from_placeholders(input_placeholders, 10) + if backend[0] == "mlprogram" + else None, compute_unit=compute_unit, backend=backend, ) @@ -550,7 +552,7 @@ def test_builder_to_backend_stress_numpy(self, compute_unit, backend, rank_and_a if backend == ("mlprogram", "fp16") and compute_unit != ct.ComputeUnit.CPU_ONLY: pytest.xfail("rdar://80662357 ([GPU failures] LayerNorm FP16 tests failing on GPU with numerical errors)") - + if backend[0] == "neuralnetwork" and compute_unit != ct.ComputeUnit.CPU_ONLY and platform.machine() == "arm64": pytest.xfail("rdar://98015195 ([M1 native tests] Some MIL unittests are failing on M1 native)") diff --git a/coremltools/converters/mil/mil/ops/tests/test_quantization.py b/coremltools/converters/mil/mil/ops/tests/test_quantization.py new file mode 100644 index 000000000..5b05b61ee --- /dev/null +++ b/coremltools/converters/mil/mil/ops/tests/test_quantization.py @@ -0,0 +1,460 @@ +# Copyright (c) 2022, Apple Inc. All rights reserved. 
+# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import itertools +from typing import Tuple + +import numpy as np +import pytest + +import coremltools as ct +from coremltools._deps import _HAS_TORCH, MSG_TORCH_NOT_FOUND +from coremltools.converters.mil.mil import Builder as mb +from coremltools.converters.mil.mil.ops.tests.testing_utils import run_compare_builder +from coremltools.converters.mil.mil.types import builtin_to_string, numpy_type_to_builtin_type +from coremltools.converters.mil.testing_utils import ssa_fn + +if _HAS_TORCH: + import torch + +torch.manual_seed(1042) +np.random.seed(1042) + + +class TestQuantizationBase: + @staticmethod + def get_random_quantization_params( + float_dtype: np.dtype, + quant_dtype: np.dtype, + input_rank: int, + is_zp_present: bool = True, + axis: int = None, + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + """ + return floating-point input, floating-point scale, integer zero point + """ + + x_shape = np.random.randint(low=1, high=5, size=(input_rank,)) + + low, high = (-128, 128) if quant_dtype == np.int8 else (0, 256) + + # create quantized x + x_q = np.random.randint(low=low, high=high, size=x_shape) + + # create scale and zero point, the dequantize x + x_fp = None + scale = None + zp = None + # quantize per tensor + if axis is None: + scale = np.array(np.random.rand()) + if is_zp_present: + zp = np.array(np.random.randint(low=low, high=high)) + x_fp = (x_q - zp) * scale + else: + x_fp = x_q * scale + # quantize per channel + else: + # prepare broadcast shape for latter dequantize + broadcastable_shape = np.ones(input_rank, dtype=np.int32) + broadcastable_shape[axis] = x_shape[axis] + + scale = np.random.rand(x_shape[axis]) + broadcasted_scale = np.reshape(scale, broadcastable_shape) + + if is_zp_present: + zp = np.random.randint(low=low, high=high, size=x_shape[axis]) + broadcasted_zp = np.reshape(zp, broadcastable_shape) + x_fp = (x_q - broadcasted_zp) * broadcasted_scale + else: + x_fp = x_q * broadcasted_scale + + x_fp = x_fp.astype(float_dtype) + scale = scale.astype(float_dtype) + zero_point = zp.astype(quant_dtype) if is_zp_present else None + return x_fp, scale, zero_point + + @staticmethod + def torch_quantize( + x: np.ndarray, + scale: np.ndarray, + zero_point: np.ndarray, + axis: int = None, + quant_dtype: np.dtype = None, + ) -> torch.Tensor: + """ + return quantized x by pytorch + """ + + # quantization data type is either inferred from `zero_point`, + # or explicitly provided + if zero_point is not None: + quant_dtype = zero_point.dtype + assert quant_dtype is not None + + # if scale is scalar, then axis must be None + # if scale is not scalar, then axis must have a value + assert (len(scale.shape) == 0) != (axis is not None) + + x_torch = torch.from_numpy(x).to(torch.float32) + s_torch = torch.from_numpy(scale).to(torch.float32) + zp_torch = ( + torch.zeros(scale.shape, dtype=torch.int) + if zero_point is None + else torch.from_numpy(zero_point) + ) + dtype_torch = torch.quint8 if quant_dtype == np.uint8 else torch.qint8 + + output: np.ndarray + if axis is None: + output = torch.quantize_per_tensor(x_torch, s_torch, zp_torch, dtype_torch) + else: + if axis < 0: + axis += len(x.shape) + output = torch.quantize_per_channel(x_torch, s_torch, zp_torch, axis, dtype_torch) + return output + + +class TestQuantize(TestQuantizationBase): + @ssa_fn + def test_builder_eval_scalar_params(self): + v = mb.quantize( + 
input=np.float32([[0, 2, 4], [0, 2, 4]]), + zero_point=np.uint8(1), + scale=np.float32(2), + output_dtype="uint8", + ) + np.testing.assert_allclose(np.array([[1, 2, 3], [1, 2, 3]]).astype(np.uint8), v.val) + + @ssa_fn + def test_builder_eval_vector_params(self): + v = mb.quantize( + input=np.array([1, 2, 3, 4]).reshape(1, 1, 2, 2).astype(np.float32), + zero_point=np.array([2, 4]).astype(np.int8), + scale=np.array([1, 2]).astype(np.float32), + axis=3, + output_dtype="int8", + ) + np.testing.assert_allclose( + np.array([3, 5, 5, 6]).reshape(1, 1, 2, 2).astype(np.int8), v.val + ) + + @ssa_fn + def test_builder_eval_vector_params_neg_axis(self): + v = mb.quantize( + input=np.array([1, 2, 3, 4]).reshape(1, 1, 2, 2).astype(np.float32), + zero_point=np.array([2, 4]).astype(np.int8), + scale=np.array([1, 2]).astype(np.float32), + axis=-1, + output_dtype="int8", + ) + np.testing.assert_allclose( + np.array([3, 5, 5, 6]).reshape(1, 1, 2, 2).astype(np.int8), v.val + ) + + @ssa_fn + def test_builder_eval_no_zero_point(self): + v = mb.quantize( + input=np.float32([[0, 2, 4], [0, 2, 4]]), + scale=np.float32(2), + output_dtype="int8", + ) + np.testing.assert_allclose(np.array([[0, 1, 2], [0, 1, 2]]).astype(np.int8), v.val) + + def test_smoke_builder_to_backend_quantize_per_tensor(self): + def build(x): + x = mb.cast(x=x, dtype="fp16") + quantized = mb.quantize( + input=x, + zero_point=np.int8(10), + scale=np.float16(0.1), + output_dtype="int8", + ) + # TODO(rdar://107430678): Replace scale=1 zero_point=0 quantize/dequantize with cast + dequantized = mb.dequantize( + input=quantized, + scale=np.float16(1), + ) + return dequantized + + x = np.array([-1.0, 0.0, 1.0, 2.0], dtype=np.float16) + expected_output = np.array([0, 10, 20, 30], dtype=np.float16) + expected_output_type = expected_output.shape + ( + numpy_type_to_builtin_type(expected_output.dtype), + ) + run_compare_builder( + build, + input_placeholders={"x": mb.placeholder(shape=x.shape)}, + input_values={"x": x}, + expected_output_types=[expected_output_type], + expected_outputs=[expected_output], + compute_unit=ct.ComputeUnit.CPU_ONLY, + backend=("mlprogram", "fp16"), + minimum_deployment_target=ct.target.iOS17, + ) + + def test_smoke_builder_to_backend_quantize_per_channel(self): + def build(x): + x = mb.cast(x=x, dtype="fp16") + quantized = mb.quantize( + input=x, + zero_point=np.uint8([10, 0]), + scale=np.float16([0.1, 0.01]), + axis=0, + output_dtype="uint8", + ) + # TODO(rdar://107430678): Replace scale=1 zero_point=0 quantize/dequantize with cast + dequantized = mb.dequantize( + input=quantized, + scale=np.float16(1), + ) + return dequantized + + x = np.array([[-1.0, 0.0], [1.0, 2.0]], dtype=np.float16) + expected_output = np.array([[0, 10], [100, 200]], dtype=np.float16) + expected_output_type = expected_output.shape + ( + numpy_type_to_builtin_type(expected_output.dtype), + ) + run_compare_builder( + build, + input_placeholders={"x": mb.placeholder(shape=x.shape)}, + input_values={"x": x}, + expected_output_types=[expected_output_type], + expected_outputs=[expected_output], + compute_unit=ct.ComputeUnit.CPU_ONLY, + backend=("mlprogram", "fp16"), + minimum_deployment_target=ct.target.iOS17, + ) + + @pytest.mark.skipif(not _HAS_TORCH, reason=MSG_TORCH_NOT_FOUND) + @pytest.mark.parametrize( + "float_dtype, quant_dtype, compute_precision, input_rank, is_zp_present", + itertools.product( + (np.float32, np.float16), + (np.int8, np.uint8), + ("fp32", "fp16"), + (1, 2, 3, 4, 5), + (True, False), + ), + ) + def 
test_stress_builder_to_backend_quantize_all_possibilities( + self, float_dtype, quant_dtype, compute_precision, input_rank, is_zp_present + ): + def build(x): + x = mb.cast(x=x, dtype=builtin_to_string(numpy_type_to_builtin_type(float_dtype))) + quantized = mb.quantize( + input=x, + zero_point=zero_point, + scale=scale, + axis=axis, + output_dtype=builtin_to_string(numpy_type_to_builtin_type(quant_dtype)), + ) + # TODO(rdar://107430678): Replace scale=1 zero_point=0 quantize/dequantize with cast + dequantized = mb.dequantize( + input=quantized, + scale=float_dtype(1), + ) + # TODO(rdar://98013530): some fp16-output models fail + if float_dtype == np.float16: + return mb.cast(x=dequantized, dtype="fp32") + else: + return dequantized + + for axis in [None] + [i for i in range(-input_rank, input_rank)]: + x_fp, scale, zero_point = self.get_random_quantization_params( + float_dtype, quant_dtype, input_rank, is_zp_present, axis + ) + + input_placeholders = { + "x": mb.placeholder( + shape=x_fp.shape, + dtype=numpy_type_to_builtin_type(float_dtype), + ), + } + input_values = {"x": x_fp} + + output_torch = self.torch_quantize(x_fp, scale, zero_point, axis, quant_dtype) + output_torch_val = output_torch.int_repr().numpy() + output_type = output_torch_val.shape + (numpy_type_to_builtin_type(np.float32),) + expected_outputs = [output_torch_val] + expected_output_types = [output_type] + + run_compare_builder( + build, + input_placeholders, + input_values, + expected_output_types, + expected_outputs=expected_outputs, + compute_unit=ct.ComputeUnit.CPU_ONLY, + backend=("mlprogram", compute_precision), + minimum_deployment_target=ct.target.iOS17, + ) + + +class TestDequantize(TestQuantizationBase): + @ssa_fn + def test_builder_eval_scalar_params(self): + v = mb.dequantize( + input=np.array([[1, 2, 3], [1, 2, 3]]).astype(np.uint8), + zero_point=np.uint8(1), + scale=np.float32(2), + ) + np.testing.assert_allclose(np.float32([[0, 2, 4], [0, 2, 4]]), v.val) + + @ssa_fn + def test_builder_eval_vector_params(self): + v = mb.dequantize( + input=np.array([3, 5, 5, 6]).reshape(1, 1, 2, 2).astype(np.uint8), + zero_point=np.array([2, 4]).astype(np.uint8), + scale=np.array([1, 2]).astype(np.float32), + axis=3, + ) + np.testing.assert_allclose( + np.array([1, 2, 3, 4]).reshape(1, 1, 2, 2).astype(np.float32), v.val + ) + + @ssa_fn + def test_builder_eval_no_zero_point(self): + v = mb.dequantize( + input=np.array([[0, 1, 2], [0, 1, 2]]).astype(np.int8), + scale=np.float32(2), + ) + np.testing.assert_allclose(np.float32([[0, 2, 4], [0, 2, 4]]), v.val) + + def test_smoke_builder_to_backend_dequantize_per_tensor(self): + def build(x): + x = mb.cast(x=x, dtype="fp32") + # TODO(rdar://107430678): Replace scale=1 zero_point=0 quantize/dequantize with cast + quantized = mb.quantize( + input=x, + scale=np.float32(1), + output_dtype="uint8", + ) + dequantized = mb.dequantize( + input=quantized, + zero_point=np.uint8(5), + scale=np.float32(0.2), + ) + return dequantized + + x = np.array([5, 10, 15, 20], dtype=np.float32) + expected_output = np.array([0, 1, 2, 3], dtype=np.float32) + expected_output_type = expected_output.shape + ( + numpy_type_to_builtin_type(expected_output.dtype), + ) + run_compare_builder( + build, + input_placeholders={"x": mb.placeholder(shape=x.shape)}, + input_values={"x": x}, + expected_output_types=[expected_output_type], + expected_outputs=[expected_output], + compute_unit=ct.ComputeUnit.CPU_ONLY, + backend=("mlprogram", "fp32"), + minimum_deployment_target=ct.target.iOS17, + atol=1e-3, + rtol=1e-3, + 
) + + def test_smoke_builder_to_backend_dequantize_per_channel(self): + def build(x): + x = mb.cast(x=x, dtype="fp32") + # TODO(rdar://107430678): Replace scale=1 zero_point=0 quantize/dequantize with cast + quantized = mb.quantize( + input=x, + scale=np.float32(1), + output_dtype="int8", + ) + dequantized = mb.dequantize( + input=quantized, + zero_point=np.int8([-5, 5]), + scale=np.float32([0.2, 0.3]), + axis=1, + ) + return dequantized + + x = np.array([[-10, -5], [0, 5]], dtype=np.float32) + expected_output = np.array([[-1, -3], [1, 0]], dtype=np.float32) + expected_output_type = expected_output.shape + ( + numpy_type_to_builtin_type(expected_output.dtype), + ) + run_compare_builder( + build, + input_placeholders={"x": mb.placeholder(shape=x.shape)}, + input_values={"x": x}, + expected_output_types=[expected_output_type], + expected_outputs=[expected_output], + compute_unit=ct.ComputeUnit.CPU_ONLY, + backend=("mlprogram", "fp32"), + minimum_deployment_target=ct.target.iOS17, + atol=1e-3, + rtol=1e-3, + ) + + @pytest.mark.skipif(not _HAS_TORCH, reason=MSG_TORCH_NOT_FOUND) + @pytest.mark.parametrize( + "float_dtype, quant_dtype, compute_precision, input_rank, is_zp_present", + itertools.product( + (np.float32, np.float16), + (np.int8, np.uint8), + ("fp32", "fp16"), + (1, 2, 3, 4, 5), + (True, False), + ), + ) + def test_stress_builder_to_backend_dequantize_all_possibilities( + self, float_dtype, quant_dtype, compute_precision, input_rank, is_zp_present + ): + def build(x): + x = mb.cast(x=x, dtype=builtin_to_string(numpy_type_to_builtin_type(float_dtype))) + # TODO(rdar://107430678): Replace scale=1 zero_point=0 quantize/dequantize with cast + quantized = mb.quantize( + input=x, + scale=float_dtype(1), + output_dtype=builtin_to_string(numpy_type_to_builtin_type(quant_dtype)), + ) + dequantized = mb.dequantize( + input=quantized, + zero_point=zero_point, + scale=scale, + axis=axis, + ) + # TODO(rdar://98013530): some fp16-output models fail + if float_dtype == np.float16: + return mb.cast(x=dequantized, dtype="fp32") + else: + return dequantized + + for axis in [None] + [i for i in range(-input_rank, input_rank)]: + x_fp, scale, zero_point = self.get_random_quantization_params( + float_dtype, quant_dtype, input_rank, is_zp_present, axis + ) + + x_q = self.torch_quantize(x_fp, scale, zero_point, axis, quant_dtype) + + output_torch_val = torch.dequantize(x_q).numpy() + output_type = output_torch_val.shape + (numpy_type_to_builtin_type(np.float32),) + + input_placeholders = { + "x": mb.placeholder( + shape=x_fp.shape, + dtype=numpy_type_to_builtin_type(float_dtype), + ), + } + input_values = {"x": x_q.int_repr().numpy()} + + expected_outputs = [output_torch_val] + expected_output_types = [output_type] + + run_compare_builder( + build, + input_placeholders, + input_values, + expected_output_types, + expected_outputs=expected_outputs, + compute_unit=ct.ComputeUnit.CPU_ONLY, + backend=("mlprogram", compute_precision), + minimum_deployment_target=ct.target.iOS17, + rtol=1e-3, + ) diff --git a/coremltools/converters/mil/mil/ops/tests/test_recurrent.py b/coremltools/converters/mil/mil/ops/tests/test_recurrent.py index 43c44eadd..6444ebf2b 100644 --- a/coremltools/converters/mil/mil/ops/tests/test_recurrent.py +++ b/coremltools/converters/mil/mil/ops/tests/test_recurrent.py @@ -8,12 +8,14 @@ import numpy as np import pytest +import coremltools as ct from coremltools._deps import _HAS_TORCH, MSG_TORCH_NOT_FOUND from coremltools.converters.mil.mil import Builder as mb from 
coremltools.converters.mil.mil import get_new_symbol, types from coremltools.converters.mil.testing_reqs import backends, compute_units +from coremltools.converters.mil.testing_utils import ssa_fn -from .testing_utils import run_compare_builder +from .testing_utils import construct_inputs_from_placeholders, run_compare_builder if _HAS_TORCH: import torch @@ -46,8 +48,8 @@ class TestGRU: [True, False], ["forward", "reverse"], [ - ["TANH", "SIGMOID"], - ["SIGMOID", "TANH"], + ["tanh", "sigmoid"], + ["sigmoid", "tanh"], ], [True, False], ), @@ -79,17 +81,21 @@ def test_builder_to_backend_smoke( b_o = 2 * np.random.rand(hidden_size) - 1 if has_bias else np.zeros((hidden_size)) def apply_act(x, option): - if option == 'TANH': + if option == "tanh": return np.tanh(x) - elif option == 'SIGMOID': - return 1. / (1 + np.exp(-x)) + elif option == "sigmoid": + return 1.0 / (1 + np.exp(-x)) else: raise ValueError("activation invalid") - def get_numpy_prediction_gru(X, H, return_seq, direction, - inner_activation_str='SIGMOID', - activation_str='TANH', - ): + def get_numpy_prediction_gru( + X, + H, + return_seq, + direction, + inner_activation_str="sigmoid", + activation_str="tanh", + ): """ shape of X : (B, Seq, input_size) @@ -117,9 +123,9 @@ def get_numpy_prediction_gru(X, H, return_seq, direction, output = np.transpose(output, (1, 0, 2)) return output, output[-1, :, :] - def get_numpy_prediction_gru_single_batch(X, h, return_seq, direction, - inner_activation_str='SIGMOID', - activation_str='TANH'): + def get_numpy_prediction_gru_single_batch( + X, h, return_seq, direction, inner_activation_str="sigmoid", activation_str="tanh" + ): np_out = np.zeros((seq_len, hidden_size)) batch_x = X if direction == "forward" else X[::-1, :] for k in range(seq_len): @@ -193,6 +199,9 @@ def build(x, initial_h): input_values, expected_output_types, expected_outputs, + inputs=construct_inputs_from_placeholders(input_placeholders, upper_bound=10) + if symbolic and backend[0] == "mlprogram" + else None, compute_unit=compute_unit, backend=backend, ) @@ -222,9 +231,9 @@ class TestLSTM: backends, [[8, 32, 32]], [1, 4], - ["SIGMOID"], - ["TANH"], - ["TANH", "SIGMOID"], + ["sigmoid"], + ["tanh"], + ["relu", "scaled_tanh", "hard_sigmoid", "linear"], [False, True], [False, True], [False, True], @@ -250,15 +259,17 @@ def test_numpy_numerical( clip, ): def _apply_act(x, option): - if option == "TANH": + # All activation functions use their standard default values. + # This makes `tanh` equivalent to `scaled_tanh`, and makes `linear` a pass through. 
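+            # With those defaults, hard_sigmoid is min(max(0.2 * x + 0.5, 0), 1), as implemented below.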
+ if option == "tanh" or option == "scaled_tanh": return np.tanh(x) - elif option == "RELU": + elif option == "relu": return np.maximum(0, x) - elif option == "SIGMOID": + elif option == "sigmoid": return 1.0 / (1 + np.exp(-x)) - elif option == "SIGMOID_HARD": + elif option == "hard_sigmoid": return np.minimum(np.maximum(0.2 * x + 0.5, 0), 1) - elif option == "LINEAR": + elif option == "linear": return x else: raise ValueError("activation invalid") @@ -517,6 +528,9 @@ def build(x, initial_h, initial_c): input_values, expected_output_types, expected_outputs, + inputs=construct_inputs_from_placeholders(input_placeholders, upper_bound=64) + if symbolic and backend[0] == "mlprogram" + else None, compute_unit=compute_unit, backend=backend, ) @@ -670,10 +684,57 @@ def build(x, initial_h, initial_c): input_values, expected_output_types, expected_outputs, + inputs=construct_inputs_from_placeholders(input_placeholders, upper_bound=64) + if symbolic and backend[0] == "mlprogram" + else None, compute_unit=compute_unit, backend=backend, ) + @ssa_fn + def test_invalid_bidirectional_lstm(self): + with pytest.raises( + ValueError, + match="For bidirectional LSTM, the `weight_ih_back` and " + "`weight_hh_back` must be provided.", + ): + seq_len = 3 + batch = 2 + input_size = 4 + hidden_size = 5 + mb.lstm( + x=np.random.rand(seq_len, batch, input_size), + initial_h=np.zeros((batch, hidden_size)).astype(np.float32), + initial_c=np.zeros((batch, hidden_size)).astype(np.float32), + weight_ih=np.random.rand(4 * hidden_size, input_size), + weight_hh=np.random.rand(4 * hidden_size, hidden_size), + direction="bidirectional", + ) + + @ssa_fn + def test_invalid_activation_lstm(self): + seq_len = 3 + batch = 2 + input_size = 4 + hidden_size = 5 + arguments = { + "x": np.random.rand(seq_len, batch, input_size), + "initial_h": np.zeros((batch, hidden_size)).astype(np.float32), + "initial_c": np.zeros((batch, hidden_size)).astype(np.float32), + "weight_ih": np.random.rand(4 * hidden_size, input_size), + "weight_hh": np.random.rand(4 * hidden_size, hidden_size), + "direction": "forward", + } + + with pytest.raises(ValueError, match="Activation `dummy` not supported."): + mb.lstm(recurrent_activation="dummy", **arguments) + + with pytest.raises(ValueError, match="Activation `dummy` not supported."): + mb.lstm(cell_activation="dummy", **arguments) + + with pytest.raises(ValueError, match="Activation `dummy` not supported."): + mb.lstm(activation="dummy", **arguments) + class TestRNN: @pytest.mark.skipif(not _HAS_TORCH, reason=MSG_TORCH_NOT_FOUND) @@ -785,6 +846,9 @@ def build(x, initial_h): input_values, expected_output_types, expected_outputs, + inputs=construct_inputs_from_placeholders(input_placeholders, upper_bound=64) + if symbolic and backend[0] == "mlprogram" + else None, compute_unit=compute_unit, backend=backend, ) diff --git a/coremltools/converters/mil/mil/ops/tests/test_reduction.py b/coremltools/converters/mil/mil/ops/tests/test_reduction.py index 2a10db8a7..3555e8764 100644 --- a/coremltools/converters/mil/mil/ops/tests/test_reduction.py +++ b/coremltools/converters/mil/mil/ops/tests/test_reduction.py @@ -9,11 +9,11 @@ import pytest import scipy +import coremltools as ct from coremltools.converters.mil import testing_reqs from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import get_new_symbol, types -from coremltools.converters.mil.mil.ops.tests.testing_utils import \ - run_compare_builder +from coremltools.converters.mil.mil.ops.tests.testing_utils import 
run_compare_builder from coremltools.converters.mil.testing_utils import random_gen, ssa_fn backends = testing_reqs.backends @@ -354,3 +354,47 @@ def prog(): atol=1e-04, rtol=1e-05 ) + + @pytest.mark.parametrize( + "compute_unit, backend, op_name, output_dtype", + itertools.product( + compute_units, backends, ["reduce_argmax", "reduce_argmin"], ["int32", "uint16", None] + ), + ) + def test_reduce_arg_ios17_output_dtype(self, compute_unit, backend, op_name, output_dtype): + def build(x): + return getattr(mb, op_name)(x=x, axis=1, keep_dims=False, output_dtype=output_dtype) + + val = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=np.float32) + input_placeholders = {"x": mb.placeholder(shape=val.shape)} + input_values = {"x": val} + output_np_type = np.uint16 if output_dtype == "uint16" else np.int32 + output_type = types.uint16 if output_dtype == "uint16" else types.int32 + expected_output_types = (2, output_type) + expected_outputs = np.array( + [2, 2] if op_name == "reduce_argmax" else [0, 0], dtype=output_np_type + ) + + run_compare_builder( + build, + input_placeholders, + input_values, + expected_output_types, + expected_outputs, + compute_unit=compute_unit, + backend=backend, + minimum_deployment_target=ct.target.iOS17, + ) + + @pytest.mark.parametrize( + "op_name", + ["reduce_argmax", "reduce_argmin"], + ) + def test_reduce_arg_ios17_output_dtype_invalid(self, op_name): + x = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=np.float32) + + def prog(): + return getattr(mb, op_name)(x=x, axis=1, keep_dims=False, output_dtype="dummy") + + with pytest.raises(ValueError, match='Invalid "output_dtype" dummy'): + mb.program(input_specs=[], opset_version=ct.target.iOS17)(prog) diff --git a/coremltools/converters/mil/mil/ops/tests/test_scatter_gather.py b/coremltools/converters/mil/mil/ops/tests/test_scatter_gather.py index 6829e9a8a..d2ef1b9c2 100644 --- a/coremltools/converters/mil/mil/ops/tests/test_scatter_gather.py +++ b/coremltools/converters/mil/mil/ops/tests/test_scatter_gather.py @@ -13,7 +13,7 @@ from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import types from coremltools.converters.mil.testing_reqs import backends, compute_units -from coremltools.converters.mil.testing_utils import ssa_fn +from coremltools.models.utils import _macos_version from .testing_utils import run_compare_builder @@ -23,9 +23,14 @@ class TestScatter: @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend, minimum_deployment_target", itertools.product( + compute_units, backends, [None, ct.target.iOS17]) ) - def test_builder_to_backend_smoke(self, compute_unit, backend): + def test_builder_to_backend_smoke(self, compute_unit, backend, minimum_deployment_target): + if minimum_deployment_target == ct.target.iOS17: + if backend[0] != "mlprogram" or _macos_version() < (14, 0): + pytest.skip("IOS17 target available only on macOS 14+ with mlprogram.") + data = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) indices = np.array([1, 0], dtype=np.int32) updates = np.array([[5, 6, 7], [8, 9, 10]], dtype=np.float32) @@ -52,34 +57,27 @@ def build(data, indices, updates): expected_outputs, compute_unit=compute_unit, backend=backend, + minimum_deployment_target=minimum_deployment_target, ) @pytest.mark.skipif(not _HAS_TF_2, reason=MSG_TF2_NOT_FOUND) @pytest.mark.parametrize( - "compute_unit, backend, rankData_rankIndices, accumulate_mode", + "compute_unit, backend, rankData_rankIndices, accumulate_mode, 
minimum_deployment_target", itertools.product( compute_units, backends, - [ - (1, 2), - (2, 1), - (3, 2), - (2, 3), - (2, 2), - (1, 1), - (3, 3), - (3, 3), - (3, 3), - (1, 3), - (3, 1), - (3, 1), - ], + [(1, 2), (2, 1), (3, 2), (2, 3), (1, 1), (3, 3), (1, 3)], ["update", "add", "sub", "mul", "div", "max", "min"], + [None, ct.target.iOS17] ), ) def test_builder_to_backend_programmatic( - self, compute_unit, backend, rankData_rankIndices, accumulate_mode + self, compute_unit, backend, rankData_rankIndices, accumulate_mode, minimum_deployment_target ): + if minimum_deployment_target == ct.target.iOS17: + if backend[0] != "mlprogram" or _macos_version() < (14, 0): + pytest.skip("IOS17 target available only on macOS 14+ with mlprogram.") + data_rank, indices_rank = rankData_rankIndices data_shape = np.random.randint(low=2, high=5, size=data_rank) indices_shape = np.random.randint(low=2, high=5, size=indices_rank) @@ -130,14 +128,92 @@ def build(data, indices, updates): expected_output, compute_unit=compute_unit, backend=backend, + minimum_deployment_target=minimum_deployment_target, ) + @pytest.mark.parametrize( + "compute_unit, backend, indices_val, validate_indices, dynamic", + itertools.product( + compute_units, + backends, + [[-1, 0], [10, 0]], # One negative indices, another out-of-range indices. + [True, False], + [True, False], + ), + ) + def test_ios17_invalid_indices(self, compute_unit, backend, indices_val, validate_indices, dynamic): + if backend[0] != "mlprogram" or _macos_version() < (14, 0): + pytest.skip("IOS17 target available only on macOS 14+") + + def build_static(data, updates): + return ( + mb.scatter( + data=data, + indices=np.array(indices_val, dtype=np.int32), + updates=updates, + validate_indices=validate_indices, + ), + ) + + def build_dynamic(data, indices, updates): + return (mb.scatter(data=data, indices=indices, updates=updates, validate_indices=validate_indices), ) + + data = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) + updates = np.array([[5, 6, 7], [8, 9, 10]], dtype=np.float32) + input_placeholders = { + "data": mb.placeholder(shape=data.shape), + "updates": mb.placeholder(shape=updates.shape), + } + input_values = {"data": data, "updates": updates} + if dynamic: + indices = np.array(indices_val, dtype=np.int32) + input_placeholders["indices"] = mb.placeholder(shape=indices.shape, dtype=types.int32) + input_values["indices"] = indices + + if not validate_indices: + # When not validate indices, negative or out-of-bound indices behavior is undefined. + expected_error = AssertionError + expected_error_msg = "Not equal" + elif dynamic: + # In PyMIL's validation, the `validate_indices` will only validate indices whose values are + # known during op insertion, so it will not error out at PyMIL layer, but instead, rely on + # the backend to do the validation after compilation. + expected_error = RuntimeError + expected_error_msg = ( + "Error computing NN outputs", + "Unable to compute the prediction using a neural network model", + ) + else: + # The negative or out-of-bound indices will error out when validate_indices is set. 
+ expected_error = IndexError + expected_error_msg = "Indices is out of bounds" + + with pytest.raises(expected_error) as excinfo: + run_compare_builder( + build_dynamic if dynamic else build_static, + input_placeholders, + input_values, + expected_output_types=(2, 3, types.fp32), + expected_outputs=np.array([[9, 11, 13], [9, 11, 13]], dtype=np.float32), + compute_unit=compute_unit, + backend=backend, + minimum_deployment_target=ct.target.iOS17, + ) + if not isinstance(expected_error_msg, tuple): + expected_error_msg = expected_error_msg + assert any([err in str(excinfo.value) for err in expected_error_msg]) + class TestScatterAlongAxis: @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend, minimum_deployment_target", itertools.product( + compute_units, backends, [None, ct.target.iOS17]) ) - def test_builder_to_backend_smoke(self, compute_unit, backend): + def test_builder_to_backend_smoke(self, compute_unit, backend, minimum_deployment_target): + if minimum_deployment_target == ct.target.iOS17: + if backend[0] != "mlprogram" or _macos_version() < (14, 0): + pytest.skip("IOS17 target available only on macOS 14+ with mlprogram.") + data = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) indices = np.array([[1, 0, 1], [1, 1, 0]], dtype=np.int32) updates = np.array([[5, 6, 7], [8, 9, 10]], dtype=np.float32) @@ -166,27 +242,50 @@ def build(data, indices, updates): expected_outputs, compute_unit=compute_unit, backend=backend, + minimum_deployment_target=minimum_deployment_target, ) - @ssa_fn - def test_builder_eval(self): - x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) - indices = np.array([[1, 0, 1], [1, 1, 0]], dtype=np.int32) - updates = np.array([[5, 6, 7], [8, 9, 10]], dtype=np.float32) - v = mb.scatter_along_axis( - data=x, indices=indices, updates=updates, axis=0, mode="update" + @pytest.mark.parametrize( + "opset_version", + [ct.target.iOS15, ct.target.iOS16, ct.target.iOS17], + ) + def test_builder_eval(self, opset_version): + @mb.program( + input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)], opset_version=opset_version + ) + def prog(x): + params = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) + indices = np.array([[1, 0, 1], [1, 1, 0]], dtype=np.int32) + updates = np.array([[5, 6, 7], [8, 9, 10]], dtype=np.float32) + res = mb.scatter_along_axis( + data=params, indices=indices, updates=updates, axis=0, mode="update" + ) + return res + + main_func = prog.functions["main"] + gather_ops = main_func.find_ops(op_type="scatter_along_axis")[0] + + np.testing.assert_allclose( + np.array([[1, 6, 10], [8, 9, 7]], dtype=np.float32), + gather_ops.outputs[0].val, + atol=1e-04, + rtol=1e-05, ) - np.testing.assert_allclose(np.array([[1, 6, 10], [8, 9, 7]], dtype=np.float32), v.val, atol=1e-04, rtol=1e-05) @pytest.mark.parametrize( - "compute_unit, backend, rank_axis", + "compute_unit, backend, rank_axis, minimum_deployment_target", itertools.product( compute_units, backends, [(rank, axis) for rank in range(1, 5) for axis in range(-rank, rank)], + [None, ct.target.iOS17] ), ) - def test_builder_to_backend_programmatic(self, compute_unit, backend, rank_axis): + def test_builder_to_backend_programmatic(self, compute_unit, backend, rank_axis, minimum_deployment_target): + if minimum_deployment_target == ct.target.iOS17: + if backend[0] != "mlprogram" or _macos_version() < (14, 0): + pytest.skip("IOS17 target available only on macOS 14+ with mlprogram.") + rank, axis = rank_axis data_shape = 
np.random.randint(low=2, high=8, size=rank) indices_shape = np.copy(data_shape) @@ -195,9 +294,13 @@ def test_builder_to_backend_programmatic(self, compute_unit, backend, rank_axis) data = np.random.rand(*data_shape).astype(np.float32) updates = np.random.rand(*updates_shape).astype(np.float32) - indices = np.random.randint( - -data_shape[axis], data_shape[axis], size=indices_shape - ).astype(np.int32) + if minimum_deployment_target == ct.target.iOS17: + # IOS17 scatter_along_axis requires indices to be non-negative. + indices = np.random.randint(0, data_shape[axis], size=indices_shape).astype(np.int32) + else: + indices = np.random.randint( + -data_shape[axis], data_shape[axis], size=indices_shape + ).astype(np.int32) def build(data, indices, updates): return mb.scatter_along_axis( @@ -225,14 +328,88 @@ def build(data, indices, updates): np_output, compute_unit=compute_unit, backend=backend, + minimum_deployment_target=minimum_deployment_target, ) + @pytest.mark.parametrize( + "compute_unit, backend, indices_val, dynamic", + itertools.product( + compute_units, + backends, + [[[-1, 0, 1], [1, 1, 0]], [[1, 10, 1], [1, 1, 0]]], + [True, False], + ), + ) + def test_ios17_invalid_indices(self, compute_unit, backend, indices_val, dynamic): + if backend[0] != "mlprogram" or _macos_version() < (14, 0): + pytest.skip("IOS17 target available only on macOS 14+") + + def build_static(data, updates): + return ( + mb.scatter_along_axis( + data=data, + indices=np.array(indices_val, dtype=np.int32), + updates=updates, + validate_indices=True, + ), + ) + + def build_dynamic(data, indices, updates): + return mb.scatter_along_axis( + data=data, indices=indices, updates=updates, axis=0, mode="update", + validate_indices=True, + ) + + data = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) + updates = np.array([[5, 6, 7], [8, 9, 10]], dtype=np.float32) + input_placeholders = { + "data": mb.placeholder(shape=data.shape), + "updates": mb.placeholder(shape=updates.shape), + } + input_values = {"data": data, "updates": updates} + if dynamic: + indices = np.array(indices_val, dtype=np.int32) + input_placeholders["indices"] = mb.placeholder(shape=indices.shape, dtype=types.int32) + input_values["indices"] = indices + + if dynamic: + expected_error = RuntimeError + expected_error_msg = ( + "Error computing NN outputs", + "Unable to compute the prediction using a neural network model", + ) + else: + # The negative or out-of-bound indices will error out when validate_indices is set. + expected_error = IndexError + expected_error_msg = "Indices is out of bounds" + + # The negative or out-of-bound indices will error out when validate_indices is set. 
+ with pytest.raises(expected_error) as excinfo: + run_compare_builder( + build_dynamic if dynamic else build_static, + input_placeholders, + input_values, + expected_output_types=(2, 3, types.fp32), + expected_outputs=np.array([[1, 6, 10], [8, 9, 7]], dtype=np.float32), + compute_unit=compute_unit, + backend=backend, + minimum_deployment_target=ct.target.iOS17, + ) + if not isinstance(expected_error_msg, tuple): + expected_error_msg = expected_error_msg + assert any([err in str(excinfo.value) for err in expected_error_msg]) + class TestScatterNd: @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend, minimum_deployment_target", itertools.product( + compute_units, backends, [None, ct.target.iOS17]) ) - def test_builder_to_backend_smoke(self, compute_unit, backend): + def test_builder_to_backend_smoke(self, compute_unit, backend, minimum_deployment_target): + if minimum_deployment_target == ct.target.iOS17: + if backend[0] != "mlprogram" or _macos_version() < (14, 0): + pytest.skip("IOS17 target available only on macOS 14+ with mlprogram.") + data = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) indices = np.array([[1, 0], [0, 2]], dtype=np.int32) updates = np.array([5, 10], dtype=np.float32) @@ -247,46 +424,35 @@ def test_builder_to_backend_smoke(self, compute_unit, backend): def build(data, indices, updates): return (mb.scatter_nd(data=data, indices=indices, updates=updates),) - expected_output_types = (2, 3, types.fp32) - - expected_outputs = np.array([[1, 2, 13], [9, 5, 6]], dtype=np.float32) - run_compare_builder( build, input_placeholders, input_values, - expected_output_types, - expected_outputs, + expected_output_types=(2, 3, types.fp32), + expected_outputs=np.array([[1, 2, 13], [9, 5, 6]], dtype=np.float32), compute_unit=compute_unit, backend=backend, + minimum_deployment_target=minimum_deployment_target, ) @pytest.mark.skipif(not _HAS_TF_2, reason=MSG_TF2_NOT_FOUND) @pytest.mark.parametrize( - "compute_unit, backend, rankData_rankIndices, accumulate_mode", + "compute_unit, backend, rankData_rankIndices, accumulate_mode, minimum_deployment_target", itertools.product( compute_units, backends, - [ - (1, 2), - (2, 2), - (3, 2), - (2, 3), - (1, 4), - (5, 2), - (2, 5), - (4, 3), - (3, 4), - (2, 4), - (4, 2), - (1, 5), - ], + [(2, 2), (1, 4), (5, 2), (4, 3), (3, 4), (1, 5)], ["update", "add", "sub"], + [None, ct.target.iOS17], ), ) def test_builder_to_backend_programmatic( - self, compute_unit, backend, rankData_rankIndices, accumulate_mode + self, compute_unit, backend, rankData_rankIndices, accumulate_mode, minimum_deployment_target ): + if minimum_deployment_target == ct.target.iOS17: + if backend[0] != "mlprogram" or _macos_version() < (14, 0): + pytest.skip("IOS17 target available only on macOS 14+ with mlprogram.") + data_rank, indices_rank = rankData_rankIndices data_shape = np.random.randint(low=2, high=5, size=data_rank) indices_shape = np.random.randint(low=2, high=5, size=indices_rank) @@ -334,14 +500,83 @@ def build(data, indices, updates): expected_output, compute_unit=compute_unit, backend=backend, + minimum_deployment_target=minimum_deployment_target, ) + @pytest.mark.parametrize( + "compute_unit, backend, indices_val, dynamic", + itertools.product( + compute_units, backends, [[[1, 0], [0, -1]], [[1, 0], [0, 3]]], [True, False] + ), + ) + def test_ios17_invalid_indices(self, compute_unit, backend, indices_val, dynamic): + if backend[0] != "mlprogram" or _macos_version() < (14, 0): + pytest.skip("IOS17 
target available only on macOS 14+") + + def build_static(data, updates): + return ( + mb.scatter_nd( + data=data, + indices=np.array(indices_val, dtype=np.int32), + updates=updates, + validate_indices=True, + ), + ) + + def build_dynamic(data, indices, updates): + return ( + mb.scatter_nd(data=data, indices=indices, updates=updates, validate_indices=True), + ) + + data = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) + updates = np.array([5, 10], dtype=np.float32) + input_placeholders = { + "data": mb.placeholder(shape=data.shape), + "updates": mb.placeholder(shape=updates.shape), + } + input_values = {"data": data, "updates": updates} + if dynamic: + indices = np.array(indices_val, dtype=np.int32) + input_placeholders["indices"] = mb.placeholder(shape=indices.shape, dtype=types.int32) + input_values["indices"] = indices + + if dynamic: + expected_error = RuntimeError + expected_error_msg = ( + "Error computing NN outputs", + "Unable to compute the prediction using a neural network model", + ) + else: + # The negative or out-of-bound indices will error out when validate_indices is set. + expected_error = IndexError + expected_error_msg = "Indices is out of bounds" + + with pytest.raises(expected_error) as excinfo: + run_compare_builder( + build_dynamic if dynamic else build_static, + input_placeholders, + input_values, + expected_output_types=(2, 3, types.fp32), + expected_outputs=np.array([[1, 2, 13], [9, 5, 6]], dtype=np.float32), + compute_unit=compute_unit, + backend=backend, + minimum_deployment_target=ct.target.iOS17, + ) + if not isinstance(expected_error_msg, tuple): + expected_error_msg = expected_error_msg + assert any([err in str(excinfo.value) for err in expected_error_msg]) + class TestGather: @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend, minimum_deployment_target", + itertools.product(compute_units, backends, [None, ct.target.iOS17]), ) - def test_builder_to_backend_smoke(self, compute_unit, backend): + def test_builder_to_backend_smoke(self, compute_unit, backend, minimum_deployment_target): + if minimum_deployment_target == ct.target.iOS17: + if backend[0] != "mlprogram" or _macos_version() < (14, 0): + pytest.skip("IOS17 target available only on macOS 14+ with mlprogram.") + x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) indices = np.array([1, 0], dtype=np.int32) input_placeholders = { @@ -389,17 +624,23 @@ def build(x, indices): expected_outputs, compute_unit=compute_unit, backend=backend, + minimum_deployment_target=minimum_deployment_target, ) - @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend, minimum_deployment_target", + itertools.product(compute_units, backends, [ct.target.iOS16, ct.target.iOS17]), ) - def test_builder_to_backend_smoke_iOS16(self, compute_unit, backend): + def test_builder_to_backend_smoke_batch_dims( + self, compute_unit, backend, minimum_deployment_target + ): if backend[0] == "neuralnetwork": pytest.skip("nn backend not supported") if ct.utils._macos_version() < (13, 0): pytest.skip("batch_dims not supported in macOS12 or older.") + if minimum_deployment_target == ct.target.iOS17: + if _macos_version() < (14, 0): + pytest.skip("IOS17 target available only on macOS 14+ with mlprogram.") x = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], dtype=np.float32) indices = np.array([[[1, 0], [0, 1]], [[1, 0], [0, 0]]], dtype=np.int32) @@ -445,7 +686,7 @@ def build(x, indices): [ 7, 
8, 9]], [[ 7, 8, 9], [ 7, 8, 9]]]]], dtype=np.float32 - ), + ), np.array([[[[ 4, 5, 6], [ 1, 2, 3]], [[ 1, 2, 3], @@ -496,12 +737,17 @@ def build(x, indices): expected_outputs, compute_unit=compute_unit, backend=backend, - minimum_deployment_target=ct.target.iOS16, + minimum_deployment_target=minimum_deployment_target, ) - - def test_builder_eval_iOS16(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(1, ), dtype=types.fp32)], opset_version=ct.target.iOS16) + @pytest.mark.parametrize( + "opset_version", + [ct.target.iOS16, ct.target.iOS17], + ) + def test_builder_eval_batch_dims(self, opset_version): + @mb.program( + input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)], opset_version=opset_version + ) def prog(x): params = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], dtype=np.float32) indices = np.array([[[1, 0], [0, 1]], [[1, 0], [0, 0]]], dtype=np.int32) @@ -512,16 +758,23 @@ def prog(x): gather_ops = main_func.find_ops(op_type="gather")[0] np.testing.assert_allclose( - np.array([[[ 2, 1], [ 4, 5]], [[ 8, 7], [10, 10]]], dtype=np.float32), - gather_ops.outputs[0].val, - atol=1e-04, + np.array([[[2, 1], [4, 5]], [[8, 7], [10, 10]]], dtype=np.float32), + gather_ops.outputs[0].val, + atol=1e-04, rtol=1e-05 ) @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend, minimum_deployment_target", + itertools.product(compute_units, backends, [None, ct.target.iOS17]), ) - def test_embedding_builder_to_backend_smoke(self, compute_unit, backend): + def test_embedding_builder_to_backend_smoke( + self, compute_unit, backend, minimum_deployment_target + ): + if minimum_deployment_target == ct.target.iOS17: + if backend[0] != "mlprogram" or _macos_version() < (14, 0): + pytest.skip("IOS17 target available only on macOS 14+ with mlprogram.") + x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) indices = np.array([1, 0], dtype=np.int32) input_placeholders = { @@ -554,21 +807,80 @@ def build(indices): expected_outputs, compute_unit=compute_unit, backend=backend, + minimum_deployment_target=minimum_deployment_target, ) - @ssa_fn - def test_builder_eval(self): - x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) - indices = np.array([1, 0], dtype=np.int32) - v = mb.gather(x=x, indices=indices, axis=-1) - np.testing.assert_allclose(np.array([[2, 1], [5, 4]], dtype=np.float32), v.val, atol=1e-04, rtol=1e-05) + @pytest.mark.parametrize( + "opset_version", + [ct.target.iOS15, ct.target.iOS16, ct.target.iOS17], + ) + def test_builder_eval(self, opset_version): + @mb.program( + input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)], opset_version=opset_version + ) + def prog(x): + params = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) + indices = np.array([1, 0], dtype=np.int32) + res = mb.gather(x=params, indices=indices, axis=-1) + return res + + main_func = prog.functions["main"] + gather_ops = main_func.find_ops(op_type="gather")[0] + + np.testing.assert_allclose( + np.array([[2, 1], [5, 4]], dtype=np.float32), + gather_ops.outputs[0].val, + atol=1e-04, + rtol=1e-05, + ) + + @pytest.mark.parametrize( + "indices_val, validate_indices, opset_version", + itertools.product([[-1, 0], [0, 3]], [True, False], [None, ct.target.iOS17]), + ) + def test_builder_invalid_indices(self, indices_val, validate_indices, opset_version): + def prog(x): + params = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) + indices = np.array(indices_val, dtype=np.int32) + if opset_version == ct.target.iOS17: + res = mb.gather( + 
x=params, indices=indices, axis=-1, validate_indices=validate_indices + ) + else: + res = mb.gather(x=params, indices=indices, axis=-1) + return res + + if opset_version == ct.target.iOS17 and validate_indices: + with pytest.raises(IndexError, match="Indices is out of bounds for `gather` node"): + mb.program( + input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)], + opset_version=opset_version, + )(prog) + elif any([idx > 2 for idx in indices_val]): + # If the indices are not validated during type inference for IOS17, the `gather` op's + # value inference will raise error for out-of-bound index. + with pytest.raises(IndexError, match="index 3 is out of bounds for axis 1 with size 3"): + mb.program( + input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)], + opset_version=opset_version, + )(prog) + else: + mb.program( + input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)], + opset_version=opset_version, + )(prog) class TestGatherAlongAxis: @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend, minimum_deployment_target", + itertools.product(compute_units, backends, [None, ct.target.iOS17]), ) - def test_builder_to_backend_smoke(self, compute_unit, backend): + def test_builder_to_backend_smoke(self, compute_unit, backend, minimum_deployment_target): + if minimum_deployment_target == ct.target.iOS17: + if backend[0] != "mlprogram" or _macos_version() < (14, 0): + pytest.skip("IOS17 target available only on macOS 14+ with mlprogram.") + x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) indices = np.array([[1, 0, 1], [1, 1, 0]], dtype=np.int32) input_placeholders = { @@ -611,24 +923,48 @@ def build(x, indices): expected_outputs, compute_unit=compute_unit, backend=backend, + minimum_deployment_target=minimum_deployment_target, ) - @ssa_fn - def test_builder_eval(self): - x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) - indices = np.array([[1, 0, 1], [0, 0, 1]], dtype=np.int32) - v = mb.gather_along_axis(x=x, indices=indices, axis=0) - np.testing.assert_allclose(np.array([[4, 2, 6], [1, 2, 6]], dtype=np.float32), v.val, atol=1e-04, rtol=1e-05) + @pytest.mark.parametrize( + "opset_version", + [ct.target.iOS15, ct.target.iOS16, ct.target.iOS17], + ) + def test_builder_eval(self, opset_version): + @mb.program( + input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)], opset_version=opset_version + ) + def prog(x): + params = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) + indices = np.array([[1, 0, 1], [0, 0, 1]], dtype=np.int32) + res = mb.gather_along_axis(x=params, indices=indices, axis=0) + return res + + main_func = prog.functions["main"] + gather_ops = main_func.find_ops(op_type="gather_along_axis")[0] + + np.testing.assert_allclose( + np.array([[4, 2, 6], [1, 2, 6]], dtype=np.float32), + gather_ops.outputs[0].val, + atol=1e-04, + rtol=1e-05, + ) @pytest.mark.parametrize( - "compute_unit, backend, rank_axis", + "compute_unit, backend, rank_axis, minimum_deployment_target", itertools.product( compute_units, backends, [(rank, axis) for rank in range(1, 5) for axis in range(-rank, rank)], + [None, ct.target.iOS17], ), ) - def test_builder_to_backend_programmatic(self, compute_unit, backend, rank_axis): + def test_builder_to_backend_programmatic( + self, compute_unit, backend, rank_axis, minimum_deployment_target + ): + if minimum_deployment_target == ct.target.iOS17: + if backend[0] != "mlprogram" or _macos_version() < (14, 0): + pytest.skip("IOS17 target available only on macOS 14+ with 
mlprogram.") if backend[0] == "mlprogram" and compute_unit != ct.ComputeUnit.CPU_ONLY: pytest.xfail("rdar://97398875 (TestGatherAlongAxis failing on mlprgram + GPU)") rank, axis = rank_axis @@ -637,9 +973,9 @@ def test_builder_to_backend_programmatic(self, compute_unit, backend, rank_axis) indices_shape[axis] = np.random.randint(low=1, high=8) x = np.random.rand(*x_shape).astype(np.float32) - indices = np.random.randint( - -x_shape[axis], x_shape[axis], size=indices_shape - ).astype(np.int32) + # IOS17 gather_along_axis requires non-negative indices. + lower_bound = 0 if minimum_deployment_target == ct.target.iOS17 else -x_shape[axis] + indices = np.random.randint(lower_bound, x_shape[axis], size=indices_shape).astype(np.int32) def build(x, indices): return mb.gather_along_axis(x=x, indices=indices, axis=axis) @@ -662,14 +998,62 @@ def build(x, indices): expected_output, compute_unit=compute_unit, backend=backend, + minimum_deployment_target=minimum_deployment_target, ) + @pytest.mark.parametrize( + "indices_val, validate_indices, opset_version", + itertools.product( + [[[1, 0, -1], [0, 0, 1]], [[1, 0, 1], [0, 0, 2]]], + [True, False], + [None, ct.target.iOS17], + ), + ) + def test_builder_invalid_indices(self, indices_val, validate_indices, opset_version): + def prog(x): + params = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) + indices = np.array(indices_val, dtype=np.int32) + if opset_version == ct.target.iOS17: + res = mb.gather_along_axis( + x=params, indices=indices, axis=0, validate_indices=validate_indices + ) + else: + res = mb.gather_along_axis(x=params, indices=indices, axis=0) + return res + + if opset_version == ct.target.iOS17 and validate_indices: + with pytest.raises( + IndexError, match="Indices is out of bounds for `gather_along_axis` node" + ): + mb.program( + input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)], + opset_version=opset_version, + )(prog) + elif any([idx > 1 for sub_indices in indices_val for idx in sub_indices]): + # If the indices are not validated during type inference for IOS17, the `gather` op's + # value inference will raise error for out-of-bound index. 
+ with pytest.raises(IndexError, match="index 2 is out of bounds for axis 0 with size 2"): + mb.program( + input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)], + opset_version=opset_version, + )(prog) + else: + mb.program( + input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)], + opset_version=opset_version, + )(prog) + class TestGatherNd: @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend, minimum_deployment_target", + itertools.product(compute_units, backends, [None, ct.target.iOS17]), ) - def test_builder_to_backend_smoke(self, compute_unit, backend): + def test_builder_to_backend_smoke(self, compute_unit, backend, minimum_deployment_target): + if minimum_deployment_target == ct.target.iOS17: + if backend[0] != "mlprogram" or _macos_version() < (14, 0): + pytest.skip("IOS17 target available only on macOS 14+ with mlprogram.") + x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) indices = np.array([[1, 0], [0, 2]], dtype=np.int32) input_placeholders = { @@ -694,17 +1078,23 @@ def build(x, indices): compute_unit=compute_unit, frontend_only=False, backend=backend, + minimum_deployment_target=minimum_deployment_target, ) @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend, minimum_deployment_target", + itertools.product(compute_units, backends, [ct.target.iOS16, ct.target.iOS17]), ) - def test_builder_to_backend_smoke_iOS16(self, compute_unit, backend): + def test_builder_to_backend_smoke_batch_dims( + self, compute_unit, backend, minimum_deployment_target + ): if backend[0] == "neuralnetwork": pytest.skip("nn backend not supported") - if ct.utils._macos_version() < (13, 0): pytest.skip("batch_dims not supported in macOS12 or older.") + if minimum_deployment_target == ct.target.iOS17: + if backend[0] != "mlprogram" or _macos_version() < (14, 0): + pytest.skip("IOS17 target available only on macOS 14+ with mlprogram.") x = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], dtype=np.float32) indices = np.array([[[1, 0], [0, 1]], [[1, 0], [0, 0]]], dtype=np.int32) @@ -746,5 +1136,35 @@ def build(x, indices): expected_outputs, compute_unit=compute_unit, backend=backend, - minimum_deployment_target=ct.target.iOS16, + minimum_deployment_target=minimum_deployment_target, ) + + @pytest.mark.parametrize( + "indices_val, validate_indices, opset_version", + itertools.product( + [[[-1], [2]], [[1], [3]]], [True, False], [ct.target.iOS16, ct.target.iOS17] + ), + ) + def test_builder_invalid_indices(self, indices_val, validate_indices, opset_version): + def prog(x): + params = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) + indices = np.array(indices_val, dtype=np.int32) + if opset_version == ct.target.iOS17: + res = mb.gather_nd( + x=params, indices=indices, batch_dims=1, validate_indices=validate_indices + ) + else: + res = mb.gather_nd(x=params, indices=indices, batch_dims=1) + return res + + if opset_version == ct.target.iOS17 and validate_indices: + with pytest.raises(IndexError, match="Indices is out of bounds for `gather_nd` node"): + mb.program( + input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)], + opset_version=opset_version, + )(prog) + else: + mb.program( + input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)], + opset_version=opset_version, + )(prog) diff --git a/coremltools/converters/mil/mil/ops/tests/test_slice.py b/coremltools/converters/mil/mil/ops/tests/test_slice.py index a9fa669e1..b5ab0a02b 100644 --- 
a/coremltools/converters/mil/mil/ops/tests/test_slice.py +++ b/coremltools/converters/mil/mil/ops/tests/test_slice.py @@ -149,10 +149,10 @@ def test_builder_eval_scalar_output_corner_cases(self): v = [ mb.slice_by_index( x=x1, begin=[0,], end=[0], squeeze_mask=[True], - ), + ), mb.slice_by_index( x=x2, begin=[0, 0, 0, 0], end=[0, 0, 0, 0], squeeze_mask=[True, True, True, True], - ), + ), ] assert v[0].val.shape == () assert v[0].val == 2 @@ -353,8 +353,17 @@ def prog(x): y_neuralnetwork = list(model.predict({'x': x}).values())[0] np.testing.assert_allclose(y_numpy, y_neuralnetwork) - model = ct.convert(prog, source="milinternal", convert_to="mlprogram") - y_mlprogram = list(model.predict({'x': x}).values())[0] + model = ct.convert( + prog, + source="milinternal", + convert_to="mlprogram", + compute_units=ct.ComputeUnit.CPU_ONLY, + ) + + # rdar://109080828 ([Bug] slice_by_index is throwing expection through E5ML stack) need to be fixed + # The above radar fixed the CPU case, + # the non-CPU is still failing, which is tracked in rdar://109854221 ([Bug][Regression] slice_by_index is throwing expection through E5ML - Follow up radar) + # y_mlprogram = list(model.predict({'x': x}).values())[0] # rdar://102217935 needs to be fixed before mlprogram will pass # np.testing.assert_allclose(y_numpy, y_mlprogram) diff --git a/coremltools/converters/mil/mil/ops/tests/test_tensor_operation.py b/coremltools/converters/mil/mil/ops/tests/test_tensor_operation.py index 7b0f31846..9fd8ffda6 100644 --- a/coremltools/converters/mil/mil/ops/tests/test_tensor_operation.py +++ b/coremltools/converters/mil/mil/ops/tests/test_tensor_operation.py @@ -5,6 +5,7 @@ import itertools import platform +from unittest.mock import patch import numpy as np import pytest @@ -14,11 +15,16 @@ from coremltools.converters.mil import testing_reqs from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import Function, get_new_symbol, types -from coremltools.converters.mil.testing_utils import (get_op_types_in_program, - random_gen, ssa_fn) -from coremltools.models.utils import _macos_version +from coremltools.converters.mil.mil.passes.pass_pipeline import PassPipeline +from coremltools.converters.mil.mil.var import Var +from coremltools.converters.mil.testing_utils import get_op_types_in_program, random_gen, ssa_fn -from .testing_utils import UNK_SYM, UNK_VARIADIC, run_compare_builder +from .testing_utils import ( + UNK_SYM, + UNK_VARIADIC, + construct_inputs_from_placeholders, + run_compare_builder, +) if _HAS_TF_2: import tensorflow as tf @@ -386,6 +392,9 @@ def build(shape): input_values, expected_output_types, expected_outputs, + inputs=construct_inputs_from_placeholders(input_placeholders, 3) + if backend[0] == "mlprogram" + else None, compute_unit=compute_unit, backend=backend, ) @@ -712,7 +721,7 @@ def test_builder_eval(self): x_val = np.random.randint(low=-1, high=2, size=(6, 1, 7)) res = mb.non_zero(x=x_val) np.testing.assert_allclose(np.transpose(np.nonzero(x_val)), res.val, atol=1e-04, rtol=1e-05) - + @ssa_fn def test_shape_inference_for_deterministic_input(self): # If the input is compile time known, the builder should be able to infer the shape from value @@ -1261,16 +1270,6 @@ def build(x): ) ) def test_builder_to_backend_smoke_iOS16(self, compute_unit, backend, return_indices, sort): - if backend[0] == "neuralnetwork": - pytest.skip("nn backend not supported") - if _macos_version() < (13, 0): - pytest.skip("New functionality in macOS13/iOS16") - - if not return_indices: - 
pytest.xfail( - "rdar://92880117 (Topk with return_indices = False error out at the MIL->EIR stage)" - ) - val = np.array([[-1.0, 2.0, -3.0], [4.0, -5.0, 6.0]], dtype=np.float32) input_placeholders = {"x": mb.placeholder(shape=val.shape)} input_values = {"x": val} @@ -1302,6 +1301,61 @@ def build(x): minimum_deployment_target=ct.target.iOS16, ) + @pytest.mark.parametrize( + "compute_unit, backend, x_dtype, k_dtype", + itertools.product( + compute_units, + [("mlprogram", "fp16")], + [np.float32, np.float16, np.int32, np.int16, np.uint16], + [np.int32, np.int16], + ), + ) + def test_ios17_different_dtypes(self, compute_unit, backend, x_dtype, k_dtype): + def build(x): + return mb.topk(x=x, k=k_dtype(2), axis=1) + + if k_dtype == np.int16: + pytest.xfail("k with dtype int16 will trigger backend error.") + + val = np.array([[2, 3, 1], [5, 4, 6]], dtype=x_dtype) + x_mb_dtype = types.type_mapping.numpy_type_to_builtin_type(x_dtype) + input_placeholders = {"x": mb.placeholder(shape=val.shape, dtype=x_mb_dtype)} + input_values = {"x": val} + # As int16 is not in CoreML I/O supported dtypes, it will be cast to int32. + expected_output_types = [(2, 2, x_mb_dtype), (2, 2, types.int32)] + expected_outputs = [ + np.array([[3, 2], [6, 5]], dtype=x_dtype), + np.array([[1, 0], [2, 0]], dtype=np.int32), + ] + + with patch.object(Var, "_is_nonreplaceable_var") as mocked_is_nonreplaceable_var: + # Mock that the cast is non-replaceable, to make sure it's kept in the graph. + mocked_is_nonreplaceable_var.side_effect = ( + lambda var: var.op and var.op.op_type == "cast" + ) + # Remove the cast optimization pass to make sure all cast are kept in the graph. + pass_pipeline: PassPipeline = PassPipeline.DEFAULT + pass_pipeline.remove_passes( + ["common::cast_optimization", "common::topological_reorder"] + ) + mlmodel = run_compare_builder( + build, + input_placeholders, + input_values, + expected_output_types, + expected_outputs, + compute_unit=compute_unit, + backend=backend, + minimum_deployment_target=ct.target.iOS17, + pass_pipeline=pass_pipeline, + ) + prog = mlmodel._mil_program + topk_op = prog["main"].find_ops(op_type="topk")[0] + expected_x_dtype = x_mb_dtype + if backend[1] == "fp16" and types.is_float(x_mb_dtype): + expected_x_dtype = types.fp16 + assert types.builtin_to_string(topk_op.x.dtype) == types.builtin_to_string(expected_x_dtype) + @ssa_fn def test_builder_eval(self): def np_topk(x, k, axis, ascending=False): @@ -1480,6 +1534,9 @@ def build(x): input_values, expected_output_types, expected_outputs, + inputs=construct_inputs_from_placeholders(input_placeholders, 10) + if backend[0] == "mlprogram" + else None, compute_unit=compute_unit, backend=backend, ) @@ -1562,6 +1619,9 @@ def build(x): input_values, expected_output_types, expected_outputs, + inputs=construct_inputs_from_placeholders(input_placeholders, 10) + if backend[0] == "mlprogram" + else None, compute_unit=compute_unit, backend=backend, ) @@ -1683,3 +1743,52 @@ def test_builder_eval(self): res = mb.argsort(x=x_val, axis=-3) # The default np argsort mode is ascending, which is opposite to MIL's argsort op. 
np.testing.assert_allclose(np.argsort(-x_val, axis=-3), res.val, atol=1e-04, rtol=1e-05) + + +class TestConcat: + @pytest.mark.parametrize( + "compute_unit, backend, axis", + itertools.product( + compute_units, + backends, + [0, 1], + ), + ) + def test_builder_to_backend_numerical(self, compute_unit, backend, axis): + def build(x1, x2): + return mb.concat(values=[x1, x2], axis=axis) + + val1 = np.array([[-1.0, 2.0, -3.0], [4.0, -5.0, 6.0]], dtype=np.float32) + val2 = -val1 + input_placeholders = { + "x1": mb.placeholder(shape=val1.shape), + "x2": mb.placeholder(shape=val2.shape), + } + input_values = {"x1": val1, "x2": val2} + expected_res = np.concatenate([val1, val2], axis=axis) + + run_compare_builder( + build, + input_placeholders, + input_values, + expected_output_types=[expected_res.shape + (types.fp32,)], + expected_outputs=expected_res, + compute_unit=compute_unit, + backend=backend, + ) + + def test_builder_eval_different_dtypes_error_out(self): + """If the input to the concat op has different dtypes, it will error out.""" + with pytest.raises( + ValueError, + match="Tensors in 'values' of the concat op \(concat_0\) should share the same data type", + ): + + @mb.program( + input_specs=[ + mb.TensorSpec(shape=(2, 3), dtype=types.fp32), + mb.TensorSpec(shape=(2, 3), dtype=types.int32), + ] + ) + def prog(x1, x2): + return mb.concat(values=[x1, x2], axis=0) diff --git a/coremltools/converters/mil/mil/ops/tests/test_tensor_transformation.py b/coremltools/converters/mil/mil/ops/tests/test_tensor_transformation.py index 111065291..ac5f715e1 100644 --- a/coremltools/converters/mil/mil/ops/tests/test_tensor_transformation.py +++ b/coremltools/converters/mil/mil/ops/tests/test_tensor_transformation.py @@ -17,7 +17,12 @@ from coremltools.converters.mil.testing_reqs import backends, compute_units from coremltools.converters.mil.testing_utils import ssa_fn -from .testing_utils import UNK_SYM, UNK_VARIADIC, run_compare_builder +from .testing_utils import ( + UNK_SYM, + UNK_VARIADIC, + construct_inputs_from_placeholders, + run_compare_builder, +) if _HAS_TORCH: import torch @@ -90,7 +95,7 @@ def build(x): class TestBatchToSpace: @pytest.mark.parametrize( "compute_unit, backend", itertools.product(compute_units, backends,) - ) + ) def test_builder_to_backend_smoke(self, compute_unit, backend): # original input type is (8, 1, 1, 3, fp32) val = np.array([[[[ 0, 1, 3]]], @@ -204,6 +209,9 @@ def build(x): input_values, expected_output_types, expected_outputs, + inputs=construct_inputs_from_placeholders(input_placeholders, 10) + if backend[0] == "mlprogram" + else None, compute_unit=compute_unit, backend=backend, ) @@ -311,6 +319,7 @@ def build(x): backend=backend, ) + class TestReshapeLike: @pytest.mark.parametrize( "compute_unit, backend, InputShape_RefShapes_Begins_Ends_EndMasks, InputType_RefType", @@ -334,20 +343,20 @@ def test_builder_to_backend_smoke( ): if backend[0] == "neuralnetwork": pytest.skip("reshape_like not supoprted in neuralnetwork backend.") - + if ct.utils._macos_version() < (13, 0): pytest.skip("reshape_like not supported in macOS12 or older.") - + input_shape, ref_shapes, begins, ends, end_masks = InputShape_RefShapes_Begins_Ends_EndMasks ref_shape_1, ref_shape_2 = ref_shapes input_type, ref_type = InputType_RefType - + t = np.random.rand(*input_shape).astype(np.float32) ref_tensor_1 = np.random.rand(*ref_shape_1).astype(np.float32) ref_tensor_2 = np.random.rand(*ref_shape_2).astype(np.float32) input_placeholders = { - "x": mb.placeholder(shape=t.shape), + "x": 
mb.placeholder(shape=t.shape), "ref_tensor_1": mb.placeholder(shape=ref_shape_1), "ref_tensor_2": mb.placeholder(shape=ref_shape_2), } @@ -360,14 +369,14 @@ def test_builder_to_backend_smoke( def build(x, ref_tensor_1, ref_tensor_2): if input_type == types.bool: x = mb.cast(x=x, dtype="bool") - + if ref_type == types.bool: ref_tensor_1 = mb.cast(x=ref_tensor_1, dtype="bool") ref_tensor_2 = mb.cast(x=ref_tensor_2, dtype="bool") - + ref_tensors = (ref_tensor_1, ref_tensor_2) return mb.reshape_like(x=x, ref_tensors=ref_tensors, begins=begins, ends=ends, end_masks=end_masks) - + output_shape = () for ref_shape, begin, end, end_mask in zip((ref_shape_1, ref_shape_2), begins, ends, end_masks): if end_mask: @@ -483,16 +492,198 @@ def build(x, shape, shape2): "shape": np.array([2, 1, 3], dtype=np.float32), "shape2": np.array([2, 1, 3], dtype=np.float32), } + run_compare_builder( build, input_placeholders, input_values, expected_output_types, expected_outputs, + inputs=construct_inputs_from_placeholders(input_placeholders, 10) + if backend[0] == "mlprogram" + else None, compute_unit=compute_unit, backend=backend, ) + @ssa_fn + def test_too_many_neg_ones(self): + x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) + with pytest.raises(ValueError, match="Reshape op supports only one dimension to be -1"): + mb.reshape(x=x, shape=[-1, -1]) + + @ssa_fn + def test_invalid_target_shape(self): + x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) + with pytest.raises(ValueError, match="Invalid target shape in `reshape` op"): + mb.reshape(x=x, shape=[4, -1]) + + @ssa_fn + def test_invalid_target_shape_with_zero(self): + x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) + with pytest.raises(ValueError, match="Invalid target shape in `reshape` op"): + mb.reshape(x=x, shape=[0, 7]) + + @pytest.mark.parametrize( + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), + ) + def test_reshape_with_zero(self, compute_unit, backend): + if backend[0] == "neuralnetwork": + pytest.skip("Reshape with 0 is not supported in neuralnetwork.") + + t = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) + input_placeholders = {"x": mb.placeholder(shape=t.shape)} + input_values = {"x": t} + + def build(x): + return [ + mb.reshape(x=x, shape=[0, -1]), + mb.reshape(x=x, shape=[0, 3]), + mb.reshape(x=x, shape=[-1, 0]), + ] + + expected_output_types = [ + (2, 3, types.fp32), + (2, 3, types.fp32), + (2, 3, types.fp32), + ] + expected_outputs = [ + np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32), + np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32), + np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32), + ] + + run_compare_builder( + build, + input_placeholders, + input_values, + expected_output_types, + expected_outputs, + compute_unit=compute_unit, + backend=backend, + ) + + @pytest.mark.parametrize( + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), + ) + def test_reshape_with_zero_different_len(self, compute_unit, backend): + if backend[0] == "neuralnetwork": + pytest.skip("Reshape with 0 is not supported in neuralnetwork.") + + t = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) + input_placeholders = {"x": mb.placeholder(shape=t.shape)} + input_values = {"x": t} + + def build(x): + return [ + mb.reshape(x=x, shape=[1, 0, -1, 0]), + ] + + expected_output_types = [ + (1, 1, 2, 3, types.fp32), + ] + expected_outputs = [ + np.array([[[[1, 2, 3], [4, 5, 6]]]], dtype=np.float32), + ] + + with pytest.raises( + ValueError, + match="When there is 0 in shape, the 
rank of x .* must " + "equal to the target shape len", + ): + run_compare_builder( + build, + input_placeholders, + input_values, + expected_output_types, + expected_outputs, + compute_unit=compute_unit, + backend=backend, + ) + + @pytest.mark.parametrize( + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), + ) + def test_reshape_with_zero_different_len(self, compute_unit, backend): + if backend[0] == "neuralnetwork": + pytest.skip("Reshape with 0 is not supported in neuralnetwork.") + + t = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) + input_placeholders = {"x": mb.placeholder(shape=t.shape)} + input_values = {"x": t} + + def build(x): + return [mb.reshape(x=x, shape=[1, 0, -1, 0])] + + # In IOS15/16 it will error out because rank of x needs to have same length as shape. + with pytest.raises( + ValueError, + match="When there is 0 in shape, the rank of x .* must " + "equal to the target shape len", + ): + run_compare_builder( + build, + input_placeholders, + input_values, + compute_unit=compute_unit, + backend=backend, + ) + + # In IOS17 it accepts different length. + expected_output_types = [(1, 1, 2, 3, types.fp32)] + expected_outputs = [np.array([[[[1, 2, 3], [4, 5, 6]]]], dtype=np.float32)] + run_compare_builder( + build, + input_placeholders, + input_values, + expected_output_types, + expected_outputs, + compute_unit=compute_unit, + backend=backend, + minimum_deployment_target=ct.target.iOS17, + ) + + @pytest.mark.parametrize( + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), + ) + def test_reshape_invalid_with_zero(self, compute_unit, backend): + if backend[0] == "neuralnetwork": + pytest.skip("Reshape with 0 is not supported in neuralnetwork.") + + t = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) + input_placeholders = {"x": mb.placeholder(shape=t.shape)} + input_values = {"x": t} + + def build(x): + return [mb.reshape(x=x, shape=[4, 0, -1, 0])] + + with pytest.raises(ValueError, match="Invalid target shape in `reshape` op"): + run_compare_builder( + build, + input_placeholders, + input_values, + compute_unit=compute_unit, + backend=backend, + minimum_deployment_target=ct.target.iOS17, + ) + + class TestReverse: @pytest.mark.parametrize( @@ -659,6 +850,9 @@ def build(x): input_values, expected_output_types, expected_outputs, + inputs=construct_inputs_from_placeholders(input_placeholders, 10) + if backend[0] == "mlprogram" + else None, compute_unit=compute_unit, backend=backend, ) @@ -848,6 +1042,9 @@ def build(x): input_values, expected_output_types, expected_outputs, + inputs=construct_inputs_from_placeholders(input_placeholders, 10) + if backend[0] == "mlprogram" + else None, compute_unit=compute_unit, backend=backend, ) @@ -883,12 +1080,16 @@ def build(x): input_values = { "x": np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32), } + run_compare_builder( build, input_placeholders, input_values, expected_output_types, expected_outputs, + inputs=construct_inputs_from_placeholders(input_placeholders, 10) + if backend[0] == "mlprogram" + else None, compute_unit=compute_unit, backend=backend, ) @@ -999,7 +1200,7 @@ def test_builder_to_backend_stress( ): if backend[0] == "neuralnetwork": pytest.skip("nn backend not supported") - + val = np.random.rand(*shape) input_placeholders = {"x": mb.placeholder(shape=val.shape)} input_values = {"x": val} diff --git a/coremltools/converters/mil/mil/ops/tests/testing_utils.py b/coremltools/converters/mil/mil/ops/tests/testing_utils.py index c6528b407..d898de057 100644 --- 
a/coremltools/converters/mil/mil/ops/tests/testing_utils.py +++ b/coremltools/converters/mil/mil/ops/tests/testing_utils.py @@ -3,13 +3,19 @@ # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause +from typing import Dict, List, Optional + import coremltools as ct from coremltools import _logger as logger -from coremltools.converters.mil.mil import Function, Program +from coremltools.converters.mil.input_types import TensorType +from coremltools.converters.mil.mil import Function, Placeholder, Program +from coremltools.converters.mil.mil.passes.pass_pipeline import PassPipeline from coremltools.converters.mil.mil.types.symbolic import is_symbolic -from coremltools.converters.mil.testing_utils import (compare_backend, - ct_convert) - +from coremltools.converters.mil.testing_utils import ( + compare_backend, + ct_convert, + validate_minimum_deployment_target, +) UNK_VARIADIC = "*s_unk" UNK_SYM = "s_unk" @@ -30,6 +36,7 @@ def run_compare_builder( also_compare_shapes=False, converter=ct.convert, minimum_deployment_target=None, + pass_pipeline: Optional[PassPipeline] = None, ): """ Inputs: @@ -66,6 +73,9 @@ def run_compare_builder( Returns: The converted mlmodel """ + if minimum_deployment_target is not None: + validate_minimum_deployment_target(minimum_deployment_target, backend) + if not isinstance(expected_output_types, list): expected_output_types = [expected_output_types] @@ -94,13 +104,8 @@ def run_compare_builder( assert len(output_vars) == len(expected_output_types), assert_msg for out_var, s in zip(output_vars, expected_output_types): - if out_var.dtype != s[-1]: - raise ValueError( - "Output {} type: expect {}, got {}. Program:\n{}".format( - out_var.name, s[-1].__type_info__(), - out_var.dtype.__type_info__(), prog - ) - ) + # The output type will be casted by the `adjust_io_to_supported_types` pass, so we don't + # check the output var dtype matching here. if UNK_VARIADIC in s[:-1]: msg = "Skip type checking for UNK_VARIADIC. 
Output shape: {} vs expected shape: {}" logger.debug(msg.format(out_var.shape, s[:-1])) @@ -123,13 +128,15 @@ def run_compare_builder( if output_shape != expected_shape: raise ValueError(msg) - mlmodel = ct_convert(prog, - converter=converter, - source="milinternal", - convert_to=backend, - inputs=inputs, - compute_units=compute_unit, - minimum_deployment_target=minimum_deployment_target + mlmodel = ct_convert( + prog, + converter=converter, + source="milinternal", + convert_to=backend, + inputs=inputs, + compute_units=compute_unit, + minimum_deployment_target=minimum_deployment_target, + pass_pipeline=pass_pipeline, ) if frontend_only: @@ -153,7 +160,22 @@ def run_compare_builder( atol=atol, rtol=rtol, also_compare_shapes=also_compare_shapes, - dtype=backend[1] + dtype=backend[1], ) return mlmodel + + +def construct_inputs_from_placeholders( + input_placeholders: Dict[str, Placeholder], upper_bound: int +) -> [List[TensorType]]: + """Construct the `inputs` param from placeholders with upper_bound.""" + inputs: [List[TensorType]] = [] + for input_name, placeholder in input_placeholders.items(): + input_shape = [ + ct.RangeDim(upper_bound=upper_bound) if is_symbolic(shape) else shape + for shape in placeholder.sym_shape + ] + input_tensor_type = TensorType(name=input_name, shape=input_shape) + inputs.append(input_tensor_type) + return inputs diff --git a/coremltools/converters/mil/mil/passes/__init__.py b/coremltools/converters/mil/mil/passes/__init__.py index ec624d2dc..b2f9a288c 100644 --- a/coremltools/converters/mil/mil/passes/__init__.py +++ b/coremltools/converters/mil/mil/passes/__init__.py @@ -37,6 +37,7 @@ optimize_elementwise_binary, optimize_linear, optimize_normalization, + optimize_quantization, optimize_repeat_ops, optimize_tensor_operation, preprocess, diff --git a/coremltools/converters/mil/mil/passes/defs/cleanup/__init__.py b/coremltools/converters/mil/mil/passes/defs/cleanup/__init__.py index 5c534eb77..32aeb2f5f 100644 --- a/coremltools/converters/mil/mil/passes/defs/cleanup/__init__.py +++ b/coremltools/converters/mil/mil/passes/defs/cleanup/__init__.py @@ -3,6 +3,7 @@ # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause +from .const_deduplication import const_deduplication from .const_elimination import const_elimination from .dead_code_elimination import dead_code_elimination from .dedup_op_and_var_names import dedup_op_and_var_names diff --git a/coremltools/converters/mil/mil/passes/defs/cleanup/const_deduplication.py b/coremltools/converters/mil/mil/passes/defs/cleanup/const_deduplication.py new file mode 100644 index 000000000..c34abeee8 --- /dev/null +++ b/coremltools/converters/mil/mil/passes/defs/cleanup/const_deduplication.py @@ -0,0 +1,141 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. 
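The new `construct_inputs_from_placeholders` helper above exists so that mlprogram tests can give symbolic input dimensions an explicit upper bound via `ct.RangeDim`. The simplified, self-contained sketch below shows the same idea without MIL internals; the `inputs_with_upper_bound` name and the use of strings to stand in for symbolic dimensions are illustrative assumptions, not part of the change.

```python
from typing import Dict, List, Union

import coremltools as ct


def inputs_with_upper_bound(
    shapes: Dict[str, List[Union[int, str]]], upper_bound: int
) -> List[ct.TensorType]:
    """Turn symbolic dims (represented here as strings) into bounded RangeDims."""
    inputs = []
    for name, shape in shapes.items():
        ct_shape = [
            ct.RangeDim(upper_bound=upper_bound) if isinstance(dim, str) else dim
            for dim in shape
        ]
        inputs.append(ct.TensorType(name=name, shape=ct_shape))
    return inputs


# A placeholder with a symbolic leading dimension becomes a bounded RangeDim.
inputs = inputs_with_upper_bound({"x": ["s0", 3, 2]}, upper_bound=10)
print(len(inputs), inputs[0].name)
```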
+# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import hashlib +from typing import Dict, List, Tuple + +import numpy as np + +from coremltools.converters.mil.mil import Block, Var, types +from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass +from coremltools.converters.mil.mil.passes.helper import block_context_manager +from coremltools.converters.mil.mil.passes.pass_registry import register_pass + + +@register_pass(namespace="common") +class const_deduplication(AbstractGraphPass): + """ + Remove duplicated large constants (tensor with 100+ elements) + + For example + + .. code-block:: + + Input graph (where weight and bias are large constants): + weight_q = const(weight) + weight_k = const(weight) + bias_q = const(bias) + bias_k = const(bias) + q_embedding = linear(x=q, weight=weight_q, bias=bias_q) + k_embedding = linear(x=k, weight=weight_k, bias=bias_k) + + Output graph: + weight_q = const(weight) + bias_q = const(bias) + q_embedding = linear(x=q, weight=weight_q, bias=bias_q) + k_embedding = linear(x=k, weight=weight_q, bias=bias_q) + + Concretely, we consider a constant as duplicated if there exists such a previous constant that: + + 1. has same dtype and value + + 2. comes from same type of op + + The reason why op type is considered is, there are 2 types of constants in Core ML: + + 1. The usual constant, i.e., the output of ``const`` op + + 2. The result of const expression, i.e., the output of ``constexpr_*`` ops + """ + + NUMEL_THRESH = 100 + CONSTEXPR_OPS = { + "constexpr_affine_dequantize", + "constexpr_cast", + "constexpr_lut_to_dense", + "constexpr_sparse_to_dense", + } + DTYPE2ATOL = { + types.fp16: 6e-8, + types.fp32: 1e-12, + } + + def apply(self, prog) -> None: + for f in prog.functions.values(): + self._constant_deduplication_block(f) + + @block_context_manager + def _constant_deduplication_block(self, block: Block) -> None: + for op in list(block.operations): + for b in op.blocks: + self._constant_deduplication_block(b) + + unique2duplicates = self.find_constants(block) + for unique in unique2duplicates: + for duplicate in unique2duplicates[unique]: + if duplicate in block.outputs: + continue + op = duplicate.op + block.replace_uses_of_var_after_op( + anchor_op=op, + old_var=duplicate, + new_var=unique, + force_replace=True if op.op_type in self.CONSTEXPR_OPS else False, + ) + block.remove_ops([op]) + + def find_constants(self, block: Block) -> Dict[Var, List[Var]]: + """ + Given a block, return all constants in the block in such a format: + {unique_var_0: [duplicated_var_0_0, duplicated_var_0_1, ...], + unique_var_1: [duplicated_var_1_0, duplicated_var_1_1, ...], + ... 
+ } + """ + unique2duplicates: Dict[Var, List[Var]] = {} + + # instead of brute-force C_N^2 comparison, use a hash map to be O(N) + constant_dict: Dict[Tuple[str, types.type, Tuple[int], str], List[Var]] = {} + for op in list(block.operations): + op_type = op.op_type + if op_type == "const" or op_type in self.CONSTEXPR_OPS: + constant_var = op.outputs[0] + shape = constant_var.shape + + numel = np.prod(shape) + if numel < self.NUMEL_THRESH: + continue + + dtype = constant_var.dtype + value = constant_var.val + hash = hashlib.sha1( + np.ascontiguousarray(value.reshape(-1)[: self.NUMEL_THRESH]) + ).hexdigest() + key = (op_type, dtype, shape, hash) + + if key not in constant_dict: + constant_dict[key] = [constant_var] + unique2duplicates[constant_var] = [] + else: + hash_collisions = constant_dict[key] + + existing_constant_var = None + for var in hash_collisions: + if np.allclose( + value, + var.val, + rtol=0.0, + atol=self.DTYPE2ATOL.get(dtype, 1e-12), + ): + existing_constant_var = var + break + + if existing_constant_var is None: + hash_collisions.append(constant_var) + unique2duplicates[constant_var] = [] + else: + unique2duplicates[existing_constant_var].append(constant_var) + + return unique2duplicates diff --git a/coremltools/converters/mil/mil/passes/defs/cleanup/const_elimination.py b/coremltools/converters/mil/mil/passes/defs/cleanup/const_elimination.py index 76c6bdcc7..7ec9d3d1b 100644 --- a/coremltools/converters/mil/mil/passes/defs/cleanup/const_elimination.py +++ b/coremltools/converters/mil/mil/passes/defs/cleanup/const_elimination.py @@ -28,7 +28,7 @@ class const_elimination(AbstractGraphPass): %4 = other_op(%2_const, %3) Support options: - + - ``skip_const_by_size``: Skip folding ``const`` ops that have larger number of elements than a threshold. """ @@ -97,6 +97,23 @@ def _const_elimination_block(self, block): output.set_name(output.name + "_ignored") else: all_outputs_are_replaced = False + # force const folding of the shape op + elif output.val is not None and op.op_type == "shape": + res = mb.const( + val=output.val, + before_op=op, + # same var name, but different python + # instance does not violate SSA property. 
+ name=output.name, + ) + op.enclosing_block.replace_uses_of_var_after_op( + anchor_op=op, + old_var=output, + new_var=res, + force_replace=True, + ) + # rename the const output + output.set_name(output.name + "_ignored") else: all_outputs_are_replaced = False diff --git a/coremltools/converters/mil/mil/passes/defs/cleanup/remove_redundant_ops.py b/coremltools/converters/mil/mil/passes/defs/cleanup/remove_redundant_ops.py index 2c0905e09..32e9950ac 100644 --- a/coremltools/converters/mil/mil/passes/defs/cleanup/remove_redundant_ops.py +++ b/coremltools/converters/mil/mil/passes/defs/cleanup/remove_redundant_ops.py @@ -136,11 +136,14 @@ def _try_to_remove_ops(candidate_ops_list): # This can be safely done, since all the ops in ops_to_remove # appear after first_op, hence first_op.outputs[0] variable is in # scope before the op's output var + ops_removed = [] for op in ops_to_remove: - op.enclosing_block.replace_uses_of_var_after_op( - anchor_op=op, old_var=op.outputs[0], new_var=first_op.outputs[0] - ) - block.remove_ops(ops_to_remove) + if op.enclosing_block.try_replace_uses_of_var_after_op( + anchor_op=op, old_var=op.outputs[0], new_var=first_op.outputs[0]): + ops_removed.append(op) + if len(ops_removed) == 0: + return False + block.remove_ops(ops_removed) return True @staticmethod diff --git a/coremltools/converters/mil/mil/passes/defs/optimize_conv.py b/coremltools/converters/mil/mil/passes/defs/optimize_conv.py index a6359d257..f31360f12 100644 --- a/coremltools/converters/mil/mil/passes/defs/optimize_conv.py +++ b/coremltools/converters/mil/mil/passes/defs/optimize_conv.py @@ -127,9 +127,7 @@ def _compose_conv1d_block(self, block: Block): def help_compose_conv1d_block(block: Block) -> bool: for op in list(block.operations): for b in op.blocks: - block_changed = True - while block_changed: - block_changed = help_compose_conv1d_block(b) + self._compose_conv1d_block(b) # must start with expanding a 3-D tensor, # who has batch, channel, length dimensions @@ -328,7 +326,7 @@ def _try_apply_transform_channel_last( @register_pass(namespace="common") class fuse_conv_batchnorm(AbstractGraphPass): """ - Fuse the following ``batch_norm`` layer into ``conv`` and ``conv_transpose``. + Fuse the following ``batch_norm`` layer into ``conv`` and ``conv_transpose``. That is, convert ``conv + batch_norm`` to ``conv``, by modifying the weight and bias in the ``conv`` layer. .. code-block:: diff --git a/coremltools/converters/mil/mil/passes/defs/optimize_quantization.py b/coremltools/converters/mil/mil/passes/defs/optimize_quantization.py new file mode 100644 index 000000000..879618d5b --- /dev/null +++ b/coremltools/converters/mil/mil/passes/defs/optimize_quantization.py @@ -0,0 +1,677 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. 
+# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +from typing import Tuple + +import numpy as np + +import coremltools.converters.mil.mil.types as types +from coremltools.converters.mil.mil import Block +from coremltools.converters.mil.mil import Builder as mb +from coremltools.converters.mil.mil import Operation, Var +from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass +from coremltools.converters.mil.mil.passes.helper import ( + _check_child_op_type, + _check_no_output_connection, + block_context_manager, +) +from coremltools.converters.mil.mil.passes.pass_registry import register_pass + + +# TODO (rdar://107718371): remove this pass after implementing QuantizedVar +@register_pass(namespace="common") +class nullify_redundant_quantization_zero_point(AbstractGraphPass): + """ + In Core ML quantization, the performance is better when ``zero point = 0``, + so we try to make ``zero point = 0`` if possible: + + * ``zero point = -128`` + * this must be an int8 quantization + * equivalent to uint8 quantization with 0 zero point + * ``zero point = 128`` + * this must be an uint8 quantization + * equivalent to int8 quantization with 0 zero point + + Since ``zero point = 0`` is equivalent to ``zero point = None`` in Core ML semantics, + we further canonicalize to ``zero point = None`` to: + + * make further graph passes easier + * avoid serializing trivial 0 + + The ``zero point = 0`` case can be canonicalized trivially + + .. code-block:: + + Input op: + + quantize/dequantize(zero_point=0) + + Output op: + + quantize/dequantize(zero_point=None) + + To guarantee the conservation of output regardless the zero-point shift + in ``zero point = ±128`` cases, we would only transform: + + * const dequantize, where we fuse the zero-point shift into the const + + .. code-block:: + + Input op: + + dequantize(input=const, zero_point=±128) + + Output op: + + dequantize(input=const∓128, zero_point=None) + + * ``quantize -> dequantize``, where we nullify both simultaneously + + .. 
code-block:: + + Input graph: + + input -> quantize(zero_point=±128) -> dequantize(zero_point=±128) -> output + + Output graph: + + input -> quantize(zero_point=None) -> dequantize(zero_point=None) -> output + """ + + def apply(self, prog): + for f in prog.functions.values(): + self._nullify_redundant_quantization_zero_point_block(f) + + @block_context_manager + def _nullify_redundant_quantization_zero_point_block(self, block): + def apply_block(block: Block) -> bool: + for op in list(block.operations): + for b in op.blocks: + self._nullify_redundant_quantization_zero_point_block(b) + + # no need to break, since only the current op gets changed + self.try_transform_zp0(op) + self.try_transform_zp128_const_dequantize(op) + + # has to break as the downstream iterator is affected + if self.try_transform_zp128_quantize_dequantize(op): + return True + + return False + + need_transformation = True + while need_transformation: + need_transformation = apply_block(block) + + @staticmethod + def try_transform_zp0(op: Operation) -> bool: + if op.op_type not in ("quantize", "dequantize"): + return False + + zero_point = op.zero_point + # if already no zero point, no need for further nullification + if zero_point is None: + return False + zero_point = zero_point.val + + if not np.all(zero_point == 0): + return False + + new_var: Var + if op.op_type == "quantize": + new_var = mb.quantize( + input=op.input, + scale=op.scale, + axis=op.axis, + output_dtype=op.output_dtype, + before_op=op, + ) + else: + new_var = mb.dequantize( + input=op.input, + scale=op.scale, + axis=op.axis, + before_op=op, + ) + + block: Block = op.enclosing_block + if not block.try_replace_uses_of_var_after_op( + anchor_op=op, old_var=op.outputs[0], new_var=new_var + ): + return False + block.remove_ops([op]) + + return True + + @staticmethod + def try_transform_zp128_const_dequantize(op: Operation) -> bool: + if op.op_type != "dequantize": + return False + + zero_point = op.zero_point + # if already no zero point, no need for further nullification + if zero_point is None: + return False + zero_point = zero_point.val + + is_negative_128 = np.all(zero_point == -128) + is_positive_128 = np.all(zero_point == 128) + if not (is_negative_128 or is_positive_128): + return False + + input = op.input.val + if input is None: + return False + if is_negative_128: + input = np.uint8(np.int16(input) + 128) + else: + input = np.int8(np.int16(input) - 128) + + new_var = mb.dequantize( + input=input, + scale=op.scale, + axis=op.axis, + before_op=op, + ) + + block: Block = op.enclosing_block + if not block.try_replace_uses_of_var_after_op( + anchor_op=op, old_var=op.outputs[0], new_var=new_var + ): + return False + block.remove_ops([op]) + + return True + + @staticmethod + def try_transform_zp128_quantize_dequantize(op: Operation) -> bool: + if op.op_type != "quantize": + return False + + zero_point = op.zero_point + # if already no zero point, no need for further nullification + if zero_point is None: + return False + zero_point = zero_point.val + + is_negative_128 = np.all(zero_point == -128) + is_positive_128 = np.all(zero_point == 128) + if not (is_negative_128 or is_positive_128): + return False + + if not _check_child_op_type(op, "dequantize"): + return False + dequantize_op = op.outputs[0].child_ops[0] + + dequantize_zero_point = dequantize_op.zero_point + if dequantize_zero_point is None: + return False + dequantize_zero_point = dequantize_zero_point.val + + if not np.all(dequantize_zero_point == (-128 if is_negative_128 else 128)): + return 
False + + new_quantize = mb.quantize( + input=op.input, + scale=op.scale, + axis=op.axis, + output_dtype="uint8" if is_negative_128 else "int8", + before_op=dequantize_op, + ) + new_dequantize = mb.dequantize( + input=new_quantize, + scale=dequantize_op.scale, + axis=dequantize_op.axis, + before_op=dequantize_op, + ) + + block: Block = op.enclosing_block + if not block.try_replace_uses_of_var_after_op( + anchor_op=dequantize_op, + old_var=dequantize_op.outputs[0], + new_var=new_dequantize, + ): + return False + block.remove_ops([op, dequantize_op]) + return True + + +@register_pass(namespace="common") +class dequantize_quantize_pair_elimination(AbstractGraphPass): + """ + When a ``dequantize`` is followed by an identical ``quantize`` (same scale, + zero point, axis), they cancel out and can be eliminated + + .. code-block:: + + Input graph: + input -> dequantize -> quantize -> output + + Output graph: + input -> output + + PS: On the other hand, the reversed pattern, i.e., ``quantize -> dequantize``, + is not redundant, since that is the pattern which naturally occurs when a + quantized op is converted. + In current activation quantization conversion, a quantized op becomes + + .. code-block:: + + dequantize -> regular op -> quantize + + so if we have a sequence of quantized ops, we will get + + .. code-block:: + + dequantize -> regular op1 -> quantize -> dequantize -> regular op2 -> quantize + + The ``quantize -> dequantize`` pair in the middle is not redundant, even if + they have identical scales and zero points and axes, since removing them will lead to + loss of information about the quantization parameters of the output var of op1 + """ + + def apply(self, prog): + for f in prog.functions.values(): + self._dequantize_quantize_pair_elimination_block(f) + + @block_context_manager + def _dequantize_quantize_pair_elimination_block(self, block): + def apply_block(block: Block) -> bool: + for op in list(block.operations): + for b in op.blocks: + self._dequantize_quantize_pair_elimination_block(b) + + # has to break as the downstream iterator is affected + if self.try_dequantize_quantize_pair_elimination(op): + return True + + return False + + need_transformation = True + while need_transformation: + need_transformation = apply_block(block) + + @staticmethod + def try_dequantize_quantize_pair_elimination(op: Operation) -> bool: + if op.op_type != "dequantize": + return False + + if op.outputs[0] in op.enclosing_block.outputs: + return False + + if not _check_child_op_type(op, "quantize"): + return False + quantize_op = op.outputs[0].child_ops[0] + + if np.any(op.scale.val != quantize_op.scale.val): + return False + + is_dequantize_zp_present = op.zero_point is not None + is_quantize_zp_present = quantize_op.zero_point is not None + if is_dequantize_zp_present != is_quantize_zp_present: + return False + if is_dequantize_zp_present and is_quantize_zp_present: + if np.any(op.zero_point.val != quantize_op.zero_point.val): + return False + + is_dequantize_axis_present = op.axis is not None + is_quantize_axis_present = quantize_op.axis is not None + if is_dequantize_axis_present != is_quantize_axis_present: + return False + if is_dequantize_axis_present and is_quantize_axis_present: + if op.axis.val != quantize_op.axis.val: + return False + + block: Block = op.enclosing_block + if not block.try_replace_uses_of_var_after_op( + anchor_op=quantize_op, + old_var=quantize_op.outputs[0], + new_var=op.input, + ): + return False + block.remove_ops([op, quantize_op]) + return True + + 
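The two passes above rest on two numerical identities: int8 quantization with zero point -128 is equivalent to uint8 quantization with zero point 0 (and symmetrically for +128), and a dequantize followed by an identical quantize round-trips the original integer data. A small NumPy check of both claims, using made-up scale and zero-point values purely for illustration:

```python
import numpy as np

scale = 0.1

# Identity 1: shifting int8 data by +128 and switching to uint8 with zero
# point 0 yields the same dequantized real values as zero point -128.
q_int8 = np.array([-128, -1, 0, 127], dtype=np.int8)
real_int8 = scale * (q_int8.astype(np.float32) - (-128))
q_uint8 = (q_int8.astype(np.int16) + 128).astype(np.uint8)
real_uint8 = scale * q_uint8.astype(np.float32)  # zero point None / 0
np.testing.assert_allclose(real_int8, real_uint8)

# Identity 2: dequantize followed by an identical quantize (same scale, zero
# point, axis) reproduces the input, so the pair can be eliminated.
zero_point = 3
q = np.array([0, 7, 42, 255], dtype=np.uint8)
real = scale * (q.astype(np.float32) - zero_point)
q_back = np.clip(np.round(real / scale) + zero_point, 0, 255).astype(np.uint8)
np.testing.assert_array_equal(q, q_back)
print("both identities hold")
```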
+@register_pass(namespace="common") +class distributive_quantized_binary_op_scale_normalization(AbstractGraphPass): + """ + In the backend, for better performance, quantized op can have 1 input scale + fused within the quantized op kernel. For binary ops, there are 2 inputs, + but only 1 can get fused. For example, for quantized ``add`` + + .. code-block:: + + MIL graph (consists of MIL ops): + + dequantize(x, s_x, zp_x) -| + x_fp = (x - zp_x) * s_x | + |-> add(x_fp, y_fp) -> quantize(z_fp, s_z, zp_z) + dequantize(y, s_y, zp_y) -| z_fp = x_fp + y_fp z = z_fp / s_z + zp_z + y_fp = (y - zp_y) * s_y + + Backend graph (consists of backend instructions, usually including + - * / and fused *+): + + x_shift = x - zp_x -------------------------| + |-> z_fp = s_x * x_shift + y_fp -> z = z_fp / s_z + zp_z + y_shift = y - zp_y -> y_fp = s_y * y_shift -| + + Where ``x`` and ``y`` are the inputs, ``z`` is the output, + ``s`` and ``zp`` are the corresponding scale and zero point. + + The reason why fusing one scale leads to better performance is, + instead of 2 instructions ``x_fp = s_x * x_shift`` and ``z_fp = x_fp + y_fp``, + a single ``z_fp = x_shift * s_x + y_fp`` instruction achieves the same result. + + In this pass, we normalize ``s_y`` to 1, so the ``y_fp = s_y * y_shift`` + instruction can get skipped as well, leading to even better performance. + This pass only applies to distributive binary ops such as ``add`` and ``sub`` + + Appendix: Mathematical and Computer-Scientific Details + + Mathematically, for a binary operator ``.op.`` + + .. code-block:: + + z_fp = (x - zp_x) * s_x .op. (y - zp_y) * s_y + = s_y * [(x - zp_x) * s_x/s_y .op. (y - zp_y) * 1] + + The corresponding pseudo code is + + .. code-block:: + + # before + z_fp = (x - zp_x) * s_x .op. (y - zp_y) * s_y + z = z_fp / s - zp_z + + # after + z_fp_modified = (x - zp_x) * s_x/s_y .op. (y - zp_y) * 1.0 + z = z_fp_modified / (s_z/s_y) - zp_z + + Concretely, as a MIL graph pass + + .. code-block:: + + Input graph: + dequantize(scale=s_x) -| + |-> op -> quantize(scale=s_z) + dequantize(scale=s_y) -| + + Output graph: + dequantize(scale=s_x/s_y) -| + |-> op -> quantize(scale=s_z/s_y) + dequantize(scale=1.0) -| + + PS: we only support scalar ``s_y`` for now. If ``s_y`` is not scalar but + ``s_x`` is, we would swap ``x`` and ``y``. Support for both-vector case is + to be explored, due to the broadcasting complication. + """ + + DISTRIBUTIVE_BINARY_OPS = {"add", "sub"} + + def apply(self, prog): + @block_context_manager + def apply_block(block: Block): + for op in list(block.operations): + for b in op.blocks: + apply_block(b) + + matched_ops = self.match_pattern(op) + if matched_ops is not None: + dequantize_x, dequantize_y, quantize_z = matched_ops + self.try_to_transform(op, dequantize_x, dequantize_y, quantize_z) + + for f in prog.functions.values(): + apply_block(f) + + def match_pattern(self, op: Operation) -> Tuple[Operation, Operation, Operation]: + """ + try to match distributive quantized binary op: + ... + ^ + | + dequantize(x) -| + |-> op(x, y) (-> relu) -> quantize(z) + dequantize(y) -| + | + v + ... 
+ + return dequantize_x, dequantize_y, quantize_z for further transformation + + return None if no match + """ + # make sure the op is distributive + if op.op_type not in self.DISTRIBUTIVE_BINARY_OPS: + return None + + # quantized op may be fused with relu + # relu would not affect distributivity + tail_op = op + if _check_child_op_type(op, "relu"): + tail_op = op.outputs[0].child_ops[0] + + # make sure the inputs are quantized + dequantize_x = op.x.op + dequantize_y = op.y.op + if ( + dequantize_x is None + or dequantize_y is None + or dequantize_x.op_type != "dequantize" + or dequantize_y.op_type != "dequantize" + ): + return None + + # make sure the output is quantized + if not _check_child_op_type(tail_op, "quantize"): + return None + quantize_z = tail_op.outputs[0].child_ops[0] + + # make sure the intermediate results are not block outputs + # since we only guarantee conservation of z + if not _check_no_output_connection( + op.enclosing_block, [dequantize_x, dequantize_y, op, tail_op, quantize_z] + ): + return None + + return dequantize_x, dequantize_y, quantize_z + + def try_to_transform( + self, op: Operation, dequantize_x: Operation, dequantize_y: Operation, quantize_z: Operation + ) -> bool: + """ + given dequantize_x, dequantize_y, quantize_z, tranform by + z_fp = (x - zp_x) * s_x/s_y .op. (y - zp_y) * 1.0 + z = z_fp / (s_z/s_y) - zp_z + + See the class doc for details + """ + block = quantize_z.enclosing_block + + new_s_x, new_s_z = self.try_to_divide(dequantize_x, dequantize_y, quantize_z) + # if s_y cannot be used to divide, then swap x and y and try again + if new_s_x is None and new_s_z is None: + dequantize_x, dequantize_y = dequantize_y, dequantize_x + new_s_x, new_s_z = self.try_to_divide(dequantize_x, dequantize_y, quantize_z) + # after swap, if still cannot divide, then give up + if new_s_x is None and new_s_z is None: + return False + + # insert normalized new_dequantize_x and new_dequantize_y before op + new_dequantize_x = mb.dequantize( + input=dequantize_x.input, + scale=new_s_x, + zero_point=dequantize_x.zero_point, + axis=dequantize_x.axis, + before_op=op, + ) + new_dequantize_y = mb.dequantize( + input=dequantize_y.input, + scale=1.0 if dequantize_y.axis is None else np.full(dequantize_y.scale.val.shape, 1.0), + zero_point=dequantize_y.zero_point, + axis=dequantize_y.axis, + before_op=op, + ) + # insert normalized new_quantize_z before quantize_z + new_quantize_z = mb.quantize( + input=quantize_z.input, + scale=new_s_z, + zero_point=quantize_z.zero_point, + axis=quantize_z.axis, + output_dtype=quantize_z.output_dtype, + before_op=quantize_z, + ) + if not ( + # replace dequantize_x and dequantize_y with the normalized ones + # in the range of (new_dequantize_x, op] and (new_dequantize_y, op] + # in case dequantize_x and dequantize_y also feed to other ops + # which should not get altered by this transformation + block.try_replace_uses_of_var_after_op( + anchor_op=new_dequantize_x.op, + end_op=op, + old_var=dequantize_x.outputs[0], + new_var=new_dequantize_x, + ) + and block.try_replace_uses_of_var_after_op( + anchor_op=new_dequantize_y.op, + end_op=op, + old_var=dequantize_y.outputs[0], + new_var=new_dequantize_y, + ) + # replace quantize_z with the normalized one + and block.try_replace_uses_of_var_after_op( + anchor_op=quantize_z, old_var=quantize_z.outputs[0], new_var=new_quantize_z + ) + ): + return False + + # remove quantize_z here, but not dequantize_x and dequantize_y, since: + # * all uses of quantize_z has been replaced with the normalized one + # * 
dequantize_x and dequantize_y may feed to multiple ops, which are not replaced + # (if not, then pass dead_code_elimination will eliminate them) + block.remove_ops([quantize_z]) + + return True + + def try_to_divide( + self, + dequantize_x: Operation, + dequantize_y: Operation, + quantize_z: Operation, + ) -> Tuple[np.ndarray, np.ndarray]: + """ + compute s_x/s_y and s_z/s_y, return the results if succeeds, else None + + The broadcast rule is very complicated: + 1. Broadcast s_x to x, s_y to y, s_z to z, according to axes + 2. Broadcast s_x and s_y + 3. Perform s_x/s_y and s_z/s_y + 4. De-broadcast s_x/s_y and s_z/s_y down to vectors according to axes, + raise exception if impossible to de-broadcast + + As a result, for now we only handle the scalar s_y case + """ + + # TODO (rdar://109170887): explore vector s_y + if dequantize_y.axis is not None: + return None, None + + s_x_fp32 = np.float32(dequantize_x.scale.val) + s_y_fp32 = np.float32(dequantize_y.scale.val) + s_z_fp32 = np.float32(quantize_z.scale.val) + + s_x_d_s_y = s_x_fp32 / s_y_fp32 + s_z_d_s_y = s_z_fp32 / s_y_fp32 + + if ( + self.overflow_fp16(s_x_d_s_y) + or self.underflow_fp16(s_x_d_s_y) + or self.overflow_fp16(s_z_d_s_y) + or self.underflow_fp16(s_z_d_s_y) + ): + return None, None + + return s_x_d_s_y, s_z_d_s_y + + @staticmethod + def overflow_fp16(x: np.ndarray) -> bool: + return np.max(np.abs(x)) > 65504 + + @staticmethod + def underflow_fp16(x: np.ndarray) -> bool: + return np.min(np.abs(x)) < np.nextafter(0.0, 1.0, dtype=np.float16) + + +@register_pass(namespace="common") +class dequantize_to_constexpr(AbstractGraphPass): + """ + ``dequantize`` op with constant input is equivalent to ``constexpr_affine_dequantize``. + This is one of the canoncalization pass that transforms all such + ``dequantize`` ops to respective ``constexpr_affine_dequantize`` ops. + + .. code-block:: + + Input graph: + + dequantize(input=const) -> downstream op + + Output graph: + + constexpr_affine_dequantize -> downstream op + + This pass is being performed because constant tensors being propagated + through ``dequantize`` op would be serialized in bloated/decompressed fashion, + whereas with ``constexpr_affine_dequantize``, + constant weights/tensors remain compressed at serialization. 
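The correctness of the scale normalization described above boils down to the identity (x - zp_x) * s_x + (y - zp_y) * s_y = s_y * [(x - zp_x) * (s_x / s_y) + (y - zp_y) * 1], with the output scale compensated by the same factor. A quick NumPy verification with made-up scalar quantization parameters (all names and values below are illustrative assumptions, not taken from the pass):

```python
import numpy as np

rng = np.random.default_rng(0)
x = rng.integers(-128, 128, size=4).astype(np.float32)
y = rng.integers(-128, 128, size=4).astype(np.float32)
s_x, zp_x = 0.05, 3.0
s_y, zp_y = 0.02, -7.0
s_z, zp_z = 0.10, 1.0

# Original quantized add: dequantize both inputs, add, re-quantize with s_z.
z_fp = (x - zp_x) * s_x + (y - zp_y) * s_y
z = z_fp / s_z - zp_z

# Normalized form: divide s_x and s_z by s_y, so y's scale becomes 1.0 and
# the multiply on the second operand can be skipped by the backend.
z_fp_mod = (x - zp_x) * (s_x / s_y) + (y - zp_y) * 1.0
z_mod = z_fp_mod / (s_z / s_y) - zp_z

np.testing.assert_allclose(z, z_mod, rtol=1e-6)
print("normalized scales give the same output")
```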
+ """ + + def apply(self, prog): + @block_context_manager + def apply_block(block): + for op in list(block.operations): + for b in op.blocks: + apply_block(b) + + if self.is_valid_op(op): + self.transform_op(op) + + for f in prog.functions.values(): + apply_block(f) + + def is_valid_op(self, op): + return op.op_type == "dequantize" and op.outputs[0].val is not None + + def transform_op(self, op): + quantized_data = op.input.val + + scale = op.scale.val + + zero_point = None + if op.zero_point is not None: + zero_point = op.zero_point.val + else: + zero_point = np.int8(0) if op.input.dtype == types.int8 else np.uint8(0) + + # In dequantize semantics, axis may be None: + # when scale is a scalar, axis is None + # + # In constexpr_affine_dequantize semantics, None axis is not allowed; + # since axis is not referred to when scale is a scalar, we pass a dummy + axis = 0 + if op.axis is not None: + axis = op.axis.val + + new_var = mb.constexpr_affine_dequantize( + quantized_data=quantized_data, + zero_point=zero_point, + scale=scale, + axis=axis, + before_op=op, + name=op.name + "_affine_dequantized", + ) + + block = op.enclosing_block + block.replace_uses_of_var_after_op(anchor_op=op, old_var=op.outputs[0], new_var=new_var) + block.remove_ops([op]) diff --git a/coremltools/converters/mil/mil/passes/defs/optimize_tensor_operation.py b/coremltools/converters/mil/mil/passes/defs/optimize_tensor_operation.py index f7dbd04d4..5733ab1b7 100644 --- a/coremltools/converters/mil/mil/passes/defs/optimize_tensor_operation.py +++ b/coremltools/converters/mil/mil/passes/defs/optimize_tensor_operation.py @@ -5,7 +5,10 @@ import numpy as np +from coremltools.converters.mil._deployment_compatibility import AvailableTarget +from coremltools.converters.mil.frontend._utils import value_at from coremltools.converters.mil.mil import Builder as mb +from coremltools.converters.mil.mil.block import is_current_opset_version_compatible_with from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass from coremltools.converters.mil.mil.passes.helper import ( _check_child_op_type, @@ -15,10 +18,11 @@ from coremltools.converters.mil.mil.passes.pass_registry import register_pass from coremltools.converters.mil.mil.types.symbolic import any_symbolic + @register_pass(namespace="common") class expand_high_rank_reshape_and_transpose(AbstractGraphPass): """ - Detect the pattern ``reshape_1-->transpose-->reshape_2``, where ``reshape_1`` has + Detect the pattern ``reshape_1-->transpose-->reshape_2``, where ``reshape_1`` has a output tensor with rank >= 6, and the reshape_2 produces a tensor with rank <= 5. In general, we can expand this pattern into a sequence of rank 4 ``reshape`` and ``transpose`` ops, @@ -48,7 +52,7 @@ def apply(self, prog): @staticmethod def _match_pattern(op): - # We are detecting the + # We are detecting the # reshape(>= rank 6) -> transpose -> reshape(<= rank 5) pattern ops = [op] if op.op_type != "reshape": @@ -89,7 +93,7 @@ def _get_prod(start, end, arr, skip_indices): original_shape = reshape_op.outputs[0].shape original_perm = transpose_op.perm.val.tolist() - # Group the consecutive axes in the perm, sometimes this could directly lower the + # Group the consecutive axes in the perm, sometimes this could directly lower the # rank under 6. 
# # For instance: @@ -110,7 +114,7 @@ def _get_prod(start, end, arr, skip_indices): group_axes = [] i = 0 res = [] - for i in range(len(original_perm)): + for i in range(len(original_perm)): if i > 0 and original_perm[i] == original_perm[i-1] + 1: res.append(original_perm[i]) else: @@ -140,7 +144,7 @@ def _get_prod(start, end, arr, skip_indices): # we can directly use them to replace the original pattern x = mb.reshape(x=x, shape=shape, before_op=reshape_op) x = mb.transpose(x=x, perm=perm, before_op=reshape_op) - + else: # Otherwise, we need to expand the rank-N tensor into N reshape, and N transpose ops. # Note that all intrermediate tensors have rank 4. @@ -166,9 +170,9 @@ def _get_prod(start, end, arr, skip_indices): dim = shape[axis] memo.add(axis) reshape_shape = [ - leading_dim, - _get_prod(0, axis, shape, memo), - dim, + leading_dim, + _get_prod(0, axis, shape, memo), + dim, _get_prod(axis + 1, rank, shape, memo) ] x = mb.reshape(x=x, shape=reshape_shape, before_op=reshape_op) @@ -547,8 +551,21 @@ def _try_to_transform(onehot_op, block): return False # remove onehot and matmul and replace with gather op - out_name = matmul_op.outputs[0].name - x = mb.gather(x=W_var, indices=root_var, axis=0, name=out_name, before_op=matmul_op) + if is_current_opset_version_compatible_with(AvailableTarget.iOS17): + # IOS17 `gather` requires non-negative indices. + root_var = mb.select( + cond=mb.greater_equal(x=root_var, y=0, before_op=matmul_op), + a=root_var, + b=mb.add( + x=root_var, + y=value_at(mb.shape(x=W_var, before_op=matmul_op), 0, before_op=matmul_op), + before_op=matmul_op, + ), + before_op=matmul_op, + ) + x = mb.gather( + x=W_var, indices=root_var, axis=0, name=matmul_op.outputs[0].name, before_op=matmul_op + ) matmul_op.enclosing_block.replace_uses_of_var_after_op( anchor_op=matmul_op, old_var=matmul_op.outputs[0], new_var=x @@ -715,20 +732,20 @@ def _replace_stack_reshape_block(self, block): class use_reflection_padding(AbstractGraphPass): """ Identify a reflection padding layer composed out of `slices` and `concats`. - + .. 
code-block:: Input graph: - + ------------------------------------------------------------------------------------- | | v input(1, 2, 6, 8) ------> slice_by_index(begin=[0, 0, 0, 1], end=[0, 0, 0, 2]) -----> concat(axis=3) ---> out(1, 2, 6, 10) | ^ ----------------> slice_by_index(begin=[0, 0, 0, -2], end=[0, 0, 0, -1]) -------------| - + Output graph: - + input(1, 2, 6, 8) -----0> pad(mode=reflect, size=[0, 0, 1, 1]) -----> out(1, 2, 6, 10) """ diff --git a/coremltools/converters/mil/mil/passes/defs/quantization.py b/coremltools/converters/mil/mil/passes/defs/quantization.py index 2f7478616..6253f28b3 100644 --- a/coremltools/converters/mil/mil/passes/defs/quantization.py +++ b/coremltools/converters/mil/mil/passes/defs/quantization.py @@ -8,25 +8,13 @@ import numpy as np -from coremltools import _logger as logger -from coremltools.converters.mil.backend.mil.load import should_use_weight_file +from coremltools.converters.mil._deployment_compatibility import AvailableTarget from coremltools.converters.mil.mil import Builder as mb -from coremltools.converters.mil.mil import types -from coremltools.converters.mil.mil.ops.defs.iOS16 import ( - constexpr_affine_dequantize, - constexpr_lut_to_dense, - constexpr_sparse_to_dense, -) +from coremltools.converters.mil.mil import Operation, types from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass from coremltools.converters.mil.mil.passes.helper import block_context_manager from coremltools.converters.mil.mil.passes.pass_registry import register_pass from coremltools.converters.mil.mil.program import Program -from coremltools.converters.mil.mil.types.type_mapping import ( - is_builtin, - nptype_from_builtin, - numpy_type_to_builtin_type, -) -from coremltools.models.neural_network.quantization_utils import _get_kmeans_lookup_table_and_weight class ComputePrecision(_Enum): @@ -138,6 +126,16 @@ class FP16ComputePrecision(AbstractQuantizationPass): - For each output of dtype float16, inject a "cast" op to change it back to float32 """ + # Activation related ops with alpha/beta parameters. + _ACTIVATION_ALPHA_OPS: Set[str] = {"elu", "leaky_relu", "prelu", "thresholded_relu"} + _ACTIVATION_ALPHA_BETA_OPS: Set[str] = { + "clamped_relu", + "linear_activation", + "scaled_tanh", + "sigmoid_hard", + "softplus_parametric", + } + def __init__(self, op_selector=None): super(FP16ComputePrecision, self).__init__(op_selector=op_selector) self.target_dtype = "fp16" @@ -146,8 +144,13 @@ def __init__(self, op_selector=None): # For reference: Checkout test_single_input_to_multiple_operations in `TestFP16CastTransform`. self.cache_vars = {} - def fp16_overflow(self, op): - # Constants with values more than 65504 or less than -65504 overflows in FP16 + def fp16_overflow(self, op: Operation) -> bool: + # This overflow check consists of two parts: + # 1. For valid fp32 numbers (abs < 1e38), we want their exact values, + # so we make sure they are within fp16 range [-65504, 65504] + # 2. For inifinities (abs >= 1e38), their exact values does not matter, + # so we can always downcast them to fp16 inf. 
For example, in attention mask + # we just want -inf to make the masked entries have 0 probability after softmax for _, inputs in op.inputs.items(): is_list_input = isinstance(inputs, (list, tuple)) if not is_list_input: @@ -158,15 +161,18 @@ def fp16_overflow(self, op): and var.op.op_type == "const" and var.is_tensor_or_scalar_of(dtype="fp32") ): - if np.max(np.abs(var.op.val.val), initial=0.0) > 65504: + value = np.expand_dims(var.op.val.val, 0) + abs_value = np.abs(value) + if np.max(abs_value[np.where(abs_value < 1e38)], initial=0.0) > 65504: return True return False - def is_valid_op(self, op): - + def is_valid_op(self, op: Operation) -> bool: + """Determines if op is valid for fp16 casting.""" if op.op_type in ["cast", "while_loop", "cond"]: return False + # TODO: Remove after supporting FP16 dynamic quantize transformation for list ops (rdar://74458192) if op.op_type in [ "make_list", "list_gather", @@ -175,8 +181,9 @@ def is_valid_op(self, op): "list_write", "list_length", ]: - return False # rdar://74458192 + return False + # TODO: Remove after supporting IOS17 FP16 RNN Ops (rdar://108143371) if op.op_type in ["gru", "rnn", "lstm"]: return False @@ -185,12 +192,21 @@ def is_valid_op(self, op): return True - def is_valid_parameter(self, op, param_name): + def should_cast_parameter(self, op: Operation, param_name: str) -> bool: + """Determines if a param of an op should be casted to fp16.""" + # Make sure the param is valid for fp16 when type domain is specified. type_domain = getattr(op.input_spec.input_types[param_name], "type_domain", None) - if type_domain is not None: - if len(type_domain) == 0: - return True - return types.fp16 in type_domain + if type_domain and types.fp16 not in type_domain: + return False + + if op.opset_version >= AvailableTarget.iOS17: + # In IOS17+ activation ops with alpha/beta support mixed precision, and we don't want to + # cast alpha/beta to fp16 for better numerical accuracy. + if op.op_type in self._ACTIVATION_ALPHA_OPS and param_name == "alpha": + return False + if op.op_type in self._ACTIVATION_ALPHA_BETA_OPS and param_name in {"alpha", "beta"}: + return False + return True def _check_underflow_to_zero(self, new_var, var): @@ -229,7 +245,7 @@ def transform_op(self, op): for param, inputs in op.inputs.items(): # First loop, iterates over all the input parameters of an operation. - if not self.is_valid_parameter(op, param): + if not self.should_cast_parameter(op, param): continue is_list_input = isinstance(inputs, (list, tuple)) @@ -301,14 +317,14 @@ def transform_op(self, op): class add_fp16_cast(FP16ComputePrecision): """ For each input of dtype float32, inject a ``cast`` op to change it to float16 dtype. - + For each output of dtype float16, inject a ``cast`` op to change it back to float32. This pass is the registered interface for FP16ComputePrecision, which makes it consistent with other passes' interfaces. Support options: - + - ``skip_ops_by_type``: Skip op types specified by comma-separated string; for example, ``"mul,const"``. 
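For illustration, a minimal usage sketch of the ``skip_ops_by_type`` option described above; ``model`` is a placeholder for the source model being converted:

```python
import coremltools as ct

# Keep `mul` and `const` ops in fp32 while the rest of the graph is cast to fp16.
pipeline = ct.PassPipeline.DEFAULT
pipeline.set_options("common::add_fp16_cast", {"skip_ops_by_type": "mul,const"})

mlmodel = ct.convert(model, pass_pipeline=pipeline)  # `model` is a placeholder
```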
""" @@ -321,537 +337,3 @@ def skip_ops_by_type(self): @skip_ops_by_type.setter def skip_ops_by_type(self, criteria: Text): self._skip_ops_by_type = set(criteria.split(",")) - - -class SparseParams: - def __init__(self, nonzero_data=None, mask=None, shape=None): - self.nonzero_data = nonzero_data - self.mask = mask - self.shape = shape - - -class WeightSparsifier(AbstractQuantizationPass): - """ - This transform does the following, for each const op and if the "op_selector" return True: - - (self.sparsity) fraction of values with the least absolute value are zeroed out. - - If fake_compression=False, Zeroed-Out Value is encoded via constexpr_sparse_to_dense op - - If fake_compression=True, Zeroed-Out Value is encoded via const op - - Old const is replaced by a new operation with zeroed-out value. - """ - - WEIGHT_SPARSIFICATION_MODES = ("THRESHOLD_BASED", "PERCENTILE_BASED") - - def __init__( - self, - mode="threshold_based", - threshold=1e-3, - target_percentile=1.0, - fake_compression=False, - op_selector=None, - ): - super().__init__(op_selector=op_selector) - self.fake_compression = fake_compression - self.mode = mode.upper() - self.threshold = threshold - self.target_percentile = target_percentile - - if self.mode not in WeightSparsifier.WEIGHT_SPARSIFICATION_MODES: - msg = "Only mode {} supported for weight sparsification. Got mode {}.".format( - WeightSparsifier.WEIGHT_SPARSIFICATION_MODES, self.mode - ) - raise ValueError(msg) - - if self.mode == "PERCENTILE_BASED" and ( - self.target_percentile < 0 or self.target_percentile > 1 - ): - raise ValueError( - "Invalid value of target_percentile: {}. Needs to be in [0, 1]".format( - self.target_percentile - ) - ) - - if self.mode == "THRESHOLD_BASED" and self.threshold < 0: - raise ValueError( - "Invalid value of threshold: {}. 
Needs to be in [0, inf)".format(self.threshold) - ) - - def is_valid_op(self, op): - if op.op_type == "const" and should_use_weight_file(op.val.val): - return True - return False - - @staticmethod - def compress(val, mode, target_percentile=None, threshold=None): - - mode = mode.upper() - - def sparsify_with_percentile(val, target_percentile): - q = target_percentile * 100 - return np.where(np.abs(val) <= np.percentile(np.abs(val), q), 0, val) - - def sparsify_with_thresohld(val, threshold): - return np.where(np.abs(val) <= threshold, 0, val) - - if not isinstance(val, (np.ndarray, np.generic)): - raise ValueError("Only numpy arrays are supported") - - flattened_val = val.flatten() - - if mode == "PERCENTILE_BASED": - flattened_val = sparsify_with_percentile(flattened_val, target_percentile) - elif mode == "THRESHOLD_BASED": - flattened_val = sparsify_with_thresohld(flattened_val, threshold) - - params = SparseParams() - params.nonzero_data = flattened_val[np.where(flattened_val != 0)] - params.mask = np.packbits(np.where(flattened_val != 0, 1, 0), bitorder="little") - params.shape = val.shape - return params - - @staticmethod - def decompress(params): - if not isinstance(params, SparseParams): - raise ValueError("Invalid type of params") - return constexpr_sparse_to_dense.decompress(params.nonzero_data, params.mask, params.shape) - - def transform_op(self, op): - block = op.enclosing_block - sparse_params = self.compress(op.val.val, self.mode, self.target_percentile, self.threshold) - - if not self.fake_compression: - new_var = mb.constexpr_sparse_to_dense( - nonzero_data=sparse_params.nonzero_data, - mask=sparse_params.mask, - shape=np.uint32(sparse_params.shape), - before_op=op, - name=op.name + "_sparsified", - ) - else: - decompressed_val = self.decompress(sparse_params) - new_var = mb.const( - val=decompressed_val, - before_op=op, - name=op.name + "_fake_sparsified", - ) - - op.enclosing_block.replace_uses_of_var_after_op( - anchor_op=op, - old_var=op.outputs[0], - new_var=new_var, - no_check_var_types=True, - ) - - block.remove_ops([op]) - - -class LutParams: - def __init__(self, lut=None, indices=None, shape=None): - self.lut = lut - self.indices = indices - self.shape = shape - - -class WeightPalettizer(AbstractQuantizationPass): - """ - This transform does the following, for each const op and if the "op_selector" return True: - - A linear look up table with 2**(nbits) entries is created and value is represented via indexing into this look up table. - - If fake_compression=False, compressed value is encoded via constexpr_lut_to_dense op - - If fake_compression=True, compressed value is decompressed and then encoded via const op - - Old const op is replaced by a newly created operation. - """ - - WEIGHT_PALETTIZATION_MODES = ("KMEANS", "UNIFORM", "UNIQUE", "CUSTOM") - - def __init__( - self, nbits, fake_compression=False, op_selector=None, mode="kmeans", lut_function=None - ): - super().__init__(op_selector=op_selector) - self.fake_compression = fake_compression - self.nbits = nbits - self.mode = mode.upper() - self.lut_function = lut_function - - if self.mode not in WeightPalettizer.WEIGHT_PALETTIZATION_MODES: - msg = "Only mode {} supported for weight palettization. 
Got mode {}.".format( - WeightPalettizer.WEIGHT_PALETTIZATION_MODES, self.mode - ) - raise ValueError(msg) - - if nbits is None and self.mode in ("KMEANS", "UNIFORM"): - msg = "nbits must be provided for mode {}".format(mode) - raise ValueError(msg) - - if nbits is not None and self.mode in ("UNIQUE", "CUSTOM"): - msg = "nbits must NOT be provided for mode {}".format(mode) - raise ValueError(msg) - - if self.nbits is not None and self.nbits not in (1, 2, 4, 6, 8): - raise ValueError( - "Invalid value of nbits ({}) for palettization. Supported bits are {{1, 2, 4, 6, 8}}".format( - nbits - ) - ) - - if (self.mode == "CUSTOM") ^ (lut_function is not None): - msg = "lut_function must be None if mode is not custom, and that it cannot be None when the mode is custom." - raise ValueError(msg) - - if self.mode == "CUSTOM" and not callable(self.lut_function): - msg = "A function object must be provided as lut_function. Got a lut_functions as type {}".format( - type(self.lut_function) - ) - raise ValueError(msg) - - def is_valid_op(self, op): - if op.op_type == "const" and should_use_weight_file(op.val.val): - return True - return False - - @staticmethod - def compress(val, mode, nbits=None, lut_function=None): - - mode = mode.upper() - - def compress_kmeans(val, nbits): - lut, indices = _get_kmeans_lookup_table_and_weight(nbits, val) - lut = lut.astype(val.dtype) - indices = indices.astype(np.uint8) - return lut, indices - - def compress_uniform(val, nbits): - val = val.flatten() - val_min = np.amin(val) - val_max = np.amax(val) - scale = (val_max - val_min) / ((1 << nbits) - 1) - indices = np.round(((val - val_min) / (val_max - val_min)) * ((1 << nbits) - 1)).astype( - np.uint8 - ) - lut = np.array(range(0, 1 << nbits)) * scale + val_min - lut = lut.astype(val.dtype) - return lut, indices - - def get_nbits_for_unique_mode(val): - val = val.flatten() - unique_vals = np.unique(val).tolist() - for nbits in (1, 2, 4, 6, 8): - if len(unique_vals) <= 1 << nbits: - return nbits - msg = "weight value cannot be represented in an 8 bits palettization. Skipped." - logger.warning(msg) - return None - - def compress_unique(val, nbits): - val = val.flatten() - unique_vals = np.unique(val).tolist() - if len(unique_vals) > 1 << nbits: - msg = "Too many unique values {} in the weight. Couldn't represented in {} bits.".format( - len(unique_vals), nbits - ) - raise ValueError(msg) - lut = [0] * (1 << nbits) - lut[: len(unique_vals)] = unique_vals - indices = np.zeros((len(val),)) - for i, k in enumerate(lut[:len(unique_vals)]): - indices += (i + 1) * (val == k).astype(np.int32) - indices = indices - 1 - assert ( - len(np.where(indices == -1)[0]) == 0 - ), "weight must be corresponding to one existing indice" - - lut = np.array(lut).astype(val.dtype) - indices = indices.astype(np.uint8) - return lut, indices - - def pack_indices_into_bytes_array(indices, nbits): - bitarray = np.unpackbits(indices.reshape(-1, 1), bitorder="little", axis=-1)[:, :nbits] - return np.packbits(bitarray.flatten(), bitorder="little") - - def check_lut_parameters_are_valid(val, lut, indices): - if not isinstance(lut, np.ndarray) or not isinstance(indices, np.ndarray): - raise ValueError("LUT and indices must be type of numpy array.") - - if indices.size != val.size: - msg = "Indices size ({}) mismatched with the original weight({}).".format( - indices.size, val.size - ) - raise ValueError(msg) - - if len(indices.shape) != 1 or indices.dtype != np.uint8: - msg = "Indices must be a numpy vector of type uint8. 
Found shape {} with type {}".format( - indices.shape, indices.dtype - ) - raise ValueError(msg) - - if lut.dtype != val.dtype: - msg = "Dtype mismatched between LUT ({}) and weight ({})".format( - lut.dtype, val.dtype - ) - raise ValueError(msg) - - if not isinstance(val, (np.ndarray, np.generic)): - raise ValueError("Only numpy arrays are supported") - - if mode == "KMEANS": - lut, indices = compress_kmeans(val, nbits) - elif mode == "UNIFORM": - lut, indices = compress_uniform(val, nbits) - elif mode == "UNIQUE": - nbits = get_nbits_for_unique_mode(val) - if nbits is None: - return None - lut, indices = compress_unique(val, nbits) - elif mode == "CUSTOM": - lut, indices = lut_function(val) - - check_lut_parameters_are_valid(val, lut, indices) - - params = LutParams() - params.lut = lut - params.shape = val.shape - params.indices = pack_indices_into_bytes_array(indices, int(np.log2(lut.shape[0]))) - return params - - @staticmethod - def decompress(params): - if not isinstance(params, LutParams): - raise ValueError("Invalid type of params") - return constexpr_lut_to_dense.decompress(params.lut, params.indices, params.shape) - - def transform_op(self, op): - block = op.enclosing_block - lut_params = self.compress(op.val.val, self.mode, self.nbits, self.lut_function) - - if lut_params is None: - return - - if not self.fake_compression: - new_var = mb.constexpr_lut_to_dense( - indices=lut_params.indices, - lut=lut_params.lut, - shape=np.uint32(lut_params.shape), - before_op=op, - name=op.name + "_palettized", - ) - else: - decompressed_val = self.decompress(lut_params) - new_var = mb.const( - val=decompressed_val, - before_op=op, - name=op.name + "_fake_palettized", - ) - - op.enclosing_block.replace_uses_of_var_after_op( - anchor_op=op, - old_var=op.outputs[0], - new_var=new_var, - no_check_var_types=True, - ) - - block.remove_ops([op]) - - -class AffineQuantParams: - def __init__(self, quantized_data=None, zero_point=None, scale=None, axis=None): - self.quantized_data = quantized_data - self.zero_point = zero_point - self.scale = scale - self.axis = axis - - -class WeightAffineQuantizer(AbstractQuantizationPass): - """ - This transform does the following, for each const op and if the "op_selector" return True: - - Values are linearly quantized into unsigned 8-bits. - - If fake_compression=False, compressed value is encoded via constexpr_affine_dequantize op - - If fake_compression=True, compressed value is decompressed and then encoded via const op - - Old const is replaced by a newly created operation. - """ - - WEIGHT_AFFINE_QUANTIZATION_MODES = ("LINEAR_SYMMETRIC", "LINEAR") - WEIGHT_AFFINE_DTYPES = (types.int8, types.uint8) - - def __init__(self, fake_compression=False, op_selector=None, mode="linear", dtype=np.int8): - super().__init__(op_selector=op_selector) - self.fake_compression = fake_compression - self.mode = mode.upper() - - # check mode - if self.mode not in WeightAffineQuantizer.WEIGHT_AFFINE_QUANTIZATION_MODES: - msg = "Only mode {} supported for weight affine quantization. Got mode {}.".format( - WeightAffineQuantizer.WEIGHT_AFFINE_QUANTIZATION_MODES, self.mode - ) - raise ValueError(msg) - - # check dtype - msg = f"dtype={dtype} is unsupported for affine_quantize_weights." 
- if is_builtin(dtype): - self.dtype = dtype - else: - try: - self.dtype = numpy_type_to_builtin_type(dtype) - except TypeError: - raise ValueError(msg) - - if self.dtype not in WeightAffineQuantizer.WEIGHT_AFFINE_DTYPES: - raise ValueError(msg) - - def is_valid_op(self, op): - if op.op_type == "const" and should_use_weight_file(op.val.val): - return True - return False - - @staticmethod - def _get_axis(op): - axis = 0 - var = op.outputs[0] - if len(var.child_ops) == 1 and var.child_ops[0].op_type == "conv_transpose": - axis = 1 - return axis - - @staticmethod - def compress(val, axis, mode, dtype): - def _ensure_numerical_range_and_cast(val, low, high, np_dtype): - ''' - For some cases, the computed quantized data might exceed the data range. - For instance, after rounding and addition, we might get `128` for the int8 quantization. - This utility function ensures the val in the data range before doing the cast. - ''' - val = np.minimum(val, high) - val = np.maximum(val, low) - return val.astype(np_dtype) - - mode = mode.upper() - mode_dtype_to_range = { - (types.int8, "LINEAR"): (-128, 127), - (types.int8, "LINEAR_SYMMETRIC"): (-127, 127), - (types.uint8, "LINEAR"): (0, 255), - (types.uint8, "LINEAR_SYMMETRIC"): (0, 254), - } - - if not isinstance(val, (np.ndarray, np.generic)): - raise ValueError("Only numpy arrays are supported") - - params = AffineQuantParams() - axes = tuple([i for i in range(len(val.shape)) if i != axis]) - val_min = np.amin(val, axis=axes, keepdims=True) - val_max = np.amax(val, axis=axes, keepdims=True) - - if mode == "LINEAR_SYMMETRIC": - # For the linear_symmetric mode, the range is symmetrical to 0 - max_abs = np.maximum(np.abs(val_min), np.abs(val_max)) - val_min = -max_abs - val_max = max_abs - else: - assert mode == "LINEAR" - # For the linear mode, we need to make sure the data range contains `0` - val_min = np.minimum(0.0, val_min) - val_max = np.maximum(0.0, val_max) - - q_val_min, q_val_max = mode_dtype_to_range[(dtype, mode)] - - # Set the zero point to symmetric mode - np_dtype = nptype_from_builtin(dtype) - if mode == "LINEAR_SYMMETRIC": - if dtype == types.int8: - params.zero_point = (0 * np.ones(val_min.shape)).astype(np.int8) - else: - assert dtype == types.uint8 - params.zero_point = (127 * np.ones(val_min.shape)).astype(np.uint8) - else: - assert mode == "LINEAR" - params.zero_point = (q_val_min * val_max - q_val_max * val_min) / (val_max - val_min) - params.zero_point = np.round(params.zero_point) - params.zero_point = _ensure_numerical_range_and_cast(params.zero_point, q_val_min, q_val_max, np_dtype) - - # compute the params - params.scale = (val_max - val_min) / (q_val_max - q_val_min) - params.scale = params.scale.astype(val.dtype).squeeze() - - params.quantized_data = np.round( - val * (q_val_max - q_val_min) / (val_max - val_min) - ) - params.quantized_data = (params.quantized_data + params.zero_point) - params.quantized_data = _ensure_numerical_range_and_cast(params.quantized_data, q_val_min, q_val_max, np_dtype) - - params.zero_point = params.zero_point.squeeze() - params.axis = axis - - return params - - @staticmethod - def decompress(params): - if not isinstance(params, AffineQuantParams): - raise ValueError("Invalid type of params") - return constexpr_affine_dequantize.decompress( - params.quantized_data, params.zero_point, params.scale, params.axis - ) - - def transform_op(self, op): - block = op.enclosing_block - quant_params = self.compress(op.val.val, self._get_axis(op), self.mode, self.dtype) - - if not self.fake_compression: - 
new_var = mb.constexpr_affine_dequantize( - quantized_data=quant_params.quantized_data, - zero_point=quant_params.zero_point, - scale=quant_params.scale, - axis=quant_params.axis, - before_op=op, - name=op.name + "_affine_quantized", - ) - else: - decompressed_val = self.decompress(quant_params) - new_var = mb.const( - val=decompressed_val, - before_op=op, - name=op.name + "_fake_affine_quantized", - ) - - op.enclosing_block.replace_uses_of_var_after_op( - anchor_op=op, - old_var=op.outputs[0], - new_var=new_var, - no_check_var_types=True, - ) - - block.remove_ops([op]) - - -class WeightDecompressor(AbstractQuantizationPass): - """ - This graph pass transforms the constexpr ops back into mb.const op. - constexpr ops includes: - (1) constexpr_affine_dequantize - (2) constexpr_lut_to_dense - (3) constexpr_sparse_to_dense - """ - - def __init__(self, op_selector): - super().__init__(op_selector=op_selector) - - def is_valid_op(self, op): - return op.op_type in ( - "constexpr_affine_dequantize", - "constexpr_lut_to_dense", - "constexpr_sparse_to_dense", - ) - - def transform_op(self, op): - block = op.enclosing_block - - decompressed_val = op.value_inference() - new_var = mb.const( - val=decompressed_val, - before_op=op, - name=op.name, - ) - - op.enclosing_block.replace_uses_of_var_after_op( - anchor_op=op, - old_var=op.outputs[0], - new_var=new_var, - no_check_var_types=True, - force_replace=True, - ) - - block.remove_ops([op]) diff --git a/coremltools/converters/mil/mil/passes/graph_pass.md b/coremltools/converters/mil/mil/passes/graph_pass.md index 57783a26d..5bfe6b42d 100644 --- a/coremltools/converters/mil/mil/passes/graph_pass.md +++ b/coremltools/converters/mil/mil/passes/graph_pass.md @@ -1,55 +1,72 @@ -# MIL Graph Pass Guide +# MIL Graph Pass -This guide describes the passes that optimize an MIL Program. +## For Users -## Graph Passes in the Conversion Process +This guide describes the passes that optimize an MIL Program. -The conversion process in Core ML Tools, as described in [Model Intermediate Language](https://coremltools.readme.io/docs/model-intermediate-language#overview), +### Overview + +In Core ML Tools, the conversion process, as described in [Model Intermediate Language](https://coremltools.readme.io/docs/model-intermediate-language#overview), is roughly divided into the following stages based on the model representation: -1. Front-end: PyTorch, TensorFlow (and so on) -> Model Intermediate Language (MIL) Program +1. Frontend (PyTorch/TensorFlow/etc --> Model Intermediate Language (MIL) Program) 2. MIL-based Graph Optimizations -3. Backend: MIL -> NeuralNetworks/MLProgram Proto +3. Backend (MIL --> NeuralNetworks/MLProgram Proto) -The Program is a Python class for Core ML Tools internal in-memory and Pythonic representation. It is the same class you would use when using the Core ML Tools [Python MIL Builder](https://coremltools.readme.io/docs/model-intermediate-language#create-a-mil-program) directly. +The Program is a Python class for Core ML Tools's internal in-memory and Pythonic representation. It's the same class you would use when using the Core ML Tools [Python MIL Builder](https://coremltools.readme.io/docs/model-intermediate-language#create-a-mil-program) directly. The Program consists of a `main` function implemented as a [`Block`](https://github.com/apple/coremltools/blob/main/coremltools/converters/mil/mil/block.py). Each `Block` contains a list of [`Operators`](https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html). 
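To make that concrete, here is a toy Program built directly with the MIL Builder; the ops are arbitrary and the snippet only mirrors the kind of program the pass examples below operate on:

```python
from coremltools.converters.mil import Builder as mb

# `main` is a Block; each builder call below appends an Operation to it.
@mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))])
def prog(x):
    x = mb.relu(x=x, name="relu_1")
    x = mb.relu(x=x, name="relu_2")  # redundant relu that merge_consecutive_relus can fold
    return mb.add(x=x, y=1.0, name="out")

print(prog)  # prints the textual MIL representation of the program
```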
Passes are applied to the Program representation to simplify and canonicalize it. -During each conversion, the graph passes are specified in a pass pipeline (the `pass_pipeline` parameter in [`ct.convert`](https://apple.github.io/coremltools/source/coremltools.converters.convert.html#coremltools.converters._converters_entry.convert)). +During each conversion, the graph passes are specified in a pass pipeline (the `pass_pipeline` parameter in `ct.convert`). All available passes are recorded in `_PIPELINE_NAME_TO_PASSES` in `ct.PassPipeline`. -For a detailed description of each pass, including what each pass does with examples, see the -[MIL Graph Passes in the coremltools API Reference](https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.passes.defs.html#mil-graph-passes). +For a detailed description of each pass (including what it does and examples), see the +[coremltools API Reference](https://apple.github.io/coremltools/index.html). You can find the code for all MIL passes at `coremltools/converters/mil/mil/passes/defs`. -In addition to using the default setting (`ct.PassPipeline()`), you can do the following: +In addition to using the default setting, you can: + +* Use predefined builtin PassPipelines (see [Predefined Builtin PassPipeline](#predefined-builtin-passpipeline)). We provide a set of predefined, commonly used pass pipelines that you can call directly. * Decide which passes and the order of passes to run (see -[Specify Passes To Run](#specify-passes-to-run)). For example: - - Switching off certain fusions to correctly export Phoenix-optimized models for palettization. +[Specify Passes To Run](#specify-passes-to-run)). For example: - Switching off certain fusions to correctly export Phoenix-optimized models for palettization. - Skipping all passes to keep the MIL Program untouched. -* Set options for a specific pass to control the behaviour (see [Set Pass Option](#set-pass-option)). For example: +* Set options for a specific pass to control the behaviour (see [Set Pass Option](#set-pass-option)). For example: - Setting a threshold in a constant elimination pass to trade off computation and model size. - Skipping ops in fp16 quantization casting. * Define a custom graph pass to do fully customized optimization on the Program (see [Define Custom Graph Pass](#define-custom-graph-pass)). +### Predefined Builtin PassPipeline + +We provide a set of predefined, commonly used pass pipelines. They include: + +* `coremltools.PassPipeline.EMPTY`: This skips all passes. + +* `coremltools.PassPipeline.DEFAULT`: This is used by the converter by default. + +* `coremltools.PassPipeline.CLEANUP`: This contains cleanup graph passes, such as `const_elimination` and `dead_code_elimination`. + +* `coremltools.PassPipeline.DEFAULT_PALETTIZATION`: This is used for the conversion of a palettized source model. + +* `coremltools.PassPipeline.DEFAULT_PRUNING`: This is used for the conversion of a sparse source model. -## Specify Passes To Run +### Specify Passes To Run -If no pass pipeline is specified, the default pipeline is used: +If no pass pipeline is specified, a default pipeline will be used: ```python # The following two conversions are equivalent.
-ct.convert(model, pass_pipeline=ct.PassPipeline()) +ct.convert(model, pass_pipeline=ct.PassPipeline.DEFAULT) ct.convert(model) ``` To skip all passes, use an empty pipeline: ```python -pipeline = ct.PassPipeline.get_empty_pipeline() +pipeline = ct.PassPipeline.EMPTY ct.convert(model, pass_pipeline=pipeline) ``` @@ -66,7 +83,7 @@ ct.convert(model, pass_pipeline=pipeline) To inspect passes and their corresponding indexes in the pipeline: ```python -pipeline = ct.PassPipeline() +pipeline = ct.PassPipeline.DEFAULT # Find indexes of a specific pass. pass_indexes = [ idx @@ -79,19 +96,21 @@ You can skip specific passes to avoid unwanted side effects. For example, to avo the `conv` and `batchnorm`: ```python -pipeline = ct.PassPipeline() +pipeline = ct.PassPipeline.DEFAULT pipeline.remove_passes({"common::fuse_conv_batchnorm"}) ct.convert(model, pass_pipeline=pipeline) ``` -## Set Pass Option +### Set Pass Option -You can set options specific to a certain pass. Each pass option is an attribute of the corresponding pass class. +You can set options specific to a certain pass. +Each pass option is an attribute of the corresponding pass class. +In the following example, you can see how `skip_const_by_size` is supported in `const_elimination`. You can also add options to existing passes or your custom passes. -The following example shows how to avoid folding too-large `const` ops that would lead to a large model, and in the example you can see how `skip_const_by_size` is supported in `const_elimination`: +The following example shows how to avoid folding too-large `const` ops that would lead to a large model: ```python -pipeline = ct.PassPipeline() +pipeline = ct.PassPipeline.DEFAULT pipeline.set_options("common::const_elimination", {"skip_const_by_size": "1e6"}) ct.convert(model, pass_pipeline=pipeline) ``` @@ -101,7 +120,7 @@ You can also add options to existing passes or to your custom passes. Another example is to skip ops during an fp16 quantization pass: ```python -pipeline = ct.PassPipeline() +pipeline = ct.PassPipeline.DEFAULT pipeline.set_options("common::add_fp16_cast", {"skip_ops_by_type": "mul,const"}) ct.convert(model, pass_pipeline=pipeline) ``` @@ -157,7 +176,7 @@ main[CoreML5](%input: (1, 2, 3, fp32)(Tensor)) { You can then use the `pass_pipeline` API to remove that pass from the pipeline with the following code: ```python -pipeline = ct.PassPipeline() +pipeline = ct.PassPipeline.DEFAULT pipeline.remove_passes(["common::merge_consecutive_relus"]) converted_model = ct.convert( traced_model, @@ -237,7 +256,7 @@ class my_merge_consecutive_relus(AbstractGraphPass): You can then remove the `common::merge_consecutive_relus` and insert your custom pass `mypass::my_merge_consecutive_relus`: ```python -pipeline = ct.PassPipeline() +pipeline = ct.PassPipeline.DEFAULT # Find the index of the merge_consecutive_relus pass, where we will insert our custom pass. 
pass_index = pipeline.passes.index("common::merge_consecutive_relus") pipeline.remove_passes(["common::merge_consecutive_relus"]) diff --git a/coremltools/converters/mil/mil/passes/graph_pass.py b/coremltools/converters/mil/mil/passes/graph_pass.py index 14869c50f..6a39628a9 100644 --- a/coremltools/converters/mil/mil/passes/graph_pass.py +++ b/coremltools/converters/mil/mil/passes/graph_pass.py @@ -21,10 +21,6 @@ class PassOption: def __init__(self, option_name: Text, option_val: Union[Text, Callable[[Operation], bool]]): if not isinstance(option_name, Text): raise ValueError(f"The option name should be text, but got {type(option_name)}") - if not isinstance(option_val, Text) and not isinstance(option_val, Callable): - raise ValueError( - f"The option value should be text or callable, but got {type(option_val)}" - ) self._option_name = option_name self._option_val = option_val diff --git a/coremltools/converters/mil/mil/passes/helper.py b/coremltools/converters/mil/mil/passes/helper.py index d9dddca8e..2ce4d2f12 100644 --- a/coremltools/converters/mil/mil/passes/helper.py +++ b/coremltools/converters/mil/mil/passes/helper.py @@ -10,6 +10,12 @@ from coremltools.converters.mil.mil import Block, Operation, Var from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass +class classproperty(property): + """ + A decorator class that allow us to have a class-level property + """ + def __get__(self, owner, cls): + return self.fget(cls) def block_context_manager(func): """ diff --git a/coremltools/converters/mil/mil/passes/pass_pipeline.py b/coremltools/converters/mil/mil/passes/pass_pipeline.py index 359876e9f..687930038 100644 --- a/coremltools/converters/mil/mil/passes/pass_pipeline.py +++ b/coremltools/converters/mil/mil/passes/pass_pipeline.py @@ -13,17 +13,32 @@ from coremltools.converters._profile_utils import _profile from coremltools.converters.mil import Program from coremltools.converters.mil.mil.passes.graph_pass import PassOption +from coremltools.converters.mil.mil.passes.helper import classproperty as _classproperty from coremltools.converters.mil.mil.passes.pass_registry import PASS_REGISTRY _COMMON_PASSES: List[Text] = [ "common::lower_complex_dialect_ops", "common::update_output_dtypes", "common::cast_optimization", + "common::noop_elimination", + # quantization pass 1: canonicalize zero point + # always start quantization passes with canonicalizations + "common::nullify_redundant_quantization_zero_point", + # quantization pass 2: remove redundancy + # remove redundancy after canonicalization but before anything else + "common::dequantize_quantize_pair_elimination", + # the main quantization passes + "common::distributive_quantized_binary_op_scale_normalization", + # the last quantization pass: replace const dequantize with constexpr + # after all quantization passes, since constexpr will not be further optimized + # before const elimination, otherwise const dequantize would get bloated + "common::dequantize_to_constexpr", "common::const_elimination", "common::sanitize_input_output_names", "common::divide_to_multiply", "common::add_conv_transpose_output_shape", "common::const_elimination", + "common::const_deduplication", # after all consts have been settled "common::loop_invariant_elimination", "common::remove_symbolic_reshape", "common::noop_elimination", @@ -82,6 +97,7 @@ "common::const_elimination", "common::cast_optimization", "common::const_elimination", + "common::const_deduplication", # after all consts have been settled 
"common::loop_invariant_elimination", "common::noop_elimination", "common::dedup_op_and_var_names", @@ -91,7 +107,15 @@ "common::dead_code_elimination", # always end with dce ] -_FRONTEND_TORCH_PASSES = [ +_PALETTIZATION_PASSES: List[Text] = [ + "compression::palettize_weights", +] + +_SPARSIFICATION_PASSES: List[Text] = [ + "compression::prune_weights", +] + +_FRONTEND_TORCH_PASSES: List[Text] = [ "common::dead_code_elimination", "common::loop_invariant_elimination", "common::dead_code_elimination", @@ -99,7 +123,7 @@ "torch::torch_tensor_assign_to_core", ] -_FRONTEND_TF1_PASSES = [ +_FRONTEND_TF1_PASSES: List[Text] = [ "common::dead_code_elimination", "common::loop_invariant_elimination", "tensorflow::backfill_make_list_elem_type", @@ -112,7 +136,7 @@ "tensorflow::expand_tf_lstm", ] -_FRONTEND_TF2_PASSES = [ +_FRONTEND_TF2_PASSES: List[Text] = [ "common::dead_code_elimination", "common::loop_invariant_elimination", # tensorflow2::remove_vacuous_cond should come before @@ -128,12 +152,13 @@ "tensorflow::expand_tf_lstm", ] -_BACKEND_MIL_PASSES = [ +_BACKEND_MIL_PASSES: List[Text] = [ "common::const_elimination", "mil_backend::adjust_io_to_supported_types", "mil_backend::insert_image_preprocessing_ops", "mil_backend::fuse_activation_silu", "common::const_elimination", # rank0_expand_dims_swap might introduce some new const tensor + "common::const_deduplication", # after all consts have been settled "common::cast_optimization", "common::dead_code_elimination", "mil_backend::sanitize_name_strings", @@ -141,11 +166,12 @@ "nn_backend::handle_unused_inputs", # must come after dce. ] -_BACKEND_NN_PASSES = [ +_BACKEND_NN_PASSES: List[Text] = [ "nn_backend::decompose_conv1d", # at the beginning of nn pass "nn_backend::commingle_loop_vars", "nn_backend::handle_return_inputs_as_outputs", "common::const_elimination", + "common::const_deduplication", # after all consts have been settled # "remove_redundant_ops" pass should be applied towards the end, once other graph passes have done their optimizations. # For instance, it should come after passes such as "reduce_transpose" that can introduce redundant transposes # in the network (while reducing the total number of transposes), and after passes such as "fuse_layernorm_or_instancenorm" @@ -165,19 +191,19 @@ class PassPipeline: .. sourcecode:: python - pipeline = PassPipeline() + pipeline = ct.PassPipeline.DEFAULT Create an empty pipeline (this will result in no graph passes being applied to the model): .. sourcecode:: python - pipeline = PassPipeline.get_empty_pipeline() + pipeline = ct.PassPipeline.EMPTY Add passes to pipeline: .. sourcecode:: python - pipeline=ct.PassPipeline() + pipeline = ct.PassPipeline.DEFAULT pipeline.append_pass("common::reduce_transposes") pipeline.insert_pass(index=0, pass_name="common::reduce_transposes") # Can also specify all passes by setting the passes of the pipeline. @@ -199,19 +225,28 @@ class PassPipeline: # Get all passes. pass_names = pipeline.passes # Find indexes of a specific pass. - pass_indexes = [idx for idx, pass_name in enumerate(pass_names) if pass_names[idx] == "common::reduce_transposes"] + pass_indexes = [ + idx + for idx, pass_name in enumerate(pass_names) + if pass_names[idx] == "common::reduce_transposes" + ] Set options for a specific pass: .. 
sourcecode:: python - pipeline=ct.PassPipeline() - pipeline.set_options(pass_name="common::const_elimination", options={"skip_const_by_size": - "100000"}, override=False) + pipeline = ct.PassPipeline.DEFAULT + pipeline.set_options( + pass_name="common::const_elimination", + options={"skip_const_by_size": "100000"}, + ) """ _PIPELINE_NAME_TO_PASSES = { "default": _COMMON_PASSES + _CLEANUP_PASSES, + "cleanup": _CLEANUP_PASSES, + "default_palettization": _PALETTIZATION_PASSES + _COMMON_PASSES + _CLEANUP_PASSES, + "default_sparsification": _SPARSIFICATION_PASSES + _COMMON_PASSES + _CLEANUP_PASSES, "empty": [], # Frontend pipelines. "frontend_milinternal": [], @@ -286,17 +321,16 @@ def get_all_options(self) -> Dict[Text, List[PassOption]]: """Gets all options in the pipeline.""" return self._pass_options - def set_options(self, pass_name: Text, options: Dict[Text, Text], override: bool = False): + def set_options(self, pass_name: Text, options: Dict[Text, Text], override: bool = True): """Sets options for a specific pass.""" - if self._pass_options.get(pass_name, None) and not override: - raise ValueError(f"The pass {pass_name} already has associated options.") + if self._pass_options.get(pass_name, None): + if not override: + raise ValueError(f"The pass {pass_name} already has associated options.") + else: + logger.warning(f"The pass {pass_name} already has associated options. Override the existing options.") + pass_options: List[PassOption] = [] for option_name, option_val in options.items(): - if not (isinstance(option_name, str) and isinstance(option_val, str)): - raise ValueError( - f"The options must be specified by Dict[Text, Text], but got " - f"Dict[{type(option_name)}, {type(option_val)}]" - ) pass_option = PassOption(option_name=option_name, option_val=option_val) pass_options.append(pass_option) self._pass_options[pass_name] = pass_options @@ -321,25 +355,11 @@ def validate(self): f"pipeline: {self._pass_names}" ) - @staticmethod - def get_empty_pipeline() -> PassPipeline: - """Creates an empty pipeline without any pass.""" - return PassPipeline(pass_names=[]) - @staticmethod def get_pipeline(pipeline_name: Text) -> PassPipeline: """ Gets a pipeline based on the name. Raises an error if no pipeline is found. 
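For illustration, a minimal sketch of how the predefined pipelines defined below are meant to be used; `model` is a placeholder for the source model, and the palettized/pruned cases assume the source model's weights were already compressed:

```python
import coremltools as ct

# Convert a model whose weights were already palettized in the source framework;
# the dedicated compression pass then runs before the common passes.
mlmodel_palettized = ct.convert(
    model, pass_pipeline=ct.PassPipeline.DEFAULT_PALETTIZATION
)

# Convert a sparse (pruned) source model.
mlmodel_pruned = ct.convert(model, pass_pipeline=ct.PassPipeline.DEFAULT_PRUNING)
```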
- Available Pipelines: - - "default": _COMMON_PASSES + _CLEANUP_PASSES - - "empty": empty - - "frontend_pytorch": _FRONTEND_TORCH_PASSES - - "frontend_tensorflow": _FRONTEND_TF1_PASSES - - "frontend_tensorflow2": _FRONTEND_TF2_PASSES - - "frontend_milinternal": empty - - "backend_mlprogram": _BACKEND_MIL_PASSES - - "backend_neuralnetwork": _BACKEND_NN_PASSES - - "backend_milinternal": empty + Available Pipelines are defined in _PIPELINE_NAME_TO_PASSES """ if pipeline_name not in PassPipeline._PIPELINE_NAME_TO_PASSES: raise ValueError( @@ -348,8 +368,55 @@ def get_pipeline(pipeline_name: Text) -> PassPipeline: ) return PassPipeline(PassPipeline._PIPELINE_NAME_TO_PASSES[pipeline_name], pipeline_name) + """ + ======================================= + Pre-defined PassPipeline configurations + ======================================= + """ + @_classproperty + def EMPTY(cls) -> PassPipeline: + """Creates an empty pipeline without any pass.""" + return PassPipeline(pass_names=[]) + + @_classproperty + def DEFAULT(cls) -> PassPipeline: + """Creates a pipeline that the converter uses by default.""" + return PassPipeline.get_pipeline("default") + + @_classproperty + def CLEANUP(cls) -> PassPipeline: + """Create a pipeline that contains cleanup passes.""" + return PassPipeline.get_pipeline("cleanup") + + @_classproperty + def DEFAULT_PALETTIZATION(cls) -> PassPipeline: + """Create a default palettization pipeline to convert a compressed source model""" + # We use delayed import to avoid circular import + from coremltools.optimize.coreml import OpPalettizerConfig, OptimizationConfig + pipeline = PassPipeline.get_pipeline("default_palettization") + + # set default palettization + config = OptimizationConfig(global_config=OpPalettizerConfig(mode="unique")) + pipeline.set_options("compression::palettize_weights", {"config": config}) + return pipeline + + @_classproperty + def DEFAULT_PRUNING(cls) -> PassPipeline: + """Create a default sparsification pipeline to convert a compressed source model""" + # We use delayed import to avoid circular import + from coremltools.optimize.coreml import OpThresholdPrunerConfig, OptimizationConfig + pipeline = PassPipeline.get_pipeline("default_sparsification") + + # set default sparsification + config = OptimizationConfig( + global_config=OpThresholdPrunerConfig( + threshold=1e-3, + ) + ) + pipeline.set_options("compression::prune_weights", {"config": config}) + return pipeline -class PipelineManager: +class PassPipelineManager: @staticmethod @_profile def apply_pipeline(prog: Program, pass_pipeline: PassPipeline): diff --git a/coremltools/converters/mil/mil/passes/tests/test_pass_pipeline.py b/coremltools/converters/mil/mil/passes/tests/test_pass_pipeline.py index 553f6072c..d48103a55 100644 --- a/coremltools/converters/mil/mil/passes/tests/test_pass_pipeline.py +++ b/coremltools/converters/mil/mil/passes/tests/test_pass_pipeline.py @@ -7,7 +7,7 @@ import pytest from coremltools.converters.mil.mil import Builder as mb -from coremltools.converters.mil.mil.passes.pass_pipeline import PassPipeline, PipelineManager +from coremltools.converters.mil.mil.passes.pass_pipeline import PassPipeline, PassPipelineManager from coremltools.converters.mil.testing_utils import assert_model_is_valid, get_op_types_in_program np.random.seed(1984) @@ -23,10 +23,10 @@ def prog(x): return x assert get_op_types_in_program(prog) == ["relu", "relu", "add"] - pipeline = PassPipeline.get_empty_pipeline() + pipeline = PassPipeline.EMPTY pipeline.append_pass("common::merge_consecutive_relus") 
assert pipeline.passes == ["common::merge_consecutive_relus"] - PipelineManager.apply_pipeline(prog, pipeline) + PassPipelineManager.apply_pipeline(prog, pipeline) assert get_op_types_in_program(prog) == ["relu", "add"] inputs = {"x": (2, 3)} @@ -37,7 +37,7 @@ def prog(x): ) def test_insert_pass_at_index(self): - pipeline = PassPipeline.get_empty_pipeline() + pipeline = PassPipeline.EMPTY pipeline.insert_pass(index=0, pass_name="common::merge_consecutive_relus") pipeline.insert_pass(index=0, pass_name="common::noop_elimination") pipeline.insert_pass(index=1, pass_name="common::noop_elimination") @@ -50,7 +50,7 @@ def test_insert_pass_at_index(self): ] def test_insert_invalid_pass(self): - pipeline = PassPipeline.get_empty_pipeline() + pipeline = PassPipeline.EMPTY with pytest.raises(ValueError, match="The pass test_pass is not registered."): pipeline.append_pass("test_pass") with pytest.raises(ValueError, match="The pass test_pass is not registered."): @@ -59,7 +59,7 @@ def test_insert_invalid_pass(self): pipeline.passes = ["invalid_pass"] def test_remove_passes(self): - pipeline = PassPipeline.get_empty_pipeline() + pipeline = PassPipeline.EMPTY pipeline.passes = [ "common::noop_elimination", "common::merge_consecutive_reshapes", @@ -75,7 +75,7 @@ def test_remove_passes(self): assert pipeline.passes == ["common::merge_consecutive_reshapes"] def test_set_pass_options(self): - pipeline = PassPipeline.get_empty_pipeline() + pipeline = PassPipeline.EMPTY pipeline.append_pass("common::add_fp16_cast") assert pipeline.get_options("common::add_fp16_cast") is None pipeline.set_options("common::add_fp16_cast", {"skip_ops_by_type": "matmul,const"}) @@ -89,14 +89,14 @@ def test_set_pass_options_already_exist(self): with pytest.raises( ValueError, match="The pass common::add_fp16_cast already has associated options." ): - pipeline.set_options("common::add_fp16_cast", {"skip_ops_by_type": "concat"}) + pipeline.set_options("common::add_fp16_cast", {"skip_ops_by_type": "concat"}, override=False) # Override the options. 
- pipeline.set_options("common::add_fp16_cast", {"skip_ops_by_type": "concat"}, override=True) + pipeline.set_options("common::add_fp16_cast", {"skip_ops_by_type": "concat"}) assert pipeline.get_options("common::add_fp16_cast")[0].option_name == "skip_ops_by_type" assert pipeline.get_options("common::add_fp16_cast")[0].option_val == "concat" def test_set_pass_options_for_pass_not_in_pipeline(self): - pipeline = PassPipeline.get_empty_pipeline() + pipeline = PassPipeline.EMPTY pipeline.set_options("common::add_fp16_cast", {"skip_ops_by_type": "matmul,const"}) with pytest.raises( ValueError, diff --git a/coremltools/converters/mil/mil/passes/tests/test_passes.py b/coremltools/converters/mil/mil/passes/tests/test_passes.py index 193b101d3..4630be19c 100644 --- a/coremltools/converters/mil/mil/passes/tests/test_passes.py +++ b/coremltools/converters/mil/mil/passes/tests/test_passes.py @@ -18,7 +18,6 @@ ) from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import Function, Program, Symbol, get_new_symbol, types -from coremltools.converters.mil.mil.passes.defs import quantization from coremltools.converters.mil.mil.passes.defs.cleanup import topological_reorder from coremltools.converters.mil.mil.passes.helper import _check_var_scalar_value from coremltools.converters.mil.mil.passes.pass_registry import PASS_REGISTRY @@ -32,11 +31,170 @@ get_op_names_in_program, get_op_types_in_program, ) +from coremltools.models.utils import _macos_version + +import coremltools.optimize as cto np.random.seed(1984) _VALIDATE_MODEL = True +class TestConstDeduplication: + def test_const_deduplication(self): + BATCH_DIM = 5 + SEQUENCE_LENGTH = 4 + ENCODING_DIM = 256 + EMBEDDING_DIM = 128 + weight = np.random.rand(EMBEDDING_DIM, ENCODING_DIM) + bias = np.random.rand(EMBEDDING_DIM) + + @mb.program( + input_specs=[ + mb.TensorSpec(shape=(BATCH_DIM, SEQUENCE_LENGTH, ENCODING_DIM)), + mb.TensorSpec(shape=(BATCH_DIM, SEQUENCE_LENGTH, ENCODING_DIM)), + ] + ) + def prog(q, k): + q_e = mb.linear(x=q, weight=weight, bias=bias) + k_e = mb.linear(x=k, weight=weight, bias=bias) + attention = mb.matmul(x=q_e, y=k_e, transpose_y=True) + return attention + + prev_prog, _, _ = apply_pass_and_basic_check(prog, "common::const_deduplication") + assert_op_count_match(prev_prog, expect=6, op="const") + assert_op_count_match(prog, expect=4, op="const") + + def test_constexpr_deduplication(self): + BATCH_DIM = 5 + SEQUENCE_LENGTH = 4 + ENCODING_DIM = 256 + EMBEDDING_DIM = 128 + quantized_weight = np.random.randint( + -128, 128, size=(EMBEDDING_DIM, ENCODING_DIM), dtype=np.int8 + ) + quantized_bias = np.random.randint(-128, 128, size=EMBEDDING_DIM, dtype=np.int8) + + @mb.program( + input_specs=[ + mb.TensorSpec(shape=(BATCH_DIM, SEQUENCE_LENGTH, ENCODING_DIM)), + mb.TensorSpec(shape=(BATCH_DIM, SEQUENCE_LENGTH, ENCODING_DIM)), + ] + ) + def prog(q, k): + weight_q = mb.constexpr_affine_dequantize( + quantized_data=quantized_weight, + zero_point=np.int8(0), + scale=np.float32(1.0), + axis=0, + ) + weight_k = mb.constexpr_affine_dequantize( + quantized_data=quantized_weight, + zero_point=np.int8(0), + scale=np.float32(1.0), + axis=0, + ) + bias_q = mb.constexpr_affine_dequantize( + quantized_data=quantized_bias, + zero_point=np.int8(0), + scale=np.float32(1.0), + axis=0, + ) + bias_k = mb.constexpr_affine_dequantize( + quantized_data=quantized_bias, + zero_point=np.int8(0), + scale=np.float32(1.0), + axis=0, + ) + q_e = mb.linear(x=q, weight=weight_q, bias=bias_q) + k_e = mb.linear(x=k, 
weight=weight_k, bias=bias_k) + attention = mb.matmul(x=q_e, y=k_e, transpose_y=True) + return attention + + prev_prog, _, _ = apply_pass_and_basic_check(prog, "common::const_deduplication") + assert_op_count_match(prev_prog, expect=4, op="constexpr_affine_dequantize") + assert_op_count_match(prog, expect=2, op="constexpr_affine_dequantize") + + def test_const_deduplication_as_outputs(self): + """ + If the duplicated constants are block outputs, we should not remove them. + """ + # case 1: + # const_2 can be eliminated since it is not block output + const = np.random.rand(40, 20, 30) + + @mb.program( + input_specs=[ + mb.TensorSpec( + shape=( + 40, + 20, + 30, + ) + ) + ] + ) + def prog(x): + const_1 = mb.const(val=const, name="const_1") + const_2 = mb.const(val=const, name="const_2") + x = mb.relu(x=x) + x = mb.add(x=x, y=const_2) + return x, const_1 + + prev_prog, _, _ = apply_pass_and_basic_check(prog, "common::const_deduplication") + assert_op_count_match(prev_prog, expect=2, op="const") + assert_op_count_match(prog, expect=1, op="const") + assert prog.functions["main"].outputs[1].name == "const_1" + + # case 2: + # const_2 can not be eliminated since it is a block output + const = np.random.rand(40, 20, 30) + + @mb.program( + input_specs=[ + mb.TensorSpec( + shape=( + 40, + 20, + 30, + ) + ) + ] + ) + def prog(x): + const_1 = mb.const(val=const, name="const_1") + const_2 = mb.const(val=const, name="const_2") + x = mb.relu(x=x) + x = mb.add(x=x, y=const_2) + return x, const_1, const_2 + + prev_prog, _, _ = apply_pass_and_basic_check(prog, "common::const_deduplication") + assert_op_count_match(prev_prog, expect=2, op="const") + assert_op_count_match(prog, expect=2, op="const") + assert prog.functions["main"].outputs[1].name == "const_1" + assert prog.functions["main"].outputs[2].name == "const_2" + + @pytest.mark.skip("rdar://109374995 consts are not shared across blocks") + def test_const_deduplication_multiple_blocks(self): + weight = np.random.rand(5, 3, 2, 2) + + @mb.program(input_specs=[mb.TensorSpec(shape=(4, 3, 8, 8))]) + def prog(x): + def _true_fn(): + return mb.conv(x=x, weight=weight, pad_type="valid") + + def _false_fn(): + y = mb.mul(x=x, y=2.0) + return mb.conv(x=y, weight=weight, pad_type="valid") + + x_gt_0_tensor = mb.greater(x=x, y=0.0) + x_gt_0 = mb.slice_by_index(x=x_gt_0_tensor, begin=(0, 0, 0, 0), end=(1, 1, 1, 1)) + return mb.cond(pred=x_gt_0, _true_fn=_true_fn, _false_fn=_false_fn) + + prev_prog, _, _ = apply_pass_and_basic_check(prog, "common::const_deduplication") + assert_op_count_match(prev_prog, expect=8, op="const") + assert_op_count_match(prog, expect=6, op="const") + + class TestConstElimination: def test_const_elimination(self): @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) @@ -67,6 +225,27 @@ def prog(x): # Not fold into const because the upstream constexpr_cast op is non-replaceable. 
assert get_op_types_in_program(prog) == ["constexpr_cast", "add", "add"] + def test_force_const_eliminate_nonreplaceable_ops(self): + @mb.program(input_specs=[mb.TensorSpec(shape=(3,), dtype=types.int32)]) + def prog(x): + a = np.random.rand(2, 3, 5).astype(np.float16) + constexpr_a = mb.constexpr_cast(source_val=a, output_dtype="fp32") + double_a = mb.add(x=constexpr_a, y=a.astype(np.float32)) + a_shape = mb.shape(x=double_a) + return mb.add(x=x, y=a_shape) + + assert get_op_types_in_program(prog) == ["constexpr_cast", "add", "shape", "add"] + + apply_pass_and_basic_check(prog, "common::const_elimination") + # still fold shape into const regardless the non-replaceable upstream + # constexpr_cast op, since it only provides a shape + assert get_op_types_in_program(prog) == ["constexpr_cast", "add", "add"] + + apply_pass_and_basic_check(prog, "common::dead_code_elimination") + # constexpr_cast(a) and add(a, a) no longer contributes to output, + # so they should get dead code eliminated + assert get_op_types_in_program(prog) == ["add"] + @patch( "coremltools.converters.mil.mil.passes.defs.cleanup.const_elimination._skip_const_by_size", 1000, @@ -1461,6 +1640,38 @@ def prog(shape): ] assert get_op_types_in_program(prog) == ["cast", "random_uniform", "random_uniform", "add"] + def test_nonreplaceable_vars(self): + """ + Nonreplaceable vars shouldn't be removed, e.g. palettized weights + + const_1----->add---->add_1------| + | | + input add---->output + | | + const_2----->add---->add_2------| + """ + def _constexpr_lut_to_dense(): + lut_data = np.array( + [-19.0, 4.0, 0.0, -1.0, 1.0, 3.0, 5.0, -8.0, 19, 13, 42, 4.5, 5.4, 2.0, -6, -7] + ).astype(np.float32) + indices = np.array([212, 21]).astype(np.uint8) + shape = np.array([4, 1]).astype(np.uint32) + return mb.constexpr_lut_to_dense(lut=lut_data, indices=indices, shape=shape) + + @mb.program(input_specs=[mb.TensorSpec(shape=(4, 1))]) + def prog(x): + constexpr_1 = _constexpr_lut_to_dense() + constexpr_2 = _constexpr_lut_to_dense() + c = mb.add(x=constexpr_1, y=x) + d = mb.add(x=constexpr_2, y=x) + return mb.add(x=c, y=d) + + prev_prog, _, _ = apply_pass_and_basic_check( + prog, + "common::remove_redundant_ops", + ) + assert get_op_types_in_program(prev_prog) == get_op_types_in_program(prog) + class TestTopologicalReorder: def test_move_sink_casts_to_the_end(self): @@ -1955,6 +2166,33 @@ def prog(x): expected_output_shapes={block.outputs[0].name: (3, 5, 6)}, ) + def test_gelu_tanh_multiple_final_operations(self): + """ + The generic pattern matching only supports one final output operation. For multiple final + operations, we want to make sure it just skip the pattern matching instead of failing the + whole conversion. + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(3, 5, 6))]) + def prog(x): + x_1 = mb.mul(x=x, y=0.5) + x_2 = mb.pow(x=x, y=3.0) + x_2 = mb.mul(x=x_2, y=0.044715) + x_2 = mb.add(x=x, y=x_2) + x_2 = mb.mul(x=x_2, y=np.sqrt(2 / np.pi)) + x_2 = mb.tanh(x=x_2) + x_2 = mb.add(x=x_2, y=1.0) + x_2 = mb.mul(x=x_1, y=x_2) + x_2 = mb.mul(x=x_2, y=1.0) + return x_2 + + with pytest.warns( + UserWarning, + match="User defined pattern matched to more than one final operation. 
" + "Skipped the pattern matching.", + ): + apply_pass_and_basic_check(prog, "common::fuse_gelu_tanh_approximation") + @pytest.mark.parametrize( "op_type, is_first_op1, is_first_op2, is_first_op3, is_first_op4, const_mul_first", itertools.product( @@ -2316,8 +2554,8 @@ def _get_constexpr_cast(shape): @staticmethod def _get_constexpr_sparse_to_dense(shape): val = np.random.rand(*shape) - sparse_params = quantization.WeightSparsifier.compress( - val=val, mode="PERCENTILE_MODE", target_percentile=0.4 + sparse_params = cto.coreml._quantization_passes.prune_weights.compress_by_magnitude( + val=val, target_sparsity=0.4 ) return mb.constexpr_sparse_to_dense( nonzero_data=sparse_params.nonzero_data, @@ -2328,7 +2566,7 @@ def _get_constexpr_sparse_to_dense(shape): @staticmethod def _get_constexpr_lut_to_dense(shape): val = np.random.rand(*shape) - lut_params = quantization.WeightPalettizer.compress(val=val, nbits=4, mode="UNIFORM") + lut_params = cto.coreml._quantization_passes.palettize_weights.compress(val=val, nbits=4, mode="UNIFORM") return mb.constexpr_lut_to_dense( indices=lut_params.indices, lut=lut_params.lut, @@ -2338,7 +2576,7 @@ def _get_constexpr_lut_to_dense(shape): @staticmethod def _get_constexpr_affine_dequantize(shape): val = np.random.rand(*shape) - quant_params = quantization.WeightAffineQuantizer.compress( + quant_params = cto.coreml._quantization_passes.linear_quantize_weights.compress( val=val, axis=0, mode="LINEAR_SYMMETRIC", dtype=types.uint8 ) return mb.constexpr_affine_dequantize( @@ -2780,8 +3018,8 @@ def test_success_reduce_consecutive_transposes(self): """ @mb.program(input_specs=[mb.TensorSpec(shape=(1, 2, 3, 4))]) def prog(x): - x1 = mb.transpose(x=x, perm=[0, 2, 1, 3]) - x1 = mb.transpose(x=x1, perm=[3, 2, 0, 1]) + x1 = mb.transpose(x=x, perm=[0, 2, 1, 3]) + x1 = mb.transpose(x=x1, perm=[3, 2, 0, 1]) x2 = mb.transpose(x=x, perm=[3, 2, 1, 0]) x2 = mb.transpose(x=x2, perm=[2, 3, 0, 1]) x2 = mb.transpose(x=x2, perm=[0, 2, 1, 3]) @@ -2809,7 +3047,7 @@ def test_success_reduce_consecutive_transposes_with_output_constrain(self): | | v v output_1 output_2 - + Output: x --> transpose_1 -> transpose_6 -> transpose_7-> add -> output_1 | | @@ -2849,7 +3087,7 @@ def test_not_merge_transposes(self): """ Input: x --> transpose_1 -> add -> transpose_2 -> output - + Output: x --> transpose_1 -> add -> transpose_2 -> output """ @@ -2878,12 +3116,12 @@ def _test_numerical(prog, input_shape, reshape_shape, perm, output_shape): coreml_input = {"x": x} mlmodel = ct.convert(prog, source="milinternal") coreml_output = list(mlmodel.predict(coreml_input).values())[0] - + gt = np.reshape(x, reshape_shape) gt = np.transpose(gt, perm) gt = np.reshape(gt, output_shape) - np.testing.assert_allclose(gt, coreml_output, rtol=1e-03, atol=1e-05) - + np.testing.assert_allclose(gt, coreml_output, rtol=1e-03, atol=1e-05) + def test_rank6(self): input_shape = (1, 2, 3, 4, 5) reshape_shape = (1, 2, 3, 2, 2, 5) @@ -2915,7 +3153,7 @@ def prog(x): x = mb.reshape(x=x, shape=output_shape) return x prev_prog, _, block = apply_pass_and_basic_check(prog, "common::expand_high_rank_reshape_and_transpose") - + prog._check_invalid_tensor_rank() assert get_op_types_in_program(prog) == ["reshape", "transpose", "reshape"] TestExpandHighRankReshapeAndTranspose._test_numerical(prev_prog, input_shape, reshape_shape, perm, output_shape) @@ -2934,7 +3172,7 @@ def prog(x): return x prev_prog, _, block = apply_pass_and_basic_check(prog, "common::expand_high_rank_reshape_and_transpose") - + prog._check_invalid_tensor_rank() 
assert get_op_types_in_program(prog) == ["reshape", "transpose"] * 16 + ["reshape"] TestExpandHighRankReshapeAndTranspose._test_numerical(prev_prog, input_shape, reshape_shape, perm, output_shape) @@ -2953,7 +3191,7 @@ def prog(x): return x, x1 prev_prog, _, block = apply_pass_and_basic_check(prog, "common::expand_high_rank_reshape_and_transpose") - + with pytest.raises(ValueError, match="Core ML only supports tensors with rank <= 5"): prog._check_invalid_tensor_rank() @@ -5519,8 +5757,11 @@ def prog(x, y): class TestFuseOnehotMatmulToGather: - @pytest.mark.parametrize("rank", [1, 2, 3, 4]) - def test_fuse_onehot_matmul_to_gather(self, rank): + @pytest.mark.parametrize( + "backend, rank, opset_version", + itertools.product(backends, [1, 2, 3, 4], [None, ct.target.iOS17]), + ) + def test_fuse_onehot_matmul_to_gather(self, backend, rank, opset_version): """ Input: %2 = one_hot(%1, on_value=1, off_value=0, axis=-1) @@ -5535,7 +5776,10 @@ def test_fuse_onehot_matmul_to_gather(self, rank): vocab_size = 15 embedding_size = 12 - @mb.program(input_specs=[mb.TensorSpec(shape=input_shape, dtype=types.int32)]) + @mb.program( + input_specs=[mb.TensorSpec(shape=input_shape, dtype=types.int32)], + opset_version=opset_version, + ) def prog(x): x = mb.one_hot( indices=x, on_value=1.0, off_value=0.0, axis=-1, one_hot_vector_size=vocab_size @@ -5547,11 +5791,29 @@ def prog(x): prog, "common::fuse_onehot_matmul_to_gather" ) assert get_op_types_in_program(prev_prog) == ["one_hot", "matmul"] - assert get_op_types_in_program(prog) == ["gather"] + if opset_version == ct.target.iOS17: + # Several ops added to make sure indices in iOS17 gather is non-negative. + assert get_op_types_in_program(prog) == [ + "greater_equal", + "shape", + "slice_by_index", + "add", + "select", + "gather", + ] + else: + assert get_op_types_in_program(prog) == ["gather"] + + if opset_version == ct.target.iOS17: + if backend[0] != "mlprogram" or _macos_version() < (14, 0): + pytest.skip("IOS17 target available only on macOS 14+ with mlprogram.") + assert_model_is_valid( prog, {"x": input_shape}, + backend=backend, expected_output_shapes={block.outputs[0].name: input_shape + (embedding_size,)}, + minimum_deployment_target=opset_version, ) @@ -7089,387 +7351,3 @@ def prog(x): if _VALIDATE_MODEL: assert_model_is_valid(prog, {"x": (2, 4)}) - - -class TestCompressionGraphPass: - """ - Most of the numerical tests are already convered in coremltools.tests.ml_program.test_compression_utils. - This test is checking the basic behavior of the graph pass classes. 
- """ - - @staticmethod - def _get_conv_program(): - @mb.program( - input_specs=[mb.TensorSpec(shape=(1, 30, 10, 10))], opset_version=ct.target.iOS16 - ) - def prog(x): - conv_weight = np.random.rand(90, 30, 2, 2).astype(np.float32) - x = mb.conv(x=x, weight=conv_weight) - return x - - return prog - - @pytest.mark.parametrize( - "fake_compression", - [True, False], - ) - def test_affine_quantizer(self, fake_compression): - quantizer = quantization.WeightAffineQuantizer( - fake_compression=fake_compression, op_selector=lambda const: True - ) - prog = self._get_conv_program() - quantizer.apply(prog) - expected_ops = ["constexpr_affine_dequantize", "conv"] if not fake_compression else ["conv"] - assert get_op_types_in_program(prog) == expected_ops - - @pytest.mark.parametrize( - "fake_compression", - [True, False], - ) - def test_weight_sparsifier(self, fake_compression): - quantizer = quantization.WeightSparsifier( - fake_compression=fake_compression, - op_selector=lambda const: True, - mode="percentile_based", - target_percentile=0.75, - ) - prog = self._get_conv_program() - quantizer.apply(prog) - expected_ops = ["constexpr_sparse_to_dense", "conv"] if not fake_compression else ["conv"] - assert get_op_types_in_program(prog) == expected_ops - - @pytest.mark.parametrize( - "fake_compression", - [True, False], - ) - def test_weight_palettization(self, fake_compression): - quantizer = quantization.WeightPalettizer( - fake_compression=fake_compression, - op_selector=lambda const: True, - mode="uniform", - nbits=4, - ) - prog = self._get_conv_program() - quantizer.apply(prog) - expected_ops = ["constexpr_lut_to_dense", "conv"] if not fake_compression else ["conv"] - assert get_op_types_in_program(prog) == expected_ops - - @pytest.mark.parametrize( - "axis, mode, source_dtype, target_dtype, data_range", - itertools.product( - [0, 1, 2, 3, -1], - ["linear", "linear_symmetric"], - [np.float16, np.float32], - [types.uint8, types.int8], - [ - [-1., 1.], - [-3., -1.], - [1., 3.], - # Test corner case of same values - [0., 0.], - [1., 1.], - [-1., -1.], - ] - ), - ) - def test_affine_quantizer_compression(self, axis, mode, source_dtype, target_dtype, data_range): - input_shape = (10, 20, 30, 40) - low, high = data_range - val = np.random.uniform(low, high, input_shape).astype(source_dtype) - - params = quantization.WeightAffineQuantizer.compress(val, axis, mode, target_dtype) - decompressed_val = quantization.WeightAffineQuantizer.decompress(params) - - np.testing.assert_allclose(val, decompressed_val, rtol=1e-02, atol=1e-02) - - @pytest.mark.parametrize( - "mode, nbits, shape", - itertools.product( - ["KMEANS", "UNIFORM", "UNIQUE"], - [1, 2, 4, 6, 8], - [ - (1,), - (1, 1), - (1, 10), - (2, 20), - (3, 7, 9), - (17, 17, 17), - ] - ), - ) - def test_palettizer_compression(self, mode, nbits, shape): - val_size = np.prod(shape) - max_val = 2 ** nbits - val = np.arange(max_val).tolist() - val = np.array(val * (val_size // max_val + 1))[:val_size].astype(np.float32) - params = quantization.WeightPalettizer.compress(val, mode=mode, nbits=nbits) - decompressed_val = quantization.WeightPalettizer.decompress(params) - - # For - # 1. UNIQUE / KMEANS mode - # 2. 
UNIFORM mode with the data range <= tensor size - # We can perfecting re-construct the original value - if (mode in ["UNIQUE", "KMEANS"]) or (mode == "UNIFORM" and max_val <= val_size): - np.testing.assert_allclose(val, decompressed_val, rtol=1e-02, atol=1e-02) - -class TestFP16CastTransform(unittest.TestCase): - """""" - - """ - Input graph: - input -----> square -----> out - - Output graph: - input -----> cast(dtype="fp16") -----> square -----> cast(dtype="fp32") ---> out - """ - - def test_single_input_to_single_operation(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20))]) - def prog(x): - x = mb.square(x=x) - return x - - self.assertEqual(get_op_types_in_program(prog), ["square"]) - - apply_pass_and_basic_check( - prog, quantization.FP16ComputePrecision(op_selector=lambda op: True) - ) - _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") - - self.assertEqual(get_op_types_in_program(prog), ["cast", "square", "cast"]) - - # Asserting first cast configuration - cast_1 = block.find_ops(op_type="cast")[0] - self.assertEqual(cast_1.dtype.val, "fp16") - self.assertEqual(len(cast_1.outputs), 1) - self.assertEqual(len(cast_1.outputs[0].child_ops), 1) - self.assertEqual(cast_1.outputs[0].child_ops[0].op_type, "square") - - # Asserting second cast configuration - cast_2 = block.find_ops(op_type="cast")[1] - self.assertEqual(cast_2.dtype.val, "fp32") - self.assertEqual(len(cast_2.outputs), 1) - self.assertEqual(len(cast_2.outputs[0].child_ops), 0) - - assert_model_is_valid( - prog, - {"x": (10, 20)}, - expected_output_shapes={block.outputs[0].name: (10, 20)}, - ) - - """ - Input graph: - input -----> div -----> out - ^ - const(eps) ---| - - Output graph: - input --------> cast(dtype="fp16") -----> div -----> cast(dtype="fp32") ---> out - ^ - const(eps) ---> cast(dtype="fp16") --------| - """ - - def test_divide_by_zero_operation(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20))]) - def prog(x): - eps = mb.const(val=1e-10) - x = mb.real_div(x=x, y=eps) - return x - - prev_prog, prev_block, block = apply_pass_and_basic_check( - prog, quantization.FP16ComputePrecision(op_selector=lambda op: True) - ) - - mlmodel = ct.convert(prog, source="milinternal", compute_units=ct.ComputeUnit.CPU_ONLY) - input_dict = {"x": np.random.rand(10, 20)} - - if _IS_MACOS: - prediction = mlmodel.predict(input_dict) - assert not np.isnan(prediction["real_div_0"]).any() - assert np.isfinite(prediction["real_div_0"]).all() - - """ - Input graph: - input1 ----->| - concat -----> out - input2 ----->| - - Output graph: - input1 -----> cast(dtype="fp16") ----->| - concat -----> cast(dtype="fp32") ---> out - input2 -----> cast(dtype="fp16") ----->| - - """ - - def test_multiple_inputs_to_single_operation(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20)), mb.TensorSpec(shape=(10, 20))]) - def prog(x, y): - x = mb.concat(values=(x, y), axis=0) - return x - - self.assertEqual(get_op_types_in_program(prog), ["concat"]) - - apply_pass_and_basic_check( - prog, quantization.FP16ComputePrecision(op_selector=lambda op: True) - ) - _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") - - self.assertEqual(get_op_types_in_program(prog), ["cast", "cast", "concat", "cast"]) - - # Asserting first cast configuration - cast_1 = block.find_ops(op_type="cast")[0] - self.assertEqual(cast_1.dtype.val, "fp16") - self.assertEqual(len(cast_1.outputs), 1) - self.assertEqual(len(cast_1.outputs[0].child_ops), 1) - 
self.assertEqual(cast_1.outputs[0].child_ops[0].op_type, "concat") - - # Asserting second cast configuration - cast_2 = block.find_ops(op_type="cast")[1] - self.assertEqual(cast_2.dtype.val, "fp16") - self.assertEqual(len(cast_2.outputs), 1) - self.assertEqual(len(cast_2.outputs[0].child_ops), 1) - self.assertEqual(cast_2.outputs[0].child_ops[0].op_type, "concat") - - # Asserting third cast configuration - cast_3 = block.find_ops(op_type="cast")[2] - self.assertEqual(cast_3.dtype.val, "fp32") - self.assertEqual(len(cast_3.outputs), 1) - self.assertEqual(len(cast_3.outputs[0].child_ops), 0) - - assert_model_is_valid( - prog, - {"x": (10, 20), "y": (10, 20)}, - expected_output_shapes={block.outputs[0].name: (20, 20)}, - ) - - """ - Input graph: - |-----> output_1 - input -----> split - |-----> output_2 - - Output graph: - - |-----> cast(dtype="fp32") ---> output_1 - input -----> cast(dtype="fp16") -----> split - |-----> cast(dtype="fp32") ---> output_2 - - """ - - def test_multiple_outputs_from_single_operation(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20))]) - def prog(x): - x = mb.split(x=x, axis=0, num_splits=2) - return x - - self.assertEqual(get_op_types_in_program(prog), ["split"]) - - apply_pass_and_basic_check( - prog, quantization.FP16ComputePrecision(op_selector=lambda op: True) - ) - _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") - - self.assertEqual(get_op_types_in_program(prog), ["cast", "split", "cast", "cast"]) - - # Asserting first cast configuration - cast_1 = block.find_ops(op_type="cast")[0] - self.assertEqual(cast_1.dtype.val, "fp16") - self.assertEqual(len(cast_1.outputs), 1) - self.assertEqual(len(cast_1.outputs[0].child_ops), 1) - self.assertEqual(cast_1.outputs[0].child_ops[0].op_type, "split") - - # Asserting second cast configuration - cast_2 = block.find_ops(op_type="cast")[1] - self.assertEqual(cast_2.dtype.val, "fp32") - self.assertEqual(len(cast_2.outputs), 1) - self.assertEqual(len(cast_2.outputs[0].child_ops), 0) - - # Asserting third cast configuration - cast_3 = block.find_ops(op_type="cast")[2] - self.assertEqual(cast_3.dtype.val, "fp32") - self.assertEqual(len(cast_3.outputs), 1) - self.assertEqual(len(cast_3.outputs[0].child_ops), 0) - - assert_model_is_valid( - prog, - {"x": (10, 20)}, - expected_output_shapes={block.outputs[0].name: (5, 20), block.outputs[1].name: (5, 20)}, - ) - - """ - Input graph: - - |----> square ---> output_1 - input| - |----> relu ---> output_2 - - Output graph: - - |---->square-----> cast(dtype="fp32") ---> output_1 - input -----> cast(dtype="fp16") - |----> relu -----> cast(dtype="fp32") ---> output_2 - - """ - - def test_single_input_to_multiple_operations(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20))]) - def prog(x): - y = mb.square(x=x) - z = mb.relu(x=x) - return y, z - - self.assertEqual(get_op_types_in_program(prog), ["square", "relu"]) - - apply_pass_and_basic_check( - prog, quantization.FP16ComputePrecision(op_selector=lambda op: True) - ) - _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") - - self.assertEqual(get_op_types_in_program(prog), ["cast", "square", "cast", "relu", "cast"]) - - # Asserting first cast configuration - cast_1 = block.find_ops(op_type="cast")[0] - self.assertEqual(cast_1.dtype.val, "fp16") - self.assertEqual(len(cast_1.outputs), 1) - self.assertEqual(len(cast_1.outputs[0].child_ops), 2) - self.assertEqual(cast_1.outputs[0].child_ops[0].op_type, "square") - 
self.assertEqual(cast_1.outputs[0].child_ops[1].op_type, "relu") - - # Asserting second cast configuration - cast_2 = block.find_ops(op_type="cast")[1] - self.assertEqual(cast_2.dtype.val, "fp32") - self.assertEqual(len(cast_2.outputs), 1) - self.assertEqual(len(cast_2.outputs[0].child_ops), 0) - - # Asserting third cast configuration - cast_3 = block.find_ops(op_type="cast")[2] - self.assertEqual(cast_3.dtype.val, "fp32") - self.assertEqual(len(cast_3.outputs), 1) - self.assertEqual(len(cast_3.outputs[0].child_ops), 0) - - assert_model_is_valid( - prog, - {"x": (10, 20)}, - expected_output_shapes={ - block.outputs[0].name: (10, 20), - block.outputs[1].name: (10, 20), - }, - ) - - def test_duplicate_output_vars(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(1, 2))]) - def prog(x): - relu1 = mb.relu(x=x) - return relu1, relu1 - - _, _, block = apply_pass_and_basic_check( - prog, quantization.FP16ComputePrecision(op_selector=lambda op: True) - ) - self.assertEqual(get_op_types_in_program(prog), ["cast", "relu", "cast"]) - - assert_model_is_valid( - prog, - {"x": (1, 2)}, - expected_output_shapes={block.outputs[0].name: (1, 2), block.outputs[1].name: (1, 2)}, - backend=("mlprogram", "fp16"), - ) diff --git a/coremltools/converters/mil/mil/passes/tests/test_quantization_passes.py b/coremltools/converters/mil/mil/passes/tests/test_quantization_passes.py new file mode 100644 index 000000000..a3da5d655 --- /dev/null +++ b/coremltools/converters/mil/mil/passes/tests/test_quantization_passes.py @@ -0,0 +1,1657 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import itertools +from typing import Tuple +import unittest + +import numpy as np +import parameterized +import pytest + +import coremltools as ct +import coremltools.converters.mil.mil.types as types +from coremltools._deps import _IS_MACOS +from coremltools.converters.mil.mil import Builder as mb +from coremltools.converters.mil.mil.passes.defs import quantization +from coremltools.converters.mil.mil.types import numpy_type_to_builtin_type +from coremltools.converters.mil.testing_utils import ( + apply_pass_and_basic_check, + assert_model_is_valid, + get_op_types_in_program, +) + +np.random.seed(1818) + + +class QuantizationBaseTest: + @staticmethod + def generate_random_quantization_params( + float_dtype: np.dtype, + quant_dtype: np.dtype, + input_shape: Tuple[int], + is_zp_present: bool = True, + is_axis_present: bool = True, + ) -> Tuple[np.ndarray, np.ndarray, int]: + """ + return scale, zero point, axis + """ + + input_rank = len(input_shape) + low, high = (-128, 128) if quant_dtype == np.int8 else (0, 256) + + scale = None + zero_point = None + axis = ( + np.random.randint(-input_rank, input_rank, dtype=np.int32) if is_axis_present else None + ) + if is_axis_present: + scale = np.random.rand(input_shape[axis]).astype(float_dtype) + if is_zp_present: + zero_point = np.random.randint( + low=low, high=high, size=input_shape[axis], dtype=quant_dtype + ) + else: + scale = np.array(np.random.rand()).astype(float_dtype) + if is_zp_present: + zero_point = np.array(np.random.randint(low=low, high=high, dtype=quant_dtype)) + + return scale, zero_point, axis + + @staticmethod + def generate_random_quantize_input( + float_dtype: np.dtype, + quant_dtype: np.dtype, + scale: np.ndarray, + zero_point: np.ndarray, + axis: int, + shape: Tuple[int], + ) -> np.ndarray: + 
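        # Draw random integers in the quantized dtype's range and dequantize them with
        # the given scale / zero point (per-channel when `axis` is set), so the returned
        # float tensor is exactly representable by the matching quantize op:
        #     x_fp = (x_q - zero_point) * scale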
assert float_dtype == scale.dtype + if zero_point is not None: + assert quant_dtype == zero_point.dtype + if axis is not None: + assert shape[axis] == scale.shape[0] + if zero_point is not None and axis is not None: + assert shape[axis] == zero_point.shape[0] + + low, high = (-128, 128) if quant_dtype == np.int8 else (0, 256) + x_q = np.random.randint(low=low, high=high, size=shape, dtype=np.int32) + if axis is None: + if zero_point is None: + x_fp = x_q * scale + else: + x_fp = (x_q - zero_point) * scale + else: + # prepare broadcast shape for latter dequantize + broadcastable_shape = np.ones(len(shape), dtype=np.int32) + broadcastable_shape[axis] = shape[axis] + + broadcasted_scale = np.reshape(scale, broadcastable_shape) + + if zero_point is None: + x_fp = x_q * broadcasted_scale + else: + broadcasted_zero_point = np.reshape(zero_point, broadcastable_shape) + x_fp = (x_q - broadcasted_zero_point) * broadcasted_scale + + return float_dtype(x_fp) + + +class TestNullifyRedundantQuantizationZeroPoint: + @staticmethod + def np_dtype_to_str(np_dtype: np.dtype) -> str: + NP_DTYPE_TO_STR = {np.int8: "int8", np.uint8: "uint8"} + return NP_DTYPE_TO_STR.get(np_dtype) + + @staticmethod + def shift_128(input: np.ndarray, quant_dtype: np.dtype) -> np.ndarray: + """ + shift const input according to zero point shift and dtype change: + int8: -128 -> 0, int8 -> uint8 + uint8: 128 -> 0, uint8 -> int8 + """ + shifted_input: np.ndarray + if quant_dtype == np.int8: + shifted_input = np.uint8(np.int16(input) + 128) + else: + shifted_input = np.int8(np.int16(input) - 128) + return shifted_input + + @pytest.mark.parametrize( + "quant_dtype, is_axis_present", + itertools.product( + (np.int8, np.uint8), + (True, False), + ), + ) + def test_optimize_zp0_quantize(self, quant_dtype, is_axis_present): + """ + initial graph: + input -> quantize(zero_point=0) -> dequantize(scale=1) -> output + + final graph: + input -> quantize() -> dequantize(scale=1) -> output + """ + + SHAPE = (1, 3) + + rank = len(SHAPE) + axis = np.random.randint(-rank, rank) if is_axis_present else None + + scale = np.random.rand(SHAPE[axis]) if is_axis_present else np.random.rand() + + zero_point = np.zeros(SHAPE[axis], dtype=quant_dtype) if is_axis_present else quant_dtype(0) + + @mb.program(input_specs=[mb.TensorSpec(shape=SHAPE)]) + def prog(x): + quantized = mb.quantize( + input=x, + scale=scale, + zero_point=zero_point, + axis=axis, + output_dtype=self.np_dtype_to_str(quant_dtype), + ) + dequantized = mb.dequantize( + input=quantized, + scale=1.0, + ) + return dequantized + + assert get_op_types_in_program(prog) == ["quantize", "dequantize"] + quantize_op = prog.find_ops(op_type="quantize")[0] + assert np.all(quantize_op.zero_point.val == 0) + + _, _, block = apply_pass_and_basic_check(prog, "common::nullify_redundant_quantization_zero_point") + assert get_op_types_in_program(prog) == ["quantize", "dequantize"] + quantize_op = prog.find_ops(op_type="quantize")[0] + assert quantize_op.zero_point is None + + assert_model_is_valid( + prog, + {"x": SHAPE}, + minimum_deployment_target=ct.target.iOS17, + backend=("mlprogram", "fp32"), + expected_output_shapes={block.outputs[0].name: SHAPE}, + ) + + @pytest.mark.parametrize( + "quant_dtype, is_axis_present", + itertools.product( + (np.int8, np.uint8), + (True, False), + ), + ) + def test_optimize_zp0_dequantize(self, quant_dtype, is_axis_present): + """ + initial graph: + input -> quantize(scale=1) -> dequantize(zero_point=0) -> output + + final graph: + input -> quantize(scale=1) -> dequantize() 
-> output + """ + + SHAPE = (6, 5, 4, 3, 2) + + rank = len(SHAPE) + axis = np.random.randint(-rank, rank) if is_axis_present else None + + scale = np.random.rand(SHAPE[axis]) if is_axis_present else np.random.rand() + + zero_point = np.zeros(SHAPE[axis], dtype=quant_dtype) if is_axis_present else quant_dtype(0) + + @mb.program(input_specs=[mb.TensorSpec(shape=SHAPE)]) + def prog(x): + quantized = mb.quantize( + input=x, + scale=1.0, + output_dtype=self.np_dtype_to_str(quant_dtype), + ) + dequantized = mb.dequantize( + input=quantized, + scale=scale, + zero_point=zero_point, + axis=axis, + ) + return dequantized + + assert get_op_types_in_program(prog) == ["quantize", "dequantize"] + dequantize_op = prog.find_ops(op_type="dequantize")[0] + assert np.all(dequantize_op.zero_point.val == 0) + + _, _, block = apply_pass_and_basic_check( + prog, "common::nullify_redundant_quantization_zero_point" + ) + assert get_op_types_in_program(prog) == ["quantize", "dequantize"] + dequantize_op = prog.find_ops(op_type="dequantize")[0] + assert dequantize_op.zero_point is None + + assert_model_is_valid( + prog, + {"x": SHAPE}, + minimum_deployment_target=ct.target.iOS17, + backend=("mlprogram", "fp32"), + expected_output_shapes={block.outputs[0].name: SHAPE}, + ) + + @pytest.mark.parametrize( + "quant_dtype, is_axis_present", + itertools.product( + (np.int8, np.uint8), + (True, False), + ), + ) + def test_optimize_zp128_quantize_dequantize(self, quant_dtype, is_axis_present): + """ + initial graph: + input -> quantize(zero_point=±128) -> dequantize(zero_point=±128) -> output + + final graph: + input -> quantize() -> dequantize() -> output + """ + + SHAPE = (2, 3) + + rank = len(SHAPE) + axis = np.random.randint(-rank, rank) if is_axis_present else None + + scale_quantize = np.random.rand(SHAPE[axis]) if is_axis_present else np.random.rand() + scale_dequantize = np.random.rand(SHAPE[axis]) if is_axis_present else np.random.rand() + + zero_point_value = -128 if quant_dtype == np.int8 else 128 + zero_point = ( + np.full(SHAPE[axis], zero_point_value, dtype=quant_dtype) + if is_axis_present + else quant_dtype(zero_point_value) + ) + + @mb.program(input_specs=[mb.TensorSpec(shape=SHAPE)]) + def prog(x): + quantized = mb.quantize( + input=x, + scale=scale_quantize, + zero_point=zero_point, + axis=axis, + output_dtype=self.np_dtype_to_str(quant_dtype), + ) + dequantized = mb.dequantize( + input=quantized, + scale=scale_dequantize, + zero_point=zero_point, + axis=axis, + ) + return dequantized + + assert get_op_types_in_program(prog) == ["quantize", "dequantize"] + quantize_op = prog.find_ops(op_type="quantize")[0] + dequantize_op = prog.find_ops(op_type="dequantize")[0] + assert np.all(quantize_op.zero_point.val == zero_point_value) + assert np.all(dequantize_op.zero_point.val == zero_point_value) + + prev_prog, _, block = apply_pass_and_basic_check( + prog, "common::nullify_redundant_quantization_zero_point" + ) + assert get_op_types_in_program(prog) == ["quantize", "dequantize"] + quantize_op = prog.find_ops(op_type="quantize")[0] + dequantize_op = prog.find_ops(op_type="dequantize")[0] + assert quantize_op.zero_point is None + assert dequantize_op.zero_point is None + + assert_model_is_valid( + prog, + {"x": SHAPE}, + minimum_deployment_target=ct.target.iOS17, + backend=("mlprogram", "fp32"), + expected_output_shapes={block.outputs[0].name: SHAPE}, + ) + + prev_model = ct.convert(prev_prog, minimum_deployment_target=ct.target.iOS17) + model = ct.convert(prog, minimum_deployment_target=ct.target.iOS17) + + x = 
np.random.rand(*SHAPE) + prev_output = list(prev_model.predict({"x": x}).values())[0] + output = list(model.predict({"x": x}).values())[0] + assert np.all(prev_output == output) + + @pytest.mark.parametrize( + "quant_dtype, is_axis_present", + itertools.product( + (np.int8, np.uint8), + (True, False), + ), + ) + def test_optimize_zp128_const_dequantize(self, quant_dtype, is_axis_present): + """ + initial graph: + input -----------------------| + |-> add -> output + dequantize(zero_point=±128) -| + + apply nullify_redundant_quantization_zero_point: + input --------| + |-> add -> output + dequantize() -| + + final graph: + input -----------------------| + |-> add -> output + constexpr_affine_dequantize -| + """ + + SHAPE = (2, 5, 3) + + quantized = ( + np.random.randint(low=-128, high=128, size=SHAPE, dtype=quant_dtype) + if quant_dtype == np.int8 + else np.random.randint(low=0, high=256, size=SHAPE, dtype=quant_dtype) + ) + + rank = len(SHAPE) + axis = np.random.randint(-rank, rank) if is_axis_present else None + + scale = np.random.rand(SHAPE[axis]) if is_axis_present else np.random.rand() + + zero_point_value = -128 if quant_dtype == np.int8 else 128 + zero_point = ( + np.full(SHAPE[axis], zero_point_value, dtype=quant_dtype) + if is_axis_present + else quant_dtype(zero_point_value) + ) + + @mb.program(input_specs=[mb.TensorSpec(shape=SHAPE)]) + def prog(x): + dequantized = mb.dequantize( + input=quantized, + scale=scale, + zero_point=zero_point, + axis=axis, + ) + # Core ML cannot have a model with idle input and constant outputs + # so append an `add` op to make the model valid + result = mb.add(x=x, y=dequantized) + return result + + assert get_op_types_in_program(prog) == ["dequantize", "add"] + dequantize_op = prog.find_ops(op_type="dequantize")[0] + assert np.all(dequantize_op.input.val == quantized) + assert np.all(dequantize_op.zero_point.val == zero_point_value) + + prev_prog, _, block = apply_pass_and_basic_check( + prog, "common::nullify_redundant_quantization_zero_point" + ) + assert get_op_types_in_program(prog) == ["dequantize", "add"] + dequantize_op = prog.find_ops(op_type="dequantize")[0] + assert np.all(dequantize_op.input.val == self.shift_128(quantized, quant_dtype)) + assert dequantize_op.zero_point is None + + _, _, block = apply_pass_and_basic_check(prog, "common::dequantize_to_constexpr") + assert get_op_types_in_program(prog) == ["constexpr_affine_dequantize", "add"] + + assert_model_is_valid( + prog, + {"x": SHAPE}, + minimum_deployment_target=ct.target.iOS17, + backend=("mlprogram", "fp32"), + expected_output_shapes={block.outputs[0].name: SHAPE}, + ) + + prev_model = ct.convert(prev_prog, minimum_deployment_target=ct.target.iOS17) + model = ct.convert(prog, minimum_deployment_target=ct.target.iOS17) + + x = np.random.rand(*SHAPE) + prev_output = list(prev_model.predict({"x": x}).values())[0] + output = list(model.predict({"x": x}).values())[0] + assert np.all(prev_output == output) + + @pytest.mark.parametrize( + "quant_dtype, is_axis_present", + itertools.product( + (np.int8, np.uint8), + (True, False), + ), + ) + def test_keep_mismatching_quantize_dequantize(self, quant_dtype, is_axis_present): + """ + initial graph: + input -> quantize(zero_point=±128 + perturbation) -> dequantize(zero_point=±128) -> output + + final graph: + input -> quantize(zero_point=±128 + perturbation) -> dequantize(zero_point=±128) -> output + + perturbation may also be applied to dequantize + """ + + SHAPE = (2, 3) + + rank = len(SHAPE) + axis = np.random.randint(-rank, rank) if 
is_axis_present else None + + scale_quantize = np.random.rand(SHAPE[axis]) if is_axis_present else np.random.rand() + scale_dequantize = np.random.rand(SHAPE[axis]) if is_axis_present else np.random.rand() + + zero_point_value = -128 if quant_dtype == np.int8 else 128 + perturbation = np.random.randint(1, 10, dtype=quant_dtype) + zero_point = ( + np.full(SHAPE[axis], zero_point_value, dtype=quant_dtype) + if is_axis_present + else quant_dtype(zero_point_value) + ) + zero_point_perturbed = quant_dtype(zero_point + perturbation) + + perturb_quantize = np.random.rand() < 0.5 + if perturb_quantize: + zero_point_quantize = zero_point_perturbed + zero_point_dequantize = zero_point + else: + zero_point_quantize = zero_point + zero_point_dequantize = zero_point_perturbed + + @mb.program(input_specs=[mb.TensorSpec(shape=SHAPE)]) + def prog(x): + quantized = mb.quantize( + input=x, + scale=scale_quantize, + zero_point=zero_point_quantize, + axis=axis, + output_dtype=self.np_dtype_to_str(quant_dtype), + ) + dequantized = mb.dequantize( + input=quantized, + scale=scale_dequantize, + zero_point=zero_point_dequantize, + axis=axis, + ) + return dequantized + + assert get_op_types_in_program(prog) == ["quantize", "dequantize"] + quantize_op = prog.find_ops(op_type="quantize")[0] + dequantize_op = prog.find_ops(op_type="dequantize")[0] + if perturb_quantize: + assert np.all(quantize_op.zero_point.val == zero_point_perturbed) + assert np.all(dequantize_op.zero_point.val == zero_point) + else: + assert np.all(quantize_op.zero_point.val == zero_point) + assert np.all(dequantize_op.zero_point.val == zero_point_perturbed) + + _, _, block = apply_pass_and_basic_check( + prog, "common::nullify_redundant_quantization_zero_point" + ) + assert get_op_types_in_program(prog) == ["quantize", "dequantize"] + quantize_op = prog.find_ops(op_type="quantize")[0] + dequantize_op = prog.find_ops(op_type="dequantize")[0] + if perturb_quantize: + assert np.all(quantize_op.zero_point.val == zero_point_perturbed) + assert np.all(dequantize_op.zero_point.val == zero_point) + else: + assert np.all(quantize_op.zero_point.val == zero_point) + assert np.all(dequantize_op.zero_point.val == zero_point_perturbed) + + assert_model_is_valid( + prog, + {"x": SHAPE}, + minimum_deployment_target=ct.target.iOS17, + backend=("mlprogram", "fp32"), + expected_output_shapes={block.outputs[0].name: SHAPE}, + ) + + +class TestDequantizeQuantizePairElimination: + @staticmethod + def generate_scale_zp_axis(shape, is_zp_present, is_axis_present): + rank = len(shape) + + axis = None + if is_axis_present: + axis = np.random.randint(-rank, rank, dtype=np.int32) + + scale = np.random.rand(shape[axis]) if is_axis_present else np.random.rand() + + zero_point = None + if is_zp_present: + zero_point = ( + np.random.randint(-128, 128, shape[axis], dtype=np.int8) + if is_axis_present + else np.random.randint(-128, 128, dtype=np.int8) + ) + + return scale, zero_point, axis + + @pytest.mark.parametrize( + "is_zp_present, is_axis_present", + itertools.product( + (True, False), + (True, False), + ), + ) + def test_eliminate_identical_dequantize_quantize(self, is_zp_present, is_axis_present): + """ + Input graph: + input -> quantize0 -> dequantize1 -> quantize1 -> dequantize2 -> add -> quantize2 -> dequantize3 -> output + + Output graph: + input -> quantize0 -> dequantize2 -> add -> quantize2 -> dequantize3 -> output + """ + + SHAPE = (2, 3) + scale, zero_point, axis = self.generate_scale_zp_axis(SHAPE, is_zp_present, is_axis_present) + + 
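        # Why the identical pair is removable (a rough sketch, not the pass itself):
        # dequantize followed by quantize with the *same* scale / zero point / axis
        # reproduces the quantized tensor bit-for-bit, e.g. with s=0.5, zp=3:
        #     x_q = 7  ->  (7 - 3) * 0.5 = 2.0  ->  round(2.0 / 0.5) + 3 = 7
        # so dequantize_1 -> quantize_1 below is an identity and gets dropped, while the
        # dequantize_2 -> add -> quantize_2 sandwich and the outer quantize_0 /
        # dequantize_3 carry real work and are kept.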
@mb.program(input_specs=[mb.TensorSpec(shape=SHAPE, dtype=types.fp32)]) + def prog(x): + # quantize input + quantized_0 = mb.quantize( + input=x, scale=scale, zero_point=zero_point, axis=axis, output_dtype="int8" + ) + # redundant dequantize-quantize pair + dequantized_1 = mb.dequantize( + input=quantized_0, scale=scale, zero_point=zero_point, axis=axis + ) + quantized_1 = mb.quantize( + input=dequantized_1, + scale=scale, + zero_point=zero_point, + axis=axis, + output_dtype="int8", + ) + # dequantize-op-quantize sandwich + dequantized_2 = mb.dequantize( + input=quantized_1, scale=scale, zero_point=zero_point, axis=axis + ) + y = mb.add(x=dequantized_2, y=dequantized_2) + quantized_2 = mb.quantize(input=y, scale=0.1, output_dtype="int8") + # dequantize output + dequantized_3 = mb.dequantize(input=quantized_2, scale=0.1) + return dequantized_3 + + prev_prog, _, _ = apply_pass_and_basic_check( + prog, "common::dequantize_quantize_pair_elimination" + ) + assert get_op_types_in_program(prev_prog) == [ + "quantize", + "dequantize", + "quantize", + "dequantize", + "add", + "quantize", + "dequantize", + ] + # As expected, dequantize_1 -> quantize_1 gets eliminated. + # On the other hand, even with same scales and zero points and axes, + # quantize_0 -> dequantize_2 and quantize_2 -> dequantize_3 are kept. + assert get_op_types_in_program(prog) == [ + "quantize", + "dequantize", + "add", + "quantize", + "dequantize", + ] + + @pytest.mark.parametrize( + "is_zp_present, is_axis_present, is_shifted_zp_present", + itertools.product( + (True, False), + (True, False), + (True, False), + ), + ) + def test_keep_unidentical_dequantize_quantize( + self, is_zp_present, is_axis_present, is_shifted_zp_present + ): + """ + Input graph: + input -> quantize0 -> dequantize1(scale1, zp1) -> quantize1(scale2, zp2) -> dequantize2 -> add -> quantize2 -> dequantize3 -> output + + Nothing changes when dequantize1 and quantize1 have different parameters + """ + + SHAPE = (2, 3, 5) + scale, zero_point, axis = self.generate_scale_zp_axis(SHAPE, is_zp_present, is_axis_present) + + @mb.program(input_specs=[mb.TensorSpec(shape=SHAPE, dtype=types.fp32)]) + def prog(x): + # quantize input + quantized_0 = mb.quantize( + input=x, scale=scale, zero_point=zero_point, axis=axis, output_dtype="int8" + ) + # non-redundant dequantize-quantize pair + # this pattern can emerge from a (future) graph pass + dequantized_1 = mb.dequantize( + input=quantized_0, scale=scale, zero_point=zero_point, axis=axis + ) + if is_zp_present: + # input graph: + # dequantize -> add(y=const) -> quantize + # output graph: + # dequantize -> quantize(zero_point += const / scale) + if is_shifted_zp_present: + shifted_zero_point = ( + (zero_point + 1.0).astype(np.int8) + if is_axis_present + else np.int8(zero_point + 1.0) + ) + quantized_1 = mb.quantize( + input=dequantized_1, + scale=scale, + zero_point=shifted_zero_point, + axis=axis, + output_dtype="int8", + ) + else: + quantized_1 = mb.quantize( + input=dequantized_1, scale=scale, axis=axis, output_dtype="int8" + ) + else: + # input graph: + # dequantize(zero_point=0) -> mul(y=const) -> quantize(zero_point=0) + # output graph: + # dequantize(zero_point=0) -> quantize(scale /= const, zero_point=0) + quantized_1 = mb.quantize( + input=dequantized_1, scale=scale / 2.0, axis=axis, output_dtype="int8" + ) + # dequantize-op-quantize sandwich + dequantized_2 = mb.dequantize( + input=quantized_1, scale=scale, zero_point=zero_point, axis=axis + ) + y = mb.add(x=dequantized_2, y=dequantized_2) + quantized_2 = 
mb.quantize(input=y, scale=0.1, output_dtype="int8") + # dequantize output + dequantized_3 = mb.dequantize(input=quantized_2, scale=0.1) + return dequantized_3 + + prev_prog, _, _ = apply_pass_and_basic_check( + prog, "common::dequantize_quantize_pair_elimination" + ) + assert get_op_types_in_program(prev_prog) == [ + "quantize", + "dequantize", + "quantize", + "dequantize", + "add", + "quantize", + "dequantize", + ] + # nothing gets eliminated + assert get_op_types_in_program(prog) == [ + "quantize", + "dequantize", + "quantize", + "dequantize", + "add", + "quantize", + "dequantize", + ] + + @pytest.mark.parametrize( + "is_zp_present, is_axis_present", + itertools.product( + (True, False), + (True, False), + ), + ) + def test_keep_block_output_dequantize(self, is_zp_present, is_axis_present): + """ + Input graph: + input -> quantize0 -> dequantize1 -> quantize1 -> dequantize2 -> add -> quantize2 -> dequantize3 -> output + + Nothing changes when dequantize1 is a block output + """ + + SHAPE = (2, 3) + scale, zero_point, axis = self.generate_scale_zp_axis(SHAPE, is_zp_present, is_axis_present) + + @mb.program(input_specs=[mb.TensorSpec(shape=SHAPE, dtype=types.fp32)]) + def prog(x): + # quantize input + quantized_0 = mb.quantize( + input=x, scale=scale, zero_point=zero_point, axis=axis, output_dtype="int8" + ) + # redundant dequantize-quantize pair + dequantized_1 = mb.dequantize( + input=quantized_0, scale=scale, zero_point=zero_point, axis=axis + ) + quantized_1 = mb.quantize( + input=dequantized_1, + scale=scale, + zero_point=zero_point, + axis=axis, + output_dtype="int8", + ) + # dequantize-op-quantize sandwich + dequantized_2 = mb.dequantize( + input=quantized_1, scale=scale, zero_point=zero_point, axis=axis + ) + y = mb.add(x=dequantized_2, y=dequantized_2) + quantized_2 = mb.quantize(input=y, scale=0.1, output_dtype="int8") + # dequantize output + dequantized_3 = mb.dequantize(input=quantized_2, scale=0.1) + return dequantized_1, dequantized_3 + + prev_prog, _, _ = apply_pass_and_basic_check( + prog, "common::dequantize_quantize_pair_elimination" + ) + assert get_op_types_in_program(prev_prog) == [ + "quantize", + "dequantize", + "quantize", + "dequantize", + "add", + "quantize", + "dequantize", + ] + # nothing gets eliminated + assert get_op_types_in_program(prog) == [ + "quantize", + "dequantize", + "quantize", + "dequantize", + "add", + "quantize", + "dequantize", + ] + + @pytest.mark.parametrize( + "is_zp_present, is_axis_present", + itertools.product( + (True, False), + (True, False), + ), + ) + def test_keep_multichildren_dequantize(self, is_zp_present, is_axis_present): + """ + Input graph: + |-> quantize1 -> dequantize2 -> add -> quantize2 -> dequantize3 -> output1 + input -> quantize0 -> dequantize1 -| + |-> mul -> quantize -> dequantize -> output2 + + Nothing changes when dequantize1 has multiple children + """ + + SHAPE = (2, 3) + scale, zero_point, axis = self.generate_scale_zp_axis(SHAPE, is_zp_present, is_axis_present) + + @mb.program(input_specs=[mb.TensorSpec(shape=SHAPE, dtype=types.fp32)]) + def prog(x): + # quantize input + quantized_0 = mb.quantize( + input=x, scale=scale, zero_point=zero_point, axis=axis, output_dtype="int8" + ) + # redundant dequantize-quantize pair + dequantized_1 = mb.dequantize( + input=quantized_0, scale=scale, zero_point=zero_point, axis=axis + ) + quantized_1 = mb.quantize( + input=dequantized_1, + scale=scale, + zero_point=zero_point, + axis=axis, + output_dtype="int8", + ) + # dequantize-op-quantize sandwich + dequantized_2 = 
mb.dequantize( + input=quantized_1, scale=scale, zero_point=zero_point, axis=axis + ) + y = mb.add(x=dequantized_2, y=dequantized_2) + quantized_2 = mb.quantize(input=y, scale=0.1, output_dtype="int8") + # dequantize output + dequantized_3 = mb.dequantize(input=quantized_2, scale=0.1) + + # now add another usage of dequantized_1 + z = mb.mul(x=dequantized_1, y=dequantized_1) + quantized_z = mb.quantize(input=z, scale=0.2, output_dtype="int8") + dequantized_z = mb.dequantize(input=quantized_z, scale=0.2) + + return dequantized_3, dequantized_z + + prev_prog, _, _ = apply_pass_and_basic_check( + prog, "common::dequantize_quantize_pair_elimination" + ) + assert get_op_types_in_program(prev_prog) == [ + "quantize", + "dequantize", + "quantize", + "dequantize", + "add", + "quantize", + "dequantize", + "mul", + "quantize", + "dequantize", + ] + # nothing gets eliminated + assert get_op_types_in_program(prog) == [ + "quantize", + "dequantize", + "quantize", + "dequantize", + "add", + "quantize", + "dequantize", + "mul", + "quantize", + "dequantize", + ] + + +@pytest.mark.skipif(ct.utils._macos_version() < (14, 0), reason="Requires Core ML 7") +class TestDistributiveQuantizedBinaryOpScaleNormalization(QuantizationBaseTest): + @pytest.mark.parametrize( + "op_type, has_relu_fusion, input_rank, is_axis_x_present", + itertools.product( + ("add", "sub"), + (True, False), + (1, 3, 5), + (True, False), + ), + ) + def test_normalize(self, op_type, has_relu_fusion, input_rank, is_axis_x_present): + """ + Input graph: + x -> quantize(scale_x) -> dequantize(scale_x) -| + |-> add/sub (-> relu) -> dequantize(scale_z) -> output + y -> quantize(scale_y) -> dequantize(scale_y) -| + + Output graph: + x -> quantize(scale_x) -> dequantize(scale_x/scale_y) -| + |-> add/sub (-> relu) -> dequantize(scale_z/scale_y) -> output + y -> quantize(scale_y) -> dequantize(1.0) -| + + x and y may get swapped to have the one with scalar scale being new "y" + """ + + # if axis_x is present, then axis_y is not, vice versa, + # so that one of scale_x or scale_y is scalar + SHAPE = np.random.randint(1, 5, size=input_rank, dtype=np.int32) + scale_x, zero_point_x, axis_x = self.generate_random_quantization_params( + np.float32, np.int8, SHAPE, True, is_axis_x_present + ) + scale_y, zero_point_y, axis_y = self.generate_random_quantization_params( + np.float32, np.int8, SHAPE, True, not is_axis_x_present + ) + scale_z, zero_point_z, axis_z = self.generate_random_quantization_params( + np.float32, np.int8, SHAPE, True, True + ) + + @mb.program( + input_specs=[ + mb.TensorSpec(shape=SHAPE, dtype=types.fp32), + mb.TensorSpec(shape=SHAPE, dtype=types.fp32), + ] + ) + def prog(x, y): + # quantize input + quantize_x = mb.quantize( + input=x, scale=scale_x, zero_point=zero_point_x, axis=axis_x, output_dtype="int8" + ) + quantize_y = mb.quantize( + input=y, scale=scale_y, zero_point=zero_point_y, axis=axis_y, output_dtype="int8" + ) + # quantized binary op + dequantize_x = mb.dequantize( + input=quantize_x, scale=scale_x, zero_point=zero_point_x, axis=axis_x + ) + dequantize_y = mb.dequantize( + input=quantize_y, scale=scale_y, zero_point=zero_point_y, axis=axis_y + ) + z = None + if op_type == "add": + z = mb.add(x=dequantize_x, y=dequantize_y) + elif op_type == "sub": + z = mb.sub(x=dequantize_x, y=dequantize_y) + else: + raise ValueError("unsupported op type") + if has_relu_fusion: + z = mb.relu(x=z) + quantize_z = mb.quantize( + input=z, scale=scale_z, zero_point=zero_point_z, axis=axis_z, output_dtype="int8" + ) + # dequantize output + 
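            # Scale-normalization sketch (zero points omitted for brevity): the pass
            # distributes the scalar scale, say s_y, out of the binary op,
            #     x_q*s_x +/- y_q*s_y == s_y * (x_q*(s_x/s_y) +/- y_q*1.0)
            # so dequantize_x becomes scale_x/scale_y, dequantize_y becomes 1.0, and
            # quantize_z compensates with scale_z/scale_y; x and y swap roles when
            # scale_x is the scalar one, as the assertions after this program check.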
dequantize_z = mb.dequantize( + input=quantize_z, scale=scale_z, zero_point=zero_point_z, axis=axis_z + ) + return dequantize_z + + # dequantize_x, dequantize_y, z + prev_prog, _, _ = apply_pass_and_basic_check( + prog, "common::distributive_quantized_binary_op_scale_normalization" + ) + # dequantize_x, dequantize_y, dequantize_x_normalized, dequantize_y_normalized, z + _, _, _ = apply_pass_and_basic_check(prog, "common::dead_code_elimination") + # dequantize_x_normalized, dequantize_y_normalized, z + + scale_prev_dequantize_x = prev_prog.find_ops(op_type="dequantize")[0].scale.val + scale_prev_dequantize_y = prev_prog.find_ops(op_type="dequantize")[1].scale.val + scale_prev_quantize_z = prev_prog.find_ops(op_type="quantize")[-1].scale.val + assert np.all(scale_prev_dequantize_x == scale_x) + assert np.all(scale_prev_dequantize_y == scale_y) + assert np.all(scale_prev_quantize_z == scale_z) + + scale_dequantize_x = prog.find_ops(op_type="dequantize")[0].scale.val + scale_dequantize_y = prog.find_ops(op_type="dequantize")[1].scale.val + scale_quantize_z = prog.find_ops(op_type="quantize")[-1].scale.val + # if axis_x is present, then scale_y gets normalized + # else, scale_x gets normalized, and x and y will get swapped + assert np.all( + scale_dequantize_x == scale_x / scale_y if is_axis_x_present else scale_y / scale_x + ) + assert np.all(scale_dequantize_y == 1.0) + assert np.all( + scale_quantize_z == scale_z / scale_y if is_axis_x_present else scale_z / scale_x + ) + + prev_model = ct.convert( + prev_prog, + source="milinternal", + convert_to="mlprogram", + compute_precision=ct.precision.FLOAT32, + minimum_deployment_target=ct.target.iOS17, + ) + model = ct.convert( + prog, + source="milinternal", + convert_to="mlprogram", + compute_precision=ct.precision.FLOAT32, + minimum_deployment_target=ct.target.iOS17, + ) + + x = self.generate_random_quantize_input( + np.float32, np.int8, scale_x, zero_point_x, axis_x, SHAPE + ) + y = self.generate_random_quantize_input( + np.float32, np.int8, scale_y, zero_point_y, axis_y, SHAPE + ) + prev_output = list(prev_model.predict({"x": x, "y": y}).values())[0] + output = list(model.predict({"x": x, "y": y}).values())[0] + assert np.all(prev_output == output) + + def test_normalize_versatile_inputs(self): + """ + Input graph: + |-> exp -> dequantize(scale_z) + | + x -> quantize(scale_x) -> dequantize(scale_x) -| + |-> add -> dequantize(scale_z) -> output + y -> quantize(scale_y) -> dequantize(scale_y) -| + + Output graph: + |-> dequantize(scale_x) -> exp -> dequantize(scale_z) + | + x -> quantize(scale_x) -> dequantize(scale_x/scale_y) -| + |-> add -> dequantize(scale_z/scale_y) -> output + y -> quantize(scale_y) -> dequantize(1.0) -| + """ + + SHAPE = (2, 1) + scale_x, zero_point_x, axis_x = np.float32(0.2), None, None + scale_y, zero_point_y, axis_y = np.float32(0.3), None, None + scale_z, zero_point_z, axis_z = np.float32(0.5), None, None + + @mb.program( + input_specs=[ + mb.TensorSpec(shape=SHAPE, dtype=types.fp32), + mb.TensorSpec(shape=SHAPE, dtype=types.fp32), + ] + ) + def prog(x, y): + # quantize input + quantize_x = mb.quantize( + input=x, scale=scale_x, zero_point=zero_point_x, axis=axis_x, output_dtype="uint8" + ) + quantize_y = mb.quantize( + input=y, scale=scale_y, zero_point=zero_point_y, axis=axis_y, output_dtype="uint8" + ) + # quantized binary op + dequantize_x = mb.dequantize( + input=quantize_x, scale=scale_x, zero_point=zero_point_x, axis=axis_x + ) + dequantize_y = mb.dequantize( + input=quantize_y, scale=scale_y, 
zero_point=zero_point_y, axis=axis_y + ) + z = mb.add(x=dequantize_x, y=dequantize_y) + quantize_z = mb.quantize( + input=z, scale=scale_z, zero_point=zero_point_z, axis=axis_z, output_dtype="uint8" + ) + # another quantized op + z1 = mb.exp(x=dequantize_x) + quantize_z1 = mb.quantize( + input=z1, scale=scale_z, zero_point=zero_point_z, axis=axis_z, output_dtype="uint8" + ) + # dequantize output + dequantize_z = mb.dequantize( + input=quantize_z, scale=scale_z, zero_point=zero_point_z, axis=axis_z + ) + dequantize_z1 = mb.dequantize( + input=quantize_z1, scale=scale_z, zero_point=zero_point_z, axis=axis_z + ) + return dequantize_z, dequantize_z1 + + prev_prog, _, _ = apply_pass_and_basic_check( + prog, "common::distributive_quantized_binary_op_scale_normalization" + ) + _, _, _ = apply_pass_and_basic_check(prog, "common::dead_code_elimination") + + scale_prev_dequantize_x = prev_prog.find_ops(op_type="dequantize")[0].scale.val + scale_prev_dequantize_y = prev_prog.find_ops(op_type="dequantize")[1].scale.val + scale_prev_quantize_z = prev_prog.find_ops(op_type="quantize")[-2].scale.val + assert np.all(scale_prev_dequantize_x == scale_x) + assert np.all(scale_prev_dequantize_y == scale_y) + assert np.all(scale_prev_quantize_z == scale_z) + + scale_dequantize_x_to_z1 = prog.find_ops(op_type="dequantize")[0].scale.val + scale_dequantize_x_to_z = prog.find_ops(op_type="dequantize")[1].scale.val + scale_dequantize_y = prog.find_ops(op_type="dequantize")[2].scale.val + scale_quantize_z = prog.find_ops(op_type="quantize")[-2].scale.val + assert np.all(scale_dequantize_x_to_z1 == scale_x) + assert np.all(scale_dequantize_x_to_z == scale_x / scale_y) + assert np.all(scale_dequantize_y == 1.0) + assert np.all(scale_quantize_z == scale_z / scale_y) + + def test_skip_0_scale(self): + """ + Input graph: + x -> quantize(eps) -> dequantize(eps) -| + |-> add -> dequantize -> output + y -> quantize(eps) -> dequantize(eps) -| + + Nothing changes due to underflow scale + """ + + # consider anything underflows fp16 to be 0 + SHAPE = (1, 2) + scale_x, zero_point_x, axis_x = np.float32(5e-8), None, None + scale_y, zero_point_y, axis_y = np.float32(-5e-8), None, None + scale_z, zero_point_z, axis_z = np.float32(0.8), None, None + + @mb.program( + input_specs=[ + mb.TensorSpec(shape=SHAPE, dtype=types.fp32), + mb.TensorSpec(shape=SHAPE, dtype=types.fp32), + ] + ) + def prog(x, y): + # quantize input + quantize_x = mb.quantize( + input=x, scale=scale_x, zero_point=zero_point_x, axis=axis_x, output_dtype="uint8" + ) + quantize_y = mb.quantize( + input=y, scale=scale_y, zero_point=zero_point_y, axis=axis_y, output_dtype="uint8" + ) + # quantized binary op + dequantize_x = mb.dequantize( + input=quantize_x, scale=scale_x, zero_point=zero_point_x, axis=axis_x + ) + dequantize_y = mb.dequantize( + input=quantize_y, scale=scale_y, zero_point=zero_point_y, axis=axis_y + ) + z = mb.add(x=dequantize_x, y=dequantize_y) + quantize_z = mb.quantize( + input=z, scale=scale_z, zero_point=zero_point_z, axis=axis_z, output_dtype="uint8" + ) + # dequantize output + dequantize_z = mb.dequantize( + input=quantize_z, scale=scale_z, zero_point=zero_point_z, axis=axis_z + ) + return dequantize_z + + prev_prog, _, _ = apply_pass_and_basic_check( + prog, "common::distributive_quantized_binary_op_scale_normalization" + ) + + scale_prev_dequantize_x = prev_prog.find_ops(op_type="dequantize")[0].scale.val + scale_prev_dequantize_y = prev_prog.find_ops(op_type="dequantize")[1].scale.val + scale_prev_quantize_z = 
prev_prog.find_ops(op_type="quantize")[-1].scale.val + assert np.all(scale_prev_dequantize_x == scale_x) + assert np.all(scale_prev_dequantize_y == scale_y) + assert np.all(scale_prev_quantize_z == scale_z) + + scale_dequantize_x = prog.find_ops(op_type="dequantize")[0].scale.val + scale_dequantize_y = prog.find_ops(op_type="dequantize")[1].scale.val + scale_quantize_z = prog.find_ops(op_type="quantize")[-1].scale.val + assert np.all(scale_dequantize_x == scale_x) + assert np.all(scale_dequantize_y == scale_y) + assert np.all(scale_quantize_z == scale_z) + + @pytest.mark.parametrize("input_rank", (1, 2, 5)) + def test_skip_2_vector_scales(self, input_rank): + """ + Input graph: + x -> quantize(scale_x) -> dequantize(scale_x) -| + |-> add -> dequantize(scale_z) -> output + y -> quantize(scale_y) -> dequantize(scale_y) -| + + Nothing changes when both scale_x and scale_y are vectors + """ + + # axis_x and axis_y are both present + SHAPE = np.random.randint(1, 5, size=input_rank, dtype=np.int32) + scale_x, zero_point_x, axis_x = self.generate_random_quantization_params( + np.float16, np.uint8, SHAPE, False, True + ) + scale_y, zero_point_y, axis_y = self.generate_random_quantization_params( + np.float16, np.uint8, SHAPE, False, True + ) + scale_z, zero_point_z, axis_z = self.generate_random_quantization_params( + np.float16, np.uint8, SHAPE, False, False + ) + + @mb.program( + input_specs=[ + mb.TensorSpec(shape=SHAPE, dtype=types.fp16), + mb.TensorSpec(shape=SHAPE, dtype=types.fp16), + ] + ) + def prog(x, y): + # quantize input + quantize_x = mb.quantize( + input=x, scale=scale_x, zero_point=zero_point_x, axis=axis_x, output_dtype="uint8" + ) + quantize_y = mb.quantize( + input=y, scale=scale_y, zero_point=zero_point_y, axis=axis_y, output_dtype="uint8" + ) + # quantized binary op + dequantize_x = mb.dequantize( + input=quantize_x, scale=scale_x, zero_point=zero_point_x, axis=axis_x + ) + dequantize_y = mb.dequantize( + input=quantize_y, scale=scale_y, zero_point=zero_point_y, axis=axis_y + ) + z = mb.add(x=dequantize_x, y=dequantize_y) + quantize_z = mb.quantize( + input=z, scale=scale_z, zero_point=zero_point_z, axis=axis_z, output_dtype="uint8" + ) + # dequantize output + dequantize_z = mb.dequantize( + input=quantize_z, scale=scale_z, zero_point=zero_point_z, axis=axis_z + ) + return dequantize_z + + prev_prog, _, _ = apply_pass_and_basic_check( + prog, "common::distributive_quantized_binary_op_scale_normalization" + ) + + scale_prev_dequantize_x = prev_prog.find_ops(op_type="dequantize")[0].scale.val + scale_prev_dequantize_y = prev_prog.find_ops(op_type="dequantize")[1].scale.val + scale_prev_quantize_z = prev_prog.find_ops(op_type="quantize")[-1].scale.val + assert np.all(scale_prev_dequantize_x == scale_x) + assert np.all(scale_prev_dequantize_y == scale_y) + assert np.all(scale_prev_quantize_z == scale_z) + + scale_dequantize_x = prog.find_ops(op_type="dequantize")[0].scale.val + scale_dequantize_y = prog.find_ops(op_type="dequantize")[1].scale.val + scale_quantize_z = prog.find_ops(op_type="quantize")[-1].scale.val + assert np.all(scale_dequantize_x == scale_x) + assert np.all(scale_dequantize_y == scale_y) + assert np.all(scale_quantize_z == scale_z) + + +class TestDequantizeToConstexpr: + @pytest.mark.parametrize( + "float_dtype, quant_dtype, is_scalar, is_zp_present", + itertools.product( + (np.float32, np.float16), + (np.int8, np.uint8), + (True, False), + (True, False), + ), + ) + def test_dequantize_const_to_constexpr( + self, float_dtype, quant_dtype, is_scalar, 
is_zp_present + ): + """ + Input graph: + input -> dequantize -> output + + Output graph: + input -> constexpr_affine_dequantize -> output + """ + + @mb.program(input_specs=[]) + def prog(): + y = None + if is_scalar: + if is_zp_present: + y = mb.dequantize( + input=np.array([10, 11], dtype=quant_dtype), + scale=float_dtype(0.1), + zero_point=quant_dtype(2), + ) + else: + y = mb.dequantize( + input=np.array([13, 14, 15], dtype=quant_dtype), scale=float_dtype(0.2) + ) + else: + if is_zp_present: + y = mb.dequantize( + input=np.array([[10, 11], [12, 13], [14, 15]], dtype=quant_dtype), + scale=np.array([0.1, 0.2, 0.3], dtype=float_dtype), + zero_point=np.array([6, 7, 8], dtype=quant_dtype), + axis=0, + ) + else: + y = mb.dequantize( + input=np.array([[19, 20, 21], [22, 23, 24]], dtype=quant_dtype), + scale=np.array([0.4, 0.5, 0.6], dtype=float_dtype), + axis=1, + ) + return y + + assert get_op_types_in_program(prog) == ["dequantize"] + + prev_prog, prev_block, block = apply_pass_and_basic_check( + prog, "common::dequantize_to_constexpr" + ) + assert get_op_types_in_program(prog) == ["constexpr_affine_dequantize"] + + @pytest.mark.parametrize( + "float_dtype, quant_dtype, is_scalar, is_zp_present", + itertools.product( + (np.float32, np.float16), + (np.int8, np.uint8), + (True, False), + (True, False), + ), + ) + def test_dequantize_variable_unchanged( + self, float_dtype, quant_dtype, is_scalar, is_zp_present + ): + """ + Input graph: + input -> dequantize -> output + + Output graph: + input -> dequantize -> output + """ + + if is_scalar: + if is_zp_present: + + @mb.program( + input_specs=[ + mb.TensorSpec( + shape=(1, 2, 3, 4, 5), dtype=numpy_type_to_builtin_type(quant_dtype) + ) + ] + ) + def prog(x): + y = mb.dequantize(input=x, scale=float_dtype(0.1), zero_point=quant_dtype(1)) + return y + + else: + + @mb.program( + input_specs=[ + mb.TensorSpec( + shape=(4, 3, 2, 1), dtype=numpy_type_to_builtin_type(quant_dtype) + ) + ] + ) + def prog(x): + y = mb.dequantize(input=x, scale=float_dtype(0.2)) + return y + + else: + if is_zp_present: + + @mb.program( + input_specs=[ + mb.TensorSpec(shape=(3, 2), dtype=numpy_type_to_builtin_type(quant_dtype)) + ] + ) + def prog(x): + y = mb.dequantize( + input=x, + scale=np.array([0.1, 0.2, 0.3], dtype=float_dtype), + zero_point=np.array([1, 2, 3], dtype=quant_dtype), + axis=0, + ) + return y + + else: + + @mb.program( + input_specs=[ + mb.TensorSpec(shape=(2, 3), dtype=numpy_type_to_builtin_type(quant_dtype)) + ] + ) + def prog(x): + y = mb.dequantize( + input=x, + scale=np.array([0.4, 0.5, 0.6], dtype=float_dtype), + axis=1, + ) + return y + + assert get_op_types_in_program(prog) == ["dequantize"] + + prev_prog, prev_block, block = apply_pass_and_basic_check( + prog, "common::dequantize_to_constexpr" + ) + assert get_op_types_in_program(prog) == ["dequantize"] + + +class TestFP16CastTransform(unittest.TestCase): + def test_single_input_to_single_operation(self): + """ + Input graph: + input -> square -> output + + Output graph: + input -> cast(fp32->fp16) -> square -> cast(fp16->fp32) -> output + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20))]) + def prog(x): + x = mb.square(x=x) + return x + + self.assertEqual(get_op_types_in_program(prog), ["square"]) + + apply_pass_and_basic_check( + prog, quantization.FP16ComputePrecision(op_selector=lambda op: True) + ) + _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") + + self.assertEqual(get_op_types_in_program(prog), ["cast", "square", "cast"]) + + # Asserting first 
cast configuration
+        cast_1 = block.find_ops(op_type="cast")[0]
+        self.assertEqual(cast_1.dtype.val, "fp16")
+        self.assertEqual(len(cast_1.outputs), 1)
+        self.assertEqual(len(cast_1.outputs[0].child_ops), 1)
+        self.assertEqual(cast_1.outputs[0].child_ops[0].op_type, "square")
+
+        # Asserting second cast configuration
+        cast_2 = block.find_ops(op_type="cast")[1]
+        self.assertEqual(cast_2.dtype.val, "fp32")
+        self.assertEqual(len(cast_2.outputs), 1)
+        self.assertEqual(len(cast_2.outputs[0].child_ops), 0)
+
+        assert_model_is_valid(
+            prog,
+            {"x": (10, 20)},
+            expected_output_shapes={block.outputs[0].name: (10, 20)},
+        )
+
+    @parameterized.parameterized.expand([[1.0], [-1.0]])
+    def test_inf(self, sign):
+        """
+        Input graph:
+            input -> add(±2e38) -> tanh -> output
+
+        Output graph:
+            input -> cast(fp32->fp16) -> add(±inf) -> tanh -> cast(fp16->fp32) -> output
+        """
+
+        SHAPE = (2, 3)
+
+        @mb.program(input_specs=[mb.TensorSpec(shape=SHAPE)])
+        def prog(x):
+            y = mb.add(x=x, y=np.float32(sign * 2e38))
+            z = mb.tanh(x=y)
+            return z
+
+        assert get_op_types_in_program(prog) == ["add", "tanh"]
+
+        prev_prog, _, _ = apply_pass_and_basic_check(prog, "common::add_fp16_cast")
+        apply_pass_and_basic_check(prog, "common::cast_optimization")
+        apply_pass_and_basic_check(prog, "common::const_elimination")
+        _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination")
+
+        assert get_op_types_in_program(prog) == ["cast", "add", "tanh", "cast"]
+        cast_to_fp16, cast_to_fp32 = prog.find_ops(op_type="cast")
+        assert cast_to_fp16.dtype.val == "fp16"
+        assert cast_to_fp32.dtype.val == "fp32"
+
+        output_name = block.outputs[0].name
+        assert_model_is_valid(prog, {"x": SHAPE}, expected_output_shapes={output_name: SHAPE})
+
+        prev_model = ct.convert(prev_prog)
+        model = ct.convert(prog)
+
+        x = 65500.0 * np.random.rand(*SHAPE)
+        prev_output = prev_model.predict({"x": x})[output_name]
+        output = model.predict({"x": x})[output_name]
+        assert np.allclose(prev_output, output)
+
+    def test_fp16_overflow(self):
+        """
+        Input graph:
+            input -> clip(-77777, 88888) -> output
+
+        Nothing gets changed since the clip bounds exceed the fp16 range
+        """
+
+        SHAPE = (2, 1, 3, 7, 5)
+
+        @mb.program(input_specs=[mb.TensorSpec(shape=SHAPE)])
+        def prog(x):
+            y = mb.clip(x=x, alpha=np.float32(-77777), beta=np.float32(88888))
+            return y
+
+        assert get_op_types_in_program(prog) == ["clip"]
+
+        apply_pass_and_basic_check(prog, "common::add_fp16_cast")
+
+        assert get_op_types_in_program(prog) == ["clip"]
+
+    def test_divide_by_zero_operation(self):
+        """
+        Input graph:
+            input ------|
+                        |-> div -> output
+            const(eps) -|
+
+        Output graph:
+            input ------> cast(fp32->fp16) -|
+                                            |-> div -> cast(fp16->fp32) -> output
+            const(eps) -> cast(fp32->fp16) -|
+        """
+
+        @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20))])
+        def prog(x):
+            eps = mb.const(val=1e-10)
+            x = mb.real_div(x=x, y=eps)
+            return x
+
+        prev_prog, prev_block, block = apply_pass_and_basic_check(
+            prog, quantization.FP16ComputePrecision(op_selector=lambda op: True)
+        )
+
+        mlmodel = ct.convert(prog, compute_units=ct.ComputeUnit.CPU_ONLY)
+        input_dict = {"x": np.random.rand(10, 20)}
+
+        if _IS_MACOS:
+            prediction = mlmodel.predict(input_dict)
+            assert not np.isnan(prediction["real_div_0"]).any()
+            assert np.isfinite(prediction["real_div_0"]).all()
+
+    def test_multiple_inputs_to_single_operation(self):
+        """
+        Input graph:
+            input1 -|
+                    |-> concat -> output
+            input2 -|
+
+        Output graph:
+            input1 -> cast(fp32->fp16) -|
+                                        |-> concat -> cast(fp16->fp32) -> output
+            input2 -> cast(fp32->fp16) 
-| + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20)), mb.TensorSpec(shape=(10, 20))]) + def prog(x, y): + x = mb.concat(values=(x, y), axis=0) + return x + + self.assertEqual(get_op_types_in_program(prog), ["concat"]) + + apply_pass_and_basic_check( + prog, quantization.FP16ComputePrecision(op_selector=lambda op: True) + ) + _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") + + self.assertEqual(get_op_types_in_program(prog), ["cast", "cast", "concat", "cast"]) + + # Asserting first cast configuration + cast_1 = block.find_ops(op_type="cast")[0] + self.assertEqual(cast_1.dtype.val, "fp16") + self.assertEqual(len(cast_1.outputs), 1) + self.assertEqual(len(cast_1.outputs[0].child_ops), 1) + self.assertEqual(cast_1.outputs[0].child_ops[0].op_type, "concat") + + # Asserting second cast configuration + cast_2 = block.find_ops(op_type="cast")[1] + self.assertEqual(cast_2.dtype.val, "fp16") + self.assertEqual(len(cast_2.outputs), 1) + self.assertEqual(len(cast_2.outputs[0].child_ops), 1) + self.assertEqual(cast_2.outputs[0].child_ops[0].op_type, "concat") + + # Asserting third cast configuration + cast_3 = block.find_ops(op_type="cast")[2] + self.assertEqual(cast_3.dtype.val, "fp32") + self.assertEqual(len(cast_3.outputs), 1) + self.assertEqual(len(cast_3.outputs[0].child_ops), 0) + + assert_model_is_valid( + prog, + {"x": (10, 20), "y": (10, 20)}, + expected_output_shapes={block.outputs[0].name: (20, 20)}, + ) + + def test_multiple_outputs_from_single_operation(self): + """ + Input graph: + |-> output_1 + input -> split -| + |-> output_2 + + Output graph: + |-> cast(fp16->fp32) -> output_1 + input -> cast(fp32->fp16) -> split -| + |-> cast(fp16->fp32) -> output_2 + """ + @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20))]) + def prog(x): + x = mb.split(x=x, axis=0, num_splits=2) + return x + + self.assertEqual(get_op_types_in_program(prog), ["split"]) + + apply_pass_and_basic_check( + prog, quantization.FP16ComputePrecision(op_selector=lambda op: True) + ) + _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") + + self.assertEqual(get_op_types_in_program(prog), ["cast", "split", "cast", "cast"]) + + # Asserting first cast configuration + cast_1 = block.find_ops(op_type="cast")[0] + self.assertEqual(cast_1.dtype.val, "fp16") + self.assertEqual(len(cast_1.outputs), 1) + self.assertEqual(len(cast_1.outputs[0].child_ops), 1) + self.assertEqual(cast_1.outputs[0].child_ops[0].op_type, "split") + + # Asserting second cast configuration + cast_2 = block.find_ops(op_type="cast")[1] + self.assertEqual(cast_2.dtype.val, "fp32") + self.assertEqual(len(cast_2.outputs), 1) + self.assertEqual(len(cast_2.outputs[0].child_ops), 0) + + # Asserting third cast configuration + cast_3 = block.find_ops(op_type="cast")[2] + self.assertEqual(cast_3.dtype.val, "fp32") + self.assertEqual(len(cast_3.outputs), 1) + self.assertEqual(len(cast_3.outputs[0].child_ops), 0) + + assert_model_is_valid( + prog, + {"x": (10, 20)}, + expected_output_shapes={block.outputs[0].name: (5, 20), block.outputs[1].name: (5, 20)}, + ) + + def test_single_input_to_multiple_operations(self): + """ + Input graph: + |-> square -> output_1 + input -| + |-> relu -> output_2 + + Output graph: + |-> square -> cast(fp16->fp32) -> output_1 + input -> cast(fp32->fp16) -| + |-> relu -> cast(fp16->fp32) -> output_2 + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20))]) + def prog(x): + y = mb.square(x=x) + z = mb.relu(x=x) + return y, z + + 
self.assertEqual(get_op_types_in_program(prog), ["square", "relu"]) + + apply_pass_and_basic_check( + prog, quantization.FP16ComputePrecision(op_selector=lambda op: True) + ) + _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") + + self.assertEqual(get_op_types_in_program(prog), ["cast", "square", "cast", "relu", "cast"]) + + # Asserting first cast configuration + cast_1 = block.find_ops(op_type="cast")[0] + self.assertEqual(cast_1.dtype.val, "fp16") + self.assertEqual(len(cast_1.outputs), 1) + self.assertEqual(len(cast_1.outputs[0].child_ops), 2) + self.assertEqual(cast_1.outputs[0].child_ops[0].op_type, "square") + self.assertEqual(cast_1.outputs[0].child_ops[1].op_type, "relu") + + # Asserting second cast configuration + cast_2 = block.find_ops(op_type="cast")[1] + self.assertEqual(cast_2.dtype.val, "fp32") + self.assertEqual(len(cast_2.outputs), 1) + self.assertEqual(len(cast_2.outputs[0].child_ops), 0) + + # Asserting third cast configuration + cast_3 = block.find_ops(op_type="cast")[2] + self.assertEqual(cast_3.dtype.val, "fp32") + self.assertEqual(len(cast_3.outputs), 1) + self.assertEqual(len(cast_3.outputs[0].child_ops), 0) + + assert_model_is_valid( + prog, + {"x": (10, 20)}, + expected_output_shapes={ + block.outputs[0].name: (10, 20), + block.outputs[1].name: (10, 20), + }, + ) + + def test_duplicate_output_vars(self): + """ + Input graph: + |-> output_1 + input -> relu -| + |-> output_2 + + Output graph: + |-> output_1 + input -> cast(fp32->fp16) -> relu -> cast(fp16->fp32) -| + |-> output_2 + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(1, 2))]) + def prog(x): + relu1 = mb.relu(x=x) + return relu1, relu1 + + _, _, block = apply_pass_and_basic_check( + prog, quantization.FP16ComputePrecision(op_selector=lambda op: True) + ) + self.assertEqual(get_op_types_in_program(prog), ["cast", "relu", "cast"]) + + assert_model_is_valid( + prog, + {"x": (1, 2)}, + expected_output_shapes={block.outputs[0].name: (1, 2), block.outputs[1].name: (1, 2)}, + backend=("mlprogram", "fp16"), + ) diff --git a/coremltools/converters/mil/mil/program.py b/coremltools/converters/mil/mil/program.py index 39e67cf37..1468a6100 100644 --- a/coremltools/converters/mil/mil/program.py +++ b/coremltools/converters/mil/mil/program.py @@ -20,6 +20,10 @@ class Program: + @staticmethod + def _get_opset_str_value(op): + return f"coremltools.target.{op.name}" + def __init__(self): self.main_input_types = [] self.main_output_types = None @@ -55,9 +59,13 @@ def check_version_compatibility_block(block): expected_op_cls = _get_version_of_op(op._op_variants, max_opset_version) if type(op) is not expected_op_cls: msg = ( - "Op {} with an out of date version {!s} is detected. Please use @mb.program(input_specs=..., " - "opset_version={!s})" - ).format(op.op_type, op.opset_version, max_opset_version) + "Op {} with an out of date version {} is detected. Please use @mb.program(input_specs=..., " + "opset_version={})" + ).format( + op.op_type, + self._get_opset_str_value(op.opset_version), + self._get_opset_str_value(max_opset_version), + ) raise ValueError(msg) for func in self.functions.values(): check_version_compatibility_block(func) @@ -69,18 +77,24 @@ def _check_or_set_functions_opset_version(self, max_opset_version): func.opset_version = max_opset_version else: if func.opset_version < max_opset_version: - msg = "function should have at least opset_version {!s}. Got {!s}".format(max_opset_version, func.opset_version) + msg = "function should have at least opset_version {}. 
Got {}".format( + self._get_opset_str_value(max_opset_version), + self._get_opset_str_value(func.opset_version), + ) raise ValueError(msg) for func in funcs: if func.opset_version != funcs[0].opset_version: - msg = "all functions must have the same opset_version. Got {!s} and {!s}.".format(func.opset_version, funcs[0].opset_version) + msg = "all functions must have the same opset_version. Got {} and {}.".format( + self._get_opset_str_value(func.opset_version), + self._get_opset_str_value(funcs[0].opset_version), + ) raise ValueError(msg) def _check_program_opset_version(self): max_opset_version, _ = self._get_max_opset_version_and_op() self._check_ops_version_compatibility(max_opset_version) self._check_or_set_functions_opset_version(max_opset_version) - + def _check_invalid_tensor_rank(self): ''' Early error out for tensor with rank >= 6 diff --git a/coremltools/converters/mil/mil/tests/test_programs.py b/coremltools/converters/mil/mil/tests/test_programs.py index 1a7e8e1f2..80feeb1bd 100644 --- a/coremltools/converters/mil/mil/tests/test_programs.py +++ b/coremltools/converters/mil/mil/tests/test_programs.py @@ -146,7 +146,9 @@ def test_reserved_node_names(): def prog(x): return mb.square(x=x, name="tensor") - mlmodel = ct.convert(prog, source="milinternal", convert_to="mlprogram") + mlmodel = ct.convert( + prog, source="milinternal", convert_to="mlprogram", compute_units=ct.ComputeUnit.CPU_ONLY + ) feed_dict = { "x": np.random.rand(10, 20).astype(np.float32), @@ -224,7 +226,9 @@ def test_multi_versions_op_selection(): @staticmethod def test_pymil_front_end_conversion(): prog = get_simple_topk_pixel_unshuffle_program(opset_version=ct.target.iOS16) - mlmodel = ct.convert(prog, minimum_deployment_target=ct.target.iOS16) + mlmodel = ct.convert( + prog, minimum_deployment_target=ct.target.iOS16, compute_units=ct.ComputeUnit.CPU_ONLY + ) @staticmethod def test_nested_block_opset_version_selection(): @@ -253,7 +257,9 @@ def test_pymil_opset_version_inference(): "since op pixel_unshuffle is only available in opset coremltools.target.iOS16 or newer." ) with pytest.raises(ValueError, match=expected_err_str): - mlmodel = ct.convert(prog, convert_to="mlprogram") + mlmodel = ct.convert( + prog, convert_to="mlprogram", compute_units=ct.ComputeUnit.CPU_ONLY + ) @staticmethod def test_pymil_front_end_conversion_early_error_out(): @@ -263,7 +269,11 @@ def test_pymil_front_end_conversion_early_error_out(): "since op pixel_unshuffle is only available in opset coremltools.target.iOS16 or newer." 
) with pytest.raises(ValueError, match=expected_err_str): - mlmodel = ct.convert(prog, minimum_deployment_target=ct.target.iOS15) + mlmodel = ct.convert( + prog, + minimum_deployment_target=ct.target.iOS15, + compute_units=ct.ComputeUnit.CPU_ONLY, + ) @staticmethod def test_unsupported_op_early_error_out(): @@ -322,8 +332,9 @@ def test_rank6_tensor_early_error_out(): def prog(x): res = mb.reshape(x=x, shape=(1, 1, 1, 1, 1, 1), name="reshape_0") return res - ct.convert(prog, source="milinternal") - + + ct.convert(prog, source="milinternal", compute_units=ct.ComputeUnit.CPU_ONLY) + @staticmethod def test_rank5_list_early_error_out(): ''' @@ -343,5 +354,3 @@ def prog(x): name="list_0", ) return ls - - diff --git a/coremltools/converters/mil/mil/types/__init__.py b/coremltools/converters/mil/mil/types/__init__.py index 6cdc9fc33..004e981ce 100644 --- a/coremltools/converters/mil/mil/types/__init__.py +++ b/coremltools/converters/mil/mil/types/__init__.py @@ -24,8 +24,7 @@ string_to_builtin, type_to_builtin_type) from .type_str import str from .type_tensor import (is_compatible_type, is_tensor_and_is_compatible, - is_tensor_and_is_compatible_general_shape, tensor, - tensor_has_complete_shape) + tensor, tensor_has_complete_shape) from .type_tuple import tuple from .type_unknown import unknown from .type_void import void diff --git a/coremltools/converters/mil/mil/types/type_int.py b/coremltools/converters/mil/mil/types/type_int.py index 132ee9f1a..61b0149ac 100644 --- a/coremltools/converters/mil/mil/types/type_int.py +++ b/coremltools/converters/mil/mil/types/type_int.py @@ -36,11 +36,9 @@ def val(self, v): from .type_mapping import (builtin_to_string, nptype_from_builtin, numpy_type_to_builtin_type) - if not isinstance(v, (np.generic, sm.Basic)): + if not isinstance(v, (np.generic, np.ndarray, sm.Basic)): raise ValueError( - "types should have value of numpy type or Symbols, got {} instead".format( - type(v) - ) + f"types should have value of numpy type or Symbols, got {type(v)} instead" ) if isinstance(v, sm.Basic): @@ -54,16 +52,14 @@ def val(self, v): else: self._val = v.astype(nptype_from_builtin(self.__class__)) logger.warning( - "Saving value type of {} into a builtin type of {}, might overflow or loses precision!".format( - v.dtype, builtin_to_string(self.__class__) - ) + f"Saving value type of {v.dtype} into a builtin type of " + f"{builtin_to_string(self.__class__)}, might overflow or loses precision!" ) else: self._val = v.astype(nptype_from_builtin(self.__class__)) logger.warning( - "Saving value type of {} into a builtin type of {}, might be incompatible or loses precision!".format( - v.dtype, builtin_to_string(self.__class__) - ) + f"Saving value type of {v.dtype} into a builtin type of " + f"{builtin_to_string(self.__class__)}, might be incompatible or loses precision!" 
) @classmethod diff --git a/coremltools/converters/mil/mil/types/type_mapping.py b/coremltools/converters/mil/mil/types/type_mapping.py index 9dbdcd9c7..a6fbeab1f 100644 --- a/coremltools/converters/mil/mil/types/type_mapping.py +++ b/coremltools/converters/mil/mil/types/type_mapping.py @@ -416,7 +416,7 @@ def np_val_to_py_type(val): if not isinstance(val, (_np.ndarray, _np.generic)): return val - if val.dtype in [_np.float16, _np.uint8, _np.int8, _np.uint32]: + if val.dtype in (_np.float16, _np.uint8, _np.int8, _np.uint16, _np.int16, _np.uint32): return val.tobytes() else: # val is np.ndarray or np.generic diff --git a/coremltools/converters/mil/mil/types/type_tensor.py b/coremltools/converters/mil/mil/types/type_tensor.py index a56cf3cce..694e27db8 100644 --- a/coremltools/converters/mil/mil/types/type_tensor.py +++ b/coremltools/converters/mil/mil/types/type_tensor.py @@ -185,44 +185,6 @@ def is_tensor_and_is_compatible(tensor_type1, tensor_type2, allow_promotion=Fals return True, tensor(primitive_type, most_specific_shape) -def is_tensor_and_is_compatible_general_shape(tensor_type1, tensor_type2): - # returns a pair of (bool, type) - # If Both are tensors, and have compatible shape, the first return is true - # The return will be the most general version of the tensor type. - # Note that this may not be either tensor types. i.e. - # - # is_tensor_and_is_compatible(tensor[fp32,[10,-1]] ,tensor[fp32,[-1,20]]) - # will return True, tensor[fp32, [-1,-1]] - - if not is_tensor(tensor_type1) or not is_tensor(tensor_type2): - return False, None - shape1 = tensor_type1.get_shape() - shape2 = tensor_type2.get_shape() - - if tensor_type1.get_primitive() != tensor_type2.get_primitive(): - return False, None - - if len(shape1) == 0: - return True, tensor_type2 - if len(shape2) == 0: - return True, tensor_type1 - - if len(shape1) != len(shape2): - return False, None - - most_general_shape = [] - for i in range(len(shape1)): - if shape1[i] == -1 or issubclass(type(shape1[i]), sm.Basic): - most_general_shape.append(shape1[i]) - elif shape2[i] == -1 or issubclass(type(shape2[i]), sm.Basic): - most_general_shape.append(shape2[i]) - elif shape1[i] == shape2[i]: - most_general_shape.append(shape1[i]) - elif shape1[i] != shape2[i]: - return False, None - - return True, tensor(tensor_type1.get_primitive(), most_general_shape) - def is_compatible_type(type1, type2): """ Return if type1 and type2 are compatible. 
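Note: the hunk above removes the ``is_tensor_and_is_compatible_general_shape`` helper from type_tensor.py. Its inline comments describe the rule it implemented: merge two tensor shapes, dimension by dimension, into their most general compatible shape. The following standalone sketch restates that rule for illustration only; ``most_general_shape`` is a hypothetical name, it uses plain integers with -1 as the unknown-dimension marker, and it ignores the symbolic (sympy) dimensions and primitive-type check handled by the real helper.

    from typing import List, Optional, Tuple

    def most_general_shape(
        shape1: Tuple[int, ...], shape2: Tuple[int, ...]
    ) -> Optional[List[int]]:
        # Merge two shapes dimension by dimension; -1 marks an unknown dimension.
        # Returns None when the shapes cannot be reconciled.
        if len(shape1) == 0:
            return list(shape2)
        if len(shape2) == 0:
            return list(shape1)
        if len(shape1) != len(shape2):
            return None
        merged = []
        for d1, d2 in zip(shape1, shape2):
            if d1 == -1:
                merged.append(d1)  # unknown dimension from shape1 wins
            elif d2 == -1:
                merged.append(d2)  # unknown dimension from shape2 wins
            elif d1 == d2:
                merged.append(d1)  # identical fixed dimensions
            else:
                return None  # fixed dimensions disagree
        return merged

    # Matches the example in the removed helper's comment:
    # [10, -1] and [-1, 20] generalize to [-1, -1].
    assert most_general_shape((10, -1), (-1, 20)) == [-1, -1]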
diff --git a/coremltools/converters/mil/test_flexible_shape_inputs.py b/coremltools/converters/mil/test_flexible_shape_inputs.py index 9920b758b..ad126e473 100644 --- a/coremltools/converters/mil/test_flexible_shape_inputs.py +++ b/coremltools/converters/mil/test_flexible_shape_inputs.py @@ -3,12 +3,17 @@ # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause +import itertools +import os +import tempfile + import numpy as _np import PIL.Image import pytest import coremltools as ct -from coremltools._deps import _HAS_TORCH, MSG_TORCH_NOT_FOUND +from coremltools._deps import _HAS_TF_2, _HAS_TORCH, MSG_TF2_NOT_FOUND, MSG_TORCH_NOT_FOUND +from coremltools.converters.mil.testing_reqs import backends, compute_units if _HAS_TORCH: import torch @@ -24,6 +29,10 @@ def forward(self, x): return self.conv(x) +if _HAS_TF_2: + import tensorflow as tf + + def _numpy_array_to_pil_image(x): """ convert x of shape (1, 3, H, W) to PIL image @@ -47,6 +56,7 @@ def _compute_snr(arr1, arr2): psnr = 10 * _np.log10(max_signal_energy / noise_var) return snr, psnr + def _assert_torch_coreml_output_shapes(coreml_model, spec, torch_model, torch_example_input, is_image_input=False): torch_out = torch_model(torch_example_input) input_name = spec.description.input[0].name @@ -64,20 +74,32 @@ def _assert_torch_coreml_output_shapes(coreml_model, spec, torch_model, torch_ex @pytest.mark.skipif(not _HAS_TORCH or not ct.utils._is_macos(), reason=MSG_TORCH_NOT_FOUND) -class TestFlexibleInputShapes: +class TestFlexibleInputShapesTorch: - @pytest.mark.parametrize("convert_to", ['neuralnetwork', 'mlprogram']) - def test_multiarray_input_rangedim(self, convert_to): + @pytest.mark.parametrize( + "backend, compute_unit", + itertools.product( + backends, + compute_units, + ), + ) + def test_multiarray_input_rangedim(self, backend, compute_unit): + convert_to = backend[0] if convert_to == "mlprogram" and ct.utils._macos_version() < (12, 0): return example_input = torch.rand(1, 3, 50, 50) * 100 traced_model = torch.jit.trace(TestConvModule().eval(), example_input) - input_shape = ct.Shape(shape=(1, 3, ct.RangeDim(25, 100, default=45), ct.RangeDim(25, 100, default=45))) - model = ct.convert(traced_model, - inputs=[ct.TensorType(shape=input_shape)], - convert_to=convert_to) + input_shape = ct.Shape( + shape=(1, 3, ct.RangeDim(25, 100, default=45), ct.RangeDim(25, 100, default=45)) + ) + model = ct.convert( + traced_model, + inputs=[ct.TensorType(shape=input_shape)], + convert_to=convert_to, + compute_units=compute_unit, + ) spec = model.get_spec() assert list(spec.description.input[0].type.multiArrayType.shape) == [1, 3, 45, 45] @@ -85,19 +107,79 @@ def test_multiarray_input_rangedim(self, convert_to): assert spec.description.input[0].type.multiArrayType.shapeRange.sizeRanges[2].upperBound == 100 _assert_torch_coreml_output_shapes(model, spec, traced_model, example_input) - @pytest.mark.parametrize("convert_to", ['neuralnetwork', 'mlprogram']) - def test_multiarray_input_enumerated(self, convert_to): + @pytest.mark.parametrize( + "backend, compute_unit, explicitly_set", + itertools.product( + backends, + compute_units, + [True, False], + ), + ) + def test_multiarray_input_rangedim_infinite(self, backend, compute_unit, explicitly_set): + convert_to = backend[0] + example_input = torch.rand(1, 3, 50, 50) * 100 + traced_model = torch.jit.trace(TestConvModule().eval(), example_input) + second_dim = ct.RangeDim() + if explicitly_set: + 
second_dim.upper_bound = -1 + input_shape = ct.Shape(shape=(1, 3, second_dim, ct.RangeDim(25, 100, default=45))) + + if convert_to == "mlprogram": + with pytest.raises( + ValueError, + match="For mlprogram, inputs with infinite upper_bound is not allowed. Please set " + 'upper_bound to a positive value in "RangeDim\(\)" for the "inputs" param in ' + "ct.convert\(\).", + ): + ct.convert( + traced_model, + inputs=[ct.TensorType(shape=input_shape)], + convert_to=convert_to, + compute_units=compute_unit, + ) + else: + model = ct.convert( + traced_model, + inputs=[ct.TensorType(shape=input_shape)], + convert_to=convert_to, + compute_units=compute_unit, + ) + spec = model.get_spec() + assert list(spec.description.input[0].type.multiArrayType.shape) == [1, 3, 1, 45] + assert ( + spec.description.input[0].type.multiArrayType.shapeRange.sizeRanges[2].lowerBound + == 1 + ) + assert ( + spec.description.input[0].type.multiArrayType.shapeRange.sizeRanges[2].upperBound + == -1 + ) + _assert_torch_coreml_output_shapes(model, spec, traced_model, example_input) + + @pytest.mark.parametrize( + "backend, compute_unit", + itertools.product( + backends, + compute_units, + ), + ) + def test_multiarray_input_enumerated(self, backend, compute_unit): + convert_to = backend[0] if convert_to == "mlprogram" and ct.utils._macos_version() < (12, 0): return example_input = torch.rand(1, 3, 50, 50) * 100 traced_model = torch.jit.trace(TestConvModule().eval(), example_input) - input_shape = ct.EnumeratedShapes(shapes=[[1, 3, 25, 25], [1, 3, 50, 50], [1, 3, 67, 67]], - default=[1, 3, 67, 67]) - model = ct.convert(traced_model, - inputs=[ct.TensorType(shape=input_shape)], - convert_to=convert_to) + input_shape = ct.EnumeratedShapes( + shapes=[[1, 3, 25, 25], [1, 3, 50, 50], [1, 3, 67, 67]], default=[1, 3, 67, 67] + ) + model = ct.convert( + traced_model, + inputs=[ct.TensorType(shape=input_shape)], + convert_to=convert_to, + compute_units=compute_unit, + ) spec = model.get_spec() assert list(spec.description.input[0].type.multiArrayType.shape) == [1, 3, 67, 67] @@ -106,36 +188,112 @@ def test_multiarray_input_enumerated(self, convert_to): _assert_torch_coreml_output_shapes(model, spec, traced_model, example_input) @pytest.mark.skipif(ct.utils._macos_version() < (12, 0), reason="Image input with RangeDim works correctly on macOS12+") - @pytest.mark.parametrize("convert_to", ['neuralnetwork', 'mlprogram']) - def test_image_input_rangedim(self, convert_to): + @pytest.mark.parametrize( + "backend, compute_unit", + itertools.product( + backends, + compute_units, + ), + ) + def test_image_input_rangedim(self, backend, compute_unit): + convert_to = backend[0] example_input = torch.rand(1, 3, 50, 50) * 255 traced_model = torch.jit.trace(TestConvModule().eval(), example_input) - input_shape = ct.Shape(shape=(1, 3, ct.RangeDim(25, 100, default=45), ct.RangeDim(25, 100, default=45))) - model = ct.convert(traced_model, - inputs=[ct.ImageType(shape=input_shape)], - convert_to=convert_to) + input_shape = ct.Shape( + shape=(1, 3, ct.RangeDim(25, 100, default=35), ct.RangeDim(25, 100, default=45)) + ) + model = ct.convert( + traced_model, + inputs=[ct.ImageType(shape=input_shape)], + convert_to=convert_to, + compute_units=compute_unit, + ) spec = model.get_spec() assert spec.description.input[0].type.imageType.width == 45 - assert spec.description.input[0].type.imageType.height == 45 + assert spec.description.input[0].type.imageType.height == 35 assert spec.description.input[0].type.imageType.imageSizeRange.widthRange.lowerBound == 25 
assert spec.description.input[0].type.imageType.imageSizeRange.widthRange.upperBound == 100 _assert_torch_coreml_output_shapes(model, spec, traced_model, example_input, is_image_input=True) - @pytest.mark.parametrize("convert_to", ['neuralnetwork', 'mlprogram']) - def test_image_input_enumerated(self, convert_to): + @pytest.mark.skipif( + ct.utils._macos_version() < (12, 0), + reason="Image input with RangeDim works correctly on macOS12+", + ) + @pytest.mark.parametrize( + "backend, compute_unit, explicitly_set", + itertools.product( + backends, + compute_units, + [True, False], + ), + ) + def test_image_input_rangedim_infinite(self, backend, compute_unit, explicitly_set): + convert_to = backend[0] + example_input = torch.rand(1, 3, 50, 50) * 255 + traced_model = torch.jit.trace(TestConvModule().eval(), example_input) + + second_dim = ct.RangeDim(upper_bound=-1) if explicitly_set else ct.RangeDim() + input_shape = ct.Shape(shape=(1, 3, second_dim, ct.RangeDim(25, 100, default=45))) + + if convert_to == "mlprogram": + with pytest.raises( + ValueError, + match="For mlprogram, inputs with infinite upper_bound is not allowed. Please set " + 'upper_bound to a positive value in "RangeDim\(\)" for the "inputs" param in ' + "ct.convert\(\).", + ): + ct.convert( + traced_model, + inputs=[ct.ImageType(shape=input_shape)], + convert_to=convert_to, + compute_units=compute_unit, + ) + else: + model = ct.convert( + traced_model, + inputs=[ct.ImageType(shape=input_shape)], + convert_to=convert_to, + compute_units=compute_unit, + ) + spec = model.get_spec() + assert spec.description.input[0].type.imageType.width == 45 + assert spec.description.input[0].type.imageType.height == 1 + assert ( + spec.description.input[0].type.imageType.imageSizeRange.heightRange.lowerBound == 1 + ) + assert ( + spec.description.input[0].type.imageType.imageSizeRange.heightRange.upperBound == -1 + ) + _assert_torch_coreml_output_shapes( + model, spec, traced_model, example_input, is_image_input=True + ) + + @pytest.mark.parametrize( + "backend, compute_unit", + itertools.product( + backends, + compute_units, + ), + ) + def test_image_input_enumerated(self, backend, compute_unit): + convert_to = backend[0] if convert_to == "mlprogram" and ct.utils._macos_version() < (12, 0): return example_input = torch.rand(1, 3, 50, 50) * 255 traced_model = torch.jit.trace(TestConvModule().eval(), example_input) - input_shape = ct.EnumeratedShapes(shapes=[[1, 3, 25, 25], [1, 3, 50, 50], [1, 3, 67, 67]], - default=[1, 3, 67, 67]) - model = ct.convert(traced_model, - inputs=[ct.ImageType(shape=input_shape)], - convert_to=convert_to) + input_shape = ct.EnumeratedShapes( + shapes=[[1, 3, 25, 25], [1, 3, 50, 50], [1, 3, 67, 67]], default=[1, 3, 67, 67] + ) + model = ct.convert( + traced_model, + inputs=[ct.ImageType(shape=input_shape)], + convert_to=convert_to, + compute_units=compute_unit, + ) spec = model.get_spec() assert spec.description.input[0].type.imageType.width == 67 @@ -144,3 +302,357 @@ def test_image_input_enumerated(self, convert_to): assert spec.description.input[0].type.imageType.enumeratedSizes.sizes[0].width == 25 assert spec.description.input[0].type.imageType.enumeratedSizes.sizes[0].height == 25 _assert_torch_coreml_output_shapes(model, spec, traced_model, example_input, is_image_input=True) + + +@pytest.mark.skipif(not _HAS_TF_2 or not ct.utils._is_macos(), reason=MSG_TF2_NOT_FOUND) +class TestFlexibleInputShapesTF: + @classmethod + def setup_class(cls): + """Prepares tf model in different formats (keras model, h5 file, 
saved_model dir).""" + input_1 = tf.keras.Input(shape=(None, None, 16), name="input_1") + input_2 = tf.keras.Input(shape=(None, None, 4), name="input_2") + x = tf.keras.layers.Dense(4, activation=tf.nn.relu)(input_1) + input_2 + outputs = tf.keras.layers.Dense(5, activation=tf.nn.softmax)(x) + cls.model = tf.keras.Model(inputs=[input_1, input_2], outputs=outputs) + + cls.temp_dir = tempfile.TemporaryDirectory() + cls.h5_model_path = os.path.join(cls.temp_dir.name, "tf_keras_model.h5") + cls.model.save(cls.h5_model_path) + cls.saved_model_path = os.path.join(cls.temp_dir.name, "saved_model") + cls.model.save(cls.saved_model_path, save_format="tf") + + @classmethod + def teardown_class(cls): + """CLean up temp dir that stores the TF models.""" + cls.temp_dir.cleanup() + + @staticmethod + def _find_unknown_dim_warning(raised_warnings: pytest.WarningsRecorder) -> bool: + """Find if pytest catches any warning message about the unknown dim warning.""" + for raised_warning in raised_warnings: + if raised_warning.message.args[0].startswith( + "Some dimensions in the input shape are unknown, hence they are set to flexible ranges" + ): + return True + return False + + @pytest.mark.parametrize( + "backend, compute_unit, model_format", + itertools.product( + backends, + compute_units, + ["keras_model", "h5", "saved_model"], + ), + ) + def test_dynamic_shape_no_inputs(self, backend, compute_unit, model_format): + """ + The `inputs` param in `ct.convert` is not provided, so all inputs in the TF model with `None` + dim will have a range shape where lower-bound/default/upper-bound are sanitized to finite + numbers and warns users. + """ + convert_to = backend[0] + model_param = self.model + if model_format == "h5": + model_param = self.h5_model_path + elif model_format == "saved_model": + model_param = self.saved_model_path + + if convert_to == "mlprogram": + with pytest.warns( + UserWarning, + match="Some dimensions in the input shape are unknown, hence they are set to " + "flexible ranges with lower bound and default value = 1, and upper bound = 2. " + "To set different values for the default shape and upper bound, please use " + "the ct.RangeDim.*", + ): + mlmodel = ct.convert( + model_param, + source="tensorflow", + convert_to=convert_to, + compute_units=compute_unit, + ) + else: + mlmodel = ct.convert( + model_param, + source="tensorflow", + convert_to=convert_to, + compute_units=compute_unit, + ) + + spec = mlmodel.get_spec() + assert list(spec.description.input[0].type.multiArrayType.shape) == [1, 1, 1, 16] + assert ( + spec.description.input[0].type.multiArrayType.shapeRange.sizeRanges[2].lowerBound == 1 + ) + assert ( + spec.description.input[0].type.multiArrayType.shapeRange.sizeRanges[2].upperBound == -1 + if convert_to == "neuralnetwork" + else 2 + ) + + @pytest.mark.parametrize( + "backend, compute_unit, specify_input", + itertools.product( + backends, + compute_units, + ["input_1", "input_2"], + ), + ) + def test_dynamic_shape_partial_inputs(self, backend, compute_unit, specify_input): + """ + The `inputs` param in `ct.convert` is partially provided, where the TF model has two inputs + while we only provide one in `inputs` param. So another input in the TF model with `None` + dim will have a range shape where lower-bound/default/upper-bound are sanitized to finite + numbers and warns users. 
+ """ + convert_to = backend[0] + last_dim = 16 if specify_input == "input_1" else 4 + inputs = [ + ct.TensorType( + shape=ct.Shape( + shape=( + 1, + 3, + ct.RangeDim(2, 10, default=8), + ct.RangeDim(4, 20, default=last_dim), + ) + ), + name=specify_input, + ) + ] + + if convert_to == "mlprogram": + with pytest.warns( + UserWarning, + match="Some dimensions in the input shape are unknown, hence they are set to " + "flexible ranges with lower bound and default value = 1, and upper bound = 2. " + "To set different values for the default shape and upper bound, please use " + "the ct.RangeDim.*", + ): + mlmodel = ct.convert( + self.model, + source="tensorflow", + inputs=inputs, + convert_to=convert_to, + compute_units=compute_unit, + ) + else: + mlmodel = ct.convert( + self.model, + source="tensorflow", + inputs=inputs, + convert_to=convert_to, + compute_units=compute_unit, + ) + + spec = mlmodel.get_spec() + # Notice the input in spec is not ordered, so need to use name to find input_1 and input_2. + for input_spec in spec.description.input: + if input_spec.name == "input_1": + input_1_spec = input_spec + elif input_spec.name == "input_2": + input_2_spec = input_spec + assert ( + list(input_1_spec.type.multiArrayType.shape) == [1, 3, 8, 16] + if specify_input == "input_1" + else [1, 1, 1, 16] + ) + assert ( + list(input_2_spec.type.multiArrayType.shape) == [1, 3, 8, 4] + if specify_input == "input_2" + else [1, 1, 1, 4] + ) + assert ( + input_1_spec.type.multiArrayType.shapeRange.sizeRanges[2].lowerBound == 2 + if specify_input == "input_1" + else 1 + ) + assert ( + input_2_spec.type.multiArrayType.shapeRange.sizeRanges[2].lowerBound == 2 + if specify_input == "input_2" + else 1 + ) + default_upper_bound = -1 if convert_to == "neuralnetwork" else 2 + assert ( + input_1_spec.type.multiArrayType.shapeRange.sizeRanges[2].upperBound == 10 + if specify_input == "input_1" + else default_upper_bound + ) + assert ( + input_2_spec.type.multiArrayType.shapeRange.sizeRanges[2].upperBound == 10 + if specify_input == "input_2" + else default_upper_bound + ) + + @pytest.mark.parametrize( + "backend, compute_unit", + itertools.product( + backends, + compute_units, + ), + ) + def test_multiarray_input_rangedim(self, backend, compute_unit): + input_shape_1 = ct.Shape( + shape=(1, 3, ct.RangeDim(8, 20, default=8), ct.RangeDim(10, 100, default=16)) + ) + input_shape_2 = ct.Shape( + shape=(1, 3, ct.RangeDim(4, 16, default=16), ct.RangeDim(1, 10, default=4)) + ) + + with pytest.warns() as raised_warnings: + model = ct.convert( + self.model, + source="tensorflow", + inputs=[ + ct.TensorType(shape=input_shape_1, name="input_1"), + ct.TensorType(shape=input_shape_2, name="input_2"), + ], + convert_to=backend[0], + compute_units=compute_unit, + ) + assert not self._find_unknown_dim_warning(raised_warnings) + + spec = model.get_spec() + assert list(spec.description.input[0].type.multiArrayType.shape) == [1, 3, 8, 16] + assert ( + spec.description.input[0].type.multiArrayType.shapeRange.sizeRanges[2].lowerBound == 8 + ) + assert ( + spec.description.input[0].type.multiArrayType.shapeRange.sizeRanges[2].upperBound == 20 + ) + assert list(spec.description.input[1].type.multiArrayType.shape) == [1, 3, 16, 4] + assert ( + spec.description.input[1].type.multiArrayType.shapeRange.sizeRanges[2].lowerBound == 4 + ) + assert ( + spec.description.input[1].type.multiArrayType.shapeRange.sizeRanges[2].upperBound == 16 + ) + + @pytest.mark.parametrize( + "backend, compute_unit, explicitly_set", + itertools.product( + backends, + 
compute_units, + [True, False], + ), + ) + def test_multiarray_input_rangedim_infinite(self, backend, compute_unit, explicitly_set): + convert_to = backend[0] + second_dim = ct.RangeDim(upper_bound=-1) if explicitly_set else ct.RangeDim() + input_shape = ct.Shape(shape=(1, 3, second_dim, ct.RangeDim(10, 100, default=16))) + + if convert_to == "mlprogram": + with pytest.raises( + ValueError, + match="For mlprogram, inputs with infinite upper_bound is not allowed. Please set " + 'upper_bound to a positive value in "RangeDim\(\)" for the "inputs" param in ' + "ct.convert\(\).", + ): + ct.convert( + self.model, + source="tensorflow", + inputs=[ct.TensorType(shape=input_shape, name="input_1")], + convert_to=convert_to, + compute_units=compute_unit, + ) + else: + model = ct.convert( + self.model, + source="tensorflow", + inputs=[ct.TensorType(shape=input_shape, name="input_1")], + convert_to=convert_to, + compute_units=compute_unit, + ) + spec = model.get_spec() + assert list(spec.description.input[0].type.multiArrayType.shape) == [1, 3, 1, 16] + assert ( + spec.description.input[0].type.multiArrayType.shapeRange.sizeRanges[2].lowerBound + == 1 + ) + assert ( + spec.description.input[0].type.multiArrayType.shapeRange.sizeRanges[2].upperBound + == -1 + ) + + @pytest.mark.parametrize( + "backend, compute_unit", + itertools.product( + backends, + compute_units, + ), + ) + def test_multiarray_single_input_rangedim(self, backend, compute_unit): + input_1 = tf.keras.Input(shape=(None, None, 16), name="input_1") + x = tf.keras.layers.Dense(4, activation=tf.nn.relu)(input_1) + outputs = tf.keras.layers.Dense(5, activation=tf.nn.softmax)(x) + single_input_model = tf.keras.Model(inputs=input_1, outputs=outputs) + + # The `inputs` will work without specifying the name. + model = ct.convert( + single_input_model, + source="tensorflow", + inputs=[ + ct.TensorType( + shape=(1, 3, ct.RangeDim(8, 20, default=8), ct.RangeDim(10, 100, default=16)) + ) + ], + convert_to=backend[0], + compute_units=compute_unit, + ) + spec = model.get_spec() + assert list(spec.description.input[0].type.multiArrayType.shape) == [1, 3, 8, 16] + assert ( + spec.description.input[0].type.multiArrayType.shapeRange.sizeRanges[2].lowerBound == 8 + ) + assert ( + spec.description.input[0].type.multiArrayType.shapeRange.sizeRanges[2].upperBound == 20 + ) + + @pytest.mark.skipif( + ct.utils._macos_version() < (12, 0), + reason="Image input with RangeDim works correctly on macOS12+", + ) + @pytest.mark.parametrize( + "backend, compute_unit, explicitly_set", + itertools.product( + backends, + compute_units, + [True, False], + ), + ) + def test_image_input_rangedim_infinite(self, backend, compute_unit, explicitly_set): + convert_to = backend[0] + second_dim = ct.RangeDim(upper_bound=-1) if explicitly_set else ct.RangeDim() + input_shape = ct.Shape(shape=(1, 2, second_dim, ct.RangeDim(1, 10, default=3))) + + if convert_to == "mlprogram": + with pytest.raises( + ValueError, + match="For mlprogram, inputs with infinite upper_bound is not allowed. 
Please set " + 'upper_bound to a positive value in "RangeDim\(\)" for the "inputs" param in ' + "ct.convert\(\).", + ): + ct.convert( + self.model, + source="tensorflow", + inputs=[ct.ImageType(shape=input_shape, name="input_1")], + convert_to=convert_to, + compute_units=compute_unit, + ) + else: + model = ct.convert( + self.model, + source="tensorflow", + inputs=[ct.ImageType(shape=input_shape, name="input_1")], + convert_to=convert_to, + compute_units=compute_unit, + ) + spec = model.get_spec() + assert spec.description.input[0].type.imageType.width == 1 + assert spec.description.input[0].type.imageType.height == 2 + assert ( + spec.description.input[0].type.imageType.imageSizeRange.widthRange.lowerBound == 1 + ) + assert ( + spec.description.input[0].type.imageType.imageSizeRange.widthRange.upperBound == -1 + ) diff --git a/coremltools/converters/mil/testing_reqs.py b/coremltools/converters/mil/testing_reqs.py index 00c074879..4c6ba4bdd 100644 --- a/coremltools/converters/mil/testing_reqs.py +++ b/coremltools/converters/mil/testing_reqs.py @@ -8,8 +8,7 @@ import pytest import coremltools as ct -from coremltools._deps import (_HAS_TF_1, _HAS_TF_2, _HAS_TORCH) - +from coremltools._deps import _HAS_TF_1, _HAS_TF_2, _HAS_TORCH # Setting up backend / precision backends = [] @@ -31,11 +30,11 @@ backends = [('mlprogram', "fp16"), ('neuralnetwork', "fp32")] if os.getenv('INCLUDE_MIL_FP32_UNIT_TESTS') == '1': backends.append(('mlprogram', 'fp32')) - + # Setting up compute unit compute_units = [] -if 'COMPUTE_UNITS' in os.environ: - for i, cur_str_val in enumerate(os.environ['COMPUTE_UNITS'].split(',')): +if "COMPUTE_UNITS" in os.environ: + for cur_str_val in os.environ["COMPUTE_UNITS"].split(","): cur_str_val = cur_str_val.strip().upper() if cur_str_val not in ct.ComputeUnit.__members__: raise ValueError("Compute unit \"{}\" not supported in coremltools.".format(cur_str_val)) @@ -52,3 +51,6 @@ if _HAS_TF_2: tf = pytest.importorskip("tensorflow") tf.random.set_seed(1234) + +if _HAS_TORCH: + torch = pytest.importorskip("torch") diff --git a/coremltools/converters/mil/testing_utils.py b/coremltools/converters/mil/testing_utils.py index 781d2fb9b..01560f6ef 100644 --- a/coremltools/converters/mil/testing_utils.py +++ b/coremltools/converters/mil/testing_utils.py @@ -8,8 +8,10 @@ import re from functools import partial from pathlib import Path +from typing import Dict, List, Tuple import numpy as np +import pytest from PIL import Image import coremltools as ct @@ -22,12 +24,21 @@ np.random.seed(10) -DTYPE_TO_FEATURE_TYPE_MAP = {"int32": ft.ArrayFeatureType.INT32, - "fp32": ft.ArrayFeatureType.FLOAT32, - "fp16": ft.ArrayFeatureType.FLOAT16, - } - -einsum_equations = [ +DTYPE_TO_FEATURE_TYPE_MAP: Dict[str, ft.ArrayFeatureType] = { + "int32": ft.ArrayFeatureType.INT32, + "fp32": ft.ArrayFeatureType.FLOAT32, + "fp16": ft.ArrayFeatureType.FLOAT16, +} + +# The minimum macOS version for an IOS target. For example, iOS16 target requires macOS13+. 
+IOS_TO_MINIMUM_MACOS_VERSION: Dict[ct.target, int] = { + ct.target.iOS14: 11, + ct.target.iOS15: 12, + ct.target.iOS16: 13, + ct.target.iOS17: 14, +} + +einsum_equations: List[str] = [ # hardcoded cases "abcd,adce->abce", "abc,cbd->abd", @@ -102,7 +113,12 @@ def assert_op_count_match(program, expect, op=None, verbose=False): def assert_model_is_valid( - program, inputs, backend=("neuralnetwork", "fp32"), verbose=True, expected_output_shapes=None + program, + inputs, + backend=("neuralnetwork", "fp32"), + verbose=True, + expected_output_shapes=None, + minimum_deployment_target: ct.target = None, ): """ Assert Core ML model is valid. @@ -112,6 +128,9 @@ def assert_model_is_valid( - input: str -> shape tuple. All program input names need to appear in str. shape tuple can only contain positive integers. """ + if minimum_deployment_target is not None: + validate_minimum_deployment_target(minimum_deployment_target, backend) + # Avoid circular import from coremltools.converters.mil.testing_reqs import ct @@ -119,8 +138,13 @@ def assert_model_is_valid( for name, shape in inputs.items(): input_dict[name] = np.random.rand(*shape) - mlmodel = ct_convert(program, source="milinternal", convert_to=backend, - compute_units=ct.ComputeUnit.CPU_ONLY) + mlmodel = ct_convert( + program, + source="milinternal", + convert_to=backend, + compute_units=ct.ComputeUnit.CPU_ONLY, + minimum_deployment_target=minimum_deployment_target, + ) assert mlmodel is not None if verbose: @@ -245,7 +269,7 @@ def compare_backend( mlmodel, input_key_values, expected_outputs, - dtype = "fp32", + dtype="fp32", atol=1e-04, rtol=1e-05, also_compare_shapes=True, @@ -292,9 +316,7 @@ def compare_backend( return None -def compare_shapes( - mlmodel, input_key_values, expected_outputs, pred=None -): +def compare_shapes(mlmodel, input_key_values, expected_outputs, pred=None): """ Inputs: - mlmodel: MLModel. @@ -499,7 +521,8 @@ def random_gen_input_feature_type(input_desc): else: raise ValueError('unsupported type') -def gen_input_shapes_einsum(equation, dynamic): + +def gen_input_shapes_einsum(equation: str, dynamic: bool, backend: Tuple[str, str]): equation = equation.replace(" ", "") left = equation.split("->")[0] a_desc, b_desc = left.split(",") @@ -510,7 +533,10 @@ def gen_input_shapes_einsum(equation, dynamic): if symbol not in shapes: shapes[symbol] = cur_default_shape if dynamic: - converter_shapes[symbol] = ct.RangeDim(default=cur_default_shape) + converter_shapes[symbol] = ct.RangeDim( + default=cur_default_shape, + upper_bound=cur_default_shape if backend[0] == "mlprogram" else -1, + ) else: converter_shapes[symbol] = cur_default_shape cur_default_shape += 1 @@ -543,3 +569,19 @@ def assert_cast_ops_count(mlmodel, expected_count): def assert_ops_in_mil_program(mlmodel, expected_op_list): assert expected_op_list == get_op_types_in_program(mlmodel._mil_program) + + +def validate_minimum_deployment_target( + minimum_deployment_target: ct.target, backend: Tuple[str, str] +): + """ + Validates the minimum deployment target based on backend and macOS version. Only used in tests. + """ + if minimum_deployment_target >= ct.target.iOS15 and backend[0] != "mlprogram": + pytest.skip("IOS15+ target only compatible with mlprogram.") + if coremltoolsutils._is_macos(): + macos_major_version = coremltoolsutils._macos_version()[0] + if macos_major_version < IOS_TO_MINIMUM_MACOS_VERSION[minimum_deployment_target]: + pytest.skip( + f"IOS{minimum_deployment_target} target requires macOS {macos_major_version}+." 
+ ) diff --git a/coremltools/models/_deprecation.py b/coremltools/models/_deprecation.py index 5eb9d43df..2effb07d2 100644 --- a/coremltools/models/_deprecation.py +++ b/coremltools/models/_deprecation.py @@ -6,7 +6,7 @@ import warnings -def deprecated(obj=None, suffix=""): +def deprecated(obj=None, suffix="", version="", obj_prefix=""): """ Decorator to mark a function or a class as deprecated """ @@ -16,16 +16,14 @@ def decorator_deprecation_warning(obj): def wrapped(*args, **kwargs): if isinstance(obj, type): msg = ( - 'Class "%s" is deprecated and will be removed in 6.0.' - % obj.__name__ + f"Class {obj_prefix}{obj.__name__} is deprecated and will be removed in {version}." ) else: msg = ( - 'Function "%s" is deprecated and will be removed in 6.0.' - % obj.__name__ + f"Function {obj_prefix}{obj.__name__} is deprecated and will be removed in {version}." ) if suffix: - msg += "; %s" % suffix + msg += f"; {suffix}" warnings.warn(msg, category=FutureWarning) return obj(*args, **kwargs) diff --git a/coremltools/models/ml_program/compression_utils.py b/coremltools/models/ml_program/compression_utils.py index ab63fc6fd..942165e0f 100644 --- a/coremltools/models/ml_program/compression_utils.py +++ b/coremltools/models/ml_program/compression_utils.py @@ -5,605 +5,106 @@ import numpy as _np -from coremltools import _SPECIFICATION_VERSION_IOS_16 from coremltools.converters.mil import Operation as _Operation -from coremltools.converters.mil.converter import mil_convert as _mil_convert -from coremltools.converters.mil.frontend.milproto.load import load as _milproto_to_pymil -from coremltools.converters.mil.mil.passes.defs.quantization import ( - AbstractQuantizationPass as _AbstractQuantizationPass, +from coremltools.models._deprecation import deprecated as _deprecated +from coremltools.optimize.coreml import ( + OpLinearQuantizerConfig as _OpLinearQuantizerConfig, + OpMagnitudePrunerConfig as _OpMagnitudePrunerConfig, + OpPalettizerConfig as _OpPalettizerConfig, + OpThresholdPrunerConfig as _OpThresholdPrunerConfig, + OptimizationConfig as _OptimizationConfig, ) -from coremltools.converters.mil.mil.passes.defs.quantization import ( - WeightAffineQuantizer as _WeightAffineQuantizer, -) -from coremltools.converters.mil.mil.passes.defs.quantization import ( - WeightDecompressor as _WeightDecompressor, -) -from coremltools.converters.mil.mil.passes.defs.quantization import ( - WeightPalettizer as _WeightPalettizer, -) -from coremltools.converters.mil.mil.passes.defs.quantization import ( - WeightSparsifier as _WeightSparsifier, +from coremltools.optimize.coreml import ( + linear_quantize_weights as _linear_quantize_weights, + decompress_weights as _decompress_weights, + palettize_weights as _palettize_weights, + prune_weights as _prune_weights, ) _DEFAULT_MIN_WEIGHT_SIZE_TO_COMPRESS = 2048 -_DEFAULT_SPECIFICATION_VERSION_FOR_COMPRESSION = _SPECIFICATION_VERSION_IOS_16 - def _default_op_selector(const_op): if not isinstance(const_op, _Operation) or const_op.op_type != "const": raise ValueError("Input of the op_selector must be type of const Operation, got {}.".format(type(const_op))) return const_op.val.val.size > _DEFAULT_MIN_WEIGHT_SIZE_TO_COMPRESS -def _apply_graph_pass(mlmodel, graph_pass): - # Utility function which compresses a coreml model - # convert the fully precision mlmodel into pymil program - model_spec = mlmodel.get_spec() - model_type = model_spec.WhichOneof("Type") - if model_type in ("neuralNetwork", "neuralNetworkClassifier", "neuralNetworkRegressor", "pipeline", "PipelineClassifier", 
"PipelineRegressor"): - msg = ("coremltools.compression_utils are meant to be used only with mlprogram typed coreml models. " - "This model has type {}. Please use coremltools.models.neural_network.quantization_utils.quantize_weights" - "instead to compress the weights of the model.") - raise TypeError(msg.format(model_type)) - elif model_type == "mlProgram": - pass - else: - raise TypeError("weight compression not applicable for model type {}".format(model_type)) - - assert isinstance(graph_pass, _AbstractQuantizationPass), "compression pass must be an AbstractQuantizationPass instance" - specification_version = max(model_spec.specificationVersion, _DEFAULT_SPECIFICATION_VERSION_FOR_COMPRESSION) - prog = _milproto_to_pymil( - model_spec=model_spec, - specification_version=specification_version, - file_weights_dir=mlmodel.weights_dir, - ) - - # apply compression graph pass - graph_pass.apply(prog) - - # convert the pymil program back to mlmodel - compressed_mlmodel = _mil_convert( - prog, - convert_to="mlprogram", - convert_from="milinternal", - specification_version=specification_version, - compute_units=mlmodel.compute_unit, - model_description=model_spec.description, - ) - return compressed_mlmodel - +@_deprecated( + suffix="Please use coremltools.optimize.coreml.affine_quantize_weights", + version="7.0", + obj_prefix="coremltools.compression_utils.", +) def affine_quantize_weights(mlmodel, mode="linear_symmetric", op_selector=None, dtype=_np.int8): """ - Utility function to convert a float precision MLModel of type ``mlprogram`` that uses - float-precision weights into a compressed MLModel that uses 8-bit weights. This is - achieved by converting the float weight values that are stored in the ``const`` op - into the ``constexpr_affine_dequantize`` op. - - This function uses affine quantization on the float weights, providing up to 2x - savings in storage compared to float 16, or up to 4x savings compared to float 32. - All computation at runtime uses float precision; the precision of the intermediate - tensors and the compute precision of the ops are not altered. - - For each weight, this utility function converts the weight into the int8 or uint8 type using - either `Linear interpolation` (``"linear"`` mode) or `Linear symmetric - interpolation` (``"linear_symmetric"`` mode, the default). - - **Linear interpolation** - - Linear interpolation (``"linear"`` mode) maps the min/max of the float - range to the 8-bit integer range ``[low, high]`` using a zero point (also called quantization bias, or - offset) and a scale factor. For the int8 quantization, ``[low, high] = [-128, 127]``, while uint8 - quantization uses range ``[0, 255]``. - - ``"linear"`` mode uses the quantization formula: - - .. math:: - w_r = s * (w_q - z) - - Where: - - * :math:`w_r` and :math:`s` are of type float. - * :math:`w_r`` represents the float precision weight. - * :math:`s` represents the scale. - * :math:`w_q` and :math:`z` are of type 8-bit integer. - * :math:`w_q` represents quantized weight. - * :math:`z` represents the zero point. - - Quantized weights are computed as follows: - - .. math:: - w_q = cast\_to\_8\_bit\_integer(w_r / s + cast\_to\_float(z)) - - Note: :math:`cast\_to\_8\_bit\_integer` is the process of clipping the input to range ``[low, high]`` followed by rounding and casting to 8-bit integer. - - In ``"linear"`` mode, ``s, z`` are computed by mapping the original float range - ``[A, B]`` into the 8-bit integer range ``[-128, 127]`` or ``[0, 255]``. 
That is, you are solving the - following linear equations: - - * ``B = s * (high - z)`` - * ``A = s * (low - z)`` - - The equations result in the following: - - * ``s = (B - A) / (high - low)`` - * ``z = cast_to_8_bit_integer((low * B - high * A) / (B - A))`` - - When the rank of weight ``w`` is 1, then ``s`` and ``z`` are both scalars. When the - rank of the weight is greater than 1, then ``s`` and ``z`` are both vectors. In that - case, scales are computed per `channel`, in which `channel` is the output dimension, - which corresponds to the first dimension for ops such as ``conv`` and ``linear``, and - the second dimension for the ``conv_transpose`` op. - - For ``"linear"`` mode, :math:`A = min(w_r)`, :math:`B = max(w_r)`. - - **Linear symmetric interpolation** - - With linear symmetric interpolation (``"linear_symmetric"`` mode, the default), rather than - mapping the exact min/max of the float range to the quantized range, - - the function chooses the maximum absolute value between the min/max, which results in a - floating-point range that is symmetric with respect to zero. This also makes the resulting zero - point ``0`` for int8 weight and ``127`` for uint8 weight. - - For ``"linear_symmetric"`` mode: - - * :math:`A = -R` and :math:`B = R`, where :math:`R = max(abs(w_r))`. - * This function maps to the range of ``[-127, 127]`` for int8 weight and ``[0, 254]`` for uint8 weight. - * The result is ``s=(B-A)/254`` -> ``s=2R/254`` -> ``s=R/127``. - * Solving for ``z``: - * int8: ``z = (-127 * R + 127 * R)/2R`` -> ``z=0``. - * uint8: ``z = (0 * R + 254 * R)/2R`` -> ``z=127``. - - Parameters - ---------- - mlmodel: MLModel - Model to be quantized. This MLModel should be of type ``mlprogram``. - - mode: str - Mode for linear quantization: - - * ``"linear_symmetric"`` (default): Input data are quantized in the range - ``[-R, R]``, where :math:`R = max(abs(w_r))`. - * ``"linear"``: Input data are quantized in the range - :math:`[min(w_r), max(w_r)]`. - - op_selector: callable - This function takes a single parameter with type ``coremltools.converters.mil.Const``; - that is, a ``const`` operation. It returns a ``bool``: ``True`` to compress ``const_op``, - otherwise ``False``. See the following examples: - - * All constants in the network are compressed: - - .. sourcecode:: python - - def op_selector(const_op): - return True - - * Only the constant with ``tensor.size > 2048`` is compressed: - - .. sourcecode:: python - - def op_selector(const_op): - return const_op.val.val.size > 2048 - - * Compress the constant if it is the weight of a convolution layer - and ``tensor.size > 2048``: - - .. sourcecode:: python - - def op_selector(const_op): - return ( - const_op.val.val.size > 2048 - and const_op.val.child_ops[0].op_type == "conv" - and const_op.val == const_op.val.child_ops[0].weight - ) - - * When creating a custom ``op_selector`` function, the following attributes are helpful: - - * ``const_op.val.val``: The numpy array holding the value of the const. - * ``const_op.val.child_ops``: A list of ops into which this constant is feeding. - * ``const_op.val.child_ops[i].op_type``: The string corresponding to the op type - of the i-th child op. - * ``const_op.val.child_ops[i].name``: The string corresponding to the name the - i-th child op. - - * If ``op_selector`` is not provided, it will be set to the behavior in which - weights bigger than 2048 elements are compressed: - - .. 
sourcecode:: python - - def op_selector(const_op): - return const_op.val.val.size > 2048 - - dtype: np.generic or mil.type type - Determines the quantizaed data type (int8/uint8). - - * The allowed values are: - * ``np.int8`` (the default) - * ``np.uint8`` - * ``coremltools.converters.mil.mil.types.int8`` - * ``coremltools.converters.mil.mil.types.uint8`` - - Returns - ------- - - model: MLModel - The quantized MLModel instance. - - Examples - -------- - - import coremltools as ct - model = ct.models.MLModel('my_model.mlpackage') - compressed_model = ct.compression_utils.affine_quantize_weights(model, mode="linear_symmetric") - + ``coremltools.compression_utils.affine_quantize_weights`` is deprecated and will be removed in the future. + Please use ``coremltools.optimize.coreml.linear_quantize_weights``. """ if op_selector is None: op_selector = _default_op_selector - affine_weight_quantizer = _WeightAffineQuantizer(fake_compression=False, mode=mode, op_selector=op_selector, dtype=dtype) - return _apply_graph_pass(mlmodel, affine_weight_quantizer) + op_config = _OpLinearQuantizerConfig(mode=mode, dtype=dtype, weight_threshold=None) + config = _OptimizationConfig(global_config=op_config, is_deprecated=True, op_selector=op_selector) + return _linear_quantize_weights(mlmodel, config) +@_deprecated( + suffix="Please use coremltools.optimize.coreml.palettize_weights", + version="7.0", + obj_prefix="coremltools.compression_utils.", +) def palettize_weights(mlmodel, nbits=None, mode="kmeans", op_selector=None, lut_function=None): """ - Utility function to convert a float precision MLModel of type ``mlprogram`` to a - compressed MLModel by reducing the overall number of weights using a lookup table - (LUT). A LUT contains a list of float values. An `nbit` LUT has 2\ :sup:`nbits` entries. - - For example, a float weight vector such as ``{0.3, 0.3, 0.5, 0.5}`` can be compressed - using a 1-bit LUT: ``{0.3, 0.5}``. In this case the float vector can be replaced - with a 1-bit vector ``{0, 0, 1, 1}``. - - This function iterates over all the weights in the ``mlprogram``, discretizes its values, - and constructs the LUT according to the algorithm specified in ``mode``. The float - values are then converted to the `nbit` values, and the LUT is saved alongside each - weight. The ``const`` ops storing weight values are replaced by - ``constexpr_lut_to_dense`` ops. - - At runtime, the LUT and the `nbit` values are used to reconstruct the float weight - values, which are then used to perform the float operaton the weight is feeding into. - - Consider the following example of ``"uniform"`` mode (a linear histogram): - - * ``nbits = 4`` - * ``mode = "uniform"`` - * ``weight = [0.11, 0.19, 0.3, 0.08, 0.0, 0.02]`` - - The weight can be converted to a palette with indices ``[0, 1, 2, 3]`` (2 bits). The - indices are a byte array. - - The data range ``[0.0, 0.3]`` is divided into 4 partitions linearly, which is - ``[0.0, 0.1, 0.2, 0.3]``. - - * The LUT would be ``[0.0, 0.1, 0.2, 0.3]``. - - * The weight is rounded to ``[0.1, 0.2, 0.3, 0.1, 0.0, 0.0]``, and represented in - the palette as indices ``[01b, 10b, 11b, 01b, 00b, 00b]``. - - Parameters - ---------- - mlmodel: MLModel - Model to be converted by a LUT. This MLModel should be of type ``mlprogram``. - - nbits: int - Number of bits per weight. Required for ``kmeans`` or ``uniform`` mode, but must - not be set for ``unique`` or ``custom`` mode. A LUT would have - 2\ :sup:`nbits` entries, where `nbits` can be ``{1, 2, 4, 6, 8}``. 
- - mode: str - Determine how the LUT is constructed by specifying one of the following: - - * ``"kmeans"`` (default): The LUT is generated by `k-means clustering`, a method of vector - quantization that groups similar data points together to discover underlying - patterns by using a fixed number (`k`) of clusters in a dataset. A cluster - refers to a collection of data points aggregated together because of certain - similarities. `nbits` is required. - - * ``"uniform"``: The LUT is generated by a linear histogram. - - - ``[v_min, v_min + scale, v_min + 2 * scale, ..., v_max]`` - - Where the weight is in the range ``[v_min, v_max]``, and - ``scale = (v_max - v_min) / (1 << nbits - 1)``. - - ``nbits`` is required. - - A `histogram` is a representation of the distribution of a continuous variable, - in which the entire range of values is divided into a series of intervals (or - `bins`) and the representation displays how many values fall into each bin. - Linear histograms have one bin at even intervals, such as one bin per integer. - - * ``"unique"``: The LUT is generated by unique values in the weights. The weights - are assumed to be on a discrete lattice but stored in a float data type. This - parameter identifies the weights and converts them into the palettized representation. - - Do not provide ``nbits`` for this mode. ``nbits`` is picked up automatically, - with the smallest possible value in ``{1, 2, 4, 6, 8}`` such that the - number of the unique values is ``<= (1 << nbits)``. If the weight has ``> 256`` - unique values, the compression is skipped. - - For example: - - * If the weights are ``{0.1, 0.2, 0.3, 0.4}`` and ``nbits=2``, the weights are - converted to ``{00b, 01b, 10b, 11b}``, and the generated LUT is - ``[0.1, 0.2, 0.3, 0.4]``. - * If the weights are ``{0.1, 0.2, 0.3, 0.4}`` and ``nbits=1``, nothing happens - because the weights are not a 1-bit lattice. - * If the weights are ``{0.1, 0.2, 0.3, 0.4, 0.5}`` and ``nbits=2``, nothing - happens because the weights are not a 2-bit lattice. - - * ``"custom"``: The LUT and palettization parameters are calculated using a custom - function. If this mode is selected then ``lut_function`` must be provided. - - Do not provide ``nbits`` for this mode. The user should customize ``nbits`` in the - ``lut_function`` implementation. - - op_selector: callable - This function takes a single parameter with type ``coremltools.converters.mil.Operation``. - It returns a ``bool``: ``True`` to compress ``const_op``, otherwise ``False``. - See the following examples: - - * All constants in the network are compressed: - - .. sourcecode:: python - - def op_selector(const_op): - return True - - * Only the constant with ``tensor.size > 2048`` is compressed: - - .. sourcecode:: python - - def op_selector(const_op): - return const_op.val.val.size > 2048 - - * Compress the constant if it is the weight of a convolution layer - and ``tensor.size > 2048``: - - .. sourcecode:: python - - def op_selector(const_op): - return ( - const_op.val.val.size > 2048 - and const_op.val.child_ops[0].op_type == "conv" - and const_op.val == const_op.val.child_ops[0].weight - ) - - * When creating a custom ``op_selector`` function, the following attributes are helpful: - - * ``const_op.val.val``: The numpy array holding the value of the const. - * ``const_op.val.child_ops``: A list of ops into which this constant is feeding. - * ``const_op.val.child_ops[i].op_type``: The string corresponding to the op type - of the i-th child op. 
- * ``const_op.val.child_ops[i].name``: The string corresponding to the name the - i-th child op. - - * If ``op_selector`` is not provided, it will be set to the behavior in which - weights bigger than 2048 elements are compressed: - - .. sourcecode:: python - - def op_selector(const_op): - return const_op.val.val.size > 2048 - - lut_function: callable - A callable function which computes the weight palettization parameters. This must - be provided if the mode is set to ``"custom"``. - - weight: np.ndarray - A float precision numpy array. - - Returns: lut: list[float] - The lookup table. - - indices: list[int] - A list of indices for each element. - - The following is an example that extract the ``top_k`` elements as the LUT. Given - that ``weight = [0.1, 0.5, 0.3, 0.3, 0.5, 0.6, 0.7]``, the ``lut_function`` - produces ``lut = [0, 0.5, 0.6, 0.7], indices = [0, 1, 0, 0, 2, 3]``. - - .. sourcecode:: python - - def lut_function(weight): - # In this example, we assume elements in the weights >= 0 - weight = weight.flatten() - nbits = 4 - - # Get the LUT, from extracting top k maximum unique elements in the weight to be the LUT - # Note that k = 1 << nbits - 1, so we have the first element be 0 - unique_elements = np.unique(weight) - k = (1 << nbits) - 1 - top_k = np.partition(weight, -k)[-k:] - np.sort(top_k) - lut = [0.0] + top_k.tolist() - - # Compute the indices - mapping = {v: idx for idx, v in enumerate(lut)} - indices = [mapping[v] if v in mapping else 0 for v in weight] - - return lut, indices - - Returns - ------- - model: MLModel - The palettized MLModel instance. - - Examples - -------- - - .. sourcecode:: python - - import coremltools as ct - - model = ct.models.MLModel("my_model.mlpackage") - compressed_model = ct.compression_utils.palettize_weights(model, mode="kmeans", nbits=4) - - + ``coremltools.compression_utils.palettize_weights`` is deprecated and will be removed in the future. + Please use ``coremltools.optimize.coreml.palettize_weights``. """ if op_selector is None: op_selector = _default_op_selector - weight_palettizer = _WeightPalettizer(nbits=nbits, fake_compression=False, op_selector=op_selector, mode=mode, lut_function=lut_function) - return _apply_graph_pass(mlmodel, weight_palettizer) + op_config = _OpPalettizerConfig(nbits=nbits, mode=mode, lut_function=lut_function, weight_threshold=None) + config = _OptimizationConfig(global_config=op_config, is_deprecated=True, op_selector=op_selector) + return _palettize_weights(mlmodel, config) +@_deprecated( + suffix="Please use coremltools.optimize.coreml.sparsify_weights", + version="7.0", + obj_prefix="coremltools.compression_utils.", +) def sparsify_weights(mlmodel, mode="threshold_based", threshold=1e-3, target_percentile=1.0, op_selector=None): """ - Utility function to convert a float precision MLModel of type ``mlprogram`` to a - compressed MLModel using sparse representation. The ``const`` ops storing weight - values are replaced by ``constexpr_sparse_to_dense`` ops. - - This function is useful if the model is trained with pruning techniques so that - a lot of weights have zero values. If a large percentage of weight values are zero, - a sparse representation is more efficient than a dense one (the default). - - The sparsified weights are stored in a bit mask. If the weight values are - ``{0, 0, 0, 0, 0, 0, 0, 56.3}``, its sparse representation contains a bit mask with - ones on locations where the value is non-zero: ``00000001b``. This is accompanied by - non-zero data, which is a size-1 vector of value ``{56.3}``. 
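The ``"uniform"`` palettization mode walked through in the ``palettize_weights`` docstring above amounts to a linear-histogram LUT plus nearest-entry indices. A minimal sketch (``uniform_palettize`` is an illustrative helper, not a coremltools API), using ``nbits=2`` to match the 2-bit indices of the worked example:

.. sourcecode:: python

    import numpy as np

    def uniform_palettize(weight, nbits):
        # LUT = [v_min, v_min + scale, ..., v_max], scale = (v_max - v_min) / (2**nbits - 1)
        v_min, v_max = float(weight.min()), float(weight.max())
        n_entries = 1 << nbits
        scale = (v_max - v_min) / (n_entries - 1)
        lut = v_min + scale * np.arange(n_entries)
        # Map every weight to the index of its closest LUT entry.
        indices = np.argmin(np.abs(weight.reshape(-1, 1) - lut.reshape(1, -1)), axis=1)
        return lut, indices.astype(np.uint8)

    weight = np.array([0.11, 0.19, 0.3, 0.08, 0.0, 0.02])
    lut, indices = uniform_palettize(weight, nbits=2)
    # lut     -> [0.0, 0.1, 0.2, 0.3]
    # indices -> [1, 2, 3, 1, 0, 0], i.e. [01b, 10b, 11b, 01b, 00b, 00b] as in the docstring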
- - For example, given the following: - - * ``weight = [0.3, 0, 0, 0.5, 0, 0]`` - * ``non_zero_data, bit_mask = sparsify(weight)`` - - The indices of the non-zero elements are: - - * ``non_zero_data = [0.3, 0.5]`` - * ``bit_mask = "100100"`` - - Parameters - ---------- - mlmodel: MLModel - Model to be sparsified. This MLModel should be of type ``mlprogram``. - - mode: str - Determine the scheme to sparsify the model by specifying one of the following: - - * ``"threshold_based"`` (default): All the absolute weight values that are smaller - than ``threshold`` are changed to 0, and the tensor is stored in a sparse format. - For example, given the following: - - * ``weight = [0.3, -0.2, -0.01, 0.05]`` - * ``threshold = 0.03`` - - The sparsified weight would be ``[0.3, -0.2, 0, 0.05]``. - - * ``"percentile_based"``: Sparsify the weight with a constant sparsity percentile, - which is ``target_percentile``. Where - ``n = floor(size_of_weight_tensor * target_percentile)``, the ``n`` lowest - absolute weight values are changed to 0. For example, given the following: - - * ``weight = [0.3, -0.2, -0.01, 0.05]`` - * ``target_percentile = 0.75`` - - The sparsified weight would be ``[0.3, 0, 0, 0]``. - - threshold: float - Required when ``mode = "prune_threshold"``. The absolute threshold to sparsify the weight. - - target_percentile: float - Required when ``mode = "percentile_based"``. The percentage of sparsity for - compression, which needs to be in the range [0, 1]. When 0, no sparsification - occurs. For 1, all weights become 0. - - op_selector: callable - This function takes a single parameter with type ``coremltools.converters.mil.Operation``. - It returns a ``bool``: ``True`` to compress ``const_op``, otherwise ``False``. - See the following examples: - - * All constants in the network are compressed: - - .. sourcecode:: python - - def op_selector(const_op): - return True - - * Only the constant with ``tensor.size > 2048`` is compressed: - - .. sourcecode:: python - - def op_selector(const_op): - return const_op.val.val.size > 2048 - - * Compress the constant if it is the weight of a convolution layer - and ``tensor.size > 2048``: - - .. sourcecode:: python - - def op_selector(const_op): - return ( - const_op.val.val.size > 2048 - and const_op.val.child_ops[0].op_type == "conv" - and const_op.val == const_op.val.child_ops[0].weight - ) - - * When creating a custom ``op_selector`` function, the following attributes are helpful: - - * ``const_op.val.val``: The numpy array holding the value of the const. - * ``const_op.val.child_ops``: A list of ops into which this constant is feeding. - * ``const_op.val.child_ops[i].op_type``: The string corresponding to the op type - of the i-th child op. - * ``const_op.val.child_ops[i].name``: The string corresponding to the name the - i-th child op. - - * If ``op_selector`` is not provided, it will be set to the behavior in which - weights bigger than 2048 elements are compressed: - - .. sourcecode:: python - - def op_selector(const_op): - return const_op.val.val.size > 2048 - - Returns - ------- - model: MLModel - The sparse MLModel instance. - - Examples - -------- - .. sourcecode:: python - - import coremltools as ct - - model = ct.models.MLModel("my_model.mlpackage") - compressed_model = ct.compression_utils.sparsify_weights( - model, mode="threshold_based", threshold=0.01 - ) - + ``coremltools.compression_utils.sparsify_weights`` is deprecated and will be removed in the future. + Please use ``coremltools.optimize.coreml.prune_weights``. 
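The threshold-based sparsification described above reduces to a mask-and-gather step: zero out small magnitudes, then keep a bit mask plus the surviving values. A minimal sketch (``threshold_sparsify`` is an illustrative helper, not part of coremltools), reproducing the ``[0.3, -0.2, -0.01, 0.05]`` example from the docstring:

.. sourcecode:: python

    import numpy as np

    def threshold_sparsify(weight, threshold):
        # Values with |w| < threshold become 0; the rest are stored densely next to a bit mask.
        pruned = np.where(np.abs(weight) < threshold, 0.0, weight)
        bit_mask = (pruned != 0).astype(np.uint8)
        non_zero_data = pruned[pruned != 0]
        return non_zero_data, bit_mask

    weight = np.array([0.3, -0.2, -0.01, 0.05])
    non_zero_data, bit_mask = threshold_sparsify(weight, threshold=0.03)
    # non_zero_data -> [0.3, -0.2, 0.05]
    # bit_mask      -> [1, 1, 0, 1]

The ``"percentile_based"`` mode differs only in how the cutoff is chosen (the ``n`` smallest magnitudes are zeroed, with ``n = floor(size * target_percentile)``); after the deprecation it maps to ``_OpMagnitudePrunerConfig``, as shown in the rewritten function body below.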
""" if op_selector is None: op_selector = _default_op_selector - weight_sparsifier = _WeightSparsifier(mode=mode, threshold=threshold, target_percentile=target_percentile, op_selector=op_selector) - return _apply_graph_pass(mlmodel, weight_sparsifier) - -def decompress_weights(mlmodel): - """ - Utility function to convert weights that are sparse or palettized or affine quantized, back to the float format. - That is, convert any of the follwing three ops: - - (1) constexpr_affine_dequantize - (2) constexpr_lut_to_dense - (3) constexpr_sparse_to_dense - to mb.const - - Parameters - ---------- - mlmodel: MLModel - Model which will be decompressed. - - Returns - ------- - model: MLModel - The MLModel with no constexpr ops included. + if mode.upper() == "THRESHOLD_BASED": + op_config = _OpThresholdPrunerConfig( + threshold=threshold, + minimum_sparsity_percentile=0.0, + weight_threshold=None, + ) - Examples - -------- - .. sourcecode:: python + elif mode.upper() == "PERCENTILE_BASED": + op_config = _OpMagnitudePrunerConfig( + target_sparsity=target_percentile, + weight_threshold=None, + ) - import coremltools as ct + else: + raise ValueError( + 'Only modes "THRESHOLD_BASED" and "PERCENTILE_BASED" are supported for weight sparsification.' + f' Got mode: "{mode}".' + ) - model = ct.models.MLModel("my_compressed_model.mlpackage") - decompressed_model = ct.compression_utils.decompress_weights(model) + config = _OptimizationConfig(global_config=op_config, is_deprecated=True, op_selector=op_selector) + return _prune_weights(mlmodel, config) +@_deprecated( + suffix="Please use coremltools.optimize.coreml.decompress_weights", + version="7.0", + obj_prefix="coremltools.compression_utils.", +) +def decompress_weights(mlmodel): + """ + ``coremltools.compression_utils.decompress_weights`` is deprecated and will be removed in the future. + Please use ``coremltools.optimize.coreml.decompress_weights``. """ - weight_decompressor = _WeightDecompressor(op_selector=lambda op: True) - return _apply_graph_pass(mlmodel, weight_decompressor) + return _decompress_weights(mlmodel) diff --git a/coremltools/models/model.py b/coremltools/models/model.py index 0f38fc2ad..74b765126 100644 --- a/coremltools/models/model.py +++ b/coremltools/models/model.py @@ -20,11 +20,19 @@ from ..proto import FeatureTypes_pb2 as _ft from ..proto import MIL_pb2 as _MIL_pb2 from ..proto import Model_pb2 as _Model_pb2 -from .utils import (_MLMODEL_EXTENSION, _MLPACKAGE_AUTHOR_NAME, - _MLPACKAGE_EXTENSION, _WEIGHTS_DIR_NAME, _create_mlpackage, - _has_custom_layer, _is_macos, _macos_version, - load_spec as _load_spec, save_spec as _save_spec, - ) +from .utils import ( + _MLMODEL_EXTENSION, + _MLPACKAGE_AUTHOR_NAME, + _MLPACKAGE_EXTENSION, + _MODEL_FILE_NAME, + _WEIGHTS_DIR_NAME, + _create_mlpackage, + _has_custom_layer, + _is_macos, + _macos_version, +) +from .utils import load_spec as _load_spec +from .utils import save_spec as _save_spec if _HAS_TORCH: import torch as _torch @@ -313,6 +321,31 @@ def cleanup(package_path): if _os.path.exists(package_path): _shutil.rmtree(package_path) + def does_model_contain_mlprogram(model) -> bool: + """ + Is this an mlprogram or is it a pipeline with at least one mlprogram? + """ + model_type = model.WhichOneof("Type") + + if model_type == "mlProgram": + return True + elif model_type not in ("pipeline", "pipelineClassifier", "pipelineRegressor"): + return False + + # Does this pipeline contain an mlprogram? 
+ if model_type == "pipeline": + pipeline_models = model.pipeline.models + elif model_type == "pipelineClassifier": + pipeline_models = model.pipelineClassifier.pipeline.models + else: + assert model_type == "pipelineRegressor" + pipeline_models = model.pipelineRegressor.pipeline.models + + for m in pipeline_models: + if does_model_contain_mlprogram(m): + return True + return False + if not isinstance(compute_units, _ComputeUnit): raise TypeError('"compute_units" parameter must be of type: coremltools.ComputeUnit') elif (compute_units == _ComputeUnit.CPU_AND_NE @@ -343,12 +376,13 @@ def cleanup(package_path): model, compute_units, skip_model_load=skip_model_load, ) elif isinstance(model, _Model_pb2.Model): - model_type = model.WhichOneof('Type') - if model_type in ("mlProgram", 'pipelineClassifier', 'pipelineRegressor', 'pipeline'): - if model_type == "mlProgram" and weights_dir is None: - raise Exception('MLModel of type mlProgram cannot be loaded just from the model spec object. ' - 'It also needs the path to the weights file. Please provide that as well, ' - 'using the \'weights_dir\' argument.') + if does_model_contain_mlprogram(model): + if model.WhichOneof("Type") == "mlProgram" and weights_dir is None: + raise Exception( + "MLModel of type mlProgram cannot be loaded just from the model spec object. " + "It also needs the path to the weights file. Please provide that as well, " + "using the 'weights_dir' argument." + ) self.is_package = True self.is_temp_package = True filename = _create_mlpackage(model, weights_dir) @@ -460,6 +494,11 @@ def save(self, save_path: str): elif ext != _MLPACKAGE_EXTENSION: raise Exception("For an ML Program, extension must be {} (not {})".format(_MLPACKAGE_EXTENSION, ext)) _shutil.copytree(self.package_path, save_path) + + saved_spec_path = _os.path.join( + save_path, "Data", _MLPACKAGE_AUTHOR_NAME, _MODEL_FILE_NAME + ) + _save_spec(self._spec, saved_spec_path) else: _save_spec(self._spec, save_path) @@ -572,6 +611,13 @@ def verify_and_convert_input_dict(d): else: raise Exception("Unable to load CoreML.framework. Cannot make predictions.") + def _input_has_infinite_upper_bound(self) -> bool: + """Check if any input has infinite upper bound (-1).""" + for input_spec in self.input_description._fd_spec: + for size_range in input_spec.type.multiArrayType.shapeRange.sizeRanges: + if size_range.upperBound == -1: + return True + return False def _set_build_info_mil_attributes(self, metadata): if self._spec.WhichOneof('Type') != "mlProgram": diff --git a/coremltools/models/neural_network/builder.py b/coremltools/models/neural_network/builder.py index cdf49998c..bbbf06377 100644 --- a/coremltools/models/neural_network/builder.py +++ b/coremltools/models/neural_network/builder.py @@ -42,7 +42,8 @@ def _set_recurrent_activation(param, activation): param.tanh.MergeFromString(b"") elif activation == "LINEAR": param.linear.MergeFromString(b"") - elif activation == "SIGMOID_HARD": + elif activation == "SIGMOID_HARD" or activation == "HARD_SIGMOID": + # The standard name is "hard_sigmoid", but in nn there are still usages of "sigmoid_hard". param.sigmoidHard.MergeFromString(b"") elif activation == "SCALED_TANH": param.scaledTanh.MergeFromString(b"") @@ -223,8 +224,8 @@ class NeuralNetworkBuilder: defined. The builder can also set preprocessing steps to handle specialized input formats (such as images), and set class labels for neural network classifiers. 
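The error path above makes the requirement explicit: an ``mlProgram`` spec object alone is not enough to construct an ``MLModel``; the weights directory must be supplied through ``weights_dir``. A hedged usage sketch follows, where the in-package paths are only illustrative of the usual ``.mlpackage`` layout:

.. sourcecode:: python

    import coremltools as ct
    from coremltools.models.utils import load_spec

    # Load the protobuf spec from inside an existing model package (illustrative path).
    spec = load_spec("my_model.mlpackage/Data/com.apple.CoreML/model.mlmodel")

    # For an mlProgram spec the weights directory has to be passed as well,
    # otherwise the exception shown in the diff above is raised.
    model = ct.models.MLModel(
        spec,
        weights_dir="my_model.mlpackage/Data/com.apple.CoreML/weights",
    )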
- - Refer to the protobuf messages in the specification (NeuralNetwork.proto) + + Refer to the protobuf messages in the specification (NeuralNetwork.proto) for more details. Examples @@ -237,20 +238,28 @@ class NeuralNetworkBuilder: # Create a neural network binary classifier that classifies # 3-dimensional data points # Specify input and output dimensions - >>> input_dim = (3,) - >>> output_dim = (2,) + input_dim = (3,) + output_dim = (2,) # Specify input and output features - >>> input_features = [('data', datatypes.Array(*input_dim))] - >>> output_features = [('probs', datatypes.Array(*output_dim))] + input_features = [("data", datatypes.Array(*input_dim))] + output_features = [("probs", datatypes.Array(*output_dim))] # Build a simple neural network with 1 inner product layer - >>> builder = NeuralNetworkBuilder(input_features, output_features) - >>> builder.add_inner_product(name='ip_layer', W=weights, b=bias, input_channels=3, output_channels=2, - ... has_bias=True, input_name='data', output_name='probs') + builder = NeuralNetworkBuilder(input_features, output_features) + builder.add_inner_product( + name="ip_layer", + W=weights, + b=bias, + input_channels=3, + output_channels=2, + has_bias=True, + input_name="data", + output_name="probs", + ) # save the spec by the builder - >>> save_spec(builder.spec, 'network.mlmodel') + save_spec(builder.spec, "network.mlmodel") """ def __init__( @@ -273,22 +282,22 @@ def __init__( ---------- input_features: [(str, datatypes.Array)] or None - List of input feature of the network. - Each feature is a ``(name, array)`` tuple, where ``name`` is the - name of the feature, and ``array`` is a ``datatype.Array`` object + List of input feature of the network. + Each feature is a ``(name, array)`` tuple, where ``name`` is the + name of the feature, and ``array`` is a ``datatype.Array`` object describing the feature type. - + * When ``spec`` is ``None`` (building from scratch), ``input_features`` must not be ``None``. output_features: [(str, datatypes.Array or None)] or None - List of output feature of the network. Each feature is a - ``(name, array)`` tuple, where ``name`` is the name of the feature, + List of output feature of the network. Each feature is a + ``(name, array)`` tuple, where ``name`` is the name of the feature, and ``array`` is a ``datatypes.Array`` object describing the feature type. - + * The ``array`` can be ``None`` if not known. - + * When ``spec`` is ``None`` (building from scratch), ``output_features`` must not be ``None``. - + mode: str ('classifier', 'regressor' or None) Mode (one of ``'classifier'``, ``'regressor'``, or ``None``). @@ -298,28 +307,28 @@ def __init__( disable_rank5_shape_mapping: bool Only applicable for neural networks. - + If True, inputs are no longer forced to map to rank 5 tensors (rank is equal to the length of the shape of the tensor). Instead, for multi-array inputs ``"EXACT_ARRAY_MAPPING"`` mapping is used, whereas - for image inputs ``"RANK4_IMAGE_MAPPING"`` is used. For details, + for image inputs ``"RANK4_IMAGE_MAPPING"`` is used. For details, see description of enums ``NeuralNetworkMultiArrayShapeMapping`` and ``NeuralNetworkImageShapeMapping`` in NeuralNetwork.proto. - + When ``spec`` is not ``None``, this argument will be ignored. spec: None or coremltools.proto.Model_pb2 - If ``None``, a new MLModel spec will be created by the builder with + If ``None``, a new MLModel spec will be created by the builder with input and output features. - - Otherwise, the builder will continue to build on ``spec``. 
+ + Otherwise, the builder will continue to build on ``spec``. This is useful when the MLModel is built incrementally. nn_spec: None or coremltools.proto.NeuralNetwork_pb2 If ``None``, a new, empty NeuralNetwork proto will be created for spec. - - If ``nn_spec`` is not ``None`` and ``spec`` is ``None``, the builder will - build a NeuralNetwork spec without wrapping it within an MLModel. + + If ``nn_spec`` is not ``None`` and ``spec`` is ``None``, the builder will + build a NeuralNetwork spec without wrapping it within an MLModel. This is useful to create nested NeuralNetworks for models with control flow operations. @@ -333,9 +342,9 @@ def __init__( # Construct a builder that builds a neural network classifier with a 299 x 299 x 3 # dimensional input and 1000 dimensional output - >>> input_features = [('data', datatypes.Array((299, 299, 3)))] - >>> output_features = [('probs', datatypes.Array((1000,)))] - >>> builder = NeuralNetworkBuilder(input_features, output_features, mode='classifier') + input_features = [("data", datatypes.Array((299, 299, 3)))] + output_features = [("probs", datatypes.Array((1000,)))] + builder = NeuralNetworkBuilder(input_features, output_features, mode="classifier") See Also -------- @@ -448,7 +457,7 @@ def set_input(self, input_names, input_dims): # Set the neural network spec inputs to be 3 dimensional vector data1 and # 4 dimensional vector data2. - >>> builder.set_input(input_names=['data1', 'data2'], input_dims=[(3,), (4,)]) + builder.set_input(input_names=["data1", "data2"], input_dims=[(3,), (4,)]) See Also -------- @@ -509,7 +518,7 @@ def set_output(self, output_names, output_dims): # Set the neural network spec outputs to be 3 dimensional vector feature1 and # 4 dimensional vector feature2. - >>> builder.set_output(output_names=['feature1', 'feature2'], output_dims=[(3,), (4,)]) + builder.set_output(output_names=["feature1", "feature2"], output_dims=[(3,), (4,)]) See Also -------- @@ -544,7 +553,7 @@ def set_training_input(self, training_input): # Set the neural network spec training inputs to be 3 dimensional vector for 'input' and # Double for 'target'. - >>> builder.set_training_input([('input', datatypes.Array(3)), ('target', 'Double')]) + builder.set_training_input([("input", datatypes.Array(3)), ("target", "Double")]) """ spec = self.spec set_training_features(spec, training_input) @@ -933,19 +942,19 @@ def make_updatable(self, trainables): def set_categorical_cross_entropy_loss(self, name, input): r""" - Categorical Cross Entropy is used for single label categorization + Categorical Cross Entropy is used for single label categorization (only one category is applicable for each data point). Parameters ---------- name: The name of the loss layer input: The name of the input - The ``input`` should be a vector of length N representing the - distribution over N categories. This must be the output of a softmax. + The ``input`` should be a vector of length N representing the + distribution over N categories. This must be the output of a softmax. Notes ----- - + .. math:: Loss_ {CCE}(input, target) = -\sum_{i = 1} ^ {N}(target == i) log(input[i]) = - log(input[target]) """ @@ -1038,16 +1047,16 @@ def set_categorical_cross_entropy_loss(self, name, input): def set_mean_squared_error_loss(self, name, input_feature=None): """ input_feature: [(str, datatypes.Array)] or None - The input feature of the loss layer. 
Each feature is a - ``(name, array)`` tuple, where ``name`` is the name of the model's - tensor our loss will be attached to, and ``array`` is a + The input feature of the loss layer. Each feature is a + ``(name, array)`` tuple, where ``name`` is the name of the model's + tensor our loss will be attached to, and ``array`` is a ``datatypes.Array`` object describing the shape of that tensor. Both the name and the array's shape must be provided in the tuple. - + Examples -------- - - >>> feature = [('output_tensor', datatypes.Array((299, 299, 3)))] + + feature = [('output_tensor', datatypes.Array((299, 299, 3)))] """ if self.spec is None: return @@ -1463,7 +1472,7 @@ def add_inner_product( ): """ Add an inner product layer to the model. - Refer to the ``InnerProductLayerParams`` message in the specification + Refer to the ``InnerProductLayerParams`` message in the specification (NeuralNetwork.proto) for more details. Parameters @@ -1472,7 +1481,7 @@ def add_inner_product( The name of this layer. W: numpy.array or bytes() Weight matrix of shape ``(output_channels, input_channels)``. - If ``W`` is of type ``bytes()`` (quantized), other quantization + If ``W`` is of type ``bytes()`` (quantized), other quantization related arguments must be provided as well (see below). b: numpy.array Bias vector of shape: ``(output_channels, )``. @@ -1485,7 +1494,7 @@ def add_inner_product( - If True, the bias vector of this layer is not ignored. - If False, the bias vector is ignored. - + input_name: str The input blob name of this layer. output_name: str @@ -1494,31 +1503,31 @@ def add_inner_product( Quantization arguments, used when ``W`` is of type ``bytes()``: int_8_dynamic_quantize: boolean Whether to quantize and dequantize before and after inner product, respectively. - Expects byte weights, representing int8 values, if True. + Expects byte weights, representing int8 values, if True. See NeuralNetwork.proto for other validation conditions. is_quantized_weight: bool, optional - Set it to true when ``W`` is of type ``bytes()``, representing + Set it to true when ``W`` is of type ``bytes()``, representing quantized weights, default: false. quantization_type: str - When weights are quantized (that is, ``W`` is of type ``bytes()``), + When weights are quantized (that is, ``W`` is of type ``bytes()``), this should be either ``"linear"`` or ``"lut"``. nbits: int - Should be between 1 and 8 (inclusive). Number of bits per weight + Should be between 1 and 8 (inclusive). Number of bits per weight value. Only applicable when weights are quantized. quant_scale: numpy.array(dtype=numpy.float32) - scale vector to be used with linear quantization. Must be of + scale vector to be used with linear quantization. Must be of length either 1 or output_channels. quant_bias: numpy.array(dtype=numpy.float32) - bias vector to be used with linear quantization. Must be of + bias vector to be used with linear quantization. Must be of length either 1 or output_channels. quant_lut: numpy.array(dtype=numpy.float32) - the LUT (look up table) to be used with LUT quantization. + the LUT (look up table) to be used with LUT quantization. Must be of length 2^n bits. See Also @@ -1587,7 +1596,7 @@ def add_embedding( ): """ Add an embedding layer to the model. - Refer to the ``EmbeddingLayerParams`` message in the specification + Refer to the ``EmbeddingLayerParams`` message in the specification (NeuralNetwork.proto) for more details. Parameters @@ -1596,7 +1605,7 @@ def add_embedding( The name of this layer. 
W: float32 numpy.array or bytes() Weight matrix of shape ``(output_channels, input_dim)``. - If ``W`` is of type ``bytes()`` (quantized to 1-8 bits), other + If ``W`` is of type ``bytes()`` (quantized to 1-8 bits), other quantization related arguments must be provided as well (see below). b: numpy.array Bias vector of shape ``(output_channels, )``. @@ -1622,22 +1631,22 @@ def add_embedding( Set it to true when ``W`` is of type ``bytes()``, representing quantized weights. quantization_type: str - When weights are quantized (that is, ``W`` is of type ``bytes()``), + When weights are quantized (that is, ``W`` is of type ``bytes()``), this should be either ``"linear"`` or ``"lut"``. nbits: int Should be between 1 and 8 (inclusive). Number of bits per weight value. quant_scale: numpy.array(dtype=numpy.float32) - Scale vector to be used with linear quantization. + Scale vector to be used with linear quantization. Must be of length either 1 or output_channels. quant_bias: numpy.array(dtype=numpy.float32) - Bias vector to be used with linear quantization. + Bias vector to be used with linear quantization. Must be of length either 1 or output_channels. quant_lut: numpy.array(dtype=numpy.float32) - The LUT (look up table) to be used with LUT quantization. + The LUT (look up table) to be used with LUT quantization. Must be of length 2^n bits. See Also @@ -1687,7 +1696,7 @@ def add_embedding( def add_softmax(self, name, input_name, output_name): """ Add a softmax layer to the model. - Refer to the ``SoftmaxLayerParams`` message in the specification + Refer to the ``SoftmaxLayerParams`` message in the specification (NeuralNetwork.proto) for more details. Parameters @@ -1747,19 +1756,19 @@ def add_activation( ``f(x) = min(max(alpha * x + beta, -1), 1)`` where ``alpha`` and ``beta`` are constant scalars. - + - ``'LEAKYRELU'``: leaky relu function, defined as: ``f(x) = (x >= 0) * x + (x < 0) * alpha * x`` where ``alpha`` is a constant scalar. - + - ``'PRELU'``: Parametric ReLU function, defined as: ``f(x) = (x >= 0) * x + (x < 0) * alpha * x`` where ``alpha`` is a multi-dimensional array of same size as ``x``. - + - ``'ELU'``: Exponential linear unit function, defined as: ``f(x) = (x >= 0) * x + (x < 0) * (alpha * exp(x) - 1)`` @@ -1770,15 +1779,15 @@ def add_activation( ``f(x) = alpha * log(1 + exp(beta * x))`` - where ``alpha`` and ``beta`` are two multi-dimensional arrays + where ``alpha`` and ``beta`` are two multi-dimensional arrays of same size as ``x``. - + - ``'THRESHOLDEDRELU'``: Thresholded ReLU function, defined as: ``f(x) = (x >= alpha) * x`` where ``alpha`` is a constant scalar. - + - ``'LINEAR'``: linear function. ``f(x) = alpha * x + beta`` @@ -1790,17 +1799,17 @@ def add_activation( params: list of float or numpy.array Parameters for the activation, depending on non_linearity. - - When ``non_linearity`` is one of [``'RELU'``, ``'SIGMOID'``, ``'TANH'``, ``'SCALED_TANH'``, ``'SOFTPLUS'``, ``'SOFTSIGN'``], + - When ``non_linearity`` is one of [``'RELU'``, ``'SIGMOID'``, ``'TANH'``, ``'SCALED_TANH'``, ``'SOFTPLUS'``, ``'SOFTSIGN'``], params is ignored. - - When ``non_linearity`` is one of [``'SCALED_TANH'``, ``'SIGMOID_HARD'``, ``'LINEAR'``], + - When ``non_linearity`` is one of [``'SCALED_TANH'``, ``'SIGMOID_HARD'``, ``'LINEAR'``], param is a list of 2 floats ``[alpha, beta]``. 
- - When ``non_linearity`` is one of [``'LEAKYRELU'``, ``'ELU'``, ``'THRESHOLDEDRELU'``], + - When ``non_linearity`` is one of [``'LEAKYRELU'``, ``'ELU'``, ``'THRESHOLDEDRELU'``], param is a list of 1 float ``[alpha]``. - - When ``non_linearity`` is ``'PRELU'``, param is a list of 1 numpy array ``[alpha]``. + - When ``non_linearity`` is ``'PRELU'``, param is a list of 1 numpy array ``[alpha]``. The shape of ``alpha`` is ``(C,)``, where ``C`` is either the number of input channels or 1. When ``C = 1``, same ``alpha`` is applied to all channels. - - When ``non_linearity`` is ``'PARAMETRICSOFTPLUS'``, param is a - list of 2 numpy arrays ``[alpha, beta]``. The shape of ``alpha`` and + - When ``non_linearity`` is ``'PARAMETRICSOFTPLUS'``, param is a + list of 2 numpy arrays ``[alpha, beta]``. The shape of ``alpha`` and `beta` is ``(C, )``, where ``C`` is either the number of input channels or 1. When ``C = 1``, same ``alpha`` and ``beta`` are applied to all channels. @@ -1926,18 +1935,18 @@ def add_elementwise(self, name, input_names, output_name, mode, alpha=None): - ``'SEQUENCE_CONCAT'``: Concatenate input blobs along the sequence axis. - ``'ADD'``: Perform an element-wise summation over the input blobs. - ``'MULTIPLY'``: Perform an element-wise multiplication over the input blobs. - - ``'DOT'``: Compute the dot product of the two input blobs. + - ``'DOT'``: Compute the dot product of the two input blobs. In this mode, the length of ``input_names`` should be 2. - - ``'COS'``: Compute the cosine similarity of the two input blobs. + - ``'COS'``: Compute the cosine similarity of the two input blobs. In this mode, the length of ``input_names`` should be 2. - ``'MAX'``: Compute the element-wise maximum over the input blobs. - ```'MIN'```: Compute the element-wise minimum over the input blobs. - ``'AVE'``: Compute the element-wise average over the input blobs. alpha: float - * if ``mode == 'ADD'`` and there is only one ``input_name``, + * if ``mode == 'ADD'`` and there is only one ``input_name``, ``alpha`` is added to the input. - * if ``mode == 'MULTIPLY'`` and there is only one ``input_name``, + * if ``mode == 'MULTIPLY'`` and there is only one ``input_name``, ``alpha`` is multiplied to the input. See Also @@ -1988,7 +1997,7 @@ def add_upsample( ): """ Add an upsample layer to the model. - Refer to the ``UpsampleLayerParams`` message in the specification + Refer to the ``UpsampleLayerParams`` message in the specification (NeuralNetwork.proto) for more details. Parameters @@ -1996,10 +2005,10 @@ def add_upsample( name: str The name of this layer. scaling_factor_h: int or float - Scaling factor on the vertical direction. Float values only + Scaling factor on the vertical direction. Float values only supported with ``BILINEAR`` and ``ALIGN_CORNERS_*``. scaling_factor_w: int or float - Scaling factor on the horizontal direction. Float values only + Scaling factor on the horizontal direction. Float values only supported with ``BILINEAR`` and ``ALIGN_CORNERS_*``. input_name: str The input blob name of this layer. @@ -2007,26 +2016,26 @@ def add_upsample( The output blob name of this layer. mode: str Overall interpolation mode. The following values are supported: - + * ``'NN'``: nearest neighbour * ``'BILINEAR'``: bilinear interpolation - + linear_upsample_mode: str - Specifies the behavior for linear upsampling. Only valid when + Specifies the behavior for linear upsampling. Only valid when Interpolation Mode is ``BILINEAR``. 
- - If input grid is ``[0, Xin-1]`` (corresponding to an input size of + + If input grid is ``[0, Xin-1]`` (corresponding to an input size of ``Xin``), and if the output size is ``Xout``, then the grid points are sampled in the following manner: - + 'DEFAULT': - ``spacing = (Xin-Xin/Xout) / (Xout-1)`` - ``grid_point[i] = min(Xin-1, max(0, i * spacing)), for i = 0,1,2,..,Xout-1`` - + 'ALIGN_CORNERS_TRUE': - ``spacing = (Xin-1) / (Xout-1)`` - ``grid_point[i] = min(Xin-1, max(0, i * spacing)), for i = 0,1,2,..,Xout-1`` - + 'ALIGN_CORNERS_FALSE': - ``spacing = Xin / Xout`` - ``grid_point[i] = min(Xin-1, max(0, i * spacing + 0.5 * spacing - 0.5)), for i = 0,1,2,..,Xout-1`` @@ -2102,7 +2111,7 @@ def add_scale( ): """ Add a scale layer to the model. - Refer to the ``ScaleLayerParams`` message in the specification + Refer to the ``ScaleLayerParams`` message in the specification (NeuralNetwork.proto) for more details. Parameters @@ -2121,10 +2130,10 @@ def add_scale( The output blob name of this layer. shape_scale: list of int or tuple of int - List of ints that specifies the shape of the scale parameter. + List of ints that specifies the shape of the scale parameter. Can be ``[1]``, ``[C]``, ``[1,H,W]``, or ``[C,H,W]``. shape_bias: list of int - List of ints that specifies the shape of the bias parameter + List of ints that specifies the shape of the bias parameter (if present). Can be ``[1]``, ``[C]``, ``[1,H,W]``, or ``[C,H,W]``. See Also @@ -2171,7 +2180,7 @@ def add_scale( def add_bias(self, name, b, input_name, output_name, shape_bias=None): """ Add a bias layer to the model. - Refer to the ``BiasLayerParams`` message in the specification + Refer to the ``BiasLayerParams`` message in the specification (NeuralNetwork.proto) for more details. Parameters @@ -2185,7 +2194,7 @@ def add_bias(self, name, b, input_name, output_name, shape_bias=None): output_name: str The output blob name of this layer. shape_bias: list of int - List of ints that specifies the shape of the bias parameter + List of ints that specifies the shape of the bias parameter (if present). Can be ``[1]``, ``[C]``, ``[1,H,W]``, or ``[C,H,W]``. See Also @@ -2219,7 +2228,7 @@ def add_bias(self, name, b, input_name, output_name, shape_bias=None): def add_sequence_repeat(self, name, nrep, input_name, output_name): """ Add a sequence repeat layer to the model. - Refer to the ``SequenceRepeatLayerParams`` message in the specification + Refer to the ``SequenceRepeatLayerParams`` message in the specification (NeuralNetwork.proto) for more details. Parameters @@ -2270,12 +2279,12 @@ def add_convolution( ): """ Add a convolution layer to the network. - Refer to the ``ConvolutionLayerParams`` message in the specification + Refer to the ``ConvolutionLayerParams`` message in the specification (NeuralNetwork.proto) for more details. Parameters ---------- - + name: str The name of this layer. @@ -2299,27 +2308,27 @@ def add_convolution( border_mode: str Option for the padding type and output blob shape. Can be either 'valid' or 'same'. - + groups: int - Number of kernel groups. Input is divided into groups along the channel axis. + Number of kernel groups. Input is divided into groups along the channel axis. Each kernel group share the same weights. - + W: numpy.array or bytes() or None Weight of the convolution kernels. 
- * If ``is_deconv`` is False, ``W`` should have + * If ``is_deconv`` is False, ``W`` should have shape ``(height, width, kernel_channels, output_channels)``, where: ``kernel_channel = input_channels / groups`` - * If ``is_deconv`` is True, ``W`` should have + * If ``is_deconv`` is True, ``W`` should have shape ``(height, width, kernel_channels, output_channels / groups)``, where: ``kernel_channel = input_channels`` - If ``W`` is of type ``bytes()`` (quantized), other quantization + If ``W`` is of type ``bytes()`` (quantized), other quantization related arguments must be provided as well (see below). For Core ML specification version >=4, ``W`` can be ``None``. In this case, - the convolution layer takes 2 inputs, where the 1st input represents + the convolution layer takes 2 inputs, where the 1st input represents the input feature map, and the 2nd input represents the weight blob. b: numpy.array @@ -2332,16 +2341,16 @@ def add_convolution( - If False, bias is ignored. is_deconv: boolean - Whether the convolution layer is performing a convolution or a + Whether the convolution layer is performing a convolution or a transposed convolution (deconvolution). - If True, the convolution layer is performing transposed convolution. - If False, the convolution layer is performing regular convolution. output_shape: tuple or None - Either ``None`` or a 2-tuple, specifying the output - shape ``(output_height, output_width)``. - + Either ``None`` or a 2-tuple, specifying the output + shape ``(output_height, output_width)``. + - Used only when ``is_deconv == True``. - When ``is_deconv == False``, this parameter is ignored. - If it is ``None``, the output shape is calculated automatically using the ``border_mode``. @@ -2357,39 +2366,39 @@ def add_convolution( Defaults to ``[1, 1]``. padding_top, padding_bottom, padding_left, padding_right: int - Values of height (top, bottom) and width (left, right) padding + Values of height (top, bottom) and width (left, right) padding to be used if ``border_more`` is ``"valid"``. same_padding_asymmetry_mode: str Type of asymmetric padding to be used when ``border_mode`` is ``'same'``. Can be either ``'BOTTOM_RIGHT_HEAVY'`` or ``'TOP_LEFT_HEAVY'``. - Quantization - Quantization arguments expected in ``kwargs``, when ``W`` is of type ``bytes()``. - - quantization_type: str - When weights are quantized (that is, ``W`` is of type ``bytes()``), - this should be either ``"linear"`` or ``"lut"``. + Quantization + Quantization arguments expected in ``kwargs``, when ``W`` is of type ``bytes()``. + + quantization_type: str + When weights are quantized (that is, ``W`` is of type ``bytes()``), + this should be either ``"linear"`` or ``"lut"``. - nbits: int - Should be between 1 and 8 (inclusive). Number of bits per weight - value. Only applicable when weights are quantized. + nbits: int + Should be between 1 and 8 (inclusive). Number of bits per weight + value. Only applicable when weights are quantized. - quant_scale: numpy.array(dtype=numpy.float32) - scale vector to be used with linear quantization. Must be of - length either 1 or ``output_channels``. + quant_scale: numpy.array(dtype=numpy.float32) + scale vector to be used with linear quantization. Must be of + length either 1 or ``output_channels``. - quant_bias: numpy.array(dtype=numpy.float32) - bias vector to be used with linear quantization. Must be of - length either 1 or ``output_channels``. + quant_bias: numpy.array(dtype=numpy.float32) + bias vector to be used with linear quantization. 
Must be of + length either 1 or ``output_channels``. - quant_lut: numpy.array(dtype=numpy.float32) - the LUT (look up table) to be used with LUT quantization. + quant_lut: numpy.array(dtype=numpy.float32) + the LUT (look up table) to be used with LUT quantization. Must be of length 2^n bits. Depthwise convolution - Depthwise convolution is a special case of convolution, in which: - + Depthwise convolution is a special case of convolution, in which: + * ``kernel_channels = 1 (== input_channels / groups)`` * ``output_channels = channel_multiplier * input_channels`` * ``groups = input_channels`` @@ -2555,7 +2564,7 @@ def add_convolution3d( ): """ Add a 3 dimensional convolution layer to the network. - Refer to the ``Convolution3DLayerParams`` message in the specification + Refer to the ``Convolution3DLayerParams`` message in the specification (NeuralNetwork.proto) for more details. Parameters @@ -2583,17 +2592,17 @@ def add_convolution3d( Weight of the convolution kernels. ``W`` should have shape: - If ``deconv`` is False: - + ``(output_channels, kernel_channels, depth, height, width)``, where: - + ``kernel_channels = input_channels / groups`` - - - If ``deconv`` is True: - + + - If ``deconv`` is True: + ``(output_channels / groups, kernel_channels, depth, height, width)``, where: - + ``kernel_channels = input_channels`` - + b: numpy.array Biases of the convolution kernels. ``b`` should have shape ``(outputChannels, )``. @@ -2623,7 +2632,7 @@ def add_convolution3d( Tuple of length 3 if Convolution Transpose. padding_mode: str - Option for the padding type and output blob shape. + Option for the padding type and output blob shape. Can be ``'custom'``, ``'valid'``, or ``'same'``. Defaults to ``'valid'``. Case-insensitive. @@ -2638,8 +2647,8 @@ def add_convolution3d( The output blob name of this layer. Depthwise convolution - Depthwise convolution is a special case of convolution, in which: - + Depthwise convolution is a special case of convolution, in which: + * ``kernel_channels = 1`` (``== input_channels / groups``) * ``output_channels = channel_multiplier * input_channels`` * ``groups = input_channels`` @@ -2752,7 +2761,7 @@ def add_pooling( ): """ Add a pooling layer to the model that performs spatial pooling. - Refer to the ``PoolingLayerParams`` message in the specification + Refer to the ``PoolingLayerParams`` message in the specification (NeuralNetwork.proto) for more details. Parameters @@ -2777,7 +2786,7 @@ def add_pooling( Type of pooling performed. Can either be ``'MAX'``, ``'AVERAGE'``, or ``'L2'``. padding_type: str - Option for the type of padding and output blob shape. Can be either + Option for the type of padding and output blob shape. Can be either ``'VALID'``, ``'SAME'``, or ``'INCLUDE_LAST_PIXEL'``. input_name: str @@ -2789,20 +2798,20 @@ def add_pooling( exclude_pad_area: boolean Whether to exclude padded area in the ``'AVERAGE'`` pooling operation, default: true. This flag is only used with average pooling. - + - If True, the value of the padded area will be excluded. - If False, the padded area will be included. is_global: boolean Whether the pooling operation is global. Defaults to False. - - - If True, the pooling operation is global. The pooling region + + - If True, the pooling operation is global. The pooling region is of the same size of the input blob. Parameters ``height``, ``width``, ``stride_height``, and ``stride_width`` will be ignored. - If False, the pooling operation is not global. 
padding_top, padding_bottom, padding_left, padding_right: int - Values of height (top, bottom) and width (left, right) padding + Values of height (top, bottom) and width (left, right) padding to be used if padding type is ``"VALID"`` or ``"INCLUDE_LAST_PIXEL"``. same_padding_asymmetry_mode: str. @@ -2894,7 +2903,7 @@ def add_pooling3d( ): """ Add a pooling layer to the model that performs spatial pooling across three dimensions. - Refer to the ``Pooling3DLayerParams`` message in the specification + Refer to the ``Pooling3DLayerParams`` message in the specification (NeuralNetwork.proto) for more details. Parameters @@ -2988,8 +2997,8 @@ def add_global_pooling3d(self, name, input_name, output_name, pooling_type): Add a layer to pool three spatial dimensions down to one value. This behaves like a special case of Pooling3DLayerParams in which the Kernel is the size of the input and there is no padding. - - Refer to the ``GlobalPooling3DLayerParams`` message in the specification + + Refer to the ``GlobalPooling3DLayerParams`` message in the specification (NeuralNetwork.proto) for more details. Parameters @@ -3038,8 +3047,8 @@ def add_padding( Add a padding layer to the model that performs padding along spatial dimensions. - - Refer to the ``PaddingLayerParams`` message in the specification + + Refer to the ``PaddingLayerParams`` message in the specification (NeuralNetwork.proto) for more details. Parameters @@ -3105,7 +3114,7 @@ def add_crop( - When it has 2 input blobs, it crops the first input blob based on the dimension of the second blob with an offset. - Refer to the ``CropLayerParams`` message in the specification + Refer to the ``CropLayerParams`` message in the specification (NeuralNetwork.proto) for more details. Parameters @@ -3167,7 +3176,7 @@ def add_simple_rnn( ): """ Add a simple recurrent layer to the model. - Refer to the ``SimpleRecurrentLayerParams`` message in the specification + Refer to the ``SimpleRecurrentLayerParams`` message in the specification (NeuralNetwork.proto) for more details. Parameters @@ -3175,13 +3184,13 @@ def add_simple_rnn( name: str The name of this layer. W_h: numpy.array - Weights of the recurrent layer's hidden state. + Weights of the recurrent layer's hidden state. Must be of shape ``(hidden_size, hidden_size)``. W_x: numpy.array - Weights of the recurrent layer's input. + Weights of the recurrent layer's input. Must be of shape ``(hidden_size, input_size)``. b: numpy.array or None - Bias of the recurrent layer's output. If ``None``, bias is ignored. + Bias of the recurrent layer's output. If ``None``, bias is ignored. Otherwise it must be of shape ``(hidden_size, )``. hidden_size: int Number of hidden units. This is equal to the number of channels of output shape. @@ -3250,7 +3259,7 @@ def add_gru( ): """ Add a Gated-Recurrent Unit (GRU) layer to the model. - Refer to the ``GRULayerParams`` message in the specification + Refer to the ``GRULayerParams`` message in the specification (NeuralNetwork.proto) for more details. Parameters @@ -3259,17 +3268,17 @@ def add_gru( The name of this layer. W_h: [numpy.array] List of recursion weight matrices. The ordering is ``[R_z, R_r, R_o]``, - where ``R_z``, ``R_r`` and ``R_o`` are weight matrices at update gate, + where ``R_z``, ``R_r`` and ``R_o`` are weight matrices at update gate, reset gate and output gate. The shapes of these matrices are ``(hidden_size, hidden_size)``. W_x: [numpy.array] List of input weight matrices. 
The ordering is ``[W_z, W_r, W_o]``, - where ``W_z``, ``W_r``, and ``W_o`` are weight matrices at update gate, + where ``W_z``, ``W_r``, and ``W_o`` are weight matrices at update gate, reset gate and output gate. The shapes of these matrices are ``(hidden_size, input_size)``. b: [numpy.array] or None List of biases of the GRU layer. The ordering is ``[b_z, b_r, b_o]``, - where ``b_z``, ``b_r``, and ``b_o`` are biases at update gate, + where ``b_z``, ``b_r``, and ``b_o`` are biases at update gate, reset gate and output gate. If ``None``, biases are ignored. Otherwise the shapes of the biases are ``(hidden_size, )``. hidden_size: int @@ -3282,7 +3291,7 @@ def add_gru( Defaults to ``'TANH'``. See add_activation for more detailed description. inner_activation: str - Inner activation function used at update and reset gates. + Inner activation function used at update and reset gates. Can be one of the following options: [``'RELU'``, ``'TANH'``, ``'SIGMOID'``, ``'SCALED_TANH'``, ``'SIGMOID_HARD'``, ``'LINEAR'``]. Defaults to ``'SIGMOID_HARD'``. @@ -3364,7 +3373,7 @@ def add_unilstm( ): """ Add a Uni-directional LSTM layer to the model. - Refer to the ``UniDirectionalLSTMLayerParams`` message in the specification + Refer to the ``UniDirectionalLSTMLayerParams`` message in the specification (NeuralNetwork.proto) for more details. Parameters @@ -3516,39 +3525,39 @@ def add_bidirlstm( name: str The name of this layer. W_h: [numpy.array] - List of recursion weight matrices for the forward layer. + List of recursion weight matrices for the forward layer. The ordering is ``[R_i, R_f, R_o, R_z]``, - where ``R_i``, ``R_f``, ``R_o``, and ``R_z`` are weight matrices at + where ``R_i``, ``R_f``, ``R_o``, and ``R_z`` are weight matrices at input gate, forget gate, output gate and cell gate. The shapes of these matrices are ``(hidden_size, hidden_size)``. W_x: [numpy.array] - List of input weight matrices for the forward layer. The ordering + List of input weight matrices for the forward layer. The ordering is ``[W_i, W_f, W_o, W_z]``, - where ``W_i``, ``W_f``, ``W_o``, and ``W_z`` are weight matrices at + where ``W_i``, ``W_f``, ``W_o``, and ``W_z`` are weight matrices at input gate, forget gate, output gate and cell gate. The shapes of these matrices are ``(hidden_size, input_size)``. b: [numpy.array] - List of biases for the forward layer. The ordering is + List of biases for the forward layer. The ordering is ``[b_i, b_f, b_o, b_z]``, - where ``b_i``, ``b_f``, ``b_o``, and ``b_z`` are biases at input + where ``b_i``, ``b_f``, ``b_o``, and ``b_z`` are biases at input gate, forget gate, output gate and cell gate. - If ``None``, biases are ignored. Otherwise the shapes of the biases + If ``None``, biases are ignored. Otherwise the shapes of the biases are ``(hidden_size, )``. W_h_back: [numpy.array] - List of recursion weight matrices for the backward layer. The + List of recursion weight matrices for the backward layer. The ordering is ``[R_i, R_f, R_o, R_z]``, - where ``R_i``, ``R_f``, ``R_o``, and ``R_z`` are weight matrices + where ``R_i``, ``R_f``, ``R_o``, and ``R_z`` are weight matrices at input gate, forget gate, output gate and cell gate. The shapes of these matrices are ``(hidden_size, hidden_size)``. W_x_back: [numpy.array] - List of input weight matrices for the backward layer. The ordering + List of input weight matrices for the backward layer. 
The ordering is `[W_i, W_f, W_o, W_z]``, - where ``W_i``, ``W_f``, ``W_o``, and ``W_z`` are weight matrices + where ``W_i``, ``W_f``, ``W_o``, and ``W_z`` are weight matrices at input gate, forget gate, output gate and cell gate. The shapes of these matrices are ``(hidden_size, input_size)``. b_back: [numpy.array] List of biases for the backward layer. The ordering is ``[b_i, b_f, b_o, b_z]``, - where ``b_i``, ``b_f``, ``b_o``, and ``b_z`` are biases at input + where ``b_i``, ``b_f``, ``b_o``, and ``b_z`` are biases at input gate, forget gate, output gate and cell gate. The shapes of the biases ``(hidden_size)``. hidden_size: int @@ -3556,18 +3565,18 @@ def add_bidirlstm( input_size: int Number of the number of channels of input shape. input_names: list of str - The input blob names of this layer, in the order of + The input blob names of this layer, in the order of ``[x, h_input, c_input, h_reverse_input, c_reverse_input]``. output_names: list of str - The output blob names of this layer, in the order of + The output blob names of this layer, in the order of ``[y, h_output, c_output, h_reverse_output, c_reverse_output]``. inner_activation: str - Inner activation function used at input and forget gate. Can be one + Inner activation function used at input and forget gate. Can be one of the following options: [``'RELU'``, ``'TANH'``, ``'SIGMOID'``, ``'SCALED_TANH'``, ``'SIGMOID_HARD'``, ``'LINEAR'``]. Defaults to ``'SIGMOID'``. cell_state_update_activation: str - Cell state update activation function used at the cell state update gate. + Cell state update activation function used at the cell state update gate. Can be one of the following options: [``'RELU'``, ``'TANH'``, ``'SIGMOID'``, ``'SCALED_TANH'``, ``'SIGMOID_HARD'``, ``'LINEAR'``]. Defaults to ``'TANH'``. @@ -3576,15 +3585,15 @@ def add_bidirlstm( [``'RELU'``, ``'TANH'``, ``'SIGMOID'``, ``'SCALED_TANH'``, ``'SIGMOID_HARD'``, ``'LINEAR'``]. Defaults to ``'TANH'``. peep: [numpy.array] or None - List of peephole vectors for the forward layer. The ordering + List of peephole vectors for the forward layer. The ordering is ``[p_i, p_f, p_o]``, - where ``p_i``, ``p_f``, and ``p_o`` are peephole vectors at input + where ``p_i``, ``p_f``, and ``p_o`` are peephole vectors at input gate, forget gate, and output gate. The shapes of the peephole vectors are ``(hidden_size,)``. Defaults to ``None``. peep_back: [numpy.array] or None - List of peephole vectors for the backward layer. The ordering + List of peephole vectors for the backward layer. The ordering is ``[p_i, p_f, p_o]``, - where ``p_i``, ``p_f``, and ``p_o`` are peephole vectors at input + where ``p_i``, ``p_f``, and ``p_o`` are peephole vectors at input gate, forget gate, and output gate. The shapes of the peephole vectors are ``(hidden_size,)``. Defaults to ``None``. output_all: boolean @@ -3697,9 +3706,9 @@ def add_bidirlstm( def add_flatten(self, name, mode, input_name, output_name): """ - Add a flatten layer. Only flattens the channel, height and width axis. + Add a flatten layer. Only flattens the channel, height and width axis. Leaves the sequence axis as is. - Refer to the ``FlattenLayerParams`` message in the + Refer to the ``FlattenLayerParams`` message in the specification (NeuralNetwork.proto) for more details. 
Parameters @@ -3931,7 +3940,7 @@ def add_batchnorm( ``y = gamma * (x - mean) / sqrt(variance + epsilon) + beta`` - Refer to the ``BatchnormLayerParams`` message in the specification + Refer to the ``BatchnormLayerParams`` message in the specification (NeuralNetwork.proto) for more details. Parameters @@ -3956,7 +3965,7 @@ def add_batchnorm( Set to ``True`` if mean and variance is to be computed from the input data. instance_normalization: bool Set compute_mean_var and this to ``True`` to perform - instance normalization. That is, mean and variance are computed + instance normalization. That is, mean and variance are computed from the single input instance. epsilon: float Value of epsilon. Defaults to ``1e-5`` if not specified. @@ -4589,7 +4598,7 @@ def add_crop_resize( All the N~ RoIs are extracted from all the batches of the input. * If RoI shape = ``[N, 1, 5, 1, 1]``, the first element of the channel axis specifies the input batch id from which to extract the RoI and - must be in the interval ``[0, Batch - 1]``. That is, ``n`` -th RoI is + must be in the interval ``[0, Batch - 1]``. That is, ``n`` -th RoI is extracted from the ``RoI[n,0,0,0]`` -th input batch id. The last four elements of the channel axis specify the bounding box coordinates. @@ -4604,22 +4613,22 @@ def add_crop_resize( Output width dimension. mode: str - * The following values are supported: - ``'STRICT_ALIGN_ENDPOINTS_MODE'``, ``'ALIGN_ENDPOINTS_MODE'``, + * The following values are supported: + ``'STRICT_ALIGN_ENDPOINTS_MODE'``, ``'ALIGN_ENDPOINTS_MODE'``, ``'UPSAMPLE_MODE'``, ``'ROI_ALIGN_MODE'``. * This parameter determines the sampling grid used for bilinear interpolation. normalized_roi: bool * If true the bounding box coordinates must be in the interval ``[0, 1]``. - They are scaled by ``(input_height - 1)``, ``(input_width - 1)``; + They are scaled by ``(input_height - 1)``, ``(input_width - 1)``; that is, based on the input spatial dimensions. * If false the bounding box coordinates must be in the interval - ``[0, input_height - 1]`` and ``[0, input_width - 1]``, + ``[0, input_height - 1]`` and ``[0, input_width - 1]``, respectively for height and width dimensions. box_indices_mode: str - * The following values are supported: - ``'CORNERS_HEIGHT_FIRST'``, ``'CORNERS_WIDTH_FIRST'``, + * The following values are supported: + ``'CORNERS_HEIGHT_FIRST'``, ``'CORNERS_WIDTH_FIRST'``, ``'CENTER_SIZE_HEIGHT_FIRST'``, ``'CENTER_SIZE_WIDTH_FIRST'``. * Representation used to interpret the bounding box coordinates (RoI) input. * ``'CORNERS_HEIGHT_FIRST'``: ``[h_start, w_start, h_end, w_end]`` @@ -7045,7 +7054,7 @@ def add_batched_mat_mul( Must be equal to the last dimension of the output, default: 0. W: float32 numpy.array or bytes(), optional - Weight matrix of shape ``(weight_matrix_rows, weight_matrix_columns)``. + Weight matrix of shape ``(weight_matrix_rows, weight_matrix_columns)``. If ``W`` is of type ``bytes()`` (quantized to 1-8 bits), other quantization-related arguments must be provided as well (see below). @@ -7053,36 +7062,36 @@ def add_batched_mat_mul( Bias vector of shape (weight_matrix_columns,). Quantization - Quantization arguments, used when ``W`` is of type ``bytes()``: + Quantization arguments, used when ``W`` is of type ``bytes()``: - is_quantized_weight: bool, optional - Set it to true when ``W`` is of type ``bytes()``, representing - quantized weights, default: false. 
+ is_quantized_weight: bool, optional + Set it to true when ``W`` is of type ``bytes()``, representing + quantized weights, default: false. - quantization_type: str, optional - When weights are quantized (that is, ``W`` is of type ``bytes()``), - this should be either ``"linear"`` or ``"lut"``, default: ``"linear"``. + quantization_type: str, optional + When weights are quantized (that is, ``W`` is of type ``bytes()``), + this should be either ``"linear"`` or ``"lut"``, default: ``"linear"``. - nbits: int, optional - Should be between 1 and 8 (inclusive). Number of bits per weight value, default: 8. + nbits: int, optional + Should be between 1 and 8 (inclusive). Number of bits per weight value, default: 8. - quant_scale: numpy.array(dtype=numpy.float32), optional - Scale vector to be used with linear quantization. - Must be of length either 1 or ``weight_matrix_columns``, default: ``None``. + quant_scale: numpy.array(dtype=numpy.float32), optional + Scale vector to be used with linear quantization. + Must be of length either 1 or ``weight_matrix_columns``, default: ``None``. - quant_bias: numpy.array(dtype=numpy.float32), optional - Bias vector to be used with linear quantization. - Must be of length either 1 or ``weight_matrix_columns``, default: ``None``. + quant_bias: numpy.array(dtype=numpy.float32), optional + Bias vector to be used with linear quantization. + Must be of length either 1 or ``weight_matrix_columns``, default: ``None``. - quant_lut: numpy.array(dtype=numpy.float32), optional - The LUT (look up table) to be used with LUT quantization. - Must be of length 2^n bits, default: ``None``. + quant_lut: numpy.array(dtype=numpy.float32), optional + The LUT (look up table) to be used with LUT quantization. + Must be of length 2^n bits, default: ``None``. - int_8_dynamic_quantize: bool - Whether to quantize and dequantize before and after - batched matmul, respectively. - Expects byte weights, representing int8 values, if True. - See NeuralNetwork.proto for other validation conditions. + int_8_dynamic_quantize: bool + Whether to quantize and dequantize before and after + batched matmul, respectively. + Expects byte weights, representing int8 values, if True. + See NeuralNetwork.proto for other validation conditions. See Also -------- diff --git a/coremltools/models/neural_network/quantization_utils.py b/coremltools/models/neural_network/quantization_utils.py index ab94a5ec8..ddcbee825 100644 --- a/coremltools/models/neural_network/quantization_utils.py +++ b/coremltools/models/neural_network/quantization_utils.py @@ -12,20 +12,29 @@ import numpy as _np -from coremltools import ComputeUnit as _ComputeUnit -from coremltools.models import (_QUANTIZATION_MODE_CUSTOM_LOOKUP_TABLE, - _QUANTIZATION_MODE_DEQUANTIZE, - _QUANTIZATION_MODE_LINEAR_QUANTIZATION, - _QUANTIZATION_MODE_LINEAR_SYMMETRIC, - _QUANTIZATION_MODE_LOOKUP_TABLE_KMEANS, - _QUANTIZATION_MODE_LOOKUP_TABLE_LINEAR, - _SUPPORTED_QUANTIZATION_MODES) -from coremltools.models import MLModel as _MLModel - -from ... 
import (_MINIMUM_FP16_SPEC_VERSION, - _MINIMUM_QUANTIZED_MODEL_SPEC_VERSION, - _SPECIFICATION_VERSION_IOS_14) -from ..._deps import _HAS_SKLEARN as _HAS_SKLEARN +from coremltools import ( + ComputeUnit as _ComputeUnit, + _logger +) +from coremltools._deps import ( + _HAS_KMEANS1D, + _kmeans1d +) +from coremltools.models import ( + _QUANTIZATION_MODE_CUSTOM_LOOKUP_TABLE, + _QUANTIZATION_MODE_DEQUANTIZE, + _QUANTIZATION_MODE_LINEAR_QUANTIZATION, + _QUANTIZATION_MODE_LINEAR_SYMMETRIC, + _QUANTIZATION_MODE_LOOKUP_TABLE_KMEANS, + _QUANTIZATION_MODE_LOOKUP_TABLE_LINEAR, + _SUPPORTED_QUANTIZATION_MODES, + MLModel as _MLModel +) +from ... import ( + _MINIMUM_FP16_SPEC_VERSION, + _MINIMUM_QUANTIZED_MODEL_SPEC_VERSION, + _SPECIFICATION_VERSION_IOS_14 +) from ..utils import _get_model, _macos_version, _wp_to_fp16wp from .optimization_utils import _optimize_nn @@ -355,42 +364,62 @@ def _get_linear_lookup_table_and_weight(nbits, wp): return lookup_table, qw -def _get_kmeans_lookup_table_and_weight( - nbits, w, init="k-means++", tol=1e-2, n_init=1, rand_seed=0 -): +def _get_kmeans_lookup_table_and_weight(nbits, w, force_kmeans1d=False): """ - Generate K-Means lookup table given a weight parameter field + Generate K-Means lookup table given weights nbits: Number of bits for quantization w: - Weight as numpy array + Weights as numpy array + + force_kmeans1d: + Use kmeans1d regardless of number of weights Returns ------- lut: numpy.array - Lookup table, numpy array of shape (1 << nbits, ); + Lookup table, numpy array of shape (1 << nbits, ) wq: numpy.array Quantized weight of type numpy.uint8 """ - if _HAS_SKLEARN: - from sklearn.cluster import KMeans - else: - raise ModuleNotFoundError( - "scikit-learn is required for k-means quantization." - " To install, run: \"pip install -U scikit-learn\"." - ) - units = _np.prod(w.shape) + num_weights = _np.prod(w.shape) lut_len = 1 << nbits - n_clusters = units if (units < lut_len) else lut_len wf = w.reshape(-1, 1) - kmeans = KMeans( - n_clusters=n_clusters, init=init, tol=tol, n_init=n_init, random_state=rand_seed - ).fit(wf) - wq = kmeans.labels_[:units] lut = _np.zeros(lut_len) - lut[:n_clusters] = kmeans.cluster_centers_.flatten() + + is_better_to_use_kmeans1d = (num_weights >= 10_000 and w.dtype == _np.float16) + + if (is_better_to_use_kmeans1d and _HAS_KMEANS1D) or force_kmeans1d: + # Cluster with kmeans1d + assert(_HAS_KMEANS1D) + values, indices, counts = _np.unique(wf, return_inverse=True, return_counts=True) + n_clusters = min(len(values), lut_len) + kmeans_results = _kmeans1d.cluster(values, n_clusters, weights=counts) + lut[:n_clusters] = kmeans_results.centroids + wq = _np.array(kmeans_results.clusters)[indices] + else: + # Cluster with scikit-learn + try: + from sklearn.cluster import KMeans + except: + raise ModuleNotFoundError( + "scikit-learn is required for k-means quantization." + " To install, run: \"pip install scikit-learn\"." + ) + + if is_better_to_use_kmeans1d: + _logger.warning("It would be better to use kmeans1d but that is not available." 
+ " Using scikit-learn for K-means.") + + n_clusters = min(num_weights, lut_len) + kmeans = KMeans( + n_clusters, init="k-means++", tol=1e-2, n_init=1, random_state=0 + ).fit(wf) + wq = kmeans.labels_[:num_weights] + lut[:n_clusters] = kmeans.cluster_centers_.flatten() + return lut, wq diff --git a/coremltools/optimize/__init__.py b/coremltools/optimize/__init__.py new file mode 100644 index 000000000..1f35b1199 --- /dev/null +++ b/coremltools/optimize/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +from . import coreml diff --git a/coremltools/optimize/coreml/__init__.py b/coremltools/optimize/coreml/__init__.py new file mode 100644 index 000000000..9d40e6246 --- /dev/null +++ b/coremltools/optimize/coreml/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +from ._config import ( + OpLinearQuantizerConfig, + OpMagnitudePrunerConfig, + OpPalettizerConfig, + OpThresholdPrunerConfig, + OptimizationConfig, +) + +from ._post_training_quantization import ( + decompress_weights, + linear_quantize_weights, + palettize_weights, + prune_weights, +) diff --git a/coremltools/optimize/coreml/_config.py b/coremltools/optimize/coreml/_config.py new file mode 100644 index 000000000..8b9fec3fa --- /dev/null +++ b/coremltools/optimize/coreml/_config.py @@ -0,0 +1,971 @@ +# Copyright (c) 2020, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import sys +from abc import ABC, abstractmethod +from typing import IO, Any, Callable, Dict, Optional, Tuple, Union + +import cattrs +import numpy as np +import yaml +from attrs import define, field, validators + +from coremltools.converters.mil.mil import Operation, types +from coremltools.converters.mil.mil.types.type_mapping import is_builtin, numpy_type_to_builtin_type + + +class OpCompressorConfig(ABC): + """ + An abstract class for the compressor configuration + """ + + def _validate_op_type(self, op_type): + """ + A utility function checking if an op type is valid for the configuration + """ + pass + + @classmethod + @abstractmethod + def _from_dict(cls, config_dict: Dict[str, Any]) -> "OpCompressorConfig": + """ + An abstract method that construct an OpCompressorConfig from a dictionary. + It must be implemented in the child class. + """ + raise ValueError("_from_dict must be implemented in the subclasses of OpCompressorConfig.") + +def _check_weight_threshold(instance, attribute, value): + if value is not None and value < 0: + raise ValueError(f"\"weight_threshold\" must be a non-negative integer. Got {value}.") + +""" +Linear Quantization configuration +""" + +@define +class OpLinearQuantizerConfig(OpCompressorConfig): + """ + Parameters + ---------- + + mode: str + Mode for linear quantization: + + * ``"linear_symmetric"`` (default): Input data are quantized in the range + ``[-R, R]``, where :math:`R = max(abs(w_r))`. + * ``"linear"``: Input data are quantized in the range + :math:`[min(w_r), max(w_r)]`. + + dtype: np.generic or mil.type type + Determines the quantized data type (int8/uint8). 
+ + * The allowed values are: + * ``np.int8`` (the default) + * ``np.uint8`` + * ``coremltools.converters.mil.mil.types.int8`` + * ``coremltools.converters.mil.mil.types.uint8`` + + weight_threshold: int + The size threshold, above which weights are compressed. + That is, a weight tensor is compressed only if its total number of elements is greater than ``weight_threshold``. + + For example, if ``weight_threshold = 1024`` and a weight tensor is of shape ``[10, 20, 1, 1]``, hence ``200`` + elements, it will not be compressed. + + * If not provided, it defaults to ``2048``, so that only weights with more than ``2048`` elements are compressed. + """ + mode: str = field(default="linear_symmetric", validator=validators.instance_of(str)) + dtype: type = field(default=np.int8, validator=validators.instance_of(type)) + weight_threshold: Optional[int] = field(default=2048, validator=validators.optional([validators.instance_of(int), _check_weight_threshold])) + + _WEIGHT_AFFINE_QUANTIZATION_MODES = ("LINEAR_SYMMETRIC", "LINEAR") + _WEIGHT_AFFINE_DTYPES = (types.int8, types.uint8) + + @mode.validator + def check_mode(self, attr, mode): + if not mode.upper() in self._WEIGHT_AFFINE_QUANTIZATION_MODES: + raise ValueError(f"Only mode {self._WEIGHT_AFFINE_QUANTIZATION_MODES} supported for weight affine quantization. Got mode: \"{mode}\".") + + @dtype.validator + def check_dtype(self, attr, dtype): + msg = f"dtype={dtype} is unsupported for affine_quantize_weights." + if not is_builtin(dtype): + try: + dtype = numpy_type_to_builtin_type(dtype) + except TypeError: + raise ValueError(msg) + + if dtype not in self._WEIGHT_AFFINE_DTYPES: + raise ValueError(msg) + + def __attrs_post_init__(self): + self.mode = self.mode.upper() + if not is_builtin(self.dtype): + self.dtype = numpy_type_to_builtin_type(self.dtype) + + @classmethod + def _from_dict(cls, config_dict: Dict[str, Any]) -> "OpLinearQuantizerConfig": + def _structure_type(value, dtype): + if isinstance(value, type): + return value + else: + if not isinstance(value, str) or value not in ("int8", "uint8"): + raise ValueError( + f'"dtype" must be type of type or str ["int8", "uint8"]. Got {value}' + ) + return getattr(np, value) + + converter = cattrs.Converter(forbid_extra_keys=True) + converter.register_structure_hook(type, _structure_type) + return converter.structure(config_dict, cls) + +""" +Pruner configurations +""" +@define +class OpThresholdPrunerConfig(OpCompressorConfig): + """ + All weights with absolute value smaller than ``threshold`` are changed to ``0``, + and the tensor is stored in a sparse format. + + For example, given the following: + + * ``weight = [0.3, -0.2, -0.01, 0.05]`` + * ``threshold = 0.03`` + + The sparsified weight would be ``[0.3, -0.2, 0, 0.05]``. + + Parameters + ---------- + + threshold: float + All weight values whose absolute value is below this threshold are set to ``0``. + + * Default value is ``1e-3``. + + minimum_sparsity_percentile: float + The sparsity level must be above this value for the weight representation to be stored in the sparse format rather than the dense format. + + For example, if ``minimum_sparsity_percentile = 0.6`` and the sparsity level is ``0.54``; that is, ``54%`` of the + weight values are exactly ``0``, then the resulting weight tensor will be stored as a dense const op, + and not converted to the ``constexpr_sparse_to_dense`` op (which stores the weight values in a sparse format). + + * Must be a value between ``0`` and ``1``. + * Default value is ``0.5``.
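As a quick illustration of the two parameters above (a numpy sketch of the decision logic only, not the graph pass itself), thresholding and the sparsity check can be written as:

    import numpy as np

    weight = np.array([0.3, -0.2, -0.01, 0.05])
    threshold = 0.03
    minimum_sparsity_percentile = 0.5

    # Zero out entries whose magnitude falls at or below the threshold.
    sparsified = np.where(np.abs(weight) <= threshold, 0.0, weight)  # [0.3, -0.2, 0.0, 0.05]

    # Keep the sparse representation only if enough values became zero.
    sparsity = np.mean(sparsified == 0.0)                            # 0.25
    use_sparse_format = sparsity >= minimum_sparsity_percentile      # False for this tensor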
+ + weight_threshold: int + The size threshold, above which weights are pruned. + That is, a weight tensor is pruned only if its total number of elements are greater than ``weight_threshold``. + + For example, if ``weight_threshold = 1024`` and a weight tensor is of shape ``[10, 20, 1, 1]``, hence ``200`` + elements, it will not be pruned. + + * If not provided, it will be set to ``2048``, in which weights bigger than ``2048`` elements are compressed. + """ + threshold: float = field(default=1e-3, validator=validators.instance_of(float)) + minimum_sparsity_percentile: float = field(default=0.5, validator=validators.instance_of(float)) + weight_threshold: Optional[int] = field( + default=2048, + validator=validators.optional([validators.instance_of(int), _check_weight_threshold]) + ) + + @threshold.validator + def check_threshold(self, attr, threshold): + if threshold < 0: + raise ValueError( + f"Invalid value of \"threshold\": {threshold}. Needs to be in [0, inf)" + ) + + @minimum_sparsity_percentile.validator + def check_minimum_sparsity_percentile(self, attr, minimum_sparsity_percentile): + if minimum_sparsity_percentile < 0 or minimum_sparsity_percentile > 1: + raise ValueError( + f"Invalid value of \"minimum_sparsity_percentile\": {minimum_sparsity_percentile}. Needs to be in [0, 1]" + ) + + @classmethod + def _from_dict(cls, config_dict: Dict[str, Any]) -> "OpThresholdPrunerConfig": + converter = cattrs.Converter(forbid_extra_keys=True) + return converter.structure(config_dict, cls) + +@define +class OpMagnitudePrunerConfig(OpCompressorConfig): + """ + Prune the weight with a constant sparsity percentile, which can be specified by either ``target_sparsity`` or ``n_m_ratio``. + + If ``target_sparsity`` is set, where ``n = floor(size_of_weight_tensor * target_sparsity)``, the ``n`` lowest + absolute weight values are changed to ``0``. For example, given the following: + + * ``weight = [0.3, -0.2, -0.01, 0.05]`` + * ``target_sparsity = 0.75`` + + The sparsified weight would be ``[0.3, 0, 0, 0]``. + + If ``block_size`` is set, then weights are pruned in a block structured manner; that is, chunks of weight values, as big as the ``block_size``, will be set to ``0``. + Block sparsity can only be applied to ``linear`` and ``conv`` layers. + For example: + + .. code-block:: python + + # Given a 4 x 2 weight with the following value, and block_size = 2, dim = 0. + [ + [1, 3], + [-6, -7], + [0, 3], + [-9, 2], + ] + + # We first flatten the matrix along axis = 0. + [1, -6, 0, -9, 3, -7, 3, 2] + + # For block size 2, the L2 norm will be compute of first 2 elements, then the second and 3rd element and so on. + [6.08, 9.00, 7.62, 3.61] + + # Then the smallest values will be picked to prune. So if target_sparsity = 0.5, then the blocks that will be + # pruned will be with ones with L2 norm value of 6.08 and 3.61. And hence, the elements in the first and third + # block are pruned. Resulting in the following flatten pruned tensor: + [0, 0, 0, -9, 3, -7, 0, 0] + + # The final pruned tensor is: + [ + [0, 3], + [0, -7], + [0, 0], + [-9, 0], + ] + + The ``n_m_ratio`` triggers ``n:m`` pruning along the ``dim`` axis. In ``n:m`` pruning, + out of every ``m`` elements, ``n`` with lowest magnitude are set to ``0``. + For more information, see + `Learning N:M Fine-Grained Structured Sparse Neural Networks From Scratch `_. + + ``n:m`` pruning can be applied only to ``linear`` and ``conv`` layers. + + Example: + + .. 
code-block:: python + + # Given a 4 x 4 weight of + [ + [3, 4, 7, 6], + [1, 8, -3, -8], + [-2, -3, -4, 0], + [5, 4, -3, -2], + ] + + # For n_m_ratio = (1, 2) with axis = 1 (default), the resulting pruned weight is + [ + [0, 4, 7, 0], + [0, 8, 0, -8], + [0, -3, -4, 0], + [5, 0, -3, 0], + ] + + # For axis = 0, we get + [ + [3, 0, 7, 0], + [0, 8, 0, -8], + [0, 0, -4, 0], + [5, 4, 0, -2], + ] + + Parameters + ---------- + + target_sparsity: float + The percentage of sparsity for compression, which needs to be in the range ``[0, 1]``. When ``0``, no sparsification + occurs. For ``1``, all weights become ``0``. + + block_size: int + Block size for inducing block sparsity. + This is applied on the ``dim`` dimension of the parameter. + Having the zeros aligned in the parameter helps gain latency/memory performance on-device. + + * If set, must be greater than ``1`` to enable block sparsity. + * Block sparsity can be applied only to ``linear`` and ``conv`` layers. + * The channel will be padded with ``0`` if it is not divisble by ``block_size``. + + n_m_ratio: tuple[int] + A tuple of two integers which specify the ratio for ``n:m`` pruning. + + * ``n`` must be smaller or equal to ``m``. + * The channel would be padded with ``0`` if it is not divisble by ``m``. + + dim: int + Dimension where the block sparsity or ``n:m`` sparsity is applied. + + * Must be either ``0`` or ``1``. + * The default value for block sparsity is ``0`` (output channel). + * The default value for ``n:m`` sparsity is ``1`` (input channel). + + weight_threshold: int + The size threshold, above which weights are pruned. + That is, a weight tensor is pruned only if its total number of elements is greater than ``weight_threshold``. + + For example, if ``weight_threshold = 1024`` and a weight tensor is of shape ``[10, 20, 1, 1]``, hence ``200`` + elements, it will not be pruned. + + * If not provided, it will be set to ``2048``, in which weights bigger than ``2048`` elements are compressed. + """ + target_sparsity: Optional[float] = field(default=None, validator=validators.optional(validators.instance_of(float))) + block_size: Optional[int] = field(default=None, validator=validators.optional(validators.instance_of(int))) + n_m_ratio: Optional[Tuple[int, int]] = field(default=None, validator=validators.optional(validators.instance_of((list, tuple)))) + dim: Optional[int] = field(default=None, validator=validators.optional(validators.instance_of(int))) + weight_threshold: Optional[int] = field( + default=2048, + validator=validators.optional([validators.instance_of(int), _check_weight_threshold]) + ) + + _SUPPORTED_OPS_FOR_STRUCTURAL_PRUNING = { + "conv": ["weight"], + "linear": ["weight"], + } + + def _is_structural_pruning(self): + return self.n_m_ratio is not None or self.block_size is not None + + def _validate_op_type(self, op_type): + """ + Structural sparsity can only be applied to conv / linear weight. 
+ """ + if self._is_structural_pruning() and op_type not in self._SUPPORTED_OPS_FOR_STRUCTURAL_PRUNING: + raise ValueError(f"block sparsity or n:m pruning does not support op type {op_type}.") + + def _check_const_op_is_valid(self, op): + def _get_child_op_and_input(op): + assert op.op_type == "const" + res = [] + for child in op.outputs[0].child_ops: + child_op_type = child.op_type + child_op_input = "" + for k, v in child.inputs.items(): + if v is op.outputs[0]: + child_op_input = k + break + assert child_op_input != "" + res.append((child_op_type, child_op_input)) + return res + + if not self._is_structural_pruning(): + return True + + child_op_type_and_input = _get_child_op_and_input(op) + for op_type, input in child_op_type_and_input: + if op_type not in self._SUPPORTED_OPS_FOR_STRUCTURAL_PRUNING: + return False + if input not in self._SUPPORTED_OPS_FOR_STRUCTURAL_PRUNING[op_type]: + return False + + return True + + @target_sparsity.validator + def check_target_sparsity(self, attr, target_sparsity): + msg = "Either \"target_sparsity\" or \"n_m_ratio\" need to be set. They cannot be set at the same time." + if target_sparsity is not None and self.n_m_ratio is not None: + raise ValueError(msg) + if target_sparsity is None and self.n_m_ratio is None: + raise ValueError(msg) + + if target_sparsity is None: + return + if target_sparsity < 0 or target_sparsity > 1: + raise ValueError( + f"Invalid value of \"target_sparsity\": {target_sparsity}. Needs to be in [0, 1]." + ) + + @block_size.validator + def check_block_size(self, attr, block_size): + if block_size is not None and self.n_m_ratio is not None: + raise ValueError( + "\"block_size\" and \"n_m_ratio\" cannot be set at the same time." + ) + if block_size is None: + return + if block_size is not None and block_size <= 1: + raise ValueError(f"\"block_size\" must be an integer > 1. Got {block_size}.") + + @n_m_ratio.validator + def check_n_m_ratio(self, attr, n_m_ratio): + if n_m_ratio is None: + return + if len(n_m_ratio) != 2 or n_m_ratio[0] > n_m_ratio[1]: + raise ValueError(f"\"n_m_ratio\" must be a tuple of two integers (n, m). n <= m. Got {n_m_ratio}") + + @dim.validator + def check_dim(self, attr, dim): + if dim is None: + return + if self.block_size is None and self.n_m_ratio is None: + raise ValueError("\"dim\" can only be set along with \"block_size\" or \"n_m_ratio\".") + if dim not in [0, 1]: + raise ValueError(f"\"dim\" must be 1 or 0. Got {dim}.") + + def __attrs_post_init__(self): + if self.block_size is not None and self.dim is None: + self.dim = 0 + if self.n_m_ratio is not None and self.dim is None: + self.dim = 1 + + @classmethod + def _from_dict(cls, config_dict: Dict[str, Any]) -> "OpMagnitudePrunerConfig": + converter = cattrs.Converter(forbid_extra_keys=True) + return converter.structure(config_dict, cls) + +""" +Palettizer configuration +""" + +@define +class OpPalettizerConfig(OpCompressorConfig): + """ + Parameters + ---------- + + nbits: int + Number of bits per weight. Required for ``kmeans`` or ``uniform`` mode, but must + not be set for ``unique`` or ``custom`` mode. A LUT would have + 2\ :sup:`nbits` entries, where `nbits` can be ``{1, 2, 4, 6, 8}``. + + mode: str + Determine how the LUT is constructed by specifying one of the following: + + * ``"kmeans"`` (default): The LUT is generated by `k-means clustering`, a method of vector + quantization that groups similar data points together to discover underlying + patterns by using a fixed number (`k`) of clusters in a dataset. 
A cluster + refers to a collection of data points aggregated together because of certain + similarities. `nbits` is required. + + * ``"uniform"``: The LUT is generated by a linear histogram. + + - ``[v_min, v_min + scale, v_min + 2 * scale, ..., v_max]`` + - Where the weight is in the range ``[v_min, v_max]``, and + ``scale = (v_max - v_min) / (1 << nbits - 1)``. + - ``nbits`` is required. + + A `histogram` is a representation of the distribution of a continuous variable, + in which the entire range of values is divided into a series of intervals (or + `bins`) and the representation displays how many values fall into each bin. + Linear histograms have one bin at even intervals, such as one bin per integer. + + * ``"unique"``: The LUT is generated by unique values in the weights. The weights + are assumed to be on a discrete lattice but stored in a float data type. This + parameter identifies the weights and converts them into the palettized representation. + + Do not provide ``nbits`` for this mode. ``nbits`` is picked up automatically, + with the smallest possible value in ``{1, 2, 4, 6, 8}`` such that the + number of the unique values is ``<= (1 << nbits)``. If the weight has ``> 256`` + unique values, the compression is skipped. + + For example: + + * If the weights are ``{0.1, 0.2, 0.3, 0.4}`` and ``nbits=2``, the weights are + converted to ``{00b, 01b, 10b, 11b}``, and the generated LUT is + ``[0.1, 0.2, 0.3, 0.4]``. + * If the weights are ``{0.1, 0.2, 0.3, 0.4}`` and ``nbits=1``, nothing happens + because the weights are not a 1-bit lattice. + * If the weights are ``{0.1, 0.2, 0.3, 0.4, 0.5}`` and ``nbits=2``, nothing + happens because the weights are not a 2-bit lattice. + + * ``"custom"``: The LUT and palettization parameters are calculated using a custom + function. If this mode is selected then ``lut_function`` must be provided. + + Do not provide ``nbits`` for this mode. The user should customize ``nbits`` in the + ``lut_function`` implementation. + + lut_function: callable + A callable function which computes the weight palettization parameters. This must + be provided if the mode is set to ``"custom"``. + + weight: np.ndarray + A float precision numpy array. + + Returns: lut: list[float] + The lookup table. + + indices: list[int] + A list of indices for each element. + + The following is an example that extract the ``top_k`` elements as the LUT. Given + that ``weight = [0.1, 0.5, 0.3, 0.3, 0.5, 0.6, 0.7]``, the ``lut_function`` + produces ``lut = [0, 0.5, 0.6, 0.7], indices = [0, 1, 0, 0, 2, 3]``. + + .. sourcecode:: python + + def lut_function(weight): + # In this example, we assume elements in the weights >= 0 + weight = weight.flatten() + nbits = 4 + + # Get the LUT, from extracting top k maximum unique elements in the weight to be the LUT + # Note that k = 1 << nbits - 1, so we have the first element be 0 + unique_elements = np.unique(weight) + k = (1 << nbits) - 1 + top_k = np.partition(weight, -k)[-k:] + np.sort(top_k) + lut = [0.0] + top_k.tolist() + + # Compute the indices + mapping = {v: idx for idx, v in enumerate(lut)} + indices = [mapping[v] if v in mapping else 0 for v in weight] + + return lut, indices + + weight_threshold: int + The size threshold, above which weights are pruned. + That is, a weight tensor is pruned only if its total number of elements are greater than ``weight_threshold``. + + For example, if ``weight_threshold = 1024`` and a weight tensor is of shape ``[10, 20, 1, 1]``, hence ``200`` + elements, it will not be pruned. 
+ + * If not provided, it will be set to ``2048``, in which weights bigger than ``2048`` elements are compressed. + """ + mode: str = field(default="kmeans", validator=validators.instance_of(str)) + nbits: Optional[int] = field(default=None) + lut_function: Optional[Callable] = field(default=None) + weight_threshold: Optional[int] = field(default=2048, validator=validators.optional([validators.instance_of(int), _check_weight_threshold])) + + _WEIGHT_PALETTIZATION_MODES = ("KMEANS", "UNIFORM", "UNIQUE", "CUSTOM") + + @nbits.validator + def check_nbits(self, attr, nbits): + mode = self.mode.upper() + + if nbits is None and mode in ("KMEANS", "UNIFORM"): + raise ValueError(f"\"nbits\" must be provided for {self.mode} mode") + + if nbits is not None and mode in ("UNIQUE", "CUSTOM"): + raise ValueError(f"\"nbits\" must NOT be provided for {self.mode} mode") + + if nbits is not None and nbits not in [1, 2, 4, 6, 8]: + raise ValueError( + f"Invalid value of \"nbits\" ({nbits}) for palettization. Supported \"nbits\" are {{1, 2, 4, 6, 8}}" + ) + + @mode.validator + def check_mode(self, attr, mode): + if not mode.upper() in self._WEIGHT_PALETTIZATION_MODES: + raise ValueError(f"Only modes {self._WEIGHT_PALETTIZATION_MODES} are supported for weight palettization. Got \"mode\": \"{mode}\".") + + + @lut_function.validator + def check_lut_function(self, attr, lut_function): + mode = self.mode.upper() + + if lut_function is None and mode == "CUSTOM": + raise ValueError("\"lut_function\" can not be None, if \"mode\" is \"custom\".") + + if lut_function is not None and mode != "CUSTOM": + raise ValueError("\"lut_function\" must be None, if \"mode\" is not \"custom\".") + + if lut_function is not None and not callable(lut_function): + raise ValueError(f"A function object must be provided as \"lut_function\". Got a \"lut_function\" as type {type(self.lut_function)}") + + def __attrs_post_init__(self): + self.mode = self.mode.upper() + + @classmethod + def _from_dict(cls, config_dict: Dict[str, Any]) -> "OpPalettizerConfig": + if "lut_function" in config_dict: + raise ValueError( + "_from_dict method does not support lut_function. Please create the OpPalettizerConfig from scratch." + ) + converter = cattrs.Converter(forbid_extra_keys=True) + return converter.structure(config_dict, cls) + +@define +class OptimizationConfig: + """ + A configuration wrapper that enables fine-grained control when compressing a model, + Providing the following levels: `global`, `op type`, and `op name`. + + 1. ``global_config``: The default configuration applied to all ops / consts. + 2. ``op_type_configs``: Configurations applied to specific op type. It overrides ``global_config``. + 3. ``op_name_configs``: Confgurations applied to specific op instance. It overrides ``global_config`` and ``op_type_configs``. + + The following is an example that constructs an optimization config for weight palettization. + + .. 
code-block:: python + + from coremltools.optimize.coreml import OpPalettizerConfig, OptimizationConfig + + # The default global configuration is 8 bits palettization with kmeans + global_config = OpPalettizerConfig(mode="kmeans", nbits=8) + + # We use 2 bits palettization for convolution layers, and skip the compression for linear layers + op_type_configs = { + "conv": OpPalettizerConfig(mode="kmeans", nbits=2), + "linear": None, + } + + # We want a convolution layer named "conv_1" to have a 4 bits palettization with a different mode + op_name_configs = { + "conv_1": OpPalettizerConfig(mode="uniform", nbits=4), + } + + # Now we can put all configuration across three levels to construct an OptimizationConfig object + config = OptimizationConfig( + global_config=global_config, + op_type_configs=op_type_configs, + op_name_configs=op_name_configs, + ) + + + Parameters + ---------- + + global_config: OpCompressorConfig + Config to be applied globally to all supported ops. + + op_type_configs: dict[str, OpCompressorConfig] + Op type level configs applied to a specific op class. + + * The keys of the dictionary are the string of the op type, and the values are the corresponding :py:class:`OpCompressorConfig`. + * An op type will not be compressed if the value is set to ``None``. + + op_name_configs: dict[str, OpCompressorConfig] + Op instance level configs applied to a specific op or constant. + + * The keys of the dictionary are the name of an op instance, and the values are the corresponding :py:class:`OpCompressorConfig`. + * An op instance will not be compressed if the value is set to ``None``. + """ + global_config: Optional[OpCompressorConfig] = field(default=None) + op_type_configs: Optional[OpCompressorConfig] = field(default=None) + op_name_configs: Optional[OpCompressorConfig] = field(default=None) + + # The following two private attributes is aim for backward compatibility for ct.compression_utils implementation + # They need to be removed in the future once we deprecate ct.compression_utils + _is_deprecated: bool = field(default=False, validator=validators.instance_of(bool)) + _op_selector: Optional[Callable] = field(default=None) + + @staticmethod + def _check_op_config_type(config): + if config is None: + return + if not isinstance(config, OpCompressorConfig): + raise ValueError(f"config must be type of OpCompressorConfig. Got {type(config)}.") + + def set_global(self, op_config: OpCompressorConfig): + """ + Sets the global config that would be applied to all constant ops. + + .. code-block:: python + + from coremltools.optimize.coreml import OpPalettizerConfig, OptimizationConfig + + config = OptimizationConfig() + global_config = OpPalettizerConfig(mode="kmeans", nbits=8) + config.set_global(global_config) + + Parameters + ---------- + + op_config: OpCompressorConfig + Config to be applied globally to all supported ops. + """ + self._check_op_config_type(op_config) + self.global_config = op_config + + def set_op_type( + self, + op_type: str, + op_config: OpCompressorConfig, + ): + """ + Sets the compression config at the level of op type. + + .. code-block:: python + + from coremltools.optimize.coreml import OpPalettizerConfig, OptimizationConfig + + config = OptimizationConfig() + conv_config = OpPalettizerConfig(mode="kmeans", nbits=2) + config.set_op_type("conv", conv_config) + + Parameters + ---------- + + op_type: str + The type of an op. For instance, ``"conv", "linear"``. + + op_config: OpCompressorConfig + Op type level config applied to a specific op class ``op_type``. 
+ """ + if self._is_deprecated: + raise ValueError("set_op_type is not exposed through the coremltools.compression_utils API.") + self._check_op_config_type(op_config) + if op_config is not None: + op_config._validate_op_type(op_type) + self.op_type_configs[op_type] = op_config + + def set_op_name( + self, + op_name: str, + op_config: OpCompressorConfig, + ): + """ + Sets the compression config at the level of op instance by name. + + .. code-block:: python + + from coremltools.optimize.coreml import OpPalettizerConfig, OptimizationConfig + + config = OptimizationConfig() + op_config = OpPalettizerConfig(mode="kmeans", nbits=2) + config.set_op_name("conv_1", op_config) + + Parameters + ---------- + + op_name: str + The name of the op instance. + + op_config: OpCompressorConfig + Op instance level config applied to a specific op or constant with name ``op_name``. + """ + if self._is_deprecated: + raise ValueError("set_op_name is not exposed through the coremltools.compression_utils API.") + self._check_op_config_type(op_config) + self.op_name_configs[op_name] = op_config + + @_is_deprecated.validator + def check_is_deprecated(self, attr, _is_deprecated): + if not _is_deprecated and self._op_selector is not None: + raise ValueError("op_selector is supported only through the coremltools.compression_utils API.") + + @op_type_configs.validator + def check_op_type_configs(self, attr, op_type_configs): + if op_type_configs is None: + return + for v in op_type_configs.values(): + self._check_op_config_type(v) + for k, v in op_type_configs.items(): + if v is not None: + v._validate_op_type(k) + + @op_name_configs.validator + def check_op_name_configs(self, attr, op_name_configs): + if op_name_configs is None: + return + for v in op_name_configs.values(): + self._check_op_config_type(v) + + @global_config.validator + def check_global_configs(self, attr, global_config): + if global_config is None: + return + self._check_op_config_type(global_config) + + + def _get_op_config(self, op: Operation): + """ + This utility function retrieve the compression config for an non-const Operation instance. + The priority is by: op name -> op type -> global + """ + if not isinstance(op, Operation): + raise TypeError(f"op must be type of Operation. Got {type(op)}") + + if op.op_type == "const": + raise TypeError("op must not be of type const") + + if op.name in self.op_name_configs: + return self.op_name_configs[op.name] + elif op.op_type in self.op_type_configs: + return self.op_type_configs[op.op_type] + + return self.global_config + + def _get_const_op_config(self, op: Operation): + """ + This utility function retrieves the compression config by an const Operation instance. + If the const is fed into multiple operations, an error would be thrown if a conflict is detected. + """ + if not isinstance(op, Operation): + raise TypeError(f"op must be type of Operation. Got {type(op)}") + + if op.op_type != "const": + raise TypeError(f"op must be of type const. Got {op.op_type}") + + if op.name in self.op_name_configs: + return self.op_name_configs[op.name] + + if op.op_type in self.op_type_configs: + # We don't allow users to call set_op_type for "const" ops. + # The users are supposed to use set_global instead + raise ValueError("const ops cannot be set by the `set_op_type` function. 
Please use `set_global`") + + # If the constant's output is only connected to the block output, we don't do compression + # Due to this bug: rdar://108274019 ([Bug] constexpr ops cannot be directly fed to block output) + child_ops = op.outputs[0].child_ops + if len(child_ops) == 0: + return None + + op_configs = [self._get_op_config(op) for op in child_ops] + + for i, config in enumerate(op_configs): + if config != op_configs[0]: + raise ValueError( + f"compression config conflict detected between ops {child_ops[0]} and {child_ops[i]}. " + f"{child_ops[0]} has config {op_configs[0]} while {child_ops[i]} has {config}." + ) + return op_configs[0] + + def __attrs_post_init__(self): + if self.op_type_configs is None: + self.op_type_configs = {} + if self.op_name_configs is None: + self.op_name_configs = {} + + @classmethod + def from_dict(cls, config_dict: Dict[str, Any]) -> "OptimizationConfig": + """ + Construct an ``OptimizationConfig`` instance from a nested dictionary. + The dictionary should have the structure that only contains (if any) the following four ``str`` keys: + + * ``"config_type"``: Specify the configuration class type. + * ``"global_config"``: Parameters for ``global_config``. + * ``"op_type_configs"``: A nested dictionary for ``op_type_configs``. + * ``"op_name_config"``: A nested dictionary for ``op_name_configs``. + + The following is a nested dictionary that creates an optimization config for weight palettization: + + .. code-block:: python + + config_dict = { + "config_type": "OpPalettizerConfig", + "global_config": { + "mode": "kmeans", + "nbits": 4, + }, + "op_type_configs": { + "conv": { + "mode": "uniform", + "nbits": 1, + } + }, + "op_name_configs": { + "conv_1": { + "mode": "unique", + } + }, + } + + Note that you can override the ``config_type``. For instance, if you want to do threshold-based + pruning to the model in addition to the convolution layers in which magnitude pruning is applied, the following is an + example of the nested dictionary: + + .. code-block:: python + + config_dict = { + "config_type": "OpThresholdPrunerConfig", + "global_config": { + "threshold": 0.01, + }, + "op_type_configs": { + "conv": { + "config_type": "OpMagnitudePrunerConfig", + "n_m_ratio": [3, 4], + } + }, + } + + Parameters + ---------- + + config_dict: dict[str, Any] + A dictionary that represents the configuration structure. + """ + def _get_cls_instance(cls_type, cls_attrs): + if cls_attrs is None: + return None + converter = cattrs.Converter(forbid_extra_keys=True) + if "config_type" in cls_attrs: + cls_type = cls_attrs["config_type"] + del cls_attrs["config_type"] + class_type = getattr(sys.modules[__name__], cls_type) + return class_type._from_dict(cls_attrs) + + def _check_config_dict(config_dict): + valid_keys = ("config_type", "global_config", "op_name_configs", "op_type_configs") + for k in config_dict: + if k not in valid_keys: + raise ValueError( + f"Invalid key {k} to construct an OptimizationConfig object. Supported keys are {valid_keys}." 
+ ) + + _check_config_dict(config_dict) + + config_type = config_dict.get("config_type", None) + if config_type is None or not isinstance(config_type, str): + raise ValueError("config_type must be provided with type of string.") + + cls_attrs = {} + if config_dict.get("global_config", None) is not None: + cls_attrs["global_config"] = _get_cls_instance( + config_type, config_dict["global_config"] + ) + for key in ["op_type_configs", "op_name_configs"]: + if config_dict.get(key, None) is None: + continue + if not isinstance(config_dict[key], dict): + raise ValueError(f"{key} must be type of dict. Got {type(config_dict[key])}") + cls_attrs[key] = { + k: _get_cls_instance(config_type, v) for k, v in config_dict[key].items() + } + + return cls(**cls_attrs) + + @classmethod + def from_yaml(cls, yml: Union[IO, str]) -> "OptimizationConfig": + """ + Construct an ``OptimizationConfig`` instance from a YAML file. + The YAML file should have the structure that only contains (if any) the following four ``str`` keys: + + * ``"config_type"``: Specify the configuration class type. + * ``"global_config"``: Parameters for ``global_config``. + * ``"op_type_configs"``: A nested dictionary for ``op_type_configs``. + * ``"op_name_config"``: A nested dictionary for ``op_name_configs``. + + The following is a YAML file that creates an optimization config for weight palettization: + + :: + + config_type: OpPalettizerConfig + global_config: + mode: kmeans + nbits: 4 + op_type_configs: + conv: + mode: uniform + nbits: 1 + op_name_configs: + conv_1: + mode: unique + + Note that you can override the ``config_type``. For instance, if you want to do threshold-based + pruning to the model in addition to the convolution layers in which magnitude pruning is applied, the following is an + example of the YAML file: + + :: + + config_type: OpThresholdPrunerConfig + global_config: + threshold: 0.01 + op_type_configs: + conv: + config_type: OpMagnitudePrunerConfig + n_m_ratio: [3, 4] + + Parameters + ---------- + + yml: str, IO + A YAML file or the path to the file. + """ + if isinstance(yml, str): + with open(yml, "r") as file: + config_dict = yaml.safe_load(file) + else: + config_dict = yaml.safe_load(yml) + return cls.from_dict(config_dict) diff --git a/coremltools/optimize/coreml/_post_training_quantization.py b/coremltools/optimize/coreml/_post_training_quantization.py new file mode 100644 index 000000000..55db021ea --- /dev/null +++ b/coremltools/optimize/coreml/_post_training_quantization.py @@ -0,0 +1,329 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. 
+# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +from coremltools import _SPECIFICATION_VERSION_IOS_16 +from coremltools.converters.mil import Operation as _Operation +from coremltools.converters.mil.converter import mil_convert as _mil_convert +from coremltools.converters.mil.frontend.milproto.load import load as _milproto_to_pymil +from coremltools.converters.mil.mil.passes.defs.quantization import ( + AbstractQuantizationPass as _AbstractQuantizationPass, +) +from ._quantization_passes import ( + linear_quantize_weights as _linear_quantize_weights, + palettize_weights as _palettize_weights, + prune_weights as _prune_weights, + WeightDecompressor as _WeightDecompressor, +) +from coremltools.models import MLModel as _MLModel +from coremltools.optimize.coreml import OptimizationConfig as _OptimizationConfig + +_DEFAULT_SPECIFICATION_VERSION_FOR_COMPRESSION = _SPECIFICATION_VERSION_IOS_16 + +def _apply_graph_pass(mlmodel, graph_pass): + # Utility function which compresses a coreml model + # convert the fully precision mlmodel into pymil program + model_spec = mlmodel.get_spec() + model_type = model_spec.WhichOneof("Type") + if model_type in ("neuralNetwork", "neuralNetworkClassifier", "neuralNetworkRegressor", "pipeline", "PipelineClassifier", "PipelineRegressor"): + msg = ("coremltools.optimize.coreml are meant to be used only with mlprogram typed coreml models. " + "This model has type {}. Please use coremltools.models.neural_network.quantization_utils.quantize_weights" + "instead to compress the weights of the model.") + raise TypeError(msg.format(model_type)) + elif model_type == "mlProgram": + pass + else: + raise TypeError("weight compression not applicable for model type {}".format(model_type)) + + assert isinstance(graph_pass, _AbstractQuantizationPass), "compression pass must be an AbstractQuantizationPass instance" + specification_version = max(model_spec.specificationVersion, _DEFAULT_SPECIFICATION_VERSION_FOR_COMPRESSION) + prog = _milproto_to_pymil( + model_spec=model_spec, + specification_version=specification_version, + file_weights_dir=mlmodel.weights_dir, + ) + + # apply compression graph pass + graph_pass.apply(prog) + + # convert the pymil program back to mlmodel + compressed_mlmodel = _mil_convert( + prog, + convert_to="mlprogram", + convert_from="milinternal", + specification_version=specification_version, + compute_units=mlmodel.compute_unit, + model_description=model_spec.description, + ) + return compressed_mlmodel + +def linear_quantize_weights(mlmodel: _MLModel, config: _OptimizationConfig): + """ + Utility function to convert a float precision MLModel of type ``mlprogram``, which uses + float-precision weights, into a compressed MLModel that uses 8-bit weights. This is + achieved by converting the float weight values that are stored in the ``const`` op + into the ``constexpr_affine_dequantize`` op. + + This function uses linear quantization on the float weights, providing up to 2x + savings in storage compared to float 16, or up to 4x savings compared to float 32. + All computation at runtime uses float precision; the precision of the intermediate + tensors and the compute precision of the ops are not altered. + + For each weight, this utility function converts the weight into the int8 or uint8 type using + either `linear interpolation` (``"linear"`` mode) or `linear symmetric + interpolation` (``"linear_symmetric"`` mode, the default). 
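As a concrete illustration of the arithmetic described below (a standalone numpy sketch of the per-tensor ``"linear_symmetric"`` case, not the MIL graph pass itself), the scale, zero point, and quantized weights can be computed as:

    import numpy as np

    def linear_symmetric_quantize_int8(w_r):
        # Symmetric range [-R, R] with R = max(|w_r|), mapped onto [-127, 127].
        R = np.max(np.abs(w_r))
        s = R / 127.0                 # scale (assumes R > 0)
        z = 0                         # zero point is 0 for int8 in symmetric mode
        w_q = np.clip(np.round(w_r / s), -127, 127).astype(np.int8)
        return w_q, s, z

    w_r = np.array([-0.8, -0.1, 0.0, 0.4, 1.2], dtype=np.float32)
    w_q, s, z = linear_symmetric_quantize_int8(w_r)
    w_rec = s * (w_q.astype(np.float32) - z)  # dequantize: w_r is approximately s * (w_q - z)

Note that for weights of rank greater than 1 the pass computes ``s`` and ``z`` per output channel, as described below; the sketch above shows only the per-tensor arithmetic.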
+ + **Linear interpolation** + + Linear interpolation (``"linear"`` mode) maps the min/max of the float + range to the 8-bit integer range ``[low, high]`` using a zero point (also called quantization bias, or + offset) and a scale factor. For the int8 quantization, ``[low, high] = [-128, 127]``, while uint8 + quantization uses range ``[0, 255]``. + + ``"linear"`` mode uses the quantization formula: + + .. math:: + w_r = s * (w_q - z) + + Where: + + * :math:`w_r` and :math:`s` are of type float. + * :math:`w_r`` represents the float precision weight. + * :math:`s` represents the scale. + * :math:`w_q` and :math:`z` are of type 8-bit integer. + * :math:`w_q` represents quantized weight. + * :math:`z` represents the zero point. + + Quantized weights are computed as follows: + + .. math:: + w_q = cast\_to\_8\_bit\_integer(w_r / s + cast\_to\_float(z)) + + Note: :math:`cast\_to\_8\_bit\_integer` is the process of clipping the input to range ``[low, high]`` followed by rounding and casting to 8-bit integer. + + In ``"linear"`` mode, ``s, z`` are computed by mapping the original float range + ``[A, B]`` into the 8-bit integer range ``[-128, 127]`` or ``[0, 255]``. That is, you are solving the + following linear equations: + + * ``B = s * (high - z)`` + * ``A = s * (low - z)`` + + The equations result in the following: + + * ``s = (B - A) / (high - low)`` + * ``z = cast_to_8_bit_integer((low * B - high * A) / (B - A))`` + + When the rank of weight ``w`` is 1, then ``s`` and ``z`` are both scalars. When the + rank of the weight is greater than 1, then ``s`` and ``z`` are both vectors. In that + case, scales are computed per `channel`, in which `channel` is the output dimension, + which corresponds to the first dimension for ops such as ``conv`` and ``linear``, and + the second dimension for the ``conv_transpose`` op. + + For ``"linear"`` mode, :math:`A = min(w_r)`, :math:`B = max(w_r)`. + + **Linear symmetric interpolation** + + With linear symmetric interpolation (``"linear_symmetric"`` mode, the default), rather than + mapping the exact min/max of the float range to the quantized range, the function + chooses the maximum absolute value between the min/max, which results in a + floating-point range that is symmetric with respect to zero. This also makes the resulting zero + point ``0`` for int8 weight and ``127`` for uint8 weight. + + For ``"linear_symmetric"`` mode: + + * :math:`A = -R` and :math:`B = R`, where :math:`R = max(abs(w_r))`. + * This function maps to the range of ``[-127, 127]`` for int8 weight and ``[0, 254]`` for uint8 weight. + * The result is ``s=(B-A)/254`` -> ``s=2R/254`` -> ``s=R/127``. + * Solving for ``z``: + * int8: ``z = (-127 * R + 127 * R)/2R`` -> ``z=0``. + * uint8: ``z = (0 * R + 254 * R)/2R`` -> ``z=127``. + + Parameters + ---------- + mlmodel: MLModel + Model to be quantized. This MLModel should be of type ``mlprogram``. + + config: OptimizationConfig + An :py:class:`OptimizationConfig` object that specifies the parameters for weight quantization. + + Returns + ------- + + model: MLModel + The quantized MLModel instance. + + Examples + -------- + .. 
sourcecode:: python + + import coremltools as ct + import coremltools.optimize as cto + + model = ct.coreml.models.MLModel('my_model.mlpackage') + config = cto.coreml.OptimizationConfig( + global_config=cto.coreml.OpLinearQuantizerConfig(mode="linear_symmetric") + ) + compressed_model = cto.coreml.linear_quantize_weights(model, config) + + """ + + linear_weight_quantizer = _linear_quantize_weights(config, fake_compression=False) + return _apply_graph_pass(mlmodel, linear_weight_quantizer) + +def palettize_weights(mlmodel: _MLModel, config: _OptimizationConfig): + """ + Utility function to convert a float precision MLModel of type ``mlprogram`` to a + compressed MLModel by reducing the overall number of weights using a lookup table + (LUT). A LUT contains a list of float values. An `nbit` LUT has 2\ :sup:`nbits` entries. + + For example, a float weight vector such as ``{0.3, 0.3, 0.5, 0.5}`` can be compressed + using a 1-bit LUT: ``{0.3, 0.5}``. In this case the float vector can be replaced + with a 1-bit vector ``{0, 0, 1, 1}``. + + This function iterates over all the weights in the ``mlprogram``, discretizes its values, + and constructs the LUT according to the algorithm specified in ``mode``. The float + values are then converted to the `nbit` values, and the LUT is saved alongside each + weight. The ``const`` ops storing weight values are replaced by + ``constexpr_lut_to_dense`` ops. + + At runtime, the LUT and the `nbit` values are used to reconstruct the float weight + values, which are then used to perform the float operaton the weight is feeding into. + + Consider the following example of ``"uniform"`` mode (a linear histogram): + + * ``nbits = 4`` + * ``mode = "uniform"`` + * ``weight = [0.11, 0.19, 0.3, 0.08, 0.0, 0.02]`` + + The weight can be converted to a palette with indices ``[0, 1, 2, 3]`` (2 bits). The + indices are a byte array. + + The data range ``[0.0, 0.3]`` is divided into 4 partitions linearly, which is + ``[0.0, 0.1, 0.2, 0.3]``. + + * The LUT would be ``[0.0, 0.1, 0.2, 0.3]``. + + * The weight is rounded to ``[0.1, 0.2, 0.3, 0.1, 0.0, 0.0]``, and represented in + the palette as indices ``[01b, 10b, 11b, 01b, 00b, 00b]``. + + Parameters + ---------- + mlmodel: MLModel + Model to be converted by a LUT. This MLModel should be of type ``mlprogram``. + + config: OptimizationConfig + An :py:class:`OptimizationConfig` object that specifies the parameters for weight palettization. + + Returns + ------- + model: MLModel + The palettized MLModel instance. + + Example + ------- + + .. sourcecode:: python + + import coremltools as ct + import coremltools.optimize as cto + + model = ct.models.MLModel('my_model.mlpackage') + config = cto.coreml.OptimizationConfig( + global_config=cto.coreml.OpPalettizerConfig(mode="kmeans", nbits=4) + ) + compressed_model = cto.coreml.palettize_weights(model, config) + + """ + + weight_palettizer = _palettize_weights(config, fake_compression=False) + return _apply_graph_pass(mlmodel, weight_palettizer) + +def prune_weights(mlmodel: _MLModel, config: _OptimizationConfig): + """ + Utility function to convert a float precision MLModel of type ``mlprogram`` to a + compressed MLModel using sparse representation. The ``const`` ops storing weight + values are replaced by ``constexpr_sparse_to_dense`` ops. + + This function is useful if the model is trained with pruning techniques so that + a lot of weights have zero values. If a large percentage of weight values are zero, + a sparse representation is more efficient than a dense one (the default). 
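To make the sparse representation concrete, the following is an illustrative numpy sketch of the packing scheme described next (it mirrors the bit-mask layout; it does not call into the converter):

    import numpy as np

    def sparsify(weight):
        flat = weight.flatten()
        nonzero_data = flat[flat != 0]
        # One bit per element, set where the value is non-zero, packed
        # eight per byte in little-endian bit order.
        bit_mask = np.packbits((flat != 0).astype(np.uint8), bitorder="little")
        return nonzero_data, bit_mask, weight.shape

    weight = np.array([0.3, 0.0, 0.0, 0.5, 0.0, 0.0], dtype=np.float32)
    nonzero_data, bit_mask, shape = sparsify(weight)
    # nonzero_data -> [0.3, 0.5]; the mask bits read 1, 0, 0, 1, 0, 0.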
+ + The sparsified weights are stored in a bit mask. If the weight values are + ``{0, 0, 0, 0, 0, 0, 0, 56.3}``, its sparse representation contains a bit mask with + ones on locations where the value is non-zero: ``00000001b``. This is accompanied by + non-zero data, which is a size-1 vector of value ``{56.3}``. + + For example, given the following: + + * ``weight = [0.3, 0, 0, 0.5, 0, 0]`` + * ``non_zero_data, bit_mask = sparsify(weight)`` + + The indices of the non-zero elements are: + + * ``non_zero_data = [0.3, 0.5]`` + * ``bit_mask = "100100"`` + + Parameters + ---------- + mlmodel: MLModel + Model to be sparsified. This MLModel should be of type ``mlprogram``. + + config: OptimizationConfig + An :py:class:`OptimizationConfig` object that specifies the parameters for weight pruning. + + Returns + ------- + model: MLModel + The sparse MLModel instance. + + Example + ------- + .. sourcecode:: python + + import coremltools as ct + import coremltools.optimize as cto + + model = ct.models.MLModel('my_model.mlpackage') + config = cto.coreml.OptimizationConfig( + global_config=cto.coreml.OpThresholdPrunerConfig(threshold=1e-3) + ) + compressed_model = cto.coreml.prune_weights(model, config) + + """ + + weight_pruner = _prune_weights(config, fake_compression=False) + return _apply_graph_pass(mlmodel, weight_pruner) + +def decompress_weights(mlmodel: _MLModel): + """ + Utility function to convert weights that are sparse or palettized or affine quantized, back to the float format. + That is, convert any of the following three ops to ``mb.const``: + + (1) ``constexpr_affine_dequantize`` + (2) ``constexpr_lut_to_dense`` + (3) ``constexpr_sparse_to_dense`` + + Parameters + ---------- + mlmodel: MLModel + Model which will be decompressed. + + Returns + ------- + model: MLModel + The MLModel with no ``constexpr`` ops included. + + Example + ------- + .. sourcecode:: python + + import coremltools as ct + + model = ct.models.MLModel("my_compressed_model.mlpackage") + decompressed_model = ct.optimize.coreml.decompress_weights(model) + + """ + + weight_decompressor = _WeightDecompressor(op_selector=lambda op: True) + return _apply_graph_pass(mlmodel, weight_decompressor) diff --git a/coremltools/optimize/coreml/_quantization_passes.py b/coremltools/optimize/coreml/_quantization_passes.py new file mode 100644 index 000000000..55bce9272 --- /dev/null +++ b/coremltools/optimize/coreml/_quantization_passes.py @@ -0,0 +1,750 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. 
+# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import numpy as np +from tqdm import tqdm + +from coremltools import _logger as logger +from coremltools.converters.mil.backend.mil.load import should_use_weight_file +from coremltools.converters.mil.mil import Builder as mb +from coremltools.converters.mil.mil import Operation, Program, types +from coremltools.converters.mil.mil.ops.defs.iOS16 import ( + constexpr_affine_dequantize, + constexpr_lut_to_dense, + constexpr_sparse_to_dense, +) +from coremltools.converters.mil.mil.passes.defs.quantization import AbstractQuantizationPass +from coremltools.converters.mil.mil.passes.helper import block_context_manager +from coremltools.converters.mil.mil.passes.pass_registry import register_pass +from coremltools.converters.mil.mil.types.type_mapping import nptype_from_builtin +from coremltools.models.neural_network.quantization_utils import _get_kmeans_lookup_table_and_weight +from coremltools.optimize.coreml._config import ( + OpLinearQuantizerConfig, + OpMagnitudePrunerConfig, + OpPalettizerConfig, + OpThresholdPrunerConfig, + OptimizationConfig, +) + +""" +-------------------------------- +Compression parameters wrapper - +-------------------------------- +""" +class SparseParams: + def __init__(self, nonzero_data=None, mask=None, shape=None): + self.nonzero_data = nonzero_data + self.mask = mask + self.shape = shape + +class LutParams: + def __init__(self, lut=None, indices=None, shape=None): + self.lut = lut + self.indices = indices + self.shape = shape + +class AffineQuantParams: + def __init__(self, quantized_data=None, zero_point=None, scale=None, axis=None): + self.quantized_data = quantized_data + self.zero_point = zero_point + self.scale = scale + self.axis = axis + +""" +------------------------ +Compression graph pass - +------------------------ +""" +class AbstractCompressionPass(AbstractQuantizationPass): + """ + The abstract class for the compression graph passes. + """ + def __init__(self, config: OptimizationConfig = None, fake_compression: bool = False): + if not isinstance(config, (OptimizationConfig, type(None))): + raise ValueError(f"config must be of type OptimizationConfig. 
Got {type(config)}.")
+
+        op_selector = None if config is None else config._op_selector
+
+        super().__init__(op_selector=op_selector)
+
+        self.fake_compression = fake_compression
+        self._config = config
+        if config is not None:
+            self._check_config_type(config)
+
+    def apply(self, prog):
+        if not isinstance(prog, Program):
+            raise TypeError('Transform "{}" can only be applied on PyMIL programs.'.format(self))
+
+        @block_context_manager
+        def apply_block(block):
+            valid_consts = []
+            for op in list(block.operations):
+                for b in op.blocks:
+                    apply_block(b)
+
+                if self.is_valid_op(op):
+                    need_transform = True
+                    if self.op_selector is not None:
+                        need_transform = self.op_selector(op)
+
+                    if need_transform:
+                        valid_consts.append(op)
+
+            for op in tqdm(
+                valid_consts,
+                desc=f"Running compression pass {self.__class__.__name__}",
+                unit=" ops",
+            ):
+                self.transform_op(op)
+
+        for f in prog.functions.values():
+            apply_block(f)
+
+    @property
+    def config(self):
+        return self._config
+
+    @config.setter
+    def config(self, value):
+        self._check_config_type(value)
+        self._config = value
+
+    @staticmethod
+    def need_compress_const(op: Operation, _is_deprecated: bool, weight_threshold: float):
+        """
+        This utility function checks whether a const op can be compressed.
+        If ``_is_deprecated = True``, the user is using the deprecated ``ct.compression_utils`` API, in which the ops are already filtered by ``op_selector``.
+        For the new ``ct.optimize.coreml`` API, ``op_selector`` is no longer supported, so the ``weight_threshold`` is checked explicitly instead.
+        """
+        val = op.outputs[0].val
+        if _is_deprecated and weight_threshold is not None:
+            raise ValueError("weight_threshold cannot be set through the deprecated ct.compression_utils API")
+
+        if _is_deprecated:
+            return should_use_weight_file(val)
+
+        # const fed into constexpr ops cannot be compressed
+        if any([child_op.op_type.startswith("constexpr") for child_op in op.outputs[0].child_ops]):
+            return False
+
+        if weight_threshold is None:
+            raise ValueError("weight_threshold cannot be None")
+
+        return should_use_weight_file(val) and val.size > weight_threshold
+
+    def _check_config_type(self, config: OptimizationConfig):
+        """
+        This utility function checks that the OptimizationConfig holds the correct type of op config.
+        """
+        def get_supported_types_as_str(supported_type):
+            if not isinstance(supported_type, (tuple, list)):
+                supported_type = [supported_type]
+            return ", ".join([f"{val.__name__}" for val in supported_type])
+
+        all_configs = []
+        if config.global_config is not None:
+            all_configs.append(config.global_config)
+        all_configs.extend(list(config.op_type_configs.values()))
+        all_configs.extend(list(config.op_name_configs.values()))
+
+        for config in all_configs:
+            if not isinstance(config, self._SUPPORTED_CONFIG_TYPE) and config is not None:
+                supported_type_str = get_supported_types_as_str(self._SUPPORTED_CONFIG_TYPE)
+                raise ValueError(f"{self.__class__.__name__} only accepts configs of type {supported_type_str}. Got {config.__class__.__name__}.")
+
+@register_pass(namespace="compression")
+class prune_weights(AbstractCompressionPass):
+    """
+    This transform works for each ``const`` op if:
+
+    - ``_is_deprecated=True`` and the ``op_selector`` returns ``True``.
+    - ``_is_deprecated=False`` and the ``const`` value size ``> weight_threshold``.
+
+    The transform performs the following:
+
+    - The fraction of values with the smallest absolute value (``self.sparsity``) is zeroed out.
+ - If ``fake_compression=False``, the zeroed-out value is encoded using the ``constexpr_sparse_to_dense`` op. + - If ``fake_compression=True``, the zeroed-out value is encoded using the ``const`` op. + - Old ``const`` is replaced by a new operation with zeroed-out value. + """ + _SUPPORTED_CONFIG_TYPE = (OpMagnitudePrunerConfig, OpThresholdPrunerConfig) + + def is_valid_op(self, op: Operation): + if op.op_type == "const" and should_use_weight_file(op.outputs[0].val): + return True + return False + + @staticmethod + def _pack_val_to_sparse_param(val): + flattened_val = val.flatten() + params = SparseParams() + params.nonzero_data = flattened_val[np.where(flattened_val != 0)] + params.mask = np.packbits(np.where(flattened_val != 0, 1, 0), bitorder="little") + params.shape = val.shape + return params + + @staticmethod + def compress_by_threshold(val, threshold, minimum_sparsity_percentile): + val = np.where(np.abs(val) <= threshold, 0, val) + sparsity_percentile = np.sum(val == 0.0) / val.size + if sparsity_percentile < minimum_sparsity_percentile: + msg = (f"weight value has sparsity of {sparsity_percentile} < " + f"minimum_sparsity_percentile {minimum_sparsity_percentile}. Skipped." + ) + logger.warning(msg) + return None + return prune_weights._pack_val_to_sparse_param(val) + + @staticmethod + def compress_by_magnitude(val, target_sparsity, block_size=None, dim=None): + def _apply_block_sparsity(val, block_size, dim): + shape = val.shape + rank = len(shape) + assert dim in [0, 1], "bock sparsity pruning only supports dim [0, 1]." + assert rank in [2, 3, 4, 5], "block sparsity only supports weights of rank [2, 3, 4, 5]" + """ + Block sparsity follows these steps: + + 1. Input tensor with shape of ``[C_out, Cin, *K]``. + 2. If ``dim = 1``, the tensor is transposed to ``[Cin, C_out, *K]``. The following example assumes ``dim = 0``. + 3. Pad ``C_out`` so that it can be divided by ``block_size``: ``[C_out_pad, Cin, *K]``. + 4. Divide the output channel by ``block_size`` and reshape: ``[C_out_pad // block_size, block_size, C_in, *K]``. + 5. Compute the magnitude for each block: ``[C_out_pad // block_size, 1, C_in, *K]``. + 6. Replicate the magnitude values for each block: ``[C_out_pad // block_size, block_size, C_in, *K]``. + 7. Reshape the tensor back to ``[Cout_pad, C_in, *K]``. + 8. Crop the tensor to ``[C_out, C_in, *K]``. + 9. If ``dim = 1``, tranpose the tensor back to the original layout. + """ + if dim == 1: + perm = [1, 0] + list(range(2, rank)) + val = np.transpose(val, axes=perm) + + channel = val.shape[0] + if channel % block_size != 0: + pad_size = block_size - channel % block_size + pad_value = [(0, pad_size)] + [(0, 0)] * (rank - 1) + val = np.pad(val, pad_value) + shape_padded = val.shape + assert shape_padded[0] % block_size == 0 + + new_shape = list(shape_padded) + new_shape.insert(1, block_size) + new_shape[0] = new_shape[0] // block_size + val = np.reshape(val, (new_shape)) + + val = val * val + val = np.sum(val, axis=1, keepdims=True) + val = np.sqrt(val) + + reps = [1] * (rank + 1) + reps[1] = block_size + val = np.tile(val, reps) + val = np.reshape(val, shape_padded) + val = val[:channel] + + if dim == 1: + val = np.transpose(val, axes=perm) + + return val + + magnitude_map = np.abs(val) + if block_size is not None: + channel = magnitude_map.shape[dim] + if block_size > channel / 2: + logger.warning( + f"block_size > channel / 2 is not applicable for block sparsity. Got block_size = {block_size}, channel = {channel}. Skipped." 
+                )
+                return None
+
+            magnitude_map = _apply_block_sparsity(magnitude_map, block_size, dim)
+        q = target_sparsity * 100
+        if q == 100:
+            val = 0 * val
+        elif q != 0:
+            val = np.where(magnitude_map <= np.percentile(magnitude_map, q), 0, val)
+        return prune_weights._pack_val_to_sparse_param(val)
+
+    @staticmethod
+    def compress_by_nm_sparsity(val, n_m_ratio, dim):
+        n, m = n_m_ratio
+        assert n <= m
+        shape = val.shape
+        rank = len(shape)
+        assert dim in [0, 1], "n:m pruning only supports dim [0, 1]."
+        assert rank in [2, 3, 4, 5], "n:m pruning only supports weights of rank [2, 3, 4, 5]"
+        """
+        The ``n:m`` pruning process follows these steps:
+        1. Input tensor with shape ``[C_out, C_in, *K]``, where ``K`` is the spatial dimension from ``0`` to ``3``.
+        2. If ``dim = 1``, transpose the tensor to shape ``[*K, C_out, C_in]``; otherwise (``dim = 0``), to ``[*K, C_in, C_out]``.
+        3. For the case of ``dim = 1``, reshape the input to a 2D tensor ``[*K*C_out, C_in]``. Similarly for ``dim = 0``.
+        4. Pad the last dimension with ``0`` so that it can be divided by ``m``: ``[*K*C_out, C_in_pad]``.
+        5. Reshape the tensor to have the last dimension ``m``: ``[*K*C_out*C_in_pad//m, m]``.
+        6. For each vector of length ``m``, set the ``n`` elements with the lowest magnitude to ``0``.
+        7. Reshape the tensor back to the shape of ``[*K*C_out, C_in_pad]``.
+        8. Crop the last dimension to match the original shape of ``[*K*C_out, C_in]``.
+        9. Reshape the tensor to shape ``[*K, C_out, C_in]``.
+        10. Transpose the tensor back to ``[C_out, C_in, *K]``.
+        """
+        perm = list(range(2, rank)) + [0, 1]
+        if dim == 0:
+            perm[-2], perm[-1] = 1, 0
+        weight = np.copy(np.transpose(val, axes=perm))
+        shape_begin = weight.shape
+
+        weight = np.reshape(weight, (-1, weight.shape[-1]))
+        channel = weight.shape[-1]
+        if m > channel / 2:
+            logger.warning(
+                f"m > channel / 2 is not applicable for n:m pruning. Got m = {m}, channel = {channel}. Skipped."
+ ) + return None + if channel % m != 0: + pad_size = m - channel % m + weight = np.pad(weight, ((0, 0), (0, pad_size))) + shape_padded = weight.shape + assert shape_padded[-1] % m == 0 + + weight = np.reshape(weight, (-1, m)) + magnitute = np.abs(weight) + indices = np.argsort(magnitute, axis=-1)[:, :n] + + n_m_mask = np.zeros(weight.shape).astype(val.dtype) + np.put_along_axis(n_m_mask, indices, 1.0, axis=-1) + n_m_mask = np.reshape(n_m_mask, shape_padded) + n_m_mask = n_m_mask[:, :channel] + + n_m_mask = np.reshape(n_m_mask, shape_begin) + perm_back = [perm.index(i) for i in range(rank)] + n_m_mask = np.transpose(n_m_mask, axes=perm_back) + + val = val * (1 - n_m_mask) + return prune_weights._pack_val_to_sparse_param(val) + + @staticmethod + def decompress(params): + if not isinstance(params, SparseParams): + raise ValueError("Invalid type of params") + return constexpr_sparse_to_dense.decompress(params.nonzero_data, params.mask, params.shape) + + def transform_op(self, op: Operation): + op_config = self.config._get_const_op_config(op) + if op_config is None: + return + if not self.need_compress_const(op, self.config._is_deprecated, op_config.weight_threshold): + return + + if not isinstance(op.outputs[0].val, (np.ndarray, np.generic)): + raise ValueError("Only numpy arrays are supported") + + if isinstance(op_config, OpThresholdPrunerConfig): + sparse_params = self.compress_by_threshold( + val=op.outputs[0].val, + threshold=op_config.threshold, + minimum_sparsity_percentile=op_config.minimum_sparsity_percentile + ) + elif isinstance(op_config, OpMagnitudePrunerConfig): + # Structural sparsity can only be applied to conv / linear weight + # For non applicable constant, we skip the compression, + # we do allow the user to do structural pruning for non applicable constant, + # if it is explicitly set by set_op_name, + if not op_config._check_const_op_is_valid(op): + if op.name not in self.config.op_name_configs: + logger.warning(f"op named {op.name} not applicable for {OpMagnitudePrunerConfig} configuration. Skipped.") + return + + if op_config.target_sparsity is not None: + sparse_params = self.compress_by_magnitude( + val=op.outputs[0].val, + target_sparsity=op_config.target_sparsity, + block_size=op_config.block_size, + dim=op_config.dim, + ) + elif op_config.n_m_ratio is not None: + sparse_params = self.compress_by_nm_sparsity( + val=op.outputs[0].val, + n_m_ratio=op_config.n_m_ratio, + dim=op_config.dim, + ) + + if sparse_params is None: + return + + if not self.fake_compression: + new_var = mb.constexpr_sparse_to_dense( + nonzero_data=sparse_params.nonzero_data, + mask=sparse_params.mask, + shape=np.uint32(sparse_params.shape), + before_op=op, + name=op.name + "_sparsified", + ) + else: + decompressed_val = self.decompress(sparse_params) + new_var = mb.const( + val=decompressed_val, + before_op=op, + name=op.name + "_fake_sparsified", + ) + + op.enclosing_block.replace_uses_of_var_after_op( + anchor_op=op, + old_var=op.outputs[0], + new_var=new_var, + no_check_var_types=True, + ) + + op.enclosing_block.remove_ops([op]) + +@register_pass(namespace="compression") +class palettize_weights(AbstractCompressionPass): + """ + This transform works for each ``const`` op if: + + - ``_is_deprecated=True`` and the ``op_selector`` returns ``True``. + - ``_is_deprecated=False`` and the ``const`` value size ``> weight_threshold``. + + The transform performs the following: + + - A linear look-up table (LUT) with 2\ :sup:`nbits` entries is created with values represented by indexing into this LUT. 
+    - If ``fake_compression=False``, the compressed value is encoded using the ``constexpr_lut_to_dense`` op.
+    - If ``fake_compression=True``, the compressed value is decompressed and then encoded using the ``const`` op.
+    - The old ``const`` op is replaced by a newly created operation.
+    """
+    _SUPPORTED_CONFIG_TYPE = OpPalettizerConfig
+
+    def is_valid_op(self, op: Operation):
+        if op.op_type == "const" and should_use_weight_file(op.outputs[0].val):
+            return True
+        return False
+
+    @staticmethod
+    def compress(val, mode, nbits=None, lut_function=None):
+
+        def compress_kmeans(val, nbits):
+            lut, indices = _get_kmeans_lookup_table_and_weight(nbits, val)
+            lut = lut.astype(val.dtype)
+            indices = indices.astype(np.uint8)
+            return lut, indices
+
+        def compress_uniform(val, nbits):
+            val = val.flatten()
+            val_min = np.amin(val)
+            val_max = np.amax(val)
+            scale = (val_max - val_min) / ((1 << nbits) - 1)
+            indices = np.round(((val - val_min) / (val_max - val_min)) * ((1 << nbits) - 1)).astype(
+                np.uint8
+            )
+            lut = np.array(range(0, 1 << nbits)) * scale + val_min
+            lut = lut.astype(val.dtype)
+            return lut, indices
+
+        def get_nbits_for_unique_mode(val):
+            val = val.flatten()
+            unique_vals = np.unique(val).tolist()
+            for nbits in (1, 2, 4, 6, 8):
+                if len(unique_vals) <= 1 << nbits:
+                    return nbits
+            msg = "Weight cannot be represented by an 8-bit palettization. Skipped."
+            logger.warning(msg)
+            return None
+
+        def compress_unique(val, nbits):
+            val = val.flatten()
+            unique_vals = np.unique(val).tolist()
+            if len(unique_vals) > 1 << nbits:
+                msg = "Too many unique values ({}) in the weight. They cannot be represented in {} bits.".format(
+                    len(unique_vals), nbits
+                )
+                raise ValueError(msg)
+            lut = [0] * (1 << nbits)
+            lut[: len(unique_vals)] = unique_vals
+            indices = np.zeros((len(val),))
+            for i, k in enumerate(lut[:len(unique_vals)]):
+                indices += (i + 1) * (val == k).astype(np.int32)
+            indices = indices - 1
+            assert (
+                len(np.where(indices == -1)[0]) == 0
+            ), "every weight value must map to an existing LUT index"
+
+            lut = np.array(lut).astype(val.dtype)
+            indices = indices.astype(np.uint8)
+            return lut, indices
+
+        def pack_indices_into_bytes_array(indices, nbits):
+            bitarray = np.unpackbits(indices.reshape(-1, 1), bitorder="little", axis=-1)[:, :nbits]
+            return np.packbits(bitarray.flatten(), bitorder="little")
+
+        def check_lut_parameters_are_valid(val, lut, indices):
+            if not isinstance(lut, np.ndarray) or not isinstance(indices, np.ndarray):
+                raise ValueError("LUT and indices must be numpy arrays.")
+
+            if indices.size != val.size:
+                msg = "Indices size ({}) does not match the original weight size ({}).".format(
+                    indices.size, val.size
+                )
+                raise ValueError(msg)
+
+            if len(indices.shape) != 1 or indices.dtype != np.uint8:
+                msg = "Indices must be a numpy vector of type uint8. Found shape {} with type {}".format(
+                    indices.shape, indices.dtype
+                )
+                raise ValueError(msg)
+
+            if lut.dtype != val.dtype:
+                msg = "Dtype mismatch between LUT ({}) and weight ({})".format(
+                    lut.dtype, val.dtype
+                )
+                raise ValueError(msg)
+
+        if not isinstance(val, (np.ndarray, np.generic)):
+            raise ValueError(f"Only numpy arrays are supported. 
Got {type(val)}") + + if mode == "KMEANS": + lut, indices = compress_kmeans(val, nbits) + elif mode == "UNIFORM": + lut, indices = compress_uniform(val, nbits) + elif mode == "UNIQUE": + nbits = get_nbits_for_unique_mode(val) + if nbits is None: + return None + lut, indices = compress_unique(val, nbits) + elif mode == "CUSTOM": + lut, indices = lut_function(val) + + check_lut_parameters_are_valid(val, lut, indices) + + params = LutParams() + params.lut = lut + params.shape = val.shape + params.indices = pack_indices_into_bytes_array(indices, int(np.log2(lut.shape[0]))) + return params + + @staticmethod + def decompress(params): + if not isinstance(params, LutParams): + raise ValueError("Invalid type of params") + return constexpr_lut_to_dense.decompress(params.lut, params.indices, params.shape) + + def transform_op(self, op: Operation): + op_config = self.config._get_const_op_config(op) + if op_config is None: + return + if not self.need_compress_const(op, self.config._is_deprecated, op_config.weight_threshold): + return + + lut_params = self.compress( + op.outputs[0].val, + op_config.mode, + op_config.nbits, + op_config.lut_function + ) + + if lut_params is None: + return + + if not self.fake_compression: + new_var = mb.constexpr_lut_to_dense( + indices=lut_params.indices, + lut=lut_params.lut, + shape=np.uint32(lut_params.shape), + before_op=op, + name=op.name + "_palettized", + ) + else: + decompressed_val = self.decompress(lut_params) + new_var = mb.const( + val=decompressed_val, + before_op=op, + name=op.name + "_fake_palettized", + ) + + op.enclosing_block.replace_uses_of_var_after_op( + anchor_op=op, + old_var=op.outputs[0], + new_var=new_var, + no_check_var_types=True, + ) + + op.enclosing_block.remove_ops([op]) + +@register_pass(namespace="compression") +class linear_quantize_weights(AbstractCompressionPass): + """ + This transform works for each ``const`` op if: + + - ``_is_deprecated=True`` and the ``op_selector`` returns ``True``. + - ``_is_deprecated=False`` and the ``const`` value size ``> weight_threshold``. + + The transform performs the following: + + - Values are linearly quantized into unsigned 8-bits. + - If ``fake_compression=False``, compressed value is encoded using the ``constexpr_affine_dequantize`` op. + - If ``fake_compression=True``, compressed value is decompressed and then encoded using the ``const`` op. + """ + _SUPPORTED_CONFIG_TYPE = OpLinearQuantizerConfig + + def is_valid_op(self, op: Operation): + if op.op_type == "const" and should_use_weight_file(op.outputs[0].val): + return True + return False + + @staticmethod + def _get_axis(op): + axis = 0 + var = op.outputs[0] + if len(var.child_ops) == 1 and var.child_ops[0].op_type == "conv_transpose": + axis = 1 + return axis + + @staticmethod + def compress(val, axis, mode, dtype): + def _ensure_numerical_range_and_cast(val, low, high, np_dtype): + ''' + For some cases, the computed quantized data might exceed the data range. + For instance, after rounding and addition, we might get `128` for the int8 quantization. + This utility function ensures the val in the data range before doing the cast. 
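+            As a concrete example of this behavior (based on the ``mode_dtype_to_range`` table below),
+            int8 ``LINEAR`` quantization uses the range ``[-128, 127]``, so a computed value of ``128``
+            is clamped to ``127`` before the cast to the target numpy dtype.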
+ ''' + val = np.minimum(val, high) + val = np.maximum(val, low) + return val.astype(np_dtype) + + mode_dtype_to_range = { + (types.int8, "LINEAR"): (-128, 127), + (types.int8, "LINEAR_SYMMETRIC"): (-127, 127), + (types.uint8, "LINEAR"): (0, 255), + (types.uint8, "LINEAR_SYMMETRIC"): (0, 254), + } + + if not isinstance(val, (np.ndarray, np.generic)): + raise ValueError("Only numpy arrays are supported") + + params = AffineQuantParams() + axes = tuple([i for i in range(len(val.shape)) if i != axis]) + val_min = np.amin(val, axis=axes, keepdims=True) + val_max = np.amax(val, axis=axes, keepdims=True) + + if mode == "LINEAR_SYMMETRIC": + # For the linear_symmetric mode, the range is symmetrical to 0 + max_abs = np.maximum(np.abs(val_min), np.abs(val_max)) + val_min = -max_abs + val_max = max_abs + else: + assert mode == "LINEAR" + # For the linear mode, we need to make sure the data range contains `0` + val_min = np.minimum(0.0, val_min) + val_max = np.maximum(0.0, val_max) + + q_val_min, q_val_max = mode_dtype_to_range[(dtype, mode)] + + # Set the zero point to symmetric mode + np_dtype = nptype_from_builtin(dtype) + if mode == "LINEAR_SYMMETRIC": + if dtype == types.int8: + params.zero_point = (0 * np.ones(val_min.shape)).astype(np.int8) + else: + assert dtype == types.uint8 + params.zero_point = (127 * np.ones(val_min.shape)).astype(np.uint8) + else: + assert mode == "LINEAR" + params.zero_point = (q_val_min * val_max - q_val_max * val_min) / (val_max - val_min) + params.zero_point = np.round(params.zero_point) + params.zero_point = _ensure_numerical_range_and_cast(params.zero_point, q_val_min, q_val_max, np_dtype) + + # compute the params + params.scale = (val_max - val_min) / (q_val_max - q_val_min) + params.scale = params.scale.astype(val.dtype).squeeze() + + params.quantized_data = np.round( + val * (q_val_max - q_val_min) / (val_max - val_min) + ) + params.quantized_data = (params.quantized_data + params.zero_point) + params.quantized_data = _ensure_numerical_range_and_cast(params.quantized_data, q_val_min, q_val_max, np_dtype) + + params.zero_point = params.zero_point.squeeze() + params.axis = axis + + return params + + @staticmethod + def decompress(params): + if not isinstance(params, AffineQuantParams): + raise ValueError("Invalid type of params") + return constexpr_affine_dequantize.decompress( + params.quantized_data, params.zero_point, params.scale, params.axis + ) + + def transform_op(self, op: Operation): + op_config = self.config._get_const_op_config(op) + if op_config is None: + return + if not self.need_compress_const(op, self.config._is_deprecated, op_config.weight_threshold): + return + + quant_params = self.compress(op.outputs[0].val, self._get_axis(op), op_config.mode, op_config.dtype) + + if not self.fake_compression: + new_var = mb.constexpr_affine_dequantize( + quantized_data=quant_params.quantized_data, + zero_point=quant_params.zero_point, + scale=quant_params.scale, + axis=quant_params.axis, + before_op=op, + name=op.name + "_affine_quantized", + ) + else: + decompressed_val = self.decompress(quant_params) + new_var = mb.const( + val=decompressed_val, + before_op=op, + name=op.name + "_fake_affine_quantized", + ) + + op.enclosing_block.replace_uses_of_var_after_op( + anchor_op=op, + old_var=op.outputs[0], + new_var=new_var, + no_check_var_types=True, + ) + + op.enclosing_block.remove_ops([op]) + +@register_pass(namespace="compression") +class WeightDecompressor(AbstractQuantizationPass): + """ + This graph pass transforms the ``constexpr`` op back into 
``mb.const`` op. + The ``constexpr`` op includes: + + - ``constexpr_affine_dequantize`` + - ``constexpr_lut_to_dense`` + - ``constexpr_sparse_to_dense`` + """ + + def __init__(self, op_selector): + super().__init__(op_selector=op_selector) + + def is_valid_op(self, op): + return op.op_type in ( + "constexpr_affine_dequantize", + "constexpr_lut_to_dense", + "constexpr_sparse_to_dense", + ) + + def transform_op(self, op): + decompressed_val = op.value_inference() + new_var = mb.const( + val=decompressed_val, + before_op=op, + name=op.name, + ) + + op.enclosing_block.replace_uses_of_var_after_op( + anchor_op=op, + old_var=op.outputs[0], + new_var=new_var, + no_check_var_types=True, + force_replace=True, + ) + + op.enclosing_block.remove_ops([op]) diff --git a/coremltools/optimize/torch/__init__.py b/coremltools/optimize/torch/__init__.py new file mode 100644 index 000000000..e66d96feb --- /dev/null +++ b/coremltools/optimize/torch/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +from coremltools.optimize.torch import ( + base_model_optimizer, + optimization_config, + palettization, + pruning, + quantization, +) + +from ._logging import init_root_logger as _init_root_logger + +_logger = _init_root_logger() diff --git a/coremltools/optimize/torch/_logging.py b/coremltools/optimize/torch/_logging.py new file mode 100644 index 000000000..f046b7378 --- /dev/null +++ b/coremltools/optimize/torch/_logging.py @@ -0,0 +1,51 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import logging +import os + + +def init_root_logger(): + logger = get_root_logger() + logger.propagate = False + for handler in logger.handlers: + logger.removeHandler(handler) + logger.addHandler(logging.StreamHandler()) + level = os.environ.get("COREMLTOOLS_OPTIMIZE_TORCH_LOG_LEVEL", "info").upper() + logger.setLevel(level) + set_logger_formatter(logger) + return logger + + +def get_root_logger(): + return logging.getLogger("coremltools.optimize.torch") + + +def set_logger_formatter(logger, rank=None): + rank_component = f"rank {rank}:" if rank is not None else "" + fmt = f"{rank_component}%(asctime)s:%(name)s:%(lineno)s:%(levelname)s: %(message)s" + formatter = logging.Formatter(fmt=fmt) + for handler in logger.handlers: + handler.setFormatter(formatter) + + +def set_logger_filters(logger, rank=None): + for handler in logger.handlers: + handler.addFilter(RankZeroFilter(rank)) + + +def set_rank_for_root_logger(rank): + logger = get_root_logger() + set_logger_formatter(logger, rank) + set_logger_filters(logger, rank) + + +class RankZeroFilter(logging.Filter): + def __init__(self, rank): + super().__init__() + self.rank = rank + + def filter(self, record): + return self.rank == 0 diff --git a/coremltools/optimize/torch/_typing.py b/coremltools/optimize/torch/_typing.py new file mode 100644 index 000000000..227b587e9 --- /dev/null +++ b/coremltools/optimize/torch/_typing.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. 
+# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +from typing import Any as _Any +from typing import Callable as _Callable +from typing import Dict as _Dict + +import torch as _torch + +ParamsDict = _Dict[str, _Any] +TensorCallable = _Callable[[_torch.Tensor], _torch.Tensor] diff --git a/coremltools/optimize/torch/_utils/__init__.py b/coremltools/optimize/torch/_utils/__init__.py new file mode 100644 index 000000000..25c7d28c5 --- /dev/null +++ b/coremltools/optimize/torch/_utils/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause diff --git a/coremltools/optimize/torch/_utils/math_utils.py b/coremltools/optimize/torch/_utils/math_utils.py new file mode 100644 index 000000000..38038ac50 --- /dev/null +++ b/coremltools/optimize/torch/_utils/math_utils.py @@ -0,0 +1,10 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import torch as _torch + + +def rmse_error(a, b): + return _torch.sqrt(_torch.mean(_torch.square(a - b))) diff --git a/coremltools/optimize/torch/_utils/python_utils.py b/coremltools/optimize/torch/_utils/python_utils.py new file mode 100644 index 000000000..33f9cf2fa --- /dev/null +++ b/coremltools/optimize/torch/_utils/python_utils.py @@ -0,0 +1,12 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +from typing import Any as _Any + + +def get_str(val: _Any): + if isinstance(val, float): + return f"{val:.5f}" + return str(val) diff --git a/coremltools/optimize/torch/_utils/state_dict_utils.py b/coremltools/optimize/torch/_utils/state_dict_utils.py new file mode 100644 index 000000000..08ec1d3c0 --- /dev/null +++ b/coremltools/optimize/torch/_utils/state_dict_utils.py @@ -0,0 +1,60 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. 
+# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +from typing import Any, Dict, Mapping, NamedTuple + +import torch + + +class AddMetadataStateDictHook: + """ + Create a hook that will add the given keys/values in the state dict metadata of the module it is registered on + Args: + extra_metadata: the extra state dict to be added to the state dict + allow_overwrite: If True, do not raise if any of the keys are already in the state dict + and would be overwritten by the new state + """ + def __init__(self, extra_metadata: Mapping[str, Any], allow_overwrite: bool = False): + self.extra_metadata = extra_metadata + self.allow_overwrite = allow_overwrite + + def __call__( + self, + module: torch.nn.Module, + destination: Dict[str, torch.Tensor], + prefix: str, + local_metadata: Dict[str, Any], + ) -> Dict[str, torch.Tensor]: + for key, value in self.extra_metadata.items(): + if key in local_metadata and not self.allow_overwrite: + raise ValueError( + f"Metadata key '{key}' would be overwritten as it already exists in the local_metadata dict: {local_metadata[key]}" + ) + local_metadata[key] = value + return destination + + +class LoadStateDictPostHook: + """ + Create a hook that acts on the module after its state_dict has been loaded. + """ + + def __call__(self, module: torch.nn.Module, incompatible_keys: NamedTuple) -> None: + pass + + +def _verify_state_dict(state_dict, expected_keys): + missing_keys = [] + unexpected_keys = [] + for key in state_dict: + if key not in expected_keys: + unexpected_keys.append(key) + if len(unexpected_keys) > 0: + raise ValueError(f"Found unexpected keys {unexpected_keys} in state_dict: {state_dict}") + for key in expected_keys: + if key not in state_dict: + missing_keys.append(key) + if len(missing_keys) > 0: + raise ValueError(f"Missing keys {missing_keys} from state_dict: {state_dict}") diff --git a/coremltools/optimize/torch/_utils/torch_utils.py b/coremltools/optimize/torch/_utils/torch_utils.py new file mode 100644 index 000000000..54acf2575 --- /dev/null +++ b/coremltools/optimize/torch/_utils/torch_utils.py @@ -0,0 +1,100 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. 
+# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import logging as _logging +import operator as _operator +import re as _re +from contextlib import contextmanager +from typing import List as _List +from typing import Tuple as _Tuple +from typing import Union as _Union + +import numpy as _np +import torch as _torch + +_logger = _logging.getLogger(__name__) + + +def list_or_str_to_tensor(alist: _Union[_List[int], str, _torch.Tensor]) -> _torch.Tensor: + if isinstance(alist, _torch.Tensor): + return alist + elif isinstance(alist, str): + # Safety check since we are calling eval + range_str_regex = r"^(range)\(\d+(\,?\s*\d+){0,2}\)$" + assert _re.match(range_str_regex, alist), ( + f"{alist} is invalid.", + "Please provide a string such as 'range(...)'", + ) + try: + alist = eval(alist) + except Exception: + _logger.error( + f"Invalid range str {alist}.", + "Please refer to the documentation for correct usage", + ) + + return _torch.tensor( + _np.ones( + len(alist), + ) + * alist, + dtype=_torch.float32, + requires_grad=False, + ) + + +def maybe_convert_str_to_dtype(dtype: _Union[str, _torch.dtype]) -> _torch.dtype: + _str_to_dtype_map = { + "quint8": _torch.quint8, + "qint8": _torch.qint8, + "float32": _torch.float32, + } + if isinstance(dtype, str): + dtype = dtype.lower() + if dtype in _str_to_dtype_map: + return _str_to_dtype_map[dtype] + else: + raise ValueError(f"Received unsupported dtype: {dtype}") + elif isinstance(dtype, _torch.dtype): + return dtype + else: + raise ValueError(f"Received unrecognized type for dtype: {type(dtype)}") + + +def maybe_convert_str_to_mod_type(mod_type: str): + """ + Convert str to module type + """ + if not isinstance(mod_type, str): + return mod_type + if _re.fullmatch(r"operator\.[a-z]+", mod_type) and hasattr(_operator, mod_type.split(".")[-1]): + return getattr(_operator, mod_type.split(".")[-1]) + elif _re.fullmatch(r"torch\.[a-z]+", mod_type) and hasattr(_torch, mod_type.split(".")[-1]): + return getattr(_torch, mod_type.split(".")[-1]) + elif hasattr(_torch.nn, mod_type): + return getattr(_torch.nn, mod_type) + elif hasattr(_torch.nn.functional, mod_type): + return getattr(_torch.nn.functional, mod_type) + return mod_type + + +@contextmanager +def get_eval_model(model): + train_flag = model.training + try: + yield model.eval() + finally: + model.train(mode=train_flag) + + +def get_parent_child_name(name: str) -> _Tuple[str, str]: + """ + Returns name of parent and child modules from a full module name. + """ + split = name.rsplit(".", 1) + if len(split) == 1: + return "", split[0] + else: + return split[0], split[1] diff --git a/coremltools/optimize/torch/_utils/version_utils.py b/coremltools/optimize/torch/_utils/version_utils.py new file mode 100644 index 000000000..fe689fe7a --- /dev/null +++ b/coremltools/optimize/torch/_utils/version_utils.py @@ -0,0 +1,19 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. 
+# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import torch as _torch +from packaging import version + + +def version_ge(module, target_version): + return version.parse(module.__version__) >= version.parse(target_version) + + +def get_torch_version(): + return _torch.__version__ + + +def is_torch_2(): + return version_ge(_torch, "2.0.0") diff --git a/coremltools/optimize/torch/base_model_optimizer.py b/coremltools/optimize/torch/base_model_optimizer.py new file mode 100644 index 000000000..54a9298e2 --- /dev/null +++ b/coremltools/optimize/torch/base_model_optimizer.py @@ -0,0 +1,74 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import logging as _logging +from abc import ABC as _ABC +from abc import abstractmethod as _abstractmethod +from collections import UserDict as _UserDict +from typing import Optional as _Optional +from typing import Tuple as _Tuple + +import torch as _torch + +from coremltools.optimize.torch._utils.python_utils import get_str as _get_str +from coremltools.optimize.torch.optimization_config import OptimizationConfig as _OptimizationConfig + +_logger = _logging.getLogger(__name__) + + +class _Report(_UserDict): + def __repr__(self): + if len(self.data) < 1: + return "" + column_names = list(self.data.values())[0].keys() + column_names = ["name"] + list(column_names) + print_list = [column_names] + print_list += [ + [f"{key}"] + [_get_str(val[cn]) for cn in column_names[1:]] + for key, val in self.data.items() + ] + col_size = [max(map(len, col)) for col in zip(*print_list)] + ret_str = [ + " | ".join( + f"{' ' * (col_size[idx] - len(val))}{val}" for idx, val in enumerate(print_list[0]) + ) + ] + ret_str += [" | ".join(f"{'-' * cs}" for cs in col_size)] + for pl in print_list[1:]: + ret_str.append( + " | ".join(f"{' ' * (col_size[idx] - len(val))}{val}" for idx, val in enumerate(pl)) + ) + return "\n".join(ret_str) + + +class BaseModelOptimizer(_ABC): + _supported_modules: _Tuple + + def __init__(self, model: _torch.nn.Module, config: _Optional[_OptimizationConfig] = None): + self._model = model + self._config = config + self._step_count = 0 + + @_abstractmethod + def prepare(self, *args, **kwargs) -> _torch.nn.Module: + raise NotImplementedError() + + @_abstractmethod + def step(self): + raise NotImplementedError() + + @_abstractmethod + def finalize( + self, model: _Optional[_torch.nn.Module] = None, inplace: bool = False + ) -> _torch.nn.Module: + raise NotImplementedError() + + @_abstractmethod + def report(self) -> _Report: + raise NotImplementedError() + + @property + def supported_modules(self): + return self._supported_modules diff --git a/coremltools/optimize/torch/optimization_config.py b/coremltools/optimize/torch/optimization_config.py new file mode 100644 index 000000000..3b3124e05 --- /dev/null +++ b/coremltools/optimize/torch/optimization_config.py @@ -0,0 +1,238 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. 
+# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +from collections import OrderedDict as _OrderedDict +from typing import IO as _IO +from typing import Any as _Any +from typing import Callable as _Callable +from typing import Dict as _Dict +from typing import List as _List +from typing import Optional as _Optional +from typing import Type as _Type +from typing import Union as _Union + +import cattrs as _cattrs +import torch as _torch +import yaml as _yaml +from attr import Factory as _Factory +from attr import asdict as _asdict +from attr import define as _define + + +@_define +class ModuleOptimizationConfig: + @classmethod + def from_dict(cls, config_dict: _Dict[str, _Any]) -> "ModuleOptimizationConfig": + """ + Create class from a dictionary of string keys and values. + + Args: + config_dict (:obj:`dict` of :obj:`str` and values): A nested dictionary of strings + and values. + """ + # passing forbid_extra_keys=True doesn't prevent silent failure when keys are mis-spelled + _validate_config_dict(cls, config_dict) + converter = _cattrs.Converter(forbid_extra_keys=True) + return converter.structure_attrs_fromdict(config_dict, cls) + + @classmethod + def from_yaml(cls, yml: _Union[_IO, str]) -> "ModuleOptimizationConfig": + """ + Create class from a yaml stream. + + Args: + yml: An :py:class:`IO` stream containing yaml or a :obj:`str` + path to the yaml file. + """ + return _from_yaml(cls, yml) + + def as_dict(self) -> _Dict[str, _Any]: + """ + Returns the config as a dictionary. + """ + return _asdict(self) + + +@_define +class OptimizationConfig: + global_config: _Optional[ModuleOptimizationConfig] = None + module_type_configs: _Dict[ + _Union[_Callable, str], _Optional[ModuleOptimizationConfig] + ] = _Factory(_OrderedDict) + module_name_configs: _Dict[str, _Optional[ModuleOptimizationConfig]] = _Factory(_OrderedDict) + + def set_global( + self, global_config: _Optional[ModuleOptimizationConfig] + ) -> "OptimizationConfig": + """ + Set the global config. + """ + self.global_config = global_config + return self + + def set_module_type( + self, object_type: _Union[_Callable, str], opt_config: _Optional[ModuleOptimizationConfig] + ) -> "OptimizationConfig": + """ + Set the module level optimization config for a given module type. If the module level optimization config + for an existing module type was already set, the new config will override the old one. + """ + self.module_type_configs[object_type] = opt_config + return self + + def set_module_name( + self, module_name: str, opt_config: _Optional[ModuleOptimizationConfig] + ) -> "OptimizationConfig": + """ + Set the module level optimization config for a given module instance. If the module level optimization config + for an existing module was already set, the new config will override the old one. 
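+
+        A minimal illustrative sketch (the module name ``"conv1"`` and the ``module_config`` object
+        are placeholders; ``module_config`` is assumed to be an instance of a
+        ``ModuleOptimizationConfig`` subclass):
+
+        .. sourcecode:: python
+
+            from coremltools.optimize.torch.optimization_config import OptimizationConfig
+
+            config = OptimizationConfig()
+            config = config.set_module_name("conv1", module_config)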
+ """ + self.module_name_configs[module_name] = opt_config + return self + + def get_module_config( + self, name: str, module: _torch.nn.Module + ) -> _Optional[ModuleOptimizationConfig]: + if name in self.module_name_configs: + return self.module_name_configs[name] + elif type(module) in self.module_type_configs: + return self.module_type_configs[type(module)] + elif module.__class__.__name__ in self.module_type_configs: + return self.module_type_configs[module.__class__.__name__] + else: + return self.global_config + + @classmethod + def from_dict(cls, config_dict: _Dict[str, _Any]) -> _Optional["OptimizationConfig"]: + """ + Create class from a dictionary of string keys and values. + + Args: + config_dict (:obj:`dict` of :obj:`str` and values): A nested dictionary of strings + and values. + """ + # passing forbid_extra_keys=True doesn't prevent silent failure when keys are mis-spelled + _validate_config_dict(cls, config_dict) + return + + @classmethod + def from_yaml(cls, yml: _Union[_IO, str]) -> "OptimizationConfig": + """ + Create class from a yaml stream. + + Args: + yml: An :py:class:`IO` stream containing yaml or a :obj:`str` + path to the yaml file. + """ + return _from_yaml(cls, yml) + + def as_dict(self) -> _Dict[str, _Any]: + """ + Returns the config as a dictionary. + """ + return _asdict(self) + + def _validate_same_params(self, param_names: _List[str]): + """ + This method validates that all the parameters in param_names + have the same value across all the module level configs. + """ + expected_values = None + if self.global_config is not None: + expected_values = { + param_name: getattr(self.global_config, param_name) for param_name in param_names + } + for name, config in self.module_type_configs.items(): + if config is not None: + expected_values = self._validate_expected_value( + expected_values, name, config, param_names + ) + for name, config in self.module_name_configs.items(): + if config is not None: + expected_values = self._validate_expected_value( + expected_values, name, config, param_names + ) + + @staticmethod + def _validate_expected_value( + expected_values: _Dict[str, _Any], + name: str, + config: ModuleOptimizationConfig, + param_names: _List[str], + ): + if expected_values is None: + expected_values = { + param_name: getattr(config, param_name) for param_name in param_names + } + for param_name, expected_val in expected_values.items(): + val = getattr(config, param_name) + if val != expected_val: + raise ValueError( + f"Value of parameter {param_name} cannot " + f"be different between different module level configs." + f"Expected value: {expected_val}, received: {val} " + f"for config {name}." + ) + return expected_values + + +def _structure_from_dict_hook_factory(conversion_cls: _Any) -> _Callable: + def _structure_from_dict_hook( + module_type_dict: _Dict[_Union[_Callable, str], _Any], type: _Any + ): + return_dict = _OrderedDict() + for key, value in module_type_dict.items(): + if value is None: + return_dict[key] = None + else: + if isinstance(value, dict): + return_dict[key] = conversion_cls.from_dict(value) + else: + assert isinstance(value, conversion_cls), ( + "value in module type dict should be either a dict or " + "a module config object." 
+ ) + return_dict[key] = value + return return_dict + return _structure_from_dict_hook + + +def _validate_config_dict(cls: _Type, config_dict: _Dict[str, _Any]): + for key, _ in config_dict.items(): + if not hasattr(cls, key): + raise ValueError(f"Found unrecognized key {key} in config_dict: {config_dict}.") + + +def _from_yaml( + cls: _Union[_Type[OptimizationConfig], _Type[ModuleOptimizationConfig]], yml: _Union[_IO, str] +): + if isinstance(yml, str): + with open(yml, "r") as file: + dict_from_yml = _yaml.safe_load(file) + else: + dict_from_yml = _yaml.safe_load(yml) + assert isinstance(dict_from_yml, dict), ( + "Invalid yaml received. yaml stream should return a dict " + f"on parsing. Received type: {type(dict_from_yml)}." + ) + return cls.from_dict(dict_from_yml) + + +def _validate_module_type_keys_factory(supported_modules): + supported_module_names = [cls.__name__ for cls in supported_modules] + + def validate_module_type_key(instance, attribute, value): + if isinstance(value, str): + assert value in supported_module_names, ( + f"keys for module_type_configs must be one of " + f"{supported_module_names}. Received: {value}." + ) + else: + assert value in supported_modules, ( + f"keys for module_type_configs must be one of " + f"{supported_modules}. Received: {value}." + ) + + return validate_module_type_key diff --git a/coremltools/optimize/torch/palettization/__init__.py b/coremltools/optimize/torch/palettization/__init__.py new file mode 100644 index 000000000..92a00a226 --- /dev/null +++ b/coremltools/optimize/torch/palettization/__init__.py @@ -0,0 +1,38 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +""" +.. _coremltools_optimize_torch_palettization: + +.. include:: palettization_desc.rst + :end-line: 7 + +_`Palettizer` +============= + +Top level APIs +-------------- + +.. autoclass:: coremltools.optimize.torch.palettization.ModuleDKMPalettizerConfig + :members: from_dict, as_dict, from_yaml + +.. autoclass:: coremltools.optimize.torch.palettization.DKMPalettizerConfig + :members: set_global, set_module_type, set_module_name, from_dict, as_dict, from_yaml + +.. autoclass:: coremltools.optimize.torch.palettization.DKMPalettizer + :members: prepare, step, report, finalize + + +_`Palettization layers for DKM` +------------------------------- + +.. autoclass:: coremltools.optimize.torch.palettization.FakePalettize + :no-members: + +""" + +from .fake_palettize import FakePalettize +from .palettization_config import DKMPalettizerConfig, ModuleDKMPalettizerConfig +from .palettizer import DKMPalettizer diff --git a/coremltools/optimize/torch/palettization/_custom_conversion.py b/coremltools/optimize/torch/palettization/_custom_conversion.py new file mode 100644 index 000000000..326796f56 --- /dev/null +++ b/coremltools/optimize/torch/palettization/_custom_conversion.py @@ -0,0 +1,273 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import torch.nn as _nn +import torch.nn.qat as _nnqat + +from ._supported_modules import Conv1d, Embedding, LayerNorm, MultiheadAttention + + +class PalettizationCustomConversionBase(_nn.Module): + """ + PalettizationCustomConversionBase is the base class for palettized model conversion. 
It implements the + get_finalized_weights method which returns the palettized weights from ``LUT`` and ``indices`` + post-palettization. + """ + + def __init__(self): + super().__init__() + + @classmethod + def do_attribute_assertions(cls, observed_module: _nn.Module): + assert hasattr( + observed_module, "qconfig" + ), f"Module {type(observed_module)} has no attribute qconfig" + assert hasattr(observed_module, "activation_post_process"), ( + f"Module {type(observed_module)} has no " f"attribute activation_post_process " + ) + assert hasattr(observed_module, "weight_fake_quant"), ( + f"Module {type(observed_module)} has no attribute " f"weight_fake_quant " + ) + + @classmethod + def get_finalized_weights(cls, observed_module: _nn.Module): + return observed_module.weight_fake_quant.forward(observed_module.weight.detach()) + + @classmethod + def from_observed(cls, observed_module: _nn.Module): + """ + The classes that base-class this class will have to implement the ``from_observed`` method to tell the + convert method what type of a module to return through Pytorch's conversion. + """ + raise NotImplementedError() + + +class LinearPalettizationConversion(PalettizationCustomConversionBase): + """ + Conversion class for Linear. + """ + + def __init__(self): + super().__init__() + + @classmethod + def from_observed(cls, observed_module: _nn.Module): + cls.do_attribute_assertions(observed_module) + finalized_weights = cls.get_finalized_weights(observed_module) + return_module = _nn.Linear( + in_features=observed_module.in_features, + out_features=observed_module.out_features, + bias=observed_module.bias is not None, + device=observed_module.device if hasattr(observed_module, "device") else None, + dtype=observed_module.dtype if hasattr(observed_module, "dtype") else None, + ) + return_module.weight = _nn.Parameter(finalized_weights) + if observed_module.bias is not None: + return_module.bias = _nn.Parameter(observed_module.bias.detach()) + return_module.activation_post_process = observed_module.activation_post_process + return return_module + + +class Conv1dPalettizationConversion(PalettizationCustomConversionBase): + """ + Conversion class for Conv2d. + """ + + def __init__(self): + super().__init__() + + @classmethod + def from_observed(cls, observed_module: _nn.Module): + cls.do_attribute_assertions(observed_module) + finalized_weights = cls.get_finalized_weights(observed_module) + return_module = _nn.Conv1d( + in_channels=observed_module.in_channels, + out_channels=observed_module.out_channels, + kernel_size=observed_module.kernel_size, + stride=observed_module.stride, + padding=observed_module.padding, + dilation=observed_module.dilation, + groups=observed_module.groups, + bias=observed_module.bias is not None, + padding_mode=observed_module.padding_mode, + device=observed_module.device if hasattr(observed_module, "device") else None, + dtype=observed_module.dtype if hasattr(observed_module, "dtype") else None, + ) + return_module.weight = _nn.Parameter(finalized_weights) + if observed_module.bias is not None: + return_module.bias = _nn.Parameter(observed_module.bias.detach()) + return_module.activation_post_process = observed_module.activation_post_process + return return_module + + +class Conv2dPalettizationConversion(PalettizationCustomConversionBase): + """ + Conversion class for Conv2d. 
+ """ + + def __init__(self): + super().__init__() + + @classmethod + def from_observed(cls, observed_module: _nn.Module): + cls.do_attribute_assertions(observed_module) + finalized_weights = cls.get_finalized_weights(observed_module) + return_module = _nn.Conv2d( + in_channels=observed_module.in_channels, + out_channels=observed_module.out_channels, + kernel_size=observed_module.kernel_size, + stride=observed_module.stride, + padding=observed_module.padding, + dilation=observed_module.dilation, + groups=observed_module.groups, + bias=observed_module.bias is not None, + padding_mode=observed_module.padding_mode, + device=observed_module.device if hasattr(observed_module, "device") else None, + dtype=observed_module.dtype if hasattr(observed_module, "dtype") else None, + ) + return_module.weight = _nn.Parameter(finalized_weights) + if observed_module.bias is not None: + return_module.bias = _nn.Parameter(observed_module.bias.detach()) + return_module.activation_post_process = observed_module.activation_post_process + return return_module + + +class Conv3dPalettizationConversion(PalettizationCustomConversionBase): + """ + Conversion class for Conv3d. + """ + + def __init__(self): + super().__init__() + + @classmethod + def from_observed(cls, observed_module: _nn.Module): + cls.do_attribute_assertions(observed_module) + finalized_weights = cls.get_finalized_weights(observed_module) + return_module = _nn.Conv3d( + in_channels=observed_module.in_channels, + out_channels=observed_module.out_channels, + kernel_size=observed_module.kernel_size, + stride=observed_module.stride, + padding=observed_module.padding, + dilation=observed_module.dilation, + groups=observed_module.groups, + bias=observed_module.bias is not None, + padding_mode=observed_module.padding_mode, + device=observed_module.device if hasattr(observed_module, "device") else None, + dtype=observed_module.dtype if hasattr(observed_module, "dtype") else None, + ) + return_module.weight = _nn.Parameter(finalized_weights) + if observed_module.bias is not None: + return_module.bias = _nn.Parameter(observed_module.bias.detach()) + return_module.activation_post_process = observed_module.activation_post_process + return return_module + + +class LayerNormPalettizationConversion(PalettizationCustomConversionBase): + """ + Conversion class for LayerNorm. + """ + + def __init__(self): + super().__init__() + + @classmethod + def from_observed(cls, observed_module: _nn.Module): + cls.do_attribute_assertions(observed_module) + finalized_weights = cls.get_finalized_weights(observed_module) + return_module = _nn.LayerNorm( + normalized_shape=observed_module.normalized_shape, + eps=observed_module.eps, + elementwise_affine=observed_module.elementwise_affine, + device=observed_module.device if hasattr(observed_module, "device") else None, + dtype=observed_module.dtype if hasattr(observed_module, "dtype") else None, + ) + if observed_module.elementwise_affine: + return_module.weight = _nn.Parameter(finalized_weights) + if observed_module.bias: + return_module.bias = _nn.Parameter(observed_module.bias.detach()) + return_module.activation_post_process = observed_module.activation_post_process + return return_module + + +class MultiheadAttentionPalettizationConversion(PalettizationCustomConversionBase): + """ + Conversion class for MultiheadAttention. 
+ """ + + def __init__(self): + super().__init__() + + @classmethod + def from_observed(cls, observed_module: _nn.Module): + cls.do_attribute_assertions(observed_module) + finalized_weights = cls.get_finalized_weights(observed_module) + return_module = _nn.MultiheadAttention( + embed_dim=observed_module.embed_dim, + num_heads=observed_module.num_heads, + dropout=observed_module.dropout, + bias=observed_module.bias is not None, + add_bias_kv=observed_module.add_bias_kv, + add_zero_attn=observed_module.add_zero_attn, + kdim=observed_module.kdim, + vdim=observed_module.vdim, + batch_first=observed_module.batch_first, + device=observed_module.device if hasattr(observed_module, "device") else None, + dtype=observed_module.dtype if hasattr(observed_module, "dtype") else None, + ) + return_module.weight = _nn.Parameter(finalized_weights) + return_module.bias = _nn.Parameter(observed_module.bias.detach()) + if observed_module.add_bias_kv: + return_module.bias_k = _nn.Parameter(observed_module.bias_k.detach()) + return_module.bias_v = _nn.Parameter(observed_module.bias_v.detach()) + else: + return_module.bias_k = return_module.bias_v = None + return_module.activation_post_process = observed_module.activation_post_process + return return_module + + +class EmbeddingPalettizationConversion(PalettizationCustomConversionBase): + """ + Conversion class for Embedding. + """ + + def __init__(self): + super().__init__() + + @classmethod + def from_observed(cls, observed_module: _nn.Module): + cls.do_attribute_assertions(observed_module) + finalized_weights = cls.get_finalized_weights(observed_module) + return_module = _nn.Embedding( + num_embeddings=observed_module.num_embeddings, + embedding_dim=observed_module.embedding_dim, + padding_idx=observed_module.padding_idx, + max_norm=observed_module.max_norm, + norm_type=observed_module.norm_type, + scale_grad_by_freq=observed_module.scale_grad_by_freq, + sparse=observed_module.sparse, + _weight=None, + device=observed_module.device if hasattr(observed_module, "device") else None, + dtype=observed_module.dtype if hasattr(observed_module, "dtype") else None, + ) + return_module.weight = _nn.Parameter(finalized_weights) + return_module.activation_post_process = observed_module.activation_post_process + return return_module + + +# Dictionary to map nnqat modules to Custom Conversion class. Each of these Custom Conversion classes +# implement a ``from_observed`` method which is used to create original modules from qat modules. +PALETTIZATION_CONVERT_DICT = { + "observed_to_quantized_custom_module_class": { + _nnqat.Linear: LinearPalettizationConversion, + _nnqat.Conv2d: Conv2dPalettizationConversion, + _nnqat.Conv3d: Conv3dPalettizationConversion, + Conv1d: Conv1dPalettizationConversion, + LayerNorm: LayerNormPalettizationConversion, + Embedding: EmbeddingPalettizationConversion, + MultiheadAttention: MultiheadAttentionPalettizationConversion, + } +} diff --git a/coremltools/optimize/torch/palettization/_efficient_kmeans.py b/coremltools/optimize/torch/palettization/_efficient_kmeans.py new file mode 100644 index 000000000..21305b745 --- /dev/null +++ b/coremltools/optimize/torch/palettization/_efficient_kmeans.py @@ -0,0 +1,349 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. 
+# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import numpy as _np +import torch as _torch +import torch.distributed as _dist + + +class _EfficientKMeans: + """ + _EfficientKMeans is primarily used by palettization to perform a k-means operation. This class also has an in-house + implementation of k-means, called ``kmeans_pp`` which runs entirely on GPU and is ~10x faster than sklearn's API. + """ + + @staticmethod + def get_cluster_avg(n_clusters: int, indices, vals): + v_sum = ( + _torch.zeros([n_clusters] + list(vals[0].size()), dtype=vals.dtype) + .to(vals.device) + .index_add_(0, indices, vals) + ) + v_numel = ( + _torch.zeros(n_clusters, dtype=_torch.int) + .to(vals.device) + .index_add_(0, indices, _torch.ones(len(vals), dtype=_torch.int).to(vals.device)) + ) + v_avg = v_sum / v_numel.reshape(-1, 1) + + return v_avg + + @staticmethod + def x_c_dist(weights: _torch.Tensor, centroids: _torch.Tensor): + """ + Method to calculate distance between weights and centroids. + """ + return _torch.cdist(weights, centroids).square() + + def __init__( + self, + n_clusters: int, + init: str, + n_init: int = 0, + labels=None, + verbose: int = 0, + max_iter: int = 100, + tol: float = 0.0001, + error_bnd: int = 0, + ): + self.n_clusters = n_clusters + self.n_init = n_init + self.max_iter = max_iter + self.tol = tol + self.verbose = verbose + self.labels_ = labels + self.inertia_ = None + self.cluster_centers_ = init + self.error_bnd = error_bnd + + assert self.max_iter > 0 + assert self.n_clusters > 0 + + def kmeans_pp(self, n_init: str, X: _torch.Tensor, random_state: int, offset: int = 0): + """ + In-house implementation of kmeans that runs entirely on GPU and is ~10x faster. + """ + assert ( + len(X) >= self.n_clusters + ), f"Weight fewer points than the number of clusters: {len(X)} vs. {self.n_clusters}" + + S = X[offset:] + + self.inertia_ = None + + width = (len(S) - 1) // (random_state + 1) + + for i in range(n_init): + idx = int(i / n_init * width) + C = S[idx].unsqueeze(0) + + for j in range(len(C), self.n_clusters): + min_error, labels = self.__class__.x_c_dist(S, C).min(dim=-1) + + while True: + max_dist_idx = _torch.argmax(min_error) + assert min_error[max_dist_idx] >= 0, "Cannot find a next candidate" + + candidate_C = S[max_dist_idx] + if candidate_C in set(C): + _dist[max_dist_idx] = -1 + else: + C = _torch.vstack((C, candidate_C)) + break + + if len(set(C)) != self.n_clusters: + return self.kmeans_pp(n_init, X, random_state, offset + 1) + + min_error, labels = self.__class__.x_c_dist(X, C).min(dim=-1) + cur_cost = min_error.sum() + + if self.inertia_ is None or self.inertia_ > cur_cost: + self.inertia_ = cur_cost + self.cluster_centers_ = C + self.labels_ = labels + + def cost(self, i: int, j: int, new_cluster_cost: float): + if i > j: + cur_cost = 0 + else: + size = j - i + 1 + sum_i_j = self.prefix_x[j] - (self.prefix_x[i - 1] if i >= 1 else 0) + sum2_i_j = self.prefix_x2[j] - (self.prefix_x2[i - 1] if i >= 1 else 0) + mean_i_j = sum_i_j / size + cc_i_j = -mean_i_j * mean_i_j * size + sum2_i_j + + if cc_i_j < 0: + cc_i_j = 0 + + cur_cost = cc_i_j * (1 - self.tol) + new_cluster_cost * self.tol + + return cur_cost + + def backtrace(self, P, T, i, m): + if m >= 0: + P = [m] + P + + if m == 0: + return P + + return self.backtrace(P, T, i - 1, T[i - 1][m - 1]) + + def fit(self, X: _torch.Tensor): + """ + Method to run kmeans operation. 
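+
+        A minimal illustrative sketch (the weight tensor is a placeholder; ``"kmeans++"`` selects
+        the in-house GPU implementation described above):
+
+        .. sourcecode:: python
+
+            weights = _torch.randn(1024, 1)
+            kmeans = _EfficientKMeans(n_clusters=16, init="kmeans++", n_init=10)
+            kmeans.fit(weights)
+            lut, labels = kmeans.cluster_centers_, kmeans.labels_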
+ """ + N = len(X) + if isinstance(self.cluster_centers_, str): + if "kmeans++" in self.cluster_centers_: + + if _dist.is_available() and _dist.is_initialized(): + world_size = _dist.get_world_size() + rank = _dist.get_rank() + else: + world_size = 1 + rank = 0 + + if "cpu" in self.cluster_centers_: + import sklearn.cluster + + kmeans = sklearn.cluster.KMeans( + n_init=max(10, self.n_init // world_size), + n_clusters=self.n_clusters, + max_iter=self.max_iter, + random_state=rank + 1, + verbose=0, + tol=self.tol, + ).fit(X.cpu().numpy()) + self.inertia_ = _torch.Tensor([kmeans.inertia_]).to(X.device) + self.labels_ = _torch.from_numpy(kmeans.labels_).to(_torch.int).to(X.device) + self.cluster_centers_ = None + else: + self.kmeans_pp(self.n_init, X, rank + 1) + + self.fit(X) + + bcast_rank = self.get_best_rank(self.inertia_, _torch.argmin) + if bcast_rank is not None: + _dist.broadcast(self.cluster_centers_, bcast_rank) + _dist.broadcast(self.labels_, bcast_rank) + + return self + + elif self.cluster_centers_ == "opt1d": + nX, sort_order = _torch.sort(X, dim=0) + nX = nX.cpu().numpy() + rN = range(N) + + self.prefix_x = _np.cumsum(nX) + self.prefix_x2 = _np.cumsum(_np.square(nX)) + + new_cluster_cost = 0 # 2 * self.cost(0, N - 1, 0) + + num_D = self.n_clusters if self.verbose >= 2 else 2 + + D = _np.full((num_D, N), _np.inf) + D[0] = [self.cost(0, m, new_cluster_cost) for m in rN] + T = _np.full((self.n_clusters, N), -1, dtype=int) + T[0] = [0 for m in rN] + + opt_t_cost = D[0][-1] + opt_n_clusters = 0 + for c in range(1, self.n_clusters): + if True: + + def lookup(m, j): + return -( + D[(c - 1) % num_D][min(j - 1, m)] + + self.cost(j, m, new_cluster_cost) + ) + + R = self.smawk(rN, rN, lookup) + + for k, v in R.items(): + D[c % num_D][k] = -lookup(k, v) + T[c][k] = v + else: + for m in range(1, N): + for j in range(m): + cur_cost = D[(c - 1) % num_D][j] + self.cost( + j + 1, m, new_cluster_cost + ) + if cur_cost < D[c % num_D][m]: + D[c % num_D][m] = cur_cost + T[c][m] = j + 1 + + if opt_t_cost > D[c % num_D][-1]: + opt_t_cost = D[c % num_D][-1] + opt_n_clusters = c + + P = [] + P = self.backtrace(P, T, opt_n_clusters, T[opt_n_clusters][-1]) + P.append(N) + + self.labels_ = [] + self.cluster_centers_ = [] + for i in range(len(P) - 1): + v = nX[P[i] : P[i + 1]] + if len(v): + self.labels_ += [len(self.cluster_centers_)] * len(v) + self.cluster_centers_.append([_np.mean(v)]) + + self.n_clusters = len(self.cluster_centers_) + self.cluster_centers_ = _torch.from_numpy(_np.array(self.cluster_centers_)).to( + device=X.device, dtype=X.dtype + ) + min_error, self.labels_ = self.__class__.x_c_dist(X, self.cluster_centers_).min( + dim=-1 + ) + self.inertia_ = min_error.sum() + + else: + self.inertia_ = None + + for i in range(self.max_iter): + + self.cluster_centers_ = self.__class__.get_cluster_avg( + self.n_clusters, self.labels_, X + ) + + nan_centers = self.cluster_centers_.isnan() + if nan_centers.any(): + self.kmeans_pp(self.n_init, X, i) + continue + + self.x_c_dist = self.__class__.x_c_dist(X, self.cluster_centers_) + min_error, self.labels_ = self.x_c_dist.min(dim=-1) + cur_inertia = min_error.sum() + + if self.error_bnd and _torch.sqrt(cur_inertia / N) < self.error_bnd: + unique, counts = _torch.unique(self.labels_, return_counts=True) + idx = unique[counts.argmin()] + + reduce_cluster_centers_ = self.cluster_centers_.clone() + reduce_cluster_centers_[idx] = _np.nan + + reduce_cluster_centers_ = reduce_cluster_centers_[ + ~_torch.isnan(reduce_cluster_centers_) + ].view(-1, 1) + 
reduce_min_error, reduce_labels_ = self.__class__.x_c_dist( + X, reduce_cluster_centers_ + ).min(dim=-1) + reduce_inertia = reduce_cluster_centers_.sum() + self.rmse_error = _torch.sqrt(reduce_inertia / N) + + if self.rmse_error < self.error_bnd: + self.cluster_centers_ = reduce_cluster_centers_ + self.labels_ = reduce_labels_ + self.n_clusters = len(self.cluster_centers_) + continue + + if self.inertia_ is None or abs(self.inertia_ - cur_inertia) > self.tol: + self.inertia_ = cur_inertia + else: + self.inertia_ = cur_inertia + break + + return self + + def get_best_rank(self, metric, func=_torch.argmin): + if _dist.is_available() and _dist.is_initialized(): + world_size = _dist.get_world_size() + if world_size > 1: + tensor_list = [_torch.zeros_like(metric) for _ in range(world_size)] + _dist.all_gather(tensor_list, metric) + bcast_rank = func(_torch.Tensor(tensor_list)) + + return bcast_rank + + return None + + def rmse_error(self, a, b): + return _torch.sqrt(_torch.mean(_torch.square(a - b))) + + def smawk(self, rows, cols, lookup): + """Search for row-maxima in a 2d totally monotone matrix M[i,j]. + The input is specified by a list of row indices, a list of column + indices, and a function "lookup" satisfying lookup(i,j) = M[i,j]. + The matrix must satisfy the totally monotone ordering property: + if i occurs before i' in rows, j occurs before j' in cols, and + M[i,j] < M[i,j'], then also M[i',j] < M[i',j']. The result is + returned as a dictionary mapping row i to the column j containing + the largest value M[i,j]. Ties are broken in favor of earlier + columns. The number of calls to lookup is O(len(rows)+len(cols)).""" + + # base case of recursion + if not rows: + return {} + + # reduce phase: make number of columns at most equal to number of rows + stack = [] + for c in cols: + while len(stack) >= 1 and lookup(rows[len(stack) - 1], stack[-1]) < lookup( + rows[len(stack) - 1], c + ): + stack.pop() + if len(stack) != len(rows): + stack.append(c) + + cols = stack + + # recursive call to search for every odd row + result = self.smawk([rows[i] for i in range(1, len(rows), 2)], cols, lookup) + + # go back and fill in the even rows + c = 0 + for r in range(0, len(rows), 2): + row = rows[r] + if r == len(rows) - 1: + cc = len(cols) - 1 # if r is last row, search through last col + else: + cc = c # otherwise only until pos of max in row r+1 + target = result[rows[r + 1]] + while cols[cc] != target: + cc += 1 + result[row] = max([(lookup(row, cols[x]), -x, cols[x]) for x in range(c, cc + 1)])[2] + c = cc + + return result diff --git a/coremltools/optimize/torch/palettization/_fake_palettizer_tensor_hook.py b/coremltools/optimize/torch/palettization/_fake_palettizer_tensor_hook.py new file mode 100644 index 000000000..41b8272bd --- /dev/null +++ b/coremltools/optimize/torch/palettization/_fake_palettizer_tensor_hook.py @@ -0,0 +1,114 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import torch as _torch +import torch.nn.functional as _F + + +class _FakePalettizationTensorHook: + """ + _FakePalettizationTensorHook is the class to assist in using CPU when we only want to utilize a certain percentage + of the GPU memory. 
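+ A hedged usage sketch (editor-added illustration; the tensor size below is hypothetical): the ``init_pack``/``init_unpack`` pair is intended to be passed to ``torch.autograd.graph.saved_tensors_hooks`` so that saved activations matching ``size_list`` are staged on pinned CPU memory during the backward pass, mirroring how ``FakePalettize.diff_palettize`` uses this hook.
+
+ .. code-block:: python
+
+     import torch
+
+     hook = _FakePalettizationTensorHook(
+         size_list=[torch.Size([1024, 16])], use_cpu=True, name="example", palett_tau=0.0001
+     )
+     with torch.autograd.graph.saved_tensors_hooks(hook.init_pack, hook.init_unpack):
+         ...  # forward pass whose saved tensors include the listed size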
+ """ + gc_trigger = None + + def __init__( + self, size_list, use_cpu: bool = False, name: str = None, palett_tau: float = 0.0001 + ): + self.name = name + self.size_list = size_list + self.tensor_list = [None] * len(size_list) + self.device_list = [None] * len(size_list) + self.use_cpu = use_cpu + self.palett_tau = palett_tau + + def init_pack(self, x: _torch.Tensor): + """ + Method that initialises packing and saving values to CPU. + """ + if x.size() in self.size_list: + idx = self.size_list.index(x.size()) + + if self.tensor_list[idx] is None: + self.device_list[idx] = x.device + + if self.use_cpu: + self.tensor_list[idx] = _torch.empty( + x.size(), dtype=x.dtype, layout=x.layout, pin_memory=True + ) + self.tensor_list[idx].copy_(x) + else: + self.tensor_list[idx] = x + + elif _torch.equal(self.tensor_list[idx][0].to(self.device_list[idx]), x[0]): + pass + else: + assert False + + return idx + + return x + + def init_unpack(self, x: _torch.Tensor): + """ + Method that initialises un-packing and retrieving values from CPU. + """ + if isinstance(x, int): + idx = x + + assert self.tensor_list[idx] is not None + self.tensor_list[idx] = self.tensor_list[idx].to( + self.device_list[idx], non_blocking=True + ) + return self.tensor_list[idx] + + return x + + def reuse_pack(self, x: _torch.Tensor): + """ + Method to pack reused variables on to CPU. + """ + if x.layout != _torch.sparse_coo and x.size() in self.size_list: + idx = self.size_list.index(x.size()) + + assert self.size_list[idx] is not None + + header = self.tensor_list[idx][0].to(self.device_list[idx]) + + if _torch.equal(x[0], -header * header / self.palett_tau): + return idx, "x_c_dist" + elif _torch.equal(x[0], _F.softmax(-header * header / self.palett_tau)): + return idx, "softmax" + else: + return x.to_sparse(), "sparse" + + return x + + def reuse_unpack(self, x: _torch.Tensor): + """ + Method to unpack reused variables from CPU. + """ + if isinstance(x, tuple): + obj, op = x + if isinstance(obj, int): + idx = obj + assert self.tensor_list[idx] is not None + self.tensor_list[idx] = self.tensor_list[idx].to(self.device_list[idx]) + + if op == "softmax": + val = self.tensor_list[idx] * self.tensor_list[idx] / self.palett_tau + return _F.softmax(-val, dim=1) + elif op == "x_c_dist": + return -self.tensor_list[idx] * self.tensor_list[idx] / self.palett_tau + elif op == "transpose": + return self.tensor_list[idx].T + else: + assert False + elif op == "sparse": + return obj.to_dense() + return x + + def debug_hook(self, x: _torch.Tensor): + return x diff --git a/coremltools/optimize/torch/palettization/_partitioner.py b/coremltools/optimize/torch/palettization/_partitioner.py new file mode 100644 index 000000000..599519904 --- /dev/null +++ b/coremltools/optimize/torch/palettization/_partitioner.py @@ -0,0 +1,188 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +from typing import Tuple as _Tuple + +import torch as _torch + +from ._efficient_kmeans import _EfficientKMeans + + +class _Partitioner: + """ + Internal class that manages partitioning. The ``FakePalettize`` class base classes the ``_Partitioner`` class + and all the partitioning logic is controlled by this class. 
+ """ + + def __init__( + self, + n_bits: int, + enforce_zero: bool, + partition_size: int, + cluster_dim: int, + cluster_permute: _Tuple, + palett_tau: float, + kmeans_init: str, + prune_threshold: float, + kmeans_opt1d_threshold: int, + add_extra_centroid: bool, + ): + self.centroids_init = [kmeans_init] + if add_extra_centroid: + self.n_clusters = [2 ** int(n_bits) + 1] + else: + self.n_clusters = [2 ** int(n_bits)] + self.labels_init = [None] + self.enforce_zero = [enforce_zero] + self.partitions = [] + self.partition_size = partition_size + self.cluster_dim = cluster_dim + self.cluster_permute = cluster_permute + self.prune_threshold = float(prune_threshold) + + self.kmeans_init = kmeans_init + self.kmeans_opt1d_threshold = kmeans_opt1d_threshold + self.palett_tau = palett_tau + + def create_partitions(self, weights: _torch.Tensor): + """ + Method to create partitions in the weights. These partitions can be used to run channel level palettization. + """ + num_channels = len(weights) + numel_per_channel = _torch.numel(weights[0]) + num_channels_per_partition = min( + num_channels, max(1, int(self.partition_size / numel_per_channel)) + ) + + self.partitions = [ + (n, min(n + num_channels_per_partition, num_channels)) + for n in range(0, num_channels, num_channels_per_partition) + ] + num_partitions = len(self.partitions) + + if self.centroids_init[0] == "auto": + # if auto then pick either init method + numel_per_partition = numel_per_channel * num_channels_per_partition + self.centroids_init[0] = ( + "opt1d" + if ( + numel_per_partition <= self.n_clusters[0] + or numel_per_partition <= self.kmeans_opt1d_threshold + ) + and self.cluster_dim == 1 + else "cpu.kmeans++" + ) + + self.centroids_init = self.centroids_init * num_partitions + self.n_clusters = self.n_clusters * num_partitions + self.labels_init = self.labels_init * num_partitions + self.enforce_zero = self.enforce_zero * num_partitions + + assert ( + num_channels_per_partition * numel_per_channel + >= min(self.n_clusters) * self.cluster_dim + ), f"The number of clusters ({self.n_clusters}) and/or the cluster dim ({self.cluster_dim}) is TOO big" + + def get_partition_kmeans( + self, weights: _torch.Tensor, partition_index: int, partition: int, max_iter: int, init: str + ): + """ + Method to get kmeans for a particular partition. + """ + Y = weights[partition[0] : partition[1]].detach() + cY, pad = self.flatten(Y) + + kmeans = _EfficientKMeans( + n_clusters=self.n_clusters[partition_index], + init=init, + labels=self.labels_init[partition_index], + n_init=10, + max_iter=max_iter, + ).fit(cY) + + if self.enforce_zero[partition_index]: + zero_point = ( + _torch.zeros(kmeans.cluster_centers_[0].size()) + .to(kmeans.cluster_centers_.device) + .unsqueeze(0) + ) + zero_idx = _torch.argmin(_torch.cdist(kmeans.cluster_centers_, zero_point)) + kmeans.cluster_centers_[zero_idx] = zero_point + + weights[partition[0] : partition[1]] = self.deflatten( + kmeans.cluster_centers_[kmeans.labels_], Y.size(), pad + ) + + return kmeans + + def init_partitions(self, weights: _torch.Tensor): + """ + Method to initialize the partitions and set the k-means. Called during first iteration of palettization in the + forward method of ``FakePalettize``. 
+ """ + with _torch.no_grad(): + self.create_partitions(weights) + for i, partition in enumerate(self.partitions): + kmeans = self.get_partition_kmeans( + weights.clone(), i, partition, max_iter=100, init=self.centroids_init[i] + ) + + self.centroids_init[i] = kmeans.cluster_centers_ + self.labels_init[i] = kmeans.labels_ + self.n_clusters[i] = kmeans.n_clusters + + def flatten(self, weight_partition: _torch.Tensor): + """ + Method to flatten a particular weight partition. + """ + permute = self.cluster_permute + dim = self.cluster_dim + + if permute and len(permute) == len(weight_partition.size()): + weight_partition = weight_partition.permute(permute) + + num_misalignment = _torch.numel(weight_partition) % dim + + pad = None + if num_misalignment: + weight_partition = weight_partition.flatten() + pad = weight_partition[-num_misalignment:] + weight_partition = weight_partition[:-num_misalignment] + + return weight_partition.reshape(-1, dim), pad + + def deflatten(self, weight_partition: _torch.Tensor, target_size: _Tuple, pad: _torch.Tensor): + """ + Method to deflatten a particular weight partition. + """ + permute = self.cluster_permute + + if pad is not None: + weight_partition = _torch.cat([weight_partition.flatten(), pad]) + + if permute and len(permute) == len(target_size): + cur_shape = [target_size[i] for i in permute] + + weight_partition = weight_partition.reshape(cur_shape) + weight_partition = weight_partition.permute( + _torch.argsort(_torch.Tensor(permute)).tolist() + ) + assert weight_partition.size() == target_size + + return weight_partition.reshape(target_size) + + # Do not use _load_from_state_dict as this class doesn't call super + # So it makes multiple inheritance easier to apprehend in child classes + def _load_from_state_dict_( + self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ): + self.cluster_permute = state_dict.pop(prefix + "permute") + self.partitions = state_dict.pop(prefix + "partitions") + + # Do not use _save_to_state_dict as this class doesn't call super + # So it makes multiple inheritance easier to apprehend in child classes + def _save_to_state_dict_(self, destination, prefix, keep_vars): + destination[prefix + "permute"] = self.cluster_permute + destination[prefix + "partitions"] = self.partitions diff --git a/coremltools/optimize/torch/palettization/_supported_modules.py b/coremltools/optimize/torch/palettization/_supported_modules.py new file mode 100644 index 000000000..66c6e82d6 --- /dev/null +++ b/coremltools/optimize/torch/palettization/_supported_modules.py @@ -0,0 +1,284 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import torch as _torch +import torch.nn as _nn +import torch.nn.functional as _F + +from .palettization_config import SUPPORTED_PYTORCH_QAT_MODULES + + +def _get_palettization_qat_mappings(): + """ + _get_palettization_qat_mappings creates qat_module_mappings supported by coremltools.optimize.torch for palettization. We + support three modules already in DEFAULT_QAT_MODULE_MAPPINGS, namely, nn.Linear, nn.Conv2d and + nn.Conv3d. Additionally, we have added support for preparation of nn.Conv1d, nn.LayerNorm, + nn.MultiheadAttention and nn.Embedding modules. 
+ """ + qat_module_mappings = ( + _torch.quantization.quantization_mappings.get_default_qat_module_mappings() + ) + for k in list(qat_module_mappings.keys()): + if k not in SUPPORTED_PYTORCH_QAT_MODULES: + del qat_module_mappings[k] + qat_module_mappings[Conv1d._FLOAT_MODULE] = Conv1d + qat_module_mappings[LayerNorm._FLOAT_MODULE] = LayerNorm + qat_module_mappings[MultiheadAttention._FLOAT_MODULE] = MultiheadAttention + qat_module_mappings[Embedding._FLOAT_MODULE] = Embedding + + return qat_module_mappings + + +class Conv1d(_nn.Conv1d): + _FLOAT_MODULE = _nn.Conv1d + + def forward(self, input): + qweight = self.weight_fake_quant(self.weight) + if self.padding_mode != "zeros": + return _F.conv1d( + _F.pad(input, self._reversed_padding_repeated_twice, mode=self.padding_mode), + qweight, + self.bias, + self.stride, + (0,), + self.dilation, + self.groups, + ) + return _F.conv1d( + input, qweight, self.bias, self.stride, self.padding, self.dilation, self.groups + ) + + @classmethod + def from_float(cls, mod): + r"""Create a qat module from a float module or qparams_dict + + Args: `mod` a float module, either produced by torch.quantization utilities + or directly from user + """ + assert type(mod) == cls._FLOAT_MODULE, ( + "qat." + cls.__name__ + ".from_float only works for " + cls._FLOAT_MODULE.__name__ + ) + assert hasattr(mod, "qconfig"), "Input float module must have qconfig defined" + assert mod.qconfig, "Input float module must have a valid qconfig" + + qconfig = mod.qconfig + qat = cls( + mod.in_channels, + mod.out_channels, + mod.kernel_size, + stride=mod.stride, + padding=mod.padding, + dilation=mod.dilation, + groups=mod.groups, + bias=mod.bias is not None, + padding_mode=mod.padding_mode, + ) + qat.qconfig = qconfig + qat.weight_fake_quant = qconfig.weight() + + wnorm = None + + for k, hook in mod._forward_pre_hooks.items(): + if "WeightNorm" in str(hook): + wnorm = hook + + if wnorm: + qat = _nn.utils.weight_norm(qat, name=wnorm.name, dim=wnorm.dim) + + for name, param in mod.named_parameters(recurse=False): + setattr(qat, name, param) + + if wnorm: + _nn.utils.remove_weight_norm(mod) + + return qat + + +class LayerNorm(_nn.LayerNorm): + _FLOAT_MODULE = _nn.LayerNorm + + def forward(self, input): + return _F.layer_norm( + input, + self.normalized_shape, + self.weight_fake_quant(self.weight) if self.elementwise_affine else self.weight, + self.bias, + self.eps, + ) + + @classmethod + def from_float(cls, mod): + r"""Create a qat module from a float module or qparams_dict + + Args: `mod` a float module, either produced by torch.quantization utilities + or directly from user + """ + assert type(mod) == cls._FLOAT_MODULE, ( + "qat." 
+ cls.__name__ + ".from_float only works for " + cls._FLOAT_MODULE.__name__ + ) + assert hasattr(mod, "qconfig"), "Input float module must have qconfig defined" + assert mod.qconfig, "Input float module must have a valid qconfig" + + assert isinstance( + mod.weight, _nn.Parameter + ), "CANNOT be prepared for palettization: weight is NOT learnable" + + qconfig = mod.qconfig + qat = cls(mod.normalized_shape, eps=mod.eps, elementwise_affine=mod.elementwise_affine) + qat.qconfig = qconfig + + if qat.elementwise_affine: + qat.weight_fake_quant = qconfig.weight() + + for name, param in mod.named_parameters(recurse=False): + setattr(qat, name, param) + + assert qat.elementwise_affine == (qat.weight is not None) + return qat + + +class Embedding(_nn.Embedding): + _FLOAT_MODULE = _nn.Embedding + + def forward(self, input): + qweight = self.weight_fake_quant(self.weight) + return _F.embedding( + input, + qweight, + self.padding_idx, + self.max_norm, + self.norm_type, + self.scale_grad_by_freq, + self.sparse, + ) + + @classmethod + def from_float(cls, mod): + r"""Create a qat module from a float module or qparams_dict + + Args: `mod` a float module, either produced by torch.quantization utilities + or directly from user + """ + assert type(mod) == cls._FLOAT_MODULE, ( + "qat." + cls.__name__ + ".from_float only works for " + cls._FLOAT_MODULE.__name__ + ) + assert hasattr(mod, "qconfig"), "Input float module must have qconfig defined" + assert mod.qconfig, "Input float module must have a valid qconfig" + + assert isinstance( + mod.weight, _nn.Parameter + ), "CANNOT be prepared for palettization: weight is NOT learnable" + + qconfig = mod.qconfig + qat = cls( + mod.num_embeddings, + mod.embedding_dim, + mod.padding_idx, + max_norm=mod.max_norm, + norm_type=mod.norm_type, + scale_grad_by_freq=mod.scale_grad_by_freq, + sparse=mod.sparse, + _weight=None, + ) + qat.qconfig = qconfig + qat.weight_fake_quant = qconfig.weight() + + for name, param in mod.named_parameters(recurse=False): + setattr(qat, name, param) + + return qat + + +class MultiheadAttention(_nn.MultiheadAttention): + _FLOAT_MODULE = _nn.MultiheadAttention + + def forward(self, query, key, value, key_padding_mask=None, need_weights=True, attn_mask=None): + if not self._qkv_same_embed_dim: + return _F.multi_head_attention_forward( + query, + key, + value, + self.embed_dim, + self.num_heads, + self.in_proj_weight, + self.in_proj_bias, + self.bias_k, + self.bias_v, + self.add_zero_attn, + self.dropout, + self.out_proj.weight_fake_quant(self.out_proj.weight), + self.out_proj.bias, + training=self.training, + key_padding_mask=key_padding_mask, + need_weights=need_weights, + attn_mask=attn_mask, + use_separate_proj_weight=True, + q_proj_weight=self.q_proj_weight_fake_quant(self.q_proj_weight), + k_proj_weight=self.k_proj_weight_fake_quant(self.k_proj_weight), + v_proj_weight=self.v_proj_weight_fake_quant(self.v_proj_weight), + ) + else: + return _F.multi_head_attention_forward( + query, + key, + value, + self.embed_dim, + self.num_heads, + self.in_proj_weight_fake_quant(self.in_proj_weight), + self.in_proj_bias, + self.bias_k, + self.bias_v, + self.add_zero_attn, + self.dropout, + self.out_proj.weight_fake_quant(self.out_proj.weight), + self.out_proj.bias, + training=self.training, + key_padding_mask=key_padding_mask, + need_weights=need_weights, + attn_mask=attn_mask, + ) + + @classmethod + def from_float(cls, mod): + r"""Create a palettization module from a float module or qparams_dict + + Args: `mod` a float module, either produced by 
torch.quantization utilities + or directly from user + """ + assert type(mod) == cls._FLOAT_MODULE, ( + "qat." + cls.__name__ + ".from_float only works for " + cls._FLOAT_MODULE.__name__ + ) + assert hasattr(mod, "qconfig"), "Input float module must have qconfig defined" + assert mod.qconfig, "Input float module must have a valid qconfig" + + qconfig = mod.qconfig + qat = cls( + mod.embed_dim, + mod.num_heads, + mod.dropout, + bias=hasattr(mod, "in_proj_bias"), + add_bias_kv=mod.bias_k is not None, + add_zero_attn=mod.add_zero_attn, + kdim=mod.kdim, + vdim=mod.vdim, + qconfig=qconfig, + ) + qat.qconfig = qconfig + if not qat._qkv_same_embed_dim: + qat.q_proj_weight_fake_quant = qconfig.weight() + qat.k_proj_weight_fake_quant = qconfig.weight() + qat.v_proj_weight_fake_quant = qconfig.weight() + else: + qat.in_proj_weight_fake_quant = qconfig.weight() + + qat.out_proj.weight_fake_quant = qconfig.weight() + + for name, param in mod.named_parameters(recurse=False): + setattr(qat, name, param) + + for name, param in mod.out_proj.named_parameters(recurse=False): + setattr(qat.out_proj, name, param) + + return qat diff --git a/coremltools/optimize/torch/palettization/fake_palettize.py b/coremltools/optimize/torch/palettization/fake_palettize.py new file mode 100644 index 000000000..5e4f3a7ab --- /dev/null +++ b/coremltools/optimize/torch/palettization/fake_palettize.py @@ -0,0 +1,452 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import contextlib +import gc + +import torch as _torch +import torch.nn.functional as _F +from torch.ao.quantization.observer import ObserverBase as _ObserverBase +from torch.quantization import FakeQuantize as _FakeQuantize + +from ._efficient_kmeans import _EfficientKMeans +from ._fake_palettizer_tensor_hook import _FakePalettizationTensorHook +from ._partitioner import _Partitioner +from .palettization_config import DEFAULT_PALETTIZATION_ADVANCED_OPTIONS + + +class FakePalettize(_FakeQuantize, _Partitioner): + """ + A class that implements the `DKM algorithm `_. + + Example: + .. code-block:: python + + from collections import OrderedDict + import torch + import torch.nn as nn + import coremltools.optimize.torch.palettization as palett + + model = nn.Sequential( + OrderedDict( + [ + ("linear1", nn.Linear(4, 5)), + ("sigmoid1", nn.Sigmoid()), + ("linear2", nn.Linear(5, 4)), + ("sigmoid2", nn.Sigmoid), + ] + ) + ) + + fq_activation = nn.Identity + fq_weight = palett.FakePalettize.with_args( + observer=torch.quantization.MovingAveragePerChannelMinMaxObserver.with_args( + quant_min=-128, quant_max=127, dtype=torch.qint8 + ), + n_bits=2, + cluster_dim=1, + ) + model.linear2.qconfig = torch.quantization.QConfig( + activation=fq_activation, weight=fq_weight + ) + + palettized_model = palett.prepare_palettizer(model) + + train_model(palettized_model) + + palettized_converted_model = palett.finalize(palettized_model) + + + Args: + observer (:obj:`torch.ao.quantization.observer.ObserverBase`): Observer for quantizing the ``LUT``. + n_bits (:obj:`int`): Number of palettization bits. There would be :math:`2^{n\_bits}` unique weights in the ``LUT``. + cluster_dim (:obj:`int`): Dimensionality of centroids to use for clustering. + quant_min (:obj:`int`): The minimum allowable quantized value. + quant_max (:obj:`int`): The maximum allowable quantized value. 
+ cluster_dtype (:obj:`str`): String that decides whether to quantize the ``LUT`` or not. The following are the ``str`` + LUT quantization combinations: (``u8``, ``uint8``), (``i8``, ``int8``), and (``f16``, ``float16``). + advanced_options (:obj:`dict`): Advanced options to configure the palettization algorithm. + observer_kwargs (optional): Arguments for the observer module. + + .. note:: + You can find the allowed keys for ``advanced_options`` in `DEFAULT_PALETTIZATION_ADVANCED_OPTIONS`. + """ + + fake_palett_enabled: _torch.Tensor + + def __init__( + self, + observer: _ObserverBase, + n_bits: int, + cluster_dim: int, + quant_min: int = -128, + quant_max: int = 127, + cluster_dtype: str = "f32", + advanced_options: dict = {}, + **observer_kwargs, + ): + partition_size = advanced_options.get( + "partition_size", DEFAULT_PALETTIZATION_ADVANCED_OPTIONS["partition_size"] + ) + cluster_permute = advanced_options.get( + "cluster_permute", DEFAULT_PALETTIZATION_ADVANCED_OPTIONS["cluster_permute"] + ) + palett_max_mem = advanced_options.get( + "palett_max_mem", DEFAULT_PALETTIZATION_ADVANCED_OPTIONS["palett_max_mem"] + ) + kmeans_max_iter = advanced_options.get( + "kmeans_max_iter", DEFAULT_PALETTIZATION_ADVANCED_OPTIONS["kmeans_max_iter"] + ) + prune_threshold = advanced_options.get( + "prune_threshold", DEFAULT_PALETTIZATION_ADVANCED_OPTIONS["prune_threshold"] + ) + kmeans_init = advanced_options.get( + "kmeans_init", DEFAULT_PALETTIZATION_ADVANCED_OPTIONS["kmeans_init"] + ) + kmeans_opt1d_threshold = advanced_options.get( + "kmeans_opt1d_threshold", + DEFAULT_PALETTIZATION_ADVANCED_OPTIONS["kmeans_opt1d_threshold"], + ) + enforce_zero = advanced_options.get( + "enforce_zero", DEFAULT_PALETTIZATION_ADVANCED_OPTIONS["enforce_zero"] + ) + palett_mode = advanced_options.get( + "palett_mode", DEFAULT_PALETTIZATION_ADVANCED_OPTIONS["palett_mode"] + ) + palett_cluster_tol = advanced_options.get( + "palett_cluster_tol", DEFAULT_PALETTIZATION_ADVANCED_OPTIONS["palett_cluster_tol"] + ) + palett_tau = advanced_options.get( + "palett_tau", DEFAULT_PALETTIZATION_ADVANCED_OPTIONS["palett_tau"] + ) + palett_epsilon = advanced_options.get( + "palett_epsilon", DEFAULT_PALETTIZATION_ADVANCED_OPTIONS["palett_epsilon"] + ) + palett_lambda = advanced_options.get( + "palett_lambda", DEFAULT_PALETTIZATION_ADVANCED_OPTIONS["palett_lambda"] + ) + add_extra_centroid = advanced_options.get( + "add_extra_centroid", DEFAULT_PALETTIZATION_ADVANCED_OPTIONS["add_extra_centroid"] + ) + + self._target_module_level_sparsity = 0.0 + + _FakeQuantize.__init__(self, observer, quant_min, quant_max, **observer_kwargs) + _Partitioner.__init__( + self, + n_bits, + enforce_zero, + partition_size, + cluster_dim, + cluster_permute, + palett_tau, + kmeans_init, + prune_threshold, + kmeans_opt1d_threshold, + add_extra_centroid, + ) + + self.cluster_dtype = cluster_dtype + self.add_extra_centroid = add_extra_centroid + self.need_to_quantize = self.cluster_dtype in ["i8", "u8", "f16"] + self.autograd_graph = hasattr(_torch.autograd, "graph") and palett_max_mem < 1.0 + self.palett_max_mem = palett_max_mem + self.palett_cluster_tol = palett_cluster_tol + self.kmeans_max_iter = kmeans_max_iter + self.palett_mode = palett_mode + self.palett_tau = palett_tau + self.palett_epsilon = palett_epsilon + self.palett_lambda = palett_lambda + self.n_bits = n_bits + self.cluster_dim = cluster_dim + self.kmeans_init = kmeans_init + # Temporary create placeholder buffers that will get replaced with proper centroids on the first forward, + # or when we 
reload a checkpoint. Having placeholder values is useful to maintain the structure of the state + # dict constant. + self.register_buffer("centroids", _torch.rand([1])) + self.register_buffer("labels", _torch.rand([1])) + # During init, we would want the fake_palett_enabled flag to be False, i.e. to be at a state of 0. Also, we + # would have set the fake_quant_enabled and observer_enabled to be 0 as well so that palettizer does nothing + # until the first milestone. + self.register_buffer("fake_palett_enabled", _torch.tensor([0], dtype=_torch.uint8)) + self.disable_fake_quant() + self.disable_observer() + self.buffers_are_placeholders = True + + def enable_fake_palett(self, enabled: bool = True) -> None: + self.fake_palett_enabled[0] = 1 if enabled else 0 + + def disable_fake_palett(self): + self.enable_fake_palett(False) + + def diff_palettize(self, weights: _torch.Tensor): + """ + Method called to run the differentiable k-means operation. + """ + use_cpu_if_cuda_available = False + if _torch.cuda.is_available(): + t = _torch.cuda.get_device_properties(weights.device).total_memory + a = _torch.cuda.memory_allocated(weights.device) + use_cpu_if_cuda_available = (a / t) > self.palett_max_mem and self.autograd_graph + if use_cpu_if_cuda_available: + if _FakePalettizationTensorHook.gc_trigger is None: + _FakePalettizationTensorHook.gc_trigger = True + + if _FakePalettizationTensorHook.gc_trigger: + gc.collect() + + auto_grad_graph_on_cpu = ( + _torch.autograd.graph.save_on_cpu(pin_memory=True) + if use_cpu_if_cuda_available + else contextlib.nullcontext() + ) + + for i, partition in enumerate(self.partitions): + + current_partition_clone = weights[partition[0] : partition[1]].clone() + cX, pad = self.flatten(current_partition_clone) + + with _torch.no_grad(): + palett_table = _torch.unique(self.centroids[i], dim=0) + if len(palett_table) < self.n_clusters[i] * self.palett_cluster_tol: + # We use n_init as 3 so as to not spend a lot of time running this operation + kmeans = _EfficientKMeans( + n_clusters=self.n_clusters[i], + init="kmeans++", + labels=self.labels[i], + n_init=3, + max_iter=1, + ) + kmeans.kmeans_pp(3, cX, 0) + self.centroids[i] = kmeans.cluster_centers_ + + centroids = self.centroids[i].clone() + + assert not centroids.requires_grad + last_inertia = None + + for j in range(self.kmeans_max_iter): + if self.autograd_graph: + tensor_hook = _FakePalettizationTensorHook( + [_torch.Size([cX.size()[0], centroids.size()[0]])], + use_cpu_if_cuda_available, + f"FakePalettizationTensorHook.{i}.{j}", + self.palett_tau, + ) + auto_grad_graph_hook_init = _torch.autograd.graph.saved_tensors_hooks( + tensor_hook.init_pack, tensor_hook.init_unpack + ) + auto_grad_graph_hook_reuse = _torch.autograd.graph.saved_tensors_hooks( + tensor_hook.reuse_pack, tensor_hook.reuse_unpack + ) + else: + auto_grad_graph_hook_init = contextlib.nullcontext() + auto_grad_graph_hook_reuse = contextlib.nullcontext() + + with auto_grad_graph_hook_init: + x_c_dist = _EfficientKMeans.x_c_dist(cX, centroids) + min_error, _ = x_c_dist.min(dim=-1) + + with auto_grad_graph_hook_reuse: + if "dkm" in self.palett_mode: + attention = _F.softmax(-x_c_dist / self.palett_tau, dim=1) + elif "gsm" in self.palett_mode: + attention = _F.gumbel_softmax(-x_c_dist / self.palett_tau, dim=1) + elif "hard" in self.palett_mode: + col_idx = x_c_dist.min(dim=1).indices + row_idx = _torch.arange(start=0, end=len(col_idx), dtype=_torch.int32).to( + cX.device + ) + attention = _torch.sparse_coo_tensor( + _torch.vstack([row_idx, col_idx]), + 
_torch.ones_like(row_idx).to(cX.device), + x_c_dist.size(), + dtype=x_c_dist.dtype, + requires_grad=True, + ).to_dense() + + assert attention.requires_grad + attention_sum = attention.sum(dim=0).view(-1, 1) + attention_sum[attention_sum == 0] = 1e-6 + + with auto_grad_graph_hook_reuse: + centroids = _torch.matmul(cX.T, attention).T / attention_sum + + with auto_grad_graph_on_cpu: + if self.need_to_quantize: + centroids = super().forward(centroids) + + assert centroids.requires_grad + + if self.prune_threshold > 0: + centroids = _torch.nn.Hardshrink(self.prune_threshold.item())(centroids) + + if self.enforce_zero[i]: + zero_point = ( + _torch.zeros(centroids[0].size()).to(centroids.device).unsqueeze(0) + ) + zero_idx = _torch.argmin(_torch.cdist(centroids, zero_point)) + centroids[zero_idx] = zero_point + + cur_inertia = min_error.sum() + + if last_inertia and abs(last_inertia - cur_inertia) <= self.palett_epsilon: + break + + last_inertia = cur_inertia + + with auto_grad_graph_hook_reuse: + weights[partition[0] : partition[1]] = self.deflatten( + _torch.matmul(attention, centroids), current_partition_clone.size(), pad + ) + + self.centroids[i] = ( + self.palett_lambda * self.centroids[i] + (1 - self.palett_lambda) * centroids + ).detach() + self.labels[i] = attention.detach().max(dim=1)[1].data + + return weights + + def palettize(self, weights: _torch.Tensor): + """ + This method is run during inference time by the forward method of the ``FakePalettize`` class. It calculates the + weight from the ``LUT`` and ``indices`` across all partitions and returns them. + """ + for i, partition in enumerate(self.partitions): + labels = self.labels[i] + if labels is not None: + current_weight_partition = weights[partition[0] : partition[1]].detach() + _, pad = self.flatten(current_weight_partition) + + weights[partition[0] : partition[1]] = self.deflatten( + self.centroids[i][labels.long()], current_weight_partition.size(), pad + ) + + return weights + + def forward(self, weights: _torch.Tensor): + if self.partition_size == 0: + forwarded_weights = super().forward(weights) + if self.fake_palett_enabled[0] == 1: + with _torch.no_grad(): + quant_centroids, quant_labels = forwarded_weights.unique(return_inverse=True) + self.centroids = _torch.stack([quant_centroids.view(-1, self.cluster_dim)]) + self.labels = _torch.stack([quant_labels]) + else: + forwarded_weights = weights.clone() + + if self.fake_palett_enabled[0] == 1: + if not self.partitions: + self.init_partitions(weights.detach()) + self.centroids = _torch.stack(self.centroids_init) + self.labels = _torch.stack(self.labels_init) + self.buffers_are_placeholders = False + + if self.training: + forwarded_weights = self.diff_palettize(forwarded_weights) + else: + forwarded_weights = self.palettize(forwarded_weights) + else: + forwarded_weights = super().forward(weights) + + if self.cluster_dtype == "f16": + forwarded_weights = forwarded_weights.to(_torch.float16).to(weights.dtype) + elif self.cluster_dtype == "b16": + forwarded_weights = forwarded_weights.to(_torch.bfloat16).to(weights.dtype) + + return forwarded_weights + + def _load_from_state_dict( + self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ): + + self.cluster_dtype = local_metadata["cluster_dtype"] + state_dict_buffers_are_placeholders = local_metadata["buffers_are_placeholders"] + + if not self.buffers_are_placeholders and state_dict_buffers_are_placeholders: + raise ValueError( + f"Trying to reload an uninitialized state dict onto an 
initialized module: {prefix[:-1]}" + ) + + if self.buffers_are_placeholders and not state_dict_buffers_are_placeholders: + # We only change the size of the placeholders if we intend to reload a proper checkpoint + # onto an uninitialized module. In the other cases, we expect the state dict and the module to be compatible. + self.centroids = _torch.empty( + state_dict[prefix + "centroids"].size(), device=self.centroids.device + ) + self.labels = _torch.empty( + state_dict[prefix + "labels"].size(), device=self.labels.device + ) + self.fake_palett_enabled = _torch.empty( + state_dict[prefix + "fake_palett_enabled"].size(), device=self.labels.device + ) + + self.buffers_are_placeholders = state_dict_buffers_are_placeholders + + _Partitioner._load_from_state_dict_( + self, + state_dict, + prefix + "palett.", + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ) + + if self.need_to_quantize: + # We will go through FakeQuantize._load_from_state_dict and then nn.Module._load_from_state_dict + super()._load_from_state_dict( + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ) + else: + # Jump FakeQuantize and go to nn.Module directly + super(_FakeQuantize, self)._load_from_state_dict( + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ) + + def _save_to_state_dict(self, destination, prefix, keep_vars): + + if self.need_to_quantize: + # Use normal inheritance, go through FakeQuantize._save_to_state_dict + super()._save_to_state_dict(destination, prefix, keep_vars) + self.centroids = super().forward(self.centroids) + else: + # Skip FakeQuantize._save_to_state_dict and go directly to nn.Module._save_to_state_dict + super(_FakeQuantize, self)._save_to_state_dict(destination, prefix, keep_vars) + + # State dicts can only contain tensors (for DDP), so store infos in the metatadata dict (in particular str) + destination._metadata[prefix[:-1]]["cluster_dtype"] = self.cluster_dtype + destination._metadata[prefix[:-1]][ + "buffers_are_placeholders" + ] = self.buffers_are_placeholders + _Partitioner._save_to_state_dict_(self, destination, prefix + "palett.", keep_vars) + + def __repr__(self): + rep = super().__repr__() + if self.centroids.shape[0] != self.n_clusters: + rep += " ===> centroids: uninitialised buffer, " + rep += "labels: uninitialised buffer, " + else: + rep += f" ===> centroids: {self.centroids}, " + rep += f"labels: {self.labels}, " + rep += f"cluster_dtype: {self.cluster_dtype}, " + rep += f"n_bits: {self.n_bits}, " + rep += f"cluster_dim: {self.cluster_dim}, " + rep += f"palett_tau: {self.palett_tau}, " + rep += f"palett_mode: {self.palett_mode}" + return rep diff --git a/coremltools/optimize/torch/palettization/palettization_config.py b/coremltools/optimize/torch/palettization/palettization_config.py new file mode 100644 index 000000000..4b864e4f2 --- /dev/null +++ b/coremltools/optimize/torch/palettization/palettization_config.py @@ -0,0 +1,376 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. 
+# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +from collections import OrderedDict as _OrderedDict +from typing import Any as _Any +from typing import Callable as _Callable +from typing import Dict as _Dict +from typing import List as _List +from typing import NewType as _NewType +from typing import Optional as _Optional +from typing import Union as _Union + +import cattrs as _cattrs +import torch as _torch +import torch.nn as _nn +from attr import define as _define +from attr import field as _field +from attrs import validators as _validators + +from coremltools.optimize.torch._utils.torch_utils import ( + maybe_convert_str_to_dtype as _maybe_convert_str_to_dtype, +) +from coremltools.optimize.torch.optimization_config import ( + ModuleOptimizationConfig as _ModuleOptimizationConfig, +) +from coremltools.optimize.torch.optimization_config import OptimizationConfig as _OptimizationConfig +from coremltools.optimize.torch.optimization_config import _validate_module_type_keys_factory + +# Default advanced options for palettization +DEFAULT_PALETTIZATION_ADVANCED_OPTIONS = { + "partition_size": 2000000000, + "cluster_permute": None, + "palett_max_mem": 1.0, + "kmeans_max_iter": 3, + "prune_threshold": 0.0, + "kmeans_init": "cpu.kmeans++", + "kmeans_opt1d_threshold": 1024, + "enforce_zero": False, + "palett_mode": "dkm", + "palett_tau": 0.0001, + "palett_epsilon": 0.0001, + "palett_lambda": 0.0, + "add_extra_centroid": False, + "palett_cluster_tol": 0.05, +} + + +DEFAULT_PALETTIZATION_OPTIONS = { + "quant_min": -128, + "quant_max": 127, + "dtype": _torch.qint8, + "cluster_dtype": "32", + "weight_threshold": 2048, + "milestone": 0, + "quantize_activations": False, +} + + +_default_palettization_scheme = { + **DEFAULT_PALETTIZATION_OPTIONS, + **DEFAULT_PALETTIZATION_ADVANCED_OPTIONS, +} + + +# Default scheme for palettization +DEFAULT_PALETTIZATION_SCHEME = { + _nn.Linear: {"n_bits": 4, "cluster_dim": 1, **_default_palettization_scheme}, + _nn.Conv1d: {"n_bits": 2, "cluster_dim": 1, **_default_palettization_scheme}, + _nn.Conv2d: {"n_bits": 2, "cluster_dim": 1, **_default_palettization_scheme}, + _nn.Conv3d: {"n_bits": 2, "cluster_dim": 1, **_default_palettization_scheme}, + _nn.LayerNorm: {"n_bits": 2, "cluster_dim": 1, **_default_palettization_scheme}, + _nn.MultiheadAttention: {"n_bits": 2, "cluster_dim": 1, **_default_palettization_scheme}, + _nn.Embedding: {"n_bits": 2, "cluster_dim": 1, **_default_palettization_scheme}, +} + + +# Pytorch modules from torch.ao.quantization.quantization_mappings.DEFAULT_QAT_MODULE_MAPPINGS that are supported +# for palettization +SUPPORTED_PYTORCH_QAT_MODULES = (_nn.Linear, _nn.Conv2d, _nn.Conv3d) + + +@_define +class ModuleDKMPalettizerConfig(_ModuleOptimizationConfig): + r""" + Module level configuration for :py:class:`DKMPalettizer`. + + For most use cases, the only parameters you need to specify are ``n_bits``, ``cluster_dim``, + ``weight_threshold``, and ``milestone``. + + .. note:: + Some of the following parameters are meant for advanced use cases and for further fine-tuning the + DKM algorithm. The default values usually work for a majority of tasks. 
These parameters are ``partition_size``, ``cluster_permute``, ``palett_max_mem``, + ``kmeans_max_iter``, ``prune_threshold``, ``kmeans_init``, ``kmeans_opt1d_threshold``, + ``enforce_zero``, ``palett_mode``, ``palett_tau``, ``palett_epsilon``, ``palett_lambda``, + ``add_extra_centroid`` and ``palett_cluster_tol``. + + .. note:: + Change the following parameters only when you use activation quantization in conjunction with + DKM weight palettization: ``quant_min``, ``quant_max``, ``dtype``, and ``quantize_activations``. + + Args: + n_bits (:obj:`int`): Number of clusters. The number of clusters used is :math:`2^{n\_bits}`. + cluster_dim (:obj:`int`): The dimension of each cluster. + quant_min: (:obj:`int`): The minimum value for each element in the weight clusters if they are quantized. + Defaults to ``-128``. + quant_max: (:obj:`int`): The maximum value for each element in the weight clusters if they are quantized. + Defaults to ``127`` + dtype (:py:class:`torch.dtype`): The ``dtype`` to use for quantizing the activations. Only applies when + ``quantize_activations`` is ``True``. Defaults to :py:class:`torch.qint8`. + cluster_dtype (:obj:`str`): ``dtype`` to use for quantizing the clusters. Defaults to ``f'32'``, i.e., + by default, the clusters aren't quantized. + weight_threshold (:obj:`int`): A module is only palettized if the number of elements in + its weight matrix exceeds ``weight_threshold``. Defaults to ``0``. + milestone (:obj:`int`): Step or epoch at which palettization begins. Defaults to ``0``. + quantize_activations (:obj:`bool`): When ``True``, the activation are quantized. Defaults to ``False``. + partition_size (:obj:`int`): partition_size helps in per channel palettization. + cluster_permute (:obj:`tuple`): Permute to apply to weight partitions. Defaults to ``None``. + palett_max_mem (:obj:`float`): Proportion of available GPU memory that should be used for palettization. + Defaults to ``1.0``. + kmeans_max_iter (:obj:`int`): Maximum number of differentiable ``k-means`` iterations. Defaults to ``3``. + prune_threshold (:obj:`float`): Hard-shrinks weights between [``-prune_threshold``, ``prune_threshold``] to + zero. Defaults to ``0.0``. Useful for joint pruning and palettization. + kmeans_init (:obj:`str`): ``k-means`` algorithm to use. Defaults to ``cpu.kmeans++``. + Other available options are ``efficient_kmeans`` and ``kmeans_pp``. + kmeans_opt1d_threshold (:obj:`int`): Channel threshold to decide if ``opt1d kmeans`` should be used. + Defaults to ``1024``. + enforce_zero (:obj:`bool`): If ``True``, enforces closest to origin LUT cluster to be fixed to zero. + Defaults to ``False``. + palett_mode (:obj:`str`): Criteria to calculate attention during ``k-means``. Defaults to ``dkm``. + Other available options are ``gsm`` and ``hard``. + palett_tau (:obj:`float`): Temperature factor for softmax using in DKM algorithm. Defaults to ``0.0001``. + palett_epsilon (:obj:`float`): Distance threshold for clusters between ``k-means`` iterations. + Defaults to ``0.0001``. + palett_lambda (:obj:`float`): Reduces effects of outliers during centroid calculation. Defaults to + ``0.0``. + add_extra_centroid (:obj:`bool`): If true, adds an extra centroid to LUT. Defaults to ``False``. + palett_cluster_tol (:obj:`float`): Tolerance for non-unique centroids in the LUT. The higher the number, + the more tolerance for non-unique centroids. Defaults to ``0.05``. 
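+ Example (editor-added sketch; the values are illustrative only):
+
+ .. code-block:: python
+
+     config = ModuleDKMPalettizerConfig(
+         n_bits=4, cluster_dim=1, weight_threshold=1024, milestone=2
+     )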
+ """ + n_bits: _Optional[int] = _field( + default=None, validator=_validators.optional(_validators.instance_of(int)) + ) + cluster_dim: _Optional[int] = _field( + default=None, validator=_validators.optional(_validators.instance_of(int)) + ) + quant_min: int = _field( + default=DEFAULT_PALETTIZATION_OPTIONS["quant_min"], + validator=_validators.instance_of(int), + ) + quant_max: int = _field( + default=DEFAULT_PALETTIZATION_OPTIONS["quant_max"], + validator=_validators.instance_of(int), + ) + dtype: _torch.dtype = _field( + default=DEFAULT_PALETTIZATION_OPTIONS["dtype"], + converter=_maybe_convert_str_to_dtype, + validator=_validators.instance_of(_torch.dtype), + ) + cluster_dtype: str = _field( + default=DEFAULT_PALETTIZATION_OPTIONS["cluster_dtype"], + validator=_validators.instance_of(str), + ) + weight_threshold: int = _field( + default=DEFAULT_PALETTIZATION_OPTIONS["weight_threshold"], + validator=_validators.instance_of(int), + ) + milestone: int = _field( + default=DEFAULT_PALETTIZATION_OPTIONS["milestone"], + validator=_validators.instance_of(int), + ) + quantize_activations: bool = _field( + default=DEFAULT_PALETTIZATION_OPTIONS["quantize_activations"], + validator=_validators.instance_of(bool), + ) + partition_size: int = _field( + default=DEFAULT_PALETTIZATION_ADVANCED_OPTIONS["partition_size"], + validator=_validators.instance_of(int), + ) + cluster_permute: _Optional[tuple] = _field( + default=DEFAULT_PALETTIZATION_ADVANCED_OPTIONS["cluster_permute"], + validator=_validators.optional(_validators.instance_of(tuple)), + ) + palett_max_mem: float = _field( + default=DEFAULT_PALETTIZATION_ADVANCED_OPTIONS["palett_max_mem"], + validator=_validators.instance_of(float), + ) + kmeans_max_iter: int = _field( + default=DEFAULT_PALETTIZATION_ADVANCED_OPTIONS["kmeans_max_iter"], + validator=_validators.instance_of(int), + ) + prune_threshold: float = _field( + default=DEFAULT_PALETTIZATION_ADVANCED_OPTIONS["prune_threshold"], + validator=_validators.instance_of(float), + ) + kmeans_init: str = _field( + default=DEFAULT_PALETTIZATION_ADVANCED_OPTIONS["kmeans_init"], + validator=_validators.instance_of(str), + ) + kmeans_opt1d_threshold: int = _field( + default=DEFAULT_PALETTIZATION_ADVANCED_OPTIONS["kmeans_opt1d_threshold"], + validator=_validators.instance_of(int), + ) + enforce_zero: bool = _field( + default=DEFAULT_PALETTIZATION_ADVANCED_OPTIONS["enforce_zero"], + validator=_validators.instance_of(bool), + ) + palett_mode: str = _field( + default=DEFAULT_PALETTIZATION_ADVANCED_OPTIONS["palett_mode"], + validator=_validators.instance_of(str), + ) + palett_tau: float = _field( + default=DEFAULT_PALETTIZATION_ADVANCED_OPTIONS["palett_tau"], + validator=_validators.instance_of(float), + ) + palett_epsilon: float = _field( + default=DEFAULT_PALETTIZATION_ADVANCED_OPTIONS["palett_epsilon"], + validator=_validators.instance_of(float), + ) + palett_lambda: float = _field( + default=DEFAULT_PALETTIZATION_ADVANCED_OPTIONS["palett_lambda"], + validator=_validators.instance_of(float), + ) + add_extra_centroid: bool = _field( + default=DEFAULT_PALETTIZATION_ADVANCED_OPTIONS["add_extra_centroid"], + validator=_validators.instance_of(bool), + ) + palett_cluster_tol: float = _field( + default=DEFAULT_PALETTIZATION_ADVANCED_OPTIONS["palett_cluster_tol"], + validator=_validators.instance_of(float), + ) + + +_default_module_type_configs = _OrderedDict( + { + key: ModuleDKMPalettizerConfig.from_dict(val) + for key, val in DEFAULT_PALETTIZATION_SCHEME.items() + } +) + + +_GlobalConfigType = _NewType( + 
"GlobalConfigType", + _Union[ + _Optional[ModuleDKMPalettizerConfig], + _List[_Optional[ModuleDKMPalettizerConfig]], + ], +) +_ModuleTypeConfigType = _NewType( + "ModuleTypeConfigType", _Dict[_Union[_Callable, str], _GlobalConfigType] +) +_ModuleNameConfigType = _NewType( + "ModuleNameConfigType", _Dict[str, _Optional[ModuleDKMPalettizerConfig]] +) + + +def _validate_dkm_config_type(instance, attribute, value): + if value is not None: + if isinstance(value, list): + return _validators.deep_iterable( + member_validator=_validators.optional( + _validators.instance_of(ModuleDKMPalettizerConfig) + ), + iterable_validator=_validators.instance_of(list), + )(instance, attribute, value) + else: + return _validators.optional(_validators.instance_of(ModuleDKMPalettizerConfig))( + instance, attribute, value + ) + + +@_define +class DKMPalettizerConfig(_OptimizationConfig): + """ + Configuration for :py:class:`DKMPalettizer`. + + The ``module_type_configs`` parameter can accept a list of :py:class:`ModuleDKMPalettizerConfig` + as values for a given module type. The list can specify + different parameters for different ``weight_threshold`` values. This is useful if + you want to apply different configs to layers of the same type with weights of different sizes. + + For example, to use ``4`` -bit palettization for weights with more than ``1000`` elements and + ``2`` -bit palettization for weights with more than ``300`` but less than ``1000`` elements, + create a config as follows: + + .. code-block:: python + + custom_config = { + nn.Conv2d: [ + {"n_bits": 4, "cluster_dim": 4, "weight_threshold": 1000}, + {"n_bits": 2, "cluster_dim": 2, "weight_threshold": 400}, + ] + } + config = DKMPalettizerConfig.from_dict({"module_type_configs": custom_config}) + + Args: + global_config (:py:class:`ModuleDKMPalettizerConfig`): Config to be applied globally + to all supported modules. Missing values are chosen from the default config. + module_type_configs (:obj:`dict` of :obj:`str` to :py:class:`ModuleDKMPalettizerConfig`): + Module type level configs applied to a specific module class, such as :py:class:`torch.nn.Linear`. + The keys can be either strings or module classes. + module_name_configs (:obj:`dict` of :obj:`str` to :py:class:`ModuleDKMPalettizerConfig`): + Module level configs applied to specific modules. + The name of the module must be a fully qualified name that can be used to fetch it + from the top level module using the ``module.get_submodule(target)`` method. 
+ """ + + global_config: _GlobalConfigType = _field(default=None, validator=_validate_dkm_config_type) + module_type_configs: _ModuleTypeConfigType = _field( + factory=_OrderedDict, + validator=_validators.deep_mapping( + key_validator=_validators.and_( + _validators.instance_of((str, _Callable)), + _validate_module_type_keys_factory(list(DEFAULT_PALETTIZATION_SCHEME.keys())), + ), + value_validator=_validate_dkm_config_type, + mapping_validator=_validators.instance_of(dict), + ), + ) + module_name_configs: _ModuleNameConfigType = _field( + factory=_OrderedDict, + validator=_validators.deep_mapping( + key_validator=_validators.instance_of(str), + value_validator=_validators.optional( + _validators.instance_of(ModuleDKMPalettizerConfig) + ), + mapping_validator=_validators.instance_of(dict), + ), + ) + + def __attrs_post_init__(self): + if ( + self.global_config is None + and len(self.module_type_configs) == 0 + and len(self.module_name_configs) == 0 + ): + self.module_type_configs = _default_module_type_configs + self._sort_configs_by_weight_threshold(self.global_config) + for ctype, config in self.module_type_configs.items(): + self.set_module_type(ctype, self._sort_configs_by_weight_threshold(config)) + for name, config in self.module_name_configs.items(): + self.set_module_type(name, self._sort_configs_by_weight_threshold(config)) + + @classmethod + def from_dict(cls, config_dict: _Dict[str, _Any]) -> "DKMPalettizerConfig": + super().from_dict(config_dict) + converter = _cattrs.Converter(forbid_extra_keys=True) + converter.register_structure_hook(_ModuleTypeConfigType, _structure_from_dict_hook) + converter.register_structure_hook(_ModuleNameConfigType, _structure_from_dict_hook) + converter.register_structure_hook(_GlobalConfigType, _structure_dkm_config_hook) + return converter.structure_attrs_fromdict(config_dict, cls) + + @staticmethod + def _sort_configs_by_weight_threshold(config: _GlobalConfigType): + if isinstance(config, list): + return sorted(config, key=lambda x: x.weight_threshold) + return config + + +def _structure_dkm_config_hook( + config_dict: _Union[_List[_Dict[str, _Any]], _Dict[str, _Any]], type: _Any +): + if isinstance(config_dict, list): + return [ModuleDKMPalettizerConfig.from_dict(cd) for cd in config_dict] + return ModuleDKMPalettizerConfig.from_dict(config_dict) + + +def _structure_from_dict_hook(module_type_dict: _Dict[_Union[_Callable, str], _Any], type: _Any): + return_dict = _OrderedDict() + for key, value in module_type_dict.items(): + if value is None: + return_dict[key] = None + else: + return_dict[key] = _structure_dkm_config_hook(value, type) + return return_dict diff --git a/coremltools/optimize/torch/palettization/palettizer.py b/coremltools/optimize/torch/palettization/palettizer.py new file mode 100644 index 000000000..e215a7c4e --- /dev/null +++ b/coremltools/optimize/torch/palettization/palettizer.py @@ -0,0 +1,282 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. 
+# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import copy as _copy +import logging as _logging +from typing import Dict as _Dict +from typing import Optional as _Optional + +import torch as _torch +import torch.nn as _nn +from torch.ao.quantization import FakeQuantize as _FakeQuantize + +from coremltools.optimize.torch._typing import ParamsDict as _ParamsDict +from coremltools.optimize.torch._utils.math_utils import rmse_error as _rmse_error +from coremltools.optimize.torch._utils.torch_utils import get_eval_model as _get_eval_model +from coremltools.optimize.torch.base_model_optimizer import ( + BaseModelOptimizer as _BaseModelOptimizer, +) +from coremltools.optimize.torch.base_model_optimizer import _Report +from coremltools.optimize.torch.palettization._custom_conversion import ( + PALETTIZATION_CONVERT_DICT as _PALETTIZATION_CONVERT_DICT, +) +from coremltools.optimize.torch.palettization._supported_modules import ( + _get_palettization_qat_mappings, +) +from coremltools.optimize.torch.palettization.fake_palettize import FakePalettize as _FakePalettize +from coremltools.optimize.torch.palettization.palettization_config import ( + DEFAULT_PALETTIZATION_ADVANCED_OPTIONS as _DEFAULT_PALETTIZATION_ADVANCED_OPTIONS, +) +from coremltools.optimize.torch.palettization.palettization_config import ( + DEFAULT_PALETTIZATION_SCHEME as _DEFAULT_PALETTIZATION_SCHEME, +) +from coremltools.optimize.torch.palettization.palettization_config import ( + DKMPalettizerConfig as _DKMPalettizerConfig, +) +from coremltools.optimize.torch.palettization.palettization_config import ( + ModuleDKMPalettizerConfig as _ModuleDKMPalettizerConfig, +) + +_logger = _logging.getLogger(__name__) + + +class Palettizer(_BaseModelOptimizer): + pass + + +class DKMPalettizer(Palettizer): + def __init__(self, model: _nn.Module, config: _Optional[_DKMPalettizerConfig] = None): + config = _DKMPalettizerConfig() if config is None else config + super().__init__(model, config) + self._milestones = {} + self._supported_modules = _get_palettization_qat_mappings() + + def _palettize_supported_modules(self): + """ + Method to palettize supported modules. 
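+ For every supported submodule whose weight has more elements than its config's ``weight_threshold``, this attaches a ``torch.quantization.QConfig`` whose weight fake-quantizer is a ``FakePalettize`` instance and records the module's palettization ``milestone``.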
+ """ + for name, submodule in self._model.named_modules(remove_duplicate=True): + config = self._config.get_module_config(name, submodule) + if type(submodule) in self._supported_modules: + if config is not None: + submod_configs = config if isinstance(config, list) else [config] + for submod_config in submod_configs: + if submodule.weight.numel() > submod_config.weight_threshold: + module_level_advanced_options = self._get_module_level_advanced_options( + submodule, submod_config + ) + n_bits = ( + submod_config.n_bits + if submod_config.n_bits is not None + else _DEFAULT_PALETTIZATION_SCHEME[type(submodule)]["n_bits"] + ) + cluster_dim = ( + submod_config.cluster_dim + if submod_config.cluster_dim is not None + else _DEFAULT_PALETTIZATION_SCHEME[type(submodule)]["cluster_dim"] + ) + self._palettize_module( + submodule, + n_bits, + cluster_dim, + submod_config.quant_min, + submod_config.quant_max, + submod_config.cluster_dtype, + submod_config.dtype, + submod_config.quantize_activations, + module_level_advanced_options, + ) + self._milestones[name] = submod_config.milestone + + @staticmethod + def _palettize_module( + module: _nn.Module, + n_bits: int, + cluster_dim: int, + quant_min: int, + quant_max: int, + cluster_dtype: str, + dtype: _torch.dtype, + quantize_activations: bool, + advanced_options: _Dict, + ): + """ + Method to palettize a module. + """ + fq_activation = _nn.Identity + fq_weight = _FakePalettize.with_args( + observer=_torch.quantization.MovingAveragePerChannelMinMaxObserver.with_args( + quant_min=quant_min, quant_max=quant_max, dtype=dtype + ), + n_bits=n_bits, + cluster_dim=cluster_dim, + quant_min=quant_min, + quant_max=quant_max, + cluster_dtype=cluster_dtype, + advanced_options=advanced_options, + ) + if quantize_activations: + fq_activation = _FakeQuantize.with_args( + observer=_torch.quantization.MovingAveragePerChannelMinMaxObserver.with_args( + quant_min=quant_min, quant_max=quant_max + ), + quant_min=quant_min, + quant_max=quant_max, + ) + + module.qconfig = _torch.quantization.QConfig(activation=fq_activation, weight=fq_weight) + + @staticmethod + def _get_module_level_advanced_options( + module: _nn.Module, module_level_config: _ModuleDKMPalettizerConfig + ) -> _ParamsDict: + """ + Returns advanced_options for a module. First checks whether the user specified something for those options in the + palettization_config. If not, uses the options from the DEFAULT_PALETTIZATION_SCHEME of that module type. + Returns false otherwise. + """ + module_level_advanced_options = {} + for key in _DEFAULT_PALETTIZATION_ADVANCED_OPTIONS.keys(): + if key == "cluster_permute" and module_level_config.cluster_dtype == "oc_last": + cluster_permute = list(range(module.weight.dim())) + cluster_permute = cluster_permute[1:] + cluster_permute[:1] + module_level_advanced_options[key] = cluster_permute + else: + module_level_advanced_options[key] = getattr(module_level_config, key) + return module_level_advanced_options + + def prepare(self, inplace: bool = False) -> _nn.Module: + """ + Prepares a model for palettization aware training by inserting :py:class:`FakePalettize` layers in appropriate + places as specified by the config. + + Args: + inplace (:obj:`bool`): If ``True``, model transformations are carried out in-place and + the original module is mutated, otherwise a copy of the model is mutated and returned. 
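For ``cluster_dtype == "oc_last"``, ``_get_module_level_advanced_options`` above builds a permutation that rotates the output-channel axis to the last position; a tiny worked example for a rank-4 convolution weight:

.. code-block:: python

    # assuming a Conv2d weight of shape (C_out, C_in, kH, kW), i.e. rank 4
    weight_rank = 4
    cluster_permute = list(range(weight_rank))                    # [0, 1, 2, 3]
    cluster_permute = cluster_permute[1:] + cluster_permute[:1]   # [1, 2, 3, 0]
    # the output-channel axis (dim 0) is moved to the end before clustering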
+ """ + if not inplace: + self._model = _copy.deepcopy(self._model) + + self._model.train() + self._palettize_supported_modules() + qat_mappings = _get_palettization_qat_mappings() + self._model = _torch.quantization.prepare_qat( + self._model, mapping=qat_mappings, inplace=True + ) + return self._model + + def finalize(self, model: _Optional[_nn.Module] = None, inplace: bool = False) -> _nn.Module: + """ + Removes :py:class:`FakePalettize` layers from a model and creates new model weights from the ``LUT`` and + ``indices`` buffers. + + This function is called to prepare a palettized model for export using + `coremltools `_. + + Args: + model (:obj:`nn.Module`): model to finalize. + inplace (:obj:`bool`): If ``True``, model transformations are carried out in-place and + the original module is mutated; otherwise, a copy of the model is mutated and returned. + """ + if model is None: + model = self._model + model.eval() + finalized_model = _torch.quantization.convert( + model, convert_custom_config_dict=_PALETTIZATION_CONVERT_DICT, inplace=inplace + ) + + if model is None: + self._model = finalized_model + return finalized_model + + def step(self): + """ + Step through the palettizer. When the number of times ``step`` + is called is equal to ``milestone``, palettization is enabled. + """ + for name, module in self._model.named_modules(): + if name in self._milestones: + if self._step_count == self._milestones[name]: + self._enable_fake_palett_impl(module, True) + self._init_prune_threshold_and_module_wise_target_sparsity(module) + if self._step_count > self._milestones[name]: + self._update_prune_threshold(module) + self._step_count += 1 + + @staticmethod + def _init_prune_threshold_and_module_wise_target_sparsity(module: _torch.nn.Module): + if hasattr(module, "weight_fake_quant") and hasattr(module, "weight_mask"): + non_zero_weights = module.weight_mask.count_nonzero().item() + total_weights = _torch.numel(module.weight_mask) + target_module_level_sparsity = 1 - non_zero_weights / total_weights + inverse_mask = (module.weight_mask + 1) % 2 + n_bits = module.weight_fake_quant.n_bits + cluster_dim = module.weight_fake_quant.cluster_dim + add_extra_centroid = module.weight_fake_quant.add_extra_centroid + n_clusters = 2 ** int(n_bits) + int(add_extra_centroid) + prune_threshold_init = _torch.abs(inverse_mask * module.weight_orig).max() / ( + total_weights / cluster_dim / n_clusters + ) + + module.weight_fake_quant.prune_threshold = prune_threshold_init + module.weight_fake_quant._target_module_level_sparsity = target_module_level_sparsity + + @staticmethod + def _update_prune_threshold(module: _torch.nn.Module): + if hasattr(module, "weight_fake_quant") and hasattr(module, "weight_mask"): + weight_detached = module.weight.detach() + qweight = module.weight_fake_quant.palettize(weight_detached) + + sparsity = 1 - qweight.count_nonzero() / qweight.numel() + prune_ratio = float(module.weight_fake_quant._target_module_level_sparsity) / ( + sparsity + 1e-7 + ) + if prune_ratio > 0 and abs(prune_ratio - 1) > 0.01: + prune_multiplier = max(min(prune_ratio, 1.25), 0.9) + module.weight_fake_quant.prune_threshold *= prune_multiplier + + def enable_fake_palett(self, flag: bool): + _logging.info( + f"[{type(self).__name__}] " + ("enable" if flag else "disable") + " fake_palett" + ) + for name, module in self._model.named_modules(): + self._enable_fake_palett_impl(module, flag) + + @staticmethod + def _enable_fake_palett_impl(module: _torch.nn.Module, flag: bool): + if hasattr(module, "weight_fake_quant") 
and isinstance( + module.weight_fake_quant, _FakePalettize + ): + module.weight_fake_quant.enable_fake_palett(flag) + + def report(self) -> _Report: + """ + Returns a dictionary with important statistics related to current state of palettization. + Each key in the dictionary corresponds to a module name, and the + value is a dictionary containing the statistics, such as number of clusters and + cluster dimension, number of parameters, and so on. + """ + report = _Report() + with _get_eval_model(self._model) as model: + with _torch.no_grad(): + for name, module in model.named_modules(): + module_summary = dict() + if hasattr(module, "weight_fake_quant"): + module_summary["device"] = module.weight.device + qweight = module.weight_fake_quant.forward(module.weight.detach()) + cluster_dtype = module.weight_fake_quant.cluster_dtype + cluster_permute = module.weight_fake_quant.cluster_permute + module_summary["error"] = _rmse_error( + module.weight.detach(), qweight + ).item() + n_clusters = module.weight_fake_quant.n_clusters[0] + module_summary["#params"] = int(_torch.numel(qweight)) + cluster_dim = module.weight_fake_quant.cluster_dim + module_summary["#dtype"] = ( + f":num_clusters: {n_clusters} <{cluster_dtype, cluster_permute}> " + f"dim={cluster_dim}" + ) + report[name] = module_summary + return report diff --git a/coremltools/optimize/torch/pruning/__init__.py b/coremltools/optimize/torch/pruning/__init__.py new file mode 100644 index 000000000..76bc06822 --- /dev/null +++ b/coremltools/optimize/torch/pruning/__init__.py @@ -0,0 +1,58 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +""" +.. _coremltools_optimize_torch_pruning: + +.. include:: pruning_desc.rst + +_`MagnitudePruner` +================== + +.. autoclass:: coremltools.optimize.torch.pruning.ModuleMagnitudePrunerConfig + :members: from_dict, as_dict, from_yaml + +.. autoclass:: coremltools.optimize.torch.pruning.MagnitudePrunerConfig + :members: set_global, set_module_type, set_module_name, from_dict, as_dict, from_yaml + +.. autoclass:: coremltools.optimize.torch.pruning.MagnitudePruner + :members: prepare, step, report, finalize + +Pruning scheduler +================= + +:obj:`coremltools.optimize.torch.pruning.pruning_scheduler` submodule contains classes +that implement pruning schedules, which can be used for changing the +sparsity of pruning masks applied by various types of pruning algorithms +to prune neural network parameters. + + +Base class +---------- + +.. autoclass:: coremltools.optimize.torch.pruning.pruning_scheduler.PruningScheduler + :show-inheritance: + :no-members: + + +PolynomialDecayScheduler +------------------------ + +.. autoclass:: coremltools.optimize.torch.pruning.pruning_scheduler.PolynomialDecayScheduler + :show-inheritance: + :members: compute_sparsity + + +ConstantSparsityScheduler +------------------------- + +.. 
autoclass:: coremltools.optimize.torch.pruning.pruning_scheduler.ConstantSparsityScheduler + :show-inheritance: + :members: compute_sparsity +""" + + +from .magnitude_pruner import MagnitudePruner, MagnitudePrunerConfig, ModuleMagnitudePrunerConfig +from .pruning_scheduler import ConstantSparsityScheduler, PolynomialDecayScheduler diff --git a/coremltools/optimize/torch/pruning/_base_pruner.py b/coremltools/optimize/torch/pruning/_base_pruner.py new file mode 100644 index 000000000..7b30f95fe --- /dev/null +++ b/coremltools/optimize/torch/pruning/_base_pruner.py @@ -0,0 +1,122 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import copy as _copy +import logging as _logging +from typing import Optional as _Optional +from typing import Tuple as _Tuple + +import torch as _torch + +from coremltools.optimize.torch._utils.torch_utils import get_eval_model as _get_eval_model +from coremltools.optimize.torch.base_model_optimizer import ( + BaseModelOptimizer as _BaseModelOptimizer, +) +from coremltools.optimize.torch.base_model_optimizer import _Report +from coremltools.optimize.torch.optimization_config import OptimizationConfig as _OptimizationConfig +from coremltools.optimize.torch.pruning._utils import ( + get_global_sparsity_summaries as _get_global_sparsity_summaries, +) + +_logger = _logging.getLogger(__name__) + + +class BasePruner(_BaseModelOptimizer): + pass + + +class BasePrunerWithPruningMethod(BasePruner): + """ + Base class for all pruners which use a PruningMethod (implemented in + + """ + + _supported_modules: _Tuple + + def __init__(self, model: _torch.nn.Module, config: _OptimizationConfig): + super().__init__(model, config) + self._pruner_info = {} + + @property + def _is_prepared(self) -> bool: + return len(self._pruner_info) > 0 + + def prepare(self, inplace: bool = False) -> _torch.nn.Module: + """ + Prepares the model for pruning. + + Args: + inplace (:obj:`bool`): If ``True``, model transformations are carried out in-place and + the original module is mutated, otherwise a copy of the model is mutated and returned. + """ + return _copy.deepcopy(self._model) if not inplace else self._model + + def step(self): + """ + Steps through the pruning schedule once. At every call to + :meth:`.step`, an internal step counter is incremented by one. + """ + raise NotImplementedError() + + def finalize( + self, model: _Optional[_torch.nn.Module] = None, inplace: bool = False + ) -> _torch.nn.Module: + """ + Prepares the model for export. Removes pruning forward pre-hooks + attached to submodules and commits pruning changes to pruned module parameters by + multiplying the pruning masks with the parameter matrix. + + Args: + model (:obj:`nn.Module`): model to finalize + inplace (:obj:`bool`): If ``True``, model transformations are carried out in-place and + the original module is mutated, otherwise a copy of the model is mutated and returned. + """ + if model is None: + model = self._model + finalized_model = model if inplace else _copy.deepcopy(model) + for _, submodule in finalized_model.named_modules(remove_duplicate=True): + if hasattr(submodule, "pruning_method"): + submodule.pruning_method.remove(submodule) + if model is None: + self._model = finalized_model + return finalized_model + + def report(self) -> _Report: + """ + Returns a dictionary with important statistics related to current state of pruning. 
+ Each key in the dictionary corresponds to a module name and the value is a dictionary + containing the statistics such as ``unstructured_weight_sparsity``, + number of parameters, etc. Also contains a ``global`` key containing the same statistics + aggregated over all the modules set up for pruning. + """ + report = _Report() + with _get_eval_model(self._model): + with _torch.no_grad(): + # add submodule level sparsity summary + total_num_params = 0 + for name, pruner_info in self._pruner_info.items(): + submodule = pruner_info.module + if hasattr(submodule, "pruning_method"): + submod_config = pruner_info.config + num_params = getattr(submodule, submod_config.param_name).detach().numel() + summary = {"#params": int(num_params)} + summary.update(submodule.pruning_method.get_sparsity_summary(submodule)) + total_num_params += num_params + report[name] = summary + # get global sparsity summary + global_summaries = {"#params": total_num_params} + for sparsity_type in ["structured", "unstructured", "block2"]: + layer_numel = [val["#params"] for _, val in report.items()] + layer_sparsities = [ + val[f"{sparsity_type}_weight_sparsity"] for _, val in report.items() + ] + global_summaries[ + f"{sparsity_type}_weight_sparsity" + ] = _get_global_sparsity_summaries(layer_sparsities, layer_numel) + report["global"] = global_summaries + return report + + +_allowed_granularity_values = ["per_scalar", "per_kernel", "per_channel", "per_layer"] diff --git a/coremltools/optimize/torch/pruning/_base_pruning_method.py b/coremltools/optimize/torch/pruning/_base_pruning_method.py new file mode 100644 index 000000000..bafdd4dcc --- /dev/null +++ b/coremltools/optimize/torch/pruning/_base_pruning_method.py @@ -0,0 +1,287 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. 
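The ``global`` entry assembled above is a parameter-count-weighted average of the per-layer sparsities, with layers that report ``-1`` (metric not applicable) excluded. A small numeric sketch of that aggregation:

.. code-block:: python

    # mirrors _get_global_sparsity_summaries: numel-weighted average over layers,
    # ignoring layers where the metric is not applicable (-1)
    layer_sparsities = [0.50, 0.75, -1.0]
    layer_numel = [1000, 3000, 500]

    weighted_sum, denom = 0.0, 0.0
    for sparsity, numel in zip(layer_sparsities, layer_numel):
        if sparsity >= 0.0:
            weighted_sum += numel * sparsity
            denom += numel

    global_sparsity = weighted_sum / denom  # (500 + 2250) / 4000 = 0.6875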
+# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import logging as _logging +import types as _types +from typing import Any as _Any +from typing import Dict as _Dict +from typing import NamedTuple as _NamedTuple +from typing import Optional as _Optional +from typing import cast as _cast + +import numpy as _np +import torch as _torch +import torch.nn.utils.prune as _prune +import torch.utils.hooks as _hooks + +from coremltools.optimize.torch._typing import ParamsDict as _ParamsDict +from coremltools.optimize.torch._utils.state_dict_utils import ( + LoadStateDictPostHook as _LoadStateDictPostHook, +) +from coremltools.optimize.torch.pruning._utils import block2_sparsity as _block2_sparsity +from coremltools.optimize.torch.pruning._utils import structured_sparsity as _structured_sparsity +from coremltools.optimize.torch.pruning._utils import ( + unstructured_sparsity as _unstructured_sparsity, +) + +_logger = _logging.getLogger(__name__) + + +class BaseDynamicPruningMethod(_prune.BasePruningMethod): + """ + Extension of PyTorch's native pruning infra for seamless + model export and progressive sparsity schedules + + This class works by registering itself as a forward pre-hook + into each prune-able `nn.Module` to apply the pruning mask + """ + + _tensor_name: str + scheduled: bool + + def update_mask(self, module: _torch.nn.Module, scheduled_value: float) -> None: + raise NotImplementedError() + + def bind_module(self, module: _torch.nn.Module) -> None: + module.pruning_method = self # type: ignore + + orig_get_state = getattr(module, "__getstate__", None) + + # Override state method of module instance to exclude the non-leaf tensor + # which is neither a parameter nor a buffer + # See: https://discuss.pytorch.org/t/using-nn-utils-prune-causes-torch-tensor-deepcopy-to-fail/107470 + def __getstate__(self: _torch.nn.Module) -> _Dict[str, _Any]: + if orig_get_state is not None: + state: _Dict[str, _Any] = orig_get_state() + else: + state = dict(self.__dict__) + + if hasattr(self, "pruning_method"): + pruner = _cast(BaseDynamicPruningMethod, self.pruning_method) + if pruner._tensor_name in state: + state[pruner._tensor_name] = None + return state + + module.__getstate__ = _types.MethodType(__getstate__, module) # type: ignore[assignment] + + @classmethod + def from_module_and_params( + cls, module: _torch.nn.Module, param_name: str = "weight", **params: _ParamsDict + ) -> "BaseDynamicPruningMethod": + """ + Factory method of this class that is tied to a particular nn.Module + """ + pruning_method: BaseDynamicPruningMethod + pruning_method = super(BaseDynamicPruningMethod, cls).apply( + module, name=param_name, **params + ) + pruning_method.bind_module(module) + return pruning_method + + def _remove_impl(self, module: _torch.nn.Module, fuse_pruning_mask: bool) -> None: + assert self._tensor_name is not None + + # Restore the (pruned) tensor under its original name + orig = module._parameters[self._tensor_name + "_orig"] + assert orig is not None + + if fuse_pruning_mask: + pruned_orig = None + if self.scheduled: + current_mask = module._buffers[self._tensor_name + "_mask"] + assert current_mask is not None + current_amount = self.infer_sparsity_amount_from_external_mask( + current_mask + ) # may have been loaded from ckpt and current_amount != self.amount: # self.amount may be + # out-of-sync with the ckpt + + if hasattr(self, "amount") and not _np.isclose( + current_amount, 
self.amount, rtol=1 / orig.numel() + ): + _logger.warning( + f"Pruning method {self.__class__}'s sparsity schedule state ({self.amount}) is inconsistent " + f"with pruning mask's current state ({current_amount}). This is probably harmless " + f"if you are exporting a pruned model" + ) + # We have detected an inconsistent state so we correct for this by updating the + # pruning method's schedule. This correction will ensure the following `self._apply_mask_impl` + # call to use the correct self.amount + self.update_mask(module, current_amount) + pruned_orig = current_mask.to(orig.dtype) * orig + + if pruned_orig is None: + pruned_orig = self._apply_mask_impl(module) + + orig.data = pruned_orig.data + + setattr(module, self._tensor_name, orig) + del module._parameters[self._tensor_name + "_orig"] + del module._buffers[self._tensor_name + "_mask"] + + def remove(self, module: _torch.nn.Module, fuse_pruning_mask: bool = True) -> _torch.nn.Module: + """Removes pruning masks and forward_pre_hooks from the module + + If `fuse_pruning_mask` is True, then weights are fused with the pruning + mask before re-registering the weights under the original name + """ + name = self._tensor_name + for k, hook in module._forward_pre_hooks.items(): + if isinstance(hook, BaseDynamicPruningMethod) and hook._tensor_name == name: + self._remove_impl(module, fuse_pruning_mask) + del module._forward_pre_hooks[k] + if hasattr(module, "pruning_method"): + delattr(module, "pruning_method") + return module + + raise ValueError( + f"Parameter '{name}' of module {module} has to be pruned " + f"before pruning can be removed." + ) + + def _apply_mask_impl(self, module: _torch.nn.Module) -> _torch.Tensor: + # Identical to prune.BasePruningMethod.apply_mask as the default method for fusing weights and masks + # Exposed to allow overriding by complex pruning algorithms + assert self._tensor_name is not None, "Module {} has to be pruned".format(module) + mask = getattr(module, self._tensor_name + "_mask") + orig = getattr(module, self._tensor_name + "_orig") + pruned_tensor: _torch.Tensor = mask.to(dtype=orig.dtype) * orig + return pruned_tensor + + def apply_mask(self, module: _torch.nn.Module) -> _torch.Tensor: + return self._apply_mask_impl(module) + + def infer_sparsity_amount_from_external_mask(self, external_mask: _torch.Tensor) -> float: + """ + Infer the sparsity amount from a given binary mask based on the granularity + configuration of the pruning method + """ + if hasattr(self, "granularity"): + # rank 2: torch.Linear, rank 3: torch.Conv1d, rank 4: torch.Conv2d, rank 5: torch.Conv3d + rank = len(external_mask.shape) + + if self.granularity == "per_scalar" or rank == 2: + return external_mask.eq(0).float().mean().item() + elif rank in [3, 4, 5]: + if self.granularity == "per_kernel": + start_dim = 2 + elif self.granularity == "per_channel": + start_dim = 1 + else: + raise ValueError( + f"Can not infer sparsity amount for granularity: {self.granularity}" + ) + return external_mask.flatten(start_dim).eq(0).all(-1).float().mean().item() + else: + raise ValueError(f"weights tensor rank must be in [2, 3, 4, 5], got {rank}") + + def get_sparsity_summary(self, module: _torch.nn.Module) -> _Dict[str, _torch.tensor]: + """ + Returns summary of the current state of pruning of module, indexed with name. 
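These classes build on PyTorch's native pruning reparametrization, in which ``weight`` is recomputed from ``weight_orig`` and ``weight_mask`` by a forward pre-hook and ``remove`` folds the mask back into a plain parameter. A small sketch using ``torch.nn.utils.prune`` directly to show that mechanism:

.. code-block:: python

    import torch
    import torch.nn.utils.prune as prune

    linear = torch.nn.Linear(4, 4)
    mask = (torch.rand(4, 4) > 0.5).float()
    prune.custom_from_mask(linear, name="weight", mask=mask)

    # `weight` is now a computed tensor: weight_orig * weight_mask
    assert torch.equal(linear.weight, linear.weight_orig * linear.weight_mask)

    # fuse the mask into the parameter and drop weight_orig / weight_mask
    prune.remove(linear, "weight")
    assert not hasattr(linear, "weight_orig")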
+ """ + assert self._tensor_name is not None, "Module {} has not been pruned".format(module) + weight: _torch.Tensor = getattr(module, self._tensor_name).detach() + if hasattr(module, "weight_fake_quant") and hasattr(module.weight_fake_quant, "palettize"): + weight = module.weight_fake_quant.palettize(weight) + + summary = { + "structured_weight_sparsity": _structured_sparsity(weight), + "unstructured_weight_sparsity": _unstructured_sparsity(weight), + } + + if weight.size(0) % 2 == 0: + summary["block2_weight_sparsity"] = _block2_sparsity(weight) + else: + summary["block2_weight_sparsity"] = -1 # Not applicable + return summary + + +class _SyncScheduledValueLoadStateDictPostHook(_LoadStateDictPostHook): + def __init__(self, scheduled_value_name: str): + super().__init__() + self._scheduled_value_name = scheduled_value_name + + def __call__(self, module: _torch.nn.Module, incompatible_keys: _NamedTuple) -> None: + if hasattr(module, "pruning_method"): + pruning_method: ScheduledBaseDynamicPruningMethod = module.pruning_method + assert hasattr(pruning_method, "_tensor_name"), ( + f"state_dict cannot be loaded. Attribute _tensor_name " + f"missing from pruning forward hook installed on the " + f"module: {module}" + ) + assert hasattr(pruning_method, self._scheduled_value_name), ( + f"state_dict cannot be loaded. Attribute {self._scheduled_value_name} " + f"missing from pruning forward hook installed on the module {module}" + ) + scheduled_value_buffer_name = ( + f"{pruning_method._tensor_name}_{self._scheduled_value_name}" + ) + assert hasattr(module, scheduled_value_buffer_name), ( + f"state_dict cannot be loaded. Buffer {scheduled_value_buffer_name} " + f"missing from module: {module}" + ) + scheduled_value = getattr(module, scheduled_value_buffer_name) + # set pruning method amount to be the same as the value from state dict + if isinstance(scheduled_value, _torch.Tensor): + scheduled_value = scheduled_value.data.item() + setattr(pruning_method, self._scheduled_value_name, scheduled_value) + + +class ScheduledBaseDynamicPruningMethod(BaseDynamicPruningMethod): + """ + An extension of BaseDynamicPruningMethod for scheduled pruners + where the pruning amount is changed externally over the + course of the training. 
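A small sketch of the three metrics returned by ``get_sparsity_summary``, computed directly on a toy 4x4 weight with the same expressions used by the helpers in ``pruning/_utils.py`` (shown later in this patch):

.. code-block:: python

    import torch

    w = torch.tensor(
        [
            [0.0, 0.0, 0.0, 0.0],  # fully pruned output channel
            [0.0, 0.0, 0.0, 0.0],  # fully pruned output channel
            [1.0, 0.0, 2.0, 0.0],
            [3.0, 4.0, 0.0, 0.0],
        ]
    )

    # fraction of individual zero weights
    unstructured = w.eq(0.0).float().mean().item()                  # 12/16 = 0.75
    # fraction of fully-zero output channels
    structured = w.flatten(1).sum(1).eq(0.0).float().mean().item()  # 2/4 = 0.5
    # fraction of zero 2-element blocks along the output-channel axis
    block2 = w.flatten(1).view(2, 2, -1).sum(1).eq(0.0).float().mean().item()  # 5/8 = 0.625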
+ """ + + def __init__(self, scheduled_value: _Any, scheduled_value_name: str, **kwargs: _ParamsDict): + super().__init__() + self.scheduled_value_name = scheduled_value_name + setattr(self, scheduled_value_name, scheduled_value) + self.sync_scheduled_value_post_hook_handle: _Optional[_hooks.RemovableHandle] = None + + def bind_module(self, module: _torch.nn.Module) -> None: + super().bind_module(module) + param_tensor = getattr(module, self._tensor_name + "_orig") + scheduled_value = getattr(self, self.scheduled_value_name) + scheduled_value_tensor = _torch.tensor(scheduled_value, device=param_tensor.device) + module.register_buffer( + self._tensor_name + "_" + self.scheduled_value_name, + scheduled_value_tensor, + ) + self.sync_scheduled_value_post_hook_handle = module.register_load_state_dict_post_hook( + _SyncScheduledValueLoadStateDictPostHook(self.scheduled_value_name) + ) + + def update_mask(self, module: _torch.nn.Module, scheduled_value: float) -> None: + assert self._tensor_name is not None + assert self.scheduled + + # Get the original non-pruned parameter tensor + orig = getattr(module, self._tensor_name + "_orig") + + assert ( + orig is not None + ), "Must have called apply() to initialize pruning before calling update_mask()" + + # Update scheduled value + setattr(self, self.scheduled_value_name, scheduled_value) + # keep scheduled value buffer in sync + scheduled_value_tensor: _torch.Tensor = getattr( + module, self._tensor_name + "_" + self.scheduled_value_name + ) + scheduled_value_tensor.fill_(scheduled_value) + + # Update the mask with the new amount + module.register_buffer( + self._tensor_name + "_mask", + self.compute_mask(orig, default_mask=None), + ) + + def _remove_impl(self, module: _torch.nn.Module, fuse_pruning_mask: bool) -> None: + super()._remove_impl(module, fuse_pruning_mask) + del module._buffers[self._tensor_name + "_" + self.scheduled_value_name] + if self.sync_scheduled_value_post_hook_handle is not None: + self.sync_scheduled_value_post_hook_handle.remove() + self.sync_scheduled_value_post_hook_handle = None diff --git a/coremltools/optimize/torch/pruning/_utils.py b/coremltools/optimize/torch/pruning/_utils.py new file mode 100644 index 000000000..f53465568 --- /dev/null +++ b/coremltools/optimize/torch/pruning/_utils.py @@ -0,0 +1,275 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import logging as _logging +from typing import List as _List +from typing import Optional as _Optional +from typing import Tuple as _Tuple +from typing import cast as _cast + +import numpy as _np +import torch as _torch + +logger = _logging.getLogger(__name__) + + +def lerp(v0, v1, t): + return v0 + (v1 - v0) * t + + +def spline(v0, v1, t, power): + one_m_t = 1.0 - t + x = one_m_t**power + return lerp(v1, v0, x) + + +def magnitude_ranked_mask( + weights: _torch.Tensor, sparsity_fraction: float, block_size: int, granularity: str +) -> _torch.Tensor: + """ + Compute a binary mask for pruning based on magnitude-based ranking + If granularity is `per_scalar`, L1 norm is used. 
L2 is used otherwise + """ + shape = weights.shape + rank = len(shape) + + # rank 1: flattened global unstructured weights, rank 2: torch.Linear, rank 3: torch.Conv1d, + # rank 4: torch.Conv2d, rank 5: torch.Conv3d + assert rank in [1, 2, 3, 4, 5], f"weights tensor rank must be in [1, 2, 3, 4, 5], got {rank}" + + if granularity == "per_scalar" or rank == 2: + magnitude_map = weights.abs() + nb_weight_components = weights.numel() + + elif rank in [3, 4, 5]: + if granularity == "per_kernel": + start_dim = 2 + nb_weight_components = shape[0] * shape[1] + elif granularity == "per_channel": + start_dim = 1 + nb_weight_components = shape[0] + else: + raise ValueError(f"Unsupported granularity for magnitude_ranked_mask: {granularity}") + + # Compute L2 norm per weight slice (as defined by the granularity) + magnitude_map = _torch.norm(weights.flatten(start_dim), dim=-1) + for _ in range(rank - start_dim): + magnitude_map = magnitude_map.unsqueeze(-1) + + if block_size > 1: + ch_shape = shape[0] + if ch_shape % block_size != 0: + # Since the number of channels isn't divisible by block size, + # we shall pad the channels so that it is divisible + pad_shape = list(magnitude_map.shape) + pad_shape[0] = block_size - ch_shape % block_size + magnitude_map = _torch.cat( + [magnitude_map, _torch.zeros(pad_shape, device=magnitude_map.device)], dim=0 + ) + ch_shape = magnitude_map.shape[0] + assert ch_shape % block_size == 0 + + if block_size > ch_shape / 2: + raise ValueError( + f"Pruning block size ({block_size}) can be at most half the number of output channels ({ch_shape}/2={ch_shape/2})" + ) + + # Reshape to expose the "block" sub-axis + s = list(magnitude_map.shape) # block exposed shape + s.insert(1, block_size) + s[0] = int(s[0] / block_size) + f = [-1] * len(s) # expand factors to recover orig shape + f[1] = block_size + magnitude_map = ( + magnitude_map.view(s) + .pow(2) + .sum(1, keepdim=True) + .sqrt() + .expand(f) + .contiguous() + .view(magnitude_map.shape) + ) + + # Reshape to original shape in case of padding + magnitude_map = magnitude_map[: shape[0]] + + nb_nonzero = _torch.ceil( + _torch.as_tensor(nb_weight_components, dtype=_torch.float32) * (1 - sparsity_fraction) + ).int() + + # handle special case when sparsity_fraction = 1.0 + if nb_nonzero == 0: + thr = 1.0 + magnitude_map.flatten().max() + else: + thr = ( + magnitude_map.flatten().sort()[0].flip(0)[nb_nonzero - 1] + ) # produces same mask for 1.0 and 0.0 sparsity + + mask = _torch.greater_equal(magnitude_map, thr) + + return mask + + +def n_m_mask(weights: _torch.Tensor, nm: _Tuple[int, int], dim: _Optional[int] = 1): + """ + Create a n:m sparsity mask. + """ + shape = weights.shape + permuted_shape = shape + rank = len(shape) + num_zeros, block_size = nm + mask_value = 0.0 + + assert num_zeros < block_size, ( + f"n (number of zeros) = {num_zeros} must be " f"less than m (block size) = {block_size}" + ) + + assert dim in [0, 1], ( + f"n:m mask is supported along dimensions (0, 1), " + f"corresponding to input and output channels. 
Received " + f"dim = {dim}" + ) + # rank 2: torch.Linear, rank 3: torch.Conv1d, + # rank 4: torch.Conv2d, rank 5: torch.Conv3d + assert rank in [2, 3, 4, 5], f"weights tensor rank must be in [2, 3, 4, 5], got {rank}" + + # num_non_zeros = block_size - num_zeros + + # if n:m is required along C_o, flip C_i and C_o + if dim == 0: + weights = _torch.permute(weights, [1, 0] + list(range(2, rank))) + # transform to A x C_i + # For Conv1D: C_o x C_i x H ==> H x C_o x C_i ==> H*C_o x C_i + # For Conv2D: C_o x C_i x H x W ==> H x W x C_o x C_i ==> H*W*C_o x C_i + # For Conv3D: C_o x C_i x H x W x D ==> H x W x D x C_o x C_i ==> H*W*D*C_o x C_i + if rank > 2: + permute_array = list(range(2, rank)) + [0, 1] + weights = _torch.permute(weights, permute_array) + permuted_shape = weights.shape + weights = _torch.reshape(weights, (-1, weights.shape[-1])) + + abs_weights = weights.abs() + padding_size = block_size - abs_weights.shape[-1] % block_size + abs_weights_pad = _torch.nn.functional.pad(abs_weights, (0, padding_size), mode="constant") + + num_blocks = abs_weights_pad.numel() // block_size + weights_blocks = abs_weights_pad.view(num_blocks, block_size) + + indices = _torch.argsort(weights_blocks, dim=1)[:, :num_zeros] + sparsity_mask = _torch.ones([num_blocks, block_size], device=weights.device) + sparsity_mask.scatter_(dim=1, index=indices, value=mask_value) + sparsity_mask = sparsity_mask.view(abs_weights_pad.shape) + sparsity_mask = sparsity_mask[:, : abs_weights.shape[-1]] + + # revert changes to mask shape to achieve same size as original weight + if rank > 2: + sparsity_mask = _torch.reshape(sparsity_mask, permuted_shape) + permute_array = [rank - 2, rank - 1] + list(range(0, rank - 2)) + sparsity_mask = _torch.permute(sparsity_mask, permute_array) + if dim == 0: + sparsity_mask = _torch.permute(sparsity_mask, [1, 0] + list(range(2, rank))) + + return sparsity_mask + + +def block2_sparsity(weight: _torch.Tensor) -> _torch.Tensor: + n = weight.size(0) + assert n % 2 == 0 + return weight.flatten(1).view(n // 2, 2, -1).sum(1).eq(0.0).float().mean().item() + + +def structured_sparsity(weight: _torch.Tensor) -> _torch.Tensor: + return weight.flatten(1).sum(1).eq(0.0).float().mean().item() + + +def unstructured_sparsity(weight: _torch.Tensor) -> _torch.Tensor: + return weight.eq(0.0).float().mean().item() + + +def unstructured_sparsity_matrix( + name: str, weight: _torch.Tensor, block_size: int +) -> _torch.Tensor: + import matplotlib + + matplotlib.use("agg") + import matplotlib.pyplot as plt + + rank = len(weight.shape) + + weight = weight.clone().detach() + + if block_size is not None and block_size > 1: + C_out, C_in = weight.shape[:2] + assert C_out % block_size == 0 + if rank > 2: + weight = weight.flatten(2).view(C_out // block_size, block_size, C_in, -1) + else: + weight = weight.view(C_out // 2, 2, C_in) + + sparsity_matrix = weight.sum(1).eq(0.0).float() + else: + sparsity_matrix = weight.eq(0.0).float() + + if rank > 2: + max_kernel_support = _np.prod(sparsity_matrix.shape[2:]) + sparsity_matrix = sparsity_matrix.sum(dim=tuple(range(2, len(sparsity_matrix.shape)))) + else: + max_kernel_support = 1 + + f = plt.figure() + ax = f.gca() + ax.imshow( + max_kernel_support - sparsity_matrix.cpu().numpy(), + cmap="jet", + interpolation="nearest", + vmin=0, + vmax=max_kernel_support, + ) + ax.set_xlabel("Input channels index") + ax.set_ylabel("Output channels index") + sparsity_type = ( + f"Block-{block_size}" if block_size is not None and block_size > 1 else "Unstructured" + ) + 
ax.set_title(f"{sparsity_type} Sparsity Matrix for Layer {name}") + ax.set_xticks([]) + ax.set_yticks([]) + f.canvas.draw() + + im = _np.frombuffer(f.canvas.tostring_rgb(), dtype=_np.uint8).copy() + im = im.reshape((1,) + f.canvas.get_width_height()[::-1] + (3,)) + + f.clear() + plt.close(f) + return _torch.from_numpy(im) + + +def get_global_sparsity_summaries( + layer_sparsities: _List[_torch.Tensor], layer_numel: _List[int] +) -> float: + assert len(layer_sparsities) == len(layer_numel) + + weighted_sum, denom = 0.0, 0.0 + for sparsity, numel in zip(layer_sparsities, layer_numel): + if sparsity >= 0.0: + denom += numel + weighted_sum += numel * _cast(float, sparsity) + + if _torch.all(_torch.tensor(layer_sparsities) < 0): + # to indicate the sparsity type is not applicable + return -1 + + assert denom > 0.0 + return weighted_sum / denom + + +def validate_allowed_granularity_values(instance, attribute, value): + if value is None: + return + allowed_values = ["per_scalar", "per_kernel", "per_channel", "per_layer"] + if value not in allowed_values: + raise ValueError( + f"Allowed values for granularity are: {', '.join(allowed_values)}. " + f"Received: {value}" + ) diff --git a/coremltools/optimize/torch/pruning/magnitude_pruner.py b/coremltools/optimize/torch/pruning/magnitude_pruner.py new file mode 100644 index 000000000..cdd2e1a4b --- /dev/null +++ b/coremltools/optimize/torch/pruning/magnitude_pruner.py @@ -0,0 +1,386 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import copy as _copy +import logging as _logging +from collections import OrderedDict as _OrderedDict +from typing import Any as _Any +from typing import Callable as _Callable +from typing import Dict as _Dict +from typing import NewType as _NewType +from typing import Optional as _Optional +from typing import Tuple as _Tuple +from typing import Union as _Union + +import attrs as _attrs +import cattrs as _cattrs +import torch as _torch +from attr import define as _define +from attr import field as _field +from attrs import validators as _validators + +from coremltools.optimize.torch._typing import ParamsDict as _ParamsDict +from coremltools.optimize.torch.optimization_config import ( + ModuleOptimizationConfig as _ModuleOptimizationConfig, +) +from coremltools.optimize.torch.optimization_config import OptimizationConfig as _OptimizationConfig +from coremltools.optimize.torch.optimization_config import ( + _structure_from_dict_hook_factory, + _validate_module_type_keys_factory, +) +from coremltools.optimize.torch.pruning._base_pruner import ( + BasePrunerWithPruningMethod as _BasePrunerWithPruningMethod, +) +from coremltools.optimize.torch.pruning._base_pruner import _allowed_granularity_values +from coremltools.optimize.torch.pruning._base_pruning_method import ( + ScheduledBaseDynamicPruningMethod as _ScheduledBaseDynamicPruningMethod, +) +from coremltools.optimize.torch.pruning._utils import ( + magnitude_ranked_mask as _magnitude_ranked_mask, +) +from coremltools.optimize.torch.pruning._utils import n_m_mask as _n_m_mask +from coremltools.optimize.torch.pruning.pruning_scheduler import ( + ConstantSparsityScheduler as _ConstantSparsityScheduler, +) +from coremltools.optimize.torch.pruning.pruning_scheduler import ( + PruningScheduler as _PruningScheduler, +) +from coremltools.optimize.torch.pruning.pruning_scheduler import _PruningSchedulerType + 
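A tiny worked example of the ``n:m`` masking implemented by ``n_m_mask`` above: with ``(n, m) = (2, 4)``, the two smallest-magnitude entries in every block of four consecutive elements along the masking dimension are zeroed. This sketch reproduces the core argsort/scatter step on a single row:

.. code-block:: python

    import torch

    row = torch.tensor([[0.3, -1.2, 0.05, 2.0, -0.4, 0.1, 0.7, -0.02]])
    n, m = 2, 4

    blocks = row.abs().view(-1, m)          # 2 blocks of 4 elements
    drop = blocks.argsort(dim=1)[:, :n]     # indices of the n smallest per block
    mask = torch.ones_like(blocks)
    mask.scatter_(1, drop, 0.0)
    mask = mask.view_as(row)
    print(mask)  # tensor([[0., 1., 0., 1., 1., 0., 1., 0.]])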
+_logger = _logging.getLogger(__name__) + + +_SUPPORTED_MODULES = (_torch.nn.Linear, _torch.nn.Conv1d, _torch.nn.Conv2d, _torch.nn.Conv3d) + + +@_define +class ModuleMagnitudePrunerConfig(_ModuleOptimizationConfig): + """ + Module level configuration for :py:class:`MagnitudePruner`. + + Args: + scheduler (:py:class:`PruningScheduler`): A pruning scheduler which specifies how the + sparsity should be changed over the course of the training. + initial_sparsity (:obj:`float`): Desired fraction of zeroes at the beginning of the + training process. + target_sparsity (:obj:`float`): Desired fraction of zeroes at the end of the + training process. + granularity (:obj:`str`): Specifies the granularity at which the pruning mask will be + computed. Can be one of ``per_layer``, ``per_channel``, ``per_kernel``, ``per_scalar``. + block_size (:obj:`int`): Block size for inducing block sparsity within the mask. This + is applied on the output channel dimension of the parameter (the ``0`` -th dimension). Having the zeros aligned + in the parameter helps gain latency/memory performance on-device. ``block_size`` must be greater than ``1`` + to enable block sparsity, and must be at most half the number of output channels, and must be divisible by + the number of output channels. + n_m_ratio (:obj:`tuple` of :obj:`int`): A tuple of two integers which specify how ``n:m`` pruning should be + applied. In ``n:m`` pruning, out of every ``m`` elements, + ``n`` with lowest magnitude are set to zero. When ``n_m_ratio`` is not ``None``, ``block_size``, + ``granularity``, and ``initial_sparsity`` should be ``1``, ``per_scalar``, and ``0.0`` respectively. + The value of ``target_sparsity`` is ignored and the actualy target sparsity is determined by the + ``n:m`` ratio. For more information, see `Learning N:M Fine-Grained Structured Sparse Neural Networks From Scratch `_. + dim (:obj:`int`): Dimension along which blocks of ``m`` elements are chosen when applying ``n:m`` sparsity. This + parameter is only used when ``n_m_ratio`` is not ``None``. + param_name (:obj:`str`): The name of the parameter to be pruned. Defaults to ``weight``. + """ + + scheduler: _PruningSchedulerType = _field( + default=_ConstantSparsityScheduler(begin_step=0), + validator=_validators.instance_of(_PruningScheduler), + ) + initial_sparsity: float = _field(default=0.0, validator=_validators.instance_of(float)) + target_sparsity: float = _field(default=0.5, validator=_validators.instance_of(float)) + granularity: str = _field( + default="per_scalar", + validator=[_validators.instance_of(str), _validators.in_(_allowed_granularity_values)], + ) + block_size: int = _field(default=1, validator=_validators.instance_of(int)) + n_m_ratio: _Optional[_Tuple[int, int]] = _field( + default=None, + validator=_attrs.validators.optional( + _validators.deep_iterable( + member_validator=_validators.instance_of(int), + iterable_validator=_validators.instance_of((tuple, list)), + ) + ), + ) + dim: int = _field(default=1, validator=_validators.instance_of(int)) + param_name: str = _field(default="weight", validator=_validators.instance_of(str)) + + def __attrs_post_init__(self): + if self.n_m_ratio is not None: + assert ( + len(self.n_m_ratio) == 2 + ), f"n_m_ratio must be a tuple of 2 integers, received: {self.n_m_ratio}" + n, m = self.n_m_ratio + assert m > 0, f"Received n_m_ratio (n, m): {self.n_m_ratio}. m must be greater than 0." + assert n <= m, ( + f"Received n_m_ratio (n, m): {self.n_m_ratio}. 

The number of zeros in a block (n) "
+                f"must be less than or equal to the block size (m)."
+            )
+            if self.block_size is not None and self.block_size > 1:
+                raise ValueError(
+                    f"Received block_size = {self.block_size} and n_m_ratio = {self.n_m_ratio}. "
+                    f"These two modes are mutually exclusive. When n_m_ratio != None, "
+                    f"the only allowed value of block_size is 1. "
+                    f"n_m_ratio should be equal to None for block_size > 1."
+                )
+            if self.granularity is not None and self.granularity != "per_scalar":
+                raise ValueError(
+                    f"Received granularity = {self.granularity} and n_m_ratio = {self.n_m_ratio}. "
+                    f"When n_m_ratio != None, the only allowed value of granularity is "
+                    f"per_scalar."
+                )
+            if self.initial_sparsity is not None and self.initial_sparsity > 0.0:
+                raise ValueError(
+                    f"Received initial_sparsity = {self.initial_sparsity} and "
+                    f"n_m_ratio = {self.n_m_ratio}. When n_m_ratio != None, the only allowed "
+                    f"value of initial_sparsity is 0."
+                )
+
+
+_ModuleTypeConfigType = _NewType(
+    "ModuleTypeConfigType",
+    _Dict[_Union[_Callable, str], _Optional[ModuleMagnitudePrunerConfig]],
+)
+
+
+@_define
+class MagnitudePrunerConfig(_OptimizationConfig):
+    """
+    Configuration for :py:class:`MagnitudePruner`.
+
+    Args:
+        global_config (:py:class:`ModuleMagnitudePrunerConfig`): Config to be applied globally
+            to all supported modules. Missing values are chosen from the default config.
+        module_type_configs (:obj:`dict` of :obj:`str` to :py:class:`ModuleMagnitudePrunerConfig`): Module
+            type level configs applied to a specific module class, such as :py:class:`torch.nn.Linear`.
+            The keys can be either strings or module classes.
+        module_name_configs (:obj:`dict` of :obj:`str` to :py:class:`ModuleMagnitudePrunerConfig`): Module level
+            configs applied to specific modules. The name of the module must be a fully qualified name that can
+            be used to fetch it from the top level module using the ``module.get_submodule(target)`` method.
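A minimal sketch of a config that satisfies the ``n:m`` constraints enforced in ``__attrs_post_init__`` above: with ``n_m_ratio`` set, ``block_size``, ``granularity``, and ``initial_sparsity`` must keep their defaults (``1``, ``per_scalar``, ``0.0``). The import path is an assumption based on the pruning package ``__init__`` earlier in this patch:

.. code-block:: python

    from coremltools.optimize.torch.pruning import (
        MagnitudePrunerConfig,
        ModuleMagnitudePrunerConfig,
    )

    config = MagnitudePrunerConfig(
        module_name_configs={
            # zero out 3 of every 4 elements along dim=1 (input channels) of conv1's weight
            "conv1": ModuleMagnitudePrunerConfig(n_m_ratio=(3, 4)),
        }
    )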
+ """ + + global_config: _Optional[ModuleMagnitudePrunerConfig] = _field( + default=None, + validator=_validators.optional(_validators.instance_of(ModuleMagnitudePrunerConfig)), + ) + module_type_configs: _ModuleTypeConfigType = _field( + factory=_OrderedDict, + validator=_validators.deep_mapping( + key_validator=_validators.and_( + _validators.instance_of((str, _Callable)), + _validate_module_type_keys_factory(_SUPPORTED_MODULES), + ), + value_validator=_validators.optional( + _validators.instance_of(ModuleMagnitudePrunerConfig) + ), + mapping_validator=_validators.instance_of(dict), + ), + ) + module_name_configs: _Dict[str, _Optional[ModuleMagnitudePrunerConfig]] = _field( + factory=_OrderedDict, + validator=_validators.deep_mapping( + key_validator=_validators.instance_of(str), + value_validator=_validators.optional( + _validators.instance_of(ModuleMagnitudePrunerConfig) + ), + mapping_validator=_validators.instance_of(dict), + ), + ) + + def __attrs_post_init__(self): + if ( + self.global_config is None + and len(self.module_type_configs) == 0 + and len(self.module_name_configs) == 0 + ): + self.global_config = ModuleMagnitudePrunerConfig() + + @classmethod + def from_dict(cls, config_dict: _Dict[str, _Any]) -> "MagnitudePrunerConfig": + super().from_dict(config_dict) + converter = _cattrs.Converter(forbid_extra_keys=True) + converter.register_structure_hook( + _ModuleTypeConfigType, + _structure_from_dict_hook_factory(ModuleMagnitudePrunerConfig), + ) + return converter.structure_attrs_fromdict(config_dict, cls) + + +class _MagnitudePruningMethod(_ScheduledBaseDynamicPruningMethod): + """ + Magnitude-based static mask pruning method as described in + `To Prune or Not to Prune `_. + """ + _tensor_name: str + scheduled: bool = True + amount: float + + def __init__( + self, + amount: float, + block_size: int, + granularity: str, + n_m_ratio: _Optional[_Tuple[int, int]] = None, + dim: _Optional[int] = None, + **kwargs: _ParamsDict, + ): + super().__init__(scheduled_value=amount, scheduled_value_name="amount") + self.block_size = block_size + self.granularity = granularity + self.n_m_ratio = n_m_ratio + self.dim = dim + + def compute_mask(self, t: _torch.Tensor, default_mask: _torch.Tensor) -> _torch.Tensor: + if self.n_m_ratio is not None: + _, block_size = self.n_m_ratio + num_zeros = int(self.amount * block_size) + if num_zeros == 0: + # when number of zeros is < 0, we increase sparsity gradually + return _magnitude_ranked_mask(t, self.amount, 1, self.granularity).float() + else: + return _n_m_mask(t, (num_zeros, block_size), self.dim).float() + else: + return _magnitude_ranked_mask(t, self.amount, self.block_size, self.granularity).float() + + +@_define +class _MagnitudePrunerInfo: + config: ModuleMagnitudePrunerConfig + module: _torch.nn.Module + sparsity_level: float + + +class MagnitudePruner(_BasePrunerWithPruningMethod): + """ + This pruning algorithm was inspired by the paper `"To prune or not to prune" + `_. + + In order to achieve the desired sparsity, the algorithm sorts a module's weight matrix + by the magnitude of its elements, and sets all elements less than a threshold to zero. + Magnitude is computed using L1 norm when granularity is ``per_scalar``, otherwise, L2 + norm is used. + + The pruner can be configured at different granularities such as per scalar, per kernel, + per channel (output channel), or per layer, to induce varying sparsity structures in the + weight matrix. 
+ + When the ``block_size`` parameter is provided in the config, zeros are induced in the same locations + (across all other axes) in ``block_size`` number of consecutive output channels. + + When the ``n_m_ratio`` parameter is provided in the config, out of every ``m`` elements, the smallest ``n`` + are set to zero. The ``m`` element blocks are chosen along the dimension specified by the ``dim`` parameter. + + Example: + .. code-block:: python + + import torch + from collections import OrderedDict + from coremltools.optimize.torch.pruning import MagnitudePruner, MagnitudePrunerConfig + + # define model and loss function + model = torch.nn.Sequential( + OrderedDict( + [ + ("conv1", torch.nn.Conv2d(3, 32, 3, padding="same")), + ("conv2", torch.nn.Conv2d(32, 32, 3, padding="same")), + ] + ) + ) + loss_fn = define_loss() # define the loss function + + # initialize pruner and configure it + # we only prune the fisrt conv layer + config = MagnitudePrunerConfig.from_dict( + { + "module_name_configs": { + "conv1": { + "scheduler": {"update_steps": [3, 5, 7]}, + "target_sparsity": 0.75, + "granularity": "per_channel", + }, + } + } + ) + + pruner = MagnitudePruner(model, config) + + # insert pruning layers in the model + model = pruner.prepare() + + for inputs, labels in data: + output = model(inputs) + loss = loss_fn(output, labels) + loss.backward() + optimizer.step() + pruner.step() + + # commit pruning masks to model parameters + pruner.finalize(inplace=True) + + Args: + model (:py:class:`torch.nn.Module`): Model on which the pruner will act. + config (:py:class:`MagnitudePrunerConfig`): Config which specifies how + different submodules in the model will be configured for pruning. + Default config is used when passed as ``None``. + """ + _supported_modules: _Tuple = _SUPPORTED_MODULES + + def __init__(self, model: _torch.nn.Module, config: _Optional[MagnitudePrunerConfig] = None): + config = MagnitudePrunerConfig() if config is None else config + super().__init__(model, config) + + def prepare(self, inplace: bool = False) -> _torch.nn.Module: + if self._is_prepared: + _logger.warning( + "Model has already been prepared for pruning. This API call will be a no-op." + ) + return self._model + self._model = super().prepare(inplace=inplace) + for name, submodule in self._model.named_modules(remove_duplicate=True): + submod_config = self._config.get_module_config(name, submodule) + if isinstance(submodule, self._supported_modules) and submod_config is not None: + submod_config = _copy.deepcopy(submod_config) + if submod_config.n_m_ratio is not None: + num_zeros, block_size = submod_config.n_m_ratio + # Add target sparsity to make scheduler work + submod_config.target_sparsity = float(num_zeros) / float(block_size) + _MagnitudePruningMethod.from_module_and_params( + submodule, + param_name=submod_config.param_name, + amount=submod_config.initial_sparsity, + block_size=submod_config.block_size, + granularity=submod_config.granularity, + n_m_ratio=submod_config.n_m_ratio, + dim=submod_config.dim, + ) + self._pruner_info[name] = _MagnitudePrunerInfo( + config=submod_config, + module=submodule, + sparsity_level=submod_config.initial_sparsity, + ) + return self._model + + def step(self): + if not self._is_prepared: + _logger.warning( + "Model has not been prepared for pruning. This API call " + "will be a no-op. prepare method must be called before " + "a call to the step method." 
+ ) + return + self._step_count += 1 + for name, pruner_info in self._pruner_info.items(): + if hasattr(pruner_info.module, "pruning_method"): + sparsity_level = pruner_info.config.scheduler.compute_sparsity( + self._step_count, + prev_sparsity=pruner_info.sparsity_level, + config=pruner_info.config, + ) + if sparsity_level != pruner_info.sparsity_level: + pruner_info.module.pruning_method.update_mask( + pruner_info.module, sparsity_level + ) + pruner_info.sparsity_level = sparsity_level diff --git a/coremltools/optimize/torch/pruning/pruning_scheduler.py b/coremltools/optimize/torch/pruning/pruning_scheduler.py new file mode 100644 index 000000000..591f9e587 --- /dev/null +++ b/coremltools/optimize/torch/pruning/pruning_scheduler.py @@ -0,0 +1,144 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +from abc import ABC as _ABC +from abc import abstractmethod as _abstractmethod +from typing import Union as _Union + +import attr as _attr +import torch as _torch +from attr import define as _define +from attr import field as _field +from attrs import validators as _validators + +from coremltools.optimize.torch._utils.torch_utils import ( + list_or_str_to_tensor as _list_or_str_to_tensor, +) +from coremltools.optimize.torch.optimization_config import ( + ModuleOptimizationConfig as _ModuleOptimizationConfig, +) +from coremltools.optimize.torch.pruning._utils import spline as _spline + + +@_define +class PruningScheduler(_ABC): + """ + An abstraction for implementing schedules to be used for + changing the sparsity of pruning masks applied by various types of + pruning algorithms to module parameters over the course of the training. + """ + + @_abstractmethod + def compute_sparsity( + self, step_count: int, prev_sparsity: float, config: _ModuleOptimizationConfig + ) -> float: + """ + Compute the sparsity at the next step given the previous sparsity + and the module optimization config. + + Args: + step_count (:obj:`int`): Current step count. + prev_sparsity (:obj:`float`): Sparsity at previous step. + config (:py:class:`ModuleOptimizationConfig`): Optimization + config for the module which contains information such as + target sparsity and initial sparsity. + """ + raise NotImplementedError() + + +@_define +class PolynomialDecayScheduler(PruningScheduler): + r""" + A pruning scheduler inspired by the paper `"To prune or not to prune" `_. + + It sets the sparsity at step :math:`t` using the formula: + + .. math:: + + sparsity_t = target\_sparsity + (initial\_sparsity - target\_sparsity) + * (1 - \frac{update\_index}{total\_number\_of\_updates}) ^ {power} + + If :math:`t` is in :math:`update\_steps`, else it keeps the sparsity at its previous value. + + Here, :math:`update\_index` is the index of :math:`t` in the :math:`update\_steps` array and + :math:`total\_number\_of\_updates` is the length of :math:`update\_steps` array. + + Args: + update_steps (:obj:`list` of :obj:`int` or :obj:`str`): The indices of + optimization steps at which pruning should be performed. This can + be passed in as a string representing the range, such as + ``range(start_index, end_index, step_size)``. + power (:obj:`int`, optional): Exponent to be used in the + sparsity function. Defaults to ``3``. 
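A small numeric sketch of this schedule, following the ``compute_sparsity`` implementation below (which divides the update index by ``len(update_steps) - 1``, so the last scheduled update always reaches ``target_sparsity``):

.. code-block:: python

    initial, target, power = 0.0, 0.8, 3
    update_steps = [2, 4, 6, 8]

    for idx, step in enumerate(update_steps):
        t = idx / (len(update_steps) - 1)
        sparsity = target + (initial - target) * (1.0 - t) ** power
        print(step, round(sparsity, 3))
    # -> step 2: 0.0, step 4: 0.563, step 6: 0.77, step 8: 0.8
    # sparsity is held constant between scheduled update steps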
+ """ + + update_steps: _torch.tensor = _field( + converter=_list_or_str_to_tensor, eq=_attr.cmp_using(eq=_torch.equal) + ) + power: int = _field(default=3, validator=_validators.instance_of(int)) + + @update_steps.validator + def _check_update_steps(self, attribute, value): + assert ( + len(value.size()) == 1 + ), f"update_steps: {value} must be a 1-D tensor or list of ints." + for elem in value: + if elem.int() != elem: + raise ValueError(f"Each element of update_steps {value} must be an integer.") + assert ( + elem >= 0 + ), f"All elements of update_steps must be non-negative. Received: {value}." + + def compute_sparsity( + self, step_count: int, prev_sparsity: float, config: _ModuleOptimizationConfig + ) -> float: + cur_step_update_steps_mask = step_count == self.update_steps + if _torch.any(cur_step_update_steps_mask): + update_number = _torch.nonzero(cur_step_update_steps_mask, as_tuple=True)[0].item() + update_step_shape = self.update_steps.shape[0] + if update_step_shape == 1: + t = 1.0 + else: + t = update_number / (update_step_shape - 1) + initial_sparsity = ( + config.initial_sparsity if hasattr(config, "initial_sparsity") else 0.0 + ) + assert hasattr(config, "target_sparsity"), ( + f"Attribute target_sparsity not found in config {config}. " + f"{self.__class__} only works with configs " + f"which have this attribute." + ) + return _spline(initial_sparsity, config.target_sparsity, t, self.power) + return prev_sparsity + + +@_define +class ConstantSparsityScheduler(PruningScheduler): + """ + A pruning schedule with constant sparsity throughout training. + + Sparsity is set to zero initially and to ``target_sparsity`` at + step ``begin_step``. + + Args: + begin_step (:obj:`int`): step at which to begin pruning. + """ + + begin_step: int = _field(validator=_validators.instance_of(int)) + + def compute_sparsity( + self, step_count: int, prev_sparsity: float, config: _ModuleOptimizationConfig + ) -> float: + if step_count >= self.begin_step: + assert hasattr(config, "target_sparsity"), ( + f"Attribute target_sparsity not found in config {config}. " + f"{self.__class__} only works with configs " + f"which have this attribute." + ) + return config.target_sparsity + return prev_sparsity + + +_PruningSchedulerType = _Union[PolynomialDecayScheduler, ConstantSparsityScheduler] diff --git a/coremltools/optimize/torch/quantization/__init__.py b/coremltools/optimize/torch/quantization/__init__.py new file mode 100644 index 000000000..3b58bf642 --- /dev/null +++ b/coremltools/optimize/torch/quantization/__init__.py @@ -0,0 +1,38 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +""" +.. _coremltools_optimize_torch_qat: + +Quantization refers to techniques for performing neural network computations in lower precision than +floating point. Quantization can reduce a model’s size and also improve a model’s inference latency and +memory bandwidth requirement, because many hardware platforms offer high-performance implementations of quantized +operations. + +_`LinearQuantizer` +================== + +.. autoclass:: coremltools.optimize.torch.quantization.ModuleLinearQuantizerConfig + :members: from_dict, as_dict, from_yaml + +.. autoclass:: coremltools.optimize.torch.quantization.LinearQuantizerConfig + :members: set_global, set_module_type, set_module_name, from_dict, as_dict, from_yaml + +.. 
autoclass:: coremltools.optimize.torch.quantization.LinearQuantizer + :members: prepare, step, report, finalize + +.. autoclass:: coremltools.optimize.torch.quantization.ObserverType + +.. autoclass:: coremltools.optimize.torch.quantization.QuantizationScheme + +""" + +from .quantization_config import ( + LinearQuantizerConfig, + ModuleLinearQuantizerConfig, + ObserverType, + QuantizationScheme, +) +from .quantizer import LinearQuantizer diff --git a/coremltools/optimize/torch/quantization/_backend_config.py b/coremltools/optimize/torch/quantization/_backend_config.py new file mode 100644 index 000000000..b0e67930a --- /dev/null +++ b/coremltools/optimize/torch/quantization/_backend_config.py @@ -0,0 +1,869 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import operator as _operator +from typing import Any as _Any +from typing import List as _List +from typing import Set as _Set + +import torch as _torch +import torch.ao.nn.qat as _nnq +import torch.ao.nn.quantized.reference as _nnr +import torch.nn as _nn +import torch.nn.functional as _F +import torch.nn.intrinsic as _nni +import torch.nn.intrinsic.qat as _nniq +from torch.ao.quantization.backend_config import BackendConfig as _BackendConfig +from torch.ao.quantization.backend_config import BackendPatternConfig as _BackendPatternConfig +from torch.ao.quantization.backend_config import DTypeWithConstraints as _DTypeWithConstraints + +import coremltools.optimize.torch.quantization.modules.fused_modules as _fused +import coremltools.optimize.torch.quantization.modules.qat_modules as _qat +import coremltools.optimize.torch.quantization.modules.quantized_modules as _quantized +from coremltools.optimize.torch._utils.version_utils import is_torch_2 as _is_torch_2 +from coremltools.optimize.torch.quantization._backend_config_utils import ( + activation_configs as _activation_configs, +) +from coremltools.optimize.torch.quantization._backend_config_utils import ( + binary_op_act_configs as _binary_op_relu_configs, +) +from coremltools.optimize.torch.quantization._backend_config_utils import ( + binary_op_configs as _binary_op_configs, +) +from coremltools.optimize.torch.quantization._backend_config_utils import bn_relu as _bn_relu +from coremltools.optimize.torch.quantization._backend_config_utils import ( + share_observer_configs as _share_observer_configs, +) +from coremltools.optimize.torch.quantization._backend_config_utils import ( + weighted_act_configs as _weighted_act_configs, +) +from coremltools.optimize.torch.quantization._backend_config_utils import ( + weighted_bn_act_configs as _weighted_bn_act_configs, +) +from coremltools.optimize.torch.quantization._backend_config_utils import ( + weighted_bn_configs as _weighted_bn_configs, +) +from coremltools.optimize.torch.quantization._backend_config_utils import ( + weighted_bn_relu_configs as _weighted_bn_relu_configs, +) +from coremltools.optimize.torch.quantization._backend_config_utils import ( + weighted_configs as _weighted_configs, +) +from coremltools.optimize.torch.quantization._backend_config_utils import ( + weighted_relu_configs as _weighted_relu_configs, +) + +# module based activations +_mod_activations = ( + _nn.PReLU, + _nn.RReLU, + _nn.ReLU6, + _nn.LeakyReLU, + _nn.Sigmoid, + _nn.LogSigmoid, + _nn.Hardsigmoid, + _nn.SiLU, + _nn.ELU, + _nn.CELU, + _nn.SELU, + _nn.GLU, + _nn.Mish, + _nn.GELU, + 
_nn.Tanh, + _nn.Hardtanh, + _nn.Softmax, + _nn.LogSoftmax, + _nn.Hardswish, +) + +# functional activations +_func_activations = ( + _F.prelu, + _F.rrelu_, + _F.rrelu, + _F.relu6, + _F.leaky_relu, + _F.leaky_relu_, + _F.logsigmoid, + _F.silu, + _F.elu, + _F.elu_, + _F.celu, + _F.celu_, + _F.selu, + _F.selu_, + _F.glu, + _F.mish, + _F.gelu, + _F.hardtanh, + _F.hardtanh_, + _F.log_softmax, + _F.hardswish, +) + + +# layers which have a fixed output range and hence use fixed qparams +_fixed_qparams_modules = { + _torch.nn.Hardsigmoid, + _torch.nn.functional.hardsigmoid, + "hardsigmoid", + "hardsigmoid_", + _torch.nn.Sigmoid, + _torch.sigmoid, + "sigmoid", + "sigmoid_", + _torch.nn.Softmax, + _torch.nn.Tanh, + _torch.tanh, + "tanh", + "tanh_", +} + + +class _BackendConfigRegistry: + """ + A registry of quantization patterns. + """ + + backend_config: _BackendConfig = _BackendConfig() + supported_modules: _Set[_Any] = set() + + @classmethod + def register(cls): + def inner_wrapper(wrapped_fn): + backend_pattern_configs: _List[_BackendPatternConfig] = wrapped_fn() + for config in backend_pattern_configs: + if not isinstance(config.pattern, tuple): + cls.supported_modules.add(config.pattern) + cls.backend_config.set_backend_pattern_configs(backend_pattern_configs) + return wrapped_fn + + return inner_wrapper + + +@_BackendConfigRegistry.register() +def _conv1d_act() -> _List[_BackendPatternConfig]: + """ + float: Conv1d -> Act + qat: FakeQuant -> qat.ConvAct1d -> FakeQuant + """ + configs = _weighted_relu_configs( + mod=_nn.Conv1d, + func_mod=_F.conv1d, + fused_mod=_nni.ConvReLU1d, + qat_mod=_nniq.ConvReLU1d, + ref_quant_mod=_nnr.Conv1d, + ) + for act in _mod_activations: + configs.extend( + _weighted_act_configs( + mod=_nn.Conv1d, + func_mod=_F.conv1d, + act=act, + fused_mod=_fused.ConvAct1d, + qat_mod=_qat.ConvAct1d, + ref_quant_mod=_quantized.QuantizedConvAct1d, + ) + ) + return configs + + +@_BackendConfigRegistry.register() +def _conv2d_act() -> _List[_BackendPatternConfig]: + """ + float: Conv2d -> Act + qat: FakeQuant -> qat.ConvAct2d -> FakeQuant + """ + configs = _weighted_relu_configs( + mod=_nn.Conv2d, + func_mod=_F.conv2d, + fused_mod=_nni.ConvReLU2d, + qat_mod=_nniq.ConvReLU2d, + ref_quant_mod=_nnr.Conv2d, + ) + for act in _mod_activations: + configs.extend( + _weighted_act_configs( + mod=_nn.Conv2d, + func_mod=_F.conv2d, + act=act, + fused_mod=_fused.ConvAct2d, + qat_mod=_qat.ConvAct2d, + ref_quant_mod=_quantized.QuantizedConvAct2d, + ) + ) + return configs + + +@_BackendConfigRegistry.register() +def _conv3d_act() -> _List[_BackendPatternConfig]: + """ + float: Conv3d -> Act + qat: FakeQuant -> qat.ConvAct3d -> FakeQuant + """ + configs = _weighted_relu_configs( + mod=_nn.Conv3d, + func_mod=_F.conv3d, + fused_mod=_nni.ConvReLU3d, + qat_mod=_nniq.ConvReLU3d, + ref_quant_mod=_nnr.Conv3d, + ) + for act in _mod_activations: + configs.extend( + _weighted_act_configs( + mod=_nn.Conv3d, + func_mod=_F.conv3d, + act=act, + fused_mod=_fused.ConvAct3d, + qat_mod=_qat.ConvAct3d, + ref_quant_mod=_quantized.QuantizedConvAct3d, + ) + ) + return configs + + +@_BackendConfigRegistry.register() +def _linear_act() -> _List[_BackendPatternConfig]: + """ + float: Linear -> Act + qat: FakeQuant -> qat.LinearAct -> FakeQuant + """ + configs = _weighted_relu_configs( + mod=_nn.Linear, + func_mod=_F.linear, + fused_mod=_nni.LinearReLU, + qat_mod=_nniq.LinearReLU, + ref_quant_mod=_nnr.Linear, + ) + for act in _mod_activations: + configs.extend( + _weighted_act_configs( + mod=_nn.Linear, + func_mod=_F.linear, 
+ act=act, + fused_mod=_fused.LinearAct, + qat_mod=_qat.LinearAct, + ref_quant_mod=_quantized.QuantizedLinearAct, + ) + ) + return configs + + +@_BackendConfigRegistry.register() +def _conv1d_bn() -> _List[_BackendPatternConfig]: + """ + float: Conv1d -> BatchNorm1d + qat: FakeQuant -> qat.ConvBn1d -> FakeQuant + """ + return _weighted_bn_configs( + mod=_nn.Conv1d, + bn_mod=_nn.BatchNorm1d, + fused_mod=_nni.ConvBn1d, + qat_mod=_nniq.ConvBn1d, + ref_quant_mod=_nnr.Conv1d, + ) + + +@_BackendConfigRegistry.register() +def _conv2d_bn() -> _List[_BackendPatternConfig]: + """ + float: Conv2d -> BatchNorm2d + qat: FakeQuant -> qat.ConvBn2d -> FakeQuant + """ + return _weighted_bn_configs( + mod=_nn.Conv2d, + bn_mod=_nn.BatchNorm2d, + fused_mod=_nni.ConvBn2d, + qat_mod=_nniq.ConvBn2d, + ref_quant_mod=_nnr.Conv2d, + ) + + +@_BackendConfigRegistry.register() +def _conv3d_bn() -> _List[_BackendPatternConfig]: + """ + float: Conv3d -> BatchNorm3d + qat: FakeQuant -> qat.ConvBn3d -> FakeQuant + """ + return _weighted_bn_configs( + mod=_nn.Conv3d, + bn_mod=_nn.BatchNorm3d, + fused_mod=_nni.ConvBn3d, + qat_mod=_nniq.ConvBn3d, + ref_quant_mod=_nnr.Conv3d, + ) + + +@_BackendConfigRegistry.register() +def _linear_bn() -> _List[_BackendPatternConfig]: + """ + float: Linear -> BatchNorm1d + qat: FakeQuant -> qat.LinearBn1d -> FakeQuant + """ + return _weighted_bn_configs( + mod=_nn.Linear, + bn_mod=_nn.BatchNorm1d, + fused_mod=_nni.LinearBn1d, + qat_mod=_nniq.LinearBn1d, + ref_quant_mod=_nnr.Linear, + ) + + +@_BackendConfigRegistry.register() +def _conv1d_bn_act() -> _List[_BackendPatternConfig]: + """ + float: Conv1d -> BatchNorm1d -> Act + qat: FakeQuant -> qat.ConvBnAct1d -> FakeQuant + """ + configs = _weighted_bn_relu_configs( + mod=_nn.Conv1d, + bn_mod=_nn.BatchNorm1d, + fused_mod=_nni.ConvBnReLU1d, + qat_mod=_nniq.ConvBnReLU1d, + ref_quant_mod=_nnr.Conv1d, + ) + for act in _mod_activations: + configs.extend( + _weighted_bn_act_configs( + mod=_nn.Conv1d, + act=act, + bn_mod=_nn.BatchNorm1d, + root_mod=_nni.ConvBn1d, + fused_mod=_fused.ConvBnAct1d, + qat_mod=_qat.ConvBnAct1d, + ref_quant_mod=_quantized.QuantizedConvAct1d, + ) + ) + return configs + + +@_BackendConfigRegistry.register() +def _conv2d_bn_act() -> _List[_BackendPatternConfig]: + """ + float: Conv2d -> BatchNorm2d -> Act + qat: FakeQuant -> qat.ConvBnAct2d -> FakeQuant + """ + configs = _weighted_bn_relu_configs( + mod=_nn.Conv2d, + bn_mod=_nn.BatchNorm2d, + fused_mod=_nni.ConvBnReLU2d, + qat_mod=_nniq.ConvBnReLU2d, + ref_quant_mod=_nnr.Conv2d, + ) + for act in _mod_activations: + configs.extend( + _weighted_bn_act_configs( + mod=_nn.Conv2d, + act=act, + bn_mod=_nn.BatchNorm2d, + root_mod=_nni.ConvBn2d, + fused_mod=_fused.ConvBnAct2d, + qat_mod=_qat.ConvBnAct2d, + ref_quant_mod=_quantized.QuantizedConvAct2d, + ) + ) + return configs + + +@_BackendConfigRegistry.register() +def _conv3d_bn_act() -> _List[_BackendPatternConfig]: + """ + float: Conv3d -> BatchNorm3d -> Act + qat: FakeQuant -> qat.ConvBnAct3d -> FakeQuant + """ + configs = _weighted_bn_relu_configs( + mod=_nn.Conv3d, + bn_mod=_nn.BatchNorm3d, + fused_mod=_nni.ConvBnReLU3d, + qat_mod=_nniq.ConvBnReLU3d, + ref_quant_mod=_nnr.Conv3d, + ) + for act in _mod_activations: + configs.extend( + _weighted_bn_act_configs( + mod=_nn.Conv3d, + act=act, + bn_mod=_nn.BatchNorm3d, + root_mod=_nni.ConvBn3d, + fused_mod=_fused.ConvBnAct3d, + qat_mod=_qat.ConvBnAct3d, + ref_quant_mod=_quantized.QuantizedConvAct3d, + ) + ) + return configs + + +@_BackendConfigRegistry.register() +def _conv1d() -> 
_List[_BackendPatternConfig]: + """ + float: Conv1d + qat: FakeQuant -> qat.Conv1d -> FakeQuant + """ + return _weighted_configs( + mod=_nn.Conv1d, + func_mod=_F.conv1d, + qat_mod=_nnq.Conv1d, + ref_quant_mod=_nnr.Conv1d, + ) + + +@_BackendConfigRegistry.register() +def _conv2d() -> _List[_BackendPatternConfig]: + """ + float: Conv2d + qat: FakeQuant -> qat.Conv2d -> FakeQuant + """ + return _weighted_configs( + mod=_nn.Conv2d, + func_mod=_F.conv2d, + qat_mod=_nnq.Conv2d, + ref_quant_mod=_nnr.Conv2d, + ) + + +@_BackendConfigRegistry.register() +def _conv3d() -> _List[_BackendPatternConfig]: + """ + float: Conv3d + qat: FakeQuant -> qat.Conv3d -> FakeQuant + """ + return _weighted_configs( + mod=_nn.Conv3d, + func_mod=_F.conv3d, + qat_mod=_nnq.Conv3d, + ref_quant_mod=_nnr.Conv3d, + ) + + +@_BackendConfigRegistry.register() +def _linear() -> _List[_BackendPatternConfig]: + """ + float: Linear + qat: FakeQuant -> qat.Linear -> FakeQuant + """ + return _weighted_configs( + mod=_nn.Linear, + func_mod=_F.linear, + qat_mod=_nnq.Linear, + ref_quant_mod=_nnr.Linear, + ) + + +@_BackendConfigRegistry.register() +def _embedding() -> _List[_BackendPatternConfig]: + """ + float: Embedding + qat: qat.Embedding + """ + return _weighted_configs( + mod=_nn.Embedding, + func_mod=None, + qat_mod=_nnq.Embedding, + ref_quant_mod=_nnr.Embedding, + input_output_observed=False, + ) + + +@_BackendConfigRegistry.register() +def _embedding_bag() -> _List[_BackendPatternConfig]: + """ + float: EmbeddingBag + qat: qat.EmbeddingBag + """ + return _weighted_configs( + mod=_nn.EmbeddingBag, + func_mod=None, + qat_mod=_nnq.EmbeddingBag, + ref_quant_mod=_nnr.EmbeddingBag, + input_output_observed=False, + ) + + +# n-ary ops +@_BackendConfigRegistry.register() +def _identity() -> _List[_BackendPatternConfig]: + return _share_observer_configs(ops=[_nn.Identity]) + + +@_BackendConfigRegistry.register() +def _add_act() -> _List[_BackendPatternConfig]: + """ + float: + input_1 -> + add -> Act -> output + input_2 -> + + qat: + FakeQuant -> + add -> Act -> FakeQuant + FakeQuant -> + """ + acts = _mod_activations + _func_activations + (_nn.ReLU, _F.relu, _torch.relu) + return _binary_op_relu_configs(ops=[_operator.add, _torch.add], acts=list(acts)) + + +@_BackendConfigRegistry.register() +def _mul_act() -> _List[_BackendPatternConfig]: + """ + float: + input_1 -> + mul -> Act -> output + input_2 -> + + qat: + FakeQuant -> + mul -> Act -> FakeQuant + FakeQuant -> + """ + acts = _mod_activations + _func_activations + (_nn.ReLU, _F.relu, _torch.relu) + return _binary_op_relu_configs(ops=[_operator.mul, _torch.mul], acts=list(acts)) + + +@_BackendConfigRegistry.register() +def _matmul_act() -> _List[_BackendPatternConfig]: + """ + float: + input_1 -> + matmul -> Act -> output + input_2 -> + + qat: + FakeQuant -> + matmul -> Act -> FakeQuant + FakeQuant -> + """ + acts = _mod_activations + _func_activations + (_nn.ReLU, _F.relu, _torch.relu) + return _binary_op_relu_configs(ops=[_torch.matmul], acts=list(acts)) + + +@_BackendConfigRegistry.register() +def _add() -> _List[_BackendPatternConfig]: + """ + float: + input_1 -> + add -> output + input_2 -> + + qat: + FakeQuant -> + add -> FakeQuant + FakeQuant -> + """ + return _binary_op_configs(ops=[_operator.add, _torch.add]) + + +@_BackendConfigRegistry.register() +def _mul() -> _List[_BackendPatternConfig]: + """ + float: + input_1 -> + mul -> output + input_2 -> + + qat: + FakeQuant -> + mul -> FakeQuant + FakeQuant -> + """ + return _binary_op_configs(ops=[_operator.mul, _torch.mul]) + 
+ +@_BackendConfigRegistry.register() +def _matmul() -> _List[_BackendPatternConfig]: + """ + float: + input_1 -> + matmul -> output + input_2 -> + + qat: + FakeQuant -> + matmul -> FakeQuant + FakeQuant -> + """ + return _binary_op_configs(ops=[_torch.matmul]) + + +@_BackendConfigRegistry.register() +def _cat() -> _List[_BackendPatternConfig]: + """ + float: + input_1 -> + cat -> output + input_2 -> + + qat: + FakeQuant -> + cat -> FakeQuant + FakeQuant -> + + The number of inputs is not restricted to 2. + All FakeQuant(s) share the same scale and zero point + """ + return _share_observer_configs(ops=[_torch.cat]) + + +# pooling layers +@_BackendConfigRegistry.register() +def _max_pool1d() -> _List[_BackendPatternConfig]: + """ + float: MaxPool1d + qat: FakeQuant -> MaxPool1d -> FakeQuant + + FakeQuant(s) share the same scale and zero point + """ + return _share_observer_configs(ops=[_nn.MaxPool1d, _F.max_pool1d]) + + +@_BackendConfigRegistry.register() +def _max_pool2d() -> _List[_BackendPatternConfig]: + """ + float: MaxPool2d + qat: FakeQuant -> MaxPool2d -> FakeQuant + + FakeQuant(s) share the same scale and zero point + """ + return _share_observer_configs(ops=[_nn.MaxPool2d, _F.max_pool2d]) + + +@_BackendConfigRegistry.register() +def _max_pool3d() -> _List[_BackendPatternConfig]: + """ + float: MaxPool3d + qat: FakeQuant -> MaxPool3d -> FakeQuant + + FakeQuant(s) share the same scale and zero point + """ + return _share_observer_configs(ops=[_nn.MaxPool3d, _F.max_pool3d]) + + +@_BackendConfigRegistry.register() +def _adaptive_avg_pool1d() -> _List[_BackendPatternConfig]: + """ + float: AdaptiveAvgPool1d + qat: FakeQuant -> AdaptiveAvgPool1d -> FakeQuant + + FakeQuant(s) share the same scale and zero point + """ + return _share_observer_configs( + ops=[_nn.AdaptiveAvgPool1d, _F.adaptive_avg_pool1d, _torch.adaptive_avg_pool1d] + ) + + +@_BackendConfigRegistry.register() +def _adaptive_avg_pool2d() -> _List[_BackendPatternConfig]: + """ + float: AdaptiveAvgPool2d + qat: FakeQuant -> AdaptiveAvgPool2d -> FakeQuant + + FakeQuant(s) share the same scale and zero point + """ + return _share_observer_configs(ops=[_nn.AdaptiveAvgPool2d, _F.adaptive_avg_pool2d]) + + +@_BackendConfigRegistry.register() +def _adaptive_avg_pool3d() -> _List[_BackendPatternConfig]: + """ + float: AdaptiveAvgPool3d + qat: FakeQuant -> AdaptiveAvgPool3d -> FakeQuant + + FakeQuant(s) share the same scale and zero point + """ + return _share_observer_configs(ops=[_nn.AdaptiveAvgPool3d, _F.adaptive_avg_pool3d]) + + +@_BackendConfigRegistry.register() +def _avg_pool1d() -> _List[_BackendPatternConfig]: + """ + float: AvgPool1d + qat: FakeQuant -> AvgPool1d -> FakeQuant + + FakeQuant(s) share the same scale and zero point + """ + return _share_observer_configs( + ops=[_nn.AvgPool1d, _F.avg_pool1d, _torch.avg_pool1d, _torch.mean] + ) + + +@_BackendConfigRegistry.register() +def _avg_pool2d() -> _List[_BackendPatternConfig]: + """ + float: AvgPool2d + qat: FakeQuant -> AvgPool2d -> FakeQuant + + FakeQuant(s) share the same scale and zero point + """ + return _share_observer_configs(ops=[_nn.AvgPool2d, _F.avg_pool2d, _torch._C._nn.avg_pool2d]) + + +@_BackendConfigRegistry.register() +def _avg_pool3d() -> _List[_BackendPatternConfig]: + """ + float: AvgPool3d + qat: FakeQuant -> AvgPool3d -> FakeQuant + + FakeQuant(s) share the same scale and zero point + """ + return _share_observer_configs(ops=[_nn.AvgPool3d, _F.avg_pool3d, _torch._C._nn.avg_pool3d]) + + +# memory movement ops +@_BackendConfigRegistry.register() +def 
_flatten() -> _List[_BackendPatternConfig]: + """ + float: AvgPool1d + qat: FakeQuant -> Flatten -> FakeQuant + + FakeQuant(s) share the same scale and zero point + """ + return _share_observer_configs(ops=[_nn.Flatten, _torch.flatten]) + + +# norm layers +@_BackendConfigRegistry.register() +def _bn() -> _List[_BackendPatternConfig]: + """ + float: BatchNorm + qat: FakeQuant -> BatchNorm -> FakeQuant + """ + return _activation_configs(ops=[_nn.BatchNorm1d, _nn.BatchNorm2d, _nn.BatchNorm3d]) + + +@_BackendConfigRegistry.register() +def _bn2d_relu() -> _List[_BackendPatternConfig]: + """ + float: BatchNorm2d -> ReLU + qat: FakeQuant -> BNReLU2d -> FakeQuant + """ + return _bn_relu(mod=_nn.BatchNorm2d, fused_mod=_nni.BNReLU2d) + + +@_BackendConfigRegistry.register() +def _bn3d_relu() -> _List[_BackendPatternConfig]: + """ + float: BatchNorm3d -> ReLU + qat: FakeQuant -> BNReLU3d -> FakeQuant + """ + return _bn_relu(mod=_nn.BatchNorm3d, fused_mod=_nni.BNReLU3d) + + +# activations +@_BackendConfigRegistry.register() +def _softmax() -> _List[_BackendPatternConfig]: + """ + float: Softmax + qat: FakeQuant -> Softmax -> FakeQuant + + FakeQuant at the output has fixed qparams. + """ + constraints = ( + _DTypeWithConstraints( + dtype=_torch.quint8, + quant_min_lower_bound=0, + quant_max_upper_bound=255, + scale_exact_match=1.0 / 256.0, + zero_point_exact_match=0, + ) + if _is_torch_2() + else None + ) + return _activation_configs(ops=[_nn.Softmax], constraints=constraints) + + +@_BackendConfigRegistry.register() +def _sigmoid() -> _List[_BackendPatternConfig]: + """ + float: Sigmoid + qat: FakeQuant -> Sigmoid -> FakeQuant + + FakeQuant at the output has fixed qparams. + """ + constraints = ( + _DTypeWithConstraints( + dtype=_torch.quint8, + quant_min_lower_bound=0, + quant_max_upper_bound=255, + scale_exact_match=1.0 / 256.0, + zero_point_exact_match=0, + ) + if _is_torch_2() + else None + ) + return _activation_configs(ops=[_nn.Sigmoid, _F.sigmoid], constraints=constraints) + + +@_BackendConfigRegistry.register() +def _hardsigmoid() -> _List[_BackendPatternConfig]: + """ + float: Hardsigmoid + qat: FakeQuant -> Hardsigmoid -> FakeQuant + + FakeQuant at the output has fixed qparams. + """ + constraints = ( + _DTypeWithConstraints( + dtype=_torch.quint8, + quant_min_lower_bound=0, + quant_max_upper_bound=255, + scale_exact_match=1.0 / 256.0, + zero_point_exact_match=0, + ) + if _is_torch_2() + else None + ) + return _activation_configs(ops=[_nn.Hardsigmoid, _F.hardsigmoid], constraints=constraints) + + +@_BackendConfigRegistry.register() +def _tanh() -> _List[_BackendPatternConfig]: + """ + float: Tanh + qat: FakeQuant -> Tanh -> FakeQuant + + FakeQuant at the output has fixed qparams. + """ + constraints = ( + _DTypeWithConstraints( + dtype=_torch.quint8, + quant_min_lower_bound=0, + quant_max_upper_bound=255, + scale_exact_match=2.0 / 256.0, + zero_point_exact_match=128, + ) + if _is_torch_2() + else None + ) + return _activation_configs(ops=[_nn.Tanh, _F.tanh], constraints=constraints) + + +@_BackendConfigRegistry.register() +def _activations() -> _List[_BackendPatternConfig]: + """ + float: Act + qat: FakeQuant -> Act -> FakeQuant + """ + ops = [op for op in _mod_activations if op not in _fixed_qparams_modules] + ops += [ + _nn.ReLU, + _F.relu, + _F.relu_, + ] + list(_func_activations) + return _activation_configs(ops=ops) + + +def get_backend_config() -> _BackendConfig: + """ + Returns backend config encoding information about how quantization + layers are inserted in a module. 
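+
+    A minimal sketch of how this config is consumed (mirroring the call made in
+    ``_configure.py``; ``model``, ``qconfig_mapping``, and ``example_inputs`` are
+    placeholders):
+
+    .. code-block:: python
+
+        from torch.quantization.quantize_fx import prepare_qat_fx
+
+        prepared = prepare_qat_fx(
+            model,
+            qconfig_mapping=qconfig_mapping,
+            example_inputs=example_inputs,
+            backend_config=get_backend_config(),
+        )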
+ """ + return _BackendConfigRegistry.backend_config + + +def get_supported_modules() -> _List[_Any]: + """ + Returns a list of modules which are supported for quantization + aware training. + """ + return tuple(_BackendConfigRegistry.supported_modules) diff --git a/coremltools/optimize/torch/quantization/_backend_config_utils.py b/coremltools/optimize/torch/quantization/_backend_config_utils.py new file mode 100644 index 000000000..2a0d6d29b --- /dev/null +++ b/coremltools/optimize/torch/quantization/_backend_config_utils.py @@ -0,0 +1,436 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +from copy import deepcopy as _deepcopy +from typing import Any as _Any +from typing import Callable as _Callable +from typing import List as _List +from typing import Optional as _Optional +from typing import Tuple as _Tuple +from typing import Type as _Type + +import torch as _torch +import torch.nn as _nn +import torch.nn.functional as _F +from torch.ao.quantization.backend_config import BackendPatternConfig as _BackendPatternConfig +from torch.ao.quantization.backend_config import DTypeConfig as _DTypeConfig +from torch.ao.quantization.backend_config import DTypeWithConstraints as _DTypeWithConstraints +from torch.ao.quantization.backend_config import ObservationType as _ObservationType + +from coremltools.optimize.torch._utils.version_utils import is_torch_2 as _is_torch_2 + +act_quant_dtype_configs = [ + # int input and output + _DTypeConfig( + input_dtype=_torch.quint8, + output_dtype=_torch.quint8, + ), + # int input, float output + _DTypeConfig( + input_dtype=_torch.quint8, + output_dtype=_torch.float, + ), + # float input, int output + _DTypeConfig( + input_dtype=_torch.float, + output_dtype=_torch.quint8, + ), +] + + +weighted_dtype_configs = [ + # weight int, act float + _DTypeConfig( + input_dtype=_torch.float, + output_dtype=_torch.float, + weight_dtype=_torch.qint8, + bias_dtype=_torch.float, + ), + # weight int, act int + _DTypeConfig( + input_dtype=_torch.quint8, + output_dtype=_torch.quint8, + weight_dtype=_torch.qint8, + bias_dtype=_torch.float, + ), +] + + +def get_fuser_method(constructor): + """ + Creates fuser method from class constructor of fused modules. + """ + if _is_torch_2(): + + def fuser_method(is_qat, m1, m2): + if isinstance(m1, tuple): + m0, m1 = m1 + return constructor(m1, m0, m2) + return constructor(m1, m2) + + else: + + def fuser_method(is_qat, m1, m2): + if isinstance(m2, tuple): + m2, m3 = m2 + return constructor(m3, m2, m1) + return constructor(m2, m1) + + return fuser_method + + +def get_fusion_pattern(pattern: _Tuple[_Any, _Any]) -> _Tuple[_Any, _Any]: + """ + Swaps fusion pattern if torch version is >= 2.0. + """ + if _is_torch_2(): + return pattern[1], pattern[0] + else: + return pattern + + +def fused_mod_config( + mod: _Type[_nn.Module], + fused_mod: _Type[_nn.Module], + qat_mod: _Type[_nn.Module], + ref_quant_mod: _Type[_nn.Module], + input_output_observed: _Optional[bool] = None, +) -> _BackendPatternConfig: + """ + Returns backend pattern config for fused modules. 
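+
+    ``input_output_observed`` is only set to ``False`` for lookup-style modules
+    such as ``Embedding`` and ``EmbeddingBag``, whose inputs are integer indices
+    rather than quantizable activations (see ``weighted_configs`` below).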
+ """ + config = ( + _BackendPatternConfig(fused_mod) + .set_observation_type(_ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT) + .set_dtype_configs(weighted_dtype_configs) + .set_root_module(mod) + .set_qat_module(qat_mod) + .set_reference_quantized_module(ref_quant_mod) + ) + if input_output_observed is not None: + if _is_torch_2(): + config.set_observation_type(_ObservationType.INPUT_OUTPUT_NOT_OBSERVED) + else: + config._input_output_observed = False + return config + + +def qat_mod_config( + mod: _Type[_nn.Module], qat_mod: _Type[_nn.Module], ref_quant_mod: _Type[_nn.Module] +) -> _BackendPatternConfig: + """ + Returns backend pattern config for QAT modules. + """ + return ( + _BackendPatternConfig(qat_mod) + .set_observation_type(_ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT) + .set_dtype_configs(weighted_dtype_configs) + .set_root_module(mod) + .set_reference_quantized_module(ref_quant_mod) + ) + + +def weighted_configs( + mod: _Type[_nn.Module], + func_mod: _Optional[_Callable], + qat_mod: _Type[_nn.Module], + ref_quant_mod: _Type[_nn.Module], + input_output_observed: _Optional[bool] = None, +) -> _List[_BackendPatternConfig]: + """ + Returns backend pattern configs for modules which have a weight associated with them, + such as convolution, linear, embedding, etc. + """ + configs = [ + # conv/linear module + fused_mod_config( + mod=mod, + fused_mod=mod, + qat_mod=qat_mod, + ref_quant_mod=ref_quant_mod, + input_output_observed=input_output_observed, + ), + # qat conv/linear + qat_mod_config(mod=mod, qat_mod=qat_mod, ref_quant_mod=ref_quant_mod), + ] + if func_mod is not None: + configs += [ + # functional + _BackendPatternConfig(func_mod) + .set_observation_type(_ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT) + .set_dtype_configs(weighted_dtype_configs) + ._set_input_type_to_index({"weight": 1, "bias": 2}), + ] + return configs + + +def weighted_relu_configs( + mod: _Type[_nn.Module], + func_mod: _Callable, + fused_mod: _Type[_nn.Module], + ref_quant_mod: _Type[_nn.Module], + qat_mod: _Type[_nn.Module], +) -> _List[_BackendPatternConfig]: + """ + Returns backend pattern configs for the following sequence of ops: + + input -> mod -> relu -> output + + where mod is a module with a weight associated with it, such as convolution and linear. + """ + return [ + # conv/linear module + relu func/module + *[ + _BackendPatternConfig(get_fusion_pattern((act, mod))) + .set_dtype_configs(weighted_dtype_configs) + .set_fuser_method(get_fuser_method(fused_mod)) + .set_fused_module(fused_mod) + for act in [_nn.ReLU, _F.relu] + ], + # conv/linear func + relu func/module + *[ + _BackendPatternConfig(get_fusion_pattern((act, func_mod))) + .set_observation_type(_ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT) + .set_dtype_configs(weighted_dtype_configs) + for act in [_nn.ReLU, _F.relu] + ], + # conv/linear + relu fused + fused_mod_config( + mod=mod, fused_mod=fused_mod, qat_mod=qat_mod, ref_quant_mod=ref_quant_mod + ), + # qat conv/linear + relu + qat_mod_config(mod=mod, qat_mod=qat_mod, ref_quant_mod=ref_quant_mod), + ] + + +def weighted_act_configs( + mod: _Type[_nn.Module], + func_mod: _Callable, + act: _Type[_nn.Module], + fused_mod: _Type[_nn.Module], + qat_mod: _Type[_nn.Module], + ref_quant_mod: _Type[_nn.Module], +) -> _List[_BackendPatternConfig]: + """ + Returns backend pattern configs for the following sequence of ops: + + input -> mod -> activation -> output + + where mod is a module with a weight associated with it, such as convolution and linear. 
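+
+    For example, ``_conv2d_act`` in ``_backend_config.py`` calls this helper once
+    per module activation, roughly as follows:
+
+    .. code-block:: python
+
+        weighted_act_configs(
+            mod=_nn.Conv2d,
+            func_mod=_F.conv2d,
+            act=_nn.SiLU,
+            fused_mod=_fused.ConvAct2d,
+            qat_mod=_qat.ConvAct2d,
+            ref_quant_mod=_quantized.QuantizedConvAct2d,
+        )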
+ """ + return [ + # conv/linear module + act module + _BackendPatternConfig(get_fusion_pattern((act, mod))) + .set_dtype_configs(weighted_dtype_configs) + .set_fuser_method(get_fuser_method(fused_mod)) + .set_fused_module(fused_mod), + # conv/linear func + act module + _BackendPatternConfig(get_fusion_pattern((act, func_mod))) + .set_observation_type(_ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT) + .set_dtype_configs(weighted_dtype_configs), + # conv/linear + act fused + fused_mod_config( + mod=fused_mod, + fused_mod=fused_mod, + qat_mod=qat_mod, + ref_quant_mod=ref_quant_mod, + ), + # qat conv/linear + act + qat_mod_config(mod=fused_mod, qat_mod=qat_mod, ref_quant_mod=ref_quant_mod), + ] + + +def weighted_bn_configs( + mod: _Type[_nn.Module], + bn_mod: _Type[_nn.Module], + fused_mod: _Type[_nn.Module], + ref_quant_mod: _Type[_nn.Module], + qat_mod: _Type[_nn.Module], +) -> _List[_BackendPatternConfig]: + """ + Returns backend pattern configs for the following sequence of ops: + + input -> mod -> batch_norm -> output + + where mod is a module with a weight associated with it, such as convolution and linear. + """ + return [ + # conv module + bn module + _BackendPatternConfig(get_fusion_pattern((bn_mod, mod))) + .set_dtype_configs(weighted_dtype_configs) + .set_fuser_method(get_fuser_method(fused_mod)) + .set_fused_module(fused_mod), + # conv + bn fused + fused_mod_config( + mod=mod, fused_mod=fused_mod, qat_mod=qat_mod, ref_quant_mod=ref_quant_mod + ), + # qat conv + bn + qat_mod_config(mod=mod, qat_mod=qat_mod, ref_quant_mod=ref_quant_mod), + ] + + +def weighted_bn_relu_configs( + mod: _Type[_nn.Module], + bn_mod: _Type[_nn.Module], + fused_mod: _Type[_nn.Module], + ref_quant_mod: _Type[_nn.Module], + qat_mod: _Type[_nn.Module], +) -> _List[_BackendPatternConfig]: + """ + Returns backend pattern configs for the following sequence of ops: + + input -> mod -> batch_norm -> relu -> output + + where mod is a module with a weight associated with it, such as convolution and linear. + """ + return [ + # conv module + bn module + relu func/module + *[ + _BackendPatternConfig(get_fusion_pattern((act, (bn_mod, mod)))) + .set_dtype_configs(weighted_dtype_configs) + .set_fuser_method(get_fuser_method(fused_mod)) + .set_fused_module(fused_mod) + for act in [_nn.ReLU, _F.relu] + ], + # conv + bn + relu fused + fused_mod_config( + mod=mod, fused_mod=fused_mod, qat_mod=qat_mod, ref_quant_mod=ref_quant_mod + ), + # qat conv + bn + relu + qat_mod_config(mod=mod, qat_mod=qat_mod, ref_quant_mod=ref_quant_mod), + ] + + +def weighted_bn_act_configs( + mod: _Type[_nn.Module], + act: _Type[_nn.Module], + bn_mod: _Type[_nn.Module], + root_mod: _Type[_nn.Module], + fused_mod: _Type[_nn.Module], + ref_quant_mod: _Type[_nn.Module], + qat_mod: _Type[_nn.Module], +) -> _List[_BackendPatternConfig]: + """ + Returns backend pattern configs for the following sequence of ops: + + input -> mod -> batch_norm -> activation -> output + + where mod is a module with a weight associated with it, such as convolution and linear. 
+ """ + return [ + # conv module + bn module + act module + _BackendPatternConfig(get_fusion_pattern((act, (bn_mod, mod)))) + .set_dtype_configs(weighted_dtype_configs) + .set_fuser_method(get_fuser_method(fused_mod)) + .set_fused_module(fused_mod), + # conv + bn + act fused + fused_mod_config( + mod=root_mod, + fused_mod=fused_mod, + qat_mod=qat_mod, + ref_quant_mod=ref_quant_mod, + ), + # qat conv + bn + act + qat_mod_config(mod=root_mod, qat_mod=qat_mod, ref_quant_mod=ref_quant_mod), + ] + + +def binary_op_configs(ops: _List[_Any]) -> _List[_BackendPatternConfig]: + """ + Returns backend pattern configs for the following sequence of ops: + + input_1 --> + operator --> output + input_2 --> + + where operator is a binary operator such as add, multiply or matmul. + """ + return [ + _BackendPatternConfig(op) + .set_dtype_configs(act_quant_dtype_configs) + .set_observation_type(_ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT) + for op in ops + ] + + +def binary_op_act_configs(ops: _List[_Any], acts: _List[_Any]) -> _List[_BackendPatternConfig]: + """ + Returns backend pattern configs for the following sequence of ops: + + input_1 --> + operator --> act --> output + input_2 --> + + where operator is a binary operator such as add or multiply. + """ + configs = [] + for op in ops: + configs.extend( + [ + _BackendPatternConfig(get_fusion_pattern((act, op))) + .set_dtype_configs(act_quant_dtype_configs) + .set_observation_type(_ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT) + for act in acts + ] + ) + return configs + + +def share_observer_configs(ops: _List[_Any]) -> _List[_BackendPatternConfig]: + """ + Returns backend pattern configs for ops which do not change the scale or + zero-point of the input tensor and thus can share the same qparams. + """ + return [ + _BackendPatternConfig(op) + .set_observation_type(_ObservationType.OUTPUT_SHARE_OBSERVER_WITH_INPUT) + .set_dtype_configs(act_quant_dtype_configs) + for op in ops + ] + + +def activation_configs( + ops: _List[_Any], constraints: _Optional[_DTypeWithConstraints] = None +) -> _List[_BackendPatternConfig]: + """ + Returns backend pattern configs for default ops like activations which + do not have an associated weight but can alter the scale and zero point of + the input tensor. 
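+
+    When ``constraints`` is provided (for example, the fixed qparams used for
+    ``Softmax`` and ``Sigmoid``), the constraints are attached to the ``quint8``
+    output dtype of each dtype config.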
+ """ + dtype_configs = [] + for act_dtype in act_quant_dtype_configs: + new_act_dtype = _deepcopy(act_dtype) + if act_dtype.output_dtype == _torch.quint8 and constraints is not None: + new_act_dtype.output_dtype_with_constraints = constraints + dtype_configs.append(new_act_dtype) + return [ + _BackendPatternConfig(op) + .set_observation_type(_ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT) + .set_dtype_configs(dtype_configs) + for op in ops + ] + + +def bn_relu( + mod: _Type[_nn.Module], + fused_mod: _Type[_nn.Module], +) -> _List[_BackendPatternConfig]: + """ + Returns backend pattern configs for the following sequence of ops: + + input -> batch_norm -> relu -> output + """ + return [ + # bn module + relu func/module + *[ + _BackendPatternConfig(get_fusion_pattern((act, mod))) + .set_dtype_configs(weighted_dtype_configs) + .set_fuser_method(get_fuser_method(fused_mod)) + .set_fused_module(fused_mod) + for act in [_nn.ReLU, _F.relu] + ] + ] + activation_configs( + ops=[fused_mod] + ) # fused bn + relu diff --git a/coremltools/optimize/torch/quantization/_configure.py b/coremltools/optimize/torch/quantization/_configure.py new file mode 100644 index 000000000..350f2c479 --- /dev/null +++ b/coremltools/optimize/torch/quantization/_configure.py @@ -0,0 +1,404 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +from collections import defaultdict as _defaultdict +from typing import Any as _Any +from typing import Optional as _Optional + +import torch as _torch +import torch.ao.quantization as _aoquant +import torch.fx as _fx +import torch.nn as _nn +import torch.nn.intrinsic as _nni +import torch.nn.intrinsic.qat as _nniqat +from torch.ao.quantization.backend_config import BackendConfig as _BackendConfig +from torch.ao.quantization.fx.custom_config import PrepareCustomConfig as _PrepareCustomConfig +from torch.quantization.quantize_fx import prepare_qat_fx as _prepare_qat_fx + +import coremltools.optimize.torch.quantization.modules.qat_modules as _qat +from coremltools.optimize.torch._utils.torch_utils import ( + get_parent_child_name as _get_parent_child_name, +) +from coremltools.optimize.torch.quantization._backend_config import _fixed_qparams_modules +from coremltools.optimize.torch.quantization._utils import CombinationOpType as _CombinationOpType +from coremltools.optimize.torch.quantization._utils import combine_op_type as _combine_op_type +from coremltools.optimize.torch.quantization._utils import find_module as _find_module +from coremltools.optimize.torch.quantization._utils import find_target as _find_target +from coremltools.optimize.torch.quantization._utils import ( + get_share_qparams_ops as _get_share_qparams_ops, +) +from coremltools.optimize.torch.quantization._utils import ( + group_activation_quantization_modules_by_id as _group_activation_quantization_modules_by_id, +) +from coremltools.optimize.torch.quantization._utils import ( + is_activation_post_process as _is_activation_post_process, +) +from coremltools.optimize.torch.quantization._utils import is_quantized as _is_quantized +from coremltools.optimize.torch.quantization.quantization_config import ( + QuantizationScheme as _QuantizationScheme, +) + +# layers which only scale the output and hence can use zero point = 0 if needed +_scale_only_layers = { + _torch.nn.Dropout, + _torch.nn.Dropout1d, + _torch.nn.Dropout2d, + _torch.nn.Dropout3d, 
+} + + +# layers which are always quantized with affine config because they have zero point = 0 +_always_affine_layers = { + _torch.nn.ReLU, + _torch.nn.functional.relu, + _torch.nn.functional.relu_, + _torch.nn.ReLU6, + _nni.ConvReLU1d, + _nniqat.ConvReLU1d, + _nni.ConvReLU2d, + _nniqat.ConvReLU2d, + _nni.ConvReLU3d, + _nniqat.ConvBnReLU3d, + _nni.ConvBnReLU1d, + _nniqat.ConvBnReLU1d, + _nni.ConvBnReLU2d, + _nniqat.ConvBnReLU2d, + _nni.ConvBnReLU3d, + _nniqat.ConvBnReLU3d, + _nni.LinearReLU, + _nniqat.LinearReLU, + _nni.BNReLU3d, + _nni.BNReLU3d, +} + + +def _get_affine_act_post_process(module: _aoquant.FakeQuantizeBase): + """ + Returns activation post process module which is same as module but with + affine qscheme. + """ + _common_observer_param_names = [ + "dtype", + "qscheme", + "reduce_range", + "quant_min", + "quant_max", + "eps", + ] + _observer_type_to_param_names = { + _aoquant.MinMaxObserver: list(_common_observer_param_names), + _aoquant.PerChannelMinMaxObserver: list(_common_observer_param_names) + ["ch_axis"], + _aoquant.MovingAverageMinMaxObserver: list(_common_observer_param_names) + + ["averaging_constant"], + _aoquant.MovingAveragePerChannelMinMaxObserver: list(_common_observer_param_names) + + ["averaging_constant", "ch_axis"], + _aoquant.HistogramObserver: [ + "bins", + "upsample_rate", + "dtype", + "qscheme", + "reduce_range", + "eps", + ], + _aoquant.PlaceholderObserver: [ + "dtype", + "quant_min", + "quant_max", + "custom_op_name", + ], + _aoquant.NoopObserver: ["dtype", "custom_op_name"], + _aoquant.FixedQParamsObserver: [ + "scale", + "zero_point", + "dtype", + "qscheme", + "quant_min", + "quant_max", + ], + } + + activation_post_process = module.activation_post_process + if type(activation_post_process) not in _observer_type_to_param_names: + raise ValueError(f"Found unrecognized observer type {type(activation_post_process)}.") + observer_type = type(activation_post_process) + observer_param_names = _observer_type_to_param_names[observer_type] + kwargs = {k: getattr(activation_post_process, k) for k in observer_param_names} + if "qscheme" in kwargs: + kwargs["qscheme"] = _torch.per_tensor_affine + new_act_post_process = _aoquant.FakeQuantize(observer=observer_type, **kwargs) + return new_act_post_process + + +class QATConfigurationHandler: + """ + Prepares the model for QAT by inserting weight and activation quantizers as + specified in qconfig_mapping. + + Implements additional graph passes on a prepared module returned by prepare_qat_fx. + """ + + def __init__( + self, + prepare_custom_config: _PrepareCustomConfig, + qconfig_mapping: _aoquant.QConfigMapping, + backend_config: _BackendConfig, + quantization_scheme: _QuantizationScheme, + ): + self._quantization_scheme = quantization_scheme + self._qconfig_mapping = qconfig_mapping + self._prepare_custom_config = prepare_custom_config + self._backend_config = backend_config + self._share_qparams_ops = _get_share_qparams_ops(self._backend_config) + self._act_quant_groups = dict() + self._modules_to_replace = _defaultdict(list) + self._new_act_post_process = dict() + + def prepare(self, model: _nn.Module, example_inputs: _Any): + """ + Performs graph passes on model to configure activation and weight quantization layers. 
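+
+        After ``prepare_qat_fx`` inserts the fake quantize modules, the following
+        passes run: when the symmetric scheme is used, activation quantizers that
+        can safely use an affine qscheme are marked for replacement and fixed
+        qparams are propagated to fused activations; embedding weight quantizers
+        are switched to integer qparams; marked activation quantizers are replaced;
+        and redundant activation quantizers after dropout layers are removed.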
+ """ + model = _prepare_qat_fx( + model, + prepare_custom_config=self._prepare_custom_config, + qconfig_mapping=self._qconfig_mapping, + example_inputs=example_inputs, + backend_config=self._backend_config, + ) + + self._act_quant_groups = _group_activation_quantization_modules_by_id(model) + if self._quantization_scheme == _QuantizationScheme.symmetric: + self._mark_always_affine_layers_for_replacement(model) + self._mark_always_affine_combination_ops_for_replacement(model) + self._mark_fixed_qparams_modules_for_replacement(model) + self._replace_weight_fake_quant_for_embedding_layers(model) + model = self._replace_activation_quantizers(model) + model = self._remove_activation_quantizer_after_dropout(model) + return model + + def _replace_activation_quantizers(self, model: _fx.GraphModule) -> _fx.GraphModule: + """ + Replaces all nodes marked for replacement with new nodes. + """ + replaced = set() + for node, new_act_post_process in self._new_act_post_process.items(): + if node not in replaced: + model.delete_submodule(node.target) + model.add_submodule(node.target, new_act_post_process) + replaced.add(node) + # replace pointers to all modules which share this activation quantizer + for child_node in self._modules_to_replace[node]: + if child_node not in replaced: + parent, child = _get_parent_child_name(child_node.target) + parent_mod = model.get_submodule(parent) + setattr(parent_mod, child, new_act_post_process) + replaced.add(child_node) + model.recompile() + return model + + def _mark_act_post_process_for_replacement( + self, + node: _fx.Node, + model: _fx.GraphModule, + new_act_post_process: _Optional[_aoquant.FakeQuantize] = None, + ): + """ + Marks an activation post process layer (activation quantizer) for replacement. + """ + shared_qparam_nodes = [] + if len(node.users) == 1: + next_node = list(node.users.keys())[0] + next_module = _find_module(model, next_node) + if _is_activation_post_process(next_module) and _is_quantized(next_module): + module_to_replace_id = id(model.get_submodule(next_node.target)) + # Some mods share the activation quantizer being replaced here, + # so we collect all those mods here so that those can be pointed to + # the new replaced module + for child_node in self._act_quant_groups[module_to_replace_id]: + consumer_node = child_node.args[0] + if consumer_node.op == "call_module": + child_mod = _find_module(model, consumer_node) + if type(child_mod) in self._share_qparams_ops: + shared_qparam_nodes.append(child_node) + self._modules_to_replace[child_node] = [] + elif consumer_node.op == "call_function": + if consumer_node.target in self._share_qparams_ops: + shared_qparam_nodes.append(child_node) + self._modules_to_replace[child_node] = [] + self._modules_to_replace[next_node] = shared_qparam_nodes + if new_act_post_process is None: + new_act_post_process = _get_affine_act_post_process(next_module) + self._new_act_post_process[next_node] = new_act_post_process + + @staticmethod + def _remove_activation_quantizer_after_dropout(model: _fx.GraphModule): + """ + During evaluation, dropout is a no-op. During conversion, + + conv_1 -> activation_q_1 -> dropout -> activation_q_2 -> conv_2 + + becomes + + conv_1 -> quant_1 -> dequant_1 -> quant_2 -> dequant_2 -> conv_2 + + where quant_1,dequant_1 have different qparams from quant_2/dequant_2 + because dropout scales the output by 1/(1-p). This leads to inefficiency + during inference. 
Since during inference, conv_2 sees quantized activations + coming from conv_1, removing activation_q_2 doesn't lead to + increased quantization error. Hence, this pass removes activation_q_2. + """ + nodes_to_remove = set() + for node in model.graph.nodes: + if node.op == "call_module": + layer = _find_module(model, node) + if isinstance(layer, tuple(_scale_only_layers)): + prev_module = _find_module(model, node.prev) + next_module = _find_module(model, node.next) + if _is_activation_post_process(next_module) and _is_activation_post_process( + prev_module + ): + nodes_to_remove.add(node.next) + for node in nodes_to_remove: + node.replace_all_uses_with(node.prev) + model.delete_submodule(node.target) + model.graph.erase_node(node) + model.recompile() + return model + + def _mark_always_affine_layers_for_replacement(self, model: _fx.GraphModule): + """ + Some layers like ReLU can be quantized with affine qscheme even when we want + to use symmetric quantization (zero point = 0). This is because these layers + always have a non-negative output. And thus, an affine activation post process layer attached + after layers like these will always observe zero point as 0. This can possibly help us + reduce quantization error because of the larger number of quantization levels available. + (Symmetric quantization will force the output of these layers to use [0, 127] as the + output range, but with affine quantization, we can use [0, 255]). + + prepare_qat_fx requires all modules being fused together to have the same QConfig. + Thus, if we have a Conv followed by a ReLU and we want to set ReLU to have affine qscheme, + we would have to set Conv to use affine qscheme as well. But this isn't correct because a stand alone + Conv layer somewhere else in the network will also use affine qscheme which is undesirable + we want to fix zero point to 0. + + Hence, we add this pass which replaces all occurrences of activation post process after + ``always_affine_layers`` with an affine version. + """ + # Note: For all these ops, whether or not we can use affine qscheme for them depends only on + # the op itself or one preceding op. 
+ # Note: graph.nodes traverses the nodes in topological order + for node in model.graph.nodes: + if node.op == "call_module": + layer = _find_target(model, node.target) + if type(layer) in _always_affine_layers: + self._mark_act_post_process_for_replacement(node, model) + elif isinstance(layer, (_qat.ConvAct2d, _qat.ConvBnAct2d, _qat.LinearAct)): + if type(layer.act) in _always_affine_layers: + self._mark_act_post_process_for_replacement(node, model) + # layers which only scale the output can also use affine qcheme + elif isinstance(layer, tuple(_scale_only_layers)): + arg_mod = _find_module(model, node.args[0]) + if ( + _is_activation_post_process(arg_mod) + and node.args[0] in self._modules_to_replace + ): + self._mark_act_post_process_for_replacement(node, model) + elif node.op == "call_function": + combine_op_type = _combine_op_type(node) + if combine_op_type is not None: + if combine_op_type == _CombinationOpType.AddReLU: + self._mark_act_post_process_for_replacement(node, model) + elif node.target in _always_affine_layers: + self._mark_act_post_process_for_replacement(node, model) + + def _mark_always_affine_combination_ops_for_replacement(self, model: _fx.GraphModule): + """ + This method follows the same reasoning as described in ``_mark_always_affine_layers_for_replacement``, + but instead of replacing activation quantizers for stand-alone ops, it replaces them for + ops which consume more than 1 tensor as input. + + For add or cat, if the qscheme of all tensors being combined together is + affine, it also uses affine qscheme, otherwise, it uses symmetric qscheme. + """ + for node in model.graph.nodes: + if node.op == "call_function": + combine_op_type = _combine_op_type(node) + if combine_op_type is not None and combine_op_type != _CombinationOpType.AddReLU: + args = node.args + if combine_op_type == _CombinationOpType.Concat: + args = node.args[0] + arg_act_qschemes = [] + for arg in args: + arg_mod = _find_module(model, arg) + if arg_mod is not None: + if ( + type(arg_mod) in _always_affine_layers + or arg in self._modules_to_replace + ): + arg_act_qschemes.append(_QuantizationScheme.affine) + elif hasattr(arg_mod, "qscheme"): + if arg_mod.qscheme == _torch.per_tensor_affine: + arg_act_qschemes.append(_QuantizationScheme.affine) + else: + arg_act_qschemes.append(_QuantizationScheme.symmetric) + else: + arg_act_qschemes.append(_QuantizationScheme.symmetric) + else: + arg_act_qschemes.append(_QuantizationScheme.symmetric) + if all(x == _QuantizationScheme.affine for x in arg_act_qschemes): + # We have already marked cat op for replacement, when one of the + # tensors it combines was marked for replacement. So we don't need to + # add it here again. + if combine_op_type != _CombinationOpType.Concat: + self._mark_act_post_process_for_replacement(node, model) + else: + # If any of the tensor being cat-ed together need to use + # [-128, 127] range, we can't use affine quantization in + # symmetric mode for them, so we remove them from modules marked for replacement. + if combine_op_type == _CombinationOpType.Concat: + for arg in args: + if arg in self._modules_to_replace: + self._modules_to_replace.pop(arg) + if arg in self._new_act_post_process: + self._new_act_post_process.pop(arg) + + def _mark_fixed_qparams_modules_for_replacement(self, model: _fx.GraphModule): + """ + If a fixed qparams activation is fused, with conv/linear, we need to make sure + its qconfig is inherited by the fused op's activation quantizer. 
Before this step, + all fused layers will have symmetric/affine activation quantizer. + """ + for node in model.graph.nodes: + if node.op == "call_module": + layer = _find_target(model, node.target) + if isinstance(layer, (_qat.ConvAct2d, _qat.ConvBnAct2d, _qat.LinearAct)): + # If output of this layer is being cat with another layer, we don't want + # to enforce that layer to use the same activation quantizer, so we ignore it + if _torch.cat in [ + child_node.target for child_node in self._act_quant_groups[id(layer)] + ]: + continue + elif type(layer.act) in _fixed_qparams_modules: + act_post_process = self._qconfig_mapping.object_type_qconfigs[ + type(layer.act) + ].activation() + self._mark_act_post_process_for_replacement(node, model, act_post_process) + + def _replace_weight_fake_quant_for_embedding_layers(self, model: _fx.GraphModule): + """ + Changes qscheme of embedding layers from float qparams to integer qparams. + """ + for node in model.graph.nodes: + if node.op == "call_module": + layer = _find_target(model, node.target) + if isinstance(layer, _torch.nn.Embedding) and hasattr(layer, "weight_fake_quant"): + weight_dtype = layer.weight_fake_quant.dtype + delattr(layer, "weight_fake_quant") + layer.weight_fake_quant = _aoquant.FakeQuantize( + observer=type(layer.qconfig.weight().activation_post_process), + dtype=weight_dtype, + qscheme=_QuantizationScheme.get_qscheme( + self._quantization_scheme, is_per_channel=True + ), + ) diff --git a/coremltools/optimize/torch/quantization/_qconfig_mapping.py b/coremltools/optimize/torch/quantization/_qconfig_mapping.py new file mode 100644 index 000000000..506a54b19 --- /dev/null +++ b/coremltools/optimize/torch/quantization/_qconfig_mapping.py @@ -0,0 +1,214 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +from typing import Any as _Any +from typing import Optional as _Optional + +import torch as _torch +import torch.ao.quantization as _aoquant +import torch.nn as _nn + +from coremltools.optimize.torch.quantization._backend_config import _fixed_qparams_modules +from coremltools.optimize.torch.quantization._backend_config import ( + get_supported_modules as _get_supported_modules, +) +from coremltools.optimize.torch.quantization.quantization_config import ( + LinearQuantizerConfig as _LinearQuantizerConfig, +) +from coremltools.optimize.torch.quantization.quantization_config import ( + ModuleLinearQuantizerConfig as _ModuleLinearQuantizerConfig, +) +from coremltools.optimize.torch.quantization.quantization_config import ( + ObserverType as _ObserverType, +) +from coremltools.optimize.torch.quantization.quantization_config import ( + QuantizationScheme as _QuantizationScheme, +) +from coremltools.optimize.torch.quantization.quantization_config import ( + _default_quantization_options, +) + + +class _QConfigMappingBuilder: + """ + Builds py:class:`QConfigMapping` from :py:class:`LinearQuantizerConfig`. 
+ """ + + @staticmethod + def _get_default_qconfig_from_quantization_scheme( + quantization_scheme: _QuantizationScheme, + ) -> _aoquant.QConfig: + """ + Returns default QConfig for a given quantization types + """ + return _aoquant.QConfig( + activation=_aoquant.FakeQuantize.with_args( + observer=_ObserverType.get_observer( + _default_quantization_options["observer"], is_per_channel=False + ), + dtype=_default_quantization_options["activation_dtype"], + qscheme=_QuantizationScheme.get_qscheme(quantization_scheme, is_per_channel=False), + ), + weight=_aoquant.FakeQuantize.with_args( + observer=_ObserverType.get_observer( + _default_quantization_options["observer"], + is_per_channel=_default_quantization_options["weight_per_channel"], + ), + dtype=_default_quantization_options["weight_dtype"], + qscheme=_QuantizationScheme.get_qscheme( + quantization_scheme, + is_per_channel=_default_quantization_options["weight_per_channel"], + ), + ), + ) + + @staticmethod + def _adjust_qconfig_for_module_type(mod_type: _Any, qconfig: _aoquant.QConfig): + """ + Enforces Embedding layers to use float qparams, because that's preferred + by prepare_qat_fx. + """ + if mod_type == _torch.nn.Embedding: + weight = qconfig.weight() + return _aoquant.QConfig( + activation=_aoquant.NoopObserver.with_args(dtype=_torch.float), + weight=_aoquant.FakeQuantize.with_args( + observer=type(weight.activation_post_process), + dtype=weight.dtype, + qscheme=_torch.per_channel_affine_float_qparams, + ), + ) + return qconfig + + @staticmethod + def _get_module_names_for_setting_qconfig(model: _nn.Module, mod_name: str): + """ + When layers are fused and we want to skip quantization for a convolution + or linear layer, we need to set the qconfig for the layer being fused as None + as well. 
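+
+        For example, if ``mod_name`` resolves to a ``Conv2d``, both ``mod_name``
+        and ``f"{mod_name}.conv"`` are returned so that the inner convolution of
+        a fused module also receives a ``None`` qconfig.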
+ """ + try: + submod = model.get_submodule(mod_name) + except AttributeError: + return (mod_name,) + + if isinstance(submod, _torch.nn.Conv2d): + return mod_name, f"{mod_name}.conv" + elif isinstance(submod, _torch.nn.Linear): + return mod_name, f"{mod_name}.linear" + return (mod_name,) + + @staticmethod + def _create_qconfig_from_quantization_config( + quantization_config: _ModuleLinearQuantizerConfig, + ) -> _Optional[_aoquant.QConfig]: + """ + Creates a :py:class:`QConfig` from ``quantization_config`` + """ + if quantization_config.weight_dtype == _torch.float32: + return None + if quantization_config.activation_dtype == _torch.float32: + activation_qconfig = _aoquant.NoopObserver.with_args( + dtype=_torch.float, + ) + else: + activation_qconfig = _aoquant.FakeQuantize.with_args( + observer=_ObserverType.get_observer( + quantization_config.activation_observer, + is_per_channel=False, + ), + dtype=quantization_config.activation_dtype, + qscheme=_QuantizationScheme.get_qscheme( + quantization_config.quantization_scheme, + is_per_channel=False, + ), + ) + weight_qconfig = _aoquant.FakeQuantize.with_args( + observer=_ObserverType.get_observer( + quantization_config.weight_observer, + is_per_channel=quantization_config.weight_per_channel, + ), + dtype=quantization_config.weight_dtype, + qscheme=_QuantizationScheme.get_qscheme( + quantization_config.quantization_scheme, + is_per_channel=quantization_config.weight_per_channel, + ), + ) + return _aoquant.QConfig(activation=activation_qconfig, weight=weight_qconfig) + + def get_default_qconfig_mapping( + self, + quantization_scheme: _QuantizationScheme, + qconfig: _Optional[_aoquant.QConfig] = None, + ) -> _aoquant.QConfigMapping: + """ + Returns default QconfigMapping for a given quantization scheme. If a qconfig is passed, + it is used as the default qconfig instead. + """ + supported_modules = list(set(_get_supported_modules()) - set(_fixed_qparams_modules)) + # Add _FakeQuantize to ensure all fused ops have same qconfig + supported_modules.append(_aoquant.FakeQuantize) + + qconfig_mapping = _aoquant.QConfigMapping() + default_qconfig_mapping = _aoquant.get_default_qat_qconfig_mapping() + + # copy qconfig mapping for fixed qparams + for key in default_qconfig_mapping.object_type_qconfigs: + if key in _fixed_qparams_modules: + qconfig_mapping.set_object_type( + key, default_qconfig_mapping.object_type_qconfigs[key] + ) + + qconfig = ( + self._get_default_qconfig_from_quantization_scheme(quantization_scheme) + if qconfig is None + else qconfig + ) + + qconfig_mapping.set_global(qconfig) + for mod_type in supported_modules: + qconfig_mapping.set_object_type( + mod_type, + self._adjust_qconfig_for_module_type(mod_type, qconfig), + ) + return qconfig_mapping + + def get_qconfig_mapping_from_quantization_config( + self, + model: _nn.Module, + quantization_config: _LinearQuantizerConfig, + quantization_scheme: _QuantizationScheme, + ) -> _aoquant.QConfigMapping: + """ + Builds py:class:`QConfigMapping` from :py:class:`LinearQuantizerConfig`. 
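+
+        Per ``QConfigMapping`` semantics, module-name settings take precedence
+        over module-type settings, which in turn take precedence over the
+        global config.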
+ """ + qconfig_mapping = self.get_default_qconfig_mapping(quantization_scheme) + if quantization_config.global_config is not None: + qconfig_mapping = self.get_default_qconfig_mapping( + quantization_scheme, + self._create_qconfig_from_quantization_config(quantization_config.global_config), + ) + for mod_type, config in quantization_config.module_type_configs.items(): + qconfig = ( + self._create_qconfig_from_quantization_config(config) + if config is not None + else config + ) + qconfig = ( + self._adjust_qconfig_for_module_type(mod_type, qconfig) + if qconfig is not None + else qconfig + ) + qconfig_mapping = qconfig_mapping.set_object_type(mod_type, qconfig) + for mod_name, config in quantization_config.module_name_configs.items(): + qconfig = ( + self._create_qconfig_from_quantization_config(config) + if config is not None + else config + ) + mod_names = self._get_module_names_for_setting_qconfig(model, mod_name) + for mn in mod_names: + qconfig_mapping = qconfig_mapping.set_module_name(mn, qconfig) + return qconfig_mapping diff --git a/coremltools/optimize/torch/quantization/_utils.py b/coremltools/optimize/torch/quantization/_utils.py new file mode 100644 index 000000000..5ddfa1c7b --- /dev/null +++ b/coremltools/optimize/torch/quantization/_utils.py @@ -0,0 +1,155 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import operator as _operator +from collections import defaultdict +from enum import Enum as _Enum +from typing import Dict as _Dict +from typing import List as _List +from typing import Optional as _Optional + +import torch as _torch +import torch.ao.quantization as _aoquant +import torch.fx as _fx +from torch.ao.quantization.backend_config import BackendConfig as _BackendConfig +from torch.ao.quantization.backend_config import ObservationType as _ObservationType + +from coremltools.optimize.torch._utils.version_utils import is_torch_2 as _is_torch_2 + + +class CombinationOpType(_Enum): + Add = "add" + Mul = "mul" + Concat = "concat" + AddReLU = "add_relu" + + +def find_target(model, target_name): + """ + Finds the module in model which is referenced by the target_name. + target_name is in the form of `mod_a.mod_b.mod_c` + """ + current_obj = model + for attr in target_name.split("."): + current_obj = getattr(current_obj, attr) + return current_obj + + +def find_module(model: _torch.nn.Module, node: _fx.Node): + """ + Finds module corresponding to the node. 
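+
+    Returns ``None`` if the node is not a ``call_module`` node.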
+ """ + if hasattr(node, "op") and node.op == "call_module": + return find_target(model, node.target) + return None + + +def is_add(node: _fx.Node): + """ + Returns True if node is an add op + """ + if node.op == "call_function": + return node.target == _operator.add or node.target == _torch.add + return False + + +def is_mul(node: _fx.Node): + """ + Returns True if node is a mul op + """ + if node.op == "call_function": + return node.target == _operator.mul or node.target == _torch.mul + return False + + +def is_concat(node: _fx.Node): + """ + Returns True if node is a concat op + """ + if node.op == "call_function": + return node.target == _torch.cat + return False + + +def is_relu(node: _fx.Node) -> bool: + """ + Returns True if node is a relu op + """ + if node.op == "call_function": + return node.target == _torch.nn.functional.relu + return False + + +def is_add_relu(node: _fx.Node) -> bool: + """ + Returns True if node is a add-relu op + """ + return is_relu(node) and len(node.args) == 1 and is_add(node.args[0]) + + +def combine_op_type(node: _fx.Node) -> _Optional[CombinationOpType]: + """ + Returns type of combination op at this node -> add, mul, add-relu or concat + """ + if is_add(node): + return CombinationOpType.Add + elif is_mul(node): + return CombinationOpType.Mul + elif is_add_relu(node): + return CombinationOpType.AddReLU + elif is_concat(node): + return CombinationOpType.Concat + return None + + +def is_activation_post_process(module: _torch.nn.Module) -> bool: + """ + Returns true if a module is an activation post process module. + """ + return isinstance(module, _aoquant.FakeQuantizeBase) + + +def is_quantized(module: _aoquant.FakeQuantizeBase): + """ + Returns true if activation post process module uses integer dtypes. + """ + if hasattr(module, "activation_post_process"): + return module.activation_post_process.dtype in [_torch.qint8, _torch.quint8] + return False + + +def group_activation_quantization_modules_by_id( + model: _fx.GraphModule, +) -> _Dict[int, _List[_fx.Node]]: + """ + Groups activation post process layers by their ids. This is useful + because multiple activation post process modules in a traced graph may + point to the same module. + """ + groups = defaultdict(list) + for node in model.graph.nodes: + if node.op == "call_module": + module = find_target(model, node.target) + if is_activation_post_process(module) and is_quantized(module): + groups[id(module)].append(node) + return groups + + +def get_share_qparams_ops(backend_config: _BackendConfig): + """ + Returns list of ops which share qparams with input. + """ + + configs = ( + backend_config._pattern_complex_format_to_config + if _is_torch_2() + else backend_config.configs + ) + + return [ + op + for op in configs + if configs[op].observation_type == _ObservationType.OUTPUT_SHARE_OBSERVER_WITH_INPUT + ] diff --git a/coremltools/optimize/torch/quantization/modules/__init__.py b/coremltools/optimize/torch/quantization/modules/__init__.py new file mode 100644 index 000000000..25c7d28c5 --- /dev/null +++ b/coremltools/optimize/torch/quantization/modules/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. 
+# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause diff --git a/coremltools/optimize/torch/quantization/modules/fused_modules.py b/coremltools/optimize/torch/quantization/modules/fused_modules.py new file mode 100644 index 000000000..ee5eeebcb --- /dev/null +++ b/coremltools/optimize/torch/quantization/modules/fused_modules.py @@ -0,0 +1,67 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +from collections import OrderedDict as _OrderedDict +from typing import Union as _Union + +import torch as _torch +import torch.nn as _nn +import torch.nn.intrinsic as _nni + + +class _ConvAct(_torch.nn.Sequential): + def __init__(self, conv: _nn.Module, act: _nn.Module): + super().__init__(_OrderedDict([("conv", conv), ("act", act)])) + + @property + def weight(self): + return self.conv.weight + + +class _ConvBnAct(_torch.nn.Sequential): + intr_mod: _Union[_nni.ConvBn1d, _nni.ConvBn2d, _nni.ConvBn3d] + + def __init__(self, conv: _nn.Module, bn: _nn.Module, act: _nn.Module): + super().__init__(_OrderedDict([("conv", self.intr_mod(conv, bn)), ("act", act)])) + + @property + def weight(self): + return self.conv.weight + + +class ConvAct1d(_ConvAct): + pass + + +class ConvAct2d(_ConvAct): + pass + + +class ConvAct3d(_ConvAct): + pass + + +class ConvBnAct1d(_ConvBnAct): + intr_mod = _nni.ConvBn1d + pass + + +class ConvBnAct2d(_ConvBnAct): + intr_mod = _nni.ConvBn2d + pass + + +class ConvBnAct3d(_ConvBnAct): + intr_mod = _nni.ConvBn3d + pass + + +class LinearAct(_torch.nn.Sequential): + def __init__(self, linear: _nn.Linear, act: _nn.Module): + super().__init__(_OrderedDict([("linear", linear), ("act", act)])) + + @property + def weight(self): + return self.linear.weight diff --git a/coremltools/optimize/torch/quantization/modules/qat_modules.py b/coremltools/optimize/torch/quantization/modules/qat_modules.py new file mode 100644 index 000000000..452941b61 --- /dev/null +++ b/coremltools/optimize/torch/quantization/modules/qat_modules.py @@ -0,0 +1,173 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. 
+# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +from collections import OrderedDict as _OrderedDict +from typing import Type as _Type +from typing import Union as _Union + +import torch as _torch +import torch.ao.nn.intrinsic as _nni +import torch.ao.nn.qat as _nnqat +import torch.ao.quantization as _aoquant +import torch.nn as _nn +import torch.nn.intrinsic.qat as _nniqat + +import coremltools.optimize.torch.quantization.modules.fused_modules as _fuse + + +class _ConvAct(_torch.nn.Sequential): + root_mod: _Type[_nn.Module] + qat_mod: _Union[_nnqat.Conv1d, _nnqat.Conv2d, _nnqat.Conv3d] + fused_mod: _Union[_fuse.ConvAct1d, _fuse.ConvAct2d, _fuse.ConvAct3d] + + def __init__(self, conv: _nn.Module, act: _nn.Module, qconfig: _aoquant.QConfig): + super().__init__(_OrderedDict([("conv", conv), ("act", act)])) + self.qconfig = qconfig + + def forward(self, x: _torch.Tensor) -> _torch.Tensor: + return self.act(self.conv(x)) + + @property + def weight(self): + return self.conv.weight + + @property + def weight_fake_quant(self): + return self.conv.weight_fake_quant + + @classmethod + def from_float(cls, mod: _nn.Module): + if isinstance(mod.conv, cls.qat_mod): + conv = mod.conv + + else: + assert isinstance(mod.conv, cls.root_mod), ( + f"Failed to convert module for QAT. " + f"Expected module type {cls.root_mod}, " + f"received type {type(mod.conv)}." + ) + conv = cls.qat_mod.from_float(mod.conv) + + conv.activation_post_process = None + return cls(conv, mod.act, mod.qconfig) + + def to_float(self) -> _nn.Module: + return self.fused_mod( + conv=self.conv.to_float(), + act=self.act, + ) + + +class _ConvBnAct(_ConvAct): + intr_mod: _Type[_nn.Module] + qat_mod: _Union[_nniqat.ConvBn1d, _nniqat.ConvBn2d, _nniqat.ConvBn3d] + fused_mod: _Union[_fuse.ConvAct1d, _fuse.ConvAct2d, _fuse.ConvAct3d] + + @classmethod + def from_float(cls, mod: _nn.Module): + if isinstance(mod.conv, cls.intr_mod): + conv = cls.qat_mod.from_float(mod.conv) + else: + conv = mod.conv + assert isinstance(conv, cls.qat_mod), ( + f"Failed to convert module for QAT. " + f"Expected module type {cls.qat_mod}, " + f"received type {type(conv)}." 
+ ) + conv.activation_post_process = None + return cls(conv, mod.act, mod.qconfig) + + +class ConvAct1d(_ConvAct): + root_mod = _nn.Conv1d + qat_mod = _nnqat.Conv1d + fused_mod = _fuse.ConvAct1d + + def __init__(self, conv: _nnqat.Conv1d, act: _nn.Module, qconfig: _aoquant.QConfig): + super().__init__(conv, act, qconfig) + + +class ConvAct2d(_ConvAct): + root_mod = _nn.Conv2d + qat_mod = _nnqat.Conv2d + fused_mod = _fuse.ConvAct2d + + def __init__(self, conv: _nnqat.Conv2d, act: _nn.Module, qconfig: _aoquant.QConfig): + super().__init__(conv, act, qconfig) + + +class ConvAct3d(_ConvAct): + root_mod = _nn.Conv3d + qat_mod = _nnqat.Conv3d + fused_mod = _fuse.ConvAct3d + + def __init__(self, conv: _nnqat.Conv3d, act: _nn.Module, qconfig: _aoquant.QConfig): + super().__init__(conv, act, qconfig) + + +class ConvBnAct1d(_ConvBnAct): + intr_mod = _nni.ConvBn1d + qat_mod = _nniqat.ConvBn1d + fused_mod = _fuse.ConvAct1d + + def __init__(self, conv: _nniqat.ConvBn1d, act: _nn.Module, qconfig: _aoquant.QConfig): + super().__init__(conv, act, qconfig) + + +class ConvBnAct2d(_ConvBnAct): + intr_mod = _nni.ConvBn2d + qat_mod = _nniqat.ConvBn2d + fused_mod = _fuse.ConvAct2d + + def __init__(self, conv: _nniqat.ConvBn2d, act: _nn.Module, qconfig: _aoquant.QConfig): + super().__init__(conv, act, qconfig) + + +class ConvBnAct3d(_ConvBnAct): + intr_mod = _nni.ConvBn3d + qat_mod = _nniqat.ConvBn3d + fused_mod = _fuse.ConvAct3d + + def __init__(self, conv: _nniqat.ConvBn3d, act: _nn.Module, qconfig: _aoquant.QConfig): + super().__init__(conv, act, qconfig) + + +class LinearAct(_torch.nn.Sequential): + def __init__(self, linear: _nnqat.Linear, act: _nn.Module, qconfig: _aoquant.QConfig): + super().__init__(_OrderedDict([("linear", linear), ("act", act)])) + self.qconfig = qconfig + + def forward(self, x: _torch.Tensor) -> _torch.Tensor: + return self.act(self.linear(x)) + + @property + def weight(self): + return self.linear.weight + + @property + def weight_fake_quant(self): + return self.linear.weight_fake_quant + + @classmethod + def from_float(cls, mod: _fuse.LinearAct): + if isinstance(mod.linear, _nnqat.Linear): + linear = mod.linear + + else: + assert isinstance(mod.linear, _nn.Linear), ( + f"Failed to convert module for QAT. " + f"Expected module type {_nn.Linear}, " + f"received type {type(mod.linear)}." + ) + linear = _nnqat.Linear.from_float(mod.linear) + + linear.activation_post_process = None + return cls(linear, mod.act, mod.qconfig) + + def to_float(self) -> _fuse.LinearAct: + return _fuse.LinearAct( + linear=self.linear.to_float(), + act=self.act, + ) diff --git a/coremltools/optimize/torch/quantization/modules/quantized_modules.py b/coremltools/optimize/torch/quantization/modules/quantized_modules.py new file mode 100644 index 000000000..f1810c045 --- /dev/null +++ b/coremltools/optimize/torch/quantization/modules/quantized_modules.py @@ -0,0 +1,53 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. 
+# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +from collections import OrderedDict as _OrderedDict +from typing import Type as _Type + +import torch.ao.nn.quantized.reference as _reference +import torch.nn as _nn + + +class _QuantizedConvAct(_nn.Sequential): + ref_quant_mod: _Type[_nn.Module] + + def __init__(self, conv: _nn.Module, act: _nn.Module): + super().__init__(_OrderedDict([("conv", conv), ("act", act)])) + + @classmethod + def from_float(cls, float_conv_act, weight_qparams): + conv = cls.ref_quant_mod.from_float(float_conv_act.conv, weight_qparams) + return cls(conv, float_conv_act.act) + + +class QuantizedConvAct1d(_QuantizedConvAct): + ref_quant_mod = _reference.Conv1d + + def __init__(self, conv: _reference.Conv1d, act: _nn.Module): + super().__init__(conv, act) + + +class QuantizedConvAct2d(_QuantizedConvAct): + ref_quant_mod = _reference.Conv2d + + def __init__(self, conv: _reference.Conv2d, act: _nn.Module): + super().__init__(conv, act) + + +class QuantizedConvAct3d(_QuantizedConvAct): + ref_quant_mod = _reference.Conv3d + + def __init__(self, conv: _reference.Conv3d, act: _nn.Module): + super().__init__(conv, act) + + +class QuantizedLinearAct(_nn.Sequential): + def __init__(self, linear: _reference.Linear, act: _nn.Module): + super().__init__(_OrderedDict([("linear", linear), ("act", act)])) + + @classmethod + def from_float(cls, float_linear_act, weight_qparams): + linear = _reference.Linear.from_float(float_linear_act.linear, weight_qparams) + return cls(linear, float_linear_act.act) diff --git a/coremltools/optimize/torch/quantization/quantization_config.py b/coremltools/optimize/torch/quantization/quantization_config.py new file mode 100644 index 000000000..f68898fe1 --- /dev/null +++ b/coremltools/optimize/torch/quantization/quantization_config.py @@ -0,0 +1,269 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import logging as _logging +from collections import OrderedDict as _OrderedDict +from enum import Enum as _Enum +from enum import unique as _unique +from typing import Any as _Any +from typing import Callable as _Callable +from typing import Dict as _Dict +from typing import List as _List +from typing import NewType as _NewType +from typing import Optional as _Optional +from typing import Union as _Union + +import cattrs as _cattrs +import torch as _torch +import torch.ao.quantization as _aoquant +from attr import define as _define +from attr import field as _field +from attrs import validators as _validators + +from coremltools.optimize.torch._utils.torch_utils import ( + maybe_convert_str_to_dtype as _maybe_convert_str_to_dtype, +) +from coremltools.optimize.torch._utils.torch_utils import ( + maybe_convert_str_to_mod_type as _maybe_convert_str_to_mod_type, +) +from coremltools.optimize.torch.optimization_config import ( + ModuleOptimizationConfig as _ModuleOptimizationConfig, +) +from coremltools.optimize.torch.optimization_config import OptimizationConfig as _OptimizationConfig +from coremltools.optimize.torch.optimization_config import _structure_from_dict_hook_factory + +_logger = _logging.getLogger(__name__) + + +@_unique +class ObserverType(_Enum): + """ + An enum indicating the type of observer. Allowed options are moving_average_min_max and mix_max. 
+    """
+    moving_average_min_max = "moving_average_min_max"
+    mix_max = "min_max"
+
+    @staticmethod
+    def get_observer(observer_type: "ObserverType", is_per_channel: bool) -> _Any:
+        _str_to_observer_map = {
+            "moving_average_min_max": _aoquant.MovingAverageMinMaxObserver,
+            "min_max": _aoquant.MinMaxObserver,
+            "moving_average_min_max_per_channel": _aoquant.MovingAveragePerChannelMinMaxObserver,
+            "min_max_per_channel": _aoquant.PerChannelMinMaxObserver,
+        }
+        observer_name = observer_type.value
+        if is_per_channel:
+            observer_name = f"{observer_name}_per_channel"
+        return _str_to_observer_map[observer_name]
+
+
+@_unique
+class QuantizationScheme(_Enum):
+    """
+    An enum indicating the type of quantization to be performed. Allowed options are symmetric
+    and affine.
+    """
+
+    symmetric = "symmetric"
+    affine = "affine"
+
+    @staticmethod
+    def get_qscheme(
+        quantization_scheme: "QuantizationScheme", is_per_channel: bool
+    ) -> _torch.qscheme:
+        _str_to_qscheme_map = {
+            "symmetric": _torch.per_tensor_symmetric,
+            "affine": _torch.per_tensor_affine,
+            "symmetric_per_channel": _torch.per_channel_symmetric,
+            "affine_per_channel": _torch.per_channel_affine,
+        }
+        quantization_scheme_name = quantization_scheme.value
+        if is_per_channel:
+            quantization_scheme_name = f"{quantization_scheme_name}_per_channel"
+        return _str_to_qscheme_map[quantization_scheme_name]
+
+
+_default_quantization_options = {
+    "weight_dtype": _torch.qint8,
+    "weight_per_channel": True,
+    "activation_dtype": _torch.quint8,
+    "observer": ObserverType.moving_average_min_max,
+    "quantization_scheme": QuantizationScheme.symmetric,
+}
+
+
+@_define
+class ModuleLinearQuantizerConfig(_ModuleOptimizationConfig):
+    """
+    Module level configuration for :py:class:`LinearQuantizer`.
+
+    Args:
+        weight_dtype (:py:class:`torch.dtype`): The dtype to use for quantizing the weights. Defaults to
+            :py:class:`torch.qint8`.
+        weight_observer (:py:class:`ObserverType`): Type of observer to use for quantizing weights. Defaults
+            to ``moving_average_min_max``.
+        weight_per_channel (:obj:`bool`): When ``True``, weights are quantized per channel; otherwise, per tensor.
+        activation_dtype (:py:class:`torch.dtype`): The dtype to use for quantizing the activations. Defaults to
+            :py:class:`torch.quint8`.
+        activation_observer (:py:class:`ObserverType`): Type of observer to use for quantizing activations. Defaults
+            to ``moving_average_min_max``.
+        quantization_scheme (:py:class:`QuantizationScheme`): Type of quantization configuration to use. When
+            this parameter is set to :py:class:`QuantizationScheme.symmetric`, all weights are
+            quantized with zero point as zero, and all activations are quantized with zero point as zero for
+            non-negative activations and 128 for all other activations. When it is set to
+            :py:class:`QuantizationScheme.affine`, zero point can be set anywhere in the range of values allowed
+            for the quantized weight/activation.
+        milestones (:obj:`list` of :obj:`int`): A list of four integers indicating milestones to use during
+            quantization. The first milestone corresponds to enabling observers, the second to enabling fake
+            quantization simulation, the third to disabling observers, and the last
+            to freezing batch norm statistics.
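+
+    Example (an illustrative configuration; any valid combination of the arguments above works):
+
+        .. code-block:: python
+
+            import torch
+
+            # quantize weights to 8 bits symmetrically, keep activations in float
+            config = ModuleLinearQuantizerConfig(
+                weight_dtype=torch.qint8,
+                activation_dtype=torch.float32,
+                quantization_scheme="symmetric",
+            )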
+ """ + + weight_dtype: _torch.dtype = _field( + default=_default_quantization_options["weight_dtype"], + converter=_maybe_convert_str_to_dtype, + validator=[ + _validators.instance_of(_torch.dtype), + _validators.in_([_torch.qint8, _torch.quint8, _torch.float32]), + ], + ) + weight_observer: ObserverType = _field( + default=_default_quantization_options["observer"], + converter=ObserverType, + validator=_validators.in_(ObserverType), + ) + weight_per_channel: bool = _field( + default=_default_quantization_options["weight_per_channel"], + validator=_validators.instance_of(bool), + ) + activation_dtype: _torch.dtype = _field( + default=_default_quantization_options["activation_dtype"], + converter=_maybe_convert_str_to_dtype, + validator=[ + _validators.instance_of(_torch.dtype), + _validators.in_([_torch.quint8, _torch.float32]), + ], + ) + activation_observer: ObserverType = _field( + default=_default_quantization_options["observer"], + converter=ObserverType, + validator=_validators.in_(ObserverType), + ) + quantization_scheme: QuantizationScheme = _field( + default=_default_quantization_options["quantization_scheme"], + converter=QuantizationScheme, + validator=_validators.in_(QuantizationScheme), + ) + milestones: _Optional[_List[int]] = _field( + default=None, + validator=_validators.optional( + _validators.deep_iterable( + member_validator=_validators.instance_of(int), + iterable_validator=_validators.instance_of(list), + ) + ), + ) + + def __attrs_post_init__(self): + if self.weight_dtype == _torch.float32 and self.activation_dtype != _torch.float32: + raise ValueError( + f"Unsupported configuration: weight_dtype = {self.weight_dtype}, " + f"activation_dtype = {self.activation_dtype}. When weights are not quantized," + f"activations cannot be quantized." + ) + + @milestones.validator + def _check_milestones(self, attribute, value): + if value is not None: + assert len(value) == 4, ( + f"Received milestones = {value}. " + f"Milestones should be of length 4. " + f"Refer to docs for more information." + ) + + +_ModuleTypeConfigType = _NewType( + "ModuleTypeConfigType", + _Dict[_Union[_Callable, str], _Optional[ModuleLinearQuantizerConfig]], +) + + +@_define +class LinearQuantizerConfig(_OptimizationConfig): + """ + Configuration for :py:class:`LinearQuantizer`. + + Args: + global_config (:py:class:`ModuleLinearQuantizerConfig`): Config to be applied globally + to all supported modules. Missing values are chosen from the default config. + module_type_configs (:obj:`dict` of :obj:`str` to :py:class:`ModuleLinearQuantizerConfig`): + Module type level configs applied to a specific + module class, such as :py:class:`torch.nn.Linear`. The keys can be either strings + or module classes. + module_name_configs (:obj:`dict` of :obj:`str` to :py:class:`ModuleLinearQuantizerConfig`): + Module level configs applied to specific modules. + The name of the module must be a fully qualified name that can be used to fetch it + from the top level module using the ``module.get_submodule(target)`` method. + non_traceable_module_names (:obj:`list` of :obj:`str`): + Names of modules which cannot be traced using ``torch.fx``. + + .. note:: + The ``quantization_scheme`` parameter must be the same across all configs. 
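+
+    Example (illustrative; ``"conv2"`` is a hypothetical submodule name):
+
+        .. code-block:: python
+
+            config = LinearQuantizerConfig.from_dict(
+                {
+                    "global_config": {"quantization_scheme": "symmetric"},
+                    # mapping a module name to None skips quantization for that module
+                    "module_name_configs": {"conv2": None},
+                }
+            )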
+ + """ + + global_config: _Optional[ModuleLinearQuantizerConfig] = _field( + default=None, + validator=_validators.optional(_validators.instance_of(ModuleLinearQuantizerConfig)), + ) + module_type_configs: _ModuleTypeConfigType = _field( + factory=_OrderedDict, + validator=_validators.deep_mapping( + key_validator=_validators.instance_of((str, _Callable)), + value_validator=_validators.optional( + _validators.instance_of(ModuleLinearQuantizerConfig) + ), + mapping_validator=_validators.instance_of(dict), + ), + ) + module_name_configs: _Dict[str, _Optional[ModuleLinearQuantizerConfig]] = _field( + factory=_OrderedDict, + validator=_validators.deep_mapping( + key_validator=_validators.instance_of(str), + value_validator=_validators.optional( + _validators.instance_of(ModuleLinearQuantizerConfig) + ), + mapping_validator=_validators.instance_of(dict), + ), + ) + non_traceable_module_names: _List[str] = _field( + default=list(), + validator=_validators.deep_iterable( + member_validator=_validators.instance_of(str), + ), + ) + + def __attrs_post_init__(self): + if ( + self.global_config is None + and len(self.module_type_configs) == 0 + and len(self.module_name_configs) == 0 + ): + self.global_config = ModuleLinearQuantizerConfig() + self.module_type_configs = { + _maybe_convert_str_to_mod_type(key): val + for key, val in self.module_type_configs.items() + } + self._validate_same_params(["quantization_scheme"]) + + @classmethod + def from_dict(cls, config_dict: _Dict[str, _Any]) -> "LinearQuantizerConfig": + super().from_dict(config_dict) + converter = _cattrs.Converter(forbid_extra_keys=True) + converter.register_structure_hook( + _ModuleTypeConfigType, + _structure_from_dict_hook_factory(ModuleLinearQuantizerConfig), + ) + return converter.structure_attrs_fromdict(config_dict, cls) diff --git a/coremltools/optimize/torch/quantization/quantizer.py b/coremltools/optimize/torch/quantization/quantizer.py new file mode 100644 index 000000000..7bdf99bbb --- /dev/null +++ b/coremltools/optimize/torch/quantization/quantizer.py @@ -0,0 +1,268 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. 
+# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import copy as _copy +import logging as _logging +from typing import Any as _Any +from typing import Optional as _Optional +from typing import Tuple as _Tuple + +import torch as _torch +import torch.ao.quantization as _aoquant +import torch.nn.intrinsic.qat as _nnintrinsicqat +from torch.ao.quantization.fx.custom_config import PrepareCustomConfig as _PrepareCustomConfig +from torch.ao.quantization.quantize_fx import convert_to_reference_fx as _convert_to_reference_fx + +from coremltools.optimize.torch._utils.math_utils import rmse_error as _rmse_error +from coremltools.optimize.torch._utils.torch_utils import get_eval_model as _get_eval_model +from coremltools.optimize.torch.base_model_optimizer import ( + BaseModelOptimizer as _BaseModelOptimizer, +) +from coremltools.optimize.torch.base_model_optimizer import _Report +from coremltools.optimize.torch.quantization._backend_config import ( + get_backend_config as _get_backend_config, +) +from coremltools.optimize.torch.quantization._backend_config import ( + get_supported_modules as _get_supported_modules, +) +from coremltools.optimize.torch.quantization._configure import ( + QATConfigurationHandler as _QATConfigurationHandler, +) +from coremltools.optimize.torch.quantization._qconfig_mapping import _QConfigMappingBuilder +from coremltools.optimize.torch.quantization.quantization_config import ( + LinearQuantizerConfig as _LinearQuantizerConfig, +) +from coremltools.optimize.torch.quantization.quantization_config import ( + ModuleLinearQuantizerConfig as _ModuleLinearQuantizerConfig, +) + +_logger = _logging.getLogger(__name__) + + +class Quantizer(_BaseModelOptimizer): + pass + + +class LinearQuantizer(Quantizer): + """ + Perform quantization aware training (QAT) of models. + + Example: + + .. code-block:: python + + import torch.nn as nn + from coremltools.optimize.torch.quantization import ( + LinearQuantizer, + LinearQuantizerConfig, + ) + + model = nn.Sequential( + OrderedDict( + { + "conv": nn.Conv2d(1, 20, (3, 3)), + "relu1": nn.ReLU(), + "conv2": nn.Conv2d(20, 20, (3, 3)), + "relu2": nn.ReLU(), + } + ) + ) + + loss_fn = define_loss() + + # initialize the quantizer + config = LinearQuantizerConfig.from_dict( + { + "global_config": { + "quantization_scheme": "symmetric", + "milestones": [0, 100, 400, 400], + } + } + ) + + quantizer = LinearQuantizer(model, config) + + # prepare the model to insert FakeQuantize layers for QAT + model = quantizer.prepare() + + # use quantizer in your PyTorch training loop + for inputs, labels in data: + output = model(inputs) + loss = loss_fn(output, labels) + loss.backward() + optimizer.step() + quantizer.step() + + # convert operations to their quanitzed counterparts using parameters learnt via QAT + model = quantizer.finalize(inplace=True) + + Args: + model (:obj:`torch.nn.Module`): Module to be trained. + config (:py:class:`_LinearQuantizerConfig`): Config that specifies how + different submodules in the model will be quantized. + Default config is used when passed as ``None``. 
+ """ + _supported_modules: _Tuple = tuple(_get_supported_modules()) + + def __init__(self, model: _torch.nn.Module, config: _Optional[_LinearQuantizerConfig] = None): + config = _LinearQuantizerConfig() if config is None else config + super().__init__(model, config) + global_config = self._construct_global_config() + self._is_prepared = False + self._quantization_scheme = global_config.quantization_scheme + self._milestones = global_config.milestones + qmapping_builder = _QConfigMappingBuilder() + self._qconfig_mapping = qmapping_builder.get_qconfig_mapping_from_quantization_config( + model=self._model, + quantization_config=self._config, + quantization_scheme=self._quantization_scheme, + ) + + def _construct_global_config(self) -> _ModuleLinearQuantizerConfig: + if self._config.global_config is not None: + return self._config.global_config + for _, config in self._config.module_type_configs.items(): + if config is not None: + return config + for _, config in self._config.module_name_configs.items(): + if config is not None: + return config + return _ModuleLinearQuantizerConfig() + + def prepare(self, example_inputs: _Any, inplace: bool = False) -> _torch.nn.Module: + """ + Prepares the model for quantization aware training by inserting + :py:class:`torch.ao.quantization.FakeQuantize` layers in the model in appropriate places. + + Args: + inplace (:obj:`bool`): If ``True``, model transformations are carried out in-place and + the original module is mutated, otherwise a copy of the model is mutated and returned. + """ + if self._is_prepared: + _logger.warning( + "Model has already been prepared for QAT. This API call will be a no-op." + ) + return self._model + model = self._model + if not inplace: + model = _copy.deepcopy(self._model) + model.train() + prepare_custom_config = _PrepareCustomConfig().set_non_traceable_module_names( + self._config.non_traceable_module_names + ) + qat_handler = _QATConfigurationHandler( + prepare_custom_config=prepare_custom_config, + qconfig_mapping=self._qconfig_mapping, + backend_config=_get_backend_config(), + quantization_scheme=self._quantization_scheme, + ) + prepared_model = qat_handler.prepare(model, example_inputs) + if self._milestones is not None: + prepared_model.apply(_aoquant.disable_observer) + prepared_model.apply(_aoquant.disable_fake_quant) + self._model = prepared_model + self._is_prepared = True + return prepared_model + + def step(self): + """ + Steps through the milestones defined for this quantizer. + + The first milestone corresponds to enabling observers, the second + to enabling fake quantization simulation, the third + to disabling observers, and the last to freezing batch norm statistics. + + .. note:: + If milestones argument is set as ``None``, this method is a no-op. + + .. note:: + In order to not use a particular milestone, its value can be set as ``float('inf')``. + """ + if not self._is_prepared: + _logger.warning( + "Model has not been prepared for QAT. This API call " + "will be a no-op. prepare method must be called before " + "a call to the step method." 
+ ) + return + if self._milestones is None: + return + else: + if self._step_count == self._milestones[0]: + self._model.apply(_aoquant.enable_observer) + if self._step_count == self._milestones[1]: + self._model.apply(_aoquant.enable_fake_quant) + if self._step_count == self._milestones[2]: + self._model.apply(_aoquant.disable_observer) + if self._step_count == self._milestones[3]: + self._model.apply(_nnintrinsicqat.freeze_bn_stats) + self._step_count += 1 + + def finalize( + self, model: _Optional[_torch.nn.Module] = None, inplace: bool = False + ) -> _torch.nn.Module: + """ + Prepares the model for export. + + Args: + model (:py:class:`_torch.nn.Module`): Model to be finalized. + inplace (:obj:`bool`): If ``True``, model transformations are carried out in-place and + the original module is mutated; otherwise, a copy of the model is mutated and returned. + + .. note:: + Once the model is finalized with ``in_place = True``, it may not be + runnable on the GPU. + """ + if not self._is_prepared: + _logger.warning( + "Model has not been prepared for QAT. This API call " + "will be a no-op. prepare method must be called before " + "a call to the finalize method." + ) + return self._model + if model is None: + model = self._model + if not inplace: + model = _copy.deepcopy(model) + model.eval() + finalized_model = _convert_to_reference_fx( + model, qconfig_mapping=self._qconfig_mapping, backend_config=_get_backend_config() + ) + if model is None: + self._model = finalized_model + return finalized_model + + def report(self) -> _Report: + """ + Returns a dictionary with important statistics related to current state of quantization. + Each key in the dictionary corresponds to a module name, and the + value is a dictionary containing the statistics such as scale, zero point, + number of parameters, and so on. 
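+
+        Example (illustrative; ``"conv1"`` is a hypothetical module name):
+
+            .. code-block:: python
+
+                report = quantizer.report()
+                # "error" is the RMSE between the float weight and its fake-quantized version
+                print(report["conv1"]["error"])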
+ """ + report = _Report() + with _get_eval_model(self._model) as model: + with _torch.no_grad(): + for name, module in model.named_modules(remove_duplicate=True): + module_summary = dict() + if hasattr(module, "weight_fake_quant"): + module_summary["device"] = module.weight.device + observer = module.weight_fake_quant.activation_post_process + qweight = module.weight_fake_quant.forward(module.weight.detach()) + module_summary["qmin"] = module.weight_fake_quant.quant_min + module_summary["qmax"] = module.weight_fake_quant.quant_max + module_summary["min_v"] = _torch.min(observer.min_val).item() + module_summary["max_v"] = _torch.max(observer.max_val).item() + scale = module.weight_fake_quant.scale + if len(scale) <= 1: + module_summary["scale"] = scale.item() + zerop = module.weight_fake_quant.zero_point + if len(zerop) <= 1: + module_summary["zerop"] = zerop.item() + module_summary["error"] = _rmse_error( + module.weight.detach(), qweight + ).item() + module_summary["#params"] = int(_torch.numel(qweight)) + report[name] = module_summary + return report diff --git a/coremltools/proto/VisionFeaturePrint_pb2.py b/coremltools/proto/VisionFeaturePrint_pb2.py index face7914c..e276ba2dc 100644 --- a/coremltools/proto/VisionFeaturePrint_pb2.py +++ b/coremltools/proto/VisionFeaturePrint_pb2.py @@ -19,7 +19,7 @@ name='VisionFeaturePrint.proto', package='CoreML.Specification.CoreMLModels', syntax='proto3', - serialized_pb=_b('\n\x18VisionFeaturePrint.proto\x12!CoreML.Specification.CoreMLModels\"\xe0\x04\n\x12VisionFeaturePrint\x12L\n\x05scene\x18\x14 \x01(\x0b\x32;.CoreML.Specification.CoreMLModels.VisionFeaturePrint.SceneH\x00\x12P\n\x07objects\x18\x15 \x01(\x0b\x32=.CoreML.Specification.CoreMLModels.VisionFeaturePrint.ObjectsH\x00\x1a\xb7\x01\n\x05Scene\x12Y\n\x07version\x18\x01 \x01(\x0e\x32H.CoreML.Specification.CoreMLModels.VisionFeaturePrint.Scene.SceneVersion\"S\n\x0cSceneVersion\x12\x19\n\x15SCENE_VERSION_INVALID\x10\x00\x12\x13\n\x0fSCENE_VERSION_1\x10\x01\x12\x13\n\x0fSCENE_VERSION_2\x10\x02\x1a\xd5\x01\n\x07Objects\x12]\n\x07version\x18\x01 \x01(\x0e\x32L.CoreML.Specification.CoreMLModels.VisionFeaturePrint.Objects.ObjectsVersion\x12\x0e\n\x06output\x18\x64 \x03(\t\"[\n\x0eObjectsVersion\x12\x1b\n\x17OBJECTS_VERSION_INVALID\x10\x00\x12\x15\n\x11OBJECTS_VERSION_1\x10\x01\x12\x15\n\x11OBJECTS_VERSION_2\x10\x02\x42\x18\n\x16VisionFeaturePrintTypeB\x02H\x03\x62\x06proto3') + serialized_pb=_b('\n\x18VisionFeaturePrint.proto\x12!CoreML.Specification.CoreMLModels\"\xc9\x04\n\x12VisionFeaturePrint\x12L\n\x05scene\x18\x14 \x01(\x0b\x32;.CoreML.Specification.CoreMLModels.VisionFeaturePrint.SceneH\x00\x12P\n\x07objects\x18\x15 \x01(\x0b\x32=.CoreML.Specification.CoreMLModels.VisionFeaturePrint.ObjectsH\x00\x1a\xb7\x01\n\x05Scene\x12Y\n\x07version\x18\x01 \x01(\x0e\x32H.CoreML.Specification.CoreMLModels.VisionFeaturePrint.Scene.SceneVersion\"S\n\x0cSceneVersion\x12\x19\n\x15SCENE_VERSION_INVALID\x10\x00\x12\x13\n\x0fSCENE_VERSION_1\x10\x01\x12\x13\n\x0fSCENE_VERSION_2\x10\x02\x1a\xbe\x01\n\x07Objects\x12]\n\x07version\x18\x01 \x01(\x0e\x32L.CoreML.Specification.CoreMLModels.VisionFeaturePrint.Objects.ObjectsVersion\x12\x0e\n\x06output\x18\x64 \x03(\t\"D\n\x0eObjectsVersion\x12\x1b\n\x17OBJECTS_VERSION_INVALID\x10\x00\x12\x15\n\x11OBJECTS_VERSION_1\x10\x01\x42\x18\n\x16VisionFeaturePrintTypeB\x02H\x03\x62\x06proto3') ) @@ -64,15 +64,11 @@ name='OBJECTS_VERSION_1', index=1, number=1, options=None, type=None), - _descriptor.EnumValueDescriptor( - name='OBJECTS_VERSION_2', index=2, number=2, - 
options=None, - type=None), ], containing_type=None, options=None, serialized_start=555, - serialized_end=646, + serialized_end=623, ) _sym_db.RegisterEnumDescriptor(_VISIONFEATUREPRINT_OBJECTS_OBJECTSVERSION) @@ -143,7 +139,7 @@ oneofs=[ ], serialized_start=433, - serialized_end=646, + serialized_end=623, ) _VISIONFEATUREPRINT = _descriptor.Descriptor( @@ -183,7 +179,7 @@ index=0, containing_type=None, fields=[]), ], serialized_start=64, - serialized_end=672, + serialized_end=649, ) _VISIONFEATUREPRINT_SCENE.fields_by_name['version'].enum_type = _VISIONFEATUREPRINT_SCENE_SCENEVERSION diff --git a/coremltools/test/api/test_api_examples.py b/coremltools/test/api/test_api_examples.py index f13e37426..8400b59b1 100644 --- a/coremltools/test/api/test_api_examples.py +++ b/coremltools/test/api/test_api_examples.py @@ -390,7 +390,7 @@ def test_empty_pipeline(self): example_input = torch.rand(1, 1, 28, 28) traced_model = torch.jit.trace(model, example_input) - pipeline = ct.PassPipeline.get_empty_pipeline() + pipeline = ct.PassPipeline.EMPTY model_converted = ct.convert( traced_model, @@ -466,7 +466,7 @@ def test_pass_option_skip_const_by_size(self): ) pipeline.set_options( - "common::const_elimination", {"skip_const_by_size": "-1"}, override=True + "common::const_elimination", {"skip_const_by_size": "-1"} ) model_converted = ct.convert( traced_model, diff --git a/coremltools/test/api/test_api_visibilities.py b/coremltools/test/api/test_api_visibilities.py index c66e0c6db..d3c365484 100644 --- a/coremltools/test/api/test_api_visibilities.py +++ b/coremltools/test/api/test_api_visibilities.py @@ -43,6 +43,7 @@ def _check_visible_modules(actual, expected): "transform", "libmodelpackage", "libmilstoragepython", + "optimize", ] @@ -160,6 +161,27 @@ def test_converters(self): ] _check_visible_modules(_get_visible_items(ct.converters), expected) + def test_optimize(self): + expected = [ + "coreml", + "torch", + ] + _check_visible_modules(_get_visible_items(ct.optimize), expected) + + def test_optimize_coreml(self): + expected = [ + "OpLinearQuantizerConfig", + "OpMagnitudePrunerConfig", + "OpPalettizerConfig", + "OptimizationConfig", + "OpThresholdPrunerConfig", + "linear_quantize_weights", + "palettize_weights", + "prune_weights", + "decompress_weights", + ] + _check_visible_modules(_get_visible_items(ct.optimize.coreml), expected) + def test_converters_libsvm(self): _check_visible_modules(_get_visible_items(ct.converters.libsvm), ["convert"]) diff --git a/coremltools/test/blob/test_weights.py b/coremltools/test/blob/test_weights.py index c08183464..72bb061d1 100644 --- a/coremltools/test/blob/test_weights.py +++ b/coremltools/test/blob/test_weights.py @@ -29,7 +29,7 @@ def test_weight_blob_int8(self): writer = None reader = BlobReader(self.working_dir + "/net.wt") - output_arr = np.array(reader.read_int8_data(offset), np.int8) + output_arr = reader.read_int8_data(offset) np.testing.assert_equal(input_arr, output_arr) def test_weight_blob_uint8(self): @@ -39,7 +39,27 @@ def test_weight_blob_uint8(self): writer = None reader = BlobReader(self.working_dir + "/net.wt") - output_arr = np.array(reader.read_uint8_data(offset), np.uint8) + output_arr = reader.read_uint8_data(offset) + np.testing.assert_almost_equal(input_arr, output_arr) + + def test_weight_blob_int16(self): + writer = BlobWriter(self.working_dir + "/net.wt") + input_arr = np.array([-5, -2, 0, 2, 5], dtype=np.int16) + offset = writer.write_int16_data(input_arr) + writer = None + + reader = BlobReader(self.working_dir + "/net.wt") + 
output_arr = reader.read_int16_data(offset) + np.testing.assert_equal(input_arr, output_arr) + + def test_weight_blob_uint16(self): + writer = BlobWriter(self.working_dir + "/net.wt") + input_arr = np.array([1, 2, 3, 4, 5], dtype=np.uint16) + offset = writer.write_uint16_data(input_arr) + writer = None + + reader = BlobReader(self.working_dir + "/net.wt") + output_arr = reader.read_uint16_data(offset) np.testing.assert_almost_equal(input_arr, output_arr) def test_weight_blob_fp16(self): @@ -50,7 +70,7 @@ def test_weight_blob_fp16(self): writer = None reader = BlobReader(self.working_dir + "/net.wt") - output_arr_uint16 = np.array(reader.read_fp16_data(offset), np.uint16) + output_arr_uint16 = reader.read_fp16_data(offset) output_arr = np.frombuffer(output_arr_uint16.tobytes(), np.float16) np.testing.assert_almost_equal(input_arr, output_arr) @@ -61,7 +81,7 @@ def test_weight_blob_fp32(self): writer = None reader = BlobReader(self.working_dir + "/net.wt") - output_arr = np.array(reader.read_float_data(offset), np.float32) + output_arr = reader.read_float_data(offset) np.testing.assert_almost_equal(input_arr, output_arr) if __name__ == "__main__": diff --git a/coremltools/test/ml_program/test_compression.py b/coremltools/test/ml_program/test_compression.py index 283efbfbc..10ed10382 100644 --- a/coremltools/test/ml_program/test_compression.py +++ b/coremltools/test/ml_program/test_compression.py @@ -3,29 +3,18 @@ # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -import itertools - import numpy as np import pytest import torch import coremltools as ct -from coremltools._deps import _HAS_SKLEARN +from coremltools.models.ml_program.compression_utils import ( + affine_quantize_weights, + decompress_weights, + palettize_weights, + sparsify_weights, +) from coremltools.converters.mil.testing_utils import get_op_types_in_program -from coremltools.converters.mil.mil import types - - -def create_unique_weight(weight, nbits): - shape = weight.detach().numpy().shape - size = weight.detach().numpy().size - - unique_number = 1 << 4 - weight = [] - partition_len = size // unique_number + 1 - for i in range(unique_number): - weight += [i] * (partition_len) - weight = np.reshape(np.array(weight[:size]).astype(np.float32), shape) - return weight def get_test_model_and_data(multi_layer=False): inputs = [ct.TensorType(name="data", shape=(1, 64, 10, 10))] @@ -53,380 +42,78 @@ def forward(self, x): class TestCompressionUtils: - - affine_quantize_weights = ct.compression_utils.affine_quantize_weights - palettize_weights = ct.compression_utils.palettize_weights - sparsify_weights = ct.compression_utils.sparsify_weights - decompress_weights = ct.compression_utils.decompress_weights - - @staticmethod - def verify_model_outputs(model, compressed_model, input_values): - """ - This utility functions does the following checks: - - (1) Verify the output of the compressed model has the same shape / type of the original model - (2) The decompressed and compressed model have the same numerical outputs - """ - - # Make sure the model can be decompressed - decompressed_model = TestCompressionUtils.decompress_weights(compressed_model) - - # Validate the output shape / type - ref_outputs = model._mil_program.functions["main"].outputs - outputs = compressed_model._mil_program.functions["main"].outputs - - assert len(ref_outputs) == len(outputs) - - for a, b in zip(ref_outputs, outputs): - assert a.name == b.name - assert 
a.shape == a.shape - assert a.dtype == b.dtype - - if ct.utils._macos_version() < (13, 0): - return - - # Validate that the compressed model could be decompressed, and produces correct outputs - output_dict = compressed_model.predict(input_values) - de_output_dict = decompressed_model.predict(input_values) - for k, v in de_output_dict.items(): - assert k in output_dict - np.testing.assert_allclose(v, output_dict[k]) - + """ + Since ct.compression_utils is deprecated, this test is only checking the API is still working. + """ @staticmethod def test_op_selector(): model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data() torchmodel = torch.jit.trace(model, torch_input_values) mlmodel = ct.convert(torchmodel, inputs=inputs, convert_to="mlprogram") - mlmodel_no_quantized = TestCompressionUtils.affine_quantize_weights(mlmodel, mode="linear", op_selector=lambda const_op: const_op.val.val.size > 1e7) - expected_ops = ['cast', 'conv', 'cast'] + mlmodel_no_quantized = affine_quantize_weights(mlmodel, mode="linear", op_selector=lambda const_op: const_op.val.val.size > 1e7) + expected_ops = ['cast', 'conv', 'cast'] assert get_op_types_in_program(mlmodel_no_quantized._mil_program) == expected_ops - - @staticmethod - @pytest.mark.skipif(not _HAS_SKLEARN, reason="Missing scikit-learn. Skipping tests.") - def test_weight_decompression(): - """ - This test is doing the following steps - - (1) compress a model with two conv layers into a compressed model with two different constexpr ops - - [Original model]: - - weight_1 weight_2 - | | - v v - input -> conv_1 -----> conv_2 ---> output - - - [Compressed model]: - - weight_1_lut weight_2_affine - | | - v v - input -> conv_1 ------> conv_2 ---> output - - , where weight_1_lut is a constexpr_lut_to_dense op and weight_2_affine is a constexpr_affine_dequantize op - (2) decompress the compressed model - - [Decompressed model]: - - weight_1_new weight_2_new - | | - v v - input -> conv_1 ------> conv_2 ---> output - - , note that, weight_1_new is equivalent to weight_1_lut, and weight_2_new is equivalent to weight_2_affine - """ - model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data(multi_layer=True) - torchmodel = torch.jit.trace(model, torch_input_values) - mlmodel = ct.convert(torchmodel, inputs=inputs, convert_to="mlprogram") - - # we first compress the model - mlmodel = TestCompressionUtils.palettize_weights(mlmodel, mode="kmeans", nbits=4, op_selector=lambda const_op: const_op.name == "conv_1_weight_to_fp16") - mlmodel = TestCompressionUtils.affine_quantize_weights(mlmodel, mode="linear", op_selector=lambda const_op: const_op.name == "conv_2_weight_to_fp16") - expected_ops = ['constexpr_lut_to_dense', 'cast', 'conv', 'constexpr_affine_dequantize', 'conv', 'cast'] - assert get_op_types_in_program(mlmodel._mil_program) == expected_ops - - # decompress the model - decompressed_model = TestCompressionUtils.decompress_weights(mlmodel) - assert get_op_types_in_program(decompressed_model._mil_program) == ['cast', 'conv', 'conv', 'cast'] - - if ct.utils._macos_version() < (13, 0): - return - - # compared the numerical outputs - output_dict = mlmodel.predict(coreml_input_values) - de_output_dict = decompressed_model.predict(coreml_input_values) - - for k, v in output_dict.items(): - assert k in de_output_dict - np.testing.assert_allclose(v, de_output_dict[k]) - @staticmethod - def test_compression_utils_error_handling(): + def test_affine_quantize_weights_smoke(): model, inputs, torch_input_values, 
coreml_input_values = get_test_model_and_data() torchmodel = torch.jit.trace(model, torch_input_values) mlmodel = ct.convert(torchmodel, inputs=inputs, convert_to="mlprogram") - - # Test invalid mode for affine quantization - expected_err_str = "supported for weight affine quantization. Got mode" - with pytest.raises(ValueError, match=expected_err_str): - TestCompressionUtils.affine_quantize_weights(mlmodel, mode="invalid_mode") - - # Test invalid dtype for affine quantization - expected_err_str = "is unsupported for affine_quantize_weight" - with pytest.raises(ValueError, match=expected_err_str): - TestCompressionUtils.affine_quantize_weights(mlmodel, dtype=np.int32) - - with pytest.raises(ValueError, match=expected_err_str): - TestCompressionUtils.affine_quantize_weights(mlmodel, dtype="int32") - - # Test invalid mode for weight sparsification - expected_err_str = "supported for weight sparsification. Got mode" - with pytest.raises(ValueError, match=expected_err_str): - TestCompressionUtils.sparsify_weights(mlmodel, mode="invalid_mode") - - # Test invalid threshold for weight sparsification - expected_err_str = "Invalid value of threshold: \-1. Needs to be in \[0, inf\)" - with pytest.raises(ValueError, match=expected_err_str): - TestCompressionUtils.sparsify_weights(mlmodel, mode="threshold_based", threshold=-1) - - # Test invalid percentile for weight sparsification - expected_err_str = "Invalid value of target_percentile: 1.2. Needs to be in \[0, 1\]" - with pytest.raises(ValueError, match=expected_err_str): - TestCompressionUtils.sparsify_weights(mlmodel, mode="percentile_based", target_percentile=1.2) - - # Test invalid mode for weight palettization - expected_err_str = "supported for weight palettization. Got mode" - with pytest.raises(ValueError, match=expected_err_str): - TestCompressionUtils.palettize_weights(mlmodel, mode="invalid_mode") - - # Test nbits must be provided for kmeans, uniform mode for weight palettization - expected_err_str = "nbits must be provided for mode" - with pytest.raises(ValueError, match=expected_err_str): - TestCompressionUtils.palettize_weights(mlmodel, mode="kmeans") - - with pytest.raises(ValueError, match=expected_err_str): - TestCompressionUtils.palettize_weights(mlmodel, mode="uniform") - - # Test nbits must not be provided for unique, custom mode for weight palettization - expected_err_str = "nbits must NOT be provided for mode" - with pytest.raises(ValueError, match=expected_err_str): - TestCompressionUtils.palettize_weights(mlmodel, mode="unique", nbits=2) - - with pytest.raises(ValueError, match=expected_err_str): - TestCompressionUtils.palettize_weights(mlmodel, mode="custom", nbits=2) - - # Test lut_function must be provided for custom mode, and must not be provided otherwise - expected_err_str = "lut_function must be None if mode is not custom, and that it cannot be None when the mode is custom." 
- with pytest.raises(ValueError, match=expected_err_str): - TestCompressionUtils.palettize_weights(mlmodel, mode="custom") - with pytest.raises(ValueError, match=expected_err_str): - TestCompressionUtils.palettize_weights(mlmodel, mode="unique", lut_function=lambda op: True) - - # Test lut_function must be a function obejct - expected_err_str = "A function object must be provided as lut_function" - with pytest.raises(ValueError, match=expected_err_str): - TestCompressionUtils.palettize_weights(mlmodel, mode="custom", lut_function=1) - - + mlmodel_quantized = affine_quantize_weights(mlmodel, mode="linear_symmetric", dtype=np.int8) + + # validate parameters + expected_ops = ['constexpr_affine_dequantize', 'cast', 'conv', 'cast'] + assert get_op_types_in_program(mlmodel_quantized._mil_program) == expected_ops + @staticmethod - @pytest.mark.parametrize( - "mode, dtype", - itertools.product( - ("linear", "linear_symmetric"), - (np.int8, np.uint8, types.int8, types.uint8), - ), - ) - def test_linear_quanitzation(mode, dtype): + def test_palettize_weights_smoke(): model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data() torchmodel = torch.jit.trace(model, torch_input_values) mlmodel = ct.convert(torchmodel, inputs=inputs, convert_to="mlprogram") - - mlmodel_quantized = TestCompressionUtils.affine_quantize_weights(mlmodel, mode=mode, dtype=dtype) + mlmodel_palettized = palettize_weights(mlmodel, nbits=4, mode="uniform") # validate parameters - expected_ops = ['constexpr_affine_dequantize', 'cast', 'conv', 'cast'] - assert get_op_types_in_program(mlmodel_quantized._mil_program) == expected_ops - - quanitze_op = mlmodel_quantized._mil_program.functions["main"].find_ops(op_type="constexpr_affine_dequantize")[0] - assert model.weight.detach().numpy().shape == quanitze_op.quantized_data.shape - - TestCompressionUtils.verify_model_outputs(mlmodel, mlmodel_quantized, coreml_input_values) - + expected_ops = ['constexpr_lut_to_dense', 'cast', 'conv', 'cast'] + assert get_op_types_in_program(mlmodel_palettized._mil_program) == expected_ops + @staticmethod - @pytest.mark.parametrize( - "threshold", - (0.0, 0.001, 1e2), - ) - def test_weight_sparsify_threshold_based(threshold): + def test_sparsify_weights_threshold_smoke(): model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data() with torch.no_grad(): model.weight[0][0][0][0] = 101 torchmodel = torch.jit.trace(model, torch_input_values) mlmodel = ct.convert(torchmodel, inputs=inputs, convert_to="mlprogram") - mlmodel_sparsified = TestCompressionUtils.sparsify_weights(mlmodel, mode="threshold_based", threshold=threshold) + mlmodel_sparsified = sparsify_weights(mlmodel, mode="threshold_based", threshold=0.01) # validate parameters expected_ops = ['constexpr_sparse_to_dense', 'cast', 'conv', 'cast'] assert get_op_types_in_program(mlmodel_sparsified._mil_program) == expected_ops - - main_func = mlmodel_sparsified._mil_program.functions["main"] - sparse_to_dense_op = main_func.find_ops(op_type="constexpr_sparse_to_dense")[0] - non_sparse_data = sparse_to_dense_op.nonzero_data - - if threshold != 1e2: - assert np.min(np.absolute(non_sparse_data.val)) >= threshold - else: - assert non_sparse_data.val.size == 1 - - assert sparse_to_dense_op.shape.val.tolist() == list(model.weight.detach().numpy().shape) - - # validate the model - TestCompressionUtils.verify_model_outputs(mlmodel, mlmodel_sparsified, coreml_input_values) - + @staticmethod - @pytest.mark.parametrize( - "percentile", - (0., 0.5, 1.0), - ) - def 
test_weight_sparsify_percentile_based(percentile): + def test_sparsify_weights_percentile_smoke(): model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data() + with torch.no_grad(): + model.weight[0][0][0][0] = 101 torchmodel = torch.jit.trace(model, torch_input_values) mlmodel = ct.convert(torchmodel, inputs=inputs, convert_to="mlprogram") - mlmodel_sparsified = TestCompressionUtils.sparsify_weights(mlmodel, mode="percentile_based", target_percentile=percentile) + mlmodel_sparsified = sparsify_weights(mlmodel, mode="percentile_based", target_percentile=0.8) - # validate parameters + # validate parameters expected_ops = ['constexpr_sparse_to_dense', 'cast', 'conv', 'cast'] assert get_op_types_in_program(mlmodel_sparsified._mil_program) == expected_ops - - main_func = mlmodel_sparsified._mil_program.functions["main"] - sparse_to_dense_op = main_func.find_ops(op_type="constexpr_sparse_to_dense")[0] - non_sparse_data = sparse_to_dense_op.nonzero_data - weight = model.weight.detach().numpy() - - if percentile == 0.: - assert non_sparse_data.val.size == weight.size - 1 - elif percentile == 0.5: - assert non_sparse_data.val.size <= 0.51 * (weight.size) and non_sparse_data.val.size >= 0.49 * (weight.size) - else: - assert non_sparse_data.val.size == 0 - - assert sparse_to_dense_op.shape.val.tolist() == list(model.weight.detach().numpy().shape) - - # validate the model - TestCompressionUtils.verify_model_outputs(mlmodel, mlmodel_sparsified, coreml_input_values) - - @staticmethod - @pytest.mark.parametrize( - "mode", - ("uniform", "kmeans") if _HAS_SKLEARN else ("uniform",) - ) - def test_weight_palettization(mode): - model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data() - torchmodel = torch.jit.trace(model, torch_input_values) - mlmodel = ct.convert(torchmodel, inputs=inputs, convert_to="mlprogram") - mlmodel_palettized = TestCompressionUtils.palettize_weights(mlmodel, nbits=4, mode=mode) - - # validate parameters - expected_ops = ['constexpr_lut_to_dense', 'cast', 'conv', 'cast'] - assert get_op_types_in_program(mlmodel_palettized._mil_program) == expected_ops - - main_func = mlmodel_palettized._mil_program.functions["main"] - lut_to_dense_op = main_func.find_ops(op_type="constexpr_lut_to_dense")[0] - - assert lut_to_dense_op.shape.val.tolist() == list(model.weight.detach().numpy().shape) - - # validate the model - TestCompressionUtils.verify_model_outputs(mlmodel, mlmodel_palettized, coreml_input_values) - - @staticmethod - def test_weight_palettization_unique_case_1(): - # In this model, both conv weights can be palettized - model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data(multi_layer=True) - - weight_1_unique = create_unique_weight(model.conv_1.weight, nbits=2) - weight_2_unique = create_unique_weight(model.conv_2.weight, nbits=6) - with torch.no_grad(): - model.conv_1.weight = torch.nn.Parameter(torch.Tensor(weight_1_unique)) - model.conv_2.weight = torch.nn.Parameter(torch.Tensor(weight_2_unique)) - - torchmodel = torch.jit.trace(model, torch_input_values) - mlmodel = ct.convert(torchmodel, inputs=inputs, convert_to="mlprogram") - - # validate parameters - mlmodel_palettized = TestCompressionUtils.palettize_weights(mlmodel, mode="unique") - expected_ops = ['constexpr_lut_to_dense', 'cast', 'conv', 'constexpr_lut_to_dense', 'conv', 'cast'] - assert get_op_types_in_program(mlmodel_palettized._mil_program) == expected_ops - - main_func = mlmodel_palettized._mil_program.functions["main"] - lut_to_dense_op_1 = 
main_func.find_ops(op_type="constexpr_lut_to_dense")[0] - lut_to_dense_op_2 = main_func.find_ops(op_type="constexpr_lut_to_dense")[1] - - assert lut_to_dense_op_1.shape.val.tolist() == list(model.conv_1.weight.detach().numpy().shape) - assert lut_to_dense_op_2.shape.val.tolist() == list(model.conv_2.weight.detach().numpy().shape) - - # validate the model - TestCompressionUtils.verify_model_outputs(mlmodel, mlmodel_palettized, coreml_input_values) - @staticmethod - def test_weight_palettization_unique_case_2(caplog): - # In this model, only one conv weights can be palettized, the converter should warn the users that one weight is skipped + def test_weight_decompression_smoke(): model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data(multi_layer=True) - - weight_1_unique = create_unique_weight(model.conv_1.weight, nbits=2) - - with torch.no_grad(): - model.conv_1.weight = torch.nn.Parameter(torch.Tensor(weight_1_unique)) - - torchmodel = torch.jit.trace(model, torch_input_values) - mlmodel = ct.convert(torchmodel, inputs=inputs, convert_to="mlprogram") - - # validate parameters - # converter should warn the user that one weight is not compressed - mlmodel_palettized = TestCompressionUtils.palettize_weights(mlmodel, mode="unique") - warning_msg = "weight value cannot be represented in an 8 bits palettization. Skipped." - assert any([warning_msg in rec.message for rec in caplog.records]) - - expected_ops = ['constexpr_lut_to_dense', 'cast', 'conv', 'conv', 'cast'] - assert get_op_types_in_program(mlmodel_palettized._mil_program) == expected_ops - - main_func = mlmodel_palettized._mil_program.functions["main"] - lut_to_dense_op_1 = main_func.find_ops(op_type="constexpr_lut_to_dense")[0] - assert lut_to_dense_op_1.shape.val.tolist() == list(model.conv_1.weight.detach().numpy().shape) - - # validate the model - TestCompressionUtils.verify_model_outputs(mlmodel, mlmodel_palettized, coreml_input_values) - - @staticmethod - def test_weight_palettization_custom(): - model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data() torchmodel = torch.jit.trace(model, torch_input_values) mlmodel = ct.convert(torchmodel, inputs=inputs, convert_to="mlprogram") + + # we first compress the model + mlmodel = palettize_weights(mlmodel, mode="kmeans", nbits=4, op_selector=lambda const_op: const_op.name == "conv_1_weight_to_fp16") + mlmodel = affine_quantize_weights(mlmodel, mode="linear", op_selector=lambda const_op: const_op.name == "conv_2_weight_to_fp16") + expected_ops = ['constexpr_lut_to_dense', 'cast', 'conv', 'constexpr_affine_dequantize', 'conv', 'cast'] + assert get_op_types_in_program(mlmodel._mil_program) == expected_ops - def lut_function(weight): - nbits = 4 - weight = weight.flatten() - unique_elements = np.unique(weight) - k = (1 << nbits) - 1 - top_k = np.partition(weight, -k)[-k:] - np.sort(top_k) - lut = np.array([0.] 
+ top_k.tolist()).astype(weight.dtype) - mapping = {v: idx for idx, v in enumerate(lut)} - indices = np.array([mapping[v] if v in mapping else 0 for v in weight]).astype(np.uint8) - return lut, indices - - mlmodel_palettized = TestCompressionUtils.palettize_weights(mlmodel, mode="custom", lut_function=lut_function) - - # validate parameters - expected_ops = ['constexpr_lut_to_dense', 'cast', 'conv', 'cast'] - assert get_op_types_in_program(mlmodel_palettized._mil_program) == expected_ops - - main_func = mlmodel_palettized._mil_program.functions["main"] - lut_to_dense_op = main_func.find_ops(op_type="constexpr_lut_to_dense")[0] - - assert lut_to_dense_op.shape.val.tolist() == list(model.weight.detach().numpy().shape) - - # validate the model - TestCompressionUtils.verify_model_outputs(mlmodel, mlmodel_palettized, coreml_input_values) + # decompress the model + decompressed_model = decompress_weights(mlmodel) + assert get_op_types_in_program(decompressed_model._mil_program) == ['cast', 'conv', 'conv', 'cast'] diff --git a/coremltools/test/modelpackage/test_modelpackage.py b/coremltools/test/modelpackage/test_modelpackage.py index a3d2059ba..e64a4ca55 100644 --- a/coremltools/test/modelpackage/test_modelpackage.py +++ b/coremltools/test/modelpackage/test_modelpackage.py @@ -12,13 +12,16 @@ import coremltools from coremltools import ComputeUnit, utils +from coremltools._deps import _HAS_TORCH from coremltools.converters.mil import Builder as mb from coremltools.libmodelpackage import ModelPackage from coremltools.models import MLModel -from coremltools.models.utils import (_MLPACKAGE_AUTHOR_NAME, - _WEIGHTS_DIR_NAME) +from coremltools.models.utils import _MLPACKAGE_AUTHOR_NAME, _WEIGHTS_DIR_NAME from coremltools.proto import Model_pb2 +if _HAS_TORCH: + import torch + def _remove_path(path): if os.path.isdir(path): @@ -30,6 +33,7 @@ def _remove_path(path): class TestMLModel: def setup_class(self): + spec = Model_pb2.Model() spec.specificationVersion = coremltools.SPECIFICATION_VERSION @@ -100,15 +104,44 @@ def test_model_api(self): model.save(package.name) loaded_model = MLModel(package.name) - assert model.author == "Test author" - assert model.license == "Test license" - assert model.short_description == "Test model" - assert model.input_description["feature_1"] == "This is feature 1" - assert model.output_description["output"] == "This is output" + assert loaded_model.author == "Test author" + assert loaded_model.license == "Test license" + assert loaded_model.short_description == "Test model" + assert loaded_model.input_description["feature_1"] == "This is feature 1" + assert loaded_model.output_description["output"] == "This is output" # cleanup _remove_path(package.name) + + @pytest.mark.skipif(not _HAS_TORCH, reason="requires torch") + def test_save_from_mlpackage(self): + class Model(torch.nn.Module): + def forward(self, x): + return x + + example_input = torch.rand(1, 3, 50, 50) + traced_model = torch.jit.trace(Model().eval(), example_input) + + model = coremltools.convert( + traced_model, + inputs=[coremltools.TensorType(shape=example_input.shape)], + convert_to="mlprogram", + ) + + author = "Bobby Joe!" 
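+        # The author value itself is arbitrary; what this test checks is that metadata set on
+        # the in-memory MLModel survives a save to an .mlpackage and a reload from disk.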
+ model.author = author + + save_dir = tempfile.TemporaryDirectory(suffix=".mlpackage").name + + model.save(save_dir) + loaded_model = MLModel(save_dir) + + assert loaded_model.author == author + + _remove_path(save_dir) + + def test_predict_api(self): model = MLModel(self.spec) @@ -232,15 +265,14 @@ def test_save_in_place(self): _remove_path(package.name) + @pytest.mark.skipif(not _HAS_TORCH, reason="requires torch") def test_mil_as_package(self): - import torch - num_tokens = 3 embedding_size = 5 class TestModule(torch.nn.Module): def __init__(self): - super(TestModule, self).__init__() + super().__init__() self.embedding = torch.nn.Embedding(num_tokens, embedding_size) def forward(self, x): @@ -310,7 +342,7 @@ def test_model_save_no_extension(self): class TestModule(torch.nn.Module): def __init__(self): - super(TestModule, self).__init__() + super().__init__() self.embedding = torch.nn.Embedding(num_tokens, embedding_size) def forward(self, x): diff --git a/coremltools/test/neural_network/test_quantization.py b/coremltools/test/neural_network/test_quantization.py index ff5899146..80a4d949b 100644 --- a/coremltools/test/neural_network/test_quantization.py +++ b/coremltools/test/neural_network/test_quantization.py @@ -7,6 +7,7 @@ Module containing unit tests for verifying various quantizations. """ +import itertools import unittest import numpy as np @@ -560,3 +561,64 @@ def test_embeddingND_quantize(compute_units): ) def test_embeddingND_quantize_CPU_and_NE(self): self.test_embeddingND_quantize(ComputeUnit.CPU_AND_NE) + + +class TestKMeansLookup: + @pytest.mark.parametrize("weightShape, dtype", + itertools.product( + [(20, 20), (120, 120)], + [np.float16, np.float32] + )) + def test_kmeans_lookup(self, weightShape, dtype): + nbits = 4 + w = np.random.rand(*weightShape).astype(dtype) + + lookup_table, quantized_weights = quantization_utils._get_kmeans_lookup_table_and_weight(nbits, w) + + assert(len(lookup_table) == 2 ** nbits) + assert(quantized_weights.shape == (np.prod(weightShape),)) + assert(len(np.unique(quantized_weights)) <= len(lookup_table)) + + quantized_weight_values = lookup_table[quantized_weights] + max_deltas = np.abs(w.flatten() - quantized_weight_values.flatten()).max() + assert max_deltas < 0.1 + + def test_kmeans1d_exact_value(self): + w = np.array( + [ + [12.0, 11.0, 12.0, 33.0, 32.0, 99.0, 0.0, 34.0, 40.0], + [41.0, 34.0, 98.0, 75.1, 89.0, 99.0, 0.0, 10.0, 41.0], + ] + ) + + lookup_table, quantized_weights = quantization_utils._get_kmeans_lookup_table_and_weight( + 4, w, force_kmeans1d=True + ) + + assert all( + lookup_table + == np.array( + [ + 0.0, + 10.0, + 11.0, + 12.0, + 32.0, + 33.0, + 34.0, + 40.0, + 41.0, + 75.1, + 89.0, + 98.0, + 99.0, + 0.0, + 0.0, + 0.0, + ] + ) + ) + assert all( + quantized_weights + == np.array([3, 2, 3, 5, 4, 12, 0, 6, 7, 8, 6, 11, 9, 10, 12, 0, 1, 8]) + ) diff --git a/coremltools/test/neural_network/test_tf_numeric.py b/coremltools/test/neural_network/test_tf_numeric.py index 8c0ebbfac..3fcdca7e5 100644 --- a/coremltools/test/neural_network/test_tf_numeric.py +++ b/coremltools/test/neural_network/test_tf_numeric.py @@ -7,6 +7,7 @@ import unittest import numpy as np +import pytest import coremltools.models.datatypes as datatypes from coremltools import ComputeUnit @@ -390,6 +391,9 @@ def test_resize_bilinear_cpu_only(self): @unittest.skipUnless(_macos_version() >= (10, 14), "Only supported on MacOS 10.14+") def test_crop_resize(self, cpu_only=False): + if _macos_version()[0] == 12: + pytest.xfail("rdar://110274216") + def 
get_coreml_model_crop_resize(params): eval = True mlmodel = None diff --git a/coremltools/test/optimize/__init__.py b/coremltools/test/optimize/__init__.py new file mode 100644 index 000000000..af4a4e029 --- /dev/null +++ b/coremltools/test/optimize/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause \ No newline at end of file diff --git a/coremltools/test/optimize/coreml/__init__.py b/coremltools/test/optimize/coreml/__init__.py new file mode 100644 index 000000000..af4a4e029 --- /dev/null +++ b/coremltools/test/optimize/coreml/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause \ No newline at end of file diff --git a/coremltools/test/optimize/coreml/test_passes.py b/coremltools/test/optimize/coreml/test_passes.py new file mode 100644 index 000000000..096866713 --- /dev/null +++ b/coremltools/test/optimize/coreml/test_passes.py @@ -0,0 +1,2497 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import itertools +import os +import tempfile + +import cattrs +import numpy as np +import pytest +import yaml + +import coremltools as ct +import coremltools.optimize as cto +import coremltools.optimize.coreml._quantization_passes as quantization +from coremltools.converters.mil.mil import Builder as mb +from coremltools.converters.mil.mil import types +from coremltools.converters.mil.mil.passes.tests.test_passes import TestSkipConstexprOps +from coremltools.converters.mil.testing_utils import get_op_types_in_program + + +class TestCompressionNumerical: + """ + This unit test is checking the numerical correctness for the compress/decompress methods + in the compression graph paths. 
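+    Each test compresses a raw numpy weight with the relevant pass's compress() method and then
+    checks that the matching decompress() reconstructs it, roughly following this pattern
+    (a sketch of what the tests below do):
+
+        params = quantization.palettize_weights.compress(val, mode="KMEANS", nbits=4)
+        decompressed_val = quantization.palettize_weights.decompress(params)
+        np.testing.assert_allclose(val, decompressed_val, rtol=1e-02, atol=1e-02)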
+ """ + @pytest.mark.parametrize( + "axis, mode, source_dtype, target_dtype, data_range", + itertools.product( + [0, 1, 2, 3, -1], + ["LINEAR", "LINEAR_SYMMETRIC"], + [np.float16, np.float32], + [types.uint8, types.int8], + [ + [-1., 1.], + [-3., -1.], + [1., 3.], + # Test corner case of same values + [0., 0.], + [1., 1.], + [-1., -1.], + ] + ), + ) + def test_linear_quantizer_compression(self, axis, mode, source_dtype, target_dtype, data_range): + input_shape = (10, 20, 30, 40) + low, high = data_range + val = np.random.uniform(low, high, input_shape).astype(source_dtype) + params = quantization.linear_quantize_weights.compress(val, axis, mode, target_dtype) + decompressed_val = quantization.linear_quantize_weights.decompress(params) + np.testing.assert_allclose(val, decompressed_val, rtol=1e-02, atol=1e-02) + + @pytest.mark.parametrize( + "mode, nbits, shape", + itertools.product( + ["KMEANS", "UNIFORM", "UNIQUE"], + [1, 2, 4, 6, 8], + [ + (1,), + (1, 1), + (1, 10), + (2, 20), + (3, 7, 9), + (17, 17, 17), + ] + ), + ) + def test_palettizer_compression(self, mode, nbits, shape): + val_size = np.prod(shape) + max_val = 2 ** nbits + val = np.arange(max_val).tolist() + val = np.array(val * (val_size // max_val + 1))[:val_size].astype(np.float32) + params = quantization.palettize_weights.compress(val, mode=mode, nbits=nbits) + decompressed_val = quantization.palettize_weights.decompress(params) + + # For + # 1. UNIQUE / KMEANS mode + # 2. UNIFORM mode with the data range <= tensor size + # We can perfecting re-construct the original value + if (mode in ["UNIQUE", "KMEANS"]) or (mode == "UNIFORM" and max_val <= val_size): + np.testing.assert_allclose(val, decompressed_val, rtol=1e-02, atol=1e-02) + + def test_block_sparsity_pruning_smoke(self): + # dim = 0 + val = np.array( + [ + [1, 3, 4], + [-6, -7, 2], + [0, 3, 4], + [-9, 2, -1], + ] + ).astype(np.float32) + + expected_val = np.array( + [ + [1, 3, 0], + [-6, -7, 0], + [0, 0, 0], + [-9, 0, 0], + ] + ).astype(np.float32) + + params = quantization.prune_weights.compress_by_magnitude( + val, + target_sparsity=0.5, + block_size=2, + dim=0, + ) + decompressed_val = quantization.prune_weights.decompress(params) + np.testing.assert_array_equal(decompressed_val, expected_val) + + # dim = 1, with padding + val = np.array( + [ + [1, 3, 4, 18, 1], + [-6, -7, 2, 2, 9], + [0, 3, 4, 8, 9], + ] + ).astype(np.float32) + + expected_val = np.array( + [ + [0, 0, 4, 18, 0], + [-6, -7, 0, 0, 9], + [0, 0, 0, 0, 9], + ] + ).astype(np.float32) + + params = quantization.prune_weights.compress_by_magnitude( + val, + target_sparsity=0.5, + block_size=2, + dim=1, + ) + decompressed_val = quantization.prune_weights.decompress(params) + np.testing.assert_array_equal(decompressed_val, expected_val) + + @pytest.mark.parametrize( + "block_size, target_sparsity, shape, dim", + itertools.product( + [2, 5, 10, 17], + [0.0, 0.1, 0.5, 0.75, 1.0], + [ + (10, 25), + ( + 10, + 5, + 8, + ), + (40, 100, 6, 7), + (20, 60, 4, 5, 6), + ], + [0, 1], + ), + ) + def test_block_sparsity_pruning_stress(self, block_size, target_sparsity, shape, dim): + def _is_int(val): + return int(val) == val + + val = np.random.rand(*shape) + rank = len(shape) + + params = quantization.prune_weights.compress_by_magnitude( + val, + target_sparsity=target_sparsity, + block_size=block_size, + dim=dim, + ) + + if block_size > shape[dim] / 2: + assert params is None + return + + decompressed_val = quantization.prune_weights.decompress(params) + assert decompressed_val.shape == val.shape + + sparsity_percentile 
= np.sum(decompressed_val == 0) / np.prod(shape) + if (shape[dim]) % block_size == 0 and _is_int( + np.prod(shape) // block_size * target_sparsity + ): + assert sparsity_percentile == target_sparsity + + val_compress = np.copy(val) + val_compress[np.where(decompressed_val == 0)] = 0 + np.testing.assert_array_equal(decompressed_val, val_compress) + + def test_n_m_pruning_smoke(self): + # dim = 1 + val = np.array( + [ + [1, 3, 4, -3], + [-6, -7, 2, 4], + [0, 3, 4, 1], + [-9, 2, -1, 8], + ] + ).astype(np.float32) + + expected_val = np.array( + [ + [0, 3, 4, 0], + [0, -7, 0, 4], + [0, 3, 4, 0], + [-9, 0, 0, 8], + ] + ).astype(np.float32) + + params = quantization.prune_weights.compress_by_nm_sparsity( + val, + n_m_ratio=(1, 2), + dim=1, + ) + decompressed_val = quantization.prune_weights.decompress(params) + np.testing.assert_array_equal(decompressed_val, expected_val) + + # dim = 0, with padding + val = np.array( + [ + [1, 3, 4, -3, 2, 4], + [-6, -7, 2, 4, 6, 8], + [0, 4, 4, 1, -9, -4], + [-9, 2, -1, 8, 3, 9], + [-1, 5, 0, 8, 9, -3], + [-3, 3, 6, 3, 6, -1], + [2, 1, -2, 8, 2, -6], + ] + ).astype(np.float32) + + expected_val = np.array( + [ + [0, 0, 0, 0, 0, 0], + [-6, -7, 0, 4, 0, 8], + [0, 0, 4, 0, -9, 0], + [-9, 0, 0, 0, 0, 9], + [0, 5, 0, 8, 9, 0], + [0, 0, 6, 0, 0, 0], + [2, 1, -2, 8, 2, -6], + ] + ).astype(np.float32) + + params = quantization.prune_weights.compress_by_nm_sparsity( + val, + n_m_ratio=(2, 3), + dim=0, + ) + decompressed_val = quantization.prune_weights.decompress(params) + print(decompressed_val) + np.testing.assert_array_equal(decompressed_val, expected_val) + + @pytest.mark.parametrize( + "n_m_ratio, shape", + itertools.product( + [ + (1, 1), + (0, 2), + (1, 2), + (3, 5), + (5, 10), + (12, 17), + ], + [ + (1, 2), + (3, 3), + ( + 10, + 5, + 8, + ), + (80, 50, 6, 7), + (40, 30, 4, 5, 6), + ], + ), + ) + def test_n_m_pruning_stress(self, n_m_ratio, shape): + n, m = n_m_ratio + val = np.random.rand(*shape) + rank = len(shape) + + for dim in [0, 1]: + params = quantization.prune_weights.compress_by_nm_sparsity( + val, + n_m_ratio=n_m_ratio, + dim=dim, + ) + + # We skip the compression if m > channel / 2 + if m > shape[dim] / 2: + assert params is None + return + + decompressed_val = quantization.prune_weights.decompress(params) + assert decompressed_val.shape == val.shape + + sparsity_percentile = np.sum(decompressed_val == 0) / np.prod(shape) + if (shape[dim]) % m == 0: + assert sparsity_percentile == n / m + + val_compress = np.copy(val) + val_compress[np.where(decompressed_val == 0)] = 0 + np.testing.assert_array_equal(decompressed_val, val_compress) + +class TestCompressionGraphBackwardCompatibility: + """ + Most of the numerical tests are already convered in coremltools.tests.ml_program.test_compression_utils. + This test is checking the basic behavior of the graph pass classes using only global config. + This test also convers the backward compatibility test for the deprecated ct.compression_utils. 
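+    When is_deprecated=True, each pass is driven through the old op_selector callback rather than
+    weight_threshold, mirroring the deprecated ct.compression_utils entry points; the rough shape
+    of the config used in the tests below is:
+
+        config = cto.coreml.OptimizationConfig(
+            global_config=op_config, is_deprecated=True, op_selector=lambda const: True
+        )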
+ """ + @staticmethod + def _get_conv_program(): + @mb.program( + input_specs=[mb.TensorSpec(shape=(1, 30, 10, 10))], opset_version=ct.target.iOS16 + ) + def prog(x): + conv_weight = np.random.rand(90, 30, 2, 2).astype(np.float32) + x = mb.conv(x=x, weight=conv_weight) + return x + + return prog + + @pytest.mark.parametrize( + "fake_compression, is_deprecated", + itertools.product( + [True, False], + [True, False], + ) + ) + def test_affine_quantizer(self, fake_compression, is_deprecated): + weight_threshold = None if is_deprecated else 0 + op_selector=(lambda const: True) if is_deprecated else None + op_config = cto.coreml.OpLinearQuantizerConfig(weight_threshold=weight_threshold) + config = cto.coreml.OptimizationConfig(global_config=op_config, is_deprecated=is_deprecated, op_selector=op_selector) + quantizer = quantization.linear_quantize_weights( + config=config, fake_compression=fake_compression + ) + prog = self._get_conv_program() + quantizer.apply(prog) + expected_ops = ["constexpr_affine_dequantize", "conv"] if not fake_compression else ["conv"] + assert get_op_types_in_program(prog) == expected_ops + + @pytest.mark.parametrize( + "fake_compression, is_deprecated", + itertools.product( + [True, False], + [True, False], + ) + ) + def test_weight_pruner(self, fake_compression, is_deprecated): + weight_threshold = None if is_deprecated else 0 + op_selector=(lambda const: True) if is_deprecated else None + op_config = cto.coreml.OpMagnitudePrunerConfig( + weight_threshold=weight_threshold, + target_sparsity=0.75, + ) + config = cto.coreml.OptimizationConfig(global_config=op_config, is_deprecated=is_deprecated, op_selector=op_selector) + quantizer = quantization.prune_weights( + config=config, fake_compression=fake_compression + ) + prog = self._get_conv_program() + quantizer.apply(prog) + expected_ops = ["constexpr_sparse_to_dense", "conv"] if not fake_compression else ["conv"] + assert get_op_types_in_program(prog) == expected_ops + + @pytest.mark.parametrize( + "fake_compression, is_deprecated", + itertools.product( + [True, False], + [True, False], + ) + ) + def test_weight_palettization(self, fake_compression, is_deprecated): + weight_threshold = None if is_deprecated else 0 + op_selector=(lambda const: True) if is_deprecated else None + op_config = cto.coreml.OpPalettizerConfig( + weight_threshold=weight_threshold, + mode="uniform", + nbits=4, + ) + config = cto.coreml.OptimizationConfig(global_config=op_config, is_deprecated=is_deprecated, op_selector=op_selector) + quantizer = quantization.palettize_weights( + config=config, fake_compression=fake_compression + ) + prog = self._get_conv_program() + quantizer.apply(prog) + expected_ops = ["constexpr_lut_to_dense", "conv"] if not fake_compression else ["conv"] + assert get_op_types_in_program(prog) == expected_ops + +class TestCompressionPasses: + @staticmethod + def _get_test_program(): + @mb.program( + input_specs=[mb.TensorSpec(shape=(1, 30, 10, 10))], opset_version=ct.target.iOS16 + ) + def prog(x): + # weight + conv_weight = np.random.rand(90, 30, 2, 2).astype(np.float32) + linear_weight = np.random.rand(70, 81).astype(np.float32) + conv_transpose_weight = np.random.rand(30, 4, 21, 10).astype(np.float32) + + # graph + x = mb.conv(x=x, weight=conv_weight, name="conv") + x = mb.reshape(x=x, shape=(1, 90, 81), name="reshape_1") + x = mb.linear(x=x, weight=linear_weight, name="linear") + x = mb.reshape(x=x, shape=(1, 30, 21, 10), name="reshape_2") + x = mb.conv_transpose(x=x, weight=conv_transpose_weight, name="conv_transpose") + 
return x + return prog + + @staticmethod + def _get_test_program_2(): + @mb.program( + input_specs=[mb.TensorSpec(shape=(1, 30, 10, 10))], opset_version=ct.target.iOS16 + ) + def prog(x): + # weight + conv1_weight = np.random.rand(40, 30, 2, 2).astype(np.float32) + conv2_weight = np.random.rand(50, 40, 3, 3).astype(np.float32) + conv3_weight = np.random.rand(60, 50, 2, 4).astype(np.float32) + + linear1_weight = np.random.rand(80, 60).astype(np.float32) + linear2_weight = np.random.rand(90, 80).astype(np.float32) + + conv_transpose_weight = np.random.rand(60, 30, 6, 10).astype(np.float32) + + # graph + x = mb.conv(x=x, weight=conv1_weight, name="conv1") + x = mb.conv(x=x, weight=conv2_weight, name="conv2") + x = mb.conv(x=x, weight=conv3_weight, name="conv3") + x = mb.reshape(x=x, shape=(6, 4, 60), name="reshape1") + x = mb.linear(x=x, weight=linear1_weight, name="linear1") + x = mb.linear(x=x, weight=linear2_weight, name="linear2") + x = mb.reshape(x=x, shape=(1, 30, 6, 12), name="reshape2") + x = mb.conv_transpose(x=x, weight=conv_transpose_weight, name="conv_transpose") + return x + return prog + +class TestOptimizationConfig(TestCompressionPasses): + """ + Test some basic funtionality of the OptimizationConfig. + """ + @pytest.mark.parametrize( + "compressor_class, fake_compression", + itertools.product( + [ + quantization.palettize_weights, + quantization.prune_weights, + quantization.linear_quantize_weights, + ], + [True, False], + ) + ) + def test_empty_config(self, compressor_class, fake_compression): + """ + For an empty config, the compression graph passes should do nothing + """ + config = cto.coreml.OptimizationConfig() + compressor = compressor_class( + config=config, fake_compression=fake_compression + ) + prog = self._get_test_program() + compressor.apply(prog) + expected_ops = ["conv", "reshape", "linear", "reshape", "conv_transpose"] + assert get_op_types_in_program(prog) == expected_ops + + def test_empty_op_type(self): + """ + If an op_type config is set to None. The entire class will not be compressed. + """ + config = cto.coreml.OptimizationConfig( + global_config=cto.coreml.OpPalettizerConfig(mode="kmeans", nbits=2), + op_type_configs={ + "conv": None, + }, + ) + compressor = quantization.palettize_weights(config=config) + prog = self._get_test_program() + compressor.apply(prog) + expected_ops = [ + "conv", + "reshape", + "constexpr_lut_to_dense", + "linear", + "reshape", + "constexpr_lut_to_dense", + "conv_transpose", + ] + assert get_op_types_in_program(prog) == expected_ops + conv_op = prog.find_ops(op_type="conv")[0] + assert conv_op.weight.op.op_type == "const" + + def test_empty_op_name(self): + """ + If an op_name config is set to None. The op instance will not be compressed. 
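+        In _get_test_program_2 the first convolution is named "conv1", so it keeps its plain
+        const weight, while the remaining convs are still palettized through the "conv"
+        op-type config, as the expected ops below show.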
+ """ + config = cto.coreml.OptimizationConfig( + op_type_configs={ + "conv": cto.coreml.OpPalettizerConfig(mode="kmeans", nbits=2), + }, + op_name_configs={ + "conv1": None, + }, + ) + compressor = quantization.palettize_weights(config=config) + prog = self._get_test_program_2() + compressor.apply(prog) + expected_ops = [ + "conv", + "constexpr_lut_to_dense", + "conv", + "constexpr_lut_to_dense", + "conv", + "reshape", + "linear", + "linear", + "reshape", + "conv_transpose", + ] + assert get_op_types_in_program(prog) == expected_ops + conv_op = prog.find_ops(op_type="conv")[0] + assert conv_op.weight.op.op_type == "const" + + def test_config_hierarchy(self): + """ + This test is checking the graph pass compresses the program correctly according to the following heirarchical order (high -> low): + 1. op name + 2. op type + 3. global + """ + prog = self._get_test_program_2() + + # global config + global_config = cto.coreml.OpPalettizerConfig( + nbits=8, + mode="KMEANS", + weight_threshold=100, + ) + + # op type config + conv_config = cto.coreml.OpPalettizerConfig( + nbits=6, + mode="KMEANS", + weight_threshold=100, + ) + linear_config = cto.coreml.OpPalettizerConfig( + nbits=4, + mode="KMEANS", + weight_threshold=100, + ) + + # op name config + conv1_config = cto.coreml.OpPalettizerConfig( + nbits=2, + mode="KMEANS", + weight_threshold=100, + ) + linear2_config = cto.coreml.OpPalettizerConfig( + nbits=1, + mode="KMEANS", + weight_threshold=100, + ) + + config = cto.coreml.OptimizationConfig() + config.set_global(global_config) + + config.set_op_type("conv", conv_config) + config.set_op_type("linear", linear_config) + + config.set_op_name("conv1", conv1_config) + config.set_op_name("linear2", linear2_config) + + compressor = quantization.palettize_weights(config=config) + compressor.apply(prog) + + expected_ops = [ + "constexpr_lut_to_dense", + "conv", + "constexpr_lut_to_dense", + "conv", + "constexpr_lut_to_dense", + "conv", + "reshape", + "constexpr_lut_to_dense", + "linear", + "constexpr_lut_to_dense", + "linear", + "reshape", + "constexpr_lut_to_dense", + "conv_transpose", + ] + assert get_op_types_in_program(prog) == expected_ops + + expected_nbits = [2, 6, 6, 4, 1, 8, 8] + lut_ops = prog.find_ops(op_type="constexpr_lut_to_dense") + + for nbits, op in zip(expected_nbits, lut_ops): + assert op.lut.val.shape == (2**nbits,) + + def test_mixed_compression_algorithms(self): + """ + This test is checking a program can be ran under different compression method + """ + prog = self._get_test_program_2() + + # Run palettization for conv ops + conv_config = cto.coreml.OpPalettizerConfig( + nbits=1, + mode="KMEANS", + weight_threshold=100, + ) + config = cto.coreml.OptimizationConfig() + config.set_op_type("conv", conv_config) + + compressor = quantization.palettize_weights(config=config) + compressor.apply(prog) + + expected_ops = [ + "constexpr_lut_to_dense", + "conv", + "constexpr_lut_to_dense", + "conv", + "constexpr_lut_to_dense", + "conv", + "reshape", + "linear", + "linear", + "reshape", + "conv_transpose", + ] + assert get_op_types_in_program(prog) == expected_ops + + # Run affine quanitzation for conv1 / linear1. 
Note that since conv1 is already compressed + # the quantization makes no affect on it + op_name_config = cto.coreml.OpLinearQuantizerConfig( + mode="LINEAR_SYMMETRIC", + dtype=np.int8, + weight_threshold=100, + ) + config = cto.coreml.OptimizationConfig() + config.set_op_name("conv1", op_name_config) + config.set_op_name("linear1", op_name_config) + + compressor = quantization.linear_quantize_weights(config=config) + compressor.apply(prog) + + expected_ops = [ + "constexpr_lut_to_dense", + "conv", + "constexpr_lut_to_dense", + "conv", + "constexpr_lut_to_dense", + "conv", + "reshape", + "constexpr_affine_dequantize", + "linear", + "linear", + "reshape", + "conv_transpose", + ] + assert get_op_types_in_program(prog) == expected_ops + + # Run sparsification for the whoel program + global_config = cto.coreml.OpMagnitudePrunerConfig( + target_sparsity=0.85, + weight_threshold=100, + ) + config = cto.coreml.OptimizationConfig(global_config=global_config) + + compressor = quantization.prune_weights(config=config) + compressor.apply(prog) + + expected_ops = [ + "constexpr_lut_to_dense", + "conv", + "constexpr_lut_to_dense", + "conv", + "constexpr_lut_to_dense", + "conv", + "reshape", + "constexpr_affine_dequantize", + "linear", + "constexpr_sparse_to_dense", + "linear", + "reshape", + "constexpr_sparse_to_dense", + "conv_transpose", + ] + assert get_op_types_in_program(prog) == expected_ops + + @staticmethod + def test_const_only_used_as_output_skip_compress(): + """ + If the const is only fed to the block output, we skip the compression, + due to the bug rdar://108274019 ([Bug] constexpr ops cannot be directly fed to block output) + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20, 30))], opset_version=ct.target.iOS16) + def prog(x): + val = np.random.rand(10, 20, 30).astype(np.float32) + const = mb.const(val=val) + output = mb.add(x=x, y=1.0) + return output, const + + op_config = cto.coreml.OpPalettizerConfig( + nbits=2, + mode="kmeans", + weight_threshold=0, + ) + config = cto.coreml.OptimizationConfig(global_config=op_config) + compressor = quantization.palettize_weights(config=config) + compressor.apply(prog) + assert get_op_types_in_program(prog) == ["add"] + + @staticmethod + def test_const_as_output(): + """ + If the const is fed to the block output and at least one another op, it can still be compressed + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20, 30))], opset_version=ct.target.iOS16) + def prog(x): + val = np.random.rand(10, 20, 30).astype(np.float32) + const = mb.const(val=val) + output = mb.add(x=x, y=const) + return output, const + + op_config = cto.coreml.OpPalettizerConfig( + nbits=2, + mode="kmeans", + weight_threshold=0, + ) + config = cto.coreml.OptimizationConfig(global_config=op_config) + compressor = quantization.palettize_weights(config=config) + compressor.apply(prog) + assert get_op_types_in_program(prog) == ["constexpr_lut_to_dense", "add"] + + @staticmethod + def test_set_op_name_for_const(): + """ + We can set_op_name for const ops + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(1, 10, 30))], opset_version=ct.target.iOS16) + def prog(x): + add_const_1 = np.random.rand(10, 30).astype(np.float32) + add_const_2 = np.random.rand(10, 30).astype(np.float32) + const_1 = mb.const(val=add_const_1, name="const_1") + const_2 = mb.const(val=add_const_2, name="const_2") + x = mb.add(x=x, y=const_1) + return mb.add(x=x, y=const_2) + + compressor = quantization.palettize_weights( + config=cto.coreml.OptimizationConfig( + 
global_config=cto.coreml.OpPalettizerConfig(nbits=2, mode="KMEANS", weight_threshold=50), + op_name_configs={"const_2": cto.coreml.OpPalettizerConfig(nbits=4, mode="KMEANS", weight_threshold=50)} + ) + ) + + compressor.apply(prog) + + expected_ops = [ + "constexpr_lut_to_dense", + "constexpr_lut_to_dense", + "add", + "add", + ] + assert get_op_types_in_program(prog) == expected_ops + + expected_nbits = [2, 4] + lut_ops = prog.find_ops(op_type="constexpr_lut_to_dense") + + for nbits, op in zip(expected_nbits, lut_ops): + assert op.lut.val.shape == (2**nbits,) + + @staticmethod + @pytest.mark.parametrize( + "constexpr_op", + TestSkipConstexprOps.CONSTEXPR_OPS, + ) + def test_constexpr_const_not_compressed(constexpr_op): + """ + The const op which is fed into constexpr ops cannot be compressed. + """ + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3, 4, 5))]) + def prog(x): + constexpr = TestSkipConstexprOps.CONSTEXPR_FUNCS[constexpr_op]((2, 3, 4, 5)) + return mb.add(x=x, y=constexpr) + + compressor = quantization.palettize_weights( + config=cto.coreml.OptimizationConfig( + global_config=cto.coreml.OpPalettizerConfig(nbits=2, mode="KMEANS", weight_threshold=0), + ) + ) + compressor.apply(prog) + expected_ops = [constexpr_op, "add"] + assert get_op_types_in_program(prog) == expected_ops + + @staticmethod + def test_shared_weights(): + """ + If a const is shared with different downstream ops, we do a further conflict detection. + """ + + def _get_program(): + @mb.program( + input_specs=[mb.TensorSpec(shape=(1, 10, 30))], opset_version=ct.target.iOS16 + ) + def prog(x): + add_const = np.random.rand(10, 30).astype(np.float32) + add_const = mb.const(val=add_const, name="add_const") + x = mb.add(x=x, y=add_const, name="add_1") + return mb.add(x=x, y=add_const, name="add_2") + return prog + + # [Case 1] No conflict. Global and op_name level config are the same + prog = _get_program() + + compressor = quantization.palettize_weights( + config=cto.coreml.OptimizationConfig( + global_config=cto.coreml.OpPalettizerConfig(nbits=2, mode="KMEANS", weight_threshold=50), + op_name_configs={"add_2": cto.coreml.OpPalettizerConfig(nbits=2, mode="KMEANS", weight_threshold=50)} + ) + ) + + compressor.apply(prog) + + expected_ops = [ + "constexpr_lut_to_dense", + "add", + "add", + ] + assert get_op_types_in_program(prog) == expected_ops + + # [Case 2] No conflict. op_name level configs are the same + prog = _get_program() + + compressor = quantization.palettize_weights( + config=cto.coreml.OptimizationConfig( + global_config=cto.coreml.OpPalettizerConfig(nbits=4, mode="KMEANS", weight_threshold=50), + op_name_configs={ + "add_1": cto.coreml.OpPalettizerConfig(nbits=2, mode="KMEANS", weight_threshold=50), + "add_2": cto.coreml.OpPalettizerConfig(nbits=2, mode="KMEANS", weight_threshold=50), + } + ) + ) + + compressor.apply(prog) + + expected_ops = [ + "constexpr_lut_to_dense", + "add", + "add", + ] + assert get_op_types_in_program(prog) == expected_ops + + # [Case 3] Conflict. Global and op_name level config are different + prog = _get_program() + + compressor = quantization.palettize_weights( + config=cto.coreml.OptimizationConfig( + global_config=cto.coreml.OpPalettizerConfig(nbits=2, mode="KMEANS", weight_threshold=50), + op_name_configs={"add_2": cto.coreml.OpPalettizerConfig(nbits=4, mode="KMEANS", weight_threshold=50)} + ) + ) + + with pytest.raises(ValueError, match="compression config conflict detected between ops"): + compressor.apply(prog) + + # [Case 4] Conflict. 
op_name level configs are different + prog = _get_program() + + compressor = quantization.palettize_weights( + config=cto.coreml.OptimizationConfig( + global_config=cto.coreml.OpPalettizerConfig(nbits=2, mode="KMEANS", weight_threshold=50), + op_name_configs={ + "add_1": cto.coreml.OpPalettizerConfig(nbits=4, mode="KMEANS", weight_threshold=50), + "add_2": cto.coreml.OpPalettizerConfig(nbits=4, mode="KMEANS", weight_threshold=30), + }, + ) + ) + + with pytest.raises(ValueError, match="compression config conflict detected between ops"): + compressor.apply(prog) + + +class TestLinearQuantizer(TestCompressionPasses): + @pytest.mark.parametrize( + "mode, dtype, weight_threshold, fake_compression", + itertools.product( + ["LINEAR", "LINEAR_SYMMETRIC"], + [np.int8, np.uint8, types.int8, types.uint8], + [1000, 7000], + [True, False], + ), + ) + def test_global_config_affine_quantizer(self, mode, dtype, weight_threshold, fake_compression): + """ + Global config would compress all operations with the same config + """ + op_config = cto.coreml.OpLinearQuantizerConfig( + mode=mode, dtype=dtype, weight_threshold=weight_threshold + ) + config = cto.coreml.OptimizationConfig(global_config=op_config) + compressor = quantization.linear_quantize_weights( + config=config, fake_compression=fake_compression + ) + prog = self._get_test_program() + compressor.apply(prog) + + if fake_compression: + expected_ops = ["conv", "reshape", "linear", "reshape", "conv_transpose"] + elif weight_threshold == 1000: + expected_ops = [ + "constexpr_affine_dequantize", + "conv", + "reshape", + "constexpr_affine_dequantize", + "linear", + "reshape", + "constexpr_affine_dequantize", + "conv_transpose", + ] + else: + assert weight_threshold == 7000 + # linear weight size < 7000 + expected_ops = [ + "constexpr_affine_dequantize", + "conv", + "reshape", + "linear", + "reshape", + "constexpr_affine_dequantize", + "conv_transpose", + ] + assert get_op_types_in_program(prog) == expected_ops + + def test_op_type_config_linear_quantizer(self): + """ + set_op_type allow the user to set different config for each op type. 
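+        Here conv ends up quantized to int8 and conv_transpose to uint8, while linear is left
+        uncompressed because its weight_threshold is far larger than the linear weight itself.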
+ Also checking that the config can be overwritten + """ + conv_config_1 = cto.coreml.OpLinearQuantizerConfig( + mode="LINEAR_SYMMETRIC", + dtype=np.uint8, + weight_threshold=2000, + ) + # conv_config_2 overwrite conv_config_1 + conv_config_2 = cto.coreml.OpLinearQuantizerConfig( + mode="LINEAR_SYMMETRIC", + dtype=np.int8, + weight_threshold=2000, + ) + # The weight_threshold is super large so linear is not going to be compressed + linear_config = cto.coreml.OpLinearQuantizerConfig( + mode="LINEAR_SYMMETRIC", + dtype=np.int8, + weight_threshold=1000000, + ) + conv_transpose_config = cto.coreml.OpLinearQuantizerConfig( + mode="LINEAR", + dtype=np.uint8, + weight_threshold=2000, + ) + + config = cto.coreml.OptimizationConfig() + config.set_op_type("conv", conv_config_1) + config.set_op_type("conv", conv_config_2) + config.set_op_type("linear", linear_config) + config.set_op_type("conv_transpose", conv_transpose_config) + + compressor = quantization.linear_quantize_weights(config=config) + + prog = self._get_test_program() + compressor.apply(prog) + + expected_ops = [ + "constexpr_affine_dequantize", + "conv", + "reshape", + "linear", + "reshape", + "constexpr_affine_dequantize", + "conv_transpose", + ] + assert get_op_types_in_program(prog) == expected_ops + + # Test different dtype are applied + assert ( + prog.find_ops(op_type="constexpr_affine_dequantize")[0].quantized_data.val.dtype + == np.int8 + ) + assert ( + prog.find_ops(op_type="constexpr_affine_dequantize")[1].quantized_data.val.dtype + == np.uint8 + ) + + def test_op_name_config_linear_quantizer(self): + """ + set_op_name allow the user to set different config for each op specified by name. + Also checking that the config can be overwritten + """ + conv_config_1 = cto.coreml.OpLinearQuantizerConfig( + mode="LINEAR_SYMMETRIC", + dtype=np.uint8, + weight_threshold=2000, + ) + # conv_config_2 overwrite conv_config_1 + conv_config_2 = cto.coreml.OpLinearQuantizerConfig( + mode="LINEAR_SYMMETRIC", + dtype=np.int8, + weight_threshold=2000, + ) + # The weight_threshold is super large so linear is not going to be compressed + linear_config = cto.coreml.OpLinearQuantizerConfig( + mode="LINEAR_SYMMETRIC", + dtype=np.int8, + weight_threshold=1000000, + ) + conv_transpose_config = cto.coreml.OpLinearQuantizerConfig( + mode="LINEAR", + dtype=np.uint8, + weight_threshold=2000, + ) + + config = cto.coreml.OptimizationConfig() + config.set_op_name("conv", conv_config_1) + config.set_op_name("conv", conv_config_2) + config.set_op_name("linear", linear_config) + config.set_op_name("conv_transpose", conv_transpose_config) + + compressor = quantization.linear_quantize_weights(config=config) + + prog = self._get_test_program() + compressor.apply(prog) + + expected_ops = [ + "constexpr_affine_dequantize", + "conv", + "reshape", + "linear", + "reshape", + "constexpr_affine_dequantize", + "conv_transpose", + ] + assert get_op_types_in_program(prog) == expected_ops + + # Test different dtype are applied + assert ( + prog.find_ops(op_type="constexpr_affine_dequantize")[0].quantized_data.val.dtype + == np.int8 + ) + assert ( + prog.find_ops(op_type="constexpr_affine_dequantize")[1].quantized_data.val.dtype + == np.uint8 + ) + + +class TestPruner(TestCompressionPasses): + @pytest.mark.parametrize( + "mode, threshold, target_sparsity, weight_threshold, fake_compression", + itertools.product( + ["THRESHOLD_BASED", "PERCENTILE_BASED"], + [1e-3, 1.0], + [0.2, 0.98], + [1000, 7000], + [True, False], + ), + ) + def test_global_config_pruner( + self, mode, 
threshold, target_sparsity, weight_threshold, fake_compression + ): + """ + Global config would compress all operations with the same config + """ + if mode == "THRESHOLD_BASED": + op_config = cto.coreml.OpThresholdPrunerConfig( + threshold=threshold, + weight_threshold=weight_threshold, + minimum_sparsity_percentile=0.0, + ) + else: + assert mode == "PERCENTILE_BASED" + op_config = cto.coreml.OpMagnitudePrunerConfig( + target_sparsity=target_sparsity, + weight_threshold=weight_threshold, + ) + + config = cto.coreml.OptimizationConfig(global_config=op_config) + compressor = quantization.prune_weights(config=config, fake_compression=fake_compression) + prog = self._get_test_program() + compressor.apply(prog) + + if fake_compression: + expected_ops = ["conv", "reshape", "linear", "reshape", "conv_transpose"] + elif weight_threshold == 1000: + expected_ops = [ + "constexpr_sparse_to_dense", + "conv", + "reshape", + "constexpr_sparse_to_dense", + "linear", + "reshape", + "constexpr_sparse_to_dense", + "conv_transpose", + ] + else: + assert weight_threshold == 7000 + # linear weight size < 7000 + expected_ops = [ + "constexpr_sparse_to_dense", + "conv", + "reshape", + "linear", + "reshape", + "constexpr_sparse_to_dense", + "conv_transpose", + ] + assert get_op_types_in_program(prog) == expected_ops + + def test_op_type_config_pruner(self): + """ + set_op_type allow the user to set different config for each op type. + Also checking that the config can be overwritten + """ + conv_config_1 = cto.coreml.OpMagnitudePrunerConfig( + target_sparsity=0.5, + weight_threshold=2000, + ) + # conv_config_2 overwrite conv_config_1 + conv_config_2 = cto.coreml.OpMagnitudePrunerConfig( + target_sparsity=0.9, + weight_threshold=2000, + ) + linear_config = cto.coreml.OpMagnitudePrunerConfig( + target_sparsity=0.2, + weight_threshold=2000, + ) + # The weight_threshold is super large so conv_transpose is not going to be compressed + conv_transpose_config = cto.coreml.OpThresholdPrunerConfig( + threshold=1.0, + weight_threshold=1000000, + ) + + config = cto.coreml.OptimizationConfig() + config.set_op_type("conv", conv_config_1) + config.set_op_type("conv", conv_config_2) + config.set_op_type("linear", linear_config) + config.set_op_type("conv_transpose", conv_transpose_config) + + compressor = quantization.prune_weights(config=config) + + prog = self._get_test_program() + compressor.apply(prog) + + expected_ops = [ + "constexpr_sparse_to_dense", + "conv", + "reshape", + "constexpr_sparse_to_dense", + "linear", + "reshape", + "conv_transpose", + ] + assert get_op_types_in_program(prog) == expected_ops + + # Test different sparcsity percentile are applied + assert ( + prog.find_ops(op_type="constexpr_sparse_to_dense")[0].nonzero_data.val.size == 1080 + ) # 1080 * 0.1 + assert ( + prog.find_ops(op_type="constexpr_sparse_to_dense")[1].nonzero_data.val.size == 4536 + ) # 5670 * 0.8 + + def test_op_name_config_pruner(self): + """ + set_op_name allow the user to set different config for each op specified by name. 
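+        The ops in _get_test_program are literally named "conv", "linear" and "conv_transpose",
+        so the name-based configs used here end up targeting the same weights as the op-type
+        variant of this test above.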
+ Also checking that the config can be overwritten + """ + conv_config_1 = cto.coreml.OpMagnitudePrunerConfig( + target_sparsity=0.5, + weight_threshold=2000, + ) + # conv_config_2 overwrite conv_config_1 + conv_config_2 = cto.coreml.OpMagnitudePrunerConfig( + target_sparsity=0.9, + weight_threshold=2000, + ) + linear_config = cto.coreml.OpMagnitudePrunerConfig( + target_sparsity=0.2, + weight_threshold=2000, + ) + # The weight_threshold is super large so conv_transpose is not going to be compressed + conv_transpose_config = cto.coreml.OpThresholdPrunerConfig( + threshold=1.0, + weight_threshold=1000000, + ) + + config = cto.coreml.OptimizationConfig() + config.set_op_name("conv", conv_config_1) + config.set_op_name("conv", conv_config_2) + config.set_op_name("linear", linear_config) + config.set_op_name("conv_transpose", conv_transpose_config) + + compressor = quantization.prune_weights(config=config) + + prog = self._get_test_program() + compressor.apply(prog) + + expected_ops = [ + "constexpr_sparse_to_dense", + "conv", + "reshape", + "constexpr_sparse_to_dense", + "linear", + "reshape", + "conv_transpose", + ] + assert get_op_types_in_program(prog) == expected_ops + + # Test different sparcsity percentile are applied + assert ( + prog.find_ops(op_type="constexpr_sparse_to_dense")[0].nonzero_data.val.size == 1080 + ) # 1080 * 0.1 + assert ( + prog.find_ops(op_type="constexpr_sparse_to_dense")[1].nonzero_data.val.size == 4536 + ) # 5670 * 0.8 + + @pytest.mark.parametrize( + "target_sparsity, minimum_sparsity_percentile", + itertools.product( + [0.1, 0.5, 0.9], + [0.0, 0.3, 0.7], + ), + ) + def test_pruner_minimum_sparsity_percentile(self, target_sparsity, minimum_sparsity_percentile): + def _get_sparse_weight(shape, target_sparsity): + size = np.prod(shape) + weight = 3 * np.ones(size) + num_of_zeros = int(size * target_sparsity) + weight[:num_of_zeros] = 0 + return np.reshape(weight, shape).astype(np.float32) + + def _get_simple_program(): + @mb.program( + input_specs=[mb.TensorSpec(shape=(1, 30, 10, 10))], opset_version=ct.target.iOS16 + ) + def prog(x): + conv_weight = _get_sparse_weight((90, 30, 3, 3), target_sparsity) + x = mb.conv(x=x, weight=conv_weight, name="conv1") + return x + + return prog + + op_config = cto.coreml.OpThresholdPrunerConfig( + threshold=1e-3, + minimum_sparsity_percentile=minimum_sparsity_percentile, + weight_threshold=200, + ) + config = cto.coreml.OptimizationConfig(global_config=op_config) + compressor = quantization.prune_weights(config=config) + prog = _get_simple_program() + compressor.apply(prog) + + if minimum_sparsity_percentile < target_sparsity: + expected_ops = ["constexpr_sparse_to_dense", "conv"] + else: + expected_ops = ["conv"] + assert get_op_types_in_program(prog) == expected_ops + + def test_structural_pruning(self): + def _get_test_prog(): + @mb.program( + input_specs=[mb.TensorSpec(shape=(1, 30, 10, 10))], opset_version=ct.target.iOS16 + ) + def prog(x): + conv_weight_1 = mb.const( + val=np.random.rand(90, 30, 2, 2).astype(np.float32), name="w_1" + ) + conv_bias_1 = mb.const( + val=np.random.rand( + 90, + ).astype(np.float32), + name="b_1", + ) + conv_weight_2 = mb.const( + val=np.random.rand(10, 90, 2, 2).astype(np.float32), name="w_2" + ) + linear_weight = mb.const(val=np.random.rand(128, 64).astype(np.float32), name="l_w") + linear_bias = mb.const( + val=np.random.rand( + 128, + ).astype(np.float32), + name="l_b", + ) + add_const = mb.const( + val=np.random.rand(10, 128).astype(np.float32), name="add_const" + ) + + x = mb.conv(x=x, 
weight=conv_weight_1, bias=conv_bias_1, name="conv_1") + x = mb.conv(x=x, weight=conv_weight_2, name="conv_2") + x = mb.reshape(x=x, shape=(10, 64)) + x = mb.linear(x=x, weight=linear_weight, bias=linear_bias, name="linear_1") + x = mb.add(x=x, y=add_const, name="add_1") + return x + + return prog + + # (1) Global structural pruning config will only applied to conv / linear weight + prog = _get_test_prog() + config = cto.coreml.OptimizationConfig( + global_config=cto.coreml.OpMagnitudePrunerConfig( + n_m_ratio=(2, 3), + weight_threshold=0, + ) + ) + compressor = quantization.prune_weights(config=config) + compressor.apply(prog) + expected_ops = [ + "constexpr_sparse_to_dense", + "constexpr_sparse_to_dense", + "constexpr_sparse_to_dense", + "conv", + "conv", + "reshape", + "linear", + "add", + ] + assert get_op_types_in_program(prog) == expected_ops + conv_ops = prog.find_ops(op_type="conv") + assert conv_ops[0].weight.op.op_type == "constexpr_sparse_to_dense" + assert conv_ops[1].weight.op.op_type == "constexpr_sparse_to_dense" + assert prog.find_ops(op_type="linear")[0].weight.op.op_type == "constexpr_sparse_to_dense" + + # (2) Even by setting the ops with structural pruning, make sure only weight is sparsified, not bias + prog = _get_test_prog() + config = cto.coreml.OptimizationConfig( + op_type_configs={ + "conv": cto.coreml.OpMagnitudePrunerConfig( + n_m_ratio=(2, 3), + weight_threshold=0, + ) + }, + op_name_configs={ + "linear_1": cto.coreml.OpMagnitudePrunerConfig( + n_m_ratio=(1, 4), + weight_threshold=0, + ) + }, + ) + compressor = quantization.prune_weights(config=config) + compressor.apply(prog) + expected_ops = [ + "constexpr_sparse_to_dense", + "constexpr_sparse_to_dense", + "constexpr_sparse_to_dense", + "conv", + "conv", + "reshape", + "linear", + "add", + ] + assert get_op_types_in_program(prog) == expected_ops + conv_ops = prog.find_ops(op_type="conv") + assert conv_ops[0].weight.op.op_type == "constexpr_sparse_to_dense" + assert conv_ops[1].weight.op.op_type == "constexpr_sparse_to_dense" + assert prog.find_ops(op_type="linear")[0].weight.op.op_type == "constexpr_sparse_to_dense" + + # (3) Early error out when setting a non applicable op to structural pruning with set_op_type + with pytest.raises( + ValueError, match="block sparsity or n:m pruning does not support op type add" + ): + config = cto.coreml.OptimizationConfig( + op_type_configs={ + "add": cto.coreml.OpMagnitudePrunerConfig( + n_m_ratio=(2, 3), + weight_threshold=0, + ) + }, + ) + + with pytest.raises( + ValueError, match="block sparsity or n:m pruning does not support op type add" + ): + config = cto.coreml.OptimizationConfig() + config.set_op_type( + "add", + cto.coreml.OpMagnitudePrunerConfig( + n_m_ratio=(2, 3), + weight_threshold=0, + ), + ) + + # (4) By using set_op_name, we can still force a const op to use structural pruning + prog = _get_test_prog() + config = cto.coreml.OptimizationConfig( + op_name_configs={ + "add_const": cto.coreml.OpMagnitudePrunerConfig( + n_m_ratio=(1, 4), + weight_threshold=0, + ) + } + ) + compressor = quantization.prune_weights(config=config) + compressor.apply(prog) + expected_ops = [ + "constexpr_sparse_to_dense", + "conv", + "conv", + "reshape", + "linear", + "add", + ] + assert get_op_types_in_program(prog) == expected_ops + assert prog.find_ops(op_type="add")[0].y.op.op_type == "constexpr_sparse_to_dense" + + +class TestPalettizer(TestCompressionPasses): + @pytest.mark.parametrize( + "nbits, mode, weight_threshold, fake_compression", + itertools.product( + [2, 6], + 
["KMEANS", "UNIFORM"], + [1000, 7000], + [True, False], + ), + ) + def test_global_config_palettizer(self, nbits, mode, weight_threshold, fake_compression): + """ + Global config would compress all operations with the same config + """ + op_config = cto.coreml.OpPalettizerConfig( + nbits=nbits, mode=mode, weight_threshold=weight_threshold + ) + config = cto.coreml.OptimizationConfig(global_config=op_config) + compressor = quantization.palettize_weights( + config=config, fake_compression=fake_compression + ) + prog = self._get_test_program() + compressor.apply(prog) + + if fake_compression: + expected_ops = ["conv", "reshape", "linear", "reshape", "conv_transpose"] + elif weight_threshold == 1000: + expected_ops = [ + "constexpr_lut_to_dense", + "conv", + "reshape", + "constexpr_lut_to_dense", + "linear", + "reshape", + "constexpr_lut_to_dense", + "conv_transpose", + ] + else: + assert weight_threshold == 7000 + # linear weight size < 7000 + expected_ops = [ + "constexpr_lut_to_dense", + "conv", + "reshape", + "linear", + "reshape", + "constexpr_lut_to_dense", + "conv_transpose", + ] + assert get_op_types_in_program(prog) == expected_ops + + def test_op_type_config_palettizer(self): + """ + set_op_type allow the user to set different config for each op type. + Also checking that the config can be overwritten + """ + conv_config_1 = cto.coreml.OpPalettizerConfig( + nbits=8, + mode="KMEANS", + weight_threshold=2000, + ) + # conv_config_2 overwrite conv_config_1 + conv_config_2 = cto.coreml.OpPalettizerConfig( + nbits=2, + mode="KMEANS", + weight_threshold=2000, + ) + linear_config = cto.coreml.OpPalettizerConfig( + nbits=4, + mode="UNIFORM", + weight_threshold=2000, + ) + # The weight_threshold is super large so conv_transpose is not going to be compressed + conv_transpose_config = cto.coreml.OpPalettizerConfig( + nbits=4, + mode="UNIFORM", + weight_threshold=1000000, + ) + + config = cto.coreml.OptimizationConfig() + config.set_op_type("conv", conv_config_1) + config.set_op_type("conv", conv_config_2) + config.set_op_type("linear", linear_config) + config.set_op_type("conv_transpose", conv_transpose_config) + + compressor = quantization.palettize_weights(config=config) + + prog = self._get_test_program() + compressor.apply(prog) + + expected_ops = [ + "constexpr_lut_to_dense", + "conv", + "reshape", + "constexpr_lut_to_dense", + "linear", + "reshape", + "conv_transpose", + ] + assert get_op_types_in_program(prog) == expected_ops + + # Test different nbits are applied + assert prog.find_ops(op_type="constexpr_lut_to_dense")[0].lut.val.shape == (4,) + assert prog.find_ops(op_type="constexpr_lut_to_dense")[1].lut.val.shape == (16,) + + def test_op_name_config_palettizer(self): + """ + set_op_name allow the user to set different config for each op specified by name. 
+ Also checking that the config can be overwritten + """ + conv_config_1 = cto.coreml.OpPalettizerConfig( + nbits=8, + mode="KMEANS", + weight_threshold=2000, + ) + # conv_config_2 overwrite conv_config_1 + conv_config_2 = cto.coreml.OpPalettizerConfig( + nbits=2, + mode="KMEANS", + weight_threshold=2000, + ) + linear_config = cto.coreml.OpPalettizerConfig( + nbits=4, + mode="UNIFORM", + weight_threshold=2000, + ) + # The weight_threshold is super large so conv_transpose is not going to be compressed + conv_transpose_config = cto.coreml.OpPalettizerConfig( + nbits=4, + mode="UNIFORM", + weight_threshold=1000000, + ) + + config = cto.coreml.OptimizationConfig() + config.set_op_name("conv", conv_config_1) + config.set_op_name("conv", conv_config_2) + config.set_op_name("linear", linear_config) + config.set_op_name("conv_transpose", conv_transpose_config) + + compressor = quantization.palettize_weights(config=config) + + prog = self._get_test_program() + compressor.apply(prog) + + expected_ops = [ + "constexpr_lut_to_dense", + "conv", + "reshape", + "constexpr_lut_to_dense", + "linear", + "reshape", + "conv_transpose", + ] + assert get_op_types_in_program(prog) == expected_ops + + # Test different nbits are applied + assert prog.find_ops(op_type="constexpr_lut_to_dense")[0].lut.val.shape == (4,) + assert prog.find_ops(op_type="constexpr_lut_to_dense")[1].lut.val.shape == (16,) + + +class TestCompressionOperations(TestCompressionPasses): + """ + This test is checking compression for some common operations. + """ + + COMPRESSORS = [ + quantization.palettize_weights( + config=cto.coreml.OptimizationConfig( + global_config=cto.coreml.OpPalettizerConfig( + nbits=2, mode="KMEANS", weight_threshold=50 + ) + ) + ), + quantization.linear_quantize_weights( + config=cto.coreml.OptimizationConfig( + global_config=cto.coreml.OpLinearQuantizerConfig( + mode="LINEAR_SYMMETRIC", dtype=np.int8, weight_threshold=50 + ) + ) + ), + quantization.prune_weights( + config=cto.coreml.OptimizationConfig( + global_config=cto.coreml.OpMagnitudePrunerConfig( + target_sparsity=0.9, weight_threshold=50 + ) + ) + ), + ] + + COMPRESSOR_TO_OP_TYPE = { + "palettize_weights": "constexpr_lut_to_dense", + "linear_quantize_weights": "constexpr_affine_dequantize", + "prune_weights": "constexpr_sparse_to_dense", + } + + @staticmethod + @pytest.mark.parametrize( + "compressor", + COMPRESSORS, + ) + def test_conv_compress(compressor): + @mb.program( + input_specs=[mb.TensorSpec(shape=(1, 30, 10, 10))], opset_version=ct.target.iOS16 + ) + def prog(x): + conv_weight = np.random.rand(90, 30, 2, 2).astype(np.float32) + return mb.conv(x=x, weight=conv_weight) + + compressor.apply(prog) + op_type = TestCompressionOperations.COMPRESSOR_TO_OP_TYPE[compressor.__class__.__name__] + assert get_op_types_in_program(prog) == [op_type, "conv"] + + @staticmethod + @pytest.mark.parametrize( + "compressor", + COMPRESSORS, + ) + def test_conv_transpose_compress(compressor): + @mb.program( + input_specs=[mb.TensorSpec(shape=(1, 30, 10, 10))], opset_version=ct.target.iOS16 + ) + def prog(x): + conv_weight = np.random.rand(90, 30, 2, 2).astype(np.float32) + return mb.conv_transpose(x=x, weight=conv_weight) + + compressor.apply(prog) + op_type = TestCompressionOperations.COMPRESSOR_TO_OP_TYPE[compressor.__class__.__name__] + assert get_op_types_in_program(prog) == [op_type, "conv_transpose"] + + @staticmethod + @pytest.mark.parametrize( + "compressor", + COMPRESSORS, + ) + def test_liear_compress(compressor): + 
@mb.program(input_specs=[mb.TensorSpec(shape=(1, 30, 10))], opset_version=ct.target.iOS16) + def prog(x): + linear_weight = np.random.rand(40, 10).astype(np.float32) + return mb.linear(x=x, weight=linear_weight) + + compressor.apply(prog) + op_type = TestCompressionOperations.COMPRESSOR_TO_OP_TYPE[compressor.__class__.__name__] + assert get_op_types_in_program(prog) == [op_type, "linear"] + + @staticmethod + @pytest.mark.parametrize( + "compressor", + COMPRESSORS, + ) + def test_matmul_compress(compressor): + @mb.program(input_specs=[mb.TensorSpec(shape=(1, 30, 10))], opset_version=ct.target.iOS16) + def prog(x): + weight1 = np.random.rand(10, 40).astype(np.float32) + weight2 = np.random.rand(20, 30).astype(np.float32) + + x = mb.matmul(x=x, y=weight1) + return mb.matmul(x=weight2, y=x) + + compressor.apply(prog) + op_type = TestCompressionOperations.COMPRESSOR_TO_OP_TYPE[compressor.__class__.__name__] + assert get_op_types_in_program(prog) == [op_type, "matmul", op_type, "matmul"] + + @staticmethod + @pytest.mark.parametrize( + "compressor", + COMPRESSORS, + ) + def test_gru_compress(compressor): + @mb.program( + input_specs=[mb.TensorSpec(shape=(1, 10, 30)), mb.TensorSpec(shape=(10, 40))], + opset_version=ct.target.iOS16, + ) + def prog(x, initial_h): + weight_ih = np.random.rand(120, 30).astype(np.float32) + weight_hh = np.random.rand(120, 40).astype(np.float32) + return mb.gru(x=x, initial_h=initial_h, weight_ih=weight_ih, weight_hh=weight_hh) + + compressor.apply(prog) + op_type = TestCompressionOperations.COMPRESSOR_TO_OP_TYPE[compressor.__class__.__name__] + assert get_op_types_in_program(prog) == [op_type, op_type, "gru"] + + @staticmethod + @pytest.mark.parametrize( + "compressor", + COMPRESSORS, + ) + def test_lstm_compress(compressor): + @mb.program( + input_specs=[ + mb.TensorSpec(shape=(1, 10, 30)), + mb.TensorSpec(shape=(10, 40)), + mb.TensorSpec(shape=(10, 40)), + ], + opset_version=ct.target.iOS16, + ) + def prog(x, initial_h, initial_c): + weight_ih = np.random.rand(160, 30).astype(np.float32) + weight_hh = np.random.rand(160, 40).astype(np.float32) + return mb.lstm( + x=x, + initial_h=initial_h, + initial_c=initial_c, + weight_ih=weight_ih, + weight_hh=weight_hh, + ) + + compressor.apply(prog) + op_type = TestCompressionOperations.COMPRESSOR_TO_OP_TYPE[compressor.__class__.__name__] + assert get_op_types_in_program(prog) == [op_type, op_type, "lstm"] + + @staticmethod + @pytest.mark.parametrize( + "compressor", + COMPRESSORS, + ) + def test_rnn_compress(compressor): + @mb.program( + input_specs=[ + mb.TensorSpec(shape=(1, 10, 30)), + mb.TensorSpec(shape=(10, 40)), + ], + opset_version=ct.target.iOS16, + ) + def prog(x, initial_h): + weight_ih = np.random.rand(40, 30).astype(np.float32) + weight_hh = np.random.rand(40, 40).astype(np.float32) + return mb.rnn(x=x, initial_h=initial_h, weight_ih=weight_ih, weight_hh=weight_hh) + + compressor.apply(prog) + op_type = TestCompressionOperations.COMPRESSOR_TO_OP_TYPE[compressor.__class__.__name__] + assert get_op_types_in_program(prog) == [op_type, op_type, "rnn"] + + @staticmethod + @pytest.mark.parametrize( + "compressor", + COMPRESSORS, + ) + def test_add_compress(compressor): + @mb.program(input_specs=[mb.TensorSpec(shape=(1, 10, 30))], opset_version=ct.target.iOS16) + def prog(x): + add_const = np.random.rand(10, 30).astype(np.float32) + return mb.add(x=x, y=add_const) + + compressor.apply(prog) + op_type = TestCompressionOperations.COMPRESSOR_TO_OP_TYPE[compressor.__class__.__name__] + assert get_op_types_in_program(prog) == 
[op_type, "add"] + + @staticmethod + def test_add_compress_set_op_type(): + @mb.program(input_specs=[mb.TensorSpec(shape=(1, 10, 30))], opset_version=ct.target.iOS16) + def prog(x): + add_const = np.random.rand(10, 30).astype(np.float32) + return mb.add(x=x, y=add_const) + + compressor = quantization.palettize_weights( + config=cto.coreml.OptimizationConfig( + global_config=cto.coreml.OpPalettizerConfig( + nbits=2, mode="KMEANS", weight_threshold=50 + ), + op_type_configs={ + "add": cto.coreml.OpPalettizerConfig( + nbits=4, mode="KMEANS", weight_threshold=50 + ) + }, + ) + ) + compressor.apply(prog) + assert get_op_types_in_program(prog) == ["constexpr_lut_to_dense", "add"] + # also check the compression config comes from set_op_type + assert prog.find_ops(op_type="constexpr_lut_to_dense")[0].lut.val.shape == (16,) + + +class TestInvalidConfig: + """ + This test is checking error handling for invalid configuraion. + """ + + @staticmethod + def test_invalid_config_type(): + err_msg = "config must be of type OptimizationConfig" + with pytest.raises(ValueError, match=err_msg): + compressor = quantization.palettize_weights( + config=1, + ) + + with pytest.raises(ValueError, match=err_msg): + compressor = quantization.linear_quantize_weights( + config="12", + ) + + with pytest.raises(ValueError, match=err_msg): + compressor = quantization.prune_weights( + config=[12, 3], + ) + + msg = "palettize_weights only accept OpPalettizerConfig type config" + with pytest.raises(ValueError, match=msg): + compressor = quantization.palettize_weights( + config=cto.coreml.OptimizationConfig( + global_config=cto.coreml.OpLinearQuantizerConfig(), + ) + ) + + with pytest.raises(ValueError, match=msg): + compressor = quantization.palettize_weights( + config=cto.coreml.OptimizationConfig( + op_type_configs={"op": cto.coreml.OpLinearQuantizerConfig()}, + ) + ) + + with pytest.raises(ValueError, match=msg): + compressor = quantization.palettize_weights( + config=cto.coreml.OptimizationConfig( + op_name_configs={"name": cto.coreml.OpLinearQuantizerConfig()}, + ) + ) + + msg = "linear_quantize_weights only accept OpLinearQuantizerConfig type config" + with pytest.raises(ValueError, match=msg): + compressor = quantization.linear_quantize_weights( + config=cto.coreml.OptimizationConfig( + global_config=cto.coreml.OpPalettizerConfig(nbits=2), + ) + ) + + with pytest.raises(ValueError, match=msg): + compressor = quantization.linear_quantize_weights( + config=cto.coreml.OptimizationConfig( + op_type_configs={"op": cto.coreml.OpPalettizerConfig(nbits=2)}, + ) + ) + + with pytest.raises(ValueError, match=msg): + compressor = quantization.linear_quantize_weights( + config=cto.coreml.OptimizationConfig( + op_name_configs={"op": cto.coreml.OpPalettizerConfig(nbits=2)}, + ) + ) + + msg = "prune_weights only accept (OpMagnitudePrunerConfig, OpThresholdPrunerConfig) type config" + with pytest.raises(ValueError, match=msg): + compressor = quantization.prune_weights( + config=cto.coreml.OptimizationConfig( + global_config=cto.coreml.OpPalettizerConfig(nbits=2), + ) + ) + + with pytest.raises(ValueError, match=msg): + compressor = quantization.prune_weights( + config=cto.coreml.OptimizationConfig( + op_type_configs={"op": cto.coreml.OpPalettizerConfig(nbits=2)}, + ) + ) + + with pytest.raises(ValueError, match=msg): + compressor = quantization.prune_weights( + config=cto.coreml.OptimizationConfig( + op_name_configs={"name": cto.coreml.OpPalettizerConfig(nbits=2)}, + ) + ) + + msg = "config must be type of OpCompressorConfig." 
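For contrast with the invalid configurations exercised in this test (which continues below), the valid pairings of compression API and per-op config class look like the following. This is an illustrative sketch based only on the `cto.coreml` classes used in these tests; the threshold and sparsity values are arbitrary examples, not values from this change.

```python
import numpy as np

import coremltools.optimize as cto

# Valid pairings (mismatches raise ValueError, as asserted in this test):
#   palettize_weights        <- OpPalettizerConfig
#   linear_quantize_weights  <- OpLinearQuantizerConfig
#   prune_weights            <- OpMagnitudePrunerConfig or OpThresholdPrunerConfig
palettize_config = cto.coreml.OptimizationConfig(
    global_config=cto.coreml.OpPalettizerConfig(nbits=4, mode="KMEANS", weight_threshold=2048)
)
quantize_config = cto.coreml.OptimizationConfig(
    global_config=cto.coreml.OpLinearQuantizerConfig(
        mode="LINEAR_SYMMETRIC", dtype=np.int8, weight_threshold=2048
    )
)
prune_config = cto.coreml.OptimizationConfig(
    global_config=cto.coreml.OpMagnitudePrunerConfig(target_sparsity=0.75, weight_threshold=2048)
)
# Each config is then passed to its matching compressor, e.g.
# compressed = cto.coreml.palettize_weights(mlmodel, palettize_config)
```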
+ with pytest.raises(ValueError, match=msg): + cto.coreml.OptimizationConfig( + global_config="str", + ) + + with pytest.raises(ValueError, match=msg): + cto.coreml.OptimizationConfig( + op_type_configs={"op": 123}, + ) + + with pytest.raises(ValueError, match=msg): + cto.coreml.OptimizationConfig( + op_name_configs={"name": []}, + ) + + msg = 'Invalid value of "minimum_sparsity_percentile":' + with pytest.raises(ValueError, match=msg): + config = cto.coreml.OpThresholdPrunerConfig( + threshold=0.8, + minimum_sparsity_percentile=1.2, + ) + + with pytest.raises(ValueError, match=msg): + config = cto.coreml.OpThresholdPrunerConfig( + threshold=0.8, + minimum_sparsity_percentile=-9.0, + ) + + msg = '"weight_threshold" must be a non-negative integer.' + with pytest.raises(ValueError, match=msg): + config = cto.coreml.OpThresholdPrunerConfig( + threshold=0.8, + weight_threshold=-9, + ) + + with pytest.raises(ValueError, match=msg): + config = cto.coreml.OpMagnitudePrunerConfig( + target_sparsity=1.0, + weight_threshold=-8, + ) + + with pytest.raises(ValueError, match=msg): + config = cto.coreml.OpLinearQuantizerConfig( + weight_threshold=-9, + ) + + with pytest.raises(ValueError, match=msg): + config = cto.coreml.OpPalettizerConfig( + nbits=2, + weight_threshold=-10, + ) + + msg = 'Either "target_sparsity" or "n_m_ratio" need to be set. They cannot be set at the same time.' + with pytest.raises(ValueError, match=msg): + config = cto.coreml.OpMagnitudePrunerConfig() + + with pytest.raises(ValueError, match=msg): + config = cto.coreml.OpMagnitudePrunerConfig( + target_sparsity=0.0, + n_m_ratio=(2, 10), + ) + + msg = 'Invalid value of "target_sparsity":' + with pytest.raises(ValueError, match=msg): + config = cto.coreml.OpMagnitudePrunerConfig( + target_sparsity=-0.9, + ) + + with pytest.raises(ValueError, match=msg): + config = cto.coreml.OpMagnitudePrunerConfig( + target_sparsity=1.1, + ) + + with pytest.raises( + ValueError, match='"block_size" and "n_m_ratio" cannot be set at the same time.' + ): + config = cto.coreml.OpMagnitudePrunerConfig( + n_m_ratio=(2, 2), + block_size=9, + ) + + msg = '"block_size" must be an integer \> 1' + with pytest.raises(ValueError, match=msg): + config = cto.coreml.OpMagnitudePrunerConfig( + target_sparsity=0.9, + block_size=1, + ) + + with pytest.raises(ValueError, match=msg): + config = cto.coreml.OpMagnitudePrunerConfig( + target_sparsity=0.9, + block_size=-9, + ) + + msg = '"n_m_ratio" must be a tuple of two integers \(n, m\). n \<\= m. Got' + with pytest.raises(ValueError, match=msg): + config = cto.coreml.OpMagnitudePrunerConfig( + n_m_ratio=(2, 2, 2), + ) + + with pytest.raises(ValueError, match=msg): + config = cto.coreml.OpMagnitudePrunerConfig( + n_m_ratio=(6, 1), + ) + + msg = '"dim" must be 1 or 0' + with pytest.raises(ValueError, match=msg): + config = cto.coreml.OpMagnitudePrunerConfig( + n_m_ratio=(1, 1), + dim=-1, + ) + + with pytest.raises(ValueError, match=msg): + config = cto.coreml.OpMagnitudePrunerConfig( + target_sparsity=1.0, + block_size=2, + dim=2, + ) + + with pytest.raises( + ValueError, match='"dim" can only be set along with "block_size" or "n_m_ratio".' 
+ ): + config = cto.coreml.OpMagnitudePrunerConfig( + target_sparsity=1.0, + dim=1, + ) + + @staticmethod + def test_set_op_type_error_out_for_const(): + """ + We cannot use set_op_type for const op + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(1, 10, 30))], opset_version=ct.target.iOS16) + def prog(x): + add_const = np.random.rand(10, 30).astype(np.float32) + return mb.add(x=x, y=add_const, name="add1") + + compressor = quantization.palettize_weights( + config=cto.coreml.OptimizationConfig( + global_config=cto.coreml.OpPalettizerConfig( + nbits=2, mode="KMEANS", weight_threshold=50 + ), + op_type_configs={ + "const": cto.coreml.OpPalettizerConfig( + nbits=4, mode="KMEANS", weight_threshold=50 + ) + }, + ) + ) + + with pytest.raises( + ValueError, + match="const ops cannot be set by the `set_op_type` function. Please use `set_global`", + ): + compressor.apply(prog) + + +class TestConfigurationFromDictFromYaml: + """ + Test the from_dict and from_yaml functionality. + """ + + @staticmethod + def load_to_yaml(config_dict): + with tempfile.NamedTemporaryFile("w") as file: + yaml.dump(config_dict, file) + yaml_dict = yaml.safe_load(open(file.name)) + file.close() + return yaml_dict + + @staticmethod + def get_yaml(config_dict): + with tempfile.NamedTemporaryFile("w", delete=False) as file: + yaml.dump(config_dict, file) + return file.name + + def get_opt_config(self, config_dict, from_yaml, yaml_as_string): + if from_yaml: + yaml_file_name = self.get_yaml(config_dict) + if not yaml_as_string: + yaml = open(yaml_file_name) + else: + yaml = yaml_file_name + config = quantization.OptimizationConfig.from_yaml(yaml) + os.remove(yaml_file_name) + else: + config = quantization.OptimizationConfig.from_dict(config_dict) + return config + + @staticmethod + @pytest.mark.parametrize( + "config_cls", + [ + quantization.OpLinearQuantizerConfig, + quantization.OpThresholdPrunerConfig, + quantization.OpMagnitudePrunerConfig, + quantization.OpPalettizerConfig, + ], + ) + def test_config_load_invalid_key(config_cls): + # Invalid key + config_dict = {"invalid": 2} + with pytest.raises(cattrs.errors.ClassValidationError): + config_cls._from_dict(config_dict) + + @pytest.mark.parametrize( + "mode, dtype, weight_threshold, use_yaml", + itertools.product( + ["linear", "linear_symmetric"], + ["int8", "uint8", np.int8, np.uint8, types.int8, types.uint8], + [1024, None], + [True, False], + ), + ) + def test_linear_quantizer_config_load_stress(self, mode, dtype, weight_threshold, use_yaml): + config_dict = { + "mode": mode, + "dtype": dtype, + "weight_threshold": weight_threshold, + } + + if use_yaml and dtype in ("int8", "uint8"): + config_dict = self.load_to_yaml(config_dict) + + config = quantization.OpLinearQuantizerConfig._from_dict(config_dict) + + if dtype in ["int8", np.int8, types.int8]: + expected_dtype = np.int8 + elif dtype in ["uint8", np.uint8, types.uint8]: + expected_dtype = np.uint8 + + expected_config = quantization.OpLinearQuantizerConfig( + mode=mode, + dtype=expected_dtype, + weight_threshold=weight_threshold, + ) + assert config == expected_config + + @pytest.mark.parametrize( + "threshold, minimum_sparsity_percentile, weight_threshold, use_yaml", + itertools.product( + [0.0, 1.0], + [0.0, 1.0], + [1024, None], + [True, False], + ), + ) + def test_threshold_pruner_config_load_stress( + self, threshold, minimum_sparsity_percentile, weight_threshold, use_yaml + ): + config_dict = { + "threshold": threshold, + "minimum_sparsity_percentile": minimum_sparsity_percentile, + "weight_threshold": 
weight_threshold, + } + + if use_yaml: + config_dict = self.load_to_yaml(config_dict) + + config = quantization.OpThresholdPrunerConfig._from_dict(config_dict) + + expected_config = quantization.OpThresholdPrunerConfig( + threshold=threshold, + minimum_sparsity_percentile=minimum_sparsity_percentile, + weight_threshold=weight_threshold, + ) + assert config == expected_config + + @pytest.mark.parametrize( + "n_m_ratio, dim, weight_threshold, use_yaml", + itertools.product( + [[1, 1], (2, 3)], + [0, 1], + [1024, None], + [True, False], + ), + ) + def test_magnitude_nm_pruner_config_load_stress( + self, n_m_ratio, dim, weight_threshold, use_yaml + ): + config_dict = { + "n_m_ratio": n_m_ratio, + "dim": dim, + "weight_threshold": weight_threshold, + } + + if use_yaml and not isinstance(n_m_ratio, tuple): + config_dict = self.load_to_yaml(config_dict) + + config = quantization.OpMagnitudePrunerConfig._from_dict(config_dict) + + expected_config = quantization.OpMagnitudePrunerConfig( + n_m_ratio=tuple(n_m_ratio), + dim=dim, + weight_threshold=weight_threshold, + ) + assert config == expected_config + + @pytest.mark.parametrize( + "target_sparsity, block_size, dim, weight_threshold, use_yaml", + itertools.product( + [0.0, 1.0], + [None, 2], + [None, 0, 1], + [None, 1024], + [True, False], + ), + ) + def test_magnitude_block_sparsity_pruner_config_load_stress( + self, target_sparsity, block_size, dim, weight_threshold, use_yaml + ): + if block_size is None and dim is not None: + return + + config_dict = { + "target_sparsity": target_sparsity, + "block_size": block_size, + "dim": dim, + "weight_threshold": weight_threshold, + } + + if use_yaml: + config_dict = self.load_to_yaml(config_dict) + + config = quantization.OpMagnitudePrunerConfig._from_dict(config_dict) + + expected_config = quantization.OpMagnitudePrunerConfig( + target_sparsity=target_sparsity, + block_size=block_size, + dim=dim, + weight_threshold=weight_threshold, + ) + assert config == expected_config + + @pytest.mark.parametrize( + "mode_nbits, weight_threshold, use_yaml", + itertools.product( + [ + ("kmeans", 2), + ("uniform", 1), + ("unique", None), + ], + [None, 1024], + [True, False], + ), + ) + def test_palettizer_config_load_stress(self, mode_nbits, weight_threshold, use_yaml): + mode, nbits = mode_nbits + + config_dict = { + "mode": mode, + "nbits": nbits, + "weight_threshold": weight_threshold, + } + + if use_yaml: + config_dict = self.load_to_yaml(config_dict) + + config = quantization.OpPalettizerConfig._from_dict(config_dict) + + expected_config = quantization.OpPalettizerConfig( + mode=mode, + nbits=nbits, + weight_threshold=weight_threshold, + ) + assert config == expected_config + + @pytest.mark.parametrize( + "from_yaml, yaml_as_string", + itertools.product( + [True, False], + [True, False], + ), + ) + def test_optimization_config_load_corner_cases(self, from_yaml, yaml_as_string): + config_dict = { + "bobby_joe": 56, + } + with pytest.raises( + ValueError, match="Invalid key bobby_joe to construct an OptimizationConfig object." + ): + self.get_opt_config(config_dict, from_yaml, yaml_as_string) + + config_dict = { + "global_config": None, + } + with pytest.raises(ValueError, match="config_type must be provided with type of string."): + self.get_opt_config(config_dict, from_yaml, yaml_as_string) + + config_dict = { + "config_type": "OpLinearQuantizerConfig", + "op_type_configs": 123, + } + with pytest.raises(ValueError, match="op_type_configs must be type of dict. 
Got"): + self.get_opt_config(config_dict, from_yaml, yaml_as_string) + + config_dict = { + "config_type": "OpLinearQuantizerConfig", + "op_name_configs": "eric", + } + with pytest.raises(ValueError, match="op_name_configs must be type of dict. Got"): + self.get_opt_config(config_dict, from_yaml, yaml_as_string) + + # check that the value of the dictionary can be None or not provided + config_dict = { + "config_type": "OpLinearQuantizerConfig", + } + config = self.get_opt_config(config_dict, from_yaml, yaml_as_string) + + assert config.global_config is None + assert config.op_type_configs == {} + assert config.op_name_configs == {} + + config_dict = { + "config_type": "OpLinearQuantizerConfig", + "global_config": None, + "op_type_configs": { + "conv": None, + }, + "op_name_configs": { + "op_1": None, + }, + } + config = self.get_opt_config(config_dict, from_yaml, yaml_as_string) + assert config.global_config is None + assert config.op_type_configs["conv"] is None + assert config.op_name_configs["op_1"] is None + + @pytest.mark.parametrize( + "from_yaml, yaml_as_string", + itertools.product( + [True, False], + [True, False], + ), + ) + def test_optimization_config_load_linear_quantizer(self, from_yaml, yaml_as_string): + config_dict = { + "config_type": "OpLinearQuantizerConfig", + "global_config": { + "mode": "linear", + "dtype": "int8", + "weight_threshold": None, + }, + "op_type_configs": { + "linear": { + "mode": "linear_symmetric", + "dtype": "uint8", + "weight_threshold": None, + }, + }, + "op_name_configs": { + "op_1": { + "mode": "linear_symmetric", + "dtype": "int8", + "weight_threshold": 2047, + }, + "op_2": { + "mode": "linear", + "dtype": "uint8", + "weight_threshold": 1, + }, + }, + } + config = self.get_opt_config(config_dict, from_yaml, yaml_as_string) + + expected_global_config = quantization.OpLinearQuantizerConfig( + mode="linear", + dtype=np.int8, + weight_threshold=None, + ) + assert config.global_config == expected_global_config + + expected_config = quantization.OpLinearQuantizerConfig( + mode="linear_symmetric", + dtype=np.uint8, + weight_threshold=None, + ) + assert config.op_type_configs["linear"] == expected_config + + expected_config = quantization.OpLinearQuantizerConfig( + mode="linear_symmetric", + dtype=np.int8, + weight_threshold=2047, + ) + assert config.op_name_configs["op_1"] == expected_config + + expected_config = quantization.OpLinearQuantizerConfig( + mode="linear", + dtype=np.uint8, + weight_threshold=1, + ) + assert config.op_name_configs["op_2"] == expected_config + + @pytest.mark.parametrize( + "from_yaml, yaml_as_string", + itertools.product( + [True, False], + [True, False], + ), + ) + def test_optimization_config_load_pruner(self, from_yaml, yaml_as_string): + """ + This test also checking the override of the config_type + """ + config_dict = { + "config_type": "OpThresholdPrunerConfig", + "global_config": { + "config_type": "OpMagnitudePrunerConfig", + "target_sparsity": 0.3, + }, + "op_type_configs": { + "linear": { + "config_type": "OpMagnitudePrunerConfig", + "n_m_ratio": [4, 5], + "dim": 0, + "weight_threshold": 2, + }, + "conv": { + "threshold": 0.01, + "minimum_sparsity_percentile": 0.01, + "weight_threshold": 45, + }, + }, + "op_name_configs": { + "op_1": { + "threshold": 0.1, + "minimum_sparsity_percentile": 0.1, + "weight_threshold": 1, + }, + "op_2": { + "config_type": "OpMagnitudePrunerConfig", + "target_sparsity": 0.5, + "block_size": 100, + }, + }, + } + config = self.get_opt_config(config_dict, from_yaml, yaml_as_string) + + 
expected_global_config = quantization.OpMagnitudePrunerConfig( + target_sparsity=0.3, + ) + assert config.global_config == expected_global_config + + expected_config = quantization.OpMagnitudePrunerConfig( + n_m_ratio=(4, 5), + dim=0, + weight_threshold=2, + ) + assert config.op_type_configs["linear"] == expected_config + + expected_config = quantization.OpThresholdPrunerConfig( + threshold=0.01, + minimum_sparsity_percentile=0.01, + weight_threshold=45, + ) + assert config.op_type_configs["conv"] == expected_config + + expected_config = quantization.OpThresholdPrunerConfig( + threshold=0.1, + minimum_sparsity_percentile=0.1, + weight_threshold=1, + ) + assert config.op_name_configs["op_1"] == expected_config + + expected_config = quantization.OpMagnitudePrunerConfig( + target_sparsity=0.5, + block_size=100, + ) + assert config.op_name_configs["op_2"] == expected_config + + @pytest.mark.parametrize( + "from_yaml, yaml_as_string", + itertools.product( + [True, False], + [True, False], + ), + ) + def test_optimization_config_load_palettizer(self, from_yaml, yaml_as_string): + config_dict = { + "config_type": "OpPalettizerConfig", + "global_config": { + "mode": "kmeans", + "nbits": 1, + "weight_threshold": 2, + }, + "op_type_configs": { + "linear": { + "mode": "uniform", + "nbits": 6, + "weight_threshold": None, + }, + }, + "op_name_configs": { + "op_1": { + "config_type": "OpPalettizerConfig", + "mode": "unique", + }, + }, + } + config = self.get_opt_config(config_dict, from_yaml, yaml_as_string) + + expected_global_config = quantization.OpPalettizerConfig( + mode="kmeans", + nbits=1, + weight_threshold=2, + ) + assert config.global_config == expected_global_config + + expected_config = quantization.OpPalettizerConfig( + mode="uniform", + nbits=6, + weight_threshold=None, + ) + assert config.op_type_configs["linear"] == expected_config + + expected_config = quantization.OpPalettizerConfig( + mode="unique", + ) + assert config.op_name_configs["op_1"] == expected_config diff --git a/coremltools/test/optimize/coreml/test_post_training_quantization.py b/coremltools/test/optimize/coreml/test_post_training_quantization.py new file mode 100644 index 000000000..425e8070f --- /dev/null +++ b/coremltools/test/optimize/coreml/test_post_training_quantization.py @@ -0,0 +1,874 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. 
+# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import itertools + +import numpy as np +import pytest +import torch + +import coremltools as ct +from coremltools._deps import _HAS_SKLEARN +from coremltools.converters.mil.mil import types +from coremltools.converters.mil.testing_utils import get_op_types_in_program +import coremltools.optimize as cto +from coremltools.test.ml_program.test_compression import get_test_model_and_data + +# Wrapper functions that create the optimization config and call ct.optimize.coreml APIs +def linear_quantize_weights(mlmodel, mode="linear", dtype=np.int8): + op_config = cto.coreml.OpLinearQuantizerConfig(mode=mode, dtype=dtype) + config = cto.coreml.OptimizationConfig(global_config=op_config) + return cto.coreml.linear_quantize_weights(mlmodel, config) + +def palettize_weights(mlmodel, nbits=None, mode="kmeans", lut_function=None): + op_config = cto.coreml.OpPalettizerConfig(mode=mode, nbits=nbits, lut_function=lut_function) + config = cto.coreml.OptimizationConfig(global_config=op_config) + return cto.coreml.palettize_weights(mlmodel, config) + +def prune_weights( + mlmodel, + mode="threshold_based", + threshold=1e-3, + target_sparsity=1.0, + block_size=-1, + n_m_ratio=(), + ): + if mode == "threshold_based": + op_config = cto.coreml.OpThresholdPrunerConfig( + threshold=threshold, + minimum_sparsity_percentile=0.0, + ) + elif mode == "percentile_based": + op_config = cto.coreml.OpMagnitudePrunerConfig( + target_sparsity=target_sparsity, + ) + elif mode == "block_sparsity": + op_config = cto.coreml.OpMagnitudePrunerConfig( + target_sparsity=target_sparsity, + block_size=block_size, + ) + else: + assert mode == "n_m_pruning" + op_config = cto.coreml.OpMagnitudePrunerConfig( + n_m_ratio=n_m_ratio, + ) + + config = cto.coreml.OptimizationConfig(global_config=op_config) + return cto.coreml.prune_weights(mlmodel, config) + +def decompress_weights(mlmodel): + return cto.coreml.decompress_weights(mlmodel) + + +# Utility functions for testing +def get_test_model_and_data_complex(): + inputs = [ct.TensorType(name="data", shape=(1, 64, 10, 10))] + torch_input_values = [torch.rand(*i.shape.to_list()) for i in inputs] + coreml_input_values = { + i.name: val.detach().numpy() for i, val in zip(inputs, torch_input_values) + } + class Model(torch.nn.Module): + def __init__(self): + super(Model, self).__init__() + self.conv_1 = torch.nn.Conv2d(in_channels=64, out_channels=32, kernel_size=2) + self.conv_2 = torch.nn.Conv2d(in_channels=32, out_channels=64, kernel_size=2) + self.linear_1 = torch.nn.Linear(64, 128) + self.linear_2 = torch.nn.Linear(128, 256) + self.lstm = torch.nn.LSTM(256, 80) + + def forward(self, x): + conv_1 = self.conv_1(x) + conv_2 = self.conv_2(conv_1) + reshape = torch.reshape(conv_2, (1, 64, 64)) + linear_1 = self.linear_1(reshape) + linear_2 = self.linear_2(linear_1) + lstm = self.lstm(linear_2) + return lstm + + return Model().eval(), inputs, torch_input_values, coreml_input_values + + +def create_unique_weight(weight, nbits): + shape = weight.detach().numpy().shape + size = weight.detach().numpy().size + + unique_number = 1 << nbits + weight = [] + partition_len = size // unique_number + 1 + for i in range(unique_number): + weight += [i] * (partition_len) + weight = np.reshape(np.array(weight[:size]).astype(np.float32), shape) + return weight + + +def create_sparse_weight(weight, target_sparsity): + shape = list(weight.shape) + size 
= np.prod(shape)
+    weight = 3 * np.ones(size)
+    num_of_zeros = int(size * target_sparsity)
+    weight[:num_of_zeros] = 0
+    return np.reshape(weight, shape).astype(np.float32)
+
+
+def verify_model_outputs(model, compressed_model, input_values):
+    """
+    This utility function performs the following checks:
+
+    (1) Verify the outputs of the compressed model have the same shape / type as the original model
+    (2) The decompressed and compressed model have the same numerical outputs
+    """
+
+    # Make sure the model can be decompressed
+    decompressed_model = decompress_weights(compressed_model)
+
+    # Validate the output shape / type
+    ref_outputs = model._mil_program.functions["main"].outputs
+    outputs = compressed_model._mil_program.functions["main"].outputs
+
+    assert len(ref_outputs) == len(outputs)
+
+    for a, b in zip(ref_outputs, outputs):
+        assert a.name == b.name
+        assert a.shape == b.shape
+        assert a.dtype == b.dtype
+
+    if ct.utils._macos_version() < (13, 0):
+        return
+
+    # Validate that the compressed and decompressed models produce the same outputs
+    output_dict = compressed_model.predict(input_values)
+    de_output_dict = decompressed_model.predict(input_values)
+    for k, v in de_output_dict.items():
+        assert k in output_dict
+        np.testing.assert_allclose(v, output_dict[k])
+
+class TestLinearQuantizeWeights:
+    @staticmethod
+    def test_linear_quantization():
+        model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data_complex()
+        torchmodel = torch.jit.trace(model, torch_input_values)
+        mlmodel = ct.convert(torchmodel, inputs=inputs, convert_to="mlprogram", compute_precision=ct.precision.FLOAT32)
+
+        config = cto.coreml.OptimizationConfig()
+        conv_config = cto.coreml.OpLinearQuantizerConfig(mode="linear_symmetric", dtype=np.int8, weight_threshold=500)
+        lstm_config = cto.coreml.OpLinearQuantizerConfig(mode="linear", dtype=np.uint8, weight_threshold=4800)
+
+        config.set_op_type("conv", conv_config)
+        config.set_op_type("lstm", lstm_config)
+        config.set_op_name("conv_2_1", None)
+
+        mlmodel = cto.coreml.linear_quantize_weights(mlmodel, config)
+        expected_ops = [
+            "constexpr_affine_dequantize",
+            "conv",
+            "conv",
+            "reshape",
+            "linear",
+            "linear",
+            "constexpr_affine_dequantize",
+            "constexpr_affine_dequantize",
+            "constexpr_affine_dequantize",
+            "lstm",
+            "expand_dims",
+            "expand_dims"
+        ]
+        prog = mlmodel._mil_program
+        assert get_op_types_in_program(prog) == expected_ops
+        assert prog.find_ops(op_type="conv")[1].weight.op.op_type == "const"
+
+        expected_dtype = [np.int8, np.uint8, np.uint8, np.uint8, np.uint8]
+        affine_ops = prog.find_ops(op_type="constexpr_affine_dequantize")
+        for dtype, op in zip(expected_dtype, affine_ops):
+            assert op.quantized_data.val.dtype == dtype
+
+    @staticmethod
+    @pytest.mark.parametrize(
+        "mode, dtype",
+        itertools.product(
+            ("linear", "linear_symmetric"),
+            (np.int8, np.uint8, types.int8, types.uint8),
+        ),
+    )
+    def test_linear_quantization_stress(mode, dtype):
+        model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data()
+        torchmodel = torch.jit.trace(model, torch_input_values)
+        mlmodel = ct.convert(torchmodel, inputs=inputs, convert_to="mlprogram")
+
+        mlmodel_quantized = linear_quantize_weights(mlmodel, mode=mode, dtype=dtype)
+
+        # validate parameters
+        expected_ops = ['constexpr_affine_dequantize', 'cast', 'conv', 'cast']
+        assert get_op_types_in_program(mlmodel_quantized._mil_program) == expected_ops
+
+        quanitze_op =
mlmodel_quantized._mil_program.functions["main"].find_ops(op_type="constexpr_affine_dequantize")[0] + assert model.weight.detach().numpy().shape == quanitze_op.quantized_data.shape + + verify_model_outputs(mlmodel, mlmodel_quantized, coreml_input_values) + + +class TestPalettizeWeights: + @staticmethod + def test_palettization(): + model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data_complex() + torchmodel = torch.jit.trace(model, torch_input_values) + mlmodel = ct.convert(torchmodel, inputs=inputs, convert_to="mlprogram", compute_precision=ct.precision.FLOAT32) + + config = cto.coreml.OptimizationConfig() + global_config = cto.coreml.OpPalettizerConfig(nbits=8, mode="kmeans", weight_threshold=500) + conv_config = cto.coreml.OpPalettizerConfig(nbits=6, mode="kmeans", weight_threshold=500) + conv_2_config = cto.coreml.OpPalettizerConfig(nbits=4, mode="kmeans", weight_threshold=500) + linear_1_config = cto.coreml.OpPalettizerConfig(nbits=2, mode="kmeans", weight_threshold=500) + + config.set_global(global_config) + config.set_op_type("conv", conv_config) + config.set_op_name("conv_2_1", conv_2_config) + config.set_op_name("input_5", linear_1_config) + + mlmodel = cto.coreml.palettize_weights(mlmodel, config) + expected_ops = [ + "constexpr_lut_to_dense", + "constexpr_lut_to_dense", + "constexpr_lut_to_dense", + "constexpr_lut_to_dense", + "conv", + "conv", + "reshape", + "linear", + "linear", + "constexpr_lut_to_dense", + "constexpr_lut_to_dense", + "constexpr_lut_to_dense", + "lstm", + "expand_dims", + "expand_dims" + ] + prog = mlmodel._mil_program + assert get_op_types_in_program(prog) == expected_ops + + expected_nbits = [6, 4, 2, 8, 8, 8, 8, 8] + lut_ops = prog.find_ops(op_type="constexpr_lut_to_dense") + + for nbits, op in zip(expected_nbits, lut_ops): + assert op.lut.val.shape == (2**nbits,) + + @staticmethod + @pytest.mark.parametrize( + "mode", + ("uniform", "kmeans") if _HAS_SKLEARN else ("uniform",) + ) + def test_weight_palettization_stress(mode): + model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data() + torchmodel = torch.jit.trace(model, torch_input_values) + mlmodel = ct.convert(torchmodel, inputs=inputs, convert_to="mlprogram") + mlmodel_palettized = palettize_weights(mlmodel, nbits=4, mode=mode) + + # validate parameters + expected_ops = ['constexpr_lut_to_dense', 'cast', 'conv', 'cast'] + assert get_op_types_in_program(mlmodel_palettized._mil_program) == expected_ops + + main_func = mlmodel_palettized._mil_program.functions["main"] + lut_to_dense_op = main_func.find_ops(op_type="constexpr_lut_to_dense")[0] + + assert lut_to_dense_op.shape.val.tolist() == list(model.weight.detach().numpy().shape) + + # validate the model + verify_model_outputs(mlmodel, mlmodel_palettized, coreml_input_values) + + @staticmethod + def test_weight_palettization_unique_case_1(): + # In this model, both conv weights can be palettized + model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data(multi_layer=True) + + weight_1_unique = create_unique_weight(model.conv_1.weight, nbits=2) + weight_2_unique = create_unique_weight(model.conv_2.weight, nbits=6) + + with torch.no_grad(): + model.conv_1.weight = torch.nn.Parameter(torch.Tensor(weight_1_unique)) + model.conv_2.weight = torch.nn.Parameter(torch.Tensor(weight_2_unique)) + + torchmodel = torch.jit.trace(model, torch_input_values) + mlmodel = ct.convert(torchmodel, inputs=inputs, convert_to="mlprogram") + + # validate parameters + mlmodel_palettized = 
palettize_weights(mlmodel, mode="unique") + expected_ops = ['constexpr_lut_to_dense', 'cast', 'conv', 'constexpr_lut_to_dense', 'conv', 'cast'] + assert get_op_types_in_program(mlmodel_palettized._mil_program) == expected_ops + + main_func = mlmodel_palettized._mil_program.functions["main"] + lut_to_dense_op_1 = main_func.find_ops(op_type="constexpr_lut_to_dense")[0] + lut_to_dense_op_2 = main_func.find_ops(op_type="constexpr_lut_to_dense")[1] + + assert lut_to_dense_op_1.shape.val.tolist() == list(model.conv_1.weight.detach().numpy().shape) + assert lut_to_dense_op_2.shape.val.tolist() == list(model.conv_2.weight.detach().numpy().shape) + + # validate the model + verify_model_outputs(mlmodel, mlmodel_palettized, coreml_input_values) + + def test_weight_palettization_unique_case_2(self, caplog): + # In this model, only one conv weights can be palettized, the converter should warn the users that one weight is skipped + model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data(multi_layer=True) + + weight_1_unique = create_unique_weight(model.conv_1.weight, nbits=2) + + with torch.no_grad(): + model.conv_1.weight = torch.nn.Parameter(torch.Tensor(weight_1_unique)) + + torchmodel = torch.jit.trace(model, torch_input_values) + mlmodel = ct.convert(torchmodel, inputs=inputs, convert_to="mlprogram") + + # validate parameters + # converter should warn the user that one weight is not compressed + mlmodel_palettized = palettize_weights(mlmodel, mode="unique") + warning_msg = "weight value cannot be represented in an 8 bits palettization. Skipped." + assert any([warning_msg in rec.message for rec in caplog.records]) + + expected_ops = ['constexpr_lut_to_dense', 'cast', 'conv', 'conv', 'cast'] + assert get_op_types_in_program(mlmodel_palettized._mil_program) == expected_ops + + main_func = mlmodel_palettized._mil_program.functions["main"] + lut_to_dense_op_1 = main_func.find_ops(op_type="constexpr_lut_to_dense")[0] + assert lut_to_dense_op_1.shape.val.tolist() == list(model.conv_1.weight.detach().numpy().shape) + + # validate the model + verify_model_outputs(mlmodel, mlmodel_palettized, coreml_input_values) + + @staticmethod + def test_weight_palettization_custom(): + model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data() + torchmodel = torch.jit.trace(model, torch_input_values) + mlmodel = ct.convert(torchmodel, inputs=inputs, convert_to="mlprogram") + + def lut_function(weight): + nbits = 4 + weight = weight.flatten() + unique_elements = np.unique(weight) + k = (1 << nbits) - 1 + top_k = np.partition(weight, -k)[-k:] + np.sort(top_k) + lut = np.array([0.] 
+ top_k.tolist()).astype(weight.dtype) + mapping = {v: idx for idx, v in enumerate(lut)} + indices = np.array([mapping[v] if v in mapping else 0 for v in weight]).astype(np.uint8) + return lut, indices + + mlmodel_palettized = palettize_weights(mlmodel, mode="custom", lut_function=lut_function) + + # validate parameters + expected_ops = ['constexpr_lut_to_dense', 'cast', 'conv', 'cast'] + assert get_op_types_in_program(mlmodel_palettized._mil_program) == expected_ops + + main_func = mlmodel_palettized._mil_program.functions["main"] + lut_to_dense_op = main_func.find_ops(op_type="constexpr_lut_to_dense")[0] + + assert lut_to_dense_op.shape.val.tolist() == list(model.weight.detach().numpy().shape) + + # validate the model + verify_model_outputs(mlmodel, mlmodel_palettized, coreml_input_values) + + @staticmethod + def test_convert_palettized_source_model_default(): + model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data_complex() + + weight_1_unique = create_unique_weight(model.conv_1.weight, nbits=2) + weight_2_unique = create_unique_weight(model.conv_2.weight, nbits=6) + linear_1_unique = create_unique_weight(model.linear_1.weight, nbits=4) + + with torch.no_grad(): + model.conv_1.weight = torch.nn.Parameter(torch.Tensor(weight_1_unique)) + model.conv_2.weight = torch.nn.Parameter(torch.Tensor(weight_2_unique)) + model.linear_1.weight = torch.nn.Parameter(torch.Tensor(linear_1_unique)) + + torchmodel = torch.jit.trace(model, torch_input_values) + mlmodel = ct.convert( + torchmodel, + inputs=inputs, + convert_to="mlprogram", + pass_pipeline=ct.PassPipeline.DEFAULT_PALETTIZATION, + compute_precision=ct.precision.FLOAT32, + minimum_deployment_target=ct.target.iOS16, + ) + + expected_ops = [ + "constexpr_lut_to_dense", + "constexpr_lut_to_dense", + "constexpr_lut_to_dense", + "conv", + "conv", + "reshape", + "linear", + "linear", + "constexpr_lut_to_dense", + "squeeze", + "lstm", + "expand_dims", + "expand_dims", + ] + prog = mlmodel._mil_program + assert get_op_types_in_program(prog) == expected_ops + + expected_nbits = [2, 6, 4, 1, 1] + lut_ops = prog.find_ops(op_type="constexpr_lut_to_dense") + + for nbits, op in zip(expected_nbits, lut_ops): + assert op.lut.val.shape == (2**nbits,) + + @staticmethod + def test_convert_palettized_source_model_custom(): + model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data_complex() + + weight_1_unique = create_unique_weight(model.conv_1.weight, nbits=2) + weight_2_unique = create_unique_weight(model.conv_2.weight, nbits=6) + linear_1_unique = create_unique_weight(model.linear_1.weight, nbits=4) + + with torch.no_grad(): + model.conv_1.weight = torch.nn.Parameter(torch.Tensor(weight_1_unique)) + model.conv_2.weight = torch.nn.Parameter(torch.Tensor(weight_2_unique)) + model.linear_1.weight = torch.nn.Parameter(torch.Tensor(linear_1_unique)) + + pipeline = ct.PassPipeline.DEFAULT_PALETTIZATION + config = cto.coreml.OptimizationConfig( + global_config=cto.coreml.OpPalettizerConfig(mode="unique"), + op_type_configs={ + "conv": None, + "linear": cto.coreml.OpPalettizerConfig(nbits=1, mode="kmeans"), + } + ) + pipeline.set_options("compression::palettize_weights", {"config": config}) + + torchmodel = torch.jit.trace(model, torch_input_values) + mlmodel = ct.convert( + torchmodel, + inputs=inputs, + convert_to="mlprogram", + pass_pipeline=pipeline, + compute_precision=ct.precision.FLOAT32, + minimum_deployment_target=ct.target.iOS16, + ) + + expected_ops = [ + "constexpr_lut_to_dense", + 
"constexpr_lut_to_dense", + "conv", + "conv", + "reshape", + "linear", + "linear", + "constexpr_lut_to_dense", + "squeeze", + "lstm", + "expand_dims", + "expand_dims", + ] + prog = mlmodel._mil_program + assert get_op_types_in_program(prog) == expected_ops + + expected_nbits = [1, 1, 1, 1] + lut_ops = prog.find_ops(op_type="constexpr_lut_to_dense") + + for nbits, op in zip(expected_nbits, lut_ops): + assert op.lut.val.shape == (2**nbits,) + + conv_ops = prog.find_ops(op_type="conv") + assert conv_ops[0].weight.op.op_type == "const" + assert conv_ops[1].weight.op.op_type == "const" + + linear_ops = prog.find_ops(op_type="linear") + assert linear_ops[0].weight.op.op_type == "constexpr_lut_to_dense" + assert linear_ops[1].weight.op.op_type == "constexpr_lut_to_dense" + + +class TestPruneWeights: + @staticmethod + def test_pruning(): + model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data_complex() + torchmodel = torch.jit.trace(model, torch_input_values) + mlmodel = ct.convert(torchmodel, inputs=inputs, convert_to="mlprogram", compute_precision=ct.precision.FLOAT32) + + config = cto.coreml.OptimizationConfig() + global_config = cto.coreml.OpMagnitudePrunerConfig(target_sparsity=0.9, weight_threshold=500) + + config.set_global(global_config) + config.set_op_type("lstm", None) + config.set_op_name("input_5", None) + + mlmodel = cto.coreml.prune_weights(mlmodel, config) + expected_ops = [ + "constexpr_sparse_to_dense", + "constexpr_sparse_to_dense", + "constexpr_sparse_to_dense", + "conv", + "conv", + "reshape", + "linear", + "linear", + "lstm", + "expand_dims", + "expand_dims" + ] + prog = mlmodel._mil_program + assert get_op_types_in_program(prog) == expected_ops + assert prog.find_ops(op_type="linear")[0].weight.op.op_type == "const" + + @staticmethod + @pytest.mark.parametrize( + "threshold", + (0.0, 0.001, 1e2), + ) + def test_weight_pruning_threshold_based(threshold): + model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data() + with torch.no_grad(): + model.weight[0][0][0][0] = 101 + torchmodel = torch.jit.trace(model, torch_input_values) + mlmodel = ct.convert(torchmodel, inputs=inputs, convert_to="mlprogram") + mlmodel_sparsified = prune_weights(mlmodel, mode="threshold_based", threshold=threshold) + + # validate parameters + expected_ops = ['constexpr_sparse_to_dense', 'cast', 'conv', 'cast'] + assert get_op_types_in_program(mlmodel_sparsified._mil_program) == expected_ops + + main_func = mlmodel_sparsified._mil_program.functions["main"] + sparse_to_dense_op = main_func.find_ops(op_type="constexpr_sparse_to_dense")[0] + non_sparse_data = sparse_to_dense_op.nonzero_data + + if threshold != 1e2: + assert np.min(np.absolute(non_sparse_data.val)) >= threshold + else: + assert non_sparse_data.val.size == 1 + + assert sparse_to_dense_op.shape.val.tolist() == list(model.weight.detach().numpy().shape) + + # validate the model + verify_model_outputs(mlmodel, mlmodel_sparsified, coreml_input_values) + + @staticmethod + @pytest.mark.parametrize( + "percentile", + (0., 0.5, 1.0), + ) + def test_weight_pruning_percentile_based(percentile): + model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data() + torchmodel = torch.jit.trace(model, torch_input_values) + mlmodel = ct.convert(torchmodel, inputs=inputs, convert_to="mlprogram") + mlmodel_sparsified = prune_weights(mlmodel, mode="percentile_based", target_sparsity=percentile) + + # validate parameters + expected_ops = ['constexpr_sparse_to_dense', 'cast', 'conv', 'cast'] + 
assert get_op_types_in_program(mlmodel_sparsified._mil_program) == expected_ops + + main_func = mlmodel_sparsified._mil_program.functions["main"] + sparse_to_dense_op = main_func.find_ops(op_type="constexpr_sparse_to_dense")[0] + non_sparse_data = sparse_to_dense_op.nonzero_data + weight = model.weight.detach().numpy() + + if percentile == 0.: + assert non_sparse_data.val.size == weight.size + elif percentile == 0.5: + assert non_sparse_data.val.size <= 0.51 * (weight.size) and non_sparse_data.val.size >= 0.49 * (weight.size) + else: + assert non_sparse_data.val.size == 0 + + assert sparse_to_dense_op.shape.val.tolist() == list(model.weight.detach().numpy().shape) + + # validate the model + verify_model_outputs(mlmodel, mlmodel_sparsified, coreml_input_values) + + def test_weight_pruning_block_sparsity(self): + model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data() + torchmodel = torch.jit.trace(model, torch_input_values) + mlmodel = ct.convert(torchmodel, inputs=inputs, convert_to="mlprogram") + mlmodel_sparsified = prune_weights(mlmodel, mode="block_sparsity", target_sparsity=0.3, block_size=5) + + # validate parameters + expected_ops = ['constexpr_sparse_to_dense', 'cast', 'conv', 'cast'] + assert get_op_types_in_program(mlmodel_sparsified._mil_program) == expected_ops + + def test_weight_pruning_n_m(self): + model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data() + torchmodel = torch.jit.trace(model, torch_input_values) + mlmodel = ct.convert(torchmodel, inputs=inputs, convert_to="mlprogram") + mlmodel_sparsified = prune_weights(mlmodel, mode="n_m_pruning", n_m_ratio=(2, 3)) + + # validate parameters + expected_ops = ['constexpr_sparse_to_dense', 'cast', 'conv', 'cast'] + assert get_op_types_in_program(mlmodel_sparsified._mil_program) == expected_ops + + def test_convert_sparse_source_model_default(self): + model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data_complex() + + weight_1_sparse = create_sparse_weight(model.conv_1.weight, 0.5) + weight_2_sparse = create_sparse_weight(model.conv_2.weight, 0.1) + linear_1_sparse = create_sparse_weight(model.linear_1.weight, 0.9) + + with torch.no_grad(): + model.conv_1.weight = torch.nn.Parameter(torch.Tensor(weight_1_sparse)) + model.conv_2.weight = torch.nn.Parameter(torch.Tensor(weight_2_sparse)) + model.linear_1.weight = torch.nn.Parameter(torch.Tensor(linear_1_sparse)) + + torchmodel = torch.jit.trace(model, torch_input_values) + + mlmodel = ct.convert( + torchmodel, + inputs=inputs, + convert_to="mlprogram", + pass_pipeline=ct.PassPipeline.DEFAULT_PRUNING, + compute_precision=ct.precision.FLOAT32, + minimum_deployment_target=ct.target.iOS16, + ) + + prog = mlmodel._mil_program + + # The default minimum_sparsity_percentile is 0.3, so only conv1, linear1, and two initialize states of lstm + # are compressed + + expected_ops = [ + "constexpr_sparse_to_dense", + "constexpr_sparse_to_dense", + "conv", + "conv", + "reshape", + "linear", + "linear", + "constexpr_sparse_to_dense", + "squeeze", + "lstm", + "expand_dims", + "expand_dims" + ] + assert get_op_types_in_program(prog) == expected_ops + + conv_ops = prog.find_ops(op_type="conv") + assert conv_ops[0].weight.op.op_type == "constexpr_sparse_to_dense" + assert conv_ops[1].weight.op.op_type == "const" + + linear_ops = prog.find_ops(op_type="linear") + assert linear_ops[0].weight.op.op_type == "constexpr_sparse_to_dense" + assert linear_ops[1].weight.op.op_type == "const" + + def 
test_convert_sparse_source_model_custom(self): + model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data_complex() + + weight_1_sparse = create_sparse_weight(model.conv_1.weight, 0.5) + weight_2_sparse = create_sparse_weight(model.conv_2.weight, 0.1) + linear_1_sparse = create_sparse_weight(model.linear_1.weight, 0.9) + + with torch.no_grad(): + model.conv_1.weight = torch.nn.Parameter(torch.Tensor(weight_1_sparse)) + model.conv_2.weight = torch.nn.Parameter(torch.Tensor(weight_2_sparse)) + model.linear_1.weight = torch.nn.Parameter(torch.Tensor(linear_1_sparse)) + + torchmodel = torch.jit.trace(model, torch_input_values) + + pipeline = ct.PassPipeline.DEFAULT_PRUNING + config = cto.coreml.OptimizationConfig( + global_config=cto.coreml.OpThresholdPrunerConfig(threshold=1e-3, minimum_sparsity_percentile=0.05), + op_type_configs={ + "conv": None + } + ) + pipeline.set_options("compression::prune_weights", {"config": config}) + mlmodel = ct.convert( + torchmodel, + inputs=inputs, + convert_to="mlprogram", + pass_pipeline=pipeline, + compute_precision=ct.precision.FLOAT32, + minimum_deployment_target=ct.target.iOS16, + ) + + prog = mlmodel._mil_program + expected_ops = [ + "constexpr_sparse_to_dense", + "conv", + "conv", + "reshape", + "linear", + "linear", + "constexpr_sparse_to_dense", + "squeeze", + "lstm", + "expand_dims", + "expand_dims" + ] + assert get_op_types_in_program(prog) == expected_ops + + conv_ops = prog.find_ops(op_type="conv") + assert conv_ops[0].weight.op.op_type == "const" + assert conv_ops[1].weight.op.op_type == "const" + + linear_ops = prog.find_ops(op_type="linear") + assert linear_ops[0].weight.op.op_type == "constexpr_sparse_to_dense" + assert linear_ops[1].weight.op.op_type == "const" + +class TestDecompressWeights: + @staticmethod + def test_weight_decopmression_coreml_optimize(): + model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data_complex() + + weight_1_sparse = create_sparse_weight(model.conv_1.weight, 0.5) + weight_2_sparse = create_sparse_weight(model.conv_2.weight, 0.1) + linear_1_unique = create_unique_weight(model.linear_1.weight, nbits=4) + + with torch.no_grad(): + model.conv_1.weight = torch.nn.Parameter(torch.Tensor(weight_1_sparse)) + model.conv_2.weight = torch.nn.Parameter(torch.Tensor(weight_2_sparse)) + model.linear_1.weight = torch.nn.Parameter(torch.Tensor(linear_1_unique)) + + torchmodel = torch.jit.trace(model, torch_input_values) + + pipeline = ct.PassPipeline.DEFAULT_PRUNING + + pipeline.insert_pass(1, "compression::palettize_weights") + config = cto.coreml.OptimizationConfig( + global_config=cto.coreml.OpPalettizerConfig(mode="unique"), + ) + pipeline.set_options("compression::palettize_weights", {"config": config}) + + mlmodel = ct.convert( + torchmodel, + inputs=inputs, + convert_to="mlprogram", + pass_pipeline=pipeline, + compute_precision=ct.precision.FLOAT32, + minimum_deployment_target=ct.target.iOS16, + ) + + decompressed_model = cto.coreml.decompress_weights(mlmodel) + prog = decompressed_model._mil_program + op_types = get_op_types_in_program(prog) + for val in op_types: + assert "constexpr" not in val + + if ct.utils._macos_version() < (13, 0): + return + + # compared the numerical outputs + output_dict = mlmodel.predict(coreml_input_values) + de_output_dict = decompressed_model.predict(coreml_input_values) + + for k, v in output_dict.items(): + assert k in de_output_dict + np.testing.assert_allclose(v, de_output_dict[k]) + + +class TestConvertMixedCompression: + @staticmethod 
+ def test_convert_sparse_and_palettized_source_model_custom(): + model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data_complex() + + weight_1_sparse = create_sparse_weight(model.conv_1.weight, 0.5) + weight_2_sparse = create_sparse_weight(model.conv_2.weight, 0.1) + linear_1_unique = create_unique_weight(model.linear_1.weight, nbits=4) + + with torch.no_grad(): + model.conv_1.weight = torch.nn.Parameter(torch.Tensor(weight_1_sparse)) + model.conv_2.weight = torch.nn.Parameter(torch.Tensor(weight_2_sparse)) + model.linear_1.weight = torch.nn.Parameter(torch.Tensor(linear_1_unique)) + + torchmodel = torch.jit.trace(model, torch_input_values) + + pipeline = ct.PassPipeline.DEFAULT_PRUNING + + pipeline.insert_pass(1, "compression::palettize_weights") + config = cto.coreml.OptimizationConfig( + global_config=cto.coreml.OpPalettizerConfig(mode="unique"), + ) + pipeline.set_options("compression::palettize_weights", {"config": config}) + + mlmodel = ct.convert( + torchmodel, + inputs=inputs, + convert_to="mlprogram", + pass_pipeline=pipeline, + compute_precision=ct.precision.FLOAT32, + minimum_deployment_target=ct.target.iOS16, + ) + + prog = mlmodel._mil_program + expected_ops = [ + "constexpr_sparse_to_dense", + "constexpr_lut_to_dense", + "constexpr_lut_to_dense", + "conv", + "conv", + "reshape", + "linear", + "linear", + "constexpr_sparse_to_dense", + "squeeze", + "lstm", + "expand_dims", + "expand_dims" + ] + assert get_op_types_in_program(prog) == expected_ops + + conv_ops = prog.find_ops(op_type="conv") + assert conv_ops[0].weight.op.op_type == "constexpr_sparse_to_dense" + assert conv_ops[1].weight.op.op_type == "constexpr_lut_to_dense" + + linear_ops = prog.find_ops(op_type="linear") + assert linear_ops[0].weight.op.op_type == "constexpr_lut_to_dense" + assert linear_ops[1].weight.op.op_type == "const" + +class TestErrorHandling: + @staticmethod + def test_error_handling(): + model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data() + torchmodel = torch.jit.trace(model, torch_input_values) + mlmodel = ct.convert(torchmodel, inputs=inputs, convert_to="mlprogram") + + # Test invalid mode for affine quantization + expected_err_str = "supported for weight affine quantization. Got mode" + with pytest.raises(ValueError, match=expected_err_str): + linear_quantize_weights(mlmodel, mode="invalid_mode") + + # Test invalid dtype for affine quantization + expected_err_str = "is unsupported for affine_quantize_weight" + with pytest.raises(ValueError, match=expected_err_str): + linear_quantize_weights(mlmodel, dtype=np.int32) + + expected_err_str = "\'dtype\' must be \ \(got \'int32\'" + with pytest.raises(TypeError, match=expected_err_str): + linear_quantize_weights(mlmodel, dtype="int32") + + # Test invalid threshold for weight sparsification + expected_err_str = 'Invalid value of "threshold": \-1.0. Needs to be in \[0, inf\)' + with pytest.raises(ValueError, match=expected_err_str): + prune_weights(mlmodel, mode="threshold_based", threshold=-1.0) + + # Test invalid percentile for weight sparsification + expected_err_str = "Invalid value of \"target_sparsity\": 1.2. Needs to be in \[0, 1\]" + with pytest.raises(ValueError, match=expected_err_str): + prune_weights(mlmodel, mode="percentile_based", target_sparsity=1.2) + + # Test invalid mode for weight palettization + expected_err_str = "supported for weight palettization. 
Got \"mode\"" + with pytest.raises(ValueError, match=expected_err_str): + palettize_weights(mlmodel, mode="invalid_mode") + + # Test nbits must be provided for kmeans, uniform mode for weight palettization + expected_err_str = "\"nbits\" must be provided for" + with pytest.raises(ValueError, match=expected_err_str): + palettize_weights(mlmodel, mode="kmeans") + + with pytest.raises(ValueError, match=expected_err_str): + palettize_weights(mlmodel, mode="uniform") + + # Test nbits must not be provided for unique, custom mode for weight palettization + expected_err_str = "\"nbits\" must NOT be provided for" + with pytest.raises(ValueError, match=expected_err_str): + palettize_weights(mlmodel, mode="unique", nbits=2) + + with pytest.raises(ValueError, match=expected_err_str): + palettize_weights(mlmodel, mode="custom", nbits=2) + + # Test lut_function must be provided for custom mode, and must not be provided otherwise + with pytest.raises(ValueError, match="\"lut_function\" can not be None, if \"mode\" is \"custom\"."): + palettize_weights(mlmodel, mode="custom") + with pytest.raises(ValueError, match="\"lut_function\" must be None, if \"mode\" is not \"custom\"."): + palettize_weights(mlmodel, mode="unique", lut_function=lambda op: True) + + # Test lut_function must be a function obejct + expected_err_str = "A function object must be provided as \"lut_function\"" + with pytest.raises(ValueError, match=expected_err_str): + palettize_weights(mlmodel, mode="custom", lut_function=1) diff --git a/coremltools/test/optimize/torch/__init__.py b/coremltools/test/optimize/torch/__init__.py new file mode 100644 index 000000000..461b69f06 --- /dev/null +++ b/coremltools/test/optimize/torch/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +# module diff --git a/coremltools/test/optimize/torch/conftest.py b/coremltools/test/optimize/torch/conftest.py new file mode 100644 index 000000000..6ab0dee21 --- /dev/null +++ b/coremltools/test/optimize/torch/conftest.py @@ -0,0 +1,53 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import os +import shutil +from coremltools.test.optimize.torch.models.mnist import ( + mnist_dataset, + mnist_model, + mnist_model_large, + mnist_model_quantization, +) +from coremltools.test.optimize.torch.pruning.pruning_utils import get_model_and_pruner + +import pytest + + +# dummy function to use the imported fixtures so that linter +# does not remove them as unused imports +def _dummy( + mnist_dataset, + mnist_model, + mnist_model_large, + mnist_model_quantization, + get_model_and_pruner, +): + return ( + mnist_dataset, + mnist_model, + mnist_model_large, + mnist_model_quantization, + get_model_and_pruner, + ) + + +def _datadir(request): + # When using this fixture with parametrized tests, we end up with '[' and ']' characters in the pathname, which TF + # is not happy with. Thus we should substitute these characters with a more universally accepted path character. 
+ safe_name = request.node.name.replace("[", "___").replace("]", "___") + + dir = test_data_path() / safe_name # noqa: F821 + shutil.rmtree(str(dir), ignore_errors=True) + os.makedirs(str(dir)) + return dir + + +@pytest.fixture +def datadir(request): + """ + Directory for storing test data for latter inspection. + """ + return _datadir(request) diff --git a/coremltools/test/optimize/torch/models/__init__.py b/coremltools/test/optimize/torch/models/__init__.py new file mode 100644 index 000000000..25c7d28c5 --- /dev/null +++ b/coremltools/test/optimize/torch/models/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause diff --git a/coremltools/test/optimize/torch/models/mnist.py b/coremltools/test/optimize/torch/models/mnist.py new file mode 100644 index 000000000..c7679c49e --- /dev/null +++ b/coremltools/test/optimize/torch/models/mnist.py @@ -0,0 +1,101 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +# type: ignore +import os +from collections import OrderedDict +from coremltools.test.optimize.torch.utils import test_data_path + +import pytest +import torch.nn as nn +from filelock import FileLock +from torchvision import datasets, transforms + +# IMPORTANT: DO NOT import these fixtures in your tests. +# That leads pytest to run the fixtures (even session-scoped) multiple times. +# These have been imported into conftest.py, which makes them available for all +# tests within the test/ folder. 
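+# A minimal sketch (hypothetical, not part of this change) of how a test consumes
+# one of these fixtures: pytest injects it by argument name via conftest.py, so the
+# test module never imports it directly. Assumes `torch` is imported in that test file.
+#
+#     def test_mnist_model_output_shape(mnist_model):
+#         out = mnist_model(torch.randn(1, 1, 28, 28))
+#         assert out.shape == (1, 10)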
+ + +num_classes = 10 + + +@pytest.fixture +def mnist_model(): + return nn.Sequential(OrderedDict([ + ('conv1', nn.Conv2d(1, 32, (5, 5), padding='same')), + ('relu1', nn.ReLU()), + ('pool1', nn.MaxPool2d(2, stride=2, padding=0)), + ('bn1', nn.BatchNorm2d(32, eps=0.001, momentum=0.01)), + ('conv2', nn.Conv2d(32, 64, (5, 5), padding='same')), + ('relu2', nn.ReLU()), + ('pool2', nn.MaxPool2d(2, stride=2, padding=0)), + ('flatten', nn.Flatten()), + ('dense1', nn.Linear(3136, 1024)), + ('relu3', nn.ReLU()), + ('dropout', nn.Dropout(p=0.4)), + ('dense2', nn.Linear(1024, num_classes)), + ('softmax', nn.LogSoftmax())])) + + +@pytest.fixture +def mnist_model_quantization(): + # String padding mode like "same" or "valid" is not supported + # for quantized models: https://github.com/pytorch/pytorch/issues/76304 + return nn.Sequential(OrderedDict([ + ('conv1', nn.Conv2d(1, 32, (5, 5), padding=2)), + ('bn1', nn.BatchNorm2d(32, eps=0.001, momentum=0.01)), + ('relu1', nn.ReLU6()), + ('pool1', nn.MaxPool2d(2, stride=2, padding=0)), + ('conv2', nn.Conv2d(32, 64, (5, 5), padding=2)), + ('relu2', nn.ReLU6()), + ('pool2', nn.MaxPool2d(2, stride=2, padding=0)), + ('flatten', nn.Flatten()), + ('dense1', nn.Linear(3136, 1024)), + ('relu3', nn.ReLU6()), + ('dropout', nn.Dropout(p=0.4)), + ('dense2', nn.Linear(1024, num_classes)), + ('softmax', nn.LogSoftmax())])) + + +@pytest.fixture +def mnist_model_large(): + """ + MNIST model with redundant layers for testing pruning algorithm + """ + return nn.Sequential(OrderedDict([ + ('conv1', nn.Conv2d(1, 32, (5, 5), padding='same')), + ('relu1', nn.ReLU()), + ('pool1', nn.MaxPool2d(2, stride=2, padding=0)), + ('bn1', nn.BatchNorm2d(32, eps=0.001, momentum=0.01)), + ('conv2', nn.Conv2d(32, 64, (5, 5), padding='same')), + ('relu2', nn.ReLU()), + ('pool2', nn.MaxPool2d(2, stride=2, padding=0)), + ('conv3', nn.Conv2d(64, 64, (5, 5), padding='same')), + ('relu3', nn.ReLU()), + ('conv4', nn.Conv2d(64, 64, (5, 5), padding='same')), + ('relu4', nn.ReLU()), + ('flatten', nn.Flatten()), + ('dense1', nn.Linear(3136, 1024)), + ('relu5', nn.ReLU()), + ('dropout', nn.Dropout(p=0.4)), + ('dense2', nn.Linear(1024, num_classes)), + ('softmax', nn.LogSoftmax())])) + + +@pytest.fixture(scope="session") +def mnist_dataset(): + transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ]) + data_path = os.path.join(test_data_path(), 'mnist') + os.makedirs(data_path, exist_ok=True) + with FileLock(os.path.join(data_path, 'data.lock')): + train = datasets.MNIST(data_path, train=True, download=True, + transform=transform) + test = datasets.MNIST(data_path, train=False, download=True, + transform=transform) + return train, test diff --git a/coremltools/test/optimize/torch/palettization/__init__.py b/coremltools/test/optimize/torch/palettization/__init__.py new file mode 100644 index 000000000..25c7d28c5 --- /dev/null +++ b/coremltools/test/optimize/torch/palettization/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause diff --git a/coremltools/test/optimize/torch/palettization/palettization_utils.py b/coremltools/test/optimize/torch/palettization/palettization_utils.py new file mode 100644 index 000000000..889e93dcb --- /dev/null +++ b/coremltools/test/optimize/torch/palettization/palettization_utils.py @@ -0,0 +1,24 @@ +# Copyright (c) 2023, Apple Inc. 
All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +from torch.ao.quantization import quantization_mappings + + +def _assert_changes_post_attach(module, n_bits, cluster_dim): + assert hasattr(module, 'qconfig') + assert module.qconfig.weight.p.keywords["n_bits"] == n_bits + assert module.qconfig.weight.p.keywords["cluster_dim"] == cluster_dim + + +def _assert_changes_post_prepare(original_module, palettized_module, n_bits, cluster_dim, kmeans_max_iter): + assert type(palettized_module) == quantization_mappings.DEFAULT_QAT_MODULE_MAPPINGS[type(original_module)] + assert palettized_module.weight_fake_quant.n_clusters[0] == 2 ** n_bits + assert palettized_module.weight_fake_quant.cluster_dim == cluster_dim + assert palettized_module.weight_fake_quant.kmeans_max_iter == kmeans_max_iter + + +def _get_max_unique_weights_in_module_post_conversion(config, module): + return (2 ** config[type(module)]["n_bits"]) \ + * config[type(module)]["cluster_dim"] diff --git a/coremltools/test/optimize/torch/palettization/test_palettization_api.py b/coremltools/test/optimize/torch/palettization/test_palettization_api.py new file mode 100644 index 000000000..5f377fa15 --- /dev/null +++ b/coremltools/test/optimize/torch/palettization/test_palettization_api.py @@ -0,0 +1,563 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import copy + +import pytest +import torch +import torch.functional as F +import torch.nn as nn + +from coremltools.optimize.torch.palettization import DKMPalettizer, DKMPalettizerConfig +from coremltools.optimize.torch.palettization.palettization_config import ( + DEFAULT_PALETTIZATION_SCHEME, +) +from coremltools.test.optimize.torch.palettization.palettization_utils import ( + _assert_changes_post_attach, + _assert_changes_post_prepare, +) + + +def _create_simple_model(): + class Net(nn.Module): + def __init__(self): + super().__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = torch.flatten(x, 1) # flatten all dimensions except batch + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + + return Net() + + +@pytest.fixture +def simple_model(): + return _create_simple_model() + + +def test_inplace_false_attach_config(simple_model): + palettizer = DKMPalettizer(simple_model) + prepared_model = palettizer.prepare() + + assert not hasattr(simple_model.conv1, "qconfig") + assert not hasattr(simple_model.conv2, "qconfig") + assert not hasattr(simple_model.fc1, "qconfig") + assert not hasattr(simple_model.fc2, "qconfig") + assert not hasattr(simple_model.fc3, "qconfig") + + _assert_changes_post_attach( + prepared_model.conv2, + DEFAULT_PALETTIZATION_SCHEME[type(simple_model.conv2)]["n_bits"], + DEFAULT_PALETTIZATION_SCHEME[type(simple_model.conv2)]["cluster_dim"], + ) + _assert_changes_post_attach( + prepared_model.fc1, + DEFAULT_PALETTIZATION_SCHEME[type(simple_model.fc1)]["n_bits"], + DEFAULT_PALETTIZATION_SCHEME[type(simple_model.fc1)]["cluster_dim"], + ) + _assert_changes_post_attach( + 
prepared_model.fc2, + DEFAULT_PALETTIZATION_SCHEME[type(simple_model.fc2)]["n_bits"], + DEFAULT_PALETTIZATION_SCHEME[type(simple_model.fc2)]["cluster_dim"], + ) + + +def test_attach_config_simple_model_uniform_palettization_config(simple_model): + config = DKMPalettizerConfig.from_dict({"global_config": {"n_bits": 4}}) + palettizer = DKMPalettizer(simple_model, config) + palettizer.prepare(inplace=True) + + n_bits = config.global_config.n_bits + _assert_changes_post_attach(simple_model.conv2, n_bits, 1) + _assert_changes_post_attach(simple_model.fc1, n_bits, 1) + _assert_changes_post_attach(simple_model.fc2, n_bits, 1) + + +def test_attach_config_simple_model_custom_palettization_config(simple_model): + custom_config = { + nn.Conv2d: {"n_bits": 2, "cluster_dim": 2}, + nn.Linear: {"n_bits": 4}, + } + config = DKMPalettizerConfig.from_dict( + {"module_type_configs": custom_config, + "module_name_configs": {'conv2': {"n_bits": 3, "cluster_dim": 2}}} + ) + palettizer = DKMPalettizer(simple_model, config) + palettizer.prepare(inplace=True) + + _assert_changes_post_attach(simple_model.conv2, 3, 2) + _assert_changes_post_attach(simple_model.fc1, custom_config[nn.Linear]["n_bits"], 1) + _assert_changes_post_attach(simple_model.fc2, custom_config[nn.Linear]["n_bits"], 1) + + +def test_attach_config_simple_model_weight_threshold_test(simple_model): + custom_config = {nn.Conv2d: {"n_bits": 2, "cluster_dim": 2, "weight_threshold": 1000}} + config = DKMPalettizerConfig.from_dict( + {"module_type_configs": custom_config} + ) + palettizer = DKMPalettizer(simple_model, config) + palettizer.prepare(inplace=True) + + # For the below two assertions, prepare_qat would propagate a None qconfig throughout the supported modules in + # the model + assert hasattr(simple_model.conv1, "qconfig") and simple_model.conv1.qconfig is None + assert hasattr(simple_model.fc1, "qconfig") and simple_model.fc1.qconfig is None + _assert_changes_post_attach( + simple_model.conv2, + custom_config[nn.Conv2d]["n_bits"], + custom_config[nn.Conv2d]["cluster_dim"], + ) + + +def test_attach_config_simple_model_weight_threshold_range_test(simple_model): + custom_config = { + nn.Conv2d: [ + {"n_bits": 4, "cluster_dim": 4, "weight_threshold": 1000}, + {"n_bits": 2, "cluster_dim": 2, "weight_threshold": 400}, + ] + } + config = DKMPalettizerConfig.from_dict({"module_type_configs": custom_config}) + palettizer = DKMPalettizer(simple_model, config) + palettizer.prepare(inplace=True) + + # For the below assertion, prepare_qat would propagate a None qconfig throughout the supported modules in + # the model + assert hasattr(simple_model.fc1, "qconfig") and simple_model.fc1.qconfig is None + _assert_changes_post_attach( + simple_model.conv1, + custom_config[nn.Conv2d][1]["n_bits"], + custom_config[nn.Conv2d][1]["cluster_dim"], + ) + _assert_changes_post_attach( + simple_model.conv2, + custom_config[nn.Conv2d][0]["n_bits"], + custom_config[nn.Conv2d][0]["cluster_dim"], + ) + + +def test_attach_config_only_on_specified_modules_conv(simple_model): + """ + If there is a module type specified in the palettization_config, qconfigs should only be applied to modules of + those types not to modules of other type. For eg: If palettization_config only contains Conv2d, we should + not attach a qconfig to nn.Linear despite it being supported by palettization. 
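+    Concretely, with module_type_configs covering only nn.Conv2d, fc1 is left with a
+    None qconfig while conv2 receives the palettization qconfig, as asserted below.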
+ """ + custom_config = {nn.Conv2d: {"n_bits": 2, "cluster_dim": 2}} + config = DKMPalettizerConfig.from_dict({"module_type_configs": custom_config}) + palettizer = DKMPalettizer(simple_model, config) + palettizer.prepare(inplace=True) + + # For the below assertion, prepare_qat would propagate a None qconfig throughout the supported modules in + # the model + assert hasattr(simple_model.fc1, "qconfig") and simple_model.fc1.qconfig is None + _assert_changes_post_attach( + simple_model.conv2, + custom_config[nn.Conv2d]["n_bits"], + custom_config[nn.Conv2d]["cluster_dim"], + ) + + +def test_attach_config_only_on_specified_modules_linear(simple_model): + custom_config = {nn.Linear: {"n_bits": 2, "cluster_dim": 2}} + config = DKMPalettizerConfig.from_dict( + {"module_type_configs": custom_config} + ) + palettizer = DKMPalettizer(simple_model, config) + palettizer.prepare(inplace=True) + + # For the below two assertions, prepare_qat would propagate a None qconfig throughout the supported modules in + # the model + assert hasattr(simple_model.conv1, "qconfig") and simple_model.conv1.qconfig is None + assert hasattr(simple_model.conv2, "qconfig") and simple_model.conv2.qconfig is None + _assert_changes_post_attach( + simple_model.fc1, + custom_config[nn.Linear]["n_bits"], + custom_config[nn.Linear]["cluster_dim"], + ) + + +def test_prepare_palettizer_simple_model_custom_palettization_config(simple_model): + simple_model_copy = copy.deepcopy(simple_model) + custom_config = {nn.Conv2d: {"n_bits": 2, "cluster_dim": 2, "kmeans_max_iter": 4}, + nn.Linear: {"n_bits": 4, "cluster_dim": 1, "kmeans_max_iter": 5}} + config = DKMPalettizerConfig.from_dict( + {"module_type_configs": custom_config} + ) + palettizer = DKMPalettizer(simple_model, config) + prepared_model = palettizer.prepare(inplace=True) + num_epochs = 1 + for epoch in range(num_epochs): + palettizer.step() + + _assert_changes_post_prepare( + simple_model_copy.conv2, + prepared_model.conv2, + custom_config[nn.Conv2d]["n_bits"], + custom_config[nn.Conv2d]["cluster_dim"], + custom_config[nn.Conv2d]["kmeans_max_iter"], + ) + _assert_changes_post_prepare( + simple_model_copy.fc1, + prepared_model.fc1, + custom_config[nn.Linear]["n_bits"], + custom_config[nn.Linear]["cluster_dim"], + custom_config[nn.Linear]["kmeans_max_iter"], + ) + _assert_changes_post_prepare( + simple_model_copy.fc2, + prepared_model.fc2, + custom_config[nn.Linear]["n_bits"], + custom_config[nn.Linear]["cluster_dim"], + custom_config[nn.Linear]["kmeans_max_iter"], + ) + + +def test_inplace_true_prepare_palettizer(simple_model): + simple_model_copy = copy.deepcopy(simple_model) + custom_config = { + nn.Conv2d: { + "n_bits": 2, + "cluster_dim": 2, + "kmeans_max_iter": 4, + "milestone": 1, + }, + nn.Linear: { + "n_bits": 4, + "cluster_dim": 1, + "kmeans_max_iter": 5, + "milestone": 1, + }, + } + config = DKMPalettizerConfig.from_dict({"module_type_configs": custom_config}) + palettizer = DKMPalettizer(simple_model, config) + + palettizer.prepare(inplace=True) + num_steps = 2 + for step in range(num_steps): + palettizer.step() + if step == 0: + assert palettizer._model.fc1.weight_fake_quant.fake_palett_enabled[0] == 0 + else: + assert palettizer._model.fc1.weight_fake_quant.fake_palett_enabled[0] == 1 + + _assert_changes_post_prepare( + simple_model_copy.conv2, + simple_model.conv2, + custom_config[nn.Conv2d]["n_bits"], + custom_config[nn.Conv2d]["cluster_dim"], + custom_config[nn.Conv2d]["kmeans_max_iter"], + ) + _assert_changes_post_prepare( + simple_model_copy.fc1, + 
simple_model.fc1, + custom_config[nn.Linear]["n_bits"], + custom_config[nn.Linear]["cluster_dim"], + custom_config[nn.Linear]["kmeans_max_iter"], + ) + _assert_changes_post_prepare( + simple_model_copy.fc2, + simple_model.fc2, + custom_config[nn.Linear]["n_bits"], + custom_config[nn.Linear]["cluster_dim"], + custom_config[nn.Linear]["kmeans_max_iter"], + ) + + + +def test_prepare_palettizer_simple_model_custom_palettization_config_milestone_1(simple_model): + custom_config = {nn.Conv2d: {"n_bits": 2, "cluster_dim": 2, "kmeans_max_iter": 4, "milestone": 1}, + nn.Linear: {"n_bits": 4, "cluster_dim": 1, "kmeans_max_iter": 5, "milestone": 1}} + config = DKMPalettizerConfig.from_dict( + {"module_type_configs": custom_config} + ) + palettizer = DKMPalettizer(simple_model, config) + + prepared_model = palettizer.prepare() + num_steps = 2 + for step in range(num_steps): + palettizer.step() + if step == 0: + assert palettizer._model.fc1.weight_fake_quant.fake_palett_enabled[0] == 0 + else: + assert palettizer._model.fc1.weight_fake_quant.fake_palett_enabled[0] == 1 + + _assert_changes_post_prepare(simple_model.conv2, prepared_model.conv2, custom_config[nn.Conv2d]["n_bits"], + custom_config[nn.Conv2d]["cluster_dim"], custom_config[nn.Conv2d]["kmeans_max_iter"]) + _assert_changes_post_prepare(simple_model.fc1, prepared_model.fc1, custom_config[nn.Linear]["n_bits"], + custom_config[nn.Linear]["cluster_dim"], custom_config[nn.Linear]["kmeans_max_iter"]) + _assert_changes_post_prepare(simple_model.fc2, prepared_model.fc2, custom_config[nn.Linear]["n_bits"], + custom_config[nn.Linear]["cluster_dim"], custom_config[nn.Linear]["kmeans_max_iter"]) + + +def test_prepare_palettizer_different_milestone_per_module_type(simple_model): + custom_config = { + nn.Conv2d: { + "n_bits": 2, + "cluster_dim": 2, + "kmeans_max_iter": 4, + "milestone": 1, + }, + nn.Linear: {"n_bits": 4, "kmeans_max_iter": 5, "milestone": 2}, + } + config = DKMPalettizerConfig.from_dict({"module_type_configs": custom_config}) + palettizer = DKMPalettizer(simple_model, config) + + orig_conv_mods = [simple_model.conv2] + orig_fc_mods = [simple_model.fc1, simple_model.fc2] + + prepared_model = palettizer.prepare() + + prepared_conv_mods = [prepared_model.conv2] + prepared_fc_mods = [prepared_model.fc1, prepared_model.fc2] + + num_steps = 3 + for step in range(num_steps): + palettizer.step() + if step == 0: + for mod in prepared_conv_mods + prepared_fc_mods: + assert mod.weight_fake_quant.fake_palett_enabled[0] == 0 + elif step == 1: + for mod in prepared_conv_mods: + assert mod.weight_fake_quant.fake_palett_enabled[0] == 1 + for mod in prepared_fc_mods: + assert mod.weight_fake_quant.fake_palett_enabled[0] == 0 + else: + for mod in prepared_conv_mods: + assert mod.weight_fake_quant.fake_palett_enabled[0] == 1 + for mod in prepared_fc_mods: + assert mod.weight_fake_quant.fake_palett_enabled[0] == 1 + + for orig, prep in zip(orig_conv_mods, prepared_conv_mods): + _assert_changes_post_prepare(orig, prep, custom_config[nn.Conv2d]["n_bits"], + custom_config[nn.Conv2d]["cluster_dim"], + custom_config[nn.Conv2d]["kmeans_max_iter"]) + for orig, prep in zip(orig_fc_mods, prepared_fc_mods): + _assert_changes_post_prepare( + orig, + prep, + custom_config[nn.Linear]["n_bits"], + 1, + custom_config[nn.Linear]["kmeans_max_iter"], + ) + + +def test_attach_config_weight_threshold_range_different_milestone(simple_model): + custom_config = {nn.Conv2d: [{"n_bits": 4, "cluster_dim": 4, "weight_threshold": 1000, "milestone": 2}, + {"n_bits": 2, "cluster_dim": 2, 
"weight_threshold": 400, "milestone": 1}]} + config = DKMPalettizerConfig.from_dict( + {"module_type_configs": custom_config} + ) + palettizer = DKMPalettizer(simple_model, config) + prepared_model = palettizer.prepare() + + # configs should get sorted automatically + assert hasattr(prepared_model.fc1, "qconfig") and prepared_model.fc1.qconfig is None + + num_steps = 3 + for step in range(num_steps): + palettizer.step() + if step == 0: + assert prepared_model.conv2.weight_fake_quant.fake_palett_enabled[0] == 0 + elif step == 1: + assert prepared_model.conv2.weight_fake_quant.fake_palett_enabled[0] == 0 + else: + assert prepared_model.conv2.weight_fake_quant.fake_palett_enabled[0] == 1 + + _assert_changes_post_attach( + prepared_model.conv2, + custom_config[nn.Conv2d][0]["n_bits"], + custom_config[nn.Conv2d][0]["cluster_dim"], + ) + + +def test_prepare_palettizer_simple_model_custom_palettization_config_none_module(simple_model): + custom_config = {nn.Conv2d: {"n_bits": 2, "cluster_dim": 2, "kmeans_max_iter": 4}, + nn.Linear: {"n_bits": 4, "cluster_dim": 1, "kmeans_max_iter": 5}} + config = DKMPalettizerConfig.from_dict( + {"module_name_configs": {"conv1": None}, "module_type_configs": custom_config} + ) + palettizer = DKMPalettizer(simple_model, config) + + prepared_model = palettizer.prepare() + num_epochs = 1 + for epoch in range(num_epochs): + palettizer.step() + assert type(prepared_model.conv1) == nn.Conv2d # Means that if None was provided, it wasn't prepared. + + _assert_changes_post_prepare(simple_model.conv2, prepared_model.conv2, custom_config[nn.Conv2d]["n_bits"], + custom_config[nn.Conv2d]["cluster_dim"], custom_config[nn.Conv2d]["kmeans_max_iter"]) + _assert_changes_post_prepare(simple_model.fc1, prepared_model.fc1, custom_config[nn.Linear]["n_bits"], + custom_config[nn.Linear]["cluster_dim"], custom_config[nn.Linear]["kmeans_max_iter"]) + _assert_changes_post_prepare(simple_model.fc2, prepared_model.fc2, custom_config[nn.Linear]["n_bits"], + custom_config[nn.Linear]["cluster_dim"], custom_config[nn.Linear]["kmeans_max_iter"]) + + +def test_prepare_palettizer_simple_model_custom_palettization_config_none_conv2d(simple_model): + custom_config = {nn.Conv2d: None, + nn.Linear: {"n_bits": 4, "cluster_dim": 1, "kmeans_max_iter": 5}} + config = DKMPalettizerConfig.from_dict( + {"module_type_configs": custom_config} + ) + palettizer = DKMPalettizer(simple_model, config) + + prepared_model = palettizer.prepare() + num_epochs = 1 + for epoch in range(num_epochs): + palettizer.step() + assert type(prepared_model.conv1) == nn.Conv2d # Means that if None was provided, it wasn't prepared. 
+ assert type(prepared_model.conv2) == nn.Conv2d + assert not hasattr(prepared_model.conv1, "weight_fake_quant") + assert not hasattr(prepared_model.conv2, "weight_fake_quant") + + _assert_changes_post_prepare(simple_model.fc1, prepared_model.fc1, custom_config[nn.Linear]["n_bits"], + custom_config[nn.Linear]["cluster_dim"], custom_config[nn.Linear]["kmeans_max_iter"]) + _assert_changes_post_prepare(simple_model.fc2, prepared_model.fc2, custom_config[nn.Linear]["n_bits"], + custom_config[nn.Linear]["cluster_dim"], custom_config[nn.Linear]["kmeans_max_iter"]) + + +def test_prepare_palettizer_simple_model_custom_palettization_config_linear_default(simple_model): + custom_config = {nn.Conv2d: {"n_bits": 2, "cluster_dim": 2, "kmeans_max_iter": 4}, + nn.Linear: {"n_bits": 4, "cluster_dim": 1}} + config = DKMPalettizerConfig.from_dict( + {"module_type_configs": custom_config} + ) + palettizer = DKMPalettizer(simple_model, config) + + prepared_model = palettizer.prepare() + num_epochs = 1 + for epoch in range(num_epochs): + palettizer.step() + + _assert_changes_post_prepare( + simple_model.conv2, + prepared_model.conv2, + custom_config[nn.Conv2d]["n_bits"], + custom_config[nn.Conv2d]["cluster_dim"], + custom_config[nn.Conv2d]["kmeans_max_iter"], + ) + _assert_changes_post_prepare( + simple_model.fc1, + prepared_model.fc1, + custom_config[nn.Linear]["n_bits"], + custom_config[nn.Linear]["cluster_dim"], + DEFAULT_PALETTIZATION_SCHEME[nn.Linear]["kmeans_max_iter"], + ) + _assert_changes_post_prepare( + simple_model.fc2, + prepared_model.fc2, + custom_config[nn.Linear]["n_bits"], + custom_config[nn.Linear]["cluster_dim"], + DEFAULT_PALETTIZATION_SCHEME[nn.Linear]["kmeans_max_iter"], + ) + + +def test_inplace_true_attach_config(simple_model): + simple_model_copy = copy.deepcopy(simple_model) + palettizer = DKMPalettizer(simple_model) + palettizer.prepare(inplace=True) + + _assert_changes_post_attach( + simple_model.conv2, + DEFAULT_PALETTIZATION_SCHEME[type(simple_model_copy.conv2)]["n_bits"], + DEFAULT_PALETTIZATION_SCHEME[type(simple_model_copy.conv2)]["cluster_dim"], + ) + _assert_changes_post_attach( + simple_model.fc1, + DEFAULT_PALETTIZATION_SCHEME[type(simple_model_copy.fc1)]["n_bits"], + DEFAULT_PALETTIZATION_SCHEME[type(simple_model_copy.fc1)]["cluster_dim"], + ) + _assert_changes_post_attach( + simple_model.fc2, + DEFAULT_PALETTIZATION_SCHEME[type(simple_model_copy.fc2)]["n_bits"], + DEFAULT_PALETTIZATION_SCHEME[type(simple_model_copy.fc2)]["cluster_dim"], + ) + + +def test_inplace_false_attach_config(simple_model): + palettizer = DKMPalettizer(simple_model) + prepared_model = palettizer.prepare() + + assert not hasattr(simple_model.conv1, "qconfig") + assert not hasattr(simple_model.conv2, "qconfig") + assert not hasattr(simple_model.fc1, "qconfig") + assert not hasattr(simple_model.fc2, "qconfig") + assert not hasattr(simple_model.fc3, "qconfig") + + _assert_changes_post_attach( + prepared_model.conv2, + DEFAULT_PALETTIZATION_SCHEME[type(simple_model.conv2)]["n_bits"], + DEFAULT_PALETTIZATION_SCHEME[type(simple_model.conv2)]["cluster_dim"], + ) + _assert_changes_post_attach( + prepared_model.fc1, + DEFAULT_PALETTIZATION_SCHEME[type(simple_model.fc1)]["n_bits"], + DEFAULT_PALETTIZATION_SCHEME[type(simple_model.fc1)]["cluster_dim"], + ) + _assert_changes_post_attach( + prepared_model.fc2, + DEFAULT_PALETTIZATION_SCHEME[type(simple_model.fc2)]["n_bits"], + DEFAULT_PALETTIZATION_SCHEME[type(simple_model.fc2)]["cluster_dim"], + ) + + +def test_inplace_true_prepare_palettizer(simple_model): + 
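+    # Note: a test with this name is already defined earlier in this module; because
+    # the later definition rebinds the module attribute, pytest collects only this one.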
simple_model_copy = copy.deepcopy(simple_model) + custom_config = { + nn.Conv2d: { + "n_bits": 2, + "cluster_dim": 2, + "kmeans_max_iter": 4, + "milestone": 1, + }, + nn.Linear: { + "n_bits": 4, + "cluster_dim": 1, + "kmeans_max_iter": 5, + "milestone": 1, + }, + } + config = DKMPalettizerConfig.from_dict({"module_type_configs": custom_config}) + palettizer = DKMPalettizer(simple_model, config) + + palettizer.prepare(inplace=True) + num_steps = 2 + for step in range(num_steps): + palettizer.step() + if step == 0: + assert palettizer._model.fc1.weight_fake_quant.fake_palett_enabled[0] == 0 + else: + assert palettizer._model.fc1.weight_fake_quant.fake_palett_enabled[0] == 1 + + _assert_changes_post_prepare( + simple_model_copy.conv2, + simple_model.conv2, + custom_config[nn.Conv2d]["n_bits"], + custom_config[nn.Conv2d]["cluster_dim"], + custom_config[nn.Conv2d]["kmeans_max_iter"], + ) + _assert_changes_post_prepare( + simple_model_copy.fc1, + simple_model.fc1, + custom_config[nn.Linear]["n_bits"], + custom_config[nn.Linear]["cluster_dim"], + custom_config[nn.Linear]["kmeans_max_iter"], + ) + _assert_changes_post_prepare( + simple_model_copy.fc2, + simple_model.fc2, + custom_config[nn.Linear]["n_bits"], + custom_config[nn.Linear]["cluster_dim"], + custom_config[nn.Linear]["kmeans_max_iter"], + ) diff --git a/coremltools/test/optimize/torch/pruning/__init__.py b/coremltools/test/optimize/torch/pruning/__init__.py new file mode 100644 index 000000000..25c7d28c5 --- /dev/null +++ b/coremltools/test/optimize/torch/pruning/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause diff --git a/coremltools/test/optimize/torch/pruning/pruning_utils.py b/coremltools/test/optimize/torch/pruning/pruning_utils.py new file mode 100644 index 000000000..ad50cfe41 --- /dev/null +++ b/coremltools/test/optimize/torch/pruning/pruning_utils.py @@ -0,0 +1,118 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. 
+# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import numpy as np +import torch +import torch.nn.functional as F +import os + +image_size = 28 +batch_size = 128 +num_classes = 10 + + +def verify_global_pruning_amount(supported_modules, model, expected_sparsity): + total_params = 0 + unpruned_params = 0 + for name, module in model.named_modules(): + if type(module) in supported_modules: + total_params += module.weight.numel() + if hasattr(module, "weight_mask"): + unpruned_params += torch.nonzero(module.weight_mask, as_tuple=False).size(0) + else: + unpruned_params += torch.nonzero(module.weight, as_tuple=False).size(0) + + actual_global_sparsity = 1 - unpruned_params / total_params + np.testing.assert_allclose(actual_global_sparsity, expected_sparsity, atol=0.02) + + +def train_and_eval_model(model, mnist_dataset, pruner, num_epochs): + # setup data loaders + train, test = mnist_dataset + train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True) + test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size) + + # train the model + optimizer = torch.optim.Adam(model.parameters(), eps=1e-07, weight_decay=1e-4) + + # train one epoch + for epoch in range(num_epochs): + model.train() + for batch_idx, (data, target) in enumerate(train_loader): + optimizer.zero_grad() + output = model(data) + loss = F.nll_loss(output, target) + loss.backward() + optimizer.step() + pruner.step() + if batch_idx % 100 == 0: + print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( + epoch, batch_idx * len(data), len(train_loader.dataset), + 100. * batch_idx / len(train_loader), loss.item())) + # if not isinstance(pruner, GlobalChannelPruner): + # print(pruner.get_submodule_sparsity_summaries()) + + accuracy = eval_model(model, test_loader) + return accuracy + + +def eval_model(model, test_loader): + model.eval() + test_loss = 0 + correct = 0 + with torch.no_grad(): + for data, target in test_loader: + output = model(data) + test_loss += F.nll_loss(output, target, reduction='sum').item() + pred = output.argmax(dim=1, keepdim=True) + correct += pred.eq(target.view_as(pred)).sum().item() + + test_loss /= len(test_loader.dataset) + accuracy = 100. 
* correct / len(test_loader.dataset) + + print('\nTest set: Average loss: {:.4f}, Accuracy: {:.0f}%\n'.format( + test_loss, accuracy)) + return accuracy + + +def get_compression_ratio(model, pruner): + # export the model + import coremltools_internal as ct + + model.eval() + pruner.finalize(inplace=True) + example_input = torch.rand(1, 1, 28, 28) + traced_model = torch.jit.trace(model, example_input) + + converted_model = ct.convert( + traced_model, + convert_to="mlprogram", + inputs=[ct.TensorType(shape=example_input.shape)], + ) + + # save and get size + converted_model.save("/tmp/converted_model_unpruned.mlpackage") + unpruned_model_size = os.path.getsize( + "/tmp/converted_model_unpruned.mlpackage/Data/com.apple.CoreML/weights/weight.bin") + + # compress the model + pruned_model = ct.compression_utils.sparsify_weights(converted_model, mode="threshold_based", threshold=1e-12) + + # save and get size + pruned_model.save("/tmp/converted_model_pruned.mlpackage") + pruned_model_size = os.path.getsize( + "/tmp/converted_model_pruned.mlpackage/Data/com.apple.CoreML/weights/weight.bin") + + compression_ratio = pruned_model_size/unpruned_model_size + + print(f"Compression ratio: {compression_ratio}") + return compression_ratio + + +def get_model_and_pruner(mnist_model, pruner_cls, pruner_config): + model = mnist_model + pruner = pruner_cls(model, pruner_config) + pruner.prepare(inplace=True) + return model, pruner diff --git a/coremltools/test/optimize/torch/pruning/test_magnitude_pruner.py b/coremltools/test/optimize/torch/pruning/test_magnitude_pruner.py new file mode 100644 index 000000000..7906c86a5 --- /dev/null +++ b/coremltools/test/optimize/torch/pruning/test_magnitude_pruner.py @@ -0,0 +1,570 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import copy +from collections import OrderedDict + +import numpy as np +import pytest +import torch + +from coremltools.optimize.torch.pruning import ( + MagnitudePruner, + MagnitudePrunerConfig, + ModuleMagnitudePrunerConfig, +) +from coremltools.optimize.torch.pruning._utils import n_m_mask + + +def _zero_loss(x, y): + return torch.sum(x) * 0.0 + + +def _mock_initializer(shape, dtype): + # Each output channel is (entirely) an integer, increaing. This makes it so + # that we know what to expect from the LnPruner. 
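+    # For example, a (4, 3, 3, 3) weight gets its four output channels filled with
+    # 1.0, 2.0, 3.0 and 4.0 respectively, so the per-channel magnitude ordering is known.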
+ output_channel_index = 0 + num_output_channels = shape[output_channel_index] + output_channel_values = np.arange(1, num_output_channels + 1, dtype=dtype) + broadcast_shape = tuple(-1 if i == output_channel_index else 1 for i, _ in enumerate(shape)) + output_channel_values_reshaped = np.reshape(output_channel_values, broadcast_shape) + return torch.tensor(np.broadcast_to(output_channel_values_reshaped, shape)) + + +def _create_module(): + conv2d = torch.nn.Conv2d(in_channels=3, + out_channels=4, + kernel_size=(3, 3), + bias=False, + groups=1) + conv2d.weight = torch.nn.Parameter(_mock_initializer(conv2d.weight.shape, np.float32)) + activation = torch.nn.ReLU() + return torch.nn.Sequential(OrderedDict([ + ('conv2d', conv2d), + ('activation', activation)])) + + +def _create_large_module(): + def _conv2d(): + return torch.nn.Conv2d(8, 8, (3, 3), bias=False, groups=1) + + return torch.nn.Sequential(OrderedDict([ + ('conv1', _conv2d()), + ('conv2', _conv2d()), + ('conv3', _conv2d()), + ('flatten', torch.nn.Flatten()), + ('linear1', torch.nn.Linear(2592, 100)), + ('linear2', torch.nn.Linear(100, 10))])) + + +@pytest.fixture +def simple_module(): + return _create_module() + + +@pytest.fixture +def large_module(): + return _create_large_module() + + +@pytest.fixture(scope="module") +def sample_data(): + X = np.asarray([np.random.uniform(0.0, 1.0, size=(3, 8, 8)).astype(np.float32) for _ in range(4)]) + Y = np.asarray([np.random.uniform(0.0, 1.0, size=(4, 6, 6)).astype(np.float32) for _ in range(4)]) + X, Y = torch.tensor(X), torch.tensor(Y) + return X, Y + + + +@pytest.mark.parametrize('out_channels', [17, 127]) +@pytest.mark.parametrize('block_size', [2, 3, 4]) +def test_magnitude_pruner_nondivisible_block_size(out_channels, block_size): + """ + Test block sparsity when the number of channels is not divisible by block size + """ + conv2d = torch.nn.Conv2d(in_channels=3, + out_channels=out_channels, + kernel_size=(3, 3), + bias=False, + groups=1) + weight_shape = tuple(conv2d.weight.shape) + weight_tensor = torch.abs(torch.randn(*weight_shape)) + weight_tensor[weight_tensor == 0] = 1.0 + conv2d.weight = torch.nn.Parameter(weight_tensor) + config = MagnitudePrunerConfig.from_dict( + {"global_config": + { + "scheduler": {"update_steps": [1, 2]}, + "initial_sparsity": 0.0, + "target_sparsity": 0.5, + "block_size": block_size, + }}, + ) + pruner = MagnitudePruner(conv2d, config) + conv2d = pruner.prepare() + + for _ in range(4): + pruner.step() + + if block_size > 1: + block_sparse_channels = out_channels - out_channels % block_size + for idx in range(0, block_sparse_channels, block_size): + for jdx in range(1, block_size): + assert torch.all(conv2d.weight_mask[idx] == conv2d.weight_mask[idx + jdx]) + + sparsity = conv2d.weight_mask.eq(0).sum() / conv2d.weight_mask.numel() + np.testing.assert_array_almost_equal(sparsity, 0.5, decimal=2) + + +def test_magnitude_pruner_incompatible_block_size(simple_module): + """ + Test MagnitudePruner init failure when block_size is incompatibe with the number of channels + in the block sparsity dimension + """ + # block size greater than half the number of channels + with pytest.raises(ValueError): + config = MagnitudePrunerConfig.from_dict( + {"global_config": + { + "scheduler": {"update_steps": [0, 1]}, + "block_size": 4 + }}, + ) + pruner = MagnitudePruner(simple_module, config) + pruner.prepare(inplace=True) + # block size greater than half the number of channels + config.global_config.block_size = 4 + with pytest.raises(ValueError): + pruner = 
MagnitudePruner(simple_module, config) + pruner.prepare(inplace=True) + +@pytest.mark.parametrize( + "options", + [("block_size", 2), ("initial_sparsity", 0.5), ("granularity", "per_channel")], +) +def test_magnitude_pruner_n_m_ratio_param_usage(options): + param_name, val = options + with pytest.raises(Exception): + MagnitudePrunerConfig.from_dict( + {"global_config": { + "n_m_ratio": [3, 4], + param_name: val}}, + ) + + +@pytest.mark.parametrize('config_dict', [ + {"module_type_configs": {"Linear": {"block_size": 2}}}, + {"module_name_configs": {"conv2d": {"block_size": 2}}}, + {"global_config": {"block_size": 2}}, + {}, +]) +def test_magnitude_pruner_config_global_config_set(config_dict): + config = MagnitudePrunerConfig.from_dict(config_dict) + if len(config_dict) == 0: + assert config.global_config == ModuleMagnitudePrunerConfig() + else: + keys = ["global_config", "module_type_configs", "module_name_configs"] + for key in keys: + if key not in config_dict: + param_in_config = getattr(config, key) + assert param_in_config is None or len(param_in_config) == 0 + if "global_config" in config_dict: + assert config.global_config.block_size == config_dict["global_config"]["block_size"] + if "module_name_configs" in config_dict: + for key in config_dict["module_name_configs"]: + assert config.module_name_configs[key].block_size == \ + config_dict["module_name_configs"][key]["block_size"] + if "module_type_configs" in config_dict: + for key in config_dict["module_type_configs"]: + assert config.module_type_configs[key].block_size == \ + config_dict["module_type_configs"][key]["block_size"] + + +@pytest.mark.parametrize('out_channels', [16, 64]) +@pytest.mark.parametrize('block_size', [1, 4, 8]) +def test_magnitude_pruner_block_sparsity(out_channels, block_size): + """ + Test block sparsity structure is obtained by MagnitudePruner when block_size > 1 + """ + conv2d = torch.nn.Conv2d(in_channels=3, + out_channels=out_channels, + kernel_size=(3, 3), + bias=False, + groups=1) + weight_shape = tuple(conv2d.weight.shape) + weight_tensor = torch.abs(torch.randn(*weight_shape)) + weight_tensor[weight_tensor == 0] = 1.0 + conv2d.weight = torch.nn.Parameter(weight_tensor) + config = MagnitudePrunerConfig.from_dict( + {"global_config": + { + "scheduler": {"update_steps": [1, 2]}, + "initial_sparsity": 0.0, + "target_sparsity": 0.5, + "block_size": block_size, + }}, + ) + pruner = MagnitudePruner(conv2d, config) + conv2d = pruner.prepare() + + for _ in range(4): + pruner.step() + + if block_size > 1: + for idx in range(0, out_channels, block_size): + for jdx in range(1, block_size): + assert torch.all(conv2d.weight_mask[idx] == conv2d.weight_mask[idx + jdx]) + + assert torch.sum(conv2d.weight_mask == 0).item() == int(0.5 * torch.numel(conv2d.weight)) + + +def test_finalize(simple_module): + """ + Test that calling finalize on the module leads to param being replaced with + param_orig * param_mask. 
+ """ + config = MagnitudePrunerConfig.from_dict( + {"global_config": + { + "scheduler": {"update_steps": [1, 2]}, + "initial_sparsity": 0.0, + "target_sparsity": 0.5, + "granularity": "per_channel" + }}, + ) + pruner = MagnitudePruner(simple_module, config) + simple_module = pruner.prepare() + + for _ in range(4): + pruner.step() + + pruner.finalize(inplace=True) + + assert torch.sum(simple_module.conv2d.weight[:2] == 0).item() == 54 + assert torch.sum(simple_module.conv2d.weight[2] == 3).item() == 27 + assert torch.sum(simple_module.conv2d.weight[3] == 4).item() == 27 + + +def test_magnitude_pruning_correctness(simple_module): + """ + Test correctness of magnitude pruning. + + Initialize convolution weight with 4 output channels, + with weights associated with channel `k` having integer value k+1 (k=0,...,3). + We test that pruning twice indeed zeros out 3 output channels. + """ + config = MagnitudePrunerConfig.from_dict( + {"global_config": + { + "scheduler": {"update_steps": [2, 3]}, + "initial_sparsity": 0.0, + "target_sparsity": 0.75, + "granularity": "per_channel" + }}, + ) + pruner = MagnitudePruner(simple_module, config) + simple_module = pruner.prepare() + + # Perform 4 iterations: pruning should happen on steps 2 and 3 + # step 1: No pruning + pruner.step() + np.testing.assert_equal(simple_module.conv2d.weight_mask.numpy(), + np.array([1, 1, 1, 1], dtype=np.int32).reshape((4, 1, 1, 1))) + # step 2: prune once, polynomial schedule will give new sparsity as 0.0, still no pruning + pruner.step() + np.testing.assert_equal(simple_module.conv2d.weight_mask.numpy(), + np.array([1, 1, 1, 1], dtype=np.int32).reshape((4, 1, 1, 1))) + # step 3: prune once again, polynomial schedule will give new sparsity as 1.0, 75% = 3 out of 4 + # channels with least magnitude (first three channels) will be pruned out + pruner.step() + np.testing.assert_equal(simple_module.conv2d.weight_mask.numpy(), + np.array([0, 0, 0, 1], dtype=np.int32).reshape((4, 1, 1, 1))) + + # step 4: prune once again, polynomial schedule sparsity stays at 0.75, no further pruning + pruner.step() + np.testing.assert_equal(simple_module.conv2d.weight_mask.numpy(), + np.array([0, 0, 0, 1], dtype=np.int32).reshape((4, 1, 1, 1))) + + +def test_magnitude_pruning_training_and_validation(simple_module, sample_data): + """ + Tests pruned weights are used for computing forward pass + pruned module. Also demonstrates how pruner can be combined with + training code in PyTorch, i.e, pruning can be done at a schedule + different from training. + + Note: No actual training happens here because loss function is a no-op. 
+ """ + config = MagnitudePrunerConfig.from_dict( + {"global_config": + { + "scheduler": {"update_steps": [2, 3]}, + "initial_sparsity": 0.0, + "target_sparsity": 0.75, + "granularity": "per_channel" + }}, + ) + pruner = MagnitudePruner(simple_module, config) + simple_module = pruner.prepare() + + # Train the model for 4 epochs + num_epochs = 4 + X, Y = sample_data + simple_module.train() + optimizer = torch.optim.Adam(params=simple_module.parameters(), lr=0.001) + for _ in range(num_epochs): + for inp, label in zip(X, Y): + inp = inp.view(1, *X.shape[1:]) + label = label.view(1, *Y.shape[1:]) + output = simple_module(inp) + loss = _zero_loss(output, label) + optimizer.zero_grad() + loss.backward() + optimizer.step() + pruner.step() + + # Test inference + # After 4 iterations, pruner will zero out first 3 layers of conv2d layer in simple_module + simple_module.eval() + with torch.no_grad(): + x_test = torch.tensor(np.random.uniform(0.0, 1.0, size=(1, 3, 8, 8)).astype(np.float32)) + y_test = simple_module(x_test).detach().numpy() + + zero_output = y_test[:, :3, :, :] + nonzero_output = y_test[:, 3:, :, :] + np.testing.assert_equal(zero_output, np.zeros_like(zero_output)) + assert np.any(np.abs(nonzero_output) > 0.0) + + +@pytest.mark.parametrize('granularity', ["per_scalar", "per_channel"]) +def test_magnitude_pruning_granularity_parameter_usage(simple_module, granularity): + """ + Tests MagnitudePruner creates mask of the correct shape + depending on the granularity parameter. + + We set target sparsity to 1.0 so the mask should be all zeros after 4 iterations. + """ + config = MagnitudePrunerConfig.from_dict( + {"global_config": + { + "scheduler": {"update_steps": [2, 3]}, + "initial_sparsity": 0.5, + "target_sparsity": 1.0, + "granularity": granularity + }}, + ) + pruner = MagnitudePruner(simple_module, config) + simple_module = pruner.prepare() + + # Perform 4 iterations + for _ in range(4): + pruner.step() + + mask_data = simple_module.conv2d.weight_mask.numpy() + # Pruning mask should be all zeros since the pruner should be at 100% sparsity. 
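+    # The mask shape depends on granularity: per_scalar masks share the full weight
+    # shape, while per_channel masks keep one entry per output channel and are
+    # broadcast over the remaining dimensions.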
+ if granularity == "per_scalar": + expected_mask_shape = (4, 3, 3, 3) + else: + assert granularity == "per_channel" + expected_mask_shape = (4, 1, 1, 1) + np.testing.assert_equal(mask_data, np.zeros(expected_mask_shape)) + + +@pytest.mark.parametrize('granularity', ["per_scalar", "per_channel"]) +def test_pruner_finalize(simple_module, granularity): + config = MagnitudePrunerConfig.from_dict( + {"global_config": + { + "scheduler": {"update_steps": [2, 3]}, + "initial_sparsity": 0.5, + "target_sparsity": 1.0, + "granularity": granularity + }}, + ) + pruner = MagnitudePruner(simple_module, config) + simple_module = pruner.prepare() + + assert hasattr(simple_module.conv2d, "weight_mask") + assert hasattr(simple_module.conv2d, "weight_orig") + + # Perform 4 iterations + for _ in range(4): + pruner.step() + + pruner.finalize(inplace=True) + + assert not hasattr(simple_module.conv2d, "weight_mask") + assert not hasattr(simple_module.conv2d, "weight_orig") + + weight_data = simple_module.conv2d.weight.detach().numpy() + np.testing.assert_equal(weight_data, np.zeros_like(weight_data)) + + # calling finalize again is a no-op + pruner.finalize(inplace=True) + + +@pytest.mark.parametrize("block_size", [1, 2]) +@pytest.mark.parametrize("granularity", ["per_scalar", "per_channel"]) +def test_sparsity_report_method(large_module, block_size, granularity): + model = large_module + target_sparsity = 0.5 + config = MagnitudePrunerConfig.from_dict( + {"global_config": + { + "scheduler": {"update_steps": [2, 3]}, + "block_size": block_size, + "initial_sparsity": 0.0, + "target_sparsity": target_sparsity, + "granularity": granularity + }}, + ) + pruner = MagnitudePruner(model, config) + pruner.prepare(inplace=True) + + inp = torch.ones(1, 8, 24, 24) + for _ in range(4): + model(inp) + pruner.step() + + report = pruner.report() + + assert len(report) == 6 + + for sparsity in [val["unstructured_weight_sparsity"] for _, val in report.items()]: + assert sparsity == pytest.approx(target_sparsity, 0.1) + if block_size == 2: + for sparsity in [val["block2_weight_sparsity"] for _, val in report.items()]: + assert sparsity == pytest.approx(target_sparsity, 0.1) + if granularity == "per_channel": + for sparsity in [ + val["structured_weight_sparsity"] for _, val in report.items() + ][:3]: + # only conv layers + assert sparsity == pytest.approx(target_sparsity, 0.1) + + +def test_sparsity_report_block2_sparsity_not_applicable(): + model = torch.nn.Sequential(torch.nn.Conv2d(1, 31, 2, 1), + torch.nn.Conv2d(31, 21, 2, 1)) + target_sparsity = 0.5 + config = MagnitudePrunerConfig.from_dict( + {"global_config": + { + "scheduler": {"begin_step": 0}, + "initial_sparsity": 0.0, + "target_sparsity": target_sparsity, + }}, + ) + pruner = MagnitudePruner(model, config) + pruner.prepare(inplace=True) + + inp = torch.ones(1, 1, 28, 28) + for _ in range(2): + pruner.step() + model(inp) + + report = pruner.report() + + assert len(report) == 3 + + for sparsity in [val["block2_weight_sparsity"] for _, val in report.items()]: + assert sparsity == -1 + + +def test_magnitude_pruner_cloning(simple_module): + model = simple_module + config = MagnitudePrunerConfig.from_dict( + {"global_config": + { + "scheduler": {"update_steps": [0, 1]}, + }}, + ) + pruner = MagnitudePruner(model, config) + pruner.prepare(inplace=True) + + model_copy = copy.deepcopy(model) + + assert hasattr(model_copy.conv2d, "pruning_method") + assert torch.all(model_copy.conv2d.weight_orig == model.conv2d.weight_orig) + assert torch.all(model_copy.conv2d.weight_mask == 
model.conv2d.weight_mask) + + pruner.finalize(inplace=True) + + model_copy_finalize = copy.deepcopy(model) + + assert not hasattr(model_copy_finalize.conv2d, "pruning_method") + assert torch.all(model_copy_finalize.conv2d.weight == model.conv2d.weight) + + +@pytest.mark.parametrize('weights_shape', [[2, 8], [2, 8, 1, 1]]) +@pytest.mark.parametrize('dim', [1, 0]) +def test_nm_pruner_mask_computation(weights_shape, dim): + weights = torch.tensor( + [ + [2, 3, 0, 4, 5, 9, 1, 1], + [3, 6, 1, 0, 2, 3, 8, 9] + ] + ) + if dim == 1: + expected_mask = torch.tensor( + [ + [0, 1, 0, 1, 1, 1, 0, 0], + [1, 1, 0, 0, 0, 0, 1, 1] + + ] + ) + nm = (2, 4) + else: + expected_mask = torch.tensor( + [ + [0, 0, 0, 1, 1, 1, 0, 0], + [1, 1, 1, 0, 0, 0, 1, 1] + + ] + ) + nm = (1, 2) + if weights_shape == [2, 8, 1, 1]: + weights = weights.reshape([2, 8, 1, 1]) + expected_mask = expected_mask.reshape([2, 8, 1, 1]) + mask = n_m_mask(weights, nm, dim=dim) + np.testing.assert_array_equal(mask, expected_mask) + + +@pytest.mark.parametrize("range_str", ["range(0, 25000, 100)", "range(0)"]) +def test_polynomial_scheduler_range_str(range_str): + pruner_config = MagnitudePrunerConfig.from_dict( + {"global_config": {"scheduler": {"update_steps": range_str}}} + ) + + update_steps_tensor = torch.tensor(list(eval(range_str))) + assert torch.all( + pruner_config.global_config.scheduler.update_steps == update_steps_tensor + ) + + +def test_nm_pruner_polynomial_scheduler(): + model = torch.nn.Linear(8, 2) + weights = torch.tensor( + [[2, 3, 7, 4, 5, 8, 1, 6], [4, 5, 1, 6, 2, 3, 7, 8]], dtype=torch.float + ) + model.weight.data = weights + data = torch.randn(1, 8) + + config = MagnitudePrunerConfig.from_dict( + { + "global_config": { + "scheduler": {"update_steps": range(8), "power": 1}, + "n_m_ratio": (7, 8), + } + } + ) + pruner = MagnitudePruner(model, config) + model = pruner.prepare() + + for idx in range(7): + pruner.step() + model(data) + for row in range(2): + assert torch.count_nonzero(model.weight_mask[row]) == (7 - idx) diff --git a/coremltools/test/optimize/torch/pruning/test_pruning_scheduler.py b/coremltools/test/optimize/torch/pruning/test_pruning_scheduler.py new file mode 100644 index 000000000..85827651f --- /dev/null +++ b/coremltools/test/optimize/torch/pruning/test_pruning_scheduler.py @@ -0,0 +1,87 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import sys + +import pytest +import torch + +from coremltools.optimize.torch.pruning import ( + ConstantSparsityScheduler, + MagnitudePruner, + MagnitudePrunerConfig, + ModuleMagnitudePrunerConfig, + PolynomialDecayScheduler, +) + + +@pytest.fixture +def simple_module(): + return torch.nn.Conv2d(3, 3, (3, 3), bias=False, groups=1) + + +@pytest.mark.skipif(sys.platform == "darwin", reason="temporarily disabled.") +@pytest.mark.parametrize('steps_and_expected', [[[4, 7, 9], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.875, 0.875, 1.0, 1.0]], + [[3], [0.0, 0.0, 1.0, 1.0]]]) +def test_polynomial_decay_correctness(simple_module, steps_and_expected): + """ + Tests correctness of polynomial decay schedule. + + Note: Schedule can be stepped beyond the maximum step specified in update_steps. + Beyond the max update step, the sparsity stays at the target sparsity. + For example, in the first test case, we step 10 times, whereas max step is 9. At the 10th call + to schedule.step, sparsity remains at 1.0. 
+ """ + + update_steps, expected_sparsitys = steps_and_expected + config = MagnitudePrunerConfig().set_global( + ModuleMagnitudePrunerConfig( + scheduler=PolynomialDecayScheduler(update_steps=update_steps), + initial_sparsity=0.0, target_sparsity=1.0, + ) + ) + pruner = MagnitudePruner(simple_module, config) + pruner.prepare(inplace=True) + + for expected in expected_sparsitys: + pruner.step() + assert pruner._pruner_info[''].sparsity_level == expected + + +@pytest.mark.parametrize('steps', [[2.5, 6.5, 3.3], + [[2, 3], [3, 5]], + [-2, 0, 2]]) +def test_polynomial_decay_initialization_failure(steps): + with pytest.raises(Exception): + PolynomialDecayScheduler(update_steps=steps) + with pytest.raises(Exception): + PolynomialDecayScheduler(update_steps=torch.tensor(steps)) + + +@pytest.mark.skipif(sys.platform == "darwin", reason="temporarily disabled.") +@pytest.mark.parametrize('step_and_target', [(4, 0.5), (0, 0.8)]) +def test_constant_sparsity_correctness(simple_module, step_and_target): + """ + Tests correctness of spline schedule. + + Note: Schedule can be stepped beyond the maximum step specified in update_steps. + Beyond the max update step, the sparsity stays at the target sparsity. + For example, in the first test case, we step 10 times, whereas max step is 9. At the 10th call + to schedule.step, sparsity remains at 1.0. + """ + begin_step, target_sparsity = step_and_target + initial_sparsity = target_sparsity if begin_step == 0 else 0.0 + config = MagnitudePrunerConfig().set_global( + ModuleMagnitudePrunerConfig( + scheduler=ConstantSparsityScheduler(begin_step=begin_step), + initial_sparsity=initial_sparsity, target_sparsity=target_sparsity, + ) + ) + pruner = MagnitudePruner(simple_module, config) + pruner.prepare(inplace=True) + for _ in range(begin_step): + assert pruner._pruner_info[''].sparsity_level == initial_sparsity + pruner.step() + assert pruner._pruner_info[''].sparsity_level == target_sparsity diff --git a/coremltools/test/optimize/torch/quantization/__init__.py b/coremltools/test/optimize/torch/quantization/__init__.py new file mode 100644 index 000000000..25c7d28c5 --- /dev/null +++ b/coremltools/test/optimize/torch/quantization/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause diff --git a/coremltools/test/optimize/torch/quantization/test_configure.py b/coremltools/test/optimize/torch/quantization/test_configure.py new file mode 100644 index 000000000..57dfb2ad5 --- /dev/null +++ b/coremltools/test/optimize/torch/quantization/test_configure.py @@ -0,0 +1,755 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. 
+# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import operator +from collections import OrderedDict +from typing import List + +import pytest +import torch +import torch.ao.nn.quantized.reference +import torch.ao.quantization +import torch.nn as nn +import torch.nn.intrinsic +import torch.nn.intrinsic.qat +import torch.nn.qat +import torch.nn.quantized + +from coremltools.optimize.torch.quantization import LinearQuantizer, LinearQuantizerConfig +from coremltools.optimize.torch.quantization._backend_config import _mod_activations +from coremltools.optimize.torch.quantization._qconfig_mapping import _QConfigMappingBuilder +from coremltools.optimize.torch.quantization._utils import find_module, is_activation_post_process +from coremltools.optimize.torch.quantization.modules import fused_modules as _fused +from coremltools.optimize.torch.quantization.modules import qat_modules as _qat +from coremltools.optimize.torch.quantization.modules import quantized_modules as _quantized +from coremltools.optimize.torch.quantization.quantization_config import QuantizationScheme + + +def get_configs_for_qscheme( + activation_dtype=torch.quint8, + weight_per_channel=True, +) -> List[LinearQuantizerConfig]: + return [ + LinearQuantizerConfig.from_dict( + { + "global_config": { + "quantization_scheme": QuantizationScheme.symmetric, + "milestones": [0, 0, 10, 10], + "activation_dtype": activation_dtype, + "weight_per_channel": weight_per_channel, + } + } + ), + LinearQuantizerConfig.from_dict( + { + "global_config": { + "quantization_scheme": QuantizationScheme.affine, + "milestones": [0, 0, 10, 10], + "activation_dtype": activation_dtype, + "weight_per_channel": weight_per_channel, + } + } + ), + ] + + +def quantize_model(model, data, config=None): + quantizer = LinearQuantizer(model, config) + prepared_model = quantizer.prepare(example_inputs=data, inplace=False) + quantizer.step() + prepared_model(data) + return prepared_model, quantizer + + +@pytest.mark.parametrize( + "config", + get_configs_for_qscheme() + get_configs_for_qscheme(weight_per_channel=False), +) +def test_conv_relu_fusion(config): + model = nn.Sequential( + OrderedDict( + { + "conv": nn.Conv2d(1, 20, (3, 3)), + "act": nn.ReLU(), + } + ) + ) + data = torch.randn(1, 1, 28, 28) + + prepared_model, quantizer = quantize_model(model, data, config) + + assert isinstance(prepared_model.conv, torch.nn.intrinsic.qat.ConvReLU2d) + + converted_model = quantizer.finalize(inplace=False) + + assert isinstance(converted_model.conv, torch.ao.nn.intrinsic.ConvReLU2d) + assert isinstance(converted_model.conv[0], torch.ao.nn.quantized.reference.Conv2d) + + +@pytest.mark.parametrize( + "config", + get_configs_for_qscheme() + get_configs_for_qscheme(weight_per_channel=False), +) +@pytest.mark.parametrize("activation_fn", list(_mod_activations)) +def test_conv_act_fusion(config, activation_fn): + model = nn.Sequential(OrderedDict({ + 'conv': nn.Conv2d(1, 20, (3, 3)), + 'act': activation_fn(), + })) + data = torch.randn(1, 1, 28, 28) + + prepared_model, quantizer = quantize_model(model, data, config) + + assert isinstance(prepared_model.conv, _qat.ConvAct2d) + assert isinstance(prepared_model.conv.act, activation_fn) + + converted_model = quantizer.finalize(inplace=False) + + assert isinstance(converted_model.conv, _quantized.QuantizedConvAct2d) + assert isinstance(converted_model.conv.act, activation_fn) + + +@pytest.mark.parametrize( + 
"config", + get_configs_for_qscheme() + get_configs_for_qscheme(weight_per_channel=False), +) +def test_conv_bn_relu_fusion(config): + model = nn.Sequential( + OrderedDict( + { + "conv": nn.Conv2d(1, 20, (3, 3)), + "bn": nn.BatchNorm2d(20), + "act": nn.ReLU(), + } + ) + ) + data = torch.randn(1, 1, 28, 28) + + prepared_model, quantizer = quantize_model(model, data, config) + + assert isinstance(prepared_model.conv, torch.nn.intrinsic.qat.ConvBnReLU2d) + + converted_model = quantizer.finalize(inplace=False) + + assert isinstance(converted_model.conv, torch.ao.nn.intrinsic.ConvReLU2d) + assert isinstance(converted_model.conv[0], torch.ao.nn.quantized.reference.Conv2d) + + +@pytest.mark.parametrize( + "config", + get_configs_for_qscheme() + get_configs_for_qscheme(weight_per_channel=False), +) +@pytest.mark.parametrize("activation_fn", list(_mod_activations)) +def test_conv_bn_act_fusion(config, activation_fn): + model = nn.Sequential(OrderedDict({ + 'conv': nn.Conv2d(1, 20, (3, 3)), + 'bn': nn.BatchNorm2d(20), + 'act': activation_fn(), + })) + data = torch.randn(1, 1, 28, 28) + + prepared_model, quantizer = quantize_model(model, data, config) + + assert isinstance(prepared_model.conv, _qat.ConvBnAct2d) + assert isinstance(prepared_model.conv.act, activation_fn) + + converted_model = quantizer.finalize(inplace=False) + + assert isinstance(converted_model.conv, _quantized.QuantizedConvAct2d) + assert isinstance(converted_model.conv.act, activation_fn) + + +@pytest.mark.parametrize( + "config", + get_configs_for_qscheme() + get_configs_for_qscheme(weight_per_channel=False), +) +def test_linear_relu_fusion(config): + model = nn.Sequential(OrderedDict({"linear": nn.Linear(20, 100), "act": nn.ReLU()})) + data = torch.randn(1, 20) + + prepared_model, quantizer = quantize_model(model, data, config) + + assert isinstance(prepared_model.linear, torch.nn.intrinsic.qat.LinearReLU) + + converted_model = quantizer.finalize(inplace=False) + + assert isinstance(converted_model.linear, torch.ao.nn.intrinsic.LinearReLU) + assert isinstance(converted_model.linear[0], torch.ao.nn.quantized.reference.Linear) + + +@pytest.mark.parametrize( + "config", + get_configs_for_qscheme() + get_configs_for_qscheme(weight_per_channel=False), +) +@pytest.mark.parametrize("activation_fn", list(_mod_activations)) +def test_linear_act_fusion(config, activation_fn): + model = nn.Sequential(OrderedDict({ + 'linear': nn.Linear(20, 100), + 'act': activation_fn(), + })) + data = torch.randn(1, 20) + + prepared_model, quantizer = quantize_model(model, data, config) + + assert isinstance(prepared_model.linear, _qat.LinearAct) + assert isinstance(prepared_model.linear.act, activation_fn) + + converted_model = quantizer.finalize(inplace=False) + + assert isinstance(converted_model.linear, _quantized.QuantizedLinearAct) + assert isinstance(converted_model.linear.act, activation_fn) + + +@pytest.mark.parametrize("activation_fn", [torch.nn.ReLU, torch.nn.ReLU6]) +@pytest.mark.parametrize("layer_and_data", [[nn.Conv2d(1, 20, (3, 3)), torch.randn(1, 1, 28, 28)], + [nn.Linear(20, 100), torch.randn(1, 20)]]) +@pytest.mark.parametrize("bn", [nn.BatchNorm2d(20), None]) +def test_single_act_qscheme_for_symmetric(activation_fn, layer_and_data, bn): + """ + Tests that when qscheme is symmetric, always affine layers have affine qscheme + """ + layer, data = layer_and_data + if isinstance(layer, nn.Conv2d) and bn is not None: + model = nn.Sequential(OrderedDict({ + 'layer': layer, + 'bn': bn, + 'act': activation_fn(), + })) + else: + model = 
nn.Sequential(OrderedDict({ + 'layer': layer, + 'act': activation_fn(), + })) + + prepared_model, _ = quantize_model(model, data) + + assert prepared_model.activation_post_process_0.qscheme == torch.per_tensor_symmetric + assert prepared_model.activation_post_process_1.qscheme == torch.per_tensor_affine + + +@pytest.mark.parametrize("activation_fn", [torch.nn.Hardsigmoid, + torch.nn.Sigmoid, + torch.nn.Softmax, + torch.nn.Tanh]) +@pytest.mark.parametrize("layer_and_data", [[nn.Conv2d(1, 20, (3, 3)), torch.randn(1, 1, 28, 28)], + [nn.Linear(20, 100), torch.randn(1, 20)]]) +@pytest.mark.parametrize("bn", [nn.BatchNorm2d(20), None]) +@pytest.mark.parametrize("config", get_configs_for_qscheme()) +def test_single_fixed_qparams_act_for_symmetric( + activation_fn, layer_and_data, bn, config +): + """ + Tests that when qscheme is symmetric, the qparams of fixed qparam ops are maintained + """ + layer, data = layer_and_data + if isinstance(layer, nn.Conv2d) and bn is not None: + model = nn.Sequential(OrderedDict({ + 'layer': layer, + 'bn': bn, + 'act': activation_fn(), + })) + else: + model = nn.Sequential(OrderedDict({ + 'layer': layer, + 'act': activation_fn(), + })) + + prepared_model, _ = quantize_model(model, data) + + builder = _QConfigMappingBuilder() + qconfig = builder.get_default_qconfig_mapping( + QuantizationScheme.symmetric + ).object_type_qconfigs[activation_fn] + + assert prepared_model.activation_post_process_1.scale == qconfig.activation().scale + assert prepared_model.activation_post_process_1.zero_point == qconfig.activation().zero_point + + +@pytest.mark.parametrize("activation_fn", [nn.ReLU, nn.ReLU6]) +def test_dropout_affine_input(activation_fn): + model = nn.Sequential(OrderedDict({ + 'conv': nn.Conv2d(1, 20, (3, 3)), + 'relu': activation_fn(), + 'dropout': nn.Dropout2d(), + 'leaky_relu': nn.LeakyReLU() + })) + data = torch.randn(1, 1, 28, 28) + + prepared_model, _ = quantize_model(model, data) + + assert prepared_model.activation_post_process_1.qscheme == torch.per_tensor_affine + assert not hasattr(prepared_model, "activation_post_process_2") + assert prepared_model.activation_post_process_3.qscheme == torch.per_tensor_symmetric + + +def test_sequential_network_config_for_symmetric(mnist_model_quantization): + """ + Tests a sequential network with multiple modules is configured correctly. + This network has layers where input and output observers are shared. 
We test
+    that for these layers, the activation quantizers are set correctly for always-affine layers.
+    """
+    data = torch.randn(1, 1, 28, 28)
+    prepared_model, quantizer = quantize_model(mnist_model_quantization, data)
+
+    # verify module fusion
+    assert isinstance(prepared_model.conv1, _qat.ConvBnAct2d)
+    assert isinstance(prepared_model.conv2, _qat.ConvAct2d)
+    assert isinstance(prepared_model.dense1, _qat.LinearAct)
+    assert isinstance(prepared_model.dense2, _qat.LinearAct)
+
+    # verify activation quantizers
+    # after input
+    assert prepared_model.activation_post_process_0.qscheme == torch.per_tensor_symmetric
+    # after conv1
+    assert prepared_model.activation_post_process_1.qscheme == torch.per_tensor_affine
+    # after pool, this is shared with output of conv1
+    assert id(prepared_model.activation_post_process_1) == id(prepared_model.activation_post_process_2)
+    # after conv2
+    assert prepared_model.activation_post_process_3.qscheme == torch.per_tensor_affine
+    # after pool and flatten, shared with output of conv2
+    assert id(prepared_model.activation_post_process_3) == id(prepared_model.activation_post_process_4)
+    assert id(prepared_model.activation_post_process_3) == id(prepared_model.activation_post_process_5)
+    # after linear1
+    assert prepared_model.activation_post_process_6.qscheme == torch.per_tensor_affine
+    # after dropout
+    # we remove activation post process after dropout layer
+    assert not hasattr(prepared_model, "activation_post_process_7")
+    # after linear2, logsoftmax
+    assert prepared_model.activation_post_process_8.qscheme == torch.per_tensor_symmetric
+
+    # convert model and test fusion
+    converted_model = quantizer.finalize(inplace=False)
+
+    # assert converted module fusion
+    assert isinstance(converted_model.conv1, _quantized.QuantizedConvAct2d)
+    assert isinstance(converted_model.conv2, _quantized.QuantizedConvAct2d)
+    assert isinstance(converted_model.dense1, _quantized.QuantizedLinearAct)
+    assert isinstance(converted_model.dense2, _quantized.QuantizedLinearAct)
+
+
+class ConvBlock(nn.Module):
+    def __init__(self, activation):
+        super().__init__()
+        self.conv = nn.Conv2d(1, 20, (3, 3), padding='same')
+        self.activation = activation
+
+    def forward(self, x):
+        return self.activation(self.conv(x))
+
+
+class ResidualBlock(nn.Module):
+    def __init__(self, activation: nn.Module):
+        super().__init__()
+        self.conv = ConvBlock(activation)
+
+    def forward(self, x):
+        return x + self.conv(x)
+
+
+@pytest.mark.parametrize("activation_fn", [torch.nn.functional.relu, torch.nn.functional.relu_])
+def test_functional_relu_qscheme_for_symmetric(activation_fn):
+    class Model(nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.conv1 = nn.Conv2d(1, 20, (3, 3), padding='same')
+            self.conv2 = nn.Conv2d(20, 20, (3, 3), padding='same')
+
+        def forward(self, x):
+            return self.conv2(activation_fn(self.conv1(x)))
+
+    model = Model()
+    data = torch.randn(1, 1, 28, 28)
+
+    prepared_model, _ = quantize_model(model, data)
+
+    if activation_fn == torch.nn.functional.relu:
+        assert prepared_model.activation_post_process_1.qscheme == torch.per_tensor_affine
+    else:
+        assert prepared_model.activation_post_process_2.qscheme == torch.per_tensor_affine
+
+
+@pytest.mark.parametrize("activation_fn", [torch.nn.ReLU, torch.nn.ReLU6])
+def test_addition_of_uint_and_uint_for_symmetric(activation_fn):
+    model = nn.Sequential(OrderedDict({
+        'previous_activation': nn.ReLU(),
+        'res_block': ResidualBlock(activation_fn()),
+    }))
+    data = torch.randn(1, 1, 28, 28)
+
+    prepared_model, _ =
quantize_model(model, data) + + assert prepared_model.activation_post_process_0.qscheme == torch.per_tensor_symmetric + affine_acts = [prepared_model.activation_post_process_1, + prepared_model.activation_post_process_2, prepared_model.activation_post_process_3] + for act in affine_acts: + assert act.qscheme == torch.per_tensor_affine + + +@pytest.mark.parametrize("activation_fn", [torch.nn.ReLU, torch.nn.ReLU6]) +def test_addition_of_int_and_uint_for_symmetric(activation_fn): + model = nn.Sequential(OrderedDict({ + 'previous_activation': nn.LeakyReLU(), + 'res_block': ResidualBlock(activation_fn()), + })) + data = torch.randn(1, 1, 28, 28) + + prepared_model, _ = quantize_model(model, data) + + # relu shares observer with input, so input is affine as well + symmetric_acts = [prepared_model.activation_post_process_0, prepared_model.activation_post_process_1, + prepared_model.activation_post_process_3] + for act in symmetric_acts: + assert act.qscheme == torch.per_tensor_symmetric + # output of conv block is still affine + assert prepared_model.activation_post_process_2.qscheme == torch.per_tensor_affine + + +class ComplexAdd(nn.Module): + """ + a (affine) + + -> c (symmetric) + b (symmetric) + + -> g (symmetric) + d (affine) + + -> f (affine) + e (affine) + """ + + def __init__(self, activation_fn): + super().__init__() + self.lrelu = nn.LeakyReLU() + self.relu1 = activation_fn() + self.relu2 = activation_fn() + self.relu3 = activation_fn() + + def forward(self, x): + a = self.relu1(x) + b = self.lrelu(x) + d = self.relu2(x) + e = self.relu3(x) + c = a + b + f = d + e + g = c + f + return g + + +@pytest.mark.parametrize("activation_fn", [torch.nn.ReLU, torch.nn.ReLU6]) +def test_complex_add(activation_fn): + model = ComplexAdd(activation_fn) + data = torch.randn(1, 1, 28, 28) + + prepared_model, _ = quantize_model(model, data) + + symmetric_acts = [prepared_model.activation_post_process_0, prepared_model.activation_post_process_2, + prepared_model.activation_post_process_5, prepared_model.activation_post_process_7] + for act in symmetric_acts: + assert act.qscheme == torch.per_tensor_symmetric + affine_acts = [prepared_model.activation_post_process_1, prepared_model.activation_post_process_3, + prepared_model.activation_post_process_4, prepared_model.activation_post_process_6] + for act in affine_acts: + assert act.qscheme == torch.per_tensor_affine + + +class ComplexConcatAdd(nn.Module): + """ + conv_c (uint) --- c. 
+ .`-- concat + .--a2 + conv_a (uint) ` + `--a1 `-- add + conv_b (int) ---- b ` + """ + def __init__(self, activation_fn): + super().__init__() + self.conv_a = ConvBlock(activation_fn()) + self.conv_b = ConvBlock(nn.LeakyReLU()) + self.conv_c = ConvBlock(activation_fn()) + + def forward(self, x): + a1 = self.conv_a(x) + b = self.conv_b(x) + ab = a1 + b + c = self.conv_c(x) + ac = torch.cat([a1, c]) + return ab, ac + + +@pytest.mark.parametrize("activation_fn", [torch.nn.ReLU, torch.nn.ReLU6]) +def test_complex_concat_add(activation_fn): + model = ComplexConcatAdd(activation_fn) + data = torch.randn(1, 1, 28, 28) + + prepared_model, _ = quantize_model(model, data) + + symmetric_acts = [prepared_model.activation_post_process_0, prepared_model.activation_post_process_2, + prepared_model.activation_post_process_3] + for act in symmetric_acts: + assert act.qscheme == torch.per_tensor_symmetric + affine_acts = [prepared_model.activation_post_process_1, prepared_model.activation_post_process_4, + prepared_model.activation_post_process_5] + for act in affine_acts: + assert act.qscheme == torch.per_tensor_affine + + +class ConcatBlock(nn.Module): + def __init__(self, *activations: torch.nn.Module): + super().__init__() + self.branches = nn.ModuleList(ConvBlock(act) for act in activations) + + def forward(self, x): + return torch.cat(list(f(x) for f in self.branches)) + + +@pytest.mark.parametrize("activation_fn", [torch.nn.ReLU, torch.nn.ReLU6, torch.nn.LeakyReLU]) +def test_concat_uint_and_int(activation_fn): + model = ConcatBlock(activation_fn(), nn.Identity()) + data = torch.randn(1, 1, 28, 28) + + prepared_model, _ = quantize_model(model, data) + + symmetric_acts = [prepared_model.activation_post_process_0, prepared_model.activation_post_process_2] + for act in symmetric_acts: + assert act.qscheme == torch.per_tensor_symmetric + # these are inputs and output of cat layer, they all share same activation quantization + other_acts = [prepared_model.activation_post_process_1, prepared_model.activation_post_process_3, + prepared_model.activation_post_process_4] + for act in other_acts: + if isinstance(activation_fn(), (torch.nn.ReLU, torch.nn.ReLU6)): + assert act.qscheme == torch.per_tensor_affine + else: + assert act.qscheme == torch.per_tensor_symmetric + + assert id(prepared_model.activation_post_process_1) == id(prepared_model.activation_post_process_3) + assert id(prepared_model.activation_post_process_3) == id(prepared_model.activation_post_process_4) + + +@pytest.mark.parametrize( + "config", get_configs_for_qscheme(activation_dtype=torch.float32) +) +@pytest.mark.parametrize("activation_fn", list(_mod_activations) + [nn.ReLU]) +@pytest.mark.parametrize("bn", [nn.BatchNorm2d(20), None]) +def test_conv_weight_only_quantization(config, activation_fn, bn): + if bn is not None: + model = nn.Sequential( + OrderedDict( + { + "layer": nn.Conv2d(1, 20, (3, 3)), + "bn": bn, + "act": activation_fn(), + } + ) + ) + else: + model = nn.Sequential( + OrderedDict( + { + "layer": nn.Conv2d(1, 20, (3, 3)), + "act": activation_fn(), + } + ) + ) + data = torch.randn(1, 1, 28, 28) + + prepared_model, quantizer = quantize_model(model, data, config) + + if bn is not None: + assert isinstance(prepared_model.layer, _qat.ConvBnAct2d) or isinstance( + prepared_model.layer, torch.nn.intrinsic.qat.ConvBnReLU2d + ) + else: + assert isinstance(prepared_model.layer, _qat.ConvAct2d) or isinstance( + prepared_model.layer, torch.nn.intrinsic.qat.ConvReLU2d + ) + + assert len(list(prepared_model.children())) == 1 + + 
converted_model = quantizer.finalize(inplace=False) + + assert isinstance( + converted_model.layer, _quantized.QuantizedConvAct2d + ) or isinstance(converted_model.layer[0], torch.ao.nn.quantized.reference.Conv2d) + + +@pytest.mark.parametrize( + "config", get_configs_for_qscheme(activation_dtype=torch.float32) +) +@pytest.mark.parametrize("activation_fn", list(_mod_activations) + [nn.ReLU]) +def test_linear_weight_only_quantization(config, activation_fn): + model = nn.Sequential( + OrderedDict( + { + "layer": nn.Linear(20, 100), + "act": activation_fn(), + } + ) + ) + data = torch.randn(1, 20) + + prepared_model, quantizer = quantize_model(model, data, config) + + assert isinstance(prepared_model.layer, _qat.LinearAct) or isinstance( + prepared_model.layer, torch.nn.intrinsic.qat.LinearReLU + ) + + assert len(list(prepared_model.children())) == 1 + + converted_model = quantizer.finalize(inplace=False) + + assert isinstance( + converted_model.layer, _quantized.QuantizedLinearAct + ) or isinstance(converted_model.layer[0], torch.ao.nn.quantized.reference.Linear) + + +# @pytest.mark.parametrize("activation_dtype", [torch.float32, torch.quint8]) +# TODO: Fix quantization of embedding layer when activation dtype is quint8 +@pytest.mark.parametrize("activation_dtype", [torch.float32]) +def test_embedding_layer_quantization(activation_dtype): + model = nn.Sequential( + OrderedDict( + { + "embedding": nn.Embedding(10, 10), + "linear": nn.Linear(10, 10), + } + ) + ) + data = torch.randint(0, 10, (1, 10)) + + configs = get_configs_for_qscheme(activation_dtype) + + for config in configs: + prepared_model, quantizer = quantize_model(model, data, config) + + assert isinstance(prepared_model.embedding, torch.nn.qat.Embedding) + + if activation_dtype == torch.float32: + assert len(list(prepared_model.children())) == 2 + else: + assert len(list(prepared_model.children())) == 4 + assert prepared_model.activation_post_process_0.dtype == torch.quint8 + assert prepared_model.activation_post_process_1.dtype == torch.quint8 + + if config.global_config.quantization_scheme == QuantizationScheme.symmetric: + assert ( + prepared_model.embedding.weight_fake_quant.qscheme + == torch.per_channel_symmetric + ) + else: + assert ( + prepared_model.embedding.weight_fake_quant.qscheme + == torch.per_channel_affine + ) + + converted_model = quantizer.finalize(inplace=False) + + assert isinstance( + converted_model.embedding, torch.ao.nn.quantized.reference.Embedding + ) + assert isinstance( + converted_model.linear, torch.ao.nn.quantized.reference.Linear + ) + + +@pytest.mark.parametrize("config", get_configs_for_qscheme()) +@pytest.mark.parametrize("activation_fn", list(_mod_activations) + [nn.ReLU]) +@pytest.mark.parametrize("elementwise_op", [operator.add, torch.add, operator.mul, torch.mul]) +def test_elementwise_op_act_fusion(config, activation_fn, elementwise_op): + class ElementWiseActModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv1 = torch.nn.Conv2d(48, 48, (3, 3), (1, 1), padding=(1, 1)) + self.act = activation_fn() + + def forward(self, x): + return self.act(elementwise_op(x, self.conv1(x))) + + model = ElementWiseActModule() + data = torch.randn(1, 48, 224, 224) + + prepared_model, quantizer = quantize_model(model, data, config) + + for node in prepared_model.graph.nodes: + if node.op == "call_function": + assert isinstance(find_module(prepared_model, node.next), activation_fn) + assert is_activation_post_process( + find_module(prepared_model, node.next.next) + ) + + 
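The tests above, and the skip-layer test that follows, all drive the same prepare → step → calibrate → finalize flow through the quantize_model helper defined near the top of this file. For reference, a minimal sketch of that flow outside of pytest, using only the public API exercised in this patch, looks roughly like the following; the layer shapes, milestone values, and the single calibration forward pass (standing in for a real QAT training loop) are illustrative assumptions.

    from collections import OrderedDict

    import torch
    import torch.nn as nn

    from coremltools.optimize.torch.quantization import LinearQuantizer, LinearQuantizerConfig

    model = nn.Sequential(OrderedDict({
        "conv": nn.Conv2d(1, 20, (3, 3)),
        "act": nn.ReLU(),
    }))
    data = torch.randn(1, 1, 28, 28)

    # Same shape of config dictionary that get_configs_for_qscheme() builds above.
    config = LinearQuantizerConfig.from_dict({
        "global_config": {
            "quantization_scheme": "symmetric",
            "milestones": [0, 0, 10, 10],
            "weight_per_channel": True,
        }
    })

    quantizer = LinearQuantizer(model, config)
    prepared = quantizer.prepare(example_inputs=data, inplace=False)

    # Advance the observer/fake-quant milestones and run a calibration pass;
    # in real usage this would be the QAT training loop.
    quantizer.step()
    prepared(data)

    # Convert the fused QAT modules into their reference quantized counterparts.
    converted = quantizer.finalize(inplace=False)

The milestones entries control when observers and fake-quantization are enabled and when observer and batch-norm statistics are later frozen; test_quantizer_step_mechanism in test_quantizer.py, later in this patch, steps through that behavior explicitly.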
+@pytest.mark.parametrize("quantization_scheme", ["symmetric", "affine"]) +@pytest.mark.parametrize( + "skipped_layers", + [ + ["conv1", "pool1"], + ["conv2", "pool1", "pool2"], + ["dense1", "flatten", "dropout"], + ["dense2", "dropout"], + ], +) +def test_skipping_quantization_for_layers( + mnist_model_quantization, quantization_scheme, skipped_layers +): + config_s = LinearQuantizerConfig.from_dict( + { + "global_config": { + "quantization_scheme": quantization_scheme, + "milestones": [0, 0, 100, 100], + }, + "module_name_configs": { + skipped_layer: None for skipped_layer in skipped_layers + }, + } + ) + config_f = LinearQuantizerConfig.from_dict( + { + "global_config": { + "quantization_scheme": quantization_scheme, + "milestones": [0, 0, 100, 100], + } + } + ) + data = torch.randn(1, 1, 28, 28) + prepared_model_s, quantizer_s = quantize_model( + mnist_model_quantization, data, config_s + ) + prepared_model_f, quantizer_f = quantize_model( + mnist_model_quantization, data, config_f + ) + + skipped_mod_name = skipped_layers[0] + skipped_mod = mnist_model_quantization.get_submodule(skipped_mod_name) + if isinstance(skipped_mod, nn.Conv2d): + submod_s = prepared_model_s.get_submodule(skipped_mod_name) + submod_f = prepared_model_f.get_submodule(skipped_mod_name) + assert isinstance(submod_s, _fused.ConvBnAct2d) or isinstance( + submod_s, _fused.ConvAct2d + ) + assert not hasattr(submod_s.conv, "weight_fake_quant") + assert isinstance(submod_f, _qat.ConvBnAct2d) or isinstance( + submod_f, _qat.ConvAct2d + ) + assert hasattr(submod_f.conv, "weight_fake_quant") + elif isinstance(skipped_mod, nn.Linear): + submod_s = prepared_model_s.get_submodule(skipped_mod_name) + submod_f = prepared_model_f.get_submodule(skipped_mod_name) + assert isinstance(submod_s, _fused.LinearAct) + assert not hasattr(submod_s.linear, "weight_fake_quant") + assert isinstance(submod_f, _qat.LinearAct) + assert hasattr(submod_f.linear, "weight_fake_quant") + + for node in prepared_model_s.graph.nodes: + if node.target == skipped_mod_name: + for consumer in node.users: + assert "activation_post_process" not in consumer.target + for producer in node.args: + assert "activation_post_process" not in producer.target + + for node in prepared_model_f.graph.nodes: + if node.target == skipped_mod_name: + for consumer in node.users: + assert "activation_post_process" in consumer.target + for producer in node.args: + if producer.target != "dropout": + # for some nodes, if producer is dropout, we won't have activation post process + assert "activation_post_process" in producer.target diff --git a/coremltools/test/optimize/torch/quantization/test_quantizer.py b/coremltools/test/optimize/torch/quantization/test_quantizer.py new file mode 100644 index 000000000..20177a3e0 --- /dev/null +++ b/coremltools/test/optimize/torch/quantization/test_quantizer.py @@ -0,0 +1,235 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. 
+# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +from collections import OrderedDict + +import cattrs +import pytest +import torch +import torch.ao.quantization +import torch.nn as nn +import torch.nn.intrinsic +import torch.nn.intrinsic.qat +import torch.nn.quantized +import torch.nn.quantized.modules.utils + +from coremltools.optimize.torch.quantization import ( + LinearQuantizer, + LinearQuantizerConfig, + ModuleLinearQuantizerConfig, +) + + +@pytest.mark.parametrize( + "option_and_value", [ + ("weight_dtype", torch.int32), + ("activation_dtype", torch.int8), + ("milestones", [0, 2]) + ] +) +def test_config_illegal_options(option_and_value): + option, value = option_and_value + with pytest.raises(cattrs.errors.ClassValidationError): + LinearQuantizerConfig.from_dict({"global_config": {option: value}}) + + +@pytest.mark.parametrize( + "config_dict", + [ + {"module_type_configs": {nn.Linear: {"weight_dtype": torch.quint8}}}, + {"module_name_configs": {"conv2d": {"weight_dtype": torch.quint8}}}, + {"global_config": {"weight_dtype": torch.quint8}}, + {}, + ], +) +def test_linear_quantizer_config_global_config_set(config_dict): + config = LinearQuantizerConfig.from_dict(config_dict) + if len(config_dict) == 0: + assert config.global_config == ModuleLinearQuantizerConfig() + else: + keys = ["global_config", "module_type_configs", "module_name_configs"] + for key in keys: + if key not in config_dict: + param_in_config = getattr(config, key) + assert param_in_config is None or len(param_in_config) == 0 + if "global_config" in config_dict: + assert config.global_config.weight_dtype == config_dict["global_config"]["weight_dtype"] + if "module_name_configs" in config_dict: + for key in config_dict["module_name_configs"]: + assert config.module_name_configs[key].weight_dtype == \ + config_dict["module_name_configs"][key]["weight_dtype"] + if "module_type_configs" in config_dict: + for key in config_dict["module_type_configs"]: + assert config.module_type_configs[key].weight_dtype == \ + config_dict["module_type_configs"][key]["weight_dtype"] + + +@pytest.mark.parametrize( + "config_dict", + [ + { + "global_config": {"quantization_scheme": "affine"}, + "module_name_configs": {"conv1": {"quantization_scheme": "symmetric"}}, + }, + { + "global_config": {"quantization_scheme": "affine"}, + "module_type_configs": {nn.Linear: {"quantization_scheme": "symmetric"}}, + }, + { + "module_name_configs": { + "conv1": {"quantization_scheme": "affine"}, + "conv2": {"quantization_scheme": "symmetric"}, + } + }, + { + "module_type_configs": { + nn.Linear: {"quantization_scheme": "symmetric"}, + "Conv2d": {"quantization_scheme": "affine"}, + } + }, + { + "module_type_configs": {nn.Linear: {"quantization_scheme": "symmetric"}}, + "module_name_configs": {"conv1": {"quantization_scheme": "affine"}}, + }, + {"global_config": {"activation_dtype": "quint8", "weight_dtype": "float32"}}, + { + "module_name_configs": { + "conv1": {"activation_dtype": "quint8", "weight_dtype": "float32"} + } + }, + { + "module_name_configs": { + "Conv2d": {"activation_dtype": "quint8", "weight_dtype": "float32"} + } + }, + ], +) +def test_linear_quantizer_config_failure_modes(config_dict): + with pytest.raises(Exception): + LinearQuantizerConfig.from_dict(config_dict) + + +def test_linear_quantizer_config_different_config_success(): + config_dict = { + "global_config": {"quantization_scheme": "affine"}, + 
"module_name_configs": { + "conv1": {"quantization_scheme": "affine"}, + "conv2": {"quantization_scheme": "affine"}, + }, + "module_type_configs": {nn.Linear: {"quantization_scheme": "affine"}}, + } + LinearQuantizerConfig.from_dict(config_dict) + + +@pytest.mark.parametrize( + "config_dict", + [ + { + "global_config": {"quantization_scheme": "affine"}, + "module_name_configs": { + "conv1": {"quantization_scheme": "affine"}, + "conv2": {"quantization_scheme": "affine"}, + }, + "module_type_configs": {nn.Linear: {"quantization_scheme": "affine"}}, + }, + { + "module_name_configs": { + "conv1": {"quantization_scheme": "affine"}, + "conv2": {"quantization_scheme": "affine"}, + } + }, + {"module_type_configs": {nn.Linear: {"quantization_scheme": "affine"}}}, + {}, + ], +) +def test_linear_quantizer_quantization_scheme_setting(config_dict): + model = nn.Sequential(OrderedDict({ + 'conv': nn.Conv2d(1, 20, (3, 3)), + 'relu': nn.ReLU(), + })) + config = LinearQuantizerConfig.from_dict(config_dict) + quantizer = LinearQuantizer(model, config) + + def_quantization_scheme = ModuleLinearQuantizerConfig().quantization_scheme.value + quantization_scheme = quantizer._quantization_scheme.value + if len(config_dict) == 0: + assert def_quantization_scheme == quantization_scheme + else: + assert quantization_scheme == "affine" + + +@pytest.mark.parametrize("quantization_scheme", ["symmetric", "affine"]) +def test_activation_defaults(quantization_scheme): + model = nn.Sequential(OrderedDict({ + 'conv': nn.Conv2d(1, 20, (3, 3)), + 'relu': nn.ReLU(), + })) + config = LinearQuantizerConfig.from_dict( + {"global_config": { + "quantization_scheme": quantization_scheme, + "milestones": [0, 2, 3, 3], + }} + ) + quantizer = LinearQuantizer(model, config) + model = quantizer.prepare(example_inputs=torch.randn(1, 1, 28, 28)) + + assert isinstance(model.conv, torch.nn.intrinsic.qat.ConvReLU2d) + assert model.activation_post_process_0.dtype == torch.quint8 + if quantization_scheme == "symmetric": + assert model.activation_post_process_0.qscheme == torch.per_tensor_symmetric + else: + assert model.activation_post_process_0.qscheme == torch.per_tensor_affine + assert model.activation_post_process_1.dtype == torch.quint8 + assert model.activation_post_process_1.qscheme == torch.per_tensor_affine + + +@pytest.mark.parametrize("quantization_scheme", ["symmetric", "affine"]) +def test_quantizer_step_mechanism(quantization_scheme): + model = nn.Sequential(OrderedDict({ + 'conv': nn.Conv2d(1, 20, (3, 3)), + 'bn': nn.BatchNorm2d(20), + 'relu': nn.ReLU(), + })) + + config = LinearQuantizerConfig.from_dict( + {"global_config": { + "quantization_scheme": quantization_scheme, + "milestones": [0, 1, 2, 3], + }} + ) + quantizer = LinearQuantizer(model, config) + model = quantizer.prepare(example_inputs=torch.randn(1, 1, 28, 28)) + + assert not model.activation_post_process_0.observer_enabled + assert not model.activation_post_process_0.fake_quant_enabled + assert not model.activation_post_process_1.observer_enabled + assert not model.activation_post_process_1.fake_quant_enabled + + for idx in range(4): + quantizer.step() + if idx == 0: + assert not model.conv.freeze_bn + assert model.activation_post_process_0.observer_enabled + assert not model.activation_post_process_0.fake_quant_enabled + assert model.activation_post_process_1.observer_enabled + assert not model.activation_post_process_1.fake_quant_enabled + if idx == 1: + assert not model.conv.freeze_bn + assert model.activation_post_process_0.observer_enabled + assert 
model.activation_post_process_0.fake_quant_enabled + assert model.activation_post_process_1.observer_enabled + assert model.activation_post_process_1.fake_quant_enabled + if idx == 2: + assert not model.conv.freeze_bn + assert not model.activation_post_process_0.observer_enabled + assert model.activation_post_process_0.fake_quant_enabled + assert not model.activation_post_process_1.observer_enabled + assert model.activation_post_process_1.fake_quant_enabled + if idx == 3: + assert model.conv.freeze_bn + assert not model.activation_post_process_0.observer_enabled + assert model.activation_post_process_0.fake_quant_enabled + assert not model.activation_post_process_1.observer_enabled + assert model.activation_post_process_1.fake_quant_enabled diff --git a/coremltools/test/optimize/torch/test_api_surface.py b/coremltools/test/optimize/torch/test_api_surface.py new file mode 100644 index 000000000..8862c59ce --- /dev/null +++ b/coremltools/test/optimize/torch/test_api_surface.py @@ -0,0 +1,116 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +from typing import List + +import coremltools.optimize.torch + + +def _get_visible_items(d): + return [x for x in dir(d) if not x.startswith("_")] + + +def _check_visible_modules(actual: List[str], expected: List[str]): + assert set(actual) == set(expected), "API mis-matched. Got %s, expected %s" % ( + actual, + expected, + ) + + +class TestApiVisibilities: + """Test APIs visible to users""" + + def test_top_level(self): + # coremltools.optimize.torch.* + expected = [ + "base_model_optimizer", + "optimization_config", + "palettization", + "pruning", + "quantization", + ] + visible_modules = _get_visible_items(coremltools.optimize.torch) + _check_visible_modules(visible_modules, expected) + + def test_base_model_optimizer_module(self): + # coremltools.optimize.torch.base_model_optimizer.* + expected = ["BaseModelOptimizer"] + visible_modules = _get_visible_items(coremltools.optimize.torch.base_model_optimizer) + _check_visible_modules(visible_modules, expected) + + def test_optimization_config_module(self): + # coremltools.optimize.torch.optimization_config.* + expected = ["ModuleOptimizationConfig", "OptimizationConfig"] + visible_modules = _get_visible_items(coremltools.optimize.torch.optimization_config) + _check_visible_modules(visible_modules, expected) + + def test_palettization_module(self): + # coremltools.optimize.torch.palettization.* + expected = [ + "FakePalettize", + "DKMPalettizer", + "DKMPalettizerConfig", + "ModuleDKMPalettizerConfig", + "palettization_config", + "fake_palettize", + "palettizer", + ] + visible_modules = _get_visible_items(coremltools.optimize.torch.palettization) + _check_visible_modules(visible_modules, expected) + # coremltools.optimize.torch.palettization.palettizer.* + expected = [ + "Palettizer", + "DKMPalettizer", + ] + visible_modules = _get_visible_items(coremltools.optimize.torch.palettization.palettizer) + _check_visible_modules(visible_modules, expected) + + def test_pruning_module(self): + # coremltools.optimize.torch.pruning.* + expected = [ + "ConstantSparsityScheduler", + "MagnitudePruner", + "MagnitudePrunerConfig", + "ModuleMagnitudePrunerConfig", + "PolynomialDecayScheduler", + "magnitude_pruner", + "pruning_scheduler", + ] + visible_modules = _get_visible_items(coremltools.optimize.torch.pruning) + 
_check_visible_modules(visible_modules, expected) + + def test_quantization_module(self): + # coremltools.optimize.torch.quantization.* + expected = [ + "LinearQuantizer", + "LinearQuantizerConfig", + "ModuleLinearQuantizerConfig", + "ObserverType", + "QuantizationScheme", + "quantizer", + "quantization_config", + "modules", + ] + visible_modules = _get_visible_items(coremltools.optimize.torch.quantization) + _check_visible_modules(visible_modules, expected) + # coremltools.optimize.torch.quantization.LinearQuantizer.* + expected = [ + "finalize", + "prepare", + "step", + "report", + "supported_modules", + ] + visible_modules = _get_visible_items( + coremltools.optimize.torch.quantization.LinearQuantizer + ) + _check_visible_modules(visible_modules, expected) + # coremltools.optimize.torch.quantization.quantizer.* + expected = [ + "Quantizer", + "LinearQuantizer", + ] + visible_modules = _get_visible_items(coremltools.optimize.torch.quantization.quantizer) + _check_visible_modules(visible_modules, expected) diff --git a/coremltools/test/optimize/torch/test_base_optimizer.py b/coremltools/test/optimize/torch/test_base_optimizer.py new file mode 100644 index 000000000..7cd5ae95b --- /dev/null +++ b/coremltools/test/optimize/torch/test_base_optimizer.py @@ -0,0 +1,31 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import pytest +import torch + +from coremltools.optimize.torch.palettization import DKMPalettizer +from coremltools.optimize.torch.pruning import MagnitudePruner +from coremltools.optimize.torch.quantization import LinearQuantizer + + +@pytest.mark.parametrize("optimizer", [MagnitudePruner, LinearQuantizer, DKMPalettizer]) +@pytest.mark.parametrize("inplace", [True, False]) +def test_report_model_train_state(optimizer, inplace): + model = torch.nn.Sequential(torch.nn.Conv2d(1, 31, 2, 1), torch.nn.Conv2d(31, 21, 2, 1)) + + opt = optimizer(model) + if optimizer == LinearQuantizer: + p_model = opt.prepare(inplace=inplace, example_inputs=torch.randn(1)) + else: + p_model = opt.prepare(inplace=inplace) + + p_model.train() + opt.report() + assert p_model.training + + p_model.eval() + opt.report() + assert not p_model.training diff --git a/coremltools/test/optimize/torch/utils.py b/coremltools/test/optimize/torch/utils.py new file mode 100644 index 000000000..2ddcb3d65 --- /dev/null +++ b/coremltools/test/optimize/torch/utils.py @@ -0,0 +1,56 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import pathlib +import sys + +from packaging import version + + +def _python_version(): + """ + Return python version as a tuple of integers + """ + version = sys.version.split(" ")[0] + version = list(map(int, list(version.split(".")))) + return tuple(version) + + +def _macos_version(): + """ + Returns macOS version as a tuple of integers, making it easy to do proper + version comparisons. On non-Macs, it returns an empty tuple. 
+ """ + if sys.platform == "darwin": + try: + import subprocess + + ver_str = ( + subprocess.run(["sw_vers", "-productVersion"], stdout=subprocess.PIPE) + .stdout.decode("utf-8") + .strip("\n") + ) + return tuple([int(v) for v in ver_str.split(".")]) + except: + raise Exception("Unable to detemine the macOS version") + return () + + +def version_ge(module, target_version): + """ + Example usage: + >>> import torch # v1.5.0 + >>> version_ge(torch, '1.6.0') # False + """ + return version.parse(module.__version__) >= version.parse(target_version) + + +def version_lt(module, target_version): + """See version_ge""" + return version.parse(module.__version__) < version.parse(target_version) + + +def test_data_path(): + return pathlib.Path(__file__).parent.absolute() / "_test_data" diff --git a/coremltools/test/pipeline/test_pipeline.py b/coremltools/test/pipeline/test_pipeline.py index 1729a3fae..236275997 100644 --- a/coremltools/test/pipeline/test_pipeline.py +++ b/coremltools/test/pipeline/test_pipeline.py @@ -184,9 +184,17 @@ def test_pipeline_regression_creation(self): input_names = self.scikit_data.feature_names output_name = "target" - p_regressor = converter.convert( - self.scikit_model, input_names, "target" - ).get_spec() + p_regressor_model = converter.convert(self.scikit_model, input_names, "target") + + x = dict(zip(self.scikit_data["feature_names"], self.scikit_data["data"][0])) + y = p_regressor_model.predict(x) + self.assertIsNotNone(y) + + with tempfile.TemporaryDirectory() as save_dir: + p_regressor_model.save(save_dir + "/test.mlmodel") + + p_regressor = p_regressor_model.get_spec() + self.assertIsNotNone(p_regressor) self.assertEqual(len(p_regressor.pipelineRegressor.pipeline.models), 2) diff --git a/coremltools/version.py b/coremltools/version.py index b1b161143..b17d2e113 100644 --- a/coremltools/version.py +++ b/coremltools/version.py @@ -4,4 +4,4 @@ # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -__version__ = "6.3.0" # VERSION_STRING +__version__ = "7.0b1" # VERSION_STRING diff --git a/deps/FP16/LICENSE b/deps/FP16/LICENSE new file mode 100644 index 000000000..eabec6c86 --- /dev/null +++ b/deps/FP16/LICENSE @@ -0,0 +1,11 @@ +The MIT License (MIT) + +Copyright (c) 2017 Facebook Inc. +Copyright (c) 2017 Georgia Institute of Technology +Copyright 2019 Google LLC + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
diff --git a/deps/FP16/README.md b/deps/FP16/README.md new file mode 100644 index 000000000..6cba15862 --- /dev/null +++ b/deps/FP16/README.md @@ -0,0 +1,20 @@ +# FP16 +Header-only library for conversion to/from half-precision floating point formats + +## Features + +- Supports IEEE and ARM alternative half-precision floating-point format + - Property converts infinities and NaNs + - Properly converts denormal numbers, even on systems without denormal support +- Header-only library, no installation or build required +- Compatible with C99 and C++11 +- Fully covered with unit tests and microbenchmarks + +## Acknowledgements + +[![HPC Garage logo](https://github.com/Maratyszcza/PeachPy/blob/master/logo/hpcgarage.png)](http://hpcgarage.org) +[![Georgia Tech College of Computing logo](https://github.com/Maratyszcza/PeachPy/blob/master/logo/college-of-computing.gif)](http://www.cse.gatech.edu/) + +The library is developed by [Marat Dukhan](http://www.maratdukhan.com) of Georgia Tech. FP16 is a research project at [Richard Vuduc](http://vuduc.org)'s HPC Garage lab in the Georgia Institute of Technology, College of Computing, School of Computational Science and Engineering. + +This material is based upon work supported by the U.S. National Science Foundation (NSF) Award Number 1339745. Any opinions, findings and conclusions or recommendations expressed in this material are those of the authors and do not necessarily reflect those of NSF. diff --git a/deps/FP16/include/fp16.h b/deps/FP16/include/fp16.h new file mode 100644 index 000000000..9d7366e99 --- /dev/null +++ b/deps/FP16/include/fp16.h @@ -0,0 +1,11 @@ +#pragma once +#ifndef FP16_H +#define FP16_H + +#include + +#if defined(PSIMD_H) +#include +#endif + +#endif /* FP16_H */ diff --git a/deps/FP16/include/fp16/bitcasts.h b/deps/FP16/include/fp16/bitcasts.h new file mode 100644 index 000000000..86a4e22c4 --- /dev/null +++ b/deps/FP16/include/fp16/bitcasts.h @@ -0,0 +1,92 @@ +#pragma once +#ifndef FP16_BITCASTS_H +#define FP16_BITCASTS_H + +#if defined(__cplusplus) && (__cplusplus >= 201103L) + #include +#elif !defined(__OPENCL_VERSION__) + #include +#endif + +#if defined(__INTEL_COMPILER) + #include +#endif + +#if defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64)) + #include +#endif + + +static inline float fp32_from_bits(uint32_t w) { +#if defined(__OPENCL_VERSION__) + return as_float(w); +#elif defined(__CUDA_ARCH__) + return __uint_as_float((unsigned int) w); +#elif defined(__INTEL_COMPILER) + return _castu32_f32(w); +#elif defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64)) + return _CopyFloatFromInt32((__int32) w); +#else + union { + uint32_t as_bits; + float as_value; + } fp32 = { w }; + return fp32.as_value; +#endif +} + +static inline uint32_t fp32_to_bits(float f) { +#if defined(__OPENCL_VERSION__) + return as_uint(f); +#elif defined(__CUDA_ARCH__) + return (uint32_t) __float_as_uint(f); +#elif defined(__INTEL_COMPILER) + return _castf32_u32(f); +#elif defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64)) + return (uint32_t) _CopyInt32FromFloat(f); +#else + union { + float as_value; + uint32_t as_bits; + } fp32 = { f }; + return fp32.as_bits; +#endif +} + +static inline double fp64_from_bits(uint64_t w) { +#if defined(__OPENCL_VERSION__) + return as_double(w); +#elif defined(__CUDA_ARCH__) + return __longlong_as_double((long long) w); +#elif defined(__INTEL_COMPILER) + return _castu64_f64(w); +#elif defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64)) + return _CopyDoubleFromInt64((__int64) w); +#else + 
union { + uint64_t as_bits; + double as_value; + } fp64 = { w }; + return fp64.as_value; +#endif +} + +static inline uint64_t fp64_to_bits(double f) { +#if defined(__OPENCL_VERSION__) + return as_ulong(f); +#elif defined(__CUDA_ARCH__) + return (uint64_t) __double_as_longlong(f); +#elif defined(__INTEL_COMPILER) + return _castf64_u64(f); +#elif defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64)) + return (uint64_t) _CopyInt64FromDouble(f); +#else + union { + double as_value; + uint64_t as_bits; + } fp64 = { f }; + return fp64.as_bits; +#endif +} + +#endif /* FP16_BITCASTS_H */ diff --git a/deps/FP16/include/fp16/fp16.h b/deps/FP16/include/fp16/fp16.h new file mode 100644 index 000000000..b95aa15f5 --- /dev/null +++ b/deps/FP16/include/fp16/fp16.h @@ -0,0 +1,451 @@ +#pragma once +#ifndef FP16_FP16_H +#define FP16_FP16_H + +#if defined(__cplusplus) && (__cplusplus >= 201103L) + #include + #include +#elif !defined(__OPENCL_VERSION__) + #include + #include +#endif + +#ifdef _MSC_VER + #include +#endif + +#include + + +/* + * Convert a 16-bit floating-point number in IEEE half-precision format, in bit representation, to + * a 32-bit floating-point number in IEEE single-precision format, in bit representation. + * + * @note The implementation doesn't use any floating-point operations. + */ +static inline uint32_t fp16_ieee_to_fp32_bits(uint16_t h) { + /* + * Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word: + * +---+-----+------------+-------------------+ + * | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| + * +---+-----+------------+-------------------+ + * Bits 31 26-30 16-25 0-15 + * + * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits. + */ + const uint32_t w = (uint32_t) h << 16; + /* + * Extract the sign of the input number into the high bit of the 32-bit word: + * + * +---+----------------------------------+ + * | S |0000000 00000000 00000000 00000000| + * +---+----------------------------------+ + * Bits 31 0-31 + */ + const uint32_t sign = w & UINT32_C(0x80000000); + /* + * Extract mantissa and biased exponent of the input number into the bits 0-30 of the 32-bit word: + * + * +---+-----+------------+-------------------+ + * | 0 |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| + * +---+-----+------------+-------------------+ + * Bits 30 27-31 17-26 0-16 + */ + const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF); + /* + * Renorm shift is the number of bits to shift mantissa left to make the half-precision number normalized. + * If the initial number is normalized, some of its high 6 bits (sign == 0 and 5-bit exponent) equals one. + * In this case renorm_shift == 0. If the number is denormalize, renorm_shift > 0. Note that if we shift + * denormalized nonsign by renorm_shift, the unit bit of mantissa will shift into exponent, turning the + * biased exponent into 1, and making mantissa normalized (i.e. without leading 1). + */ +#ifdef _MSC_VER + unsigned long nonsign_bsr; + _BitScanReverse(&nonsign_bsr, (unsigned long) nonsign); + uint32_t renorm_shift = (uint32_t) nonsign_bsr ^ 31; +#else + uint32_t renorm_shift = __builtin_clz(nonsign); +#endif + renorm_shift = renorm_shift > 5 ? renorm_shift - 5 : 0; + /* + * Iff half-precision number has exponent of 15, the addition overflows it into bit 31, + * and the subsequent shift turns the high 9 bits into 1. Thus + * inf_nan_mask == + * 0x7F800000 if the half-precision number had exponent of 15 (i.e. 
was NaN or infinity) + * 0x00000000 otherwise + */ + const int32_t inf_nan_mask = ((int32_t) (nonsign + 0x04000000) >> 8) & INT32_C(0x7F800000); + /* + * Iff nonsign is 0, it overflows into 0xFFFFFFFF, turning bit 31 into 1. Otherwise, bit 31 remains 0. + * The signed shift right by 31 broadcasts bit 31 into all bits of the zero_mask. Thus + * zero_mask == + * 0xFFFFFFFF if the half-precision number was zero (+0.0h or -0.0h) + * 0x00000000 otherwise + */ + const int32_t zero_mask = (int32_t) (nonsign - 1) >> 31; + /* + * 1. Shift nonsign left by renorm_shift to normalize it (if the input was denormal) + * 2. Shift nonsign right by 3 so the exponent (5 bits originally) becomes an 8-bit field and 10-bit mantissa + * shifts into the 10 high bits of the 23-bit mantissa of IEEE single-precision number. + * 3. Add 0x70 to the exponent (starting at bit 23) to compensate the different in exponent bias + * (0x7F for single-precision number less 0xF for half-precision number). + * 4. Subtract renorm_shift from the exponent (starting at bit 23) to account for renormalization. As renorm_shift + * is less than 0x70, this can be combined with step 3. + * 5. Binary OR with inf_nan_mask to turn the exponent into 0xFF if the input was NaN or infinity. + * 6. Binary ANDNOT with zero_mask to turn the mantissa and exponent into zero if the input was zero. + * 7. Combine with the sign of the input number. + */ + return sign | ((((nonsign << renorm_shift >> 3) + ((0x70 - renorm_shift) << 23)) | inf_nan_mask) & ~zero_mask); +} + +/* + * Convert a 16-bit floating-point number in IEEE half-precision format, in bit representation, to + * a 32-bit floating-point number in IEEE single-precision format. + * + * @note The implementation relies on IEEE-like (no assumption about rounding mode and no operations on denormals) + * floating-point operations and bitcasts between integer and floating-point variables. + */ +static inline float fp16_ieee_to_fp32_value(uint16_t h) { + /* + * Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word: + * +---+-----+------------+-------------------+ + * | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| + * +---+-----+------------+-------------------+ + * Bits 31 26-30 16-25 0-15 + * + * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits. 
+ */ + const uint32_t w = (uint32_t) h << 16; + /* + * Extract the sign of the input number into the high bit of the 32-bit word: + * + * +---+----------------------------------+ + * | S |0000000 00000000 00000000 00000000| + * +---+----------------------------------+ + * Bits 31 0-31 + */ + const uint32_t sign = w & UINT32_C(0x80000000); + /* + * Extract mantissa and biased exponent of the input number into the high bits of the 32-bit word: + * + * +-----+------------+---------------------+ + * |EEEEE|MM MMMM MMMM|0 0000 0000 0000 0000| + * +-----+------------+---------------------+ + * Bits 27-31 17-26 0-16 + */ + const uint32_t two_w = w + w; + + /* + * Shift mantissa and exponent into bits 23-28 and bits 13-22 so they become mantissa and exponent + * of a single-precision floating-point number: + * + * S|Exponent | Mantissa + * +-+---+-----+------------+----------------+ + * |0|000|EEEEE|MM MMMM MMMM|0 0000 0000 0000| + * +-+---+-----+------------+----------------+ + * Bits | 23-31 | 0-22 + * + * Next, there are some adjustments to the exponent: + * - The exponent needs to be corrected by the difference in exponent bias between single-precision and half-precision + * formats (0x7F - 0xF = 0x70) + * - Inf and NaN values in the inputs should become Inf and NaN values after conversion to the single-precision number. + * Therefore, if the biased exponent of the half-precision input was 0x1F (max possible value), the biased exponent + * of the single-precision output must be 0xFF (max possible value). We do this correction in two steps: + * - First, we adjust the exponent by (0xFF - 0x1F) = 0xE0 (see exp_offset below) rather than by 0x70 suggested + * by the difference in the exponent bias (see above). + * - Then we multiply the single-precision result of exponent adjustment by 2**(-112) to reverse the effect of + * exponent adjustment by 0xE0 less the necessary exponent adjustment by 0x70 due to difference in exponent bias. + * The floating-point multiplication hardware would ensure than Inf and NaN would retain their value on at least + * partially IEEE754-compliant implementations. + * + * Note that the above operations do not handle denormal inputs (where biased exponent == 0). However, they also do not + * operate on denormal inputs, and do not produce denormal results. + */ + const uint32_t exp_offset = UINT32_C(0xE0) << 23; +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) + const float exp_scale = 0x1.0p-112f; +#else + const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); +#endif + const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; + + /* + * Convert denormalized half-precision inputs into single-precision results (always normalized). + * Zero inputs are also handled here. + * + * In a denormalized number the biased exponent is zero, and mantissa has on-zero bits. + * First, we shift mantissa into bits 0-9 of the 32-bit word. + * + * zeros | mantissa + * +---------------------------+------------+ + * |0000 0000 0000 0000 0000 00|MM MMMM MMMM| + * +---------------------------+------------+ + * Bits 10-31 0-9 + * + * Now, remember that denormalized half-precision numbers are represented as: + * FP16 = mantissa * 2**(-24). + * The trick is to construct a normalized single-precision number with the same mantissa and thehalf-precision input + * and with an exponent which would scale the corresponding mantissa bits to 2**(-24). 
+ * A normalized single-precision floating-point number is represented as: + * FP32 = (1 + mantissa * 2**(-23)) * 2**(exponent - 127) + * Therefore, when the biased exponent is 126, a unit change in the mantissa of the input denormalized half-precision + * number causes a change of the constructud single-precision number by 2**(-24), i.e. the same ammount. + * + * The last step is to adjust the bias of the constructed single-precision number. When the input half-precision number + * is zero, the constructed single-precision number has the value of + * FP32 = 1 * 2**(126 - 127) = 2**(-1) = 0.5 + * Therefore, we need to subtract 0.5 from the constructed single-precision number to get the numerical equivalent of + * the input half-precision number. + */ + const uint32_t magic_mask = UINT32_C(126) << 23; + const float magic_bias = 0.5f; + const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; + + /* + * - Choose either results of conversion of input as a normalized number, or as a denormalized number, depending on the + * input exponent. The variable two_w contains input exponent in bits 27-31, therefore if its smaller than 2**27, the + * input is either a denormal number, or zero. + * - Combine the result of conversion of exponent and mantissa with the sign of the input number. + */ + const uint32_t denormalized_cutoff = UINT32_C(1) << 27; + const uint32_t result = sign | + (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); + return fp32_from_bits(result); +} + +/* + * Convert a 32-bit floating-point number in IEEE single-precision format to a 16-bit floating-point number in + * IEEE half-precision format, in bit representation. + * + * @note The implementation relies on IEEE-like (no assumption about rounding mode and no operations on denormals) + * floating-point operations and bitcasts between integer and floating-point variables. + */ +static inline uint16_t fp16_ieee_from_fp32_value(float f) { +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) + const float scale_to_inf = 0x1.0p+112f; + const float scale_to_zero = 0x1.0p-110f; +#else + const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); + const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); +#endif + float base = (fabsf(f) * scale_to_inf) * scale_to_zero; + + const uint32_t w = fp32_to_bits(f); + const uint32_t shl1_w = w + w; + const uint32_t sign = w & UINT32_C(0x80000000); + uint32_t bias = shl1_w & UINT32_C(0xFF000000); + if (bias < UINT32_C(0x71000000)) { + bias = UINT32_C(0x71000000); + } + + base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; + const uint32_t bits = fp32_to_bits(base); + const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); + const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); + const uint32_t nonsign = exp_bits + mantissa_bits; + return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); +} + +/* + * Convert a 16-bit floating-point number in ARM alternative half-precision format, in bit representation, to + * a 32-bit floating-point number in IEEE single-precision format, in bit representation. + * + * @note The implementation doesn't use any floating-point operations. 
+ */ +static inline uint32_t fp16_alt_to_fp32_bits(uint16_t h) { + /* + * Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word: + * +---+-----+------------+-------------------+ + * | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| + * +---+-----+------------+-------------------+ + * Bits 31 26-30 16-25 0-15 + * + * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits. + */ + const uint32_t w = (uint32_t) h << 16; + /* + * Extract the sign of the input number into the high bit of the 32-bit word: + * + * +---+----------------------------------+ + * | S |0000000 00000000 00000000 00000000| + * +---+----------------------------------+ + * Bits 31 0-31 + */ + const uint32_t sign = w & UINT32_C(0x80000000); + /* + * Extract mantissa and biased exponent of the input number into the bits 0-30 of the 32-bit word: + * + * +---+-----+------------+-------------------+ + * | 0 |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| + * +---+-----+------------+-------------------+ + * Bits 30 27-31 17-26 0-16 + */ + const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF); + /* + * Renorm shift is the number of bits to shift mantissa left to make the half-precision number normalized. + * If the initial number is normalized, some of its high 6 bits (sign == 0 and 5-bit exponent) equals one. + * In this case renorm_shift == 0. If the number is denormalize, renorm_shift > 0. Note that if we shift + * denormalized nonsign by renorm_shift, the unit bit of mantissa will shift into exponent, turning the + * biased exponent into 1, and making mantissa normalized (i.e. without leading 1). + */ +#ifdef _MSC_VER + unsigned long nonsign_bsr; + _BitScanReverse(&nonsign_bsr, (unsigned long) nonsign); + uint32_t renorm_shift = (uint32_t) nonsign_bsr ^ 31; +#else + uint32_t renorm_shift = __builtin_clz(nonsign); +#endif + renorm_shift = renorm_shift > 5 ? renorm_shift - 5 : 0; + /* + * Iff nonsign is 0, it overflows into 0xFFFFFFFF, turning bit 31 into 1. Otherwise, bit 31 remains 0. + * The signed shift right by 31 broadcasts bit 31 into all bits of the zero_mask. Thus + * zero_mask == + * 0xFFFFFFFF if the half-precision number was zero (+0.0h or -0.0h) + * 0x00000000 otherwise + */ + const int32_t zero_mask = (int32_t) (nonsign - 1) >> 31; + /* + * 1. Shift nonsign left by renorm_shift to normalize it (if the input was denormal) + * 2. Shift nonsign right by 3 so the exponent (5 bits originally) becomes an 8-bit field and 10-bit mantissa + * shifts into the 10 high bits of the 23-bit mantissa of IEEE single-precision number. + * 3. Add 0x70 to the exponent (starting at bit 23) to compensate the different in exponent bias + * (0x7F for single-precision number less 0xF for half-precision number). + * 4. Subtract renorm_shift from the exponent (starting at bit 23) to account for renormalization. As renorm_shift + * is less than 0x70, this can be combined with step 3. + * 5. Binary ANDNOT with zero_mask to turn the mantissa and exponent into zero if the input was zero. + * 6. Combine with the sign of the input number. + */ + return sign | (((nonsign << renorm_shift >> 3) + ((0x70 - renorm_shift) << 23)) & ~zero_mask); +} + +/* + * Convert a 16-bit floating-point number in ARM alternative half-precision format, in bit representation, to + * a 32-bit floating-point number in IEEE single-precision format. 
+ *
+ * @note The implementation relies on IEEE-like (no assumption about rounding mode and no operations on denormals)
+ * floating-point operations and bitcasts between integer and floating-point variables.
+ */
+static inline float fp16_alt_to_fp32_value(uint16_t h) {
+ /*
+ * Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word:
+ * +---+-----+------------+-------------------+
+ * | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
+ * +---+-----+------------+-------------------+
+ * Bits 31 26-30 16-25 0-15
+ *
+ * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits.
+ */
+ const uint32_t w = (uint32_t) h << 16;
+ /*
+ * Extract the sign of the input number into the high bit of the 32-bit word:
+ *
+ * +---+----------------------------------+
+ * | S |0000000 00000000 00000000 00000000|
+ * +---+----------------------------------+
+ * Bits 31 0-31
+ */
+ const uint32_t sign = w & UINT32_C(0x80000000);
+ /*
+ * Extract mantissa and biased exponent of the input number into the high bits of the 32-bit word:
+ *
+ * +-----+------------+---------------------+
+ * |EEEEE|MM MMMM MMMM|0 0000 0000 0000 0000|
+ * +-----+------------+---------------------+
+ * Bits 27-31 17-26 0-16
+ */
+ const uint32_t two_w = w + w;
+
+ /*
+ * Shift mantissa and exponent into bits 23-28 and bits 13-22 so they become mantissa and exponent
+ * of a single-precision floating-point number:
+ *
+ * S|Exponent | Mantissa
+ * +-+---+-----+------------+----------------+
+ * |0|000|EEEEE|MM MMMM MMMM|0 0000 0000 0000|
+ * +-+---+-----+------------+----------------+
+ * Bits | 23-31 | 0-22
+ *
+ * Next, the exponent is adjusted for the difference in exponent bias between single-precision and half-precision
+ * formats (0x7F - 0xF = 0x70). This operation never overflows or generates non-finite values, as the largest
+ * half-precision exponent is 0x1F and after the adjustment it cannot exceed 0x8F < 0xFE (largest single-precision
+ * exponent for non-finite values).
+ *
+ * Note that this operation does not handle denormal inputs (where biased exponent == 0). However, they also do not
+ * operate on denormal inputs, and do not produce denormal results.
+ */
+ const uint32_t exp_offset = UINT32_C(0x70) << 23;
+ const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset);
+
+ /*
+ * Convert denormalized half-precision inputs into single-precision results (always normalized).
+ * Zero inputs are also handled here.
+ *
+ * In a denormalized number the biased exponent is zero, and mantissa has non-zero bits.
+ * First, we shift mantissa into bits 0-9 of the 32-bit word.
+ *
+ * zeros | mantissa
+ * +---------------------------+------------+
+ * |0000 0000 0000 0000 0000 00|MM MMMM MMMM|
+ * +---------------------------+------------+
+ * Bits 10-31 0-9
+ *
+ * Now, remember that denormalized half-precision numbers are represented as:
+ * FP16 = mantissa * 2**(-24).
+ * The trick is to construct a normalized single-precision number with the same mantissa as the half-precision input
+ * and with an exponent which would scale the corresponding mantissa bits to 2**(-24).
+ * A normalized single-precision floating-point number is represented as:
+ * FP32 = (1 + mantissa * 2**(-23)) * 2**(exponent - 127)
+ * Therefore, when the biased exponent is 126, a unit change in the mantissa of the input denormalized half-precision
+ * number causes a change of the constructed single-precision number by 2**(-24), i.e. the same amount.
+ *
+ * The last step is to adjust the bias of the constructed single-precision number. When the input half-precision number
+ * is zero, the constructed single-precision number has the value of
+ * FP32 = 1 * 2**(126 - 127) = 2**(-1) = 0.5
+ * Therefore, we need to subtract 0.5 from the constructed single-precision number to get the numerical equivalent of
+ * the input half-precision number.
+ */
+ const uint32_t magic_mask = UINT32_C(126) << 23;
+ const float magic_bias = 0.5f;
+ const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
+
+ /*
+ * - Choose either the result of converting the input as a normalized number or as a denormalized number, depending on the
+ * input exponent. The variable two_w contains the input exponent in bits 27-31, therefore if it is smaller than 2**27, the
+ * input is either a denormal number, or zero.
+ * - Combine the result of conversion of exponent and mantissa with the sign of the input number.
+ */
+ const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
+ const uint32_t result = sign |
+ (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
+ return fp32_from_bits(result);
+}
+
+/*
+ * Convert a 32-bit floating-point number in IEEE single-precision format to a 16-bit floating-point number in
+ * ARM alternative half-precision format, in bit representation.
+ *
+ * @note The implementation relies on IEEE-like (no assumption about rounding mode and no operations on denormals)
+ * floating-point operations and bitcasts between integer and floating-point variables.
+ */
+static inline uint16_t fp16_alt_from_fp32_value(float f) {
+ const uint32_t w = fp32_to_bits(f);
+ const uint32_t sign = w & UINT32_C(0x80000000);
+ const uint32_t shl1_w = w + w;
+
+ const uint32_t shl1_max_fp16_fp32 = UINT32_C(0x8FFFC000);
+ const uint32_t shl1_base = shl1_w > shl1_max_fp16_fp32 ?
shl1_max_fp16_fp32 : shl1_w; + uint32_t shl1_bias = shl1_base & UINT32_C(0xFF000000); + const uint32_t exp_difference = 23 - 10; + const uint32_t shl1_bias_min = (127 - 1 - exp_difference) << 24; + if (shl1_bias < shl1_bias_min) { + shl1_bias = shl1_bias_min; + } + + const float bias = fp32_from_bits((shl1_bias >> 1) + ((exp_difference + 2) << 23)); + const float base = fp32_from_bits((shl1_base >> 1) + (2 << 23)) + bias; + + const uint32_t exp_f = fp32_to_bits(base) >> 13; + return (sign >> 16) | ((exp_f & UINT32_C(0x00007C00)) + (fp32_to_bits(base) & UINT32_C(0x00000FFF))); +} + +#endif /* FP16_FP16_H */ diff --git a/deps/FP16/include/fp16/psimd.h b/deps/FP16/include/fp16/psimd.h new file mode 100644 index 000000000..428ab0651 --- /dev/null +++ b/deps/FP16/include/fp16/psimd.h @@ -0,0 +1,131 @@ +#pragma once +#ifndef FP16_PSIMD_H +#define FP16_PSIMD_H + +#if defined(__cplusplus) && (__cplusplus >= 201103L) + #include +#elif !defined(__OPENCL_VERSION__) + #include +#endif + +#include + + +PSIMD_INTRINSIC psimd_f32 fp16_ieee_to_fp32_psimd(psimd_u16 half) { + const psimd_u32 word = (psimd_u32) psimd_interleave_lo_u16(psimd_zero_u16(), half); + + const psimd_u32 sign = word & psimd_splat_u32(UINT32_C(0x80000000)); + const psimd_u32 shr3_nonsign = (word + word) >> psimd_splat_u32(4); + + const psimd_u32 exp_offset = psimd_splat_u32(UINT32_C(0x70000000)); +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) + const psimd_f32 exp_scale = psimd_splat_f32(0x1.0p-112f); +#else + const psimd_f32 exp_scale = psimd_splat_f32(fp32_from_bits(UINT32_C(0x7800000))); +#endif + const psimd_f32 norm_nonsign = psimd_mul_f32((psimd_f32) (shr3_nonsign + exp_offset), exp_scale); + + const psimd_u16 magic_mask = psimd_splat_u16(UINT16_C(0x3E80)); + const psimd_f32 magic_bias = psimd_splat_f32(0.25f); + const psimd_f32 denorm_nonsign = psimd_sub_f32((psimd_f32) psimd_interleave_lo_u16(half + half, magic_mask), magic_bias); + + const psimd_s32 denorm_cutoff = psimd_splat_s32(INT32_C(0x00800000)); + const psimd_s32 denorm_mask = (psimd_s32) shr3_nonsign < denorm_cutoff; + return (psimd_f32) (sign | (psimd_s32) psimd_blend_f32(denorm_mask, denorm_nonsign, norm_nonsign)); +} + +PSIMD_INTRINSIC psimd_f32x2 fp16_ieee_to_fp32x2_psimd(psimd_u16 half) { + const psimd_u32 word_lo = (psimd_u32) psimd_interleave_lo_u16(psimd_zero_u16(), half); + const psimd_u32 word_hi = (psimd_u32) psimd_interleave_hi_u16(psimd_zero_u16(), half); + + const psimd_u32 sign_mask = psimd_splat_u32(UINT32_C(0x80000000)); + const psimd_u32 sign_lo = word_lo & sign_mask; + const psimd_u32 sign_hi = word_hi & sign_mask; + const psimd_u32 shr3_nonsign_lo = (word_lo + word_lo) >> psimd_splat_u32(4); + const psimd_u32 shr3_nonsign_hi = (word_hi + word_hi) >> psimd_splat_u32(4); + + const psimd_u32 exp_offset = psimd_splat_u32(UINT32_C(0x70000000)); +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) + const psimd_f32 exp_scale = psimd_splat_f32(0x1.0p-112f); +#else + const psimd_f32 exp_scale = psimd_splat_f32(fp32_from_bits(UINT32_C(0x7800000))); +#endif + const psimd_f32 norm_nonsign_lo = psimd_mul_f32((psimd_f32) (shr3_nonsign_lo + exp_offset), exp_scale); + const psimd_f32 norm_nonsign_hi = psimd_mul_f32((psimd_f32) (shr3_nonsign_hi + exp_offset), exp_scale); + + const psimd_u16 magic_mask = psimd_splat_u16(UINT16_C(0x3E80)); + const psimd_u16 shl1_half = half + half; + const psimd_f32 magic_bias = psimd_splat_f32(0.25f); + 
const psimd_f32 denorm_nonsign_lo = psimd_sub_f32((psimd_f32) psimd_interleave_lo_u16(shl1_half, magic_mask), magic_bias); + const psimd_f32 denorm_nonsign_hi = psimd_sub_f32((psimd_f32) psimd_interleave_hi_u16(shl1_half, magic_mask), magic_bias); + + const psimd_s32 denorm_cutoff = psimd_splat_s32(INT32_C(0x00800000)); + const psimd_s32 denorm_mask_lo = (psimd_s32) shr3_nonsign_lo < denorm_cutoff; + const psimd_s32 denorm_mask_hi = (psimd_s32) shr3_nonsign_hi < denorm_cutoff; + + psimd_f32x2 result; + result.lo = (psimd_f32) (sign_lo | (psimd_s32) psimd_blend_f32(denorm_mask_lo, denorm_nonsign_lo, norm_nonsign_lo)); + result.hi = (psimd_f32) (sign_hi | (psimd_s32) psimd_blend_f32(denorm_mask_hi, denorm_nonsign_hi, norm_nonsign_hi)); + return result; +} + +PSIMD_INTRINSIC psimd_f32 fp16_alt_to_fp32_psimd(psimd_u16 half) { + const psimd_u32 word = (psimd_u32) psimd_interleave_lo_u16(psimd_zero_u16(), half); + + const psimd_u32 sign = word & psimd_splat_u32(INT32_C(0x80000000)); + const psimd_u32 shr3_nonsign = (word + word) >> psimd_splat_u32(4); + +#if 0 + const psimd_s32 exp112_offset = psimd_splat_s32(INT32_C(0x38000000)); + const psimd_s32 nonsign_bits = (psimd_s32) shr3_nonsign + exp112_offset; + const psimd_s32 exp1_offset = psimd_splat_s32(INT32_C(0x00800000)); + const psimd_f32 two_nonsign = (psimd_f32) (nonsign_bits + exp1_offset); + const psimd_s32 exp113_offset = exp112_offset | exp1_offset; + return (psimd_f32) (sign | (psimd_s32) psimd_sub_f32(two_nonsign, (psimd_f32) psimd_max_s32(nonsign_bits, exp113_offset))); +#else + const psimd_u32 exp_offset = psimd_splat_u32(UINT32_C(0x38000000)); + const psimd_f32 nonsign = (psimd_f32) (shr3_nonsign + exp_offset); +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) + const psimd_f32 denorm_bias = psimd_splat_f32(0x1.0p-14f); +#else + const psimd_f32 denorm_bias = psimd_splat_f32(fp32_from_bits(UINT32_C(0x38800000))); +#endif + return (psimd_f32) (sign | (psimd_s32) psimd_sub_f32(psimd_add_f32(nonsign, nonsign), psimd_max_f32(nonsign, denorm_bias))); +#endif +} + +PSIMD_INTRINSIC psimd_f32x2 fp16_alt_to_fp32x2_psimd(psimd_u16 half) { + const psimd_u32 word_lo = (psimd_u32) psimd_interleave_lo_u16(psimd_zero_u16(), half); + const psimd_u32 word_hi = (psimd_u32) psimd_interleave_hi_u16(psimd_zero_u16(), half); + + const psimd_u32 sign_mask = psimd_splat_u32(UINT32_C(0x80000000)); + const psimd_u32 sign_lo = word_lo & sign_mask; + const psimd_u32 sign_hi = word_hi & sign_mask; + const psimd_u32 shr3_nonsign_lo = (word_lo + word_lo) >> psimd_splat_u32(4); + const psimd_u32 shr3_nonsign_hi = (word_hi + word_hi) >> psimd_splat_u32(4); + +#if 1 + const psimd_s32 exp112_offset = psimd_splat_s32(INT32_C(0x38000000)); + const psimd_s32 nonsign_bits_lo = (psimd_s32) shr3_nonsign_lo + exp112_offset; + const psimd_s32 nonsign_bits_hi = (psimd_s32) shr3_nonsign_hi + exp112_offset; + const psimd_s32 exp1_offset = psimd_splat_s32(INT32_C(0x00800000)); + const psimd_f32 two_nonsign_lo = (psimd_f32) (nonsign_bits_lo + exp1_offset); + const psimd_f32 two_nonsign_hi = (psimd_f32) (nonsign_bits_hi + exp1_offset); + const psimd_s32 exp113_offset = exp1_offset | exp112_offset; + psimd_f32x2 result; + result.lo = (psimd_f32) (sign_lo | (psimd_s32) psimd_sub_f32(two_nonsign_lo, (psimd_f32) psimd_max_s32(nonsign_bits_lo, exp113_offset))); + result.hi = (psimd_f32) (sign_hi | (psimd_s32) psimd_sub_f32(two_nonsign_hi, (psimd_f32) psimd_max_s32(nonsign_bits_hi, exp113_offset))); + return result; +#else + 
const psimd_u32 exp_offset = psimd_splat_u32(UINT32_C(0x38000000)); + const psimd_f32 nonsign_lo = (psimd_f32) (shr3_nonsign_lo + exp_offset); + const psimd_f32 nonsign_hi = (psimd_f32) (shr3_nonsign_hi + exp_offset); + const psimd_f32 denorm_bias = psimd_splat_f32(0x1.0p-14f); + psimd_f32x2 result; + result.lo = (psimd_f32) (sign_lo | (psimd_s32) psimd_sub_f32(psimd_add_f32(nonsign_lo, nonsign_lo), psimd_max_f32(nonsign_lo, denorm_bias))); + result.hi = (psimd_f32) (sign_hi | (psimd_s32) psimd_sub_f32(psimd_add_f32(nonsign_hi, nonsign_hi), psimd_max_f32(nonsign_hi, denorm_bias))); + return result; +#endif +} + +#endif /* FP16_PSIMD_H */ diff --git a/deps/kmeans1d/.github/workflows/build.yml b/deps/kmeans1d/.github/workflows/build.yml new file mode 100644 index 000000000..25ee831d8 --- /dev/null +++ b/deps/kmeans1d/.github/workflows/build.yml @@ -0,0 +1,81 @@ +name: build +# When the 'permissions' key is specified, unspecified permission scopes (e.g., +# actions, checks, etc.) are set to no access (none). +permissions: + contents: read +on: + push: + branches: [master] + pull_request: + branches: [master] + schedule: + # Run weekly (* is a special character in YAML, so quote the string) + - cron: '0 0 * * 0' + workflow_dispatch: + inputs: + # When git-ref is empty, HEAD will be checked out. + git-ref: + description: Optional git ref (branch, tag, or full SHA) + required: false + +jobs: + build: + runs-on: ${{ matrix.config.os }} + strategy: + fail-fast: false + matrix: + python-version: ['3.6', '3.7', '3.8', '3.9', '3.10'] + config: + - os: ubuntu-latest + cc: clang + cxx: clang++ + - os: ubuntu-latest + cc: gcc + cxx: g++ + - os: windows-latest + - os: macos-latest + exclude: + # Python 3.6 is not supported on GitHub-hosted Ubuntu runners as of Ubuntu 22.04. + # https://github.com/actions/setup-python/issues/544#issuecomment-1332535877 + - {config: {os: ubuntu-latest}, python-version: '3.6'} + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + # When the ref is empty, HEAD will be checked out. + ref: ${{ github.event.inputs.git-ref }} + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Lint + run: | + pip install flake8 + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + + - name: Typing + run: | + python -m pip install mypy + python -m mypy kmeans1d + + - name: Install (ubuntu) + if: matrix.config.os == 'ubuntu-latest' + env: + CC: ${{ matrix.config.cc }} + CXX: ${{ matrix.config.cxx }} + run: pip install --verbose . + + - name: Install (other) + if: matrix.config.os != 'ubuntu-latest' + run: pip install --verbose . + + - name: Test + run: | + cd tests # so package is imported from site-packages instead of working directory + python -m unittest discover . -v diff --git a/deps/kmeans1d/.github/workflows/packages.yml b/deps/kmeans1d/.github/workflows/packages.yml new file mode 100644 index 000000000..858abce9a --- /dev/null +++ b/deps/kmeans1d/.github/workflows/packages.yml @@ -0,0 +1,94 @@ +name: packages +# When the 'permissions' key is specified, unspecified permission scopes (e.g., +# actions, checks, etc.) are set to no access (none). 
+permissions: + contents: read +on: + workflow_dispatch: + inputs: + # When git-ref is empty, HEAD will be checked out. + git-ref: + description: Optional git ref (branch, tag, or full SHA) + required: false + +jobs: + sdist: + runs-on: ubuntu-latest + strategy: + fail-fast: false + + steps: + - name: Clone + uses: actions/checkout@v2 + with: + # When the ref is empty, HEAD will be checked out. + ref: ${{ github.event.inputs.git-ref }} + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: '3.x' + + - name: Dependencies + run: python -m pip install --upgrade pip + + - name: Build + run: python setup.py sdist + + - name: Upload + uses: actions/upload-artifact@v2 + with: + name: packages + path: ./dist + + wheels: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + # Use the oldest version of Ubuntu that supports the versions of Python + # you're targeting. + # > "Building manylinux-compatible wheels is not trivial; as a general + # rule, binaries built on one Linux distro will only work on other + # Linux distros that are the same age or newer. Therefore, if we + # want to make binaries that run on most Linux distros, we have to + # use a very old distro -- CentOS 6." + # - https://github.com/pypa/manylinux + os: [macos-latest, windows-latest, ubuntu-20.04] + python-version: ['3.6', '3.7', '3.8', '3.9', '3.10'] + + steps: + - name: Clone + uses: actions/checkout@v2 + with: + # When the ref is empty, HEAD will be checked out. + ref: ${{ github.event.inputs.git-ref }} + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - name: Dependencies + run: python -m pip install --upgrade pip wheel + + - name: Build + run: python setup.py bdist_wheel + + - name: manylinux + if: ${{ startsWith(matrix.os, 'ubuntu-') }} + run: | + python -m pip install --upgrade auditwheel + python -m auditwheel show dist/*.whl + # Use manylinux2014, as you can't use an older ABI because of the + # presence of too-recent versioned symbols. To target manylinux1 and/or + # manylinux2010, PyPA provides docker images. + python -m auditwheel repair --plat manylinux2014_x86_64 dist/*.whl + rm -r dist + mv wheelhouse dist + + - name: Upload + uses: actions/upload-artifact@v2 + with: + name: packages + path: ./dist diff --git a/deps/kmeans1d/.gitignore b/deps/kmeans1d/.gitignore new file mode 100644 index 000000000..376f22a0a --- /dev/null +++ b/deps/kmeans1d/.gitignore @@ -0,0 +1,107 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +# PyCharm +.idea diff --git a/deps/kmeans1d/LICENSE b/deps/kmeans1d/LICENSE new file mode 100644 index 000000000..fef4b8e10 --- /dev/null +++ b/deps/kmeans1d/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 Daniel Steinberg + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/deps/kmeans1d/README.md b/deps/kmeans1d/README.md new file mode 100644 index 000000000..1316cb143 --- /dev/null +++ b/deps/kmeans1d/README.md @@ -0,0 +1,85 @@ +[![Build Status](https://github.com/dstein64/kmeans1d/workflows/build/badge.svg)](https://github.com/dstein64/kmeans1d/actions) + +kmeans1d +======== + +A Python library with an implementation of *k*-means clustering on 1D data, based on the algorithm +from Xiaolin (1991), as presented by Gronlund et al. (2017, Section 2.2). + +Globally optimal *k*-means clustering is NP-hard for multi-dimensional data. Lloyd's algorithm is a +popular approach for finding a locally optimal solution. For 1-dimensional data, there are polynomial +time algorithms. The algorithm implemented here is an *O(kn + n log n)* dynamic programming algorithm +for finding the globally optimal *k* clusters for *n* 1D data points. + +The code is written in C++, and wrapped with Python. + +Requirements +------------ + +*kmeans1d* supports Python 3.x. + +Installation +------------ + +[kmeans1d](https://pypi.python.org/pypi/kmeans1d) is available on PyPI, the Python Package Index. 
+ +```sh +$ pip3 install kmeans1d +``` + +Example Usage +------------- + +```python +import kmeans1d + +x = [4.0, 4.1, 4.2, -50, 200.2, 200.4, 200.9, 80, 100, 102] +k = 4 + +clusters, centroids = kmeans1d.cluster(x, k) + +print(clusters) # [1, 1, 1, 0, 3, 3, 3, 2, 2, 2] +print(centroids) # [-50.0, 4.1, 94.0, 200.5] +``` + +Tests +----- + +Tests are in [tests/](https://github.com/dstein64/kmeans1d/blob/master/tests). + +```sh +# Run tests +$ python3 -m unittest discover tests -v +``` + +Development +----------- + +The underlying C++ code can be built in-place, outside the context of `pip`. This requires Python +development tools for building Python modules (e.g., the `python3-dev` package on Ubuntu). `gcc`, +`clang`, and `MSVC` have been tested. + +``` +$ python3 setup.py build_ext --inplace +``` + +The [packages](https://github.com/dstein64/kmeans1d/blob/master/.github/workflows/packages.yml) +GitHub action can be manually triggered (`Actions` > `packages` > `Run workflow`) to build wheels +and a source distribution. + +License +------- + +The code in this repository has an [MIT License](https://en.wikipedia.org/wiki/MIT_License). + +See [LICENSE](https://github.com/dstein64/kmeans1d/blob/master/LICENSE). + +References +---------- + +[1] Wu, Xiaolin. "Optimal Quantization by Matrix Searching." Journal of Algorithms 12, no. 4 +(December 1, 1991): 663 + +[2] Gronlund, Allan, Kasper Green Larsen, Alexander Mathiasen, Jesper Sindahl Nielsen, Stefan Schneider, +and Mingzhou Song. "Fast Exact K-Means, k-Medians and Bregman Divergence Clustering in 1D." +ArXiv:1701.07204 [Cs], January 25, 2017. http://arxiv.org/abs/1701.07204. diff --git a/deps/kmeans1d/kmeans1d/__init__.py b/deps/kmeans1d/kmeans1d/__init__.py new file mode 100644 index 000000000..6841033ab --- /dev/null +++ b/deps/kmeans1d/kmeans1d/__init__.py @@ -0,0 +1,25 @@ +# MIT License +# +# Copyright (c) 2019 Daniel Steinberg +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +# Copyright © 2023 Apple Inc. 
+ +from .core import Clustered, cluster diff --git a/deps/kmeans1d/kmeans1d/_core.cpp b/deps/kmeans1d/kmeans1d/_core.cpp new file mode 100755 index 000000000..8e0ac6f33 --- /dev/null +++ b/deps/kmeans1d/kmeans1d/_core.cpp @@ -0,0 +1,372 @@ +// MIT License +// +// Copyright (c) 2019 Daniel Steinberg +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +// +// Copyright © 2023 Apple Inc. + +#include + +#include +#include +#include +#include +#include +#include + +using namespace std; + +typedef unsigned long ulong; + +/* + * Internal implementation of the SMAWK algorithm. + */ +template +void _smawk( + const vector& rows, + const vector& cols, + const function& lookup, + vector* result) { + // Recursion base case + if (rows.size() == 0) return; + + // ******************************** + // * REDUCE + // ******************************** + + vector _cols; // Stack of surviving columns + for (ulong col : cols) { + while (true) { + if (_cols.size() == 0) break; + ulong row = rows[_cols.size() - 1]; + if (lookup(row, col) >= lookup(row, _cols.back())) + break; + _cols.pop_back(); + } + if (_cols.size() < rows.size()) + _cols.push_back(col); + } + + // Call recursively on odd-indexed rows + vector odd_rows; + for (ulong i = 1; i < rows.size(); i += 2) { + odd_rows.push_back(rows[i]); + } + _smawk(odd_rows, _cols, lookup, result); + + unordered_map col_idx_lookup; + for (ulong idx = 0; idx < _cols.size(); ++idx) { + col_idx_lookup[_cols[idx]] = idx; + } + + // ******************************** + // * INTERPOLATE + // ******************************** + + // Fill-in even-indexed rows + ulong start = 0; + for (ulong r = 0; r < rows.size(); r += 2) { + ulong row = rows[r]; + ulong stop = _cols.size() - 1; + if (r < rows.size() - 1) + stop = col_idx_lookup[(*result)[rows[r + 1]]]; + ulong argmin = _cols[start]; + T min = lookup(row, argmin); + for (ulong c = start + 1; c <= stop; ++c) { + T value = lookup(row, _cols[c]); + if (c == start || value < min) { + argmin = _cols[c]; + min = value; + } + } + (*result)[row] = argmin; + start = stop; + } +} + +/* + * Interface for the SMAWK algorithm, for finding the minimum value in each row + * of an implicitly-defined totally monotone matrix. 
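As an aside (not part of the vendored sources), the property that makes SMAWK applicable to the clustering costs used later in this file is that, in a totally monotone matrix, the column index of each row's minimum never moves left as you go down the rows; _smawk above exploits this to skip most lookups, whereas a naive scan inspects every entry:

```python
# A small, explicit totally monotone matrix (the real code defines the matrix
# implicitly through a lookup function instead of materializing it).
matrix = [
    [3, 4, 6, 9],
    [2, 3, 5, 8],
    [5, 4, 3, 7],
    [9, 8, 6, 5],
]
argmins = [row.index(min(row)) for row in matrix]   # naive O(rows * cols) scan
assert argmins == [0, 0, 2, 3]                      # argmin columns never decrease
assert argmins == sorted(argmins)
```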
+ */ +template +vector smawk( + const ulong num_rows, + const ulong num_cols, + const function& lookup) { + vector result; + result.resize(num_rows); + vector rows(num_rows); + iota(begin(rows), end(rows), 0); + vector cols(num_cols); + iota(begin(cols), end(cols), 0); + _smawk(rows, cols, lookup, &result); + return result; +} + +/* + * Calculates cluster costs in O(1) using prefix sum arrays. + */ +class CostCalculator { + vector cumsum; + vector cumsum2; + + public: + CostCalculator(const vector& vec, ulong n, const vector& /*sort_idxs*/) { + cumsum.push_back(0.0); + cumsum2.push_back(0.0); + for (ulong i = 0; i < n; ++i) { + double x = vec[i]; + cumsum.push_back(x + cumsum[i]); + cumsum2.push_back(x * x + cumsum2[i]); + } + } + + double weight(ulong i, ulong j) { + return (i <= j) ? 1 + j - i : 0; + } + + double calc(ulong i, ulong j) { + if (j < i) return 0.0; + double mu = (cumsum[j + 1] - cumsum[i]) / (j - i + 1); + double result = cumsum2[j + 1] - cumsum2[i]; + result += (j - i + 1) * (mu * mu); + result -= (2 * mu) * (cumsum[j + 1] - cumsum[i]); + return result; + } +}; + +/* + * Weighted version of the CostCalculator + */ +class WeightedCostCalculator { + vector cumw; + vector cumsum; + vector cumsum2; + + public: + WeightedCostCalculator( + const vector& vec, + ulong n, + const vector& sort_idxs, + const double* unsorted_weights) { + vector sorted_weights(n); + for (ulong i = 0; i < n; ++i) { + sorted_weights[i] = unsorted_weights[sort_idxs[i]]; + } + cumw.push_back(0.0); + cumsum.push_back(0.0); + cumsum2.push_back(0.0); + for (ulong i = 0; i < n; ++i) { + double x = vec[i]; + double w = sorted_weights[i]; + cumw.push_back(w + cumw[i]); + cumsum.push_back(w * x + cumsum[i]); + cumsum2.push_back(w * x * x + cumsum2[i]); + } + } + + double weight(ulong i, ulong j) { + return (i <= j) ? cumw[j + 1] - cumw[i] : 0.0; + } + + double calc(ulong i, ulong j) { + if (j < i) return 0.0; + double w = weight(i, j); + double mu = (cumsum[j + 1] - cumsum[i]) / w; + double result = cumsum2[j + 1] - cumsum2[i]; + result += w * (mu * mu); + result -= (2 * mu) * (cumsum[j + 1] - cumsum[i]); + return result; + } +}; + +template +class Matrix { + vector data; + ulong num_rows; + ulong num_cols; + + public: + Matrix(ulong num_rows, ulong num_cols) { + this->num_rows = num_rows; + this->num_cols = num_cols; + data.resize(num_rows * num_cols); + } + + inline T get(ulong i, ulong j) { + return data[i * num_cols + j]; + } + + inline void set(ulong i, ulong j, T value) { + data[i * num_cols + j] = value; + } +}; + +template +void cluster_impl( + const double* array, + ulong n, + ulong k, + ulong* clusters, + double* centroids, + CostArgsTypes... args) { + // *************************************************** + // * Sort input array and save info for de-sorting + // *************************************************** + + vector sort_idxs(n); + iota(sort_idxs.begin(), sort_idxs.end(), 0); + sort( + sort_idxs.begin(), + sort_idxs.end(), + [&array](ulong a, ulong b) {return array[a] < array[b];}); + vector undo_sort_lookup(n); + vector sorted_array(n); + for (ulong i = 0; i < n; ++i) { + sorted_array[i] = array[sort_idxs[i]]; + undo_sort_lookup[sort_idxs[i]] = i; + } + + // *************************************************** + // * Set D and T using dynamic programming algorithm + // *************************************************** + + // Algorithm as presented in section 2.2 of (Gronlund et al., 2017). 
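The SMAWK call is only an acceleration; the quantity being computed is the classic interval dynamic program D[c][i] = min_j ( D[c-1][j-1] + cost(j, i) ) over sorted points. A deliberately naive Python reference follows (illustrative only; the real code additionally records the argmins in T so the clustering can be recovered by backtracking):

```python
def kmeans1d_cost_naive(points, k):
    """Optimal k-means cost on 1-D data via the plain O(k * n**2) recurrence."""
    xs = sorted(points)
    n = len(xs)

    def cost(j, i):                       # squared-error cost of one cluster over xs[j..i]
        seg = xs[j:i + 1]
        mu = sum(seg) / len(seg)
        return sum((x - mu) ** 2 for x in seg)

    D = [[0.0] * n for _ in range(k)]
    for i in range(n):
        D[0][i] = cost(0, i)
    for c in range(1, k):
        for i in range(1, n):
            D[c][i] = min(D[c - 1][j - 1] + cost(j, i) for j in range(1, i + 1))
    return D[k - 1][n - 1]

# {1, 2} and {10, 11} is the optimal 2-clustering: total cost 0.5 + 0.5
assert abs(kmeans1d_cost_naive([10, 11, 1, 2], 2) - 1.0) < 1e-9
```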
+ + CostCalculatorType cost_calculator(sorted_array, n, sort_idxs, args...); + Matrix D(k, n); + Matrix T(k, n); + + for (ulong i = 0; i < n; ++i) { + D.set(0, i, cost_calculator.calc(0, i)); + T.set(0, i, 0); + } + + for (ulong k_ = 1; k_ < k; ++k_) { + auto C = [&D, &k_, &cost_calculator](ulong i, ulong j) -> double { + ulong col = i < j - 1 ? i : j - 1; + return D.get(k_ - 1, col) + cost_calculator.calc(j, i); + }; + vector row_argmins = smawk(n, n, C); + for (ulong i = 0; i < row_argmins.size(); ++i) { + ulong argmin = row_argmins[i]; + double min = C(i, argmin); + D.set(k_, i, min); + T.set(k_, i, argmin); + } + } + + // *************************************************** + // * Extract cluster assignments by backtracking + // *************************************************** + + // TODO: This step requires O(kn) memory usage due to saving the entire + // T matrix. However, it can be modified so that the memory usage is O(n). + // D and T would not need to be retained in full (D already doesn't need + // to be fully retained, although it currently is). + // Details are in section 3 of (Grønlund et al., 2017). + + vector sorted_clusters(n); + + ulong t = n; + ulong k_ = k - 1; + ulong n_ = n - 1; + // The do/while loop was used in place of: + // for (k_ = k - 1; k_ >= 0; --k_) + // to avoid wraparound of an unsigned type. + do { + ulong t_ = t; + t = T.get(k_, n_); + double centroid = 0.0; + for (ulong i = t; i < t_; ++i) { + sorted_clusters[i] = k_; + // Mean computation: this is only for squared L2 cost calculators + centroid += ( + (sorted_array[i] - centroid) + * cost_calculator.weight(i, i) + / cost_calculator.weight(t, i) + ); + } + centroids[k_] = centroid; + k_ -= 1; + n_ = t - 1; + } while (t > 0); + + // *************************************************** + // * Order cluster assignments to match de-sorted + // * ordering + // *************************************************** + + for (ulong i = 0; i < n; ++i) { + clusters[i] = sorted_clusters[undo_sort_lookup[i]]; + } +} + +extern "C" { +// "__declspec(dllexport)" causes the function to be exported when compiling on Windows. +// Otherwise, the function is not exported and the code raises +// "AttributeError: function 'cluster' not found". +// Exporting is a Windows platform requirement, not just a Visual Studio requirement +// (https://stackoverflow.com/a/22288874/1509433). The _WIN32 macro covers the Visual +// Studio compiler (MSVC) and MinGW. The __CYGWIN__ macro covers gcc and clang under +// Cygwin. 
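The centroid update inside the backtracking loop above is a running weighted mean written incrementally (centroid += (x - centroid) * w_i / W), which avoids a second pass over each cluster. A quick check of that identity, outside the vendored code:

```python
values  = [4.0, 4.1, 4.2, 80.0]
weights = [1.0, 2.0, 1.0, 0.5]

centroid, total_w = 0.0, 0.0
for x, w in zip(values, weights):
    total_w += w
    centroid += (x - centroid) * w / total_w      # incremental form, as in cluster_impl

expected = sum(w * x for x, w in zip(values, weights)) / sum(weights)
assert abs(centroid - expected) < 1e-9
```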
+#if defined(_WIN32) || defined(__CYGWIN__) +__declspec(dllexport) +#endif +void cluster( + double* array, + ulong n, + ulong k, + ulong* clusters, + double* centroids) { + cluster_impl(array, n, k, clusters, centroids); +} + +#if defined(_WIN32) || defined(__CYGWIN__) +__declspec(dllexport) +#endif +void cluster_with_weights( + double* array, + double* weights, + ulong n, + ulong k, + ulong* clusters, + double* centroids) { + cluster_impl( + array, n, k, clusters, centroids, weights + ); +} +} // extern "C" + +static PyMethodDef module_methods[] = { + {NULL, NULL, 0, NULL} +}; + +static struct PyModuleDef _coremodule = { + PyModuleDef_HEAD_INIT, + "kmeans1d._core", + NULL, + -1, + module_methods, +}; + +PyMODINIT_FUNC PyInit__core(void) { + return PyModule_Create(&_coremodule); +} diff --git a/deps/kmeans1d/kmeans1d/core.py b/deps/kmeans1d/kmeans1d/core.py new file mode 100755 index 000000000..9baf4d1b8 --- /dev/null +++ b/deps/kmeans1d/kmeans1d/core.py @@ -0,0 +1,74 @@ +# MIT License +# +# Copyright (c) 2019 Daniel Steinberg +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +# Copyright © 2023 Apple Inc. + +from collections import namedtuple +import ctypes +from typing import Optional, Sequence + +from . 
import _core # type: ignore + + +Clustered = namedtuple('Clustered', 'clusters centroids') + +_DLL = ctypes.cdll.LoadLibrary(_core.__file__) + + +def cluster( + array: Sequence[float], + k: int, + *, + weights: Optional[Sequence[float]] = None) -> Clustered: + """ + :param array: A sequence of floats + :param k: Number of clusters (int) + :param weights: Sequence of weights (if provided, must have same length as `array`) + :return: A tuple with (clusters, centroids) + """ + assert k > 0, f'Invalid k: {k}' + n = len(array) + assert n > 0, f'Invalid len(array): {n}' + k = min(k, n) + + if weights is not None: + assert len(weights) == n, f'len(weights)={len(weights)} != len(array)={n}' + + c_array = (ctypes.c_double * n)(*array) + c_n = ctypes.c_ulong(n) + c_k = ctypes.c_ulong(k) + c_clusters = (ctypes.c_ulong * n)() + c_centroids = (ctypes.c_double * k)() + + if weights is None: + _DLL.cluster(c_array, c_n, c_k, c_clusters, c_centroids) + else: + c_weights = (ctypes.c_double * n)(*weights) + _DLL.cluster_with_weights(c_array, c_weights, c_n, c_k, c_clusters, c_centroids) + + + clusters = list(c_clusters) + centroids = list(c_centroids) + + output = Clustered(clusters=clusters, centroids=centroids) + + return output diff --git a/deps/kmeans1d/kmeans1d/version.txt b/deps/kmeans1d/kmeans1d/version.txt new file mode 100644 index 000000000..9e11b32fc --- /dev/null +++ b/deps/kmeans1d/kmeans1d/version.txt @@ -0,0 +1 @@ +0.3.1 diff --git a/deps/kmeans1d/setup.py b/deps/kmeans1d/setup.py new file mode 100755 index 000000000..05fa86ff6 --- /dev/null +++ b/deps/kmeans1d/setup.py @@ -0,0 +1,67 @@ +import os +import setuptools +from setuptools import Extension, setup +from setuptools.command.build_ext import build_ext + + +class BuildExt(build_ext): + """A custom build extension for adding -stdlib arguments for clang++.""" + + def build_extensions(self): + # '-std=c++11' is added to `extra_compile_args` so the code can compile + # with clang++. This works across compilers (ignored by MSVC). + for extension in self.extensions: + extension.extra_compile_args.append('-std=c++11') + + try: + build_ext.build_extensions(self) + except setuptools.distutils.errors.CompileError: + # Workaround Issue #2. + # '-stdlib=libc++' is added to `extra_compile_args` and `extra_link_args` + # so the code can compile on macOS with Anaconda. 
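For reference, the weights keyword handled by the wrapper above appears to be an addition in this vendored copy (upstream kmeans1d.cluster takes only array and k); a minimal call looks like the following sketch, with arbitrary data:

```python
import kmeans1d

x = [4.0, 4.1, 4.2, -50.0]
clusters, centroids = kmeans1d.cluster(x, 2, weights=[1.0, 1.0, 1.0, 0.25])
assert len(clusters) == len(x) and len(centroids) == 2
```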
+ for extension in self.extensions: + extension.extra_compile_args.append('-stdlib=libc++') + extension.extra_link_args.append('-stdlib=libc++') + build_ext.build_extensions(self) + + +extension = Extension('kmeans1d._core', ['kmeans1d/_core.cpp']) + +version_txt = os.path.join(os.path.dirname(__file__), 'kmeans1d', 'version.txt') +with open(version_txt, 'r') as f: + version = f.read().strip() + +with open('README.md') as f: + long_description = f.read() + +setup( + author='Daniel Steinberg', + author_email='ds@dannyadam.com', + classifiers=[ + 'Development Status :: 4 - Beta', + 'Intended Audience :: Developers', + 'Intended Audience :: Science/Research', + 'Topic :: Scientific/Engineering', + 'Topic :: Scientific/Engineering :: Artificial Intelligence', + 'Topic :: Scientific/Engineering :: Information Analysis', + 'License :: OSI Approved :: MIT License', + 'Operating System :: Unix', + 'Operating System :: POSIX :: Linux', + 'Operating System :: MacOS', + 'Operating System :: Microsoft :: Windows', + 'Programming Language :: Python :: 3', + ], + cmdclass={'build_ext': BuildExt}, + description='A Python package for optimal 1D k-means clustering', + ext_modules=[extension], + keywords=['k-means', 'machine learning', 'optimization'], + license='MIT', + long_description=long_description, + long_description_content_type='text/markdown', + name='kmeans1d', + package_data={'kmeans1d': ['version.txt']}, + packages=['kmeans1d'], + python_requires='>=3.6', + url='https://github.com/dstein64/kmeans1d', + version=version, +) diff --git a/deps/kmeans1d/tests/test_kmeans1d.py b/deps/kmeans1d/tests/test_kmeans1d.py new file mode 100644 index 000000000..93ef0d941 --- /dev/null +++ b/deps/kmeans1d/tests/test_kmeans1d.py @@ -0,0 +1,85 @@ +# MIT License +# +# Copyright (c) 2019 Daniel Steinberg +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +# Copyright © 2023 Apple Inc. + +import unittest + +from kmeans1d import cluster + +def round_list(L, ndigits=0) -> list: + """ Round all values in a list """ + return [round(v, ndigits=ndigits) for v in L] + +def compute_inertial(points, weights, clusters, centroids) -> float: + """ Compute the inertia (k-means loss, i.e. 
weighted sum of squared differences) """ + assert len(points) == len(weights) == len(clusters) + assert all(0 <= k < len(centroids) for k in clusters) + return sum( + w * (x - centroids[k]) ** 2 + for x, w, k in zip(points, weights, clusters) + ) + +class TestKmeans1D(unittest.TestCase): + """kmeans1d tests""" + def test_cluster(self): + x = [4.0, 4.1, 4.2, -50, 200.2, 200.4, 200.9, 80, 100, 102] + k = 4 + clusters, centroids = cluster(x, k) + self.assertEqual(clusters, [1, 1, 1, 0, 3, 3, 3, 2, 2, 2]) + self.assertEqual(centroids, [-50.0, 4.1, 94.0, 200.5]) + + def test_cluster_with_weights(self): + x = [4.0, 4.1, 4.2, -50, 200.2, 200.4, 200.9, 80, 100, 102] + w = [1, 1, 1, 0.125, 4, 1, 1, 3, 2, 2] + k = 4 + clusters, centroids = cluster(x, k, weights=w) + centroids = round_list(centroids, ndigits=9) # because of numerical inaccuracy + self.assertEqual(clusters, [0, 0, 0, 0, 3, 3, 3, 1, 2, 2]) + self.assertEqual(centroids, [1.936, 80.0, 101.0, 200.35]) + + def test_weights_vs_repetition(self): + x = [10, 24, 16, 12, 20] + w = [3, 1, 4, 2, 3] + k = 2 + + # Unweighted + u_clusters, _ = cluster(x, k) + self.assertEqual(u_clusters, [0, 1, 0, 0, 1]) + + # Weighted: different than unweighted + w_clusters, w_centroids = cluster(x, k, weights=w) + w_ssd = compute_inertial(x, w, w_clusters, w_centroids) + self.assertEqual(w_clusters, [0, 1, 1, 0, 1]) + self.assertEqual(w_centroids, [10.8, 18.5]) + self.assertEqual(w_ssd, 66.8) + + # Repeated values: same as weighted + r_x = [xi for xi, n in zip(x, w) for _ in range(n)] + self.assertEqual(len(r_x), sum(w)) + r_clusters, r_centroids = cluster(r_x, k) + r_ssd = compute_inertial(r_x, [1] * len(r_x), r_clusters, r_centroids) + self.assertEqual(r_centroids, w_centroids) + self.assertEqual(r_ssd, w_ssd) + +if __name__ == '__main__': + unittest.main() diff --git a/docs/index.rst b/docs/index.rst index f30abd55f..de7b77335 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -21,6 +21,7 @@ This document is the API Reference for coremltools. For guides, installation ins source/coremltools.converters.mil.input_types.rst source/coremltools.converters.mil.mil.ops.defs.rst source/coremltools.converters.mil.mil.passes.defs.rst + source/coremltools.optimize.rst * :ref:`genindex` * :ref:`modindex` diff --git a/docs/source/coremltools.converters.mil.mil.ops.defs.rst b/docs/source/coremltools.converters.mil.mil.ops.defs.rst index 5e0539103..522861151 100644 --- a/docs/source/coremltools.converters.mil.mil.ops.defs.rst +++ b/docs/source/coremltools.converters.mil.mil.ops.defs.rst @@ -3,7 +3,7 @@ MIL Ops Operators supported by the Model Intermediate Language (MIL): -activation +activation (iOS 15+) --------------------------------------------------------- .. automodule:: coremltools.converters.mil.mil.ops.defs.iOS15.activation @@ -26,6 +26,20 @@ activation .. autoclass:: softsign .. autoclass:: thresholded_relu +activation (iOS 17+) +--------------------------------------------------------- + +.. automodule:: coremltools.converters.mil.mil.ops.defs.iOS17.activation + + .. autoclass:: clamped_relu + .. autoclass:: elu + .. autoclass:: leaky_relu + .. autoclass:: linear_activation + .. autoclass:: prelu + .. autoclass:: scaled_tanh + .. autoclass:: sigmoid_hard + .. autoclass:: softplus_parametric + .. autoclass:: thresholded_relu classify --------------------------------------------------- @@ -34,7 +48,6 @@ classify .. autoclass:: classify - constexpr_ops --------------------------------------------------- @@ -45,7 +58,6 @@ constexpr_ops .. 
autoclass:: constexpr_lut_to_dense .. autoclass:: constexpr_sparse_to_dense - control\_flow ------------------------------------------------------------ @@ -62,7 +74,6 @@ control\_flow .. autoclass:: list_gather .. autoclass:: list_scatter - conv --------------------------------------------------- @@ -71,7 +82,6 @@ conv .. autoclass:: conv .. autoclass:: conv_transpose - elementwise\_binary ------------------------------------------------------------------ @@ -96,8 +106,7 @@ elementwise\_binary .. autoclass:: pow .. autoclass:: sub - -elementwise\_unary +elementwise\_unary (iOS 15+) ----------------------------------------------------------------- .. automodule:: coremltools.converters.mil.mil.ops.defs.iOS15.elementwise_unary @@ -107,6 +116,7 @@ elementwise\_unary .. autoclass:: asin .. autoclass:: atan .. autoclass:: atanh + .. autoclass:: cast .. autoclass:: ceil .. autoclass:: clip .. autoclass:: cos @@ -128,41 +138,53 @@ elementwise\_unary .. autoclass:: tan .. autoclass:: tanh .. autoclass:: threshold - .. autoclass:: cast +elementwise\_unary (iOS 17+) +----------------------------------------------------------------- + +.. automodule:: coremltools.converters.mil.mil.ops.defs.iOS17.elementwise_unary -image\_resizing (iOS 15) + .. autoclass:: cast + .. autoclass:: clip + +image\_resizing (iOS 15+) -------------------------------------------------------------- .. automodule:: coremltools.converters.mil.mil.ops.defs.iOS15.image_resizing - .. autoclass:: upsample_nearest_neighbor - .. autoclass:: resize_nearest_neighbor - .. autoclass:: upsample_bilinear - .. autoclass:: resize_bilinear - .. autoclass:: crop_resize - .. autoclass:: crop .. autoclass:: affine + .. autoclass:: crop + .. autoclass:: crop_resize .. autoclass:: resample - + .. autoclass:: resize_bilinear + .. autoclass:: resize_nearest_neighbor + .. autoclass:: upsample_bilinear + .. autoclass:: upsample_nearest_neighbor image\_resizing (iOS 16+) -------------------------------------------------------------- .. automodule:: coremltools.converters.mil.mil.ops.defs.iOS16.image_resizing + .. autoclass:: crop_resize .. autoclass:: resample + .. autoclass:: upsample_bilinear + +image\_resizing (iOS 17+) +-------------------------------------------------------------- + +.. automodule:: coremltools.converters.mil.mil.ops.defs.iOS17.image_resizing + .. autoclass:: crop_resize linear ----------------------------------------------------- .. automodule:: coremltools.converters.mil.mil.ops.defs.iOS15.linear + .. autoclass:: einsum .. autoclass:: linear .. autoclass:: matmul - .. autoclass:: einsum - normalization ------------------------------------------------------------ @@ -175,7 +197,6 @@ normalization .. autoclass:: layer_norm .. autoclass:: local_response_norm - pool --------------------------------------------------- @@ -185,6 +206,13 @@ pool .. autoclass:: l2_pool .. autoclass:: max_pool +quantization +------------------------------------------------------------ + +.. automodule:: coremltools.converters.mil.mil.ops.defs.iOS17.quantization_ops + + .. autoclass:: quantize + .. autoclass:: dequantize random ----------------------------------------------------- @@ -196,7 +224,6 @@ random .. autoclass:: random_normal .. autoclass:: random_uniform - recurrent -------------------------------------------------------- @@ -206,8 +233,7 @@ recurrent .. autoclass:: lstm .. autoclass:: rnn - -reduction +reduction (iOS 15+) -------------------------------------------------------- .. 
automodule:: coremltools.converters.mil.mil.ops.defs.iOS15.reduction @@ -224,44 +250,69 @@ reduction .. autoclass:: reduce_prod .. autoclass:: reduce_sum .. autoclass:: reduce_sum_square - -scatter\_gather +reduction (iOS 17+) +-------------------------------------------------------- + +.. automodule:: coremltools.converters.mil.mil.ops.defs.iOS17.reduction + + .. autoclass:: reduce_argmax + .. autoclass:: reduce_argmin + +scatter\_gather (iOS 15+) -------------------------------------------------------------- .. automodule:: coremltools.converters.mil.mil.ops.defs.iOS15.scatter_gather .. autoclass:: gather - .. autoclass:: scatter .. autoclass:: gather_along_axis - .. autoclass:: scatter_along_axis .. autoclass:: gather_nd + .. autoclass:: scatter + .. autoclass:: scatter_along_axis .. autoclass:: scatter_nd +scatter\_gather (iOS 16+) +-------------------------------------------------------------- + +.. automodule:: coremltools.converters.mil.mil.ops.defs.iOS16.scatter_gather + + .. autoclass:: gather + .. autoclass:: gather_nd -tensor\_operation (iOS 15) +scatter\_gather (iOS 17+) +-------------------------------------------------------------- + +.. automodule:: coremltools.converters.mil.mil.ops.defs.iOS17.scatter_gather + + .. autoclass:: gather + .. autoclass:: gather_along_axis + .. autoclass:: gather_nd + .. autoclass:: scatter + .. autoclass:: scatter_along_axis + .. autoclass:: scatter_nd + +tensor\_operation (iOS 15+) ---------------------------------------------------------------- .. automodule:: coremltools.converters.mil.mil.ops.defs.iOS15.tensor_operation + .. autoclass:: argsort .. autoclass:: band_part + .. autoclass:: concat .. autoclass:: cumsum .. autoclass:: fill + .. autoclass:: flatten2d + .. autoclass:: identity .. autoclass:: non_maximum_suppression .. autoclass:: non_zero .. autoclass:: one_hot .. autoclass:: pad .. autoclass:: range_1d - .. autoclass:: tile - .. autoclass:: argsort - .. autoclass:: topk - .. autoclass:: flatten2d .. autoclass:: shape - .. autoclass:: concat .. autoclass:: split .. autoclass:: stack - .. autoclass:: identity - + .. autoclass:: tile + .. autoclass:: topk tensor\_operation (iOS 16+) ---------------------------------------------------------------- @@ -271,6 +322,13 @@ tensor\_operation (iOS 16+) .. autoclass:: fill_like .. autoclass:: topk +tensor\_operation (iOS 17+) +---------------------------------------------------------------- + +.. automodule:: coremltools.converters.mil.mil.ops.defs.iOS17.tensor_operation + + .. autoclass:: non_maximum_suppression + .. autoclass:: topk tensor\_transformation (iOS 15) --------------------------------------------------------------------- @@ -279,6 +337,7 @@ tensor\_transformation (iOS 15) .. autoclass:: depth_to_space .. autoclass:: expand_dims + .. autoclass:: pixel_shuffle .. autoclass:: reshape .. autoclass:: reverse .. autoclass:: reverse_sequence @@ -286,17 +345,21 @@ tensor\_transformation (iOS 15) .. autoclass:: slice_by_size .. autoclass:: space_to_depth .. autoclass:: squeeze - .. autoclass:: transpose - .. autoclass:: pixel_shuffle .. autoclass:: sliding_windows - + .. autoclass:: transpose tensor\_transformation (iOS 16+) --------------------------------------------------------------------- .. automodule:: coremltools.converters.mil.mil.ops.defs.iOS16.tensor_transformation - .. autoclass:: reshape_like .. autoclass:: pixel_unshuffle + .. autoclass:: reshape_like + +tensor\_transformation (iOS 17+) +--------------------------------------------------------------------- + +.. 
automodule:: coremltools.converters.mil.mil.ops.defs.iOS17.tensor_transformation + .. autoclass:: reshape diff --git a/docs/source/coremltools.converters.mil.mil.passes.defs.rst b/docs/source/coremltools.converters.mil.mil.passes.defs.rst index 8337a9179..2dcda1b45 100644 --- a/docs/source/coremltools.converters.mil.mil.passes.defs.rst +++ b/docs/source/coremltools.converters.mil.mil.passes.defs.rst @@ -8,6 +8,7 @@ cleanup .. automodule:: coremltools.converters.mil.mil.passes.defs.cleanup + .. autoclass:: const_deduplication .. autoclass:: const_elimination .. autoclass:: dead_code_elimination .. autoclass:: dedup_op_and_var_names @@ -71,6 +72,17 @@ optimize_normalization .. autoclass:: fuse_layernorm_or_instancenorm +optimize_quantization +--------------------------------------------------------- + +.. automodule:: coremltools.converters.mil.mil.passes.defs.optimize_quantization + + .. autoclass:: nullify_redundant_quantization_zero_point + .. autoclass:: dequantize_quantize_pair_elimination + .. autoclass:: distributive_quantized_binary_op_scale_normalization + .. autoclass:: dequantize_to_constexpr + + optimize_repeat_ops --------------------------------------------------------- @@ -91,10 +103,10 @@ optimize_tensor_operation .. autoclass:: concat_to_pixel_shuffle .. autoclass:: detect_concat_interleave + .. autoclass:: expand_high_rank_reshape_and_transpose .. autoclass:: fuse_onehot_matmul_to_gather .. autoclass:: replace_stack_reshape .. autoclass:: use_reflection_padding - .. autoclass:: expand_high_rank_reshape_and_transpose preprocess diff --git a/docs/source/coremltools.optimize.coreml.config.rst b/docs/source/coremltools.optimize.coreml.config.rst new file mode 100644 index 000000000..2cbab0333 --- /dev/null +++ b/docs/source/coremltools.optimize.coreml.config.rst @@ -0,0 +1,14 @@ +Compression Configuration +========================== +.. automodule:: coremltools.optimize.coreml + + .. autoclass:: OpLinearQuantizerConfig + + .. autoclass:: OpThresholdPrunerConfig + + .. autoclass:: OpMagnitudePrunerConfig + + .. autoclass:: OpPalettizerConfig + + .. autoclass:: OptimizationConfig + :members: set_global, set_op_type, set_op_name, from_yaml, from_dict diff --git a/docs/source/coremltools.optimize.coreml.graph.rst b/docs/source/coremltools.optimize.coreml.graph.rst new file mode 100644 index 000000000..740e1ce42 --- /dev/null +++ b/docs/source/coremltools.optimize.coreml.graph.rst @@ -0,0 +1,10 @@ +Compression Graph Passes +======================== + +.. automodule:: coremltools.optimize.coreml._quantization_passes + + .. autoclass:: palettize_weights + + .. autoclass:: linear_quantize_weights + + .. autoclass:: WeightDecompressor diff --git a/docs/source/coremltools.optimize.coreml.post_training_quantization.rst b/docs/source/coremltools.optimize.coreml.post_training_quantization.rst new file mode 100644 index 000000000..b292cd784 --- /dev/null +++ b/docs/source/coremltools.optimize.coreml.post_training_quantization.rst @@ -0,0 +1,9 @@ +Post-Training Compression +========================== + +.. automodule:: coremltools.optimize.coreml + + .. autofunction:: linear_quantize_weights + .. autofunction:: prune_weights + .. autofunction:: palettize_weights + .. 
autofunction:: decompress_weights \ No newline at end of file diff --git a/docs/source/coremltools.optimize.rst b/docs/source/coremltools.optimize.rst new file mode 100644 index 000000000..1137f8056 --- /dev/null +++ b/docs/source/coremltools.optimize.rst @@ -0,0 +1,22 @@ +Optimizers +=============================================== + +To deploy models on devices such as the iPhone, you often need to optimize the models to +use less storage space, reduce power consumption, and reduce latency during inference. +For an overview, see Optimizing Models Post-Training +(`Compressing ML Program Weights `_ +and `Compressing Neural Network Weights `_). + +coreml +--------------------------------------------------------- + +Optimizers that compress Core ML models: + +.. toctree:: + :maxdepth: 1 + + coremltools.optimize.coreml.post_training_quantization.rst + coremltools.optimize.coreml.config.rst + coremltools.optimize.coreml.graph.rst + + diff --git a/milstoragepython/MilStorage.cpp b/milstoragepython/MilStorage.cpp index 8a9fd7d9f..2c702ed74 100644 --- a/milstoragepython/MilStorage.cpp +++ b/milstoragepython/MilStorage.cpp @@ -29,38 +29,43 @@ MilStoragePythonWriter::MilStoragePythonWriter(const std::string& filePath, bool namespace { template u_int64_t writeData(MILBlob::Blob::StorageWriter& m_writer, - const std::vector& data) { - return m_writer.WriteData(MILBlob::Util::MakeSpan(data)); + const py::array_t& data) { + auto fpSpan = MILBlob::Util::Span(data.data(), data.size()); + return m_writer.WriteData(fpSpan); } - template <> - u_int64_t writeData(MILBlob::Blob::StorageWriter& m_writer, - const std::vector& data) { - auto intSpan = MILBlob::Util::MakeSpan(data); - auto fpSpan = MILBlob::Util::SpanCast(intSpan); - return m_writer.WriteData(fpSpan); - } } // These methods are needed in addition to the above template methods // because pybind does not allow us to expose template methods to // Python with gcc on Linux. 
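Stepping back from the diff for a moment: the new coremltools.optimize.coreml documentation pages added above describe the post-training compression entry points. A rough usage sketch (not part of this change; it assumes an existing ML program saved at model.mlpackage) for palettization with mode="kmeans" (presumably where the vendored kmeans1d dependency comes in) looks like this:

```python
import coremltools as ct
from coremltools.optimize.coreml import (
    OpPalettizerConfig,
    OptimizationConfig,
    palettize_weights,
)

mlmodel = ct.models.MLModel("model.mlpackage")
config = OptimizationConfig(global_config=OpPalettizerConfig(mode="kmeans", nbits=6))
compressed_model = palettize_weights(mlmodel, config=config)
compressed_model.save("model_palettized.mlpackage")
```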
-u_int64_t MilStoragePythonWriter::write_int8_data(const std::vector& data) { +u_int64_t MilStoragePythonWriter::write_int8_data(const py::array_t& data) { return writeData(*m_writer, data); } -u_int64_t MilStoragePythonWriter::write_uint8_data(const std::vector& data) { +u_int64_t MilStoragePythonWriter::write_uint8_data(const py::array_t& data) { return writeData(*m_writer, data); } -u_int64_t MilStoragePythonWriter::write_fp16_data(const std::vector& data) { +u_int64_t MilStoragePythonWriter::write_int16_data(const py::array_t& data) { + return writeData(*m_writer, data); +} + +u_int64_t MilStoragePythonWriter::write_uint16_data(const py::array_t& data) { return writeData(*m_writer, data); } -u_int64_t MilStoragePythonWriter::write_float_data(const std::vector& data) { - return writeData(*m_writer, data); +u_int64_t MilStoragePythonWriter::write_fp16_data(const py::array_t& data){ + + auto intSpan = MILBlob::Util::Span(data.data(), data.size()); + auto fpSpan = MILBlob::Util::SpanCast(intSpan); + + return m_writer->WriteData(fpSpan); } +u_int64_t MilStoragePythonWriter::write_float_data(const py::array_t& data){ + return writeData(*m_writer, data); +} /* * @@ -76,36 +81,40 @@ MilStoragePythonReader::MilStoragePythonReader(std::string filePath) namespace { template - const std::vector readData(MILBlob::Blob::StorageReader& m_reader, + py::array_t readData(MILBlob::Blob::StorageReader& m_reader, uint64_t offset) { - auto view = m_reader.GetDataView(offset); - return std::vector(view.begin(), view.end()); + auto spanData = m_reader.GetDataView(offset); + return py::array_t(spanData.Size(), spanData.Data()); } - - template <> - const std::vector readData(MILBlob::Blob::StorageReader& m_reader, - uint64_t offset) { - auto fpView = m_reader.GetDataView(offset); - auto intView = MILBlob::Util::SpanCast(fpView); - return std::vector(intView.begin(), intView.end()); - } } // These methods are needed in addition to the above template methods // because pybind does not allow us to expose template methods to // Python with gcc on Linux. 
-const std::vector MilStoragePythonReader::read_int8_data(uint64_t offset) { +py::array_t MilStoragePythonReader::read_int8_data(uint64_t offset) { return readData(*m_reader, offset); } -const std::vector MilStoragePythonReader::read_uint8_data(uint64_t offset) { +py::array_t MilStoragePythonReader::read_uint8_data(uint64_t offset) { return readData(*m_reader, offset); } -const std::vector MilStoragePythonReader::read_fp16_data(uint64_t offset) { +py::array_t MilStoragePythonReader::read_int16_data(uint64_t offset) { + return readData(*m_reader, offset); +} + +py::array_t MilStoragePythonReader::read_uint16_data(uint64_t offset) { return readData(*m_reader, offset); } -const std::vector MilStoragePythonReader::read_float_data(uint64_t offset) { - return readData(*m_reader, offset); +py::array_t MilStoragePythonReader::read_fp16_data(uint64_t offset) { + + auto fpView = m_reader->GetDataView(offset); + auto intView = MILBlob::Util::SpanCast(fpView); + + return py::array_t (intView.Size(), intView.Data()); } + +py::array_t MilStoragePythonReader::read_float_data(uint64_t offset) { + return readData(*m_reader, offset); +} \ No newline at end of file diff --git a/milstoragepython/MilStorage.hpp b/milstoragepython/MilStorage.hpp index 79d37af45..1e0ac7b89 100644 --- a/milstoragepython/MilStorage.hpp +++ b/milstoragepython/MilStorage.hpp @@ -5,10 +5,14 @@ #pragma once +#include +#include + #include #include #include +namespace py = pybind11; namespace MILBlob { namespace Blob { @@ -30,10 +34,12 @@ namespace CoreML { MilStoragePythonWriter(const std::string& filePath, bool truncateFile); ~MilStoragePythonWriter(); - u_int64_t write_int8_data(const std::vector& data); - u_int64_t write_uint8_data(const std::vector& data); - u_int64_t write_fp16_data(const std::vector& data); - u_int64_t write_float_data(const std::vector& data); + u_int64_t write_int8_data(const py::array_t& data); + u_int64_t write_uint8_data(const py::array_t& data); + u_int64_t write_int16_data(const py::array_t& data); + u_int64_t write_uint16_data(const py::array_t& data); + u_int64_t write_fp16_data(const py::array_t& data); + u_int64_t write_float_data(const py::array_t& data); private: std::unique_ptr m_writer; @@ -49,10 +55,13 @@ namespace CoreML { MilStoragePythonReader(std::string filePath); ~MilStoragePythonReader(); - const std::vector read_int8_data(uint64_t offset); - const std::vector read_uint8_data(uint64_t offset); - const std::vector read_fp16_data(uint64_t offset); - const std::vector read_float_data(uint64_t offset); + py::array_t read_int8_data(uint64_t offset); + py::array_t read_uint8_data(uint64_t offset); + py::array_t read_int16_data(uint64_t offset); + py::array_t read_uint16_data(uint64_t offset); + py::array_t read_fp16_data(uint64_t offset); + py::array_t read_float_data(uint64_t offset); + private: std::unique_ptr m_reader; diff --git a/milstoragepython/MilStoragePython.cpp b/milstoragepython/MilStoragePython.cpp index e427b8f04..62d4b18d8 100644 --- a/milstoragepython/MilStoragePython.cpp +++ b/milstoragepython/MilStoragePython.cpp @@ -32,32 +32,20 @@ PYBIND11_PLUGIN(libmilstoragepython) { py::module m("libmilstoragepython", "Library to create, access and edit CoreML blob files."); py::class_ blobStorageWriter(m, "_BlobStorageWriter"); - blobStorageWriter.def(py::init(), py::arg("file_name"), py::arg("truncate_file") = true) - .def("write_int8_data", [](MilStoragePythonWriter &w, py::buffer buf) { - auto info = buf.request(); - const std::vector data(static_cast(info.ptr), static_cast(info.ptr) + 
info.size); - return w.write_int8_data(data); - }) - .def("write_uint8_data", [](MilStoragePythonWriter &w, py::buffer buf) { - auto info = buf.request(); - const std::vector data(static_cast(info.ptr), static_cast(info.ptr) + info.size); - return w.write_uint8_data(data); - }) - .def("write_fp16_data", [](MilStoragePythonWriter &w, py::buffer buf) { - auto info = buf.request(); - const std::vector data(static_cast(info.ptr), static_cast(info.ptr) + info.size); - return w.write_fp16_data(data); - }) - .def("write_float_data", [](MilStoragePythonWriter &w, py::buffer buf) { - auto info = buf.request(); - const std::vector data(static_cast(info.ptr), static_cast(info.ptr) + info.size); - return w.write_float_data(data); - }); + blobStorageWriter.def(py::init(), py::arg("file_name"), py::arg("truncate_file") = true) + .def("write_int8_data", &MilStoragePythonWriter::write_int8_data) + .def("write_uint8_data", &MilStoragePythonWriter::write_uint8_data) + .def("write_int16_data", &MilStoragePythonWriter::write_int16_data) + .def("write_uint16_data", &MilStoragePythonWriter::write_uint16_data) + .def("write_fp16_data", &MilStoragePythonWriter::write_fp16_data) + .def("write_float_data", &MilStoragePythonWriter::write_float_data); py::class_ blobStorageReader(m, "_BlobStorageReader"); blobStorageReader.def(py::init()) .def("read_int8_data", &MilStoragePythonReader::read_int8_data) .def("read_uint8_data", &MilStoragePythonReader::read_uint8_data) + .def("read_int16_data", &MilStoragePythonReader::read_int16_data) + .def("read_uint16_data", &MilStoragePythonReader::read_uint16_data) .def("read_fp16_data", &MilStoragePythonReader::read_fp16_data) .def("read_float_data", &MilStoragePythonReader::read_float_data); diff --git a/mlmodel/CMakeLists.txt b/mlmodel/CMakeLists.txt index df8fb48ca..6d7b7fa66 100644 --- a/mlmodel/CMakeLists.txt +++ b/mlmodel/CMakeLists.txt @@ -2,6 +2,7 @@ include("${CMAKE_SOURCE_DIR}/cmake/coreml-utils.cmake") include_directories( .. 
+ ../deps/FP16/include ../deps/protobuf/src src ) @@ -77,6 +78,7 @@ add_library(mlmodel ${CMAKE_CURRENT_BINARY_DIR}/format/AudioFeaturePrint.pb.cc ${CMAKE_CURRENT_BINARY_DIR}/format/BayesianProbitRegressor.pb.cc ${CMAKE_CURRENT_BINARY_DIR}/format/CategoricalMapping.pb.cc + ${CMAKE_CURRENT_BINARY_DIR}/format/ClassConfidenceThresholding.pb.cc ${CMAKE_CURRENT_BINARY_DIR}/format/CustomModel.pb.cc ${CMAKE_CURRENT_BINARY_DIR}/format/DataStructures.pb.cc ${CMAKE_CURRENT_BINARY_DIR}/format/DictVectorizer.pb.cc @@ -134,6 +136,7 @@ add_library(mlmodel src/Validation/AudioFeaturePrintValidator.cpp src/Validation/BayesianProbitRegressionValidator.cpp src/Validation/CategoricalMappingValidator.cpp + src/Validation/ClassConfidenceThresholdingValidator.cpp src/Validation/CustomModelValidator.cpp src/Validation/DictVectorizerValidator.cpp src/Validation/FeatureVectorizerValidator.cpp @@ -184,6 +187,7 @@ set(proto_files AudioFeaturePrint BayesianProbitRegressor CategoricalMapping + ClassConfidenceThresholding CustomModel DataStructures DictVectorizer diff --git a/mlmodel/build/format/CategoricalMapping.pb.h b/mlmodel/build/format/CategoricalMapping.pb.h index 19c1e38ef..44b34bf39 100644 --- a/mlmodel/build/format/CategoricalMapping.pb.h +++ b/mlmodel/build/format/CategoricalMapping.pb.h @@ -101,6 +101,9 @@ extern Int64ToStringMap_MapEntryDefaultTypeInternal _Int64ToStringMap_MapEntry_d class Int64Vector; class Int64VectorDefaultTypeInternal; extern Int64VectorDefaultTypeInternal _Int64Vector_default_instance_; +class PrecisionRecallCurve; +class PrecisionRecallCurveDefaultTypeInternal; +extern PrecisionRecallCurveDefaultTypeInternal _PrecisionRecallCurve_default_instance_; class SequenceFeatureType; class SequenceFeatureTypeDefaultTypeInternal; extern SequenceFeatureTypeDefaultTypeInternal _SequenceFeatureType_default_instance_; diff --git a/mlmodel/build/format/ClassConfidenceThresholding.pb.cc b/mlmodel/build/format/ClassConfidenceThresholding.pb.cc new file mode 100644 index 000000000..a2eacc367 --- /dev/null +++ b/mlmodel/build/format/ClassConfidenceThresholding.pb.cc @@ -0,0 +1,294 @@ +// Generated by the protocol buffer compiler. DO NOT EDIT! 
+// source: ClassConfidenceThresholding.proto + +#define INTERNAL_SUPPRESS_PROTOBUF_FIELD_DEPRECATION +#include "ClassConfidenceThresholding.pb.h" + +#include + +#include +#include +#include +#include +#include +#include +// @@protoc_insertion_point(includes) + +namespace CoreML { +namespace Specification { +class ClassConfidenceThresholdingDefaultTypeInternal : public ::google::protobuf::internal::ExplicitlyConstructed { +} _ClassConfidenceThresholding_default_instance_; + +namespace protobuf_ClassConfidenceThresholding_2eproto { + +PROTOBUF_CONSTEXPR_VAR ::google::protobuf::internal::ParseTableField + const TableStruct::entries[] = { + {0, 0, 0, ::google::protobuf::internal::kInvalidMask, 0, 0}, +}; + +PROTOBUF_CONSTEXPR_VAR ::google::protobuf::internal::AuxillaryParseTableField + const TableStruct::aux[] = { + ::google::protobuf::internal::AuxillaryParseTableField(), +}; +PROTOBUF_CONSTEXPR_VAR ::google::protobuf::internal::ParseTable const + TableStruct::schema[] = { + { NULL, NULL, 0, -1, -1, false }, +}; + + +void TableStruct::Shutdown() { + _ClassConfidenceThresholding_default_instance_.Shutdown(); +} + +void TableStruct::InitDefaultsImpl() { + GOOGLE_PROTOBUF_VERIFY_VERSION; + + ::google::protobuf::internal::InitProtobufDefaults(); + ::CoreML::Specification::protobuf_DataStructures_2eproto::InitDefaults(); + _ClassConfidenceThresholding_default_instance_.DefaultConstruct(); +} + +void InitDefaults() { + static GOOGLE_PROTOBUF_DECLARE_ONCE(once); + ::google::protobuf::GoogleOnceInit(&once, &TableStruct::InitDefaultsImpl); +} +void AddDescriptorsImpl() { + InitDefaults(); + ::CoreML::Specification::protobuf_DataStructures_2eproto::AddDescriptors(); + ::google::protobuf::internal::OnShutdown(&TableStruct::Shutdown); +} + +void AddDescriptors() { + static GOOGLE_PROTOBUF_DECLARE_ONCE(once); + ::google::protobuf::GoogleOnceInit(&once, &AddDescriptorsImpl); +} +#ifdef GOOGLE_PROTOBUF_NO_STATIC_INITIALIZER +// Force AddDescriptors() to be called at static initialization time. 
+struct StaticDescriptorInitializer { + StaticDescriptorInitializer() { + AddDescriptors(); + } +} static_descriptor_initializer; +#endif // GOOGLE_PROTOBUF_NO_STATIC_INITIALIZER + +} // namespace protobuf_ClassConfidenceThresholding_2eproto + + +// =================================================================== + +#if !defined(_MSC_VER) || _MSC_VER >= 1900 +const int ClassConfidenceThresholding::kPrecisionRecallCurvesFieldNumber; +#endif // !defined(_MSC_VER) || _MSC_VER >= 1900 + +ClassConfidenceThresholding::ClassConfidenceThresholding() + : ::google::protobuf::MessageLite(), _internal_metadata_(NULL) { + if (GOOGLE_PREDICT_TRUE(this != internal_default_instance())) { + protobuf_ClassConfidenceThresholding_2eproto::InitDefaults(); + } + SharedCtor(); + // @@protoc_insertion_point(constructor:CoreML.Specification.ClassConfidenceThresholding) +} +ClassConfidenceThresholding::ClassConfidenceThresholding(const ClassConfidenceThresholding& from) + : ::google::protobuf::MessageLite(), + _internal_metadata_(NULL), + precisionrecallcurves_(from.precisionrecallcurves_), + _cached_size_(0) { + _internal_metadata_.MergeFrom(from._internal_metadata_); + // @@protoc_insertion_point(copy_constructor:CoreML.Specification.ClassConfidenceThresholding) +} + +void ClassConfidenceThresholding::SharedCtor() { + _cached_size_ = 0; +} + +ClassConfidenceThresholding::~ClassConfidenceThresholding() { + // @@protoc_insertion_point(destructor:CoreML.Specification.ClassConfidenceThresholding) + SharedDtor(); +} + +void ClassConfidenceThresholding::SharedDtor() { +} + +void ClassConfidenceThresholding::SetCachedSize(int size) const { + GOOGLE_SAFE_CONCURRENT_WRITES_BEGIN(); + _cached_size_ = size; + GOOGLE_SAFE_CONCURRENT_WRITES_END(); +} +const ClassConfidenceThresholding& ClassConfidenceThresholding::default_instance() { + protobuf_ClassConfidenceThresholding_2eproto::InitDefaults(); + return *internal_default_instance(); +} + +ClassConfidenceThresholding* ClassConfidenceThresholding::New(::google::protobuf::Arena* arena) const { + ClassConfidenceThresholding* n = new ClassConfidenceThresholding; + if (arena != NULL) { + arena->Own(n); + } + return n; +} + +void ClassConfidenceThresholding::Clear() { +// @@protoc_insertion_point(message_clear_start:CoreML.Specification.ClassConfidenceThresholding) + precisionrecallcurves_.Clear(); +} + +bool ClassConfidenceThresholding::MergePartialFromCodedStream( + ::google::protobuf::io::CodedInputStream* input) { +#define DO_(EXPRESSION) if (!GOOGLE_PREDICT_TRUE(EXPRESSION)) goto failure + ::google::protobuf::uint32 tag; + // @@protoc_insertion_point(parse_start:CoreML.Specification.ClassConfidenceThresholding) + for (;;) { + ::std::pair< ::google::protobuf::uint32, bool> p = input->ReadTagWithCutoffNoLastTag(16383u); + tag = p.first; + if (!p.second) goto handle_unusual; + switch (::google::protobuf::internal::WireFormatLite::GetTagFieldNumber(tag)) { + // repeated .CoreML.Specification.PrecisionRecallCurve precisionRecallCurves = 100; + case 100: { + if (static_cast< ::google::protobuf::uint8>(tag) == + static_cast< ::google::protobuf::uint8>(802u)) { + DO_(::google::protobuf::internal::WireFormatLite::ReadMessageNoVirtual( + input, add_precisionrecallcurves())); + } else { + goto handle_unusual; + } + break; + } + + default: { + handle_unusual: + if (tag == 0 || + ::google::protobuf::internal::WireFormatLite::GetTagWireType(tag) == + ::google::protobuf::internal::WireFormatLite::WIRETYPE_END_GROUP) { + goto success; + } + 
DO_(::google::protobuf::internal::WireFormatLite::SkipField(input, tag)); + break; + } + } + } +success: + // @@protoc_insertion_point(parse_success:CoreML.Specification.ClassConfidenceThresholding) + return true; +failure: + // @@protoc_insertion_point(parse_failure:CoreML.Specification.ClassConfidenceThresholding) + return false; +#undef DO_ +} + +void ClassConfidenceThresholding::SerializeWithCachedSizes( + ::google::protobuf::io::CodedOutputStream* output) const { + // @@protoc_insertion_point(serialize_start:CoreML.Specification.ClassConfidenceThresholding) + ::google::protobuf::uint32 cached_has_bits = 0; + (void) cached_has_bits; + + // repeated .CoreML.Specification.PrecisionRecallCurve precisionRecallCurves = 100; + for (unsigned int i = 0, n = this->precisionrecallcurves_size(); i < n; i++) { + ::google::protobuf::internal::WireFormatLite::WriteMessage( + 100, this->precisionrecallcurves(i), output); + } + + // @@protoc_insertion_point(serialize_end:CoreML.Specification.ClassConfidenceThresholding) +} + +size_t ClassConfidenceThresholding::ByteSizeLong() const { +// @@protoc_insertion_point(message_byte_size_start:CoreML.Specification.ClassConfidenceThresholding) + size_t total_size = 0; + + // repeated .CoreML.Specification.PrecisionRecallCurve precisionRecallCurves = 100; + { + unsigned int count = this->precisionrecallcurves_size(); + total_size += 2UL * count; + for (unsigned int i = 0; i < count; i++) { + total_size += + ::google::protobuf::internal::WireFormatLite::MessageSizeNoVirtual( + this->precisionrecallcurves(i)); + } + } + + int cached_size = ::google::protobuf::internal::ToCachedSize(total_size); + GOOGLE_SAFE_CONCURRENT_WRITES_BEGIN(); + _cached_size_ = cached_size; + GOOGLE_SAFE_CONCURRENT_WRITES_END(); + return total_size; +} + +void ClassConfidenceThresholding::CheckTypeAndMergeFrom( + const ::google::protobuf::MessageLite& from) { + MergeFrom(*::google::protobuf::down_cast(&from)); +} + +void ClassConfidenceThresholding::MergeFrom(const ClassConfidenceThresholding& from) { +// @@protoc_insertion_point(class_specific_merge_from_start:CoreML.Specification.ClassConfidenceThresholding) + GOOGLE_DCHECK_NE(&from, this); + _internal_metadata_.MergeFrom(from._internal_metadata_); + ::google::protobuf::uint32 cached_has_bits = 0; + (void) cached_has_bits; + + precisionrecallcurves_.MergeFrom(from.precisionrecallcurves_); +} + +void ClassConfidenceThresholding::CopyFrom(const ClassConfidenceThresholding& from) { +// @@protoc_insertion_point(class_specific_copy_from_start:CoreML.Specification.ClassConfidenceThresholding) + if (&from == this) return; + Clear(); + MergeFrom(from); +} + +bool ClassConfidenceThresholding::IsInitialized() const { + return true; +} + +void ClassConfidenceThresholding::Swap(ClassConfidenceThresholding* other) { + if (other == this) return; + InternalSwap(other); +} +void ClassConfidenceThresholding::InternalSwap(ClassConfidenceThresholding* other) { + precisionrecallcurves_.InternalSwap(&other->precisionrecallcurves_); + std::swap(_cached_size_, other->_cached_size_); +} + +::std::string ClassConfidenceThresholding::GetTypeName() const { + return "CoreML.Specification.ClassConfidenceThresholding"; +} + +#if PROTOBUF_INLINE_NOT_IN_HEADERS +// ClassConfidenceThresholding + +// repeated .CoreML.Specification.PrecisionRecallCurve precisionRecallCurves = 100; +int ClassConfidenceThresholding::precisionrecallcurves_size() const { + return precisionrecallcurves_.size(); +} +void ClassConfidenceThresholding::clear_precisionrecallcurves() { + 
precisionrecallcurves_.Clear(); +} +const ::CoreML::Specification::PrecisionRecallCurve& ClassConfidenceThresholding::precisionrecallcurves(int index) const { + // @@protoc_insertion_point(field_get:CoreML.Specification.ClassConfidenceThresholding.precisionRecallCurves) + return precisionrecallcurves_.Get(index); +} +::CoreML::Specification::PrecisionRecallCurve* ClassConfidenceThresholding::mutable_precisionrecallcurves(int index) { + // @@protoc_insertion_point(field_mutable:CoreML.Specification.ClassConfidenceThresholding.precisionRecallCurves) + return precisionrecallcurves_.Mutable(index); +} +::CoreML::Specification::PrecisionRecallCurve* ClassConfidenceThresholding::add_precisionrecallcurves() { + // @@protoc_insertion_point(field_add:CoreML.Specification.ClassConfidenceThresholding.precisionRecallCurves) + return precisionrecallcurves_.Add(); +} +::google::protobuf::RepeatedPtrField< ::CoreML::Specification::PrecisionRecallCurve >* +ClassConfidenceThresholding::mutable_precisionrecallcurves() { + // @@protoc_insertion_point(field_mutable_list:CoreML.Specification.ClassConfidenceThresholding.precisionRecallCurves) + return &precisionrecallcurves_; +} +const ::google::protobuf::RepeatedPtrField< ::CoreML::Specification::PrecisionRecallCurve >& +ClassConfidenceThresholding::precisionrecallcurves() const { + // @@protoc_insertion_point(field_list:CoreML.Specification.ClassConfidenceThresholding.precisionRecallCurves) + return precisionrecallcurves_; +} + +#endif // PROTOBUF_INLINE_NOT_IN_HEADERS + +// @@protoc_insertion_point(namespace_scope) + +} // namespace Specification +} // namespace CoreML + +// @@protoc_insertion_point(global_scope) diff --git a/mlmodel/build/format/ClassConfidenceThresholding.pb.h b/mlmodel/build/format/ClassConfidenceThresholding.pb.h new file mode 100644 index 000000000..445ef7bf3 --- /dev/null +++ b/mlmodel/build/format/ClassConfidenceThresholding.pb.h @@ -0,0 +1,283 @@ +// Generated by the protocol buffer compiler. DO NOT EDIT! +// source: ClassConfidenceThresholding.proto + +#ifndef PROTOBUF_ClassConfidenceThresholding_2eproto__INCLUDED +#define PROTOBUF_ClassConfidenceThresholding_2eproto__INCLUDED + +#include + +#include + +#if GOOGLE_PROTOBUF_VERSION < 3003000 +#error This file was generated by a newer version of protoc which is +#error incompatible with your Protocol Buffer headers. Please update +#error your headers. +#endif +#if 3003000 < GOOGLE_PROTOBUF_MIN_PROTOC_VERSION +#error This file was generated by an older version of protoc which is +#error incompatible with your Protocol Buffer headers. Please +#error regenerate this file with a newer version of protoc. 
+#endif + +#include +#include +#include +#include +#include +#include +#include +#include // IWYU pragma: export +#include // IWYU pragma: export +#include "DataStructures.pb.h" // IWYU pragma: export +// @@protoc_insertion_point(includes) +namespace CoreML { +namespace Specification { +class ArrayFeatureType; +class ArrayFeatureTypeDefaultTypeInternal; +extern ArrayFeatureTypeDefaultTypeInternal _ArrayFeatureType_default_instance_; +class ArrayFeatureType_EnumeratedShapes; +class ArrayFeatureType_EnumeratedShapesDefaultTypeInternal; +extern ArrayFeatureType_EnumeratedShapesDefaultTypeInternal _ArrayFeatureType_EnumeratedShapes_default_instance_; +class ArrayFeatureType_Shape; +class ArrayFeatureType_ShapeDefaultTypeInternal; +extern ArrayFeatureType_ShapeDefaultTypeInternal _ArrayFeatureType_Shape_default_instance_; +class ArrayFeatureType_ShapeRange; +class ArrayFeatureType_ShapeRangeDefaultTypeInternal; +extern ArrayFeatureType_ShapeRangeDefaultTypeInternal _ArrayFeatureType_ShapeRange_default_instance_; +class ClassConfidenceThresholding; +class ClassConfidenceThresholdingDefaultTypeInternal; +extern ClassConfidenceThresholdingDefaultTypeInternal _ClassConfidenceThresholding_default_instance_; +class DictionaryFeatureType; +class DictionaryFeatureTypeDefaultTypeInternal; +extern DictionaryFeatureTypeDefaultTypeInternal _DictionaryFeatureType_default_instance_; +class DoubleFeatureType; +class DoubleFeatureTypeDefaultTypeInternal; +extern DoubleFeatureTypeDefaultTypeInternal _DoubleFeatureType_default_instance_; +class DoubleRange; +class DoubleRangeDefaultTypeInternal; +extern DoubleRangeDefaultTypeInternal _DoubleRange_default_instance_; +class DoubleVector; +class DoubleVectorDefaultTypeInternal; +extern DoubleVectorDefaultTypeInternal _DoubleVector_default_instance_; +class FeatureType; +class FeatureTypeDefaultTypeInternal; +extern FeatureTypeDefaultTypeInternal _FeatureType_default_instance_; +class FloatVector; +class FloatVectorDefaultTypeInternal; +extern FloatVectorDefaultTypeInternal _FloatVector_default_instance_; +class ImageFeatureType; +class ImageFeatureTypeDefaultTypeInternal; +extern ImageFeatureTypeDefaultTypeInternal _ImageFeatureType_default_instance_; +class ImageFeatureType_EnumeratedImageSizes; +class ImageFeatureType_EnumeratedImageSizesDefaultTypeInternal; +extern ImageFeatureType_EnumeratedImageSizesDefaultTypeInternal _ImageFeatureType_EnumeratedImageSizes_default_instance_; +class ImageFeatureType_ImageSize; +class ImageFeatureType_ImageSizeDefaultTypeInternal; +extern ImageFeatureType_ImageSizeDefaultTypeInternal _ImageFeatureType_ImageSize_default_instance_; +class ImageFeatureType_ImageSizeRange; +class ImageFeatureType_ImageSizeRangeDefaultTypeInternal; +extern ImageFeatureType_ImageSizeRangeDefaultTypeInternal _ImageFeatureType_ImageSizeRange_default_instance_; +class Int64FeatureType; +class Int64FeatureTypeDefaultTypeInternal; +extern Int64FeatureTypeDefaultTypeInternal _Int64FeatureType_default_instance_; +class Int64Range; +class Int64RangeDefaultTypeInternal; +extern Int64RangeDefaultTypeInternal _Int64Range_default_instance_; +class Int64Set; +class Int64SetDefaultTypeInternal; +extern Int64SetDefaultTypeInternal _Int64Set_default_instance_; +class Int64ToDoubleMap; +class Int64ToDoubleMapDefaultTypeInternal; +extern Int64ToDoubleMapDefaultTypeInternal _Int64ToDoubleMap_default_instance_; +class Int64ToDoubleMap_MapEntry; +class Int64ToDoubleMap_MapEntryDefaultTypeInternal; +extern Int64ToDoubleMap_MapEntryDefaultTypeInternal 
_Int64ToDoubleMap_MapEntry_default_instance_; +class Int64ToStringMap; +class Int64ToStringMapDefaultTypeInternal; +extern Int64ToStringMapDefaultTypeInternal _Int64ToStringMap_default_instance_; +class Int64ToStringMap_MapEntry; +class Int64ToStringMap_MapEntryDefaultTypeInternal; +extern Int64ToStringMap_MapEntryDefaultTypeInternal _Int64ToStringMap_MapEntry_default_instance_; +class Int64Vector; +class Int64VectorDefaultTypeInternal; +extern Int64VectorDefaultTypeInternal _Int64Vector_default_instance_; +class PrecisionRecallCurve; +class PrecisionRecallCurveDefaultTypeInternal; +extern PrecisionRecallCurveDefaultTypeInternal _PrecisionRecallCurve_default_instance_; +class SequenceFeatureType; +class SequenceFeatureTypeDefaultTypeInternal; +extern SequenceFeatureTypeDefaultTypeInternal _SequenceFeatureType_default_instance_; +class SizeRange; +class SizeRangeDefaultTypeInternal; +extern SizeRangeDefaultTypeInternal _SizeRange_default_instance_; +class StringFeatureType; +class StringFeatureTypeDefaultTypeInternal; +extern StringFeatureTypeDefaultTypeInternal _StringFeatureType_default_instance_; +class StringToDoubleMap; +class StringToDoubleMapDefaultTypeInternal; +extern StringToDoubleMapDefaultTypeInternal _StringToDoubleMap_default_instance_; +class StringToDoubleMap_MapEntry; +class StringToDoubleMap_MapEntryDefaultTypeInternal; +extern StringToDoubleMap_MapEntryDefaultTypeInternal _StringToDoubleMap_MapEntry_default_instance_; +class StringToInt64Map; +class StringToInt64MapDefaultTypeInternal; +extern StringToInt64MapDefaultTypeInternal _StringToInt64Map_default_instance_; +class StringToInt64Map_MapEntry; +class StringToInt64Map_MapEntryDefaultTypeInternal; +extern StringToInt64Map_MapEntryDefaultTypeInternal _StringToInt64Map_MapEntry_default_instance_; +class StringVector; +class StringVectorDefaultTypeInternal; +extern StringVectorDefaultTypeInternal _StringVector_default_instance_; +} // namespace Specification +} // namespace CoreML + +namespace CoreML { +namespace Specification { + +namespace protobuf_ClassConfidenceThresholding_2eproto { +// Internal implementation detail -- do not call these. 
+struct TableStruct { + static const ::google::protobuf::internal::ParseTableField entries[]; + static const ::google::protobuf::internal::AuxillaryParseTableField aux[]; + static const ::google::protobuf::internal::ParseTable schema[]; + static const ::google::protobuf::uint32 offsets[]; + static void InitDefaultsImpl(); + static void Shutdown(); +}; +void AddDescriptors(); +void InitDefaults(); +} // namespace protobuf_ClassConfidenceThresholding_2eproto + +// =================================================================== + +class ClassConfidenceThresholding : public ::google::protobuf::MessageLite /* @@protoc_insertion_point(class_definition:CoreML.Specification.ClassConfidenceThresholding) */ { + public: + ClassConfidenceThresholding(); + virtual ~ClassConfidenceThresholding(); + + ClassConfidenceThresholding(const ClassConfidenceThresholding& from); + + inline ClassConfidenceThresholding& operator=(const ClassConfidenceThresholding& from) { + CopyFrom(from); + return *this; + } + + static const ClassConfidenceThresholding& default_instance(); + + static inline const ClassConfidenceThresholding* internal_default_instance() { + return reinterpret_cast( + &_ClassConfidenceThresholding_default_instance_); + } + static PROTOBUF_CONSTEXPR int const kIndexInFileMessages = + 0; + + void Swap(ClassConfidenceThresholding* other); + + // implements Message ---------------------------------------------- + + inline ClassConfidenceThresholding* New() const PROTOBUF_FINAL { return New(NULL); } + + ClassConfidenceThresholding* New(::google::protobuf::Arena* arena) const PROTOBUF_FINAL; + void CheckTypeAndMergeFrom(const ::google::protobuf::MessageLite& from) + PROTOBUF_FINAL; + void CopyFrom(const ClassConfidenceThresholding& from); + void MergeFrom(const ClassConfidenceThresholding& from); + void Clear() PROTOBUF_FINAL; + bool IsInitialized() const PROTOBUF_FINAL; + + size_t ByteSizeLong() const PROTOBUF_FINAL; + bool MergePartialFromCodedStream( + ::google::protobuf::io::CodedInputStream* input) PROTOBUF_FINAL; + void SerializeWithCachedSizes( + ::google::protobuf::io::CodedOutputStream* output) const PROTOBUF_FINAL; + void DiscardUnknownFields(); + int GetCachedSize() const PROTOBUF_FINAL { return _cached_size_; } + private: + void SharedCtor(); + void SharedDtor(); + void SetCachedSize(int size) const; + void InternalSwap(ClassConfidenceThresholding* other); + private: + inline ::google::protobuf::Arena* GetArenaNoVirtual() const { + return NULL; + } + inline void* MaybeArenaPtr() const { + return NULL; + } + public: + + ::std::string GetTypeName() const PROTOBUF_FINAL; + + // nested types ---------------------------------------------------- + + // accessors ------------------------------------------------------- + + // repeated .CoreML.Specification.PrecisionRecallCurve precisionRecallCurves = 100; + int precisionrecallcurves_size() const; + void clear_precisionrecallcurves(); + static const int kPrecisionRecallCurvesFieldNumber = 100; + const ::CoreML::Specification::PrecisionRecallCurve& precisionrecallcurves(int index) const; + ::CoreML::Specification::PrecisionRecallCurve* mutable_precisionrecallcurves(int index); + ::CoreML::Specification::PrecisionRecallCurve* add_precisionrecallcurves(); + ::google::protobuf::RepeatedPtrField< ::CoreML::Specification::PrecisionRecallCurve >* + mutable_precisionrecallcurves(); + const ::google::protobuf::RepeatedPtrField< ::CoreML::Specification::PrecisionRecallCurve >& + precisionrecallcurves() const; + + // 
@@protoc_insertion_point(class_scope:CoreML.Specification.ClassConfidenceThresholding) + private: + + ::google::protobuf::internal::InternalMetadataWithArenaLite _internal_metadata_; + ::google::protobuf::RepeatedPtrField< ::CoreML::Specification::PrecisionRecallCurve > precisionrecallcurves_; + mutable int _cached_size_; + friend struct protobuf_ClassConfidenceThresholding_2eproto::TableStruct; +}; +// =================================================================== + + +// =================================================================== + +#if !PROTOBUF_INLINE_NOT_IN_HEADERS +// ClassConfidenceThresholding + +// repeated .CoreML.Specification.PrecisionRecallCurve precisionRecallCurves = 100; +inline int ClassConfidenceThresholding::precisionrecallcurves_size() const { + return precisionrecallcurves_.size(); +} +inline void ClassConfidenceThresholding::clear_precisionrecallcurves() { + precisionrecallcurves_.Clear(); +} +inline const ::CoreML::Specification::PrecisionRecallCurve& ClassConfidenceThresholding::precisionrecallcurves(int index) const { + // @@protoc_insertion_point(field_get:CoreML.Specification.ClassConfidenceThresholding.precisionRecallCurves) + return precisionrecallcurves_.Get(index); +} +inline ::CoreML::Specification::PrecisionRecallCurve* ClassConfidenceThresholding::mutable_precisionrecallcurves(int index) { + // @@protoc_insertion_point(field_mutable:CoreML.Specification.ClassConfidenceThresholding.precisionRecallCurves) + return precisionrecallcurves_.Mutable(index); +} +inline ::CoreML::Specification::PrecisionRecallCurve* ClassConfidenceThresholding::add_precisionrecallcurves() { + // @@protoc_insertion_point(field_add:CoreML.Specification.ClassConfidenceThresholding.precisionRecallCurves) + return precisionrecallcurves_.Add(); +} +inline ::google::protobuf::RepeatedPtrField< ::CoreML::Specification::PrecisionRecallCurve >* +ClassConfidenceThresholding::mutable_precisionrecallcurves() { + // @@protoc_insertion_point(field_mutable_list:CoreML.Specification.ClassConfidenceThresholding.precisionRecallCurves) + return &precisionrecallcurves_; +} +inline const ::google::protobuf::RepeatedPtrField< ::CoreML::Specification::PrecisionRecallCurve >& +ClassConfidenceThresholding::precisionrecallcurves() const { + // @@protoc_insertion_point(field_list:CoreML.Specification.ClassConfidenceThresholding.precisionRecallCurves) + return precisionrecallcurves_; +} + +#endif // !PROTOBUF_INLINE_NOT_IN_HEADERS + +// @@protoc_insertion_point(namespace_scope) + + +} // namespace Specification +} // namespace CoreML + +// @@protoc_insertion_point(global_scope) + +#endif // PROTOBUF_ClassConfidenceThresholding_2eproto__INCLUDED diff --git a/mlmodel/build/format/ClassConfidenceThresholding_enums.h b/mlmodel/build/format/ClassConfidenceThresholding_enums.h new file mode 100644 index 000000000..57c41f757 --- /dev/null +++ b/mlmodel/build/format/ClassConfidenceThresholding_enums.h @@ -0,0 +1,3 @@ +#ifndef __CLASSCONFIDENCETHRESHOLDING_ENUMS_H +#define __CLASSCONFIDENCETHRESHOLDING_ENUMS_H +#endif diff --git a/mlmodel/build/format/DataStructures.pb.cc b/mlmodel/build/format/DataStructures.pb.cc index 8d8521ccf..840ab3eec 100644 --- a/mlmodel/build/format/DataStructures.pb.cc +++ b/mlmodel/build/format/DataStructures.pb.cc @@ -46,6 +46,8 @@ class Int64SetDefaultTypeInternal : public ::google::protobuf::internal::Explici } _Int64Set_default_instance_; class DoubleRangeDefaultTypeInternal : public ::google::protobuf::internal::ExplicitlyConstructed { } _DoubleRange_default_instance_; 
+class PrecisionRecallCurveDefaultTypeInternal : public ::google::protobuf::internal::ExplicitlyConstructed { +} _PrecisionRecallCurve_default_instance_; namespace protobuf_DataStructures_2eproto { @@ -75,6 +77,7 @@ PROTOBUF_CONSTEXPR_VAR ::google::protobuf::internal::ParseTable const { NULL, NULL, 0, -1, -1, false }, { NULL, NULL, 0, -1, -1, false }, { NULL, NULL, 0, -1, -1, false }, + { NULL, NULL, 0, -1, -1, false }, }; @@ -90,6 +93,7 @@ void TableStruct::Shutdown() { _Int64Range_default_instance_.Shutdown(); _Int64Set_default_instance_.Shutdown(); _DoubleRange_default_instance_.Shutdown(); + _PrecisionRecallCurve_default_instance_.Shutdown(); } void TableStruct::InitDefaultsImpl() { @@ -112,6 +116,7 @@ void TableStruct::InitDefaultsImpl() { _Int64Range_default_instance_.DefaultConstruct(); _Int64Set_default_instance_.DefaultConstruct(); _DoubleRange_default_instance_.DefaultConstruct(); + _PrecisionRecallCurve_default_instance_.DefaultConstruct(); _StringToInt64Map_MapEntry_default_instance_.get_mutable()->set_default_instance(_StringToInt64Map_MapEntry_default_instance_.get_mutable()); _StringToInt64Map_MapEntry_default_instance_.get_mutable()->InitAsDefaultInstance(); _Int64ToStringMap_MapEntry_default_instance_.get_mutable()->set_default_instance(_Int64ToStringMap_MapEntry_default_instance_.get_mutable()); @@ -120,6 +125,14 @@ void TableStruct::InitDefaultsImpl() { _StringToDoubleMap_MapEntry_default_instance_.get_mutable()->InitAsDefaultInstance(); _Int64ToDoubleMap_MapEntry_default_instance_.get_mutable()->set_default_instance(_Int64ToDoubleMap_MapEntry_default_instance_.get_mutable()); _Int64ToDoubleMap_MapEntry_default_instance_.get_mutable()->InitAsDefaultInstance(); + _PrecisionRecallCurve_default_instance_.get_mutable()->precisionvalues_ = const_cast< ::CoreML::Specification::FloatVector*>( + ::CoreML::Specification::FloatVector::internal_default_instance()); + _PrecisionRecallCurve_default_instance_.get_mutable()->precisionconfidencethresholds_ = const_cast< ::CoreML::Specification::FloatVector*>( + ::CoreML::Specification::FloatVector::internal_default_instance()); + _PrecisionRecallCurve_default_instance_.get_mutable()->recallvalues_ = const_cast< ::CoreML::Specification::FloatVector*>( + ::CoreML::Specification::FloatVector::internal_default_instance()); + _PrecisionRecallCurve_default_instance_.get_mutable()->recallconfidencethresholds_ = const_cast< ::CoreML::Specification::FloatVector*>( + ::CoreML::Specification::FloatVector::internal_default_instance()); } void InitDefaults() { @@ -2825,6 +2838,480 @@ void DoubleRange::set_maxvalue(double value) { #endif // PROTOBUF_INLINE_NOT_IN_HEADERS +// =================================================================== + +#if !defined(_MSC_VER) || _MSC_VER >= 1900 +const int PrecisionRecallCurve::kPrecisionValuesFieldNumber; +const int PrecisionRecallCurve::kPrecisionConfidenceThresholdsFieldNumber; +const int PrecisionRecallCurve::kRecallValuesFieldNumber; +const int PrecisionRecallCurve::kRecallConfidenceThresholdsFieldNumber; +#endif // !defined(_MSC_VER) || _MSC_VER >= 1900 + +PrecisionRecallCurve::PrecisionRecallCurve() + : ::google::protobuf::MessageLite(), _internal_metadata_(NULL) { + if (GOOGLE_PREDICT_TRUE(this != internal_default_instance())) { + protobuf_DataStructures_2eproto::InitDefaults(); + } + SharedCtor(); + // @@protoc_insertion_point(constructor:CoreML.Specification.PrecisionRecallCurve) +} +PrecisionRecallCurve::PrecisionRecallCurve(const PrecisionRecallCurve& from) + : ::google::protobuf::MessageLite(), + 
_internal_metadata_(NULL), + _cached_size_(0) { + _internal_metadata_.MergeFrom(from._internal_metadata_); + if (from.has_precisionvalues()) { + precisionvalues_ = new ::CoreML::Specification::FloatVector(*from.precisionvalues_); + } else { + precisionvalues_ = NULL; + } + if (from.has_precisionconfidencethresholds()) { + precisionconfidencethresholds_ = new ::CoreML::Specification::FloatVector(*from.precisionconfidencethresholds_); + } else { + precisionconfidencethresholds_ = NULL; + } + if (from.has_recallvalues()) { + recallvalues_ = new ::CoreML::Specification::FloatVector(*from.recallvalues_); + } else { + recallvalues_ = NULL; + } + if (from.has_recallconfidencethresholds()) { + recallconfidencethresholds_ = new ::CoreML::Specification::FloatVector(*from.recallconfidencethresholds_); + } else { + recallconfidencethresholds_ = NULL; + } + // @@protoc_insertion_point(copy_constructor:CoreML.Specification.PrecisionRecallCurve) +} + +void PrecisionRecallCurve::SharedCtor() { + ::memset(&precisionvalues_, 0, reinterpret_cast(&recallconfidencethresholds_) - + reinterpret_cast(&precisionvalues_) + sizeof(recallconfidencethresholds_)); + _cached_size_ = 0; +} + +PrecisionRecallCurve::~PrecisionRecallCurve() { + // @@protoc_insertion_point(destructor:CoreML.Specification.PrecisionRecallCurve) + SharedDtor(); +} + +void PrecisionRecallCurve::SharedDtor() { + if (this != internal_default_instance()) { + delete precisionvalues_; + } + if (this != internal_default_instance()) { + delete precisionconfidencethresholds_; + } + if (this != internal_default_instance()) { + delete recallvalues_; + } + if (this != internal_default_instance()) { + delete recallconfidencethresholds_; + } +} + +void PrecisionRecallCurve::SetCachedSize(int size) const { + GOOGLE_SAFE_CONCURRENT_WRITES_BEGIN(); + _cached_size_ = size; + GOOGLE_SAFE_CONCURRENT_WRITES_END(); +} +const PrecisionRecallCurve& PrecisionRecallCurve::default_instance() { + protobuf_DataStructures_2eproto::InitDefaults(); + return *internal_default_instance(); +} + +PrecisionRecallCurve* PrecisionRecallCurve::New(::google::protobuf::Arena* arena) const { + PrecisionRecallCurve* n = new PrecisionRecallCurve; + if (arena != NULL) { + arena->Own(n); + } + return n; +} + +void PrecisionRecallCurve::Clear() { +// @@protoc_insertion_point(message_clear_start:CoreML.Specification.PrecisionRecallCurve) + if (GetArenaNoVirtual() == NULL && precisionvalues_ != NULL) { + delete precisionvalues_; + } + precisionvalues_ = NULL; + if (GetArenaNoVirtual() == NULL && precisionconfidencethresholds_ != NULL) { + delete precisionconfidencethresholds_; + } + precisionconfidencethresholds_ = NULL; + if (GetArenaNoVirtual() == NULL && recallvalues_ != NULL) { + delete recallvalues_; + } + recallvalues_ = NULL; + if (GetArenaNoVirtual() == NULL && recallconfidencethresholds_ != NULL) { + delete recallconfidencethresholds_; + } + recallconfidencethresholds_ = NULL; +} + +bool PrecisionRecallCurve::MergePartialFromCodedStream( + ::google::protobuf::io::CodedInputStream* input) { +#define DO_(EXPRESSION) if (!GOOGLE_PREDICT_TRUE(EXPRESSION)) goto failure + ::google::protobuf::uint32 tag; + // @@protoc_insertion_point(parse_start:CoreML.Specification.PrecisionRecallCurve) + for (;;) { + ::std::pair< ::google::protobuf::uint32, bool> p = input->ReadTagWithCutoffNoLastTag(127u); + tag = p.first; + if (!p.second) goto handle_unusual; + switch (::google::protobuf::internal::WireFormatLite::GetTagFieldNumber(tag)) { + // .CoreML.Specification.FloatVector precisionValues = 1; + 
case 1: { + if (static_cast< ::google::protobuf::uint8>(tag) == + static_cast< ::google::protobuf::uint8>(10u)) { + DO_(::google::protobuf::internal::WireFormatLite::ReadMessageNoVirtual( + input, mutable_precisionvalues())); + } else { + goto handle_unusual; + } + break; + } + + // .CoreML.Specification.FloatVector precisionConfidenceThresholds = 2; + case 2: { + if (static_cast< ::google::protobuf::uint8>(tag) == + static_cast< ::google::protobuf::uint8>(18u)) { + DO_(::google::protobuf::internal::WireFormatLite::ReadMessageNoVirtual( + input, mutable_precisionconfidencethresholds())); + } else { + goto handle_unusual; + } + break; + } + + // .CoreML.Specification.FloatVector recallValues = 3; + case 3: { + if (static_cast< ::google::protobuf::uint8>(tag) == + static_cast< ::google::protobuf::uint8>(26u)) { + DO_(::google::protobuf::internal::WireFormatLite::ReadMessageNoVirtual( + input, mutable_recallvalues())); + } else { + goto handle_unusual; + } + break; + } + + // .CoreML.Specification.FloatVector recallConfidenceThresholds = 4; + case 4: { + if (static_cast< ::google::protobuf::uint8>(tag) == + static_cast< ::google::protobuf::uint8>(34u)) { + DO_(::google::protobuf::internal::WireFormatLite::ReadMessageNoVirtual( + input, mutable_recallconfidencethresholds())); + } else { + goto handle_unusual; + } + break; + } + + default: { + handle_unusual: + if (tag == 0 || + ::google::protobuf::internal::WireFormatLite::GetTagWireType(tag) == + ::google::protobuf::internal::WireFormatLite::WIRETYPE_END_GROUP) { + goto success; + } + DO_(::google::protobuf::internal::WireFormatLite::SkipField(input, tag)); + break; + } + } + } +success: + // @@protoc_insertion_point(parse_success:CoreML.Specification.PrecisionRecallCurve) + return true; +failure: + // @@protoc_insertion_point(parse_failure:CoreML.Specification.PrecisionRecallCurve) + return false; +#undef DO_ +} + +void PrecisionRecallCurve::SerializeWithCachedSizes( + ::google::protobuf::io::CodedOutputStream* output) const { + // @@protoc_insertion_point(serialize_start:CoreML.Specification.PrecisionRecallCurve) + ::google::protobuf::uint32 cached_has_bits = 0; + (void) cached_has_bits; + + // .CoreML.Specification.FloatVector precisionValues = 1; + if (this->has_precisionvalues()) { + ::google::protobuf::internal::WireFormatLite::WriteMessage( + 1, *this->precisionvalues_, output); + } + + // .CoreML.Specification.FloatVector precisionConfidenceThresholds = 2; + if (this->has_precisionconfidencethresholds()) { + ::google::protobuf::internal::WireFormatLite::WriteMessage( + 2, *this->precisionconfidencethresholds_, output); + } + + // .CoreML.Specification.FloatVector recallValues = 3; + if (this->has_recallvalues()) { + ::google::protobuf::internal::WireFormatLite::WriteMessage( + 3, *this->recallvalues_, output); + } + + // .CoreML.Specification.FloatVector recallConfidenceThresholds = 4; + if (this->has_recallconfidencethresholds()) { + ::google::protobuf::internal::WireFormatLite::WriteMessage( + 4, *this->recallconfidencethresholds_, output); + } + + // @@protoc_insertion_point(serialize_end:CoreML.Specification.PrecisionRecallCurve) +} + +size_t PrecisionRecallCurve::ByteSizeLong() const { +// @@protoc_insertion_point(message_byte_size_start:CoreML.Specification.PrecisionRecallCurve) + size_t total_size = 0; + + // .CoreML.Specification.FloatVector precisionValues = 1; + if (this->has_precisionvalues()) { + total_size += 1 + + ::google::protobuf::internal::WireFormatLite::MessageSizeNoVirtual( + *this->precisionvalues_); + } + + // 
.CoreML.Specification.FloatVector precisionConfidenceThresholds = 2; + if (this->has_precisionconfidencethresholds()) { + total_size += 1 + + ::google::protobuf::internal::WireFormatLite::MessageSizeNoVirtual( + *this->precisionconfidencethresholds_); + } + + // .CoreML.Specification.FloatVector recallValues = 3; + if (this->has_recallvalues()) { + total_size += 1 + + ::google::protobuf::internal::WireFormatLite::MessageSizeNoVirtual( + *this->recallvalues_); + } + + // .CoreML.Specification.FloatVector recallConfidenceThresholds = 4; + if (this->has_recallconfidencethresholds()) { + total_size += 1 + + ::google::protobuf::internal::WireFormatLite::MessageSizeNoVirtual( + *this->recallconfidencethresholds_); + } + + int cached_size = ::google::protobuf::internal::ToCachedSize(total_size); + GOOGLE_SAFE_CONCURRENT_WRITES_BEGIN(); + _cached_size_ = cached_size; + GOOGLE_SAFE_CONCURRENT_WRITES_END(); + return total_size; +} + +void PrecisionRecallCurve::CheckTypeAndMergeFrom( + const ::google::protobuf::MessageLite& from) { + MergeFrom(*::google::protobuf::down_cast(&from)); +} + +void PrecisionRecallCurve::MergeFrom(const PrecisionRecallCurve& from) { +// @@protoc_insertion_point(class_specific_merge_from_start:CoreML.Specification.PrecisionRecallCurve) + GOOGLE_DCHECK_NE(&from, this); + _internal_metadata_.MergeFrom(from._internal_metadata_); + ::google::protobuf::uint32 cached_has_bits = 0; + (void) cached_has_bits; + + if (from.has_precisionvalues()) { + mutable_precisionvalues()->::CoreML::Specification::FloatVector::MergeFrom(from.precisionvalues()); + } + if (from.has_precisionconfidencethresholds()) { + mutable_precisionconfidencethresholds()->::CoreML::Specification::FloatVector::MergeFrom(from.precisionconfidencethresholds()); + } + if (from.has_recallvalues()) { + mutable_recallvalues()->::CoreML::Specification::FloatVector::MergeFrom(from.recallvalues()); + } + if (from.has_recallconfidencethresholds()) { + mutable_recallconfidencethresholds()->::CoreML::Specification::FloatVector::MergeFrom(from.recallconfidencethresholds()); + } +} + +void PrecisionRecallCurve::CopyFrom(const PrecisionRecallCurve& from) { +// @@protoc_insertion_point(class_specific_copy_from_start:CoreML.Specification.PrecisionRecallCurve) + if (&from == this) return; + Clear(); + MergeFrom(from); +} + +bool PrecisionRecallCurve::IsInitialized() const { + return true; +} + +void PrecisionRecallCurve::Swap(PrecisionRecallCurve* other) { + if (other == this) return; + InternalSwap(other); +} +void PrecisionRecallCurve::InternalSwap(PrecisionRecallCurve* other) { + std::swap(precisionvalues_, other->precisionvalues_); + std::swap(precisionconfidencethresholds_, other->precisionconfidencethresholds_); + std::swap(recallvalues_, other->recallvalues_); + std::swap(recallconfidencethresholds_, other->recallconfidencethresholds_); + std::swap(_cached_size_, other->_cached_size_); +} + +::std::string PrecisionRecallCurve::GetTypeName() const { + return "CoreML.Specification.PrecisionRecallCurve"; +} + +#if PROTOBUF_INLINE_NOT_IN_HEADERS +// PrecisionRecallCurve + +// .CoreML.Specification.FloatVector precisionValues = 1; +bool PrecisionRecallCurve::has_precisionvalues() const { + return this != internal_default_instance() && precisionvalues_ != NULL; +} +void PrecisionRecallCurve::clear_precisionvalues() { + if (GetArenaNoVirtual() == NULL && precisionvalues_ != NULL) delete precisionvalues_; + precisionvalues_ = NULL; +} +const ::CoreML::Specification::FloatVector& PrecisionRecallCurve::precisionvalues() const { + // 
@@protoc_insertion_point(field_get:CoreML.Specification.PrecisionRecallCurve.precisionValues) + return precisionvalues_ != NULL ? *precisionvalues_ + : *::CoreML::Specification::FloatVector::internal_default_instance(); +} +::CoreML::Specification::FloatVector* PrecisionRecallCurve::mutable_precisionvalues() { + + if (precisionvalues_ == NULL) { + precisionvalues_ = new ::CoreML::Specification::FloatVector; + } + // @@protoc_insertion_point(field_mutable:CoreML.Specification.PrecisionRecallCurve.precisionValues) + return precisionvalues_; +} +::CoreML::Specification::FloatVector* PrecisionRecallCurve::release_precisionvalues() { + // @@protoc_insertion_point(field_release:CoreML.Specification.PrecisionRecallCurve.precisionValues) + + ::CoreML::Specification::FloatVector* temp = precisionvalues_; + precisionvalues_ = NULL; + return temp; +} +void PrecisionRecallCurve::set_allocated_precisionvalues(::CoreML::Specification::FloatVector* precisionvalues) { + delete precisionvalues_; + precisionvalues_ = precisionvalues; + if (precisionvalues) { + + } else { + + } + // @@protoc_insertion_point(field_set_allocated:CoreML.Specification.PrecisionRecallCurve.precisionValues) +} + +// .CoreML.Specification.FloatVector precisionConfidenceThresholds = 2; +bool PrecisionRecallCurve::has_precisionconfidencethresholds() const { + return this != internal_default_instance() && precisionconfidencethresholds_ != NULL; +} +void PrecisionRecallCurve::clear_precisionconfidencethresholds() { + if (GetArenaNoVirtual() == NULL && precisionconfidencethresholds_ != NULL) delete precisionconfidencethresholds_; + precisionconfidencethresholds_ = NULL; +} +const ::CoreML::Specification::FloatVector& PrecisionRecallCurve::precisionconfidencethresholds() const { + // @@protoc_insertion_point(field_get:CoreML.Specification.PrecisionRecallCurve.precisionConfidenceThresholds) + return precisionconfidencethresholds_ != NULL ? 
*precisionconfidencethresholds_ + : *::CoreML::Specification::FloatVector::internal_default_instance(); +} +::CoreML::Specification::FloatVector* PrecisionRecallCurve::mutable_precisionconfidencethresholds() { + + if (precisionconfidencethresholds_ == NULL) { + precisionconfidencethresholds_ = new ::CoreML::Specification::FloatVector; + } + // @@protoc_insertion_point(field_mutable:CoreML.Specification.PrecisionRecallCurve.precisionConfidenceThresholds) + return precisionconfidencethresholds_; +} +::CoreML::Specification::FloatVector* PrecisionRecallCurve::release_precisionconfidencethresholds() { + // @@protoc_insertion_point(field_release:CoreML.Specification.PrecisionRecallCurve.precisionConfidenceThresholds) + + ::CoreML::Specification::FloatVector* temp = precisionconfidencethresholds_; + precisionconfidencethresholds_ = NULL; + return temp; +} +void PrecisionRecallCurve::set_allocated_precisionconfidencethresholds(::CoreML::Specification::FloatVector* precisionconfidencethresholds) { + delete precisionconfidencethresholds_; + precisionconfidencethresholds_ = precisionconfidencethresholds; + if (precisionconfidencethresholds) { + + } else { + + } + // @@protoc_insertion_point(field_set_allocated:CoreML.Specification.PrecisionRecallCurve.precisionConfidenceThresholds) +} + +// .CoreML.Specification.FloatVector recallValues = 3; +bool PrecisionRecallCurve::has_recallvalues() const { + return this != internal_default_instance() && recallvalues_ != NULL; +} +void PrecisionRecallCurve::clear_recallvalues() { + if (GetArenaNoVirtual() == NULL && recallvalues_ != NULL) delete recallvalues_; + recallvalues_ = NULL; +} +const ::CoreML::Specification::FloatVector& PrecisionRecallCurve::recallvalues() const { + // @@protoc_insertion_point(field_get:CoreML.Specification.PrecisionRecallCurve.recallValues) + return recallvalues_ != NULL ? 
*recallvalues_ + : *::CoreML::Specification::FloatVector::internal_default_instance(); +} +::CoreML::Specification::FloatVector* PrecisionRecallCurve::mutable_recallvalues() { + + if (recallvalues_ == NULL) { + recallvalues_ = new ::CoreML::Specification::FloatVector; + } + // @@protoc_insertion_point(field_mutable:CoreML.Specification.PrecisionRecallCurve.recallValues) + return recallvalues_; +} +::CoreML::Specification::FloatVector* PrecisionRecallCurve::release_recallvalues() { + // @@protoc_insertion_point(field_release:CoreML.Specification.PrecisionRecallCurve.recallValues) + + ::CoreML::Specification::FloatVector* temp = recallvalues_; + recallvalues_ = NULL; + return temp; +} +void PrecisionRecallCurve::set_allocated_recallvalues(::CoreML::Specification::FloatVector* recallvalues) { + delete recallvalues_; + recallvalues_ = recallvalues; + if (recallvalues) { + + } else { + + } + // @@protoc_insertion_point(field_set_allocated:CoreML.Specification.PrecisionRecallCurve.recallValues) +} + +// .CoreML.Specification.FloatVector recallConfidenceThresholds = 4; +bool PrecisionRecallCurve::has_recallconfidencethresholds() const { + return this != internal_default_instance() && recallconfidencethresholds_ != NULL; +} +void PrecisionRecallCurve::clear_recallconfidencethresholds() { + if (GetArenaNoVirtual() == NULL && recallconfidencethresholds_ != NULL) delete recallconfidencethresholds_; + recallconfidencethresholds_ = NULL; +} +const ::CoreML::Specification::FloatVector& PrecisionRecallCurve::recallconfidencethresholds() const { + // @@protoc_insertion_point(field_get:CoreML.Specification.PrecisionRecallCurve.recallConfidenceThresholds) + return recallconfidencethresholds_ != NULL ? *recallconfidencethresholds_ + : *::CoreML::Specification::FloatVector::internal_default_instance(); +} +::CoreML::Specification::FloatVector* PrecisionRecallCurve::mutable_recallconfidencethresholds() { + + if (recallconfidencethresholds_ == NULL) { + recallconfidencethresholds_ = new ::CoreML::Specification::FloatVector; + } + // @@protoc_insertion_point(field_mutable:CoreML.Specification.PrecisionRecallCurve.recallConfidenceThresholds) + return recallconfidencethresholds_; +} +::CoreML::Specification::FloatVector* PrecisionRecallCurve::release_recallconfidencethresholds() { + // @@protoc_insertion_point(field_release:CoreML.Specification.PrecisionRecallCurve.recallConfidenceThresholds) + + ::CoreML::Specification::FloatVector* temp = recallconfidencethresholds_; + recallconfidencethresholds_ = NULL; + return temp; +} +void PrecisionRecallCurve::set_allocated_recallconfidencethresholds(::CoreML::Specification::FloatVector* recallconfidencethresholds) { + delete recallconfidencethresholds_; + recallconfidencethresholds_ = recallconfidencethresholds; + if (recallconfidencethresholds) { + + } else { + + } + // @@protoc_insertion_point(field_set_allocated:CoreML.Specification.PrecisionRecallCurve.recallConfidenceThresholds) +} + +#endif // PROTOBUF_INLINE_NOT_IN_HEADERS + // @@protoc_insertion_point(namespace_scope) } // namespace Specification diff --git a/mlmodel/build/format/DataStructures.pb.h b/mlmodel/build/format/DataStructures.pb.h index 525bd0497..64a835f61 100644 --- a/mlmodel/build/format/DataStructures.pb.h +++ b/mlmodel/build/format/DataStructures.pb.h @@ -100,6 +100,9 @@ extern Int64ToStringMap_MapEntryDefaultTypeInternal _Int64ToStringMap_MapEntry_d class Int64Vector; class Int64VectorDefaultTypeInternal; extern Int64VectorDefaultTypeInternal _Int64Vector_default_instance_; +class 
PrecisionRecallCurve; +class PrecisionRecallCurveDefaultTypeInternal; +extern PrecisionRecallCurveDefaultTypeInternal _PrecisionRecallCurve_default_instance_; class SequenceFeatureType; class SequenceFeatureTypeDefaultTypeInternal; extern SequenceFeatureTypeDefaultTypeInternal _SequenceFeatureType_default_instance_; @@ -1140,6 +1143,117 @@ class DoubleRange : public ::google::protobuf::MessageLite /* @@protoc_insertion mutable int _cached_size_; friend struct protobuf_DataStructures_2eproto::TableStruct; }; +// ------------------------------------------------------------------- + +class PrecisionRecallCurve : public ::google::protobuf::MessageLite /* @@protoc_insertion_point(class_definition:CoreML.Specification.PrecisionRecallCurve) */ { + public: + PrecisionRecallCurve(); + virtual ~PrecisionRecallCurve(); + + PrecisionRecallCurve(const PrecisionRecallCurve& from); + + inline PrecisionRecallCurve& operator=(const PrecisionRecallCurve& from) { + CopyFrom(from); + return *this; + } + + static const PrecisionRecallCurve& default_instance(); + + static inline const PrecisionRecallCurve* internal_default_instance() { + return reinterpret_cast( + &_PrecisionRecallCurve_default_instance_); + } + static PROTOBUF_CONSTEXPR int const kIndexInFileMessages = + 15; + + void Swap(PrecisionRecallCurve* other); + + // implements Message ---------------------------------------------- + + inline PrecisionRecallCurve* New() const PROTOBUF_FINAL { return New(NULL); } + + PrecisionRecallCurve* New(::google::protobuf::Arena* arena) const PROTOBUF_FINAL; + void CheckTypeAndMergeFrom(const ::google::protobuf::MessageLite& from) + PROTOBUF_FINAL; + void CopyFrom(const PrecisionRecallCurve& from); + void MergeFrom(const PrecisionRecallCurve& from); + void Clear() PROTOBUF_FINAL; + bool IsInitialized() const PROTOBUF_FINAL; + + size_t ByteSizeLong() const PROTOBUF_FINAL; + bool MergePartialFromCodedStream( + ::google::protobuf::io::CodedInputStream* input) PROTOBUF_FINAL; + void SerializeWithCachedSizes( + ::google::protobuf::io::CodedOutputStream* output) const PROTOBUF_FINAL; + void DiscardUnknownFields(); + int GetCachedSize() const PROTOBUF_FINAL { return _cached_size_; } + private: + void SharedCtor(); + void SharedDtor(); + void SetCachedSize(int size) const; + void InternalSwap(PrecisionRecallCurve* other); + private: + inline ::google::protobuf::Arena* GetArenaNoVirtual() const { + return NULL; + } + inline void* MaybeArenaPtr() const { + return NULL; + } + public: + + ::std::string GetTypeName() const PROTOBUF_FINAL; + + // nested types ---------------------------------------------------- + + // accessors ------------------------------------------------------- + + // .CoreML.Specification.FloatVector precisionValues = 1; + bool has_precisionvalues() const; + void clear_precisionvalues(); + static const int kPrecisionValuesFieldNumber = 1; + const ::CoreML::Specification::FloatVector& precisionvalues() const; + ::CoreML::Specification::FloatVector* mutable_precisionvalues(); + ::CoreML::Specification::FloatVector* release_precisionvalues(); + void set_allocated_precisionvalues(::CoreML::Specification::FloatVector* precisionvalues); + + // .CoreML.Specification.FloatVector precisionConfidenceThresholds = 2; + bool has_precisionconfidencethresholds() const; + void clear_precisionconfidencethresholds(); + static const int kPrecisionConfidenceThresholdsFieldNumber = 2; + const ::CoreML::Specification::FloatVector& precisionconfidencethresholds() const; + ::CoreML::Specification::FloatVector* 
mutable_precisionconfidencethresholds(); + ::CoreML::Specification::FloatVector* release_precisionconfidencethresholds(); + void set_allocated_precisionconfidencethresholds(::CoreML::Specification::FloatVector* precisionconfidencethresholds); + + // .CoreML.Specification.FloatVector recallValues = 3; + bool has_recallvalues() const; + void clear_recallvalues(); + static const int kRecallValuesFieldNumber = 3; + const ::CoreML::Specification::FloatVector& recallvalues() const; + ::CoreML::Specification::FloatVector* mutable_recallvalues(); + ::CoreML::Specification::FloatVector* release_recallvalues(); + void set_allocated_recallvalues(::CoreML::Specification::FloatVector* recallvalues); + + // .CoreML.Specification.FloatVector recallConfidenceThresholds = 4; + bool has_recallconfidencethresholds() const; + void clear_recallconfidencethresholds(); + static const int kRecallConfidenceThresholdsFieldNumber = 4; + const ::CoreML::Specification::FloatVector& recallconfidencethresholds() const; + ::CoreML::Specification::FloatVector* mutable_recallconfidencethresholds(); + ::CoreML::Specification::FloatVector* release_recallconfidencethresholds(); + void set_allocated_recallconfidencethresholds(::CoreML::Specification::FloatVector* recallconfidencethresholds); + + // @@protoc_insertion_point(class_scope:CoreML.Specification.PrecisionRecallCurve) + private: + + ::google::protobuf::internal::InternalMetadataWithArenaLite _internal_metadata_; + ::CoreML::Specification::FloatVector* precisionvalues_; + ::CoreML::Specification::FloatVector* precisionconfidencethresholds_; + ::CoreML::Specification::FloatVector* recallvalues_; + ::CoreML::Specification::FloatVector* recallconfidencethresholds_; + mutable int _cached_size_; + friend struct protobuf_DataStructures_2eproto::TableStruct; +}; // =================================================================== @@ -1513,6 +1627,166 @@ inline void DoubleRange::set_maxvalue(double value) { // @@protoc_insertion_point(field_set:CoreML.Specification.DoubleRange.maxValue) } +// ------------------------------------------------------------------- + +// PrecisionRecallCurve + +// .CoreML.Specification.FloatVector precisionValues = 1; +inline bool PrecisionRecallCurve::has_precisionvalues() const { + return this != internal_default_instance() && precisionvalues_ != NULL; +} +inline void PrecisionRecallCurve::clear_precisionvalues() { + if (GetArenaNoVirtual() == NULL && precisionvalues_ != NULL) delete precisionvalues_; + precisionvalues_ = NULL; +} +inline const ::CoreML::Specification::FloatVector& PrecisionRecallCurve::precisionvalues() const { + // @@protoc_insertion_point(field_get:CoreML.Specification.PrecisionRecallCurve.precisionValues) + return precisionvalues_ != NULL ? 
*precisionvalues_ + : *::CoreML::Specification::FloatVector::internal_default_instance(); +} +inline ::CoreML::Specification::FloatVector* PrecisionRecallCurve::mutable_precisionvalues() { + + if (precisionvalues_ == NULL) { + precisionvalues_ = new ::CoreML::Specification::FloatVector; + } + // @@protoc_insertion_point(field_mutable:CoreML.Specification.PrecisionRecallCurve.precisionValues) + return precisionvalues_; +} +inline ::CoreML::Specification::FloatVector* PrecisionRecallCurve::release_precisionvalues() { + // @@protoc_insertion_point(field_release:CoreML.Specification.PrecisionRecallCurve.precisionValues) + + ::CoreML::Specification::FloatVector* temp = precisionvalues_; + precisionvalues_ = NULL; + return temp; +} +inline void PrecisionRecallCurve::set_allocated_precisionvalues(::CoreML::Specification::FloatVector* precisionvalues) { + delete precisionvalues_; + precisionvalues_ = precisionvalues; + if (precisionvalues) { + + } else { + + } + // @@protoc_insertion_point(field_set_allocated:CoreML.Specification.PrecisionRecallCurve.precisionValues) +} + +// .CoreML.Specification.FloatVector precisionConfidenceThresholds = 2; +inline bool PrecisionRecallCurve::has_precisionconfidencethresholds() const { + return this != internal_default_instance() && precisionconfidencethresholds_ != NULL; +} +inline void PrecisionRecallCurve::clear_precisionconfidencethresholds() { + if (GetArenaNoVirtual() == NULL && precisionconfidencethresholds_ != NULL) delete precisionconfidencethresholds_; + precisionconfidencethresholds_ = NULL; +} +inline const ::CoreML::Specification::FloatVector& PrecisionRecallCurve::precisionconfidencethresholds() const { + // @@protoc_insertion_point(field_get:CoreML.Specification.PrecisionRecallCurve.precisionConfidenceThresholds) + return precisionconfidencethresholds_ != NULL ? 
*precisionconfidencethresholds_ + : *::CoreML::Specification::FloatVector::internal_default_instance(); +} +inline ::CoreML::Specification::FloatVector* PrecisionRecallCurve::mutable_precisionconfidencethresholds() { + + if (precisionconfidencethresholds_ == NULL) { + precisionconfidencethresholds_ = new ::CoreML::Specification::FloatVector; + } + // @@protoc_insertion_point(field_mutable:CoreML.Specification.PrecisionRecallCurve.precisionConfidenceThresholds) + return precisionconfidencethresholds_; +} +inline ::CoreML::Specification::FloatVector* PrecisionRecallCurve::release_precisionconfidencethresholds() { + // @@protoc_insertion_point(field_release:CoreML.Specification.PrecisionRecallCurve.precisionConfidenceThresholds) + + ::CoreML::Specification::FloatVector* temp = precisionconfidencethresholds_; + precisionconfidencethresholds_ = NULL; + return temp; +} +inline void PrecisionRecallCurve::set_allocated_precisionconfidencethresholds(::CoreML::Specification::FloatVector* precisionconfidencethresholds) { + delete precisionconfidencethresholds_; + precisionconfidencethresholds_ = precisionconfidencethresholds; + if (precisionconfidencethresholds) { + + } else { + + } + // @@protoc_insertion_point(field_set_allocated:CoreML.Specification.PrecisionRecallCurve.precisionConfidenceThresholds) +} + +// .CoreML.Specification.FloatVector recallValues = 3; +inline bool PrecisionRecallCurve::has_recallvalues() const { + return this != internal_default_instance() && recallvalues_ != NULL; +} +inline void PrecisionRecallCurve::clear_recallvalues() { + if (GetArenaNoVirtual() == NULL && recallvalues_ != NULL) delete recallvalues_; + recallvalues_ = NULL; +} +inline const ::CoreML::Specification::FloatVector& PrecisionRecallCurve::recallvalues() const { + // @@protoc_insertion_point(field_get:CoreML.Specification.PrecisionRecallCurve.recallValues) + return recallvalues_ != NULL ? 
*recallvalues_ + : *::CoreML::Specification::FloatVector::internal_default_instance(); +} +inline ::CoreML::Specification::FloatVector* PrecisionRecallCurve::mutable_recallvalues() { + + if (recallvalues_ == NULL) { + recallvalues_ = new ::CoreML::Specification::FloatVector; + } + // @@protoc_insertion_point(field_mutable:CoreML.Specification.PrecisionRecallCurve.recallValues) + return recallvalues_; +} +inline ::CoreML::Specification::FloatVector* PrecisionRecallCurve::release_recallvalues() { + // @@protoc_insertion_point(field_release:CoreML.Specification.PrecisionRecallCurve.recallValues) + + ::CoreML::Specification::FloatVector* temp = recallvalues_; + recallvalues_ = NULL; + return temp; +} +inline void PrecisionRecallCurve::set_allocated_recallvalues(::CoreML::Specification::FloatVector* recallvalues) { + delete recallvalues_; + recallvalues_ = recallvalues; + if (recallvalues) { + + } else { + + } + // @@protoc_insertion_point(field_set_allocated:CoreML.Specification.PrecisionRecallCurve.recallValues) +} + +// .CoreML.Specification.FloatVector recallConfidenceThresholds = 4; +inline bool PrecisionRecallCurve::has_recallconfidencethresholds() const { + return this != internal_default_instance() && recallconfidencethresholds_ != NULL; +} +inline void PrecisionRecallCurve::clear_recallconfidencethresholds() { + if (GetArenaNoVirtual() == NULL && recallconfidencethresholds_ != NULL) delete recallconfidencethresholds_; + recallconfidencethresholds_ = NULL; +} +inline const ::CoreML::Specification::FloatVector& PrecisionRecallCurve::recallconfidencethresholds() const { + // @@protoc_insertion_point(field_get:CoreML.Specification.PrecisionRecallCurve.recallConfidenceThresholds) + return recallconfidencethresholds_ != NULL ? *recallconfidencethresholds_ + : *::CoreML::Specification::FloatVector::internal_default_instance(); +} +inline ::CoreML::Specification::FloatVector* PrecisionRecallCurve::mutable_recallconfidencethresholds() { + + if (recallconfidencethresholds_ == NULL) { + recallconfidencethresholds_ = new ::CoreML::Specification::FloatVector; + } + // @@protoc_insertion_point(field_mutable:CoreML.Specification.PrecisionRecallCurve.recallConfidenceThresholds) + return recallconfidencethresholds_; +} +inline ::CoreML::Specification::FloatVector* PrecisionRecallCurve::release_recallconfidencethresholds() { + // @@protoc_insertion_point(field_release:CoreML.Specification.PrecisionRecallCurve.recallConfidenceThresholds) + + ::CoreML::Specification::FloatVector* temp = recallconfidencethresholds_; + recallconfidencethresholds_ = NULL; + return temp; +} +inline void PrecisionRecallCurve::set_allocated_recallconfidencethresholds(::CoreML::Specification::FloatVector* recallconfidencethresholds) { + delete recallconfidencethresholds_; + recallconfidencethresholds_ = recallconfidencethresholds; + if (recallconfidencethresholds) { + + } else { + + } + // @@protoc_insertion_point(field_set_allocated:CoreML.Specification.PrecisionRecallCurve.recallConfidenceThresholds) +} + #endif // !PROTOBUF_INLINE_NOT_IN_HEADERS // ------------------------------------------------------------------- @@ -1542,6 +1816,8 @@ inline void DoubleRange::set_maxvalue(double value) { // ------------------------------------------------------------------- +// ------------------------------------------------------------------- + // @@protoc_insertion_point(namespace_scope) diff --git a/mlmodel/build/format/DictVectorizer.pb.h b/mlmodel/build/format/DictVectorizer.pb.h index 34de63b93..49db4e424 100644 --- 
a/mlmodel/build/format/DictVectorizer.pb.h +++ b/mlmodel/build/format/DictVectorizer.pb.h @@ -101,6 +101,9 @@ extern Int64ToStringMap_MapEntryDefaultTypeInternal _Int64ToStringMap_MapEntry_d class Int64Vector; class Int64VectorDefaultTypeInternal; extern Int64VectorDefaultTypeInternal _Int64Vector_default_instance_; +class PrecisionRecallCurve; +class PrecisionRecallCurveDefaultTypeInternal; +extern PrecisionRecallCurveDefaultTypeInternal _PrecisionRecallCurve_default_instance_; class SequenceFeatureType; class SequenceFeatureTypeDefaultTypeInternal; extern SequenceFeatureTypeDefaultTypeInternal _SequenceFeatureType_default_instance_; diff --git a/mlmodel/build/format/GLMClassifier.pb.h b/mlmodel/build/format/GLMClassifier.pb.h index bd16537c0..b5703a55f 100644 --- a/mlmodel/build/format/GLMClassifier.pb.h +++ b/mlmodel/build/format/GLMClassifier.pb.h @@ -105,6 +105,9 @@ extern Int64ToStringMap_MapEntryDefaultTypeInternal _Int64ToStringMap_MapEntry_d class Int64Vector; class Int64VectorDefaultTypeInternal; extern Int64VectorDefaultTypeInternal _Int64Vector_default_instance_; +class PrecisionRecallCurve; +class PrecisionRecallCurveDefaultTypeInternal; +extern PrecisionRecallCurveDefaultTypeInternal _PrecisionRecallCurve_default_instance_; class SequenceFeatureType; class SequenceFeatureTypeDefaultTypeInternal; extern SequenceFeatureTypeDefaultTypeInternal _SequenceFeatureType_default_instance_; diff --git a/mlmodel/build/format/Gazetteer.pb.h b/mlmodel/build/format/Gazetteer.pb.h index d534c159c..a14305271 100644 --- a/mlmodel/build/format/Gazetteer.pb.h +++ b/mlmodel/build/format/Gazetteer.pb.h @@ -98,6 +98,9 @@ extern Int64ToStringMap_MapEntryDefaultTypeInternal _Int64ToStringMap_MapEntry_d class Int64Vector; class Int64VectorDefaultTypeInternal; extern Int64VectorDefaultTypeInternal _Int64Vector_default_instance_; +class PrecisionRecallCurve; +class PrecisionRecallCurveDefaultTypeInternal; +extern PrecisionRecallCurveDefaultTypeInternal _PrecisionRecallCurve_default_instance_; class SequenceFeatureType; class SequenceFeatureTypeDefaultTypeInternal; extern SequenceFeatureTypeDefaultTypeInternal _SequenceFeatureType_default_instance_; diff --git a/mlmodel/build/format/Imputer.pb.h b/mlmodel/build/format/Imputer.pb.h index 19a45cc51..256ef8e26 100644 --- a/mlmodel/build/format/Imputer.pb.h +++ b/mlmodel/build/format/Imputer.pb.h @@ -101,6 +101,9 @@ extern Int64ToStringMap_MapEntryDefaultTypeInternal _Int64ToStringMap_MapEntry_d class Int64Vector; class Int64VectorDefaultTypeInternal; extern Int64VectorDefaultTypeInternal _Int64Vector_default_instance_; +class PrecisionRecallCurve; +class PrecisionRecallCurveDefaultTypeInternal; +extern PrecisionRecallCurveDefaultTypeInternal _PrecisionRecallCurve_default_instance_; class SequenceFeatureType; class SequenceFeatureTypeDefaultTypeInternal; extern SequenceFeatureTypeDefaultTypeInternal _SequenceFeatureType_default_instance_; diff --git a/mlmodel/build/format/ItemSimilarityRecommender.pb.h b/mlmodel/build/format/ItemSimilarityRecommender.pb.h index 0206aa9a4..7a4ff1b73 100644 --- a/mlmodel/build/format/ItemSimilarityRecommender.pb.h +++ b/mlmodel/build/format/ItemSimilarityRecommender.pb.h @@ -107,6 +107,9 @@ extern ItemSimilarityRecommender_ConnectedItemDefaultTypeInternal _ItemSimilarit class ItemSimilarityRecommender_SimilarItems; class ItemSimilarityRecommender_SimilarItemsDefaultTypeInternal; extern ItemSimilarityRecommender_SimilarItemsDefaultTypeInternal _ItemSimilarityRecommender_SimilarItems_default_instance_; +class 
PrecisionRecallCurve; +class PrecisionRecallCurveDefaultTypeInternal; +extern PrecisionRecallCurveDefaultTypeInternal _PrecisionRecallCurve_default_instance_; class SequenceFeatureType; class SequenceFeatureTypeDefaultTypeInternal; extern SequenceFeatureTypeDefaultTypeInternal _SequenceFeatureType_default_instance_; diff --git a/mlmodel/build/format/LinkedModel.pb.h b/mlmodel/build/format/LinkedModel.pb.h index f29bb18bf..7881b618e 100644 --- a/mlmodel/build/format/LinkedModel.pb.h +++ b/mlmodel/build/format/LinkedModel.pb.h @@ -113,6 +113,9 @@ extern LinkedModelDefaultTypeInternal _LinkedModel_default_instance_; class LinkedModelFile; class LinkedModelFileDefaultTypeInternal; extern LinkedModelFileDefaultTypeInternal _LinkedModelFile_default_instance_; +class PrecisionRecallCurve; +class PrecisionRecallCurveDefaultTypeInternal; +extern PrecisionRecallCurveDefaultTypeInternal _PrecisionRecallCurve_default_instance_; class SequenceFeatureType; class SequenceFeatureTypeDefaultTypeInternal; extern SequenceFeatureTypeDefaultTypeInternal _SequenceFeatureType_default_instance_; diff --git a/mlmodel/build/format/Model.pb.cc b/mlmodel/build/format/Model.pb.cc index 74fb6a578..8ab5c09de 100644 --- a/mlmodel/build/format/Model.pb.cc +++ b/mlmodel/build/format/Model.pb.cc @@ -52,6 +52,7 @@ class ModelDefaultTypeInternal : public ::google::protobuf::internal::Explicitly const ::CoreML::Specification::MILSpec::Program* mlprogram_; const ::CoreML::Specification::CustomModel* custommodel_; const ::CoreML::Specification::LinkedModel* linkedmodel_; + const ::CoreML::Specification::ClassConfidenceThresholding* classconfidencethresholding_; const ::CoreML::Specification::OneHotEncoder* onehotencoder_; const ::CoreML::Specification::Imputer* imputer_; const ::CoreML::Specification::FeatureVectorizer* featurevectorizer_; @@ -142,6 +143,7 @@ void TableStruct::InitDefaultsImpl() { ::CoreML::Specification::protobuf_ItemSimilarityRecommender_2eproto::InitDefaults(); ::CoreML::Specification::CoreMLModels::protobuf_SoundAnalysisPreprocessing_2eproto::InitDefaults(); ::CoreML::Specification::protobuf_LinkedModel_2eproto::InitDefaults(); + ::CoreML::Specification::protobuf_ClassConfidenceThresholding_2eproto::InitDefaults(); _Pipeline_default_instance_.DefaultConstruct(); _PipelineClassifier_default_instance_.DefaultConstruct(); _PipelineRegressor_default_instance_.DefaultConstruct(); @@ -201,6 +203,7 @@ void AddDescriptorsImpl() { ::CoreML::Specification::protobuf_ItemSimilarityRecommender_2eproto::AddDescriptors(); ::CoreML::Specification::CoreMLModels::protobuf_SoundAnalysisPreprocessing_2eproto::AddDescriptors(); ::CoreML::Specification::protobuf_LinkedModel_2eproto::AddDescriptors(); + ::CoreML::Specification::protobuf_ClassConfidenceThresholding_2eproto::AddDescriptors(); ::google::protobuf::internal::OnShutdown(&TableStruct::Shutdown); } @@ -3077,6 +3080,7 @@ const int Model::kItemSimilarityRecommenderFieldNumber; const int Model::kMlProgramFieldNumber; const int Model::kCustomModelFieldNumber; const int Model::kLinkedModelFieldNumber; +const int Model::kClassConfidenceThresholdingFieldNumber; const int Model::kOneHotEncoderFieldNumber; const int Model::kImputerFieldNumber; const int Model::kFeatureVectorizerFieldNumber; @@ -3192,6 +3196,10 @@ Model::Model(const Model& from) mutable_linkedmodel()->::CoreML::Specification::LinkedModel::MergeFrom(from.linkedmodel()); break; } + case kClassConfidenceThresholding: { + 
mutable_classconfidencethresholding()->::CoreML::Specification::ClassConfidenceThresholding::MergeFrom(from.classconfidencethresholding()); + break; + } case kOneHotEncoder: { mutable_onehotencoder()->::CoreML::Specification::OneHotEncoder::MergeFrom(from.onehotencoder()); break; @@ -3385,6 +3393,10 @@ void Model::clear_Type() { delete Type_.linkedmodel_; break; } + case kClassConfidenceThresholding: { + delete Type_.classconfidencethresholding_; + break; + } case kOneHotEncoder: { delete Type_.onehotencoder_; break; @@ -3742,6 +3754,18 @@ bool Model::MergePartialFromCodedStream( break; } + // .CoreML.Specification.ClassConfidenceThresholding classConfidenceThresholding = 560; + case 560: { + if (static_cast< ::google::protobuf::uint8>(tag) == + static_cast< ::google::protobuf::uint8>(4482u)) { + DO_(::google::protobuf::internal::WireFormatLite::ReadMessageNoVirtual( + input, mutable_classconfidencethresholding())); + } else { + goto handle_unusual; + } + break; + } + // .CoreML.Specification.OneHotEncoder oneHotEncoder = 600; case 600: { if (static_cast< ::google::protobuf::uint8>(tag) == @@ -4109,6 +4133,12 @@ void Model::SerializeWithCachedSizes( 556, *Type_.linkedmodel_, output); } + // .CoreML.Specification.ClassConfidenceThresholding classConfidenceThresholding = 560; + if (has_classconfidencethresholding()) { + ::google::protobuf::internal::WireFormatLite::WriteMessage( + 560, *Type_.classconfidencethresholding_, output); + } + // .CoreML.Specification.OneHotEncoder oneHotEncoder = 600; if (has_onehotencoder()) { ::google::protobuf::internal::WireFormatLite::WriteMessage( @@ -4370,6 +4400,13 @@ size_t Model::ByteSizeLong() const { *Type_.linkedmodel_); break; } + // .CoreML.Specification.ClassConfidenceThresholding classConfidenceThresholding = 560; + case kClassConfidenceThresholding: { + total_size += 2 + + ::google::protobuf::internal::WireFormatLite::MessageSizeNoVirtual( + *Type_.classconfidencethresholding_); + break; + } // .CoreML.Specification.OneHotEncoder oneHotEncoder = 600; case kOneHotEncoder: { total_size += 2 + @@ -4601,6 +4638,10 @@ void Model::MergeFrom(const Model& from) { mutable_linkedmodel()->::CoreML::Specification::LinkedModel::MergeFrom(from.linkedmodel()); break; } + case kClassConfidenceThresholding: { + mutable_classconfidencethresholding()->::CoreML::Specification::ClassConfidenceThresholding::MergeFrom(from.classconfidencethresholding()); + break; + } case kOneHotEncoder: { mutable_onehotencoder()->::CoreML::Specification::OneHotEncoder::MergeFrom(from.onehotencoder()); break; @@ -5641,6 +5682,54 @@ void Model::set_allocated_linkedmodel(::CoreML::Specification::LinkedModel* link // @@protoc_insertion_point(field_set_allocated:CoreML.Specification.Model.linkedModel) } +// .CoreML.Specification.ClassConfidenceThresholding classConfidenceThresholding = 560; +bool Model::has_classconfidencethresholding() const { + return Type_case() == kClassConfidenceThresholding; +} +void Model::set_has_classconfidencethresholding() { + _oneof_case_[0] = kClassConfidenceThresholding; +} +void Model::clear_classconfidencethresholding() { + if (has_classconfidencethresholding()) { + delete Type_.classconfidencethresholding_; + clear_has_Type(); + } +} + const ::CoreML::Specification::ClassConfidenceThresholding& Model::classconfidencethresholding() const { + // @@protoc_insertion_point(field_get:CoreML.Specification.Model.classConfidenceThresholding) + return has_classconfidencethresholding() + ? 
*Type_.classconfidencethresholding_ + : ::CoreML::Specification::ClassConfidenceThresholding::default_instance(); +} +::CoreML::Specification::ClassConfidenceThresholding* Model::mutable_classconfidencethresholding() { + if (!has_classconfidencethresholding()) { + clear_Type(); + set_has_classconfidencethresholding(); + Type_.classconfidencethresholding_ = new ::CoreML::Specification::ClassConfidenceThresholding; + } + // @@protoc_insertion_point(field_mutable:CoreML.Specification.Model.classConfidenceThresholding) + return Type_.classconfidencethresholding_; +} +::CoreML::Specification::ClassConfidenceThresholding* Model::release_classconfidencethresholding() { + // @@protoc_insertion_point(field_release:CoreML.Specification.Model.classConfidenceThresholding) + if (has_classconfidencethresholding()) { + clear_has_Type(); + ::CoreML::Specification::ClassConfidenceThresholding* temp = Type_.classconfidencethresholding_; + Type_.classconfidencethresholding_ = NULL; + return temp; + } else { + return NULL; + } +} +void Model::set_allocated_classconfidencethresholding(::CoreML::Specification::ClassConfidenceThresholding* classconfidencethresholding) { + clear_Type(); + if (classconfidencethresholding) { + set_has_classconfidencethresholding(); + Type_.classconfidencethresholding_ = classconfidencethresholding; + } + // @@protoc_insertion_point(field_set_allocated:CoreML.Specification.Model.classConfidenceThresholding) +} + // .CoreML.Specification.OneHotEncoder oneHotEncoder = 600; bool Model::has_onehotencoder() const { return Type_case() == kOneHotEncoder; diff --git a/mlmodel/build/format/Model.pb.h b/mlmodel/build/format/Model.pb.h index 83ed72e36..58895f832 100644 --- a/mlmodel/build/format/Model.pb.h +++ b/mlmodel/build/format/Model.pb.h @@ -60,6 +60,7 @@ #include "ItemSimilarityRecommender.pb.h" // IWYU pragma: export #include "SoundAnalysisPreprocessing.pb.h" // IWYU pragma: export #include "LinkedModel.pb.h" // IWYU pragma: export +#include "ClassConfidenceThresholding.pb.h" // IWYU pragma: export // @@protoc_insertion_point(includes) namespace CoreML { namespace Specification { @@ -222,6 +223,9 @@ extern CeilLayerParamsDefaultTypeInternal _CeilLayerParams_default_instance_; class ClampedReLULayerParams; class ClampedReLULayerParamsDefaultTypeInternal; extern ClampedReLULayerParamsDefaultTypeInternal _ClampedReLULayerParams_default_instance_; +class ClassConfidenceThresholding; +class ClassConfidenceThresholdingDefaultTypeInternal; +extern ClassConfidenceThresholdingDefaultTypeInternal _ClassConfidenceThresholding_default_instance_; class ClipLayerParams; class ClipLayerParamsDefaultTypeInternal; extern ClipLayerParamsDefaultTypeInternal _ClipLayerParams_default_instance_; @@ -672,6 +676,9 @@ extern PoolingLayerParams_ValidCompletePaddingDefaultTypeInternal _PoolingLayerP class PowBroadcastableLayerParams; class PowBroadcastableLayerParamsDefaultTypeInternal; extern PowBroadcastableLayerParamsDefaultTypeInternal _PowBroadcastableLayerParams_default_instance_; +class PrecisionRecallCurve; +class PrecisionRecallCurveDefaultTypeInternal; +extern PrecisionRecallCurveDefaultTypeInternal _PrecisionRecallCurve_default_instance_; class QuantizationParams; class QuantizationParamsDefaultTypeInternal; extern QuantizationParamsDefaultTypeInternal _QuantizationParams_default_instance_; @@ -1958,6 +1965,7 @@ class Model : public ::google::protobuf::MessageLite /* @@protoc_insertion_point kMlProgram = 502, kCustomModel = 555, kLinkedModel = 556, + kClassConfidenceThresholding = 560, kOneHotEncoder 
= 600, kImputer = 601, kFeatureVectorizer = 602, @@ -2210,6 +2218,15 @@ class Model : public ::google::protobuf::MessageLite /* @@protoc_insertion_point ::CoreML::Specification::LinkedModel* release_linkedmodel(); void set_allocated_linkedmodel(::CoreML::Specification::LinkedModel* linkedmodel); + // .CoreML.Specification.ClassConfidenceThresholding classConfidenceThresholding = 560; + bool has_classconfidencethresholding() const; + void clear_classconfidencethresholding(); + static const int kClassConfidenceThresholdingFieldNumber = 560; + const ::CoreML::Specification::ClassConfidenceThresholding& classconfidencethresholding() const; + ::CoreML::Specification::ClassConfidenceThresholding* mutable_classconfidencethresholding(); + ::CoreML::Specification::ClassConfidenceThresholding* release_classconfidencethresholding(); + void set_allocated_classconfidencethresholding(::CoreML::Specification::ClassConfidenceThresholding* classconfidencethresholding); + // .CoreML.Specification.OneHotEncoder oneHotEncoder = 600; bool has_onehotencoder() const; void clear_onehotencoder(); @@ -2393,6 +2410,7 @@ class Model : public ::google::protobuf::MessageLite /* @@protoc_insertion_point void set_has_mlprogram(); void set_has_custommodel(); void set_has_linkedmodel(); + void set_has_classconfidencethresholding(); void set_has_onehotencoder(); void set_has_imputer(); void set_has_featurevectorizer(); @@ -2440,6 +2458,7 @@ class Model : public ::google::protobuf::MessageLite /* @@protoc_insertion_point ::CoreML::Specification::MILSpec::Program* mlprogram_; ::CoreML::Specification::CustomModel* custommodel_; ::CoreML::Specification::LinkedModel* linkedmodel_; + ::CoreML::Specification::ClassConfidenceThresholding* classconfidencethresholding_; ::CoreML::Specification::OneHotEncoder* onehotencoder_; ::CoreML::Specification::Imputer* imputer_; ::CoreML::Specification::FeatureVectorizer* featurevectorizer_; @@ -4326,6 +4345,54 @@ inline void Model::set_allocated_linkedmodel(::CoreML::Specification::LinkedMode // @@protoc_insertion_point(field_set_allocated:CoreML.Specification.Model.linkedModel) } +// .CoreML.Specification.ClassConfidenceThresholding classConfidenceThresholding = 560; +inline bool Model::has_classconfidencethresholding() const { + return Type_case() == kClassConfidenceThresholding; +} +inline void Model::set_has_classconfidencethresholding() { + _oneof_case_[0] = kClassConfidenceThresholding; +} +inline void Model::clear_classconfidencethresholding() { + if (has_classconfidencethresholding()) { + delete Type_.classconfidencethresholding_; + clear_has_Type(); + } +} +inline const ::CoreML::Specification::ClassConfidenceThresholding& Model::classconfidencethresholding() const { + // @@protoc_insertion_point(field_get:CoreML.Specification.Model.classConfidenceThresholding) + return has_classconfidencethresholding() + ? 
*Type_.classconfidencethresholding_ + : ::CoreML::Specification::ClassConfidenceThresholding::default_instance(); +} +inline ::CoreML::Specification::ClassConfidenceThresholding* Model::mutable_classconfidencethresholding() { + if (!has_classconfidencethresholding()) { + clear_Type(); + set_has_classconfidencethresholding(); + Type_.classconfidencethresholding_ = new ::CoreML::Specification::ClassConfidenceThresholding; + } + // @@protoc_insertion_point(field_mutable:CoreML.Specification.Model.classConfidenceThresholding) + return Type_.classconfidencethresholding_; +} +inline ::CoreML::Specification::ClassConfidenceThresholding* Model::release_classconfidencethresholding() { + // @@protoc_insertion_point(field_release:CoreML.Specification.Model.classConfidenceThresholding) + if (has_classconfidencethresholding()) { + clear_has_Type(); + ::CoreML::Specification::ClassConfidenceThresholding* temp = Type_.classconfidencethresholding_; + Type_.classconfidencethresholding_ = NULL; + return temp; + } else { + return NULL; + } +} +inline void Model::set_allocated_classconfidencethresholding(::CoreML::Specification::ClassConfidenceThresholding* classconfidencethresholding) { + clear_Type(); + if (classconfidencethresholding) { + set_has_classconfidencethresholding(); + Type_.classconfidencethresholding_ = classconfidencethresholding; + } + // @@protoc_insertion_point(field_set_allocated:CoreML.Specification.Model.classConfidenceThresholding) +} + // .CoreML.Specification.OneHotEncoder oneHotEncoder = 600; inline bool Model::has_onehotencoder() const { return Type_case() == kOneHotEncoder; diff --git a/mlmodel/build/format/Model_enums.h b/mlmodel/build/format/Model_enums.h index 42bf19dd3..3fd7c7c49 100644 --- a/mlmodel/build/format/Model_enums.h +++ b/mlmodel/build/format/Model_enums.h @@ -19,6 +19,7 @@ enum MLModelType: int { MLModelType_mlProgram = 502, MLModelType_customModel = 555, MLModelType_linkedModel = 556, + MLModelType_classConfidenceThresholding = 560, MLModelType_oneHotEncoder = 600, MLModelType_imputer = 601, MLModelType_featureVectorizer = 602, @@ -79,6 +80,8 @@ static const char * MLModelType_Name(MLModelType x) { return "MLModelType_customModel"; case MLModelType_linkedModel: return "MLModelType_linkedModel"; + case MLModelType_classConfidenceThresholding: + return "MLModelType_classConfidenceThresholding"; case MLModelType_oneHotEncoder: return "MLModelType_oneHotEncoder"; case MLModelType_imputer: diff --git a/mlmodel/build/format/NearestNeighbors.pb.h b/mlmodel/build/format/NearestNeighbors.pb.h index 752cce679..b0990b8f3 100644 --- a/mlmodel/build/format/NearestNeighbors.pb.h +++ b/mlmodel/build/format/NearestNeighbors.pb.h @@ -120,6 +120,9 @@ extern LinearIndexDefaultTypeInternal _LinearIndex_default_instance_; class NearestNeighborsIndex; class NearestNeighborsIndexDefaultTypeInternal; extern NearestNeighborsIndexDefaultTypeInternal _NearestNeighborsIndex_default_instance_; +class PrecisionRecallCurve; +class PrecisionRecallCurveDefaultTypeInternal; +extern PrecisionRecallCurveDefaultTypeInternal _PrecisionRecallCurve_default_instance_; class SequenceFeatureType; class SequenceFeatureTypeDefaultTypeInternal; extern SequenceFeatureTypeDefaultTypeInternal _SequenceFeatureType_default_instance_; diff --git a/mlmodel/build/format/NeuralNetwork.pb.h b/mlmodel/build/format/NeuralNetwork.pb.h index d273bb930..fabdf7005 100644 --- a/mlmodel/build/format/NeuralNetwork.pb.h +++ b/mlmodel/build/format/NeuralNetwork.pb.h @@ -510,6 +510,9 @@ extern 
PoolingLayerParams_ValidCompletePaddingDefaultTypeInternal _PoolingLayerP class PowBroadcastableLayerParams; class PowBroadcastableLayerParamsDefaultTypeInternal; extern PowBroadcastableLayerParamsDefaultTypeInternal _PowBroadcastableLayerParams_default_instance_; +class PrecisionRecallCurve; +class PrecisionRecallCurveDefaultTypeInternal; +extern PrecisionRecallCurveDefaultTypeInternal _PrecisionRecallCurve_default_instance_; class QuantizationParams; class QuantizationParamsDefaultTypeInternal; extern QuantizationParamsDefaultTypeInternal _QuantizationParams_default_instance_; diff --git a/mlmodel/build/format/NonMaximumSuppression.pb.h b/mlmodel/build/format/NonMaximumSuppression.pb.h index 5fc5d5015..30b7d5e73 100644 --- a/mlmodel/build/format/NonMaximumSuppression.pb.h +++ b/mlmodel/build/format/NonMaximumSuppression.pb.h @@ -104,6 +104,9 @@ extern NonMaximumSuppressionDefaultTypeInternal _NonMaximumSuppression_default_i class NonMaximumSuppression_PickTop; class NonMaximumSuppression_PickTopDefaultTypeInternal; extern NonMaximumSuppression_PickTopDefaultTypeInternal _NonMaximumSuppression_PickTop_default_instance_; +class PrecisionRecallCurve; +class PrecisionRecallCurveDefaultTypeInternal; +extern PrecisionRecallCurveDefaultTypeInternal _PrecisionRecallCurve_default_instance_; class SequenceFeatureType; class SequenceFeatureTypeDefaultTypeInternal; extern SequenceFeatureTypeDefaultTypeInternal _SequenceFeatureType_default_instance_; diff --git a/mlmodel/build/format/OneHotEncoder.pb.h b/mlmodel/build/format/OneHotEncoder.pb.h index 4e3e09fb9..2986e1748 100644 --- a/mlmodel/build/format/OneHotEncoder.pb.h +++ b/mlmodel/build/format/OneHotEncoder.pb.h @@ -102,6 +102,9 @@ extern Int64VectorDefaultTypeInternal _Int64Vector_default_instance_; class OneHotEncoder; class OneHotEncoderDefaultTypeInternal; extern OneHotEncoderDefaultTypeInternal _OneHotEncoder_default_instance_; +class PrecisionRecallCurve; +class PrecisionRecallCurveDefaultTypeInternal; +extern PrecisionRecallCurveDefaultTypeInternal _PrecisionRecallCurve_default_instance_; class SequenceFeatureType; class SequenceFeatureTypeDefaultTypeInternal; extern SequenceFeatureTypeDefaultTypeInternal _SequenceFeatureType_default_instance_; diff --git a/mlmodel/build/format/Parameters.pb.h b/mlmodel/build/format/Parameters.pb.h index db23ca7af..0e708b014 100644 --- a/mlmodel/build/format/Parameters.pb.h +++ b/mlmodel/build/format/Parameters.pb.h @@ -107,6 +107,9 @@ extern Int64ToStringMap_MapEntryDefaultTypeInternal _Int64ToStringMap_MapEntry_d class Int64Vector; class Int64VectorDefaultTypeInternal; extern Int64VectorDefaultTypeInternal _Int64Vector_default_instance_; +class PrecisionRecallCurve; +class PrecisionRecallCurveDefaultTypeInternal; +extern PrecisionRecallCurveDefaultTypeInternal _PrecisionRecallCurve_default_instance_; class SequenceFeatureType; class SequenceFeatureTypeDefaultTypeInternal; extern SequenceFeatureTypeDefaultTypeInternal _SequenceFeatureType_default_instance_; diff --git a/mlmodel/build/format/SVM.pb.h b/mlmodel/build/format/SVM.pb.h index e6fce40b9..9be742495 100644 --- a/mlmodel/build/format/SVM.pb.h +++ b/mlmodel/build/format/SVM.pb.h @@ -116,6 +116,9 @@ extern LinearKernelDefaultTypeInternal _LinearKernel_default_instance_; class PolyKernel; class PolyKernelDefaultTypeInternal; extern PolyKernelDefaultTypeInternal _PolyKernel_default_instance_; +class PrecisionRecallCurve; +class PrecisionRecallCurveDefaultTypeInternal; +extern PrecisionRecallCurveDefaultTypeInternal 
_PrecisionRecallCurve_default_instance_; class RBFKernel; class RBFKernelDefaultTypeInternal; extern RBFKernelDefaultTypeInternal _RBFKernel_default_instance_; diff --git a/mlmodel/build/format/TextClassifier.pb.h b/mlmodel/build/format/TextClassifier.pb.h index 312be3e12..1634d956d 100644 --- a/mlmodel/build/format/TextClassifier.pb.h +++ b/mlmodel/build/format/TextClassifier.pb.h @@ -98,6 +98,9 @@ extern Int64ToStringMap_MapEntryDefaultTypeInternal _Int64ToStringMap_MapEntry_d class Int64Vector; class Int64VectorDefaultTypeInternal; extern Int64VectorDefaultTypeInternal _Int64Vector_default_instance_; +class PrecisionRecallCurve; +class PrecisionRecallCurveDefaultTypeInternal; +extern PrecisionRecallCurveDefaultTypeInternal _PrecisionRecallCurve_default_instance_; class SequenceFeatureType; class SequenceFeatureTypeDefaultTypeInternal; extern SequenceFeatureTypeDefaultTypeInternal _SequenceFeatureType_default_instance_; diff --git a/mlmodel/build/format/TreeEnsemble.pb.h b/mlmodel/build/format/TreeEnsemble.pb.h index 71800369f..a4e647a7d 100644 --- a/mlmodel/build/format/TreeEnsemble.pb.h +++ b/mlmodel/build/format/TreeEnsemble.pb.h @@ -99,6 +99,9 @@ extern Int64ToStringMap_MapEntryDefaultTypeInternal _Int64ToStringMap_MapEntry_d class Int64Vector; class Int64VectorDefaultTypeInternal; extern Int64VectorDefaultTypeInternal _Int64Vector_default_instance_; +class PrecisionRecallCurve; +class PrecisionRecallCurveDefaultTypeInternal; +extern PrecisionRecallCurveDefaultTypeInternal _PrecisionRecallCurve_default_instance_; class SequenceFeatureType; class SequenceFeatureTypeDefaultTypeInternal; extern SequenceFeatureTypeDefaultTypeInternal _SequenceFeatureType_default_instance_; diff --git a/mlmodel/build/format/VisionFeaturePrint.pb.cc b/mlmodel/build/format/VisionFeaturePrint.pb.cc index 72fb90f85..e86125d4b 100644 --- a/mlmodel/build/format/VisionFeaturePrint.pb.cc +++ b/mlmodel/build/format/VisionFeaturePrint.pb.cc @@ -108,7 +108,6 @@ bool VisionFeaturePrint_Objects_ObjectsVersion_IsValid(int value) { switch (value) { case 0: case 1: - case 2: return true; default: return false; @@ -118,7 +117,6 @@ bool VisionFeaturePrint_Objects_ObjectsVersion_IsValid(int value) { #if !defined(_MSC_VER) || _MSC_VER >= 1900 const VisionFeaturePrint_Objects_ObjectsVersion VisionFeaturePrint_Objects::OBJECTS_VERSION_INVALID; const VisionFeaturePrint_Objects_ObjectsVersion VisionFeaturePrint_Objects::OBJECTS_VERSION_1; -const VisionFeaturePrint_Objects_ObjectsVersion VisionFeaturePrint_Objects::OBJECTS_VERSION_2; const VisionFeaturePrint_Objects_ObjectsVersion VisionFeaturePrint_Objects::ObjectsVersion_MIN; const VisionFeaturePrint_Objects_ObjectsVersion VisionFeaturePrint_Objects::ObjectsVersion_MAX; const int VisionFeaturePrint_Objects::ObjectsVersion_ARRAYSIZE; diff --git a/mlmodel/build/format/VisionFeaturePrint.pb.h b/mlmodel/build/format/VisionFeaturePrint.pb.h index 4ceb47a40..db34011eb 100644 --- a/mlmodel/build/format/VisionFeaturePrint.pb.h +++ b/mlmodel/build/format/VisionFeaturePrint.pb.h @@ -79,13 +79,12 @@ const int VisionFeaturePrint_Scene_SceneVersion_SceneVersion_ARRAYSIZE = VisionF enum VisionFeaturePrint_Objects_ObjectsVersion { VisionFeaturePrint_Objects_ObjectsVersion_OBJECTS_VERSION_INVALID = 0, VisionFeaturePrint_Objects_ObjectsVersion_OBJECTS_VERSION_1 = 1, - VisionFeaturePrint_Objects_ObjectsVersion_OBJECTS_VERSION_2 = 2, VisionFeaturePrint_Objects_ObjectsVersion_VisionFeaturePrint_Objects_ObjectsVersion_INT_MIN_SENTINEL_DO_NOT_USE_ = ::google::protobuf::kint32min, 
VisionFeaturePrint_Objects_ObjectsVersion_VisionFeaturePrint_Objects_ObjectsVersion_INT_MAX_SENTINEL_DO_NOT_USE_ = ::google::protobuf::kint32max }; bool VisionFeaturePrint_Objects_ObjectsVersion_IsValid(int value); const VisionFeaturePrint_Objects_ObjectsVersion VisionFeaturePrint_Objects_ObjectsVersion_ObjectsVersion_MIN = VisionFeaturePrint_Objects_ObjectsVersion_OBJECTS_VERSION_INVALID; -const VisionFeaturePrint_Objects_ObjectsVersion VisionFeaturePrint_Objects_ObjectsVersion_ObjectsVersion_MAX = VisionFeaturePrint_Objects_ObjectsVersion_OBJECTS_VERSION_2; +const VisionFeaturePrint_Objects_ObjectsVersion VisionFeaturePrint_Objects_ObjectsVersion_ObjectsVersion_MAX = VisionFeaturePrint_Objects_ObjectsVersion_OBJECTS_VERSION_1; const int VisionFeaturePrint_Objects_ObjectsVersion_ObjectsVersion_ARRAYSIZE = VisionFeaturePrint_Objects_ObjectsVersion_ObjectsVersion_MAX + 1; // =================================================================== @@ -250,8 +249,6 @@ class VisionFeaturePrint_Objects : public ::google::protobuf::MessageLite /* @@p VisionFeaturePrint_Objects_ObjectsVersion_OBJECTS_VERSION_INVALID; static const ObjectsVersion OBJECTS_VERSION_1 = VisionFeaturePrint_Objects_ObjectsVersion_OBJECTS_VERSION_1; - static const ObjectsVersion OBJECTS_VERSION_2 = - VisionFeaturePrint_Objects_ObjectsVersion_OBJECTS_VERSION_2; static inline bool ObjectsVersion_IsValid(int value) { return VisionFeaturePrint_Objects_ObjectsVersion_IsValid(value); } diff --git a/mlmodel/build/format/VisionFeaturePrint_enums.h b/mlmodel/build/format/VisionFeaturePrint_enums.h index cc33b2531..0fe9c5dfe 100644 --- a/mlmodel/build/format/VisionFeaturePrint_enums.h +++ b/mlmodel/build/format/VisionFeaturePrint_enums.h @@ -28,7 +28,6 @@ enum MLSceneVersion: int { enum MLObjectsVersion: int { MLObjectsVersionOBJECTS_VERSION_INVALID = 0, MLObjectsVersionOBJECTS_VERSION_1 = 1, - MLObjectsVersionOBJECTS_VERSION_2 = 2, }; #endif diff --git a/mlmodel/build/format/WordEmbedding.pb.h b/mlmodel/build/format/WordEmbedding.pb.h index aab4c1f22..e9a3e74ce 100644 --- a/mlmodel/build/format/WordEmbedding.pb.h +++ b/mlmodel/build/format/WordEmbedding.pb.h @@ -98,6 +98,9 @@ extern Int64ToStringMap_MapEntryDefaultTypeInternal _Int64ToStringMap_MapEntry_d class Int64Vector; class Int64VectorDefaultTypeInternal; extern Int64VectorDefaultTypeInternal _Int64Vector_default_instance_; +class PrecisionRecallCurve; +class PrecisionRecallCurveDefaultTypeInternal; +extern PrecisionRecallCurveDefaultTypeInternal _PrecisionRecallCurve_default_instance_; class SequenceFeatureType; class SequenceFeatureTypeDefaultTypeInternal; extern SequenceFeatureTypeDefaultTypeInternal _SequenceFeatureType_default_instance_; diff --git a/mlmodel/build/format/WordTagger.pb.h b/mlmodel/build/format/WordTagger.pb.h index 6b4093830..64bc24642 100644 --- a/mlmodel/build/format/WordTagger.pb.h +++ b/mlmodel/build/format/WordTagger.pb.h @@ -98,6 +98,9 @@ extern Int64ToStringMap_MapEntryDefaultTypeInternal _Int64ToStringMap_MapEntry_d class Int64Vector; class Int64VectorDefaultTypeInternal; extern Int64VectorDefaultTypeInternal _Int64Vector_default_instance_; +class PrecisionRecallCurve; +class PrecisionRecallCurveDefaultTypeInternal; +extern PrecisionRecallCurveDefaultTypeInternal _PrecisionRecallCurve_default_instance_; class SequenceFeatureType; class SequenceFeatureTypeDefaultTypeInternal; extern SequenceFeatureTypeDefaultTypeInternal _SequenceFeatureType_default_instance_; diff --git a/mlmodel/format/ClassConfidenceThresholding.proto 
b/mlmodel/format/ClassConfidenceThresholding.proto
new file mode 100644
index 000000000..173296345
--- /dev/null
+++ b/mlmodel/format/ClassConfidenceThresholding.proto
@@ -0,0 +1,41 @@
+// Copyright (c) 2022, Apple Inc. All rights reserved.
+//
+// Use of this source code is governed by a BSD-3-clause license that can be
+// found in LICENSE.txt or at https://opensource.org/licenses/BSD-3-Clause
+
+syntax = "proto3";
+option optimize_for = LITE_RUNTIME;
+
+import public "DataStructures.proto";
+
+package CoreML.Specification;
+
+/* A model to filter classification labels by confidence thresholds.
+ *
+ * The model has one input:
+ * - A multi-array of type FP16, FP32, or FP64 and shape [C], where C
+ * is the number of classes.
+ *
+ * The model has one output:
+ * - A multi-array of type FP16, FP32, or FP64 and shape [2, C], where
+ * C is the number of classes. The values in [0, :] are the same as
+ * the confidence inputs. The values in [1, :] are either 0 or 1,
+ * where 1 means the class is present and 0 means it is not.
+ *
+ * Currently, the model simply takes all the classes.
+ *
+ * filteredClassConfidences[0, :] = classConfidences[:]
+ * filteredClassConfidences[1, :] = 1
+ */
+
+message ClassConfidenceThresholding {
+
+ /**
+ * The precision-recall curve for each class label.
+ *
+ * The field is optional. When it exists, the number of curves
+ * must match the number of class labels.
+ */
+ repeated PrecisionRecallCurve precisionRecallCurves = 100;
+}
+
diff --git a/mlmodel/format/DataStructures.proto b/mlmodel/format/DataStructures.proto
index 2574574b8..2abd48c76 100644
--- a/mlmodel/format/DataStructures.proto
+++ b/mlmodel/format/DataStructures.proto
@@ -93,3 +93,34 @@ message DoubleRange {
 double maxValue = 2;
 }
 
+/**
+ * Precision/Recall curve.
+ *
+ * The syntax comprises two tables, one to look up the confidence value threshold
+ * for a given precision, and the other for a given recall.
+ *
+ * Example:
+ * ----------------------+----+----+----+----+----+----+----+----+----
+ * precisionValues | .1 | .2 | .3 | .4 | .5 | .6 | .7 |
+ * precisionConfidence | .0 | .0 | .0 | .0 | .1 | .3 | .4 |
+ * ----------------------+----+----+----+----+----+----+----+----+----
+ *
+ * ----------------------+----+----+----+----+----+----+----+----+----
+ * recallValues | .1 | .2 | .3 | .4 | .5 | .6 | .7 | .8 | .9
+ * recallConfidence | .7 | .6 | .5 | .4 | .3 | .3 | .2 | .1 | .0
+ * ----------------------+----+----+----+----+----+----+----+----+----
+ *
+ * The application expects that, when it filters out samples with
+ * confidence threshold = 0.1, it gets precision = 0.5. Likewise,
+ * with threshold = 0.2 it gets recall = 0.7.
+ *
+ * The table must have only valid values; do not use `NaN`, `+/- INF`,
+ * or negative values. The application is responsible for interpolating or
+ * extrapolating an appropriate confidence threshold based on its specific needs.
+ */ +message PrecisionRecallCurve { + FloatVector precisionValues = 1; + FloatVector precisionConfidenceThresholds = 2; + FloatVector recallValues = 3; + FloatVector recallConfidenceThresholds = 4; +} diff --git a/mlmodel/format/MIL.proto b/mlmodel/format/MIL.proto index 598f0d1b1..5bd83a895 100644 --- a/mlmodel/format/MIL.proto +++ b/mlmodel/format/MIL.proto @@ -10,7 +10,7 @@ * - A Function consists of * - List of named inputs and output types * - A block defining scope for a function - similar to a function in C/C++ - * - A Block consists of + * - A Block consists of * - List of named inputs and output names * - Topologically sorted Ops * - A Op consists of @@ -23,7 +23,7 @@ * Identifiers, generally used for names and keys, must match the * regular expression [A-Za-z\_][A-Za-z0-9\_@]* */ - + syntax = "proto3"; option optimize_for = LITE_RUNTIME; @@ -82,7 +82,7 @@ message Block { // The names to give to values returned by this block. They must be // identifiers as described above. // - // ValueType of outputs[i] is Operation[j].outputs[k].type where + // ValueType of outputs[i] is Operation[j].outputs[k].type where // i, j and k are indices of block output, block Operation and // jth operation's output respectively. // this is due to @@ -202,6 +202,7 @@ enum DataType { FLOAT16 = 10; FLOAT32 = 11; FLOAT64 = 12; + BFLOAT16 = 13; // Ints INT8 = 21; @@ -312,7 +313,7 @@ message TensorValue { message RepeatedFloats { repeated float values = 1 [packed = true]; } - + message RepeatedDoubles { repeated double values = 1 [packed = true]; } diff --git a/mlmodel/format/Model.proto b/mlmodel/format/Model.proto index 6430f7745..56d55ad72 100644 --- a/mlmodel/format/Model.proto +++ b/mlmodel/format/Model.proto @@ -35,6 +35,7 @@ * - `LinkedModel` * - `SoundAnalysisPreprocessing` * - `ItemSimilarityRecommender` + * - `ClassConfidenceThresholding` * * Feature Engineering * - `Imputer` @@ -89,6 +90,7 @@ import public "Parameters.proto"; import public "ItemSimilarityRecommender.proto"; import public "SoundAnalysisPreprocessing.proto"; import public "LinkedModel.proto"; +import public "ClassConfidenceThresholding.proto"; package CoreML.Specification; @@ -258,7 +260,7 @@ message SerializedModel { * 8 : iOS 17, macOS 14, tvOS 17, watchOS 10 (Core ML 7) * - iOS 17 ops * - Scene print v2 - * - Detection print v2 + * - ClassConfidenceThresholding model */ message Model { int32 specificationVersion = 1; @@ -305,6 +307,9 @@ message Model { CustomModel customModel = 555; LinkedModel linkedModel = 556; + // Precision Recall Curve 'container'' + ClassConfidenceThresholding classConfidenceThresholding = 560; + // feature engineering starts at 600 OneHotEncoder oneHotEncoder = 600; Imputer imputer = 601; diff --git a/mlmodel/format/VisionFeaturePrint.proto b/mlmodel/format/VisionFeaturePrint.proto index c18230f5b..a87fdd40f 100644 --- a/mlmodel/format/VisionFeaturePrint.proto +++ b/mlmodel/format/VisionFeaturePrint.proto @@ -43,12 +43,6 @@ message VisionFeaturePrint { // features: one at high resolution of shape (288, 35, 35) // the other at low resolution of shape (768, 17, 17) OBJECTS_VERSION_1 = 1; - - // VERSION_2 is available on iOS,tvOS 17.0+, macOS 14.0+ - // It uses a 360x360 input image and yields two multiarray - // features: one at high resolution of shape (128, 45, 45) - // the other at low resolution of shape (168, 23, 23) - OBJECTS_VERSION_2 = 2; } ObjectsVersion version = 1; diff --git a/mlmodel/src/Comparison.cpp b/mlmodel/src/Comparison.cpp index 18843819b..2ff7567b2 100644 --- 
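For reference, the new messages defined above can be populated from Python once the protobuf bindings are regenerated from these .proto files. The sketch below is only illustrative: it assumes the regenerated module is exposed as coremltools.proto.Model_pb2 and that it includes the new messages, and it reuses the values from the example table in DataStructures.proto. (Field number 560 encodes to wire tag (560 << 3) | 2 = 4482, which is the constant checked by the regenerated parser in Model.pb.cc.)

# Minimal sketch, assuming regenerated Python bindings that include the new
# ClassConfidenceThresholding / PrecisionRecallCurve messages.
from coremltools.proto import Model_pb2

spec = Model_pb2.Model()
spec.specificationVersion = 8  # iOS 17 / Core ML 7, per Model.proto above

cct = spec.classConfidenceThresholding   # member of the Type oneof, field number 560
curve = cct.precisionRecallCurves.add()  # mutating the submessage selects this oneof member

# One curve per class label; values mirror the example table above.
curve.precisionValues.vector.extend([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7])
curve.precisionConfidenceThresholds.vector.extend([0.0, 0.0, 0.0, 0.0, 0.1, 0.3, 0.4])
curve.recallValues.vector.extend([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])
curve.recallConfidenceThresholds.vector.extend([0.7, 0.6, 0.5, 0.4, 0.3, 0.3, 0.2, 0.1, 0.0])

assert spec.WhichOneof("Type") == "classConfidenceThresholding"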
a/mlmodel/src/Comparison.cpp +++ b/mlmodel/src/Comparison.cpp @@ -107,6 +107,8 @@ namespace CoreML { return a.soundanalysispreprocessing() == b.soundanalysispreprocessing(); case Model::kLinkedModel: return a.linkedmodel() == b.linkedmodel(); + case Model::kClassConfidenceThresholding: + return a.classconfidencethresholding() == b.classconfidencethresholding(); case Model::TYPE_NOT_SET: return true; } @@ -823,6 +825,26 @@ namespace CoreML { return true; } + bool operator==(const ClassConfidenceThresholding& a, + const ClassConfidenceThresholding& b) { + if (!std::equal(a.precisionrecallcurves().begin(), a.precisionrecallcurves().end(), b.precisionrecallcurves().begin(), + [] (const CoreML::Specification::PrecisionRecallCurve& a, const CoreML::Specification::PrecisionRecallCurve& b) { + if (!std::equal(a.precisionvalues().vector().begin(), a.precisionvalues().vector().end(), + b.precisionvalues().vector().begin())) return false; + if (!std::equal(a.precisionconfidencethresholds().vector().begin(), a.precisionconfidencethresholds().vector().end(), + b.precisionconfidencethresholds().vector().begin())) return false; + if (!std::equal(a.recallvalues().vector().begin(), a.recallvalues().vector().end(), + b.recallvalues().vector().begin())) return false; + if (!std::equal(a.recallconfidencethresholds().vector().begin(), a.recallconfidencethresholds().vector().end(), + b.recallconfidencethresholds().vector().begin())) return false; + return true; + })) { + return false; + } else { + return true; + } + } + bool operator==(const CoreMLModels::WordTagger& a, const CoreMLModels::WordTagger& b) { diff --git a/mlmodel/src/Comparison.hpp b/mlmodel/src/Comparison.hpp index 2ed6aadcc..6325cca3b 100644 --- a/mlmodel/src/Comparison.hpp +++ b/mlmodel/src/Comparison.hpp @@ -107,6 +107,8 @@ namespace CoreML { const CustomModel& b); bool operator==(const LinkedModel& a, const LinkedModel& b); + bool operator==(const ClassConfidenceThresholding& a, + const ClassConfidenceThresholding& b); // Apple provided models bool operator==(const CoreMLModels::WordTagger& a, diff --git a/mlmodel/src/Globals.hpp b/mlmodel/src/Globals.hpp index 0954a4dbf..af8cbb895 100644 --- a/mlmodel/src/Globals.hpp +++ b/mlmodel/src/Globals.hpp @@ -59,7 +59,10 @@ namespace CoreML { // - GRAYSCALE_FLOAT16 image color space. static const int32_t MLMODEL_SPECIFICATION_VERSION_IOS16 = 7; - static const int32_t MLMODEL_SPECIFICATION_VERSION_NEWEST = MLMODEL_SPECIFICATION_VERSION_IOS16; + // version 8: + static const int32_t MLMODEL_SPECIFICATION_VERSION_IOS17 = 8; + + static const int32_t MLMODEL_SPECIFICATION_VERSION_NEWEST = MLMODEL_SPECIFICATION_VERSION_IOS17; } diff --git a/mlmodel/src/MILBlob/Bf16.hpp b/mlmodel/src/MILBlob/Bf16.hpp new file mode 100644 index 000000000..125e59c45 --- /dev/null +++ b/mlmodel/src/MILBlob/Bf16.hpp @@ -0,0 +1,57 @@ +// Copyright (c) 2021, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +#pragma once + +#include +#include + +namespace MILBlob { + +/** + * Struct for holding bytes that represent a bf16 number. 
+ * Floating point interface treats "bytes" as brain float16 floating point + * (https://en.wikipedia.org/wiki/Bfloat16_floating-point_format) + */ +struct Bf16 { + explicit Bf16(uint16_t bs) : bytes(bs) {} + Bf16() : bytes(0) {} + + static Bf16 FromFloat(float f); + + float GetFloat() const; + void SetFloat(float f); + + // NOLINTNEXTLINE(misc-non-private-member-variables-in-classes) + uint16_t bytes; +}; + +inline bool operator==(const Bf16& first, const Bf16& second) noexcept +{ + // Note this comparison is quick and dirty - it will give incorrect results + // for (-0.0 == 0.0) and, depending on bit pattern, (NaN == NaN). + return first.bytes == second.bytes; +} + +inline bool operator!=(const Bf16& first, const Bf16& second) noexcept +{ + // Note this comparison is quick and dirty - it will give incorrect results + // for (-0.0 != 0.0) and, depending on bit pattern, (NaN != NaN). + return first.bytes != second.bytes; +} + +} // namespace MILBlob + +namespace std { + +template <> +struct hash { + size_t operator()(const MILBlob::Bf16& fp) const + { + return fp.bytes; + } +}; + +} // namespace std diff --git a/mlmodel/src/MILBlob/Blob/BlobDataType.hpp b/mlmodel/src/MILBlob/Blob/BlobDataType.hpp new file mode 100644 index 000000000..1db5587c3 --- /dev/null +++ b/mlmodel/src/MILBlob/Blob/BlobDataType.hpp @@ -0,0 +1,71 @@ +// Copyright (c) 2021, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +#pragma once + +#include "MILBlob/Bf16.hpp" +#include "MILBlob/Fp16.hpp" + +namespace MILBlob { +namespace Blob { + +enum class BlobDataType : uint32_t +{ + // *** WARNING *** + // for binary compatibility, values should ONLY be added at the end. + // + // this file needs to remain in sync across multiple repos. + // please be cognizant of that when making changes to the + // format. 
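A side note on the Bf16 helper introduced above: a bfloat16 value is the high 16 bits of the corresponding IEEE-754 float32 bit pattern (sign, 8 exponent bits, top 7 mantissa bits). The sketch below illustrates that relationship, assuming plain truncation on the float-to-bf16 direction; the C++ FromFloat helper may round instead.

# Sketch of the bfloat16 <-> float32 relationship used by MILBlob::Bf16.
# Truncation is assumed here for simplicity.
import struct

def float_to_bf16_bits(f: float) -> int:
    bits, = struct.unpack("<I", struct.pack("<f", f))
    return bits >> 16                      # keep sign, exponent, top 7 mantissa bits

def bf16_bits_to_float(b: int) -> float:
    f, = struct.unpack("<f", struct.pack("<I", (b & 0xFFFF) << 16))
    return f

assert float_to_bf16_bits(1.0) == 0x3F80   # float32 1.0 is 0x3F800000
assert bf16_bits_to_float(0x3F80) == 1.0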
+ + Float16 = 1, + Float32 = 2, + UInt8 = 3, + Int8 = 4, + BFloat16 = 5, + Int16 = 6, + UInt16 = 7, +}; + +template +struct BlobDataTypeTraits; + +template <> +struct BlobDataTypeTraits { + static constexpr BlobDataType DataType = BlobDataType::Float32; +}; + +template <> +struct BlobDataTypeTraits { + static constexpr BlobDataType DataType = BlobDataType::Float16; +}; + +template <> +struct BlobDataTypeTraits { + static constexpr BlobDataType DataType = BlobDataType::BFloat16; +}; + +template <> +struct BlobDataTypeTraits { + static constexpr BlobDataType DataType = BlobDataType::UInt8; +}; + +template <> +struct BlobDataTypeTraits { + static constexpr BlobDataType DataType = BlobDataType::Int8; +}; + +template <> +struct BlobDataTypeTraits { + static constexpr BlobDataType DataType = BlobDataType::Int16; +}; + +template <> +struct BlobDataTypeTraits { + static constexpr BlobDataType DataType = BlobDataType::UInt16; +}; + +} // namespace Blob +} // namespace MILBlob diff --git a/mlmodel/src/MILBlob/Blob/StorageFormat.hpp b/mlmodel/src/MILBlob/Blob/StorageFormat.hpp index 05ecf209c..917c442c1 100644 --- a/mlmodel/src/MILBlob/Blob/StorageFormat.hpp +++ b/mlmodel/src/MILBlob/Blob/StorageFormat.hpp @@ -5,7 +5,7 @@ #pragma once -#include "MILBlob/Fp16.hpp" +#include "MILBlob/Blob/BlobDataType.hpp" namespace MILBlob { namespace Blob { @@ -40,44 +40,6 @@ constexpr uint64_t DefaultStorageAlignment = 64; // Default sentinel for validation for metadata constexpr uint64_t BlobMetadataSentinel = 0xDEADBEEF; -enum BlobDataType : uint32_t -{ - // *** WARNING *** - // for binary compatibility, values should ONLY be added at the end. - // - // this file needs to remain in sync across multiple repos. - // please be cognizant of that when making changes to the - // format. - - Float16 = 1, - Float32 = 2, - UInt8 = 3, - Int8 = 4, -}; - -template -struct BlobDataTypeTraits; - -template <> -struct BlobDataTypeTraits { - static constexpr BlobDataType DataType = BlobDataType::Float32; -}; - -template <> -struct BlobDataTypeTraits { - static constexpr BlobDataType DataType = BlobDataType::Float16; -}; - -template <> -struct BlobDataTypeTraits { - static constexpr BlobDataType DataType = BlobDataType::UInt8; -}; - -template <> -struct BlobDataTypeTraits { - static constexpr BlobDataType DataType = BlobDataType::Int8; -}; - /** * blob_metadata: stores information of blob present in weight file */ diff --git a/mlmodel/src/MILBlob/Blob/StorageReader.cpp b/mlmodel/src/MILBlob/Blob/StorageReader.cpp index 91b2779f1..65ede7742 100644 --- a/mlmodel/src/MILBlob/Blob/StorageReader.cpp +++ b/mlmodel/src/MILBlob/Blob/StorageReader.cpp @@ -3,6 +3,7 @@ // Use of this source code is governed by a BSD-3-clause license that can be // found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause +#include "MILBlob/Bf16.hpp" #include "MILBlob/Blob/MMapFileReader.hpp" #include "MILBlob/Blob/MMapFileReaderFactory.hpp" #include "MILBlob/Blob/StorageFormat.hpp" @@ -69,9 +70,39 @@ class StorageReader::Impl final { bool IsEncrypted() const { + EnsureLoaded(); return m_reader->IsEncrypted(); } + BlobDataType GetDataType(uint64_t metadataOffset) const + { + auto metadata = GetMetadata(metadataOffset); + return metadata.mil_dtype; + } + + std::vector GetAllOffsets() const + { + EnsureLoaded(); + + const auto& header = m_reader->ReadStruct(0); + auto numBlobs = header.count; + + std::vector allOffsets; + allOffsets.reserve(numBlobs); + // The first metadata offset lies just after the file header. 
+ uint64_t currMetadataOffset = sizeof(storage_header); + for (uint32_t i = 0; i < numBlobs; ++i) { + allOffsets.push_back(currMetadataOffset); + auto metadata = GetMetadata(currMetadataOffset); + // Update offset for next iteration to aligned value. + currMetadataOffset = metadata.offset + metadata.sizeInBytes; + if (currMetadataOffset % DefaultStorageAlignment != 0) { + currMetadataOffset += DefaultStorageAlignment - currMetadataOffset % DefaultStorageAlignment; + } + } + return allOffsets; + } + private: void EnsureLoaded() const { @@ -127,6 +158,12 @@ Util::Span StorageReader::GetDataView(uint64_t offset) c return m_impl->GetDataView(offset); } +template <> +Util::Span StorageReader::GetDataView(uint64_t offset) const +{ + return m_impl->GetDataView(offset); +} + template <> Util::Span StorageReader::GetDataView(uint64_t offset) const { @@ -139,6 +176,18 @@ Util::Span StorageReader::GetDataView(uint64_t offset) const return m_impl->GetDataView(offset); } +template <> +Util::Span StorageReader::GetDataView(uint64_t offset) const +{ + return m_impl->GetDataView(offset); +} + +template <> +Util::Span StorageReader::GetDataView(uint64_t offset) const +{ + return m_impl->GetDataView(offset); +} + Util::Span StorageReader::GetRawDataView(uint64_t offset) const { return m_impl->GetRawDataView(offset); @@ -158,3 +207,13 @@ bool StorageReader::IsEncrypted() const { return m_impl->IsEncrypted(); } + +BlobDataType StorageReader::GetDataType(uint64_t metadataOffset) const +{ + return m_impl->GetDataType(metadataOffset); +} + +std::vector StorageReader::GetAllOffsets() const +{ + return m_impl->GetAllOffsets(); +} diff --git a/mlmodel/src/MILBlob/Blob/StorageReader.hpp b/mlmodel/src/MILBlob/Blob/StorageReader.hpp index 9c5c4e0a5..56caeb508 100644 --- a/mlmodel/src/MILBlob/Blob/StorageReader.hpp +++ b/mlmodel/src/MILBlob/Blob/StorageReader.hpp @@ -5,10 +5,13 @@ #pragma once +#include "MILBlob/Bf16.hpp" #include "MILBlob/Fp16.hpp" +#include "MILBlob/Blob/BlobDataType.hpp" #include "MILBlob/Util/Span.hpp" #include #include +#include namespace MILBlob { namespace Blob { @@ -20,8 +23,11 @@ namespace Blob { * * This file format supports the following types: * - uint8_t + * - Bf16 * - Fp16 * - float + * - int16_t + * - uint16_t */ class StorageReader final { public: @@ -40,6 +46,7 @@ class StorageReader final { * Returns a Span view into the underlying memory-mapped storage. The * file will be mapped into memory on first access. This is valid for the * supported types noted above. + * NOTE: `offset` should be the metadata offset. * @throws std::range_error if offset is not valid. */ template @@ -49,6 +56,7 @@ class StorageReader final { * Returns an uint8_t Span view into the underlying memory-mapped storage. The * file will be mapped into memory on first access. This is valid for the * supported types noted above. + * NOTE: `offset` should be the metadata offset. * @throws std::range_error if offset is not valid. */ Util::Span GetRawDataView(uint64_t offset) const; @@ -68,6 +76,15 @@ class StorageReader final { /** Returns true if the underlying file is encrypted. */ bool IsEncrypted() const; + /** + * Returns the storage type of the data blob for the given metadata offset + * @throws std::range_error if metadataOffset is not valid. + */ + BlobDataType GetDataType(uint64_t metadataOffset) const; + + /** Returns a vector containing the metadata offsets for all blobs in the file, in order. 
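The offset arithmetic in GetAllOffsets above can be summarized as: start just past the file header, and after each blob round its end offset up to the 64-byte storage alignment (DefaultStorageAlignment) to find where the next metadata record begins. The following is only a simplified model of that bookkeeping, not the actual file-format parser:

# Simplified model of the metadata walk in StorageReader::GetAllOffsets.
ALIGNMENT = 64  # mirrors DefaultStorageAlignment

def next_metadata_offset(data_offset: int, size_in_bytes: int) -> int:
    # metadata.offset + metadata.sizeInBytes, rounded up to the alignment
    end = data_offset + size_in_bytes
    if end % ALIGNMENT != 0:
        end += ALIGNMENT - end % ALIGNMENT
    return end

# A blob whose data starts at byte 192 and occupies 100 bytes ends at 292,
# so the next metadata record is expected at 320 (the next multiple of 64).
assert next_metadata_offset(192, 100) == 320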
*/ + std::vector GetAllOffsets() const; + private: class Impl; const std::unique_ptr m_impl; @@ -78,9 +95,15 @@ Util::Span StorageReader::GetDataView(uint64_t) const; template <> Util::Span StorageReader::GetDataView(uint64_t) const; template <> +Util::Span StorageReader::GetDataView(uint64_t) const; +template <> Util::Span StorageReader::GetDataView(uint64_t) const; template <> Util::Span StorageReader::GetDataView(uint64_t) const; +template <> +Util::Span StorageReader::GetDataView(uint64_t) const; +template <> +Util::Span StorageReader::GetDataView(uint64_t) const; } // namespace Blob } // namespace MILBlob diff --git a/mlmodel/src/MILBlob/Blob/StorageWriter.cpp b/mlmodel/src/MILBlob/Blob/StorageWriter.cpp index 726a3ceee..b57774d1c 100644 --- a/mlmodel/src/MILBlob/Blob/StorageWriter.cpp +++ b/mlmodel/src/MILBlob/Blob/StorageWriter.cpp @@ -3,6 +3,7 @@ // Use of this source code is governed by a BSD-3-clause license that can be // found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause +#include "MILBlob/Bf16.hpp" #include "MILBlob/Blob/FileWriter.hpp" #include "MILBlob/Blob/StorageFormat.hpp" #include "MILBlob/Blob/StorageWriter.hpp" @@ -34,7 +35,8 @@ class StorageWriter::Impl final { ~Impl() = default; Impl(const std::string& filePath, bool truncateFile) - : m_fileWriter(std::make_unique(filePath, truncateFile)) + : m_filePath(filePath) + , m_fileWriter(std::make_unique(filePath, truncateFile)) { if (truncateFile) { m_fileWriter->WriteData(CastAndMakeSpan(m_header), 0); @@ -61,7 +63,13 @@ class StorageWriter::Impl final { template uint64_t WriteData(Util::Span data); + std::string GetFilePath() const + { + return m_filePath; + } + private: + std::string m_filePath; std::unique_ptr m_fileWriter; storage_header m_header; }; @@ -119,6 +127,12 @@ uint64_t StorageWriter::WriteData(Util::Span data) return m_impl->WriteData(data); } +template <> +uint64_t StorageWriter::WriteData(Util::Span data) +{ + return m_impl->WriteData(data); +} + template <> uint64_t StorageWriter::WriteData(Util::Span data) { @@ -130,3 +144,20 @@ uint64_t StorageWriter::WriteData(Util::Span data) { return m_impl->WriteData(data); } + +template <> +uint64_t StorageWriter::WriteData(Util::Span data) +{ + return m_impl->WriteData(data); +} + +template <> +uint64_t StorageWriter::WriteData(Util::Span data) +{ + return m_impl->WriteData(data); +} + +std::string StorageWriter::GetFilePath() const +{ + return m_impl->GetFilePath(); +} diff --git a/mlmodel/src/MILBlob/Blob/StorageWriter.hpp b/mlmodel/src/MILBlob/Blob/StorageWriter.hpp index 77903a5b3..00a1423a5 100644 --- a/mlmodel/src/MILBlob/Blob/StorageWriter.hpp +++ b/mlmodel/src/MILBlob/Blob/StorageWriter.hpp @@ -5,6 +5,7 @@ #pragma once +#include "MILBlob/Bf16.hpp" #include "MILBlob/Fp16.hpp" #include "MILBlob/Util/Span.hpp" #include @@ -36,6 +37,11 @@ class StorageWriter final { template uint64_t WriteData(Util::Span data); + /** + * Returns the file path of the blob storage file. 
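+     * This is the path that was supplied to the StorageWriter constructor.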
+ */ + std::string GetFilePath() const; + private: class Impl; const std::unique_ptr m_impl; @@ -46,9 +52,15 @@ uint64_t StorageWriter::WriteData(Util::Span); template <> uint64_t StorageWriter::WriteData(Util::Span); template <> +uint64_t StorageWriter::WriteData(Util::Span); +template <> uint64_t StorageWriter::WriteData(Util::Span); template <> uint64_t StorageWriter::WriteData(Util::Span); +template <> +uint64_t StorageWriter::WriteData(Util::Span); +template <> +uint64_t StorageWriter::WriteData(Util::Span); } // namespace Blob } // namespace MILBlob diff --git a/mlmodel/src/MILBlob/Fp16.cpp b/mlmodel/src/MILBlob/Fp16.cpp new file mode 100644 index 000000000..ae1e71a10 --- /dev/null +++ b/mlmodel/src/MILBlob/Fp16.cpp @@ -0,0 +1,31 @@ +// Copyright (c) 2021, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +#include "MILBlob/Fp16.hpp" + +// fp16 lib code has some conversion warnings we don't want to globally ignore +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wincompatible-pointer-types" +#pragma clang diagnostic ignored "-Wsign-conversion" +#pragma clang diagnostic ignored "-Wconversion" +#include "fp16/fp16.h" +#pragma clang diagnostic pop + +using namespace MILBlob; + +/* static */ Fp16 Fp16::FromFloat(float f) +{ + return Fp16(fp16_ieee_from_fp32_value(f)); +} + +float Fp16::GetFloat() const +{ + return fp16_ieee_to_fp32_value(bytes); +} + +void Fp16::SetFloat(float f) +{ + bytes = fp16_ieee_from_fp32_value(f); +} diff --git a/mlmodel/src/Model.cpp b/mlmodel/src/Model.cpp index 0e48ef164..cbc2cb9c8 100644 --- a/mlmodel/src/Model.cpp +++ b/mlmodel/src/Model.cpp @@ -114,6 +114,7 @@ namespace CoreML { VALIDATE_MODEL_TYPE(soundAnalysisPreprocessing); VALIDATE_MODEL_TYPE(audioFeaturePrint); VALIDATE_MODEL_TYPE(linkedModel); + VALIDATE_MODEL_TYPE(classConfidenceThresholding); case MLModelType_serializedModel: case MLModelType_mlProgram: return {}; diff --git a/mlmodel/src/ResultReason.hpp b/mlmodel/src/ResultReason.hpp index b2fc3b87f..4d7234199 100644 --- a/mlmodel/src/ResultReason.hpp +++ b/mlmodel/src/ResultReason.hpp @@ -33,6 +33,7 @@ enum class ResultReason { MODEL_MAIN_INPUT_RANK_MISMATCHED, MODEL_MAIN_INPUT_SHAPE_MISMATCHED, MODEL_MAIN_INPUT_TYPE_MISMATCHED, + MODEL_MAIN_INPUT_UNBOUNDED_UPPER_RANGE, MODEL_MAIN_OUTPUT_COUNT_MISMATCHED, MODEL_MAIN_OUTPUT_RANK_MISMATCHED, MODEL_MAIN_OUTPUT_SHAPE_MISMATCHED, diff --git a/mlmodel/src/Utils.cpp b/mlmodel/src/Utils.cpp index 397cd6279..a0cbc3e87 100644 --- a/mlmodel/src/Utils.cpp +++ b/mlmodel/src/Utils.cpp @@ -121,6 +121,10 @@ void CoreML::downgradeSpecificationVersion(Specification::Model *pModel) { // lets start at the newest specification version and downgrade from there pModel->set_specificationversion(MLMODEL_SPECIFICATION_VERSION_NEWEST); } + + if (pModel->specificationversion() == MLMODEL_SPECIFICATION_VERSION_IOS17 && !hasIOS17Features(*pModel)) { + pModel->set_specificationversion(MLMODEL_SPECIFICATION_VERSION_IOS16); + } if (pModel->specificationversion() == MLMODEL_SPECIFICATION_VERSION_IOS16 && !hasIOS16Features(*pModel)) { pModel->set_specificationversion(MLMODEL_SPECIFICATION_VERSION_IOS15); @@ -328,6 +332,19 @@ bool CoreML::hasFloat16MultiArray(const Specification::Model& model) { return false; } +bool CoreML::hasCoreML7Opsets(const Specification::Model& model) { + if (model.Type_case() == Specification::Model::kMlProgram) { + auto main_iter = 
model.mlprogram().functions().find("main"); + if (main_iter != model.mlprogram().functions().end()) { + const auto& main = main_iter->second; + if (main.opset() == "CoreML7") { + return true; + } + } + } + return false; +} + bool CoreML::hasCoreML6Opsets(const Specification::Model& model) { if (model.Type_case() == Specification::Model::kMlProgram) { auto main_iter = model.mlprogram().functions().find("main"); @@ -639,26 +656,56 @@ bool CoreML::hasIOS16Features(const Specification::Model& model) { return result; } -__attribute__((__unused__)) bool CoreML::hasIOS17Features(const Specification::Model& model) { // New in IOS17 features: - // - Apple Vision feature extractor for scenes using scene net V5 (revision == 2) - // - Apple Vision feature extractor for objects using scene net v5 (revision == 2) + // - Revision 2 of Apple Vision feature extractor for scenes + // - BERT embedding for text classifier and word tagger (revision == 4) + + bool result = false; switch (model.Type_case()) { + case Specification::Model::kPipeline: + for (auto &m : model.pipeline().models()) { + result = result || hasIOS17Features(m); + if (result) { + return true; + } + } + break; + case Specification::Model::kPipelineRegressor: + for (auto &m : model.pipelineregressor().pipeline().models()) { + result = result || hasIOS17Features(m); + if (result) { + return true; + } + } + break; + case Specification::Model::kPipelineClassifier: + for (auto &m : model.pipelineclassifier().pipeline().models()) { + result = result || hasIOS17Features(m); + if (result) { + return true; + } + } + break; case Specification::Model::kVisionFeaturePrint: if (model.visionfeatureprint().has_scene() && model.visionfeatureprint().scene().version() == 2) { return true; } - if (model.visionfeatureprint().has_objects() && model.visionfeatureprint().objects().version() == 2) { - return true; - } - + break; + case Specification::Model::kClassConfidenceThresholding: + return true; + case Specification::Model::kWordTagger: + return model.wordtagger().revision() == 4; + case Specification::Model::kTextClassifier: + return model.textclassifier().revision() == 4; default: break; } - return false; + result = result || hasCoreML7Opsets(model); + + return result; } bool CoreML::hasCustomModel(const Specification::Model& model) { diff --git a/mlmodel/src/Utils.hpp b/mlmodel/src/Utils.hpp index a547fba7f..94792ed97 100644 --- a/mlmodel/src/Utils.hpp +++ b/mlmodel/src/Utils.hpp @@ -148,6 +148,7 @@ namespace CoreML { bool hasFloat16MultiArray(const Specification::Model& model); bool hasGrayscaleFloat16Image(const Specification::Model& model); bool hasCoreML6Opsets(const Specification::Model& model); + bool hasCoreML7Opsets(const Specification::Model& model); bool hasModelOrSubModelProperty(const Specification::Model& model, const std::function &boolFunc); diff --git a/mlmodel/src/Validation/ClassConfidenceThresholdingValidator.cpp b/mlmodel/src/Validation/ClassConfidenceThresholdingValidator.cpp new file mode 100644 index 000000000..5154f610e --- /dev/null +++ b/mlmodel/src/Validation/ClassConfidenceThresholdingValidator.cpp @@ -0,0 +1,70 @@ +// +// ClassConfidenceThresholdingValidator.cpp +// libmlmodelspec +#include "Result.hpp" +#include "Validators.hpp" +#include "ValidatorUtils-inl.hpp" +#include "../build/format/Model.pb.h" +#include + +namespace CoreML { + + template <> + Result validate(const Specification::Model& format) { + const auto& interface = format.description(); + Result result; + + result = validateModelDescription(interface, 
format.specificationversion()); + if (!result.good()) { + return result; + } + + // validate precisionRecallCurves + google::protobuf::RepeatedPtrField precisionrecallcurves = + format.classconfidencethresholding().precisionrecallcurves(); + int nCurves = precisionrecallcurves.size(); + if (nCurves > 0) { + for (int i = 0; i < nCurves; ++i) { + int precisionvaluesElts = precisionrecallcurves.Get(i).precisionvalues().vector().size(); + int precisionthreshElts = precisionrecallcurves.Get(i).precisionconfidencethresholds().vector().size(); + if (0 == precisionvaluesElts || precisionvaluesElts != precisionthreshElts) { + return Result(ResultType::INVALID_MODEL_PARAMETERS, "Zero length or mismatched precisionRecallCurves components"); + } + + int recallvaluesElts = precisionrecallcurves.Get(i).recallvalues().vector().size(); + int recallthreshElts = precisionrecallcurves.Get(i).recallconfidencethresholds().vector().size(); + if (0 == recallvaluesElts || recallvaluesElts != recallthreshElts) { + return Result(ResultType::INVALID_MODEL_PARAMETERS, "Zero length or mismatched precisionRecallCurves components"); + } + + for (auto elt : precisionrecallcurves.Get(i).precisionvalues().vector()) { + if (std::isinf(elt) || std::isnan(elt) || elt < 0.0f) { + return Result(ResultType::INVALID_MODEL_PARAMETERS, "An element of precisionvalues is not a positive number or zero."); + } + } + + for (auto elt : precisionrecallcurves.Get(i).precisionconfidencethresholds().vector()) { + if (std::isinf(elt) || std::isnan(elt) || elt < 0.0f) { + return Result(ResultType::INVALID_MODEL_PARAMETERS, "An element of precisionconfidencethresholds is not a positive number or zero."); + } + } + + for (auto elt : precisionrecallcurves.Get(i).recallvalues().vector()) { + if (std::isinf(elt) || std::isnan(elt) || elt < 0.0f) { + return Result(ResultType::INVALID_MODEL_PARAMETERS, "An element of recallvalues is not a positive number or zero."); + } + } + + for (auto elt : precisionrecallcurves.Get(i).recallconfidencethresholds().vector()) { + if (std::isinf(elt) || std::isnan(elt) || elt < 0.0f) { + return Result(ResultType::INVALID_MODEL_PARAMETERS, "An element of recallconfidencethresholds is not a positive number or zero."); + } + } + } + return Result(); + } else { + return Result(ResultType::INVALID_MODEL_PARAMETERS, "The ClassConfidenceThresholding model has no precisionRecallCurves."); + } + } +} + diff --git a/mlmodel/src/Validation/VisionFeaturePrintValidator.cpp b/mlmodel/src/Validation/VisionFeaturePrintValidator.cpp index bfec863a0..627a6cdc0 100644 --- a/mlmodel/src/Validation/VisionFeaturePrintValidator.cpp +++ b/mlmodel/src/Validation/VisionFeaturePrintValidator.cpp @@ -49,13 +49,13 @@ namespace CoreML { case Specification::CoreMLModels::VisionFeaturePrint::kObjects: if (visionFeaturePrint.objects().version() == Specification::CoreMLModels::VisionFeaturePrint_Objects_ObjectsVersion_OBJECTS_VERSION_INVALID) { return Result(ResultType::INVALID_MODEL_PARAMETERS, "Version for objects is invalid"); - } else if (visionFeaturePrint.objects().version() == Specification::CoreMLModels::VisionFeaturePrint_Objects_ObjectsVersion_OBJECTS_VERSION_1 || visionFeaturePrint.objects().version() == Specification::CoreMLModels::VisionFeaturePrint_Objects_ObjectsVersion_OBJECTS_VERSION_2) { + } else if (visionFeaturePrint.objects().version() == Specification::CoreMLModels::VisionFeaturePrint_Objects_ObjectsVersion_OBJECTS_VERSION_1) { if (visionFeaturePrint.objects().output_size() != 2) { return 
Result(ResultType::INVALID_MODEL_PARAMETERS, "Two outputs for objects need to be provided"); } - // validate the outputs: only two outputs with multiarray type is allowed for version 1 and version 2. + // validate the outputs: only two outputs with multiarray type is allowed for version 1. result = validateDescriptionsContainFeatureWithTypes(interface.output(), 2, {Specification::FeatureType::kMultiArrayType}); if (!result.good()) { return result; diff --git a/mlmodel/tests/MILBlob/BlobUtils.cpp b/mlmodel/tests/MILBlob/BlobUtils.cpp index 6fddd81e9..0f746deff 100644 --- a/mlmodel/tests/MILBlob/BlobUtils.cpp +++ b/mlmodel/tests/MILBlob/BlobUtils.cpp @@ -99,6 +99,70 @@ AutoDeleteTempFile MakeStorageTempFileWith3Records() // 512 Bytes // DATA 4 0xf8f0, 0x02fe, 0x1008, 0x0000, // [-16, -8, -2, 2, 8, 16] + // Padding + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + // 576 Bytes + // METADATA 5 + 0xBEEF, 0xDEAD, 0x0005, 0x0000, // sentinel=0xDEADBEEF, mil_dtype=Bf16 + 0x0008, 0x0000, 0x0000, 0x0000, // sizeInBytes=8 bytes + 0x0280, 0x0000, 0x0000, 0x0000, // offset + 0x0000, 0x0000, 0x0000, 0x0000, // reserved_0 + 0x0000, 0x0000, 0x0000, 0x0000, // reserved_1 + 0x0001, 0x0000, 0x0000, 0x0000, // reserved_2 + 0x0000, 0x0000, 0x0000, 0x0000, // reserved_3 + 0x0000, 0x0000, 0x0000, 0x0000, // reserved_4 + // 640 Bytes + // DATA 5 + 0x000E, 0xC0FE, 0x0810, 0x0000, + // Padding + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + // 704 BYTES + // METADATA 6 + 0xBEEF, 0xDEAD, 0x0006, 0x0000, // sentinel=0xDEADBEEF, mil_dtype=int16 + 0x0004, 0x0000, 0x0000, 0x0000, // sizeInBytes=4 bytes + 0x0300, 0x0000, 0x0000, 0x0000, // offset + 0x0000, 0x0000, 0x0000, 0x0000, // reserved_0 + 0x0000, 0x0000, 0x0000, 0x0000, // reserved_1 + 0x0001, 0x0000, 0x0000, 0x0000, // reserved_2 + 0x0000, 0x0000, 0x0000, 0x0000, // reserved_3 + 0x0000, 0x0000, 0x0000, 0x0000, // reserved_4 + // 768 BYTES + // DATA 6 + 0xe8d0, 0x007e, 0x0000, 0x0000, + // Padding + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + // 832 BYTES + // METADATA 7 + 0xBEEF, 0xDEAD, 0x0007, 0x0000, // sentinel=0xDEADBEEF, mil_dtype=uint16 + 0x0004, 0x0000, 0x0000, 0x0000, // sizeInBytes=4 bytes + 0x0380, 0x0000, 0x0000, 0x0000, // offset + 0x0000, 0x0000, 0x0000, 0x0000, // reserved_0 + 0x0000, 0x0000, 0x0000, 0x0000, // reserved_1 + 0x0001, 0x0000, 0x0000, 0x0000, // reserved_2 + 0x0000, 0x0000, 0x0000, 0x0000, // reserved_3 + 0x0000, 0x0000, 0x0000, 0x0000, // reserved_4 + // 896 BYTES + // DATA 7 + 0xe8d0, 0x007e, 0x0000, 0x0000, + }; // clang-format on diff --git a/mlmodel/tests/MILBlob/StorageIntegrationTests.cpp b/mlmodel/tests/MILBlob/StorageIntegrationTests.cpp index 2077f414f..5c710ffa4 100644 --- a/mlmodel/tests/MILBlob/StorageIntegrationTests.cpp +++ b/mlmodel/tests/MILBlob/StorageIntegrationTests.cpp @@ -3,6 +3,7 @@ // Use of this source code is governed by a BSD-3-clause license that can be // found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause 
+#include "MILBlob/Bf16.hpp" #include "MILBlob/Blob/StorageReader.hpp" #include "MILBlob/Blob/StorageWriter.hpp" #include "MILBlob/Fp16.hpp" @@ -25,8 +26,11 @@ int testStorageIntegrationTestsWriteAndReadValues() const std::vector data0 = {0x02, 0x00, 0x40, 0x00, 0x07}; const std::vector data1 = {Fp16(0x000E), Fp16(0xC0FE), Fp16(0x0810)}; const std::vector data2 = {0x700000, 0xC0FEE, 0x8FACE, 0x91FADE}; + const std::vector data3 = {Bf16(0x000E), Bf16(0xC0FE), Bf16(0x0810)}; + const std::vector data4 = {int16_t(0xF041), int16_t(0x8000), 0x75}; + const std::vector data5 = {0x0, 0xFFFF, 0x7064}; - uint64_t offset0, offset1, offset2; + uint64_t offset0, offset1, offset2, offset3, offset4, offset5; { StorageWriter writer(tempfile.GetFilename()); // offset in bytes for reference @@ -38,13 +42,28 @@ int testStorageIntegrationTestsWriteAndReadValues() // padding: 132 - 191 // offset1: 192 // metadata0: 192 - 255, - // data0: 256 - 261 + // data1: 256 - 261 offset1 = writer.WriteData(Util::MakeSpan(data1)); // padding: 262 - 319 - // offset1: 320 + // offset2: 320 // metadata0: 320 - 384, - // data0: 384 - 400 + // data2: 384 - 400 offset2 = writer.WriteData(Util::MakeSpan(data2)); + // padding: 401 - 447 + // offset3: 448 + // metadata0: 448 - 512, + // data3: 512 - 517 + offset3 = writer.WriteData(Util::MakeSpan(data3)); + // padding: 518 - 575 + // offset4: 576 + // metadata0: 576 - 640, + // data4: 640 - 645 + offset4 = writer.WriteData(Util::MakeSpan(data4)); + // padding: 646 - 703 + // offset5: 704 + // metadata0: 704 - 768 + // data5: 768 - 773 + offset5 = writer.WriteData(Util::MakeSpan(data5)); } StorageReader reader(tempfile.GetFilename()); @@ -63,6 +82,21 @@ int testStorageIntegrationTestsWriteAndReadValues() auto out2 = reader.GetDataView(offset2); ML_ASSERT_SPAN_EQ(Util::MakeSpan(data2), out2); + // Validate data3 + ML_ASSERT_EQ(offset3, uint64_t(448)); + auto out3 = reader.GetDataView(offset3); + ML_ASSERT_SPAN_EQ(Util::MakeSpan(data3), out3); + + // Validate data4 + ML_ASSERT_EQ(offset4, uint64_t(576)); + auto out4 = reader.GetDataView(offset4); + ML_ASSERT_SPAN_EQ(Util::MakeSpan(data4), out4); + + // Validate data5 + ML_ASSERT_EQ(offset5, uint64_t(704)); + auto out5 = reader.GetDataView(offset5); + ML_ASSERT_SPAN_EQ(Util::MakeSpan(data5), out5); + return 0; } @@ -106,4 +140,3 @@ int testStorageIntegrationTestsReadDataWithIncorrectOffset() return 0; } - diff --git a/mlmodel/tests/MILBlob/StorageReaderTests.cpp b/mlmodel/tests/MILBlob/StorageReaderTests.cpp index 1facaed36..b7dbc2813 100644 --- a/mlmodel/tests/MILBlob/StorageReaderTests.cpp +++ b/mlmodel/tests/MILBlob/StorageReaderTests.cpp @@ -3,6 +3,8 @@ // Use of this source code is governed by a BSD-3-clause license that can be // found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause +#include "MILBlob/Bf16.hpp" +#include "MILBlob/Blob/BlobDataType.hpp" #include "MILBlob/Blob/StorageReader.hpp" #include "MILBlob/Blob/StorageWriter.hpp" #include "MILBlob/Fp16.hpp" @@ -250,6 +252,30 @@ int testStorageReaderTestsThreeRecords() ML_ASSERT_SPAN_EQ(data, Util::SpanCast(Util::MakeSpan(expectedValues))); } + { // read Bf16 weights from metadata 4 + auto data = reader.GetDataView(576); + ML_ASSERT_EQ(data.Size(), 8 / sizeof(Bf16)); + + std::vector expectedValues = {Bf16(0x000E), Bf16(0xC0FE), Bf16(0x0810), Bf16(0x0000)}; + ML_ASSERT_SPAN_EQ(data, Util::MakeSpan(expectedValues)); + } + + { // read int16_t weights from metadata 5 + auto data = reader.GetDataView(704); + ML_ASSERT_EQ(data.Size(), 4 / 
sizeof(int16_t)); + + std::vector expectedValues = {int16_t(0xe8d0), int16_t(0x007e)}; + ML_ASSERT_SPAN_EQ(data, Util::MakeSpan(expectedValues)); + } + + { // read uint16_t weights from metadata 6 + auto data = reader.GetDataView(832); + ML_ASSERT_EQ(data.Size(), 4 / sizeof(uint16_t)); + + std::vector expectedValues = {uint16_t(0xe8d0), uint16_t(0x007e)}; + ML_ASSERT_SPAN_EQ(data, Util::MakeSpan(expectedValues)); + } + return 0; } @@ -283,6 +309,30 @@ int testStorageReaderTestsRawData() ML_ASSERT_SPAN_EQ(data, Util::SpanCast(Util::MakeSpan(expectedValues))); } + { // read Bf16 weights from metadata 4 + auto data = reader.GetRawDataView(576); + ML_ASSERT_EQ(data.Size(), size_t(8)); + + std::vector expectedValues = {Bf16(0x000E), Bf16(0xC0FE), Bf16(0x0810), Bf16(0x0000)}; + ML_ASSERT_SPAN_EQ(data, Util::SpanCast(Util::MakeSpan(expectedValues))); + } + + { // read int16_t weights from metadata 5 + auto data = reader.GetRawDataView(704); + ML_ASSERT_EQ(data.Size(), size_t(4)); + + std::vector expectedValue = {int16_t(0xe8d0), int16_t(0x7e)}; + ML_ASSERT_SPAN_EQ(data, Util::SpanCast(Util::MakeSpan(expectedValue))); + } + + { // read uint16_t weights from metadata 5 + auto data = reader.GetRawDataView(832); + ML_ASSERT_EQ(data.Size(), size_t(4)); + + std::vector expectedValue = {uint16_t(0xe8d0), uint16_t(0x7e)}; + ML_ASSERT_SPAN_EQ(data, Util::SpanCast(Util::MakeSpan(expectedValue))); + } + return 0; } @@ -295,16 +345,25 @@ int testStorageReaderTestsDataOffset() { // read data offset for uint8_t weights from metadata 1 ML_ASSERT_EQ(uint64_t(128), reader.GetDataOffset(64)); ML_ASSERT_EQ(uint64_t(5), reader.GetDataSize(64)); + ML_ASSERT_EQ(BlobDataType::UInt8, reader.GetDataType(64)); } { // read data offset for Fp16 weights from metadata 2 ML_ASSERT_EQ(uint64_t(256), reader.GetDataOffset(192)); ML_ASSERT_EQ(uint64_t(8), reader.GetDataSize(192)); + ML_ASSERT_EQ(BlobDataType::Float16, reader.GetDataType(192)); } { // read data offset for float weights from metadata 3 ML_ASSERT_EQ(uint64_t(384), reader.GetDataOffset(320)); ML_ASSERT_EQ(uint64_t(16), reader.GetDataSize(320)); + ML_ASSERT_EQ(BlobDataType::Float32, reader.GetDataType(320)); + } + + { // read data offset for Bf16 weights from metadata 4 + ML_ASSERT_EQ(uint64_t(640), reader.GetDataOffset(576)); + ML_ASSERT_EQ(uint64_t(8), reader.GetDataSize(576)); + ML_ASSERT_EQ(BlobDataType::BFloat16, reader.GetDataType(576)); } return 0; @@ -322,6 +381,7 @@ int testStorageReaderTestsInt8Data() } StorageReader reader(tempfile.GetFilename()); + ML_ASSERT_EQ(reader.GetDataType(offset), BlobDataType::Int8); const auto readData = reader.GetDataView(offset); ML_ASSERT_EQ(readData.Size(), data.size()); @@ -330,3 +390,58 @@ int testStorageReaderTestsInt8Data() return 0; } +int testStorageReaderTestsAllOffsets() +{ + AutoDeleteTempFile tempfile; + const std::vector> dataVectors = {{1, -1, -20, 25, 13}, + {2, -2, -40, 50, 26}, + {3, -3, -60, 75, 39}}; + std::vector originalOffsets; + originalOffsets.reserve(3); + { + StorageWriter writer(tempfile.GetFilename()); + for (size_t i = 0; i < dataVectors.size(); ++i) { + auto span = Util::MakeSpan(dataVectors[i]); + originalOffsets.push_back(writer.WriteData(span)); + } + } + + StorageReader reader(tempfile.GetFilename()); + auto obtainedOffsets = reader.GetAllOffsets(); + ML_ASSERT_EQ(obtainedOffsets.size(), 3); + for (size_t i = 0; i < 3; ++i) { + ML_ASSERT_EQ(obtainedOffsets[i], originalOffsets[i]); + const auto readData = reader.GetDataView(obtainedOffsets[i]); + ML_ASSERT_EQ(readData.Size(), 
dataVectors[i].size()); + ML_ASSERT_SPAN_EQ(readData, Util::MakeSpan(dataVectors[i])); + } + + return 0; +} + +int testStorageReaderTestsAllOffsetsWithEmptyBlobFile() +{ + AutoDeleteTempFile tempfile; + { + StorageWriter writer(tempfile.GetFilename()); + } + + StorageReader reader(tempfile.GetFilename()); + auto obtainedOffsets = reader.GetAllOffsets(); + ML_ASSERT_EQ(obtainedOffsets.size(), 0); + + return 0; +} + +int testStorageReaderTestsIsEncryptedWithEmptyBlobFile() +{ + AutoDeleteTempFile tempfile; + { + StorageWriter writer(tempfile.GetFilename()); + } + + StorageReader reader(tempfile.GetFilename()); + ML_ASSERT_NOT(reader.IsEncrypted()); + + return 0; +} diff --git a/mlmodel/tests/MILBlob/StorageWriterTests.cpp b/mlmodel/tests/MILBlob/StorageWriterTests.cpp index 0ea071a5f..04f561d57 100644 --- a/mlmodel/tests/MILBlob/StorageWriterTests.cpp +++ b/mlmodel/tests/MILBlob/StorageWriterTests.cpp @@ -3,6 +3,7 @@ // Use of this source code is governed by a BSD-3-clause license that can be // found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause +#include "MILBlob/Bf16.hpp" #include "MILBlob/Blob/StorageFormat.hpp" #include "MILBlob/Blob/StorageWriter.hpp" #include "MILBlob/Fp16.hpp" @@ -61,6 +62,7 @@ int testStorageWriterTestsSupportedTypes() { AutoDeleteTempFile tempfile; auto filePath = tempfile.GetFilename(); + uint32_t headerCount = 0; // Writing uint8_t values { @@ -73,11 +75,59 @@ int testStorageWriterTestsSupportedTypes() } ML_ASSERT_EQ(offset % DefaultStorageAlignment, uint64_t(0)); - ML_ASSERT(IsCorrectHeader(filePath, 1 /*count*/)); + ML_ASSERT(IsCorrectHeader(filePath, ++headerCount /*count*/)); ML_ASSERT(IsCorrectMetadata(filePath, offset, 4, BlobDataType::UInt8)); ML_ASSERT(IsCorrectData(filePath, offset, expectedSpan)); } + // Writing uint16_t values + { + const std::vector val = {0xFFC2, 0x0, 0x8000, 0x03DE}; + auto expectedSpan = Util::MakeSpan(val); + uint64_t offset = 0; + { + StorageWriter writer(tempfile.GetFilename(), /* truncateFile */ false); + offset = writer.WriteData(expectedSpan); + } + + ML_ASSERT_EQ(offset % DefaultStorageAlignment, uint64_t(0)); + ML_ASSERT(IsCorrectHeader(filePath, ++headerCount)); + ML_ASSERT(IsCorrectMetadata(filePath, offset, 4, BlobDataType::UInt16)); + ML_ASSERT(IsCorrectData(filePath, offset, expectedSpan)); + } + + // Writing int16_t values + { + const std::vector val = {int16_t(0xFFC2), 0x7FFF, int16_t(0x8000), 0x03DE}; + auto expectedSpan = Util::MakeSpan(val); + uint64_t offset = 0; + { + StorageWriter writer(tempfile.GetFilename(), /* truncateFile */ false); + offset = writer.WriteData(expectedSpan); + } + + ML_ASSERT_EQ(offset % DefaultStorageAlignment, uint64_t(0)); + ML_ASSERT(IsCorrectHeader(filePath, ++headerCount)); + ML_ASSERT(IsCorrectMetadata(filePath, offset, 4, BlobDataType::Int16)); + ML_ASSERT(IsCorrectData(filePath, offset, expectedSpan)); + } + + // Writing bf16 values + { + const std::vector val = {Bf16(0x12), Bf16(0x00), Bf16(0x124), Bf16(0xabcd)}; + auto expectedSpan = Util::MakeSpan(val); + uint64_t offset = 0; + { + StorageWriter writer(tempfile.GetFilename(), /* truncateFile */ false); + offset = writer.WriteData(expectedSpan); + } + + ML_ASSERT_EQ(offset % DefaultStorageAlignment, uint64_t(0)); + ML_ASSERT(IsCorrectHeader(filePath, ++headerCount /*count*/)); + ML_ASSERT(IsCorrectMetadata(filePath, offset, 4, BlobDataType::BFloat16)); + ML_ASSERT(IsCorrectData(filePath, offset, expectedSpan)); + } + // Writing fp16 values { const std::vector val = {Fp16(0x12), Fp16(0x00), 
Fp16(0x124), Fp16(0xabcd)}; @@ -89,7 +139,7 @@ int testStorageWriterTestsSupportedTypes() } ML_ASSERT_EQ(offset % DefaultStorageAlignment, uint64_t(0)); - ML_ASSERT(IsCorrectHeader(filePath, 2 /*count*/)); + ML_ASSERT(IsCorrectHeader(filePath, ++headerCount /*count*/)); ML_ASSERT(IsCorrectMetadata(filePath, offset, 4, BlobDataType::Float16)); ML_ASSERT(IsCorrectData(filePath, offset, expectedSpan)); } @@ -105,7 +155,7 @@ int testStorageWriterTestsSupportedTypes() } ML_ASSERT_EQ(offset % DefaultStorageAlignment, uint64_t(0)); - ML_ASSERT(IsCorrectHeader(filePath, 3 /*count*/)); + ML_ASSERT(IsCorrectHeader(filePath, ++headerCount /*count*/)); ML_ASSERT(IsCorrectMetadata(filePath, offset, 4, BlobDataType::Float32)); ML_ASSERT(IsCorrectData(filePath, offset, expectedSpan)); } @@ -121,7 +171,7 @@ int testStorageWriterTestsSupportedTypes() } ML_ASSERT_EQ(offset % DefaultStorageAlignment, uint64_t(0)); - ML_ASSERT(IsCorrectHeader(filePath, 4 /*count*/)); + ML_ASSERT(IsCorrectHeader(filePath, ++headerCount /*count*/)); ML_ASSERT(IsCorrectMetadata(filePath, offset, 4, BlobDataType::Int8)); ML_ASSERT(IsCorrectData(filePath, offset, expectedSpan)); } @@ -220,4 +270,3 @@ int testStorageWriterTestsAlignment() return 0; } - diff --git a/mlmodel/tests/MLModelTests.hpp b/mlmodel/tests/MLModelTests.hpp index aeb43f2e8..1a935772c 100644 --- a/mlmodel/tests/MLModelTests.hpp +++ b/mlmodel/tests/MLModelTests.hpp @@ -298,11 +298,14 @@ MLMODEL_TEST(testSpanTestsStaticSizedAccessMutable) MLMODEL_TEST(testStorageIntegrationTestsReadDataWithIncorrectOffset) MLMODEL_TEST(testStorageIntegrationTestsReadDataWithIncorrectType) MLMODEL_TEST(testStorageIntegrationTestsWriteAndReadValues) +MLMODEL_TEST(testStorageReaderTestsAllOffsets) +MLMODEL_TEST(testStorageReaderTestsAllOffsetsWithEmptyBlobFile) MLMODEL_TEST(testStorageReaderTestsBasicProperties) MLMODEL_TEST(testStorageReaderTestsDataOffset) MLMODEL_TEST(testStorageReaderTestsIncorrectDType) MLMODEL_TEST(testStorageReaderTestsIncorrectMetadata) MLMODEL_TEST(testStorageReaderTestsInt8Data) +MLMODEL_TEST(testStorageReaderTestsIsEncryptedWithEmptyBlobFile) MLMODEL_TEST(testStorageReaderTestsRawData) MLMODEL_TEST(testStorageReaderTestsThreeRecords) MLMODEL_TEST(testStorageReaderTestsTruncatedData) diff --git a/reqs/build.pip b/reqs/build.pip index 5612d28e6..2acd5e9a4 100644 --- a/reqs/build.pip +++ b/reqs/build.pip @@ -1,6 +1,8 @@ -numpy==1.21.0; platform_machine == "arm64" and python_version != "3.10" -numpy<1.20; platform_machine != "arm64" and python_version != "3.10" +# Use the oldest possible version of numpy +numpy==1.21.0; platform_machine == "arm64" and python_version < "3.10" +numpy<1.20; platform_machine != "arm64" and python_version < "3.10" numpy==1.21.3; python_version == "3.10" +numpy==1.23.2; python_version == "3.11" # rdar://93977023 protobuf<=3.20.3; python_version < "3.7" @@ -11,3 +13,6 @@ six sympy tqdm wheel +attrs +cattrs +pyaml diff --git a/reqs/test.pip b/reqs/test.pip index 98a340967..0d4c69556 100644 --- a/reqs/test.pip +++ b/reqs/test.pip @@ -5,7 +5,7 @@ h5py==3.8.0; platform_machine != "arm64" and python_version >= "3.8" future numpy>1.18.4; platform_machine != "arm64" numpy==1.24.2; platform_machine == "arm64" -libsvm +libsvm; python_version < '3.11' olefile==0.44 pandas parameterized==0.8.1 @@ -13,10 +13,14 @@ pillow pytest==7.1.2 pytest-cov pytest-sugar + scikit-learn==0.19.2; python_version < '3.8' scikit-learn==1.1.2; python_version >= '3.8' -scipy > 1.4 ; python_version < '3.8' -scipy==1.8.1; python_version >= '3.8' + +scipy>1.4 ; 
python_version < '3.8' +scipy==1.8.1; python_version >= '3.8' and python_version < '3.11' +scipy==1.9.2; python_version == '3.11' + six sympy > 1.6 gast==0.4.0 @@ -34,11 +38,17 @@ tensorflow==2.12.0; platform_machine != "arm64" tensorflow-estimator==2.12.0; platform_machine != "arm64" keras==2.12.0; platform_machine != "arm64" -# TensorFlow (arm64) related package -tensorflow-macos==2.11.0; platform_machine == "arm64" -tensorflow-estimator==2.11.0; platform_machine == "arm64" -keras==2.11.0; platform_machine == "arm64" +# TensorFlow (arm64) related package. Currently no Python 3.11 support. +tensorflow-macos==2.11.0; platform_machine == "arm64" and python_version < "3.11" +tensorflow-estimator==2.11.0; platform_machine == "arm64" and python_version < "3.11" +keras==2.11.0; platform_machine == "arm64" and python_version < "3.11" -tensorflow-addons==0.19.0 +tensorflow-addons==0.19.0; python_version < "3.11" tensorflow-hub==0.12.0 transformers==4.26.0 + +# coremltools.optimize.torch +filelock==3.6.0 +pytest-flake8==1.0.7 +pytest-xdist==2.5.0 +pytest-mock==3.8.2 diff --git a/setup.py b/setup.py index 019c7c75a..4f5f6688b 100755 --- a/setup.py +++ b/setup.py @@ -66,7 +66,14 @@ url="https://github.com/apple/coremltools", packages=find_packages(), package_data={ - "": ["LICENSE.txt", "README.md", "libmilstoragepython.so", "libcoremlpython.so", "libmodelpackage.so"] + "": [ + "_core.*.so", # kmeans1d + "libcoremlpython.so", + "libmilstoragepython.so", + "libmodelpackage.so", + "LICENSE.txt", + "README.md", + ] }, install_requires=[ "numpy >= 1.14.5", @@ -74,6 +81,9 @@ "sympy", "tqdm", "packaging", + "attrs", + "cattrs", + "pyaml", ], classifiers=[ "Development Status :: 5 - Production/Stable", @@ -84,6 +94,7 @@ "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", "Topic :: Scientific/Engineering", "Topic :: Software Development", ],
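A minimal usage sketch of how the blob-storage additions in this change fit together: writing Bf16 and int16_t weights with StorageWriter, then reading them back and inspecting them through the new StorageReader::GetDataType and StorageReader::GetAllOffsets APIs. It assumes the single-argument StorageWriter constructor creates or truncates the file at the given path, as the tests above do; the file path and weight values are purely illustrative.

// Usage sketch for the new blob-storage APIs (illustrative path and values).
#include "MILBlob/Bf16.hpp"
#include "MILBlob/Blob/BlobDataType.hpp"
#include "MILBlob/Blob/StorageReader.hpp"
#include "MILBlob/Blob/StorageWriter.hpp"
#include "MILBlob/Util/Span.hpp"

#include <cstdint>
#include <string>
#include <vector>

using namespace MILBlob;
using namespace MILBlob::Blob;

int main()
{
    const std::string path = "example_weights.bin";  // illustrative file name

    // Write one bfloat16 blob and one int16 blob. WriteData returns the
    // 64-byte-aligned metadata offset that identifies each record.
    const std::vector<Bf16> bf16Weights = {Bf16(0x3F80), Bf16(0xC0FE), Bf16(0x0810)};
    const std::vector<int16_t> int16Weights = {-16, -8, 8, 16};

    uint64_t bf16Offset = 0;
    uint64_t int16Offset = 0;
    {
        StorageWriter writer(path);
        bf16Offset = writer.WriteData(Util::MakeSpan(bf16Weights));
        int16Offset = writer.WriteData(Util::MakeSpan(int16Weights));
    }

    // Read the records back. GetDataType reports the stored dtype for a given
    // metadata offset, and GetAllOffsets enumerates every record in write order.
    StorageReader reader(path);
    auto bf16View = reader.GetDataView<Bf16>(bf16Offset);
    auto int16View = reader.GetDataView<int16_t>(int16Offset);

    const bool typesMatch = reader.GetDataType(bf16Offset) == BlobDataType::BFloat16 &&
                            reader.GetDataType(int16Offset) == BlobDataType::Int16;
    const auto allOffsets = reader.GetAllOffsets();  // {bf16Offset, int16Offset}

    const bool ok = typesMatch && allOffsets.size() == 2 &&
                    bf16View.Size() == bf16Weights.size() &&
                    int16View.Size() == int16Weights.size();
    return ok ? 0 : 1;
}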