From f96051e154f7c3a735c17108f1bff840bab2b355 Mon Sep 17 00:00:00 2001
From: "Kern Handa (KERN)"
Date: Wed, 10 Jun 2020 21:26:30 +0000
Subject: [PATCH] Merged PR 1583: ELL release v3.1.0

ELL release v3.1.0

- Move to VS 2019
- Fix a codegen error that resulted in incorrect functional behavior
- Fix regressions in the audio training tutorial (#232)
- Add importing of Sum nodes to the ONNX importer
- Fix a crash in LLVMContext::SetName
- Improve performance of CNN models on the Pi 3 with new implementations of spatial, pointwise and regular convolutions
- Improve performance of the reorder node
- New nodes: ReorderDataCodeNode, SpatialConvolutionNode, MatrixMatrixMultiplyCodeNode
- Implement parallelization strategies for matrix multiplication nodes
- Enable the new MatrixMatrixMultiplyCodeNode path only for select ARM targets such as the Pi, not Intel/AMD CPUs
- Add the `--skip_ellcode` flag to the `compile` and `wrap.py` tools to use OpenBLAS for linear algebra computations
---
 .gitattributes | 5 + .gitignore | 1 + CMake/CommonInterfaces.cmake | 4 +- CMake/CopySharedLibraries.cmake | 14 +- CMake/LLVMSetup.cmake | 3 - CMake/OpenBLASSetup.cmake | 13 +- CMakeLists.txt | 37 +- History.md | 13 + VERSION | 2 +- docs/gallery/ILSVRC2012/Asparagus.md | 4 +- docs/gallery/ILSVRC2012/Bean.md | 6 +- docs/gallery/ILSVRC2012/Buckthorn.md | 6 +- docs/gallery/ILSVRC2012/Carrot.md | 6 +- docs/gallery/ILSVRC2012/CashewNut.md | 6 +- docs/gallery/ILSVRC2012/Chalta.md | 6 +- docs/gallery/ILSVRC2012/Clary.md | 6 +- docs/gallery/ILSVRC2012/Clover.md | 6 +- docs/gallery/ILSVRC2012/Coconut.md | 6 +- docs/gallery/ILSVRC2012/Ginger.md | 6 +- docs/gallery/ILSVRC2012/Mashua.md | 4 +- docs/gallery/ILSVRC2012/PandanFlower.md | 6 +- docs/gallery/ILSVRC2012/Pear.md | 6 +- docs/gallery/ILSVRC2012/SevenSisters.md | 6 +- docs/gallery/ILSVRC2012/Sweetsop.md | 6 +- docs/gallery/ILSVRC2012/Tamarind.md | 6 +- docs/gallery/ILSVRC2012/WaterApple.md | 6 +- docs/gallery/ILSVRC2012/Wattleseed.md | 4 +- .../index.md | 1 - interfaces/CMakeLists.txt | 1 + .../MatrixMatrixMultiplyImplementation.h | 17 + .../common/include/ModelBuilderInterface.h | 3 + interfaces/common/include/ModelInterface.h | 3 + interfaces/common/model.i | 2 + interfaces/common/model_python_post.i | 10 + .../common/src/ModelBuilderInterface.cpp | 126 +- interfaces/common/src/ModelInterface.cpp | 1 + interfaces/python/CMakeLists.txt | 17 +- interfaces/python/package/CMakeLists.txt | 9 +- interfaces/python/package/ell/CMakeLists.txt | 26 +- .../python/package/ell/nodes/__init__.py | 1 + interfaces/python/test/CMakeLists.txt | 4 +- interfaces/python/test/compiled_model_test.py | 3 + .../common/include/MapCompilerArguments.h | 2 + libraries/common/src/LoadModel.cpp | 6 + libraries/common/src/MapCompilerArguments.cpp | 16 + libraries/emittable_functions/CMakeLists.txt | 6 +- libraries/emitters/CMakeLists.txt | 43 +- libraries/emitters/include/CompilerOptions.h | 6 + libraries/emitters/include/EmitterTypes.h | 30 +- .../emitters/include/FunctionDeclaration.h | 3 +- libraries/emitters/include/IRAssemblyWriter.h | 6 + libraries/emitters/include/IREmitter.h | 7 +- .../emitters/include/IRExecutionEngine.h | 10 +- .../emitters/include/IRFunctionEmitter.h | 184 +- libraries/emitters/include/IRIfEmitter.h | 4 +- libraries/emitters/include/IRLocalValue.h | 9 +- libraries/emitters/include/IRLoopEmitter.h | 10 +- libraries/emitters/include/IRModuleEmitter.h | 101 +- libraries/emitters/include/IRPosixRuntime.h | 4 +- libraries/emitters/include/IRRuntime.h | 10 +- libraries/emitters/include/LLVMUtilities.h
| 6 + libraries/emitters/include/ModuleEmitter.h | 2 +- libraries/emitters/include/TargetDevice.h | 14 + libraries/emitters/src/CompilerOptions.cpp | 2 + libraries/emitters/src/EmitterTypes.cpp | 26 + libraries/emitters/src/IRAssemblyWriter.cpp | 5 + libraries/emitters/src/IREmitter.cpp | 28 +- libraries/emitters/src/IRExecutionEngine.cpp | 15 +- libraries/emitters/src/IRFunctionEmitter.cpp | 264 +- libraries/emitters/src/IRIfEmitter.cpp | 4 +- libraries/emitters/src/IRLocalValue.cpp | 25 + libraries/emitters/src/IRLoopEmitter.cpp | 87 +- libraries/emitters/src/IRModuleEmitter.cpp | 282 +- libraries/emitters/src/IRRuntime.cpp | 25 +- libraries/emitters/src/LLVMUtilities.cpp | 88 +- libraries/emitters/src/TargetDevice.cpp | 17 + .../templates/LLVMEmitterTargets.h.in | 16 + .../emitters/test/include/IREmitterTest.h | 2 + libraries/emitters/test/src/IREmitterTest.cpp | 75 + libraries/emitters/test/src/main.cpp | 2 + libraries/math/CMakeLists.txt | 1 + libraries/math/src/BlasWrapper.cpp | 10 +- libraries/model/include/Map.h | 2 - libraries/model/include/OutputPort.h | 1 - libraries/model/src/CompilableCodeNode.cpp | 4 +- libraries/model/src/Map.cpp | 5 + .../model/test/include/CompilableNodesTest.h | 8 + .../model/test/src/CompilableNodesTest.cpp | 288 + .../test/src/model_compiler_test_main.cpp | 29 +- .../include/ModelTestUtilities.h | 54 +- libraries/nodes/CMakeLists.txt | 5 + .../nodes/include/BroadcastOperationNodes.h | 519 +- libraries/nodes/include/IRNode.h | 2 - .../include/MatrixMatrixMultiplyCodeNode.h | 271 + .../MatrixMatrixMultiplyImplementation.h | 23 + libraries/nodes/include/NodeOperations.h | 3 +- libraries/nodes/include/ReorderDataCodeNode.h | 586 ++ .../nodes/include/SpatialConvolutionNode.h | 239 + .../nodes/include/UnrolledConvolutionNode.h | 1 + .../src/BinaryConvolutionalLayerNode.cpp | 4 +- .../nodes/src/ConvolutionalLayerNode.cpp | 19 +- .../src/MatrixMatrixMultiplyCodeNode.cpp | 549 ++ .../nodes/src/NeuralNetworkPredictorNode.cpp | 4 +- .../nodes/src/UnrolledConvolutionNode.cpp | 99 +- .../nodes/src/WinogradConvolutionNode.cpp | 4 +- .../nodes/test/src/BasicMathNodesTests.cpp | 8 +- libraries/nodes/test/src/DSPNodesTests.cpp | 8 +- libraries/optimization/CMakeLists.txt | 4 +- .../optimization/include/VectorSolution.h | 2 +- libraries/optimization/src/Interval.cpp | 2 +- ...OptimizeReorderDataNodesTransformation.cpp | 10 +- .../passes/test/src/ModelOptimizerTest.cpp | 26 +- .../passes/test/src/TransformationTest.cpp | 26 +- libraries/utilities/CMakeLists.txt | 4 + libraries/utilities/include/EnumFlagHelpers.h | 39 + libraries/utilities/include/FunctionUtils.h | 79 +- libraries/utilities/include/MemoryLayout.h | 8 + libraries/utilities/include/StringUtil.h | 14 + .../utilities/include/TunableParameters.h | 165 + libraries/utilities/include/TypeAliases.h | 1 + libraries/utilities/include/TypeTraits.h | 7 +- libraries/utilities/src/Files.cpp | 7 +- libraries/utilities/src/MemoryLayout.cpp | 31 +- libraries/utilities/src/StringUtil.cpp | 10 + .../test/include/TunableParameters_test.h | 15 + .../test/src/TunableParameters_test.cpp | 80 + libraries/utilities/test/src/main.cpp | 5 + libraries/value/CMakeLists.txt | 61 +- libraries/value/README.md | 2 +- libraries/value/include/Array.h | 185 + libraries/value/include/CachingProvider.h | 191 + libraries/value/include/CachingStrategies.h | 63 + libraries/value/include/ComputeContext.h | 22 +- libraries/value/include/CppEmitterContext.h | 166 + libraries/value/include/EmitterContext.h | 214 +- 
libraries/value/include/FunctionDeclaration.h | 151 +- libraries/value/include/LLVMContext.h | 53 +- libraries/value/include/LoopNests.h | 224 + libraries/value/include/Matrix.h | 40 + libraries/value/include/MatrixOperations.h | 8 +- libraries/value/include/Print.h | 36 + libraries/value/include/Scalar.h | 1 + libraries/value/include/ScalarOperations.h | 29 + libraries/value/include/TensorOperations.h | 6 + libraries/value/include/Value.h | 13 +- libraries/value/include/ValueOperations.h | 7 + libraries/value/include/ValueType.h | 5 + libraries/value/include/VectorOperations.h | 6 + .../value/include/loopnests/CodeGenerator.h | 43 + .../loopnests/CodePositionConstraints.h | 166 + libraries/value/include/loopnests/ForAll.h | 40 + libraries/value/include/loopnests/Index.h | 68 + .../value/include/loopnests/IndexRange.h | 50 + .../value/include/loopnests/IterationDomain.h | 47 + libraries/value/include/loopnests/Kernel.h | 166 + .../value/include/loopnests/KernelPredicate.h | 315 + .../value/include/loopnests/LoopIndexInfo.h | 41 + libraries/value/include/loopnests/LoopNest.h | 292 + .../value/include/loopnests/LoopNestPrinter.h | 65 + .../value/include/loopnests/LoopNestVisitor.h | 138 + libraries/value/include/loopnests/Range.h | 44 + .../value/include/loopnests/SplitIndexRange.h | 112 + .../include/loopnests/SplitIterationDomain.h | 95 + libraries/value/src/Array.cpp | 120 + libraries/value/src/CachingProvider.cpp | 39 + libraries/value/src/CachingStrategies.cpp | 1943 ++++++ libraries/value/src/ComputeContext.cpp | 668 +- libraries/value/src/CppEmitterContext.cpp | 1753 +++++ libraries/value/src/EmitterContext.cpp | 260 +- libraries/value/src/FunctionDeclaration.cpp | 136 +- libraries/value/src/LLVMContext.cpp | 719 +- libraries/value/src/LoopNests.cpp | 354 + libraries/value/src/Matrix.cpp | 7 +- libraries/value/src/MatrixOperations.cpp | 34 +- libraries/value/src/Print.cpp | 65 + libraries/value/src/Scalar.cpp | 15 +- libraries/value/src/ScalarOperations.cpp | 58 + libraries/value/src/TensorOperations.cpp | 14 +- libraries/value/src/Value.cpp | 12 + libraries/value/src/ValueOperations.cpp | 5 + libraries/value/src/Vector.cpp | 1 + libraries/value/src/VectorOperations.cpp | 28 +- .../value/src/loopnests/CodeGenerator.cpp | 461 ++ .../src/loopnests/CodePositionConstraints.cpp | 104 + libraries/value/src/loopnests/ForAll.cpp | 49 + libraries/value/src/loopnests/Index.cpp | 55 + libraries/value/src/loopnests/IndexRange.cpp | 65 + .../value/src/loopnests/IterationDomain.cpp | 60 + libraries/value/src/loopnests/Kernel.cpp | 148 + .../value/src/loopnests/KernelPredicate.cpp | 703 ++ libraries/value/src/loopnests/LoopNest.cpp | 900 +++ .../value/src/loopnests/LoopNestPrinter.cpp | 484 ++ .../value/src/loopnests/LoopNestVisitor.cpp | 1057 +++ libraries/value/src/loopnests/Range.cpp | 65 + .../value/src/loopnests/SplitIndexRange.cpp | 506 ++ .../src/loopnests/SplitIterationDomain.cpp | 254 + .../value/test/include/CachingStrategy_test.h | 148 + libraries/value/test/include/Functions_test.h | 18 + .../value/test/include/LoopNestAPI_test.h | 37 + .../test/include/LoopNest_convolution_test.h | 18 + .../value/test/include/LoopNest_kernels.h | 42 + libraries/value/test/include/LoopNest_test.h | 89 + libraries/value/test/include/Matrix_test.h | 1 + libraries/value/test/include/Scalar_test.h | 2 + libraries/value/test/include/TestUtil.h | 95 +- libraries/value/test/include/Value_test.h | 21 + .../value/test/src/CachingStrategy_test.cpp | 6171 +++++++++++++++++ 
libraries/value/test/src/Functions_test.cpp | 61 + libraries/value/test/src/LoopNestAPI_test.cpp | 1090 +++ .../test/src/LoopNest_convolution_test.cpp | 198 + libraries/value/test/src/LoopNest_kernels.cpp | 234 + libraries/value/test/src/LoopNest_test.cpp | 3661 ++++++++++ libraries/value/test/src/Matrix_test.cpp | 68 +- libraries/value/test/src/Scalar_test.cpp | 71 +- libraries/value/test/src/Tensor_test.cpp | 24 +- libraries/value/test/src/TestUtil.cpp | 371 +- libraries/value/test/src/Value_test.cpp | 617 +- libraries/value/test/src/Vector_test.cpp | 16 +- libraries/value/test/src/main.cpp | 327 +- tools/importers/CNTK/cntk_to_ell.py | 32 +- tools/importers/torch/test/CMakeLists.txt | 5 +- tools/trainers/forestTrainer/CMakeLists.txt | 2 +- tools/trainers/linearTrainer/CMakeLists.txt | 22 +- tools/trainers/protoNNTrainer/CMakeLists.txt | 2 +- .../sweepingSGDTrainer/CMakeLists.txt | 2 +- tools/utilities/finetune/CMakeLists.txt | 9 +- .../nodeTiming/gemmCodeNode/.gitignore | 2 + .../nodeTiming/gemmCodeNode/README.md | 46 + .../gemmCodeNode/deploy/full_pass.cmd | 12 + .../gemmCodeNode/deploy/full_pass.sh | 17 + .../nodeTiming/gemmCodeNode/deploy/run.py | 41 + .../gemmCodeNode/deploy/timing_aggregator.py | 95 + .../gemmCodeNode/scripts/build_gemm_models.py | 128 + .../gemmCodeNode/scripts/build_tests.py | 289 + .../scripts/make_default_models.py | 71 + .../scripts/special_model_args.py | 13 + .../gemmCodeNode/src/CMakeLists.txt.in | 21 + .../nodeTiming/gemmCodeNode/src/Runner.cpp.in | 134 + tools/utilities/pitest/drivetest.py | 7 +- tools/utilities/print/src/PrintGraph.cpp | 2 + .../profile/CMakeLists-device-parallel.txt.in | 3 +- tools/utilities/profile/CMakeLists.txt | 2 +- .../utilities/pythonlibs/audio/view_audio.py | 9 +- tools/utilities/pythonlibs/buildtools.py | 40 +- tools/wrap/CMakeLists.txt | 2 +- tools/wrap/templates/CMakeLists.cpp.txt.in | 6 +- tools/wrap/templates/CMakeLists.python.txt.in | 1 + tools/wrap/wrap.py | 43 +- 248 files changed, 32163 insertions(+), 1723 deletions(-) create mode 100644 interfaces/common/include/MatrixMatrixMultiplyImplementation.h create mode 100644 libraries/emitters/templates/LLVMEmitterTargets.h.in create mode 100644 libraries/nodes/include/MatrixMatrixMultiplyCodeNode.h create mode 100644 libraries/nodes/include/MatrixMatrixMultiplyImplementation.h create mode 100644 libraries/nodes/include/ReorderDataCodeNode.h create mode 100644 libraries/nodes/include/SpatialConvolutionNode.h create mode 100644 libraries/nodes/src/MatrixMatrixMultiplyCodeNode.cpp create mode 100644 libraries/utilities/include/EnumFlagHelpers.h create mode 100644 libraries/utilities/include/TunableParameters.h create mode 100644 libraries/utilities/test/include/TunableParameters_test.h create mode 100644 libraries/utilities/test/src/TunableParameters_test.cpp create mode 100644 libraries/value/include/Array.h create mode 100644 libraries/value/include/CachingProvider.h create mode 100644 libraries/value/include/CachingStrategies.h create mode 100644 libraries/value/include/CppEmitterContext.h create mode 100644 libraries/value/include/LoopNests.h create mode 100644 libraries/value/include/Print.h create mode 100644 libraries/value/include/ScalarOperations.h create mode 100644 libraries/value/include/loopnests/CodeGenerator.h create mode 100644 libraries/value/include/loopnests/CodePositionConstraints.h create mode 100644 libraries/value/include/loopnests/ForAll.h create mode 100644 libraries/value/include/loopnests/Index.h create mode 100644 
libraries/value/include/loopnests/IndexRange.h create mode 100644 libraries/value/include/loopnests/IterationDomain.h create mode 100644 libraries/value/include/loopnests/Kernel.h create mode 100644 libraries/value/include/loopnests/KernelPredicate.h create mode 100644 libraries/value/include/loopnests/LoopIndexInfo.h create mode 100644 libraries/value/include/loopnests/LoopNest.h create mode 100644 libraries/value/include/loopnests/LoopNestPrinter.h create mode 100644 libraries/value/include/loopnests/LoopNestVisitor.h create mode 100644 libraries/value/include/loopnests/Range.h create mode 100644 libraries/value/include/loopnests/SplitIndexRange.h create mode 100644 libraries/value/include/loopnests/SplitIterationDomain.h create mode 100644 libraries/value/src/Array.cpp create mode 100644 libraries/value/src/CachingProvider.cpp create mode 100644 libraries/value/src/CachingStrategies.cpp create mode 100644 libraries/value/src/CppEmitterContext.cpp create mode 100644 libraries/value/src/LoopNests.cpp create mode 100644 libraries/value/src/Print.cpp create mode 100644 libraries/value/src/ScalarOperations.cpp create mode 100644 libraries/value/src/loopnests/CodeGenerator.cpp create mode 100644 libraries/value/src/loopnests/CodePositionConstraints.cpp create mode 100644 libraries/value/src/loopnests/ForAll.cpp create mode 100644 libraries/value/src/loopnests/Index.cpp create mode 100644 libraries/value/src/loopnests/IndexRange.cpp create mode 100644 libraries/value/src/loopnests/IterationDomain.cpp create mode 100644 libraries/value/src/loopnests/Kernel.cpp create mode 100644 libraries/value/src/loopnests/KernelPredicate.cpp create mode 100644 libraries/value/src/loopnests/LoopNest.cpp create mode 100644 libraries/value/src/loopnests/LoopNestPrinter.cpp create mode 100644 libraries/value/src/loopnests/LoopNestVisitor.cpp create mode 100644 libraries/value/src/loopnests/Range.cpp create mode 100644 libraries/value/src/loopnests/SplitIndexRange.cpp create mode 100644 libraries/value/src/loopnests/SplitIterationDomain.cpp create mode 100644 libraries/value/test/include/CachingStrategy_test.h create mode 100644 libraries/value/test/include/Functions_test.h create mode 100644 libraries/value/test/include/LoopNestAPI_test.h create mode 100644 libraries/value/test/include/LoopNest_convolution_test.h create mode 100644 libraries/value/test/include/LoopNest_kernels.h create mode 100644 libraries/value/test/include/LoopNest_test.h create mode 100644 libraries/value/test/src/CachingStrategy_test.cpp create mode 100644 libraries/value/test/src/Functions_test.cpp create mode 100644 libraries/value/test/src/LoopNestAPI_test.cpp create mode 100644 libraries/value/test/src/LoopNest_convolution_test.cpp create mode 100644 libraries/value/test/src/LoopNest_kernels.cpp create mode 100644 libraries/value/test/src/LoopNest_test.cpp create mode 100644 tools/utilities/nodeTiming/gemmCodeNode/.gitignore create mode 100644 tools/utilities/nodeTiming/gemmCodeNode/README.md create mode 100644 tools/utilities/nodeTiming/gemmCodeNode/deploy/full_pass.cmd create mode 100755 tools/utilities/nodeTiming/gemmCodeNode/deploy/full_pass.sh create mode 100755 tools/utilities/nodeTiming/gemmCodeNode/deploy/run.py create mode 100755 tools/utilities/nodeTiming/gemmCodeNode/deploy/timing_aggregator.py create mode 100755 tools/utilities/nodeTiming/gemmCodeNode/scripts/build_gemm_models.py create mode 100755 tools/utilities/nodeTiming/gemmCodeNode/scripts/build_tests.py create mode 100755 
tools/utilities/nodeTiming/gemmCodeNode/scripts/make_default_models.py create mode 100755 tools/utilities/nodeTiming/gemmCodeNode/scripts/special_model_args.py create mode 100644 tools/utilities/nodeTiming/gemmCodeNode/src/CMakeLists.txt.in create mode 100644 tools/utilities/nodeTiming/gemmCodeNode/src/Runner.cpp.in diff --git a/.gitattributes b/.gitattributes index 2d2188e4f..3fb5eea16 100644 --- a/.gitattributes +++ b/.gitattributes @@ -3,6 +3,11 @@ ############################################################################### * text=auto +############################################################################### +# Explicitly force .sh scripts to have LF line endings +############################################################################### +*.sh text eol=lf + ############################################################################### # Set default behavior for command prompt diff. # diff --git a/.gitignore b/.gitignore index 08f9f6946..a52e99a56 100644 --- a/.gitignore +++ b/.gitignore @@ -66,6 +66,7 @@ artifacts/ *.pidb *.svclog *.scc +*.ll # Chutzpah Test files _Chutzpah* diff --git a/CMake/CommonInterfaces.cmake b/CMake/CommonInterfaces.cmake index 1539c5d3c..c528431ed 100644 --- a/CMake/CommonInterfaces.cmake +++ b/CMake/CommonInterfaces.cmake @@ -8,11 +8,9 @@ # On Linux and Mac, this can be done by call *make* on the specific language wrapper e.g. # make _ELL_python -cmake_minimum_required(VERSION 3.8 FATAL_ERROR) - set(GLOBAL_BIN_DIR "${CMAKE_BINARY_DIR}/bin") if(WIN32) -set(GLOBAL_BIN_DIR "${CMAKE_BINARY_DIR}/bin/release") + set(GLOBAL_BIN_DIR "${GLOBAL_BIN_DIR}/release") endif() # diff --git a/CMake/CopySharedLibraries.cmake b/CMake/CopySharedLibraries.cmake index a8b44eb32..a7605f6b3 100644 --- a/CMake/CopySharedLibraries.cmake +++ b/CMake/CopySharedLibraries.cmake @@ -3,21 +3,27 @@ # # Copies necessary DLLs to global binary directory -macro(copy_shared_libraries target_name) +macro(copy_shared_libraries_to target_name target_location) if(WIN32) - set(target_location "${CMAKE_BINARY_DIR}/bin/$<CONFIG>/") if(EXISTS ${BLAS_DLL_DIR}) set(command_target_name copy_dlls_to_${target_name}) foreach(blas_dll ${BLAS_DLLS}) add_custom_command(TARGET ${target_name} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E make_directory ${target_location} - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${BLAS_DLL_DIR}/${blas_dll} ${target_location} + COMMAND ${CMAKE_COMMAND} -E make_directory "${target_location}/$<CONFIG>" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${BLAS_DLL_DIR}/${blas_dll} "${target_location}/$<CONFIG>" ) endforeach() endif() endif() endmacro() +macro(copy_shared_libraries target_name) + if(WIN32) + set(target_location "${CMAKE_BINARY_DIR}/bin") + copy_shared_libraries_to(${target_name} ${target_location}) + endif() +endmacro() + macro(set_test_library_path test_name) if(WIN32) set (GLOBAL_BIN_DIR ${CMAKE_BINARY_DIR}/bin) diff --git a/CMake/LLVMSetup.cmake b/CMake/LLVMSetup.cmake index 7c1bab565..7ffce54bd 100644 --- a/CMake/LLVMSetup.cmake +++ b/CMake/LLVMSetup.cmake @@ -66,7 +66,4 @@ foreach(DEFINITION ${LLVM_DEFINITIONS}) add_definitions(${DEFINITION}) endforeach() -set(LLVM_LIBS ${LLVM_AVAILABLE_LIBS}) -list(FILTER LLVM_LIBS INCLUDE REGEX "LLVM.+") - set_property(TARGET intrinsics_gen PROPERTY FOLDER "cmake_macros") diff --git a/CMake/OpenBLASSetup.cmake b/CMake/OpenBLASSetup.cmake index f4f2be1c4..e63b3a1a1 100644 --- a/CMake/OpenBLASSetup.cmake +++ b/CMake/OpenBLASSetup.cmake @@ -123,14 +123,17 @@ else() set(BLAS_LIB_SEARCH_PATHS ${BLAS_PACKAGE_DIR}/lib/) set(BLAS_FOUND TRUE) else()
- # Known registry ID (family, model) settings for various CPU types + # Known registry ID (family, model) settings for various Intel CPU types # # Haswell: Family 6, model 60, 63, 69 # Broadwell: Family 6, Model 70, 79 (compatible with Haswell) # Kaby Lake: Family 6, Model 78, 142, 158 (compatible with Haswell) # Sandybridge: Family 6, model 42, 45 # Ivybridge: Family 6, model 58 (compatible with Sandybridge) - # Skylake: Family 6, model 42 + # Skylake: Family 6, model 85 + # + # Known registry ID (family, model) settings for various AMD CPU types + # Epyc: Family 23, model 1 (compatible with Haswell) # We can set up a mapping from a detected processor generation to the version of # the OpenBLAS libraries to use with the set_processor_mapping macro. For instance, @@ -159,11 +162,15 @@ else() set(processor_generation "sandybridge") elseif(processor_model EQUAL 58) set(processor_generation "sandybridge") # actually ivybridge, but it is compatible with sandybridge - elseif(processor_model EQUAL 42) + elseif(processor_model EQUAL 85) set(processor_generation "sandybridge") # actually skylake, but it is compatible with sandybridge else() set(processor_generation "unknown") endif() + elseif(processor_family EQUAL 23) + if(processor_model EQUAL 1) + set(processor_generation "haswell") + endif() endif() else() set(processor_generation "${PROCESSOR_HINT}") diff --git a/CMakeLists.txt b/CMakeLists.txt index 2cf446dff..d01e464fd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,11 +4,18 @@ cmake_minimum_required(VERSION 3.8 FATAL_ERROR) +# Error on non-existent dependency in add_dependencies. +cmake_policy(SET CMP0046 NEW) + # Include modules in the CMake directory. -list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/CMake") +set(ELL_ROOT "${CMAKE_CURRENT_SOURCE_DIR}") +list(APPEND CMAKE_MODULE_PATH "${ELL_ROOT}/CMake") include(CompilerCache) -project(ELL) +project(ELL CXX ASM) +if(MSVC) + enable_language(ASM_MASM) +endif() file(STRINGS "VERSION" ELL_VERSION) message(STATUS "ELL version ${ELL_VERSION}") @@ -41,11 +48,14 @@ option(DISABLE_PYTHON "Explicitly disable building python modules" OFF) option(CNTK "Enable CNTK importer and related unit tests (requires CNTK python module)" OFF) option(ONNX "Enable ONNX importer and related unit tests (requires PyTorch and ONNX python modules)" OFF) -set(ELL_ROOT "${CMAKE_SOURCE_DIR}") -set(FLAKE8_CONFIG "${CMAKE_SOURCE_DIR}/.flake8") -set(TEST_MODELS_REPO "https://github.com/Microsoft/ell-test-models" CACHE DOCUMENTATION "URL to the git repo containing test models" ) +set(FLAKE8_CONFIG "${ELL_ROOT}/.flake8") +set(TEST_MODELS_REPO "https://github.com/Microsoft/ell-test-models" CACHE STRING "URL to the git repo containing test models" ) message(STATUS "Configuring tests to use TEST_MODELS_REPO at: ${TEST_MODELS_REPO}") -set(EXTERNAL_DIR "${CMAKE_SOURCE_DIR}/external" CACHE DOCUMENTATION "Directory to install external dependencies" ) + +if(NOT ELL_EXTERNAL_DIR) + set(ELL_EXTERNAL_DIR "${ELL_ROOT}/external" CACHE STRING "Directory to install external dependencies" ) +endif(NOT ELL_EXTERNAL_DIR) + set(RPI_PASSWORD "$ENV{RPI_PASSWORD}") set(RPI_CLUSTER "$ENV{RPI_CLUSTER}") set(RPI_KEY "$ENV{RPI_APIKEY}") @@ -106,7 +116,8 @@ endif() enable_testing() # Set up global variables to help find NuGet projects -set(PACKAGE_ROOT ${EXTERNAL_DIR}) +set(PACKAGE_ROOT ${ELL_EXTERNAL_DIR}) + include(OpenBLASSetup) include(LLVMSetup) include(SWIGSetup) @@ -138,14 +149,16 @@ else() add_compile_options(-Wmissing-field-initializers) 
add_compile_options(-fvisibility-inlines-hidden) add_compile_options(-Wno-unknown-pragmas) - add_compile_options(-Wno-backslash-newline-escape) add_compile_options(-Wno-comment) set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -ggdb3 -O0") set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -ggdb3 -O0") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -ggdb3") set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -ggdb3") if(${CMAKE_CXX_COMPILER_ID} STREQUAL Clang) + add_compile_options(-Wno-backslash-newline-escape) add_compile_options(-Wno-self-assign) + else() # GCC + add_compile_options(-Wno-ignored-attributes) endif() endif() @@ -163,17 +176,17 @@ add_subdirectory(interfaces) add_subdirectory(examples) # Add user directories to ELL build if requested -if(EXISTS "${CMAKE_SOURCE_DIR}/user") +if(EXISTS "${ELL_ROOT}/user") # Add root user directory if it has a CMakeLists.txt file and INCLUDE_IN_ELL_BUILD.txt file - if(EXISTS "${CMAKE_SOURCE_DIR}/user/CMakeLists.txt" AND EXISTS "${CMAKE_SOURCE_DIR}/user/INCLUDE_IN_ELL_BUILD.txt") + if(EXISTS "${ELL_ROOT}/user/CMakeLists.txt" AND EXISTS "${ELL_ROOT}/user/INCLUDE_IN_ELL_BUILD.txt") message(STATUS "Adding user directory to ELL build") add_subdirectory(user) endif() # Now add all child directories that have CMakeLists.txt files and INCLUDE_IN_ELL_BUILD.txt file - file(GLOB children RELATIVE "${CMAKE_SOURCE_DIR}/user" "${CMAKE_SOURCE_DIR}/user/*") + file(GLOB children RELATIVE "${ELL_ROOT}/user" "${ELL_ROOT}/user/*") foreach(child ${children}) - if(IS_DIRECTORY "${CMAKE_SOURCE_DIR}/user/${child}" AND EXISTS "${CMAKE_SOURCE_DIR}/user/${child}/CMakeLists.txt" AND EXISTS "${CMAKE_SOURCE_DIR}/user/${child}/INCLUDE_IN_ELL_BUILD.txt") + if(IS_DIRECTORY "${ELL_ROOT}/user/${child}" AND EXISTS "${ELL_ROOT}/user/${child}/CMakeLists.txt" AND EXISTS "${ELL_ROOT}/user/${child}/INCLUDE_IN_ELL_BUILD.txt") message(STATUS "Adding user directory ${child} to ELL build") add_subdirectory("user/${child}") endif()
diff --git a/History.md b/History.md
index 20ac1ea0a..8ee727020 100644
--- a/History.md
+++ b/History.md
@@ -1,3 +1,16 @@
+## 3.1.0
+- Move to VS 2019
+- Fix a codegen error that resulted in incorrect functional behavior
+- Fix regressions in the audio training tutorial (#232)
+- Add importing of Sum nodes to the ONNX importer
+- Fix a crash in LLVMContext::SetName
+- Improve performance of CNN models on the Pi 3 with new implementations of spatial, pointwise and regular convolutions
+- Improve performance of the reorder node
+- New nodes: ReorderDataCodeNode, SpatialConvolutionNode, MatrixMatrixMultiplyCodeNode
+- Implement parallelization strategies for matrix multiplication nodes
+- Enable the new MatrixMatrixMultiplyCodeNode path only for select ARM targets such as the Pi, not Intel/AMD CPUs
+- Add the `--skip_ellcode` flag to the `compile` and `wrap.py` tools to use OpenBLAS for linear algebra computations
+
 ## 3.0.3
 - Fix VS 2019 build.
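[Editorial note: the release notes above introduce MatrixMatrixMultiplyCodeNode and its selectable GEMM implementations. The sketch below is a rough illustration, not part of the patch, of how that surfaces through the Python bindings this change wires up. `MatrixMatrixMultiplyImplementation` and `AddMatrixMatrixMultiplyCodeNode` come from the interface changes later in this patch; the input/output helpers, layout type, and port plumbing are assumptions about the existing `ell` API, not confirmed signatures.]

```python
# Hypothetical sketch of the new GEMM-node API surface. Only
# MatrixMatrixMultiplyImplementation and AddMatrixMatrixMultiplyCodeNode are
# defined by this patch; every other helper used here is an assumption about
# the existing ell Python bindings.
import ell

model = ell.model.Model()
builder = ell.model.ModelBuilder()

# Two 256x256 single-precision matrix inputs (layout helper assumed).
layout = ell.model.PortMemoryLayout([256, 256])
a = builder.AddInputNode(model, layout, ell.nodes.PortType.smallReal)
b = builder.AddInputNode(model, layout, ell.nodes.PortType.smallReal)

# Select the new loop-nest (MLAS-style) GEMM implementation from this release.
impl = ell.nodes.MatrixMatrixMultiplyImplementation.Mlas_Loopnest_Value
gemm = builder.AddMatrixMatrixMultiplyCodeNode(
    model,
    ell.nodes.PortElements(a.GetOutputPort("output")),  # port access assumed
    ell.nodes.PortElements(b.GetOutputPort("output")),
    impl)

builder.AddOutputNode(model, layout, ell.nodes.PortElements(gemm.GetOutputPort("output")))
```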
diff --git a/VERSION b/VERSION
index 75a22a26a..fd2a01863 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-3.0.3
+3.1.0
diff --git a/docs/gallery/ILSVRC2012/Asparagus.md b/docs/gallery/ILSVRC2012/Asparagus.md
index 9e72d64b0..c9294035a 100644
--- a/docs/gallery/ILSVRC2012/Asparagus.md
+++ b/docs/gallery/ILSVRC2012/Asparagus.md
@@ -6,7 +6,7 @@ permalink: /gallery/ILSVRC2012/Asparagus
 [Back to Gallery](/ELL/gallery)
-## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (52.07% top 1 accuracy, 76.40% top 5 accuracy, 108ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
+## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (52.07% top 1 accuracy, 76.40% top 5 accuracy, 181ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
@@ -19,7 +19,7 @@ permalink: /gallery/ILSVRC2012/Asparagus
 [HTML table hunk: Performance row updated from 108ms/frame to 181ms/frame; markup lost in extraction]
diff --git a/docs/gallery/ILSVRC2012/Bean.md b/docs/gallery/ILSVRC2012/Bean.md
index 924fcf52c..8cfd6c457 100644
--- a/docs/gallery/ILSVRC2012/Bean.md
+++ b/docs/gallery/ILSVRC2012/Bean.md
@@ -6,12 +6,12 @@ permalink: /gallery/ILSVRC2012/Bean
 [Back to Gallery](/ELL/gallery)
-## ILSVRC2012 Classification: 128x128x3 Convolutional Neural Network (55.12% top 1 accuracy, 78.21% top 5 accuracy, 144ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
+## ILSVRC2012 Classification: 128x128x3 Convolutional Neural Network (55.12% top 1 accuracy, 78.21% top 5 accuracy, 87ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
 [table Download/Accuracy rows unchanged; markup lost in extraction]
@@ -19,7 +19,7 @@ permalink: /gallery/ILSVRC2012/Bean
 [HTML table hunk: Performance row updated from 144ms/frame to 87ms/frame; markup lost in extraction]
diff --git a/docs/gallery/ILSVRC2012/Buckthorn.md b/docs/gallery/ILSVRC2012/Buckthorn.md
index 6d0aacf4d..cebefe939 100644
--- a/docs/gallery/ILSVRC2012/Buckthorn.md
+++ b/docs/gallery/ILSVRC2012/Buckthorn.md
@@ -6,12 +6,12 @@ permalink: /gallery/ILSVRC2012/Buckthorn
 [Back to Gallery](/ELL/gallery)
-## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (57.57% top 1 accuracy, 80.55% top 5 accuracy, 171ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
+## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (57.57% top 1 accuracy, 80.55% top 5 accuracy, 113ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
 [table Download/Accuracy rows unchanged; markup lost in extraction]
@@ -19,7 +19,7 @@ permalink: /gallery/ILSVRC2012/Buckthorn
 [HTML table hunk: Performance row updated from 171ms/frame to 113ms/frame; markup lost in extraction]
diff --git a/docs/gallery/ILSVRC2012/Carrot.md b/docs/gallery/ILSVRC2012/Carrot.md
index 9b83882f4..6b5f98dfa 100644
--- a/docs/gallery/ILSVRC2012/Carrot.md
+++ b/docs/gallery/ILSVRC2012/Carrot.md
@@ -6,12 +6,12 @@ permalink: /gallery/ILSVRC2012/Carrot
 [Back to Gallery](/ELL/gallery)
-## ILSVRC2012 Classification: 128x128x3 Convolutional Neural Network (64.61% top 1 accuracy, 85.63% top 5 accuracy, 397ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
+## ILSVRC2012 Classification: 128x128x3 Convolutional Neural Network (64.61% top 1 accuracy, 85.63% top 5 accuracy, 341ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
 [table Download/Accuracy rows unchanged; markup lost in extraction]
@@ -19,7 +19,7 @@ permalink: /gallery/ILSVRC2012/Carrot
 [HTML table hunk: Performance row updated from 397ms/frame to 341ms/frame; markup lost in extraction]
diff --git a/docs/gallery/ILSVRC2012/CashewNut.md b/docs/gallery/ILSVRC2012/CashewNut.md
index fb5cab1c7..459054082 100644
--- a/docs/gallery/ILSVRC2012/CashewNut.md
+++ b/docs/gallery/ILSVRC2012/CashewNut.md
@@ -6,12 +6,12 @@ permalink: /gallery/ILSVRC2012/CashewNut
 [Back to Gallery](/ELL/gallery)
-## ILSVRC2012 Classification: 128x128x3 Convolutional Neural Network (60.22% top 1 accuracy, 82.44% top 5 accuracy, 178ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
+## ILSVRC2012 Classification: 128x128x3 Convolutional Neural Network (60.22% top 1 accuracy, 82.44% top 5 accuracy, 106ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
 [table Download/Accuracy rows unchanged; markup lost in extraction]
@@ -19,7 +19,7 @@ permalink: /gallery/ILSVRC2012/CashewNut
 [HTML table hunk: Performance row updated from 178ms/frame to 106ms/frame; markup lost in extraction]
diff --git a/docs/gallery/ILSVRC2012/Chalta.md b/docs/gallery/ILSVRC2012/Chalta.md
index a952c5a50..0f762a38c 100644
--- a/docs/gallery/ILSVRC2012/Chalta.md
+++ b/docs/gallery/ILSVRC2012/Chalta.md
@@ -6,12 +6,12 @@ permalink: /gallery/ILSVRC2012/Chalta
 [Back to Gallery](/ELL/gallery)
-## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (58.74% top 1 accuracy, 81.59% top 5 accuracy, 221ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
+## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (58.74% top 1 accuracy, 81.59% top 5 accuracy, 147ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
 [table Download/Accuracy rows unchanged; markup lost in extraction]
@@ -19,7 +19,7 @@ permalink: /gallery/ILSVRC2012/Chalta
 [HTML table hunk: Performance row updated from 221ms/frame to 147ms/frame; markup lost in extraction]
diff --git a/docs/gallery/ILSVRC2012/Clary.md b/docs/gallery/ILSVRC2012/Clary.md
index 7fa372ed7..e1b4b4fd8 100644
--- a/docs/gallery/ILSVRC2012/Clary.md
+++ b/docs/gallery/ILSVRC2012/Clary.md
@@ -6,12 +6,12 @@ permalink: /gallery/ILSVRC2012/Clary
 [Back to Gallery](/ELL/gallery)
-## ILSVRC2012 Classification: 128x128x3 Convolutional Neural Network (66.65% top 1 accuracy, 87.17% top 5 accuracy, 506ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
+## ILSVRC2012 Classification: 128x128x3 Convolutional Neural Network (66.65% top 1 accuracy, 87.17% top 5 accuracy, 361ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
 [table Download/Accuracy rows unchanged; markup lost in extraction]
@@ -19,7 +19,7 @@ permalink: /gallery/ILSVRC2012/Clary
 [HTML table hunk: Performance row updated from 506ms/frame to 361ms/frame; markup lost in extraction]
diff --git a/docs/gallery/ILSVRC2012/Clover.md b/docs/gallery/ILSVRC2012/Clover.md
index dbc619710..f171badb4 100644
--- a/docs/gallery/ILSVRC2012/Clover.md
+++ b/docs/gallery/ILSVRC2012/Clover.md
@@ -6,12 +6,12 @@ permalink: /gallery/ILSVRC2012/Clover
 [Back to Gallery](/ELL/gallery)
-## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (53.04% top 1 accuracy, 77.12% top 5 accuracy, 126ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
+## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (53.04% top 1 accuracy, 77.12% top 5 accuracy, 90ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
 [table Download/Accuracy rows unchanged; markup lost in extraction]
@@ -19,7 +19,7 @@ permalink: /gallery/ILSVRC2012/Clover
 [HTML table hunk: Performance row updated from 126ms/frame to 90ms/frame; markup lost in extraction]
diff --git a/docs/gallery/ILSVRC2012/Coconut.md b/docs/gallery/ILSVRC2012/Coconut.md
index 7deca6415..dff22e5ff 100644
--- a/docs/gallery/ILSVRC2012/Coconut.md
+++ b/docs/gallery/ILSVRC2012/Coconut.md
@@ -6,12 +6,12 @@ permalink: /gallery/ILSVRC2012/Coconut
 [Back to Gallery](/ELL/gallery)
-## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (44.41% top 1 accuracy, 69.41% top 5 accuracy, 46ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
+## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (44.41% top 1 accuracy, 69.41% top 5 accuracy, 30ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
 [table Download/Accuracy rows unchanged; markup lost in extraction]
@@ -19,7 +19,7 @@ permalink: /gallery/ILSVRC2012/Coconut
 [HTML table hunk: Performance row updated from 46ms/frame to 30ms/frame; markup lost in extraction]
diff --git a/docs/gallery/ILSVRC2012/Ginger.md b/docs/gallery/ILSVRC2012/Ginger.md
index bea5f6b22..4b35d720a 100644
--- a/docs/gallery/ILSVRC2012/Ginger.md
+++ b/docs/gallery/ILSVRC2012/Ginger.md
@@ -6,12 +6,12 @@ permalink: /gallery/ILSVRC2012/Ginger
 [Back to Gallery](/ELL/gallery)
-## ILSVRC2012 Classification: 128x128x3 Convolutional Neural Network (62.33% top 1 accuracy, 84.14% top 5 accuracy, 205ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
+## ILSVRC2012 Classification: 128x128x3 Convolutional Neural Network (62.33% top 1 accuracy, 84.14% top 5 accuracy, 122ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
 [table Download/Accuracy rows unchanged; markup lost in extraction]
@@ -19,7 +19,7 @@ permalink: /gallery/ILSVRC2012/Ginger
 [HTML table hunk: Performance row updated from 205ms/frame to 122ms/frame; markup lost in extraction]
diff --git a/docs/gallery/ILSVRC2012/Mashua.md b/docs/gallery/ILSVRC2012/Mashua.md
index 96d95e492..995f2ff38 100644
--- a/docs/gallery/ILSVRC2012/Mashua.md
+++ b/docs/gallery/ILSVRC2012/Mashua.md
@@ -6,7 +6,7 @@ permalink: /gallery/ILSVRC2012/Mashua
 [Back to Gallery](/ELL/gallery)
-## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (64.59% top 1 accuracy, 85.50% top 5 accuracy, 804ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
+## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (64.59% top 1 accuracy, 85.50% top 5 accuracy, 525ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
@@ -19,7 +19,7 @@ permalink: /gallery/ILSVRC2012/Mashua
 [HTML table hunk: Performance row updated from 804ms/frame to 525ms/frame; markup lost in extraction]
diff --git a/docs/gallery/ILSVRC2012/PandanFlower.md b/docs/gallery/ILSVRC2012/PandanFlower.md
index 140b9634f..7c34d0536 100644
--- a/docs/gallery/ILSVRC2012/PandanFlower.md
+++ b/docs/gallery/ILSVRC2012/PandanFlower.md
@@ -6,12 +6,12 @@ permalink: /gallery/ILSVRC2012/PandanFlower
 [Back to Gallery](/ELL/gallery)
-## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (62.05% top 1 accuracy, 83.80% top 5 accuracy, 347ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
+## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (62.05% top 1 accuracy, 83.80% top 5 accuracy, 239ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
 [table Download/Accuracy rows unchanged; markup lost in extraction]
@@ -19,7 +19,7 @@ permalink: /gallery/ILSVRC2012/PandanFlower
 [HTML table hunk: Performance row updated from 347ms/frame to 239ms/frame; markup lost in extraction]
diff --git a/docs/gallery/ILSVRC2012/Pear.md b/docs/gallery/ILSVRC2012/Pear.md
index 0a50db4d6..213426135 100644
--- a/docs/gallery/ILSVRC2012/Pear.md
+++ b/docs/gallery/ILSVRC2012/Pear.md
@@ -6,12 +6,12 @@ permalink: /gallery/ILSVRC2012/Pear
 [Back to Gallery](/ELL/gallery)
-## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (50.08% top 1 accuracy, 74.74% top 5 accuracy, 90ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
+## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (50.08% top 1 accuracy, 74.74% top 5 accuracy, 85ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
 [table Download/Accuracy rows unchanged; markup lost in extraction]
@@ -19,7 +19,7 @@ permalink: /gallery/ILSVRC2012/Pear
 [HTML table hunk: Performance row updated from 90ms/frame to 85ms/frame; markup lost in extraction]
diff --git a/docs/gallery/ILSVRC2012/SevenSisters.md b/docs/gallery/ILSVRC2012/SevenSisters.md
index 94d7f155e..80866b933 100644
--- a/docs/gallery/ILSVRC2012/SevenSisters.md
+++ b/docs/gallery/ILSVRC2012/SevenSisters.md
@@ -6,12 +6,12 @@ permalink: /gallery/ILSVRC2012/SevenSisters
 [Back to Gallery](/ELL/gallery)
-## ILSVRC2012 Classification: 128x128x3 Convolutional Neural Network (66.04% top 1 accuracy, 86.59% top 5 accuracy, 489ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
+## ILSVRC2012 Classification: 128x128x3 Convolutional Neural Network (66.04% top 1 accuracy, 86.59% top 5 accuracy, 415ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
 [table Download/Accuracy rows unchanged; markup lost in extraction]
@@ -19,7 +19,7 @@ permalink: /gallery/ILSVRC2012/SevenSisters
 [HTML table hunk: Performance row updated from 489ms/frame to 415ms/frame; markup lost in extraction]
diff --git a/docs/gallery/ILSVRC2012/Sweetsop.md b/docs/gallery/ILSVRC2012/Sweetsop.md
index 4ea819445..3d5acd33f 100644
--- a/docs/gallery/ILSVRC2012/Sweetsop.md
+++ b/docs/gallery/ILSVRC2012/Sweetsop.md
@@ -6,12 +6,12 @@ permalink: /gallery/ILSVRC2012/Sweetsop
 [Back to Gallery](/ELL/gallery)
-## ILSVRC2012 Classification: 128x128x3 Convolutional Neural Network (72.24% top 1 accuracy, 90.78% top 5 accuracy, 1012ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
+## ILSVRC2012 Classification: 128x128x3 Convolutional Neural Network (72.24% top 1 accuracy, 90.78% top 5 accuracy, 719ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
 [table Download/Accuracy rows unchanged; markup lost in extraction]
@@ -19,7 +19,7 @@ permalink: /gallery/ILSVRC2012/Sweetsop
 [HTML table hunk: Performance row updated from 1012ms/frame to 719ms/frame; markup lost in extraction]
diff --git a/docs/gallery/ILSVRC2012/Tamarind.md b/docs/gallery/ILSVRC2012/Tamarind.md
index ff3f78d8d..d721e8e96 100644
--- a/docs/gallery/ILSVRC2012/Tamarind.md
+++ b/docs/gallery/ILSVRC2012/Tamarind.md
@@ -6,12 +6,12 @@ permalink: /gallery/ILSVRC2012/Tamarind
 [Back to Gallery](/ELL/gallery)
-## ILSVRC2012 Classification: 128x128x3 Convolutional Neural Network (70.46% top 1 accuracy, 89.57% top 5 accuracy, 665ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
+## ILSVRC2012 Classification: 128x128x3 Convolutional Neural Network (70.46% top 1 accuracy, 89.57% top 5 accuracy, 470ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
 [table Download/Accuracy rows unchanged; markup lost in extraction]
@@ -19,7 +19,7 @@ permalink: /gallery/ILSVRC2012/Tamarind
 [HTML table hunk: Performance row updated from 665ms/frame to 470ms/frame; markup lost in extraction]
diff --git a/docs/gallery/ILSVRC2012/WaterApple.md b/docs/gallery/ILSVRC2012/WaterApple.md
index b58ae1df0..f3b86f088 100644
--- a/docs/gallery/ILSVRC2012/WaterApple.md
+++ b/docs/gallery/ILSVRC2012/WaterApple.md
@@ -6,12 +6,12 @@ permalink: /gallery/ILSVRC2012/WaterApple
 [Back to Gallery](/ELL/gallery)
-## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (61.21% top 1 accuracy, 83.23% top 5 accuracy, 269ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
+## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (61.21% top 1 accuracy, 83.23% top 5 accuracy, 186ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
 [table Download/Accuracy rows unchanged; markup lost in extraction]
@@ -19,7 +19,7 @@ permalink: /gallery/ILSVRC2012/WaterApple
 [HTML table hunk: Performance row updated from 269ms/frame to 186ms/frame; markup lost in extraction]
diff --git a/docs/gallery/ILSVRC2012/Wattleseed.md b/docs/gallery/ILSVRC2012/Wattleseed.md
index c8d94d9ba..ff41582ab 100644
--- a/docs/gallery/ILSVRC2012/Wattleseed.md
+++ b/docs/gallery/ILSVRC2012/Wattleseed.md
@@ -6,7 +6,7 @@ permalink: /gallery/ILSVRC2012/Wattleseed
 [Back to Gallery](/ELL/gallery)
-## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (63.23% top 1 accuracy, 84.72% top 5 accuracy, 601ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
+## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (63.23% top 1 accuracy, 84.72% top 5 accuracy, 350ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
@@ -19,7 +19,7 @@ permalink: /gallery/ILSVRC2012/Wattleseed
 [HTML table hunk: Performance row updated from 601ms/frame to 350ms/frame; markup lost in extraction]
diff --git a/docs/tutorials/Boosting-classifier-accuracy-by-grouping-categories/index.md b/docs/tutorials/Boosting-classifier-accuracy-by-grouping-categories/index.md
index db551aa21..a9fbd8d31 100644
--- a/docs/tutorials/Boosting-classifier-accuracy-by-grouping-categories/index.md
+++ b/docs/tutorials/Boosting-classifier-accuracy-by-grouping-categories/index.md
@@ -294,4 +294,3 @@ def main():
 ## Troubleshooting
 Find tips in the Troubleshooting section of the [Raspberry Pi Setup Instructions](/ELL/tutorials/Raspberry-Pi-setup).
-
diff --git a/interfaces/CMakeLists.txt b/interfaces/CMakeLists.txt
index a2d64d917..90c852a1b 100644
--- a/interfaces/CMakeLists.txt
+++ b/interfaces/CMakeLists.txt
@@ -26,6 +26,7 @@ set(_sources
     common/include/CallbackInterface.h
     common/include/DatasetInterface.h
     common/include/DatasetInterfaceImpl.h
+    common/include/MatrixMatrixMultiplyImplementation.h
     common/include/MathInterface.h
     common/include/ModelBuilderInterface.h
     common/include/ModelInterface.h
diff --git a/interfaces/common/include/MatrixMatrixMultiplyImplementation.h b/interfaces/common/include/MatrixMatrixMultiplyImplementation.h
new file mode 100644
index 000000000..2b02c4431
--- /dev/null
+++ b/interfaces/common/include/MatrixMatrixMultiplyImplementation.h
@@ -0,0 +1,17 @@
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Project:  Embedded Learning Library (ELL)
+//  File:     MatrixMatrixMultiplyImplementation.h (interfaces)
+//  Authors:  Mason Remy
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+#pragma once
+
+#include <nodes/include/MatrixMatrixMultiplyImplementation.h>
+
+enum class MatrixMatrixMultiplyImplementation : int
+{
+    SimpleForLoops = (int)ell::nodes::MatrixMatrixMultiplyImplementation::SimpleForLoops,
+    Mlas_Loopnest_Value = (int)ell::nodes::MatrixMatrixMultiplyImplementation::Mlas_Loopnest_Value,
+    ImplementationCount = (int)ell::nodes::MatrixMatrixMultiplyImplementation::LAST
+};
diff --git a/interfaces/common/include/ModelBuilderInterface.h b/interfaces/common/include/ModelBuilderInterface.h
index 9ef08106e..46db1039a 100644
--- a/interfaces/common/include/ModelBuilderInterface.h
+++ b/interfaces/common/include/ModelBuilderInterface.h
@@ -48,6 +48,9 @@ class ModelBuilder
     Node AddConstantNode(Model model, std::vector<double> values, const PortMemoryLayout& outputMemoryLayout, PortType type);
     Node AddDCTNode(Model model, PortElements input, int numFilters);
     Node AddMatrixMultiplyNode(Model model, PortElements input1, PortElements input2);
+    Node AddMatrixMatrixMultiplyNode(Model model, PortElements input1, PortElements input2);
+    Node AddMatrixMatrixMultiplyCodeNode(Model model, PortElements input1, PortElements input2, int gemmImpl);
+    Node AddMatrixMatrixMultiplyCodeNode(Model model, PortElements input1, PortElements input2, int panelM, int panelN, int panelK, int kernelM, int kernelN, int kernelK, int gemmImpl);
     Node AddDotProductNode(Model model, PortElements input1, PortElements input2);
     Node AddNeuralNetworkPredictorNode(Model model, PortElements input, ell::api::predictors::NeuralNetworkPredictor predictor);
     Node AddFFTNode(Model model, PortElements input, int nfft = 0);
diff --git a/interfaces/common/include/ModelInterface.h b/interfaces/common/include/ModelInterface.h
index 5ea9f9223..2dc977e37 100644
--- a/interfaces/common/include/ModelInterface.h
+++ b/interfaces/common/include/ModelInterface.h
@@ -495,6 +495,9 @@ struct MapCompilerOptions
     /// Emit debug code.
     bool debug = false;
+
+    /// Skip ELLCode optimization.
+    bool skip_ellcode = false;
 };

 //
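[Editorial note: with `skip_ellcode` exposed on `MapCompilerOptions`, the OpenBLAS fallback can also be requested programmatically rather than only via the command line. A minimal sketch follows; the map loader and the exact `Compile` overload are assumptions about the existing `ell` Python API, and only `useBlas` and `skip_ellcode` come from this patch.]

```python
# Hypothetical sketch: compile a map with the new skip_ellcode switch so
# linear algebra falls back to OpenBLAS instead of the ELLCode GEMM path.
# The Map constructor and Compile signature shown here are assumptions.
import ell

ell_map = ell.model.Map("mymodel.ell")           # assumed loader

settings = ell.model.MapCompilerOptions()        # struct extended by this patch
settings.useBlas = True                          # link against OpenBLAS
settings.skip_ellcode = True                     # new in v3.1.0

optimizer_settings = ell.model.ModelOptimizerOptions()
compiled = ell_map.Compile("pi3", "mymodel", "predict", settings, optimizer_settings)
```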
diff --git a/interfaces/common/model.i b/interfaces/common/model.i
index c102386f2..4facda208 100644
--- a/interfaces/common/model.i
+++ b/interfaces/common/model.i
@@ -17,6 +17,7 @@ std::vector GetOutputBuffersFromList(std::shared_ptr map
 #endif // SWIGPYTHON

 #include "Ports.h"
+#include "MatrixMatrixMultiplyImplementation.h"
 #include "ModelInterface.h"
 #include "ModelBuilderInterface.h"
@@ -33,6 +34,7 @@ std::vector GetOutputBuffersFromList(std::shared_ptr map
 // Include the C++ code to be wrapped
 %include "Ports.h"
+%include "MatrixMatrixMultiplyImplementation.h"
 %include "ModelInterface.h"
 %include "ModelBuilderInterface.h"
 %include "macros.i"
diff --git a/interfaces/common/model_python_post.i b/interfaces/common/model_python_post.i
index 35014482a..a02fa69b5 100644
--- a/interfaces/common/model_python_post.i
+++ b/interfaces/common/model_python_post.i
@@ -8,6 +8,16 @@
 %pythoncode %{

+# Python friendly class for MatrixMatrixMultiplyImplementation
+class MatrixMatrixMultiplyImplementation:
+    SimpleForLoops = MatrixMatrixMultiplyImplementation_SimpleForLoops
+    Mlas_Loopnest_Value = MatrixMatrixMultiplyImplementation_Mlas_Loopnest_Value
+    ImplementationCount = MatrixMatrixMultiplyImplementation_ImplementationCount
+
+del MatrixMatrixMultiplyImplementation_SimpleForLoops
+del MatrixMatrixMultiplyImplementation_Mlas_Loopnest_Value
+del MatrixMatrixMultiplyImplementation_ImplementationCount
+
 # Python friendly class for PortType
 class PortType:
     bigInt = PortType_bigInt
diff --git a/interfaces/common/src/ModelBuilderInterface.cpp b/interfaces/common/src/ModelBuilderInterface.cpp
index 4340a3933..db9e644ed 100644
--- a/interfaces/common/src/ModelBuilderInterface.cpp
+++ b/interfaces/common/src/ModelBuilderInterface.cpp
@@ -37,11 +37,13 @@
 #include
 #include
 #include
+#include
+#include
 #include
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -455,7 +457,7 @@ Node ModelBuilder::AddReorderDataNode(Model model, PortElements input, PortMemor
     switch (type)
     {
     case PortType::real:
-        newNode = model.GetModel()->AddNode<ell::nodes::ReorderDataNode<double>>(
+        newNode = model.GetModel()->AddNode<ell::nodes::ReorderDataCodeNode<double>>(
             ell::model::PortElements<double>(elements),
             inputMemoryLayout.Get(),
             outputMemoryLayout.Get(),
@@ -463,7 +465,7 @@ Node ModelBuilder::AddReorderDataNode(Model model, PortElements input, PortMemor
             outputPaddingValue);
         break;
     case PortType::smallReal:
-        newNode = model.GetModel()->AddNode<ell::nodes::ReorderDataNode<float>>(
+        newNode = model.GetModel()->AddNode<ell::nodes::ReorderDataCodeNode<float>>(
             ell::model::PortElements<float>(elements),
             inputMemoryLayout.Get(),
             outputMemoryLayout.Get(),
@@ -484,12 +486,12 @@ Node ModelBuilder::AddReorderDataNode(Model model, PortElements input, std::vect
     switch (type)
     {
     case PortType::real:
-        newNode = model.GetModel()->AddNode<ell::nodes::ReorderDataNode<double>>(
+        newNode = model.GetModel()->AddNode<ell::nodes::ReorderDataCodeNode<double>>(
             ell::model::PortElements<double>(elements),
             order);
         break;
     case PortType::smallReal:
-        newNode = model.GetModel()->AddNode<ell::nodes::ReorderDataNode<float>>(
+        newNode = model.GetModel()->AddNode<ell::nodes::ReorderDataCodeNode<float>>(
             ell::model::PortElements<float>(elements),
             order);
         break;
@@ -971,6 +973,120 @@ Node ModelBuilder::AddMatrixMultiplyNode(Model model, PortElements input1, PortE
     return Node(newNode, model.GetModel());
 }

+Node ModelBuilder::AddMatrixMatrixMultiplyNode(Model model, PortElements input1, PortElements input2)
+{
+    auto type = input1.GetType();
+    auto t2 = input2.GetType();
+    if (type != t2)
+    {
+        throw std::invalid_argument("Error: input1 has different element types from input2");
+    }
+    auto elements1 = input1.GetPortElements();
+    auto elements2 = input2.GetPortElements();
+
+    auto layout1 = elements1.GetMemoryLayout();
+    auto layout2 = elements2.GetMemoryLayout();
+
+    ell::model::Node* newNode = nullptr;
+
+    if (layout1.NumDimensions() != 2 || layout2.NumDimensions() != 2)
+    {
+        throw std::invalid_argument("Error: input sizes invalid");
+    }
+    else
+    {
+        switch (type)
+        {
+        case PortType::real:
+            newNode = model.GetModel()->AddNode<ell::nodes::MatrixMatrixMultiplyNode<double>>(ell::model::PortElements<double>(elements1), ell::model::PortElements<double>(elements2));
+            break;
+        case PortType::smallReal:
+            newNode = model.GetModel()->AddNode<ell::nodes::MatrixMatrixMultiplyNode<float>>(ell::model::PortElements<float>(elements1), ell::model::PortElements<float>(elements2));
+            break;
+        default:
+            throw std::invalid_argument("Error: could not create MatrixMatrixMultiplyNode of the requested type");
+        }
+    }
+
+    return Node(newNode, model.GetModel());
+}
+
+Node ModelBuilder::AddMatrixMatrixMultiplyCodeNode(Model model, PortElements input1, PortElements input2, int gemmImpl)
+{
+    auto type = input1.GetType();
+    auto t2 = input2.GetType();
+    if (type != t2)
+    {
+        throw std::invalid_argument("Error: input1 has different element types from input2");
+    }
+    auto elements1 = input1.GetPortElements();
+    auto elements2 = input2.GetPortElements();
+
+    auto layout1 = elements1.GetMemoryLayout();
+    auto layout2 = elements2.GetMemoryLayout();
+
+    ell::model::Node* newNode = nullptr;
+
+    if (layout1.NumDimensions() != 2 || layout2.NumDimensions() != 2)
+    {
+        throw std::invalid_argument("Error: input sizes invalid");
+    }
+    else
+    {
+        switch (type)
+        {
+        case PortType::real:
+            newNode = model.GetModel()->AddNode<ell::nodes::MatrixMatrixMultiplyCodeNode<double>>(ell::model::PortElements<double>(elements1), ell::model::PortElements<double>(elements2), static_cast<ell::nodes::MatrixMatrixMultiplyImplementation>(gemmImpl));
+            break;
+        case PortType::smallReal:
+            newNode = model.GetModel()->AddNode<ell::nodes::MatrixMatrixMultiplyCodeNode<float>>(ell::model::PortElements<float>(elements1), ell::model::PortElements<float>(elements2), static_cast<ell::nodes::MatrixMatrixMultiplyImplementation>(gemmImpl));
+            break;
+        default:
+            throw std::invalid_argument("Error: could not create MatrixMatrixMultiplyCodeNode of the requested type");
+        }
+    }
+
+    return Node(newNode, model.GetModel());
+}
+
+Node ModelBuilder::AddMatrixMatrixMultiplyCodeNode(Model model, PortElements input1, PortElements input2, int panelM, int panelN, int panelK, int kernelM, int kernelN, int kernelK, int gemmImpl)
+{
+    auto type = input1.GetType();
+    auto t2 = input2.GetType();
+    if (type != t2)
+    {
+        throw std::invalid_argument("Error: input1 has different element types from input2");
+    }
+    auto elements1 = input1.GetPortElements();
+    auto elements2 = input2.GetPortElements();
+
+    auto layout1 = elements1.GetMemoryLayout();
+    auto layout2 = elements2.GetMemoryLayout();
+
+    ell::model::Node* newNode = nullptr;
+
+    if (layout1.NumDimensions() != 2 || layout2.NumDimensions() != 2)
+    {
+        throw std::invalid_argument("Error: input sizes invalid");
+    }
+    else
+    {
+        switch (type)
+        {
+        case PortType::real:
+            newNode = model.GetModel()->AddNode<ell::nodes::MatrixMatrixMultiplyCodeNode<double>>(ell::model::PortElements<double>(elements1), ell::model::PortElements<double>(elements2), panelM, panelN, panelK, kernelM, kernelN, kernelK, static_cast<ell::nodes::MatrixMatrixMultiplyImplementation>(gemmImpl));
+            break;
+        case PortType::smallReal:
+            newNode = model.GetModel()->AddNode<ell::nodes::MatrixMatrixMultiplyCodeNode<float>>(ell::model::PortElements<float>(elements1), ell::model::PortElements<float>(elements2), panelM, panelN, panelK, kernelM, kernelN, kernelK, static_cast<ell::nodes::MatrixMatrixMultiplyImplementation>(gemmImpl));
+            break;
+        default:
+            throw std::invalid_argument("Error: could not create MatrixMatrixMultiplyCodeNode of the requested type");
+        }
+    }
+
+    return Node(newNode, model.GetModel());
+}
+
 Node ModelBuilder::AddDotProductNode(Model model, PortElements input1, PortElements input2)
 {
     ell::model::Node* newNode = nullptr;
diff --git
a/interfaces/common/src/ModelInterface.cpp b/interfaces/common/src/ModelInterface.cpp index a329997bf..35d540ed1 100644 --- a/interfaces/common/src/ModelInterface.cpp +++ b/interfaces/common/src/ModelInterface.cpp @@ -960,6 +960,7 @@ CompiledMap Map::Compile(const std::string& targetDevice, const std::string& mod settings.compilerSettings.allowVectorInstructions = compilerSettings.allowVectorInstructions; settings.compilerSettings.vectorWidth = compilerSettings.vectorWidth; settings.compilerSettings.debug = compilerSettings.debug; + settings.compilerSettings.skip_ellcode = compilerSettings.skip_ellcode; ell::model::ModelOptimizerOptions optimizerOptions; optimizerOptions["fuseLinearFunctionNodes"] = optimizerSettings.fuseLinearFunctionNodes; diff --git a/interfaces/python/CMakeLists.txt b/interfaces/python/CMakeLists.txt index c08117b71..2a001f1e9 100644 --- a/interfaces/python/CMakeLists.txt +++ b/interfaces/python/CMakeLists.txt @@ -37,16 +37,16 @@ if (TARGET _ELL_python) add_custom_command(TARGET _ELL_python POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${PYTHON_DIR}/ell_py.py ${PYTHON_DIR}/package/ell/ell_py.py - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/tools/utilities/pythonlibs/buildtools.py ${PYTHON_DIR}/package/ell/util/buildtools.py - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/docs/tutorials/shared/tutorial_helpers.py ${PYTHON_DIR}/package/ell/util/tutorialHelpers.py + COMMAND ${CMAKE_COMMAND} -E copy ${ELL_ROOT}/tools/utilities/pythonlibs/buildtools.py ${PYTHON_DIR}/package/ell/util/buildtools.py + COMMAND ${CMAKE_COMMAND} -E copy ${ELL_ROOT}/docs/tutorials/shared/tutorial_helpers.py ${PYTHON_DIR}/package/ell/util/tutorialHelpers.py ) - file(GLOB PKGHDR RELATIVE ${CMAKE_SOURCE_DIR}/interfaces ${CMAKE_SOURCE_DIR}/interfaces/common/*.i ${CMAKE_SOURCE_DIR}/interfaces/common/include/*.h) + file(GLOB PKGHDR RELATIVE ${ELL_ROOT}/interfaces ${ELL_ROOT}/interfaces/common/*.i ${ELL_ROOT}/interfaces/common/include/*.h) foreach(hdr ${PKGHDR}) add_custom_command(TARGET _ELL_python POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/interfaces/${hdr} ${CMAKE_BINARY_DIR}/interfaces/python/package/ell/headers/${hdr} + COMMAND ${CMAKE_COMMAND} -E copy ${ELL_ROOT}/interfaces/${hdr} ${CMAKE_BINARY_DIR}/interfaces/python/package/ell/headers/${hdr} ) endforeach() @@ -54,9 +54,9 @@ if (TARGET _ELL_python) add_custom_command(TARGET _ELL_python POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/CMake/OpenBLASSetup.cmake ${DEPLOYDIR}/OpenBLASSetup.cmake - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/interfaces/common/include/CallbackInterface.h ${DEPLOYDIR}/include/CallbackInterface.h - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/tools/wrap/templates/CMakeLists.python.txt.in ${DEPLOYDIR}/CMakeLists.python.txt.in + COMMAND ${CMAKE_COMMAND} -E copy ${ELL_ROOT}/CMake/OpenBLASSetup.cmake ${DEPLOYDIR}/OpenBLASSetup.cmake + COMMAND ${CMAKE_COMMAND} -E copy ${ELL_ROOT}/interfaces/common/include/CallbackInterface.h ${DEPLOYDIR}/include/CallbackInterface.h + COMMAND ${CMAKE_COMMAND} -E copy ${ELL_ROOT}/tools/wrap/templates/CMakeLists.python.txt.in ${DEPLOYDIR}/CMakeLists.python.txt.in ) if(WIN32) @@ -73,4 +73,7 @@ if (TARGET _ELL_python) COMMAND ${CMAKE_COMMAND} -E copy $ ${PYTHON_DIR}/package/ell/$ ) endif() + + add_dependencies(_ELL_python pythonpackage) + endif(TARGET _ELL_python) diff --git a/interfaces/python/package/CMakeLists.txt b/interfaces/python/package/CMakeLists.txt index b241b8dc5..0977e904e 100644 --- 
a/interfaces/python/package/CMakeLists.txt +++ b/interfaces/python/package/CMakeLists.txt @@ -1,15 +1,16 @@ if(${PYTHON_ENABLED}) set(module_name "pythonpackage") - + set(src bld.bat build.sh MANIFEST.in meta.yaml setup.py) add_custom_target(${module_name} ALL DEPENDS SOURCES ${src}) - + copy_newer_files(${module_name} src) set_property(TARGET ${module_name} PROPERTY FOLDER "interfaces/python/package") -endif() # PYTHON_ENABLED + add_subdirectory(ell) + add_dependencies(${module_name} ${module_name}_ell) -add_subdirectory(ell) +endif() # PYTHON_ENABLED diff --git a/interfaces/python/package/ell/CMakeLists.txt b/interfaces/python/package/ell/CMakeLists.txt index 4b2cee0a9..49b8f5a9a 100644 --- a/interfaces/python/package/ell/CMakeLists.txt +++ b/interfaces/python/package/ell/CMakeLists.txt @@ -3,18 +3,24 @@ if(${PYTHON_ENABLED}) set(module_name "pythonpackage_ell") set(src __init__.py rpi_magic.py platform.py) + add_custom_target(${module_name} ALL DEPENDS SOURCES ${src}) - add_subdirectory(data) - add_subdirectory(math) - add_subdirectory(model) - add_subdirectory(neural) - add_subdirectory(nodes) - add_subdirectory(trainers) - add_subdirectory(util) - add_subdirectory(vision) + set(module_components + data + math + model + neural + nodes + trainers + util + vision + ) + + foreach(component ${module_components}) + add_subdirectory(${component}) + add_dependencies(${module_name} ${module_name}_${component}) + endforeach(component ${module_components}) - add_custom_target(${module_name} ALL DEPENDS SOURCES ${src}) - copy_newer_files(${module_name} src) set_property(TARGET ${module_name} PROPERTY FOLDER "interfaces/python/package/ell") diff --git a/interfaces/python/package/ell/nodes/__init__.py b/interfaces/python/package/ell/nodes/__init__.py index c51bf4912..803a0a769 100644 --- a/interfaces/python/package/ell/nodes/__init__.py +++ b/interfaces/python/package/ell/nodes/__init__.py @@ -11,6 +11,7 @@ InputNodeList,\ InputPort,\ InputPortIterator, \ +MatrixMatrixMultiplyImplementation, \ Node,\ NodeIterator,\ OutputNode,\ diff --git a/interfaces/python/test/CMakeLists.txt b/interfaces/python/test/CMakeLists.txt index fb467c11b..2e9d3c618 100644 --- a/interfaces/python/test/CMakeLists.txt +++ b/interfaces/python/test/CMakeLists.txt @@ -8,7 +8,7 @@ if(${PYTHON_ENABLED}) file(GLOB test_src RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.py) add_custom_target(${test_name} ALL DEPENDS ${test_src} SOURCES ${test_src}) - add_dependencies(${test_name} pythonlibs) + add_dependencies(${test_name} pythonlibs dspDataFiles _ELL_python) set_property(TARGET ${test_name} PROPERTY FOLDER "tests") # copy the contents of the test directory to build/interfaces/python @@ -20,4 +20,4 @@ if(${PYTHON_ENABLED}) COMMAND ${PYTHON_EXECUTABLE} test.py) set_property(TARGET ${test_name} PROPERTY FOLDER "tests") -endif() # PYTHON_ENABLED \ No newline at end of file +endif() # PYTHON_ENABLED diff --git a/interfaces/python/test/compiled_model_test.py b/interfaces/python/test/compiled_model_test.py index 33465fa2a..bf49d8d81 100644 --- a/interfaces/python/test/compiled_model_test.py +++ b/interfaces/python/test/compiled_model_test.py @@ -157,6 +157,9 @@ def test(): return 1 else: return 0 + if x > (1 - bias) / scale: + return 1 + return (scale * x) + bias if __name__ == '__main__': diff --git a/libraries/common/include/MapCompilerArguments.h b/libraries/common/include/MapCompilerArguments.h index 89542bbff..b579ed51e 100644 --- a/libraries/common/include/MapCompilerArguments.h +++ b/libraries/common/include/MapCompilerArguments.h @@ 
-40,6 +40,7 @@ namespace common bool useBlas = false; bool debug = false; utilities::Optional<bool> positionIndependentCode = false; // for generating -fPIC object code + int globalValueAlignment = 32; // potentially per-node options: bool enableVectorization = true; @@ -67,6 +68,7 @@ namespace common std::string targetArchitecture = ""; std::string targetFeatures = ""; std::string targetDataLayout = ""; + bool skip_ellcode = false; /// Gets a `MapCompilerOptions` with the settings specified in the commandline arguments. /// diff --git a/libraries/common/src/LoadModel.cpp b/libraries/common/src/LoadModel.cpp index 84eae5fa3..0845c9a32 100644 --- a/libraries/common/src/LoadModel.cpp +++ b/libraries/common/src/LoadModel.cpp @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -52,9 +53,11 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -129,11 +132,13 @@ namespace common context.GetTypeFactory().AddType>(); context.GetTypeFactory().AddType>(); context.GetTypeFactory().AddType>(); + context.GetTypeFactory().AddType>(); context.GetTypeFactory().AddType>(); context.GetTypeFactory().AddType>(); context.GetTypeFactory().AddType>(); context.GetTypeFactory().AddType>(); context.GetTypeFactory().AddType>(); + context.GetTypeFactory().AddType>(); context.GetTypeFactory().AddType>(); context.GetTypeFactory().AddType>(); context.GetTypeFactory().AddType>(); @@ -141,6 +146,7 @@ namespace common context.GetTypeFactory().AddType>(); context.GetTypeFactory().AddType>(); context.GetTypeFactory().AddType>(); + context.GetTypeFactory().AddType>(); context.GetTypeFactory().AddType>(); context.GetTypeFactory().AddType>(); context.GetTypeFactory().AddType>(); diff --git a/libraries/common/src/MapCompilerArguments.cpp b/libraries/common/src/MapCompilerArguments.cpp index d66b47bf8..33fd28277 100644 --- a/libraries/common/src/MapCompilerArguments.cpp +++ b/libraries/common/src/MapCompilerArguments.cpp @@ -191,6 +191,20 @@ namespace common { "true", utilities::Optional<bool>(true) }, { "false", utilities::Optional<bool>(false) } }, "auto"); + + parser.AddOption( + globalValueAlignment, + "globalValueAlignment", + "gva", + "The number of bytes to align global buffers to", + 32); + + parser.AddOption( + skip_ellcode, + "skip_ellcode", + "skip_ellcode", + "Skip ELL's generated linear algebra code and use OpenBLAS instead", + false); } model::MapCompilerOptions MapCompilerArguments::GetMapCompilerOptions(const std::string& modelName) const @@ -231,6 +245,8 @@ namespace common settings.profile = profile; settings.compilerSettings.profile = profile; settings.compilerSettings.positionIndependentCode = positionIndependentCode; + settings.compilerSettings.globalValueAlignment = globalValueAlignment; + settings.compilerSettings.skip_ellcode = skip_ellcode; if (target != "") { diff --git a/libraries/emittable_functions/CMakeLists.txt b/libraries/emittable_functions/CMakeLists.txt index fbe9883b1..c3e09e3bd 100644 --- a/libraries/emittable_functions/CMakeLists.txt +++ b/libraries/emittable_functions/CMakeLists.txt @@ -18,14 +18,10 @@ set(include include/VoiceActivityDetector.h ) -set(tcc -) - source_group("src" FILES ${src}) source_group("include" FILES ${include}) -source_group("tcc" FILES ${tcc}) -add_library(${library_name} ${src} ${include} ${tcc}) +add_library(${library_name} ${src} ${include}) target_include_directories(${library_name} PRIVATE include ${ELL_LIBRARIES_DIR}) target_link_libraries(${library_name} PUBLIC value) target_compile_options(${library_name} PUBLIC ${LLVM_COMPILE_OPTIONS})
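Besides the command-line path above, the same two knobs can be fed to the emitter through a PropertyBag, mirroring the GetOrParseEntry calls added to CompilerOptions.cpp later in this patch. A hedged sketch; PropertyBag's map-style operator[] and a CompilerOptions constructor taking a PropertyBag are assumed here:

    // Illustrative only: key names match libraries/emitters/src/CompilerOptions.cpp.
    ell::utilities::PropertyBag properties;
    properties["globalValueAlignment"] = 64;  // bytes; the default is 32
    properties["skip_ellcode"] = true;        // use OpenBLAS instead of ELL's GEMM code
    ell::emitters::CompilerOptions options(properties);  // assumed constructor

diff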
--git a/libraries/emitters/CMakeLists.txt b/libraries/emitters/CMakeLists.txt index e82aa20be..8373c4c54 100644 --- a/libraries/emitters/CMakeLists.txt +++ b/libraries/emitters/CMakeLists.txt @@ -91,19 +91,58 @@ set (include set (templates templates/CppPredictWrapper.in + templates/LLVMEmitterTargets.h.in templates/SwigModule.in templates/SwigPredictPython.in templates/SwigShapeWrappers.in ) +# This is supposed to be overridden on the command line +# As of LLVM 8.0.1, the possible values within the list are: +# AArch64 AMDGPU ARM BPF Hexagon Lanai Mips MSP430 NVPTX PowerPC Sparc SystemZ +# WebAssembly X86 XCore +set(LLVM_EMITTER_TARGETS "X86;ARM" CACHE STRING "List of LLVM emitter targets to support. Default is \"X86;ARM\". Specify 'ALL' to support all targets") +if(LLVM_EMITTER_TARGETS STREQUAL "ALL") + set(LLVM_EMITTER_TARGETS_FINAL ${LLVM_ALL_TARGETS}) +else() + set(LLVM_EMITTER_TARGETS_FINAL ${LLVM_EMITTER_TARGETS}) +endif() + +set(emitter_targets_content "") +set(llvm_emitter_target_libs ) +foreach(LLVM_EMITTER_TARGET ${LLVM_EMITTER_TARGETS_FINAL}) + if(NOT ${LLVM_EMITTER_TARGET} IN_LIST LLVM_ALL_TARGETS) + message(FATAL_ERROR "Unrecognized LLVM emitter target: ${LLVM_EMITTER_TARGET}.\n\nTargets must be one of: ${LLVM_ALL_TARGETS}") + endif() + set(emitter_targets_content "${emitter_targets_content} EMITTER_TARGET_ACTION(${LLVM_EMITTER_TARGET}) \\\n") + set(llvm_emitter_target_libs + ${llvm_emitter_target_libs} + LLVM${LLVM_EMITTER_TARGET}CodeGen + LLVM${LLVM_EMITTER_TARGET}AsmParser + LLVM${LLVM_EMITTER_TARGET}Disassembler + LLVM${LLVM_EMITTER_TARGET}AsmPrinter + LLVM${LLVM_EMITTER_TARGET}Desc + LLVM${LLVM_EMITTER_TARGET}Info + ) +endforeach(LLVM_EMITTER_TARGET LLVM_EMITTER_TARGETS) +configure_file(templates/LLVMEmitterTargets.h.in build/LLVMEmitterTargets.h @ONLY) + source_group("src" FILES ${src}) source_group("include" FILES ${include}) source_group("templates" FILES ${templates}) add_library(${library_name} ${src} ${include} ${templates}) -target_include_directories(${library_name} PRIVATE include templates ${ELL_LIBRARIES_DIR}) +target_include_directories(${library_name} PRIVATE ${CMAKE_CURRENT_BINARY_DIR} include templates ${ELL_LIBRARIES_DIR}) target_include_directories(${library_name} SYSTEM PUBLIC ${LLVM_INCLUDE_DIRS}) -target_link_libraries(${library_name} math utilities ${LLVM_LIBS}) +target_link_libraries( + ${library_name} + math + utilities + + LLVMMCJIT + ${llvm_emitter_target_libs} + LLVMipo ) target_compile_options(${library_name} PUBLIC ${LLVM_COMPILE_OPTIONS}) set_property(TARGET ${library_name} PROPERTY FOLDER "libraries")
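The configured header drives an X-macro over the chosen backends. A hedged sketch of what the default "X86;ARM" configuration expands to and how a consumer might use it; the LLVM_EMITTER_TARGETS macro name inside the generated header is an assumption based on the template expansion above, while LLVMInitialize##target##Target is the stock LLVM initializer naming scheme:

    // Illustrative only: generated build/LLVMEmitterTargets.h for "X86;ARM".
    #define LLVM_EMITTER_TARGETS \
        EMITTER_TARGET_ACTION(X86) \
        EMITTER_TARGET_ACTION(ARM)

    // A consumer supplies EMITTER_TARGET_ACTION and expands the list, e.g. to
    // register only the compiled-in code generators:
    #define EMITTER_TARGET_ACTION(target) LLVMInitialize##target##Target();
    LLVM_EMITTER_TARGETS
    #undef EMITTER_TARGET_ACTION

diff --git a/libraries/emitters/include/CompilerOptions.h b/libraries/emitters/include/CompilerOptions.h index 7b60cef2e..a0a7b68cb 100644 --- a/libraries/emitters/include/CompilerOptions.h +++ b/libraries/emitters/include/CompilerOptions.h @@ -92,6 +92,12 @@ namespace emitters /// The name of the file being compiled. std::string modelFile; + /// The byte alignment to use for global values. + int globalValueAlignment = 32; + + /// Skip ELLCode optimization.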
+ bool skip_ellcode = false; + private: void AddOptions(const utilities::PropertyBag& properties); }; diff --git a/libraries/emitters/include/EmitterTypes.h b/libraries/emitters/include/EmitterTypes.h index dc7d4866c..c92bf80b6 100644 --- a/libraries/emitters/include/EmitterTypes.h +++ b/libraries/emitters/include/EmitterTypes.h @@ -56,6 +56,25 @@ namespace emitters /// Pointer to a Double DoublePointer, + // + // Pointers to pointers + // + VoidPointerPointer, + /// Pointer to a pointer to a character array + Char8PointerPointer, + /// Pointer to a pointer to a byte + BytePointerPointer, + /// Pointer to a pointer to an Int16 + Int16PointerPointer, + /// Pointer to a pointer to an Int32 + Int32PointerPointer, + /// Pointer to a pointer to an Int64 + Int64PointerPointer, + /// Pointer to a pointer to a Float + FloatPointerPointer, + /// Pointer to a pointer to a Double + DoublePointerPointer, + // // Custom Structs // @@ -249,13 +268,20 @@ namespace emitters template <typename ValueType> VariableType GetVariableType(); - /// Gets the value form the VariableType enum that corresponds to a pointer to a given nonpointer type. + /// Gets the value from the VariableType enum that corresponds to a pointer from a given nonpointer type. /// - /// The nonpointer type, such as Short or Double. + /// The nonpointer type, such as Int16 or Double. /// /// A VariableType that corresponds to the pointer to a given type. VariableType GetPointerType(VariableType type); + /// Gets the value from the VariableType enum that corresponds to a nonpointer from a given pointer type. + /// + /// The pointer type, such as Int16Pointer or DoublePointer. + /// + /// A VariableType that corresponds to the nonpointer from a given type. + VariableType GetNonPointerType(VariableType type); + /// Gets the default value for a certain type. /// /// The type.
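A short illustration of the new inverse mapping next to the existing GetPointerType; illustrative only, using the enum values and declarations above:

    using namespace ell::emitters;
    VariableType p = GetPointerType(VariableType::Int16);  // Int16 -> Int16Pointer
    VariableType v = GetNonPointerType(p);                 // Int16Pointer -> Int16
    // Types with no pointer counterpart are returned unchanged, per the
    // default case in the EmitterTypes.cpp implementation later in this patch.

diff --git a/libraries/emitters/include/FunctionDeclaration.h b/libraries/emitters/include/FunctionDeclaration.h index cea7d1b0f..060db2912 100644 --- a/libraries/emitters/include/FunctionDeclaration.h +++ b/libraries/emitters/include/FunctionDeclaration.h @@ -51,6 +51,7 @@ namespace emitters /// Get the LLVM type, if we have it.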
LLVMType GetLLVMType() const { return _llvmType; } + private: std::string _name; VariableType _type; @@ -58,7 +59,7 @@ namespace emitters LLVMType _llvmType; }; - /// Collections of argument flags + /// Collections of argument flags using FunctionArgumentList = std::vector; /// A function definition that defines the name, return type and arguments of a function diff --git a/libraries/emitters/include/IRAssemblyWriter.h b/libraries/emitters/include/IRAssemblyWriter.h index a7368ea9f..4793965c1 100644 --- a/libraries/emitters/include/IRAssemblyWriter.h +++ b/libraries/emitters/include/IRAssemblyWriter.h @@ -45,7 +45,13 @@ namespace emitters OptimizationLevel optimizationLevel = OptimizationLevel::Default; FloatABIType floatABI = FloatABIType::Default; + FloatFusionMode floatFusionMode = FloatFusionMode::Fast; + bool unsafeFPMath = true; + bool noInfsFPMath = true; + bool noNaNsFPMath = true; + bool noSignedZerosFPMath = true; + OutputRelocationModel relocModel = OutputRelocationModel::Static; }; diff --git a/libraries/emitters/include/IREmitter.h b/libraries/emitters/include/IREmitter.h index 87568217f..6cfeb352c 100644 --- a/libraries/emitters/include/IREmitter.h +++ b/libraries/emitters/include/IREmitter.h @@ -42,7 +42,7 @@ namespace emitters using LLVMException = utilities::ErrorCodeException; /// - /// Wraps the LLVM API with an easy to use object model that hides some unncessary detail. + /// Wraps the LLVM API with an easy to use object model that hides some unnecessary detail. /// Incorporates our own x-compiler abstractions such as VariableType and TypedOperator. /// /// Note: IREmitter is stateful. It has a "current block" that it is emitting IR into. @@ -53,7 +53,6 @@ namespace emitters IREmitter(const IREmitter&) = delete; IREmitter(IREmitter&&) = default; IREmitter& operator=(const IREmitter&) = delete; - IREmitter& operator=(IREmitter&&) = default; ~IREmitter() = default; /// Get the LLVM Type information for a VariableType. @@ -1115,7 +1114,7 @@ namespace emitters private: friend class IRModuleEmitter; - IREmitter(IRModuleEmitter& moduleEmitter, llvm::LLVMContext& context); + IREmitter(llvm::LLVMContext& context, llvm::Module& module); LLVMType GetBaseVariableType(VariableType type) const; llvm::Constant* Integer(VariableType type, const size_t value); @@ -1131,7 +1130,7 @@ namespace emitters LLVMFunction CreateFunction(llvm::Module* pModule, const std::string& name, llvm::Function::LinkageTypes linkage, llvm::FunctionType* pFunctionType); LLVMValue Zero(); - IRModuleEmitter& _moduleEmitter; + llvm::Module& _module; llvm::LLVMContext& _llvmContext; // LLVM global context mutable llvm::IRBuilder<> _irBuilder; // IRBuilder API IRValueTable _stringLiterals; // String literals are emitted as constants. We have to track them ourselves to prevent dupes. diff --git a/libraries/emitters/include/IRExecutionEngine.h b/libraries/emitters/include/IRExecutionEngine.h index 423c62e17..a12b29e14 100644 --- a/libraries/emitters/include/IRExecutionEngine.h +++ b/libraries/emitters/include/IRExecutionEngine.h @@ -37,13 +37,13 @@ namespace emitters /// /// The module. /// Indicates if the execution engine should run a verification pass before running the code. - IRExecutionEngine(IRModuleEmitter&& module, bool verify = false); + IRExecutionEngine(IRModuleEmitter&& module, bool verify = true, llvm::CodeGenOpt::Level optLevel = llvm::CodeGenOpt::Level::Default); /// Inject the primary "owner" module into the execution engine. /// /// The module. 
/// Indicates if the execution engine should run a verification pass before running the code. - IRExecutionEngine(std::unique_ptr<llvm::Module> pModule, bool verify = false); + IRExecutionEngine(std::unique_ptr<llvm::Module> pModule, bool verify = false, llvm::CodeGenOpt::Level optLevel = llvm::CodeGenOpt::Level::Default); /// Destructor ~IRExecutionEngine(); @@ -103,6 +103,12 @@ namespace emitters /// The address of the function being defined. void DefineFunction(LLVMFunction func, utilities::UIntPtrT address); + /// Set the address of a named function already defined elsewhere in the binary. + /// + /// The function name being defined. + /// The address of the function being referenced. + void DefineFunction(const std::string& name, utilities::UIntPtrT address); + /// /// Return a main function that takes no arguments - if one exists. Returns nullptr if not found. /// diff --git a/libraries/emitters/include/IRFunctionEmitter.h b/libraries/emitters/include/IRFunctionEmitter.h index 6a1cc5d82..392d8c0b6 100644 --- a/libraries/emitters/include/IRFunctionEmitter.h +++ b/libraries/emitters/include/IRFunctionEmitter.h @@ -2,7 +2,7 @@ // // Project: Embedded Learning Library (ELL) // File: IRFunctionEmitter.h (emitters) -// Authors: Umesh Madan, Chuck Jacobs +// Authors: Umesh Madan, Chuck Jacobs, Kern Handa // //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -25,6 +25,8 @@ #include "LLVMUtilities.h" #include "Variable.h" +#include + #include #include #include @@ -33,10 +35,12 @@ #include #include +#include #include #include #include #include +#include #include namespace ell @@ -48,6 +52,15 @@ namespace emitters /// A list of IRLocalScalar values using IRScalarList = std::vector<IRLocalScalar>; + /// Helper enum used to specify whether a FunctionDeclaration should be inlined + enum class FunctionInlining + { + defaultInline, + always, + prefer, + never + }; + /// Used to emit code into an existing LLVM IR Function class IRFunctionEmitter { @@ -57,7 +70,11 @@ namespace emitters { None = 0, /// Suppress alias analysis - NoAlias + NoAlias, + /// This indicates that the parameter or return pointer is dereferenceable. + /// This attribute may only be applied to pointer typed parameters. A pointer that is + /// dereferenceable can be loaded from speculatively without a risk of trapping. + Dereferenceable }; /// Query if this IRFunctionEmitter is valid. @@ -705,18 +722,19 @@ namespace emitters /// /// The index of the argument /// The attribute - void SetAttributeForArgument(size_t index, Attributes attribute); + /// Any extra information that the attribute might make use of + void SetAttributeForArgument(size_t index, Attributes attribute, const std::any& extra = {}); /// Sets an attribute for all arguments /// /// The attribute - void SetAttributeForArguments(Attributes attribute); + void SetAttributeForArguments(Attributes attribute, const std::any& extra = {}); /// Sets an attribute for arguments at the specified indices /// /// The indices of the arguments /// The attribute - void SetAttributeForArguments(std::vector<size_t> indices, Attributes attribute); + void SetAttributeForArguments(std::vector<size_t> indices, Attributes attribute, const std::any& extra = {}); /// Emit a stack variable. /// @@ -756,15 +774,6 @@ namespace emitters /// Pointer to the array. llvm::AllocaInst* Variable(VariableType type, int size);
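A hedged sketch of the extended argument-attribute API declared above; the std::any payload for Dereferenceable is assumed here to carry the dereferenceable byte count:

    #include <any>
    using namespace ell::emitters;

    // Illustrative only: mark every argument no-alias, and argument 0 as
    // safely dereferenceable for 256 bytes.
    void Annotate(IRFunctionEmitter& function)
    {
        function.SetAttributeForArguments(IRFunctionEmitter::Attributes::NoAlias);
        function.SetAttributeForArgument(0, IRFunctionEmitter::Attributes::Dereferenceable,
                                         std::any{ size_t{ 256 } });
    }

- /// Emit a 2D stack array of the given dimensions. - /// - /// The array entry type. - /// The number of rows in the array.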
- /// - /// Pointer to the array. - llvm::AllocaInst* Variable(VariableType type, int rows, int columns); - /// Emit a stack array of the given size. /// /// The array entry type. @@ -773,15 +782,6 @@ namespace emitters /// Pointer to the array. llvm::AllocaInst* Variable(LLVMType type, int size); - /// Emit a 2D stack array of the given dimensions. - /// - /// The array entry type. - /// The number of rows in the array. - /// The number of columns in the array. - /// - /// Pointer to the array. - llvm::AllocaInst* Variable(LLVMType type, int rows, int columns); - /// Return an emitted stack variable and assign it a name. /// /// The variable type. @@ -1032,12 +1032,26 @@ namespace emitters /// A function that emits the body of the loop. void For(int count, ForLoopBodyFunction body); + /// Emits a for loop counting from zero to a constant end value. + /// + /// Tag to use when naming the basic block regions + /// The number of iterations to make. + /// A function that emits the body of the loop. + void For(const std::string& tag, int count, ForLoopBodyFunction body); + /// Emits a for loop counting from zero to a constant end value. /// /// The number of iterations to make. /// A function that emits the body of the loop. void For(LLVMValue count, ForLoopBodyFunction body); + /// Emits a for loop counting from zero to a constant end value. + /// + /// Tag to use when naming the basic block regions + /// The number of iterations to make. + /// A function that emits the body of the loop. + void For(const std::string& tag, LLVMValue count, ForLoopBodyFunction body); + /// Emits a for loop counting from a begin value up to (but not including) a constant end value. /// /// The starting value of the loop iterator. @@ -1045,6 +1059,14 @@ namespace emitters /// A function that emits the body of the loop. void For(int beginValue, int endValue, ForLoopBodyFunction body); + /// Emits a for loop counting from a begin value up to (but not including) a constant end value. + /// + /// Tag to use when naming the basic block regions + /// The starting value of the loop iterator. + /// The ending value of the loop iterator. + /// A function that emits the body of the loop. + void For(const std::string& tag, int beginValue, int endValue, ForLoopBodyFunction body); + /// Emits a for loop counting from a begin value up to (but not including) a constant end value. /// /// The starting value of the loop iterator. @@ -1052,6 +1074,14 @@ namespace emitters /// A function that emits the body of the loop. void For(LLVMValue beginValue, LLVMValue endValue, ForLoopBodyFunction body); + /// Emits a for loop counting from a begin value up to (but not including) a constant end value. + /// + /// Tag to use when naming the basic block regions + /// The starting value of the loop iterator. + /// The ending value of the loop iterator. + /// A function that emits the body of the loop. + void For(const std::string& tag, LLVMValue beginValue, LLVMValue endValue, ForLoopBodyFunction body); + /// Emits a for loop counting from a begin value up to (but not including) a constant end value with a given increment. /// /// The starting value of the loop iterator. @@ -1060,6 +1090,15 @@ namespace emitters /// A function that emits the body of the loop. void For(int beginValue, int endValue, int increment, ForLoopBodyFunction body); + /// Emits a for loop counting from a begin value up to (but not including) a constant end value with a given increment. 
+ /// + /// Tag to use when naming the basic block regions + /// The starting value of the loop iterator. + /// The ending value of the loop iterator. + /// The increment for the iterator. + /// A function that emits the body of the loop. + void For(const std::string& tag, int beginValue, int endValue, int increment, ForLoopBodyFunction body); + /// Emits a for loop counting from a begin value up to (but not including) a constant end value with a given increment. /// /// The starting value of the loop iterator. @@ -1068,6 +1107,15 @@ namespace emitters /// A function that emits the body of the loop. void For(LLVMValue beginValue, LLVMValue endValue, LLVMValue increment, ForLoopBodyFunction body); + /// Emits a for loop counting from a begin value up to (but not including) a constant end value with a given increment. + /// + /// Tag to use when naming the basic block regions + /// The starting value of the loop iterator. + /// The ending value of the loop iterator. + /// The increment for the iterator. + /// A function that emits the body of the loop. + void For(const std::string& tag, LLVMValue beginValue, LLVMValue endValue, LLVMValue increment, ForLoopBodyFunction body); + // // Extended for loops // @@ -1078,36 +1126,78 @@ namespace emitters /// A function that emits the body of the loop. void For(const std::vector<ConstLoopRange>& ranges, MultiDimForLoopBodyFunction body); + /// Emits a set of nested for loops, each counting from a begin value up to (but not including) an end value. + /// + /// Tag to use when naming the basic block regions + /// The range objects describing the ranges to iterate over (begin, end). + /// A function that emits the body of the loop. + void For(const std::string& tag, const std::vector<ConstLoopRange>& ranges, MultiDimForLoopBodyFunction body); + /// Emits a set of nested for loops, each counting from a begin value up to (but not including) an end value. /// /// The range objects describing the ranges to iterate over (begin, end). /// A function that emits the body of the loop. void For(const std::vector<LoopRange>& ranges, MultiDimForLoopBodyFunction body); + /// Emits a set of nested for loops, each counting from a begin value up to (but not including) an end value. + /// + /// Tag to use when naming the basic block regions + /// The range objects describing the ranges to iterate over (begin, end). + /// A function that emits the body of the loop. + void For(const std::string& tag, const std::vector<LoopRange>& ranges, MultiDimForLoopBodyFunction body); + /// Emits a tiled for loop counting from a begin value up to (but not including) an end value with a given increment. /// /// The range object describing the range to iterate over (begin, end, and increment). /// A function that emits the body of the loop. void For(ConstTiledLoopRange range, TiledForLoopBodyFunction body); + /// Emits a tiled for loop counting from a begin value up to (but not including) an end value with a given increment. + /// + /// Tag to use when naming the basic block regions + /// The range object describing the range to iterate over (begin, end, and increment). + /// A function that emits the body of the loop. + void For(const std::string& tag, ConstTiledLoopRange range, TiledForLoopBodyFunction body); + /// Emits a tiled for loop counting from a begin value up to (but not including) an end value with a given increment. /// /// The range object describing the range to iterate over (begin, end, and increment). /// A function that emits the body of the loop.
void For(TiledLoopRange range, TiledForLoopBodyFunction body); + /// Emits a tiled for loop counting from a begin value up to (but not including) an end value with a given increment. + /// + /// Tag to use when naming the basic block regions + /// The range object describing the range to iterate over (begin, end, and increment). + /// A function that emits the body of the loop. + void For(const std::string& tag, TiledLoopRange range, TiledForLoopBodyFunction body); + /// Emits a set of nested tiled for loops, each counting from a begin value up to (but not including) an end value, with a given increment. /// /// The range objects describing the ranges to iterate over (begin, end, increment). /// A function that emits the body of the loop. void For(const std::vector<ConstTiledLoopRange>& ranges, TiledMultiDimForLoopBodyFunction body); + /// Emits a set of nested tiled for loops, each counting from a begin value up to (but not including) an end value, with a given increment. + /// + /// Tag to use when naming the basic block regions + /// The range objects describing the ranges to iterate over (begin, end, increment). + /// A function that emits the body of the loop. + void For(const std::string& tag, const std::vector<ConstTiledLoopRange>& ranges, TiledMultiDimForLoopBodyFunction body); + /// Emits a set of nested tiled for loops, each counting from a begin value up to (but not including) an end value, with a given increment. /// /// The range objects describing the ranges to iterate over (begin, end, increment). /// A function that emits the body of the loop. void For(const std::vector<TiledLoopRange>& ranges, TiledMultiDimForLoopBodyFunction body); + /// Emits a set of nested tiled for loops, each counting from a begin value up to (but not including) an end value, with a given increment. + /// + /// Tag to use when naming the basic block regions + /// The range objects describing the ranges to iterate over (begin, end, increment). + /// A function that emits the body of the loop. + void For(const std::string& tag, const std::vector<TiledLoopRange>& ranges, TiledMultiDimForLoopBodyFunction body); + /// Emits a parallel for loop counting from zero to a constant end value. /// /// The number of iterations to make. @@ -1166,11 +1256,24 @@ namespace emitters /// Emits a while loop. /// - /// A function the emits code returning a single-bit boolean test value + /// Tag to use when naming the basic block regions + /// Pointer to a memory location that will be dereferenced for the test value. + /// A function that emits the body of the loop. + void While(const std::string& tag, LLVMValue pTestValuePointer, WhileLoopBodyFunction body); + + /// Emits a while loop. /// + /// A function that emits code returning a single-bit boolean test value /// A function that emits the body of the loop. void While(std::function<LLVMValue()> condition, WhileLoopBodyFunction body); + /// Emits a while loop. + /// + /// Tag to use when naming the basic block regions + /// A function that emits code returning a single-bit boolean test value + /// A function that emits the body of the loop. + void While(const std::string& tag, std::function<LLVMValue()> condition, WhileLoopBodyFunction body); +
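The tag overloads above name the emitted basic blocks, which makes nested loops far easier to follow in the printed IR. A hedged usage sketch, mirroring the lambda style this patch itself uses in IRFunctionEmitter.cpp; callback parameter types are elided with auto:

    // Illustrative only: blocks for the outer and inner loop get names derived
    // from "rows" and "cols" instead of anonymous numbering.
    function.For("rows", 0, 64, [&](auto& fn, auto i) {
        fn.For("cols", 0, 32, [&](auto& inner, auto j) {
            // ... emit the body using indices i and j ...
        });
    });

/// Emits an if statement. /// Pointer to a memory location that will be dereferenced for the test value. @@ -1312,6 +1415,13 @@ namespace emitters /// Pointer to the return value of the call to the printf function.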
+ LLVMValue Printf(std::vector<LLVMValue> arguments); + /// Emits a printf call. /// /// Describes the printf format to use. @@ -1636,6 +1746,9 @@ namespace emitters /// Gets the CPU id of the currently-running thread. Currently only available on Linux. Returns -1 if unavailable. LLVMValue GetCpu(); + /// Emits a system call to trap into the debugger + void DebugBreak(); + // // Information about the current function begin emitted // @@ -1701,6 +1814,12 @@ namespace emitters /// Tags a profiling function to be included in the SWIG interface. void IncludeInSwigInterface(); + /// Tags a function to be inlined or not, depending on the value passed in. + void SetInlineState(FunctionInlining inlineState); + + /// Tags an LLVM function pointer to be inlined or not, depending on the value passed in. + static void SetInlineState(LLVMFunction function, FunctionInlining inlineState); + private: friend class IRModuleEmitter; @@ -1713,13 +1832,11 @@ namespace emitters { public: EntryBlockScope(IRFunctionEmitter& function); - void ExitScope(); ~EntryBlockScope(); private: IRFunctionEmitter& _function; llvm::IRBuilder<>::InsertPoint _oldPos; - bool _inScope = true; }; LLVMValue PtrOffsetA(LLVMValue pPointer, int offset); @@ -1735,6 +1852,21 @@ namespace emitters LLVMValue SetValueAtH(LLVMValue pPointer, int offset, LLVMValue pValue); llvm::BasicBlock* GetEntryBlock() { return _entryBlock; } + + template <typename FunctionType> + auto ExecuteInEntryBlock(FunctionType&& fn) -> utilities::FunctionReturnType<FunctionType> + { + EntryBlockScope scope(*this); + if constexpr (utilities::HasReturnValue<FunctionType>()) + { + return fn(); + } + else + { + fn(); + } + } + void SetUpFunction(); void RegisterFunctionArgs(const NamedVariableTypeList& args); diff --git a/libraries/emitters/include/IRIfEmitter.h b/libraries/emitters/include/IRIfEmitter.h index 658bf533b..710cd3ff3 100644 --- a/libraries/emitters/include/IRIfEmitter.h +++ b/libraries/emitters/include/IRIfEmitter.h @@ -79,10 +79,10 @@ namespace emitters IRIfEmitter& operator=(const IRIfEmitter&) = delete; /// Move constructor - IRIfEmitter(IRIfEmitter&& other); + IRIfEmitter(IRIfEmitter&& other) noexcept; /// Move assignment operator - IRIfEmitter& operator=(IRIfEmitter&& other); + IRIfEmitter& operator=(IRIfEmitter&& other) noexcept; /// Emits an 'if' block. ///
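Two of the smaller additions in the IRFunctionEmitter hunk above, SetInlineState and DebugBreak, combine naturally when debugging generated code. A hedged sketch using only the members declared there:

    using namespace ell::emitters;

    // Illustrative only: keep the function out-of-line and trap on entry so a
    // native debugger stops inside the emitted code.
    void InstrumentForDebugging(IRFunctionEmitter& function)
    {
        function.SetInlineState(FunctionInlining::never);
        function.DebugBreak();
    }

diff --git a/libraries/emitters/include/IRLocalValue.h b/libraries/emitters/include/IRLocalValue.h index 589cc85a2..759ddcb24 100644 --- a/libraries/emitters/include/IRLocalValue.h +++ b/libraries/emitters/include/IRLocalValue.h @@ -18,6 +18,7 @@ namespace emitters { class IRFunctionEmitter; struct IRLocalValue; + struct IRLocalScalar; namespace detail { @@ -63,9 +64,6 @@ namespace emitters /// The LLVMValue being wrapped.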
LLVMValue value; - - private: - IRLocalValue() = default; }; /// @@ -74,6 +72,11 @@ namespace emitters struct IRLocalPointer : public IRLocalValue { using IRLocalValue::IRLocalValue; + + IRLocalValue Load() const; + IRLocalPointer Offset(int offset) const; + IRLocalPointer Offset(LLVMValue offset) const; + IRLocalPointer Offset(const IRLocalScalar& offset) const; }; } // namespace emitters } // namespace ell diff --git a/libraries/emitters/include/IRLoopEmitter.h b/libraries/emitters/include/IRLoopEmitter.h index 6b451f39f..d2341be5f 100644 --- a/libraries/emitters/include/IRLoopEmitter.h +++ b/libraries/emitters/include/IRLoopEmitter.h @@ -25,7 +25,7 @@ namespace emitters { public: virtual ~IRLoopEmitter() = default; - + protected: IRLoopEmitter(IRFunctionEmitter& functionEmitter); void AddLoopMetadata(llvm::BranchInst* branch, bool unroll, bool parallel); @@ -40,7 +40,8 @@ namespace emitters /// Constructs an instance of IRForLoopEmitter. /// /// The function emitter. - IRForLoopEmitter(IRFunctionEmitter& functionEmitter); + /// Optional, tag to use when naming the basic block regions + IRForLoopEmitter(IRFunctionEmitter& functionEmitter, const std::string& tag = ""); /// Gets the block containing the body of the for loop. /// @@ -105,6 +106,7 @@ namespace emitters llvm::BasicBlock* _pIncrementBlock = nullptr; // Here we increment the iteration variable llvm::BasicBlock* _pAfterBlock = nullptr; // When the loop is done, we branch to this block LLVMValue _pIterationVariable = nullptr; + std::string _tag; }; /// Class that simplifies while loop creation. Used internally by IRFunctionEmitter. @@ -116,7 +118,8 @@ namespace emitters /// Constructs an instance of IRWhileLoopEmitter. /// /// The function emitter. - IRWhileLoopEmitter(IRFunctionEmitter& functionEmitter); + /// Optional, tag to use when naming the basic block regions + IRWhileLoopEmitter(IRFunctionEmitter& functionEmitter, const std::string& tag = ""); /// Emits the beginning of a while loop that uses a mutable test value. /// @@ -150,6 +153,7 @@ namespace emitters llvm::BasicBlock* _pConditionBlock = nullptr; // Here we do the loop termination check llvm::BasicBlock* _pBodyBlock = nullptr; // The body of the loop llvm::BasicBlock* _pAfterBlock = nullptr; // When the loop is done, we branch to this block + std::string _tag; }; } // namespace emitters } // namespace ell diff --git a/libraries/emitters/include/IRModuleEmitter.h b/libraries/emitters/include/IRModuleEmitter.h index b7e77b8e1..570b96bad 100644 --- a/libraries/emitters/include/IRModuleEmitter.h +++ b/libraries/emitters/include/IRModuleEmitter.h @@ -57,9 +57,9 @@ namespace emitters IRModuleEmitter(const std::string& moduleName, const CompilerOptions& parameters); IRModuleEmitter(const IRModuleEmitter&) = delete; - IRModuleEmitter(IRModuleEmitter&&) = default; + IRModuleEmitter(IRModuleEmitter&&) = delete; IRModuleEmitter& operator=(const IRModuleEmitter&) = delete; - IRModuleEmitter& operator=(IRModuleEmitter&&) = default; + IRModuleEmitter& operator=(IRModuleEmitter&&) = delete; // // Properties of the module @@ -87,17 +87,17 @@ namespace emitters /// Returns the runtime object that manages functions. /// /// Reference to the `IRRuntime`. - IRRuntime& GetRuntime() { return _runtime; } + IRRuntime& GetRuntime() { return *_runtime; } /// Gets a reference to the profiler. /// /// Reference to the `IRProfiler` object for this module. 
- IRProfiler& GetProfiler() { return _profiler; } + IRProfiler& GetProfiler() { return *_profiler; } /// Gets a reference to the underlying IREmitter. /// /// Reference to the underlying IREmitter. - IREmitter& GetIREmitter() { return _emitter; } + IREmitter& GetIREmitter() { return *_emitter; } /// Can this module emitter still be used to add functions to the module? /// @@ -240,74 +240,82 @@ namespace emitters /// /// The variable type. /// The name of the variable. + /// Specifies whether the global memory address is unique to each thread /// /// Pointer to the llvm::GlobalVariable that represents the variable. - llvm::GlobalVariable* Global(VariableType type, const std::string& name); + llvm::GlobalVariable* Global(VariableType type, const std::string& name, bool isThreadLocal = false); /// Emit a named global variable of the given type. /// /// Pointer to the runtime value that contains the variable type. /// The name of the variable. + /// Specifies whether the global memory address is unique to each thread /// /// Pointer to the llvm::GlobalVariable that represents the variable. - llvm::GlobalVariable* Global(LLVMType pType, const std::string& name); + llvm::GlobalVariable* Global(LLVMType pType, const std::string& name, bool isThreadLocal = false); /// Emit a named global variable of a template type. /// /// The variable type. /// The name of the variable. /// The initial value of the variable. + /// Specifies whether the global memory address is unique to each thread /// /// Pointer to the llvm::GlobalVariable that represents the variable. template <typename ValueType> - llvm::GlobalVariable* Global(const std::string& name, ValueType value); + llvm::GlobalVariable* Global(const std::string& name, ValueType value, bool isThreadLocal = false); /// Emit a named global variable of Pointer type, initialized to nullptr. /// /// The variable type. /// The name of the variable. /// The variable type. + /// Specifies whether the global memory address is unique to each thread /// /// Pointer to the llvm::GlobalVariable that represents the variable. - llvm::GlobalVariable* GlobalPointer(const std::string& name, VariableType type); + llvm::GlobalVariable* GlobalPointer(const std::string& name, VariableType type, bool isThreadLocal = false); /// Emit a named global array of the given type and size. /// /// The variable type. /// The name of the variable. /// The array size. + /// Specifies whether the global memory address is unique to each thread /// /// Pointer to the llvm::GlobalVariable that represents the variable. - llvm::GlobalVariable* GlobalArray(VariableType type, const std::string& name, const size_t size); + llvm::GlobalVariable* GlobalArray(VariableType type, const std::string& name, const size_t size, bool isThreadLocal = false); /// Emit a named global array of the given type and size. /// /// The name of the variable. /// Pointer to the runtime value that contains the variable type. /// The array size. + /// Specifies whether the global memory address is unique to each thread /// /// Pointer to the llvm::GlobalVariable that represents the variable. - llvm::GlobalVariable* GlobalArray(const std::string& name, LLVMType pType, const size_t size); + llvm::GlobalVariable* GlobalArray(const std::string& name, LLVMType pType, const size_t size, bool isThreadLocal = false);
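A hedged sketch of the new isThreadLocal flag: each thread sees its own copy of the global, and the JIT path pairs with this by enabling emulated TLS in IRExecutionEngine.cpp later in this patch. The surrounding IRModuleEmitter instance ("module") is assumed:

    // Illustrative only: a per-thread scratch buffer of 256 doubles.
    llvm::GlobalVariable* scratch =
        module.GlobalArray<double>("g_threadScratch", 256, /*isThreadLocal=*/ true);

/// Emit a zero-initialized named, module scoped array of a template type. /// /// Type of each array entry. /// The name of the variable. /// The size of the array.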
+ /// Specifies whether the global memory address is unique to each thread /// /// Pointer to the llvm::GlobalVariable that represents the variable. template <typename ValueType> - llvm::GlobalVariable* GlobalArray(const std::string& name, size_t size); + llvm::GlobalVariable* GlobalArray(const std::string& name, size_t size, bool isThreadLocal = false); /// Emit a named, module scoped array of a template type. /// /// Type of each array entry. /// The name of the variable. /// The value of the array. + /// Specifies whether the global memory address is unique to each thread /// /// Pointer to the llvm::GlobalVariable that represents the variable. template <typename ValueType> - llvm::GlobalVariable* GlobalArray(const std::string& name, const std::vector<ValueType>& value); + llvm::GlobalVariable* GlobalArray(const std::string& name, const std::vector<ValueType>& value, bool isThreadLocal = false); // // Functions // @@ -355,6 +363,13 @@ namespace emitters /// Pointer to an llvm::Function that represents the requested function, or nullptr if it doesn't exist. LLVMFunction GetFunction(const std::string& name) const; + /// Get an LLVM intrinsic function taking no arguments with the given id. + /// + /// The intrinsic function identifier. + /// + /// Pointer to an llvm::Function that represents the requested function. + LLVMFunction GetIntrinsic(llvm::Intrinsic::ID id); + /// Get an LLVM intrinsic function with the given id and signature. /// /// The intrinsic function identifier. @@ -531,6 +546,31 @@ namespace emitters /// The IR text. void LoadIR(const std::string& text); + /// Load LLVM IR from the given stream + /// + /// The stream that serves as the input + void LoadIR(std::istream& stream); + + /// Load LLVM IR from the file into this module. + /// + /// The name of the file containing the IR + void LoadIRFromFile(const std::string& filename); + + /// Load Assembler text into this module. + /// + /// The IR text. + void LoadAsm(const std::string& text); + + /// Load Assembler from the given stream + /// + /// The stream that serves as the input + void LoadAsm(std::istream& stream); + + /// Load Assembler from the file into this module.
+ /// + /// The name of the file containing the Assembler text + void LoadAsmFromFile(const std::string& filename); + // // Optimization // @@ -752,7 +792,7 @@ namespace emitters void InsertFunctionMetadata(LLVMFunction function, const std::string& tag, const std::vector<std::string>& value = { "" }); // Get a reference to the thread pool - IRThreadPool& GetThreadPool() { return _threadPool; } + IRThreadPool& GetThreadPool() { return *_threadPool; } // Actual code output implementations void WriteHeader(std::ostream& stream); @@ -762,18 +802,13 @@ namespace emitters // Lower-level internal functions // void SetCompilerOptions(const CompilerOptions& parameters) override; - llvm::GlobalVariable* AddGlobal(const std::string& name, LLVMType pType, llvm::Constant* pInitial, bool isConst); + llvm::GlobalVariable* AddGlobal(const std::string& name, LLVMType pType, llvm::Constant* pInitial, bool isConst, bool isThreadLocal = false); IRFunctionEmitter Function(const std::string& name, VariableType returnType, const VariableTypeList* pArguments, bool isPublic); llvm::Function::LinkageTypes Linkage(bool isPublic); llvm::ConstantAggregateZero* ZeroInitializer(LLVMType pType); static void CompleteCompilerOptions(CompilerOptions& parameters); void SetTargetTriple(const std::string& triple); - - // - // LLVM global state management - // - void InitializeLLVM(); - static llvm::PassRegistry* InitializeGlobalPassRegistry(); + MachineCodeOutputOptions GetMachineCodeOutputOptions() const; // // Data members // @@ -781,14 +816,14 @@ std::unique_ptr<llvm::LLVMContext> _llvmContext; // LLVM global context std::unique_ptr<llvm::Module> _llvmModule; // The LLVM Module being emitted std::unique_ptr<IRDiagnosticHandler> _diagnosticHandler = nullptr; - IREmitter _emitter; + std::unique_ptr<IREmitter> _emitter; std::stack<std::pair<IRFunctionEmitter, llvm::IRBuilder<>::InsertPoint>> _functionStack; // contains the location we were emitting code into when we paused to emit a new function IRValueTable _literals; // Symbol table - name to literals IRValueTable _globals; // Symbol table - name to global variables - IRRuntime _runtime; // Manages emission of runtime functions - IRThreadPool _threadPool; // A pool of worker threads -- gets initialized the first time it's used (?) - IRProfiler _profiler; + std::unique_ptr<IRRuntime> _runtime; // Manages emission of runtime functions + std::unique_ptr<IRThreadPool> _threadPool; // A pool of worker threads -- gets initialized the first time it's used (?)
+ std::unique_ptr<IRProfiler> _profiler; int _globalStringIndex = 0; // Info to modify how code is written out @@ -824,31 +859,31 @@ namespace emitters template <typename ValueType> llvm::GlobalVariable* IRModuleEmitter::Constant(const std::string& name, ValueType value) { - return AddGlobal(name, _emitter.Type(GetVariableType<ValueType>()), _emitter.Literal(value), true); + return AddGlobal(name, GetIREmitter().Type(GetVariableType<ValueType>()), GetIREmitter().Literal(value), true); } template <typename ValueType> - llvm::GlobalVariable* IRModuleEmitter::Global(const std::string& name, ValueType value) + llvm::GlobalVariable* IRModuleEmitter::Global(const std::string& name, ValueType value, bool isThreadLocal) { - return AddGlobal(name, _emitter.Type(GetVariableType<ValueType>()), _emitter.Literal(value), false); + return AddGlobal(name, GetIREmitter().Type(GetVariableType<ValueType>()), GetIREmitter().Literal(value), false, isThreadLocal); } template <typename ValueType> llvm::GlobalVariable* IRModuleEmitter::ConstantArray(const std::string& name, const std::vector<ValueType>& value) { - return AddGlobal(name, _emitter.ArrayType(GetVariableType<ValueType>(), value.size()), _emitter.Literal(value), true); + return AddGlobal(name, GetIREmitter().ArrayType(GetVariableType<ValueType>(), value.size()), GetIREmitter().Literal(value), true); } template <typename ValueType> - llvm::GlobalVariable* IRModuleEmitter::GlobalArray(const std::string& name, size_t size) + llvm::GlobalVariable* IRModuleEmitter::GlobalArray(const std::string& name, size_t size, bool isThreadLocal) { - return GlobalArray(GetVariableType<ValueType>(), name, size); + return GlobalArray(GetVariableType<ValueType>(), name, size, isThreadLocal); } template <typename ValueType> - llvm::GlobalVariable* IRModuleEmitter::GlobalArray(const std::string& name, const std::vector<ValueType>& value) + llvm::GlobalVariable* IRModuleEmitter::GlobalArray(const std::string& name, const std::vector<ValueType>& value, bool isThreadLocal) { - return AddGlobal(name, _emitter.ArrayType(GetVariableType<ValueType>(), value.size()), _emitter.Literal(value), false); + return AddGlobal(name, GetIREmitter().ArrayType(GetVariableType<ValueType>(), value.size()), GetIREmitter().Literal(value), false, isThreadLocal); } // diff --git a/libraries/emitters/include/IRPosixRuntime.h b/libraries/emitters/include/IRPosixRuntime.h index d68decf38..5c7adace0 100644 --- a/libraries/emitters/include/IRPosixRuntime.h +++ b/libraries/emitters/include/IRPosixRuntime.h @@ -161,10 +161,12 @@ namespace emitters // GetPthreadAttrType // GetPthreadOnceType + IRPosixRuntime(const IRPosixRuntime&) = delete; + private: friend IRModuleEmitter; friend IRRuntime; - IRPosixRuntime(IRModuleEmitter& module); + explicit IRPosixRuntime(IRModuleEmitter& module); LLVMType GetIntType(); // returns LLVM type for native `int` LLVMType GetPointerSizedIntType(); // returns LLVM type for an int the size of a pointer diff --git a/libraries/emitters/include/IRRuntime.h b/libraries/emitters/include/IRRuntime.h index 36774cc03..03e3da461 100644 --- a/libraries/emitters/include/IRRuntime.h +++ b/libraries/emitters/include/IRRuntime.h @@ -88,6 +88,12 @@ namespace emitters template <typename ValueType> LLVMFunction GetTanhFunction(); + /// Get the fma function + /// + /// An LLVM function pointer to the function.
+ template <typename ValueType> + LLVMFunction GetFmaFunction(); + // emitter types LLVMFunction GetSqrtFunction(VariableType argType); LLVMFunction GetAbsFunction(VariableType argType); @@ -103,6 +109,7 @@ namespace emitters LLVMFunction GetFloorFunction(VariableType argType); LLVMFunction GetCeilFunction(VariableType argType); LLVMFunction GetCopySignFunction(VariableType argType); + LLVMFunction GetFmaFunction(VariableType argType); // llvm types LLVMFunction GetSqrtFunction(LLVMType argType); @@ -119,6 +126,7 @@ namespace emitters LLVMFunction GetFloorFunction(LLVMType argType); LLVMFunction GetCeilFunction(LLVMType argType); LLVMFunction GetCopySignFunction(LLVMType argType); + LLVMFunction GetFmaFunction(LLVMType argType); LLVMFunction GetPrefetchFunction(); @@ -172,7 +180,7 @@ namespace emitters private: friend IRModuleEmitter; - IRRuntime(IRModuleEmitter& module); + explicit IRRuntime(IRModuleEmitter& module); LLVMType GetIntType(); // returns LLVM type for native `int` diff --git a/libraries/emitters/include/LLVMUtilities.h b/libraries/emitters/include/LLVMUtilities.h index 698184208..405fec92e 100644 --- a/libraries/emitters/include/LLVMUtilities.h +++ b/libraries/emitters/include/LLVMUtilities.h @@ -26,6 +26,9 @@ namespace emitters /// Nice name for llvm::Function pointers. using LLVMFunction = llvm::Function*; + /// Nice name for llvm::FunctionType pointers. + using LLVMFunctionType = llvm::FunctionType*; + /// Nice name for llvm::Type pointers. using LLVMType = llvm::Type*; @@ -74,5 +77,8 @@ namespace emitters /// The VariableType or VariableType::Custom for anything that doesn't map. VariableType ToVariableType(LLVMType type); + /// Initializes LLVM + void InitializeLLVM(); + } // namespace emitters } // namespace ell diff --git a/libraries/emitters/include/ModuleEmitter.h b/libraries/emitters/include/ModuleEmitter.h index dd44a9c08..045b24b87 100644 --- a/libraries/emitters/include/ModuleEmitter.h +++ b/libraries/emitters/include/ModuleEmitter.h @@ -40,7 +40,7 @@ namespace emitters /// Return the base compiler settings /// /// The settings for the compiler - CompilerOptions GetCompilerOptions() const { return _options; } + const CompilerOptions& GetCompilerOptions() const { return _options; } // Note, this differs from IRModuleEmitter::BeginFunction only by return type /// Set a function declaration.
Note that BeginMapPredictFunction can't be called from within a function - it completes the currently-being-emitted function diff --git a/libraries/emitters/include/TargetDevice.h b/libraries/emitters/include/TargetDevice.h index 6c1e0f1a0..be2c20d07 100644 --- a/libraries/emitters/include/TargetDevice.h +++ b/libraries/emitters/include/TargetDevice.h @@ -25,6 +25,20 @@ namespace emitters std::string features = ""; size_t numBits = 0; + /// Helper function to test whether the TargetDevice has a particular feature + /// If this is filled in by LLVM for the host target, the possible features are target dependent + /// and include, but are not limited to, the following: + /// X86: cx8, cmov, mmx, fxsr, sse, sse2, sse3, pclmul, ssse3, cx16, sse4.1, sse4.2, movbe, popcnt, aes, rdrnd, + /// avx, fma, xsave, f16c, sahf, lzcnt, sse4a, prfchw, xop, lwp, fma4, tbm, mwaitx, 64bit, clzero, wbnoinvd, + /// fsgsbase, sgx, bmi, avx2, bmi2, invpcid, rtm, avx512f, avx512dq, rdseed, adx, avx512ifma, clflushopt, + /// clwb, avx512pf, avx512er, avx512cd, sha, avx512bw, avx512vl, prefetchwt1, avx512vbmi, pku, waitpkg, + /// avx512vbmi2, shstk, gfni, vaes, vpclmulqdq, avx512vnni, avx512bitalg, avx512vpopcntdq, rdpid, cldemote, + /// movdiri, movdir64b, enqcmd, pconfig, avx512bf16, xsaveopt, xsavec, xsaves, ptwrite + /// AArch64: neon, fp-armv8, crc, crypto + /// ARM: fp16, neon, vfp3, d16, vfp4, hwdiv-arm, hwdiv + /// + inline bool HasFeature(const std::string& feature) const { return features.find(feature) != std::string::npos; } + /// Indicates if the target device is a Windows system bool IsWindows() const; diff --git a/libraries/emitters/src/CompilerOptions.cpp b/libraries/emitters/src/CompilerOptions.cpp index 3259e74ed..fe45c1a8c 100644 --- a/libraries/emitters/src/CompilerOptions.cpp +++ b/libraries/emitters/src/CompilerOptions.cpp @@ -72,6 +72,8 @@ namespace emitters maxThreads = properties.GetOrParseEntry("maxThreads", maxThreads); useFastMath = properties.GetOrParseEntry("useFastMath", useFastMath); debug = properties.GetOrParseEntry("debug", debug); + globalValueAlignment = properties.GetOrParseEntry("globalValueAlignment", globalValueAlignment); + skip_ellcode = properties.GetOrParseEntry("skip_ellcode", skip_ellcode); if (properties.HasEntry("deviceName")) { diff --git a/libraries/emitters/src/EmitterTypes.cpp b/libraries/emitters/src/EmitterTypes.cpp index c5a207c2f..c66a9d854 100644 --- a/libraries/emitters/src/EmitterTypes.cpp +++ b/libraries/emitters/src/EmitterTypes.cpp @@ -330,6 +330,32 @@ namespace emitters return type; } + VariableType GetNonPointerType(VariableType type) + { + switch (type) + { + case VariableType::VoidPointer: + return VariableType::Void; + case VariableType::BytePointer: + return VariableType::Byte; + case VariableType::Int16Pointer: + return VariableType::Int16; + case VariableType::Int32Pointer: + return VariableType::Int32; + case VariableType::Int64Pointer: + return VariableType::Int64; + case VariableType::FloatPointer: + return VariableType::Float; + case VariableType::DoublePointer: + return VariableType::Double; + case VariableType::Char8Pointer: + return VariableType::Char8; + default: + break; + } + return type; + } + template <> TypedOperator GetAddForValueType() { diff --git a/libraries/emitters/src/IRAssemblyWriter.cpp b/libraries/emitters/src/IRAssemblyWriter.cpp index 5cae93755..af67d5075 100644 --- a/libraries/emitters/src/IRAssemblyWriter.cpp +++ b/libraries/emitters/src/IRAssemblyWriter.cpp @@ -142,6 +142,11 @@ namespace emitters llvm::TargetOptions 
targetOptions = MakeTargetOptions(); targetOptions.MCOptions.AsmVerbose = ellOptions.verboseOutput; targetOptions.FloatABIType = ellOptions.floatABI; + targetOptions.AllowFPOpFusion = ellOptions.floatFusionMode; + targetOptions.UnsafeFPMath = ellOptions.unsafeFPMath ? 1 : 0; + targetOptions.NoInfsFPMath = ellOptions.noInfsFPMath ? 1 : 0; + targetOptions.NoNaNsFPMath = ellOptions.noNaNsFPMath ? 1 : 0; + targetOptions.NoSignedZerosFPMath = ellOptions.noSignedZerosFPMath ? 1 : 0; OutputRelocationModel relocModel = ellOptions.relocModel; llvm::CodeModel::Model codeModel = llvm::CodeModel::Small; // If this code gets run during JIT, we may have to change to medium/large diff --git a/libraries/emitters/src/IREmitter.cpp b/libraries/emitters/src/IREmitter.cpp index 29e2c3c0c..6f4a19e1d 100644 --- a/libraries/emitters/src/IREmitter.cpp +++ b/libraries/emitters/src/IREmitter.cpp @@ -7,8 +7,8 @@ //////////////////////////////////////////////////////////////////////////////////////////////////// #include "IREmitter.h" -#include "IRModuleEmitter.h" #include "EmitterException.h" +#include "IRModuleEmitter.h" #include "LLVMUtilities.h" #include @@ -77,8 +77,8 @@ namespace emitters // // IREmitter implementation // - IREmitter::IREmitter(IRModuleEmitter& moduleEmitter, llvm::LLVMContext& context) : - _moduleEmitter(moduleEmitter), + IREmitter::IREmitter(llvm::LLVMContext& context, llvm::Module& module) : + _module(module), _llvmContext(context), _irBuilder(context) {} @@ -90,39 +90,55 @@ namespace emitters case VariableType::Void: return GetBaseVariableType(type); case VariableType::VoidPointer: - // We use BytePointer to avoid LLVM Assertion failed: isValidElementType(EltTy) && "Invalid type for pointer element!", + // We use BytePointer to avoid LLVM Assertion failed: isValidElementType(EltTy) && "Invalid type for pointer element!", // file ~\llvm-8\lib\ir\type.cpp, line 632 return GetBaseVariableType(VariableType::Byte)->getPointerTo(); + case VariableType::VoidPointerPointer: + return GetBaseVariableType(VariableType::Byte)->getPointerTo()->getPointerTo(); case VariableType::Boolean: return GetBaseVariableType(type); case VariableType::Byte: return GetBaseVariableType(type); case VariableType::BytePointer: return GetBaseVariableType(VariableType::Byte)->getPointerTo(); + case VariableType::BytePointerPointer: + return GetBaseVariableType(VariableType::Byte)->getPointerTo()->getPointerTo(); case VariableType::Int16: return GetBaseVariableType(type); case VariableType::Int16Pointer: return GetBaseVariableType(VariableType::Int16)->getPointerTo(); + case VariableType::Int16PointerPointer: + return GetBaseVariableType(VariableType::Int16)->getPointerTo()->getPointerTo(); case VariableType::Int32: return GetBaseVariableType(type); case VariableType::Int32Pointer: return GetBaseVariableType(VariableType::Int32)->getPointerTo(); + case VariableType::Int32PointerPointer: + return GetBaseVariableType(VariableType::Int32)->getPointerTo()->getPointerTo(); case VariableType::Int64: return GetBaseVariableType(type); case VariableType::Int64Pointer: return GetBaseVariableType(VariableType::Int64)->getPointerTo(); + case VariableType::Int64PointerPointer: + return GetBaseVariableType(VariableType::Int64)->getPointerTo()->getPointerTo(); case VariableType::Float: return GetBaseVariableType(type); case VariableType::FloatPointer: return GetBaseVariableType(VariableType::Float)->getPointerTo(); + case VariableType::FloatPointerPointer: + return GetBaseVariableType(VariableType::Float)->getPointerTo()->getPointerTo(); 
case VariableType::Double: return GetBaseVariableType(type); case VariableType::DoublePointer: return GetBaseVariableType(VariableType::Double)->getPointerTo(); + case VariableType::DoublePointerPointer: + return GetBaseVariableType(VariableType::Double)->getPointerTo()->getPointerTo(); case VariableType::Char8: return GetBaseVariableType(type); case VariableType::Char8Pointer: return GetBaseVariableType(VariableType::Char8)->getPointerTo(); + case VariableType::Char8PointerPointer: + return GetBaseVariableType(VariableType::Char8)->getPointerTo()->getPointerTo(); default: throw EmitterException(EmitterError::valueTypeNotSupported); } @@ -1115,7 +1131,9 @@ namespace emitters uint64_t IREmitter::SizeOf(LLVMType type) const { - return _moduleEmitter.GetTargetDataLayout().getTypeAllocSize(type); + assert(!_module.getDataLayout().getStringRepresentation().empty()); + + return _module.getDataLayout().getTypeAllocSize(type); } uint64_t IREmitter::SizeOf(VariableType type) const diff --git a/libraries/emitters/src/IRExecutionEngine.cpp b/libraries/emitters/src/IRExecutionEngine.cpp index ed8e36390..d444581ba 100644 --- a/libraries/emitters/src/IRExecutionEngine.cpp +++ b/libraries/emitters/src/IRExecutionEngine.cpp @@ -40,12 +40,12 @@ namespace emitters throw emitters::EmitterException(emitters::EmitterError::unexpected, msg); } - IRExecutionEngine::IRExecutionEngine(IRModuleEmitter&& module, bool verify) : - IRExecutionEngine(module.TransferOwnership(), verify) + IRExecutionEngine::IRExecutionEngine(IRModuleEmitter&& module, bool verify, llvm::CodeGenOpt::Level optLevel) : + IRExecutionEngine(module.TransferOwnership(), verify, optLevel) { } - IRExecutionEngine::IRExecutionEngine(std::unique_ptr<llvm::Module> pModule, bool verify) + IRExecutionEngine::IRExecutionEngine(std::unique_ptr<llvm::Module> pModule, bool verify, llvm::CodeGenOpt::Level optLevel) { auto debugPrintFunction = pModule->getFunction("DebugPrint"); llvm::InitializeNativeTargetAsmPrinter(); _pBuilder = std::make_unique<llvm::EngineBuilder>(std::move(pModule)); - _pBuilder->setEngineKind(llvm::EngineKind::JIT).setVerifyModules(verify).setUseOrcMCJITReplacement(false); + _pBuilder->setEngineKind(llvm::EngineKind::JIT).setVerifyModules(verify).setOptLevel(optLevel).setEmulatedTLS(true); static bool installed = false; if (!installed) @@ -68,7 +68,6 @@ namespace emitters { DefineFunction(debugPrintFunction, reinterpret_cast<UIntPtrT>(&DebugPrintImpl)); } - } IRExecutionEngine::~IRExecutionEngine() @@ -124,6 +123,12 @@ namespace emitters _pEngine->addGlobalMapping(func, (void*)address); } + void IRExecutionEngine::DefineFunction(const std::string& name, UIntPtrT address) + { + EnsureEngine(); + _pEngine->addGlobalMapping(name, address); + } + DynamicFunction IRExecutionEngine::GetMain() { return reinterpret_cast<DynamicFunction>(GetFunctionAddress("main")); diff --git a/libraries/emitters/src/IRFunctionEmitter.cpp b/libraries/emitters/src/IRFunctionEmitter.cpp index 911b2a130..349583c61 100644 --- a/libraries/emitters/src/IRFunctionEmitter.cpp +++ b/libraries/emitters/src/IRFunctionEmitter.cpp @@ -39,7 +39,7 @@ namespace emitters namespace { // Helper function for recursive function - void MultiDimFor(IRFunctionEmitter& function, std::vector ranges, std::vector prevIndices, IRFunctionEmitter::MultiDimForLoopBodyFunction body) + void MultiDimFor(IRFunctionEmitter& function, std::vector ranges, std::vector prevIndices, IRFunctionEmitter::MultiDimForLoopBodyFunction body, const std::string& tag = "") { if (ranges.empty()) { @@ -49,16 +49,16 @@ { auto range
= ranges.front(); std::vector suffix(ranges.begin() + 1, ranges.end()); - function.For(range.begin, range.end, [suffix, prevIndices, body](IRFunctionEmitter& function, auto index) { + function.For(range.begin, range.end, [suffix, prevIndices, body, tag](IRFunctionEmitter& function, auto index) { std::vector prefix(prevIndices.begin(), prevIndices.end()); prefix.push_back(index); - MultiDimFor(function, suffix, prefix, body); + MultiDimFor(function, suffix, prefix, body, tag); }); } } // Helper function for recursive function - void MultiDimFor(IRFunctionEmitter& function, std::vector ranges, std::vector prevIndices, IRFunctionEmitter::MultiDimForLoopBodyFunction body) + void MultiDimFor(IRFunctionEmitter& function, std::vector ranges, std::vector prevIndices, IRFunctionEmitter::MultiDimForLoopBodyFunction body, const std::string& tag = "") { if (ranges.empty()) { @@ -68,16 +68,16 @@ namespace emitters { auto range = ranges.front(); std::vector suffix(ranges.begin() + 1, ranges.end()); - function.For(range.begin, range.end, [suffix, prevIndices, body](IRFunctionEmitter& function, auto index) { + function.For(range.begin, range.end, [suffix, prevIndices, body, tag](IRFunctionEmitter& function, auto index) { std::vector prefix(prevIndices.begin(), prevIndices.end()); prefix.push_back(index); - MultiDimFor(function, suffix, prefix, body); + MultiDimFor(function, suffix, prefix, body, tag); }); } } // Helper function for recursive function - void TiledMultiDimFor(IRFunctionEmitter& function, std::vector ranges, std::vector prevIntervals, IRFunctionEmitter::TiledMultiDimForLoopBodyFunction body) + void TiledMultiDimFor(IRFunctionEmitter& function, std::vector ranges, std::vector prevIntervals, IRFunctionEmitter::TiledMultiDimForLoopBodyFunction body, const std::string& tag = "") { if (ranges.empty()) { @@ -87,16 +87,16 @@ namespace emitters { auto range = ranges.front(); std::vector suffix(ranges.begin() + 1, ranges.end()); - function.For(range, [suffix, prevIntervals, body](IRFunctionEmitter& function, auto interval) { + function.For(range, [suffix, prevIntervals, body, tag](IRFunctionEmitter& function, auto interval) { std::vector prefix(prevIntervals.begin(), prevIntervals.end()); prefix.push_back(interval); - TiledMultiDimFor(function, suffix, prefix, body); + TiledMultiDimFor(function, suffix, prefix, body, tag); }); } } // Helper function for recursive function - void TiledMultiDimFor(IRFunctionEmitter& function, std::vector ranges, std::vector prevIntervals, IRFunctionEmitter::TiledMultiDimForLoopBodyFunction body) + void TiledMultiDimFor(IRFunctionEmitter& function, std::vector ranges, std::vector prevIntervals, IRFunctionEmitter::TiledMultiDimForLoopBodyFunction body, const std::string& tag = "") { if (ranges.empty()) { @@ -106,24 +106,26 @@ namespace emitters { auto range = ranges.front(); std::vector suffix(ranges.begin() + 1, ranges.end()); - function.For(range, [suffix, prevIntervals, body](IRFunctionEmitter& function, auto interval) { + function.For(range, [suffix, prevIntervals, body, tag](IRFunctionEmitter& function, auto interval) { std::vector prefix(prevIntervals.begin(), prevIntervals.end()); prefix.push_back(interval); - TiledMultiDimFor(function, suffix, prefix, body); + TiledMultiDimFor(function, suffix, prefix, body, tag); }); } } - constexpr llvm::Attribute::AttrKind ToLLVMAttr(IRFunctionEmitter::Attributes attr) + llvm::Attribute ToLLVMAttr(llvm::LLVMContext& context, IRFunctionEmitter::Attributes attr, const std::any& extra) { switch (attr) { default: [[fallthrough]]; 
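+           // Note: Attributes::Dereferenceable (handled below) carries its byte count in
+           // the std::any `extra` parameter. Illustrative call site only; the argument
+           // index and byte count are made up, and the payload type must match the
+           // any_cast performed here:
+           //
+           //     fn.SetAttributeForArgument(0, IRFunctionEmitter::Attributes::Dereferenceable,
+           //                                std::any{ int64_t{ 64 } }); // arg 0 is dereferenceable(64)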
            case IRFunctionEmitter::Attributes::None:
-               return llvm::Attribute::AttrKind::None;
+               return llvm::Attribute::get(context, llvm::Attribute::AttrKind::None);
            case IRFunctionEmitter::Attributes::NoAlias:
-               return llvm::Attribute::AttrKind::NoAlias;
+               return llvm::Attribute::get(context, llvm::Attribute::AttrKind::NoAlias);
+           case IRFunctionEmitter::Attributes::Dereferenceable:
+               return llvm::Attribute::getWithDereferenceableBytes(context, std::any_cast<int64_t>(extra));
            }
        }
    } // namespace
@@ -168,6 +170,7 @@ namespace emitters
     {
         Log() << "Completing function " << GetFunctionName() << EOL;
         Verify();
+        // TODO: set a flag indicating that this function is done
     }
 
     void IRFunctionEmitter::SetUpFunction()
@@ -711,67 +714,53 @@ namespace emitters
        }
    }
 
-    void IRFunctionEmitter::EntryBlockScope::ExitScope()
-    {
-        if (_inScope)
-        {
-            _function.SetCurrentInsertPoint(_oldPos);
-            _inScope = false;
-        }
-    }
-
     IRFunctionEmitter::EntryBlockScope::~EntryBlockScope()
     {
-        ExitScope();
+        _function.SetCurrentInsertPoint(_oldPos);
     }
 
-    void IRFunctionEmitter::SetAttributeForArgument(size_t index, IRFunctionEmitter::Attributes attribute)
+    void IRFunctionEmitter::SetAttributeForArgument(size_t index, IRFunctionEmitter::Attributes attribute, const std::any& extra)
     {
-        (_pFunction->arg_begin() + index)->addAttr(ToLLVMAttr(attribute));
+        (_pFunction->arg_begin() + index)->addAttr(ToLLVMAttr(GetLLVMContext(), attribute, extra));
     }
 
-    void IRFunctionEmitter::SetAttributeForArguments(IRFunctionEmitter::Attributes attribute)
+    void IRFunctionEmitter::SetAttributeForArguments(IRFunctionEmitter::Attributes attribute, const std::any& extra)
     {
         for (auto& argument : Arguments())
         {
-            argument.addAttr(ToLLVMAttr(attribute));
+            argument.addAttr(ToLLVMAttr(GetLLVMContext(), attribute, extra));
         }
     }
 
-    void IRFunctionEmitter::SetAttributeForArguments(std::vector<size_t> indices, IRFunctionEmitter::Attributes attribute)
+    void IRFunctionEmitter::SetAttributeForArguments(std::vector<size_t> indices, IRFunctionEmitter::Attributes attribute, const std::any& extra)
     {
         for (auto index : indices)
         {
-            SetAttributeForArgument(index, attribute);
+            SetAttributeForArgument(index, attribute, extra);
         }
     }
 
     llvm::AllocaInst* IRFunctionEmitter::Variable(VariableType type)
     {
-        EntryBlockScope scope(*this);
-        auto alloca = GetEmitter().StackAllocate(type);
-        scope.ExitScope();
-
-        return alloca;
+        return ExecuteInEntryBlock([this, type] {
+            return GetEmitter().StackAllocate(type);
+        });
     }
 
     llvm::AllocaInst* IRFunctionEmitter::Variable(LLVMType type)
     {
-        EntryBlockScope scope(*this);
-        auto alloca = GetEmitter().StackAllocate(type);
-        scope.ExitScope();
-
-        return alloca;
+        return ExecuteInEntryBlock([this, type] {
+            return GetEmitter().StackAllocate(type);
+        });
     }
 
     llvm::AllocaInst* IRFunctionEmitter::Variable(VariableType type, const std::string& namePrefix)
     {
-        EntryBlockScope scope(*this);
         // don't do this for emitted variables!
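+        // (ExecuteInEntryBlock, used by all of these Variable() overloads, is assumed
+        // to replace the old EntryBlockScope save/restore dance; the real definition
+        // presumably lives in IRFunctionEmitter.h. A minimal sketch of its likely shape,
+        // names hypothetical:)
+        //
+        //     template <typename F>
+        //     auto ExecuteInEntryBlock(F&& body)
+        //     {
+        //         auto oldPos = GetCurrentInsertPoint();  // remember the current block
+        //         SetCurrentInsertPoint(GetEntryBlock()); // allocas belong in the entry block
+        //         auto result = body();                   // emit the alloca
+        //         SetCurrentInsertPoint(oldPos);          // resume normal emission
+        //         return result;
+        //     }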
        auto name = _locals.GetUniqueName(namePrefix);
-        auto result = GetEmitter().StackAllocate(type, name);
-        scope.ExitScope();
+        auto result = ExecuteInEntryBlock([this, type, name] {
+            return GetEmitter().StackAllocate(type, name);
+        });
 
         _locals.Add(name, result);
         return result;
@@ -779,10 +768,10 @@ namespace emitters
 
     llvm::AllocaInst* IRFunctionEmitter::Variable(LLVMType type, const std::string& namePrefix)
     {
-        EntryBlockScope scope(*this);
         auto name = _locals.GetUniqueName(namePrefix);
-        auto result = GetEmitter().StackAllocate(type, name);
-        scope.ExitScope();
+        auto result = ExecuteInEntryBlock([this, type, name] {
+            return GetEmitter().StackAllocate(type, name);
+        });
 
         _locals.Add(name, result);
         return result;
@@ -790,9 +779,9 @@ namespace emitters
 
     llvm::AllocaInst* IRFunctionEmitter::EmittedVariable(VariableType type, const std::string& name)
     {
-        EntryBlockScope scope(*this);
-        auto result = GetEmitter().StackAllocate(type, name);
-        scope.ExitScope();
+        auto result = ExecuteInEntryBlock([this, type, name] {
+            return GetEmitter().StackAllocate(type, name);
+        });
 
         _locals.Add(name, result);
         return result;
@@ -800,38 +789,16 @@ namespace emitters
 
     llvm::AllocaInst* IRFunctionEmitter::Variable(VariableType type, int size)
     {
-        EntryBlockScope scope(*this);
-        auto alloca = GetEmitter().StackAllocate(type, size);
-        scope.ExitScope();
-
-        return alloca;
-    }
-
-    llvm::AllocaInst* IRFunctionEmitter::Variable(VariableType type, int rows, int columns)
-    {
-        EntryBlockScope scope(*this);
-        auto alloca = GetEmitter().StackAllocate(type, rows, columns);
-        scope.ExitScope();
-
-        return alloca;
+        return ExecuteInEntryBlock([this, type, size] {
+            return GetEmitter().StackAllocate(type, size);
+        });
     }
 
     llvm::AllocaInst* IRFunctionEmitter::Variable(LLVMType type, int size)
     {
-        EntryBlockScope scope(*this);
-        auto alloca = GetEmitter().StackAllocate(type, size);
-        scope.ExitScope();
-
-        return alloca;
-    }
-
-    llvm::AllocaInst* IRFunctionEmitter::Variable(LLVMType type, int rows, int columns)
-    {
-        EntryBlockScope scope(*this);
-        auto alloca = GetEmitter().StackAllocate(type, rows, columns);
-        scope.ExitScope();
-
-        return alloca;
+        return ExecuteInEntryBlock([this, type, size] {
+            return GetEmitter().StackAllocate(type, size);
+        });
     }
 
     LLVMValue IRFunctionEmitter::Load(LLVMValue pPointer)
@@ -1072,13 +1039,18 @@ namespace emitters
     // For loops
     //
     void IRFunctionEmitter::For(int count, std::function<void(IRFunctionEmitter&, IRLocalScalar)> body)
+    {
+        For(std::string{}, count, body);
+    }
+
+    void IRFunctionEmitter::For(const std::string& tag, int count, std::function<void(IRFunctionEmitter&, IRLocalScalar)> body)
     {
         if (count < 0)
         {
             throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "For loop count must be >= 0");
         }
 
-        auto loop = IRForLoopEmitter(*this);
+        auto loop = IRForLoopEmitter(*this, tag);
         loop.Begin(count);
         body(*this, LocalScalar(loop.LoadIterationVariable()));
         loop.End();
@@ -1086,33 +1058,53 @@ namespace emitters
 
     void IRFunctionEmitter::For(LLVMValue count, std::function<void(IRFunctionEmitter&, IRLocalScalar)> body)
     {
-        auto loop = IRForLoopEmitter(*this);
+        For(std::string{}, count, body);
+    }
+
+    void IRFunctionEmitter::For(const std::string& tag, LLVMValue count, std::function<void(IRFunctionEmitter&, IRLocalScalar)> body)
+    {
+        auto loop = IRForLoopEmitter(*this, tag);
         loop.Begin(count);
         body(*this, LocalScalar(loop.LoadIterationVariable()));
         loop.End();
     }
 
     void IRFunctionEmitter::For(int beginValue, int endValue, std::function<void(IRFunctionEmitter&, IRLocalScalar)> body)
+    {
+        For(std::string{}, beginValue, endValue, body);
+    }
+
+    void IRFunctionEmitter::For(const std::string& tag, int beginValue, int endValue, std::function<void(IRFunctionEmitter&, IRLocalScalar)> body)
     {
         if (endValue < beginValue)
         {
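+            // The `tag` threaded through these For/While overloads ends up prefixing the
+            // loop's basic-block labels (see IRForLoopEmitter::CreateBlocks below), which
+            // makes large emitted functions much easier to read. Roughly:
+            //
+            //     fn.For("row_", 0, rows, [](IRFunctionEmitter& fn, auto i) { /* ... */ });
+            //     // emits blocks named roughly: row_loop.init, row_loop.cond,
+            //     //                             row_loop.body, row_loop.inc, row_loop.after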
            throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "For loop begin must be <= end");
        }
 
-        For(beginValue, endValue, 1, body);
+        For(tag, beginValue, endValue, 1, body);
    }
 
     void IRFunctionEmitter::For(LLVMValue beginValue, LLVMValue endValue, std::function<void(IRFunctionEmitter&, IRLocalScalar)> body)
     {
-        For(beginValue, endValue, Literal(1), body);
+        For(std::string{}, beginValue, endValue, body);
+    }
+
+    void IRFunctionEmitter::For(const std::string& tag, LLVMValue beginValue, LLVMValue endValue, std::function<void(IRFunctionEmitter&, IRLocalScalar)> body)
+    {
+        For(tag, beginValue, endValue, Literal(1), body);
    }
 
     void IRFunctionEmitter::For(int beginValue, int endValue, int increment, std::function<void(IRFunctionEmitter&, IRLocalScalar)> body)
+    {
+        For(std::string{}, beginValue, endValue, increment, body);
+    }
+
+    void IRFunctionEmitter::For(const std::string& tag, int beginValue, int endValue, int increment, std::function<void(IRFunctionEmitter&, IRLocalScalar)> body)
     {
         if (endValue < beginValue)
         {
             throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "For loop begin must be <= end");
         }
 
-        auto loop = IRForLoopEmitter(*this);
+        auto loop = IRForLoopEmitter(*this, tag);
         loop.Begin(beginValue, endValue, increment);
         body(*this, LocalScalar(loop.LoadIterationVariable()));
         loop.End();
@@ -1120,7 +1112,12 @@ namespace emitters
 
     void IRFunctionEmitter::For(LLVMValue beginValue, LLVMValue endValue, LLVMValue increment, std::function<void(IRFunctionEmitter&, IRLocalScalar)> body)
     {
-        auto loop = IRForLoopEmitter(*this);
+        For(std::string{}, beginValue, endValue, increment, body);
+    }
+
+    void IRFunctionEmitter::For(const std::string& tag, LLVMValue beginValue, LLVMValue endValue, LLVMValue increment, std::function<void(IRFunctionEmitter&, IRLocalScalar)> body)
+    {
+        auto loop = IRForLoopEmitter(*this, tag);
         loop.Begin(beginValue, endValue, increment);
         body(*this, LocalScalar(loop.LoadIterationVariable()));
         loop.End();
@@ -1132,15 +1129,30 @@ namespace emitters
 
     void IRFunctionEmitter::For(const std::vector<ConstLoopRange>& ranges, MultiDimForLoopBodyFunction body)
     {
-        emitters::MultiDimFor(*this, ranges, {}, body);
+        For(std::string{}, ranges, body);
+    }
+
+    void IRFunctionEmitter::For(const std::string& tag, const std::vector<ConstLoopRange>& ranges, MultiDimForLoopBodyFunction body)
+    {
+        emitters::MultiDimFor(*this, ranges, {}, body, tag);
    }
 
     void IRFunctionEmitter::For(const std::vector<LoopRange>& ranges, MultiDimForLoopBodyFunction body)
     {
-        emitters::MultiDimFor(*this, ranges, {}, body);
+        For(std::string{}, ranges, body);
+    }
+
+    void IRFunctionEmitter::For(const std::string& tag, const std::vector<LoopRange>& ranges, MultiDimForLoopBodyFunction body)
+    {
+        emitters::MultiDimFor(*this, ranges, {}, body, tag);
    }
 
     void IRFunctionEmitter::For(ConstTiledLoopRange range, TiledForLoopBodyFunction body)
+    {
+        For(std::string{}, range, body);
+    }
+
+    void IRFunctionEmitter::For(const std::string& tag, ConstTiledLoopRange range, TiledForLoopBodyFunction body)
     {
         auto stepSize = range.blockSize;
         auto numFullBlocks = (range.end - range.begin) / stepSize;
@@ -1150,7 +1162,7 @@ namespace emitters
         if (numFullBlocks > 0)
         {
             // For(range.begin, fullBlocksEnd, stepSize, [stepSize, body](IRFunctionEmitter function, auto index) {
-            For(numFullBlocks, [stepSize, range, body](IRFunctionEmitter function, auto blockIndex) {
+            For(tag, numFullBlocks, [stepSize, range, body](IRFunctionEmitter function, auto blockIndex) {
                 auto index = range.begin + blockIndex * stepSize;
                 body(function, { index, index + stepSize, function.LocalScalar(stepSize), blockIndex });
             });
@@ -1164,6 +1176,11 @@ namespace emitters
    }
 
     void IRFunctionEmitter::For(TiledLoopRange range, TiledForLoopBodyFunction body)
+    {
+        For(std::string{}, range, body);
+    }
+
+    void IRFunctionEmitter::For(const std::string& tag, TiledLoopRange range, TiledForLoopBodyFunction body)
     {
         if (!range.blockSize.IsConstantInt())
         {
@@ -1175,8 +1192,8 @@ namespace emitters
         auto fullBlocksEnd = range.begin + (numFullBlocks * stepSize);
 
         // full blocks
-        If(numFullBlocks > 0, [numFullBlocks, stepSize, range, body](auto& function) {
-            function.For(numFullBlocks, range.blockSize, [stepSize, range, body](IRFunctionEmitter function, auto blockIndex) {
+        If(numFullBlocks > 0, [numFullBlocks, stepSize, range, body, tag](auto& function) {
+            function.For(tag, numFullBlocks, range.blockSize, [stepSize, range, body](IRFunctionEmitter function, auto blockIndex) {
                 auto index = range.begin + blockIndex * stepSize;
                 body(function, { index, index + range.blockSize, range.blockSize, blockIndex });
             });
@@ -1190,12 +1207,22 @@ namespace emitters
 
     void IRFunctionEmitter::For(const std::vector<ConstTiledLoopRange>& ranges, TiledMultiDimForLoopBodyFunction body)
     {
-        emitters::TiledMultiDimFor(*this, ranges, {}, body);
+        For(std::string{}, ranges, body);
+    }
+
+    void IRFunctionEmitter::For(const std::string& tag, const std::vector<ConstTiledLoopRange>& ranges, TiledMultiDimForLoopBodyFunction body)
+    {
+        emitters::TiledMultiDimFor(*this, ranges, {}, body, tag);
    }
 
     void IRFunctionEmitter::For(const std::vector<TiledLoopRange>& ranges, TiledMultiDimForLoopBodyFunction body)
     {
-        emitters::TiledMultiDimFor(*this, ranges, {}, body);
+        For(std::string{}, ranges, body);
+    }
+
+    void IRFunctionEmitter::For(const std::string& tag, const std::vector<TiledLoopRange>& ranges, TiledMultiDimForLoopBodyFunction body)
+    {
+        emitters::TiledMultiDimFor(*this, ranges, {}, body, tag);
    }
 
     //
@@ -1251,7 +1278,12 @@ namespace emitters
 
     void IRFunctionEmitter::While(LLVMValue pTestValuePointer, std::function<void(IRFunctionEmitter&)> body)
     {
-        auto loop = IRWhileLoopEmitter(*this);
+        While(std::string{}, pTestValuePointer, body);
+    }
+
+    void IRFunctionEmitter::While(const std::string& tag, LLVMValue pTestValuePointer, std::function<void(IRFunctionEmitter&)> body)
+    {
+        auto loop = IRWhileLoopEmitter(*this, tag);
         loop.Begin(pTestValuePointer);
         body(*this);
         loop.End();
@@ -1259,7 +1291,12 @@ namespace emitters
 
     void IRFunctionEmitter::While(std::function<LLVMValue(IRFunctionEmitter&)> condition, WhileLoopBodyFunction body)
     {
-        auto loop = IRWhileLoopEmitter(*this);
+        While(std::string{}, condition, body);
+    }
+
+    void IRFunctionEmitter::While(const std::string& tag, std::function<LLVMValue(IRFunctionEmitter&)> condition, WhileLoopBodyFunction body)
+    {
+        auto loop = IRWhileLoopEmitter(*this, tag);
        loop.Begin(condition);
        body(*this);
        loop.End();
@@ -1372,6 +1409,12 @@ namespace emitters
         return Call(PrintfFnName, arguments);
     }
 
+    LLVMValue IRFunctionEmitter::Printf(std::vector<LLVMValue> arguments)
+    {
+        EnsurePrintf();
+        return Call(PrintfFnName, arguments);
+    }
+
     LLVMValue IRFunctionEmitter::Printf(const std::string& format, std::initializer_list<LLVMValue> arguments)
     {
         EnsurePrintf();
@@ -1732,6 +1775,11 @@ namespace emitters
        }
    }
 
+    void IRFunctionEmitter::DebugBreak()
+    {
+        Call(GetModule().GetIntrinsic(llvm::Intrinsic::debugtrap), std::vector<LLVMValue>{});
+    }
+
     //
     // Information about the current function being emitted
     //
@@ -1798,6 +1846,34 @@ namespace emitters
         InsertMetadata(c_swigFunctionTagName);
     }
 
+    void IRFunctionEmitter::SetInlineState(FunctionInlining inlineState)
+    {
+        SetInlineState(_pFunction, inlineState);
+    }
+
+    void IRFunctionEmitter::SetInlineState(LLVMFunction function, FunctionInlining inlineState)
+    {
+        for (auto attr : { llvm::Attribute::AttrKind::AlwaysInline, llvm::Attribute::AttrKind::InlineHint, llvm::Attribute::AttrKind::NoInline })
+        {
+            function->removeFnAttr(attr);
+        }
+
+        switch (inlineState)
+        {
+        case FunctionInlining::always:
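+            // Illustrative use of this new inlining control (the function it is applied
+            // to is hypothetical):
+            //
+            //     fn.SetInlineState(FunctionInlining::never); // keep this helper out-of-line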
+            function->addFnAttr(llvm::Attribute::AttrKind::AlwaysInline);
+            break;
+        case FunctionInlining::prefer:
+            function->addFnAttr(llvm::Attribute::AttrKind::InlineHint);
+            break;
+        case FunctionInlining::never:
+            function->addFnAttr(llvm::Attribute::AttrKind::NoInline);
+            break;
+        default:
+            break;
+        }
+    }
+
     //
     // Internal functions
     //
diff --git a/libraries/emitters/src/IRIfEmitter.cpp b/libraries/emitters/src/IRIfEmitter.cpp
index a94e1c36b..4b4959f2a 100644
--- a/libraries/emitters/src/IRIfEmitter.cpp
+++ b/libraries/emitters/src/IRIfEmitter.cpp
@@ -52,12 +52,12 @@ namespace emitters
 
     // Move ctor and assignment op are needed to explicitly swap out values,
     // since the default behavior for "moving" fundamental types (aka, pointers) is to do a bitwise-copy
-    IRIfEmitter::IRIfEmitter(IRIfEmitter&& other)
+    IRIfEmitter::IRIfEmitter(IRIfEmitter&& other) noexcept
     {
         *this = std::move(other);
     }
 
-    IRIfEmitter& IRIfEmitter::operator=(IRIfEmitter&& other)
+    IRIfEmitter& IRIfEmitter::operator=(IRIfEmitter&& other) noexcept
     {
         if (this != &other)
         {
diff --git a/libraries/emitters/src/IRLocalValue.cpp b/libraries/emitters/src/IRLocalValue.cpp
index 87aa96737..e4510de76 100644
--- a/libraries/emitters/src/IRLocalValue.cpp
+++ b/libraries/emitters/src/IRLocalValue.cpp
@@ -8,6 +8,7 @@
 
 #include "IRLocalValue.h"
 #include "IRFunctionEmitter.h"
+#include "IRLocalScalar.h"
 
 #include
 
@@ -87,5 +88,29 @@ namespace emitters
         this->value = value;
         return *this;
     }
+
+    //
+    // IRLocalPointer
+    //
+    IRLocalValue IRLocalPointer::Load() const
+    {
+        return { function, function.Load(value) };
+    }
+
+    IRLocalPointer IRLocalPointer::Offset(int offset) const
+    {
+        return { function, function.PointerOffset(value, offset) };
+    }
+
+    IRLocalPointer IRLocalPointer::Offset(LLVMValue offset) const
+    {
+        return { function, function.PointerOffset(value, offset) };
+    }
+
+    IRLocalPointer IRLocalPointer::Offset(const IRLocalScalar& offset) const
+    {
+        return { function, function.PointerOffset(value, offset) };
+    }
+
 } // namespace emitters
} // namespace ell
diff --git a/libraries/emitters/src/IRLoopEmitter.cpp b/libraries/emitters/src/IRLoopEmitter.cpp
index 7db6ce16e..dd8997312 100644
--- a/libraries/emitters/src/IRLoopEmitter.cpp
+++ b/libraries/emitters/src/IRLoopEmitter.cpp
@@ -19,6 +19,33 @@ namespace emitters
     const std::string LoopIncBlockName = "loop.inc";
     const std::string LoopAfterBlockName = "loop.after";
 
+    namespace
+    {
+        std::array<llvm::Metadata*, 2> GenerateUnrollMetadata(llvm::LLVMContext& context)
+        {
+            return { llvm::MDString::get(context, "llvm.loop.unroll.enable"),
+                     llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
+                         llvm::Type::getInt1Ty(context), true)) };
+        }
+
+        std::array<llvm::Metadata*, 2> GenerateVectorizeMetadata(llvm::LLVMContext& context)
+        {
+            return { llvm::MDString::get(context, "llvm.loop.vectorize.enable"),
+                     llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
+                         llvm::Type::getInt1Ty(context), true)) };
+        }
+
+        std::array<llvm::Metadata*, 2> GenerateVectorizeFollowupMetadata(llvm::LLVMContext& context)
+        {
+            return { llvm::MDString::get(context, "llvm.loop.vectorize.followup_vectorized"), llvm::MDNode::get(context, GenerateUnrollMetadata(context)) };
+        }
+
+        std::array<llvm::Metadata*, 1> GenerateDisableNonforcedMetadata(llvm::LLVMContext& context)
+        {
+            return { llvm::MDString::get(context, "llvm.loop.disable_nonforced") };
+        }
+    } // namespace
+
     IRLoopEmitter::IRLoopEmitter(IRFunctionEmitter& functionEmitter) :
         _functionEmitter(functionEmitter)
     {}
@@ -32,20 +59,22 @@ namespace emitters
         auto tempNode = llvm::MDNode::getTemporary(context, {});
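+        // When both flags are requested, the helpers above nest the unroll request
+        // under llvm.loop.vectorize.followup_vectorized, so it is the vectorized loop
+        // (not the scalar original) that gets unrolled. The resulting loop ID looks
+        // roughly like this (illustrative IR, not emitted verbatim):
+        //
+        //     !0 = distinct !{!0, !1, !2, !3}  ; self-referential loop ID
+        //     !1 = !{!"llvm.loop.vectorize.enable", i1 true}
+        //     !2 = !{!"llvm.loop.vectorize.followup_vectorized", !4}
+        //     !3 = !{!"llvm.loop.disable_nonforced"}
+        //     !4 = !{!"llvm.loop.unroll.enable", i1 true}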
metadataElements.push_back(tempNode.get()); - if (unroll) + if (vectorize) { - llvm::Metadata* vals[] = { llvm::MDString::get(context, "llvm.loop.unroll.enable"), - llvm::ConstantAsMetadata::get(llvm::ConstantInt::get( - llvm::Type::getInt1Ty(context), true)) }; - metadataElements.push_back(llvm::MDNode::get(context, vals)); + metadataElements.push_back(llvm::MDNode::get(context, GenerateVectorizeMetadata(context))); + if (unroll) + { + metadataElements.push_back(llvm::MDNode::get(context, GenerateVectorizeFollowupMetadata(context))); + } + } + else if (unroll) + { + metadataElements.push_back(llvm::MDNode::get(context, GenerateUnrollMetadata(context))); } - if (vectorize) + if (unroll || vectorize) { - llvm::Metadata* vals[] = { llvm::MDString::get(context, "llvm.loop.vectorize.enable"), - llvm::ConstantAsMetadata::get(llvm::ConstantInt::get( - llvm::Type::getInt1Ty(context), true)) }; - metadataElements.push_back(llvm::MDNode::get(context, vals)); + metadataElements.push_back(llvm::MDNode::get(context, GenerateDisableNonforcedMetadata(context))); } auto loopID = llvm::MDNode::get(context, metadataElements); @@ -62,16 +91,18 @@ namespace emitters // _pAfterBlock -- branch to this block when done // - IRForLoopEmitter::IRForLoopEmitter(IRFunctionEmitter& functionEmitter) : - IRLoopEmitter(functionEmitter) {} + IRForLoopEmitter::IRForLoopEmitter(IRFunctionEmitter& functionEmitter, const std::string& tag) : + IRLoopEmitter(functionEmitter), + _tag(tag) + {} void IRForLoopEmitter::CreateBlocks() { - _pInitializationBlock = _functionEmitter.Block(LoopInitBlockName); - _pConditionBlock = _functionEmitter.Block(LoopConditionBlockName); - _pBodyBlock = _functionEmitter.Block(LoopBodyBlockName); - _pIncrementBlock = _functionEmitter.Block(LoopIncBlockName); - _pAfterBlock = _functionEmitter.Block(LoopAfterBlockName); + _pInitializationBlock = _functionEmitter.Block(_tag + LoopInitBlockName); + _pConditionBlock = _functionEmitter.Block(_tag + LoopConditionBlockName); + _pBodyBlock = _functionEmitter.Block(_tag + LoopBodyBlockName); + _pIncrementBlock = _functionEmitter.Block(_tag + LoopIncBlockName); + _pAfterBlock = _functionEmitter.Block(_tag + LoopAfterBlockName); } llvm::BasicBlock* IRForLoopEmitter::Begin(int repeatCount) @@ -123,9 +154,9 @@ namespace emitters _functionEmitter.Branch(_pConditionBlock); _functionEmitter.SetCurrentBlock(_pConditionBlock); auto branchInst = _functionEmitter.Branch(comparison, _functionEmitter.Load(_pIterationVariable), pTestValue, _pBodyBlock, _pAfterBlock); - + bool unroll = false; - bool vectorize = false; + bool vectorize = true; AddLoopMetadata(branchInst, unroll, vectorize); } @@ -168,15 +199,17 @@ namespace emitters // // IRWhileLoopEmitter // - IRWhileLoopEmitter::IRWhileLoopEmitter(IRFunctionEmitter& functionEmitter) : - IRLoopEmitter(functionEmitter) {} + IRWhileLoopEmitter::IRWhileLoopEmitter(IRFunctionEmitter& functionEmitter, const std::string& tag) : + IRLoopEmitter(functionEmitter), + _tag(tag) + {} void IRWhileLoopEmitter::CreateBlocks() { - _pInitializationBlock = _functionEmitter.Block(LoopInitBlockName); - _pConditionBlock = _functionEmitter.Block(LoopConditionBlockName); - _pBodyBlock = _functionEmitter.Block(LoopBodyBlockName); - _pAfterBlock = _functionEmitter.Block(LoopAfterBlockName); + _pInitializationBlock = _functionEmitter.Block(_tag + LoopInitBlockName); + _pConditionBlock = _functionEmitter.Block(_tag + LoopConditionBlockName); + _pBodyBlock = _functionEmitter.Block(_tag + LoopBodyBlockName); + _pAfterBlock = 
_functionEmitter.Block(_tag + LoopAfterBlockName); } llvm::BasicBlock* IRWhileLoopEmitter::Begin(LLVMValue pTestValuePointer) @@ -226,8 +259,8 @@ namespace emitters _functionEmitter.SetCurrentBlock(_pConditionBlock); auto conditionValue = condition(_functionEmitter); auto branchInst = _functionEmitter.Branch(conditionValue, - _pBodyBlock, - _pAfterBlock); + _pBodyBlock, + _pAfterBlock); bool unroll = false; bool vectorize = false; AddLoopMetadata(branchInst, unroll, vectorize); diff --git a/libraries/emitters/src/IRModuleEmitter.cpp b/libraries/emitters/src/IRModuleEmitter.cpp index c9fec8ed9..8cf9a1a72 100644 --- a/libraries/emitters/src/IRModuleEmitter.cpp +++ b/libraries/emitters/src/IRModuleEmitter.cpp @@ -20,18 +20,11 @@ #include #include -#include #include -#include -#include #include -#include -#include #include -#include #include #include -#include #include #include @@ -52,13 +45,15 @@ namespace emitters IRModuleEmitter::IRModuleEmitter(const std::string& moduleName, const CompilerOptions& parameters) : _llvmContext(std::make_unique()), _llvmModule(std::make_unique(moduleName, *_llvmContext)), - _emitter(*this, *_llvmContext), - _runtime(*this), - _threadPool(*this), - _profiler(*this, parameters.profile) + _emitter(new IREmitter(*_llvmContext, *_llvmModule)), + _runtime(new IRRuntime(*this)), + _threadPool(new IRThreadPool(*this)), + _profiler(new IRProfiler(*this, parameters.profile)) { InitializeLLVM(); - InitializeGlobalPassRegistry(); + + // Create a diagnostic handler to record if there was an error + _diagnosticHandler = std::unique_ptr(new IRDiagnosticHandler(*_llvmContext)); SetCompilerOptions(parameters); if (GetCompilerOptions().includeDiagnosticInfo) @@ -66,7 +61,7 @@ namespace emitters DeclarePrintf(); } - _profiler.Init(); + _profiler->Init(); } void IRModuleEmitter::SetCompilerOptions(const CompilerOptions& parameters) @@ -135,7 +130,7 @@ namespace emitters IRFunctionEmitter& IRModuleEmitter::BeginFunction(const std::string& functionName, VariableType returnType) { _functions[functionName] = FunctionDeclaration(functionName, returnType); - return BeginFunction(functionName, _emitter.Type(returnType)); + return BeginFunction(functionName, GetIREmitter().Type(returnType)); } IRFunctionEmitter& IRModuleEmitter::BeginFunction(const std::string& functionName, LLVMType returnType) @@ -151,21 +146,21 @@ namespace emitters fake.push_back({ "", t }); } _functions[functionName] = FunctionDeclaration(functionName, returnType, fake); - return BeginFunction(functionName, _emitter.Type(returnType), _emitter.GetLLVMTypes(args)); + return BeginFunction(functionName, GetIREmitter().Type(returnType), GetIREmitter().GetLLVMTypes(args)); } IRFunctionEmitter& IRModuleEmitter::BeginFunction(const std::string& functionName, VariableType returnType, const NamedVariableTypeList& args) { _functions[functionName] = FunctionDeclaration(functionName, returnType, args); - return BeginFunction(functionName, _emitter.Type(returnType), args); + return BeginFunction(functionName, GetIREmitter().Type(returnType), args); } IRFunctionEmitter& IRModuleEmitter::BeginFunction(const std::string& functionName, VariableType returnType, const FunctionArgumentList& args) { _functions[functionName] = FunctionDeclaration(functionName, returnType, args); Log() << "Begin emitting IR for function " << functionName << EOL; - auto currentPos = _emitter.GetCurrentInsertPoint(); - IRFunctionEmitter newFunction = Function(functionName, _emitter.Type(returnType), args, false); + auto currentPos = 
GetIREmitter().GetCurrentInsertPoint(); + IRFunctionEmitter newFunction = Function(functionName, GetIREmitter().Type(returnType), args, false); _functionStack.emplace(newFunction, currentPos); return _functionStack.top().first; } @@ -177,7 +172,7 @@ namespace emitters _functions[functionName] = FunctionDeclaration(functionName, VariableType::Custom, args); } Log() << "Begin emitting IR for function " << functionName << EOL; - auto currentPos = _emitter.GetCurrentInsertPoint(); + auto currentPos = GetIREmitter().GetCurrentInsertPoint(); IRFunctionEmitter newFunction = Function(functionName, returnType, args, false); _functionStack.emplace(newFunction, currentPos); return _functionStack.top().first; @@ -195,7 +190,7 @@ namespace emitters _functions[functionName] = FunctionDeclaration(functionName, ToVariableType(returnType), argInfo); } Log() << "Begin emitting IR for function " << functionName << EOL; - auto currentPos = _emitter.GetCurrentInsertPoint(); + auto currentPos = GetIREmitter().GetCurrentInsertPoint(); IRFunctionEmitter newFunction = Function(functionName, returnType, argTypes, false); _functionStack.emplace(newFunction, currentPos); return _functionStack.top().first; @@ -213,7 +208,7 @@ namespace emitters _functions[functionName] = FunctionDeclaration(functionName, ToVariableType(returnType), argInfo); } Log() << "Begin emitting IR for function " << functionName << EOL; - auto currentPos = _emitter.GetCurrentInsertPoint(); + auto currentPos = GetIREmitter().GetCurrentInsertPoint(); IRFunctionEmitter newFunction = Function(functionName, returnType, args, false); _functionStack.emplace(newFunction, currentPos); return _functionStack.top().first; @@ -266,7 +261,7 @@ namespace emitters currentFunction.ConcatRegions(); currentFunction.CompleteFunction(); } - _emitter.SetCurrentInsertPoint(previousPos); + GetIREmitter().SetCurrentInsertPoint(previousPos); Log() << "End emitting of function " << currentFunction.GetFunctionName() << EOL; } @@ -375,10 +370,10 @@ namespace emitters std::vector metadataElements; for (const auto& value : values) { - metadataElements.push_back({ llvm::MDString::get(_emitter.GetContext(), value) }); + metadataElements.push_back({ llvm::MDString::get(GetIREmitter().GetContext(), value) }); } - auto metadataNode = llvm::MDNode::get(_emitter.GetContext(), metadataElements); + auto metadataNode = llvm::MDNode::get(GetIREmitter().GetContext(), metadataElements); auto metadata = _llvmModule->getOrInsertNamedMetadata(tag); metadata->addOperand(metadataNode); } @@ -402,10 +397,10 @@ namespace emitters std::vector metadataElements; for (const auto& value : values) { - metadataElements.push_back({ llvm::MDString::get(_emitter.GetContext(), value) }); + metadataElements.push_back({ llvm::MDString::get(GetIREmitter().GetContext(), value) }); } - auto metadataNode = llvm::MDNode::get(_emitter.GetContext(), metadataElements); + auto metadataNode = llvm::MDNode::get(GetIREmitter().GetContext(), metadataElements); function->setMetadata(tag, metadataNode); } @@ -449,49 +444,52 @@ namespace emitters llvm::GlobalVariable* IRModuleEmitter::Constant(VariableType type, const std::string& name, double value) { - return AddGlobal(name, _emitter.Type(type), _emitter.Literal(value), true); + return AddGlobal(name, GetIREmitter().Type(type), GetIREmitter().Literal(value), true); } - llvm::GlobalVariable* IRModuleEmitter::Global(VariableType type, const std::string& name) + llvm::GlobalVariable* IRModuleEmitter::Global(VariableType type, const std::string& name, bool isThreadLocal) { - 
return AddGlobal(name, _emitter.Type(type), _emitter.Zero(type), false); + return AddGlobal(name, GetIREmitter().Type(type), GetIREmitter().Zero(type), false, isThreadLocal); } - llvm::GlobalVariable* IRModuleEmitter::Global(LLVMType pType, const std::string& name) + llvm::GlobalVariable* IRModuleEmitter::Global(LLVMType pType, const std::string& name, bool isThreadLocal) { auto initializer = ZeroInitializer(pType); - return AddGlobal(name, pType, initializer, false); + return AddGlobal(name, pType, initializer, false, isThreadLocal); } - llvm::GlobalVariable* IRModuleEmitter::GlobalPointer(const std::string& name, VariableType type) + llvm::GlobalVariable* IRModuleEmitter::GlobalPointer(const std::string& name, VariableType type, bool isThreadLocal) { - llvm::PointerType* pointerType = _emitter.Type(type)->getPointerTo(); - return AddGlobal(name, pointerType, _emitter.NullPointer(pointerType), false); + llvm::PointerType* pointerType = GetIREmitter().Type(type)->getPointerTo(); + return AddGlobal(name, pointerType, GetIREmitter().NullPointer(pointerType), false, isThreadLocal); } - llvm::GlobalVariable* IRModuleEmitter::GlobalArray(VariableType type, const std::string& name, const size_t size) + llvm::GlobalVariable* IRModuleEmitter::GlobalArray(VariableType type, const std::string& name, const size_t size, bool isThreadLocal) { - llvm::ArrayType* pArrayType = _emitter.ArrayType(type, size); - return AddGlobal(name, pArrayType, ZeroInitializer(pArrayType), false); + llvm::ArrayType* pArrayType = GetIREmitter().ArrayType(type, size); + return AddGlobal(name, pArrayType, ZeroInitializer(pArrayType), false, isThreadLocal); } - llvm::GlobalVariable* IRModuleEmitter::GlobalArray(const std::string& name, LLVMType pType, const size_t size) + llvm::GlobalVariable* IRModuleEmitter::GlobalArray(const std::string& name, LLVMType pType, const size_t size, bool isThreadLocal) { assert(pType != nullptr); llvm::ArrayType* pArrayType = llvm::ArrayType::get(pType, size); - return AddGlobal(name, pArrayType, ZeroInitializer(pArrayType), false); + return AddGlobal(name, pArrayType, ZeroInitializer(pArrayType), false, isThreadLocal); } // This function has the actual implementation for all the above Global/GlobalArray() methods - llvm::GlobalVariable* IRModuleEmitter::AddGlobal(const std::string& name, LLVMType pType, llvm::Constant* pInitial, bool isConst) + llvm::GlobalVariable* IRModuleEmitter::AddGlobal(const std::string& name, LLVMType pType, llvm::Constant* pInitial, bool isConst, bool isThreadLocal) { + CompilerOptions options = GetCompilerOptions(); _llvmModule->getOrInsertGlobal(name, pType); auto global = _llvmModule->getNamedGlobal(name); + global->setAlignment(options.globalValueAlignment); global->setInitializer(pInitial); global->setConstant(isConst); global->setExternallyInitialized(false); global->setLinkage(llvm::GlobalValue::LinkageTypes::InternalLinkage); + global->setThreadLocal(isThreadLocal); assert(llvm::isa(global)); return llvm::cast(global); } @@ -503,7 +501,7 @@ namespace emitters LLVMFunction IRModuleEmitter::DeclareFunction(const std::string& name, VariableType returnType) { _functions[name] = FunctionDeclaration(name, returnType); - return _emitter.DeclareFunction(GetLLVMModule(), name, returnType); + return GetIREmitter().DeclareFunction(GetLLVMModule(), name, returnType); } LLVMFunction IRModuleEmitter::DeclareFunction(const std::string& name, VariableType returnType, const VariableTypeList& arguments) @@ -516,18 +514,19 @@ namespace emitters // record this function definition 
in our local _functions so that these definitions can be found via GetFunctionNames // and GetCallbackFunctionNames _functions[name] = FunctionDeclaration(name, returnType, fake); - return _emitter.DeclareFunction(GetLLVMModule(), name, returnType, arguments); + return GetIREmitter().DeclareFunction(GetLLVMModule(), name, returnType, arguments); } LLVMFunction IRModuleEmitter::DeclareFunction(const std::string& name, VariableType returnType, const NamedVariableTypeList& arguments) { _functions[name] = FunctionDeclaration(name, returnType, arguments); - return _emitter.DeclareFunction(GetLLVMModule(), name, returnType, arguments); + return GetIREmitter().DeclareFunction(GetLLVMModule(), name, returnType, arguments); } LLVMFunction IRModuleEmitter::DeclareFunction(const std::string& name, llvm::FunctionType* functionType) { - return _emitter.DeclareFunction(GetLLVMModule(), name, functionType); + // TODO: add to _functions list + return GetIREmitter().DeclareFunction(GetLLVMModule(), name, functionType); } IRFunctionEmitter IRModuleEmitter::Function(const std::string& name, VariableType returnType, bool isPublic) @@ -548,7 +547,8 @@ namespace emitters IRFunctionEmitter IRModuleEmitter::Function(const std::string& name, VariableType returnType, const NamedVariableTypeList& arguments, bool isPublic) { - LLVMFunction pFunction = _emitter.Function(GetLLVMModule(), name, returnType, Linkage(isPublic), arguments); + // TODO: add this function to the _functions list?? + LLVMFunction pFunction = GetIREmitter().Function(GetLLVMModule(), name, returnType, Linkage(isPublic), arguments); if (pFunction == nullptr) { throw EmitterException(EmitterError::functionNotFound); @@ -560,7 +560,8 @@ namespace emitters IRFunctionEmitter IRModuleEmitter::Function(const std::string& name, LLVMType returnType, const NamedVariableTypeList& arguments, bool isPublic) { - LLVMFunction pFunction = _emitter.Function(GetLLVMModule(), name, returnType, Linkage(isPublic), arguments); + // TODO: add this function to the _functions list?? + LLVMFunction pFunction = GetIREmitter().Function(GetLLVMModule(), name, returnType, Linkage(isPublic), arguments); if (pFunction == nullptr) { throw EmitterException(EmitterError::functionNotFound); @@ -572,7 +573,8 @@ namespace emitters IRFunctionEmitter IRModuleEmitter::Function(const std::string& name, LLVMType returnType, const FunctionArgumentList& arguments, bool isPublic) { - LLVMFunction pFunction = _emitter.Function(GetLLVMModule(), name, returnType, Linkage(isPublic), arguments); + // TODO: add this function to the _functions list?? + LLVMFunction pFunction = GetIREmitter().Function(GetLLVMModule(), name, returnType, Linkage(isPublic), arguments); if (pFunction == nullptr) { throw EmitterException(EmitterError::functionNotFound); @@ -584,7 +586,8 @@ namespace emitters IRFunctionEmitter IRModuleEmitter::Function(const std::string& name, VariableType returnType, const VariableTypeList* pArguments, bool isPublic) { - LLVMFunction pFunction = _emitter.Function(GetLLVMModule(), name, returnType, Linkage(isPublic), pArguments); + // TODO: add this function to the _functions list?? 
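+    // Usage sketch for the thread-local globals added above (names and sizes are
+    // illustrative). Each thread sees its own copy of the variable; this pairs with
+    // setEmulatedTLS(true) on the JIT side for targets without native TLS support:
+    //
+    //     auto counter = module.Global(VariableType::Int32, "tls_counter", /*isThreadLocal*/ true);
+    //     auto scratch = module.GlobalArray(VariableType::Double, "tls_scratch", 256, /*isThreadLocal*/ true);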
+ LLVMFunction pFunction = GetIREmitter().Function(GetLLVMModule(), name, returnType, Linkage(isPublic), pArguments); if (pFunction == nullptr) { throw EmitterException(EmitterError::functionNotFound); @@ -594,7 +597,8 @@ namespace emitters IRFunctionEmitter IRModuleEmitter::Function(const std::string& name, LLVMType returnType, const std::vector& argTypes, bool isPublic) { - LLVMFunction pFunction = _emitter.Function(GetLLVMModule(), name, returnType, Linkage(isPublic), argTypes); + // TODO: add this function to the _functions list?? + LLVMFunction pFunction = GetIREmitter().Function(GetLLVMModule(), name, returnType, Linkage(isPublic), argTypes); if (pFunction == nullptr) { throw EmitterException(EmitterError::functionNotFound); @@ -604,7 +608,8 @@ namespace emitters IRFunctionEmitter IRModuleEmitter::Function(const std::string& name, LLVMType returnType, const NamedLLVMTypeList& arguments, bool isPublic) { - LLVMFunction pFunction = _emitter.Function(GetLLVMModule(), name, returnType, Linkage(isPublic), arguments); + // TODO: add this function to the _functions list?? + LLVMFunction pFunction = GetIREmitter().Function(GetLLVMModule(), name, returnType, Linkage(isPublic), arguments); if (pFunction == nullptr) { throw EmitterException(EmitterError::functionNotFound); @@ -622,16 +627,21 @@ namespace emitters return GetLLVMModule()->getFunction(name); } + LLVMFunction IRModuleEmitter::GetIntrinsic(llvm::Intrinsic::ID id) + { + return GetIREmitter().GetIntrinsic(GetLLVMModule(), id, VariableTypeList{}); + } + LLVMFunction IRModuleEmitter::GetIntrinsic(llvm::Intrinsic::ID id, const std::initializer_list& arguments) { VariableTypeList valueTypeList = arguments; - return _emitter.GetIntrinsic(GetLLVMModule(), id, valueTypeList); + return GetIREmitter().GetIntrinsic(GetLLVMModule(), id, valueTypeList); } LLVMFunction IRModuleEmitter::GetIntrinsic(llvm::Intrinsic::ID id, const std::initializer_list& arguments) { LLVMTypeList valueTypeList = arguments; - return _emitter.GetIntrinsic(GetLLVMModule(), id, valueTypeList); + return GetIREmitter().GetIntrinsic(GetLLVMModule(), id, valueTypeList); } // @@ -643,7 +653,7 @@ namespace emitters NamedLLVMTypeList llvmFields; for (auto& field : fields) { - llvmFields.emplace_back(field.first, _emitter.Type(field.second)); + llvmFields.emplace_back(field.first, GetIREmitter().Type(field.second)); } return GetOrCreateStruct(name, llvmFields); } @@ -666,7 +676,7 @@ namespace emitters llvm::StructType* IRModuleEmitter::GetOrCreateStruct(const std::string& name, const LLVMTypeList& fields) { - if (auto structType = _emitter.GetStruct(name)) + if (auto structType = GetIREmitter().GetStruct(name)) { // Check that existing struct fields match the ones we're trying to create auto structFields = structType->elements(); @@ -687,24 +697,24 @@ namespace emitters return structType; } - return _emitter.DeclareStruct(name, fields); + return GetIREmitter().DeclareStruct(name, fields); } llvm::StructType* IRModuleEmitter::GetAnonymousStructType(const LLVMTypeList& fieldTypes, bool packed) { - return _emitter.GetAnonymousStructType(fieldTypes, packed); + return GetIREmitter().GetAnonymousStructType(fieldTypes, packed); } llvm::StructType* IRModuleEmitter::GetStruct(const std::string& name) { - return _emitter.GetStruct(name); + return GetIREmitter().GetStruct(name); } // // Code output / input // - void IRModuleEmitter::WriteToFile(const std::string& filePath, ModuleOutputFormat format) + MachineCodeOutputOptions IRModuleEmitter::GetMachineCodeOutputOptions() const { 
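+        // Note: useFastMath now fans out to the whole fast-math flag family below
+        // (unsafe, no-infs, no-nans, no-signed-zeros), mirroring the llvm::TargetOptions
+        // configured where createTargetMachine is invoked further down.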
MachineCodeOutputOptions options; auto compilerOptions = GetCompilerOptions(); @@ -716,6 +726,11 @@ namespace emitters options.verifyModule = true; options.floatFusionMode = compilerOptions.useFastMath ? FloatFusionMode::Fast : FloatFusionMode::Standard; + options.unsafeFPMath = compilerOptions.useFastMath; + options.noInfsFPMath = compilerOptions.useFastMath; + options.noNaNsFPMath = compilerOptions.useFastMath; + options.noSignedZerosFPMath = compilerOptions.useFastMath; + if (compilerOptions.positionIndependentCode.HasValue()) { options.relocModel = compilerOptions.positionIndependentCode.GetValue() ? OutputRelocationModel::PIC_ : OutputRelocationModel::Static; @@ -727,7 +742,15 @@ namespace emitters } // Other params to possibly set: + // bool verboseOutput = false; + // bool verifyModule = false; // FloatABIType floatABI = FloatABIType::Default; + return options; + } + + void IRModuleEmitter::WriteToFile(const std::string& filePath, ModuleOutputFormat format) + { + auto options = GetMachineCodeOutputOptions(); switch (format) { @@ -779,18 +802,7 @@ namespace emitters void IRModuleEmitter::WriteToStream(std::ostream& stream, ModuleOutputFormat format) { - MachineCodeOutputOptions options; - auto compilerOptions = GetCompilerOptions(); - options.targetDevice = compilerOptions.targetDevice; - if (compilerOptions.optimize) - { - options.optimizationLevel = OptimizationLevel::Aggressive; - } - // Other params to possibly set: - // bool verboseOutput = false; - // bool verifyModule = false; - // FloatABIType floatABI = FloatABIType::Default; - // FloatFusionMode floatFusionMode = FloatFusionMode::Standard; + auto options = GetMachineCodeOutputOptions(); WriteToStream(stream, format, options); } @@ -836,16 +848,56 @@ namespace emitters void IRModuleEmitter::LoadIR(const std::string& text) { - llvm::MemoryBufferRef buffer(text, ""); // See Parser.cpp in LLVM code base for why... 
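+        // (The replacement below wraps the text in a MemoryBuffer so the parser sees a
+        // null-terminated buffer, which is presumably what the old "See Parser.cpp"
+        // comment was alluding to.) Illustrative use of the new entry points:
+        //
+        //     module.LoadIR("define i32 @three() {\nentry:\n  ret i32 3\n}\n");
+        //     std::ifstream file("extra.ll"); // hypothetical file name
+        //     module.LoadIR(file);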
+        auto buffer = llvm::MemoryBuffer::getMemBuffer(text);
+        llvm::SMDiagnostic errorHandler;
+        bool hadError = llvm::parseAssemblyInto(*buffer, GetLLVMModule(), nullptr, errorHandler);
+        if (hadError)
+        {
+            std::string message = errorHandler.getMessage();
+            throw EmitterException(EmitterError::parserError, message);
+        }
+    }
+
+    void IRModuleEmitter::LoadIR(std::istream& stream)
+    {
+        std::string irStr(std::istreambuf_iterator<char>(stream), {});
+        LoadIR(irStr);
+    }
+
+    void IRModuleEmitter::LoadIRFromFile(const std::string& filename)
+    {
+        auto buffer = llvm::MemoryBuffer::getFile(filename);
+        if (!buffer)
+        {
+            throw EmitterException(EmitterError::parserError, "Unable to open " + filename);
+        }
+
         llvm::SMDiagnostic errorHandler;
-        bool hadError = llvm::parseAssemblyInto(buffer, GetLLVMModule(), nullptr, errorHandler);
+        bool hadError = llvm::parseAssemblyInto(*(buffer->get()), GetLLVMModule(), nullptr, errorHandler);
         if (hadError)
         {
-            std::string message = errorHandler.getMessage(); //IRLoader::ErrorToString(errorHandler);
+            std::string message = errorHandler.getMessage();
             throw EmitterException(EmitterError::parserError, message);
         }
     }
 
+    void IRModuleEmitter::LoadAsm(const std::string& text)
+    {
+        GetLLVMModule()->appendModuleInlineAsm(text);
+    }
+
+    void IRModuleEmitter::LoadAsm(std::istream& stream)
+    {
+        std::string asmStr(std::istreambuf_iterator<char>(stream), {});
+        LoadAsm(asmStr);
+    }
+
+    void IRModuleEmitter::LoadAsmFromFile(const std::string& filename)
+    {
+        auto stream = OpenIfstream(filename);
+        LoadAsm(stream);
+    }
+
     void IRModuleEmitter::WriteHeader(std::ostream& os)
     {
         WriteModuleHeader(os, *this);
@@ -905,16 +957,16 @@ namespace emitters
     {
         std::string globalName = GetModuleName() + "_debug_message_" + std::to_string(_globalStringIndex++);
         llvm::GlobalVariable* msg = ConstantArray(globalName, std::vector<char>(message.c_str(), message.c_str() + message.size() + 1));
-        auto& context = _emitter.GetContext();
+        auto& context = GetIREmitter().GetContext();
         auto charPointerType = llvm::Type::getInt8Ty(context)->getPointerTo();
-        llvm::Value* msgptr = _emitter.CastPointer(msg, charPointerType);
+        llvm::Value* msgptr = GetIREmitter().CastPointer(msg, charPointerType);
         auto function = DeclareDebugPrint();
-        _emitter.Call(function, msgptr);
+        GetIREmitter().Call(function, msgptr);
     }
 
     void IRModuleEmitter::DeclarePrintf()
     {
-        auto& context = _emitter.GetContext();
+        auto& context = GetIREmitter().GetContext();
         auto type = llvm::FunctionType::get(
             llvm::Type::getInt32Ty(context),
             { llvm::Type::getInt8PtrTy(context) },
@@ -983,7 +1035,14 @@ namespace emitters
        }
 
         auto relocModel = parameters.targetDevice.IsWindows() ? OutputRelocationModel::Static : OutputRelocationModel::PIC_;
-        const llvm::TargetOptions options;
+        llvm::TargetOptions options;
+        options.FloatABIType = llvm::FloatABI::Default;
+        options.AllowFPOpFusion = parameters.useFastMath ? llvm::FPOpFusion::Fast : llvm::FPOpFusion::Standard;
+        options.UnsafeFPMath = parameters.useFastMath ? 1 : 0;
+        options.NoInfsFPMath = parameters.useFastMath ? 1 : 0;
+        options.NoNaNsFPMath = parameters.useFastMath ? 1 : 0;
+        options.NoSignedZerosFPMath = parameters.useFastMath ? 1 : 0;
+
         const llvm::CodeModel::Model codeModel = llvm::CodeModel::Small;
         auto tm = target->createTargetMachine(llvm::Triple::normalize(parameters.targetDevice.triple),
                                               parameters.targetDevice.cpu,
@@ -1126,73 +1185,6 @@ namespace emitters
         return llvm::ConstantAggregateZero::get(pType);
     }
 
-    //
-    // Global LLVM state management
-    //
-    void IRModuleEmitter::InitializeLLVM()
-    {
-        // All targets
-        llvm::InitializeAllTargetInfos();
-        llvm::InitializeAllTargets();
-        llvm::InitializeAllTargetMCs();
-        llvm::InitializeAllAsmPrinters();
-        llvm::InitializeAllAsmParsers();
-        llvm::InitializeAllDisassemblers();
-
-        // Native target (perhaps unnecessary, since we're initializing _all_ the targets above)
-        llvm::InitializeNativeTarget();
-        llvm::InitializeNativeTargetAsmPrinter();
-        llvm::InitializeNativeTargetAsmParser();
-        llvm::InitializeNativeTargetDisassembler();
-
-        // Create a diagnostic handler to record if there was an error
-        _diagnosticHandler = std::unique_ptr<IRDiagnosticHandler>(new IRDiagnosticHandler(*_llvmContext));
-    }
-
-    llvm::PassRegistry* IRModuleEmitter::InitializeGlobalPassRegistry()
-    {
-        // Get the global pass registry
-        llvm::PassRegistry* registry = llvm::PassRegistry::getPassRegistry();
-
-        // Initialize all of the optimization passes (probably unnecessary)
-        llvm::initializeCore(*registry);
-        llvm::initializeCoroutines(*registry);
-        llvm::initializeScalarOpts(*registry);
-        llvm::initializeObjCARCOpts(*registry);
-        llvm::initializeVectorization(*registry);
-        llvm::initializeIPO(*registry);
-        llvm::initializeAnalysis(*registry);
-        llvm::initializeTransformUtils(*registry);
-        llvm::initializeInstCombine(*registry);
-        llvm::initializeAggressiveInstCombine(*registry);
-        llvm::initializeInstrumentation(*registry);
-        llvm::initializeTarget(*registry);
-        // For codegen passes, only passes that do IR to IR transformation are
-        // supported.
- llvm::initializeExpandMemCmpPassPass(*registry); - llvm::initializeScalarizeMaskedMemIntrinPass(*registry); - llvm::initializeCodeGenPreparePass(*registry); - llvm::initializeAtomicExpandPass(*registry); - llvm::initializeRewriteSymbolsLegacyPassPass(*registry); - llvm::initializeWinEHPreparePass(*registry); - llvm::initializeDwarfEHPreparePass(*registry); - llvm::initializeSafeStackLegacyPassPass(*registry); - llvm::initializeSjLjEHPreparePass(*registry); - llvm::initializePreISelIntrinsicLoweringLegacyPassPass(*registry); - llvm::initializeGlobalMergePass(*registry); - llvm::initializeIndirectBrExpandPassPass(*registry); - llvm::initializeInterleavedLoadCombinePass(*registry); - llvm::initializeInterleavedAccessPass(*registry); - llvm::initializeEntryExitInstrumenterPass(*registry); - llvm::initializePostInlineEntryExitInstrumenterPass(*registry); - llvm::initializeUnreachableBlockElimLegacyPassPass(*registry); - llvm::initializeExpandReductionsPass(*registry); - llvm::initializeWasmEHPreparePass(*registry); - llvm::initializeWriteBitcodePassPass(*registry); - - return registry; - } - template <> CallbackRegistry& IRModuleEmitter::GetCallbackRegistry() const { diff --git a/libraries/emitters/src/IRRuntime.cpp b/libraries/emitters/src/IRRuntime.cpp index dca4241df..51300e8c3 100644 --- a/libraries/emitters/src/IRRuntime.cpp +++ b/libraries/emitters/src/IRRuntime.cpp @@ -420,17 +420,22 @@ namespace emitters return _module.GetIntrinsic(llvm::Intrinsic::sin, { argType }); } - LLVMFunction IRRuntime::GetCosFunction(VariableType argType) + LLVMFunction IRRuntime::GetCosFunction(VariableType argType) { return _module.GetIntrinsic(llvm::Intrinsic::cos, { argType }); } - LLVMFunction IRRuntime::GetCopySignFunction(VariableType argType) + LLVMFunction IRRuntime::GetCopySignFunction(VariableType argType) { return _module.GetIntrinsic(llvm::Intrinsic::copysign, { argType }); } - LLVMFunction IRRuntime::GetTanhFunction(VariableType argType) + LLVMFunction IRRuntime::GetFmaFunction(VariableType argType) + { + return _module.GetIntrinsic(llvm::Intrinsic::fma, { argType }); + } + + LLVMFunction IRRuntime::GetTanhFunction(VariableType argType) { // This assumes a standard C runtime library is linked auto& emitter = _module.GetIREmitter(); @@ -529,6 +534,11 @@ namespace emitters return _module.GetIntrinsic(llvm::Intrinsic::copysign, { argType }); } + LLVMFunction IRRuntime::GetFmaFunction(LLVMType argType) + { + return _module.GetIntrinsic(llvm::Intrinsic::fma, { argType }); + } + LLVMFunction IRRuntime::GetPrefetchFunction() { return _module.GetIntrinsic(llvm::Intrinsic::prefetch, std::initializer_list{}); @@ -563,11 +573,10 @@ namespace emitters // got to the end of both strings, so they are equal, return 1. 
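+                    // (The emitted loop as a whole implements, in IR, roughly this C logic;
+                    // sketch only:
+                    //
+                    //     int StringsEqual(const char* a, const char* b)
+                    //     {
+                    //         for (int i = 0;; ++i)
+                    //         {
+                    //             if (a[i] == '\0' && b[i] == '\0') return 1; // both ended: equal
+                    //             if (a[i] == '\0' || b[i] == '\0' || a[i] != b[i]) return 0;
+                    //         }
+                    //     }
+                    // )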
fn.Store(result, fn.Literal(1)); fn.Store(continuing, fn.FalseBit()); - }) - .ElseIf(achar == zero || bchar == zero || achar != bchar, [continuing](emitters::IRFunctionEmitter& fn) { - // terminate loop with 0 result - fn.Store(continuing, fn.FalseBit()); - }); + }).ElseIf(achar == zero || bchar == zero || achar != bchar, [continuing](emitters::IRFunctionEmitter& fn) { + // terminate loop with 0 result + fn.Store(continuing, fn.FalseBit()); + }); fn.Store(index, indexValue + 1); }); diff --git a/libraries/emitters/src/LLVMUtilities.cpp b/libraries/emitters/src/LLVMUtilities.cpp index 49710854f..5da919227 100644 --- a/libraries/emitters/src/LLVMUtilities.cpp +++ b/libraries/emitters/src/LLVMUtilities.cpp @@ -9,9 +9,11 @@ #include "LLVMUtilities.h" #include "EmitterException.h" -#include -#include +#include "build/LLVMEmitterTargets.h" + #include +#include +#include namespace ell { @@ -90,16 +92,20 @@ namespace emitters } else { - if (type->isVoidTy()) { + if (type->isVoidTy()) + { return VariableType::Void; } - else if (type->isDoubleTy()) { + else if (type->isDoubleTy()) + { return VariableType::Double; } - else if (type->isFloatTy()) { + else if (type->isFloatTy()) + { return VariableType::Float; } - else if (type->isIntegerTy()) { + else if (type->isIntegerTy()) + { switch (type->getIntegerBitWidth()) { case 8: @@ -116,5 +122,75 @@ namespace emitters return VariableType::Custom; } + namespace + { + void InitializeLLVMTargets() + { + // This block is part of a X-Macro. LLVM_EMITTER_TARGETS below is + // defined in build/LLVMEmitterTargets.h at CMake configure time. + // It is dependent on the value of the CMake variable LLVM_EMITTER_TARGETS. + // For each LLVM target specified in that variable, EMITTER_TARGET_ACTION + // below gets called +#define EMITTER_TARGET_ACTION(TargetName) \ + LLVMInitialize##TargetName##TargetInfo(); \ + LLVMInitialize##TargetName##Target(); \ + LLVMInitialize##TargetName##TargetMC(); \ + LLVMInitialize##TargetName##AsmPrinter(); \ + LLVMInitialize##TargetName##AsmParser(); \ + LLVMInitialize##TargetName##Disassembler(); + LLVM_EMITTER_TARGETS +#undef EMITTER_TARGET_ACTION + + llvm::InitializeNativeTarget(); + } + + void InitializeGlobalPassRegistry() + { + // Get the global pass registry + llvm::PassRegistry* registry = llvm::PassRegistry::getPassRegistry(); + + // Initialize all of the optimization passes (probably unnecessary) + llvm::initializeCore(*registry); + llvm::initializeScalarOpts(*registry); + llvm::initializeVectorization(*registry); + llvm::initializeIPO(*registry); + llvm::initializeAnalysis(*registry); + llvm::initializeTransformUtils(*registry); + llvm::initializeInstCombine(*registry); + llvm::initializeAggressiveInstCombine(*registry); + llvm::initializeInstrumentation(*registry); + llvm::initializeTarget(*registry); + llvm::initializeGlobalISel(*registry); + + // For codegen passes, only passes that do IR to IR transformation are + // supported. 
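+        // The LLVM_EMITTER_TARGETS X-Macro used in InitializeLLVMTargets above expands
+        // one EMITTER_TARGET_ACTION(...) per backend chosen at CMake configure time.
+        // A self-contained sketch of the same pattern (target names made up):
+        //
+        //     #define DEMO_TARGETS \
+        //         EMITTER_TARGET_ACTION(ARM) \
+        //         EMITTER_TARGET_ACTION(X86)
+        //
+        //     #define EMITTER_TARGET_ACTION(TargetName) void Init##TargetName##Target();
+        //     DEMO_TARGETS // declares InitARMTarget() and InitX86Target()
+        //     #undef EMITTER_TARGET_ACTION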
+ llvm::initializeExpandMemCmpPassPass(*registry); + llvm::initializeScalarizeMaskedMemIntrinPass(*registry); + llvm::initializeCodeGenPreparePass(*registry); + llvm::initializeAtomicExpandPass(*registry); + llvm::initializeRewriteSymbolsLegacyPassPass(*registry); + llvm::initializeWinEHPreparePass(*registry); + llvm::initializeDwarfEHPreparePass(*registry); + llvm::initializeSafeStackLegacyPassPass(*registry); + llvm::initializeSjLjEHPreparePass(*registry); + llvm::initializePreISelIntrinsicLoweringLegacyPassPass(*registry); + llvm::initializeGlobalMergePass(*registry); + llvm::initializeIndirectBrExpandPassPass(*registry); + llvm::initializeInterleavedLoadCombinePass(*registry); + llvm::initializeInterleavedAccessPass(*registry); + llvm::initializeEntryExitInstrumenterPass(*registry); + llvm::initializePostInlineEntryExitInstrumenterPass(*registry); + llvm::initializeUnreachableBlockElimLegacyPassPass(*registry); + llvm::initializeExpandReductionsPass(*registry); + llvm::initializeWriteBitcodePassPass(*registry); + } + } // namespace + + void InitializeLLVM() + { + InitializeLLVMTargets(); + InitializeGlobalPassRegistry(); + } + } // namespace emitters } // namespace ell diff --git a/libraries/emitters/src/TargetDevice.cpp b/libraries/emitters/src/TargetDevice.cpp index 923f18b09..90a933484 100644 --- a/libraries/emitters/src/TargetDevice.cpp +++ b/libraries/emitters/src/TargetDevice.cpp @@ -9,6 +9,7 @@ #include "TargetDevice.h" #include "EmitterException.h" #include "IRAssemblyWriter.h" // for OutputRelocationModel +#include "LLVMUtilities.h" #include #include @@ -219,6 +220,8 @@ namespace emitters void SetHostTargetProperties(TargetDevice& targetDevice) { + InitializeLLVM(); + auto hostTripleString = llvm::sys::getProcessTriple(); llvm::Triple hostTriple(hostTripleString); @@ -226,6 +229,20 @@ namespace emitters targetDevice.architecture = llvm::Triple::getArchTypeName(hostTriple.getArch()); targetDevice.cpu = llvm::sys::getHostCPUName(); + llvm::StringMap features; + llvm::sys::getHostCPUFeatures(features); + for (const auto& feature : features) + { + if (feature.second) + { + targetDevice.features += '+' + feature.first().str() + ","; + } + } + if (!targetDevice.features.empty()) + { + targetDevice.features.pop_back(); + } + SetTargetDataLayout(targetDevice); } diff --git a/libraries/emitters/templates/LLVMEmitterTargets.h.in b/libraries/emitters/templates/LLVMEmitterTargets.h.in new file mode 100644 index 000000000..9b66e6fde --- /dev/null +++ b/libraries/emitters/templates/LLVMEmitterTargets.h.in @@ -0,0 +1,16 @@ +// Auto-generated +// The contents of this file are based on the CMake variable LLVM_EMITTER_TARGETS +// Specified in libraries/emitters/CMakeLists.txt +// The generated preprocessor define is part of an X-Macro (https://en.wikipedia.org/wiki/X_Macro) +// The other part of the X-Macro relies on the definition of the macro EMITTER_TARGET_ACTION(TargetName) + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: LLVMEmitterTargets.h (emitters) +// Authors: Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#define LLVM_EMITTER_TARGETS \ +@emitter_targets_content@ diff --git a/libraries/emitters/test/include/IREmitterTest.h b/libraries/emitters/test/include/IREmitterTest.h index d5fda7574..2ee12b460 100644 --- a/libraries/emitters/test/include/IREmitterTest.h +++ 
b/libraries/emitters/test/include/IREmitterTest.h @@ -42,3 +42,5 @@ void TestElseIfWithComputedCondition(); void TestCastValue(); void TestCastToConditionalBool(); + +void TestInlineAssembly(); diff --git a/libraries/emitters/test/src/IREmitterTest.cpp b/libraries/emitters/test/src/IREmitterTest.cpp index 7ca4772b5..6a6cf23a6 100644 --- a/libraries/emitters/test/src/IREmitterTest.cpp +++ b/libraries/emitters/test/src/IREmitterTest.cpp @@ -23,6 +23,8 @@ #include #include +#include + #include #include #include @@ -1164,3 +1166,76 @@ void TestCastToConditionalBool() TestCastToConditionalBool(); TestCastToConditionalBool(); } + +void TestInlineAssembly() +{ + VariableType inType = emitters::GetVariableType(); + VariableType outType = emitters::GetVariableType(); + + auto module = MakeHostModuleEmitter("TestInlineAssembly"); + auto targetDevice = module.GetCompilerOptions().targetDevice; + auto functionIdentifier = targetDevice.IsMacOS() ? "_square" : "square"; + auto functionName = "square"; + + std::string asmStr; + if(targetDevice.IsWindows()) + { + asmStr= R"XX( + .globl FUNCTION +FUNCTION: + movl %ecx, %eax + imull %ecx, %eax + retq +)XX"; + } + else + { + asmStr= R"XX( + .globl FUNCTION +FUNCTION: + imull %edi, %edi + movl %edi, %eax + retq +)XX"; + } + + ReplaceAll(asmStr, "FUNCTION", functionIdentifier); + + module.GetLLVMModule()->appendModuleInlineAsm(asmStr); + + module.DeclareFunction(functionName, outType, { inType }); + module.DebugDump(); + + const emitters::NamedVariableTypeList parameters = { { "x", inType } }; + auto fn = module.BeginFunction("InlineAssembly", outType, parameters); + { + auto arguments = fn.Arguments().begin(); + auto x = fn.LocalScalar(&(*arguments++)); + auto squareFn = module.GetFunction(functionName); + squareFn->addFnAttr(llvm::Attribute::AttrKind::AlwaysInline); + auto result = fn.Call(squareFn, {x}); + + fn.Return(result); + } + module.EndFunction(); + +#if 0 + module.DebugDump(); + module.WriteToStream(std::cout, ModuleOutputFormat::assembly); +#endif + fn.Verify(); + + IRExecutionEngine jit(std::move(module)); + auto testFn = jit.GetFunction("InlineAssembly"); + + bool success = true; + auto trials = std::vector{ 1, 2, 35 }; + for (auto val : trials) + { + auto result = testFn(val); + auto expected = val * val; + success = success && (result == expected); + } + + testing::ProcessTest("Testing InlineAssembly", success); +} diff --git a/libraries/emitters/test/src/main.cpp b/libraries/emitters/test/src/main.cpp index 199e5f0f4..a3fc007d9 100644 --- a/libraries/emitters/test/src/main.cpp +++ b/libraries/emitters/test/src/main.cpp @@ -52,6 +52,8 @@ void TestIR() TestCastValue(); TestCastToConditionalBool(); + + TestInlineAssembly(); } void TestIRFunction() diff --git a/libraries/math/CMakeLists.txt b/libraries/math/CMakeLists.txt index 41fabdfc0..215a06aa8 100644 --- a/libraries/math/CMakeLists.txt +++ b/libraries/math/CMakeLists.txt @@ -9,6 +9,7 @@ if(MSVC) endif() include (OpenBLASSetup) +add_compile_options(-DUSE_OPENBLAS=1) set(src src/BlasWrapper.cpp src/Tensor.cpp diff --git a/libraries/math/src/BlasWrapper.cpp b/libraries/math/src/BlasWrapper.cpp index 3bc1cdc60..67eef22c4 100644 --- a/libraries/math/src/BlasWrapper.cpp +++ b/libraries/math/src/BlasWrapper.cpp @@ -10,8 +10,12 @@ #include "Matrix.h" #if USE_BLAS +#if USE_MKL +#include +#elif USE_OPENBLAS #include #endif +#endif #include // for hardware_concurrency() @@ -69,8 +73,12 @@ namespace math { numThreads = std::thread::hardware_concurrency(); } -#ifdef OPENBLAS_CONST +#if USE_BLAS +#if 
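A note on the two assembly bodies in TestInlineAssembly above: both implement int square(int), but the argument register differs by calling convention. The Microsoft x64 convention passes the first integer argument in ECX, while the System V ABI used on Linux and macOS passes it in EDI; both return in EAX, and macOS additionally prefixes C symbols with an underscore, hence the "_square" identifier. The mechanism under test is LLVM's module-level inline assembly:

    // Sketch: attach raw assembly text to the module, then declare a matching
    // function so emitted IR can call it; the symbol is resolved at JIT time.
    // (`module` is an IRModuleEmitter, as in the test above.)
    module.GetLLVMModule()->appendModuleInlineAsm(asmStr); // llvm::Module API
    module.DeclareFunction(functionName, outType, { inType });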
defined(USE_OPENBLAS) && defined(OPENBLAS_CONST) openblas_set_num_threads(numThreads); +#elif USE_MKL + mkl_set_num_threads(numThreads); +#endif #endif } diff --git a/libraries/model/include/Map.h b/libraries/model/include/Map.h index 8e842052c..16927d5ac 100644 --- a/libraries/model/include/Map.h +++ b/libraries/model/include/Map.h @@ -65,8 +65,6 @@ namespace model /// The other map. Map(const Map& other); - Map(Map&& other) = default; - /// Assignment operator. /// /// The other map. diff --git a/libraries/model/include/OutputPort.h b/libraries/model/include/OutputPort.h index d47949f6c..8564775ec 100644 --- a/libraries/model/include/OutputPort.h +++ b/libraries/model/include/OutputPort.h @@ -32,7 +32,6 @@ namespace model public: OutputPortBase() = default; OutputPortBase(const OutputPortBase& other) = delete; - OutputPortBase(OutputPortBase&& other) = default; /// Constructor /// diff --git a/libraries/model/src/CompilableCodeNode.cpp b/libraries/model/src/CompilableCodeNode.cpp index d121a9ce9..3f052c4be 100644 --- a/libraries/model/src/CompilableCodeNode.cpp +++ b/libraries/model/src/CompilableCodeNode.cpp @@ -77,7 +77,7 @@ namespace model const auto& inputs = GetInputPorts(); const auto& outputs = GetOutputPorts(); - std::vector parameters; + std::vector parameters; parameters.reserve(inputs.size() + outputs.size()); std::transform(inputs.begin(), inputs.end(), std::back_inserter(parameters), PortToValue); std::transform(outputs.begin(), outputs.end(), std::back_inserter(parameters), PortToValue); @@ -222,7 +222,7 @@ namespace model const auto& inputs = GetInputPorts(); const auto& outputs = GetOutputPorts(); - std::vector args; + std::vector args; args.reserve(inputs.size() + outputs.size()); std::transform(inputs.begin(), inputs.end(), std::back_inserter(args), PortToValue); diff --git a/libraries/model/src/Map.cpp b/libraries/model/src/Map.cpp index cb27db0f0..0b491e9be 100644 --- a/libraries/model/src/Map.cpp +++ b/libraries/model/src/Map.cpp @@ -88,6 +88,10 @@ namespace model AddOutput(name, transformer.GetCorrespondingOutputs(*outputPort)); } + _metadata = other._metadata; + + // TODO (kerha): _computeContext isn't copied right now. Not sure if it should be. 
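The BlasWrapper change above routes the thread-count setting to whichever BLAS the build selected. A condensed sketch of the resulting dispatch, assuming the build-defined USE_BLAS/USE_OPENBLAS/USE_MKL macros and the vendor header included as in the patch (openblas_set_num_threads and mkl_set_num_threads are the vendors' actual entry points; the OPENBLAS_CONST check appears to act as a proxy for "the OpenBLAS flavor of cblas.h is in use"):

    #include <thread> // for hardware_concurrency()

    void SetNumBlasThreads(int numThreads)
    {
        if (numThreads <= 0)
        {
            // 0 (or negative) means "use all hardware threads"
            numThreads = static_cast<int>(std::thread::hardware_concurrency());
        }
    #if USE_BLAS
    #if defined(USE_OPENBLAS) && defined(OPENBLAS_CONST)
        openblas_set_num_threads(numThreads); // OpenBLAS-wide setting
    #elif USE_MKL
        mkl_set_num_threads(numThreads); // MKL-wide setting
    #endif
    #else
        (void)numThreads; // no BLAS backend: nothing to configure
    #endif
    }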
[2019-08-23] + _model.Verify(); } @@ -326,6 +330,7 @@ namespace model swap(a._outputs, b._outputs); swap(a._outputNames, b._outputNames); swap(a._outputsMap, b._outputsMap); + swap(a._metadata, b._metadata); swap(a._computeContext, b._computeContext); } diff --git a/libraries/model/test/include/CompilableNodesTest.h b/libraries/model/test/include/CompilableNodesTest.h index b9176a25a..a464acbf3 100644 --- a/libraries/model/test/include/CompilableNodesTest.h +++ b/libraries/model/test/include/CompilableNodesTest.h @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -50,6 +51,10 @@ void TestReinterpretLayoutNode(); void TestReorderDataNode1(); void TestReorderDataNode2(); void TestReorderDataNode3(); +void TestReorderDataCodeNode1(); +void TestReorderDataCodeNode2(); +void TestReorderDataCodeNode3(); +void TestReorderDataCodeNode4(); void TestReceptiveFieldMatrixNode(size_t numChannels, bool useNewReshape); void TestCompilableAccumulatorNodeFunction(); void TestCompilableSourceNode(); @@ -70,6 +75,7 @@ void TestBufferNode(); void TestMatrixVectorMultiplyNode(int m, int n, bool useBlas); void TestMatrixMatrixMultiplyNode(int m, int n, int k, bool useBlas); void TestOrderedMatrixMatrixMultiplyNode(int m, int n, int k, bool transposeA, bool transposeB, bool transposeC, bool useBlas); +void TestMatrixMatrixMultiplyCodeNode(int m, int n, int k, int panelM, int panelN, int panelK, int kernelM, int kernelN, int kernelK, nodes::MatrixMatrixMultiplyImplementation gemmImpl); void TestBroadcasUnaryOperationNodeCompile(); void TestBroadcasBinaryOperationNodeCompileAdd(); @@ -107,8 +113,10 @@ void TestMaxPoolingLayerNode(size_t inRows, size_t inCols, size_t numChannels, s void TestMeanPoolingLayerNode(size_t inRows, size_t inCols, size_t numChannels, size_t outRows, size_t outCols, size_t poolingSize, size_t poolingStride, size_t inputPadding = 0, size_t outputPadding = 0); void TestScalingLayerNode(size_t inputPadding = 0, size_t outputPadding = 0); void TestSoftmaxLayerNode(size_t inputPadding = 0, size_t outputPadding = 0); +void TestSpatialConvolutionNode(size_t inputPadding = 1, size_t outputPadding = 0); void TestFusedLinearLayerNodes(size_t rows, size_t columns, size_t channels); void TestRegionDetectionNode(); +void TestIRNode(); #pragma region implementation diff --git a/libraries/model/test/src/CompilableNodesTest.cpp b/libraries/model/test/src/CompilableNodesTest.cpp index cda5cd546..0b5d9a1a6 100644 --- a/libraries/model/test/src/CompilableNodesTest.cpp +++ b/libraries/model/test/src/CompilableNodesTest.cpp @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include @@ -59,10 +60,12 @@ #include #include #include +#include #include #include #include #include +#include #include #include #include @@ -1174,6 +1177,135 @@ void TestReorderDataNode3() VerifyCompiledOutput(map, compiledMap, signal, "ReorderDataNode"); } +void TestReorderDataCodeNode1() +{ + using ElementType = float; + int numRows = 2; + int numColumns = 3; + int numChannels = 16; + model::Model model; + model::PortMemoryLayout inputLayout(model::MemoryShape{ numRows, numColumns, numChannels }); // Default order: 0, 1, 2 == rows, columns, channels + auto outputLayout = inputLayout.ReorderedCopy({ 2, 0, 1 }); + + size_t inputSize = inputLayout.GetMemorySize(); + auto inputNode = model.AddNode>(inputSize); + const auto& testOutput = ReorderDataWithCodeNode(inputNode->output, inputLayout, outputLayout); + auto map = model::Map(model, { { "input", inputNode } }, { { "output", testOutput } }); + + // 
First, create the input tensor and expected output + math::ChannelColumnRowTensor input(numRows, numColumns, numChannels); + math::ColumnRowChannelTensor expectedOutput(numRows, numColumns, numChannels); + + // Next, verify that the compiled output is correct + FillTensor(input, 1, 1); + FillTensor(expectedOutput, 1, 1); + + Log() << "Input:" << EOL << input.ToArray() << EOL; + + std::string name = "TestReorderDataCodeNode1"; + TestWithSerialization(map, name, [&](model::Map& map, int iteration) { + model::IRMapCompiler compiler; + auto compiledMap = compiler.Compile(map); + + // compare output + std::vector> signal = { input.ToArray() }; + std::vector> expected = { expectedOutput.ToArray() }; + VerifyCompiledOutputAndResult(map, compiledMap, signal, expected, utilities::FormatString("%s iteration %d", name.c_str(), iteration)); + }); +} + +void TestReorderDataCodeNode2() +{ + using ElementType = float; + int numRows = 3; + int numColumns = 3; + int numChannels = 16; + int padding = 1; + model::Model model; + model::PortMemoryLayout inputLayout(model::MemoryShape{ numRows, numColumns, numChannels }, model::MemoryShape{ padding, padding, 0 }); // Default order: 0, 1, 2 == rows, columns, channels + auto outputLayout = inputLayout.ReorderedCopy({ 2, 0, 1 }); + + size_t inputSize = inputLayout.GetMemorySize(); + auto inputNode = model.AddNode>(inputSize); + const auto& testOutput = ReorderDataWithCodeNode(inputNode->output, inputLayout, outputLayout, std::vector{ 2, 0, 1 }); + auto map = model::Map(model, { { "input", inputNode } }, { { "output", testOutput } }); + model::IRMapCompiler compiler; + auto compiledMap = compiler.Compile(map); + + std::vector input(inputSize); + FillVector(input, 1.0f); + Log() << "Input:" << EOL << input << EOL; + + // compare output + std::vector> signal = { input }; + VerifyCompiledOutput(map, compiledMap, signal, "ReorderDataCodeNode2"); +} + +void TestReorderDataCodeNode3() +{ + using ElementType = float; + int numRows = 3; + int numColumns = 4; + int numChannels = 2; + int padding = 1; + model::Model model; + model::PortMemoryLayout inputLayout(model::MemoryShape{ numRows, numColumns, numChannels }, model::MemoryShape{ padding, padding, 0 }); // Default order: 0, 1, 2 == rows, columns, channels + auto outputLayout = inputLayout.ReorderedCopy({ 2, 0, 1 }); + + size_t inputSize = inputLayout.GetMemorySize(); + auto inputNode = model.AddNode>(inputSize); + const auto& testOutput = ReorderDataWithCodeNode(inputNode->output, inputLayout, outputLayout); + auto map = model::Map(model, { { "input", inputNode } }, { { "output", testOutput } }); + model::IRMapCompiler compiler; + auto compiledMap = compiler.Compile(map); + + std::vector input(inputSize); + FillVector(input, 1.0f); + Log() << "Input:" << EOL << input << EOL; + + // compare output + std::vector> signal = { input }; + VerifyCompiledOutput(map, compiledMap, signal, "ReorderDataCodeNode3"); +} + +void TestReorderDataCodeNode4() +{ + using ElementType = float; + int numRows = 2; + int numColumns = 5; + model::Model model; + model::PortMemoryLayout inputLayout(model::MemoryShape{ numRows, numColumns }); + auto outputLayout = inputLayout.ReorderedCopy({ 1, 0 }); + + size_t inputSize = inputLayout.GetMemorySize(); + auto inputNode = model.AddNode>(inputSize); + const auto& testOutput = ReorderDataWithCodeNode(inputNode->output, inputLayout, outputLayout); + auto map = model::Map(model, { { "input", inputNode } }, { { "output", testOutput } }); + + // First, create the input tensor and expected output + 
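The { 2, 0, 1 } permutation exercised by these ReorderDataCodeNode tests moves the channel dimension to the front, i.e. (row, column, channel) storage becomes (channel, row, column) storage. A minimal sketch of the index arithmetic being verified, ignoring padding and using hypothetical names:

    #include <cstddef>

    // Dense offset of logical entry (row, col, ch) after ReorderedCopy({ 2, 0, 1 }):
    // storage dimension order is channel, row, column.
    inline size_t CrcOffset(size_t row, size_t col, size_t ch, size_t numRows, size_t numCols)
    {
        return (ch * numRows + row) * numCols + col;
    }

    // The node is correct when, for every (row, col, ch),
    //   output[CrcOffset(row, col, ch, numRows, numCols)]
    //     == input[(row * numCols + col) * numChannels + ch]  // original RCC offset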
math::RowMatrix input(numRows, numColumns); + math::ColumnMatrix expectedOutput(numRows, numColumns); + + // Next, verify that the compiled output is correct + FillMatrix(input, 1, 1); + FillMatrix(expectedOutput, 1, 1); + + std::vector> signal = { input.ToArray() }; + std::vector> expected = { expectedOutput.ToArray() }; + + Log() << "Input:" << EOL << input.ToArray() << EOL; + + std::string name = "TestReorderDataCodeNode4"; + TestWithSerialization(map, name, [&](model::Map& map, int iteration) { + model::IRMapCompiler compiler; + auto compiledMap = compiler.Compile(map); + + // compare output + VerifyCompiledOutputAndResult(map, compiledMap, signal, expected, utilities::FormatString("%s iteration %d", name.c_str(), iteration)); + }); +} + + void TestReceptiveFieldMatrixNode(size_t numChannels, bool useNewReshape) { const std::array rcdOrder = std::array{ 0, 1, 2 }; @@ -1644,6 +1776,59 @@ void TestOrderedMatrixMatrixMultiplyNode(int m, int n, int k, bool transposeA, b }); } +void TestMatrixMatrixMultiplyCodeNode(int m, int n, int k, int panelM, int panelN, int panelK, int kernelM, int kernelN, int kernelK, MatrixMatrixMultiplyImplementation gemmImpl) +{ + using ValueType = float; + std::vector matrixBVals(k * n); + FillRandomVector(matrixBVals); + + model::Model model; + auto inputMatrixNode = model.AddNode>(m * k); + auto matrixBNode = model.AddNode>(matrixBVals); + + int lda = k; + int ldb = n; + int ldc = n; + + auto matMatMultNode = model.AddNode>(inputMatrixNode->output, m, n, k, lda, matrixBNode->output, ldb, ldc, panelM, panelN, panelK, kernelM, kernelN, kernelK, gemmImpl); + + auto map = model::Map(model, { { "inputMatrix", inputMatrixNode } }, { { "output", matMatMultNode->output } }); + + std::string name = "MatrixMatrixMultiplyCodeNode"; + TestWithSerialization(map, name, [&](model::Map& map, int iteration) { + // compare output + std::vector matrixAVals(m * k); + FillRandomVector(matrixAVals); + std::vector> signal = { matrixAVals }; + + model::MapCompilerOptions settings; + model::ModelOptimizerOptions optimizerOptions; + model::IRMapCompiler compiler(settings, optimizerOptions); + auto compiledMap = compiler.Compile(map); + + std::vector expectedResult(m * n); + for (int i = 0; i < m; i++) + { + for (int j = 0; j < n; j++) + { + for (int kVal = 0; kVal < k; kVal++) + { + expectedResult[i * n + j] += matrixAVals[i * k + kVal] * matrixBVals[kVal * n + j]; + } + } + } + + std::vector> expected { expectedResult }; + std::stringstream id; + id << std::boolalpha << "MatrixMatrixMultiplyCodeNode(impl = " << static_cast(gemmImpl) + << ", m = " << m << ", n = " << n << ", k = " << k + << ", panelM = " << panelM << ", panelN = " << panelN << ", panelK = " << panelK + << ", kernelM = " << kernelM << ", kernelN = " << kernelN << ", kernelK = " << kernelK << ") iteration " << iteration; + + VerifyCompiledOutputAndResult(map, compiledMap, signal, expected, id.str()); + }); +} + // C callback (called by emitted code) static int lagNotificationCallbackCount = 0; extern "C" { @@ -3625,3 +3810,106 @@ void TestBroadcasBinaryOperationNodeCompileWithOrdering() auto computed = compiledMap.Compute(input1Vals); testing::ProcessTest("TestBroadcastBinaryOperationNodeCompileWithOrdering", testing::IsEqual(computed, expected)); } + +void TestSpatialConvolutionNode(size_t inputPaddingSize, size_t outputPaddingSize) +{ + // Abbreviations: + // + // r == # input rows + // c == # input columns + // ch == # input channels + // fw == filter width + // nf == # filters + // pi == input padding amount + // po ==
output padding amount + + // Data dimensions: + // + // Input: r x c x ch, with padding -> r+2pi x c+2pi x ch + // == 1 x 2 x 2, with padding == 1 -> 3 x 4 x 2 + // Weights: nf x fw x fw x ch + // == 2 x 3 x 3 x 2, (2 3x3 filters, with 2 input channels each) + // Output: r x c x nf, with padding -> 1+2po x 2+2po x 2 + // == 1 x 2 x 2, with padding == 0 -> 1 x 2 x 2 + + using ElementType = double; + using LayerParameters = typename Layer::LayerParameters; + using TensorType = typename Layer::TensorType; + using TensorReferenceType = typename Layer::TensorReferenceType; + using Shape = typename Layer::Shape; + + assert(inputPaddingSize == 1); + TensorType inputWithPadding(2 + 2 * inputPaddingSize, 2 + 2 * inputPaddingSize, 2); + TensorReferenceType input = inputWithPadding.GetSubTensor({ inputPaddingSize, inputPaddingSize, 0 }, { 2, 2, 2 }); + inputWithPadding.Fill(0); + input(0, 0, 0) = 2; + input(0, 1, 0) = 1; + input(0, 0, 1) = 3; + input(0, 1, 1) = 2; + // Input channel 0: [2, 3], input channel 1: [1, 2] + + Shape outputShape = { 2, 2, 2 }; + + LayerParameters parameters{ inputWithPadding, ZeroPadding(inputPaddingSize), outputShape, ZeroPadding(outputPaddingSize) }; + ConvolutionalParameters convolutionalParams{ 3, 1, ConvolutionMethod::automatic, 2 }; + + // Filter weights in `weightsVector` are in numFilters x numChannels x filterSize x filterSize order + // clang-format off + std::vector weightsVector { + 1, 3, 2, 3, 1, 1, 2, 3, 1, // Filter 1, channel 1 + 1, 2, 1, 2, 3, 2, 1, 2, 1}; // Filter 2, channel 2 + // clang-format on + + // Viewed as planar filters (ch x fw x fw): + // + // 1 3 2 + // f0 = 3 1 1 + // 2 3 1 + // + // 1 2 1 + // f1 = 2 3 2 + // 1 2 1 + + // Filter weights in `weights` tensor are in numFilters x filterSize x filterSize x 1 order + TensorType weights(convolutionalParams.receptiveField * outputShape.NumChannels(), convolutionalParams.receptiveField, 1); + + size_t vectorIndex = 0; + for (size_t f = 0; f < outputShape.NumChannels(); ++f) + { + for (size_t k = 0; k < 1; ++k) + { + for (size_t i = 0; i < convolutionalParams.receptiveField; ++i) + { + for (size_t j = 0; j < convolutionalParams.receptiveField; ++j) + { + weights(f * convolutionalParams.receptiveField + i, j, k) = weightsVector[vectorIndex++]; + } + } + } + } + + // + // Verify ConvolutionalLayerNode + // + ConvolutionalLayer layer(parameters, convolutionalParams, weights); + layer.Compute(); + auto output = layer.GetOutput(); + + // Create model + model::Model model; + auto inputMemoryLayout = utilities::MemoryLayout( + utilities::MemoryShape{ 2, 2, 2 }, + utilities::MemoryShape{ static_cast(inputPaddingSize), static_cast(inputPaddingSize), 0 }); + // BUGBUG: This fails when the order is not canonical order. 
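The repacking loops above lay the two planar filters end to end along the row dimension, producing the (nf * fw) x fw x 1 stacked-weights tensor the layer constructor expects. The index mapping, for the single-channel case here (k == 0):

    // weightsVector is flat in (filter, row, column) order with fw == 3, so
    //   weightsVector[(f * fw + i) * fw + j]  -->  weights(f * fw + i, j, 0)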
+ auto inputNode = model.AddNode>(inputMemoryLayout.ReorderedCopy({ 2, 0, 1 })); + auto outputMemoryLayout = utilities::MemoryLayout(utilities::MemoryShape{ 2, 2, 2 }); + auto computeNode = model.AddNode>(inputNode->output, layer, outputMemoryLayout); + auto map = model::Map(model, { { "input", inputNode } }, { { "output", computeNode->output } }); + + const auto info = "TestSpatialConvolutionalLayer"; + + VerifyLayerMap(map, computeNode, inputWithPadding, output, info); + + // Test archiving / unarchiving produces same result + VerifyArchiveAndUnarchivingMap(map, computeNode, inputWithPadding, output, info); +} diff --git a/libraries/model/test/src/model_compiler_test_main.cpp b/libraries/model/test/src/model_compiler_test_main.cpp index c20717aa7..567a2343c 100644 --- a/libraries/model/test/src/model_compiler_test_main.cpp +++ b/libraries/model/test/src/model_compiler_test_main.cpp @@ -11,6 +11,7 @@ #include "CompilerTest.h" #include "ModelHeaderOutputTest.h" #include "PerformanceCountersTest.h" +#include #include @@ -22,8 +23,25 @@ using namespace ell; using namespace ell::emitters; using namespace ell::predictors::neural; +void TestMatrixMatrixMultiplyCodeNodeImplementations() +{ + const int fallbackPanelM = 1; + const int fallbackPanelN = 1; + const int fallbackPanelK = 1; + const int fallbackKernelM = 1; + const int fallbackKernelN = 1; + const int fallbackKernelK = 1; + // Naive for-loop implementation + TestMatrixMatrixMultiplyCodeNode(1, 1, 1, fallbackPanelM, fallbackPanelN, fallbackPanelK, fallbackKernelM, fallbackKernelN, fallbackKernelK, nodes::MatrixMatrixMultiplyImplementation::SimpleForLoops); + TestMatrixMatrixMultiplyCodeNode(4, 4, 4, fallbackPanelM, fallbackPanelN, fallbackPanelK, fallbackKernelM, fallbackKernelN, fallbackKernelK, nodes::MatrixMatrixMultiplyImplementation::SimpleForLoops); + TestMatrixMatrixMultiplyCodeNode(4, 8, 8, fallbackPanelM, fallbackPanelN, fallbackPanelK, fallbackKernelM, fallbackKernelN, fallbackKernelK, nodes::MatrixMatrixMultiplyImplementation::SimpleForLoops); + TestMatrixMatrixMultiplyCodeNode(4, 4, 8, fallbackPanelM, fallbackPanelN, fallbackPanelK, fallbackKernelM, fallbackKernelN, fallbackKernelK, nodes::MatrixMatrixMultiplyImplementation::SimpleForLoops); +} + void TestIRCompiler() { + // TestIRNode(); // Failing on Windows + TestBufferNode(); TestBufferNode(); TestBufferNode(); @@ -99,6 +117,8 @@ void TestIRCompiler() // TestMatrixMatrixMultiplyNode(15, 25600, 27, false); // Fails due to numerical issues + TestMatrixMatrixMultiplyCodeNodeImplementations(); + TestCompilableScalarOutputNode(); TestCompilableVectorOutputNode(); TestCompilableAccumulatorNode(); @@ -120,7 +140,11 @@ void TestIRCompiler() TestReorderDataNode1(); TestReorderDataNode2(); TestReorderDataNode3(); - TestReceptiveFieldMatrixNode(1, true); // new version + TestReorderDataCodeNode1(); + TestReorderDataCodeNode2(); + TestReorderDataCodeNode3(); + TestReorderDataCodeNode4(); + TestReceptiveFieldMatrixNode(1, true); // new version TestReceptiveFieldMatrixNode(1, false); // old (slow) version TestReceptiveFieldMatrixNode(2, true); // new version // TestReceptiveFieldMatrixNode(2, false); // old (slow) version -- Fails @@ -239,6 +263,9 @@ void TestIRCompiler() TestConvolutionalLayerNode2(ConvolutionMethod::winograd, 1, 0); TestConvolutionalLayerNode3(ConvolutionMethod::winograd, 1, 0); + //BUGBUG: This test currently fails for Compute but passes for Compile. 
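TestMatrixMatrixMultiplyCodeNodeImplementations below pins every panel and kernel size to 1 and selects SimpleForLoops, so only the naive path is exercised. For orientation, a sketch of what the panelM/panelN/panelK parameters mean in a conventionally cache-blocked GEMM; this illustrates the general tiling scheme, not the node's exact emitted loop structure, and the kernel sizes (register-level sub-tiling of each panel) are omitted:

    #include <algorithm> // std::min

    // C (m x n) += A (m x k) * B (k x n), all row-major, no transposes.
    // C must be zero-initialized to get the pure product.
    void TiledGemm(const float* A, const float* B, float* C,
                   int m, int n, int k, int panelM, int panelN, int panelK)
    {
        for (int i0 = 0; i0 < m; i0 += panelM)
            for (int j0 = 0; j0 < n; j0 += panelN)
                for (int k0 = 0; k0 < k; k0 += panelK)
                    // One panel: a small GEMM sized to stay cache-resident
                    for (int i = i0; i < std::min(i0 + panelM, m); ++i)
                        for (int j = j0; j < std::min(j0 + panelN, n); ++j)
                        {
                            float sum = C[i * n + j];
                            for (int kk = k0; kk < std::min(k0 + panelK, k); ++kk)
                                sum += A[i * k + kk] * B[kk * n + j];
                            C[i * n + j] = sum;
                        }
    }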
+ //TestSpatialConvolutionNode(1, 0); + TestFullyConnectedLayerNode(); // TestFullyConnectedLayerNode(0, 1); // Fully-connected layer nodes can't have padding (yet) // TestFullyConnectedLayerNode(0, 2); // Fully-connected layer nodes can't have padding (yet) diff --git a/libraries/model_testing/include/ModelTestUtilities.h b/libraries/model_testing/include/ModelTestUtilities.h index d4e7aaf90..5874884cc 100644 --- a/libraries/model_testing/include/ModelTestUtilities.h +++ b/libraries/model_testing/include/ModelTestUtilities.h @@ -155,12 +155,18 @@ void FillTensor(ell::math::ChannelColumnRowTensor& tensor, ElementT template void FillTensor(math::TensorReference& tensor, ElementType startValue = 0, ElementType step = 1); +template +void FillTensor(ell::math::ColumnRowChannelTensor& tensor, ElementType startValue = 0, ElementType step = 1); + template void FillWeightsTensor(ell::math::ChannelColumnRowTensor& tensor, ElementType startValue = 0, ElementType step = 1); template void FillMatrix(math::RowMatrix& matrix, ElementType startValue = 0, ElementType step = 1); +template +void FillMatrix(math::ColumnMatrix& matrix, ElementType startValue = 0, ElementType step = 1); + #pragma region implementation template @@ -385,11 +391,9 @@ void VerifyCompiledOutput(model::Map& map, model::IRCompiledMap& compiledMap, co } } - /// Verify the compiled output matches the computed output, and also verify computed output matches a given expected output template -bool VerifyCompiledOutputAndResult(model::Map& map, model::IRCompiledMap& compiledMap, const std::vector>& signal, - const std::vector>& expectedOutput, const std::string& name, const std::string& additionalMessage, double epsilon) +bool VerifyCompiledOutputAndResult(model::Map& map, model::IRCompiledMap& compiledMap, const std::vector>& signal, const std::vector>& expectedOutput, const std::string& name, const std::string& additionalMessage, double epsilon) { bool ok = true; std::vector computedResult; @@ -430,7 +434,6 @@ bool VerifyCompiledOutputAndResult(model::Map& map, model::IRCompiledMap& compil return ok; } - template class Uniform { @@ -520,6 +523,23 @@ void FillTensor(math::TensorReference +void FillTensor(ell::math::ColumnRowChannelTensor& tensor, ElementType startValue, ElementType step) +{ + ElementType val = startValue; + for (size_t row = 0; row < tensor.NumRows(); row++) + { + for (size_t column = 0; column < tensor.NumColumns(); column++) + { + for (size_t channel = 0; channel < tensor.NumChannels(); channel++) + { + tensor(row, column, channel) = val; + val += step; + } + } + } +} + template void FillWeightsTensor(ell::math::ChannelColumnRowTensor& tensor, ElementType startValue, ElementType step) { @@ -534,10 +554,28 @@ template void FillMatrix(math::RowMatrix& matrix, ElementType startValue, ElementType step) { ElementType val = startValue; - matrix.Generate([&val, step]() { - auto result = val; - val += step; - return result; }); + for (size_t row = 0; row < matrix.NumRows(); row++) + { + for (size_t column = 0; column < matrix.NumColumns(); column++) + { + matrix(row, column) = val; + val += step; + } + } +} + +template +void FillMatrix(math::ColumnMatrix& matrix, ElementType startValue, ElementType step) +{ + ElementType val = startValue; + for (size_t row = 0; row < matrix.NumRows(); row++) + { + for (size_t column = 0; column < matrix.NumColumns(); column++) + { + matrix(row, column) = val; + val += step; + } + } } #pragma endregion implementation diff --git a/libraries/nodes/CMakeLists.txt b/libraries/nodes/CMakeLists.txt 
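The FillMatrix overloads above replace the previous matrix.Generate() call with explicit row/column loops. The likely motivation (an inference, not stated in the patch): Generate fills elements in storage order, which matches the logical row-major sequence for a RowMatrix but would number a ColumnMatrix down each column first. On a 2 x 3 matrix filled with 0, 1, 2, ...:

    // Logical values after FillMatrix(m, 0, 1) -- identical for both layouts:
    //   0 1 2
    //   3 4 5
    //
    // Underlying storage:
    //   RowMatrix:    0 1 2 3 4 5   (row-major)
    //   ColumnMatrix: 0 3 1 4 2 5   (column-major)
    //
    // A storage-order Generate() on the ColumnMatrix would instead yield the
    // logical values 0 2 4 / 1 3 5.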
index 5e70370ec..e9943b0e0 100644 --- a/libraries/nodes/CMakeLists.txt +++ b/libraries/nodes/CMakeLists.txt @@ -25,7 +25,8 @@ set(src src/IIRFilterNode.cpp src/IRNode.cpp src/LSTMNode.cpp + src/MatrixMatrixMultiplyCodeNode.cpp src/MatrixMatrixMultiplyNode.cpp src/MatrixVectorMultiplyNode.cpp src/NeuralNetworkPredictorNode.cpp src/PoolingLayerNode.cpp @@ -79,6 +81,7 @@ set(include include/L2NormSquaredNode.h include/LSTMNode.h include/LinearPredictorNode.h + include/MatrixMatrixMultiplyCodeNode.h include/MatrixMatrixMultiplyNode.h include/MatrixVectorMultiplyNode.h include/MatrixVectorProductNode.h @@ -94,6 +97,7 @@ set(include include/RNNNode.h include/RegionDetectionLayerNode.h include/ReinterpretLayoutNode.h + include/ReorderDataCodeNode.h include/ReorderDataNode.h include/ScalingLayerNode.h include/SimpleConvolutionNode.h @@ -101,6 +105,7 @@ set(include include/SinkNode.h include/SoftmaxLayerNode.h include/SourceNode.h + include/SpatialConvolutionNode.h include/SquaredEuclideanDistanceNode.h include/SumNode.h include/TypeCastNode.h diff --git a/libraries/nodes/include/BroadcastOperationNodes.h b/libraries/nodes/include/BroadcastOperationNodes.h index ad1051939..6f7d63ff0 100644 --- a/libraries/nodes/include/BroadcastOperationNodes.h +++ b/libraries/nodes/include/BroadcastOperationNodes.h @@ -10,15 +10,7 @@ #include "NodeOperations.h" -#include -#include -#include -#include -#include - -#include -#include -#include +#include #include #include #include @@ -27,6 +19,13 @@ #include #include +#include +#include +#include + +#include +#include + #include #include #include @@ -37,6 +36,10 @@ namespace ell { namespace nodes { + using UnaryScalarFunction = value::Scalar (*)(value::Scalar); + using BinaryScalarFunction = value::Scalar (*)(value::Scalar, value::Scalar); + using TernaryScalarFunction = value::Scalar (*)(value::Scalar, value::Scalar, value::Scalar); + /// /// /// Broadcast operation nodes perform elementwise operations on multidimensional arrays, using "broadcast" semantics.
If the @@ -51,7 +54,7 @@ namespace nodes // Base class for broadcast nodes template - class BroadcastOperationNode : public model::CompilableNode + class BroadcastOperationNode : public model::CompilableCodeNode { public: /// @name Output Port @@ -60,6 +63,8 @@ namespace nodes /// @} protected: + using KernelFunctionType = std::function& args)>; + BroadcastOperationNode(const std::vector& inputsPortRefs, const std::vector& inputs, ValueType padding = 0); @@ -75,59 +80,36 @@ namespace nodes const model::OutputPort& GetOutput() const; const model::InputPort& GetInput(int index) const; - const FunctionType& GetFunction() const; - template - void SetFunction(OpFunctionType&& function); - virtual ValueType ComputeOperation(const std::vector& args) const = 0; - virtual emitters::IRLocalScalar CompileOperation(const std::vector& args) const = 0; - - void Compute() const override; - void Compile(model::IRMapCompiler& compiler, emitters::IRFunctionEmitter& function) override; + void Define(ell::value::FunctionDeclaration& fn) override; + KernelFunctionType MakeKernel(FunctionType f) const; + virtual KernelFunctionType GetKernelFunction() const = 0; + value::Scalar CallKernelFunction(FunctionType f, std::vector inputs, std::vector> indices) const; bool HasState() const override { return true; } // stored state: function and padding value - protected: void WriteToArchive(utilities::Archiver& archiver) const override; void ReadFromArchive(utilities::Unarchiver& archiver) override; private: - void ComputeDimensionLoop(int dimension, - const std::vector& prevInputDimensionOffsets, - const std::vector& lastActiveInputDimensions, - const std::vector& inputValues, - int prevOutputDimensionOffset, - std::vector& output) const; - - void CompileDimensionLoop(model::IRMapCompiler& compiler, - emitters::IRFunctionEmitter& function, - int dimension, - const std::vector& inputs, - const std::vector& prevInputDimensionOffsets, - const std::vector& lastActiveInputDimensions, - const std::vector& inputValues, - emitters::IRLocalScalar prevOutputDimensionOffset, - emitters::IRLocalArray& output) const; - std::vector GetLastActiveInputDimensions() const; - utilities::ArchiveVersion GetArchiveVersion() const override; bool CanReadArchiveVersion(const utilities::ArchiveVersion& version) const override; ValueType GetOutputPadding() const { return _paddingValue; } model::OutputPort _output; - std::unique_ptr _function; ValueType _paddingValue; }; // // BroadcastUnaryOperationNode // + template - class BroadcastUnaryOperationNode : public BroadcastOperationNode> + class BroadcastUnaryOperationNode : public BroadcastOperationNode { public: using OperationType = UnaryOperationType; - using FunctionType = UnaryFunctionType; + using FunctionType = UnaryScalarFunction; /// Default constructor. 
BroadcastUnaryOperationNode(); @@ -166,16 +148,15 @@ namespace nodes protected: using BroadcastOperationNode::GetOutput; - using BroadcastOperationNode::GetFunction; - using BroadcastOperationNode::SetFunction; - ValueType ComputeOperation(const std::vector& args) const override; - emitters::IRLocalScalar CompileOperation(const std::vector& args) const override; + using BroadcastOperationNode::MakeKernel; + using KernelFunctionType = typename BroadcastOperationNode::KernelFunctionType; + + KernelFunctionType GetKernelFunction() const override; void WriteToArchive(utilities::Archiver& archiver) const override; void ReadFromArchive(utilities::Unarchiver& archiver) override; private: - void SetOperationFunction(); void Copy(model::ModelTransformer& transformer) const override; model::InputPort _input; @@ -186,11 +167,11 @@ namespace nodes // BroadcastBinaryOperationNode // template - class BroadcastBinaryOperationNode : public BroadcastOperationNode> + class BroadcastBinaryOperationNode : public BroadcastOperationNode { public: using OperationType = BinaryOperationType; - using FunctionType = BinaryFunctionType; + using FunctionType = BinaryScalarFunction; /// Default constructor. BroadcastBinaryOperationNode(); @@ -236,16 +217,15 @@ namespace nodes protected: using BroadcastOperationNode::GetOutput; - using BroadcastOperationNode::GetFunction; - using BroadcastOperationNode::SetFunction; - ValueType ComputeOperation(const std::vector& args) const override; - emitters::IRLocalScalar CompileOperation(const std::vector& args) const override; + using BroadcastOperationNode::MakeKernel; + using KernelFunctionType = typename BroadcastOperationNode::KernelFunctionType; + + KernelFunctionType GetKernelFunction() const override; void WriteToArchive(utilities::Archiver& archiver) const override; void ReadFromArchive(utilities::Unarchiver& archiver) override; private: - void SetOperationFunction(); void Copy(model::ModelTransformer& transformer) const override; model::InputPort _input1; @@ -257,11 +237,11 @@ namespace nodes // BroadcastTernaryOperationNode // template - class BroadcastTernaryOperationNode : public BroadcastOperationNode> + class BroadcastTernaryOperationNode : public BroadcastOperationNode { public: using OperationType = TernaryOperationType; - using FunctionType = TernaryFunctionType; + using FunctionType = TernaryScalarFunction; /// Default constructor. 
BroadcastTernaryOperationNode(); @@ -308,16 +288,15 @@ namespace nodes protected: using BroadcastOperationNode::GetOutput; - using BroadcastOperationNode::GetFunction; - using BroadcastOperationNode::SetFunction; - ValueType ComputeOperation(const std::vector& args) const override; - emitters::IRLocalScalar CompileOperation(const std::vector& args) const override; + using BroadcastOperationNode::MakeKernel; + using KernelFunctionType = typename BroadcastOperationNode::KernelFunctionType; + + KernelFunctionType GetKernelFunction() const override; void WriteToArchive(utilities::Archiver& archiver) const override; void ReadFromArchive(utilities::Unarchiver& archiver) override; private: - void SetOperationFunction(); void Copy(model::ModelTransformer& transformer) const override; model::InputPort _input1; @@ -345,7 +324,7 @@ namespace nodes BroadcastOperationNode::BroadcastOperationNode(const std::vector& inputPortRefs, const std::vector& inputs, ValueType paddingValue) : - CompilableNode(inputPortRefs, { &_output }), + CompilableCodeNode("BroadcastOperationNode", inputPortRefs, { &_output }), _output(this, ell::model::Node::defaultOutputPortName, ComputeBroadcastedLayout(inputs)), _paddingValue(paddingValue) { @@ -356,7 +335,7 @@ namespace nodes const std::vector& inputs, const model::PortMemoryLayout& outputLayout, ValueType paddingValue) : - CompilableNode(inputPortRefs, { &_output }), + CompilableCodeNode("BroadcastOperationNode", inputPortRefs, { &_output }), _output(this, ell::model::Node::defaultOutputPortName, outputLayout), _paddingValue(paddingValue) { @@ -389,215 +368,99 @@ namespace nodes } template - template - void BroadcastOperationNode::SetFunction(OpFunctionType&& function) - { - _function = std::make_unique(std::move(function)); - } - - template - const FunctionType& BroadcastOperationNode::GetFunction() const + void BroadcastOperationNode::Define(ell::value::FunctionDeclaration& fn) { - return *_function; - } - - // - // Arbitrary-depth nested loops are generated recursively. 
The EmitComputeDimensionLoop - // function emits `numDimensions` nested loops of the form: - // - // for(iz = 0; iz < sz; ++iz) - // { - // zOffset = (iz+offset[2]) * stride[2]; - // for(iy = 0; iy < sy; ++iy) - // { - // yOffset = zOffset + (iy+offset[1]) * stride[1]; - // for(ix = 0; ix < sx; ++ix) - // { - // offset = yOffset + (ix+offset[0]) * stride[0]; - // x = arr[offset]; - // val = f(x); - // output[offset] = val; - // } - // } - // } - // + using namespace value::loopnests; - template - std::vector BroadcastOperationNode::GetLastActiveInputDimensions() const - { - const auto numDimensions = NumDimensions(); - auto numInputs = NumInputPorts(); - std::vector lastActiveInputDimensions(numInputs, 0); - for (int i = 0; i < numInputs; ++i) - { - const auto& inputLayout = GetInput(i).GetMemoryLayout(); - const auto& activeSize = inputLayout.GetLogicalDimensionActiveSize(); - for (int j = numDimensions - 1; j >= 0; --j) + (void)fn.Define([this](const std::vector& args) { + if (static_cast(args.size()) != (this->NumInputPorts() + this->NumOutputPorts())) { - if (activeSize[j] != 1) - { - lastActiveInputDimensions[i] = j; - break; - } + throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState); } - } - return lastActiveInputDimensions; - } + auto outputLayout = this->GetOutputMemoryLayout(); + auto numDim = outputLayout.NumDimensions(); - template - void BroadcastOperationNode::ComputeDimensionLoop(int dimension, - const std::vector& prevInputDimensionOffsetsIn, - const std::vector& lastActiveInputDimensions, - const std::vector& inputValuesIn, - int prevOutputDimensionOffset, - std::vector& output) const - { - auto prevInputDimensionOffsets = prevInputDimensionOffsetsIn; - auto inputValues = inputValuesIn; - const auto& outputLayout = GetOutputMemoryLayout(); - const auto outputGlobalOffset = outputLayout.GetFirstEntryOffset(); - const auto& outputSize = outputLayout.GetLogicalDimensionActiveSize(); - const auto& outputIncrement = outputLayout.GetLogicalDimensionIncrement(); - - const auto numDimensions = outputLayout.NumDimensions(); - const auto numInputs = NumInputPorts(); - - for (int loopIndex = 0; loopIndex < outputSize[dimension]; ++loopIndex) - { - auto thisOutputDimensionOffset = prevOutputDimensionOffset + loopIndex * outputIncrement[dimension]; - std::vector thisInputDimensionOffsets(numInputs, 0); - for (int inputIndex = 0; inputIndex < numInputs; ++inputIndex) + // Create the indices and ranges for the loop nest + std::vector indices; + std::vector ranges; + for (int d = 0; d < numDim; ++d) { - const auto& input = GetInput(inputIndex); - const auto& inputLayout = input.GetMemoryLayout(); - const auto inputGlobalOffset = inputLayout.GetFirstEntryOffset(); - const auto& inputSize = inputLayout.GetLogicalDimensionActiveSize(); - const auto& inputIncrement = inputLayout.GetLogicalDimensionIncrement(); - - // Account for broadcasting dimensions by setting loopIndex to 0 if this is a broadcast dimension for this input - auto thisLoopIndex = inputSize[dimension] == 1 ? 
0 : loopIndex; - auto thisInputDimensionOffset = prevInputDimensionOffsets[inputIndex] + thisLoopIndex * inputIncrement[dimension]; - thisInputDimensionOffsets[inputIndex] = thisInputDimensionOffset; - if (dimension == lastActiveInputDimensions[inputIndex]) - { - inputValues[inputIndex] = input[inputGlobalOffset + thisInputDimensionOffset]; - } + auto name = "i_" + std::to_string(d); + Index i(name); + indices.emplace_back(i); + int size = static_cast(outputLayout.GetLogicalDimensionActiveSize()[d]); + ranges.push_back({ i, { 0, size } }); } - if (dimension < numDimensions - 1) - { - // Recursive call to emit nested loop - ComputeDimensionLoop(dimension + 1, thisInputDimensionOffsets, lastActiveInputDimensions, inputValues, thisOutputDimensionOffset, output); - } - else - { - // We're in the innermost loop --- compute the value - auto outputValue = ComputeOperation(inputValues); - output[outputGlobalOffset + thisOutputDimensionOffset] = outputValue; - } - } + LoopNest loop(ranges); + auto kernel = value::loopnests::Kernel("kernel") + .Inputs(args) + .Indices(indices) + .Define(GetKernelFunction()); + loop.AddKernel(kernel); + + CodeGenerator generator; + generator.Run(loop); + }); } template - void BroadcastOperationNode::CompileDimensionLoop(model::IRMapCompiler& compiler, - emitters::IRFunctionEmitter& function, - int dimension, - const std::vector& inputsIn, - const std::vector& prevInputDimensionOffsetsIn, - const std::vector& lastActiveInputDimensions, - const std::vector& inputValuesIn, - emitters::IRLocalScalar prevOutputDimensionOffset, - emitters::IRLocalArray& output) const - { - auto inputs = inputsIn; - auto prevInputDimensionOffsets = prevInputDimensionOffsetsIn; - auto inputValues = inputValuesIn; - - model::PortMemoryLayout outputLayout = GetOutputMemoryLayout(); - const auto outputGlobalOffset = static_cast(outputLayout.GetFirstEntryOffset()); - const auto& outputSize = outputLayout.GetLogicalDimensionActiveSize(); - const auto& outputIncrement = outputLayout.GetLogicalDimensionIncrement(); - - const auto numDimensions = outputLayout.NumDimensions(); - const auto numInputs = NumInputPorts(); - - function.For(0, outputSize[dimension], [&](emitters::IRFunctionEmitter& function, auto loopIndex) { - auto thisOutputDimensionOffset = prevOutputDimensionOffset + loopIndex * outputIncrement[dimension]; - std::vector thisInputDimensionOffsets(numInputs, function.LocalScalar(0)); - for (int inputIndex = 0; inputIndex < numInputs; ++inputIndex) + auto BroadcastOperationNode::MakeKernel(FunctionType f) const -> KernelFunctionType + { + // # args = # inputs + # outputs + // the rest are indices + return [this, f = std::move(f)](const std::vector& args) { + if (static_cast(args.size()) != NumDimensions() + NumInputPorts() + 1) { - const auto& inputPort = GetInput(inputIndex); - const auto& inputLayout = inputPort.GetMemoryLayout(); - const auto inputGlobalOffset = static_cast(inputLayout.GetFirstEntryOffset()); - const auto& inputSize = inputLayout.GetLogicalDimensionActiveSize(); - const auto& inputIncrement = inputLayout.GetLogicalDimensionIncrement(); - const auto& input = inputs[inputIndex]; - - // Account for broadcasting dimensions by setting loopIndex to 0 if this is a broadcast dimension for this input - auto thisLoopIndex = inputSize[dimension] == 1 ? 
function.LocalScalar(0) : loopIndex; - auto thisInputDimensionOffset = prevInputDimensionOffsets[inputIndex] + thisLoopIndex * inputIncrement[dimension]; - thisInputDimensionOffsets[inputIndex] = thisInputDimensionOffset; - if (dimension == lastActiveInputDimensions[inputIndex]) - { - inputValues[inputIndex] = input[thisInputDimensionOffset + inputGlobalOffset]; - } + throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState); } - if (dimension < numDimensions - 1) + const int numInputs = this->NumInputPorts(); + std::vector inputs; + auto it = args.begin(); + for (int i = 0; i < numInputs; ++i) { - // Recursive call to emit nested loop - CompileDimensionLoop(compiler, function, dimension + 1, inputs, thisInputDimensionOffsets, lastActiveInputDimensions, inputValues, thisOutputDimensionOffset, output); + inputs.push_back({ *it++ }); } - else + auto output = value::Array(*it++); + std::vector> indices(numInputs); + std::vector outputIndices; + int dimension = 0; + for (; it != args.end(); ++it) { - // We're in the innermost loop --- compute the value - auto outputValue = CompileOperation(inputValues); - output[outputGlobalOffset + thisOutputDimensionOffset] = outputValue; + for (int i = 0; i < numInputs; ++i) + { + indices[i].push_back(GetInputMemoryLayout(i).GetLogicalDimensionActiveSize(dimension) > 1 ? value::Scalar{ *it } : value::Scalar(0)); + } + outputIndices.push_back({ *it }); + ++dimension; } - }); - } - template - void BroadcastOperationNode::Compute() const - { - const auto& outputLayout = GetOutputMemoryLayout(); - const auto numInputs = NumInputPorts(); - - auto outputSize = outputLayout.GetMemorySize(); - auto output = std::vector(outputSize); - - const int startDimension = 0; - std::vector prevInputOffsets(numInputs, 0); - auto lastActiveInputDimensions = GetLastActiveInputDimensions(); - std::vector inputValues(numInputs); - const int startOffset = 0; - ComputeDimensionLoop(startDimension, prevInputOffsets, lastActiveInputDimensions, inputValues, startOffset, output); - - GetOutput().SetOutput(output); + output(outputIndices) = CallKernelFunction(f, inputs, indices); + }; } template - void BroadcastOperationNode::Compile(model::IRMapCompiler& compiler, emitters::IRFunctionEmitter& function) + value::Scalar BroadcastOperationNode::CallKernelFunction(FunctionType f, std::vector inputs, std::vector> indices) const { - const auto numInputs = NumInputPorts(); - - std::vector inputs; - for (int index = 0; index < numInputs; ++index) + // TODO: if FunctionType was a function that took a vector of inputs, then we could dispense with this `if constexpr` block + if constexpr(std::is_same_v) { - const auto& inputPort = GetInput(index); - auto inputVar = function.LocalArray(compiler.EnsurePortEmitted(inputPort)); - inputs.push_back(inputVar); + return f(inputs[0](indices[0])); + } + else if constexpr(std::is_same_v) + { + return f(inputs[0](indices[0]), inputs[1](indices[1])); + } + else if constexpr(std::is_same_v) + { + return f(inputs[0](indices[0]), inputs[1](indices[1]), inputs[2](indices[2])); + } + else + { + throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState); } - - auto output = function.LocalArray(compiler.EnsurePortEmitted(GetOutput(), this->GetOutputPadding())); - - const int startDimension = 0; - std::vector prevInputOffsets(numInputs, function.LocalScalar(0)); - auto lastActiveInputDimensions = GetLastActiveInputDimensions(); - std::vector inputValues(numInputs, function.LocalScalar()); - const emitters::IRLocalScalar 
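The MakeKernel adapter above encodes the broadcast rule: whenever an input's active size in a dimension is 1, that input is addressed with a constant 0 in place of the running loop index. The same rule over plain arrays, as a standalone sketch with hypothetical names (the real node routes through value::Array and the loop-nest code generator):

    #include <array>
    #include <cstddef>
    #include <functional>

    using Shape3 = std::array<size_t, 3>;

    inline size_t Offset(const Shape3& s, size_t i, size_t j, size_t k)
    {
        return (i * s[1] + j) * s[2] + k;
    }

    void BroadcastBinary(const float* a, const Shape3& aShape,
                         const float* b, const Shape3& bShape,
                         float* out, const Shape3& outShape,
                         const std::function<float(float, float)>& f)
    {
        for (size_t i = 0; i < outShape[0]; ++i)
            for (size_t j = 0; j < outShape[1]; ++j)
                for (size_t k = 0; k < outShape[2]; ++k)
                {
                    // Broadcast rule: size-1 dimensions pin their index to 0
                    const size_t ai = aShape[0] == 1 ? 0 : i, aj = aShape[1] == 1 ? 0 : j, ak = aShape[2] == 1 ? 0 : k;
                    const size_t bi = bShape[0] == 1 ? 0 : i, bj = bShape[1] == 1 ? 0 : j, bk = bShape[2] == 1 ? 0 : k;
                    out[Offset(outShape, i, j, k)] = f(a[Offset(aShape, ai, aj, ak)], b[Offset(bShape, bi, bj, bk)]);
                }
    }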
startOffset = function.LocalScalar(0); - CompileDimensionLoop(compiler, function, startDimension, inputs, prevInputOffsets, lastActiveInputDimensions, inputValues, startOffset, output); } template @@ -619,7 +482,7 @@ namespace nodes template void BroadcastOperationNode::WriteToArchive(utilities::Archiver& archiver) const { - model::CompilableNode::WriteToArchive(archiver); + model::CompilableCodeNode::WriteToArchive(archiver); auto outputLayout = GetOutputMemoryLayout(); archiver["outputLayout"] << outputLayout; archiver["padding"] << _paddingValue; @@ -628,7 +491,7 @@ namespace nodes template void BroadcastOperationNode::ReadFromArchive(utilities::Unarchiver& archiver) { - model::CompilableNode::ReadFromArchive(archiver); + model::CompilableCodeNode::ReadFromArchive(archiver); model::PortMemoryLayout outputLayout; archiver["outputLayout"] >> outputLayout; _output.SetMemoryLayout(outputLayout); @@ -641,7 +504,7 @@ namespace nodes template BroadcastUnaryOperationNode::BroadcastUnaryOperationNode() : BroadcastOperationNode({ &_input }, {}, static_cast(0)), - _input(this, {}, model::CompilableNode::defaultInputPortName), + _input(this, {}, model::CompilableCodeNode::defaultInputPortName), _operation(OperationType::none) { } @@ -649,39 +512,17 @@ namespace nodes template BroadcastUnaryOperationNode::BroadcastUnaryOperationNode(const model::OutputPort& input, OperationType operation, ValueType paddingValue) : BroadcastOperationNode({ &_input }, { &input }, paddingValue), - _input(this, input, model::CompilableNode::defaultInputPortName), + _input(this, input, model::CompilableCodeNode::defaultInputPortName), _operation(operation) { - SetOperationFunction(); } template BroadcastUnaryOperationNode::BroadcastUnaryOperationNode(const model::OutputPort& input, const model::PortMemoryLayout& outputLayout, OperationType operation, ValueType paddingValue) : BroadcastOperationNode({ &_input }, { &input }, outputLayout, paddingValue), - _input(this, input, model::CompilableNode::defaultInputPortName), + _input(this, input, model::CompilableCodeNode::defaultInputPortName), _operation(operation) { - SetOperationFunction(); - } - - template - ValueType BroadcastUnaryOperationNode::ComputeOperation(const std::vector& args) const - { - if (args.size() != 1) - { - throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState); - } - return GetFunction().Compute(args[0]); - } - - template - emitters::IRLocalScalar BroadcastUnaryOperationNode::CompileOperation(const std::vector& args) const - { - if (args.size() != 1) - { - throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState); - } - return GetFunction().Compile(args[0].function, args[0]); } template @@ -698,7 +539,7 @@ namespace nodes void BroadcastUnaryOperationNode::WriteToArchive(utilities::Archiver& archiver) const { BroadcastOperationNode::WriteToArchive(archiver); - archiver[model::CompilableNode::defaultInputPortName] << _input; + archiver[model::CompilableCodeNode::defaultInputPortName] << _input; archiver["operation"] << ToString(_operation); } @@ -706,11 +547,10 @@ namespace nodes void BroadcastUnaryOperationNode::ReadFromArchive(utilities::Unarchiver& archiver) { BroadcastOperationNode::ReadFromArchive(archiver); - archiver[model::CompilableNode::defaultInputPortName] >> _input; + archiver[model::CompilableCodeNode::defaultInputPortName] >> _input; std::string operation; archiver["operation"] >> operation; _operation = FromString(operation); - SetOperationFunction(); } // @@ -719,8 +559,8 @@ namespace nodes 
template BroadcastBinaryOperationNode::BroadcastBinaryOperationNode() : BroadcastOperationNode({ &_input1, &_input2 }, {}), - _input1(this, {}, model::CompilableNode::defaultInput1PortName), - _input2(this, {}, model::CompilableNode::defaultInput2PortName), + _input1(this, {}, model::CompilableCodeNode::defaultInput1PortName), + _input2(this, {}, model::CompilableCodeNode::defaultInput2PortName), _operation(OperationType::none) { } @@ -731,11 +571,10 @@ namespace nodes OperationType operation, ValueType paddingValue) : BroadcastOperationNode({ &_input1, &_input2 }, { &input1, &input2 }, paddingValue), - _input1(this, input1, model::CompilableNode::defaultInput1PortName), - _input2(this, input2, model::CompilableNode::defaultInput2PortName), + _input1(this, input1, model::CompilableCodeNode::defaultInput1PortName), + _input2(this, input2, model::CompilableCodeNode::defaultInput2PortName), _operation(operation) { - SetOperationFunction(); } template @@ -745,31 +584,10 @@ namespace nodes OperationType operation, ValueType paddingValue) : BroadcastOperationNode({ &_input1, &_input2 }, { &input1, &input2 }, outputLayout, paddingValue), - _input1(this, input1, model::CompilableNode::defaultInput1PortName), - _input2(this, input2, model::CompilableNode::defaultInput2PortName), + _input1(this, input1, model::CompilableCodeNode::defaultInput1PortName), + _input2(this, input2, model::CompilableCodeNode::defaultInput2PortName), _operation(operation) { - SetOperationFunction(); - } - - template - ValueType BroadcastBinaryOperationNode::ComputeOperation(const std::vector& args) const - { - if (args.size() != 2) - { - throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState); - } - return GetFunction().Compute(args[0], args[1]); - } - - template - emitters::IRLocalScalar BroadcastBinaryOperationNode::CompileOperation(const std::vector& args) const - { - if (args.size() != 2) - { - throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState); - } - return GetFunction().Compile(args[0].function, args[0], args[1]); } template @@ -788,8 +606,8 @@ namespace nodes void BroadcastBinaryOperationNode::WriteToArchive(utilities::Archiver& archiver) const { BroadcastOperationNode::WriteToArchive(archiver); - archiver[model::CompilableNode::defaultInput1PortName] << _input1; - archiver[model::CompilableNode::defaultInput1PortName] << _input2; + archiver[model::CompilableCodeNode::defaultInput1PortName] << _input1; + archiver[model::CompilableCodeNode::defaultInput2PortName] << _input2; archiver["operation"] << ToString(_operation); } @@ -797,12 +615,11 @@ namespace nodes void BroadcastBinaryOperationNode::ReadFromArchive(utilities::Unarchiver& archiver) { BroadcastOperationNode::ReadFromArchive(archiver); - archiver[model::CompilableNode::defaultInput1PortName] >> _input1; - archiver[model::CompilableNode::defaultInput1PortName] >> _input2; + archiver[model::CompilableCodeNode::defaultInput1PortName] >> _input1; + archiver[model::CompilableCodeNode::defaultInput2PortName] >> _input2; std::string operation; archiver["operation"] >> operation; _operation = FromString(operation); - SetOperationFunction(); } // @@ -811,9 +628,9 @@ namespace nodes template BroadcastTernaryOperationNode::BroadcastTernaryOperationNode() : BroadcastOperationNode({ &_input1, &_input2, &_input3 }, {}), - _input1(this, {}, model::CompilableNode::defaultInput1PortName), - _input2(this, {}, model::CompilableNode::defaultInput2PortName), - _input3(this, {}, model::CompilableNode::defaultInput3PortName), + 
_input1(this, {}, model::CompilableCodeNode::defaultInput1PortName), + _input2(this, {}, model::CompilableCodeNode::defaultInput2PortName), + _input3(this, {}, model::CompilableCodeNode::defaultInput3PortName), _operation(OperationType::none) { } @@ -825,12 +642,11 @@ namespace nodes OperationType operation, ValueType paddingValue) : BroadcastOperationNode({ &_input1, &_input2, &_input3 }, { &input1, &input2, &input3 }, paddingValue), - _input1(this, input1, model::CompilableNode::defaultInput1PortName), - _input2(this, input2, model::CompilableNode::defaultInput2PortName), - _input3(this, input3, model::CompilableNode::defaultInput3PortName), + _input1(this, input1, model::CompilableCodeNode::defaultInput1PortName), + _input2(this, input2, model::CompilableCodeNode::defaultInput2PortName), + _input3(this, input3, model::CompilableCodeNode::defaultInput3PortName), _operation(operation) { - SetOperationFunction(); } template @@ -841,32 +657,11 @@ namespace nodes OperationType operation, ValueType paddingValue) : BroadcastOperationNode({ &_input1, &_input2, &_input3 }, { &input1, &input2, &input3 }, outputLayout, paddingValue), - _input1(this, input1, model::CompilableNode::defaultInput1PortName), - _input2(this, input2, model::CompilableNode::defaultInput2PortName), - _input3(this, input3, model::CompilableNode::defaultInput3PortName), + _input1(this, input1, model::CompilableCodeNode::defaultInput1PortName), + _input2(this, input2, model::CompilableCodeNode::defaultInput2PortName), + _input3(this, input3, model::CompilableCodeNode::defaultInput3PortName), _operation(operation) { - SetOperationFunction(); - } - - template - ValueType BroadcastTernaryOperationNode::ComputeOperation(const std::vector& args) const - { - if (args.size() != 3) - { - throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState); - } - return GetFunction().Compute(args[0], args[1], args[2]); - } - - template - emitters::IRLocalScalar BroadcastTernaryOperationNode::CompileOperation(const std::vector& args) const - { - if (args.size() != 3) - { - throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState); - } - return GetFunction().Compile(args[0].function, args[0], args[1], args[2]); } template @@ -887,9 +682,9 @@ namespace nodes void BroadcastTernaryOperationNode::WriteToArchive(utilities::Archiver& archiver) const { BroadcastOperationNode::WriteToArchive(archiver); - archiver[model::CompilableNode::defaultInput1PortName] << _input1; - archiver[model::CompilableNode::defaultInput2PortName] << _input2; - archiver[model::CompilableNode::defaultInput3PortName] << _input3; + archiver[model::CompilableCodeNode::defaultInput1PortName] << _input1; + archiver[model::CompilableCodeNode::defaultInput2PortName] << _input2; + archiver[model::CompilableCodeNode::defaultInput3PortName] << _input3; archiver["operation"] << ToString(_operation); } @@ -897,45 +692,44 @@ namespace nodes void BroadcastTernaryOperationNode::ReadFromArchive(utilities::Unarchiver& archiver) { BroadcastOperationNode::ReadFromArchive(archiver); - archiver[model::CompilableNode::defaultInput1PortName] >> _input1; - archiver[model::CompilableNode::defaultInput2PortName] >> _input2; - archiver[model::CompilableNode::defaultInput3PortName] >> _input3; + archiver[model::CompilableCodeNode::defaultInput1PortName] >> _input1; + archiver[model::CompilableCodeNode::defaultInput2PortName] >> _input2; + archiver[model::CompilableCodeNode::defaultInput3PortName] >> _input3; std::string operation; archiver["operation"] >> 
operation; _operation = FromString(operation); - SetOperationFunction(); } template - void BroadcastUnaryOperationNode::SetOperationFunction() + auto BroadcastUnaryOperationNode::GetKernelFunction() const -> KernelFunctionType { switch (_operation) { case UnaryOperationType::abs: - SetFunction(AbsFunction()); + return MakeKernel(value::Abs); break; case UnaryOperationType::exp: - SetFunction(ExpFunction()); + return MakeKernel(value::Exp); break; case UnaryOperationType::log: - SetFunction(LogFunction()); + return MakeKernel(value::Log); break; case UnaryOperationType::sqrt: - SetFunction(SqrtFunction()); + return MakeKernel(value::Sqrt); break; case UnaryOperationType::logicalNot: throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "Not implemented"); case UnaryOperationType::tanh: - SetFunction(TanhFunction()); + return MakeKernel(value::Tanh); break; case UnaryOperationType::square: - SetFunction(SquareFunction()); + return MakeKernel(value::Square); break; case UnaryOperationType::sin: - SetFunction(SinFunction()); + return MakeKernel(value::Sin); break; case UnaryOperationType::cos: - SetFunction(CosFunction()); + return MakeKernel(value::Cos); break; default: throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "Illegal operation"); @@ -943,21 +737,24 @@ namespace nodes } template - void BroadcastBinaryOperationNode::SetOperationFunction() + auto BroadcastBinaryOperationNode::GetKernelFunction() const -> KernelFunctionType { switch (_operation) { case BinaryOperationType::add: - SetFunction(AddFunction()); + return MakeKernel(value::Add); break; case BinaryOperationType::subtract: - SetFunction(SubtractFunction()); + return MakeKernel(value::Subtract); break; case BinaryOperationType::multiply: - SetFunction(MultiplyFunction()); + return MakeKernel(value::Multiply); break; case BinaryOperationType::divide: - SetFunction(DivideFunction()); + return MakeKernel(value::Divide); + break; + case BinaryOperationType::modulo: + return MakeKernel(value::Modulo); break; case BinaryOperationType::logicalAnd: throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "Not implemented"); @@ -971,12 +768,12 @@ namespace nodes } template - void BroadcastTernaryOperationNode::SetOperationFunction() + auto BroadcastTernaryOperationNode::GetKernelFunction() const -> KernelFunctionType { switch (_operation) { case TernaryOperationType::fma: - SetFunction(FMAFunction()); + return MakeKernel(value::FusedMultiplyAdd); break; default: throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "Illegal operation"); diff --git a/libraries/nodes/include/IRNode.h b/libraries/nodes/include/IRNode.h index 81820b637..d94f61ef7 100644 --- a/libraries/nodes/include/IRNode.h +++ b/libraries/nodes/include/IRNode.h @@ -41,8 +41,6 @@ namespace nodes class IRNode : public model::CompilableNode { public: - IRNode() = default; - IRNode(const IRNode&) = delete; IRNode(IRNode&&) = delete; diff --git a/libraries/nodes/include/MatrixMatrixMultiplyCodeNode.h b/libraries/nodes/include/MatrixMatrixMultiplyCodeNode.h new file mode 100644 index 000000000..9b286b63c --- /dev/null +++ b/libraries/nodes/include/MatrixMatrixMultiplyCodeNode.h @@ -0,0 +1,271 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: MatrixMatrixMultiplyCodeNode.h (nodes) +// Authors: Mason Remy, Denny Sun +// 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Current gaps: +// Doesn't support transposed matrices +// Doesn't support alpha and beta values + +#pragma once + +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace ell +{ +namespace nodes +{ + /// A node that multiplies two matrices. + template + class MatrixMatrixMultiplyCodeNode : public model::CompilableCodeNode + { + public: + /// @name Input and Output Ports + /// @{ + const model::InputPort& input1 = _input1; + const model::InputPort& input2 = _input2; + const model::OutputPort& output = _output; + /// @} + + /// Default Constructor + MatrixMatrixMultiplyCodeNode(); + + /// Constructor. + /// + /// The left-hand input of the matrix multiplication, a row-major matrix of size m x k. + /// The right-hand input of the matrix multiplication, a row-major matrix of size k x n. + /// Which implementation of matrix-matrix multiplication to use + MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, const model::OutputPort& input2, const MatrixMatrixMultiplyImplementation& gemmImpl = MatrixMatrixMultiplyImplementation::DEFAULT); + + /// Constructor. + /// + /// The left-hand input of the matrix multiplication, a row-major matrix of size m x k. + /// The right-hand input of the matrix multiplication, a row-major matrix of size k x n. + /// The panel size to use in the M dimension (rows of A, C) + /// The panel size to use in the N dimension (columns of B, C) + /// The panel size to use in the K dimension (columns of A, rows of B) + /// The kernel size to use in the M dimension (rows of A, C). + /// The kernel size to use in the N dimension (columns of B, C). + /// The kernel size to use in the K dimension (columns of A, rows of B). + /// Which implementation of matrix-matrix multiplication to use + MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, const model::OutputPort& input2, int panelM, int panelN, int panelK, int kernelM, int kernelN, int kernelK, const MatrixMatrixMultiplyImplementation& gemmImpl = MatrixMatrixMultiplyImplementation::DEFAULT); + + /// Constructor. + /// + /// The left-hand input of the matrix multiplication, a row-major matrix of size m x k. + /// The right-hand input of the matrix multiplication, a row-major matrix of size k x n. + /// The output memory layout to use. + /// Which implementation of matrix-matrix multiplication to use + MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, const model::OutputPort& input2, const model::PortMemoryLayout& outputMemoryLayout, const MatrixMatrixMultiplyImplementation& gemmImpl = MatrixMatrixMultiplyImplementation::DEFAULT); + + /// Constructor. + /// + /// The left-hand input of the matrix multiplication, a row-major matrix of size m x k. + /// The right-hand input of the matrix multiplication, a row-major matrix of size k x n. + /// The output memory layout to use. + /// The panel size to use in the M dimension (rows of A, C) + /// The panel size to use in the N dimension (columns of B, C) + /// The panel size to use in the K dimension (columns of A, rows of B) + /// The kernel size to use in the M dimension (rows of A, C). + /// The kernel size to use in the N dimension (columns of B, C). + /// The kernel size to use in the K dimension (columns of A, rows of B). 
+ /// Which implementation of matrix-matrix multiplication to use + MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, const model::OutputPort& input2, const model::PortMemoryLayout& outputMemoryLayout, int panelM, int panelN, int panelK, int kernelM, int kernelN, int kernelK, const MatrixMatrixMultiplyImplementation& gemmImpl = MatrixMatrixMultiplyImplementation::DEFAULT); + + /// Constructor. + /// + /// The left-hand input of the matrix multiplication, a row-major matrix of size m x k. + /// The number of rows in the left hand input matrix and in the output matrix. + /// The number of columns in the right hand input matrix and in the output matrix. + /// The number of columns in the left hand input matrix and the number of rows in the right hand input matrix. + /// The number of elements between successive elements in a single column in the left hand input matrix. + /// The right-hand input of the matrix multiplication, a row-major matrix of size k x n. + /// The number of elements between successive elements in a single column in the right hand input matrix. + /// The number of elements between successive elements in a single column in the output matrix. + /// Which implementation of matrix-matrix multiplication to use + MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, int m, int n, int k, int matrix1Stride, const model::OutputPort& input2, int matrix2Stride, int outputMatrixStride, const MatrixMatrixMultiplyImplementation& gemmImpl = MatrixMatrixMultiplyImplementation::DEFAULT); + + /// Constructor. + /// + /// The left-hand input of the matrix multiplication, a row-major matrix of size m x k. + /// The number of rows in the left hand input matrix and in the output matrix. + /// The number of columns in the right hand input matrix and in the output matrix. + /// The number of columns in the left hand input matrix and the number of rows in the right hand input matrix. + /// The number of elements between successive elements in a single column in the left hand input matrix. + /// The right-hand input of the matrix multiplication, a row-major matrix of size k x n. + /// The number of elements between successive elements in a single column in the right hand input matrix. + /// The number of elements between successive elements in a single column in the output matrix. + /// The panel size to use in the M dimension (rows of A, C) + /// The panel size to use in the N dimension (columns of B, C) + /// The panel size to use in the K dimension (columns of A, rows of B) + /// The kernel size to use in the M dimension (rows of A, C). + /// The kernel size to use in the N dimension (columns of B, C). + /// The kernel size to use in the K dimension (columns of A, rows of B). + /// Which implementation of matrix-matrix multiplication to use + MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, int m, int n, int k, int matrix1Stride, const model::OutputPort& input2, int matrix2Stride, int outputMatrixStride, int panelM, int panelN, int panelK, int kernelM, int kernelN, int kernelK, const MatrixMatrixMultiplyImplementation& gemmImpl = MatrixMatrixMultiplyImplementation::DEFAULT); + + /// Constructor. + /// + /// The left-hand input of the matrix multiplication, a row-major matrix of size m x k. + /// The number of rows in the left hand input matrix and in the output matrix. + /// The number of columns in the right hand input matrix and in the output matrix. 
+ /// The number of columns in the left hand input matrix and the number of rows in the right hand input matrix. + /// The number of elements between successive elements in a single column in the left hand input matrix. + /// If true, transpose the left-hand input matrix. + /// The right-hand input of the matrix multiplication, a row-major matrix of size k x n. + /// The number of elements between successive elements in a single column in the right hand input matrix. + /// If true, transpose the right-hand input matrix. + /// The number of elements between successive elements in a single column in the output matrix. + /// Which implementation of matrix-matrix multiplication to use + MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, int m, int n, int k, int matrix1Stride, bool transpose1, const model::OutputPort& input2, int matrix2Stride, bool transpose2, int outputMatrixStride, const MatrixMatrixMultiplyImplementation& gemmImpl = MatrixMatrixMultiplyImplementation::DEFAULT); + + /// Constructor. + /// + /// The left-hand input of the matrix multiplication, a row-major matrix of size m x k. + /// The number of rows in the left hand input matrix and in the output matrix. + /// The number of columns in the right hand input matrix and in the output matrix. + /// The number of columns in the left hand input matrix and the number of rows in the right hand input matrix. + /// The number of elements between successive elements in a single column in the left hand input matrix. + /// If true, transpose the left-hand input matrix. + /// The right-hand input of the matrix multiplication, a row-major matrix of size k x n. + /// The number of elements between successive elements in a single column in the right hand input matrix. + /// If true, transpose the right-hand input matrix. + /// The number of elements between successive elements in a single column in the output matrix. + /// The panel size to use in the M dimension (rows of A, C) + /// The panel size to use in the N dimension (columns of B, C) + /// The panel size to use in the K dimension (columns of A, rows of B) + /// The kernel size to use in the M dimension (rows of A, C). + /// The kernel size to use in the N dimension (columns of B, C). + /// The kernel size to use in the K dimension (columns of A, rows of B). + /// Which implementation of matrix-matrix multiplication to use + MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, int m, int n, int k, int matrix1Stride, bool transpose1, const model::OutputPort& input2, int matrix2Stride, bool transpose2, int outputMatrixStride, int panelM, int panelN, int panelK, int kernelM, int kernelN, int kernelK, const MatrixMatrixMultiplyImplementation& gemmImpl = MatrixMatrixMultiplyImplementation::DEFAULT); + + /// Constructor. + /// + /// The left-hand input of the matrix multiplication, a row-major matrix of size m x k. + /// The number of rows in the left hand input matrix and in the output matrix. + /// The number of columns in the right hand input matrix and in the output matrix. + /// The number of columns in the left hand input matrix and the number of rows in the right hand input matrix. + /// The number of elements between successive elements in a single column in the left hand input matrix. + /// If true, transpose the left-hand input matrix. + /// The right-hand input of the matrix multiplication, a row-major matrix of size k x n. + /// The number of elements between successive elements in a single column in the right hand input matrix. 
+ /// If true, transpose the right-hand input matrix. + /// The number of elements between successive elements in a single column in the output matrix. + /// If true, transpose the output matrix. + /// Which implementation of matrix-matrix multiplication to use + MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, int m, int n, int k, int matrix1Stride, bool transpose1, const model::OutputPort& input2, int matrix2Stride, bool transpose2, int outputMatrixStride, bool transposeOutput, const MatrixMatrixMultiplyImplementation& gemmImpl = MatrixMatrixMultiplyImplementation::DEFAULT); + + /// Constructor. + /// + /// The left-hand input of the matrix multiplication, a row-major matrix of size m x k. + /// The number of rows in the left hand input matrix and in the output matrix. + /// The number of columns in the right hand input matrix and in the output matrix. + /// The number of columns in the left hand input matrix and the number of rows in the right hand input matrix. + /// The number of elements between successive elements in a single column in the left hand input matrix. + /// If true, transpose the left-hand input matrix. + /// The right-hand input of the matrix multiplication, a row-major matrix of size k x n. + /// The number of elements between successive elements in a single column in the right hand input matrix. + /// If true, transpose the right-hand input matrix. + /// The number of elements between successive elements in a single column in the output matrix. + /// If true, transpose the output matrix. + /// The panel size to use in the M dimension (rows of A, C) + /// The panel size to use in the N dimension (columns of B, C) + /// The panel size to use in the K dimension (columns of A, rows of B) + /// The kernel size to use in the M dimension (rows of A, C). + /// The kernel size to use in the N dimension (columns of B, C). + /// The kernel size to use in the K dimension (columns of A, rows of B). + /// Which implementation of matrix-matrix multiplication to use + MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, int m, int n, int k, int matrix1Stride, bool transpose1, const model::OutputPort& input2, int matrix2Stride, bool transpose2, int outputMatrixStride, bool transposeOutput, int panelM, int panelN, int panelK, int kernelM, int kernelN, int kernelK, const MatrixMatrixMultiplyImplementation& gemmImpl = MatrixMatrixMultiplyImplementation::DEFAULT); + + /// Gets the name of this type (for serialization). + /// + /// The name of this type. + static std::string GetTypeName() { return utilities::GetCompositeTypeName("MatrixMatrixMultiplyCodeNode"); } + + /// Gets the name of this type (for serialization). + /// + /// The name of this type. 
+ std::string GetRuntimeTypeName() const override { return GetTypeName(); } + + protected: + void Define(value::FunctionDeclaration& fn) override; + utilities::ArchiveVersion GetArchiveVersion() const override; + bool CanReadArchiveVersion(const utilities::ArchiveVersion& version) const override; + void WriteToArchive(utilities::Archiver& archiver) const override; + void ReadFromArchive(utilities::Unarchiver& archiver) override; + bool HasState() const override { return true; } // stored state: m, n, k, lda, ldb, ldc, transpose + + private: + void Copy(model::ModelTransformer& transformer) const override; + + void ZeroMatrix(value::Matrix matrix) const; + + void ForLoopGEMM(const value::Matrix matA, const value::Matrix matB, value::Matrix matC); + void Gemm(const value::Matrix matA, const value::Matrix matB, value::Matrix matC); + void GemmFn(const value::Matrix matA, const value::Matrix matB, value::Matrix matC, int thread_num = 0); + void ParallelizeGemmCol(const value::Matrix matA, const value::Matrix matB, value::Matrix matC, int numThreads = 2); + void ParallelizeGemmRow(const value::Matrix matA, const value::Matrix matB, value::Matrix matC, int numThreads = 2); + void ELLCodeGEMM(const value::Matrix matA, const value::Matrix matB, value::Matrix matC); + + // Inputs + model::InputPort _input1; + model::InputPort _input2; + + // Output + model::OutputPort _output; + + // Matrix dimensions + // Matrix 1 is MxK, Matrix 2 is KxN, Output is MxN + int _m = 0, _n = 0, _k = 0; + int _lda = 0, _ldb = 0, _ldc = 0; + bool _transpose1 = false, _transpose2 = false, _transposeOutput = false; + + // Implementation-controlling members + int _panelM; + int _panelN; + int _panelK; + int _kernelM; + int _kernelN; + int _kernelK; + MatrixMatrixMultiplyImplementation _impl; + + static const int _defaultPanelM = 64; + static const int _defaultPanelN = 64; + static const int _defaultPanelK = 64; + static const int _defaultKernelM = 4; + static const int _defaultKernelN = 4; + static const int _defaultKernelK = 4; + }; + + // + // Explicit instantiation declarations + // + extern template class MatrixMatrixMultiplyCodeNode<float>; + extern template class MatrixMatrixMultiplyCodeNode<double>; +} // namespace nodes +} // namespace ell diff --git a/libraries/nodes/include/MatrixMatrixMultiplyImplementation.h b/libraries/nodes/include/MatrixMatrixMultiplyImplementation.h new file mode 100644 index 000000000..655433e7d --- /dev/null +++ b/libraries/nodes/include/MatrixMatrixMultiplyImplementation.h @@ -0,0 +1,23 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// + // + // Project: Embedded Learning Library (ELL) + // File: MatrixMatrixMultiplyImplementation.h (nodes) + // Authors: Mason Remy, Denny Sun + // +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +namespace ell +{ +namespace nodes +{ + enum class MatrixMatrixMultiplyImplementation : int + { + SimpleForLoops = 0, + Mlas_Loopnest_Value, + LAST, + DEFAULT = Mlas_Loopnest_Value + }; +} // namespace nodes +} // namespace ell diff --git a/libraries/nodes/include/NodeOperations.h b/libraries/nodes/include/NodeOperations.h index 81a899dce..faa1779d5 100644 --- a/libraries/nodes/include/NodeOperations.h +++ b/libraries/nodes/include/NodeOperations.h @@ -59,7 +59,8 @@ namespace nodes divide, logicalAnd, logicalOr, - logicalXor + logicalXor, + modulo }; template diff --git a/libraries/nodes/include/ReorderDataCodeNode.h 
b/libraries/nodes/include/ReorderDataCodeNode.h new file mode 100644 index 000000000..37b3085e7 --- /dev/null +++ b/libraries/nodes/include/ReorderDataCodeNode.h @@ -0,0 +1,586 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// + // + // Project: Embedded Learning Library (ELL) + // File: ReorderDataCodeNode.h (nodes) + // Authors: Byron Changuion + // +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +namespace ell +{ +namespace nodes +{ + /// A node that takes data from its input and outputs it in a different order. + template + class ReorderDataCodeNode : public model::CompilableCodeNode + { + + public: + /// @name Input and Output Ports + /// @{ + const model::InputPort& input = _input; + const model::OutputPort& output = _output; + /// @} + + /// Default Constructor + ReorderDataCodeNode(); + + /// Constructor with no reordering + /// + /// The input to reorder. + /// The memory layout of the output. Data will be copied into the "active" area, and the rest will be zeroed out according to the padding value. + /// The value to fill the inactive area with. + ReorderDataCodeNode(const model::OutputPort& input, const model::PortMemoryLayout& outputMemoryLayout, ValueType paddingValue = 0); + + /// Constructor with no reordering + /// + /// The input to reorder. + /// The memory layout of the input. Only data in the "active" area will be copied. + /// The memory layout of the output. Data will be copied into the "active" area, and the rest will be zeroed out according to the padding value. + /// The value to fill the inactive area with. + ReorderDataCodeNode(const model::OutputPort& input, const model::PortMemoryLayout& inputMemoryLayout, const model::PortMemoryLayout& outputMemoryLayout, ValueType paddingValue = 0); + + /// Constructor with reordering + /// + /// The input to reorder. + /// The permutation vector to apply to the dimensions when copying. Input dimension `i` will get copied to output dimension `order[i]`. If left empty, no reordering is done. + /// For instance, to reorder the normal interleaved image order into a planar order, the `order` parameter would be + /// set to {2, 0, 1} --- reordering {row, column, channel} to {channel, row, column} + ReorderDataCodeNode(const model::OutputPort& input, const model::DimensionOrder& order); + + /// Constructor with reordering + /// + /// The input to reorder. + /// The memory layout of the output. Data will be copied into the "active" area, and the rest will be zeroed out according to the padding value. + /// The permutation vector to apply to the dimensions when copying. Input dimension `i` will get copied to output dimension `order[i]`. If left empty, no reordering is done. + /// For instance, to reorder the normal interleaved image order into a planar order, the `order` parameter would be + /// set to {2, 0, 1} --- reordering {row, column, channel} to {channel, row, column} + /// The value to fill the inactive area with. + ReorderDataCodeNode(const model::OutputPort& input, const model::PortMemoryLayout& outputMemoryLayout, const model::DimensionOrder& order, ValueType paddingValue = 0); + + /// Constructor with reordering + /// + /// The input to reorder. + /// The memory layout of the input. Only data in the "active" area is guaranteed to be copied. 
+ /// The memory layout of the output. Data will be copied into the "active" area, and the rest will be zeroed out according to the padding value. + /// The permutation vector to apply to the dimensions when copying. Input dimension `i` will get copied to output dimension `order[i]`. If left empty, no reordering is done. + /// For instance, to reorder the normal interleaved image order into a planar order, the `order` parameter would be + /// set to {2, 0, 1} --- reordering {row, column, channel} to {channel, row, column} + /// The value to fill the inactive area with. + ReorderDataCodeNode(const model::OutputPort& input, const model::PortMemoryLayout& inputMemoryLayout, const model::PortMemoryLayout& outputMemoryLayout, const model::DimensionOrder& order, ValueType paddingValue = 0); + + /// Gets information about the input memory layout + const model::PortMemoryLayout& GetInputMemoryLayout() const { return _inputMemoryLayout; } + + /// Gets information about the output memory layout + model::PortMemoryLayout GetOutputMemoryLayout() const { return _outputMemoryLayout; } + + /// Returns padding value + /// + /// Padding value + ValueType GetPaddingValue() const { return _paddingValue; } + + /// Returns true if the node can accept input with this memory layout order, else false + /// + /// The memory layout order for all the input ports + /// If the node can accept the input memory layout order, true, else false + bool CanAcceptInputLayout(const utilities::DimensionOrder& order) const override + { + return GetInputMemoryLayout().GetLogicalDimensionOrder() == order; + } + + /// Gets the name of this type (for serialization). + /// + /// The name of this type. + static std::string GetTypeName() { return utilities::GetCompositeTypeName("ReorderDataCodeNode"); } + + protected: + void Define(ell::value::FunctionDeclaration& fn) override; + void WriteToArchive(utilities::Archiver& archiver) const override; + void ReadFromArchive(utilities::Unarchiver& archiver) override; + bool HasState() const override { return true; } // stored state: input/output memory layouts and padding value + std::string GetRuntimeTypeName() const override { return GetTypeName(); } + + private: + void Copy(model::ModelTransformer& transformer) const override; + + void reorder_kernel_optimized_columns(value::Tensor source, value::Tensor dest, value::Scalar i, value::Scalar j, value::Scalar k); + void reorder_kernel_optimized_channels(value::Tensor source, value::Tensor dest, value::Scalar i, value::Scalar j, value::Scalar k); + static void reorder_kernel_basic(value::Tensor source, value::Tensor dest, value::Scalar i, value::Scalar j, value::Scalar k); + + // Inputs + model::InputPort _input; + + // Output + model::OutputPort _output; + + // Memory Layouts + model::PortMemoryLayout _inputMemoryLayout; + model::PortMemoryLayout _outputMemoryLayout; + + ValueType _paddingValue; + + // This is used in the Define function as a workaround for passing in constant Scalar values + // to the kernel + int _kernel_size; + }; + + /// Convenience function for adding a node to a model. + /// + /// The input to reorder. + /// The memory layout of the output. Data will be copied into the "active" area, and the rest will be zeroed out according to the padding value. + /// The value to fill the inactive area with. + /// + /// The output of the new node. 
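+ /// A minimal usage sketch (a sketch only: `source` stands for a float output port assumed to already exist in the model, and the shapes are illustrative). Copy a 3 x 4 input into the active area of a layout that adds one element of padding on each side, zero-filling the border:
+ ///
+ /// model::PortMemoryLayout paddedLayout(model::MemoryShape{ 3, 4 }, model::MemoryShape{ 1, 1 }, model::DimensionOrder{ 0, 1 });
+ /// const auto& padded = ReorderDataWithCodeNode(source, paddedLayout, 0.0f);
+ ///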
+ template + const model::OutputPort& ReorderDataWithCodeNode(const model::OutputPort& input, const model::PortMemoryLayout& outputMemoryLayout, ValueType paddingValue = 0); + + /// Convenience function for adding a node to a model. + /// + /// The input to reorder. + /// The memory layout of the input. Only data in the "active" area will be copied. + /// The memory layout of the output. Data will be copied into the "active" area, and the rest will be zeroed out according to the padding value. + /// The value to fill the inactive area with. + /// + /// The output of the new node. + template + const model::OutputPort& ReorderDataWithCodeNode(const model::OutputPort& input, const model::PortMemoryLayout& inputMemoryLayout, const model::PortMemoryLayout& outputMemoryLayout, ValueType paddingValue = 0); + + /// Convenience function for adding a node to a model. + /// + /// The input to reorder. + /// The memory layout of the output. Data will be copied into the "active" area, and the rest will be zeroed out according to the padding value. + /// The permutation vector to apply to the dimensions when copying. Input dimension `i` will get copied to output dimension `order[i]`. If left empty, no reordering is done. + /// For instance, to reorder the normal interleaved image order into a planar order, the `order` parameter would be + /// set to {2, 0, 1} --- reordering {row, column, channel} to {channel, row, column} + /// The value to fill the inactive area with. + /// + /// The output of the new node. + template + const model::OutputPort& ReorderDataWithCodeNode(const model::OutputPort& input, const model::PortMemoryLayout& outputMemoryLayout, const model::DimensionOrder& order, ValueType paddingValue = 0); + + /// Convenience function for adding a node to a model. + /// + /// The input to reorder. + /// The memory layout of the input. Only data in the "active" area will be copied. + /// The memory layout of the output. Data will be copied into the "active" area, and the rest will be zeroed out according to the padding value. + /// The permutation vector to apply to the dimensions when copying. Input dimension `i` will get copied to output dimension `order[i]`. If left empty, no reordering is done. + /// For instance, to reorder the normal interleaved image order into a planar order, the `order` parameter would be + /// set to {2, 0, 1} --- reordering {row, column, channel} to {channel, row, column} + /// The value to fill the inactive area with. + /// + /// The output of the new node. + template + const model::OutputPort& ReorderDataWithCodeNode(const model::OutputPort& input, const model::PortMemoryLayout& inputMemoryLayout, const model::PortMemoryLayout& outputMemoryLayout, const model::DimensionOrder& order, ValueType paddingValue = 0); + + /// Convenience function for adding a node to a model. + /// + /// The input to reorder. + /// The permutation vector to apply to the dimensions when copying. Input dimension `i` will get copied to output dimension `order[i]`. If left empty, no reordering is done. + /// For instance, to reorder the normal interleaved image order into a planar order, the `order` parameter would be + /// set to {2, 0, 1} --- reordering {row, column, channel} to {channel, row, column} + /// + /// The output of the new node. 
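+ /// A minimal usage sketch (`image` stands for an output port, assumed to exist, carrying {row, column, channel} data):
+ ///
+ /// const auto& planar = ReorderDataWithCodeNode(image, model::DimensionOrder{ 2, 0, 1 });
+ ///
+ /// This is the interleaved-to-planar case described above: the new node's output takes the input's layout permuted into channel-major order.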
+ template + const model::OutputPort& ReorderDataWithCodeNode(const model::OutputPort& input, const model::DimensionOrder& order); + +} // namespace nodes +} // namespace ell + +#pragma region implementation + +namespace ell +{ +namespace nodes +{ + // + // ReorderDataCodeNode + // + template + ReorderDataCodeNode::ReorderDataCodeNode() : + CompilableCodeNode("ReorderDataCodeNode", { &_input }, { &_output }), + _input(this, {}, defaultInputPortName), + _output(this, defaultOutputPortName, 0), + _inputMemoryLayout(utilities::MemoryShape{}), + _outputMemoryLayout(_output.GetMemoryLayout()), + _paddingValue(0), + _kernel_size(1) + {} + + // + // Without reordering ("reshape" / slicing) + // + template + ReorderDataCodeNode::ReorderDataCodeNode(const model::OutputPort& input, + const model::PortMemoryLayout& outputMemoryLayout, + ValueType paddingValue) : + CompilableCodeNode("ReorderDataCodeNode", { &_input }, { &_output }), + _input(this, input, defaultInputPortName), + _output(this, defaultOutputPortName, outputMemoryLayout), + _inputMemoryLayout(_input.GetMemoryLayout()), + _outputMemoryLayout(_output.GetMemoryLayout()), + _paddingValue(paddingValue), + _kernel_size(1) + { + if (_inputMemoryLayout.NumDimensions() != outputMemoryLayout.NumDimensions()) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, + "Error: input and output layouts must have same dimension"); + } + } + + template + ReorderDataCodeNode::ReorderDataCodeNode(const model::OutputPort& input, + const model::PortMemoryLayout& inputMemoryLayout, + const model::PortMemoryLayout& outputMemoryLayout, + ValueType paddingValue) : + CompilableCodeNode("ReorderDataCodeNode", { &_input }, { &_output }), + _input(this, input, defaultInputPortName), + _output(this, defaultOutputPortName, outputMemoryLayout), + _inputMemoryLayout(inputMemoryLayout), + _outputMemoryLayout(_output.GetMemoryLayout()), + _paddingValue(paddingValue), + _kernel_size(1) + { + if (inputMemoryLayout.NumDimensions() != outputMemoryLayout.NumDimensions()) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, + "Error: input and output layouts must have same dimension"); + } + } + + // + // With reordering ("reshape" / slicing, followed by transpose / dimension reordering) + // + template + ReorderDataCodeNode::ReorderDataCodeNode(const model::OutputPort& input, + const model::DimensionOrder& order) : + CompilableCodeNode("ReorderDataCodeNode", { &_input }, { &_output }), + _input(this, input, defaultInputPortName), + _output(this, defaultOutputPortName, _input.GetMemoryLayout().ReorderedCopy(order)), + _inputMemoryLayout(_input.GetMemoryLayout()), + _outputMemoryLayout(_output.GetMemoryLayout()), + _paddingValue(0), + _kernel_size(1) + { + if (_inputMemoryLayout.NumDimensions() != order.NumDimensions()) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, + "Error: input and output layouts must have same dimension"); + } + } + + template + ReorderDataCodeNode::ReorderDataCodeNode(const model::OutputPort& input, + const model::PortMemoryLayout& outputMemoryLayout, + const model::DimensionOrder& order, + ValueType paddingValue) : + CompilableCodeNode("ReorderDataCodeNode", { &_input }, { &_output }), + _input(this, input, defaultInputPortName), + _output(this, defaultOutputPortName, outputMemoryLayout.ReorderedCopy(order)), + _inputMemoryLayout(_input.GetMemoryLayout()), + _outputMemoryLayout(_output.GetMemoryLayout()), + _paddingValue(paddingValue), + 
_kernel_size(1) + { + if (_inputMemoryLayout.NumDimensions() != outputMemoryLayout.NumDimensions()) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, + "Error: input and output layouts must have same dimension"); + } + } + + template + ReorderDataCodeNode::ReorderDataCodeNode(const model::OutputPort& input, + const model::PortMemoryLayout& inputMemoryLayout, + const model::PortMemoryLayout& outputMemoryLayout, + const model::DimensionOrder& order, + ValueType paddingValue) : + CompilableCodeNode("ReorderDataCodeNode", { &_input }, { &_output }), + _input(this, input, defaultInputPortName), + _output(this, defaultOutputPortName, outputMemoryLayout.ReorderedCopy(order)), + _inputMemoryLayout(inputMemoryLayout), + _outputMemoryLayout(_output.GetMemoryLayout()), + _paddingValue(paddingValue), + _kernel_size(1) + { + if (inputMemoryLayout.NumDimensions() != outputMemoryLayout.NumDimensions()) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, + "Error: input and output layouts must have same dimension"); + } + } + + // + // A reorder kernel that is optimized for when channels are the minor increment + // + template + void ReorderDataCodeNode::reorder_kernel_optimized_channels(value::Tensor source, value::Tensor dest, value::Scalar i, value::Scalar j, value::Scalar k) + { + value::Vector cache = value::MakeVector(_kernel_size); + for (int l = 0; l < _kernel_size; ++l) + { + cache(l) = source(i, j, k * _kernel_size + l); + } + + for (int l = 0; l < _kernel_size; ++l) + { + dest(i, j, k * _kernel_size + l) = cache(l); + } + } + + // + // A reorder kernel that is optimized for when columns are the minor increment + // + template + void ReorderDataCodeNode::reorder_kernel_optimized_columns(value::Tensor source, value::Tensor dest, value::Scalar i, value::Scalar j, value::Scalar k) + { + value::Vector cache = value::MakeVector(_kernel_size); + for (int l = 0; l < _kernel_size; ++l) + { + cache(l) = source(i, j * _kernel_size + l, k); + } + + for (int l = 0; l < _kernel_size; ++l) + { + dest(i, j * _kernel_size + l, k) = cache(l); + } + } + + // + // A basic, unoptimized reorder kernel + // + template + void ReorderDataCodeNode::reorder_kernel_basic(value::Tensor source, value::Tensor dest, value::Scalar i, value::Scalar j, value::Scalar k) + { + dest(i, j, k) = source(i, j, k); + } + + template + void ReorderDataCodeNode::Define(ell::value::FunctionDeclaration& fn) + { + (void)fn.Define([this](const value::Value value_input, value::Value output) { + namespace loopnests = ell::value::loopnests; + + auto input = value_input; + // Set the layout to use for the input view. + input.SetLayout(_inputMemoryLayout); + + // Check if this is a Tensor + if (input.GetLayout().NumDimensions() == 3) + { + auto data = value::Tensor(input); + auto result = value::Tensor(output); + + _kernel_size = 8; + if (output.GetLayout().GetLogicalDimensionOrder() == utilities::DimensionOrder({ 0, 1, 2 })) + { + if (result.Channels() % _kernel_size != 0) + { + _kernel_size = 4; + } + } + else if (output.GetLayout().GetLogicalDimensionOrder() == utilities::DimensionOrder({ 2, 0, 1 })) + { + if (result.Columns() % _kernel_size != 0) + { + _kernel_size = 4; + } + } + + // Check the order to see which kernel to use. Additionally, verify that an optimized kernel can run on this input, else fallback + // to the simple one. 
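+ // (Worked example of the selection above, for a {0, 1, 2}-ordered output: with 64 channels,
+ // _kernel_size stays 8 and the channel-optimized kernel is used; with 12 channels it drops to 4;
+ // with 3 channels neither 8 nor 4 divides evenly, so the basic one-element-at-a-time kernel runs.)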
+ if (output.GetLayout().GetLogicalDimensionOrder() == utilities::DimensionOrder({ 2, 0, 1 }) && result.Columns() % _kernel_size == 0) + { + // Declare the indexes + loopnests::IndexRange i("i", { 0, (int)(data.Rows()) }); + loopnests::IndexRange j("j", { 0, (int)(data.Columns() / _kernel_size) }); + loopnests::IndexRange k("k", { 0, (int)(data.Channels()) }); + + auto kernel = loopnests::Kernel("kernel") + .Inputs(input, output) + .Indices(i.GetIndex(), j.GetIndex(), k.GetIndex()) + .Define([this](value::Tensor source, value::Tensor dest, value::Scalar i, value::Scalar j, value::Scalar k) { + reorder_kernel_optimized_columns(source, dest, i, j, k); + }); + + loopnests::LoopNest loop(std::vector{ i, j, k }); + loop.AddKernel(kernel); + + loopnests::CodeGenerator generator; + generator.Run(loop); + } + else if (output.GetLayout().GetLogicalDimensionOrder() == utilities::DimensionOrder({ 0, 1, 2 }) && result.Channels() % _kernel_size == 0) + { + // Declare the indexes + loopnests::IndexRange i("i", { 0, (int)(data.Rows()) }); + loopnests::IndexRange j("j", { 0, (int)(data.Columns()) }); + loopnests::IndexRange k("k", { 0, (int)(data.Channels() / _kernel_size) }); + + auto kernel = loopnests::Kernel("kernel") + .Inputs(input, output) + .Indices(i.GetIndex(), j.GetIndex(), k.GetIndex()) + .Define([this](value::Tensor source, value::Tensor dest, value::Scalar i, value::Scalar j, value::Scalar k) { + reorder_kernel_optimized_channels(source, dest, i, j, k); + }); + + loopnests::LoopNest loop(std::vector{ i, j, k }); + loop.AddKernel(kernel); + + loopnests::CodeGenerator generator; + generator.Run(loop); + } + else + { + // Declare the indexes + loopnests::IndexRange i("i", { 0, (int)(data.Rows()) }); + loopnests::IndexRange j("j", { 0, (int)(data.Columns()) }); + loopnests::IndexRange k("k", { 0, (int)(data.Channels()) }); + + // This is the basic fallback kernel that can do one element at a time + auto kernel = loopnests::Kernel("kernel") + .Inputs(input, output) + .Indices(i.GetIndex(), j.GetIndex(), k.GetIndex()) + .Define(reorder_kernel_basic); + + loopnests::LoopNest loop(std::vector{ i, j, k }); + loop.AddKernel(kernel); + + loopnests::CodeGenerator generator; + generator.Run(loop); + } + } + else if (input.GetLayout().NumDimensions() == 2) + { + auto data = value::Matrix(input); + auto result = value::Matrix(output); + + value::Scalar v = value::Allocate(result.Type(), ell::utilities::ScalarLayout); + For(data, [&](value::Scalar row, value::Scalar column) { + result(row, column) = data(row, column); + }); + } + else + { + auto data = value::Vector(input); + auto result = value::Vector(output); + + For(data, [&](value::Scalar index) { + result[index] = data[index]; + }); + } + }); + } + + template + void ReorderDataCodeNode::WriteToArchive(utilities::Archiver& archiver) const + { + CompilableNode::WriteToArchive(archiver); + archiver[defaultInputPortName] << _input; + archiver["inputLayout"] << _inputMemoryLayout; + archiver["outputLayout"] << _outputMemoryLayout; + archiver["paddingValue"] << _paddingValue; + } + + template + void ReorderDataCodeNode::ReadFromArchive(utilities::Unarchiver& archiver) + { + CompilableNode::ReadFromArchive(archiver); + archiver[defaultInputPortName] >> _input; + archiver["inputLayout"] >> _inputMemoryLayout; + archiver["outputLayout"] >> _outputMemoryLayout; + archiver["paddingValue"] >> _paddingValue; + _output.SetMemoryLayout(_outputMemoryLayout); + } + + template + void ReorderDataCodeNode::Copy(model::ModelTransformer& transformer) const + { + const 
auto& newInputs = transformer.GetCorrespondingInputs(_input); + auto newNode = transformer.AddNode>(newInputs, _inputMemoryLayout, _outputMemoryLayout, _paddingValue); + transformer.MapNodeOutput(output, newNode->output); + } + + template + const model::OutputPort& ReorderDataWithCodeNode(const model::OutputPort& input, const model::PortMemoryLayout& outputMemoryLayout, ValueType paddingValue) + { + model::Model* model = input.GetNode()->GetModel(); + if (model == nullptr) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "Input not part of a model"); + } + auto node = model->AddNode>(input, outputMemoryLayout, paddingValue); + return node->output; + } + + template + const model::OutputPort& ReorderDataWithCodeNode(const model::OutputPort& input, const model::PortMemoryLayout& inputMemoryLayout, const model::PortMemoryLayout& outputMemoryLayout, ValueType paddingValue) + { + model::Model* model = input.GetNode()->GetModel(); + if (model == nullptr) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "Input not part of a model"); + } + auto node = model->AddNode>(input, inputMemoryLayout, outputMemoryLayout, paddingValue); + return node->output; + } + + template + const model::OutputPort& ReorderDataWithCodeNode(const model::OutputPort& input, const model::PortMemoryLayout& outputMemoryLayout, const model::DimensionOrder& order, ValueType paddingValue) + { + model::Model* model = input.GetNode()->GetModel(); + if (model == nullptr) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "Input not part of a model"); + } + auto node = model->AddNode>(input, outputMemoryLayout, order, paddingValue); + return node->output; + } + + template + const model::OutputPort& ReorderDataWithCodeNode(const model::OutputPort& input, const model::PortMemoryLayout& inputMemoryLayout, const model::PortMemoryLayout& outputMemoryLayout, const model::DimensionOrder& order, ValueType paddingValue) + { + model::Model* model = input.GetNode()->GetModel(); + if (model == nullptr) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "Input not part of a model"); + } + auto node = model->AddNode>(input, inputMemoryLayout, outputMemoryLayout, order, paddingValue); + return node->output; + } + + template + const model::OutputPort& ReorderDataWithCodeNode(const model::OutputPort& input, const model::DimensionOrder& order) + { + model::Model* model = input.GetNode()->GetModel(); + if (model == nullptr) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "Input not part of a model"); + } + auto node = model->AddNode>(input, order); + return node->output; + } + +} // namespace nodes +} // namespace ell + +#pragma endregion implementation diff --git a/libraries/nodes/include/SpatialConvolutionNode.h b/libraries/nodes/include/SpatialConvolutionNode.h new file mode 100644 index 000000000..63bdaf803 --- /dev/null +++ b/libraries/nodes/include/SpatialConvolutionNode.h @@ -0,0 +1,239 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: SpatialConvolutionNode.h (nodes) +// Authors: Byron Changuion +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include + 
+#include +#include + +namespace ell +{ +namespace nodes +{ + /// A node that performs the spatial convolution in a depthwise separable + /// convolutional model. By definition, this node requires: + /// - Number of input channels per weights filter to be 1 + /// - Number of filters must equal number of input channels + /// + template + class SpatialConvolutionNode : public model::CompilableCodeNode + { + + public: + using LayerType = predictors::neural::ConvolutionalLayer; + + /// @name Input and Output Ports + /// @{ + const model::InputPort& input = _input; + const model::OutputPort& output = _output; + /// @} + + /// Default Constructor + SpatialConvolutionNode(); + + /// Constructor from a layer. + /// + /// + /// The convolutional layer to wrap. + SpatialConvolutionNode(const model::OutputPort& input, const LayerType& layer, const model::PortMemoryLayout& outputMemoryLayout); + + /// Returns true if the node can accept input with this memory layout order, else false + /// + /// The memory layout order for all the input ports + /// If the node can accept the input memory layout order, true, else false + bool CanAcceptInputLayout(const utilities::DimensionOrder& order) const override + { + return true; + } + + /// Gets the name of this type (for serialization). + /// + /// The name of this type. + static std::string GetTypeName() { return utilities::GetCompositeTypeName("SpatialConvolutionNode"); } + + protected: + void Define(ell::value::FunctionDeclaration& fn) override; + void WriteToArchive(utilities::Archiver& archiver) const override; + void ReadFromArchive(utilities::Unarchiver& archiver) override; + bool HasState() const override { return true; } // stored state: convolutional parameters + std::string GetRuntimeTypeName() const override { return GetTypeName(); } + + private: + void Copy(model::ModelTransformer& transformer) const override; + + // Called with output i, j, k + void spatial_convolutional_kernel(value::Tensor output, value::Tensor input, value::Tensor weights, value::Scalar i, value::Scalar j, value::Scalar k); + + // Inputs + model::InputPort _input; + + // Output + model::OutputPort _output; + + // Convolutional layer + LayerType _layer; + }; + +} // namespace nodes +} // namespace ell + +#pragma region implementation + +namespace ell +{ +namespace nodes +{ + // + // SpatialConvolutionNode + // + template + SpatialConvolutionNode::SpatialConvolutionNode() : + CompilableCodeNode("SpatialConvolutionNode", { &_input }, { &_output }), + _input(this, {}, defaultInputPortName), + _output(this, defaultOutputPortName, 0) + {} + + // + // SpatialConvolutionNode + // + template + SpatialConvolutionNode::SpatialConvolutionNode(const model::OutputPort& input, + const LayerType& layer, + const model::PortMemoryLayout& outputMemoryLayout) : + CompilableCodeNode("SpatialConvolutionNode", { &_input }, { &_output }), + _input(this, input, defaultInputPortName), + _output(this, defaultOutputPortName, outputMemoryLayout), + _layer(layer) + { + const auto& weights = _layer.GetWeights(); + if (weights.NumChannels() != 1) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, + "Error: weights for Spatial Convolution must have single channel"); + } + if (_input.GetMemoryLayout().GetLogicalDimensionExtent(2) != _output.GetMemoryLayout().GetLogicalDimensionExtent(2)) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, + "Error: input and output number of channels must match for Spatial Convolution"); + } + } + + // + 
// A spatial convolution kernel + // + template + void SpatialConvolutionNode::spatial_convolutional_kernel(value::Tensor output, value::Tensor input, value::Tensor weights, value::Scalar row, value::Scalar column, value::Scalar channel) + { + const auto& parameters = _layer.GetConvolutionalParameters(); + const int receptiveFieldRows = (int)parameters.receptiveField; + const int receptiveFieldColumns = (int)parameters.receptiveField; + const int rowStride = (int)parameters.stride; + const int columnStride = (int)parameters.stride; + + value::Scalar temp = value::Allocate(output.GetValue().GetBaseType(), ell::utilities::ScalarLayout); + temp = static_cast(0.0); + + // Unroll the calculations for the receptive field size in row and column dimensions + for (int k_r = 0; k_r < receptiveFieldRows; ++k_r) + { + for (int k_c = 0; k_c < receptiveFieldColumns; ++k_c) + { + // Weight filters are stacked in the row dimension. For spatial convolutions, the weights channel index is always 0 + // since there is only one channel per filter. + temp += input(row * rowStride + k_r, column * columnStride + k_c, channel) * weights(channel * receptiveFieldRows + k_r, k_c, 0); + } + } + output(row, column, channel) = temp; + } + + template + void SpatialConvolutionNode::Define(ell::value::FunctionDeclaration& fn) + { + (void)fn.Define([this](const value::Tensor input_tensor, value::Tensor output) { + namespace loopnests = ell::value::loopnests; + + value::Value input_value(input_tensor.GetValue()); + input_value.SetLayout(utilities::MemoryLayout(input_value.GetLayout().GetExtent(), input_value.GetLayout().GetLogicalDimensionOrder())); + value::Tensor input(input_value); + + // Declare constants + const auto& w = _layer.GetWeights(); + std::vector data = w.ToArray(); + value::Tensor weights({ data, + utilities::MemoryLayout({ static_cast(w.NumRows()), static_cast(w.NumColumns()), static_cast(w.NumChannels()) }, + utilities::DimensionOrder(utilities::RowMajorTensorOrder)) }); + + // Declare the indexes + loopnests::IndexRange i("i", { 0, (int)(output.Rows()) }); + loopnests::IndexRange j("j", { 0, (int)(output.Columns()) }); + loopnests::IndexRange k("k", { 0, (int)(output.Channels()) }); + + auto kernel = loopnests::Kernel("kernel") + .Inputs(output.GetValue(), input.GetValue(), weights.GetValue()) + .Indices(i.GetIndex(), j.GetIndex(), k.GetIndex()) + .Define([this](value::Tensor output, value::Tensor input, value::Tensor weights, value::Scalar row, value::Scalar column, value::Scalar channel) { + spatial_convolutional_kernel(output, input, weights, row, column, channel); + }); + + loopnests::LoopNest loop(std::vector{ i, j, k }); + loop.AddKernel(kernel); + loop.SetLoopOrder({ k.GetIndex(), i.GetIndex(), j.GetIndex() }); + + loopnests::CodeGenerator generator; + generator.Run(loop); + }); + } + + template + void SpatialConvolutionNode::WriteToArchive(utilities::Archiver& archiver) const + { + Node::WriteToArchive(archiver); + archiver[defaultInputPortName] << _input; + archiver["outputLayout"] << _output.GetMemoryLayout(); + archiver["layer"] << _layer; + } + + template + void SpatialConvolutionNode::ReadFromArchive(utilities::Unarchiver& archiver) + { + Node::ReadFromArchive(archiver); + archiver[defaultInputPortName] >> _input; + model::PortMemoryLayout outputMemoryLayout; + archiver["outputLayout"] >> outputMemoryLayout; + _output.SetMemoryLayout(outputMemoryLayout); + archiver["layer"] >> _layer; + } + + template + void SpatialConvolutionNode::Copy(model::ModelTransformer& transformer) const + { + const 
auto& newInputs = transformer.GetCorrespondingInputs(_input); + auto newNode = transformer.AddNode>(newInputs, _layer, _output.GetMemoryLayout()); + transformer.MapNodeOutput(output, newNode->output); + } + +} // namespace nodes +} // namespace ell + +#pragma endregion implementation diff --git a/libraries/nodes/include/UnrolledConvolutionNode.h b/libraries/nodes/include/UnrolledConvolutionNode.h index 142c5a8c5..98dd91d66 100644 --- a/libraries/nodes/include/UnrolledConvolutionNode.h +++ b/libraries/nodes/include/UnrolledConvolutionNode.h @@ -106,6 +106,7 @@ namespace nodes void Copy(model::ModelTransformer& transformer) const override; MatrixType GetWeightsMatrix(const ConstTensorReferenceType& weightsTensor) const; + bool IsELLCodeTarget(model::ModelTransformer& transformer) const; // Input model::InputPort _input; diff --git a/libraries/nodes/src/BinaryConvolutionalLayerNode.cpp b/libraries/nodes/src/BinaryConvolutionalLayerNode.cpp index aa8e617a5..68fa1a8e2 100644 --- a/libraries/nodes/src/BinaryConvolutionalLayerNode.cpp +++ b/libraries/nodes/src/BinaryConvolutionalLayerNode.cpp @@ -8,7 +8,7 @@ #include "BinaryConvolutionalLayerNode.h" #include "ConstantNode.h" -#include "ReorderDataNode.h" +#include "ReorderDataCodeNode.h" #include #include @@ -240,7 +240,7 @@ namespace nodes // Output of xnor is in (f x h x w) order, need to transpose to the canonical (h x w x f) order model::PortMemoryLayout outputShape(model::MemoryShape{ numFilters, outputImageHeight, outputImageWidth }, model::DimensionOrder{ 2, 0, 1 }); // Note: memory layout constructor takes the sizes in physical dimension order model::PortMemoryLayout transposedOutputShape(model::MemoryShape{ outputImageHeight, outputImageWidth, numFilters }, model::MemoryShape{ outputDataPadding, outputDataPadding, 0 }, model::DimensionOrder{ 0, 1, 2 }); - const auto& reorderedOutput = ReorderData(xnorOutput, outputShape, transposedOutputShape); + const auto& reorderedOutput = ReorderDataWithCodeNode(xnorOutput, outputShape, transposedOutputShape); transformer.MapNodeOutput(this->output, reorderedOutput); return true; } diff --git a/libraries/nodes/src/ConvolutionalLayerNode.cpp b/libraries/nodes/src/ConvolutionalLayerNode.cpp index d992ede9a..26d288d62 100644 --- a/libraries/nodes/src/ConvolutionalLayerNode.cpp +++ b/libraries/nodes/src/ConvolutionalLayerNode.cpp @@ -8,8 +8,9 @@ #include "ConvolutionalLayerNode.h" #include "DiagonalConvolutionNode.h" -#include "ReorderDataNode.h" +#include "ReorderDataCodeNode.h" #include "SimpleConvolutionNode.h" +#include "SpatialConvolutionNode.h" #include "UnrolledConvolutionNode.h" #include "WinogradConvolutionNode.h" @@ -48,7 +49,7 @@ namespace nodes auto convInputLayout = originalInputLayout.ReorderedCopy({ shouldReorderToChannelMajor ? utilities::ChannelMajorTensorOrder : utilities::RowMajorTensorOrder }); auto convOutputLayout = originalOutputLayout.ReorderedCopy({ shouldReorderToChannelMajor ? 
utilities::ChannelMajorTensorOrder : utilities::RowMajorTensorOrder }); - const auto& preConvReorder = ReorderData(*newInput, originalInputLayout, convInputLayout); + const auto& preConvReorder = ReorderDataWithCodeNode(*newInput, originalInputLayout, convInputLayout); newInput = &preConvReorder; const model::OutputPort* convOutput; @@ -57,8 +58,16 @@ { case ConvolutionMethod::simple: { - auto convNode = transformer.AddNode>(*newInput, convInputLayout, convOutputLayout, weights, convParams.stride); - convOutput = &convNode->output; + if (isDepthwiseSeparable) + { + auto convNode = transformer.AddNode>(*newInput, this->GetLayer(), convOutputLayout); + convOutput = &convNode->output; + } + else + { + auto convNode = transformer.AddNode>(*newInput, convInputLayout, convOutputLayout, weights, convParams.stride); + convOutput = &convNode->output; + } } break; case ConvolutionMethod::unrolled: @@ -86,7 +95,7 @@ // Copy metadata const_cast(convOutput->GetNode())->GetMetadata() = this->GetMetadata(); - const auto& postConvReorder = ReorderData(*convOutput, originalOutputLayout); + const auto& postConvReorder = ReorderDataWithCodeNode(*convOutput, originalOutputLayout); transformer.MapNodeOutput(this->output, postConvReorder); return true; diff --git a/libraries/nodes/src/MatrixMatrixMultiplyCodeNode.cpp b/libraries/nodes/src/MatrixMatrixMultiplyCodeNode.cpp new file mode 100644 index 000000000..bfe311f55 --- /dev/null +++ b/libraries/nodes/src/MatrixMatrixMultiplyCodeNode.cpp @@ -0,0 +1,549 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// + // + // Project: Embedded Learning Library (ELL) + // File: MatrixMatrixMultiplyCodeNode.cpp (nodes) + // Authors: Mason Remy, Denny Sun + // +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +//using namespace ell::utilities; +using namespace ell::value; + +namespace ell +{ +namespace nodes +{ + template + MatrixMatrixMultiplyCodeNode::MatrixMatrixMultiplyCodeNode() : + CompilableCodeNode("MatrixMatrixMultiplyCodeNode", { &_input1, &_input2 }, { &_output }), + _input1(this, {}, defaultInput1PortName), + _input2(this, {}, defaultInput2PortName), + _output(this, defaultOutputPortName, 0), + _impl(MatrixMatrixMultiplyImplementation::DEFAULT) + { + } + + template + MatrixMatrixMultiplyCodeNode::MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, const model::OutputPort& input2, const MatrixMatrixMultiplyImplementation& gemmImpl) : + MatrixMatrixMultiplyCodeNode(input1, input2, _defaultPanelM, _defaultPanelN, _defaultPanelK, _defaultKernelM, _defaultKernelN, _defaultKernelK, gemmImpl) + { + } + + template + MatrixMatrixMultiplyCodeNode::MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, const model::OutputPort& input2, int panelM, int panelN, int panelK, int kernelM, int kernelN, int kernelK, const MatrixMatrixMultiplyImplementation& gemmImpl) : + MatrixMatrixMultiplyCodeNode(input1, input2, model::PortMemoryLayout({ input1.GetMemoryLayout().GetActiveSize(0), input2.GetMemoryLayout().GetActiveSize(1) }), panelM, panelN, panelK, kernelM, kernelN, kernelK, gemmImpl) + { + } + + template + MatrixMatrixMultiplyCodeNode::MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, const model::OutputPort& input2, const model::PortMemoryLayout& 
outputLayout, const MatrixMatrixMultiplyImplementation& gemmImpl) : + MatrixMatrixMultiplyCodeNode(input1, input2, outputLayout, _defaultPanelM, _defaultPanelN, _defaultPanelK, _defaultKernelM, _defaultKernelN, _defaultKernelK, gemmImpl) + { + } + + template + MatrixMatrixMultiplyCodeNode::MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, const model::OutputPort& input2, const model::PortMemoryLayout& outputLayout, int panelM, int panelN, int panelK, int kernelM, int kernelN, int kernelK, const MatrixMatrixMultiplyImplementation& gemmImpl) : + MatrixMatrixMultiplyCodeNode::MatrixMatrixMultiplyCodeNode(input1, + input1.GetMemoryLayout().GetLogicalDimensionActiveSize(0), + input2.GetMemoryLayout().GetLogicalDimensionActiveSize(1), + input1.GetMemoryLayout().GetLogicalDimensionActiveSize(1), + input1.GetMemoryLayout().GetExtent(1), + !input1.GetMemoryLayout().IsCanonicalOrder(), + input2, + input2.GetMemoryLayout().GetExtent(1), + !input2.GetMemoryLayout().IsCanonicalOrder(), + outputLayout.GetExtent(1), + !outputLayout.IsCanonicalOrder(), + panelM, + panelN, + panelK, + kernelM, + kernelN, + kernelK, + gemmImpl) + { + if (input1.GetMemoryLayout().NumDimensions() != 2 || input2.GetMemoryLayout().NumDimensions() != 2) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "Input matrices must have a memory layout with 2 dimensions"); + } + if (_k != input2.GetMemoryLayout().GetLogicalDimensionActiveSize(0)) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "Input matrices incompatible"); + } + } + + template + MatrixMatrixMultiplyCodeNode::MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, int m, int n, int k, int matrix1Stride, const model::OutputPort& input2, int matrix2Stride, int outputMatrixStride, const MatrixMatrixMultiplyImplementation& gemmImpl) : + MatrixMatrixMultiplyCodeNode(input1, m, n, k, matrix1Stride, input2, matrix2Stride, outputMatrixStride, _defaultPanelM, _defaultPanelN, _defaultPanelK, _defaultKernelM, _defaultKernelN, _defaultKernelK, gemmImpl) + { + } + + template + MatrixMatrixMultiplyCodeNode::MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, int m, int n, int k, int matrix1Stride, const model::OutputPort& input2, int matrix2Stride, int outputMatrixStride, int panelM, int panelN, int panelK, int kernelM, int kernelN, int kernelK, const MatrixMatrixMultiplyImplementation& gemmImpl) : + MatrixMatrixMultiplyCodeNode(input1, m, n, k, matrix1Stride, false, input2, matrix2Stride, false, outputMatrixStride, false, panelM, panelN, panelK, kernelM, kernelN, kernelK, gemmImpl) + { + } + + template + MatrixMatrixMultiplyCodeNode::MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, int m, int n, int k, int matrix1Stride, bool transpose1, const model::OutputPort& input2, int matrix2Stride, bool transpose2, int outputMatrixStride, const MatrixMatrixMultiplyImplementation& gemmImpl) : + MatrixMatrixMultiplyCodeNode(input1, m, n, k, matrix1Stride, transpose1, input2, matrix2Stride, transpose2, outputMatrixStride, _defaultPanelM, _defaultPanelN, _defaultPanelK, _defaultKernelM, _defaultKernelN, _defaultKernelK, gemmImpl) + { + } + + template + MatrixMatrixMultiplyCodeNode::MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, int m, int n, int k, int matrix1Stride, bool transpose1, const model::OutputPort& input2, int matrix2Stride, bool transpose2, int outputMatrixStride, int panelM, int panelN, int panelK, int kernelM, int kernelN, int kernelK, const 
MatrixMatrixMultiplyImplementation& gemmImpl) : + MatrixMatrixMultiplyCodeNode(input1, m, n, k, matrix1Stride, transpose1, input2, matrix2Stride, transpose2, outputMatrixStride, false, panelM, panelN, panelK, kernelM, kernelN, kernelK, gemmImpl) + { + } + + template + MatrixMatrixMultiplyCodeNode::MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, int m, int n, int k, int matrix1Stride, bool transpose1, const model::OutputPort& input2, int matrix2Stride, bool transpose2, int outputMatrixStride, bool transposeOutput, const MatrixMatrixMultiplyImplementation& gemmImpl) : + MatrixMatrixMultiplyCodeNode(input1, m, n, k, matrix1Stride, transpose1, input2, matrix2Stride, transpose2, outputMatrixStride, transposeOutput, _defaultPanelM, _defaultPanelN, _defaultPanelK, _defaultKernelM, _defaultKernelN, _defaultKernelK, gemmImpl) + { + } + + template + MatrixMatrixMultiplyCodeNode::MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, int m, int n, int k, int matrix1Stride, bool transpose1, const model::OutputPort& input2, int matrix2Stride, bool transpose2, int outputMatrixStride, bool transposeOutput, int panelM, int panelN, int panelK, int kernelM, int kernelN, int kernelK, const MatrixMatrixMultiplyImplementation& gemmImpl) : + CompilableCodeNode("MatrixMatrixMultiplyCodeNode", { &_input1, &_input2 }, { &_output }), + _input1(this, input1, defaultInput1PortName), + _input2(this, input2, defaultInput2PortName), + _output(this, defaultOutputPortName, utilities::MemoryShape{ m, n }), + _m(m), + _n(n), + _k(k), + _lda(matrix1Stride), + _ldb(matrix2Stride), + _ldc(outputMatrixStride), + _transpose1(transpose1), + _transpose2(transpose2), + _transposeOutput(transposeOutput), + _panelM(panelM), + _panelN(panelN), + _panelK(panelK), + _kernelM(kernelM), + _kernelN(kernelN), + _kernelK(kernelK), + _impl(gemmImpl) + { + if (static_cast(input1.Size()) != m * k) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "Input matrix 1 size incorrect"); + } + + if (static_cast(input2.Size()) != k * n) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "Input matrix 2 size incorrect"); + } + } + + template + void MatrixMatrixMultiplyCodeNode::ZeroMatrix(value::Matrix matrix) const + { + namespace loopnests = ell::value::loopnests; + int M = (int)(matrix.Rows()); + int N = (int)(matrix.Columns()); + loopnests::Index m("m"), n("n"); + loopnests::LoopNest zeroingLoop({ { m, { 0, M } }, + { n, { 0, N } } }); + + auto [mKernelOuter, mKernelInner] = zeroingLoop.Split(m, _kernelM); + auto [nKernelOuter, nKernelInner] = zeroingLoop.Split(n, _kernelN); + auto zeroingKernel = loopnests::Kernel("Zero_output") + .Inputs(matrix.GetValue()) + .Indices(m, n) + .Define([&](value::Matrix C, value::Scalar row, value::Scalar col) { + C(row, col) = static_cast(0); + }); + zeroingLoop.AddKernel(zeroingKernel); + zeroingLoop.Unroll(mKernelInner); + zeroingLoop.Unroll(nKernelInner); + loopnests::CodeGenerator zeroingGenerator; + zeroingGenerator.Run(zeroingLoop); + } + + template + void MatrixMatrixMultiplyCodeNode::ForLoopGEMM(const value::Matrix matA, const value::Matrix matB, value::Matrix matC) + { + namespace loopnests = ell::value::loopnests; + // Currently treat beta as 0 + ZeroMatrix(matC); + + int M = (int)(matA.Rows()); + int N = (int)(matB.Columns()); + int K = (int)(matA.Columns()); + + loopnests::Index m("m"), n("n"), k("k"); + loopnests::LoopNest loop({ { m, { 0, M } }, + { n, { 0, N } }, + { k, { 0, K } } }); + + // innermost GEMM 
kernel + auto kernel = loopnests::Kernel("GEMMKernel") + .Inputs(matA.GetValue(), matB.GetValue(), matC.GetValue()) + .Indices(m, n, k) + .Define([](value::Matrix A, value::Matrix B, value::Matrix C, value::Scalar m, value::Scalar n, value::Scalar k) { + C(m, n) += A(m, k) * B(k, n); + }); + + loop.AddKernel(kernel, loopnests::LoopFragmentType::body); + loop.SetLoopOrder({ m, k, n }); + auto outputC = matC.GetValue(); + outputC.SetLayout({ { (int)matC.Size() } }); + // ell::DebugPrintVector(outputC); + loopnests::CodeGenerator generator; + generator.Run(loop); + } + + template + void MatrixMatrixMultiplyCodeNode::Gemm(value::Matrix A, value::Matrix B, value::Matrix C) + { + using namespace value; + + int vectorSize = 4; + int NumRowsInKernel = 2; + + InvokeForContext([&](LLVMContext& context) { + auto targetMachine = context.GetModuleEmitter().GetTargetMachine(); + auto fn = context.GetFunctionEmitter().GetFunction(); + auto info = targetMachine->getTargetTransformInfo(*fn); + // See https://llvm.org/doxygen/classllvm_1_1TargetTransformInfo.html for the big list of amazing things you can get from this TargetMachineInfo object + vectorSize = static_cast(info.getRegisterBitWidth(true)) / (8 * sizeof(float)); + if (vectorSize > 8) + { + // The vector width is 16 floats instead of 8 (e.g. in AVX-512), so double the sizes as needed + vectorSize = 16; + NumRowsInKernel = 12; + } + }); + + int NumColumnsInKernel = 2 * vectorSize; + if (NumColumnsInKernel > (int)B.Columns()) + { + NumColumnsInKernel = vectorSize; + NumRowsInKernel *= 2; + } + + // Declare and/or calculate constants + const int OutputRows = (int)(A.Rows()); + const int OutputColumns = (int)(B.Columns()); + const int InnerDimension = (int)(A.Columns()); + const int kUnroll = 4; + int columnBlock = std::min(64, OutputColumns); + int innerDimensionBlock = std::min(256, InnerDimension); + + // Declare indexes + loopnests::Index i("i"), j("j"), k("k"); + // Define LoopNest + auto nest = Using({ A, B }, ArgumentType::Input) + .Using({ C }, ArgumentType::Output) + .ForAll(i, 0, OutputRows) + .ForAll(j, 0, OutputColumns) + .ForAll(k, 0, InnerDimension) + .Do([](Matrix A_, Matrix B_, Matrix C_, Scalar i_, Scalar j_, Scalar k_) { + C_(i_, j_) += B_(k_, j_) * A_(i_, k_); + }); + auto& schedule = nest.GetSchedule(); + + auto topLevelJ = j; + auto topLevelK = k; + + // Declare splits + auto jCache = schedule.Split(j, columnBlock); + auto kCache = schedule.Split(k, innerDimensionBlock); + auto kBlock = schedule.Split(k, kUnroll); + auto jKernelOuter2 = schedule.Split(j, NumColumnsInKernel); + auto jKernelOuter = schedule.Split(j, vectorSize); + auto iKernelOuter = schedule.Split(i, NumRowsInKernel); + + // Set the order + schedule.SetOrder({ jCache, kCache, iKernelOuter, jKernelOuter2, kBlock, k, i, jKernelOuter, j }); + + // Set up caching + if ((OutputColumns > NumColumnsInKernel) && ((OutputColumns % NumColumnsInKernel) == 0)) + { + auto extraCacheBParams = std::make_tuple(NumColumnsInKernel, jKernelOuter2, BoundaryConditionHandling::ZeroPadding); + schedule.template Cache(B, + { topLevelK, topLevelJ }, + { innerDimensionBlock, columnBlock }, + { kCache, jCache }, + std::nullopt, // Order isn't used by BLASTCopy + extraCacheBParams); + } + auto extraZeroInputReduceOutputParams = std::make_tuple(vectorSize); + schedule.template Cache(C, + { iKernelOuter, jKernelOuter2 }, + { NumRowsInKernel, NumColumnsInKernel }, + { iKernelOuter, jKernelOuter2 }, + utilities::RowMajorMatrixOrder, + extraZeroInputReduceOutputParams); + + // Set unrolling + 
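The schedule built above follows the same loop-nest pattern as ZeroMatrix earlier in this file, just with more splits plus caching. For reference, a minimal sketch of that pattern using only the value::loopnests calls this file already uses (the block size of 4 and the fill kernel are illustrative, not part of the patch):

    // Sketch: split one index of a 2-D nest and unroll the inner block,
    // mirroring the Split/AddKernel/Unroll/Run sequence used in ZeroMatrix.
    void TiledFillSketch(value::Matrix matrix)
    {
        namespace loopnests = ell::value::loopnests;
        int M = (int)(matrix.Rows());
        int N = (int)(matrix.Columns());
        loopnests::Index m("m"), n("n");
        loopnests::LoopNest nest({ { m, { 0, M } },
                                   { n, { 0, N } } });
        auto [mOuter, mInner] = nest.Split(m, 4); // block the row loop by 4
        auto kernel = loopnests::Kernel("Fill")
                          .Inputs(matrix.GetValue())
                          .Indices(m, n)
                          .Define([](value::Matrix C, value::Scalar row, value::Scalar col) {
                              C(row, col) = 1.0f; // assumes a float matrix
                          });
        nest.AddKernel(kernel);
        nest.Unroll(mInner); // unroll the blocked inner loop, as ZeroMatrix does
        loopnests::CodeGenerator generator;
        generator.Run(nest);
    }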
schedule.Unroll(jKernelOuter); + schedule.Unroll(i); + schedule.Unroll(k); + + // Run the generator + nest.Run(); + } + + template + void MatrixMatrixMultiplyCodeNode::GemmFn(value::Matrix A, value::Matrix B, value::Matrix C, int thread_num) + { + value::DeclareFunction("InnerMatMul" + std::to_string(thread_num)) + .Parameters(A, B, C) + .Define([this](value::Matrix A, value::Matrix B, value::Matrix C) { + Gemm(A, B, C); + })(A, B, C); + } + + template + void MatrixMatrixMultiplyCodeNode::ParallelizeGemmCol(Matrix A, Matrix B, Matrix C, int numThreads) + { + const int columns = B.Columns() / numThreads; + const int col_spill = B.Columns() % numThreads; + + Parallelize( + numThreads, + std::tuple{ A, B, C }, + [=](value::Scalar id, value::Matrix A, value::Matrix B, value::Matrix C) + { + value::Scalar colStart = id * value::Scalar{columns}; + int thread_seq = 0; + + EmitterContext::IfContext IfCxt = If(id == thread_seq, + [&] { + GemmFn( + A, + B.SubMatrix(value::Scalar{0}, colStart, (int)B.Rows(), columns), + C.SubMatrix(value::Scalar{0}, colStart, (int)C.Rows(), columns), + thread_seq); + }); + + thread_seq++; + + for(int i = thread_seq; i < numThreads; i++) + { + IfCxt.ElseIf(id == i, + [&] { + int actualColumns = i==(numThreads-1) ? columns + col_spill : columns; + + GemmFn( + A, + B.SubMatrix(value::Scalar{0}, colStart, (int)B.Rows(), actualColumns), + C.SubMatrix(value::Scalar{0}, colStart, (int)C.Rows(), actualColumns), + i); + }); + } + }); + } + + template + void MatrixMatrixMultiplyCodeNode::ParallelizeGemmRow(Matrix A, Matrix B, Matrix C, int numThreads) + { + const int rows = A.Rows() / numThreads; + const int row_spill = A.Rows() % numThreads; + + Parallelize( + numThreads, + std::tuple{ A, B, C }, + [=](value::Scalar id, value::Matrix A, value::Matrix B, value::Matrix C) + { + value::Scalar rowStart = id * value::Scalar{rows}; + int thread_seq = 0; + + EmitterContext::IfContext IfCxt = If(id == thread_seq, + [&] { + GemmFn( + A.SubMatrix(rowStart, value::Scalar{0}, rows, (int)A.Columns()), + B, + C.SubMatrix(rowStart, value::Scalar{0}, rows, (int)C.Columns()), + thread_seq); + }); + + thread_seq++; + + for(int i = thread_seq; i < numThreads; i++) + { + IfCxt.ElseIf(id == i, + [&] { + int actualRows = i==(numThreads-1) ? 
rows + row_spill : rows; + GemmFn( + A.SubMatrix(rowStart, value::Scalar{0}, actualRows, (int)A.Columns()), + B, + C.SubMatrix(rowStart, value::Scalar{0}, actualRows, (int)C.Columns()), + i); + }); + } + }); + } + + template + void MatrixMatrixMultiplyCodeNode::ELLCodeGEMM(const value::Matrix matA, const value::Matrix matB, value::Matrix matC) + { + double computationSize = double(matC.Rows() * matC.Columns() * matA.Columns()); + + size_t minThreadLoad = 112 * 1024; + const size_t maxThreads = 4; + size_t numThreads = maxThreads; + + if (computationSize < double(minThreadLoad * maxThreads)) + { + numThreads = std::min(int(computationSize / double(minThreadLoad)) + 1, int(maxThreads)); + } + if (numThreads > 1) + { + if (matC.Rows() > matC.Columns()) + { + ParallelizeGemmRow(matA, matB, matC, int(numThreads)); + } + else + { + ParallelizeGemmCol(matA, matB, matC, int(numThreads)); + } + } + else + { + Gemm(matA, matB, matC); + } + } + + template + void MatrixMatrixMultiplyCodeNode::Define(value::FunctionDeclaration& fn) + { + (void)fn.Define([this](const value::Value valueA, const value::Value valueB, value::Value valueC) { + auto tempA = valueA; + tempA.SetLayout(utilities::MemoryLayout({ _m, _k })); + auto tempB = valueB; + tempB.SetLayout(utilities::MemoryLayout({ _k, _n })); + auto tempC = valueC; + if (_transposeOutput) + { + tempC.SetLayout(utilities::MemoryLayout({ _n, _m }, utilities::DimensionOrder{1, 0})); + } + else + { + tempC.SetLayout(utilities::MemoryLayout({ _m, _n })); + } + + auto matA = value::Matrix(tempA); + auto matB = value::Matrix(tempB); + auto matC = value::Matrix(tempC); + + switch (_impl) + { + case (MatrixMatrixMultiplyImplementation::SimpleForLoops): + ForLoopGEMM(matA, matB, matC); + break; + case (MatrixMatrixMultiplyImplementation::Mlas_Loopnest_Value): + ELLCodeGEMM(matA, matB, matC); + break; + case (MatrixMatrixMultiplyImplementation::LAST): + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "MatrixMatrixMultiplyImplementation::LAST is not a valid impl value"); + break; + } + }); + } + + template + void MatrixMatrixMultiplyCodeNode::Copy(model::ModelTransformer& transformer) const + { + const auto& newInput1 = transformer.GetCorrespondingInputs(_input1); + const auto& newInput2 = transformer.GetCorrespondingInputs(_input2); + auto newNode = transformer.AddNode>(newInput1, _m, _n, _k, _lda, _transpose1, newInput2, _ldb, _transpose2, _ldc, _transposeOutput, _panelM, _panelN, _panelK, _kernelM, _kernelN, _kernelK, _impl); + transformer.MapNodeOutput(output, newNode->output); + } + + template + utilities::ArchiveVersion MatrixMatrixMultiplyCodeNode::GetArchiveVersion() const + { + constexpr utilities::ArchiveVersion currentArchiveVersion = { utilities::ArchiveVersionNumbers::v2 }; + return std::max(currentArchiveVersion, CompilableCodeNode::GetArchiveVersion()); + } + + template + bool MatrixMatrixMultiplyCodeNode::CanReadArchiveVersion(const utilities::ArchiveVersion& version) const + { + return CompilableCodeNode::CanReadArchiveVersion(version); + } + + template + void MatrixMatrixMultiplyCodeNode::WriteToArchive(utilities::Archiver& archiver) const + { + Node::WriteToArchive(archiver); + archiver[defaultInput1PortName] << _input1; + archiver[defaultInput2PortName] << _input2; + archiver[defaultOutputPortName] << _output; + archiver["m"] << _m; + archiver["n"] << _n; + archiver["k"] << _k; + archiver["lda"] << _lda; + archiver["ldb"] << _ldb; + archiver["ldc"] << _ldc; + archiver["transpose1"] << _transpose1; + 
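The thread-count heuristic in ELLCodeGEMM above reads as a small pure function. A sketch under the same constants (roughly 112K multiply-accumulates of work per extra thread, capped at four threads; the function name is illustrative):

    #include <algorithm>
    #include <cstddef>

    // Sketch of ELLCodeGEMM's partitioning decision: one thread per
    // ~112K multiply-accumulate operations, capped at maxThreads.
    size_t ChooseGemmThreads(size_t M, size_t N, size_t K)
    {
        const size_t minThreadLoad = 112 * 1024;
        const size_t maxThreads = 4;
        double computationSize = double(M * N * K);
        if (computationSize < double(minThreadLoad * maxThreads))
        {
            return std::min(size_t(computationSize / double(minThreadLoad)) + 1, maxThreads);
        }
        return maxThreads;
    }

The resulting count is then applied along the longer output dimension: ParallelizeGemmRow when C has more rows than columns, ParallelizeGemmCol otherwise, with the remainder rows or columns folded into the last thread's slice.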
archiver["transpose2"] << _transpose2; + archiver["transposeOutput"] << _transposeOutput; + archiver["panelM"] << _panelM; + archiver["panelN"] << _panelN; + archiver["panelK"] << _panelK; + archiver["kernelM"] << _kernelM; + archiver["kernelN"] << _kernelN; + archiver["kernelK"] << _kernelK; + archiver["gemmImpl"] << static_cast(_impl); + } + + template + void MatrixMatrixMultiplyCodeNode::ReadFromArchive(utilities::Unarchiver& archiver) + { + Node::ReadFromArchive(archiver); + archiver[defaultInput1PortName] >> _input1; + archiver[defaultInput2PortName] >> _input2; + archiver[defaultOutputPortName] >> _output; + archiver["m"] >> _m; + archiver["n"] >> _n; + archiver["k"] >> _k; + archiver["lda"] >> _lda; + archiver["ldb"] >> _ldb; + archiver["ldc"] >> _ldc; + archiver["transpose1"] >> _transpose1; + archiver["transpose2"] >> _transpose2; + archiver.OptionalProperty("transposeOutput", false) >> _transposeOutput; + archiver["panelM"] >> _panelM; + archiver["panelN"] >> _panelN; + archiver["panelK"] >> _panelK; + archiver["kernelM"] >> _kernelM; + archiver["kernelN"] >> _kernelN; + archiver["kernelK"] >> _kernelK; + int gemmImpl = 0; + archiver["gemmImpl"] >> gemmImpl; + _impl = static_cast(gemmImpl); + } + + // + // Explicit instantiation definitions + // + template class MatrixMatrixMultiplyCodeNode; + template class MatrixMatrixMultiplyCodeNode; +} // namespace nodes +} // namespace ell diff --git a/libraries/nodes/src/NeuralNetworkPredictorNode.cpp b/libraries/nodes/src/NeuralNetworkPredictorNode.cpp index 23f1d7c7b..db2b090b0 100644 --- a/libraries/nodes/src/NeuralNetworkPredictorNode.cpp +++ b/libraries/nodes/src/NeuralNetworkPredictorNode.cpp @@ -8,7 +8,7 @@ #include "NeuralNetworkPredictorNode.h" #include "ConstantNode.h" -#include "ReorderDataNode.h" +#include "ReorderDataCodeNode.h" #include @@ -122,7 +122,7 @@ namespace nodes // If the input layer wants padding on its output, add a ReorderDataNode to add padding model::PortMemoryLayout inputNodeLayout(model::MemoryShape{ (int)inputShape.NumRows(), (int)inputShape.NumColumns(), (int)inputShape.NumChannels() }); model::PortMemoryLayout paddedInputNodeLayout(model::MemoryShape{ (int)inputShape.NumRows(), (int)inputShape.NumColumns(), (int)inputShape.NumChannels() }, model::MemoryShape{ (int)padding, (int)padding, 0 }); - const auto& paddedInput = ReorderData(*newInputElements, inputNodeLayout, paddedInputNodeLayout, predictors::neural::GetPaddingValue(outputPadding.paddingScheme)); + const auto& paddedInput = ReorderDataWithCodeNode(*newInputElements, inputNodeLayout, paddedInputNodeLayout, predictors::neural::GetPaddingValue(outputPadding.paddingScheme)); newInputElements = &paddedInput; } diff --git a/libraries/nodes/src/UnrolledConvolutionNode.cpp b/libraries/nodes/src/UnrolledConvolutionNode.cpp index 8524f7f85..d02495f86 100644 --- a/libraries/nodes/src/UnrolledConvolutionNode.cpp +++ b/libraries/nodes/src/UnrolledConvolutionNode.cpp @@ -9,8 +9,10 @@ #include "UnrolledConvolutionNode.h" #include "ConstantNode.h" #include "MatrixMatrixMultiplyNode.h" +#include "MatrixMatrixMultiplyCodeNode.h" #include "ReceptiveFieldMatrixNode.h" -#include "ReorderDataNode.h" +#include "ReorderDataCodeNode.h" +#include #include @@ -90,6 +92,23 @@ namespace nodes return weightsMatrix; } + template + bool UnrolledConvolutionNode::IsELLCodeTarget(model::ModelTransformer& transformer) const + { + auto compiler = dynamic_cast(transformer.GetContext().GetCompiler()); + if(compiler != nullptr) + { + auto device_name = 
compiler->GetCompilerOptions().targetDevice.deviceName; + bool skip_ELLCode = compiler->GetCompilerOptions().skip_ellcode; + if (device_name.compare("pi3") == 0 && !skip_ELLCode) + { + return true; + } + } + + return false; + } + template void UnrolledConvolutionNode::Copy(model::ModelTransformer& transformer) const { @@ -148,6 +167,7 @@ namespace nodes std::array dataOrder = useNewMethod ? drcOrder : rcdOrder; assert(outputPadding == 0 && "Unrolled convolution node output padding not supported yet"); + bool isELLCodeTarget = IsELLCodeTarget(transformer); // weights: numFilters x fieldVolumeSize == m x k // ShapedInput: fieldVolumeSize x outputRows == k x n // Matrix multiply output: numFilters x outputRows = m x n @@ -155,20 +175,39 @@ namespace nodes if (dataOrder == rcdOrder) // don't reorder input -- use old method { auto receptiveFieldMatrixNode = transformer.AddNode>(newInput, inputLayout, filterSize, _stride, inputPadding, dataOrder, outputImageWidth, outputImageHeight); - auto matrixMultNode = transformer.AddNode>(weights, m, n, k, lda, false, receptiveFieldMatrixNode->output, ldb, false, ldc, true); - - if (outputPadding != 0) + if(isELLCodeTarget) { - // Add padding - model::PortMemoryLayout outputLayout(model::MemoryShape{ outputImageHeight, outputImageWidth, numFilters }); - model::PortMemoryLayout paddedOutputLayout(model::MemoryShape{ outputImageHeight, outputImageWidth, numFilters }, model::MemoryShape{ outputPadding, outputPadding, 0 }); - const auto& reorderedOutput = ReorderData(matrixMultNode->output, outputLayout, paddedOutputLayout); - transformer.MapNodeOutput(this->output, reorderedOutput); + auto matrixMultNode = transformer.AddNode>(weights, m, n, k, lda, false, receptiveFieldMatrixNode->output, ldb, false, ldc, true); + if (outputPadding != 0) + { + // Add padding + model::PortMemoryLayout outputLayout(model::MemoryShape{ outputImageHeight, outputImageWidth, numFilters }); + model::PortMemoryLayout paddedOutputLayout(model::MemoryShape{ outputImageHeight, outputImageWidth, numFilters }, model::MemoryShape{ outputPadding, outputPadding, 0 }); + const auto& reorderedOutput = ReorderDataWithCodeNode(matrixMultNode->output, outputLayout, paddedOutputLayout); + transformer.MapNodeOutput(this->output, reorderedOutput); + } + else + { + transformer.MapNodeOutput(this->output, matrixMultNode->output); + } } else { - transformer.MapNodeOutput(this->output, matrixMultNode->output); + auto matrixMultNode = transformer.AddNode>(weights, m, n, k, lda, false, receptiveFieldMatrixNode->output, ldb, false, ldc, true); + if (outputPadding != 0) + { + // Add padding + model::PortMemoryLayout outputLayout(model::MemoryShape{ outputImageHeight, outputImageWidth, numFilters }); + model::PortMemoryLayout paddedOutputLayout(model::MemoryShape{ outputImageHeight, outputImageWidth, numFilters }, model::MemoryShape{ outputPadding, outputPadding, 0 }); + const auto& reorderedOutput = ReorderDataWithCodeNode(matrixMultNode->output, outputLayout, paddedOutputLayout); + transformer.MapNodeOutput(this->output, reorderedOutput); + } + else + { + transformer.MapNodeOutput(this->output, matrixMultNode->output); + } } + } else // reorder input to be channels x rows x columns (drc) (then we can use the 'new' receptive field matrix generation) { @@ -177,23 +216,41 @@ namespace nodes // Remove padding and transpose to channel-major order model::PortMemoryLayout inputLayout(model::MemoryShape{ inputHeight, inputWidth, inputDepth }, model::MemoryShape{ inputPadding, inputPadding, 0 }); 
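isELLCodeTarget gates the two paths below, matching the release note that the new MatrixMatrixMultiplyCodeNode path is only enabled for select ARM targets. Condensed as a sketch (the free-function form and the MapCompiler parameter type are assumptions; the option names are the ones used above):

    // Sketch: take the ELLCode GEMM path only when compiling for "pi3"
    // and --skip_ellcode was not passed (which keeps the OpenBLAS path).
    bool UseEllCodeGemmSketch(const model::MapCompiler* compiler)
    {
        if (compiler == nullptr)
        {
            return false; // not compiling, e.g. Compute() without a compiler
        }
        const auto& options = compiler->GetCompilerOptions();
        return options.targetDevice.deviceName == "pi3" && !options.skip_ellcode;
    }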
model::PortMemoryLayout transposedInputLayout(model::MemoryShape{ inputDepth, inputHeight, inputWidth }, model::DimensionOrder{ 2, 0, 1 }); // Note: memory layout constructor takes the sizes in physical dimension order - const auto& reorderedInput = ReorderData(newInput, inputLayout, transposedInputLayout); + const auto& reorderedInput = ReorderDataWithCodeNode(newInput, inputLayout, transposedInputLayout); auto receptiveFieldMatrixNode = transformer.AddNode>(reorderedInput, reorderedInput.GetMemoryLayout(), _filterSize, _stride, inputPadding, dataOrder, outputImageWidth, outputImageHeight); - auto matrixMultNode = transformer.AddNode>(weights, m, n, k, lda, false, receptiveFieldMatrixNode->output, ldb, false, ldc, true); - - if (outputPadding != 0) + if(isELLCodeTarget) { - // Add padding - model::PortMemoryLayout outputLayout(model::MemoryShape{ outputImageHeight, outputImageWidth, numFilters }); - model::PortMemoryLayout paddedOutputLayout(model::MemoryShape{ outputImageHeight, outputImageWidth, numFilters }, model::MemoryShape{ outputPadding, outputPadding, 0 }); - const auto& reorderedOutput = ReorderData(matrixMultNode->output, outputLayout, paddedOutputLayout); - transformer.MapNodeOutput(this->output, reorderedOutput); + auto matrixMultNode = transformer.AddNode>(weights, m, n, k, lda, false, receptiveFieldMatrixNode->output, ldb, false, ldc, true); + if (outputPadding != 0) + { + // Add padding + model::PortMemoryLayout outputLayout(model::MemoryShape{ outputImageHeight, outputImageWidth, numFilters }); + model::PortMemoryLayout paddedOutputLayout(model::MemoryShape{ outputImageHeight, outputImageWidth, numFilters }, model::MemoryShape{ outputPadding, outputPadding, 0 }); + const auto& reorderedOutput = ReorderDataWithCodeNode(matrixMultNode->output, outputLayout, paddedOutputLayout); + transformer.MapNodeOutput(this->output, reorderedOutput); + } + else + { + transformer.MapNodeOutput(this->output, matrixMultNode->output); + } } else { - transformer.MapNodeOutput(this->output, matrixMultNode->output); - } + auto matrixMultNode = transformer.AddNode>(weights, m, n, k, lda, false, receptiveFieldMatrixNode->output, ldb, false, ldc, true); + if (outputPadding != 0) + { + // Add padding + model::PortMemoryLayout outputLayout(model::MemoryShape{ outputImageHeight, outputImageWidth, numFilters }); + model::PortMemoryLayout paddedOutputLayout(model::MemoryShape{ outputImageHeight, outputImageWidth, numFilters }, model::MemoryShape{ outputPadding, outputPadding, 0 }); + const auto& reorderedOutput = ReorderDataWithCodeNode(matrixMultNode->output, outputLayout, paddedOutputLayout); + transformer.MapNodeOutput(this->output, reorderedOutput); + } + else + { + transformer.MapNodeOutput(this->output, matrixMultNode->output); + } + } } return true; } diff --git a/libraries/nodes/src/WinogradConvolutionNode.cpp b/libraries/nodes/src/WinogradConvolutionNode.cpp index 48e25e9ed..189ff6147 100644 --- a/libraries/nodes/src/WinogradConvolutionNode.cpp +++ b/libraries/nodes/src/WinogradConvolutionNode.cpp @@ -8,7 +8,7 @@ #include "WinogradConvolutionNode.h" #include "ConstantNode.h" -#include "ReorderDataNode.h" +#include "ReorderDataCodeNode.h" #include @@ -1187,7 +1187,7 @@ namespace nodes { // add a ReorderDataNode to convert to channel-major, which is more efficient in this case auto orderArr = utilities::ChannelMajorTensorOrder; - const auto& reorderedData = ReorderData(*newInput, convInputLayout, convInputLayout, utilities::DimensionOrder{ orderArr }); + const auto& reorderedData = 
ReorderDataWithCodeNode(*newInput, convInputLayout, convInputLayout, utilities::DimensionOrder{ orderArr }); newInput = &reorderedData; convInputLayout = reorderedData.GetMemoryLayout(); } diff --git a/libraries/nodes/test/src/BasicMathNodesTests.cpp b/libraries/nodes/test/src/BasicMathNodesTests.cpp index ae5fe0e4e..6ad6bd9ac 100644 --- a/libraries/nodes/test/src/BasicMathNodesTests.cpp +++ b/libraries/nodes/test/src/BasicMathNodesTests.cpp @@ -70,7 +70,7 @@ void TestBasicMathNodes() TestUnaryOperationNodeCompute(); TestBroadcastUnaryOperationNodeCompute(); - TestBroadcastLinearFunctionNodeCompute(); + TestBroadcastBinaryOperationNodeComputeFull(); TestBroadcastBinaryOperationNodeComputeAdd(); TestBroadcastBinaryOperationNodeComputeSubtract(); @@ -79,6 +79,8 @@ void TestBasicMathNodes() TestBroadcastBinaryOperationNodeComputeWithBadLayout(); TestBroadcastBinaryOperationNodeComputeDifferentBroadcastDimensions(); TestBroadcastTernaryOperationNodeComputeFMA(); + + TestBroadcastLinearFunctionNodeCompute(); } void TestUnaryOperationNodeCompute(UnaryOperationType op, double (*expectedTransform)(double)) @@ -296,7 +298,6 @@ void TestBroadcastBinaryOperationNodeComputeAdd() model::PortMemoryLayout input1Layout({ numRows, numColumns, numChannels }); model::PortMemoryLayout input2Layout({ 1, numColumns, 1 }); - model::PortMemoryLayout input3Layout({ 1, numColumns, 1 }); // clang-format off std::vector input1Vals{ 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4 }; @@ -327,7 +328,6 @@ void TestBroadcastBinaryOperationNodeComputeSubtract() model::PortMemoryLayout input1Layout({ numRows, numColumns, numChannels }); model::PortMemoryLayout input2Layout({ 1, numColumns, 1 }); - model::PortMemoryLayout input3Layout({ 1, numColumns, 1 }); // clang-format off std::vector input1Vals{ 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4 }; @@ -473,7 +473,7 @@ void TestBroadcastBinaryOperationNodeComputeDifferentBroadcastDimensions() // clang-format off std::vector input1Vals{ 1, - 2}; + 2 }; // broadcasts to: { 1, 1, 1, // 2, 2, 2 } std::vector input2Vals{ 2, 4, 6 }; diff --git a/libraries/nodes/test/src/DSPNodesTests.cpp b/libraries/nodes/test/src/DSPNodesTests.cpp index 5282128e3..4458aa989 100644 --- a/libraries/nodes/test/src/DSPNodesTests.cpp +++ b/libraries/nodes/test/src/DSPNodesTests.cpp @@ -37,7 +37,7 @@ #include #include #include -#include +#include #include #include #include @@ -370,7 +370,7 @@ static void TestIIRFilterNode4() template static void TestMelFilterBankNode() { - const ValueType epsilon = static_cast(1e-6); + const ValueType epsilon = static_cast(1e-5); const size_t numFilters = 13; const size_t windowSize = 512; const size_t fftSize = 512; @@ -593,7 +593,7 @@ static void TestConvolutionNodeCompileVsReference(ImageShape inputShape, Filters auto convInputLayout = inputMemoryLayout.ReorderedCopy({ shouldReorderToChannelMajor ? utilities::ChannelMajorTensorOrder : utilities::RowMajorTensorOrder }); auto convOutputLayout = outputMemoryLayout.ReorderedCopy({ shouldReorderToChannelMajor ? 
utilities::ChannelMajorTensorOrder : utilities::RowMajorTensorOrder }); - auto preConvReorderNode = model.AddNode>(inputNode->output, inputMemoryLayout, convInputLayout); + auto preConvReorderNode = model.AddNode>(inputNode->output, inputMemoryLayout, convInputLayout); const auto* newInput = &preConvReorderNode->output; model::PortElements convOutput; @@ -629,7 +629,7 @@ static void TestConvolutionNodeCompileVsReference(ImageShape inputShape, Filters } } - auto postConvReorderNode = model.AddNode>(convOutput, convOutputLayout, outputMemoryLayout); + auto postConvReorderNode = model.AddNode>(convOutput, convOutputLayout, outputMemoryLayout); auto map = model::Map(model, { { "input", inputNode } }, { { "output", postConvReorderNode->output } }); diff --git a/libraries/optimization/CMakeLists.txt b/libraries/optimization/CMakeLists.txt index 54ae1b097..49641749b 100644 --- a/libraries/optimization/CMakeLists.txt +++ b/libraries/optimization/CMakeLists.txt @@ -42,7 +42,7 @@ set(include source_group("src" FILES ${src}) source_group("include" FILES ${include}) -add_library(${library_name} ${src} ${include} ${tcc} ${doc}) +add_library(${library_name} ${src} ${include}) target_include_directories(${library_name} PRIVATE include ${ELL_LIBRARIES_DIR}) target_link_libraries(${library_name} math) @@ -81,7 +81,7 @@ set(test_include source_group("src" FILES ${test_src}) source_group("include" FILES ${test_include}) -add_executable(${test_name} ${test_src} ${test_include} ${test_tcc}) +add_executable(${test_name} ${test_src} ${test_include}) target_include_directories(${test_name} PRIVATE test/include ${ELL_LIBRARIES_DIR}) target_link_libraries(${test_name} optimization testing) copy_shared_libraries(${test_name}) diff --git a/libraries/optimization/include/VectorSolution.h b/libraries/optimization/include/VectorSolution.h index 5d93f463b..04e833b12 100644 --- a/libraries/optimization/include/VectorSolution.h +++ b/libraries/optimization/include/VectorSolution.h @@ -33,7 +33,7 @@ namespace optimization /// Solutions are expected to have a ParameterType. Empty here because this solution type doesn't need any parameters. struct ParametersType {}; - + /// Default constructor. 
    VectorSolution() = default;
diff --git a/libraries/optimization/src/Interval.cpp b/libraries/optimization/src/Interval.cpp
index 845e5bbdf..5bc3aca35 100644
--- a/libraries/optimization/src/Interval.cpp
+++ b/libraries/optimization/src/Interval.cpp
@@ -1,7 +1,7 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 //
 //  Project:  Embedded Learning Library (ELL)
-//  File:     Interval.tcc (optimization)
+//  File:     Interval.cpp (optimization)
 //  Authors:  Ofer Dekel
 //
 ////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/libraries/passes/src/OptimizeReorderDataNodesTransformation.cpp b/libraries/passes/src/OptimizeReorderDataNodesTransformation.cpp
index f46b94647..6dd58e5ee 100644
--- a/libraries/passes/src/OptimizeReorderDataNodesTransformation.cpp
+++ b/libraries/passes/src/OptimizeReorderDataNodesTransformation.cpp
@@ -10,7 +10,7 @@
 #include
 
-#include
+#include
 #include
 #include
@@ -50,7 +50,7 @@ namespace passes
             return true;
         }
 
-        if (auto reorderNode = dynamic_cast<const ReorderDataNode<ValueType>*>(&nodeToOptimize))
+        if (auto reorderNode = dynamic_cast<const ReorderDataCodeNode<ValueType>*>(&nodeToOptimize))
         {
             const auto& node = *reorderNode;
@@ -71,9 +71,9 @@ namespace passes
             while (currentNode != nullptr)
             {
                 // iff we have one dependent node and it's a reorder node
-                const ReorderDataNode<ValueType>* nextNode = nullptr;
+                const ReorderDataCodeNode<ValueType>* nextNode = nullptr;
                 if (currentNode->GetDependentNodes().size() == 1 &&
-                    (nextNode = dynamic_cast<const ReorderDataNode<ValueType>*>(currentNode->GetDependentNodes()[0])))
+                    (nextNode = dynamic_cast<const ReorderDataCodeNode<ValueType>*>(currentNode->GetDependentNodes()[0])))
                 {
                     Log() << "Removing node ReorderDataNode [id = " << currentNode->GetId().ToString() << "] since it is followed by another ReorderDataNode" << EOL;
@@ -114,7 +114,7 @@ namespace passes
             // otherwise, create a new reorder node and use the input to the chain and map its output to the
             // final output of the chain
             const auto& newInput = transformer.GetCorrespondingInputs(node.input);
-            const auto& reorderedInput = nodes::ReorderData(newInput, inputLayout, outputLayout, node.GetPaddingValue());
+            const auto& reorderedInput = nodes::ReorderDataWithCodeNode(newInput, inputLayout, outputLayout, node.GetPaddingValue());
             transformer.MapNodeOutput(*finalOutputPort, reorderedInput);
 
             Log() << "ReorderDataNode chain's input and output memory layout are different.
Entire chain is being " diff --git a/libraries/passes/test/src/ModelOptimizerTest.cpp b/libraries/passes/test/src/ModelOptimizerTest.cpp index 02b045b7b..2eb097af4 100644 --- a/libraries/passes/test/src/ModelOptimizerTest.cpp +++ b/libraries/passes/test/src/ModelOptimizerTest.cpp @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include @@ -230,11 +230,11 @@ void TestOptimizeReorderDataNodes1() model::Model model; auto inputMatrixNode = model.AddNode>(model::MemoryShape{ m, k }); - auto reorderedInputMatrixNode = model.AddNode>(inputMatrixNode->output, orderA); + auto reorderedInputMatrixNode = model.AddNode>(inputMatrixNode->output, orderA); std::vector matrixBVals(k * n); auto matrixBNode = model.AddNode>(matrixBVals, model::MemoryShape{ k, n }); - auto reorderedMatrixBNode = model.AddNode>(matrixBNode->output, orderB); + auto reorderedMatrixBNode = model.AddNode>(matrixBNode->output, orderB); auto matMatMultNode = model.AddNode>(reorderedInputMatrixNode->output, reorderedMatrixBNode->output, outputLayout); @@ -280,11 +280,11 @@ void TestOptimizeReorderDataNodes2() model::Model model; auto inputMatrixNode = model.AddNode>(model::MemoryShape{ m, k }); - auto reorderedInputMatrixNode = model.AddNode>(inputMatrixNode->output, orderA); + auto reorderedInputMatrixNode = model.AddNode>(inputMatrixNode->output, orderA); std::vector matrixBVals(k * n); auto matrixBNode = model.AddNode>(matrixBVals, model::MemoryShape{ k, n }); - auto reorderedMatrixBNode = model.AddNode>(matrixBNode->output, orderB); + auto reorderedMatrixBNode = model.AddNode>(matrixBNode->output, orderB); auto matMatMultNode = model.AddNode>(reorderedInputMatrixNode->output, reorderedMatrixBNode->output, outputLayout); @@ -330,11 +330,11 @@ void TestOptimizeReorderDataNodes3() model::Model model; auto inputMatrixNode = model.AddNode>(model::MemoryShape{ m, k }); - auto reorderedInputMatrixNode = model.AddNode>(inputMatrixNode->output, orderA); + auto reorderedInputMatrixNode = model.AddNode>(inputMatrixNode->output, orderA); std::vector matrixBVals(k * n); auto matrixBNode = model.AddNode>(matrixBVals, model::MemoryShape{ k, n }); - auto reorderedMatrixBNode = model.AddNode>(matrixBNode->output, orderB); + auto reorderedMatrixBNode = model.AddNode>(matrixBNode->output, orderB); auto matMatMultNode = model.AddNode>(reorderedInputMatrixNode->output, reorderedMatrixBNode->output, outputLayout); @@ -380,17 +380,17 @@ void TestOptimizeReorderDataNodes4() auto rowMajorLayout = model::PortMemoryLayout(model::MemoryShape{ m, k }).ReorderedCopy(rowMajor); auto colMajorLayout = model::PortMemoryLayout(model::MemoryShape{ m, k }).ReorderedCopy(colMajor); auto inputMatrixNode = model.AddNode>(rowMajorLayout.GetActiveSize()); - auto reorderedInputMatrixNode1 = model.AddNode>(inputMatrixNode->output, rowMajorLayout, colMajorLayout); - auto reorderedInputMatrixNode2 = model.AddNode>(reorderedInputMatrixNode1->output, colMajorLayout, rowMajorLayout); - auto reorderedInputMatrixNode3 = model.AddNode>(reorderedInputMatrixNode2->output, rowMajorLayout, rowMajorLayout); + auto reorderedInputMatrixNode1 = model.AddNode>(inputMatrixNode->output, rowMajorLayout, colMajorLayout); + auto reorderedInputMatrixNode2 = model.AddNode>(reorderedInputMatrixNode1->output, colMajorLayout, rowMajorLayout); + auto reorderedInputMatrixNode3 = model.AddNode>(reorderedInputMatrixNode2->output, rowMajorLayout, rowMajorLayout); std::vector matrixBVals(k * n); rowMajorLayout = model::PortMemoryLayout(model::MemoryShape{ k, n }).ReorderedCopy(rowMajor); 
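These tests build exactly the chains that OptimizeReorderDataNodesTransformation collapses. A minimal sketch of the degenerate round trip (template arguments restored by hand; the value type and namespace qualifiers are illustrative):

    // Sketch: two back-to-back reorders whose net effect is the identity.
    // After the pass runs, both ReorderDataCodeNodes are removed and the
    // downstream node consumes the input node's output directly.
    auto inputNode = model.AddNode<model::InputNode<double>>(model::MemoryShape{ m, k });
    auto toColMajor = model.AddNode<nodes::ReorderDataCodeNode<double>>(inputNode->output, rowMajorLayout, colMajorLayout);
    auto backToRowMajor = model.AddNode<nodes::ReorderDataCodeNode<double>>(toColMajor->output, colMajorLayout, rowMajorLayout);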
colMajorLayout = model::PortMemoryLayout(model::MemoryShape{ k, n }).ReorderedCopy(colMajor); auto matrixBNode = model.AddNode>(matrixBVals, rowMajorLayout); - auto reorderedMatrixBNode1 = model.AddNode>(matrixBNode->output, rowMajorLayout, rowMajorLayout); - auto reorderedMatrixBNode2 = model.AddNode>(reorderedMatrixBNode1->output, rowMajorLayout, colMajorLayout); - auto reorderedMatrixBNode3 = model.AddNode>(reorderedMatrixBNode2->output, colMajorLayout, colMajorLayout); + auto reorderedMatrixBNode1 = model.AddNode>(matrixBNode->output, rowMajorLayout, rowMajorLayout); + auto reorderedMatrixBNode2 = model.AddNode>(reorderedMatrixBNode1->output, rowMajorLayout, colMajorLayout); + auto reorderedMatrixBNode3 = model.AddNode>(reorderedMatrixBNode2->output, colMajorLayout, colMajorLayout); auto matMatMultNode = model.AddNode>(reorderedInputMatrixNode3->output, reorderedMatrixBNode3->output, outputLayout); diff --git a/libraries/passes/test/src/TransformationTest.cpp b/libraries/passes/test/src/TransformationTest.cpp index 5b0e585f6..3e3f5f24b 100644 --- a/libraries/passes/test/src/TransformationTest.cpp +++ b/libraries/passes/test/src/TransformationTest.cpp @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include @@ -304,11 +304,11 @@ void TestOptimizeReorderDataNodesTransformation1() model::Model model; auto inputMatrixNode = model.AddNode>(model::MemoryShape{ m, k }); - auto reorderedInputMatrixNode = model.AddNode>(inputMatrixNode->output, orderA); + auto reorderedInputMatrixNode = model.AddNode>(inputMatrixNode->output, orderA); std::vector matrixBVals(k * n); auto matrixBNode = model.AddNode>(matrixBVals, model::MemoryShape{ k, n }); - auto reorderedMatrixBNode = model.AddNode>(matrixBNode->output, orderB); + auto reorderedMatrixBNode = model.AddNode>(matrixBNode->output, orderB); auto matMatMultNode = model.AddNode>(reorderedInputMatrixNode->output, reorderedMatrixBNode->output, outputLayout); @@ -345,11 +345,11 @@ void TestOptimizeReorderDataNodesTransformation2() model::Model model; auto inputMatrixNode = model.AddNode>(model::MemoryShape{ m, k }); - auto reorderedInputMatrixNode = model.AddNode>(inputMatrixNode->output, orderA); + auto reorderedInputMatrixNode = model.AddNode>(inputMatrixNode->output, orderA); std::vector matrixBVals(k * n); auto matrixBNode = model.AddNode>(matrixBVals, model::MemoryShape{ k, n }); - auto reorderedMatrixBNode = model.AddNode>(matrixBNode->output, orderB); + auto reorderedMatrixBNode = model.AddNode>(matrixBNode->output, orderB); auto matMatMultNode = model.AddNode>(reorderedInputMatrixNode->output, reorderedMatrixBNode->output, outputLayout); @@ -385,11 +385,11 @@ void TestOptimizeReorderDataNodesTransformation3() model::Model model; auto inputMatrixNode = model.AddNode>(model::MemoryShape{ m, k }); - auto reorderedInputMatrixNode = model.AddNode>(inputMatrixNode->output, orderA); + auto reorderedInputMatrixNode = model.AddNode>(inputMatrixNode->output, orderA); std::vector matrixBVals(k * n); auto matrixBNode = model.AddNode>(matrixBVals, model::MemoryShape{ k, n }); - auto reorderedMatrixBNode = model.AddNode>(matrixBNode->output, orderB); + auto reorderedMatrixBNode = model.AddNode>(matrixBNode->output, orderB); auto matMatMultNode = model.AddNode>(reorderedInputMatrixNode->output, reorderedMatrixBNode->output, outputLayout); @@ -430,17 +430,17 @@ void TestOptimizeReorderDataNodesTransformation4() auto rowMajorLayout = model::PortMemoryLayout(model::MemoryShape{ m, k }).ReorderedCopy(rowMajor); auto colMajorLayout = 
model::PortMemoryLayout(model::MemoryShape{ m, k }).ReorderedCopy(colMajor); auto inputMatrixNode = model.AddNode>(rowMajorLayout.GetActiveSize()); - auto reorderedInputMatrixNode1 = model.AddNode>(inputMatrixNode->output, rowMajorLayout, colMajorLayout); - auto reorderedInputMatrixNode2 = model.AddNode>(reorderedInputMatrixNode1->output, colMajorLayout, rowMajorLayout); - auto reorderedInputMatrixNode3 = model.AddNode>(reorderedInputMatrixNode2->output, rowMajorLayout, rowMajorLayout); + auto reorderedInputMatrixNode1 = model.AddNode>(inputMatrixNode->output, rowMajorLayout, colMajorLayout); + auto reorderedInputMatrixNode2 = model.AddNode>(reorderedInputMatrixNode1->output, colMajorLayout, rowMajorLayout); + auto reorderedInputMatrixNode3 = model.AddNode>(reorderedInputMatrixNode2->output, rowMajorLayout, rowMajorLayout); std::vector matrixBVals(k * n); rowMajorLayout = model::PortMemoryLayout(model::MemoryShape{ k, n }).ReorderedCopy(rowMajor); colMajorLayout = model::PortMemoryLayout(model::MemoryShape{ k, n }).ReorderedCopy(colMajor); auto matrixBNode = model.AddNode>(matrixBVals, rowMajorLayout); - auto reorderedMatrixBNode1 = model.AddNode>(matrixBNode->output, rowMajorLayout, rowMajorLayout); - auto reorderedMatrixBNode2 = model.AddNode>(reorderedMatrixBNode1->output, rowMajorLayout, colMajorLayout); - auto reorderedMatrixBNode3 = model.AddNode>(reorderedMatrixBNode2->output, colMajorLayout, colMajorLayout); + auto reorderedMatrixBNode1 = model.AddNode>(matrixBNode->output, rowMajorLayout, rowMajorLayout); + auto reorderedMatrixBNode2 = model.AddNode>(reorderedMatrixBNode1->output, rowMajorLayout, colMajorLayout); + auto reorderedMatrixBNode3 = model.AddNode>(reorderedMatrixBNode2->output, colMajorLayout, colMajorLayout); auto matMatMultNode = model.AddNode>(reorderedInputMatrixNode3->output, reorderedMatrixBNode3->output, outputLayout); diff --git a/libraries/utilities/CMakeLists.txt b/libraries/utilities/CMakeLists.txt index 16e82e6bf..5b3115eff 100644 --- a/libraries/utilities/CMakeLists.txt +++ b/libraries/utilities/CMakeLists.txt @@ -49,6 +49,7 @@ set(include include/CStringParser.h include/Debug.h include/Graph.h + include/EnumFlagHelpers.h include/Exception.h include/Files.h include/Format.h @@ -79,6 +80,7 @@ set(include include/StringUtil.h include/Tokenizer.h include/TransformIterator.h + include/TunableParameters.h include/TupleUtils.h include/TypeAliases.h include/TypeFactory.h @@ -121,6 +123,7 @@ set(test_src test/src/ObjectArchive_test.cpp test/src/PropertyBag_test.cpp test/src/RingBuffer_test.cpp + test/src/TunableParameters_test.cpp test/src/TypeFactory_test.cpp test/src/TypeName_test.cpp test/src/Variant_test.cpp @@ -137,6 +140,7 @@ set(test_include test/include/ObjectArchive_test.h test/include/PropertyBag_test.h test/include/RingBuffer_test.h + test/include/TunableParameters_test.h test/include/TypeFactory_test.h test/include/TypeName_test.h test/include/Variant_test.h diff --git a/libraries/utilities/include/EnumFlagHelpers.h b/libraries/utilities/include/EnumFlagHelpers.h new file mode 100644 index 000000000..b1bcdd6c6 --- /dev/null +++ b/libraries/utilities/include/EnumFlagHelpers.h @@ -0,0 +1,39 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: EnumFlagHelpers.h (utilities) +// Authors: Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#define 
ELL_DEFINE_ENUM_FLAG_OPERATORS(ENUMTYPE) \ + inline ENUMTYPE operator|(ENUMTYPE a, ENUMTYPE b) \ + { \ + return ENUMTYPE(((std::underlying_type_t)a) | ((std::underlying_type_t)b)); \ + } \ + inline ENUMTYPE& operator|=(ENUMTYPE& a, ENUMTYPE b) \ + { \ + return (ENUMTYPE&)(((std::underlying_type_t&)a) |= ((std::underlying_type_t)b)); \ + } \ + inline ENUMTYPE operator&(ENUMTYPE a, ENUMTYPE b) \ + { \ + return ENUMTYPE(((std::underlying_type_t)a) & ((std::underlying_type_t)b)); \ + } \ + inline ENUMTYPE& operator&=(ENUMTYPE& a, ENUMTYPE b) \ + { \ + return (ENUMTYPE&)(((std::underlying_type_t&)a) &= ((std::underlying_type_t)b)); \ + } \ + inline ENUMTYPE operator~(ENUMTYPE a) \ + { \ + return ENUMTYPE(~((std::underlying_type_t)a)); \ + } \ + inline ENUMTYPE operator^(ENUMTYPE a, ENUMTYPE b) \ + { \ + return ENUMTYPE(((std::underlying_type_t)a) ^ ((std::underlying_type_t)b)); \ + } \ + inline ENUMTYPE& operator^=(ENUMTYPE& a, ENUMTYPE b) \ + { \ + return (ENUMTYPE&)(((std::underlying_type_t&)a) ^= ((std::underlying_type_t)b)); \ + } diff --git a/libraries/utilities/include/FunctionUtils.h b/libraries/utilities/include/FunctionUtils.h index d6e0ecadf..c51f6caea 100644 --- a/libraries/utilities/include/FunctionUtils.h +++ b/libraries/utilities/include/FunctionUtils.h @@ -68,6 +68,24 @@ namespace utilities template void ApplyToEach(FunctionType&& function, Arg&& arg, Args&&... args); + namespace detail + { + template + void ApplyToEach(FunctionType&& function, Tuple&& tuple, std::index_sequence) + { + (function(std::get(tuple)), ...); + } + } // namespace detail + + template + void ApplyToEach(FunctionType&& function, std::tuple& tuple) + { + detail::ApplyToEach( + std::forward(function), + std::forward>(tuple), + std::make_index_sequence()); + } + // // FunctionTraits // @@ -75,12 +93,22 @@ namespace utilities /// FunctionTraits: A type-traits-like way to get the return type and argument types of a function /// template - struct FunctionTraits; // undefined base template + struct FunctionTraits : public FunctionTraits { }; // generic base template // Function pointers template struct FunctionTraits { + using Type = ReturnT(Args...); + using ReturnType = ReturnT; + using ArgTypes = std::tuple; + static constexpr size_t NumArgs = typename std::tuple_size(); + }; + + template + struct FunctionTraits + { + using Type = ReturnT(Args...); using ReturnType = ReturnT; using ArgTypes = std::tuple; static constexpr size_t NumArgs = typename std::tuple_size(); @@ -90,23 +118,72 @@ namespace utilities template struct FunctionTraits> { + using Type = ReturnT(Args...); using ReturnType = ReturnT; using ArgTypes = std::tuple; static constexpr size_t NumArgs = typename std::tuple_size(); }; + // const std::function template struct FunctionTraits> { + using Type = ReturnT(Args...); + using ReturnType = ReturnT; + using ArgTypes = std::tuple; + static constexpr size_t NumArgs = typename std::tuple_size(); + }; + + // class member + template + struct FunctionTraits + { + using Type = ReturnT(Args...); + using ReturnType = ReturnT; + using ArgTypes = std::tuple; + static constexpr size_t NumArgs = typename std::tuple_size(); + }; + + template + struct FunctionTraits + { + using Type = ReturnT(Args...); + using ReturnType = ReturnT; + using ArgTypes = std::tuple; + static constexpr size_t NumArgs = typename std::tuple_size(); + }; + + template + struct FunctionTraits + { + using Type = ReturnT(Args...); + using ReturnType = ReturnT; + using ArgTypes = std::tuple; + static constexpr size_t NumArgs = typename 
std::tuple_size(); + }; + + template + struct FunctionTraits + { + using Type = ReturnT(Args...); using ReturnType = ReturnT; using ArgTypes = std::tuple; static constexpr size_t NumArgs = typename std::tuple_size(); }; // Handy type aliases + template + using FunctionType = typename FunctionTraits::Type; + template using FunctionReturnType = typename FunctionTraits::ReturnType; + template + constexpr bool HasReturnValue() + { + return !std::is_same_v>; + } + template using FunctionArgTypes = typename FunctionTraits::ArgTypes; diff --git a/libraries/utilities/include/MemoryLayout.h b/libraries/utilities/include/MemoryLayout.h index 5ad6d640c..ca35301d7 100644 --- a/libraries/utilities/include/MemoryLayout.h +++ b/libraries/utilities/include/MemoryLayout.h @@ -109,6 +109,8 @@ namespace utilities /// Element access operator. int operator[](int index) const; + std::string ToString() const; + private: int& operator[](int index); }; @@ -159,6 +161,8 @@ namespace utilities /// /// The name of this type. static std::string GetTypeName() { return "MemoryShape"; } + + std::string ToString() const; }; /// A vector of numbers representing an index into a multidimensional array. @@ -195,6 +199,8 @@ namespace utilities /// /// The name of this type. static std::string GetTypeName() { return "MemoryCoordinates"; } + + std::string ToString() const; }; /// A class representing layout of a block of data in memory where the block can also @@ -553,6 +559,8 @@ namespace utilities /// a simple one dimensional vector, otherwise throws an exception. MemoryLayout Flatten() const; + std::string ToString() const; + protected: size_t GetDataOffset() const; // offset for physical entry {0,0,0...} void WriteToArchive(utilities::Archiver& archiver) const override; diff --git a/libraries/utilities/include/StringUtil.h b/libraries/utilities/include/StringUtil.h index ad8a8711f..8b06bfe21 100644 --- a/libraries/utilities/include/StringUtil.h +++ b/libraries/utilities/include/StringUtil.h @@ -22,6 +22,20 @@ namespace utilities /// `true` if the substring is contained in the string (according to `std::string::find`) bool Contains(const std::string& s, const std::string& substring); + /// Checks whether a string starts with the specified substring (using case-sensitive comparison) + /// + /// The string to search + /// The substring to search for + /// `true` if the string starts with the substring (according to `std::string::compare`) + bool StartsWith(const std::string& s, const std::string& prefix); + + /// Checks whether a string ends with the specified substring (using case-sensitive comparison) + /// + /// The string to search + /// The substring to search for + /// `true` if the string ends with the substring (according to `std::string::compare`) + bool EndsWith(const std::string& s, const std::string& suffix); + /// Returns copy of std::string with all lowercase characters /// /// The string to convert to lowercase diff --git a/libraries/utilities/include/TunableParameters.h b/libraries/utilities/include/TunableParameters.h new file mode 100644 index 000000000..ba3723b01 --- /dev/null +++ b/libraries/utilities/include/TunableParameters.h @@ -0,0 +1,165 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: TunableParameters.h (utilities) +// Authors: Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "FunctionUtils.h" 
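TunableParameters.h leans on the FunctionUtils additions above: the tuple overload of ApplyToEach and the widened FunctionTraits. A small usage sketch of those utilities (the Add function is illustrative, and the trait signatures are as reconstructed above):

    #include <iostream>
    #include <tuple>
    #include <type_traits>

    int Add(int a, int b) { return a + b; }

    // FunctionTraits exposes the return type and argument tuple of a plain
    // function; HasReturnValue distinguishes void functions.
    static_assert(std::is_same_v<ell::utilities::FunctionReturnType<decltype(Add)>, int>);
    static_assert(std::is_same_v<ell::utilities::FunctionArgTypes<decltype(Add)>, std::tuple<int, int>>);
    static_assert(ell::utilities::HasReturnValue<decltype(Add)>());

    void ApplyToEachSketch()
    {
        std::tuple<int, double> values{ 1, 2.5 };
        // Invokes the lambda once per tuple element, left to right.
        ell::utilities::ApplyToEach([](auto v) { std::cout << v << ' '; }, values);
    }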
+ +#include +#include +#include +#include +#include + +namespace ell +{ +namespace utilities +{ + /// Represents a range of values to iterate over, in conjunction with the `TuningEngine` class below. + /// + /// Construction of parameters should take place before construction of the TuningEngine, which should be passed + /// the set of parameters to iterate over, creating a full set of combinatorial possibilities. The parameters should + /// then be used in place of `T`, resulting in a full exploration of the possibilities. For example, + /// ``` + /// TunableParameter M = std::vector{ 2, 4, 6 }, N = std::vector{ 3, 5 }; + /// TuningEngine engine(M, N); + /// do { + /// std::cout << (int)M * (int)N << " "; + /// } while (engine.Next()); + /// ``` + /// will produce the following output: + /// ``` + /// 6 10 12 20 18 30 + /// ``` + /// The internal states of `TunableParameter` instances are modified when iterated over by `TuningEngine`. To reset the state + /// the `Reset()` function can be called either on an individual instance of `TunableParameter` or on `TuningEngine`, which will + /// call `Reset()` on the set of parameters the engine is operating over. + /// + template + class TunableParameter + { + public: + TunableParameter(std::vector range, const std::string& name) : + _name(name), + _range(std::move(range)), + _current(_range.begin()) + {} + + operator T() const + { + return *_current; + } + + bool Next() + { + return ++_current != _range.end(); + } + + void Reset() + { + _current = _range.begin(); + } + + std::string Name() const + { + return _name; + } + + std::string ValueString() const + { + // Obviously, this will only work if there's an overload for std::to_string that takes T + return std::to_string(*_current); + } + + std::string ToString() const + { + return Name() + ValueString(); + } + + private: + std::string _name; + std::vector _range; + typename std::vector::iterator _current; + }; + + /// Takes an arbitrary number of lvalue references to `TunableParameter` instances and iterates over them in a + /// combinatorial manner. `TunableParameter` instances have their state modified by the iteration of the engine, + /// as explained above in the documentation for `TunableParameter`. + template + class TuningEngine + { + public: + TuningEngine(TunableParameter&... params) : + _params(std::tie(params...)) + { + } + + bool Next() + { + ++_currentIteration; + return NextImpl(std::make_integer_sequence()); + } + + void Reset() + { + ApplyToEach([](auto& param) { param.Reset(); }, _params); + } + + size_t CurrentIteration() const { return _currentIteration; } + + std::string ToString(const std::string& sep = "_") const + { + return ToStringImpl(sep, std::make_index_sequence()); + } + + std::map CurrentValues() const + { + return CurrentValuesImpl(std::make_integer_sequence()); + } + + private: + template + bool NextImpl(std::integer_sequence seq) + { + // Uses fold expressions combined with boolean OR short-circuiting + // to iteratively call Next() on the individual parameters, starting + // with the last one and working our way to the first one. + return ( + [](auto& param) { + auto b = param.Next(); + if (!b) + { + param.Reset(); + } + return b; + }(std::get<(seq.size() - 1) - Is>(_params)) || + ...); + } + + template + std::string ToStringImpl(const std::string& sep, std::index_sequence seq) const + { + return ((std::get(_params).ToString() + sep) + ... 
+ (std::get(_params).ToString())); + } + + template + std::map CurrentValuesImpl(std::integer_sequence seq) const + { + std::map result; + ([&](auto& param) { + result[param.Name()] = param.ValueString(); + }(std::get<(seq.size() - 1) - Is>(_params)), + ...); + return result; + } + + std::tuple&...> _params; + size_t _currentIteration = 0; + }; +} // namespace utilities +} // namespace ell diff --git a/libraries/utilities/include/TypeAliases.h b/libraries/utilities/include/TypeAliases.h index 8e24eb85a..ea8387194 100644 --- a/libraries/utilities/include/TypeAliases.h +++ b/libraries/utilities/include/TypeAliases.h @@ -8,6 +8,7 @@ #pragma once +#include #include namespace ell diff --git a/libraries/utilities/include/TypeTraits.h b/libraries/utilities/include/TypeTraits.h index b66355acf..5bfbe43c7 100644 --- a/libraries/utilities/include/TypeTraits.h +++ b/libraries/utilities/include/TypeTraits.h @@ -26,6 +26,11 @@ namespace utilities {}; } // namespace detail + /// Templated type that always has a value of `false`. + template + struct FalseType : std::false_type + {}; + /// Enabled if ValueType is a boolean. template using IsBoolean = std::enable_if_t, bool>::value, bool>; @@ -161,7 +166,7 @@ namespace utilities { using type = std::remove_cv_t>; }; - } + } // namespace detail // Convenience type alias to remove all references and const/volatile qualifiers // from a type. diff --git a/libraries/utilities/src/Files.cpp b/libraries/utilities/src/Files.cpp index 8dfd5c51e..9631c8075 100644 --- a/libraries/utilities/src/Files.cpp +++ b/libraries/utilities/src/Files.cpp @@ -36,10 +36,15 @@ namespace utilities const auto& path = filepath; #endif // open file + if(!FileExists(filepath)) + { + throw utilities::InputException(InputExceptionErrors::invalidArgument, "file " + filepath + " doesn't exist"); + } + auto stream = std::ifstream(path, mode); // check that it opened - if (!stream.is_open()) + if (!stream) { throw utilities::InputException(InputExceptionErrors::invalidArgument, "error opening file " + filepath); } diff --git a/libraries/utilities/src/MemoryLayout.cpp b/libraries/utilities/src/MemoryLayout.cpp index b11ea5bb3..fc7ca98c5 100644 --- a/libraries/utilities/src/MemoryLayout.cpp +++ b/libraries/utilities/src/MemoryLayout.cpp @@ -13,6 +13,7 @@ #include #include +#include namespace ell { @@ -154,6 +155,13 @@ namespace utilities return true; } + std::string DimensionOrder::ToString() const + { + std::stringstream ss; + ss << *this; + return ss.str(); + } + // // MemoryShape / Coordinates // @@ -177,6 +185,20 @@ namespace utilities } } + std::string MemoryShape::ToString() const + { + std::stringstream ss; + ss << *this; + return ss.str(); + } + + std::string MemoryCoordinates::ToString() const + { + std::stringstream ss; + ss << *this; + return ss.str(); + } + // // MemoryLayout // @@ -582,6 +604,13 @@ namespace utilities } } + std::string MemoryLayout::ToString() const + { + std::stringstream ss; + ss << *this; + return ss.str(); + } + bool Equal(const DimensionVector& shape1, const DimensionVector& shape2) { auto size = shape1.NumDimensions(); @@ -706,7 +735,7 @@ namespace utilities else { throw InputException(InputExceptionErrors::invalidArgument, - "Cannot flatten a discontiguous MemoryLayout."); + "Cannot flatten a discontiguous MemoryLayout."); } } diff --git a/libraries/utilities/src/StringUtil.cpp b/libraries/utilities/src/StringUtil.cpp index c99b51b2c..2b2515beb 100644 --- a/libraries/utilities/src/StringUtil.cpp +++ b/libraries/utilities/src/StringUtil.cpp @@ -23,6 +23,16 @@ 
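Putting the two classes together: TuningEngine walks the cartesian product odometer-style, with the last parameter varying fastest, and ToString()/CurrentValues() label each configuration for logging. A usage sketch modeled on the header's own doc comment (the parameter names, ranges, and include path are assumptions):

    #include <iostream>
    #include <vector>

    #include <utilities/include/TunableParameters.h>

    void SweepSketch()
    {
        using namespace ell::utilities;
        TunableParameter<int> kernelM(std::vector<int>{ 2, 4 }, "kM");
        TunableParameter<int> kernelN(std::vector<int>{ 4, 8 }, "kN");
        TuningEngine engine(kernelM, kernelN);
        do
        {
            // Prints kM2_kN4, kM2_kN8, kM4_kN4, kM4_kN8 -- one per combination
            std::cout << engine.ToString() << "\n";
        } while (engine.Next());
        engine.Reset(); // rewinds every parameter to the start of its range
    }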
namespace utilities
         return s.find(substring) != std::string::npos;
     }
 
+    bool StartsWith(const std::string& s, const std::string& prefix)
+    {
+        return s.size() >= prefix.size() && s.compare(0, prefix.size(), prefix) == 0;
+    }
+
+    bool EndsWith(const std::string& s, const std::string& suffix)
+    {
+        return s.size() >= suffix.size() && s.compare(s.size() - suffix.size(), std::string::npos, suffix) == 0;
+    }
+
     std::string ToLowercase(const std::string& s)
     {
         std::string lower = s;
diff --git a/libraries/utilities/test/include/TunableParameters_test.h b/libraries/utilities/test/include/TunableParameters_test.h
new file mode 100644
index 000000000..573de3bdc
--- /dev/null
+++ b/libraries/utilities/test/include/TunableParameters_test.h
@@ -0,0 +1,15 @@
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Project:  Embedded Learning Library (ELL)
+//  File:     TunableParameters_test.h (utilities)
+//  Authors:  Kern Handa
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+namespace ell
+{
+void TunableParameters_test1();
+void TunableParameters_test2();
+} // namespace ell
diff --git a/libraries/utilities/test/src/TunableParameters_test.cpp b/libraries/utilities/test/src/TunableParameters_test.cpp
new file mode 100644
index 000000000..36d1409f8
--- /dev/null
+++ b/libraries/utilities/test/src/TunableParameters_test.cpp
@@ -0,0 +1,80 @@
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Project:  Embedded Learning Library (ELL)
+//  File:     TunableParameters_test.cpp (utilities)
+//  Authors:  Kern Handa
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "TunableParameters_test.h"
+
+#include <utilities/include/TunableParameters.h>
+
+#include <testing/include/testing.h>
+
+#include <algorithm>
+#include <vector>
+
+namespace ell
+{
+
+using namespace utilities;
+
+void TunableParameters_test1()
+{
+    std::vector expected{ 1, 2, 3, 4 };
+    std::vector<int> actual;
+    TunableParameter p(expected, "expected");
+    TuningEngine engine(p);
+    do
+    {
+        actual.push_back(p);
+    } while (engine.Next());
+
+    testing::ProcessTest("TunableParameters_test1", actual == expected);
+}
+
+void TunableParameters_test2()
+{
+    std::vector p1Values{ 1, 2, 3 };
+    std::vector p2Values{ 4, 5 };
+    std::vector p3Values{ 6, 7, 8 };
+    TunableParameter p1(p1Values, "p1");
+    TunableParameter p2(p2Values, "p2");
+    TunableParameter p3(p3Values, "p3");
+    std::vector expected{
+        std::vector{ 1, 4, 6 },
+        std::vector{ 1, 4, 7 },
+        std::vector{ 1, 4, 8 },
+        std::vector{ 1, 5, 6 },
+        std::vector{ 1, 5, 7 },
+        std::vector{ 1, 5, 8 },
+        std::vector{ 2, 4, 6 },
+        std::vector{ 2, 4, 7 },
+        std::vector{ 2, 4, 8 },
+        std::vector{ 2, 5, 6 },
+        std::vector{ 2, 5, 7 },
+        std::vector{ 2, 5, 8 },
+        std::vector{ 3, 4, 6 },
+        std::vector{ 3, 4, 7 },
+        std::vector{ 3, 4, 8 },
+        std::vector{ 3, 5, 6 },
+        std::vector{ 3, 5, 7 },
+        std::vector{ 3, 5, 8 },
+    };
+    std::vector<std::vector<int>> actual;
+    TuningEngine engine(p1, p2, p3);
+    do
+    {
+        actual.push_back(std::vector<int>{ p1, p2, p3 });
+    } while (engine.Next());
+
+    testing::ProcessTest(
+        "TunableParameters_test2",
+        std::equal(expected.begin(), expected.end(), actual.begin(), actual.end()));
+
+    engine.Reset();
+    testing::ProcessTest("TunableParameters_test2 - Reset", expected[0] == std::vector<int>{ p1, p2, p3 });
+}
+
+} // namespace ell
diff --git a/libraries/utilities/test/src/main.cpp b/libraries/utilities/test/src/main.cpp
index e79c72a44..c176e7766 100644
--- a/libraries/utilities/test/src/main.cpp
+++ b/libraries/utilities/test/src/main.cpp
@@ -16,6 +16,7 @@
 #include "ObjectArchive_test.h"
 #include "PropertyBag_test.h"
 #include "RingBuffer_test.h"
+#include "TunableParameters_test.h"
 #include "TypeFactory_test.h"
 #include "TypeName_test.h"
 #include "Variant_test.h"
@@ -117,6 +118,10 @@ int main(int argc, char* argv[])
         // PropertyBag tests
         TestPropertyBag();
         TestRecursivePropertyBag();
+
+        // TunableParameters
+        TunableParameters_test1();
+        TunableParameters_test2();
     }
     catch (const utilities::Exception& exception)
     {
diff --git a/libraries/value/CMakeLists.txt b/libraries/value/CMakeLists.txt
index f89302aa6..b821536d4 100644
--- a/libraries/value/CMakeLists.txt
+++ b/libraries/value/CMakeLists.txt
@@ -5,31 +5,59 @@
 set(library_name value)
 
 set(src
+    src/Array.cpp
+    src/CachingProvider.cpp
+    src/CachingStrategies.cpp
     src/ComputeContext.cpp
+    src/CppEmitterContext.cpp
     src/Emittable.cpp
     src/EmitterContext.cpp
     src/FunctionDeclaration.cpp
     src/LLVMContext.cpp
+    src/LoopNests.cpp
     src/Matrix.cpp
     src/MatrixOperations.cpp
+    src/Print.cpp
     src/Reference.cpp
     src/Scalar.cpp
+    src/ScalarOperations.cpp
     src/Tensor.cpp
     src/TensorOperations.cpp
     src/Value.cpp
     src/ValueOperations.cpp
     src/Vector.cpp
     src/VectorOperations.cpp
+
+    src/loopnests/CodeGenerator.cpp
+    src/loopnests/CodePositionConstraints.cpp
+    src/loopnests/ForAll.cpp
+    src/loopnests/Index.cpp
+    src/loopnests/IndexRange.cpp
+    src/loopnests/IterationDomain.cpp
+    src/loopnests/Kernel.cpp
+    src/loopnests/KernelPredicate.cpp
+    src/loopnests/LoopNest.cpp
+    src/loopnests/LoopNestPrinter.cpp
+    src/loopnests/LoopNestVisitor.cpp
+    src/loopnests/Range.cpp
+    src/loopnests/SplitIndexRange.cpp
+    src/loopnests/SplitIterationDomain.cpp
 )
 
 set(include
+    include/Array.h
+    include/CachingProvider.h
+    include/CachingStrategies.h
     include/ComputeContext.h
+    include/CppEmitterContext.h
     include/Emittable.h
     include/EmitterContext.h
     include/FunctionDeclaration.h
     include/LLVMContext.h
+    include/LoopNests.h
     include/Matrix.h
     include/MatrixOperations.h
+    include/Print.h
     include/Reference.h
     include/Scalar.h
     include/Tensor.h
@@ -39,6 +67,22 @@ set(include
     include/ValueType.h
     include/Vector.h
     include/VectorOperations.h
+
+    include/loopnests/CodeGenerator.h
+    include/loopnests/CodePositionConstraints.h
+    include/loopnests/ForAll.h
+    include/loopnests/Index.h
+    include/loopnests/IndexRange.h
+    include/loopnests/IterationDomain.h
+    include/loopnests/Kernel.h
+    include/loopnests/KernelPredicate.h
+    include/loopnests/LoopIndexInfo.h
+    include/loopnests/LoopNest.h
+    include/loopnests/LoopNestPrinter.h
+    include/loopnests/LoopNestVisitor.h
+    include/loopnests/Range.h
+    include/loopnests/SplitIndexRange.h
+    include/loopnests/SplitIterationDomain.h
 )
 
 set(doc
@@ -53,7 +97,6 @@
 target_include_directories(${library_name} PRIVATE include ${ELL_LIBRARIES_DIR})
 target_include_directories(${library_name} SYSTEM PUBLIC ${LLVM_INCLUDE_DIRS})
 target_link_libraries(${library_name} PUBLIC ${LLVM_LIBS} emitters utilities)
 target_compile_options(${library_name} PUBLIC ${LLVM_COMPILE_OPTIONS})
-
 set_property(TARGET ${library_name} PROPERTY FOLDER "libraries")
 #
@@ -68,21 +111,33 @@ endif()
 
 set(test_src
     test/src/main.cpp
+    test/src/CachingStrategy_test.cpp
+    test/src/Functions_test.cpp
+    test/src/LoopNest_convolution_test.cpp
+    test/src/LoopNest_kernels.cpp
+    test/src/LoopNest_test.cpp
+    test/src/LoopNestAPI_test.cpp
     test/src/Matrix_test.cpp
     test/src/Scalar_test.cpp
     test/src/Tensor_test.cpp
     test/src/TestUtil.cpp
-    test/src/Vector_test.cpp
     test/src/Value_test.cpp
+ test/src/Vector_test.cpp ) set(test_include + test/include/CachingStrategy_test.h + test/include/Functions_test.h + test/include/LoopNest_convolution_test.h + test/include/LoopNest_kernels.h + test/include/LoopNest_test.h + test/include/LoopNestAPI_test.h test/include/Matrix_test.h test/include/Scalar_test.h test/include/Tensor_test.h test/include/TestUtil.h - test/include/Vector_test.h test/include/Value_test.h + test/include/Vector_test.h ) source_group("src" FILES ${test_src}) diff --git a/libraries/value/README.md b/libraries/value/README.md index addcf9c8b..fa4ab9f29 100644 --- a/libraries/value/README.md +++ b/libraries/value/README.md @@ -27,7 +27,7 @@ littering it with namespace-level qualifications. NB: this may not be necessary due to ADL, depending on what one is trying to do. As this API will be type-erased (more below), usage of this API does not need -to be templated and thus does not need to be in `.h`/`.tcc` files. +to be templated and thus does not need to be in `.h` files. ## Classes * `Value` - top-level type-erased class that will be the basis of all diff --git a/libraries/value/include/Array.h b/libraries/value/include/Array.h new file mode 100644 index 000000000..038b207f7 --- /dev/null +++ b/libraries/value/include/Array.h @@ -0,0 +1,185 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: Array.h (value) +// Authors: Chuck Jacobs +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "EmitterContext.h" + +#include +#include + +#include +#include + +namespace ell +{ +namespace value +{ + + /// Wraps a Value instance and enforces a memory layout that represents a multidimensional array + class Array + { + public: + Array(); + + /// Constructor that wraps the provided instance of Value + /// The Value instance to wrap + Array(Value value, const std::string& name = ""); + + /// Constructs an instance from a 1D std::vector reshaped into the given array shape + /// Any fundamental type accepted by Value + /// The data represented as a std::vector, in canonical row-major layout + /// The shape of the memory + template + Array(const std::vector& data, const utilities::MemoryShape& shape); + + /// Constructs an instance from a 1D std::vector reshaped into the given array shape + /// Any fundamental type accepted by Value + /// The data represented as a std::vector, in canonical row-major layout + /// The layout of the memory + template + Array(const std::vector& data, const utilities::MemoryLayout& layout); + + Array(const Array&); + Array(Array&&) noexcept; + Array& operator=(const Array&); + Array& operator=(Array&&); + ~Array(); + + /// Array element access operator. + /// The Scalar value wrapping the value that is at the specified index within the array + Scalar operator()(const std::vector& indices); + + /// Array element access operator. + /// A copy of the Scalar value that is at the specified index within the array + Scalar operator()(const std::vector& indices) const; + + /// Array element access operator. + /// The Scalar value wrapping the value that is at the specified index within the array + template + Scalar operator()(T... indices); + + /// Array element access operator. + /// A copy of the Scalar value that is at the specified index within the array + template + Scalar operator()(T... 
indices) const; + + /// Gets the underlying wrapped Value instance + Value GetValue() const; + + /// Creates a new Array instance that contains the same data as this instance + /// A new Array instance that points to a new, distinct memory that contains the same data as this instance + Array Copy() const; + + /// Returns the number of active elements + /// The size of the array + size_t Size() const; + + /// Retrieves the type of data stored in the wrapped Value instance + /// The type + ValueType Type() const; + + void SetName(const std::string& name); + std::string GetName() const; + + private: + Value _value; + }; + + /// Creates a for loop over the array + /// The instance of Array that references the data over which to iterate + /// The function to be called for each coordinate where there is an active element + void For(Array array, std::function&)> fn); + + /// Constructs an allocated instance with the specified dimensions + /// Any fundamental type accepted by Value + /// The shape of the memory + template + Array MakeArray(const utilities::MemoryShape& shape) + { + return Array(Allocate(utilities::MemoryLayout(shape))); + } + + /// Constructs an allocated instance with the specified dimensions + /// Any fundamental type accepted by Value + /// The shape of the memory + /// The name of the allocated matrix + template + Array MakeArray(const utilities::MemoryShape& shape, std::string name) + { + auto result = MakeArray(shape); + result.SetName(name); + return result; + } + +} // namespace value +} // namespace ell + +#pragma region implementation + +namespace ell +{ +namespace value +{ + template + Array::Array(const std::vector& data, const utilities::MemoryShape& shape) + { + using namespace utilities; + + int size = static_cast(data.size()); + if (size != shape.NumElements()) + { + throw InputException(InputExceptionErrors::invalidSize); + } + _value = Value(data, MemoryLayout(shape)); + } + + template + Array::Array(const std::vector& data, const utilities::MemoryLayout& layout) + { + using namespace utilities; + + auto size = data.size(); + if (size != layout.GetMemorySize()) + { + throw InputException(InputExceptionErrors::invalidSize); + } + _value = Value(data, layout); + } + + template + Scalar Array::operator()(T... indices) + { + static_assert(utilities::AllSame...>); + if (sizeof...(T) != GetValue().GetLayout().NumDimensions()) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidSize); + } + Value indexedValue = GetContext().Offset(_value, { indices... }); + indexedValue.SetLayout(utilities::ScalarLayout); + + return indexedValue; + } + + template + Scalar Array::operator()(T... indices) const + { + static_assert(utilities::AllSame...>); + if (sizeof...(T) != GetValue().GetLayout().NumDimensions()) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidSize); + } + Value indexedValue = GetContext().Offset(_value, { indices... }); // shouldn't this be "load(offset(...))"? 
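+        // Offset() returns a view of the element at the given coordinates;
+        // tagging it with ScalarLayout below turns it into a single-element
+        // view. Because this overload is const, the result is returned via
+        // Copy() so callers cannot mutate the array through it.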
+ indexedValue.SetLayout(utilities::ScalarLayout); + + return Scalar(indexedValue).Copy(); + } +} // namespace value +} // namespace ell + +#pragma endregion implementation diff --git a/libraries/value/include/CachingProvider.h b/libraries/value/include/CachingProvider.h new file mode 100644 index 000000000..fa3eb95ae --- /dev/null +++ b/libraries/value/include/CachingProvider.h @@ -0,0 +1,191 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: CachingProvider.h (value) +// Authors: Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "EmitterContext.h" + +#include "loopnests/Index.h" + +#include +#include +#include +#include +#include + +namespace ell +{ +namespace value +{ + using loopnests::Index; + + class LoopNest; + + class CachingProvider + { + public: + CachingProvider() = default; + virtual ~CachingProvider() = default; + + void Initialize(ViewAdapter view, utilities::MemoryShape cacheShape, utilities::DimensionOrder order, std::vector kernelIndices, std::vector atIndices, std::any extra); + + void HandleCaching(LoopNest&); + + protected: + Value _value; + utilities::MemoryShape _shape; + utilities::DimensionOrder _order; + std::vector _kernelIndices; + std::vector _atIndices; + std::any _extra; + + private: + virtual void HandleCachingImpl(LoopNest&) = 0; + }; + + namespace + { + class CachingHelper + { + public: + CachingHelper(const CachingHelper&) = delete; + CachingHelper& operator=(const CachingHelper&) = delete; + CachingHelper(CachingHelper&& other) + { + *this = std::move(other); + } + + CachingHelper& operator=(CachingHelper&& other) + { + if (this != &other) + { + std::swap(_value, other._value); + std::swap(_atIndices, other._atIndices); + std::swap(_kernelIndices, other._kernelIndices); + std::swap(_shape, other._shape); + std::swap(_order, other._order); + std::swap(_provider, other._provider); + std::swap(_extra, other._extra); + } + + return *this; + } + + CachingHelper(ViewAdapter view) : + _value(view) + { + if (!_value.IsDefined()) + { + throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState, + "View to be cached is not defined"); + } + } + + CachingHelper Using(std::vector indices) && + { + if (indices.empty()) + { + throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState, + "Indices that specify caching indices cannot be empty"); + } + + _kernelIndices = indices; + if (_atIndices.empty()) + { + _atIndices = _kernelIndices; + } + + return std::move(*this); + } + + CachingHelper At(std::vector indices) && + { + if (indices.empty()) + { + throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState, + "Indices that specify caching kernel location cannot be empty"); + } + + _atIndices = indices; + return std::move(*this); + } + + CachingHelper Size(utilities::MemoryShape shape, utilities::DimensionOrder order) && + { + _shape = shape; + _order = order; + + return std::move(*this); + } + + CachingHelper Size(utilities::MemoryShape shape) && + { + return std::move(*this).Size(shape, utilities::DimensionOrder(shape.NumDimensions())); + } + + template + CachingHelper Extra(Ts&&... 
ts) && + { + _extra = std::make_tuple(std::forward(ts)...); + return std::move(*this); + } + + template + CachingHelper Type(T&&) && + { + if (!_value.IsDefined()) + { + throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState, + "View to be cached is not defined"); + } + if (_kernelIndices.empty()) + { + throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState, + "Indices that specify caching location cannot be empty"); + } + if (!_shape) + { + throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState, + "Memory shape of cached location must be provided"); + } + + auto provider = std::make_unique(); + provider->Initialize(std::move(_value), std::move(*_shape), std::move(*_order), std::move(_kernelIndices), std::move(_atIndices), std::move(_extra)); + _provider = std::move(provider); + return std::move(*this); + } + + operator std::unique_ptr() && + { + return std::move(_provider); + } + + private: + Value _value; + std::vector _atIndices; + std::vector _kernelIndices; + std::optional _shape; + std::optional _order; + std::unique_ptr _provider; + std::any _extra; + }; + } // namespace + + inline CachingHelper CreateCacheFor(ViewAdapter view) + { + return CachingHelper(view); + } + + template + struct CachingStrategyType + { + using ProviderType = CachingProviderType; + }; + +} // namespace value +} // namespace ell diff --git a/libraries/value/include/CachingStrategies.h b/libraries/value/include/CachingStrategies.h new file mode 100644 index 000000000..9e78b742a --- /dev/null +++ b/libraries/value/include/CachingStrategies.h @@ -0,0 +1,63 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: CachingStrategies.h (value) +// Authors: Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "CachingProvider.h" + +namespace ell +{ +namespace value +{ + enum class BoundaryConditionHandling : int + { + ZeroPadding + }; + + using ReduceFunctionType = void(value::Scalar, value::Scalar); + void CopyReduce(value::Scalar, value::Scalar); + void SumReduce(value::Scalar, value::Scalar); + + class CopyInputCopyOutput : public CachingProvider + { + void HandleCachingImpl(LoopNest&) override; + }; + + class CopyInputNoOutput : public CachingProvider + { + void HandleCachingImpl(LoopNest&) override; + }; + + class ZeroInputReduceOutput : public CachingProvider + { + void HandleCachingImpl(LoopNest&) override; + }; + + class BLASTCopy : public CachingProvider + { + public: + void HandleCachingImpl(LoopNest&) override; + + Value _rawCache; + }; + + class GeneralCachingStrategy : public CachingProvider + { + public: + void HandleCachingImpl(LoopNest&) override; + Value _rawCache; + }; + + using SubMatrixCopyInCopyOutCache = CachingStrategyType; + using SubMatrixCopyIn = CachingStrategyType; + using ZeroInputCopyOutMatrixCache = CachingStrategyType; + + using BLASTCopyCache = CachingStrategyType; + +} // namespace value +} // namespace ell diff --git a/libraries/value/include/ComputeContext.h b/libraries/value/include/ComputeContext.h index 22d45a6b9..844cc97c4 100644 --- a/libraries/value/include/ComputeContext.h +++ b/libraries/value/include/ComputeContext.h @@ -12,9 +12,11 @@ #include "FunctionDeclaration.h" #include "Scalar.h" +#include #include #include #include +#include #include #include @@ -34,12 +36,12 @@ namespace value const ConstantData& 
GetConstantData(Value value) const; private: - Value AllocateImpl(ValueType type, MemoryLayout layout) override; + Value AllocateImpl(ValueType type, MemoryLayout layout, size_t alignment, AllocateFlags flags) override; std::optional GetGlobalValue(GlobalAllocationScope scope, std::string name) override; - Value GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ConstantData data, MemoryLayout layout) override; - Value GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ValueType type, MemoryLayout layout) override; + Value GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ConstantData data, MemoryLayout layout, AllocateFlags flags) override; + Value GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ValueType type, MemoryLayout layout, AllocateFlags flags) override; detail::ValueTypeDescription GetTypeImpl(Emittable emittable) override; @@ -48,8 +50,8 @@ namespace value Value StoreConstantDataImpl(ConstantData data) override; - void ForImpl(MemoryLayout layout, std::function)> fn) override; - void ForImpl(Scalar start, Scalar stop, Scalar step, std::function fn) override; + void ForImpl(MemoryLayout layout, std::function)> fn, const std::string& name) override; + void ForImpl(Scalar start, Scalar stop, Scalar step, std::function fn, const std::string& name) override; void MoveDataImpl(Value& source, Value& destination) override; @@ -71,12 +73,15 @@ namespace value IfContext IfImpl(Scalar test, std::function fn) override; + void WhileImpl(Scalar test, std::function fn) override; + std::optional CallImpl(FunctionDeclaration func, std::vector args) override; void PrefetchImpl(Value data, PrefetchType type, PrefetchLocality locality) override; void ParallelizeImpl(int numTasks, std::vector captured, std::function)> fn) override; + void DebugBreakImpl() override; void DebugDumpImpl(Value value, std::string tag, std::ostream& stream) const override; void DebugDumpImpl(FunctionDeclaration fn, std::string tag, std::ostream& stream) const override; void DebugPrintImpl(std::string message) override; @@ -84,6 +89,10 @@ namespace value void SetNameImpl(const Value& value, const std::string& name) override; std::string GetNameImpl(const Value& value) const override; + void ImportCodeFileImpl(std::string) override; + + Scalar GetFunctionAddressImpl(const FunctionDeclaration& fn) override; + Value IntrinsicCall(FunctionDeclaration intrinsic, std::vector args); bool ValidateValue(Value value) const; @@ -102,9 +111,12 @@ namespace value Frame& GetTopFrame(); const Frame& GetTopFrame() const; + friend void swap(ComputeContext&, ComputeContext&) noexcept; + class IfContextImpl; struct FunctionScope; + mutable std::recursive_mutex _mutex; std::stack _stack; std::map> _globals; std::unordered_map _definedFunctions; diff --git a/libraries/value/include/CppEmitterContext.h b/libraries/value/include/CppEmitterContext.h new file mode 100644 index 000000000..f166a2969 --- /dev/null +++ b/libraries/value/include/CppEmitterContext.h @@ -0,0 +1,166 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: CppEmitterContext.h (value) +// Authors: Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "ComputeContext.h" +#include "EmitterContext.h" +#include "FunctionDeclaration.h" + +#include +#include +#include +#include +#include +#include +#include +#include + 
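// ---------------------------------------------------------------------------
// A generic illustration (not ELL code) of the rvalue-qualified builder
// chaining used by CachingHelper in CachingProvider.h earlier in this patch:
// each setter is &&-qualified and returns std::move(*this), so a fluent chain
// works on temporaries and moves state along instead of copying it.
// ---------------------------------------------------------------------------
#include <iostream>
#include <string>
#include <utility>

class Builder
{
public:
    Builder Name(std::string name) &&
    {
        _name = std::move(name);
        return std::move(*this); // only callable on rvalues, i.e. mid-chain
    }

    Builder Size(int size) &&
    {
        _size = size;
        return std::move(*this);
    }

    void Print() const { std::cout << _name << ": " << _size << '\n'; }

private:
    std::string _name;
    int _size = 0;
};

int main()
{
    // Each call consumes the temporary and hands its state to the next link.
    Builder{}.Name("cache").Size(64).Print();
}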
+namespace ell +{ +namespace value +{ + class CppEmitterContext : public EmitterContext + { + public: + CppEmitterContext(std::string moduleName, std::ostream& stream); + CppEmitterContext(std::string modulename, std::unique_ptr stream); + CppEmitterContext(const TargetDevice& target, std::string moduleName, std::ostream& stream); + CppEmitterContext(const TargetDevice& target, std::string modulename, std::unique_ptr stream); + + ~CppEmitterContext(); + + private: + Value AllocateImpl(ValueType, MemoryLayout, size_t /* alignment */, AllocateFlags flags) override; + Value AllocateImpl(detail::ValueTypeDescription, std::optional, std::string, std::optional = std::nullopt, bool = false); + + std::optional GetGlobalValue(GlobalAllocationScope scope, std::string name) override; + Value GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ConstantData data, MemoryLayout layout, AllocateFlags flags) override; + Value GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ValueType type, MemoryLayout layout, AllocateFlags flags) override; + + detail::ValueTypeDescription GetTypeImpl(Emittable) override; + + DefinedFunction CreateFunctionImpl(FunctionDeclaration decl, DefinedFunction fn) override; + bool IsFunctionDefinedImpl(FunctionDeclaration decl) const override; + + Value StoreConstantDataImpl(ConstantData data) override; + + void ForImpl(MemoryLayout layout, std::function)> fn, const std::string& name) override; + void ForImpl(Scalar start, Scalar stop, Scalar step, std::function fn, const std::string& name) override; + + void MoveDataImpl(Value& source, Value& destination) override; + + void CopyDataImpl(const Value& source, Value& destination) override; + + Value ReferenceImpl(Value source) override; + + Value DereferenceImpl(Value source) override; + + Value OffsetImpl(Value source, Value offset) override; + + Value UnaryOperationImpl(ValueUnaryOperation op, Value destination) override; + Value BinaryOperationImpl(ValueBinaryOperation op, Value destination, Value source) override; + + Value LogicalOperationImpl(ValueLogicalOperation op, Value source1, Value source2) override; + + Value CastImpl(Value value, ValueType type) override; + + IfContext IfImpl(Scalar test, std::function fn) override; + + void WhileImpl(Scalar test, std::function fn) override; + + std::optional CallImpl(FunctionDeclaration func, std::vector args) override; + + void PrefetchImpl(Value data, PrefetchType type, PrefetchLocality locality) override; + + void ParallelizeImpl(int numTasks, std::vector captured, std::function)> fn) override; + + void DebugBreakImpl() override; + void DebugDumpImpl(Value value, std::string tag, std::ostream& stream) const override; + void DebugDumpImpl(FunctionDeclaration fn, std::string tag, std::ostream& stream) const override; + void DebugPrintImpl(std::string message) override; + + void SetNameImpl(const Value& value, const std::string& name) override; + std::string GetNameImpl(const Value& value) const override; + + void ImportCodeFileImpl(std::string) override; + + Scalar GetFunctionAddressImpl(const FunctionDeclaration& fn) override; + + Value IntrinsicCall(FunctionDeclaration intrinsic, std::vector args); + std::optional EmitExternalCall(FunctionDeclaration externalFunc, std::vector args); + + void DeclareFunction(FunctionDeclaration decl); + std::ostream& WriteFunctionSignature(std::ostream& stream, FunctionDeclaration decl); + + std::ostream& Out(); + std::ostream& Global(); + std::ostream& FnDecl(); + + std::string 
GetScopeAdjustedName(GlobalAllocationScope scope, std::string name) const; + std::string GetGlobalScopedName(std::string name) const; + std::string GetCurrentFunctionScopedName(std::string name) const; + + Value SimpleNumericIntrinsic(FunctionDeclaration intrinsic, std::vector args); + Value MaxMinIntrinsic(FunctionDeclaration intrinsic, std::vector args); + Value PowIntrinsic(FunctionDeclaration intrinsic, std::vector args); + Value CopySignIntrinsic(FunctionDeclaration intrinsic, std::vector args); + Value FmaIntrinsic(FunctionDeclaration intrinsic, std::vector args); + Value MemFnIntrinsic(FunctionDeclaration intrinsic, std::vector args); + + template + void Indented(Fn&&); + + struct PromotedConstantDataDescription + { + const ConstantData* data; + Emittable realValue; + }; + + Value PromoteConstantData(Value value); + std::optional HasBeenPromoted(Value value) const; + Value Realize(Value value); + Value EnsureEmittable(Value value); + + std::string ScalarToString(ViewAdapter scalar) const; + + struct ValueImpl + { + std::string name; + detail::ValueTypeDescription typeDesc; + }; + + struct FnContext + { + std::forward_list dataList; + std::string name; + }; + + class IfContextImpl; + struct FunctionScope; + + std::unique_ptr _ownedStream; + ComputeContext _computeContext; + + std::stack> _promotedConstantStack; + std::stack _fnStacks; + std::ostringstream _globalStream; + std::ostringstream _fnDeclStream; + std::ostringstream _expressionStream; + std::reference_wrapper _stream; + std::reference_wrapper _outputStream; + std::unordered_map _definedFunctions; + std::map> _globals; + std::forward_list _globalsList; + std::unordered_set _declaredFunctions; + std::string _moduleName; + size_t _indent = 0; + }; + +} // namespace value +} // namespace ell diff --git a/libraries/value/include/EmitterContext.h b/libraries/value/include/EmitterContext.h index 0f1fe1b79..7a1dfbad4 100644 --- a/libraries/value/include/EmitterContext.h +++ b/libraries/value/include/EmitterContext.h @@ -13,13 +13,17 @@ #include "Value.h" #include "ValueType.h" +#include + #include +#include #include #include #include #include #include +#include #include #include #include @@ -31,6 +35,7 @@ namespace ell { namespace value { + using emitters::TargetDevice; namespace detail { @@ -42,6 +47,13 @@ namespace value enum class PrefetchType; enum class PrefetchLocality; + enum class AllocateFlags : uint64_t + { + None = 0, + ThreadLocal = 1 << 0, + }; + ELL_DEFINE_ENUM_FLAG_OPERATORS(AllocateFlags); + /// An interface describing the global context that's used by the Value library /// This class employs the non-virtual interface pattern to provide an easy to use API while /// minimizing the functions needed to be overloaded. 
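// ---------------------------------------------------------------------------
// A minimal illustration (not ELL code) of the non-virtual interface pattern
// described in the doc comment above: the public entry point is non-virtual
// and forwards to a private virtual hook, exactly as EmitterContext::For
// forwards to ForImpl later in this header.
// ---------------------------------------------------------------------------
#include <iostream>

class ContextBase
{
public:
    void For(int n) { ForImpl(n); } // stable public API, never overridden

private:
    virtual void ForImpl(int n) = 0; // the only customization point
};

class PrintingContext : public ContextBase
{
private:
    void ForImpl(int n) override
    {
        for (int i = 0; i < n; ++i)
        {
            std::cout << i << '\n';
        }
    }
};

int main()
{
    PrintingContext ctx;
    ctx.For(3); // prints 0, 1, 2 via the subclass's ForImpl
}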
@@ -73,11 +85,14 @@ namespace value public: IfContext(std::unique_ptr impl); IfContext(const IfContext&) = delete; - IfContext(IfContext&&) = delete; + IfContext(IfContext&&) = default; IfContext&& ElseIf(Scalar, std::function) &&; void Else(std::function) &&; + void ElseIf(Scalar, std::function) &; + void Else(std::function) &; + private: std::unique_ptr _impl; }; @@ -85,32 +100,43 @@ namespace value /// Describes the type that can be used to represent constant C++ data using ConstantData = detail::ConstantData; + EmitterContext(const TargetDevice& target) : + _targetDevice(target) {} + virtual ~EmitterContext(); /// Allocates data with the specified type and size /// The type of the data to allocate /// The size of the allocation, in number of elements - /// An instance of Value that contains a referece to the allocated memory - Value Allocate(ValueType type, size_t size); + /// The byte alignment to use for the allocated value. + /// Any additional flags. Not all contexts may support all flags. + /// An instance of Value that contains a reference to the allocated memory + Value Allocate(ValueType type, size_t size, size_t alignment = 0, AllocateFlags flags = AllocateFlags::None); /// Allocates data with the specified type and size /// The type of the data to allocate /// The memory layout of the allocation, in number of elements - /// An instance of Value that contains a referece to the allocated memory - Value Allocate(ValueType type, MemoryLayout layout); + /// The byte alignment to use for the allocated value. + /// Any additional flags. Not all contexts may support all flags. + /// An instance of Value that contains a reference to the allocated memory + Value Allocate(ValueType type, MemoryLayout layout, size_t alignment = 0, AllocateFlags flags = AllocateFlags::None); /// Allocates function static data /// The name of the variable /// The type of the data /// The layout of the data - Value StaticAllocate(std::string name, ValueType type, utilities::MemoryLayout layout); + /// Any additional flags. Not all contexts may support all flags. + /// An instance of Value that contains a reference to the allocated memory + Value StaticAllocate(std::string name, ValueType type, utilities::MemoryLayout layout, AllocateFlags flags = AllocateFlags::None); /// Allocates function static data /// The name of the variable /// The data /// The layout of the data + /// Any additional flags. Not all contexts may support all flags. + /// An instance of Value that contains a reference to the allocated memory template , void*> = nullptr> - Value StaticAllocate(std::string name, const std::vector& data, std::optional layout = {}) + Value StaticAllocate(std::string name, const std::vector& data, std::optional layout = {}, AllocateFlags flags = AllocateFlags::None) { if (auto globalValue = GetGlobalValue(GlobalAllocationScope::Function, name)) { @@ -118,33 +144,40 @@ namespace value } auto optionalLayout = utilities::MemoryLayout({ static_cast(data.size()) }); - return GlobalAllocateImpl(GlobalAllocationScope::Function, name, data, layout.value_or(optionalLayout)); + return GlobalAllocateImpl(GlobalAllocationScope::Function, name, data, layout.value_or(optionalLayout), flags); } /// Allocates scalar function static data /// The name of the variable /// The data + /// Any additional flags. Not all contexts may support all flags. 
+ /// An instance of Value that contains a reference to the allocated memory template , void*> = nullptr> - Value StaticAllocate(std::string name, T t) + Value StaticAllocate(std::string name, T t, AllocateFlags flags = AllocateFlags::None) { return this ->template StaticAllocate(name, std::vector< - std::conditional_t, utilities::Boolean, T>>{ t }); + std::conditional_t, utilities::Boolean, T>>{ t }, + flags); } /// Allocates global data /// The name of the variable /// The type of the data /// The layout of the data - Value GlobalAllocate(std::string name, ValueType type, utilities::MemoryLayout layout); + /// Any additional flags. Not all contexts may support all flags. + /// An instance of Value that contains a reference to the allocated memory + Value GlobalAllocate(std::string name, ValueType type, utilities::MemoryLayout layout, AllocateFlags flags = AllocateFlags::None); /// Allocates global data /// The name of the variable /// The data /// The layout of the data + /// Any additional flags. Not all contexts may support all flags. + /// An instance of Value that contains a reference to the allocated memory template , void*> = nullptr> - Value GlobalAllocate(std::string name, const std::vector& data, std::optional layout = {}) + Value GlobalAllocate(std::string name, const std::vector& data, std::optional layout = {}, AllocateFlags flags = AllocateFlags::None) { if (auto globalValue = GetGlobalValue(GlobalAllocationScope::Global, name)) { @@ -152,19 +185,22 @@ namespace value } auto optionalLayout = utilities::MemoryLayout({ static_cast(data.size()) }); - return GlobalAllocateImpl(GlobalAllocationScope::Global, name, data, layout.value_or(optionalLayout)); + return GlobalAllocateImpl(GlobalAllocationScope::Global, name, data, layout.value_or(optionalLayout), flags); } /// Allocates scalar global data /// The name of the variable /// The data + /// Any additional flags. Not all contexts may support all flags. + /// An instance of Value that contains a reference to the allocated memory template , void*> = nullptr> - Value GlobalAllocate(std::string name, T t) + Value GlobalAllocate(std::string name, T t, AllocateFlags flags = AllocateFlags::None) { return this ->template GlobalAllocate(name, std::vector< - std::conditional_t, utilities::Boolean, T>>{ t }); + std::conditional_t, utilities::Boolean, T>>{ t }, + flags); } /// Gets the type information contained in an instance of Emittable @@ -184,20 +220,22 @@ namespace value /// Stores data known ahead of time in the form of a std::vector of one of the fundamental types /// The data that is to be stored by the context instance - /// An instance of Value that contains a referece to the allocated memory + /// An instance of Value that contains a reference to the allocated memory Value StoreConstantData(ConstantData data); /// Creates a for loop over the memory pointed to with the given layout /// The layout used to describe the iteration characteristics. Only active elements are iterated over. 
/// The function to be called for each coordinate where there is an active element - void For(MemoryLayout layout, std::function)> fn); + /// Optional, a name that can be used by the emitter context to tag this loop in the emitted code + void For(MemoryLayout layout, std::function)> fn, const std::string& name = ""); /// Creates a for loop beggining at `start`, ending at `stop`, and incrementing by `step` /// The value used to initialize the loop counter /// The terminal value of the loop /// The value by which the loop counter is incremented /// The function to be called for each coordinate where there is an active element - void For(Scalar start, Scalar stop, Scalar step, std::function fn); + /// Optional, a name that can be used by the emitter context to tag this loop in the emitted code + void For(Scalar start, Scalar stop, Scalar step, std::function fn, const std::string& name = ""); /// Moves the data from one location to another /// The source of the memory to be moved @@ -247,7 +285,9 @@ namespace value IfContext If(Scalar test, std::function fn); - std::optional Call(FunctionDeclaration func, std::vector args); + void While(Scalar test, std::function fn); + + std::optional Call(FunctionDeclaration func, std::vector args); void Prefetch(Value data, PrefetchType type, PrefetchLocality locality); @@ -258,9 +298,15 @@ namespace value /// will be filled in with the values provided within the `captured` parameter. void Parallelize(int numTasks, std::vector captured, std::function)> fn); + void DebugBreak(); void DebugDump(Value value, std::string tag, std::ostream* stream) const; void DebugDump(FunctionDeclaration fn, std::string tag, std::ostream* stream) const; + /// Returns a unique name based on the prefix provided + /// The prefix for the unique name desired + /// A unique name for this instance + std::string UniqueName(const std::string& prefix); + /// Emit a debug print message. This assumes the application /// on the target platform implements a "void DebugPrint(char* message)" function. This function will be /// defined for you when running in JIT or Compute mode. 
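// ---------------------------------------------------------------------------
// A plausible implementation sketch for the UniqueName member declared above
// (the real body lives in a source file this patch does not show): keep a
// per-prefix counter and append it, so repeated requests for "loop" yield
// "loop_0", "loop_1", and so on.
// ---------------------------------------------------------------------------
#include <map>
#include <string>

class NameScope
{
public:
    std::string UniqueName(const std::string& prefix)
    {
        // operator[] value-initializes the counter to 0 on first use
        int count = _uniqueNames[prefix]++;
        return prefix + "_" + std::to_string(count);
    }

private:
    std::map<std::string, int> _uniqueNames;
};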
@@ -276,15 +322,27 @@ namespace value /// The Value instance std::string GetName(const Value& value) const; + void ImportCodeFile(std::string filename); + + Scalar GetFunctionAddress(const FunctionDeclaration& fn); + + const TargetDevice& GetTargetDevice() const { return _targetDevice; } + protected: const std::vector>& GetIntrinsics() const; + std::vector NormalizeReferenceLevels(const std::vector& args, const std::vector& expected) const; + + std::optional GetGlobalValue(GlobalAllocationScope, std::string, MemoryLayout); + + std::map _uniqueNames; + private: - virtual Value AllocateImpl(ValueType, MemoryLayout) = 0; + virtual Value AllocateImpl(ValueType, MemoryLayout, size_t alignment, AllocateFlags flags) = 0; virtual std::optional GetGlobalValue(GlobalAllocationScope scope, std::string name) = 0; - virtual Value GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ConstantData data, MemoryLayout layout) = 0; - virtual Value GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ValueType type, MemoryLayout layout) = 0; + virtual Value GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ConstantData data, MemoryLayout layout, AllocateFlags flags) = 0; + virtual Value GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ValueType type, MemoryLayout layout, AllocateFlags flags) = 0; virtual detail::ValueTypeDescription GetTypeImpl(Emittable) = 0; @@ -293,8 +351,8 @@ namespace value virtual Value StoreConstantDataImpl(ConstantData data) = 0; - virtual void ForImpl(MemoryLayout layout, std::function)> fn) = 0; - virtual void ForImpl(Scalar start, Scalar stop, Scalar step, std::function fn) = 0; + virtual void ForImpl(MemoryLayout layout, std::function)> fn, const std::string& name) = 0; + virtual void ForImpl(Scalar start, Scalar stop, Scalar step, std::function fn, const std::string& name) = 0; virtual void MoveDataImpl(Value& source, Value& destination) = 0; @@ -314,12 +372,16 @@ namespace value virtual IfContext IfImpl(Scalar test, std::function fn) = 0; + virtual void WhileImpl(Scalar test, std::function fn) = 0; + virtual std::optional CallImpl(FunctionDeclaration func, std::vector args) = 0; virtual void PrefetchImpl(Value data, PrefetchType type, PrefetchLocality locality) = 0; virtual void ParallelizeImpl(int numTasks, std::vector captured, std::function)> fn) = 0; + virtual void DebugBreakImpl() = 0; + virtual void DebugDumpImpl(Value value, std::string tag, std::ostream& stream) const = 0; virtual void DebugDumpImpl(FunctionDeclaration fn, std::string tag, std::ostream& stream) const = 0; @@ -327,6 +389,14 @@ namespace value virtual void SetNameImpl(const Value& value, const std::string& name) = 0; virtual std::string GetNameImpl(const Value& value) const = 0; + + virtual void ImportCodeFileImpl(std::string filename) = 0; + + virtual Scalar GetFunctionAddressImpl(const FunctionDeclaration& fn) = 0; + + friend void swap(EmitterContext&, EmitterContext&) noexcept; + + TargetDevice _targetDevice; }; /// Returns the global instance of EmitterContext @@ -389,8 +459,8 @@ namespace value EmitterContext* _oldContext; }; - template - struct ContextGuard : private ContextGuard<> + template + struct ContextGuard : private ContextGuard<> { template ContextGuard(Args&&... 
args) : @@ -404,75 +474,81 @@ namespace value T _context; }; + inline const TargetDevice& GetContextTargetDevice() + { + return GetContext().GetTargetDevice(); + } + /// Allocates data with the specified type and size /// The type of the data to allocate /// The size of the allocation, in number of elements - /// An instance of Value that contains a referece to the allocated memory - Value Allocate(ValueType type, size_t size); + /// An instance of Value that contains a reference to the allocated memory + Value Allocate(ValueType type, size_t size, size_t alignment = 0, AllocateFlags flags = AllocateFlags::None); /// Allocates data with the specified type and size /// The type of the data to allocate /// The memory layout of the allocation, in number of elements - /// An instance of Value that contains a referece to the allocated memory - Value Allocate(ValueType type, utilities::MemoryLayout layout); + /// An instance of Value that contains a reference to the allocated memory + Value Allocate(ValueType type, utilities::MemoryLayout layout, size_t alignment = 0, AllocateFlags flags = AllocateFlags::None); /// Allocates data with the specified type and size /// The type of the data to allocate /// The size of the allocation, in number of elements - /// An instance of Value that contains a referece to the allocated memory + /// An instance of Value that contains a reference to the allocated memory template - Value Allocate(size_t size) + Value Allocate(size_t size, size_t alignment = 0, AllocateFlags flags = AllocateFlags::None) { - return Allocate(GetValueType(), size); + return Allocate(GetValueType(), size, alignment, flags); } /// Allocates data with the specified type and size /// The type of the data to allocate /// The memory layout of the allocation, in number of elements - /// An instance of Value that contains a referece to the allocated memory + /// An instance of Value that contains a reference to the allocated memory template - Value Allocate(utilities::MemoryLayout layout) + Value Allocate(utilities::MemoryLayout layout, size_t alignment = 0, AllocateFlags flags = AllocateFlags::None) { - return Allocate(GetValueType(), layout); + return Allocate(GetValueType(), layout, alignment, flags); } /// Allocates function static data /// The name of the variable /// The type of the data /// The layout of the data - Value StaticAllocate(std::string name, ValueType type, utilities::MemoryLayout layout); + Value StaticAllocate(std::string name, ValueType type, utilities::MemoryLayout layout, AllocateFlags flags = AllocateFlags::None); /// Allocates function static data /// The name of the variable /// The data /// The layout of the data template , void*> = nullptr> - Value StaticAllocate(std::string name, const std::vector& data, std::optional layout = {}) + Value StaticAllocate(std::string name, const std::vector& data, std::optional layout = {}, AllocateFlags flags = AllocateFlags::None) { - return GetContext().StaticAllocate(name, data, layout); + return GetContext().StaticAllocate(name, data, layout, flags); } /// Allocates scalar function static data /// The name of the variable /// The data template , void*> = nullptr> - Scalar StaticAllocate(std::string name, T t) + Scalar StaticAllocate(std::string name, T t, AllocateFlags flags = AllocateFlags::None) { return StaticAllocate(name, std::vector, utilities::Boolean, T>>{ t }, - utilities::ScalarLayout); + utilities::ScalarLayout, + flags); } /// Allocates global data /// The name of the variable /// The type of the data /// The layout 
of the data - Value GlobalAllocate(std::string name, ValueType type, utilities::MemoryLayout layout); + Value GlobalAllocate(std::string name, ValueType type, utilities::MemoryLayout layout, AllocateFlags flags = AllocateFlags::None); template , void*> = nullptr> - Value GlobalAllocate(std::string name, utilities::MemoryLayout layout) + Value GlobalAllocate(std::string name, utilities::MemoryLayout layout, AllocateFlags flags = AllocateFlags::None) { - return GlobalAllocate(name, GetValueType(), layout); + return GlobalAllocate(name, GetValueType(), layout, flags); } /// Allocates global data @@ -480,22 +556,24 @@ namespace value /// The data /// The layout of the data template , void*> = nullptr> - Value GlobalAllocate(std::string name, const std::vector& data, std::optional layout = {}) + Value GlobalAllocate(std::string name, const std::vector& data, std::optional layout = {}, AllocateFlags flags = AllocateFlags::None) { - return GetContext().GlobalAllocate(name, data, layout); + return GetContext().GlobalAllocate(name, data, layout, flags); } /// Allocates scalar global data /// The name of the variable /// The data template , void*> = nullptr> - Scalar GlobalAllocate(std::string name, T t) + Scalar GlobalAllocate(std::string name, T t, AllocateFlags flags = AllocateFlags::None) { return GlobalAllocate(name, std::vector, utilities::Boolean, T>>{ t }, - utilities::ScalarLayout); + utilities::ScalarLayout, + flags); } + void DebugBreak(); void DebugDump(FunctionDeclaration fn, std::string tag = "", std::ostream* stream = nullptr); void DebugDump(Value value, std::string tag = "", std::ostream* stream = nullptr); @@ -505,8 +583,18 @@ namespace value return DebugDump(value.GetValue(), tag, stream); } + /// Emit a debug print message. This assumes the application + /// on the target platform implements a "void DebugPrint(char* message)" function. This function will be + /// defined for you when running in JIT or Compute mode. + void DebugPrint(std::string message); + EmitterContext::IfContext If(Scalar test, std::function fn); + void While(Scalar test, std::function fn); + + void ForRange(const std::string& name, Scalar end, std::function fn); + void ForRange(const std::string& name, Scalar start, Scalar end, std::function fn); + void ForRange(const std::string& name, Scalar start, Scalar end, Scalar step, std::function fn); void ForRange(Scalar end, std::function fn); void ForRange(Scalar start, Scalar end, std::function fn); void ForRange(Scalar start, Scalar end, Scalar step, std::function fn); @@ -516,10 +604,11 @@ namespace value /// function called `GetValue()` which returns a `Value` instance. /// The number of tasks that should be created /// A list of values to be used inside the function - /// The function that gets run for each task. The first parameter is the task number for that particular call. Subsequent parameters must match the typelist - /// `Tys...` and will be filled in with the values provided within the `captured` parameter. - template - void Parallelize(int numTasks, std::tuple captured, std::function fn); + /// The function that gets run for each task. The first parameter is the task number for that particular call. + /// Subsequent parameters must match the typelist `Tys...` and will be filled in with the values provided within the `captured` + /// parameter. In other words, the signature for the function should be `void fn(Scalar, Tys...)`. 
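// ---------------------------------------------------------------------------
// Hypothetical usage of the named ForRange overloads added above (sketch only:
// assumes an active EmitterContext, e.g. a ComputeContext, is installed, and
// that Scalar converts from integer literals as it does elsewhere in this
// library).
// ---------------------------------------------------------------------------
using namespace ell::value;

void LoopExample()
{
    // The "i" tag lets the emitter context name this loop in the emitted code.
    ForRange("i", 0, 10, [](Scalar i) {
        // loop body: emitted (or executed, in Compute mode) once per i in [0, 10)
    });
}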
+ template + void Parallelize(int numTasks, std::tuple captured, Fn&& fn); /// Runs the provided function, in parallel if possible /// The number of tasks that should be created @@ -544,6 +633,10 @@ namespace value extern FunctionDeclaration RoundFunctionDeclaration; extern FunctionDeclaration FloorFunctionDeclaration; extern FunctionDeclaration CeilFunctionDeclaration; + extern FunctionDeclaration FmaFunctionDeclaration; + extern FunctionDeclaration MemCopyFunctionDeclaration; + extern FunctionDeclaration MemMoveFunctionDeclaration; + extern FunctionDeclaration MemSetFunctionDeclaration; Scalar Abs(Scalar s); Scalar Cos(Scalar s); @@ -564,6 +657,7 @@ namespace value Scalar Sign(Scalar s); Scalar Square(Scalar s); Scalar LogicalNot(Scalar v); + Scalar Fma(Scalar a, Scalar b, Scalar c); Vector Abs(Vector v); Vector Cos(Vector v); @@ -581,6 +675,11 @@ namespace value Vector Floor(Vector v); Vector Ceil(Vector v); + void MemCopy(ViewAdapter dest, ViewAdapter source, std::optional length = std::nullopt); + void MemMove(ViewAdapter dest, ViewAdapter source, std::optional length = std::nullopt); + void MemSet(ViewAdapter dest, Scalar data, std::optional length = std::nullopt); + void ZeroMemory(ViewAdapter dest, std::optional length = std::nullopt); + /// Specifier determining if the fetch should be for a read or a write enum class PrefetchType { @@ -605,6 +704,11 @@ namespace value template void Prefetch(ViewType view, PrefetchType type = PrefetchType::Read, PrefetchLocality locality = PrefetchLocality::None); + /// Returns a unique name based on the prefix provided + /// The prefix for the unique name desired + /// A unique name for the current EmitterContext instance + std::string UniqueName(const std::string& prefix); + /// Returns the passed in View type with the memory layout representative of the full view of the memory, i.e., no padding. 
template ViewType AsFullView(ViewType view); @@ -643,8 +747,8 @@ namespace value } } - template - void Parallelize(int numTasks, std::tuple captured, std::function fn) + template + void Parallelize(int numTasks, std::tuple captured, Fn&& fn) { auto capturedValues = utilities::TupleToVector([](auto view) { return detail::GetValue(view); }, captured); @@ -677,7 +781,7 @@ namespace value template ViewType AsFullView(ViewType view) { - auto value = GetValue(view); + auto value = detail::GetValue(view); value.SetLayout(utilities::MemoryLayout{ value.GetLayout().GetExtent() }); return value; } diff --git a/libraries/value/include/FunctionDeclaration.h b/libraries/value/include/FunctionDeclaration.h index f1a29e7f4..d53a86fed 100644 --- a/libraries/value/include/FunctionDeclaration.h +++ b/libraries/value/include/FunctionDeclaration.h @@ -9,6 +9,7 @@ #pragma once #include "EmitterContext.h" +#include "Scalar.h" #include "Value.h" #include @@ -23,11 +24,12 @@ namespace ell { namespace value { - /// Helper enum used to specify whether a FunctionDeclaration object should decorate its function name - enum class FunctionDecorated + /// Helper enum used to specify whether a FunctionDeclaration should be inlined + enum class FunctionInlining { - Yes, - No + defaultInline, + always, + never }; /// Describes a function that can be acted upon by an EmitterContext instance @@ -39,49 +41,60 @@ namespace value /// Constructor /// The name of the function - FunctionDeclaration(std::string name); + explicit FunctionDeclaration(std::string name); /// Sets the return type for this function declaration /// A Value instance describing type of the value that is expected and its memory layout to be returned by the function /// A reference to this instance /// If this function is not called, the instance defaults to a void return type - FunctionDeclaration& Returns(Value returnType); + FunctionDeclaration& Returns(ViewAdapter returnType); /// Sets whether this function should be decorated (mangled) /// An enum value specifying whether this function should be decorated /// A reference to this instance /// By default, a function is decorated, which means the name gets suffixed by an encoding of the function's parameter and return types. /// Functions that are declared externally should probably not be decorated - FunctionDeclaration& Decorated(FunctionDecorated shouldDecorate); + FunctionDeclaration& Decorated(bool shouldDecorate); + + /// If `public` is true, set the function to appear in the public header, otherwise the function is internal + FunctionDeclaration& Public(bool isPublic); + + /// Sets whether this function should be inlined + /// A FunctionInlining value specifying whether this function should be inlined or not + FunctionDeclaration& Inlined(FunctionInlining shouldInline = FunctionInlining::always); /// Specifies a function definition for this declaration /// A function object that takes zero or more Value library observer types and returns void or a Value library observer type. /// This function object defines this function. /// A std::function function object that matches the signature of the function passed in - /// If this function is not called, this function declaration is treated as an external function. Not all contexts may support an external + /// If this function or `Imported` is not called, this function declaration is treated as an external function. 
Not all contexts may support an external /// function template [[maybe_unused]] auto Define(Fn&& fn); + /// Specifies the code file that is to be imported to define this function + /// The specified file is imported when this function declaration is used to emit a call. + FunctionDeclaration& DefineFromFile(std::string file); + /// Sets the parameters this function requires /// Zero or more Value instances or view types with a GetValue() member function describing /// the types of the arguments and their memory layout expected by the function /// A reference to this instance /// If this function is not called, the instance defaults to taking no arguments template - FunctionDeclaration& Parameters(Types&& ... paramTypes); + FunctionDeclaration& Parameters(Types&&... paramTypes); /// Sets the parameters this function requires /// Zero or more Value instances describing the types of the arguments and their memory layout expected by the function /// A reference to this instance /// If this function is not called, the instance defaults to taking no arguments - [[nodiscard]] FunctionDeclaration& Parameters(std::vector parameters); + [[nodiscard]] FunctionDeclaration& Parameters(std::vector parameters); /// Emits a call to the function declaration /// A vector of Value instances that hold the arguments for the function call /// A std::optional instance that holds a Value instance with the return value of the call, if it is expected, otherwise empty /// If the function is not defined and the context is capable of it, this will emit a call to an external function - [[maybe_unused]] std::optional Call(std::vector arguments) const; + [[maybe_unused]] std::optional Call(std::vector arguments) const; /// Emits a call to the function declaration /// Zero or more Value instances or view types with a GetValue() member function describing @@ -101,23 +114,49 @@ namespace value /// Otherwise, the std::optional instance is empty const std::optional& GetReturnType() const; + /// Returns true if function is to appear in the public header, false otherwise + [[nodiscard]] bool IsPublic() const; + /// Returns true if function is defined for current context, false otherwise [[nodiscard]] bool IsDefined() const; /// Returns true if the instance is an empty function declaration [[nodiscard]] bool IsEmpty() const; + /// Returns true if the instance represents an imported function + [[nodiscard]] bool IsImported() const; + + /// Returns true if the instance is inlined + [[nodiscard]] FunctionInlining InlineState() const; + + Scalar GetPointer() const; + + void SetPointer(Scalar pointer) { _pointer = pointer; } + + bool IsPointerSet() const { return _pointer.has_value(); } + private: template [[maybe_unused]] std::function DefineImpl(std::function fn); + template + [[maybe_unused]] std::function DefineImpl(std::false_type, std::function fn); + + template + [[maybe_unused]] inline std::function DefineImpl(std::true_type, std::function); + void CheckNonEmpty() const; + std::string _importedSource; std::string _originalFunctionName; mutable std::optional _decoratedFunctionName; std::optional _returnType; std::vector _paramTypes; + std::optional _pointer; + + FunctionInlining _inlineState = FunctionInlining::defaultInline; bool _isDecorated = true; + bool _isPublic = false; bool _isEmpty = true; }; @@ -153,16 +192,6 @@ namespace value { // Until MacOS's compiler has proper std::function deduction guides #if defined(__APPLE__) - template - struct Function : public std::function - { - Function(const std::function& fn) : - 
std::function(fn) {} - Function(std::function&& fn) : - std::function(std::move(fn)) {} - using std::function::function; - }; - template struct StdFunctionDeductionGuideHelper {}; @@ -191,6 +220,16 @@ namespace value using Type = ReturnT(Args...); }; + template + struct Function : public std::function + { + Function(const std::function& fn) : + std::function(fn) {} + Function(std::function&& fn) : + std::function(std::move(fn)) {} + using std::function::function; + }; + // Function pointer template Function(ReturnT (*)(Args...))->Function; @@ -212,48 +251,75 @@ namespace value template [[maybe_unused]] auto FunctionDeclaration::Define(Fn&& fn) { - return DefineImpl(FUNCTION_TYPE{ std::forward(fn) }); + return DefineImpl(FUNCTION_TYPE(std::forward(fn))); } - #undef FUNCTION_TYPE - template - FunctionDeclaration& FunctionDeclaration::Parameters(Types&&... paramTypes) + template + inline std::function FunctionDeclaration::DefineImpl(std::function fn) { - return Parameters(std::vector{ detail::GetValue(std::forward(paramTypes))... }); + if constexpr (sizeof...(Args) == 1 && utilities::AllSame..., std::vector>) + { + return DefineImpl(std::true_type{}, fn); + } + else + { + return DefineImpl(std::false_type{}, fn); + } } - template - std::optional FunctionDeclaration::Call(Types&&... arguments) const + template + inline std::function FunctionDeclaration::DefineImpl(std::true_type, std::function fn) { - return Call(std::vector{ detail::GetValue(std::forward(arguments))... }); + auto createdFn = GetContext().CreateFunction(*this, [fn = std::move(fn)](std::vector args) -> std::optional { + if constexpr (std::is_same_v) + { + fn(args); + return std::nullopt; + } + else + { + return fn(args); + } + }); + + return [createdFn = std::move(createdFn)](Args... args) -> ReturnT { + if constexpr (std::is_same_v) + { + createdFn(args...); + } + else + { + return *createdFn(args...); + } + }; } template - [[maybe_unused]] std::function FunctionDeclaration::DefineImpl(std::function fn) + [[maybe_unused]] std::function FunctionDeclaration::DefineImpl(std::false_type, std::function fn) { if constexpr (std::is_same_v) { if (_returnType.has_value()) { - throw utilities::InputException(utilities::InputExceptionErrors::typeMismatch, "Defining function has a return value, but declaration does not"); + throw utilities::InputException(utilities::InputExceptionErrors::typeMismatch, utilities::FormatString("[%s] Defining function has a return value, but declaration does not", GetFunctionName().c_str())); } } else { if (!_returnType.has_value()) { - throw utilities::InputException(utilities::InputExceptionErrors::typeMismatch, "Defining function returns void, but declaration does not"); + throw utilities::InputException(utilities::InputExceptionErrors::typeMismatch, utilities::FormatString("[%s] Defining function returns void, but declaration does not", GetFunctionName().c_str())); } - // Try to instantiate an instance of the return type (R) with the Value instance that reprepsents the return type (_returnType) + // Try to instantiate an instance of the return type (R) with the Value instance that represents the return type (_returnType) // If this throws, the return value of the defining function is not compatible with the Value instance specified in the declaration ReturnT returnType = *_returnType; } if (sizeof...(Args) != _paramTypes.size()) { - throw utilities::InputException(utilities::InputExceptionErrors::typeMismatch, utilities::FormatString("Defining function takes %zu parameters, but declaration was specific 
to have %zu.", sizeof...(Args), _paramTypes.size())); + throw utilities::InputException(utilities::InputExceptionErrors::typeMismatch, utilities::FormatString("[%s] Defining function takes %zu parameters, but declaration was specified to have %zu.", GetFunctionName().c_str(), sizeof...(Args), _paramTypes.size())); } if constexpr (sizeof...(Args) > 0) @@ -277,7 +343,7 @@ namespace value } }); - return [createdFn = std::move(createdFn)](Args&&... args) -> ReturnT { + return [createdFn = std::move(createdFn), name = GetFunctionName()](Args&&... args) -> ReturnT { constexpr auto argSize = sizeof...(Args); std::vector argValues; argValues.reserve(argSize); @@ -289,7 +355,7 @@ namespace value if (fnReturn) { throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState, - "Function is supposed to return void, but a value was returned from the defining function"); + utilities::FormatString("[%s] Function is supposed to return void, but a value was returned from the defining function", name.c_str())); } } else @@ -298,6 +364,19 @@ namespace value } }; } + + template + FunctionDeclaration& FunctionDeclaration::Parameters(Types&&... paramTypes) + { + return Parameters(std::vector{ std::forward(paramTypes)... }); + } + + template + std::optional FunctionDeclaration::Call(Types&&... arguments) const + { + return Call(std::vector{ std::forward(arguments)... }); + } + } // namespace value } // namespace ell diff --git a/libraries/value/include/LLVMContext.h b/libraries/value/include/LLVMContext.h index 2b90bf1d0..b3578abe1 100644 --- a/libraries/value/include/LLVMContext.h +++ b/libraries/value/include/LLVMContext.h @@ -13,18 +13,18 @@ #include "FunctionDeclaration.h" #include "Scalar.h" +#include +#include + #include -#include +#include #include namespace ell { namespace emitters { - class IRFunctionEmitter; - class IRModuleEmitter; - } // namespace emitters } // namespace ell @@ -38,19 +38,37 @@ namespace value public: /// Constructor /// A reference to an IRModuleEmitter that will be used to emit LLVM IR - LLVMContext(emitters::IRModuleEmitter& emitter); + explicit LLVMContext(emitters::IRModuleEmitter& emitter); + + /// Constructor + /// Takes ownership of the IRModuleEmitter that will be used to emit LLVM IR + explicit LLVMContext(std::unique_ptr&& emitter); + + /// Constructor + /// + /// Name of the module.
+ /// Options for the compiler + LLVMContext(const std::string& moduleName, const emitters::CompilerOptions& parameters); emitters::IRModuleEmitter& GetModuleEmitter() const; emitters::IRFunctionEmitter& GetFunctionEmitter() const; + emitters::LLVMFunction DeclareFunction(const FunctionDeclaration& func); + + std::optional ToLLVMValue(Value value) const; + std::vector> ToLLVMValue(std::vector values) const; + + emitters::LLVMValue ToLLVMValue(Value value); + std::vector ToLLVMValue(std::vector values); + private: - Value AllocateImpl(ValueType value, MemoryLayout layout) override; + Value AllocateImpl(ValueType value, MemoryLayout layout, size_t alignment, AllocateFlags flags = AllocateFlags::None) override; std::optional GetGlobalValue(GlobalAllocationScope scope, std::string name) override; - Value GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ConstantData data, MemoryLayout layout) override; - Value GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ValueType type, MemoryLayout layout) override; + Value GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ConstantData data, MemoryLayout layout, AllocateFlags flags = AllocateFlags::None) override; + Value GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ValueType type, MemoryLayout layout, AllocateFlags flags = AllocateFlags::None) override; detail::ValueTypeDescription GetTypeImpl(Emittable emittable) override; @@ -59,8 +77,8 @@ namespace value Value StoreConstantDataImpl(ConstantData data) override; - void ForImpl(MemoryLayout layout, std::function)> fn) override; - void ForImpl(Scalar start, Scalar stop, Scalar step, std::function fn) override; + void ForImpl(MemoryLayout layout, std::function)> fn, const std::string& name) override; + void ForImpl(Scalar start, Scalar stop, Scalar step, std::function fn, const std::string& name) override; void MoveDataImpl(Value& source, Value& destination) override; @@ -81,12 +99,15 @@ namespace value IfContext IfImpl(Scalar test, std::function fn) override; + void WhileImpl(Scalar test, std::function fn) override; + std::optional CallImpl(FunctionDeclaration func, std::vector args) override; void PrefetchImpl(Value data, PrefetchType type, PrefetchLocality locality) override; void ParallelizeImpl(int numTasks, std::vector captured, std::function)> fn) override; + void DebugBreakImpl() override; void DebugDumpImpl(Value value, std::string tag, std::ostream& stream) const override; void DebugDumpImpl(FunctionDeclaration fn, std::string tag, std::ostream& stream) const override; void DebugPrintImpl(std::string message) override; @@ -94,6 +115,10 @@ namespace value void SetNameImpl(const Value& value, const std::string& name) override; std::string GetNameImpl(const Value& value) const override; + void ImportCodeFileImpl(std::string) override; + + Scalar GetFunctionAddressImpl(const FunctionDeclaration& fn) override; + Value IntrinsicCall(FunctionDeclaration intrinsic, std::vector args); std::optional EmitExternalCall(FunctionDeclaration func, std::vector args); @@ -104,6 +129,8 @@ namespace value std::string GetGlobalScopedName(std::string name) const; std::string GetCurrentFunctionScopedName(std::string name) const; + emitters::LLVMFunctionType ToLLVMFunctionType(const FunctionDeclaration& func) const; + struct PromotedConstantDataDescription { const ConstantData* data; @@ -114,12 +141,14 @@ namespace value std::optional HasBeenPromoted(Value value) const; Value Realize(Value value) const; Value EnsureEmittable(Value value); + 
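// Editorial sketch (not part of the patch): the declare/define/call flow that
// FunctionDeclaration and LLVMContext support. DeclareFunction, Parameters, Define,
// and Call appear in this patch; Returns and MakeScalar are assumed from the wider
// value API and are hypothetical here.
//
//     LLVMContext context("ExampleModule", emitters::CompilerOptions{}); // owning constructor above
//     auto square = DeclareFunction("Square")
//                       .Returns(MakeScalar<int>())    // assumed: sets the declared return type
//                       .Parameters(MakeScalar<int>()) // variadic overload declared above
//                       .Define([](Scalar x) -> Scalar { return x * x; });
//     Scalar squared = square(input); // Define returns a callable std::function wrapper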
std::vector EnsureEmittable(std::vector values); class IfContextImpl; struct FunctionScope; std::stack> _promotedConstantStack; + std::unique_ptr _ownedEmitter; emitters::IRModuleEmitter& _emitter; // LLVMContext uses ComputeContext internally to handle cases where all relevant operands are constant @@ -130,5 +159,9 @@ namespace value std::unordered_map _definedFunctions; }; + emitters::LLVMValue ToLLVMValue(Value value); + emitters::LLVMValue ToLLVMValue(ViewAdapter value); + + std::vector ToLLVMValue(std::vector values); } // namespace value } // namespace ell diff --git a/libraries/value/include/LoopNests.h b/libraries/value/include/LoopNests.h new file mode 100644 index 000000000..1b9d94018 --- /dev/null +++ b/libraries/value/include/LoopNests.h @@ -0,0 +1,224 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: LoopNests.h (value) +// Authors: Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "CachingProvider.h" +#include "EmitterContext.h" +#include "FunctionDeclaration.h" +#include "Value.h" + +#include "loopnests/Index.h" +#include "loopnests/Kernel.h" +#include "loopnests/KernelPredicate.h" + +#include + +#include +#include +#include +#include + +namespace ell +{ +namespace value +{ + namespace loopnests + { + class LoopNest; + } + + using loopnests::Index; + using loopnests::Kernel; + + enum class ArgumentType + { + Input, + InputOutput, + Output, + Temporary + }; + + class LoopNestImpl; + + class Schedule + { + public: + Schedule(const Schedule&); + Schedule& operator=(const Schedule&); + + Schedule(Schedule&&) noexcept = delete; + Schedule& operator=(Schedule&&) noexcept = delete; + + Index Split(Index& index, int factor); + + /// Parallelizes the loop represented by the index, if parallelization is enabled. No effect if parallelization is disabled + /// Represents the loop to parallelize + void Parallelize(Index index); + + /// Parallelizes the loop represented by the index, if parallelization is enabled. No effect if parallelization is disabled + /// Represents the loop to parallelize. On return, this index points to the inner loop created by the split + /// The factor by which to parallelize. Ideally, represents the number of threads to use + /// The index which represents the outer loop, now parallelized + Index Parallelize(Index index, int factor); + + /// Unrolls the loop represented by the index + /// Represents the loop to unroll + void Unroll(Index index); + + /// Unrolls the loop represented by the index + /// Represents the loop to unroll. 
On return, this index points to the inner loop created by the split + /// The factor by which to unroll + /// The index which represents the outer loop, now unrolled + Index Unroll(Index index, int factor); + + void Cache(std::unique_ptr provider); + + template + void Cache( + ViewAdapter view, + std::vector kernelIndices, + utilities::MemoryShape size = {}, + std::vector atIndices = {}, + std::optional order = std::nullopt, + std::any extras = {}); + + void Cache( + CachingProvider& provider, + ViewAdapter view, + std::vector kernelIndices, + utilities::MemoryShape size, + std::vector atIndices = {}, + std::optional order = std::nullopt, + std::any extras = {}); + + void SetOrder(std::vector indices); + + private: + Schedule(LoopNest&); + friend class LoopNest; + utilities::MemoryShape GetShapeFromIndicesIncrement(std::vector& kernelIndices); + + std::reference_wrapper _nest; + std::reference_wrapper _impl; + }; + + class LoopNest + { + public: + LoopNest(); + LoopNest(const LoopNest&); + LoopNest(LoopNest&&) noexcept; + LoopNest& operator=(const LoopNest&); + LoopNest& operator=(LoopNest&&) noexcept; + ~LoopNest(); + + LoopNest& Using(std::initializer_list inputs, ArgumentType argType); + LoopNest& ForAll(Index index, int begin, int end); + + template + LoopNest& Do(Fn&& fn, std::vector kernelOuterIndices = {}, std::string kernelId = ""); + + template + LoopNest& Do(Fn&& fn, std::string kernelId); + + LoopNest& Do(std::function)> fn, std::vector kernelOuterIndices = {}, std::string kernelId = ""); + + LoopNest& Do(std::function)> fn, std::string kernelId); + + LoopNest& Do(Kernel kernel, std::vector kernelOuterIndex = {}); + + LoopNest& Do(Kernel kernel, const loopnests::KernelPredicate& predicate, const loopnests::KernelPredicate& placement = {}); + + Schedule& GetSchedule(); + + void Run() const; + + loopnests::LoopNest& GetUnderlyingLoopNest(); + const loopnests::LoopNest& GetUnderlyingLoopNest() const; + + private: + template + LoopNest& DoImpl(std::function fn, std::vector kernelOuterIndices, std::string kernelId); + + friend void swap(LoopNest& nest1, LoopNest& nest2) noexcept; + friend class Schedule; + + std::unique_ptr _impl; + Schedule _schedule; + }; + + LoopNest Using(std::initializer_list inputs, ArgumentType argType); +} // namespace value +} // namespace ell + +#pragma region implementation + +namespace ell +{ +namespace value +{ +#if defined(__APPLE__) +#define FUNCTION_TYPE detail::Function // defined in implementation region of FunctionDeclaration.h +#else +#define FUNCTION_TYPE std::function +#endif // defined(__APPLE__) + + template + LoopNest& LoopNest::Do(Fn&& fn, std::vector kernelOuterIndices, std::string kernelId) + { + return DoImpl(FUNCTION_TYPE(std::forward(fn)), kernelOuterIndices, kernelId); + } + + template + LoopNest& LoopNest::Do(Fn&& fn, std::string kernelId) + { + return Do(std::move(fn), {}, kernelId); + } + + template + LoopNest& LoopNest::DoImpl(std::function fn, std::vector kernelOuterIndices, std::string kernelId) + { + static_assert(std::conjunction_v...>); + return Do( + std::function)>{ + [fn = std::move(fn)](std::vector args) { + std::tuple tupleArgs = utilities::VectorToTuple(args); + std::apply(fn, tupleArgs); + } }, + kernelOuterIndices, + kernelId); + } + + template + void Schedule::Cache( + ViewAdapter view, + std::vector kernelIndices, + utilities::MemoryShape size, + std::vector atIndices, + std::optional order, + std::any extras) + { + static_assert(std::is_base_of_v, "CachingStrategyType must inherit from CachingProvider!"); + + 
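// Editorial sketch (not part of the patch): how LoopNest and Schedule compose. The
// indices i, j, bounds M, N, and input A are illustrative placeholders, not names
// from this patch.
//
//     LoopNest nest = Using({ A }, ArgumentType::Input)
//                         .ForAll(i, 0, M)
//                         .ForAll(j, 0, N)
//                         .Do([](Matrix A, Scalar i, Scalar j) { /* kernel body */ });
//     Schedule& schedule = nest.GetSchedule();
//     Index iInner = schedule.Split(i, 4); // iterate i in blocks of 4
//     schedule.Parallelize(i);             // no effect if parallelization is disabled
//     schedule.Unroll(iInner);
//     nest.Run();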
CachingStrategyType provider{}; + Cache( + provider, + view, + kernelIndices, + size, + atIndices, + order, + extras); + } + +#undef FUNCTION_TYPE +} // namespace value +} // namespace ell + +#pragma endregion implementation diff --git a/libraries/value/include/Matrix.h b/libraries/value/include/Matrix.h index b3396adc0..ef4e56f60 100644 --- a/libraries/value/include/Matrix.h +++ b/libraries/value/include/Matrix.h @@ -25,6 +25,12 @@ namespace value class Matrix { public: + enum class MatrixLayout + { + rowMajor, + columnMajor + }; + Matrix(); /// Constructor that wraps the provided instance of Value @@ -97,6 +103,8 @@ namespace value /// Gets the number of columns within the active area size_t Columns() const; + MatrixLayout GetMatrixLayout() const; + /// Retrieves the type of data stored in the wrapped Value instance /// The type ValueType Type() const; @@ -128,6 +136,16 @@ namespace value Value _value; }; + /// Constructs an allocated instance with the specified dimensions + /// The number of rows of the allocated matrix + /// The number of columns of the allocated matrix + /// The type of the elements + /// The optional name + inline Matrix MakeMatrix(int rows, int columns, ValueType type, const std::string& name = "") + { + return Matrix(Allocate(type, utilities::MemoryLayout({ rows, columns })), name); + } + /// Constructs an allocated instance with the specified dimensions /// Any fundamental type accepted by Value /// The number of rows of the allocated matrix @@ -139,6 +157,28 @@ namespace value return Matrix(Allocate(utilities::MemoryLayout({ rows, columns })), name); } + + /// Constructs a statically-allocated instance with the specified dimensions + /// The number of rows of the allocated matrix + /// The number of columns of the allocated matrix + /// The type of the elements + /// The optional name + inline Matrix MakeStaticMatrix(int rows, int columns, ValueType type, const std::string& name = "") + { + return Matrix(StaticAllocate(name, type, utilities::MemoryLayout({ rows, columns }))); + } + + /// Constructs a statically-allocated instance with the specified dimensions + /// Any fundamental type accepted by Value + /// The number of rows of the allocated matrix + /// The number of columns of the allocated matrix + /// The optional name + template + Matrix MakeStaticMatrix(int rows, int columns, const std::string& name = "") + { + return Matrix(StaticAllocate(name, utilities::MemoryLayout({ rows, columns }))); + } + } // namespace value } // namespace ell diff --git a/libraries/value/include/MatrixOperations.h b/libraries/value/include/MatrixOperations.h index 477dbb096..4f0bffc4e 100644 --- a/libraries/value/include/MatrixOperations.h +++ b/libraries/value/include/MatrixOperations.h @@ -23,7 +23,7 @@ namespace value class Vector; /// Reinterprets the given data value as a matrix of the given size - Matrix ToMatrix(Value data, int numRows, int numCols); + Matrix ToMatrix(Value data, int numRows, int numCols); Scalar Sum(Matrix matrix, Scalar initialValue); @@ -32,6 +32,12 @@ namespace value /// The function to be called for each coordinate where there is an active element void For(Matrix matrix, std::function fn); + /// Creates a for loop over the matrix + /// A name that can be used by the emitter context to tag this loop in the emitted code + /// The instance of Matrix that references the data over which to iterate + /// The function to be called for each coordinate where there is an active element + void For(const std::string& name, Matrix matrix, std::function fn); + 
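// Editorial sketch (not part of the patch): the new MakeMatrix helper and the named
// For overload together. Matrix element access via operator() and Scalar arithmetic
// operators are assumed from the rest of the value library.
//
//     Matrix m = MakeMatrix<int>(4, 8, "m");
//     For("init_m", m, [&](Scalar row, Scalar column) {
//         m(row, column) = row + column; // "init_m" tags this loop in the emitted code
//     });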
Matrix GEMM(Matrix m1, Matrix m2); Vector GEMV(Matrix m, Vector v); diff --git a/libraries/value/include/Print.h b/libraries/value/include/Print.h new file mode 100644 index 000000000..e5315a7ce --- /dev/null +++ b/libraries/value/include/Print.h @@ -0,0 +1,36 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: Print.h (value) +// Authors: Chuck Jacobs +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "Value.h" + +#include +#include + +namespace ell +{ +namespace value +{ + /// Emits a print call. + /// + /// The text to print. + void Print(const std::string& text); + + /// Emits a printf call. + /// + /// Arguments to the printf call. + void Printf(const std::vector& arguments); + + /// Emits a printf call. + /// + /// Describes the printf format to use. + /// Arguments to the printf call. + void Printf(const std::string& format, const std::vector& arguments); +} // namespace value +} // namespace ell diff --git a/libraries/value/include/Scalar.h b/libraries/value/include/Scalar.h index e6ca718c4..3ff84a23e 100644 --- a/libraries/value/include/Scalar.h +++ b/libraries/value/include/Scalar.h @@ -8,6 +8,7 @@ #pragma once +#include "ScalarOperations.h" #include "Value.h" namespace ell diff --git a/libraries/value/include/ScalarOperations.h b/libraries/value/include/ScalarOperations.h new file mode 100644 index 000000000..f63123189 --- /dev/null +++ b/libraries/value/include/ScalarOperations.h @@ -0,0 +1,29 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: ScalarOperations.h (value) +// Authors: Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include + +namespace ell +{ +namespace value +{ + + class Scalar; + + /// Arithmetic operators + Scalar Add(Scalar, Scalar); + Scalar Subtract(Scalar, Scalar); + Scalar Multiply(Scalar, Scalar); + Scalar Divide(Scalar, Scalar); + Scalar Modulo(Scalar, Scalar); + Scalar FusedMultiplyAdd(Scalar a, Scalar b, Scalar c); // returns (a*b)+c + +} // namespace value +} // namespace ell diff --git a/libraries/value/include/TensorOperations.h b/libraries/value/include/TensorOperations.h index 8cce73089..2c3926b7f 100644 --- a/libraries/value/include/TensorOperations.h +++ b/libraries/value/include/TensorOperations.h @@ -33,5 +33,11 @@ namespace value /// The function to be called for each coordinate where there is an active element void For(Tensor tensor, std::function fn); + /// Creates a for loop over the tensor + /// A name that can be used by the emitter context to tag this loop in the emitted code + /// The instance of Tensor that references the data over which to iterate + /// The function to be called for each coordinate where there is an active element + void For(const std::string& name, Tensor tensor, std::function fn); + } // namespace value } // namespace ell diff --git a/libraries/value/include/Value.h b/libraries/value/include/Value.h index 8975ba082..55a07fd9a 100644 --- a/libraries/value/include/Value.h +++ b/libraries/value/include/Value.h @@ -404,6 +404,9 @@ namespace value /// The underlying data storage const UnderlyingDataType& GetUnderlyingData() const; + /// Gets a reference to the complete type description being held + const 
detail::ValueTypeDescription& GetType() const { return _type; } + /// Set the name for this instance with the current emitter context /// The name void SetName(const std::string& name); @@ -427,12 +430,16 @@ namespace value /// `Value GetValue()` struct ViewAdapter { - template - ViewAdapter(ViewType view) : + template + ViewAdapter(View view) : _value(detail::GetValue(view)) {} - inline operator Value() const { return _value; } + inline operator const Value&() const { return _value; } + inline operator Value&() { return _value; } + + inline Value& GetValue() { return _value; } + inline const Value& GetValue() const { return _value; } private: Value _value; diff --git a/libraries/value/include/ValueOperations.h b/libraries/value/include/ValueOperations.h index 7f939ab1d..1ae3be1de 100644 --- a/libraries/value/include/ValueOperations.h +++ b/libraries/value/include/ValueOperations.h @@ -27,6 +27,13 @@ namespace value /// The function to be called for each coordinate where there is an active element void For(utilities::MemoryLayout layout, std::function fn); + /// Creates a for loop beginning at `start`, ending at `stop`, and incrementing by `step` + /// The value used to initialize the loop counter + /// The terminal value of the loop + /// The value by which the loop counter is incremented + /// The function to be called for each value of the loop counter + void For(Scalar start, Scalar stop, Scalar step, std::function fn); + /// Cast a value to another type, returning a new value /// The data to convert /// The type to which the data should be cast diff --git a/libraries/value/include/ValueType.h b/libraries/value/include/ValueType.h index 3e426a0a0..67094bbe0 100644 --- a/libraries/value/include/ValueType.h +++ b/libraries/value/include/ValueType.h @@ -9,6 +9,7 @@ #pragma once #include +#include #include #include @@ -118,6 +119,10 @@ namespace value { return ValueType::Double; } + else + { + static_assert(utilities::FalseType::value, "Unknown value type"); + } } /// Get a string representation of the enum value diff --git a/libraries/value/include/VectorOperations.h b/libraries/value/include/VectorOperations.h index 269ac7aed..65e4d9e9f 100644 --- a/libraries/value/include/VectorOperations.h +++ b/libraries/value/include/VectorOperations.h @@ -49,5 +49,11 @@ namespace value /// The function to be called for each coordinate where there is an active element void For(Vector vector, std::function fn); + /// Creates a for loop over the vector + /// A name that can be used by the emitter context to tag this loop in the emitted code + /// The instance of Vector that references the data over which to iterate + /// The function to be called for each coordinate where there is an active element + void For(const std::string& name, Vector vector, std::function fn); + } // namespace value } // namespace ell diff --git a/libraries/value/include/loopnests/CodeGenerator.h b/libraries/value/include/loopnests/CodeGenerator.h new file mode 100644 index 000000000..e2cbfc860 --- /dev/null +++ b/libraries/value/include/loopnests/CodeGenerator.h @@ -0,0 +1,43 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: CodeGenerator.h (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "LoopNest.h" +#include "LoopNestVisitor.h" + +#include + +namespace ell +{
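// Editorial sketch (not part of the patch): the new ranged For from ValueOperations.h
// paired with Printf from Print.h. A Scalar constructor taking a constant and an
// implicit Scalar-to-Value conversion are assumed here.
//
//     For(Scalar{ 0 }, Scalar{ 10 }, Scalar{ 2 }, [](Scalar i) {
//         Printf("i = %d\n", { i }); // prints 0, 2, 4, 6, 8; the counter stops before `stop`
//     });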
+namespace value +{ + namespace loopnests + { + /// + /// Takes a loop nest and generates code for it + /// + class CodeGenerator : public LoopNestVisitor + { + public: + void Run(const LoopNest& loopNest) const; + + private: + void GenerateLoopRangeOld(const LoopRange& range, const RecursionState& state, const LoopVisitSchedule& schedule, std::function codegenFn) const override; + void GenerateLoopRangeNew(const LoopRange& range, const RecursionStateNew& state, const LoopVisitSchedule& schedule, std::function codegenFn) const override; + Scalar EmitIndexExpression(const Index& index, const IndexExpression& expr, const LoopIndexSymbolTable& indexVariables) const override; + void InvokeKernel(const Kernel& kernel, const KernelPredicate& predicate, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const override; + bool InvokeKernelGroup(const ScheduledKernelGroup& kernelGroup, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const override; + + void InvokeKernel(const Kernel& kernel, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const; + Scalar EmitKernelPredicate(const KernelPredicate& predicate, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const; + }; + + } // namespace loopnests +} // namespace value +} // namespace ell diff --git a/libraries/value/include/loopnests/CodePositionConstraints.h b/libraries/value/include/loopnests/CodePositionConstraints.h new file mode 100644 index 000000000..6fad5147d --- /dev/null +++ b/libraries/value/include/loopnests/CodePositionConstraints.h @@ -0,0 +1,166 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: CodePositionConstraints.h (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "Index.h" + +#include +#include +#include + +namespace ell +{ +namespace value +{ + namespace loopnests + { + enum class LoopFragmentType : int + { + prologue, // Occurs as part of a loop prologue, before the main loop body + body, // Occurs as part of the main loop body + boundary, // Occurs as part of a loop boundary condition + epilogue, // Occurs as part of a loop epilogue, after the main loop body + LAST + }; + + bool IsBoundaryFragment(LoopFragmentType f); + + std::ostream& operator<<(std::ostream& os, LoopFragmentType t); + + // A set of LoopFragmentTypes + class LoopFragmentFlags + { + public: + LoopFragmentFlags() = default; + LoopFragmentFlags(LoopFragmentType type) + { + _flags[static_cast<int>(type)] = 1; + }; + explicit LoopFragmentFlags(int flags); + + bool GetFlag(LoopFragmentType type) const + { + return _flags[static_cast<int>(type)]; + } + + void SetFlag(LoopFragmentType type, bool value) + { + _flags[static_cast<int>(type)] = value; + } + + static LoopFragmentFlags All() + { + LoopFragmentFlags result; + result.SetFlag(LoopFragmentType::prologue, true); + result.SetFlag(LoopFragmentType::body, true); + result.SetFlag(LoopFragmentType::boundary, false); + result.SetFlag(LoopFragmentType::epilogue, true); + return result; + } + + LoopFragmentFlags& operator&=(const LoopFragmentFlags& other) + { + _flags &= other._flags; + return *this; + } + + LoopFragmentFlags& operator|=(const LoopFragmentFlags& other) + { + _flags |= other._flags; + return *this; + } + + private: + std::bitset<static_cast<int>(LoopFragmentType::LAST)> _flags; + }; + + std::ostream& operator<<(std::ostream& os, LoopFragmentFlags f); + + inline LoopFragmentFlags operator&(LoopFragmentType a, LoopFragmentType b) + { + LoopFragmentFlags result(a); + result &= b; + return result; + } + + inline LoopFragmentFlags operator&(LoopFragmentFlags a, LoopFragmentType b) + { + a &= b; + return a; + } + + inline LoopFragmentFlags operator&(LoopFragmentType a, LoopFragmentFlags b) + { + return b & a; + } + + inline LoopFragmentFlags operator|(LoopFragmentType a, LoopFragmentType b) + { + LoopFragmentFlags result(a); + result |= b; + return result; + } + + inline LoopFragmentFlags operator|(LoopFragmentFlags a, LoopFragmentType b) + { + a |= b; + return a; + } + + inline LoopFragmentFlags operator|(LoopFragmentType a, LoopFragmentFlags b) + { + return b | a; + } + + /// + /// A class to hold the constraints that govern where a piece of code may / must run. Used to generate a concrete + /// schedule for running (non-"kernel") code + /// + + // TODO: each boundary index needs its own "placement" value (e.g., you could have a kernel that runs when j==0 and k==N-1) + class CodePositionConstraints + { + public: + CodePositionConstraints(LoopFragmentType placement, std::vector requiredIndices, std::vector boundaryIndices); + + LoopFragmentType GetPlacement() const { return _placement; } + + std::vector GetRequiredIndices() const; // indices we depend on + std::vector GetBoundaryIndices() const; // indices defining the fragment + + private: + LoopFragmentType _placement; + std::vector _requiredIndices; + std::vector _boundaryIndices; + }; + + bool operator==(const CodePositionConstraints& i1, const CodePositionConstraints& i2); + bool operator!=(const CodePositionConstraints& i1, const CodePositionConstraints& i2); + } // namespace loopnests +} // namespace value +} // namespace ell + +// +// Custom specialization of std::hash so we can keep constraints in containers that require hashable types +// +namespace std +{ +/// Implements a hash function for the CodePositionConstraints class, so that it can be used with associative containers (maps, sets, and the like). +template <> +struct hash<::ell::value::loopnests::CodePositionConstraints> +{ + using argument_type = ell::value::loopnests::CodePositionConstraints; + using result_type = std::size_t; + + /// Computes a hash of the input value. + /// + /// A hash value for the given input.
+ result_type operator()(const argument_type& constraints) const; +}; +} // namespace std diff --git a/libraries/value/include/loopnests/ForAll.h b/libraries/value/include/loopnests/ForAll.h new file mode 100644 index 000000000..f1333ce44 --- /dev/null +++ b/libraries/value/include/loopnests/ForAll.h @@ -0,0 +1,40 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: ForAll.h (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "LoopNest.h" + +namespace ell +{ +namespace value::loopnests +{ + // + // Syntactic sugar class for stringing together loop nest calls + // + class ForAll + { + public: + ForAll(const ForAll& other) = delete; + ForAll(ForAll&& other) = default; + ForAll(IterationDomain domain); + ForAll& operator=(const ForAll& other) = delete; + ForAll& operator=(ForAll&& other) = default; + + ForAll& AddKernel(const Kernel& kernel); + ForAll& AddKernel(const Kernel& kernel, const CodePositionConstraints& where); + ForAll& Split(const Index& dimension, int size); + ForAll& SetLoopOrder(const std::vector& order); + + const LoopNest& GetNest() const; + + private: + LoopNest _loops; + }; +} // namespace value::loopnests +} // namespace ell \ No newline at end of file diff --git a/libraries/value/include/loopnests/Index.h b/libraries/value/include/loopnests/Index.h new file mode 100644 index 000000000..474467392 --- /dev/null +++ b/libraries/value/include/loopnests/Index.h @@ -0,0 +1,68 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: Index.h (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include +#include + +namespace ell +{ +namespace value +{ + namespace loopnests + { + /// + /// A placeholder object representing a runtime variable used as the index for a loop (e.g., the 'i' in 'for(i = ...)'). 
+ /// + class Index + { + public: + using Id = int; + Index() = default; + Index(const Index& other) = default; + Index(Index&& other) = default; + Index(const std::string& name); + Index& operator=(const Index& other) = default; + Index& operator=(Index&& other) = default; + + const std::string& GetName() const; + Id GetId() const; + + private: + static int GetNextId(); + + friend inline bool operator==(const Index& i1, const Index& i2) { return i1.GetId() == i2.GetId(); } + friend inline bool operator!=(const Index& i1, const Index& i2) { return !(i1 == i2); } + friend inline bool operator<(const Index& i1, const Index& i2) { return i1.GetId() < i2.GetId(); } + + std::string _name; + Id _id = -1; + }; + + struct SplitIndex + { + Index outer; + Index inner; + }; + + std::ostream& operator<<(std::ostream& os, const Index& index); + } // namespace loopnests +} // namespace value +} // namespace ell + +namespace std +{ +template <> +struct hash<::ell::value::loopnests::Index> +{ + using argument_type = ::ell::value::loopnests::Index; + using result_type = std::size_t; + result_type operator()(const argument_type& index) const; +}; +} // namespace std diff --git a/libraries/value/include/loopnests/IndexRange.h b/libraries/value/include/loopnests/IndexRange.h new file mode 100644 index 000000000..759886606 --- /dev/null +++ b/libraries/value/include/loopnests/IndexRange.h @@ -0,0 +1,50 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: IndexRange.h (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "Index.h" +#include "Range.h" + +#include + +namespace ell +{ +namespace value +{ + namespace loopnests + { + /// + /// A range of integer values, used to express the interval that an index variable may take on. + /// + class IndexRange + { + public: + IndexRange(const Index& index, const Range& range); + IndexRange(const std::string& name, const Range& range); + + const Index& GetIndex() const; + const std::string& GetName() const; + int Begin() const; + int End() const; + int Size() const; + int Increment() const; + Range GetRange() const; + + private: + Index _index; + Range _range; + + friend inline bool operator==(const IndexRange& i1, const IndexRange& i2) { return (i1.GetIndex() == i2.GetIndex()) && (i1.GetRange() == i2.GetRange()); } + friend inline bool operator!=(const IndexRange& i1, const IndexRange& i2) { return (i1.GetIndex() != i2.GetIndex()) || (i1.GetRange() != i2.GetRange()); } + friend inline bool operator<(const IndexRange& i1, const IndexRange& i2) { return (i1.GetIndex() != i2.GetIndex()) ? 
(i1.GetIndex() < i2.GetIndex()) : (i1.GetRange() < i2.GetRange()); } + }; + + } // namespace loopnests +} // namespace value +} // namespace ell \ No newline at end of file diff --git a/libraries/value/include/loopnests/IterationDomain.h b/libraries/value/include/loopnests/IterationDomain.h new file mode 100644 index 000000000..b7ba65531 --- /dev/null +++ b/libraries/value/include/loopnests/IterationDomain.h @@ -0,0 +1,47 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: IterationDomain.h (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "IndexRange.h" + +#include +#include +#include + +namespace ell +{ +namespace value +{ + namespace loopnests + { + /// + /// The set of all points (IterationVectors) to be visited by a loop or loop nest. + /// + class IterationDomain + { + public: + IterationDomain() = default; + IterationDomain(const std::vector& ranges); + IterationDomain(const std::initializer_list& ranges); + + int NumDimensions() const; + IndexRange GetDimensionRange(int dimension) const; + IndexRange GetDimensionRange(const Index& index) const; + const std::vector& GetRanges() const; + + private: + int GetDimensionRangeFromIndex(const Index& index) const; + + std::vector _dimensions; + std::map _indexToDimensionMap; + }; + + } // namespace loopnests +} // namespace value +} // namespace ell diff --git a/libraries/value/include/loopnests/Kernel.h b/libraries/value/include/loopnests/Kernel.h new file mode 100644 index 000000000..80403d110 --- /dev/null +++ b/libraries/value/include/loopnests/Kernel.h @@ -0,0 +1,166 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: Kernel.h (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "Index.h" + +#include "../FunctionDeclaration.h" +#include "../Scalar.h" +#include "../Value.h" + +#include +#include +#include +#include + +namespace ell +{ +namespace value +{ + namespace loopnests + { + /// + /// Code that runs inside the loop nest. This is the code that actually implements the operation. The loops and + /// scheduling are all for the purpose of calling this code. + /// + class Kernel + { + public: + using Id = std::string; + + explicit Kernel(std::string name); + Kernel(std::string name, Id id); + + template + Kernel& Inputs(Types... inputs); + + Kernel& Inputs(const std::vector& inputs); + + Kernel& Indices(std::vector indices); + + template + Kernel& Indices(Types...
indices); + + template + Kernel& Define(Fn&& fn); + + // TODO : make this a template specialization of Define(), currently lambdas and std::functions aren't + // getting matched correctly + Kernel& DefineEx(std::function, std::vector)>&& fn); + + void Call(std::vector arguments, std::vector indices) const; + + const std::string& GetName() const; + const Id& GetId() const; + const std::vector& GetArgs() const; + const std::vector& GetIndices() const; + + private: + Id _id; + std::string _kernelName; + std::vector _inputs; + std::vector _indices; + std::function arguments, std::vector indices)> _kernel; + }; + + inline bool operator==(const Kernel& i1, const Kernel& i2) { return i1.GetId() == i2.GetId(); } + inline bool operator!=(const Kernel& i1, const Kernel& i2) { return !(i1 == i2); } + } // namespace loopnests +} // namespace value +} // namespace ell + +namespace std +{ +/// Implements a hash function for the Kernel class, so that it can be used with associative containers (maps, sets, and the like). +template <> +struct hash<::ell::value::loopnests::Kernel> +{ + using argument_type = ::ell::value::loopnests::Kernel; + using result_type = std::size_t; + result_type operator()(const argument_type& kernel) const; +}; +} // namespace std + +#pragma region implementation + +namespace ell +{ +namespace value +{ + namespace loopnests + { + template + Kernel& Kernel::Inputs(Types... inputs) + { + static_assert(std::conjunction_v...>); + return Inputs(std::vector{ ViewAdapter{ std::forward(inputs) }... }); + } + + template + Kernel& Kernel::Indices(Types... indices) + { + static_assert(utilities::AllSame...>); + return Indices(std::vector{ std::forward(indices)... }); + } + + template + Kernel& Kernel::Define(Fn&& fn) + { + _kernel = [numOriginalIndices = _indices.size(), + originalInputs = _inputs, + kernelName = UniqueName(_kernelName + "KernelFn"), + fnDefinition = std::move(fn)](std::vector arguments, std::vector indices) { + using namespace utilities; + + if (arguments.size() != originalInputs.size()) + { + throw InputException(InputExceptionErrors::invalidArgument, "Number of arguments does not match number of expected inputs"); + } + if (indices.size() != numOriginalIndices) + { + throw InputException(InputExceptionErrors::invalidArgument, "Number of indices does not match number of expected indices"); + } + + std::vector fnInputs(arguments.begin(), arguments.end()); + fnInputs.insert(fnInputs.end(), indices.begin(), indices.end()); + + std::vector fnParameters(originalInputs.begin(), originalInputs.end()); + fnParameters.insert(fnParameters.end(), indices.begin(), indices.end()); + for (auto i = 0u; i < originalInputs.size(); ++i) + { + Value& param = fnParameters[i]; + const Value& input = fnInputs[i]; + + if (!input.IsConstrained()) + { + param.ClearLayout(); + } + else + { + param.SetLayout(input.GetLayout()); + } + } + + auto fn = DeclareFunction(kernelName).Parameters(fnParameters); + fn.Inlined(FunctionInlining::always); + if (!fn.IsDefined()) + { + fn.Define(fnDefinition); + } + + fn.Call(fnInputs); + }; + + return *this; + } + + } // namespace loopnests +} // namespace value +} // namespace ell +#pragma endregion diff --git a/libraries/value/include/loopnests/KernelPredicate.h b/libraries/value/include/loopnests/KernelPredicate.h new file mode 100644 index 000000000..bcee6c06f --- /dev/null +++ b/libraries/value/include/loopnests/KernelPredicate.h @@ -0,0 +1,315 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// 
Project: Embedded Learning Library (ELL) +// File: KernelPredicate.h (value) +// Authors: Chuck Jacobs +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "Index.h" +#include "LoopIndexInfo.h" + +#include "../ValueType.h" + +#include +#include +#include +#include +#include + +namespace ell +{ +namespace value +{ + namespace loopnests + { + class KernelPredicate; + class LoopVisitSchedule; + + enum class Fragment + { + first, + last, + endBoundary, + all + }; + + enum class Placement + { + before, + after + }; + + class EmptyPredicate + { + public: + const EmptyPredicate& Simplify() const { return *this; } + const EmptyPredicate& Simplify(const LoopIndexSymbolTable& indices, const LoopVisitSchedule& schedule) const { return *this; } + + private: + friend inline bool operator==(const EmptyPredicate& i1, const EmptyPredicate& i2) { return true; } + friend inline bool operator!=(const EmptyPredicate& i1, const EmptyPredicate& i2) { return false; } + friend inline bool operator<(const EmptyPredicate& i1, const EmptyPredicate& i2) { return false; } + }; + + class ConstantPredicate + { + public: + explicit ConstantPredicate(bool value); + + bool GetValue() const; + + const ConstantPredicate& Simplify() const { return *this; } + const ConstantPredicate& Simplify(const LoopIndexSymbolTable& indices, const LoopVisitSchedule& schedule) const { return *this; } + + private: + bool _value; + + friend inline bool operator==(const ConstantPredicate& i1, const ConstantPredicate& i2) { return i1.GetValue() == i2.GetValue(); } + friend inline bool operator!=(const ConstantPredicate& i1, const ConstantPredicate& i2) { return i1.GetValue() != i2.GetValue(); } + friend inline bool operator<(const ConstantPredicate& i1, const ConstantPredicate& i2) { return i1.GetValue() < i2.GetValue(); } + }; + + class FragmentTypePredicate + { + public: + FragmentTypePredicate(const Index& index, Fragment condition); + + const Index& GetIndex() const; + Fragment GetCondition() const; + + KernelPredicate Simplify() const; + KernelPredicate Simplify(const LoopIndexSymbolTable& indices, const LoopVisitSchedule& schedule) const; + + private: + Index _index; + Fragment _condition; + + friend inline bool operator==(const FragmentTypePredicate& i1, const FragmentTypePredicate& i2) { return (i1.GetIndex() == i2.GetIndex()) && (i1.GetCondition() == i2.GetCondition()); } + friend inline bool operator!=(const FragmentTypePredicate& i1, const FragmentTypePredicate& i2) { return (i1.GetIndex() != i2.GetIndex()) || (i1.GetCondition() != i2.GetCondition()); } + friend inline bool operator<(const FragmentTypePredicate& i1, const FragmentTypePredicate& i2) { return (i1.GetIndex() != i2.GetIndex()) ? 
(i1.GetIndex() < i2.GetIndex()) : (i1.GetCondition() < i2.GetCondition()); } + }; + + class PlacementPredicate + { + public: + // Where to schedule kernel in its loop (before or after any inner loops) + explicit PlacementPredicate(Placement where); + + // Where to schedule kernel in relation to an index + PlacementPredicate(const Index& index, Placement where); + + bool HasIndex() const; + Index GetIndex() const; + Placement GetPlacement() const; + + const PlacementPredicate& Simplify() const; + const PlacementPredicate& Simplify(const LoopIndexSymbolTable& indices, const LoopVisitSchedule& schedule) const; + + private: + std::optional _index; + Placement _placement; + + friend inline bool operator==(const PlacementPredicate& i1, const PlacementPredicate& i2) { return (i1.GetIndex() == i2.GetIndex()) && (i1.GetPlacement() == i2.GetPlacement()); } + friend inline bool operator!=(const PlacementPredicate& i1, const PlacementPredicate& i2) { return (i1.GetIndex() != i2.GetIndex()) || (i1.GetPlacement() != i2.GetPlacement()); } + friend inline bool operator<(const PlacementPredicate& i1, const PlacementPredicate& i2) { return (i1.GetIndex() != i2.GetIndex()) ? (i1.GetIndex() < i2.GetIndex()) : (i1.GetPlacement() < i2.GetPlacement()); } + }; + + class IndexDefinedPredicate + { + public: + explicit IndexDefinedPredicate(const Index& index); + + const Index& GetIndex() const; + + const IndexDefinedPredicate& Simplify() const; + const IndexDefinedPredicate& Simplify(const LoopIndexSymbolTable& indices, const LoopVisitSchedule& schedule) const; + + private: + Index _index; + + friend inline bool operator==(const IndexDefinedPredicate& i1, const IndexDefinedPredicate& i2) { return i1.GetIndex() == i2.GetIndex(); } + friend inline bool operator!=(const IndexDefinedPredicate& i1, const IndexDefinedPredicate& i2) { return i1.GetIndex() != i2.GetIndex(); } + friend inline bool operator<(const IndexDefinedPredicate& i1, const IndexDefinedPredicate& i2) { return i1.GetIndex() < i2.GetIndex(); } + }; + + class KernelPredicateConjunction + { + public: + KernelPredicateConjunction(const KernelPredicate& lhs, const KernelPredicate& rhs); + KernelPredicateConjunction(const KernelPredicateConjunction& other); + KernelPredicateConjunction(KernelPredicateConjunction&& other) = default; + KernelPredicateConjunction& operator=(const KernelPredicateConjunction& other); + KernelPredicateConjunction& operator=(KernelPredicateConjunction&& other) = default; + + const std::vector>& GetTerms() const; + + KernelPredicate Simplify() const; + KernelPredicate Simplify(const LoopIndexSymbolTable& indices, const LoopVisitSchedule& schedule) const; + + private: + friend class KernelPredicate; + KernelPredicateConjunction(const std::vector>& terms); + + std::vector> _terms; + }; + + class KernelPredicateDisjunction + { + public: + KernelPredicateDisjunction(const KernelPredicate& lhs, const KernelPredicate& rhs); + KernelPredicateDisjunction(const KernelPredicateDisjunction& other); + KernelPredicateDisjunction(KernelPredicateDisjunction&& other) = default; + KernelPredicateDisjunction& operator=(const KernelPredicateDisjunction& other); + KernelPredicateDisjunction& operator=(KernelPredicateDisjunction&& other) = default; + + const std::vector>& GetTerms() const; + + KernelPredicate Simplify() const; + KernelPredicate Simplify(const LoopIndexSymbolTable& indices, const LoopVisitSchedule& schedule) const; + + private: + friend class KernelPredicate; + KernelPredicateDisjunction(const std::vector>& terms); + + std::vector> 
_terms; + }; + + class KernelPredicate + { + public: + KernelPredicate() = default; + + KernelPredicate(const EmptyPredicate& predicate); + KernelPredicate(const ConstantPredicate& predicate); + KernelPredicate(const FragmentTypePredicate& predicate); + KernelPredicate(const PlacementPredicate& predicate); + KernelPredicate(const IndexDefinedPredicate& predicate); + KernelPredicate(const KernelPredicateConjunction& predicate); + KernelPredicate(const KernelPredicateDisjunction& predicate); + + KernelPredicate(const KernelPredicate&) = default; + KernelPredicate(KernelPredicate&&) = default; + KernelPredicate& operator=(const KernelPredicate&) = default; + KernelPredicate& operator=(KernelPredicate&&) = default; + + KernelPredicate Simplify() const; + KernelPredicate Simplify(const LoopIndexSymbolTable& indices, const LoopVisitSchedule& schedule) const; + + template + void Visit(FunctionType&& f) const; // FunctionType of the form void(const KernelPredicate&) + + bool IsConstant() const; + bool GetConstantValue() const; + + bool IsAlwaysTrue() const; + bool IsAlwaysFalse() const; + bool IsEmpty() const; + + template + bool Is() const; + + template + const T* As() const; + + friend std::ostream& operator<<(std::ostream& os, const KernelPredicate& predicate); + + private: + std::variant + _expr; + }; + + KernelPredicate First(const Index& index); + KernelPredicate Last(const Index& index); + KernelPredicate EndBoundary(const Index& index); + KernelPredicate All(const Index& index); + KernelPredicate IsDefined(const Index& index); + KernelPredicate Before(const Index& index); + KernelPredicate After(const Index& index); + + KernelPredicate operator&&(const KernelPredicate& lhs, const KernelPredicate& rhs); + KernelPredicate operator||(const KernelPredicate& lhs, const KernelPredicate& rhs); + + KernelPredicate operator==(const Index& index, int value); + KernelPredicate operator==(int value, const Index& index); + // KernelPredicate operator==(const Index& index1, const Index& index2); + KernelPredicate operator!=(const Index& index, int value); + KernelPredicate operator!=(int value, const Index& index); + // KernelPredicate operator!=(const Index& index1, const Index& index2); + KernelPredicate operator<(const Index& index, int value); + KernelPredicate operator<(int value, const Index& index); + // KernelPredicate operator<(const Index& index1, const Index& index2); + KernelPredicate operator>(const Index& index, int value); + KernelPredicate operator>(int value, const Index& index); + // KernelPredicate operator>(const Index& index1, const Index& index2); + KernelPredicate operator<=(const Index& index, int value); + KernelPredicate operator<=(int value, const Index& index); + // KernelPredicate operator<=(const Index& index1, const Index& index2); + KernelPredicate operator>=(const Index& index, int value); + KernelPredicate operator>=(int value, const Index& index); + // KernelPredicate operator>=(const Index& index1, const Index& index2); + + std::string ToString(Fragment condition); + + std::ostream& operator<<(std::ostream& os, const EmptyPredicate& predicate); + std::ostream& operator<<(std::ostream& os, const ConstantPredicate& predicate); + std::ostream& operator<<(std::ostream& os, const FragmentTypePredicate& predicate); + std::ostream& operator<<(std::ostream& os, const PlacementPredicate& predicate); + std::ostream& operator<<(std::ostream& os, const KernelPredicateConjunction& predicate); + std::ostream& operator<<(std::ostream& os, const KernelPredicateDisjunction& 
predicate); + // std::ostream& operator<<(std::ostream& os, const KernelPredicate& predicate); + + } // namespace loopnests +} // namespace value +} // namespace ell + +#pragma region implementation +namespace ell +{ +namespace value +{ + namespace loopnests + { + template + bool KernelPredicate::Is() const + { + return std::holds_alternative(_expr); + } + + template + const T* KernelPredicate::As() const + { + return std::get_if(&_expr); + } + + template + void KernelPredicate::Visit(FunctionType&& f) const + { + f(*this); + if (auto conj = As(); conj != nullptr) + { + for (const auto& t : conj->GetTerms()) + { + t->Visit(f); + } + } + else if (auto disj = As(); disj != nullptr) + { + for (const auto& t : disj->GetTerms()) + { + t->Visit(f); + } + } + } + } // namespace loopnests +} // namespace value +} // namespace ell +#pragma endregion implementation diff --git a/libraries/value/include/loopnests/LoopIndexInfo.h b/libraries/value/include/loopnests/LoopIndexInfo.h new file mode 100644 index 000000000..224fef47e --- /dev/null +++ b/libraries/value/include/loopnests/LoopIndexInfo.h @@ -0,0 +1,41 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: LoopIndexInfo.h (value) +// Authors: Chuck Jacobs +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "Index.h" +#include "Range.h" + +#include "../Scalar.h" + +#include + +namespace ell +{ +namespace value +{ + namespace loopnests + { + enum class LoopIndexState + { + notVisited, + inProgress, + done + }; + + struct LoopIndexSymbolTableEntry + { + Index scope; // redundant with key in symbol table map + Scalar value; + Range loopRange; + LoopIndexState state; + }; + using LoopIndexSymbolTable = std::unordered_map; + } // namespace loopnests +} // namespace value +} // namespace ell diff --git a/libraries/value/include/loopnests/LoopNest.h b/libraries/value/include/loopnests/LoopNest.h new file mode 100644 index 000000000..b00a606c8 --- /dev/null +++ b/libraries/value/include/loopnests/LoopNest.h @@ -0,0 +1,292 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: LoopNest.h (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "CodePositionConstraints.h" +#include "Index.h" +#include "IndexRange.h" +#include "IterationDomain.h" +#include "Kernel.h" +#include "KernelPredicate.h" +#include "SplitIndexRange.h" +#include "SplitIterationDomain.h" + +#include "../Value.h" + +#include +#include +#include + +namespace ell +{ +namespace value +{ + namespace loopnests + { + class LoopNest; + + struct ScheduledKernel + { + bool newVersion = false; // Temporary workaround... 
to be removed + + Kernel kernel; + CodePositionConstraints constraints; + KernelPredicate predicate; + KernelPredicate placement; + }; + + struct ScheduledKernelGroup + { + Kernel::Id id; + std::vector kernels; + }; + + struct RenameAction + { + Value oldValue; + Value newValue; + std::vector where; + std::vector excludedKernels; + }; + + struct ScaledIndex + { + int scale; + Index index; + }; + + struct IndexExpression + { + std::vector indices; + int begin = 0; + }; + + struct LoopInfo + { + Index loopIndex; + Range fullRange; // the range for the "unswitched" part of this loop + Range fragmentRange; + }; + + /// + /// Represents the concrete sequence of loops to be generated, in detail. Derived from the loop nest and the + /// order of the loops. + /// + class LoopVisitSchedule + { + public: + struct LoopInfo + { + Index dimension; + IndexRange indexRange; + int boundarySize = 0; + int scale = 0; + }; + + using StateQueue = std::vector; + + /// Copy and move constructors / assignment operators + LoopVisitSchedule(const LoopVisitSchedule& other); + LoopVisitSchedule(LoopVisitSchedule&&) = default; + LoopVisitSchedule& operator=(const LoopVisitSchedule& other); + LoopVisitSchedule& operator=(LoopVisitSchedule&& other) = default; + + /// Returns the global nest level: how many total loops are current (over all dimensions) + int CurrentNestLevel() const; + + /// Returns `true` if all the loops have been visited + bool IsDone() const; + + /// Returns `true` if the current loop is the innermost level + bool IsInnermostLoop() const; + + /// The index of the current loop (e.g., `i_1`) + Index CurrentLoopIndex() const; + + /// Returns the logical (dimension) index of the current loop (i.e., `i`, not `i_0`, `i_1`, ...) + Index CurrentDimension() const; + + /// The span (start-end) of the entire dimension the current loop is part of. + int DimensionSize() const; + + /// The range [start, end) of the current loop. + Range LoopRange() const; + + /// The span (start-end) of the current loop. Only the same as the + /// number of iterations if the increment is 1. + int LoopSize() const; + int LoopIncrement() const; + int NonBoundaryEnd() const; + + /// The amount this loop index needs to be scaled when generating the expression for the original dimension index. + int LoopIndexScale() const; + + /// Returns `true` if the current loop index has a prologue / epilogue of the given type (because there is a kernel associated with it). + bool CurrentLoopHasFragment(std::vector activeKernels, LoopFragmentType fragmentType) const; + + /// Returns `true` if a future loop (one inside this loop) has a prologue / epilogue of the given type (because there is a kernel associated with it) on the same dimension as the current loop. 
+ bool FutureLoopHasFragmentForThisIndex(std::vector activeKernels, LoopFragmentType fragmentType) const; + + bool FragmentCanRunAlone(std::vector activeKernels, LoopFragmentType fragmentType) const; // Returns `true` if the current loop index has a prologue / epilogue of the given type (because there is a kernel associated with it), and if all such kernels are able to be in their own fragment + + int CurrentIndexEndBoundarySize() const; // Returns the size of the last inner loop if the current loop has a potentially-unswitched boundary condition at the end, or zero if the increment divides the size evenly + + bool WillVisitIndex(const Index& index) const; + bool IsFullyDefined(const Index& index) const; + bool IsFullyDefinedByThisLoop(const Index& index) const; + bool WasIterationVariableDefined(const Index& index) const; + KernelPredicate GetKernelPredicate(const ScheduledKernel& kernel) const; + + LoopVisitSchedule Next() const; + LoopVisitSchedule Prev() const; + + const LoopInfo& Front() const; + + const SplitIterationDomain& GetDomain() const; + const LoopNest& GetLoopNest() const { return _nest.get(); } + + private: + friend class LoopNest; + LoopVisitSchedule(const LoopNest& nest, StateQueue state); + LoopVisitSchedule(const LoopNest& nest, int level, StateQueue state); + + int _level; // == current position in state queue + StateQueue _state; + std::reference_wrapper _nest; + }; + + /// + /// A nested set of loops, and the code (kernels) that run inside them + /// + class LoopNest + { + public: + LoopNest(IterationDomain domain); + + enum class ConstraintType + { + constraint, + predicate + }; + + /// Add a "body" kernel to be run in the middle of the loop nest + void AddKernel(const Kernel& kernel, ConstraintType type = ConstraintType::constraint); + + /// Add a kernel to be run as the prologue or epilogue of a loop + void AddKernel(const Kernel& kernel, LoopFragmentType where); + + /// Add a kernel to be run as the prologue or epilogue of a loop + void AddKernel(const Kernel& kernel, const CodePositionConstraints& where); + + /// Add a kernel to be run as allowed by a predicate + void AddKernel(const Kernel& kernel, const KernelPredicate& predicate); + + /// Add a kernel to be run as allowed by a predicate and a placement predicate + void AddKernel(const Kernel& kernel, const KernelPredicate& predicate, const KernelPredicate& placement); + + /// Add a kernel to be run as the prologue or epilogue of a loop + void AddKernel(const Kernel& kernel, const CodePositionConstraints& where, const KernelPredicate& predicate, const KernelPredicate& placement); + + const std::vector& GetKernels() const; + + std::vector GetKernelGroups() const; + + void Parallelize(Index index); + [[maybe_unused]] SplitIndex Parallelize(Index index, int factor); + + void Unroll(Index index); + [[maybe_unused]] SplitIndex Unroll(Index index, int factor); + + [[maybe_unused]] SplitIndex Split(Index index, int size); + + void SetLoopOrder(const std::vector& order); + + void RenameVariable(ViewAdapter oldVariable, ViewAdapter newVariable, const std::vector& where, const std::vector& excludedKernels = {}); + + int NumDimensions() const; + Range GetIndexRange(Index index) const; + std::vector GetLoopIndexRanges() const; + const SplitIndexRange& GetDimensionRange(int dimension) const; + const SplitIndexRange& GetDimensionRange(const Index& dimension) const; + int NumSplits(const Index& dimension) const; + const std::vector& GetLoopSequence() const; + LoopVisitSchedule GetLoopSchedule() const; + + // Methods used by 
code generators
+        int GetLoopIndexScale(const Index& index) const;
+
+        /// Get the concrete loop index given a logical dimension index and split level
+        Index GetLoopIndex(const Index& dimension, int level) const;
+
+        bool IsParallelized(const Index& index) const;
+
+        bool IsUnrolled(const Index& index) const;
+
+        /// See if an Index is used as a parameter to a kernel
+        bool IsUsed(const Index& index, const std::vector<ScheduledKernel>& activeKernels) const;
+
+        /// Preliminary "variable-renaming" support
+        const std::vector<RenameAction>& GetRenameActions() const;
+
+        const SplitIterationDomain& GetDomain() const;
+
+        Index GetBaseIndex(const Index& index) const;
+
+        /// Return `true` iff `index` is a concrete index for a loop.
+        bool IsLoopIndex(const Index& index) const;
+
+        /// Return `true` iff `index` must be computed from other indices (i.e., it has been split).
+        bool IsComputedIndex(const Index& index) const;
+
+        IndexExpression GetIndexExpression(const Index& index) const;
+
+        void DebugDump(std::string tag, std::ostream* stream) const;
+
+        const std::string& Name() const { return _name; }
+
+    private:
+        void InitLoopSequence();
+        void ConvertKernelConstraints();
+        void ConvertKernelConstraints(ScheduledKernel& kernel);
+
+        SplitIterationDomain _domain;
+        std::vector<Index> _loopSequence;
+        std::vector<ScheduledKernel> _kernels;
+        std::vector<RenameAction> _renameActions;
+        std::vector<Index> _parallelizedIndices;
+        std::vector<Index> _unrolledIndices;
+        std::string _name = UniqueName("LoopNest");
+    };
+
+    void DebugDump(const LoopNest& nest, std::string tag = "", std::ostream* stream = nullptr);
+
+    LoopNest Fuse(const LoopNest& nest1, const LoopNest& nest2);
+    LoopNest Fuse(const LoopNest& nest1, const LoopNest& nest2, const std::vector<Index>& dependentIndices1, const std::vector<Index>& dependentIndices2);
+
+    bool operator==(const ScheduledKernel& i1, const ScheduledKernel& i2);
+    bool operator!=(const ScheduledKernel& i1, const ScheduledKernel& i2);
+    } // namespace loopnests
+} // namespace value
+} // namespace ell
+
+namespace std
+{
+/// Implements a hash function for the ScheduledKernel class, so that it can be used with associative containers (maps, sets, and the like).
+template <>
+struct hash<::ell::value::loopnests::ScheduledKernel>
+{
+    using argument_type = ell::value::loopnests::ScheduledKernel;
+    using result_type = std::size_t;
+
+    /// Computes a hash of the input value.
+    ///
+    /// A hash value for the given input.
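+    ///
+    /// A minimal usage sketch (hypothetical names; with this specialization in
+    /// place, ScheduledKernel works directly as a key type):
+    ///
+    ///     std::unordered_set<ell::value::loopnests::ScheduledKernel> visited;
+    ///     visited.insert(kernel); // uses this hash plus operator== declared above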
+    result_type operator()(const argument_type& constraints) const;
+};
+} // namespace std
diff --git a/libraries/value/include/loopnests/LoopNestPrinter.h b/libraries/value/include/loopnests/LoopNestPrinter.h
new file mode 100644
index 000000000..5bc7239fa
--- /dev/null
+++ b/libraries/value/include/loopnests/LoopNestPrinter.h
@@ -0,0 +1,65 @@
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Project:  Embedded Learning Library (ELL)
+//  File:     LoopNestPrinter.h (value)
+//  Authors:  Chuck Jacobs
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include "LoopNest.h"
+#include "LoopNestVisitor.h"
+
+#include <ostream>
+#include <string>
+
+namespace ell
+{
+namespace value
+{
+    namespace loopnests
+    {
+        ///
+        /// Takes a loop nest and prints a pseudocode representation of it
+        ///
+        class LoopNestPrinter : public LoopNestVisitor
+        {
+        public:
+            LoopNestPrinter(std::ostream& stream);
+            void Print(const LoopNest& loopNest) const;
+
+        private:
+            std::ostream& _stream;
+            mutable int _indentLevel;
+
+            // RAII struct to manage indent level
+            struct Indenter
+            {
+                Indenter(const LoopNestPrinter& printer) :
+                    printer(printer) { ++printer._indentLevel; }
+                ~Indenter() { --printer._indentLevel; }
+                const LoopNestPrinter& printer;
+            };
+
+            void GenerateLoopRangeOld(const LoopRange& range, const RecursionState& state, const LoopVisitSchedule& schedule, std::function<void(Scalar)> codegenFn) const override;
+            void GenerateLoopRangeNew(const LoopRange& range, const RecursionStateNew& state, const LoopVisitSchedule& schedule, std::function<void(Scalar)> codegenFn) const override;
+            Scalar EmitIndexExpression(const Index& index, const IndexExpression& expr, const LoopIndexSymbolTable& indexVariables) const override;
+            void InvokeKernel(const Kernel& kernel, const KernelPredicate& predicate, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const override;
+            bool InvokeKernelGroup(const ScheduledKernelGroup& kernelGroup, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const override;
+
+            void InvokeKernel(const Kernel& kernel, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const;
+            void EmitIf(const KernelPredicate& predicate, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const;
+            void EmitElseIf(const KernelPredicate& predicate, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const;
+            void EmitElse() const;
+            void EmitEndIf() const;
+
+            std::string GetIndent() const;
+            void WriteLine(std::string l) const;
+            std::string GetIndexString(const Index& index, const LoopIndexSymbolTable& runtimeIndexVariables) const;
+            std::string GetPredicateString(const KernelPredicate& predicate, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const;
+        };
+
+    } // namespace loopnests
+} // namespace value
+} // namespace ell
diff --git a/libraries/value/include/loopnests/LoopNestVisitor.h b/libraries/value/include/loopnests/LoopNestVisitor.h
new file mode 100644
index 000000000..baf928384
--- /dev/null
+++ b/libraries/value/include/loopnests/LoopNestVisitor.h
@@ -0,0 +1,138 @@
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Project:  Embedded Learning Library (ELL)
+//  File:     LoopNestVisitor.h (value)
+//  Authors:  Chuck Jacobs, Kern Handa
+//
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "KernelPredicate.h" +#include "LoopIndexInfo.h" +#include "LoopNest.h" + +#include +#include +#include +#include + +namespace ell +{ +namespace value +{ + namespace loopnests + { + /// + /// Abstract base class for objects that visit a loop nest (e.g., code generators) + /// + class LoopNestVisitor + { + public: + static Range GetLoopRange(const Index& loopIndex, const LoopIndexSymbolTable& activeRanges, const LoopVisitSchedule& schedule); + + protected: + virtual ~LoopNestVisitor() = default; + + void Visit(const LoopNest& loopNest) const; + + bool WillKernelRunInThisLoop(const ScheduledKernel& kernel, LoopFragmentFlags kernelFilter, const LoopVisitSchedule& schedule) const; + + std::vector GetValidKernels(std::vector activeKernels, + const std::unordered_map& currentFragmentStates, + LoopFragmentFlags currentLoopFlags, + LoopFragmentFlags kernelFilter, + const LoopVisitSchedule& schedule) const; + + bool ShouldRunKernel(const ScheduledKernel& kernel, + LoopFragmentType placement, + const std::unordered_map& constraintIndices, + LoopFragmentFlags currentLoopFlags, + const LoopVisitSchedule& schedule) const; + + bool IsIdentity(const IndexExpression& expr, const Index& index) const; + + /// + /// Returns `true` if the current loop body is inside the loop for the given index (so, "inside" counts the current loop being emitted) + /// + bool IsFullyDefined(const Index& index, const LoopVisitSchedule& schedule) const; + + /// + /// Returns `true` if the current loop body is inside the loop for the given index (so, "inside" counts the current loop being emitted) + /// + bool AreAllFullyDefined(const std::vector& indices, const LoopVisitSchedule& schedule) const; + + struct RecursionState + { + RecursionState(const LoopNest& loopNest); + RecursionState(const RecursionState&) = default; + + LoopIndexSymbolTable loopIndices; // map from an loop Index variable -> the actual (Scalar) runtime loop index for that loop + LoopFragmentFlags currentFragment; + std::unordered_map activeDimensionRanges; // map from dimension index variable -> active loop range for that dimension at this recursion level if that dimension has been previously visited + std::vector activeKernels; + std::unordered_map fragmentStates; + }; + + struct Partition + { + Index index; + Range range; + }; + using PartitionList = std::vector; + + using ActiveKernelGroupList = std::vector>; + + struct RecursionStateNew + { + RecursionStateNew(const LoopNest& loopNest); + RecursionStateNew(const RecursionStateNew&) = default; + + LoopIndexSymbolTable loopIndices; // map from an loop Index variable -> + // the actual (Scalar) runtime loop index for that loop + // range visited by that variable in this branch of the code (for loops that have already been visited) + // state of the variable's loop (before, inside, after) + ActiveKernelGroupList kernelGroups; + }; + + struct LoopRange + { + Scalar start; + Scalar stop; + Scalar step; + LoopFragmentFlags futureLoopFragmentFlags; + LoopFragmentFlags currentLoopFragmentFlags; + }; + + bool UseNewVersion(const LoopNest& loopNest) const; + void GenerateLoopsOld(const RecursionState& state, const LoopVisitSchedule& schedule) const; + void GenerateLoopsNew(RecursionStateNew& state, const LoopVisitSchedule& schedule) const; + std::function GetCodegenFnOld(const LoopRange& r, const RecursionState& state, const LoopVisitSchedule& schedule) const; + std::function 
<void(Scalar)> GetCodegenFnNew(const LoopRange& r, const RecursionStateNew& state, const LoopVisitSchedule& schedule) const;
+
+        PartitionList GetPartitions(const Index& loopIndex, Range loopRange, const ActiveKernelGroupList& kernels, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const;
+        void AddSplits(const Index& loopIndex, Range loopRange, const KernelPredicate& predicate, const LoopVisitSchedule& schedule, std::set<int>& splits) const;
+
+        // Get the end of the "regular" part of this loop (the part that divides by the tile size evenly)
+        int GetMainBodyLoopEnd(const RecursionState& state, const LoopVisitSchedule& schedule, const Range& loopRange) const;
+        bool LoopInEndBoundaryFragment(const RecursionState& state, const LoopVisitSchedule& schedule) const;
+
+        void DefineComputedIndexVariables(LoopIndexSymbolTable& runtimeLoopIndices, const std::vector<ScheduledKernel>& activeKernels, const LoopVisitSchedule& schedule) const;
+        LoopIndexSymbolTable GetRuntimeIndexVariables(const LoopIndexSymbolTable& runtimeLoopIndices, const LoopNest& loopNest) const;
+        void DefinePostLoopIndex(const Index& loopIndex, LoopIndexSymbolTable& runtimeLoopIndices, const LoopVisitSchedule& schedule) const;
+
+        KernelPredicate GetKernelPredicate(const ScheduledKernel& kernel, const LoopVisitSchedule& schedule) const;
+        bool IsPlacementValid(const ScheduledKernel& kernel, const LoopIndexSymbolTable& runtimeLoopIndices, const LoopVisitSchedule& schedule) const;
+        std::vector<ScheduledKernel> GetValidKernels(const ScheduledKernelGroup& kernelGroup, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const;
+
+        // abstract:
+        virtual void GenerateLoopRangeOld(const LoopRange& range, const RecursionState& state, const LoopVisitSchedule& schedule, std::function<void(Scalar)> codegenFn) const = 0;
+        virtual void GenerateLoopRangeNew(const LoopRange& range, const RecursionStateNew& state, const LoopVisitSchedule& schedule, std::function<void(Scalar)> codegenFn) const = 0;
+        virtual Scalar EmitIndexExpression(const Index& index, const IndexExpression& expr, const LoopIndexSymbolTable& indexVariables) const = 0;
+        virtual void InvokeKernel(const Kernel& kernel, const KernelPredicate& predicate, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const = 0;
+        virtual bool InvokeKernelGroup(const ScheduledKernelGroup& kernelGroup, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const = 0;
+    };
+
+    } // namespace loopnests
+} // namespace value
+} // namespace ell
diff --git a/libraries/value/include/loopnests/Range.h b/libraries/value/include/loopnests/Range.h
new file mode 100644
index 000000000..50df26e5b
--- /dev/null
+++ b/libraries/value/include/loopnests/Range.h
@@ -0,0 +1,44 @@
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Project:  Embedded Learning Library (ELL)
+//  File:     Range.h (value)
+//  Authors:  Chuck Jacobs, Kern Handa
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include <iosfwd>
+
+namespace ell
+{
+namespace value
+{
+    namespace loopnests
+    {
+        ///
+        /// A class representing the half-open interval `[begin, end)`, with an increment of `_increment` between points.
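+        /// For example (a sketch of the intended iteration semantics),
+        /// `Range(0, 10, 4)` covers the points 0, 4, and 8; the end point 10 is excluded.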
+ /// + class Range + { + public: + Range(int begin, int end, int increment = 1); + + int Begin() const; + int End() const; + int Size() const; + int Increment() const; + + private: + int _begin; + int _end; + int _increment; + }; + + bool operator==(const Range& i1, const Range& i2); + bool operator!=(const Range& i1, const Range& i2); + bool operator<(const Range& i1, const Range& i2); + std::ostream& operator<<(std::ostream& os, const Range& r); + } // namespace loopnests +} // namespace value +} // namespace ell diff --git a/libraries/value/include/loopnests/SplitIndexRange.h b/libraries/value/include/loopnests/SplitIndexRange.h new file mode 100644 index 000000000..4148c506b --- /dev/null +++ b/libraries/value/include/loopnests/SplitIndexRange.h @@ -0,0 +1,112 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: SplitIndexRange.h (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "Index.h" +#include "IndexRange.h" + +#include +#include +#include +#include + +namespace ell +{ +namespace value +{ + namespace loopnests + { + /// + /// A hierarchically-decomposed dimension range, used to represent the sizes for the different loop levels of a tiled + /// loop + /// + class SplitIndexRange + { + public: + SplitIndexRange() = default; + SplitIndexRange(const IndexRange& range); + + const Index& GetDimensionIndex() const; // The dimension index (e.g., `i`), not a concrete loop index + Range GetDimensionRange() const; // Returns the full range over the dimension + + int NumSplits() const; + int GetBegin() const; + int GetSize() const; + int GetIncrement() const; + + int GetSplitSize(int level) const; // note: returns '1' for the last level + Index GetSplitIndex(int level) const; + Range GetIndexRange(const Index& index) const; + + bool Contains(const Index& index) const; // returns `true` if the given index belongs to this domain + + bool IsLoopIndex(const Index& index) const; // a leaf node in the index tree + bool IsComputedIndex(const Index& index) const; // an interior node in the index tree + bool IsDimension(const Index& index) const; // the index corresponding to the original range + + bool IsParentOf(const Index& parent, const Index& child) const; // is 'parent' the (immediate) parent of 'child'? + bool IsChildOf(const Index& child, const Index& parent) const; // is 'child' a(n) (immediate) child of 'parent'? + bool DependsOn(const Index& index1, const Index& index2) const; // does 'index1' depend on 'index2'? 
(after a split, the parent depends on the new (leaf) indices) + + const std::vector& GetIndices() const; + std::vector GetLoopIndices() const; + std::vector GetComputedIndices() const; + std::vector GetDependentIndices(const Index& index, bool includeSelf = false) const; + std::vector GetDependentLoopIndices(const Index& index, bool includeSelf = false) const; + + bool HasParentIndex(const Index& parent) const; + + /// Get the index that was split in order to create the given index + Index GetParentIndex(const Index& parent) const; + + bool IsOuterSplitIndex(const Index& index) const; + bool IsInnerSplitIndex(const Index& index) const; + Index GetOuterSplitIndex(const Index& parent) const; + Index GetInnerSplitIndex(const Index& parent) const; + + std::vector GetAllParentIndices(const Index& index) const; + std::vector GetChildIndices(const Index& index) const; + + void Print(std::ostream& os) const; + + private: + friend class SplitIterationDomain; + + SplitIndex Split(int size); // add a split --- must be smaller than last split + SplitIndex Split(Index index, int size); // split the given index + SplitIndex SplitNode(int node, int size); // split the given index + + int GetNode(const Index& index) const; // returns the offset (index into a vector) for the given index + int GetParent(int node) const; + int GetLeftChild(int node) const; + int GetRightChild(int node) const; + int GetNthLeaf(int n) const; + int GetSmallestLeaf(int node) const; // returns the "smallest" leaf descendent of index. If index is itself a leaf, return it, else return GetSmallestLeaf(index.rightChild) + bool IsLeaf(int node) const; + bool IsInteriorNode(int node) const; + + // The indices and their properties are stored in a binary tree + // The root is the dimension index (e.g., 'i') + // Leaves are concrete loop indices + // Interior nodes are computed indices + // + // Initially, the tree has 1 node: the dimension index. + // Splitting a leaf turns it into an interior node with 2 children (the loop indices of the 2 new loops). + // Splitting an interior node is illegal. If an interior node is specified, the rightmost leaf node + // is chosen (arbitrarily). + + std::unordered_map _indexOffset; // A map from Index -> location in the below vectors for info about that index + std::vector _indices; // _indices[0] is i + std::vector _parentOffset; // parent[0] is null (-1) + std::vector _leftChildOffset; // offset to entry of the first (left) child. The right child is adjacent, so it's at (this value)+1 + std::vector _ranges; + }; + } // namespace loopnests +} // namespace value +} // namespace ell diff --git a/libraries/value/include/loopnests/SplitIterationDomain.h b/libraries/value/include/loopnests/SplitIterationDomain.h new file mode 100644 index 000000000..57c98cd3f --- /dev/null +++ b/libraries/value/include/loopnests/SplitIterationDomain.h @@ -0,0 +1,95 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: SplitIterationDomain.h (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "IterationDomain.h" +#include "SplitIndexRange.h" + +#include +#include +#include + +namespace ell +{ +namespace value +{ + namespace loopnests + { + /// + /// An `IterationDomain` where some of the dimensions may have been split (tiled). 
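+        /// For example (illustrative): splitting a dimension `i` with range [0, 16)
+        /// by 4 yields an outer loop index visiting 0, 4, 8, 12 and an inner loop index
+        /// visiting 0..3, with the original position recovered as `i = i_outer + i_inner`.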
+ /// + class SplitIterationDomain + { + public: + SplitIterationDomain(const IterationDomain& domain); + + int NumDimensions() const; + int GetDimensionSize(const Index& dimensionIndex) const; + int GetDimensionBegin(const Index& dimensionIndex) const; + + Range GetIndexRange(const Index& index) const; + + std::vector GetAllLoopIndices() const; + + const std::vector& GetIndicesForDimension(const Index& dimensionIndex) const; + std::vector GetLoopIndicesForDimension(const Index& dimensionIndex) const; + std::vector GetComputedIndicesForDimension(const Index& dimensionIndex) const; + + std::vector GetDependentIndices(const Index& index, bool includeSelf = false) const; + std::vector GetDependentLoopIndices(const Index& index, bool includeSelf = false) const; + + bool Contains(const Index& index) const; // returns `true` if the given index belongs to this domain + bool IsLoopIndex(const Index& index) const; // a leaf node in the index tree + bool IsComputedIndex(const Index& index) const; // an interior node in the index tree + bool IsDimension(const Index& index) const; // the index corresponding to the original range + + bool SameDimension(const Index& index1, const Index& index2) const; + bool IsParentOf(const Index& parent, const Index& child) const; // is 'parent' the (immediate) parent of 'child'? + bool IsChildOf(const Index& child, const Index& parent) const; // is 'child' a (immediate) child of 'parent'? + bool DependsOn(const Index& index1, const Index& index2) const; // does 'index1' depend on 'index2'? (after a split, the parent depends on the new (leaf) indices) + + bool HasParentIndex(const Index& parent) const; + + /// Get the index that was split in order to create the given index + Index GetParentIndex(const Index& parent) const; + + bool IsOuterSplitIndex(const Index& index) const; + bool IsInnerSplitIndex(const Index& index) const; + Index GetOuterSplitIndex(const Index& parent) const; + Index GetInnerSplitIndex(const Index& parent) const; + + std::vector GetAllParentIndices(const Index& index) const; + std::vector GetChildIndices(const Index& index) const; + + const SplitIndexRange& GetDimensionRange(const Index& index) const; + SplitIndexRange& GetDimensionRange(const Index& index); + + const SplitIndexRange& GetDimensionRange(int offset) const; + SplitIndexRange& GetDimensionRange(int offset); + + int NumSplits(const Index& dimensionIndex) const; + Index GetBaseIndex(const Index& index) const; + Index GetBaseIndex(int offset) const; + bool IsPrimaryDimension(const Index& index) const; + + SplitIndex Split(const Index& index, int splitSize); + + void Print(std::ostream& os) const; + + private: + int GetOffsetFromIndex(const Index& index) const; + + std::unordered_map _baseIndices; + std::vector _dimensions; + std::unordered_map _indexToOffsetMap; + }; + + } // namespace loopnests +} // namespace value +} // namespace ell diff --git a/libraries/value/src/Array.cpp b/libraries/value/src/Array.cpp new file mode 100644 index 000000000..d8f2505a1 --- /dev/null +++ b/libraries/value/src/Array.cpp @@ -0,0 +1,120 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: Array.cpp (value) +// Authors: Chuck Jacobs +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "Array.h" +#include "EmitterContext.h" + +#include + +#include +#include + +namespace ell +{ +using namespace utilities; + +namespace value +{ + 
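+    // A usage sketch for the API defined below (illustrative, and assuming an
+    // active EmitterContext): elements are addressed with a full coordinate
+    // vector, and the free function For() visits every element of the layout:
+    //
+    //     Array a = MakeArray(...); // hypothetical helper producing an Array
+    //     For(a, [&](const std::vector<Scalar>& coords) {
+    //         Scalar element = a(coords); // index with one Scalar per dimension
+    //     });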
Array::Array() = default;
+
+    Array::Array(Value value, const std::string& name) :
+        _value(value)
+    {
+        if (!_value.IsDefined() || !_value.IsConstrained())
+        {
+            throw InputException(InputExceptionErrors::invalidArgument, "Value passed in must be defined and have a memory layout");
+        }
+        if (_value.GetLayout() == utilities::ScalarLayout)
+        {
+            throw InputException(InputExceptionErrors::invalidArgument, "Value passed in must not be scalar");
+        }
+        if (!name.empty())
+        {
+            SetName(name);
+        }
+    }
+
+    Array::~Array() = default;
+    Array::Array(const Array&) = default;
+    Array::Array(Array&&) noexcept = default;
+
+    Array& Array::operator=(const Array& other)
+    {
+        if (this != &other)
+        {
+            _value = other._value;
+        }
+        return *this;
+    }
+
+    Array& Array::operator=(Array&& other)
+    {
+        if (this != &other)
+        {
+            _value = std::move(other._value);
+            other._value = Value();
+        }
+        return *this;
+    }
+
+    Value Array::GetValue() const { return _value; }
+
+    Array Array::Copy() const
+    {
+        auto newValue = Allocate(_value.GetBaseType(), _value.GetLayout());
+        newValue = _value;
+        return newValue;
+    }
+
+    Scalar Array::operator()(const std::vector<Scalar>& indices)
+    {
+        if (static_cast<int>(indices.size()) != GetValue().GetLayout().NumDimensions())
+        {
+            throw utilities::InputException(utilities::InputExceptionErrors::invalidSize);
+        }
+        Value indexedValue = GetContext().Offset(_value, indices);
+        indexedValue.SetLayout(utilities::ScalarLayout);
+
+        return indexedValue;
+    }
+
+    Scalar Array::operator()(const std::vector<Scalar>& indices) const
+    {
+        if (static_cast<int>(indices.size()) != GetValue().GetLayout().NumDimensions())
+        {
+            throw utilities::InputException(utilities::InputExceptionErrors::invalidSize);
+        }
+        Value indexedValue = GetContext().Offset(_value, indices);
+        indexedValue.SetLayout(utilities::ScalarLayout);
+
+        return Scalar(indexedValue).Copy();
+    }
+
+    size_t Array::Size() const { return _value.GetLayout().NumElements(); }
+
+    ValueType Array::Type() const { return _value.GetBaseType(); }
+
+    void Array::SetName(const std::string& name) { _value.SetName(name); }
+
+    std::string Array::GetName() const { return _value.GetName(); }
+
+    void For(Array array, std::function<void(const std::vector<Scalar>&)> fn)
+    {
+        auto layout = array.GetValue().GetLayout();
+        GetContext().For(layout, [fn = std::move(fn), &layout](std::vector<Scalar> coordinates) {
+            if (layout.NumDimensions() != static_cast<int>(coordinates.size()))
+            {
+                throw utilities::InputException(utilities::InputExceptionErrors::invalidSize);
+            }
+
+            fn(coordinates);
+        });
+    }
+
+} // namespace value
+} // namespace ell
diff --git a/libraries/value/src/CachingProvider.cpp b/libraries/value/src/CachingProvider.cpp
new file mode 100644
index 000000000..32ca06813
--- /dev/null
+++ b/libraries/value/src/CachingProvider.cpp
@@ -0,0 +1,39 @@
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Project:  Embedded Learning Library (ELL)
+//  File:     CachingProvider.cpp (value)
+//  Authors:  Kern Handa
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "CachingProvider.h"
+#include "LoopNests.h"
+
+#include "loopnests/LoopNest.h"
+
+namespace ell
+{
+namespace value
+{
+    void CachingProvider::Initialize(ViewAdapter view, utilities::MemoryShape cacheShape, utilities::DimensionOrder order, std::vector<loopnests::Index> kernelIndices, std::vector<loopnests::Index> atIndices, std::any extra)
+    {
+        _value = view;
+        _shape = cacheShape;
+        _order = order;
+        _kernelIndices = kernelIndices;
+        _atIndices = atIndices;
+        _extra = extra;
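+        // The parameters stashed here are consumed in HandleCaching (below),
+        // which first remaps _kernelIndices to their base (pre-split) indices
+        // and then dispatches to the strategy-specific HandleCachingImpl.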
} + + void CachingProvider::HandleCaching(LoopNest& loopnest) + { + for (auto& index : _kernelIndices) + { + index = loopnest.GetUnderlyingLoopNest().GetBaseIndex(index); + } + + HandleCachingImpl(loopnest); + } + +} // namespace value +} // namespace ell \ No newline at end of file diff --git a/libraries/value/src/CachingStrategies.cpp b/libraries/value/src/CachingStrategies.cpp new file mode 100644 index 000000000..cb51732e3 --- /dev/null +++ b/libraries/value/src/CachingStrategies.cpp @@ -0,0 +1,1943 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: CachingStrategies.cpp (value) +// Authors: Kern Handa, Mason Remy +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "ComputeContext.h" +#include "CppEmitterContext.h" +#include "LLVMContext.h" + +#include +#include +#include + +#include +#include +#include + +#if 1 // DEBUGGING +#include +#endif + +namespace ell +{ +namespace value +{ + using namespace utilities; + + // TODO : Generalize to machine characteristics and move out of CachingStrategies + struct RegisterCharacteristics + { + unsigned NumberOfSIMDRegisters; + unsigned NumberOfElementsPerSIMDRegister; + }; + + template + RegisterCharacteristics GetRegisterCharacteristics() + { + RegisterCharacteristics characteristics; + // Set some defaults for non-LLVMContext + characteristics.NumberOfSIMDRegisters = 8; + characteristics.NumberOfElementsPerSIMDRegister = 4; + InvokeForContext([&](LLVMContext& context) { + auto targetMachine = context.GetModuleEmitter().GetTargetMachine(); + auto fn = context.GetFunctionEmitter().GetFunction(); + auto info = targetMachine->getTargetTransformInfo(*fn); + // See https://llvm.org/doxygen/classllvm_1_1TargetTransformInfo.html for the big list of amazing things you can get from this TargetMachineInfo object + characteristics.NumberOfSIMDRegisters = info.getNumberOfRegisters(true); + auto SIMDRegisterBitWidth = info.getRegisterBitWidth(true); + + auto bytesPerElement = context.GetModuleEmitter().GetIREmitter().SizeOf(); + auto bitsPerElement = 8 * bytesPerElement; + characteristics.NumberOfElementsPerSIMDRegister = SIMDRegisterBitWidth / bitsPerElement; + }); + return characteristics; + } + + RegisterCharacteristics GetRegisterCharacteristics(ValueType type) + { + switch (type) + { + case ValueType::Void: + return GetRegisterCharacteristics(); + break; + case ValueType::Boolean: + return GetRegisterCharacteristics(); + break; + case ValueType::Char8: + return GetRegisterCharacteristics(); + break; + case ValueType::Byte: + return GetRegisterCharacteristics(); + break; + case ValueType::Int16: + return GetRegisterCharacteristics(); + break; + case ValueType::Int32: + return GetRegisterCharacteristics(); + break; + case ValueType::Int64: + return GetRegisterCharacteristics(); + break; + case ValueType::Float: + return GetRegisterCharacteristics(); + break; + case ValueType::Double: + return GetRegisterCharacteristics(); + break; + default: + throw InputException(InputExceptionErrors::invalidArgument, "Unrecognized or unsupported ValueType"); + } + } + + void CopyReduce(Scalar baseValue, Scalar cacheValue) + { + baseValue = cacheValue; + } + + void SumReduce(Scalar baseValue, Scalar cacheValue) + { + baseValue += cacheValue; + } + + // Makes a vector of all 
integers that are a power of base that are strictly less than N, ordered in decreasing value + std::vector GetTelescopingSizes(int N, int base = 2) + { + int maxPower = std::log2(N); + if (std::pow(base, maxPower) == N) + { + // If N is already a power of base, dont add it to the vector + maxPower--; + } + std::vector result; + result.reserve(maxPower); + for (int power = maxPower; power >= 0; --power) + { + result.push_back(static_cast(std::pow(base, power))); + } + return result; + } + + int RoundUpToMultiple(int input, int factor) + { + int remainder = input % factor; + return remainder > 0 ? input + (factor - remainder) : input; + } + + static inline void ValidateInputDimensionality(const Value& value, const MemoryShape& cacheSize, const DimensionOrder& order) + { + if (cacheSize.NumDimensions() != value.GetLayout().NumDimensions()) + { + throw LogicException(LogicExceptionErrors::illegalState, "Dimensionality of data-to-be-cached must match shape of requested cache size"); + } + if (cacheSize.NumDimensions() != order.NumDimensions()) + { + throw LogicException(LogicExceptionErrors::illegalState, "Dimensionality of dimension order must match shape of requested cache size"); + } + + if (value.GetLayout().NumDimensions() != 2) + { + throw LogicException(LogicExceptionErrors::notImplemented, "Only matrix source data is supported at this time"); + } + } + + // TODO move to Array slice code and generalize + Array SliceArray4_1(Array array, Scalar firstIndex) + { + Value indexedValue = array.GetValue().Offset({ firstIndex, 0, 0, 0 }); + auto currentLayout = array.GetValue().GetLayout(); + + indexedValue.SetLayout(currentLayout.GetSliceLayout(currentLayout.GetPhysicalDimension(0))); + + return indexedValue; + } + + Array SliceArray4_1_offset(Array array, Scalar firstIndex) + { + auto currentLayout = array.GetValue().GetLayout(); + auto memoryOffsets = currentLayout.GetOffset(); + const auto memoryOrder = currentLayout.GetLogicalDimensionOrder(); + + // TODO : replace memory offsets with absolute offset support + Value indexedValue = array.GetValue().Offset({ firstIndex - memoryOffsets[memoryOrder[0]], + 0 - memoryOffsets[memoryOrder[1]], + 0 - memoryOffsets[memoryOrder[2]], + 0 - memoryOffsets[memoryOrder[3]] }); + indexedValue.SetLayout(currentLayout.GetSliceLayout(currentLayout.GetPhysicalDimension(0))); + + return indexedValue; + } + Matrix SliceArray4_2(Array array, Scalar firstIndex, Scalar secondIndex) + { + Value indexedValue = array.GetValue().Offset({ firstIndex, secondIndex, 0, 0 }); + auto currentLayout = array.GetValue().GetLayout(); + + auto newLayout = currentLayout.GetSliceLayout(currentLayout.GetPhysicalDimension(0)); + newLayout = newLayout.GetSliceLayout(newLayout.GetPhysicalDimension(0)); + + indexedValue.SetLayout(newLayout); + return indexedValue; + } + Array SliceArray6_2(Array array, Scalar firstIndex, Scalar secondIndex) + { + Value indexedValue = array.GetValue().Offset({ firstIndex, secondIndex, 0, 0, 0, 0 }); + auto currentLayout = array.GetValue().GetLayout(); + + auto newLayout = currentLayout.GetSliceLayout(currentLayout.GetPhysicalDimension(0)); + newLayout = newLayout.GetSliceLayout(newLayout.GetPhysicalDimension(0)); + + indexedValue.SetLayout(newLayout); + return indexedValue; + } + Array SliceArray6_4(Array array, Scalar firstIndex, Scalar secondIndex, Scalar thirdIndex, Scalar fourthIndex) + { + Value indexedValue = array.GetValue().Offset({ firstIndex, secondIndex, thirdIndex, fourthIndex, 0, 0 }); + auto newLayout = array.GetValue().GetLayout(); + 
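+        // Peel off the four outermost physical dimensions one at a time; each
+        // GetSliceLayout call drops the current outermost dimension, so the
+        // result is a 2-D layout over the two innermost dimensions.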
newLayout = newLayout.GetSliceLayout(newLayout.GetPhysicalDimension(0)); + newLayout = newLayout.GetSliceLayout(newLayout.GetPhysicalDimension(0)); + newLayout = newLayout.GetSliceLayout(newLayout.GetPhysicalDimension(0)); + newLayout = newLayout.GetSliceLayout(newLayout.GetPhysicalDimension(0)); + + indexedValue.SetLayout(newLayout); + return indexedValue; + } + + void CopyInputCopyOutput::HandleCachingImpl(LoopNest& nest) + { + // throw LogicException(LogicExceptionErrors::notImplemented); + } + + void CopyInputNoOutput::HandleCachingImpl(LoopNest& nest) + { + ValidateInputDimensionality(_value, _shape, _order); + + // _shape is specified in logical dimensions, if _order is not canonical order then we need to reorder the layout + auto canonicalLayout = MemoryLayout{ _shape }; + auto orderedLayout = canonicalLayout.ReorderedCopy(_order); + + auto cacheName = UniqueName("copyInputNoOutputCache"); + auto cacheValue = StaticAllocate(cacheName, _value.GetBaseType(), orderedLayout, AllocateFlags::ThreadLocal); + cacheValue.SetName(cacheName); + auto cacheRef = cacheValue.Reference(); + cacheRef.SetName(cacheName + "Ref"); + + [[maybe_unused]] IntPtrT origAddress{}; + InvokeForContext([&] { origAddress = std::get(cacheRef.GetUnderlyingData())[0]; }); + + auto copyInputKernel = loopnests::Kernel(cacheName + "_Init_Kernel") + .Inputs(_value, cacheRef) + .Indices(_kernelIndices) + .Define([origAddress, orderedLayout](value::Matrix input, value::Value cacheRef, value::Scalar i, value::Scalar j) { + DEBUG_USED(origAddress); + InvokeForContext([&] { + [[maybe_unused]] auto addr = std::get(cacheRef.GetUnderlyingData())[0]; + assert(addr == origAddress); + }); + + Matrix cacheMatrix = cacheRef.Dereference(); + int M = static_cast(input.Rows()); + int N = static_cast(input.Columns()); + Scalar cacheRows = value::Min(M - i, orderedLayout.GetLogicalDimensionActiveSize(0)); + Scalar cacheColumns = value::Min(N - j, orderedLayout.GetLogicalDimensionActiveSize(1)); + + if (input.GetMatrixLayout() == Matrix::MatrixLayout::rowMajor) + { + ForRange(cacheRows, [&](Scalar i_inner) { + ForRange(cacheColumns, [&](Scalar j_inner) { + cacheMatrix(i_inner, j_inner) = input(i + i_inner, j + j_inner); + }); + }); + } + else + { + ForRange(cacheColumns, [&](Scalar j_inner) { + ForRange(cacheRows, [&](Scalar i_inner) { + cacheMatrix(i_inner, j_inner) = input(i + i_inner, j + j_inner); + }); + }); + } + auto offsetCacheValue = cacheMatrix.GetValue().Offset({ -1 * i, -1 * j }); + offsetCacheValue.SetLayout(orderedLayout); + cacheRef = offsetCacheValue.Reference(); + }); + + auto resetOffsetKernel = loopnests::Kernel(cacheName + "_Reset_Kernel") + .Inputs(cacheRef) + .Indices(_kernelIndices) + .Define([orderedLayout](value::Value cacheRef, value::Scalar i, value::Scalar j) { + Matrix cacheMatrix = cacheRef.Dereference(); + auto offsetCacheValue = cacheMatrix.GetValue().Offset({ i, j }); + offsetCacheValue.SetLayout(orderedLayout); + cacheRef = offsetCacheValue.Reference(); + }); + + auto& underlyingNest = nest.GetUnderlyingLoopNest(); + underlyingNest.AddKernel(copyInputKernel, loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::prologue, _atIndices, {} }); + underlyingNest.AddKernel(resetOffsetKernel, loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::epilogue, _atIndices, {} }); + underlyingNest.RenameVariable(_value, cacheRef, _atIndices, { copyInputKernel }); + } + + void ZeroInputReduceOutput::HandleCachingImpl(LoopNest& nest) + { + ValidateInputDimensionality(_value, _shape, _order); + auto 
canonicalLayout = MemoryLayout{ _shape };
+        auto orderedLayout = canonicalLayout.ReorderedCopy(_order);
+
+        auto cacheName = UniqueName("emptyInputCopyOutputCache");
+        auto tempValue = StaticAllocate(cacheName, _value.GetBaseType(), orderedLayout, AllocateFlags::ThreadLocal);
+        tempValue.SetName(cacheName);
+        Matrix temp(tempValue);
+        auto cacheRef = tempValue.Reference();
+        cacheRef.SetName(cacheName + "Ref");
+        [[maybe_unused]] IntPtrT origAddress{};
+        InvokeForContext<ComputeContext>([&] { origAddress = std::get(cacheRef.GetUnderlyingData())[0]; });
+
+        auto kernel3 = loopnests::Kernel(cacheName + "_Init_Kernel")
+                           .Inputs(cacheRef)
+                           .Indices(_kernelIndices)
+                           .Define([shape = orderedLayout](value::Value temp, value::Scalar i, value::Scalar j) {
+                               Matrix tempMatrix = temp.Dereference();
+
+                               value::For(tempMatrix, [&](value::Scalar i_inner, value::Scalar j_inner) {
+                                   tempMatrix(i_inner, j_inner) = Cast(0, tempMatrix.Type());
+                               });
+
+                               // Update cacheRef so that global (i, j) indices land in the correct spot in the cache
+                               auto cacheTmpOffset = tempMatrix.GetValue().Offset({ -1 * i, -1 * j });
+                               cacheTmpOffset.SetLayout(shape);
+                               temp = cacheTmpOffset.Reference();
+                           });
+
+        auto& underlyingNest = nest.GetUnderlyingLoopNest();
+        underlyingNest.AddKernel(kernel3, loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::prologue, _atIndices, {} });
+
+        auto kernel2 = loopnests::Kernel(cacheName + "_Reduce_Kernel")
+                           .Inputs(_value, cacheRef)
+                           .Indices(_kernelIndices)
+                           .Define([shape = orderedLayout](value::Matrix C, value::Value temp, value::Scalar i, value::Scalar j) {
+                               auto cacheTmpOffset = temp.Dereference().Offset({ i, j });
+                               cacheTmpOffset.SetLayout(shape);
+                               temp = cacheTmpOffset.Reference();
+                               auto cache = value::Matrix(temp.Dereference());
+
+                               int M = static_cast<int>(C.Rows());
+                               int N = static_cast<int>(C.Columns());
+                               Scalar extraM = value::Min(M - i, shape.GetLogicalDimensionActiveSize(0));
+                               Scalar extraN = value::Min(N - j, shape.GetLogicalDimensionActiveSize(1));
+
+                               ForRange(extraM, [&](Scalar i_inner) {
+                                   ForRange(extraN, [&](Scalar j_inner) {
+                                       C(i + i_inner, j + j_inner) += cache(i_inner, j_inner);
+                                   });
+                               });
+                           });
+        underlyingNest.AddKernel(kernel2, loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::epilogue, _atIndices, {} });
+        underlyingNest.RenameVariable(_value, cacheRef, _atIndices, { kernel2, kernel3 });
+    }
+
+    void BLASTCopy::HandleCachingImpl(LoopNest& nest)
+    {
+        /* BLAS T COPY:
+           suppose the input matrix is M x N, the cache size is M' x N', and stripeSize = 4;
+           then we cache successive M' x 4 row-major submatrices of the input matrix:
+
+           0  1  2  3   16 17 18 19        0  1  2  3  4  5  6  7  8  9  10 11 12 13 14 15 ...
+           4  5  6  7   20 21 22 23   ->   16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
+           8  9  10 11  24 25 26 27
+           12 13 14 15  28 29 30 31
+
+           Need 2 layers of caching:
+               at the M x N level, build up the cache values
+               at the stripeSize level, set up the pointer and memory layout
+        */
+
+        ValidateInputDimensionality(_value, _shape, _order);
+
+        // get the stripe size, stripe splitting index, and boundary handling mode from the extra parameters
+        auto extraParams = std::any_cast<std::tuple<int, Index, BoundaryConditionHandling>>(_extra);
+        int stripeSize;
+        Index stripeSplitIndex;
+        BoundaryConditionHandling boundaryHandling;
+        std::tie(stripeSize, stripeSplitIndex, boundaryHandling) = extraParams;
+
+        if (boundaryHandling == BoundaryConditionHandling::ZeroPadding && _shape[1] % stripeSize != 0)
+        {
+            // To avoid an odd repeated edge case, enforce that the number of cache columns is a multiple of the stripe size
+            // so the base 3D cache view can represent the full cache
+            throw InputException(InputExceptionErrors::invalidSize, "The number of cache columns must be a multiple of the cache stripe size");
+        }
+
+        // Cache structure:
+        // Lift the 2D submatrix into a 3D array to set up the cache simply
+        // The first dimension identifies which cached column block to use
+        // The remaining two dimensions identify the element inside of that cached submatrix block
+        // Index mapping: input ( i, j )    -> cache ( j / stripeSize, i, j % stripeSize )
+        //                cache ( i, j, k ) -> input ( j, i * stripeSize + k )
+
+        // Boundary handling
+        // There are 4 boundary scenarios (possibly all 4 can happen in a single input matrix + cache size combination
+        // while iterating over the matrix):
+        //         |-------N-------|
+        //         |----N'---|
+        //    _  _ *---------------*
+        //    |  | |         |     |
+        //    |  M'|    1    |  2  |
+        //    |  | |         |     |
+        //    M  _ |_________|_____|
+        //    |    |    3    |  4  |
+        //    |    |         |     |
+        //    _    *---------------*
+
+        // 1 : The cache has exactly as many rows and columns as the input matrix chunk
+        //     - This is the simple case; leave the cache as { M' x N' }
+        // 2 : The cache has more columns than the input matrix chunk (but not more rows)
+        //     - re-view the cache to be { M' x remainingColumns }
+        // 3 : The cache has more rows than the input matrix chunk (but not more columns)
+        //     - re-view the cache to be { remainingRows x N' }
+        // 4 : The cache has more rows and columns than the input matrix chunk
+        //     - re-view the cache to be { remainingRows x remainingColumns }
+        // Note: it is assumed that the input matrix is stepped over in splits based on the
+        // cache size given, so the cache can never be smaller than the input matrix chunk
+
+        // Since the matrix and cache sizes are known ahead of time, we can compute all of the boundary
+        // condition layouts that are needed:
+        //     remainingRows = M % M'
+        //     remainingColumns = N % N'
+
+        auto inputMatrix = Matrix(_value);
+        int inputRows = inputMatrix.Rows();
+        int inputCols = inputMatrix.Columns();
+        int remainingRows = inputRows % _shape[0];
+        int remainingCols = inputCols % _shape[1];
+        int roundedRemainingCols = RoundUpToMultiple(remainingCols, stripeSize);
+        // we don't need to round up remainingRows, since the stripe size only applies to columns in BLASTCopy
+
+        auto generateTCOPYCacheLayout = [stripeSize](int rows, int cols) {
+            auto cacheDimOrder = DimensionOrder{ 0, 1, 2 };
+            auto liftedShape = MemoryShape{ cols / stripeSize, rows, stripeSize };
+            auto cacheLayout = MemoryLayout{ liftedShape, cacheDimOrder };
+            return cacheLayout;
+        };
+        auto generateTCOPYCacheViewLayout = [stripeSize](int rows, int cols) {
+            auto cacheViewLayout = MemoryLayout{ { rows, stripeSize }, RowMajorMatrixOrder };
+            return cacheViewLayout;
+        };
+
+        auto
baseCacheLayout = generateTCOPYCacheLayout(_shape[0], _shape[1]); // The non-boundary-case 3D lifted shape + auto baseCacheViewLayout = generateTCOPYCacheViewLayout(_shape[0], _shape[1]); + + // "Boundary" condition 1 is the general case (i.e. non-boundary case) + auto boundaryConditionCacheLayout1 = baseCacheLayout; + auto cacheViewLayout1 = baseCacheViewLayout; + + // Boundary condition 2, re-view to M' x remainingColumns + auto boundaryConditionCacheLayout2 = generateTCOPYCacheLayout(_shape[0], roundedRemainingCols); + auto cacheViewLayout2 = generateTCOPYCacheViewLayout(_shape[0], roundedRemainingCols); + + // Boundary condition 3, re-view to remainingRows x N' + auto boundaryConditionCacheLayout3 = generateTCOPYCacheLayout(remainingRows, _shape[1]); + auto cacheViewLayout3 = generateTCOPYCacheViewLayout(remainingRows, _shape[1]); + + // Boundary condition 4, re-view to remainingRows x remainingColumns + auto boundaryConditionCacheLayout4 = generateTCOPYCacheLayout(remainingRows, roundedRemainingCols); + auto cacheViewLayout4 = generateTCOPYCacheViewLayout(remainingRows, roundedRemainingCols); + + auto cacheName = UniqueName("BLASTCopyCache"); + _rawCache = StaticAllocate(cacheName, _value.GetBaseType(), baseCacheLayout); + Array liftedCache(_rawCache); + + auto cacheRef = _rawCache.Reference(); + cacheRef.SetLayout(baseCacheViewLayout); + cacheRef.SetName(cacheName + "_Ref"); + + auto cacheFillKernel = loopnests::Kernel(cacheName + "_Fill_Cache_Kernel") + .Inputs(_value, liftedCache) + .Indices(_kernelIndices) + .Define([remainingRows, remainingCols, stripeSize, shape = _shape, inputRows, inputCols, boundaryConditionCacheLayout1, boundaryConditionCacheLayout2, boundaryConditionCacheLayout3, boundaryConditionCacheLayout4](value::Matrix input, value::Array cache, value::Scalar i, value::Scalar j) { + // We may need to re-view the cache to a smaller layout if we have less + // data to cache than we have available space in the cache. 
+ // If we re-view the cache then we can keep the smaller cached data + // physically contiguous while still using the same looping APIs + Scalar kernelRemainingRows = inputRows - i; + Scalar kernelRemainingCols = inputCols - j; + Scalar notEnoughRows = shape[0] > kernelRemainingRows; + Scalar notEnoughCols = shape[1] > kernelRemainingCols; + ZeroMemory(cache); + + // Generate the cache fill loop in a parameterized lambda so we can emit the different layout versions independently + auto cacheFillLoop = [&](MemoryLayout cacheFillLayout, int rows, int cols) { + auto cacheFillView = cache.GetValue(); + cacheFillView.SetLayout(cacheFillLayout); + auto reViewedCache = Array(cacheFillView); + + ForRange(Scalar{ 0 }, Scalar{ cols / stripeSize }, [&](Scalar stripeColumnChunk) { + ForRange(Scalar{ 0 }, Scalar{ rows }, [&](Scalar row) { + ForRange(Scalar{ 0 }, Scalar{ stripeSize }, [&](Scalar stripeColumn) { + reViewedCache({ stripeColumnChunk, row, stripeColumn }) = input(i + row, j + stripeColumnChunk * stripeSize + stripeColumn); + }); + }); + }); + auto finalColumnChunk = Scalar{ cols / stripeSize }; + ForRange(Scalar{ 0 }, Scalar{ rows }, [&](Scalar row) { + ForRange(Scalar{ 0 }, Scalar{ cols % stripeSize }, [&](Scalar stripeColumn) { + reViewedCache({ finalColumnChunk, row, stripeColumn }) = input(i + row, j + finalColumnChunk * stripeSize + stripeColumn); + }); + }); + }; + + // Emit all of the different loops individually since the cache layouts are set at emit-time + If(notEnoughRows, + [&]() { + If(notEnoughCols, + [&]() { + // Boundary condition 4 + cacheFillLoop(boundaryConditionCacheLayout4, remainingRows, remainingCols); + }) + .Else( + [&]() { + // Boundary condition 3 + cacheFillLoop(boundaryConditionCacheLayout3, remainingRows, shape[1]); + }); + }) + .ElseIf(notEnoughCols, + [&]() { + // Boundary condition 2 + cacheFillLoop(boundaryConditionCacheLayout2, shape[0], remainingCols); + }) + .Else( + [&]() { + // Boundary condition 1 + cacheFillLoop(boundaryConditionCacheLayout1, shape[0], shape[1]); + }); + }); + + auto& underlyingNest = nest.GetUnderlyingLoopNest(); + underlyingNest.AddKernel(cacheFillKernel, loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::prologue, _atIndices, {} }); + + std::vector viewInitKernelIndices; + viewInitKernelIndices.assign(_kernelIndices.begin(), _kernelIndices.end()); + viewInitKernelIndices.push_back(stripeSplitIndex); + auto viewInitKernel = loopnests::Kernel(cacheName + "_View_Init_Kernel") + .Inputs(liftedCache, cacheRef) + .Indices(viewInitKernelIndices) + .Define([shape = _shape, stripeSize, inputRows, inputCols, cacheViewLayout1, cacheViewLayout2, cacheViewLayout3, cacheViewLayout4, boundaryConditionCacheLayout1, boundaryConditionCacheLayout2, boundaryConditionCacheLayout3, boundaryConditionCacheLayout4](value::Array cache, value::Value cacheRef, value::Scalar i, value::Scalar j, value::Scalar jStripe) { + // To set up the view for the kernel to use, we need to set up the cacheRef reference + // so that a kernel indexing with (i, j) winds up in the right spot, pointing into the + // cached row-major submatrix that is the (j / stripeSize, ALL, ALL) slice of the cache array + + // We may need to re-view the cache view to a smaller layout if we are in one of the boundary conditions + Scalar remainingRows = inputRows - i; + Scalar remainingCols = inputCols - j; + Scalar notEnoughRows = shape[0] > remainingRows; + Scalar notEnoughCols = shape[1] > remainingCols; + + auto cacheViewFn = [&](MemoryLayout cacheLayout, MemoryLayout 
viewLayout) { + // Re-View the cache so we can index into the correct cached stripe + auto cacheView = cache.GetValue(); + cacheView.SetLayout(cacheLayout); + auto cacheStripe = jStripe % shape[1]; // If N > N', make sure we index into the re-initialized cache position + auto indexedCacheView = cacheView.Offset({ cacheStripe / stripeSize, 0, 0 }); + + // Re-View the indexed cache as a 2-D matrix so we can position the offset pointer for use in the inner kernels + indexedCacheView.SetLayout(viewLayout); + auto offsetIndexedCacheView = indexedCacheView.Offset({ -1 * i, -1 * j }); + offsetIndexedCacheView.SetLayout(viewLayout); + cacheRef.SetLayout(viewLayout); + cacheRef = offsetIndexedCacheView.Reference(); + }; + + // Emit all of the views and offsets individually since the cache layouts are set at emit-time + If(notEnoughRows, + [&]() { + If(notEnoughCols, + [&]() { + // Boundary condition 4 + cacheViewFn(boundaryConditionCacheLayout4, cacheViewLayout4); + }) + .Else( + [&]() { + // Boundary condition 3 + cacheViewFn(boundaryConditionCacheLayout3, cacheViewLayout3); + }); + }) + .ElseIf(notEnoughCols, + [&]() { + // Boundary condition 2 + cacheViewFn(boundaryConditionCacheLayout2, cacheViewLayout2); + }) + .Else( + [&]() { + // Boundary condition 1 + cacheViewFn(boundaryConditionCacheLayout1, cacheViewLayout1); + }); + }); + underlyingNest.AddKernel(viewInitKernel, loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::prologue, { stripeSplitIndex }, {} }); + underlyingNest.RenameVariable(_value, cacheRef, _atIndices, { cacheFillKernel, viewInitKernel }); + } + + // namespace value + + + + + + // Helper class to hold a binary tree with a MemoryLayout at each leaf node corresponding to a different + // boundary condition and with a number of levels equal to the number of dimensions in a cache layout + class BoundaryConditionMemoryLayoutHelper + { + // A multi-dimensional cache memory layout with N dimensions can have 2^N different boundary condition + // layouts since each dimension of the cache memory layout could either be in a: + // - general case - the number of elements in that dimension in this particular slice in the + // cache layout is less than or equal to the number of elements remaining in the input + // for that dimension + // - boundary case - the number of elements in that dimension in this particular slice in the cache + // layout is greater than the number of elements remaining in the input for that dimension + // + // We must generate two different types of things for these cases: + // 1) a set of memory layouts for each possible scenario - at emit time we can know all of the general + // or boundary cases that we will hit. We need the shape of the input region, the input fill region, + // the cache layout, and the cache fill layout. 
+ // 2) a nested set of emitted If/Else switches that will switch on the remaining size of the input for + // each cache dimension and call a given lambda with the appropriate boundary condition memory layout + public: + BoundaryConditionMemoryLayoutHelper(MemoryShape inputShape, + const std::vector& orderedIndexSizes, + const std::vector& logicalDimensionMapping, + const std::vector& splitIndexScaleFactors, + unsigned cacheFillThresholdIdxOffset, + unsigned cacheViewThresholdIdxOffset) : + _inputShape(inputShape), + _orderedIndexSizes(orderedIndexSizes), + _logicalDimensionMapping(logicalDimensionMapping), + _splitIndexScaleFactors(splitIndexScaleFactors), + _cacheFillThresholdIdxOffset(cacheFillThresholdIdxOffset), + _cacheViewThresholdIdx(cacheViewThresholdIdxOffset) + { + if (orderedIndexSizes.size() != logicalDimensionMapping.size()) + { + throw InputException(InputExceptionErrors::invalidSize, "Need to provide the same number of ordered index sizes as logical dimension mappings"); + } + if (orderedIndexSizes.size() != splitIndexScaleFactors.size()) + { + throw InputException(InputExceptionErrors::invalidSize, "Need to provide the same number of ordered index sizes as split index scale factors mappings"); + } + if (orderedIndexSizes.empty()) + { + throw InputException(InputExceptionErrors::invalidSize, "Need to provide at least one index size"); + } + FillTree(); + } + + template + void EmitBoundarySwitches(const std::vector& compositeIndices, Fn&& func) const + { + unsigned inputLogicalDimensionCount = _inputShape.NumDimensions(); + if (compositeIndices.size() != inputLogicalDimensionCount) + { + throw InputException(InputExceptionErrors::invalidSize, "Need to provide one scalar index value per logical dimension in the input"); + } + + // Compute how many elements are remaining in each logical dimension + std::vector remainingElements; + remainingElements.reserve(inputLogicalDimensionCount); + for (size_t logicalDimension = 0; logicalDimension < inputLogicalDimensionCount; ++logicalDimension) + { + Scalar remaining = _inputShape[logicalDimension] - compositeIndices[logicalDimension]; + remainingElements.push_back(remaining); + } + + // Determine which levels of the tree are going to be in a boundary condition based on the remaining elements + std::vector isBoundaryCase; + isBoundaryCase.reserve(_logicalDimensionMapping.size()); + for (unsigned idx = 0; idx < _logicalDimensionMapping.size(); ++idx) + { + isBoundaryCase.push_back(_orderedIndexSizes[idx] > remainingElements[_logicalDimensionMapping[idx]]); + } + + // Run a depth-first traversal of the tree to emit the nested If/Else switches to handle all the boundary conditions + RecursiveEmitHelper(_tree, isBoundaryCase, 0, func); + } + + private: + struct BoundaryConditionTreeNode + { + BoundaryConditionTreeNode(const std::vector& logicalDimensionSizes) : + cacheLogicalDimensionSizes(logicalDimensionSizes.size()), + cacheFillLogicalDimensionSizes(logicalDimensionSizes.size()), + subLogicalDimensionSizes(logicalDimensionSizes) + {} + BoundaryConditionTreeNode(const std::shared_ptr& parent, + int newCacheSize, + int newInputSize, + int logicalDimension, + const std::vector& splitIndexScaleFactors, + bool isCacheFillIdx) : + cacheSizes(parent->cacheSizes), + cacheFillSizes(parent->cacheFillSizes), + cacheLogicalDimensionSizes(parent->cacheLogicalDimensionSizes), + cacheFillLogicalDimensionSizes(parent->cacheFillLogicalDimensionSizes), + subLogicalDimensionSizes(parent->subLogicalDimensionSizes) + { + cacheSizes.push_back(newCacheSize); 
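+                // Record this level's cache extent; the first split seen for a logical
+                // dimension (below) also pins that dimension's extent in the input region
+                // covered by the cache, while deeper splits only subdivide it.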
+ + if (cacheLogicalDimensionSizes[logicalDimension] == 0) + { + cacheLogicalDimensionSizes[logicalDimension] = newInputSize; + } + + if (isCacheFillIdx) + { + cacheFillSizes.push_back(newCacheSize); + if (cacheFillLogicalDimensionSizes[logicalDimension] == 0) + { + cacheFillLogicalDimensionSizes[logicalDimension] = newInputSize; + } + } + subLogicalDimensionSizes[logicalDimension] = newInputSize; + ComputeShape(splitIndexScaleFactors); + } + + void ComputeShape(const std::vector& splitIndexScaleFactors) + { + // Create a cache shape for this level + std::vector shardSizes; + shardSizes.reserve(cacheSizes.size()); + + std::vector fillShardSizes; + fillShardSizes.reserve(cacheFillSizes.size()); + unsigned fillOffset = cacheSizes.size() - cacheFillSizes.size(); + for (unsigned idx = 0; idx < cacheSizes.size(); ++idx) + { + int shardSize = cacheSizes[idx] / splitIndexScaleFactors[idx]; + if (cacheSizes[idx] % splitIndexScaleFactors[idx] != 0) + { + // Account for partial shards + shardSize++; + } + shardSizes.push_back(shardSize); + if (idx >= fillOffset) + { + fillShardSizes.push_back(shardSize); + } + } + + cacheShape = { shardSizes }; + cacheFillShape = { fillShardSizes }; + inputRegionShape = { cacheLogicalDimensionSizes }; + inputRegionFillShape = { cacheFillLogicalDimensionSizes }; + } + + // Use shared_ptr instead of unique_ptr since we need to be able to copy these helper objects into multiple lambdas + std::shared_ptr generalCase; + std::shared_ptr boundaryCase; + std::vector cacheSizes; + std::vector cacheFillSizes; + std::vector cacheLogicalDimensionSizes; // logical dimension sizes represented by the full cache + std::vector cacheFillLogicalDimensionSizes; // logical dimension sizes represented by the fill view of the cache + std::vector subLogicalDimensionSizes; // logical input dimension sizes represented by this portion of the tree + MemoryShape cacheShape; + MemoryShape cacheFillShape; + MemoryShape inputRegionShape; + MemoryShape inputRegionFillShape; + }; + + void FillTree() + { + int logicalDimensionCount = _inputShape.NumDimensions(); + std::vector baseLogicalDimensionCacheSizes; + baseLogicalDimensionCacheSizes.reserve(logicalDimensionCount); + for (int logicalDimension = 0; logicalDimension < logicalDimensionCount; ++logicalDimension) + { + baseLogicalDimensionCacheSizes.push_back(_inputShape[logicalDimension]); + } + _tree = std::make_shared(baseLogicalDimensionCacheSizes); + std::queue> activeNodes; + activeNodes.push(_tree); + + // Compute the minimum sizes for remainders / boundary cases for each level + // A remainder / boundary case needs to cover an integer number of the splits that + // occur later on in that logical dimension, so we compute the possible sizes for + // the remainders in each cache dimension up front + // E.g. if we have a cache that is 4x4, with a split of 2 in the column dimension, + // ordered at {1,0,1} so that the cache size is {2,4,2}, but our input is + // 4x3, we need to zero-pad the innermost dimension since we need to keep + // an integer number of them, and thus behave as though it's still a 4x4 + // input region and we have a {2,4,2} cache. 
+ // If instead we have 4x2 input, we can reduce the outermost dimension shard + // count by 1 and still cover an integer number of the inner splits with a + // {1,4,2} cache + std::vector remainderMinimumSizes(_logicalDimensionMapping.size()); + std::map logicalDimensionWorkingSizes; + // loop from the innermost split dimension to the outermost + for (unsigned idx = _logicalDimensionMapping.size() - 1; _logicalDimensionMapping.size() > idx; --idx) + { + int logicalDimension = _logicalDimensionMapping[idx]; + auto workingSizeIter = logicalDimensionWorkingSizes.find(logicalDimension); + int size = _orderedIndexSizes[idx]; + if (workingSizeIter == logicalDimensionWorkingSizes.end()) + { + remainderMinimumSizes[idx] = 1; + } + else + { + remainderMinimumSizes[idx] = logicalDimensionWorkingSizes[logicalDimension]; + } + logicalDimensionWorkingSizes[logicalDimension] = size; + } + + for (unsigned idx = 0; idx < _logicalDimensionMapping.size(); ++idx) + { + int logicalDimension = _logicalDimensionMapping[idx]; + int cacheSplitSize = _orderedIndexSizes[idx]; + size_t numNodesInLevel = activeNodes.size(); + for (unsigned nodeIdx = 0; nodeIdx < numNodesInLevel; ++nodeIdx) + { + auto currentNode = activeNodes.front(); + activeNodes.pop(); + + int baseLogicalInputSize = currentNode->subLogicalDimensionSizes[logicalDimension]; + int baseRemainderSize = baseLogicalInputSize % cacheSplitSize; + + // round up the logical input size based on the remainder minimum size for this dimension + int logicalInputSize = RoundUpToMultiple(baseLogicalInputSize, remainderMinimumSizes[idx]); + int remainderSize = logicalInputSize % cacheSplitSize; + + if (idx > _cacheViewThresholdIdx || remainderSize == 0) + { + // We can't reshape the cache view, so if we're inside of the view portion + // of the cache we need to zero-pad + // As a half-step to keep the cache as dense as possible when we're in a boundary condition, + // we let the first cacheViewThresholdIdx be shrunk for the purposes of creating + // the cache layout, since this idx is definitely in the most-major dimension of + // the cache view as it is the farthest out. 
Therefore we only consider if + // idx > _cacheViewThresholdIdx instead of idx >= _cacheViewThresholdIdx + + // Additionally, if after rounding up the logical input size we've produced + // an integer multiple of cacheSplitSize, we need to generate a boundary condition + // branch with the full cacheSplitSize as the cache size, but with the base remainder + // size as the input size + remainderSize = cacheSplitSize; + } + + if (cacheSplitSize <= logicalInputSize) + { + currentNode->generalCase = std::make_shared(currentNode, + cacheSplitSize, + cacheSplitSize, + logicalDimension, + _splitIndexScaleFactors, + idx >= _cacheFillThresholdIdxOffset); + activeNodes.push(currentNode->generalCase); + } + if (baseRemainderSize > 0) + { + currentNode->boundaryCase = std::make_shared(currentNode, + remainderSize, + baseRemainderSize, + logicalDimension, + _splitIndexScaleFactors, + idx >= _cacheFillThresholdIdxOffset); + activeNodes.push(currentNode->boundaryCase); + } + } + } + } + + template + void RecursiveEmitHelper(const std::shared_ptr& currentNode, const std::vector& isBoundaryCase, unsigned currentIdx, Fn&& func) const + { + if (currentNode->generalCase == nullptr && currentNode->boundaryCase == nullptr) + { + // Base case, call the given function with our cache shape for this leaf of the tree + func(currentNode->inputRegionShape, currentNode->inputRegionFillShape, currentNode->cacheShape, currentNode->cacheFillShape); + } + else if (currentNode->generalCase == nullptr) + { + // we only have a boundary case, so don't emit an If/Else but instead just recurse to the boundary case + RecursiveEmitHelper(currentNode->boundaryCase, isBoundaryCase, currentIdx + 1, func); + } + else if (currentNode->boundaryCase == nullptr) + { + // we only have a general case, so don't emit an If/Else but instead just recurse to the general case + RecursiveEmitHelper(currentNode->generalCase, isBoundaryCase, currentIdx + 1, func); + } + else + { + // We have both a general case and a boundary case, so emit an If/Else switch to emit both cases + If(isBoundaryCase[currentIdx], [&] { + RecursiveEmitHelper(currentNode->boundaryCase, isBoundaryCase, currentIdx + 1, func); + }).Else([&] { + RecursiveEmitHelper(currentNode->generalCase, isBoundaryCase, currentIdx + 1, func); + }); + } + } + + MemoryShape _inputShape; + std::vector _orderedIndexSizes; + std::vector _logicalDimensionMapping; + std::vector _splitIndexScaleFactors; + unsigned _cacheFillThresholdIdxOffset; + unsigned _cacheViewThresholdIdx; + std::shared_ptr _tree; // Use shared_ptr's since we need to be able to copy these helper objects into multiple lambdas + }; + + std::pair ComputeCacheView(MemoryLayout cacheLayout, + const std::vector& cacheLogicalDimensionMapping, + int logicalDimensionCount) + { + std::vector cacheViewSizes; + cacheViewSizes.reserve(logicalDimensionCount); + std::vector dimensionOrdering(logicalDimensionCount, -1); // initialize the dimensionOrdering since we will be filling it out-of-order + + // Iterate the cacheLogicalDimensionMapping from back to front in order to walk the shape + // from the inner splits to the outer splits + int previousLogicalDimension = -1; // -1 == sentinel uninitialized value, not any of the logical dimensions + unsigned cacheViewThresholdIdx = cacheLogicalDimensionMapping.size() - 1; // index in the logical dimension mapping / split indices vector to start the cache view at + unsigned currentDimensionOrderingIdx = dimensionOrdering.size() - 1; + std::set seenLogicalDimensions; + std::map 
logicalDimensionToCacheViewSize; // maps from logical dimension to the cache view size + for (unsigned idx = cacheLogicalDimensionMapping.size() - 1; cacheLogicalDimensionMapping.size() > idx; --idx) + { + int logicalDimension = cacheLogicalDimensionMapping[idx]; + if (previousLogicalDimension != logicalDimension) + { + // This is different from the previous logical dimension that we were collapsing + if (seenLogicalDimensions.find(logicalDimension) != seenLogicalDimensions.end()) + { + // If we've seen this logical dimension before and we aren't currently collapsing it then this is a repeat + // that prompts us to stop building up the cache view + break; + } + else + { + // this is the first time we've seen this dimension, so insert it into the dimension ordering outside of the + // dimensions we've already seen + dimensionOrdering[currentDimensionOrderingIdx--] = logicalDimension; + seenLogicalDimensions.insert(logicalDimension); + previousLogicalDimension = logicalDimension; + } + } + cacheViewThresholdIdx = idx; + } + + // Now we know the sizes of all the dimensions in the view and we need to fill the remainder of + // the dimension ordering with any logical dimensions in the input that aren't part of the cache view. + // Any logical dimensions that aren't part of the cache view have a cache view size of 1, and thus + // the ordering of them doesn't really matter since we'll re-view the cache before changing the + // index in that dimension we examine + for (int logicalDimension = 0; logicalDimension < logicalDimensionCount; ++logicalDimension) + { + // Set the size to 1, for logical dimensions that are also in the view, we will multiply + // this value by the shard sizes in the view + logicalDimensionToCacheViewSize[logicalDimension] = 1; + if (seenLogicalDimensions.find(logicalDimension) == seenLogicalDimensions.end()) + { + // This dimension isn't part of the view, so insert it in the dimension ordering + // outside of the dimensions that are in the view + dimensionOrdering[currentDimensionOrderingIdx--] = logicalDimension; + } + } + + // Now we need to build up the sizes of the view dimensions by taking the product of cache dimension + // sizes within each logical dimension after the point in the cache hierarchy where the view starts. + // We take the product of the sizes because the active sizes at each cache dimension represent the + // number of shards in that split dimension, not necessarily element count in that logical dimension. 
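The back-to-front walk above collapses immediately repeated logical dimensions and stops the view at the first non-adjacent repeat. A standalone sketch of just that collapse-and-stop logic, with a hypothetical `FindViewThreshold` helper over plain ints (a non-empty mapping is assumed):

```cpp
#include <cassert>
#include <cstddef>
#include <set>
#include <vector>

// Walk a cache's logical-dimension mapping from the innermost split outward,
// collapsing immediate repeats; stop when an already-seen dimension reappears.
// Returns the index where the contiguous "view" portion of the cache starts.
size_t FindViewThreshold(const std::vector<int>& mapping)
{
    std::set<int> seen;
    int previous = -1; // sentinel: no dimension seen yet
    size_t threshold = mapping.size() - 1;
    for (size_t i = mapping.size(); i-- > 0;)
    {
        int dim = mapping[i];
        if (dim != previous)
        {
            if (seen.count(dim) > 0)
            {
                break; // non-adjacent repeat: the view cannot extend further out
            }
            seen.insert(dim);
            previous = dim;
        }
        threshold = i;
    }
    return threshold;
}

int main()
{
    // { 0, 0, 1, 1, 1, 0, 0 } collapses to { 0, 1, 0 }; the view breaks at the
    // inner { 1, 0 }, i.e. at index 2 of the uncollapsed mapping.
    assert(FindViewThreshold({ 0, 0, 1, 1, 1, 0, 0 }) == 2);
}
```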
+    // In the innermost split in each logical dimension the shards are all of size 1 and therefore
+    // shard count == element count
+    for (unsigned idx = cacheViewThresholdIdx; idx < cacheLogicalDimensionMapping.size(); ++idx)
+    {
+        int logicalDimension = cacheLogicalDimensionMapping[idx];
+        logicalDimensionToCacheViewSize[logicalDimension] *= cacheLayout.GetActiveSize(idx);
+    }
+
+    // Now that we have the full view dimension ordering and a map from logical dimension to view size,
+    // fill out the ordered view sizes vector
+    for (unsigned idx = 0; idx < dimensionOrdering.size(); ++idx)
+    {
+        int logicalDimension = dimensionOrdering[idx];
+        cacheViewSizes.push_back(logicalDimensionToCacheViewSize[logicalDimension]);
+    }
+
+    return { MemoryLayout{ MemoryShape{ cacheViewSizes }, DimensionOrder{ dimensionOrdering } }, cacheViewThresholdIdx };
+}
+
+void GeneralCachingStrategy::HandleCachingImpl(LoopNest& nest)
+{
+    // General caching strategy:
+    // Given:
+    // - input value
+    // - top level indices that the input uses
+    // - name for the cache
+    // - size of the cache to use in # of elements
+    // - # of elements to cache at a time ( < size of cache for progressive caching, > size of cache is an error)
+    // - Input / InputOutput / Output designation
+    // - Reduce function operating on individual Scalars
+    //
+    // Set up 3-4 kernels:
+    // - Cache flushing kernel
+    // - Cache filling kernel if Input/InputOutput
+    // - Cache viewing kernel (based on the shape of the input value)
+    // - Cache reduce kernel if InputOutput/Output
+
+    auto extraParams = std::any_cast<std::tuple<value::ArgumentType,
+                                                std::string,
+                                                size_t,
+                                                size_t,
+                                                std::function<void(value::Scalar, value::Scalar)>,
+                                                bool>>(_extra);
+    value::ArgumentType argType;
+    std::string baseName;
+    size_t maxCacheElts;
+    size_t fillThreshold; // fillThreshold <= maxCacheElts
+    std::function<void(value::Scalar, value::Scalar)> reduceFunction;
+    bool accumulateReduce;
+    std::tie(argType,
+             baseName,
+             maxCacheElts,
+             fillThreshold,
+             reduceFunction,
+             accumulateReduce) = extraParams;
+
+    // Read target machine characteristics for number of SIMD registers and the size of the registers
+    RegisterCharacteristics registerCharacteristics = GetRegisterCharacteristics(_value.GetBaseType());
+
+    // Determine kernels needed
+    bool useFillKernel = (argType == value::ArgumentType::Input || argType == value::ArgumentType::InputOutput);
+    bool useViewKernel = true; // always include view kernel for simplicity for now, even if the re-viewing winds up being redundant
+    bool useReduceKernel = (argType == value::ArgumentType::Output || argType == value::ArgumentType::InputOutput);
+
+    size_t bufferAlignment = 16 * sizeof(float);
+
+    InvokeForContext<CppEmitterContext>([&] {
+        // TODO : Support buffer alignment in CppEmitterContext
+        bufferAlignment = 0;
+    });
+
+    auto inputArray = Array(_value);
+    int logicalDimensionCount = _value.GetLayout().NumDimensions();
+    int compositeIndexCount = _kernelIndices.size();
+    auto& underlyingNest = nest.GetUnderlyingLoopNest();
+
+    const auto& loopSequence = underlyingNest.GetLoopSequence();
+    std::vector<loopnests::Index> orderedIndices;
+    for (const auto& index : loopSequence)
+    {
+        const auto& dimensionIndex = underlyingNest.GetDimensionRange(index).GetDimensionIndex();
+        auto indexIter = std::find(_kernelIndices.begin(), _kernelIndices.end(), dimensionIndex);
+        if (indexIter != _kernelIndices.end())
+        {
+            orderedIndices.push_back(index);
+        }
+    }
+
+    // Ensure we have some indices
+    if (orderedIndices.empty())
+    {
+        throw InputException(InputExceptionErrors::invalidSize, "Don't have any indices relevant to this input for this loop nest");
+    }
+
+    // If there are no _atIndices specified, default to the outermost orderedIndices index
+    if (_atIndices.empty())
+    {
+        _atIndices.push_back(orderedIndices.front());
+    }
+
+    // Compute the mapping between the orderedIndices list and the logical input dimensions
+    std::vector<int> logicalDimensionMapping;
+    logicalDimensionMapping.reserve(orderedIndices.size());
+
+    // Determine the size for each split for each logical dimension
+    // We only care about the split indices that are passed in as part of
+    // orderedIndices, so instead of recording the sizes of those indices,
+    // we record the size of the full index range followed by the increments
+    // of each of the orderedIndices
+    std::map<int, std::vector<int>> logicalDimensionSplitSizes;
+    for (int logicalDimension = 0; logicalDimension < logicalDimensionCount; ++logicalDimension)
+    {
+        logicalDimensionSplitSizes[logicalDimension].push_back(_value.GetLayout().GetActiveSize(logicalDimension));
+    }
+
+    // Determine the increments for each split index in the orderedIndices
+    // The cache dimensions all operate with logical increments of 1, so when we are mapping between input space and cache space
+    // we need to scale appropriately by the split index increments for each split index
+    std::vector<int> orderedIndexIncrements;
+    orderedIndexIncrements.reserve(orderedIndices.size());
+
+    for (const auto& index : orderedIndices)
+    {
+        // Compute the logical dimension mapping
+        const auto& dimensionIndex = underlyingNest.GetDimensionRange(index).GetDimensionIndex();
+        auto indexIter = std::find(_kernelIndices.begin(), _kernelIndices.end(), dimensionIndex);
+        // Here we assume:
+        // - _kernelIndices is a vector or similar, so (iterator - begin) == idx of iterator
+        // - _kernelIndices is arranged in logical dimension ordering for this input
+        int logicalDimension = indexIter - _kernelIndices.begin();
+        logicalDimensionMapping.push_back(logicalDimension);
+
+        // Find the index increment for this index to use for scaling index values to
+        // convert between cache dimensions and input indices
+        // Also use this for the logical dimension split sizes
+        auto increment = underlyingNest.GetIndexRange(index).Increment();
+        orderedIndexIncrements.push_back(increment);
+        logicalDimensionSplitSizes[logicalDimension].push_back(increment);
+    }
+
+    // Compute the memory shape for the cache based on the index sizes in each logical
+    // dimension. Each MemoryShape dimension counts the number of shards of the cache
+    // that dimension indexes over, so the size of each MemoryShape dimension ought to be
+    // the size of the index divided by the size of the next split index in the same
+    // logical input dimension.
+    // e.g. if Index i ranges over [0,64), and is split by 32, then by 16, then by 4
+    // we will have split indices [0,64,32), [0,32,16), [0,16,4), and [0,4,1),
+    // but suppose a cache doesn't use the second index, i.e. it only uses
+    // [0,64,32), [0,16,4), and [0,4,1), then the memory shape (for split dimensions
+    // in the i logical dimension) should be { 4, 4, 4 } since the outer index
+    // ranging from 0 to 64 accounts for 4 shards of 16
+    // and the next index ranging from 0 to 16 accounts for 4 shards of 4
+    // and the next index ranging from 0 to 4 accounts for 4 shards of 1
+    //
+    // Now that we have the base dimension size and all the increments for the indices we're using
+    // we can compute the shard sizes for each logical dimension by dividing each dimension split
+    // size we accumulated above by the size that comes after it, indicating how many instances of
+    // the next shard occur within the current shard
+    std::map<int, std::queue<int>> logicalIndexToShardSizes;
+    std::map<int, std::queue<int>> logicalIndexToSizes; // Full element counts, not shard counts
+    for (int logicalDimension = 0; logicalDimension < logicalDimensionCount; ++logicalDimension)
+    {
+        const auto& splitSizes = logicalDimensionSplitSizes[logicalDimension];
+        for (unsigned splitIdx = 0; splitIdx < splitSizes.size() - 1; ++splitIdx)
+        {
+            int currentSize = splitSizes[splitIdx];
+            int nextSize = splitSizes[splitIdx + 1];
+            int shardSize = currentSize / nextSize;
+            if (currentSize % nextSize != 0)
+            {
+                // Round up to account for partial shards
+                shardSize++;
+            }
+            logicalIndexToShardSizes[logicalDimension].push(shardSize);
+            logicalIndexToSizes[logicalDimension].push(currentSize);
+        }
+    }
+
+    // Now that we have the shard sizes grouped by logical dimension, arrange them to match
+    // the orderedIndices
+    std::vector<int> orderedIndexShardSizes;
+    std::vector<int> orderedIndexSizes; // Full element counts, not shard counts
+    orderedIndexShardSizes.reserve(orderedIndices.size());
+    orderedIndexSizes.reserve(orderedIndices.size());
+    for (unsigned idx = 0; idx < logicalDimensionMapping.size(); ++idx)
+    {
+        int logicalDimension = logicalDimensionMapping[idx];
+
+        orderedIndexShardSizes.push_back(logicalIndexToShardSizes[logicalDimension].front());
+        logicalIndexToShardSizes[logicalDimension].pop();
+
+        orderedIndexSizes.push_back(logicalIndexToSizes[logicalDimension].front());
+        logicalIndexToSizes[logicalDimension].pop();
+    }
+
+    // Create a MemoryShape for the cache based on the shard counts
+    // This isn't the final cache shape and layout yet - we may need to shrink it to fit the number
+    // of elements requested in the cache
+    MemoryShape fullInputShape = { orderedIndexShardSizes };
+    MemoryLayout fullInputLayout = { fullInputShape };
+
+    // Physical Cache
+    // Determine how large the physical cache ought to be by trying to cover complete view
+    // dimensions without exceeding maxCacheElts elements in size.
+    // e.g. if the full view has 5 dimensions, and our maxCacheElts only covers the innermost two dimensions,
+    // then the cache size is set to that size and we create our "fill" and "reduce" kernels accordingly
+    // To achieve this, start from the base full cache layout and slice off physical dimensions going from the
+    // outermost to the innermost until the full extent has no more than maxCacheElts elements
+    MemoryLayout cacheLayout = fullInputLayout;
+    unsigned cacheThresholdIdx = 0;
+    while (cacheLayout.GetMemorySize() > maxCacheElts)
+    {
+        cacheLayout = cacheLayout.GetSliceLayout(0);
+        cacheThresholdIdx++;
+    }
+    if (cacheLayout.NumElements() == 0)
+    {
+        throw InputException(InputExceptionErrors::invalidSize, "Specified cache size isn't large enough to cover the smallest dimension of the cache layout");
+    }
+    std::vector<int> cacheOrderedIndexSizes(orderedIndexSizes.begin() + cacheThresholdIdx, orderedIndexSizes.end());
+    std::vector<int> cacheLogicalDimensionMapping(logicalDimensionMapping.begin() + cacheThresholdIdx, logicalDimensionMapping.end());
+    std::vector<int> cacheOrderedIndexIncrements(orderedIndexIncrements.begin() + cacheThresholdIdx, orderedIndexIncrements.end());
+    auto cacheName = UniqueName(baseName);
+    _rawCache = StaticAllocate(cacheName, _value.GetBaseType(), cacheLayout);
+
+    // Progressive Caching
+    // To enable progressive caching, where a subset of the full physical cache is
+    // filled and used, then later the next chunk of the physical cache is filled
+    // and used, we need to find the dimension split at which fillThreshold elements
+    // is surpassed and set up a fill kernel at that point
+    // If fillThreshold == maxCacheElts or they are both exceeded in the same
+    // split, then ensure that the fill kernel occurs after the cache emptying kernel
+    if (fillThreshold > maxCacheElts)
+    {
+        throw InputException(InputExceptionErrors::invalidArgument, "Fill threshold can't be larger than the max cache size");
+    }
+    unsigned cacheFillThresholdIdx = cacheThresholdIdx;
+    MemoryLayout cacheFillLayout = cacheLayout;
+    while (cacheFillLayout.GetMemorySize() > fillThreshold)
+    {
+        cacheFillLayout = cacheFillLayout.GetSliceLayout(0);
+        cacheFillThresholdIdx++;
+    }
+    if (cacheFillLayout.NumElements() == 0)
+    {
+        throw InputException(InputExceptionErrors::invalidSize, "Specified cache fill threshold size isn't large enough to cover the smallest dimension of the cache layout");
+    }
+    std::vector<int> cacheFillOrderedIndexSizes(orderedIndexSizes.begin() + cacheFillThresholdIdx, orderedIndexSizes.end());
+    std::vector<int> cacheFillLogicalDimensionMapping(logicalDimensionMapping.begin() + cacheFillThresholdIdx, logicalDimensionMapping.end());
+    std::vector<int> cacheFillOrderedIndexIncrements(orderedIndexIncrements.begin() + cacheFillThresholdIdx, orderedIndexIncrements.end());
+
+    // Cache View
+    // The cache view needs to have the same number of dimensions as the input value
+    // but cover an area that is a subset of the full cache and represents one cache
+    // dimension per logical input dimension.
+    // This may mean that for some of the logical input dimensions, the cache view
+    // size is 1, e.g. suppose a 3-D input is cached where the inner 3 dimensions of
+    // the cache only operate over two of the logical dimensions of the input while the
+    // two innermost dimensions of those operate over the two distinct input logical
+    // dimensions. In that case the cache view would cover the inner two cache dimensions
+    // and have a 1 for the third dimension size.
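The two slicing loops above share one shape: treat the layout as a list of dimension extents and drop the outermost until the product of what remains fits the element budget. A sketch with plain integer shapes in place of `MemoryLayout` (the real code throws when nothing fits; this sketch simply stops):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Drop outermost dimensions until the remaining extent fits the element budget.
// Returns how many leading dimensions were sliced away (the "threshold index").
size_t FitToBudget(std::vector<int> shape, size_t maxElts)
{
    auto numElements = [](const std::vector<int>& s) {
        size_t n = 1;
        for (int d : s) n *= d;
        return n;
    };
    size_t thresholdIdx = 0;
    while (!shape.empty() && numElements(shape) > maxElts)
    {
        shape.erase(shape.begin()); // analogous to GetSliceLayout(0)
        ++thresholdIdx;
    }
    return thresholdIdx;
}

int main()
{
    // A { 4, 4, 4 } shard shape holds 64 shards; a budget of 20 keeps { 4, 4 }.
    assert(FitToBudget({ 4, 4, 4 }, 20) == 1);
}
```

Running the same loop twice, once with `maxCacheElts` and once with the smaller `fillThreshold`, is what yields the two nested thresholds used below.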
+ // In general, the cache view needs to cover an area of the cache that can be + // contiguously represented like the logical input value. + + // To build up the cache view layout, start from the innermost dimension of the + // cache layout and accumulate dimensions going outward until either all of the + // logical input dimensions are accounted for or one of the logical input dimensions + // repeats. However, when a single dimension is repeated multiple times in a row, + // those repeats can be collapsed into a single visiting of that dimension. These + // can be collapsed because the logical behavior is the same regardless of whether + // the split that produced the repeated dimension was made or not. + // E.g. suppose your dimensions are { 0, 0, 1, 1, 1, 0, 0 }, then the first pair of + // 0's can be collapsed and treated like a single visiting of that dimension, + // the set of 3 1's can be collapsed, and the final pair of 0's can be collapsed, + // producing a collapsed dimension ordering of { 0, 1, 0 }. With a collapsed + // dimension ordering of { 0, 1, 0 }, the cache view needs to break at the inner + // { 1, 0 }, because after that a dimension (the 0 dimension) will repeat. + MemoryLayout baseCacheViewLayout; + unsigned cacheViewThresholdIdxOffset; + std::tie(baseCacheViewLayout, cacheViewThresholdIdxOffset) = ComputeCacheView(cacheFillLayout, + cacheFillLogicalDimensionMapping, + logicalDimensionCount); + unsigned cacheViewThresholdIdx = cacheFillThresholdIdx + cacheViewThresholdIdxOffset; + + auto cacheRef = _rawCache.Reference(); + cacheRef.SetLayout(baseCacheViewLayout); + + // Boundary Conditions + // Boundary conditions occur when the region of the input value that we want + // to cache does not fill the physical cache, + // e.g. for a matrix cache there are 4 cases, 3 of which are considered boundary condition cases: + // Suppose the matrix is M x N and the physical cache is sized to hold M' x N' elements, + // where M / 2 < M' < M, N / 2 < N' < N + // |-------N-------| + // |----N'---|----N'---| + // _ _ *---------------* + // | | | | | + // | M'| 1 | 2 | + // | | | | | + // M _ |_________|_____| + // | | | 3 | 4 | + // | M'| | | + // _ | *---------------* + // _ + // 1 : The cache has exactly as many rows and columns as the input matrix chunk + // 2 : The cache has more columns than the matrix chunk but just as many rows + // 3 : The cache has more rows than the matrix chunk but just as many columns + // 4 : The cache has more rows and columns than the matrix chunk + // + // One possible solution is to zero-pad the cache and keep the layout as-is. This would certainly work + // + // However, in order to maximize data locality in the cache (which is the purpose of the cache), + // we would prefer it if the cache were reshaped such that the input value chunk + // fills the cache from the beginning until the end of the chunk without any gaps. 
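The four cases in the diagram fall directly out of comparing the cache extents against the elements remaining in each dimension, which is the per-level test `EmitBoundarySwitches` emits. A standalone sketch for the two-dimensional case (hypothetical helper, plain ints):

```cpp
#include <cassert>

// Classify which of the four diagram cases applies, given the rows/columns
// remaining in the input and the physical cache extents M' x N'.
// Case 1 is the only non-boundary case.
int BoundaryCase(int remainingRows, int remainingCols, int cacheRows, int cacheCols)
{
    bool rowBoundary = cacheRows > remainingRows; // cache taller than the chunk
    bool colBoundary = cacheCols > remainingCols; // cache wider than the chunk
    if (!rowBoundary && !colBoundary) return 1;
    if (!rowBoundary && colBoundary) return 2;
    if (rowBoundary && !colBoundary) return 3;
    return 4;
}

int main()
{
    // 10x10 input, 6x6 cache: the last chunk in each direction is only 4 wide/tall.
    assert(BoundaryCase(10, 10, 6, 6) == 1); // full chunk
    assert(BoundaryCase(10, 4, 6, 6) == 2);  // short on columns
    assert(BoundaryCase(4, 10, 6, 6) == 3);  // short on rows
    assert(BoundaryCase(4, 4, 6, 6) == 4);   // short on both
}
```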
+    // This reshape amounts to shrinking the cache sizes in some dimensions, however to preserve
+    // vectorization behavior we avoid shrinking the innermost dimension and instead zero-pad
+    // that dimension
+    unsigned cacheFillThresholdIdxOffset = cacheFillThresholdIdx - cacheThresholdIdx;
+    unsigned cacheViewThresholdIdxCacheOffset = cacheViewThresholdIdxOffset + cacheFillThresholdIdxOffset;
+    BoundaryConditionMemoryLayoutHelper boundaryConditionCacheHelper(_value.GetLayout().GetActiveSize(), cacheOrderedIndexSizes, cacheLogicalDimensionMapping, cacheOrderedIndexIncrements, cacheFillThresholdIdxOffset, cacheViewThresholdIdxCacheOffset);
+
+    std::vector<loopnests::Kernel> cachingKernels;
+
+    {
+        // Flush the cache to implicitly zero-pad any regions of the cache we don't fill later
+        std::vector<loopnests::Index> cacheFlushPosition(orderedIndices.begin(), orderedIndices.begin() + cacheThresholdIdx);
+        auto cacheEmptyKernel = loopnests::Kernel(cacheName + "_Empty_Cache_Kernel")
+                                    .Inputs(_rawCache)
+                                    .Indices()
+                                    .Define([](Value cache) {
+                                        // TODO : determine if a vectorized approach is worthwhile here
+                                        ZeroMemory(cache);
+                                    });
+
+        underlyingNest.AddKernel(cacheEmptyKernel, loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::prologue, cacheFlushPosition, {} });
+        cachingKernels.push_back(cacheEmptyKernel);
+    }
+    if (useFillKernel)
+    {
+        std::vector<loopnests::Index> cacheFillPosition(orderedIndices.begin(), orderedIndices.begin() + cacheFillThresholdIdx);
+        std::vector<loopnests::Index> cacheFillIndices(_kernelIndices.begin(), _kernelIndices.end());
+        cacheFillIndices.insert(cacheFillIndices.end(), cacheFillPosition.begin(), cacheFillPosition.end());
+
+        auto cacheFillKernel = loopnests::Kernel(cacheName + "_Fill_Cache_Kernel")
+                                   .Inputs(_value, _rawCache)
+                                   .Indices(cacheFillIndices)
+                                   .DefineEx([=](std::vector<Value> values, std::vector<Scalar> indices) {
+                                       auto& input = values[0];
+                                       auto& cache = values[1];
+                                       std::vector<Scalar> compositeIndexValues(indices.begin(), indices.begin() + compositeIndexCount);
+                                       std::vector<Scalar> splitIndexValues(indices.begin() + compositeIndexCount, indices.end());
+
+                                       auto offsetInput = input.Offset(compositeIndexValues);
+                                       offsetInput.SetLayout(input.GetLayout());
+                                       auto offsetInputArrayView = Array(offsetInput);
+
+                                       boundaryConditionCacheHelper.EmitBoundarySwitches(compositeIndexValues, [=](MemoryLayout inputRegionShape, MemoryLayout inputRegionFillShape, MemoryLayout boundaryCacheLayout, MemoryLayout boundaryCacheFillLayout) {
+                                           // Offset the cache write head based on where we are in the progressive caching
+                                           // Since fillThreshold <= maxCacheElts, we may run this kernel multiple times filling
+                                           // different portions of the cache, so we look at the indices between the
+                                           // cacheThresholdIdx and the cacheFillThresholdIdx to find what position we need to
+                                           // offset to
+                                           // these indices all map in order to the dimensions that are in the cache and outside
+                                           // the fill region since the cache memory ordering is based on these indices in this order
+
+                                           auto cacheView = cache;
+                                           cacheView.SetLayout(boundaryCacheLayout);
+                                           std::vector<Scalar> cacheOffsetIndices;
+                                           cacheOffsetIndices.reserve(boundaryCacheLayout.NumDimensions());
+
+                                           // Note: if cacheThresholdIdx == cacheFillThresholdIdx (i.e.
if there is no progressive caching) + // Then the first loop is skipped and no offsetting occurs, and therefore filling the cache from + // the beginning every time this kernel is run + for (unsigned idx = cacheThresholdIdx; idx < cacheFillThresholdIdx; ++idx) + { + // Mapping loopnest indices (input space) -> cache offsets (cache space) so divide by split index increment + cacheOffsetIndices.push_back(splitIndexValues[idx] / orderedIndexIncrements[idx]); + } + for (unsigned idx = cacheFillThresholdIdx; idx < static_cast(fullInputLayout.NumDimensions()); ++idx) + { + cacheOffsetIndices.push_back(Scalar{ 0 }); + } + auto offsetCache = cacheView.Offset(cacheOffsetIndices); + offsetCache.SetLayout(boundaryCacheFillLayout); + auto cacheFillArrayView = Array(offsetCache); + + // Prefer input-oriented loops to maximize locality as the input + // is likely to be larger than the cache in most cases + // Based on the element size and counts in different dimensions, + // we will split and unroll some of the inner loops in order to maximize + // vectorization. + // In order to get appropriate utilization of all the SIMD + // registers, we will need to use a temporary buffer (which we expect + // the compiler to optimize away) with a size equal to the total number + // of elements that can be held in all of the SIMD registers. + // The filling of this temporary buffer from the input needs to be an + // unrolled operation and the filling of the cache from the temporary + // buffer also needs to be an unrolled operation that happens after + // the full temporary buffer has been filled. + // Therefore, we need multiple levels of loopnests so that the area + // outside of the temporary buffer's addressable region can be looped + // over, and the area inside the temporary buffer region can have two + // sequential fully unrolled loopnests. + // new loopnest (outer): + // For ... { + // For ... { + // // start of outer loopnest prologue kernel + // // Fill temp buf + // new loopnest (inner #1): + // For ... (unroll) { + // For ... (unroll) { + // ... { + // // start of inner loopnest #1 kernel + // tempBuf(tempBufIndices) = input(inputIndices) + // // end of inner loopnest #1 kernel + // } + // ... + // } + // } + // // Fill cache + // new loopnest (inner #2): + // For ... (unroll) { + // For ... (unroll) { + // ... { + // // start of inner loopnest #2 kernel + // cache(cacheIndices) = tempBuf(tempBufIndices) + // // end of inner loopnest #2 kernel + // } + // ... 
+ // } + // } + // // end of outer loopnest kernel + // } + // } + + std::vector fillIndices; + fillIndices.reserve(inputRegionFillShape.NumDimensions()); + for (int idx = 0; idx < inputRegionFillShape.NumDimensions(); ++idx) + { + fillIndices.push_back(loopnests::Index("fillIdx_" + std::to_string(idx))); + } + + // Define LoopNest + auto fillNest = Using({ offsetInputArrayView }, ArgumentType::Input) + .Using({ cacheFillArrayView }, ArgumentType::Output); + for (int idx = 0; idx < inputRegionFillShape.NumDimensions(); ++idx) + { + fillNest.ForAll(fillIndices[idx], 0, inputRegionFillShape.GetActiveSize(idx)); + } + + const int VectorizationSize = registerCharacteristics.NumberOfElementsPerSIMDRegister; + int maximumElementsInTempBuf = registerCharacteristics.NumberOfSIMDRegisters * VectorizationSize; + std::vector indexSplitSizes(fillIndices.size()); + std::vector tmpBufDimensionMapping(indexSplitSizes.size()); + + // Handle the innermost input dimension differently since we'll be counting elements there instead of shards of a memory layout + int shardSize = VectorizationSize; + int totalElementsPerShard = VectorizationSize; + for (unsigned idx = fillIndices.size() - 1; fillIndices.size() > idx; --idx) + { + int availableShardsInTmpBuf = maximumElementsInTempBuf / totalElementsPerShard; + int inputDimAvailableShards = inputRegionFillShape.GetActiveSize(idx) / shardSize; + int numShards = std::min(availableShardsInTmpBuf, inputDimAvailableShards); + tmpBufDimensionMapping[idx] = inputRegionFillShape.GetLogicalDimension(idx); + if (numShards > 1) + { + indexSplitSizes[idx] = numShards * shardSize; + shardSize = 1; // After the initial vectorization size, we target units of entire memory layout shards + totalElementsPerShard *= numShards; // The number of elements represented by a target scales with the number of inner targets it represents + } + else + { + indexSplitSizes[idx] = 1; + } + } + // The index split sizes are measured in input-space, so no scaling is needed + std::vector tmpBufScaleFactors(indexSplitSizes.size(), 1); + + BoundaryConditionMemoryLayoutHelper fillKernelBoundaryHelper(inputRegionFillShape.GetActiveSize(), + indexSplitSizes, + tmpBufDimensionMapping, + tmpBufScaleFactors, + 0, // Fill index doesn't matter for this usage + tmpBufDimensionMapping.size()); // Shrink any index split sizes needed since we don't have a "view" to worry about + + auto cacheFillInternalKernel = loopnests::Kernel("Internal_Fill_Cache_Outer_Kernel") + .Inputs(offsetInputArrayView, cacheFillArrayView) + .Indices(fillIndices) + .DefineEx([=](std::vector values, std::vector innerIndices) { + Array offsetInput = values[0]; + Array cacheFillView = values[1]; + + Value offsetInputInnerVal = offsetInput.GetValue().Offset(innerIndices); + offsetInputInnerVal.SetLayout(offsetInput.GetValue().GetLayout()); + Array offsetInputInner = offsetInputInnerVal; + + std::vector cacheIndices; + cacheIndices.reserve(boundaryCacheFillLayout.NumDimensions()); + for (int cacheDimIdx = 0; cacheDimIdx < boundaryCacheFillLayout.NumDimensions(); ++cacheDimIdx) + { + unsigned baseDimIdx = cacheFillThresholdIdx + cacheDimIdx; + int logicalDimension = logicalDimensionMapping[baseDimIdx]; + // Mapping loopnest indices (input space) -> cache indices (cache space) so divide by split index increment + cacheIndices.push_back((innerIndices[logicalDimension] / orderedIndexIncrements[baseDimIdx]) % boundaryCacheFillLayout.GetActiveSize(cacheDimIdx)); + } + Value offsetCacheInnerVal = cacheFillView.GetValue().Offset(cacheIndices); + 
offsetCacheInnerVal.SetLayout(cacheFillView.GetValue().GetLayout()); + Array offsetCacheInner = offsetCacheInnerVal; + + fillKernelBoundaryHelper.EmitBoundarySwitches(innerIndices, [=](MemoryLayout fillRegionShape, MemoryLayout, MemoryLayout boundaryTempBufLayout, MemoryLayout) { + Array tmpBuf = Allocate(offsetInput.Type(), boundaryTempBufLayout, bufferAlignment); + + std::vector tmpBufInputIndices; + + tmpBufInputIndices.reserve(fillRegionShape.NumDimensions()); + for (int idx = 0; idx < fillRegionShape.NumDimensions(); ++idx) + { + tmpBufInputIndices.push_back(loopnests::Index("tmpBuf_FillIdx_" + std::to_string(idx))); + } + + auto tmpBufFillNest = Using({ offsetInputInner }, ArgumentType::Input) + .Using({ tmpBuf }, ArgumentType::Output); + for (int idx = 0; idx < fillRegionShape.NumDimensions(); ++idx) + { + tmpBufFillNest.ForAll(tmpBufInputIndices[idx], 0, fillRegionShape.GetActiveSize(idx)); + } + + auto tmpBufFill = loopnests::Kernel("Internal_TmpBuf_FillTmpBuf_Kernel") + .Inputs(offsetInputInner, tmpBuf) + .Indices(tmpBufInputIndices) + .DefineEx([=](std::vector tmpBufValues, std::vector tmpBufInputIndices) { + Array offsetInputInner = tmpBufValues[0]; + Array tmpBuf = tmpBufValues[1]; + + tmpBuf(tmpBufInputIndices) = offsetInputInner(tmpBufInputIndices); + }); + tmpBufFillNest.Do(tmpBufFill); + auto& tmpBufFillSchedule = tmpBufFillNest.GetSchedule(); + // unroll everything + for (unsigned idx = 0; idx < tmpBufInputIndices.size(); ++idx) + { + tmpBufFillSchedule.Unroll(tmpBufInputIndices[idx]); + } + tmpBufFillNest.Run(); + + // Cache fill from tmp buf + auto cacheFillNest = Using({ tmpBuf }, ArgumentType::Input) + .Using({ offsetCacheInner }, ArgumentType::Output); + for (int idx = 0; idx < tmpBuf.GetValue().GetLayout().NumDimensions(); ++idx) + { + cacheFillNest.ForAll(tmpBufInputIndices[idx], 0, tmpBuf.GetValue().GetLayout().GetActiveSize(idx)); + } + + auto cacheFill = loopnests::Kernel("Internal_TmpBuf_FillCache_Kernel") + .Inputs(tmpBuf, offsetCacheInner) + .Indices(tmpBufInputIndices) + .DefineEx([=](std::vector tmpBufValues, std::vector tmpBufIndices) { + Array tmpBuf = tmpBufValues[0]; + Array offsetCacheInner = tmpBufValues[1]; + + int cacheDimensions = offsetCacheInner.GetValue().GetLayout().NumDimensions(); + std::vector cacheIndices; + cacheIndices.reserve(cacheDimensions); + for (int cacheDimIdx = 0; cacheDimIdx < cacheDimensions; ++cacheDimIdx) + { + unsigned baseDimIdx = cacheFillThresholdIdx + cacheDimIdx; + int logicalDimension = logicalDimensionMapping[baseDimIdx]; + // Mapping loopnest indices (input space) -> cache indices (cache space) so divide by split index increment + cacheIndices.push_back((tmpBufIndices[logicalDimension] / orderedIndexIncrements[baseDimIdx]) % boundaryCacheFillLayout.GetActiveSize(cacheDimIdx)); + } + offsetCacheInner(cacheIndices) = tmpBuf(tmpBufIndices); + }); + cacheFillNest.Do(cacheFill); + auto& cacheFillSchedule = cacheFillNest.GetSchedule(); + for (unsigned idx = 0; idx < tmpBufInputIndices.size(); ++idx) + { + cacheFillSchedule.Unroll(tmpBufInputIndices[idx]); + } + cacheFillNest.Run(); + }); + }); + + auto& schedule = fillNest.GetSchedule(); + std::vector splitOuterIndices; + for (unsigned idx = 0; idx < fillIndices.size(); ++idx) + { + if (indexSplitSizes[idx] > 1) + { + splitOuterIndices.push_back(schedule.Split(fillIndices[idx], indexSplitSizes[idx])); + } + else + { + splitOuterIndices.push_back(fillIndices[idx]); + } + } + + fillNest.Do(cacheFillInternalKernel, splitOuterIndices); + + fillNest.Run(); + }); + }); + + 
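Stripped of the loop-nest and boundary machinery, the structure this fill kernel emits is a copy-through-registers pattern: fill a register-file-sized temporary from the input, then store it into the cache, with both inner loops fully unrolled. A scalar sketch of that pattern (hypothetical names; a fixed buffer size standing in for registers times elements-per-register):

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

// Copy input into the cache through a small temporary buffer sized to the SIMD
// register file; in the generated code both inner loops are fully unrolled so
// the temporary is expected to be optimized into registers.
void TwoStageFill(const std::vector<float>& input, std::vector<float>& cache)
{
    constexpr size_t tmpBufElts = 32; // e.g. 8 registers x 4 floats
    float tmpBuf[tmpBufElts];
    for (size_t base = 0; base < input.size(); base += tmpBufElts)
    {
        size_t chunk = std::min(tmpBufElts, input.size() - base);
        for (size_t i = 0; i < chunk; ++i) // inner loopnest #1: input -> tmpBuf
        {
            tmpBuf[i] = input[base + i];
        }
        for (size_t i = 0; i < chunk; ++i) // inner loopnest #2: tmpBuf -> cache
        {
            cache[base + i] = tmpBuf[i];
        }
    }
}

int main()
{
    std::vector<float> input(100, 1.0f);
    std::vector<float> cache(100, 0.0f);
    TwoStageFill(input, cache);
    assert(cache == input);
}
```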
+        underlyingNest.AddKernel(cacheFillKernel, loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::prologue, cacheFillPosition, {} });
+        cachingKernels.push_back(cacheFillKernel);
+    }
+
+    if (useViewKernel)
+    {
+        // The cache view indices are all of the indices that occur before the cacheViewThresholdIdx
+        std::vector<loopnests::Index> cacheViewPosition(orderedIndices.begin(), orderedIndices.begin() + cacheViewThresholdIdx);
+        std::vector<loopnests::Index> cacheViewIndices(_kernelIndices.begin(), _kernelIndices.end());
+        cacheViewIndices.insert(cacheViewIndices.end(), cacheViewPosition.begin(), cacheViewPosition.end());
+
+        auto cacheViewKernel = loopnests::Kernel(cacheName + "_View_Cache_Kernel")
+                                   .Inputs(_rawCache, cacheRef)
+                                   .Indices(cacheViewIndices)
+                                   .DefineEx([boundaryConditionCacheHelper, compositeIndexCount, fullInputLayout, cacheLayout, baseCacheViewLayout, cacheLogicalDimensionMapping, logicalDimensionMapping, orderedIndices, orderedIndexIncrements, cacheThresholdIdx, cacheViewThresholdIdx, logicalDimensionCount](std::vector<Value> values, std::vector<Scalar> indices) {
+                                       auto& cache = values[0];
+                                       auto& cacheRef = values[1];
+                                       std::vector<Scalar> compositeIndexValues(indices.begin(), indices.begin() + compositeIndexCount);
+                                       std::vector<Scalar> splitIndexValues(indices.begin() + compositeIndexCount, indices.end());
+
+                                       boundaryConditionCacheHelper.EmitBoundarySwitches(compositeIndexValues, [&](MemoryLayout inputRegionShape, MemoryLayout inputRegionFillShape, MemoryLayout boundaryCacheLayout, MemoryLayout boundaryCacheFillLayout) {
+                                           // Find the view slice in the cache for this offset
+                                           // The indices in [cacheThresholdIdx, cacheViewThresholdIdx) in the indices determine which slice to use
+                                           std::vector<Scalar> cacheOffsetIndices;
+                                           cacheOffsetIndices.reserve(cacheLayout.NumDimensions());
+
+                                           // Note: if cacheThresholdIdx == cacheViewThresholdIdx (i.e.
if there is no repeated re-viewing of the cache) + // Then the first loop is skipped and no offsetting occurs + auto cacheView = cache; + for (unsigned idx = cacheThresholdIdx; idx < cacheViewThresholdIdx; ++idx) + { + // Mapping loopnest indices (input space) -> cache offsets (cache space) so divide by split index increment + cacheOffsetIndices.push_back(splitIndexValues[idx] / orderedIndexIncrements[idx]); + } + for (unsigned idx = cacheViewThresholdIdx; idx < static_cast(fullInputLayout.NumDimensions()); ++idx) + { + cacheOffsetIndices.push_back(Scalar{ 0 }); + } + + cacheView.SetLayout(boundaryCacheLayout); + auto offsetCache = cacheView.Offset(cacheOffsetIndices); + offsetCache.SetLayout(baseCacheViewLayout); + + // Offset the cache ref from the base cache such that indexing with the current loop values + // would offset a pointer to the beginning of this view of the cache + std::vector offsetIndices(logicalDimensionCount); + for (int idx = 0; idx < logicalDimensionCount; ++idx) + { + offsetIndices[idx] -= compositeIndexValues[idx]; + } + + auto offsetCacheView = offsetCache.Offset(offsetIndices); + offsetCacheView.SetLayout(baseCacheViewLayout); + cacheRef.SetLayout(baseCacheViewLayout); + cacheRef = offsetCacheView.Reference(); + }); + }); + + underlyingNest.AddKernel(cacheViewKernel, loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::prologue, cacheViewPosition, {} }); + cachingKernels.push_back(cacheViewKernel); + } + + if (useReduceKernel) + { + // The cache reduce indices are all of the indices that occur before the cacheThresholdIdx + // Because the reduce is symmetric with the cache non-progressive fill / flush level of a loop nest + std::vector cacheReducePosition(orderedIndices.begin(), orderedIndices.begin() + cacheThresholdIdx); + std::vector cacheReduceIndices(_kernelIndices.begin(), _kernelIndices.end()); + cacheReduceIndices.insert(cacheReduceIndices.end(), cacheReducePosition.begin(), cacheReducePosition.end()); + + auto cacheReduceKernel = loopnests::Kernel(cacheName + "_Reduce_Kernel") + .Inputs(_value, _rawCache) + .Indices(cacheReduceIndices) + .DefineEx([=](std::vector values, std::vector indices) { + auto& input = values[0]; + auto& cache = values[1]; + std::vector compositeIndexValues(indices.begin(), indices.begin() + compositeIndexCount); + std::vector splitIndexValues(indices.begin() + compositeIndexCount, indices.end()); + + auto offsetInput = input.Offset(compositeIndexValues); + offsetInput.SetLayout(input.GetLayout()); + auto offsetInputArrayView = Array(offsetInput); + + boundaryConditionCacheHelper.EmitBoundarySwitches(compositeIndexValues, [=](MemoryLayout inputRegionShape, MemoryLayout, MemoryLayout boundaryCacheLayout, MemoryLayout) { + auto cacheArrayView = Array(cache); + + // Prefer input-oriented loops to maximize locality as the input + // is likely to be larger than the cache in most cases + // Based on the element size and counts in different dimensions, + // we will split and unroll some of the inner loops in order to maximize + // vectorization. + // In order to get appropriate utilization of all the SIMD + // registers, we will need to use a temporary buffer (which we expect + // the compiler to optimize away) with a size equal to the total number + // of elements that can be held in all of the SIMD registers. 
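The `indexSplitSizes` computation that follows is the same one used in the fill kernel: walk the dimensions from innermost to outermost, give the innermost split one SIMD register's worth of elements, and let outer splits absorb whole inner shards until the register file is spent. A standalone sketch with plain ints (`ComputeSplitSizes` is a hypothetical name):

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

// Decide per-dimension split sizes for the unrolled temp-buffer loops, working
// from the innermost dimension outward. A split of 1 means "don't split".
std::vector<int> ComputeSplitSizes(const std::vector<int>& dimSizes, int simdRegisters, int eltsPerRegister)
{
    int maxElts = simdRegisters * eltsPerRegister; // capacity of the register file
    std::vector<int> splits(dimSizes.size(), 1);
    int shardSize = eltsPerRegister;     // innermost unit is one register
    int totalPerShard = eltsPerRegister; // elements covered by one outer shard
    for (size_t i = dimSizes.size(); i-- > 0;)
    {
        int availableShards = maxElts / totalPerShard;
        int dimShards = dimSizes[i] / shardSize;
        int numShards = std::min(availableShards, dimShards);
        if (numShards > 1)
        {
            splits[i] = numShards * shardSize;
            shardSize = 1;               // outer dimensions count whole shards
            totalPerShard *= numShards;
        }
    }
    return splits;
}

int main()
{
    // 8 registers of 4 floats = 32 elements: a 16x16 tile gets an inner split
    // of 16 (four registers of four) and an outer split of 2 rows, so one
    // 2x16 block exactly fills the register file.
    assert((ComputeSplitSizes({ 16, 16 }, 8, 4) == std::vector<int>{ 2, 16 }));
}
```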
+ // The filling of this temporary buffer from the cache needs to be an + // unrolled operation and the reducing of the output from the temporary + // buffer also needs to be an unrolled operation that happens after + // the full temporary buffer has been filled. + // If the reduce operation is a SumReduce operation, then we need + // a third loop in the middle which accumulates the current value + // from the output into the temporary buffer, then have the + // third loop copy the temporary buffer to the output + // Therefore, we need multiple levels of loopnests so that the area + // outside of the temporary buffer's addressable region can be looped + // over, and the area inside the temporary buffer region can have two + // or three sequential fully unrolled loopnests. + // new loopnest (outer): + // For ... { + // For ... { + // // start of outer loopnest prologue kernel + // // Fill temp buf with cache data + // new loopnest (inner #1): + // For ... (unroll) { + // For ... (unroll) { + // ... { + // // start of inner loopnest #1 kernel + // tempBuf(tempBufIndices) = cache(cacheIndices) + // // end of inner loopnest #1 kernel + // } + // ... + // } + // } + // // if reduceFunction == SumReduce + // // Apply the reduce function to reduce elements of the output into the temp buf + // new loopnest (inner #2): + // For ... (unroll) { + // For ... (unroll) { + // ... { + // // start of inner loopnest #2 kernel + // tempBuf(tempBufIndices) += input(inputIndices) + // // end of inner loopnest #2 kernel + // } + // ... + // } + // } + // // Copy temp buf to output + // new loopnest (inner #3): + // For ... (unroll) { + // For ... (unroll) { + // ... { + // // start of inner loopnest #3 kernel + // input(inputIndices) = tempBuf(tempBufIndices) + // // end of inner loopnest #3 kernel + // } + // ... 
+ // } + // } + // // end of outer loopnest kernel + // } + // } + + std::vector reduceIndices; + reduceIndices.reserve(inputRegionShape.NumDimensions()); + for (int idx = 0; idx < inputRegionShape.NumDimensions(); ++idx) + { + reduceIndices.push_back(loopnests::Index("reduceIdx_" + std::to_string(idx))); + } + + // Define LoopNest + auto reduceNest = Using({ offsetInputArrayView }, ArgumentType::Input) + .Using({ cacheArrayView }, ArgumentType::Output); + for (int idx = 0; idx < inputRegionShape.NumDimensions(); ++idx) + { + reduceNest.ForAll(reduceIndices[idx], 0, inputRegionShape.GetActiveSize(idx)); + } + + const int VectorizationSize = registerCharacteristics.NumberOfElementsPerSIMDRegister; + int maximumElementsInTempBuf = registerCharacteristics.NumberOfSIMDRegisters * VectorizationSize; + std::vector indexSplitSizes(reduceIndices.size()); + std::vector tmpBufDimensionMapping(indexSplitSizes.size()); + + // Handle the innermost input dimension differently since we'll be counting elements there instead of shards of a memory layout + int shardSize = VectorizationSize; + int totalElementsPerShard = VectorizationSize; + for (unsigned idx = reduceIndices.size() - 1; reduceIndices.size() > idx; --idx) + { + int availableShardsInTmpBuf = maximumElementsInTempBuf / totalElementsPerShard; + int inputDimAvailableShards = inputRegionShape.GetActiveSize(idx) / shardSize; + int numShards = std::min(availableShardsInTmpBuf, inputDimAvailableShards); + tmpBufDimensionMapping[idx] = inputRegionShape.GetLogicalDimension(idx); + if (numShards > 1) + { + indexSplitSizes[idx] = numShards * shardSize; + shardSize = 1; // After the initial vectorization size, we target units of entire memory layout shards + totalElementsPerShard *= numShards; // The number of elements represented by a target scales with the number of inner targets it represents + } + else + { + indexSplitSizes[idx] = 1; + } + } + // The index split sizes are measured in input-space, so no scaling is needed + std::vector tmpBufScaleFactors(indexSplitSizes.size(), 1); + + BoundaryConditionMemoryLayoutHelper reduceKernelBoundaryHelper(inputRegionShape.GetActiveSize(), + indexSplitSizes, + tmpBufDimensionMapping, + tmpBufScaleFactors, + 0, // Fill index doesn't matter for this usage + tmpBufDimensionMapping.size()); // Shrink any index split sizes needed since we don't have a "view" to worry about + + auto cacheReduceInternalKernel = loopnests::Kernel("Internal_Reduce_Cache_Outer_Kernel") + .Inputs(offsetInputArrayView, cacheArrayView) + .Indices(reduceIndices) + .DefineEx([=](std::vector values, std::vector innerIndices) { + Array offsetInput = values[0]; + Array cacheView = values[1]; + + Value offsetInputInnerVal = offsetInput.GetValue().Offset(innerIndices); + offsetInputInnerVal.SetLayout(offsetInput.GetValue().GetLayout()); + Array offsetInputInner = offsetInputInnerVal; + + std::vector cacheIndices; + cacheIndices.reserve(boundaryCacheLayout.NumDimensions()); + for (int cacheDimIdx = 0; cacheDimIdx < boundaryCacheLayout.NumDimensions(); ++cacheDimIdx) + { + unsigned baseDimIdx = cacheThresholdIdx + cacheDimIdx; + int logicalDimension = logicalDimensionMapping[baseDimIdx]; + // Mapping loopnest indices (input space) -> cache indices (cache space) so divide by split index increment + cacheIndices.push_back((innerIndices[logicalDimension] / orderedIndexIncrements[baseDimIdx]) % boundaryCacheLayout.GetActiveSize(cacheDimIdx)); + } + Value offsetCacheInnerVal = cacheView.GetValue().Offset(cacheIndices); + 
offsetCacheInnerVal.SetLayout(cacheView.GetValue().GetLayout()); + Array offsetCacheInner = offsetCacheInnerVal; + + reduceKernelBoundaryHelper.EmitBoundarySwitches(innerIndices, [=](MemoryLayout reduceRegionShape, MemoryLayout, MemoryLayout boundaryTempBufLayout, MemoryLayout) { + Array tmpBuf = Allocate(offsetInput.Type(), boundaryTempBufLayout, bufferAlignment); + + std::vector tmpBufInputIndices; + + tmpBufInputIndices.reserve(reduceRegionShape.NumDimensions()); + for (int idx = 0; idx < reduceRegionShape.NumDimensions(); ++idx) + { + tmpBufInputIndices.push_back(loopnests::Index("tmpBuf_ReduceIdx_" + std::to_string(idx))); + } + + auto tmpBufFillFromCacheNest = Using({ offsetCacheInner }, ArgumentType::Input) + .Using({ tmpBuf }, ArgumentType::Output); + for (int idx = 0; idx < reduceRegionShape.NumDimensions(); ++idx) + { + tmpBufFillFromCacheNest.ForAll(tmpBufInputIndices[idx], 0, reduceRegionShape.GetActiveSize(idx)); + } + + // Fill tmp buf from cache + auto tmpBufFillFromCache = loopnests::Kernel("Internal_TmpBuf_FillTmpBuf_Kernel") + .Inputs(offsetCacheInner, tmpBuf) + .Indices(tmpBufInputIndices) + .DefineEx([=](std::vector tmpBufValues, std::vector tmpBufInputIndices) { + Array offsetCacheInner = tmpBufValues[0]; + Array tmpBuf = tmpBufValues[1]; + + int cacheDimensions = offsetCacheInner.GetValue().GetLayout().NumDimensions(); + std::vector cacheIndices; + cacheIndices.reserve(cacheDimensions); + for (int cacheDimIdx = 0; cacheDimIdx < cacheDimensions; ++cacheDimIdx) + { + unsigned baseDimIdx = cacheFillThresholdIdx + cacheDimIdx; + int logicalDimension = logicalDimensionMapping[baseDimIdx]; + // Mapping loopnest indices (input space) -> cache indices (cache space) so divide by split index increment + cacheIndices.push_back((tmpBufInputIndices[logicalDimension] / orderedIndexIncrements[baseDimIdx]) % boundaryCacheLayout.GetActiveSize(cacheDimIdx)); + } + tmpBuf(tmpBufInputIndices) = offsetCacheInner(cacheIndices); + }); + tmpBufFillFromCacheNest.Do(tmpBufFillFromCache); + auto& tmpBufFillSchedule = tmpBufFillFromCacheNest.GetSchedule(); + // unroll everything + for (unsigned idx = 0; idx < tmpBufInputIndices.size(); ++idx) + { + tmpBufFillSchedule.Unroll(tmpBufInputIndices[idx]); + } + tmpBufFillFromCacheNest.Run(); + + if (accumulateReduce) + { + // Reduce the current input/output contents into the temp buffer + auto tmpBufReduceNest = Using({ offsetInputInner }, ArgumentType::Input) + .Using({ tmpBuf }, ArgumentType::Output); + for (int idx = 0; idx < tmpBuf.GetValue().GetLayout().NumDimensions(); ++idx) + { + tmpBufReduceNest.ForAll(tmpBufInputIndices[idx], 0, tmpBuf.GetValue().GetLayout().GetActiveSize(idx)); + } + + auto tmpBufReduce = loopnests::Kernel("Internal_TmpBuf_ReduceOutput_Kernel") + .Inputs(tmpBuf, offsetInputInner) + .Indices(tmpBufInputIndices) + .DefineEx([=](std::vector tmpBufValues, std::vector tmpBufInputIndices) { + Array tmpBuf = tmpBufValues[0]; + Array offsetInputInner = tmpBufValues[1]; + + reduceFunction(tmpBuf(tmpBufInputIndices), offsetInputInner(tmpBufInputIndices)); + }); + tmpBufReduceNest.Do(tmpBufReduce); + auto& tmpBufReduceSchedule = tmpBufReduceNest.GetSchedule(); + for (unsigned idx = 0; idx < tmpBufInputIndices.size(); ++idx) + { + tmpBufReduceSchedule.Unroll(tmpBufInputIndices[idx]); + } + tmpBufReduceNest.Run(); + + // Copy temp buffer contents to input/output + auto storeOutNest = Using({ tmpBuf }, ArgumentType::Input) + .Using({ offsetInputInner }, ArgumentType::Output); + for (int idx = 0; idx < 
tmpBuf.GetValue().GetLayout().NumDimensions(); ++idx) + { + storeOutNest.ForAll(tmpBufInputIndices[idx], 0, tmpBuf.GetValue().GetLayout().GetActiveSize(idx)); + } + + auto storeOut = loopnests::Kernel("Internal_TmpBuf_CopyOutput_Kernel") + .Inputs(tmpBuf, offsetInputInner) + .Indices(tmpBufInputIndices) + .DefineEx([=](std::vector tmpBufValues, std::vector tmpBufInputIndices) { + Array tmpBuf = tmpBufValues[0]; + Array offsetInputInner = tmpBufValues[1]; + + offsetInputInner(tmpBufInputIndices) = tmpBuf(tmpBufInputIndices); + }); + storeOutNest.Do(storeOut); + auto& storeOutSchedule = storeOutNest.GetSchedule(); + for (unsigned idx = 0; idx < tmpBufInputIndices.size(); ++idx) + { + storeOutSchedule.Unroll(tmpBufInputIndices[idx]); + } + storeOutNest.Run(); + } + else + { + // Reduce the temp buffer into input/output + auto outputReduceNest = Using({ tmpBuf }, ArgumentType::Input) + .Using({ offsetInputInner }, ArgumentType::Output); + for (int idx = 0; idx < tmpBuf.GetValue().GetLayout().NumDimensions(); ++idx) + { + outputReduceNest.ForAll(tmpBufInputIndices[idx], 0, tmpBuf.GetValue().GetLayout().GetActiveSize(idx)); + } + + auto outputReduce = loopnests::Kernel("Internal_TmpBuf_ReduceOutput_Kernel") + .Inputs(tmpBuf, offsetInputInner) + .Indices(tmpBufInputIndices) + .DefineEx([=](std::vector tmpBufValues, std::vector tmpBufInputIndices) { + Array tmpBuf = tmpBufValues[0]; + Array offsetInputInner = tmpBufValues[1]; + + reduceFunction(offsetInputInner(tmpBufInputIndices), tmpBuf(tmpBufInputIndices)); + }); + outputReduceNest.Do(outputReduce); + auto& outputReduceSchedule = outputReduceNest.GetSchedule(); + for (unsigned idx = 0; idx < tmpBufInputIndices.size(); ++idx) + { + outputReduceSchedule.Unroll(tmpBufInputIndices[idx]); + } + outputReduceNest.Run(); + } + }); + }); + + auto& schedule = reduceNest.GetSchedule(); + std::vector splitOuterIndices; + for (unsigned idx = 0; idx < reduceIndices.size(); ++idx) + { + if (indexSplitSizes[idx] > 1) + { + splitOuterIndices.push_back(schedule.Split(reduceIndices[idx], indexSplitSizes[idx])); + } + } + + reduceNest.Do(cacheReduceInternalKernel, splitOuterIndices); + + reduceNest.Run(); + }); + }); + + underlyingNest.AddKernel(cacheReduceKernel, loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::epilogue, cacheReducePosition, {} }); + cachingKernels.push_back(cacheReduceKernel); + } + + underlyingNest.RenameVariable(_value, cacheRef, _atIndices, cachingKernels); + } + +} // namespace value +} // namespace ell diff --git a/libraries/value/src/ComputeContext.cpp b/libraries/value/src/ComputeContext.cpp index 25970ca72..ca482bde2 100644 --- a/libraries/value/src/ComputeContext.cpp +++ b/libraries/value/src/ComputeContext.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -30,21 +31,34 @@ namespace value using namespace detail; using namespace utilities; - struct ComputeContext::FunctionScope + namespace { - FunctionScope(ComputeContext& context, std::string fnName) : - context(context) + struct { - context._stack.push({ fnName, {} }); - } + int Current() + { + std::lock_guard lock{ _mutex }; - ~FunctionScope() { context._stack.pop(); } + auto it = _idMap.find(std::this_thread::get_id()); + if (it == _idMap.end()) + { + it = _idMap.emplace_hint(it, std::this_thread::get_id(), ++_nextThreadId); + } - ComputeContext& context; - }; + return it->second; + } - namespace - { + void Clear() + { + std::lock_guard lock{ _mutex }; + _idMap.clear(); + _nextThreadId = 0; + } + + std::mutex _mutex; + 
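+            // _idMap / _nextThreadId: lazily hand each calling thread a small sequential
+            // id under the mutex; GlobalAllocateImpl below appends this id to a global's
+            // name so that ThreadLocal allocations get a distinct per-thread copy.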
std::unordered_map _idMap; + int _nextThreadId = 0; + } ThreadIds; // TODO: Make this the basis of an iterator for MemoryLayout bool IncrementMemoryCoordinateImpl(int dimension, std::vector& coordinate, const std::vector& maxCoordinate) @@ -351,6 +365,10 @@ namespace value { throw InputException(InputExceptionErrors::invalidArgument); } + else if constexpr (!std::is_same_v) + { + throw InputException(InputExceptionErrors::typeMismatch); + } else { return Value(std::copysign(*data1, *data2)); @@ -361,9 +379,185 @@ namespace value } }; + struct FmaFunctionIntrinsic + { + auto operator()(std::vector args) const -> Value + { + if (args.size() != 3) + { + throw InputException(InputExceptionErrors::invalidSize); + } + + if (std::any_of(args.begin(), args.end(), [](Value& value) { return value.IsConstrained() && value.GetLayout() != ScalarLayout; })) + { + throw InputException(InputExceptionErrors::invalidSize); + } + + const auto& value1 = args[0]; + const auto& value2 = args[1]; + const auto& value3 = args[2]; + + return std::visit( + [](auto&& data1, auto&& data2, auto&& data3) -> Value { + using Type1 = std::decay_t; + using Type2 = std::decay_t; + using Type3 = std::decay_t; + using DataType1 = std::remove_pointer_t; + using DataType2 = std::remove_pointer_t; + using DataType3 = std::remove_pointer_t; + + if constexpr (IsOneOf || + IsOneOf || + IsOneOf) + { + throw InputException(InputExceptionErrors::invalidArgument); + } + else if constexpr (!utilities::AllSame) + { + throw InputException(InputExceptionErrors::typeMismatch); + } + else + { + return Value(static_cast(std::fma(*data1, *data2, *data3))); + } + }, + value1.GetUnderlyingData(), + value2.GetUnderlyingData(), + value3.GetUnderlyingData()); + } + }; + + enum class MemIntrinsicOp + { + Copy, + Move, + Set + }; + template + struct MemOpFunctionIntrinsic + { + auto operator()(std::vector args) const -> Value + { + if (args.size() != 3) + { + throw InputException(InputExceptionErrors::invalidSize); + } + + if (!std::all_of(args.begin(), args.end(), [](const Value& value) { return value.IsConstant(); })) + { + throw InputException(InputExceptionErrors::invalidArgument); + } + + const auto& value1 = args[0]; + const auto& value2 = args[1]; + const auto& value3 = args[2]; + + if (!value3.IsConstrained() || value3.GetLayout() != ScalarLayout) + { + throw InputException(InputExceptionErrors::invalidArgument); + } + + if constexpr (MemIntrinsicOp::Set == op) + { + assert((value2.IsConstrained() && value2.GetLayout() == ScalarLayout && value2.GetType() == std::pair{ ValueType::Char8, 1 })); + } + + std::visit( + [](auto&& data1, auto&& data2, auto&& data3) { + using Type1 = std::decay_t; + using Type2 = std::decay_t; + using Type3 = std::decay_t; + if constexpr (utilities::IsOneOf) + { + assert(false); + return; + } + else + { + // Once we move away from VS 2017, this code can be uncommented and the code following can be simplified (lines 496-523) + + //constexpr auto memFn = [] { + // // static_casts needed because MSVC in VS 2017 can't handle the code without it + // if constexpr (static_cast(MemIntrinsicOp::Set) == static_cast(op)) + // { + // return &std::memset; + // } + // else if constexpr (static_cast(MemIntrinsicOp::Copy) == static_cast(op)) + // { + // return &std::memcpy; + // } + // else if constexpr (static_cast(MemIntrinsicOp::Move) == static_cast(op)) + // { + // return &std::memmove; + // } + // else + // { + // static_assert(utilities::FalseType{}, "Unknown enum value"); + // } + //}(); + + constexpr bool isSet = op == 
MemIntrinsicOp::Set; + std::decay_t> real2ndParam; + std::conditional_t memFn; + switch (op) + { + case MemIntrinsicOp::Set: + if constexpr (isSet) + { + memFn = &std::memset; + real2ndParam = *data2; + } + break; + case MemIntrinsicOp::Copy: + if constexpr (!isSet) + { + memFn = &std::memcpy; + real2ndParam = data2; + } + break; + case MemIntrinsicOp::Move: + if constexpr (!isSet) + { + memFn = &std::memmove; + real2ndParam = data2; + } + break; + default: + assert(false); + } + + memFn(data1, real2ndParam, *data3 * sizeof(data1[0])); + } + }, + value1.GetUnderlyingData(), + value2.GetUnderlyingData(), + value3.GetUnderlyingData()); + + return {}; // ignored + } + }; } // namespace + struct ComputeContext::FunctionScope + { + FunctionScope(ComputeContext& context, std::string fnName) : + context(context) + { + std::lock_guard lock{ context._mutex }; + context._stack.push({ fnName, {} }); + } + + ~FunctionScope() + { + std::lock_guard lock{ context._mutex }; + context._stack.pop(); + } + + ComputeContext& context; + }; + ComputeContext::ComputeContext(std::string moduleName) : + EmitterContext(emitters::GetTargetDevice("host")), _moduleName(std::move(moduleName)) { // we always have at least one stack entry, in case the top level function needs to return something @@ -379,42 +573,49 @@ namespace value using Iterator = ConstantDataList::const_iterator; - auto it = - std::visit(VariantVisitor{ [](Emittable) -> Iterator { return {}; }, - [this](auto&& data) -> Iterator { - using Type = std::decay_t; - using RealType = std::remove_pointer_t; - using VectorType = std::vector; - - const auto& frame = GetTopFrame(); - auto it = - std::find_if(frame.second.begin(), - frame.second.end(), - [data](const ConstantData& constData) { - if (auto ptr = std::get_if(&constData)) - { - return ptr->data() <= data && - data < (ptr->data() + ptr->size()); - } - - return false; - }); - - return it; - } }, - value.GetUnderlyingData()); + auto it = std::visit( + VariantVisitor{ + [](Emittable) -> Iterator { return {}; }, + [this](auto&& data) -> Iterator { + using Type = std::decay_t; + using RealType = std::remove_pointer_t; + using VectorType = std::vector; + + const auto& frame = GetTopFrame(); + auto it = + std::find_if(frame.second.begin(), + frame.second.end(), + [data](const ConstantData& constData) { + if (auto ptr = std::get_if(&constData)) + { + return ptr->data() <= data && + data < (ptr->data() + ptr->size()); + } + + return false; + }); + + return it; + } }, + value.GetUnderlyingData()); return *it; } - Value ComputeContext::AllocateImpl(ValueType type, MemoryLayout layout) + Value ComputeContext::AllocateImpl(ValueType type, MemoryLayout layout, size_t /* alignment */, AllocateFlags flags) { + if (flags != AllocateFlags::None) + { + throw LogicException(LogicExceptionErrors::notImplemented); + } + // special case the scalar case auto size = layout == ScalarLayout ? 
1u : layout.GetMemorySize(); auto constantData = AllocateConstantData(type, size); Value value = StoreConstantData(std::move(constantData)); value.SetLayout(layout); + return value; } @@ -430,8 +631,13 @@ namespace value return std::nullopt; } - Value ComputeContext::GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ConstantData data, MemoryLayout layout) + Value ComputeContext::GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ConstantData data, MemoryLayout layout, AllocateFlags flags) { + if ((flags & AllocateFlags::ThreadLocal) == AllocateFlags::ThreadLocal) + { + throw LogicException(LogicExceptionErrors::illegalState, "Thread local storage cannot be specified for constant data"); + } + std::string adjustedName = GetScopeAdjustedName(scope, name); if (_globals.find(adjustedName) != _globals.end()) @@ -440,26 +646,45 @@ namespace value "Unexpected collision in global data allocation"); } - auto& globalData = _globals[adjustedName]; - globalData.first = std::move(data); - globalData.second = std::move(layout); + auto& globalData = [&]() -> decltype(auto) { + std::lock_guard lock{ _mutex }; + auto& globalData = _globals[adjustedName]; + globalData.first = std::move(data); + globalData.second = std::move(layout); + return globalData; + }(); return ConstantDataToValue(globalData.first, globalData.second); } - Value ComputeContext::GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ValueType type, MemoryLayout layout) + Value ComputeContext::GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ValueType type, MemoryLayout layout, AllocateFlags flags) { // special case the scalar case auto size = layout == ScalarLayout ? 1u : layout.GetMemorySize(); auto constantData = AllocateConstantData(type, size); - return GlobalAllocateImpl(scope, name, constantData, layout); + if ((flags & AllocateFlags::ThreadLocal) == AllocateFlags::ThreadLocal) + { + name += std::to_string(ThreadIds.Current()); + + if (auto globalValue = EmitterContext::GetGlobalValue(scope, name, layout)) + { + return *globalValue; + } + + flags &= ~AllocateFlags::ThreadLocal; + } + return GlobalAllocateImpl(scope, name, constantData, layout, flags); } Value ComputeContext::StoreConstantDataImpl(ConstantData data) { Value value = ConstantDataToValue(data); - GetTopFrame().second.push_front(std::move(data)); + { + std::lock_guard lock{ _mutex }; + GetTopFrame().second.push_front(std::move(data)); + } + return value; } @@ -507,68 +732,58 @@ namespace value { ConstantData movedOutOfScope; - std::visit(VariantVisitor{ [](Emittable) {}, - [&movedOutOfScope, this](auto&& data) { - using Type = std::decay_t; - using RealType = std::remove_pointer_t; - using VectorType = std::vector; - - const auto& frame = GetTopFrame(); - if (auto stackFrameIt = - std::find_if(frame.second.begin(), - frame.second.end(), - [data](const ConstantData& constData) { - if (auto ptr = std::get_if(&constData)) - { - return ptr->data() <= data && - data < (ptr->data() + ptr->size()); - } - - return false; - }); - stackFrameIt == frame.second.end()) - { - throw LogicException(LogicExceptionErrors::illegalState, - "Could not extract expected data"); - } - else - { - movedOutOfScope = std::move(*stackFrameIt); - } - } }, - value.GetUnderlyingData()); + std::lock_guard lock{ _mutex }; + + std::visit( + VariantVisitor{ + [](Emittable) {}, + [&movedOutOfScope, + size = (value.IsConstrained() ? 
value.GetLayout().GetMemorySize() : 1)](auto&& data) { + using Type = std::decay_t; + using RealType = std::remove_pointer_t; + using VectorType = std::vector; + + movedOutOfScope = VectorType(data, data + size); + } }, + value.GetUnderlyingData()); return movedOutOfScope; } bool ComputeContext::IsGlobalValue(Value value) const { - return std::visit(VariantVisitor{ [](Emittable) -> bool { - throw LogicException(LogicExceptionErrors::illegalState); - }, - [this](auto&& data) -> bool { - using Type = std::decay_t; - using RealType = std::remove_pointer_t; - using VectorType = std::vector; - - return std::find_if(_globals.begin(), - _globals.end(), - [data](const auto& kvp) { - if (auto ptr = std::get_if( - &kvp.second.first)) - { - return ptr->data() <= data && - data < (ptr->data() + ptr->size()); - } - return false; - }) != _globals.end(); - } }, - value.GetUnderlyingData()); + std::lock_guard lock{ _mutex }; + + return std::visit( + VariantVisitor{ + [](Emittable) -> bool { + throw LogicException(LogicExceptionErrors::illegalState); + }, + + [this](auto&& data) -> bool { + using Type = std::decay_t; + using RealType = std::remove_pointer_t; + using VectorType = std::vector; + + return std::find_if( + _globals.begin(), + _globals.end(), + [data](const auto& kvp) { + if (auto ptr = std::get_if( + &kvp.second.first)) + { + return ptr->data() <= data && + data < (ptr->data() + ptr->size()); + } + return false; + }) != _globals.end(); + } }, + value.GetUnderlyingData()); } detail::ValueTypeDescription ComputeContext::GetTypeImpl(Emittable) { - throw LogicException(LogicExceptionErrors::notImplemented); + throw LogicException(LogicExceptionErrors::illegalState); } EmitterContext::DefinedFunction ComputeContext::CreateFunctionImpl(FunctionDeclaration decl, EmitterContext::DefinedFunction fn) @@ -579,6 +794,8 @@ namespace value throw InputException(InputExceptionErrors::invalidArgument, "Specified function is an intrinsic"); } + std::lock_guard lock{ _mutex }; + if (auto it = _definedFunctions.find(decl); it != _definedFunctions.end()) { return it->second; @@ -591,7 +808,9 @@ namespace value const auto& fnName = decl.GetFunctionName(); assert(expectedArgs.size() == args.size()); - if (const auto& returnType = decl.GetReturnType(); returnType) + auto fnArgs = NormalizeReferenceLevels(args, expectedArgs); + + if (const auto& returnType = decl.GetReturnType(); returnType.has_value()) { Value expectedReturn = *returnType; @@ -599,18 +818,10 @@ namespace value std::optional maybeGlobal; { FunctionScope scope(*this, fnName); - std::vector fnArgs; - fnArgs.reserve(expectedArgs.size()); - for (auto arg : expectedArgs) - { - fnArgs.push_back(Value(arg.GetBaseType(), arg.GetLayout())); - } - - std::copy(args.begin(), args.end(), fnArgs.begin()); Value returnValue = expectedReturn; - auto fnReturn = fn(args); - if (!fnReturn) + auto fnReturn = fn(fnArgs); + if (!fnReturn.has_value()) { throw LogicException(LogicExceptionErrors::illegalState, "Function definition was expected to return a value, but optional was empty"); } @@ -641,8 +852,10 @@ namespace value } else { + FunctionScope scope(*this, fnName); + // equivalent of a void return type - (void)fn(args); + (void)fn(fnArgs); return std::nullopt; } @@ -660,40 +873,64 @@ namespace value return true; } + std::lock_guard lock{ _mutex }; return _definedFunctions.find(decl) != _definedFunctions.end(); } void ComputeContext::CopyDataImpl(const Value& source, Value& destination) { - std::visit(VariantVisitor{ [](Emittable) {}, - [&destination, &source](auto&& 
sourceData) { - using SourceDataType = std::decay_t; - - auto& destinationData = - std::get(destination.GetUnderlyingData()); - if (source.GetLayout().IsContiguous() && destination.GetLayout().IsContiguous()) - { - auto numElements = destination.GetLayout().NumElements(); - std::copy(sourceData, sourceData + numElements, destinationData); - } - else - { - auto& sourceLayout = source.GetLayout(); - auto maxCoordinate = sourceLayout.GetActiveSize().ToVector(); - decltype(maxCoordinate) coordinate(maxCoordinate.size()); - - do - { - auto logicalCoordinates = sourceLayout.GetLogicalCoordinates(coordinate); - auto sourceOffset = - sourceLayout.GetLogicalEntryOffset(logicalCoordinates); - auto destinationOffset = - destination.GetLayout().GetLogicalEntryOffset(logicalCoordinates); - *(destinationData + destinationOffset) = *(sourceData + sourceOffset); - } while (IncrementMemoryCoordinate(coordinate, maxCoordinate)); - } - } }, - source.GetUnderlyingData()); + std::visit( + VariantVisitor{ + [](Emittable) {}, + [&destination, &source](auto&& sourceData) { + using SourceDataType = std::decay_t; + + if (source.PointerLevel() == destination.PointerLevel()) + { + if (source.PointerLevel() == 1) + { + auto& destinationData = std::get(destination.GetUnderlyingData()); + if (source.GetLayout().IsContiguous() && destination.GetLayout().IsContiguous()) + { + auto numElements = destination.GetLayout().NumElements(); + std::copy(sourceData, sourceData + numElements, destinationData); + } + else + { + auto& sourceLayout = source.GetLayout(); + auto maxCoordinate = sourceLayout.GetActiveSize().ToVector(); + decltype(maxCoordinate) coordinate(maxCoordinate.size()); + + do + { + auto logicalCoordinates = sourceLayout.GetLogicalCoordinates(coordinate); + auto sourceOffset = + sourceLayout.GetLogicalEntryOffset(logicalCoordinates); + auto destinationOffset = + destination.GetLayout().GetLogicalEntryOffset(logicalCoordinates); + *(destinationData + destinationOffset) = *(sourceData + sourceOffset); + } while (IncrementMemoryCoordinate(coordinate, maxCoordinate)); + } + } + else + { + std::get(destination.GetUnderlyingData())[0] = std::get(source.GetUnderlyingData())[0]; + if (source.IsConstrained()) + { + destination.SetLayout(source.GetLayout()); + } + else + { + destination.ClearLayout(); + } + } + } + else + { + throw LogicException(LogicExceptionErrors::illegalState); + } + } }, + source.GetUnderlyingData()); } void ComputeContext::MoveDataImpl(Value& source, Value& destination) @@ -705,7 +942,7 @@ namespace value source.Reset(); } - void ComputeContext::ForImpl(MemoryLayout layout, std::function)> fn) + void ComputeContext::ForImpl(MemoryLayout layout, std::function)> fn, [[maybe_unused]] const std::string& name) { auto maxCoordinate = layout.GetActiveSize().ToVector(); decltype(maxCoordinate) coordinate(maxCoordinate.size()); @@ -717,7 +954,7 @@ namespace value } while (IncrementMemoryCoordinate(coordinate, maxCoordinate)); } - void ComputeContext::ForImpl(Scalar start, Scalar stop, Scalar step, std::function fn) + void ComputeContext::ForImpl(Scalar start, Scalar stop, Scalar step, std::function fn, [[maybe_unused]] const std::string& name) { if (!(start.GetValue().IsConstant() && stop.GetValue().IsConstant() && step.GetValue().IsConstant())) { @@ -736,6 +973,7 @@ namespace value auto startNum = start.Get(); auto stopNum = stop.Get(); auto stepNum = step.Get(); + for (; startNum < stopNum; startNum += stepNum) { fn(startNum); @@ -821,7 +1059,11 @@ namespace value } else { - throw 
LogicException(LogicExceptionErrors::illegalState); + detail::ValueTypeDescription typeDesc{ source.GetBaseType(), 0 }; + Value value{ typeDesc, utilities::ScalarLayout }; + + value.SetData(*data); + return value; } }, source.GetUnderlyingData()); @@ -856,42 +1098,58 @@ std::visit( VariantVisitor{ [](Emittable) {}, - [](Boolean*) {}, [&destination, &source, op](auto&& destinationData) { using DestinationDataType = std::remove_pointer_t>; std::function opFn; - switch (op) + if constexpr (!std::is_same_v) { - case ValueBinaryOperation::add: - opFn = [](auto dst, auto src) { return dst + src; }; - break; - case ValueBinaryOperation::subtract: - opFn = [](auto dst, auto src) { return dst - src; }; - break; - case ValueBinaryOperation::multiply: - opFn = [](auto dst, auto src) { return dst * src; }; - break; - case ValueBinaryOperation::divide: - opFn = [](auto dst, auto src) { return dst / src; }; - break; - - default: - if constexpr (std::is_integral_v) { - switch (op) + case ValueBinaryOperation::add: + opFn = [](auto dst, auto src) { return dst + src; }; + break; + case ValueBinaryOperation::subtract: + opFn = [](auto dst, auto src) { return dst - src; }; + break; + case ValueBinaryOperation::multiply: + opFn = [](auto dst, auto src) { return dst * src; }; + break; + case ValueBinaryOperation::divide: + opFn = [](auto dst, auto src) { return dst / src; }; + break; + + default: + if constexpr (std::is_integral_v) + { + switch (op) + { + case ValueBinaryOperation::modulus: + opFn = [](auto dst, auto src) { return dst % src; }; + break; + default: + throw LogicException(LogicExceptionErrors::illegalState); + } + } + else { - case ValueBinaryOperation::modulus: - opFn = [](auto dst, auto src) { return dst % src; }; - break; - default: throw LogicException(LogicExceptionErrors::illegalState); } } - else + } + else + { + switch (op) { + case ValueBinaryOperation::logicalAnd: + opFn = [](auto dst, auto src) { return dst && src; }; + break; + case ValueBinaryOperation::logicalOr: + opFn = [](auto dst, auto src) { return dst || src; }; + break; + default: throw LogicException(LogicExceptionErrors::illegalState); } } @@ -1110,6 +1368,34 @@ return { std::make_unique(state) }; } + void ComputeContext::WhileImpl(Scalar test, std::function fn) + { + if (!(test.GetValue().IsConstant())) + { + throw InputException(InputExceptionErrors::invalidArgument, "test value must be constant for ComputeContext"); + } + + std::visit( + [&](auto&& data) { + using Type = std::remove_pointer_t>; + if constexpr (IsOneOf) + { + auto testVal = test.Get(); + + while (testVal) + { + fn(); + testVal = test.Get(); + } + } + else + { + // unsupported test type; arguably this should throw LogicException(LogicExceptionErrors::illegalState) rather than silently doing nothing 
+ } + }, + test.GetValue().GetUnderlyingData()); + } + std::optional ComputeContext::CallImpl(FunctionDeclaration func, std::vector args) { if (!std::all_of(args.begin(), args.end(), [this](const Value& value) { return ValidateValue(value); })) @@ -1123,9 +1409,18 @@ namespace value return IntrinsicCall(func, args); } - if (auto it = _definedFunctions.find(func); it != _definedFunctions.end()) + if (func.IsPointerSet()) { - return it->second(args); + auto ptr = func.GetPointer(); + return (*reinterpret_cast(reinterpret_cast(ptr.Get())))(args); + } + + { + std::lock_guard lock{ _mutex }; + if (auto it = _definedFunctions.find(func); it != _definedFunctions.end()) + { + return it->second(args); + } } throw InputException(InputExceptionErrors::invalidArgument, "Specified function is not defined for this context"); @@ -1136,6 +1431,8 @@ namespace value void ComputeContext::ParallelizeImpl(int numTasks, std::vector captured, std::function)> fn) { + ThreadIds.Clear(); + std::vector> futures; futures.reserve(numTasks); for (int i = 0; i < numTasks; ++i) @@ -1178,6 +1475,11 @@ namespace value } } // namespace + void ComputeContext::DebugBreakImpl() + { + throw 0; // TODO: throw a real exception (of type value::Exception::DebugTrapException, perhaps) + } + void ComputeContext::DebugDumpImpl(Value value, std::string tag, std::ostream& stream) const { PrintValue(value, stream); @@ -1219,11 +1521,13 @@ namespace value void ComputeContext::SetNameImpl(const Value& value, const std::string& name) { + std::lock_guard lock{ _mutex }; _namedValues[value] = name; } std::string ComputeContext::GetNameImpl(const Value& value) const { + std::lock_guard lock{ _mutex }; if (auto it = _namedValues.find(value); it != _namedValues.end()) { return it->second; @@ -1232,6 +1536,21 @@ namespace value return {}; } + void ComputeContext::ImportCodeFileImpl(std::string) { throw LogicException(LogicExceptionErrors::notImplemented); } + + Scalar ComputeContext::GetFunctionAddressImpl(const FunctionDeclaration& fn) + { + { + std::lock_guard lock{ _mutex }; + if (auto it = _definedFunctions.find(fn); it != _definedFunctions.end()) + { + return reinterpret_cast(reinterpret_cast(&(it->second))); + } + } + + throw InputException(InputExceptionErrors::invalidArgument, "ComputeContext can't take address of function that hasn't been defined"); + } + Value ComputeContext::IntrinsicCall(FunctionDeclaration intrinsic, std::vector args) { static std::unordered_map)>> intrinsics = { @@ -1251,6 +1570,10 @@ namespace value { FloorFunctionDeclaration, SimpleNumericalFunctionIntrinsic{}([](auto n) { return std::floor(n); }) }, { CeilFunctionDeclaration, SimpleNumericalFunctionIntrinsic{}([](auto n) { return std::ceil(n); }) }, { CopySignFunctionDeclaration, CopySignFunctionIntrinsic{} }, + { FmaFunctionDeclaration, FmaFunctionIntrinsic{} }, + { MemCopyFunctionDeclaration, MemOpFunctionIntrinsic{} }, + { MemMoveFunctionDeclaration, MemOpFunctionIntrinsic{} }, + { MemSetFunctionDeclaration, MemOpFunctionIntrinsic{} }, }; if (auto it = intrinsics.find(intrinsic); it != intrinsics.end()) @@ -1273,10 +1596,13 @@ namespace value auto pointerLevel1 = value1.PointerLevel(); auto pointerLevel2 = value2.PointerLevel(); - if (value1.GetBaseType() == value2.GetBaseType() && - pointerLevel1 == pointerLevel2 && + if (pointerLevel1 == pointerLevel2 && pointerLevel1 == 1) { + if (value1.GetBaseType() != value2.GetBaseType()) + { + throw InputException(InputExceptionErrors::typeMismatch); + } return true; } @@ -1322,6 +1648,7 @@ namespace value // Our stack 
always has one empty "scope" pushed to it, which we // can use to create our global prefix. + std::lock_guard lock{ _mutex }; return GetGlobalScopedName(GetTopFrame().first + "_" + name); } @@ -1329,5 +1656,16 @@ namespace value const ComputeContext::Frame& ComputeContext::GetTopFrame() const { return _stack.top(); } + void swap(ComputeContext& l, ComputeContext& r) noexcept + { + using std::swap; + + swap(static_cast(l), static_cast(r)); + swap(l._stack, r._stack); + swap(l._globals, r._globals); + swap(l._definedFunctions, r._definedFunctions); + swap(l._namedValues, r._namedValues); + swap(l._moduleName, r._moduleName); + } } // namespace value } // namespace ell diff --git a/libraries/value/src/CppEmitterContext.cpp b/libraries/value/src/CppEmitterContext.cpp new file mode 100644 index 000000000..e3b8ce406 --- /dev/null +++ b/libraries/value/src/CppEmitterContext.cpp @@ -0,0 +1,1753 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: CppEmitterContext.cpp (value) +// Authors: Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "CppEmitterContext.h" +#include "FunctionDeclaration.h" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace ell::utilities; + +namespace ell +{ +namespace value +{ + struct CppEmitterContext::FunctionScope + { + FunctionScope(CppEmitterContext& context, const std::string& fnName) : + _context(context), + _guard(*this) + { + _context._fnStacks.push({ {}, fnName }); + _context._promotedConstantStack.push({}); + } + + ~FunctionScope() + { + _context._fnStacks.pop(); + _context._promotedConstantStack.pop(); + } + + private: + struct StreamGuard + { + StreamGuard(FunctionScope& context) : + _context(context), + _oldStream(_context._context._stream), + _oldIndent(_context._context._indent) + { + _context._context._indent = 0; + _context._context._stream = _context._sstr; + } + + ~StreamGuard() + { + _context._context._stream = _oldStream; + _context._context._indent = _oldIndent; + + // write our contents directly to the expression stream because the old stream + // might point to someone else's stream + _context._context._expressionStream << _context._sstr.str(); + } + + private: + FunctionScope& _context; + std::reference_wrapper _oldStream; + decltype(CppEmitterContext::_indent) _oldIndent; + }; + + CppEmitterContext& _context; + std::stringstream _sstr; + StreamGuard _guard; + }; + + CppEmitterContext::CppEmitterContext(std::string moduleName, std::ostream& stream) : + CppEmitterContext(emitters::GetTargetDevice("host"), moduleName, stream) + { + } + + CppEmitterContext::CppEmitterContext(std::string moduleName, std::unique_ptr stream) : + CppEmitterContext(emitters::GetTargetDevice("host"), moduleName, std::move(stream)) + { + } + + CppEmitterContext::CppEmitterContext(const TargetDevice& target, std::string moduleName, std::unique_ptr stream) : + CppEmitterContext(target, moduleName, *stream) + { + _ownedStream = std::move(stream); + } + + CppEmitterContext::CppEmitterContext(const TargetDevice& target, std::string moduleName, std::ostream& stream) : + EmitterContext(target), + _computeContext(moduleName), + _stream(_expressionStream), + _outputStream(stream), + _moduleName(std::move(moduleName)) + { + Global() << "// Instantiating CppEmitterContext\n" + "// Writing " + << _moduleName << 
".cpp\n" + << "\n" + "#include \n" + "#include \n" + "#include \n" + "#include \n" + "#include \n" + "#include \n" + "#include \n" + "#include \n" + "\n" + "\n" + "#if !defined(VALUE_CPP_EMITTER_HELPERS_DEFINED)\n" + "#define VALUE_CPP_EMITTER_HELPERS_DEFINED\n" + "template using Scalar = std::array;\n" + "#endif // VALUE_CPP_EMITTER_HELPERS_DEFINED\n" + "\n" + "namespace {\n"; + } + + CppEmitterContext::~CppEmitterContext() + { + Out() << "\n// Cleaning up CppEmitterContext" << std::endl; + + _outputStream.get() << _globalStream.str() + << "} // namespace \n" + << _fnDeclStream.str() << "\n" + << _expressionStream.str() << std::endl; + + _outputStream.get().flush(); + } + + namespace + { + std::string ValueTypeToCTypeString(ValueType type) + { + switch (type) + { + case ValueType::Void: + return "void"; + case ValueType::Boolean: + return "bool"; + case ValueType::Byte: + return "uint8_t"; + case ValueType::Char8: + return "int8_t"; + case ValueType::Double: + return "double"; + case ValueType::Float: + return "float"; + case ValueType::Int16: + return "int16_t"; + case ValueType::Int32: + return "int32_t"; + case ValueType::Int64: + return "int64_t"; + default: + throw LogicException(LogicExceptionErrors::illegalState); + } + } + + std::string ValueTypeToCTypeString(detail::ValueTypeDescription desc, size_t size, bool forcePointer = false) + { + std::string str; + if (!forcePointer && desc.second == 1) + { + if (size == 1) + { + str = "Scalar<" + ValueTypeToCTypeString(desc.first) + ">"; + } + else + { + str = "std::array<" + ValueTypeToCTypeString(desc.first) + ", " + std::to_string(size) + ">"; + } + } + else + { + str = ValueTypeToCTypeString(desc.first); + str.insert(str.end(), desc.second, '*'); + } + return str; + } + + std::string ValueToCString(const Value& value, bool forcePointer = false) + { + size_t size{}; + if (!value.IsConstrained()) + { + if (!forcePointer) + { + throw LogicException(LogicExceptionErrors::illegalState, "Can't create concrete allocation for value with no known layout"); + } + } + else + { + size = value.GetLayout().GetMemorySize(); + } + auto str = ValueTypeToCTypeString(value.GetType(), size, forcePointer); + return str; + } + + template + std::string TypeToCTypeString(T) + { + +#define BEGIN_TYPE_TO_CTYPE_STRING_MAP \ + if constexpr (false) \ + { \ + } +#define ADD_TYPE_TO_CTYPE_STRING_STRING(TYPE, STR) \ + else if constexpr (std::is_same_v) { return #STR; } +#define ADD_TYPE_TO_CTYPE_STRING(TYPE) ADD_TYPE_TO_CTYPE_STRING_STRING(TYPE, TYPE) +#define END_TYPE_TO_CTYPE_STRING_MAP \ + else { static_assert(utilities::FalseType{}, "Unknown type"); } + + BEGIN_TYPE_TO_CTYPE_STRING_MAP + ADD_TYPE_TO_CTYPE_STRING(bool) + ADD_TYPE_TO_CTYPE_STRING_STRING(char, int8_t) + ADD_TYPE_TO_CTYPE_STRING(uint8_t) + ADD_TYPE_TO_CTYPE_STRING(int16_t) + ADD_TYPE_TO_CTYPE_STRING(int32_t) + ADD_TYPE_TO_CTYPE_STRING(int64_t) + ADD_TYPE_TO_CTYPE_STRING(float) + ADD_TYPE_TO_CTYPE_STRING(double) + ADD_TYPE_TO_CTYPE_STRING(void) + END_TYPE_TO_CTYPE_STRING_MAP + } + + template > + void PrintVector(StreamType&& stream, const std::vector& v, const std::string& delim = ", ") + { + using RealT = std::conditional_t; + if (!v.empty()) + { + std::copy(v.begin(), v.end() - 1, std::ostream_iterator{ stream, delim.c_str() }); + stream << static_cast(v.back()); + } + } + } // namespace + + Value CppEmitterContext::AllocateImpl(ValueType type, MemoryLayout layout, size_t alignment, AllocateFlags flags) + { + if (alignment != 0 || flags != AllocateFlags::None) + { + throw 
LogicException(LogicExceptionErrors::notImplemented); + } + + // TODO: add alignment directive + return AllocateImpl({ type, 1 }, layout, "{}; // " + layout.ToString() + "\n"); + } + + Value CppEmitterContext::AllocateImpl( + detail::ValueTypeDescription type, + std::optional layout, + std::string initializationString, + std::optional name, + bool forcePointer) + { + CppEmitterContext::ValueImpl data{ name.value_or(UniqueName("v")), type }; + + auto& dataList = _fnStacks.top().dataList; + + dataList.push_front(std::move(data)); + auto& front = dataList.front(); + + Emittable emittable{ &front }; + + Value value(emittable, layout); + + Out() << ValueToCString(value, forcePointer) << " " << front.name << initializationString; + + return value; + } + + std::optional CppEmitterContext::GetGlobalValue(GlobalAllocationScope scope, std::string name) + { + std::string adjustedName = GetScopeAdjustedName(scope, name); + if (auto it = _globals.find(adjustedName); it != _globals.end()) + { + return Value(it->second.first, it->second.second); + } + + return std::nullopt; + } + + Value CppEmitterContext::GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ConstantData data, MemoryLayout layout, AllocateFlags flags) + { + std::string adjustedName = GetScopeAdjustedName(scope, name); + + auto it = _globals.find(adjustedName); + if (it != _globals.end()) + { + throw InputException(InputExceptionErrors::invalidArgument, + "Unexpected collision in global data allocation"); + } + + auto [type, definitionString] = std::visit( + [](auto&& vectorData) -> std::pair { + using Type = std::decay_t; + using VectorElementType = typename Type::value_type; + using ElementType = std::conditional_t, bool, VectorElementType>; + + std::stringstream sstr; + + sstr << " = { "; + PrintVector(sstr, vectorData); + sstr << " };\n"; + + return { GetValueType(), sstr.str() }; + }, + data); + + CppEmitterContext::ValueImpl valueDesc{ adjustedName, { type, 1 } }; + + _globalsList.push_front(std::move(valueDesc)); + auto& front = _globalsList.front(); + + Emittable emittable{ &front }; + _globals.insert(it, { adjustedName, { emittable, layout } }); + + Value value(emittable, layout); + + std::string prefix = [flags] { + switch (flags) + { + case AllocateFlags::ThreadLocal: + return "thread_local "; + default: + return ""; + } + }(); + + Global() << prefix << ValueToCString(value) << " " << adjustedName << definitionString; + + return value; + } + + Value CppEmitterContext::GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ValueType type, MemoryLayout layout, AllocateFlags flags) + { + std::string adjustedName = GetScopeAdjustedName(scope, name); + + auto it = _globals.find(adjustedName); + if (it != _globals.end()) + { + throw InputException(InputExceptionErrors::invalidArgument, + "Unexpected collision in global data allocation"); + } + + CppEmitterContext::ValueImpl valueDesc{ adjustedName, { type, 1 } }; + + _globalsList.push_front(std::move(valueDesc)); + auto& front = _globalsList.front(); + + Emittable emittable{ &front }; + _globals.insert(it, { adjustedName, { emittable, layout } }); + + Value value(emittable, layout); + + std::string prefix = [flags] { + switch (flags) + { + case AllocateFlags::ThreadLocal: + return "thread_local "; + default: + return ""; + } + }(); + + Global() << prefix << ValueToCString(value) << " " << adjustedName << "{}; // " << layout << "\n"; + + return value; + } + + detail::ValueTypeDescription CppEmitterContext::GetTypeImpl(Emittable emittable) + { + return 
emittable.GetDataAs()->typeDesc; + } + + void CppEmitterContext::DeclareFunction(FunctionDeclaration decl) + { + auto [it, inserted] = _declaredFunctions.emplace(decl.GetFunctionName()); + if (!inserted) + { + // already declared + return; + } + + WriteFunctionSignature(FnDecl(), decl) << ";\n"; + } + + std::ostream& CppEmitterContext::WriteFunctionSignature(std::ostream& stream, FunctionDeclaration decl) + { + const auto& argValues = decl.GetParameterTypes(); + const auto& returnValue = decl.GetReturnType(); + const auto& fnName = decl.GetFunctionName(); + const auto isPublic = decl.IsPublic(); + + std::vector functionArgs; + functionArgs.reserve(argValues.size()); + for (auto index = 0u; index < argValues.size(); ++index) + { + auto& arg = argValues[index]; + functionArgs.push_back( + ValueTypeToCTypeString( + arg.GetType(), + arg.IsConstrained() ? arg.GetLayout().GetMemorySize() : 0, + true) + + " arg_" + std::to_string(index) + "/* " + (arg.IsConstrained() ? arg.GetLayout().ToString() : std::string{ "unconstrained" }) + " */"); + } + + std::string returnType = returnValue ? ValueToCString(*returnValue) : ValueTypeToCTypeString(ValueType::Void); + + stream << (isPublic ? "" : "static ") << returnType << " " << fnName << "("; + PrintVector(stream, functionArgs); + stream << ")"; + + return stream; + } + + EmitterContext::DefinedFunction + CppEmitterContext::CreateFunctionImpl(FunctionDeclaration decl, DefinedFunction fn) + { + if (const auto& intrinsics = GetIntrinsics(); + std::find(intrinsics.begin(), intrinsics.end(), decl) != intrinsics.end()) + { + throw InputException(InputExceptionErrors::invalidArgument, "Specified function is an intrinsic"); + } + + if (auto it = _definedFunctions.find(decl); it != _definedFunctions.end()) + { + return it->second; + } + + DeclareFunction(decl); + + const auto& argValues = decl.GetParameterTypes(); + const auto& fnName = decl.GetFunctionName(); + + { + FunctionScope scope(*this, fnName); + + // create function sig + WriteFunctionSignature(Out(), decl) << " {\n"; + + auto params = argValues; + for (auto index = 0u; index < params.size(); ++index) + { + auto& param = params[index]; + CppEmitterContext::ValueImpl data{ "arg_" + std::to_string(index), param.GetType() }; + + auto& dataList = _fnStacks.top().dataList; + + dataList.push_front(std::move(data)); + auto& front = dataList.front(); + + Emittable emittable{ &front }; + + param.SetData(emittable); + } + + Indented([&] { + auto fnReturnValue = fn(params); + if (fnReturnValue) + { + auto emittableReturn = EnsureEmittable(*fnReturnValue); + Out() << "return " << emittableReturn.GetName() << ";\n"; + } + }); + + Out() << "} \n" + << std::endl; + } + + DefinedFunction returnFn = [this, decl](std::vector args) -> std::optional { + const auto& argValues = decl.GetParameterTypes(); + const auto& returnValue = decl.GetReturnType(); + const auto& fnName = decl.GetFunctionName(); + + if (!std::equal(args.begin(), + args.end(), + argValues.begin(), + argValues.end(), + [](Value suppliedValue, Value fnValue) { + return suppliedValue.GetBaseType() == fnValue.GetBaseType() && + (suppliedValue.PointerLevel() == fnValue.PointerLevel() || + suppliedValue.PointerLevel() == fnValue.PointerLevel() + 1); + })) + { + throw InputException(InputExceptionErrors::invalidArgument); + } + + std::vector params; + for (auto index = 0u; index < args.size(); ++index) + { + auto arg = EnsureEmittable(args[index]); + auto& expected = argValues[index]; + + std::string param = "&" + ScalarToString(arg); + if (arg.PointerLevel() 
== expected.PointerLevel() + 1) + { + param = "*(" + param + ")"; + } + params.push_back(param); + } + + std::stringstream funcCallStream; + funcCallStream << fnName << "("; + PrintVector(funcCallStream, params); + funcCallStream << ")"; + auto funCallString = funcCallStream.str(); + + std::optional fnReturnValue; + if (returnValue) + { + auto typeDesc = returnValue->GetType(); + auto layout = returnValue->IsConstrained() ? std::optional{ returnValue->GetLayout() } : std::optional{}; + bool originalScalar = false; + if (typeDesc.second == 0) + { + typeDesc.second = 1; + originalScalar = true; + } + std::string initStr = std::string{ " = " } + + (originalScalar ? "{ " : "") + funCallString + (originalScalar ? " }" : "") + + "; // " + (layout ? layout->ToString() : std::string{ "unconstrained" }) + "\n\n"; + + fnReturnValue = AllocateImpl(typeDesc, originalScalar ? ScalarLayout : layout, initStr); + } + else + { + Out() << funCallString << ";\n\n"; + } + + return fnReturnValue; + }; + + _definedFunctions[decl] = returnFn; + + return returnFn; + } + + bool CppEmitterContext::IsFunctionDefinedImpl(FunctionDeclaration decl) const + { + if (const auto& intrinsics = GetIntrinsics(); + std::find(intrinsics.begin(), intrinsics.end(), decl) != intrinsics.end()) + { + return true; + } + + return _definedFunctions.find(decl) != _definedFunctions.end(); + } + + Value CppEmitterContext::StoreConstantDataImpl(ConstantData data) + { + return _computeContext.StoreConstantData(data); + } + + void CppEmitterContext::ForImpl(MemoryLayout layout, std::function)> fn, const std::string& name) + { + struct Range + { + Scalar start; + Scalar stop; + Scalar step; + }; + + using LooperFn = std::function, std::vector, std::function)>)>; + + const auto& logicalOrder = layout.GetLogicalDimensionOrder(); + + LooperFn looper = [this, &looper, &logicalOrder, &name](std::vector ranges, std::vector indices, std::function)> bodyFn) { + if (ranges.empty()) + { + std::vector logicalIndices(indices.size()); + for (auto index = 0u; index < indices.size(); ++index) + { + logicalIndices[logicalOrder[index]] = indices[index]; + } + bodyFn(logicalIndices); + } + else + { + Range range = std::move(ranges.front()); + ranges.erase(ranges.begin()); + + ForImpl( + range.start, range.stop, range.step, [=, &looper, &bodyFn](Scalar index) mutable { + indices.push_back(index); + looper(ranges, indices, bodyFn); + }, + name); + } + }; + + std::vector ranges; + ranges.reserve(layout.NumDimensions()); + for (auto index = 0; index < layout.NumDimensions(); ++index) + { + ranges.push_back({ Value(0), Value(layout.GetActiveSize(index)), Value(1) }); + } + + looper(ranges, {}, fn); + } + + void CppEmitterContext::ForImpl(Scalar start, Scalar stop, Scalar step, std::function fn, const std::string& name) + { + auto startStr = ScalarToString(start); + auto index = AllocateImpl({ ValueType::Int32, 1 }, ScalarLayout, "{ " + startStr + " };\n"); + auto indexStr = index.GetName(); + auto stopStr = ScalarToString(stop); + auto stepStr = ScalarToString(step); + std::string optionalTag; + if (!name.empty()) + { + optionalTag = " // " + UniqueName(name + " loop"); + } + + Out() << "for (;" << indexStr << "[0] < " << stopStr + << "; " << indexStr << "[0] += " << stepStr << ") {" << optionalTag << "\n"; + + Indented([&] { fn(index); }); + + Out() << "}" << optionalTag << "\n\n"; + } + + void CppEmitterContext::MoveDataImpl(Value& source, Value& destination) + { + // we treat a move the same as a copy, except we clear out the source + CopyDataImpl(source, 
destination); + + // data has been "moved", so clear the source + source.Reset(); + } + + void CppEmitterContext::CopyDataImpl(const Value& source, Value& destination) + { + if (destination.IsConstant()) + { + if (source.IsConstant()) + { + return _computeContext.CopyData(source, destination); + } + else + { + destination.SetData(AllocateImpl(source.GetType(), source.GetLayout(), "{ " + source.GetName() + " };\n")); + } + } + else + { + if (!source.IsConstant() && source.Get().GetDataAs() == destination.Get().GetDataAs()) + { + return; + } + + if (auto& layout = source.GetLayout(); layout == destination.GetLayout()) + { + if (layout == ScalarLayout) + { + Out() << ScalarToString(destination) << " = " << ScalarToString(source) << ";\n"; + } + else + { + auto realizedSource = EnsureEmittable(source); + Out() << "std::copy_n(&" << GetNameImpl(realizedSource) << "[0], " + << realizedSource.GetLayout().GetMemorySize() << ", &" + << GetNameImpl(destination) << "[0]);\n"; + } + } + else + { + throw LogicException(LogicExceptionErrors::notImplemented); + } + } + } + + Value CppEmitterContext::ReferenceImpl(Value sourceValue) + { + auto source = Realize(sourceValue); + if (source.IsConstant()) + { + return _computeContext.Reference(source); + } + + auto typeDesc = source.GetType(); + ++typeDesc.second; + + auto sourceName = source.GetName(); + auto tempOffsetValue = Offset(sourceValue, 0); + + auto value = AllocateImpl( + typeDesc, + sourceValue.IsConstrained() ? sourceValue.GetLayout() : ScalarLayout, + " = &" + tempOffsetValue.GetName() + ";\n", + UniqueName(sourceName + "_ref"), + true); + + if (!sourceValue.IsConstrained()) + { + value.ClearLayout(); + } + + return value; + } + + Value CppEmitterContext::DereferenceImpl(Value sourceValue) + { + auto source = Realize(sourceValue); + if (source.IsConstant()) + { + return _computeContext.Dereference(source); + } + + auto typeDesc = source.GetType(); + --typeDesc.second; + + auto sourceName = source.GetName(); + + auto value = AllocateImpl( + typeDesc, + sourceValue.IsConstrained() ? 
sourceValue.GetLayout() : ScalarLayout, + " = " + sourceName + "[0];\n", + UniqueName(sourceName + "_ref"), + true); + + if (!sourceValue.IsConstrained()) + { + value.ClearLayout(); + } + + return value; + } + + Value CppEmitterContext::OffsetImpl(Value source, Value offset) + { + if (offset.GetLayout() != ScalarLayout) + { + throw LogicException(LogicExceptionErrors::illegalState); + } + + if (source.IsConstant() && offset.IsConstant()) + { + return _computeContext.Offset(source, offset); + } + else + { + auto emittableSource = EnsureEmittable(source); + auto sourceName = emittableSource.GetName(); + + std::string initString = " = &" + sourceName + "[0]"; + if (auto offsetString = ScalarToString(offset); offsetString != "0") + { + initString += " + " + offsetString; + } + initString += ";\n"; + + auto value = AllocateImpl(source.GetType(), std::nullopt, initString, UniqueName(sourceName + "_offset"), true); + + return value; + } + } + + Value CppEmitterContext::UnaryOperationImpl(ValueUnaryOperation op, Value destination) + { + throw LogicException(LogicExceptionErrors::notImplemented); + } + Value CppEmitterContext::BinaryOperationImpl(ValueBinaryOperation op, Value destination, Value source) + { + if (destination.IsConstant() && source.IsConstant()) + { + return _computeContext.BinaryOperation(op, destination, source); + } + + std::string opStr; + bool canSelfAssign; + std::tie(opStr, canSelfAssign) = [op]() -> std::pair { + switch (op) + { + case ValueBinaryOperation::add: + return { " += ", true }; + case ValueBinaryOperation::divide: + return { " /= ", true }; + case ValueBinaryOperation::modulus: + return { " %= ", true }; + case ValueBinaryOperation::multiply: + return { " *= ", true }; + case ValueBinaryOperation::subtract: + return { " -= ", true }; + case ValueBinaryOperation::logicalAnd: + return { " && ", false }; + case ValueBinaryOperation::logicalOr: + return { " || ", false }; + default: + throw LogicException(LogicExceptionErrors::illegalState); + } + }(); + + if (destination.IsDefined()) + { + if (destination.GetLayout() != source.GetLayout()) + { + throw LogicException(LogicExceptionErrors::illegalState); + } + if (destination.GetBaseType() != source.GetBaseType()) + { + throw LogicException(LogicExceptionErrors::illegalState); + } + } + else + { + if (!source.IsConstrained()) + { + throw LogicException(LogicExceptionErrors::illegalState); + } + return source; + } + + const auto& layout = destination.GetLayout(); + auto destStr = GetNameImpl(destination); + auto srcStr = GetNameImpl(source); + if (layout == ScalarLayout) + { + auto sourceString = ScalarToString(source); + switch (op) + { + case ValueBinaryOperation::add: + [[fallthrough]]; + case ValueBinaryOperation::subtract: + if (sourceString == "0") // destination = destination { +, - } 0 + { + return destination; + } + break; + case ValueBinaryOperation::divide: + [[fallthrough]]; + case ValueBinaryOperation::multiply: + if (sourceString == "1") // destination = destination { /, * } 1 + { + return destination; + } + break; + default: + break; + } + if (canSelfAssign) + { + Out() << destStr << "[0]" << opStr << sourceString << ";\n"; + } + else + { + Out() << destStr << "[0] = " << destStr << "[0]" << opStr << sourceString << ";\n"; + } + } + else + { + auto emittableSource = EnsureEmittable(source); + srcStr = emittableSource.GetName(); + + auto iterationVariable = UniqueName("index"); + Out() << "for (size_t " << iterationVariable << " = 0; " << iterationVariable << " < " << layout.GetMemorySize() << "; " << 
iterationVariable << " += " << layout.GetCumulativeIncrement(layout.NumDimensions() - 1) << ") {\n"; + + Indented([&] { + if (canSelfAssign) + { + Out() << destStr << "[" << iterationVariable << "]" << opStr << srcStr << "[" << iterationVariable << "];\n"; + } + else + { + Out() << destStr << "[" << iterationVariable << "] = " << destStr << "[" << iterationVariable << "]" << opStr << srcStr << "[" << iterationVariable << "];\n"; + } + }); + + Out() << "}\n\n"; + } + return destination; + } + + Value CppEmitterContext::LogicalOperationImpl(ValueLogicalOperation op, Value source1, Value source2) + { + if (source1.IsConstant() && source2.IsConstant()) + { + return _computeContext.LogicalOperation(op, source1, source2); + } + + auto opStr = [op]() -> std::string { + switch (op) + { + case ValueLogicalOperation::equality: + return " == "; + case ValueLogicalOperation::inequality: + return " != "; + case ValueLogicalOperation::greaterthan: + return " > "; + case ValueLogicalOperation::greaterthanorequal: + return " >= "; + case ValueLogicalOperation::lessthan: + return " < "; + case ValueLogicalOperation::lessthanorequal: + return " <= "; + default: + throw LogicException(LogicExceptionErrors::illegalState); + } + }(); + + if (source1.GetLayout() == source2.GetLayout()) + { + std::string initString; + if (source1.GetLayout() == ScalarLayout) + { + initString = ScalarToString(source1) + opStr + ScalarToString(source2); + } + else + { + auto emittableSource1 = EnsureEmittable(source1); + auto emittableSource2 = EnsureEmittable(source2); + + initString = GetNameImpl(emittableSource1) + opStr + GetNameImpl(emittableSource2); + } + + return AllocateImpl({ ValueType::Boolean, 1 }, ScalarLayout, "{ " + initString + " };\n"); + } + else + { + throw LogicException(LogicExceptionErrors::notImplemented); + } + } + + Value CppEmitterContext::CastImpl(Value value, ValueType type) + { + if (value.IsConstant()) + { + return _computeContext.Cast(value, type); + } + + if (value.PointerLevel() != 1) + { + throw LogicException(LogicExceptionErrors::illegalState); + } + + if (value.GetBaseType() == type) + { + return value; + } + + if (auto& layout = value.GetLayout(); layout == ScalarLayout) + { + std::string initString = "{ static_cast<" + ValueTypeToCTypeString(type) + ">(" + ScalarToString(value) + ") };\n"; + return AllocateImpl({ type, 1 }, layout, initString); + } + else + { + auto returnValue = Allocate(type, value.GetLayout()); + For(0, + static_cast(layout.GetMemorySize()), + 1, + [&](Scalar index) { + Out() << returnValue.GetName() << "[" << ScalarToString(index) << "] = static_cast<" << ValueTypeToCTypeString(type) + << ">(" << value.GetName() << "[" << ScalarToString(index) << "]);\n"; + }); + return returnValue; + } + } + + class CppEmitterContext::IfContextImpl : public EmitterContext::IfContextImpl + { + public: + IfContextImpl(CppEmitterContext& context, Scalar test, std::function fn) : + _context(context) + { + StreamGuard guard{ *this }; + _context.Out() << "if (" << _context.ScalarToString(test) << ") {\n"; + _context.Indented(fn); + _context.Out() << "}"; + } + + ~IfContextImpl() + { + _sstr << "\n"; + _context._stream.get() << _sstr.str(); + } + + void ElseIf(Scalar test, std::function fn) override + { + StreamGuard guard{ *this }; + _context._stream.get() << " else if (" << _context.ScalarToString(test) << ") {\n"; + _context.Indented(fn); + _context.Out() << "}"; + } + + void Else(std::function fn) override + { + StreamGuard guard{ *this }; + _context._stream.get() << " else {\n"; + 
_context.Indented(fn); + _context.Out() << "}\n"; + } + + private: + struct StreamGuard + { + StreamGuard(IfContextImpl& context) : + _context(context), + _oldStream(_context._context._stream) + { + _context._context._stream = _context._sstr; + } + + ~StreamGuard() + { + _context._context._stream = _oldStream; + } + + private: + IfContextImpl& _context; + std::reference_wrapper _oldStream; + }; + + CppEmitterContext& _context; + std::stringstream _sstr; + }; + + EmitterContext::IfContext CppEmitterContext::IfImpl(Scalar test, std::function fn) + { + return EmitterContext::IfContext{ std::make_unique(*this, test, fn) }; + } + + void CppEmitterContext::WhileImpl(Scalar test, std::function fn) + { + auto testStr = ScalarToString(test); + std::string optionalTag; + std::string name; + if (!name.empty()) + { + optionalTag = " // " + UniqueName(name + " loop"); + } + + Out() << "while (" << testStr << ") {" << optionalTag << "\n"; + + Indented(fn); + + Out() << "}" << optionalTag << "\n\n"; + } + + std::optional CppEmitterContext::CallImpl(FunctionDeclaration func, std::vector args) + { + if (std::any_of(args.begin(), args.end(), [](const auto& value) { return value.IsEmpty(); })) + { + throw InputException(InputExceptionErrors::invalidArgument); + } + + const auto& intrinsics = GetIntrinsics(); + if (std::find(intrinsics.begin(), intrinsics.end(), func) != intrinsics.end()) + { + return IntrinsicCall(func, args); + } + + if (auto it = _definedFunctions.find(func); it != _definedFunctions.end()) + { + return it->second(args); + } + + return EmitExternalCall(func, args); + } + + Value CppEmitterContext::SimpleNumericIntrinsic(FunctionDeclaration intrinsic, std::vector args) + { + if (args.size() != 1) + { + throw InputException(InputExceptionErrors::invalidSize); + } + + const auto& value = args[0]; + if (value.GetBaseType() == ValueType::Boolean) + { + throw InputException(InputExceptionErrors::typeMismatch); + } + + auto typeDesc = value.GetType(); + if (typeDesc.first != ValueType::Float) + { + typeDesc.first = ValueType::Double; + } + + auto returnLayout = value.IsConstrained() ? 
value.GetLayout() : ScalarLayout; + auto fnName = ToLowercase(intrinsic.GetFunctionName()); + + if (returnLayout == ScalarLayout) + { + return AllocateImpl(typeDesc, returnLayout, "{ std::" + fnName + "(" + ScalarToString(value) + ") };\n"); + } + else + { + auto result = Allocate(typeDesc.first, returnLayout); + auto valueStr = ScalarToString(value); + Out() << "std::transform(&" << valueStr + << ", &" << valueStr << " + " << returnLayout.GetMemorySize() + << ", &" << ScalarToString(result) + << ", [](decltype(" << valueStr << ") x) { return std::" << fnName << "(x); });\n"; + return result; + } + } + + Value CppEmitterContext::MaxMinIntrinsic(FunctionDeclaration intrinsic, std::vector args) + { + if (args.size() == 1) + { + const auto& value = args[0]; + if (value.GetBaseType() == ValueType::Boolean) + { + throw InputException(InputExceptionErrors::typeMismatch); + } + + std::string fnName; + if (intrinsic == MaxNumFunctionDeclaration) + { + fnName = "std::max_element"; + } + else if (intrinsic == MinNumFunctionDeclaration) + { + fnName = "std::min_element"; + } + else + { + throw LogicException(LogicExceptionErrors::illegalState); + } + + auto valueStr = ScalarToString(value); + return AllocateImpl( + { value.GetBaseType(), 1 }, + ScalarLayout, + "{ *" + fnName + "(&" + valueStr + + ", &" + valueStr + " + " + std::to_string(value.GetLayout().GetMemorySize()) + + ") };\n"); + } + else if (args.size() == 2) + { + const auto& value1 = args[0]; + const auto& value2 = args[1]; + if (value1.GetBaseType() != value2.GetBaseType()) + { + throw InputException(InputExceptionErrors::typeMismatch); + } + + if (value1.GetBaseType() == ValueType::Boolean) + { + throw InputException(InputExceptionErrors::typeMismatch); + } + + if ((value1.IsConstrained() && value1.GetLayout() != ScalarLayout) || + (value2.IsConstrained() && value2.GetLayout() != ScalarLayout)) + { + throw InputException(InputExceptionErrors::invalidSize); + } + + std::string fnName; + if (intrinsic == MaxNumFunctionDeclaration) + { + fnName = "std::max"; + } + else if (intrinsic == MinNumFunctionDeclaration) + { + fnName = "std::min"; + } + else + { + throw LogicException(LogicExceptionErrors::illegalState); + } + + return AllocateImpl( + { value1.GetBaseType(), 1 }, + ScalarLayout, + "{ " + fnName + "(" + ScalarToString(value1) + + ", " + ScalarToString(value2) + ") };\n"); + } + else + { + throw InputException(InputExceptionErrors::invalidSize); + } + } + + Value CppEmitterContext::PowIntrinsic(FunctionDeclaration, std::vector args) + { + if (args.size() != 2) + { + throw InputException(InputExceptionErrors::invalidSize); + } + + const auto& value1 = args[0]; + const auto& value2 = args[1]; + if (value1.GetBaseType() != value2.GetBaseType()) + { + throw InputException(InputExceptionErrors::typeMismatch); + } + + if (value1.GetBaseType() == ValueType::Boolean) + { + throw InputException(InputExceptionErrors::typeMismatch); + } + + if (value2.IsConstrained() && value2.GetLayout() != ScalarLayout) + { + throw InputException(InputExceptionErrors::invalidSize); + } + + auto typeDesc = value1.GetType(); + if (typeDesc.first != ValueType::Float) + { + typeDesc.first = ValueType::Double; + } + + auto returnLayout = value1.IsConstrained() ? 
value1.GetLayout() : ScalarLayout; + std::string fnName = "std::pow"; + + if (returnLayout == ScalarLayout) + { + return AllocateImpl(typeDesc, returnLayout, "{ " + fnName + "(" + ScalarToString(value1) + ", " + ScalarToString(value2) + ") };\n"); + } + else + { + auto result = Allocate(typeDesc.first, returnLayout); + auto valueStr = ScalarToString(value1); + Out() << "std::transform(&" << valueStr + << ", &" << valueStr << " + " << returnLayout.GetMemorySize() + << ", &" << ScalarToString(result) + << ", [&" << value2.GetName() << "](decltype(" << valueStr + << ") x) { return " << fnName << "(x, " << ScalarToString(value2) << "); });\n"; + return result; + } + } + + Value CppEmitterContext::CopySignIntrinsic(FunctionDeclaration, std::vector args) + { + if (args.size() != 2) + { + throw InputException(InputExceptionErrors::invalidSize); + } + + const auto& value1 = args[0]; + const auto& value2 = args[1]; + if (value1.GetBaseType() != value2.GetBaseType()) + { + throw InputException(InputExceptionErrors::typeMismatch); + } + + if (value1.GetBaseType() == ValueType::Boolean) + { + throw InputException(InputExceptionErrors::typeMismatch); + } + + if ((value1.IsConstrained() && value1.GetLayout() != ScalarLayout) || + (value2.IsConstrained() && value2.GetLayout() != ScalarLayout)) + { + throw InputException(InputExceptionErrors::invalidSize); + } + + auto typeDesc = value1.GetType(); + if (typeDesc.first != ValueType::Float) + { + typeDesc.first = ValueType::Double; + } + + return AllocateImpl(typeDesc, ScalarLayout, "{ std::copysign(" + ScalarToString(value1) + ", " + ScalarToString(value2) + ") };\n"); + } + + Value CppEmitterContext::FmaIntrinsic(FunctionDeclaration, std::vector args) + { + if (args.size() != 3) + { + throw InputException(InputExceptionErrors::invalidSize); + } + + if (std::any_of(args.begin(), args.end(), [](Value& value) { return value.IsConstrained() && value.GetLayout() != ScalarLayout; })) + { + throw InputException(InputExceptionErrors::invalidSize); + } + + const auto& value1 = args[0]; + const auto& value2 = args[1]; + const auto& value3 = args[2]; + if (value1.GetBaseType() != value2.GetBaseType() || value1.GetBaseType() != value3.GetBaseType()) + { + throw InputException(InputExceptionErrors::typeMismatch); + } + + if (value1.GetBaseType() == ValueType::Boolean) + { + throw InputException(InputExceptionErrors::typeMismatch); + } + + auto typeDesc = value1.GetType(); + + return AllocateImpl( + typeDesc, + ScalarLayout, + "{ static_cast<" + ValueTypeToCTypeString(typeDesc.first) + ">(std::fma(" + ScalarToString(value1) + ", " + ScalarToString(value2) + ", " + ScalarToString(value3) + ")) };\n"); + } + + Value CppEmitterContext::MemFnIntrinsic(FunctionDeclaration intrinsic, std::vector args) + { + if (args.size() != 3) + { + throw InputException(InputExceptionErrors::invalidSize); + } + + const auto& value1 = args[0]; + const auto& value2 = args[1]; + const auto& value3 = args[2]; + + if (!value3.IsConstrained() || value3.GetLayout() != ScalarLayout) + { + throw InputException(InputExceptionErrors::invalidArgument); + } + + std::string secondValuePrefix; + if (intrinsic == MemSetFunctionDeclaration) + { + assert((value2.IsConstrained() && value2.GetLayout() == ScalarLayout && value2.GetType() == std::pair{ ValueType::Char8, 1 })); + } + else + { + secondValuePrefix = "&"; + } + + auto fnName = ToLowercase(intrinsic.GetFunctionName()); + auto value1Str = ScalarToString(value1); + Out() << "std::" << fnName << "(&" << value1Str << ", " + << secondValuePrefix << 
ScalarToString(value2) + << ", sizeof(" << value1Str << ") * " << ScalarToString(value3) << ");\n"; + + return {}; // unused + } + + Value CppEmitterContext::IntrinsicCall(FunctionDeclaration intrinsic, std::vector args) + { + if (std::all_of(args.begin(), args.end(), [](const auto& value) { return value.IsConstant(); })) + { + // Compute context can handle intrinsic calls with constant data + return *_computeContext.Call(intrinsic, std::vector(args.begin(), args.end())); + } + + static std::unordered_map)> intrinsics{ + { AbsFunctionDeclaration, &CppEmitterContext::SimpleNumericIntrinsic }, + { CosFunctionDeclaration, &CppEmitterContext::SimpleNumericIntrinsic }, + { ExpFunctionDeclaration, &CppEmitterContext::SimpleNumericIntrinsic }, + { LogFunctionDeclaration, &CppEmitterContext::SimpleNumericIntrinsic }, + { Log10FunctionDeclaration, &CppEmitterContext::SimpleNumericIntrinsic }, + { Log2FunctionDeclaration, &CppEmitterContext::SimpleNumericIntrinsic }, + { MaxNumFunctionDeclaration, &CppEmitterContext::MaxMinIntrinsic }, + { MinNumFunctionDeclaration, &CppEmitterContext::MaxMinIntrinsic }, + { PowFunctionDeclaration, &CppEmitterContext::PowIntrinsic }, + { SinFunctionDeclaration, &CppEmitterContext::SimpleNumericIntrinsic }, + { SqrtFunctionDeclaration, &CppEmitterContext::SimpleNumericIntrinsic }, + { TanhFunctionDeclaration, &CppEmitterContext::SimpleNumericIntrinsic }, + { RoundFunctionDeclaration, &CppEmitterContext::SimpleNumericIntrinsic }, + { FloorFunctionDeclaration, &CppEmitterContext::SimpleNumericIntrinsic }, + { CeilFunctionDeclaration, &CppEmitterContext::SimpleNumericIntrinsic }, + { CopySignFunctionDeclaration, &CppEmitterContext::CopySignIntrinsic }, + { FmaFunctionDeclaration, &CppEmitterContext::FmaIntrinsic }, + { MemCopyFunctionDeclaration, &CppEmitterContext::MemFnIntrinsic }, + { MemMoveFunctionDeclaration, &CppEmitterContext::MemFnIntrinsic }, + { MemSetFunctionDeclaration, &CppEmitterContext::MemFnIntrinsic }, + }; + + std::vector emittableArgs; + emittableArgs.reserve(args.size()); + std::transform(args.begin(), args.end(), std::back_inserter(emittableArgs), [this](const auto& value) { return EnsureEmittable(value); }); + + return std::invoke(intrinsics.at(intrinsic), this, intrinsic, emittableArgs); + } + + std::optional CppEmitterContext::EmitExternalCall(FunctionDeclaration externalFunc, std::vector args) + { + DeclareFunction(externalFunc); + + const auto& argTypes = externalFunc.GetParameterTypes(); + + if (args.size() != argTypes.size()) + { + throw InputException(InputExceptionErrors::sizeMismatch); + } + if (!std::equal(args.begin(), + args.end(), + argTypes.begin(), + argTypes.end(), + [](Value suppliedValue, Value fnValue) { + return suppliedValue.GetBaseType() == fnValue.GetBaseType() && + (suppliedValue.PointerLevel() == fnValue.PointerLevel() || + suppliedValue.PointerLevel() == fnValue.PointerLevel() + 1); + })) + { + throw InputException(InputExceptionErrors::invalidArgument); + } + + // explicitly making a copy + auto returnType = externalFunc.GetReturnType(); + const auto& fnName = externalFunc.GetFunctionName(); + + std::vector params; + for (auto index = 0u; index < args.size(); ++index) + { + auto arg = EnsureEmittable(args[index]); + auto& expected = argTypes[index]; + + std::string param = "&" + ScalarToString(arg); + if (arg.PointerLevel() == expected.PointerLevel() + 1) + { + param = "*(" + param + ")"; + } + params.push_back(param); + } + + std::stringstream funcCallStream; + funcCallStream << fnName << "("; + 
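+            // Each entry in params was rendered above as "&<name>[0]", with an extra "*(...)"
+            // dereference when the supplied value's pointer level is one above the declared
+            // parameter's, so the emitted call passes pointers into the generated arrays.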
+        PrintVector(funcCallStream, params);
+        funcCallStream << ")";
+
+        std::string returnValueString;
+        if (returnType)
+        {
+            auto typeDesc = returnType->GetType();
+            auto layout = returnType->GetLayout();
+            returnType.reset();
+            bool originalScalar = false;
+
+            if (typeDesc.second == 0)
+            {
+                typeDesc.second = 1;
+                originalScalar = true;
+            }
+            std::string initStr = std::string{ " = " } +
+                                  (originalScalar ? "{ " : "") + funcCallStream.str() + (originalScalar ? " }" : "") +
+                                  "; // " + layout.ToString() + "\n\n";
+
+            returnType = AllocateImpl(typeDesc, originalScalar ? ScalarLayout : layout, initStr);
+        }
+        else
+        {
+            Out() << funcCallStream.str() << ";\n\n";
+        }
+
+        return returnType;
+    }
+
+    void CppEmitterContext::PrefetchImpl(Value data, PrefetchType type, PrefetchLocality locality)
+    {
+        // no-op for now
+    }
+
+    void CppEmitterContext::ParallelizeImpl(int numTasks, std::vector<Value> captured, std::function<void(Scalar, std::vector<Value>)> fn)
+    {
+        auto futuresName = UniqueName("futures");
+        Out() << "std::vector<std::future<void>> " << futuresName << ";\n";
+        Out() << futuresName << ".reserve(" << numTasks << ");\n";
+        std::vector<std::string> capturedParams;
+        std::transform(captured.begin(), captured.end(), std::back_inserter(capturedParams), [this](const Value& value) {
+            auto emittableValue = EnsureEmittable(value);
+            return "&" + emittableValue.GetName();
+        });
+
+        ForRange(numTasks, [&](Scalar index) {
+            auto& outStream = Out() << futuresName << ".emplace_back(std::async([";
+            PrintVector(outStream, capturedParams);
+            auto parallelizedIndexName = UniqueName("parallelized_index");
+            outStream << "](int " << parallelizedIndexName << ") {\n";
+            Indented([&] {
+                Scalar parallelizedIndex = AllocateImpl({ ValueType::Int32, 1 }, ScalarLayout, " = { " + parallelizedIndexName + " };\n\n");
+                fn(parallelizedIndex, captured);
+            });
+            Out() << "}, " << ScalarToString(index) << "));\n";
+        });
+
+        Out() << "for (auto& " << futuresName << "_temp : " << futuresName << ") {\n";
+        Indented([&] {
+            Out() << futuresName << "_temp.wait();\n";
+        });
+        Out() << "}\n\n";
+    }
+
+    void CppEmitterContext::DebugBreakImpl()
+    {
+        // no-op for now
+    }
+
+    void CppEmitterContext::DebugDumpImpl(Value value, std::string tag, std::ostream& stream) const
+    {
+        // no-op for now
+    }
+
+    void CppEmitterContext::DebugDumpImpl(FunctionDeclaration fn, std::string tag, std::ostream& stream) const
+    {
+        // no-op for now
+    }
+
+    void CppEmitterContext::DebugPrintImpl(std::string message)
+    {
+        std::string::iterator it;
+        char tempBuffer[10] = {};
+        while ((it = std::find_if_not(message.begin(), message.end(), [](auto c) {
+                    return static_cast<bool>(::isprint(static_cast<unsigned char>(c)));
+                })) != message.end())
+        {
+            snprintf(tempBuffer, sizeof(tempBuffer), "\\x%02x", static_cast<unsigned char>(*it));
+            message.replace(it, it + 1, tempBuffer);
+        }
+        Out() << "std::cout << \"" << message << "\";\n";
+    }
+
+    void CppEmitterContext::SetNameImpl(const Value& value, const std::string& name)
+    {
+        // TODO: fix
+        // value.Get<Emittable>().GetDataAs<ValueImpl*>()->name = name;
+    }
+
+    std::string CppEmitterContext::GetNameImpl(const Value& value) const
+    {
+        return value.IsConstant() ?
_computeContext.GetName(value) : value.Get<Emittable>().GetDataAs<ValueImpl*>()->name;
+    }
+
+    void CppEmitterContext::ImportCodeFileImpl(std::string) { throw LogicException(LogicExceptionErrors::notImplemented); }
+
+    Scalar CppEmitterContext::GetFunctionAddressImpl(const FunctionDeclaration& fn) { throw LogicException(LogicExceptionErrors::notImplemented); }
+
+    std::string CppEmitterContext::GetScopeAdjustedName(GlobalAllocationScope scope, std::string name) const
+    {
+        switch (scope)
+        {
+        case GlobalAllocationScope::Global:
+            return GetGlobalScopedName(name);
+        case GlobalAllocationScope::Function:
+            return GetCurrentFunctionScopedName(name);
+        }
+
+        throw LogicException(LogicExceptionErrors::illegalState);
+    }
+
+    std::string CppEmitterContext::GetGlobalScopedName(std::string name) const
+    {
+        return _moduleName + "_" + name;
+    }
+
+    std::string CppEmitterContext::GetCurrentFunctionScopedName(std::string name) const
+    {
+        if (_fnStacks.empty())
+        {
+            throw LogicException(LogicExceptionErrors::illegalState);
+        }
+
+        return GetGlobalScopedName(_fnStacks.top().name + "_" + name);
+    }
+
+    std::ostream& CppEmitterContext::Out()
+    {
+        return _stream.get() << std::string(2 * _indent, ' ');
+    }
+
+    std::ostream& CppEmitterContext::Global()
+    {
+        return _globalStream;
+    }
+
+    std::ostream& CppEmitterContext::FnDecl() { return _fnDeclStream; }
+
+    Value CppEmitterContext::PromoteConstantData(Value value)
+    {
+        assert(value.IsConstant() && value.IsDefined() && !value.IsEmpty());
+
+        const auto& constantData = _computeContext.GetConstantData(value);
+
+        auto [offset, size] = std::visit(
+            [&value](auto&& data) -> std::pair<ptrdiff_t, int> {
+                using Type = std::decay_t<decltype(data)>;
+                using DataType = typename Type::value_type;
+
+                auto ptrData = std::get<DataType*>(value.GetUnderlyingData());
+                ptrdiff_t offset = ptrData - data.data();
+                return { offset, static_cast<int>(data.size()) };
+            },
+            constantData);
+
+        auto type = value.GetBaseType();
+        auto promotedBaseValue = GlobalAllocateImpl(_fnStacks.empty() ?
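+                                                        // Sketch of the promotion (names illustrative): the constant data becomes
+                                                        // a C++ global definition, e.g. `int32_t module_c_0[] = { ... };`, and any
+                                                        // Value aliasing into the middle of it gets a companion pointer global
+                                                        // `int32_t* module_c_0_offset = &module_c_0[offset];` emitted below.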
GlobalAllocationScope::Global : GlobalAllocationScope::Function,
+                                                    UniqueName("_"),
+                                                    constantData,
+                                                    MemoryLayout{ { size } },
+                                                    AllocateFlags::None);
+
+        _promotedConstantStack.top().push_back({ &constantData, promotedBaseValue.Get<Emittable>() });
+
+        if (offset == 0)
+        {
+            promotedBaseValue.SetLayout(value.GetLayout());
+            return promotedBaseValue;
+        }
+        else
+        {
+            CppEmitterContext::ValueImpl valueDesc{ UniqueName(promotedBaseValue.GetName() + "_offset"), { type, 1 } };
+
+            _globalsList.push_front(std::move(valueDesc));
+            auto& front = _globalsList.front();
+
+            Emittable emittable{ &front };
+            const auto& layout = value.GetLayout();
+
+            _globals[front.name] = { emittable, layout };
+
+            Value value(emittable, layout);
+
+            Global() << ValueTypeToCTypeString(type) << "* " << front.name
+                     << " = &"
+                     << promotedBaseValue.GetName() << "[" << offset << "];\n";
+
+            return value;
+        }
+    }
+
+    std::optional<CppEmitterContext::PromotedConstantDataDescription> CppEmitterContext::HasBeenPromoted(Value value) const
+    {
+        if (!value.IsDefined() || value.IsEmpty() || !value.IsConstant())
+        {
+            return std::nullopt;
+        }
+
+        const auto& constantData = _computeContext.GetConstantData(value);
+        const auto& promotedStack = _promotedConstantStack.top();
+
+        if (auto it = std::find_if(promotedStack.begin(),
+                                   promotedStack.end(),
+                                   [&constantData](const auto& desc) { return desc.data == &constantData; });
+            it != promotedStack.end())
+        {
+            return *it;
+        }
+        else
+        {
+            return std::nullopt;
+        }
+    }
+
+    Value CppEmitterContext::Realize(Value value)
+    {
+        if (auto desc = HasBeenPromoted(value); !desc)
+        {
+            return value;
+        }
+        else
+        {
+            const auto& promotionalDesc = *desc;
+            auto offset = std::visit(
+                [&value](auto&& data) -> ptrdiff_t {
+                    using Type = std::decay_t<decltype(data)>;
+                    using DataType = typename Type::value_type;
+
+                    auto ptrData = std::get<DataType*>(value.GetUnderlyingData());
+
+                    return ptrData - data.data();
+                },
+                *promotionalDesc.data);
+
+            Value newValue = value;
+            auto emittable = promotionalDesc.realValue;
+            if (offset == 0)
+            {
+                newValue.SetData(emittable);
+            }
+            else
+            {
+                auto type = value.GetBaseType();
+                auto valueImpl = emittable.GetDataAs<ValueImpl*>();
+                CppEmitterContext::ValueImpl valueDesc{ UniqueName(valueImpl->name + "_offset"), { type, 1 } };
+
+                _globalsList.push_front(std::move(valueDesc));
+                auto& front = _globalsList.front();
+
+                Emittable offsetEmittable{ &front };
+                const auto& layout = value.GetLayout();
+
+                _globals[front.name] = { offsetEmittable, layout };
+
+                newValue.SetData(offsetEmittable);
+
+                Global() << ValueTypeToCTypeString(type) << "* " << front.name
+                         << " = &"
+                         << valueImpl->name << "[" << offset << "];\n";
+            }
+
+            return newValue;
+        }
+    }
+
+    Value CppEmitterContext::EnsureEmittable(Value value)
+    {
+        if (!value.IsConstant())
+        {
+            return value;
+        }
+        else if (Value newValue = Realize(value); !newValue.IsConstant())
+        {
+            return newValue;
+        }
+        else
+        {
+            return PromoteConstantData(newValue);
+        }
+    }
+
+    // Despite the name, the function does not actually try to ensure the param passed in is a Scalar
+    std::string CppEmitterContext::ScalarToString(ViewAdapter scalar) const
+    {
+        return std::visit(
+            [](auto&& data) {
+                using Type = std::decay_t<decltype(data)>;
+                if constexpr (std::is_same_v<Type, Emittable>)
+                {
+                    return data.template GetDataAs<ValueImpl*>()->name + "[0]";
+                }
+                else
+                {
+                    using RealType = RemoveAllPointersT<Type>;
+                    if constexpr (std::is_same_v<RealType, utilities::Boolean>)
+                    {
+                        return std::to_string(static_cast<bool>(data[0]));
+                    }
+                    else if constexpr (std::is_floating_point_v<RealType>)
+                    {
+                        if (std::trunc(data[0]) == data[0])
+                        {
+                            return std::to_string(static_cast<int64_t>(data[0]));
+                        }
+                        return std::to_string(data[0]);
+                    }
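+                    // Worth noting (an assumption about the intent): std::to_string on a
+                    // floating-point value always prints six decimals ("1.000000"), so
+                    // integral-valued constants are detoured through int64_t above to keep
+                    // the emitted text compact.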
+                    else
+                    {
+                        return std::to_string(data[0]);
+                    }
+                }
+            },
+            scalar.GetValue().GetUnderlyingData());
+    }
+
+    template <typename Fn>
+    void CppEmitterContext::Indented(Fn&& fn)
+    {
+        ++_indent;
+        fn();
+        --_indent;
+    }
+
+} // namespace value
+} // namespace ell
diff --git a/libraries/value/src/EmitterContext.cpp b/libraries/value/src/EmitterContext.cpp
index 7f428f9f4..07a903a71 100644
--- a/libraries/value/src/EmitterContext.cpp
+++ b/libraries/value/src/EmitterContext.cpp
@@ -72,34 +72,61 @@ namespace value
         return std::move(*this);
     }
 
-    void EmitterContext::IfContext::Else(std::function<void()> fn) && { _impl->Else(fn); }
+    void EmitterContext::IfContext::ElseIf(Scalar test, std::function<void()> fn) &
+    {
+        if (test.GetType() != ValueType::Boolean)
+        {
+            throw InputException(InputExceptionErrors::typeMismatch);
+        }
+
+        _impl->ElseIf(test, fn);
+    }
+
+    void EmitterContext::IfContext::Else(std::function<void()> fn) &&
+    {
+        _impl->Else(fn);
+    }
+
+    void EmitterContext::IfContext::Else(std::function<void()> fn) &
+    {
+        _impl->Else(fn);
+    }
 
     EmitterContext::~EmitterContext() = default;
 
-    Value EmitterContext::Allocate(ValueType type, size_t size) { return Allocate(type, MemoryLayout({ (int)size })); }
+    Value EmitterContext::Allocate(ValueType type, size_t size, size_t align, AllocateFlags flags)
+    {
+        return Allocate(type, MemoryLayout({ (int)size }), align, flags);
+    }
 
-    Value EmitterContext::Allocate(ValueType type, MemoryLayout layout) { return AllocateImpl(type, layout); }
+    Value EmitterContext::Allocate(ValueType type, MemoryLayout layout, size_t align, AllocateFlags flags)
+    {
+        return AllocateImpl(type, layout, align, flags);
+    }
 
-    Value EmitterContext::StaticAllocate(std::string name, ValueType type, utilities::MemoryLayout layout)
+    Value EmitterContext::StaticAllocate(std::string name, ValueType type, utilities::MemoryLayout layout, AllocateFlags flags)
     {
-        if (auto globalValue = GetGlobalValue(GlobalAllocationScope::Function, name))
+        if (auto globalValue = GetGlobalValue(GlobalAllocationScope::Function, name, layout))
         {
-            Value value = globalValue.value();
-            if (layout.GetMemorySize() > value.GetLayout().GetMemorySize())
-            {
-                throw InputException(InputExceptionErrors::invalidSize);
-            }
-            value.SetLayout(layout);
+            return *globalValue;
+        }
 
-            return value;
+        return GlobalAllocateImpl(GlobalAllocationScope::Function, name, type, layout, flags);
+    }
+
+    Value EmitterContext::GlobalAllocate(std::string name, ValueType type, utilities::MemoryLayout layout, AllocateFlags flags)
+    {
+        if (auto globalValue = GetGlobalValue(GlobalAllocationScope::Global, name, layout))
+        {
+            return *globalValue;
         }
 
-        return GlobalAllocateImpl(GlobalAllocationScope::Global, name, type, layout);
+        return GlobalAllocateImpl(GlobalAllocationScope::Global, name, type, layout, flags);
     }
 
-    Value EmitterContext::GlobalAllocate(std::string name, ValueType type, utilities::MemoryLayout layout)
+    std::optional<Value> EmitterContext::GetGlobalValue(GlobalAllocationScope scope, std::string name, MemoryLayout layout)
     {
-        if (auto globalValue = GetGlobalValue(GlobalAllocationScope::Global, name))
+        if (auto globalValue = GetGlobalValue(scope, name))
        {
             Value value = globalValue.value();
             if (layout.GetMemorySize() > value.GetLayout().GetMemorySize())
@@ -111,7 +138,7 @@ namespace value
             return value;
         }
 
-        return GlobalAllocateImpl(GlobalAllocationScope::Global, name, type, layout);
+        return std::nullopt;
     }
 
     detail::ValueTypeDescription EmitterContext::GetType(Emittable emittable) { return GetTypeImpl(emittable); }
@@ -128,17 +155,17 @@ namespace value
     Value
EmitterContext::StoreConstantData(ConstantData data) { return StoreConstantDataImpl(data); } - void EmitterContext::For(MemoryLayout layout, std::function)> fn) + void EmitterContext::For(MemoryLayout layout, std::function)> fn, const std::string& name) { if (layout.NumElements() == 0) { return; } - return ForImpl(layout, fn); + return ForImpl(layout, fn, name); } - void EmitterContext::For(Scalar start, Scalar stop, Scalar step, std::function fn) + void EmitterContext::For(Scalar start, Scalar stop, Scalar step, std::function fn, const std::string& name) { if (!(start.GetType() == stop.GetType() && start.GetType() == step.GetType())) { @@ -150,7 +177,7 @@ namespace value throw InputException(InputExceptionErrors::invalidArgument, "start/stop/step must not be boolean"); } - return ForImpl(start, stop, step, fn); + return ForImpl(start, stop, step, fn, name); } void EmitterContext::MoveData(Value& source, Value& destination) { return MoveDataImpl(source, destination); } @@ -161,11 +188,11 @@ namespace value Value EmitterContext::Dereference(Value source) { - if (source.PointerLevel() < 1) + if (source.PointerLevel() < 0) { - throw LogicException(LogicExceptionErrors::illegalState, "Pointer level is less than the expected minimum of 1"); + throw LogicException(LogicExceptionErrors::illegalState, "Pointer level is less than the minimum of 0"); } - else if (source.PointerLevel() == 1) + else if (source.PointerLevel() == 0) { throw LogicException(LogicExceptionErrors::illegalState, "Attempted to dereference Value that is not a reference"); } @@ -225,9 +252,20 @@ namespace value return IfImpl(test, fn); } - std::optional EmitterContext::Call(FunctionDeclaration func, std::vector args) + void EmitterContext::While(Scalar test, std::function fn) + { + if (test.GetType() != ValueType::Boolean) + { + throw InputException(InputExceptionErrors::typeMismatch); + } + + return WhileImpl(test, fn); + } + + std::optional EmitterContext::Call(FunctionDeclaration func, std::vector args) { - return CallImpl(func, args); + std::vector valueArgs(args.begin(), args.end()); + return CallImpl(func, valueArgs); } void EmitterContext::Prefetch(Value data, PrefetchType type, PrefetchLocality locality) @@ -237,9 +275,16 @@ namespace value void EmitterContext::Parallelize(int numTasks, std::vector captured, std::function)> fn) { + if (numTasks == 0) return; + return ParallelizeImpl(numTasks, captured, fn); } + void EmitterContext::DebugBreak() + { + DebugBreakImpl(); + } + void EmitterContext::DebugDump(Value value, std::string tag, std::ostream* stream) const { std::ostream& outStream = stream != nullptr ? 
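+        // Aside, an illustrative use of the ViewAdapter-based Call above: call sites can
+        // now pass Scalar/Vector views directly, e.g.
+        //   GetContext().Call(PowFunctionDeclaration, { base, exponent });
+        // (base/exponent are hypothetical Scalars; no explicit .GetValue() is needed.)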
*stream : std::cerr;
@@ -281,6 +326,11 @@ namespace value
         return GetNameImpl(value);
     }
 
+    void EmitterContext::ImportCodeFile(std::string file)
+    {
+        ImportCodeFileImpl(file);
+    }
+
     const std::vector<std::reference_wrapper<FunctionDeclaration>>& EmitterContext::GetIntrinsics() const
     {
         static std::vector<std::reference_wrapper<FunctionDeclaration>> intrinsics = {
@@ -299,12 +349,71 @@ namespace value
             std::ref(TanhFunctionDeclaration),
             std::ref(RoundFunctionDeclaration),
             std::ref(FloorFunctionDeclaration),
-            std::ref(CeilFunctionDeclaration)
+            std::ref(CeilFunctionDeclaration),
+            std::ref(FmaFunctionDeclaration),
+            std::ref(MemCopyFunctionDeclaration),
+            std::ref(MemMoveFunctionDeclaration),
+            std::ref(MemSetFunctionDeclaration),
         };
 
         return intrinsics;
     }
 
+    std::vector<Value> EmitterContext::NormalizeReferenceLevels(const std::vector<Value>& args, const std::vector<Value>& expected) const
+    {
+        if (args.size() != expected.size())
+        {
+            throw InputException(InputExceptionErrors::sizeMismatch);
+        }
+        std::vector<Value> normalizedArgs;
+        normalizedArgs.reserve(args.size());
+        for (unsigned index = 0; index < args.size(); ++index)
+        {
+            auto& expectedValue = expected[index];
+            auto& arg = args[index];
+            Value value{
+                { expectedValue.GetBaseType(), expectedValue.PointerLevel() },
+                expectedValue.IsConstrained() ? std::optional<MemoryLayout>{ expectedValue.GetLayout() } : std::optional<MemoryLayout>{ std::nullopt }
+            };
+            if (expectedValue.PointerLevel() == arg.PointerLevel())
+            {
+                value.SetData(arg, true);
+            }
+            else if (expectedValue.PointerLevel() == (arg.PointerLevel() - 1))
+            {
+                value.SetData(arg.Dereference(), true);
+            }
+            else
+            {
+                throw LogicException(LogicExceptionErrors::illegalState);
+            }
+            normalizedArgs.push_back(value);
+        }
+        return normalizedArgs;
+    }
+
+    std::string EmitterContext::UniqueName(const std::string& prefix)
+    {
+        auto uniqueId = _uniqueNames[prefix]++;
+        return prefix + "_" + std::to_string(uniqueId);
+    }
+
+    Scalar EmitterContext::GetFunctionAddress(const FunctionDeclaration& decl)
+    {
+        if (const auto& intrinsics = GetIntrinsics();
+            std::find(intrinsics.begin(), intrinsics.end(), decl) != intrinsics.end())
+        {
+            throw InputException(InputExceptionErrors::invalidArgument, "Cannot get function address of intrinsic");
+        }
+
+        return GetFunctionAddressImpl(decl);
+    }
+
+    void swap(EmitterContext& l, EmitterContext& r) noexcept
+    {
+        std::swap(l._uniqueNames, r._uniqueNames);
+    }
+
     namespace
     {
         EmitterContext* s_context = nullptr;
@@ -332,35 +441,66 @@ namespace value
     ContextGuard<>::~ContextGuard() { _oldContext ?
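+    // Usage sketch (hypothetical): `ContextGuard<ComputeContext> guard("module");` swaps
+    // in a fresh context for the enclosing scope; the destructor below then restores
+    // whichever context was active before, or clears it if there was none.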
SetContext(*_oldContext) : ClearContext(); } - Value Allocate(ValueType type, size_t size) { return GetContext().Allocate(type, size); } + Value Allocate(ValueType type, size_t size, size_t align, AllocateFlags flags) + { + return GetContext().Allocate(type, size, align, flags); + } - Value Allocate(ValueType type, MemoryLayout layout) { return GetContext().Allocate(type, layout); } + Value Allocate(ValueType type, MemoryLayout layout, size_t align, AllocateFlags flags) + { + return GetContext().Allocate(type, layout, align, flags); + } - Value StaticAllocate(std::string name, ValueType type, utilities::MemoryLayout layout) + Value StaticAllocate(std::string name, ValueType type, utilities::MemoryLayout layout, AllocateFlags flags) { - return GetContext().StaticAllocate(name, type, layout); + return GetContext().StaticAllocate(name, type, layout, flags); } - Value GlobalAllocate(std::string name, ValueType type, utilities::MemoryLayout layout) + Value GlobalAllocate(std::string name, ValueType type, utilities::MemoryLayout layout, AllocateFlags flags) { - return GetContext().GlobalAllocate(name, type, layout); + return GetContext().GlobalAllocate(name, type, layout, flags); } EmitterContext::IfContext If(Scalar test, std::function fn) { return GetContext().If(test, fn); } + void While(Scalar test, std::function fn) + { + return GetContext().While(test, fn); + } + void ForRange(Scalar end, std::function fn) { - ForRange(0, end, fn); + ForRange(std::string{}, end, fn); + } + + void ForRange(const std::string& name, Scalar end, std::function fn) + { + ForRange(name, Scalar{ 0 }, end, fn); } void ForRange(Scalar start, Scalar end, std::function fn) { - ForRange(start, end, 1, fn); + ForRange(std::string{}, start, end, fn); + } + + void ForRange(const std::string& name, Scalar start, Scalar end, std::function fn) + { + ForRange(name, start, end, 1, fn); } void ForRange(Scalar start, Scalar end, Scalar step, std::function fn) { - GetContext().For(start, end, step, fn); + ForRange(std::string{}, start, end, step, fn); + } + + void ForRange(const std::string& name, Scalar start, Scalar end, Scalar step, std::function fn) + { + GetContext().For(start, end, step, fn, name); + } + + void DebugBreak() + { + GetContext().DebugBreak(); } void DebugDump(FunctionDeclaration fn, std::string tag, std::ostream* stream) @@ -373,6 +513,11 @@ namespace value GetContext().DebugDump(value, tag, stream); } + void DebugPrint(std::string message) + { + GetContext().DebugPrint(message); + } + void Parallelize(int numTasks, std::vector captured, std::function)> fn) { GetContext().Parallelize(numTasks, captured, fn); @@ -458,6 +603,11 @@ namespace value return *GetContext().Call(CeilFunctionDeclaration, { s.GetValue() }); } + Scalar Fma(Scalar a, Scalar b, Scalar c) + { + return *GetContext().Call(FmaFunctionDeclaration, { a.GetValue(), b.GetValue(), c.GetValue() }); + } + Scalar Sign(Scalar s) { return *GetContext().Call(CopySignFunctionDeclaration, { Cast(1, s.GetType()).GetValue(), s.GetValue() }); @@ -541,9 +691,10 @@ namespace value { If(v == Cast(0, v.GetType()), [&] { r = Cast(1, v.GetType()); - }).Else([&] { - r = Cast(0, v.GetType()); - }); + }) + .Else([&] { + r = Cast(0, v.GetType()); + }); } return r; } @@ -562,5 +713,38 @@ namespace value return *GetContext().Call(CeilFunctionDeclaration, { v.GetValue() }); } + void MemCopy(ViewAdapter dest, ViewAdapter source, std::optional length) + { + (void)GetContext().Call(MemCopyFunctionDeclaration, { dest, source, 
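+            // When `length` is omitted, the element count defaults to the source layout's
+            // full memory size, so (hypothetically) MemCopy(dst, src) copies all of src,
+            // while MemCopy(dst, src, 4) copies four elements' worth of bytes.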
length.value_or(static_cast(source.GetValue().GetLayout().GetMemorySize())).GetValue() }); + } + + void MemMove(ViewAdapter dest, ViewAdapter source, std::optional length) + { + (void)GetContext().Call(MemMoveFunctionDeclaration, { dest, source, length.value_or(static_cast(source.GetValue().GetLayout().GetMemorySize())).GetValue() }); + } + + void MemSet(ViewAdapter dest, Scalar data, std::optional length) + { + if (data.GetType() != ValueType::Char8) + { + throw InputException(InputExceptionErrors::typeMismatch, "Memory pattern specified by data is expected to be of type Char8"); + } + + (void)GetContext().Call(MemSetFunctionDeclaration, { dest, data.GetValue(), length.value_or(static_cast(dest.GetValue().GetLayout().GetMemorySize())).GetValue() }); + } + + void ZeroMemory(ViewAdapter dest, std::optional length) + { + // As of 9/11/2019, when compiling with C++17, `Value{ char{} }` in place of `Value(char{})` + // triggers the `std::initializer_list` ctor for Value, instead of the `Value(T)` ctor. + // This might change in C++20. + MemSet(dest, Value(char{}), length); + } + + std::string UniqueName(const std::string& prefix) + { + return GetContext().UniqueName(prefix); + } + } // namespace value } // namespace ell diff --git a/libraries/value/src/FunctionDeclaration.cpp b/libraries/value/src/FunctionDeclaration.cpp index e68d88200..467583375 100644 --- a/libraries/value/src/FunctionDeclaration.cpp +++ b/libraries/value/src/FunctionDeclaration.cpp @@ -10,6 +10,8 @@ #include +#include + namespace ell { namespace value @@ -19,9 +21,22 @@ namespace value FunctionDeclaration::FunctionDeclaration(std::string name) : _originalFunctionName(name), _isEmpty(false) - {} + { + if (!std::isalpha(_originalFunctionName[0]) && _originalFunctionName[0] != '_') + { + throw InputException(InputExceptionErrors::invalidArgument, "Function names must begin with an _ or alphabetical character"); + } + } + + FunctionDeclaration& FunctionDeclaration::DefineFromFile(std::string file) + { + CheckNonEmpty(); + + _importedSource = file; + return *this; + } - FunctionDeclaration& FunctionDeclaration::Returns(Value returnType) + FunctionDeclaration& FunctionDeclaration::Returns(ViewAdapter returnType) { CheckNonEmpty(); @@ -29,26 +44,45 @@ namespace value return *this; } - FunctionDeclaration& FunctionDeclaration::Parameters(std::vector paramTypes) + FunctionDeclaration& FunctionDeclaration::Parameters(std::vector paramTypes) { CheckNonEmpty(); - _paramTypes = paramTypes; + _paramTypes.assign(paramTypes.begin(), paramTypes.end()); return *this; } - FunctionDeclaration& FunctionDeclaration::Decorated(FunctionDecorated shouldDecorate) + FunctionDeclaration& FunctionDeclaration::Decorated(bool shouldDecorate) { CheckNonEmpty(); - _isDecorated = shouldDecorate == FunctionDecorated::Yes; + _isDecorated = shouldDecorate; return *this; } - std::optional FunctionDeclaration::Call(std::vector arguments) const + FunctionDeclaration& FunctionDeclaration::Public(bool isPublic) + { + _isPublic = isPublic; + return *this; + } + + FunctionDeclaration& FunctionDeclaration::Inlined(FunctionInlining shouldInline) { CheckNonEmpty(); + _inlineState = shouldInline; + return *this; + } + + std::optional FunctionDeclaration::Call(std::vector arguments) const + { + CheckNonEmpty(); + + if (!_importedSource.empty() && !IsDefined()) + { + GetContext().ImportCodeFile(_importedSource); + } + return GetContext().Call(*this, arguments); } @@ -61,9 +95,25 @@ namespace value if (!_decoratedFunctionName) { size_t hash = 0; - HashCombine(hash, 
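+        // Sketch of the new naming scheme introduced below: the decorated symbol becomes
+        // originalName_<hash of return and parameter type descriptions>, so two
+        // declarations of "Foo" with different signatures emit distinct C symbols.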
_returnType); - HashCombine(hash, _paramTypes); - _decoratedFunctionName = _originalFunctionName + std::to_string(hash); + if(_returnType) + { + HashCombine(hash, static_cast(_returnType->GetBaseType())); + HashCombine(hash, _returnType->PointerLevel()); + if (_returnType->IsConstrained()) + { + HashCombine(hash, _returnType->GetLayout()); + } + } + for(auto p: _paramTypes) + { + HashCombine(hash, static_cast(p.GetBaseType())); + HashCombine(hash, p.PointerLevel()); + if (p.IsConstrained()) + { + HashCombine(hash, p.GetLayout()); + } + } + _decoratedFunctionName = _originalFunctionName + "_" + std::to_string(hash); } return *_decoratedFunctionName; } @@ -87,6 +137,13 @@ namespace value return _returnType; } + bool FunctionDeclaration::IsPublic() const + { + CheckNonEmpty(); + + return _isPublic; + } + bool FunctionDeclaration::IsDefined() const { CheckNonEmpty(); @@ -94,8 +151,21 @@ namespace value return GetContext().IsFunctionDefined(*this); } + bool FunctionDeclaration::IsImported() const + { + CheckNonEmpty(); + + return !_importedSource.empty(); + } + bool FunctionDeclaration::IsEmpty() const { return _isEmpty; } + FunctionInlining FunctionDeclaration::InlineState() const + { + CheckNonEmpty(); + return _inlineState; + } + void FunctionDeclaration::CheckNonEmpty() const { if (_isEmpty) @@ -104,27 +174,41 @@ namespace value } } + Scalar FunctionDeclaration::GetPointer() const + { + if (_pointer) + { + return *_pointer; + } + + return GetContext().GetFunctionAddress(*this); + } + FunctionDeclaration DeclareFunction(std::string name) { return FunctionDeclaration(name); } - /*extern*/ FunctionDeclaration AbsFunctionDeclaration = DeclareFunction("Abs"); - /*extern*/ FunctionDeclaration CosFunctionDeclaration = DeclareFunction("Cos"); - /*extern*/ FunctionDeclaration CopySignFunctionDeclaration = DeclareFunction("CopySign"); - /*extern*/ FunctionDeclaration ExpFunctionDeclaration = DeclareFunction("Exp"); - /*extern*/ FunctionDeclaration LogFunctionDeclaration = DeclareFunction("Log"); - /*extern*/ FunctionDeclaration Log10FunctionDeclaration = DeclareFunction("Log10"); - /*extern*/ FunctionDeclaration Log2FunctionDeclaration = DeclareFunction("Log2"); - /*extern*/ FunctionDeclaration MaxNumFunctionDeclaration = DeclareFunction("MaxNum"); - /*extern*/ FunctionDeclaration MinNumFunctionDeclaration = DeclareFunction("MinNum"); - /*extern*/ FunctionDeclaration PowFunctionDeclaration = DeclareFunction("Pow"); - /*extern*/ FunctionDeclaration SinFunctionDeclaration = DeclareFunction("Sin"); - /*extern*/ FunctionDeclaration SqrtFunctionDeclaration = DeclareFunction("Sqrt"); - /*extern*/ FunctionDeclaration TanhFunctionDeclaration = DeclareFunction("Tanh"); - /*extern*/ FunctionDeclaration RoundFunctionDeclaration = DeclareFunction("Round"); - /*extern*/ FunctionDeclaration FloorFunctionDeclaration = DeclareFunction("Floor"); - /*extern*/ FunctionDeclaration CeilFunctionDeclaration = DeclareFunction("Ceil"); + /*extern*/ FunctionDeclaration AbsFunctionDeclaration = DeclareFunction("Abs").Decorated(false); + /*extern*/ FunctionDeclaration CosFunctionDeclaration = DeclareFunction("Cos").Decorated(false); + /*extern*/ FunctionDeclaration CopySignFunctionDeclaration = DeclareFunction("CopySign").Decorated(false); + /*extern*/ FunctionDeclaration ExpFunctionDeclaration = DeclareFunction("Exp").Decorated(false); + /*extern*/ FunctionDeclaration LogFunctionDeclaration = DeclareFunction("Log").Decorated(false); + /*extern*/ FunctionDeclaration Log10FunctionDeclaration = 
DeclareFunction("Log10").Decorated(false); + /*extern*/ FunctionDeclaration Log2FunctionDeclaration = DeclareFunction("Log2").Decorated(false); + /*extern*/ FunctionDeclaration MaxNumFunctionDeclaration = DeclareFunction("MaxNum").Decorated(false); + /*extern*/ FunctionDeclaration MinNumFunctionDeclaration = DeclareFunction("MinNum").Decorated(false); + /*extern*/ FunctionDeclaration PowFunctionDeclaration = DeclareFunction("Pow").Decorated(false); + /*extern*/ FunctionDeclaration SinFunctionDeclaration = DeclareFunction("Sin").Decorated(false); + /*extern*/ FunctionDeclaration SqrtFunctionDeclaration = DeclareFunction("Sqrt").Decorated(false); + /*extern*/ FunctionDeclaration TanhFunctionDeclaration = DeclareFunction("Tanh").Decorated(false); + /*extern*/ FunctionDeclaration RoundFunctionDeclaration = DeclareFunction("Round").Decorated(false); + /*extern*/ FunctionDeclaration FloorFunctionDeclaration = DeclareFunction("Floor").Decorated(false); + /*extern*/ FunctionDeclaration CeilFunctionDeclaration = DeclareFunction("Ceil").Decorated(false); + /*extern*/ FunctionDeclaration FmaFunctionDeclaration = DeclareFunction("Fma").Decorated(false); + /*extern*/ FunctionDeclaration MemCopyFunctionDeclaration = DeclareFunction("MemCpy").Decorated(false); + /*extern*/ FunctionDeclaration MemMoveFunctionDeclaration = DeclareFunction("MemMove").Decorated(false); + /*extern*/ FunctionDeclaration MemSetFunctionDeclaration = DeclareFunction("MemSet").Decorated(false); } // namespace value } // namespace ell diff --git a/libraries/value/src/LLVMContext.cpp b/libraries/value/src/LLVMContext.cpp index 8a3f88bb8..848b14537 100644 --- a/libraries/value/src/LLVMContext.cpp +++ b/libraries/value/src/LLVMContext.cpp @@ -11,13 +11,13 @@ #include "Scalar.h" #include "Value.h" -#include - +#include #include -#include - #include +#include + +#include using namespace std::string_literals; @@ -131,7 +131,7 @@ namespace value return type; } - VariableType ValueTypeToVariableType(ValueType type) + VariableType ValueTypeToVariableType(ValueType type, int ptrLevel = 1) { // clang-format off @@ -141,7 +141,12 @@ namespace value #define VALUE_TYPE_TO_VARIABLE_TYPE_PTR(x) \ case ValueType::x: \ - return VariableType::x##Pointer + if (ptrLevel == 0) \ + return VariableType::x; \ + else if (ptrLevel == 1) \ + return VariableType::x##Pointer; \ + else \ + return VariableType::x##PointerPointer // clang-format on @@ -183,14 +188,25 @@ namespace value return true; } + emitters::FunctionInlining GetEmittersFunctionInlining(FunctionInlining inlining) + { + switch (inlining) + { + case FunctionInlining::always: + return emitters::FunctionInlining::always; + case FunctionInlining::never: + return emitters::FunctionInlining::never; + default: + return emitters::FunctionInlining::defaultInline; + } + } + bool IncrementMemoryCoordinate(std::vector& coordinate, const std::vector& maxCoordinate) { assert(coordinate.size() == maxCoordinate.size()); return IncrementMemoryCoordinateImpl(static_cast(maxCoordinate.size()) - 1, coordinate, maxCoordinate); } - LLVMValue ToLLVMValue(Value value) { return value.Get().GetDataAs(); } - auto SimpleNumericalFunctionIntrinsic(LLVMFunction (IRRuntime::*intrinsicFn)(VariableType)) -> std::function)> { return [intrinsicFn](IRFunctionEmitter& fnEmitter, std::vector args) -> Value { @@ -370,6 +386,68 @@ namespace value }; } + auto FmaFunctionIntrinsic() -> std::function)> + { + return [](IRFunctionEmitter& fnEmitter, std::vector args) -> Value { + if (args.size() != 3) + { + throw 
InputException(InputExceptionErrors::invalidSize); + } + + if (std::any_of(args.begin(), args.end(), [](Value& value) { return value.IsConstrained() && value.GetLayout() != ScalarLayout; })) + { + throw InputException(InputExceptionErrors::invalidSize); + } + + const auto& value1 = args[0]; + const auto& value2 = args[1]; + const auto& value3 = args[2]; + if (value1.GetBaseType() != value2.GetBaseType() || value1.GetBaseType() != value3.GetBaseType()) + { + throw InputException(InputExceptionErrors::typeMismatch); + } + + if (value1.GetBaseType() == ValueType::Boolean) + { + throw InputException(InputExceptionErrors::typeMismatch); + } + + Value result = value::Allocate(value1.GetBaseType(), ScalarLayout); + + auto llvmValue1 = fnEmitter.ValueAt(ToLLVMValue(value1), 0); + auto llvmValue2 = fnEmitter.ValueAt(ToLLVMValue(value2), 0); + auto llvmValue3 = fnEmitter.ValueAt(ToLLVMValue(value3), 0); + + auto variableType = [type = value1.GetBaseType()] { + switch (type) + { + case ValueType::Double: + return VariableType::Double; + default: + return VariableType::Float; + } + }(); + + // TODO: fix this so that GetNonPointerType call isn't needed + auto value1VariableType = GetNonPointerType(ValueTypeToVariableType(value1.GetBaseType(), value1.PointerLevel())); + if (variableType != value1VariableType) + { + llvmValue1 = fnEmitter.CastValue(llvmValue1, variableType); + llvmValue2 = fnEmitter.CastValue(llvmValue2, variableType); + llvmValue3 = fnEmitter.CastValue(llvmValue3, variableType); + } + auto llvmFunc = fnEmitter.GetModule().GetRuntime().GetFmaFunction(variableType); + auto callResult = fnEmitter.Call(llvmFunc, { llvmValue1, llvmValue2, llvmValue3 }); + if (variableType != value1VariableType) + { + callResult = fnEmitter.CastValue(callResult, value1VariableType); + } + auto resultValue = ToLLVMValue(result); + fnEmitter.SetValueAt(resultValue, 0, callResult); + return result; + }; + } + enum class MaxMinIntrinsic { Max, @@ -498,6 +576,71 @@ namespace value }; } + enum class MemIntrinsicOp + { + Copy, + Move, + Set + }; + auto MemOpFunctionIntrinsic(MemIntrinsicOp intrinsic) -> std::function)> + { + return [intrinsic](IRFunctionEmitter& fnEmitter, std::vector args) -> Value { + if (args.size() != 3) + { + throw InputException(InputExceptionErrors::invalidSize); + } + + const auto& value1 = args[0]; + const auto& value2 = args[1]; + const auto& value3 = args[2]; + + if (!value3.IsConstrained() || value3.GetLayout() != ScalarLayout) + { + throw InputException(InputExceptionErrors::invalidArgument); + } + + if (intrinsic == MemIntrinsicOp::Set) + { + assert((value2.IsConstrained() && value2.GetLayout() == ScalarLayout && value2.GetType() == std::pair{ ValueType::Char8, 1 })); + } + + llvm::CallInst* (IREmitter::*memFn)(LLVMValue, LLVMValue, LLVMValue){}; + switch (intrinsic) + { + case MemIntrinsicOp::Copy: + memFn = &IREmitter::MemoryCopy; + break; + case MemIntrinsicOp::Move: + memFn = &IREmitter::MemoryMove; + break; + case MemIntrinsicOp::Set: + memFn = &IREmitter::MemorySet; + break; + default: + assert(false); + } + + auto llvmValue1 = ToLLVMValue(value1); + auto llvmValue2 = ToLLVMValue(value2); + auto llvmValue3 = fnEmitter.Load(ToLLVMValue(value3)); + auto llvmType = llvmValue1->getType()->getContainedType(0); + llvmValue3 = fnEmitter.LocalScalar(static_cast(fnEmitter.GetEmitter().SizeOf(llvmType))) * llvmValue3; + + if (intrinsic == MemIntrinsicOp::Set) + { + llvmValue2 = fnEmitter.Load(llvmValue2); + + // we're going to swap the first two params because MemorySet DOES have 
destination first... + std::swap(llvmValue1, llvmValue2); + } + + // IREmitter takes the first two parameters in the opposite order of everyone else! + (void)std::invoke(memFn, fnEmitter.GetEmitter(), llvmValue2, llvmValue1, llvmValue3); + + return {}; // ignored + }; + } + void ConstantForLoop(const MemoryLayout& layout, std::function fn) { auto maxCoordinate = layout.GetActiveSize().ToVector(); @@ -520,7 +663,9 @@ namespace value struct LLVMContext::FunctionScope { - FunctionScope(LLVMContext& context, IRFunctionEmitter& emitter) : + FunctionScope( + LLVMContext& context, + IRFunctionEmitter& emitter) : context(context) { context._functionStack.push(emitter); @@ -545,15 +690,32 @@ namespace value }; LLVMContext::LLVMContext(IRModuleEmitter& emitter) : + EmitterContext(emitter.GetCompilerOptions().targetDevice), _emitter(emitter), _computeContext(_emitter.GetModuleName()) { _promotedConstantStack.push({}); } - IRModuleEmitter& LLVMContext::GetModuleEmitter() const { return _emitter; } + LLVMContext::LLVMContext(std::unique_ptr&& emitter) : + EmitterContext(emitter->GetCompilerOptions().targetDevice), + _ownedEmitter(std::move(emitter)), + _emitter(*_ownedEmitter), + _computeContext(_emitter.GetModuleName()) + { + _promotedConstantStack.push({}); + } + + LLVMContext::LLVMContext(const std::string& moduleName, const CompilerOptions& parameters) : + LLVMContext(std::make_unique(moduleName, parameters)) + {} + + IRModuleEmitter& LLVMContext::GetModuleEmitter() const + { + return _emitter; + } - Value LLVMContext::AllocateImpl(ValueType type, MemoryLayout layout) + Value LLVMContext::AllocateImpl(ValueType type, MemoryLayout layout, size_t alignment, AllocateFlags flags) { auto& fn = GetFunctionEmitter(); auto& irEmitter = fn.GetEmitter(); @@ -561,8 +723,11 @@ namespace value auto llvmType = ValueTypeToLLVMType(irEmitter, { type, 0 }); assert(!llvmType->isPointerTy()); auto allocatedVariable = fn.Variable(llvmType, layout.GetMemorySize()); + if (alignment != 0) + { + allocatedVariable->setAlignment(alignment); + } fn.StoreZero(allocatedVariable, layout.GetMemorySize()); - return { Emittable{ allocatedVariable }, layout }; } @@ -577,7 +742,7 @@ namespace value return std::nullopt; } - Value LLVMContext::GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ConstantData data, MemoryLayout layout) + Value LLVMContext::GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ConstantData data, MemoryLayout layout, AllocateFlags flags) { std::string adjustedName = GetScopeAdjustedName(scope, name); @@ -588,9 +753,10 @@ namespace value } llvm::GlobalVariable* global = std::visit( - [this, &adjustedName](auto&& vectorData) { + [this, flags, &adjustedName](auto&& vectorData) { using Type = std::decay_t; + bool isThreadLocal = (flags & AllocateFlags::ThreadLocal) == AllocateFlags::ThreadLocal; if constexpr (std::is_same_v>) { // IREmitter stores a vector of bool values as a bitvector, which @@ -601,14 +767,15 @@ namespace value // originally, this was a vector of bools. This will be rectified // in the near future. 
(2018-11-08) std::vector transformedData(vectorData.begin(), vectorData.end()); - return _emitter.GlobalArray(adjustedName, transformedData); + return _emitter.GlobalArray(adjustedName, transformedData, isThreadLocal); } else { - return _emitter.GlobalArray(adjustedName, vectorData); + return _emitter.GlobalArray(adjustedName, vectorData, isThreadLocal); } }, data); + auto dereferencedGlobal = _emitter.GetIREmitter().PointerOffset(global, _emitter.GetIREmitter().Literal(0)); Emittable emittable{ dereferencedGlobal }; @@ -617,7 +784,7 @@ namespace value return Value(emittable, layout); } - Value LLVMContext::GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ValueType type, MemoryLayout layout) + Value LLVMContext::GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ValueType type, MemoryLayout layout, AllocateFlags flags) { std::string adjustedName = GetScopeAdjustedName(scope, name); @@ -629,7 +796,8 @@ namespace value auto global = _emitter.GlobalArray(adjustedName, ValueTypeToLLVMType(_emitter.GetIREmitter(), { type, 0 }), - layout.GetMemorySize()); + layout.GetMemorySize(), + (flags & AllocateFlags::ThreadLocal) == AllocateFlags::ThreadLocal); auto dereferencedGlobal = _emitter.GetIREmitter().PointerOffset(global, _emitter.GetIREmitter().Literal(0)); @@ -664,23 +832,40 @@ namespace value std::vector variableArgTypes(argValues.size()); std::transform(argValues.begin(), argValues.end(), variableArgTypes.begin(), [](Value value) { - return ValueTypeToVariableType(value.GetBaseType()); + return ValueTypeToVariableType(value.GetBaseType(), value.PointerLevel()); }); const auto& fnName = decl.GetFunctionName(); + auto argValuesCopy = argValues; { ValueType returnValueType = returnValue ? returnValue->GetBaseType() : ValueType::Void; FunctionScope scope(*this, fnName, ValueTypeToVariableType(returnValueType), variableArgTypes); - GetFunctionEmitter().SetAttributeForArguments(IRFunctionEmitter::Attributes::NoAlias); - auto functionArgs = GetFunctionEmitter().Arguments(); - auto argValuesCopy = argValues; + auto& fnEmitter = GetFunctionEmitter(); + if (decl.IsPublic()) + { + fnEmitter.IncludeInHeader(); + } + fnEmitter.SetAttributeForArguments(IRFunctionEmitter::Attributes::NoAlias); + fnEmitter.SetInlineState(GetEmittersFunctionInlining(decl.InlineState())); + + auto functionArgs = fnEmitter.Arguments(); auto returnValueCopy = returnValue; for (std::pair idx{ 0u, functionArgs.begin() }; idx.first < argValuesCopy.size(); ++idx.first, ++idx.second) { - idx.second->setName(std::string{ "arg" } + std::to_string(idx.first)); - argValuesCopy[idx.first].SetData(Emittable{ idx.second }); + auto& argValueCopy = argValuesCopy[idx.first]; + auto argName = std::string{ "arg" } + std::to_string(idx.first); + auto inputName = argValueCopy.GetName(); + idx.second->setName(argName); + argValueCopy.SetData(Emittable{ idx.second }); + + if (auto argType = idx.second->getType(); argValueCopy.IsConstrained() && argType->isPointerTy()) + { + auto innerType = argType->getPointerElementType(); + uint64_t bytes = argValueCopy.GetLayout().GetMemorySize() * fnEmitter.GetEmitter().SizeOf(innerType); + fnEmitter.SetAttributeForArgument(idx.first, IRFunctionEmitter::Attributes::Dereferenceable, bytes); + } } returnValueCopy = fn(argValuesCopy); @@ -694,7 +879,7 @@ namespace value } } - DefinedFunction returnFn = [this, decl](std::vector args) -> std::optional { + DefinedFunction returnFn = [this, decl, llvmExpectedValues = argValuesCopy](std::vector args) -> std::optional { const auto& 
argValues = decl.GetParameterTypes(); const auto& returnValue = decl.GetReturnType(); const auto& fnName = decl.GetFunctionName(); @@ -710,13 +895,12 @@ namespace value throw InputException(InputExceptionErrors::invalidArgument); } - std::vector llvmArgs(args.size()); - std::transform(args.begin(), args.end(), llvmArgs.begin(), [this](Value& arg) { - return EnsureEmittable(arg).Get().GetDataAs(); - }); + std::vector emittableArgs = EnsureEmittable(args); + auto normalizedArgs = NormalizeReferenceLevels(emittableArgs, llvmExpectedValues); + std::vector llvmArgs = ToLLVMValue(normalizedArgs); auto returnValueCopy = returnValue; - LLVMValue fnReturnValue = _emitter.GetCurrentFunction().Call(fnName, llvmArgs); + LLVMValue fnReturnValue = GetFunctionEmitter().Call(fnName, llvmArgs); if (returnValueCopy) { returnValueCopy->SetData(Emittable{ fnReturnValue }); @@ -737,12 +921,17 @@ namespace value return true; } - return _definedFunctions.find(decl) != _definedFunctions.end(); + if (_definedFunctions.find(decl) != _definedFunctions.end()) + { + return true; + } + + return _emitter.HasFunction(decl.GetFunctionName()); } Value LLVMContext::StoreConstantDataImpl(ConstantData data) { return _computeContext.StoreConstantData(data); } - void LLVMContext::ForImpl(MemoryLayout layout, std::function)> fn) + void LLVMContext::ForImpl(MemoryLayout layout, std::function)> fn, const std::string& name) { std::vector ranges(layout.NumDimensions()); for (auto index = 0u; index < ranges.size(); ++index) @@ -754,6 +943,7 @@ namespace value auto logicalOrder = layout.GetLogicalDimensionOrder(); GetFunctionEmitter().For( + name, ranges, [&fn, logicalOrder](emitters::IRFunctionEmitter&, std::vector indices) { std::vector logicalIndices(indices.size()); @@ -765,7 +955,7 @@ namespace value }); } - void LLVMContext::ForImpl(Scalar start, Scalar stop, Scalar step, std::function fn) + void LLVMContext::ForImpl(Scalar start, Scalar stop, Scalar step, std::function fn, const std::string& name) { auto startValue = EnsureEmittable(start.GetValue()); auto stopValue = EnsureEmittable(stop.GetValue()); @@ -780,6 +970,7 @@ namespace value Scalar index = value::Allocate(ScalarLayout); fnEmitter.For( + name, fnEmitter.Load(ToLLVMValue(startValue)), fnEmitter.Load(ToLLVMValue(stopValue)), fnEmitter.Load(ToLLVMValue(stepValue)), @@ -815,15 +1006,74 @@ namespace value } else { - if (!TypeCompatible(destination, source) && - (destination.PointerLevel() == source.PointerLevel() || - destination.PointerLevel() == (1 + source.PointerLevel()))) + if (!TypeCompatible(destination, source)) { throw InputException(InputExceptionErrors::typeMismatch); } + enum class CopyType + { + DirectScalarPassThrough, + DirectScalarCopy, + Memory, + Reference + }; + CopyType copyType{}; + + if (auto srcPtrLevel = source.PointerLevel(); srcPtrLevel < 0) + { + throw LogicException(LogicExceptionErrors::illegalState); + } + else if (srcPtrLevel == 0) + { + switch (destination.PointerLevel()) + { + case 0: + copyType = CopyType::DirectScalarPassThrough; + break; + case 1: + copyType = CopyType::DirectScalarCopy; + break; + default: + throw LogicException(LogicExceptionErrors::illegalState); + } + if (source.GetLayout() != ScalarLayout || destination.GetLayout() != ScalarLayout) + { + throw InputException(InputExceptionErrors::invalidSize); + } + } + else if (srcPtrLevel == 1) + { + if (destination.PointerLevel() != 1) + { + throw LogicException(LogicExceptionErrors::illegalState); + } + else + { + if (destination.GetLayout() != source.GetLayout()) + { + throw 
InputException(InputExceptionErrors::sizeMismatch); + } + copyType = CopyType::Memory; + } + } + else if (destination.PointerLevel() == srcPtrLevel) + { + assert(srcPtrLevel > 1); + if (source.IsConstant()) + { + throw LogicException(LogicExceptionErrors::illegalState); + } + copyType = CopyType::Reference; + } + else + { + throw LogicException(LogicExceptionErrors::illegalState); + } + auto& irEmitter = _emitter.GetIREmitter(); auto destValue = ToLLVMValue(destination); + if (source.IsConstant()) { // we're only copying active areas below. should we copy padded too? @@ -847,10 +1097,21 @@ namespace value { return; } - if (auto& layout = source.GetLayout(); layout.IsContiguous()) + switch (copyType) { - if (destination.PointerLevel() == source.PointerLevel()) + case CopyType::DirectScalarPassThrough: + destination.SetData(Emittable{ srcValue }); + break; + case CopyType::DirectScalarCopy: + { + auto destAtOffset = irEmitter.PointerOffset(destValue, irEmitter.Zero(VariableType::Int32)); + irEmitter.Store(destAtOffset, srcValue); + break; + } + case CopyType::Memory: + if (auto& layout = source.GetLayout(); layout.IsContiguous()) { + assert(copyType == CopyType::Memory); auto llvmType = srcValue->getType()->getContainedType(0); auto primSize = irEmitter.SizeOf(llvmType); auto memorySize = irEmitter.Literal(static_cast(layout.GetMemorySize() * primSize)); @@ -860,17 +1121,24 @@ namespace value } else { - auto destAtOffset = irEmitter.PointerOffset(destValue, irEmitter.Zero(VariableType::Int32)); - irEmitter.Store(destAtOffset, srcValue); + ForImpl( + layout, [&](std::vector index) { + auto offsetSource = source.Offset(detail::CalculateOffset(source.GetLayout(), index)); + auto offsetDest = destination.Offset(detail::CalculateOffset(destination.GetLayout(), index)); + (void)irEmitter.Store(ToLLVMValue(offsetDest), irEmitter.Load(ToLLVMValue(offsetSource))); + }, + ""); } - } - else + break; + case CopyType::Reference: { - ForImpl(layout, [&](std::vector index) { - auto offsetSource = source.Offset(detail::CalculateOffset(source.GetLayout(), index)); - auto offsetDest = destination.Offset(detail::CalculateOffset(destination.GetLayout(), index)); - (void)irEmitter.Store(ToLLVMValue(offsetDest), irEmitter.Load(ToLLVMValue(offsetSource))); - }); + auto srcAtOffset = irEmitter.Load(srcValue); + irEmitter.Store(destValue, srcAtOffset); + destination.SetLayout(source.GetLayout()); + break; + } + default: + throw LogicException(LogicExceptionErrors::illegalState); } } } @@ -963,13 +1231,17 @@ namespace value destination = Allocate(source.GetBaseType(), source.GetLayout()); } - if (!TypeCompatible(destination, source) && - (destination.PointerLevel() == source.PointerLevel() || - destination.PointerLevel() == (1 + source.PointerLevel()))) + if (!TypeCompatible(destination, source)) { throw InputException(InputExceptionErrors::typeMismatch); } + if (!((source.PointerLevel() == 0 || source.PointerLevel() == 1) && + (destination.PointerLevel() == 0 || destination.PointerLevel() == 1))) + { + throw InputException(InputExceptionErrors::invalidArgument); + } + if (destination.GetLayout() != source.GetLayout()) { throw InputException(InputExceptionErrors::sizeMismatch); @@ -1018,6 +1290,17 @@ namespace value } opFn = [&fn](auto dst, auto src) { return fn.Operator(TypedOperator::moduloSigned, dst, src); }; break; + case ValueBinaryOperation::logicalAnd: + [[fallthrough]]; + case ValueBinaryOperation::logicalOr: + if (destination.GetBaseType() != ValueType::Boolean) + { + throw 
InputException(InputExceptionErrors::invalidArgument); + } + opFn = [&fn, op](auto dst, auto src) { + return fn.Operator(op == ValueBinaryOperation::logicalAnd ? TypedOperator::logicalAnd : TypedOperator::logicalOr, dst, src); + }; + break; default: throw LogicException(LogicExceptionErrors::illegalState); } @@ -1031,22 +1314,43 @@ namespace value // If the pointer levels don't match, it means the source is not a pointer (logically) // and we just need to do an assignment of the value to the value pointed to by // destintion - bool scalarLLVMSource = source.PointerLevel() != destination.PointerLevel(); - ForImpl(layout, [&](std::vector index) { - LLVMValue srcValue = nullptr; - if (scalarLLVMSource) - { - srcValue = ToLLVMValue(source); - } - else - { - auto offsetSource = source.Offset(detail::CalculateOffset(source.GetLayout(), index)); - srcValue = fn.Load(ToLLVMValue(offsetSource)); - } - auto offsetDest = destination.Offset(detail::CalculateOffset(destination.GetLayout(), index)); - auto destValue = ToLLVMValue(offsetDest); - fn.Store(destValue, opFn(fn.Load(destValue), srcValue)); - }); + bool scalarLLVMSource = source.PointerLevel() == 0; + bool scalarLLVMDestination = destination.PointerLevel() == 0; + ForImpl( + layout, [&](std::vector index) { + LLVMValue srcValue = nullptr; + if (scalarLLVMSource) + { + srcValue = ToLLVMValue(source); + } + else + { + auto offsetSource = source.Offset(detail::CalculateOffset(source.GetLayout(), index)); + srcValue = fn.Load(ToLLVMValue(offsetSource)); + } + + LLVMValue destValue = nullptr; + LLVMValue destValueOffset = nullptr; + if (scalarLLVMDestination) + { + destValue = ToLLVMValue(destination); + } + else + { + destValueOffset = ToLLVMValue(destination.Offset(detail::CalculateOffset(destination.GetLayout(), index))); + destValue = fn.Load(destValueOffset); + } + auto result = opFn(destValue, srcValue); + if (!scalarLLVMDestination) + { + fn.Store(destValueOffset, result); + } + else + { + const_cast(destination).SetData(Emittable{ result }); + } + }, + ""); } else { @@ -1113,16 +1417,19 @@ namespace value auto& fn = this->GetFunctionEmitter(); auto result = fn.TrueBit(); - ForImpl(source1.GetLayout(), [&](std::vector index) { - auto offsetSource1 = source1.Offset(detail::CalculateOffset(source1.GetLayout(), index)); - auto offsetSource2 = source2.Offset(detail::CalculateOffset(source2.GetLayout(), index)); - result = fn.LogicalAnd( - result, - fn.Comparison( - comparisonOp, - fn.Load(ToLLVMValue(offsetSource1)), - fn.Load(ToLLVMValue(offsetSource2)))); - }); + ForImpl( + source1.GetLayout(), + [&](std::vector index) { + auto offsetSource1 = source1.Offset(detail::CalculateOffset(source1.GetLayout(), index)); + auto offsetSource2 = source2.Offset(detail::CalculateOffset(source2.GetLayout(), index)); + result = fn.LogicalAnd( + result, + fn.Comparison( + comparisonOp, + fn.Load(ToLLVMValue(offsetSource1)), + fn.Load(ToLLVMValue(offsetSource2)))); + }, + ""); return { Emittable{ result }, ScalarLayout }; }, @@ -1131,20 +1438,30 @@ namespace value using CastType = std::conditional_t, bool, Type>; auto& fn = this->GetFunctionEmitter(); - auto result = fn.TrueBit(); + LLVMValue result = nullptr; auto llvmOp1 = source1Data.GetDataAs(); - - ConstantForLoop(source1.GetLayout(), [&](const MemoryCoordinates& logicalCoordinates) { - auto source1Offset = source1.GetLayout().GetLogicalEntryOffset(logicalCoordinates); - auto source2Offset = source2.GetLayout().GetLogicalEntryOffset(logicalCoordinates); - - result = fn.LogicalAnd( - result, - 
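+                        // Note on the scalar path above: a pointer-level-0 destination has
+                        // no storage to write through, so the computed result replaces the
+                        // destination's underlying SSA value via SetData rather than a store.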
fn.Comparison( - comparisonOp, - fn.ValueAt(llvmOp1, source1Offset), - fn.Literal(static_cast(source2Data[source2Offset])))); - }); + if (source1.PointerLevel() == 0) + { + result = fn.Comparison( + comparisonOp, + llvmOp1, + fn.Literal(static_cast(source2Data[0]))); + } + else + { + result = fn.TrueBit(); + ConstantForLoop(source1.GetLayout(), [&](const MemoryCoordinates& logicalCoordinates) { + auto source1Offset = source1.GetLayout().GetLogicalEntryOffset(logicalCoordinates); + auto source2Offset = source2.GetLayout().GetLogicalEntryOffset(logicalCoordinates); + + result = fn.LogicalAnd( + result, + fn.Comparison( + comparisonOp, + fn.ValueAt(llvmOp1, source1Offset), + fn.Literal(static_cast(source2Data[source2Offset])))); + }); + } return { Emittable{ result }, ScalarLayout }; } }, @@ -1224,7 +1541,7 @@ namespace value } else { - testValue = ToLLVMValue(value); + testValue = value::ToLLVMValue(value); } _ifEmitter.ElseIf(testValue, [fn = std::move(fn)](auto&) { fn(); }); @@ -1258,6 +1575,22 @@ namespace value return { std::make_unique(std::move(ifEmitter), fnEmitter) }; } + void LLVMContext::WhileImpl(Scalar test, std::function fn) + { + auto& fnEmitter = GetFunctionEmitter(); + LLVMValue testValue = nullptr; + if (auto value = test.GetValue(); value.IsConstant()) + { + testValue = fnEmitter.Literal(static_cast(test.Get())); + } + else + { + testValue = ToLLVMValue(value); + } + + fnEmitter.While(testValue, [fn = std::move(fn)](auto&) { fn(); }); + } + std::optional LLVMContext::CallImpl(FunctionDeclaration func, std::vector args) { if (std::any_of(args.begin(), args.end(), [](const auto& value) { return value.IsEmpty(); })) @@ -1275,9 +1608,9 @@ namespace value { return it->second(args); } - return EmitExternalCall(func, args); } + void LLVMContext::PrefetchImpl(Value data, PrefetchType type, PrefetchLocality locality) { if (data.IsConstant()) @@ -1431,8 +1764,37 @@ namespace value return _computeContext.GetName(realized); } - auto llvmValue = ToLLVMValue(realized); - return llvmValue->getName(); + auto llvmValue = *ToLLVMValue(realized); + if (llvmValue != nullptr) + return llvmValue->getName(); + else + return ""; + } + + void LLVMContext::ImportCodeFileImpl(std::string filename) + { + if (auto lowercaseFilename = utilities::ToLowercase(filename); utilities::EndsWith(lowercaseFilename, ".ll")) + { + _emitter.LoadIRFromFile(filename); + } + else if (utilities::EndsWith(lowercaseFilename, ".s")) + { + _emitter.LoadAsmFromFile(filename); + } + else + { + throw LogicException(LogicExceptionErrors::illegalState, "[LLVMContext] Don't know how to import code file " + filename); + } + } + + Scalar LLVMContext::GetFunctionAddressImpl(const FunctionDeclaration& fn) + { + auto llvmFn = DeclareFunction(fn); + + auto& fnEmitter = GetFunctionEmitter(); + auto fnAddress = fnEmitter.CastPointerToInt(llvmFn, VariableType::Int64); + fnAddress->setName(fn.GetFunctionName() + "Ptr"); + return Value(Emittable{ fnAddress }, ScalarLayout); } Value LLVMContext::IntrinsicCall(FunctionDeclaration intrinsic, std::vector args) @@ -1454,12 +1816,16 @@ namespace value { FloorFunctionDeclaration, SimpleNumericalFunctionIntrinsic(&IRRuntime::GetFloorFunction) }, { CeilFunctionDeclaration, SimpleNumericalFunctionIntrinsic(&IRRuntime::GetCeilFunction) }, { CopySignFunctionDeclaration, CopySignFunctionIntrinsic() }, + { FmaFunctionDeclaration, FmaFunctionIntrinsic() }, + { MemCopyFunctionDeclaration, MemOpFunctionIntrinsic(MemIntrinsicOp::Copy) }, + { MemMoveFunctionDeclaration, 
MemOpFunctionIntrinsic(MemIntrinsicOp::Move) }, + { MemSetFunctionDeclaration, MemOpFunctionIntrinsic(MemIntrinsicOp::Set) }, }; if (std::all_of(args.begin(), args.end(), [](const auto& value) { return value.IsConstant(); })) { // Compute context can handle intrinsic calls with constant data - return *_computeContext.Call(intrinsic, args); + return *_computeContext.Call(intrinsic, std::vector(args.begin(), args.end())); } std::vector emittableArgs; @@ -1482,39 +1848,14 @@ namespace value throw InputException(InputExceptionErrors::sizeMismatch); } - auto& irEmitter = _emitter.GetIREmitter(); - auto& fnEmitter = GetFunctionEmitter(); - const auto& returnType = externalFunc.GetReturnType(); - // Create external function declaration - const auto& fnName = externalFunc.GetFunctionName(); - if (!_emitter.HasFunction(fnName)) - { - auto resultType = [&] { - if (returnType) - { - return ValueTypeToLLVMType(irEmitter, { returnType->GetBaseType(), returnType->PointerLevel() }); - } - else - { - return ValueTypeToLLVMType(irEmitter, { ValueType::Void, 0 }); - } - }(); - - std::vector paramTypes(argTypes.size()); - std::transform(argTypes.begin(), argTypes.end(), paramTypes.begin(), [&](const auto& value) { - return ValueTypeToLLVMType(irEmitter, { value.GetBaseType(), value.PointerLevel() }); - }); - - auto fnType = llvm::FunctionType::get(resultType, paramTypes, false); - _emitter.DeclareFunction(fnName, fnType); - } - auto fn = _emitter.GetFunction(fnName); + DeclareFunction(externalFunc); // as a first approximation, if the corresponding arg type has a pointer level that's one less // than the passed in value, we dereference it. if it's the same, we pass it in as is. if it's anything else, // throw. this logic may not be sufficient for future use cases. + auto& fnEmitter = GetFunctionEmitter(); std::vector argValues; argValues.reserve(args.size()); for (auto idx = 0u; idx < args.size(); ++idx) @@ -1542,7 +1883,39 @@ namespace value } } + auto fn = [&, this]() -> LLVMFunction { + if (externalFunc.IsPointerSet()) + { + auto llvmFuncAddr = ToLLVMValue(externalFunc.GetPointer().GetValue()); + auto& fnEmitter = GetFunctionEmitter(); + auto fnType = ToLLVMFunctionType(externalFunc); + auto fnPtr = fnEmitter.CastIntToPointer(llvmFuncAddr, fnType->getPointerTo()); + return llvm::dyn_cast(fnPtr); + } + else + { + const auto& fnName = externalFunc.GetFunctionName(); + return _emitter.GetFunction(fnName); + } + }(); + assert(fn != nullptr); + auto resultValue = fnEmitter.Call(fn, argValues); + auto callInst = llvm::dyn_cast(resultValue); + if (callInst) + { + auto callingConv = fn->getCallingConv(); + callInst->setCallingConv(callingConv); + if (externalFunc.InlineState() != FunctionInlining::defaultInline) + { + IRFunctionEmitter::SetInlineState(fn, GetEmittersFunctionInlining(externalFunc.InlineState())); + + // Not sure if this is actually necessary or helpful: + llvm::InlineFunctionInfo inliner; + llvm::InlineFunction(callInst, inliner); + } + } + auto result = returnType; if (result) { @@ -1598,6 +1971,27 @@ namespace value return _functionStack.top().get(); } + emitters::LLVMFunction LLVMContext::DeclareFunction(const FunctionDeclaration& func) + { + // Create external function declaration + const auto& fnName = func.GetFunctionName(); + if (auto llvmFn = _emitter.GetFunction(fnName); llvmFn != nullptr) + { + return llvmFn; + } + + auto fnType = ToLLVMFunctionType(func); + auto fn = _emitter.DeclareFunction(fnName, fnType); + IRFunctionEmitter::SetInlineState(fn, 
GetEmittersFunctionInlining(func.InlineState())); + return fn; + } + + void LLVMContext::DebugBreakImpl() + { + auto& fn = GetFunctionEmitter(); + fn.DebugBreak(); + } + Value LLVMContext::PromoteConstantData(Value value) { assert(value.IsConstant() && value.IsDefined() && !value.IsEmpty()); @@ -1729,7 +2123,7 @@ namespace value newValue.SetData(Emittable{ fn.PointerOffset(emittable.GetDataAs(), static_cast(offset)) }); if (!name.empty()) { - ToLLVMValue(newValue)->setName(name); + (*ToLLVMValue(newValue))->setName(name); } return newValue; @@ -1752,5 +2146,86 @@ namespace value } } + std::vector LLVMContext::EnsureEmittable(std::vector values) + { + std::vector emittables(values.size()); + std::transform(values.begin(), values.end(), emittables.begin(), [this](Value& arg) -> Value { + return EnsureEmittable(arg); + }); + return emittables; + } + + std::optional LLVMContext::ToLLVMValue(Value value) const + { + if (value.IsConstant()) + { + return std::nullopt; + } + return value.Get().GetDataAs(); + } + + LLVMValue LLVMContext::ToLLVMValue(Value value) + { + return EnsureEmittable(value).Get().GetDataAs(); + } + + std::vector> LLVMContext::ToLLVMValue(std::vector values) const + { + std::vector> llvmValues(values.size()); + std::transform(values.begin(), values.end(), llvmValues.begin(), [this](Value& value) -> std::optional { + return ToLLVMValue(value); + }); + return llvmValues; + } + + std::vector LLVMContext::ToLLVMValue(std::vector values) + { + std::vector llvmValues(values.size()); + std::transform(values.begin(), values.end(), llvmValues.begin(), [this](Value& value) -> LLVMValue { + return ToLLVMValue(value); + }); + return llvmValues; + } + + LLVMFunctionType LLVMContext::ToLLVMFunctionType(const FunctionDeclaration& func) const + { + auto& irEmitter = _emitter.GetIREmitter(); + const auto& argTypes = func.GetParameterTypes(); + const auto& returnType = func.GetReturnType(); + + // Create external function declaration + auto resultType = returnType ? 
ValueTypeToLLVMType(irEmitter, { returnType->GetBaseType(), returnType->PointerLevel() }) : ValueTypeToLLVMType(irEmitter, { ValueType::Void, 0 }); + std::vector paramTypes(argTypes.size()); + std::transform(argTypes.begin(), argTypes.end(), paramTypes.begin(), [&](const auto& value) { + return ValueTypeToLLVMType(irEmitter, { value.GetBaseType(), value.PointerLevel() }); + }); + + return llvm::FunctionType::get(resultType, paramTypes, false); + } + + LLVMValue ToLLVMValue(Value value) + { + auto val = InvokeForContext([&](LLVMContext& context) { + return context.ToLLVMValue(value); + }); + return val.value_or(nullptr); + } + + LLVMValue ToLLVMValue(ViewAdapter value) + { + auto val = InvokeForContext([&](LLVMContext& context) { + return context.ToLLVMValue(value); + }); + return val.value_or(nullptr); + } + + std::vector ToLLVMValue(std::vector values) + { + std::vector llvmValues(values.size()); + std::transform(values.begin(), values.end(), llvmValues.begin(), [](Value& value) -> LLVMValue { + return ToLLVMValue(value); + }); + return llvmValues; + } } // namespace value } // namespace ell diff --git a/libraries/value/src/LoopNests.cpp b/libraries/value/src/LoopNests.cpp new file mode 100644 index 000000000..9a39f2c53 --- /dev/null +++ b/libraries/value/src/LoopNests.cpp @@ -0,0 +1,354 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: LoopNests.cpp (value) +// Authors: Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "LoopNests.h" + +#include "loopnests/CodeGenerator.h" +#include "loopnests/CodePositionConstraints.h" +#include "loopnests/IndexRange.h" +#include "loopnests/Kernel.h" +#include "loopnests/LoopNest.h" + +#include +#include +#include + +using namespace ell::utilities; + +namespace ell +{ +namespace value +{ + class LoopNestImpl + { + public: + ~LoopNestImpl() = default; + + void Using(std::initializer_list inputs, ArgumentType argType) + { + for (auto input : inputs) + { + _arguments.emplace_back(input, argType); + } + } + + void ForAll(Index index, int begin, int end) + { + _ranges.emplace_back(index, loopnests::Range{ begin, end }); + } + + void EnsureCreated() + { + if (!_nest.has_value()) + { + _nest = loopnests::LoopNest(_ranges); + } + } + + void Do(std::function)> fn, std::vector kernelOuterIndices, std::string kernelId) + { + std::vector arguments; + arguments.reserve(_arguments.size()); + for (const auto& arg : _arguments) + { + arguments.push_back(arg.first); + } + std::vector indices; + for (const auto& index : _ranges) + { + indices.push_back(index.GetIndex()); + } + auto name = UniqueName("kernel"); + if (kernelId.empty()) + { + kernelId = name; + } + auto kernel = loopnests::Kernel(name, kernelId) + .Inputs(arguments) + .Indices(indices) + .Define(fn); + + Do(kernel, kernelOuterIndices); + } + + void Do(Kernel kernelFn, std::vector kernelOuterIndices) + { + _kernels.push_back(std::move(kernelFn)); + auto& kernel = _kernels.back(); + + EnsureCreated(); + if (kernelOuterIndices.empty()) + { + _nest->AddKernel(kernel, loopnests::LoopNest::ConstraintType::constraint); + } + else + { + loopnests::CodePositionConstraints constraints{ loopnests::LoopFragmentType::body, kernelOuterIndices, {} }; + _nest->AddKernel(kernel, constraints); + } + } + + void Do(Kernel kernelFn, const loopnests::KernelPredicate& predicate, const loopnests::KernelPredicate& placement) + { + 
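// Take ownership of the kernel, then register it with the nest under the given invocation (predicate) and placement constraints. + 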
_kernels.push_back(std::move(kernelFn)); + auto& kernel = _kernels.back(); + + EnsureCreated(); + _nest->AddKernel(kernel, predicate, placement); + } + + Index Split(Index& index, int factor) + { + EnsureCreated(); + // TODO: this might not be needed in the high level api + auto it = _splits.find({ index, factor }); + if (it == _splits.end()) + { + auto splitResult = _nest->Split(index, factor); + it = _splits.insert(it, { { index, factor }, splitResult }); + } + + index = it->second.inner; + return it->second.outer; + } + + void Parallelize(Index index) + { + EnsureCreated(); + _nest->Parallelize(index); + } + + void Unroll(Index index) + { + EnsureCreated(); + _nest->Unroll(index); + } + + void SetOrder(std::vector indices) + { + EnsureCreated(); + _nest->SetLoopOrder(indices); + } + + void Run() const + { + loopnests::CodeGenerator{}.Run(*_nest); + } + + loopnests::LoopNest& GetUnderlyingLoopNest() + { + EnsureCreated(); + return *_nest; + } + + private: + std::vector> _arguments; + std::vector _ranges; + std::vector _kernels; + std::optional _nest; + std::map, loopnests::SplitIndex> _splits; + }; + + LoopNest Using(std::initializer_list inputs, ArgumentType argType) + { + return LoopNest{}.Using(inputs, argType); + } + + // + // LoopNest + // + + LoopNest::LoopNest() : + _impl(std::make_unique()), + _schedule(*this) {} + LoopNest::LoopNest(const LoopNest& other) : + _impl(std::make_unique(*other._impl)), + _schedule(*this) {} + LoopNest::LoopNest(LoopNest&& other) noexcept : + _impl(std::move(other._impl)), + _schedule(*this) {} + + LoopNest& LoopNest::operator=(const LoopNest& other) + { + if (this != &other) + { + *_impl = *other._impl; + } + + return *this; + } + + LoopNest& LoopNest::operator=(LoopNest&& other) noexcept + { + if (this != &other) + { + _impl = std::move(other._impl); + } + + return *this; + } + + LoopNest::~LoopNest() = default; + + LoopNest& LoopNest::Using(std::initializer_list inputs, ArgumentType argType) + { + _impl->Using(inputs, argType); + + return *this; + } + + LoopNest& LoopNest::ForAll(Index index, int begin, int end) + { + _impl->ForAll(index, begin, end); + + return *this; + } + + LoopNest& LoopNest::Do(std::function)> fn, std::vector kernelOuterIndices, std::string kernelId) + { + _impl->Do(fn, kernelOuterIndices, kernelId); + + return *this; + } + + LoopNest& LoopNest::Do(std::function)> fn, std::string kernelId) + { + return Do(fn, {}, kernelId); + } + + LoopNest& LoopNest::Do(Kernel kernel, std::vector kernelOuterIndices) + { + _impl->Do(kernel, kernelOuterIndices); + + return *this; + } + + LoopNest& LoopNest::Do(Kernel kernel, const loopnests::KernelPredicate& predicate, const loopnests::KernelPredicate& placement) + { + _impl->Do(kernel, predicate, placement); + + return *this; + } + + void LoopNest::Run() const + { + _impl->Run(); + } + + loopnests::LoopNest& LoopNest::GetUnderlyingLoopNest() + { + return _impl->GetUnderlyingLoopNest(); + } + + const loopnests::LoopNest& LoopNest::GetUnderlyingLoopNest() const + { + return _impl->GetUnderlyingLoopNest(); + } + + Schedule& LoopNest::GetSchedule() + { + return _schedule; + } + + void swap(LoopNest& nest1, LoopNest& nest2) noexcept + { + using std::swap; + swap(nest1._impl, nest2._impl); + } + + // + // Schedule + // + + Schedule::Schedule(LoopNest& nest) : + _nest(nest), + _impl(*nest._impl) + {} + + Schedule::Schedule(const Schedule& other) = default; + Schedule& Schedule::operator=(const Schedule& other) = default; + + Index Schedule::Split(Index& index, int factor) + { + return 
_impl.get().Split(index, factor); + } + + void Schedule::Parallelize(Index index) + { + _impl.get().Parallelize(index); + } + + Index Schedule::Parallelize(Index index, int factor) + { + auto outer = Split(index, factor); + Parallelize(outer); + return outer; + } + + void Schedule::Unroll(Index index) + { + _impl.get().Unroll(index); + } + + Index Schedule::Unroll(Index index, int factor) + { + auto outer = Split(index, factor); + Unroll(outer); + return outer; + } + + void Schedule::Cache(std::unique_ptr provider) + { + provider->HandleCaching(_nest.get()); + } + + utilities::MemoryShape Schedule::GetShapeFromIndicesIncrement(std::vector& kernelIndices) + { + std::vector sizes; + for (auto index : kernelIndices) + { + auto range = _nest.get().GetUnderlyingLoopNest().GetIndexRange(index); + sizes.push_back(range.Increment()); + } + + return { sizes }; + } + + void Schedule::Cache( + CachingProvider& provider, + ViewAdapter view, + std::vector kernelIndices, + utilities::MemoryShape size, + std::vector atIndices, + std::optional order, + std::any extras) + { + if (size.NumDimensions() == 0) + { + // Figure out size based on increment of the indices + size = GetShapeFromIndicesIncrement(kernelIndices); + }; + + provider.Initialize( + view, + size, + order.value_or(DimensionOrder(size.NumDimensions())), + kernelIndices, + atIndices.empty() ? kernelIndices : atIndices, + extras); + + provider.HandleCaching(_nest.get()); + } + + void Schedule::SetOrder(std::vector indices) + { + _impl.get().SetOrder(indices); + } + +} // namespace value +} // namespace ell diff --git a/libraries/value/src/Matrix.cpp b/libraries/value/src/Matrix.cpp index 807fe4671..ba00f16ae 100644 --- a/libraries/value/src/Matrix.cpp +++ b/libraries/value/src/Matrix.cpp @@ -62,7 +62,7 @@ namespace value Scalar Matrix::operator()(Scalar rowIndex, Scalar columnIndex) { - Value indexedValue = GetContext().Offset(_value, { rowIndex, columnIndex }); + Value indexedValue = GetContext().Offset(_value, { rowIndex, columnIndex }); indexedValue.SetLayout(ScalarLayout); return indexedValue; @@ -135,6 +135,11 @@ namespace value size_t Matrix::Columns() const { return static_cast(_value.GetLayout().GetLogicalDimensionActiveSize(1)); } + Matrix::MatrixLayout Matrix::GetMatrixLayout() const + { + return _value.GetLayout().IsCanonicalOrder() ? 
MatrixLayout::rowMajor : MatrixLayout::columnMajor; + } + ValueType Matrix::Type() const { return _value.GetBaseType(); } void Matrix::SetName(const std::string& name) { _value.SetName(name); } diff --git a/libraries/value/src/MatrixOperations.cpp b/libraries/value/src/MatrixOperations.cpp index a7833f9fa..c90150ad2 100644 --- a/libraries/value/src/MatrixOperations.cpp +++ b/libraries/value/src/MatrixOperations.cpp @@ -20,15 +20,15 @@ using namespace utilities; namespace value { - Matrix ToMatrix(Value data, int numRows, int numCols) - { - Value matrix = data; - auto size = data.GetLayout().GetActiveSize().NumElements(); - if (size != numRows * numCols || !data.GetLayout().IsContiguous()) - { - throw InputException(InputExceptionErrors::invalidArgument, - ell::utilities::FormatString("data must be contiguous and have size %zu = %d * %d", size, numRows, numCols)); - } + Matrix ToMatrix(Value data, int numRows, int numCols) + { + Value matrix = data; + auto size = data.GetLayout().GetActiveSize().NumElements(); + if (size != numRows * numCols || !data.GetLayout().IsContiguous()) + { + throw InputException(InputExceptionErrors::invalidArgument, + ell::utilities::FormatString("data must be contiguous and have size %zu = %d * %d", size, numRows, numCols)); + } matrix.SetLayout(utilities::MemoryLayout{ { numRows, numCols } }); return matrix; } @@ -45,6 +45,11 @@ namespace value } void For(Matrix matrix, std::function fn) + { + For(std::string{}, matrix, fn); + } + + void For(const std::string& name, Matrix matrix, std::function fn) { auto layout = matrix.GetValue().GetLayout(); if (layout.NumDimensions() != 2) @@ -53,9 +58,12 @@ namespace value "Layout being looped over must be two-dimensional"); } - GetContext().For(layout, [fn = std::move(fn)](std::vector coordinates) { - fn(coordinates[0], coordinates[1]); - }); + GetContext().For( + layout, + [fn = std::move(fn)](std::vector coordinates) { + fn(coordinates[0], coordinates[1]); + }, + name); } Matrix GEMM(Matrix m1, Matrix m2) { throw LogicException(LogicExceptionErrors::notImplemented); } @@ -67,7 +75,7 @@ namespace value if (m.Columns() != v.Size()) { throw InputException(InputExceptionErrors::invalidArgument, - ell::utilities::FormatString("Vector size %d must match number of columns in the matrix %d", v.Size(), m.Columns())); + ell::utilities::FormatString("Vector size %d must match number of columns in the matrix %d", v.Size(), m.Columns())); } first = 1; For(m, [&](Scalar row, Scalar col) { diff --git a/libraries/value/src/Print.cpp b/libraries/value/src/Print.cpp new file mode 100644 index 000000000..481d13b13 --- /dev/null +++ b/libraries/value/src/Print.cpp @@ -0,0 +1,65 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: Print.cpp (value) +// Authors: Chuck Jacobs +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "Print.h" +#include "LLVMContext.h" + +#include +#include + +namespace ell +{ +namespace value +{ + void Print(const std::string& text) + { + if (!InvokeForContext([&](LLVMContext& context) -> bool { + context.GetFunctionEmitter().Print(text); + return true; + })) + { + std::printf("%s", text.c_str()); + } + } + + void Printf(const std::vector& arguments) + { + if (!InvokeForContext([&](LLVMContext& context) -> bool { + std::vector args; + std::transform(arguments.begin(), arguments.end(), std::back_inserter(args), [&](auto x) { + return 
ToLLVMValue(x); + }); + + context.GetFunctionEmitter().Printf(args); + return true; + })) + { + std::printf(""); + } + } + + void Printf(const std::string& format, const std::vector& arguments) + { + if (!InvokeForContext([&](LLVMContext& context) -> bool { + std::vector args; + std::transform(arguments.begin(), arguments.end(), std::back_inserter(args), [&](auto x) { + return ToLLVMValue(x); + }); + + context.GetFunctionEmitter().Printf(format, args); + return true; + })) + { + std::printf("%s", format.c_str()); + } + } + +} // namespace value +} // namespace ell + +#pragma endregion implementation diff --git a/libraries/value/src/Scalar.cpp b/libraries/value/src/Scalar.cpp index 9f35d0f65..c5cfa7530 100644 --- a/libraries/value/src/Scalar.cpp +++ b/libraries/value/src/Scalar.cpp @@ -103,32 +103,27 @@ namespace value // Free function operator overloads Scalar operator+(Scalar s1, Scalar s2) { - Scalar copy = s1.Copy(); - return copy += s2; + return Add(s1, s2); } Scalar operator-(Scalar s1, Scalar s2) { - Scalar copy = s1.Copy(); - return copy -= s2; + return Subtract(s1, s2); } Scalar operator*(Scalar s1, Scalar s2) { - Scalar copy = s1.Copy(); - return copy *= s2; + return Multiply(s1, s2); } Scalar operator/(Scalar s1, Scalar s2) { - Scalar copy = s1.Copy(); - return copy /= s2; + return Divide(s1, s2); } Scalar operator%(Scalar s1, Scalar s2) { - Scalar copy = s1.Copy(); - return copy %= s2; + return Modulo(s1, s2); } Scalar operator-(Scalar s) diff --git a/libraries/value/src/ScalarOperations.cpp b/libraries/value/src/ScalarOperations.cpp new file mode 100644 index 000000000..026d0db46 --- /dev/null +++ b/libraries/value/src/ScalarOperations.cpp @@ -0,0 +1,58 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: ScalarOperations.cpp (value) +// Authors: Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "ScalarOperations.h" +#include "EmitterContext.h" +#include "Scalar.h" + +#include + +namespace ell +{ +using namespace utilities; + +namespace value +{ + + Scalar Add(Scalar s1, Scalar s2) + { + Scalar copy = s1.Copy(); + return copy += s2; + } + + Scalar Subtract(Scalar s1, Scalar s2) + { + Scalar copy = s1.Copy(); + return copy -= s2; + } + + Scalar Multiply(Scalar s1, Scalar s2) + { + Scalar copy = s1.Copy(); + return copy *= s2; + } + + Scalar Divide(Scalar s1, Scalar s2) + { + Scalar copy = s1.Copy(); + return copy /= s2; + } + + Scalar Modulo(Scalar s1, Scalar s2) + { + Scalar copy = s1.Copy(); + return copy %= s2; + } + + Scalar FusedMultiplyAdd(Scalar a, Scalar b, Scalar c) + { + return Fma(a, b, c); + } + +} // namespace value +} // namespace ell diff --git a/libraries/value/src/TensorOperations.cpp b/libraries/value/src/TensorOperations.cpp index 237dc4cb5..298049d3d 100644 --- a/libraries/value/src/TensorOperations.cpp +++ b/libraries/value/src/TensorOperations.cpp @@ -29,6 +29,11 @@ namespace value } void For(Tensor tensor, std::function fn) + { + For(std::string{}, tensor, fn); + } + + void For(const std::string& name, Tensor tensor, std::function fn) { auto layout = tensor.GetValue().GetLayout(); if (layout.NumDimensions() != 3) @@ -37,9 +42,12 @@ namespace value "Layout being looped over must be three-dimensional"); } - GetContext().For(layout, [fn = std::move(fn)](std::vector coordinates) { - fn(coordinates[0], coordinates[1], coordinates[2]); - }); + GetContext().For( + layout, + 
[fn = std::move(fn)](std::vector coordinates) { + fn(coordinates[0], coordinates[1], coordinates[2]); + }, + name); } } // namespace value diff --git a/libraries/value/src/Value.cpp b/libraries/value/src/Value.cpp index 4f1baa8f5..4b60ed4f5 100644 --- a/libraries/value/src/Value.cpp +++ b/libraries/value/src/Value.cpp @@ -35,6 +35,7 @@ namespace value other._data = temp; std::swap(_type, other._type); std::swap(_layout, other._layout); + std::swap(_hasName, other._hasName); } // clang-format off @@ -74,6 +75,7 @@ namespace value _layout = other._layout; _data = other._data; _type = other._type; + _hasName = other._hasName; } } else @@ -96,6 +98,7 @@ namespace value _layout = other._layout; _data = other._data; _type = other._type; + _hasName = other._hasName; } else { @@ -113,6 +116,7 @@ namespace value } _data = other._data; _type = other._type; + _hasName = other._hasName; } else { @@ -142,6 +146,7 @@ namespace value _data = std::move(other._data); _layout = std::move(other._layout); _type = std::move(other._type); + _hasName = std::move(other._hasName); } } else @@ -164,6 +169,7 @@ namespace value _data = std::move(other._data); _layout = std::move(other._layout); _type = std::move(other._type); + _hasName = std::move(other._hasName); } else { @@ -181,6 +187,7 @@ namespace value } _data = std::move(other._data); _type = other._type; + _hasName = std::move(other._hasName); } else { @@ -219,6 +226,7 @@ namespace value _type = { ValueType::Undefined, 0 }; _layout.reset(); _data = {}; + _hasName = false; } void Value::SetData(Value value, bool force) @@ -236,6 +244,10 @@ namespace value } _data = emittable; + if (!force) + { + _type = type; + } }, [this, force](auto&& arg) { if (!force && GetValueType>() != _type.first) diff --git a/libraries/value/src/ValueOperations.cpp b/libraries/value/src/ValueOperations.cpp index 6d90fb909..9bcbfd6ff 100644 --- a/libraries/value/src/ValueOperations.cpp +++ b/libraries/value/src/ValueOperations.cpp @@ -28,6 +28,11 @@ namespace value }); } + void For(Scalar start, Scalar stop, Scalar step, std::function fn) + { + GetContext().For(start, stop, step, fn); + } + Value Cast(Value value, ValueType type) { if (value.GetBaseType() == type) diff --git a/libraries/value/src/Vector.cpp b/libraries/value/src/Vector.cpp index 6c8bf108f..315e52ce2 100644 --- a/libraries/value/src/Vector.cpp +++ b/libraries/value/src/Vector.cpp @@ -286,5 +286,6 @@ namespace value value.SetLayout(value.GetLayout().Flatten()); return value; } + } // namespace value } // namespace ell diff --git a/libraries/value/src/VectorOperations.cpp b/libraries/value/src/VectorOperations.cpp index fbc495c8d..2a5b88703 100644 --- a/libraries/value/src/VectorOperations.cpp +++ b/libraries/value/src/VectorOperations.cpp @@ -92,7 +92,7 @@ namespace value result = InvokeForContext([&](LLVMContext& context) -> Scalar { if (context.GetModuleEmitter().GetCompilerOptions().useBlas) { - auto returnValue = fn.Decorated(FunctionDecorated::No) + auto returnValue = fn.Decorated(false) .Call( Scalar{ static_cast(v1.Size()) }, v1, @@ -108,7 +108,12 @@ namespace value } }); - return *result; + if (result) + { + return *result; + } + + return defaultImpl(v1, v2); } else if (v1.GetType() == ValueType::Double) { @@ -142,7 +147,7 @@ namespace value result = InvokeForContext([&](LLVMContext& context) -> Scalar { if (context.GetModuleEmitter().GetCompilerOptions().useBlas) { - auto returnValue = fn.Decorated(FunctionDecorated::No) + auto returnValue = fn.Decorated(false) .Call( Scalar{ static_cast(v1.Size()) }, v1, @@ 
-158,7 +163,12 @@ namespace value } }); - return *result; + if (result) + { + return *result; + } + + return defaultImpl(v1, v2); } else { @@ -167,6 +177,11 @@ namespace value } void For(Vector v, std::function fn) + { + For(std::string{}, v, fn); + } + + void For(const std::string& name, Vector v, std::function fn) { auto layout = v.GetValue().GetLayout(); @@ -176,7 +191,10 @@ namespace value "Layout being looped over must be one-dimensional"); } - GetContext().For(layout, [fn = std::move(fn)](std::vector coordinates) { fn(coordinates[0]); }); + GetContext().For( + layout, + [fn = std::move(fn)](std::vector coordinates) { fn(coordinates[0]); }, + name); } } // namespace value diff --git a/libraries/value/src/loopnests/CodeGenerator.cpp b/libraries/value/src/loopnests/CodeGenerator.cpp new file mode 100644 index 000000000..5a4a7caed --- /dev/null +++ b/libraries/value/src/loopnests/CodeGenerator.cpp @@ -0,0 +1,461 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: CodeGenerator.cpp (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "loopnests/CodeGenerator.h" +#include "loopnests/KernelPredicate.h" + +#include "LLVMContext.h" + +#include +#include +#include +#include + +namespace ell +{ +namespace value +{ + namespace loopnests + { + namespace + { + // computes ceil(a/b) + int CeilDiv(int a, int b) + { + return (a - 1) / b + 1; + } + } // namespace + + void CodeGenerator::Run(const LoopNest& loopNest) const + { + Visit(loopNest); + } + + void CodeGenerator::GenerateLoopRangeNew(const LoopRange& r, const RecursionStateNew& state, const LoopVisitSchedule& schedule, std::function codegenFn) const + { + const LoopNest& loopNest = schedule.GetLoopNest(); + auto loopIndex = schedule.CurrentLoopIndex(); + + bool isParallelized = loopNest.IsParallelized(loopIndex); + bool isUnrolled = loopNest.IsUnrolled(loopIndex); + assert(!(isParallelized && isUnrolled) && "An index cannot be both unrolled and parallelized"); + + const int startInt = r.start.Get(); + const int stopInt = r.stop.Get(); + const int stepInt = r.step.Get(); + + int numIterations = CeilDiv(stopInt - startInt, stepInt); + if (numIterations < 2) + { + isParallelized = false; + } + + if (!(isParallelized || isUnrolled)) + { + ForRange(UniqueName(loopNest.Name()), r.start, r.stop, r.step, codegenFn); + } + else if (isParallelized) + { + int numThreads = numIterations; + + std::vector kernelInputs; + for (const auto& g : state.kernelGroups) + { + if (g.first) + { + for (const auto& scheduledKernel : g.second.kernels) + { + auto& kernel = scheduledKernel.kernel; + const auto& inputs = kernel.GetArgs(); + kernelInputs.insert(kernelInputs.end(), inputs.begin(), inputs.end()); + } + } + } + + std::once_flag onceFlag; + GetContext().Parallelize(numThreads, kernelInputs, [&](Scalar index, std::vector captured) mutable { + assert(kernelInputs.size() == captured.size()); + + std::call_once(onceFlag, [&] { + for (unsigned i = 0; i < captured.size(); ++i) + { + // TODO: figure out what to do with the "where" parameter + // TODO: get rid of const_cast + const_cast(loopNest).RenameVariable(kernelInputs[i], captured[i], { loopIndex }); + } + }); + + codegenFn(index * stepInt); + }); + } + else if (isUnrolled) + { + for (int i = startInt; i < stopInt; i += stepInt) + { + codegenFn(i); + } + } + } + + void 
CodeGenerator::GenerateLoopRangeOld(const LoopRange& r, const RecursionState& state, const LoopVisitSchedule& schedule, std::function codegenFn) const + { + const LoopNest& loopNest = schedule.GetLoopNest(); + auto loopIndex = schedule.CurrentLoopIndex(); + + bool isParallelized = loopNest.IsParallelized(loopIndex); + bool isUnrolled = loopNest.IsUnrolled(loopIndex); + assert(!(isParallelized && isUnrolled) && "An index cannot be both unrolled and parallelized"); + + const int startInt = r.start.Get(); + const int stopInt = r.stop.Get(); + const int stepInt = r.step.Get(); + + int numIterations = CeilDiv(stopInt - startInt, stepInt); + if (numIterations < 2) + { + isParallelized = false; + } + + if (!(isParallelized || isUnrolled)) + { + ForRange(UniqueName(loopNest.Name()), r.start, r.stop, r.step, codegenFn); + } + else if (isParallelized) + { + int numThreads = numIterations; + + const auto& scheduledKernels = state.activeKernels; + std::vector kernelInputs; + for (const auto& scheduledKernel : scheduledKernels) + { + auto& kernel = scheduledKernel.kernel; + + const auto& inputs = kernel.GetArgs(); + kernelInputs.insert(kernelInputs.end(), inputs.begin(), inputs.end()); + } + + std::once_flag onceFlag; + GetContext().Parallelize(numThreads, kernelInputs, [&](Scalar index, std::vector captured) mutable { + assert(kernelInputs.size() == captured.size()); + + std::call_once(onceFlag, [&] { + for (unsigned i = 0; i < captured.size(); ++i) + { + // TODO: figure out what to do with the "where" parameter + // TODO: get rid of const_cast + const_cast(loopNest).RenameVariable(kernelInputs[i], captured[i], { loopIndex }); + } + }); + + codegenFn(index * stepInt); + }); + } + else if (isUnrolled) + { + for (int i = startInt; i < stopInt; i += stepInt) + { + codegenFn(i); + } + } + } + + Scalar CodeGenerator::EmitIndexExpression(const Index& index, const IndexExpression& expr, const LoopIndexSymbolTable& indexVariables) const + { + if (!expr.indices.empty()) + { + // We can't currently optimize away the "identity" expression, because the result (a loop's "index" Scalar) + // would be a register variable (pointer valence 0), and the generated kernel function expects a stored value + // (pointer valence 1). So, we need to call `Allocate()` to get a stored variable. + auto sum = Scalar(Allocate(utilities::ScalarLayout)); + sum = expr.begin; + for (auto scaledIndex : expr.indices) + { + if (auto it = indexVariables.find(scaledIndex.index); it != indexVariables.end()) + { + auto indexValue = it->second.value; + sum += indexValue * scaledIndex.scale; + } + } + + return sum; + } + return 0; + } + + Scalar CodeGenerator::EmitKernelPredicate(const KernelPredicate& predicate, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const + { + const auto& domain = schedule.GetLoopNest().GetDomain(); + auto predResult = MakeScalar("predResult"); + predResult = 1; // "true" + + auto emitPredicate = [&domain, &runtimeIndexVariables, &schedule](const auto& emitPredicate, const KernelPredicate& p, Scalar& result, bool defaultIsTrue) -> void { + if (p.IsAlwaysTrue()) + { + if (defaultIsTrue) + { + // nothing + } + else + { + result = Scalar(1); // "true" + } + } + else if (p.IsAlwaysFalse()) + { + if (defaultIsTrue) + { + result = Scalar(0); // "false" + } + else + { + // nothing + } + } + else if (auto simplePredicate = p.As(); simplePredicate != nullptr) + { + auto condition = simplePredicate->GetCondition(); + if (condition == Fragment::all) + { + return; // do nothing for 'all' predicates + } + + auto index = simplePredicate->GetIndex(); + const auto range = domain.GetDimensionRange(index); + + auto loopIndices = range.GetDependentLoopIndices(index); + if (loopIndices.empty()) + { + loopIndices = { index }; + } + for (auto loopIndex : loopIndices) + { + auto range = GetLoopRange(loopIndex, runtimeIndexVariables, schedule); + + int testVal = 0; + bool valid = true; + switch (condition) + { + case Fragment::first: + testVal = range.Begin(); + break; + case Fragment::last: + testVal = range.End() - (range.Size() % range.Increment()); + if (testVal == range.End()) // not a boundary + { + testVal = range.End() - range.Increment(); + } + break; + case Fragment::endBoundary: + testVal = range.End() - (range.Size() % range.Increment()); + if (testVal == range.End()) + { + valid = false; + } + break; + default: + // throw?
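+ // Unrecognized fragment condition: no test value can be derived, so mark it invalid and emit no check for this index.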
+ valid = false; + break; + } + + if (valid) + { + // if loop index not present, assume 0 + Scalar indexVal = MakeScalar("predIndexVal"); + if (runtimeIndexVariables.count(loopIndex) != 0) + { + indexVal = runtimeIndexVariables.at(loopIndex).value; + } + + if (defaultIsTrue) + { + If(indexVal != testVal, [&] { + result = Scalar(0); // "false" + }); + } + else + { + If(indexVal == testVal, [&] { + result = Scalar(1); // "true" + }); + } + } + } + } + else if (p.Is()) + { + throw utilities::LogicException(utilities::LogicExceptionErrors::notImplemented, "IsDefined predicate not implemented"); + } + else if (auto conjunction = p.As(); conjunction != nullptr) + { + auto conjResult = MakeScalar("conj"); + conjResult = Scalar(1); // "true" + for (const auto& t : conjunction->GetTerms()) + { + emitPredicate(emitPredicate, *t, conjResult, true); + } + + if (defaultIsTrue) + { + If(conjResult == 0, [&result] { + result = Scalar(0); // "false" + }); + } + else + { + If(conjResult != 0, [&result] { + result = Scalar(1); // "true" + }); + } + } + else if (auto disjunction = p.As(); disjunction != nullptr) + { + auto disjResult = MakeScalar("disj"); + disjResult = Scalar(0); // "false" + for (const auto& t : disjunction->GetTerms()) + { + emitPredicate(emitPredicate, *t, disjResult, false); + } + if (defaultIsTrue) + { + If(disjResult == 0, [&result] { + result = Scalar(0); // "false" + }); + } + else + { + If(disjResult != 0, [&result] { + result = Scalar(1); // "true" + }); + } + } + else + { + throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState, "Unknown predicate type"); + } + }; + + emitPredicate(emitPredicate, predicate, predResult, true); + return predResult; + } + + void CodeGenerator::InvokeKernel(const Kernel& kernel, const KernelPredicate& predicate, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const + { + if (predicate.IsAlwaysTrue()) + { + InvokeKernel(kernel, runtimeIndexVariables, schedule); + } + else + { + If(EmitKernelPredicate(predicate, runtimeIndexVariables, schedule) == 1, [&] { + InvokeKernel(kernel, runtimeIndexVariables, schedule); + }); + } + } + + void CodeGenerator::InvokeKernel(const Kernel& kernel, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const + { + const auto& kernelArgs = kernel.GetArgs(); + const auto& kernelIndices = kernel.GetIndices(); + + const auto& renameActions = schedule.GetLoopNest().GetRenameActions(); + + // Create argument list + std::vector kernelArgValues; + kernelArgValues.reserve(kernelArgs.size()); + + auto rename = [&](const Value& arg) { + for (const auto& action : renameActions) + { + const auto& excludedKernels = action.excludedKernels; + if (std::find(excludedKernels.begin(), excludedKernels.end(), kernel.GetId()) == excludedKernels.end() && + std::equal_to{}(arg, action.oldValue) && + AreAllFullyDefined(action.where, schedule)) + { + return action.newValue; + } + } + return arg; + }; + + for (const auto& arg : kernelArgs) + { + kernelArgValues.push_back(rename(arg)); + } + + std::vector kernelIndexValues; + kernelIndexValues.reserve(kernelIndices.size()); + for (auto index : kernelIndices) + { + kernelIndexValues.push_back(runtimeIndexVariables.at(index).value.GetValue()); + auto name = kernelIndexValues.back().GetName(); + kernelIndexValues.back().SetName(index.GetName()); + } + + kernel.Call(kernelArgValues, kernelIndexValues); + } + + bool CodeGenerator::InvokeKernelGroup(const ScheduledKernelGroup& kernelGroup, const 
LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const + { + // preprocess to get only valid kernels + auto validKernels = GetValidKernels(kernelGroup, runtimeIndexVariables, schedule); + + if (validKernels.empty()) + { + return false; + } + + std::optional ifContext; + for (const auto& kernel : validKernels) + { + auto predicate = schedule.GetKernelPredicate(kernel).Simplify(runtimeIndexVariables, schedule); + if (predicate.IsAlwaysFalse()) + { + throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState, "Always-false predicates should have been removed here"); + } + + if (predicate.IsAlwaysTrue()) + { + if (ifContext) + { + // We're already inside an 'if' cascade, so add final 'else' clause + ifContext.value().Else([&] { + InvokeKernel(kernel.kernel, runtimeIndexVariables, schedule); + }); + } + else + { + // If the first kernel's predicate is trivially 'true', just invoke the kernel and exit + InvokeKernel(kernel.kernel, runtimeIndexVariables, schedule); + } + break; + } + else + { + if (!ifContext) + { + auto predicateVal = EmitKernelPredicate(predicate, runtimeIndexVariables, schedule); + ifContext.emplace(If(predicateVal == 1, [&] { + InvokeKernel(kernel.kernel, runtimeIndexVariables, schedule); + })); + } + else + { + auto predicateVal = EmitKernelPredicate(predicate, runtimeIndexVariables, schedule); + ifContext.value().ElseIf(predicateVal == 1, [&] { + InvokeKernel(kernel.kernel, runtimeIndexVariables, schedule); + }); + } + } + } + + return true; + } + } // namespace loopnests +} // namespace value +} // namespace ell diff --git a/libraries/value/src/loopnests/CodePositionConstraints.cpp b/libraries/value/src/loopnests/CodePositionConstraints.cpp new file mode 100644 index 000000000..ba5c6dfdc --- /dev/null +++ b/libraries/value/src/loopnests/CodePositionConstraints.cpp @@ -0,0 +1,104 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: CodePositionConstraints.cpp (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "loopnests/CodePositionConstraints.h" + +#include + +#include + +namespace ell +{ +namespace value +{ + namespace loopnests + { + CodePositionConstraints::CodePositionConstraints(LoopFragmentType placement, std::vector requiredIndices, std::vector boundaryIndices) : + _placement(placement), + _requiredIndices(requiredIndices), + _boundaryIndices(boundaryIndices) + { + } + + std::vector CodePositionConstraints::GetRequiredIndices() const + { + return _requiredIndices; + } + + std::vector CodePositionConstraints::GetBoundaryIndices() const + { + return _boundaryIndices; + } + + std::ostream& operator<<(std::ostream& os, LoopFragmentType t) + { + switch (t) + { + case LoopFragmentType::prologue: + os << "prologue"; + break; + case LoopFragmentType::body: + os << "body"; + break; + case LoopFragmentType::boundary: + os << "boundary"; + break; + case LoopFragmentType::epilogue: + os << "epilogue"; + break; + case LoopFragmentType::LAST: + os << "LAST"; + break; + default: + throw std::runtime_error("Unknown enum value"); + } + return os; + } + + std::ostream& operator<<(std::ostream& os, LoopFragmentFlags f) + { + std::string sep = ""; + os << "["; + for (int i = 0; i < static_cast(LoopFragmentType::LAST); ++i) + { + if (f.GetFlag(static_cast(i))) + { + os << sep << LoopFragmentType(i); + sep = " | "; + } 
+ } + os << "]"; + return os; + } + + bool operator==(const CodePositionConstraints& i1, const CodePositionConstraints& i2) + { + return (i1.GetPlacement() == i2.GetPlacement()) && (i1.GetRequiredIndices() == i2.GetRequiredIndices()) && (i1.GetBoundaryIndices() == i2.GetBoundaryIndices()); + } + + bool operator!=(const CodePositionConstraints& i1, const CodePositionConstraints& i2) + { + return !(i1 == i2); + } + } // namespace loopnests +} // namespace value +} // namespace ell + +using namespace ell::value::loopnests; + +std::hash::result_type std::hash::operator()(const argument_type& constraints) const +{ + using ::ell::utilities::HashCombine; + + size_t hash = 0; + HashCombine(hash, constraints.GetPlacement()); + HashCombine(hash, constraints.GetRequiredIndices()); + HashCombine(hash, constraints.GetBoundaryIndices()); + + return hash; +} diff --git a/libraries/value/src/loopnests/ForAll.cpp b/libraries/value/src/loopnests/ForAll.cpp new file mode 100644 index 000000000..a991cfd00 --- /dev/null +++ b/libraries/value/src/loopnests/ForAll.cpp @@ -0,0 +1,49 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: ForAll.cpp (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "loopnests/ForAll.h" + +namespace ell +{ +namespace value::loopnests +{ + ForAll::ForAll(IterationDomain domain) : + _loops(domain) + { + } + + ForAll& ForAll::AddKernel(const Kernel& kernel) + { + _loops.AddKernel(kernel, LoopNest::ConstraintType::constraint); + return *this; + } + + ForAll& ForAll::AddKernel(const Kernel& kernel, const CodePositionConstraints& where) + { + _loops.AddKernel(kernel, where); + return *this; + } + + ForAll& ForAll::Split(const Index& dimension, int size) + { + _loops.Split(dimension, size); + return *this; + } + + ForAll& ForAll::SetLoopOrder(const std::vector& order) + { + _loops.SetLoopOrder(order); + return *this; + } + + const LoopNest& ForAll::GetNest() const + { + return _loops; + } +} // namespace value::loopnests +} // namespace ell \ No newline at end of file diff --git a/libraries/value/src/loopnests/Index.cpp b/libraries/value/src/loopnests/Index.cpp new file mode 100644 index 000000000..dd230d856 --- /dev/null +++ b/libraries/value/src/loopnests/Index.cpp @@ -0,0 +1,55 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: Index.cpp (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "loopnests/Index.h" + +namespace ell +{ +namespace value +{ + namespace loopnests + { + Index::Index(const std::string& name) : + _name(name), + _id(Index::GetNextId()) + { + } + + const std::string& Index::GetName() const + { + return _name; + } + + Index::Id Index::GetId() const + { + return _id; + } + + // TODO: Change this so that IDs are the responsibility of the EmitterContext + Index::Id Index::GetNextId() + { + static Id _nextIndex = 0; + return _nextIndex++; + } + + std::ostream& operator<<(std::ostream& os, const Index& index) + { + os << index.GetName(); + return os; + } + + } // namespace loopnests +} // namespace value +} // namespace ell + +using namespace ell::value::loopnests; + +std::hash::result_type std::hash::operator()(const argument_type& 
element) const +{ + return static_cast(std::hash()(element.GetId())); +} diff --git a/libraries/value/src/loopnests/IndexRange.cpp b/libraries/value/src/loopnests/IndexRange.cpp new file mode 100644 index 000000000..55cfa7a3b --- /dev/null +++ b/libraries/value/src/loopnests/IndexRange.cpp @@ -0,0 +1,65 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: IndexRange.cpp (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "loopnests/IndexRange.h" + +namespace ell +{ +namespace value +{ + namespace loopnests + { + IndexRange::IndexRange(const Index& index, const Range& range) : + _index(index), + _range(range) + { + } + + IndexRange::IndexRange(const std::string& name, const Range& range) : + _index({ name }), + _range(range) + { + } + + const Index& IndexRange::GetIndex() const + { + return _index; + } + + const std::string& IndexRange::GetName() const + { + return _index.GetName(); + } + + int IndexRange::Begin() const + { + return _range.Begin(); + } + + int IndexRange::End() const + { + return _range.End(); + } + + int IndexRange::Size() const + { + return _range.Size(); + } + + int IndexRange::Increment() const + { + return _range.Increment(); + } + + Range IndexRange::GetRange() const + { + return _range; + } + } // namespace loopnests +} // namespace value +} // namespace ell diff --git a/libraries/value/src/loopnests/IterationDomain.cpp b/libraries/value/src/loopnests/IterationDomain.cpp new file mode 100644 index 000000000..981e21aca --- /dev/null +++ b/libraries/value/src/loopnests/IterationDomain.cpp @@ -0,0 +1,60 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: IterationDomain.cpp (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "loopnests/IterationDomain.h" + +namespace ell +{ +namespace value +{ + namespace loopnests + { + + IterationDomain::IterationDomain(const std::vector& dimensions) : + _dimensions(dimensions) + { + for (int d = 0; d < NumDimensions(); ++d) + { + _indexToDimensionMap[_dimensions[d].GetIndex().GetId()] = d; + } + //Assert(IsUnique(Transform(dimensions, [](auto x) { return x.GetIndex().GetName(); })), "Dimensions must have unique indices"); + } + + IterationDomain::IterationDomain(const std::initializer_list& dimensions) : + IterationDomain(std::vector{ dimensions.begin(), dimensions.end() }) + {} + + int IterationDomain::NumDimensions() const + { + return static_cast(_dimensions.size()); + } + + IndexRange IterationDomain::GetDimensionRange(int dimension) const + { + return _dimensions[dimension]; + } + + IndexRange IterationDomain::GetDimensionRange(const Index& index) const + { + return _dimensions[GetDimensionRangeFromIndex(index)]; + } + + const std::vector& IterationDomain::GetRanges() const + { + return _dimensions; + } + + int IterationDomain::GetDimensionRangeFromIndex(const Index& index) const + { + auto it = _indexToDimensionMap.find(index.GetId()); + return it->second; + } + + } // namespace loopnests +} // namespace value +} // namespace ell diff --git a/libraries/value/src/loopnests/Kernel.cpp b/libraries/value/src/loopnests/Kernel.cpp new file mode 100644 index 000000000..649f27cb2 --- /dev/null +++ 
b/libraries/value/src/loopnests/Kernel.cpp @@ -0,0 +1,148 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: Kernel.cpp (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "loopnests/Kernel.h" + +namespace ell +{ +namespace value +{ + namespace loopnests + { + Kernel::Kernel(std::string name) : + _id(name), + _kernelName(name) {} + + Kernel::Kernel(std::string name, Id id) : + _id(id.empty() ? name : id), + _kernelName(name) {} + + Kernel& Kernel::Inputs(const std::vector& inputs) + { + _inputs = inputs; + return *this; + } + + Kernel& Kernel::Indices(std::vector indices) + { + _indices = indices; + return *this; + } + + const std::string& Kernel::GetName() const + { + return _kernelName; + } + + const Kernel::Id& Kernel::GetId() const + { + return _id; + } + + const std::vector& Kernel::GetArgs() const + { + return _inputs; + } + + const std::vector& Kernel::GetIndices() const + { + return _indices; + } + + void Kernel::Call(std::vector inputs, std::vector indices) const + { + assert(_kernel); + _kernel(inputs, indices); + } + + // TODO : make this a template specialization of Define(), currently lambdas and std::functions aren't + // getting matched correctly + Kernel& Kernel::DefineEx(std::function, std::vector)>&& fn) + { + _kernel = [numOriginalIndices = _indices.size(), + originalInputs = _inputs, + kernelName = UniqueName(_kernelName + "KernelFn"), + fnDefinition = std::move(fn)](std::vector arguments, std::vector indices) { + using namespace utilities; + + if (arguments.size() != originalInputs.size()) + { + throw InputException(InputExceptionErrors::invalidArgument, "Number of arguments does not match number of expected inputs"); + } + if (indices.size() != numOriginalIndices) + { + throw InputException(InputExceptionErrors::invalidArgument, "Number of indices does not match number of expected indices"); + } + + // Flatten the vectors of parameters into a single vector in order to define the emitted function + std::vector fnInputs(arguments.begin(), arguments.end()); + fnInputs.insert(fnInputs.end(), indices.begin(), indices.end()); + + std::vector fnParameters(originalInputs.begin(), originalInputs.end()); + fnParameters.insert(fnParameters.end(), indices.begin(), indices.end()); + for (auto i = 0u; i < originalInputs.size(); ++i) + { + Value& param = fnParameters[i]; + const Value& input = fnInputs[i]; + + if (!input.IsConstrained()) + { + param.ClearLayout(); + } + else + { + param.SetLayout(input.GetLayout()); + } + } + + auto fn = DeclareFunction(kernelName).Parameters(fnParameters); + fn.Inlined(FunctionInlining::always); + if (!fn.IsDefined()) + { + // Add a function layer to coalesce the vectors of parameters inside the function call + fn.Define([originalArgCount = arguments.size(), + originalIndexCount = indices.size(), + innerFn = std::move(fnDefinition)](std::vector args) { + if (args.size() != originalArgCount + originalIndexCount) + { + throw InputException(InputExceptionErrors::invalidArgument, "Number of arguments + indices does not match number of expected inputs"); + } + + std::vector inputs; + inputs.reserve(originalArgCount); + for (unsigned idx = 0; idx < originalArgCount; ++idx) + { + inputs.push_back(args[idx]); + } + + std::vector parameters; + parameters.reserve(originalIndexCount); + for (unsigned idx = originalArgCount; idx < 
(originalArgCount + originalIndexCount); ++idx) + { + parameters.push_back(args[idx]); + } + + innerFn(std::move(inputs), std::move(parameters)); + }); + } + + fn.Call(fnInputs); + }; + + return *this; + } + } // namespace loopnests +} // namespace value +} // namespace ell + +using namespace ell::value::loopnests; + +std::hash::result_type std::hash::operator()(const argument_type& kernel) const +{ + return static_cast(std::hash()(kernel.GetId())); +} diff --git a/libraries/value/src/loopnests/KernelPredicate.cpp b/libraries/value/src/loopnests/KernelPredicate.cpp new file mode 100644 index 000000000..ba3ac4db2 --- /dev/null +++ b/libraries/value/src/loopnests/KernelPredicate.cpp @@ -0,0 +1,703 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: KernelPredicate.cpp (value) +// Authors: Chuck Jacobs +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "loopnests/KernelPredicate.h" +#include "loopnests/LoopNest.h" +#include "loopnests/LoopNestVisitor.h" + +#include + +namespace ell +{ +namespace value +{ + namespace loopnests + { + namespace + { + // computes ceil(a/b) + int CeilDiv(int a, int b) + { + return (a - 1) / b + 1; + } + + bool Intersects(const Range& a, const Range& b) + { + int aIter = CeilDiv(a.End() - a.Begin(), a.Increment()); + int bIter = CeilDiv(b.End() - b.Begin(), b.Increment()); + + if (aIter == 0 || bIter == 0) + { + return false; + } + auto aLast = a.Begin() + (aIter - 1) * a.Increment(); + auto bLast = b.Begin() + (bIter - 1) * b.Increment(); + + return aLast >= b.Begin() && a.Begin() <= bLast; + } + } // namespace + + // + // ConstantPredicate + // + ConstantPredicate::ConstantPredicate(bool value) : + _value(value) + { + } + + bool ConstantPredicate::GetValue() const + { + return _value; + } + + // + // FragmentTypePredicate + // + FragmentTypePredicate::FragmentTypePredicate(const Index& index, Fragment condition) : + _index(index), + _condition(condition) + { + } + + // bool FragmentTypePredicate::IsSatisfied(const std::vector& indices) const; // Perhaps pass in current loop state? + const Index& FragmentTypePredicate::GetIndex() const + { + return _index; + } + + Fragment FragmentTypePredicate::GetCondition() const + { + return _condition; + } + + KernelPredicate FragmentTypePredicate::Simplify() const + { + if (_condition == Fragment::all) + { + return ConstantPredicate(true); + } + return *this; + } + + KernelPredicate FragmentTypePredicate::Simplify(const LoopIndexSymbolTable& indices, const LoopVisitSchedule& schedule) const + { + if (_condition == Fragment::all) + { + return ConstantPredicate(true); + } + + const auto index = GetIndex(); + const auto condition = GetCondition(); + + // Get all index variables dependent on the predicate index + const auto& domain = schedule.GetLoopNest().GetDomain(); + auto loopIndices = domain.GetDependentLoopIndices(index, true); + + // Evaluate a little equality "sub-predicate" for each dependent variable. All of them must be true for the result to be true. 
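+ // e.g. if the predicate's index has been split, First(index) holds only when every derived loop index (outer and inner) sits at the very start of its range.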
+ for (auto loopIndex : loopIndices) + { + // TODO: move `GetLoopRange` somewhere else + auto fullRange = LoopNestVisitor::GetLoopRange(loopIndex, indices, schedule); + + int testVal = 0; + bool valid = true; + switch (condition) + { + case Fragment::first: + testVal = fullRange.Begin(); + break; + case Fragment::last: + testVal = fullRange.End() - (fullRange.Size() % fullRange.Increment()); + if (testVal == fullRange.End()) // not a boundary + { + testVal = fullRange.End() - fullRange.Increment(); + } + break; + case Fragment::endBoundary: + testVal = fullRange.End() - (fullRange.Size() % fullRange.Increment()); + if (testVal == fullRange.End()) // not a boundary + { + valid = false; + } + break; + default: + valid = false; + // throw? + break; + } + + if (valid) + { + // Look up the range of the active loop + auto activeRange = fullRange; + if (const auto it = indices.find(loopIndex); it != indices.end()) + { + if (it->second.state == LoopIndexState::inProgress) + { + activeRange = it->second.loopRange; + } + } + + // Now check if testVal intersects with the loop's range + if (activeRange.Increment() == 0) // bad range + { + return *this; + } + int numIterations = CeilDiv(activeRange.End() - activeRange.Begin(), activeRange.Increment()); + if (numIterations == 0) + { + return *this; + } + + if (Intersects(activeRange, { testVal, testVal + 1 })) + { + if (numIterations == 1) + { + // true -- don't add anything to AND list + } + else + { + return *this; + // TODO: add index, testVal to AND list, later return a conjunction of equality predicates + } + } + else + { + return ConstantPredicate(false); + } + } + } + return ConstantPredicate(true); + } + + // + // PlacementPredicate + // + PlacementPredicate::PlacementPredicate(Placement placement) : + _index(std::nullopt), + _placement(placement) + { + } + + PlacementPredicate::PlacementPredicate(const Index& index, Placement placement) : + _index(index), + _placement(placement) + { + } + + // bool PlacementPredicate::IsSatisfied(const std::vector& indices) const; // Perhaps pass in current loop state?
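+ // Note: the index is optional (see the placement-only constructor above); callers should check HasIndex() before GetIndex(), which unwraps the optional.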
+ bool PlacementPredicate::HasIndex() const + { + return _index.has_value(); + } + + Index PlacementPredicate::GetIndex() const + { + return _index.value(); + } + + Placement PlacementPredicate::GetPlacement() const + { + return _placement; + } + + const PlacementPredicate& PlacementPredicate::Simplify() const + { + return *this; + } + + const PlacementPredicate& PlacementPredicate::Simplify(const LoopIndexSymbolTable& indices, const LoopVisitSchedule& schedule) const + { + return *this; + } + + // + // IndexDefinedPredicate + // + IndexDefinedPredicate::IndexDefinedPredicate(const Index& index) : + _index(index) + { + } + + const Index& IndexDefinedPredicate::GetIndex() const + { + return _index; + } + + const IndexDefinedPredicate& IndexDefinedPredicate::Simplify() const + { + return *this; + } + + const IndexDefinedPredicate& IndexDefinedPredicate::Simplify(const LoopIndexSymbolTable& indices, const LoopVisitSchedule& schedule) const + { + return *this; + } + + // + // KernelPredicateConjunction + // + KernelPredicateConjunction::KernelPredicateConjunction(const KernelPredicate& lhs, const KernelPredicate& rhs) + { + _terms.push_back(std::make_unique(lhs)); + _terms.push_back(std::make_unique(rhs)); + } + + KernelPredicateConjunction::KernelPredicateConjunction(const KernelPredicateConjunction& other) : + KernelPredicateConjunction(other._terms) + { + } + + KernelPredicateConjunction::KernelPredicateConjunction(const std::vector>& terms) + { + for (const auto& t : terms) + { + _terms.emplace_back(std::make_unique(*t)); + } + } + + KernelPredicateConjunction& KernelPredicateConjunction::operator=(const KernelPredicateConjunction& other) + { + if (this != &other) + { + _terms.clear(); // replace, rather than append to, any existing terms + for (const auto& t : other._terms) + { + _terms.emplace_back(std::make_unique(*t)); + } + } + return *this; + } + + const std::vector>& KernelPredicateConjunction::GetTerms() const + { + return _terms; + } + + KernelPredicate KernelPredicateConjunction::Simplify() const + { + std::vector> terms; + for (const auto& t : GetTerms()) + { + auto simplifiedTerm = t->Simplify(); + if (simplifiedTerm.IsAlwaysFalse()) + { + return ConstantPredicate(false); + } + if (!simplifiedTerm.IsAlwaysTrue()) + { + terms.emplace_back(std::make_unique(simplifiedTerm)); + } + } + + if (terms.empty()) + { + return KernelPredicate{}; + } + else if (terms.size() == 1) + { + return *terms[0]; + } + else + { + return KernelPredicateConjunction(terms); + } + } + + KernelPredicate KernelPredicateConjunction::Simplify(const LoopIndexSymbolTable& indices, const LoopVisitSchedule& schedule) const + { + if (GetTerms().empty()) + { + return EmptyPredicate(); + } + + std::vector> terms; + for (const auto& t : GetTerms()) + { + auto simplifiedTerm = t->Simplify(indices, schedule); + if (simplifiedTerm.IsAlwaysFalse()) + { + return ConstantPredicate(false); + } + else if (!simplifiedTerm.IsAlwaysTrue()) + { + terms.emplace_back(std::make_unique(simplifiedTerm)); + } + // If always true, do nothing + } + + if (terms.empty()) + { + return ConstantPredicate(true); + } + else if (terms.size() == 1) + { + return *terms[0]; + } + else + { + return KernelPredicateConjunction(terms); + } + } + + // + // KernelPredicateDisjunction + // + KernelPredicateDisjunction::KernelPredicateDisjunction(const KernelPredicate& lhs, const KernelPredicate& rhs) + { + _terms.push_back(std::make_unique(lhs)); + _terms.push_back(std::make_unique(rhs)); + } + + KernelPredicateDisjunction::KernelPredicateDisjunction(const KernelPredicateDisjunction& other) : + KernelPredicateDisjunction(other._terms) + { + } + + 
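// Like the conjunction above, the disjunction deep-copies each term into its own unique_ptr. + 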
KernelPredicateDisjunction::KernelPredicateDisjunction(const std::vector>& terms) + { + for (const auto& t : terms) + { + _terms.emplace_back(std::make_unique(*t)); + } + } + + KernelPredicateDisjunction& KernelPredicateDisjunction::operator=(const KernelPredicateDisjunction& other) + { + if (this != &other) + { + _terms.clear(); // replace, rather than append to, any existing terms + for (const auto& t : other._terms) + { + _terms.emplace_back(std::make_unique(*t)); + } + } + return *this; + } + + const std::vector>& KernelPredicateDisjunction::GetTerms() const + { + return _terms; + } + + KernelPredicate KernelPredicateDisjunction::Simplify() const + { + if (GetTerms().empty()) + { + return EmptyPredicate(); + } + + std::vector> terms; + for (const auto& t : GetTerms()) + { + auto simplifiedTerm = t->Simplify(); + if (simplifiedTerm.IsAlwaysTrue()) + { + return { ConstantPredicate(true) }; + } + else if (!simplifiedTerm.IsAlwaysFalse()) + { + terms.emplace_back(std::make_unique(simplifiedTerm)); + } + // If always false, do nothing + } + if (terms.empty()) + { + return ConstantPredicate(false); + } + else if (terms.size() == 1) + { + return *terms[0]; + } + else + { + return KernelPredicateDisjunction(terms); + } + } + + KernelPredicate KernelPredicateDisjunction::Simplify(const LoopIndexSymbolTable& indices, const LoopVisitSchedule& schedule) const + { + std::vector> terms; + for (const auto& t : GetTerms()) + { + auto simplifiedTerm = t->Simplify(indices, schedule); + if (simplifiedTerm.IsAlwaysTrue()) + { + return { ConstantPredicate(true) }; + } + else if (!simplifiedTerm.IsAlwaysFalse()) + { + terms.emplace_back(std::make_unique(simplifiedTerm)); + } + } + if (terms.empty()) + { + return ConstantPredicate(false); // every term simplified to always-false, so the whole disjunction is false + } + else if (terms.size() == 1) + { + return *terms[0]; + } + else + { + return KernelPredicateDisjunction(terms); + } + } + + // + // KernelPredicate + // + KernelPredicate::KernelPredicate(const EmptyPredicate& predicate) : + _expr(predicate) {} + + KernelPredicate::KernelPredicate(const ConstantPredicate& predicate) : + _expr(predicate) {} + + KernelPredicate::KernelPredicate(const FragmentTypePredicate& predicate) : + _expr(predicate) {} + + KernelPredicate::KernelPredicate(const PlacementPredicate& predicate) : + _expr(predicate) {} + + KernelPredicate::KernelPredicate(const IndexDefinedPredicate& predicate) : + _expr(predicate) {} + + KernelPredicate::KernelPredicate(const KernelPredicateConjunction& predicate) : + _expr(predicate) {} + + KernelPredicate::KernelPredicate(const KernelPredicateDisjunction& predicate) : + _expr(predicate) {} + + KernelPredicate KernelPredicate::Simplify() const + { + return std::visit( + [](auto&& pred) -> KernelPredicate { + return pred.Simplify(); + }, + _expr); + } + + KernelPredicate KernelPredicate::Simplify(const LoopIndexSymbolTable& indices, const LoopVisitSchedule& schedule) const + { + return std::visit( + [&indices, &schedule](auto&& pred) -> KernelPredicate { + return pred.Simplify(indices, schedule); + }, + _expr); + } + + bool KernelPredicate::IsAlwaysTrue() const + { + if (IsEmpty()) + { + return true; + } + auto simplePredicate = Simplify(); + if (auto constPred = simplePredicate.As(); constPred != nullptr) + { + return constPred->GetValue() == true; + } + return false; + } + + bool KernelPredicate::IsAlwaysFalse() const + { + if (IsEmpty()) + { + return false; + } + auto simplePredicate = Simplify(); + if (auto constPred = simplePredicate.As(); constPred != nullptr) + { + return constPred->GetValue() == false; + } + return false; + } + + bool KernelPredicate::IsEmpty() const + { + return std::holds_alternative(_expr); + } + + 
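// Note: an empty predicate behaves as always-true (see IsAlwaysTrue above) while remaining distinguishable from an explicit ConstantPredicate. + 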
+        //
+        // free functions
+        //
+        KernelPredicate First(const Index& index)
+        {
+            return FragmentTypePredicate{ index, Fragment::first };
+        }
+
+        KernelPredicate Last(const Index& index)
+        {
+            return FragmentTypePredicate{ index, Fragment::last };
+        }
+
+        KernelPredicate EndBoundary(const Index& index)
+        {
+            return FragmentTypePredicate{ index, Fragment::endBoundary };
+        }
+
+        KernelPredicate All(const Index& index)
+        {
+            return FragmentTypePredicate{ index, Fragment::all };
+        }
+
+        KernelPredicate Before(const Index& index)
+        {
+            return PlacementPredicate{ index, Placement::before };
+        }
+
+        KernelPredicate After(const Index& index)
+        {
+            return PlacementPredicate{ index, Placement::after };
+        }
+
+        KernelPredicate IsDefined(const Index& index)
+        {
+            return IndexDefinedPredicate{ index };
+        }
+
+        KernelPredicate operator&&(const KernelPredicate& lhs, const KernelPredicate& rhs)
+        {
+            return KernelPredicateConjunction{ lhs, rhs };
+        }
+
+        KernelPredicate operator||(const KernelPredicate& lhs, const KernelPredicate& rhs)
+        {
+            return KernelPredicateDisjunction{ lhs, rhs };
+        }
+
+        std::string ToString(Fragment cond)
+        {
+            switch (cond)
+            {
+            case Fragment::all:
+                return "all";
+            case Fragment::first:
+                return "first";
+            case Fragment::last:
+                return "last";
+            case Fragment::endBoundary:
+                return "endBoundary";
+            default:
+                return "<>";
+            }
+        }
+
+        std::string ToString(Placement where)
+        {
+            switch (where)
+            {
+            case Placement::before:
+                return "before";
+            case Placement::after:
+                return "after";
+            default:
+                return "<>";
+            }
+        }
+
+        std::ostream& operator<<(std::ostream& os, const EmptyPredicate& predicate)
+        {
+            os << "{}";
+            return os;
+        }
+
+        std::ostream& operator<<(std::ostream& os, const ConstantPredicate& predicate)
+        {
+            os << (predicate.GetValue() ? "true" : "false");
+            return os;
+        }
+
+        std::ostream& operator<<(std::ostream& os, const FragmentTypePredicate& predicate)
+        {
+            os << ToString(predicate.GetCondition()) << "(" << predicate.GetIndex() << ")";
+            return os;
+        }
+
+        std::ostream& operator<<(std::ostream& os, const PlacementPredicate& predicate)
+        {
+            if (predicate.HasIndex())
+            {
+                os << ToString(predicate.GetPlacement()) << "(" << predicate.GetIndex() << ")";
+            }
+            else
+            {
+                os << ToString(predicate.GetPlacement()) << "()";
+            }
+            return os;
+        }
+
+        std::ostream& operator<<(std::ostream& os, const IndexDefinedPredicate& predicate)
+        {
+            os << "IsDefined(" << predicate.GetIndex() << ")";
+            return os;
+        }
+
+        std::ostream& operator<<(std::ostream& os, const KernelPredicateConjunction& predicate)
+        {
+            const auto& terms = predicate.GetTerms();
+            if (terms.size() == 0)
+            {
+                os << "true";
+            }
+            else if (terms.size() == 1)
+            {
+                os << *terms[0];
+            }
+            else
+            {
+                os << "(";
+                bool first = true;
+                for (const auto& t : terms)
+                {
+                    // emit the separator before every term but the first
+                    if (!first)
+                    {
+                        os << " && ";
+                    }
+                    first = false;
+                    os << *t;
+                }
+                os << ")";
+            }
+            return os;
+        }
+
+        std::ostream& operator<<(std::ostream& os, const KernelPredicateDisjunction& predicate)
+        {
+            const auto& terms = predicate.GetTerms();
+            if (terms.size() == 0)
+            {
+                os << "true";
+            }
+            else if (terms.size() == 1)
+            {
+                os << *terms[0];
+            }
+            else
+            {
+                os << "(";
+                bool first = true;
+                for (const auto& t : terms)
+                {
+                    // emit the separator before every term but the first
+                    if (!first)
+                    {
+                        os << " || ";
+                    }
+                    first = false;
+                    os << *t;
+                }
+                os << ")";
+            }
+            return os;
+        }
+
+        std::ostream& operator<<(std::ostream& os, const KernelPredicate& predicate)
+        {
+            std::visit(
+                [&os](auto&& pred) {
+                    os << pred;
+                },
+                predicate._expr);
+            return os;
+        }
+
+    } // namespace loopnests
+} // namespace value
+} // namespace ell
diff --git a/libraries/value/src/loopnests/LoopNest.cpp b/libraries/value/src/loopnests/LoopNest.cpp
new file mode 100644
index 000000000..67a412df0
--- /dev/null
+++ b/libraries/value/src/loopnests/LoopNest.cpp
@@ -0,0 +1,900 @@
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Project:  Embedded Learning Library (ELL)
+//  File:     LoopNest.cpp (value)
+//  Authors:  Chuck Jacobs, Kern Handa
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "loopnests/LoopNest.h"
+#include "loopnests/LoopNestPrinter.h"
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace ell
+{
+namespace value
+{
+    namespace loopnests
+    {
+        using logging::EOL;
+        using logging::Log;
+
+        //
+        // LoopVisitSchedule
+        //
+
+        LoopVisitSchedule::LoopVisitSchedule(const LoopNest& nest, LoopVisitSchedule::StateQueue state) :
+            LoopVisitSchedule(nest, 0, state)
+        {}
+
+        LoopVisitSchedule::LoopVisitSchedule(const LoopNest& nest, int level, StateQueue state) :
+            _level(level),
+            _state(std::move(state)),
+            _nest(nest)
+        {}
+
+        LoopVisitSchedule::LoopVisitSchedule(const LoopVisitSchedule& other) :
+            _level(other._level),
+            _state(other._state),
+            _nest(other._nest)
+        {}
+
+        LoopVisitSchedule& LoopVisitSchedule::operator=(const LoopVisitSchedule& other)
+        {
+            _level = other._level;
+            _state = other._state;
+            _nest = other._nest;
+            return *this;
+        }
+
+        const LoopVisitSchedule::LoopInfo& LoopVisitSchedule::Front() const
+        {
+            return _state[_level];
+        }
+
+        const SplitIterationDomain& LoopVisitSchedule::GetDomain() const
+        {
+            return GetLoopNest().GetDomain();
+        }
+
+        int LoopVisitSchedule::CurrentNestLevel() const
{ + return _level; + }; + + bool LoopVisitSchedule::IsDone() const + { + return _level == static_cast(_state.size()); + } + + bool LoopVisitSchedule::IsInnermostLoop() const + { + return _level == static_cast(_state.size()) - 1; + } + + Index LoopVisitSchedule::CurrentDimension() const + { + return Front().dimension; + } + + Range LoopVisitSchedule::LoopRange() const + { + // ### debugging + assert(Front().indexRange.GetRange() == GetLoopNest().GetDomain().GetIndexRange(CurrentLoopIndex())); + return Front().indexRange.GetRange(); + } + + int LoopVisitSchedule::LoopSize() const + { + return LoopRange().Size(); + } + + int LoopVisitSchedule::DimensionSize() const + { + return GetDomain().GetDimensionSize(Front().dimension); + } + + int LoopVisitSchedule::NonBoundaryEnd() const + { + auto numFullLoopIterations = LoopSize() / LoopIncrement(); + auto nonBoundaryLoopSize = LoopIncrement() * numFullLoopIterations; + return nonBoundaryLoopSize + LoopRange().Begin(); + } + + int LoopVisitSchedule::LoopIncrement() const + { + return LoopRange().Increment(); + } + + int LoopVisitSchedule::LoopIndexScale() const + { + return Front().scale; + } + + bool LoopVisitSchedule::CurrentLoopHasFragment(std::vector activeKernels, LoopFragmentType fragmentType) const + { + auto currentIndex = CurrentLoopIndex(); + for (const auto& kernel : GetLoopNest().GetKernels()) + { + const auto& where = kernel.constraints; + if (where.GetPlacement() == fragmentType) + { + // Boundary constraints: return `true` if this loop causes all the boundary indices to be defined + // (which is to say, they're all fully-defined here but not in previous loop) + const auto& outsideIndices = where.GetBoundaryIndices(); + if (outsideIndices.size() != 0) + { + bool allFullyDefined = std::all_of(outsideIndices.begin(), outsideIndices.end(), [&](auto index) { + return IsFullyDefined(index); + }); + bool definedByThisLoop = std::any_of(outsideIndices.begin(), outsideIndices.end(), [&](auto index) { + return IsFullyDefinedByThisLoop(index); + }); + if (allFullyDefined && definedByThisLoop) + { + return true; + } + } + } + } + return false; + } + + bool LoopVisitSchedule::FragmentCanRunAlone(std::vector activeKernels, LoopFragmentType fragmentType) const + { + return true; + } + + bool LoopVisitSchedule::FutureLoopHasFragmentForThisIndex(std::vector activeKernels, LoopFragmentType fragmentType) const + { + auto currentIndex = CurrentLoopIndex(); + for (const auto& kernel : GetLoopNest().GetKernels()) + { + const auto& where = kernel.constraints; + if (where.GetPlacement() == fragmentType) + { + // Boundary constraints: return `true` if this loop causes all the boundary indices to be defined + // (which is to say, they're all fully-defined here but not in previous loop) + const auto& outsideIndices = where.GetBoundaryIndices(); + bool allFullyDefined = std::all_of(outsideIndices.begin(), outsideIndices.end(), [&](auto index) { + return IsFullyDefined(index); + }); + bool thisIndexWasUsed = std::any_of(outsideIndices.begin(), outsideIndices.end(), [&](auto index) { + auto domain = GetLoopNest().GetDomain(); + if (index == currentIndex || domain.DependsOn(index, currentIndex)) + { + return true; + } + return false; + }); + + if (!allFullyDefined && thisIndexWasUsed) + { + return true; + } + } + } + return false; + } + + int LoopVisitSchedule::CurrentIndexEndBoundarySize() const + { + return Front().boundarySize; + } + + Index LoopVisitSchedule::CurrentLoopIndex() const + { + return Front().indexRange.GetIndex(); + } + + LoopVisitSchedule 
LoopVisitSchedule::Next() const + { + if (IsDone()) + { + throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState, "Error: calling Next() at end of schedule"); + } + return { _nest.get(), _level + 1, _state }; + } + + LoopVisitSchedule LoopVisitSchedule::Prev() const + { + if (_level == 0) + { + throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState, "Error: calling Prev() on first loop level"); + } + return { _nest.get(), _level - 1, _state }; + } + + bool LoopVisitSchedule::WillVisitIndex(const Index& index) const + { + auto dependentIndices = GetDomain().GetDependentIndices(index); + + // Check if a future loop has a loop index that the query index depends on + for (auto it = _state.begin() + CurrentNestLevel(); it != _state.end(); ++it) + { + auto i = it->indexRange.GetIndex(); + if (std::find(dependentIndices.begin(), dependentIndices.end(), i) != dependentIndices.end()) + { + return true; + } + } + return false; + } + + bool LoopVisitSchedule::IsFullyDefined(const Index& index) const + { + if (index == CurrentLoopIndex()) + { + return true; + } + + for (const auto& i : GetDomain().GetDependentIndices(index)) + { + if (GetDomain().IsLoopIndex(i)) + { + if (!WasIterationVariableDefined(i)) + { + return false; + } + } + } + return true; + } + + bool LoopVisitSchedule::IsFullyDefinedByThisLoop(const Index& index) const + { + // return true if: + // 1) the given index is this loop's index variable + // 1) the given index is synthetic and one of its terms is this loop's index variable, and the rest + // of the terms have already been defined + if (index == CurrentLoopIndex()) + { + return true; + } + + // look to see if this index has been defined + if (IsFullyDefined(index)) + { + if (CurrentNestLevel() == 0) + { + return true; + } + return !Prev().IsFullyDefined(index); + } + return false; + } + + bool LoopVisitSchedule::WasIterationVariableDefined(const Index& index) const + { + for (auto it = _state.begin(); it != _state.begin() + CurrentNestLevel() + 1; ++it) + { + auto iterVar = it->indexRange.GetIndex(); + if (iterVar == index) + { + return true; + } + } + return false; + } + + KernelPredicate LoopVisitSchedule::GetKernelPredicate(const ScheduledKernel& kernel) const + { + // Convert constraints to predicate + + const auto& domain = GetDomain(); + + // Get list of conditions in existing predicate + std::set predicateConditions; + std::set predicateIndices; + std::set constrainedIndices; + + kernel.predicate.Visit([&](const auto& p) { + if (auto fragmentPred = p.template As(); fragmentPred != nullptr) + { + auto predicateIndex = fragmentPred->GetIndex(); + auto fragment = fragmentPred->GetCondition(); + + // Convert computed indices to loop indices + for (auto loopIndex : domain.GetDependentLoopIndices(predicateIndex, true)) + { + predicateConditions.insert(FragmentTypePredicate(loopIndex, fragment)); + predicateIndices.insert(loopIndex); + } + } + else if (p.template Is()) + { + throw utilities::LogicException(utilities::LogicExceptionErrors::notImplemented, "IsDefined predicate not implemented"); + } + }); + + // BEGIN CONVERT CONSTRAINTS + // Convert CodePositionConstraints + const bool convertConstraints = !kernel.newVersion; + if (convertConstraints) + { + for (auto constraintIndex : kernel.constraints.GetRequiredIndices()) + { + // Convert computed indices to loop indices + for (auto loopIndex : domain.GetDependentLoopIndices(constraintIndex, true)) + { + if (predicateIndices.count(loopIndex) != 0) + { + throw 
utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "Constraint applied to index " + loopIndex.GetName() + ", which already has a predicate"); + } + + predicateConditions.insert({ loopIndex, Fragment::all }); + predicateIndices.insert(loopIndex); + constrainedIndices.insert(loopIndex); + } + } + + // Convert the kernel's predicate + constraints into equivalent predicate + // All "body" constraints turn into "all" conditions + // All "prologue" constraints turn into "first" conditions + // All "epilogue" constraints turn into "last" conditions + + // Issue: empty "boundary indices" means everything not mentioned + + const auto placement = kernel.constraints.GetPlacement(); + auto constraintCondition = (placement == LoopFragmentType::prologue) || (placement == LoopFragmentType::body) ? Fragment::first : Fragment::last; + auto boundaryIndices = kernel.constraints.GetBoundaryIndices(); + if (boundaryIndices.empty()) + { + // add all unmentioned loop indices --- all indices not dependent on any of the already-"all"'d indices + for (auto loopIndex : GetLoopNest().GetLoopSequence()) + { + if (constrainedIndices.count(loopIndex) == 0 && predicateIndices.count(loopIndex) == 0) + { + boundaryIndices.push_back(loopIndex); + } + } + } + for (auto boundaryIndex : boundaryIndices) + { + // Convert any boundary indices into concrete loop indices + for (auto loopIndex : domain.GetDependentLoopIndices(boundaryIndex, true)) + { + constrainedIndices.insert(loopIndex); + predicateIndices.insert(loopIndex); + predicateConditions.insert({ loopIndex, constraintCondition }); + } + } + + // All unmentioned loop indices become "first" conditions + for (auto loopIndex : GetLoopNest().GetLoopSequence()) + { + if (constrainedIndices.count(loopIndex) == 0 && predicateIndices.count(loopIndex) == 0) + { + predicateConditions.insert({ loopIndex, Fragment::first }); + } + } + } + // END CONVERT CONSTRAINTS + + // new predicate == conjunction of all conditions in conditions set + if (predicateConditions.size() == 0) + { + return {}; // ? 
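+                // (Editorial note: an empty condition set means no constraint
+                // survived conversion, so the kernel runs unconditionally and
+                // the empty predicate is returned. A worked example of the
+                // rules above: a kernel with placement `prologue`, required
+                // index `i`, and no explicit boundary indices, scheduled in a
+                // nest over (i, j), converts to `all(i) && first(j)`: required
+                // indices become "all" conditions and every unmentioned loop
+                // index picks up the placement's "first" condition.)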
+ } + + auto begin = predicateConditions.begin(); + auto first = KernelPredicate(*begin); + ++begin; + auto fullPredicate = std::accumulate(begin, predicateConditions.end(), first, [](auto lhs, auto rhs) -> KernelPredicate { return { KernelPredicateConjunction(lhs, rhs) }; }); + auto result = fullPredicate.Simplify(); + return result; + } + + // + // LoopNest + // + LoopNest::LoopNest(IterationDomain domain) : + _domain(domain) + { + InitLoopSequence(); + } + + void LoopNest::InitLoopSequence() + { + // For each dimension, get a queue of loop indices + int numDimensions = _domain.NumDimensions(); + std::vector> dimensionIndices(numDimensions); + for (int d = 0; d < numDimensions; ++d) + { + const auto indices = _domain.GetLoopIndicesForDimension(_domain.GetBaseIndex(d)); + dimensionIndices[d] = std::queue({ indices.begin(), indices.end() }); + } + + for (;;) + { + bool done = true; + + // for each index + for (int d = 0; d < numDimensions; ++d) + { + if (!dimensionIndices[d].empty()) + { + _loopSequence.push_back(dimensionIndices[d].front()); + dimensionIndices[d].pop(); + done = false; + } + } + if (done) break; + } + } + + void LoopNest::ConvertKernelConstraints() + { + for (auto& k : _kernels) + { + ConvertKernelConstraints(k); + } + } + + void LoopNest::ConvertKernelConstraints(ScheduledKernel& kernel) + { + // TODO: convert first/last into inequality check (<=, >=), so they can work with boundaries + } + + void LoopNest::AddKernel(const Kernel& kernel, ConstraintType type) + { + if (type == ConstraintType::constraint) + { + AddKernel(kernel, LoopFragmentType::body); + } + else + { + CodePositionConstraints constraints{ LoopFragmentType::body, {}, {} }; // null constraints + _kernels.push_back({ true, kernel, constraints, {}, {} }); + } + } + + void LoopNest::AddKernel(const Kernel& kernel, LoopFragmentType where) + { + CodePositionConstraints constraints{ where, kernel.GetIndices(), {} }; + AddKernel(kernel, constraints); + } + + void LoopNest::AddKernel(const Kernel& kernel, const CodePositionConstraints& where) + { + // old version + _kernels.push_back({ false, kernel, where, {}, {} }); + } + + void LoopNest::AddKernel(const Kernel& kernel, const KernelPredicate& predicate) + { + AddKernel(kernel, predicate, {}); + } + + void LoopNest::AddKernel(const Kernel& kernel, const KernelPredicate& predicate, const KernelPredicate& placement) + { + // new version + CodePositionConstraints constraints{ LoopFragmentType::body, {}, {} }; // null constraints + _kernels.push_back({ true, kernel, constraints, predicate, placement }); + } + + void LoopNest::AddKernel(const Kernel& kernel, const CodePositionConstraints& where, const KernelPredicate& predicate, const KernelPredicate& placement) + { + // new version + _kernels.push_back({ true, kernel, where, predicate, {} }); + } + + const std::vector& LoopNest::GetKernels() const + { + return _kernels; + } + + std::vector LoopNest::GetKernelGroups() const + { + std::vector result; + for (const auto& kernel : _kernels) + { + auto it = std::find_if(result.begin(), result.end(), [&](const ScheduledKernelGroup& g) { + return g.id == kernel.kernel.GetId(); + }); + if (it == result.end()) + { + result.push_back({ kernel.kernel.GetId(), { kernel } }); + } + else + { + it->kernels.push_back(kernel); + } + } + return result; + } + + int LoopNest::NumDimensions() const + { + return static_cast(_domain.NumDimensions()); + } + + Range LoopNest::GetIndexRange(Index index) const + { + return _domain.GetIndexRange(index); + } + + std::vector 
LoopNest::GetLoopIndexRanges() const + { + std::vector result; + for (int d = 0; d < NumDimensions(); ++d) + { + const auto& dimRange = GetDimensionRange(d); + for (const auto& index : dimRange.GetLoopIndices()) + { + result.emplace_back(index, dimRange.GetIndexRange(index)); + } + } + return result; + } + + const SplitIndexRange& LoopNest::GetDimensionRange(int dimension) const + { + return _domain.GetDimensionRange(dimension); + } + + const SplitIndexRange& LoopNest::GetDimensionRange(const Index& dimension) const + { + return _domain.GetDimensionRange(dimension); + } + + int LoopNest::NumSplits(const Index& dimension) const + { + return GetDimensionRange(dimension).NumSplits(); + } + + const std::vector& LoopNest::GetLoopSequence() const + { + return _loopSequence; + } + + LoopVisitSchedule LoopNest::GetLoopSchedule() const + { + LoopVisitSchedule::StateQueue queue; + + std::map> availableLoopIndices; + int numDimensions = _domain.NumDimensions(); + for (int i = 0; i < numDimensions; ++i) + { + auto dimension = _domain.GetBaseIndex(i); + auto loopIndices = _domain.GetLoopIndicesForDimension(dimension); + availableLoopIndices[dimension] = { loopIndices.begin(), loopIndices.end() }; + } + + for (auto loopIndex : GetLoopSequence()) + { + auto range = _domain.GetIndexRange(loopIndex); + auto dimensionSize = _domain.GetDimensionSize(loopIndex); + int splitSize = range.Increment(); // need to keep track of the split size here, I think + int boundarySize = dimensionSize % splitSize; + + auto loopIndexScale = GetLoopIndexScale(loopIndex); + auto dimensionIndex = _domain.GetBaseIndex(loopIndex); + queue.push_back(LoopVisitSchedule::LoopInfo{ dimensionIndex, IndexRange{ loopIndex, range }, boundarySize, loopIndexScale }); + } + + return { *this, queue }; + } + + SplitIndex LoopNest::Split(Index index, int size) + { + auto result = _domain.Split(index, size); + + // Need to recompute loopSequence here (by replacing the index that got split with result.outer) + auto parent = _domain.GetParentIndex(result.outer); // this is the specific index that was split + auto it = std::find(_loopSequence.begin(), _loopSequence.end(), parent); + if (it == _loopSequence.end()) + { + throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState); + } + else + { + *it = result.outer; + } + + _loopSequence.push_back(result.inner); + return result; + } + + void LoopNest::Parallelize(Index index) + { + _parallelizedIndices.push_back(index); + } + + // TODO: Move this out to the API surface + SplitIndex LoopNest::Parallelize(Index index, int factor) + { + auto result = Split(index, factor); + Parallelize(result.outer); + return result; + } + + void LoopNest::Unroll(Index index) + { + _unrolledIndices.push_back(index); + } + + // TODO: Move this out to the API surface + SplitIndex LoopNest::Unroll(Index index, int factor) + { + auto result = Split(index, factor); + Unroll(result.outer); + return result; + } + + void LoopNest::SetLoopOrder(const std::vector& order) + { + if (order.size() != _loopSequence.size()) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "SetLoopOrder() --- new order wrong length"); + } + + std::map> availableLoopIndices; + int numDimensions = _domain.NumDimensions(); + for (int i = 0; i < numDimensions; ++i) + { + auto dimension = _domain.GetBaseIndex(i); + auto loopIndices = _domain.GetLoopIndicesForDimension(dimension); + availableLoopIndices[dimension] = { loopIndices.begin(), loopIndices.end() }; + } + + // Function to get the next 
available concrete loop index that's a child index of a given index. + // Throws if there isn't one available. + auto getNextAvailable = [this, &availableLoopIndices](const Index& specifiedIndex) { + auto dimensionIndex = _domain.GetBaseIndex(specifiedIndex); + auto possibleIndices = _domain.GetDependentLoopIndices(specifiedIndex); + if (_domain.IsLoopIndex(specifiedIndex)) + { + possibleIndices.push_back(specifiedIndex); + } + for (auto i : possibleIndices) + { + if (availableLoopIndices[dimensionIndex].count(i) != 0) + { + availableLoopIndices[dimensionIndex].erase(i); + return i; + } + } + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "SetLoopOrder() --- new order wrong length"); + }; + + std::vector newLoopSequence; + for (auto specifiedIndex : order) + { + auto loopIndex = getNextAvailable(specifiedIndex); + newLoopSequence.push_back(loopIndex); + } + + _loopSequence = newLoopSequence; + } + + void LoopNest::RenameVariable(ViewAdapter oldVariable, ViewAdapter newVariable, const std::vector& where, const std::vector& excludedKernels) + { + std::vector kernelIds; + std::transform( + excludedKernels.begin(), + excludedKernels.end(), + std::back_inserter(kernelIds), + [](const Kernel& kernel) { return kernel.GetId(); }); + + _renameActions.push_back({ oldVariable, newVariable, where, kernelIds }); + } + + int LoopNest::GetLoopIndexScale(const Index& index) const + { + // TODO: later we may normalize the loops, in which case indexScale here will be the loop increment + return 1; + } + + Index LoopNest::GetLoopIndex(const Index& dimension, int level) const + { + const auto& dim = GetDimensionRange(dimension); + return dim.GetSplitIndex(level); + } + + bool LoopNest::IsUsed(const Index& index, const std::vector& activeKernels) const + { + for (auto k : activeKernels) + { + for (auto kernelIndex : k.kernel.GetIndices()) + { + if (kernelIndex == index || GetDomain().DependsOn(kernelIndex, index)) + { + return true; + } + } + } + + return false; + } + + bool LoopNest::IsParallelized(const Index& index) const + { + return std::find(_parallelizedIndices.begin(), _parallelizedIndices.end(), index) != _parallelizedIndices.end(); + } + + bool LoopNest::IsUnrolled(const Index& index) const + { + return std::find(_unrolledIndices.begin(), _unrolledIndices.end(), index) != _unrolledIndices.end(); + } + + const std::vector& LoopNest::GetRenameActions() const + { + return _renameActions; + } + + const SplitIterationDomain& LoopNest::GetDomain() const + { + return _domain; + } + + Index LoopNest::GetBaseIndex(const Index& index) const { return _domain.GetBaseIndex(index); } + + bool LoopNest::IsLoopIndex(const Index& index) const + { + return _domain.IsLoopIndex(index); + } + + bool LoopNest::IsComputedIndex(const Index& index) const + { + return _domain.IsComputedIndex(index); + } + + IndexExpression LoopNest::GetIndexExpression(const Index& index) const + { + auto loopIndices = _domain.GetDependentLoopIndices(index); + + std::vector result; + for (auto loopIndex : loopIndices) + { + auto indexScale = GetLoopIndexScale(index); + result.push_back({ indexScale, loopIndex }); + } + + auto begin = _domain.GetDimensionBegin(index); + return { result, begin }; + } + + void LoopNest::DebugDump(std::string tag, std::ostream* stream) const + { + auto& targetStream = stream != nullptr ? 
*stream : std::cerr; + + GetDomain().Print(targetStream); + + targetStream << "Loop order: "; + for (auto i : GetLoopSequence()) + { + targetStream << i << " "; + } + targetStream << std::endl; + + LoopNestPrinter printer(targetStream); + + printer.Print(*this); + + if (!tag.empty()) + { + targetStream << "[tag = " << tag << "]"; + } + targetStream << '\n'; + } + + void DebugDump(const LoopNest& nest, std::string tag, std::ostream* stream) + { + nest.DebugDump(tag, stream); + } + + bool operator==(const ScheduledKernel& i1, const ScheduledKernel& i2) + { + return (i1.kernel == i2.kernel) && (i1.constraints == i2.constraints); + } + + bool operator!=(const ScheduledKernel& i1, const ScheduledKernel& i2) + { + return !(i1 == i2); + } + + LoopNest Fuse(const LoopNest& nest1, const LoopNest& nest2) + { + return Fuse(nest1, nest2, {}, {}); + } + + LoopNest Fuse(const LoopNest& nest1, const LoopNest& nest2, const std::vector& dependentIndexVec1, const std::vector& dependentIndexVec2) + { + // Collect all the indices for nest1 and nest2 + std::map nestIndices; + + auto makeSet = [&](const auto& container) { + std::set> result(container.begin(), container.end()); + return result; }; + + std::set dependentIndices1 = makeSet(dependentIndexVec1); + std::set dependentIndices2 = makeSet(dependentIndexVec2); + std::set nest1Indices = makeSet(nest1.GetDomain().GetAllLoopIndices()); + std::set nest2Indices = makeSet(nest2.GetDomain().GetAllLoopIndices()); + + // Collect vector of all IndexRanges, and indices in only one nest + // add indices in nest2 but not nest1 as "first" predicates for the nest1 kernels + // add indices in nest1 but not nest2 as "last" predicates for the nest2 kernels + auto domain1 = nest1.GetDomain(); + std::vector indexRanges; + for (const auto& index : nest1Indices) + { + auto range = domain1.GetIndexRange(index); + if (nest2Indices.count(index) == 0) + { + dependentIndices2.insert(index); + } + indexRanges.emplace_back(index, range); + } + + auto domain2 = nest2.GetDomain(); + for (const auto& index : nest2Indices) + { + auto range = domain2.GetIndexRange(index); + if (nest1Indices.count(index) != 0) + { + auto range1 = domain1.GetIndexRange(index); + if (range != range1) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "Fusing loops with incompatible ranges for index " + index.GetName()); + } + } + else + { + dependentIndices1.insert(index); + indexRanges.emplace_back(index, range); + } + } + + // Make new "selector" index + // Index s("sel"); + // indexRanges.emplace_back(s, Range{ 0, 2 }); + + // Create new loop nest + LoopNest result = { indexRanges }; + + // Add kernels for nest1 with "first" selector + for (const auto& kernel : nest1.GetKernels()) + { + auto fullPredicate = std::accumulate(dependentIndices1.begin(), dependentIndices1.end(), kernel.predicate, [](auto lhs, auto rhs) -> KernelPredicate { return lhs && First(rhs); }); + // result.AddKernel(kernel.kernel, kernel.constraints, fullPredicate && First(s)); // if using selector + result.AddKernel(kernel.kernel, kernel.constraints, fullPredicate, kernel.placement); + } + + // Add kernels for nest2 with "last" selector, and using the dependent indices passed in + for (const auto& kernel : nest2.GetKernels()) + { + auto fullPredicate = std::accumulate(dependentIndices2.begin(), dependentIndices2.end(), kernel.predicate, [](auto lhs, auto rhs) -> KernelPredicate { return lhs && Last(rhs); }); + // result.AddKernel(kernel.kernel, kernel.constraints, fullPredicate && Last(s)); + 
result.AddKernel(kernel.kernel, kernel.constraints, fullPredicate, kernel.placement); + } + + return result; + } + } // namespace loopnests +} // namespace value +} // namespace ell + +using namespace ell::value::loopnests; + +std::hash::result_type std::hash::operator()(const argument_type& kernel) const +{ + using ::ell::utilities::HashCombine; + + size_t hash = 0; + HashCombine(hash, kernel.kernel); + HashCombine(hash, kernel.constraints); + + return hash; +} diff --git a/libraries/value/src/loopnests/LoopNestPrinter.cpp b/libraries/value/src/loopnests/LoopNestPrinter.cpp new file mode 100644 index 000000000..8a87ccb45 --- /dev/null +++ b/libraries/value/src/loopnests/LoopNestPrinter.cpp @@ -0,0 +1,484 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: LoopNestPrinter.cpp (value) +// Authors: Chuck Jacobs +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "loopnests/LoopNestPrinter.h" + +#include +#include + +#include +#include + +namespace ell +{ +namespace value +{ + namespace loopnests + { + namespace + { + // computes ceil(a/b) + int CeilDiv(int a, int b) + { + return (a - 1) / b + 1; + } + } // namespace + + LoopNestPrinter::LoopNestPrinter(std::ostream& stream) : + _stream(stream), + _indentLevel(0) + { + } + + void LoopNestPrinter::Print(const LoopNest& loopNest) const + { + Visit(loopNest); + } + + void LoopNestPrinter::GenerateLoopRangeNew(const LoopRange& r, const RecursionStateNew& state, const LoopVisitSchedule& schedule, std::function codegenFn) const + { + const LoopNest& loopNest = schedule.GetLoopNest(); + auto loopIndex = schedule.CurrentLoopIndex(); + + bool isParallelized = loopNest.IsParallelized(loopIndex); + bool isUnrolled = loopNest.IsUnrolled(loopIndex); + assert(!(isParallelized && isUnrolled) && "An index cannot be both unrolled and parallelized"); + + const int startInt = r.start.Get(); + const int stopInt = r.stop.Get(); + const int stepInt = r.step.Get(); + + int numIterations = CeilDiv(stopInt - startInt, stepInt); + if (numIterations < 2) + { + isParallelized = false; + } + + std::vector properties; + if (isParallelized) + { + properties.push_back("parallel"); + } + if (isUnrolled) + { + properties.push_back("unrolled"); + } + if (numIterations == 1) + { + properties.push_back("single"); + } + + std::string propertiesStr; + if (!properties.empty()) + { + propertiesStr = ": (" + utilities::Join(properties, ", ") + ")"; + } + + WriteLine("For (" + GetIndexString(loopIndex, state.loopIndices) + " = " + std::to_string(startInt) + " to " + std::to_string(stopInt) + " by " + std::to_string(stepInt) + ")" + propertiesStr); + WriteLine("{"); + { + Indenter i(*this); + codegenFn(startInt); + } + WriteLine("}"); + } + + void LoopNestPrinter::GenerateLoopRangeOld(const LoopRange& r, const RecursionState& state, const LoopVisitSchedule& schedule, std::function codegenFn) const + { + const LoopNest& loopNest = schedule.GetLoopNest(); + auto loopIndex = schedule.CurrentLoopIndex(); + + bool isParallelized = loopNest.IsParallelized(loopIndex); + bool isUnrolled = loopNest.IsUnrolled(loopIndex); + assert(!(isParallelized && isUnrolled) && "An index cannot be both unrolled and parallelized"); + + const int startInt = r.start.Get(); + const int stopInt = r.stop.Get(); + const int stepInt = r.step.Get(); + + std::vector properties; + if (isParallelized) + { + properties.push_back("parallel"); + } 
+ if (isUnrolled) + { + properties.push_back("unrolled"); + } + + auto currentLoopHasPrologue = r.currentLoopFragmentFlags.GetFlag(LoopFragmentType::prologue); + auto currentLoopHasEpilogue = r.currentLoopFragmentFlags.GetFlag(LoopFragmentType::epilogue); + + if (currentLoopHasPrologue) + { + properties.push_back("prologue_kernel"); + } + + if (currentLoopHasEpilogue) + { + properties.push_back("epilogue_kernel"); + } + + std::string propertiesStr; + if (!properties.empty()) + { + propertiesStr = ": (" + utilities::Join(properties, ", ") + ")"; + } + + WriteLine("For (" + GetIndexString(loopIndex, state.loopIndices) + " = " + std::to_string(startInt) + " to " + std::to_string(stopInt) + " by " + std::to_string(stepInt) + ")" + propertiesStr); + WriteLine("{"); + { + Indenter i(*this); + codegenFn(startInt); + } + WriteLine("}"); + } + + std::string LoopNestPrinter::GetIndent() const + { + constexpr int indentSize = 4; + return std::string(static_cast(indentSize * _indentLevel), ' '); + } + + void LoopNestPrinter::WriteLine(std::string l) const + { + _stream << GetIndent() << l << "\n"; + } + + Scalar LoopNestPrinter::EmitIndexExpression(const Index& index, const IndexExpression& expr, const LoopIndexSymbolTable& indexVariables) const + { + if (!expr.indices.empty()) + { + // We can't currently optimize away the "identity" expression, because the result (a loops "index" Scalar) + // would be a register variable (pointer valence 0), and the generated kernel function expects a stored value + // (pointer valence 1). So, we need to call `Allocate()` to get a stored variable. + std::vector terms; + for (auto scaledIndex : expr.indices) + { + if (auto it = indexVariables.find(scaledIndex.index); it != indexVariables.end()) + { + auto name = GetIndexString(scaledIndex.index, indexVariables); + + if (scaledIndex.scale == 1) + { + terms.push_back(name); + } + else + { + terms.push_back(std::to_string(scaledIndex.scale) + "*" + name); + } + } + } + terms.push_back(std::to_string(expr.begin)); + + WriteLine("int " + GetIndexString(index, indexVariables) + " = " + utilities::Join(terms, " + ") + ";"); + } + return 0; // Silly but necessary according to the `EmitIndexExpression` API + } + + std::string LoopNestPrinter::GetIndexString(const Index& index, const LoopIndexSymbolTable& runtimeIndexVariables) const + { + auto name = index.GetName(); + if (auto it = runtimeIndexVariables.find(index); it != runtimeIndexVariables.end()) + { + auto range = it->second.loopRange; + if (range.Increment() > 0) + { + int numIterations = CeilDiv(range.End() - range.Begin(), range.Increment()); + if (numIterations == 1) + { + name = "[" + name + "=" + std::to_string(range.Begin()) + "]"; + } + } + } + return name; + } + + std::string LoopNestPrinter::GetPredicateString(const KernelPredicate& predicate, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const + { + if (predicate.IsAlwaysTrue()) + { + return "true"; + } + else if (predicate.IsAlwaysFalse()) + { + return "false"; + } + else if (auto fragmentPred = predicate.As(); fragmentPred != nullptr) + { + auto condition = fragmentPred->GetCondition(); + if (condition == Fragment::all) + { + return "true"; + } + + auto index = fragmentPred->GetIndex(); + const auto& domain = schedule.GetLoopNest().GetDomain(); + const auto range = domain.GetDimensionRange(index); + + auto loopIndices = range.GetDependentLoopIndices(index); + if (loopIndices.empty()) + { + loopIndices = { index }; + } + bool first = true; + std::string result = ""; + 
for (auto loopIndex : loopIndices) + { + auto range = GetLoopRange(loopIndex, runtimeIndexVariables, schedule); + + int testVal = 0; + bool valid = true; + switch (condition) + { + case Fragment::first: + testVal = range.Begin(); + break; + case Fragment::last: + testVal = range.End() - (range.Size() % range.Increment()); + if (testVal == range.End()) // not a boundary + { + testVal = range.End() - range.Increment(); + } + break; + case Fragment::endBoundary: + testVal = range.End() - (range.Size() % range.Increment()); + if (testVal == range.End()) // not a boundary + { + valid = false; + } + break; + default: + valid = false; + // throw? + break; + } + + if (valid) + { + if (first) + { + result = "("; + } + else + { + result += " && "; + } + first = false; + result += "(" + GetIndexString(loopIndex, runtimeIndexVariables) + " == " + std::to_string(testVal) + ")"; + } + } + return result.empty() ? "" : result + ")"; + } + else if (predicate.Is()) + { + throw utilities::LogicException(utilities::LogicExceptionErrors::notImplemented, "IsDefined predicate not implemented"); + } + else if (auto conjunction = predicate.As(); conjunction != nullptr) + { + const auto& terms = conjunction->GetTerms(); + if (terms.size() == 0) + { + return "true"; + } + else if (terms.size() == 1) + { + return GetPredicateString(*terms[0], runtimeIndexVariables, schedule); + } + else + { + std::string result = "("; + bool first = true; + for (const auto& t : terms) + { + if (!first) + { + result += " && "; + } + first = false; + result += GetPredicateString(*t, runtimeIndexVariables, schedule); + } + result += ")"; + return result; + } + } + else if (auto disjunction = predicate.As(); disjunction != nullptr) + { + const auto& terms = disjunction->GetTerms(); + if (terms.size() == 0) + { + return "true"; + } + else if (terms.size() == 1) + { + return GetPredicateString(*terms[0], runtimeIndexVariables, schedule); + } + else + { + std::string result = "("; + bool first = true; + for (const auto& t : terms) + { + result += GetPredicateString(*t, runtimeIndexVariables, schedule); + if (!first) + { + result += " || "; + } + first = false; + } + result += ")"; + return result; + } + } + else + { + throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState, "Unknown predicate type"); + } + } + + void LoopNestPrinter::EmitIf(const KernelPredicate& predicate, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const + { + WriteLine("If (" + GetPredicateString(predicate, runtimeIndexVariables, schedule) + ")"); + WriteLine("{"); + ++_indentLevel; + } + void LoopNestPrinter::EmitElseIf(const KernelPredicate& predicate, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const + { + EmitEndIf(); + WriteLine("ElseIf (" + GetPredicateString(predicate, runtimeIndexVariables, schedule) + ")"); + WriteLine("{"); + ++_indentLevel; + } + void LoopNestPrinter::EmitElse() const + { + EmitEndIf(); + WriteLine("Else"); + WriteLine("{"); + ++_indentLevel; + } + void LoopNestPrinter::EmitEndIf() const + { + --_indentLevel; + WriteLine("}"); + } + + void LoopNestPrinter::InvokeKernel(const Kernel& kernel, const KernelPredicate& predicate, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const + { + if (!predicate.IsEmpty()) + { + EmitIf(predicate, runtimeIndexVariables, schedule); + } + + InvokeKernel(kernel, runtimeIndexVariables, schedule); + + if (!predicate.IsEmpty()) + { + EmitEndIf(); + } + } + + void 
LoopNestPrinter::InvokeKernel(const Kernel& kernel, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const + { + const auto& renameActions = schedule.GetLoopNest().GetRenameActions(); + + auto rename = [&, this](const Value& arg) { + for (const auto& action : renameActions) + { + const auto& excludedKernels = action.excludedKernels; + if (std::find(excludedKernels.begin(), excludedKernels.end(), kernel.GetId()) == excludedKernels.end() && + std::equal_to{}(arg, action.oldValue) && + AreAllFullyDefined(action.where, schedule)) + { + auto newValue = action.newValue; + WriteLine("Using " + newValue.GetName() + " in place of " + arg.GetName()); + return newValue; + } + } + return arg; + }; + + std::vector args; + for (auto v : kernel.GetArgs()) + { + auto newV = rename(v); + if (auto name = newV.GetName(); name.empty()) + { + args.push_back(""); + } + else + { + args.push_back(name); + } + } + + for (const auto i : kernel.GetIndices()) + { + args.push_back(GetIndexString(i, runtimeIndexVariables)); + } + + WriteLine(kernel.GetName() + "(" + utilities::Join(args, ", ") + ");"); + } + + bool LoopNestPrinter::InvokeKernelGroup(const ScheduledKernelGroup& kernelGroup, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const + { + // preprocess to get only valid kernels + auto validKernels = GetValidKernels(kernelGroup, runtimeIndexVariables, schedule); + if (validKernels.empty()) + { + return false; + } + + bool first = true; + for (const auto& kernel : validKernels) + { + auto predicate = schedule.GetKernelPredicate(kernel).Simplify(runtimeIndexVariables, schedule); + if (predicate.IsAlwaysFalse()) + { + throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState, "Always-false predicates should have been removed here"); + } + + if (predicate.IsAlwaysTrue()) + { + if (!first) + { + EmitElse(); + } + } + else + { + if (first) + { + EmitIf(predicate, runtimeIndexVariables, schedule); + } + else + { + EmitElseIf(predicate, runtimeIndexVariables, schedule); + } + } + + InvokeKernel(kernel.kernel, runtimeIndexVariables, schedule); + if (predicate.IsAlwaysTrue()) + { + // Stop evaluating, we're done + break; + } + + first = false; + } + + if (!first) + { + EmitEndIf(); + } + + return true; + } + + } // namespace loopnests +} // namespace value +} // namespace ell diff --git a/libraries/value/src/loopnests/LoopNestVisitor.cpp b/libraries/value/src/loopnests/LoopNestVisitor.cpp new file mode 100644 index 000000000..46708b358 --- /dev/null +++ b/libraries/value/src/loopnests/LoopNestVisitor.cpp @@ -0,0 +1,1057 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: LoopNestVisitor.cpp (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "LLVMContext.h" + +#include "loopnests/KernelPredicate.h" +#include "loopnests/LoopNestPrinter.h" +#include "loopnests/LoopNestVisitor.h" + +#include + +#include +#include +#include +#include +#include + +namespace ell +{ +namespace value +{ + namespace loopnests + { + namespace + { + // computes ceil(a/b) + int CeilDiv(int a, int b) + { + return (a - 1) / b + 1; + } + + // check for a "placement" predicate without an index + bool IsBodyPlacementPredicate(const KernelPredicate& predicate) + { + if (auto placementPred = predicate.As(); placementPred != nullptr) + { + 
return !placementPred->HasIndex(); + } + return false; + } + } // namespace + + // + // LoopNestVisitor::RecursionState + // + LoopNestVisitor::RecursionState::RecursionState(const LoopNest& loopNest) : + currentFragment(LoopFragmentFlags::All()), + activeKernels(loopNest.GetKernels()) + { + } + + // + // LoopNestVisitor::RecursionStateNew + // + LoopNestVisitor::RecursionStateNew::RecursionStateNew(const LoopNest& loopNest) + { + kernelGroups.reserve(loopNest.GetKernelGroups().size()); + for (const auto& g : loopNest.GetKernelGroups()) + { + kernelGroups.emplace_back(true, g); + } + } + + // + // LoopNestVisitor + // + void LoopNestVisitor::Visit(const LoopNest& loopNest) const + { + auto schedule = loopNest.GetLoopSchedule(); + + if (UseNewVersion(loopNest)) + { + // 0) convert old-style constraints into new predicate model + // - have a "GetPredicate" function that appends constraint conditions to scheduled kernel's predicate + // 1) generate simple structure representing perfectly-nested loops with predicates on kernels + // - should replace old `LoopNest::GetLoopSchedule()` + // 2) unswitch conditions by splitting loops + // 3) replace constant predicates with either a simple kernel invocation or a no-op + // 4) replace single-iteration loops with simply setting the index value and evaluating the loop body + // 5) identify loops / index variable statements to omit + + // We need to create a RecursionState object, because it's passed in as a mutable (in/out) parameter + RecursionStateNew state = { loopNest }; + GenerateLoopsNew(state, schedule); + } + else + { + GenerateLoopsOld({ loopNest }, schedule); + } + } + + bool LoopNestVisitor::UseNewVersion(const LoopNest& loopNest) const + { + for (const auto& k : loopNest.GetKernels()) + { + if (k.newVersion) + { + return true; + } + } + return false; + } + + void LoopNestVisitor::GenerateLoopsNew(RecursionStateNew& state, const LoopVisitSchedule& schedule) const + { + if (schedule.IsDone()) + { + return; + } + + // We're descending into the heart of the loop + + // Find the active range for the current loop dimension and reduce our end amount if it exceeds the active range (boundary case) + auto loopIndex = schedule.CurrentLoopIndex(); + + bool hasValidKernels = false; + for (const auto& k : state.kernelGroups) + { + if (k.first) + { + hasValidKernels = true; + } + } + + // if state.kernelGroups is empty, just put all the remaining indices in the symbol table, marked "done" + if (!hasValidKernels) + { + // get each inner index and add it to state.loopIndices + auto s = schedule; + while (!s.IsDone()) + { + auto innerLoopIndex = s.CurrentLoopIndex(); + DefinePostLoopIndex(innerLoopIndex, state.loopIndices, s); + s = s.Next(); + } + return; + } + + // Alg: + + // 1) get splits/partitions + // 2) copy partition per kernel (group) + // 3) eval predicates and mark valid regions + // 4) make representation that's a list of kernels to run for each partition (e.g., [1,2 | 2 | 2, 3]) + // 5) move adjacent fully-matching suffix on left into right partition (and expand) + // 6) move adjacent fully-matching prefix on right into left partition (and expand) + + // ex, with S1: first(i), S2: all, S3: last(i): + + // step 1: partitions: (0..1), (1..N-1), (N-1..N) + // step 2: partitions w/ kernels: (0..1: S1, S2, S3), (1..N-1: S1, S2, S3), (N-1..N: S1, S2, S3) + // step 3: eval predicates and remove kernels: (0..1: S1, S2), (1..N-1: S2), (N-1..N: S2, S3) + // step 4: ... 
+ // step 5: Suffix of first partition matches entirety of second: move + // --> (0..1: S1), (0..N-1: S2), (N1-..N: S2, S3) + // step 6: prefix of last partition matches entirety of second: move + // --> (0..1: S1), (0..N: S2), (N1-..N: S3) + + auto loopRange = GetLoopRange(loopIndex, state.loopIndices, schedule); + auto partitions = GetPartitions(loopIndex, loopRange, state.kernelGroups, state.loopIndices, schedule); + std::vector ranges; + LoopFragmentFlags bodyFlags; + bodyFlags.SetFlag(LoopFragmentType::boundary, false); + for (const auto& p : partitions) + { + ranges.push_back({ p.range.Begin(), p.range.End(), p.range.Increment(), bodyFlags, LoopFragmentType::body }); + } + + for (auto r : ranges) + { +#if 1 + std::function codegenFn = GetCodegenFnNew(r, state, schedule); + GenerateLoopRangeNew(r, state, schedule, codegenFn); +#else + std::function codegenFn = GetCodegenFnNew(r, state, schedule); + const int startInt = r.start.Get(); + const int stopInt = r.stop.Get(); + const int stepInt = r.step.Get(); + auto numIterations = CeilDiv(stopInt - startInt, stepInt); + + if (numIterations == 0) + { + // throw? + } + else if (numIterations == 1) + { + // TODO: set initial value of index variable (at least in loop-nest-printing case) + // SetLoopIndexValue(loopIndex, r.start); + codegenFn(r.start); + } + else + { + GenerateLoopRangeNew(r, state, schedule, codegenFn); + } +#endif + } + + // set the loop index state to be "done" + DefinePostLoopIndex(loopIndex, state.loopIndices, schedule); + } + + void LoopNestVisitor::GenerateLoopsOld(const RecursionState& state, const LoopVisitSchedule& schedule) const + { + // Loop-unswitching / duplicating rules: + // + // Need to duplicate the outermost loop involving an index used to compute the constraint index + // Only the innermost loop involving an index used to compute the constraint index needs to start from `1` for the body case + // If all the loops with indices used to compute the constraint index are contiguous, and the kernel is run in the innermost of these loops, + // then we can omit the 'body' from the prologue (/epilogue) fragment, and allow the body loop to start from `0` + // (really, we can have the prologue (/epilogue) fragment contain only the constrained kernel) + + if (schedule.IsDone()) + { + return; + } + + // We're descending into the heart of the loop + + // If the index we're looping over in this loop has any prologue / epilogue kernels, we have to (potentially) break up the range + // into prologue / body / epilogue sections + auto currentDimension = schedule.CurrentDimension(); + + // Find the active range for the current loop dimension and reduce our end amount if it exceeds the active range (boundary case) + auto activeRangeIt = state.activeDimensionRanges.find(currentDimension); + auto loopRange = schedule.LoopRange(); + int begin = loopRange.Begin(); + int end = loopRange.End(); + int increment = schedule.LoopIncrement(); + if (activeRangeIt != state.activeDimensionRanges.end()) + { + auto activeRange = activeRangeIt->second; + if (end > activeRange.End()) + { + end = activeRange.End(); + loopRange = Range{ begin, end, increment }; + } + } + + int nonBoundaryEnd = GetMainBodyLoopEnd(state, schedule, loopRange); + + // These mean "split current loop for this fragment type" + auto currentLoopHasPrologue = schedule.CurrentLoopHasFragment(state.activeKernels, LoopFragmentType::prologue); + auto currentLoopHasEpilogue = schedule.CurrentLoopHasFragment(state.activeKernels, LoopFragmentType::epilogue); + + // check if we 
need to emit an epilogue section to handle the end boundary for this loop + auto currentLoopHasEndBoundary = schedule.CurrentIndexEndBoundarySize() != 0; + + auto futureLoopHasPrologue = schedule.FutureLoopHasFragmentForThisIndex(state.activeKernels, LoopFragmentType::prologue); + auto futureLoopHasEpilogue = schedule.FutureLoopHasFragmentForThisIndex(state.activeKernels, LoopFragmentType::epilogue); + + LoopFragmentFlags bodyFlags = state.currentFragment; + bodyFlags.SetFlag(LoopFragmentType::boundary, false); + + bool bodyInPrologue = !schedule.FragmentCanRunAlone(state.activeKernels, LoopFragmentType::prologue); + bool bodyInEpilogue = !schedule.FragmentCanRunAlone(state.activeKernels, LoopFragmentType::epilogue); + + bool generatePrologueFragment = currentLoopHasPrologue || futureLoopHasPrologue; + bool generateEpilogueFragment = currentLoopHasEpilogue || futureLoopHasEpilogue; + + std::vector ranges; + auto prologueBegin = begin; + auto prologueEnd = begin + increment; + + if (generatePrologueFragment) + { + if (bodyInPrologue) + { + begin += increment; + } + else + { + bodyFlags.SetFlag(LoopFragmentType::prologue, false); + } + } + + // adjust loop boundary to unswitch last loop iteration if we have an epilogue kernel + auto epilogueBegin = end - increment; + auto epilogueEnd = end; + if (generateEpilogueFragment) + { + if (bodyInEpilogue) + { + if (currentLoopHasEndBoundary) + { + epilogueBegin = nonBoundaryEnd; + } + else + { + end -= increment; + nonBoundaryEnd -= increment; + } + } + else + { + bodyFlags.SetFlag(LoopFragmentType::epilogue, false); + } + } + + // Add prologue section + if (generatePrologueFragment) + { + LoopFragmentFlags flags = bodyInPrologue ? LoopFragmentType::prologue | LoopFragmentType::body : LoopFragmentType::prologue; + ranges.push_back({ prologueBegin, prologueEnd, increment, flags, LoopFragmentType::prologue }); + } + + // Add main body section + if (nonBoundaryEnd > begin) + { + ranges.push_back({ begin, nonBoundaryEnd, increment, bodyFlags, LoopFragmentType::body }); + } + + // Add boundary case (unless epilogue case already handles it) + if (currentLoopHasEndBoundary && !(generateEpilogueFragment && bodyInEpilogue) && (end - nonBoundaryEnd > 0)) + { + ranges.push_back({ nonBoundaryEnd, end, increment, bodyFlags | LoopFragmentType::boundary, LoopFragmentType::body }); + } + + // Add epilogue case + if (generateEpilogueFragment) + { + LoopFragmentFlags flags = bodyInEpilogue ? LoopFragmentType::epilogue | LoopFragmentType::body : LoopFragmentType::epilogue; + if (currentLoopHasEndBoundary) + { + flags.SetFlag(LoopFragmentType::boundary, true); + } + ranges.push_back({ epilogueBegin, epilogueEnd, increment, flags, LoopFragmentType::epilogue }); + } + + for (auto r : ranges) + { + std::function codegenFn = GetCodegenFnOld(r, state, schedule); + const int startInt = r.start.Get(); + const int stopInt = r.stop.Get(); + const int stepInt = r.step.Get(); + auto numIterations = CeilDiv(stopInt - startInt, stepInt); + + if (numIterations == 0) + { + // throw? 
+ } + else if (numIterations == 1) + { + codegenFn(r.start); + } + else + { + GenerateLoopRangeOld(r, state, schedule, codegenFn); + } + } + } + + std::function LoopNestVisitor::GetCodegenFnNew(const LoopRange& r, const RecursionStateNew& state, const LoopVisitSchedule& schedule) const + { + // define the function used to do the codegen, but don't run it yet + return [this, &r, &state, &schedule](Scalar index) { + auto loopIndex = schedule.CurrentLoopIndex(); + + // TODO: deal with eventually not having an emit-time-constant range here + const int startInt = r.start.Get(); + const int stopInt = r.stop.Get(); + const int stepInt = r.step.Get(); + + // Note: it's important that this code not be moved outside of the `codegenFn` lambda, otherwise Compute will incorrectly use old info for subsequent ranges + auto newState = state; + newState.loopIndices.insert_or_assign(loopIndex, LoopIndexSymbolTableEntry{ loopIndex, index, Range(startInt, stopInt, stepInt), LoopIndexState::inProgress }); + + // define vars for all kernels + std::vector kernels; + for (const auto& g : state.kernelGroups) + { + if (g.first) + { + kernels.insert(kernels.end(), g.second.kernels.begin(), g.second.kernels.end()); + } + } + + DefineComputedIndexVariables(newState.loopIndices, kernels, schedule); + + // invoke all kernels valid before inner loops + for (auto& g : newState.kernelGroups) + { + if (g.first) + { + auto invoked = InvokeKernelGroup(g.second, newState.loopIndices, schedule); + if (invoked) + { +#if 0 + InvokeForContext([&](LLVMContext& context) { + auto& fn = context.GetFunctionEmitter(); + fn.Print("Invoking kernel " + g.second.id + " at loop index " + loopIndex.GetName() + "\n"); + }); +#endif + + g.first = false; + } + } + } + + // TODO: need to know if we're going to invoke any kernels after the inner loops, and remove them from the valid kernel groups + + if (!schedule.IsInnermostLoop()) + { + GenerateLoopsNew(newState, schedule.Next()); + + // invoke all kernels valid after inner loops + for (auto& g : newState.kernelGroups) + { + if (g.first) + { + auto invoked = InvokeKernelGroup(g.second, newState.loopIndices, schedule); + if (invoked) + { +#if 0 + InvokeForContext([&](LLVMContext& context) { + auto& fn = context.GetFunctionEmitter(); + fn.Print("Invoking after-kernel " + g.second.id + " at loop index " + loopIndex.GetName() + "\n"); + }); +#endif + g.first = false; + } + } + } + } + + // TODO: restore state of variables + // debugging + { + DefineComputedIndexVariables(newState.loopIndices, kernels, schedule); + } + }; + } + + Range LoopNestVisitor::GetLoopRange(const Index& loopIndex, const LoopIndexSymbolTable& activeRanges, const LoopVisitSchedule& schedule) + { + const auto& loopNest = schedule.GetLoopNest(); + const auto& domain = loopNest.GetDomain(); + auto loopRange = domain.GetIndexRange(loopIndex); + int begin = loopRange.Begin(); + int end = loopRange.End(); + int rangeSize = end - begin; + int increment = loopRange.Increment(); + + auto fixBoundaryRange = [&](Index index) { + // check activeRanges for parent + auto outerIndex = domain.GetOuterSplitIndex(domain.GetParentIndex(index)); + if (domain.IsLoopIndex(outerIndex) && activeRanges.count(outerIndex) != 0) + { + // check if it's a boundary --- if so, set size to its size + auto parentRange = activeRanges.at(outerIndex).loopRange; + if (parentRange.Size() < rangeSize) + { + rangeSize = parentRange.Size(); + end = begin + rangeSize; + loopRange = { begin, end, increment }; + } + } + }; + + if (domain.IsInnerSplitIndex(loopIndex)) + { 
+ fixBoundaryRange(loopIndex); + } + else if (domain.HasParentIndex(loopIndex)) + { + auto parentIndex = domain.GetParentIndex(loopIndex); + if (domain.IsInnerSplitIndex(parentIndex)) + { + fixBoundaryRange(domain.GetParentIndex(loopIndex)); + } + } + + return loopRange; + } + + LoopNestVisitor::PartitionList LoopNestVisitor::GetPartitions(const Index& loopIndex, Range loopRange, const ActiveKernelGroupList& kernels, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const + { + int begin = loopRange.Begin(); + int end = loopRange.End(); + int rangeSize = end - begin; + int increment = loopRange.Increment(); + + // Find conditions involving this index and add any relevant partition split points + std::set splits; + for (const auto& g : kernels) + { + if (g.first) + { + for (const auto& k : g.second.kernels) + { + auto predicate = schedule.GetKernelPredicate(k).Simplify(runtimeIndexVariables, schedule); + AddSplits(loopIndex, loopRange, predicate, schedule, splits); + } + } + } + + // Add boundary split point, if necessary + int extra = rangeSize % increment; + if (extra != 0) + { + splits.insert(rangeSize - extra); + } + + // get index range + PartitionList result; + for (auto partitionEnd : splits) + { + result.push_back({ loopIndex, { begin, partitionEnd, increment } }); + begin = partitionEnd; + } + result.push_back({ loopIndex, { begin, end, increment } }); + + return result; + } + + void LoopNestVisitor::AddSplits(const Index& loopIndex, Range loopRange, const KernelPredicate& predicate, const LoopVisitSchedule& schedule, std::set& splits) const + { + const auto& loopNest = schedule.GetLoopNest(); + const auto& domain = loopNest.GetDomain(); + + // visit predicate, adding testVal to splits + auto addSplits = [&splits, &domain, &loopIndex, &loopRange](const auto& addSplits, const KernelPredicate& p) -> void { + if (auto simplePredicate = p.As(); simplePredicate != nullptr) + { + auto where = simplePredicate->GetCondition(); + if (where != Fragment::all) + { + auto predIndex = simplePredicate->GetIndex(); + if (predIndex == loopIndex || (domain.SameDimension(predIndex, loopIndex) && domain.DependsOn(predIndex, loopIndex))) + { + std::optional splitVal; + switch (simplePredicate->GetCondition()) + { + case Fragment::first: + splitVal = loopRange.Begin() + loopRange.Increment(); + break; + case Fragment::last: + { + // take into account last range being a boundary condition + auto extra = loopRange.End() % loopRange.Increment(); + if (extra == 0) + { + splitVal = loopRange.End() - loopRange.Increment(); + } + else + { + splitVal = loopRange.End() - extra; + } + } + break; + case Fragment::endBoundary: + // already set by automatic boundary-handling code + break; + default: + // nothing + break; + } + + if (splitVal) + { + if (splitVal.value() > 0 && splitVal.value() < loopRange.End()) + { + splits.insert(splitVal.value()); + } + } + } + } + } + else if (p.Is()) + { + // nothing + } + else if (auto conjunction = p.As(); conjunction != nullptr) + { + for (const auto& t : conjunction->GetTerms()) + { + addSplits(addSplits, *t); + } + } + else if (auto disjunction = p.As(); disjunction != nullptr) + { + for (const auto& t : disjunction->GetTerms()) + { + addSplits(addSplits, *t); + } + } + }; + + addSplits(addSplits, predicate); + } + + std::function LoopNestVisitor::GetCodegenFnOld(const LoopRange& r, const RecursionState& state, const LoopVisitSchedule& schedule) const + { + // define the function used to do the codegen, but don't run it yet + return 
[this, &r, &state, &schedule](Scalar index) { + const LoopNest& loopNest = schedule.GetLoopNest(); + auto loopIndex = schedule.CurrentLoopIndex(); + + auto dimensionIndex = schedule.CurrentDimension(); + + LoopFragmentFlags flags = state.fragmentStates.count(dimensionIndex) == 0 ? LoopFragmentFlags::All() : state.fragmentStates.at(dimensionIndex); + flags &= r.futureLoopFragmentFlags; + if (r.futureLoopFragmentFlags.GetFlag(LoopFragmentType::boundary)) + { + flags.SetFlag(LoopFragmentType::boundary, true); + } + + // Note: it's important that this code not be moved outside of the `codegenFn` lambda, otherwise Compute will incorrectly use old info for subsequent ranges + auto newState = state; + newState.currentFragment = flags; + newState.fragmentStates[dimensionIndex] = flags; + newState.loopIndices.insert_or_assign(loopIndex, LoopIndexSymbolTableEntry{ loopIndex, index, Range{ 0, r.stop.Get() - r.start.Get() }, LoopIndexState::inProgress }); + + // set the active range for the current dimension based on the loop range given + newState.activeDimensionRanges.insert_or_assign(dimensionIndex, Range{ 0, r.stop.Get() - r.start.Get() }); + + // Should we use 'flags' or 'r.futureLoopFragmentFlags' in GetValidKernels call? + auto prologueKernels = GetValidKernels(newState.activeKernels, newState.fragmentStates, r.futureLoopFragmentFlags, LoopFragmentType::prologue, schedule); + auto bodyKernels = GetValidKernels(newState.activeKernels, newState.fragmentStates, r.futureLoopFragmentFlags, LoopFragmentType::body, schedule); + auto epilogueKernels = GetValidKernels(newState.activeKernels, newState.fragmentStates, r.futureLoopFragmentFlags, { LoopFragmentType::epilogue }, schedule); + + // Concatenate kernel lists together + std::vector thisLoopKernels; + thisLoopKernels.insert(thisLoopKernels.begin(), prologueKernels.begin(), prologueKernels.end()); + thisLoopKernels.insert(thisLoopKernels.begin(), bodyKernels.begin(), bodyKernels.end()); + thisLoopKernels.insert(thisLoopKernels.begin(), epilogueKernels.begin(), epilogueKernels.end()); + + DefineComputedIndexVariables(newState.loopIndices, thisLoopKernels, schedule); + auto indexVariables = GetRuntimeIndexVariables(newState.loopIndices, loopNest); + + // erase all kernels in newState.activeKernels with the same ID as ones we're going to execute + for (const auto& k : thisLoopKernels) + { + auto id = k.kernel.GetId(); + auto it = std::remove_if(newState.activeKernels.begin(), newState.activeKernels.end(), [&](auto el) { + return el.kernel.GetId() == id; + }); + + newState.activeKernels.erase(it, newState.activeKernels.end()); + } + + // Prologue + for (auto k : prologueKernels) + { + InvokeKernel(k.kernel, k.predicate, indexVariables, schedule); + } + + // Body + for (auto k : bodyKernels) + { + InvokeKernel(k.kernel, k.predicate, indexVariables, schedule); + } + + // Recursively generate the loops inside this one + if (!newState.activeKernels.empty()) + { + GenerateLoopsOld(newState, schedule.Next()); + } + + // TODO: restore state of variables + // debugging + { + DefineComputedIndexVariables(newState.loopIndices, thisLoopKernels, schedule); + indexVariables = GetRuntimeIndexVariables(newState.loopIndices, loopNest); + } + + for (auto k : epilogueKernels) + { + InvokeKernel(k.kernel, k.predicate, indexVariables, schedule); + } + }; + } + + int LoopNestVisitor::GetMainBodyLoopEnd(const RecursionState& state, const LoopVisitSchedule& schedule, const Range& loopRange) const + { + if (!LoopInEndBoundaryFragment(state, schedule)) + { + return 
schedule.NonBoundaryEnd(); + } + + auto rangeSize = loopRange.Size(); + auto increment = loopRange.Increment(); + int remainder = rangeSize % increment; + int nonBoundarySize = rangeSize - remainder; + return loopRange.Begin() + nonBoundarySize; + } + + bool LoopNestVisitor::LoopInEndBoundaryFragment(const RecursionState& state, const LoopVisitSchedule& schedule) const + { + auto loopIndex = schedule.CurrentLoopIndex(); + auto dimensionIndex = schedule.GetDomain().GetBaseIndex(loopIndex); + return ((state.fragmentStates.count(dimensionIndex) != 0) && state.fragmentStates.at(dimensionIndex).GetFlag(LoopFragmentType::boundary)); + } + + void LoopNestVisitor::DefineComputedIndexVariables(LoopIndexSymbolTable& indexVariables, const std::vector& activeKernels, const LoopVisitSchedule& schedule) const + { + const auto& loopNest = schedule.GetLoopNest(); + const auto& domain = schedule.GetDomain(); + int numDimensions = domain.NumDimensions(); + + // define all computed index variables (that are used) + std::set usedIndices; + for (int d = 0; d < numDimensions; ++d) + { + auto computedIndices = domain.GetComputedIndicesForDimension(domain.GetBaseIndex(d)); + for (auto index : computedIndices) + { + if (loopNest.IsUsed(index, activeKernels)) + { + usedIndices.insert(index); + } + } + } + + for (const auto& index : usedIndices) + { + auto expr = loopNest.GetIndexExpression(index); + auto indexValue = EmitIndexExpression(index, expr, indexVariables); + indexVariables.insert_or_assign(index, LoopIndexSymbolTableEntry{ schedule.CurrentLoopIndex(), indexValue, Range{ 0, 0, 0 }, LoopIndexState::inProgress }); + } + } + + bool LoopNestVisitor::IsPlacementValid(const ScheduledKernel& kernel, const LoopIndexSymbolTable& runtimeLoopIndices, const LoopVisitSchedule& schedule) const + { + const auto& domain = schedule.GetDomain(); + if (kernel.placement.IsEmpty() || IsBodyPlacementPredicate(kernel.placement)) + { + // TODO: put this in a function that preprocesses the kernel predicates when adding the kernels to the schedule + for (const auto& kernelIndex : kernel.kernel.GetIndices()) + { + for (const auto& loopIndex : domain.GetDependentLoopIndices(kernelIndex, true)) + { + // if not defined(loopIndex) return false; + if (runtimeLoopIndices.count(loopIndex) == 0 || runtimeLoopIndices.at(loopIndex).state == LoopIndexState::done) + { + return false; + } + } + } + + if (kernel.placement.IsEmpty()) + { + return true; + } + } + + auto evalPlacement = [&](const auto& evalPlacement, const KernelPredicate& p) -> bool { + if (p.IsAlwaysTrue()) + { + return true; + } + else if (p.Is()) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "Fragment predicates not valid for placement"); + } + else if (auto placementPred = p.As(); placementPred != nullptr) + { + if (schedule.IsInnermostLoop()) + { + return !placementPred->HasIndex(); + } + + auto nextLoopIndex = schedule.Next().CurrentLoopIndex(); + auto where = placementPred->GetPlacement(); + + std::vector dependentLoopIndices; + if (placementPred->HasIndex()) + { + auto testIndex = placementPred->GetIndex(); + + // get list of dependent indices + dependentLoopIndices = domain.GetDependentLoopIndices(testIndex, true); + + // First check that we're not already inside any dependent loops + for (const auto& i : dependentLoopIndices) + { + if (runtimeLoopIndices.count(i) != 0 && runtimeLoopIndices.at(i).state == LoopIndexState::inProgress) + { + return false; + } + } + } + else + { + dependentLoopIndices = { nextLoopIndex }; + } + + // 
Now check that the next loop at least partially defines the index in question + if (std::find(dependentLoopIndices.begin(), dependentLoopIndices.end(), nextLoopIndex) != dependentLoopIndices.end()) + { + // Finally, check that we're in the correct position (before vs. after) + if (where == Placement::before) + { + return (runtimeLoopIndices.count(nextLoopIndex) == 0 || runtimeLoopIndices.at(nextLoopIndex).state == LoopIndexState::notVisited); + } + else // (where == Placement::after) + { + return (runtimeLoopIndices.count(nextLoopIndex) != 0 && runtimeLoopIndices.at(nextLoopIndex).state == LoopIndexState::done); + } + } + return false; + } + else if (auto definedPred = p.As(); definedPred != nullptr) + { + auto definedIndex = definedPred->GetIndex(); + return (runtimeLoopIndices.count(definedIndex) > 0) && (runtimeLoopIndices.at(definedIndex).state != LoopIndexState::done); + } + else if (auto conjunction = p.As(); conjunction != nullptr) + { + bool result = true; + for (const auto& t : conjunction->GetTerms()) + { + result &= evalPlacement(evalPlacement, *t); + } + return result; + } + else if (auto disjunction = p.As(); disjunction != nullptr) + { + bool result = false; + for (const auto& t : disjunction->GetTerms()) + { + result |= evalPlacement(evalPlacement, *t); + } + return result; + } + else + { + return false; + } + }; + + return evalPlacement(evalPlacement, kernel.placement); + } + + std::vector LoopNestVisitor::GetValidKernels(const ScheduledKernelGroup& kernelGroup, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const + { + std::vector validKernels; + std::copy_if(kernelGroup.kernels.begin(), kernelGroup.kernels.end(), std::back_inserter(validKernels), [&](const ScheduledKernel& k) { + if (!IsPlacementValid(k, runtimeIndexVariables, schedule)) + { + return false; + } + auto predicate = schedule.GetKernelPredicate(k).Simplify(runtimeIndexVariables, schedule); + if (predicate.IsAlwaysFalse()) + { + return false; + } + return true; + }); + return validKernels; + } + + LoopIndexSymbolTable LoopNestVisitor::GetRuntimeIndexVariables(const LoopIndexSymbolTable& runtimeLoopIndices, const LoopNest& loopNest) const + { + int numDimensions = loopNest.NumDimensions(); + + // Start with the concrete loop indices + LoopIndexSymbolTable indexVariables = runtimeLoopIndices; + + // ...and add the variables we need to compute (because they represent an index that has been split) + for (int d = 0; d < numDimensions; ++d) + { + auto computedIndices = loopNest.GetDomain().GetComputedIndicesForDimension(loopNest.GetDomain().GetBaseIndex(d)); + for (auto index : computedIndices) + { + auto runtimeVarIter = runtimeLoopIndices.find(index); + if (runtimeVarIter != runtimeLoopIndices.end()) + { + indexVariables.insert_or_assign(index, runtimeVarIter->second); + } + } + } + return indexVariables; + } + + void LoopNestVisitor::DefinePostLoopIndex(const Index& loopIndex, LoopIndexSymbolTable& runtimeLoopIndices, const LoopVisitSchedule& schedule) const + { + auto loopRange = GetLoopRange(loopIndex, runtimeLoopIndices, schedule); + auto lastVal = loopRange.End(); + runtimeLoopIndices.insert_or_assign(loopIndex, LoopIndexSymbolTableEntry{ loopIndex, lastVal, loopRange, LoopIndexState::done }); + } + + std::vector LoopNestVisitor::GetValidKernels(std::vector activeKernels, const std::unordered_map& fragmentStates, LoopFragmentFlags currentLoopFlags, LoopFragmentFlags kernelFilter, const LoopVisitSchedule& schedule) const + { + std::vector result; + for (auto fragmentType 
: { LoopFragmentType::prologue, LoopFragmentType::body, LoopFragmentType::epilogue }) + { + for (const auto& kernel : activeKernels) + { + if (kernelFilter.GetFlag(kernel.constraints.GetPlacement())) + { + // This should only run in a loop fragment of type 'fragmentType' and allowed by currentLoopFlags + if (ShouldRunKernel(kernel, fragmentType, fragmentStates, currentLoopFlags, schedule)) + { + result.push_back(kernel); + } + } + } + } + + return result; + } + + bool LoopNestVisitor::ShouldRunKernel(const ScheduledKernel& kernel, LoopFragmentType kernelPlacement, const std::unordered_map& constraintIndices, LoopFragmentFlags currentLoopFlags, const LoopVisitSchedule& schedule) const + { + const auto& where = kernel.constraints; + auto placement = where.GetPlacement(); + bool isBodyKernel = where.GetBoundaryIndices().size() == 0; + if (isBodyKernel) + placement = LoopFragmentType::body; + + // if (where.GetPlacement() != kernelPlacement) + if (placement != kernelPlacement) + { + return false; + } + + // bool enforceBoundary = where.GetBoundaryIndices().size() != 0 || (currentLoopFlags.GetFlag(LoopFragmentType::prologue) || currentLoopFlags.GetFlag(LoopFragmentType::epilogue)); + bool enforceBoundary = true; + if (enforceBoundary && !currentLoopFlags.GetFlag(kernelPlacement)) + { + return false; + } + + // Are we at the correct loop level (are all the indices needed by the kernel defined)? + // TODO: We want to only fire on a loop involving a leaf child of the index + auto insideIndices = where.GetRequiredIndices(); + if (!insideIndices.empty()) + { + if (currentLoopFlags.GetFlag(kernelPlacement) && where.GetBoundaryIndices().size() != 0) + { + if (schedule.CurrentNestLevel() == 0) + { + return false; + } + if (!AreAllFullyDefined(insideIndices, schedule)) + { + return false; + } + } + else + { + if (!AreAllFullyDefined(insideIndices, schedule)) + { + return false; + } + + // We want to return true only when _this_ loop defines all the indices, so let's check that the parent + // loop wasn't also a valid candidate (but only perform this check if we're not on the first loop) + if (schedule.CurrentNestLevel() != 0) + { + if (AreAllFullyDefined(insideIndices, schedule.Prev())) + { + return false; + } + } + } + } + + // are we part of a prologue/epilogue for the indices we were constrained with? + for (const auto& outsideIndex : where.GetBoundaryIndices()) + { + auto it = constraintIndices.find(outsideIndex); + if (it == constraintIndices.end() || !it->second.GetFlag(kernelPlacement)) + { + return false; + } + + // is this the innermost loop level (or later) for the given constraint index? + // (to check, just ensure there are no more loops after this one with the same dimension index) + if (schedule.Next().WillVisitIndex(outsideIndex)) + { + return false; + } + } + + return true; + } + + bool LoopNestVisitor::WillKernelRunInThisLoop(const ScheduledKernel& kernel, LoopFragmentFlags kernelFilter, const LoopVisitSchedule& schedule) const + { + // return true if: + // 1) constraints position allowed by kernelFilter + // 2) all required indices exist + // 3) none of boundary indices exist (except perhaps for current loop?) + const auto& where = kernel.constraints; + if (!kernelFilter.GetFlag(where.GetPlacement())) + { + return false; + } + + // are we at the correct loop level (are all the indices needed by the kernel defined)? 
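+ // Illustration (hypothetical schedule): with loop order (i_outer, j, i_inner) and a kernel
+ // requiring indices { i, j }, index i only becomes fully defined once i_inner is reached,
+ // so the checks below succeed there; the schedule.Prev() test then attributes the kernel
+ // to that first qualifying loop level instead of every deeper one.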
+ // TODO: need to allow using non-"dimension" indices as well (for non-innermost kernels) + auto insideIndices = where.GetRequiredIndices(); + if (!insideIndices.empty()) + { + // If all the required indices aren't defined yet, fail + if (!AreAllFullyDefined(insideIndices, schedule)) + { + return false; + } + + // We want to return true only when _this_ loop defines all the indices, so let's check that the parent + // loop wasn't also a valid candidate (but only perform this check if we're not on the first loop) + if (schedule.CurrentNestLevel() != 0) + { + if (AreAllFullyDefined(insideIndices, schedule.Prev())) + { + return false; + } + } + } + + // are we part of a prologue/epilogue for the indices we were constrained with? + for (const auto& outsideIndex : where.GetBoundaryIndices()) + { + if (schedule.IsDone()) + { + return false; + } + + if (schedule.Next().WillVisitIndex(outsideIndex)) + { + return false; + } + } + + return true; + } + + bool LoopNestVisitor::IsIdentity(const IndexExpression& expr, const Index& index) const + { + return (expr.indices.size() == 1 && + expr.indices[0].index == index && + expr.indices[0].scale == 1 && + expr.begin == 0); + } + + bool LoopNestVisitor::AreAllFullyDefined(const std::vector<Index>& indices, const LoopVisitSchedule& schedule) const + { + for (const auto& index : indices) + { + if (!schedule.IsFullyDefined(index)) + { + return false; + } + } + return true; + } + + } // namespace loopnests +} // namespace value +} // namespace ell diff --git a/libraries/value/src/loopnests/Range.cpp b/libraries/value/src/loopnests/Range.cpp new file mode 100644 index 000000000..4960efc8f --- /dev/null +++ b/libraries/value/src/loopnests/Range.cpp @@ -0,0 +1,65 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: Range.cpp (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "loopnests/Range.h" + +#include <ostream> + +namespace ell +{ +namespace value +{ + namespace loopnests + { + Range::Range(int begin, int end, int increment) : + _begin(begin), + _end(end), + _increment(increment) {} + + int Range::Begin() const { return _begin; } + + int Range::End() const { return _end; } + + int Range::Size() const { return _end - _begin; } + + int Range::Increment() const { return _increment; } + + std::ostream& operator<<(std::ostream& os, const Range& r) + { + os << "[" << r.Begin() << "," << r.End() << ":" << r.Increment() << ")"; + return os; + } + + bool operator==(const Range& i1, const Range& i2) + { + return (i1.Begin() == i2.Begin()) && (i1.End() == i2.End()) && (i1.Increment() == i2.Increment()); + } + + bool operator!=(const Range& i1, const Range& i2) + { + return (i1.Begin() != i2.Begin()) || (i1.End() != i2.End()) || (i1.Increment() != i2.Increment()); + } + + bool operator<(const Range& i1, const Range& i2) + { + if (i1.Begin() != i2.Begin()) + { + return i1.Begin() < i2.Begin(); + } + else if (i1.End() != i2.End()) + { + return i1.End() < i2.End(); + } + else + { + return i1.Increment() < i2.Increment(); + } + } + } // namespace loopnests +} // namespace value +} // namespace ell diff --git a/libraries/value/src/loopnests/SplitIndexRange.cpp b/libraries/value/src/loopnests/SplitIndexRange.cpp new file mode 100644 index 000000000..996dba4ec --- /dev/null +++ b/libraries/value/src/loopnests/SplitIndexRange.cpp @@ -0,0 +1,506 @@
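+// A minimal usage sketch for the Range type defined above (illustrative values; assumes the
+// loopnests headers are on the include path and <iostream> is included):
+//
+//     ell::value::loopnests::Range r(0, 16, 4);
+//     std::cout << r << "\n"; // prints "[0,16:4)" via the operator<< in Range.cpp
+//     int s = r.Size();       // 16 --- note Size() is end - begin, not the trip count
+//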
+//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: SplitIndexRange.cpp (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "loopnests/SplitIndexRange.h" + +#include + +#include +#include + +namespace ell +{ +namespace value +{ + namespace loopnests + { + SplitIndexRange::SplitIndexRange(const IndexRange& indexRange) + { + auto index = indexRange.GetIndex(); + auto range = indexRange.GetRange(); + + // Add the dimension index as the root of the index tree + _indices.push_back(index); + _ranges.push_back(range); + _parentOffset.push_back(-1); // 'null' sentinel + _leftChildOffset.push_back(-1); // 'null' sentinel + _indexOffset[index] = 0; + } + + const Index& SplitIndexRange::GetDimensionIndex() const + { + return _indices[0]; + } + + int SplitIndexRange::NumSplits() const + { + auto result = static_cast(_indices.size() + 1) / 2; + return result; + } + + int SplitIndexRange::GetBegin() const + { + return _ranges[0].Begin(); + } + + int SplitIndexRange::GetSize() const + { + return _ranges[0].Size(); + } + + int SplitIndexRange::GetIncrement() const + { + return _ranges[0].Increment(); + } + + bool SplitIndexRange::Contains(const Index& index) const + { + if (IsDimension(index)) + { + return true; + } + return _indexOffset.count(index) != 0; + } + + bool SplitIndexRange::IsLoopIndex(const Index& index) const + { + auto node = GetNode(index); + return IsLeaf(node); + } + + bool SplitIndexRange::IsComputedIndex(const Index& index) const + { + auto node = GetNode(index); + return IsInteriorNode(node); + } + + bool SplitIndexRange::IsDimension(const Index& index) const + { + return index == GetDimensionIndex(); + } + + bool SplitIndexRange::IsChildOf(const Index& child, const Index& parent) const + { + auto childOffset = GetNode(child); + auto parentOffset = GetNode(parent); + return _parentOffset.at(childOffset) != -1 && _parentOffset.at(childOffset) == parentOffset; + } + + bool SplitIndexRange::IsParentOf(const Index& parent, const Index& child) const + { + return IsChildOf(parent, child); + } + + // index1 depends on index2? e.g., is index2 in the list of dependent indices? 
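+ // e.g., with dimension i split once into i_1 (outer) and i_2 (inner): DependsOn(i, i_2) is
+ // true (the dimension index depends on everything in its tree), DependsOn(i_2, i) is false,
+ // and DependsOn(i_1, i_2) is false, since neither index is an ancestor of the other.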
+ bool SplitIndexRange::DependsOn(const Index& index1, const Index& index2) const + { + // TODO: assert index1 and index2 are both in this dimension + + // The top-level dimension index depends on everything + if (IsDimension(index1)) + { + return true; + } + + // nothing else depends on the top-level dimension index + if (IsDimension(index2)) + { + return false; + } + + auto node1 = GetNode(index1); + auto node2 = GetNode(index2); + if (node1 == node2) + { + return false; + } + + while (node2 > 0) + { + auto parentNode = GetParent(node2); + if (parentNode == node1) + { + return true; + } + node2 = parentNode; + } + return false; + } + + int SplitIndexRange::GetSplitSize(int level) const + { + if (level > NumSplits()) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "GetSplitSize() --- index out of range"); + } + if (level == NumSplits()) + { + return 1; + } + auto node = GetNthLeaf(level); + return _ranges[node].Size(); + } + + Range SplitIndexRange::GetDimensionRange() const + { + return _ranges[0]; + } + + Range SplitIndexRange::GetIndexRange(const Index& index) const + { + auto node = GetNode(index); + return _ranges[node]; + } + + Index SplitIndexRange::GetSplitIndex(int level) const + { + auto node = GetNthLeaf(level); + return _indices[node]; + } + + SplitIndex SplitIndexRange::Split(int size) + { + if (size > _ranges.back().Size()) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "Split size larger than smallest existing split"); + } + + auto lastLeaf = GetSmallestLeaf(0); // arbitrarily choose the last leaf --- it will be the right child of the right child of the right child... of the root (the `0` argument) + return SplitNode(lastLeaf, size); + } + + SplitIndex SplitIndexRange::Split(Index index, int size) + { + if (IsDimension(index)) + { + index = _indices[0]; + } + auto node = GetNode(index); + return SplitNode(node, size); + } + + int SplitIndexRange::GetNode(const Index& index) const + { + return _indexOffset.at(index); + } + + SplitIndex SplitIndexRange::SplitNode(int node, int size) + { + auto prefix = GetDimensionIndex().GetName() + "_"; + auto startIndex = static_cast(_indices.size()); + Index outer = { prefix + std::to_string(startIndex) }; + Index inner = { prefix + std::to_string(startIndex + 1) }; + + // `SplitNode(n, size)` splits a leaf of `n` --- the bottom-rightmost leaf. If `n` is a leaf already, split it. 
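+ // Traced example: for a fresh dimension i with range [0, 16, 1), Split(4) reaches here with
+ // node == 0, names the outer index "i_1" with range { 0, 16, 4 } and the inner index "i_2"
+ // with range { 0, 4, 1 }, and records i_1's offset as node 0's left child.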
+ auto parentOffset = GetSmallestLeaf(node); + auto parent = _indices[parentOffset]; + auto parentSize = _ranges[parentOffset].Size(); + auto parentIncrement = _ranges[parentOffset].Increment(); + auto leftChildOffset = static_cast<int>(_indices.size()); + + // Add outer index to data structure + auto offset = static_cast<int>(_indices.size()); + _indices.push_back(outer); + _ranges.push_back({ 0, parentSize, size }); + _parentOffset.push_back(parentOffset); + _leftChildOffset.push_back(-1); + _indexOffset[outer] = offset; + + // Add inner index to data structure + offset = static_cast<int>(_indices.size()); + _indices.push_back(inner); + auto thisSize = std::min(parentSize, size); // In case split is larger than original range + _ranges.push_back({ 0, thisSize, parentIncrement }); + _parentOffset.push_back(parentOffset); + _leftChildOffset.push_back(-1); + _leftChildOffset[parentOffset] = leftChildOffset; + _indexOffset[inner] = offset; + + return SplitIndex{ outer, inner }; + } + + // + // Binary-tree implementation below: + // + int SplitIndexRange::GetParent(int node) const + { + return _parentOffset.at(node); + } + + int SplitIndexRange::GetLeftChild(int node) const + { + return _leftChildOffset.at(node); + } + + int SplitIndexRange::GetRightChild(int node) const + { + auto leftChild = _leftChildOffset.at(node); + return leftChild == -1 ? leftChild : leftChild + 1; + } + + int SplitIndexRange::GetNthLeaf(int n) const + { + for (int i = 0; i < static_cast<int>(_leftChildOffset.size()); ++i) + { + auto node = _leftChildOffset[i]; + if (node == -1) // leaves have no children + { + if (n == 0) + { + return i; + } + --n; + } + } + + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "Couldn't find node"); + } + + // returns the "smallest" leaf descendant of index. If index is itself a leaf, return it, else return GetSmallestLeaf(index.rightChild) + int SplitIndexRange::GetSmallestLeaf(int node) const + { + if (IsLeaf(node)) + { + return node; + } + return GetSmallestLeaf(GetRightChild(node)); + } + + bool SplitIndexRange::IsLeaf(int node) const + { + auto isLeaf = _leftChildOffset.at(node) == -1; + if (isLeaf && (_parentOffset.at(node) == -1 && node != 0)) + { + throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState); + } + + return isLeaf; + } + + bool SplitIndexRange::IsInteriorNode(int node) const + { + return !IsLeaf(node); + } + + const std::vector<Index>& SplitIndexRange::GetIndices() const + { + return _indices; + } + + std::vector<Index> SplitIndexRange::GetLoopIndices() const + { + std::vector<Index> result; + for (int n = 0; n < static_cast<int>(_indices.size()); ++n) + { + if (IsLeaf(n)) + { + result.push_back(_indices[n]); + } + } + return result; + } + + std::vector<Index> SplitIndexRange::GetComputedIndices() const + { + std::vector<Index> result; + for (int n = 0; n < static_cast<int>(_indices.size()); ++n) + { + if (IsInteriorNode(n)) + { + result.push_back(_indices[n]); + } + } + return result; + } + + std::vector<Index> SplitIndexRange::GetDependentIndices(const Index& index, bool includeSelf) const + { + std::vector<Index> result; + if (includeSelf) + { + result.push_back(index); + } + + std::queue<int> nodesToVisit; + auto node = GetNode(index); + nodesToVisit.push(node); + // get all children, children of children, etc.
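+ // e.g., if i was split twice (i_1/i_2 under i, then i_3/i_4 under i_2), this walk from i_2
+ // collects { i_2, i_3, i_4 }; note the loop records the starting node itself, so passing
+ // includeSelf == true currently lists the starting index twice.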
+ while (!nodesToVisit.empty()) + { + auto n = nodesToVisit.front(); + nodesToVisit.pop(); + result.push_back(_indices[n]); + auto leftChild = GetLeftChild(n); + if (leftChild != -1) + { + nodesToVisit.push(leftChild); + nodesToVisit.push(leftChild + 1); + } + } + return result; + } + + std::vector SplitIndexRange::GetDependentLoopIndices(const Index& index, bool includeSelf) const + { + + std::queue nodesToVisit; + auto node = IsDimension(index) ? 0 : GetNode(index); + if (includeSelf && IsLeaf(node)) + { + // If we're a leaf, no need to check any further + return { index }; + } + + std::vector result; + nodesToVisit.push(node); + // get all children, children of children, etc. + while (!nodesToVisit.empty()) + { + auto n = nodesToVisit.front(); + nodesToVisit.pop(); + + for (auto child : { GetLeftChild(n), GetRightChild(n) }) + { + if (child != -1) + { + if (IsLeaf(child)) + { + result.push_back(_indices[child]); + } + else + { + nodesToVisit.push(child); + } + } + } + } + return result; + } + + bool SplitIndexRange::HasParentIndex(const Index& index) const + { + auto node = GetNode(index); + return node != -1 && node != 0; + } + + Index SplitIndexRange::GetParentIndex(const Index& index) const + { + auto node = GetNode(index); + if (node == -1 || node == 0) + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "GetParentIndex() --- dimension index has no parent"); + + return _indices[GetParent(node)]; + } + + bool SplitIndexRange::IsOuterSplitIndex(const Index& index) const + { + auto node = GetNode(index); + if (node == -1 || node == 0) + { + return false; + } + + auto parentNode = GetParent(node); + return node == GetLeftChild(parentNode); + } + + bool SplitIndexRange::IsInnerSplitIndex(const Index& index) const + { + auto node = GetNode(index); + if (node == -1 || node == 0) + { + return false; + } + + auto parentNode = GetParent(node); + return node == GetRightChild(parentNode); + } + + Index SplitIndexRange::GetOuterSplitIndex(const Index& parent) const + { + auto parentNode = GetNode(parent); + if (IsLeaf(parentNode)) + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "GetOuterSplitIndex() --- called on a non-split index"); + + return _indices[GetLeftChild(parentNode)]; + } + + Index SplitIndexRange::GetInnerSplitIndex(const Index& parent) const + { + auto parentNode = GetNode(parent); + if (IsLeaf(parentNode)) + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "GetInnerSplitIndex() --- called on a non-split index"); + + return _indices[GetRightChild(parentNode)]; + } + + std::vector SplitIndexRange::GetAllParentIndices(const Index& index) const + { + std::vector result{ GetDimensionIndex() }; + auto node = GetNode(index); + if (node == -1) + return result; + + for (;;) + { + node = GetParent(node); + if (node == -1) + { + break; + } + result.push_back(_indices[node]); + } + return result; + } + + std::vector SplitIndexRange::GetChildIndices(const Index& index) const + { + if (IsDimension(index)) + { + return { _indices[0] }; + } + + auto node = GetNode(index); + if (node == -1) + { + throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState); + } + + return { _indices[GetLeftChild(node)], _indices[GetRightChild(node)] }; + } + + void SplitIndexRange::Print(std::ostream& os) const + { + os << "Dimension " << GetDimensionIndex() << " range: " << GetDimensionRange() << std::endl; + + os << " Loop variables:\t"; + for (auto i : GetLoopIndices()) + { + auto r = 
GetIndexRange(i); + os << i << ": " << r << ";\t"; + } + os << std::endl; + + os << " Comp variables:\t"; + for (auto i : GetComputedIndices()) + { + auto r = GetIndexRange(i); + os << i << ": " << r << " ("; + std::string sep = ""; + for (auto dep : GetChildIndices(i)) + { + os << sep << dep; + sep = ", "; + } + os << ");\t"; + } + os << std::endl; + } + } // namespace loopnests +} // namespace value +} // namespace ell diff --git a/libraries/value/src/loopnests/SplitIterationDomain.cpp b/libraries/value/src/loopnests/SplitIterationDomain.cpp new file mode 100644 index 000000000..1faca13a3 --- /dev/null +++ b/libraries/value/src/loopnests/SplitIterationDomain.cpp @@ -0,0 +1,254 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: SplitIterationDomain.cpp (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "loopnests/SplitIterationDomain.h" + +namespace ell +{ +namespace value +{ + namespace loopnests + { + SplitIterationDomain::SplitIterationDomain(const IterationDomain& domain) + { + int numDimensions = domain.NumDimensions(); + for (int d = 0; d < numDimensions; ++d) + { + _dimensions.emplace_back(domain.GetDimensionRange(d)); + _indexToOffsetMap[_dimensions.back().GetDimensionIndex()] = d; + for (auto i : _dimensions.back().GetIndices()) + { + _baseIndices.emplace(i, _dimensions.back().GetDimensionIndex()); + } + } + //Assert(IsUnique(Transform(domain.GetRanges(), [](auto x) { return x.GetDimensionIndex().GetName(); })), "Dimensions must have unique indices"); + } + + int SplitIterationDomain::NumDimensions() const + { + return static_cast(_dimensions.size()); + } + + int SplitIterationDomain::GetDimensionSize(const Index& dimensionIndex) const + { + return GetDimensionRange(dimensionIndex).GetSize(); + } + + int SplitIterationDomain::GetDimensionBegin(const Index& dimensionIndex) const + { + return GetDimensionRange(dimensionIndex).GetBegin(); + } + + Range SplitIterationDomain::GetIndexRange(const Index& index) const + { + return GetDimensionRange(index).GetIndexRange(index); + } + + const std::vector& SplitIterationDomain::GetIndicesForDimension(const Index& dimensionIndex) const + { + return GetDimensionRange(dimensionIndex).GetIndices(); + } + + std::vector SplitIterationDomain::GetAllLoopIndices() const + { + std::vector result; + for (int i = 0; i < NumDimensions(); ++i) + { + auto dimensionIndices = GetDimensionRange(i).GetLoopIndices(); + result.insert(result.end(), dimensionIndices.begin(), dimensionIndices.end()); + } + return result; + } + + std::vector SplitIterationDomain::GetLoopIndicesForDimension(const Index& dimensionIndex) const + { + return GetDimensionRange(dimensionIndex).GetLoopIndices(); + } + + std::vector SplitIterationDomain::GetComputedIndicesForDimension(const Index& dimensionIndex) const + { + return GetDimensionRange(dimensionIndex).GetComputedIndices(); + } + + std::vector SplitIterationDomain::GetDependentIndices(const Index& index, bool includeSelf) const + { + return GetDimensionRange(index).GetDependentIndices(index, includeSelf); + } + + std::vector SplitIterationDomain::GetDependentLoopIndices(const Index& index, bool includeSelf) const + { + return GetDimensionRange(index).GetDependentLoopIndices(index, includeSelf); + } + + bool SplitIterationDomain::Contains(const Index& index) const + { + return 
GetDimensionRange(index).Contains(index); + } + + bool SplitIterationDomain::IsLoopIndex(const Index& index) const + { + return GetDimensionRange(index).IsLoopIndex(index); + } + + bool SplitIterationDomain::IsComputedIndex(const Index& index) const + { + return GetDimensionRange(index).IsComputedIndex(index); + } + + bool SplitIterationDomain::IsDimension(const Index& index) const + { + return GetBaseIndex(index) == index; + } + + bool SplitIterationDomain::SameDimension(const Index& index1, const Index& index2) const + { + return GetBaseIndex(index1) == GetBaseIndex(index2); + } + + bool SplitIterationDomain::IsParentOf(const Index& parent, const Index& child) const + { + if (!SameDimension(parent, child)) + { + return false; + } + return GetDimensionRange(parent).IsParentOf(parent, child); + } + + bool SplitIterationDomain::IsChildOf(const Index& child, const Index& parent) const + { + if (!SameDimension(child, parent)) + { + return false; + } + return GetDimensionRange(child).IsChildOf(child, parent); + } + + bool SplitIterationDomain::DependsOn(const Index& index1, const Index& index2) const + { + if (!SameDimension(index1, index2)) + { + return false; + } + return GetDimensionRange(index1).DependsOn(index1, index2); + } + + bool SplitIterationDomain::HasParentIndex(const Index& parent) const + { + return GetDimensionRange(parent).HasParentIndex(parent); + } + + Index SplitIterationDomain::GetParentIndex(const Index& parent) const + { + return GetDimensionRange(parent).GetParentIndex(parent); + } + + bool SplitIterationDomain::IsOuterSplitIndex(const Index& index) const + { + return GetDimensionRange(index).IsOuterSplitIndex(index); + } + bool SplitIterationDomain::IsInnerSplitIndex(const Index& index) const + { + return GetDimensionRange(index).IsInnerSplitIndex(index); + } + Index SplitIterationDomain::GetOuterSplitIndex(const Index& parent) const + { + return GetDimensionRange(parent).GetOuterSplitIndex(parent); + } + Index SplitIterationDomain::GetInnerSplitIndex(const Index& parent) const + { + return GetDimensionRange(parent).GetInnerSplitIndex(parent); + } + + std::vector SplitIterationDomain::GetAllParentIndices(const Index& index) const + { + return GetDimensionRange(index).GetAllParentIndices(index); + } + + std::vector SplitIterationDomain::GetChildIndices(const Index& index) const + { + return GetDimensionRange(index).GetChildIndices(index); + } + + const SplitIndexRange& SplitIterationDomain::GetDimensionRange(int offset) const + { + return _dimensions[offset]; + } + + SplitIndexRange& SplitIterationDomain::GetDimensionRange(int offset) + { + return _dimensions[offset]; + } + + const SplitIndexRange& SplitIterationDomain::GetDimensionRange(const Index& index) const + { + auto offset = GetOffsetFromIndex(index); + return _dimensions[offset]; + } + + SplitIndexRange& SplitIterationDomain::GetDimensionRange(const Index& index) + { + auto offset = GetOffsetFromIndex(index); + return _dimensions[offset]; + } + + int SplitIterationDomain::NumSplits(const Index& index) const + { + auto offset = GetOffsetFromIndex(index); + return _dimensions[offset].NumSplits(); + } + + SplitIndex SplitIterationDomain::Split(const Index& index, int splitSize) + { + auto baseIndex = GetBaseIndex(index); + auto offset = GetOffsetFromIndex(index); + auto result = _dimensions[offset].Split(index, splitSize); + _baseIndices.emplace(result.inner, baseIndex); + _baseIndices.emplace(result.outer, baseIndex); + return result; + } + + bool SplitIterationDomain::IsPrimaryDimension(const Index& index) const 
+ { + return _indexToOffsetMap.count(index) != 0; + } + + int SplitIterationDomain::GetOffsetFromIndex(const Index& index) const + { + auto baseIndex = GetBaseIndex(index); + return _indexToOffsetMap.at(baseIndex); + } + + Index SplitIterationDomain::GetBaseIndex(const Index& index) const + { + auto mapIndex = _baseIndices.find(index); + if (mapIndex != _baseIndices.end()) + { + return mapIndex->second; + } + else + { + return index; + } + } + + Index SplitIterationDomain::GetBaseIndex(int offset) const + { + return _dimensions[offset].GetDimensionIndex(); + } + + void SplitIterationDomain::Print(std::ostream& os) const + { + for (const auto& d : _dimensions) + { + d.Print(os); + } + } + + } // namespace loopnests +} // namespace value +} // namespace ell diff --git a/libraries/value/test/include/CachingStrategy_test.h b/libraries/value/test/include/CachingStrategy_test.h new file mode 100644 index 000000000..cb1a1ab1e --- /dev/null +++ b/libraries/value/test/include/CachingStrategy_test.h @@ -0,0 +1,148 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: CachingStrategy_test.h (value) +// Authors: Mason Remy +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +namespace ell +{ +namespace value +{ + class Scalar; +} + +// Simple Blas TCOPY tests +value::Scalar BLASTCOPY_ValidateOutput_Test1(); +value::Scalar BLASTCOPY_ValidateOutput_Test2(); +value::Scalar BLASTCOPY_ValidateMemory_Test1(); +value::Scalar BLASTCOPY_ValidateMemory_Test2(); +value::Scalar BLASTCOPY_ValidateMemory_Test3(); + +value::Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Test1(); +value::Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Test2(); +value::Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Test3(); +value::Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Test4(); +value::Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Test5(); +value::Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Test6(); +value::Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Test7(); +value::Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Test8(); +value::Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Test9(); + +value::Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Test1(); +value::Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Test2(); +value::Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Test3(); +value::Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Test4(); +value::Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Test5(); +value::Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Test6(); +value::Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Test7(); +value::Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Test8(); +value::Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Test9(); + +// Direct convolution caching +value::Scalar ConvolutionWeight_ValidateOutput_Test1(); +value::Scalar ConvolutionWeight_Reshape_ValidateMemory_Test1(); +value::Scalar ConvolutionInput_ValidateOutput_Test1(); +value::Scalar ConvolutionInput_ValidateOutput_Test2(); +value::Scalar ConvolutionOutput_ValidateOutput_Test1(); +value::Scalar DirectConvolution_Test1(); + +// General caching strategy +value::Scalar GeneralCachingStrategy_ValidateOutput_Test1(); +value::Scalar GeneralCachingStrategy_ValidateOutput_Test2(); +value::Scalar GeneralCachingStrategy_ValidateOutput_Test3();
+value::Scalar GeneralCachingStrategy_ValidateOutput_Test4(); +value::Scalar GeneralCachingStrategy_ValidateOutput_Test5(); +value::Scalar GeneralCachingStrategy_ValidateOutput_Test6(); +value::Scalar GeneralCachingStrategy_ValidateOutput_Test7(); +value::Scalar GeneralCachingStrategy_ValidateOutput_Test8(); +value::Scalar GeneralCachingStrategy_ValidateOutput_Test9(); +value::Scalar GeneralCachingStrategy_ValidateOutput_Test10(); +value::Scalar GeneralCachingStrategy_ValidateOutput_Test11(); +value::Scalar GeneralCachingStrategy_ValidateOutput_Test12(); +value::Scalar GeneralCachingStrategy_ValidateOutput_Test13(); + +value::Scalar GeneralCachingStrategy_ValidateMemory_Test1(); + +// General caching strategy boundary condition output tests +value::Scalar GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test1(); +value::Scalar GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test2(); +value::Scalar GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test3(); +value::Scalar GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test4(); +value::Scalar GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test5(); +value::Scalar GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test6(); +value::Scalar GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test7(); +value::Scalar GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test8(); +value::Scalar GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test9(); + +// General caching strategy BLASTCopy-style tests +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_Test1(); +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_Test2(); + +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_Test1(); +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_Test2(); +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_Test3(); + +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test1(); +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test2(); +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test3(); +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test4(); +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test5(); +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test6(); +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test7(); +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test8(); +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test9(); + +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test1(); +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test2(); +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test3(); +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test4(); +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test5(); +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test6(); +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test7(); +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test8(); +value::Scalar 
GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test9(); + +// General caching strategy ProgressiveBLASNCopy-style tests + +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_Test1(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_Test2(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_Test1(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_Test2(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_Test3(); + +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test1(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test2(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test3(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test4(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test5(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test6(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test7(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test8(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test9(); + +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test1(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test2(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test3(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test4(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test5(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test6(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test7(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test8(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test9(); + +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test1(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test2(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test3(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test4(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test5(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test6(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test7(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test8(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test9(); + +} // namespace ell diff --git 
a/libraries/value/test/include/Functions_test.h b/libraries/value/test/include/Functions_test.h new file mode 100644 index 000000000..910e60cb1 --- /dev/null +++ b/libraries/value/test/include/Functions_test.h @@ -0,0 +1,18 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: Functions_test.h (value) +// Authors: Kern Handa, Chuck Jacobs +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include + +namespace ell +{ + +value::Scalar FunctionArgType_test(); + +} // namespace ell diff --git a/libraries/value/test/include/LoopNestAPI_test.h b/libraries/value/test/include/LoopNestAPI_test.h new file mode 100644 index 000000000..65801252e --- /dev/null +++ b/libraries/value/test/include/LoopNestAPI_test.h @@ -0,0 +1,37 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: LoopNestAPI_test.h (value) +// Authors: Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +namespace ell +{ +namespace value +{ + class Scalar; +} +value::Scalar LoopNest_api_test1(); +value::Scalar LoopNest_api_test2(); +value::Scalar LoopNest_api_test3(); +value::Scalar LoopNest_api_test4(); +value::Scalar LoopNest_api_test5(); +value::Scalar LoopNest_api_Parallelized_test1(); +value::Scalar LoopNest_api_Parallelized_test2(); +value::Scalar LoopNest_api_Unrolled_test1(); +value::Scalar LoopNest_api_SetOrder_test1(); +value::Scalar LoopNest_api_CachedMatrix_test1(); +value::Scalar LoopNest_api_SlidingCachedMatrix_test(); +value::Scalar SimpleGemm_HighLevelAPI(); +value::Scalar SimpleGemm_HighLevelAPI_NoCachingHelper(); +value::Scalar MLAS_GEMM_GeneralCachingStrategy(); +value::Scalar OneSplitBoundaryTest(); +value::Scalar TwoSplitBoundaryTest(); +value::Scalar SplitLargerThanSizeBoundaryTest(); +value::Scalar TwoSplitsLargerThanSizeBoundaryTest(); +value::Scalar LoopNest_api_tunable_parameters_test1(); + +} // namespace ell diff --git a/libraries/value/test/include/LoopNest_convolution_test.h b/libraries/value/test/include/LoopNest_convolution_test.h new file mode 100644 index 000000000..c89cc41ea --- /dev/null +++ b/libraries/value/test/include/LoopNest_convolution_test.h @@ -0,0 +1,18 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: LoopNest_convolution_test.h (value) +// Authors: Mason Remy +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +namespace ell +{ +namespace value +{ + class Scalar; +} + +} // namespace ell diff --git a/libraries/value/test/include/LoopNest_kernels.h b/libraries/value/test/include/LoopNest_kernels.h new file mode 100644 index 000000000..15588fdb0 --- /dev/null +++ b/libraries/value/test/include/LoopNest_kernels.h @@ -0,0 +1,42 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: LoopNest_kernels.h (value) +// Authors: Kern Handa, Chuck Jacobs +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +namespace ell +{ +namespace value +{ + class Scalar; + class 
Matrix; + class Tensor; + class Vector; + struct ViewAdapter; +} // namespace value + +void loopnest_passthrough(value::ViewAdapter, value::Scalar i, value::Scalar j); +void loopnest_kernel(value::Matrix m, value::Scalar i, value::Scalar j); +void loopnest_kernel_2(value::Matrix m, value::Scalar i, value::Scalar j); +void loopnest_kernel_3(value::Matrix c, value::Matrix a, value::Scalar i, value::Scalar j); +void loopnest_kernel_4(value::Matrix c, value::Matrix a, value::Scalar i, value::Scalar j); +void matmul_kernel(value::Matrix A, value::Matrix B, value::Matrix C, value::Scalar i, value::Scalar j, value::Scalar k); +void initToZero(value::Matrix m, value::Scalar i, value::Scalar j); +void copyToCache(value::Matrix A, value::Matrix cache, value::Scalar i, value::Scalar j); +void copyFromCache(value::Matrix A, value::Matrix cache, value::Scalar i, value::Scalar j); +void copyToSmallCache(value::Matrix A, value::Matrix cache, value::Scalar i, value::Scalar j); +void copyFromSmallCache(value::Matrix A, value::Matrix cache, value::Scalar i, value::Scalar j); +void addOne(value::Matrix m, value::Scalar i, value::Scalar j); +void addTwo(value::Matrix m, value::Scalar i, value::Scalar j); +void set_vector_kernel(value::Vector v, value::Scalar i); +void increment_vector_kernel(value::Vector v, value::Scalar i); +void copy_vector_kernel(value::Vector v1, value::Vector v2, value::Scalar i); +void reorder_vector_kernel(value::Vector v, value::Matrix m, value::Scalar splitParam, value::Scalar i, value::Scalar iOuter, value::Scalar iInner); +void addCachedMatrixToUnchachedMatrix(value::Matrix A, value::Matrix B, value::Scalar Ai, value::Scalar Aj, value::Scalar Bi, value::Scalar Bj); +void addCachedMatrixToUnchachedMatrixUnrolled(value::Matrix A, value::Matrix B, value::Scalar Ai, value::Scalar Aj, value::Scalar Bi, value::Scalar Bj); + +} // namespace ell diff --git a/libraries/value/test/include/LoopNest_test.h b/libraries/value/test/include/LoopNest_test.h new file mode 100644 index 000000000..fd1cfbf49 --- /dev/null +++ b/libraries/value/test/include/LoopNest_test.h @@ -0,0 +1,89 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: LoopNest_test.h (value) +// Authors: Kern Handa, Chuck Jacobs +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include + +namespace ell +{ +value::Scalar SplitIterationDomain_test1(); + +// Loop nest tests +value::Scalar LoopNest_test1(); +value::Scalar LoopNest_test2(); +value::Scalar LoopNest_test3(); +value::Scalar LoopNest_test4(); +value::Scalar LoopNest_test5(); +value::Scalar LoopNest_test6(); + +value::Scalar LoopNestNonzeroStart_test(); +value::Scalar LoopNestBoundary_test1(); +value::Scalar LoopNestBoundary_test2(); +value::Scalar LoopNestBoundary_test3(); +value::Scalar LoopNestBoundary_test4(); +value::Scalar LoopNestBoundary_test5(); +value::Scalar LoopNestReorder_test1(); +value::Scalar LoopNestReorder_test2(); +value::Scalar TwoKernel_test(); + +value::Scalar LoopNestLastPredicate_test1(); +value::Scalar LoopNestLastPredicate_test2(); +value::Scalar LoopNestLastPredicate_test3(); +value::Scalar LoopNestLastPredicate_test4(); +value::Scalar LoopNestBoundaryPredicate_test1(); + +value::Scalar MissingIndex_test(); +value::Scalar RequiredIndex_test(); +value::Scalar SimpleImperfectNest_test(); +value::Scalar ImperfectNest_test_ijk(); +value::Scalar 
ImperfectNest_test_ikj(); +value::Scalar ImperfectNest_test_kij(); +value::Scalar ImperfectNest_test_ijkijk(); +value::Scalar ImperfectNest_test_kijijk(); +value::Scalar ImperfectNest_test_ijkkij(); +value::Scalar SplitIndex_test1_old(); +value::Scalar SplitIndex_test1(); +value::Scalar SplitIndex_test2(); +value::Scalar SplitIndex_test3(); +value::Scalar EpilogueIndex_test(); +value::Scalar RenameKernelArg_test(); + +value::Scalar NonInnermostKernel_test1(); +value::Scalar NonInnermostKernel_test2(); +value::Scalar NonInnermostKernel_test3(); +value::Scalar NonInnermostKernel_test4(); +value::Scalar CachedMatrix_test1(); +value::Scalar CachedMatrix_test1_new(); +value::Scalar CachedMatrix_test2(); +value::Scalar CachedMatrix_test3(); +value::Scalar CachedMatrix_test4(); +value::Scalar CachedMatrix_test5(); + +value::Scalar LoopNest_Parallelized_test1(); +value::Scalar LoopNest_Parallelized_test2(); + +value::Scalar LoopNest_Unrolled_test1(); + +value::Scalar LoopNest_DebugDump_test1(); +value::Scalar LoopNest_DebugDump_test2(); + +value::Scalar SimpleMatMult_test(); +value::Scalar GotoBLASGemm_LowLevelAPI(); +value::Scalar GotoBLASGemmWithRefDeref(); +value::Scalar YG12LowLevel_TestBoundary(); + +value::Scalar KernelPredicate_test(); +value::Scalar MatMul3_test1(); +value::Scalar MatMul3_test2(); +value::Scalar LoopNestFuse_test1(); +value::Scalar LoopNestFuse_test2(); +value::Scalar LoopNestFuse_test3(); +value::Scalar ConvertedConstraint_test1(); +value::Scalar ConvertedConstraint_test2(); +} // namespace ell diff --git a/libraries/value/test/include/Matrix_test.h b/libraries/value/test/include/Matrix_test.h index 63aad2f8f..93c5e340c 100644 --- a/libraries/value/test/include/Matrix_test.h +++ b/libraries/value/test/include/Matrix_test.h @@ -15,6 +15,7 @@ namespace ell value::Scalar Matrix_test1(); value::Scalar Matrix_test2(); value::Scalar Matrix_test3(); +value::Scalar Matrix_test4(); value::Scalar Reshape_test(); value::Scalar GEMV_test(); value::Scalar MatrixReferenceTest(); diff --git a/libraries/value/test/include/Scalar_test.h b/libraries/value/test/include/Scalar_test.h index 03bf1fed7..168cac7a6 100644 --- a/libraries/value/test/include/Scalar_test.h +++ b/libraries/value/test/include/Scalar_test.h @@ -21,4 +21,6 @@ value::Scalar RefScalarRefTest(); value::Scalar RefScalarRefCtorsTest(); value::Scalar RefScalarRefRefTest(); value::Scalar RefScalarRefRefRefTest(); +value::Scalar SequenceLogicalAndTest(); +value::Scalar SequenceLogicalAndTestWithCopy(); } // namespace ell diff --git a/libraries/value/test/include/TestUtil.h b/libraries/value/test/include/TestUtil.h index c0994c8f8..36e8e4a95 100644 --- a/libraries/value/test/include/TestUtil.h +++ b/libraries/value/test/include/TestUtil.h @@ -8,26 +8,105 @@ #pragma once +#include + +#include #include #include #include #include +#include #include namespace ell { -value::Scalar EqualEpsilon(value::Scalar x, value::Scalar y, double epsilon); -value::Scalar NotEqualEpsilon(value::Scalar x, value::Scalar y, double epsilon); +value::Scalar NotEqualEpsilon(value::Scalar x, value::Scalar y, double epsilon = 1e-7); +value::Scalar EqualEpsilon(value::Scalar x, value::Scalar y, double epsilon = 1e-7); + +value::Scalar VerifySame(value::Vector actual, value::Vector expected, double epsilon = 1e-7); +value::Scalar VerifySame(value::Matrix actual, value::Matrix expected, double epsilon = 1e-7); +value::Scalar VerifySame(value::Tensor actual, value::Tensor expected, double epsilon = 1e-7); +value::Scalar VerifySame(value::Array actual, 
value::Array expected, double epsilon = 1e-7); -value::Scalar Verify(value::Vector actual, value::Vector expected, double epsilon = 1e-7); value::Scalar VerifyDifferent(value::Vector actual, value::Vector expected, double epsilon = 1e-7); -value::Scalar Verify(value::Matrix actual, value::Matrix expected, double epsilon = 1e-7); -value::Scalar Verify(value::Tensor actual, value::Tensor expected, double epsilon = 1e-7); +value::Scalar VerifyDifferent(value::Matrix actual, value::Matrix expected, double epsilon = 1e-7); +value::Scalar VerifyDifferent(value::Tensor actual, value::Tensor expected, double epsilon = 1e-7); +value::Scalar VerifyDifferent(value::Array actual, value::Array expected, double epsilon = 1e-7); void PrintMatrix(std::string indent, value::Matrix e); -void DebugPrint(std::string message); -void DebugPrint(value::Vector message); // expecting null terminated ValueType::Char8 void DebugPrintVector(value::Vector data); -void DebugPrintScalar(value::Scalar value); +void PrintLoops(const value::loopnests::LoopNest& loop, std::string tag); + +value::Scalar GetTID(); + +template +value::Array MakeIncrementingArray(std::vector size, const std::string& name) +{ + auto array = value::MakeArray(utilities::MemoryShape(size), name); + int counter = 0; + value::For(array, [&](const std::vector& indices) { + array(indices) = counter++; + }); + return array; +} + +template +value::Tensor MakeIncrementingTensor(int rows, int columns, int channels, const std::string& name) +{ + auto tensor = value::MakeTensor(rows, columns, channels, name); + int counter = 0; + value::ForRange(channels, [&](value::Scalar channel) { + value::ForRange(rows, [&](value::Scalar row) { + value::ForRange(columns, [&](value::Scalar column) { + tensor(row, column, channel) = counter++; + }); + }); + }); + return tensor; +} + +template +value::Matrix MakeIncrementingMatrix(int rows, int cols, const std::string& name) +{ + auto matrix = value::MakeMatrix(rows, cols, name); + value::ForRange(rows, [&](value::Scalar row) { + value::ForRange(cols, [&](value::Scalar col) { + matrix(row, col) = row * cols + col; + }); + }); + return matrix; +} + +template +value::Vector MakeIncrementingVector(int elements, const std::string& name) +{ + auto vec = value::MakeVector(elements, name); + value::ForRange(elements, [&](value::Scalar element) { + vec(element) = element; + }); + return vec; +} + +// +// Matrix-multiply example helpers +// +void MultiplyMatrices(value::Matrix& A, value::Matrix& B, value::Matrix& C); + +struct MatMul3TestCaseParameters +{ + int M; + int N; + int K; + int L; + value::Matrix A; + value::Matrix B; + value::Matrix C; + value::Matrix D; + value::Matrix E; + value::Matrix expectedC; + value::Matrix expectedE; +}; + +MatMul3TestCaseParameters GetMatMul3TestCaseParameters(int M, int N, int K, int L); } // namespace ell diff --git a/libraries/value/test/include/Value_test.h b/libraries/value/test/include/Value_test.h index e90566c90..5443291cb 100644 --- a/libraries/value/test/include/Value_test.h +++ b/libraries/value/test/include/Value_test.h @@ -15,6 +15,7 @@ namespace ell value::Scalar Basic_test(); value::Scalar DebugPrint_test(); value::Scalar Value_test1(); +value::Scalar Array_test1(); value::Scalar Casting_test1(); value::Scalar If_test1(); value::Scalar Sum_test(); @@ -23,10 +24,30 @@ value::Scalar Intrinsics_test1(); value::Scalar Intrinsics_test2(); value::Scalar For_test1(); value::Scalar For_test2(); +value::Scalar ForInsideIf_test(); +value::Scalar While_test(); +value::Scalar 
WhileInsideIf_test(); value::Scalar ForRangeCasting_test1(); value::Scalar ForRangeCasting_test2(); value::Scalar Parallelized_test1(); value::Scalar Parallelized_test2(); value::Scalar Parallelized_test3(); value::Scalar Prefetch_test1(); + +value::Scalar Prefetch_parallelized_test1(); +value::Scalar Fma_test1(); +value::Scalar Fma_test2(); +value::Scalar Fma_test3(); +value::Scalar UniqueName_test1(); +value::Scalar Parallelized_ComputeContext_test1(); + +value::Scalar MemCopy_test1(); +value::Scalar MemSet_test1(); + +value::Scalar NamedLoops_test1(); + +value::Scalar ThreadLocalAllocation_test1(); + +value::Scalar FunctionPointer_test1(); + } // namespace ell diff --git a/libraries/value/test/src/CachingStrategy_test.cpp b/libraries/value/test/src/CachingStrategy_test.cpp new file mode 100644 index 000000000..048ee84fa --- /dev/null +++ b/libraries/value/test/src/CachingStrategy_test.cpp @@ -0,0 +1,6171 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: CachingStrategy_test.cpp (value) +// Authors: Mason Remy +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "CachingStrategy_test.h" +#include "TestUtil.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace ell::emitters; +using namespace ell::utilities; +using namespace ell::logging; +using namespace ell::value; +using namespace ell::value::loopnests; + +namespace ell +{ +// Tests of LoopNest caching strategies + +Scalar BLASTCOPY_ValidateOutput_Test1() +{ + int N = 8; + int cacheRows = N; + int cacheCols = N; + int stripeSize = 4; + + auto input = MakeIncrementingMatrix(N, N, "input"); + auto output = MakeMatrix(N, N, "output"); + auto expectedOutput = MakeIncrementingMatrix(N, N, "expectedOutput"); + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iCache = schedule.Split(i, cacheRows); + auto jCache = schedule.Split(j, cacheCols); + auto jStripe = schedule.Split(j, stripeSize); + + schedule.SetOrder({ iCache, jCache, jStripe, i, j }); + + BLASTCopy cachingProvider{}; + std::tuple blasTCopyExtras = { stripeSize, jStripe, BoundaryConditionHandling::ZeroPadding }; + schedule.Cache(cachingProvider, + input, + { i, j }, + { cacheRows, cacheCols }, + { iCache, jCache }, + std::nullopt, // Order isn't used by BLASTCopy + blasTCopyExtras); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + return VerifySame(output, expectedOutput); +} + +// Test with smaller cache and stripe size than previous test +Scalar BLASTCOPY_ValidateOutput_Test2() +{ + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + // input, expectedOutput + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7] + // [ 8, 9, 10, 11, 12, 13, 14, 15] + // [16, 17, 18, 19, 20, 21, 22, 23] + // [24, 25, 26, 27, 28, 29, 30, 31] + // [32, 33, 34, 35, 36, 37, 38, 39] + // [40, 41, 42, 43, 44, 45, 46, 47] + // [48, 49, 50, 51, 52, 53, 
54, 55] + // [56, 57, 58, 59, 60, 61, 62, 63] + auto input = MakeIncrementingMatrix(N, N, "input"); + auto output = MakeMatrix(N, N, "output"); + auto expectedOutput = MakeIncrementingMatrix(N, N, "expectedOutput"); + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iCache = schedule.Split(i, cacheRows); + auto jCache = schedule.Split(j, cacheCols); + auto jStripe = schedule.Split(j, stripeSize); + + schedule.SetOrder({ iCache, jCache, jStripe, i, j }); + + BLASTCopy cachingProvider{}; + std::tuple blasTCopyExtras = { stripeSize, jStripe, BoundaryConditionHandling::ZeroPadding }; + schedule.Cache(cachingProvider, + input, + { i, j }, + { cacheRows, cacheCols }, + { iCache, jCache }, + std::nullopt, // Order isn't used by BLASTCopy + blasTCopyExtras); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + return VerifySame(output, expectedOutput); +} + +Scalar BLASTCOPY_ValidateMemory_Test1() +{ + int N = 8; + int cacheRows = N; + int cacheCols = N; + int stripeSize = 4; + + auto input = MakeIncrementingMatrix(N, N, "input"); + auto output = MakeMatrix(N, N, "output"); + + // input + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7] + // [ 8, 9, 10, 11, 12, 13, 14, 15] + // [16, 17, 18, 19, 20, 21, 22, 23] + // [24, 25, 26, 27, 28, 29, 30, 31] + // [32, 33, 34, 35, 36, 37, 38, 39] + // [40, 41, 42, 43, 44, 45, 46, 47] + // [48, 49, 50, 51, 52, 53, 54, 55] + // [56, 57, 58, 59, 60, 61, 62, 63] + // clang-format off + Vector expectedCached = + { + 0, 1, 2, 3, + 8, 9, 10, 11, + 16, 17, 18, 19, + 24, 25, 26, 27, + 32, 33, 34, 35, + 40, 41, 42, 43, + 48, 49, 50, 51, + 56, 57, 58, 59, + + 4, 5, 6, 7, + 12, 13, 14, 15, + 20, 21, 22, 23, + 28, 29, 30, 31, + 36, 37, 38, 39, + 44, 45, 46, 47, + 52, 53, 54, 55, + 60, 61, 62, 63 + }; + // clang-format on + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iCache = schedule.Split(i, cacheRows); + auto jCache = schedule.Split(j, cacheCols); + auto jStripe = schedule.Split(j, stripeSize); + + schedule.SetOrder({ iCache, jCache, jStripe, i, j }); + + BLASTCopy cachingProvider{}; + std::tuple blasTCopyExtras = { stripeSize, jStripe, BoundaryConditionHandling::ZeroPadding }; + schedule.Cache(cachingProvider, + input, + { i, j }, + { cacheRows, cacheCols }, + { iCache, jCache }, + std::nullopt, // Order isn't used by BLASTCopy + blasTCopyExtras); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + // Examine the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + rawCacheValue.SetLayout({ { (int)rawCacheValue.GetLayout().GetMemorySize() } }); + auto cacheVector = Vector(rawCacheValue); + + return VerifySame(cacheVector, expectedCached); +} + +// Smaller stripe size than previous test +Scalar BLASTCOPY_ValidateMemory_Test2() +{ + int N = 8; + int cacheRows = N; + int cacheCols = N; + int stripeSize = 2; + + auto input = MakeIncrementingMatrix(N, N, "input"); + auto output = MakeMatrix(N, N, "output"); + + // input + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7] + // [ 8, 9, 10, 11, 12, 
13, 14, 15] + // [16, 17, 18, 19, 20, 21, 22, 23] + // [24, 25, 26, 27, 28, 29, 30, 31] + // [32, 33, 34, 35, 36, 37, 38, 39] + // [40, 41, 42, 43, 44, 45, 46, 47] + // [48, 49, 50, 51, 52, 53, 54, 55] + // [56, 57, 58, 59, 60, 61, 62, 63] + // clang-format off + Vector expectedCached = + { + 0, 1, + 8, 9, + 16, 17, + 24, 25, + 32, 33, + 40, 41, + 48, 49, + 56, 57, + + 2, 3, + 10, 11, + 18, 19, + 26, 27, + 34, 35, + 42, 43, + 50, 51, + 58, 59, + + 4, 5, + 12, 13, + 20, 21, + 28, 29, + 36, 37, + 44, 45, + 52, 53, + 60, 61, + + 6, 7, + 14, 15, + 22, 23, + 30, 31, + 38, 39, + 46, 47, + 54, 55, + 62, 63 + }; + // clang-format on + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iCache = schedule.Split(i, cacheRows); + auto jCache = schedule.Split(j, cacheCols); + auto jStripe = schedule.Split(j, stripeSize); + + schedule.SetOrder({ iCache, jCache, jStripe, i, j }); + + BLASTCopy cachingProvider{}; + std::tuple blasTCopyExtras = { stripeSize, jStripe, BoundaryConditionHandling::ZeroPadding }; + schedule.Cache(cachingProvider, + input, + { i, j }, + { cacheRows, cacheCols }, + { iCache, jCache }, + std::nullopt, // Order isn't used by BLASTCopy + blasTCopyExtras); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + // Examine the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + rawCacheValue.SetLayout({ { (int)rawCacheValue.GetLayout().GetMemorySize() } }); + auto cacheVector = Vector(rawCacheValue); + + return VerifySame(cacheVector, expectedCached); +} + +// Same stripe size as previous test, but don't cache entire matrix at once +Scalar BLASTCOPY_ValidateMemory_Test3() +{ + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + auto input = MakeIncrementingMatrix(N, N, "input"); + auto output = MakeMatrix(N, N, "output"); + + // input + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7] + // [ 8, 9, 10, 11, 12, 13, 14, 15] + // [16, 17, 18, 19, 20, 21, 22, 23] + // [24, 25, 26, 27, 28, 29, 30, 31] + // [32, 33, 34, 35, 36, 37, 38, 39] + // [40, 41, 42, 43, 44, 45, 46, 47] + // [48, 49, 50, 51, 52, 53, 54, 55] + // [56, 57, 58, 59, 60, 61, 62, 63] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 1, + 8, 9, + 16, 17, + 24, 25, + + 2, 3, + 10, 11, + 18, 19, + 26, 27, + }; + Vector expectedCachedUpperRight = + { + 4, 5, + 12, 13, + 20, 21, + 28, 29, + + 6, 7, + 14, 15, + 22, 23, + 30, 31 + }; + Vector expectedCachedLowerLeft = + { + 32, 33, + 40, 41, + 48, 49, + 56, 57, + + 34, 35, + 42, 43, + 50, 51, + 58, 59, + }; + Vector expectedCachedLowerRight = + { + 36, 37, + 44, 45, + 52, 53, + 60, 61, + + 38, 39, + 46, 47, + 54, 55, + 62, 63 + }; + // clang-format on + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto topLevelI = i; + auto topLevelJ = j; + + auto iCache = schedule.Split(i, cacheRows); + auto jCache = schedule.Split(j, cacheCols); + auto jStripe = schedule.Split(j, stripeSize); + + schedule.SetOrder({ iCache, jCache, jStripe, i, j }); + + BLASTCopy cachingProvider{}; + std::tuple blasTCopyExtras = 
{ stripeSize, jStripe, BoundaryConditionHandling::ZeroPadding }; + schedule.Cache(cachingProvider, + input, + { i, j }, + { cacheRows, cacheCols }, + { iCache, jCache }, + std::nullopt, // Order isn't used by BLASTCopy + blasTCopyExtras); + + // Get a handle to the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + int rawCacheSize = (int)rawCacheValue.GetLayout().NumElements(); + + auto cachedUpperLeft = MakeVector(rawCacheSize); + auto cachedUpperRight = MakeVector(rawCacheSize); + auto cachedLowerLeft = MakeVector(rawCacheSize); + auto cachedLowerRight = MakeVector(rawCacheSize); + + // Add a low level API kernel to access the underlying cache after it has been filled + auto cacheSpyKernel = loopnests::Kernel("cache_spy_kernel") + .Inputs(rawCacheValue, cachedUpperLeft, cachedUpperRight, cachedLowerLeft, cachedLowerRight) + .Indices(topLevelI, topLevelJ) + .Define([cacheRows, cacheCols](Value rawCacheValue, Vector cachedUpperLeft, Vector cachedUpperRight, Vector cachedLowerLeft, Vector cachedLowerRight, Scalar i, Scalar j) { + auto cacheView = rawCacheValue; + cacheView.SetLayout({ { (int)rawCacheValue.GetLayout().NumElements() } }); + auto vectorCacheView = Vector(cacheView); + If(i == 0, + [&]() { + // TODO : remove nested if's + If(j == 0, + [&]() { + cachedUpperLeft = vectorCacheView; + }) + .ElseIf(j == cacheCols, + [&]() { + cachedUpperRight = vectorCacheView; + }); + }) + .ElseIf(i == cacheRows, + [&]() { + If(j == 0, [&]() { + cachedLowerLeft = vectorCacheView; + }).ElseIf(j == cacheCols, [&]() { + cachedLowerRight = vectorCacheView; + }); + }); + }); + auto cacheSpyPosition = loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::body, { iCache, jCache }, {} }; + nest.GetUnderlyingLoopNest().AddKernel(cacheSpyKernel, cacheSpyPosition); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + auto ok = MakeScalar("ok"); + ok = 1; + auto printError = [&] { + DebugPrint("Upper Left:"); + DebugPrintVector(cachedUpperLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Upper Right:"); + DebugPrintVector(cachedUpperRight); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperRight); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Lower Left:"); + DebugPrintVector(cachedLowerLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedLowerLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Lower Right:"); + DebugPrintVector(cachedLowerRight); + DebugPrint("\n"); + DebugPrintVector(expectedCachedLowerRight); + DebugPrint("\n"); + DebugPrint("\n"); + }; + // TODO : replace nested if's + If(VerifySame(cachedUpperLeft, expectedCachedUpperLeft) == 0, [&]() { + If(VerifySame(cachedUpperRight, expectedCachedUpperRight) == 0, [&]() { + If(VerifySame(cachedLowerLeft, expectedCachedLowerLeft) == 0, [&]() { + If(VerifySame(cachedLowerRight, expectedCachedLowerRight) == 0, [&]() { + ok = 0; + }).Else(printError); + }).Else(printError); + }).Else(printError); + }).Else(printError); + return ok; +} + +Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(int M, int N, int cacheRows, int cacheCols, int stripeSize) +{ + auto input = MakeIncrementingMatrix(M, N, "input"); + auto output = MakeMatrix(M, N, "output"); + auto expectedOutput = MakeIncrementingMatrix(M, N, "expectedOutput"); + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, M) + .ForAll(j, 0, 
N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iCache = schedule.Split(i, cacheRows); + auto jCache = schedule.Split(j, cacheCols); + auto jStripe = schedule.Split(j, stripeSize); + + schedule.SetOrder({ iCache, jCache, jStripe, i, j }); + + BLASTCopy cachingProvider{}; + std::tuple blasTCopyExtras = { stripeSize, jStripe, BoundaryConditionHandling::ZeroPadding }; + schedule.Cache(cachingProvider, + input, + { i, j }, + { cacheRows, cacheCols }, + { iCache, jCache }, + std::nullopt, // Order isn't used by BLASTCopy + blasTCopyExtras); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + return VerifySame(output, expectedOutput); +} + +// input matrix rows evenly divides cache rows +// input matrix cols doesn't evenly divide cache cols +Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Test1() +{ + int M = 8; + int N = 7; // N doesn't evenly divide the number of cache columns + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + return BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize); +} + +// input matrix rows evenly divides cache rows +// input matrix cols doesn't evenly divide cache cols but does evenly divide stripe size +Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Test2() +{ + int M = 8; + int N = 6; // N doesn't evenly divide the number of cache columns, but does evenly divide stripe size + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + return BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize); +} + +// input matrix rows doesn't evenly divide cache rows +// input matrix cols doesn't evenly divide cache cols +Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Test3() +{ + int M = 6; + int N = 7; // N doesn't evenly divide the number of cache columns + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + return BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize); +} + +// input matrix rows doesn't evenly divide cache rows +// input matrix cols doesn't evenly divide cache cols but does evenly divide stripe size +Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Test4() +{ + int M = 6; + int N = 6; // N doesn't evenly divide the number of cache columns, but does evenly divide stripe size + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + return BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize); +} + +// input matrix rows evenly divides cache rows +// input matrix cols < cache cols, doesn't evenly divide stripe size +Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Test5() +{ + int M = 8; + int N = 3; // N < cache columns, doesn't evenly divide stripe size + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + return BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize); +} + +// input matrix rows evenly divides cache rows +// input matrix cols < cache cols, evenly divides stripe size +Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Test6() +{ + int M = 8; + int N = 2; // N < cache columns, does evenly divide stripe size + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + return BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize); +} +
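+// The remaining cases cover inputs with fewer rows or columns than a single cache
+// block. In every boundary case the effect is the same: a Split that does not
+// evenly divide its range produces a final partial block, and the ZeroPadding
+// condition fills the unused tail of the cache. A scalar sketch of the block
+// iteration these schedules reduce to (hypothetical helper, for illustration
+// only, not part of the library API):
+#if 0 // ILLUSTRATION
+void ForEachCacheBlock(int M, int N, int cacheRows, int cacheCols)
+{
+    for (int iOuter = 0; iOuter < M; iOuter += cacheRows) // iCache blocks
+    {
+        for (int jOuter = 0; jOuter < N; jOuter += cacheCols) // jCache blocks
+        {
+            // Boundary blocks are smaller than cacheRows x cacheCols; the cache
+            // itself keeps its full size and its unused tail stays zero.
+            int blockRows = std::min(cacheRows, M - iOuter);
+            int blockCols = std::min(cacheCols, N - jOuter);
+            (void)blockRows;
+            (void)blockCols;
+        }
+    }
+}
+#endif
+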
+// input matrix rows < cache rows +// input matrix cols < cache cols, doesn't evenly divide stripe size +Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Test7() +{ + int M = 3; + int N = 3; // N < cache columns, doesn't evenly divide stripe size + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + return BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize); +} + +// input matrix rows < cache rows +// input matrix cols < cache cols, does evenly divide stripe size +Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Test8() +{ + int M = 2; + int N = 2; // N < cache columns, does evenly divide stripe size + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + return BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize); +} + +// input matrix rows < cache rows +// input matrix cols multiple of cache cols +Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Test9() +{ + int M = 2; + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + return BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize); +} + +Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Runner(int M, int N, int cacheRows, int cacheCols, int stripeSize, Vector expectedCachedUpperLeft, Vector expectedCachedUpperRight, Vector expectedCachedLowerLeft, Vector expectedCachedLowerRight) +{ + auto input = MakeIncrementingMatrix(M, N, "input"); + auto output = MakeMatrix(M, N, "output"); + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, M) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto topLevelI = i; + auto topLevelJ = j; + + auto iCache = schedule.Split(i, cacheRows); + auto jCache = schedule.Split(j, cacheCols); + auto jStripe = schedule.Split(j, stripeSize); + + schedule.SetOrder({ iCache, jCache, jStripe, i, j }); + + BLASTCopy cachingProvider{}; + std::tuple blasTCopyExtras = { stripeSize, jStripe, BoundaryConditionHandling::ZeroPadding }; + schedule.Cache(cachingProvider, + input, + { i, j }, + { cacheRows, cacheCols }, + { iCache, jCache }, + std::nullopt, // Order isn't used by BLASTCopy + blasTCopyExtras); + + // Get a handle to the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + int rawCacheSize = (int)rawCacheValue.GetLayout().NumElements(); + + auto cachedUpperLeft = MakeVector(rawCacheSize); + auto cachedUpperRight = MakeVector(rawCacheSize); + auto cachedLowerLeft = MakeVector(rawCacheSize); + auto cachedLowerRight = MakeVector(rawCacheSize); + + // Add a low level API kernel to access the underlying cache after it has been filled + auto cacheSpyKernel = loopnests::Kernel("cache_spy_kernel") + .Inputs(rawCacheValue, cachedUpperLeft, cachedUpperRight, cachedLowerLeft, cachedLowerRight) + .Indices(topLevelI, topLevelJ) + .Define([cacheRows, cacheCols](Value rawCacheValue, Vector cachedUpperLeft, Vector cachedUpperRight, Vector cachedLowerLeft, Vector cachedLowerRight, Scalar i, Scalar j) { + auto cacheView = rawCacheValue; + cacheView.SetLayout({ { (int)rawCacheValue.GetLayout().NumElements() } }); + auto vectorCacheView = Vector(cacheView); + If(i == 0, + [&]() { + // TODO : remove nested if's + If(j == 0, + [&]() { + cachedUpperLeft = vectorCacheView; + }) + .ElseIf(j == cacheCols, + [&]() { + cachedUpperRight = vectorCacheView; + }); + }) + .ElseIf(i == cacheRows, + [&]() { + If(j == 0, [&]() { + cachedLowerLeft =
vectorCacheView; + }).ElseIf(j == cacheCols, [&]() { + cachedLowerRight = vectorCacheView; + }); + }); + }); + auto cacheSpyPosition = loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::body, { iCache, jCache }, {} }; + nest.GetUnderlyingLoopNest().AddKernel(cacheSpyKernel, cacheSpyPosition); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + auto ok = MakeScalar("ok"); + ok = 1; + auto printError = [&] { + DebugPrint("Upper Left:"); + DebugPrintVector(cachedUpperLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Upper Right:"); + DebugPrintVector(cachedUpperRight); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperRight); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Lower Left:"); + DebugPrintVector(cachedLowerLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedLowerLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Lower Right:"); + DebugPrintVector(cachedLowerRight); + DebugPrint("\n"); + DebugPrintVector(expectedCachedLowerRight); + DebugPrint("\n"); + DebugPrint("\n"); + }; + // TODO : replace nested if's + If(VerifySame(cachedUpperLeft, expectedCachedUpperLeft) == 0, [&]() { + If(VerifySame(cachedUpperRight, expectedCachedUpperRight) == 0, [&]() { + If(VerifySame(cachedLowerLeft, expectedCachedLowerLeft) == 0, [&]() { + If(VerifySame(cachedLowerRight, expectedCachedLowerRight) == 0, [&]() { + ok = 0; + }).Else(printError); + }).Else(printError); + }).Else(printError); + }).Else(printError); + return ok; +} + +Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Runner_LeftCachesOnly(int M, int N, int cacheRows, int cacheCols, int stripeSize, Vector expectedCachedUpperLeft, Vector expectedCachedLowerLeft) +{ + auto input = MakeIncrementingMatrix(M, N, "input"); + auto output = MakeMatrix(M, N, "output"); + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, M) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto topLevelI = i; + auto topLevelJ = j; + + auto iCache = schedule.Split(i, cacheRows); + auto jCache = schedule.Split(j, cacheCols); + auto jStripe = schedule.Split(j, stripeSize); + + schedule.SetOrder({ iCache, jCache, jStripe, i, j }); + + BLASTCopy cachingProvider{}; + std::tuple blasTCopyExtras = { stripeSize, jStripe, BoundaryConditionHandling::ZeroPadding }; + schedule.Cache(cachingProvider, + input, + { i, j }, + { cacheRows, cacheCols }, + { iCache, jCache }, + std::nullopt, // Order isn't used by BLASTCopy + blasTCopyExtras); + + // Get a handle to the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + int rawCacheSize = (int)rawCacheValue.GetLayout().NumElements(); + + // No right caches when N < cacheCols + auto cachedUpperLeft = MakeVector(rawCacheSize); + auto cachedLowerLeft = MakeVector(rawCacheSize); + + // Add a low level API kernel to access the underlying cache after it has been filled + auto cacheSpyKernel = loopnests::Kernel("cache_spy_kernel") + .Inputs(rawCacheValue, cachedUpperLeft, cachedLowerLeft) + .Indices(topLevelI, topLevelJ) + .Define([cacheRows](Value rawCacheValue, Vector cachedUpperLeft, Vector cachedLowerLeft, Scalar i, Scalar j) { + auto cacheView = rawCacheValue; + cacheView.SetLayout({ { (int)rawCacheValue.GetLayout().NumElements() } }); + auto vectorCacheView = 
Vector(cacheView); + If(i == 0, + [&]() { + // TODO : remove nested if's + If(j == 0, + [&]() { + cachedUpperLeft = vectorCacheView; + }); + }) + .ElseIf(i == cacheRows, + [&]() { + If(j == 0, [&]() { + cachedLowerLeft = vectorCacheView; + }); + }); + }); + auto cacheSpyPosition = loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::body, { iCache, jCache }, {} }; + nest.GetUnderlyingLoopNest().AddKernel(cacheSpyKernel, cacheSpyPosition); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + auto ok = MakeScalar("ok"); + ok = 1; + auto printError = [&] { + DebugPrint("Upper Left:"); + DebugPrintVector(cachedUpperLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Lower Left:"); + DebugPrintVector(cachedLowerLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedLowerLeft); + DebugPrint("\n"); + DebugPrint("\n"); + }; + // TODO : replace nested if's + If(VerifySame(cachedUpperLeft, expectedCachedUpperLeft) == 0, [&]() { + If(VerifySame(cachedLowerLeft, expectedCachedLowerLeft) == 0, [&]() { + ok = 0; + }).Else(printError); + }).Else(printError); + return ok; +} + +Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Runner_UpperCachesOnly(int M, int N, int cacheRows, int cacheCols, int stripeSize, Vector expectedCachedUpperLeft, Vector expectedCachedUpperRight) +{ + auto input = MakeIncrementingMatrix(M, N, "input"); + auto output = MakeMatrix(M, N, "output"); + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, M) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto topLevelI = i; + auto topLevelJ = j; + + auto iCache = schedule.Split(i, cacheRows); + auto jCache = schedule.Split(j, cacheCols); + auto jStripe = schedule.Split(j, stripeSize); + + schedule.SetOrder({ iCache, jCache, jStripe, i, j }); + + BLASTCopy cachingProvider{}; + std::tuple blasTCopyExtras = { stripeSize, jStripe, BoundaryConditionHandling::ZeroPadding }; + schedule.Cache(cachingProvider, + input, + { i, j }, + { cacheRows, cacheCols }, + { iCache, jCache }, + std::nullopt, // Order isn't used by BLASTCopy + blasTCopyExtras); + + // Get a handle to the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + int rawCacheSize = (int)rawCacheValue.GetLayout().NumElements(); + + auto cachedUpperLeft = MakeVector(rawCacheSize); + auto cachedUpperRight = MakeVector(rawCacheSize); + + // Add a low level API kernel to access the underlying cache after it has been filled + auto cacheSpyKernel = loopnests::Kernel("cache_spy_kernel") + .Inputs(rawCacheValue, cachedUpperLeft, cachedUpperRight) + .Indices(topLevelI, topLevelJ) + .Define([cacheCols](Value rawCacheValue, Vector cachedUpperLeft, Vector cachedUpperRight, Scalar i, Scalar j) { + auto cacheView = rawCacheValue; + cacheView.SetLayout({ { (int)rawCacheValue.GetLayout().NumElements() } }); + auto vectorCacheView = Vector(cacheView); + If(i == 0, + [&]() { + // TODO : remove nested if's + If(j == 0, + [&]() { + cachedUpperLeft = vectorCacheView; + }) + .ElseIf(j == cacheCols, + [&]() { + cachedUpperRight = vectorCacheView; + }); + }); + }); + auto cacheSpyPosition = loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::body, { iCache, jCache }, {} }; + nest.GetUnderlyingLoopNest().AddKernel(cacheSpyKernel, cacheSpyPosition); + 
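+    // As in the other runners, the spy kernel is constrained to the body of the
+    // (iCache, jCache) loops, so it runs once per cache block, after that block's
+    // cache has been filled, and snapshots the packed contents for the checks
+    // below. Only the i == 0 row of blocks is captured here because this runner
+    // is used for inputs where M <= cacheRows, so lower blocks never occur.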
+#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + auto ok = MakeScalar("ok"); + ok = 1; + auto printError = [&] { + DebugPrint("Upper Left:"); + DebugPrintVector(cachedUpperLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Upper Right:"); + DebugPrintVector(cachedUpperRight); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperRight); + DebugPrint("\n"); + DebugPrint("\n"); + }; + // TODO : replace nested if's + If(VerifySame(cachedUpperLeft, expectedCachedUpperLeft) == 0, [&]() { + If(VerifySame(cachedUpperRight, expectedCachedUpperRight) == 0, [&]() { + ok = 0; + }).Else(printError); + }).Else(printError); + return ok; +} + +Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Runner_UpperLeftCacheOnly(int M, int N, int cacheRows, int cacheCols, int stripeSize, Vector expectedCachedUpperLeft) +{ + auto input = MakeIncrementingMatrix(M, N, "input"); + auto output = MakeMatrix(M, N, "output"); + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, M) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto topLevelI = i; + auto topLevelJ = j; + + auto iCache = schedule.Split(i, cacheRows); + auto jCache = schedule.Split(j, cacheCols); + auto jStripe = schedule.Split(j, stripeSize); + + schedule.SetOrder({ iCache, jCache, jStripe, i, j }); + + BLASTCopy cachingProvider{}; + std::tuple blasTCopyExtras = { stripeSize, jStripe, BoundaryConditionHandling::ZeroPadding }; + schedule.Cache(cachingProvider, + input, + { i, j }, + { cacheRows, cacheCols }, + { iCache, jCache }, + std::nullopt, // Order isn't used by BLASTCopy + blasTCopyExtras); + + // Get a handle to the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + int rawCacheSize = (int)rawCacheValue.GetLayout().NumElements(); + + // No right caches when N < cacheCols + auto cachedUpperLeft = MakeVector(rawCacheSize); + + // Add a low level API kernel to access the underlying cache after it has been filled + auto cacheSpyKernel = loopnests::Kernel("cache_spy_kernel") + .Inputs(rawCacheValue, cachedUpperLeft) + .Indices(topLevelI, topLevelJ) + .Define([](Value rawCacheValue, Vector cachedUpperLeft, Scalar i, Scalar j) { + auto cacheView = rawCacheValue; + cacheView.SetLayout({ { (int)rawCacheValue.GetLayout().NumElements() } }); + auto vectorCacheView = Vector(cacheView); + If(i == 0, + [&]() { + // TODO : remove nested if's + If(j == 0, + [&]() { + cachedUpperLeft = vectorCacheView; + }); + }); + }); + auto cacheSpyPosition = loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::body, { iCache, jCache }, {} }; + nest.GetUnderlyingLoopNest().AddKernel(cacheSpyKernel, cacheSpyPosition); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + return VerifySame(cachedUpperLeft, expectedCachedUpperLeft); +} + +Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Test1() +{ + int M = 8; // M does evenly divide cache rows + int N = 7; // N doesn't evenly divide cache columns + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + // input + // A: + // [ 0, 1, 2, 3, 4, 5, 6], + // [ 7, 8, 9, 10, 11, 12, 13], + // [14, 15, 16, 17, 18, 19, 20], + // [21, 22, 23, 24, 25, 26, 27], + // [28, 29, 30, 31, 32, 33, 34], + // [35, 36, 37, 38, 39, 40, 41], + // [42, 43, 44, 45, 
46, 47, 48], + // [49, 50, 51, 52, 53, 54, 55] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 1, + 7, 8, + 14, 15, + 21, 22, + + 2, 3, + 9, 10, + 16, 17, + 23, 24, + }; + Vector expectedCachedUpperRight = + { + 4, 5, + 11, 12, + 18, 19, + 25, 26, + + 6, 0, + 13, 0, + 20, 0, + 27, 0 + }; + Vector expectedCachedLowerLeft = + { + 28, 29, + 35, 36, + 42, 43, + 49, 50, + + 30, 31, + 37, 38, + 44, 45, + 51, 52, + }; + Vector expectedCachedLowerRight = + { + 32, 33, + 39, 40, + 46, 47, + 53, 54, + + 34, 0, + 41, 0, + 48, 0, + 55, 0 + }; + // clang-format on + + return BLASTCOPY_ValidateMemory_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight, expectedCachedLowerLeft, expectedCachedLowerRight); +} + +Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Test2() +{ + int M = 8; // M does evenly divide cache rows + int N = 6; // N doesn't evenly divide cache columns, but does evenly divide stripe size + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + auto input = MakeIncrementingMatrix(M, N, "input"); + auto output = MakeMatrix(M, N, "output"); + + // input + // A: + // [ 0, 1, 2, 3, 4, 5], + // [ 6, 7, 8, 9, 10, 11], + // [12, 13, 14, 15, 16, 17], + // [18, 19, 20, 21, 22, 23], + // [24, 25, 26, 27, 28, 29], + // [30, 31, 32, 33, 34, 35], + // [36, 37, 38, 39, 40, 41], + // [42, 43, 44, 45, 46, 47] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 1, + 6, 7, + 12, 13, + 18, 19, + + 2, 3, + 8, 9, + 14, 15, + 20, 21, + }; + Vector expectedCachedUpperRight = + { + 4, 5, + 10, 11, + 16, 17, + 22, 23, + + 0, 0, + 0, 0, + 0, 0, + 0, 0 + }; + Vector expectedCachedLowerLeft = + { + 24, 25, + 30, 31, + 36, 37, + 42, 43, + + 26, 27, + 32, 33, + 38, 39, + 44, 45, + }; + Vector expectedCachedLowerRight = + { + 28, 29, + 34, 35, + 40, 41, + 46, 47, + + 0, 0, + 0, 0, + 0, 0, + 0, 0 + }; + // clang-format on + + return BLASTCOPY_ValidateMemory_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight, expectedCachedLowerLeft, expectedCachedLowerRight); +} + +// input matrix rows doesn't evenly divide cache rows +// input matrix cols doesn't evenly divide cache cols +Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Test3() +{ + int M = 6; + int N = 7; // N doesn't evenly divide the number of cache columns + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + // input + // A: + // [ 0, 1, 2, 3, 4, 5, 6], + // [ 7, 8, 9, 10, 11, 12, 13], + // [14, 15, 16, 17, 18, 19, 20], + // [21, 22, 23, 24, 25, 26, 27], + // [28, 29, 30, 31, 32, 33, 34], + // [35, 36, 37, 38, 39, 40, 41], + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 1, + 7, 8, + 14, 15, + 21, 22, + + 2, 3, + 9, 10, + 16, 17, + 23, 24, + }; + Vector expectedCachedUpperRight = + { + 4, 5, + 11, 12, + 18, 19, + 25, 26, + + 6, 0, + 13, 0, + 20, 0, + 27, 0 + }; + + // Check that it gets re-viewed correctly to keep the cached data contiguous + Vector expectedCachedLowerLeft = + { + 28, 29, + 35, 36, + 30, 31, + 37, 38, + + 0, 0, + 0, 0, + 0, 0, + 0, 0, + }; + Vector expectedCachedLowerRight = + { + 32, 33, + 39, 40, + 34, 0, + 41, 0, + + 0, 0, + 0, 0, + 0, 0, + 0, 0 + }; + // clang-format on + + return BLASTCOPY_ValidateMemory_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight, expectedCachedLowerLeft, expectedCachedLowerRight); +} +
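+// The expected vectors in these memory tests all follow the same rule: BLASTCopy
+// packs each stripeSize-wide column panel of a block contiguously, row-major
+// within the panel. When a block hangs off the bottom of the matrix, the panel
+// height shrinks to the number of valid rows (the re-viewing checked above) so
+// the cached data stays contiguous, and out-of-range columns are zero-padded.
+// A scalar reference that reproduces the vectors above (hypothetical helper,
+// for illustration only, not part of the library API):
+#if 0 // ILLUSTRATION
+std::vector<int> ReferenceBLASTCopyBlock(const std::vector<int>& A, int M, int N, int blockRow, int blockCol, int cacheRows, int cacheCols, int stripeSize)
+{
+    std::vector<int> cache(cacheRows * cacheCols, 0); // ZeroPadding fills the tail
+    int validRows = std::min(cacheRows, M - blockRow);
+    for (int s = 0; s < cacheCols / stripeSize; ++s) // one column panel per stripe
+    {
+        for (int r = 0; r < validRows; ++r)
+        {
+            for (int c = 0; c < stripeSize; ++c)
+            {
+                int col = blockCol + s * stripeSize + c;
+                cache[(s * validRows + r) * stripeSize + c] = col < N ? A[(blockRow + r) * N + col] : 0;
+            }
+        }
+    }
+    return cache;
+}
+#endif
+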
+// input matrix rows doesn't evenly divide cache rows +// input matrix cols doesn't evenly divide cache cols but does evenly divide stripe size +Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Test4() +{ + int M = 6; + int N = 6; // N doesn't evenly divide the number of cache columns, but does evenly divide stripe size + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + // input + // A: + // [ 0, 1, 2, 3, 4, 5], + // [ 6, 7, 8, 9, 10, 11], + // [12, 13, 14, 15, 16, 17], + // [18, 19, 20, 21, 22, 23], + // [24, 25, 26, 27, 28, 29], + // [30, 31, 32, 33, 34, 35] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 1, + 6, 7, + 12, 13, + 18, 19, + + 2, 3, + 8, 9, + 14, 15, + 20, 21, + }; + Vector expectedCachedUpperRight = + { + 4, 5, + 10, 11, + 16, 17, + 22, 23, + + 0, 0, + 0, 0, + 0, 0, + 0, 0 + }; + Vector expectedCachedLowerLeft = + { + 24, 25, + 30, 31, + 26, 27, + 32, 33, + + 0, 0, + 0, 0, + 0, 0, + 0, 0, + }; + Vector expectedCachedLowerRight = + { + 28, 29, + 34, 35, + 0, 0, + 0, 0, + + 0, 0, + 0, 0, + 0, 0, + 0, 0 + }; + // clang-format on + return BLASTCOPY_ValidateMemory_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight, expectedCachedLowerLeft, expectedCachedLowerRight); +} + +// input matrix rows evenly divides cache rows +// input matrix cols < cache cols, doesn't evenly divide stripe size +Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Test5() +{ + int M = 8; + int N = 3; // N < cache columns, doesn't evenly divide stripe size + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + // input + // A: + // [ 0, 1, 2], + // [ 3, 4, 5], + // [ 6, 7, 8], + // [ 9, 10, 11], + // [12, 13, 14], + // [15, 16, 17], + // [18, 19, 20], + // [21, 22, 23] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 1, + 3, 4, + 6, 7, + 9, 10, + + 2, 0, + 5, 0, + 8, 0, + 11, 0, + }; + Vector expectedCachedLowerLeft = + { + 12, 13, + 15, 16, + 18, 19, + 21, 22, + + 14, 0, + 17, 0, + 20, 0, + 23, 0, + }; + // clang-format on + + return BLASTCOPY_ValidateMemory_BoundaryCondition_Runner_LeftCachesOnly(M, N, cacheRows, cacheCols, stripeSize, expectedCachedUpperLeft, expectedCachedLowerLeft); +} + +// input matrix rows evenly divides cache rows +// input matrix cols < cache cols, evenly divides stripe size +Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Test6() +{ + int M = 8; + int N = 2; // N < cache columns, does evenly divide stripe size + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + // input + // A: + // [ 0, 1], + // [ 2, 3], + // [ 4, 5], + // [ 6, 7], + // [ 8, 9], + // [10, 11], + // [12, 13], + // [14, 15] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 1, + 2, 3, + 4, 5, + 6, 7, + + 0, 0, + 0, 0, + 0, 0, + 0, 0, + }; + Vector expectedCachedLowerLeft = + { + 8, 9, + 10, 11, + 12, 13, + 14, 15, + + 0, 0, + 0, 0, + 0, 0, + 0, 0, + }; + // clang-format on + return BLASTCOPY_ValidateMemory_BoundaryCondition_Runner_LeftCachesOnly(M, N, cacheRows, cacheCols, stripeSize, expectedCachedUpperLeft, expectedCachedLowerLeft); +} + +// input matrix rows < cache rows +// input matrix cols < cache cols, doesn't evenly divide stripe size +Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Test7() +{ + int M = 3; + int N = 3; // N < cache columns, doesn't evenly divide stripe size + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + // input + // A: + // [0, 1, 2], + // [3, 4, 5], + // [6, 7, 8] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 1, + 3, 4, + 6, 7, + 2, 0, + + 5, 0, +
8, 0, + 0, 0, + 0, 0 + }; + // clang-format on + return BLASTCOPY_ValidateMemory_BoundaryCondition_Runner_UpperLeftCacheOnly(M, N, cacheRows, cacheCols, stripeSize, expectedCachedUpperLeft); +} + +// input matrix rows < cache rows +// input matrix cols < cache cols, does evenly divide stripe size +Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Test8() +{ + int M = 2; + int N = 2; // N < cache columns, does evenly divide stripe size + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + // input + // A: + // [ 0, 1], + // [ 2, 3] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 1, + 2, 3, + 0, 0, + 0, 0, + + 0, 0, + 0, 0, + 0, 0, + 0, 0, + }; + // clang-format on + return BLASTCOPY_ValidateMemory_BoundaryCondition_Runner_UpperLeftCacheOnly(M, N, cacheRows, cacheCols, stripeSize, expectedCachedUpperLeft); +} + +// input matrix rows < cache rows +// input matrix cols multiple of cache cols +Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Test9() +{ + int M = 2; + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + // input + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7], + // [ 8, 9, 10, 11, 12, 13, 14, 15] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 1, + 8, 9, + 2, 3, + 10, 11, + + 0, 0, + 0, 0, + 0, 0, + 0, 0, + }; + Vector expectedCachedUpperRight = + { + 4, 5, + 12, 13, + 6, 7, + 14, 15, + + 0, 0, + 0, 0, + 0, 0, + 0, 0 + }; + // clang-format on + return BLASTCOPY_ValidateMemory_BoundaryCondition_Runner_UpperCachesOnly(M, N, cacheRows, cacheCols, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight); +} + +// Convolution caching tests + +// General Caching Strategy + +Scalar GeneralCachingStrategy_ValidateOutput_Test1() +{ + // Square matrix tiling + loopnests::Index i("i"), j("j"); + + const int Rows = 8; + const int Columns = 8; + const int SplitSize = 4; + + auto input = MakeIncrementingMatrix(Rows, Columns, "input"); + auto output = MakeMatrix(Rows, Columns, "output"); + + // Define LoopNest + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, Rows) + .ForAll(j, 0, Columns) + .Do([=](Matrix input_, Matrix output_, Scalar i_, Scalar j_) { + output_(i_, j_) = input_(i_, j_); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iBlock = schedule.Split(i, SplitSize); + auto jBlock = schedule.Split(j, SplitSize); + + std::vector orderedIndices = { iBlock, jBlock, i, j }; + schedule.SetOrder(orderedIndices); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = SplitSize * SplitSize; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + + schedule.Cache(input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + +#if 0 // DEBUGGING + auto loop = nest.GetUnderlyingLoopNest(); + DebugDump(loop); +#endif + + nest.Run(); + + return VerifySame(output, input); +} + +Scalar GeneralCachingStrategy_ValidateMemory_Test1() +{ + // Square matrix tiling + loopnests::Index i("i"), j("j"); + + const int Rows = 8; + const int Columns = 8; + const int SplitSize = 4; + + // input + // [ 0, 1, 2, 3, 4, 5, 6, 7] + // [ 8, 9, 10, 11, 12, 13, 14, 15] + // [16, 17, 18, 19, 20, 21, 22, 23] + // [24, 25, 26, 27, 28, 29, 30, 31] + // [32, 33, 34, 35, 36, 37, 38, 39] + // [40, 41, 42, 43, 44,
45, 46, 47] + // [48, 49, 50, 51, 52, 53, 54, 55] + // [56, 57, 58, 59, 60, 61, 62, 63] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 1, 2, 3, + 8, 9, 10, 11, + 16, 17, 18, 19, + 24, 25, 26, 27 + }; + Vector expectedCachedLowerLeft = + { + 32, 33, 34, 35, + 40, 41, 42, 43, + 48, 49, 50, 51, + 56, 57, 58, 59 + }; + Vector expectedCachedUpperRight = + { + 4, 5, 6, 7, + 12, 13, 14, 15, + 20, 21, 22, 23, + 28, 29, 30, 31 + }; + Vector expectedCachedLowerRight = + { + 36, 37, 38, 39, + 44, 45, 46, 47, + 52, 53, 54, 55, + 60, 61, 62, 63 + }; + // clang-format on + auto input = MakeIncrementingMatrix(Rows, Columns, "input"); + auto output = MakeMatrix(Rows, Columns, "output"); + + // Define LoopNest + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, Rows) + .ForAll(j, 0, Columns) + .Do([=](Matrix input_, Matrix output_, Scalar i_, Scalar j_) { + output_(i_, j_) = input_(i_, j_); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iBlock = schedule.Split(i, SplitSize); + auto jBlock = schedule.Split(j, SplitSize); + + schedule.SetOrder({ iBlock, jBlock, i, j }); + + GeneralCachingStrategy cachingProvider{}; + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = SplitSize * SplitSize; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + schedule.Cache(cachingProvider, + input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + + // Get a handle to the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + int rawCacheSize = (int)rawCacheValue.GetLayout().NumElements(); + + auto cachedUpperLeft = MakeVector(rawCacheSize); + auto cachedUpperRight = MakeVector(rawCacheSize); + auto cachedLowerLeft = MakeVector(rawCacheSize); + auto cachedLowerRight = MakeVector(rawCacheSize); + + // Add a low level API kernel to access the underlying cache after it has been filled + auto cacheSpyKernel = loopnests::Kernel("cache_spy_kernel") + .Inputs(rawCacheValue, cachedUpperLeft, cachedUpperRight, cachedLowerLeft, cachedLowerRight) + .Indices(iTopLevel, jTopLevel) + .Define([=](Value rawCacheValue, Vector cachedUpperLeft, Vector cachedUpperRight, Vector cachedLowerLeft, Vector cachedLowerRight, Scalar i, Scalar j) { + auto cacheView = rawCacheValue; + cacheView.SetLayout({ { (int)rawCacheValue.GetLayout().NumElements() } }); + auto vectorCacheView = Vector(cacheView); + If(i == 0, + [&]() { + // TODO : remove nested if's + If(j == 0, + [&]() { + cachedUpperLeft = vectorCacheView; + }) + .ElseIf(j == SplitSize, + [&]() { + cachedUpperRight = vectorCacheView; + }); + }) + .ElseIf(i == SplitSize, + [&]() { + If(j == 0, [&]() { + cachedLowerLeft = vectorCacheView; + }).ElseIf(j == SplitSize, [&]() { + cachedLowerRight = vectorCacheView; + }); + }); + }); + auto cacheSpyPosition = loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::epilogue, { iBlock, jBlock }, {} }; + nest.GetUnderlyingLoopNest().AddKernel(cacheSpyKernel, cacheSpyPosition); + + nest.Run(); + + auto ok = MakeScalar("ok"); + ok = 1; + auto printError = [&] { + DebugPrint("Upper Left:"); + DebugPrintVector(cachedUpperLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Upper Right:"); 
+ DebugPrintVector(cachedUpperRight); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperRight); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Lower Left:"); + DebugPrintVector(cachedLowerLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedLowerLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Lower Right:"); + DebugPrintVector(cachedLowerRight); + DebugPrint("\n"); + DebugPrintVector(expectedCachedLowerRight); + DebugPrint("\n"); + DebugPrint("\n"); + }; + // TODO : replace nested if's + If(VerifySame(cachedUpperLeft, expectedCachedUpperLeft) == 0, [&]() { + If(VerifySame(cachedUpperRight, expectedCachedUpperRight) == 0, [&]() { + If(VerifySame(cachedLowerLeft, expectedCachedLowerLeft) == 0, [&]() { + If(VerifySame(cachedLowerRight, expectedCachedLowerRight) == 0, [&]() { + ok = 0; + }).Else(printError); + }).Else(printError); + }).Else(printError); + }).Else(printError); + return ok; +} + +Scalar GeneralCachingStrategy_ValidateOutput_Test2() +{ + // BLASTCopy caching + loopnests::Index i("i"), j("j"); + + const int Rows = 16; + const int Columns = 16; + const int InputCacheRows = 8; + const int InputCacheCols = 8; + const int StripeSize = 4; + const int VecSize = 2; + + auto input = MakeIncrementingMatrix(Rows, Columns, "input"); + auto output = MakeMatrix(Rows, Columns, "output"); + + // Define LoopNest + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, Rows) + .ForAll(j, 0, Columns) + .Do([=](Matrix input_, Matrix output_, Scalar i_, Scalar j_) { + output_(i_, j_) = input_(i_, j_); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iBlock = schedule.Split(i, InputCacheRows); + auto jBlock = schedule.Split(j, InputCacheCols); + auto jStripe = schedule.Split(j, StripeSize); + auto jVec = schedule.Split(j, VecSize); + + std::vector orderedIndices = { jBlock, + iBlock, + jStripe, + i, + jVec, + j }; + schedule.SetOrder(orderedIndices); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = InputCacheRows * InputCacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + schedule.Cache(input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + +#if 0 // DEBUGGING + auto loop = nest.GetUnderlyingLoopNest(); + DebugDump(loop); +#endif + + nest.Run(); + + return VerifySame(output, input); +} + +Scalar GeneralCachingStrategy_ValidateOutput_Test3() +{ + // Progressive BLASTCopy caching + loopnests::Index i("i"), j("j"); + + const int Rows = 16; + const int Columns = 16; + const int InputCacheRows = 8; + const int InputCacheCols = 8; + const int StripeSize = 4; + const int VecSize = 2; + + auto input = MakeIncrementingMatrix(Rows, Columns, "input"); + auto output = MakeMatrix(Rows, Columns, "output"); + + // Define LoopNest + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, Rows) + .ForAll(j, 0, Columns) + .Do([=](Matrix input_, Matrix output_, Scalar i_, Scalar j_) { + output_(i_, j_) = input_(i_, j_); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iBlock = schedule.Split(i, InputCacheRows); + auto jBlock = schedule.Split(j, InputCacheCols); + auto jStripe = schedule.Split(j, StripeSize); + auto 
jVec = schedule.Split(j, VecSize); + + std::vector orderedIndices = { jBlock, + iBlock, + jStripe, + i, + jVec, + j }; + schedule.SetOrder(orderedIndices); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = InputCacheRows * InputCacheCols; + size_t fillThreshold = InputCacheRows * StripeSize; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + schedule.Cache(input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + +#if 0 // DEBUGGING + auto loop = nest.GetUnderlyingLoopNest(); + DebugDump(loop); +#endif + + nest.Run(); + + return VerifySame(output, input); +} + +Scalar GeneralCachingStrategy_ValidateOutput_Test4() +{ + // BLASTCopy caching with boundary condition on rows + loopnests::Index i("i"), j("j"); + + const int Rows = 15; + const int Columns = 16; + const int InputCacheRows = 8; + const int InputCacheCols = 8; + const int StripeSize = 4; + const int VecSize = 2; + + auto input = MakeIncrementingMatrix(Rows, Columns, "input"); + auto output = MakeMatrix(Rows, Columns, "output"); + + // Define LoopNest + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, Rows) + .ForAll(j, 0, Columns) + .Do([=](Matrix input_, Matrix output_, Scalar i_, Scalar j_) { + output_(i_, j_) = input_(i_, j_); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iBlock = schedule.Split(i, InputCacheRows); + auto jBlock = schedule.Split(j, InputCacheCols); + auto jStripe = schedule.Split(j, StripeSize); + auto jVec = schedule.Split(j, VecSize); + + std::vector orderedIndices = { jBlock, + iBlock, + jStripe, + i, + jVec, + j }; + schedule.SetOrder(orderedIndices); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = InputCacheRows * InputCacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + schedule.Cache(input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + +#if 0 // DEBUGGING + auto loop = nest.GetUnderlyingLoopNest(); + DebugDump(loop); +#endif + + nest.Run(); + + return VerifySame(output, input); +} + +Scalar GeneralCachingStrategy_ValidateOutput_Test5() +{ + // Square output cache + loopnests::Index i("i"), j("j"); + + const int Rows = 8; + const int Columns = 8; + const int OutputCacheRows = 2; + const int OutputCacheCols = 2; + + auto input = MakeIncrementingMatrix(Rows, Columns, "input"); + auto output = MakeMatrix(Rows, Columns, "output"); + + // Define LoopNest + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, Rows) + .ForAll(j, 0, Columns) + .Do([=](Matrix input_, Matrix output_, Scalar i_, Scalar j_) { + output_(i_, j_) = input_(i_, j_); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iOutput = schedule.Split(i, OutputCacheRows); + auto jOutput = schedule.Split(j, OutputCacheCols); + + std::vector orderedIndices = { iOutput, jOutput, i, j }; + schedule.SetOrder(orderedIndices); + + ArgumentType argType = ArgumentType::Output; + std::string cacheName = "cacheOutput"; + + size_t maxCacheElts = OutputCacheRows * 
OutputCacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + schedule.Cache(output, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + +#if 0 // DEBUGGING + auto loop = nest.GetUnderlyingLoopNest(); + DebugDump(loop); +#endif + + nest.Run(); + + return VerifySame(output, input); +} + +Scalar GeneralCachingStrategy_ValidateOutput_Test6() +{ + // Rectangular output cache + loopnests::Index i("i"), j("j"); + + const int Rows = 8; + const int Columns = 8; + const int OutputCacheRows = 4; + const int OutputCacheCols = 2; + + auto input = MakeIncrementingMatrix(Rows, Columns, "input"); + auto output = MakeMatrix(Rows, Columns, "output"); + + // Define LoopNest + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, Rows) + .ForAll(j, 0, Columns) + .Do([=](Matrix input_, Matrix output_, Scalar i_, Scalar j_) { + output_(i_, j_) = input_(i_, j_); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iOutput = schedule.Split(i, OutputCacheRows); + auto jOutput = schedule.Split(j, OutputCacheCols); + + std::vector orderedIndices = { iOutput, jOutput, i, j }; + schedule.SetOrder(orderedIndices); + + ArgumentType argType = ArgumentType::Output; + std::string cacheName = "cacheOutput"; + + size_t maxCacheElts = OutputCacheRows * OutputCacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + schedule.Cache(output, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + +#if 0 // DEBUGGING + auto loop = nest.GetUnderlyingLoopNest(); + DebugDump(loop); +#endif + + nest.Run(); + + return VerifySame(output, input); +} + +Scalar GeneralCachingStrategy_ValidateOutput_Test7() +{ + // Square matrix tiling with square output cache + loopnests::Index i("i"), j("j"); + + const int Rows = 8; + const int Columns = 8; + const int InputCacheRows = 4; + const int InputCacheCols = 4; + const int OutputCacheRows = 2; + const int OutputCacheCols = 2; + + auto input = MakeIncrementingMatrix(Rows, Columns, "input"); + auto output = MakeMatrix(Rows, Columns, "output"); + + // Define LoopNest + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, Rows) + .ForAll(j, 0, Columns) + .Do([=](Matrix input_, Matrix output_, Scalar i_, Scalar j_) { + output_(i_, j_) = input_(i_, j_); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iBlock = schedule.Split(i, InputCacheRows); + auto jBlock = schedule.Split(j, InputCacheCols); + auto iOutput = schedule.Split(i, OutputCacheRows); + auto jOutput = schedule.Split(j, OutputCacheCols); + + std::vector orderedIndices = { iBlock, jBlock, iOutput, jOutput, i, j }; + schedule.SetOrder(orderedIndices); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + + size_t maxCacheElts = InputCacheRows * InputCacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + schedule.Cache(input, + { iTopLevel, jTopLevel }, + {}, + 
{}, + std::nullopt, + extraCacheParams); + + ArgumentType output_argType = ArgumentType::Output; + std::string output_cacheName = "cacheOutput"; + + size_t output_maxCacheElts = OutputCacheRows * OutputCacheCols; + size_t output_fillThreshold = output_maxCacheElts; + std::function output_reduceFunction = CopyReduce; + auto output_extraCacheParams = std::make_tuple(output_argType, + output_cacheName, + output_maxCacheElts, + output_fillThreshold, + output_reduceFunction, + false); + schedule.Cache(output, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + output_extraCacheParams); + +#if 0 // DEBUGGING + auto loop = nest.GetUnderlyingLoopNest(); + DebugDump(loop); +#endif + + nest.Run(); + + return VerifySame(output, input); +} + +Scalar GeneralCachingStrategy_ValidateOutput_Test8() +{ + // Rectangular matrix input tiling with different rectangular output cache + loopnests::Index i("i"), j("j"); + + const int Rows = 8; + const int Columns = 8; + const int InputCacheRows = 4; + const int InputCacheCols = 2; + const int OutputCacheRows = 2; + const int OutputCacheCols = 4; + + auto input = MakeIncrementingMatrix(Rows, Columns, "input"); + auto output = MakeMatrix(Rows, Columns, "output"); + + // Define LoopNest + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, Rows) + .ForAll(j, 0, Columns) + .Do([=](Matrix input_, Matrix output_, Scalar i_, Scalar j_) { + output_(i_, j_) = input_(i_, j_); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iBlock = schedule.Split(i, InputCacheRows); + auto iOutput = schedule.Split(i, OutputCacheRows); + auto jOutput = schedule.Split(j, OutputCacheCols); + auto jBlock = schedule.Split(j, InputCacheCols); + + std::vector orderedIndices = { iBlock, iOutput, jOutput, jBlock, i, j }; + schedule.SetOrder(orderedIndices); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + + size_t maxCacheElts = InputCacheRows * InputCacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + schedule.Cache(input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + + ArgumentType output_argType = ArgumentType::Output; + std::string output_cacheName = "cacheOutput"; + + size_t output_maxCacheElts = OutputCacheRows * OutputCacheCols; + size_t output_fillThreshold = output_maxCacheElts; + std::function output_reduceFunction = CopyReduce; + auto output_extraCacheParams = std::make_tuple(output_argType, + output_cacheName, + output_maxCacheElts, + output_fillThreshold, + output_reduceFunction, + false); + schedule.Cache(output, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + output_extraCacheParams); + +#if 0 // DEBUGGING + auto loop = nest.GetUnderlyingLoopNest(); + DebugDump(loop); +#endif + + nest.Run(); + + return VerifySame(output, input); +} + +Scalar GeneralCachingStrategy_ValidateOutput_Test9() +{ + // BLASTCopy input caching with square output cache + loopnests::Index i("i"), j("j"); + + const int Rows = 16; + const int Columns = 16; + const int InputCacheRows = 8; + const int InputCacheCols = 8; + const int StripeSize = 4; + const int VecSize = 2; + const int OutputCacheRows = 2; + const int OutputCacheCols = 2; + + auto input = MakeIncrementingMatrix(Rows, Columns, "input"); + auto output = MakeMatrix(Rows, Columns, "output"); 
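+    // Note on the six-element extraCacheParams tuple used throughout these
+    // tests. Reading the fields off of how the tests vary them (an inference,
+    // not a documented contract), the tuple is
+    //     (argType, cacheName, maxCacheElts, fillThreshold, reduceFunction, flag)
+    // where maxCacheElts bounds the cache footprint, fillThreshold controls
+    // how many elements each fill brings in (equal to maxCacheElts for
+    // all-at-once filling, smaller in the tests that fill progressively),
+    // and reduceFunction (always CopyReduce here) merges cached values back
+    // into the source view. The trailing boolean is false in every test here.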
+ + // Define LoopNest + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, Rows) + .ForAll(j, 0, Columns) + .Do([=](Matrix input_, Matrix output_, Scalar i_, Scalar j_) { + output_(i_, j_) = input_(i_, j_); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iBlock = schedule.Split(i, InputCacheRows); + auto iOutput = schedule.Split(i, OutputCacheRows); + auto jBlock = schedule.Split(j, InputCacheCols); + auto jStripe = schedule.Split(j, StripeSize); + auto jOutput = schedule.Split(j, OutputCacheCols); + auto jVec = schedule.Split(j, VecSize); + + std::vector orderedIndices = { jBlock, + iBlock, + jStripe, + iOutput, + jOutput, + i, + jVec, + j }; + schedule.SetOrder(orderedIndices); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = InputCacheRows * InputCacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + schedule.Cache(input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + + ArgumentType output_argType = ArgumentType::Output; + std::string output_cacheName = "cacheOutput"; + + size_t output_maxCacheElts = OutputCacheRows * OutputCacheCols; + size_t output_fillThreshold = output_maxCacheElts; + std::function output_reduceFunction = CopyReduce; + auto output_extraCacheParams = std::make_tuple(output_argType, + output_cacheName, + output_maxCacheElts, + output_fillThreshold, + output_reduceFunction, + false); + schedule.Cache(output, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + output_extraCacheParams); + +#if 0 // DEBUGGING + auto loop = nest.GetUnderlyingLoopNest(); + DebugDump(loop); +#endif + + nest.Run(); + + return VerifySame(output, input); +} + +Scalar GeneralCachingStrategy_ValidateOutput_Test10() +{ + // BLASTCopy input caching with rectangular output cache + loopnests::Index i("i"), j("j"); + + const int Rows = 16; + const int Columns = 16; + const int InputCacheRows = 8; + const int InputCacheCols = 8; + const int StripeSize = 4; + const int VecSize = 2; + const int OutputCacheRows = 2; + const int OutputCacheCols = 4; + + auto input = MakeIncrementingMatrix(Rows, Columns, "input"); + auto output = MakeMatrix(Rows, Columns, "output"); + + // Define LoopNest + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, Rows) + .ForAll(j, 0, Columns) + .Do([=](Matrix input_, Matrix output_, Scalar i_, Scalar j_) { + output_(i_, j_) = input_(i_, j_); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iBlock = schedule.Split(i, InputCacheRows); + auto iOutput = schedule.Split(i, OutputCacheRows); + auto jBlock = schedule.Split(j, InputCacheCols); + auto jStripe = schedule.Split(j, StripeSize); + auto jOutput = schedule.Split(j, OutputCacheCols); + auto jVec = schedule.Split(j, VecSize); + + std::vector orderedIndices = { jBlock, + iBlock, + jStripe, + iOutput, + jOutput, + i, + jVec, + j }; + schedule.SetOrder(orderedIndices); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = InputCacheRows * InputCacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + 
cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + schedule.Cache(input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + + ArgumentType output_argType = ArgumentType::Output; + std::string output_cacheName = "cacheOutput"; + + size_t output_maxCacheElts = OutputCacheRows * OutputCacheCols; + size_t output_fillThreshold = output_maxCacheElts; + std::function output_reduceFunction = CopyReduce; + auto output_extraCacheParams = std::make_tuple(output_argType, + output_cacheName, + output_maxCacheElts, + output_fillThreshold, + output_reduceFunction, + false); + schedule.Cache(output, + { iTopLevel, jTopLevel }, + {}, + { iOutput }, + std::nullopt, + output_extraCacheParams); + +#if 0 // DEBUGGING + auto loop = nest.GetUnderlyingLoopNest(); + DebugDump(loop); +#endif + + nest.Run(); + + return VerifySame(output, input); +} + +Scalar GeneralCachingStrategy_ValidateOutput_Test11() +{ + // BLASTCopy output caching + loopnests::Index i("i"), j("j"); + + const int Rows = 16; + const int Columns = 16; + const int CacheRows = 8; + const int CacheCols = 8; + const int StripeSize = 4; + const int VecSize = 2; + + auto input = MakeIncrementingMatrix(Rows, Columns, "input"); + auto output = MakeMatrix(Rows, Columns, "output"); + + // Define LoopNest + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, Rows) + .ForAll(j, 0, Columns) + .Do([=](Matrix input_, Matrix output_, Scalar i_, Scalar j_) { + output_(i_, j_) = input_(i_, j_); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iBlock = schedule.Split(i, CacheRows); + auto jBlock = schedule.Split(j, CacheCols); + auto jStripe = schedule.Split(j, StripeSize); + auto jVec = schedule.Split(j, VecSize); + + std::vector orderedIndices = { jBlock, + iBlock, + jStripe, + i, + jVec, + j }; + schedule.SetOrder(orderedIndices); + + ArgumentType argType = ArgumentType::Output; + std::string cacheName = "cacheOutput"; + size_t maxCacheElts = CacheRows * CacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + schedule.Cache(output, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + +#if 0 // DEBUGGING + auto loop = nest.GetUnderlyingLoopNest(); + DebugDump(loop); +#endif + + nest.Run(); + + return VerifySame(output, input); +} + +Scalar GeneralCachingStrategy_ValidateOutput_Test12() +{ + // BLASTCopy input caching with same BLASTCopy output caching + loopnests::Index i("i"), j("j"); + + const int Rows = 16; + const int Columns = 16; + const int CacheRows = 8; + const int CacheCols = 8; + const int StripeSize = 4; + const int VecSize = 2; + + auto input = MakeIncrementingMatrix(Rows, Columns, "input"); + auto output = MakeMatrix(Rows, Columns, "output"); + + // Define LoopNest + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, Rows) + .ForAll(j, 0, Columns) + .Do([=](Matrix input_, Matrix output_, Scalar i_, Scalar j_) { + output_(i_, j_) = input_(i_, j_); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iBlock = schedule.Split(i, CacheRows); + + auto jBlock = schedule.Split(j, CacheCols); + auto jStripe = schedule.Split(j, StripeSize); + auto jVec = schedule.Split(j, VecSize); + + std::vector 
orderedIndices = { jBlock, + iBlock, + jStripe, + i, + jVec, + j }; + schedule.SetOrder(orderedIndices); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = CacheRows * CacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + schedule.Cache(input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + + ArgumentType output_argType = ArgumentType::Output; + std::string output_cacheName = "cacheOutput"; + auto output_extraCacheParams = std::make_tuple(output_argType, + output_cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + schedule.Cache(output, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + output_extraCacheParams); + +#if 0 // DEBUGGING + auto loop = nest.GetUnderlyingLoopNest(); + DebugDump(loop); +#endif + + nest.Run(); + + return VerifySame(output, input); +} + +Scalar GeneralCachingStrategy_ValidateOutput_Test13() +{ + // BLASTCopy input caching with different BLASTCopy output caching + loopnests::Index i("i"), j("j"); + + const int Rows = 32; + const int Columns = 32; + const int InputCacheRows = 16; + const int InputCacheCols = 16; + const int InputStripeSize = 8; + const int VecSize = 2; + const int OutputCacheRows = 8; + const int OutputCacheCols = InputStripeSize; // == InputStripeSize and in same dimension + const int OutputStripeSize = 4; + + auto input = MakeIncrementingMatrix(Rows, Columns, "input"); + auto output = MakeMatrix(Rows, Columns, "output"); + + // Define LoopNest + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, Rows) + .ForAll(j, 0, Columns) + .Do([=](Matrix input_, Matrix output_, Scalar i_, Scalar j_) { + output_(i_, j_) = input_(i_, j_); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iBlock = schedule.Split(i, InputCacheRows); + auto iOutput = schedule.Split(i, OutputCacheRows); + + auto jBlock = schedule.Split(j, InputCacheCols); + auto jOutput = schedule.Split(j, OutputCacheCols); + auto jInputStripe = jOutput; // Split by the same amount in the same dimension + auto jOutputStripe = schedule.Split(j, OutputStripeSize); + auto jVec = schedule.Split(j, VecSize); + + std::vector orderedIndices = { jBlock, + iBlock, + jOutput, + iOutput, + jOutputStripe, + i, + jVec, + j }; + schedule.SetOrder(orderedIndices); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = InputCacheRows * InputCacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + schedule.Cache(input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + + ArgumentType output_argType = ArgumentType::Output; + std::string output_cacheName = "cacheOutput"; + + size_t output_maxCacheElts = OutputCacheRows * OutputCacheCols; + size_t output_fillThreshold = output_maxCacheElts; + std::function output_reduceFunction = CopyReduce; + auto output_extraCacheParams = std::make_tuple(output_argType, + output_cacheName, + output_maxCacheElts, + output_fillThreshold, + output_reduceFunction, + false); + schedule.Cache(output, + { iTopLevel, jTopLevel }, + {}, + {}, + 
std::nullopt, + output_extraCacheParams); + +#if 0 // DEBUGGING + auto loop = nest.GetUnderlyingLoopNest(); + DebugDump(loop); +#endif + + nest.Run(); + + return VerifySame(output, input); +} + +Scalar GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput(int rows, int columns, int outputCacheRows, int outputCacheColumns) +{ + // Square output cache + loopnests::Index i("i"), j("j"); + + auto input = MakeIncrementingMatrix(rows, columns, "input"); + auto output = MakeMatrix(rows, columns, "output"); + + // Define LoopNest + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, rows) + .ForAll(j, 0, columns) + .Do([=](Matrix input_, Matrix output_, Scalar i_, Scalar j_) { + output_(i_, j_) = input_(i_, j_); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iOutput = schedule.Split(i, outputCacheRows); + auto jOutput = schedule.Split(j, outputCacheColumns); + + std::vector orderedIndices = { iOutput, jOutput, i, j }; + schedule.SetOrder(orderedIndices); + + ArgumentType argType = ArgumentType::Output; + std::string cacheName = "cacheOutput"; + + size_t maxCacheElts = outputCacheRows * outputCacheColumns; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + schedule.Cache(output, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + +#if 0 // DEBUGGING + auto loop = nest.GetUnderlyingLoopNest(); + DebugDump(loop); +#endif + + nest.Run(); + + return VerifySame(output, input); +} + +Scalar GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test1() +{ + const int Rows = 8; + const int Columns = 8; + const int CacheRows = 2; + const int CacheColumns = 3; + return GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput(Rows, Columns, CacheRows, CacheColumns); +} + +Scalar GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test2() +{ + const int Rows = 8; + const int Columns = 8; + const int CacheRows = 3; + const int CacheColumns = 2; + return GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput(Rows, Columns, CacheRows, CacheColumns); +} + +Scalar GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test3() +{ + const int Rows = 8; + const int Columns = 8; + const int CacheRows = 3; + const int CacheColumns = 3; + return GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput(Rows, Columns, CacheRows, CacheColumns); +} + +Scalar GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test4() +{ + const int Rows = 8; + const int Columns = 8; + const int CacheRows = 4; + const int CacheColumns = 5; + return GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput(Rows, Columns, CacheRows, CacheColumns); +} + +Scalar GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test5() +{ + const int Rows = 8; + const int Columns = 8; + const int CacheRows = 5; + const int CacheColumns = 4; + return GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput(Rows, Columns, CacheRows, CacheColumns); +} + +Scalar GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test6() +{ + const int Rows = 8; + const int Columns = 8; + const int CacheRows = 5; + const int CacheColumns = 5; + return GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput(Rows, Columns, CacheRows, CacheColumns); +} + +Scalar 
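+// Tests 1-6 above keep the matrix fixed at 8x8 and vary the output cache
+// dimensions (2x3, 3x2, 3x3, 4x5, 5x4, 5x5) so that the cache tile doesn't
+// evenly divide the iteration space. Tests 7-9 below fix the cache at 2x2 and
+// instead put the boundary in the matrix shape itself (7 rows and/or columns).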
GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test7() +{ + const int Rows = 8; + const int Columns = 7; + const int CacheRows = 2; + const int CacheColumns = 2; + return GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput(Rows, Columns, CacheRows, CacheColumns); +} + +Scalar GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test8() +{ + const int Rows = 7; + const int Columns = 8; + const int CacheRows = 2; + const int CacheColumns = 2; + return GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput(Rows, Columns, CacheRows, CacheColumns); +} + +Scalar GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test9() +{ + const int Rows = 7; + const int Columns = 7; + const int CacheRows = 2; + const int CacheColumns = 2; + return GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput(Rows, Columns, CacheRows, CacheColumns); +} + +// BLASTCOPY tests from above with GeneralCachingStrategy + +Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_Test1() +{ + int N = 8; + int cacheRows = N; + int cacheCols = N; + int stripeSize = 4; + + auto input = MakeIncrementingMatrix(N, N, "input"); + auto output = MakeMatrix(N, N, "output"); + auto expectedOutput = MakeIncrementingMatrix(N, N, "expectedOutput"); + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iCache = schedule.Split(i, cacheRows); + auto jCache = schedule.Split(j, cacheCols); + auto jStripe = schedule.Split(j, stripeSize); + + schedule.SetOrder({ iCache, jCache, jStripe, i, j }); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = cacheRows * cacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + schedule.Cache(input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + return VerifySame(output, expectedOutput); +} + +// Test with smaller cache and stripe size than previous test +Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_Test2() +{ + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + // input, expectedOutput + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7] + // [ 8, 9, 10, 11, 12, 13, 14, 15] + // [16, 17, 18, 19, 20, 21, 22, 23] + // [24, 25, 26, 27, 28, 29, 30, 31] + // [32, 33, 34, 35, 36, 37, 38, 39] + // [40, 41, 42, 43, 44, 45, 46, 47] + // [48, 49, 50, 51, 52, 53, 54, 55] + // [56, 57, 58, 59, 60, 61, 62, 63] + auto input = MakeIncrementingMatrix(N, N, "input"); + auto output = MakeMatrix(N, N, "output"); + auto expectedOutput = MakeIncrementingMatrix(N, N, "expectedOutput"); + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iCache = schedule.Split(i, cacheRows); + auto jCache = schedule.Split(j, cacheCols); + auto 
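+    // (jStripe, assigned just below, subdivides each cache-width block of
+    // columns; under BLASTCopy-style caching each stripe is laid out
+    // contiguously in the cache -- see the expectedCached vectors in the
+    // ValidateMemory tests that follow.)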
jStripe = schedule.Split(j, stripeSize); + + schedule.SetOrder({ iCache, jCache, jStripe, i, j }); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = cacheRows * cacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + schedule.Cache(input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + return VerifySame(output, expectedOutput); +} + +Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_Test1() +{ + int N = 8; + int cacheRows = N; + int cacheCols = N; + int stripeSize = 4; + int vecSize = stripeSize / 2; + + auto input = MakeIncrementingMatrix(N, N, "input"); + auto output = MakeMatrix(N, N, "output"); + + // input + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7] + // [ 8, 9, 10, 11, 12, 13, 14, 15] + // [16, 17, 18, 19, 20, 21, 22, 23] + // [24, 25, 26, 27, 28, 29, 30, 31] + // [32, 33, 34, 35, 36, 37, 38, 39] + // [40, 41, 42, 43, 44, 45, 46, 47] + // [48, 49, 50, 51, 52, 53, 54, 55] + // [56, 57, 58, 59, 60, 61, 62, 63] + // clang-format off + Vector expectedCached = + { + 0, 1, 2, 3, + 8, 9, 10, 11, + 16, 17, 18, 19, + 24, 25, 26, 27, + 32, 33, 34, 35, + 40, 41, 42, 43, + 48, 49, 50, 51, + 56, 57, 58, 59, + + 4, 5, 6, 7, + 12, 13, 14, 15, + 20, 21, 22, 23, + 28, 29, 30, 31, + 36, 37, 38, 39, + 44, 45, 46, 47, + 52, 53, 54, 55, + 60, 61, 62, 63 + }; + // clang-format on + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iCache = schedule.Split(i, cacheRows); + auto jCache = schedule.Split(j, cacheCols); + auto jStripe = schedule.Split(j, stripeSize); + auto jVec = schedule.Split(j, vecSize); + + schedule.SetOrder({ jCache, iCache, jStripe, i, jVec, j }); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = cacheRows * cacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + GeneralCachingStrategy cachingProvider{}; + schedule.Cache(cachingProvider, + input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + // Examine the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + rawCacheValue.SetLayout({ { (int)rawCacheValue.GetLayout().GetMemorySize() } }); + auto cacheVector = Vector(rawCacheValue); + + return VerifySame(cacheVector, expectedCached); +} + +// Smaller stripe size than previous test +Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_Test2() +{ + int N = 8; + int cacheRows = N; + int cacheCols = N; + int stripeSize = 2; + + auto input = MakeIncrementingMatrix(N, N, "input"); + auto output = MakeMatrix(N, N, "output"); + + // input + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7] + // [ 8, 9, 10, 11, 12, 13, 14, 15] + // [16, 17, 18, 19, 20, 21, 22, 23] + // [24, 25, 26, 27, 28, 29, 30, 31] + // 
[32, 33, 34, 35, 36, 37, 38, 39] + // [40, 41, 42, 43, 44, 45, 46, 47] + // [48, 49, 50, 51, 52, 53, 54, 55] + // [56, 57, 58, 59, 60, 61, 62, 63] + // clang-format off + Vector expectedCached = + { + 0, 1, + 8, 9, + 16, 17, + 24, 25, + 32, 33, + 40, 41, + 48, 49, + 56, 57, + + 2, 3, + 10, 11, + 18, 19, + 26, 27, + 34, 35, + 42, 43, + 50, 51, + 58, 59, + + 4, 5, + 12, 13, + 20, 21, + 28, 29, + 36, 37, + 44, 45, + 52, 53, + 60, 61, + + 6, 7, + 14, 15, + 22, 23, + 30, 31, + 38, 39, + 46, 47, + 54, 55, + 62, 63 + }; + // clang-format on + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iCache = schedule.Split(i, cacheRows); + auto jCache = schedule.Split(j, cacheCols); + auto jStripe = schedule.Split(j, stripeSize); + + schedule.SetOrder({ iCache, jCache, jStripe, i, j }); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = cacheRows * cacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + GeneralCachingStrategy cachingProvider{}; + schedule.Cache(cachingProvider, + input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + // Examine the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + rawCacheValue.SetLayout({ { (int)rawCacheValue.GetLayout().GetMemorySize() } }); + auto cacheVector = Vector(rawCacheValue); + + return VerifySame(cacheVector, expectedCached); +} + +// Same stripe size as previous test, but don't cache entire matrix at once +Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_Test3() +{ + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + auto input = MakeIncrementingMatrix(N, N, "input"); + auto output = MakeMatrix(N, N, "output"); + + // input + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7] + // [ 8, 9, 10, 11, 12, 13, 14, 15] + // [16, 17, 18, 19, 20, 21, 22, 23] + // [24, 25, 26, 27, 28, 29, 30, 31] + // [32, 33, 34, 35, 36, 37, 38, 39] + // [40, 41, 42, 43, 44, 45, 46, 47] + // [48, 49, 50, 51, 52, 53, 54, 55] + // [56, 57, 58, 59, 60, 61, 62, 63] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 1, + 8, 9, + 16, 17, + 24, 25, + + 2, 3, + 10, 11, + 18, 19, + 26, 27, + }; + Vector expectedCachedUpperRight = + { + 4, 5, + 12, 13, + 20, 21, + 28, 29, + + 6, 7, + 14, 15, + 22, 23, + 30, 31 + }; + Vector expectedCachedLowerLeft = + { + 32, 33, + 40, 41, + 48, 49, + 56, 57, + + 34, 35, + 42, 43, + 50, 51, + 58, 59, + }; + Vector expectedCachedLowerRight = + { + 36, 37, + 44, 45, + 52, 53, + 60, 61, + + 38, 39, + 46, 47, + 54, 55, + 62, 63 + }; + // clang-format on + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iCache = schedule.Split(i, cacheRows); + auto jCache = schedule.Split(j, 
cacheCols); + auto jStripe = schedule.Split(j, stripeSize); + + schedule.SetOrder({ iCache, jCache, jStripe, i, j }); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = cacheRows * cacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + GeneralCachingStrategy cachingProvider{}; + schedule.Cache(cachingProvider, + input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + + // Get a handle to the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + int rawCacheSize = (int)rawCacheValue.GetLayout().NumElements(); + + auto cachedUpperLeft = MakeVector(rawCacheSize); + auto cachedUpperRight = MakeVector(rawCacheSize); + auto cachedLowerLeft = MakeVector(rawCacheSize); + auto cachedLowerRight = MakeVector(rawCacheSize); + + // Add a low level API kernel to access the underlying cache after it has been filled + auto cacheSpyKernel = loopnests::Kernel("cache_spy_kernel") + .Inputs(rawCacheValue, cachedUpperLeft, cachedUpperRight, cachedLowerLeft, cachedLowerRight) + .Indices(iTopLevel, jTopLevel) + .Define([cacheRows, cacheCols](Value rawCacheValue, Vector cachedUpperLeft, Vector cachedUpperRight, Vector cachedLowerLeft, Vector cachedLowerRight, Scalar i, Scalar j) { + auto cacheView = rawCacheValue; + cacheView.SetLayout({ { (int)rawCacheValue.GetLayout().NumElements() } }); + auto vectorCacheView = Vector(cacheView); + If(i == 0, + [&]() { + // TODO : remove nested if's + If(j == 0, + [&]() { + cachedUpperLeft = vectorCacheView; + }) + .ElseIf(j == cacheCols, + [&]() { + cachedUpperRight = vectorCacheView; + }); + }) + .ElseIf(i == cacheRows, + [&]() { + If(j == 0, [&]() { + cachedLowerLeft = vectorCacheView; + }).ElseIf(j == cacheCols, [&]() { + cachedLowerRight = vectorCacheView; + }); + }); + }); + auto cacheSpyPosition = loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::body, { iCache, jCache }, {} }; + nest.GetUnderlyingLoopNest().AddKernel(cacheSpyKernel, cacheSpyPosition); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + auto ok = MakeScalar("ok"); + ok = 1; + auto printError = [&] { + DebugPrint("Upper Left:"); + DebugPrintVector(cachedUpperLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Upper Right:"); + DebugPrintVector(cachedUpperRight); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperRight); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Lower Left:"); + DebugPrintVector(cachedLowerLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedLowerLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Lower Right:"); + DebugPrintVector(cachedLowerRight); + DebugPrint("\n"); + DebugPrintVector(expectedCachedLowerRight); + DebugPrint("\n"); + DebugPrint("\n"); + }; + // TODO : replace nested if's + If(VerifySame(cachedUpperLeft, expectedCachedUpperLeft) == 0, [&]() { + If(VerifySame(cachedUpperRight, expectedCachedUpperRight) == 0, [&]() { + If(VerifySame(cachedLowerLeft, expectedCachedLowerLeft) == 0, [&]() { + If(VerifySame(cachedLowerRight, expectedCachedLowerRight) == 0, [&]() { + ok = 0; + }).Else(printError); + }).Else(printError); + }).Else(printError); + }).Else(printError); + return ok; +} + +Scalar 
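+// Note on the "cache spy" pattern used above and in the boundary-condition
+// runners below: a low-level kernel is attached at the { iCache, jCache }
+// body position via CodePositionConstraints, so it can snapshot the raw cache
+// buffer into one results vector per cache block; the snapshots are then
+// compared against the expected layouts after the nest runs.
+//
+// As an illustration (added for exposition, not part of the original tests),
+// the quadrant layouts used by these BLASTCopy tests can be reproduced on the
+// host from a row-major matrix a with N columns, assuming the block at
+// (rowBase, colBase) lies fully inside the matrix:
+//
+//     std::vector<int> quadrant;
+//     for (int js = colBase; js < colBase + cacheCols; js += stripeSize)
+//         for (int i = rowBase; i < rowBase + cacheRows; ++i)
+//             for (int j = js; j < js + stripeSize; ++j)
+//                 quadrant.push_back(a[i * N + j]);
+//
+// With N = 8, cacheRows = cacheCols = 4, stripeSize = 2, rowBase = colBase = 0
+// this yields 0, 1, 8, 9, 16, 17, 24, 25, 2, 3, ... -- exactly the
+// expectedCachedUpperLeft in the test above.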
GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(int M, int N, int cacheRows, int cacheCols, int stripeSize)
+{
+    int vecSize = stripeSize / 2;
+
+    auto input = MakeIncrementingMatrix<int>(M, N, "input");
+    auto output = MakeMatrix<int>(M, N, "output");
+    auto expectedOutput = MakeIncrementingMatrix<int>(M, N, "expectedOutput");
+
+    Index i("i"), j("j");
+    auto nest = Using({ input }, ArgumentType::Input)
+                    .Using({ output }, ArgumentType::Output)
+                    .ForAll(i, 0, M)
+                    .ForAll(j, 0, N)
+                    .Do([](Matrix input, Matrix output, Scalar i, Scalar j) {
+                        output(i, j) = input(i, j);
+                    });
+
+    auto& schedule = nest.GetSchedule();
+
+    auto iTopLevel = i;
+    auto jTopLevel = j;
+
+    auto iCache = schedule.Split(i, cacheRows);
+    auto jCache = schedule.Split(j, cacheCols);
+    auto jStripe = schedule.Split(j, stripeSize);
+    auto jVec = schedule.Split(j, vecSize);
+
+    schedule.SetOrder({ jCache, iCache, jStripe, i, jVec, j });
+
+    ArgumentType argType = ArgumentType::Input;
+    std::string cacheName = "cacheInput";
+    size_t maxCacheElts = cacheRows * cacheCols;
+    size_t fillThreshold = maxCacheElts;
+    std::function reduceFunction = CopyReduce;
+    auto extraCacheParams = std::make_tuple(argType,
+                                            cacheName,
+                                            maxCacheElts,
+                                            fillThreshold,
+                                            reduceFunction,
+                                            false);
+    GeneralCachingStrategy cachingProvider{};
+    schedule.Cache(cachingProvider,
+                   input,
+                   { iTopLevel, jTopLevel },
+                   {},
+                   {},
+                   std::nullopt,
+                   extraCacheParams);
+
+#if 0 // DEBUGGING
+    DebugDump(nest.GetUnderlyingLoopNest());
+#endif
+    nest.Run();
+
+    return VerifySame(output, expectedOutput);
+}
+
+// input matrix rows evenly divides cache rows
+// input matrix cols doesn't evenly divide cache cols
+Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test1()
+{
+    int M = 8;
+    int N = 7; // N doesn't evenly divide the number of cache columns
+    int cacheRows = 4;
+    int cacheCols = 4;
+    int stripeSize = 2;
+
+    return GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize);
+}
+
+// input matrix rows evenly divides cache rows
+// input matrix cols doesn't evenly divide cache cols but does evenly divide stripeSize
+Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test2()
+{
+    int M = 8;
+    int N = 6; // N doesn't evenly divide the number of cache columns, but does evenly divide stripe size
+    int cacheRows = 4;
+    int cacheCols = 4;
+    int stripeSize = 2;
+
+    return GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize);
+}
+
+// input matrix rows doesn't evenly divide cache rows
+// input matrix cols doesn't evenly divide cache cols
+Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test3()
+{
+    int M = 6;
+    int N = 7; // N doesn't evenly divide the number of cache columns
+    int cacheRows = 4;
+    int cacheCols = 4;
+    int stripeSize = 2;
+
+    return GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize);
+}
+
+// input matrix rows doesn't evenly divide cache rows
+// input matrix cols doesn't evenly divide cache cols but does evenly divide stripe size
+Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test4()
+{
+    int M = 6;
+    int N = 6; // N doesn't evenly divide the number of cache columns, but does evenly divide stripe size
+    int cacheRows = 4;
+    int cacheCols = 4;
+    int stripeSize = 2;
+
+    return GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize);
+}
+
+// input matrix rows evenly divides cache rows
+// input matrix cols < cache cols, doesn't evenly divide stripe size
+Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test5()
+{
+    int M = 8;
+    int N = 3; // N < cache columns, doesn't evenly divide stripe size
+    int cacheRows = 4;
+    int cacheCols = 4;
+    int stripeSize = 2;
+
+    return GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize);
+}
+
+// input matrix rows evenly divides cache rows
+// input matrix cols < cache cols, evenly divides stripe size
+Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test6()
+{
+    int M = 8;
+    int N = 2; // N < cache columns, does evenly divide stripe size
+    int cacheRows = 4;
+    int cacheCols = 4;
+    int stripeSize = 2;
+
+    return GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize);
+}
+
+// input matrix rows < cache rows
+// input matrix cols < cache cols, doesn't evenly divide stripe size
+Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test7()
+{
+    int M = 3;
+    int N = 3; // N < cache columns, doesn't evenly divide stripe size
+    int cacheRows = 4;
+    int cacheCols = 4;
+    int stripeSize = 2;
+
+    return GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize);
+}
+
+// input matrix rows < cache rows
+// input matrix cols < cache cols, does evenly divide stripe size
+Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test8()
+{
+    int M = 2;
+    int N = 2; // N < cache columns, does evenly divide stripe size
+    int cacheRows = 4;
+    int cacheCols = 4;
+    int stripeSize = 2;
+
+    return GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize);
+}
+
+// input matrix rows < cache rows
+// input matrix cols multiple of cache cols
+Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test9()
+{
+    int M = 2;
+    int N = 8;
+    int cacheRows = 4;
+    int cacheCols = 4;
+    int stripeSize = 2;
+
+    return GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize);
+}
+
+Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Runner(int M, int N, int cacheRows, int cacheCols, int stripeSize, Vector expectedCachedUpperLeft, Vector expectedCachedUpperRight, Vector expectedCachedLowerLeft, Vector expectedCachedLowerRight)
+{
+    int vecSize = stripeSize / 2;
+    auto input = MakeIncrementingMatrix<int>(M, N, "input");
+    auto output = MakeMatrix<int>(M, N, "output");
+
+    Index i("i"), j("j");
+    auto nest = Using({ input }, ArgumentType::Input)
+                    .Using({ output }, ArgumentType::Output)
+                    .ForAll(i, 0, M)
+                    .ForAll(j, 0, N)
+                    .Do([](Matrix input, Matrix output, Scalar i, Scalar j) {
+                        output(i, j) = input(i, j);
+                    });
+
+    auto& schedule = nest.GetSchedule();
+
+    auto iTopLevel = i;
+    auto jTopLevel = j;
+
+    auto iCache = schedule.Split(i, cacheRows);
+    auto jCache = schedule.Split(j, cacheCols);
+    auto jStripe = schedule.Split(j, stripeSize);
+    auto jVec = schedule.Split(j, vecSize);
+
+    schedule.SetOrder({ jCache, iCache, jStripe, i, jVec, j });
+
+    ArgumentType argType = ArgumentType::Input;
+    std::string cacheName = "cacheInput";
+    size_t maxCacheElts = cacheRows * cacheCols;
+    size_t fillThreshold = maxCacheElts;
+    std::function reduceFunction = CopyReduce;
+    auto extraCacheParams =
std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + GeneralCachingStrategy cachingProvider{}; + schedule.Cache(cachingProvider, + input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + + // Get a handle to the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + int rawCacheSize = (int)rawCacheValue.GetLayout().NumElements(); + + auto cachedUpperLeft = MakeVector(rawCacheSize); + auto cachedUpperRight = MakeVector(rawCacheSize); + auto cachedLowerLeft = MakeVector(rawCacheSize); + auto cachedLowerRight = MakeVector(rawCacheSize); + + // Add a low level API kernel to access the underlying cache after it has been filled + auto cacheSpyKernel = loopnests::Kernel("cache_spy_kernel") + .Inputs(rawCacheValue, cachedUpperLeft, cachedUpperRight, cachedLowerLeft, cachedLowerRight) + .Indices(iTopLevel, jTopLevel) + .Define([cacheRows, cacheCols](Value rawCacheValue, Vector cachedUpperLeft, Vector cachedUpperRight, Vector cachedLowerLeft, Vector cachedLowerRight, Scalar i, Scalar j) { + auto cacheView = rawCacheValue; + cacheView.SetLayout({ { (int)rawCacheValue.GetLayout().NumElements() } }); + auto vectorCacheView = Vector(cacheView); + If(i == 0, + [&]() { + // TODO : remove nested if's + If(j == 0, + [&]() { + cachedUpperLeft = vectorCacheView; + }) + .ElseIf(j == cacheCols, + [&]() { + cachedUpperRight = vectorCacheView; + }); + }) + .ElseIf(i == cacheRows, + [&]() { + If(j == 0, [&]() { + cachedLowerLeft = vectorCacheView; + }).ElseIf(j == cacheCols, [&]() { + cachedLowerRight = vectorCacheView; + }); + }); + }); + auto cacheSpyPosition = loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::body, { iCache, jCache }, {} }; + nest.GetUnderlyingLoopNest().AddKernel(cacheSpyKernel, cacheSpyPosition); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + auto ok = MakeScalar("ok"); + ok = 1; + auto printError = [&] { + DebugPrint("Upper Left:"); + DebugPrintVector(cachedUpperLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Upper Right:"); + DebugPrintVector(cachedUpperRight); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperRight); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Lower Left:"); + DebugPrintVector(cachedLowerLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedLowerLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Lower Right:"); + DebugPrintVector(cachedLowerRight); + DebugPrint("\n"); + DebugPrintVector(expectedCachedLowerRight); + DebugPrint("\n"); + DebugPrint("\n"); + }; + // TODO : replace nested if's + If(VerifySame(cachedUpperLeft, expectedCachedUpperLeft) == 0, [&]() { + If(VerifySame(cachedUpperRight, expectedCachedUpperRight) == 0, [&]() { + If(VerifySame(cachedLowerLeft, expectedCachedLowerLeft) == 0, [&]() { + If(VerifySame(cachedLowerRight, expectedCachedLowerRight) == 0, [&]() { + ok = 0; + }).Else(printError); + }).Else(printError); + }).Else(printError); + }).Else(printError); + return ok; +} + +Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Runner_LeftCachesOnly(int M, int N, int cacheRows, int cacheCols, int stripeSize, Vector expectedCachedUpperLeft, Vector expectedCachedLowerLeft) +{ + int vecSize = stripeSize / 2; + auto input = MakeIncrementingMatrix(M, N, "input"); + auto output = MakeMatrix(M, N, "output"); + + Index i("i"), j("j"); + auto nest = Using({ input }, 
ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, M) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iCache = schedule.Split(i, cacheRows); + auto jCache = schedule.Split(j, cacheCols); + auto jStripe = schedule.Split(j, stripeSize); + auto jVec = schedule.Split(j, vecSize); + + schedule.SetOrder({ jCache, iCache, jStripe, i, jVec, j }); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = cacheRows * cacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + GeneralCachingStrategy cachingProvider{}; + schedule.Cache(cachingProvider, + input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + + // Get a handle to the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + int rawCacheSize = (int)rawCacheValue.GetLayout().NumElements(); + + // No right caches when N < cacheCols + auto cachedUpperLeft = MakeVector(rawCacheSize); + auto cachedLowerLeft = MakeVector(rawCacheSize); + + // Add a low level API kernel to access the underlying cache after it has been filled + auto cacheSpyKernel = loopnests::Kernel("cache_spy_kernel") + .Inputs(rawCacheValue, cachedUpperLeft, cachedLowerLeft) + .Indices(iTopLevel, jTopLevel) + .Define([cacheRows](Value rawCacheValue, Vector cachedUpperLeft, Vector cachedLowerLeft, Scalar i, Scalar j) { + auto cacheView = rawCacheValue; + cacheView.SetLayout({ { (int)rawCacheValue.GetLayout().NumElements() } }); + auto vectorCacheView = Vector(cacheView); + If(i == 0, + [&]() { + // TODO : remove nested if's + If(j == 0, + [&]() { + cachedUpperLeft = vectorCacheView; + }); + }) + .ElseIf(i == cacheRows, + [&]() { + If(j == 0, [&]() { + cachedLowerLeft = vectorCacheView; + }); + }); + }); + auto cacheSpyPosition = loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::body, { iCache, jCache }, {} }; + nest.GetUnderlyingLoopNest().AddKernel(cacheSpyKernel, cacheSpyPosition); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + auto ok = MakeScalar("ok"); + ok = 1; + auto printError = [&] { + DebugPrint("Upper Left:"); + DebugPrintVector(cachedUpperLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Lower Left:"); + DebugPrintVector(cachedLowerLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedLowerLeft); + DebugPrint("\n"); + DebugPrint("\n"); + }; + // TODO : replace nested if's + If(VerifySame(cachedUpperLeft, expectedCachedUpperLeft) == 0, [&]() { + If(VerifySame(cachedLowerLeft, expectedCachedLowerLeft) == 0, [&]() { + ok = 0; + }).Else(printError); + }).Else(printError); + return ok; +} + +Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Runner_UpperCachesOnly(int M, int N, int cacheRows, int cacheCols, int stripeSize, Vector expectedCachedUpperLeft, Vector expectedCachedUpperRight) +{ + int vecSize = stripeSize / 2; + auto input = MakeIncrementingMatrix(M, N, "input"); + auto output = MakeMatrix(M, N, "output"); + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, M) + 
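+                    // (This UpperCachesOnly runner and the LeftCachesOnly /
+                    // UpperLeftCacheOnly variants around it exist because, as
+                    // noted above, there are no right caches when N < cacheCols
+                    // and, symmetrically, no lower caches when M < cacheRows;
+                    // only the quadrants that actually get filled are spied on.)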
.ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iCache = schedule.Split(i, cacheRows); + auto jCache = schedule.Split(j, cacheCols); + auto jStripe = schedule.Split(j, stripeSize); + auto jVec = schedule.Split(j, vecSize); + + schedule.SetOrder({ jCache, iCache, jStripe, i, jVec, j }); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = cacheRows * cacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + GeneralCachingStrategy cachingProvider{}; + schedule.Cache(cachingProvider, + input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + + // Get a handle to the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + int rawCacheSize = (int)rawCacheValue.GetLayout().NumElements(); + + auto cachedUpperLeft = MakeVector(rawCacheSize); + auto cachedUpperRight = MakeVector(rawCacheSize); + + // Add a low level API kernel to access the underlying cache after it has been filled + auto cacheSpyKernel = loopnests::Kernel("cache_spy_kernel") + .Inputs(rawCacheValue, cachedUpperLeft, cachedUpperRight) + .Indices(iTopLevel, jTopLevel) + .Define([cacheCols](Value rawCacheValue, Vector cachedUpperLeft, Vector cachedUpperRight, Scalar i, Scalar j) { + auto cacheView = rawCacheValue; + cacheView.SetLayout({ { (int)rawCacheValue.GetLayout().NumElements() } }); + auto vectorCacheView = Vector(cacheView); + If(i == 0, + [&]() { + // TODO : remove nested if's + If(j == 0, + [&]() { + cachedUpperLeft = vectorCacheView; + }) + .ElseIf(j == cacheCols, + [&]() { + cachedUpperRight = vectorCacheView; + }); + }); + }); + auto cacheSpyPosition = loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::body, { iCache, jCache }, {} }; + nest.GetUnderlyingLoopNest().AddKernel(cacheSpyKernel, cacheSpyPosition); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + auto ok = MakeScalar("ok"); + ok = 1; + auto printError = [&] { + DebugPrint("Upper Left:"); + DebugPrintVector(cachedUpperLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Upper Right:"); + DebugPrintVector(cachedUpperRight); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperRight); + DebugPrint("\n"); + DebugPrint("\n"); + }; + // TODO : replace nested if's + If(VerifySame(cachedUpperLeft, expectedCachedUpperLeft) == 0, [&]() { + If(VerifySame(cachedUpperRight, expectedCachedUpperRight) == 0, [&]() { + ok = 0; + }).Else(printError); + }).Else(printError); + return ok; +} + +Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Runner_UpperLeftCacheOnly(int M, int N, int cacheRows, int cacheCols, int stripeSize, Vector expectedCachedUpperLeft) +{ + int vecSize = stripeSize / 2; + auto input = MakeIncrementingMatrix(M, N, "input"); + auto output = MakeMatrix(M, N, "output"); + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, M) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = 
i; + auto jTopLevel = j; + + auto iCache = schedule.Split(i, cacheRows); + auto jCache = schedule.Split(j, cacheCols); + auto jStripe = schedule.Split(j, stripeSize); + auto jVec = schedule.Split(j, vecSize); + + schedule.SetOrder({ jCache, iCache, jStripe, i, jVec, j }); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = cacheRows * cacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + GeneralCachingStrategy cachingProvider{}; + schedule.Cache(cachingProvider, + input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + + // Get a handle to the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + int rawCacheSize = (int)rawCacheValue.GetLayout().NumElements(); + + // No right caches when N < cacheCols + auto cachedUpperLeft = MakeVector(rawCacheSize); + + // Add a low level API kernel to access the underlying cache after it has been filled + auto cacheSpyKernel = loopnests::Kernel("cache_spy_kernel") + .Inputs(rawCacheValue, cachedUpperLeft) + .Indices(iTopLevel, jTopLevel) + .Define([](Value rawCacheValue, Vector cachedUpperLeft, Scalar i, Scalar j) { + auto cacheView = rawCacheValue; + cacheView.SetLayout({ { (int)rawCacheValue.GetLayout().NumElements() } }); + auto vectorCacheView = Vector(cacheView); + If(i == 0, + [&]() { + // TODO : remove nested if's + If(j == 0, + [&]() { + cachedUpperLeft = vectorCacheView; + }); + }); + }); + auto cacheSpyPosition = loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::body, { iCache, jCache }, {} }; + nest.GetUnderlyingLoopNest().AddKernel(cacheSpyKernel, cacheSpyPosition); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + return VerifySame(cachedUpperLeft, expectedCachedUpperLeft); +} + +Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test1() +{ + int M = 8; // M does evenly divide cache rows + int N = 7; // N doesn't evenly divide cache columns + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + // input + // A: + // [ 0, 1, 2, 3, 4, 5, 6], + // [ 7, 8, 9, 10, 11, 12, 13], + // [14, 15, 16, 17, 18, 19, 20], + // [21, 22, 23, 24, 25, 26, 27], + // [28, 29, 30, 31, 32, 33, 34], + // [35, 36, 37, 38, 39, 40, 41], + // [42, 43, 44, 45, 46, 47, 48], + // [49, 50, 51, 52, 53, 54, 55] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 1, + 7, 8, + 14, 15, + 21, 22, + + 2, 3, + 9, 10, + 16, 17, + 23, 24, + }; + Vector expectedCachedUpperRight = + { + 4, 5, + 11, 12, + 18, 19, + 25, 26, + + 6, 0, + 13, 0, + 20, 0, + 27, 0 + }; + Vector expectedCachedLowerLeft = + { + 28, 29, + 35, 36, + 42, 43, + 49, 50, + + 30, 31, + 37, 38, + 44, 45, + 51, 52, + }; + Vector expectedCachedLowerRight = + { + 32, 33, + 39, 40, + 46, 47, + 53, 54, + + 34, 0, + 41, 0, + 48, 0, + 55, 0 + }; + // clang-format on + + return GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight, expectedCachedLowerLeft, expectedCachedLowerRight); +} + +Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test2() +{ + int M = 8; // M does evenly divide cache rows + int N = 6; // N doesn't evenly divide cache columns, but does evenly divide stripe size + int cacheRows = 4; + int 
cacheCols = 4;
+    int stripeSize = 2;
+
+    auto input = MakeIncrementingMatrix<int>(M, N, "input");
+    auto output = MakeMatrix<int>(M, N, "output");
+
+    // input
+    // A:
+    // [ 0,  1,  2,  3,  4,  5],
+    // [ 6,  7,  8,  9, 10, 11],
+    // [12, 13, 14, 15, 16, 17],
+    // [18, 19, 20, 21, 22, 23],
+    // [24, 25, 26, 27, 28, 29],
+    // [30, 31, 32, 33, 34, 35],
+    // [36, 37, 38, 39, 40, 41],
+    // [42, 43, 44, 45, 46, 47]
+    // clang-format off
+    Vector expectedCachedUpperLeft =
+        {
+            0, 1,
+            6, 7,
+            12, 13,
+            18, 19,
+
+            2, 3,
+            8, 9,
+            14, 15,
+            20, 21,
+        };
+    Vector expectedCachedUpperRight =
+        {
+            4, 5,
+            10, 11,
+            16, 17,
+            22, 23,
+
+            0, 0,
+            0, 0,
+            0, 0,
+            0, 0
+        };
+    Vector expectedCachedLowerLeft =
+        {
+            24, 25,
+            30, 31,
+            36, 37,
+            42, 43,
+
+            26, 27,
+            32, 33,
+            38, 39,
+            44, 45,
+        };
+    Vector expectedCachedLowerRight =
+        {
+            28, 29,
+            34, 35,
+            40, 41,
+            46, 47,
+
+            0, 0,
+            0, 0,
+            0, 0,
+            0, 0
+        };
+    // clang-format on
+
+    return GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight, expectedCachedLowerLeft, expectedCachedLowerRight);
+}
+
+// input matrix rows doesn't evenly divide cache rows
+// input matrix cols doesn't evenly divide cache cols
+Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test3()
+{
+    int M = 6;
+    int N = 7; // N doesn't evenly divide the number of cache columns
+    int cacheRows = 4;
+    int cacheCols = 4;
+    int stripeSize = 2;
+
+    // input
+    // A:
+    // [ 0,  1,  2,  3,  4,  5,  6],
+    // [ 7,  8,  9, 10, 11, 12, 13],
+    // [14, 15, 16, 17, 18, 19, 20],
+    // [21, 22, 23, 24, 25, 26, 27],
+    // [28, 29, 30, 31, 32, 33, 34],
+    // [35, 36, 37, 38, 39, 40, 41],
+    // clang-format off
+    Vector expectedCachedUpperLeft =
+        {
+            0, 1,
+            7, 8,
+            14, 15,
+            21, 22,
+
+            2, 3,
+            9, 10,
+            16, 17,
+            23, 24,
+        };
+    Vector expectedCachedUpperRight =
+        {
+            4, 5,
+            11, 12,
+            18, 19,
+            25, 26,
+
+            6, 0,
+            13, 0,
+            20, 0,
+            27, 0
+        };
+
+    // Check that the boundary block gets re-viewed correctly to keep the cached data contiguous
+    Vector expectedCachedLowerLeft =
+        {
+            28, 29,
+            35, 36,
+            30, 31,
+            37, 38,
+
+            0, 0,
+            0, 0,
+            0, 0,
+            0, 0,
+        };
+    Vector expectedCachedLowerRight =
+        {
+            32, 33,
+            39, 40,
+            34, 0,
+            41, 0,
+
+            0, 0,
+            0, 0,
+            0, 0,
+            0, 0
+        };
+    // clang-format on
+
+    return GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight, expectedCachedLowerLeft, expectedCachedLowerRight);
+}
+
+// input matrix rows doesn't evenly divide cache rows
+// input matrix cols doesn't evenly divide cache cols but does evenly divide stripe size
+Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test4()
+{
+    int M = 6;
+    int N = 6; // N doesn't evenly divide the number of cache columns, but does evenly divide stripe size
+    int cacheRows = 4;
+    int cacheCols = 4;
+    int stripeSize = 2;
+
+    // input
+    // A:
+    // [ 0,  1,  2,  3,  4,  5],
+    // [ 6,  7,  8,  9, 10, 11],
+    // [12, 13, 14, 15, 16, 17],
+    // [18, 19, 20, 21, 22, 23],
+    // [24, 25, 26, 27, 28, 29],
+    // [30, 31, 32, 33, 34, 35]
+    // clang-format off
+    Vector expectedCachedUpperLeft =
+        {
+            0, 1,
+            6, 7,
+            12, 13,
+            18, 19,
+
+            2, 3,
+            8, 9,
+            14, 15,
+            20, 21,
+        };
+    Vector expectedCachedUpperRight =
+        {
+            4, 5,
+            10, 11,
+            16, 17,
+            22, 23,
+
+            0, 0,
+            0, 0,
+            0, 0,
+            0, 0
+        };
+    Vector expectedCachedLowerLeft =
+        {
+            24, 25,
+            30, 31,
+            26, 27,
+            32, 33,
+
+            0, 0,
+            0, 0,
+            0, 0,
+            0, 0,
+        };
+    Vector expectedCachedLowerRight =
+        {
+            28, 29,
+            34, 35,
+            0, 0,
+            0, 0,
+
+            0, 0,
+            0, 0,
+            0, 0,
+            0, 0
+        };
+    // clang-format on
+    return GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight, expectedCachedLowerLeft, expectedCachedLowerRight);
+}
+
+// input matrix rows evenly divides cache rows
+// input matrix cols < cache cols, doesn't evenly divide stripe size
+Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test5()
+{
+    int M = 8;
+    int N = 3; // N < cache columns, doesn't evenly divide stripe size
+    int cacheRows = 4;
+    int cacheCols = 4;
+    int stripeSize = 2;
+
+    // input
+    // A:
+    // [ 0,  1,  2],
+    // [ 3,  4,  5],
+    // [ 6,  7,  8],
+    // [ 9, 10, 11],
+    // [12, 13, 14],
+    // [15, 16, 17],
+    // [18, 19, 20],
+    // [21, 22, 23]
+    // clang-format off
+    Vector expectedCachedUpperLeft =
+        {
+            0, 1,
+            3, 4,
+            6, 7,
+            9, 10,
+
+            2, 0,
+            5, 0,
+            8, 0,
+            11, 0,
+        };
+    Vector expectedCachedLowerLeft =
+        {
+            12, 13,
+            15, 16,
+            18, 19,
+            21, 22,
+
+            14, 0,
+            17, 0,
+            20, 0,
+            23, 0,
+        };
+    // clang-format on
+
+    return GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Runner_LeftCachesOnly(M, N, cacheRows, cacheCols, stripeSize, expectedCachedUpperLeft, expectedCachedLowerLeft);
+}
+
+// input matrix rows evenly divides cache rows
+// input matrix cols < cache cols, evenly divides stripe size
+Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test6()
+{
+    int M = 8;
+    int N = 2; // N < cache columns, does evenly divide stripe size
+    int cacheRows = 4;
+    int cacheCols = 4;
+    int stripeSize = 2;
+
+    // input
+    // A:
+    // [ 0,  1],
+    // [ 2,  3],
+    // [ 4,  5],
+    // [ 6,  7],
+    // [ 8,  9],
+    // [10, 11],
+    // [12, 13],
+    // [14, 15]
+    // clang-format off
+    Vector expectedCachedUpperLeft =
+        {
+            0, 1,
+            2, 3,
+            4, 5,
+            6, 7,
+
+            0, 0,
+            0, 0,
+            0, 0,
+            0, 0,
+        };
+    Vector expectedCachedLowerLeft =
+        {
+            8, 9,
+            10, 11,
+            12, 13,
+            14, 15,
+
+            0, 0,
+            0, 0,
+            0, 0,
+            0, 0,
+        };
+    // clang-format on
+    return GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Runner_LeftCachesOnly(M, N, cacheRows, cacheCols, stripeSize, expectedCachedUpperLeft, expectedCachedLowerLeft);
+}
+
+// input matrix rows < cache rows
+// input matrix cols < cache cols, doesn't evenly divide stripe size
+Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test7()
+{
+    int M = 3;
+    int N = 3; // N < cache columns, doesn't evenly divide stripe size
+    int cacheRows = 4;
+    int cacheCols = 4;
+    int stripeSize = 2;
+
+    // input
+    // A:
+    // [0, 1, 2],
+    // [3, 4, 5],
+    // [6, 7, 8]
+    // clang-format off
+    Vector expectedCachedUpperLeft =
+        {
+            0, 1,
+            3, 4,
+            6, 7,
+            2, 0,
+
+            5, 0,
+            8, 0,
+            0, 0,
+            0, 0
+        };
+    // clang-format on
+    return GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Runner_UpperLeftCacheOnly(M, N, cacheRows, cacheCols, stripeSize, expectedCachedUpperLeft);
+}
+
+// input matrix rows < cache rows
+// input matrix cols < cache cols, does evenly divide stripe size
+Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test8()
+{
+    int M = 2;
+    int N = 2; // N < cache columns, does evenly divide stripe size
+    int cacheRows = 4;
+    int cacheCols = 4;
+    int stripeSize = 2;
+
+    // input
+    // A:
+    // [ 0, 1],
+    // [ 2, 3]
+    // clang-format off
+    Vector expectedCachedUpperLeft =
+        {
+            0, 1,
+            2, 3,
+            0, 0,
+            0, 0,
+
+            0, 0,
+            0, 0,
+            0, 0,
+            0, 0,
+        };
+    // clang-format on
+    return
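+    // (In the boundary tests above, cache slots that the fill never touches
+    // are expected to read back as zero -- hence the zero padding in the
+    // expected vectors. This presumes the cache buffer starts out zero-filled;
+    // that is an inference from the expected data, not a stated guarantee of
+    // the caching strategy.)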
GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Runner_UpperLeftCacheOnly(M, N, cacheRows, cacheCols, stripeSize, expectedCachedUpperLeft); +} + +// input matrix rows < cache rows +// input matrix cols multiple of cache cols +Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test9() +{ + int M = 2; + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + // input + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7], + // [ 8, 9, 10, 11, 12, 13, 14, 15] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 1, + 8, 9, + 2, 3, + 10, 11, + + 0, 0, + 0, 0, + 0, 0, + 0, 0, + }; + Vector expectedCachedUpperRight = + { + 4, 5, + 12, 13, + 6, 7, + 14, 15, + + 0, 0, + 0, 0, + 0, 0, + 0, 0 + }; + // clang-format on + return GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Runner_UpperCachesOnly(M, N, cacheRows, cacheCols, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight); +} + +// General caching strategy Progressive BLASNCopy-style caching +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_Test1() +{ + int N = 8; + int cacheRows = N; + int cacheCols = N; + int blockSize = 4; + int stripeSize = 2; + + auto input = MakeIncrementingMatrix(N, N, "input"); + auto output = MakeMatrix(N, N, "output"); + auto expectedOutput = MakeIncrementingMatrix(N, N, "expectedOutput"); + + // input: + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7] + // [ 8, 9, 10, 11, 12, 13, 14, 15] + // [16, 17, 18, 19, 20, 21, 22, 23] + // [24, 25, 26, 27, 28, 29, 30, 31] + // [32, 33, 34, 35, 36, 37, 38, 39] + // [40, 41, 42, 43, 44, 45, 46, 47] + // [48, 49, 50, 51, 52, 53, 54, 55] + // [56, 57, 58, 59, 60, 61, 62, 63] + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iCache = schedule.Split(i, cacheRows); + auto iBlock = schedule.Split(i, blockSize); + auto iStripe = schedule.Split(i, stripeSize); + auto jCache = schedule.Split(j, cacheCols); + + schedule.SetOrder({ iCache, jCache, iBlock, iStripe, j, i }); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = cacheRows * cacheCols; + size_t fillThreshold = blockSize * cacheCols; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + GeneralCachingStrategy cachingProvider{}; + schedule.Cache(cachingProvider, + input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + return VerifySame(output, expectedOutput); +} + +// Test with smaller cache, block, and stripe size than previous test +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_Test2() +{ + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 2; + int stripeSize = 1; + + // input, expectedOutput + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7] + // [ 8, 9, 10, 11, 12, 13, 14, 15] + // [16, 17, 18, 19, 20, 21, 22, 23] + // [24, 25, 26, 27, 28, 29, 30, 31] + // [32, 33, 34, 35, 36, 37, 38, 39] + // [40, 41, 42, 43, 44, 45, 46, 47] + // [48, 49, 50, 51, 52, 53, 54, 55] + // [56, 57, 58, 59, 60, 61, 62, 63] + auto input = 
MakeIncrementingMatrix(N, N, "input"); + auto output = MakeMatrix(N, N, "output"); + auto expectedOutput = MakeIncrementingMatrix(N, N, "expectedOutput"); + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iCache = schedule.Split(i, cacheRows); + auto iBlock = schedule.Split(i, blockSize); + auto iStripe = schedule.Split(i, stripeSize); + auto jCache = schedule.Split(j, cacheCols); + + schedule.SetOrder({ iCache, jCache, iBlock, iStripe, j, i }); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = cacheRows * cacheCols; + size_t fillThreshold = blockSize * cacheCols; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + GeneralCachingStrategy cachingProvider{}; + schedule.Cache(cachingProvider, + input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + return VerifySame(output, expectedOutput); +} + +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_Test1() +{ + int N = 8; + int cacheRows = N; + int cacheCols = N; + int blockSize = N; + int stripeSize = 4; + + auto input = MakeIncrementingMatrix(N, N, "input"); + auto output = MakeMatrix(N, N, "output"); + + // input + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7] + // [ 8, 9, 10, 11, 12, 13, 14, 15] + // [16, 17, 18, 19, 20, 21, 22, 23] + // [24, 25, 26, 27, 28, 29, 30, 31] + // [32, 33, 34, 35, 36, 37, 38, 39] + // [40, 41, 42, 43, 44, 45, 46, 47] + // [48, 49, 50, 51, 52, 53, 54, 55] + // [56, 57, 58, 59, 60, 61, 62, 63] + // clang-format off + Vector expectedCached = + { + 0, 8, 16, 24, + 1, 9, 17, 25, + 2, 10, 18, 26, + 3, 11, 19, 27, + 4, 12, 20, 28, + 5, 13, 21, 29, + 6, 14, 22, 30, + 7, 15, 23, 31, + + 32, 40, 48, 56, + 33, 41, 49, 57, + 34, 42, 50, 58, + 35, 43, 51, 59, + 36, 44, 52, 60, + 37, 45, 53, 61, + 38, 46, 54, 62, + 39, 47, 55, 63 + }; + // clang-format on + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iCache = schedule.Split(i, cacheRows); + auto iBlock = schedule.Split(i, blockSize); + auto iStripe = schedule.Split(i, stripeSize); + auto jCache = schedule.Split(j, cacheCols); + + schedule.SetOrder({ iCache, jCache, iBlock, iStripe, j, i }); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = cacheRows * cacheCols; + size_t fillThreshold = blockSize * cacheCols; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + GeneralCachingStrategy cachingProvider{}; + schedule.Cache(cachingProvider, + input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + 
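+ // Illustration of the packing this test expects (an editorial sketch, assuming
+ // row-major input with A(i, j) == i * N + j): the BLASNCopy cache splits A into
+ // stripes of stripeSize rows and stores each stripe column-major, so element
+ // (i, j) lands at cache offset
+ //     (i / stripeSize) * (stripeSize * N) + j * stripeSize + (i % stripeSize)
+ // e.g. A(0, 1) == 1 lands at offset 4, matching expectedCached above.
+ // A reference packer would look like:
+ //     for (int i = 0; i < N; ++i)
+ //         for (int j = 0; j < N; ++j)
+ //             packed[(i / stripeSize) * (stripeSize * N) + j * stripeSize + (i % stripeSize)] = a[i * N + j];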
nest.Run(); + + // Examine the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + rawCacheValue.SetLayout({ { (int)rawCacheValue.GetLayout().GetMemorySize() } }); + auto cacheVector = Vector(rawCacheValue); + + return VerifySame(cacheVector, expectedCached); +} + +// Smaller stripe size than previous test +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_Test2() +{ + int N = 8; + int cacheRows = N; + int cacheCols = N; + int blockSize = 4; + int stripeSize = 2; + + auto input = MakeIncrementingMatrix(N, N, "input"); + auto output = MakeMatrix(N, N, "output"); + + // input + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7] + // [ 8, 9, 10, 11, 12, 13, 14, 15] + // [16, 17, 18, 19, 20, 21, 22, 23] + // [24, 25, 26, 27, 28, 29, 30, 31] + // [32, 33, 34, 35, 36, 37, 38, 39] + // [40, 41, 42, 43, 44, 45, 46, 47] + // [48, 49, 50, 51, 52, 53, 54, 55] + // [56, 57, 58, 59, 60, 61, 62, 63] + // clang-format off + Vector expectedCached = + { + 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, + 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31, + + 32, 40, 33, 41, 34, 42, 35, 43, 36, 44, 37, 45, 38, 46, 39, 47, + 48, 56, 49, 57, 50, 58, 51, 59, 52, 60, 53, 61, 54, 62, 55, 63, + }; + // clang-format on + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iCache = schedule.Split(i, cacheRows); + auto iBlock = schedule.Split(i, blockSize); + auto iStripe = schedule.Split(i, stripeSize); + auto jCache = schedule.Split(j, cacheCols); + + schedule.SetOrder({ iCache, jCache, iBlock, iStripe, j, i }); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = cacheRows * cacheCols; + size_t fillThreshold = blockSize * cacheCols; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + GeneralCachingStrategy cachingProvider{}; + schedule.Cache(cachingProvider, + input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + // Examine the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + rawCacheValue.SetLayout({ { (int)rawCacheValue.GetLayout().GetMemorySize() } }); + auto cacheVector = Vector(rawCacheValue); + + return VerifySame(cacheVector, expectedCached); +} + +// Same stripe size as previous test, but don't cache entire matrix at once +// Doesn't test the progressive nature of the cache over blocks +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_Test3() +{ + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 2; + int stripeSize = 2; + + auto input = MakeIncrementingMatrix(N, N, "input"); + auto output = MakeMatrix(N, N, "output"); + + // input + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7] + // [ 8, 9, 10, 11, 12, 13, 14, 15] + // [16, 17, 18, 19, 20, 21, 22, 23] + // [24, 25, 26, 27, 28, 29, 30, 31] + // [32, 33, 34, 35, 36, 37, 38, 39] + // [40, 41, 42, 43, 44, 45, 46, 47] + // [48, 49, 50, 51, 52, 53, 54, 55] + // [56, 57, 58, 59, 60, 61, 62, 63] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 8, 1, 9, 2, 10, 
3, 11, + 16, 24, 17, 25, 18, 26, 19, 27 + }; + Vector expectedCachedUpperRight = + { + 4, 12, 5, 13, 6, 14, 7, 15, + 20, 28, 21, 29, 22, 30, 23, 31 + }; + Vector expectedCachedLowerLeft = + { + 32, 40, 33, 41, 34, 42, 35, 43, + 48, 56, 49, 57, 50, 58, 51, 59 + }; + Vector expectedCachedLowerRight = + { + 36, 44, 37, 45, 38, 46, 39, 47, + 52, 60, 53, 61, 54, 62, 55, 63 + }; + // clang-format on + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iCache = schedule.Split(i, cacheRows); + auto iBlock = schedule.Split(i, blockSize); + auto iStripe = schedule.Split(i, stripeSize); + auto jCache = schedule.Split(j, cacheCols); + + schedule.SetOrder({ iCache, jCache, iBlock, iStripe, j, i }); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = cacheRows * cacheCols; + size_t fillThreshold = blockSize * cacheCols; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + GeneralCachingStrategy cachingProvider{}; + schedule.Cache(cachingProvider, + input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + + // Get a handle to the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + int rawCacheSize = (int)rawCacheValue.GetLayout().NumElements(); + + auto cachedUpperLeft = MakeVector(rawCacheSize); + auto cachedUpperRight = MakeVector(rawCacheSize); + auto cachedLowerLeft = MakeVector(rawCacheSize); + auto cachedLowerRight = MakeVector(rawCacheSize); + + // Add a low level API kernel to access the underlying cache after it has been filled + auto cacheSpyKernel = loopnests::Kernel("cache_spy_kernel") + .Inputs(rawCacheValue, cachedUpperLeft, cachedUpperRight, cachedLowerLeft, cachedLowerRight) + .Indices(iTopLevel, jTopLevel) + .Define([cacheRows, cacheCols](Value rawCacheValue, Vector cachedUpperLeft, Vector cachedUpperRight, Vector cachedLowerLeft, Vector cachedLowerRight, Scalar i, Scalar j) { + auto cacheView = rawCacheValue; + cacheView.SetLayout({ { (int)rawCacheValue.GetLayout().NumElements() } }); + auto vectorCacheView = Vector(cacheView); + If(i == 0, + [&]() { + // TODO : remove nested if's + If(j == 0, + [&]() { + cachedUpperLeft = vectorCacheView; + }) + .ElseIf(j == cacheCols, + [&]() { + cachedUpperRight = vectorCacheView; + }); + }) + .ElseIf(i == cacheRows, + [&]() { + If(j == 0, [&]() { + cachedLowerLeft = vectorCacheView; + }).ElseIf(j == cacheCols, [&]() { + cachedLowerRight = vectorCacheView; + }); + }); + }); + auto cacheSpyPosition = loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::epilogue, { iCache, jCache }, {} }; + nest.GetUnderlyingLoopNest().AddKernel(cacheSpyKernel, cacheSpyPosition); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + auto ok = MakeScalar("ok"); + ok = 1; + auto printError = [&] { + DebugPrint("Upper Left:"); + DebugPrintVector(cachedUpperLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Upper Right:"); + DebugPrintVector(cachedUpperRight); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperRight); + 
DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Lower Left:"); + DebugPrintVector(cachedLowerLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedLowerLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Lower Right:"); + DebugPrintVector(cachedLowerRight); + DebugPrint("\n"); + DebugPrintVector(expectedCachedLowerRight); + DebugPrint("\n"); + DebugPrint("\n"); + }; + // TODO : replace nested if's + If(VerifySame(cachedUpperLeft, expectedCachedUpperLeft) == 0, [&]() { + If(VerifySame(cachedUpperRight, expectedCachedUpperRight) == 0, [&]() { + If(VerifySame(cachedLowerLeft, expectedCachedLowerLeft) == 0, [&]() { + If(VerifySame(cachedLowerRight, expectedCachedLowerRight) == 0, [&]() { + ok = 0; + }).Else(printError); + }).Else(printError); + }).Else(printError); + }).Else(printError); + return ok; +} + +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(int M, int N, int cacheRows, int cacheCols, int blockSize, int stripeSize) +{ + auto input = MakeIncrementingMatrix(M, N, "input"); + auto output = MakeMatrix(M, N, "output"); + auto expectedOutput = MakeIncrementingMatrix(M, N, "expectedOutput"); + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, M) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iCache = schedule.Split(i, cacheRows); + auto iBlock = schedule.Split(i, blockSize); + auto iStripe = schedule.Split(i, stripeSize); + auto jCache = schedule.Split(j, cacheCols); + + schedule.SetOrder({ iCache, jCache, iBlock, iStripe, j, i }); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = cacheRows * cacheCols; + size_t fillThreshold = blockSize * cacheCols; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + GeneralCachingStrategy cachingProvider{}; + schedule.Cache(cachingProvider, + input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + return VerifySame(output, expectedOutput); +} + +// input matrix rows doesn't evenly divide cache rows +// input matrix cols evenly divides cache cols +// blockSize == stripeSize == cacheRows / 2 +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test1() +{ + int M = 7; // M doesn't evenly divide the number of cache rows + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 2; + int stripeSize = 2; + + return GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, blockSize, stripeSize); +} + +// input matrix rows doesn't evenly divide cache rows, does evenly divide blocksize/stripesize +// input matrix cols evenly divides cache cols +// blockSize == stripeSize == cacheRows / 2 +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test2() +{ + int M = 6; // M doesn't evenly divide the number of cache rows + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 2; + int stripeSize = 2; + + return GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(M, 
N, cacheRows, cacheCols, blockSize, stripeSize); +} + +// input matrix rows doesn't evenly divide cache rows, does evenly divide blocksize/stripesize +// input matrix cols doesn't evenly divide cache cols +// blockSize == stripeSize == cacheRows / 2 +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test3() +{ + int M = 7; // M doesn't evenly divide the number of cache rows + int N = 6; + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 2; + int stripeSize = 2; + + return GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, blockSize, stripeSize); +} + +// input matrix rows doesn't evenly divide cache rows, does evenly divide blocksize/stripesize +// input matrix cols doesn't evenly divide cache cols +// blockSize == stripeSize == cacheRows / 2 +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test4() +{ + int M = 6; // M doesn't evenly divide the number of cache rows + int N = 6; + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 2; + int stripeSize = 2; + + return GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, blockSize, stripeSize); +} + +// input matrix rows < cache rows, doesn't evenly divide blocksize/stripesize +// input matrix cols evenly divides cache cols +// blockSize == stripeSize == cacheRows / 2 +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test5() +{ + int M = 3; // M < cache rows + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 2; + int stripeSize = 2; + + return GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, blockSize, stripeSize); +} + +// input matrix rows < cache rows, evenly divides blocksize/stripesize +// input matrix cols evenly divides cache cols +// blockSize == stripeSize == cacheRows / 2 +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test6() +{ + int M = 2; // M < cache rows, evenly divides stripesize + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 2; + int stripeSize = 2; + + return GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, blockSize, stripeSize); +} + +// input matrix rows < cache rows, doesn't evenly divide blocksize/stripesize +// input matrix cols < cache cols +// blockSize == stripeSize == cacheRows / 2 +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test7() +{ + int M = 3; // M < cache rows, doesn't evenly divide stripesize + int N = 3; + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 2; + int stripeSize = 2; + + return GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, blockSize, stripeSize); +} + +// input matrix rows < cache rows, evenly divides blocksize/stripesize +// input matrix cols < cache cols +// blockSize == stripeSize == cacheRows / 2 +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test8() +{ + int M = 2; // M < cache rows, evenly divides stripesize + int N = 2; + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 2; + int stripeSize = 2; + + return GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, 
blockSize, stripeSize); +} + +// input matrix rows multiple of cache rows +// input matrix cols < cache cols +// blockSize == stripeSize == cacheRows / 2 +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test9() +{ + int M = 8; + int N = 2; // N < cache cols + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 2; + int stripeSize = 2; + + return GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, blockSize, stripeSize); +} + +// input matrix rows doesn't evenly divide cache rows +// input matrix cols evenly divides cache cols +// stripeSize == blockSize / 2, blockSize == cacheRows +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test1() +{ + int M = 7; // M doesn't evenly divide the number of cache rows + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 4; + int stripeSize = 2; + + return GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, blockSize, stripeSize); +} + +// input matrix rows doesn't evenly divide cache rows, does evenly divide blocksize/stripesize +// input matrix cols evenly divides cache cols +// stripeSize == blockSize / 2, blockSize == cacheRows +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test2() +{ + int M = 6; // M doesn't evenly divide the number of cache rows + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 4; + int stripeSize = 2; + + return GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, blockSize, stripeSize); +} + +// input matrix rows doesn't evenly divide cache rows, does evenly divide blocksize/stripesize +// input matrix cols doesn't evenly divide cache cols +// stripeSize == blockSize / 2, blockSize == cacheRows +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test3() +{ + int M = 7; // M doesn't evenly divide the number of cache rows + int N = 6; + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 4; + int stripeSize = 2; + + return GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, blockSize, stripeSize); +} + +// input matrix rows doesn't evenly divide cache rows, does evenly divide blocksize/stripesize +// input matrix cols doesn't evenly divide cache cols +// stripeSize == blockSize / 2, blockSize == cacheRows +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test4() +{ + int M = 6; // M doesn't evenly divide the number of cache rows + int N = 6; + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 4; + int stripeSize = 2; + + return GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, blockSize, stripeSize); +} + +// input matrix rows < cache rows, doesn't evenly divide blocksize/stripesize +// input matrix cols evenly divides cache cols +// stripeSize == blockSize / 2, blockSize == cacheRows +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test5() +{ + int M = 3; // M < cache rows + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 4; + int stripeSize = 2; + + return GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, 
blockSize, stripeSize); +} + +// input matrix rows < cache rows, evenly divides blocksize/stripesize +// input matrix cols evenly divides cache cols +// stripeSize == blockSize / 2, blockSize == cacheRows +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test6() +{ + int M = 2; // M < cache rows, evenly divides stripesize + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 4; + int stripeSize = 2; + + return GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, blockSize, stripeSize); +} + +// input matrix rows < cache rows, doesn't evenly divide blocksize/stripesize +// input matrix cols < cache cols +// stripeSize == blockSize / 2, blockSize == cacheRows +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test7() +{ + int M = 3; // M < cache rows, doesn't evenly divide stripesize + int N = 3; + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 4; + int stripeSize = 2; + + return GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, blockSize, stripeSize); +} + +// input matrix rows < cache rows, evenly divides blocksize/stripesize +// input matrix cols < cache cols +// stripeSize == blockSize / 2, blockSize == cacheRows +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test8() +{ + int M = 2; // M < cache rows, evenly divides stripesize + int N = 2; + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 4; + int stripeSize = 2; + + return GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, blockSize, stripeSize); +} + +// input matrix rows multiple of cache rows +// input matrix cols < cache cols +// stripeSize == blockSize / 2, blockSize == cacheRows +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test9() +{ + int M = 8; + int N = 2; // N < cache cols + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 4; + int stripeSize = 2; + + return GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, blockSize, stripeSize); +} + +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner(int M, int N, int cacheRows, int cacheCols, int blockSize, int stripeSize, Vector expectedCachedUpperLeft, Vector expectedCachedUpperRight, Vector expectedCachedLowerLeft, Vector expectedCachedLowerRight) +{ + auto input = MakeIncrementingMatrix(M, N, "input"); + auto output = MakeMatrix(M, N, "output"); + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, M) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iCache = schedule.Split(i, cacheRows); + auto iBlock = schedule.Split(i, blockSize); + auto iStripe = schedule.Split(i, stripeSize); + auto jCache = schedule.Split(j, cacheCols); + + schedule.SetOrder({ iCache, jCache, iBlock, iStripe, j, i }); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = cacheRows * cacheCols; + size_t fillThreshold = blockSize * cacheCols; + std::function reduceFunction = CopyReduce; + auto 
extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + GeneralCachingStrategy cachingProvider{}; + schedule.Cache(cachingProvider, + input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + + // Get a handle to the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + int rawCacheSize = (int)rawCacheValue.GetLayout().NumElements(); + + auto cachedUpperLeft = MakeVector(rawCacheSize); + auto cachedUpperRight = MakeVector(rawCacheSize); + auto cachedLowerLeft = MakeVector(rawCacheSize); + auto cachedLowerRight = MakeVector(rawCacheSize); + + // Add a low level API kernel to access the underlying cache after it has been filled + auto cacheSpyKernel = loopnests::Kernel("cache_spy_kernel") + .Inputs(rawCacheValue, cachedUpperLeft, cachedUpperRight, cachedLowerLeft, cachedLowerRight) + .Indices(iTopLevel, jTopLevel) + .Define([cacheRows, cacheCols](Value rawCacheValue, Vector cachedUpperLeft, Vector cachedUpperRight, Vector cachedLowerLeft, Vector cachedLowerRight, Scalar i, Scalar j) { + auto cacheView = rawCacheValue; + cacheView.SetLayout({ { (int)rawCacheValue.GetLayout().NumElements() } }); + auto vectorCacheView = Vector(cacheView); + If(i == 0, + [&]() { + // TODO : remove nested if's + If(j == 0, + [&]() { + cachedUpperLeft = vectorCacheView; + }) + .ElseIf(j == cacheCols, + [&]() { + cachedUpperRight = vectorCacheView; + }); + }) + .ElseIf(i == cacheRows, + [&]() { + If(j == 0, [&]() { + cachedLowerLeft = vectorCacheView; + }).ElseIf(j == cacheCols, [&]() { + cachedLowerRight = vectorCacheView; + }); + }); + }); + auto cacheSpyPosition = loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::epilogue, { iCache, jCache }, {} }; + nest.GetUnderlyingLoopNest().AddKernel(cacheSpyKernel, cacheSpyPosition); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + auto ok = MakeScalar("ok"); + ok = 1; + auto printError = [&] { + DebugPrint("Upper Left:"); + DebugPrintVector(cachedUpperLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Upper Right:"); + DebugPrintVector(cachedUpperRight); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperRight); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Lower Left:"); + DebugPrintVector(cachedLowerLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedLowerLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Lower Right:"); + DebugPrintVector(cachedLowerRight); + DebugPrint("\n"); + DebugPrintVector(expectedCachedLowerRight); + DebugPrint("\n"); + DebugPrint("\n"); + }; + // TODO : replace nested if's + If(VerifySame(cachedUpperLeft, expectedCachedUpperLeft) == 0, [&]() { + If(VerifySame(cachedUpperRight, expectedCachedUpperRight) == 0, [&]() { + If(VerifySame(cachedLowerLeft, expectedCachedLowerLeft) == 0, [&]() { + If(VerifySame(cachedLowerRight, expectedCachedLowerRight) == 0, [&]() { + ok = 0; + }).Else(printError); + }).Else(printError); + }).Else(printError); + }).Else(printError); + return ok; +} + +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner_LeftCachesOnly(int M, int N, int cacheRows, int cacheCols, int blockSize, int stripeSize, Vector expectedCachedUpperLeft, Vector expectedCachedLowerLeft) +{ + auto input = MakeIncrementingMatrix(M, N, "input"); + auto output = MakeMatrix(M, N, "output"); + Index i("i"), j("j"); + auto nest = Using({ input 
}, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, M) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iCache = schedule.Split(i, cacheRows); + auto iBlock = schedule.Split(i, blockSize); + auto iStripe = schedule.Split(i, stripeSize); + auto jCache = schedule.Split(j, cacheCols); + + schedule.SetOrder({ iCache, jCache, iBlock, iStripe, j, i }); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = cacheRows * cacheCols; + size_t fillThreshold = blockSize * cacheCols; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + GeneralCachingStrategy cachingProvider{}; + schedule.Cache(cachingProvider, + input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + + // Get a handle to the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + int rawCacheSize = (int)rawCacheValue.GetLayout().NumElements(); + + auto cachedUpperLeft = MakeVector(rawCacheSize); + auto cachedLowerLeft = MakeVector(rawCacheSize); + + // Add a low level API kernel to access the underlying cache after it has been filled + auto cacheSpyKernel = loopnests::Kernel("cache_spy_kernel") + .Inputs(rawCacheValue, cachedUpperLeft, cachedLowerLeft) + .Indices(iTopLevel, jTopLevel) + .Define([cacheRows](Value rawCacheValue, Vector cachedUpperLeft, Vector cachedLowerLeft, Scalar i, Scalar j) { + auto cacheView = rawCacheValue; + cacheView.SetLayout({ { (int)rawCacheValue.GetLayout().NumElements() } }); + auto vectorCacheView = Vector(cacheView); + If(i == 0, + [&]() { + // TODO : remove nested if's + If(j == 0, + [&]() { + cachedUpperLeft = vectorCacheView; + }); + }) + .ElseIf(i == cacheRows, + [&]() { + If(j == 0, [&]() { + cachedLowerLeft = vectorCacheView; + }); + }); + }); + auto cacheSpyPosition = loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::epilogue, { iCache, jCache }, {} }; + nest.GetUnderlyingLoopNest().AddKernel(cacheSpyKernel, cacheSpyPosition); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + auto ok = MakeScalar("ok"); + ok = 1; + auto printError = [&] { + DebugPrint("Upper Left:"); + DebugPrintVector(cachedUpperLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Lower Left:"); + DebugPrintVector(cachedLowerLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedLowerLeft); + DebugPrint("\n"); + DebugPrint("\n"); + }; + // TODO : replace nested if's + If(VerifySame(cachedUpperLeft, expectedCachedUpperLeft) == 0, [&]() { + If(VerifySame(cachedLowerLeft, expectedCachedLowerLeft) == 0, [&]() { + ok = 0; + }).Else(printError); + }).Else(printError); + return ok; +} + +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner_UpperCachesOnly(int M, int N, int cacheRows, int cacheCols, int blockSize, int stripeSize, Vector expectedCachedUpperLeft, Vector expectedCachedUpperRight) +{ + auto input = MakeIncrementingMatrix(M, N, "input"); + auto output = MakeMatrix(M, N, "output"); + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, M) + .ForAll(j, 0, N) + 
.Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iCache = schedule.Split(i, cacheRows); + auto iBlock = schedule.Split(i, blockSize); + auto iStripe = schedule.Split(i, stripeSize); + auto jCache = schedule.Split(j, cacheCols); + + schedule.SetOrder({ iCache, jCache, iBlock, iStripe, j, i }); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = cacheRows * cacheCols; + size_t fillThreshold = blockSize * cacheCols; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + GeneralCachingStrategy cachingProvider{}; + schedule.Cache(cachingProvider, + input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + + // Get a handle to the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + int rawCacheSize = (int)rawCacheValue.GetLayout().NumElements(); + + auto cachedUpperLeft = MakeVector(rawCacheSize); + auto cachedUpperRight = MakeVector(rawCacheSize); + + // Add a low level API kernel to access the underlying cache after it has been filled + auto cacheSpyKernel = loopnests::Kernel("cache_spy_kernel") + .Inputs(rawCacheValue, cachedUpperLeft, cachedUpperRight) + .Indices(iTopLevel, jTopLevel) + .Define([cacheCols](Value rawCacheValue, Vector cachedUpperLeft, Vector cachedUpperRight, Scalar i, Scalar j) { + auto cacheView = rawCacheValue; + cacheView.SetLayout({ { (int)rawCacheValue.GetLayout().NumElements() } }); + auto vectorCacheView = Vector(cacheView); + If(i == 0, + [&]() { + // TODO : remove nested if's + If(j == 0, + [&]() { + cachedUpperLeft = vectorCacheView; + }) + .ElseIf(j == cacheCols, + [&]() { + cachedUpperRight = vectorCacheView; + }); + }); + }); + auto cacheSpyPosition = loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::epilogue, { iCache, jCache }, {} }; + nest.GetUnderlyingLoopNest().AddKernel(cacheSpyKernel, cacheSpyPosition); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + auto ok = MakeScalar("ok"); + ok = 1; + auto printError = [&] { + DebugPrint("Upper Left:"); + DebugPrintVector(cachedUpperLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Upper Right:"); + DebugPrintVector(cachedUpperRight); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperRight); + DebugPrint("\n"); + DebugPrint("\n"); + }; + // TODO : replace nested if's + If(VerifySame(cachedUpperLeft, expectedCachedUpperLeft) == 0, [&]() { + If(VerifySame(cachedUpperRight, expectedCachedUpperRight) == 0, [&]() { + ok = 0; + }).Else(printError); + }).Else(printError); + return ok; +} + +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner_UpperLeftCacheOnly(int M, int N, int cacheRows, int cacheCols, int blockSize, int stripeSize, Vector expectedCachedUpperLeft) +{ + auto input = MakeIncrementingMatrix(M, N, "input"); + auto output = MakeMatrix(M, N, "output"); + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, M) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto 
jTopLevel = j; + + auto iCache = schedule.Split(i, cacheRows); + auto iBlock = schedule.Split(i, blockSize); + auto iStripe = schedule.Split(i, stripeSize); + auto jCache = schedule.Split(j, cacheCols); + + schedule.SetOrder({ iCache, jCache, iBlock, iStripe, j, i }); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = cacheRows * cacheCols; + size_t fillThreshold = blockSize * cacheCols; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + GeneralCachingStrategy cachingProvider{}; + schedule.Cache(cachingProvider, + input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + + // Get a handle to the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + int rawCacheSize = (int)rawCacheValue.GetLayout().NumElements(); + + auto cachedUpperLeft = MakeVector(rawCacheSize); + + // Add a low level API kernel to access the underlying cache after it has been filled + auto cacheSpyKernel = loopnests::Kernel("cache_spy_kernel") + .Inputs(rawCacheValue, cachedUpperLeft) + .Indices(iTopLevel, jTopLevel) + .Define([](Value rawCacheValue, Vector cachedUpperLeft, Scalar i, Scalar j) { + auto cacheView = rawCacheValue; + cacheView.SetLayout({ { (int)rawCacheValue.GetLayout().NumElements() } }); + auto vectorCacheView = Vector(cacheView); + If(i == 0, + [&]() { + // TODO : remove nested if's + If(j == 0, + [&]() { + cachedUpperLeft = vectorCacheView; + }); + }); + }); + auto cacheSpyPosition = loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::epilogue, { iCache, jCache }, {} }; + nest.GetUnderlyingLoopNest().AddKernel(cacheSpyKernel, cacheSpyPosition); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + return VerifySame(cachedUpperLeft, expectedCachedUpperLeft); +} + +// input matrix rows doesn't evenly divide cache rows +// input matrix cols evenly divides cache cols +// blockSize == stripeSize == cacheRows / 2 +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test1() +{ + int M = 7; // M doesn't evenly divide the number of cache rows + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int smallBlockSize = 2; + int largeBlockSize = 4; + int stripeSize = 2; + + // input + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7] + // [ 8, 9, 10, 11, 12, 13, 14, 15] + // [16, 17, 18, 19, 20, 21, 22, 23] + // [24, 25, 26, 27, 28, 29, 30, 31] + // [32, 33, 34, 35, 36, 37, 38, 39] + // [40, 41, 42, 43, 44, 45, 46, 47] + // [48, 49, 50, 51, 52, 53, 54, 55] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 8, 1, 9, 2, 10, 3, 11, + 16, 24, 17, 25, 18, 26, 19, 27 + }; + Vector expectedCachedUpperRight = + { + 4, 12, 5, 13, 6, 14, 7, 15, + 20, 28, 21, 29, 22, 30, 23, 31 + }; + Vector expectedCachedLowerLeft = + { + 32, 40, 33, 41, 34, 42, 35, 43, + 48, 0, 49, 0, 50, 0, 51, 0 + }; + Vector expectedCachedLowerRight = + { + 36, 44, 37, 45, 38, 46, 39, 47, + 52, 0, 53, 0, 54, 0, 55, 0 + }; + // clang-format on + + auto smallBlockResult = GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, smallBlockSize, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight, expectedCachedLowerLeft, expectedCachedLowerRight); + auto largeBlockResult = GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner(M, N, cacheRows, 
cacheCols, largeBlockSize, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight, expectedCachedLowerLeft, expectedCachedLowerRight); + return smallBlockResult + largeBlockResult; +} + +// input matrix rows doesn't evenly divide cache rows, does evenly divide blocksize/stripesize +// input matrix cols evenly divides cache cols +// blockSize == stripeSize == cacheRows / 2 +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test2() +{ + int M = 6; // M doesn't evenly divide the number of cache rows + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int smallBlockSize = 2; + int largeBlockSize = 4; + int stripeSize = 2; + + // input + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7] + // [ 8, 9, 10, 11, 12, 13, 14, 15] + // [16, 17, 18, 19, 20, 21, 22, 23] + // [24, 25, 26, 27, 28, 29, 30, 31] + // [32, 33, 34, 35, 36, 37, 38, 39] + // [40, 41, 42, 43, 44, 45, 46, 47] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 8, 1, 9, 2, 10, 3, 11, + 16, 24, 17, 25, 18, 26, 19, 27 + }; + Vector expectedCachedUpperRight = + { + 4, 12, 5, 13, 6, 14, 7, 15, + 20, 28, 21, 29, 22, 30, 23, 31 + }; + Vector expectedCachedLowerLeft = + { + 32, 40, 33, 41, 34, 42, 35, 43, + 0, 0, 0, 0, 0, 0, 0, 0 + }; + Vector expectedCachedLowerRight = + { + 36, 44, 37, 45, 38, 46, 39, 47, + 0, 0, 0, 0, 0, 0, 0, 0 + }; + // clang-format on + + auto smallBlockResult = GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, smallBlockSize, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight, expectedCachedLowerLeft, expectedCachedLowerRight); + auto largeBlockResult = GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, largeBlockSize, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight, expectedCachedLowerLeft, expectedCachedLowerRight); + return smallBlockResult + largeBlockResult; +} + +// input matrix rows doesn't evenly divide cache rows, does evenly divide blocksize/stripesize +// input matrix cols doesn't evenly divide cache cols +// blockSize == stripeSize == cacheRows / 2 +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test3() +{ + int M = 7; // M doesn't evenly divide the number of cache rows + int N = 6; + int cacheRows = 4; + int cacheCols = 4; + int smallBlockSize = 2; + int largeBlockSize = 4; + int stripeSize = 2; + + // input + // A: + // [ 0, 1, 2, 3, 4, 5] + // [ 6, 7, 8, 9, 10, 11] + // [12, 13, 14, 15, 16, 17] + // [18, 19, 20, 21, 22, 23] + // [24, 25, 26, 27, 28, 29] + // [30, 31, 32, 33, 34, 35] + // [36, 37, 38, 39, 40, 41] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 6, 1, 7, 2, 8, 3, 9, + 12, 18, 13, 19, 14, 20, 15, 21 + }; + Vector expectedCachedUpperRight = + { + 4, 10, 5, 11, 16, 22, 17, 23, + 0, 0, 0, 0, 0, 0, 0, 0 + }; + Vector expectedCachedLowerLeft = + { + 24, 30, 25, 31, 26, 32, 27, 33, + 36, 0, 37, 0, 38, 0, 39, 0 + }; + Vector expectedCachedLowerRight = + { + 28, 34, 29, 35, 40, 0, 41, 0, + 0, 0, 0, 0, 0, 0, 0, 0 + }; + // clang-format on + + auto smallBlockResult = GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, smallBlockSize, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight, expectedCachedLowerLeft, expectedCachedLowerRight); + auto largeBlockResult = GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, 
largeBlockSize, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight, expectedCachedLowerLeft, expectedCachedLowerRight); + return smallBlockResult + largeBlockResult; +} + +// input matrix rows doesn't evenly divide cache rows, does evenly divide blocksize/stripesize +// input matrix cols doesn't evenly divide cache cols +// blockSize == stripeSize == cacheRows / 2 +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test4() +{ + int M = 6; // M doesn't evenly divide the number of cache rows + int N = 6; + int cacheRows = 4; + int cacheCols = 4; + int smallBlockSize = 2; + int largeBlockSize = 4; + int stripeSize = 2; + + // input + // A: + // [ 0, 1, 2, 3, 4, 5], + // [ 6, 7, 8, 9, 10, 11], + // [12, 13, 14, 15, 16, 17], + // [18, 19, 20, 21, 22, 23], + // [24, 25, 26, 27, 28, 29], + // [30, 31, 32, 33, 34, 35] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 6, 1, 7, 2, 8, 3, 9, + 12, 18, 13, 19, 14, 20, 15, 21 + }; + Vector expectedCachedUpperRight = + { + 4, 10, 5, 11, 16, 22, 17, 23, + 0, 0, 0, 0, 0, 0, 0, 0 + }; + Vector expectedCachedLowerLeft = + { + 24, 30, 25, 31, 26, 32, 27, 33, + 0, 0, 0, 0, 0, 0, 0, 0 + }; + Vector expectedCachedLowerRight = + { + 28, 34, 29, 35, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 + }; + // clang-format on + + auto smallBlockResult = GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, smallBlockSize, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight, expectedCachedLowerLeft, expectedCachedLowerRight); + auto largeBlockResult = GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, largeBlockSize, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight, expectedCachedLowerLeft, expectedCachedLowerRight); + return smallBlockResult + largeBlockResult; +} + +// input matrix rows < cache rows, doesn't evenly divide blocksize/stripesize +// input matrix cols evenly divides cache cols +// blockSize == stripeSize == cacheRows / 2 +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test5() +{ + int M = 3; // M < cache rows + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int smallBlockSize = 2; + int largeBlockSize = 4; + int stripeSize = 2; + + // input + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7], + // [ 8, 9, 10, 11, 12, 13, 14, 15], + // [16, 17, 18, 19, 20, 21, 22, 23] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 8, 1, 9, 2, 10, 3, 11, + 16, 0, 17, 0, 18, 0, 19, 0 + }; + Vector expectedCachedUpperRight = + { + 4, 12, 5, 13, 6, 14, 7, 15, + 20, 0, 21, 0, 22, 0, 23, 0 + }; + // clang-format on + + auto smallBlockResult = GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner_UpperCachesOnly(M, N, cacheRows, cacheCols, smallBlockSize, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight); + auto largeBlockResult = GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner_UpperCachesOnly(M, N, cacheRows, cacheCols, largeBlockSize, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight); + return smallBlockResult + largeBlockResult; +} + +// input matrix rows < cache rows, evenly divides blocksize/stripesize +// input matrix cols evenly divides cache cols +// blockSize == stripeSize == cacheRows / 2 +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test6() +{ + int M = 2; // M < cache rows, evenly divides stripesize + 
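+ // With M == 2 a single stripe covers the whole input, so only the first
+ // M * cacheCols entries of each cache block get filled; the trailing entries
+ // stay zero, as the expected vectors below encode.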
int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int smallBlockSize = 2; + int largeBlockSize = 4; + int stripeSize = 2; + + // input + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7], + // [ 8, 9, 10, 11, 12, 13, 14, 15] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 8, 1, 9, 2, 10, 3, 11, + 0, 0, 0, 0, 0, 0, 0, 0 + }; + Vector expectedCachedUpperRight = + { + 4, 12, 5, 13, 6, 14, 7, 15, + 0, 0, 0, 0, 0, 0, 0, 0 + }; + // clang-format on + + auto smallBlockResult = GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner_UpperCachesOnly(M, N, cacheRows, cacheCols, smallBlockSize, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight); + auto largeBlockResult = GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner_UpperCachesOnly(M, N, cacheRows, cacheCols, largeBlockSize, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight); + return smallBlockResult + largeBlockResult; +} + +// input matrix rows < cache rows, doesn't evenly divide blocksize/stripesize +// input matrix cols < cache cols +// blockSize == stripeSize == cacheRows / 2 +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test7() +{ + int M = 3; // M < cache rows, doesn't evenly divide stripesize + int N = 3; + int cacheRows = 4; + int cacheCols = 4; + int smallBlockSize = 2; + int largeBlockSize = 4; + int stripeSize = 2; + + // input + // A: + // [0, 1, 2], + // [3, 4, 5], + // [6, 7, 8] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 3, 1, 4, 2, 5, 6, 0, + 7, 0, 8, 0, 0, 0, 0, 0 + }; + // clang-format on + auto smallBlockResult = GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner_UpperLeftCacheOnly(M, N, cacheRows, cacheCols, smallBlockSize, stripeSize, expectedCachedUpperLeft); + auto largeBlockResult = GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner_UpperLeftCacheOnly(M, N, cacheRows, cacheCols, largeBlockSize, stripeSize, expectedCachedUpperLeft); + return smallBlockResult + largeBlockResult; +} + +// input matrix rows < cache rows, evenly divides blocksize/stripesize +// input matrix cols < cache cols +// blockSize == stripeSize == cacheRows / 2 +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test8() +{ + int M = 2; // M < cache rows, evenly divides stripesize + int N = 2; + int cacheRows = 4; + int cacheCols = 4; + int smallBlockSize = 2; + int largeBlockSize = 4; + int stripeSize = 2; + + // input + // A: + // [0, 1] + // [2, 3] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 2, 1, 3, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 + }; + // clang-format on + + auto smallBlockResult = GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner_UpperLeftCacheOnly(M, N, cacheRows, cacheCols, smallBlockSize, stripeSize, expectedCachedUpperLeft); + auto largeBlockResult = GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner_UpperLeftCacheOnly(M, N, cacheRows, cacheCols, largeBlockSize, stripeSize, expectedCachedUpperLeft); + return smallBlockResult + largeBlockResult; +} + +// input matrix rows multiple of cache rows +// input matrix cols < cache cols +// blockSize == stripeSize == cacheRows / 2 +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test9() +{ + int M = 8; + int N = 2; // N < cache cols + int cacheRows = 4; + int cacheCols = 4; + int smallBlockSize = 2; + int largeBlockSize = 
4; + int stripeSize = 2; + + // input + // A: + // [ 0, 1], + // [ 2, 3], + // [ 4, 5], + // [ 6, 7], + // [ 8, 9], + // [10, 11], + // [12, 13], + // [14, 15] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 2, 1, 3, 4, 6, 5, 7, + 0, 0, 0, 0, 0, 0, 0, 0 + }; + Vector expectedCachedLowerLeft = + { + 8, 10, 9, 11, 12, 14, 13, 15, + 0, 0, 0, 0, 0, 0, 0, 0 + }; + // clang-format on + + auto smallBlockResult = GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner_LeftCachesOnly(M, N, cacheRows, cacheCols, smallBlockSize, stripeSize, expectedCachedUpperLeft, expectedCachedLowerLeft); + auto largeBlockResult = GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner_LeftCachesOnly(M, N, cacheRows, cacheCols, largeBlockSize, stripeSize, expectedCachedUpperLeft, expectedCachedLowerLeft); + return smallBlockResult + largeBlockResult; +} + +} // namespace ell diff --git a/libraries/value/test/src/Functions_test.cpp b/libraries/value/test/src/Functions_test.cpp new file mode 100644 index 000000000..b28ecb1fc --- /dev/null +++ b/libraries/value/test/src/Functions_test.cpp @@ -0,0 +1,61 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: Functions_test.cpp (value) +// Authors: Kern Handa, Chuck Jacobs +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "Functions_test.h" +#include "TestUtil.h" + +#include +#include +#include + +#include + +#if !defined(WIN32) +#include +#include +#include +#else +#include +#endif // !defined(WIN32) + +using namespace ell::utilities; +using namespace ell::value; + +#define PRINT_IR 0 + +namespace ell +{ +Scalar FunctionArgType_test() +{ + auto fn = DeclareFunction("FunctionArgType_test") + .Parameters( + Value(ValueType::Float, ScalarLayout), + Value({ValueType::Float, 0}, ScalarLayout), + Value(ValueType::Int32, ScalarLayout), + Value({ValueType::Int32, 0}, ScalarLayout)) + .Returns(Value(ValueType::Int32, ScalarLayout)) + .Define([](Scalar f, Scalar f0, Scalar i, Scalar i0) { + auto ff = MakeScalar<float>(); + auto ff0 = MakeScalar<float>(); + auto ii = MakeScalar<int>(); + auto ii0 = MakeScalar<int>(); + + ff = f; + ff0 = f0; + ii = i; + ii0 = i0; + return Scalar(0); + }); + + auto arg1 = MakeScalar<float>(); + auto arg2 = MakeScalar<float>(); + auto arg3 = MakeScalar<int>(); + auto arg4 = MakeScalar<int>(); + return fn(arg1, arg2, arg3, arg4); +} +} // namespace ell diff --git a/libraries/value/test/src/LoopNestAPI_test.cpp b/libraries/value/test/src/LoopNestAPI_test.cpp new file mode 100644 index 000000000..4133c2c0b --- /dev/null +++ b/libraries/value/test/src/LoopNestAPI_test.cpp @@ -0,0 +1,1090 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: LoopNestAPI_test.cpp (value) +// Authors: Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "LoopNestAPI_test.h" +#include "LoopNest_kernels.h" +#include "TestUtil.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include + +#if 0 // DEBUGGING +#include +#endif + +using namespace ell::utilities; +using namespace ell::value; +using namespace ell::logging; + +namespace ell +{
Scalar LoopNest_api_test1() +{ + auto matrix = MakeMatrix(4, 5); + Index i("i"), j("j"); 
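+ // loopnest_kernel is declared in LoopNest_kernels.h; judging from the check
+ // below it presumably computes m(i, j) = 2 * i + 5 * j (2 * 2 + 5 * 3 == 19).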
+ + Using({ matrix }, ArgumentType::Output) + .ForAll(i, 0, 4) + .ForAll(j, 0, 5) + .Do(loopnest_kernel) + .Run(); + + return matrix(2, 3) - 19; // will return 0 if calculation is correct +} + +Scalar LoopNest_api_test2() +{ + auto matrix = MakeMatrix(4, 5); + Index i("i"), j("j"); + + auto nest = Using({ matrix }, ArgumentType::Output) + .ForAll(i, 0, 4) + .ForAll(j, 0, 5) + .Do(loopnest_kernel); + + nest.GetSchedule().Split(i, 2); + + nest.Run(); + + return matrix(2, 3) - 19; // will return 0 if calculation is correct +} + +Scalar LoopNest_api_test3() +{ + // Declare the input matrix + std::vector<std::vector<int>> dt{ + std::vector<int>{ 1, 2, 3 }, + std::vector<int>{ 4, 5, 6 }, + }; + auto matrix = Matrix(dt); + // Declare the output matrix and initialize its values to 10. + auto output = MakeMatrix(static_cast<int>(matrix.Rows()), static_cast<int>(matrix.Columns())); + For(output, [&](Scalar row, Scalar column) { + output(row, column) = 10; + }); + + Index i("i"), j("j"); + + //// Use a Loopnest to call loopnest_kernel_3 for each element of the input matrix and write the result to + //// our output. + Using({ output }, ArgumentType::Output) + .Using({ matrix }, ArgumentType::Input) + .ForAll(i, 0, static_cast<int>(matrix.Rows())) + .ForAll(j, 0, static_cast<int>(matrix.Columns())) + .Do(loopnest_kernel_3) + .Run(); + + // loopnest_kernel_3 will add the input element to the output element. + // Since we initialized the output to 10, we expect the result to be + // 10 greater than the input. + std::vector<int> expectedValues{ 11, 12, 13, 14, 15, 16 }; + auto expected = Vector(expectedValues); + + // View the result as a Vector + Vector actual = AsVector(AsFullView(output)); + + // Verify that the actual result is what we expect + return VerifySame(actual, expected); +} + +Scalar LoopNest_api_test4() +{ + // Declare the output matrix and initialize its values to 0. + auto output = MakeMatrix(2, 6); + + Index i("i"), j("j"); + + // Use a Loopnest to call loopnest_kernel_4 for each element of the input matrix and write the result to + // our output. + auto nest = Using({ output }, ArgumentType::Output) + .Using({ output }, ArgumentType::Input) // this isn't how you'd write in real life, hopefully (using the same memory for both input and output) + .ForAll(i, 0, static_cast<int>(output.Rows())) + .ForAll(j, 0, static_cast<int>(output.Columns())) + .Do(loopnest_kernel_4); + + nest.GetSchedule().Split(j, 2); + + nest.Run(); + + // loopnest_kernel_4 will multiply row by 10 and add the column. + std::vector<int> expectedValues{ 0, 1, 2, 3, 4, 5, 10, 11, 12, 13, 14, 15 }; + auto expected = Vector(expectedValues); + + // View the result as a Vector + Vector actual = AsVector(AsFullView(output)); + + // Verify that the actual result is what we expect + return VerifySame(actual, expected); +} + +Scalar LoopNest_api_test5() +{ + // Declare the output matrix and initialize its values to 0. + auto output = MakeMatrix(2, 8); + + Index i("i"), j("j"); + + // Use a Loopnest to call loopnest_kernel_4 for each element of the input matrix and write the result to + // our output. + auto nest = Using({ output }, ArgumentType::Output) + .Using({ output }, ArgumentType::Input) // this isn't how you'd write in real life, hopefully (using the same memory for both input and output) + .ForAll(i, 0, static_cast<int>(output.Rows())) + .ForAll(j, 0, static_cast<int>(output.Columns())) + .Do(loopnest_kernel_4); + + auto& schedule = nest.GetSchedule(); + schedule.Split(j, 4); + schedule.Split(j, 2); + + nest.Run(); + + // loopnest_kernel_4 will multiply row by 10 and add the column. 
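+ // The two splits partition j into blocks of 4 and sub-blocks of 2; since each
+ // element is written independently, the reordering cannot change the result,
+ // which is what the expected values below verify.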
+ std::vector expectedValues{ 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, 16, 17 }; + auto expected = Vector(expectedValues); + + // View the result as a Vector + Vector actual = AsVector(AsFullView(output)); + + // Verify that the actual result is what we expect + return VerifySame(actual, expected); +} + +Scalar LoopNest_api_Parallelized_test1() +{ + Scalar ok = Allocate(ScalarLayout); + auto matrix = MakeMatrix(4, 5); + InvokeForContext([&] { + auto v = matrix.GetValue().Get().GetDataAs(); + v->setName("matrix"); + }); + + Index i("i"), j("j"); + + auto nest = Using({ matrix }, ArgumentType::Output) + .ForAll(i, 0, 4) + .ForAll(j, 0, 5) + .Do([](Matrix m, Scalar i, Scalar j) { + Scalar tid = GetTID(); +#if 1 // Useful to turn off/on for debugging + InvokeForContext([&](auto&) { + auto iInt = i.Get(); + auto jInt = j.Get(); + Log() << "m(" << iInt << ", " << jInt << ") = " << (iInt * 2 + jInt * 5) + << " [Thread " << tid.Get() << "]" + << EOL; + }); +#endif // 1 + m(i, j) = i * 2 + j * 5; + }); + + nest.GetSchedule().Parallelize(i, 2); + + nest.Run(); + + ok = matrix(2, 3) - 19; + return ok; // will return 0 if calculation is correct +} + +Scalar LoopNest_api_Parallelized_test2() +{ + Scalar ok = Allocate(ScalarLayout); + + auto matrix = MakeMatrix(4, 5); + Index i("i"), j("j"); + + auto nest = Using({ matrix }, ArgumentType::Output) + .ForAll(i, 0, 4) + .ForAll(j, 0, 5) + .Do([](Matrix m, Scalar i, Scalar j) { + Scalar tid = GetTID(); +#if 1 // Useful to turn off/on for debugging + InvokeForContext([&](auto&) { + auto iInt = i.Get(); + auto jInt = j.Get(); + Log() << "m(" << iInt << ", " << jInt << ") = " << tid.Get() + << " [Thread " << tid.Get() << "]" << EOL; + }); +#endif // 1 + m(i, j) = tid; + }); + + nest.GetSchedule().Parallelize(i, 2); + + nest.Run(); + + auto expected = MakeMatrix(4, 5); + If( + VerifySame(matrix, expected) == 0, + [&] { + ok = 1; + }) + .Else([&] { + auto value = matrix.GetValue(); + value.SetLayout({ { (int)matrix.Size() } }); + DebugPrintVector(value); + DebugPrint("\n"); + }); + return ok; +} + +Scalar LoopNest_api_Unrolled_test1() +{ + auto matrix = MakeMatrix(20, 5); + Index i("i"), j("j"); + + auto nest = Using({ matrix }, ArgumentType::Output) + .ForAll(i, 0, 4) + .ForAll(j, 0, 5) + .Do(loopnest_kernel); + + auto& schedule = nest.GetSchedule(); + + schedule.Parallelize(i, 2); + schedule.Unroll(j); + + nest.Run(); + + return matrix(2, 3) - 19; // will return 0 if calculation is correct +} + +Scalar LoopNest_api_SetOrder_test1() +{ + auto matrix = MakeMatrix(4, 5); + Index i("i"), j("j"); + + auto nest = Using({ matrix }, ArgumentType::Output) + .ForAll(i, 0, 4) + .ForAll(j, 0, 5) + .Do(loopnest_kernel); + + auto& schedule = nest.GetSchedule(); + auto i_o = schedule.Split(i, 2); + schedule.SetOrder({ i_o, j, i }); + + nest.Run(); + + return matrix(2, 3) - 19; // will return 0 if calculation is correct +} + +Scalar LoopNest_api_CachedMatrix_test1() +{ + const int N = 4; + auto A = MakeMatrix(N, N, "A"); + For(A, [&](Scalar i, Scalar j) { + A(i, j) = i - j; + }); + + // A: + // [ 0, -1, -2, -3 ] + // [ 1, 0, -1, -2 ] + // [ 2, 1, 0, -1 ] + // [ 3, 2, 1, 0 ] + + Index i("i"), j("j"); + + auto nest = Using({ A }, ArgumentType::InputOutput) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .Do(addOne); + + nest.GetSchedule().Cache( + CreateCacheFor(A) + .Size({ N, N }) + .Using({ i, j }) + .Type(SubMatrixCopyInCopyOutCache{})); + + nest.Run(); + + return A(2, 0) + A(0, 2) - 2; // will return 0 if calculation is correct +} + +Scalar 
LoopNest_api_SlidingCachedMatrix_test() +{ + const int N = 8; + const int cacheARows = N / 2; + const int cacheACols = N / 2; + + auto A = MakeMatrix(N, N, "A"); + auto B = MakeMatrix(N, N, "B"); + + // initialize A + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + A(i, j) = i - j; + }); + }); + + // The input matrices: + // A: B: + // [ 0, -1, -2, -3, -4, -5, -6, -7] [ 0, 0, ... ] + // [ 1, 0, -1, -2, -3, -4, -5, -6] [ 0, 0, ... ] + // [ 2, 1, 0, -1, -2, -3, -4, -5] [ 0, 0, ... ] + // [ 3, 2, 1, 0, -1, -2, -3, -4] [ 0, 0, ... ] + // [ 4, 3, 2, 1, 0, -1, -2, -3] [ 0, 0, ... ] + // [ 5, 4, 3, 2, 1, 0, -1, -2] [ 0, 0, ... ] + // [ 6, 5, 4, 3, 2, 1, 0, -1] [ 0, 0, ... ] + // [ 7, 6, 5, 4, 3, 2, 1, 0] [ 0, 0, ... ] + + Index i("i"), j("j"); + auto nest = Using({ A }, ArgumentType::Input) + .Using({ B }, ArgumentType::InputOutput) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .Do([](Matrix A, Matrix B, Scalar i, Scalar j) { + B(i, j) = A(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto i_o = schedule.Split(i, cacheARows); + auto j_o = schedule.Split(j, cacheACols); + schedule.Cache( + CreateCacheFor(A) + .Size({ cacheARows, cacheACols }) + .Using({ i, j }) + .At({ i_o, j_o }) + .Type(SubMatrixCopyIn{})); + + nest.Run(); + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + If( + VerifySame(A, B) == 0, + [&] { + ok = 0; + }) + .Else([&] { + auto value = B.GetValue(); + value.SetLayout({ { (int)B.Size() } }); + DebugPrintVector(value); + DebugPrint("\n"); + auto expectedValue = A.GetValue(); + expectedValue.SetLayout({ { (int)A.Size() } }); + DebugPrintVector(expectedValue); + DebugPrint("\n"); + }); + return ok; +} + +Scalar SimpleGemm_HighLevelAPI() +{ + const int N = 8; + const int cacheARows = 4; + const int cacheACols = 4; + const int cacheBRows = cacheACols; + const int cacheBCols = N; + const int resultCacheRows = 2; + const int resultCacheCols = 2; + + auto A = MakeMatrix(N, N, "A"); + auto B = MakeMatrix(N, N, "B"); + auto C = MakeMatrix(N, N, "C"); + auto expected = MakeMatrix(N, N, "expected"); + + // initialize A, B, and C + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + A(i, j) = i - j; + B(i, j) = i + 2 * j; + }); + }); + + // fill out expected with a simple for-loop gemm + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + ForRange(N, [&](Scalar k) { + expected(i, j) += A(i, k) * B(k, j); + }); + }); + }); + + // The input matrices: + // A: B: C: + // [ 0, -1, -2, -3, -4, -5, -6, -7] [ 0, 2, 4, 6, 8, 10, 12, 14] [ 0 0 ... ] + // [ 1, 0, -1, -2, -3, -4, -5, -6] [ 1, 3, 5, 7, 9, 11, 13, 15] [ 0 0 ... ] + // [ 2, 1, 0, -1, -2, -3, -4, -5] [ 2, 4, 6, 8, 10, 12, 14, 16] [ ... ] + // [ 3, 2, 1, 0, -1, -2, -3, -4] [ 3, 5, 7, 9, 11, 13, 15, 17] [ ... ] + // [ 4, 3, 2, 1, 0, -1, -2, -3] [ 4, 6, 8, 10, 12, 14, 16, 18] [ ... ] + // [ 5, 4, 3, 2, 1, 0, -1, -2] [ 5, 7, 9, 11, 13, 15, 17, 19] [ ... ] + // [ 6, 5, 4, 3, 2, 1, 0, -1] [ 6, 8, 10, 12, 14, 16, 18, 20] [ ... ] + // [ 7, 6, 5, 4, 3, 2, 1, 0] [ 7, 9, 11, 13, 15, 17, 19, 21] [ ... 
] + + // (A * B) (the desired result): + // [-140, -196, -252, -308, -364, -420, -476, -532] + // [-112, -152, -192, -232, -272, -312, -352, -392] + // [ -84, -108, -132, -156, -180, -204, -228, -252] + // [ -56, -64, -72, -80, -88, -96, -104, -112] + // [ -28, -20, -12, -4, 4, 12, 20, 28] + // [ 0, 24, 48, 72, 96, 120, 144, 168] + // [ 28, 68, 108, 148, 188, 228, 268, 308] + // [ 56, 112, 168, 224, 280, 336, 392, 448] + + Index i("i"), j("j"), k("k"); + auto nest = Using({ A, B }, ArgumentType::Input) + .Using({ C }, ArgumentType::InputOutput) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .ForAll(k, 0, N) + .Do([](Matrix A, Matrix B, Matrix C, Scalar i, Scalar j, Scalar k) { +#if 0 // DEBUGGING + InvokeForContext([&] { + std::cout << "C(" << i.Get() << "," << j.Get() << ") pointing at (kernel): " << std::hex << reinterpret_cast(std::get(C(i, j).GetValue().GetUnderlyingData())) << std::dec << std::endl; + }); +#endif + C(i, j) += A(i, k) * B(k, j); + }); + + auto& schedule = nest.GetSchedule(); + auto i_b_o = schedule.Split(i, cacheARows); + auto k_b_o = schedule.Split(k, cacheACols); + schedule.Cache(CreateCacheFor(A) + .Size({ cacheARows, cacheACols }, utilities::RowMajorMatrixOrder) + .Using({ i_b_o, k_b_o }) + .Type(SubMatrixCopyIn{})); + schedule.Cache(CreateCacheFor(B) + .Size({ cacheBRows, cacheBCols }, utilities::ColumnMajorMatrixOrder) + .Using({ k, j }) + .At({ k_b_o }) + .Type(SubMatrixCopyIn{})); + + auto i_o = schedule.Split(i, resultCacheRows); + auto j_o = schedule.Split(j, resultCacheCols); + schedule.Cache(CreateCacheFor(C) + .Size({ resultCacheRows, resultCacheCols }, utilities::RowMajorMatrixOrder) + .Using({ i_o, j_o }) + .Type(ZeroInputCopyOutMatrixCache{})); + + schedule.SetOrder({ k_b_o, i_b_o, j_o, i_o, k, j, i }); + schedule.Unroll(i); + schedule.Unroll(j); + nest.Run(); + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + If( + VerifySame(C, expected) == 0, + [&] { + ok = 0; + }) + .Else([&] { + auto value = C.GetValue(); + value.SetLayout({ { (int)C.Size() } }); + DebugPrintVector(value); + DebugPrint("\n"); + auto expectedValue = expected.GetValue(); + expectedValue.SetLayout({ { (int)expected.Size() } }); + DebugPrintVector(expectedValue); + DebugPrint("\n"); + }); + return ok; +} + +Scalar SimpleGemm_HighLevelAPI_NoCachingHelper() +{ + const int N = 8; + const int cacheARows = 4; + const int cacheACols = 4; + const int cacheBRows = cacheACols; + const int cacheBCols = N; + const int resultCacheRows = 2; + const int resultCacheCols = 2; + + auto A = MakeMatrix(N, N, "A"); + auto B = MakeMatrix(N, N, "B"); + auto C = MakeMatrix(N, N, "C"); + auto expected = MakeMatrix(N, N, "expected"); + + // initialize A, B, and C + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + A(i, j) = i - j; + B(i, j) = i + 2 * j; + }); + }); + + // fill out expected with a simple for-loop gemm + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + ForRange(N, [&](Scalar k) { + expected(i, j) += A(i, k) * B(k, j); + }); + }); + }); + + // The input matrices: + // A: B: C: + // [ 0, -1, -2, -3, -4, -5, -6, -7] [ 0, 2, 4, 6, 8, 10, 12, 14] [ 0 0 ... ] + // [ 1, 0, -1, -2, -3, -4, -5, -6] [ 1, 3, 5, 7, 9, 11, 13, 15] [ 0 0 ... ] + // [ 2, 1, 0, -1, -2, -3, -4, -5] [ 2, 4, 6, 8, 10, 12, 14, 16] [ ... ] + // [ 3, 2, 1, 0, -1, -2, -3, -4] [ 3, 5, 7, 9, 11, 13, 15, 17] [ ... ] + // [ 4, 3, 2, 1, 0, -1, -2, -3] [ 4, 6, 8, 10, 12, 14, 16, 18] [ ... ] + // [ 5, 4, 3, 2, 1, 0, -1, -2] [ 5, 7, 9, 11, 13, 15, 17, 19] [ ... 
] + // [ 6, 5, 4, 3, 2, 1, 0, -1] [ 6, 8, 10, 12, 14, 16, 18, 20] [ ... ] + // [ 7, 6, 5, 4, 3, 2, 1, 0] [ 7, 9, 11, 13, 15, 17, 19, 21] [ ... ] + + // (A * B) (the desired result): + // [-140, -196, -252, -308, -364, -420, -476, -532] + // [-112, -152, -192, -232, -272, -312, -352, -392] + // [ -84, -108, -132, -156, -180, -204, -228, -252] + // [ -56, -64, -72, -80, -88, -96, -104, -112] + // [ -28, -20, -12, -4, 4, 12, 20, 28] + // [ 0, 24, 48, 72, 96, 120, 144, 168] + // [ 28, 68, 108, 148, 188, 228, 268, 308] + // [ 56, 112, 168, 224, 280, 336, 392, 448] + + Index i("i"), j("j"), k("k"); + auto nest = Using({ A, B }, ArgumentType::Input) + .Using({ C }, ArgumentType::InputOutput) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .ForAll(k, 0, N) + .Do([](Matrix A, Matrix B, Matrix C, Scalar i, Scalar j, Scalar k) { +#if 0 // DEBUGGING + InvokeForContext([&] { + std::cout << "C(" << i.Get() << "," << j.Get() << ") pointing at (kernel): " << std::hex << reinterpret_cast(std::get(C(i, j).GetValue().GetUnderlyingData())) << std::dec << std::endl; + }); +#endif + C(i, j) += A(i, k) * B(k, j); + }); + + auto& schedule = nest.GetSchedule(); + auto i_b_o = schedule.Split(i, cacheARows); + auto k_b_o = schedule.Split(k, cacheACols); + schedule.Cache( + A, + { i, k }, + { cacheARows, cacheACols }, + { i_b_o, k_b_o }, + RowMajorMatrixOrder); + schedule.Cache( + B, + { k, j }, + { cacheBRows, cacheBCols }, + { k_b_o }, + ColumnMajorMatrixOrder); + + auto i_o = schedule.Split(i, resultCacheRows); + auto j_o = schedule.Split(j, resultCacheCols); + schedule.SetOrder({ k_b_o, i_b_o, j_o, i_o, k, j, i }); + schedule.Cache( + C, + { i, j }, + { resultCacheRows, resultCacheCols }, + { i_o, j_o }, + RowMajorMatrixOrder); + + schedule.Unroll(i); + schedule.Unroll(j); + nest.Run(); + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + If( + VerifySame(C, expected) == 0, + [&] { + ok = 0; + }) + .Else([&] { + auto value = C.GetValue(); + value.SetLayout({ { (int)C.Size() } }); + DebugPrintVector(value); + DebugPrint("\n"); + auto expectedValue = expected.GetValue(); + expectedValue.SetLayout({ { (int)expected.Size() } }); + DebugPrintVector(expectedValue); + DebugPrint("\n"); + }); + return ok; +} + +Scalar MLAS_GEMM_GeneralCachingStrategy() +{ + const int OutputRows = 16; + const int InnerDimension = 16; + const int OutputColumns = 16; + const int kUnroll = 4; + const int cacheBRows = InnerDimension / 2; + const int cacheBCols = OutputColumns / 2; + const int stripeSize = cacheBCols / 2; + const int vectorSize = stripeSize / 2; + const int NumRowsInKernel = OutputRows / 8; + const int NumColumnsInKernel = 2 * vectorSize; + + auto A = MakeIncrementingMatrix(OutputRows, InnerDimension, "A"); + auto B = MakeIncrementingMatrix(InnerDimension, OutputColumns, "B"); + auto C = MakeMatrix(OutputRows, OutputColumns, "C"); + + auto expected = MakeMatrix(OutputRows, OutputColumns, "expected"); + ForRange(OutputRows, [&](Scalar m) { + ForRange(OutputColumns, [&](Scalar n) { + ForRange(InnerDimension, [&](Scalar k) { + expected(m, n) += A(m, k) * B(k, n); + }); + }); + }); + + // Declare indexes + loopnests::Index i("i"), j("j"), k("k"); + // Define LoopNest + auto nest = Using({ A, B }, ArgumentType::Input) + .Using({ C }, ArgumentType::Output) + .ForAll(i, 0, OutputRows) + .ForAll(j, 0, OutputColumns) + .ForAll(k, 0, InnerDimension) + .Do([](Matrix A_, Matrix B_, Matrix C_, Scalar i_, Scalar j_, Scalar k_) { + C_(i_, j_) += B_(k_, j_) * A_(i_, k_); + }); + auto& schedule = nest.GetSchedule(); + + auto topLevelI = i; + auto 
topLevelJ = j; + auto topLevelK = k; + + // Declare splits + auto jCache = schedule.Split(j, cacheBCols); + auto kCache = schedule.Split(k, cacheBRows); + auto kBlock = schedule.Split(k, kUnroll); + auto jKernelOuter2 = schedule.Split(j, NumColumnsInKernel); + auto jKernelOuter = schedule.Split(j, vectorSize); + auto iKernelOuter = schedule.Split(i, NumRowsInKernel); + + // Set the order + schedule.SetOrder({ jCache, kCache, iKernelOuter, jKernelOuter2, kBlock, k, i, jKernelOuter, j }); + + // Set up caching + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheBInput"; + size_t maxCacheElts = cacheBRows * cacheBCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + schedule.Cache(B, + { topLevelK, topLevelJ }, + {}, + {}, + std::nullopt, + extraCacheParams); + + ArgumentType argTypeC = ArgumentType::Output; + std::string cacheNameC = "cacheCOutput"; + size_t maxCacheEltsC = NumRowsInKernel * NumColumnsInKernel; + size_t fillThresholdC = maxCacheEltsC; + std::function reduceFunctionC = SumReduce; + auto extraCacheCParams = std::make_tuple(argTypeC, + cacheNameC, + maxCacheEltsC, + fillThresholdC, + reduceFunctionC, + true); + schedule.Cache(C, + { topLevelI, topLevelJ }, + {}, + {}, + std::nullopt, + extraCacheCParams); + + // Set unrolling + schedule.Unroll(jKernelOuter); + schedule.Unroll(i); + schedule.Unroll(k); + +#if 0 // DEBUGGING + auto loop = nest.GetUnderlyingLoopNest(); + DebugDump(loop); +#endif + // Run the generator + nest.Run(); + + If( + VerifySame(C, expected) == 0, + [&] { + }) + .Else([&] { + auto value = C.GetValue(); + value.SetLayout({ { (int)C.Size() } }); + DebugPrintVector(value); + DebugPrint("\n"); + auto expectedValue = expected.GetValue(); + expectedValue.SetLayout({ { (int)expected.Size() } }); + DebugPrintVector(expectedValue); + DebugPrint("\n"); + }); + + return VerifySame(C, expected); +} + +Scalar OneSplitBoundaryTest() +{ + const int M = 4; + const int N = 3; + const int split = 2; + auto input = MakeIncrementingMatrix(M, N, "input"); + auto output = MakeMatrix(M, N, "output"); + auto expectedOutput = MakeIncrementingMatrix(M, N, "expectedOutput"); + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::InputOutput) + .ForAll(i, 0, M) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { +#if 0 // DEBUGGING + InvokeForContext([&] { + std::cout << "inner kernel, (i,j) == (" << i.Get() << ", " << j.Get() << ")" << std::endl; + }); +#endif + output(i, j) = input(i, j); + }); + + auto schedule = nest.GetSchedule(); + + auto topLevelI = i; + auto topLevelJ = j; + auto iSplit = schedule.Split(i, split); + auto jSplit = schedule.Split(j, split); + +#if 0 // DEBUGGING + auto kernel = loopnests::Kernel("split_log_kernel") + .Inputs() + .Indices(iSplit, jSplit) + .Define([](Scalar i, Scalar j) { + InvokeForContext([&] { + std::cout << "simple boundary split log kernel, (i,j) == (" << i.Get() << ", " << j.Get() << ")" << std::endl; + }); + }); + nest.GetUnderlyingLoopNest().AddKernel(kernel); +#endif + + schedule.SetOrder({ iSplit, jSplit, i, j }); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + If( + VerifySame(output, expectedOutput) == 0, + [&] { + ok = 0; + }) + .Else([&] { + auto value = output.GetValue(); + 
value.SetLayout({ { (int)output.Size() } }); + DebugPrintVector(value); + DebugPrint("\n"); + auto expectedValue = expectedOutput.GetValue(); + expectedValue.SetLayout({ { (int)expectedOutput.Size() } }); + DebugPrintVector(expectedValue); + DebugPrint("\n"); + }); + return ok; +} + +Scalar TwoSplitBoundaryTest() +{ + const int M = 8; + const int N = 7; + const int bigSplit = 4; + const int smallSplit = 2; + auto input = MakeIncrementingMatrix(M, N, "input"); + auto output = MakeMatrix(M, N, "output"); + auto expectedOutput = MakeIncrementingMatrix(M, N, "expectedOutput"); + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::InputOutput) + .ForAll(i, 0, M) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { +#if 0 // DEBUGGING + InvokeForContext([&] { + std::cout << "inner kernel, (i,j) == (" << i.Get() << ", " << j.Get() << ")" << std::endl; + }); +#endif + output(i, j) = input(i, j); + }); + + auto schedule = nest.GetSchedule(); + + auto topLevelI = i; + auto topLevelJ = j; + auto iBigSplit = schedule.Split(i, bigSplit); + auto jBigSplit = schedule.Split(j, bigSplit); + auto iSmallSplit = schedule.Split(i, smallSplit); + auto jSmallSplit = schedule.Split(j, smallSplit); + +#if 0 // DEBUGGING + auto kernel = loopnests::Kernel("big_split_log_kernel") + .Inputs() + .Indices(iBigSplit, jBigSplit) + .Define([](Scalar i, Scalar j) { + InvokeForContext([&] { + std::cout << "simple boundary big split log kernel, (i,j) == (" << i.Get() << ", " << j.Get() << ")" << std::endl; + }); + }); + nest.GetUnderlyingLoopNest().AddKernel(kernel); + auto kernel2 = loopnests::Kernel("small_split_log_kernel") + .Inputs() + .Indices(iSmallSplit, jSmallSplit) + .Define([](Scalar i, Scalar j) { + InvokeForContext([&] { + std::cout << "simple boundary Small split log kernel, (i,j) == (" << i.Get() << ", " << j.Get() << ")" << std::endl; + }); + }); + nest.GetUnderlyingLoopNest().AddKernel(kernel2); +#endif + + schedule.SetOrder({ iBigSplit, jBigSplit, iSmallSplit, jSmallSplit, i, j }); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + If( + VerifySame(output, expectedOutput) == 0, + [&] { + ok = 0; + }) + .Else([&] { + auto value = output.GetValue(); + value.SetLayout({ { (int)output.Size() } }); + DebugPrintVector(value); + DebugPrint("\n"); + auto expectedValue = expectedOutput.GetValue(); + expectedValue.SetLayout({ { (int)expectedOutput.Size() } }); + DebugPrintVector(expectedValue); + DebugPrint("\n"); + }); + return ok; +} + +Scalar SplitLargerThanSizeBoundaryTest() +{ + const int M = 8; + const int N = 3; + const int split = 4; + auto input = MakeIncrementingMatrix(M, N, "input"); + auto output = MakeMatrix(M, N, "output"); + auto expectedOutput = MakeIncrementingMatrix(M, N, "expectedOutput"); + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::InputOutput) + .ForAll(i, 0, M) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { +#if 0 // DEBUGGING + InvokeForContext([&] { + std::cout << "inner kernel, (i,j) == (" << i.Get() << ", " << j.Get() << ")" << std::endl; + }); +#endif + output(i, j) = input(i, j); + }); + + auto schedule = nest.GetSchedule(); + + auto topLevelI = i; + auto topLevelJ = j; + auto iSplit = schedule.Split(i, split); + auto jSplit = schedule.Split(j, split); + +#if 0 // DEBUGGING + auto kernel = 
loopnests::Kernel("split_log_kernel") + .Inputs() + .Indices(iSplit, jSplit) + .Define([](Scalar i, Scalar j) { + InvokeForContext([&] { + std::cout << "simple boundary split log kernel, (i,j) == (" << i.Get() << ", " << j.Get() << ")" << std::endl; + }); + }); + nest.GetUnderlyingLoopNest().AddKernel(kernel); +#endif + + schedule.SetOrder({ iSplit, jSplit, i, j }); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + If( + VerifySame(output, expectedOutput) == 0, + [&] { + ok = 0; + }) + .Else([&] { + auto value = output.GetValue(); + value.SetLayout({ { (int)output.Size() } }); + DebugPrintVector(value); + DebugPrint("\n"); + auto expectedValue = expectedOutput.GetValue(); + expectedValue.SetLayout({ { (int)expectedOutput.Size() } }); + DebugPrintVector(expectedValue); + DebugPrint("\n"); + }); + return ok; +} + +Scalar TwoSplitsLargerThanSizeBoundaryTest() +{ + const int M = 8; + const int N = 3; + const int bigSplit = 4; + const int smallSplit = 2; + auto input = MakeIncrementingMatrix(M, N, "input"); + auto output = MakeMatrix(M, N, "output"); + auto expectedOutput = MakeIncrementingMatrix(M, N, "expectedOutput"); + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::InputOutput) + .ForAll(i, 0, M) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { +#if 0 // DEBUGGING + InvokeForContext([&] { + std::cout << "inner kernel, (i,j) == (" << i.Get() << ", " << j.Get() << ")" << std::endl; + }); +#endif + output(i, j) = input(i, j); + }); + + auto schedule = nest.GetSchedule(); + + auto topLevelI = i; + auto topLevelJ = j; + auto iBigSplit = schedule.Split(i, bigSplit); + auto jBigSplit = schedule.Split(j, bigSplit); + auto iSmallSplit = schedule.Split(i, smallSplit); + auto jSmallSplit = schedule.Split(j, smallSplit); + +#if 0 // DEBUGGING + auto kernel = loopnests::Kernel("big_split_log_kernel") + .Inputs() + .Indices(iBigSplit, jBigSplit) + .Define([](Scalar i, Scalar j) { + InvokeForContext([&] { + std::cout << "simple boundary big split log kernel, (i,j) == (" << i.Get() << ", " << j.Get() << ")" << std::endl; + }); + }); + nest.GetUnderlyingLoopNest().AddKernel(kernel); + auto kernel2 = loopnests::Kernel("small_split_log_kernel") + .Inputs() + .Indices(iSmallSplit, jSmallSplit) + .Define([](Scalar i, Scalar j) { + InvokeForContext([&] { + std::cout << "simple boundary Small split log kernel, (i,j) == (" << i.Get() << ", " << j.Get() << ")" << std::endl; + }); + }); + nest.GetUnderlyingLoopNest().AddKernel(kernel2); +#endif + + schedule.SetOrder({ iBigSplit, jBigSplit, iSmallSplit, jSmallSplit, i, j }); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + If( + VerifySame(output, expectedOutput) == 0, + [&] { + ok = 0; + }) + .Else([&] { + auto value = output.GetValue(); + value.SetLayout({ { (int)output.Size() } }); + DebugPrintVector(value); + DebugPrint("\n"); + auto expectedValue = expectedOutput.GetValue(); + expectedValue.SetLayout({ { (int)expectedOutput.Size() } }); + DebugPrintVector(expectedValue); + DebugPrint("\n"); + }); + return ok; +} + +Scalar LoopNest_api_tunable_parameters_test1() +{ + auto ok = MakeScalar(); + + // loopnest_kernel_4 will multiply row by 10 and add the column. 
+    std::vector expectedValues{ 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, 16, 17 };
+    auto expected = Vector(expectedValues);
+
+    TunableParameter j_o{ std::vector{ 2, 4 }, "j_o" };
+    TunableParameter j_o_o{ std::vector{ 1, 2 }, "j_o_o" };
+    TuningEngine engine(j_o, j_o_o);
+    do
+    {
+        // Declare the output matrix and initialize its values to 0.
+        auto output = MakeMatrix(2, 8);
+
+        (void)DeclareFunction("LoopNest_tunable_" + engine.ToString())
+            .Decorated(false)
+            .Parameters(output)
+            .Define([&](Matrix matrix) {
+                Index i("i"), j("j");
+
+                auto nest = Using({ matrix }, ArgumentType::InputOutput)
+                                .ForAll(i, 0, static_cast(output.Rows()))
+                                .ForAll(j, 0, static_cast(output.Columns()))
+                                .Do([](Matrix m, Scalar i, Scalar j) {
+                                    Scalar v = Allocate(m.Type(), ScalarLayout);
+
+                                    v = i * 10;
+                                    v += j;
+
+                                    m(i, j) = v;
+                                });
+
+                auto& schedule = nest.GetSchedule();
+                schedule.Split(j, j_o);
+                schedule.Split(j, j_o_o);
+
+                nest.Run();
+            })(output);
+
+        // View the result as a Vector
+        Vector actual = AsVector(AsFullView(output));
+
+        // Verify that the actual result is what we expect
+        If(ok == 0, [&] { ok = VerifySame(actual, expected); });
+    } while (engine.Next());
+
+    return ok;
+}
+} // namespace ell
diff --git a/libraries/value/test/src/LoopNest_convolution_test.cpp b/libraries/value/test/src/LoopNest_convolution_test.cpp
new file mode 100644
index 000000000..a4dbe8bfe
--- /dev/null
+++ b/libraries/value/test/src/LoopNest_convolution_test.cpp
@@ -0,0 +1,198 @@
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Project: Embedded Learning Library (ELL)
+// File: LoopNest_convolution_test.cpp (value)
+// Authors: Mason Remy
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "LoopNest_convolution_test.h"
+#include "TestUtil.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+
+using namespace ell::emitters;
+using namespace ell::utilities;
+using namespace ell::logging;
+using namespace ell::value;
+using namespace ell::value::loopnests;
+
+namespace ell
+{
+// Tests of convolution via LoopNests
+
+int GetOutputDimensionSize(int inputSize, int receptiveFieldSize, int stride, int paddingSize)
+{
+    return (inputSize + 2 * paddingSize - receptiveFieldSize) / stride + 1;
+}
+
+struct ConvolutionConfig
+{
+    ConvolutionConfig(const std::vector& inputSizes,
+                      int outputFilters,
+                      const std::vector& receptiveFieldSize,
+                      const std::vector& strideSize,
+                      const std::vector& paddingSize,
+                      const std::vector& inputBlockSizes,
+                      const std::vector& outputBlockSizes)
+    {
+        outputSize[2] = outputFilters;
+        for (int dim = 0; dim < 3; dim++)
+        {
+            inputSize[dim] = inputSizes[dim];
+            inputBlockSize[dim] = inputBlockSizes[dim];
+            outputBlockSize[dim] = outputBlockSizes[dim];
+
+            // Values that are only computed in the row/column dimensions
+            if (dim < 2)
+            {
+                receptiveField[dim] = receptiveFieldSize[dim];
+                stride[dim] = strideSize[dim];
+                padding[dim] = paddingSize[dim];
+                outputSize[dim] = GetOutputDimensionSize(inputSize[dim], receptiveFieldSize[dim], strideSize[dim], paddingSize[dim]);
+            }
+
+            if (inputBlockSize[dim] > 0)
+            {
+                inputBlockCount[dim] = inputSize[dim] / inputBlockSize[dim];
+                if (inputSize[dim] % inputBlockSize[dim] != 0)
+                {
+                    inputBlockCount[dim]++;
+
} + } + + if (outputBlockSize[dim] > 0) + { + outputBlockCount[dim] = outputSize[dim] / outputBlockSize[dim]; + if (outputSize[dim] % outputBlockSize[dim] != 0) + { + outputBlockCount[dim]++; + } + } + } + + weightSize[0] = outputSize[2]; + weightSize[1] = inputSize[2]; + weightSize[2] = receptiveField[0]; + weightSize[3] = receptiveField[1]; + + MemoryShape inputPackedShape = { inputBlockCount[2], inputSize[0], inputSize[1], inputBlockSize[2] }; + MemoryShape inputPackedPadding = { 0, padding[0], padding[1], 0 }; + inputPackedPaddedLayout = { inputPackedShape, inputPackedPadding }; + MemoryShape inputLogicalPadding = { padding[0], padding[1], 0 }; + inputLogicalPaddedLayout = { MemoryShape{ inputSize[0], inputSize[1], inputSize[2] }, inputLogicalPadding }; + + outputPackedLayout = { MemoryShape{ outputBlockCount[2], outputSize[0], outputSize[1], outputBlockSize[2] } }; + outputLogicalLayout = { MemoryShape{ outputSize[0], outputSize[1], outputSize[2] } }; + + weightPackedLayout = { MemoryShape{ + outputBlockCount[2], + inputBlockCount[2], + weightSize[2], + weightSize[3], + inputBlockSize[2], + outputBlockSize[2] } }; + } + + int inputSize[3]; + int outputSize[3]; + int weightSize[4]; + int receptiveField[2]; + int stride[2]; + int padding[2]; + + int inputBlockSize[3]; + int outputBlockSize[3]; + + int inputBlockCount[3]; + int outputBlockCount[3]; + + MemoryLayout inputPackedPaddedLayout; + MemoryLayout inputLogicalPaddedLayout; + + MemoryLayout outputPackedLayout; + MemoryLayout outputLogicalLayout; + + MemoryLayout weightPackedLayout; +}; + +Tensor NaiveForLoopConvolution(const ConvolutionConfig& config, Tensor input, Array weights) +{ + auto output = MakeTensor(config.outputSize[0], config.outputSize[1], config.outputSize[2], "expectedOutput"); + ForRange(config.outputSize[2], [&](Scalar outputChannel) { + ForRange(config.inputSize[2], [&](Scalar inputChannel) { + ForRange(config.outputSize[0], [&](Scalar outputRow) { + ForRange(config.outputSize[1], [&](Scalar outputColumn) { + ForRange(config.receptiveField[0], [&](Scalar weightRow) { + ForRange(config.receptiveField[1], [&](Scalar weightColumn) { + Scalar inputRow = outputRow * config.stride[0] + weightRow - config.padding[0]; + Scalar inputColumn = outputColumn * config.stride[1] + weightColumn - config.padding[1]; + If(inputRow >= 0, [&] { + If(inputRow < Scalar{ config.inputSize[0] }, [&] { + If(inputColumn >= 0, [&] { + If(inputColumn < Scalar{ config.inputSize[1] }, [&] { + output(outputRow, outputColumn, outputChannel) += + input(inputRow, inputColumn, inputChannel) * + weights({ outputChannel, inputChannel, weightRow, weightColumn }); + }); + }); + }); + }); + }); + }); + }); + }); + }); + }); + return output; +} + +Tensor UnpackOutputTensor(const ConvolutionConfig& config, Value packedOutput) +{ + auto unpackedOutput = MakeTensor(config.outputSize[0], config.outputSize[1], config.outputSize[2], "unpackedOutput"); + packedOutput.SetLayout(config.outputPackedLayout); + auto packedOutputArray = Array(packedOutput); + + ForRange(config.outputSize[2], [&](Scalar channelIdx) { + ForRange(config.outputSize[0], [&](Scalar rowIdx) { + ForRange(config.outputSize[1], [&](Scalar columnIdx) { + unpackedOutput(rowIdx, columnIdx, channelIdx) = packedOutputArray({ channelIdx / config.outputBlockSize[2], + rowIdx, + columnIdx, + channelIdx % config.outputBlockSize[2] }); + }); + }); + }); + return unpackedOutput; +} +} // namespace ell diff --git a/libraries/value/test/src/LoopNest_kernels.cpp 
b/libraries/value/test/src/LoopNest_kernels.cpp new file mode 100644 index 000000000..2e6c30895 --- /dev/null +++ b/libraries/value/test/src/LoopNest_kernels.cpp @@ -0,0 +1,234 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: LoopNest_kernels.cpp (value) +// Authors: Kern Handa, Chuck Jacobs +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "LoopNest_kernels.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +using namespace ell::logging; +using namespace ell::utilities; +using namespace ell::value; + +namespace ell +{ + +void loopnest_passthrough(ViewAdapter, Scalar i, Scalar j) +{} + +void loopnest_kernel(Matrix m, Scalar i, Scalar j) +{ + InvokeForContext([&] { + auto iInt = i.Get(); + auto jInt = j.Get(); + Log() << "m(" << iInt << ", " << jInt << ") = " << (iInt * 2 + jInt * 5) << EOL; + }); + m(i, j) = i * 2 + j * 5; +} + +void loopnest_kernel_2(Matrix m, Scalar i, Scalar j) +{ + InvokeForContext([&] { + auto iInt = i.Get(); + auto jInt = j.Get(); + Log() << "m(" << iInt << ", " << jInt << ") += " << (iInt * 10 + jInt * 2) << EOL; + }); + m(i, j) += i * 10 + j * 2; +} + +void loopnest_kernel_3(Matrix c, Matrix a, Scalar i, Scalar j) +{ + c(i, j) += a(i, j); +} + +void loopnest_kernel_4(Matrix c, Matrix a, Scalar i, Scalar j) +{ + Scalar v = Allocate(c.GetValue().GetBaseType(), ScalarLayout); + + v = i * 10; + v += j; + + c(i, j) = a(i, j) + v; +} + +void matmul_kernel(Matrix A, Matrix B, Matrix C, Scalar i, Scalar j, Scalar k) +{ + InvokeForContext([&] { + auto iInt = i.Get(); + auto jInt = j.Get(); + auto kInt = k.Get(); + Log() << "C(" << iInt << ", " << jInt << ") += " + << "A(" << iInt << ", " << kInt << ") * B(" << kInt << ", " << jInt << ")" << EOL; + }); + C(i, j) += A(i, k) * B(k, j); +} + +void initToZero(Matrix m, Scalar i, Scalar j) +{ + InvokeForContext([&] { + auto iInt = i.Get(); + auto jInt = j.Get(); + Log() << "m(" << iInt << ", " << jInt << ") = " << 0 << EOL; + }); + m(i, j) = 0; +} + +void copyToCache(Matrix A, Matrix cache, Scalar i, Scalar j) +{ + InvokeForContext([&] { + auto iInt = i.Get(); + auto jInt = j.Get(); + Log() << cache.GetValue().GetName() << "(" << iInt << "-" << (iInt + 2) << ", " << jInt << "-" << (jInt + 2) << ") = " + << A.GetValue().GetName() << "(" << iInt << "-" << (iInt + 2) << ", " << jInt << "-" << (jInt + 2) << ")" << EOL; + }); + cache(i, j) = A(i, j); + cache(i + 1, j) = A(i + 1, j); + cache(i, j + 1) = A(i, j + 1); + cache(i + 1, j + 1) = A(i + 1, j + 1); +} + +void copyFromCache(Matrix A, Matrix cache, Scalar i, Scalar j) +{ + InvokeForContext([&] { + auto iInt = i.Get(); + auto jInt = j.Get(); + Log() << A.GetValue().GetName() << "(" << iInt << "-" << (iInt + 2) << ", " << jInt << "-" << (jInt + 2) << ") = " + << cache.GetValue().GetName() << "(" << iInt << "-" << (iInt + 2) << ", " << jInt << "-" << (jInt + 2) << ")" << EOL; + }); + A(i, j) = cache(i, j); + A(i + 1, j) = cache(i + 1, j); + A(i, j + 1) = cache(i, j + 1); + A(i + 1, j + 1) = cache(i + 1, j + 1); +} + +void copyToSmallCache(Matrix A, Matrix cache, Scalar i, Scalar j) +{ + InvokeForContext([&] { + auto iInt = i.Get(); + auto jInt = j.Get(); + Log() << "* " << cache.GetValue().GetName() << " = " + << A.GetValue().GetName() << "(" << iInt << "-" << (iInt + 2) << ", " << jInt << "-" << (jInt + 2) << ")" << EOL; + }); + cache(0, 0) = A(i, j); + 
cache(1, 0) = A(i + 1, j); + cache(0, 1) = A(i, j + 1); + cache(1, 1) = A(i + 1, j + 1); +} + +void copyFromSmallCache(Matrix A, Matrix cache, Scalar i, Scalar j) +{ + InvokeForContext([&] { + auto iInt = i.Get(); + auto jInt = j.Get(); + Log() << "* " << A.GetValue().GetName() << "(" << iInt << "-" << (iInt + 2) << ", " << jInt << "-" << (jInt + 2) << ") = " + << cache.GetValue().GetName() << EOL; + }); + A(i, j) = cache(0, 0); + A(i + 1, j) = cache(1, 0); + A(i, j + 1) = cache(0, 1); + A(i + 1, j + 1) = cache(1, 1); +} + +void addOne(Matrix m, Scalar i, Scalar j) +{ + InvokeForContext([&] { + auto iInt = i.Get(); + auto jInt = j.Get(); + Log() << m.GetValue().GetName() << "(" << iInt << ", " << jInt << ") += " << 1 << EOL; + }); + m(i, j) += 1; +} + +void addTwo(Matrix m, Scalar i, Scalar j) +{ + InvokeForContext([&] { + auto iInt = i.Get(); + auto jInt = j.Get(); + Log() << m.GetValue().GetName() << "(" << iInt << ", " << jInt << ") += " << 2 << EOL; + }); + m(i, j) += 2; +} + +void set_vector_kernel(Vector v, Scalar i) +{ + InvokeForContext([&] { + auto iInt = i.Get(); + Log() << "v(" << iInt << ") = " << iInt << EOL; + }); + v(i) = i; +} + +void increment_vector_kernel(Vector v, Scalar i) +{ + InvokeForContext([&] { + auto iInt = i.Get(); + Log() << "v(" << iInt << ") = " << iInt << EOL; + }); + v(i) += 1; +} + +void copy_vector_kernel(Vector v1, Vector v2, Scalar i) +{ + v2(i) = v1(i); +} + +void reorder_vector_kernel(Vector v, Matrix m, Scalar splitParam, Scalar i, Scalar iOuter, Scalar iInner) +{ + InvokeForContext([&] { + auto iInt = i.Get(); + auto iOuterInt = iOuter.Get(); + auto iInnerInt = iInner.Get(); + auto splitInt = splitParam.Get(); + Log() << "m(" << iOuterInt << "/" << splitInt << ", " << iInnerInt << ") = v(" << iInt << ")" << EOL; + }); + m(iOuter / splitParam, iInner) = v(i); +} + +void addCachedMatrixToUnchachedMatrix(Matrix A, Matrix B, Scalar Ai, Scalar Aj, Scalar Bi, Scalar Bj) +{ + InvokeForContext([&](auto&) { + auto AiInt = Ai.Get(); + auto AjInt = Aj.Get(); + auto BiInt = Bi.Get(); + auto BjInt = Bj.Get(); + Log() << A.GetValue().GetName() << "(" << AiInt << ", " << AjInt << ") += " << B.GetValue().GetName() << "(" << BiInt << ", " << BjInt << ")" << EOL; + }); + A(Ai, Aj) += B(Bi, Bj); +} + +void addCachedMatrixToUnchachedMatrixUnrolled(Matrix A, Matrix B, Scalar Ai, Scalar Aj, Scalar Bi, Scalar Bj) +{ + InvokeForContext([&](auto&) { + auto BiInt = Bi.Get(); + auto BjInt = Bj.Get(); + for (int i = 0; i < 2; ++i) + { + for (int j = 0; j < 2; ++j) + { + Log() << A.GetValue().GetName() << "(" << i << ", " << j << ") += " << B.GetValue().GetName() << "(" << (BiInt + i) << ", " << (BjInt + j) << ")" << EOL; + } + } + }); + for (int i = 0; i < 2; ++i) + { + for (int j = 0; j < 2; ++j) + { + A(i, j) += B(Bi + i, Bj + j); + } + } +} + +} // namespace ell diff --git a/libraries/value/test/src/LoopNest_test.cpp b/libraries/value/test/src/LoopNest_test.cpp new file mode 100644 index 000000000..e01483029 --- /dev/null +++ b/libraries/value/test/src/LoopNest_test.cpp @@ -0,0 +1,3661 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: LoopNest_test.cpp (value) +// Authors: Kern Handa, Chuck Jacobs +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "LoopNest_test.h" +#include "LoopNest_kernels.h" +#include "TestUtil.h" + +#include +#include +#include +#include +#include +#include +#include 
+#include + +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace ell::emitters; +using namespace ell::utilities; +using namespace ell::logging; +using namespace ell::value; +using namespace ell::value::loopnests; + +template +using LayoutType = std::integral_constant; + +namespace ell +{ +// LoopNest-specific test utilities +namespace +{ + template + bool InList(const ListType& list, ElementType item) + { + return std::find(list.begin(), list.end(), item) != list.end(); + } + + void SplitAndSetOrder(LoopNest& loops, const std::vector& indices, const std::vector& splitSizes, std::string order) + { + using IndexMap = std::unordered_map>>; + IndexMap indexInfo; + for (const auto& index : indices) + { + indexInfo.insert({ index.GetName()[0], { index, {} } }); + } + + std::vector splits; + for (auto ch : order) + { + auto& [index, indexSplits] = indexInfo.at(ch); + splits.push_back(index); + if (indexSplits.empty()) // first visit for this index, copy split list over instead of splitting + { + indexSplits = std::queue{ std::deque{ splitSizes.begin(), splitSizes.end() } }; + } + else + { + loops.Split(index, indexSplits.front()); + indexSplits.pop(); + } + } + + loops.SetLoopOrder(splits); + } + +} // namespace + +// Low-level tests of loop nest infrastructure +Scalar SplitIterationDomain_test1() +{ + Index i("i"), j("j"); + SplitIterationDomain domain({ { i, { 0, 120 } }, + { j, { 0, 200 } } }); + + auto [i1, i2] = domain.Split(i, 30); + auto [i3, i4] = domain.Split(i2, 15); + auto [i5, i6] = domain.Split(i4, 5); + + auto [j1, j2] = domain.Split(j, 50); + auto [j3, j4] = domain.Split(j2, 10); + + if (domain.NumDimensions() != 2) + { + return 1; + } + + // `NumSplits` returns the number of loops, not splits. 
It should be 4 for `i` and 3 for `j`.
+    if (domain.NumSplits(i) != 4 || domain.NumSplits(j) != 3)
+    {
+        return 1;
+    }
+
+    if (!domain.IsPrimaryDimension(i) || !domain.IsPrimaryDimension(j))
+        return 1;
+
+    for (Index index : { i1, i2, i3, i4, i5, i6 })
+    {
+        if (domain.GetBaseIndex(index) != i)
+            return 1;
+        if (domain.IsPrimaryDimension(index))
+            return 1;
+    }
+
+    for (Index index : { j1, j2, j3, j4 })
+    {
+        if (domain.GetBaseIndex(index) != j)
+            return 1;
+        if (domain.IsPrimaryDimension(index))
+            return 1;
+    }
+
+    const auto iRange = domain.GetDimensionRange(i);
+    for (Index index : { i1, i3, i5, i6 })
+    {
+        if (!iRange.IsLoopIndex(index))
+            return 1;
+    }
+    for (Index index : { i, i2, i4 })
+    {
+        if (!iRange.IsComputedIndex(index))
+            return 1;
+    }
+    auto parents = iRange.GetAllParentIndices(i4); // should include i and i2
+    if (!InList(parents, i) || !InList(parents, i2))
+        return 1;
+
+    auto dependents = iRange.GetDependentIndices(i4); // should be i5, i6
+    if (!InList(dependents, i5) || !InList(dependents, i6))
+        return 1;
+
+    return 0;
+}
+
+// Tests of actual loop nests
+Scalar LoopNest_test1()
+{
+    auto matrix = MakeMatrix(4, 5);
+    IndexRange i("i", { 0, 4 }), j("j", { 0, 5 });
+
+    auto kernel = Kernel("kernel")
+                      .Inputs(matrix.GetValue())
+                      .Indices(i.GetIndex(), j.GetIndex())
+                      .Define(loopnest_kernel);
+
+    LoopNest loop(std::vector{ i, j });
+    loop.AddKernel(kernel, LoopNest::ConstraintType::predicate);
+
+    CodeGenerator generator;
+    generator.Run(loop);
+
+#if 0
+    PrintLoops(loop, "LoopNest_test1");
+#endif
+
+    return matrix(2, 3) - 19; // will return 0 if calculation is correct
+}
+
+Scalar LoopNest_test2()
+{
+    auto matrix = MakeMatrix(4, 5);
+    IndexRange i("i", { 0, 4 }), j("j", { 0, 5 });
+
+    auto kernel = Kernel("kernel")
+                      .Inputs(matrix.GetValue())
+                      .Indices(i.GetIndex(), j.GetIndex())
+                      .Define(loopnest_kernel);
+
+    LoopNest loop(std::vector{ i, j });
+    loop.AddKernel(kernel, LoopNest::ConstraintType::predicate);
+    loop.Split(i.GetIndex(), 2);
+
+    CodeGenerator generator;
+    generator.Run(loop);
+
+    return matrix(2, 3) - 19; // will return 0 if calculation is correct
+}
+
+// This tests that the loopnest works with a degenerate (1x1) kernel, both for compute and compile
+Scalar LoopNest_test3()
+{
+    // Declare the input matrix
+    std::vector> dt{
+        std::vector{ 1, 2, 3 },
+        std::vector{ 4, 5, 6 },
+    };
+    auto matrix = Matrix(dt);
+    // Declare the output matrix and initialize its values to 10.
+    auto output = MakeMatrix(static_cast(matrix.Rows()), static_cast(matrix.Columns()));
+    For(output, [&](Scalar row, Scalar column) {
+        output(row, column) = 10;
+    });
+
+    // Use a LoopNest to call loopnest_kernel_3 for each element of the input matrix and write the result to
+    // our output.
+    loopnests::IndexRange i("i", { 0, static_cast(matrix.Rows()) }), j("j", { 0, static_cast(matrix.Columns()) });
+
+    auto kernel = loopnests::Kernel("kernel")
+                      .Inputs(output.GetValue(), matrix.GetValue())
+                      .Indices(i.GetIndex(), j.GetIndex())
+                      .Define(loopnest_kernel_3);
+
+    loopnests::LoopNest loop(std::vector{ i, j });
+    loop.AddKernel(kernel, LoopNest::ConstraintType::predicate);
+
+    loopnests::CodeGenerator generator;
+    generator.Run(loop);
+
+    // loopnest_kernel_3 will add the input element to the output element.
+    // Since we initialized the output to 10, we expect the result to be
+    // 10 greater than the input.
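+    // Worked example: the input is { 1, 2, 3, 4, 5, 6 } and every output
+    // element starts at 10, so the output should be { 11, 12, 13, 14, 15, 16 }
+    // whether the nest is run via the compute path or compiled.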
+    std::vector expectedValues{ 11, 12, 13, 14, 15, 16 };
+    auto expected = Vector(expectedValues);
+
+    // View the result as a Vector
+    Vector actual = AsVector(AsFullView(output));
+
+    // Verify that the actual result is what we expect
+    return VerifySame(actual, expected);
+}
+
+// This tests that the loopnest works with a degenerate (1x1) kernel, both for compute and compile,
+// when the kernel has non-trivial assignment code in it.
+Scalar LoopNest_test4()
+{
+    // Declare the output matrix and initialize its values to 0.
+    auto output = MakeMatrix(2, 6);
+    For(output, [&](Scalar row, Scalar column) {
+        output(row, column) = 0;
+    });
+
+    // Use a LoopNest to call loopnest_kernel_4 for each element of the matrix and write the result back to
+    // our output.
+    loopnests::IndexRange i("i", { 0, static_cast(output.Rows()) }), j("j", { 0, static_cast(output.Columns()) });
+
+    auto kernel = loopnests::Kernel("kernel")
+                      .Inputs(output.GetValue(), output.GetValue())
+                      .Indices(i.GetIndex(), j.GetIndex())
+                      .Define(loopnest_kernel_4);
+
+    loopnests::LoopNest loop(std::vector{ i, j });
+    loop.AddKernel(kernel, LoopNest::ConstraintType::predicate);
+    loop.Split(j.GetIndex(), 2);
+
+    loopnests::CodeGenerator generator;
+    generator.Run(loop);
+
+    // loopnest_kernel_4 will multiply row by 10 and add the column.
+    std::vector expectedValues{ 0, 1, 2, 3, 4, 5, 10, 11, 12, 13, 14, 15 };
+    auto expected = Vector(expectedValues);
+
+    // View the result as a Vector
+    Vector actual = AsVector(AsFullView(output));
+
+    // Verify that the actual result is what we expect
+    return VerifySame(actual, expected);
+}
+
+// Simple loopnest test using variable-length inputs and indices APIs
+Scalar LoopNest_test5()
+{
+    auto matrix = MakeMatrix(4, 5);
+    Index i("i"), j("j");
+    LoopNest loop({ { i, { 0, 4 } },
+                    { j, { 0, 5 } } });
+
+    auto kernel = Kernel("kernel")
+                      .Inputs({ matrix.GetValue() })
+                      .Indices({ i, j })
+                      .Define(loopnest_kernel);
+
+    loop.AddKernel(kernel, LoopNest::ConstraintType::predicate);
+    loop.Split(i, 2);
+
+    CodeGenerator generator;
+    generator.Run(loop);
+
+    return matrix(2, 3) - 19; // will return 0 if calculation is correct
+}
+
+// Simple loopnest test that loops from X to N where N > X > 0
+Scalar LoopNest_test6()
+{
+    int N = 4;
+    int X = 2;
+    auto matrix = MakeMatrix(N, N, "matrix");
+
+    ForRange(N, [&](Scalar i) {
+        ForRange(N, [&](Scalar j) {
+            matrix(i, j) = i + j;
+        });
+    });
+    // matrix:
+    // [ 0, 1, 2, 3 ]
+    // [ 1, 2, 3, 4 ]
+    // [ 2, 3, 4, 5 ]
+    // [ 3, 4, 5, 6 ]
+
+    // Sum the bottom right quadrant of the matrix and store the value in position (0, 0)
+
+    Index i("i"), j("j");
+    LoopNest loop({ { i, { X, N } },
+                    { j, { X, N } } });
+    auto kernel = Kernel("kernel")
+                      .Inputs(matrix.GetValue())
+                      .Indices(i, j)
+                      .Define([](Matrix mat, Scalar i, Scalar j) {
+                          mat(0, 0) += mat(i, j);
+                      });
+    loop.AddKernel(kernel, LoopNest::ConstraintType::predicate);
+    CodeGenerator generator;
+    generator.Run(loop);
+    return matrix(0, 0) - 20; // Will return 0 if calculation is correct
+}
+
+Scalar LoopNestNonzeroStart_test()
+{
+    const int size = 12;
+    const int begin = 2;
+    const int end = 10;
+    auto vector = MakeVector(size);
+    for (int i = 0; i < size; ++i)
+    {
+        vector(i) = 100 * i;
+    }
+    std::vector expectedValues(size);
+    for (int i = 0; i < size; ++i)
+    {
+        expectedValues[i] = 100 * i;
+    }
+    for (int i = begin; i < end; ++i)
+    {
+        expectedValues[i] = i;
+    }
+    auto expected = Vector(expectedValues);
+
+    Index i("i");
+    LoopNest loop({ { i, { begin, end } } });
+    auto kernel = Kernel("k")
+ .Inputs(vector.GetValue()) + .Indices(i) + .Define(set_vector_kernel); + + loop.AddKernel(kernel, LoopNest::ConstraintType::predicate); + + CodeGenerator generator; + generator.Run(loop); + +#if 0 + PrintLoops(loop, "LoopNestNonzeroStart_test"); +#endif + + // Verify that the actual result is what we expect + return VerifySame(vector, expected); +} + +Scalar LoopNestBoundary_test1() +{ + const int size = 12; + const int n = 10; + auto vector = MakeVector(size); + for (int i = n; i < size; ++i) + { + vector(i) = 100 * i; + } + std::vector expectedValues(size); + for (int i = 0; i < n; ++i) + { + expectedValues[i] = i; + } + for (int i = n; i < size; ++i) + { + expectedValues[i] = 100 * i; + } + auto expected = Vector(expectedValues); + + Index i("i"); + LoopNest loop({ { i, { 0, n } } }); + auto split_i = loop.Split(i, 4); + + auto kernel = Kernel("k") + .Inputs(vector.GetValue()) + .Indices(i) + .Define(set_vector_kernel); + + loop.AddKernel(kernel, LoopNest::ConstraintType::predicate); + + CodeGenerator generator; + generator.Run(loop); + +#if 0 + PrintLoops(loop, "LoopNestBoundary_test1"); +#endif + + // Verify that the actual result is what we expect + return VerifySame(vector, expected); +} + +Scalar LoopNestBoundary_test2() +{ + const int size = 12; + const int n = 10; + auto vector = MakeVector(size); + for (int i = n; i < size; ++i) + { + vector(i) = 100 * i; + } + std::vector expectedValues(size); + for (int i = 0; i < n; ++i) + { + expectedValues[i] = i; + } + for (int i = n; i < size; ++i) + { + expectedValues[i] = 100 * i; + } + auto expected = Vector(expectedValues); + + Index i("i"); + LoopNest loop({ { i, { 0, n } } }); + auto split_i = loop.Split(i, 4); + auto split_i2 = loop.Split(i, 2); + + auto kernel = Kernel("k") + .Inputs(vector.GetValue()) + .Indices(i) + .Define(set_vector_kernel); + + loop.AddKernel(kernel, LoopNest::ConstraintType::predicate); + + CodeGenerator generator; + generator.Run(loop); + +#if 0 + PrintLoops(loop, "LoopNestBoundary_test2"); +#endif + + // Verify that the actual result is what we expect + return VerifySame(vector, expected); +} + +Scalar LoopNestBoundary_test3() +{ + const int size = 12; + const int n = 8; + auto vector = MakeVector(size); + for (int i = n; i < size; ++i) + { + vector(i) = 100 * i; + } + std::vector expectedValues(size); + for (int i = 0; i < n; ++i) + { + expectedValues[i] = i; + } + for (int i = n; i < size; ++i) + { + expectedValues[i] = 100 * i; + } + auto expected = Vector(expectedValues); + + Index i("i"), j("j"); + LoopNest loop({ { i, { 0, n } }, { j, { 0, n } } }); + loop.Split(i, 4); + loop.Split(i, 2); + loop.Split(j, 4); + + loop.SetLoopOrder({ i, j, i, j, i }); + + auto kernel = Kernel("k") + .Inputs(vector.GetValue()) + .Indices(i) + .Define(set_vector_kernel); + + loop.AddKernel(kernel, LoopNest::ConstraintType::predicate); + + CodeGenerator generator; + generator.Run(loop); + +#if 0 + PrintLoops(loop, "LoopNestBoundary_test3"); +#endif + + // Verify that the actual result is what we expect + return VerifySame(vector, expected); +} + +Scalar LoopNestBoundary_test4() +{ + // The input matrices: + // A: B: C: + // [ 0, -1, -2, -3, -4, -5, -6, -7] [ 0, 2, 4, 6, 8, 10, 12, 14] [ 100 100 100 ... ] + // [ 1, 0, -1, -2, -3, -4, -5, -6] [ 1, 3, 5, 7, 9, 11, 13, 15] [ 100 100 100 ... ] + // [ 2, 1, 0, -1, -2, -3, -4, -5] [ 2, 4, 6, 8, 10, 12, 14, 16] [ ... ] + // [ 3, 2, 1, 0, -1, -2, -3, -4] [ 3, 5, 7, 9, 11, 13, 15, 17] [ ... ] + // [ 4, 3, 2, 1, 0, -1, -2, -3] [ 4, 6, 8, 10, 12, 14, 16, 18] [ ... 
] + // [ 5, 4, 3, 2, 1, 0, -1, -2] [ 5, 7, 9, 11, 13, 15, 17, 19] [ ... ] + // [ 6, 5, 4, 3, 2, 1, 0, -1] [ 6, 8, 10, 12, 14, 16, 18, 20] [ ... ] + // [ 7, 6, 5, 4, 3, 2, 1, 0] [ 7, 9, 11, 13, 15, 17, 19, 21] [ ... ] + + // (A * B) (the desired result): + // [-140, -196, -252, -308, -364, -420, -476, -532] + // [-112, -152, -192, -232, -272, -312, -352, -392] + // [ -84, -108, -132, -156, -180, -204, -228, -252] + // [ -56, -64, -72, -80, -88, -96, -104, -112] + // [ -28, -20, -12, -4, 4, 12, 20, 28] + // [ 0, 24, 48, 72, 96, 120, 144, 168] + // [ 28, 68, 108, 148, 188, 228, 268, 308] + // [ 56, 112, 168, 224, 280, 336, 392, 448] + + const int N = 8; + const int M = N; + const int K = N; + + auto A = MakeMatrix(M, K, "A"); + auto B = MakeMatrix(K, N, "B"); + auto C = MakeMatrix(M, N, "C"); + auto expected = MakeMatrix(M, N, "expected"); + + // initialize A, B, and C + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + A(i, j) = i - j; + B(i, j) = i + 2 * j; + C(i, j) = 100; + expected(i, j) = 0; + }); + }); + + // fill out expected with a simple for-loop gemm + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + ForRange(N, [&](Scalar k) { + expected(i, j) += A(i, k) * B(k, j); + }); + }); + }); + + // Do computation in blocks of k_r x k_c + const int k_r = 3; + const int k_c = 4; + Matrix temp = MakeMatrix(k_r, k_c, "temp"); + + loopnests::Index i("i"), j("j"), k("k"); + loopnests::LoopNest loop({ { i, { 0, M } }, + { j, { 0, N } }, + { k, { 0, K } } }); + + auto [i_outer, i_inner] = loop.Split(i, k_r); + auto [j_outer, j_inner] = loop.Split(j, k_c); + + auto prologueKernel = loopnests::Kernel("prologue") + .Inputs(temp.GetValue()) + .Indices(i_inner, j_inner) + .Define([](Matrix temp, Scalar i_inner, Scalar j_inner) { + temp(i_inner, j_inner) = 0; + }); + + auto bodyKernel = loopnests::Kernel("body") + .Inputs(A.GetValue(), B.GetValue(), C.GetValue(), temp.GetValue()) + .Indices(i, j, i_inner, j_inner, k) + .Define([](Matrix A, Matrix B, Matrix C, Matrix temp, Scalar i, Scalar j, Scalar i_inner, Scalar j_inner, Scalar k) { + temp(i_inner, j_inner) += A(i, k) * B(k, j); + }); + auto epilogueKernel = loopnests::Kernel("epilogue") + .Inputs(C.GetValue(), temp.GetValue()) + .Indices(i, j, j_outer, i_inner, j_inner) + .Define([](Matrix C, Matrix temp, Scalar i, Scalar j, Scalar j_outer, Scalar i_inner, Scalar j_inner) { + C(i, j) = temp(i_inner, j_inner); + }); + + loop.SetLoopOrder({ i_outer, j_outer, k, j_inner, i_inner }); + + loop.AddKernel(prologueKernel, First(k)); + loop.AddKernel(bodyKernel, LoopNest::ConstraintType::predicate); + loop.AddKernel(epilogueKernel, Last(k)); + + loopnests::CodeGenerator generator; + generator.Run(loop); + +#if 0 + PrintLoops(loop, "LoopNestBoundary_test4"); +#endif + + return VerifySame(C, expected); +} + +Scalar LoopNestBoundary_test5() +{ + const int M = 9; + const int N = 10; + const int K = 11; + + // Computes A*B + 1 + auto A = MakeMatrix(M, K, "A"); + auto B = MakeMatrix(K, N, "B"); + auto C = MakeMatrix(M, N, "C"); + auto expected = MakeMatrix(M, N, "expected"); + + // initialize A, B, and C + ForRange(M, [&](Scalar i) { + ForRange(K, [&](Scalar j) { + // A(i, j) = i - j; + A(i, j) = 1; + }); + }); + + ForRange(K, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + // B(i, j) = i + 2 * j; + B(i, j) = 1; + }); + }); + + ForRange(M, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + C(i, j) = 100; + expected(i, j) = 0; + }); + }); + + // fill out expected with a simple for-loop gemm (plus 1) + ForRange(M, [&](Scalar i) { + ForRange(N, 
[&](Scalar j) { + ForRange(K, [&](Scalar k) { + expected(i, j) += A(i, k) * B(k, j); + }); + expected(i, j) += 1; + }); + }); + + loopnests::Index i("i"), j("j"), k("k"); + loopnests::LoopNest loop({ { i, { 0, M } }, + { j, { 0, N } }, + { k, { 0, K } } }); + + auto prologueKernel = loopnests::Kernel("Prologue") + .Inputs(C.GetValue()) + .Indices(i, j) + .Define([](Matrix C, Scalar i, Scalar j) { + C(i, j) = 0; + }); + auto bodyKernel = loopnests::Kernel("Body") + .Inputs(A.GetValue(), B.GetValue(), C.GetValue()) + .Indices(i, j, k) + .Define([](Matrix A, Matrix B, Matrix C, Scalar i, Scalar j, Scalar k) { + C(i, j) += A(i, k) * B(k, j); + }); + auto epilogueKernel = loopnests::Kernel("Epilogue") + .Inputs(C.GetValue()) + .Indices(i, j) + .Define([](Matrix C, Scalar i, Scalar j) { + C(i, j) += 1; + }); + + loop.AddKernel(prologueKernel, First(k)); + loop.AddKernel(bodyKernel, LoopNest::ConstraintType::predicate); + loop.AddKernel(epilogueKernel, Last(k), PlacementPredicate{ Placement::after }); + + auto [i_outer, i_inner] = loop.Split(i, 4); + auto [j_outer, j_inner] = loop.Split(j, 4); + auto [k_outer, k_inner] = loop.Split(k, 4); + + loopnests::CodeGenerator generator; + generator.Run(loop); + +#if 0 + PrintLoops(loop, "LoopNestBoundary_test5"); +#endif + + return VerifySame(C, expected); +} + +Scalar LoopNestReorder_test1() +{ + auto matrix = MakeMatrix(4, 5); + IndexRange i("i", { 0, 4 }), j("j", { 0, 5 }); + auto iIndex = i.GetIndex(); + auto jIndex = j.GetIndex(); + + auto kernel = Kernel("kernel") + .Inputs(matrix.GetValue()) + .Indices(iIndex, jIndex) + .Define(loopnest_kernel); + + LoopNest loop(IterationDomain({ i, j })); + loop.AddKernel(kernel, LoopNest::ConstraintType::predicate); + loop.Split(i.GetIndex(), 2); + loop.SetLoopOrder({ iIndex, jIndex, iIndex }); + + CodeGenerator generator; + generator.Run(loop); + +#if 0 + PrintLoops(loop, "LoopNestReorder_test1"); +#endif + + return matrix(2, 3) - 19; // will return 0 if calculation is correct +} + +Scalar LoopNestReorder_test2() +{ + auto matrix = MakeMatrix(4, 5); + Index i("i"), j("j"); + + auto kernel = Kernel("kernel") + .Inputs(matrix.GetValue()) + .Indices(i, j) + .Define(loopnest_kernel); + + LoopNest loop(IterationDomain({ { i, { 0, 4 } }, { j, { 0, 5 } } })); + loop.AddKernel(kernel, LoopNest::ConstraintType::predicate); + auto [iOuter, iInner] = loop.Split(i, 2); + loop.SetLoopOrder({ iInner, j, iOuter }); + + CodeGenerator generator; + generator.Run(loop); + +#if 0 + PrintLoops(loop, "LoopNestReorder_test2"); +#endif + + return matrix(2, 3) - 19; // will return 0 if calculation is correct +} + +Scalar TwoKernel_test() +{ + auto matrix = MakeMatrix(4, 5); + IndexRange i("i", { 0, 4 }), j("j", { 0, 5 }); + auto iIndex = i.GetIndex(); + auto jIndex = j.GetIndex(); + + auto kernel1 = Kernel("kernel1") + .Inputs(matrix.GetValue()) + .Indices(iIndex, jIndex) + .Define(loopnest_kernel); + + auto kernel2 = Kernel("kernel2") + .Inputs(matrix.GetValue()) + .Indices(iIndex, jIndex) + .Define(loopnest_kernel_2); + + LoopNest loop(IterationDomain({ i, j })); + loop.AddKernel(kernel1, LoopNest::ConstraintType::predicate); + loop.AddKernel(kernel2, LoopNest::ConstraintType::predicate); + loop.Split(i.GetIndex(), 2); + loop.SetLoopOrder({ iIndex, jIndex, iIndex }); + + CodeGenerator generator; + generator.Run(loop); + +#if 0 + PrintLoops(loop, "TwoKernel_test"); +#endif + + auto expected = 19 + 26; // 19 == 1st kernel (2*i + 5*j), 26 == 2nd kernel (10*i+2*j) + return matrix(2, 3) - expected; // will return 0 if calculation is 
correct +} + +// Prototype for test with a kernel that runs on the last iteration of an index +// split: where to split the loop (0 if no split) +// id: id to use for body and "last" kernels ("" if they should not share an ID) +Scalar LoopNestLastPredicate_test(std::string tag, int split, std::string id) +{ + const int n = 32; + std::vector expectedValues(n); + for (int i = 0; i < n; ++i) + { + expectedValues[i] = i; + } + if (id.empty()) + { + expectedValues[n - 1] += 1; + } + else + { + expectedValues[n - 1] = 1; + } + auto expected = Vector(expectedValues); + + auto vector = MakeVector(n); + Index i("i"); + LoopNest loop({ { i, { 0, n } } }); + + if (split != 0) + { + loop.Split(i, split); + } + + auto kernel = Kernel("k", id) + .Inputs(vector.GetValue()) + .Indices(i) + .Define(set_vector_kernel); + + auto boundaryKernel = Kernel("boundary", id) + .Inputs(vector.GetValue()) + .Indices(i) + .Define(increment_vector_kernel); + + if (id.empty()) + { + loop.AddKernel(kernel, LoopNest::ConstraintType::predicate); + loop.AddKernel(boundaryKernel, { Last(i) }); + } + else + { + loop.AddKernel(boundaryKernel, { Last(i) }); + loop.AddKernel(kernel, LoopNest::ConstraintType::predicate); + } + +#if 0 + PrintLoops(loop, "LoopNestLastPredicate_test_" + tag); +#endif + + CodeGenerator generator; + generator.Run(loop); + + // Verify that the actual result is what we expect + return VerifySame(vector, expected); +} + +// Test with a kernel that runs on the last iteration of an index +Scalar LoopNestLastPredicate_test1() +{ + return LoopNestLastPredicate_test("1", 0, ""); +} + +// Test with a kernel that runs on the last iteration of an index, with a split loop +Scalar LoopNestLastPredicate_test2() +{ + return LoopNestLastPredicate_test("2", 4, ""); +} + +// Test with an alternate kernel that runs on the last iteration of an index (instead of the main kernel) +Scalar LoopNestLastPredicate_test3() +{ + return LoopNestLastPredicate_test("3", 0, "k"); +} +// Test with an alternate kernel that runs on the last iteration of an index (instead of the main kernel), with a split loop +Scalar LoopNestLastPredicate_test4() +{ + return LoopNestLastPredicate_test("4", 4, "k"); +} + +Scalar LoopNestBoundaryPredicate_test1() +{ + const int size = 12; + const int n = 10; + auto vector = MakeVector(size); + for (int i = n; i < size; ++i) + { + vector(i) = 100 * i; + } + + std::vector expectedValues(size); + int mainEnd = 4 * (n / 4); + for (int i = 0; i < mainEnd; ++i) + { + expectedValues[i] = i; + } + for (int i = mainEnd; i < n; ++i) + { + expectedValues[i] = 1; + } + for (int i = n; i < size; ++i) + { + expectedValues[i] = 100 * i; // same as initialized vector, untouched + } + auto expected = Vector(expectedValues); + + Index i("i"); + LoopNest loop({ { i, { 0, n } } }); + auto split_i = loop.Split(i, 4); + + auto kernel = Kernel("k") + .Inputs(vector.GetValue()) + .Indices(i) + .Define(set_vector_kernel); + + auto boundaryKernel = Kernel("boundary", kernel.GetId()) + .Inputs(vector.GetValue()) + .Indices(i) + .Define(increment_vector_kernel); + + loop.AddKernel(boundaryKernel, { EndBoundary(i) }); + loop.AddKernel(kernel, LoopNest::ConstraintType::predicate); + + CodeGenerator generator; + generator.Run(loop); + +#if 0 + PrintLoops(loop, "LoopNestBoundaryPredicate_test1"); +#endif + + // Verify that the actual result is what we expect + return VerifySame(vector, expected); +} + +Scalar MissingIndex_test() +{ + const int n = 12; + const int splitAmount = 4; + auto vector = MakeVector(n); + + ForRange(n, [&](Scalar 
i) { + vector(i) = Scalar(100); + }); + + std::vector expectedValues(n); + for (int i = 0; i < n; ++i) + { + expectedValues[i] = 100; + } + for (int i = 0; i < n; i += splitAmount) + { + expectedValues[i] = i; + } + auto expected = Vector(expectedValues); + + Index i("i"); + LoopNest loop({ { i, { 0, n } } }); + auto [i_outer, i_inner] = loop.Split(i, splitAmount); + + auto kernel = Kernel("k") + .Inputs(vector.GetValue()) + .Indices(i) + .Define(set_vector_kernel); // v[i] = i + + CodePositionConstraints constraint(LoopFragmentType::body, { i_outer }, {}); + loop.AddKernel(kernel, constraint); + + CodeGenerator generator; + generator.Run(loop); + +#if 0 + PrintLoops(loop, "MissingIndex_test"); +#endif + + // Verify that the actual result is what we expect + return VerifySame(vector, expected); +} + +Scalar RequiredIndex_test() +{ + std::string loopOrder = "ijk"; + const int N = 8; + auto A = MakeMatrix(N, N, "A"); + auto B = MakeMatrix(N, N, "B"); + auto C = MakeMatrix(N, N, "C"); + + // initialize A, B, and C + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + A(i, j) = i - j; + B(i, j) = i + 2 * j; + C(i, j) = 100; + }); + }); + + // The input matrices: + // A: B: C: + // [ 0, -1, -2, -3, -4, -5, -6, -7] [ 0, 2, 4, 6, 8, 10, 12, 14] [ 100 100 100 ... ] + // [ 1, 0, -1, -2, -3, -4, -5, -6] [ 1, 3, 5, 7, 9, 11, 13, 15] [ 100 100 100 ... ] + // [ 2, 1, 0, -1, -2, -3, -4, -5] [ 2, 4, 6, 8, 10, 12, 14, 16] [ ... ] + // [ 3, 2, 1, 0, -1, -2, -3, -4] [ 3, 5, 7, 9, 11, 13, 15, 17] [ ... ] + // [ 4, 3, 2, 1, 0, -1, -2, -3] [ 4, 6, 8, 10, 12, 14, 16, 18] [ ... ] + // [ 5, 4, 3, 2, 1, 0, -1, -2] [ 5, 7, 9, 11, 13, 15, 17, 19] [ ... ] + // [ 6, 5, 4, 3, 2, 1, 0, -1] [ 6, 8, 10, 12, 14, 16, 18, 20] [ ... ] + // [ 7, 6, 5, 4, 3, 2, 1, 0] [ 7, 9, 11, 13, 15, 17, 19, 21] [ ... 
] + + // (A * B) + 1 (the desired result): + // [-139, -195, -251, -307, -363, -419, -475, -531] + // [-111, -151, -191, -231, -271, -311, -351, -391] + // [ -83, -107, -131, -155, -179, -203, -227, -251] + // [ -55, -63, -71, -79, -87, -95, -103, -111] + // [ -27, -19, -11, -3, 5, 13, 21, 29] + // [ 1, 25, 49, 73, 97, 121, 145, 169] + // [ 29, 69, 109, 149, 189, 229, 269, 309] + // [ 57, 113, 169, 225, 281, 337, 393, 449] + + Index i("i"), j("j"), k("k"); + + auto initCKernel = Kernel("init") + .Inputs(C.GetValue()) + .Indices(i, j) + .Define(initToZero); + auto innerKernel = Kernel("matmul") + .Inputs(A.GetValue(), B.GetValue(), C.GetValue()) + .Indices(i, j, k) + .Define(matmul_kernel); + auto postProcessCKernel = Kernel("post") + .Inputs(C.GetValue()) + .Indices(i, j) + .Define(addOne); + + LoopNest loop({ { i, { 0, N } }, + { j, { 0, N } }, + { k, { 0, N } } }); + + CodePositionConstraints preConstraint{ LoopFragmentType::prologue, { i, j }, {} }; + loop.AddKernel(initCKernel, preConstraint); + + loop.AddKernel(innerKernel, LoopNest::ConstraintType::constraint); + + CodePositionConstraints postConstraint{ LoopFragmentType::epilogue, { i, j }, {} }; + loop.AddKernel(postProcessCKernel, postConstraint); + + // SplitAndSetOrder(loop, { i, j, k }, { 4, 2 }, loopOrder); + + CodeGenerator generator; + generator.Run(loop); + +#if 0 // DEBUGGING + PrintLoops(loop, "RequiredIndex_test"); + + InvokeForContext([&](auto&) { + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + auto val = C(i, j).Get(); + Log() << std::setw(5) << val; + }); + Log() << EOL; + }); + }); +#endif + + return C(1, 2) + C(2, 1) - (-191 + -107); // will return 0 if calculation is correct +} + +Scalar SimpleImperfectNest_test() +{ + const int N = 4; + Vector A = MakeVector(N); + Vector B = MakeVector(N); + + // initialize A, B, and C + ForRange(N, [&](Scalar i) { + A(i) = 10; + B(i) = 20; + }); + Index i("i"); + + auto prologueKernel = Kernel("prologue") + .Inputs(A.GetValue()) + .Indices(i) + .Define(set_vector_kernel); + auto bodyKernel = Kernel("body") + .Inputs(A.GetValue()) + .Indices(i) + .Define(increment_vector_kernel); + auto epilogueKernel = Kernel("epilogue") + .Inputs(A.GetValue(), B.GetValue()) + .Indices(i) + .Define(copy_vector_kernel); + + LoopNest loop({ { i, { 0, N } } }); + + CodePositionConstraints prologueConstraint{ LoopFragmentType::prologue, {}, { i } }; + loop.AddKernel(prologueKernel, prologueConstraint); + + loop.AddKernel(bodyKernel, LoopNest::ConstraintType::constraint); + + CodePositionConstraints epilogueConstraint{ LoopFragmentType::epilogue, {}, { i } }; + loop.AddKernel(epilogueKernel, epilogueConstraint); + + CodeGenerator generator; + generator.Run(loop); + + // DEBUGGING +#if 0 + PrintLoops(loop, "SimpleImperfectNest_test"); +#endif + +#if 0 + InvokeForContext([&](auto&) { + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + auto val = C(i, j).Get(); + Log() << std::setw(5) << val; + }); + Log() << EOL; + }); + }); +#endif + + std::vector expectedValues{ 20, 20, 20, 11 }; + auto expected = Vector(expectedValues); + + // Verify that the actual result is what we expect + return VerifySame(B, expected); +} + +Scalar ImperfectNest_test(std::string loopOrder) +{ + const int N = 8; + auto A = MakeMatrix(N, N, "A"); + auto B = MakeMatrix(N, N, "B"); + auto C = MakeMatrix(N, N, "C"); + + // initialize A, B, and C + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + A(i, j) = i - j; + B(i, j) = i + 2 * j; + C(i, j) = 100; + }); + }); + + // The input matrices: + // A: 
B: C: + // [ 0, -1, -2, -3, -4, -5, -6, -7] [ 0, 2, 4, 6, 8, 10, 12, 14] [ 100 100 100 ... ] + // [ 1, 0, -1, -2, -3, -4, -5, -6] [ 1, 3, 5, 7, 9, 11, 13, 15] [ 100 100 100 ... ] + // [ 2, 1, 0, -1, -2, -3, -4, -5] [ 2, 4, 6, 8, 10, 12, 14, 16] [ ... ] + // [ 3, 2, 1, 0, -1, -2, -3, -4] [ 3, 5, 7, 9, 11, 13, 15, 17] [ ... ] + // [ 4, 3, 2, 1, 0, -1, -2, -3] [ 4, 6, 8, 10, 12, 14, 16, 18] [ ... ] + // [ 5, 4, 3, 2, 1, 0, -1, -2] [ 5, 7, 9, 11, 13, 15, 17, 19] [ ... ] + // [ 6, 5, 4, 3, 2, 1, 0, -1] [ 6, 8, 10, 12, 14, 16, 18, 20] [ ... ] + // [ 7, 6, 5, 4, 3, 2, 1, 0] [ 7, 9, 11, 13, 15, 17, 19, 21] [ ... ] + + // (A * B) + 1 (the desired result): + // [-139, -195, -251, -307, -363, -419, -475, -531] + // [-111, -151, -191, -231, -271, -311, -351, -391] + // [ -83, -107, -131, -155, -179, -203, -227, -251] + // [ -55, -63, -71, -79, -87, -95, -103, -111] + // [ -27, -19, -11, -3, 5, 13, 21, 29] + // [ 1, 25, 49, 73, 97, 121, 145, 169] + // [ 29, 69, 109, 149, 189, 229, 269, 309] + // [ 57, 113, 169, 225, 281, 337, 393, 449] + + Index i("i"), j("j"), k("k"); + + auto innerKernel = Kernel("matmul") + .Inputs(A.GetValue(), B.GetValue(), C.GetValue()) + .Indices(i, j, k) + .Define(matmul_kernel); + auto initCKernel = Kernel("init") + .Inputs(C.GetValue()) + .Indices(i, j) + .Define(initToZero); + auto postProcessCKernel = Kernel("post") + .Inputs(C.GetValue()) + .Indices(i, j) + .Define(addOne); + + LoopNest loop({ { i, { 0, N } }, + { j, { 0, N } }, + { k, { 0, N } } }); + + CodePositionConstraints preConstraint{ LoopFragmentType::prologue, { i, j }, { k } }; + loop.AddKernel(initCKernel, preConstraint); + + loop.AddKernel(innerKernel, LoopNest::ConstraintType::constraint); + + CodePositionConstraints postConstraint{ LoopFragmentType::epilogue, { i, j }, { k } }; + loop.AddKernel(postProcessCKernel, postConstraint); + + SplitAndSetOrder(loop, { i, j, k }, { 4, 2 }, loopOrder); + + CodeGenerator generator; + generator.Run(loop); + +#if 0 + PrintLoops(loop, "ImperfectNest_test_" + loopOrder); +#endif + +#if 0 + InvokeForContext([&](auto&) { + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + auto val = C(i, j).Get(); + Log() << std::setw(5) << val; + }); + Log() << EOL; + }); + }); +#endif + + return C(1, 2) + C(2, 1) - (-191 + -107); // will return 0 if calculation is correct +} + +Scalar ImperfectNest_test_ijk() +{ + return ImperfectNest_test("ijk"); +} + +Scalar ImperfectNest_test_ikj() +{ + return ImperfectNest_test("ikj"); +} + +Scalar ImperfectNest_test_kij() +{ + return ImperfectNest_test("kij"); +} + +Scalar ImperfectNest_test_ijkijk() +{ + return ImperfectNest_test("ijkijk"); +} + +Scalar ImperfectNest_test_kijijk() +{ + return ImperfectNest_test("kijijk"); +} + +Scalar ImperfectNest_test_ijkkij() +{ + return ImperfectNest_test("ijkkij"); +} + +Scalar SplitIndex_test1_old() +{ + auto vector = MakeVector(4 * 5); + auto matrix = MakeMatrix(4, 5); + auto splitParam = Scalar(Allocate(utilities::ScalarLayout)); + splitParam = 5; + IndexRange i("i", { 0, 4 * 5 }); + auto iIndex = i.GetIndex(); + LoopNest loop(std::vector{ i }); + auto [i_outer, i_inner] = loop.Split(iIndex, 10); + + auto kernel1 = Kernel("set_vector") + .Inputs(vector.GetValue()) + .Indices(iIndex) + .Define(set_vector_kernel); + + auto kernel2 = Kernel("reorder_vector") + .Inputs(vector.GetValue(), matrix.GetValue(), splitParam.GetValue()) + .Indices(iIndex, i_outer, i_inner) + .Define(reorder_vector_kernel); + + loop.AddKernel(kernel1, LoopNest::ConstraintType::constraint); + loop.AddKernel(kernel2, 
LoopNest::ConstraintType::constraint); + + CodeGenerator generator; + generator.Run(loop); + +#if 0 + PrintLoops(loop, "SplitIndex_test1_old"); +#endif + + return (vector(0) - matrix(0, 0)) + (vector(13) - matrix(2, 3)) + (matrix(3, 2) - (3 * 5 + 2)); +} + +Scalar SplitIndex_test1() +{ + auto vector = MakeVector(4 * 5); + auto matrix = MakeMatrix(4, 5); + auto splitParam = Scalar(Allocate(utilities::ScalarLayout)); + splitParam = 5; + IndexRange i("i", { 0, 4 * 5 }); + auto iIndex = i.GetIndex(); + LoopNest loop(std::vector{ i }); + auto [i_outer, i_inner] = loop.Split(iIndex, 10); + + auto kernel1 = Kernel("set_vector") + .Inputs(vector.GetValue()) + .Indices(iIndex) + .Define(set_vector_kernel); + + auto kernel2 = Kernel("reorder_vector") + .Inputs(vector.GetValue(), matrix.GetValue(), splitParam.GetValue()) + .Indices(iIndex, i_outer, i_inner) + .Define(reorder_vector_kernel); + + loop.AddKernel(kernel1, LoopNest::ConstraintType::predicate); + loop.AddKernel(kernel2, LoopNest::ConstraintType::predicate); + + CodeGenerator generator; + generator.Run(loop); + +#if 0 + PrintLoops(loop, "SplitIndex_test1"); +#endif + + return (vector(0) - matrix(0, 0)) + (vector(13) - matrix(2, 3)) + (matrix(3, 2) - (3 * 5 + 2)); +} + +// Same as SplitIndex_test1, but with an extra split +Scalar SplitIndex_test2() +{ + auto vector = MakeVector(4 * 5); + auto matrix = MakeMatrix(4, 5); + auto splitParam = Scalar(Allocate(utilities::ScalarLayout)); + splitParam = 5; + + Index i("i"); + LoopNest loop({ { i, { 0, 4 * 5 } } }); + auto [i_outer, temp] = loop.Split(i, 10); + auto [i_middle, i_inner] = loop.Split(i, 5); + + auto kernel1 = Kernel("set_vector") + .Inputs(vector.GetValue()) + .Indices(i) + .Define(set_vector_kernel); + + auto kernel2 = Kernel("reorder_vector") + .Inputs(vector.GetValue(), matrix.GetValue(), splitParam.GetValue()) + .Indices(i, i_outer, temp) + .Define(reorder_vector_kernel); + + loop.AddKernel(kernel1, LoopNest::ConstraintType::predicate); + loop.AddKernel(kernel2, LoopNest::ConstraintType::predicate); + +#if 0 + PrintLoops(loop, "SplitIndex_test2"); +#endif + + CodeGenerator generator; + generator.Run(loop); + + return (vector(0) - matrix(0, 0)) + (vector(13) - matrix(2, 3)) + (matrix(3, 2) - (3 * 5 + 2)); +} + +// Same as SplitIndex_test2, but splitting an outer index +Scalar SplitIndex_test3() +{ + auto vector = MakeVector(4 * 5); + auto matrix = MakeMatrix(4, 5); + auto splitParam = Scalar(Allocate(utilities::ScalarLayout)); + splitParam = 5; + + Index i("i"); + LoopNest loop({ { i, { 0, 4 * 5 } } }); + auto [temp, i_inner] = loop.Split(i, 5); + auto [i_outer, i_middle] = loop.Split(temp, 10); + loop.SetLoopOrder({ i_outer, i_middle, i_inner }); + auto kernel1 = Kernel("set_vector") + .Inputs(vector.GetValue()) + .Indices(i) + .Define(set_vector_kernel); + + auto kernel2 = Kernel("reorder_vector") + .Inputs(vector.GetValue(), matrix.GetValue(), splitParam.GetValue()) + .Indices(i, temp, i_inner) + .Define(reorder_vector_kernel); + + loop.AddKernel(kernel1, LoopNest::ConstraintType::predicate); + loop.AddKernel(kernel2, LoopNest::ConstraintType::predicate); + +#if 0 + PrintLoops(loop, "SplitIndex_test3"); +#endif + + CodeGenerator generator; + generator.Run(loop); + + return (vector(0) - matrix(0, 0)) + (vector(13) - matrix(2, 3)) + (matrix(3, 2) - (3 * 5 + 2)); +} + +Scalar EpilogueIndex_test() +{ + const int N = 8; + auto vector = MakeVector(N); + + Index i("i"); + LoopNest loop({ { i, { 0, N } } }); + auto [i_outer, i_inner] = loop.Split(i, 4); + + auto prologueKernel = 
Kernel("prologue") + .Inputs(vector.GetValue()) + .Indices(i) + .Define([](Vector v, Scalar i) { + v[i] = i; + }); + auto bodyKernel = Kernel("body") + .Inputs(vector.GetValue()) + .Indices(i) + .Define([](Vector v, Scalar i) { + v[i] += 10; + }); + auto epilogueKernel = Kernel("epilogue") + .Inputs(vector.GetValue()) + .Indices(i) + .Define([](Vector v, Scalar i) { + v[i] += 1; + }); + + loopnests::CodePositionConstraints prologueConstraints{ loopnests::LoopFragmentType::prologue, { i_outer }, {} }; + loopnests::CodePositionConstraints epilogueConstraints{ loopnests::LoopFragmentType::epilogue, { i_outer }, {} }; + loop.AddKernel(prologueKernel, prologueConstraints); + loop.AddKernel(bodyKernel, LoopNest::ConstraintType::constraint); + loop.AddKernel(epilogueKernel, epilogueConstraints); + +#if 0 + PrintLoops(loop, "EpilogueIndex_test"); +#endif + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + + CodeGenerator generator; + generator.Run(loop); + + std::vector expectedValues{ 10, 10, 10, 11, 14, 10, 10, 11 }; + auto expected = Vector(expectedValues); + + If( + VerifySame(vector, expected) == 0, + [&] { + ok = 0; + }) + .Else([&] { + ok = 1; + }); + + return ok; +} + +Scalar RenameKernelArg_test() +{ + auto matrix = MakeMatrix(4, 5); + IndexRange i("i", { 0, 4 }), j("j", { 0, 5 }); + LoopNest loop(std::vector{ i, j }); + + auto kernel = Kernel("kernel") + .Inputs(matrix.GetValue()) + .Indices(i.GetIndex(), j.GetIndex()) + .Define(loopnest_kernel); + + loop.AddKernel(kernel, LoopNest::ConstraintType::constraint); + + auto newMatrix = MakeMatrix(4, 5); + auto [outer, inner] = loop.Split(i.GetIndex(), 2); + loop.RenameVariable(matrix, newMatrix, { inner }); + + CodeGenerator generator; + generator.Run(loop); + + return newMatrix(2, 3) - 19; // will return 0 if calculation is correct +} + +Scalar NonInnermostKernel_test1() +{ + auto matrix = MakeMatrix(4, 4); + ForRange(4, [&](Scalar i) { + ForRange(4, [&](Scalar j) { + matrix(i, j) = i - j; + }); + }); + + // matrix: + // 0 -1 -2 -3 + // 1 0 -1 -2 + // 2 1 0 -1 + // 3 2 1 0 + + // result with 1x1 only: + // 1 0 -1 -2 + // 2 1 0 -1 + // 3 2 1 0 + // 4 3 2 1 + + // (correct) result with 2x2 only ("before"): + // 2 -1 0 -3 + // 1 0 -1 -2 + // 4 1 2 -1 + // 3 2 1 0 + + // result with 2x2 invoked at the outer level, and 1x1 invoked inside ("first"): + // 2 0 0 -2 + // 2 1 0 -1 + // 4 2 2 0 + // 4 3 2 1 + + Index i("i"), j("j"); + LoopNest loop({ { i, { 0, 4 } }, + { j, { 0, 4 } } }); + + auto kernel1x1 = Kernel("kernel_1x1") + .Inputs(matrix.GetValue()) + .Indices(i, j) + .Define(addOne); + + auto [iOuter, iInner] = loop.Split(i, 2); + auto [jOuter, jInner] = loop.Split(j, 2); + + auto kernel2x2 = Kernel("kernel_2x2", kernel1x1.GetId()) + .Inputs(matrix.GetValue()) + .Indices(iOuter, jOuter) + .Define(addTwo); + loop.AddKernel(kernel2x2, LoopNest::ConstraintType::constraint); + loop.AddKernel(kernel1x1, LoopNest::ConstraintType::constraint); + +#if 0 + PrintLoops(loop, "NonInnermostKernel_test1"); +#endif + + CodeGenerator generator; + generator.Run(loop); + +#if 0 // DEBUGGING + InvokeForContext([&](auto&) { + ForRange(4, [&](Scalar i) { + ForRange(4, [&](Scalar j) { + auto val = matrix(i, j).Get(); + Log() << std::setw(4) << val; + }); + Log() << EOL; + }); + }); +#endif + return matrix(1, 1).Copy(); // will return 0 if calculation is correct +} + +Scalar NonInnermostKernel_test2() +{ + auto matrix = MakeMatrix(4, 4, "matrix"); + auto expected = MakeMatrix(4, 4, "matrix"); + ForRange(4, [&](Scalar i) { + ForRange(4, [&](Scalar j) { + matrix(i, 
j) = i - j; + expected(i, j) = i - j; + }); + }); + ForRange(2, [&](Scalar i) { + ForRange(2, [&](Scalar j) { + expected(2 * i, 2 * j) += 2; + }); + }); + + // matrix: + // 0 -1 -2 -3 + // 1 0 -1 -2 + // 2 1 0 -1 + // 3 2 1 0 + + // result with 1x1: + // 1 0 -1 -2 + // 2 1 0 -1 + // 3 2 1 0 + // 4 3 2 1 + + // result with 2x2: + // 2 -1 0 -3 + // 1 0 -1 -2 + // 4 1 2 -1 + // 3 2 1 0 + + Index i("i"), j("j"); + LoopNest loop({ { i, { 0, 4 } }, + { j, { 0, 4 } } }); + + auto kernel1x1 = Kernel("kernel_1x1") + .Inputs(matrix.GetValue()) + .Indices(i, j) + .Define(addOne); + + auto [iOuter, iInner] = loop.Split(i, 2); + auto [jOuter, jInner] = loop.Split(j, 2); + + auto kernel2x2 = Kernel("kernel_2x2", kernel1x1.GetId()) + .Inputs(matrix.GetValue()) + .Indices(iOuter, jOuter) + .Define(addTwo); + + CodePositionConstraints bodyConstraint{ LoopFragmentType::body, { iOuter, jOuter }, {} }; + loop.AddKernel(kernel2x2, bodyConstraint); + loop.AddKernel(kernel1x1, LoopNest::ConstraintType::constraint); + +#if 0 + PrintLoops(loop, "NonInnermostKernel_test2"); +#endif + + CodeGenerator generator; + generator.Run(loop); + +#if 0 // DEBUGGING + InvokeForContext([&](auto&) { + ForRange(4, [&](Scalar i) { + ForRange(4, [&](Scalar j) { + auto val = matrix(i, j).Get(); + Log() << std::setw(4) << val; + }); + Log() << EOL; + }); + }); +#endif + + Scalar ok = Allocate(ScalarLayout); + If( + VerifySame(matrix, expected) == 0, + [&] { + ok = 0; + }) + .Else([&] { + ok = 1; + }); + + return matrix(1, 1).Copy(); // will return 0 if calculation is correct +} + +Scalar NonInnermostKernel_test3() +{ + auto matrix = MakeMatrix(4, 4, "matrix"); + ForRange(4, [&](Scalar i) { + ForRange(4, [&](Scalar j) { + matrix(i, j) = i - j; + }); + }); + + // matrix: + // 0 -1 -2 -3 + // 1 0 -1 -2 + // 2 1 0 -1 + // 3 2 1 0 + + // result with 1x1: + // 1 0 -1 -2 + // 2 1 0 -1 + // 3 2 1 0 + // 4 3 2 1 + + // result with 2x2: + // 2 -1 0 -3 + // 1 0 -1 -2 + // 4 1 2 -1 + // 3 2 1 0 + + Index i("i"), j("j"); + LoopNest loop({ { i, { 0, 4 } }, + { j, { 0, 4 } } }); + + auto kernel1x1 = Kernel("kernel_1x1") + .Inputs(matrix.GetValue()) + .Indices(i, j) + .Define(addOne); + + auto [iOuter, iInner] = loop.Split(i, 2); + auto [jOuter, jInner] = loop.Split(j, 2); + + auto kernel2x2 = Kernel("kernel_2x2", kernel1x1.GetId()) + .Inputs(matrix.GetValue()) + .Indices(iOuter, jOuter) + .Define(addTwo); + + loop.AddKernel(kernel2x2, LoopFragmentType::body); + loop.AddKernel(kernel1x1, LoopFragmentType::body); + +#if 0 + PrintLoops(loop, "NonInnermostKernel_test3"); +#endif + + CodeGenerator generator; + generator.Run(loop); + +#if 0 // DEBUGGING + InvokeForContext([&](auto&) { + ForRange(4, [&](Scalar i) { + ForRange(4, [&](Scalar j) { + auto val = matrix(i, j).Get(); + Log() << std::setw(4) << val; + }); + Log() << EOL; + }); + }); +#endif + return matrix(1, 1).Copy(); // will return 0 if calculation is correct +} + +Scalar NonInnermostKernel_test4() +{ + auto matrix = MakeMatrix(4, 4); + ForRange(4, [&](Scalar i) { + ForRange(4, [&](Scalar j) { + matrix(i, j) = i - j; + }); + }); + + // matrix: + // 0 -1 -2 -3 + // 1 0 -1 -2 + // 2 1 0 -1 + // 3 2 1 0 + + // result with 1x1 only: + // 1 0 -1 -2 + // 2 1 0 -1 + // 3 2 1 0 + // 4 3 2 1 + + // result with 2x2 only ("before"): + // 2 -1 0 -3 + // 1 0 -1 -2 + // 4 1 2 -1 + // 3 2 1 0 + + // result with 2x2 invoked at the outer level, and 1x1 invoked inside ("first"): + // 2 0 0 -2 + // 2 1 0 -1 + // 4 2 2 0 + // 4 3 2 1 + + Index i("i"), j("j"); + LoopNest loop({ { i, { 0, 4 } }, + { j, { 0, 4 } } 
});
+
+    auto kernel1x1 = Kernel("kernel_1x1")
+                         .Inputs(matrix.GetValue())
+                         .Indices(i, j)
+                         .Define(addOne);
+
+    auto [iOuter, iInner] = loop.Split(i, 2);
+    auto [jOuter, jInner] = loop.Split(j, 2);
+
+    auto kernel2x2 = Kernel("kernel_2x2", kernel1x1.GetId())
+                         .Inputs(matrix.GetValue())
+                         .Indices(iOuter, jOuter)
+                         .Define(addTwo);
+
+    loop.AddKernel(kernel1x1, LoopNest::ConstraintType::predicate);
+    loop.AddKernel(kernel2x2, {}, IsDefined(iOuter) && IsDefined(jOuter));
+
+#if 0
+    PrintLoops(loop, "NonInnermostKernel_test4");
+#endif
+
+    CodeGenerator generator;
+    generator.Run(loop);
+
+#if 0 // DEBUGGING
+    InvokeForContext([&](auto&) {
+        ForRange(4, [&](Scalar i) {
+            ForRange(4, [&](Scalar j) {
+                auto val = matrix(i, j).Get();
+                Log() << std::setw(4) << val;
+            });
+            Log() << EOL;
+        });
+    });
+#endif
+    return matrix(1, 1).Copy(); // will return 0 if calculation is correct
+}
+
+// This test adds 1 to each element in a 4x4 matrix, but does all the work on a cached piece of the matrix.
+// The i and j dimensions are subdivided into 2x2 tiles, then each tile is copied into the cache, operated on,
+// and copied back.
+// In this version of the test, the cache is the same size as the original matrix. The next test shows a more useful
+// scenario, where the cache is the size of a single tile.
+Scalar CachedMatrix_test1()
+{
+    const int N = 4;
+    // auto A = MakeMatrix(N, N);
+    // A.GetValue().SetName("A"); // BUG(?): this doesn't work because GetValue() returns a copy
+    auto A = MakeMatrix(N, N, "A");
+
+    ForRange(N, [&](Scalar i) {
+        ForRange(N, [&](Scalar j) {
+            A(i, j) = i - j;
+        });
+    });
+
+    // A:
+    // [ 0, -1, -2, -3 ]
+    // [ 1, 0, -1, -2 ]
+    // [ 2, 1, 0, -1 ]
+    // [ 3, 2, 1, 0 ]
+
+    const int cacheSize = 2;
+    auto cache = MakeMatrix(N, N, "cache");
+
+    Index i("i"), j("j");
+    LoopNest loop({ { i, { 0, N } },
+                    { j, { 0, N } } });
+    auto computeKernel = Kernel("compute")
+                             .Inputs(A.GetValue())
+                             .Indices(i, j)
+                             .Define(addOne);
+
+    auto [iOuter, iInner] = loop.Split(i, cacheSize);
+    auto [jOuter, jInner] = loop.Split(j, cacheSize);
+
+    auto initCacheKernel = Kernel("cache")
+                               .Inputs(A.GetValue(), cache.GetValue())
+                               .Indices(iOuter, jOuter)
+                               .Define(copyToCache);
+
+    auto copybackKernel = Kernel("uncache")
+                              .Inputs(A.GetValue(), cache.GetValue())
+                              .Indices(iOuter, jOuter)
+                              .Define(copyFromCache);
+
+    // inside the iInner and jInner loops (and any loops nested within them), "cache" is used instead of "A"
+    loop.RenameVariable(A, cache, { iInner, jInner });
+
+    loop.AddKernel(initCacheKernel, LoopFragmentType::prologue);
+    loop.AddKernel(computeKernel, LoopNest::ConstraintType::constraint);
+    loop.AddKernel(copybackKernel, LoopFragmentType::epilogue);
+
+    CodeGenerator generator;
+    generator.Run(loop);
+
+#if 0 // DEBUGGING
+    PrintLoops(loop, "CachedMatrix_test1");
+
+    InvokeForContext([&](auto&) {
+        ForRange(N, [&](Scalar i) {
+            ForRange(N, [&](Scalar j) {
+                auto val = A(i, j).Get();
+                Log() << std::setw(4) << val;
+            });
+            Log() << EOL;
+        });
+    });
+#endif
+    return A(2, 0) + A(0, 2) - 2; // will return 0 if calculation is correct
+}
+
+// This test adds 1 to each element in a 4x4 matrix, but does all the work on a cached piece of the matrix.
+// The i and j dimensions are subdivided into 2x2 tiles, then each tile is copied into the cache, operated on,
+// and copied back.
+// In this version of the test, the cache is the same size as the original matrix. The next test shows a more useful
+// scenario, where the cache is the size of a single tile.
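+//
+// A minimal plain-C++ sketch of roughly the schedule described above (illustrative only: raw int
+// arrays, fixed N = 4 and cacheSize = 2, hypothetical helper name; kept under #if 0 so it is not compiled):
+#if 0
+void CachedAddOneFullSizeCacheSketch(int A[4][4])
+{
+    const int N = 4;
+    const int cacheSize = 2;
+    int cache[4][4]; // full-size cache: tile (io, jo) of "cache" shadows the same tile of A
+    for (int io = 0; io < N; io += cacheSize)
+    {
+        for (int jo = 0; jo < N; jo += cacheSize)
+        {
+            // prologue kernel ("cache"): copy the current tile of A into the cache
+            for (int ii = 0; ii < cacheSize; ++ii)
+                for (int ji = 0; ji < cacheSize; ++ji)
+                    cache[io + ii][jo + ji] = A[io + ii][jo + ji];
+
+            // compute kernel: RenameVariable redirects reads/writes of "A" to "cache" here
+            for (int ii = 0; ii < cacheSize; ++ii)
+                for (int ji = 0; ji < cacheSize; ++ji)
+                    cache[io + ii][jo + ji] += 1;
+
+            // epilogue kernel ("uncache"): copy the processed tile back into A
+            for (int ii = 0; ii < cacheSize; ++ii)
+                for (int ji = 0; ji < cacheSize; ++ji)
+                    A[io + ii][jo + ji] = cache[io + ii][jo + ji];
+        }
+    }
+}
+#endif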
+Scalar CachedMatrix_test1_new()
+{
+    const int N = 4;
+    // auto A = MakeMatrix(N, N);
+    // A.GetValue().SetName("A"); // BUG(?): this doesn't work because GetValue() returns a copy
+    auto A = MakeMatrix(N, N, "A");
+
+    ForRange(N, [&](Scalar i) {
+        ForRange(N, [&](Scalar j) {
+            A(i, j) = i - j;
+        });
+    });
+
+    // A:
+    // [ 0, -1, -2, -3 ]
+    // [ 1, 0, -1, -2 ]
+    // [ 2, 1, 0, -1 ]
+    // [ 3, 2, 1, 0 ]
+
+    const int cacheSize = 2;
+    auto cache = MakeMatrix(N, N, "cache");
+
+    Index i("i"), j("j");
+    LoopNest loop({ { i, { 0, N } },
+                    { j, { 0, N } } });
+    auto computeKernel = Kernel("compute")
+                             .Inputs(A.GetValue())
+                             .Indices(i, j)
+                             .Define(addOne);
+
+    auto [iOuter, iInner] = loop.Split(i, cacheSize);
+    auto [jOuter, jInner] = loop.Split(j, cacheSize);
+
+    auto initCacheKernel = Kernel("cache")
+                               .Inputs(A.GetValue(), cache.GetValue())
+                               .Indices(iOuter, jOuter)
+                               .Define(copyToCache);
+
+    auto copybackKernel = Kernel("uncache")
+                              .Inputs(A.GetValue(), cache.GetValue())
+                              .Indices(iOuter, jOuter)
+                              .Define(copyFromCache);
+
+    // inside the iInner and jInner loops (and any loops nested within them), "cache" is used instead of "A"
+    loop.RenameVariable(A, cache, { iInner, jInner });
+
+    // loop.AddKernel(initCacheKernel, {}, Before(iInner) || Before(jInner));
+    loop.AddKernel(initCacheKernel, {}, Before(iInner));
+    loop.AddKernel(computeKernel, LoopNest::ConstraintType::predicate);
+    loop.AddKernel(copybackKernel, {}, After(iInner) || After(jInner));
+
+    CodeGenerator generator;
+    generator.Run(loop);
+
+#if 0 // DEBUGGING
+    PrintLoops(loop, "CachedMatrix_test1_new");
+#endif
+
+#if 0 // DEBUGGING
+    InvokeForContext([&](auto&) {
+        ForRange(N, [&](Scalar i) {
+            ForRange(N, [&](Scalar j) {
+                auto val = A(i, j).Get();
+                Log() << std::setw(4) << val;
+            });
+            Log() << EOL;
+        });
+    });
+#endif
+    return A(2, 0) + A(0, 2) - 2; // will return 0 if calculation is correct
+}
+
+// This test adds 1 to each element in a 4x4 matrix, but does all the work on a cached piece of the matrix.
+// The i and j dimensions are subdivided into 2x2 tiles, then each tile is copied into the cache, operated on,
+// and copied back.
+// The cache here is a 2x2 matrix that gets reused for each tile. In this version of the test, we need to add the
+// compute kernel after the loops are split, so that we can have it use the inner tile indices instead of
+// the full matrix indices.
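+//
+// A minimal plain-C++ sketch of the schedule described above (illustrative only: raw int arrays,
+// fixed N = 4 and tile = 2, hypothetical helper name; kept under #if 0 so it is not compiled):
+#if 0
+void CachedAddOneSmallCacheSketch(int A[4][4])
+{
+    const int N = 4;
+    const int tile = 2;
+    int cache[2][2]; // a single tile-sized cache, reused for every (io, jo) tile
+    for (int io = 0; io < N; io += tile)
+    {
+        for (int jo = 0; jo < N; jo += tile)
+        {
+            // prologue kernel ("init"): copy tile (io, jo) of A into the cache
+            for (int ii = 0; ii < tile; ++ii)
+                for (int ji = 0; ji < tile; ++ji)
+                    cache[ii][ji] = A[io + ii][jo + ji];
+
+            // compute kernel: note the tile-relative indices (ii, ji)
+            for (int ii = 0; ii < tile; ++ii)
+                for (int ji = 0; ji < tile; ++ji)
+                    cache[ii][ji] += 1;
+
+            // epilogue kernel ("copyback"): copy the cache back into tile (io, jo) of A
+            for (int ii = 0; ii < tile; ++ii)
+                for (int ji = 0; ji < tile; ++ji)
+                    A[io + ii][jo + ji] = cache[ii][ji];
+        }
+    }
+}
+#endif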
+Scalar CachedMatrix_test2()
+{
+    // Create the 'A' matrix
+    const int N = 4;
+    auto A = MakeMatrix(N, N, "A");
+
+    // Create the small cache matrix
+    const int cacheSize = 2;
+    auto cache = MakeMatrix(cacheSize, cacheSize, "cache");
+
+    // Initialize A to this matrix:
+    // [ 0, -1, -2, -3 ]
+    // [ 1, 0, -1, -2 ]
+    // [ 2, 1, 0, -1 ]
+    // [ 3, 2, 1, 0 ]
+    ForRange(N, [&](Scalar i) {
+        ForRange(N, [&](Scalar j) {
+            A(i, j) = i - j;
+        });
+    });
+
+    // Create a loop nest to iterate over A's domain
+    Index i("i"), j("j");
+    LoopNest loop({ { i, { 0, N } },
+                    { j, { 0, N } } });
+
+    // Split the loops into tiles the size of the cache
+    auto [iOuter, iInner] = loop.Split(i, cacheSize);
+    auto [jOuter, jInner] = loop.Split(j, cacheSize);
+
+    // Tell the loop nest that kernels that run on the individual tiles should use 'cache' in place of 'A'
+    loop.RenameVariable(A, cache, { iInner, jInner });
+
+    // Add the code to initialize the cache with a tile of 'A'
+    auto initCacheKernel = Kernel("init")
+                               .Inputs(A.GetValue(), cache.GetValue())
+                               .Indices(iOuter, jOuter)
+                               .Define(copyToSmallCache);
+
+    // We use "prologue" as the placement to tell the system to run this kernel before the inner tile loops
+    loop.AddKernel(initCacheKernel, LoopFragmentType::prologue);
+
+    // Add the compute kernel, using the inner, tile-relative indices
+    auto kernel = Kernel("kernel")
+                      .Inputs(A.GetValue())
+                      .Indices(iInner, jInner)
+                      .Define(addOne);
+    loop.AddKernel(kernel, LoopNest::ConstraintType::constraint);
+
+    // ...and the code to copy the processed data back from the kernel into 'A'
+    auto copybackKernel = Kernel("copyback")
+                              .Inputs(A.GetValue(), cache.GetValue())
+                              .Indices(iOuter, jOuter)
+                              .Define(copyFromSmallCache);
+
+    // Here, we use "epilogue" as the placement to tell the system to run this kernel after the inner tile loops
+    loop.AddKernel(copybackKernel, LoopFragmentType::epilogue);
+
+    CodeGenerator generator;
+    generator.Run(loop);
+
+#if 0 // DEBUGGING
+    InvokeForContext([&](auto&) {
+        ForRange(N, [&](Scalar i) {
+            ForRange(N, [&](Scalar j) {
+                auto val = A(i, j).Get();
+                Log() << std::setw(4) << val;
+            });
+            Log() << EOL;
+        });
+    });
+#endif
+    return A(2, 0) + A(0, 2) - 2; // will return 0 if calculation is correct
+}
+
+// This test adds 1 to each element in a 4x4 matrix, but does all the work on a cached piece of the matrix.
+// The i and j dimensions are subdivided into 2x2 tiles, then each tile is copied into the cache, operated on,
+// and copied back.
+// The cache here is a 2x2 matrix that gets reused for each tile. In this version of the test, we need to add the
+// compute kernel after the loops are split, so that we can have it use the inner tile indices instead of
+// the full matrix indices.
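+// (For reference, the emitted tile loops are identical to those sketched above CachedMatrix_test2;
+// the two tests differ only in the kernel binding. Illustrative contrast, using the helpers from these tests:
+//     test2: Kernel("kernel").Inputs(A.GetValue()).Indices(iInner, jInner), plus loop.RenameVariable(A, cache, { iInner, jInner })
+//     test3: Kernel("kernel").Inputs(cache.GetValue()).Indices(iInner, jInner), binding the cache directly with no rename needed.)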
+// +// This version differs from CachedMatrix_test2 only in how the cached matrix is given to the kernel +Scalar CachedMatrix_test3() +{ + const int N = 4; + + // Create and initialize the 'A' matrix + auto A = MakeMatrix(N, N, "A"); + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + A(i, j) = i - j; + }); + }); + + // Create the small cache matrix + const int cacheSize = 2; + auto cache = MakeMatrix(cacheSize, cacheSize, "cache"); + + // Create a loop nest to iterate over A's domain + Index i("i"), j("j"); + LoopNest loop({ { i, { 0, N } }, + { j, { 0, N } } }); + + // Split the loops into tiles the size of the cache + auto [iOuter, iInner] = loop.Split(i, cacheSize); + auto [jOuter, jInner] = loop.Split(j, cacheSize); + + // Add the code to initialize the cache with a tile of 'A' + auto initCacheKernel = Kernel("init") + .Inputs(A.GetValue(), cache.GetValue()) + .Indices(iOuter, jOuter) + .Define(copyToSmallCache); + + // We use "prologue" as the placement to tell the system to run this kernel before the inner tile loops + loop.AddKernel(initCacheKernel, LoopFragmentType::prologue); + + // Add the compute kernel, using the inner, tile-relative indices, and the cached matrix + auto kernel = Kernel("kernel") + .Inputs(cache.GetValue()) + .Indices(iInner, jInner) + .Define(addOne); + loop.AddKernel(kernel, LoopNest::ConstraintType::constraint); + + // ...and the code to copy the processed data back from the kernel into 'A' + auto copybackKernel = Kernel("copyback") + .Inputs(A.GetValue(), cache.GetValue()) + .Indices(iOuter, jOuter) + .Define(copyFromSmallCache); + + // Here, we use "epilogue" as the placement to tell the system to run this kernel after the inner tile loops + loop.AddKernel(copybackKernel, LoopFragmentType::epilogue); + + CodeGenerator generator; + generator.Run(loop); + +#if 0 // DEBUGGING + InvokeForContext([&](auto&) { + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + auto val = A(i, j).Get(); + Log() << std::setw(4) << val; + }); + Log() << EOL; + }); + }); +#endif + return A(2, 0) + A(0, 2) - 2; // will return 0 if calculation is correct +} + +// This test does an element-wise sum of two 4x4 matrices, storing the result in the left matrix, +// where the left matrix is cached in 2x2 tiles and the right matrix is not. +// The i and j dimensions are subdivided into 2x2 tiles, then each tile of the left matrix is copied into the cache, +// operated on with the right matrix, and copied back. +// The cache here is a 2x2 matrix that gets reused for each tile. 
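+// The body of the schedule, in plain C++ (illustrative only; the prologue/epilogue tile copies are
+// as in the sketches above, with cacheA holding tile (io, jo) of A):
+//     for (int ii = 0; ii < tile; ++ii)
+//         for (int ji = 0; ji < tile; ++ji)
+//             cacheA[ii][ji] += B[io + ii][jo + ji]; // tile-relative (ii, ji) for the cache, global (io + ii, jo + ji) for B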
+// In this version, we need to pass in both the split indices and the global indices into the kernel since one matrix +// is a cache using the split indices, while the other is uncached and needs the global indices +Scalar CachedMatrix_test4() +{ + const int N = 4; + const int cacheSize = 2; + auto A = MakeMatrix(N, N, "A"); + auto B = MakeMatrix(N, N, "B"); + auto expected = MakeMatrix(N, N); + auto cacheA = MakeMatrix(cacheSize, cacheSize, "cacheA"); + + // Initialize the 'A' matrix + // A: + // [ 0, -1, -2, -3 ] + // [ 1, 0, -1, -2 ] + // [ 2, 1, 0, -1 ] + // [ 3, 2, 1, 0 ] + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + A(i, j) = i - j; + }); + }); + + // Initialize the 'B' matrix + // B: + // [ 0, 1, 2, 3 ] + // [ 1, 2, 3, 4 ] + // [ 2, 3, 4, 5 ] + // [ 3, 4, 5, 6 ] + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + B(i, j) = i + j; + }); + }); + + // Initialize the 'expected' matrix = A + B + // A + B: + // [ 0, 0, 0, 0 ] + // [ 2, 2, 2, 2 ] + // [ 4, 4, 4, 4 ] + // [ 6, 6, 6, 6 ] + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + expected(i, j) = (i - j) + (i + j); + }); + }); + + // Create a loop nest to iterate over A's and B's domains + Index i("i"), j("j"); + LoopNest loop({ { i, { 0, N } }, + { j, { 0, N } } }); + + // Split the loops into tiles the size of the cache + auto [iOuter, iInner] = loop.Split(i, cacheSize); + auto [jOuter, jInner] = loop.Split(j, cacheSize); + + // Add the code to initialize the cache with a tile of 'A' + auto initCacheKernel = Kernel("init") + .Inputs(A.GetValue(), cacheA.GetValue()) + .Indices(iOuter, jOuter) + .Define(copyToSmallCache); + + // We use "prologue" as the placement to tell the system to run this kernel before the inner tile loops + loop.AddKernel(initCacheKernel, LoopFragmentType::prologue); + + // Add the compute kernel, using the inner, tile-relative indices, and the cached matrix + auto kernel = Kernel("kernel") + .Inputs(cacheA.GetValue(), B.GetValue()) + .Indices(iInner, jInner, i, j) + .Define(addCachedMatrixToUnchachedMatrix); + CodePositionConstraints constraints{ LoopFragmentType::body, { iInner, jInner }, {} }; + loop.AddKernel(kernel, constraints); + + // ...and the code to copy the processed data back from the kernel into 'A' + auto copybackKernel = Kernel("copyback") + .Inputs(A.GetValue(), cacheA.GetValue()) + .Indices(iOuter, jOuter) + .Define(copyFromSmallCache); + + // Here, we use "epilogue" as the placement to tell the system to run this kernel after the inner tile loops + loop.AddKernel(copybackKernel, LoopFragmentType::epilogue); + + CodeGenerator generator; + generator.Run(loop); + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + If( + VerifySame(A, expected) == 0, + [&] { + ok = 0; + }) + .Else([&] { + auto value = A.GetValue(); + value.SetLayout({ { (int)A.Size() } }); + DebugPrintVector(value); + DebugPrint("\n"); + }); + return ok; +} + +// This test does an element-wise sum of two 4x4 matrices, storing the result in the left matrix, +// where the left matrix is cached in 2x2 tiles and the right matrix is not. +// The i and j dimensions are subdivided into 2x2 tiles, then each tile of the left matrix is copied into the cache, +// operated on with the right matrix, and copied back. +// The cache here is a 2x2 matrix that gets reused for each tile. 
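+// In the unrolled variant below, the body runs once per 2x2 tile at the outer (io, jo) level and
+// updates the whole panel with straight-line code (plain-C++ illustration only):
+//     cacheA[0][0] += B[io + 0][jo + 0];  cacheA[0][1] += B[io + 0][jo + 1];
+//     cacheA[1][0] += B[io + 1][jo + 0];  cacheA[1][1] += B[io + 1][jo + 1];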
+// In this version, we need to pass in both the split indices and the global indices into the kernel since one matrix +// is a cache using the split indices, while the other is uncached and needs the global indices +// +// The difference with the previous test is that the kernel is unrolled and operates on a panel rather than individual indices +Scalar CachedMatrix_test5() +{ + const int N = 4; + const int cacheSize = 2; + auto A = MakeMatrix(N, N, "A"); + auto B = MakeMatrix(N, N, "B"); + auto expected = MakeMatrix(N, N); + auto cacheA = MakeMatrix(cacheSize, cacheSize, "cacheA"); + + // Initialize the 'A' matrix + // A: + // [ 0, -1, -2, -3 ] + // [ 1, 0, -1, -2 ] + // [ 2, 1, 0, -1 ] + // [ 3, 2, 1, 0 ] + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + A(i, j) = i - j; + }); + }); + + // Initialize the 'B' matrix + // B: + // [ 0, 1, 2, 3 ] + // [ 1, 2, 3, 4 ] + // [ 2, 3, 4, 5 ] + // [ 3, 4, 5, 6 ] + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + B(i, j) = i + j; + }); + }); + + // A + B: + // [ 0, 0, 0, 0 ] + // [ 2, 2, 2, 2 ] + // [ 4, 4, 4, 4 ] + // [ 6, 6, 6, 6 ] + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + expected(i, j) = i * 2; + }); + }); + + // Create a loop nest to iterate over A's and B's domains + Index i("i"), j("j"); + LoopNest loop({ { i, { 0, N } }, + { j, { 0, N } } }); + + // Split the loops into tiles the size of the cache + auto [iOuter, iInner] = loop.Split(i, cacheSize); + auto [jOuter, jInner] = loop.Split(j, cacheSize); + + // Add the code to initialize the cache with a tile of 'A' + auto initCacheKernel = Kernel("init") + .Inputs(A.GetValue(), cacheA.GetValue()) + .Indices(iOuter, jOuter) + .Define(copyToSmallCache); + + // We use "prologue" as the placement to tell the system to run this kernel before the inner tile loops + loop.AddKernel(initCacheKernel, LoopFragmentType::prologue); + + // Add the compute kernel, using the inner, tile-relative indices, and the cached matrix + auto kernel = Kernel("kernel") + .Inputs(cacheA.GetValue(), B.GetValue()) + .Indices(iOuter, jOuter, i, j) + .Define(addCachedMatrixToUnchachedMatrixUnrolled); + CodePositionConstraints constraints{ LoopFragmentType::body, { iOuter, jOuter }, {} }; + loop.AddKernel(kernel, constraints); + + // ...and the code to copy the processed data back from the kernel into 'A' + auto copybackKernel = Kernel("copyback") + .Inputs(A.GetValue(), cacheA.GetValue()) + .Indices(iOuter, jOuter) + .Define(copyFromSmallCache); + + // Here, we use "epilogue" as the placement to tell the system to run this kernel after the inner tile loops + loop.AddKernel(copybackKernel, LoopFragmentType::epilogue); + + CodeGenerator generator; + generator.Run(loop); + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + If( + VerifySame(A, expected) == 0, + [&] { + ok = 0; + }) + .Else([&] { + auto value = A.GetValue(); + value.SetLayout({ { (int)A.Size() } }); + DebugPrintVector(value); + DebugPrint("\n"); + }); + return ok; +} + +Scalar LoopNest_Parallelized_test1() +{ + Scalar ok = Allocate(ScalarLayout); + auto matrix = MakeMatrix(4, 5); + InvokeForContext([&] { + auto v = matrix.GetValue().Get().GetDataAs(); + v->setName("matrix"); + }); + loopnests::IndexRange i("i", { 0, 4 }), j("j", { 0, 5 }); + + auto kernel = loopnests::Kernel("kernel") + .Inputs(matrix.GetValue()) + .Indices(i.GetIndex(), j.GetIndex()) + .Define([](Matrix m, Scalar i, Scalar j) { + Scalar tid = GetTID(); +#if 0 // Useful to turn off/on for debugging + InvokeForContext([&](auto&) { + auto iInt = i.Get(); + 
auto jInt = j.Get(); + Log() << "m(" << iInt << ", " << jInt << ") = " << (iInt * 2 + jInt * 5) + << " [Thread " << tid.Get() << "]" + << EOL; + }); +#endif // 1 + m(i, j) = i * 2 + j * 5; + }); + + loopnests::LoopNest loop(std::vector{ i, j }); + loop.AddKernel(kernel, LoopNest::ConstraintType::predicate); + loop.Parallelize(i.GetIndex(), 2); + + loopnests::CodeGenerator generator; + generator.Run(loop); + + ok = matrix(2, 3) - 19; + return ok; // will return 0 if calculation is correct +} + +Scalar LoopNest_Parallelized_test2() +{ + Scalar ok = Allocate(ScalarLayout); + + auto matrix = MakeMatrix(4, 5); + loopnests::IndexRange i("i", { 0, 4 }), j("j", { 0, 5 }); + + auto kernel = loopnests::Kernel("kernel") + .Inputs(matrix.GetValue()) + .Indices(i.GetIndex(), j.GetIndex()) + .Define([](Matrix m, Scalar i, Scalar j) { + Scalar tid = GetTID(); +#if 0 // Useful to turn off/on for debugging + InvokeForContext([&](auto&) { + auto iInt = i.Get(); + auto jInt = j.Get(); + Log() << "m(" << iInt << ", " << jInt << ") = " << tid.Get() + << " [Thread " << tid.Get() << "]" << EOL; + }); +#endif // 1 + m(i, j) = tid; + }); + + loopnests::LoopNest loop(std::vector{ i, j }); + loop.AddKernel(kernel, LoopNest::ConstraintType::predicate); + loop.Parallelize(i.GetIndex(), 2); + + loopnests::CodeGenerator generator; + generator.Run(loop); + + auto expected = MakeMatrix(4, 5); + If( + VerifySame(matrix, expected) == 0, + [&] { + ok = 1; + }) + .Else([&] { + auto value = matrix.GetValue(); + value.SetLayout({ { (int)matrix.Size() } }); + DebugPrintVector(value); + DebugPrint("\n"); + }); + return ok; +} + +Scalar LoopNest_Unrolled_test1() +{ + auto matrix = MakeMatrix(20, 5); + IndexRange i("i", { 0, 20 }), j("j", { 0, 5 }); + + auto kernel = Kernel("kernel") + .Inputs(matrix.GetValue()) + .Indices(i.GetIndex(), j.GetIndex()) + .Define(loopnest_kernel); + + LoopNest loop(std::vector{ i, j }); + loop.AddKernel(kernel, LoopNest::ConstraintType::predicate); + loop.Unroll(j.GetIndex()); + + CodeGenerator generator; + generator.Run(loop); + + return matrix(2, 3) - 19; // will return 0 if calculation is correct +} + +Scalar LoopNest_DebugDump_test1() +{ + auto matrix = MakeMatrix(4, 5); + Index i("i"), j("j"); + LoopNest loop({ { i, { 0, 4 } }, + { j, { 0, 5 } } }); + + auto kernel = Kernel("kernel") + .Inputs(matrix.GetValue()) + .Indices(i, j) + .Define(loopnest_kernel); + + loop.AddKernel(kernel, LoopNest::ConstraintType::predicate); + loop.Split(i, 2); + loop.Unroll(j); + loop.SetLoopOrder({ i, j, i }); + + InvokeForContext([&](auto&) { + std::stringstream ss; + DebugDump(loop, "DebugDump test", &ss); + Log() << ss.str() << EOL; + // TODO: verify somehow that the printing worked + }); + + return 0; +} + +Scalar LoopNest_DebugDump_test2() +{ + const int N = 8; + auto matrix = MakeMatrix(N, N); + Index i("i"), j("j"); + LoopNest loop({ { i, { 0, N } }, + { j, { 0, N } } }); + + auto kernel = Kernel("kernel") + .Inputs(matrix.GetValue()) + .Indices(i, j) + .Define(loopnest_kernel); + + loop.AddKernel(kernel, LoopNest::ConstraintType::predicate); + SplitAndSetOrder(loop, { i, j }, { 4, 2 }, "ijij"); + + InvokeForContext([&](auto&) { + std::stringstream ss; + DebugDump(loop, "DebugDump test", &ss); + Log() << ss.str() << EOL; + // TODO: verify somehow that the printing worked + }); + + return 0; +} + +Scalar SimpleMatMult_test() +{ + const int N = 8; + auto A = MakeMatrix(N, N, "A"); + auto B = MakeMatrix(N, N, "B"); + auto C = MakeMatrix(N, N, "C"); + auto expected = MakeMatrix(N, N, "expected"); + + // 
initialize A, B, and C + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + A(i, j) = i - j; + B(i, j) = i + 2 * j; + C(i, j) = 100; + }); + }); + + // fill out expected with a simple for-loop gemm + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + ForRange(N, [&](Scalar k) { + expected(i, j) += A(i, k) * B(k, j); + }); + }); + }); + + // The input matrices: + // A: B: C: + // [ 0, -1, -2, -3, -4, -5, -6, -7] [ 0, 2, 4, 6, 8, 10, 12, 14] [ 100 100 100 ... ] + // [ 1, 0, -1, -2, -3, -4, -5, -6] [ 1, 3, 5, 7, 9, 11, 13, 15] [ 100 100 100 ... ] + // [ 2, 1, 0, -1, -2, -3, -4, -5] [ 2, 4, 6, 8, 10, 12, 14, 16] [ ... ] + // [ 3, 2, 1, 0, -1, -2, -3, -4] [ 3, 5, 7, 9, 11, 13, 15, 17] [ ... ] + // [ 4, 3, 2, 1, 0, -1, -2, -3] [ 4, 6, 8, 10, 12, 14, 16, 18] [ ... ] + // [ 5, 4, 3, 2, 1, 0, -1, -2] [ 5, 7, 9, 11, 13, 15, 17, 19] [ ... ] + // [ 6, 5, 4, 3, 2, 1, 0, -1] [ 6, 8, 10, 12, 14, 16, 18, 20] [ ... ] + // [ 7, 6, 5, 4, 3, 2, 1, 0] [ 7, 9, 11, 13, 15, 17, 19, 21] [ ... ] + + // (A * B) (the desired result): + // [-140, -196, -252, -308, -364, -420, -476, -532] + // [-112, -152, -192, -232, -272, -312, -352, -392] + // [ -84, -108, -132, -156, -180, -204, -228, -252] + // [ -56, -64, -72, -80, -88, -96, -104, -112] + // [ -28, -20, -12, -4, 4, 12, 20, 28] + // [ 0, 24, 48, 72, 96, 120, 144, 168] + // [ 28, 68, 108, 148, 188, 228, 268, 308] + // [ 56, 112, 168, 224, 280, 336, 392, 448] + + Index i("i"), j("j"), k("k"); + + LoopNest loop({ { i, { 0, N } }, + { j, { 0, N } }, + { k, { 0, N } } }); + + int cacheARows = 4; + int cacheACols = 4; + int resultCacheRows = 2; + int resultCacheCols = 2; + + auto [i_panel_outer, i_panel_inner] = loop.Split(i, cacheARows); + auto [k_panel_outer, k_panel_inner] = loop.Split(k, cacheACols); + auto [i_kernel_outer, i_kernel_inner] = loop.Split(i, resultCacheRows); + auto [j_kernel_outer, j_kernel_inner] = loop.Split(j, resultCacheCols); + + auto initCKernel = Kernel("init") + .Inputs(C.GetValue()) + .Indices(i, j) + .Define([](Matrix C, Scalar i, Scalar j) { + C(i, j) = 0; + }); + + auto innerKernel = Kernel("matmul") + .Inputs(A.GetValue(), B.GetValue(), C.GetValue()) + .Indices(i, j, k) + .Define([](Matrix A, Matrix B, Matrix C, Scalar i, Scalar j, Scalar k) { + C(i, j) += A(i, k) * B(k, j); + }); + + loopnests::CodePositionConstraints initConstraints{ loopnests::LoopFragmentType::prologue, { i, j }, { k } }; + loop.AddKernel(initCKernel, initConstraints); + loop.AddKernel(innerKernel, LoopNest::ConstraintType::constraint); + + loop.SetLoopOrder({ k, i, j, i, k, j, i }); + + CodeGenerator generator; + generator.Run(loop); + +#if 0 + PrintLoops(loop, "SimpleMatMult_test"); +#endif + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + If( + VerifySame(C, expected) == 0, + [&] { + ok = 0; + }) + .Else([&] { + auto value = C.GetValue(); + value.SetLayout({ { (int)C.Size() } }); + DebugPrintVector(value); + DebugPrint("\n"); + auto expectedValue = expected.GetValue(); + expectedValue.SetLayout({ { (int)expected.Size() } }); + DebugPrintVector(expectedValue); + DebugPrint("\n"); + }); + return ok; +} + +Scalar GotoBLASGemm_LowLevelAPI() +{ + const int N = 8; + auto A = MakeMatrix(N, N, "A"); + auto B = MakeMatrix(N, N, "B"); + auto C = MakeMatrix(N, N, "C"); + auto expected = MakeMatrix(N, N, "expected"); + + // initialize A, B, and C + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + A(i, j) = i - j; + B(i, j) = i + 2 * j; + }); + }); + + // fill out expected with a simple for-loop gemm + ForRange(N, [&](Scalar i) { + ForRange(N, 
[&](Scalar j) { + ForRange(N, [&](Scalar k) { + expected(i, j) += A(i, k) * B(k, j); + }); + }); + }); + + // The input matrices: + // A: B: C: + // [ 0, -1, -2, -3, -4, -5, -6, -7] [ 0, 2, 4, 6, 8, 10, 12, 14] [ 0 0 0 ... ] + // [ 1, 0, -1, -2, -3, -4, -5, -6] [ 1, 3, 5, 7, 9, 11, 13, 15] [ 0 0 0 ... ] + // [ 2, 1, 0, -1, -2, -3, -4, -5] [ 2, 4, 6, 8, 10, 12, 14, 16] [ ... ] + // [ 3, 2, 1, 0, -1, -2, -3, -4] [ 3, 5, 7, 9, 11, 13, 15, 17] [ ... ] + // [ 4, 3, 2, 1, 0, -1, -2, -3] [ 4, 6, 8, 10, 12, 14, 16, 18] [ ... ] + // [ 5, 4, 3, 2, 1, 0, -1, -2] [ 5, 7, 9, 11, 13, 15, 17, 19] [ ... ] + // [ 6, 5, 4, 3, 2, 1, 0, -1] [ 6, 8, 10, 12, 14, 16, 18, 20] [ ... ] + // [ 7, 6, 5, 4, 3, 2, 1, 0] [ 7, 9, 11, 13, 15, 17, 19, 21] [ ... ] + + // (A * B) (the desired result): + // [-140, -196, -252, -308, -364, -420, -476, -532] + // [-112, -152, -192, -232, -272, -312, -352, -392] + // [ -84, -108, -132, -156, -180, -204, -228, -252] + // [ -56, -64, -72, -80, -88, -96, -104, -112] + // [ -28, -20, -12, -4, 4, 12, 20, 28] + // [ 0, 24, 48, 72, 96, 120, 144, 168] + // [ 28, 68, 108, 148, 188, 228, 268, 308] + // [ 56, 112, 168, 224, 280, 336, 392, 448] + + Index i("i"), j("j"), k("k"); + + LoopNest loop({ { i, { 0, N } }, + { j, { 0, N } }, + { k, { 0, N } } }); + + int cacheARows = 4; + int cacheACols = 4; + int cacheBRows = cacheACols; + int cacheBCols = N; + int resultCacheRows = 2; + int resultCacheCols = 2; + + auto [i_panel_outer, i_panel_inner] = loop.Split(i, cacheARows); + auto [k_panel_outer, k_panel_inner] = loop.Split(k, cacheACols); + auto [i_kernel_outer, i_kernel_inner] = loop.Split(i, resultCacheRows); + auto [j_kernel_outer, j_kernel_inner] = loop.Split(j, resultCacheCols); + + auto cachedResult = MakeMatrix(resultCacheRows, resultCacheCols, "cachedResult"); + + auto cacheA = MakeMatrix(cacheARows, cacheACols, "cacheA"); + auto transposeCacheB = MakeMatrix(cacheBCols, cacheBRows, "transposeCacheB"); + + auto cacheAKernel = Kernel("cacheAKernel") + .Inputs(A.GetValue(), cacheA.GetValue()) + .Indices(i_panel_outer, k_panel_outer) + .Define([&](Matrix A, Matrix cache, Scalar iPanel, Scalar kPanel) { + for (int i = 0; i < cacheARows; ++i) + { + for (int k = 0; k < cacheACols; ++k) + { + cache(i, k) = A(iPanel + i, kPanel + k); + } + } + }); + + auto transposeCacheBKernel = Kernel("transposeCacheBKernel") + .Inputs(B.GetValue(), transposeCacheB.GetValue()) + .Indices(k_panel_outer) + .Define([&](Matrix B, Matrix transposeCache, Scalar kPanel) { + for (int k = 0; k < cacheBRows; ++k) + { + for (int j = 0; j < cacheBCols; ++j) + { + transposeCache(j, k) = B(kPanel + k, j); + } + } + }); + + auto innerKernel = Kernel("matmul") + .Inputs(cacheA.GetValue(), transposeCacheB.GetValue(), cachedResult.GetValue()) + .Indices(i, j, k, i_kernel_inner, j_kernel_inner, i_kernel_outer, k_panel_inner) + .Define([](Matrix A, Matrix B, Matrix C, Scalar i, Scalar j, Scalar k, Scalar iInner, Scalar jInner, Scalar iOuter, Scalar kPanelInner) { + C(iInner, jInner) += A(iOuter + iInner, kPanelInner) * B(j, kPanelInner); + }); + + auto clearCacheKernel = Kernel("clearCacheKernel") + .Inputs(cachedResult.GetValue()) + .Indices(i_kernel_outer, j_kernel_outer) + .Define([&](Matrix cache, Scalar iOuter, Scalar jOuter) { + Scalar zeroValue = Allocate(utilities::ScalarLayout); + for (int i = 0; i < resultCacheRows; ++i) + { + for (int j = 0; j < resultCacheCols; ++j) + { + cache(i, j) = zeroValue; + } + } + }); + + auto uncacheKernel = Kernel("uncacheKernel") + .Inputs(C.GetValue(), cachedResult.GetValue()) + 
.Indices(i_panel_outer, i_kernel_outer, j_kernel_outer) + .Define([&](Matrix C, Matrix cache, Scalar iPanelOuter, Scalar iOuter, Scalar jOuter) { + for (int i = 0; i < resultCacheRows; ++i) + { + for (int j = 0; j < resultCacheCols; ++j) + { + C(iPanelOuter + iOuter + i, jOuter + j) += cache(i, j); + } + } + }); + + CodePositionConstraints cacheAConstraint{ LoopFragmentType::prologue, { i_panel_outer, k_panel_outer }, {} }; + loop.AddKernel(cacheAKernel, cacheAConstraint); + + CodePositionConstraints cacheBConstraint{ LoopFragmentType::prologue, { k_panel_outer }, {} }; + loop.AddKernel(transposeCacheBKernel, cacheBConstraint); + + CodePositionConstraints constraint{ LoopFragmentType::body, { k, i_kernel_inner, j_kernel_inner }, {} }; + loop.AddKernel(innerKernel, constraint); + + CodePositionConstraints preConstraint{ LoopFragmentType::prologue, { i_kernel_outer, j_kernel_outer }, {} }; + loop.AddKernel(clearCacheKernel, preConstraint); + + CodePositionConstraints postConstraint{ LoopFragmentType::epilogue, { i_kernel_outer, j_kernel_outer }, {} }; + loop.AddKernel(uncacheKernel, postConstraint); + loop.SetLoopOrder({ k, i, j, i, k, j, i }); + + loop.Unroll(i_kernel_inner); + loop.Unroll(j_kernel_inner); + CodeGenerator generator; + generator.Run(loop); + +#if 0 + PrintLoops(loop, "GotoBLASGemm_LowLevelAPI"); +#endif + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + If( + VerifySame(C, expected) == 0, + [&] { + ok = 0; + }) + .Else([&] { + auto value = C.GetValue(); + value.SetLayout({ { (int)C.Size() } }); + DebugPrintVector(value); + DebugPrint("\n"); + auto expectedValue = expected.GetValue(); + expectedValue.SetLayout({ { (int)expected.Size() } }); + DebugPrintVector(expectedValue); + DebugPrint("\n"); + }); + return ok; +} + +Scalar GotoBLASGemmWithRefDeref() +{ + const int N = 8; + auto A = MakeMatrix(N, N, "A"); + auto B = MakeMatrix(N, N, "B"); + auto C = MakeMatrix(N, N, "C"); + auto expected = MakeMatrix(N, N, "expected"); + + // initialize A, B, and C + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + A(i, j) = i - j; + B(i, j) = i + 2 * j; + }); + }); + + // fill out expected with a simple for-loop gemm + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + ForRange(N, [&](Scalar k) { + expected(i, j) += A(i, k) * B(k, j); + }); + }); + }); + + // The input matrices: + // A: B: C: + // [ 0, -1, -2, -3, -4, -5, -6, -7] [ 0, 2, 4, 6, 8, 10, 12, 14] [ 100 100 100 ... ] + // [ 1, 0, -1, -2, -3, -4, -5, -6] [ 1, 3, 5, 7, 9, 11, 13, 15] [ 100 100 100 ... ] + // [ 2, 1, 0, -1, -2, -3, -4, -5] [ 2, 4, 6, 8, 10, 12, 14, 16] [ ... ] + // [ 3, 2, 1, 0, -1, -2, -3, -4] [ 3, 5, 7, 9, 11, 13, 15, 17] [ ... ] + // [ 4, 3, 2, 1, 0, -1, -2, -3] [ 4, 6, 8, 10, 12, 14, 16, 18] [ ... ] + // [ 5, 4, 3, 2, 1, 0, -1, -2] [ 5, 7, 9, 11, 13, 15, 17, 19] [ ... ] + // [ 6, 5, 4, 3, 2, 1, 0, -1] [ 6, 8, 10, 12, 14, 16, 18, 20] [ ... ] + // [ 7, 6, 5, 4, 3, 2, 1, 0] [ 7, 9, 11, 13, 15, 17, 19, 21] [ ... 
]
+
+    // (A * B) (the desired result):
+    // [-140, -196, -252, -308, -364, -420, -476, -532]
+    // [-112, -152, -192, -232, -272, -312, -352, -392]
+    // [ -84, -108, -132, -156, -180, -204, -228, -252]
+    // [ -56, -64, -72, -80, -88, -96, -104, -112]
+    // [ -28, -20, -12, -4, 4, 12, 20, 28]
+    // [ 0, 24, 48, 72, 96, 120, 144, 168]
+    // [ 28, 68, 108, 148, 188, 228, 268, 308]
+    // [ 56, 112, 168, 224, 280, 336, 392, 448]
+
+    Index i("i"), j("j"), k("k");
+
+    loopnests::LoopNest loop({ { i, { 0, N } },
+                               { j, { 0, N } },
+                               { k, { 0, N } } });
+
+    int cacheARows = 4;
+    int cacheACols = 4;
+    int cacheBRows = cacheACols;
+    int cacheBCols = N;
+    int resultCacheRows = 2;
+    int resultCacheCols = 2;
+
+    auto [i_panel_outer, i_panel_inner] = loop.Split(i, cacheARows);
+    auto [k_panel_outer, k_panel_inner] = loop.Split(k, cacheACols);
+    auto [i_kernel_outer, i_kernel_inner] = loop.Split(i, resultCacheRows);
+    auto [j_kernel_outer, j_kernel_inner] = loop.Split(j, resultCacheCols);
+
+    auto cachedResult = MakeMatrix(resultCacheRows, resultCacheCols, "cachedResult");
+
+    auto cacheA = MakeMatrix(cacheARows, cacheACols, "cacheA");
+    auto cacheARef = cacheA.GetValue().Reference();
+
+    auto transposeCacheB = MakeMatrix(cacheBCols, cacheBRows, "transposeCacheB");
+
+    auto cacheAKernel = loopnests::Kernel("cacheAKernel")
+                            .Inputs(A.GetValue(), cacheARef)
+                            .Indices(i_panel_outer, k_panel_outer)
+                            .Define([cacheARows, cacheACols](Matrix A, Value cacheRef, Scalar iPanel, Scalar kPanel) {
+                                auto cache = Matrix(cacheRef.Dereference());
+                                for (int i = 0; i < cacheARows; ++i)
+                                {
+                                    for (int k = 0; k < cacheACols; ++k)
+                                    {
+                                        cache(i, k) = A(iPanel + i, kPanel + k);
+                                    }
+                                }
+                                // Update cacheRef so that the global (i, k) indices index into the correct spot in the cache
+                                auto cacheTmp = cacheRef.Dereference();
+                                cacheTmp.SetLayout(MemoryShape{ cacheARows, cacheACols });
+                                auto cacheTmpOffset = cacheTmp.Offset({ -1 * iPanel, -1 * kPanel });
+                                cacheTmpOffset.SetLayout(MemoryShape{ cacheARows, cacheACols });
+                                cacheRef = cacheTmpOffset.Reference();
+                            });
+
+    auto resetCacheAKernel = loopnests::Kernel("resetCacheAKernel")
+                                 .Inputs(cacheARef)
+                                 .Indices(i_panel_outer, k_panel_outer)
+                                 .Define([cacheARows, cacheACols](Value cacheRef, Scalar iPanel, Scalar kPanel) {
+                                     // Reset cacheRef to point to the cache while we have iPanel and kPanel in hand
+                                     auto offsetCache = cacheRef.Dereference();
+                                     offsetCache.SetLayout(MemoryShape{ cacheARows, cacheACols });
+                                     auto realCache = offsetCache.Offset({ iPanel, kPanel });
+                                     realCache.SetLayout(MemoryShape{ cacheARows, cacheACols });
+                                     cacheRef = realCache.Reference();
+                                 });
+
+    auto transposeCacheBKernel = loopnests::Kernel("transposeCacheBKernel")
+                                     .Inputs(B.GetValue(), transposeCacheB.GetValue())
+                                     .Indices(k_panel_outer)
+                                     .Define([cacheBRows, cacheBCols](Matrix B, Matrix transposeCache, Scalar kPanel) {
+                                         for (int k = 0; k < cacheBRows; ++k)
+                                         {
+                                             for (int j = 0; j < cacheBCols; ++j)
+                                             {
+                                                 transposeCache(j, k) = B(kPanel + k, j);
+                                             }
+                                         }
+                                     });
+
+    auto innerKernel = loopnests::Kernel("matmul")
+                           .Inputs(cacheARef, transposeCacheB.GetValue(), cachedResult.GetValue())
+                           .Indices(i, j, k, i_kernel_inner, j_kernel_inner, i_kernel_outer, k_panel_inner, i_panel_outer, k_panel_outer)
+                           .Define([cacheARows, cacheACols](Value Aref, Matrix B, Matrix C, Scalar i, Scalar j, Scalar k, Scalar iInner, Scalar jInner, Scalar iOuter, Scalar kPanelInner, Scalar iPanel, Scalar kPanel) {
+                               auto offsetA = Aref.Dereference();
+                               offsetA.SetLayout(MemoryShape{ cacheARows, cacheACols });
+                               auto A = Matrix(offsetA);
C(iInner, jInner) += A(i, k) * B(j, kPanelInner); + }); + + auto clearCacheKernel = loopnests::Kernel("clearCacheKernel") + .Inputs(cachedResult.GetValue()) + .Indices(i_kernel_outer, j_kernel_outer) + .Define([resultCacheRows, resultCacheCols](Matrix cache, Scalar iOuter, Scalar jOuter) { + Scalar zeroValue = Allocate(utilities::ScalarLayout); + for (int i = 0; i < resultCacheRows; ++i) + { + for (int j = 0; j < resultCacheCols; ++j) + { + cache(i, j) = zeroValue; + } + } + }); + + auto uncacheKernel = loopnests::Kernel("uncacheKernel") + .Inputs(C.GetValue(), cachedResult.GetValue()) + .Indices(i_panel_outer, i_kernel_outer, j_kernel_outer) + .Define([resultCacheRows, resultCacheCols](Matrix C, Matrix cache, Scalar iPanelOuter, Scalar iOuter, Scalar jOuter) { + for (int i = 0; i < resultCacheRows; ++i) + { + for (int j = 0; j < resultCacheCols; ++j) + { + C(iPanelOuter + iOuter + i, jOuter + j) += cache(i, j); + } + } + }); + + loopnests::CodePositionConstraints cacheAConstraint{ loopnests::LoopFragmentType::prologue, { i_panel_outer, k_panel_outer }, {} }; + loop.AddKernel(cacheAKernel, cacheAConstraint); + + loopnests::CodePositionConstraints resetCacheAConstraint{ loopnests::LoopFragmentType::epilogue, { i_panel_outer, k_panel_outer }, {} }; + loop.AddKernel(resetCacheAKernel, resetCacheAConstraint); + + loopnests::CodePositionConstraints cacheBConstraint{ loopnests::LoopFragmentType::prologue, { k_panel_outer }, {} }; + loop.AddKernel(transposeCacheBKernel, cacheBConstraint); + + loopnests::CodePositionConstraints constraint{ loopnests::LoopFragmentType::body, { k, i_kernel_inner, j_kernel_inner }, {} }; + loop.AddKernel(innerKernel, constraint); + + loopnests::CodePositionConstraints preConstraint{ loopnests::LoopFragmentType::prologue, { i_kernel_outer, j_kernel_outer }, {} }; + loop.AddKernel(clearCacheKernel, preConstraint); + + loopnests::CodePositionConstraints postConstraint{ loopnests::LoopFragmentType::epilogue, { i_kernel_outer, j_kernel_outer }, {} }; + loop.AddKernel(uncacheKernel, postConstraint); + loop.SetLoopOrder({ k, i, j, i, k, j, i }); + + loop.Unroll(i_kernel_inner); + loop.Unroll(j_kernel_inner); + loopnests::CodeGenerator generator; + generator.Run(loop); +#if 0 + PrintLoops(loop, "GotoBLASGemmWithRefDeref"); +#endif + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + If( + VerifySame(C, expected) == 0, + [&] { + ok = 0; + }) + .Else([&] { + auto value = C.GetValue(); + value.SetLayout({ { (int)C.Size() } }); + DebugPrintVector(value); + DebugPrint("\n"); + auto expectedValue = expected.GetValue(); + expectedValue.SetLayout({ { (int)expected.Size() } }); + DebugPrintVector(expectedValue); + DebugPrint("\n"); + }); + return ok; +} + +Scalar YG12LowLevel_TestBoundary() +{ + const int N = 8; + auto A = MakeMatrix(N, N, "A"); + auto B = MakeMatrix(N, N, "B"); + auto C = MakeMatrix(N, N, "C"); + auto expected = MakeMatrix(N, N, "expected"); + + // initialize A, B, and C + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + A(i, j) = i - j; + B(i, j) = i + 2 * j; + }); + }); + + // fill out expected with a simple for-loop gemm + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + ForRange(N, [&](Scalar k) { + expected(i, j) += A(i, k) * B(k, j); + }); + }); + }); + + static const int k_r = 3; + static const int k_c = 4; + + // Cache B into a columnMajor matrix + auto transposeBVal = MakeMatrix(N, N, "transposeB"); + auto transposeB = Matrix(transposeBVal); + + Index transposeK("transposeK"), transposeN("transposeN"); + LoopNest transposeLoop({ { 
transposeK, { 0, N } }, + { transposeN, { 0, N } } }); + static const int transposeRows = N; + static const int transposeCols = N; + + auto [k_transpose_outer, k_transpose_inner] = transposeLoop.Split(transposeK, transposeRows); + auto [n_transpose_outer, n_transpose_inner] = transposeLoop.Split(transposeN, transposeCols); + + auto transposeKernel = Kernel("transpose_kernel") + .Inputs(B.GetValue(), transposeB.GetValue()) + .Indices(transposeK, transposeN) + .Define([](Matrix input, Matrix output, Scalar row, Scalar col) { + output(col, row) = input(row, col); + }); + transposeLoop.Unroll(n_transpose_inner); + transposeLoop.SetLoopOrder({ transposeK, transposeN, transposeK, transposeN }); + loopnests::CodePositionConstraints transposeConstraints{ loopnests::LoopFragmentType::body, { k_transpose_inner, n_transpose_inner }, {} }; + transposeLoop.AddKernel(transposeKernel, loopnests::LoopFragmentType::body); + + loopnests::CodeGenerator transposeGenerator; + transposeGenerator.Run(transposeLoop); + + // Do computation in blocks of k_r x k_c + { + Matrix temp = MakeMatrix(k_r, k_c); + + loopnests::Index m("m"), n("n"), k("k"); + loopnests::LoopNest loop({ { m, { 0, N - (N % k_r) } }, + { n, { 0, N } }, + { k, { 0, N } } }); + + auto [n_block_outer, n_block_inner] = loop.Split(n, 4); + + auto [m_outer, m_inner] = loop.Split(m, k_r); + auto [n_outer, n_inner] = loop.Split(n, k_c); + + auto kernel = loopnests::Kernel("MatrixMatrixMultiplyNode_Kernel") + .Inputs(A.GetValue(), transposeB.GetValue(), C.GetValue(), temp.GetValue()) + .Indices(m, n, m_inner, n_inner, k) + .Define([](Matrix A, Matrix B, Matrix C, Matrix temp, Scalar i, Scalar j, Scalar i_inner, Scalar j_inner, Scalar k) { + temp(i_inner, j_inner) += A(i, k) * B(j, k); + }); + auto kernel2 = loopnests::Kernel("MatrixMatrixMultiplyNode_Reduce") + .Inputs(C.GetValue(), temp.GetValue()) + .Indices(m_outer, n_outer, n_block_outer) + .Define([](Matrix C, Matrix temp, Scalar i, Scalar j, Scalar j_outer) { + For(temp, [&](Scalar i_inner, Scalar j_inner) { + C(i + i_inner, j_outer + j + j_inner) = temp(i_inner, j_inner); + }); + }); + auto kernel3 = loopnests::Kernel("MatrixMatrixMultiplyNode_InitializeCache") + .Inputs(temp.GetValue()) + .Indices(m_outer, n_outer) + .Define([](Matrix temp, Scalar i, Scalar j) { + For(temp, [&](Scalar i_inner, Scalar j_inner) { + temp(i_inner, j_inner) = static_cast(0); + }); + }); + loop.Unroll(m_inner); + loop.Unroll(n_inner); + + loop.Unroll(n_outer); + + loop.SetLoopOrder({ n, m, n, k, n, m }); + loopnests::CodePositionConstraints constraints2{ loopnests::LoopFragmentType::epilogue, { m_outer, n_outer }, {} }; + loopnests::CodePositionConstraints constraints3{ loopnests::LoopFragmentType::prologue, { m_outer, n_outer }, {} }; + + loop.AddKernel(kernel3, constraints3); + loop.AddKernel(kernel, LoopNest::ConstraintType::constraint); + loop.AddKernel(kernel2, constraints2); + loopnests::CodeGenerator generator; +#if 0 + PrintLoops(loop, "YG12_Boundary_test_first"); +#endif + generator.Run(loop); + } + + // Do remainder + { + auto remainderRows = N % k_r; + auto startM = N - remainderRows; + Matrix temp = MakeMatrix(remainderRows, k_c); + + loopnests::Index m("m"), n("n"), k("k"); + loopnests::LoopNest loop({ { m, { startM, N } }, + { n, { 0, N } }, + { k, { 0, N } } }); + + auto [n_block_outer, n_block_inner] = loop.Split(n, 4); + auto [m_outer, m_inner] = loop.Split(m, remainderRows); + auto [n_outer, n_inner] = loop.Split(n, k_c); + + auto kernel = 
loopnests::Kernel("MatrixMatrixMultiplyNode_Kernel_remainder") + .Inputs(A.GetValue(), transposeB.GetValue(), C.GetValue(), temp.GetValue()) + .Indices(m, n, m_inner, n_inner, k) + .Define([](Matrix A, Matrix B, Matrix C, Matrix temp, Scalar i, Scalar j, Scalar i_inner, Scalar j_inner, Scalar k) { + temp(i_inner, j_inner) += A(i, k) * B(j, k); + }); + auto kernel2 = loopnests::Kernel("MatrixMatrixMultiplyNode_Reduce_remainder") + .Inputs(C.GetValue(), temp.GetValue()) + .Indices(m, m_outer, n_outer, n_block_outer) + .Define([](Matrix C, Matrix temp, Scalar i, Scalar i_outer, Scalar j_outer, Scalar j_block_outer) { + For(temp, [&](Scalar i_inner, Scalar j_inner) { + C(i + i_inner, j_block_outer + j_outer + j_inner) = temp(i_inner, j_inner); + }); + }); + auto kernel3 = loopnests::Kernel("MatrixMatrixMultiplyNode_InitializeCache") + .Inputs(temp.GetValue()) + .Indices() + .Define([](Matrix temp) { + For(temp, [&](Scalar i_inner, Scalar j_inner) { + temp(i_inner, j_inner) = static_cast(0); + }); + }); + loop.Unroll(n_inner); + loop.Unroll(n_outer); + + loop.SetLoopOrder({ n, m, n, k, n, m }); + loopnests::CodePositionConstraints constraints2{ loopnests::LoopFragmentType::epilogue, { n_outer, m_outer }, {} }; + loopnests::CodePositionConstraints constraints3{ loopnests::LoopFragmentType::prologue, { n_outer, m_outer }, {} }; + loop.AddKernel(kernel3, constraints3); + loop.AddKernel(kernel, LoopNest::ConstraintType::constraint); + loop.AddKernel(kernel2, constraints2); + loopnests::CodeGenerator generator; +#if 0 + PrintLoops(loop, "YG12_Boundary_test_remainder"); +#endif + generator.Run(loop); + } + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + If( + VerifySame(C, expected) == 0, + [&] { + ok = 0; + }) + .Else([&] { + auto value = C.GetValue(); + value.SetLayout({ { (int)C.Size() } }); + DebugPrintVector(value); + DebugPrint("\n"); + auto expectedValue = expected.GetValue(); + expectedValue.SetLayout({ { (int)expected.Size() } }); + DebugPrintVector(expectedValue); + DebugPrint("\n"); + }); + return ok; +} + +Scalar KernelPredicate_test() +{ + const int M = 8; + const int N = M; + const int K = M; + auto A = MakeMatrix(M, K, "A"); + auto B = MakeMatrix(K, N, "B"); + auto C = MakeMatrix(M, N, "C"); + auto expected = MakeMatrix(M, N, "expected"); + + // initialize A, B, and C + ForRange(M, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + A(i, j) = i - j; + B(i, j) = i + 2 * j; + C(i, j) = 100; + }); + }); + + // fill out expected with a simple for-loop gemm (plus 1) + ForRange(M, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + ForRange(K, [&](Scalar k) { + expected(i, j) += A(i, k) * B(k, j); + }); + expected(i, j) += 1; + }); + }); + + // The input matrices: + // A: B: C: + // [ 0, -1, -2, -3, -4, -5, -6, -7] [ 0, 2, 4, 6, 8, 10, 12, 14] [ 100 100 100 ... ] + // [ 1, 0, -1, -2, -3, -4, -5, -6] [ 1, 3, 5, 7, 9, 11, 13, 15] [ 100 100 100 ... ] + // [ 2, 1, 0, -1, -2, -3, -4, -5] [ 2, 4, 6, 8, 10, 12, 14, 16] [ ... ] + // [ 3, 2, 1, 0, -1, -2, -3, -4] [ 3, 5, 7, 9, 11, 13, 15, 17] [ ... ] + // [ 4, 3, 2, 1, 0, -1, -2, -3] [ 4, 6, 8, 10, 12, 14, 16, 18] [ ... ] + // [ 5, 4, 3, 2, 1, 0, -1, -2] [ 5, 7, 9, 11, 13, 15, 17, 19] [ ... ] + // [ 6, 5, 4, 3, 2, 1, 0, -1] [ 6, 8, 10, 12, 14, 16, 18, 20] [ ... ] + // [ 7, 6, 5, 4, 3, 2, 1, 0] [ 7, 9, 11, 13, 15, 17, 19, 21] [ ... 
] + + // (A * B) (the desired result): + // [-140, -196, -252, -308, -364, -420, -476, -532] + // [-112, -152, -192, -232, -272, -312, -352, -392] + // [ -84, -108, -132, -156, -180, -204, -228, -252] + // [ -56, -64, -72, -80, -88, -96, -104, -112] + // [ -28, -20, -12, -4, 4, 12, 20, 28] + // [ 0, 24, 48, 72, 96, 120, 144, 168] + // [ 28, 68, 108, 148, 188, 228, 268, 308] + // [ 56, 112, 168, 224, 280, 336, 392, 448] + + Index i("i"), j("j"), k("k"); + + LoopNest loop({ { i, { 0, N } }, + { j, { 0, N } }, + { k, { 0, N } } }); + + auto initCKernel = Kernel("init") + .Inputs(C.GetValue()) + .Indices(i, j, k) + .Define([](Matrix C, Scalar i, Scalar j, Scalar k) { + C(i, j) = 0; + }); + + auto innerKernel = Kernel("matmul") + .Inputs(A.GetValue(), B.GetValue(), C.GetValue()) + .Indices(i, j, k) + .Define([](Matrix A, Matrix B, Matrix C, Scalar i, Scalar j, Scalar k) { + C(i, j) += A(i, k) * B(k, j); + }); + + auto postKernel = Kernel("addone") + .Inputs(A.GetValue(), B.GetValue(), C.GetValue()) + .Indices(i, j, k) + .Define([](Matrix A, Matrix B, Matrix C, Scalar i, Scalar j, Scalar k) { + C(i, j) += 1; + }); + + loop.AddKernel(initCKernel, { First(k) }); + loop.AddKernel(innerKernel, LoopNest::ConstraintType::predicate); + loop.AddKernel(postKernel, { Last(k) }); + +#if 1 + auto [i_panel_outer, i_panel_inner] = loop.Split(i, 2); + auto [j_kernel_outer, j_kernel_inner] = loop.Split(j, 4); + auto [k_panel_outer, k_panel_inner] = loop.Split(k, 2); + + loop.SetLoopOrder({ k, j, i, j, i, k }); +#endif + +#if 0 + PrintLoops(loop, "KernelPredicate_test"); +#endif + + CodeGenerator generator; + generator.Run(loop); + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + + If( + VerifySame(C, expected) == 0, + [&] { + ok = 0; + }) + .Else([&] { +#if 0 + auto value = C.GetValue(); + value.SetLayout({ { (int)C.Size() } }); + DebugPrintVector(value); + DebugPrint("\n"); + auto expectedValue = expected.GetValue(); + expectedValue.SetLayout({ { (int)expected.Size() } }); + DebugPrintVector(expectedValue); + DebugPrint("\n"); +#endif + }); + + return ok; +} + +Scalar MatMul3_test1() +{ + auto p = GetMatMul3TestCaseParameters(8, 8, 8, 8); + const auto M = p.M; + const auto N = p.N; + const auto K = p.K; + const auto L = p.L; + + Index i("i"), j("j"), k("k"), l("l"); + LoopNest loop({ { i, { 0, M } }, + { j, { 0, N } }, + { k, { 0, K } }, + { l, { 0, L } } }); + + auto initCKernel = Kernel("initC") + .Inputs(p.C.GetValue()) + .Indices(i, j) + .Define([](Matrix C, Scalar i, Scalar j) { + C(i, j) = 0; + }); + + auto computeCKernel = Kernel("matmulC") + .Inputs(p.A.GetValue(), p.B.GetValue(), p.C.GetValue()) + .Indices(i, j, k) + .Define([](Matrix A, Matrix B, Matrix C, Scalar i, Scalar j, Scalar k) { + C(i, j) += A(i, k) * B(k, j); + }); + + auto initEKernel = Kernel("initE") + .Inputs(p.E.GetValue()) + .Indices(i, l) + .Define([](Matrix E, Scalar i, Scalar l) { + E(i, l) = 0; + }); + + auto computeEKernel = Kernel("matmulE") + .Inputs(p.C.GetValue(), p.D.GetValue(), p.E.GetValue()) + .Indices(i, j, l) + .Define([](Matrix C, Matrix D, Matrix E, Scalar i, Scalar j, Scalar l) { + E(i, l) += C(i, j) * D(j, l); + }); + + loop.AddKernel(initCKernel, { First(k) && First(l) }); + loop.AddKernel(computeCKernel, { First(l) }); + + loop.AddKernel(initEKernel, { Last(k) && First(j) }); + loop.AddKernel(computeEKernel, { Last(k) }); + +#if 0 + PrintLoops(loop, "MatMul3_test1"); +#endif + + CodeGenerator generator; + generator.Run(loop); + + return VerifySame(p.E, p.expectedE); +} + +Scalar MatMul3_test2() +{ + auto p = 
GetMatMul3TestCaseParameters(8, 8, 8, 8); + const auto M = p.M; + const auto N = p.N; + const auto K = p.K; + const auto L = p.L; + + Index i("i"), j("j"), k("k"), l("l"); + LoopNest loop({ { i, { 0, M } }, + { j, { 0, N } }, + { k, { 0, K } }, + { l, { 0, L } } }); + + int stepI = 4; + int stepJ = 4; + // int stepK = 4; + auto [iOuter, iInner] = loop.Split(i, stepI); + auto [jOuter, jInner] = loop.Split(j, stepJ); + // auto [kOuter, kInner] = loop.Split(k, stepK); + + loop.SetLoopOrder({ iOuter, jOuter, k, l, iInner, jInner }); + + auto initCKernel = Kernel("initC") + .Inputs(p.C.GetValue()) + .Indices(i, j) + .Define([&](Matrix C, Scalar i, Scalar j) { + ForRange(stepI, [&](Scalar ii) { + ForRange(stepJ, [&](Scalar jj) { + C(i + ii, j + jj) = 0; + }); + }); + }); + + auto computeCKernel = Kernel("matmulC") + .Inputs(p.A.GetValue(), p.B.GetValue(), p.C.GetValue()) + .Indices(i, j, k) + .Define([&](Matrix A, Matrix B, Matrix C, Scalar i, Scalar j, Scalar k) { + // accum into C(I,J) via GEMM + ForRange(stepI, [&](Scalar ii) { + ForRange(stepJ, [&](Scalar jj) { + C(i + ii, j + jj) += A(i + ii, k) * B(k, j + jj); + }); + }); + }); + + auto initEKernel = Kernel("initE") + .Inputs(p.E.GetValue()) + .Indices(i, l) + .Define([&](Matrix E, Scalar i, Scalar l) { + ForRange(stepI, [&](Scalar ii) { + E(i + ii, l) = 0; + }); + }); + + auto computeEKernel = Kernel("matmulE") + .Inputs(p.C.GetValue(), p.D.GetValue(), p.E.GetValue()) + .Indices(i, j, l) + .Define([&](Matrix C, Matrix D, Matrix E, Scalar i, Scalar j, Scalar l) { + ForRange(stepI, [&](Scalar ii) { + ForRange(stepJ, [&](Scalar jj) { + // accum into E(I,L) via GEMM + E(i + ii, l) += C(i + ii, j + jj) * D(j + jj, l); + }); + }); + }); + + loop.AddKernel(initCKernel, { First(k) && First(l) }, { Before(iInner) || Before(jInner) }); + loop.AddKernel(computeCKernel, { First(l) }, { Before(iInner) || Before(jInner) }); + + loop.AddKernel(initEKernel, { Last(k) && First(j) }, { Before(iInner) || Before(jInner) }); + loop.AddKernel(computeEKernel, { Last(k) }, { Before(iInner) || Before(jInner) }); + +#if 0 + PrintLoops(loop, "MatMul3_test2"); +#endif + + CodeGenerator generator; + generator.Run(loop); + + return VerifySame(p.E, p.expectedE); +} + +Scalar LoopNestFuse_test1() +{ + auto p = GetMatMul3TestCaseParameters(8, 8, 8, 8); + const auto M = p.M; + const auto N = p.N; + const auto K = p.K; + const auto L = p.L; + + Index i("i"), j("j"), k("k"), l("l"); + LoopNest loopC({ { i, { 0, M } }, + { j, { 0, N } }, + { k, { 0, K } } }); + LoopNest loopE({ { i, { 0, M } }, + { j, { 0, N } }, + { l, { 0, L } } }); + + auto initCKernel = Kernel("initC") + .Inputs(p.C.GetValue()) + .Indices(i, j) + .Define([](Matrix C, Scalar i, Scalar j) { + C(i, j) = 0; + }); + + auto computeCKernel = Kernel("matmulC") + .Inputs(p.A.GetValue(), p.B.GetValue(), p.C.GetValue()) + .Indices(i, j, k) + .Define([](Matrix A, Matrix B, Matrix C, Scalar i, Scalar j, Scalar k) { + C(i, j) += A(i, k) * B(k, j); + }); + + auto initEKernel = Kernel("initE") + .Inputs(p.E.GetValue()) + .Indices(i, l) + .Define([](Matrix E, Scalar i, Scalar l) { + E(i, l) = 0; + }); + + auto computeEKernel = Kernel("matmulE") + .Inputs(p.C.GetValue(), p.D.GetValue(), p.E.GetValue()) + .Indices(i, j, l) + .Define([](Matrix C, Matrix D, Matrix E, Scalar i, Scalar j, Scalar l) { + E(i, l) += C(i, j) * D(j, l); + }); + + loopC.AddKernel(initCKernel, { First(k) }); + loopC.AddKernel(computeCKernel, LoopNest::ConstraintType::predicate); + + loopE.AddKernel(initEKernel, { First(j) }); + 
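// First(j) is a kernel predicate: initEKernel fires only on the first iteration of j, while a kernel added with ConstraintType::predicate (below) runs on every iteration of the nest. + 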
loopE.AddKernel(computeEKernel, LoopNest::ConstraintType::predicate); + + // Now fuse the loops + auto fusedLoops = Fuse(loopC, loopE, { l }, { k }); + fusedLoops.SetLoopOrder({ i, j, k, l }); + +#if 0 + PrintLoops(fusedLoops, "LoopNestFuse_test1: fusedLoops"); +#endif + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + + CodeGenerator generator; + generator.Run(fusedLoops); + + return VerifySame(p.E, p.expectedE); +} + +LoopNest GetMatMulLoopNest(std::string name, const Matrix& A, const Matrix& B, const Matrix& C, const Index& i, const Index& j, const Index& k, bool initResult = true) +{ + const int M = static_cast(C.Rows()); + const int N = static_cast(C.Columns()); + const int K = static_cast(A.Columns()); + + LoopNest loop({ { i, { 0, M } }, + { j, { 0, N } }, + { k, { 0, K } } }); + + auto initCKernel = Kernel("init_" + name) + .Inputs(C.GetValue()) + .Indices(i, j) + .Define([](Matrix C, Scalar i, Scalar j) { + C(i, j) = 0; + }); + + auto innerKernel = Kernel("matmul_" + name) + .Inputs(A.GetValue(), B.GetValue(), C.GetValue()) + .Indices(i, j, k) + .Define([](Matrix A, Matrix B, Matrix C, Scalar i, Scalar j, Scalar k) { + C(i, j) += A(i, k) * B(k, j); + }); + + if (initResult) + loop.AddKernel(initCKernel, { First(k) }); + loop.AddKernel(innerKernel, LoopNest::ConstraintType::predicate); + + return loop; +} + +Scalar LoopNestFuse_test2() +{ + auto p = GetMatMul3TestCaseParameters(8, 8, 8, 8); + + Index i("i"), j("j"), k("k"), l("l"); + + LoopNest loopC = GetMatMulLoopNest("C", p.A, p.B, p.C, i, j, k); // C = A * B + LoopNest loopE = GetMatMulLoopNest("E", p.C, p.D, p.E, i, l, j); // E = C * D + +#if 0 + PrintLoops(loopC, "LoopNestFuse_test: loopC"); + PrintLoops(loopE, "LoopNestFuse_test: loopE"); +#endif + + // Now fuse the loops + auto fusedLoops = Fuse(loopC, loopE, { l }, { k }); + +#if 0 + PrintLoops(fusedLoops, "LoopNestFuse_test2: fusedLoops"); +#endif + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + + CodeGenerator generator; + generator.Run(fusedLoops); + + If( + VerifySame(p.E, p.expectedE) == 0, + [&] { + ok = 0; + }); + + return ok; +} + +Scalar LoopNestFuse_test3() +{ + auto p = GetMatMul3TestCaseParameters(8, 8, 8, 8); + + Index i("i"), j("j"), k("k"), l("l"); + + LoopNest loopC = GetMatMulLoopNest("C", p.A, p.B, p.C, i, j, k); // C = A * B + LoopNest loopE = GetMatMulLoopNest("E", p.C, p.D, p.E, i, l, j); // E = C * D + +#if 0 + PrintLoops(loopC, "LoopNestFuse_test: loopC"); + PrintLoops(loopE, "LoopNestFuse_test: loopE"); +#endif + + // Now fuse the loops + auto fusedLoops = Fuse(loopC, loopE); + +#if 0 + PrintLoops(fusedLoops, "LoopNestFuse_test2: fusedLoops"); +#endif + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + + CodeGenerator generator; + generator.Run(fusedLoops); + + If( + VerifySame(p.E, p.expectedE) == 0, + [&] { + ok = 0; + }); + + return ok; +} + +Scalar ConvertedConstraint_test1() +{ + std::string loopOrder = "ijk"; + const int N = 8; + auto A = MakeMatrix(N, N, "A"); + auto B = MakeMatrix(N, N, "B"); + auto C = MakeMatrix(N, N, "C"); + + // initialize A, B, and C + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + A(i, j) = i - j; + B(i, j) = i + 2 * j; + C(i, j) = 100; + }); + }); + + // The input matrices: + // A: B: C: + // [ 0, -1, -2, -3, -4, -5, -6, -7] [ 0, 2, 4, 6, 8, 10, 12, 14] [ 100 100 100 ... ] + // [ 1, 0, -1, -2, -3, -4, -5, -6] [ 1, 3, 5, 7, 9, 11, 13, 15] [ 100 100 100 ... ] + // [ 2, 1, 0, -1, -2, -3, -4, -5] [ 2, 4, 6, 8, 10, 12, 14, 16] [ ... ] + // [ 3, 2, 1, 0, -1, -2, -3, -4] [ 3, 5, 7, 9, 11, 13, 15, 17] [ ... 
] + // [ 4, 3, 2, 1, 0, -1, -2, -3] [ 4, 6, 8, 10, 12, 14, 16, 18] [ ... ] + // [ 5, 4, 3, 2, 1, 0, -1, -2] [ 5, 7, 9, 11, 13, 15, 17, 19] [ ... ] + // [ 6, 5, 4, 3, 2, 1, 0, -1] [ 6, 8, 10, 12, 14, 16, 18, 20] [ ... ] + // [ 7, 6, 5, 4, 3, 2, 1, 0] [ 7, 9, 11, 13, 15, 17, 19, 21] [ ... ] + + // (A * B) + 1 (the desired result): + // [-139, -195, -251, -307, -363, -419, -475, -531] + // [-111, -151, -191, -231, -271, -311, -351, -391] + // [ -83, -107, -131, -155, -179, -203, -227, -251] + // [ -55, -63, -71, -79, -87, -95, -103, -111] + // [ -27, -19, -11, -3, 5, 13, 21, 29] + // [ 1, 25, 49, 73, 97, 121, 145, 169] + // [ 29, 69, 109, 149, 189, 229, 269, 309] + // [ 57, 113, 169, 225, 281, 337, 393, 449] + + Index i("i"), j("j"), k("k"); + + auto innerKernel = Kernel("matmul") + .Inputs(A.GetValue(), B.GetValue(), C.GetValue()) + .Indices(i, j, k) + .Define(matmul_kernel); + auto initCKernel = Kernel("init") + .Inputs(C.GetValue()) + .Indices(i, j) + .Define(initToZero); + auto postProcessCKernel = Kernel("post") + .Inputs(C.GetValue()) + .Indices(i, j) + .Define(addOne); + + LoopNest loop({ { i, { 0, N } }, + { j, { 0, N } }, + { k, { 0, N } } }); + + CodePositionConstraints preConstraint{ LoopFragmentType::prologue, { i, j }, {} }; + loop.AddKernel(initCKernel, preConstraint); + + loop.AddKernel(innerKernel, LoopNest::ConstraintType::constraint); + + CodePositionConstraints postConstraint{ LoopFragmentType::epilogue, { i, j }, {} }; + loop.AddKernel(postProcessCKernel, postConstraint); + + SplitAndSetOrder(loop, { i, j, k }, { 4, 2 }, loopOrder); + + CodeGenerator generator; + generator.Run(loop); + +#if 0 // DEBUGGING + PrintLoops(loop, "ConvertedConstraint_test1"); +#endif + +#if 0 // DEBUGGING + InvokeForContext([&](auto&) { + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + auto val = C(i, j).Get(); + Log() << std::setw(5) << val; + }); + Log() << EOL; + }); + }); +#endif + + return C(1, 2) + C(2, 1) - (-191 + -107); // will return 0 if calculation is correct +} + +Scalar ConvertedConstraint_test2() +{ + auto matrix = MakeMatrix(4, 5); + IndexRange i("i", { 0, 4 }), j("j", { 0, 5 }); + + auto kernel = Kernel("kernel") + .Inputs(matrix.GetValue()) + .Indices(i.GetIndex(), j.GetIndex()) + .Define(loopnest_kernel); + + LoopNest loop(std::vector{ i, j }); + loop.AddKernel(kernel, LoopNest::ConstraintType::constraint); + loop.Split(i.GetIndex(), 2); + + CodeGenerator generator; + generator.Run(loop); + +#if 0 // DEBUGGING + PrintLoops(loop, "ConvertedConstraint_test2"); +#endif + +#if 0 // DEBUGGING + InvokeForContext([&](auto&) { + ForRange(4, [&](Scalar i) { + ForRange(5, [&](Scalar j) { + auto val = matrix(i, j).Get(); + Log() << std::setw(5) << val; + }); + Log() << EOL; + }); + }); +#endif + + return matrix(2, 3) - 19; // will return 0 if calculation is correct +} +} // namespace ell diff --git a/libraries/value/test/src/Matrix_test.cpp b/libraries/value/test/src/Matrix_test.cpp index db2c965cf..2b960e07b 100644 --- a/libraries/value/test/src/Matrix_test.cpp +++ b/libraries/value/test/src/Matrix_test.cpp @@ -71,7 +71,7 @@ namespace auto mathRowVector = mathMatrix.GetRow(rowIndex); auto rowVector = matrix.Row((int)rowIndex); Vector expected = mathRowVector.ToArray(); - If(Verify(rowVector, expected) != 0, [&] { + If(VerifySame(rowVector, expected) != 0, [&] { ok2 = 1; }); } @@ -87,7 +87,7 @@ namespace auto mathColumnVector = mathMatrix.GetColumn(columnIndex); auto columnVector = matrix.Column((int)columnIndex); Vector expected = mathColumnVector.ToArray(); - 
If(Verify(columnVector, expected) != 0, [&] { + If(VerifySame(columnVector, expected) != 0, [&] { ok2 = 1; }); } @@ -169,7 +169,7 @@ Scalar Matrix_test3() std::vector{ 1.2f + 3.4f, 2.3f + 3.4f }, std::vector{ 3.4f + 3.4f, 4.5f + 3.4f } }); Matrix actual = m + testScalar; - If(0 != Verify(actual, expected), [&] { + If(0 != VerifySame(actual, expected), [&] { DebugPrint("Matrix_test3 matrix scalar addition failed \n"); ok = 1; }); @@ -179,7 +179,7 @@ Scalar Matrix_test3() std::vector{ 1.2f - 3.4f, 2.3f - 3.4f }, std::vector{ 3.4f - 3.4f, 4.5f - 3.4f } }); Matrix actual = m - testScalar; - If(0 != Verify(actual, expected), [&] { + If(0 != VerifySame(actual, expected), [&] { DebugPrint("Matrix_test3 matrix scalar subtraction failed \n"); ok = 1; }); @@ -189,7 +189,7 @@ Scalar Matrix_test3() std::vector{ 1.2f * 3.4f, 2.3f * 3.4f }, std::vector{ 3.4f * 3.4f, 4.5f * 3.4f } }); Matrix actual = m * testScalar; - If(0 != Verify(actual, expected), [&] { + If(0 != VerifySame(actual, expected), [&] { DebugPrint("Matrix_test3 matrix scalar multiplication failed \n"); ok = 1; }); @@ -199,7 +199,7 @@ Scalar Matrix_test3() std::vector{ 1.2f / 3.4f, 2.3f / 3.4f }, std::vector{ 3.4f / 3.4f, 4.5f / 3.4f } }); Matrix actual = m / testScalar; - If(0 != Verify(actual, expected), [&] { + If(0 != VerifySame(actual, expected), [&] { DebugPrint("Matrix_test3 matrix scalar division failed \n"); ok = 1; }); @@ -211,7 +211,7 @@ Scalar Matrix_test3() std::vector{ 1.2f + 0.1f, 2.3f + 1.2f }, std::vector{ 3.4f + 2.3f, 4.5f + 3.4f } }); Matrix actual = m + testMatrix; - If(0 != Verify(actual, expected), [&] { + If(0 != VerifySame(actual, expected), [&] { DebugPrint("Matrix_test3 matrix + matrix failed \n"); ok = 1; }); @@ -221,7 +221,7 @@ Scalar Matrix_test3() std::vector{ 1.2f - 0.1f, 2.3f - 1.2f }, std::vector{ 3.4f - 2.3f, 4.5f - 3.4f } }); Matrix actual = m - testMatrix; - If(0 != Verify(actual, expected), [&] { + If(0 != VerifySame(actual, expected), [&] { DebugPrint("Matrix_test3 matrix - matrix failed \n"); ok = 1; }); @@ -229,6 +229,40 @@ Scalar Matrix_test3() return ok; } +// This test verifies: +// - "For" with Matrix +// - Assignment from Matrix of one dimension order to another +// NOTE: This test currently passes for Compute but FAILS for Compile +Scalar Matrix_test4() +{ + Scalar ok = Allocate(ValueType::Int32, ScalarLayout); + ok = 0; + + std::vector> dt{ + std::vector{ 1, 2, 3 }, + std::vector{ 4, 5, 6 }, + }; + auto source = Matrix(dt); + auto destValue = Allocate(value::ValueType::Int32, source.GetValue().GetLayout().ReorderedCopy(DimensionOrder{ 1, 0 })); + auto dest = Matrix(destValue); + + For(source, [&](value::Scalar row, value::Scalar column) { + dest(row, column) = source(row, column); + }); + + std::vector expectedValues{ 1, 4, 2, 5, 3, 6 }; + auto expected = Vector(expectedValues); + + Vector actual = AsVector(AsFullView(dest)); + + If(VerifySame(actual, expected) == 1, [&] { + DebugPrint("Matrix_test4 matrix assignment to different dimension order failed \n"); + ok = 1; + }); + + return ok; +} + Scalar Reshape_test() { Scalar ok = Allocate(ValueType::Int32, ScalarLayout); @@ -241,12 +275,12 @@ Scalar Reshape_test() Vector v = std::vector{ 1, 2, 3, 4, 5, 6 }; - If(0 != Verify(ToVector(m.GetValue()), v), [&] { + If(0 != VerifySame(ToVector(m.GetValue()), v), [&] { DebugPrint("Reshape_test matrix into a vector failed \n"); ok = 1; }); - If(0 != Verify(ToMatrix(v.GetValue(), 2, 3), m), [&] { + If(0 != VerifySame(ToMatrix(v.GetValue(), 2, 3), m), [&] { DebugPrint("Reshape_test vector into a matrix 
failed \n"); ok = 1; }); @@ -270,7 +304,7 @@ Scalar GEMV_test() Vector expected(std::vector{ 9.3f, 20.3f }); - If(0 != Verify(actual, expected, 1e-5), [&] { + If(0 != VerifySame(actual, expected, 1e-5), [&] { DebugPrint("GEMV_test - failed \n"); ok = 1; }); @@ -281,8 +315,8 @@ Scalar MatrixReferenceTest() { const int N = 4; const int kernelSize = 2; - const int offsetRows = 0; - const int offsetCols = 1; + const Scalar offsetRows = 0; + const Scalar offsetCols = 1; auto A = MakeMatrix(N, N); @@ -322,7 +356,7 @@ Scalar MatrixReferenceTest() Scalar ok = Allocate(ScalarLayout); ok = 1; If( - Verify(valueCachePtr, expected) == 0, + VerifySame(valueCachePtr, expected) == 0, [&] { ok = 0; }) @@ -343,8 +377,8 @@ Scalar RefMatrixReferenceTest() { const int N = 4; const int kernelSize = 2; - const int offsetRows = 0; - const int offsetCols = 1; + const Scalar offsetRows = 0; + const Scalar offsetCols = 1; auto A = MakeMatrix(N, N, "A"); @@ -384,7 +418,7 @@ Scalar RefMatrixReferenceTest() Scalar ok = MakeScalar("ok"); ok = 1; If( - Verify(valueCachePtr, expected) == 0, + VerifySame(valueCachePtr, expected) == 0, [&] { ok = 0; }) diff --git a/libraries/value/test/src/Scalar_test.cpp b/libraries/value/test/src/Scalar_test.cpp index ffb15d410..4dbdb9de3 100644 --- a/libraries/value/test/src/Scalar_test.cpp +++ b/libraries/value/test/src/Scalar_test.cpp @@ -147,13 +147,13 @@ Scalar RefScalarRefCtorsTest() Scalar expected = Allocate(ScalarLayout); expected = 100; - testing::ProcessTest("Value initial pointer level", expected.GetValue().PointerLevel() == 1); + testing::ProcessQuietTest("Value initial pointer level", expected.GetValue().PointerLevel() == 1); Ref scalarPtr = x; - testing::ProcessTest("Ref ctor", scalarPtr.GetValue().PointerLevel() == 2); + testing::ProcessQuietTest("Ref ctor", scalarPtr.GetValue().PointerLevel() == 2); Ref scalarPtrCopy = x; - testing::ProcessTest("Ref copy semantics", scalarPtr.GetValue().PointerLevel() == scalarPtrCopy.GetValue().PointerLevel()); + testing::ProcessQuietTest("Ref copy semantics", scalarPtr.GetValue().PointerLevel() == scalarPtrCopy.GetValue().PointerLevel()); Ref scalarPtrMove = std::move(scalarPtr); - testing::ProcessTest("Ref move semantics", !scalarPtr.GetValue().IsDefined() && scalarPtrMove.GetValue().PointerLevel() == 2); + testing::ProcessQuietTest("Ref move semantics", !scalarPtr.GetValue().IsDefined() && scalarPtrMove.GetValue().PointerLevel() == 2); return result; } @@ -198,4 +198,67 @@ Scalar RefScalarRefRefRefTest() If(scalar != expected, [&] { result = 1; }); return result; } + +Scalar SequenceLogicalAndTest() +{ + int fourInt = 4; + Scalar twoScalar = 2; + Scalar fourScalar = 4; + + Scalar fourGTTwo = fourInt > twoScalar; + Scalar fourGTFour = fourInt > fourScalar; + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + If((fourGTTwo && fourGTFour), + [&]() { + DebugPrint("Error! 4 > 2 && 4 > 4\n"); + }) + .ElseIf(fourGTTwo, + [&]() { + ok = 0; + }) + .ElseIf(fourGTFour, + [&]() { + DebugPrint("Error! 4 <= 2 && 4 > 4\n"); + }) + .Else( + [&]() { + DebugPrint("Error! 4 <= 2 && 4 <= 4\n"); + }); + return ok; +} + +Scalar SequenceLogicalAndTestWithCopy() +{ + int fourInt = 4; + Scalar twoScalar = 2; + Scalar fourScalar = 4; + + Scalar fourGTTwo = fourInt > twoScalar; + Scalar copyFourGTTwo = fourInt > twoScalar; + Scalar fourGTFour = fourInt > fourScalar; + Scalar copyFourGTFour = fourInt > fourScalar; + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + If((fourGTTwo && fourGTFour), + [&]() { + DebugPrint("Error! 
4 > 2 && 4 > 4\n"); + }) + .ElseIf(copyFourGTTwo, + [&]() { + ok = 0; + }) + .ElseIf(copyFourGTFour, + [&]() { + DebugPrint("Error! 4 <= 2 && 4 > 4\n"); + }) + .Else( + [&]() { + DebugPrint("Error! 4 <= 2 && 4 <= 4\n"); + }); + return ok; +} + } // namespace ell diff --git a/libraries/value/test/src/Tensor_test.cpp b/libraries/value/test/src/Tensor_test.cpp index 7ec94c047..7d2716515 100644 --- a/libraries/value/test/src/Tensor_test.cpp +++ b/libraries/value/test/src/Tensor_test.cpp @@ -154,21 +154,21 @@ Scalar Tensor_test1() { Vector mathSlicedVector = math::GetSlice(mathTensor, column, channel).ToArray(); auto slicedVector = tensor.Slice(Slice::All, column, channel); - If(Verify(slicedVector, mathSlicedVector) != 0, [&] { + If(VerifySame(slicedVector, mathSlicedVector) != 0, [&] { ok2 = 1; }); } { Vector mathSlicedVector = math::GetSlice(mathTensor, row, channel).ToArray(); auto slicedVector = tensor.Slice(row, Slice::All, channel); - If(Verify(slicedVector, mathSlicedVector) != 0, [&] { + If(VerifySame(slicedVector, mathSlicedVector) != 0, [&] { ok2 = 1; }); } } Vector mathSlicedVector = math::GetSlice(mathTensor, row, column).ToArray(); auto slicedVector = tensor.Slice(row, column, Slice::All); - If(Verify(slicedVector, mathSlicedVector) != 0, [&] { + If(VerifySame(slicedVector, mathSlicedVector) != 0, [&] { ok2 = 1; }); } @@ -251,7 +251,7 @@ Scalar Tensor_test3() std::vector{ 3.2f + s, 2.1f + s } }, }; Tensor actual = t + testScalar; - If(Verify(actual, expected) != 0, [&] { + If(VerifySame(actual, expected) != 0, [&] { ok = 1; DebugPrint("Tensor_test3: Tensor scalar addition failed\n"); }); @@ -267,7 +267,7 @@ Scalar Tensor_test3() std::vector{ 3.2f - s, 2.1f - s } }, }; Tensor actual = t - testScalar; - If(Verify(actual, expected) != 0, [&] { + If(VerifySame(actual, expected) != 0, [&] { ok = 1; DebugPrint("Tensor_test3: Tensor scalar subtraction failed\n"); }); @@ -283,7 +283,7 @@ Scalar Tensor_test3() std::vector{ 3.2f * s, 2.1f * s } }, }; Tensor actual = t * testScalar; - If(Verify(actual, expected) != 0, [&] { + If(VerifySame(actual, expected) != 0, [&] { ok = 1; DebugPrint("Tensor_test3: Tensor scalar multiplication failed\n"); }); @@ -299,7 +299,7 @@ Scalar Tensor_test3() std::vector{ 3.2f / s, 2.1f / s } }, }; Tensor actual = t / testScalar; - If(Verify(actual, expected) != 0, [&] { + If(VerifySame(actual, expected) != 0, [&] { ok = 1; DebugPrint("Tensor_test3: Tensor scalar division failed\n"); }); @@ -384,7 +384,7 @@ Scalar Tensor_slice_test1() Matrix mathMatrix = ToMatrix(mathTensor.GetSlice(0)); auto matrix = inputTensor.Slice(Slice::All, Slice::All, 0); - If(Verify(matrix, mathMatrix) != 0, [&] { + If(VerifySame(matrix, mathMatrix) != 0, [&] { ok = 1; DebugPrint("Tensor_slice_test1: Tensor row-column GetSlice failed\n"); }); @@ -396,7 +396,7 @@ Scalar Tensor_slice_test1() Matrix mathMatrix = ToMatrix(slice); auto matrix = inputTensor.Slice(0, Slice::All, Slice::All); - If(Verify(matrix, mathMatrix) != 0, [&] { + If(VerifySame(matrix, mathMatrix) != 0, [&] { ok = 1; DebugPrint("Tensor_slice_test1: Tensor column-channel GetSlice failed\n"); }); @@ -406,7 +406,7 @@ Scalar Tensor_slice_test1() Vector mathVector = mathTensor.GetSlice(0, 0).ToArray(); auto vector = inputTensor.Slice(0, 0, Slice::All); - If(Verify(mathVector, vector) != 0, [&] { + If(VerifySame(mathVector, vector) != 0, [&] { ok = 1; DebugPrint("Tensor_slice_test1: Tensor channel vector failed\n"); }); @@ -416,7 +416,7 @@ Scalar Tensor_slice_test1() Vector mathVector = mathTensor.GetSlice(0, 0).ToArray(); auto 
vector = inputTensor.Slice(0, Slice::All, 0); - If(Verify(mathVector, vector) != 0, [&] { + If(VerifySame(mathVector, vector) != 0, [&] { ok = 1; DebugPrint("Tensor_slice_test1: Tensor column vector failed"); }); @@ -426,7 +426,7 @@ Scalar Tensor_slice_test1() Vector mathVector = mathTensor.GetSlice(0, 0).ToArray(); auto vector = inputTensor.Slice(Slice::All, 0, 0); - If(Verify(mathVector, vector) != 0, [&] { + If(VerifySame(mathVector, vector) != 0, [&] { ok = 1; DebugPrint("Tensor_slice_test1: Tensor row vector failed"); }); diff --git a/libraries/value/test/src/TestUtil.cpp b/libraries/value/test/src/TestUtil.cpp index 4967e5c3c..935b2e2a1 100644 --- a/libraries/value/test/src/TestUtil.cpp +++ b/libraries/value/test/src/TestUtil.cpp @@ -8,25 +8,45 @@ #include "TestUtil.h" +#include #include +#include +#include #include #include #include #include +#include +#include + #include #include +#include -#include -#include +#include #include +#include #include #include -#include +#include +#include +#include +#include #include +#include + +#if !defined(WIN32) +#include +#include +#include +#else +#include +#endif // !defined(WIN32) +using namespace ell::emitters; using namespace ell::utilities; using namespace ell::value; namespace math = ell::math; @@ -36,13 +56,12 @@ using math::MatrixLayout; template using LayoutType = std::integral_constant; +using namespace ell::emitters; +using namespace ell::utilities; +using namespace ell::value; + namespace ell { -void DebugPrint(std::string message) -{ - GetContext().DebugPrint(message); -} - void PrintMatrix(std::string indent, Matrix e) { if (!e.GetValue().IsConstant()) @@ -109,69 +128,81 @@ void PrintMatrix(std::string indent, Matrix e) } } +void PrintLoops(const value::loopnests::LoopNest& loop, std::string tag) +{ + InvokeForContext([&](auto&) { + std::stringstream ss; + DebugDump(loop, tag, &ss); + std::cout << ss.str() << std::endl; + }); +} + Scalar EqualEpsilon(Scalar x, Scalar y, double epsilon) { - if (x.GetType() == ValueType::Int32) - { - return x == y; - } - Scalar e = Allocate(ValueType::Double, ScalarLayout); - e = epsilon; - Scalar tens = Floor(Log10(Cast(x, ValueType::Double))); - If(tens > 0.0, [&] { - // then we have some precision already on the left hand side of the decimal place, so remove that from epsilon - e *= Pow(10.0, tens); - }); - Scalar rx = Allocate(ValueType::Double, ScalarLayout); - Scalar ry = Allocate(ValueType::Double, ScalarLayout); - rx = Floor(Cast(x, ValueType::Double) / e) * e; - ry = Floor(Cast(y, ValueType::Double) / e) * e; - return rx == ry; + Scalar result = Allocate(ScalarLayout); + result = 1; + + If(x == y, [&] { +#if 0 // Useful for debugging + DebugPrint("## Scalar compare passed (exactly equal)\n"); + DebugPrint(" Expected: "); + DebugPrintVector(AsVector(x)); + DebugPrint("\n"); + DebugPrint(" Actual: "); + DebugPrintVector(AsVector(y)); + DebugPrint("\n"); +#endif // 0 + result = 1; + }) + .Else([&] { + if (auto type = x.GetType(); type == ValueType::Float || type == ValueType::Double) + { + auto tolerance = Cast(epsilon, type); + If((x - y) <= tolerance, [&] { + If((y - x) <= tolerance, [&] { +#if 0 // Useful for debugging + DebugPrint("## Scalar compare passed\n"); + DebugPrint(" Expected: "); + DebugPrintVector(AsVector(x)); + DebugPrint("\n"); + DebugPrint(" Actual: "); + DebugPrintVector(AsVector(y)); + DebugPrint("\n"); +#endif // 0 + result = 1; + }); + }); + } + else + { + result = 0; + } + }); + + return result; } Scalar NotEqualEpsilon(Scalar x, Scalar y, double epsilon) { - if 
(x.GetType() == ValueType::Int32) - { - return x != y; - } - Scalar ep = epsilon; - Scalar e = Allocate(ValueType::Double, ScalarLayout); - e = epsilon; - Scalar tens = Floor(Log10(Cast(x, ValueType::Double))); - If(tens > 0.0, [&] { - // then we have some precision already on the left hand side of the decimal place, so remove that from epsilon - e *= Pow(10.0, tens); - }); - Scalar rx = Allocate(ValueType::Double, ScalarLayout); - Scalar ry = Allocate(ValueType::Double, ScalarLayout); - rx = Floor(Cast(x, ValueType::Double) / e) * e; - ry = Floor(Cast(y, ValueType::Double) / e) * e; - Scalar diff = Abs(rx - ry); - InvokeForContext([&] { - double t = tens.Get(); - double dx = diff.Get(); - if (dx > epsilon) - { - std::cout << std::setprecision(10); - std::cout << " NotEqualEpsilon failed: t=" << t << ": " << dx << " > " << epsilon << "\n"; - } - }); - return diff > ep; + auto result = EqualEpsilon(x, y, epsilon); + + // TODO: overload the logical not operator + If(result == 1, [&] { result = 0; }).Else([&] { result = 1; }); + return result; } -Scalar Verify(Vector actual, Vector expected, double epsilon) +Scalar VerifySame(Vector actual, Vector expected, double epsilon) { - Scalar fail = 1; Scalar ok = Allocate(ValueType::Int32, ScalarLayout); - ok = 0; For(actual, [&](Scalar index) { Scalar x = actual(index); Scalar y = expected(index); - If(NotEqualEpsilon(x, y, epsilon), [&] { - ok = fail; + + If(ok == 0, [&] { + ok = NotEqualEpsilon(x, y, epsilon); }); }); + If(ok != 0, [&] { DebugPrint("## Vector compare failed\n"); DebugPrint(" Expected: "); @@ -186,14 +217,12 @@ Scalar Verify(Vector actual, Vector expected, double epsilon) Scalar VerifyDifferent(Vector actual, Vector expected, double epsilon) { - Scalar fail = 1; Scalar ok = Allocate(ValueType::Int32, ScalarLayout); - ok = 0; For(actual, [&](Scalar index) { Scalar x = actual(index); Scalar y = expected(index); - If(EqualEpsilon(x, y, epsilon), [&] { - ok = fail; + If(ok == 0, [&] { + ok = EqualEpsilon(x, y, epsilon); }); }); If(ok != 0, [&] { @@ -205,24 +234,47 @@ Scalar VerifyDifferent(Vector actual, Vector expected, double epsilon) DebugPrintVector(actual); DebugPrint("\n"); }); + return ok; } -Scalar Verify(Matrix actual, Matrix expected, double epsilon) +Scalar VerifySame(Matrix actual, Matrix expected, double epsilon) +{ + Scalar ok = Allocate(ValueType::Int32, ScalarLayout); + For(actual, [&](Scalar row, Scalar col) { + Scalar x = actual(row, col); + Scalar y = expected(row, col); + If(ok == 0, [&] { + ok = NotEqualEpsilon(x, y, epsilon); + }); + }); + If(ok != 0, [&] { + DebugPrint("## Matrices are different\n"); + InvokeForContext([&] { + std::cout << "Expected: \n"; + PrintMatrix(" ", expected); + std::cout << "\n"; + std::cout << "Actual: \n"; + PrintMatrix(" ", actual); + std::cout << "\n"; + }); + }); + return ok; +} + +Scalar VerifyDifferent(Matrix actual, Matrix expected, double epsilon) { - Scalar fail = 1; Scalar ok = Allocate(ValueType::Int32, ScalarLayout); - ok = 0; For(actual, [&](Scalar row, Scalar col) { Scalar x = actual(row, col); Scalar y = expected(row, col); - If(NotEqualEpsilon(x, y, epsilon), [&] { - ok = fail; + If(ok == 0, [&] { + ok = EqualEpsilon(x, y, epsilon); }); }); If(ok != 0, [&] { - DebugPrint("## Matrix compare failed\n"); - InvokeForContext([&](auto&) { + DebugPrint("## Matrices are not different\n"); + InvokeForContext([&] { std::cout << "Expected: \n"; PrintMatrix(" ", expected); std::cout << "\n"; @@ -234,7 +286,20 @@ Scalar Verify(Matrix actual, Matrix expected, double epsilon) return 
ok; } -Scalar Verify(Tensor actual, Tensor expected, double epsilon) +Scalar VerifySame(Tensor actual, Tensor expected, double epsilon) +{ + Scalar ok = Allocate(ValueType::Int32, ScalarLayout); + For(actual, [&](Scalar row, Scalar col, Scalar ch) { + Scalar x = actual(row, col, ch); + Scalar y = expected(row, col, ch); + If(ok == 0, [&] { + ok = NotEqualEpsilon(x, y, epsilon); + }); + }); + return ok; +} + +Scalar VerifyDifferent(Tensor actual, Tensor expected, double epsilon) { Scalar fail = Cast(1, ValueType::Int32); Scalar ok = Allocate(ValueType::Int32, ScalarLayout); @@ -242,11 +307,175 @@ Scalar Verify(Tensor actual, Tensor expected, double epsilon) For(actual, [&](Scalar row, Scalar col, Scalar ch) { Scalar x = actual(row, col, ch); Scalar y = expected(row, col, ch); - If(NotEqualEpsilon(x, y, epsilon), [&] { - DebugPrint("## Tensor compare failed\n"); - ok = fail; + If(ok == 0, [&] { + ok = EqualEpsilon(x, y, epsilon); + }); + }); + return ok; +} + +Scalar VerifySame(Array actual, Array expected, double epsilon) +{ + Scalar ok = Allocate(ValueType::Int32, ScalarLayout); + For(actual, [&](const std::vector& indices) { + Scalar x = actual(indices); + Scalar y = expected(indices); + If(ok == 0, [&] { + ok = NotEqualEpsilon(x, y, epsilon); + }); + }); + return ok; +} + +Scalar VerifyDifferent(Array actual, Array expected, double epsilon) +{ + Scalar fail = Cast(1, ValueType::Int32); + Scalar ok = Allocate(ValueType::Int32, ScalarLayout); + ok = 0; + For(actual, [&](const std::vector& indices) { + Scalar x = actual(indices); + Scalar y = expected(indices); + If(ok == 0, [&] { + ok = EqualEpsilon(x, y, epsilon); }); }); return ok; } + +Scalar GetTID() +{ + if (auto result = InvokeForContext( + [](auto&) { +#if !defined(WIN32) +#if defined(__APPLE__) +#pragma message("Note: syscall() is deprecated in macOS") + // Note: syscall() is deprecated in macOS, perhaps use pthread_self instead: + // return static_cast(reinterpret_cast(pthread_self())); +#endif // defined(__APPLE__) + return (int32_t)(pid_t)syscall(SYS_gettid); +#else + return (int32_t)GetCurrentThreadId(); +#endif // !defined(WIN32) + }); + result) + { + return *result; + } + + return Scalar( +#if !defined(WIN32) +#if defined(__APPLE__) +#pragma message("Note: syscall() is deprecated in macOS") +#endif // defined(__APPLE__) + Cast( + *DeclareFunction("syscall") + .Decorated(false) + .Returns(Value({ ValueType::Int64, 0 }, ScalarLayout)) + .Parameters( + Value({ ValueType::Int64, 0 }, ScalarLayout)) + .Call(Scalar{ (int64_t)SYS_gettid })) +#else + *DeclareFunction("GetCurrentThreadId") + .Decorated(false) + .Returns(Value({ ValueType::Int32, 0 }, ScalarLayout)) + .Call() +#endif // !defined(WIN32) + ); +} + +void MultiplyMatrices(Matrix& A, Matrix& B, Matrix& C) +{ + auto M = static_cast(C.Rows()); + auto N = static_cast(C.Columns()); + auto K = static_cast(A.Columns()); + + // fill out expected with a simple for-loop gemm + ForRange(M, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + C(i, j) = 0; + }); + }); + ForRange(M, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + ForRange(K, [&](Scalar k) { + C(i, j) += A(i, k) * B(k, j); + }); + }); + }); +} + +MatMul3TestCaseParameters GetMatMul3TestCaseParameters(int M, int N, int K, int L) +{ + auto A = MakeMatrix(M, K, "A"); + auto B = MakeMatrix(K, N, "B"); + auto C = MakeMatrix(M, N, "C"); + auto D = MakeMatrix(N, L, "D"); + auto E = MakeMatrix(M, L, "E"); + + auto expectedC = MakeMatrix(M, N, "expectedC"); + auto expectedE = MakeMatrix(M, L, "expectedE"); + + // initialize matrices + 
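// A(i, j) = i - j, B(i, j) = i + 2 * j, D(i, j) = j - i; C and E start out zeroed + 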
ForRange(M, [&](Scalar i) { + ForRange(K, [&](Scalar j) { + A(i, j) = i - j; + }); + }); + + ForRange(K, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + B(i, j) = i + 2 * j; + }); + }); + + ForRange(M, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + C(i, j) = 0; + }); + }); + + ForRange(N, [&](Scalar i) { + ForRange(L, [&](Scalar j) { + D(i, j) = j - i; + }); + }); + + ForRange(M, [&](Scalar i) { + ForRange(L, [&](Scalar j) { + E(i, j) = 0; + }); + }); + + // fill out expected results with a simple for-loop gemm + // C + ForRange(M, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + expectedC(i, j) = 0; + }); + }); + ForRange(M, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + ForRange(K, [&](Scalar k) { + expectedC(i, j) += A(i, k) * B(k, j); + }); + }); + }); + + // E + ForRange(M, [&](Scalar i) { + ForRange(L, [&](Scalar l) { + expectedE(i, l) = 0; + }); + }); + ForRange(M, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + ForRange(L, [&](Scalar l) { + expectedE(i, l) += expectedC(i, j) * D(j, l); + }); + }); + }); + + return { M, N, K, L, A, B, C, D, E, expectedC, expectedE }; +} + } // namespace ell diff --git a/libraries/value/test/src/Value_test.cpp b/libraries/value/test/src/Value_test.cpp index 32b3ad4ac..99b054f25 100644 --- a/libraries/value/test/src/Value_test.cpp +++ b/libraries/value/test/src/Value_test.cpp @@ -9,23 +9,44 @@ #include "Value_test.h" #include "TestUtil.h" +#include #include +#include #include #include +#include +#include #include #include -#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include #include #include +#include #include +#include + #include +#include #include +#include #include +#include #include #include +#include #include #include #include @@ -39,55 +60,13 @@ #endif // !defined(WIN32) using namespace ell::utilities; +using namespace ell::logging; using namespace ell::value; #define PRINT_IR 0 namespace ell { -namespace -{ - Scalar GetTID() - { - if (auto result = InvokeForContext( - [](auto&) { -#if !defined(WIN32) - return (int32_t)(pid_t)syscall(SYS_gettid); -#else - return (int32_t)GetCurrentThreadId(); -#endif // !defined(WIN32) - }); - result) - { - return *result; - } - - if (auto result = InvokeForContext( - [] { - return Scalar( -#if !defined(WIN32) - *DeclareFunction("syscall") - .Decorated(FunctionDecorated::No) - .Returns(Value({ ValueType::Int64, 0 }, ScalarLayout)) - .Parameters( - Value({ ValueType::Int64, 0 }, ScalarLayout)) - .Call(Scalar{ (int64_t)SYS_gettid }) -#else - *DeclareFunction("GetCurrentThreadId") - .Decorated(FunctionDecorated::No) - .Returns(Value({ ValueType::Int32, 0 }, ScalarLayout)) - .Call() -#endif // !defined(WIN32) - ); - })) - { - return Cast(*result); - } - - throw LogicException(LogicExceptionErrors::notImplemented); - } - -} // namespace void ValueGetTests() { @@ -138,6 +117,122 @@ Scalar Basic_test() return 0; } +Scalar Array_test1() +{ + Scalar ok = Allocate(ValueType::Int32, ScalarLayout); + ok = 0; + + constexpr int rows = 3, columns = 5, channels = 7; + std::vector arrayData(rows * columns * channels); + std::generate(arrayData.begin(), arrayData.end(), [i = 0]() mutable { return ++i; }); + math::ChannelColumnRowTensor mathTensor(3, 5, 7, arrayData); + + MemoryShape physicalSize{ rows, columns, channels }; + DimensionOrder dimensionOrder = RowMajorTensorOrder; + MemoryLayout memoryLayout(physicalSize, dimensionOrder); + Array array(Value(arrayData, memoryLayout)); + + // Check shape + { + auto shape = array.GetValue().GetLayout().GetExtent(); + auto actual1 = 
static_cast(shape[0]); + auto expected1 = static_cast(mathTensor.NumRows()); + if (actual1 != expected1) + { + DebugPrint("Array_test1: value::Array and math::Tensor row check failed\n"); + ok = 1; + } + + auto actual2 = static_cast(shape[1]); + auto expected2 = static_cast(mathTensor.NumColumns()); + if (actual2 != expected2) + { + DebugPrint("Array_test1: value::Array and math::Tensor column check failed\n"); + ok = 1; + } + + auto actual3 = static_cast(shape[2]); + auto expected3 = static_cast(mathTensor.NumChannels()); + if (actual3 != expected3) + { + DebugPrint("Array_test1: value::Array and math::Tensor channel check failed\n"); + ok = 1; + } + } + + // Check for loop iterations + { + Scalar count = Allocate(ValueType::Int32, ScalarLayout); + + // test we can enumerate all items of an array. + For(array, [&](const std::vector& coordinates) { + count += 1; + }); + If(count != static_cast(mathTensor.Size()), [&] { + DebugPrint("Array_test1: for loop didn't visit all elements\n"); + ok = 1; + }); + } + + Scalar ok2 = Allocate(ValueType::Int32, ScalarLayout); + ok2 = 0; + + // Check operator(Scalar...) + InvokeForContext([&](auto&) { + // These tests use row.Get() to get the actual row,col indexes as constants, which can + // only be done during ComputeContext. + + // test we can enumerate all items of an array. + For(array, [&](const std::vector& coordinates) { + const auto& row = coordinates[0]; + const auto& col = coordinates[1]; + const auto& ch = coordinates[2]; + auto rowInt = row.Get(); + auto colInt = col.Get(); + auto chInt = ch.Get(); + auto tensorVal = mathTensor(rowInt, colInt, chInt); + Scalar expected = tensorVal; + Scalar actual = array(row, col, ch); + + If(actual != expected, [&] { + ok2 = 1; + }); + }); + If(ok2 != 0, [&] { + DebugPrint("Array_test1: value::Array and math::Tensor equality check failed\n"); + ok = 1; + }); + }); + + ok2 = 0; + // Check operator(vector) + InvokeForContext([&](auto&) { + // These tests use row.Get() to get the actual row,col indexes as constants, which can + // only be done during ComputeContext. + + // test we can enumerate all items of an array. 
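+ // operator() also accepts the whole coordinate vector at once, instead of one unpacked Scalar per dimension as in the block above.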
+ For(array, [&](const std::vector& coordinates) { + const auto& row = coordinates[0]; + const auto& col = coordinates[1]; + const auto& ch = coordinates[2]; + auto rowInt = row.Get(); + auto colInt = col.Get(); + auto chInt = ch.Get(); + Scalar expected = mathTensor(rowInt, colInt, chInt); + Scalar actual = array(coordinates); + If(actual != expected, [&] { + ok2 = 1; + }); + }); + If(ok2 != 0, [&] { + DebugPrint("Array_test1: value::Array and math::Tensor equality check failed\n"); + ok = 1; + }); + }); + + return ok; +} + Scalar DebugPrint_test() { DebugPrint("### Test that debug print is working: "); @@ -172,40 +267,43 @@ Scalar For_test1() For(input, [&](Scalar index) { actual(index) = input(index); }); - return Verify(input, actual); + return VerifySame(input, actual); } -void TripleLoop(value::Vector input, value::Vector output) +namespace { - if (input.Size() == 0) + void TripleLoop(value::Vector input, value::Vector output) { - return; - } + if (input.Size() == 0) + { + return; + } - Scalar max = Allocate(input.GetType(), ScalarLayout); - max = Cast(0, input.GetType()); - For(input, [&](Scalar index) { - Scalar v = input(index); - If(v > max, [&] { - max = v; + Scalar max = Allocate(input.GetType(), ScalarLayout); + max = Cast(0, input.GetType()); + For(input, [&](Scalar index) { + Scalar v = input(index); + If(v > max, [&] { + max = v; + }); }); - }); - Scalar sum = Allocate(input.GetType(), ScalarLayout); - sum = Cast(0, input.GetType()); - For(input, [&](Scalar index) { - Scalar v = input(index); - v -= max; - sum += v; - output(index) = v; - }); + Scalar sum = Allocate(input.GetType(), ScalarLayout); + sum = Cast(0, input.GetType()); + For(input, [&](Scalar index) { + Scalar v = input(index); + v -= max; + sum += v; + output(index) = v; + }); - For(output, [&](Scalar index) { - Scalar v = input(index); - v /= sum; - output(index) = v; - }); -} + For(output, [&](Scalar index) { + Scalar v = input(index); + v /= sum; + output(index) = v; + }); + } +} // namespace Scalar For_test2() { @@ -213,7 +311,72 @@ Scalar For_test2() Vector expected(std::vector{ 0.4, 0.3, 0.2, 0.1, 0 }); Vector output = MakeVector(input.Size()); TripleLoop(input, output); - return Verify(output, expected); + return VerifySame(output, expected); +} + +Scalar ForInsideIf_test() +{ + auto zero = MakeScalar(); + auto sum = MakeScalar(); + + If(zero == Scalar(0), [&] { + ForRange(10, [&](Scalar i) { + sum += 1; + }); + }).Else([&] { + ForRange(10, [&](Scalar i) { + sum += 2; + }); + }); + + Scalar ok = MakeScalar(); + If(sum != 10, [&] { ok = 1; }); + return ok; +} + +Scalar While_test() +{ + Scalar test = MakeScalar(); + Scalar count = MakeScalar(); + + test = (count != 5); + While(test, [&] { + count += 5; + test = (count != 5); + }); + + Scalar ok = MakeScalar(); + If(count != 5, [&] { ok = 1; }); + return ok; +} + +Scalar WhileInsideIf_test() +{ + auto zero = MakeScalar(); + auto count = MakeScalar(); + count = 10; + auto sum = MakeScalar(); + Scalar notDone = MakeScalar(); + + If(zero == Scalar(0), [&] { + notDone = count > 0; + While(notDone, [&] { + sum += 1; + count -= 1; + notDone = count > 0; + }); + }).Else([&] { + notDone = count > 0; + While(notDone, [&] { + sum += 2; + count -= 1; + notDone = count > 0; + }); + }); + + Scalar ok = MakeScalar(); + If(sum != 10, [&] { ok = 1; }); + return ok; } Scalar Casting_test1() @@ -295,7 +458,7 @@ Scalar Sum_test() If(result != expected, [&] { ok = 1; InvokeForContext([&] { - std::cout << "### Sum_test failed for size " << i << "\n"; + Log() << "### Sum_test 
failed for size " << i << "\n"; }); }); } @@ -347,13 +510,13 @@ namespace Vector input = intrinsics_data; Vector actual = f(input); Vector expected = expected_data; - If(Verify(actual, expected, 1e-5) != 0, [&] { + If(VerifySame(actual, expected, 1e-5) != 0, [&] { ok = 1; DebugPrint("Intrinsics " + fnName + " test failed\n"); }); } - // recurrsively process next item in the tuple + // recursively process next item in the tuple Scalar r = Intrinsics_test1_impl(tuple, std::integral_constant{}); If(r != 0, [&] { @@ -521,7 +684,7 @@ Scalar Parallelized_test1() } } - If(Verify(data, expected) != 0, [&] { + If(VerifySame(data, expected) != 0, [&] { ok = 1; }); @@ -556,8 +719,6 @@ Scalar Parallelized_test2() return ok; } -// Prefetches have no effect on the behavior of the program but can change its performance characteristics, so this -// test just makes sure that the code compiles/runs and behavior is not affected Scalar Parallelized_test3() { constexpr int DataPerThread = 8; @@ -588,7 +749,34 @@ Scalar Parallelized_test3() // Prefetches have no effect on the behavior of the program but can change its performance characteristics, so this // test just makes sure that the code compiles/runs and behavior is not affected +// This test is just Sum_test with prefetching added in Scalar Prefetch_test1() +{ + Scalar ok = Allocate(ScalarLayout); + for (int i = 1; i < 10; ++i) + { + Vector v = MakeVector(i); + std::vector reference(i); + std::iota(reference.begin(), reference.end(), 0); + auto expected = std::accumulate(reference.begin(), reference.end(), 0.f); + + v = reference; + + Prefetch(v); + Scalar result = Sum(v); + If(result != expected, [&] { + ok = 1; + InvokeForContext([&] { + Log() << "### Sum_test failed for size " << i << "\n"; + }); + }); + } + return ok; +} + +// Prefetches have no effect on the behavior of the program but can change its performance characteristics, so this +// test just makes sure that the code compiles/runs and behavior is not affected +Scalar Prefetch_parallelized_test1() { constexpr int DataPerThread = 8; constexpr int NumThreads = 4; @@ -615,7 +803,282 @@ Scalar Prefetch_test1() If(B[i * DataPerThread + j] != i, [&] { ok = 1; }); }); }); + + return ok; +} + +Scalar Fma_test1() +{ + Scalar ok = Allocate(ScalarLayout); + + constexpr float a_ = 3.14f, b_ = 1.8f, c_ = 8.1f, expected_ = a_ * b_ + c_; + + Scalar a = Allocate(ScalarLayout); + Scalar b = Allocate(ScalarLayout); + Scalar c = Allocate(ScalarLayout); + Scalar result = Allocate(ScalarLayout); + Scalar expected = Allocate(ScalarLayout); + + a = a_; + b = b_; + c = c_; + expected = expected_; + result = FusedMultiplyAdd(a, b, c); + + If(NotEqualEpsilon(result, expected, 1e-5) == 1, [&] { ok = 1; }); + return ok; +} + +Scalar Fma_test2() +{ + Scalar ok = Allocate(ScalarLayout); + + constexpr double a_ = 1.763, b_ = 6.182, c_ = 9.1029, expected_ = a_ * b_ + c_; + + Scalar a = Allocate(ScalarLayout); + Scalar b = Allocate(ScalarLayout); + Scalar c = Allocate(ScalarLayout); + Scalar result = Allocate(ScalarLayout); + Scalar expected = Allocate(ScalarLayout); + + a = a_; + b = b_; + c = c_; + expected = expected_; + result = FusedMultiplyAdd(a, b, c); + + If(NotEqualEpsilon(result, expected, 1e-7) == 1, [&] { ok = 1; }); return ok; } +Scalar Fma_test3() +{ + Scalar ok = Allocate(ScalarLayout); + + constexpr int a_ = 8, b_ = 5, c_ = 2, expected_ = a_ * b_ + c_; + + Scalar a = Allocate(ScalarLayout); + Scalar b = Allocate(ScalarLayout); + Scalar c = Allocate(ScalarLayout); + Scalar result = Allocate(ScalarLayout); + 
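// Integer variant of Fma_test1/2: an Int32 multiply-add is exact, so the check below uses plain equality rather than NotEqualEpsilon. + 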
Scalar expected = Allocate(ScalarLayout); + + a = a_; + b = b_; + c = c_; + expected = expected_; + result = FusedMultiplyAdd(a, b, c); + + If(expected != result, [&] { ok = 1; }); + return ok; +} + +Scalar UniqueName_test1() +{ + Scalar ok = Allocate(ScalarLayout); + + ell::testing::IsEqual(UniqueName(""), "_0"); + ell::testing::IsEqual(UniqueName(""), "_1"); + + ell::testing::IsEqual(UniqueName("foo"), "foo_0"); + ell::testing::IsEqual(UniqueName("foo"), "foo_1"); + + return ok; +} + +Scalar Parallelized_ComputeContext_test1() +{ + Scalar ok = Allocate(ScalarLayout); + + InvokeForContext([] { + constexpr auto numItems = 100000; + constexpr auto numThreads = 16; + + bool ready = false; + std::mutex m1, m2; + std::condition_variable cv1, cv2; + std::atomic_int atomicIndex = 0; + + std::array threads; + std::generate(std::begin(threads), std::end(threads), [&] { + return std::thread([&] { + std::unique_lock lock{ m1 }; + cv1.wait(lock, [&] { return ready; }); + int index{}; + while ((index = atomicIndex.fetch_add(1)) < numItems) + { + [[maybe_unused]] Scalar s{ index }; + } + + cv2.notify_one(); + }); + }); + + { + std::unique_lock lock{ m1 }; + ready = true; + } + cv1.notify_all(); + + { + std::unique_lock lock{ m2 }; + cv2.wait(lock, [&] { return atomicIndex < numItems; }); + } + std::for_each(std::begin(threads), std::end(threads), [](std::thread& thread) { thread.join(); }); + }); + + return ok; +} + +Scalar MemCopy_test1() +{ + auto vec = MakeVector(4); + + std::vector expected{ 10, 20, 30, 40 }; + MemCopy(vec, Vector(expected)); + + return VerifySame(vec, expected); +} + +Scalar MemSet_test1() +{ + auto vec = MakeVector(4); + constexpr auto fill = char{ 0x3D }; + + union + { + char c[sizeof(int)]; + int i; + } expected; + std::memset(&expected.c, fill, sizeof(expected.c)); + + MemSet(vec, fill); + + auto ok = MakeScalar(); + For(vec, [&](Scalar index) { + If(vec[index] != expected.i, [&] { + ok = 1; + }); + }); + + return ok; +} + +Scalar NamedLoops_test1() +{ + { + auto accum = MakeScalar(); + ForRange(std::string{ "ForRangeLoop" }, 10, [&](Scalar index) { accum += index; }); + } + + { + auto v = MakeVector(10); + For("ForVectorLoop", v, [&](Scalar index) { v[index] = index; }); + } + + { + auto m = MakeMatrix(10, 10); + For("ForMatrixLoop", m, [&](Scalar row, Scalar col) { m(row, col) = row + row * col; }); + } + + { + auto t = MakeTensor(10, 10, 10); + For("ForTensorLoop", t, [&](Scalar row, Scalar col, Scalar ch) { t(row, col, ch) = row + col + ch + ch * col * row; }); + } + + return MakeScalar(); +} + +Scalar ThreadLocalAllocation_test1() +{ + auto ok = MakeScalar("ok"); + +#ifdef WIN32 + // This thread is disabled for windows + LLVM due to issues with threading and TLS + if (dynamic_cast(&GetContext()) != nullptr) + { + return ok; + } +#endif // WIN32 + + constexpr int NumWorkItems = 40; + auto threadIds = MakeVector(NumWorkItems, "threadIds"); + Parallelize( + NumWorkItems, + std::tuple{ threadIds }, + std::function{ [](Scalar threadId, Vector threadIds) { + Scalar alreadySeen = StaticAllocate("AlreadySeen", ValueType::Int64, ScalarLayout, AllocateFlags::ThreadLocal); + auto tid = Cast(GetTID()); + If( + alreadySeen == int64_t{ 0 }, + [&] { + alreadySeen = tid; + threadIds[threadId] = 1; + }) + .ElseIf( + alreadySeen != tid, + [&] { + threadIds[threadId] = -1; + }); + } }); + + auto totalThreadsRan = MakeScalar("totalThreadsRan"); + auto totalErrors = MakeScalar("totalErrors"); + For(threadIds, [&](Scalar index) { + If(threadIds[index] == 1, [&] { + ++totalThreadsRan; + 
}).ElseIf(threadIds[index] == -1, [&] { + ++totalErrors; + }); + }); + + DebugPrint("Number of errors detected in TLS code: "); + DebugPrintVector(AsVector(totalErrors)); + DebugPrint("\n"); + DebugPrint("Number of actual threads used to complete " + std::to_string(NumWorkItems) + " work items: "); + DebugPrintVector(AsVector(totalThreadsRan)); + DebugPrint("\n"); + + If(totalThreadsRan < 1, [&] { ok = 1; }); + If(totalErrors > 0, [&] { ok = 1; }); + + return ok; +} + +Scalar FunctionPointer_test1() +{ + auto ok = MakeScalar("ok"); + + // This thread is disabled CppEmitterContext for now + if (dynamic_cast(&GetContext()) != nullptr) + { + return ok; + } + + auto realFnDecl = DeclareFunction("foo") + .Returns(Scalar(0)) + .Parameters(Scalar(0)); + auto realFn = realFnDecl + .Define([](Scalar x) -> Scalar { + auto r = MakeScalar(x.GetType()); + r = x + 10; + return r; + }); + + auto fnPtr = DeclareFunction("bar").Returns(Scalar(0)).Parameters(Scalar(0)); + fnPtr.SetPointer(realFnDecl.GetPointer()); + + auto in1 = MakeScalar(); + in1 = 100; + Scalar y = realFn(in1); + + If(y != 110, [&] { ok = 1; }); + + in1 = 200; + Scalar z = *fnPtr.Call(in1); + + If(z != 210, [&] { ok = 1; }); + + return ok; +} } // namespace ell diff --git a/libraries/value/test/src/Vector_test.cpp b/libraries/value/test/src/Vector_test.cpp index 1f401d4fe..223200583 100644 --- a/libraries/value/test/src/Vector_test.cpp +++ b/libraries/value/test/src/Vector_test.cpp @@ -106,7 +106,7 @@ Scalar Vector_test1() // Vector result = convolve1D(signal, filter); Vector expected(referenceResult); - return Verify(result, expected); + return VerifySame(result, expected); } Scalar Vector_test2() @@ -119,7 +119,7 @@ Scalar Vector_test2() { Vector expected(std::vector{ 1.2f + 3.4f, 2.3f + 3.4f }); Vector actual = v + testScalar; - If(Verify(actual, expected) != 0, [&] { + If(VerifySame(actual, expected) != 0, [&] { DebugPrint("## Vector_test2 vector scalar addition failed\n"); ok = 1; }); @@ -127,7 +127,7 @@ Scalar Vector_test2() { Vector expected(std::vector{ 1.2f - 3.4f, 2.3f - 3.4f }); Vector actual = v - testScalar; - If(Verify(actual, expected) != 0, [&] { + If(VerifySame(actual, expected) != 0, [&] { DebugPrint("## Vector_test2 vector scalar subtraction failed\n"); ok = 1; }); @@ -135,7 +135,7 @@ Scalar Vector_test2() { Vector expected(std::vector{ 1.2f * 3.4f, 2.3f * 3.4f }); Vector actual = v * testScalar; - If(Verify(actual, expected) != 0, [&] { + If(VerifySame(actual, expected) != 0, [&] { DebugPrint("## Vector_test2 vector scalar multiplication failed\n"); ok = 1; }); @@ -143,7 +143,7 @@ Scalar Vector_test2() { Vector expected(std::vector{ 1.2f / 3.4f, 2.3f / 3.4f }); Vector actual = v / testScalar; - If(Verify(actual, expected) != 0, [&] { + If(VerifySame(actual, expected) != 0, [&] { DebugPrint("## Vector_test2 vector scalar division failed\n"); ok = 1; }); @@ -153,7 +153,7 @@ Scalar Vector_test2() { Vector expected(std::vector{ 1.2f + 0.1f, 2.3f + 1.2f }); Vector actual = v + testVector; - If(Verify(actual, expected) != 0, [&] { + If(VerifySame(actual, expected) != 0, [&] { DebugPrint("## Vector_test2 vector+vector failed\n"); ok = 1; }); @@ -161,7 +161,7 @@ Scalar Vector_test2() { Vector expected(std::vector{ 1.2f - 0.1f, 2.3f - 1.2f }); Vector actual = v - testVector; - If(Verify(actual, expected) != 0, [&] { + If(VerifySame(actual, expected) != 0, [&] { DebugPrint("## Vector_test2 vector-vector failed\n"); ok = 1; }); @@ -185,7 +185,7 @@ Scalar Vector_test3() Vector e = std::vector{ 1, 1, 1, 1, 1, 1, 2, 2, 2 }; - 
If(Verify(v, e) != 0, [&] { + If(VerifySame(v, e) != 0, [&] { DebugPrint("## Vector_test3 subvector assignment failed\n"); ok = 1; }); diff --git a/libraries/value/test/src/main.cpp b/libraries/value/test/src/main.cpp index f84efa886..b52c361e3 100644 --- a/libraries/value/test/src/main.cpp +++ b/libraries/value/test/src/main.cpp @@ -2,10 +2,15 @@ // // Project: Embedded Learning Library (ELL) // File: main.cpp (value) -// Authors: Kern Handa, Chuck Jacobs +// Authors: Kern Handa, Chuck Jacobs, Mason Remy // //////////////////////////////////////////////////////////////////////////////////////////////////// +#include "CachingStrategy_test.h" +#include "Functions_test.h" +#include "LoopNestAPI_test.h" +#include "LoopNest_convolution_test.h" +#include "LoopNest_test.h" #include "Matrix_test.h" #include "Scalar_test.h" #include "Tensor_test.h" @@ -13,6 +18,7 @@ #include "Vector_test.h" #include +#include #include #include #include @@ -22,6 +28,7 @@ #include #include +#include #include #include @@ -31,6 +38,7 @@ #include #include +using namespace ell::logging; using namespace ell::utilities; using namespace ell::value; @@ -60,11 +68,9 @@ void PrintIR(TestLLVMContext& context) #endif // PRINT_IR } -extern "C" -{ +extern "C" { void JittedDebugPrintInts(int* ints, int* len) { - std::cout << std::setprecision(6); for (int i = 0; i < *len; i++) { if (i > 0) @@ -185,7 +191,7 @@ void DebugPrintVector(Vector message) .Parameters( Value(message.GetType(), MemoryLayout{ { size } }), Value(ValueType::Int32, ScalarLayout)) - .Decorated(FunctionDecorated::No); + .Decorated(false); printFunction.Call(message, Scalar{ size }); }); } @@ -195,19 +201,19 @@ void DebugPrintScalar(Scalar value) InvokeForContext([&] { std::visit( [](auto&& data) { - using Type = std::decay_t; - if constexpr (IsOneOf) - { - throw LogicException(LogicExceptionErrors::notImplemented); - } - else - { - std::copy( - data, - data + 1, - std::ostream_iterator>(std::cout, ", ")); - } - }, + using Type = std::decay_t; + if constexpr (IsOneOf) + { + throw LogicException(LogicExceptionErrors::notImplemented); + } + else + { + std::copy( + data, + data + 1, + std::ostream_iterator>(std::cout, ", ")); + } + }, value.GetValue().GetUnderlyingData()); }); @@ -231,10 +237,10 @@ void DebugPrintScalar(Scalar value) return; } auto printFunction = FunctionDeclaration(fnName) - .Parameters( - Value(value.GetType(), MemoryLayout{ { 1 } }), - Value(ValueType::Int32, ScalarLayout)) - .Decorated(FunctionDecorated::No); + .Parameters( + Value(value.GetType(), MemoryLayout{ { 1 } }), + Value(ValueType::Int32, ScalarLayout)) + .Decorated(false); printFunction.Call(vector, Scalar{ 1 }); }); } @@ -242,6 +248,8 @@ void DebugPrintScalar(Scalar value) void ComputeTest(std::string testName, std::function defineFunction) { + std::cout << "Running compute test " << testName << std::endl; + // Run the test in the ComputeContext ContextGuard guard("Value_test_compute"); @@ -260,6 +268,8 @@ void ComputeTest(std::string testName, std::function defineFunction) void LLVMJitTest(std::string testName, std::function defineFunction) { + std::cout << "Running LLVM JIT test " << testName << std::endl; + // Run the test in the LLVM context ell::emitters::CompilerOptions compilerSettings; compilerSettings.useBlas = false; @@ -275,6 +285,7 @@ void LLVMJitTest(std::string testName, std::function defineFunction) fn.Define(defineFunction); #if 0 // Useful for debugging, dumps to stderr +#pragma message("DEBUGGING") DebugDump(fn); #endif // 0 @@ -296,6 +307,17 @@ void 
LLVMJitTest(std::string testName, std::function defineFunction) ell::testing::ProcessTest(ell::utilities::FormatString(msg.c_str(), rc), rc == 0); } +void CppEmitterTest(std::string testName, std::function defineFunction) +{ + std::cout << "// Running CppEmitter test " << testName << std::endl; + + ContextGuard guard(testName, Log()); + auto fn = DeclareFunction(testName) + .Returns(Value(ValueType::Int32, ScalarLayout)); + + fn.Define(defineFunction); +} + void RunTest(std::string testName, std::function defineFunction) { try @@ -315,6 +337,15 @@ void RunTest(std::string testName, std::function defineFunction) { ell::testing::ProcessTest(testName + " Jitted LLVM failed with exception, " + e.what(), false); } + + try + { + CppEmitterTest(testName, defineFunction); + } + catch (const std::exception& e) + { + ell::testing::ProcessTest("/*\n" + testName + " CppEmitter test failed with exception, " + e.what() + "\n*/", false); + } } int main() @@ -323,9 +354,12 @@ int main() using namespace utilities; try { -#define ADD_TEST_FUNCTION(a) testFunctions.push_back({ #a, a }); +#define ADD_TEST_FUNCTION(a) testFunctions.push_back({ #a, a }) std::vector>> testFunctions; + // Low-level infrastructure tests + ADD_TEST_FUNCTION(SplitIterationDomain_test1); + // Value tests ADD_TEST_FUNCTION(Basic_test); ADD_TEST_FUNCTION(DebugPrint_test); @@ -344,6 +378,7 @@ int main() ADD_TEST_FUNCTION(Matrix_test1); ADD_TEST_FUNCTION(Matrix_test2); ADD_TEST_FUNCTION(Matrix_test3); + ADD_TEST_FUNCTION(Matrix_test4); ADD_TEST_FUNCTION(Reshape_test); ADD_TEST_FUNCTION(GEMV_test); ADD_TEST_FUNCTION(Tensor_test1); @@ -351,6 +386,8 @@ int main() ADD_TEST_FUNCTION(Tensor_test3); ADD_TEST_FUNCTION(Tensor_slice_test1); + ADD_TEST_FUNCTION(Array_test1); + ADD_TEST_FUNCTION(Casting_test1); ADD_TEST_FUNCTION(Sum_test); ADD_TEST_FUNCTION(Dot_test); @@ -358,6 +395,9 @@ int main() ADD_TEST_FUNCTION(Intrinsics_test2); ADD_TEST_FUNCTION(For_test1); ADD_TEST_FUNCTION(For_test2); + ADD_TEST_FUNCTION(ForInsideIf_test); + ADD_TEST_FUNCTION(While_test); + ADD_TEST_FUNCTION(WhileInsideIf_test); ADD_TEST_FUNCTION(ForRangeCasting_test1); ADD_TEST_FUNCTION(ForRangeCasting_test2); ADD_TEST_FUNCTION(Parallelized_test1); @@ -374,6 +414,247 @@ int main() ADD_TEST_FUNCTION(RefScalarRefRefRefTest); ADD_TEST_FUNCTION(RefMatrixReferenceTest); + ADD_TEST_FUNCTION(Prefetch_parallelized_test1); + ADD_TEST_FUNCTION(Fma_test1); + ADD_TEST_FUNCTION(Fma_test2); + ADD_TEST_FUNCTION(Fma_test3); + ADD_TEST_FUNCTION(UniqueName_test1); + + ADD_TEST_FUNCTION(LoopNest_test1); + ADD_TEST_FUNCTION(LoopNest_test2); + ADD_TEST_FUNCTION(LoopNest_test3); + ADD_TEST_FUNCTION(LoopNest_test4); + ADD_TEST_FUNCTION(LoopNest_test5); + ADD_TEST_FUNCTION(LoopNest_test6); + + ADD_TEST_FUNCTION(LoopNestNonzeroStart_test); + ADD_TEST_FUNCTION(LoopNestBoundary_test1); + ADD_TEST_FUNCTION(LoopNestBoundary_test2); + ADD_TEST_FUNCTION(LoopNestBoundary_test3); + ADD_TEST_FUNCTION(LoopNestBoundary_test4); + ADD_TEST_FUNCTION(LoopNestBoundary_test5); + ADD_TEST_FUNCTION(LoopNestReorder_test1); + ADD_TEST_FUNCTION(LoopNestReorder_test2); + ADD_TEST_FUNCTION(TwoKernel_test); + + ADD_TEST_FUNCTION(LoopNestLastPredicate_test1); + ADD_TEST_FUNCTION(LoopNestLastPredicate_test2); + ADD_TEST_FUNCTION(LoopNestLastPredicate_test3); + ADD_TEST_FUNCTION(LoopNestLastPredicate_test4); + ADD_TEST_FUNCTION(LoopNestBoundaryPredicate_test1); + + ADD_TEST_FUNCTION(MissingIndex_test); + ADD_TEST_FUNCTION(RequiredIndex_test); + ADD_TEST_FUNCTION(SimpleImperfectNest_test); + 
ADD_TEST_FUNCTION(ImperfectNest_test_ijk); + ADD_TEST_FUNCTION(ImperfectNest_test_ikj); + ADD_TEST_FUNCTION(ImperfectNest_test_kij); + ADD_TEST_FUNCTION(ImperfectNest_test_ijkijk); + ADD_TEST_FUNCTION(ImperfectNest_test_kijijk); + ADD_TEST_FUNCTION(ImperfectNest_test_ijkkij); + ADD_TEST_FUNCTION(SplitIndex_test1); + ADD_TEST_FUNCTION(SplitIndex_test2); + ADD_TEST_FUNCTION(SplitIndex_test3); + // ADD_TEST_FUNCTION(EpilogueIndex_test); // ill-defined test + ADD_TEST_FUNCTION(RenameKernelArg_test); + + ADD_TEST_FUNCTION(NonInnermostKernel_test1); + ADD_TEST_FUNCTION(NonInnermostKernel_test2); + ADD_TEST_FUNCTION(NonInnermostKernel_test3); + + // ADD_TEST_FUNCTION(FunctionArgType_test); // currently fails + + ADD_TEST_FUNCTION(CachedMatrix_test1); + ADD_TEST_FUNCTION(CachedMatrix_test1_new); + ADD_TEST_FUNCTION(CachedMatrix_test2); + ADD_TEST_FUNCTION(CachedMatrix_test3); + ADD_TEST_FUNCTION(CachedMatrix_test4); + ADD_TEST_FUNCTION(CachedMatrix_test5); + + ADD_TEST_FUNCTION(LoopNest_Parallelized_test1); + ADD_TEST_FUNCTION(LoopNest_Parallelized_test2); + + ADD_TEST_FUNCTION(LoopNest_Unrolled_test1); + + ADD_TEST_FUNCTION(LoopNest_DebugDump_test1); + ADD_TEST_FUNCTION(LoopNest_DebugDump_test2); + + ADD_TEST_FUNCTION(SimpleMatMult_test); + + ADD_TEST_FUNCTION(LoopNest_api_test1); + ADD_TEST_FUNCTION(LoopNest_api_test2); + ADD_TEST_FUNCTION(LoopNest_api_test3); + ADD_TEST_FUNCTION(LoopNest_api_test4); + ADD_TEST_FUNCTION(LoopNest_api_test5); + ADD_TEST_FUNCTION(LoopNest_api_Parallelized_test1); + ADD_TEST_FUNCTION(LoopNest_api_Parallelized_test2); + ADD_TEST_FUNCTION(LoopNest_api_Unrolled_test1); + ADD_TEST_FUNCTION(LoopNest_api_SetOrder_test1); + // ADD_TEST_FUNCTION(LoopNest_api_CachedMatrix_test1); // Fails + ADD_TEST_FUNCTION(GotoBLASGemmWithRefDeref); + ADD_TEST_FUNCTION(YG12LowLevel_TestBoundary); + + ADD_TEST_FUNCTION(Parallelized_ComputeContext_test1); + + ADD_TEST_FUNCTION(MemCopy_test1); + ADD_TEST_FUNCTION(MemSet_test1); + + // ADD_TEST_FUNCTION(GotoBLASGemm_HighLevelAPI_NoCachingHelper); // currently fails due to unimplemented caching strategy + + ADD_TEST_FUNCTION(NamedLoops_test1); + + // ADD_TEST_FUNCTION(SequenceLogicalAndTest); // Currently fails due to known bug + ADD_TEST_FUNCTION(SequenceLogicalAndTestWithCopy); + ADD_TEST_FUNCTION(OneSplitBoundaryTest); + ADD_TEST_FUNCTION(TwoSplitBoundaryTest); + ADD_TEST_FUNCTION(SplitLargerThanSizeBoundaryTest); + ADD_TEST_FUNCTION(TwoSplitsLargerThanSizeBoundaryTest); + + ADD_TEST_FUNCTION(BLASTCOPY_ValidateOutput_Test1); + ADD_TEST_FUNCTION(BLASTCOPY_ValidateOutput_Test2); + + ADD_TEST_FUNCTION(BLASTCOPY_ValidateMemory_Test1); + ADD_TEST_FUNCTION(BLASTCOPY_ValidateMemory_Test2); + ADD_TEST_FUNCTION(BLASTCOPY_ValidateMemory_Test3); + + ADD_TEST_FUNCTION(BLASTCOPY_ValidateOutput_BoundaryCondition_Test1); + ADD_TEST_FUNCTION(BLASTCOPY_ValidateOutput_BoundaryCondition_Test2); + + ADD_TEST_FUNCTION(BLASTCOPY_ValidateOutput_BoundaryCondition_Test3); + ADD_TEST_FUNCTION(BLASTCOPY_ValidateOutput_BoundaryCondition_Test4); + ADD_TEST_FUNCTION(BLASTCOPY_ValidateOutput_BoundaryCondition_Test5); + ADD_TEST_FUNCTION(BLASTCOPY_ValidateOutput_BoundaryCondition_Test6); + ADD_TEST_FUNCTION(BLASTCOPY_ValidateOutput_BoundaryCondition_Test7); + ADD_TEST_FUNCTION(BLASTCOPY_ValidateOutput_BoundaryCondition_Test8); + ADD_TEST_FUNCTION(BLASTCOPY_ValidateOutput_BoundaryCondition_Test9); + + ADD_TEST_FUNCTION(BLASTCOPY_ValidateMemory_BoundaryCondition_Test1); + ADD_TEST_FUNCTION(BLASTCOPY_ValidateMemory_BoundaryCondition_Test2); + 
ADD_TEST_FUNCTION(BLASTCOPY_ValidateMemory_BoundaryCondition_Test3); + ADD_TEST_FUNCTION(BLASTCOPY_ValidateMemory_BoundaryCondition_Test4); + ADD_TEST_FUNCTION(BLASTCOPY_ValidateMemory_BoundaryCondition_Test5); + ADD_TEST_FUNCTION(BLASTCOPY_ValidateMemory_BoundaryCondition_Test6); + ADD_TEST_FUNCTION(BLASTCOPY_ValidateMemory_BoundaryCondition_Test7); + ADD_TEST_FUNCTION(BLASTCOPY_ValidateMemory_BoundaryCondition_Test8); + ADD_TEST_FUNCTION(BLASTCOPY_ValidateMemory_BoundaryCondition_Test9); + + // Bug: these tests with compile-time constant buffers of input data fail in LLVM JIT / CppEmitter cases only but pass for Compute + // ADD_TEST_FUNCTION(ConvolutionOutput_ValidateOutput_Test1); + // ADD_TEST_FUNCTION(EfficientDirectConvolution_Test1); + // Bug: these tests fail in LLVM JIT case only + // ADD_TEST_FUNCTION(ConvolutionOutput_ValidateOutput_Test1); + // ADD_TEST_FUNCTION(DirectConvolution_Test1); + // ADD_TEST_FUNCTION(ConvolutionInput_ValidateOutput_Test1); + // ADD_TEST_FUNCTION(ConvolutionInput_ValidateOutput_Test2); + // ADD_TEST_FUNCTION(ConvolutionWeight_ValidateOutput_Test1); + // ADD_TEST_FUNCTION(ConvolutionWeight_Reshape_ValidateMemory_Test1); + + ADD_TEST_FUNCTION(GeneralCachingStrategy_ValidateOutput_Test1); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ValidateOutput_Test2); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ValidateOutput_Test3); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ValidateOutput_Test4); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ValidateOutput_Test5); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ValidateOutput_Test6); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ValidateOutput_Test7); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ValidateOutput_Test8); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ValidateOutput_Test9); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ValidateOutput_Test10); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ValidateOutput_Test11); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ValidateOutput_Test12); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ValidateOutput_Test13); + + ADD_TEST_FUNCTION(GeneralCachingStrategy_ValidateMemory_Test1); + + ADD_TEST_FUNCTION(GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test1); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test2); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test3); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test4); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test5); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test6); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test7); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test8); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test9); + + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateOutput_Test1); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateOutput_Test2); + + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateMemory_Test1); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateMemory_Test2); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateMemory_Test3); + + ADD_TEST_FUNCTION(MLAS_GEMM_GeneralCachingStrategy); + + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test1); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test2); + 
ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test3); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test4); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test5); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test6); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test7); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test8); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test9); + + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test1); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test2); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test3); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test4); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test5); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test6); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test7); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test8); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test9); + + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_Test1); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_Test2); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_Test1); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_Test2); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_Test3); + + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test1); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test2); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test3); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test4); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test5); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test6); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test7); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test8); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test9); + + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test1); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test2); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test3); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test4); + 
ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test5); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test6); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test7); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test8); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test9); + + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test1); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test2); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test3); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test4); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test5); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test6); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test7); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test8); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test9); + + ADD_TEST_FUNCTION(LoopNest_api_tunable_parameters_test1); +#if !defined(__APPLE__) + ADD_TEST_FUNCTION(ThreadLocalAllocation_test1); +#endif + ADD_TEST_FUNCTION(KernelPredicate_test); + ADD_TEST_FUNCTION(MatMul3_test1); + ADD_TEST_FUNCTION(MatMul3_test2); + ADD_TEST_FUNCTION(LoopNestFuse_test1); + ADD_TEST_FUNCTION(LoopNestFuse_test2); + ADD_TEST_FUNCTION(LoopNestFuse_test3); + ADD_TEST_FUNCTION(ConvertedConstraint_test1); + ADD_TEST_FUNCTION(ConvertedConstraint_test2); + + ADD_TEST_FUNCTION(FunctionPointer_test1); + for (auto [name, fn] : testFunctions) { RunTest(name, fn); diff --git a/tools/importers/CNTK/cntk_to_ell.py b/tools/importers/CNTK/cntk_to_ell.py index 0da4c5f3a..6af8cc2b5 100644 --- a/tools/importers/CNTK/cntk_to_ell.py +++ b/tools/importers/CNTK/cntk_to_ell.py @@ -200,7 +200,7 @@ def get_node_output_in_ell_order(cntk_node_results): np.float).reshape(original_shape[1], 1, original_shape[0]) elif len(original_shape) == 1: ordered_weights = ordered_weights.ravel().astype( - np.float).reshape(1, 1, original_shape.size) + np.float).reshape(1, 1, cntk_node_results.size) else: raise NotImplementedError( "Unsupported tensor dimensions {}".format(len(original_shape))) @@ -242,7 +242,7 @@ def verify_ell_nodes_in_vision_model(ell_map, cntk_model, cntk_nodes, ordered_im # Feed input to the ELL model _logger.info("Getting computed ELL results") - ell_map.Compute(ell_input_tensor, dtype=np.float32) + ell_map.Compute(ell_input_tensor) # Walk list of importer nodes for importer_node in ordered_importer_nodes: @@ -355,16 +355,15 @@ def verify_ell_output_in_vision_model(ell_map, cntk_model, testing_info): # Get computed ELL result _logger.info("Getting computed ELL results") - result_from_compute = np.array(ell_map.Compute(ell_input_tensor, dtype=np.float32)) + result_from_compute = np.array(ell_map.Compute(ell_input_tensor)) # Get compiled ELL result _logger.info("Getting compiled ELL results") compiler_options = ell.model.MapCompilerOptions() compiler_options.useBlas = True - 
compiled_ell_map = ell_map.Compile("host", "model", "predict", compilerOptions=compiler_options, - dtype=np.float32) + compiled_ell_map = ell_map.Compile("host", "model", "predict", compilerOptions=compiler_options) - result_from_compiled = np.array(compiled_ell_map.Compute(ell_input_tensor, dtype=np.float32)) + result_from_compiled = np.array(compiled_ell_map.Compute(ell_input_tensor)) # Verify the computed result against the cntk result np.testing.assert_array_almost_equal( @@ -406,7 +405,7 @@ def verify_ell_output_in_vision_model(ell_map, cntk_model, testing_info): def verify_compiled_ell_nodes_in_vision_model(modelFile, cntk_model, model_cntk_nodes, ordered_importer_nodes, - step_interval_msec=0, lag_threshold_msec=0, plot_model=False, + step_interval_msec=None, lag_threshold_msec=None, plot_model=False, verify_model={"audio": False, "vision": False}): _logger = logger.get() @@ -450,7 +449,7 @@ def verify_compiled_ell_nodes_in_vision_model(modelFile, cntk_model, model_cntk_ # Feed input to the ELL model _logger.info("Getting computed ELL results") - ell_map.Compute(ell_input_tensor, dtype=np.float32) + ell_map.Compute(ell_input_tensor) model_clone = None if cntk_node.op_name != "UserFunction": model_clone = cntk_node.clone(CloneMethod.clone) @@ -470,10 +469,9 @@ def verify_compiled_ell_nodes_in_vision_model(modelFile, cntk_model, model_cntk_ _logger.info("Getting compiled ELL results") compiler_options = ell.model.MapCompilerOptions() compiler_options.useBlas = True - compiled_ell_map = ell_map.Compile("host", "model", "predict", compilerOptions=compiler_options, - dtype=np.float32) + compiled_ell_map = ell_map.Compile("host", "model", "predict", compilerOptions=compiler_options) - result_from_compiled = np.array(compiled_ell_map.Compute(ell_input_tensor, dtype=np.float32)) + result_from_compiled = np.array(compiled_ell_map.Compute(ell_input_tensor)) output_shape = cntk_output.shape if (len(output_shape) == 3): if cntk_output.size == result_from_compiled.size: @@ -488,6 +486,13 @@ def verify_compiled_ell_nodes_in_vision_model(modelFile, cntk_model, model_cntk_ result_from_compiled = result_from_compiled[padding:output_shape[0] + padding, padding:output_shape[1] + padding, :] + # Put the ELL results into same order as CNTK + # if prefix_ordered_importer_nodes[-1].output_shapes[0][1] == "channel_row_column": + print(result_from_compiled.shape, cntk_output.shape) + # result_from_compiled = memory_shapes.get_tensor_in_ell_order(result_from_compiled, "xyz") + # print(result_from_compiled) + # print(cntk_output) + # Compare results. Some layers have large numbers (e.g > 500.734) and some small numbers # (e.g. 0.0038453). To make the comparison more resilient and meaningful for large numbers, # normalize before comparing, since comparison is being done on significant digits. 
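A minimal sketch of that normalize-then-compare step (an illustrative helper, not the importer's actual code; assumes numpy arrays):

```
import numpy as np

def assert_almost_equal_normalized(expected, actual, decimals=4):
    # Scale both arrays by the largest magnitude in the expected output, so the
    # significant-digit comparison is meaningful for layers producing values
    # larger than 500 as well as layers producing values smaller than 0.01.
    scale = max(np.abs(expected).max(), 1e-12)  # guard against an all-zero output
    np.testing.assert_array_almost_equal(expected / scale, actual / scale, decimals)
```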
@@ -553,8 +558,9 @@ def map_from_cntk_model_using_new_engine(modelFile, step_interval_msec=None, lag testing_info = {"apply_softmax": False} try: ordered_importer_nodes, node_mapping = importer_engine.get_importer_node_to_ell_mapping() - verify_ell_nodes_in_vision_model(ell_map, cntk_model, cntk_nodes, ordered_importer_nodes, node_mapping, - testing_info) + # RESTORE: + # verify_ell_nodes_in_vision_model(ell_map, cntk_model, cntk_nodes, ordered_importer_nodes, node_mapping, + # testing_info) verify_compiled_ell_nodes_in_vision_model(modelFile, cntk_model, cntk_nodes, ordered_importer_nodes, verify_model=verify_model) verify_ell_output_in_vision_model(ell_map, cntk_model, testing_info) diff --git a/tools/importers/torch/test/CMakeLists.txt b/tools/importers/torch/test/CMakeLists.txt index f32079e46..a0a9c2ea0 100644 --- a/tools/importers/torch/test/CMakeLists.txt +++ b/tools/importers/torch/test/CMakeLists.txt @@ -21,6 +21,7 @@ if(${PYTHON_ENABLED}) # copy files copy_newer_files(${test_name} test_src) - add_test(NAME ${test_name} COMMAND ${PYTHON_EXECUTABLE} -m unittest torch_importer_test.py) + # disabled until we merge with master + # add_test(NAME ${test_name} COMMAND ${PYTHON_EXECUTABLE} -m unittest torch_importer_test.py) -endif() # PYTHON_ENABLED \ No newline at end of file +endif() # PYTHON_ENABLED diff --git a/tools/trainers/forestTrainer/CMakeLists.txt b/tools/trainers/forestTrainer/CMakeLists.txt index c80391081..ac109e4df 100644 --- a/tools/trainers/forestTrainer/CMakeLists.txt +++ b/tools/trainers/forestTrainer/CMakeLists.txt @@ -27,5 +27,5 @@ set (GLOBAL_BIN_DIR ${CMAKE_BINARY_DIR}/bin) set (test_name ${tool_name}_test) add_test(NAME ${test_name} WORKING_DIRECTORY ${GLOBAL_BIN_DIR} - COMMAND ${tool_name} -idf ${CMAKE_SOURCE_DIR}/examples/data/tinyTestData.txt -dd auto -omf null -v) + COMMAND ${tool_name} -idf ${ELL_ROOT}/examples/data/tinyTestData.txt -dd auto -omf null -v) set_test_library_path(${test_name}) diff --git a/tools/trainers/linearTrainer/CMakeLists.txt b/tools/trainers/linearTrainer/CMakeLists.txt index 42f09cb8b..5ac6e250c 100644 --- a/tools/trainers/linearTrainer/CMakeLists.txt +++ b/tools/trainers/linearTrainer/CMakeLists.txt @@ -31,65 +31,65 @@ set (GLOBAL_BIN_DIR ${CMAKE_BINARY_DIR}/bin) set (test_name ${tool_name}_test_0) add_test(NAME ${test_name} WORKING_DIRECTORY ${GLOBAL_BIN_DIR} - COMMAND ${tool_name} --inputDataFilename ${CMAKE_SOURCE_DIR}/examples/data/tinyTestData.txt -dd 3 -r 0.01 --outputModelFilename linearTrainer_model_1.model -v -ne 20 --lossFunction log) + COMMAND ${tool_name} --inputDataFilename ${ELL_ROOT}/examples/data/tinyTestData.txt -dd 3 -r 0.01 --outputModelFilename linearTrainer_model_1.model -v -ne 20 --lossFunction log) set_test_library_path(${test_name}) set (test_name ${tool_name}_test_1) add_test(NAME ${test_name} WORKING_DIRECTORY ${GLOBAL_BIN_DIR} - COMMAND ${tool_name} --inputDataFilename ${CMAKE_SOURCE_DIR}/examples/data/testData.txt --inputModelFilename ${CMAKE_BINARY_DIR}/examples/models/model_3.model --modelInputs 1053 --modelOutputs 1060.output -dd 3 -r 0.01 --outputModelFilename linearTrainer_model_3.model -v -ne 20 --lossFunction log) + COMMAND ${tool_name} --inputDataFilename ${ELL_ROOT}/examples/data/testData.txt --inputModelFilename ${CMAKE_BINARY_DIR}/examples/models/model_3.model --modelInputs 1053 --modelOutputs 1060.output -dd 3 -r 0.01 --outputModelFilename linearTrainer_model_3.model -v -ne 20 --lossFunction log) set_test_library_path(${test_name}) set (test_name ${tool_name}_test_2) add_test(NAME ${test_name} 
WORKING_DIRECTORY ${GLOBAL_BIN_DIR} - COMMAND ${tool_name} --inputDataFilename ${CMAKE_SOURCE_DIR}/examples/data/testData.txt -dd 3 -lf log -v -ne 30 -r 0.001 -a SGD) + COMMAND ${tool_name} --inputDataFilename ${ELL_ROOT}/examples/data/testData.txt -dd 3 -lf log -v -ne 30 -r 0.001 -a SGD) set_test_library_path(${test_name}) set (test_name ${tool_name}_test_3) add_test(NAME ${test_name} WORKING_DIRECTORY ${GLOBAL_BIN_DIR} - COMMAND ${tool_name} --inputDataFilename ${CMAKE_SOURCE_DIR}/examples/data/testData.txt -dd 3 -lf hinge -v -ne 30 -r 0.001 -a SGD) + COMMAND ${tool_name} --inputDataFilename ${ELL_ROOT}/examples/data/testData.txt -dd 3 -lf hinge -v -ne 30 -r 0.001 -a SGD) set_test_library_path(${test_name}) set (test_name ${tool_name}_test_4) add_test(NAME ${test_name} WORKING_DIRECTORY ${GLOBAL_BIN_DIR} - COMMAND ${tool_name} --inputDataFilename ${CMAKE_SOURCE_DIR}/examples/data/testData.txt -dd 3 -lf squared -v -ne 30 -r 1 -a SGD) + COMMAND ${tool_name} --inputDataFilename ${ELL_ROOT}/examples/data/testData.txt -dd 3 -lf squared -v -ne 30 -r 1 -a SGD) set_test_library_path(${test_name}) set (test_name ${tool_name}_test_5) add_test(NAME ${test_name} WORKING_DIRECTORY ${GLOBAL_BIN_DIR} - COMMAND ${tool_name} --inputDataFilename ${CMAKE_SOURCE_DIR}/examples/data/testData.txt -dd 3 -lf log -v -ne 30 -r 0.001 -a SparseDataSGD) + COMMAND ${tool_name} --inputDataFilename ${ELL_ROOT}/examples/data/testData.txt -dd 3 -lf log -v -ne 30 -r 0.001 -a SparseDataSGD) set_test_library_path(${test_name}) set (test_name ${tool_name}_test_6) add_test(NAME ${test_name} WORKING_DIRECTORY ${GLOBAL_BIN_DIR} - COMMAND ${tool_name} --inputDataFilename ${CMAKE_SOURCE_DIR}/examples/data/testData.txt -dd 3 -lf hinge -v -ne 30 -r 0.001 -a SparseDataSGD) + COMMAND ${tool_name} --inputDataFilename ${ELL_ROOT}/examples/data/testData.txt -dd 3 -lf hinge -v -ne 30 -r 0.001 -a SparseDataSGD) set_test_library_path(${test_name}) set (test_name ${tool_name}_test_7) add_test(NAME ${test_name} WORKING_DIRECTORY ${GLOBAL_BIN_DIR} - COMMAND ${tool_name} --inputDataFilename ${CMAKE_SOURCE_DIR}/examples/data/testData.txt -dd 3 -lf squared -v -ne 30 -r 1 -a SparseDataSGD) + COMMAND ${tool_name} --inputDataFilename ${ELL_ROOT}/examples/data/testData.txt -dd 3 -lf squared -v -ne 30 -r 1 -a SparseDataSGD) set_test_library_path(${test_name}) set (test_name ${tool_name}_test_8) add_test(NAME ${test_name} WORKING_DIRECTORY ${GLOBAL_BIN_DIR} - COMMAND ${tool_name} --inputDataFilename ${CMAKE_SOURCE_DIR}/examples/data/testData.txt -dd 3 -lf log -v -ne 30 -r 0.01 -a SparseDataCenteredSGD) + COMMAND ${tool_name} --inputDataFilename ${ELL_ROOT}/examples/data/testData.txt -dd 3 -lf log -v -ne 30 -r 0.01 -a SparseDataCenteredSGD) set_test_library_path(${test_name}) set (test_name ${tool_name}_test_9) add_test(NAME ${test_name} WORKING_DIRECTORY ${GLOBAL_BIN_DIR} - COMMAND ${tool_name} --inputDataFilename ${CMAKE_SOURCE_DIR}/examples/data/testData.txt -dd 3 -lf hinge -v -ne 30 -r 0.01 -a SparseDataCenteredSGD) + COMMAND ${tool_name} --inputDataFilename ${ELL_ROOT}/examples/data/testData.txt -dd 3 -lf hinge -v -ne 30 -r 0.01 -a SparseDataCenteredSGD) set_test_library_path(${test_name}) set (test_name ${tool_name}_test_10) add_test(NAME ${test_name} WORKING_DIRECTORY ${GLOBAL_BIN_DIR} - COMMAND ${tool_name} --inputDataFilename ${CMAKE_SOURCE_DIR}/examples/data/testData.txt -dd 3 -lf squared -v -ne 30 -r 1 -a SparseDataCenteredSGD) + COMMAND ${tool_name} --inputDataFilename ${ELL_ROOT}/examples/data/testData.txt -dd 3 -lf squared -v -ne 30 -r 1 
-a SparseDataCenteredSGD) set_test_library_path(${test_name}) diff --git a/tools/trainers/protoNNTrainer/CMakeLists.txt b/tools/trainers/protoNNTrainer/CMakeLists.txt index e5815a3c2..26c92e8d8 100644 --- a/tools/trainers/protoNNTrainer/CMakeLists.txt +++ b/tools/trainers/protoNNTrainer/CMakeLists.txt @@ -31,5 +31,5 @@ set (GLOBAL_BIN_DIR ${CMAKE_BINARY_DIR}/bin) set (test_name ${tool_name}_test_0) add_test(NAME ${test_name} WORKING_DIRECTORY ${GLOBAL_BIN_DIR} - COMMAND ${tool_name} --inputDataFilename ${CMAKE_SOURCE_DIR}/examples/data/protonnTestData.txt -dd auto -sw 1 -sb 1 -sz 1 -pd 10 -l 2 -mp 5 -v --evaluationFrequency 1 -plf L2) + COMMAND ${tool_name} --inputDataFilename ${ELL_ROOT}/examples/data/protonnTestData.txt -dd auto -sw 1 -sb 1 -sz 1 -pd 10 -l 2 -mp 5 -v --evaluationFrequency 1 -plf L2) set_test_library_path(${test_name}) diff --git a/tools/trainers/sweepingSGDTrainer/CMakeLists.txt b/tools/trainers/sweepingSGDTrainer/CMakeLists.txt index 207bfa1ac..464ceb8ee 100644 --- a/tools/trainers/sweepingSGDTrainer/CMakeLists.txt +++ b/tools/trainers/sweepingSGDTrainer/CMakeLists.txt @@ -27,5 +27,5 @@ set (GLOBAL_BIN_DIR ${CMAKE_BINARY_DIR}/bin) set (test_name ${tool_name}_test) add_test(NAME ${test_name} WORKING_DIRECTORY ${GLOBAL_BIN_DIR} - COMMAND ${tool_name} -idf ${CMAKE_SOURCE_DIR}/examples/data/testData.txt -dd 21 -omf sweepingSgdTrainer_model.xml -v -lf log) + COMMAND ${tool_name} -idf ${ELL_ROOT}/examples/data/testData.txt -dd 21 -omf sweepingSgdTrainer_model.xml -v -lf log) set_test_library_path(${test_name}) diff --git a/tools/utilities/finetune/CMakeLists.txt b/tools/utilities/finetune/CMakeLists.txt index 62a7a4a44..fbfa8db5a 100644 --- a/tools/utilities/finetune/CMakeLists.txt +++ b/tools/utilities/finetune/CMakeLists.txt @@ -5,7 +5,7 @@ # finetune tool set(tool_name finetune) -set(common_src +set(common_src src/DataStatistics.cpp src/DataUtils.cpp src/FineTuneArguments.cpp @@ -43,13 +43,13 @@ source_group("include" FILES ${include}) # create executable in build\bin set (GLOBAL_BIN_DIR ${CMAKE_BINARY_DIR}/bin) -set (EXECUTABLE_OUTPUT_PATH ${GLOBAL_BIN_DIR}) +set (EXECUTABLE_OUTPUT_PATH ${GLOBAL_BIN_DIR}) add_executable(${tool_name} ${docs} ${src} ${main_src} ${include}) target_include_directories(${tool_name} PRIVATE include ${ELL_LIBRARIES_DIR}) target_link_libraries(${tool_name} common data dsp emitters evaluators functions model nodes optimization passes predictors utilities) copy_shared_libraries(${tool_name}) -# put this project in the utilities folder in the IDE +# put this project in the utilities folder in the IDE set_property(TARGET ${tool_name} PROPERTY FOLDER "tools/trainers") # @@ -80,9 +80,6 @@ set(test_include test/include/TestModelUtils.h test/include/TestOptimizationUtils.h test/include/TestTransformData.h -) - -set(test_tcc ) source_group("src" FILES ${test_src}) diff --git a/tools/utilities/nodeTiming/gemmCodeNode/.gitignore b/tools/utilities/nodeTiming/gemmCodeNode/.gitignore new file mode 100644 index 000000000..c28f134eb --- /dev/null +++ b/tools/utilities/nodeTiming/gemmCodeNode/.gitignore @@ -0,0 +1,2 @@ +models/ +test_*/ diff --git a/tools/utilities/nodeTiming/gemmCodeNode/README.md b/tools/utilities/nodeTiming/gemmCodeNode/README.md new file mode 100644 index 000000000..3db2bf5cd --- /dev/null +++ b/tools/utilities/nodeTiming/gemmCodeNode/README.md @@ -0,0 +1,46 @@ +The `nodeTiming/gemmCodeNode` directory contains tools for building ELL models with different Matrix-Matrix multiplication implementations and measuring the performance of those 
implementations.
+
+Layout of gemmCodeNode:
+
+```
+deploy/ -- contains files to be copied without alteration into implementation timing directories
+    full_pass.(cmd|sh) -- runs import_all.(cmd|sh), build_all.(cmd|sh), and run_all.(cmd|sh) in a timing directory
+    run.py -c N -- runs run_all.(cmd|sh) N times (default 1) and runs the timing_aggregator processing on the results
+    timing_aggregator.py -f filename -- reads filename and parses it for gemm time output. For each uniquely named gemm impl it finds, it aggregates the times and reports the range, the average, and the ratio of that impl's average time against the best (fastest) average (a sketch of this parsing follows below)
+
+scripts/ -- contains scripts for building ELL models with various GEMM impls and generating the test projects for building, running, and timing the performance of the models
+    build_gemm_models.py -- builds ELL models with the specified GEMM implementation and panel/kernel parameters
+    make_default_models.py -- builds ELL models for all of the different GEMM implementations, plus the OpenBLAS, naive for-loop, and (if -mkl is specified and MKL is installed) MKL implementations
+    special_model_args.py -- contains a dictionary mapping model file names to lists of additional arguments to pass to wrap.py for those models when importing them
+    build_tests.py -- generates the testing and timing projects for the given models and implementations (specified with the -v option)
+
+src/ -- contains source files and template source files for the testing and timing projects
+    CMakeLists.txt.in -- CMakeLists.txt template file for the timing project; scripts/build_tests.py reads this template and produces CMakeLists.txt.
+    Runner.cpp.in -- Runner.cpp template file for the timing project; scripts/build_tests.py reads this template and produces Runner.cpp, the main cpp file that runs and times the model.
+```
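For illustration, a minimal sketch of timing_aggregator.py's parse-and-compare logic (it uses the same regex and ratio-against-best computation as the script itself; results.txt is a stand-in for a captured run log):

```
import re

# Each timed run prints lines like "gemm5 time = 0.0123"; bucket the times by
# implementation name, then compare each implementation's average to the best.
# Assumes at least one matching line is present in the log.
times = {}
with open("results.txt") as f:
    for line in f:
        match = re.match("(gemm.*) time = (.*)", line)
        if match:
            times.setdefault(match.group(1), []).append(float(match.group(2)))

best = min(sum(v) / len(v) for v in times.values())
for key, values in times.items():
    avg = sum(values) / len(values)
    print(key, "avg:", avg, "range:", (min(values), max(values)), "ratio vs best:", avg / best)
```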
+
+General workflow with these tools:
+```
+# Modify and introduce a new GEMM implementation
+<< build ELL >>
+
+cd <ELL root>/tools/utilities/nodeTiming/gemmCodeNode
+
+# Generate ELL models for each GEMM implementation in the ./models/ directory
+# This only needs to be done whenever a new GEMM implementation is added, but not when an existing implementation is modified
+python scripts/make_default_models.py
+
+# Generate the testing projects for the GEMM implementations being tested
+# Suppose implementations 2, 5, 7, and 14 are being tested and the directory to put the test projects in is named my_testing_dir
+# To include mkl, provide the -mkl flag
+python scripts/build_tests.py -v 2 5 7 14 -o my_testing_dir
+
+# cd into the test directory for the data type and size that you want to test
+cd my_testing_dir/float/256x256
+
+# Run import_all, build_all, and run_all:
+./full_pass.(cmd|sh)
+
+# To get aggregate timing over 20 runs:
+python ./run.py -c 20
+```
\ No newline at end of file
diff --git a/tools/utilities/nodeTiming/gemmCodeNode/deploy/full_pass.cmd b/tools/utilities/nodeTiming/gemmCodeNode/deploy/full_pass.cmd
new file mode 100644
index 000000000..444e95208
--- /dev/null
+++ b/tools/utilities/nodeTiming/gemmCodeNode/deploy/full_pass.cmd
@@ -0,0 +1,12 @@
+@echo off
+REM ####################################################################################################
+REM #
+REM # Project: Embedded Learning Library (ELL)
+REM # File: full_pass.cmd
+REM # Authors: Mason Remy
+REM #
+REM ####################################################################################################
+
+call import_all.cmd
+call build_all.cmd
+call run_all.cmd
\ No newline at end of file
diff --git a/tools/utilities/nodeTiming/gemmCodeNode/deploy/full_pass.sh b/tools/utilities/nodeTiming/gemmCodeNode/deploy/full_pass.sh
new file mode 100755
index 000000000..b94f1746c
--- /dev/null
+++ b/tools/utilities/nodeTiming/gemmCodeNode/deploy/full_pass.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+####################################################################################################
+#
+# Project: Embedded Learning Library (ELL)
+# File: full_pass.sh
+# Authors: Mason Remy
+#
+####################################################################################################
+
+chmod +x import_all.sh
+./import_all.sh
+
+chmod +x build_all.sh
+./build_all.sh
+
+chmod +x run_all.sh
+./run_all.sh
diff --git a/tools/utilities/nodeTiming/gemmCodeNode/deploy/run.py b/tools/utilities/nodeTiming/gemmCodeNode/deploy/run.py
new file mode 100755
index 000000000..bf486757d
--- /dev/null
+++ b/tools/utilities/nodeTiming/gemmCodeNode/deploy/run.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python3
+####################################################################################################
+#
+# Project: Embedded Learning Library (ELL)
+# File: run.py
+# Authors: Mason Remy
+#
+# Requires: Python 3.x
+#
+####################################################################################################
+
+import argparse
+import subprocess
+import platform
+import os
+import timing_aggregator
+
+script_path = os.path.dirname(os.path.realpath(__file__))
+
+def run():
+    platform_run_script = "run_all.sh"
+    if platform.system() == "Windows":
+        platform_run_script = "run_all.cmd"
+    run_full_path = os.path.join(script_path, platform_run_script)
+    results = subprocess.run([run_full_path], stdout=subprocess.PIPE)
+    split_lines = results.stdout.decode("utf-8").split(os.linesep)
+    return split_lines
+
+if __name__ == "__main__":
+    parser
= argparse.ArgumentParser() + parser.add_argument("-c", "--count", default=1, type=int) + args = parser.parse_args() + + accumulated_result_lines = [] + print("Running {} time(s)...".format(args.count)) + for i in range(args.count): + accumulated_result_lines.extend(run()) + print("{}/{} complete".format(i + 1, args.count)) + + results_dict = timing_aggregator.parse_output(accumulated_result_lines) + timing_aggregator.print_results(results_dict) \ No newline at end of file diff --git a/tools/utilities/nodeTiming/gemmCodeNode/deploy/timing_aggregator.py b/tools/utilities/nodeTiming/gemmCodeNode/deploy/timing_aggregator.py new file mode 100755 index 000000000..72b437f90 --- /dev/null +++ b/tools/utilities/nodeTiming/gemmCodeNode/deploy/timing_aggregator.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python3 +#################################################################################################### +# +# Project: Embedded Learning Library (ELL) +# File: timing_aggregator.py +# Authors: Mason Remy +# +# Requires: Python 3.x +# +#################################################################################################### + +import re +import argparse +import operator + +class TimingResult: + key = "" + count = 0 + result_values = [] + average = 0 + result_range = () + + def __init__(self, key="", raw_results=[]): + self.key = key + self.result_values = raw_results + self.count = len(raw_results) + if self.count > 0: + self.average = sum(raw_results) / self.count + self.result_range = (min(raw_results), max(raw_results)) + + def print_summary(self, indenting=0): + indent = "\t" * indenting + print("{}{}".format(indent, self.key)) + print("{}\tAvg = {}".format(indent, self.average)) + print("{}\tRange = {}".format(indent, self.result_range)) + print("{}\tCount = {}".format(indent, self.count)) + + def print_raw_results(self, indenting=0): + indent = "\t" * indenting + print("{}{} times:".format(indent, self.key)) + for result in self.result_values: + print("{}\t{}".format(indent, result)) + +def parse_output(output_lines): + timing_pattern = "(gemm.*) time = (.*)" + timing_regex_matcher = re.compile(timing_pattern) + time_dict = {} + for line in output_lines: + match = timing_regex_matcher.match(line) + if match: + key = match.group(1) + value = float(match.group(2)) + if key in time_dict: + time_dict[key].append(value) + else: + time_dict[key] = [value] + results = {} + for key in time_dict: + results[key] = TimingResult(key, time_dict[key]) + return results + +def get_ratios_against_best(timing_results_dict): + best_time = None + for key in timing_results_dict: + if best_time == None or best_time > timing_results_dict[key].average: + best_time = timing_results_dict[key].average + ratios_dict = {} + for key in timing_results_dict: + ratios_dict[key] = timing_results_dict[key].average / best_time + ordered_key_ratio_pairs = sorted(ratios_dict.items(), key=operator.itemgetter(1)) + return ordered_key_ratio_pairs + +def print_results(results_dict, include_raw_results=True, include_statistics=True, include_ratios=True): + for key in results_dict: + if include_raw_results: + results_dict[key].print_raw_results() + print() + if include_statistics: + results_dict[key].print_summary() + print() + if include_ratios: + ratios_list = get_ratios_against_best(results_dict) + print("Ratios:") + for (key, ratio) in ratios_list: + print("{} : {}".format(key, ratio)) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-f", "--file", required=True) + args = 
parser.parse_args() + + with open(args.file, 'r') as f: + lines = f.readlines() + results_dict = parse_output(lines) + print_results(results_dict) diff --git a/tools/utilities/nodeTiming/gemmCodeNode/scripts/build_gemm_models.py b/tools/utilities/nodeTiming/gemmCodeNode/scripts/build_gemm_models.py new file mode 100755 index 000000000..ff12131da --- /dev/null +++ b/tools/utilities/nodeTiming/gemmCodeNode/scripts/build_gemm_models.py @@ -0,0 +1,128 @@ +#!/usr/bin/env python3 +#################################################################################################### +# +# Project: Embedded Learning Library (ELL) +# File: build_gemm_models.py +# Authors: Mason Remy +# +# Requires: Python 3.x +# +#################################################################################################### + +import os +import sys + +script_path = os.path.dirname(os.path.realpath(__file__)) +sys.path += [os.path.join(script_path, "../../../pythonlibs")] + +import find_ell + +ell_build_root = find_ell.find_ell_build() +sys.path += [os.path.join(ell_build_root, "interfaces", "python", "package")] + +import ell +import random +import argparse +import numpy as np + +type_mapping = { + "double": ell.nodes.PortType.real, + "float": ell.nodes.PortType.smallReal +} + +def make_matrix(rows, cols, use_fixed_seed, data_type_str): + if use_fixed_seed: + np.random.seed(0) + return ell.math.DoubleVector(np.random.rand(rows * cols)) + +def build_model(output_filename, use_fallback, gemm_impl, M=256, N=256, K=256, kernel_size=[1, 1, 1], cache_sizes=[64, 64, 64], data_type_str="double", ignore_correctness=False): + model = ell.model.Model() + mb = ell.model.ModelBuilder() + + data_type = type_mapping[data_type_str] + + # input node is the left matrix in the matrix multiplication + input_layout = ell.model.PortMemoryLayout([M, K]) + input_node = mb.AddInputNode(model, input_layout, data_type) + input_node_output = ell.nodes.PortElements(input_node.GetOutputPort("output")) + + realign_node = mb.AddOutputNode(model, input_layout, input_node_output) + realign_node_output = ell.nodes.PortElements(realign_node.GetOutputPort("output")) + + right_matrix = make_matrix(K, N, not ignore_correctness, data_type_str) + right_constant_memory_layout = ell.model.PortMemoryLayout([K, N]) + right_constant_node = mb.AddConstantNode(model, right_matrix, right_constant_memory_layout, data_type) + right_node_output = ell.nodes.PortElements(right_constant_node.GetOutputPort("output")) + + gemm_output = None + if use_fallback: + fallback_gemm_node = mb.AddMatrixMatrixMultiplyNode(model, realign_node_output, right_node_output) + gemm_output = ell.nodes.PortElements(fallback_gemm_node.GetOutputPort("output")) + else: + gemm_node = mb.AddMatrixMatrixMultiplyCodeNode(model, realign_node_output, right_node_output, cache_sizes[0], cache_sizes[1], cache_sizes[2], kernel_size[0], kernel_size[1], kernel_size[2], gemm_impl) + gemm_output = ell.nodes.PortElements(gemm_node.GetOutputPort("output")) + + # add output node + output_node = mb.AddOutputNode(model, ell.model.PortMemoryLayout([M, N]), gemm_output) + + ell_map = ell.model.Map(model, input_node, ell.nodes.PortElements(output_node.GetOutputPort("output"))) + ell_map.Save(output_filename) + +def build_all_models(output_dir, M=256, N=256, K=256, kernel_size=[4, 4, 4], cache_sizes=[64, 64, 64], data_type_str="double", base_filename="gemm", file_extension="ell", ignore_correctness=False): + gemm_impl_count = ell.nodes.MatrixMatrixMultiplyImplementation.ImplementationCount + for gemm_impl in 
range(gemm_impl_count): + output_filename = os.path.join(output_dir, "{}{}.{}".format(base_filename, gemm_impl, file_extension)) + build_model(output_filename=output_filename, + use_fallback=False, + gemm_impl=gemm_impl, + M=M, + N=N, + K=K, + kernel_size=kernel_size, + cache_sizes=cache_sizes, + data_type_str=data_type_str, + ignore_correctness=ignore_correctness) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-o", "--output_dir", required=True) + parser.add_argument("-M", type=int, default=256) + parser.add_argument("-N", type=int, default=256) + parser.add_argument("-K", type=int, default=256) + parser.add_argument("--type", "-t", choices=list(type_mapping), default="double") + parser.add_argument("--ignore_correctness", help="Don't use a fixed random seed. A fixed random seed is used to validate GEMM results between different implementations", action="store_true") + parser.add_argument("--panel_size", "-ps", type=int, nargs=3, default=[64, 64, 64], help="Panel size values for M, N, and K dimensions") + parser.add_argument("--kernel_size", "-ks", type=int, nargs=3, default=[1, 1, 1], help="Kernel size values for M, N, and K dimensions") + parser.add_argument("--base_filename", default="gemm") + parser.add_argument("--file_extension", default="ell") + + group = parser.add_mutually_exclusive_group() + group.add_argument("--all_impls", "-a", action="store_true") + group.add_argument("--impl", "-i", type=int, default=0) + group.add_argument("--fallback", "-f", action="store_true", help="Use ELL naive for-loops or BLAS impl via MatrixMultiplyNode (non-code-node)") + + args = parser.parse_args() + + if args.all_impls: + build_all_models(output_dir=args.output_dir, + M=args.M, + N=args.N, + K=args.K, + kernel_size=args.kernel_size, + cache_sizes=args.panel_size, + data_type_str=args.type, + base_filename=args.base_filename, + file_extension=args.file_extension, + ignore_correctness=args.ignore_correctness) + else: + output_filename = os.path.join(args.output_dir, "{}.{}".format(args.base_filename, args.file_extension)) + build_model(output_filename=output_filename, + use_fallback=args.fallback, + gemm_impl=args.impl, + M=args.M, + N=args.N, + K=args.K, + kernel_size=args.kernel_size, + cache_sizes=args.panel_size, + data_type_str=args.type, + ignore_correctness=args.ignore_correctness) diff --git a/tools/utilities/nodeTiming/gemmCodeNode/scripts/build_tests.py b/tools/utilities/nodeTiming/gemmCodeNode/scripts/build_tests.py new file mode 100755 index 000000000..3e26db8b2 --- /dev/null +++ b/tools/utilities/nodeTiming/gemmCodeNode/scripts/build_tests.py @@ -0,0 +1,289 @@ +#!/usr/bin/env python3 +#################################################################################################### +# +# Project: Embedded Learning Library (ELL) +# File: build_tests.py +# Authors: Mason Remy +# +# Requires: Python 3.x +# +#################################################################################################### + +import os +import sys +import platform +import argparse +import shutil +from special_model_args import special_model_args +from make_default_models import default_model_dir + +script_path = os.path.dirname(os.path.realpath(__file__)) +sys.path += [os.path.join(script_path, "../../pythonlibs")] + +import find_ell + +win_script_header = "@echo off\n\n" + +def make_cmakelists(srcdir, outdir, dir_name, model_name, run_count=1000, warmup_count=100, additional_libraries=[]): + add_subdirectory_str = "add_subdirectory({})" + + 
diff --git a/tools/utilities/nodeTiming/gemmCodeNode/scripts/build_tests.py b/tools/utilities/nodeTiming/gemmCodeNode/scripts/build_tests.py
new file mode 100755
index 000000000..3e26db8b2
--- /dev/null
+++ b/tools/utilities/nodeTiming/gemmCodeNode/scripts/build_tests.py
@@ -0,0 +1,289 @@
+#!/usr/bin/env python3
+####################################################################################################
+#
+#   Project:  Embedded Learning Library (ELL)
+#   File:     build_tests.py
+#   Authors:  Mason Remy
+#
+#   Requires: Python 3.x
+#
+####################################################################################################
+
+import os
+import sys
+import platform
+import argparse
+import shutil
+from special_model_args import special_model_args
+from make_default_models import default_model_dir
+
+script_path = os.path.dirname(os.path.realpath(__file__))
+sys.path += [os.path.join(script_path, "../../../pythonlibs")]
+
+import find_ell
+
+win_script_header = "@echo off\n\n"
+
+def make_cmakelists(srcdir, outdir, dir_name, model_name, run_count=1000, warmup_count=100, additional_libraries=[]):
+    add_subdirectory_str = "add_subdirectory({})"
+    add_subdirectory_tag = "@ADD_SUBDIRECTORIES@"
+    link_libraries_tag = "@LINK_LIBRARIES@"
+    run_count_tag = "@RUN_COUNT@"
+    warmup_count_tag = "@WARMUP_COUNT@"
+
+    cmake_template_file = os.path.join(srcdir, "CMakeLists.txt.in")
+    output_dir = os.path.join(outdir, dir_name)
+    os.makedirs(output_dir, exist_ok=True)
+    cmake_outfile = os.path.join(outdir, "{}/CMakeLists.txt".format(dir_name))
+    with open(cmake_template_file) as f:
+        template = f.read()
+
+    link_libraries = [model_name] + additional_libraries
+    template = template.replace(add_subdirectory_tag, add_subdirectory_str.format(dir_name))
+    template = template.replace(link_libraries_tag, "{}".format(" ".join(link_libraries)))
+    template = template.replace(run_count_tag, str(run_count))
+    template = template.replace(warmup_count_tag, str(warmup_count))
+    with open(cmake_outfile, 'w', newline='\n') as f:
+        f.write(template)
+
+def make_runner_cpp(srcdir, outdir, dir_name, model_name, data_type):
+    capitalized_model_name = model_name[0].upper() + model_name[1:]
+
+    model_name_tag = "@MODEL_NAME@"
+    model_dir_tag = "@MODEL_DIR@"
+    allcaps_model_name_tag = "@ALLCAPS_MODEL_NAME@"
+    data_type_tag = "@DATA_TYPE@"
+
+    cpp_template_file = os.path.join(srcdir, "Runner.cpp.in")
+    cpp_outfile = os.path.join(outdir, "{}/Runner.cpp".format(dir_name))
+    with open(cpp_template_file) as f:
+        template = f.read()
+    tags_list = [
+        (model_name_tag, model_name),
+        (model_dir_tag, dir_name),
+        (allcaps_model_name_tag, capitalized_model_name),
+        (data_type_tag, data_type)
+    ]
+    for tag, replacement in tags_list:
+        template = template.replace(tag, replacement)
+
+    with open(cpp_outfile, 'w', newline='\n') as f:
+        f.write(template)
+
+def make_build_script(outdir, dir_name, model_name, use_mkl):
+    win_build_script_str = ('mkdir build_{}\n'
+                            'cd build_{}\n'
+                            'cmake -G "Visual Studio 15 2017 Win64" -Thost=x64 {} .. && cmake --build . --config release -- /m /verbosity:minimal\n'
+                            'cd ..')
+    unix_build_script_str = ('mkdir build_{}\n'
+                             'cd build_{}\n'
+                             'cmake .. -DCMAKE_BUILD_TYPE=Release {}\n'
+                             'make -j\n'
+                             'if [[ "$OSTYPE" == "darwin"* ]]; then\n'
+                             '    objdump -d --no-show-raw-insn ./Runner > Runner.s\n'
+                             'else\n'
+                             '    objdump -d -w --no-show-raw-insn -rRSC ./Runner > Runner.s\n'
+                             'fi\n'
+                             'cd ..')
+    use_mkl_str = '-DUSE_MKL=1' if use_mkl else ''
+    win_build_str = win_build_script_str.format(model_name, model_name, use_mkl_str)
+    unix_build_str = unix_build_script_str.format(model_name, model_name, use_mkl_str)
+    win_outfile = os.path.join(outdir, "{}/build.cmd".format(dir_name))
+    unix_outfile = os.path.join(outdir, "{}/build.sh".format(dir_name))
+    with open(win_outfile, 'w') as f:
+        f.write(win_build_str)
+    with open(unix_outfile, 'w', newline='\n') as f:
+        f.write(unix_build_str)
+
+def make_build_all_script(ell_models, outdir):
+    win_build_script_str = ('cd {}\n'
+                            'call build.cmd\n'
+                            'cd ..\n\n')
+    unix_build_script_str = ('cd {}\n'
+                             'chmod +x build.sh\n'
+                             './build.sh\n'
+                             'cd ..\n\n')
+    win_str_list = [win_build_script_str.format(model_name) for (model_name, model_filename, model_relpath, model_path) in ell_models]
+    unix_str_list = [unix_build_script_str.format(model_name) for (model_name, model_filename, model_relpath, model_path) in ell_models]
+    win_outfile = os.path.join(outdir, "build_all.cmd")
+    unix_outfile = os.path.join(outdir, "build_all.sh")
+    with open(win_outfile, 'w') as f:
+        f.write(win_script_header)
+        f.write("\n".join(win_str_list))
+    with open(unix_outfile, 'w', newline='\n') as f:
+        f.write("\n".join(unix_str_list))
+
+def make_run_all_script(ell_models, outdir, target):
+    win_run_script_str = ('cd {}\\build_{}\\Release\n'
+                          'call Runner.exe\n'
+                          'cd ..\\..\\..\n')
+    unix_run_script_str = ('cd {}/build_{}\n'
+                           '{}./Runner\n'
+                           'cd ../..\n')
+    win_str_list = [win_run_script_str.format(model_name, model_name) for (model_name, model_filename, model_relpath, model_path) in ell_models]
+
+    preload_str = ""
+    if target == "pi3":
+        preload_str = "LD_PRELOAD=~/miniconda3/envs/py34/lib/libopenblas.so "
+    unix_str_list = [unix_run_script_str.format(model_name, model_name, preload_str) for (model_name, model_filename, model_relpath, model_path) in ell_models]
+
+    win_outfile = os.path.join(outdir, "run_all.cmd")
+    unix_outfile = os.path.join(outdir, "run_all.sh")
+    with open(win_outfile, 'w') as f:
+        f.write(win_script_header)
+        f.write("\n".join(win_str_list))
+    with open(unix_outfile, 'w', newline='\n') as f:
+        f.write("\n".join(unix_str_list))
+
+def make_clean_all_script(ell_models, outdir):
+    win_clean_str = "rd /s /q {}"
+    unix_clean_str = "rm -rf {}"
+    win_str_list = [win_clean_str.format(model_name) for (model_name, model_filename, model_relpath, model_path) in ell_models]
+    unix_str_list = [unix_clean_str.format(model_name) for (model_name, model_filename, model_relpath, model_path) in ell_models]
+    win_outfile = os.path.join(outdir, "clean_all.cmd")
+    unix_outfile = os.path.join(outdir, "clean_all.sh")
+    with open(win_outfile, 'w') as f:
+        f.write(win_script_header)
+        f.write("\n".join(win_str_list))
+    with open(unix_outfile, 'w', newline='\n') as f:
+        f.write("\n".join(unix_str_list))
+
+def make_import_all_script(ell_models, outdir, language="cpp", target="host", profile=False):
+    mkdir_str = "mkdir {}"
+    ell_root = find_ell.get_ell_root()
+    wrap_path = os.path.abspath(os.path.join(ell_root, "tools", "wrap", "wrap.py"))
+    wrap_str = "python {} -t {} -l {} --llvm_format ir -od {}/{} {} -f {} \n"
+    profile_str = "--profile" if profile else ""
+    str_list = []
+    for (model_name, model_filename, model_relpath, model_path) in ell_models:
+        str_list.append(mkdir_str.format(model_name))
+        last_str = model_path
+        if model_filename in special_model_args:
+            last_str += " " + " ".join(special_model_args[model_filename])
+        str_list.append(wrap_str.format(wrap_path, target, language, model_name, model_name, profile_str, last_str))
+
+    win_outfile = os.path.join(outdir, "import_all.cmd")
+    unix_outfile = os.path.join(outdir, "import_all.sh")
+    with open(win_outfile, 'w') as f:
+        f.write(win_script_header)
+        f.write("\n".join(str_list))
+    with open(unix_outfile, 'w', newline='\n') as f:
+        f.write("\n".join(str_list))
+
+def find_ell_models_under_path(path_to_walk, suffix=".ell"):
+    found_ell_models = []    # List of tuples (name, filename, relpath, fullpath)
+    for root, dirs, files in os.walk(path_to_walk):
+        for filename in files:
+            if filename.endswith(suffix):
+                relpath = os.path.relpath(root, start=path_to_walk)
+                name = filename[:-len(suffix)]
+                found_ell_models.append((name, filename, relpath, os.path.abspath(os.path.join(root, filename))))
+    return found_ell_models
+
+def copy_static_files(input_path, output_path):
+    for file_name in os.listdir(input_path):
+        full_path = os.path.join(input_path, file_name)
+        if os.path.isfile(full_path):
+            shutil.copy(full_path, output_path)
+
+def get_first_path_element(path):
+    first_path_element = None
+    while len(path) > 0:
+        split_relpath = os.path.split(path)
+        path = split_relpath[0]
+        first_path_element = split_relpath[1]
+    return first_path_element
+
+def create_gitignore(path, ignore_contents=["*"]):
+    output_file = os.path.join(path, ".gitignore")
+    with open(output_file, 'w', newline='\n') as f:
+        f.writelines(ignore_contents)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_path", "-m", default=default_model_dir, help="Path to search under for ELL models")
+    parser.add_argument("--variants", "-v", nargs="*", help="Which implementation variants of GEMM to build. Default is to build all", default=None)
+    parser.add_argument("--target", "-t", default="host")
+    parser.add_argument("--language", "-l", default="cpp")
+    parser.add_argument("--outdir", "-o", default="test_output")
+    parser.add_argument("--profile", "-p", action="store_true")
+    parser.add_argument("--run_count", "-r", type=int, default=1000)
+    parser.add_argument("--warmup_count", "-w", type=int, default=100)
+    parser.add_argument("--include_mkl", "-mkl", action="store_true")
+    parser.add_argument("--include_for_loops", action="store_true")
+    parser.add_argument("--additional_libraries", nargs="*", default=[])
+    parser.add_argument("--data_type", "-dt", choices=["float", "double"], default=None)
+    args = parser.parse_args()
+
+    mkl_name = "gemmMKL"
+    blas_name = "gemmBLAS"
+    naive_for_loops_name = "gemmELL"
+    models_using_mkl = [mkl_name]
+    order_precedence = [mkl_name, blas_name, naive_for_loops_name]
+
+    ell_models = find_ell_models_under_path(args.model_path)
+    if len(ell_models) == 0:
+        print("No ELL models found at {}".format(args.model_path))
+        sys.exit()
+
+    # Re-order ell models for better result printing
+    # Sort by the order_precedence list, then sort by N as an integer from the "gemmN" model name
+    ell_models.sort(key=lambda x: order_precedence.index(x[0]) if x[0] in order_precedence else (len(order_precedence) + int(x[0].lstrip("gemm"))))
+
+    if args.variants and len(args.variants) > 0:
+        models_to_include = []
+        if args.include_mkl:
+            # Add MKL first so other scripts can default to comparing against the first results
+            models_to_include.append(mkl_name)
+        models_to_include.append(blas_name)
+        if args.include_for_loops:
+            models_to_include.append(naive_for_loops_name)
+        additional_models_to_include = ["gemm{}".format(value) for value in args.variants]
+        models_to_include.extend(additional_models_to_include)
+        ell_models = list(filter(lambda ell_model_info: ell_model_info[0] in models_to_include, ell_models))
+    else:
+        if not args.include_mkl:
+            ell_models = list(filter(lambda ell_model_info: ell_model_info[0] != mkl_name, ell_models))
+        if not args.include_for_loops:
+            ell_models = list(filter(lambda ell_model_info: ell_model_info[0] != naive_for_loops_name, ell_models))
+
+    # Make the testing output directory and add a .gitignore file to it
+    os.makedirs(args.outdir, exist_ok=True)
+    create_gitignore(args.outdir)
+
+    # Group ELL models by their relative paths to separate the same implementation for different sizes/types
+    rel_path_to_ell_models = {}
+    for (model_name, model_filename, model_relpath, model_path) in ell_models:
+        if model_relpath not in rel_path_to_ell_models:
+            rel_path_to_ell_models[model_relpath] = []
+        rel_path_to_ell_models[model_relpath].append((model_name, model_filename, model_relpath, model_path))
+
+    for rel_path in rel_path_to_ell_models:
+        data_type = args.data_type
+        if data_type is None:
+            # Try to get the data type from the name of the first directory in the model relpath
+            first_path_elt = get_first_path_element(rel_path)
+            if first_path_elt in ["float", "double"]:
+                data_type = first_path_elt
+            else:
+                data_type = "double"    # Default
+
+        outdir = os.path.join(args.outdir, rel_path)
+        current_ell_models = rel_path_to_ell_models[rel_path]
+        os.makedirs(outdir, exist_ok=True)
+
+        copy_static_files(os.path.join(script_path, "..", "deploy"), outdir)
+        make_import_all_script(current_ell_models, outdir, args.language, args.target, args.profile)
+        make_build_all_script(current_ell_models, outdir)
+        make_run_all_script(current_ell_models, outdir, args.target)
+        make_clean_all_script(current_ell_models, outdir)
+        src_dir = os.path.join(script_path, "..", "src")
+        for (model_name, model_filename, model_relpath, model_path) in current_ell_models:
+            os.makedirs(os.path.join(outdir, model_name), exist_ok=True)
+            make_cmakelists(src_dir, outdir, model_name, model_name, args.run_count, args.warmup_count, additional_libraries=args.additional_libraries)
+            make_runner_cpp(src_dir, outdir, model_name, model_name, data_type)
+            make_build_script(outdir, model_name, model_name, model_name in models_using_mkl)

+    print("Created testing utilities at {}".format(args.outdir))
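The generators above all rely on plain @TAG@ string substitution into checked-in template files. A minimal sketch of that pattern, using tag names from the templates (the helper name is illustrative):

def fill_template(template_text, tags):
    # tags maps "@TAG@" placeholders to their replacement strings
    for tag, replacement in tags.items():
        template_text = template_text.replace(tag, replacement)
    return template_text

print(fill_template("add_definitions(-DRUN_COUNT=@RUN_COUNT@ -DWARMUP_COUNT=@WARMUP_COUNT@)",
                    {"@RUN_COUNT@": "1000", "@WARMUP_COUNT@": "100"}))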
diff --git a/tools/utilities/nodeTiming/gemmCodeNode/scripts/make_default_models.py b/tools/utilities/nodeTiming/gemmCodeNode/scripts/make_default_models.py
new file mode 100755
index 000000000..44a394f1b
--- /dev/null
+++ b/tools/utilities/nodeTiming/gemmCodeNode/scripts/make_default_models.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+####################################################################################################
+#
+#   Project:  Embedded Learning Library (ELL)
+#   File:     make_default_models.py
+#   Authors:  Mason Remy
+#
+#   Requires: Python 3.x
+#
+####################################################################################################
+
+import os
+import shutil
+import argparse
+
+import build_gemm_models
+
+script_path = os.path.dirname(os.path.realpath(__file__))
+default_model_dir = os.path.join(script_path, "..", "models")
+
+def make_dirs(base_output_dir, data_types, sizes):
+    os.makedirs(base_output_dir, exist_ok=True)
+    dir_map = {}
+    def add_path(path_dict, base_dir, dir, key):
+        path = os.path.join(base_dir, dir)
+        path_dict[key] = {
+            "path": path
+        }
+        os.makedirs(path, exist_ok=True)
+        return path
+
+    for datatype in data_types:
+        datatype_dir = add_path(dir_map, base_output_dir, datatype, datatype)
+        for size in sizes:
+            size_tag = "{}x{}".format(size, size)
+            add_path(dir_map[datatype], datatype_dir, size_tag, size)
+    return dir_map
+
+def build_loopnest_models(output_dir_map, data_types, sizes):
+    for datatype in data_types:
+        for size in sizes:
+            output_dir = output_dir_map[datatype][size]["path"]
+            build_gemm_models.build_all_models(output_dir=output_dir,
+                                               data_type_str=datatype,
+                                               M=size,
+                                               N=size,
+                                               K=size)
+
+def build_fallback_models(output_dir_map, data_types, sizes):
+    for datatype in data_types:
+        for size in sizes:
+            output_dir = output_dir_map[datatype][size]["path"]
+            output_filename = os.path.join(output_dir, "gemmELL.ell")
+            build_gemm_models.build_model(output_filename=output_filename,
+                                          use_fallback=True,
+                                          gemm_impl=0,
+                                          data_type_str=datatype,
+                                          M=size,
+                                          N=size,
+                                          K=size)
+            blas_filename = os.path.join(output_dir, "gemmBLAS.ell")
+            shutil.copy(output_filename, blas_filename)
+            mkl_filename = os.path.join(output_dir, "gemmMKL.ell")
+            shutil.copy(output_filename, mkl_filename)
+
+if __name__ == "__main__":
+    data_types = ["float", "double"]
+    sizes = [256]
+    dir_map = make_dirs(default_model_dir, data_types, sizes)
+    build_fallback_models(dir_map, data_types, sizes)
+    build_loopnest_models(dir_map, data_types, sizes)
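Note the fallback trick above: gemmBLAS.ell and gemmMKL.ell start as byte-for-byte copies of gemmELL.ell and only diverge at build time (special_model_args disables BLAS for gemmELL; gemmMKL is linked against MKL via -DUSE_MKL=1). A quick sanity check of that invariant, assuming the default output layout:

import filecmp

# The three fallback models are written as copies, so they should be
# byte-identical on disk; only their build/link configuration differs.
base = "models/double/256x256/"
assert filecmp.cmp(base + "gemmELL.ell", base + "gemmBLAS.ell", shallow=False)
assert filecmp.cmp(base + "gemmELL.ell", base + "gemmMKL.ell", shallow=False)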
diff --git a/tools/utilities/nodeTiming/gemmCodeNode/scripts/special_model_args.py b/tools/utilities/nodeTiming/gemmCodeNode/scripts/special_model_args.py
new file mode 100755
index 000000000..5235f993f
--- /dev/null
+++ b/tools/utilities/nodeTiming/gemmCodeNode/scripts/special_model_args.py
@@ -0,0 +1,13 @@
+####################################################################################################
+#
+#   Project:  Embedded Learning Library (ELL)
+#   File:     special_model_args.py
+#   Authors:  Mason Remy
+#
+#   Requires: Python 3.x
+#
+####################################################################################################
+
+special_model_args = {
+    "gemmELL.ell": ["--blas false"]
+}
\ No newline at end of file
diff --git a/tools/utilities/nodeTiming/gemmCodeNode/src/CMakeLists.txt.in b/tools/utilities/nodeTiming/gemmCodeNode/src/CMakeLists.txt.in
new file mode 100644
index 000000000..55dbfc274
--- /dev/null
+++ b/tools/utilities/nodeTiming/gemmCodeNode/src/CMakeLists.txt.in
@@ -0,0 +1,21 @@
+#
+# cmake file for MatrixMatrixMultiplyCodeNode timing scripts
+#
+
+cmake_minimum_required(VERSION 2.8)
+project(MatrixMatrixMultiplyCodeNodeRunner)
+set(CMAKE_CXX_STANDARD 11)
+
+@ADD_SUBDIRECTORIES@
+
+add_definitions(-DRUN_COUNT=@RUN_COUNT@)
+add_definitions(-DWARMUP_COUNT=@WARMUP_COUNT@)
+add_executable(Runner Runner.cpp)
+
+if(MSVC)
+  target_link_libraries(Runner @LINK_LIBRARIES@)
+elseif(${CMAKE_CXX_COMPILER_ID} STREQUAL Clang)
+  target_link_libraries(Runner @LINK_LIBRARIES@ -Wl)
+else()
+  target_link_libraries(Runner @LINK_LIBRARIES@ -Wl,--gc-sections)
+endif()
diff --git a/tools/utilities/nodeTiming/gemmCodeNode/src/Runner.cpp.in b/tools/utilities/nodeTiming/gemmCodeNode/src/Runner.cpp.in
new file mode 100644
index 000000000..24cb5801d
--- /dev/null
+++ b/tools/utilities/nodeTiming/gemmCodeNode/src/Runner.cpp.in
@@ -0,0 +1,134 @@
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Project:  Embedded Learning Library (ELL)
+//  File:     Runner.template.cpp / Runner.cpp
+//  Authors:  Mason Remy
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <chrono>
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#define @MODEL_NAME@_MAIN
+
+#include "@MODEL_DIR@/@MODEL_NAME@.h"
+
+#ifndef RUN_COUNT
+#define RUN_COUNT 1000
+#endif
+
+#ifndef WARMUP_COUNT
+#define WARMUP_COUNT 100
+#endif
+
+using dtype = @DATA_TYPE@;
+
+const unsigned int BUFFER_ROWS = 0;
+
+// makes a row major matrix
+static inline void randomInitMatrix(dtype* A, unsigned int rows, unsigned int cols)
+{
+    for (unsigned int i = 0; i < rows; ++i)
+    {
+        for (unsigned int j = 0; j < cols; ++j)
+        {
+            A[i * cols + j] = (dtype)(rand()) / RAND_MAX;
+        }
+    }
+    for (unsigned int i = rows; i < rows + BUFFER_ROWS; ++i)
+    {
+        for (unsigned int j = 0; j < cols; ++j)
+        {
+            A[i * cols + j] = 0;
+        }
+    }
+}
+
+class Timer
+{
+private:
+    using clock_t = std::chrono::high_resolution_clock;
+    using second_t = std::chrono::duration<double, std::ratio<1> >;
+    std::chrono::time_point<clock_t> _start;
+public:
+    Timer() : _start(clock_t::now()) {}
+    double elapsed() const
+    {
+        return std::chrono::duration_cast<second_t>(clock_t::now() - _start).count();
+    }
+};
+
+void PrintMat(const std::vector<dtype>& data, int rows, int cols)
+{
+    for (int r = 0; r < rows; ++r)
+    {
+        for (int c = 0; c < cols; ++c)
+        {
+            std::cout << std::fixed << data[r * cols + c] << "\t";
+        }
+        std::cout << std::endl;
+    }
+}
+
+template <typename WrapperType>
+double RunCheck(WrapperType& wrapper)
+{
+    TensorShape inputShape = wrapper.GetInputShape();
+    int M = inputShape.columns;
+    int N = inputShape.channels;
+
+    std::vector<dtype> A(M * N);
+    std::vector<dtype> results(M * N);
+
+    // Return the sum of the first result so algorithm correctness can be verified (if the model was built without a fixed seed this is meaningless)
+    randomInitMatrix(A.data(), M, N);
+    results = wrapper.Predict(A);
+    dtype firstValSum = std::accumulate(results.begin(), results.end(), static_cast<dtype>(0));
+    return firstValSum;
+}
+
+template <typename WrapperType>
+double RunTiming(WrapperType& wrapper)
+{
+    TensorShape inputShape = wrapper.GetInputShape();
+    int M = inputShape.columns;
+    int N = inputShape.channels;
+
+    std::vector<dtype> A(M * N);
+    std::vector<dtype> results(M * N);
+
+    randomInitMatrix(A.data(), M, N);
+    for (int i = 0; i < WARMUP_COUNT; i++)
+    {
+        randomInitMatrix(A.data(), M, N);
+        results = wrapper.Predict(A);
+    }
+
+    Timer t;
+    for (int i = 0; i < RUN_COUNT; i++)
+    {
+        randomInitMatrix(A.data(), M, N);
+        results = wrapper.Predict(A);
+    }
+    double duration = t.elapsed();
+
+    return duration;
+}
+
+int main(int argc, char** argv)
+{
+    srand(0);
+
+    @ALLCAPS_MODEL_NAME@Wrapper modelWrapper;
+
+    double correctnessCheck = RunCheck(modelWrapper);
+    double duration = RunTiming(modelWrapper);
+
+    std::cout << "@MODEL_NAME@ time = " << std::fixed << duration << "\tcheck = " << std::fixed << correctnessCheck << std::endl;
+
+    return 0;
+}
\ No newline at end of file
diff --git a/tools/utilities/pitest/drivetest.py b/tools/utilities/pitest/drivetest.py
index d49433f5e..33c8c2626 100644
--- a/tools/utilities/pitest/drivetest.py
+++ b/tools/utilities/pitest/drivetest.py
@@ -43,7 +43,7 @@ def __init__(self, ipaddress=None, cluster=None, outdir=None, profile=False,
                  model=None, labels=None, target="pi3", target_dir="/home/pi/test",
                  username="pi", password="raspberry", iterations=1, expected=None,
                  blas=True, compile=COMPILE_INCREMENTAL, test=True, timeout=None, apikey=None,
-                 gitrepo=None, wrap_options=None):
+                 skip_ellcode=False, gitrepo=None, wrap_options=None):
         self.ipaddress = ipaddress
         self.build_root = find_ell.find_ell_build()
         self.ell_root = os.path.dirname(self.build_root)
@@ -63,6 +63,7 @@ def __init__(self, ipaddress=None, cluster=None, outdir=None, profile=False,
         self.compile = compile
         self.test = test
         self.prediction_time = None
+        self.skip_ellcode = skip_ellcode
         self.logger = logger.get()
         self.rePlatform = "ARMv7.*"
         if target == "pi0":
@@ -295,6 +296,9 @@ def wrap_project(self):
         if self.wrap_options:
             builder_args += ['--'] + self.wrap_options
 
+        if self.skip_ellcode:
+            builder_args.append("--skip_ellcode")
+
         builder.parse_command_line(builder_args)
         builder.run()
 
@@ -370,7 +374,6 @@ def run_test(self):
             sys.path.append(os.path.join(current_path, "..", "..", "wrap", "test"))
             mpp = __import__("wrap_test")
             mpp.make_project(self.output_dir)
-
             cmd = ["python",
                    os.path.join(current_path, "..", "pythonlibs", "vision", "demo.py"),
                    self.labels_file,
diff --git a/tools/utilities/print/src/PrintGraph.cpp b/tools/utilities/print/src/PrintGraph.cpp
index 6e084286a..7fde0a6b7 100644
--- a/tools/utilities/print/src/PrintGraph.cpp
+++ b/tools/utilities/print/src/PrintGraph.cpp
@@ -115,6 +115,8 @@ std::string ToShortString(BinaryOperationType op)
         return "||";
     case BinaryOperationType::logicalXor:
         return "^";
+    case BinaryOperationType::modulo:
+        return "%";
     }
     return "";
 }
diff --git a/tools/utilities/profile/CMakeLists-device-parallel.txt.in b/tools/utilities/profile/CMakeLists-device-parallel.txt.in
index 8212d2676..80e4efd6d 100644
--- a/tools/utilities/profile/CMakeLists-device-parallel.txt.in
+++ b/tools/utilities/profile/CMakeLists-device-parallel.txt.in
@@ -8,7 +8,8 @@ project(profiler)
 
 set(CMAKE_CXX_STANDARD 14)
 
-set(PACKAGE_ROOT @EXTERNAL_DIR@)
+set(PACKAGE_ROOT @ELL_EXTERNAL_DIR@)
+
 include(./OpenBLASSetup.cmake)
 
 if(MSVC)
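The Runner template above measures steady-state throughput: WARMUP_COUNT untimed iterations, then RUN_COUNT timed ones, with input regeneration inside both loops so its cost is incurred uniformly. The same pattern in Python, for reference (predict and make_input are placeholders for a compiled model wrapper and an input generator):

import time

def time_predict(predict, make_input, warmup_count=100, run_count=1000):
    # Untimed warmup to reach steady state (caches, lazy initialization)
    for _ in range(warmup_count):
        predict(make_input())
    # Timed runs; input generation stays inside the loop, as in the C++ runner
    start = time.perf_counter()
    for _ in range(run_count):
        predict(make_input())
    return time.perf_counter() - start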
diff --git a/tools/utilities/profile/CMakeLists.txt b/tools/utilities/profile/CMakeLists.txt
index 5a162ef90..c27d1d59d 100644
--- a/tools/utilities/profile/CMakeLists.txt
+++ b/tools/utilities/profile/CMakeLists.txt
@@ -109,7 +109,7 @@ configure_file(build_and_run.sh.in build_and_run.sh @ONLY NEWLINE_STYLE UNIX)
 configure_file(build_and_run.cmd.in build_and_run.cmd @ONLY NEWLINE_STYLE WIN32)
 configure_file(remote_test.sh.in remote_test.sh @ONLY NEWLINE_STYLE UNIX)
 configure_file(remote_test.cmd.in remote_test.cmd @ONLY NEWLINE_STYLE WIN32)
-configure_file(${CMAKE_SOURCE_DIR}/CMake/OpenBLASSetup.cmake OpenBLASSetup.cmake COPYONLY)
+configure_file(${ELL_ROOT}/CMake/OpenBLASSetup.cmake OpenBLASSetup.cmake COPYONLY)
 
 if(WIN32)
diff --git a/tools/utilities/pythonlibs/audio/view_audio.py b/tools/utilities/pythonlibs/audio/view_audio.py
index eae5ddc9d..62d1fe0e7 100644
--- a/tools/utilities/pythonlibs/audio/view_audio.py
+++ b/tools/utilities/pythonlibs/audio/view_audio.py
@@ -138,6 +138,11 @@ def __init__(self, featurizer_model=None, classifier_model=None, auto_scale=True
         elif self.CLASSIFIER_MODEL_KEY in self.settings:
             self.classifier_model = self.settings[self.CLASSIFIER_MODEL_KEY]
 
+        self.vad = None
+        if vad_model:
+            self.vad = vad.VoiceActivityDetector(vad_model)
+        self.previous_vad = 0
+
         self.wav_filename = wav_file
         if self.wav_filename is None and self.WAV_FILE_KEY in self.settings:
             self.wav_filename = self.settings[self.WAV_FILE_KEY]
@@ -394,7 +399,9 @@ def evaluate_classifier(self):
         prediction, probability, label, _ = self.classifier.predict(self.classifier_feature_data.ravel())
         if prediction is not None:
             percent = int(100 * probability)
-            if self.last_prediction != prediction or self.probability < probability:
+            if label == "silence":
+                self.classifier.reset()
+            elif self.last_prediction != prediction or self.probability < probability:
                 self.last_prediction = prediction
                 self.probability = probability
                 self.show_output(" DETECTED ({}) {}% {}".format(prediction, percent, label))
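The view_audio.py change gates the classifier on silence: when the predicted label is "silence", the classifier state is reset instead of reported, so stale recurrent state does not leak across utterances. A standalone sketch of the same idea (the VAD predict API here is assumed, not taken from the patch):

# Sketch of VAD-gated classification (assumed API: the detector returns
# 1 for voice and 0 for silence; the classifier has reset()/predict()).
def process_frame(vad, classifier, features):
    if vad.predict(features) == 0:
        classifier.reset()    # drop stale state between utterances
        return None
    return classifier.predict(features)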
diff --git a/tools/utilities/pythonlibs/buildtools.py b/tools/utilities/pythonlibs/buildtools.py
index 5ef177534..7efa17fbb 100644
--- a/tools/utilities/pythonlibs/buildtools.py
+++ b/tools/utilities/pythonlibs/buildtools.py
@@ -113,23 +113,25 @@ def logstream(self, stream):
             if "closed file" not in msg:
                 self.logger.info(msg)
 
-    def run(self, command, print_output=True, shell=False):
+    def run(self, command, print_output=True, shell=False, cwd=None):
         cmdstr = command if isinstance(command, str) else " ".join(command)
         if self.verbose:
             self.logger.info(cmdstr)
         try:
-            with subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=0,
-                                  universal_newlines=True, shell=shell) as proc:
+            output_target = subprocess.PIPE if print_output else subprocess.DEVNULL
+            with subprocess.Popen(command, stdout=output_target, stderr=output_target, bufsize=0,
+                                  universal_newlines=True, shell=shell, cwd=cwd) as proc:
                 self.output = ''
-                stdout_thread = Thread(target=self.logstream, args=(proc.stdout,))
-                stderr_thread = Thread(target=self.logstream, args=(proc.stderr,))
+                if print_output:
+                    stdout_thread = Thread(target=self.logstream, args=(proc.stdout,))
+                    stderr_thread = Thread(target=self.logstream, args=(proc.stderr,))
 
-                stdout_thread.start()
-                stderr_thread.start()
+                    stdout_thread.start()
+                    stderr_thread.start()
 
-                while stdout_thread.isAlive() or stderr_thread.isAlive():
-                    pass
+                    while stdout_thread.isAlive() or stderr_thread.isAlive():
+                        pass
 
                 proc.wait()
 
@@ -191,7 +193,8 @@ def llc(self, output_dir, input_file, target, optimization_level="3", objext=".o
         args = [self.llcexe,
                 input_file,
                 "-o", out_file,
-                "-O" + optimization_level
+                "-O" + optimization_level,
+                '' if optimization_level == '0' else "-fp-contract=fast"
                 ]
         args = args + self.get_llc_options(target)
         # Save the parameters passed to llc. This is used for archiving purposes.
@@ -202,25 +205,26 @@ def llc(self, output_dir, input_file, target, optimization_level="3", objext=".o
         return out_file
 
-    def opt(self, output_dir, input_file, optimization_level="3"):
+    def opt(self, output_dir, input_file, optimization_level="3", print_output=True):
         # opt compiled_model.ll -o compiled_model_opt.ll -O3
         model_name = os.path.splitext(os.path.basename(input_file))[0]
         out_file = os.path.join(output_dir, model_name + ".opt.bc")
         args = [self.optexe,
                 input_file,
                 "-o", out_file,
-                "-O" + optimization_level]
+                "-O" + optimization_level,
+                '' if optimization_level == '0' else "-fp-contract=fast"]
         # Save the parameters passed to opt. This is used for archiving purposes.
         self.log_command_arguments(args, log_dir=output_dir)
         self.logger.info("running opt ...")
-        self.run(args)
+        self.run(args, print_output=print_output)
         return out_file
 
-    def compile(self, model_file, func_name, model_name, target, output_dir,
+    def compile(self, model_file, func_name, model_name, target, output_dir, skip_ellcode=False,
                 use_blas=False, fuse_linear_ops=True, optimize_reorder_data_nodes=True, profile=False,
                 llvm_format="bc", optimize=True, parallelize=True, vectorize=True,
                 debug=False, is_model_file=False, swig=True,
-                header=False, objext=".o", extra_options=[]):
+                header=False, objext=".o", global_value_alignment=32, extra_options=[]):
         file_arg = "-imf" if is_model_file else "-imap"
         format_flag = {
             "bc": "--bitcode",
@@ -244,7 +248,8 @@ def compile(self, model_file, func_name, model_name, target, output_dir,
             "--target", target,
             "-od", output_dir,
             "--fuseLinearOps", str(fuse_linear_ops),
-            "--optimizeReorderDataNodes", str(optimize_reorder_data_nodes)
+            "--optimizeReorderDataNodes", str(optimize_reorder_data_nodes),
+            "--globalValueAlignment", str(global_value_alignment)
         ]
         if swig:
             args.append("--swig")
@@ -270,6 +275,9 @@ def compile(self, model_file, func_name, model_name, target, output_dir,
         if profile:
             args.append("--profile")
 
+        if skip_ellcode:
+            args.append("--skip_ellcode")
+
        args += extra_options
 
         # Save the parameters passed to compile. This is used for archiving purposes.
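The buildtools.run() change avoids spawning log-reader threads when output is unwanted by pointing the child's streams at DEVNULL up front. The core of that pattern, for reference (the helper name is illustrative):

import subprocess

def run_quiet(command, cwd=None, print_output=False):
    # When output is not wanted, send both streams to DEVNULL rather than
    # opening pipes that reader threads would otherwise have to drain.
    target = None if print_output else subprocess.DEVNULL
    subprocess.run(command, stdout=target, stderr=target, cwd=cwd, check=True)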
diff --git a/tools/wrap/CMakeLists.txt b/tools/wrap/CMakeLists.txt
index 880f4df93..213d63173 100644
--- a/tools/wrap/CMakeLists.txt
+++ b/tools/wrap/CMakeLists.txt
@@ -45,7 +45,7 @@ if(${PYTHON_ENABLED})
     message(ERROR "LLVM not found, please check that LLVM is installed.")
     return()
   endif()
-  
+
   set(OUTPUT_DIR ${CMAKE_BINARY_DIR})
   set(JSON "{ \"llc\": \"${LLC_EXECUTABLE}\", \"swig\": \"${SWIG_EXECUTABLE}\", \"compile\": \"${GLOBAL_BIN_DIR}/compile\", \"blas\": \"${BLAS_LIBS}\", \"opt\": \"${OPT_EXECUTABLE}\", \"cmake_generator\": \"${CMAKE_GENERATOR}\", \"cmake_version\": \"${CMAKE_VERSION}\" }")
diff --git a/tools/wrap/templates/CMakeLists.cpp.txt.in b/tools/wrap/templates/CMakeLists.cpp.txt.in
index c0ce95a3d..a88d86a20 100644
--- a/tools/wrap/templates/CMakeLists.cpp.txt.in
+++ b/tools/wrap/templates/CMakeLists.cpp.txt.in
@@ -33,12 +33,12 @@ if(WIN32)
     # path to the OpenBLAS Nuget
     set(PACKAGE_ROOT "@ELL_ROOT@")
 endif()
+include(OpenBLASSetup.cmake)
 
 add_library(${target_name} STATIC IMPORTED GLOBAL)
+set_property(TARGET ${target_name} APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES "${CMAKE_CURRENT_SOURCE_DIR};${CMAKE_CURRENT_SOURCE_DIR}/include")
 set_target_properties(${target_name} PROPERTIES IMPORTED_LOCATION ${CMAKE_CURRENT_SOURCE_DIR}/@ELL_model@.@OBJECT_EXTENSION@)
-if(BLAS_LIBS)
-    target_link_libraries(${target_name} INTERFACE ${BLAS_LIBS})
-endif()
+target_link_libraries(${target_name} INTERFACE ${BLAS_LIBS})
diff --git a/tools/wrap/templates/CMakeLists.python.txt.in b/tools/wrap/templates/CMakeLists.python.txt.in
index c355b1386..ba5ee03d9 100644
--- a/tools/wrap/templates/CMakeLists.python.txt.in
+++ b/tools/wrap/templates/CMakeLists.python.txt.in
@@ -41,6 +41,7 @@ else()
     # SWIG has this GCC 7 warning.
     add_compile_options("-Wno-psabi")
 endif()
+include(OpenBLASSetup.cmake)
 
 find_package(PythonInterp 3.4)
arg, "-" + argdef["short"], help=argdef["help"], action="store_true", default=False) else: arg_parser.add_argument("--" + arg, "-" + argdef["short"], - help=argdef["help"], default=argdef["default"]) + help=argdef["help"], type=arg_type, default=argdef["default"]) compile_args = [] if '--' in args: @@ -247,6 +265,7 @@ def parse_command_line(self, args=None): self.model_name = self.model_file_base.replace('-', '_') self.language = args.language self.target = args.target + self.skip_ellcode = args.skip_ellcode self.objext = self.get_objext(self.target) self.output_dir = args.outdir if self.output_dir is None: @@ -273,18 +292,23 @@ def parse_command_line(self, args=None): self.swig = self.language != "cpp" self.cpp_header = self.language == "cpp" self.compile_args = compile_args + self.global_value_alignment = args.global_value_alignment self.stats = args.stats self.times = {} def find_files(self): __script_path = os.path.dirname(os.path.abspath(__file__)) self.cmake_template = os.path.join(__script_path, "templates/CMakeLists.%s.txt.in" % (self.language)) - if (not os.path.isfile(self.cmake_template)): + + if not os.path.isfile(self.cmake_template): raise Exception("Could not find CMakeLists template: %s" % (self.cmake_template)) + if self.language == "python": self.module_init_template = os.path.join(__script_path, "templates/__init__.py.in") + if not os.path.isfile(self.module_init_template): raise Exception("Could not find __init__.py template: %s" % (self.module_init_template)) + self.files.append(os.path.join(self.ell_root, "CMake/OpenBLASSetup.cmake")) def copy_files(self, filelist, folder): @@ -292,14 +316,19 @@ def copy_files(self, filelist, folder): target_dir = self.output_dir else: target_dir = os.path.join(self.output_dir, folder) + os.makedirs(target_dir, exist_ok=True) + for path in filelist: if not os.path.isfile(path): raise Exception("expected file not found: " + path) + _, file_name = os.path.split(path) dest = os.path.join(target_dir, file_name) + if self.verbose: self.logger.info("copy \"%s\" \"%s\"" % (path, dest)) + copyfile(path, dest) def create_template_file(self, template_filename, output_filename): @@ -312,6 +341,10 @@ def create_template_file(self, template_filename, output_filename): template = template.replace("@Arch@", self.target) template = template.replace("@OBJECT_EXTENSION@", self.objext) template = template.replace("@ELL_ROOT@", os.path.join(self.ell_root, "external").replace("\\", "/")) + shell_type = "UNIX" + if self.target == "host" and platform.system() == "Windows": + shell_type = "WINDOWS" + template = template.replace("@SHELL_TYPE@", shell_type) output_template = os.path.join(self.output_dir, output_filename) with open(output_template, 'w') as f: f.write(template) @@ -358,6 +391,7 @@ def run(self): func_name=self.func_name, model_name=self.model_name, target=self.target, + skip_ellcode=self.skip_ellcode, output_dir=self.output_dir, use_blas=self.blas, fuse_linear_ops=self.fuse_linear_ops, @@ -372,6 +406,7 @@ def run(self): swig=self.swig, header=self.cpp_header, objext="." + self.objext, + global_value_alignment=self.global_value_alignment, extra_options=self.compile_args) self.stop_timer("compile") if self.swig: