From f96051e154f7c3a735c17108f1bff840bab2b355 Mon Sep 17 00:00:00 2001
From: "Kern Handa (KERN)"
Date: Wed, 10 Jun 2020 21:26:30 +0000
Subject: [PATCH] Merged PR 1583: ELL release v3.1.0

ELL release v3.1.0

- Move to VS 2019
- Fix a codegen error that resulted in incorrect functional behavior
- Fix regressions in the audio training tutorial (#232)
- Add importing of Sum nodes to the ONNX importer
- Fix a crash in LLVMContext::SetName
- Improve performance of CNN models on the Pi 3 with new implementations of spatial, pointwise and regular convolutions
- Improve performance of the reorder node
- New nodes: ReorderDataCodeNode, SpatialConvolutionNode, MatrixMatrixMultiplyCodeNode
- Implement parallelization strategies for matrix multiplication nodes
- Enable the new MatrixMatrixMultiplyCodeNode path only for select ARM targets such as the Pi, not Intel/AMD CPUs
- Add the `--skip_ellcode` flag to the `compile` and `wrap.py` tools to use OpenBLAS for linear algebra computations
---
 .gitattributes | 5 + .gitignore | 1 + CMake/CommonInterfaces.cmake | 4 +- CMake/CopySharedLibraries.cmake | 14 +- CMake/LLVMSetup.cmake | 3 - CMake/OpenBLASSetup.cmake | 13 +- CMakeLists.txt | 37 +- History.md | 13 + VERSION | 2 +- docs/gallery/ILSVRC2012/Asparagus.md | 4 +- docs/gallery/ILSVRC2012/Bean.md | 6 +- docs/gallery/ILSVRC2012/Buckthorn.md | 6 +- docs/gallery/ILSVRC2012/Carrot.md | 6 +- docs/gallery/ILSVRC2012/CashewNut.md | 6 +- docs/gallery/ILSVRC2012/Chalta.md | 6 +- docs/gallery/ILSVRC2012/Clary.md | 6 +- docs/gallery/ILSVRC2012/Clover.md | 6 +- docs/gallery/ILSVRC2012/Coconut.md | 6 +- docs/gallery/ILSVRC2012/Ginger.md | 6 +- docs/gallery/ILSVRC2012/Mashua.md | 4 +- docs/gallery/ILSVRC2012/PandanFlower.md | 6 +- docs/gallery/ILSVRC2012/Pear.md | 6 +- docs/gallery/ILSVRC2012/SevenSisters.md | 6 +- docs/gallery/ILSVRC2012/Sweetsop.md | 6 +- docs/gallery/ILSVRC2012/Tamarind.md | 6 +- docs/gallery/ILSVRC2012/WaterApple.md | 6 +- docs/gallery/ILSVRC2012/Wattleseed.md | 4 +- .../index.md | 1 - interfaces/CMakeLists.txt | 1 + .../MatrixMatrixMultiplyImplementation.h | 17 + .../common/include/ModelBuilderInterface.h | 3 + interfaces/common/include/ModelInterface.h | 3 + interfaces/common/model.i | 2 + interfaces/common/model_python_post.i | 10 + .../common/src/ModelBuilderInterface.cpp | 126 +- interfaces/common/src/ModelInterface.cpp | 1 + interfaces/python/CMakeLists.txt | 17 +- interfaces/python/package/CMakeLists.txt | 9 +- interfaces/python/package/ell/CMakeLists.txt | 26 +- .../python/package/ell/nodes/__init__.py | 1 + interfaces/python/test/CMakeLists.txt | 4 +- interfaces/python/test/compiled_model_test.py | 3 + .../common/include/MapCompilerArguments.h | 2 + libraries/common/src/LoadModel.cpp | 6 + libraries/common/src/MapCompilerArguments.cpp | 16 + libraries/emittable_functions/CMakeLists.txt | 6 +- libraries/emitters/CMakeLists.txt | 43 +- libraries/emitters/include/CompilerOptions.h | 6 + libraries/emitters/include/EmitterTypes.h | 30 +- .../emitters/include/FunctionDeclaration.h | 3 +- libraries/emitters/include/IRAssemblyWriter.h | 6 + libraries/emitters/include/IREmitter.h | 7 +- .../emitters/include/IRExecutionEngine.h | 10 +- .../emitters/include/IRFunctionEmitter.h | 184 +- libraries/emitters/include/IRIfEmitter.h | 4 +- libraries/emitters/include/IRLocalValue.h | 9 +- libraries/emitters/include/IRLoopEmitter.h | 10 +- libraries/emitters/include/IRModuleEmitter.h | 101 +- libraries/emitters/include/IRPosixRuntime.h | 4 +- libraries/emitters/include/IRRuntime.h | 10 +- libraries/emitters/include/LLVMUtilities.h
| 6 + libraries/emitters/include/ModuleEmitter.h | 2 +- libraries/emitters/include/TargetDevice.h | 14 + libraries/emitters/src/CompilerOptions.cpp | 2 + libraries/emitters/src/EmitterTypes.cpp | 26 + libraries/emitters/src/IRAssemblyWriter.cpp | 5 + libraries/emitters/src/IREmitter.cpp | 28 +- libraries/emitters/src/IRExecutionEngine.cpp | 15 +- libraries/emitters/src/IRFunctionEmitter.cpp | 264 +- libraries/emitters/src/IRIfEmitter.cpp | 4 +- libraries/emitters/src/IRLocalValue.cpp | 25 + libraries/emitters/src/IRLoopEmitter.cpp | 87 +- libraries/emitters/src/IRModuleEmitter.cpp | 282 +- libraries/emitters/src/IRRuntime.cpp | 25 +- libraries/emitters/src/LLVMUtilities.cpp | 88 +- libraries/emitters/src/TargetDevice.cpp | 17 + .../templates/LLVMEmitterTargets.h.in | 16 + .../emitters/test/include/IREmitterTest.h | 2 + libraries/emitters/test/src/IREmitterTest.cpp | 75 + libraries/emitters/test/src/main.cpp | 2 + libraries/math/CMakeLists.txt | 1 + libraries/math/src/BlasWrapper.cpp | 10 +- libraries/model/include/Map.h | 2 - libraries/model/include/OutputPort.h | 1 - libraries/model/src/CompilableCodeNode.cpp | 4 +- libraries/model/src/Map.cpp | 5 + .../model/test/include/CompilableNodesTest.h | 8 + .../model/test/src/CompilableNodesTest.cpp | 288 + .../test/src/model_compiler_test_main.cpp | 29 +- .../include/ModelTestUtilities.h | 54 +- libraries/nodes/CMakeLists.txt | 5 + .../nodes/include/BroadcastOperationNodes.h | 519 +- libraries/nodes/include/IRNode.h | 2 - .../include/MatrixMatrixMultiplyCodeNode.h | 271 + .../MatrixMatrixMultiplyImplementation.h | 23 + libraries/nodes/include/NodeOperations.h | 3 +- libraries/nodes/include/ReorderDataCodeNode.h | 586 ++ .../nodes/include/SpatialConvolutionNode.h | 239 + .../nodes/include/UnrolledConvolutionNode.h | 1 + .../src/BinaryConvolutionalLayerNode.cpp | 4 +- .../nodes/src/ConvolutionalLayerNode.cpp | 19 +- .../src/MatrixMatrixMultiplyCodeNode.cpp | 549 ++ .../nodes/src/NeuralNetworkPredictorNode.cpp | 4 +- .../nodes/src/UnrolledConvolutionNode.cpp | 99 +- .../nodes/src/WinogradConvolutionNode.cpp | 4 +- .../nodes/test/src/BasicMathNodesTests.cpp | 8 +- libraries/nodes/test/src/DSPNodesTests.cpp | 8 +- libraries/optimization/CMakeLists.txt | 4 +- .../optimization/include/VectorSolution.h | 2 +- libraries/optimization/src/Interval.cpp | 2 +- ...OptimizeReorderDataNodesTransformation.cpp | 10 +- .../passes/test/src/ModelOptimizerTest.cpp | 26 +- .../passes/test/src/TransformationTest.cpp | 26 +- libraries/utilities/CMakeLists.txt | 4 + libraries/utilities/include/EnumFlagHelpers.h | 39 + libraries/utilities/include/FunctionUtils.h | 79 +- libraries/utilities/include/MemoryLayout.h | 8 + libraries/utilities/include/StringUtil.h | 14 + .../utilities/include/TunableParameters.h | 165 + libraries/utilities/include/TypeAliases.h | 1 + libraries/utilities/include/TypeTraits.h | 7 +- libraries/utilities/src/Files.cpp | 7 +- libraries/utilities/src/MemoryLayout.cpp | 31 +- libraries/utilities/src/StringUtil.cpp | 10 + .../test/include/TunableParameters_test.h | 15 + .../test/src/TunableParameters_test.cpp | 80 + libraries/utilities/test/src/main.cpp | 5 + libraries/value/CMakeLists.txt | 61 +- libraries/value/README.md | 2 +- libraries/value/include/Array.h | 185 + libraries/value/include/CachingProvider.h | 191 + libraries/value/include/CachingStrategies.h | 63 + libraries/value/include/ComputeContext.h | 22 +- libraries/value/include/CppEmitterContext.h | 166 + libraries/value/include/EmitterContext.h | 214 +- 
libraries/value/include/FunctionDeclaration.h | 151 +- libraries/value/include/LLVMContext.h | 53 +- libraries/value/include/LoopNests.h | 224 + libraries/value/include/Matrix.h | 40 + libraries/value/include/MatrixOperations.h | 8 +- libraries/value/include/Print.h | 36 + libraries/value/include/Scalar.h | 1 + libraries/value/include/ScalarOperations.h | 29 + libraries/value/include/TensorOperations.h | 6 + libraries/value/include/Value.h | 13 +- libraries/value/include/ValueOperations.h | 7 + libraries/value/include/ValueType.h | 5 + libraries/value/include/VectorOperations.h | 6 + .../value/include/loopnests/CodeGenerator.h | 43 + .../loopnests/CodePositionConstraints.h | 166 + libraries/value/include/loopnests/ForAll.h | 40 + libraries/value/include/loopnests/Index.h | 68 + .../value/include/loopnests/IndexRange.h | 50 + .../value/include/loopnests/IterationDomain.h | 47 + libraries/value/include/loopnests/Kernel.h | 166 + .../value/include/loopnests/KernelPredicate.h | 315 + .../value/include/loopnests/LoopIndexInfo.h | 41 + libraries/value/include/loopnests/LoopNest.h | 292 + .../value/include/loopnests/LoopNestPrinter.h | 65 + .../value/include/loopnests/LoopNestVisitor.h | 138 + libraries/value/include/loopnests/Range.h | 44 + .../value/include/loopnests/SplitIndexRange.h | 112 + .../include/loopnests/SplitIterationDomain.h | 95 + libraries/value/src/Array.cpp | 120 + libraries/value/src/CachingProvider.cpp | 39 + libraries/value/src/CachingStrategies.cpp | 1943 ++++++ libraries/value/src/ComputeContext.cpp | 668 +- libraries/value/src/CppEmitterContext.cpp | 1753 +++++ libraries/value/src/EmitterContext.cpp | 260 +- libraries/value/src/FunctionDeclaration.cpp | 136 +- libraries/value/src/LLVMContext.cpp | 719 +- libraries/value/src/LoopNests.cpp | 354 + libraries/value/src/Matrix.cpp | 7 +- libraries/value/src/MatrixOperations.cpp | 34 +- libraries/value/src/Print.cpp | 65 + libraries/value/src/Scalar.cpp | 15 +- libraries/value/src/ScalarOperations.cpp | 58 + libraries/value/src/TensorOperations.cpp | 14 +- libraries/value/src/Value.cpp | 12 + libraries/value/src/ValueOperations.cpp | 5 + libraries/value/src/Vector.cpp | 1 + libraries/value/src/VectorOperations.cpp | 28 +- .../value/src/loopnests/CodeGenerator.cpp | 461 ++ .../src/loopnests/CodePositionConstraints.cpp | 104 + libraries/value/src/loopnests/ForAll.cpp | 49 + libraries/value/src/loopnests/Index.cpp | 55 + libraries/value/src/loopnests/IndexRange.cpp | 65 + .../value/src/loopnests/IterationDomain.cpp | 60 + libraries/value/src/loopnests/Kernel.cpp | 148 + .../value/src/loopnests/KernelPredicate.cpp | 703 ++ libraries/value/src/loopnests/LoopNest.cpp | 900 +++ .../value/src/loopnests/LoopNestPrinter.cpp | 484 ++ .../value/src/loopnests/LoopNestVisitor.cpp | 1057 +++ libraries/value/src/loopnests/Range.cpp | 65 + .../value/src/loopnests/SplitIndexRange.cpp | 506 ++ .../src/loopnests/SplitIterationDomain.cpp | 254 + .../value/test/include/CachingStrategy_test.h | 148 + libraries/value/test/include/Functions_test.h | 18 + .../value/test/include/LoopNestAPI_test.h | 37 + .../test/include/LoopNest_convolution_test.h | 18 + .../value/test/include/LoopNest_kernels.h | 42 + libraries/value/test/include/LoopNest_test.h | 89 + libraries/value/test/include/Matrix_test.h | 1 + libraries/value/test/include/Scalar_test.h | 2 + libraries/value/test/include/TestUtil.h | 95 +- libraries/value/test/include/Value_test.h | 21 + .../value/test/src/CachingStrategy_test.cpp | 6171 +++++++++++++++++ 
libraries/value/test/src/Functions_test.cpp | 61 + libraries/value/test/src/LoopNestAPI_test.cpp | 1090 +++ .../test/src/LoopNest_convolution_test.cpp | 198 + libraries/value/test/src/LoopNest_kernels.cpp | 234 + libraries/value/test/src/LoopNest_test.cpp | 3661 ++++++++++ libraries/value/test/src/Matrix_test.cpp | 68 +- libraries/value/test/src/Scalar_test.cpp | 71 +- libraries/value/test/src/Tensor_test.cpp | 24 +- libraries/value/test/src/TestUtil.cpp | 371 +- libraries/value/test/src/Value_test.cpp | 617 +- libraries/value/test/src/Vector_test.cpp | 16 +- libraries/value/test/src/main.cpp | 327 +- tools/importers/CNTK/cntk_to_ell.py | 32 +- tools/importers/torch/test/CMakeLists.txt | 5 +- tools/trainers/forestTrainer/CMakeLists.txt | 2 +- tools/trainers/linearTrainer/CMakeLists.txt | 22 +- tools/trainers/protoNNTrainer/CMakeLists.txt | 2 +- .../sweepingSGDTrainer/CMakeLists.txt | 2 +- tools/utilities/finetune/CMakeLists.txt | 9 +- .../nodeTiming/gemmCodeNode/.gitignore | 2 + .../nodeTiming/gemmCodeNode/README.md | 46 + .../gemmCodeNode/deploy/full_pass.cmd | 12 + .../gemmCodeNode/deploy/full_pass.sh | 17 + .../nodeTiming/gemmCodeNode/deploy/run.py | 41 + .../gemmCodeNode/deploy/timing_aggregator.py | 95 + .../gemmCodeNode/scripts/build_gemm_models.py | 128 + .../gemmCodeNode/scripts/build_tests.py | 289 + .../scripts/make_default_models.py | 71 + .../scripts/special_model_args.py | 13 + .../gemmCodeNode/src/CMakeLists.txt.in | 21 + .../nodeTiming/gemmCodeNode/src/Runner.cpp.in | 134 + tools/utilities/pitest/drivetest.py | 7 +- tools/utilities/print/src/PrintGraph.cpp | 2 + .../profile/CMakeLists-device-parallel.txt.in | 3 +- tools/utilities/profile/CMakeLists.txt | 2 +- .../utilities/pythonlibs/audio/view_audio.py | 9 +- tools/utilities/pythonlibs/buildtools.py | 40 +- tools/wrap/CMakeLists.txt | 2 +- tools/wrap/templates/CMakeLists.cpp.txt.in | 6 +- tools/wrap/templates/CMakeLists.python.txt.in | 1 + tools/wrap/wrap.py | 43 +- 248 files changed, 32163 insertions(+), 1723 deletions(-) create mode 100644 interfaces/common/include/MatrixMatrixMultiplyImplementation.h create mode 100644 libraries/emitters/templates/LLVMEmitterTargets.h.in create mode 100644 libraries/nodes/include/MatrixMatrixMultiplyCodeNode.h create mode 100644 libraries/nodes/include/MatrixMatrixMultiplyImplementation.h create mode 100644 libraries/nodes/include/ReorderDataCodeNode.h create mode 100644 libraries/nodes/include/SpatialConvolutionNode.h create mode 100644 libraries/nodes/src/MatrixMatrixMultiplyCodeNode.cpp create mode 100644 libraries/utilities/include/EnumFlagHelpers.h create mode 100644 libraries/utilities/include/TunableParameters.h create mode 100644 libraries/utilities/test/include/TunableParameters_test.h create mode 100644 libraries/utilities/test/src/TunableParameters_test.cpp create mode 100644 libraries/value/include/Array.h create mode 100644 libraries/value/include/CachingProvider.h create mode 100644 libraries/value/include/CachingStrategies.h create mode 100644 libraries/value/include/CppEmitterContext.h create mode 100644 libraries/value/include/LoopNests.h create mode 100644 libraries/value/include/Print.h create mode 100644 libraries/value/include/ScalarOperations.h create mode 100644 libraries/value/include/loopnests/CodeGenerator.h create mode 100644 libraries/value/include/loopnests/CodePositionConstraints.h create mode 100644 libraries/value/include/loopnests/ForAll.h create mode 100644 libraries/value/include/loopnests/Index.h create mode 100644 
libraries/value/include/loopnests/IndexRange.h create mode 100644 libraries/value/include/loopnests/IterationDomain.h create mode 100644 libraries/value/include/loopnests/Kernel.h create mode 100644 libraries/value/include/loopnests/KernelPredicate.h create mode 100644 libraries/value/include/loopnests/LoopIndexInfo.h create mode 100644 libraries/value/include/loopnests/LoopNest.h create mode 100644 libraries/value/include/loopnests/LoopNestPrinter.h create mode 100644 libraries/value/include/loopnests/LoopNestVisitor.h create mode 100644 libraries/value/include/loopnests/Range.h create mode 100644 libraries/value/include/loopnests/SplitIndexRange.h create mode 100644 libraries/value/include/loopnests/SplitIterationDomain.h create mode 100644 libraries/value/src/Array.cpp create mode 100644 libraries/value/src/CachingProvider.cpp create mode 100644 libraries/value/src/CachingStrategies.cpp create mode 100644 libraries/value/src/CppEmitterContext.cpp create mode 100644 libraries/value/src/LoopNests.cpp create mode 100644 libraries/value/src/Print.cpp create mode 100644 libraries/value/src/ScalarOperations.cpp create mode 100644 libraries/value/src/loopnests/CodeGenerator.cpp create mode 100644 libraries/value/src/loopnests/CodePositionConstraints.cpp create mode 100644 libraries/value/src/loopnests/ForAll.cpp create mode 100644 libraries/value/src/loopnests/Index.cpp create mode 100644 libraries/value/src/loopnests/IndexRange.cpp create mode 100644 libraries/value/src/loopnests/IterationDomain.cpp create mode 100644 libraries/value/src/loopnests/Kernel.cpp create mode 100644 libraries/value/src/loopnests/KernelPredicate.cpp create mode 100644 libraries/value/src/loopnests/LoopNest.cpp create mode 100644 libraries/value/src/loopnests/LoopNestPrinter.cpp create mode 100644 libraries/value/src/loopnests/LoopNestVisitor.cpp create mode 100644 libraries/value/src/loopnests/Range.cpp create mode 100644 libraries/value/src/loopnests/SplitIndexRange.cpp create mode 100644 libraries/value/src/loopnests/SplitIterationDomain.cpp create mode 100644 libraries/value/test/include/CachingStrategy_test.h create mode 100644 libraries/value/test/include/Functions_test.h create mode 100644 libraries/value/test/include/LoopNestAPI_test.h create mode 100644 libraries/value/test/include/LoopNest_convolution_test.h create mode 100644 libraries/value/test/include/LoopNest_kernels.h create mode 100644 libraries/value/test/include/LoopNest_test.h create mode 100644 libraries/value/test/src/CachingStrategy_test.cpp create mode 100644 libraries/value/test/src/Functions_test.cpp create mode 100644 libraries/value/test/src/LoopNestAPI_test.cpp create mode 100644 libraries/value/test/src/LoopNest_convolution_test.cpp create mode 100644 libraries/value/test/src/LoopNest_kernels.cpp create mode 100644 libraries/value/test/src/LoopNest_test.cpp create mode 100644 tools/utilities/nodeTiming/gemmCodeNode/.gitignore create mode 100644 tools/utilities/nodeTiming/gemmCodeNode/README.md create mode 100644 tools/utilities/nodeTiming/gemmCodeNode/deploy/full_pass.cmd create mode 100755 tools/utilities/nodeTiming/gemmCodeNode/deploy/full_pass.sh create mode 100755 tools/utilities/nodeTiming/gemmCodeNode/deploy/run.py create mode 100755 tools/utilities/nodeTiming/gemmCodeNode/deploy/timing_aggregator.py create mode 100755 tools/utilities/nodeTiming/gemmCodeNode/scripts/build_gemm_models.py create mode 100755 tools/utilities/nodeTiming/gemmCodeNode/scripts/build_tests.py create mode 100755 
tools/utilities/nodeTiming/gemmCodeNode/scripts/make_default_models.py create mode 100755 tools/utilities/nodeTiming/gemmCodeNode/scripts/special_model_args.py create mode 100644 tools/utilities/nodeTiming/gemmCodeNode/src/CMakeLists.txt.in create mode 100644 tools/utilities/nodeTiming/gemmCodeNode/src/Runner.cpp.in diff --git a/.gitattributes b/.gitattributes index 2d2188e4f..3fb5eea16 100644 --- a/.gitattributes +++ b/.gitattributes @@ -3,6 +3,11 @@ ############################################################################### * text=auto +############################################################################### +# Explicitly force .sh scripts to have LF line endings +############################################################################### +*.sh text eol=lf + ############################################################################### # Set default behavior for command prompt diff. # diff --git a/.gitignore b/.gitignore index 08f9f6946..a52e99a56 100644 --- a/.gitignore +++ b/.gitignore @@ -66,6 +66,7 @@ artifacts/ *.pidb *.svclog *.scc +*.ll # Chutzpah Test files _Chutzpah* diff --git a/CMake/CommonInterfaces.cmake b/CMake/CommonInterfaces.cmake index 1539c5d3c..c528431ed 100644 --- a/CMake/CommonInterfaces.cmake +++ b/CMake/CommonInterfaces.cmake @@ -8,11 +8,9 @@ # On Linux and Mac, this can be done by call *make* on the specific language wrapper e.g. # make _ELL_python -cmake_minimum_required(VERSION 3.8 FATAL_ERROR) - set(GLOBAL_BIN_DIR "${CMAKE_BINARY_DIR}/bin") if(WIN32) -set(GLOBAL_BIN_DIR "${CMAKE_BINARY_DIR}/bin/release") + set(GLOBAL_BIN_DIR "${GLOBAL_BIN_DIR}/release") endif() # diff --git a/CMake/CopySharedLibraries.cmake b/CMake/CopySharedLibraries.cmake index a8b44eb32..a7605f6b3 100644 --- a/CMake/CopySharedLibraries.cmake +++ b/CMake/CopySharedLibraries.cmake @@ -3,21 +3,27 @@ # # Copies necessary DLLs to global binary directory -macro(copy_shared_libraries target_name) +macro(copy_shared_libraries_to target_name target_location) if(WIN32) - set(target_location "${CMAKE_BINARY_DIR}/bin/$<CONFIG>/") if(EXISTS ${BLAS_DLL_DIR}) set(command_target_name copy_dlls_to_${target_name}) foreach(blas_dll ${BLAS_DLLS}) add_custom_command(TARGET ${target_name} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E make_directory ${target_location} - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${BLAS_DLL_DIR}/${blas_dll} ${target_location} + COMMAND ${CMAKE_COMMAND} -E make_directory "${target_location}/$<CONFIG>" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${BLAS_DLL_DIR}/${blas_dll} "${target_location}/$<CONFIG>" ) endforeach() endif() endif() endmacro() +macro(copy_shared_libraries target_name) + if(WIN32) + set(target_location "${CMAKE_BINARY_DIR}/bin") + copy_shared_libraries_to(${target_name} ${target_location}) + endif() +endmacro() + macro(set_test_library_path test_name) if(WIN32) set (GLOBAL_BIN_DIR ${CMAKE_BINARY_DIR}/bin) diff --git a/CMake/LLVMSetup.cmake b/CMake/LLVMSetup.cmake index 7c1bab565..7ffce54bd 100644 --- a/CMake/LLVMSetup.cmake +++ b/CMake/LLVMSetup.cmake @@ -66,7 +66,4 @@ foreach(DEFINITION ${LLVM_DEFINITIONS}) add_definitions(${DEFINITION}) endforeach() -set(LLVM_LIBS ${LLVM_AVAILABLE_LIBS}) -list(FILTER LLVM_LIBS INCLUDE REGEX "LLVM.+") - set_property(TARGET intrinsics_gen PROPERTY FOLDER "cmake_macros") diff --git a/CMake/OpenBLASSetup.cmake b/CMake/OpenBLASSetup.cmake index f4f2be1c4..e63b3a1a1 100644 --- a/CMake/OpenBLASSetup.cmake +++ b/CMake/OpenBLASSetup.cmake @@ -123,14 +123,17 @@ else() set(BLAS_LIB_SEARCH_PATHS ${BLAS_PACKAGE_DIR}/lib/) set(BLAS_FOUND TRUE) else()
- # Known registry ID (family, model) settings for various CPU types + # Known registry ID (family, model) settings for various Intel CPU types # # Haswell: Family 6, model 60, 63, 69 # Broadwell: Family 6, Model 70, 79 (compatible with Haswell) # Kaby Lake: Family 6, Model 78, 142, 158 (compatible with Haswell) # Sandybridge: Family 6, model 42, 45 # Ivybridge: Family 6, model 58 (compatible with Sandybridge) - # Skylake: Family 6, model 42 + # Skylake: Family 6, model 85 + # + # Known registry ID (family, model) settings for various AMD CPU types + # Epyc: Family 23, model 1 (compatible with Haswell) # We can set up a mapping from a detected processor generation to the version of # the OpenBLAS libraries to use with the set_processor_mapping macro. For instance, @@ -159,11 +162,15 @@ else() set(processor_generation "sandybridge") elseif(processor_model EQUAL 58) set(processor_generation "sandybridge") # actually ivybridge, but it is compatible with sandybridge - elseif(processor_model EQUAL 42) + elseif(processor_model EQUAL 85) set(processor_generation "sandybridge") # actually skylake, but it is compatible with sandybridge else() set(processor_generation "unknown") endif() + elseif(processor_family EQUAL 23) + if(processor_model EQUAL 1) + set(processor_generation "haswell") + endif() endif() else() set(processor_generation "${PROCESSOR_HINT}") diff --git a/CMakeLists.txt b/CMakeLists.txt index 2cf446dff..d01e464fd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,11 +4,18 @@ cmake_minimum_required(VERSION 3.8 FATAL_ERROR) +# Error on non-existent dependency in add_dependencies. +cmake_policy(SET CMP0046 NEW) + # Include modules in the CMake directory. -list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/CMake") +set(ELL_ROOT "${CMAKE_CURRENT_SOURCE_DIR}") +list(APPEND CMAKE_MODULE_PATH "${ELL_ROOT}/CMake") include(CompilerCache) -project(ELL) +project(ELL CXX ASM) +if(MSVC) + enable_language(ASM_MASM) +endif() file(STRINGS "VERSION" ELL_VERSION) message(STATUS "ELL version ${ELL_VERSION}") @@ -41,11 +48,14 @@ option(DISABLE_PYTHON "Explicitly disable building python modules" OFF) option(CNTK "Enable CNTK importer and related unit tests (requires CNTK python module)" OFF) option(ONNX "Enable ONNX importer and related unit tests (requires PyTorch and ONNX python modules)" OFF) -set(ELL_ROOT "${CMAKE_SOURCE_DIR}") -set(FLAKE8_CONFIG "${CMAKE_SOURCE_DIR}/.flake8") -set(TEST_MODELS_REPO "https://github.com/Microsoft/ell-test-models" CACHE DOCUMENTATION "URL to the git repo containing test models" ) +set(FLAKE8_CONFIG "${ELL_ROOT}/.flake8") +set(TEST_MODELS_REPO "https://github.com/Microsoft/ell-test-models" CACHE STRING "URL to the git repo containing test models" ) message(STATUS "Configuring tests to use TEST_MODELS_REPO at: ${TEST_MODELS_REPO}") -set(EXTERNAL_DIR "${CMAKE_SOURCE_DIR}/external" CACHE DOCUMENTATION "Directory to install external dependencies" ) + +if(NOT ELL_EXTERNAL_DIR) + set(ELL_EXTERNAL_DIR "${ELL_ROOT}/external" CACHE STRING "Directory to install external dependencies" ) +endif(NOT ELL_EXTERNAL_DIR) + set(RPI_PASSWORD "$ENV{RPI_PASSWORD}") set(RPI_CLUSTER "$ENV{RPI_CLUSTER}") set(RPI_KEY "$ENV{RPI_APIKEY}") @@ -106,7 +116,8 @@ endif() enable_testing() # Set up global variables to help find NuGet projects -set(PACKAGE_ROOT ${EXTERNAL_DIR}) +set(PACKAGE_ROOT ${ELL_EXTERNAL_DIR}) + include(OpenBLASSetup) include(LLVMSetup) include(SWIGSetup) @@ -138,14 +149,16 @@ else() add_compile_options(-Wmissing-field-initializers) 
add_compile_options(-fvisibility-inlines-hidden) add_compile_options(-Wno-unknown-pragmas) - add_compile_options(-Wno-backslash-newline-escape) add_compile_options(-Wno-comment) set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -ggdb3 -O0") set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -ggdb3 -O0") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -ggdb3") set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -ggdb3") if(${CMAKE_CXX_COMPILER_ID} STREQUAL Clang) + add_compile_options(-Wno-backslash-newline-escape) add_compile_options(-Wno-self-assign) + else() # GCC + add_compile_options(-Wno-ignored-attributes) endif() endif() @@ -163,17 +176,17 @@ add_subdirectory(interfaces) add_subdirectory(examples) # Add user directories to ELL build if requested -if(EXISTS "${CMAKE_SOURCE_DIR}/user") +if(EXISTS "${ELL_ROOT}/user") # Add root user directory if it has a CMakeLists.txt file and INCLUDE_IN_ELL_BUILD.txt file - if(EXISTS "${CMAKE_SOURCE_DIR}/user/CMakeLists.txt" AND EXISTS "${CMAKE_SOURCE_DIR}/user/INCLUDE_IN_ELL_BUILD.txt") + if(EXISTS "${ELL_ROOT}/user/CMakeLists.txt" AND EXISTS "${ELL_ROOT}/user/INCLUDE_IN_ELL_BUILD.txt") message(STATUS "Adding user directory to ELL build") add_subdirectory(user) endif() # Now add all child directories that have CMakeLists.txt files and INCLUDE_IN_ELL_BUILD.txt file - file(GLOB children RELATIVE "${CMAKE_SOURCE_DIR}/user" "${CMAKE_SOURCE_DIR}/user/*") + file(GLOB children RELATIVE "${ELL_ROOT}/user" "${ELL_ROOT}/user/*") foreach(child ${children}) - if(IS_DIRECTORY "${CMAKE_SOURCE_DIR}/user/${child}" AND EXISTS "${CMAKE_SOURCE_DIR}/user/${child}/CMakeLists.txt" AND EXISTS "${CMAKE_SOURCE_DIR}/user/${child}/INCLUDE_IN_ELL_BUILD.txt") + if(IS_DIRECTORY "${ELL_ROOT}/user/${child}" AND EXISTS "${ELL_ROOT}/user/${child}/CMakeLists.txt" AND EXISTS "${ELL_ROOT}/user/${child}/INCLUDE_IN_ELL_BUILD.txt") message(STATUS "Adding user directory ${child} to ELL build") add_subdirectory("user/${child}") endif()
diff --git a/History.md b/History.md
index 20ac1ea0a..8ee727020 100644
--- a/History.md
+++ b/History.md
@@ -1,3 +1,16 @@
+## 3.1.0
+- Move to VS 2019
+- Fix a codegen error that resulted in incorrect functional behavior
+- Fix regressions in the audio training tutorial (#232)
+- Add importing of Sum nodes to the ONNX importer
+- Fix a crash in LLVMContext::SetName
+- Improve performance of CNN models on the Pi 3 with new implementations of spatial, pointwise and regular convolutions
+- Improve performance of the reorder node
+- New nodes: ReorderDataCodeNode, SpatialConvolutionNode, MatrixMatrixMultiplyCodeNode
+- Implement parallelization strategies for matrix multiplication nodes
+- Enable the new MatrixMatrixMultiplyCodeNode path only for select ARM targets such as the Pi, not Intel/AMD CPUs
+- Add the `--skip_ellcode` flag to the `compile` and `wrap.py` tools to use OpenBLAS for linear algebra computations
+
 ## 3.0.3
 - Fix VS 2019 build.
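[Editorial note: the release notes above introduce MatrixMatrixMultiplyCodeNode and its selectable GEMM implementations. The sketch below is a rough illustration, not part of the patch, of how that surfaces through the Python bindings this change wires up. `MatrixMatrixMultiplyImplementation` and `AddMatrixMatrixMultiplyCodeNode` come from the interface changes later in this patch; the input/output helpers, layout type, and port plumbing are assumptions about the existing `ell` API, not confirmed signatures.]

```python
# Hypothetical sketch of the new GEMM-node API surface. Only
# MatrixMatrixMultiplyImplementation and AddMatrixMatrixMultiplyCodeNode are
# defined by this patch; every other helper used here is an assumption about
# the existing ell Python bindings.
import ell

model = ell.model.Model()
builder = ell.model.ModelBuilder()

# Two 256x256 single-precision matrix inputs (layout helper assumed).
layout = ell.model.PortMemoryLayout([256, 256])
a = builder.AddInputNode(model, layout, ell.nodes.PortType.smallReal)
b = builder.AddInputNode(model, layout, ell.nodes.PortType.smallReal)

# Select the new loop-nest (MLAS-style) GEMM implementation from this release.
impl = ell.nodes.MatrixMatrixMultiplyImplementation.Mlas_Loopnest_Value
gemm = builder.AddMatrixMatrixMultiplyCodeNode(
    model,
    ell.nodes.PortElements(a.GetOutputPort("output")),  # port access assumed
    ell.nodes.PortElements(b.GetOutputPort("output")),
    impl)

builder.AddOutputNode(model, layout, ell.nodes.PortElements(gemm.GetOutputPort("output")))
```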
diff --git a/VERSION b/VERSION
index 75a22a26a..fd2a01863 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-3.0.3
+3.1.0
diff --git a/docs/gallery/ILSVRC2012/Asparagus.md b/docs/gallery/ILSVRC2012/Asparagus.md
index 9e72d64b0..c9294035a 100644
--- a/docs/gallery/ILSVRC2012/Asparagus.md
+++ b/docs/gallery/ILSVRC2012/Asparagus.md
@@ -6,7 +6,7 @@ permalink: /gallery/ILSVRC2012/Asparagus
 [Back to Gallery](/ELL/gallery)
-## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (52.07% top 1 accuracy, 76.40% top 5 accuracy, 108ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
+## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (52.07% top 1 accuracy, 76.40% top 5 accuracy, 181ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
@@ -19,7 +19,7 @@ permalink: /gallery/ILSVRC2012/Asparagus
 [HTML table hunk: Performance row updated from 108ms/frame to 181ms/frame; markup lost in extraction]
diff --git a/docs/gallery/ILSVRC2012/Bean.md b/docs/gallery/ILSVRC2012/Bean.md
index 924fcf52c..8cfd6c457 100644
--- a/docs/gallery/ILSVRC2012/Bean.md
+++ b/docs/gallery/ILSVRC2012/Bean.md
@@ -6,12 +6,12 @@ permalink: /gallery/ILSVRC2012/Bean
 [Back to Gallery](/ELL/gallery)
-## ILSVRC2012 Classification: 128x128x3 Convolutional Neural Network (55.12% top 1 accuracy, 78.21% top 5 accuracy, 144ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
+## ILSVRC2012 Classification: 128x128x3 Convolutional Neural Network (55.12% top 1 accuracy, 78.21% top 5 accuracy, 87ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
 [table Download/Accuracy rows unchanged; markup lost in extraction]
@@ -19,7 +19,7 @@ permalink: /gallery/ILSVRC2012/Bean
 [HTML table hunk: Performance row updated from 144ms/frame to 87ms/frame; markup lost in extraction]
diff --git a/docs/gallery/ILSVRC2012/Buckthorn.md b/docs/gallery/ILSVRC2012/Buckthorn.md
index 6d0aacf4d..cebefe939 100644
--- a/docs/gallery/ILSVRC2012/Buckthorn.md
+++ b/docs/gallery/ILSVRC2012/Buckthorn.md
@@ -6,12 +6,12 @@ permalink: /gallery/ILSVRC2012/Buckthorn
 [Back to Gallery](/ELL/gallery)
-## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (57.57% top 1 accuracy, 80.55% top 5 accuracy, 171ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
+## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (57.57% top 1 accuracy, 80.55% top 5 accuracy, 113ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
 [table Download/Accuracy rows unchanged; markup lost in extraction]
@@ -19,7 +19,7 @@ permalink: /gallery/ILSVRC2012/Buckthorn
 [HTML table hunk: Performance row updated from 171ms/frame to 113ms/frame; markup lost in extraction]
diff --git a/docs/gallery/ILSVRC2012/Carrot.md b/docs/gallery/ILSVRC2012/Carrot.md
index 9b83882f4..6b5f98dfa 100644
--- a/docs/gallery/ILSVRC2012/Carrot.md
+++ b/docs/gallery/ILSVRC2012/Carrot.md
@@ -6,12 +6,12 @@ permalink: /gallery/ILSVRC2012/Carrot
 [Back to Gallery](/ELL/gallery)
-## ILSVRC2012 Classification: 128x128x3 Convolutional Neural Network (64.61% top 1 accuracy, 85.63% top 5 accuracy, 397ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
+## ILSVRC2012 Classification: 128x128x3 Convolutional Neural Network (64.61% top 1 accuracy, 85.63% top 5 accuracy, 341ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
 [table Download/Accuracy rows unchanged; markup lost in extraction]
@@ -19,7 +19,7 @@ permalink: /gallery/ILSVRC2012/Carrot
 [HTML table hunk: Performance row updated from 397ms/frame to 341ms/frame; markup lost in extraction]
diff --git a/docs/gallery/ILSVRC2012/CashewNut.md b/docs/gallery/ILSVRC2012/CashewNut.md
index fb5cab1c7..459054082 100644
--- a/docs/gallery/ILSVRC2012/CashewNut.md
+++ b/docs/gallery/ILSVRC2012/CashewNut.md
@@ -6,12 +6,12 @@ permalink: /gallery/ILSVRC2012/CashewNut
 [Back to Gallery](/ELL/gallery)
-## ILSVRC2012 Classification: 128x128x3 Convolutional Neural Network (60.22% top 1 accuracy, 82.44% top 5 accuracy, 178ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
+## ILSVRC2012 Classification: 128x128x3 Convolutional Neural Network (60.22% top 1 accuracy, 82.44% top 5 accuracy, 106ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
 [table Download/Accuracy rows unchanged; markup lost in extraction]
@@ -19,7 +19,7 @@ permalink: /gallery/ILSVRC2012/CashewNut
 [HTML table hunk: Performance row updated from 178ms/frame to 106ms/frame; markup lost in extraction]
diff --git a/docs/gallery/ILSVRC2012/Chalta.md b/docs/gallery/ILSVRC2012/Chalta.md
index a952c5a50..0f762a38c 100644
--- a/docs/gallery/ILSVRC2012/Chalta.md
+++ b/docs/gallery/ILSVRC2012/Chalta.md
@@ -6,12 +6,12 @@ permalink: /gallery/ILSVRC2012/Chalta
 [Back to Gallery](/ELL/gallery)
-## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (58.74% top 1 accuracy, 81.59% top 5 accuracy, 221ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
+## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (58.74% top 1 accuracy, 81.59% top 5 accuracy, 147ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
 [table Download/Accuracy rows unchanged; markup lost in extraction]
@@ -19,7 +19,7 @@ permalink: /gallery/ILSVRC2012/Chalta
 [HTML table hunk: Performance row updated from 221ms/frame to 147ms/frame; markup lost in extraction]
diff --git a/docs/gallery/ILSVRC2012/Clary.md b/docs/gallery/ILSVRC2012/Clary.md
index 7fa372ed7..e1b4b4fd8 100644
--- a/docs/gallery/ILSVRC2012/Clary.md
+++ b/docs/gallery/ILSVRC2012/Clary.md
@@ -6,12 +6,12 @@ permalink: /gallery/ILSVRC2012/Clary
 [Back to Gallery](/ELL/gallery)
-## ILSVRC2012 Classification: 128x128x3 Convolutional Neural Network (66.65% top 1 accuracy, 87.17% top 5 accuracy, 506ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
+## ILSVRC2012 Classification: 128x128x3 Convolutional Neural Network (66.65% top 1 accuracy, 87.17% top 5 accuracy, 361ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
 [table Download/Accuracy rows unchanged; markup lost in extraction]
@@ -19,7 +19,7 @@ permalink: /gallery/ILSVRC2012/Clary
 [HTML table hunk: Performance row updated from 506ms/frame to 361ms/frame; markup lost in extraction]
diff --git a/docs/gallery/ILSVRC2012/Clover.md b/docs/gallery/ILSVRC2012/Clover.md
index dbc619710..f171badb4 100644
--- a/docs/gallery/ILSVRC2012/Clover.md
+++ b/docs/gallery/ILSVRC2012/Clover.md
@@ -6,12 +6,12 @@ permalink: /gallery/ILSVRC2012/Clover
 [Back to Gallery](/ELL/gallery)
-## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (53.04% top 1 accuracy, 77.12% top 5 accuracy, 126ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
+## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (53.04% top 1 accuracy, 77.12% top 5 accuracy, 90ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
 [table Download/Accuracy rows unchanged; markup lost in extraction]
@@ -19,7 +19,7 @@ permalink: /gallery/ILSVRC2012/Clover
 [HTML table hunk: Performance row updated from 126ms/frame to 90ms/frame; markup lost in extraction]
diff --git a/docs/gallery/ILSVRC2012/Coconut.md b/docs/gallery/ILSVRC2012/Coconut.md
index 7deca6415..dff22e5ff 100644
--- a/docs/gallery/ILSVRC2012/Coconut.md
+++ b/docs/gallery/ILSVRC2012/Coconut.md
@@ -6,12 +6,12 @@ permalink: /gallery/ILSVRC2012/Coconut
 [Back to Gallery](/ELL/gallery)
-## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (44.41% top 1 accuracy, 69.41% top 5 accuracy, 46ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
+## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (44.41% top 1 accuracy, 69.41% top 5 accuracy, 30ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
 [table Download/Accuracy rows unchanged; markup lost in extraction]
@@ -19,7 +19,7 @@ permalink: /gallery/ILSVRC2012/Coconut
 [HTML table hunk: Performance row updated from 46ms/frame to 30ms/frame; markup lost in extraction]
diff --git a/docs/gallery/ILSVRC2012/Ginger.md b/docs/gallery/ILSVRC2012/Ginger.md
index bea5f6b22..4b35d720a 100644
--- a/docs/gallery/ILSVRC2012/Ginger.md
+++ b/docs/gallery/ILSVRC2012/Ginger.md
@@ -6,12 +6,12 @@ permalink: /gallery/ILSVRC2012/Ginger
 [Back to Gallery](/ELL/gallery)
-## ILSVRC2012 Classification: 128x128x3 Convolutional Neural Network (62.33% top 1 accuracy, 84.14% top 5 accuracy, 205ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
+## ILSVRC2012 Classification: 128x128x3 Convolutional Neural Network (62.33% top 1 accuracy, 84.14% top 5 accuracy, 122ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
 [table Download/Accuracy rows unchanged; markup lost in extraction]
@@ -19,7 +19,7 @@ permalink: /gallery/ILSVRC2012/Ginger
 [HTML table hunk: Performance row updated from 205ms/frame to 122ms/frame; markup lost in extraction]
diff --git a/docs/gallery/ILSVRC2012/Mashua.md b/docs/gallery/ILSVRC2012/Mashua.md
index 96d95e492..995f2ff38 100644
--- a/docs/gallery/ILSVRC2012/Mashua.md
+++ b/docs/gallery/ILSVRC2012/Mashua.md
@@ -6,7 +6,7 @@ permalink: /gallery/ILSVRC2012/Mashua
 [Back to Gallery](/ELL/gallery)
-## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (64.59% top 1 accuracy, 85.50% top 5 accuracy, 804ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
+## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (64.59% top 1 accuracy, 85.50% top 5 accuracy, 525ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
@@ -19,7 +19,7 @@ permalink: /gallery/ILSVRC2012/Mashua
 [HTML table hunk: Performance row updated from 804ms/frame to 525ms/frame; markup lost in extraction]
diff --git a/docs/gallery/ILSVRC2012/PandanFlower.md b/docs/gallery/ILSVRC2012/PandanFlower.md
index 140b9634f..7c34d0536 100644
--- a/docs/gallery/ILSVRC2012/PandanFlower.md
+++ b/docs/gallery/ILSVRC2012/PandanFlower.md
@@ -6,12 +6,12 @@ permalink: /gallery/ILSVRC2012/PandanFlower
 [Back to Gallery](/ELL/gallery)
-## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (62.05% top 1 accuracy, 83.80% top 5 accuracy, 347ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
+## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (62.05% top 1 accuracy, 83.80% top 5 accuracy, 239ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
 [table Download/Accuracy rows unchanged; markup lost in extraction]
@@ -19,7 +19,7 @@ permalink: /gallery/ILSVRC2012/PandanFlower
 [HTML table hunk: Performance row updated from 347ms/frame to 239ms/frame; markup lost in extraction]
diff --git a/docs/gallery/ILSVRC2012/Pear.md b/docs/gallery/ILSVRC2012/Pear.md
index 0a50db4d6..213426135 100644
--- a/docs/gallery/ILSVRC2012/Pear.md
+++ b/docs/gallery/ILSVRC2012/Pear.md
@@ -6,12 +6,12 @@ permalink: /gallery/ILSVRC2012/Pear
 [Back to Gallery](/ELL/gallery)
-## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (50.08% top 1 accuracy, 74.74% top 5 accuracy, 90ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
+## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (50.08% top 1 accuracy, 74.74% top 5 accuracy, 85ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
 [table Download/Accuracy rows unchanged; markup lost in extraction]
@@ -19,7 +19,7 @@ permalink: /gallery/ILSVRC2012/Pear
 [HTML table hunk: Performance row updated from 90ms/frame to 85ms/frame; markup lost in extraction]
diff --git a/docs/gallery/ILSVRC2012/SevenSisters.md b/docs/gallery/ILSVRC2012/SevenSisters.md
index 94d7f155e..80866b933 100644
--- a/docs/gallery/ILSVRC2012/SevenSisters.md
+++ b/docs/gallery/ILSVRC2012/SevenSisters.md
@@ -6,12 +6,12 @@ permalink: /gallery/ILSVRC2012/SevenSisters
 [Back to Gallery](/ELL/gallery)
-## ILSVRC2012 Classification: 128x128x3 Convolutional Neural Network (66.04% top 1 accuracy, 86.59% top 5 accuracy, 489ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
+## ILSVRC2012 Classification: 128x128x3 Convolutional Neural Network (66.04% top 1 accuracy, 86.59% top 5 accuracy, 415ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
 [table Download/Accuracy rows unchanged; markup lost in extraction]
@@ -19,7 +19,7 @@ permalink: /gallery/ILSVRC2012/SevenSisters
 [HTML table hunk: Performance row updated from 489ms/frame to 415ms/frame; markup lost in extraction]
diff --git a/docs/gallery/ILSVRC2012/Sweetsop.md b/docs/gallery/ILSVRC2012/Sweetsop.md
index 4ea819445..3d5acd33f 100644
--- a/docs/gallery/ILSVRC2012/Sweetsop.md
+++ b/docs/gallery/ILSVRC2012/Sweetsop.md
@@ -6,12 +6,12 @@ permalink: /gallery/ILSVRC2012/Sweetsop
 [Back to Gallery](/ELL/gallery)
-## ILSVRC2012 Classification: 128x128x3 Convolutional Neural Network (72.24% top 1 accuracy, 90.78% top 5 accuracy, 1012ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
+## ILSVRC2012 Classification: 128x128x3 Convolutional Neural Network (72.24% top 1 accuracy, 90.78% top 5 accuracy, 719ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
 [table Download/Accuracy rows unchanged; markup lost in extraction]
@@ -19,7 +19,7 @@ permalink: /gallery/ILSVRC2012/Sweetsop
 [HTML table hunk: Performance row updated from 1012ms/frame to 719ms/frame; markup lost in extraction]
diff --git a/docs/gallery/ILSVRC2012/Tamarind.md b/docs/gallery/ILSVRC2012/Tamarind.md
index ff3f78d8d..d721e8e96 100644
--- a/docs/gallery/ILSVRC2012/Tamarind.md
+++ b/docs/gallery/ILSVRC2012/Tamarind.md
@@ -6,12 +6,12 @@ permalink: /gallery/ILSVRC2012/Tamarind
 [Back to Gallery](/ELL/gallery)
-## ILSVRC2012 Classification: 128x128x3 Convolutional Neural Network (70.46% top 1 accuracy, 89.57% top 5 accuracy, 665ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
+## ILSVRC2012 Classification: 128x128x3 Convolutional Neural Network (70.46% top 1 accuracy, 89.57% top 5 accuracy, 470ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
 [table Download/Accuracy rows unchanged; markup lost in extraction]
@@ -19,7 +19,7 @@ permalink: /gallery/ILSVRC2012/Tamarind
 [HTML table hunk: Performance row updated from 665ms/frame to 470ms/frame; markup lost in extraction]
diff --git a/docs/gallery/ILSVRC2012/WaterApple.md b/docs/gallery/ILSVRC2012/WaterApple.md
index b58ae1df0..f3b86f088 100644
--- a/docs/gallery/ILSVRC2012/WaterApple.md
+++ b/docs/gallery/ILSVRC2012/WaterApple.md
@@ -6,12 +6,12 @@ permalink: /gallery/ILSVRC2012/WaterApple
 [Back to Gallery](/ELL/gallery)
-## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (61.21% top 1 accuracy, 83.23% top 5 accuracy, 269ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
+## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (61.21% top 1 accuracy, 83.23% top 5 accuracy, 186ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
 [table Download/Accuracy rows unchanged; markup lost in extraction]
@@ -19,7 +19,7 @@ permalink: /gallery/ILSVRC2012/WaterApple
 [HTML table hunk: Performance row updated from 269ms/frame to 186ms/frame; markup lost in extraction]
diff --git a/docs/gallery/ILSVRC2012/Wattleseed.md b/docs/gallery/ILSVRC2012/Wattleseed.md
index c8d94d9ba..ff41582ab 100644
--- a/docs/gallery/ILSVRC2012/Wattleseed.md
+++ b/docs/gallery/ILSVRC2012/Wattleseed.md
@@ -6,7 +6,7 @@ permalink: /gallery/ILSVRC2012/Wattleseed
 [Back to Gallery](/ELL/gallery)
-## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (63.23% top 1 accuracy, 84.72% top 5 accuracy, 601ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
+## ILSVRC2012 Classification: 64x64x3 Convolutional Neural Network (63.23% top 1 accuracy, 84.72% top 5 accuracy, 350ms/frame on Raspberry Pi 3 (Raspbian) @ 700MHz)
@@ -19,7 +19,7 @@ permalink: /gallery/ILSVRC2012/Wattleseed
 [HTML table hunk: Performance row updated from 601ms/frame to 350ms/frame; markup lost in extraction]
diff --git a/docs/tutorials/Boosting-classifier-accuracy-by-grouping-categories/index.md b/docs/tutorials/Boosting-classifier-accuracy-by-grouping-categories/index.md
index db551aa21..a9fbd8d31 100644
--- a/docs/tutorials/Boosting-classifier-accuracy-by-grouping-categories/index.md
+++ b/docs/tutorials/Boosting-classifier-accuracy-by-grouping-categories/index.md
@@ -294,4 +294,3 @@ def main():
 ## Troubleshooting
 Find tips in the Troubleshooting section of the [Raspberry Pi Setup Instructions](/ELL/tutorials/Raspberry-Pi-setup).
-
diff --git a/interfaces/CMakeLists.txt b/interfaces/CMakeLists.txt
index a2d64d917..90c852a1b 100644
--- a/interfaces/CMakeLists.txt
+++ b/interfaces/CMakeLists.txt
@@ -26,6 +26,7 @@ set(_sources
     common/include/CallbackInterface.h
     common/include/DatasetInterface.h
     common/include/DatasetInterfaceImpl.h
+    common/include/MatrixMatrixMultiplyImplementation.h
     common/include/MathInterface.h
     common/include/ModelBuilderInterface.h
     common/include/ModelInterface.h
diff --git a/interfaces/common/include/MatrixMatrixMultiplyImplementation.h b/interfaces/common/include/MatrixMatrixMultiplyImplementation.h
new file mode 100644
index 000000000..2b02c4431
--- /dev/null
+++ b/interfaces/common/include/MatrixMatrixMultiplyImplementation.h
@@ -0,0 +1,17 @@
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Project:  Embedded Learning Library (ELL)
+//  File:     MatrixMatrixMultiplyImplementation.h (interfaces)
+//  Authors:  Mason Remy
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+#pragma once
+
+#include <nodes/include/MatrixMatrixMultiplyImplementation.h>
+
+enum class MatrixMatrixMultiplyImplementation : int
+{
+    SimpleForLoops = (int)ell::nodes::MatrixMatrixMultiplyImplementation::SimpleForLoops,
+    Mlas_Loopnest_Value = (int)ell::nodes::MatrixMatrixMultiplyImplementation::Mlas_Loopnest_Value,
+    ImplementationCount = (int)ell::nodes::MatrixMatrixMultiplyImplementation::LAST
+};
diff --git a/interfaces/common/include/ModelBuilderInterface.h b/interfaces/common/include/ModelBuilderInterface.h
index 9ef08106e..46db1039a 100644
--- a/interfaces/common/include/ModelBuilderInterface.h
+++ b/interfaces/common/include/ModelBuilderInterface.h
@@ -48,6 +48,9 @@ class ModelBuilder
     Node AddConstantNode(Model model, std::vector<double> values, const PortMemoryLayout& outputMemoryLayout, PortType type);
     Node AddDCTNode(Model model, PortElements input, int numFilters);
     Node AddMatrixMultiplyNode(Model model, PortElements input1, PortElements input2);
+    Node AddMatrixMatrixMultiplyNode(Model model, PortElements input1, PortElements input2);
+    Node AddMatrixMatrixMultiplyCodeNode(Model model, PortElements input1, PortElements input2, int gemmImpl);
+    Node AddMatrixMatrixMultiplyCodeNode(Model model, PortElements input1, PortElements input2, int panelM, int panelN, int panelK, int kernelM, int kernelN, int kernelK, int gemmImpl);
     Node AddDotProductNode(Model model, PortElements input1, PortElements input2);
     Node AddNeuralNetworkPredictorNode(Model model, PortElements input, ell::api::predictors::NeuralNetworkPredictor predictor);
     Node AddFFTNode(Model model, PortElements input, int nfft = 0);
diff --git a/interfaces/common/include/ModelInterface.h b/interfaces/common/include/ModelInterface.h
index 5ea9f9223..2dc977e37 100644
--- a/interfaces/common/include/ModelInterface.h
+++ b/interfaces/common/include/ModelInterface.h
@@ -495,6 +495,9 @@ struct MapCompilerOptions
     /// Emit debug code.
     bool debug = false;
+
+    /// Skip ELLCode optimization.
+    bool skip_ellcode = false;
 };

 //
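[Editorial note: with `skip_ellcode` exposed on `MapCompilerOptions`, the OpenBLAS fallback can also be requested programmatically rather than only via the command line. A minimal sketch follows; the map loader and the exact `Compile` overload are assumptions about the existing `ell` Python API, and only `useBlas` and `skip_ellcode` come from this patch.]

```python
# Hypothetical sketch: compile a map with the new skip_ellcode switch so
# linear algebra falls back to OpenBLAS instead of the ELLCode GEMM path.
# The Map constructor and Compile signature shown here are assumptions.
import ell

ell_map = ell.model.Map("mymodel.ell")           # assumed loader

settings = ell.model.MapCompilerOptions()        # struct extended by this patch
settings.useBlas = True                          # link against OpenBLAS
settings.skip_ellcode = True                     # new in v3.1.0

optimizer_settings = ell.model.ModelOptimizerOptions()
compiled = ell_map.Compile("pi3", "mymodel", "predict", settings, optimizer_settings)
```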
diff --git a/interfaces/common/model.i b/interfaces/common/model.i
index c102386f2..4facda208 100644
--- a/interfaces/common/model.i
+++ b/interfaces/common/model.i
@@ -17,6 +17,7 @@ std::vector GetOutputBuffersFromList(std::shared_ptr map
 #endif // SWIGPYTHON

 #include "Ports.h"
+#include "MatrixMatrixMultiplyImplementation.h"
 #include "ModelInterface.h"
 #include "ModelBuilderInterface.h"
@@ -33,6 +34,7 @@ std::vector GetOutputBuffersFromList(std::shared_ptr map
 // Include the C++ code to be wrapped
 %include "Ports.h"
+%include "MatrixMatrixMultiplyImplementation.h"
 %include "ModelInterface.h"
 %include "ModelBuilderInterface.h"
 %include "macros.i"
diff --git a/interfaces/common/model_python_post.i b/interfaces/common/model_python_post.i
index 35014482a..a02fa69b5 100644
--- a/interfaces/common/model_python_post.i
+++ b/interfaces/common/model_python_post.i
@@ -8,6 +8,16 @@
 %pythoncode %{

+# Python friendly class for MatrixMatrixMultiplyImplementation
+class MatrixMatrixMultiplyImplementation:
+    SimpleForLoops = MatrixMatrixMultiplyImplementation_SimpleForLoops
+    Mlas_Loopnest_Value = MatrixMatrixMultiplyImplementation_Mlas_Loopnest_Value
+    ImplementationCount = MatrixMatrixMultiplyImplementation_ImplementationCount
+
+del MatrixMatrixMultiplyImplementation_SimpleForLoops
+del MatrixMatrixMultiplyImplementation_Mlas_Loopnest_Value
+del MatrixMatrixMultiplyImplementation_ImplementationCount
+
 # Python friendly class for PortType
 class PortType:
     bigInt = PortType_bigInt
diff --git a/interfaces/common/src/ModelBuilderInterface.cpp b/interfaces/common/src/ModelBuilderInterface.cpp
index 4340a3933..db9e644ed 100644
--- a/interfaces/common/src/ModelBuilderInterface.cpp
+++ b/interfaces/common/src/ModelBuilderInterface.cpp
@@ -37,11 +37,13 @@
 #include
 #include
 #include
+#include
+#include
 #include
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -455,7 +457,7 @@ Node ModelBuilder::AddReorderDataNode(Model model, PortElements input, PortMemor
     switch (type)
     {
     case PortType::real:
-        newNode = model.GetModel()->AddNode<ell::nodes::ReorderDataNode<double>>(
+        newNode = model.GetModel()->AddNode<ell::nodes::ReorderDataCodeNode<double>>(
             ell::model::PortElements<double>(elements),
             inputMemoryLayout.Get(),
             outputMemoryLayout.Get(),
@@ -463,7 +465,7 @@ Node ModelBuilder::AddReorderDataNode(Model model, PortElements input, PortMemor
             outputPaddingValue);
         break;
     case PortType::smallReal:
-        newNode = model.GetModel()->AddNode<ell::nodes::ReorderDataNode<float>>(
+        newNode = model.GetModel()->AddNode<ell::nodes::ReorderDataCodeNode<float>>(
             ell::model::PortElements<float>(elements),
             inputMemoryLayout.Get(),
             outputMemoryLayout.Get(),
@@ -484,12 +486,12 @@ Node ModelBuilder::AddReorderDataNode(Model model, PortElements input, std::vect
     switch (type)
     {
     case PortType::real:
-        newNode = model.GetModel()->AddNode<ell::nodes::ReorderDataNode<double>>(
+        newNode = model.GetModel()->AddNode<ell::nodes::ReorderDataCodeNode<double>>(
             ell::model::PortElements<double>(elements),
             order);
         break;
     case PortType::smallReal:
-        newNode = model.GetModel()->AddNode<ell::nodes::ReorderDataNode<float>>(
+        newNode = model.GetModel()->AddNode<ell::nodes::ReorderDataCodeNode<float>>(
             ell::model::PortElements<float>(elements),
             order);
         break;
@@ -971,6 +973,120 @@ Node ModelBuilder::AddMatrixMultiplyNode(Model model, PortElements input1, PortE
     return Node(newNode, model.GetModel());
 }

+Node ModelBuilder::AddMatrixMatrixMultiplyNode(Model model, PortElements input1, PortElements input2)
+{
+    auto type = input1.GetType();
+    auto t2 = input2.GetType();
+    if (type != t2)
+    {
+        throw std::invalid_argument("Error: input1 has different element types from input2");
+    }
+    auto elements1 = input1.GetPortElements();
+    auto elements2 = input2.GetPortElements();
+
+    auto layout1 = elements1.GetMemoryLayout();
+    auto layout2 = elements2.GetMemoryLayout();
+
+    ell::model::Node* newNode = nullptr;
+
+    if (layout1.NumDimensions() != 2 || layout2.NumDimensions() != 2)
+    {
+        throw std::invalid_argument("Error: input sizes invalid");
+    }
+    else
+    {
+        switch (type)
+        {
+        case PortType::real:
+            newNode = model.GetModel()->AddNode<ell::nodes::MatrixMatrixMultiplyNode<double>>(ell::model::PortElements<double>(elements1), ell::model::PortElements<double>(elements2));
+            break;
+        case PortType::smallReal:
+            newNode = model.GetModel()->AddNode<ell::nodes::MatrixMatrixMultiplyNode<float>>(ell::model::PortElements<float>(elements1), ell::model::PortElements<float>(elements2));
+            break;
+        default:
+            throw std::invalid_argument("Error: could not create MatrixMatrixMultiplyNode of the requested type");
+        }
+    }
+
+    return Node(newNode, model.GetModel());
+}
+
+Node ModelBuilder::AddMatrixMatrixMultiplyCodeNode(Model model, PortElements input1, PortElements input2, int gemmImpl)
+{
+    auto type = input1.GetType();
+    auto t2 = input2.GetType();
+    if (type != t2)
+    {
+        throw std::invalid_argument("Error: input1 has different element types from input2");
+    }
+    auto elements1 = input1.GetPortElements();
+    auto elements2 = input2.GetPortElements();
+
+    auto layout1 = elements1.GetMemoryLayout();
+    auto layout2 = elements2.GetMemoryLayout();
+
+    ell::model::Node* newNode = nullptr;
+
+    if (layout1.NumDimensions() != 2 || layout2.NumDimensions() != 2)
+    {
+        throw std::invalid_argument("Error: input sizes invalid");
+    }
+    else
+    {
+        switch (type)
+        {
+        case PortType::real:
+            newNode = model.GetModel()->AddNode<ell::nodes::MatrixMatrixMultiplyCodeNode<double>>(ell::model::PortElements<double>(elements1), ell::model::PortElements<double>(elements2), static_cast<ell::nodes::MatrixMatrixMultiplyImplementation>(gemmImpl));
+            break;
+        case PortType::smallReal:
+            newNode = model.GetModel()->AddNode<ell::nodes::MatrixMatrixMultiplyCodeNode<float>>(ell::model::PortElements<float>(elements1), ell::model::PortElements<float>(elements2), static_cast<ell::nodes::MatrixMatrixMultiplyImplementation>(gemmImpl));
+            break;
+        default:
+            throw std::invalid_argument("Error: could not create MatrixMatrixMultiplyCodeNode of the requested type");
+        }
+    }
+
+    return Node(newNode, model.GetModel());
+}
+
+Node ModelBuilder::AddMatrixMatrixMultiplyCodeNode(Model model, PortElements input1, PortElements input2, int panelM, int panelN, int panelK, int kernelM, int kernelN, int kernelK, int gemmImpl)
+{
+    auto type = input1.GetType();
+    auto t2 = input2.GetType();
+    if (type != t2)
+    {
+        throw std::invalid_argument("Error: input1 has different element types from input2");
+    }
+    auto elements1 = input1.GetPortElements();
+    auto elements2 = input2.GetPortElements();
+
+    auto layout1 = elements1.GetMemoryLayout();
+    auto layout2 = elements2.GetMemoryLayout();
+
+    ell::model::Node* newNode = nullptr;
+
+    if (layout1.NumDimensions() != 2 || layout2.NumDimensions() != 2)
+    {
+        throw std::invalid_argument("Error: input sizes invalid");
+    }
+    else
+    {
+        switch (type)
+        {
+        case PortType::real:
+            newNode = model.GetModel()->AddNode<ell::nodes::MatrixMatrixMultiplyCodeNode<double>>(ell::model::PortElements<double>(elements1), ell::model::PortElements<double>(elements2), panelM, panelN, panelK, kernelM, kernelN, kernelK, static_cast<ell::nodes::MatrixMatrixMultiplyImplementation>(gemmImpl));
+            break;
+        case PortType::smallReal:
+            newNode = model.GetModel()->AddNode<ell::nodes::MatrixMatrixMultiplyCodeNode<float>>(ell::model::PortElements<float>(elements1), ell::model::PortElements<float>(elements2), panelM, panelN, panelK, kernelM, kernelN, kernelK, static_cast<ell::nodes::MatrixMatrixMultiplyImplementation>(gemmImpl));
+            break;
+        default:
+            throw std::invalid_argument("Error: could not create MatrixMatrixMultiplyCodeNode of the requested type");
+        }
+    }
+
+    return Node(newNode, model.GetModel());
+}
+
 Node ModelBuilder::AddDotProductNode(Model model, PortElements input1, PortElements input2)
 {
     ell::model::Node* newNode = nullptr;
diff --git
a/interfaces/common/src/ModelInterface.cpp b/interfaces/common/src/ModelInterface.cpp index a329997bf..35d540ed1 100644 --- a/interfaces/common/src/ModelInterface.cpp +++ b/interfaces/common/src/ModelInterface.cpp @@ -960,6 +960,7 @@ CompiledMap Map::Compile(const std::string& targetDevice, const std::string& mod settings.compilerSettings.allowVectorInstructions = compilerSettings.allowVectorInstructions; settings.compilerSettings.vectorWidth = compilerSettings.vectorWidth; settings.compilerSettings.debug = compilerSettings.debug; + settings.compilerSettings.skip_ellcode = compilerSettings.skip_ellcode; ell::model::ModelOptimizerOptions optimizerOptions; optimizerOptions["fuseLinearFunctionNodes"] = optimizerSettings.fuseLinearFunctionNodes; diff --git a/interfaces/python/CMakeLists.txt b/interfaces/python/CMakeLists.txt index c08117b71..2a001f1e9 100644 --- a/interfaces/python/CMakeLists.txt +++ b/interfaces/python/CMakeLists.txt @@ -37,16 +37,16 @@ if (TARGET _ELL_python) add_custom_command(TARGET _ELL_python POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${PYTHON_DIR}/ell_py.py ${PYTHON_DIR}/package/ell/ell_py.py - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/tools/utilities/pythonlibs/buildtools.py ${PYTHON_DIR}/package/ell/util/buildtools.py - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/docs/tutorials/shared/tutorial_helpers.py ${PYTHON_DIR}/package/ell/util/tutorialHelpers.py + COMMAND ${CMAKE_COMMAND} -E copy ${ELL_ROOT}/tools/utilities/pythonlibs/buildtools.py ${PYTHON_DIR}/package/ell/util/buildtools.py + COMMAND ${CMAKE_COMMAND} -E copy ${ELL_ROOT}/docs/tutorials/shared/tutorial_helpers.py ${PYTHON_DIR}/package/ell/util/tutorialHelpers.py ) - file(GLOB PKGHDR RELATIVE ${CMAKE_SOURCE_DIR}/interfaces ${CMAKE_SOURCE_DIR}/interfaces/common/*.i ${CMAKE_SOURCE_DIR}/interfaces/common/include/*.h) + file(GLOB PKGHDR RELATIVE ${ELL_ROOT}/interfaces ${ELL_ROOT}/interfaces/common/*.i ${ELL_ROOT}/interfaces/common/include/*.h) foreach(hdr ${PKGHDR}) add_custom_command(TARGET _ELL_python POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/interfaces/${hdr} ${CMAKE_BINARY_DIR}/interfaces/python/package/ell/headers/${hdr} + COMMAND ${CMAKE_COMMAND} -E copy ${ELL_ROOT}/interfaces/${hdr} ${CMAKE_BINARY_DIR}/interfaces/python/package/ell/headers/${hdr} ) endforeach() @@ -54,9 +54,9 @@ if (TARGET _ELL_python) add_custom_command(TARGET _ELL_python POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/CMake/OpenBLASSetup.cmake ${DEPLOYDIR}/OpenBLASSetup.cmake - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/interfaces/common/include/CallbackInterface.h ${DEPLOYDIR}/include/CallbackInterface.h - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/tools/wrap/templates/CMakeLists.python.txt.in ${DEPLOYDIR}/CMakeLists.python.txt.in + COMMAND ${CMAKE_COMMAND} -E copy ${ELL_ROOT}/CMake/OpenBLASSetup.cmake ${DEPLOYDIR}/OpenBLASSetup.cmake + COMMAND ${CMAKE_COMMAND} -E copy ${ELL_ROOT}/interfaces/common/include/CallbackInterface.h ${DEPLOYDIR}/include/CallbackInterface.h + COMMAND ${CMAKE_COMMAND} -E copy ${ELL_ROOT}/tools/wrap/templates/CMakeLists.python.txt.in ${DEPLOYDIR}/CMakeLists.python.txt.in ) if(WIN32) @@ -73,4 +73,7 @@ if (TARGET _ELL_python) COMMAND ${CMAKE_COMMAND} -E copy $ ${PYTHON_DIR}/package/ell/$ ) endif() + + add_dependencies(_ELL_python pythonpackage) + endif(TARGET _ELL_python) diff --git a/interfaces/python/package/CMakeLists.txt b/interfaces/python/package/CMakeLists.txt index b241b8dc5..0977e904e 100644 --- 
a/interfaces/python/package/CMakeLists.txt +++ b/interfaces/python/package/CMakeLists.txt @@ -1,15 +1,16 @@ if(${PYTHON_ENABLED}) set(module_name "pythonpackage") - + set(src bld.bat build.sh MANIFEST.in meta.yaml setup.py) add_custom_target(${module_name} ALL DEPENDS SOURCES ${src}) - + copy_newer_files(${module_name} src) set_property(TARGET ${module_name} PROPERTY FOLDER "interfaces/python/package") -endif() # PYTHON_ENABLED + add_subdirectory(ell) + add_dependencies(${module_name} ${module_name}_ell) -add_subdirectory(ell) +endif() # PYTHON_ENABLED diff --git a/interfaces/python/package/ell/CMakeLists.txt b/interfaces/python/package/ell/CMakeLists.txt index 4b2cee0a9..49b8f5a9a 100644 --- a/interfaces/python/package/ell/CMakeLists.txt +++ b/interfaces/python/package/ell/CMakeLists.txt @@ -3,18 +3,24 @@ if(${PYTHON_ENABLED}) set(module_name "pythonpackage_ell") set(src __init__.py rpi_magic.py platform.py) + add_custom_target(${module_name} ALL DEPENDS SOURCES ${src}) - add_subdirectory(data) - add_subdirectory(math) - add_subdirectory(model) - add_subdirectory(neural) - add_subdirectory(nodes) - add_subdirectory(trainers) - add_subdirectory(util) - add_subdirectory(vision) + set(module_components + data + math + model + neural + nodes + trainers + util + vision + ) + + foreach(component ${module_components}) + add_subdirectory(${component}) + add_dependencies(${module_name} ${module_name}_${component}) + endforeach(component ${module_components}) - add_custom_target(${module_name} ALL DEPENDS SOURCES ${src}) - copy_newer_files(${module_name} src) set_property(TARGET ${module_name} PROPERTY FOLDER "interfaces/python/package/ell") diff --git a/interfaces/python/package/ell/nodes/__init__.py b/interfaces/python/package/ell/nodes/__init__.py index c51bf4912..803a0a769 100644 --- a/interfaces/python/package/ell/nodes/__init__.py +++ b/interfaces/python/package/ell/nodes/__init__.py @@ -11,6 +11,7 @@ InputNodeList,\ InputPort,\ InputPortIterator, \ +MatrixMatrixMultiplyImplementation, \ Node,\ NodeIterator,\ OutputNode,\ diff --git a/interfaces/python/test/CMakeLists.txt b/interfaces/python/test/CMakeLists.txt index fb467c11b..2e9d3c618 100644 --- a/interfaces/python/test/CMakeLists.txt +++ b/interfaces/python/test/CMakeLists.txt @@ -8,7 +8,7 @@ if(${PYTHON_ENABLED}) file(GLOB test_src RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.py) add_custom_target(${test_name} ALL DEPENDS ${test_src} SOURCES ${test_src}) - add_dependencies(${test_name} pythonlibs) + add_dependencies(${test_name} pythonlibs dspDataFiles _ELL_python) set_property(TARGET ${test_name} PROPERTY FOLDER "tests") # copy the contents of the test directory to build/interfaces/python @@ -20,4 +20,4 @@ if(${PYTHON_ENABLED}) COMMAND ${PYTHON_EXECUTABLE} test.py) set_property(TARGET ${test_name} PROPERTY FOLDER "tests") -endif() # PYTHON_ENABLED \ No newline at end of file +endif() # PYTHON_ENABLED diff --git a/interfaces/python/test/compiled_model_test.py b/interfaces/python/test/compiled_model_test.py index 33465fa2a..bf49d8d81 100644 --- a/interfaces/python/test/compiled_model_test.py +++ b/interfaces/python/test/compiled_model_test.py @@ -157,6 +157,9 @@ def test(): return 1 else: return 0 + if x > (1 - bias) / scale: + return 1 + return (scale * x) + bias if __name__ == '__main__': diff --git a/libraries/common/include/MapCompilerArguments.h b/libraries/common/include/MapCompilerArguments.h index 89542bbff..b579ed51e 100644 --- a/libraries/common/include/MapCompilerArguments.h +++ b/libraries/common/include/MapCompilerArguments.h @@ 
-40,6 +40,7 @@ namespace common bool useBlas = false; bool debug = false; utilities::Optional<bool> positionIndependentCode = false; // for generating -fPIC object code + int globalValueAlignment = 32; // potentially per-node options: bool enableVectorization = true; @@ -67,6 +68,7 @@ namespace common std::string targetArchitecture = ""; std::string targetFeatures = ""; std::string targetDataLayout = ""; + bool skip_ellcode = false; /// Gets a `MapCompilerOptions` with the settings specified in the commandline arguments. /// diff --git a/libraries/common/src/LoadModel.cpp b/libraries/common/src/LoadModel.cpp index 84eae5fa3..0845c9a32 100644 --- a/libraries/common/src/LoadModel.cpp +++ b/libraries/common/src/LoadModel.cpp @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -52,9 +53,11 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -129,11 +132,13 @@ namespace common context.GetTypeFactory().AddType>(); context.GetTypeFactory().AddType>(); context.GetTypeFactory().AddType>(); + context.GetTypeFactory().AddType>(); context.GetTypeFactory().AddType>(); context.GetTypeFactory().AddType>(); context.GetTypeFactory().AddType>(); context.GetTypeFactory().AddType>(); context.GetTypeFactory().AddType>(); + context.GetTypeFactory().AddType>(); context.GetTypeFactory().AddType>(); context.GetTypeFactory().AddType>(); context.GetTypeFactory().AddType>(); @@ -141,6 +146,7 @@ namespace common context.GetTypeFactory().AddType>(); context.GetTypeFactory().AddType>(); context.GetTypeFactory().AddType>(); + context.GetTypeFactory().AddType>(); context.GetTypeFactory().AddType>(); context.GetTypeFactory().AddType>(); context.GetTypeFactory().AddType>(); diff --git a/libraries/common/src/MapCompilerArguments.cpp b/libraries/common/src/MapCompilerArguments.cpp index d66b47bf8..33fd28277 100644 --- a/libraries/common/src/MapCompilerArguments.cpp +++ b/libraries/common/src/MapCompilerArguments.cpp @@ -191,6 +191,20 @@ namespace common { "true", utilities::Optional<bool>(true) }, { "false", utilities::Optional<bool>(false) } }, "auto"); + + parser.AddOption( + globalValueAlignment, + "globalValueAlignment", + "gva", + "The number of bytes to align global buffers to", + 32); + + parser.AddOption( + skip_ellcode, + "skip_ellcode", + "skip_ellcode", + "Skip ELL's generated linear algebra code and use OpenBLAS instead", + false); } model::MapCompilerOptions MapCompilerArguments::GetMapCompilerOptions(const std::string& modelName) const @@ -231,6 +245,8 @@ namespace common settings.profile = profile; settings.compilerSettings.profile = profile; settings.compilerSettings.positionIndependentCode = positionIndependentCode; + settings.compilerSettings.globalValueAlignment = globalValueAlignment; + settings.compilerSettings.skip_ellcode = skip_ellcode; if (target != "") { diff --git a/libraries/emittable_functions/CMakeLists.txt b/libraries/emittable_functions/CMakeLists.txt index fbe9883b1..c3e09e3bd 100644 --- a/libraries/emittable_functions/CMakeLists.txt +++ b/libraries/emittable_functions/CMakeLists.txt @@ -18,14 +18,10 @@ set(include include/VoiceActivityDetector.h ) -set(tcc -) - source_group("src" FILES ${src}) source_group("include" FILES ${include}) -source_group("tcc" FILES ${tcc}) -add_library(${library_name} ${src} ${include} ${tcc}) +add_library(${library_name} ${src} ${include}) target_include_directories(${library_name} PRIVATE include ${ELL_LIBRARIES_DIR}) target_link_libraries(${library_name} PUBLIC value) target_compile_options(${library_name} PUBLIC ${LLVM_COMPILE_OPTIONS})
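Besides the command-line path above, the same two knobs can be fed to the emitter through a PropertyBag, mirroring the GetOrParseEntry calls added to CompilerOptions.cpp later in this patch. A hedged sketch; PropertyBag's map-style operator[] and a CompilerOptions constructor taking a PropertyBag are assumed here:

    // Illustrative only: key names match libraries/emitters/src/CompilerOptions.cpp.
    ell::utilities::PropertyBag properties;
    properties["globalValueAlignment"] = 64;  // bytes; the default is 32
    properties["skip_ellcode"] = true;        // use OpenBLAS instead of ELL's GEMM code
    ell::emitters::CompilerOptions options(properties);  // assumed constructor

diff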
--git a/libraries/emitters/CMakeLists.txt b/libraries/emitters/CMakeLists.txt index e82aa20be..8373c4c54 100644 --- a/libraries/emitters/CMakeLists.txt +++ b/libraries/emitters/CMakeLists.txt @@ -91,19 +91,58 @@ set (include set (templates templates/CppPredictWrapper.in + templates/LLVMEmitterTargets.h.in templates/SwigModule.in templates/SwigPredictPython.in templates/SwigShapeWrappers.in ) +# This is supposed to be overridden on the command line +# As of LLVM 8.0.1, the possible values within the list are: +# AArch64 AMDGPU ARM BPF Hexagon Lanai Mips MSP430 NVPTX PowerPC Sparc SystemZ +# WebAssembly X86 XCore +set(LLVM_EMITTER_TARGETS "X86;ARM" CACHE STRING "List of LLVM emitter targets to support. Default is \"X86;ARM\". Specify 'ALL' to support all targets") +if(LLVM_EMITTER_TARGETS STREQUAL "ALL") + set(LLVM_EMITTER_TARGETS_FINAL ${LLVM_ALL_TARGETS}) +else() + set(LLVM_EMITTER_TARGETS_FINAL ${LLVM_EMITTER_TARGETS}) +endif() + +set(emitter_targets_content "") +set(llvm_emitter_target_libs ) +foreach(LLVM_EMITTER_TARGET ${LLVM_EMITTER_TARGETS_FINAL}) + if(NOT ${LLVM_EMITTER_TARGET} IN_LIST LLVM_ALL_TARGETS) + message(FATAL_ERROR "Unrecognized LLVM emitter target: ${LLVM_EMITTER_TARGET}.\n\nTargets must be one of: ${LLVM_ALL_TARGETS}") + endif() + set(emitter_targets_content "${emitter_targets_content} EMITTER_TARGET_ACTION(${LLVM_EMITTER_TARGET}) \\\n") + set(llvm_emitter_target_libs + ${llvm_emitter_target_libs} + LLVM${LLVM_EMITTER_TARGET}CodeGen + LLVM${LLVM_EMITTER_TARGET}AsmParser + LLVM${LLVM_EMITTER_TARGET}Disassembler + LLVM${LLVM_EMITTER_TARGET}AsmPrinter + LLVM${LLVM_EMITTER_TARGET}Desc + LLVM${LLVM_EMITTER_TARGET}Info + ) +endforeach(LLVM_EMITTER_TARGET LLVM_EMITTER_TARGETS) +configure_file(templates/LLVMEmitterTargets.h.in build/LLVMEmitterTargets.h @ONLY) + source_group("src" FILES ${src}) source_group("include" FILES ${include}) source_group("templates" FILES ${templates}) add_library(${library_name} ${src} ${include} ${templates}) -target_include_directories(${library_name} PRIVATE include templates ${ELL_LIBRARIES_DIR}) +target_include_directories(${library_name} PRIVATE ${CMAKE_CURRENT_BINARY_DIR} include templates ${ELL_LIBRARIES_DIR}) target_include_directories(${library_name} SYSTEM PUBLIC ${LLVM_INCLUDE_DIRS}) -target_link_libraries(${library_name} math utilities ${LLVM_LIBS}) +target_link_libraries( + ${library_name} + math + utilities + + LLVMMCJIT + ${llvm_emitter_target_libs} + LLVMipo ) target_compile_options(${library_name} PUBLIC ${LLVM_COMPILE_OPTIONS}) set_property(TARGET ${library_name} PROPERTY FOLDER "libraries")
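The configured header drives an X-macro over the chosen backends. A hedged sketch of what the default "X86;ARM" configuration expands to and how a consumer might use it; the LLVM_EMITTER_TARGETS macro name inside the generated header is an assumption based on the template expansion above, while LLVMInitialize##target##Target is the stock LLVM initializer naming scheme:

    // Illustrative only: generated build/LLVMEmitterTargets.h for "X86;ARM".
    #define LLVM_EMITTER_TARGETS \
        EMITTER_TARGET_ACTION(X86) \
        EMITTER_TARGET_ACTION(ARM)

    // A consumer supplies EMITTER_TARGET_ACTION and expands the list, e.g. to
    // register only the compiled-in code generators:
    #define EMITTER_TARGET_ACTION(target) LLVMInitialize##target##Target();
    LLVM_EMITTER_TARGETS
    #undef EMITTER_TARGET_ACTION

diff --git a/libraries/emitters/include/CompilerOptions.h b/libraries/emitters/include/CompilerOptions.h index 7b60cef2e..a0a7b68cb 100644 --- a/libraries/emitters/include/CompilerOptions.h +++ b/libraries/emitters/include/CompilerOptions.h @@ -92,6 +92,12 @@ namespace emitters /// The name of the file being compiled. std::string modelFile; + /// The byte alignment to use for global values. + int globalValueAlignment = 32; + + /// Skip ELLCode optimization.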
+ bool skip_ellcode = false; + private: void AddOptions(const utilities::PropertyBag& properties); }; diff --git a/libraries/emitters/include/EmitterTypes.h b/libraries/emitters/include/EmitterTypes.h index dc7d4866c..c92bf80b6 100644 --- a/libraries/emitters/include/EmitterTypes.h +++ b/libraries/emitters/include/EmitterTypes.h @@ -56,6 +56,25 @@ namespace emitters /// Pointer to a Double DoublePointer, + // + // Pointers to pointers + // + VoidPointerPointer, + /// Pointer to a pointer to a character array + Char8PointerPointer, + /// Pointer to a pointer to a byte + BytePointerPointer, + /// Pointer to a pointer to an Int16 + Int16PointerPointer, + /// Pointer to a pointer to an Int32 + Int32PointerPointer, + /// Pointer to a pointer to an Int64 + Int64PointerPointer, + /// Pointer to a pointer to a Float + FloatPointerPointer, + /// Pointer to a pointer to a Double + DoublePointerPointer, + // // Custom Structs // @@ -249,13 +268,20 @@ namespace emitters template <typename ValueType> VariableType GetVariableType(); - /// Gets the value form the VariableType enum that corresponds to a pointer to a given nonpointer type. + /// Gets the value from the VariableType enum that corresponds to a pointer from a given nonpointer type. /// - /// The nonpointer type, such as Short or Double. + /// The nonpointer type, such as Int16 or Double. /// /// A VariableType that corresponds to the pointer to a given type. VariableType GetPointerType(VariableType type); + /// Gets the value from the VariableType enum that corresponds to a nonpointer from a given pointer type. + /// + /// The pointer type, such as Int16Pointer or DoublePointer. + /// + /// A VariableType that corresponds to the nonpointer from a given type. + VariableType GetNonPointerType(VariableType type); + /// Gets the default value for a certain type. /// /// The type.
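A short illustration of the new inverse mapping next to the existing GetPointerType; illustrative only, using the enum values and declarations above:

    using namespace ell::emitters;
    VariableType p = GetPointerType(VariableType::Int16);  // Int16 -> Int16Pointer
    VariableType v = GetNonPointerType(p);                 // Int16Pointer -> Int16
    // Types with no pointer counterpart are returned unchanged, per the
    // default case in the EmitterTypes.cpp implementation later in this patch.

diff --git a/libraries/emitters/include/FunctionDeclaration.h b/libraries/emitters/include/FunctionDeclaration.h index cea7d1b0f..060db2912 100644 --- a/libraries/emitters/include/FunctionDeclaration.h +++ b/libraries/emitters/include/FunctionDeclaration.h @@ -51,6 +51,7 @@ namespace emitters /// Get the LLVM type, if we have it.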
LLVMType GetLLVMType() const { return _llvmType; } + private: std::string _name; VariableType _type; @@ -58,7 +59,7 @@ namespace emitters LLVMType _llvmType; }; - /// Collections of argument flags + /// Collections of argument flags using FunctionArgumentList = std::vector; /// A function definition that defines the name, return type and arguments of a function diff --git a/libraries/emitters/include/IRAssemblyWriter.h b/libraries/emitters/include/IRAssemblyWriter.h index a7368ea9f..4793965c1 100644 --- a/libraries/emitters/include/IRAssemblyWriter.h +++ b/libraries/emitters/include/IRAssemblyWriter.h @@ -45,7 +45,13 @@ namespace emitters OptimizationLevel optimizationLevel = OptimizationLevel::Default; FloatABIType floatABI = FloatABIType::Default; + FloatFusionMode floatFusionMode = FloatFusionMode::Fast; + bool unsafeFPMath = true; + bool noInfsFPMath = true; + bool noNaNsFPMath = true; + bool noSignedZerosFPMath = true; + OutputRelocationModel relocModel = OutputRelocationModel::Static; }; diff --git a/libraries/emitters/include/IREmitter.h b/libraries/emitters/include/IREmitter.h index 87568217f..6cfeb352c 100644 --- a/libraries/emitters/include/IREmitter.h +++ b/libraries/emitters/include/IREmitter.h @@ -42,7 +42,7 @@ namespace emitters using LLVMException = utilities::ErrorCodeException; /// - /// Wraps the LLVM API with an easy to use object model that hides some unncessary detail. + /// Wraps the LLVM API with an easy to use object model that hides some unnecessary detail. /// Incorporates our own x-compiler abstractions such as VariableType and TypedOperator. /// /// Note: IREmitter is stateful. It has a "current block" that it is emitting IR into. @@ -53,7 +53,6 @@ namespace emitters IREmitter(const IREmitter&) = delete; IREmitter(IREmitter&&) = default; IREmitter& operator=(const IREmitter&) = delete; - IREmitter& operator=(IREmitter&&) = default; ~IREmitter() = default; /// Get the LLVM Type information for a VariableType. @@ -1115,7 +1114,7 @@ namespace emitters private: friend class IRModuleEmitter; - IREmitter(IRModuleEmitter& moduleEmitter, llvm::LLVMContext& context); + IREmitter(llvm::LLVMContext& context, llvm::Module& module); LLVMType GetBaseVariableType(VariableType type) const; llvm::Constant* Integer(VariableType type, const size_t value); @@ -1131,7 +1130,7 @@ namespace emitters LLVMFunction CreateFunction(llvm::Module* pModule, const std::string& name, llvm::Function::LinkageTypes linkage, llvm::FunctionType* pFunctionType); LLVMValue Zero(); - IRModuleEmitter& _moduleEmitter; + llvm::Module& _module; llvm::LLVMContext& _llvmContext; // LLVM global context mutable llvm::IRBuilder<> _irBuilder; // IRBuilder API IRValueTable _stringLiterals; // String literals are emitted as constants. We have to track them ourselves to prevent dupes. diff --git a/libraries/emitters/include/IRExecutionEngine.h b/libraries/emitters/include/IRExecutionEngine.h index 423c62e17..a12b29e14 100644 --- a/libraries/emitters/include/IRExecutionEngine.h +++ b/libraries/emitters/include/IRExecutionEngine.h @@ -37,13 +37,13 @@ namespace emitters /// /// The module. /// Indicates if the execution engine should run a verification pass before running the code. - IRExecutionEngine(IRModuleEmitter&& module, bool verify = false); + IRExecutionEngine(IRModuleEmitter&& module, bool verify = true, llvm::CodeGenOpt::Level optLevel = llvm::CodeGenOpt::Level::Default); /// Inject the primary "owner" module into the execution engine. /// /// The module. 
/// Indicates if the execution engine should run a verification pass before running the code. - IRExecutionEngine(std::unique_ptr<llvm::Module> pModule, bool verify = false); + IRExecutionEngine(std::unique_ptr<llvm::Module> pModule, bool verify = false, llvm::CodeGenOpt::Level optLevel = llvm::CodeGenOpt::Level::Default); /// Destructor ~IRExecutionEngine(); @@ -103,6 +103,12 @@ namespace emitters /// The address of the function being defined. void DefineFunction(LLVMFunction func, utilities::UIntPtrT address); + /// Set the address of a named function already defined elsewhere in the binary. + /// + /// The function name being defined. + /// The address of the function being referenced. + void DefineFunction(const std::string& name, utilities::UIntPtrT address); + /// /// Return a main function that takes no arguments - if one exists. Returns nullptr if not found. /// diff --git a/libraries/emitters/include/IRFunctionEmitter.h b/libraries/emitters/include/IRFunctionEmitter.h index 6a1cc5d82..392d8c0b6 100644 --- a/libraries/emitters/include/IRFunctionEmitter.h +++ b/libraries/emitters/include/IRFunctionEmitter.h @@ -2,7 +2,7 @@ // // Project: Embedded Learning Library (ELL) // File: IRFunctionEmitter.h (emitters) -// Authors: Umesh Madan, Chuck Jacobs +// Authors: Umesh Madan, Chuck Jacobs, Kern Handa // //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -25,6 +25,8 @@ #include "LLVMUtilities.h" #include "Variable.h" +#include + #include #include #include @@ -33,10 +35,12 @@ #include #include +#include #include #include #include #include +#include #include namespace ell @@ -48,6 +52,15 @@ namespace emitters /// A list of IRLocalScalar values using IRScalarList = std::vector<IRLocalScalar>; + /// Helper enum used to specify whether a FunctionDeclaration should be inlined + enum class FunctionInlining + { + defaultInline, + always, + prefer, + never + }; + /// Used to emit code into an existing LLVM IR Function class IRFunctionEmitter { @@ -57,7 +70,11 @@ namespace emitters { None = 0, /// Suppress alias analysis - NoAlias + NoAlias, + /// This indicates that the parameter or return pointer is dereferenceable. + /// This attribute may only be applied to pointer typed parameters. A pointer that is + /// dereferenceable can be loaded from speculatively without a risk of trapping. + Dereferenceable }; /// Query if this IRFunctionEmitter is valid. @@ -705,18 +722,19 @@ namespace emitters /// /// The index of the argument /// The attribute - void SetAttributeForArgument(size_t index, Attributes attribute); + /// Any extra information that the attribute might make use of + void SetAttributeForArgument(size_t index, Attributes attribute, const std::any& extra = {}); /// Sets an attribute for all arguments /// /// The attribute - void SetAttributeForArguments(Attributes attribute); + void SetAttributeForArguments(Attributes attribute, const std::any& extra = {}); /// Sets an attribute for arguments at the specified indices /// /// The indices of the arguments /// The attribute - void SetAttributeForArguments(std::vector<size_t> indices, Attributes attribute); + void SetAttributeForArguments(std::vector<size_t> indices, Attributes attribute, const std::any& extra = {}); /// Emit a stack variable. /// @@ -756,15 +774,6 @@ namespace emitters /// Pointer to the array. llvm::AllocaInst* Variable(VariableType type, int size);
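A hedged sketch of the extended argument-attribute API declared above; the std::any payload for Dereferenceable is assumed here to carry the dereferenceable byte count:

    #include <any>
    using namespace ell::emitters;

    // Illustrative only: mark every argument no-alias, and argument 0 as
    // safely dereferenceable for 256 bytes.
    void Annotate(IRFunctionEmitter& function)
    {
        function.SetAttributeForArguments(IRFunctionEmitter::Attributes::NoAlias);
        function.SetAttributeForArgument(0, IRFunctionEmitter::Attributes::Dereferenceable,
                                         std::any{ size_t{ 256 } });
    }

- /// Emit a 2D stack array of the given dimensions. - /// - /// The array entry type. - /// The number of rows in the array.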
- /// - /// Pointer to the array. - llvm::AllocaInst* Variable(VariableType type, int rows, int columns); - /// Emit a stack array of the given size. /// /// The array entry type. @@ -773,15 +782,6 @@ namespace emitters /// Pointer to the array. llvm::AllocaInst* Variable(LLVMType type, int size); - /// Emit a 2D stack array of the given dimensions. - /// - /// The array entry type. - /// The number of rows in the array. - /// The number of columns in the array. - /// - /// Pointer to the array. - llvm::AllocaInst* Variable(LLVMType type, int rows, int columns); - /// Return an emitted stack variable and assign it a name. /// /// The variable type. @@ -1032,12 +1032,26 @@ namespace emitters /// A function that emits the body of the loop. void For(int count, ForLoopBodyFunction body); + /// Emits a for loop counting from zero to a constant end value. + /// + /// Tag to use when naming the basic block regions + /// The number of iterations to make. + /// A function that emits the body of the loop. + void For(const std::string& tag, int count, ForLoopBodyFunction body); + /// Emits a for loop counting from zero to a constant end value. /// /// The number of iterations to make. /// A function that emits the body of the loop. void For(LLVMValue count, ForLoopBodyFunction body); + /// Emits a for loop counting from zero to a constant end value. + /// + /// Tag to use when naming the basic block regions + /// The number of iterations to make. + /// A function that emits the body of the loop. + void For(const std::string& tag, LLVMValue count, ForLoopBodyFunction body); + /// Emits a for loop counting from a begin value up to (but not including) a constant end value. /// /// The starting value of the loop iterator. @@ -1045,6 +1059,14 @@ namespace emitters /// A function that emits the body of the loop. void For(int beginValue, int endValue, ForLoopBodyFunction body); + /// Emits a for loop counting from a begin value up to (but not including) a constant end value. + /// + /// Tag to use when naming the basic block regions + /// The starting value of the loop iterator. + /// The ending value of the loop iterator. + /// A function that emits the body of the loop. + void For(const std::string& tag, int beginValue, int endValue, ForLoopBodyFunction body); + /// Emits a for loop counting from a begin value up to (but not including) a constant end value. /// /// The starting value of the loop iterator. @@ -1052,6 +1074,14 @@ namespace emitters /// A function that emits the body of the loop. void For(LLVMValue beginValue, LLVMValue endValue, ForLoopBodyFunction body); + /// Emits a for loop counting from a begin value up to (but not including) a constant end value. + /// + /// Tag to use when naming the basic block regions + /// The starting value of the loop iterator. + /// The ending value of the loop iterator. + /// A function that emits the body of the loop. + void For(const std::string& tag, LLVMValue beginValue, LLVMValue endValue, ForLoopBodyFunction body); + /// Emits a for loop counting from a begin value up to (but not including) a constant end value with a given increment. /// /// The starting value of the loop iterator. @@ -1060,6 +1090,15 @@ namespace emitters /// A function that emits the body of the loop. void For(int beginValue, int endValue, int increment, ForLoopBodyFunction body); + /// Emits a for loop counting from a begin value up to (but not including) a constant end value with a given increment. 
+ /// + /// Tag to use when naming the basic block regions + /// The starting value of the loop iterator. + /// The ending value of the loop iterator. + /// The increment for the iterator. + /// A function that emits the body of the loop. + void For(const std::string& tag, int beginValue, int endValue, int increment, ForLoopBodyFunction body); + /// Emits a for loop counting from a begin value up to (but not including) a constant end value with a given increment. /// /// The starting value of the loop iterator. @@ -1068,6 +1107,15 @@ namespace emitters /// A function that emits the body of the loop. void For(LLVMValue beginValue, LLVMValue endValue, LLVMValue increment, ForLoopBodyFunction body); + /// Emits a for loop counting from a begin value up to (but not including) a constant end value with a given increment. + /// + /// Tag to use when naming the basic block regions + /// The starting value of the loop iterator. + /// The ending value of the loop iterator. + /// The increment for the iterator. + /// A function that emits the body of the loop. + void For(const std::string& tag, LLVMValue beginValue, LLVMValue endValue, LLVMValue increment, ForLoopBodyFunction body); + // // Extended for loops // @@ -1078,36 +1126,78 @@ namespace emitters /// A function that emits the body of the loop. void For(const std::vector<ConstLoopRange>& ranges, MultiDimForLoopBodyFunction body); + /// Emits a set of nested for loops, each counting from a begin value up to (but not including) an end value. + /// + /// Tag to use when naming the basic block regions + /// The range objects describing the ranges to iterate over (begin, end). + /// A function that emits the body of the loop. + void For(const std::string& tag, const std::vector<ConstLoopRange>& ranges, MultiDimForLoopBodyFunction body); + /// Emits a set of nested for loops, each counting from a begin value up to (but not including) an end value. /// /// The range objects describing the ranges to iterate over (begin, end). /// A function that emits the body of the loop. void For(const std::vector<LoopRange>& ranges, MultiDimForLoopBodyFunction body); + /// Emits a set of nested for loops, each counting from a begin value up to (but not including) an end value. + /// + /// Tag to use when naming the basic block regions + /// The range objects describing the ranges to iterate over (begin, end). + /// A function that emits the body of the loop. + void For(const std::string& tag, const std::vector<LoopRange>& ranges, MultiDimForLoopBodyFunction body); + /// Emits a tiled for loop counting from a begin value up to (but not including) an end value with a given increment. /// /// The range object describing the range to iterate over (begin, end, and increment). /// A function that emits the body of the loop. void For(ConstTiledLoopRange range, TiledForLoopBodyFunction body); + /// Emits a tiled for loop counting from a begin value up to (but not including) an end value with a given increment. + /// + /// Tag to use when naming the basic block regions + /// The range object describing the range to iterate over (begin, end, and increment). + /// A function that emits the body of the loop. + void For(const std::string& tag, ConstTiledLoopRange range, TiledForLoopBodyFunction body); + /// Emits a tiled for loop counting from a begin value up to (but not including) an end value with a given increment. /// /// The range object describing the range to iterate over (begin, end, and increment). /// A function that emits the body of the loop.
void For(TiledLoopRange range, TiledForLoopBodyFunction body); + /// Emits a tiled for loop counting from a begin value up to (but not including) an end value with a given increment. + /// + /// Tag to use when naming the basic block regions + /// The range object describing the range to iterate over (begin, end, and increment). + /// A function that emits the body of the loop. + void For(const std::string& tag, TiledLoopRange range, TiledForLoopBodyFunction body); + /// Emits a set of nested tiled for loops, each counting from a begin value up to (but not including) an end value, with a given increment. /// /// The range objects describing the ranges to iterate over (begin, end, increment). /// A function that emits the body of the loop. void For(const std::vector<ConstTiledLoopRange>& ranges, TiledMultiDimForLoopBodyFunction body); + /// Emits a set of nested tiled for loops, each counting from a begin value up to (but not including) an end value, with a given increment. + /// + /// Tag to use when naming the basic block regions + /// The range objects describing the ranges to iterate over (begin, end, increment). + /// A function that emits the body of the loop. + void For(const std::string& tag, const std::vector<ConstTiledLoopRange>& ranges, TiledMultiDimForLoopBodyFunction body); + /// Emits a set of nested tiled for loops, each counting from a begin value up to (but not including) an end value, with a given increment. /// /// The range objects describing the ranges to iterate over (begin, end, increment). /// A function that emits the body of the loop. void For(const std::vector<TiledLoopRange>& ranges, TiledMultiDimForLoopBodyFunction body); + /// Emits a set of nested tiled for loops, each counting from a begin value up to (but not including) an end value, with a given increment. + /// + /// Tag to use when naming the basic block regions + /// The range objects describing the ranges to iterate over (begin, end, increment). + /// A function that emits the body of the loop. + void For(const std::string& tag, const std::vector<TiledLoopRange>& ranges, TiledMultiDimForLoopBodyFunction body); + /// Emits a parallel for loop counting from zero to a constant end value. /// /// The number of iterations to make. @@ -1166,11 +1256,24 @@ namespace emitters /// Emits a while loop. /// - /// A function the emits code returning a single-bit boolean test value + /// Tag to use when naming the basic block regions + /// Pointer to a memory location that will be dereferenced for the test value. + /// A function that emits the body of the loop. + void While(const std::string& tag, LLVMValue pTestValuePointer, WhileLoopBodyFunction body); + + /// Emits a while loop. /// + /// A function that emits code returning a single-bit boolean test value /// A function that emits the body of the loop. void While(std::function<LLVMValue()> condition, WhileLoopBodyFunction body); + /// Emits a while loop. + /// + /// Tag to use when naming the basic block regions + /// A function that emits code returning a single-bit boolean test value + /// A function that emits the body of the loop. + void While(const std::string& tag, std::function<LLVMValue()> condition, WhileLoopBodyFunction body); +
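The tag overloads above name the emitted basic blocks, which makes nested loops far easier to follow in the printed IR. A hedged usage sketch, mirroring the lambda style this patch itself uses in IRFunctionEmitter.cpp; callback parameter types are elided with auto:

    // Illustrative only: blocks for the outer and inner loop get names derived
    // from "rows" and "cols" instead of anonymous numbering.
    function.For("rows", 0, 64, [&](auto& fn, auto i) {
        fn.For("cols", 0, 32, [&](auto& inner, auto j) {
            // ... emit the body using indices i and j ...
        });
    });

/// Emits an if statement. /// Pointer to a memory location that will be dereferenced for the test value. @@ -1312,6 +1415,13 @@ namespace emitters /// Pointer to the return value of the call to the printf function.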
+ LLVMValue Printf(std::vector<LLVMValue> arguments); + /// Emits a printf call. /// /// Describes the printf format to use. @@ -1636,6 +1746,9 @@ namespace emitters /// Gets the CPU id of the currently-running thread. Currently only available on Linux. Returns -1 if unavailable. LLVMValue GetCpu(); + /// Emits a system call to trap into the debugger + void DebugBreak(); + // // Information about the current function begin emitted // @@ -1701,6 +1814,12 @@ namespace emitters /// Tags a profiling function to be included in the SWIG interface. void IncludeInSwigInterface(); + /// Tags a function to be inlined or not, depending on the value passed in. + void SetInlineState(FunctionInlining inlineState); + + /// Tags an LLVM function pointer to be inlined or not, depending on the value passed in. + static void SetInlineState(LLVMFunction function, FunctionInlining inlineState); + private: friend class IRModuleEmitter; @@ -1713,13 +1832,11 @@ namespace emitters { public: EntryBlockScope(IRFunctionEmitter& function); - void ExitScope(); ~EntryBlockScope(); private: IRFunctionEmitter& _function; llvm::IRBuilder<>::InsertPoint _oldPos; - bool _inScope = true; }; LLVMValue PtrOffsetA(LLVMValue pPointer, int offset); @@ -1735,6 +1852,21 @@ namespace emitters LLVMValue SetValueAtH(LLVMValue pPointer, int offset, LLVMValue pValue); llvm::BasicBlock* GetEntryBlock() { return _entryBlock; } + + template <typename FunctionType> + auto ExecuteInEntryBlock(FunctionType&& fn) -> utilities::FunctionReturnType<FunctionType> + { + EntryBlockScope scope(*this); + if constexpr (utilities::HasReturnValue<FunctionType>()) + { + return fn(); + } + else + { + fn(); + } + } + void SetUpFunction(); void RegisterFunctionArgs(const NamedVariableTypeList& args); diff --git a/libraries/emitters/include/IRIfEmitter.h b/libraries/emitters/include/IRIfEmitter.h index 658bf533b..710cd3ff3 100644 --- a/libraries/emitters/include/IRIfEmitter.h +++ b/libraries/emitters/include/IRIfEmitter.h @@ -79,10 +79,10 @@ namespace emitters IRIfEmitter& operator=(const IRIfEmitter&) = delete; /// Move constructor - IRIfEmitter(IRIfEmitter&& other); + IRIfEmitter(IRIfEmitter&& other) noexcept; /// Move assignment operator - IRIfEmitter& operator=(IRIfEmitter&& other); + IRIfEmitter& operator=(IRIfEmitter&& other) noexcept; /// Emits an 'if' block. ///
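Two of the smaller additions in the IRFunctionEmitter hunk above, SetInlineState and DebugBreak, combine naturally when debugging generated code. A hedged sketch using only the members declared there:

    using namespace ell::emitters;

    // Illustrative only: keep the function out-of-line and trap on entry so a
    // native debugger stops inside the emitted code.
    void InstrumentForDebugging(IRFunctionEmitter& function)
    {
        function.SetInlineState(FunctionInlining::never);
        function.DebugBreak();
    }

diff --git a/libraries/emitters/include/IRLocalValue.h b/libraries/emitters/include/IRLocalValue.h index 589cc85a2..759ddcb24 100644 --- a/libraries/emitters/include/IRLocalValue.h +++ b/libraries/emitters/include/IRLocalValue.h @@ -18,6 +18,7 @@ namespace emitters { class IRFunctionEmitter; struct IRLocalValue; + struct IRLocalScalar; namespace detail { @@ -63,9 +64,6 @@ namespace emitters /// The LLVMValue being wrapped.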
LLVMValue value; - - private: - IRLocalValue() = default; }; /// @@ -74,6 +72,11 @@ namespace emitters struct IRLocalPointer : public IRLocalValue { using IRLocalValue::IRLocalValue; + + IRLocalValue Load() const; + IRLocalPointer Offset(int offset) const; + IRLocalPointer Offset(LLVMValue offset) const; + IRLocalPointer Offset(const IRLocalScalar& offset) const; }; } // namespace emitters } // namespace ell diff --git a/libraries/emitters/include/IRLoopEmitter.h b/libraries/emitters/include/IRLoopEmitter.h index 6b451f39f..d2341be5f 100644 --- a/libraries/emitters/include/IRLoopEmitter.h +++ b/libraries/emitters/include/IRLoopEmitter.h @@ -25,7 +25,7 @@ namespace emitters { public: virtual ~IRLoopEmitter() = default; - + protected: IRLoopEmitter(IRFunctionEmitter& functionEmitter); void AddLoopMetadata(llvm::BranchInst* branch, bool unroll, bool parallel); @@ -40,7 +40,8 @@ namespace emitters /// Constructs an instance of IRForLoopEmitter. /// /// The function emitter. - IRForLoopEmitter(IRFunctionEmitter& functionEmitter); + /// Optional, tag to use when naming the basic block regions + IRForLoopEmitter(IRFunctionEmitter& functionEmitter, const std::string& tag = ""); /// Gets the block containing the body of the for loop. /// @@ -105,6 +106,7 @@ namespace emitters llvm::BasicBlock* _pIncrementBlock = nullptr; // Here we increment the iteration variable llvm::BasicBlock* _pAfterBlock = nullptr; // When the loop is done, we branch to this block LLVMValue _pIterationVariable = nullptr; + std::string _tag; }; /// Class that simplifies while loop creation. Used internally by IRFunctionEmitter. @@ -116,7 +118,8 @@ namespace emitters /// Constructs an instance of IRWhileLoopEmitter. /// /// The function emitter. - IRWhileLoopEmitter(IRFunctionEmitter& functionEmitter); + /// Optional, tag to use when naming the basic block regions + IRWhileLoopEmitter(IRFunctionEmitter& functionEmitter, const std::string& tag = ""); /// Emits the beginning of a while loop that uses a mutable test value. /// @@ -150,6 +153,7 @@ namespace emitters llvm::BasicBlock* _pConditionBlock = nullptr; // Here we do the loop termination check llvm::BasicBlock* _pBodyBlock = nullptr; // The body of the loop llvm::BasicBlock* _pAfterBlock = nullptr; // When the loop is done, we branch to this block + std::string _tag; }; } // namespace emitters } // namespace ell diff --git a/libraries/emitters/include/IRModuleEmitter.h b/libraries/emitters/include/IRModuleEmitter.h index b7e77b8e1..570b96bad 100644 --- a/libraries/emitters/include/IRModuleEmitter.h +++ b/libraries/emitters/include/IRModuleEmitter.h @@ -57,9 +57,9 @@ namespace emitters IRModuleEmitter(const std::string& moduleName, const CompilerOptions& parameters); IRModuleEmitter(const IRModuleEmitter&) = delete; - IRModuleEmitter(IRModuleEmitter&&) = default; + IRModuleEmitter(IRModuleEmitter&&) = delete; IRModuleEmitter& operator=(const IRModuleEmitter&) = delete; - IRModuleEmitter& operator=(IRModuleEmitter&&) = default; + IRModuleEmitter& operator=(IRModuleEmitter&&) = delete; // // Properties of the module @@ -87,17 +87,17 @@ namespace emitters /// Returns the runtime object that manages functions. /// /// Reference to the `IRRuntime`. - IRRuntime& GetRuntime() { return _runtime; } + IRRuntime& GetRuntime() { return *_runtime; } /// Gets a reference to the profiler. /// /// Reference to the `IRProfiler` object for this module. 
- IRProfiler& GetProfiler() { return _profiler; } + IRProfiler& GetProfiler() { return *_profiler; } /// Gets a reference to the underlying IREmitter. /// /// Reference to the underlying IREmitter. - IREmitter& GetIREmitter() { return _emitter; } + IREmitter& GetIREmitter() { return *_emitter; } /// Can this module emitter still be used to add functions to the module? /// @@ -240,74 +240,82 @@ namespace emitters /// /// The variable type. /// The name of the variable. + /// Specifies whether the global memory address is unique to each thread /// /// Pointer to the llvm::GlobalVariable that represents the variable. - llvm::GlobalVariable* Global(VariableType type, const std::string& name); + llvm::GlobalVariable* Global(VariableType type, const std::string& name, bool isThreadLocal = false); /// Emit a named global variable of the given type. /// /// Pointer to the runtime value that contains the variable type. /// The name of the variable. + /// Specifies whether the global memory address is unique to each thread /// /// Pointer to the llvm::GlobalVariable that represents the variable. - llvm::GlobalVariable* Global(LLVMType pType, const std::string& name); + llvm::GlobalVariable* Global(LLVMType pType, const std::string& name, bool isThreadLocal = false); /// Emit a named global variable of a template type. /// /// The variable type. /// The name of the variable. /// The initial value of the variable. + /// Specifies whether the global memory address is unique to each thread /// /// Pointer to the llvm::GlobalVariable that represents the variable. template <typename ValueType> - llvm::GlobalVariable* Global(const std::string& name, ValueType value); + llvm::GlobalVariable* Global(const std::string& name, ValueType value, bool isThreadLocal = false); /// Emit a named global variable of Pointer type, initialized to nullptr. /// /// The variable type. /// The name of the variable. /// The variable type. + /// Specifies whether the global memory address is unique to each thread /// /// Pointer to the llvm::GlobalVariable that represents the variable. - llvm::GlobalVariable* GlobalPointer(const std::string& name, VariableType type); + llvm::GlobalVariable* GlobalPointer(const std::string& name, VariableType type, bool isThreadLocal = false); /// Emit a named global array of the given type and size. /// /// The variable type. /// The name of the variable. /// The array size. + /// Specifies whether the global memory address is unique to each thread /// /// Pointer to the llvm::GlobalVariable that represents the variable. - llvm::GlobalVariable* GlobalArray(VariableType type, const std::string& name, const size_t size); + llvm::GlobalVariable* GlobalArray(VariableType type, const std::string& name, const size_t size, bool isThreadLocal = false); /// Emit a named global array of the given type and size. /// /// The name of the variable. /// Pointer to the runtime value that contains the variable type. /// The array size. + /// Specifies whether the global memory address is unique to each thread /// /// Pointer to the llvm::GlobalVariable that represents the variable. - llvm::GlobalVariable* GlobalArray(const std::string& name, LLVMType pType, const size_t size); + llvm::GlobalVariable* GlobalArray(const std::string& name, LLVMType pType, const size_t size, bool isThreadLocal = false);
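A hedged sketch of the new isThreadLocal flag: each thread sees its own copy of the global, and the JIT path pairs with this by enabling emulated TLS in IRExecutionEngine.cpp later in this patch. The surrounding IRModuleEmitter instance ("module") is assumed:

    // Illustrative only: a per-thread scratch buffer of 256 doubles.
    llvm::GlobalVariable* scratch =
        module.GlobalArray<double>("g_threadScratch", 256, /*isThreadLocal=*/ true);

/// Emit a zero-initialized named, module scoped array of a template type. /// /// Type of each array entry. /// The name of the variable. /// The size of the array.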
+ /// Specifies whether the global memory address is unique to each thread /// /// Pointer to the llvm::GlobalVariable that represents the variable. template <typename ValueType> - llvm::GlobalVariable* GlobalArray(const std::string& name, size_t size); + llvm::GlobalVariable* GlobalArray(const std::string& name, size_t size, bool isThreadLocal = false); /// Emit a named, module scoped array of a template type. /// /// Type of each array entry. /// The name of the variable. /// The value of the array. + /// Specifies whether the global memory address is unique to each thread /// /// Pointer to the llvm::GlobalVariable that represents the variable. template <typename ValueType> - llvm::GlobalVariable* GlobalArray(const std::string& name, const std::vector<ValueType>& value); + llvm::GlobalVariable* GlobalArray(const std::string& name, const std::vector<ValueType>& value, bool isThreadLocal = false); // // Functions // @@ -355,6 +363,13 @@ namespace emitters /// Pointer to an llvm::Function that represents the requested function, or nullptr if it doesn't exist. LLVMFunction GetFunction(const std::string& name) const; + /// Get an LLVM intrinsic function taking no arguments with the given id. + /// + /// The intrinsic function identifier. + /// + /// Pointer to an llvm::Function that represents the requested function. + LLVMFunction GetIntrinsic(llvm::Intrinsic::ID id); + /// Get an LLVM intrinsic function with the given id and signature. /// /// The intrinsic function identifier. @@ -531,6 +546,31 @@ namespace emitters /// The IR text. void LoadIR(const std::string& text); + /// Load LLVM IR from the given stream + /// + /// The stream that serves as the input + void LoadIR(std::istream& stream); + + /// Load LLVM IR from the file into this module. + /// + /// The name of the file containing the IR + void LoadIRFromFile(const std::string& filename); + + /// Load Assembler text into this module. + /// + /// The IR text. + void LoadAsm(const std::string& text); + + /// Load Assembler from the given stream + /// + /// The stream that serves as the input + void LoadAsm(std::istream& stream); + + /// Load Assembler from the file into this module.
+ /// + /// The name of the file containing the Assembler text + void LoadAsmFromFile(const std::string& filename); + // // Optimization // @@ -752,7 +792,7 @@ namespace emitters void InsertFunctionMetadata(LLVMFunction function, const std::string& tag, const std::vector<std::string>& value = { "" }); // Get a reference to the thread pool - IRThreadPool& GetThreadPool() { return _threadPool; } + IRThreadPool& GetThreadPool() { return *_threadPool; } // Actual code output implementations void WriteHeader(std::ostream& stream); @@ -762,18 +802,13 @@ namespace emitters // Lower-level internal functions // void SetCompilerOptions(const CompilerOptions& parameters) override; - llvm::GlobalVariable* AddGlobal(const std::string& name, LLVMType pType, llvm::Constant* pInitial, bool isConst); + llvm::GlobalVariable* AddGlobal(const std::string& name, LLVMType pType, llvm::Constant* pInitial, bool isConst, bool isThreadLocal = false); IRFunctionEmitter Function(const std::string& name, VariableType returnType, const VariableTypeList* pArguments, bool isPublic); llvm::Function::LinkageTypes Linkage(bool isPublic); llvm::ConstantAggregateZero* ZeroInitializer(LLVMType pType); static void CompleteCompilerOptions(CompilerOptions& parameters); void SetTargetTriple(const std::string& triple); - - // - // LLVM global state management - // - void InitializeLLVM(); - static llvm::PassRegistry* InitializeGlobalPassRegistry(); + MachineCodeOutputOptions GetMachineCodeOutputOptions() const; // // Data members // @@ -781,14 +816,14 @@ std::unique_ptr<llvm::LLVMContext> _llvmContext; // LLVM global context std::unique_ptr<llvm::Module> _llvmModule; // The LLVM Module being emitted std::unique_ptr<IRDiagnosticHandler> _diagnosticHandler = nullptr; - IREmitter _emitter; + std::unique_ptr<IREmitter> _emitter; std::stack<std::pair<IRFunctionEmitter, llvm::IRBuilder<>::InsertPoint>> _functionStack; // contains the location we were emitting code into when we paused to emit a new function IRValueTable _literals; // Symbol table - name to literals IRValueTable _globals; // Symbol table - name to global variables - IRRuntime _runtime; // Manages emission of runtime functions - IRThreadPool _threadPool; // A pool of worker threads -- gets initialized the first time it's used (?) - IRProfiler _profiler; + std::unique_ptr<IRRuntime> _runtime; // Manages emission of runtime functions + std::unique_ptr<IRThreadPool> _threadPool; // A pool of worker threads -- gets initialized the first time it's used (?)
+ std::unique_ptr<IRProfiler> _profiler; int _globalStringIndex = 0; // Info to modify how code is written out @@ -824,31 +859,31 @@ namespace emitters template <typename ValueType> llvm::GlobalVariable* IRModuleEmitter::Constant(const std::string& name, ValueType value) { - return AddGlobal(name, _emitter.Type(GetVariableType<ValueType>()), _emitter.Literal(value), true); + return AddGlobal(name, GetIREmitter().Type(GetVariableType<ValueType>()), GetIREmitter().Literal(value), true); } template <typename ValueType> - llvm::GlobalVariable* IRModuleEmitter::Global(const std::string& name, ValueType value) + llvm::GlobalVariable* IRModuleEmitter::Global(const std::string& name, ValueType value, bool isThreadLocal) { - return AddGlobal(name, _emitter.Type(GetVariableType<ValueType>()), _emitter.Literal(value), false); + return AddGlobal(name, GetIREmitter().Type(GetVariableType<ValueType>()), GetIREmitter().Literal(value), false, isThreadLocal); } template <typename ValueType> llvm::GlobalVariable* IRModuleEmitter::ConstantArray(const std::string& name, const std::vector<ValueType>& value) { - return AddGlobal(name, _emitter.ArrayType(GetVariableType<ValueType>(), value.size()), _emitter.Literal(value), true); + return AddGlobal(name, GetIREmitter().ArrayType(GetVariableType<ValueType>(), value.size()), GetIREmitter().Literal(value), true); } template <typename ValueType> - llvm::GlobalVariable* IRModuleEmitter::GlobalArray(const std::string& name, size_t size) + llvm::GlobalVariable* IRModuleEmitter::GlobalArray(const std::string& name, size_t size, bool isThreadLocal) { - return GlobalArray(GetVariableType<ValueType>(), name, size); + return GlobalArray(GetVariableType<ValueType>(), name, size, isThreadLocal); } template <typename ValueType> - llvm::GlobalVariable* IRModuleEmitter::GlobalArray(const std::string& name, const std::vector<ValueType>& value) + llvm::GlobalVariable* IRModuleEmitter::GlobalArray(const std::string& name, const std::vector<ValueType>& value, bool isThreadLocal) { - return AddGlobal(name, _emitter.ArrayType(GetVariableType<ValueType>(), value.size()), _emitter.Literal(value), false); + return AddGlobal(name, GetIREmitter().ArrayType(GetVariableType<ValueType>(), value.size()), GetIREmitter().Literal(value), false, isThreadLocal); } // diff --git a/libraries/emitters/include/IRPosixRuntime.h b/libraries/emitters/include/IRPosixRuntime.h index d68decf38..5c7adace0 100644 --- a/libraries/emitters/include/IRPosixRuntime.h +++ b/libraries/emitters/include/IRPosixRuntime.h @@ -161,10 +161,12 @@ namespace emitters // GetPthreadAttrType // GetPthreadOnceType + IRPosixRuntime(const IRPosixRuntime&) = delete; + private: friend IRModuleEmitter; friend IRRuntime; - IRPosixRuntime(IRModuleEmitter& module); + explicit IRPosixRuntime(IRModuleEmitter& module); LLVMType GetIntType(); // returns LLVM type for native `int` LLVMType GetPointerSizedIntType(); // returns LLVM type for an int the size of a pointer diff --git a/libraries/emitters/include/IRRuntime.h b/libraries/emitters/include/IRRuntime.h index 36774cc03..03e3da461 100644 --- a/libraries/emitters/include/IRRuntime.h +++ b/libraries/emitters/include/IRRuntime.h @@ -88,6 +88,12 @@ namespace emitters template <typename ValueType> LLVMFunction GetTanhFunction(); + /// Get the fma function + /// + /// An LLVM function pointer to the function.
+ template <typename ValueType> + LLVMFunction GetFmaFunction(); + // emitter types LLVMFunction GetSqrtFunction(VariableType argType); LLVMFunction GetAbsFunction(VariableType argType); @@ -103,6 +109,7 @@ namespace emitters LLVMFunction GetFloorFunction(VariableType argType); LLVMFunction GetCeilFunction(VariableType argType); LLVMFunction GetCopySignFunction(VariableType argType); + LLVMFunction GetFmaFunction(VariableType argType); // llvm types LLVMFunction GetSqrtFunction(LLVMType argType); @@ -119,6 +126,7 @@ namespace emitters LLVMFunction GetFloorFunction(LLVMType argType); LLVMFunction GetCeilFunction(LLVMType argType); LLVMFunction GetCopySignFunction(LLVMType argType); + LLVMFunction GetFmaFunction(LLVMType argType); LLVMFunction GetPrefetchFunction(); @@ -172,7 +180,7 @@ namespace emitters private: friend IRModuleEmitter; - IRRuntime(IRModuleEmitter& module); + explicit IRRuntime(IRModuleEmitter& module); LLVMType GetIntType(); // returns LLVM type for native `int` diff --git a/libraries/emitters/include/LLVMUtilities.h b/libraries/emitters/include/LLVMUtilities.h index 698184208..405fec92e 100644 --- a/libraries/emitters/include/LLVMUtilities.h +++ b/libraries/emitters/include/LLVMUtilities.h @@ -26,6 +26,9 @@ namespace emitters /// Nice name for llvm::Function pointers. using LLVMFunction = llvm::Function*; + /// Nice name for llvm::FunctionType pointers. + using LLVMFunctionType = llvm::FunctionType*; + /// Nice name for llvm::Type pointers. using LLVMType = llvm::Type*; @@ -74,5 +77,8 @@ namespace emitters /// The VariableType or VariableType::Custom for anything that doesn't map. VariableType ToVariableType(LLVMType type); + /// Initializes LLVM + void InitializeLLVM(); + } // namespace emitters } // namespace ell diff --git a/libraries/emitters/include/ModuleEmitter.h b/libraries/emitters/include/ModuleEmitter.h index dd44a9c08..045b24b87 100644 --- a/libraries/emitters/include/ModuleEmitter.h +++ b/libraries/emitters/include/ModuleEmitter.h @@ -40,7 +40,7 @@ namespace emitters /// Return the base compiler settings /// /// The settings for the compiler - CompilerOptions GetCompilerOptions() const { return _options; } + const CompilerOptions& GetCompilerOptions() const { return _options; } // Note, this differs from IRModuleEmitter::BeginFunction only by return type /// Set a function declaration.
Note that BeginMapPredictFunction can't be called from within a function - it completes the currently-being-emitted function diff --git a/libraries/emitters/include/TargetDevice.h b/libraries/emitters/include/TargetDevice.h index 6c1e0f1a0..be2c20d07 100644 --- a/libraries/emitters/include/TargetDevice.h +++ b/libraries/emitters/include/TargetDevice.h @@ -25,6 +25,20 @@ namespace emitters std::string features = ""; size_t numBits = 0; + /// Helper function to test whether the TargetDevice has a particular feature + /// If this is filled in by LLVM for the host target, the possible features are target dependent + /// and include, but are not limited to, the following: + /// X86: cx8, cmov, mmx, fxsr, sse, sse2, sse3, pclmul, ssse3, cx16, sse4.1, sse4.2, movbe, popcnt, aes, rdrnd, + /// avx, fma, xsave, f16c, sahf, lzcnt, sse4a, prfchw, xop, lwp, fma4, tbm, mwaitx, 64bit, clzero, wbnoinvd, + /// fsgsbase, sgx, bmi, avx2, bmi2, invpcid, rtm, avx512f, avx512dq, rdseed, adx, avx512ifma, clflushopt, + /// clwb, avx512pf, avx512er, avx512cd, sha, avx512bw, avx512vl, prefetchwt1, avx512vbmi, pku, waitpkg, + /// avx512vbmi2, shstk, gfni, vaes, vpclmulqdq, avx512vnni, avx512bitalg, avx512vpopcntdq, rdpid, cldemote, + /// movdiri, movdir64b, enqcmd, pconfig, avx512bf16, xsaveopt, xsavec, xsaves, ptwrite + /// AArch64: neon, fp-armv8, crc, crypto + /// ARM: fp16, neon, vfp3, d16, vfp4, hwdiv-arm, hwdiv + /// + inline bool HasFeature(const std::string& feature) const { return features.find(feature) != std::string::npos; } + /// Indicates if the target device is a Windows system bool IsWindows() const; diff --git a/libraries/emitters/src/CompilerOptions.cpp b/libraries/emitters/src/CompilerOptions.cpp index 3259e74ed..fe45c1a8c 100644 --- a/libraries/emitters/src/CompilerOptions.cpp +++ b/libraries/emitters/src/CompilerOptions.cpp @@ -72,6 +72,8 @@ namespace emitters maxThreads = properties.GetOrParseEntry("maxThreads", maxThreads); useFastMath = properties.GetOrParseEntry("useFastMath", useFastMath); debug = properties.GetOrParseEntry("debug", debug); + globalValueAlignment = properties.GetOrParseEntry("globalValueAlignment", globalValueAlignment); + skip_ellcode = properties.GetOrParseEntry("skip_ellcode", skip_ellcode); if (properties.HasEntry("deviceName")) { diff --git a/libraries/emitters/src/EmitterTypes.cpp b/libraries/emitters/src/EmitterTypes.cpp index c5a207c2f..c66a9d854 100644 --- a/libraries/emitters/src/EmitterTypes.cpp +++ b/libraries/emitters/src/EmitterTypes.cpp @@ -330,6 +330,32 @@ namespace emitters return type; } + VariableType GetNonPointerType(VariableType type) + { + switch (type) + { + case VariableType::VoidPointer: + return VariableType::Void; + case VariableType::BytePointer: + return VariableType::Byte; + case VariableType::Int16Pointer: + return VariableType::Int16; + case VariableType::Int32Pointer: + return VariableType::Int32; + case VariableType::Int64Pointer: + return VariableType::Int64; + case VariableType::FloatPointer: + return VariableType::Float; + case VariableType::DoublePointer: + return VariableType::Double; + case VariableType::Char8Pointer: + return VariableType::Char8; + default: + break; + } + return type; + } + template <> TypedOperator GetAddForValueType() { diff --git a/libraries/emitters/src/IRAssemblyWriter.cpp b/libraries/emitters/src/IRAssemblyWriter.cpp index 5cae93755..af67d5075 100644 --- a/libraries/emitters/src/IRAssemblyWriter.cpp +++ b/libraries/emitters/src/IRAssemblyWriter.cpp @@ -142,6 +142,11 @@ namespace emitters llvm::TargetOptions 
targetOptions = MakeTargetOptions(); targetOptions.MCOptions.AsmVerbose = ellOptions.verboseOutput; targetOptions.FloatABIType = ellOptions.floatABI; + targetOptions.AllowFPOpFusion = ellOptions.floatFusionMode; + targetOptions.UnsafeFPMath = ellOptions.unsafeFPMath ? 1 : 0; + targetOptions.NoInfsFPMath = ellOptions.noInfsFPMath ? 1 : 0; + targetOptions.NoNaNsFPMath = ellOptions.noNaNsFPMath ? 1 : 0; + targetOptions.NoSignedZerosFPMath = ellOptions.noSignedZerosFPMath ? 1 : 0; OutputRelocationModel relocModel = ellOptions.relocModel; llvm::CodeModel::Model codeModel = llvm::CodeModel::Small; // If this code gets run during JIT, we may have to change to medium/large diff --git a/libraries/emitters/src/IREmitter.cpp b/libraries/emitters/src/IREmitter.cpp index 29e2c3c0c..6f4a19e1d 100644 --- a/libraries/emitters/src/IREmitter.cpp +++ b/libraries/emitters/src/IREmitter.cpp @@ -7,8 +7,8 @@ //////////////////////////////////////////////////////////////////////////////////////////////////// #include "IREmitter.h" -#include "IRModuleEmitter.h" #include "EmitterException.h" +#include "IRModuleEmitter.h" #include "LLVMUtilities.h" #include @@ -77,8 +77,8 @@ namespace emitters // // IREmitter implementation // - IREmitter::IREmitter(IRModuleEmitter& moduleEmitter, llvm::LLVMContext& context) : - _moduleEmitter(moduleEmitter), + IREmitter::IREmitter(llvm::LLVMContext& context, llvm::Module& module) : + _module(module), _llvmContext(context), _irBuilder(context) {} @@ -90,39 +90,55 @@ namespace emitters case VariableType::Void: return GetBaseVariableType(type); case VariableType::VoidPointer: - // We use BytePointer to avoid LLVM Assertion failed: isValidElementType(EltTy) && "Invalid type for pointer element!", + // We use BytePointer to avoid LLVM Assertion failed: isValidElementType(EltTy) && "Invalid type for pointer element!", // file ~\llvm-8\lib\ir\type.cpp, line 632 return GetBaseVariableType(VariableType::Byte)->getPointerTo(); + case VariableType::VoidPointerPointer: + return GetBaseVariableType(VariableType::Byte)->getPointerTo()->getPointerTo(); case VariableType::Boolean: return GetBaseVariableType(type); case VariableType::Byte: return GetBaseVariableType(type); case VariableType::BytePointer: return GetBaseVariableType(VariableType::Byte)->getPointerTo(); + case VariableType::BytePointerPointer: + return GetBaseVariableType(VariableType::Byte)->getPointerTo()->getPointerTo(); case VariableType::Int16: return GetBaseVariableType(type); case VariableType::Int16Pointer: return GetBaseVariableType(VariableType::Int16)->getPointerTo(); + case VariableType::Int16PointerPointer: + return GetBaseVariableType(VariableType::Int16)->getPointerTo()->getPointerTo(); case VariableType::Int32: return GetBaseVariableType(type); case VariableType::Int32Pointer: return GetBaseVariableType(VariableType::Int32)->getPointerTo(); + case VariableType::Int32PointerPointer: + return GetBaseVariableType(VariableType::Int32)->getPointerTo()->getPointerTo(); case VariableType::Int64: return GetBaseVariableType(type); case VariableType::Int64Pointer: return GetBaseVariableType(VariableType::Int64)->getPointerTo(); + case VariableType::Int64PointerPointer: + return GetBaseVariableType(VariableType::Int64)->getPointerTo()->getPointerTo(); case VariableType::Float: return GetBaseVariableType(type); case VariableType::FloatPointer: return GetBaseVariableType(VariableType::Float)->getPointerTo(); + case VariableType::FloatPointerPointer: + return GetBaseVariableType(VariableType::Float)->getPointerTo()->getPointerTo(); 
case VariableType::Double: return GetBaseVariableType(type); case VariableType::DoublePointer: return GetBaseVariableType(VariableType::Double)->getPointerTo(); + case VariableType::DoublePointerPointer: + return GetBaseVariableType(VariableType::Double)->getPointerTo()->getPointerTo(); case VariableType::Char8: return GetBaseVariableType(type); case VariableType::Char8Pointer: return GetBaseVariableType(VariableType::Char8)->getPointerTo(); + case VariableType::Char8PointerPointer: + return GetBaseVariableType(VariableType::Char8)->getPointerTo()->getPointerTo(); default: throw EmitterException(EmitterError::valueTypeNotSupported); } @@ -1115,7 +1131,9 @@ namespace emitters uint64_t IREmitter::SizeOf(LLVMType type) const { - return _moduleEmitter.GetTargetDataLayout().getTypeAllocSize(type); + assert(!_module.getDataLayout().getStringRepresentation().empty()); + + return _module.getDataLayout().getTypeAllocSize(type); } uint64_t IREmitter::SizeOf(VariableType type) const diff --git a/libraries/emitters/src/IRExecutionEngine.cpp b/libraries/emitters/src/IRExecutionEngine.cpp index ed8e36390..d444581ba 100644 --- a/libraries/emitters/src/IRExecutionEngine.cpp +++ b/libraries/emitters/src/IRExecutionEngine.cpp @@ -40,12 +40,12 @@ namespace emitters throw emitters::EmitterException(emitters::EmitterError::unexpected, msg); } - IRExecutionEngine::IRExecutionEngine(IRModuleEmitter&& module, bool verify) : - IRExecutionEngine(module.TransferOwnership(), verify) + IRExecutionEngine::IRExecutionEngine(IRModuleEmitter&& module, bool verify, llvm::CodeGenOpt::Level optLevel) : + IRExecutionEngine(module.TransferOwnership(), verify, optLevel) { } - IRExecutionEngine::IRExecutionEngine(std::unique_ptr<llvm::Module> pModule, bool verify) + IRExecutionEngine::IRExecutionEngine(std::unique_ptr<llvm::Module> pModule, bool verify, llvm::CodeGenOpt::Level optLevel) { auto debugPrintFunction = pModule->getFunction("DebugPrint"); llvm::InitializeNativeTargetAsmPrinter(); _pBuilder = std::make_unique<llvm::EngineBuilder>(std::move(pModule)); - _pBuilder->setEngineKind(llvm::EngineKind::JIT).setVerifyModules(verify).setUseOrcMCJITReplacement(false); + _pBuilder->setEngineKind(llvm::EngineKind::JIT).setVerifyModules(verify).setOptLevel(optLevel).setEmulatedTLS(true); static bool installed = false; if (!installed) @@ -68,7 +68,6 @@ namespace emitters { DefineFunction(debugPrintFunction, reinterpret_cast<UIntPtrT>(&DebugPrintImpl)); } - } IRExecutionEngine::~IRExecutionEngine() @@ -124,6 +123,12 @@ namespace emitters _pEngine->addGlobalMapping(func, (void*)address); } + void IRExecutionEngine::DefineFunction(const std::string& name, UIntPtrT address) + { + EnsureEngine(); + _pEngine->addGlobalMapping(name, address); + } + DynamicFunction IRExecutionEngine::GetMain() { return reinterpret_cast<DynamicFunction>(GetFunctionAddress("main")); diff --git a/libraries/emitters/src/IRFunctionEmitter.cpp b/libraries/emitters/src/IRFunctionEmitter.cpp index 911b2a130..349583c61 100644 --- a/libraries/emitters/src/IRFunctionEmitter.cpp +++ b/libraries/emitters/src/IRFunctionEmitter.cpp @@ -39,7 +39,7 @@ namespace emitters namespace { // Helper function for recursive function - void MultiDimFor(IRFunctionEmitter& function, std::vector ranges, std::vector prevIndices, IRFunctionEmitter::MultiDimForLoopBodyFunction body) + void MultiDimFor(IRFunctionEmitter& function, std::vector ranges, std::vector prevIndices, IRFunctionEmitter::MultiDimForLoopBodyFunction body, const std::string& tag = "") { if (ranges.empty()) { @@ -49,16 +49,16 @@ { auto range
= ranges.front(); std::vector suffix(ranges.begin() + 1, ranges.end()); - function.For(range.begin, range.end, [suffix, prevIndices, body](IRFunctionEmitter& function, auto index) { + function.For(range.begin, range.end, [suffix, prevIndices, body, tag](IRFunctionEmitter& function, auto index) { std::vector prefix(prevIndices.begin(), prevIndices.end()); prefix.push_back(index); - MultiDimFor(function, suffix, prefix, body); + MultiDimFor(function, suffix, prefix, body, tag); }); } } // Helper function for recursive function - void MultiDimFor(IRFunctionEmitter& function, std::vector ranges, std::vector prevIndices, IRFunctionEmitter::MultiDimForLoopBodyFunction body) + void MultiDimFor(IRFunctionEmitter& function, std::vector ranges, std::vector prevIndices, IRFunctionEmitter::MultiDimForLoopBodyFunction body, const std::string& tag = "") { if (ranges.empty()) { @@ -68,16 +68,16 @@ namespace emitters { auto range = ranges.front(); std::vector suffix(ranges.begin() + 1, ranges.end()); - function.For(range.begin, range.end, [suffix, prevIndices, body](IRFunctionEmitter& function, auto index) { + function.For(range.begin, range.end, [suffix, prevIndices, body, tag](IRFunctionEmitter& function, auto index) { std::vector prefix(prevIndices.begin(), prevIndices.end()); prefix.push_back(index); - MultiDimFor(function, suffix, prefix, body); + MultiDimFor(function, suffix, prefix, body, tag); }); } } // Helper function for recursive function - void TiledMultiDimFor(IRFunctionEmitter& function, std::vector ranges, std::vector prevIntervals, IRFunctionEmitter::TiledMultiDimForLoopBodyFunction body) + void TiledMultiDimFor(IRFunctionEmitter& function, std::vector ranges, std::vector prevIntervals, IRFunctionEmitter::TiledMultiDimForLoopBodyFunction body, const std::string& tag = "") { if (ranges.empty()) { @@ -87,16 +87,16 @@ namespace emitters { auto range = ranges.front(); std::vector suffix(ranges.begin() + 1, ranges.end()); - function.For(range, [suffix, prevIntervals, body](IRFunctionEmitter& function, auto interval) { + function.For(range, [suffix, prevIntervals, body, tag](IRFunctionEmitter& function, auto interval) { std::vector prefix(prevIntervals.begin(), prevIntervals.end()); prefix.push_back(interval); - TiledMultiDimFor(function, suffix, prefix, body); + TiledMultiDimFor(function, suffix, prefix, body, tag); }); } } // Helper function for recursive function - void TiledMultiDimFor(IRFunctionEmitter& function, std::vector ranges, std::vector prevIntervals, IRFunctionEmitter::TiledMultiDimForLoopBodyFunction body) + void TiledMultiDimFor(IRFunctionEmitter& function, std::vector ranges, std::vector prevIntervals, IRFunctionEmitter::TiledMultiDimForLoopBodyFunction body, const std::string& tag = "") { if (ranges.empty()) { @@ -106,24 +106,26 @@ namespace emitters { auto range = ranges.front(); std::vector suffix(ranges.begin() + 1, ranges.end()); - function.For(range, [suffix, prevIntervals, body](IRFunctionEmitter& function, auto interval) { + function.For(range, [suffix, prevIntervals, body, tag](IRFunctionEmitter& function, auto interval) { std::vector prefix(prevIntervals.begin(), prevIntervals.end()); prefix.push_back(interval); - TiledMultiDimFor(function, suffix, prefix, body); + TiledMultiDimFor(function, suffix, prefix, body, tag); }); } } - constexpr llvm::Attribute::AttrKind ToLLVMAttr(IRFunctionEmitter::Attributes attr) + llvm::Attribute ToLLVMAttr(llvm::LLVMContext& context, IRFunctionEmitter::Attributes attr, const std::any& extra) { switch (attr) { default: [[fallthrough]]; 
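+           // Note: Attributes::Dereferenceable (handled below) carries its byte count in
+           // the std::any `extra` parameter. Illustrative call site only; the argument
+           // index and byte count are made up, and the payload type must match the
+           // any_cast performed here:
+           //
+           //     fn.SetAttributeForArgument(0, IRFunctionEmitter::Attributes::Dereferenceable,
+           //                                std::any{ int64_t{ 64 } }); // arg 0 is dereferenceable(64)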
            case IRFunctionEmitter::Attributes::None:
-               return llvm::Attribute::AttrKind::None;
+               return llvm::Attribute::get(context, llvm::Attribute::AttrKind::None);
            case IRFunctionEmitter::Attributes::NoAlias:
-               return llvm::Attribute::AttrKind::NoAlias;
+               return llvm::Attribute::get(context, llvm::Attribute::AttrKind::NoAlias);
+           case IRFunctionEmitter::Attributes::Dereferenceable:
+               return llvm::Attribute::getWithDereferenceableBytes(context, std::any_cast<int64_t>(extra));
            }
        }
    } // namespace
@@ -168,6 +170,7 @@ namespace emitters
     {
         Log() << "Completing function " << GetFunctionName() << EOL;
         Verify();
+        // TODO: set a flag indicating that this function is done
     }
 
     void IRFunctionEmitter::SetUpFunction()
@@ -711,67 +714,53 @@ namespace emitters
        }
    }
 
-    void IRFunctionEmitter::EntryBlockScope::ExitScope()
-    {
-        if (_inScope)
-        {
-            _function.SetCurrentInsertPoint(_oldPos);
-            _inScope = false;
-        }
-    }
-
     IRFunctionEmitter::EntryBlockScope::~EntryBlockScope()
     {
-        ExitScope();
+        _function.SetCurrentInsertPoint(_oldPos);
     }
 
-    void IRFunctionEmitter::SetAttributeForArgument(size_t index, IRFunctionEmitter::Attributes attribute)
+    void IRFunctionEmitter::SetAttributeForArgument(size_t index, IRFunctionEmitter::Attributes attribute, const std::any& extra)
     {
-        (_pFunction->arg_begin() + index)->addAttr(ToLLVMAttr(attribute));
+        (_pFunction->arg_begin() + index)->addAttr(ToLLVMAttr(GetLLVMContext(), attribute, extra));
     }
 
-    void IRFunctionEmitter::SetAttributeForArguments(IRFunctionEmitter::Attributes attribute)
+    void IRFunctionEmitter::SetAttributeForArguments(IRFunctionEmitter::Attributes attribute, const std::any& extra)
     {
         for (auto& argument : Arguments())
         {
-            argument.addAttr(ToLLVMAttr(attribute));
+            argument.addAttr(ToLLVMAttr(GetLLVMContext(), attribute, extra));
         }
     }
 
-    void IRFunctionEmitter::SetAttributeForArguments(std::vector<size_t> indices, IRFunctionEmitter::Attributes attribute)
+    void IRFunctionEmitter::SetAttributeForArguments(std::vector<size_t> indices, IRFunctionEmitter::Attributes attribute, const std::any& extra)
     {
         for (auto index : indices)
         {
-            SetAttributeForArgument(index, attribute);
+            SetAttributeForArgument(index, attribute, extra);
         }
     }
 
     llvm::AllocaInst* IRFunctionEmitter::Variable(VariableType type)
     {
-        EntryBlockScope scope(*this);
-        auto alloca = GetEmitter().StackAllocate(type);
-        scope.ExitScope();
-
-        return alloca;
+        return ExecuteInEntryBlock([this, type] {
+            return GetEmitter().StackAllocate(type);
+        });
     }
 
     llvm::AllocaInst* IRFunctionEmitter::Variable(LLVMType type)
     {
-        EntryBlockScope scope(*this);
-        auto alloca = GetEmitter().StackAllocate(type);
-        scope.ExitScope();
-
-        return alloca;
+        return ExecuteInEntryBlock([this, type] {
+            return GetEmitter().StackAllocate(type);
+        });
     }
 
     llvm::AllocaInst* IRFunctionEmitter::Variable(VariableType type, const std::string& namePrefix)
     {
-        EntryBlockScope scope(*this);
         // don't do this for emitted variables!
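+        // (ExecuteInEntryBlock, used by all of these Variable() overloads, is assumed
+        // to replace the old EntryBlockScope save/restore dance; the real definition
+        // presumably lives in IRFunctionEmitter.h. A minimal sketch of its likely shape,
+        // names hypothetical:)
+        //
+        //     template <typename F>
+        //     auto ExecuteInEntryBlock(F&& body)
+        //     {
+        //         auto oldPos = GetCurrentInsertPoint();  // remember the current block
+        //         SetCurrentInsertPoint(GetEntryBlock()); // allocas belong in the entry block
+        //         auto result = body();                   // emit the alloca
+        //         SetCurrentInsertPoint(oldPos);          // resume normal emission
+        //         return result;
+        //     }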
        auto name = _locals.GetUniqueName(namePrefix);
-        auto result = GetEmitter().StackAllocate(type, name);
-        scope.ExitScope();
+        auto result = ExecuteInEntryBlock([this, type, name] {
+            return GetEmitter().StackAllocate(type, name);
+        });
 
         _locals.Add(name, result);
         return result;
@@ -779,10 +768,10 @@ namespace emitters
 
     llvm::AllocaInst* IRFunctionEmitter::Variable(LLVMType type, const std::string& namePrefix)
     {
-        EntryBlockScope scope(*this);
         auto name = _locals.GetUniqueName(namePrefix);
-        auto result = GetEmitter().StackAllocate(type, name);
-        scope.ExitScope();
+        auto result = ExecuteInEntryBlock([this, type, name] {
+            return GetEmitter().StackAllocate(type, name);
+        });
 
         _locals.Add(name, result);
         return result;
@@ -790,9 +779,9 @@ namespace emitters
 
     llvm::AllocaInst* IRFunctionEmitter::EmittedVariable(VariableType type, const std::string& name)
     {
-        EntryBlockScope scope(*this);
-        auto result = GetEmitter().StackAllocate(type, name);
-        scope.ExitScope();
+        auto result = ExecuteInEntryBlock([this, type, name] {
+            return GetEmitter().StackAllocate(type, name);
+        });
 
         _locals.Add(name, result);
         return result;
@@ -800,38 +789,16 @@ namespace emitters
 
     llvm::AllocaInst* IRFunctionEmitter::Variable(VariableType type, int size)
     {
-        EntryBlockScope scope(*this);
-        auto alloca = GetEmitter().StackAllocate(type, size);
-        scope.ExitScope();
-
-        return alloca;
-    }
-
-    llvm::AllocaInst* IRFunctionEmitter::Variable(VariableType type, int rows, int columns)
-    {
-        EntryBlockScope scope(*this);
-        auto alloca = GetEmitter().StackAllocate(type, rows, columns);
-        scope.ExitScope();
-
-        return alloca;
+        return ExecuteInEntryBlock([this, type, size] {
+            return GetEmitter().StackAllocate(type, size);
+        });
     }
 
     llvm::AllocaInst* IRFunctionEmitter::Variable(LLVMType type, int size)
     {
-        EntryBlockScope scope(*this);
-        auto alloca = GetEmitter().StackAllocate(type, size);
-        scope.ExitScope();
-
-        return alloca;
-    }
-
-    llvm::AllocaInst* IRFunctionEmitter::Variable(LLVMType type, int rows, int columns)
-    {
-        EntryBlockScope scope(*this);
-        auto alloca = GetEmitter().StackAllocate(type, rows, columns);
-        scope.ExitScope();
-
-        return alloca;
+        return ExecuteInEntryBlock([this, type, size] {
+            return GetEmitter().StackAllocate(type, size);
+        });
     }
 
     LLVMValue IRFunctionEmitter::Load(LLVMValue pPointer)
@@ -1072,13 +1039,18 @@ namespace emitters
     // For loops
     //
     void IRFunctionEmitter::For(int count, std::function<void(IRFunctionEmitter&, IRLocalScalar)> body)
+    {
+        For(std::string{}, count, body);
+    }
+
+    void IRFunctionEmitter::For(const std::string& tag, int count, std::function<void(IRFunctionEmitter&, IRLocalScalar)> body)
     {
         if (count < 0)
         {
             throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "For loop count must be >= 0");
         }
 
-        auto loop = IRForLoopEmitter(*this);
+        auto loop = IRForLoopEmitter(*this, tag);
         loop.Begin(count);
         body(*this, LocalScalar(loop.LoadIterationVariable()));
         loop.End();
@@ -1086,33 +1058,53 @@ namespace emitters
 
     void IRFunctionEmitter::For(LLVMValue count, std::function<void(IRFunctionEmitter&, IRLocalScalar)> body)
     {
-        auto loop = IRForLoopEmitter(*this);
+        For(std::string{}, count, body);
+    }
+
+    void IRFunctionEmitter::For(const std::string& tag, LLVMValue count, std::function<void(IRFunctionEmitter&, IRLocalScalar)> body)
+    {
+        auto loop = IRForLoopEmitter(*this, tag);
         loop.Begin(count);
         body(*this, LocalScalar(loop.LoadIterationVariable()));
         loop.End();
     }
 
     void IRFunctionEmitter::For(int beginValue, int endValue, std::function<void(IRFunctionEmitter&, IRLocalScalar)> body)
+    {
+        For(std::string{}, beginValue, endValue, body);
+    }
+
+    void IRFunctionEmitter::For(const std::string& tag, int beginValue, int endValue, std::function<void(IRFunctionEmitter&, IRLocalScalar)> body)
     {
         if (endValue < beginValue)
         {
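+            // The `tag` threaded through these For/While overloads ends up prefixing the
+            // loop's basic-block labels (see IRForLoopEmitter::CreateBlocks below), which
+            // makes large emitted functions much easier to read. Roughly:
+            //
+            //     fn.For("row_", 0, rows, [](IRFunctionEmitter& fn, auto i) { /* ... */ });
+            //     // emits blocks named roughly: row_loop.init, row_loop.cond,
+            //     //                             row_loop.body, row_loop.inc, row_loop.after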
            throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "For loop begin must be <= end");
        }
 
-        For(beginValue, endValue, 1, body);
+        For(tag, beginValue, endValue, 1, body);
    }
 
     void IRFunctionEmitter::For(LLVMValue beginValue, LLVMValue endValue, std::function<void(IRFunctionEmitter&, IRLocalScalar)> body)
     {
-        For(beginValue, endValue, Literal(1), body);
+        For(std::string{}, beginValue, endValue, body);
+    }
+
+    void IRFunctionEmitter::For(const std::string& tag, LLVMValue beginValue, LLVMValue endValue, std::function<void(IRFunctionEmitter&, IRLocalScalar)> body)
+    {
+        For(tag, beginValue, endValue, Literal(1), body);
    }
 
     void IRFunctionEmitter::For(int beginValue, int endValue, int increment, std::function<void(IRFunctionEmitter&, IRLocalScalar)> body)
+    {
+        For(std::string{}, beginValue, endValue, increment, body);
+    }
+
+    void IRFunctionEmitter::For(const std::string& tag, int beginValue, int endValue, int increment, std::function<void(IRFunctionEmitter&, IRLocalScalar)> body)
     {
         if (endValue < beginValue)
         {
             throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "For loop begin must be <= end");
         }
 
-        auto loop = IRForLoopEmitter(*this);
+        auto loop = IRForLoopEmitter(*this, tag);
         loop.Begin(beginValue, endValue, increment);
         body(*this, LocalScalar(loop.LoadIterationVariable()));
         loop.End();
@@ -1120,7 +1112,12 @@ namespace emitters
 
     void IRFunctionEmitter::For(LLVMValue beginValue, LLVMValue endValue, LLVMValue increment, std::function<void(IRFunctionEmitter&, IRLocalScalar)> body)
     {
-        auto loop = IRForLoopEmitter(*this);
+        For(std::string{}, beginValue, endValue, increment, body);
+    }
+
+    void IRFunctionEmitter::For(const std::string& tag, LLVMValue beginValue, LLVMValue endValue, LLVMValue increment, std::function<void(IRFunctionEmitter&, IRLocalScalar)> body)
+    {
+        auto loop = IRForLoopEmitter(*this, tag);
         loop.Begin(beginValue, endValue, increment);
         body(*this, LocalScalar(loop.LoadIterationVariable()));
         loop.End();
@@ -1132,15 +1129,30 @@ namespace emitters
 
     void IRFunctionEmitter::For(const std::vector<ConstLoopRange>& ranges, MultiDimForLoopBodyFunction body)
     {
-        emitters::MultiDimFor(*this, ranges, {}, body);
+        For(std::string{}, ranges, body);
+    }
+
+    void IRFunctionEmitter::For(const std::string& tag, const std::vector<ConstLoopRange>& ranges, MultiDimForLoopBodyFunction body)
+    {
+        emitters::MultiDimFor(*this, ranges, {}, body, tag);
    }
 
     void IRFunctionEmitter::For(const std::vector<LoopRange>& ranges, MultiDimForLoopBodyFunction body)
     {
-        emitters::MultiDimFor(*this, ranges, {}, body);
+        For(std::string{}, ranges, body);
+    }
+
+    void IRFunctionEmitter::For(const std::string& tag, const std::vector<LoopRange>& ranges, MultiDimForLoopBodyFunction body)
+    {
+        emitters::MultiDimFor(*this, ranges, {}, body, tag);
    }
 
     void IRFunctionEmitter::For(ConstTiledLoopRange range, TiledForLoopBodyFunction body)
+    {
+        For(std::string{}, range, body);
+    }
+
+    void IRFunctionEmitter::For(const std::string& tag, ConstTiledLoopRange range, TiledForLoopBodyFunction body)
     {
         auto stepSize = range.blockSize;
         auto numFullBlocks = (range.end - range.begin) / stepSize;
@@ -1150,7 +1162,7 @@ namespace emitters
         if (numFullBlocks > 0)
         {
             // For(range.begin, fullBlocksEnd, stepSize, [stepSize, body](IRFunctionEmitter function, auto index) {
-            For(numFullBlocks, [stepSize, range, body](IRFunctionEmitter function, auto blockIndex) {
+            For(tag, numFullBlocks, [stepSize, range, body](IRFunctionEmitter function, auto blockIndex) {
                 auto index = range.begin + blockIndex * stepSize;
                 body(function, { index, index + stepSize, function.LocalScalar(stepSize), blockIndex });
             });
@@ -1164,6 +1176,11 @@ namespace emitters
    }
 
     void IRFunctionEmitter::For(TiledLoopRange range, TiledForLoopBodyFunction body)
+    {
+        For(std::string{}, range, body);
+    }
+
+    void IRFunctionEmitter::For(const std::string& tag, TiledLoopRange range, TiledForLoopBodyFunction body)
     {
         if (!range.blockSize.IsConstantInt())
         {
@@ -1175,8 +1192,8 @@ namespace emitters
         auto fullBlocksEnd = range.begin + (numFullBlocks * stepSize);
 
         // full blocks
-        If(numFullBlocks > 0, [numFullBlocks, stepSize, range, body](auto& function) {
-            function.For(numFullBlocks, range.blockSize, [stepSize, range, body](IRFunctionEmitter function, auto blockIndex) {
+        If(numFullBlocks > 0, [numFullBlocks, stepSize, range, body, tag](auto& function) {
+            function.For(tag, numFullBlocks, range.blockSize, [stepSize, range, body](IRFunctionEmitter function, auto blockIndex) {
                 auto index = range.begin + blockIndex * stepSize;
                 body(function, { index, index + range.blockSize, range.blockSize, blockIndex });
             });
@@ -1190,12 +1207,22 @@ namespace emitters
 
     void IRFunctionEmitter::For(const std::vector<ConstTiledLoopRange>& ranges, TiledMultiDimForLoopBodyFunction body)
     {
-        emitters::TiledMultiDimFor(*this, ranges, {}, body);
+        For(std::string{}, ranges, body);
+    }
+
+    void IRFunctionEmitter::For(const std::string& tag, const std::vector<ConstTiledLoopRange>& ranges, TiledMultiDimForLoopBodyFunction body)
+    {
+        emitters::TiledMultiDimFor(*this, ranges, {}, body, tag);
    }
 
     void IRFunctionEmitter::For(const std::vector<TiledLoopRange>& ranges, TiledMultiDimForLoopBodyFunction body)
     {
-        emitters::TiledMultiDimFor(*this, ranges, {}, body);
+        For(std::string{}, ranges, body);
+    }
+
+    void IRFunctionEmitter::For(const std::string& tag, const std::vector<TiledLoopRange>& ranges, TiledMultiDimForLoopBodyFunction body)
+    {
+        emitters::TiledMultiDimFor(*this, ranges, {}, body, tag);
    }
 
     //
@@ -1251,7 +1278,12 @@ namespace emitters
 
     void IRFunctionEmitter::While(LLVMValue pTestValuePointer, std::function<void(IRFunctionEmitter&)> body)
     {
-        auto loop = IRWhileLoopEmitter(*this);
+        While(std::string{}, pTestValuePointer, body);
+    }
+
+    void IRFunctionEmitter::While(const std::string& tag, LLVMValue pTestValuePointer, std::function<void(IRFunctionEmitter&)> body)
+    {
+        auto loop = IRWhileLoopEmitter(*this, tag);
         loop.Begin(pTestValuePointer);
         body(*this);
         loop.End();
@@ -1259,7 +1291,12 @@ namespace emitters
 
     void IRFunctionEmitter::While(std::function<LLVMValue(IRFunctionEmitter&)> condition, WhileLoopBodyFunction body)
     {
-        auto loop = IRWhileLoopEmitter(*this);
+        While(std::string{}, condition, body);
+    }
+
+    void IRFunctionEmitter::While(const std::string& tag, std::function<LLVMValue(IRFunctionEmitter&)> condition, WhileLoopBodyFunction body)
+    {
+        auto loop = IRWhileLoopEmitter(*this, tag);
        loop.Begin(condition);
        body(*this);
        loop.End();
@@ -1372,6 +1409,12 @@ namespace emitters
         return Call(PrintfFnName, arguments);
     }
 
+    LLVMValue IRFunctionEmitter::Printf(std::vector<LLVMValue> arguments)
+    {
+        EnsurePrintf();
+        return Call(PrintfFnName, arguments);
+    }
+
     LLVMValue IRFunctionEmitter::Printf(const std::string& format, std::initializer_list<LLVMValue> arguments)
     {
         EnsurePrintf();
@@ -1732,6 +1775,11 @@ namespace emitters
        }
    }
 
+    void IRFunctionEmitter::DebugBreak()
+    {
+        Call(GetModule().GetIntrinsic(llvm::Intrinsic::debugtrap), std::vector<LLVMValue>{});
+    }
+
     //
     // Information about the current function being emitted
     //
@@ -1798,6 +1846,34 @@ namespace emitters
         InsertMetadata(c_swigFunctionTagName);
     }
 
+    void IRFunctionEmitter::SetInlineState(FunctionInlining inlineState)
+    {
+        SetInlineState(_pFunction, inlineState);
+    }
+
+    void IRFunctionEmitter::SetInlineState(LLVMFunction function, FunctionInlining inlineState)
+    {
+        for (auto attr : { llvm::Attribute::AttrKind::AlwaysInline, llvm::Attribute::AttrKind::InlineHint, llvm::Attribute::AttrKind::NoInline })
+        {
+            function->removeFnAttr(attr);
+        }
+
+        switch (inlineState)
+        {
+        case FunctionInlining::always:
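+            // Illustrative use of this new inlining control (the function it is applied
+            // to is hypothetical):
+            //
+            //     fn.SetInlineState(FunctionInlining::never); // keep this helper out-of-line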
+            function->addFnAttr(llvm::Attribute::AttrKind::AlwaysInline);
+            break;
+        case FunctionInlining::prefer:
+            function->addFnAttr(llvm::Attribute::AttrKind::InlineHint);
+            break;
+        case FunctionInlining::never:
+            function->addFnAttr(llvm::Attribute::AttrKind::NoInline);
+            break;
+        default:
+            break;
+        }
+    }
+
     //
     // Internal functions
     //
diff --git a/libraries/emitters/src/IRIfEmitter.cpp b/libraries/emitters/src/IRIfEmitter.cpp
index a94e1c36b..4b4959f2a 100644
--- a/libraries/emitters/src/IRIfEmitter.cpp
+++ b/libraries/emitters/src/IRIfEmitter.cpp
@@ -52,12 +52,12 @@ namespace emitters
 
     // Move ctor and assignment op are needed to explicitly swap out values,
     // since the default behavior for "moving" fundamental types (aka, pointers) is to do a bitwise-copy
-    IRIfEmitter::IRIfEmitter(IRIfEmitter&& other)
+    IRIfEmitter::IRIfEmitter(IRIfEmitter&& other) noexcept
     {
         *this = std::move(other);
     }
 
-    IRIfEmitter& IRIfEmitter::operator=(IRIfEmitter&& other)
+    IRIfEmitter& IRIfEmitter::operator=(IRIfEmitter&& other) noexcept
     {
         if (this != &other)
         {
diff --git a/libraries/emitters/src/IRLocalValue.cpp b/libraries/emitters/src/IRLocalValue.cpp
index 87aa96737..e4510de76 100644
--- a/libraries/emitters/src/IRLocalValue.cpp
+++ b/libraries/emitters/src/IRLocalValue.cpp
@@ -8,6 +8,7 @@
 
 #include "IRLocalValue.h"
 #include "IRFunctionEmitter.h"
+#include "IRLocalScalar.h"
 
 #include
 
@@ -87,5 +88,29 @@ namespace emitters
         this->value = value;
         return *this;
     }
+
+    //
+    // IRLocalPointer
+    //
+    IRLocalValue IRLocalPointer::Load() const
+    {
+        return { function, function.Load(value) };
+    }
+
+    IRLocalPointer IRLocalPointer::Offset(int offset) const
+    {
+        return { function, function.PointerOffset(value, offset) };
+    }
+
+    IRLocalPointer IRLocalPointer::Offset(LLVMValue offset) const
+    {
+        return { function, function.PointerOffset(value, offset) };
+    }
+
+    IRLocalPointer IRLocalPointer::Offset(const IRLocalScalar& offset) const
+    {
+        return { function, function.PointerOffset(value, offset) };
+    }
+
 } // namespace emitters
} // namespace ell
diff --git a/libraries/emitters/src/IRLoopEmitter.cpp b/libraries/emitters/src/IRLoopEmitter.cpp
index 7db6ce16e..dd8997312 100644
--- a/libraries/emitters/src/IRLoopEmitter.cpp
+++ b/libraries/emitters/src/IRLoopEmitter.cpp
@@ -19,6 +19,33 @@ namespace emitters
     const std::string LoopIncBlockName = "loop.inc";
     const std::string LoopAfterBlockName = "loop.after";
 
+    namespace
+    {
+        std::array<llvm::Metadata*, 2> GenerateUnrollMetadata(llvm::LLVMContext& context)
+        {
+            return { llvm::MDString::get(context, "llvm.loop.unroll.enable"),
+                     llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
+                         llvm::Type::getInt1Ty(context), true)) };
+        }
+
+        std::array<llvm::Metadata*, 2> GenerateVectorizeMetadata(llvm::LLVMContext& context)
+        {
+            return { llvm::MDString::get(context, "llvm.loop.vectorize.enable"),
+                     llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
+                         llvm::Type::getInt1Ty(context), true)) };
+        }
+
+        std::array<llvm::Metadata*, 2> GenerateVectorizeFollowupMetadata(llvm::LLVMContext& context)
+        {
+            return { llvm::MDString::get(context, "llvm.loop.vectorize.followup_vectorized"), llvm::MDNode::get(context, GenerateUnrollMetadata(context)) };
+        }
+
+        std::array<llvm::Metadata*, 1> GenerateDisableNonforcedMetadata(llvm::LLVMContext& context)
+        {
+            return { llvm::MDString::get(context, "llvm.loop.disable_nonforced") };
+        }
+    } // namespace
+
     IRLoopEmitter::IRLoopEmitter(IRFunctionEmitter& functionEmitter) :
         _functionEmitter(functionEmitter)
     {}
@@ -32,20 +59,22 @@ namespace emitters
         auto tempNode = llvm::MDNode::getTemporary(context, {});
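+        // When both flags are requested, the helpers above nest the unroll request
+        // under llvm.loop.vectorize.followup_vectorized, so it is the vectorized loop
+        // (not the scalar original) that gets unrolled. The resulting loop ID looks
+        // roughly like this (illustrative IR, not emitted verbatim):
+        //
+        //     !0 = distinct !{!0, !1, !2, !3}  ; self-referential loop ID
+        //     !1 = !{!"llvm.loop.vectorize.enable", i1 true}
+        //     !2 = !{!"llvm.loop.vectorize.followup_vectorized", !4}
+        //     !3 = !{!"llvm.loop.disable_nonforced"}
+        //     !4 = !{!"llvm.loop.unroll.enable", i1 true}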
metadataElements.push_back(tempNode.get()); - if (unroll) + if (vectorize) { - llvm::Metadata* vals[] = { llvm::MDString::get(context, "llvm.loop.unroll.enable"), - llvm::ConstantAsMetadata::get(llvm::ConstantInt::get( - llvm::Type::getInt1Ty(context), true)) }; - metadataElements.push_back(llvm::MDNode::get(context, vals)); + metadataElements.push_back(llvm::MDNode::get(context, GenerateVectorizeMetadata(context))); + if (unroll) + { + metadataElements.push_back(llvm::MDNode::get(context, GenerateVectorizeFollowupMetadata(context))); + } + } + else if (unroll) + { + metadataElements.push_back(llvm::MDNode::get(context, GenerateUnrollMetadata(context))); } - if (vectorize) + if (unroll || vectorize) { - llvm::Metadata* vals[] = { llvm::MDString::get(context, "llvm.loop.vectorize.enable"), - llvm::ConstantAsMetadata::get(llvm::ConstantInt::get( - llvm::Type::getInt1Ty(context), true)) }; - metadataElements.push_back(llvm::MDNode::get(context, vals)); + metadataElements.push_back(llvm::MDNode::get(context, GenerateDisableNonforcedMetadata(context))); } auto loopID = llvm::MDNode::get(context, metadataElements); @@ -62,16 +91,18 @@ namespace emitters // _pAfterBlock -- branch to this block when done // - IRForLoopEmitter::IRForLoopEmitter(IRFunctionEmitter& functionEmitter) : - IRLoopEmitter(functionEmitter) {} + IRForLoopEmitter::IRForLoopEmitter(IRFunctionEmitter& functionEmitter, const std::string& tag) : + IRLoopEmitter(functionEmitter), + _tag(tag) + {} void IRForLoopEmitter::CreateBlocks() { - _pInitializationBlock = _functionEmitter.Block(LoopInitBlockName); - _pConditionBlock = _functionEmitter.Block(LoopConditionBlockName); - _pBodyBlock = _functionEmitter.Block(LoopBodyBlockName); - _pIncrementBlock = _functionEmitter.Block(LoopIncBlockName); - _pAfterBlock = _functionEmitter.Block(LoopAfterBlockName); + _pInitializationBlock = _functionEmitter.Block(_tag + LoopInitBlockName); + _pConditionBlock = _functionEmitter.Block(_tag + LoopConditionBlockName); + _pBodyBlock = _functionEmitter.Block(_tag + LoopBodyBlockName); + _pIncrementBlock = _functionEmitter.Block(_tag + LoopIncBlockName); + _pAfterBlock = _functionEmitter.Block(_tag + LoopAfterBlockName); } llvm::BasicBlock* IRForLoopEmitter::Begin(int repeatCount) @@ -123,9 +154,9 @@ namespace emitters _functionEmitter.Branch(_pConditionBlock); _functionEmitter.SetCurrentBlock(_pConditionBlock); auto branchInst = _functionEmitter.Branch(comparison, _functionEmitter.Load(_pIterationVariable), pTestValue, _pBodyBlock, _pAfterBlock); - + bool unroll = false; - bool vectorize = false; + bool vectorize = true; AddLoopMetadata(branchInst, unroll, vectorize); } @@ -168,15 +199,17 @@ namespace emitters // // IRWhileLoopEmitter // - IRWhileLoopEmitter::IRWhileLoopEmitter(IRFunctionEmitter& functionEmitter) : - IRLoopEmitter(functionEmitter) {} + IRWhileLoopEmitter::IRWhileLoopEmitter(IRFunctionEmitter& functionEmitter, const std::string& tag) : + IRLoopEmitter(functionEmitter), + _tag(tag) + {} void IRWhileLoopEmitter::CreateBlocks() { - _pInitializationBlock = _functionEmitter.Block(LoopInitBlockName); - _pConditionBlock = _functionEmitter.Block(LoopConditionBlockName); - _pBodyBlock = _functionEmitter.Block(LoopBodyBlockName); - _pAfterBlock = _functionEmitter.Block(LoopAfterBlockName); + _pInitializationBlock = _functionEmitter.Block(_tag + LoopInitBlockName); + _pConditionBlock = _functionEmitter.Block(_tag + LoopConditionBlockName); + _pBodyBlock = _functionEmitter.Block(_tag + LoopBodyBlockName); + _pAfterBlock = 
_functionEmitter.Block(_tag + LoopAfterBlockName); } llvm::BasicBlock* IRWhileLoopEmitter::Begin(LLVMValue pTestValuePointer) @@ -226,8 +259,8 @@ namespace emitters _functionEmitter.SetCurrentBlock(_pConditionBlock); auto conditionValue = condition(_functionEmitter); auto branchInst = _functionEmitter.Branch(conditionValue, - _pBodyBlock, - _pAfterBlock); + _pBodyBlock, + _pAfterBlock); bool unroll = false; bool vectorize = false; AddLoopMetadata(branchInst, unroll, vectorize); diff --git a/libraries/emitters/src/IRModuleEmitter.cpp b/libraries/emitters/src/IRModuleEmitter.cpp index c9fec8ed9..8cf9a1a72 100644 --- a/libraries/emitters/src/IRModuleEmitter.cpp +++ b/libraries/emitters/src/IRModuleEmitter.cpp @@ -20,18 +20,11 @@ #include #include -#include #include -#include -#include #include -#include -#include #include -#include #include #include -#include #include #include @@ -52,13 +45,15 @@ namespace emitters IRModuleEmitter::IRModuleEmitter(const std::string& moduleName, const CompilerOptions& parameters) : _llvmContext(std::make_unique()), _llvmModule(std::make_unique(moduleName, *_llvmContext)), - _emitter(*this, *_llvmContext), - _runtime(*this), - _threadPool(*this), - _profiler(*this, parameters.profile) + _emitter(new IREmitter(*_llvmContext, *_llvmModule)), + _runtime(new IRRuntime(*this)), + _threadPool(new IRThreadPool(*this)), + _profiler(new IRProfiler(*this, parameters.profile)) { InitializeLLVM(); - InitializeGlobalPassRegistry(); + + // Create a diagnostic handler to record if there was an error + _diagnosticHandler = std::unique_ptr(new IRDiagnosticHandler(*_llvmContext)); SetCompilerOptions(parameters); if (GetCompilerOptions().includeDiagnosticInfo) @@ -66,7 +61,7 @@ namespace emitters DeclarePrintf(); } - _profiler.Init(); + _profiler->Init(); } void IRModuleEmitter::SetCompilerOptions(const CompilerOptions& parameters) @@ -135,7 +130,7 @@ namespace emitters IRFunctionEmitter& IRModuleEmitter::BeginFunction(const std::string& functionName, VariableType returnType) { _functions[functionName] = FunctionDeclaration(functionName, returnType); - return BeginFunction(functionName, _emitter.Type(returnType)); + return BeginFunction(functionName, GetIREmitter().Type(returnType)); } IRFunctionEmitter& IRModuleEmitter::BeginFunction(const std::string& functionName, LLVMType returnType) @@ -151,21 +146,21 @@ namespace emitters fake.push_back({ "", t }); } _functions[functionName] = FunctionDeclaration(functionName, returnType, fake); - return BeginFunction(functionName, _emitter.Type(returnType), _emitter.GetLLVMTypes(args)); + return BeginFunction(functionName, GetIREmitter().Type(returnType), GetIREmitter().GetLLVMTypes(args)); } IRFunctionEmitter& IRModuleEmitter::BeginFunction(const std::string& functionName, VariableType returnType, const NamedVariableTypeList& args) { _functions[functionName] = FunctionDeclaration(functionName, returnType, args); - return BeginFunction(functionName, _emitter.Type(returnType), args); + return BeginFunction(functionName, GetIREmitter().Type(returnType), args); } IRFunctionEmitter& IRModuleEmitter::BeginFunction(const std::string& functionName, VariableType returnType, const FunctionArgumentList& args) { _functions[functionName] = FunctionDeclaration(functionName, returnType, args); Log() << "Begin emitting IR for function " << functionName << EOL; - auto currentPos = _emitter.GetCurrentInsertPoint(); - IRFunctionEmitter newFunction = Function(functionName, _emitter.Type(returnType), args, false); + auto currentPos = 
GetIREmitter().GetCurrentInsertPoint(); + IRFunctionEmitter newFunction = Function(functionName, GetIREmitter().Type(returnType), args, false); _functionStack.emplace(newFunction, currentPos); return _functionStack.top().first; } @@ -177,7 +172,7 @@ namespace emitters _functions[functionName] = FunctionDeclaration(functionName, VariableType::Custom, args); } Log() << "Begin emitting IR for function " << functionName << EOL; - auto currentPos = _emitter.GetCurrentInsertPoint(); + auto currentPos = GetIREmitter().GetCurrentInsertPoint(); IRFunctionEmitter newFunction = Function(functionName, returnType, args, false); _functionStack.emplace(newFunction, currentPos); return _functionStack.top().first; @@ -195,7 +190,7 @@ namespace emitters _functions[functionName] = FunctionDeclaration(functionName, ToVariableType(returnType), argInfo); } Log() << "Begin emitting IR for function " << functionName << EOL; - auto currentPos = _emitter.GetCurrentInsertPoint(); + auto currentPos = GetIREmitter().GetCurrentInsertPoint(); IRFunctionEmitter newFunction = Function(functionName, returnType, argTypes, false); _functionStack.emplace(newFunction, currentPos); return _functionStack.top().first; @@ -213,7 +208,7 @@ namespace emitters _functions[functionName] = FunctionDeclaration(functionName, ToVariableType(returnType), argInfo); } Log() << "Begin emitting IR for function " << functionName << EOL; - auto currentPos = _emitter.GetCurrentInsertPoint(); + auto currentPos = GetIREmitter().GetCurrentInsertPoint(); IRFunctionEmitter newFunction = Function(functionName, returnType, args, false); _functionStack.emplace(newFunction, currentPos); return _functionStack.top().first; @@ -266,7 +261,7 @@ namespace emitters currentFunction.ConcatRegions(); currentFunction.CompleteFunction(); } - _emitter.SetCurrentInsertPoint(previousPos); + GetIREmitter().SetCurrentInsertPoint(previousPos); Log() << "End emitting of function " << currentFunction.GetFunctionName() << EOL; } @@ -375,10 +370,10 @@ namespace emitters std::vector metadataElements; for (const auto& value : values) { - metadataElements.push_back({ llvm::MDString::get(_emitter.GetContext(), value) }); + metadataElements.push_back({ llvm::MDString::get(GetIREmitter().GetContext(), value) }); } - auto metadataNode = llvm::MDNode::get(_emitter.GetContext(), metadataElements); + auto metadataNode = llvm::MDNode::get(GetIREmitter().GetContext(), metadataElements); auto metadata = _llvmModule->getOrInsertNamedMetadata(tag); metadata->addOperand(metadataNode); } @@ -402,10 +397,10 @@ namespace emitters std::vector metadataElements; for (const auto& value : values) { - metadataElements.push_back({ llvm::MDString::get(_emitter.GetContext(), value) }); + metadataElements.push_back({ llvm::MDString::get(GetIREmitter().GetContext(), value) }); } - auto metadataNode = llvm::MDNode::get(_emitter.GetContext(), metadataElements); + auto metadataNode = llvm::MDNode::get(GetIREmitter().GetContext(), metadataElements); function->setMetadata(tag, metadataNode); } @@ -449,49 +444,52 @@ namespace emitters llvm::GlobalVariable* IRModuleEmitter::Constant(VariableType type, const std::string& name, double value) { - return AddGlobal(name, _emitter.Type(type), _emitter.Literal(value), true); + return AddGlobal(name, GetIREmitter().Type(type), GetIREmitter().Literal(value), true); } - llvm::GlobalVariable* IRModuleEmitter::Global(VariableType type, const std::string& name) + llvm::GlobalVariable* IRModuleEmitter::Global(VariableType type, const std::string& name, bool isThreadLocal) { - 
return AddGlobal(name, _emitter.Type(type), _emitter.Zero(type), false); + return AddGlobal(name, GetIREmitter().Type(type), GetIREmitter().Zero(type), false, isThreadLocal); } - llvm::GlobalVariable* IRModuleEmitter::Global(LLVMType pType, const std::string& name) + llvm::GlobalVariable* IRModuleEmitter::Global(LLVMType pType, const std::string& name, bool isThreadLocal) { auto initializer = ZeroInitializer(pType); - return AddGlobal(name, pType, initializer, false); + return AddGlobal(name, pType, initializer, false, isThreadLocal); } - llvm::GlobalVariable* IRModuleEmitter::GlobalPointer(const std::string& name, VariableType type) + llvm::GlobalVariable* IRModuleEmitter::GlobalPointer(const std::string& name, VariableType type, bool isThreadLocal) { - llvm::PointerType* pointerType = _emitter.Type(type)->getPointerTo(); - return AddGlobal(name, pointerType, _emitter.NullPointer(pointerType), false); + llvm::PointerType* pointerType = GetIREmitter().Type(type)->getPointerTo(); + return AddGlobal(name, pointerType, GetIREmitter().NullPointer(pointerType), false, isThreadLocal); } - llvm::GlobalVariable* IRModuleEmitter::GlobalArray(VariableType type, const std::string& name, const size_t size) + llvm::GlobalVariable* IRModuleEmitter::GlobalArray(VariableType type, const std::string& name, const size_t size, bool isThreadLocal) { - llvm::ArrayType* pArrayType = _emitter.ArrayType(type, size); - return AddGlobal(name, pArrayType, ZeroInitializer(pArrayType), false); + llvm::ArrayType* pArrayType = GetIREmitter().ArrayType(type, size); + return AddGlobal(name, pArrayType, ZeroInitializer(pArrayType), false, isThreadLocal); } - llvm::GlobalVariable* IRModuleEmitter::GlobalArray(const std::string& name, LLVMType pType, const size_t size) + llvm::GlobalVariable* IRModuleEmitter::GlobalArray(const std::string& name, LLVMType pType, const size_t size, bool isThreadLocal) { assert(pType != nullptr); llvm::ArrayType* pArrayType = llvm::ArrayType::get(pType, size); - return AddGlobal(name, pArrayType, ZeroInitializer(pArrayType), false); + return AddGlobal(name, pArrayType, ZeroInitializer(pArrayType), false, isThreadLocal); } // This function has the actual implementation for all the above Global/GlobalArray() methods - llvm::GlobalVariable* IRModuleEmitter::AddGlobal(const std::string& name, LLVMType pType, llvm::Constant* pInitial, bool isConst) + llvm::GlobalVariable* IRModuleEmitter::AddGlobal(const std::string& name, LLVMType pType, llvm::Constant* pInitial, bool isConst, bool isThreadLocal) { + CompilerOptions options = GetCompilerOptions(); _llvmModule->getOrInsertGlobal(name, pType); auto global = _llvmModule->getNamedGlobal(name); + global->setAlignment(options.globalValueAlignment); global->setInitializer(pInitial); global->setConstant(isConst); global->setExternallyInitialized(false); global->setLinkage(llvm::GlobalValue::LinkageTypes::InternalLinkage); + global->setThreadLocal(isThreadLocal); assert(llvm::isa(global)); return llvm::cast(global); } @@ -503,7 +501,7 @@ namespace emitters LLVMFunction IRModuleEmitter::DeclareFunction(const std::string& name, VariableType returnType) { _functions[name] = FunctionDeclaration(name, returnType); - return _emitter.DeclareFunction(GetLLVMModule(), name, returnType); + return GetIREmitter().DeclareFunction(GetLLVMModule(), name, returnType); } LLVMFunction IRModuleEmitter::DeclareFunction(const std::string& name, VariableType returnType, const VariableTypeList& arguments) @@ -516,18 +514,19 @@ namespace emitters // record this function definition 
in our local _functions so that these definitions can be found via GetFunctionNames // and GetCallbackFunctionNames _functions[name] = FunctionDeclaration(name, returnType, fake); - return _emitter.DeclareFunction(GetLLVMModule(), name, returnType, arguments); + return GetIREmitter().DeclareFunction(GetLLVMModule(), name, returnType, arguments); } LLVMFunction IRModuleEmitter::DeclareFunction(const std::string& name, VariableType returnType, const NamedVariableTypeList& arguments) { _functions[name] = FunctionDeclaration(name, returnType, arguments); - return _emitter.DeclareFunction(GetLLVMModule(), name, returnType, arguments); + return GetIREmitter().DeclareFunction(GetLLVMModule(), name, returnType, arguments); } LLVMFunction IRModuleEmitter::DeclareFunction(const std::string& name, llvm::FunctionType* functionType) { - return _emitter.DeclareFunction(GetLLVMModule(), name, functionType); + // TODO: add to _functions list + return GetIREmitter().DeclareFunction(GetLLVMModule(), name, functionType); } IRFunctionEmitter IRModuleEmitter::Function(const std::string& name, VariableType returnType, bool isPublic) @@ -548,7 +547,8 @@ namespace emitters IRFunctionEmitter IRModuleEmitter::Function(const std::string& name, VariableType returnType, const NamedVariableTypeList& arguments, bool isPublic) { - LLVMFunction pFunction = _emitter.Function(GetLLVMModule(), name, returnType, Linkage(isPublic), arguments); + // TODO: add this function to the _functions list?? + LLVMFunction pFunction = GetIREmitter().Function(GetLLVMModule(), name, returnType, Linkage(isPublic), arguments); if (pFunction == nullptr) { throw EmitterException(EmitterError::functionNotFound); @@ -560,7 +560,8 @@ namespace emitters IRFunctionEmitter IRModuleEmitter::Function(const std::string& name, LLVMType returnType, const NamedVariableTypeList& arguments, bool isPublic) { - LLVMFunction pFunction = _emitter.Function(GetLLVMModule(), name, returnType, Linkage(isPublic), arguments); + // TODO: add this function to the _functions list?? + LLVMFunction pFunction = GetIREmitter().Function(GetLLVMModule(), name, returnType, Linkage(isPublic), arguments); if (pFunction == nullptr) { throw EmitterException(EmitterError::functionNotFound); @@ -572,7 +573,8 @@ namespace emitters IRFunctionEmitter IRModuleEmitter::Function(const std::string& name, LLVMType returnType, const FunctionArgumentList& arguments, bool isPublic) { - LLVMFunction pFunction = _emitter.Function(GetLLVMModule(), name, returnType, Linkage(isPublic), arguments); + // TODO: add this function to the _functions list?? + LLVMFunction pFunction = GetIREmitter().Function(GetLLVMModule(), name, returnType, Linkage(isPublic), arguments); if (pFunction == nullptr) { throw EmitterException(EmitterError::functionNotFound); @@ -584,7 +586,8 @@ namespace emitters IRFunctionEmitter IRModuleEmitter::Function(const std::string& name, VariableType returnType, const VariableTypeList* pArguments, bool isPublic) { - LLVMFunction pFunction = _emitter.Function(GetLLVMModule(), name, returnType, Linkage(isPublic), pArguments); + // TODO: add this function to the _functions list?? 
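+    // Usage sketch for the thread-local globals added above (names and sizes are
+    // illustrative). Each thread sees its own copy of the variable; this pairs with
+    // setEmulatedTLS(true) on the JIT side for targets without native TLS support:
+    //
+    //     auto counter = module.Global(VariableType::Int32, "tls_counter", /*isThreadLocal*/ true);
+    //     auto scratch = module.GlobalArray(VariableType::Double, "tls_scratch", 256, /*isThreadLocal*/ true);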
+ LLVMFunction pFunction = GetIREmitter().Function(GetLLVMModule(), name, returnType, Linkage(isPublic), pArguments); if (pFunction == nullptr) { throw EmitterException(EmitterError::functionNotFound); @@ -594,7 +597,8 @@ namespace emitters IRFunctionEmitter IRModuleEmitter::Function(const std::string& name, LLVMType returnType, const std::vector& argTypes, bool isPublic) { - LLVMFunction pFunction = _emitter.Function(GetLLVMModule(), name, returnType, Linkage(isPublic), argTypes); + // TODO: add this function to the _functions list?? + LLVMFunction pFunction = GetIREmitter().Function(GetLLVMModule(), name, returnType, Linkage(isPublic), argTypes); if (pFunction == nullptr) { throw EmitterException(EmitterError::functionNotFound); @@ -604,7 +608,8 @@ namespace emitters IRFunctionEmitter IRModuleEmitter::Function(const std::string& name, LLVMType returnType, const NamedLLVMTypeList& arguments, bool isPublic) { - LLVMFunction pFunction = _emitter.Function(GetLLVMModule(), name, returnType, Linkage(isPublic), arguments); + // TODO: add this function to the _functions list?? + LLVMFunction pFunction = GetIREmitter().Function(GetLLVMModule(), name, returnType, Linkage(isPublic), arguments); if (pFunction == nullptr) { throw EmitterException(EmitterError::functionNotFound); @@ -622,16 +627,21 @@ namespace emitters return GetLLVMModule()->getFunction(name); } + LLVMFunction IRModuleEmitter::GetIntrinsic(llvm::Intrinsic::ID id) + { + return GetIREmitter().GetIntrinsic(GetLLVMModule(), id, VariableTypeList{}); + } + LLVMFunction IRModuleEmitter::GetIntrinsic(llvm::Intrinsic::ID id, const std::initializer_list& arguments) { VariableTypeList valueTypeList = arguments; - return _emitter.GetIntrinsic(GetLLVMModule(), id, valueTypeList); + return GetIREmitter().GetIntrinsic(GetLLVMModule(), id, valueTypeList); } LLVMFunction IRModuleEmitter::GetIntrinsic(llvm::Intrinsic::ID id, const std::initializer_list& arguments) { LLVMTypeList valueTypeList = arguments; - return _emitter.GetIntrinsic(GetLLVMModule(), id, valueTypeList); + return GetIREmitter().GetIntrinsic(GetLLVMModule(), id, valueTypeList); } // @@ -643,7 +653,7 @@ namespace emitters NamedLLVMTypeList llvmFields; for (auto& field : fields) { - llvmFields.emplace_back(field.first, _emitter.Type(field.second)); + llvmFields.emplace_back(field.first, GetIREmitter().Type(field.second)); } return GetOrCreateStruct(name, llvmFields); } @@ -666,7 +676,7 @@ namespace emitters llvm::StructType* IRModuleEmitter::GetOrCreateStruct(const std::string& name, const LLVMTypeList& fields) { - if (auto structType = _emitter.GetStruct(name)) + if (auto structType = GetIREmitter().GetStruct(name)) { // Check that existing struct fields match the ones we're trying to create auto structFields = structType->elements(); @@ -687,24 +697,24 @@ namespace emitters return structType; } - return _emitter.DeclareStruct(name, fields); + return GetIREmitter().DeclareStruct(name, fields); } llvm::StructType* IRModuleEmitter::GetAnonymousStructType(const LLVMTypeList& fieldTypes, bool packed) { - return _emitter.GetAnonymousStructType(fieldTypes, packed); + return GetIREmitter().GetAnonymousStructType(fieldTypes, packed); } llvm::StructType* IRModuleEmitter::GetStruct(const std::string& name) { - return _emitter.GetStruct(name); + return GetIREmitter().GetStruct(name); } // // Code output / input // - void IRModuleEmitter::WriteToFile(const std::string& filePath, ModuleOutputFormat format) + MachineCodeOutputOptions IRModuleEmitter::GetMachineCodeOutputOptions() const { 
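+        // Note: useFastMath now fans out to the whole fast-math flag family below
+        // (unsafe, no-infs, no-nans, no-signed-zeros), mirroring the llvm::TargetOptions
+        // configured where createTargetMachine is invoked further down.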
MachineCodeOutputOptions options; auto compilerOptions = GetCompilerOptions(); @@ -716,6 +726,11 @@ namespace emitters options.verifyModule = true; options.floatFusionMode = compilerOptions.useFastMath ? FloatFusionMode::Fast : FloatFusionMode::Standard; + options.unsafeFPMath = compilerOptions.useFastMath; + options.noInfsFPMath = compilerOptions.useFastMath; + options.noNaNsFPMath = compilerOptions.useFastMath; + options.noSignedZerosFPMath = compilerOptions.useFastMath; + if (compilerOptions.positionIndependentCode.HasValue()) { options.relocModel = compilerOptions.positionIndependentCode.GetValue() ? OutputRelocationModel::PIC_ : OutputRelocationModel::Static; @@ -727,7 +742,15 @@ namespace emitters } // Other params to possibly set: + // bool verboseOutput = false; + // bool verifyModule = false; // FloatABIType floatABI = FloatABIType::Default; + return options; + } + + void IRModuleEmitter::WriteToFile(const std::string& filePath, ModuleOutputFormat format) + { + auto options = GetMachineCodeOutputOptions(); switch (format) { @@ -779,18 +802,7 @@ namespace emitters void IRModuleEmitter::WriteToStream(std::ostream& stream, ModuleOutputFormat format) { - MachineCodeOutputOptions options; - auto compilerOptions = GetCompilerOptions(); - options.targetDevice = compilerOptions.targetDevice; - if (compilerOptions.optimize) - { - options.optimizationLevel = OptimizationLevel::Aggressive; - } - // Other params to possibly set: - // bool verboseOutput = false; - // bool verifyModule = false; - // FloatABIType floatABI = FloatABIType::Default; - // FloatFusionMode floatFusionMode = FloatFusionMode::Standard; + auto options = GetMachineCodeOutputOptions(); WriteToStream(stream, format, options); } @@ -836,16 +848,56 @@ namespace emitters void IRModuleEmitter::LoadIR(const std::string& text) { - llvm::MemoryBufferRef buffer(text, ""); // See Parser.cpp in LLVM code base for why... 
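+        // (The replacement below wraps the text in a MemoryBuffer so the parser sees a
+        // null-terminated buffer, which is presumably what the old "See Parser.cpp"
+        // comment was alluding to.) Illustrative use of the new entry points:
+        //
+        //     module.LoadIR("define i32 @three() {\nentry:\n  ret i32 3\n}\n");
+        //     std::ifstream file("extra.ll"); // hypothetical file name
+        //     module.LoadIR(file);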
+        auto buffer = llvm::MemoryBuffer::getMemBuffer(text);
+        llvm::SMDiagnostic errorHandler;
+        bool hadError = llvm::parseAssemblyInto(*buffer, GetLLVMModule(), nullptr, errorHandler);
+        if (hadError)
+        {
+            std::string message = errorHandler.getMessage();
+            throw EmitterException(EmitterError::parserError, message);
+        }
+    }
+
+    void IRModuleEmitter::LoadIR(std::istream& stream)
+    {
+        std::string irStr(std::istreambuf_iterator<char>(stream), {});
+        LoadIR(irStr);
+    }
+
+    void IRModuleEmitter::LoadIRFromFile(const std::string& filename)
+    {
+        auto buffer = llvm::MemoryBuffer::getFile(filename);
+        if (!buffer)
+        {
+            throw EmitterException(EmitterError::parserError, "Unable to open " + filename);
+        }
+
         llvm::SMDiagnostic errorHandler;
-        bool hadError = llvm::parseAssemblyInto(buffer, GetLLVMModule(), nullptr, errorHandler);
+        bool hadError = llvm::parseAssemblyInto(*(buffer->get()), GetLLVMModule(), nullptr, errorHandler);
         if (hadError)
         {
-            std::string message = errorHandler.getMessage(); //IRLoader::ErrorToString(errorHandler);
+            std::string message = errorHandler.getMessage();
             throw EmitterException(EmitterError::parserError, message);
         }
     }
 
+    void IRModuleEmitter::LoadAsm(const std::string& text)
+    {
+        GetLLVMModule()->appendModuleInlineAsm(text);
+    }
+
+    void IRModuleEmitter::LoadAsm(std::istream& stream)
+    {
+        std::string asmStr(std::istreambuf_iterator<char>(stream), {});
+        LoadAsm(asmStr);
+    }
+
+    void IRModuleEmitter::LoadAsmFromFile(const std::string& filename)
+    {
+        auto stream = OpenIfstream(filename);
+        LoadAsm(stream);
+    }
+
     void IRModuleEmitter::WriteHeader(std::ostream& os)
     {
         WriteModuleHeader(os, *this);
@@ -905,16 +957,16 @@ namespace emitters
     {
         std::string globalName = GetModuleName() + "_debug_message_" + std::to_string(_globalStringIndex++);
         llvm::GlobalVariable* msg = ConstantArray(globalName, std::vector<char>(message.c_str(), message.c_str() + message.size() + 1));
-        auto& context = _emitter.GetContext();
+        auto& context = GetIREmitter().GetContext();
         auto charPointerType = llvm::Type::getInt8Ty(context)->getPointerTo();
-        llvm::Value* msgptr = _emitter.CastPointer(msg, charPointerType);
+        llvm::Value* msgptr = GetIREmitter().CastPointer(msg, charPointerType);
         auto function = DeclareDebugPrint();
-        _emitter.Call(function, msgptr);
+        GetIREmitter().Call(function, msgptr);
     }
 
     void IRModuleEmitter::DeclarePrintf()
     {
-        auto& context = _emitter.GetContext();
+        auto& context = GetIREmitter().GetContext();
         auto type = llvm::FunctionType::get(
             llvm::Type::getInt32Ty(context),
             { llvm::Type::getInt8PtrTy(context) },
@@ -983,7 +1035,14 @@ namespace emitters
        }
 
         auto relocModel = parameters.targetDevice.IsWindows() ? OutputRelocationModel::Static : OutputRelocationModel::PIC_;
-        const llvm::TargetOptions options;
+        llvm::TargetOptions options;
+        options.FloatABIType = llvm::FloatABI::Default;
+        options.AllowFPOpFusion = parameters.useFastMath ? llvm::FPOpFusion::Fast : llvm::FPOpFusion::Standard;
+        options.UnsafeFPMath = parameters.useFastMath ? 1 : 0;
+        options.NoInfsFPMath = parameters.useFastMath ? 1 : 0;
+        options.NoNaNsFPMath = parameters.useFastMath ? 1 : 0;
+        options.NoSignedZerosFPMath = parameters.useFastMath ? 1 : 0;
+
         const llvm::CodeModel::Model codeModel = llvm::CodeModel::Small;
         auto tm = target->createTargetMachine(llvm::Triple::normalize(parameters.targetDevice.triple),
                                               parameters.targetDevice.cpu,
@@ -1126,73 +1185,6 @@ namespace emitters
         return llvm::ConstantAggregateZero::get(pType);
     }
 
-    //
-    // Global LLVM state management
-    //
-    void IRModuleEmitter::InitializeLLVM()
-    {
-        // All targets
-        llvm::InitializeAllTargetInfos();
-        llvm::InitializeAllTargets();
-        llvm::InitializeAllTargetMCs();
-        llvm::InitializeAllAsmPrinters();
-        llvm::InitializeAllAsmParsers();
-        llvm::InitializeAllDisassemblers();
-
-        // Native target (perhaps unnecessary, since we're initializing _all_ the targets above)
-        llvm::InitializeNativeTarget();
-        llvm::InitializeNativeTargetAsmPrinter();
-        llvm::InitializeNativeTargetAsmParser();
-        llvm::InitializeNativeTargetDisassembler();
-
-        // Create a diagnostic handler to record if there was an error
-        _diagnosticHandler = std::unique_ptr<IRDiagnosticHandler>(new IRDiagnosticHandler(*_llvmContext));
-    }
-
-    llvm::PassRegistry* IRModuleEmitter::InitializeGlobalPassRegistry()
-    {
-        // Get the global pass registry
-        llvm::PassRegistry* registry = llvm::PassRegistry::getPassRegistry();
-
-        // Initialize all of the optimization passes (probably unnecessary)
-        llvm::initializeCore(*registry);
-        llvm::initializeCoroutines(*registry);
-        llvm::initializeScalarOpts(*registry);
-        llvm::initializeObjCARCOpts(*registry);
-        llvm::initializeVectorization(*registry);
-        llvm::initializeIPO(*registry);
-        llvm::initializeAnalysis(*registry);
-        llvm::initializeTransformUtils(*registry);
-        llvm::initializeInstCombine(*registry);
-        llvm::initializeAggressiveInstCombine(*registry);
-        llvm::initializeInstrumentation(*registry);
-        llvm::initializeTarget(*registry);
-        // For codegen passes, only passes that do IR to IR transformation are
-        // supported.
- llvm::initializeExpandMemCmpPassPass(*registry); - llvm::initializeScalarizeMaskedMemIntrinPass(*registry); - llvm::initializeCodeGenPreparePass(*registry); - llvm::initializeAtomicExpandPass(*registry); - llvm::initializeRewriteSymbolsLegacyPassPass(*registry); - llvm::initializeWinEHPreparePass(*registry); - llvm::initializeDwarfEHPreparePass(*registry); - llvm::initializeSafeStackLegacyPassPass(*registry); - llvm::initializeSjLjEHPreparePass(*registry); - llvm::initializePreISelIntrinsicLoweringLegacyPassPass(*registry); - llvm::initializeGlobalMergePass(*registry); - llvm::initializeIndirectBrExpandPassPass(*registry); - llvm::initializeInterleavedLoadCombinePass(*registry); - llvm::initializeInterleavedAccessPass(*registry); - llvm::initializeEntryExitInstrumenterPass(*registry); - llvm::initializePostInlineEntryExitInstrumenterPass(*registry); - llvm::initializeUnreachableBlockElimLegacyPassPass(*registry); - llvm::initializeExpandReductionsPass(*registry); - llvm::initializeWasmEHPreparePass(*registry); - llvm::initializeWriteBitcodePassPass(*registry); - - return registry; - } - template <> CallbackRegistry& IRModuleEmitter::GetCallbackRegistry() const { diff --git a/libraries/emitters/src/IRRuntime.cpp b/libraries/emitters/src/IRRuntime.cpp index dca4241df..51300e8c3 100644 --- a/libraries/emitters/src/IRRuntime.cpp +++ b/libraries/emitters/src/IRRuntime.cpp @@ -420,17 +420,22 @@ namespace emitters return _module.GetIntrinsic(llvm::Intrinsic::sin, { argType }); } - LLVMFunction IRRuntime::GetCosFunction(VariableType argType) + LLVMFunction IRRuntime::GetCosFunction(VariableType argType) { return _module.GetIntrinsic(llvm::Intrinsic::cos, { argType }); } - LLVMFunction IRRuntime::GetCopySignFunction(VariableType argType) + LLVMFunction IRRuntime::GetCopySignFunction(VariableType argType) { return _module.GetIntrinsic(llvm::Intrinsic::copysign, { argType }); } - LLVMFunction IRRuntime::GetTanhFunction(VariableType argType) + LLVMFunction IRRuntime::GetFmaFunction(VariableType argType) + { + return _module.GetIntrinsic(llvm::Intrinsic::fma, { argType }); + } + + LLVMFunction IRRuntime::GetTanhFunction(VariableType argType) { // This assumes a standard C runtime library is linked auto& emitter = _module.GetIREmitter(); @@ -529,6 +534,11 @@ namespace emitters return _module.GetIntrinsic(llvm::Intrinsic::copysign, { argType }); } + LLVMFunction IRRuntime::GetFmaFunction(LLVMType argType) + { + return _module.GetIntrinsic(llvm::Intrinsic::fma, { argType }); + } + LLVMFunction IRRuntime::GetPrefetchFunction() { return _module.GetIntrinsic(llvm::Intrinsic::prefetch, std::initializer_list{}); @@ -563,11 +573,10 @@ namespace emitters // got to the end of both strings, so they are equal, return 1. 
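+                    // (The emitted loop as a whole implements, in IR, roughly this C logic;
+                    // sketch only:
+                    //
+                    //     int StringsEqual(const char* a, const char* b)
+                    //     {
+                    //         for (int i = 0;; ++i)
+                    //         {
+                    //             if (a[i] == '\0' && b[i] == '\0') return 1; // both ended: equal
+                    //             if (a[i] == '\0' || b[i] == '\0' || a[i] != b[i]) return 0;
+                    //         }
+                    //     }
+                    // )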
fn.Store(result, fn.Literal(1)); fn.Store(continuing, fn.FalseBit()); - }) - .ElseIf(achar == zero || bchar == zero || achar != bchar, [continuing](emitters::IRFunctionEmitter& fn) { - // terminate loop with 0 result - fn.Store(continuing, fn.FalseBit()); - }); + }).ElseIf(achar == zero || bchar == zero || achar != bchar, [continuing](emitters::IRFunctionEmitter& fn) { + // terminate loop with 0 result + fn.Store(continuing, fn.FalseBit()); + }); fn.Store(index, indexValue + 1); }); diff --git a/libraries/emitters/src/LLVMUtilities.cpp b/libraries/emitters/src/LLVMUtilities.cpp index 49710854f..5da919227 100644 --- a/libraries/emitters/src/LLVMUtilities.cpp +++ b/libraries/emitters/src/LLVMUtilities.cpp @@ -9,9 +9,11 @@ #include "LLVMUtilities.h" #include "EmitterException.h" -#include -#include +#include "build/LLVMEmitterTargets.h" + #include +#include +#include namespace ell { @@ -90,16 +92,20 @@ namespace emitters } else { - if (type->isVoidTy()) { + if (type->isVoidTy()) + { return VariableType::Void; } - else if (type->isDoubleTy()) { + else if (type->isDoubleTy()) + { return VariableType::Double; } - else if (type->isFloatTy()) { + else if (type->isFloatTy()) + { return VariableType::Float; } - else if (type->isIntegerTy()) { + else if (type->isIntegerTy()) + { switch (type->getIntegerBitWidth()) { case 8: @@ -116,5 +122,75 @@ namespace emitters return VariableType::Custom; } + namespace + { + void InitializeLLVMTargets() + { + // This block is part of a X-Macro. LLVM_EMITTER_TARGETS below is + // defined in build/LLVMEmitterTargets.h at CMake configure time. + // It is dependent on the value of the CMake variable LLVM_EMITTER_TARGETS. + // For each LLVM target specified in that variable, EMITTER_TARGET_ACTION + // below gets called +#define EMITTER_TARGET_ACTION(TargetName) \ + LLVMInitialize##TargetName##TargetInfo(); \ + LLVMInitialize##TargetName##Target(); \ + LLVMInitialize##TargetName##TargetMC(); \ + LLVMInitialize##TargetName##AsmPrinter(); \ + LLVMInitialize##TargetName##AsmParser(); \ + LLVMInitialize##TargetName##Disassembler(); + LLVM_EMITTER_TARGETS +#undef EMITTER_TARGET_ACTION + + llvm::InitializeNativeTarget(); + } + + void InitializeGlobalPassRegistry() + { + // Get the global pass registry + llvm::PassRegistry* registry = llvm::PassRegistry::getPassRegistry(); + + // Initialize all of the optimization passes (probably unnecessary) + llvm::initializeCore(*registry); + llvm::initializeScalarOpts(*registry); + llvm::initializeVectorization(*registry); + llvm::initializeIPO(*registry); + llvm::initializeAnalysis(*registry); + llvm::initializeTransformUtils(*registry); + llvm::initializeInstCombine(*registry); + llvm::initializeAggressiveInstCombine(*registry); + llvm::initializeInstrumentation(*registry); + llvm::initializeTarget(*registry); + llvm::initializeGlobalISel(*registry); + + // For codegen passes, only passes that do IR to IR transformation are + // supported. 
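+        // The LLVM_EMITTER_TARGETS X-Macro used in InitializeLLVMTargets above expands
+        // one EMITTER_TARGET_ACTION(...) per backend chosen at CMake configure time.
+        // A self-contained sketch of the same pattern (target names made up):
+        //
+        //     #define DEMO_TARGETS \
+        //         EMITTER_TARGET_ACTION(ARM) \
+        //         EMITTER_TARGET_ACTION(X86)
+        //
+        //     #define EMITTER_TARGET_ACTION(TargetName) void Init##TargetName##Target();
+        //     DEMO_TARGETS // declares InitARMTarget() and InitX86Target()
+        //     #undef EMITTER_TARGET_ACTION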
+ llvm::initializeExpandMemCmpPassPass(*registry); + llvm::initializeScalarizeMaskedMemIntrinPass(*registry); + llvm::initializeCodeGenPreparePass(*registry); + llvm::initializeAtomicExpandPass(*registry); + llvm::initializeRewriteSymbolsLegacyPassPass(*registry); + llvm::initializeWinEHPreparePass(*registry); + llvm::initializeDwarfEHPreparePass(*registry); + llvm::initializeSafeStackLegacyPassPass(*registry); + llvm::initializeSjLjEHPreparePass(*registry); + llvm::initializePreISelIntrinsicLoweringLegacyPassPass(*registry); + llvm::initializeGlobalMergePass(*registry); + llvm::initializeIndirectBrExpandPassPass(*registry); + llvm::initializeInterleavedLoadCombinePass(*registry); + llvm::initializeInterleavedAccessPass(*registry); + llvm::initializeEntryExitInstrumenterPass(*registry); + llvm::initializePostInlineEntryExitInstrumenterPass(*registry); + llvm::initializeUnreachableBlockElimLegacyPassPass(*registry); + llvm::initializeExpandReductionsPass(*registry); + llvm::initializeWriteBitcodePassPass(*registry); + } + } // namespace + + void InitializeLLVM() + { + InitializeLLVMTargets(); + InitializeGlobalPassRegistry(); + } + } // namespace emitters } // namespace ell diff --git a/libraries/emitters/src/TargetDevice.cpp b/libraries/emitters/src/TargetDevice.cpp index 923f18b09..90a933484 100644 --- a/libraries/emitters/src/TargetDevice.cpp +++ b/libraries/emitters/src/TargetDevice.cpp @@ -9,6 +9,7 @@ #include "TargetDevice.h" #include "EmitterException.h" #include "IRAssemblyWriter.h" // for OutputRelocationModel +#include "LLVMUtilities.h" #include #include @@ -219,6 +220,8 @@ namespace emitters void SetHostTargetProperties(TargetDevice& targetDevice) { + InitializeLLVM(); + auto hostTripleString = llvm::sys::getProcessTriple(); llvm::Triple hostTriple(hostTripleString); @@ -226,6 +229,20 @@ namespace emitters targetDevice.architecture = llvm::Triple::getArchTypeName(hostTriple.getArch()); targetDevice.cpu = llvm::sys::getHostCPUName(); + llvm::StringMap features; + llvm::sys::getHostCPUFeatures(features); + for (const auto& feature : features) + { + if (feature.second) + { + targetDevice.features += '+' + feature.first().str() + ","; + } + } + if (!targetDevice.features.empty()) + { + targetDevice.features.pop_back(); + } + SetTargetDataLayout(targetDevice); } diff --git a/libraries/emitters/templates/LLVMEmitterTargets.h.in b/libraries/emitters/templates/LLVMEmitterTargets.h.in new file mode 100644 index 000000000..9b66e6fde --- /dev/null +++ b/libraries/emitters/templates/LLVMEmitterTargets.h.in @@ -0,0 +1,16 @@ +// Auto-generated +// The contents of this file are based on the CMake variable LLVM_EMITTER_TARGETS +// Specified in libraries/emitters/CMakeLists.txt +// The generated preprocessor define is part of an X-Macro (https://en.wikipedia.org/wiki/X_Macro) +// The other part of the X-Macro relies on the definition of the macro EMITTER_TARGET_ACTION(TargetName) + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: LLVMEmitterTargets.h (emitters) +// Authors: Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#define LLVM_EMITTER_TARGETS \ +@emitter_targets_content@ diff --git a/libraries/emitters/test/include/IREmitterTest.h b/libraries/emitters/test/include/IREmitterTest.h index d5fda7574..2ee12b460 100644 --- a/libraries/emitters/test/include/IREmitterTest.h +++ 
b/libraries/emitters/test/include/IREmitterTest.h @@ -42,3 +42,5 @@ void TestElseIfWithComputedCondition(); void TestCastValue(); void TestCastToConditionalBool(); + +void TestInlineAssembly(); diff --git a/libraries/emitters/test/src/IREmitterTest.cpp b/libraries/emitters/test/src/IREmitterTest.cpp index 7ca4772b5..6a6cf23a6 100644 --- a/libraries/emitters/test/src/IREmitterTest.cpp +++ b/libraries/emitters/test/src/IREmitterTest.cpp @@ -23,6 +23,8 @@ #include #include +#include + #include #include #include @@ -1164,3 +1166,76 @@ void TestCastToConditionalBool() TestCastToConditionalBool(); TestCastToConditionalBool(); } + +void TestInlineAssembly() +{ + VariableType inType = emitters::GetVariableType(); + VariableType outType = emitters::GetVariableType(); + + auto module = MakeHostModuleEmitter("TestInlineAssembly"); + auto targetDevice = module.GetCompilerOptions().targetDevice; + auto functionIdentifier = targetDevice.IsMacOS() ? "_square" : "square"; + auto functionName = "square"; + + std::string asmStr; + if(targetDevice.IsWindows()) + { + asmStr= R"XX( + .globl FUNCTION +FUNCTION: + movl %ecx, %eax + imull %ecx, %eax + retq +)XX"; + } + else + { + asmStr= R"XX( + .globl FUNCTION +FUNCTION: + imull %edi, %edi + movl %edi, %eax + retq +)XX"; + } + + ReplaceAll(asmStr, "FUNCTION", functionIdentifier); + + module.GetLLVMModule()->appendModuleInlineAsm(asmStr); + + module.DeclareFunction(functionName, outType, { inType }); + module.DebugDump(); + + const emitters::NamedVariableTypeList parameters = { { "x", inType } }; + auto fn = module.BeginFunction("InlineAssembly", outType, parameters); + { + auto arguments = fn.Arguments().begin(); + auto x = fn.LocalScalar(&(*arguments++)); + auto squareFn = module.GetFunction(functionName); + squareFn->addFnAttr(llvm::Attribute::AttrKind::AlwaysInline); + auto result = fn.Call(squareFn, {x}); + + fn.Return(result); + } + module.EndFunction(); + +#if 0 + module.DebugDump(); + module.WriteToStream(std::cout, ModuleOutputFormat::assembly); +#endif + fn.Verify(); + + IRExecutionEngine jit(std::move(module)); + auto testFn = jit.GetFunction("InlineAssembly"); + + bool success = true; + auto trials = std::vector{ 1, 2, 35 }; + for (auto val : trials) + { + auto result = testFn(val); + auto expected = val * val; + success = success && (result == expected); + } + + testing::ProcessTest("Testing InlineAssembly", success); +} diff --git a/libraries/emitters/test/src/main.cpp b/libraries/emitters/test/src/main.cpp index 199e5f0f4..a3fc007d9 100644 --- a/libraries/emitters/test/src/main.cpp +++ b/libraries/emitters/test/src/main.cpp @@ -52,6 +52,8 @@ void TestIR() TestCastValue(); TestCastToConditionalBool(); + + TestInlineAssembly(); } void TestIRFunction() diff --git a/libraries/math/CMakeLists.txt b/libraries/math/CMakeLists.txt index 41fabdfc0..215a06aa8 100644 --- a/libraries/math/CMakeLists.txt +++ b/libraries/math/CMakeLists.txt @@ -9,6 +9,7 @@ if(MSVC) endif() include (OpenBLASSetup) +add_compile_options(-DUSE_OPENBLAS=1) set(src src/BlasWrapper.cpp src/Tensor.cpp diff --git a/libraries/math/src/BlasWrapper.cpp b/libraries/math/src/BlasWrapper.cpp index 3bc1cdc60..67eef22c4 100644 --- a/libraries/math/src/BlasWrapper.cpp +++ b/libraries/math/src/BlasWrapper.cpp @@ -10,8 +10,12 @@ #include "Matrix.h" #if USE_BLAS +#if USE_MKL +#include +#elif USE_OPENBLAS #include #endif +#endif #include // for hardware_concurrency() @@ -69,8 +73,12 @@ namespace math { numThreads = std::thread::hardware_concurrency(); } -#ifdef OPENBLAS_CONST +#if USE_BLAS +#if 
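A note on the two assembly bodies in TestInlineAssembly above: both implement int square(int), but the argument register differs by calling convention. The Microsoft x64 convention passes the first integer argument in ECX, while the System V ABI used on Linux and macOS passes it in EDI; both return in EAX, and macOS additionally prefixes C symbols with an underscore, hence the "_square" identifier. The mechanism under test is LLVM's module-level inline assembly:

    // Sketch: attach raw assembly text to the module, then declare a matching
    // function so emitted IR can call it; the symbol is resolved at JIT time.
    // (`module` is an IRModuleEmitter, as in the test above.)
    module.GetLLVMModule()->appendModuleInlineAsm(asmStr); // llvm::Module API
    module.DeclareFunction(functionName, outType, { inType });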
defined(USE_OPENBLAS) && defined(OPENBLAS_CONST) openblas_set_num_threads(numThreads); +#elif USE_MKL + mkl_set_num_threads(numThreads); +#endif #endif } diff --git a/libraries/model/include/Map.h b/libraries/model/include/Map.h index 8e842052c..16927d5ac 100644 --- a/libraries/model/include/Map.h +++ b/libraries/model/include/Map.h @@ -65,8 +65,6 @@ namespace model /// The other map. Map(const Map& other); - Map(Map&& other) = default; - /// Assignment operator. /// /// The other map. diff --git a/libraries/model/include/OutputPort.h b/libraries/model/include/OutputPort.h index d47949f6c..8564775ec 100644 --- a/libraries/model/include/OutputPort.h +++ b/libraries/model/include/OutputPort.h @@ -32,7 +32,6 @@ namespace model public: OutputPortBase() = default; OutputPortBase(const OutputPortBase& other) = delete; - OutputPortBase(OutputPortBase&& other) = default; /// Constructor /// diff --git a/libraries/model/src/CompilableCodeNode.cpp b/libraries/model/src/CompilableCodeNode.cpp index d121a9ce9..3f052c4be 100644 --- a/libraries/model/src/CompilableCodeNode.cpp +++ b/libraries/model/src/CompilableCodeNode.cpp @@ -77,7 +77,7 @@ namespace model const auto& inputs = GetInputPorts(); const auto& outputs = GetOutputPorts(); - std::vector parameters; + std::vector parameters; parameters.reserve(inputs.size() + outputs.size()); std::transform(inputs.begin(), inputs.end(), std::back_inserter(parameters), PortToValue); std::transform(outputs.begin(), outputs.end(), std::back_inserter(parameters), PortToValue); @@ -222,7 +222,7 @@ namespace model const auto& inputs = GetInputPorts(); const auto& outputs = GetOutputPorts(); - std::vector args; + std::vector args; args.reserve(inputs.size() + outputs.size()); std::transform(inputs.begin(), inputs.end(), std::back_inserter(args), PortToValue); diff --git a/libraries/model/src/Map.cpp b/libraries/model/src/Map.cpp index cb27db0f0..0b491e9be 100644 --- a/libraries/model/src/Map.cpp +++ b/libraries/model/src/Map.cpp @@ -88,6 +88,10 @@ namespace model AddOutput(name, transformer.GetCorrespondingOutputs(*outputPort)); } + _metadata = other._metadata; + + // TODO (kerha): _computeContext isn't copied right now. Not sure if it should be. 
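The BlasWrapper change above routes the thread-count setting to whichever BLAS the build selected. A condensed sketch of the resulting dispatch, assuming the build-defined USE_BLAS/USE_OPENBLAS/USE_MKL macros and the vendor header included as in the patch (openblas_set_num_threads and mkl_set_num_threads are the vendors' actual entry points; the OPENBLAS_CONST check appears to act as a proxy for "the OpenBLAS flavor of cblas.h is in use"):

    #include <thread> // for hardware_concurrency()

    void SetNumBlasThreads(int numThreads)
    {
        if (numThreads <= 0)
        {
            // 0 (or negative) means "use all hardware threads"
            numThreads = static_cast<int>(std::thread::hardware_concurrency());
        }
    #if USE_BLAS
    #if defined(USE_OPENBLAS) && defined(OPENBLAS_CONST)
        openblas_set_num_threads(numThreads); // OpenBLAS-wide setting
    #elif USE_MKL
        mkl_set_num_threads(numThreads); // MKL-wide setting
    #endif
    #else
        (void)numThreads; // no BLAS backend: nothing to configure
    #endif
    }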
[2019-08-23] + _model.Verify(); } @@ -326,6 +330,7 @@ namespace model swap(a._outputs, b._outputs); swap(a._outputNames, b._outputNames); swap(a._outputsMap, b._outputsMap); + swap(a._metadata, b._metadata); swap(a._computeContext, b._computeContext); } diff --git a/libraries/model/test/include/CompilableNodesTest.h b/libraries/model/test/include/CompilableNodesTest.h index b9176a25a..a464acbf3 100644 --- a/libraries/model/test/include/CompilableNodesTest.h +++ b/libraries/model/test/include/CompilableNodesTest.h @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -50,6 +51,10 @@ void TestReinterpretLayoutNode(); void TestReorderDataNode1(); void TestReorderDataNode2(); void TestReorderDataNode3(); +void TestReorderDataCodeNode1(); +void TestReorderDataCodeNode2(); +void TestReorderDataCodeNode3(); +void TestReorderDataCodeNode4(); void TestReceptiveFieldMatrixNode(size_t numChannels, bool useNewReshape); void TestCompilableAccumulatorNodeFunction(); void TestCompilableSourceNode(); @@ -70,6 +75,7 @@ void TestBufferNode(); void TestMatrixVectorMultiplyNode(int m, int n, bool useBlas); void TestMatrixMatrixMultiplyNode(int m, int n, int k, bool useBlas); void TestOrderedMatrixMatrixMultiplyNode(int m, int n, int k, bool transposeA, bool transposeB, bool transposeC, bool useBlas); +void TestMatrixMatrixMultiplyCodeNode(int m, int n, int k, int panelM, int panelN, int panelK, int kernelM, int kernelN, int kernelK, nodes::MatrixMatrixMultiplyImplementation gemmImpl); void TestBroadcasUnaryOperationNodeCompile(); void TestBroadcasBinaryOperationNodeCompileAdd(); @@ -107,8 +113,10 @@ void TestMaxPoolingLayerNode(size_t inRows, size_t inCols, size_t numChannels, s void TestMeanPoolingLayerNode(size_t inRows, size_t inCols, size_t numChannels, size_t outRows, size_t outCols, size_t poolingSize, size_t poolingStride, size_t inputPadding = 0, size_t outputPadding = 0); void TestScalingLayerNode(size_t inputPadding = 0, size_t outputPadding = 0); void TestSoftmaxLayerNode(size_t inputPadding = 0, size_t outputPadding = 0); +void TestSpatialConvolutionNode(size_t inputPadding = 1, size_t outputPadding = 0); void TestFusedLinearLayerNodes(size_t rows, size_t columns, size_t channels); void TestRegionDetectionNode(); +void TestIRNode(); #pragma region implementation diff --git a/libraries/model/test/src/CompilableNodesTest.cpp b/libraries/model/test/src/CompilableNodesTest.cpp index cda5cd546..0b5d9a1a6 100644 --- a/libraries/model/test/src/CompilableNodesTest.cpp +++ b/libraries/model/test/src/CompilableNodesTest.cpp @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include @@ -59,10 +60,12 @@ #include #include #include +#include #include #include #include #include +#include #include #include #include @@ -1174,6 +1177,135 @@ void TestReorderDataNode3() VerifyCompiledOutput(map, compiledMap, signal, "ReorderDataNode"); } +void TestReorderDataCodeNode1() +{ + using ElementType = float; + int numRows = 2; + int numColumns = 3; + int numChannels = 16; + model::Model model; + model::PortMemoryLayout inputLayout(model::MemoryShape{ numRows, numColumns, numChannels }); // Default order: 0, 1, 2 == rows, columns, channels + auto outputLayout = inputLayout.ReorderedCopy({ 2, 0, 1 }); + + size_t inputSize = inputLayout.GetMemorySize(); + auto inputNode = model.AddNode>(inputSize); + const auto& testOutput = ReorderDataWithCodeNode(inputNode->output, inputLayout, outputLayout); + auto map = model::Map(model, { { "input", inputNode } }, { { "output", testOutput } }); + + // 
First, create the input tensor and expected output + math::ChannelColumnRowTensor input(numRows, numColumns, numChannels); + math::ColumnRowChannelTensor expectedOutput(numRows, numColumns, numChannels); + + // Next, verify that the compiled output is correct + FillTensor(input, 1, 1); + FillTensor(expectedOutput, 1, 1); + + Log() << "Input:" << EOL << input.ToArray() << EOL; + + std::string name = "TestReorderDataCodeNode1"; + TestWithSerialization(map, name, [&](model::Map& map, int iteration) { + model::IRMapCompiler compiler; + auto compiledMap = compiler.Compile(map); + + // compare output + std::vector> signal = { input.ToArray() }; + std::vector> expected = { expectedOutput.ToArray() }; + VerifyCompiledOutputAndResult(map, compiledMap, signal, expected, utilities::FormatString("%s iteration %d", name.c_str(), iteration)); + }); +} + +void TestReorderDataCodeNode2() +{ + using ElementType = float; + int numRows = 3; + int numColumns = 3; + int numChannels = 16; + int padding = 1; + model::Model model; + model::PortMemoryLayout inputLayout(model::MemoryShape{ numRows, numColumns, numChannels }, model::MemoryShape{ padding, padding, 0 }); // Default order: 0, 1, 2 == rows, columns, channels + auto outputLayout = inputLayout.ReorderedCopy({ 2, 0, 1 }); + + size_t inputSize = inputLayout.GetMemorySize(); + auto inputNode = model.AddNode>(inputSize); + const auto& testOutput = ReorderDataWithCodeNode(inputNode->output, inputLayout, outputLayout, std::vector{ 2, 0, 1 }); + auto map = model::Map(model, { { "input", inputNode } }, { { "output", testOutput } }); + model::IRMapCompiler compiler; + auto compiledMap = compiler.Compile(map); + + std::vector input(inputSize); + FillVector(input, 1.0f); + Log() << "Input:" << EOL << input << EOL; + + // compare output + std::vector> signal = { input }; + VerifyCompiledOutput(map, compiledMap, signal, "ReorderDataCodeNode2"); +} + +void TestReorderDataCodeNode3() +{ + using ElementType = float; + int numRows = 3; + int numColumns = 4; + int numChannels = 2; + int padding = 1; + model::Model model; + model::PortMemoryLayout inputLayout(model::MemoryShape{ numRows, numColumns, numChannels }, model::MemoryShape{ padding, padding, 0 }); // Default order: 0, 1, 2 == rows, columns, channels + auto outputLayout = inputLayout.ReorderedCopy({ 2, 0, 1 }); + + size_t inputSize = inputLayout.GetMemorySize(); + auto inputNode = model.AddNode>(inputSize); + const auto& testOutput = ReorderDataWithCodeNode(inputNode->output, inputLayout, outputLayout); + auto map = model::Map(model, { { "input", inputNode } }, { { "output", testOutput } }); + model::IRMapCompiler compiler; + auto compiledMap = compiler.Compile(map); + + std::vector input(inputSize); + FillVector(input, 1.0f); + Log() << "Input:" << EOL << input << EOL; + + // compare output + std::vector> signal = { input }; + VerifyCompiledOutput(map, compiledMap, signal, "ReorderDataCodeNode3"); +} + +void TestReorderDataCodeNode4() +{ + using ElementType = float; + int numRows = 2; + int numColumns = 5; + model::Model model; + model::PortMemoryLayout inputLayout(model::MemoryShape{ numRows, numColumns }); + auto outputLayout = inputLayout.ReorderedCopy({ 1, 0 }); + + size_t inputSize = inputLayout.GetMemorySize(); + auto inputNode = model.AddNode>(inputSize); + const auto& testOutput = ReorderDataWithCodeNode(inputNode->output, inputLayout, outputLayout); + auto map = model::Map(model, { { "input", inputNode } }, { { "output", testOutput } }); + + // First, create the input tensor and expected output + 
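The { 2, 0, 1 } permutation exercised by these ReorderDataCodeNode tests moves the channel dimension to the front, i.e. (row, column, channel) storage becomes (channel, row, column) storage. A minimal sketch of the index arithmetic being verified, ignoring padding and using hypothetical names:

    #include <cstddef>

    // Dense offset of logical entry (row, col, ch) after ReorderedCopy({ 2, 0, 1 }):
    // storage dimension order is channel, row, column.
    inline size_t CrcOffset(size_t row, size_t col, size_t ch, size_t numRows, size_t numCols)
    {
        return (ch * numRows + row) * numCols + col;
    }

    // The node is correct when, for every (row, col, ch),
    //   output[CrcOffset(row, col, ch, numRows, numCols)]
    //     == input[(row * numCols + col) * numChannels + ch]  // original RCC offset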
math::RowMatrix input(numRows, numColumns); + math::ColumnMatrix expectedOutput(numRows, numColumns); + + // Next, verify that the compiled output is correct + FillMatrix(input, 1, 1); + FillMatrix(expectedOutput, 1, 1); + + std::vector> signal = { input.ToArray() }; + std::vector> expected = { expectedOutput.ToArray() }; + + Log() << "Input:" << EOL << input.ToArray() << EOL; + + std::string name = "TestReorderDataCodeNode4"; + TestWithSerialization(map, name, [&](model::Map& map, int iteration) { + model::IRMapCompiler compiler; + auto compiledMap = compiler.Compile(map); + + // compare output + VerifyCompiledOutputAndResult(map, compiledMap, signal, expected, utilities::FormatString("%s iteration %d", name.c_str(), iteration)); + }); +} + + void TestReceptiveFieldMatrixNode(size_t numChannels, bool useNewReshape) { const std::array rcdOrder = std::array{ 0, 1, 2 }; @@ -1644,6 +1776,59 @@ void TestOrderedMatrixMatrixMultiplyNode(int m, int n, int k, bool transposeA, b }); } +void TestMatrixMatrixMultiplyCodeNode(int m, int n, int k, int panelM, int panelN, int panelK, int kernelM, int kernelN, int kernelK, MatrixMatrixMultiplyImplementation gemmImpl) +{ + using ValueType = float; + std::vector matrixBVals(k * n); + FillRandomVector(matrixBVals); + + model::Model model; + auto inputMatrixNode = model.AddNode>(m * k); + auto matrixBNode = model.AddNode>(matrixBVals); + + int lda = k; + int ldb = n; + int ldc = n; + + auto matMatMultNode = model.AddNode>(inputMatrixNode->output, m, n, k, lda, matrixBNode->output, ldb, ldc, panelM, panelN, panelK, kernelM, kernelN, kernelK, gemmImpl); + + auto map = model::Map(model, { { "inputMatrix", inputMatrixNode } }, { { "output", matMatMultNode->output } }); + + std::string name = "MatrixMatrixMultiplyCodeNode"; + TestWithSerialization(map, name, [&](model::Map& map, int iteration) { + // compare output + std::vector matrixAVals(m * k); + FillRandomVector(matrixAVals); + std::vector> signal = { matrixAVals }; + + model::MapCompilerOptions settings; + model::ModelOptimizerOptions optimizerOptions; + model::IRMapCompiler compiler(settings, optimizerOptions); + auto compiledMap = compiler.Compile(map); + + std::vector expectedResult(m * n); + for (int i = 0; i < m; i++) + { + for (int j = 0; j < n; j++) + { + for (int kVal = 0; kVal < k; kVal++) + { + expectedResult[i * n + j] += matrixAVals[i * k + kVal] * matrixBVals[kVal * n + j]; + } + } + } + + std::vector> expected { expectedResult }; + std::stringstream id; + id << std::boolalpha << "MatrixMatrixMultiplyCodeNode(impl = " << static_cast(gemmImpl) + << ", m = " << m << ", n = " << n << ", k = " << k + << ", panelM = " << panelM << ", panelN = " << panelN << ", panelK = " << panelK + << ", kernelM = " << kernelM << ", kernelN = " << kernelN << ", kernelK = " << kernelK << ") iteration " << iteration; + + VerifyCompiledOutputAndResult(map, compiledMap, signal, expected, id.str()); + }); +} + // C callback (called by emitted code) static int lagNotificationCallbackCount = 0; extern "C" { @@ -3625,3 +3810,106 @@ void TestBroadcasBinaryOperationNodeCompileWithOrdering() auto computed = compiledMap.Compute(input1Vals); testing::ProcessTest("TestBroadcastBinaryOperationNodeCompileWithOrdering", testing::IsEqual(computed, expected)); } + +void TestSpatialConvolutionNode(size_t inputPaddingSize, size_t outputPaddingSize) +{ + // Abbreviations: + // + // r == # input rows + // c == # input columns + // ch == # input channels + // fw == filter width + // nf == # filters + // pi == input padding amount + // po ==
output padding amount + + // Data dimensions: + // + // Input: r x c x ch, with padding -> r+2pi x c+2pi x ch + // == 1 x 2 x 2, with padding == 1 -> 3 x 4 x 2 + // Weights: nf x fw x fw x ch + // == 2 x 3 x 3 x 2, (2 3x3 filters, with 2 input channels each) + // Output: r x c x nf, with padding -> 1+2po x 2+2po x 2 + // == 1 x 2 x 2, with padding == 0 -> 1 x 2 x 2 + + using ElementType = double; + using LayerParameters = typename Layer::LayerParameters; + using TensorType = typename Layer::TensorType; + using TensorReferenceType = typename Layer::TensorReferenceType; + using Shape = typename Layer::Shape; + + assert(inputPaddingSize == 1); + TensorType inputWithPadding(2 + 2 * inputPaddingSize, 2 + 2 * inputPaddingSize, 2); + TensorReferenceType input = inputWithPadding.GetSubTensor({ inputPaddingSize, inputPaddingSize, 0 }, { 2, 2, 2 }); + inputWithPadding.Fill(0); + input(0, 0, 0) = 2; + input(0, 1, 0) = 1; + input(0, 0, 1) = 3; + input(0, 1, 1) = 2; + // Input channel 0: [2, 3], input channel 1: [1, 2] + + Shape outputShape = { 2, 2, 2 }; + + LayerParameters parameters{ inputWithPadding, ZeroPadding(inputPaddingSize), outputShape, ZeroPadding(outputPaddingSize) }; + ConvolutionalParameters convolutionalParams{ 3, 1, ConvolutionMethod::automatic, 2 }; + + // Filter weights in `weightsVector` are in numFilters x numChannels x filterSize x filterSize order + // clang-format off + std::vector weightsVector { + 1, 3, 2, 3, 1, 1, 2, 3, 1, // Filter 1, channel 1 + 1, 2, 1, 2, 3, 2, 1, 2, 1}; // Filter 2, channel 2 + // clang-format on + + // Viewed as planar filters (ch x fw x fw): + // + // 1 3 2 + // f0 = 3 1 1 + // 2 3 1 + // + // 1 2 1 + // f1 = 2 3 2 + // 1 2 1 + + // Filter weights in `weights` tensor are in numFilters x filterSize x filterSize x 1 order + TensorType weights(convolutionalParams.receptiveField * outputShape.NumChannels(), convolutionalParams.receptiveField, 1); + + size_t vectorIndex = 0; + for (size_t f = 0; f < outputShape.NumChannels(); ++f) + { + for (size_t k = 0; k < 1; ++k) + { + for (size_t i = 0; i < convolutionalParams.receptiveField; ++i) + { + for (size_t j = 0; j < convolutionalParams.receptiveField; ++j) + { + weights(f * convolutionalParams.receptiveField + i, j, k) = weightsVector[vectorIndex++]; + } + } + } + } + + // + // Verify ConvolutionalLayerNode + // + ConvolutionalLayer layer(parameters, convolutionalParams, weights); + layer.Compute(); + auto output = layer.GetOutput(); + + // Create model + model::Model model; + auto inputMemoryLayout = utilities::MemoryLayout( + utilities::MemoryShape{ 2, 2, 2 }, + utilities::MemoryShape{ static_cast(inputPaddingSize), static_cast(inputPaddingSize), 0 }); + // BUGBUG: This fails when the order is not canonical order. 
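The repacking loops above lay the two planar filters end to end along the row dimension, producing the (nf * fw) x fw x 1 stacked-weights tensor the layer constructor expects. The index mapping, for the single-channel case here (k == 0):

    // weightsVector is flat in (filter, row, column) order with fw == 3, so
    //   weightsVector[(f * fw + i) * fw + j]  -->  weights(f * fw + i, j, 0)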
+ auto inputNode = model.AddNode>(inputMemoryLayout.ReorderedCopy({ 2, 0, 1 })); + auto outputMemoryLayout = utilities::MemoryLayout(utilities::MemoryShape{ 2, 2, 2 }); + auto computeNode = model.AddNode>(inputNode->output, layer, outputMemoryLayout); + auto map = model::Map(model, { { "input", inputNode } }, { { "output", computeNode->output } }); + + const auto info = "TestSpatialConvolutionalLayer"; + + VerifyLayerMap(map, computeNode, inputWithPadding, output, info); + + // Test archiving / unarchiving produces same result + VerifyArchiveAndUnarchivingMap(map, computeNode, inputWithPadding, output, info); +} diff --git a/libraries/model/test/src/model_compiler_test_main.cpp b/libraries/model/test/src/model_compiler_test_main.cpp index c20717aa7..567a2343c 100644 --- a/libraries/model/test/src/model_compiler_test_main.cpp +++ b/libraries/model/test/src/model_compiler_test_main.cpp @@ -11,6 +11,7 @@ #include "CompilerTest.h" #include "ModelHeaderOutputTest.h" #include "PerformanceCountersTest.h" +#include #include @@ -22,8 +23,25 @@ using namespace ell; using namespace ell::emitters; using namespace ell::predictors::neural; +void TestMatrixMatrixMultiplyCodeNodeImplementations() +{ + const int fallbackPanelM = 1; + const int fallbackPanelN = 1; + const int fallbackPanelK = 1; + const int fallbackKernelM = 1; + const int fallbackKernelN = 1; + const int fallbackKernelK = 1; + // Naive for-loop implementation + TestMatrixMatrixMultiplyCodeNode(1, 1, 1, fallbackPanelM, fallbackPanelN, fallbackPanelK, fallbackKernelM, fallbackKernelN, fallbackKernelK, nodes::MatrixMatrixMultiplyImplementation::SimpleForLoops); + TestMatrixMatrixMultiplyCodeNode(4, 4, 4, fallbackPanelM, fallbackPanelN, fallbackPanelK, fallbackKernelM, fallbackKernelN, fallbackKernelK, nodes::MatrixMatrixMultiplyImplementation::SimpleForLoops); + TestMatrixMatrixMultiplyCodeNode(4, 8, 8, fallbackPanelM, fallbackPanelN, fallbackPanelK, fallbackKernelM, fallbackKernelN, fallbackKernelK, nodes::MatrixMatrixMultiplyImplementation::SimpleForLoops); + TestMatrixMatrixMultiplyCodeNode(4, 4, 8, fallbackPanelM, fallbackPanelN, fallbackPanelK, fallbackKernelM, fallbackKernelN, fallbackKernelK, nodes::MatrixMatrixMultiplyImplementation::SimpleForLoops); +} + void TestIRCompiler() { + // TestIRNode(); // Failing on Windows + TestBufferNode(); TestBufferNode(); TestBufferNode(); @@ -99,6 +117,8 @@ void TestIRCompiler() // TestMatrixMatrixMultiplyNode(15, 25600, 27, false); // Fails due to numerical issues + TestMatrixMatrixMultiplyCodeNodeImplementations(); + TestCompilableScalarOutputNode(); TestCompilableVectorOutputNode(); TestCompilableAccumulatorNode(); @@ -120,7 +140,11 @@ void TestIRCompiler() TestReorderDataNode1(); TestReorderDataNode2(); TestReorderDataNode3(); - TestReceptiveFieldMatrixNode(1, true); // new version + TestReorderDataCodeNode1(); + TestReorderDataCodeNode2(); + TestReorderDataCodeNode3(); + TestReorderDataCodeNode4(); + TestReceptiveFieldMatrixNode(1, true); // new version TestReceptiveFieldMatrixNode(1, false); // old (slow) version TestReceptiveFieldMatrixNode(2, true); // new version // TestReceptiveFieldMatrixNode(2, false); // old (slow) version -- Fails @@ -239,6 +263,9 @@ void TestIRCompiler() TestConvolutionalLayerNode2(ConvolutionMethod::winograd, 1, 0); TestConvolutionalLayerNode3(ConvolutionMethod::winograd, 1, 0); + //BUGBUG: This test currently fails for Compute but passes for Compile. 
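TestMatrixMatrixMultiplyCodeNodeImplementations below pins every panel and kernel size to 1 and selects SimpleForLoops, so only the naive path is exercised. For orientation, a sketch of what the panelM/panelN/panelK parameters mean in a conventionally cache-blocked GEMM; this illustrates the general tiling scheme, not the node's exact emitted loop structure, and the kernel sizes (register-level sub-tiling of each panel) are omitted:

    #include <algorithm> // std::min

    // C (m x n) += A (m x k) * B (k x n), all row-major, no transposes.
    // C must be zero-initialized to get the pure product.
    void TiledGemm(const float* A, const float* B, float* C,
                   int m, int n, int k, int panelM, int panelN, int panelK)
    {
        for (int i0 = 0; i0 < m; i0 += panelM)
            for (int j0 = 0; j0 < n; j0 += panelN)
                for (int k0 = 0; k0 < k; k0 += panelK)
                    // One panel: a small GEMM sized to stay cache-resident
                    for (int i = i0; i < std::min(i0 + panelM, m); ++i)
                        for (int j = j0; j < std::min(j0 + panelN, n); ++j)
                        {
                            float sum = C[i * n + j];
                            for (int kk = k0; kk < std::min(k0 + panelK, k); ++kk)
                                sum += A[i * k + kk] * B[kk * n + j];
                            C[i * n + j] = sum;
                        }
    }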
+ //TestSpatialConvolutionNode(1, 0); + TestFullyConnectedLayerNode(); // TestFullyConnectedLayerNode(0, 1); // Fully-connected layer nodes can't have padding (yet) // TestFullyConnectedLayerNode(0, 2); // Fully-connected layer nodes can't have padding (yet) diff --git a/libraries/model_testing/include/ModelTestUtilities.h b/libraries/model_testing/include/ModelTestUtilities.h index d4e7aaf90..5874884cc 100644 --- a/libraries/model_testing/include/ModelTestUtilities.h +++ b/libraries/model_testing/include/ModelTestUtilities.h @@ -155,12 +155,18 @@ void FillTensor(ell::math::ChannelColumnRowTensor& tensor, ElementT template void FillTensor(math::TensorReference& tensor, ElementType startValue = 0, ElementType step = 1); +template +void FillTensor(ell::math::ColumnRowChannelTensor& tensor, ElementType startValue = 0, ElementType step = 1); + template void FillWeightsTensor(ell::math::ChannelColumnRowTensor& tensor, ElementType startValue = 0, ElementType step = 1); template void FillMatrix(math::RowMatrix& matrix, ElementType startValue = 0, ElementType step = 1); +template +void FillMatrix(math::ColumnMatrix& matrix, ElementType startValue = 0, ElementType step = 1); + #pragma region implementation template @@ -385,11 +391,9 @@ void VerifyCompiledOutput(model::Map& map, model::IRCompiledMap& compiledMap, co } } - /// Verify the compiled output matches the computed output, and also verify computed output matches a given expected output template -bool VerifyCompiledOutputAndResult(model::Map& map, model::IRCompiledMap& compiledMap, const std::vector>& signal, - const std::vector>& expectedOutput, const std::string& name, const std::string& additionalMessage, double epsilon) +bool VerifyCompiledOutputAndResult(model::Map& map, model::IRCompiledMap& compiledMap, const std::vector>& signal, const std::vector>& expectedOutput, const std::string& name, const std::string& additionalMessage, double epsilon) { bool ok = true; std::vector computedResult; @@ -430,7 +434,6 @@ bool VerifyCompiledOutputAndResult(model::Map& map, model::IRCompiledMap& compil return ok; } - template class Uniform { @@ -520,6 +523,23 @@ void FillTensor(math::TensorReference +void FillTensor(ell::math::ColumnRowChannelTensor& tensor, ElementType startValue, ElementType step) +{ + ElementType val = startValue; + for (size_t row = 0; row < tensor.NumRows(); row++) + { + for (size_t column = 0; column < tensor.NumColumns(); column++) + { + for (size_t channel = 0; channel < tensor.NumChannels(); channel++) + { + tensor(row, column, channel) = val; + val += step; + } + } + } +} + template void FillWeightsTensor(ell::math::ChannelColumnRowTensor& tensor, ElementType startValue, ElementType step) { @@ -534,10 +554,28 @@ template void FillMatrix(math::RowMatrix& matrix, ElementType startValue, ElementType step) { ElementType val = startValue; - matrix.Generate([&val, step]() { - auto result = val; - val += step; - return result; }); + for (size_t row = 0; row < matrix.NumRows(); row++) + { + for (size_t column = 0; column < matrix.NumColumns(); column++) + { + matrix(row, column) = val; + val += step; + } + } +} + +template +void FillMatrix(math::ColumnMatrix& matrix, ElementType startValue, ElementType step) +{ + ElementType val = startValue; + for (size_t row = 0; row < matrix.NumRows(); row++) + { + for (size_t column = 0; column < matrix.NumColumns(); column++) + { + matrix(row, column) = val; + val += step; + } + } } #pragma endregion implementation diff --git a/libraries/nodes/CMakeLists.txt b/libraries/nodes/CMakeLists.txt 
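The FillMatrix overloads above replace the previous matrix.Generate() call with explicit row/column loops. The likely motivation (an inference, not stated in the patch): Generate fills elements in storage order, which matches the logical row-major sequence for a RowMatrix but would number a ColumnMatrix down each column first. On a 2 x 3 matrix filled with 0, 1, 2, ...:

    // Logical values after FillMatrix(m, 0, 1) -- identical for both layouts:
    //   0 1 2
    //   3 4 5
    //
    // Underlying storage:
    //   RowMatrix:    0 1 2 3 4 5   (row-major)
    //   ColumnMatrix: 0 3 1 4 2 5   (column-major)
    //
    // A storage-order Generate() on the ColumnMatrix would instead yield the
    // logical values 0 2 4 / 1 3 5.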
index 5e70370ec..e9943b0e0 100644 --- a/libraries/nodes/CMakeLists.txt +++ b/libraries/nodes/CMakeLists.txt @@ -25,7 +25,8 @@ set(src src/IIRFilterNode.cpp src/IRNode.cpp src/LSTMNode.cpp + src/MatrixMatrixMultiplyCodeNode.cpp src/MatrixMatrixMultiplyNode.cpp src/MatrixVectorMultiplyNode.cpp src/NeuralNetworkPredictorNode.cpp src/PoolingLayerNode.cpp @@ -79,6 +81,7 @@ set(include include/L2NormSquaredNode.h include/LSTMNode.h include/LinearPredictorNode.h + include/MatrixMatrixMultiplyCodeNode.h include/MatrixMatrixMultiplyNode.h include/MatrixVectorMultiplyNode.h include/MatrixVectorProductNode.h @@ -94,6 +97,7 @@ set(include include/RNNNode.h include/RegionDetectionLayerNode.h include/ReinterpretLayoutNode.h + include/ReorderDataCodeNode.h include/ReorderDataNode.h include/ScalingLayerNode.h include/SimpleConvolutionNode.h @@ -101,6 +105,7 @@ set(include include/SinkNode.h include/SoftmaxLayerNode.h include/SourceNode.h + include/SpatialConvolutionNode.h include/SquaredEuclideanDistanceNode.h include/SumNode.h include/TypeCastNode.h diff --git a/libraries/nodes/include/BroadcastOperationNodes.h b/libraries/nodes/include/BroadcastOperationNodes.h index ad1051939..6f7d63ff0 100644 --- a/libraries/nodes/include/BroadcastOperationNodes.h +++ b/libraries/nodes/include/BroadcastOperationNodes.h @@ -10,15 +10,7 @@ #include "NodeOperations.h" -#include -#include -#include -#include -#include - -#include -#include -#include +#include #include #include #include @@ -27,6 +19,13 @@ #include #include +#include +#include +#include + +#include +#include + #include #include #include @@ -37,6 +36,10 @@ namespace ell { namespace nodes { + using UnaryScalarFunction = value::Scalar (*)(value::Scalar); + using BinaryScalarFunction = value::Scalar (*)(value::Scalar, value::Scalar); + using TernaryScalarFunction = value::Scalar (*)(value::Scalar, value::Scalar, value::Scalar); + /// /// /// Broadcast operation nodes perform elementwise operations on multidimensional arrays, using "broadcast" semantics.
If the @@ -51,7 +54,7 @@ namespace nodes // Base class for broadcast nodes template - class BroadcastOperationNode : public model::CompilableNode + class BroadcastOperationNode : public model::CompilableCodeNode { public: /// @name Output Port @@ -60,6 +63,8 @@ namespace nodes /// @} protected: + using KernelFunctionType = std::function& args)>; + BroadcastOperationNode(const std::vector& inputsPortRefs, const std::vector& inputs, ValueType padding = 0); @@ -75,59 +80,36 @@ namespace nodes const model::OutputPort& GetOutput() const; const model::InputPort& GetInput(int index) const; - const FunctionType& GetFunction() const; - template - void SetFunction(OpFunctionType&& function); - virtual ValueType ComputeOperation(const std::vector& args) const = 0; - virtual emitters::IRLocalScalar CompileOperation(const std::vector& args) const = 0; - - void Compute() const override; - void Compile(model::IRMapCompiler& compiler, emitters::IRFunctionEmitter& function) override; + void Define(ell::value::FunctionDeclaration& fn) override; + KernelFunctionType MakeKernel(FunctionType f) const; + virtual KernelFunctionType GetKernelFunction() const = 0; + value::Scalar CallKernelFunction(FunctionType f, std::vector inputs, std::vector> indices) const; bool HasState() const override { return true; } // stored state: function and padding value - protected: void WriteToArchive(utilities::Archiver& archiver) const override; void ReadFromArchive(utilities::Unarchiver& archiver) override; private: - void ComputeDimensionLoop(int dimension, - const std::vector& prevInputDimensionOffsets, - const std::vector& lastActiveInputDimensions, - const std::vector& inputValues, - int prevOutputDimensionOffset, - std::vector& output) const; - - void CompileDimensionLoop(model::IRMapCompiler& compiler, - emitters::IRFunctionEmitter& function, - int dimension, - const std::vector& inputs, - const std::vector& prevInputDimensionOffsets, - const std::vector& lastActiveInputDimensions, - const std::vector& inputValues, - emitters::IRLocalScalar prevOutputDimensionOffset, - emitters::IRLocalArray& output) const; - std::vector GetLastActiveInputDimensions() const; - utilities::ArchiveVersion GetArchiveVersion() const override; bool CanReadArchiveVersion(const utilities::ArchiveVersion& version) const override; ValueType GetOutputPadding() const { return _paddingValue; } model::OutputPort _output; - std::unique_ptr _function; ValueType _paddingValue; }; // // BroadcastUnaryOperationNode // + template - class BroadcastUnaryOperationNode : public BroadcastOperationNode> + class BroadcastUnaryOperationNode : public BroadcastOperationNode { public: using OperationType = UnaryOperationType; - using FunctionType = UnaryFunctionType; + using FunctionType = UnaryScalarFunction; /// Default constructor. 
BroadcastUnaryOperationNode(); @@ -166,16 +148,15 @@ namespace nodes protected: using BroadcastOperationNode::GetOutput; - using BroadcastOperationNode::GetFunction; - using BroadcastOperationNode::SetFunction; - ValueType ComputeOperation(const std::vector& args) const override; - emitters::IRLocalScalar CompileOperation(const std::vector& args) const override; + using BroadcastOperationNode::MakeKernel; + using KernelFunctionType = typename BroadcastOperationNode::KernelFunctionType; + + KernelFunctionType GetKernelFunction() const override; void WriteToArchive(utilities::Archiver& archiver) const override; void ReadFromArchive(utilities::Unarchiver& archiver) override; private: - void SetOperationFunction(); void Copy(model::ModelTransformer& transformer) const override; model::InputPort _input; @@ -186,11 +167,11 @@ namespace nodes // BroadcastBinaryOperationNode // template - class BroadcastBinaryOperationNode : public BroadcastOperationNode> + class BroadcastBinaryOperationNode : public BroadcastOperationNode { public: using OperationType = BinaryOperationType; - using FunctionType = BinaryFunctionType; + using FunctionType = BinaryScalarFunction; /// Default constructor. BroadcastBinaryOperationNode(); @@ -236,16 +217,15 @@ namespace nodes protected: using BroadcastOperationNode::GetOutput; - using BroadcastOperationNode::GetFunction; - using BroadcastOperationNode::SetFunction; - ValueType ComputeOperation(const std::vector& args) const override; - emitters::IRLocalScalar CompileOperation(const std::vector& args) const override; + using BroadcastOperationNode::MakeKernel; + using KernelFunctionType = typename BroadcastOperationNode::KernelFunctionType; + + KernelFunctionType GetKernelFunction() const override; void WriteToArchive(utilities::Archiver& archiver) const override; void ReadFromArchive(utilities::Unarchiver& archiver) override; private: - void SetOperationFunction(); void Copy(model::ModelTransformer& transformer) const override; model::InputPort _input1; @@ -257,11 +237,11 @@ namespace nodes // BroadcastTernaryOperationNode // template - class BroadcastTernaryOperationNode : public BroadcastOperationNode> + class BroadcastTernaryOperationNode : public BroadcastOperationNode { public: using OperationType = TernaryOperationType; - using FunctionType = TernaryFunctionType; + using FunctionType = TernaryScalarFunction; /// Default constructor. 
BroadcastTernaryOperationNode(); @@ -308,16 +288,15 @@ namespace nodes protected: using BroadcastOperationNode::GetOutput; - using BroadcastOperationNode::GetFunction; - using BroadcastOperationNode::SetFunction; - ValueType ComputeOperation(const std::vector& args) const override; - emitters::IRLocalScalar CompileOperation(const std::vector& args) const override; + using BroadcastOperationNode::MakeKernel; + using KernelFunctionType = typename BroadcastOperationNode::KernelFunctionType; + + KernelFunctionType GetKernelFunction() const override; void WriteToArchive(utilities::Archiver& archiver) const override; void ReadFromArchive(utilities::Unarchiver& archiver) override; private: - void SetOperationFunction(); void Copy(model::ModelTransformer& transformer) const override; model::InputPort _input1; @@ -345,7 +324,7 @@ namespace nodes BroadcastOperationNode::BroadcastOperationNode(const std::vector& inputPortRefs, const std::vector& inputs, ValueType paddingValue) : - CompilableNode(inputPortRefs, { &_output }), + CompilableCodeNode("BroadcastOperationNode", inputPortRefs, { &_output }), _output(this, ell::model::Node::defaultOutputPortName, ComputeBroadcastedLayout(inputs)), _paddingValue(paddingValue) { @@ -356,7 +335,7 @@ namespace nodes const std::vector& inputs, const model::PortMemoryLayout& outputLayout, ValueType paddingValue) : - CompilableNode(inputPortRefs, { &_output }), + CompilableCodeNode("BroadcastOperationNode", inputPortRefs, { &_output }), _output(this, ell::model::Node::defaultOutputPortName, outputLayout), _paddingValue(paddingValue) { @@ -389,215 +368,99 @@ namespace nodes } template - template - void BroadcastOperationNode::SetFunction(OpFunctionType&& function) - { - _function = std::make_unique(std::move(function)); - } - - template - const FunctionType& BroadcastOperationNode::GetFunction() const + void BroadcastOperationNode::Define(ell::value::FunctionDeclaration& fn) { - return *_function; - } - - // - // Arbitrary-depth nested loops are generated recursively. 
The EmitComputeDimensionLoop - // function emits `numDimensions` nested loops of the form: - // - // for(iz = 0; iz < sz; ++iz) - // { - // zOffset = (iz+offset[2]) * stride[2]; - // for(iy = 0; iy < sy; ++iy) - // { - // yOffset = zOffset + (iy+offset[1]) * stride[1]; - // for(ix = 0; ix < sx; ++ix) - // { - // offset = yOffset + (ix+offset[0]) * stride[0]; - // x = arr[offset]; - // val = f(x); - // output[offset] = val; - // } - // } - // } - // + using namespace value::loopnests; - template - std::vector BroadcastOperationNode::GetLastActiveInputDimensions() const - { - const auto numDimensions = NumDimensions(); - auto numInputs = NumInputPorts(); - std::vector lastActiveInputDimensions(numInputs, 0); - for (int i = 0; i < numInputs; ++i) - { - const auto& inputLayout = GetInput(i).GetMemoryLayout(); - const auto& activeSize = inputLayout.GetLogicalDimensionActiveSize(); - for (int j = numDimensions - 1; j >= 0; --j) + (void)fn.Define([this](const std::vector& args) { + if (static_cast(args.size()) != (this->NumInputPorts() + this->NumOutputPorts())) { - if (activeSize[j] != 1) - { - lastActiveInputDimensions[i] = j; - break; - } + throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState); } - } - return lastActiveInputDimensions; - } + auto outputLayout = this->GetOutputMemoryLayout(); + auto numDim = outputLayout.NumDimensions(); - template - void BroadcastOperationNode::ComputeDimensionLoop(int dimension, - const std::vector& prevInputDimensionOffsetsIn, - const std::vector& lastActiveInputDimensions, - const std::vector& inputValuesIn, - int prevOutputDimensionOffset, - std::vector& output) const - { - auto prevInputDimensionOffsets = prevInputDimensionOffsetsIn; - auto inputValues = inputValuesIn; - const auto& outputLayout = GetOutputMemoryLayout(); - const auto outputGlobalOffset = outputLayout.GetFirstEntryOffset(); - const auto& outputSize = outputLayout.GetLogicalDimensionActiveSize(); - const auto& outputIncrement = outputLayout.GetLogicalDimensionIncrement(); - - const auto numDimensions = outputLayout.NumDimensions(); - const auto numInputs = NumInputPorts(); - - for (int loopIndex = 0; loopIndex < outputSize[dimension]; ++loopIndex) - { - auto thisOutputDimensionOffset = prevOutputDimensionOffset + loopIndex * outputIncrement[dimension]; - std::vector thisInputDimensionOffsets(numInputs, 0); - for (int inputIndex = 0; inputIndex < numInputs; ++inputIndex) + // Create the indices and ranges for the loop nest + std::vector indices; + std::vector ranges; + for (int d = 0; d < numDim; ++d) { - const auto& input = GetInput(inputIndex); - const auto& inputLayout = input.GetMemoryLayout(); - const auto inputGlobalOffset = inputLayout.GetFirstEntryOffset(); - const auto& inputSize = inputLayout.GetLogicalDimensionActiveSize(); - const auto& inputIncrement = inputLayout.GetLogicalDimensionIncrement(); - - // Account for broadcasting dimensions by setting loopIndex to 0 if this is a broadcast dimension for this input - auto thisLoopIndex = inputSize[dimension] == 1 ? 
0 : loopIndex; - auto thisInputDimensionOffset = prevInputDimensionOffsets[inputIndex] + thisLoopIndex * inputIncrement[dimension]; - thisInputDimensionOffsets[inputIndex] = thisInputDimensionOffset; - if (dimension == lastActiveInputDimensions[inputIndex]) - { - inputValues[inputIndex] = input[inputGlobalOffset + thisInputDimensionOffset]; - } + auto name = "i_" + std::to_string(d); + Index i(name); + indices.emplace_back(i); + int size = static_cast(outputLayout.GetLogicalDimensionActiveSize()[d]); + ranges.push_back({ i, { 0, size } }); } - if (dimension < numDimensions - 1) - { - // Recursive call to emit nested loop - ComputeDimensionLoop(dimension + 1, thisInputDimensionOffsets, lastActiveInputDimensions, inputValues, thisOutputDimensionOffset, output); - } - else - { - // We're in the innermost loop --- compute the value - auto outputValue = ComputeOperation(inputValues); - output[outputGlobalOffset + thisOutputDimensionOffset] = outputValue; - } - } + LoopNest loop(ranges); + auto kernel = value::loopnests::Kernel("kernel") + .Inputs(args) + .Indices(indices) + .Define(GetKernelFunction()); + loop.AddKernel(kernel); + + CodeGenerator generator; + generator.Run(loop); + }); } template - void BroadcastOperationNode::CompileDimensionLoop(model::IRMapCompiler& compiler, - emitters::IRFunctionEmitter& function, - int dimension, - const std::vector& inputsIn, - const std::vector& prevInputDimensionOffsetsIn, - const std::vector& lastActiveInputDimensions, - const std::vector& inputValuesIn, - emitters::IRLocalScalar prevOutputDimensionOffset, - emitters::IRLocalArray& output) const - { - auto inputs = inputsIn; - auto prevInputDimensionOffsets = prevInputDimensionOffsetsIn; - auto inputValues = inputValuesIn; - - model::PortMemoryLayout outputLayout = GetOutputMemoryLayout(); - const auto outputGlobalOffset = static_cast(outputLayout.GetFirstEntryOffset()); - const auto& outputSize = outputLayout.GetLogicalDimensionActiveSize(); - const auto& outputIncrement = outputLayout.GetLogicalDimensionIncrement(); - - const auto numDimensions = outputLayout.NumDimensions(); - const auto numInputs = NumInputPorts(); - - function.For(0, outputSize[dimension], [&](emitters::IRFunctionEmitter& function, auto loopIndex) { - auto thisOutputDimensionOffset = prevOutputDimensionOffset + loopIndex * outputIncrement[dimension]; - std::vector thisInputDimensionOffsets(numInputs, function.LocalScalar(0)); - for (int inputIndex = 0; inputIndex < numInputs; ++inputIndex) + auto BroadcastOperationNode::MakeKernel(FunctionType f) const -> KernelFunctionType + { + // # args = # inputs + # outputs + // the rest are indices + return [this, f = std::move(f)](const std::vector& args) { + if (static_cast(args.size()) != NumDimensions() + NumInputPorts() + 1) { - const auto& inputPort = GetInput(inputIndex); - const auto& inputLayout = inputPort.GetMemoryLayout(); - const auto inputGlobalOffset = static_cast(inputLayout.GetFirstEntryOffset()); - const auto& inputSize = inputLayout.GetLogicalDimensionActiveSize(); - const auto& inputIncrement = inputLayout.GetLogicalDimensionIncrement(); - const auto& input = inputs[inputIndex]; - - // Account for broadcasting dimensions by setting loopIndex to 0 if this is a broadcast dimension for this input - auto thisLoopIndex = inputSize[dimension] == 1 ? 
function.LocalScalar(0) : loopIndex; - auto thisInputDimensionOffset = prevInputDimensionOffsets[inputIndex] + thisLoopIndex * inputIncrement[dimension]; - thisInputDimensionOffsets[inputIndex] = thisInputDimensionOffset; - if (dimension == lastActiveInputDimensions[inputIndex]) - { - inputValues[inputIndex] = input[thisInputDimensionOffset + inputGlobalOffset]; - } + throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState); } - if (dimension < numDimensions - 1) + const int numInputs = this->NumInputPorts(); + std::vector inputs; + auto it = args.begin(); + for (int i = 0; i < numInputs; ++i) { - // Recursive call to emit nested loop - CompileDimensionLoop(compiler, function, dimension + 1, inputs, thisInputDimensionOffsets, lastActiveInputDimensions, inputValues, thisOutputDimensionOffset, output); + inputs.push_back({ *it++ }); } - else + auto output = value::Array(*it++); + std::vector> indices(numInputs); + std::vector outputIndices; + int dimension = 0; + for (; it != args.end(); ++it) { - // We're in the innermost loop --- compute the value - auto outputValue = CompileOperation(inputValues); - output[outputGlobalOffset + thisOutputDimensionOffset] = outputValue; + for (int i = 0; i < numInputs; ++i) + { + indices[i].push_back(GetInputMemoryLayout(i).GetLogicalDimensionActiveSize(dimension) > 1 ? value::Scalar{ *it } : value::Scalar(0)); + } + outputIndices.push_back({ *it }); + ++dimension; } - }); - } - template - void BroadcastOperationNode::Compute() const - { - const auto& outputLayout = GetOutputMemoryLayout(); - const auto numInputs = NumInputPorts(); - - auto outputSize = outputLayout.GetMemorySize(); - auto output = std::vector(outputSize); - - const int startDimension = 0; - std::vector prevInputOffsets(numInputs, 0); - auto lastActiveInputDimensions = GetLastActiveInputDimensions(); - std::vector inputValues(numInputs); - const int startOffset = 0; - ComputeDimensionLoop(startDimension, prevInputOffsets, lastActiveInputDimensions, inputValues, startOffset, output); - - GetOutput().SetOutput(output); + output(outputIndices) = CallKernelFunction(f, inputs, indices); + }; } template - void BroadcastOperationNode::Compile(model::IRMapCompiler& compiler, emitters::IRFunctionEmitter& function) + value::Scalar BroadcastOperationNode::CallKernelFunction(FunctionType f, std::vector inputs, std::vector> indices) const { - const auto numInputs = NumInputPorts(); - - std::vector inputs; - for (int index = 0; index < numInputs; ++index) + // TODO: if FunctionType was a function that took a vector of inputs, then we could dispense with this `if constexpr` block + if constexpr(std::is_same_v) { - const auto& inputPort = GetInput(index); - auto inputVar = function.LocalArray(compiler.EnsurePortEmitted(inputPort)); - inputs.push_back(inputVar); + return f(inputs[0](indices[0])); + } + else if constexpr(std::is_same_v) + { + return f(inputs[0](indices[0]), inputs[1](indices[1])); + } + else if constexpr(std::is_same_v) + { + return f(inputs[0](indices[0]), inputs[1](indices[1]), inputs[2](indices[2])); + } + else + { + throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState); } - - auto output = function.LocalArray(compiler.EnsurePortEmitted(GetOutput(), this->GetOutputPadding())); - - const int startDimension = 0; - std::vector prevInputOffsets(numInputs, function.LocalScalar(0)); - auto lastActiveInputDimensions = GetLastActiveInputDimensions(); - std::vector inputValues(numInputs, function.LocalScalar()); - const emitters::IRLocalScalar 
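The MakeKernel adapter above encodes the broadcast rule: whenever an input's active size in a dimension is 1, that input is addressed with a constant 0 in place of the running loop index. The same rule over plain arrays, as a standalone sketch with hypothetical names (the real node routes through value::Array and the loop-nest code generator):

    #include <array>
    #include <cstddef>
    #include <functional>

    using Shape3 = std::array<size_t, 3>;

    inline size_t Offset(const Shape3& s, size_t i, size_t j, size_t k)
    {
        return (i * s[1] + j) * s[2] + k;
    }

    void BroadcastBinary(const float* a, const Shape3& aShape,
                         const float* b, const Shape3& bShape,
                         float* out, const Shape3& outShape,
                         const std::function<float(float, float)>& f)
    {
        for (size_t i = 0; i < outShape[0]; ++i)
            for (size_t j = 0; j < outShape[1]; ++j)
                for (size_t k = 0; k < outShape[2]; ++k)
                {
                    // Broadcast rule: size-1 dimensions pin their index to 0
                    const size_t ai = aShape[0] == 1 ? 0 : i, aj = aShape[1] == 1 ? 0 : j, ak = aShape[2] == 1 ? 0 : k;
                    const size_t bi = bShape[0] == 1 ? 0 : i, bj = bShape[1] == 1 ? 0 : j, bk = bShape[2] == 1 ? 0 : k;
                    out[Offset(outShape, i, j, k)] = f(a[Offset(aShape, ai, aj, ak)], b[Offset(bShape, bi, bj, bk)]);
                }
    }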
startOffset = function.LocalScalar(0); - CompileDimensionLoop(compiler, function, startDimension, inputs, prevInputOffsets, lastActiveInputDimensions, inputValues, startOffset, output); } template @@ -619,7 +482,7 @@ namespace nodes template void BroadcastOperationNode::WriteToArchive(utilities::Archiver& archiver) const { - model::CompilableNode::WriteToArchive(archiver); + model::CompilableCodeNode::WriteToArchive(archiver); auto outputLayout = GetOutputMemoryLayout(); archiver["outputLayout"] << outputLayout; archiver["padding"] << _paddingValue; @@ -628,7 +491,7 @@ namespace nodes template void BroadcastOperationNode::ReadFromArchive(utilities::Unarchiver& archiver) { - model::CompilableNode::ReadFromArchive(archiver); + model::CompilableCodeNode::ReadFromArchive(archiver); model::PortMemoryLayout outputLayout; archiver["outputLayout"] >> outputLayout; _output.SetMemoryLayout(outputLayout); @@ -641,7 +504,7 @@ namespace nodes template BroadcastUnaryOperationNode::BroadcastUnaryOperationNode() : BroadcastOperationNode({ &_input }, {}, static_cast(0)), - _input(this, {}, model::CompilableNode::defaultInputPortName), + _input(this, {}, model::CompilableCodeNode::defaultInputPortName), _operation(OperationType::none) { } @@ -649,39 +512,17 @@ namespace nodes template BroadcastUnaryOperationNode::BroadcastUnaryOperationNode(const model::OutputPort& input, OperationType operation, ValueType paddingValue) : BroadcastOperationNode({ &_input }, { &input }, paddingValue), - _input(this, input, model::CompilableNode::defaultInputPortName), + _input(this, input, model::CompilableCodeNode::defaultInputPortName), _operation(operation) { - SetOperationFunction(); } template BroadcastUnaryOperationNode::BroadcastUnaryOperationNode(const model::OutputPort& input, const model::PortMemoryLayout& outputLayout, OperationType operation, ValueType paddingValue) : BroadcastOperationNode({ &_input }, { &input }, outputLayout, paddingValue), - _input(this, input, model::CompilableNode::defaultInputPortName), + _input(this, input, model::CompilableCodeNode::defaultInputPortName), _operation(operation) { - SetOperationFunction(); - } - - template - ValueType BroadcastUnaryOperationNode::ComputeOperation(const std::vector& args) const - { - if (args.size() != 1) - { - throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState); - } - return GetFunction().Compute(args[0]); - } - - template - emitters::IRLocalScalar BroadcastUnaryOperationNode::CompileOperation(const std::vector& args) const - { - if (args.size() != 1) - { - throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState); - } - return GetFunction().Compile(args[0].function, args[0]); } template @@ -698,7 +539,7 @@ namespace nodes void BroadcastUnaryOperationNode::WriteToArchive(utilities::Archiver& archiver) const { BroadcastOperationNode::WriteToArchive(archiver); - archiver[model::CompilableNode::defaultInputPortName] << _input; + archiver[model::CompilableCodeNode::defaultInputPortName] << _input; archiver["operation"] << ToString(_operation); } @@ -706,11 +547,10 @@ namespace nodes void BroadcastUnaryOperationNode::ReadFromArchive(utilities::Unarchiver& archiver) { BroadcastOperationNode::ReadFromArchive(archiver); - archiver[model::CompilableNode::defaultInputPortName] >> _input; + archiver[model::CompilableCodeNode::defaultInputPortName] >> _input; std::string operation; archiver["operation"] >> operation; _operation = FromString(operation); - SetOperationFunction(); } // @@ -719,8 +559,8 @@ namespace nodes 
template BroadcastBinaryOperationNode::BroadcastBinaryOperationNode() : BroadcastOperationNode({ &_input1, &_input2 }, {}), - _input1(this, {}, model::CompilableNode::defaultInput1PortName), - _input2(this, {}, model::CompilableNode::defaultInput2PortName), + _input1(this, {}, model::CompilableCodeNode::defaultInput1PortName), + _input2(this, {}, model::CompilableCodeNode::defaultInput2PortName), _operation(OperationType::none) { } @@ -731,11 +571,10 @@ namespace nodes OperationType operation, ValueType paddingValue) : BroadcastOperationNode({ &_input1, &_input2 }, { &input1, &input2 }, paddingValue), - _input1(this, input1, model::CompilableNode::defaultInput1PortName), - _input2(this, input2, model::CompilableNode::defaultInput2PortName), + _input1(this, input1, model::CompilableCodeNode::defaultInput1PortName), + _input2(this, input2, model::CompilableCodeNode::defaultInput2PortName), _operation(operation) { - SetOperationFunction(); } template @@ -745,31 +584,10 @@ namespace nodes OperationType operation, ValueType paddingValue) : BroadcastOperationNode({ &_input1, &_input2 }, { &input1, &input2 }, outputLayout, paddingValue), - _input1(this, input1, model::CompilableNode::defaultInput1PortName), - _input2(this, input2, model::CompilableNode::defaultInput2PortName), + _input1(this, input1, model::CompilableCodeNode::defaultInput1PortName), + _input2(this, input2, model::CompilableCodeNode::defaultInput2PortName), _operation(operation) { - SetOperationFunction(); - } - - template - ValueType BroadcastBinaryOperationNode::ComputeOperation(const std::vector& args) const - { - if (args.size() != 2) - { - throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState); - } - return GetFunction().Compute(args[0], args[1]); - } - - template - emitters::IRLocalScalar BroadcastBinaryOperationNode::CompileOperation(const std::vector& args) const - { - if (args.size() != 2) - { - throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState); - } - return GetFunction().Compile(args[0].function, args[0], args[1]); } template @@ -788,8 +606,8 @@ namespace nodes void BroadcastBinaryOperationNode::WriteToArchive(utilities::Archiver& archiver) const { BroadcastOperationNode::WriteToArchive(archiver); - archiver[model::CompilableNode::defaultInput1PortName] << _input1; - archiver[model::CompilableNode::defaultInput1PortName] << _input2; + archiver[model::CompilableCodeNode::defaultInput1PortName] << _input1; + archiver[model::CompilableCodeNode::defaultInput2PortName] << _input2; archiver["operation"] << ToString(_operation); } @@ -797,12 +615,11 @@ namespace nodes void BroadcastBinaryOperationNode::ReadFromArchive(utilities::Unarchiver& archiver) { BroadcastOperationNode::ReadFromArchive(archiver); - archiver[model::CompilableNode::defaultInput1PortName] >> _input1; - archiver[model::CompilableNode::defaultInput1PortName] >> _input2; + archiver[model::CompilableCodeNode::defaultInput1PortName] >> _input1; + archiver[model::CompilableCodeNode::defaultInput2PortName] >> _input2; std::string operation; archiver["operation"] >> operation; _operation = FromString(operation); - SetOperationFunction(); } // @@ -811,9 +628,9 @@ namespace nodes template BroadcastTernaryOperationNode::BroadcastTernaryOperationNode() : BroadcastOperationNode({ &_input1, &_input2, &_input3 }, {}), - _input1(this, {}, model::CompilableNode::defaultInput1PortName), - _input2(this, {}, model::CompilableNode::defaultInput2PortName), - _input3(this, {}, model::CompilableNode::defaultInput3PortName), + 
_input1(this, {}, model::CompilableCodeNode::defaultInput1PortName), + _input2(this, {}, model::CompilableCodeNode::defaultInput2PortName), + _input3(this, {}, model::CompilableCodeNode::defaultInput3PortName), _operation(OperationType::none) { } @@ -825,12 +642,11 @@ namespace nodes OperationType operation, ValueType paddingValue) : BroadcastOperationNode({ &_input1, &_input2, &_input3 }, { &input1, &input2, &input3 }, paddingValue), - _input1(this, input1, model::CompilableNode::defaultInput1PortName), - _input2(this, input2, model::CompilableNode::defaultInput2PortName), - _input3(this, input3, model::CompilableNode::defaultInput3PortName), + _input1(this, input1, model::CompilableCodeNode::defaultInput1PortName), + _input2(this, input2, model::CompilableCodeNode::defaultInput2PortName), + _input3(this, input3, model::CompilableCodeNode::defaultInput3PortName), _operation(operation) { - SetOperationFunction(); } template @@ -841,32 +657,11 @@ namespace nodes OperationType operation, ValueType paddingValue) : BroadcastOperationNode({ &_input1, &_input2, &_input3 }, { &input1, &input2, &input3 }, outputLayout, paddingValue), - _input1(this, input1, model::CompilableNode::defaultInput1PortName), - _input2(this, input2, model::CompilableNode::defaultInput2PortName), - _input3(this, input3, model::CompilableNode::defaultInput3PortName), + _input1(this, input1, model::CompilableCodeNode::defaultInput1PortName), + _input2(this, input2, model::CompilableCodeNode::defaultInput2PortName), + _input3(this, input3, model::CompilableCodeNode::defaultInput3PortName), _operation(operation) { - SetOperationFunction(); - } - - template - ValueType BroadcastTernaryOperationNode::ComputeOperation(const std::vector& args) const - { - if (args.size() != 3) - { - throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState); - } - return GetFunction().Compute(args[0], args[1], args[2]); - } - - template - emitters::IRLocalScalar BroadcastTernaryOperationNode::CompileOperation(const std::vector& args) const - { - if (args.size() != 3) - { - throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState); - } - return GetFunction().Compile(args[0].function, args[0], args[1], args[2]); } template @@ -887,9 +682,9 @@ namespace nodes void BroadcastTernaryOperationNode::WriteToArchive(utilities::Archiver& archiver) const { BroadcastOperationNode::WriteToArchive(archiver); - archiver[model::CompilableNode::defaultInput1PortName] << _input1; - archiver[model::CompilableNode::defaultInput2PortName] << _input2; - archiver[model::CompilableNode::defaultInput3PortName] << _input3; + archiver[model::CompilableCodeNode::defaultInput1PortName] << _input1; + archiver[model::CompilableCodeNode::defaultInput2PortName] << _input2; + archiver[model::CompilableCodeNode::defaultInput3PortName] << _input3; archiver["operation"] << ToString(_operation); } @@ -897,45 +692,44 @@ namespace nodes void BroadcastTernaryOperationNode::ReadFromArchive(utilities::Unarchiver& archiver) { BroadcastOperationNode::ReadFromArchive(archiver); - archiver[model::CompilableNode::defaultInput1PortName] >> _input1; - archiver[model::CompilableNode::defaultInput2PortName] >> _input2; - archiver[model::CompilableNode::defaultInput3PortName] >> _input3; + archiver[model::CompilableCodeNode::defaultInput1PortName] >> _input1; + archiver[model::CompilableCodeNode::defaultInput2PortName] >> _input2; + archiver[model::CompilableCodeNode::defaultInput3PortName] >> _input3; std::string operation; archiver["operation"] >> 
operation; _operation = FromString(operation); - SetOperationFunction(); } template - void BroadcastUnaryOperationNode::SetOperationFunction() + auto BroadcastUnaryOperationNode::GetKernelFunction() const -> KernelFunctionType { switch (_operation) { case UnaryOperationType::abs: - SetFunction(AbsFunction()); + return MakeKernel(value::Abs); break; case UnaryOperationType::exp: - SetFunction(ExpFunction()); + return MakeKernel(value::Exp); break; case UnaryOperationType::log: - SetFunction(LogFunction()); + return MakeKernel(value::Log); break; case UnaryOperationType::sqrt: - SetFunction(SqrtFunction()); + return MakeKernel(value::Sqrt); break; case UnaryOperationType::logicalNot: throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "Not implemented"); case UnaryOperationType::tanh: - SetFunction(TanhFunction()); + return MakeKernel(value::Tanh); break; case UnaryOperationType::square: - SetFunction(SquareFunction()); + return MakeKernel(value::Square); break; case UnaryOperationType::sin: - SetFunction(SinFunction()); + return MakeKernel(value::Sin); break; case UnaryOperationType::cos: - SetFunction(CosFunction()); + return MakeKernel(value::Cos); break; default: throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "Illegal operation"); @@ -943,21 +737,24 @@ namespace nodes } template - void BroadcastBinaryOperationNode::SetOperationFunction() + auto BroadcastBinaryOperationNode::GetKernelFunction() const -> KernelFunctionType { switch (_operation) { case BinaryOperationType::add: - SetFunction(AddFunction()); + return MakeKernel(value::Add); break; case BinaryOperationType::subtract: - SetFunction(SubtractFunction()); + return MakeKernel(value::Subtract); break; case BinaryOperationType::multiply: - SetFunction(MultiplyFunction()); + return MakeKernel(value::Multiply); break; case BinaryOperationType::divide: - SetFunction(DivideFunction()); + return MakeKernel(value::Divide); + break; + case BinaryOperationType::modulo: + return MakeKernel(value::Modulo); break; case BinaryOperationType::logicalAnd: throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "Not implemented"); @@ -971,12 +768,12 @@ namespace nodes } template - void BroadcastTernaryOperationNode::SetOperationFunction() + auto BroadcastTernaryOperationNode::GetKernelFunction() const -> KernelFunctionType { switch (_operation) { case TernaryOperationType::fma: - SetFunction(FMAFunction()); + return MakeKernel(value::FusedMultiplyAdd); break; default: throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "Illegal operation"); diff --git a/libraries/nodes/include/IRNode.h b/libraries/nodes/include/IRNode.h index 81820b637..d94f61ef7 100644 --- a/libraries/nodes/include/IRNode.h +++ b/libraries/nodes/include/IRNode.h @@ -41,8 +41,6 @@ namespace nodes class IRNode : public model::CompilableNode { public: - IRNode() = default; - IRNode(const IRNode&) = delete; IRNode(IRNode&&) = delete; diff --git a/libraries/nodes/include/MatrixMatrixMultiplyCodeNode.h b/libraries/nodes/include/MatrixMatrixMultiplyCodeNode.h new file mode 100644 index 000000000..9b286b63c --- /dev/null +++ b/libraries/nodes/include/MatrixMatrixMultiplyCodeNode.h @@ -0,0 +1,271 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: MatrixMatrixMultiplyCodeNode.h (nodes) +// Authors: Mason Remy, Denny Sun +// 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Current gaps: +// Doesn't support transposed matrices +// Doesn't support alpha and beta values + +#pragma once + +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace ell +{ +namespace nodes +{ + /// A node that multiplies two matrices. + template + class MatrixMatrixMultiplyCodeNode : public model::CompilableCodeNode + { + public: + /// @name Input and Output Ports + /// @{ + const model::InputPort& input1 = _input1; + const model::InputPort& input2 = _input2; + const model::OutputPort& output = _output; + /// @} + + /// Default Constructor + MatrixMatrixMultiplyCodeNode(); + + /// Constructor. + /// + /// The left-hand input of the matrix multiplication, a row-major matrix of size m x k. + /// The right-hand input of the matrix multiplication, a row-major matrix of size k x n. + /// Which implementation of matrix-matrix multiplication to use + MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, const model::OutputPort& input2, const MatrixMatrixMultiplyImplementation& gemmImpl = MatrixMatrixMultiplyImplementation::DEFAULT); + + /// Constructor. + /// + /// The left-hand input of the matrix multiplication, a row-major matrix of size m x k. + /// The right-hand input of the matrix multiplication, a row-major matrix of size k x n. + /// The panel size to use in the M dimension (rows of A, C) + /// The panel size to use in the N dimension (columns of B, C) + /// The panel size to use in the K dimension (columns of A, rows of B) + /// The kernel size to use in the M dimension (rows of A, C). + /// The kernel size to use in the N dimension (columns of B, C). + /// The kernel size to use in the K dimension (columns of A, rows of B). + /// Which implementation of matrix-matrix multiplication to use + MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, const model::OutputPort& input2, int panelM, int panelN, int panelK, int kernelM, int kernelN, int kernelK, const MatrixMatrixMultiplyImplementation& gemmImpl = MatrixMatrixMultiplyImplementation::DEFAULT); + + /// Constructor. + /// + /// The left-hand input of the matrix multiplication, a row-major matrix of size m x k. + /// The right-hand input of the matrix multiplication, a row-major matrix of size k x n. + /// The output memory layout to use. + /// Which implementation of matrix-matrix multiplication to use + MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, const model::OutputPort& input2, const model::PortMemoryLayout& outputMemoryLayout, const MatrixMatrixMultiplyImplementation& gemmImpl = MatrixMatrixMultiplyImplementation::DEFAULT); + + /// Constructor. + /// + /// The left-hand input of the matrix multiplication, a row-major matrix of size m x k. + /// The right-hand input of the matrix multiplication, a row-major matrix of size k x n. + /// The output memory layout to use. + /// The panel size to use in the M dimension (rows of A, C) + /// The panel size to use in the N dimension (columns of B, C) + /// The panel size to use in the K dimension (columns of A, rows of B) + /// The kernel size to use in the M dimension (rows of A, C). + /// The kernel size to use in the N dimension (columns of B, C). + /// The kernel size to use in the K dimension (columns of A, rows of B). 
+ /// Which implementation of matrix-matrix multiplication to use + MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, const model::OutputPort& input2, const model::PortMemoryLayout& outputMemoryLayout, int panelM, int panelN, int panelK, int kernelM, int kernelN, int kernelK, const MatrixMatrixMultiplyImplementation& gemmImpl = MatrixMatrixMultiplyImplementation::DEFAULT); + + /// Constructor. + /// + /// The left-hand input of the matrix multiplication, a row-major matrix of size m x k. + /// The number of rows in the left hand input matrix and in the output matrix. + /// The number of columns in the right hand input matrix and in the output matrix. + /// The number of columns in the left hand input matrix and the number of rows in the right hand input matrix. + /// The number of elements between successive elements in a single column in the left hand input matrix. + /// The right-hand input of the matrix multiplication, a row-major matrix of size k x n. + /// The number of elements between successive elements in a single column in the right hand input matrix. + /// The number of elements between successive elements in a single column in the output matrix. + /// Which implementation of matrix-matrix multiplication to use + MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, int m, int n, int k, int matrix1Stride, const model::OutputPort& input2, int matrix2Stride, int outputMatrixStride, const MatrixMatrixMultiplyImplementation& gemmImpl = MatrixMatrixMultiplyImplementation::DEFAULT); + + /// Constructor. + /// + /// The left-hand input of the matrix multiplication, a row-major matrix of size m x k. + /// The number of rows in the left hand input matrix and in the output matrix. + /// The number of columns in the right hand input matrix and in the output matrix. + /// The number of columns in the left hand input matrix and the number of rows in the right hand input matrix. + /// The number of elements between successive elements in a single column in the left hand input matrix. + /// The right-hand input of the matrix multiplication, a row-major matrix of size k x n. + /// The number of elements between successive elements in a single column in the right hand input matrix. + /// The number of elements between successive elements in a single column in the output matrix. + /// The panel size to use in the M dimension (rows of A, C) + /// The panel size to use in the N dimension (columns of B, C) + /// The panel size to use in the K dimension (columns of A, rows of B) + /// The kernel size to use in the M dimension (rows of A, C). + /// The kernel size to use in the N dimension (columns of B, C). + /// The kernel size to use in the K dimension (columns of A, rows of B). + /// Which implementation of matrix-matrix multiplication to use + MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, int m, int n, int k, int matrix1Stride, const model::OutputPort& input2, int matrix2Stride, int outputMatrixStride, int panelM, int panelN, int panelK, int kernelM, int kernelN, int kernelK, const MatrixMatrixMultiplyImplementation& gemmImpl = MatrixMatrixMultiplyImplementation::DEFAULT); + + /// Constructor. + /// + /// The left-hand input of the matrix multiplication, a row-major matrix of size m x k. + /// The number of rows in the left hand input matrix and in the output matrix. + /// The number of columns in the right hand input matrix and in the output matrix. 
+ /// The number of columns in the left hand input matrix and the number of rows in the right hand input matrix. + /// The number of elements between successive elements in a single column in the left hand input matrix. + /// If true, transpose the left-hand input matrix. + /// The right-hand input of the matrix multiplication, a row-major matrix of size k x n. + /// The number of elements between successive elements in a single column in the right hand input matrix. + /// If true, transpose the right-hand input matrix. + /// The number of elements between successive elements in a single column in the output matrix. + /// Which implementation of matrix-matrix multiplication to use + MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, int m, int n, int k, int matrix1Stride, bool transpose1, const model::OutputPort& input2, int matrix2Stride, bool transpose2, int outputMatrixStride, const MatrixMatrixMultiplyImplementation& gemmImpl = MatrixMatrixMultiplyImplementation::DEFAULT); + + /// Constructor. + /// + /// The left-hand input of the matrix multiplication, a row-major matrix of size m x k. + /// The number of rows in the left hand input matrix and in the output matrix. + /// The number of columns in the right hand input matrix and in the output matrix. + /// The number of columns in the left hand input matrix and the number of rows in the right hand input matrix. + /// The number of elements between successive elements in a single column in the left hand input matrix. + /// If true, transpose the left-hand input matrix. + /// The right-hand input of the matrix multiplication, a row-major matrix of size k x n. + /// The number of elements between successive elements in a single column in the right hand input matrix. + /// If true, transpose the right-hand input matrix. + /// The number of elements between successive elements in a single column in the output matrix. + /// The panel size to use in the M dimension (rows of A, C) + /// The panel size to use in the N dimension (columns of B, C) + /// The panel size to use in the K dimension (columns of A, rows of B) + /// The kernel size to use in the M dimension (rows of A, C). + /// The kernel size to use in the N dimension (columns of B, C). + /// The kernel size to use in the K dimension (columns of A, rows of B). + /// Which implementation of matrix-matrix multiplication to use + MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, int m, int n, int k, int matrix1Stride, bool transpose1, const model::OutputPort& input2, int matrix2Stride, bool transpose2, int outputMatrixStride, int panelM, int panelN, int panelK, int kernelM, int kernelN, int kernelK, const MatrixMatrixMultiplyImplementation& gemmImpl = MatrixMatrixMultiplyImplementation::DEFAULT); + + /// Constructor. + /// + /// The left-hand input of the matrix multiplication, a row-major matrix of size m x k. + /// The number of rows in the left hand input matrix and in the output matrix. + /// The number of columns in the right hand input matrix and in the output matrix. + /// The number of columns in the left hand input matrix and the number of rows in the right hand input matrix. + /// The number of elements between successive elements in a single column in the left hand input matrix. + /// If true, transpose the left-hand input matrix. + /// The right-hand input of the matrix multiplication, a row-major matrix of size k x n. + /// The number of elements between successive elements in a single column in the right hand input matrix. 
+ /// If true, transpose the right-hand input matrix. + /// The number of elements between successive elements in a single column in the output matrix. + /// If true, transpose the output matrix. + /// Which implementation of matrix-matrix multiplication to use + MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, int m, int n, int k, int matrix1Stride, bool transpose1, const model::OutputPort& input2, int matrix2Stride, bool transpose2, int outputMatrixStride, bool transposeOutput, const MatrixMatrixMultiplyImplementation& gemmImpl = MatrixMatrixMultiplyImplementation::DEFAULT); + + /// Constructor. + /// + /// The left-hand input of the matrix multiplication, a row-major matrix of size m x k. + /// The number of rows in the left hand input matrix and in the output matrix. + /// The number of columns in the right hand input matrix and in the output matrix. + /// The number of columns in the left hand input matrix and the number of rows in the right hand input matrix. + /// The number of elements between successive elements in a single column in the left hand input matrix. + /// If true, transpose the left-hand input matrix. + /// The right-hand input of the matrix multiplication, a row-major matrix of size k x n. + /// The number of elements between successive elements in a single column in the right hand input matrix. + /// If true, transpose the right-hand input matrix. + /// The number of elements between successive elements in a single column in the output matrix. + /// If true, transpose the output matrix. + /// The panel size to use in the M dimension (rows of A, C) + /// The panel size to use in the N dimension (columns of B, C) + /// The panel size to use in the K dimension (columns of A, rows of B) + /// The kernel size to use in the M dimension (rows of A, C). + /// The kernel size to use in the N dimension (columns of B, C). + /// The kernel size to use in the K dimension (columns of A, rows of B). + /// Which implementation of matrix-matrix multiplication to use + MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, int m, int n, int k, int matrix1Stride, bool transpose1, const model::OutputPort& input2, int matrix2Stride, bool transpose2, int outputMatrixStride, bool transposeOutput, int panelM, int panelN, int panelK, int kernelM, int kernelN, int kernelK, const MatrixMatrixMultiplyImplementation& gemmImpl = MatrixMatrixMultiplyImplementation::DEFAULT); + + /// Gets the name of this type (for serialization). + /// + /// The name of this type. + static std::string GetTypeName() { return utilities::GetCompositeTypeName("MatrixMatrixMultiplyCodeNode"); } + + /// Gets the name of this type (for serialization). + /// + /// The name of this type. 
+ std::string GetRuntimeTypeName() const override { return GetTypeName(); } + + protected: + void Define(value::FunctionDeclaration& fn) override; + utilities::ArchiveVersion GetArchiveVersion() const override; + bool CanReadArchiveVersion(const utilities::ArchiveVersion& version) const override; + void WriteToArchive(utilities::Archiver& archiver) const override; + void ReadFromArchive(utilities::Unarchiver& archiver) override; + bool HasState() const override { return true; } // stored state: m, n, k, lda, ldb, ldc, transpose + + private: + void Copy(model::ModelTransformer& transformer) const override; + + void ZeroMatrix(value::Matrix matrix) const; + + void ForLoopGEMM(const value::Matrix matA, const value::Matrix matB, value::Matrix matC); + void Gemm(const value::Matrix matA, const value::Matrix matB, value::Matrix matC); + void GemmFn(const value::Matrix matA, const value::Matrix matB, value::Matrix matC, int thread_num = 0); + void ParallelizeGemmCol(const value::Matrix matA, const value::Matrix matB, value::Matrix matC, int numThreads = 2); + void ParallelizeGemmRow(const value::Matrix matA, const value::Matrix matB, value::Matrix matC, int numThreads = 2); + void ELLCodeGEMM(const value::Matrix matA, const value::Matrix matB, value::Matrix matC); + + // Inputs + model::InputPort _input1; + model::InputPort _input2; + + // Output + model::OutputPort _output; + + // Matrix dimensions + // Matrix 1 is MxK, Matrix 2 is KxN, Output is MxN + int _m = 0, _n = 0, _k = 0; + int _lda = 0, _ldb = 0, _ldc = 0; + bool _transpose1 = false, _transpose2 = false, _transposeOutput = false; + + // Implementation-controlling members + int _panelM; + int _panelN; + int _panelK; + int _kernelM; + int _kernelN; + int _kernelK; + MatrixMatrixMultiplyImplementation _impl; + + static const int _defaultPanelM = 64; + static const int _defaultPanelN = 64; + static const int _defaultPanelK = 64; + static const int _defaultKernelM = 4; + static const int _defaultKernelN = 4; + static const int _defaultKernelK = 4; + }; + + // + // Explicit instantiation declarations + // + extern template class MatrixMatrixMultiplyCodeNode<float>; + extern template class MatrixMatrixMultiplyCodeNode<double>; +} // namespace nodes +} // namespace ell diff --git a/libraries/nodes/include/MatrixMatrixMultiplyImplementation.h b/libraries/nodes/include/MatrixMatrixMultiplyImplementation.h new file mode 100644 index 000000000..655433e7d --- /dev/null +++ b/libraries/nodes/include/MatrixMatrixMultiplyImplementation.h @@ -0,0 +1,23 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// + // + // Project: Embedded Learning Library (ELL) + // File: MatrixMatrixMultiplyImplementation.h (nodes) + // Authors: Mason Remy, Denny Sun + // +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +namespace ell +{ +namespace nodes +{ + enum class MatrixMatrixMultiplyImplementation : int + { + SimpleForLoops = 0, + Mlas_Loopnest_Value, + LAST, + DEFAULT = Mlas_Loopnest_Value + }; +} // namespace nodes +} // namespace ell diff --git a/libraries/nodes/include/NodeOperations.h b/libraries/nodes/include/NodeOperations.h index 81a899dce..faa1779d5 100644 --- a/libraries/nodes/include/NodeOperations.h +++ b/libraries/nodes/include/NodeOperations.h @@ -59,7 +59,8 @@ namespace nodes divide, logicalAnd, logicalOr, - logicalXor + logicalXor, + modulo }; template diff --git a/libraries/nodes/include/ReorderDataCodeNode.h 
b/libraries/nodes/include/ReorderDataCodeNode.h new file mode 100644 index 000000000..37b3085e7 --- /dev/null +++ b/libraries/nodes/include/ReorderDataCodeNode.h @@ -0,0 +1,586 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// + // + // Project: Embedded Learning Library (ELL) + // File: ReorderDataCodeNode.h (nodes) + // Authors: Byron Changuion + // +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +namespace ell +{ +namespace nodes +{ + /// A node that takes data from its input and outputs it in a different order. + template + class ReorderDataCodeNode : public model::CompilableCodeNode + { + + public: + /// @name Input and Output Ports + /// @{ + const model::InputPort& input = _input; + const model::OutputPort& output = _output; + /// @} + + /// Default Constructor + ReorderDataCodeNode(); + + /// Constructor with no reordering + /// + /// The input to reorder. + /// The memory layout of the output. Data will be copied into the "active" area, and the rest will be zeroed out according to the padding value. + /// The value to fill the inactive area with. + ReorderDataCodeNode(const model::OutputPort& input, const model::PortMemoryLayout& outputMemoryLayout, ValueType paddingValue = 0); + + /// Constructor with no reordering + /// + /// The input to reorder. + /// The memory layout of the input. Only data in the "active" area will be copied. + /// The memory layout of the output. Data will be copied into the "active" area, and the rest will be zeroed out according to the padding value. + /// The value to fill the inactive area with. + ReorderDataCodeNode(const model::OutputPort& input, const model::PortMemoryLayout& inputMemoryLayout, const model::PortMemoryLayout& outputMemoryLayout, ValueType paddingValue = 0); + + /// Constructor with reordering + /// + /// The input to reorder. + /// The permutation vector to apply to the dimensions when copying. Input dimension `i` will get copied to output dimension `order[i]`. If left empty, no reordering is done. + /// For instance, to reorder the normal interleaved image order into a planar order, the `order` parameter would be + /// set to {2, 0, 1} --- reordering {row, column, channel} to {channel, row, column} + ReorderDataCodeNode(const model::OutputPort& input, const model::DimensionOrder& order); + + /// Constructor with reordering + /// + /// The input to reorder. + /// The memory layout of the output. Data will be copied into the "active" area, and the rest will be zeroed out according to the padding value. + /// The permutation vector to apply to the dimensions when copying. Input dimension `i` will get copied to output dimension `order[i]`. If left empty, no reordering is done. + /// For instance, to reorder the normal interleaved image order into a planar order, the `order` parameter would be + /// set to {2, 0, 1} --- reordering {row, column, channel} to {channel, row, column} + /// The value to fill the inactive area with. + ReorderDataCodeNode(const model::OutputPort& input, const model::PortMemoryLayout& outputMemoryLayout, const model::DimensionOrder& order, ValueType paddingValue = 0); + + /// Constructor with reordering + /// + /// The input to reorder. + /// The memory layout of the input. Only data in the "active" area is guaranteed to be copied. 
+ /// The memory layout of the output. Data will be copied into the "active" area, and the rest will be zeroed out according to the padding value. + /// The permutation vector to apply to the dimensions when copying. Input dimension `i` will get copied to output dimension `order[i]`. If left empty, no reordering is done. + /// For instance, to reorder the normal interleaved image order into a planar order, the `order` parameter would be + /// set to {2, 0, 1} --- reordering {row, column, channel} to {channel, row, column} + /// The value to fill the inactive area with. + ReorderDataCodeNode(const model::OutputPort& input, const model::PortMemoryLayout& inputMemoryLayout, const model::PortMemoryLayout& outputMemoryLayout, const model::DimensionOrder& order, ValueType paddingValue = 0); + + /// Gets information about the input memory layout + const model::PortMemoryLayout& GetInputMemoryLayout() const { return _inputMemoryLayout; } + + /// Gets information about the output memory layout + model::PortMemoryLayout GetOutputMemoryLayout() const { return _outputMemoryLayout; } + + /// Returns padding value + /// + /// Padding value + ValueType GetPaddingValue() const { return _paddingValue; } + + /// Returns true if the node can accept input with this memory layout order, else false + /// + /// The memory layout order for all the input ports + /// If the node can accept the input memory layout order, true, else false + bool CanAcceptInputLayout(const utilities::DimensionOrder& order) const override + { + return GetInputMemoryLayout().GetLogicalDimensionOrder() == order; + } + + /// Gets the name of this type (for serialization). + /// + /// The name of this type. + static std::string GetTypeName() { return utilities::GetCompositeTypeName("ReorderDataCodeNode"); } + + protected: + void Define(ell::value::FunctionDeclaration& fn) override; + void WriteToArchive(utilities::Archiver& archiver) const override; + void ReadFromArchive(utilities::Unarchiver& archiver) override; + bool HasState() const override { return true; } // stored state: input/output memory layouts and padding value + std::string GetRuntimeTypeName() const override { return GetTypeName(); } + + private: + void Copy(model::ModelTransformer& transformer) const override; + + void reorder_kernel_optimized_columns(value::Tensor source, value::Tensor dest, value::Scalar i, value::Scalar j, value::Scalar k); + void reorder_kernel_optimized_channels(value::Tensor source, value::Tensor dest, value::Scalar i, value::Scalar j, value::Scalar k); + static void reorder_kernel_basic(value::Tensor source, value::Tensor dest, value::Scalar i, value::Scalar j, value::Scalar k); + + // Inputs + model::InputPort _input; + + // Output + model::OutputPort _output; + + // Memory Layouts + model::PortMemoryLayout _inputMemoryLayout; + model::PortMemoryLayout _outputMemoryLayout; + + ValueType _paddingValue; + + // This is used in the Define function as a workaround for passing in constant Scalar values + // to the kernel + int _kernel_size; + }; + + /// Convenience function for adding a node to a model. + /// + /// The input to reorder. + /// The memory layout of the output. Data will be copied into the "active" area, and the rest will be zeroed out according to the padding value. + /// The value to fill the inactive area with. + /// + /// The output of the new node. 
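+ /// A minimal usage sketch (a sketch only: `source` stands for a float output port assumed to already exist in the model, and the shapes are illustrative). Copy a 3 x 4 input into the active area of a layout that adds one element of padding on each side, zero-filling the border:
+ ///
+ /// model::PortMemoryLayout paddedLayout(model::MemoryShape{ 3, 4 }, model::MemoryShape{ 1, 1 }, model::DimensionOrder{ 0, 1 });
+ /// const auto& padded = ReorderDataWithCodeNode(source, paddedLayout, 0.0f);
+ ///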
+ template + const model::OutputPort& ReorderDataWithCodeNode(const model::OutputPort& input, const model::PortMemoryLayout& outputMemoryLayout, ValueType paddingValue = 0); + + /// Convenience function for adding a node to a model. + /// + /// The input to reorder. + /// The memory layout of the input. Only data in the "active" area will be copied. + /// The memory layout of the output. Data will be copied into the "active" area, and the rest will be zeroed out according to the padding value. + /// The value to fill the inactive area with. + /// + /// The output of the new node. + template + const model::OutputPort& ReorderDataWithCodeNode(const model::OutputPort& input, const model::PortMemoryLayout& inputMemoryLayout, const model::PortMemoryLayout& outputMemoryLayout, ValueType paddingValue = 0); + + /// Convenience function for adding a node to a model. + /// + /// The input to reorder. + /// The memory layout of the output. Data will be copied into the "active" area, and the rest will be zeroed out according to the padding value. + /// The permutation vector to apply to the dimensions when copying. Input dimension `i` will get copied to output dimension `order[i]`. If left empty, no reordering is done. + /// For instance, to reorder the normal interleaved image order into a planar order, the `order` parameter would be + /// set to {2, 0, 1} --- reordering {row, column, channel} to {channel, row, column} + /// The value to fill the inactive area with. + /// + /// The output of the new node. + template + const model::OutputPort& ReorderDataWithCodeNode(const model::OutputPort& input, const model::PortMemoryLayout& outputMemoryLayout, const model::DimensionOrder& order, ValueType paddingValue = 0); + + /// Convenience function for adding a node to a model. + /// + /// The input to reorder. + /// The memory layout of the input. Only data in the "active" area will be copied. + /// The memory layout of the output. Data will be copied into the "active" area, and the rest will be zeroed out according to the padding value. + /// The permutation vector to apply to the dimensions when copying. Input dimension `i` will get copied to output dimension `order[i]`. If left empty, no reordering is done. + /// For instance, to reorder the normal interleaved image order into a planar order, the `order` parameter would be + /// set to {2, 0, 1} --- reordering {row, column, channel} to {channel, row, column} + /// The value to fill the inactive area with. + /// + /// The output of the new node. + template + const model::OutputPort& ReorderDataWithCodeNode(const model::OutputPort& input, const model::PortMemoryLayout& inputMemoryLayout, const model::PortMemoryLayout& outputMemoryLayout, const model::DimensionOrder& order, ValueType paddingValue = 0); + + /// Convenience function for adding a node to a model. + /// + /// The input to reorder. + /// The permutation vector to apply to the dimensions when copying. Input dimension `i` will get copied to output dimension `order[i]`. If left empty, no reordering is done. + /// For instance, to reorder the normal interleaved image order into a planar order, the `order` parameter would be + /// set to {2, 0, 1} --- reordering {row, column, channel} to {channel, row, column} + /// + /// The output of the new node. 
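+ /// A minimal usage sketch (`image` stands for an output port, assumed to exist, carrying {row, column, channel} data):
+ ///
+ /// const auto& planar = ReorderDataWithCodeNode(image, model::DimensionOrder{ 2, 0, 1 });
+ ///
+ /// This is the interleaved-to-planar case described above: the new node's output takes the input's layout permuted into channel-major order.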
+ template + const model::OutputPort& ReorderDataWithCodeNode(const model::OutputPort& input, const model::DimensionOrder& order); + +} // namespace nodes +} // namespace ell + +#pragma region implementation + +namespace ell +{ +namespace nodes +{ + // + // ReorderDataCodeNode + // + template + ReorderDataCodeNode::ReorderDataCodeNode() : + CompilableCodeNode("ReorderDataCodeNode", { &_input }, { &_output }), + _input(this, {}, defaultInputPortName), + _output(this, defaultOutputPortName, 0), + _inputMemoryLayout(utilities::MemoryShape{}), + _outputMemoryLayout(_output.GetMemoryLayout()), + _paddingValue(0), + _kernel_size(1) + {} + + // + // Without reordering ("reshape" / slicing) + // + template + ReorderDataCodeNode::ReorderDataCodeNode(const model::OutputPort& input, + const model::PortMemoryLayout& outputMemoryLayout, + ValueType paddingValue) : + CompilableCodeNode("ReorderDataCodeNode", { &_input }, { &_output }), + _input(this, input, defaultInputPortName), + _output(this, defaultOutputPortName, outputMemoryLayout), + _inputMemoryLayout(_input.GetMemoryLayout()), + _outputMemoryLayout(_output.GetMemoryLayout()), + _paddingValue(paddingValue), + _kernel_size(1) + { + if (_inputMemoryLayout.NumDimensions() != outputMemoryLayout.NumDimensions()) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, + "Error: input and output layouts must have same dimension"); + } + } + + template + ReorderDataCodeNode::ReorderDataCodeNode(const model::OutputPort& input, + const model::PortMemoryLayout& inputMemoryLayout, + const model::PortMemoryLayout& outputMemoryLayout, + ValueType paddingValue) : + CompilableCodeNode("ReorderDataCodeNode", { &_input }, { &_output }), + _input(this, input, defaultInputPortName), + _output(this, defaultOutputPortName, outputMemoryLayout), + _inputMemoryLayout(inputMemoryLayout), + _outputMemoryLayout(_output.GetMemoryLayout()), + _paddingValue(paddingValue), + _kernel_size(1) + { + if (inputMemoryLayout.NumDimensions() != outputMemoryLayout.NumDimensions()) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, + "Error: input and output layouts must have same dimension"); + } + } + + // + // With reordering ("reshape" / slicing, followed by transpose / dimension reordering) + // + template + ReorderDataCodeNode::ReorderDataCodeNode(const model::OutputPort& input, + const model::DimensionOrder& order) : + CompilableCodeNode("ReorderDataCodeNode", { &_input }, { &_output }), + _input(this, input, defaultInputPortName), + _output(this, defaultOutputPortName, _input.GetMemoryLayout().ReorderedCopy(order)), + _inputMemoryLayout(_input.GetMemoryLayout()), + _outputMemoryLayout(_output.GetMemoryLayout()), + _paddingValue(0), + _kernel_size(1) + { + if (_inputMemoryLayout.NumDimensions() != order.NumDimensions()) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, + "Error: input and output layouts must have same dimension"); + } + } + + template + ReorderDataCodeNode::ReorderDataCodeNode(const model::OutputPort& input, + const model::PortMemoryLayout& outputMemoryLayout, + const model::DimensionOrder& order, + ValueType paddingValue) : + CompilableCodeNode("ReorderDataCodeNode", { &_input }, { &_output }), + _input(this, input, defaultInputPortName), + _output(this, defaultOutputPortName, outputMemoryLayout.ReorderedCopy(order)), + _inputMemoryLayout(_input.GetMemoryLayout()), + _outputMemoryLayout(_output.GetMemoryLayout()), + _paddingValue(paddingValue), + 
_kernel_size(1) + { + if (_inputMemoryLayout.NumDimensions() != outputMemoryLayout.NumDimensions()) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, + "Error: input and output layouts must have same dimension"); + } + } + + template + ReorderDataCodeNode::ReorderDataCodeNode(const model::OutputPort& input, + const model::PortMemoryLayout& inputMemoryLayout, + const model::PortMemoryLayout& outputMemoryLayout, + const model::DimensionOrder& order, + ValueType paddingValue) : + CompilableCodeNode("ReorderDataCodeNode", { &_input }, { &_output }), + _input(this, input, defaultInputPortName), + _output(this, defaultOutputPortName, outputMemoryLayout.ReorderedCopy(order)), + _inputMemoryLayout(inputMemoryLayout), + _outputMemoryLayout(_output.GetMemoryLayout()), + _paddingValue(paddingValue), + _kernel_size(1) + { + if (inputMemoryLayout.NumDimensions() != outputMemoryLayout.NumDimensions()) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, + "Error: input and output layouts must have same dimension"); + } + } + + // + // A reorder kernel that is optimized for when channels are the minor increment + // + template + void ReorderDataCodeNode::reorder_kernel_optimized_channels(value::Tensor source, value::Tensor dest, value::Scalar i, value::Scalar j, value::Scalar k) + { + value::Vector cache = value::MakeVector(_kernel_size); + for (int l = 0; l < _kernel_size; ++l) + { + cache(l) = source(i, j, k * _kernel_size + l); + } + + for (int l = 0; l < _kernel_size; ++l) + { + dest(i, j, k * _kernel_size + l) = cache(l); + } + } + + // + // A reorder kernel that is optimized for when columns are the minor increment + // + template + void ReorderDataCodeNode::reorder_kernel_optimized_columns(value::Tensor source, value::Tensor dest, value::Scalar i, value::Scalar j, value::Scalar k) + { + value::Vector cache = value::MakeVector(_kernel_size); + for (int l = 0; l < _kernel_size; ++l) + { + cache(l) = source(i, j * _kernel_size + l, k); + } + + for (int l = 0; l < _kernel_size; ++l) + { + dest(i, j * _kernel_size + l, k) = cache(l); + } + } + + // + // A basic, unoptimized reorder kernel + // + template + void ReorderDataCodeNode::reorder_kernel_basic(value::Tensor source, value::Tensor dest, value::Scalar i, value::Scalar j, value::Scalar k) + { + dest(i, j, k) = source(i, j, k); + } + + template + void ReorderDataCodeNode::Define(ell::value::FunctionDeclaration& fn) + { + (void)fn.Define([this](const value::Value value_input, value::Value output) { + namespace loopnests = ell::value::loopnests; + + auto input = value_input; + // Set the layout to use for the input view. + input.SetLayout(_inputMemoryLayout); + + // Check if this is a Tensor + if (input.GetLayout().NumDimensions() == 3) + { + auto data = value::Tensor(input); + auto result = value::Tensor(output); + + _kernel_size = 8; + if (output.GetLayout().GetLogicalDimensionOrder() == utilities::DimensionOrder({ 0, 1, 2 })) + { + if (result.Channels() % _kernel_size != 0) + { + _kernel_size = 4; + } + } + else if (output.GetLayout().GetLogicalDimensionOrder() == utilities::DimensionOrder({ 2, 0, 1 })) + { + if (result.Columns() % _kernel_size != 0) + { + _kernel_size = 4; + } + } + + // Check the order to see which kernel to use. Additionally, verify that an optimized kernel can run on this input, else fallback + // to the simple one. 
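+ // (Worked example of the selection above, for a {0, 1, 2}-ordered output: with 64 channels,
+ // _kernel_size stays 8 and the channel-optimized kernel is used; with 12 channels it drops to 4;
+ // with 3 channels neither 8 nor 4 divides evenly, so the basic one-element-at-a-time kernel runs.)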
+ if (output.GetLayout().GetLogicalDimensionOrder() == utilities::DimensionOrder({ 2, 0, 1 }) && result.Columns() % _kernel_size == 0) + { + // Declare the indexes + loopnests::IndexRange i("i", { 0, (int)(data.Rows()) }); + loopnests::IndexRange j("j", { 0, (int)(data.Columns() / _kernel_size) }); + loopnests::IndexRange k("k", { 0, (int)(data.Channels()) }); + + auto kernel = loopnests::Kernel("kernel") + .Inputs(input, output) + .Indices(i.GetIndex(), j.GetIndex(), k.GetIndex()) + .Define([this](value::Tensor source, value::Tensor dest, value::Scalar i, value::Scalar j, value::Scalar k) { + reorder_kernel_optimized_columns(source, dest, i, j, k); + }); + + loopnests::LoopNest loop(std::vector{ i, j, k }); + loop.AddKernel(kernel); + + loopnests::CodeGenerator generator; + generator.Run(loop); + } + else if (output.GetLayout().GetLogicalDimensionOrder() == utilities::DimensionOrder({ 0, 1, 2 }) && result.Channels() % _kernel_size == 0) + { + // Declare the indexes + loopnests::IndexRange i("i", { 0, (int)(data.Rows()) }); + loopnests::IndexRange j("j", { 0, (int)(data.Columns()) }); + loopnests::IndexRange k("k", { 0, (int)(data.Channels() / _kernel_size) }); + + auto kernel = loopnests::Kernel("kernel") + .Inputs(input, output) + .Indices(i.GetIndex(), j.GetIndex(), k.GetIndex()) + .Define([this](value::Tensor source, value::Tensor dest, value::Scalar i, value::Scalar j, value::Scalar k) { + reorder_kernel_optimized_channels(source, dest, i, j, k); + }); + + loopnests::LoopNest loop(std::vector{ i, j, k }); + loop.AddKernel(kernel); + + loopnests::CodeGenerator generator; + generator.Run(loop); + } + else + { + // Declare the indexes + loopnests::IndexRange i("i", { 0, (int)(data.Rows()) }); + loopnests::IndexRange j("j", { 0, (int)(data.Columns()) }); + loopnests::IndexRange k("k", { 0, (int)(data.Channels()) }); + + // This is the basic fallback kernel that can do one element at a time + auto kernel = loopnests::Kernel("kernel") + .Inputs(input, output) + .Indices(i.GetIndex(), j.GetIndex(), k.GetIndex()) + .Define(reorder_kernel_basic); + + loopnests::LoopNest loop(std::vector{ i, j, k }); + loop.AddKernel(kernel); + + loopnests::CodeGenerator generator; + generator.Run(loop); + } + } + else if (input.GetLayout().NumDimensions() == 2) + { + auto data = value::Matrix(input); + auto result = value::Matrix(output); + + value::Scalar v = value::Allocate(result.Type(), ell::utilities::ScalarLayout); + For(data, [&](value::Scalar row, value::Scalar column) { + result(row, column) = data(row, column); + }); + } + else + { + auto data = value::Vector(input); + auto result = value::Vector(output); + + For(data, [&](value::Scalar index) { + result[index] = data[index]; + }); + } + }); + } + + template + void ReorderDataCodeNode::WriteToArchive(utilities::Archiver& archiver) const + { + CompilableNode::WriteToArchive(archiver); + archiver[defaultInputPortName] << _input; + archiver["inputLayout"] << _inputMemoryLayout; + archiver["outputLayout"] << _outputMemoryLayout; + archiver["paddingValue"] << _paddingValue; + } + + template + void ReorderDataCodeNode::ReadFromArchive(utilities::Unarchiver& archiver) + { + CompilableNode::ReadFromArchive(archiver); + archiver[defaultInputPortName] >> _input; + archiver["inputLayout"] >> _inputMemoryLayout; + archiver["outputLayout"] >> _outputMemoryLayout; + archiver["paddingValue"] >> _paddingValue; + _output.SetMemoryLayout(_outputMemoryLayout); + } + + template + void ReorderDataCodeNode::Copy(model::ModelTransformer& transformer) const + { + const 
auto& newInputs = transformer.GetCorrespondingInputs(_input); + auto newNode = transformer.AddNode>(newInputs, _inputMemoryLayout, _outputMemoryLayout, _paddingValue); + transformer.MapNodeOutput(output, newNode->output); + } + + template + const model::OutputPort& ReorderDataWithCodeNode(const model::OutputPort& input, const model::PortMemoryLayout& outputMemoryLayout, ValueType paddingValue) + { + model::Model* model = input.GetNode()->GetModel(); + if (model == nullptr) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "Input not part of a model"); + } + auto node = model->AddNode>(input, outputMemoryLayout, paddingValue); + return node->output; + } + + template + const model::OutputPort& ReorderDataWithCodeNode(const model::OutputPort& input, const model::PortMemoryLayout& inputMemoryLayout, const model::PortMemoryLayout& outputMemoryLayout, ValueType paddingValue) + { + model::Model* model = input.GetNode()->GetModel(); + if (model == nullptr) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "Input not part of a model"); + } + auto node = model->AddNode>(input, inputMemoryLayout, outputMemoryLayout, paddingValue); + return node->output; + } + + template + const model::OutputPort& ReorderDataWithCodeNode(const model::OutputPort& input, const model::PortMemoryLayout& outputMemoryLayout, const model::DimensionOrder& order, ValueType paddingValue) + { + model::Model* model = input.GetNode()->GetModel(); + if (model == nullptr) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "Input not part of a model"); + } + auto node = model->AddNode>(input, outputMemoryLayout, order, paddingValue); + return node->output; + } + + template + const model::OutputPort& ReorderDataWithCodeNode(const model::OutputPort& input, const model::PortMemoryLayout& inputMemoryLayout, const model::PortMemoryLayout& outputMemoryLayout, const model::DimensionOrder& order, ValueType paddingValue) + { + model::Model* model = input.GetNode()->GetModel(); + if (model == nullptr) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "Input not part of a model"); + } + auto node = model->AddNode>(input, inputMemoryLayout, outputMemoryLayout, order, paddingValue); + return node->output; + } + + template + const model::OutputPort& ReorderDataWithCodeNode(const model::OutputPort& input, const model::DimensionOrder& order) + { + model::Model* model = input.GetNode()->GetModel(); + if (model == nullptr) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "Input not part of a model"); + } + auto node = model->AddNode>(input, order); + return node->output; + } + +} // namespace nodes +} // namespace ell + +#pragma endregion implementation diff --git a/libraries/nodes/include/SpatialConvolutionNode.h b/libraries/nodes/include/SpatialConvolutionNode.h new file mode 100644 index 000000000..63bdaf803 --- /dev/null +++ b/libraries/nodes/include/SpatialConvolutionNode.h @@ -0,0 +1,239 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: SpatialConvolutionNode.h (nodes) +// Authors: Byron Changuion +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include + 
+#include +#include + +namespace ell +{ +namespace nodes +{ + /// A node that performs the spatial convolution in a depthwise separable + /// convolutional model. By definition, this node requires: + /// - Number of input channels per weights filter to be 1 + /// - Number of filters must equal number of input channels + /// + template + class SpatialConvolutionNode : public model::CompilableCodeNode + { + + public: + using LayerType = predictors::neural::ConvolutionalLayer; + + /// @name Input and Output Ports + /// @{ + const model::InputPort& input = _input; + const model::OutputPort& output = _output; + /// @} + + /// Default Constructor + SpatialConvolutionNode(); + + /// Constructor from a layer. + /// + /// + /// The convolutional layer to wrap. + SpatialConvolutionNode(const model::OutputPort& input, const LayerType& layer, const model::PortMemoryLayout& outputMemoryLayout); + + /// Returns true if the node can accept input with this memory layout order, else false + /// + /// The memory layout order for all the input ports + /// If the node can accept the input memory layout order, true, else false + bool CanAcceptInputLayout(const utilities::DimensionOrder& order) const override + { + return true; + } + + /// Gets the name of this type (for serialization). + /// + /// The name of this type. + static std::string GetTypeName() { return utilities::GetCompositeTypeName("SpatialConvolutionNode"); } + + protected: + void Define(ell::value::FunctionDeclaration& fn) override; + void WriteToArchive(utilities::Archiver& archiver) const override; + void ReadFromArchive(utilities::Unarchiver& archiver) override; + bool HasState() const override { return true; } // stored state: convolutional parameters + std::string GetRuntimeTypeName() const override { return GetTypeName(); } + + private: + void Copy(model::ModelTransformer& transformer) const override; + + // Called with output i, j, k + void spatial_convolutional_kernel(value::Tensor output, value::Tensor input, value::Tensor weights, value::Scalar i, value::Scalar j, value::Scalar k); + + // Inputs + model::InputPort _input; + + // Output + model::OutputPort _output; + + // Convolutional layer + LayerType _layer; + }; + +} // namespace nodes +} // namespace ell + +#pragma region implementation + +namespace ell +{ +namespace nodes +{ + // + // SpatialConvolutionNode + // + template + SpatialConvolutionNode::SpatialConvolutionNode() : + CompilableCodeNode("SpatialConvolutionNode", { &_input }, { &_output }), + _input(this, {}, defaultInputPortName), + _output(this, defaultOutputPortName, 0) + {} + + // + // SpatialConvolutionNode + // + template + SpatialConvolutionNode::SpatialConvolutionNode(const model::OutputPort& input, + const LayerType& layer, + const model::PortMemoryLayout& outputMemoryLayout) : + CompilableCodeNode("SpatialConvolutionNode", { &_input }, { &_output }), + _input(this, input, defaultInputPortName), + _output(this, defaultOutputPortName, outputMemoryLayout), + _layer(layer) + { + const auto& weights = _layer.GetWeights(); + if (weights.NumChannels() != 1) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, + "Error: weights for Spatial Convolution must have single channel"); + } + if (_input.GetMemoryLayout().GetLogicalDimensionExtent(2) != _output.GetMemoryLayout().GetLogicalDimensionExtent(2)) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, + "Error: input and output number of channels must match for Spatial Convolution"); + } + } + + // + 
// A spatial convolution kernel + // + template + void SpatialConvolutionNode::spatial_convolutional_kernel(value::Tensor output, value::Tensor input, value::Tensor weights, value::Scalar row, value::Scalar column, value::Scalar channel) + { + const auto& parameters = _layer.GetConvolutionalParameters(); + const int receptiveFieldRows = (int)parameters.receptiveField; + const int receptiveFieldColumns = (int)parameters.receptiveField; + const int rowStride = (int)parameters.stride; + const int columnStride = (int)parameters.stride; + + value::Scalar temp = value::Allocate(output.GetValue().GetBaseType(), ell::utilities::ScalarLayout); + temp = static_cast(0.0); + + // Unroll the calculations for the receptive field size in row and column dimensions + for (int k_r = 0; k_r < receptiveFieldRows; ++k_r) + { + for (int k_c = 0; k_c < receptiveFieldColumns; ++k_c) + { + // Weight filters are stacked in the row dimension. For spatial convolutions, the weights channel index is always 0 + // since there is only one channel per filter. + temp += input(row * rowStride + k_r, column * columnStride + k_c, channel) * weights(channel * receptiveFieldRows + k_r, k_c, 0); + } + } + output(row, column, channel) = temp; + } + + template + void SpatialConvolutionNode::Define(ell::value::FunctionDeclaration& fn) + { + (void)fn.Define([this](const value::Tensor input_tensor, value::Tensor output) { + namespace loopnests = ell::value::loopnests; + + value::Value input_value(input_tensor.GetValue()); + input_value.SetLayout(utilities::MemoryLayout(input_value.GetLayout().GetExtent(), input_value.GetLayout().GetLogicalDimensionOrder())); + value::Tensor input(input_value); + + // Declare constants + const auto& w = _layer.GetWeights(); + std::vector data = w.ToArray(); + value::Tensor weights({ data, + utilities::MemoryLayout({ static_cast(w.NumRows()), static_cast(w.NumColumns()), static_cast(w.NumChannels()) }, + utilities::DimensionOrder(utilities::RowMajorTensorOrder)) }); + + // Declare the indexes + loopnests::IndexRange i("i", { 0, (int)(output.Rows()) }); + loopnests::IndexRange j("j", { 0, (int)(output.Columns()) }); + loopnests::IndexRange k("k", { 0, (int)(output.Channels()) }); + + auto kernel = loopnests::Kernel("kernel") + .Inputs(output.GetValue(), input.GetValue(), weights.GetValue()) + .Indices(i.GetIndex(), j.GetIndex(), k.GetIndex()) + .Define([this](value::Tensor output, value::Tensor input, value::Tensor weights, value::Scalar row, value::Scalar column, value::Scalar channel) { + spatial_convolutional_kernel(output, input, weights, row, column, channel); + }); + + loopnests::LoopNest loop(std::vector{ i, j, k }); + loop.AddKernel(kernel); + loop.SetLoopOrder({ k.GetIndex(), i.GetIndex(), j.GetIndex() }); + + loopnests::CodeGenerator generator; + generator.Run(loop); + }); + } + + template + void SpatialConvolutionNode::WriteToArchive(utilities::Archiver& archiver) const + { + Node::WriteToArchive(archiver); + archiver[defaultInputPortName] << _input; + archiver["outputLayout"] << _output.GetMemoryLayout(); + archiver["layer"] << _layer; + } + + template + void SpatialConvolutionNode::ReadFromArchive(utilities::Unarchiver& archiver) + { + Node::ReadFromArchive(archiver); + archiver[defaultInputPortName] >> _input; + model::PortMemoryLayout outputMemoryLayout; + archiver["outputLayout"] >> outputMemoryLayout; + _output.SetMemoryLayout(outputMemoryLayout); + archiver["layer"] >> _layer; + } + + template + void SpatialConvolutionNode::Copy(model::ModelTransformer& transformer) const + { + const 
auto& newInputs = transformer.GetCorrespondingInputs(_input); + auto newNode = transformer.AddNode>(newInputs, _layer, _output.GetMemoryLayout()); + transformer.MapNodeOutput(output, newNode->output); + } + +} // namespace nodes +} // namespace ell + +#pragma endregion implementation diff --git a/libraries/nodes/include/UnrolledConvolutionNode.h b/libraries/nodes/include/UnrolledConvolutionNode.h index 142c5a8c5..98dd91d66 100644 --- a/libraries/nodes/include/UnrolledConvolutionNode.h +++ b/libraries/nodes/include/UnrolledConvolutionNode.h @@ -106,6 +106,7 @@ namespace nodes void Copy(model::ModelTransformer& transformer) const override; MatrixType GetWeightsMatrix(const ConstTensorReferenceType& weightsTensor) const; + bool IsELLCodeTarget(model::ModelTransformer& transformer) const; // Input model::InputPort _input; diff --git a/libraries/nodes/src/BinaryConvolutionalLayerNode.cpp b/libraries/nodes/src/BinaryConvolutionalLayerNode.cpp index aa8e617a5..68fa1a8e2 100644 --- a/libraries/nodes/src/BinaryConvolutionalLayerNode.cpp +++ b/libraries/nodes/src/BinaryConvolutionalLayerNode.cpp @@ -8,7 +8,7 @@ #include "BinaryConvolutionalLayerNode.h" #include "ConstantNode.h" -#include "ReorderDataNode.h" +#include "ReorderDataCodeNode.h" #include #include @@ -240,7 +240,7 @@ namespace nodes // Output of xnor is in (f x h x w) order, need to transpose to the canonical (h x w x f) order model::PortMemoryLayout outputShape(model::MemoryShape{ numFilters, outputImageHeight, outputImageWidth }, model::DimensionOrder{ 2, 0, 1 }); // Note: memory layout constructor takes the sizes in physical dimension order model::PortMemoryLayout transposedOutputShape(model::MemoryShape{ outputImageHeight, outputImageWidth, numFilters }, model::MemoryShape{ outputDataPadding, outputDataPadding, 0 }, model::DimensionOrder{ 0, 1, 2 }); - const auto& reorderedOutput = ReorderData(xnorOutput, outputShape, transposedOutputShape); + const auto& reorderedOutput = ReorderDataWithCodeNode(xnorOutput, outputShape, transposedOutputShape); transformer.MapNodeOutput(this->output, reorderedOutput); return true; } diff --git a/libraries/nodes/src/ConvolutionalLayerNode.cpp b/libraries/nodes/src/ConvolutionalLayerNode.cpp index d992ede9a..26d288d62 100644 --- a/libraries/nodes/src/ConvolutionalLayerNode.cpp +++ b/libraries/nodes/src/ConvolutionalLayerNode.cpp @@ -8,8 +8,9 @@ #include "ConvolutionalLayerNode.h" #include "DiagonalConvolutionNode.h" -#include "ReorderDataNode.h" +#include "ReorderDataCodeNode.h" #include "SimpleConvolutionNode.h" +#include "SpatialConvolutionNode.h" #include "UnrolledConvolutionNode.h" #include "WinogradConvolutionNode.h" @@ -48,7 +49,7 @@ namespace nodes auto convInputLayout = originalInputLayout.ReorderedCopy({ shouldReorderToChannelMajor ? utilities::ChannelMajorTensorOrder : utilities::RowMajorTensorOrder }); auto convOutputLayout = originalOutputLayout.ReorderedCopy({ shouldReorderToChannelMajor ? 
utilities::ChannelMajorTensorOrder : utilities::RowMajorTensorOrder }); - const auto& preConvReorder = ReorderData(*newInput, originalInputLayout, convInputLayout); + const auto& preConvReorder = ReorderDataWithCodeNode(*newInput, originalInputLayout, convInputLayout); newInput = &preConvReorder; const model::OutputPort* convOutput; @@ -57,8 +58,16 @@ { case ConvolutionMethod::simple: { - auto convNode = transformer.AddNode>(*newInput, convInputLayout, convOutputLayout, weights, convParams.stride); - convOutput = &convNode->output; + if (isDepthwiseSeparable) + { + auto convNode = transformer.AddNode>(*newInput, this->GetLayer(), convOutputLayout); + convOutput = &convNode->output; + } + else + { + auto convNode = transformer.AddNode>(*newInput, convInputLayout, convOutputLayout, weights, convParams.stride); + convOutput = &convNode->output; + } } break; case ConvolutionMethod::unrolled: @@ -86,7 +95,7 @@ // Copy metadata const_cast(convOutput->GetNode())->GetMetadata() = this->GetMetadata(); - const auto& postConvReorder = ReorderData(*convOutput, originalOutputLayout); + const auto& postConvReorder = ReorderDataWithCodeNode(*convOutput, originalOutputLayout); transformer.MapNodeOutput(this->output, postConvReorder); return true; diff --git a/libraries/nodes/src/MatrixMatrixMultiplyCodeNode.cpp b/libraries/nodes/src/MatrixMatrixMultiplyCodeNode.cpp new file mode 100644 index 000000000..bfe311f55 --- /dev/null +++ b/libraries/nodes/src/MatrixMatrixMultiplyCodeNode.cpp @@ -0,0 +1,549 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// + // + // Project: Embedded Learning Library (ELL) + // File: MatrixMatrixMultiplyCodeNode.cpp (nodes) + // Authors: Mason Remy, Denny Sun + // +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +//using namespace ell::utilities; +using namespace ell::value; + +namespace ell +{ +namespace nodes +{ + template + MatrixMatrixMultiplyCodeNode::MatrixMatrixMultiplyCodeNode() : + CompilableCodeNode("MatrixMatrixMultiplyCodeNode", { &_input1, &_input2 }, { &_output }), + _input1(this, {}, defaultInput1PortName), + _input2(this, {}, defaultInput2PortName), + _output(this, defaultOutputPortName, 0), + _impl(MatrixMatrixMultiplyImplementation::DEFAULT) + { + } + + template + MatrixMatrixMultiplyCodeNode::MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, const model::OutputPort& input2, const MatrixMatrixMultiplyImplementation& gemmImpl) : + MatrixMatrixMultiplyCodeNode(input1, input2, _defaultPanelM, _defaultPanelN, _defaultPanelK, _defaultKernelM, _defaultKernelN, _defaultKernelK, gemmImpl) + { + } + + template + MatrixMatrixMultiplyCodeNode::MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, const model::OutputPort& input2, int panelM, int panelN, int panelK, int kernelM, int kernelN, int kernelK, const MatrixMatrixMultiplyImplementation& gemmImpl) : + MatrixMatrixMultiplyCodeNode(input1, input2, model::PortMemoryLayout({ input1.GetMemoryLayout().GetActiveSize(0), input2.GetMemoryLayout().GetActiveSize(1) }), panelM, panelN, panelK, kernelM, kernelN, kernelK, gemmImpl) + { + } + + template + MatrixMatrixMultiplyCodeNode::MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, const model::OutputPort& input2, const model::PortMemoryLayout& 
outputLayout, const MatrixMatrixMultiplyImplementation& gemmImpl) : + MatrixMatrixMultiplyCodeNode(input1, input2, outputLayout, _defaultPanelM, _defaultPanelN, _defaultPanelK, _defaultKernelM, _defaultKernelN, _defaultKernelK, gemmImpl) + { + } + + template + MatrixMatrixMultiplyCodeNode::MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, const model::OutputPort& input2, const model::PortMemoryLayout& outputLayout, int panelM, int panelN, int panelK, int kernelM, int kernelN, int kernelK, const MatrixMatrixMultiplyImplementation& gemmImpl) : + MatrixMatrixMultiplyCodeNode::MatrixMatrixMultiplyCodeNode(input1, + input1.GetMemoryLayout().GetLogicalDimensionActiveSize(0), + input2.GetMemoryLayout().GetLogicalDimensionActiveSize(1), + input1.GetMemoryLayout().GetLogicalDimensionActiveSize(1), + input1.GetMemoryLayout().GetExtent(1), + !input1.GetMemoryLayout().IsCanonicalOrder(), + input2, + input2.GetMemoryLayout().GetExtent(1), + !input2.GetMemoryLayout().IsCanonicalOrder(), + outputLayout.GetExtent(1), + !outputLayout.IsCanonicalOrder(), + panelM, + panelN, + panelK, + kernelM, + kernelN, + kernelK, + gemmImpl) + { + if (input1.GetMemoryLayout().NumDimensions() != 2 || input2.GetMemoryLayout().NumDimensions() != 2) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "Input matrices must have a memory layout with 2 dimensions"); + } + if (_k != input2.GetMemoryLayout().GetLogicalDimensionActiveSize(0)) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "Input matrices incompatible"); + } + } + + template + MatrixMatrixMultiplyCodeNode::MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, int m, int n, int k, int matrix1Stride, const model::OutputPort& input2, int matrix2Stride, int outputMatrixStride, const MatrixMatrixMultiplyImplementation& gemmImpl) : + MatrixMatrixMultiplyCodeNode(input1, m, n, k, matrix1Stride, input2, matrix2Stride, outputMatrixStride, _defaultPanelM, _defaultPanelN, _defaultPanelK, _defaultKernelM, _defaultKernelN, _defaultKernelK, gemmImpl) + { + } + + template + MatrixMatrixMultiplyCodeNode::MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, int m, int n, int k, int matrix1Stride, const model::OutputPort& input2, int matrix2Stride, int outputMatrixStride, int panelM, int panelN, int panelK, int kernelM, int kernelN, int kernelK, const MatrixMatrixMultiplyImplementation& gemmImpl) : + MatrixMatrixMultiplyCodeNode(input1, m, n, k, matrix1Stride, false, input2, matrix2Stride, false, outputMatrixStride, false, panelM, panelN, panelK, kernelM, kernelN, kernelK, gemmImpl) + { + } + + template + MatrixMatrixMultiplyCodeNode::MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, int m, int n, int k, int matrix1Stride, bool transpose1, const model::OutputPort& input2, int matrix2Stride, bool transpose2, int outputMatrixStride, const MatrixMatrixMultiplyImplementation& gemmImpl) : + MatrixMatrixMultiplyCodeNode(input1, m, n, k, matrix1Stride, transpose1, input2, matrix2Stride, transpose2, outputMatrixStride, _defaultPanelM, _defaultPanelN, _defaultPanelK, _defaultKernelM, _defaultKernelN, _defaultKernelK, gemmImpl) + { + } + + template + MatrixMatrixMultiplyCodeNode::MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, int m, int n, int k, int matrix1Stride, bool transpose1, const model::OutputPort& input2, int matrix2Stride, bool transpose2, int outputMatrixStride, int panelM, int panelN, int panelK, int kernelM, int kernelN, int kernelK, const 
MatrixMatrixMultiplyImplementation& gemmImpl) : + MatrixMatrixMultiplyCodeNode(input1, m, n, k, matrix1Stride, transpose1, input2, matrix2Stride, transpose2, outputMatrixStride, false, panelM, panelN, panelK, kernelM, kernelN, kernelK, gemmImpl) + { + } + + template + MatrixMatrixMultiplyCodeNode::MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, int m, int n, int k, int matrix1Stride, bool transpose1, const model::OutputPort& input2, int matrix2Stride, bool transpose2, int outputMatrixStride, bool transposeOutput, const MatrixMatrixMultiplyImplementation& gemmImpl) : + MatrixMatrixMultiplyCodeNode(input1, m, n, k, matrix1Stride, transpose1, input2, matrix2Stride, transpose2, outputMatrixStride, transposeOutput, _defaultPanelM, _defaultPanelN, _defaultPanelK, _defaultKernelM, _defaultKernelN, _defaultKernelK, gemmImpl) + { + } + + template + MatrixMatrixMultiplyCodeNode::MatrixMatrixMultiplyCodeNode(const model::OutputPort& input1, int m, int n, int k, int matrix1Stride, bool transpose1, const model::OutputPort& input2, int matrix2Stride, bool transpose2, int outputMatrixStride, bool transposeOutput, int panelM, int panelN, int panelK, int kernelM, int kernelN, int kernelK, const MatrixMatrixMultiplyImplementation& gemmImpl) : + CompilableCodeNode("MatrixMatrixMultiplyCodeNode", { &_input1, &_input2 }, { &_output }), + _input1(this, input1, defaultInput1PortName), + _input2(this, input2, defaultInput2PortName), + _output(this, defaultOutputPortName, utilities::MemoryShape{ m, n }), + _m(m), + _n(n), + _k(k), + _lda(matrix1Stride), + _ldb(matrix2Stride), + _ldc(outputMatrixStride), + _transpose1(transpose1), + _transpose2(transpose2), + _transposeOutput(transposeOutput), + _panelM(panelM), + _panelN(panelN), + _panelK(panelK), + _kernelM(kernelM), + _kernelN(kernelN), + _kernelK(kernelK), + _impl(gemmImpl) + { + if (static_cast(input1.Size()) != m * k) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "Input matrix 1 size incorrect"); + } + + if (static_cast(input2.Size()) != k * n) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "Input matrix 2 size incorrect"); + } + } + + template + void MatrixMatrixMultiplyCodeNode::ZeroMatrix(value::Matrix matrix) const + { + namespace loopnests = ell::value::loopnests; + int M = (int)(matrix.Rows()); + int N = (int)(matrix.Columns()); + loopnests::Index m("m"), n("n"); + loopnests::LoopNest zeroingLoop({ { m, { 0, M } }, + { n, { 0, N } } }); + + auto [mKernelOuter, mKernelInner] = zeroingLoop.Split(m, _kernelM); + auto [nKernelOuter, nKernelInner] = zeroingLoop.Split(n, _kernelN); + auto zeroingKernel = loopnests::Kernel("Zero_output") + .Inputs(matrix.GetValue()) + .Indices(m, n) + .Define([&](value::Matrix C, value::Scalar row, value::Scalar col) { + C(row, col) = static_cast(0); + }); + zeroingLoop.AddKernel(zeroingKernel); + zeroingLoop.Unroll(mKernelInner); + zeroingLoop.Unroll(nKernelInner); + loopnests::CodeGenerator zeroingGenerator; + zeroingGenerator.Run(zeroingLoop); + } + + template + void MatrixMatrixMultiplyCodeNode::ForLoopGEMM(const value::Matrix matA, const value::Matrix matB, value::Matrix matC) + { + namespace loopnests = ell::value::loopnests; + // Currently treat beta as 0 + ZeroMatrix(matC); + + int M = (int)(matA.Rows()); + int N = (int)(matB.Columns()); + int K = (int)(matA.Columns()); + + loopnests::Index m("m"), n("n"), k("k"); + loopnests::LoopNest loop({ { m, { 0, M } }, + { n, { 0, N } }, + { k, { 0, K } } }); + + // innermost GEMM 
kernel + auto kernel = loopnests::Kernel("GEMMKernel") + .Inputs(matA.GetValue(), matB.GetValue(), matC.GetValue()) + .Indices(m, n, k) + .Define([](value::Matrix A, value::Matrix B, value::Matrix C, value::Scalar m, value::Scalar n, value::Scalar k) { + C(m, n) += A(m, k) * B(k, n); + }); + + loop.AddKernel(kernel, loopnests::LoopFragmentType::body); + loop.SetLoopOrder({ m, k, n }); + auto outputC = matC.GetValue(); + outputC.SetLayout({ { (int)matC.Size() } }); + // ell::DebugPrintVector(outputC); + loopnests::CodeGenerator generator; + generator.Run(loop); + } + + template + void MatrixMatrixMultiplyCodeNode::Gemm(value::Matrix A, value::Matrix B, value::Matrix C) + { + using namespace value; + + int vectorSize = 4; + int NumRowsInKernel = 2; + + InvokeForContext([&](LLVMContext& context) { + auto targetMachine = context.GetModuleEmitter().GetTargetMachine(); + auto fn = context.GetFunctionEmitter().GetFunction(); + auto info = targetMachine->getTargetTransformInfo(*fn); + // See https://llvm.org/doxygen/classllvm_1_1TargetTransformInfo.html for the big list of amazing things you can get from this TargetMachineInfo object + vectorSize = static_cast(info.getRegisterBitWidth(true)) / (8 * sizeof(float)); + if (vectorSize > 8) + { + // The vector width is 16 floats instead of 8 (e.g. in AVX-512), so double the sizes as needed + vectorSize = 16; + NumRowsInKernel = 12; + } + }); + + int NumColumnsInKernel = 2 * vectorSize; + if (NumColumnsInKernel > (int)B.Columns()) + { + NumColumnsInKernel = vectorSize; + NumRowsInKernel *= 2; + } + + // Declare and/or calculate constants + const int OutputRows = (int)(A.Rows()); + const int OutputColumns = (int)(B.Columns()); + const int InnerDimension = (int)(A.Columns()); + const int kUnroll = 4; + int columnBlock = std::min(64, OutputColumns); + int innerDimensionBlock = std::min(256, InnerDimension); + + // Declare indexes + loopnests::Index i("i"), j("j"), k("k"); + // Define LoopNest + auto nest = Using({ A, B }, ArgumentType::Input) + .Using({ C }, ArgumentType::Output) + .ForAll(i, 0, OutputRows) + .ForAll(j, 0, OutputColumns) + .ForAll(k, 0, InnerDimension) + .Do([](Matrix A_, Matrix B_, Matrix C_, Scalar i_, Scalar j_, Scalar k_) { + C_(i_, j_) += B_(k_, j_) * A_(i_, k_); + }); + auto& schedule = nest.GetSchedule(); + + auto topLevelJ = j; + auto topLevelK = k; + + // Declare splits + auto jCache = schedule.Split(j, columnBlock); + auto kCache = schedule.Split(k, innerDimensionBlock); + auto kBlock = schedule.Split(k, kUnroll); + auto jKernelOuter2 = schedule.Split(j, NumColumnsInKernel); + auto jKernelOuter = schedule.Split(j, vectorSize); + auto iKernelOuter = schedule.Split(i, NumRowsInKernel); + + // Set the order + schedule.SetOrder({ jCache, kCache, iKernelOuter, jKernelOuter2, kBlock, k, i, jKernelOuter, j }); + + // Set up caching + if ((OutputColumns > NumColumnsInKernel) && ((OutputColumns % NumColumnsInKernel) == 0)) + { + auto extraCacheBParams = std::make_tuple(NumColumnsInKernel, jKernelOuter2, BoundaryConditionHandling::ZeroPadding); + schedule.template Cache(B, + { topLevelK, topLevelJ }, + { innerDimensionBlock, columnBlock }, + { kCache, jCache }, + std::nullopt, // Order isn't used by BLASTCopy + extraCacheBParams); + } + auto extraZeroInputReduceOutputParams = std::make_tuple(vectorSize); + schedule.template Cache(C, + { iKernelOuter, jKernelOuter2 }, + { NumRowsInKernel, NumColumnsInKernel }, + { iKernelOuter, jKernelOuter2 }, + utilities::RowMajorMatrixOrder, + extraZeroInputReduceOutputParams); + + // Set unrolling + 
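The schedule built above follows the same loop-nest pattern as ZeroMatrix earlier in this file, just with more splits plus caching. For reference, a minimal sketch of that pattern using only the value::loopnests calls this file already uses (the block size of 4 and the fill kernel are illustrative, not part of the patch):

    // Sketch: split one index of a 2-D nest and unroll the inner block,
    // mirroring the Split/AddKernel/Unroll/Run sequence used in ZeroMatrix.
    void TiledFillSketch(value::Matrix matrix)
    {
        namespace loopnests = ell::value::loopnests;
        int M = (int)(matrix.Rows());
        int N = (int)(matrix.Columns());
        loopnests::Index m("m"), n("n");
        loopnests::LoopNest nest({ { m, { 0, M } },
                                   { n, { 0, N } } });
        auto [mOuter, mInner] = nest.Split(m, 4); // block the row loop by 4
        auto kernel = loopnests::Kernel("Fill")
                          .Inputs(matrix.GetValue())
                          .Indices(m, n)
                          .Define([](value::Matrix C, value::Scalar row, value::Scalar col) {
                              C(row, col) = 1.0f; // assumes a float matrix
                          });
        nest.AddKernel(kernel);
        nest.Unroll(mInner); // unroll the blocked inner loop, as ZeroMatrix does
        loopnests::CodeGenerator generator;
        generator.Run(nest);
    }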
schedule.Unroll(jKernelOuter); + schedule.Unroll(i); + schedule.Unroll(k); + + // Run the generator + nest.Run(); + } + + template + void MatrixMatrixMultiplyCodeNode::GemmFn(value::Matrix A, value::Matrix B, value::Matrix C, int thread_num) + { + value::DeclareFunction("InnerMatMul" + std::to_string(thread_num)) + .Parameters(A, B, C) + .Define([this](value::Matrix A, value::Matrix B, value::Matrix C) { + Gemm(A, B, C); + })(A, B, C); + } + + template + void MatrixMatrixMultiplyCodeNode::ParallelizeGemmCol(Matrix A, Matrix B, Matrix C, int numThreads) + { + const int columns = B.Columns() / numThreads; + const int col_spill = B.Columns() % numThreads; + + Parallelize( + numThreads, + std::tuple{ A, B, C }, + [=](value::Scalar id, value::Matrix A, value::Matrix B, value::Matrix C) + { + value::Scalar colStart = id * value::Scalar{columns}; + int thread_seq = 0; + + EmitterContext::IfContext IfCxt = If(id == thread_seq, + [&] { + GemmFn( + A, + B.SubMatrix(value::Scalar{0}, colStart, (int)B.Rows(), columns), + C.SubMatrix(value::Scalar{0}, colStart, (int)C.Rows(), columns), + thread_seq); + }); + + thread_seq++; + + for(int i = thread_seq; i < numThreads; i++) + { + IfCxt.ElseIf(id == i, + [&] { + int actualColumns = i==(numThreads-1) ? columns + col_spill : columns; + + GemmFn( + A, + B.SubMatrix(value::Scalar{0}, colStart, (int)B.Rows(), actualColumns), + C.SubMatrix(value::Scalar{0}, colStart, (int)C.Rows(), actualColumns), + i); + }); + } + }); + } + + template + void MatrixMatrixMultiplyCodeNode::ParallelizeGemmRow(Matrix A, Matrix B, Matrix C, int numThreads) + { + const int rows = A.Rows() / numThreads; + const int row_spill = A.Rows() % numThreads; + + Parallelize( + numThreads, + std::tuple{ A, B, C }, + [=](value::Scalar id, value::Matrix A, value::Matrix B, value::Matrix C) + { + value::Scalar rowStart = id * value::Scalar{rows}; + int thread_seq = 0; + + EmitterContext::IfContext IfCxt = If(id == thread_seq, + [&] { + GemmFn( + A.SubMatrix(rowStart, value::Scalar{0}, rows, (int)A.Columns()), + B, + C.SubMatrix(rowStart, value::Scalar{0}, rows, (int)C.Columns()), + thread_seq); + }); + + thread_seq++; + + for(int i = thread_seq; i < numThreads; i++) + { + IfCxt.ElseIf(id == i, + [&] { + int actualRows = i==(numThreads-1) ? 
rows + row_spill : rows; + GemmFn( + A.SubMatrix(rowStart, value::Scalar{0}, actualRows, (int)A.Columns()), + B, + C.SubMatrix(rowStart, value::Scalar{0}, actualRows, (int)C.Columns()), + i); + }); + } + }); + } + + template + void MatrixMatrixMultiplyCodeNode::ELLCodeGEMM(const value::Matrix matA, const value::Matrix matB, value::Matrix matC) + { + double computationSize = double(matC.Rows() * matC.Columns() * matA.Columns()); + + size_t minThreadLoad = 112 * 1024; + const size_t maxThreads = 4; + size_t numThreads = maxThreads; + + if (computationSize < double(minThreadLoad * maxThreads)) + { + numThreads = std::min(int(computationSize / double(minThreadLoad)) + 1, int(maxThreads)); + } + if (numThreads > 1) + { + if (matC.Rows() > matC.Columns()) + { + ParallelizeGemmRow(matA, matB, matC, int(numThreads)); + } + else + { + ParallelizeGemmCol(matA, matB, matC, int(numThreads)); + } + } + else + { + Gemm(matA, matB, matC); + } + } + + template + void MatrixMatrixMultiplyCodeNode::Define(value::FunctionDeclaration& fn) + { + (void)fn.Define([this](const value::Value valueA, const value::Value valueB, value::Value valueC) { + auto tempA = valueA; + tempA.SetLayout(utilities::MemoryLayout({ _m, _k })); + auto tempB = valueB; + tempB.SetLayout(utilities::MemoryLayout({ _k, _n })); + auto tempC = valueC; + if (_transposeOutput) + { + tempC.SetLayout(utilities::MemoryLayout({ _n, _m }, utilities::DimensionOrder{1, 0})); + } + else + { + tempC.SetLayout(utilities::MemoryLayout({ _m, _n })); + } + + auto matA = value::Matrix(tempA); + auto matB = value::Matrix(tempB); + auto matC = value::Matrix(tempC); + + switch (_impl) + { + case (MatrixMatrixMultiplyImplementation::SimpleForLoops): + ForLoopGEMM(matA, matB, matC); + break; + case (MatrixMatrixMultiplyImplementation::Mlas_Loopnest_Value): + ELLCodeGEMM(matA, matB, matC); + break; + case (MatrixMatrixMultiplyImplementation::LAST): + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "MatrixMatrixMultiplyImplementation::LAST is not a valid impl value"); + break; + } + }); + } + + template + void MatrixMatrixMultiplyCodeNode::Copy(model::ModelTransformer& transformer) const + { + const auto& newInput1 = transformer.GetCorrespondingInputs(_input1); + const auto& newInput2 = transformer.GetCorrespondingInputs(_input2); + auto newNode = transformer.AddNode>(newInput1, _m, _n, _k, _lda, _transpose1, newInput2, _ldb, _transpose2, _ldc, _transposeOutput, _panelM, _panelN, _panelK, _kernelM, _kernelN, _kernelK, _impl); + transformer.MapNodeOutput(output, newNode->output); + } + + template + utilities::ArchiveVersion MatrixMatrixMultiplyCodeNode::GetArchiveVersion() const + { + constexpr utilities::ArchiveVersion currentArchiveVersion = { utilities::ArchiveVersionNumbers::v2 }; + return std::max(currentArchiveVersion, CompilableCodeNode::GetArchiveVersion()); + } + + template + bool MatrixMatrixMultiplyCodeNode::CanReadArchiveVersion(const utilities::ArchiveVersion& version) const + { + return CompilableCodeNode::CanReadArchiveVersion(version); + } + + template + void MatrixMatrixMultiplyCodeNode::WriteToArchive(utilities::Archiver& archiver) const + { + Node::WriteToArchive(archiver); + archiver[defaultInput1PortName] << _input1; + archiver[defaultInput2PortName] << _input2; + archiver[defaultOutputPortName] << _output; + archiver["m"] << _m; + archiver["n"] << _n; + archiver["k"] << _k; + archiver["lda"] << _lda; + archiver["ldb"] << _ldb; + archiver["ldc"] << _ldc; + archiver["transpose1"] << _transpose1; + 
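The thread-count heuristic in ELLCodeGEMM above reads as a small pure function. A sketch under the same constants (roughly 112K multiply-accumulates of work per extra thread, capped at four threads; the function name is illustrative):

    #include <algorithm>
    #include <cstddef>

    // Sketch of ELLCodeGEMM's partitioning decision: one thread per
    // ~112K multiply-accumulate operations, capped at maxThreads.
    size_t ChooseGemmThreads(size_t M, size_t N, size_t K)
    {
        const size_t minThreadLoad = 112 * 1024;
        const size_t maxThreads = 4;
        double computationSize = double(M * N * K);
        if (computationSize < double(minThreadLoad * maxThreads))
        {
            return std::min(size_t(computationSize / double(minThreadLoad)) + 1, maxThreads);
        }
        return maxThreads;
    }

The resulting count is then applied along the longer output dimension: ParallelizeGemmRow when C has more rows than columns, ParallelizeGemmCol otherwise, with the remainder rows or columns folded into the last thread's slice.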
archiver["transpose2"] << _transpose2; + archiver["transposeOutput"] << _transposeOutput; + archiver["panelM"] << _panelM; + archiver["panelN"] << _panelN; + archiver["panelK"] << _panelK; + archiver["kernelM"] << _kernelM; + archiver["kernelN"] << _kernelN; + archiver["kernelK"] << _kernelK; + archiver["gemmImpl"] << static_cast(_impl); + } + + template + void MatrixMatrixMultiplyCodeNode::ReadFromArchive(utilities::Unarchiver& archiver) + { + Node::ReadFromArchive(archiver); + archiver[defaultInput1PortName] >> _input1; + archiver[defaultInput2PortName] >> _input2; + archiver[defaultOutputPortName] >> _output; + archiver["m"] >> _m; + archiver["n"] >> _n; + archiver["k"] >> _k; + archiver["lda"] >> _lda; + archiver["ldb"] >> _ldb; + archiver["ldc"] >> _ldc; + archiver["transpose1"] >> _transpose1; + archiver["transpose2"] >> _transpose2; + archiver.OptionalProperty("transposeOutput", false) >> _transposeOutput; + archiver["panelM"] >> _panelM; + archiver["panelN"] >> _panelN; + archiver["panelK"] >> _panelK; + archiver["kernelM"] >> _kernelM; + archiver["kernelN"] >> _kernelN; + archiver["kernelK"] >> _kernelK; + int gemmImpl = 0; + archiver["gemmImpl"] >> gemmImpl; + _impl = static_cast(gemmImpl); + } + + // + // Explicit instantiation definitions + // + template class MatrixMatrixMultiplyCodeNode; + template class MatrixMatrixMultiplyCodeNode; +} // namespace nodes +} // namespace ell diff --git a/libraries/nodes/src/NeuralNetworkPredictorNode.cpp b/libraries/nodes/src/NeuralNetworkPredictorNode.cpp index 23f1d7c7b..db2b090b0 100644 --- a/libraries/nodes/src/NeuralNetworkPredictorNode.cpp +++ b/libraries/nodes/src/NeuralNetworkPredictorNode.cpp @@ -8,7 +8,7 @@ #include "NeuralNetworkPredictorNode.h" #include "ConstantNode.h" -#include "ReorderDataNode.h" +#include "ReorderDataCodeNode.h" #include @@ -122,7 +122,7 @@ namespace nodes // If the input layer wants padding on its output, add a ReorderDataNode to add padding model::PortMemoryLayout inputNodeLayout(model::MemoryShape{ (int)inputShape.NumRows(), (int)inputShape.NumColumns(), (int)inputShape.NumChannels() }); model::PortMemoryLayout paddedInputNodeLayout(model::MemoryShape{ (int)inputShape.NumRows(), (int)inputShape.NumColumns(), (int)inputShape.NumChannels() }, model::MemoryShape{ (int)padding, (int)padding, 0 }); - const auto& paddedInput = ReorderData(*newInputElements, inputNodeLayout, paddedInputNodeLayout, predictors::neural::GetPaddingValue(outputPadding.paddingScheme)); + const auto& paddedInput = ReorderDataWithCodeNode(*newInputElements, inputNodeLayout, paddedInputNodeLayout, predictors::neural::GetPaddingValue(outputPadding.paddingScheme)); newInputElements = &paddedInput; } diff --git a/libraries/nodes/src/UnrolledConvolutionNode.cpp b/libraries/nodes/src/UnrolledConvolutionNode.cpp index 8524f7f85..d02495f86 100644 --- a/libraries/nodes/src/UnrolledConvolutionNode.cpp +++ b/libraries/nodes/src/UnrolledConvolutionNode.cpp @@ -9,8 +9,10 @@ #include "UnrolledConvolutionNode.h" #include "ConstantNode.h" #include "MatrixMatrixMultiplyNode.h" +#include "MatrixMatrixMultiplyCodeNode.h" #include "ReceptiveFieldMatrixNode.h" -#include "ReorderDataNode.h" +#include "ReorderDataCodeNode.h" +#include #include @@ -90,6 +92,23 @@ namespace nodes return weightsMatrix; } + template + bool UnrolledConvolutionNode::IsELLCodeTarget(model::ModelTransformer& transformer) const + { + auto compiler = dynamic_cast(transformer.GetContext().GetCompiler()); + if(compiler != nullptr) + { + auto device_name = 
compiler->GetCompilerOptions().targetDevice.deviceName; + bool skip_ELLCode = compiler->GetCompilerOptions().skip_ellcode; + if (device_name.compare("pi3") == 0 && !skip_ELLCode) + { + return true; + } + } + + return false; + } + template void UnrolledConvolutionNode::Copy(model::ModelTransformer& transformer) const { @@ -148,6 +167,7 @@ namespace nodes std::array dataOrder = useNewMethod ? drcOrder : rcdOrder; assert(outputPadding == 0 && "Unrolled convolution node output padding not supported yet"); + bool isELLCodeTarget = IsELLCodeTarget(transformer); // weights: numFilters x fieldVolumeSize == m x k // ShapedInput: fieldVolumeSize x outputRows == k x n // Matrix multiply output: numFilters x outputRows = m x n @@ -155,20 +175,39 @@ namespace nodes if (dataOrder == rcdOrder) // don't reorder input -- use old method { auto receptiveFieldMatrixNode = transformer.AddNode>(newInput, inputLayout, filterSize, _stride, inputPadding, dataOrder, outputImageWidth, outputImageHeight); - auto matrixMultNode = transformer.AddNode>(weights, m, n, k, lda, false, receptiveFieldMatrixNode->output, ldb, false, ldc, true); - - if (outputPadding != 0) + if(isELLCodeTarget) { - // Add padding - model::PortMemoryLayout outputLayout(model::MemoryShape{ outputImageHeight, outputImageWidth, numFilters }); - model::PortMemoryLayout paddedOutputLayout(model::MemoryShape{ outputImageHeight, outputImageWidth, numFilters }, model::MemoryShape{ outputPadding, outputPadding, 0 }); - const auto& reorderedOutput = ReorderData(matrixMultNode->output, outputLayout, paddedOutputLayout); - transformer.MapNodeOutput(this->output, reorderedOutput); + auto matrixMultNode = transformer.AddNode>(weights, m, n, k, lda, false, receptiveFieldMatrixNode->output, ldb, false, ldc, true); + if (outputPadding != 0) + { + // Add padding + model::PortMemoryLayout outputLayout(model::MemoryShape{ outputImageHeight, outputImageWidth, numFilters }); + model::PortMemoryLayout paddedOutputLayout(model::MemoryShape{ outputImageHeight, outputImageWidth, numFilters }, model::MemoryShape{ outputPadding, outputPadding, 0 }); + const auto& reorderedOutput = ReorderDataWithCodeNode(matrixMultNode->output, outputLayout, paddedOutputLayout); + transformer.MapNodeOutput(this->output, reorderedOutput); + } + else + { + transformer.MapNodeOutput(this->output, matrixMultNode->output); + } } else { - transformer.MapNodeOutput(this->output, matrixMultNode->output); + auto matrixMultNode = transformer.AddNode>(weights, m, n, k, lda, false, receptiveFieldMatrixNode->output, ldb, false, ldc, true); + if (outputPadding != 0) + { + // Add padding + model::PortMemoryLayout outputLayout(model::MemoryShape{ outputImageHeight, outputImageWidth, numFilters }); + model::PortMemoryLayout paddedOutputLayout(model::MemoryShape{ outputImageHeight, outputImageWidth, numFilters }, model::MemoryShape{ outputPadding, outputPadding, 0 }); + const auto& reorderedOutput = ReorderDataWithCodeNode(matrixMultNode->output, outputLayout, paddedOutputLayout); + transformer.MapNodeOutput(this->output, reorderedOutput); + } + else + { + transformer.MapNodeOutput(this->output, matrixMultNode->output); + } } + } else // reorder input to be channels x rows x columns (drc) (then we can use the 'new' receptive field matrix generation) { @@ -177,23 +216,41 @@ namespace nodes // Remove padding and transpose to channel-major order model::PortMemoryLayout inputLayout(model::MemoryShape{ inputHeight, inputWidth, inputDepth }, model::MemoryShape{ inputPadding, inputPadding, 0 }); 
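isELLCodeTarget gates the two paths below, matching the release note that the new MatrixMatrixMultiplyCodeNode path is only enabled for select ARM targets. Condensed as a sketch (the free-function form and the MapCompiler parameter type are assumptions; the option names are the ones used above):

    // Sketch: take the ELLCode GEMM path only when compiling for "pi3"
    // and --skip_ellcode was not passed (which keeps the OpenBLAS path).
    bool UseEllCodeGemmSketch(const model::MapCompiler* compiler)
    {
        if (compiler == nullptr)
        {
            return false; // not compiling, e.g. Compute() without a compiler
        }
        const auto& options = compiler->GetCompilerOptions();
        return options.targetDevice.deviceName == "pi3" && !options.skip_ellcode;
    }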
model::PortMemoryLayout transposedInputLayout(model::MemoryShape{ inputDepth, inputHeight, inputWidth }, model::DimensionOrder{ 2, 0, 1 }); // Note: memory layout constructor takes the sizes in physical dimension order - const auto& reorderedInput = ReorderData(newInput, inputLayout, transposedInputLayout); + const auto& reorderedInput = ReorderDataWithCodeNode(newInput, inputLayout, transposedInputLayout); auto receptiveFieldMatrixNode = transformer.AddNode>(reorderedInput, reorderedInput.GetMemoryLayout(), _filterSize, _stride, inputPadding, dataOrder, outputImageWidth, outputImageHeight); - auto matrixMultNode = transformer.AddNode>(weights, m, n, k, lda, false, receptiveFieldMatrixNode->output, ldb, false, ldc, true); - - if (outputPadding != 0) + if(isELLCodeTarget) { - // Add padding - model::PortMemoryLayout outputLayout(model::MemoryShape{ outputImageHeight, outputImageWidth, numFilters }); - model::PortMemoryLayout paddedOutputLayout(model::MemoryShape{ outputImageHeight, outputImageWidth, numFilters }, model::MemoryShape{ outputPadding, outputPadding, 0 }); - const auto& reorderedOutput = ReorderData(matrixMultNode->output, outputLayout, paddedOutputLayout); - transformer.MapNodeOutput(this->output, reorderedOutput); + auto matrixMultNode = transformer.AddNode>(weights, m, n, k, lda, false, receptiveFieldMatrixNode->output, ldb, false, ldc, true); + if (outputPadding != 0) + { + // Add padding + model::PortMemoryLayout outputLayout(model::MemoryShape{ outputImageHeight, outputImageWidth, numFilters }); + model::PortMemoryLayout paddedOutputLayout(model::MemoryShape{ outputImageHeight, outputImageWidth, numFilters }, model::MemoryShape{ outputPadding, outputPadding, 0 }); + const auto& reorderedOutput = ReorderDataWithCodeNode(matrixMultNode->output, outputLayout, paddedOutputLayout); + transformer.MapNodeOutput(this->output, reorderedOutput); + } + else + { + transformer.MapNodeOutput(this->output, matrixMultNode->output); + } } else { - transformer.MapNodeOutput(this->output, matrixMultNode->output); - } + auto matrixMultNode = transformer.AddNode>(weights, m, n, k, lda, false, receptiveFieldMatrixNode->output, ldb, false, ldc, true); + if (outputPadding != 0) + { + // Add padding + model::PortMemoryLayout outputLayout(model::MemoryShape{ outputImageHeight, outputImageWidth, numFilters }); + model::PortMemoryLayout paddedOutputLayout(model::MemoryShape{ outputImageHeight, outputImageWidth, numFilters }, model::MemoryShape{ outputPadding, outputPadding, 0 }); + const auto& reorderedOutput = ReorderDataWithCodeNode(matrixMultNode->output, outputLayout, paddedOutputLayout); + transformer.MapNodeOutput(this->output, reorderedOutput); + } + else + { + transformer.MapNodeOutput(this->output, matrixMultNode->output); + } + } } return true; } diff --git a/libraries/nodes/src/WinogradConvolutionNode.cpp b/libraries/nodes/src/WinogradConvolutionNode.cpp index 48e25e9ed..189ff6147 100644 --- a/libraries/nodes/src/WinogradConvolutionNode.cpp +++ b/libraries/nodes/src/WinogradConvolutionNode.cpp @@ -8,7 +8,7 @@ #include "WinogradConvolutionNode.h" #include "ConstantNode.h" -#include "ReorderDataNode.h" +#include "ReorderDataCodeNode.h" #include @@ -1187,7 +1187,7 @@ namespace nodes { // add a ReorderDataNode to convert to channel-major, which is more efficient in this case auto orderArr = utilities::ChannelMajorTensorOrder; - const auto& reorderedData = ReorderData(*newInput, convInputLayout, convInputLayout, utilities::DimensionOrder{ orderArr }); + const auto& reorderedData = 
ReorderDataWithCodeNode(*newInput, convInputLayout, convInputLayout, utilities::DimensionOrder{ orderArr }); newInput = &reorderedData; convInputLayout = reorderedData.GetMemoryLayout(); } diff --git a/libraries/nodes/test/src/BasicMathNodesTests.cpp b/libraries/nodes/test/src/BasicMathNodesTests.cpp index ae5fe0e4e..6ad6bd9ac 100644 --- a/libraries/nodes/test/src/BasicMathNodesTests.cpp +++ b/libraries/nodes/test/src/BasicMathNodesTests.cpp @@ -70,7 +70,7 @@ void TestBasicMathNodes() TestUnaryOperationNodeCompute(); TestBroadcastUnaryOperationNodeCompute(); - TestBroadcastLinearFunctionNodeCompute(); + TestBroadcastBinaryOperationNodeComputeFull(); TestBroadcastBinaryOperationNodeComputeAdd(); TestBroadcastBinaryOperationNodeComputeSubtract(); @@ -79,6 +79,8 @@ void TestBasicMathNodes() TestBroadcastBinaryOperationNodeComputeWithBadLayout(); TestBroadcastBinaryOperationNodeComputeDifferentBroadcastDimensions(); TestBroadcastTernaryOperationNodeComputeFMA(); + + TestBroadcastLinearFunctionNodeCompute(); } void TestUnaryOperationNodeCompute(UnaryOperationType op, double (*expectedTransform)(double)) @@ -296,7 +298,6 @@ void TestBroadcastBinaryOperationNodeComputeAdd() model::PortMemoryLayout input1Layout({ numRows, numColumns, numChannels }); model::PortMemoryLayout input2Layout({ 1, numColumns, 1 }); - model::PortMemoryLayout input3Layout({ 1, numColumns, 1 }); // clang-format off std::vector input1Vals{ 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4 }; @@ -327,7 +328,6 @@ void TestBroadcastBinaryOperationNodeComputeSubtract() model::PortMemoryLayout input1Layout({ numRows, numColumns, numChannels }); model::PortMemoryLayout input2Layout({ 1, numColumns, 1 }); - model::PortMemoryLayout input3Layout({ 1, numColumns, 1 }); // clang-format off std::vector input1Vals{ 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4 }; @@ -473,7 +473,7 @@ void TestBroadcastBinaryOperationNodeComputeDifferentBroadcastDimensions() // clang-format off std::vector input1Vals{ 1, - 2}; + 2 }; // broadcasts to: { 1, 1, 1, // 2, 2, 2 } std::vector input2Vals{ 2, 4, 6 }; diff --git a/libraries/nodes/test/src/DSPNodesTests.cpp b/libraries/nodes/test/src/DSPNodesTests.cpp index 5282128e3..4458aa989 100644 --- a/libraries/nodes/test/src/DSPNodesTests.cpp +++ b/libraries/nodes/test/src/DSPNodesTests.cpp @@ -37,7 +37,7 @@ #include #include #include -#include +#include #include #include #include @@ -370,7 +370,7 @@ static void TestIIRFilterNode4() template static void TestMelFilterBankNode() { - const ValueType epsilon = static_cast(1e-6); + const ValueType epsilon = static_cast(1e-5); const size_t numFilters = 13; const size_t windowSize = 512; const size_t fftSize = 512; @@ -593,7 +593,7 @@ static void TestConvolutionNodeCompileVsReference(ImageShape inputShape, Filters auto convInputLayout = inputMemoryLayout.ReorderedCopy({ shouldReorderToChannelMajor ? utilities::ChannelMajorTensorOrder : utilities::RowMajorTensorOrder }); auto convOutputLayout = outputMemoryLayout.ReorderedCopy({ shouldReorderToChannelMajor ? 
utilities::ChannelMajorTensorOrder : utilities::RowMajorTensorOrder }); - auto preConvReorderNode = model.AddNode>(inputNode->output, inputMemoryLayout, convInputLayout); + auto preConvReorderNode = model.AddNode>(inputNode->output, inputMemoryLayout, convInputLayout); const auto* newInput = &preConvReorderNode->output; model::PortElements convOutput; @@ -629,7 +629,7 @@ static void TestConvolutionNodeCompileVsReference(ImageShape inputShape, Filters } } - auto postConvReorderNode = model.AddNode>(convOutput, convOutputLayout, outputMemoryLayout); + auto postConvReorderNode = model.AddNode>(convOutput, convOutputLayout, outputMemoryLayout); auto map = model::Map(model, { { "input", inputNode } }, { { "output", postConvReorderNode->output } }); diff --git a/libraries/optimization/CMakeLists.txt b/libraries/optimization/CMakeLists.txt index 54ae1b097..49641749b 100644 --- a/libraries/optimization/CMakeLists.txt +++ b/libraries/optimization/CMakeLists.txt @@ -42,7 +42,7 @@ set(include source_group("src" FILES ${src}) source_group("include" FILES ${include}) -add_library(${library_name} ${src} ${include} ${tcc} ${doc}) +add_library(${library_name} ${src} ${include}) target_include_directories(${library_name} PRIVATE include ${ELL_LIBRARIES_DIR}) target_link_libraries(${library_name} math) @@ -81,7 +81,7 @@ set(test_include source_group("src" FILES ${test_src}) source_group("include" FILES ${test_include}) -add_executable(${test_name} ${test_src} ${test_include} ${test_tcc}) +add_executable(${test_name} ${test_src} ${test_include}) target_include_directories(${test_name} PRIVATE test/include ${ELL_LIBRARIES_DIR}) target_link_libraries(${test_name} optimization testing) copy_shared_libraries(${test_name}) diff --git a/libraries/optimization/include/VectorSolution.h b/libraries/optimization/include/VectorSolution.h index 5d93f463b..04e833b12 100644 --- a/libraries/optimization/include/VectorSolution.h +++ b/libraries/optimization/include/VectorSolution.h @@ -33,7 +33,7 @@ namespace optimization /// Solutions are expected to have a ParameterType. Empty here because this solution type doesn't need any parameters. struct ParametersType {}; - + /// Default constructor. 
    VectorSolution() = default;
diff --git a/libraries/optimization/src/Interval.cpp b/libraries/optimization/src/Interval.cpp
index 845e5bbdf..5bc3aca35 100644
--- a/libraries/optimization/src/Interval.cpp
+++ b/libraries/optimization/src/Interval.cpp
@@ -1,7 +1,7 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 //
 //  Project:  Embedded Learning Library (ELL)
-//  File:     Interval.tcc (optimization)
+//  File:     Interval.cpp (optimization)
 //  Authors:  Ofer Dekel
 //
 ////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/libraries/passes/src/OptimizeReorderDataNodesTransformation.cpp b/libraries/passes/src/OptimizeReorderDataNodesTransformation.cpp
index f46b94647..6dd58e5ee 100644
--- a/libraries/passes/src/OptimizeReorderDataNodesTransformation.cpp
+++ b/libraries/passes/src/OptimizeReorderDataNodesTransformation.cpp
@@ -10,7 +10,7 @@
 #include
 
-#include
+#include
 #include
 #include
@@ -50,7 +50,7 @@ namespace passes
             return true;
         }
 
-        if (auto reorderNode = dynamic_cast<const ReorderDataNode<ValueType>*>(&nodeToOptimize))
+        if (auto reorderNode = dynamic_cast<const ReorderDataCodeNode<ValueType>*>(&nodeToOptimize))
         {
             const auto& node = *reorderNode;
@@ -71,9 +71,9 @@ namespace passes
             while (currentNode != nullptr)
             {
                 // iff we have one dependent node and it's a reorder node
-                const ReorderDataNode<ValueType>* nextNode = nullptr;
+                const ReorderDataCodeNode<ValueType>* nextNode = nullptr;
                 if (currentNode->GetDependentNodes().size() == 1 &&
-                    (nextNode = dynamic_cast<const ReorderDataNode<ValueType>*>(currentNode->GetDependentNodes()[0])))
+                    (nextNode = dynamic_cast<const ReorderDataCodeNode<ValueType>*>(currentNode->GetDependentNodes()[0])))
                 {
                     Log() << "Removing node ReorderDataNode [id = " << currentNode->GetId().ToString() << "] since it is followed by another ReorderDataNode" << EOL;
@@ -114,7 +114,7 @@ namespace passes
             // otherwise, create a new reorder node and use the input to the chain and map its output to the
             // final output of the chain
             const auto& newInput = transformer.GetCorrespondingInputs(node.input);
-            const auto& reorderedInput = nodes::ReorderData(newInput, inputLayout, outputLayout, node.GetPaddingValue());
+            const auto& reorderedInput = nodes::ReorderDataWithCodeNode(newInput, inputLayout, outputLayout, node.GetPaddingValue());
             transformer.MapNodeOutput(*finalOutputPort, reorderedInput);
 
             Log() << "ReorderDataNode chain's input and output memory layout are different.
Entire chain is being " diff --git a/libraries/passes/test/src/ModelOptimizerTest.cpp b/libraries/passes/test/src/ModelOptimizerTest.cpp index 02b045b7b..2eb097af4 100644 --- a/libraries/passes/test/src/ModelOptimizerTest.cpp +++ b/libraries/passes/test/src/ModelOptimizerTest.cpp @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include @@ -230,11 +230,11 @@ void TestOptimizeReorderDataNodes1() model::Model model; auto inputMatrixNode = model.AddNode>(model::MemoryShape{ m, k }); - auto reorderedInputMatrixNode = model.AddNode>(inputMatrixNode->output, orderA); + auto reorderedInputMatrixNode = model.AddNode>(inputMatrixNode->output, orderA); std::vector matrixBVals(k * n); auto matrixBNode = model.AddNode>(matrixBVals, model::MemoryShape{ k, n }); - auto reorderedMatrixBNode = model.AddNode>(matrixBNode->output, orderB); + auto reorderedMatrixBNode = model.AddNode>(matrixBNode->output, orderB); auto matMatMultNode = model.AddNode>(reorderedInputMatrixNode->output, reorderedMatrixBNode->output, outputLayout); @@ -280,11 +280,11 @@ void TestOptimizeReorderDataNodes2() model::Model model; auto inputMatrixNode = model.AddNode>(model::MemoryShape{ m, k }); - auto reorderedInputMatrixNode = model.AddNode>(inputMatrixNode->output, orderA); + auto reorderedInputMatrixNode = model.AddNode>(inputMatrixNode->output, orderA); std::vector matrixBVals(k * n); auto matrixBNode = model.AddNode>(matrixBVals, model::MemoryShape{ k, n }); - auto reorderedMatrixBNode = model.AddNode>(matrixBNode->output, orderB); + auto reorderedMatrixBNode = model.AddNode>(matrixBNode->output, orderB); auto matMatMultNode = model.AddNode>(reorderedInputMatrixNode->output, reorderedMatrixBNode->output, outputLayout); @@ -330,11 +330,11 @@ void TestOptimizeReorderDataNodes3() model::Model model; auto inputMatrixNode = model.AddNode>(model::MemoryShape{ m, k }); - auto reorderedInputMatrixNode = model.AddNode>(inputMatrixNode->output, orderA); + auto reorderedInputMatrixNode = model.AddNode>(inputMatrixNode->output, orderA); std::vector matrixBVals(k * n); auto matrixBNode = model.AddNode>(matrixBVals, model::MemoryShape{ k, n }); - auto reorderedMatrixBNode = model.AddNode>(matrixBNode->output, orderB); + auto reorderedMatrixBNode = model.AddNode>(matrixBNode->output, orderB); auto matMatMultNode = model.AddNode>(reorderedInputMatrixNode->output, reorderedMatrixBNode->output, outputLayout); @@ -380,17 +380,17 @@ void TestOptimizeReorderDataNodes4() auto rowMajorLayout = model::PortMemoryLayout(model::MemoryShape{ m, k }).ReorderedCopy(rowMajor); auto colMajorLayout = model::PortMemoryLayout(model::MemoryShape{ m, k }).ReorderedCopy(colMajor); auto inputMatrixNode = model.AddNode>(rowMajorLayout.GetActiveSize()); - auto reorderedInputMatrixNode1 = model.AddNode>(inputMatrixNode->output, rowMajorLayout, colMajorLayout); - auto reorderedInputMatrixNode2 = model.AddNode>(reorderedInputMatrixNode1->output, colMajorLayout, rowMajorLayout); - auto reorderedInputMatrixNode3 = model.AddNode>(reorderedInputMatrixNode2->output, rowMajorLayout, rowMajorLayout); + auto reorderedInputMatrixNode1 = model.AddNode>(inputMatrixNode->output, rowMajorLayout, colMajorLayout); + auto reorderedInputMatrixNode2 = model.AddNode>(reorderedInputMatrixNode1->output, colMajorLayout, rowMajorLayout); + auto reorderedInputMatrixNode3 = model.AddNode>(reorderedInputMatrixNode2->output, rowMajorLayout, rowMajorLayout); std::vector matrixBVals(k * n); rowMajorLayout = model::PortMemoryLayout(model::MemoryShape{ k, n }).ReorderedCopy(rowMajor); 
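These tests build exactly the chains that OptimizeReorderDataNodesTransformation collapses. A minimal sketch of the degenerate round trip (template arguments restored by hand; the value type and namespace qualifiers are illustrative):

    // Sketch: two back-to-back reorders whose net effect is the identity.
    // After the pass runs, both ReorderDataCodeNodes are removed and the
    // downstream node consumes the input node's output directly.
    auto inputNode = model.AddNode<model::InputNode<double>>(model::MemoryShape{ m, k });
    auto toColMajor = model.AddNode<nodes::ReorderDataCodeNode<double>>(inputNode->output, rowMajorLayout, colMajorLayout);
    auto backToRowMajor = model.AddNode<nodes::ReorderDataCodeNode<double>>(toColMajor->output, colMajorLayout, rowMajorLayout);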
colMajorLayout = model::PortMemoryLayout(model::MemoryShape{ k, n }).ReorderedCopy(colMajor); auto matrixBNode = model.AddNode>(matrixBVals, rowMajorLayout); - auto reorderedMatrixBNode1 = model.AddNode>(matrixBNode->output, rowMajorLayout, rowMajorLayout); - auto reorderedMatrixBNode2 = model.AddNode>(reorderedMatrixBNode1->output, rowMajorLayout, colMajorLayout); - auto reorderedMatrixBNode3 = model.AddNode>(reorderedMatrixBNode2->output, colMajorLayout, colMajorLayout); + auto reorderedMatrixBNode1 = model.AddNode>(matrixBNode->output, rowMajorLayout, rowMajorLayout); + auto reorderedMatrixBNode2 = model.AddNode>(reorderedMatrixBNode1->output, rowMajorLayout, colMajorLayout); + auto reorderedMatrixBNode3 = model.AddNode>(reorderedMatrixBNode2->output, colMajorLayout, colMajorLayout); auto matMatMultNode = model.AddNode>(reorderedInputMatrixNode3->output, reorderedMatrixBNode3->output, outputLayout); diff --git a/libraries/passes/test/src/TransformationTest.cpp b/libraries/passes/test/src/TransformationTest.cpp index 5b0e585f6..3e3f5f24b 100644 --- a/libraries/passes/test/src/TransformationTest.cpp +++ b/libraries/passes/test/src/TransformationTest.cpp @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include @@ -304,11 +304,11 @@ void TestOptimizeReorderDataNodesTransformation1() model::Model model; auto inputMatrixNode = model.AddNode>(model::MemoryShape{ m, k }); - auto reorderedInputMatrixNode = model.AddNode>(inputMatrixNode->output, orderA); + auto reorderedInputMatrixNode = model.AddNode>(inputMatrixNode->output, orderA); std::vector matrixBVals(k * n); auto matrixBNode = model.AddNode>(matrixBVals, model::MemoryShape{ k, n }); - auto reorderedMatrixBNode = model.AddNode>(matrixBNode->output, orderB); + auto reorderedMatrixBNode = model.AddNode>(matrixBNode->output, orderB); auto matMatMultNode = model.AddNode>(reorderedInputMatrixNode->output, reorderedMatrixBNode->output, outputLayout); @@ -345,11 +345,11 @@ void TestOptimizeReorderDataNodesTransformation2() model::Model model; auto inputMatrixNode = model.AddNode>(model::MemoryShape{ m, k }); - auto reorderedInputMatrixNode = model.AddNode>(inputMatrixNode->output, orderA); + auto reorderedInputMatrixNode = model.AddNode>(inputMatrixNode->output, orderA); std::vector matrixBVals(k * n); auto matrixBNode = model.AddNode>(matrixBVals, model::MemoryShape{ k, n }); - auto reorderedMatrixBNode = model.AddNode>(matrixBNode->output, orderB); + auto reorderedMatrixBNode = model.AddNode>(matrixBNode->output, orderB); auto matMatMultNode = model.AddNode>(reorderedInputMatrixNode->output, reorderedMatrixBNode->output, outputLayout); @@ -385,11 +385,11 @@ void TestOptimizeReorderDataNodesTransformation3() model::Model model; auto inputMatrixNode = model.AddNode>(model::MemoryShape{ m, k }); - auto reorderedInputMatrixNode = model.AddNode>(inputMatrixNode->output, orderA); + auto reorderedInputMatrixNode = model.AddNode>(inputMatrixNode->output, orderA); std::vector matrixBVals(k * n); auto matrixBNode = model.AddNode>(matrixBVals, model::MemoryShape{ k, n }); - auto reorderedMatrixBNode = model.AddNode>(matrixBNode->output, orderB); + auto reorderedMatrixBNode = model.AddNode>(matrixBNode->output, orderB); auto matMatMultNode = model.AddNode>(reorderedInputMatrixNode->output, reorderedMatrixBNode->output, outputLayout); @@ -430,17 +430,17 @@ void TestOptimizeReorderDataNodesTransformation4() auto rowMajorLayout = model::PortMemoryLayout(model::MemoryShape{ m, k }).ReorderedCopy(rowMajor); auto colMajorLayout = 
model::PortMemoryLayout(model::MemoryShape{ m, k }).ReorderedCopy(colMajor); auto inputMatrixNode = model.AddNode>(rowMajorLayout.GetActiveSize()); - auto reorderedInputMatrixNode1 = model.AddNode>(inputMatrixNode->output, rowMajorLayout, colMajorLayout); - auto reorderedInputMatrixNode2 = model.AddNode>(reorderedInputMatrixNode1->output, colMajorLayout, rowMajorLayout); - auto reorderedInputMatrixNode3 = model.AddNode>(reorderedInputMatrixNode2->output, rowMajorLayout, rowMajorLayout); + auto reorderedInputMatrixNode1 = model.AddNode>(inputMatrixNode->output, rowMajorLayout, colMajorLayout); + auto reorderedInputMatrixNode2 = model.AddNode>(reorderedInputMatrixNode1->output, colMajorLayout, rowMajorLayout); + auto reorderedInputMatrixNode3 = model.AddNode>(reorderedInputMatrixNode2->output, rowMajorLayout, rowMajorLayout); std::vector matrixBVals(k * n); rowMajorLayout = model::PortMemoryLayout(model::MemoryShape{ k, n }).ReorderedCopy(rowMajor); colMajorLayout = model::PortMemoryLayout(model::MemoryShape{ k, n }).ReorderedCopy(colMajor); auto matrixBNode = model.AddNode>(matrixBVals, rowMajorLayout); - auto reorderedMatrixBNode1 = model.AddNode>(matrixBNode->output, rowMajorLayout, rowMajorLayout); - auto reorderedMatrixBNode2 = model.AddNode>(reorderedMatrixBNode1->output, rowMajorLayout, colMajorLayout); - auto reorderedMatrixBNode3 = model.AddNode>(reorderedMatrixBNode2->output, colMajorLayout, colMajorLayout); + auto reorderedMatrixBNode1 = model.AddNode>(matrixBNode->output, rowMajorLayout, rowMajorLayout); + auto reorderedMatrixBNode2 = model.AddNode>(reorderedMatrixBNode1->output, rowMajorLayout, colMajorLayout); + auto reorderedMatrixBNode3 = model.AddNode>(reorderedMatrixBNode2->output, colMajorLayout, colMajorLayout); auto matMatMultNode = model.AddNode>(reorderedInputMatrixNode3->output, reorderedMatrixBNode3->output, outputLayout); diff --git a/libraries/utilities/CMakeLists.txt b/libraries/utilities/CMakeLists.txt index 16e82e6bf..5b3115eff 100644 --- a/libraries/utilities/CMakeLists.txt +++ b/libraries/utilities/CMakeLists.txt @@ -49,6 +49,7 @@ set(include include/CStringParser.h include/Debug.h include/Graph.h + include/EnumFlagHelpers.h include/Exception.h include/Files.h include/Format.h @@ -79,6 +80,7 @@ set(include include/StringUtil.h include/Tokenizer.h include/TransformIterator.h + include/TunableParameters.h include/TupleUtils.h include/TypeAliases.h include/TypeFactory.h @@ -121,6 +123,7 @@ set(test_src test/src/ObjectArchive_test.cpp test/src/PropertyBag_test.cpp test/src/RingBuffer_test.cpp + test/src/TunableParameters_test.cpp test/src/TypeFactory_test.cpp test/src/TypeName_test.cpp test/src/Variant_test.cpp @@ -137,6 +140,7 @@ set(test_include test/include/ObjectArchive_test.h test/include/PropertyBag_test.h test/include/RingBuffer_test.h + test/include/TunableParameters_test.h test/include/TypeFactory_test.h test/include/TypeName_test.h test/include/Variant_test.h diff --git a/libraries/utilities/include/EnumFlagHelpers.h b/libraries/utilities/include/EnumFlagHelpers.h new file mode 100644 index 000000000..b1bcdd6c6 --- /dev/null +++ b/libraries/utilities/include/EnumFlagHelpers.h @@ -0,0 +1,39 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: EnumFlagHelpers.h (utilities) +// Authors: Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#define 
ELL_DEFINE_ENUM_FLAG_OPERATORS(ENUMTYPE) \ + inline ENUMTYPE operator|(ENUMTYPE a, ENUMTYPE b) \ + { \ + return ENUMTYPE(((std::underlying_type_t)a) | ((std::underlying_type_t)b)); \ + } \ + inline ENUMTYPE& operator|=(ENUMTYPE& a, ENUMTYPE b) \ + { \ + return (ENUMTYPE&)(((std::underlying_type_t&)a) |= ((std::underlying_type_t)b)); \ + } \ + inline ENUMTYPE operator&(ENUMTYPE a, ENUMTYPE b) \ + { \ + return ENUMTYPE(((std::underlying_type_t)a) & ((std::underlying_type_t)b)); \ + } \ + inline ENUMTYPE& operator&=(ENUMTYPE& a, ENUMTYPE b) \ + { \ + return (ENUMTYPE&)(((std::underlying_type_t&)a) &= ((std::underlying_type_t)b)); \ + } \ + inline ENUMTYPE operator~(ENUMTYPE a) \ + { \ + return ENUMTYPE(~((std::underlying_type_t)a)); \ + } \ + inline ENUMTYPE operator^(ENUMTYPE a, ENUMTYPE b) \ + { \ + return ENUMTYPE(((std::underlying_type_t)a) ^ ((std::underlying_type_t)b)); \ + } \ + inline ENUMTYPE& operator^=(ENUMTYPE& a, ENUMTYPE b) \ + { \ + return (ENUMTYPE&)(((std::underlying_type_t&)a) ^= ((std::underlying_type_t)b)); \ + } diff --git a/libraries/utilities/include/FunctionUtils.h b/libraries/utilities/include/FunctionUtils.h index d6e0ecadf..c51f6caea 100644 --- a/libraries/utilities/include/FunctionUtils.h +++ b/libraries/utilities/include/FunctionUtils.h @@ -68,6 +68,24 @@ namespace utilities template void ApplyToEach(FunctionType&& function, Arg&& arg, Args&&... args); + namespace detail + { + template + void ApplyToEach(FunctionType&& function, Tuple&& tuple, std::index_sequence) + { + (function(std::get(tuple)), ...); + } + } // namespace detail + + template + void ApplyToEach(FunctionType&& function, std::tuple& tuple) + { + detail::ApplyToEach( + std::forward(function), + std::forward>(tuple), + std::make_index_sequence()); + } + // // FunctionTraits // @@ -75,12 +93,22 @@ namespace utilities /// FunctionTraits: A type-traits-like way to get the return type and argument types of a function /// template - struct FunctionTraits; // undefined base template + struct FunctionTraits : public FunctionTraits { }; // generic base template // Function pointers template struct FunctionTraits { + using Type = ReturnT(Args...); + using ReturnType = ReturnT; + using ArgTypes = std::tuple; + static constexpr size_t NumArgs = typename std::tuple_size(); + }; + + template + struct FunctionTraits + { + using Type = ReturnT(Args...); using ReturnType = ReturnT; using ArgTypes = std::tuple; static constexpr size_t NumArgs = typename std::tuple_size(); @@ -90,23 +118,72 @@ namespace utilities template struct FunctionTraits> { + using Type = ReturnT(Args...); using ReturnType = ReturnT; using ArgTypes = std::tuple; static constexpr size_t NumArgs = typename std::tuple_size(); }; + // const std::function template struct FunctionTraits> { + using Type = ReturnT(Args...); + using ReturnType = ReturnT; + using ArgTypes = std::tuple; + static constexpr size_t NumArgs = typename std::tuple_size(); + }; + + // class member + template + struct FunctionTraits + { + using Type = ReturnT(Args...); + using ReturnType = ReturnT; + using ArgTypes = std::tuple; + static constexpr size_t NumArgs = typename std::tuple_size(); + }; + + template + struct FunctionTraits + { + using Type = ReturnT(Args...); + using ReturnType = ReturnT; + using ArgTypes = std::tuple; + static constexpr size_t NumArgs = typename std::tuple_size(); + }; + + template + struct FunctionTraits + { + using Type = ReturnT(Args...); + using ReturnType = ReturnT; + using ArgTypes = std::tuple; + static constexpr size_t NumArgs = typename 
std::tuple_size(); + }; + + template + struct FunctionTraits + { + using Type = ReturnT(Args...); using ReturnType = ReturnT; using ArgTypes = std::tuple; static constexpr size_t NumArgs = typename std::tuple_size(); }; // Handy type aliases + template + using FunctionType = typename FunctionTraits::Type; + template using FunctionReturnType = typename FunctionTraits::ReturnType; + template + constexpr bool HasReturnValue() + { + return !std::is_same_v>; + } + template using FunctionArgTypes = typename FunctionTraits::ArgTypes; diff --git a/libraries/utilities/include/MemoryLayout.h b/libraries/utilities/include/MemoryLayout.h index 5ad6d640c..ca35301d7 100644 --- a/libraries/utilities/include/MemoryLayout.h +++ b/libraries/utilities/include/MemoryLayout.h @@ -109,6 +109,8 @@ namespace utilities /// Element access operator. int operator[](int index) const; + std::string ToString() const; + private: int& operator[](int index); }; @@ -159,6 +161,8 @@ namespace utilities /// /// The name of this type. static std::string GetTypeName() { return "MemoryShape"; } + + std::string ToString() const; }; /// A vector of numbers representing an index into a multidimensional array. @@ -195,6 +199,8 @@ namespace utilities /// /// The name of this type. static std::string GetTypeName() { return "MemoryCoordinates"; } + + std::string ToString() const; }; /// A class representing layout of a block of data in memory where the block can also @@ -553,6 +559,8 @@ namespace utilities /// a simple one dimensional vector, otherwise throws an exception. MemoryLayout Flatten() const; + std::string ToString() const; + protected: size_t GetDataOffset() const; // offset for physical entry {0,0,0...} void WriteToArchive(utilities::Archiver& archiver) const override; diff --git a/libraries/utilities/include/StringUtil.h b/libraries/utilities/include/StringUtil.h index ad8a8711f..8b06bfe21 100644 --- a/libraries/utilities/include/StringUtil.h +++ b/libraries/utilities/include/StringUtil.h @@ -22,6 +22,20 @@ namespace utilities /// `true` if the substring is contained in the string (according to `std::string::find`) bool Contains(const std::string& s, const std::string& substring); + /// Checks whether a string starts with the specified substring (using case-sensitive comparison) + /// + /// The string to search + /// The substring to search for + /// `true` if the string starts with the substring (according to `std::string::compare`) + bool StartsWith(const std::string& s, const std::string& prefix); + + /// Checks whether a string ends with the specified substring (using case-sensitive comparison) + /// + /// The string to search + /// The substring to search for + /// `true` if the string ends with the substring (according to `std::string::compare`) + bool EndsWith(const std::string& s, const std::string& suffix); + /// Returns copy of std::string with all lowercase characters /// /// The string to convert to lowercase diff --git a/libraries/utilities/include/TunableParameters.h b/libraries/utilities/include/TunableParameters.h new file mode 100644 index 000000000..ba3723b01 --- /dev/null +++ b/libraries/utilities/include/TunableParameters.h @@ -0,0 +1,165 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: TunableParameters.h (utilities) +// Authors: Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "FunctionUtils.h" 
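TunableParameters.h leans on the FunctionUtils additions above: the tuple overload of ApplyToEach and the widened FunctionTraits. A small usage sketch of those utilities (the Add function is illustrative, and the trait signatures are as reconstructed above):

    #include <iostream>
    #include <tuple>
    #include <type_traits>

    int Add(int a, int b) { return a + b; }

    // FunctionTraits exposes the return type and argument tuple of a plain
    // function; HasReturnValue distinguishes void functions.
    static_assert(std::is_same_v<ell::utilities::FunctionReturnType<decltype(Add)>, int>);
    static_assert(std::is_same_v<ell::utilities::FunctionArgTypes<decltype(Add)>, std::tuple<int, int>>);
    static_assert(ell::utilities::HasReturnValue<decltype(Add)>());

    void ApplyToEachSketch()
    {
        std::tuple<int, double> values{ 1, 2.5 };
        // Invokes the lambda once per tuple element, left to right.
        ell::utilities::ApplyToEach([](auto v) { std::cout << v << ' '; }, values);
    }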
+ +#include +#include +#include +#include +#include + +namespace ell +{ +namespace utilities +{ + /// Represents a range of values to iterate over, in conjunction with the `TuningEngine` class below. + /// + /// Construction of parameters should take place before construction of the TuningEngine, which should be passed + /// the set of parameters to iterate over, creating a full set of combinatorial possibilities. The parameters should + /// then be used in place of `T`, resulting in a full exploration of the possibilities. For example, + /// ``` + /// TunableParameter M = std::vector{ 2, 4, 6 }, N = std::vector{ 3, 5 }; + /// TuningEngine engine(M, N); + /// do { + /// std::cout << (int)M * (int)N << " "; + /// } while (engine.Next()); + /// ``` + /// will produce the following output: + /// ``` + /// 6 10 12 20 18 30 + /// ``` + /// The internal states of `TunableParameter` instances are modified when iterated over by `TuningEngine`. To reset the state + /// the `Reset()` function can be called either on an individual instance of `TunableParameter` or on `TuningEngine`, which will + /// call `Reset()` on the set of parameters the engine is operating over. + /// + template + class TunableParameter + { + public: + TunableParameter(std::vector range, const std::string& name) : + _name(name), + _range(std::move(range)), + _current(_range.begin()) + {} + + operator T() const + { + return *_current; + } + + bool Next() + { + return ++_current != _range.end(); + } + + void Reset() + { + _current = _range.begin(); + } + + std::string Name() const + { + return _name; + } + + std::string ValueString() const + { + // Obviously, this will only work if there's an overload for std::to_string that takes T + return std::to_string(*_current); + } + + std::string ToString() const + { + return Name() + ValueString(); + } + + private: + std::string _name; + std::vector _range; + typename std::vector::iterator _current; + }; + + /// Takes an arbitrary number of lvalue references to `TunableParameter` instances and iterates over them in a + /// combinatorial manner. `TunableParameter` instances have their state modified by the iteration of the engine, + /// as explained above in the documentation for `TunableParameter`. + template + class TuningEngine + { + public: + TuningEngine(TunableParameter&... params) : + _params(std::tie(params...)) + { + } + + bool Next() + { + ++_currentIteration; + return NextImpl(std::make_integer_sequence()); + } + + void Reset() + { + ApplyToEach([](auto& param) { param.Reset(); }, _params); + } + + size_t CurrentIteration() const { return _currentIteration; } + + std::string ToString(const std::string& sep = "_") const + { + return ToStringImpl(sep, std::make_index_sequence()); + } + + std::map CurrentValues() const + { + return CurrentValuesImpl(std::make_integer_sequence()); + } + + private: + template + bool NextImpl(std::integer_sequence seq) + { + // Uses fold expressions combined with boolean OR short-circuiting + // to iteratively call Next() on the individual parameters, starting + // with the last one and working our way to the first one. + return ( + [](auto& param) { + auto b = param.Next(); + if (!b) + { + param.Reset(); + } + return b; + }(std::get<(seq.size() - 1) - Is>(_params)) || + ...); + } + + template + std::string ToStringImpl(const std::string& sep, std::index_sequence seq) const + { + return ((std::get(_params).ToString() + sep) + ... 
+ (std::get(_params).ToString())); + } + + template + std::map CurrentValuesImpl(std::integer_sequence seq) const + { + std::map result; + ([&](auto& param) { + result[param.Name()] = param.ValueString(); + }(std::get<(seq.size() - 1) - Is>(_params)), + ...); + return result; + } + + std::tuple&...> _params; + size_t _currentIteration = 0; + }; +} // namespace utilities +} // namespace ell diff --git a/libraries/utilities/include/TypeAliases.h b/libraries/utilities/include/TypeAliases.h index 8e24eb85a..ea8387194 100644 --- a/libraries/utilities/include/TypeAliases.h +++ b/libraries/utilities/include/TypeAliases.h @@ -8,6 +8,7 @@ #pragma once +#include #include namespace ell diff --git a/libraries/utilities/include/TypeTraits.h b/libraries/utilities/include/TypeTraits.h index b66355acf..5bfbe43c7 100644 --- a/libraries/utilities/include/TypeTraits.h +++ b/libraries/utilities/include/TypeTraits.h @@ -26,6 +26,11 @@ namespace utilities {}; } // namespace detail + /// Templated type that always has a value of `false`. + template + struct FalseType : std::false_type + {}; + /// Enabled if ValueType is a boolean. template using IsBoolean = std::enable_if_t, bool>::value, bool>; @@ -161,7 +166,7 @@ namespace utilities { using type = std::remove_cv_t>; }; - } + } // namespace detail // Convenience type alias to remove all references and const/volatile qualifiers // from a type. diff --git a/libraries/utilities/src/Files.cpp b/libraries/utilities/src/Files.cpp index 8dfd5c51e..9631c8075 100644 --- a/libraries/utilities/src/Files.cpp +++ b/libraries/utilities/src/Files.cpp @@ -36,10 +36,15 @@ namespace utilities const auto& path = filepath; #endif // open file + if(!FileExists(filepath)) + { + throw utilities::InputException(InputExceptionErrors::invalidArgument, "file " + filepath + " doesn't exist"); + } + auto stream = std::ifstream(path, mode); // check that it opened - if (!stream.is_open()) + if (!stream) { throw utilities::InputException(InputExceptionErrors::invalidArgument, "error opening file " + filepath); } diff --git a/libraries/utilities/src/MemoryLayout.cpp b/libraries/utilities/src/MemoryLayout.cpp index b11ea5bb3..fc7ca98c5 100644 --- a/libraries/utilities/src/MemoryLayout.cpp +++ b/libraries/utilities/src/MemoryLayout.cpp @@ -13,6 +13,7 @@ #include #include +#include namespace ell { @@ -154,6 +155,13 @@ namespace utilities return true; } + std::string DimensionOrder::ToString() const + { + std::stringstream ss; + ss << *this; + return ss.str(); + } + // // MemoryShape / Coordinates // @@ -177,6 +185,20 @@ namespace utilities } } + std::string MemoryShape::ToString() const + { + std::stringstream ss; + ss << *this; + return ss.str(); + } + + std::string MemoryCoordinates::ToString() const + { + std::stringstream ss; + ss << *this; + return ss.str(); + } + // // MemoryLayout // @@ -582,6 +604,13 @@ namespace utilities } } + std::string MemoryLayout::ToString() const + { + std::stringstream ss; + ss << *this; + return ss.str(); + } + bool Equal(const DimensionVector& shape1, const DimensionVector& shape2) { auto size = shape1.NumDimensions(); @@ -706,7 +735,7 @@ namespace utilities else { throw InputException(InputExceptionErrors::invalidArgument, - "Cannot flatten a discontiguous MemoryLayout."); + "Cannot flatten a discontiguous MemoryLayout."); } } diff --git a/libraries/utilities/src/StringUtil.cpp b/libraries/utilities/src/StringUtil.cpp index c99b51b2c..2b2515beb 100644 --- a/libraries/utilities/src/StringUtil.cpp +++ b/libraries/utilities/src/StringUtil.cpp @@ -23,6 +23,16 @@ 
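Putting the two classes together: TuningEngine walks the cartesian product odometer-style, with the last parameter varying fastest, and ToString()/CurrentValues() label each configuration for logging. A usage sketch modeled on the header's own doc comment (the parameter names, ranges, and include path are assumptions):

    #include <iostream>
    #include <vector>

    #include <utilities/include/TunableParameters.h>

    void SweepSketch()
    {
        using namespace ell::utilities;
        TunableParameter<int> kernelM(std::vector<int>{ 2, 4 }, "kM");
        TunableParameter<int> kernelN(std::vector<int>{ 4, 8 }, "kN");
        TuningEngine engine(kernelM, kernelN);
        do
        {
            // Prints kM2_kN4, kM2_kN8, kM4_kN4, kM4_kN8 -- one per combination
            std::cout << engine.ToString() << "\n";
        } while (engine.Next());
        engine.Reset(); // rewinds every parameter to the start of its range
    }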
namespace utilities
         return s.find(substring) != std::string::npos;
     }
 
+    bool StartsWith(const std::string& s, const std::string& prefix)
+    {
+        return s.size() >= prefix.size() && s.compare(0, prefix.size(), prefix) == 0;
+    }
+
+    bool EndsWith(const std::string& s, const std::string& suffix)
+    {
+        return s.size() >= suffix.size() && s.compare(s.size() - suffix.size(), std::string::npos, suffix) == 0;
+    }
+
     std::string ToLowercase(const std::string& s)
     {
         std::string lower = s;
diff --git a/libraries/utilities/test/include/TunableParameters_test.h b/libraries/utilities/test/include/TunableParameters_test.h
new file mode 100644
index 000000000..573de3bdc
--- /dev/null
+++ b/libraries/utilities/test/include/TunableParameters_test.h
@@ -0,0 +1,15 @@
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Project:  Embedded Learning Library (ELL)
+//  File:     TunableParameters_test.h (utilities)
+//  Authors:  Kern Handa
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+namespace ell
+{
+void TunableParameters_test1();
+void TunableParameters_test2();
+} // namespace ell
diff --git a/libraries/utilities/test/src/TunableParameters_test.cpp b/libraries/utilities/test/src/TunableParameters_test.cpp
new file mode 100644
index 000000000..36d1409f8
--- /dev/null
+++ b/libraries/utilities/test/src/TunableParameters_test.cpp
@@ -0,0 +1,80 @@
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Project:  Embedded Learning Library (ELL)
+//  File:     TunableParameters_test.cpp (utilities)
+//  Authors:  Kern Handa
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "TunableParameters_test.h"
+
+#include <utilities/include/TunableParameters.h>
+
+#include <testing/include/testing.h>
+
+#include <algorithm>
+#include <vector>
+
+namespace ell
+{
+
+using namespace utilities;
+
+void TunableParameters_test1()
+{
+    std::vector expected{ 1, 2, 3, 4 };
+    std::vector<int> actual;
+    TunableParameter p(expected, "expected");
+    TuningEngine engine(p);
+    do
+    {
+        actual.push_back(p);
+    } while (engine.Next());
+
+    testing::ProcessTest("TunableParameters_test1", actual == expected);
+}
+
+void TunableParameters_test2()
+{
+    std::vector p1Values{ 1, 2, 3 };
+    std::vector p2Values{ 4, 5 };
+    std::vector p3Values{ 6, 7, 8 };
+    TunableParameter p1(p1Values, "p1");
+    TunableParameter p2(p2Values, "p2");
+    TunableParameter p3(p3Values, "p3");
+    std::vector expected{
+        std::vector{ 1, 4, 6 },
+        std::vector{ 1, 4, 7 },
+        std::vector{ 1, 4, 8 },
+        std::vector{ 1, 5, 6 },
+        std::vector{ 1, 5, 7 },
+        std::vector{ 1, 5, 8 },
+        std::vector{ 2, 4, 6 },
+        std::vector{ 2, 4, 7 },
+        std::vector{ 2, 4, 8 },
+        std::vector{ 2, 5, 6 },
+        std::vector{ 2, 5, 7 },
+        std::vector{ 2, 5, 8 },
+        std::vector{ 3, 4, 6 },
+        std::vector{ 3, 4, 7 },
+        std::vector{ 3, 4, 8 },
+        std::vector{ 3, 5, 6 },
+        std::vector{ 3, 5, 7 },
+        std::vector{ 3, 5, 8 },
+    };
+    std::vector<std::vector<int>> actual;
+    TuningEngine engine(p1, p2, p3);
+    do
+    {
+        actual.push_back(std::vector<int>{ p1, p2, p3 });
+    } while (engine.Next());
+
+    testing::ProcessTest(
+        "TunableParameters_test2",
+        std::equal(expected.begin(), expected.end(), actual.begin(), actual.end()));
+
+    engine.Reset();
+    testing::ProcessTest("TunableParameters_test2 - Reset", expected[0] == std::vector<int>{ p1, p2, p3 });
+}
+
+} // namespace ell
diff --git a/libraries/utilities/test/src/main.cpp b/libraries/utilities/test/src/main.cpp
index e79c72a44..c176e7766 100644
--- a/libraries/utilities/test/src/main.cpp
+++ b/libraries/utilities/test/src/main.cpp
@@ -16,6 +16,7 @@
 #include "ObjectArchive_test.h"
 #include "PropertyBag_test.h"
 #include "RingBuffer_test.h"
+#include "TunableParameters_test.h"
 #include "TypeFactory_test.h"
 #include "TypeName_test.h"
 #include "Variant_test.h"
@@ -117,6 +118,10 @@ int main(int argc, char* argv[])
         // PropertyBag tests
         TestPropertyBag();
         TestRecursivePropertyBag();
+
+        // TunableParameters
+        TunableParameters_test1();
+        TunableParameters_test2();
     }
     catch (const utilities::Exception& exception)
     {
diff --git a/libraries/value/CMakeLists.txt b/libraries/value/CMakeLists.txt
index f89302aa6..b821536d4 100644
--- a/libraries/value/CMakeLists.txt
+++ b/libraries/value/CMakeLists.txt
@@ -5,31 +5,59 @@
 set(library_name value)
 
 set(src
+    src/Array.cpp
+    src/CachingProvider.cpp
+    src/CachingStrategies.cpp
     src/ComputeContext.cpp
+    src/CppEmitterContext.cpp
     src/Emittable.cpp
     src/EmitterContext.cpp
     src/FunctionDeclaration.cpp
     src/LLVMContext.cpp
+    src/LoopNests.cpp
     src/Matrix.cpp
     src/MatrixOperations.cpp
+    src/Print.cpp
     src/Reference.cpp
     src/Scalar.cpp
+    src/ScalarOperations.cpp
     src/Tensor.cpp
     src/TensorOperations.cpp
     src/Value.cpp
     src/ValueOperations.cpp
     src/Vector.cpp
     src/VectorOperations.cpp
+
+    src/loopnests/CodeGenerator.cpp
+    src/loopnests/CodePositionConstraints.cpp
+    src/loopnests/ForAll.cpp
+    src/loopnests/Index.cpp
+    src/loopnests/IndexRange.cpp
+    src/loopnests/IterationDomain.cpp
+    src/loopnests/Kernel.cpp
+    src/loopnests/KernelPredicate.cpp
+    src/loopnests/LoopNest.cpp
+    src/loopnests/LoopNestPrinter.cpp
+    src/loopnests/LoopNestVisitor.cpp
+    src/loopnests/Range.cpp
+    src/loopnests/SplitIndexRange.cpp
+    src/loopnests/SplitIterationDomain.cpp
 )
 
 set(include
+    include/Array.h
+    include/CachingProvider.h
+    include/CachingStrategies.h
     include/ComputeContext.h
+    include/CppEmitterContext.h
     include/Emittable.h
     include/EmitterContext.h
     include/FunctionDeclaration.h
     include/LLVMContext.h
+    include/LoopNests.h
     include/Matrix.h
     include/MatrixOperations.h
+    include/Print.h
     include/Reference.h
     include/Scalar.h
     include/Tensor.h
@@ -39,6 +67,22 @@ set(include
     include/ValueType.h
     include/Vector.h
     include/VectorOperations.h
+
+    include/loopnests/CodeGenerator.h
+    include/loopnests/CodePositionConstraints.h
+    include/loopnests/ForAll.h
+    include/loopnests/Index.h
+    include/loopnests/IndexRange.h
+    include/loopnests/IterationDomain.h
+    include/loopnests/Kernel.h
+    include/loopnests/KernelPredicate.h
+    include/loopnests/LoopIndexInfo.h
+    include/loopnests/LoopNest.h
+    include/loopnests/LoopNestPrinter.h
+    include/loopnests/LoopNestVisitor.h
+    include/loopnests/Range.h
+    include/loopnests/SplitIndexRange.h
+    include/loopnests/SplitIterationDomain.h
 )
 
 set(doc
@@ -53,7 +97,6 @@
 target_include_directories(${library_name} PRIVATE include ${ELL_LIBRARIES_DIR})
 target_include_directories(${library_name} SYSTEM PUBLIC ${LLVM_INCLUDE_DIRS})
 target_link_libraries(${library_name} PUBLIC ${LLVM_LIBS} emitters utilities)
 target_compile_options(${library_name} PUBLIC ${LLVM_COMPILE_OPTIONS})
-
 set_property(TARGET ${library_name} PROPERTY FOLDER "libraries")
 #
@@ -68,21 +111,33 @@ endif()
 
 set(test_src
     test/src/main.cpp
+    test/src/CachingStrategy_test.cpp
+    test/src/Functions_test.cpp
+    test/src/LoopNest_convolution_test.cpp
+    test/src/LoopNest_kernels.cpp
+    test/src/LoopNest_test.cpp
+    test/src/LoopNestAPI_test.cpp
     test/src/Matrix_test.cpp
     test/src/Scalar_test.cpp
     test/src/Tensor_test.cpp
     test/src/TestUtil.cpp
-    test/src/Vector_test.cpp
     test/src/Value_test.cpp
+ test/src/Vector_test.cpp ) set(test_include + test/include/CachingStrategy_test.h + test/include/Functions_test.h + test/include/LoopNest_convolution_test.h + test/include/LoopNest_kernels.h + test/include/LoopNest_test.h + test/include/LoopNestAPI_test.h test/include/Matrix_test.h test/include/Scalar_test.h test/include/Tensor_test.h test/include/TestUtil.h - test/include/Vector_test.h test/include/Value_test.h + test/include/Vector_test.h ) source_group("src" FILES ${test_src}) diff --git a/libraries/value/README.md b/libraries/value/README.md index addcf9c8b..fa4ab9f29 100644 --- a/libraries/value/README.md +++ b/libraries/value/README.md @@ -27,7 +27,7 @@ littering it with namespace-level qualifications. NB: this may not be necessary due to ADL, depending on what one is trying to do. As this API will be type-erased (more below), usage of this API does not need -to be templated and thus does not need to be in `.h`/`.tcc` files. +to be templated and thus does not need to be in `.h` files. ## Classes * `Value` - top-level type-erased class that will be the basis of all diff --git a/libraries/value/include/Array.h b/libraries/value/include/Array.h new file mode 100644 index 000000000..038b207f7 --- /dev/null +++ b/libraries/value/include/Array.h @@ -0,0 +1,185 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: Array.h (value) +// Authors: Chuck Jacobs +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "EmitterContext.h" + +#include +#include + +#include +#include + +namespace ell +{ +namespace value +{ + + /// Wraps a Value instance and enforces a memory layout that represents a multidimensional array + class Array + { + public: + Array(); + + /// Constructor that wraps the provided instance of Value + /// The Value instance to wrap + Array(Value value, const std::string& name = ""); + + /// Constructs an instance from a 1D std::vector reshaped into the given array shape + /// Any fundamental type accepted by Value + /// The data represented as a std::vector, in canonical row-major layout + /// The shape of the memory + template + Array(const std::vector& data, const utilities::MemoryShape& shape); + + /// Constructs an instance from a 1D std::vector reshaped into the given array shape + /// Any fundamental type accepted by Value + /// The data represented as a std::vector, in canonical row-major layout + /// The layout of the memory + template + Array(const std::vector& data, const utilities::MemoryLayout& layout); + + Array(const Array&); + Array(Array&&) noexcept; + Array& operator=(const Array&); + Array& operator=(Array&&); + ~Array(); + + /// Array element access operator. + /// The Scalar value wrapping the value that is at the specified index within the array + Scalar operator()(const std::vector& indices); + + /// Array element access operator. + /// A copy of the Scalar value that is at the specified index within the array + Scalar operator()(const std::vector& indices) const; + + /// Array element access operator. + /// The Scalar value wrapping the value that is at the specified index within the array + template + Scalar operator()(T... indices); + + /// Array element access operator. + /// A copy of the Scalar value that is at the specified index within the array + template + Scalar operator()(T... 
indices) const; + + /// Gets the underlying wrapped Value instance + Value GetValue() const; + + /// Creates a new Array instance that contains the same data as this instance + /// A new Array instance that points to a new, distinct memory that contains the same data as this instance + Array Copy() const; + + /// Returns the number of active elements + /// The size of the array + size_t Size() const; + + /// Retrieves the type of data stored in the wrapped Value instance + /// The type + ValueType Type() const; + + void SetName(const std::string& name); + std::string GetName() const; + + private: + Value _value; + }; + + /// Creates a for loop over the array + /// The instance of Array that references the data over which to iterate + /// The function to be called for each coordinate where there is an active element + void For(Array array, std::function&)> fn); + + /// Constructs an allocated instance with the specified dimensions + /// Any fundamental type accepted by Value + /// The shape of the memory + template + Array MakeArray(const utilities::MemoryShape& shape) + { + return Array(Allocate(utilities::MemoryLayout(shape))); + } + + /// Constructs an allocated instance with the specified dimensions + /// Any fundamental type accepted by Value + /// The shape of the memory + /// The name of the allocated matrix + template + Array MakeArray(const utilities::MemoryShape& shape, std::string name) + { + auto result = MakeArray(shape); + result.SetName(name); + return result; + } + +} // namespace value +} // namespace ell + +#pragma region implementation + +namespace ell +{ +namespace value +{ + template + Array::Array(const std::vector& data, const utilities::MemoryShape& shape) + { + using namespace utilities; + + int size = static_cast(data.size()); + if (size != shape.NumElements()) + { + throw InputException(InputExceptionErrors::invalidSize); + } + _value = Value(data, MemoryLayout(shape)); + } + + template + Array::Array(const std::vector& data, const utilities::MemoryLayout& layout) + { + using namespace utilities; + + auto size = data.size(); + if (size != layout.GetMemorySize()) + { + throw InputException(InputExceptionErrors::invalidSize); + } + _value = Value(data, layout); + } + + template + Scalar Array::operator()(T... indices) + { + static_assert(utilities::AllSame...>); + if (sizeof...(T) != GetValue().GetLayout().NumDimensions()) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidSize); + } + Value indexedValue = GetContext().Offset(_value, { indices... }); + indexedValue.SetLayout(utilities::ScalarLayout); + + return indexedValue; + } + + template + Scalar Array::operator()(T... indices) const + { + static_assert(utilities::AllSame...>); + if (sizeof...(T) != GetValue().GetLayout().NumDimensions()) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidSize); + } + Value indexedValue = GetContext().Offset(_value, { indices... }); // shouldn't this be "load(offset(...))"? 
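+        // Offset() returns a view of the element at the given coordinates;
+        // tagging it with ScalarLayout below turns it into a single-element
+        // view. Because this overload is const, the result is returned via
+        // Copy() so callers cannot mutate the array through it.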
+ indexedValue.SetLayout(utilities::ScalarLayout); + + return Scalar(indexedValue).Copy(); + } +} // namespace value +} // namespace ell + +#pragma endregion implementation diff --git a/libraries/value/include/CachingProvider.h b/libraries/value/include/CachingProvider.h new file mode 100644 index 000000000..fa3eb95ae --- /dev/null +++ b/libraries/value/include/CachingProvider.h @@ -0,0 +1,191 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: CachingProvider.h (value) +// Authors: Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "EmitterContext.h" + +#include "loopnests/Index.h" + +#include +#include +#include +#include +#include + +namespace ell +{ +namespace value +{ + using loopnests::Index; + + class LoopNest; + + class CachingProvider + { + public: + CachingProvider() = default; + virtual ~CachingProvider() = default; + + void Initialize(ViewAdapter view, utilities::MemoryShape cacheShape, utilities::DimensionOrder order, std::vector kernelIndices, std::vector atIndices, std::any extra); + + void HandleCaching(LoopNest&); + + protected: + Value _value; + utilities::MemoryShape _shape; + utilities::DimensionOrder _order; + std::vector _kernelIndices; + std::vector _atIndices; + std::any _extra; + + private: + virtual void HandleCachingImpl(LoopNest&) = 0; + }; + + namespace + { + class CachingHelper + { + public: + CachingHelper(const CachingHelper&) = delete; + CachingHelper& operator=(const CachingHelper&) = delete; + CachingHelper(CachingHelper&& other) + { + *this = std::move(other); + } + + CachingHelper& operator=(CachingHelper&& other) + { + if (this != &other) + { + std::swap(_value, other._value); + std::swap(_atIndices, other._atIndices); + std::swap(_kernelIndices, other._kernelIndices); + std::swap(_shape, other._shape); + std::swap(_order, other._order); + std::swap(_provider, other._provider); + std::swap(_extra, other._extra); + } + + return *this; + } + + CachingHelper(ViewAdapter view) : + _value(view) + { + if (!_value.IsDefined()) + { + throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState, + "View to be cached is not defined"); + } + } + + CachingHelper Using(std::vector indices) && + { + if (indices.empty()) + { + throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState, + "Indices that specify caching indices cannot be empty"); + } + + _kernelIndices = indices; + if (_atIndices.empty()) + { + _atIndices = _kernelIndices; + } + + return std::move(*this); + } + + CachingHelper At(std::vector indices) && + { + if (indices.empty()) + { + throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState, + "Indices that specify caching kernel location cannot be empty"); + } + + _atIndices = indices; + return std::move(*this); + } + + CachingHelper Size(utilities::MemoryShape shape, utilities::DimensionOrder order) && + { + _shape = shape; + _order = order; + + return std::move(*this); + } + + CachingHelper Size(utilities::MemoryShape shape) && + { + return std::move(*this).Size(shape, utilities::DimensionOrder(shape.NumDimensions())); + } + + template + CachingHelper Extra(Ts&&... 
ts) && + { + _extra = std::make_tuple(std::forward(ts)...); + return std::move(*this); + } + + template + CachingHelper Type(T&&) && + { + if (!_value.IsDefined()) + { + throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState, + "View to be cached is not defined"); + } + if (_kernelIndices.empty()) + { + throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState, + "Indices that specify caching location cannot be empty"); + } + if (!_shape) + { + throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState, + "Memory shape of cached location must be provided"); + } + + auto provider = std::make_unique(); + provider->Initialize(std::move(_value), std::move(*_shape), std::move(*_order), std::move(_kernelIndices), std::move(_atIndices), std::move(_extra)); + _provider = std::move(provider); + return std::move(*this); + } + + operator std::unique_ptr() && + { + return std::move(_provider); + } + + private: + Value _value; + std::vector _atIndices; + std::vector _kernelIndices; + std::optional _shape; + std::optional _order; + std::unique_ptr _provider; + std::any _extra; + }; + } // namespace + + inline CachingHelper CreateCacheFor(ViewAdapter view) + { + return CachingHelper(view); + } + + template + struct CachingStrategyType + { + using ProviderType = CachingProviderType; + }; + +} // namespace value +} // namespace ell diff --git a/libraries/value/include/CachingStrategies.h b/libraries/value/include/CachingStrategies.h new file mode 100644 index 000000000..9e78b742a --- /dev/null +++ b/libraries/value/include/CachingStrategies.h @@ -0,0 +1,63 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: CachingStrategies.h (value) +// Authors: Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "CachingProvider.h" + +namespace ell +{ +namespace value +{ + enum class BoundaryConditionHandling : int + { + ZeroPadding + }; + + using ReduceFunctionType = void(value::Scalar, value::Scalar); + void CopyReduce(value::Scalar, value::Scalar); + void SumReduce(value::Scalar, value::Scalar); + + class CopyInputCopyOutput : public CachingProvider + { + void HandleCachingImpl(LoopNest&) override; + }; + + class CopyInputNoOutput : public CachingProvider + { + void HandleCachingImpl(LoopNest&) override; + }; + + class ZeroInputReduceOutput : public CachingProvider + { + void HandleCachingImpl(LoopNest&) override; + }; + + class BLASTCopy : public CachingProvider + { + public: + void HandleCachingImpl(LoopNest&) override; + + Value _rawCache; + }; + + class GeneralCachingStrategy : public CachingProvider + { + public: + void HandleCachingImpl(LoopNest&) override; + Value _rawCache; + }; + + using SubMatrixCopyInCopyOutCache = CachingStrategyType; + using SubMatrixCopyIn = CachingStrategyType; + using ZeroInputCopyOutMatrixCache = CachingStrategyType; + + using BLASTCopyCache = CachingStrategyType; + +} // namespace value +} // namespace ell diff --git a/libraries/value/include/ComputeContext.h b/libraries/value/include/ComputeContext.h index 22d45a6b9..844cc97c4 100644 --- a/libraries/value/include/ComputeContext.h +++ b/libraries/value/include/ComputeContext.h @@ -12,9 +12,11 @@ #include "FunctionDeclaration.h" #include "Scalar.h" +#include #include #include #include +#include #include #include @@ -34,12 +36,12 @@ namespace value const ConstantData& 
GetConstantData(Value value) const; private: - Value AllocateImpl(ValueType type, MemoryLayout layout) override; + Value AllocateImpl(ValueType type, MemoryLayout layout, size_t alignment, AllocateFlags flags) override; std::optional GetGlobalValue(GlobalAllocationScope scope, std::string name) override; - Value GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ConstantData data, MemoryLayout layout) override; - Value GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ValueType type, MemoryLayout layout) override; + Value GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ConstantData data, MemoryLayout layout, AllocateFlags flags) override; + Value GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ValueType type, MemoryLayout layout, AllocateFlags flags) override; detail::ValueTypeDescription GetTypeImpl(Emittable emittable) override; @@ -48,8 +50,8 @@ namespace value Value StoreConstantDataImpl(ConstantData data) override; - void ForImpl(MemoryLayout layout, std::function)> fn) override; - void ForImpl(Scalar start, Scalar stop, Scalar step, std::function fn) override; + void ForImpl(MemoryLayout layout, std::function)> fn, const std::string& name) override; + void ForImpl(Scalar start, Scalar stop, Scalar step, std::function fn, const std::string& name) override; void MoveDataImpl(Value& source, Value& destination) override; @@ -71,12 +73,15 @@ namespace value IfContext IfImpl(Scalar test, std::function fn) override; + void WhileImpl(Scalar test, std::function fn) override; + std::optional CallImpl(FunctionDeclaration func, std::vector args) override; void PrefetchImpl(Value data, PrefetchType type, PrefetchLocality locality) override; void ParallelizeImpl(int numTasks, std::vector captured, std::function)> fn) override; + void DebugBreakImpl() override; void DebugDumpImpl(Value value, std::string tag, std::ostream& stream) const override; void DebugDumpImpl(FunctionDeclaration fn, std::string tag, std::ostream& stream) const override; void DebugPrintImpl(std::string message) override; @@ -84,6 +89,10 @@ namespace value void SetNameImpl(const Value& value, const std::string& name) override; std::string GetNameImpl(const Value& value) const override; + void ImportCodeFileImpl(std::string) override; + + Scalar GetFunctionAddressImpl(const FunctionDeclaration& fn) override; + Value IntrinsicCall(FunctionDeclaration intrinsic, std::vector args); bool ValidateValue(Value value) const; @@ -102,9 +111,12 @@ namespace value Frame& GetTopFrame(); const Frame& GetTopFrame() const; + friend void swap(ComputeContext&, ComputeContext&) noexcept; + class IfContextImpl; struct FunctionScope; + mutable std::recursive_mutex _mutex; std::stack _stack; std::map> _globals; std::unordered_map _definedFunctions; diff --git a/libraries/value/include/CppEmitterContext.h b/libraries/value/include/CppEmitterContext.h new file mode 100644 index 000000000..f166a2969 --- /dev/null +++ b/libraries/value/include/CppEmitterContext.h @@ -0,0 +1,166 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: CppEmitterContext.h (value) +// Authors: Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "ComputeContext.h" +#include "EmitterContext.h" +#include "FunctionDeclaration.h" + +#include +#include +#include +#include +#include +#include +#include +#include + 
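// ---------------------------------------------------------------------------
// A generic illustration (not ELL code) of the rvalue-qualified builder
// chaining used by CachingHelper in CachingProvider.h earlier in this patch:
// each setter is &&-qualified and returns std::move(*this), so a fluent chain
// works on temporaries and moves state along instead of copying it.
// ---------------------------------------------------------------------------
#include <iostream>
#include <string>
#include <utility>

class Builder
{
public:
    Builder Name(std::string name) &&
    {
        _name = std::move(name);
        return std::move(*this); // only callable on rvalues, i.e. mid-chain
    }

    Builder Size(int size) &&
    {
        _size = size;
        return std::move(*this);
    }

    void Print() const { std::cout << _name << ": " << _size << '\n'; }

private:
    std::string _name;
    int _size = 0;
};

int main()
{
    // Each call consumes the temporary and hands its state to the next link.
    Builder{}.Name("cache").Size(64).Print();
}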
+namespace ell +{ +namespace value +{ + class CppEmitterContext : public EmitterContext + { + public: + CppEmitterContext(std::string moduleName, std::ostream& stream); + CppEmitterContext(std::string modulename, std::unique_ptr stream); + CppEmitterContext(const TargetDevice& target, std::string moduleName, std::ostream& stream); + CppEmitterContext(const TargetDevice& target, std::string modulename, std::unique_ptr stream); + + ~CppEmitterContext(); + + private: + Value AllocateImpl(ValueType, MemoryLayout, size_t /* alignment */, AllocateFlags flags) override; + Value AllocateImpl(detail::ValueTypeDescription, std::optional, std::string, std::optional = std::nullopt, bool = false); + + std::optional GetGlobalValue(GlobalAllocationScope scope, std::string name) override; + Value GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ConstantData data, MemoryLayout layout, AllocateFlags flags) override; + Value GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ValueType type, MemoryLayout layout, AllocateFlags flags) override; + + detail::ValueTypeDescription GetTypeImpl(Emittable) override; + + DefinedFunction CreateFunctionImpl(FunctionDeclaration decl, DefinedFunction fn) override; + bool IsFunctionDefinedImpl(FunctionDeclaration decl) const override; + + Value StoreConstantDataImpl(ConstantData data) override; + + void ForImpl(MemoryLayout layout, std::function)> fn, const std::string& name) override; + void ForImpl(Scalar start, Scalar stop, Scalar step, std::function fn, const std::string& name) override; + + void MoveDataImpl(Value& source, Value& destination) override; + + void CopyDataImpl(const Value& source, Value& destination) override; + + Value ReferenceImpl(Value source) override; + + Value DereferenceImpl(Value source) override; + + Value OffsetImpl(Value source, Value offset) override; + + Value UnaryOperationImpl(ValueUnaryOperation op, Value destination) override; + Value BinaryOperationImpl(ValueBinaryOperation op, Value destination, Value source) override; + + Value LogicalOperationImpl(ValueLogicalOperation op, Value source1, Value source2) override; + + Value CastImpl(Value value, ValueType type) override; + + IfContext IfImpl(Scalar test, std::function fn) override; + + void WhileImpl(Scalar test, std::function fn) override; + + std::optional CallImpl(FunctionDeclaration func, std::vector args) override; + + void PrefetchImpl(Value data, PrefetchType type, PrefetchLocality locality) override; + + void ParallelizeImpl(int numTasks, std::vector captured, std::function)> fn) override; + + void DebugBreakImpl() override; + void DebugDumpImpl(Value value, std::string tag, std::ostream& stream) const override; + void DebugDumpImpl(FunctionDeclaration fn, std::string tag, std::ostream& stream) const override; + void DebugPrintImpl(std::string message) override; + + void SetNameImpl(const Value& value, const std::string& name) override; + std::string GetNameImpl(const Value& value) const override; + + void ImportCodeFileImpl(std::string) override; + + Scalar GetFunctionAddressImpl(const FunctionDeclaration& fn) override; + + Value IntrinsicCall(FunctionDeclaration intrinsic, std::vector args); + std::optional EmitExternalCall(FunctionDeclaration externalFunc, std::vector args); + + void DeclareFunction(FunctionDeclaration decl); + std::ostream& WriteFunctionSignature(std::ostream& stream, FunctionDeclaration decl); + + std::ostream& Out(); + std::ostream& Global(); + std::ostream& FnDecl(); + + std::string 
GetScopeAdjustedName(GlobalAllocationScope scope, std::string name) const; + std::string GetGlobalScopedName(std::string name) const; + std::string GetCurrentFunctionScopedName(std::string name) const; + + Value SimpleNumericIntrinsic(FunctionDeclaration intrinsic, std::vector args); + Value MaxMinIntrinsic(FunctionDeclaration intrinsic, std::vector args); + Value PowIntrinsic(FunctionDeclaration intrinsic, std::vector args); + Value CopySignIntrinsic(FunctionDeclaration intrinsic, std::vector args); + Value FmaIntrinsic(FunctionDeclaration intrinsic, std::vector args); + Value MemFnIntrinsic(FunctionDeclaration intrinsic, std::vector args); + + template + void Indented(Fn&&); + + struct PromotedConstantDataDescription + { + const ConstantData* data; + Emittable realValue; + }; + + Value PromoteConstantData(Value value); + std::optional HasBeenPromoted(Value value) const; + Value Realize(Value value); + Value EnsureEmittable(Value value); + + std::string ScalarToString(ViewAdapter scalar) const; + + struct ValueImpl + { + std::string name; + detail::ValueTypeDescription typeDesc; + }; + + struct FnContext + { + std::forward_list dataList; + std::string name; + }; + + class IfContextImpl; + struct FunctionScope; + + std::unique_ptr _ownedStream; + ComputeContext _computeContext; + + std::stack> _promotedConstantStack; + std::stack _fnStacks; + std::ostringstream _globalStream; + std::ostringstream _fnDeclStream; + std::ostringstream _expressionStream; + std::reference_wrapper _stream; + std::reference_wrapper _outputStream; + std::unordered_map _definedFunctions; + std::map> _globals; + std::forward_list _globalsList; + std::unordered_set _declaredFunctions; + std::string _moduleName; + size_t _indent = 0; + }; + +} // namespace value +} // namespace ell diff --git a/libraries/value/include/EmitterContext.h b/libraries/value/include/EmitterContext.h index 0f1fe1b79..7a1dfbad4 100644 --- a/libraries/value/include/EmitterContext.h +++ b/libraries/value/include/EmitterContext.h @@ -13,13 +13,17 @@ #include "Value.h" #include "ValueType.h" +#include + #include +#include #include #include #include #include #include +#include #include #include #include @@ -31,6 +35,7 @@ namespace ell { namespace value { + using emitters::TargetDevice; namespace detail { @@ -42,6 +47,13 @@ namespace value enum class PrefetchType; enum class PrefetchLocality; + enum class AllocateFlags : uint64_t + { + None = 0, + ThreadLocal = 1 << 0, + }; + ELL_DEFINE_ENUM_FLAG_OPERATORS(AllocateFlags); + /// An interface describing the global context that's used by the Value library /// This class employs the non-virtual interface pattern to provide an easy to use API while /// minimizing the functions needed to be overloaded. 
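// ---------------------------------------------------------------------------
// A minimal illustration (not ELL code) of the non-virtual interface pattern
// described in the doc comment above: the public entry point is non-virtual
// and forwards to a private virtual hook, exactly as EmitterContext::For
// forwards to ForImpl later in this header.
// ---------------------------------------------------------------------------
#include <iostream>

class ContextBase
{
public:
    void For(int n) { ForImpl(n); } // stable public API, never overridden

private:
    virtual void ForImpl(int n) = 0; // the only customization point
};

class PrintingContext : public ContextBase
{
private:
    void ForImpl(int n) override
    {
        for (int i = 0; i < n; ++i)
        {
            std::cout << i << '\n';
        }
    }
};

int main()
{
    PrintingContext ctx;
    ctx.For(3); // prints 0, 1, 2 via the subclass's ForImpl
}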
@@ -73,11 +85,14 @@ namespace value public: IfContext(std::unique_ptr impl); IfContext(const IfContext&) = delete; - IfContext(IfContext&&) = delete; + IfContext(IfContext&&) = default; IfContext&& ElseIf(Scalar, std::function) &&; void Else(std::function) &&; + void ElseIf(Scalar, std::function) &; + void Else(std::function) &; + private: std::unique_ptr _impl; }; @@ -85,32 +100,43 @@ namespace value /// Describes the type that can be used to represent constant C++ data using ConstantData = detail::ConstantData; + EmitterContext(const TargetDevice& target) : + _targetDevice(target) {} + virtual ~EmitterContext(); /// Allocates data with the specified type and size /// The type of the data to allocate /// The size of the allocation, in number of elements - /// An instance of Value that contains a referece to the allocated memory - Value Allocate(ValueType type, size_t size); + /// The byte alignment to use for the allocated value. + /// Any additional flags. Not all contexts may support all flags. + /// An instance of Value that contains a reference to the allocated memory + Value Allocate(ValueType type, size_t size, size_t alignment = 0, AllocateFlags flags = AllocateFlags::None); /// Allocates data with the specified type and size /// The type of the data to allocate /// The memory layout of the allocation, in number of elements - /// An instance of Value that contains a referece to the allocated memory - Value Allocate(ValueType type, MemoryLayout layout); + /// The byte alignment to use for the allocated value. + /// Any additional flags. Not all contexts may support all flags. + /// An instance of Value that contains a reference to the allocated memory + Value Allocate(ValueType type, MemoryLayout layout, size_t alignment = 0, AllocateFlags flags = AllocateFlags::None); /// Allocates function static data /// The name of the variable /// The type of the data /// The layout of the data - Value StaticAllocate(std::string name, ValueType type, utilities::MemoryLayout layout); + /// Any additional flags. Not all contexts may support all flags. + /// An instance of Value that contains a reference to the allocated memory + Value StaticAllocate(std::string name, ValueType type, utilities::MemoryLayout layout, AllocateFlags flags = AllocateFlags::None); /// Allocates function static data /// The name of the variable /// The data /// The layout of the data + /// Any additional flags. Not all contexts may support all flags. + /// An instance of Value that contains a reference to the allocated memory template , void*> = nullptr> - Value StaticAllocate(std::string name, const std::vector& data, std::optional layout = {}) + Value StaticAllocate(std::string name, const std::vector& data, std::optional layout = {}, AllocateFlags flags = AllocateFlags::None) { if (auto globalValue = GetGlobalValue(GlobalAllocationScope::Function, name)) { @@ -118,33 +144,40 @@ namespace value } auto optionalLayout = utilities::MemoryLayout({ static_cast(data.size()) }); - return GlobalAllocateImpl(GlobalAllocationScope::Function, name, data, layout.value_or(optionalLayout)); + return GlobalAllocateImpl(GlobalAllocationScope::Function, name, data, layout.value_or(optionalLayout), flags); } /// Allocates scalar function static data /// The name of the variable /// The data + /// Any additional flags. Not all contexts may support all flags. 
+ /// An instance of Value that contains a reference to the allocated memory template , void*> = nullptr> - Value StaticAllocate(std::string name, T t) + Value StaticAllocate(std::string name, T t, AllocateFlags flags = AllocateFlags::None) { return this ->template StaticAllocate(name, std::vector< - std::conditional_t, utilities::Boolean, T>>{ t }); + std::conditional_t, utilities::Boolean, T>>{ t }, + flags); } /// Allocates global data /// The name of the variable /// The type of the data /// The layout of the data - Value GlobalAllocate(std::string name, ValueType type, utilities::MemoryLayout layout); + /// Any additional flags. Not all contexts may support all flags. + /// An instance of Value that contains a reference to the allocated memory + Value GlobalAllocate(std::string name, ValueType type, utilities::MemoryLayout layout, AllocateFlags flags = AllocateFlags::None); /// Allocates global data /// The name of the variable /// The data /// The layout of the data + /// Any additional flags. Not all contexts may support all flags. + /// An instance of Value that contains a reference to the allocated memory template , void*> = nullptr> - Value GlobalAllocate(std::string name, const std::vector& data, std::optional layout = {}) + Value GlobalAllocate(std::string name, const std::vector& data, std::optional layout = {}, AllocateFlags flags = AllocateFlags::None) { if (auto globalValue = GetGlobalValue(GlobalAllocationScope::Global, name)) { @@ -152,19 +185,22 @@ namespace value } auto optionalLayout = utilities::MemoryLayout({ static_cast(data.size()) }); - return GlobalAllocateImpl(GlobalAllocationScope::Global, name, data, layout.value_or(optionalLayout)); + return GlobalAllocateImpl(GlobalAllocationScope::Global, name, data, layout.value_or(optionalLayout), flags); } /// Allocates scalar global data /// The name of the variable /// The data + /// Any additional flags. Not all contexts may support all flags. + /// An instance of Value that contains a reference to the allocated memory template , void*> = nullptr> - Value GlobalAllocate(std::string name, T t) + Value GlobalAllocate(std::string name, T t, AllocateFlags flags = AllocateFlags::None) { return this ->template GlobalAllocate(name, std::vector< - std::conditional_t, utilities::Boolean, T>>{ t }); + std::conditional_t, utilities::Boolean, T>>{ t }, + flags); } /// Gets the type information contained in an instance of Emittable @@ -184,20 +220,22 @@ namespace value /// Stores data known ahead of time in the form of a std::vector of one of the fundamental types /// The data that is to be stored by the context instance - /// An instance of Value that contains a referece to the allocated memory + /// An instance of Value that contains a reference to the allocated memory Value StoreConstantData(ConstantData data); /// Creates a for loop over the memory pointed to with the given layout /// The layout used to describe the iteration characteristics. Only active elements are iterated over. 
/// The function to be called for each coordinate where there is an active element - void For(MemoryLayout layout, std::function)> fn); + /// Optional, a name that can be used by the emitter context to tag this loop in the emitted code + void For(MemoryLayout layout, std::function)> fn, const std::string& name = ""); /// Creates a for loop beggining at `start`, ending at `stop`, and incrementing by `step` /// The value used to initialize the loop counter /// The terminal value of the loop /// The value by which the loop counter is incremented /// The function to be called for each coordinate where there is an active element - void For(Scalar start, Scalar stop, Scalar step, std::function fn); + /// Optional, a name that can be used by the emitter context to tag this loop in the emitted code + void For(Scalar start, Scalar stop, Scalar step, std::function fn, const std::string& name = ""); /// Moves the data from one location to another /// The source of the memory to be moved @@ -247,7 +285,9 @@ namespace value IfContext If(Scalar test, std::function fn); - std::optional Call(FunctionDeclaration func, std::vector args); + void While(Scalar test, std::function fn); + + std::optional Call(FunctionDeclaration func, std::vector args); void Prefetch(Value data, PrefetchType type, PrefetchLocality locality); @@ -258,9 +298,15 @@ namespace value /// will be filled in with the values provided within the `captured` parameter. void Parallelize(int numTasks, std::vector captured, std::function)> fn); + void DebugBreak(); void DebugDump(Value value, std::string tag, std::ostream* stream) const; void DebugDump(FunctionDeclaration fn, std::string tag, std::ostream* stream) const; + /// Returns a unique name based on the prefix provided + /// The prefix for the unique name desired + /// A unique name for this instance + std::string UniqueName(const std::string& prefix); + /// Emit a debug print message. This assumes the application /// on the target platform implements a "void DebugPrint(char* message)" function. This function will be /// defined for you when running in JIT or Compute mode. 
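// ---------------------------------------------------------------------------
// A plausible implementation sketch for the UniqueName member declared above
// (the real body lives in a source file this patch does not show): keep a
// per-prefix counter and append it, so repeated requests for "loop" yield
// "loop_0", "loop_1", and so on.
// ---------------------------------------------------------------------------
#include <map>
#include <string>

class NameScope
{
public:
    std::string UniqueName(const std::string& prefix)
    {
        // operator[] value-initializes the counter to 0 on first use
        int count = _uniqueNames[prefix]++;
        return prefix + "_" + std::to_string(count);
    }

private:
    std::map<std::string, int> _uniqueNames;
};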
@@ -276,15 +322,27 @@ namespace value /// The Value instance std::string GetName(const Value& value) const; + void ImportCodeFile(std::string filename); + + Scalar GetFunctionAddress(const FunctionDeclaration& fn); + + const TargetDevice& GetTargetDevice() const { return _targetDevice; } + protected: const std::vector>& GetIntrinsics() const; + std::vector NormalizeReferenceLevels(const std::vector& args, const std::vector& expected) const; + + std::optional GetGlobalValue(GlobalAllocationScope, std::string, MemoryLayout); + + std::map _uniqueNames; + private: - virtual Value AllocateImpl(ValueType, MemoryLayout) = 0; + virtual Value AllocateImpl(ValueType, MemoryLayout, size_t alignment, AllocateFlags flags) = 0; virtual std::optional GetGlobalValue(GlobalAllocationScope scope, std::string name) = 0; - virtual Value GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ConstantData data, MemoryLayout layout) = 0; - virtual Value GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ValueType type, MemoryLayout layout) = 0; + virtual Value GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ConstantData data, MemoryLayout layout, AllocateFlags flags) = 0; + virtual Value GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ValueType type, MemoryLayout layout, AllocateFlags flags) = 0; virtual detail::ValueTypeDescription GetTypeImpl(Emittable) = 0; @@ -293,8 +351,8 @@ namespace value virtual Value StoreConstantDataImpl(ConstantData data) = 0; - virtual void ForImpl(MemoryLayout layout, std::function)> fn) = 0; - virtual void ForImpl(Scalar start, Scalar stop, Scalar step, std::function fn) = 0; + virtual void ForImpl(MemoryLayout layout, std::function)> fn, const std::string& name) = 0; + virtual void ForImpl(Scalar start, Scalar stop, Scalar step, std::function fn, const std::string& name) = 0; virtual void MoveDataImpl(Value& source, Value& destination) = 0; @@ -314,12 +372,16 @@ namespace value virtual IfContext IfImpl(Scalar test, std::function fn) = 0; + virtual void WhileImpl(Scalar test, std::function fn) = 0; + virtual std::optional CallImpl(FunctionDeclaration func, std::vector args) = 0; virtual void PrefetchImpl(Value data, PrefetchType type, PrefetchLocality locality) = 0; virtual void ParallelizeImpl(int numTasks, std::vector captured, std::function)> fn) = 0; + virtual void DebugBreakImpl() = 0; + virtual void DebugDumpImpl(Value value, std::string tag, std::ostream& stream) const = 0; virtual void DebugDumpImpl(FunctionDeclaration fn, std::string tag, std::ostream& stream) const = 0; @@ -327,6 +389,14 @@ namespace value virtual void SetNameImpl(const Value& value, const std::string& name) = 0; virtual std::string GetNameImpl(const Value& value) const = 0; + + virtual void ImportCodeFileImpl(std::string filename) = 0; + + virtual Scalar GetFunctionAddressImpl(const FunctionDeclaration& fn) = 0; + + friend void swap(EmitterContext&, EmitterContext&) noexcept; + + TargetDevice _targetDevice; }; /// Returns the global instance of EmitterContext @@ -389,8 +459,8 @@ namespace value EmitterContext* _oldContext; }; - template - struct ContextGuard : private ContextGuard<> + template + struct ContextGuard : private ContextGuard<> { template ContextGuard(Args&&... 
args) : @@ -404,75 +474,81 @@ namespace value T _context; }; + inline const TargetDevice& GetContextTargetDevice() + { + return GetContext().GetTargetDevice(); + } + /// Allocates data with the specified type and size /// The type of the data to allocate /// The size of the allocation, in number of elements - /// An instance of Value that contains a referece to the allocated memory - Value Allocate(ValueType type, size_t size); + /// An instance of Value that contains a reference to the allocated memory + Value Allocate(ValueType type, size_t size, size_t alignment = 0, AllocateFlags flags = AllocateFlags::None); /// Allocates data with the specified type and size /// The type of the data to allocate /// The memory layout of the allocation, in number of elements - /// An instance of Value that contains a referece to the allocated memory - Value Allocate(ValueType type, utilities::MemoryLayout layout); + /// An instance of Value that contains a reference to the allocated memory + Value Allocate(ValueType type, utilities::MemoryLayout layout, size_t alignment = 0, AllocateFlags flags = AllocateFlags::None); /// Allocates data with the specified type and size /// The type of the data to allocate /// The size of the allocation, in number of elements - /// An instance of Value that contains a referece to the allocated memory + /// An instance of Value that contains a reference to the allocated memory template - Value Allocate(size_t size) + Value Allocate(size_t size, size_t alignment = 0, AllocateFlags flags = AllocateFlags::None) { - return Allocate(GetValueType(), size); + return Allocate(GetValueType(), size, alignment, flags); } /// Allocates data with the specified type and size /// The type of the data to allocate /// The memory layout of the allocation, in number of elements - /// An instance of Value that contains a referece to the allocated memory + /// An instance of Value that contains a reference to the allocated memory template - Value Allocate(utilities::MemoryLayout layout) + Value Allocate(utilities::MemoryLayout layout, size_t alignment = 0, AllocateFlags flags = AllocateFlags::None) { - return Allocate(GetValueType(), layout); + return Allocate(GetValueType(), layout, alignment, flags); } /// Allocates function static data /// The name of the variable /// The type of the data /// The layout of the data - Value StaticAllocate(std::string name, ValueType type, utilities::MemoryLayout layout); + Value StaticAllocate(std::string name, ValueType type, utilities::MemoryLayout layout, AllocateFlags flags = AllocateFlags::None); /// Allocates function static data /// The name of the variable /// The data /// The layout of the data template , void*> = nullptr> - Value StaticAllocate(std::string name, const std::vector& data, std::optional layout = {}) + Value StaticAllocate(std::string name, const std::vector& data, std::optional layout = {}, AllocateFlags flags = AllocateFlags::None) { - return GetContext().StaticAllocate(name, data, layout); + return GetContext().StaticAllocate(name, data, layout, flags); } /// Allocates scalar function static data /// The name of the variable /// The data template , void*> = nullptr> - Scalar StaticAllocate(std::string name, T t) + Scalar StaticAllocate(std::string name, T t, AllocateFlags flags = AllocateFlags::None) { return StaticAllocate(name, std::vector, utilities::Boolean, T>>{ t }, - utilities::ScalarLayout); + utilities::ScalarLayout, + flags); } /// Allocates global data /// The name of the variable /// The type of the data /// The layout 
of the data - Value GlobalAllocate(std::string name, ValueType type, utilities::MemoryLayout layout); + Value GlobalAllocate(std::string name, ValueType type, utilities::MemoryLayout layout, AllocateFlags flags = AllocateFlags::None); template , void*> = nullptr> - Value GlobalAllocate(std::string name, utilities::MemoryLayout layout) + Value GlobalAllocate(std::string name, utilities::MemoryLayout layout, AllocateFlags flags = AllocateFlags::None) { - return GlobalAllocate(name, GetValueType(), layout); + return GlobalAllocate(name, GetValueType(), layout, flags); } /// Allocates global data @@ -480,22 +556,24 @@ namespace value /// The data /// The layout of the data template , void*> = nullptr> - Value GlobalAllocate(std::string name, const std::vector& data, std::optional layout = {}) + Value GlobalAllocate(std::string name, const std::vector& data, std::optional layout = {}, AllocateFlags flags = AllocateFlags::None) { - return GetContext().GlobalAllocate(name, data, layout); + return GetContext().GlobalAllocate(name, data, layout, flags); } /// Allocates scalar global data /// The name of the variable /// The data template , void*> = nullptr> - Scalar GlobalAllocate(std::string name, T t) + Scalar GlobalAllocate(std::string name, T t, AllocateFlags flags = AllocateFlags::None) { return GlobalAllocate(name, std::vector, utilities::Boolean, T>>{ t }, - utilities::ScalarLayout); + utilities::ScalarLayout, + flags); } + void DebugBreak(); void DebugDump(FunctionDeclaration fn, std::string tag = "", std::ostream* stream = nullptr); void DebugDump(Value value, std::string tag = "", std::ostream* stream = nullptr); @@ -505,8 +583,18 @@ namespace value return DebugDump(value.GetValue(), tag, stream); } + /// Emit a debug print message. This assumes the application + /// on the target platform implements a "void DebugPrint(char* message)" function. This function will be + /// defined for you when running in JIT or Compute mode. + void DebugPrint(std::string message); + EmitterContext::IfContext If(Scalar test, std::function fn); + void While(Scalar test, std::function fn); + + void ForRange(const std::string& name, Scalar end, std::function fn); + void ForRange(const std::string& name, Scalar start, Scalar end, std::function fn); + void ForRange(const std::string& name, Scalar start, Scalar end, Scalar step, std::function fn); void ForRange(Scalar end, std::function fn); void ForRange(Scalar start, Scalar end, std::function fn); void ForRange(Scalar start, Scalar end, Scalar step, std::function fn); @@ -516,10 +604,11 @@ namespace value /// function called `GetValue()` which returns a `Value` instance. /// The number of tasks that should be created /// A list of values to be used inside the function - /// The function that gets run for each task. The first parameter is the task number for that particular call. Subsequent parameters must match the typelist - /// `Tys...` and will be filled in with the values provided within the `captured` parameter. - template - void Parallelize(int numTasks, std::tuple captured, std::function fn); + /// The function that gets run for each task. The first parameter is the task number for that particular call. + /// Subsequent parameters must match the typelist `Tys...` and will be filled in with the values provided within the `captured` + /// parameter. In other words, the signature for the function should be `void fn(Scalar, Tys...)`. 
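// ---------------------------------------------------------------------------
// Hypothetical usage of the named ForRange overloads added above (sketch only:
// assumes an active EmitterContext, e.g. a ComputeContext, is installed, and
// that Scalar converts from integer literals as it does elsewhere in this
// library).
// ---------------------------------------------------------------------------
using namespace ell::value;

void LoopExample()
{
    // The "i" tag lets the emitter context name this loop in the emitted code.
    ForRange("i", 0, 10, [](Scalar i) {
        // loop body: emitted (or executed, in Compute mode) once per i in [0, 10)
    });
}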
+ template + void Parallelize(int numTasks, std::tuple captured, Fn&& fn); /// Runs the provided function, in parallel if possible /// The number of tasks that should be created @@ -544,6 +633,10 @@ namespace value extern FunctionDeclaration RoundFunctionDeclaration; extern FunctionDeclaration FloorFunctionDeclaration; extern FunctionDeclaration CeilFunctionDeclaration; + extern FunctionDeclaration FmaFunctionDeclaration; + extern FunctionDeclaration MemCopyFunctionDeclaration; + extern FunctionDeclaration MemMoveFunctionDeclaration; + extern FunctionDeclaration MemSetFunctionDeclaration; Scalar Abs(Scalar s); Scalar Cos(Scalar s); @@ -564,6 +657,7 @@ namespace value Scalar Sign(Scalar s); Scalar Square(Scalar s); Scalar LogicalNot(Scalar v); + Scalar Fma(Scalar a, Scalar b, Scalar c); Vector Abs(Vector v); Vector Cos(Vector v); @@ -581,6 +675,11 @@ namespace value Vector Floor(Vector v); Vector Ceil(Vector v); + void MemCopy(ViewAdapter dest, ViewAdapter source, std::optional length = std::nullopt); + void MemMove(ViewAdapter dest, ViewAdapter source, std::optional length = std::nullopt); + void MemSet(ViewAdapter dest, Scalar data, std::optional length = std::nullopt); + void ZeroMemory(ViewAdapter dest, std::optional length = std::nullopt); + /// Specifier determining if the fetch should be for a read or a write enum class PrefetchType { @@ -605,6 +704,11 @@ namespace value template void Prefetch(ViewType view, PrefetchType type = PrefetchType::Read, PrefetchLocality locality = PrefetchLocality::None); + /// Returns a unique name based on the prefix provided + /// The prefix for the unique name desired + /// A unique name for the current EmitterContext instance + std::string UniqueName(const std::string& prefix); + /// Returns the passed in View type with the memory layout representative of the full view of the memory, i.e., no padding. 
template ViewType AsFullView(ViewType view); @@ -643,8 +747,8 @@ namespace value } } - template - void Parallelize(int numTasks, std::tuple captured, std::function fn) + template + void Parallelize(int numTasks, std::tuple captured, Fn&& fn) { auto capturedValues = utilities::TupleToVector([](auto view) { return detail::GetValue(view); }, captured); @@ -677,7 +781,7 @@ namespace value template ViewType AsFullView(ViewType view) { - auto value = GetValue(view); + auto value = detail::GetValue(view); value.SetLayout(utilities::MemoryLayout{ value.GetLayout().GetExtent() }); return value; } diff --git a/libraries/value/include/FunctionDeclaration.h b/libraries/value/include/FunctionDeclaration.h index f1a29e7f4..d53a86fed 100644 --- a/libraries/value/include/FunctionDeclaration.h +++ b/libraries/value/include/FunctionDeclaration.h @@ -9,6 +9,7 @@ #pragma once #include "EmitterContext.h" +#include "Scalar.h" #include "Value.h" #include @@ -23,11 +24,12 @@ namespace ell { namespace value { - /// Helper enum used to specify whether a FunctionDeclaration object should decorate its function name - enum class FunctionDecorated + /// Helper enum used to specify whether a FunctionDeclaration should be inlined + enum class FunctionInlining { - Yes, - No + defaultInline, + always, + never }; /// Describes a function that can be acted upon by an EmitterContext instance @@ -39,49 +41,60 @@ namespace value /// Constructor /// The name of the function - FunctionDeclaration(std::string name); + explicit FunctionDeclaration(std::string name); /// Sets the return type for this function declaration /// A Value instance describing type of the value that is expected and its memory layout to be returned by the function /// A reference to this instance /// If this function is not called, the instance defaults to a void return type - FunctionDeclaration& Returns(Value returnType); + FunctionDeclaration& Returns(ViewAdapter returnType); /// Sets whether this function should be decorated (mangled) /// An enum value specifying whether this function should be decorated /// A reference to this instance /// By default, a function is decorated, which means the name gets suffixed by an encoding of the function's parameter and return types. /// Functions that are declared externally should probably not be decorated - FunctionDeclaration& Decorated(FunctionDecorated shouldDecorate); + FunctionDeclaration& Decorated(bool shouldDecorate); + + /// If `public` is true, set the function to appear in the public header, otherwise the function is internal + FunctionDeclaration& Public(bool isPublic); + + /// Sets whether this function should be inlined + /// A FunctionInlining value specifying whether this function should be inlined or not + FunctionDeclaration& Inlined(FunctionInlining shouldInline = FunctionInlining::always); /// Specifies a function definition for this declaration /// A function object that takes zero or more Value library observer types and returns void or a Value library observer type. /// This function object defines this function. /// A std::function function object that matches the signature of the function passed in - /// If this function is not called, this function declaration is treated as an external function. Not all contexts may support an external + /// If this function or `Imported` is not called, this function declaration is treated as an external function. 
Not all contexts may support an external /// function template [[maybe_unused]] auto Define(Fn&& fn); + /// Specifies the code file that is to be imported to define this function + /// The specified file is imported when this function declaration is used to emit a call. + FunctionDeclaration& DefineFromFile(std::string file); + /// Sets the parameters this function requires /// Zero or more Value instances or view types with a GetValue() member function describing /// the types of the arguments and their memory layout expected by the function /// A reference to this instance /// If this function is not called, the instance defaults to taking no arguments template - FunctionDeclaration& Parameters(Types&& ... paramTypes); + FunctionDeclaration& Parameters(Types&&... paramTypes); /// Sets the parameters this function requires /// Zero or more Value instances describing the types of the arguments and their memory layout expected by the function /// A reference to this instance /// If this function is not called, the instance defaults to taking no arguments - [[nodiscard]] FunctionDeclaration& Parameters(std::vector parameters); + [[nodiscard]] FunctionDeclaration& Parameters(std::vector parameters); /// Emits a call to the function declaration /// A vector of Value instances that hold the arguments for the function call /// A std::optional instance that holds a Value instance with the return value of the call, if it is expected, otherwise empty /// If the function is not defined and the context is capable of it, this will emit a call to an external function - [[maybe_unused]] std::optional Call(std::vector arguments) const; + [[maybe_unused]] std::optional Call(std::vector arguments) const; /// Emits a call to the function declaration /// Zero or more Value instances or view types with a GetValue() member function describing @@ -101,23 +114,49 @@ namespace value /// Otherwise, the std::optional instance is empty const std::optional& GetReturnType() const; + /// Returns true if function is to appear in the public header, false otherwise + [[nodiscard]] bool IsPublic() const; + /// Returns true if function is defined for current context, false otherwise [[nodiscard]] bool IsDefined() const; /// Returns true if the instance is an empty function declaration [[nodiscard]] bool IsEmpty() const; + /// Returns true if the instance represents an imported function + [[nodiscard]] bool IsImported() const; + + /// Returns true if the instance is inlined + [[nodiscard]] FunctionInlining InlineState() const; + + Scalar GetPointer() const; + + void SetPointer(Scalar pointer) { _pointer = pointer; } + + bool IsPointerSet() const { return _pointer.has_value(); } + private: template [[maybe_unused]] std::function DefineImpl(std::function fn); + template + [[maybe_unused]] std::function DefineImpl(std::false_type, std::function fn); + + template + [[maybe_unused]] inline std::function DefineImpl(std::true_type, std::function); + void CheckNonEmpty() const; + std::string _importedSource; std::string _originalFunctionName; mutable std::optional _decoratedFunctionName; std::optional _returnType; std::vector _paramTypes; + std::optional _pointer; + + FunctionInlining _inlineState = FunctionInlining::defaultInline; bool _isDecorated = true; + bool _isPublic = false; bool _isEmpty = true; }; @@ -153,16 +192,6 @@ namespace value { // Until MacOS's compiler has proper std::function deduction guides #if defined(__APPLE__) - template - struct Function : public std::function - { - Function(const std::function& fn) : - 
std::function(fn) {} - Function(std::function&& fn) : - std::function(std::move(fn)) {} - using std::function::function; - }; - template struct StdFunctionDeductionGuideHelper {}; @@ -191,6 +220,16 @@ namespace value using Type = ReturnT(Args...); }; + template + struct Function : public std::function + { + Function(const std::function& fn) : + std::function(fn) {} + Function(std::function&& fn) : + std::function(std::move(fn)) {} + using std::function::function; + }; + // Function pointer template Function(ReturnT (*)(Args...))->Function; @@ -212,48 +251,75 @@ namespace value template [[maybe_unused]] auto FunctionDeclaration::Define(Fn&& fn) { - return DefineImpl(FUNCTION_TYPE{ std::forward(fn) }); + return DefineImpl(FUNCTION_TYPE(std::forward(fn))); } - #undef FUNCTION_TYPE - template - FunctionDeclaration& FunctionDeclaration::Parameters(Types&&... paramTypes) + template + inline std::function FunctionDeclaration::DefineImpl(std::function fn) { - return Parameters(std::vector{ detail::GetValue(std::forward(paramTypes))... }); + if constexpr (sizeof...(Args) == 1 && utilities::AllSame..., std::vector>) + { + return DefineImpl(std::true_type{}, fn); + } + else + { + return DefineImpl(std::false_type{}, fn); + } } - template - std::optional FunctionDeclaration::Call(Types&&... arguments) const + template + inline std::function FunctionDeclaration::DefineImpl(std::true_type, std::function fn) { - return Call(std::vector{ detail::GetValue(std::forward(arguments))... }); + auto createdFn = GetContext().CreateFunction(*this, [fn = std::move(fn)](std::vector args) -> std::optional { + if constexpr (std::is_same_v) + { + fn(args); + return std::nullopt; + } + else + { + return fn(args); + } + }); + + return [createdFn = std::move(createdFn)](Args... args) -> ReturnT { + if constexpr (std::is_same_v) + { + createdFn(args...); + } + else + { + return *createdFn(args...); + } + }; } template - [[maybe_unused]] std::function FunctionDeclaration::DefineImpl(std::function fn) + [[maybe_unused]] std::function FunctionDeclaration::DefineImpl(std::false_type, std::function fn) { if constexpr (std::is_same_v) { if (_returnType.has_value()) { - throw utilities::InputException(utilities::InputExceptionErrors::typeMismatch, "Defining function has a return value, but declaration does not"); + throw utilities::InputException(utilities::InputExceptionErrors::typeMismatch, utilities::FormatString("[%s] Defining function has a return value, but declaration does not", GetFunctionName().c_str())); } } else { if (!_returnType.has_value()) { - throw utilities::InputException(utilities::InputExceptionErrors::typeMismatch, "Defining function returns void, but declaration does not"); + throw utilities::InputException(utilities::InputExceptionErrors::typeMismatch, utilities::FormatString("[%s] Defining function returns void, but declaration does not", GetFunctionName().c_str())); } - // Try to instantiate an instance of the return type (R) with the Value instance that reprepsents the return type (_returnType) + // Try to instantiate an instance of the return type (R) with the Value instance that represents the return type (_returnType) // If this throws, the return value of the defining function is not compatible with the Value instance specified in the declaration ReturnT returnType = *_returnType; } if (sizeof...(Args) != _paramTypes.size()) { - throw utilities::InputException(utilities::InputExceptionErrors::typeMismatch, utilities::FormatString("Defining function takes %zu parameters, but declaration was specific 
to have %zu.", sizeof...(Args), _paramTypes.size())); + throw utilities::InputException(utilities::InputExceptionErrors::typeMismatch, utilities::FormatString("[%s] Defining function takes %zu parameters, but declaration was specified to have %zu.", GetFunctionName().c_str(), sizeof...(Args), _paramTypes.size())); } if constexpr (sizeof...(Args) > 0) @@ -277,7 +343,7 @@ namespace value } }); - return [createdFn = std::move(createdFn)](Args&&... args) -> ReturnT { + return [createdFn = std::move(createdFn), name = GetFunctionName()](Args&&... args) -> ReturnT { constexpr auto argSize = sizeof...(Args); std::vector argValues; argValues.reserve(argSize); @@ -289,7 +355,7 @@ namespace value if (fnReturn) { throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState, - "Function is supposed to return void, but a value was returned from the defining function"); + utilities::FormatString("[%s] Function is supposed to return void, but a value was returned from the defining function", name.c_str())); } } else @@ -298,6 +364,19 @@ namespace value } }; } + + template + FunctionDeclaration& FunctionDeclaration::Parameters(Types&&... paramTypes) + { + return Parameters(std::vector{ std::forward(paramTypes)... }); + } + + template + std::optional FunctionDeclaration::Call(Types&&... arguments) const + { + return Call(std::vector{ std::forward(arguments)... }); + } + } // namespace value } // namespace ell diff --git a/libraries/value/include/LLVMContext.h b/libraries/value/include/LLVMContext.h index 2b90bf1d0..b3578abe1 100644 --- a/libraries/value/include/LLVMContext.h +++ b/libraries/value/include/LLVMContext.h @@ -13,18 +13,18 @@ #include "FunctionDeclaration.h" #include "Scalar.h" +#include +#include + #include -#include +#include #include namespace ell { namespace emitters { - class IRFunctionEmitter; - class IRModuleEmitter; - } // namespace emitters } // namespace ell @@ -38,19 +38,37 @@ namespace value public: /// Constructor /// A reference to an IRModuleEmitter that will be used to emit LLVM IR - LLVMContext(emitters::IRModuleEmitter& emitter); + explicit LLVMContext(emitters::IRModuleEmitter& emitter); + + /// Constructor + /// Takes ownership of the IRModuleEmitter that will be used to emit LLVM IR + explicit LLVMContext(std::unique_ptr&& emitter); + + /// Constructor + /// + /// Name of the module.
+ /// Options for the compiler + LLVMContext(const std::string& moduleName, const emitters::CompilerOptions& parameters); emitters::IRModuleEmitter& GetModuleEmitter() const; emitters::IRFunctionEmitter& GetFunctionEmitter() const; + emitters::LLVMFunction DeclareFunction(const FunctionDeclaration& func); + + std::optional ToLLVMValue(Value value) const; + std::vector> ToLLVMValue(std::vector values) const; + + emitters::LLVMValue ToLLVMValue(Value value); + std::vector ToLLVMValue(std::vector values); + private: - Value AllocateImpl(ValueType value, MemoryLayout layout) override; + Value AllocateImpl(ValueType value, MemoryLayout layout, size_t alignment, AllocateFlags flags = AllocateFlags::None) override; std::optional GetGlobalValue(GlobalAllocationScope scope, std::string name) override; - Value GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ConstantData data, MemoryLayout layout) override; - Value GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ValueType type, MemoryLayout layout) override; + Value GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ConstantData data, MemoryLayout layout, AllocateFlags flags = AllocateFlags::None) override; + Value GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ValueType type, MemoryLayout layout, AllocateFlags flags = AllocateFlags::None) override; detail::ValueTypeDescription GetTypeImpl(Emittable emittable) override; @@ -59,8 +77,8 @@ namespace value Value StoreConstantDataImpl(ConstantData data) override; - void ForImpl(MemoryLayout layout, std::function)> fn) override; - void ForImpl(Scalar start, Scalar stop, Scalar step, std::function fn) override; + void ForImpl(MemoryLayout layout, std::function)> fn, const std::string& name) override; + void ForImpl(Scalar start, Scalar stop, Scalar step, std::function fn, const std::string& name) override; void MoveDataImpl(Value& source, Value& destination) override; @@ -81,12 +99,15 @@ namespace value IfContext IfImpl(Scalar test, std::function fn) override; + void WhileImpl(Scalar test, std::function fn) override; + std::optional CallImpl(FunctionDeclaration func, std::vector args) override; void PrefetchImpl(Value data, PrefetchType type, PrefetchLocality locality) override; void ParallelizeImpl(int numTasks, std::vector captured, std::function)> fn) override; + void DebugBreakImpl() override; void DebugDumpImpl(Value value, std::string tag, std::ostream& stream) const override; void DebugDumpImpl(FunctionDeclaration fn, std::string tag, std::ostream& stream) const override; void DebugPrintImpl(std::string message) override; @@ -94,6 +115,10 @@ namespace value void SetNameImpl(const Value& value, const std::string& name) override; std::string GetNameImpl(const Value& value) const override; + void ImportCodeFileImpl(std::string) override; + + Scalar GetFunctionAddressImpl(const FunctionDeclaration& fn) override; + Value IntrinsicCall(FunctionDeclaration intrinsic, std::vector args); std::optional EmitExternalCall(FunctionDeclaration func, std::vector args); @@ -104,6 +129,8 @@ namespace value std::string GetGlobalScopedName(std::string name) const; std::string GetCurrentFunctionScopedName(std::string name) const; + emitters::LLVMFunctionType ToLLVMFunctionType(const FunctionDeclaration& func) const; + struct PromotedConstantDataDescription { const ConstantData* data; @@ -114,12 +141,14 @@ namespace value std::optional HasBeenPromoted(Value value) const; Value Realize(Value value) const; Value EnsureEmittable(Value value); + 
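// Editorial sketch (not part of the patch): the declare/define/call flow that
// FunctionDeclaration and LLVMContext support. DeclareFunction, Parameters, Define,
// and Call appear in this patch; Returns and MakeScalar are assumed from the wider
// value API and are hypothetical here.
//
//     LLVMContext context("ExampleModule", emitters::CompilerOptions{}); // owning constructor above
//     auto square = DeclareFunction("Square")
//                       .Returns(MakeScalar<int>())    // assumed: sets the declared return type
//                       .Parameters(MakeScalar<int>()) // variadic overload declared above
//                       .Define([](Scalar x) -> Scalar { return x * x; });
//     Scalar squared = square(input); // Define returns a callable std::function wrapper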
std::vector EnsureEmittable(std::vector values); class IfContextImpl; struct FunctionScope; std::stack> _promotedConstantStack; + std::unique_ptr _ownedEmitter; emitters::IRModuleEmitter& _emitter; // LLVMContext uses ComputeContext internally to handle cases where all relevant operands are constant @@ -130,5 +159,9 @@ namespace value std::unordered_map _definedFunctions; }; + emitters::LLVMValue ToLLVMValue(Value value); + emitters::LLVMValue ToLLVMValue(ViewAdapter value); + + std::vector ToLLVMValue(std::vector values); } // namespace value } // namespace ell diff --git a/libraries/value/include/LoopNests.h b/libraries/value/include/LoopNests.h new file mode 100644 index 000000000..1b9d94018 --- /dev/null +++ b/libraries/value/include/LoopNests.h @@ -0,0 +1,224 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: LoopNests.h (value) +// Authors: Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "CachingProvider.h" +#include "EmitterContext.h" +#include "FunctionDeclaration.h" +#include "Value.h" + +#include "loopnests/Index.h" +#include "loopnests/Kernel.h" +#include "loopnests/KernelPredicate.h" + +#include + +#include +#include +#include +#include + +namespace ell +{ +namespace value +{ + namespace loopnests + { + class LoopNest; + } + + using loopnests::Index; + using loopnests::Kernel; + + enum class ArgumentType + { + Input, + InputOutput, + Output, + Temporary + }; + + class LoopNestImpl; + + class Schedule + { + public: + Schedule(const Schedule&); + Schedule& operator=(const Schedule&); + + Schedule(Schedule&&) noexcept = delete; + Schedule& operator=(Schedule&&) noexcept = delete; + + Index Split(Index& index, int factor); + + /// Parallelizes the loop represented by the index, if parallelization is enabled. No effect if parallelization is disabled + /// Represents the loop to parallelize + void Parallelize(Index index); + + /// Parallelizes the loop represented by the index, if parallelization is enabled. No effect if parallelization is disabled + /// Represents the loop to parallelize. On return, this index points to the inner loop created by the split + /// The factor by which to parallelize. Ideally, represents the number of threads to use + /// The index which represents the outer loop, now parallelized + Index Parallelize(Index index, int factor); + + /// Unrolls the loop represented by the index + /// Represents the loop to unroll + void Unroll(Index index); + + /// Unrolls the loop represented by the index + /// Represents the loop to unroll. 
On return, this index points to the inner loop created by the split + /// The factor by which to unroll + /// The index which represents the outer loop, now unrolled + Index Unroll(Index index, int factor); + + void Cache(std::unique_ptr provider); + + template + void Cache( + ViewAdapter view, + std::vector kernelIndices, + utilities::MemoryShape size = {}, + std::vector atIndices = {}, + std::optional order = std::nullopt, + std::any extras = {}); + + void Cache( + CachingProvider& provider, + ViewAdapter view, + std::vector kernelIndices, + utilities::MemoryShape size, + std::vector atIndices = {}, + std::optional order = std::nullopt, + std::any extras = {}); + + void SetOrder(std::vector indices); + + private: + Schedule(LoopNest&); + friend class LoopNest; + utilities::MemoryShape GetShapeFromIndicesIncrement(std::vector& kernelIndices); + + std::reference_wrapper _nest; + std::reference_wrapper _impl; + }; + + class LoopNest + { + public: + LoopNest(); + LoopNest(const LoopNest&); + LoopNest(LoopNest&&) noexcept; + LoopNest& operator=(const LoopNest&); + LoopNest& operator=(LoopNest&&) noexcept; + ~LoopNest(); + + LoopNest& Using(std::initializer_list inputs, ArgumentType argType); + LoopNest& ForAll(Index index, int begin, int end); + + template + LoopNest& Do(Fn&& fn, std::vector kernelOuterIndices = {}, std::string kernelId = ""); + + template + LoopNest& Do(Fn&& fn, std::string kernelId); + + LoopNest& Do(std::function)> fn, std::vector kernelOuterIndices = {}, std::string kernelId = ""); + + LoopNest& Do(std::function)> fn, std::string kernelId); + + LoopNest& Do(Kernel kernel, std::vector kernelOuterIndex = {}); + + LoopNest& Do(Kernel kernel, const loopnests::KernelPredicate& predicate, const loopnests::KernelPredicate& placement = {}); + + Schedule& GetSchedule(); + + void Run() const; + + loopnests::LoopNest& GetUnderlyingLoopNest(); + const loopnests::LoopNest& GetUnderlyingLoopNest() const; + + private: + template + LoopNest& DoImpl(std::function fn, std::vector kernelOuterIndices, std::string kernelId); + + friend void swap(LoopNest& nest1, LoopNest& nest2) noexcept; + friend class Schedule; + + std::unique_ptr _impl; + Schedule _schedule; + }; + + LoopNest Using(std::initializer_list inputs, ArgumentType argType); +} // namespace value +} // namespace ell + +#pragma region implementation + +namespace ell +{ +namespace value +{ +#if defined(__APPLE__) +#define FUNCTION_TYPE detail::Function // defined in implementation region of FunctionDeclaration.h +#else +#define FUNCTION_TYPE std::function +#endif // defined(__APPLE__) + + template + LoopNest& LoopNest::Do(Fn&& fn, std::vector kernelOuterIndices, std::string kernelId) + { + return DoImpl(FUNCTION_TYPE(std::forward(fn)), kernelOuterIndices, kernelId); + } + + template + LoopNest& LoopNest::Do(Fn&& fn, std::string kernelId) + { + return Do(std::move(fn), {}, kernelId); + } + + template + LoopNest& LoopNest::DoImpl(std::function fn, std::vector kernelOuterIndices, std::string kernelId) + { + static_assert(std::conjunction_v...>); + return Do( + std::function)>{ + [fn = std::move(fn)](std::vector args) { + std::tuple tupleArgs = utilities::VectorToTuple(args); + std::apply(fn, tupleArgs); + } }, + kernelOuterIndices, + kernelId); + } + + template + void Schedule::Cache( + ViewAdapter view, + std::vector kernelIndices, + utilities::MemoryShape size, + std::vector atIndices, + std::optional order, + std::any extras) + { + static_assert(std::is_base_of_v, "CachingStrategyType must inherit from CachingProvider!"); + + 
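// Editorial sketch (not part of the patch): how LoopNest and Schedule compose. The
// indices i, j, bounds M, N, and input A are illustrative placeholders, not names
// from this patch.
//
//     LoopNest nest = Using({ A }, ArgumentType::Input)
//                         .ForAll(i, 0, M)
//                         .ForAll(j, 0, N)
//                         .Do([](Matrix A, Scalar i, Scalar j) { /* kernel body */ });
//     Schedule& schedule = nest.GetSchedule();
//     Index iInner = schedule.Split(i, 4); // iterate i in blocks of 4
//     schedule.Parallelize(i);             // no effect if parallelization is disabled
//     schedule.Unroll(iInner);
//     nest.Run();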
CachingStrategyType provider{}; + Cache( + provider, + view, + kernelIndices, + size, + atIndices, + order, + extras); + } + +#undef FUNCTION_TYPE +} // namespace value +} // namespace ell + +#pragma endregion implementation diff --git a/libraries/value/include/Matrix.h b/libraries/value/include/Matrix.h index b3396adc0..ef4e56f60 100644 --- a/libraries/value/include/Matrix.h +++ b/libraries/value/include/Matrix.h @@ -25,6 +25,12 @@ namespace value class Matrix { public: + enum class MatrixLayout + { + rowMajor, + columnMajor + }; + Matrix(); /// Constructor that wraps the provided instance of Value @@ -97,6 +103,8 @@ namespace value /// Gets the number of columns within the active area size_t Columns() const; + MatrixLayout GetMatrixLayout() const; + /// Retrieves the type of data stored in the wrapped Value instance /// The type ValueType Type() const; @@ -128,6 +136,16 @@ namespace value Value _value; }; + /// Constructs an allocated instance with the specified dimensions + /// The number of rows of the allocated matrix + /// The number of columns of the allocated matrix + /// The type of the elements + /// The optional name + inline Matrix MakeMatrix(int rows, int columns, ValueType type, const std::string& name = "") + { + return Matrix(Allocate(type, utilities::MemoryLayout({ rows, columns })), name); + } + /// Constructs an allocated instance with the specified dimensions /// Any fundamental type accepted by Value /// The number of rows of the allocated matrix @@ -139,6 +157,28 @@ namespace value return Matrix(Allocate(utilities::MemoryLayout({ rows, columns })), name); } + + /// Constructs a statically-allocated instance with the specified dimensions + /// The number of rows of the allocated matrix + /// The number of columns of the allocated matrix + /// The type of the elements + /// The optional name + inline Matrix MakeStaticMatrix(int rows, int columns, ValueType type, const std::string& name = "") + { + return Matrix(StaticAllocate(name, type, utilities::MemoryLayout({ rows, columns }))); + } + + /// Constructs a statically-allocated instance with the specified dimensions + /// Any fundamental type accepted by Value + /// The number of rows of the allocated matrix + /// The number of columns of the allocated matrix + /// The optional name + template + Matrix MakeStaticMatrix(int rows, int columns, const std::string& name = "") + { + return Matrix(StaticAllocate(name, utilities::MemoryLayout({ rows, columns }))); + } + } // namespace value } // namespace ell diff --git a/libraries/value/include/MatrixOperations.h b/libraries/value/include/MatrixOperations.h index 477dbb096..4f0bffc4e 100644 --- a/libraries/value/include/MatrixOperations.h +++ b/libraries/value/include/MatrixOperations.h @@ -23,7 +23,7 @@ namespace value class Vector; /// Reinterprets the given data value as a matrix of the given size - Matrix ToMatrix(Value data, int numRows, int numCols); + Matrix ToMatrix(Value data, int numRows, int numCols); Scalar Sum(Matrix matrix, Scalar initialValue); @@ -32,6 +32,12 @@ namespace value /// The function to be called for each coordinate where there is an active element void For(Matrix matrix, std::function fn); + /// Creates a for loop over the matrix + /// A name that can be used by the emitter context to tag this loop in the emitted code + /// The instance of Matrix that references the data over which to iterate + /// The function to be called for each coordinate where there is an active element + void For(const std::string& name, Matrix matrix, std::function fn); + 
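// Editorial sketch (not part of the patch): the new MakeMatrix helper and the named
// For overload together. Matrix element access via operator() and Scalar arithmetic
// operators are assumed from the rest of the value library.
//
//     Matrix m = MakeMatrix<int>(4, 8, "m");
//     For("init_m", m, [&](Scalar row, Scalar column) {
//         m(row, column) = row + column; // "init_m" tags this loop in the emitted code
//     });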
Matrix GEMM(Matrix m1, Matrix m2); Vector GEMV(Matrix m, Vector v); diff --git a/libraries/value/include/Print.h b/libraries/value/include/Print.h new file mode 100644 index 000000000..e5315a7ce --- /dev/null +++ b/libraries/value/include/Print.h @@ -0,0 +1,36 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: Print.h (value) +// Authors: Chuck Jacobs +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "Value.h" + +#include +#include + +namespace ell +{ +namespace value +{ + /// Emits a print call. + /// + /// The text to print. + void Print(const std::string& text); + + /// Emits a printf call. + /// + /// Arguments to the printf call. + void Printf(const std::vector& arguments); + + /// Emits a printf call. + /// + /// Describes the printf format to use. + /// Arguments to the printf call. + void Printf(const std::string& format, const std::vector& arguments); +} // namespace value +} // namespace ell diff --git a/libraries/value/include/Scalar.h b/libraries/value/include/Scalar.h index e6ca718c4..3ff84a23e 100644 --- a/libraries/value/include/Scalar.h +++ b/libraries/value/include/Scalar.h @@ -8,6 +8,7 @@ #pragma once +#include "ScalarOperations.h" #include "Value.h" namespace ell diff --git a/libraries/value/include/ScalarOperations.h b/libraries/value/include/ScalarOperations.h new file mode 100644 index 000000000..f63123189 --- /dev/null +++ b/libraries/value/include/ScalarOperations.h @@ -0,0 +1,29 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: ScalarOperations.h (value) +// Authors: Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include + +namespace ell +{ +namespace value +{ + + class Scalar; + + /// Arithmetic operators + Scalar Add(Scalar, Scalar); + Scalar Subtract(Scalar, Scalar); + Scalar Multiply(Scalar, Scalar); + Scalar Divide(Scalar, Scalar); + Scalar Modulo(Scalar, Scalar); + Scalar FusedMultiplyAdd(Scalar a, Scalar b, Scalar c); // returns (a*b)+c + +} // namespace value +} // namespace ell diff --git a/libraries/value/include/TensorOperations.h b/libraries/value/include/TensorOperations.h index 8cce73089..2c3926b7f 100644 --- a/libraries/value/include/TensorOperations.h +++ b/libraries/value/include/TensorOperations.h @@ -33,5 +33,11 @@ namespace value /// The function to be called for each coordinate where there is an active element void For(Tensor tensor, std::function fn); + /// Creates a for loop over the tensor + /// A name that can be used by the emitter context to tag this loop in the emitted code + /// The instance of Tensor that references the data over which to iterate + /// The function to be called for each coordinate where there is an active element + void For(const std::string& name, Tensor tensor, std::function fn); + } // namespace value } // namespace ell diff --git a/libraries/value/include/Value.h b/libraries/value/include/Value.h index 8975ba082..55a07fd9a 100644 --- a/libraries/value/include/Value.h +++ b/libraries/value/include/Value.h @@ -404,6 +404,9 @@ namespace value /// The underlying data storage const UnderlyingDataType& GetUnderlyingData() const; + /// Gets a reference to the complete type description being held + const 
detail::ValueTypeDescription& GetType() const { return _type; } + /// Set the name for this instance with the current emitter context /// The name void SetName(const std::string& name); @@ -427,12 +430,16 @@ namespace value /// `Value GetValue()` struct ViewAdapter { - template - ViewAdapter(ViewType view) : + template + ViewAdapter(View view) : _value(detail::GetValue(view)) {} - inline operator Value() const { return _value; } + inline operator const Value&() const { return _value; } + inline operator Value&() { return _value; } + + inline Value& GetValue() { return _value; } + inline const Value& GetValue() const { return _value; } private: Value _value; diff --git a/libraries/value/include/ValueOperations.h b/libraries/value/include/ValueOperations.h index 7f939ab1d..1ae3be1de 100644 --- a/libraries/value/include/ValueOperations.h +++ b/libraries/value/include/ValueOperations.h @@ -27,6 +27,13 @@ namespace value /// The function to be called for each coordinate where there is an active element void For(utilities::MemoryLayout layout, std::function fn); + /// Creates a for loop beginning at `start`, ending at `stop`, and incrementing by `step` + /// The value used to initialize the loop counter + /// The terminal value of the loop + /// The value by which the loop counter is incremented + /// The function to be called for each value of the loop counter + void For(Scalar start, Scalar stop, Scalar step, std::function fn); + /// Cast a value to another type, returning a new value /// The data to convert /// The type to which the data should be cast diff --git a/libraries/value/include/ValueType.h b/libraries/value/include/ValueType.h index 3e426a0a0..67094bbe0 100644 --- a/libraries/value/include/ValueType.h +++ b/libraries/value/include/ValueType.h @@ -9,6 +9,7 @@ #pragma once #include +#include #include #include @@ -118,6 +119,10 @@ namespace value { return ValueType::Double; } + else + { + static_assert(utilities::FalseType::value, "Unknown value type"); + } } /// Get a string representation of the enum value diff --git a/libraries/value/include/VectorOperations.h b/libraries/value/include/VectorOperations.h index 269ac7aed..65e4d9e9f 100644 --- a/libraries/value/include/VectorOperations.h +++ b/libraries/value/include/VectorOperations.h @@ -49,5 +49,11 @@ namespace value /// The function to be called for each coordinate where there is an active element void For(Vector vector, std::function fn); + /// Creates a for loop over the vector + /// A name that can be used by the emitter context to tag this loop in the emitted code + /// The instance of Vector that references the data over which to iterate + /// The function to be called for each coordinate where there is an active element + void For(const std::string& name, Vector vector, std::function fn); + } // namespace value } // namespace ell diff --git a/libraries/value/include/loopnests/CodeGenerator.h b/libraries/value/include/loopnests/CodeGenerator.h new file mode 100644 index 000000000..e2cbfc860 --- /dev/null +++ b/libraries/value/include/loopnests/CodeGenerator.h @@ -0,0 +1,43 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: CodeGenerator.h (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "LoopNest.h" +#include "LoopNestVisitor.h" + +#include + +namespace ell +{
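// Editorial sketch (not part of the patch): the new ranged For from ValueOperations.h
// paired with Printf from Print.h. A Scalar constructor taking a constant and an
// implicit Scalar-to-Value conversion are assumed here.
//
//     For(Scalar{ 0 }, Scalar{ 10 }, Scalar{ 2 }, [](Scalar i) {
//         Printf("i = %d\n", { i }); // prints 0, 2, 4, 6, 8; the counter stops before `stop`
//     });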
+namespace value +{ + namespace loopnests + { + /// + /// Takes a loop nest and generates code for it + /// + class CodeGenerator : public LoopNestVisitor + { + public: + void Run(const LoopNest& loopNest) const; + + private: + void GenerateLoopRangeOld(const LoopRange& range, const RecursionState& state, const LoopVisitSchedule& schedule, std::function codegenFn) const override; + void GenerateLoopRangeNew(const LoopRange& range, const RecursionStateNew& state, const LoopVisitSchedule& schedule, std::function codegenFn) const override; + Scalar EmitIndexExpression(const Index& index, const IndexExpression& expr, const LoopIndexSymbolTable& indexVariables) const override; + void InvokeKernel(const Kernel& kernel, const KernelPredicate& predicate, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const override; + bool InvokeKernelGroup(const ScheduledKernelGroup& kernelGroup, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const override; + + void InvokeKernel(const Kernel& kernel, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const; + Scalar EmitKernelPredicate(const KernelPredicate& predicate, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const; + }; + + } // namespace loopnests +} // namespace value +} // namespace ell diff --git a/libraries/value/include/loopnests/CodePositionConstraints.h b/libraries/value/include/loopnests/CodePositionConstraints.h new file mode 100644 index 000000000..6fad5147d --- /dev/null +++ b/libraries/value/include/loopnests/CodePositionConstraints.h @@ -0,0 +1,166 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: CodePositionConstraints.h (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "Index.h" + +#include +#include +#include + +namespace ell +{ +namespace value +{ + namespace loopnests + { + enum class LoopFragmentType : int + { + prologue, // Occurs as part of a loop prologue, before the main loop body + body, // Occurs as part of the main loop body + boundary, // Occurs as part of a loop boundary condition + epilogue, // Occurs as part of a loop epilogue, after the main loop body + LAST + }; + + bool IsBoundaryFragment(LoopFragmentType f); + + std::ostream& operator<<(std::ostream& os, LoopFragmentType t); + + // A set of LoopFragmentTypes + class LoopFragmentFlags + { + public: + LoopFragmentFlags() = default; + LoopFragmentFlags(LoopFragmentType type) + { + _flags[static_cast<int>(type)] = 1; + }; + explicit LoopFragmentFlags(int flags); + + bool GetFlag(LoopFragmentType type) const + { + return _flags[static_cast<int>(type)]; + } + + void SetFlag(LoopFragmentType type, bool value) + { + _flags[static_cast<int>(type)] = value; + } + + static LoopFragmentFlags All() + { + LoopFragmentFlags result; + result.SetFlag(LoopFragmentType::prologue, true); + result.SetFlag(LoopFragmentType::body, true); + result.SetFlag(LoopFragmentType::boundary, false); + result.SetFlag(LoopFragmentType::epilogue, true); + return result; + } + + LoopFragmentFlags& operator&=(const LoopFragmentFlags& other) + { + _flags &= other._flags; + return *this; + } + + LoopFragmentFlags& operator|=(const LoopFragmentFlags& other) + { + _flags |= other._flags; + return *this; + } + + private: + std::bitset<static_cast<int>(LoopFragmentType::LAST)> _flags; + }; + + std::ostream& operator<<(std::ostream& os, LoopFragmentFlags f); + + inline LoopFragmentFlags operator&(LoopFragmentType a, LoopFragmentType b) + { + LoopFragmentFlags result(a); + result &= b; + return result; + } + + inline LoopFragmentFlags operator&(LoopFragmentFlags a, LoopFragmentType b) + { + a &= b; + return a; + } + + inline LoopFragmentFlags operator&(LoopFragmentType a, LoopFragmentFlags b) + { + return b & a; + } + + inline LoopFragmentFlags operator|(LoopFragmentType a, LoopFragmentType b) + { + LoopFragmentFlags result(a); + result |= b; + return result; + } + + inline LoopFragmentFlags operator|(LoopFragmentFlags a, LoopFragmentType b) + { + a |= b; + return a; + } + + inline LoopFragmentFlags operator|(LoopFragmentType a, LoopFragmentFlags b) + { + return b | a; + } + + /// + /// A class to hold the constraints that govern where a piece of code may / must run. Used to generate a concrete + /// schedule for running (non-"kernel") code + /// + + // TODO: each boundary index needs its own "placement" value (e.g., you could have a kernel that runs when j==0 and k==N-1) + class CodePositionConstraints + { + public: + CodePositionConstraints(LoopFragmentType placement, std::vector requiredIndices, std::vector boundaryIndices); + + LoopFragmentType GetPlacement() const { return _placement; } + + std::vector GetRequiredIndices() const; // indices we depend on + std::vector GetBoundaryIndices() const; // indices defining the fragment + + private: + LoopFragmentType _placement; + std::vector _requiredIndices; + std::vector _boundaryIndices; + }; + + bool operator==(const CodePositionConstraints& i1, const CodePositionConstraints& i2); + bool operator!=(const CodePositionConstraints& i1, const CodePositionConstraints& i2); + } // namespace loopnests +} // namespace value +} // namespace ell + +// +// Custom specialization of std::hash so we can keep constraints in containers that require hashable types +// +namespace std +{ +/// Implements a hash function for the CodePositionConstraints class, so that it can be used with associative containers (maps, sets, and the like). +template <> +struct hash<::ell::value::loopnests::CodePositionConstraints> +{ + using argument_type = ell::value::loopnests::CodePositionConstraints; + using result_type = std::size_t; + + /// Computes a hash of the input value. + /// + /// A hash value for the given input.
+ result_type operator()(const argument_type& constraints) const; +}; +} // namespace std diff --git a/libraries/value/include/loopnests/ForAll.h b/libraries/value/include/loopnests/ForAll.h new file mode 100644 index 000000000..f1333ce44 --- /dev/null +++ b/libraries/value/include/loopnests/ForAll.h @@ -0,0 +1,40 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: ForAll.h (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "LoopNest.h" + +namespace ell +{ +namespace value::loopnests +{ + // + // Syntactic sugar class for stringing together loop nest calls + // + class ForAll + { + public: + ForAll(const ForAll& other) = delete; + ForAll(ForAll&& other) = default; + ForAll(IterationDomain domain); + ForAll& operator=(const ForAll& other) = delete; + ForAll& operator=(ForAll&& other) = default; + + ForAll& AddKernel(const Kernel& kernel); + ForAll& AddKernel(const Kernel& kernel, const CodePositionConstraints& where); + ForAll& Split(const Index& dimension, int size); + ForAll& SetLoopOrder(const std::vector& order); + + const LoopNest& GetNest() const; + + private: + LoopNest _loops; + }; +} // namespace value::loopnests +} // namespace ell \ No newline at end of file diff --git a/libraries/value/include/loopnests/Index.h b/libraries/value/include/loopnests/Index.h new file mode 100644 index 000000000..474467392 --- /dev/null +++ b/libraries/value/include/loopnests/Index.h @@ -0,0 +1,68 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: Index.h (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include +#include + +namespace ell +{ +namespace value +{ + namespace loopnests + { + /// + /// A placeholder object representing a runtime variable used as the index for a loop (e.g., the 'i' in 'for(i = ...)'). 
+ /// + class Index + { + public: + using Id = int; + Index() = default; + Index(const Index& other) = default; + Index(Index&& other) = default; + Index(const std::string& name); + Index& operator=(const Index& other) = default; + Index& operator=(Index&& other) = default; + + const std::string& GetName() const; + Id GetId() const; + + private: + static int GetNextId(); + + friend inline bool operator==(const Index& i1, const Index& i2) { return i1.GetId() == i2.GetId(); } + friend inline bool operator!=(const Index& i1, const Index& i2) { return !(i1 == i2); } + friend inline bool operator<(const Index& i1, const Index& i2) { return i1.GetId() < i2.GetId(); } + + std::string _name; + Id _id = -1; + }; + + struct SplitIndex + { + Index outer; + Index inner; + }; + + std::ostream& operator<<(std::ostream& os, const Index& index); + } // namespace loopnests +} // namespace value +} // namespace ell + +namespace std +{ +template <> +struct hash<::ell::value::loopnests::Index> +{ + using argument_type = ::ell::value::loopnests::Index; + using result_type = std::size_t; + result_type operator()(const argument_type& index) const; +}; +} // namespace std diff --git a/libraries/value/include/loopnests/IndexRange.h b/libraries/value/include/loopnests/IndexRange.h new file mode 100644 index 000000000..759886606 --- /dev/null +++ b/libraries/value/include/loopnests/IndexRange.h @@ -0,0 +1,50 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: IndexRange.h (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "Index.h" +#include "Range.h" + +#include + +namespace ell +{ +namespace value +{ + namespace loopnests + { + /// + /// A range of integer values, used to express the interval that an index variable may take on. + /// + class IndexRange + { + public: + IndexRange(const Index& index, const Range& range); + IndexRange(const std::string& name, const Range& range); + + const Index& GetIndex() const; + const std::string& GetName() const; + int Begin() const; + int End() const; + int Size() const; + int Increment() const; + Range GetRange() const; + + private: + Index _index; + Range _range; + + friend inline bool operator==(const IndexRange& i1, const IndexRange& i2) { return (i1.GetIndex() == i2.GetIndex()) && (i1.GetRange() == i2.GetRange()); } + friend inline bool operator!=(const IndexRange& i1, const IndexRange& i2) { return (i1.GetIndex() != i2.GetIndex()) || (i1.GetRange() != i2.GetRange()); } + friend inline bool operator<(const IndexRange& i1, const IndexRange& i2) { return (i1.GetIndex() != i2.GetIndex()) ? 
(i1.GetIndex() < i2.GetIndex()) : (i1.GetRange() < i2.GetRange()); } + }; + + } // namespace loopnests +} // namespace value +} // namespace ell \ No newline at end of file diff --git a/libraries/value/include/loopnests/IterationDomain.h b/libraries/value/include/loopnests/IterationDomain.h new file mode 100644 index 000000000..b7ba65531 --- /dev/null +++ b/libraries/value/include/loopnests/IterationDomain.h @@ -0,0 +1,47 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: IterationDomain.h (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "IndexRange.h" + +#include +#include +#include + +namespace ell +{ +namespace value +{ + namespace loopnests + { + /// + /// The set of all points (IterationVectors) to be visited by a loop or loop nest. + /// + class IterationDomain + { + public: + IterationDomain() = default; + IterationDomain(const std::vector& ranges); + IterationDomain(const std::initializer_list& ranges); + + int NumDimensions() const; + IndexRange GetDimensionRange(int dimension) const; + IndexRange GetDimensionRange(const Index& index) const; + const std::vector& GetRanges() const; + + private: + int GetDimensionRangeFromIndex(const Index& index) const; + + std::vector _dimensions; + std::map _indexToDimensionMap; + }; + + } // namespace loopnests +} // namespace value +} // namespace ell diff --git a/libraries/value/include/loopnests/Kernel.h b/libraries/value/include/loopnests/Kernel.h new file mode 100644 index 000000000..80403d110 --- /dev/null +++ b/libraries/value/include/loopnests/Kernel.h @@ -0,0 +1,166 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: Kernel.h (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "Index.h" + +#include "../FunctionDeclaration.h" +#include "../Scalar.h" +#include "../Value.h" + +#include +#include +#include +#include + +namespace ell +{ +namespace value +{ + namespace loopnests + { + /// + /// Code that runs inside the loop nest. This is the code that actually implements the operation. The loops and + /// scheduling are all for the purpose of calling this code. + /// + class Kernel + { + public: + using Id = std::string; + + explicit Kernel(std::string name); + Kernel(std::string name, Id id); + + template + Kernel& Inputs(Types... inputs); + + Kernel& Inputs(const std::vector& inputs); + + Kernel& Indices(std::vector indices); + + template + Kernel& Indices(Types...
indices); + + template + Kernel& Define(Fn&& fn); + + // TODO : make this a template specialization of Define(), currently lambdas and std::functions aren't + // getting matched correctly + Kernel& DefineEx(std::function, std::vector)>&& fn); + + void Call(std::vector arguments, std::vector indices) const; + + const std::string& GetName() const; + const Id& GetId() const; + const std::vector& GetArgs() const; + const std::vector& GetIndices() const; + + private: + Id _id; + std::string _kernelName; + std::vector _inputs; + std::vector _indices; + std::function arguments, std::vector indices)> _kernel; + }; + + inline bool operator==(const Kernel& i1, const Kernel& i2) { return i1.GetId() == i2.GetId(); } + inline bool operator!=(const Kernel& i1, const Kernel& i2) { return !(i1 == i2); } + } // namespace loopnests +} // namespace value +} // namespace ell + +namespace std +{ +/// Implements a hash function for the Kernel class, so that it can be used with associative containers (maps, sets, and the like). +template <> +struct hash<::ell::value::loopnests::Kernel> +{ + using argument_type = ::ell::value::loopnests::Kernel; + using result_type = std::size_t; + result_type operator()(const argument_type& kernel) const; +}; +} // namespace std + +#pragma region implementation + +namespace ell +{ +namespace value +{ + namespace loopnests + { + template + Kernel& Kernel::Inputs(Types... inputs) + { + static_assert(std::conjunction_v...>); + return Inputs(std::vector{ ViewAdapter{ std::forward(inputs) }... }); + } + + template + Kernel& Kernel::Indices(Types... indices) + { + static_assert(utilities::AllSame...>); + return Indices(std::vector{ std::forward(indices)... }); + } + + template + Kernel& Kernel::Define(Fn&& fn) + { + _kernel = [numOriginalIndices = _indices.size(), + originalInputs = _inputs, + kernelName = UniqueName(_kernelName + "KernelFn"), + fnDefinition = std::move(fn)](std::vector arguments, std::vector indices) { + using namespace utilities; + + if (arguments.size() != originalInputs.size()) + { + throw InputException(InputExceptionErrors::invalidArgument, "Number of arguments does not match number of expected inputs"); + } + if (indices.size() != numOriginalIndices) + { + throw InputException(InputExceptionErrors::invalidArgument, "Number of indices does not match number of expected indices"); + } + + std::vector fnInputs(arguments.begin(), arguments.end()); + fnInputs.insert(fnInputs.end(), indices.begin(), indices.end()); + + std::vector fnParameters(originalInputs.begin(), originalInputs.end()); + fnParameters.insert(fnParameters.end(), indices.begin(), indices.end()); + for (auto i = 0u; i < originalInputs.size(); ++i) + { + Value& param = fnParameters[i]; + const Value& input = fnInputs[i]; + + if (!input.IsConstrained()) + { + param.ClearLayout(); + } + else + { + param.SetLayout(input.GetLayout()); + } + } + + auto fn = DeclareFunction(kernelName).Parameters(fnParameters); + fn.Inlined(FunctionInlining::always); + if (!fn.IsDefined()) + { + fn.Define(fnDefinition); + } + + fn.Call(fnInputs); + }; + + return *this; + } + + } // namespace loopnests +} // namespace value +} // namespace ell +#pragma endregion diff --git a/libraries/value/include/loopnests/KernelPredicate.h b/libraries/value/include/loopnests/KernelPredicate.h new file mode 100644 index 000000000..bcee6c06f --- /dev/null +++ b/libraries/value/include/loopnests/KernelPredicate.h @@ -0,0 +1,315 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// 
Project: Embedded Learning Library (ELL) +// File: KernelPredicate.h (value) +// Authors: Chuck Jacobs +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "Index.h" +#include "LoopIndexInfo.h" + +#include "../ValueType.h" + +#include +#include +#include +#include +#include + +namespace ell +{ +namespace value +{ + namespace loopnests + { + class KernelPredicate; + class LoopVisitSchedule; + + enum class Fragment + { + first, + last, + endBoundary, + all + }; + + enum class Placement + { + before, + after + }; + + class EmptyPredicate + { + public: + const EmptyPredicate& Simplify() const { return *this; } + const EmptyPredicate& Simplify(const LoopIndexSymbolTable& indices, const LoopVisitSchedule& schedule) const { return *this; } + + private: + friend inline bool operator==(const EmptyPredicate& i1, const EmptyPredicate& i2) { return true; } + friend inline bool operator!=(const EmptyPredicate& i1, const EmptyPredicate& i2) { return false; } + friend inline bool operator<(const EmptyPredicate& i1, const EmptyPredicate& i2) { return false; } + }; + + class ConstantPredicate + { + public: + explicit ConstantPredicate(bool value); + + bool GetValue() const; + + const ConstantPredicate& Simplify() const { return *this; } + const ConstantPredicate& Simplify(const LoopIndexSymbolTable& indices, const LoopVisitSchedule& schedule) const { return *this; } + + private: + bool _value; + + friend inline bool operator==(const ConstantPredicate& i1, const ConstantPredicate& i2) { return i1.GetValue() == i2.GetValue(); } + friend inline bool operator!=(const ConstantPredicate& i1, const ConstantPredicate& i2) { return i1.GetValue() != i2.GetValue(); } + friend inline bool operator<(const ConstantPredicate& i1, const ConstantPredicate& i2) { return i1.GetValue() < i2.GetValue(); } + }; + + class FragmentTypePredicate + { + public: + FragmentTypePredicate(const Index& index, Fragment condition); + + const Index& GetIndex() const; + Fragment GetCondition() const; + + KernelPredicate Simplify() const; + KernelPredicate Simplify(const LoopIndexSymbolTable& indices, const LoopVisitSchedule& schedule) const; + + private: + Index _index; + Fragment _condition; + + friend inline bool operator==(const FragmentTypePredicate& i1, const FragmentTypePredicate& i2) { return (i1.GetIndex() == i2.GetIndex()) && (i1.GetCondition() == i2.GetCondition()); } + friend inline bool operator!=(const FragmentTypePredicate& i1, const FragmentTypePredicate& i2) { return (i1.GetIndex() != i2.GetIndex()) || (i1.GetCondition() != i2.GetCondition()); } + friend inline bool operator<(const FragmentTypePredicate& i1, const FragmentTypePredicate& i2) { return (i1.GetIndex() != i2.GetIndex()) ? 
(i1.GetIndex() < i2.GetIndex()) : (i1.GetCondition() < i2.GetCondition()); } + }; + + class PlacementPredicate + { + public: + // Where to schedule kernel in its loop (before or after any inner loops) + explicit PlacementPredicate(Placement where); + + // Where to schedule kernel in relation to an index + PlacementPredicate(const Index& index, Placement where); + + bool HasIndex() const; + Index GetIndex() const; + Placement GetPlacement() const; + + const PlacementPredicate& Simplify() const; + const PlacementPredicate& Simplify(const LoopIndexSymbolTable& indices, const LoopVisitSchedule& schedule) const; + + private: + std::optional _index; + Placement _placement; + + friend inline bool operator==(const PlacementPredicate& i1, const PlacementPredicate& i2) { return (i1.GetIndex() == i2.GetIndex()) && (i1.GetPlacement() == i2.GetPlacement()); } + friend inline bool operator!=(const PlacementPredicate& i1, const PlacementPredicate& i2) { return (i1.GetIndex() != i2.GetIndex()) || (i1.GetPlacement() != i2.GetPlacement()); } + friend inline bool operator<(const PlacementPredicate& i1, const PlacementPredicate& i2) { return (i1.GetIndex() != i2.GetIndex()) ? (i1.GetIndex() < i2.GetIndex()) : (i1.GetPlacement() < i2.GetPlacement()); } + }; + + class IndexDefinedPredicate + { + public: + explicit IndexDefinedPredicate(const Index& index); + + const Index& GetIndex() const; + + const IndexDefinedPredicate& Simplify() const; + const IndexDefinedPredicate& Simplify(const LoopIndexSymbolTable& indices, const LoopVisitSchedule& schedule) const; + + private: + Index _index; + + friend inline bool operator==(const IndexDefinedPredicate& i1, const IndexDefinedPredicate& i2) { return i1.GetIndex() == i2.GetIndex(); } + friend inline bool operator!=(const IndexDefinedPredicate& i1, const IndexDefinedPredicate& i2) { return i1.GetIndex() != i2.GetIndex(); } + friend inline bool operator<(const IndexDefinedPredicate& i1, const IndexDefinedPredicate& i2) { return i1.GetIndex() < i2.GetIndex(); } + }; + + class KernelPredicateConjunction + { + public: + KernelPredicateConjunction(const KernelPredicate& lhs, const KernelPredicate& rhs); + KernelPredicateConjunction(const KernelPredicateConjunction& other); + KernelPredicateConjunction(KernelPredicateConjunction&& other) = default; + KernelPredicateConjunction& operator=(const KernelPredicateConjunction& other); + KernelPredicateConjunction& operator=(KernelPredicateConjunction&& other) = default; + + const std::vector>& GetTerms() const; + + KernelPredicate Simplify() const; + KernelPredicate Simplify(const LoopIndexSymbolTable& indices, const LoopVisitSchedule& schedule) const; + + private: + friend class KernelPredicate; + KernelPredicateConjunction(const std::vector>& terms); + + std::vector> _terms; + }; + + class KernelPredicateDisjunction + { + public: + KernelPredicateDisjunction(const KernelPredicate& lhs, const KernelPredicate& rhs); + KernelPredicateDisjunction(const KernelPredicateDisjunction& other); + KernelPredicateDisjunction(KernelPredicateDisjunction&& other) = default; + KernelPredicateDisjunction& operator=(const KernelPredicateDisjunction& other); + KernelPredicateDisjunction& operator=(KernelPredicateDisjunction&& other) = default; + + const std::vector>& GetTerms() const; + + KernelPredicate Simplify() const; + KernelPredicate Simplify(const LoopIndexSymbolTable& indices, const LoopVisitSchedule& schedule) const; + + private: + friend class KernelPredicate; + KernelPredicateDisjunction(const std::vector>& terms); + + std::vector> 
_terms; + }; + + class KernelPredicate + { + public: + KernelPredicate() = default; + + KernelPredicate(const EmptyPredicate& predicate); + KernelPredicate(const ConstantPredicate& predicate); + KernelPredicate(const FragmentTypePredicate& predicate); + KernelPredicate(const PlacementPredicate& predicate); + KernelPredicate(const IndexDefinedPredicate& predicate); + KernelPredicate(const KernelPredicateConjunction& predicate); + KernelPredicate(const KernelPredicateDisjunction& predicate); + + KernelPredicate(const KernelPredicate&) = default; + KernelPredicate(KernelPredicate&&) = default; + KernelPredicate& operator=(const KernelPredicate&) = default; + KernelPredicate& operator=(KernelPredicate&&) = default; + + KernelPredicate Simplify() const; + KernelPredicate Simplify(const LoopIndexSymbolTable& indices, const LoopVisitSchedule& schedule) const; + + template + void Visit(FunctionType&& f) const; // FunctionType of the form void(const KernelPredicate&) + + bool IsConstant() const; + bool GetConstantValue() const; + + bool IsAlwaysTrue() const; + bool IsAlwaysFalse() const; + bool IsEmpty() const; + + template + bool Is() const; + + template + const T* As() const; + + friend std::ostream& operator<<(std::ostream& os, const KernelPredicate& predicate); + + private: + std::variant + _expr; + }; + + KernelPredicate First(const Index& index); + KernelPredicate Last(const Index& index); + KernelPredicate EndBoundary(const Index& index); + KernelPredicate All(const Index& index); + KernelPredicate IsDefined(const Index& index); + KernelPredicate Before(const Index& index); + KernelPredicate After(const Index& index); + + KernelPredicate operator&&(const KernelPredicate& lhs, const KernelPredicate& rhs); + KernelPredicate operator||(const KernelPredicate& lhs, const KernelPredicate& rhs); + + KernelPredicate operator==(const Index& index, int value); + KernelPredicate operator==(int value, const Index& index); + // KernelPredicate operator==(const Index& index1, const Index& index2); + KernelPredicate operator!=(const Index& index, int value); + KernelPredicate operator!=(int value, const Index& index); + // KernelPredicate operator!=(const Index& index1, const Index& index2); + KernelPredicate operator<(const Index& index, int value); + KernelPredicate operator<(int value, const Index& index); + // KernelPredicate operator<(const Index& index1, const Index& index2); + KernelPredicate operator>(const Index& index, int value); + KernelPredicate operator>(int value, const Index& index); + // KernelPredicate operator>(const Index& index1, const Index& index2); + KernelPredicate operator<=(const Index& index, int value); + KernelPredicate operator<=(int value, const Index& index); + // KernelPredicate operator<=(const Index& index1, const Index& index2); + KernelPredicate operator>=(const Index& index, int value); + KernelPredicate operator>=(int value, const Index& index); + // KernelPredicate operator>=(const Index& index1, const Index& index2); + + std::string ToString(Fragment condition); + + std::ostream& operator<<(std::ostream& os, const EmptyPredicate& predicate); + std::ostream& operator<<(std::ostream& os, const ConstantPredicate& predicate); + std::ostream& operator<<(std::ostream& os, const FragmentTypePredicate& predicate); + std::ostream& operator<<(std::ostream& os, const PlacementPredicate& predicate); + std::ostream& operator<<(std::ostream& os, const KernelPredicateConjunction& predicate); + std::ostream& operator<<(std::ostream& os, const KernelPredicateDisjunction& 
predicate); + // std::ostream& operator<<(std::ostream& os, const KernelPredicate& predicate); + + } // namespace loopnests +} // namespace value +} // namespace ell + +#pragma region implementation +namespace ell +{ +namespace value +{ + namespace loopnests + { + template + bool KernelPredicate::Is() const + { + return std::holds_alternative(_expr); + } + + template + const T* KernelPredicate::As() const + { + return std::get_if(&_expr); + } + + template + void KernelPredicate::Visit(FunctionType&& f) const + { + f(*this); + if (auto conj = As(); conj != nullptr) + { + for (const auto& t : conj->GetTerms()) + { + t->Visit(f); + } + } + else if (auto disj = As(); disj != nullptr) + { + for (const auto& t : disj->GetTerms()) + { + t->Visit(f); + } + } + } + } // namespace loopnests +} // namespace value +} // namespace ell +#pragma endregion implementation diff --git a/libraries/value/include/loopnests/LoopIndexInfo.h b/libraries/value/include/loopnests/LoopIndexInfo.h new file mode 100644 index 000000000..224fef47e --- /dev/null +++ b/libraries/value/include/loopnests/LoopIndexInfo.h @@ -0,0 +1,41 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: LoopIndexInfo.h (value) +// Authors: Chuck Jacobs +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "Index.h" +#include "Range.h" + +#include "../Scalar.h" + +#include + +namespace ell +{ +namespace value +{ + namespace loopnests + { + enum class LoopIndexState + { + notVisited, + inProgress, + done + }; + + struct LoopIndexSymbolTableEntry + { + Index scope; // redundant with key in symbol table map + Scalar value; + Range loopRange; + LoopIndexState state; + }; + using LoopIndexSymbolTable = std::unordered_map; + } // namespace loopnests +} // namespace value +} // namespace ell diff --git a/libraries/value/include/loopnests/LoopNest.h b/libraries/value/include/loopnests/LoopNest.h new file mode 100644 index 000000000..b00a606c8 --- /dev/null +++ b/libraries/value/include/loopnests/LoopNest.h @@ -0,0 +1,292 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: LoopNest.h (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "CodePositionConstraints.h" +#include "Index.h" +#include "IndexRange.h" +#include "IterationDomain.h" +#include "Kernel.h" +#include "KernelPredicate.h" +#include "SplitIndexRange.h" +#include "SplitIterationDomain.h" + +#include "../Value.h" + +#include +#include +#include + +namespace ell +{ +namespace value +{ + namespace loopnests + { + class LoopNest; + + struct ScheduledKernel + { + bool newVersion = false; // Temporary workaround... 
to be removed + + Kernel kernel; + CodePositionConstraints constraints; + KernelPredicate predicate; + KernelPredicate placement; + }; + + struct ScheduledKernelGroup + { + Kernel::Id id; + std::vector kernels; + }; + + struct RenameAction + { + Value oldValue; + Value newValue; + std::vector where; + std::vector excludedKernels; + }; + + struct ScaledIndex + { + int scale; + Index index; + }; + + struct IndexExpression + { + std::vector indices; + int begin = 0; + }; + + struct LoopInfo + { + Index loopIndex; + Range fullRange; // the range for the "unswitched" part of this loop + Range fragmentRange; + }; + + /// + /// Represents the concrete sequence of loops to be generated, in detail. Derived from the loop nest and the + /// order of the loops. + /// + class LoopVisitSchedule + { + public: + struct LoopInfo + { + Index dimension; + IndexRange indexRange; + int boundarySize = 0; + int scale = 0; + }; + + using StateQueue = std::vector; + + /// Copy and move constructors / assignment operators + LoopVisitSchedule(const LoopVisitSchedule& other); + LoopVisitSchedule(LoopVisitSchedule&&) = default; + LoopVisitSchedule& operator=(const LoopVisitSchedule& other); + LoopVisitSchedule& operator=(LoopVisitSchedule&& other) = default; + + /// Returns the global nest level: how many total loops are current (over all dimensions) + int CurrentNestLevel() const; + + /// Returns `true` if all the loops have been visited + bool IsDone() const; + + /// Returns `true` if the current loop is the innermost level + bool IsInnermostLoop() const; + + /// The index of the current loop (e.g., `i_1`) + Index CurrentLoopIndex() const; + + /// Returns the logical (dimension) index of the current loop (i.e., `i`, not `i_0`, `i_1`, ...) + Index CurrentDimension() const; + + /// The span (start-end) of the entire dimension the current loop is part of. + int DimensionSize() const; + + /// The range [start, end) of the current loop. + Range LoopRange() const; + + /// The span (start-end) of the current loop. Only the same as the + /// number of iterations if the increment is 1. + int LoopSize() const; + int LoopIncrement() const; + int NonBoundaryEnd() const; + + /// The amount this loop index needs to be scaled when generating the expression for the original dimension index. + int LoopIndexScale() const; + + /// Returns `true` if the current loop index has a prologue / epilogue of the given type (because there is a kernel associated with it). + bool CurrentLoopHasFragment(std::vector activeKernels, LoopFragmentType fragmentType) const; + + /// Returns `true` if a future loop (one inside this loop) has a prologue / epilogue of the given type (because there is a kernel associated with it) on the same dimension as the current loop. 
+ bool FutureLoopHasFragmentForThisIndex(std::vector activeKernels, LoopFragmentType fragmentType) const; + + bool FragmentCanRunAlone(std::vector activeKernels, LoopFragmentType fragmentType) const; // Returns `true` if the current loop index has a prologue / epilogue of the given type (because there is a kernel associated with it), and if all such kernels are able to be in their own fragment + + int CurrentIndexEndBoundarySize() const; // Returns the size of the last inner loop if the current loop has a potentially-unswitched boundary condition at the end, or zero if the increment divides the size evenly + + bool WillVisitIndex(const Index& index) const; + bool IsFullyDefined(const Index& index) const; + bool IsFullyDefinedByThisLoop(const Index& index) const; + bool WasIterationVariableDefined(const Index& index) const; + KernelPredicate GetKernelPredicate(const ScheduledKernel& kernel) const; + + LoopVisitSchedule Next() const; + LoopVisitSchedule Prev() const; + + const LoopInfo& Front() const; + + const SplitIterationDomain& GetDomain() const; + const LoopNest& GetLoopNest() const { return _nest.get(); } + + private: + friend class LoopNest; + LoopVisitSchedule(const LoopNest& nest, StateQueue state); + LoopVisitSchedule(const LoopNest& nest, int level, StateQueue state); + + int _level; // == current position in state queue + StateQueue _state; + std::reference_wrapper _nest; + }; + + /// + /// A nested set of loops, and the code (kernels) that run inside them + /// + class LoopNest + { + public: + LoopNest(IterationDomain domain); + + enum class ConstraintType + { + constraint, + predicate + }; + + /// Add a "body" kernel to be run in the middle of the loop nest + void AddKernel(const Kernel& kernel, ConstraintType type = ConstraintType::constraint); + + /// Add a kernel to be run as the prologue or epilogue of a loop + void AddKernel(const Kernel& kernel, LoopFragmentType where); + + /// Add a kernel to be run as the prologue or epilogue of a loop + void AddKernel(const Kernel& kernel, const CodePositionConstraints& where); + + /// Add a kernel to be run as allowed by a predicate + void AddKernel(const Kernel& kernel, const KernelPredicate& predicate); + + /// Add a kernel to be run as allowed by a predicate and a placement predicate + void AddKernel(const Kernel& kernel, const KernelPredicate& predicate, const KernelPredicate& placement); + + /// Add a kernel to be run as the prologue or epilogue of a loop + void AddKernel(const Kernel& kernel, const CodePositionConstraints& where, const KernelPredicate& predicate, const KernelPredicate& placement); + + const std::vector& GetKernels() const; + + std::vector GetKernelGroups() const; + + void Parallelize(Index index); + [[maybe_unused]] SplitIndex Parallelize(Index index, int factor); + + void Unroll(Index index); + [[maybe_unused]] SplitIndex Unroll(Index index, int factor); + + [[maybe_unused]] SplitIndex Split(Index index, int size); + + void SetLoopOrder(const std::vector& order); + + void RenameVariable(ViewAdapter oldVariable, ViewAdapter newVariable, const std::vector& where, const std::vector& excludedKernels = {}); + + int NumDimensions() const; + Range GetIndexRange(Index index) const; + std::vector GetLoopIndexRanges() const; + const SplitIndexRange& GetDimensionRange(int dimension) const; + const SplitIndexRange& GetDimensionRange(const Index& dimension) const; + int NumSplits(const Index& dimension) const; + const std::vector& GetLoopSequence() const; + LoopVisitSchedule GetLoopSchedule() const; + + // Methods used by 
code generators
+        int GetLoopIndexScale(const Index& index) const;
+
+        /// Get the concrete loop index given a logical dimension index and split level
+        Index GetLoopIndex(const Index& dimension, int level) const;
+
+        bool IsParallelized(const Index& index) const;
+
+        bool IsUnrolled(const Index& index) const;
+
+        /// See if an Index is used as a parameter to a kernel
+        bool IsUsed(const Index& index, const std::vector<ScheduledKernel>& activeKernels) const;
+
+        /// Preliminary "variable-renaming" support
+        const std::vector<RenameAction>& GetRenameActions() const;
+
+        const SplitIterationDomain& GetDomain() const;
+
+        Index GetBaseIndex(const Index& index) const;
+
+        /// Return `true` iff `index` is a concrete index for a loop.
+        bool IsLoopIndex(const Index& index) const;
+
+        /// Return `true` iff `index` must be computed from other indices (i.e., it has been split).
+        bool IsComputedIndex(const Index& index) const;
+
+        IndexExpression GetIndexExpression(const Index& index) const;
+
+        void DebugDump(std::string tag, std::ostream* stream) const;
+
+        const std::string& Name() const { return _name; }
+
+    private:
+        void InitLoopSequence();
+        void ConvertKernelConstraints();
+        void ConvertKernelConstraints(ScheduledKernel& kernel);
+
+        SplitIterationDomain _domain;
+        std::vector<Index> _loopSequence;
+        std::vector<ScheduledKernel> _kernels;
+        std::vector<RenameAction> _renameActions;
+        std::vector<Index> _parallelizedIndices;
+        std::vector<Index> _unrolledIndices;
+        std::string _name = UniqueName("LoopNest");
+    };
+
+    void DebugDump(const LoopNest& nest, std::string tag = "", std::ostream* stream = nullptr);
+
+    LoopNest Fuse(const LoopNest& nest1, const LoopNest& nest2);
+    LoopNest Fuse(const LoopNest& nest1, const LoopNest& nest2, const std::vector<Index>& dependentIndices1, const std::vector<Index>& dependentIndices2);
+
+    bool operator==(const ScheduledKernel& i1, const ScheduledKernel& i2);
+    bool operator!=(const ScheduledKernel& i1, const ScheduledKernel& i2);
+    } // namespace loopnests
+} // namespace value
+} // namespace ell
+
+namespace std
+{
+/// Implements a hash function for the ScheduledKernel class, so that it can be used with associative containers (maps, sets, and the like).
+template <>
+struct hash<::ell::value::loopnests::ScheduledKernel>
+{
+    using argument_type = ell::value::loopnests::ScheduledKernel;
+    using result_type = std::size_t;
+
+    /// Computes a hash of the input value.
+    ///
+    /// A hash value for the given input.
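+    ///
+    /// A minimal usage sketch (hypothetical names; with this specialization in
+    /// place, ScheduledKernel works directly as a key type):
+    ///
+    ///     std::unordered_set<ell::value::loopnests::ScheduledKernel> visited;
+    ///     visited.insert(kernel); // uses this hash plus operator== declared above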
+    result_type operator()(const argument_type& constraints) const;
+};
+} // namespace std
diff --git a/libraries/value/include/loopnests/LoopNestPrinter.h b/libraries/value/include/loopnests/LoopNestPrinter.h
new file mode 100644
index 000000000..5bc7239fa
--- /dev/null
+++ b/libraries/value/include/loopnests/LoopNestPrinter.h
@@ -0,0 +1,65 @@
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Project:  Embedded Learning Library (ELL)
+//  File:     LoopNestPrinter.h (value)
+//  Authors:  Chuck Jacobs
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include "LoopNest.h"
+#include "LoopNestVisitor.h"
+
+#include <ostream>
+#include <string>
+
+namespace ell
+{
+namespace value
+{
+    namespace loopnests
+    {
+        ///
+        /// Takes a loop nest and prints a pseudocode representation of it
+        ///
+        class LoopNestPrinter : public LoopNestVisitor
+        {
+        public:
+            LoopNestPrinter(std::ostream& stream);
+            void Print(const LoopNest& loopNest) const;
+
+        private:
+            std::ostream& _stream;
+            mutable int _indentLevel;
+
+            // RAII struct to manage indent level
+            struct Indenter
+            {
+                Indenter(const LoopNestPrinter& printer) :
+                    printer(printer) { ++printer._indentLevel; }
+                ~Indenter() { --printer._indentLevel; }
+                const LoopNestPrinter& printer;
+            };
+
+            void GenerateLoopRangeOld(const LoopRange& range, const RecursionState& state, const LoopVisitSchedule& schedule, std::function<void(Scalar)> codegenFn) const override;
+            void GenerateLoopRangeNew(const LoopRange& range, const RecursionStateNew& state, const LoopVisitSchedule& schedule, std::function<void(Scalar)> codegenFn) const override;
+            Scalar EmitIndexExpression(const Index& index, const IndexExpression& expr, const LoopIndexSymbolTable& indexVariables) const override;
+            void InvokeKernel(const Kernel& kernel, const KernelPredicate& predicate, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const override;
+            bool InvokeKernelGroup(const ScheduledKernelGroup& kernelGroup, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const override;
+
+            void InvokeKernel(const Kernel& kernel, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const;
+            void EmitIf(const KernelPredicate& predicate, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const;
+            void EmitElseIf(const KernelPredicate& predicate, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const;
+            void EmitElse() const;
+            void EmitEndIf() const;
+
+            std::string GetIndent() const;
+            void WriteLine(std::string l) const;
+            std::string GetIndexString(const Index& index, const LoopIndexSymbolTable& runtimeIndexVariables) const;
+            std::string GetPredicateString(const KernelPredicate& predicate, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const;
+        };
+
+    } // namespace loopnests
+} // namespace value
+} // namespace ell
diff --git a/libraries/value/include/loopnests/LoopNestVisitor.h b/libraries/value/include/loopnests/LoopNestVisitor.h
new file mode 100644
index 000000000..baf928384
--- /dev/null
+++ b/libraries/value/include/loopnests/LoopNestVisitor.h
@@ -0,0 +1,138 @@
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Project:  Embedded Learning Library (ELL)
+//  File:     LoopNestVisitor.h (value)
+//  Authors:  Chuck Jacobs, Kern Handa
+//
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "KernelPredicate.h" +#include "LoopIndexInfo.h" +#include "LoopNest.h" + +#include +#include +#include +#include + +namespace ell +{ +namespace value +{ + namespace loopnests + { + /// + /// Abstract base class for objects that visit a loop nest (e.g., code generators) + /// + class LoopNestVisitor + { + public: + static Range GetLoopRange(const Index& loopIndex, const LoopIndexSymbolTable& activeRanges, const LoopVisitSchedule& schedule); + + protected: + virtual ~LoopNestVisitor() = default; + + void Visit(const LoopNest& loopNest) const; + + bool WillKernelRunInThisLoop(const ScheduledKernel& kernel, LoopFragmentFlags kernelFilter, const LoopVisitSchedule& schedule) const; + + std::vector GetValidKernels(std::vector activeKernels, + const std::unordered_map& currentFragmentStates, + LoopFragmentFlags currentLoopFlags, + LoopFragmentFlags kernelFilter, + const LoopVisitSchedule& schedule) const; + + bool ShouldRunKernel(const ScheduledKernel& kernel, + LoopFragmentType placement, + const std::unordered_map& constraintIndices, + LoopFragmentFlags currentLoopFlags, + const LoopVisitSchedule& schedule) const; + + bool IsIdentity(const IndexExpression& expr, const Index& index) const; + + /// + /// Returns `true` if the current loop body is inside the loop for the given index (so, "inside" counts the current loop being emitted) + /// + bool IsFullyDefined(const Index& index, const LoopVisitSchedule& schedule) const; + + /// + /// Returns `true` if the current loop body is inside the loop for the given index (so, "inside" counts the current loop being emitted) + /// + bool AreAllFullyDefined(const std::vector& indices, const LoopVisitSchedule& schedule) const; + + struct RecursionState + { + RecursionState(const LoopNest& loopNest); + RecursionState(const RecursionState&) = default; + + LoopIndexSymbolTable loopIndices; // map from an loop Index variable -> the actual (Scalar) runtime loop index for that loop + LoopFragmentFlags currentFragment; + std::unordered_map activeDimensionRanges; // map from dimension index variable -> active loop range for that dimension at this recursion level if that dimension has been previously visited + std::vector activeKernels; + std::unordered_map fragmentStates; + }; + + struct Partition + { + Index index; + Range range; + }; + using PartitionList = std::vector; + + using ActiveKernelGroupList = std::vector>; + + struct RecursionStateNew + { + RecursionStateNew(const LoopNest& loopNest); + RecursionStateNew(const RecursionStateNew&) = default; + + LoopIndexSymbolTable loopIndices; // map from an loop Index variable -> + // the actual (Scalar) runtime loop index for that loop + // range visited by that variable in this branch of the code (for loops that have already been visited) + // state of the variable's loop (before, inside, after) + ActiveKernelGroupList kernelGroups; + }; + + struct LoopRange + { + Scalar start; + Scalar stop; + Scalar step; + LoopFragmentFlags futureLoopFragmentFlags; + LoopFragmentFlags currentLoopFragmentFlags; + }; + + bool UseNewVersion(const LoopNest& loopNest) const; + void GenerateLoopsOld(const RecursionState& state, const LoopVisitSchedule& schedule) const; + void GenerateLoopsNew(RecursionStateNew& state, const LoopVisitSchedule& schedule) const; + std::function GetCodegenFnOld(const LoopRange& r, const RecursionState& state, const LoopVisitSchedule& schedule) const; + std::function 
<void(Scalar)> GetCodegenFnNew(const LoopRange& r, const RecursionStateNew& state, const LoopVisitSchedule& schedule) const;
+
+        PartitionList GetPartitions(const Index& loopIndex, Range loopRange, const ActiveKernelGroupList& kernels, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const;
+        void AddSplits(const Index& loopIndex, Range loopRange, const KernelPredicate& predicate, const LoopVisitSchedule& schedule, std::set<int>& splits) const;
+
+        // Get the end of the "regular" part of this loop (the part that divides by the tile size evenly)
+        int GetMainBodyLoopEnd(const RecursionState& state, const LoopVisitSchedule& schedule, const Range& loopRange) const;
+        bool LoopInEndBoundaryFragment(const RecursionState& state, const LoopVisitSchedule& schedule) const;
+
+        void DefineComputedIndexVariables(LoopIndexSymbolTable& runtimeLoopIndices, const std::vector<ScheduledKernel>& activeKernels, const LoopVisitSchedule& schedule) const;
+        LoopIndexSymbolTable GetRuntimeIndexVariables(const LoopIndexSymbolTable& runtimeLoopIndices, const LoopNest& loopNest) const;
+        void DefinePostLoopIndex(const Index& loopIndex, LoopIndexSymbolTable& runtimeLoopIndices, const LoopVisitSchedule& schedule) const;
+
+        KernelPredicate GetKernelPredicate(const ScheduledKernel& kernel, const LoopVisitSchedule& schedule) const;
+        bool IsPlacementValid(const ScheduledKernel& kernel, const LoopIndexSymbolTable& runtimeLoopIndices, const LoopVisitSchedule& schedule) const;
+        std::vector<ScheduledKernel> GetValidKernels(const ScheduledKernelGroup& kernelGroup, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const;
+
+        // abstract:
+        virtual void GenerateLoopRangeOld(const LoopRange& range, const RecursionState& state, const LoopVisitSchedule& schedule, std::function<void(Scalar)> codegenFn) const = 0;
+        virtual void GenerateLoopRangeNew(const LoopRange& range, const RecursionStateNew& state, const LoopVisitSchedule& schedule, std::function<void(Scalar)> codegenFn) const = 0;
+        virtual Scalar EmitIndexExpression(const Index& index, const IndexExpression& expr, const LoopIndexSymbolTable& indexVariables) const = 0;
+        virtual void InvokeKernel(const Kernel& kernel, const KernelPredicate& predicate, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const = 0;
+        virtual bool InvokeKernelGroup(const ScheduledKernelGroup& kernelGroup, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const = 0;
+    };
+
+    } // namespace loopnests
+} // namespace value
+} // namespace ell
diff --git a/libraries/value/include/loopnests/Range.h b/libraries/value/include/loopnests/Range.h
new file mode 100644
index 000000000..50df26e5b
--- /dev/null
+++ b/libraries/value/include/loopnests/Range.h
@@ -0,0 +1,44 @@
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Project:  Embedded Learning Library (ELL)
+//  File:     Range.h (value)
+//  Authors:  Chuck Jacobs, Kern Handa
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include <iosfwd>
+
+namespace ell
+{
+namespace value
+{
+    namespace loopnests
+    {
+        ///
+        /// A class representing the half-open interval `[begin, end)`, with an increment of `_increment` between points.
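+        /// For example (a sketch of the intended iteration semantics),
+        /// `Range(0, 10, 4)` covers the points 0, 4, and 8; the end point 10 is excluded.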
+ /// + class Range + { + public: + Range(int begin, int end, int increment = 1); + + int Begin() const; + int End() const; + int Size() const; + int Increment() const; + + private: + int _begin; + int _end; + int _increment; + }; + + bool operator==(const Range& i1, const Range& i2); + bool operator!=(const Range& i1, const Range& i2); + bool operator<(const Range& i1, const Range& i2); + std::ostream& operator<<(std::ostream& os, const Range& r); + } // namespace loopnests +} // namespace value +} // namespace ell diff --git a/libraries/value/include/loopnests/SplitIndexRange.h b/libraries/value/include/loopnests/SplitIndexRange.h new file mode 100644 index 000000000..4148c506b --- /dev/null +++ b/libraries/value/include/loopnests/SplitIndexRange.h @@ -0,0 +1,112 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: SplitIndexRange.h (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "Index.h" +#include "IndexRange.h" + +#include +#include +#include +#include + +namespace ell +{ +namespace value +{ + namespace loopnests + { + /// + /// A hierarchically-decomposed dimension range, used to represent the sizes for the different loop levels of a tiled + /// loop + /// + class SplitIndexRange + { + public: + SplitIndexRange() = default; + SplitIndexRange(const IndexRange& range); + + const Index& GetDimensionIndex() const; // The dimension index (e.g., `i`), not a concrete loop index + Range GetDimensionRange() const; // Returns the full range over the dimension + + int NumSplits() const; + int GetBegin() const; + int GetSize() const; + int GetIncrement() const; + + int GetSplitSize(int level) const; // note: returns '1' for the last level + Index GetSplitIndex(int level) const; + Range GetIndexRange(const Index& index) const; + + bool Contains(const Index& index) const; // returns `true` if the given index belongs to this domain + + bool IsLoopIndex(const Index& index) const; // a leaf node in the index tree + bool IsComputedIndex(const Index& index) const; // an interior node in the index tree + bool IsDimension(const Index& index) const; // the index corresponding to the original range + + bool IsParentOf(const Index& parent, const Index& child) const; // is 'parent' the (immediate) parent of 'child'? + bool IsChildOf(const Index& child, const Index& parent) const; // is 'child' a(n) (immediate) child of 'parent'? + bool DependsOn(const Index& index1, const Index& index2) const; // does 'index1' depend on 'index2'? 
(after a split, the parent depends on the new (leaf) indices) + + const std::vector& GetIndices() const; + std::vector GetLoopIndices() const; + std::vector GetComputedIndices() const; + std::vector GetDependentIndices(const Index& index, bool includeSelf = false) const; + std::vector GetDependentLoopIndices(const Index& index, bool includeSelf = false) const; + + bool HasParentIndex(const Index& parent) const; + + /// Get the index that was split in order to create the given index + Index GetParentIndex(const Index& parent) const; + + bool IsOuterSplitIndex(const Index& index) const; + bool IsInnerSplitIndex(const Index& index) const; + Index GetOuterSplitIndex(const Index& parent) const; + Index GetInnerSplitIndex(const Index& parent) const; + + std::vector GetAllParentIndices(const Index& index) const; + std::vector GetChildIndices(const Index& index) const; + + void Print(std::ostream& os) const; + + private: + friend class SplitIterationDomain; + + SplitIndex Split(int size); // add a split --- must be smaller than last split + SplitIndex Split(Index index, int size); // split the given index + SplitIndex SplitNode(int node, int size); // split the given index + + int GetNode(const Index& index) const; // returns the offset (index into a vector) for the given index + int GetParent(int node) const; + int GetLeftChild(int node) const; + int GetRightChild(int node) const; + int GetNthLeaf(int n) const; + int GetSmallestLeaf(int node) const; // returns the "smallest" leaf descendent of index. If index is itself a leaf, return it, else return GetSmallestLeaf(index.rightChild) + bool IsLeaf(int node) const; + bool IsInteriorNode(int node) const; + + // The indices and their properties are stored in a binary tree + // The root is the dimension index (e.g., 'i') + // Leaves are concrete loop indices + // Interior nodes are computed indices + // + // Initially, the tree has 1 node: the dimension index. + // Splitting a leaf turns it into an interior node with 2 children (the loop indices of the 2 new loops). + // Splitting an interior node is illegal. If an interior node is specified, the rightmost leaf node + // is chosen (arbitrarily). + + std::unordered_map _indexOffset; // A map from Index -> location in the below vectors for info about that index + std::vector _indices; // _indices[0] is i + std::vector _parentOffset; // parent[0] is null (-1) + std::vector _leftChildOffset; // offset to entry of the first (left) child. The right child is adjacent, so it's at (this value)+1 + std::vector _ranges; + }; + } // namespace loopnests +} // namespace value +} // namespace ell diff --git a/libraries/value/include/loopnests/SplitIterationDomain.h b/libraries/value/include/loopnests/SplitIterationDomain.h new file mode 100644 index 000000000..57c98cd3f --- /dev/null +++ b/libraries/value/include/loopnests/SplitIterationDomain.h @@ -0,0 +1,95 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: SplitIterationDomain.h (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "IterationDomain.h" +#include "SplitIndexRange.h" + +#include +#include +#include + +namespace ell +{ +namespace value +{ + namespace loopnests + { + /// + /// An `IterationDomain` where some of the dimensions may have been split (tiled). 
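+        /// For example (illustrative): splitting a dimension `i` with range [0, 16)
+        /// by 4 yields an outer loop index visiting 0, 4, 8, 12 and an inner loop index
+        /// visiting 0..3, with the original position recovered as `i = i_outer + i_inner`.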
+ /// + class SplitIterationDomain + { + public: + SplitIterationDomain(const IterationDomain& domain); + + int NumDimensions() const; + int GetDimensionSize(const Index& dimensionIndex) const; + int GetDimensionBegin(const Index& dimensionIndex) const; + + Range GetIndexRange(const Index& index) const; + + std::vector GetAllLoopIndices() const; + + const std::vector& GetIndicesForDimension(const Index& dimensionIndex) const; + std::vector GetLoopIndicesForDimension(const Index& dimensionIndex) const; + std::vector GetComputedIndicesForDimension(const Index& dimensionIndex) const; + + std::vector GetDependentIndices(const Index& index, bool includeSelf = false) const; + std::vector GetDependentLoopIndices(const Index& index, bool includeSelf = false) const; + + bool Contains(const Index& index) const; // returns `true` if the given index belongs to this domain + bool IsLoopIndex(const Index& index) const; // a leaf node in the index tree + bool IsComputedIndex(const Index& index) const; // an interior node in the index tree + bool IsDimension(const Index& index) const; // the index corresponding to the original range + + bool SameDimension(const Index& index1, const Index& index2) const; + bool IsParentOf(const Index& parent, const Index& child) const; // is 'parent' the (immediate) parent of 'child'? + bool IsChildOf(const Index& child, const Index& parent) const; // is 'child' a (immediate) child of 'parent'? + bool DependsOn(const Index& index1, const Index& index2) const; // does 'index1' depend on 'index2'? (after a split, the parent depends on the new (leaf) indices) + + bool HasParentIndex(const Index& parent) const; + + /// Get the index that was split in order to create the given index + Index GetParentIndex(const Index& parent) const; + + bool IsOuterSplitIndex(const Index& index) const; + bool IsInnerSplitIndex(const Index& index) const; + Index GetOuterSplitIndex(const Index& parent) const; + Index GetInnerSplitIndex(const Index& parent) const; + + std::vector GetAllParentIndices(const Index& index) const; + std::vector GetChildIndices(const Index& index) const; + + const SplitIndexRange& GetDimensionRange(const Index& index) const; + SplitIndexRange& GetDimensionRange(const Index& index); + + const SplitIndexRange& GetDimensionRange(int offset) const; + SplitIndexRange& GetDimensionRange(int offset); + + int NumSplits(const Index& dimensionIndex) const; + Index GetBaseIndex(const Index& index) const; + Index GetBaseIndex(int offset) const; + bool IsPrimaryDimension(const Index& index) const; + + SplitIndex Split(const Index& index, int splitSize); + + void Print(std::ostream& os) const; + + private: + int GetOffsetFromIndex(const Index& index) const; + + std::unordered_map _baseIndices; + std::vector _dimensions; + std::unordered_map _indexToOffsetMap; + }; + + } // namespace loopnests +} // namespace value +} // namespace ell diff --git a/libraries/value/src/Array.cpp b/libraries/value/src/Array.cpp new file mode 100644 index 000000000..d8f2505a1 --- /dev/null +++ b/libraries/value/src/Array.cpp @@ -0,0 +1,120 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: Array.cpp (value) +// Authors: Chuck Jacobs +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "Array.h" +#include "EmitterContext.h" + +#include + +#include +#include + +namespace ell +{ +using namespace utilities; + +namespace value +{ + 
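+    // A usage sketch for the API defined below (illustrative, and assuming an
+    // active EmitterContext): elements are addressed with a full coordinate
+    // vector, and the free function For() visits every element of the layout:
+    //
+    //     Array a = MakeArray(...); // hypothetical helper producing an Array
+    //     For(a, [&](const std::vector<Scalar>& coords) {
+    //         Scalar element = a(coords); // index with one Scalar per dimension
+    //     });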
Array::Array() = default;
+
+    Array::Array(Value value, const std::string& name) :
+        _value(value)
+    {
+        if (!_value.IsDefined() || !_value.IsConstrained())
+        {
+            throw InputException(InputExceptionErrors::invalidArgument, "Value passed in must be defined and have a memory layout");
+        }
+        if (_value.GetLayout() == utilities::ScalarLayout)
+        {
+            throw InputException(InputExceptionErrors::invalidArgument, "Value passed in must not be scalar");
+        }
+        if (!name.empty())
+        {
+            SetName(name);
+        }
+    }
+
+    Array::~Array() = default;
+    Array::Array(const Array&) = default;
+    Array::Array(Array&&) noexcept = default;
+
+    Array& Array::operator=(const Array& other)
+    {
+        if (this != &other)
+        {
+            _value = other._value;
+        }
+        return *this;
+    }
+
+    Array& Array::operator=(Array&& other)
+    {
+        if (this != &other)
+        {
+            _value = std::move(other._value);
+            other._value = Value();
+        }
+        return *this;
+    }
+
+    Value Array::GetValue() const { return _value; }
+
+    Array Array::Copy() const
+    {
+        auto newValue = Allocate(_value.GetBaseType(), _value.GetLayout());
+        newValue = _value;
+        return newValue;
+    }
+
+    Scalar Array::operator()(const std::vector<Scalar>& indices)
+    {
+        if (static_cast<int>(indices.size()) != GetValue().GetLayout().NumDimensions())
+        {
+            throw utilities::InputException(utilities::InputExceptionErrors::invalidSize);
+        }
+        Value indexedValue = GetContext().Offset(_value, indices);
+        indexedValue.SetLayout(utilities::ScalarLayout);
+
+        return indexedValue;
+    }
+
+    Scalar Array::operator()(const std::vector<Scalar>& indices) const
+    {
+        if (static_cast<int>(indices.size()) != GetValue().GetLayout().NumDimensions())
+        {
+            throw utilities::InputException(utilities::InputExceptionErrors::invalidSize);
+        }
+        Value indexedValue = GetContext().Offset(_value, indices);
+        indexedValue.SetLayout(utilities::ScalarLayout);
+
+        return Scalar(indexedValue).Copy();
+    }
+
+    size_t Array::Size() const { return _value.GetLayout().NumElements(); }
+
+    ValueType Array::Type() const { return _value.GetBaseType(); }
+
+    void Array::SetName(const std::string& name) { _value.SetName(name); }
+
+    std::string Array::GetName() const { return _value.GetName(); }
+
+    void For(Array array, std::function<void(const std::vector<Scalar>&)> fn)
+    {
+        auto layout = array.GetValue().GetLayout();
+        GetContext().For(layout, [fn = std::move(fn), &layout](std::vector<Scalar> coordinates) {
+            if (layout.NumDimensions() != static_cast<int>(coordinates.size()))
+            {
+                throw utilities::InputException(utilities::InputExceptionErrors::invalidSize);
+            }
+
+            fn(coordinates);
+        });
+    }
+
+} // namespace value
+} // namespace ell
diff --git a/libraries/value/src/CachingProvider.cpp b/libraries/value/src/CachingProvider.cpp
new file mode 100644
index 000000000..32ca06813
--- /dev/null
+++ b/libraries/value/src/CachingProvider.cpp
@@ -0,0 +1,39 @@
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Project:  Embedded Learning Library (ELL)
+//  File:     CachingProvider.cpp (value)
+//  Authors:  Kern Handa
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "CachingProvider.h"
+#include "LoopNests.h"
+
+#include "loopnests/LoopNest.h"
+
+namespace ell
+{
+namespace value
+{
+    void CachingProvider::Initialize(ViewAdapter view, utilities::MemoryShape cacheShape, utilities::DimensionOrder order, std::vector<loopnests::Index> kernelIndices, std::vector<loopnests::Index> atIndices, std::any extra)
+    {
+        _value = view;
+        _shape = cacheShape;
+        _order = order;
+        _kernelIndices = kernelIndices;
+        _atIndices = atIndices;
+        _extra = extra;
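+        // The parameters stashed here are consumed in HandleCaching (below),
+        // which first remaps _kernelIndices to their base (pre-split) indices
+        // and then dispatches to the strategy-specific HandleCachingImpl.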
} + + void CachingProvider::HandleCaching(LoopNest& loopnest) + { + for (auto& index : _kernelIndices) + { + index = loopnest.GetUnderlyingLoopNest().GetBaseIndex(index); + } + + HandleCachingImpl(loopnest); + } + +} // namespace value +} // namespace ell \ No newline at end of file diff --git a/libraries/value/src/CachingStrategies.cpp b/libraries/value/src/CachingStrategies.cpp new file mode 100644 index 000000000..cb51732e3 --- /dev/null +++ b/libraries/value/src/CachingStrategies.cpp @@ -0,0 +1,1943 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: CachingStrategies.cpp (value) +// Authors: Kern Handa, Mason Remy +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "ComputeContext.h" +#include "CppEmitterContext.h" +#include "LLVMContext.h" + +#include +#include +#include + +#include +#include +#include + +#if 1 // DEBUGGING +#include +#endif + +namespace ell +{ +namespace value +{ + using namespace utilities; + + // TODO : Generalize to machine characteristics and move out of CachingStrategies + struct RegisterCharacteristics + { + unsigned NumberOfSIMDRegisters; + unsigned NumberOfElementsPerSIMDRegister; + }; + + template + RegisterCharacteristics GetRegisterCharacteristics() + { + RegisterCharacteristics characteristics; + // Set some defaults for non-LLVMContext + characteristics.NumberOfSIMDRegisters = 8; + characteristics.NumberOfElementsPerSIMDRegister = 4; + InvokeForContext([&](LLVMContext& context) { + auto targetMachine = context.GetModuleEmitter().GetTargetMachine(); + auto fn = context.GetFunctionEmitter().GetFunction(); + auto info = targetMachine->getTargetTransformInfo(*fn); + // See https://llvm.org/doxygen/classllvm_1_1TargetTransformInfo.html for the big list of amazing things you can get from this TargetMachineInfo object + characteristics.NumberOfSIMDRegisters = info.getNumberOfRegisters(true); + auto SIMDRegisterBitWidth = info.getRegisterBitWidth(true); + + auto bytesPerElement = context.GetModuleEmitter().GetIREmitter().SizeOf(); + auto bitsPerElement = 8 * bytesPerElement; + characteristics.NumberOfElementsPerSIMDRegister = SIMDRegisterBitWidth / bitsPerElement; + }); + return characteristics; + } + + RegisterCharacteristics GetRegisterCharacteristics(ValueType type) + { + switch (type) + { + case ValueType::Void: + return GetRegisterCharacteristics(); + break; + case ValueType::Boolean: + return GetRegisterCharacteristics(); + break; + case ValueType::Char8: + return GetRegisterCharacteristics(); + break; + case ValueType::Byte: + return GetRegisterCharacteristics(); + break; + case ValueType::Int16: + return GetRegisterCharacteristics(); + break; + case ValueType::Int32: + return GetRegisterCharacteristics(); + break; + case ValueType::Int64: + return GetRegisterCharacteristics(); + break; + case ValueType::Float: + return GetRegisterCharacteristics(); + break; + case ValueType::Double: + return GetRegisterCharacteristics(); + break; + default: + throw InputException(InputExceptionErrors::invalidArgument, "Unrecognized or unsupported ValueType"); + } + } + + void CopyReduce(Scalar baseValue, Scalar cacheValue) + { + baseValue = cacheValue; + } + + void SumReduce(Scalar baseValue, Scalar cacheValue) + { + baseValue += cacheValue; + } + + // Makes a vector of all 
integers that are a power of base that are strictly less than N, ordered in decreasing value + std::vector GetTelescopingSizes(int N, int base = 2) + { + int maxPower = std::log2(N); + if (std::pow(base, maxPower) == N) + { + // If N is already a power of base, dont add it to the vector + maxPower--; + } + std::vector result; + result.reserve(maxPower); + for (int power = maxPower; power >= 0; --power) + { + result.push_back(static_cast(std::pow(base, power))); + } + return result; + } + + int RoundUpToMultiple(int input, int factor) + { + int remainder = input % factor; + return remainder > 0 ? input + (factor - remainder) : input; + } + + static inline void ValidateInputDimensionality(const Value& value, const MemoryShape& cacheSize, const DimensionOrder& order) + { + if (cacheSize.NumDimensions() != value.GetLayout().NumDimensions()) + { + throw LogicException(LogicExceptionErrors::illegalState, "Dimensionality of data-to-be-cached must match shape of requested cache size"); + } + if (cacheSize.NumDimensions() != order.NumDimensions()) + { + throw LogicException(LogicExceptionErrors::illegalState, "Dimensionality of dimension order must match shape of requested cache size"); + } + + if (value.GetLayout().NumDimensions() != 2) + { + throw LogicException(LogicExceptionErrors::notImplemented, "Only matrix source data is supported at this time"); + } + } + + // TODO move to Array slice code and generalize + Array SliceArray4_1(Array array, Scalar firstIndex) + { + Value indexedValue = array.GetValue().Offset({ firstIndex, 0, 0, 0 }); + auto currentLayout = array.GetValue().GetLayout(); + + indexedValue.SetLayout(currentLayout.GetSliceLayout(currentLayout.GetPhysicalDimension(0))); + + return indexedValue; + } + + Array SliceArray4_1_offset(Array array, Scalar firstIndex) + { + auto currentLayout = array.GetValue().GetLayout(); + auto memoryOffsets = currentLayout.GetOffset(); + const auto memoryOrder = currentLayout.GetLogicalDimensionOrder(); + + // TODO : replace memory offsets with absolute offset support + Value indexedValue = array.GetValue().Offset({ firstIndex - memoryOffsets[memoryOrder[0]], + 0 - memoryOffsets[memoryOrder[1]], + 0 - memoryOffsets[memoryOrder[2]], + 0 - memoryOffsets[memoryOrder[3]] }); + indexedValue.SetLayout(currentLayout.GetSliceLayout(currentLayout.GetPhysicalDimension(0))); + + return indexedValue; + } + Matrix SliceArray4_2(Array array, Scalar firstIndex, Scalar secondIndex) + { + Value indexedValue = array.GetValue().Offset({ firstIndex, secondIndex, 0, 0 }); + auto currentLayout = array.GetValue().GetLayout(); + + auto newLayout = currentLayout.GetSliceLayout(currentLayout.GetPhysicalDimension(0)); + newLayout = newLayout.GetSliceLayout(newLayout.GetPhysicalDimension(0)); + + indexedValue.SetLayout(newLayout); + return indexedValue; + } + Array SliceArray6_2(Array array, Scalar firstIndex, Scalar secondIndex) + { + Value indexedValue = array.GetValue().Offset({ firstIndex, secondIndex, 0, 0, 0, 0 }); + auto currentLayout = array.GetValue().GetLayout(); + + auto newLayout = currentLayout.GetSliceLayout(currentLayout.GetPhysicalDimension(0)); + newLayout = newLayout.GetSliceLayout(newLayout.GetPhysicalDimension(0)); + + indexedValue.SetLayout(newLayout); + return indexedValue; + } + Array SliceArray6_4(Array array, Scalar firstIndex, Scalar secondIndex, Scalar thirdIndex, Scalar fourthIndex) + { + Value indexedValue = array.GetValue().Offset({ firstIndex, secondIndex, thirdIndex, fourthIndex, 0, 0 }); + auto newLayout = array.GetValue().GetLayout(); + 
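+        // Peel off the four outermost physical dimensions one at a time; each
+        // GetSliceLayout call drops the current outermost dimension, so the
+        // result is a 2-D layout over the two innermost dimensions.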
newLayout = newLayout.GetSliceLayout(newLayout.GetPhysicalDimension(0)); + newLayout = newLayout.GetSliceLayout(newLayout.GetPhysicalDimension(0)); + newLayout = newLayout.GetSliceLayout(newLayout.GetPhysicalDimension(0)); + newLayout = newLayout.GetSliceLayout(newLayout.GetPhysicalDimension(0)); + + indexedValue.SetLayout(newLayout); + return indexedValue; + } + + void CopyInputCopyOutput::HandleCachingImpl(LoopNest& nest) + { + // throw LogicException(LogicExceptionErrors::notImplemented); + } + + void CopyInputNoOutput::HandleCachingImpl(LoopNest& nest) + { + ValidateInputDimensionality(_value, _shape, _order); + + // _shape is specified in logical dimensions, if _order is not canonical order then we need to reorder the layout + auto canonicalLayout = MemoryLayout{ _shape }; + auto orderedLayout = canonicalLayout.ReorderedCopy(_order); + + auto cacheName = UniqueName("copyInputNoOutputCache"); + auto cacheValue = StaticAllocate(cacheName, _value.GetBaseType(), orderedLayout, AllocateFlags::ThreadLocal); + cacheValue.SetName(cacheName); + auto cacheRef = cacheValue.Reference(); + cacheRef.SetName(cacheName + "Ref"); + + [[maybe_unused]] IntPtrT origAddress{}; + InvokeForContext([&] { origAddress = std::get(cacheRef.GetUnderlyingData())[0]; }); + + auto copyInputKernel = loopnests::Kernel(cacheName + "_Init_Kernel") + .Inputs(_value, cacheRef) + .Indices(_kernelIndices) + .Define([origAddress, orderedLayout](value::Matrix input, value::Value cacheRef, value::Scalar i, value::Scalar j) { + DEBUG_USED(origAddress); + InvokeForContext([&] { + [[maybe_unused]] auto addr = std::get(cacheRef.GetUnderlyingData())[0]; + assert(addr == origAddress); + }); + + Matrix cacheMatrix = cacheRef.Dereference(); + int M = static_cast(input.Rows()); + int N = static_cast(input.Columns()); + Scalar cacheRows = value::Min(M - i, orderedLayout.GetLogicalDimensionActiveSize(0)); + Scalar cacheColumns = value::Min(N - j, orderedLayout.GetLogicalDimensionActiveSize(1)); + + if (input.GetMatrixLayout() == Matrix::MatrixLayout::rowMajor) + { + ForRange(cacheRows, [&](Scalar i_inner) { + ForRange(cacheColumns, [&](Scalar j_inner) { + cacheMatrix(i_inner, j_inner) = input(i + i_inner, j + j_inner); + }); + }); + } + else + { + ForRange(cacheColumns, [&](Scalar j_inner) { + ForRange(cacheRows, [&](Scalar i_inner) { + cacheMatrix(i_inner, j_inner) = input(i + i_inner, j + j_inner); + }); + }); + } + auto offsetCacheValue = cacheMatrix.GetValue().Offset({ -1 * i, -1 * j }); + offsetCacheValue.SetLayout(orderedLayout); + cacheRef = offsetCacheValue.Reference(); + }); + + auto resetOffsetKernel = loopnests::Kernel(cacheName + "_Reset_Kernel") + .Inputs(cacheRef) + .Indices(_kernelIndices) + .Define([orderedLayout](value::Value cacheRef, value::Scalar i, value::Scalar j) { + Matrix cacheMatrix = cacheRef.Dereference(); + auto offsetCacheValue = cacheMatrix.GetValue().Offset({ i, j }); + offsetCacheValue.SetLayout(orderedLayout); + cacheRef = offsetCacheValue.Reference(); + }); + + auto& underlyingNest = nest.GetUnderlyingLoopNest(); + underlyingNest.AddKernel(copyInputKernel, loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::prologue, _atIndices, {} }); + underlyingNest.AddKernel(resetOffsetKernel, loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::epilogue, _atIndices, {} }); + underlyingNest.RenameVariable(_value, cacheRef, _atIndices, { copyInputKernel }); + } + + void ZeroInputReduceOutput::HandleCachingImpl(LoopNest& nest) + { + ValidateInputDimensionality(_value, _shape, _order); + auto 
canonicalLayout = MemoryLayout{ _shape };
+        auto orderedLayout = canonicalLayout.ReorderedCopy(_order);
+
+        auto cacheName = UniqueName("emptyInputCopyOutputCache");
+        auto tempValue = StaticAllocate(cacheName, _value.GetBaseType(), orderedLayout, AllocateFlags::ThreadLocal);
+        tempValue.SetName(cacheName);
+        Matrix temp(tempValue);
+        auto cacheRef = tempValue.Reference();
+        cacheRef.SetName(cacheName + "Ref");
+        [[maybe_unused]] IntPtrT origAddress{};
+        InvokeForContext<ComputeContext>([&] { origAddress = std::get(cacheRef.GetUnderlyingData())[0]; });
+
+        auto kernel3 = loopnests::Kernel(cacheName + "_Init_Kernel")
+                           .Inputs(cacheRef)
+                           .Indices(_kernelIndices)
+                           .Define([shape = orderedLayout](value::Value temp, value::Scalar i, value::Scalar j) {
+                               Matrix tempMatrix = temp.Dereference();
+
+                               value::For(tempMatrix, [&](value::Scalar i_inner, value::Scalar j_inner) {
+                                   tempMatrix(i_inner, j_inner) = Cast(0, tempMatrix.Type());
+                               });
+
+                               // Update cacheRef so that global (i, j) indices land in the correct spot in the cache
+                               auto cacheTmpOffset = tempMatrix.GetValue().Offset({ -1 * i, -1 * j });
+                               cacheTmpOffset.SetLayout(shape);
+                               temp = cacheTmpOffset.Reference();
+                           });
+
+        auto& underlyingNest = nest.GetUnderlyingLoopNest();
+        underlyingNest.AddKernel(kernel3, loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::prologue, _atIndices, {} });
+
+        auto kernel2 = loopnests::Kernel(cacheName + "_Reduce_Kernel")
+                           .Inputs(_value, cacheRef)
+                           .Indices(_kernelIndices)
+                           .Define([shape = orderedLayout](value::Matrix C, value::Value temp, value::Scalar i, value::Scalar j) {
+                               auto cacheTmpOffset = temp.Dereference().Offset({ i, j });
+                               cacheTmpOffset.SetLayout(shape);
+                               temp = cacheTmpOffset.Reference();
+                               auto cache = value::Matrix(temp.Dereference());
+
+                               int M = static_cast<int>(C.Rows());
+                               int N = static_cast<int>(C.Columns());
+                               Scalar extraM = value::Min(M - i, shape.GetLogicalDimensionActiveSize(0));
+                               Scalar extraN = value::Min(N - j, shape.GetLogicalDimensionActiveSize(1));
+
+                               ForRange(extraM, [&](Scalar i_inner) {
+                                   ForRange(extraN, [&](Scalar j_inner) {
+                                       C(i + i_inner, j + j_inner) += cache(i_inner, j_inner);
+                                   });
+                               });
+                           });
+        underlyingNest.AddKernel(kernel2, loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::epilogue, _atIndices, {} });
+        underlyingNest.RenameVariable(_value, cacheRef, _atIndices, { kernel2, kernel3 });
+    }
+
+    void BLASTCopy::HandleCachingImpl(LoopNest& nest)
+    {
+        /* BLAS T COPY:
+           suppose the input matrix is M x N, the cache size is M' x N', and stripeSize = 4;
+           then we cache successive M' x 4 row-major submatrices of the input matrix:
+
+           0  1  2  3   16 17 18 19        0  1  2  3  4  5  6  7  8  9  10 11 12 13 14 15 ...
+           4  5  6  7   20 21 22 23   ->   16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
+           8  9  10 11  24 25 26 27
+           12 13 14 15  28 29 30 31
+
+           Need 2 layers of caching:
+               at the M x N level, build up the cache values
+               at the stripeSize level, set up the pointer and memory layout
+        */
+
+        ValidateInputDimensionality(_value, _shape, _order);
+
+        // get the stripe size, stripe splitting index, and boundary handling mode from the extra parameters
+        auto extraParams = std::any_cast<std::tuple<int, Index, BoundaryConditionHandling>>(_extra);
+        int stripeSize;
+        Index stripeSplitIndex;
+        BoundaryConditionHandling boundaryHandling;
+        std::tie(stripeSize, stripeSplitIndex, boundaryHandling) = extraParams;
+
+        if (boundaryHandling == BoundaryConditionHandling::ZeroPadding && _shape[1] % stripeSize != 0)
+        {
+            // To avoid an odd repeated edge case, enforce that the number of cache columns is a multiple of the stripe size
+            // so the base 3D cache view can represent the full cache
+            throw InputException(InputExceptionErrors::invalidSize, "The number of cache columns must be a multiple of the cache stripe size");
+        }
+
+        // Cache structure:
+        // Lift the 2D submatrix into a 3D array to set up the cache simply
+        // The first dimension identifies which cached column block to use
+        // The remaining two dimensions identify the element inside of that cached submatrix block
+        // Index mapping: input ( i, j )    -> cache ( j / stripeSize, i, j % stripeSize )
+        //                cache ( i, j, k ) -> input ( j, i * stripeSize + k )
+
+        // Boundary handling
+        // There are 4 boundary scenarios (possibly all 4 can happen in a single input matrix + cache size combination
+        // while iterating over the matrix):
+        //         |-------N-------|
+        //         |----N'---|
+        //    _  _ *---------------*
+        //    |  | |         |     |
+        //    |  M'|    1    |  2  |
+        //    |  | |         |     |
+        //    M  _ |_________|_____|
+        //    |    |    3    |  4  |
+        //    |    |         |     |
+        //    _    *---------------*
+
+        // 1 : The cache has exactly as many rows and columns as the input matrix chunk
+        //     - This is the simple case; leave the cache as { M' x N' }
+        // 2 : The cache has more columns than the input matrix chunk (but not more rows)
+        //     - re-view the cache to be { M' x remainingColumns }
+        // 3 : The cache has more rows than the input matrix chunk (but not more columns)
+        //     - re-view the cache to be { remainingRows x N' }
+        // 4 : The cache has more rows and columns than the input matrix chunk
+        //     - re-view the cache to be { remainingRows x remainingColumns }
+        // Note: it is assumed that the input matrix is stepped over in splits based on the
+        // cache size given, so the cache can never be smaller than the input matrix chunk
+
+        // Since the matrix and cache sizes are known ahead of time, we can compute all of the boundary
+        // condition layouts that are needed:
+        //     remainingRows = M % M'
+        //     remainingColumns = N % N'
+
+        auto inputMatrix = Matrix(_value);
+        int inputRows = inputMatrix.Rows();
+        int inputCols = inputMatrix.Columns();
+        int remainingRows = inputRows % _shape[0];
+        int remainingCols = inputCols % _shape[1];
+        int roundedRemainingCols = RoundUpToMultiple(remainingCols, stripeSize);
+        // we don't need to round up remainingRows, since the stripe size only applies to columns in BLASTCopy
+
+        auto generateTCOPYCacheLayout = [stripeSize](int rows, int cols) {
+            auto cacheDimOrder = DimensionOrder{ 0, 1, 2 };
+            auto liftedShape = MemoryShape{ cols / stripeSize, rows, stripeSize };
+            auto cacheLayout = MemoryLayout{ liftedShape, cacheDimOrder };
+            return cacheLayout;
+        };
+        auto generateTCOPYCacheViewLayout = [stripeSize](int rows, int cols) {
+            auto cacheViewLayout = MemoryLayout{ { rows, stripeSize }, RowMajorMatrixOrder };
+            return cacheViewLayout;
+        };
+
+        auto
baseCacheLayout = generateTCOPYCacheLayout(_shape[0], _shape[1]); // The non-boundary-case 3D lifted shape + auto baseCacheViewLayout = generateTCOPYCacheViewLayout(_shape[0], _shape[1]); + + // "Boundary" condition 1 is the general case (i.e. non-boundary case) + auto boundaryConditionCacheLayout1 = baseCacheLayout; + auto cacheViewLayout1 = baseCacheViewLayout; + + // Boundary condition 2, re-view to M' x remainingColumns + auto boundaryConditionCacheLayout2 = generateTCOPYCacheLayout(_shape[0], roundedRemainingCols); + auto cacheViewLayout2 = generateTCOPYCacheViewLayout(_shape[0], roundedRemainingCols); + + // Boundary condition 3, re-view to remainingRows x N' + auto boundaryConditionCacheLayout3 = generateTCOPYCacheLayout(remainingRows, _shape[1]); + auto cacheViewLayout3 = generateTCOPYCacheViewLayout(remainingRows, _shape[1]); + + // Boundary condition 4, re-view to remainingRows x remainingColumns + auto boundaryConditionCacheLayout4 = generateTCOPYCacheLayout(remainingRows, roundedRemainingCols); + auto cacheViewLayout4 = generateTCOPYCacheViewLayout(remainingRows, roundedRemainingCols); + + auto cacheName = UniqueName("BLASTCopyCache"); + _rawCache = StaticAllocate(cacheName, _value.GetBaseType(), baseCacheLayout); + Array liftedCache(_rawCache); + + auto cacheRef = _rawCache.Reference(); + cacheRef.SetLayout(baseCacheViewLayout); + cacheRef.SetName(cacheName + "_Ref"); + + auto cacheFillKernel = loopnests::Kernel(cacheName + "_Fill_Cache_Kernel") + .Inputs(_value, liftedCache) + .Indices(_kernelIndices) + .Define([remainingRows, remainingCols, stripeSize, shape = _shape, inputRows, inputCols, boundaryConditionCacheLayout1, boundaryConditionCacheLayout2, boundaryConditionCacheLayout3, boundaryConditionCacheLayout4](value::Matrix input, value::Array cache, value::Scalar i, value::Scalar j) { + // We may need to re-view the cache to a smaller layout if we have less + // data to cache than we have available space in the cache. 
+ // If we re-view the cache then we can keep the smaller cached data + // physically contiguous while still using the same looping APIs + Scalar kernelRemainingRows = inputRows - i; + Scalar kernelRemainingCols = inputCols - j; + Scalar notEnoughRows = shape[0] > kernelRemainingRows; + Scalar notEnoughCols = shape[1] > kernelRemainingCols; + ZeroMemory(cache); + + // Generate the cache fill loop in a parameterized lambda so we can emit the different layout versions independently + auto cacheFillLoop = [&](MemoryLayout cacheFillLayout, int rows, int cols) { + auto cacheFillView = cache.GetValue(); + cacheFillView.SetLayout(cacheFillLayout); + auto reViewedCache = Array(cacheFillView); + + ForRange(Scalar{ 0 }, Scalar{ cols / stripeSize }, [&](Scalar stripeColumnChunk) { + ForRange(Scalar{ 0 }, Scalar{ rows }, [&](Scalar row) { + ForRange(Scalar{ 0 }, Scalar{ stripeSize }, [&](Scalar stripeColumn) { + reViewedCache({ stripeColumnChunk, row, stripeColumn }) = input(i + row, j + stripeColumnChunk * stripeSize + stripeColumn); + }); + }); + }); + auto finalColumnChunk = Scalar{ cols / stripeSize }; + ForRange(Scalar{ 0 }, Scalar{ rows }, [&](Scalar row) { + ForRange(Scalar{ 0 }, Scalar{ cols % stripeSize }, [&](Scalar stripeColumn) { + reViewedCache({ finalColumnChunk, row, stripeColumn }) = input(i + row, j + finalColumnChunk * stripeSize + stripeColumn); + }); + }); + }; + + // Emit all of the different loops individually since the cache layouts are set at emit-time + If(notEnoughRows, + [&]() { + If(notEnoughCols, + [&]() { + // Boundary condition 4 + cacheFillLoop(boundaryConditionCacheLayout4, remainingRows, remainingCols); + }) + .Else( + [&]() { + // Boundary condition 3 + cacheFillLoop(boundaryConditionCacheLayout3, remainingRows, shape[1]); + }); + }) + .ElseIf(notEnoughCols, + [&]() { + // Boundary condition 2 + cacheFillLoop(boundaryConditionCacheLayout2, shape[0], remainingCols); + }) + .Else( + [&]() { + // Boundary condition 1 + cacheFillLoop(boundaryConditionCacheLayout1, shape[0], shape[1]); + }); + }); + + auto& underlyingNest = nest.GetUnderlyingLoopNest(); + underlyingNest.AddKernel(cacheFillKernel, loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::prologue, _atIndices, {} }); + + std::vector viewInitKernelIndices; + viewInitKernelIndices.assign(_kernelIndices.begin(), _kernelIndices.end()); + viewInitKernelIndices.push_back(stripeSplitIndex); + auto viewInitKernel = loopnests::Kernel(cacheName + "_View_Init_Kernel") + .Inputs(liftedCache, cacheRef) + .Indices(viewInitKernelIndices) + .Define([shape = _shape, stripeSize, inputRows, inputCols, cacheViewLayout1, cacheViewLayout2, cacheViewLayout3, cacheViewLayout4, boundaryConditionCacheLayout1, boundaryConditionCacheLayout2, boundaryConditionCacheLayout3, boundaryConditionCacheLayout4](value::Array cache, value::Value cacheRef, value::Scalar i, value::Scalar j, value::Scalar jStripe) { + // To set up the view for the kernel to use, we need to set up the cacheRef reference + // so that a kernel indexing with (i, j) winds up in the right spot, pointing into the + // cached row-major submatrix that is the (j / stripeSize, ALL, ALL) slice of the cache array + + // We may need to re-view the cache view to a smaller layout if we are in one of the boundary conditions + Scalar remainingRows = inputRows - i; + Scalar remainingCols = inputCols - j; + Scalar notEnoughRows = shape[0] > remainingRows; + Scalar notEnoughCols = shape[1] > remainingCols; + + auto cacheViewFn = [&](MemoryLayout cacheLayout, MemoryLayout 
viewLayout) { + // Re-View the cache so we can index into the correct cached stripe + auto cacheView = cache.GetValue(); + cacheView.SetLayout(cacheLayout); + auto cacheStripe = jStripe % shape[1]; // If N > N', make sure we index into the re-initialized cache position + auto indexedCacheView = cacheView.Offset({ cacheStripe / stripeSize, 0, 0 }); + + // Re-View the indexed cache as a 2-D matrix so we can position the offset pointer for use in the inner kernels + indexedCacheView.SetLayout(viewLayout); + auto offsetIndexedCacheView = indexedCacheView.Offset({ -1 * i, -1 * j }); + offsetIndexedCacheView.SetLayout(viewLayout); + cacheRef.SetLayout(viewLayout); + cacheRef = offsetIndexedCacheView.Reference(); + }; + + // Emit all of the views and offsets individually since the cache layouts are set at emit-time + If(notEnoughRows, + [&]() { + If(notEnoughCols, + [&]() { + // Boundary condition 4 + cacheViewFn(boundaryConditionCacheLayout4, cacheViewLayout4); + }) + .Else( + [&]() { + // Boundary condition 3 + cacheViewFn(boundaryConditionCacheLayout3, cacheViewLayout3); + }); + }) + .ElseIf(notEnoughCols, + [&]() { + // Boundary condition 2 + cacheViewFn(boundaryConditionCacheLayout2, cacheViewLayout2); + }) + .Else( + [&]() { + // Boundary condition 1 + cacheViewFn(boundaryConditionCacheLayout1, cacheViewLayout1); + }); + }); + underlyingNest.AddKernel(viewInitKernel, loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::prologue, { stripeSplitIndex }, {} }); + underlyingNest.RenameVariable(_value, cacheRef, _atIndices, { cacheFillKernel, viewInitKernel }); + } + + // namespace value + + + + + + // Helper class to hold a binary tree with a MemoryLayout at each leaf node corresponding to a different + // boundary condition and with a number of levels equal to the number of dimensions in a cache layout + class BoundaryConditionMemoryLayoutHelper + { + // A multi-dimensional cache memory layout with N dimensions can have 2^N different boundary condition + // layouts since each dimension of the cache memory layout could either be in a: + // - general case - the number of elements in that dimension in this particular slice in the + // cache layout is less than or equal to the number of elements remaining in the input + // for that dimension + // - boundary case - the number of elements in that dimension in this particular slice in the cache + // layout is greater than the number of elements remaining in the input for that dimension + // + // We must generate two different types of things for these cases: + // 1) a set of memory layouts for each possible scenario - at emit time we can know all of the general + // or boundary cases that we will hit. We need the shape of the input region, the input fill region, + // the cache layout, and the cache fill layout. 
+ // 2) a nested set of emitted If/Else switches that will switch on the remaining size of the input for + // each cache dimension and call a given lambda with the appropriate boundary condition memory layout + public: + BoundaryConditionMemoryLayoutHelper(MemoryShape inputShape, + const std::vector& orderedIndexSizes, + const std::vector& logicalDimensionMapping, + const std::vector& splitIndexScaleFactors, + unsigned cacheFillThresholdIdxOffset, + unsigned cacheViewThresholdIdxOffset) : + _inputShape(inputShape), + _orderedIndexSizes(orderedIndexSizes), + _logicalDimensionMapping(logicalDimensionMapping), + _splitIndexScaleFactors(splitIndexScaleFactors), + _cacheFillThresholdIdxOffset(cacheFillThresholdIdxOffset), + _cacheViewThresholdIdx(cacheViewThresholdIdxOffset) + { + if (orderedIndexSizes.size() != logicalDimensionMapping.size()) + { + throw InputException(InputExceptionErrors::invalidSize, "Need to provide the same number of ordered index sizes as logical dimension mappings"); + } + if (orderedIndexSizes.size() != splitIndexScaleFactors.size()) + { + throw InputException(InputExceptionErrors::invalidSize, "Need to provide the same number of ordered index sizes as split index scale factors mappings"); + } + if (orderedIndexSizes.empty()) + { + throw InputException(InputExceptionErrors::invalidSize, "Need to provide at least one index size"); + } + FillTree(); + } + + template + void EmitBoundarySwitches(const std::vector& compositeIndices, Fn&& func) const + { + unsigned inputLogicalDimensionCount = _inputShape.NumDimensions(); + if (compositeIndices.size() != inputLogicalDimensionCount) + { + throw InputException(InputExceptionErrors::invalidSize, "Need to provide one scalar index value per logical dimension in the input"); + } + + // Compute how many elements are remaining in each logical dimension + std::vector remainingElements; + remainingElements.reserve(inputLogicalDimensionCount); + for (size_t logicalDimension = 0; logicalDimension < inputLogicalDimensionCount; ++logicalDimension) + { + Scalar remaining = _inputShape[logicalDimension] - compositeIndices[logicalDimension]; + remainingElements.push_back(remaining); + } + + // Determine which levels of the tree are going to be in a boundary condition based on the remaining elements + std::vector isBoundaryCase; + isBoundaryCase.reserve(_logicalDimensionMapping.size()); + for (unsigned idx = 0; idx < _logicalDimensionMapping.size(); ++idx) + { + isBoundaryCase.push_back(_orderedIndexSizes[idx] > remainingElements[_logicalDimensionMapping[idx]]); + } + + // Run a depth-first traversal of the tree to emit the nested If/Else switches to handle all the boundary conditions + RecursiveEmitHelper(_tree, isBoundaryCase, 0, func); + } + + private: + struct BoundaryConditionTreeNode + { + BoundaryConditionTreeNode(const std::vector& logicalDimensionSizes) : + cacheLogicalDimensionSizes(logicalDimensionSizes.size()), + cacheFillLogicalDimensionSizes(logicalDimensionSizes.size()), + subLogicalDimensionSizes(logicalDimensionSizes) + {} + BoundaryConditionTreeNode(const std::shared_ptr& parent, + int newCacheSize, + int newInputSize, + int logicalDimension, + const std::vector& splitIndexScaleFactors, + bool isCacheFillIdx) : + cacheSizes(parent->cacheSizes), + cacheFillSizes(parent->cacheFillSizes), + cacheLogicalDimensionSizes(parent->cacheLogicalDimensionSizes), + cacheFillLogicalDimensionSizes(parent->cacheFillLogicalDimensionSizes), + subLogicalDimensionSizes(parent->subLogicalDimensionSizes) + { + cacheSizes.push_back(newCacheSize); 
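+                // Record this level's cache extent; the first split seen for a logical
+                // dimension (below) also pins that dimension's extent in the input region
+                // covered by the cache, while deeper splits only subdivide it.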
+ + if (cacheLogicalDimensionSizes[logicalDimension] == 0) + { + cacheLogicalDimensionSizes[logicalDimension] = newInputSize; + } + + if (isCacheFillIdx) + { + cacheFillSizes.push_back(newCacheSize); + if (cacheFillLogicalDimensionSizes[logicalDimension] == 0) + { + cacheFillLogicalDimensionSizes[logicalDimension] = newInputSize; + } + } + subLogicalDimensionSizes[logicalDimension] = newInputSize; + ComputeShape(splitIndexScaleFactors); + } + + void ComputeShape(const std::vector& splitIndexScaleFactors) + { + // Create a cache shape for this level + std::vector shardSizes; + shardSizes.reserve(cacheSizes.size()); + + std::vector fillShardSizes; + fillShardSizes.reserve(cacheFillSizes.size()); + unsigned fillOffset = cacheSizes.size() - cacheFillSizes.size(); + for (unsigned idx = 0; idx < cacheSizes.size(); ++idx) + { + int shardSize = cacheSizes[idx] / splitIndexScaleFactors[idx]; + if (cacheSizes[idx] % splitIndexScaleFactors[idx] != 0) + { + // Account for partial shards + shardSize++; + } + shardSizes.push_back(shardSize); + if (idx >= fillOffset) + { + fillShardSizes.push_back(shardSize); + } + } + + cacheShape = { shardSizes }; + cacheFillShape = { fillShardSizes }; + inputRegionShape = { cacheLogicalDimensionSizes }; + inputRegionFillShape = { cacheFillLogicalDimensionSizes }; + } + + // Use shared_ptr instead of unique_ptr since we need to be able to copy these helper objects into multiple lambdas + std::shared_ptr generalCase; + std::shared_ptr boundaryCase; + std::vector cacheSizes; + std::vector cacheFillSizes; + std::vector cacheLogicalDimensionSizes; // logical dimension sizes represented by the full cache + std::vector cacheFillLogicalDimensionSizes; // logical dimension sizes represented by the fill view of the cache + std::vector subLogicalDimensionSizes; // logical input dimension sizes represented by this portion of the tree + MemoryShape cacheShape; + MemoryShape cacheFillShape; + MemoryShape inputRegionShape; + MemoryShape inputRegionFillShape; + }; + + void FillTree() + { + int logicalDimensionCount = _inputShape.NumDimensions(); + std::vector baseLogicalDimensionCacheSizes; + baseLogicalDimensionCacheSizes.reserve(logicalDimensionCount); + for (int logicalDimension = 0; logicalDimension < logicalDimensionCount; ++logicalDimension) + { + baseLogicalDimensionCacheSizes.push_back(_inputShape[logicalDimension]); + } + _tree = std::make_shared(baseLogicalDimensionCacheSizes); + std::queue> activeNodes; + activeNodes.push(_tree); + + // Compute the minimum sizes for remainders / boundary cases for each level + // A remainder / boundary case needs to cover an integer number of the splits that + // occur later on in that logical dimension, so we compute the possible sizes for + // the remainders in each cache dimension up front + // E.g. if we have a cache that is 4x4, with a split of 2 in the column dimension, + // ordered at {1,0,1} so that the cache size is {2,4,2}, but our input is + // 4x3, we need to zero-pad the innermost dimension since we need to keep + // an integer number of them, and thus behave as though it's still a 4x4 + // input region and we have a {2,4,2} cache. 
+ // If instead we have 4x2 input, we can reduce the outermost dimension shard + // count by 1 and still cover an integer number of the inner splits with a + // {1,4,2} cache + std::vector remainderMinimumSizes(_logicalDimensionMapping.size()); + std::map logicalDimensionWorkingSizes; + // loop from the innermost split dimension to the outermost + for (unsigned idx = _logicalDimensionMapping.size() - 1; _logicalDimensionMapping.size() > idx; --idx) + { + int logicalDimension = _logicalDimensionMapping[idx]; + auto workingSizeIter = logicalDimensionWorkingSizes.find(logicalDimension); + int size = _orderedIndexSizes[idx]; + if (workingSizeIter == logicalDimensionWorkingSizes.end()) + { + remainderMinimumSizes[idx] = 1; + } + else + { + remainderMinimumSizes[idx] = logicalDimensionWorkingSizes[logicalDimension]; + } + logicalDimensionWorkingSizes[logicalDimension] = size; + } + + for (unsigned idx = 0; idx < _logicalDimensionMapping.size(); ++idx) + { + int logicalDimension = _logicalDimensionMapping[idx]; + int cacheSplitSize = _orderedIndexSizes[idx]; + size_t numNodesInLevel = activeNodes.size(); + for (unsigned nodeIdx = 0; nodeIdx < numNodesInLevel; ++nodeIdx) + { + auto currentNode = activeNodes.front(); + activeNodes.pop(); + + int baseLogicalInputSize = currentNode->subLogicalDimensionSizes[logicalDimension]; + int baseRemainderSize = baseLogicalInputSize % cacheSplitSize; + + // round up the logical input size based on the remainder minimum size for this dimension + int logicalInputSize = RoundUpToMultiple(baseLogicalInputSize, remainderMinimumSizes[idx]); + int remainderSize = logicalInputSize % cacheSplitSize; + + if (idx > _cacheViewThresholdIdx || remainderSize == 0) + { + // We can't reshape the cache view, so if we're inside of the view portion + // of the cache we need to zero-pad + // As a half-step to keep the cache as dense as possible when we're in a boundary condition, + // we let the first cacheViewThresholdIdx be shrunk for the purposes of creating + // the cache layout, since this idx is definitely in the most-major dimension of + // the cache view as it is the farthest out. 
Therefore we only consider if + // idx > _cacheViewThresholdIdx instead of idx >= _cacheViewThresholdIdx + + // Additionally, if after rounding up the logical input size we've produced + // an integer multiple of cacheSplitSize, we need to generate a boundary condition + // branch with the full cacheSplitSize as the cache size, but with the base remainder + // size as the input size + remainderSize = cacheSplitSize; + } + + if (cacheSplitSize <= logicalInputSize) + { + currentNode->generalCase = std::make_shared(currentNode, + cacheSplitSize, + cacheSplitSize, + logicalDimension, + _splitIndexScaleFactors, + idx >= _cacheFillThresholdIdxOffset); + activeNodes.push(currentNode->generalCase); + } + if (baseRemainderSize > 0) + { + currentNode->boundaryCase = std::make_shared(currentNode, + remainderSize, + baseRemainderSize, + logicalDimension, + _splitIndexScaleFactors, + idx >= _cacheFillThresholdIdxOffset); + activeNodes.push(currentNode->boundaryCase); + } + } + } + } + + template + void RecursiveEmitHelper(const std::shared_ptr& currentNode, const std::vector& isBoundaryCase, unsigned currentIdx, Fn&& func) const + { + if (currentNode->generalCase == nullptr && currentNode->boundaryCase == nullptr) + { + // Base case, call the given function with our cache shape for this leaf of the tree + func(currentNode->inputRegionShape, currentNode->inputRegionFillShape, currentNode->cacheShape, currentNode->cacheFillShape); + } + else if (currentNode->generalCase == nullptr) + { + // we only have a boundary case, so don't emit an If/Else but instead just recurse to the boundary case + RecursiveEmitHelper(currentNode->boundaryCase, isBoundaryCase, currentIdx + 1, func); + } + else if (currentNode->boundaryCase == nullptr) + { + // we only have a general case, so don't emit an If/Else but instead just recurse to the general case + RecursiveEmitHelper(currentNode->generalCase, isBoundaryCase, currentIdx + 1, func); + } + else + { + // We have both a general case and a boundary case, so emit an If/Else switch to emit both cases + If(isBoundaryCase[currentIdx], [&] { + RecursiveEmitHelper(currentNode->boundaryCase, isBoundaryCase, currentIdx + 1, func); + }).Else([&] { + RecursiveEmitHelper(currentNode->generalCase, isBoundaryCase, currentIdx + 1, func); + }); + } + } + + MemoryShape _inputShape; + std::vector _orderedIndexSizes; + std::vector _logicalDimensionMapping; + std::vector _splitIndexScaleFactors; + unsigned _cacheFillThresholdIdxOffset; + unsigned _cacheViewThresholdIdx; + std::shared_ptr _tree; // Use shared_ptr's since we need to be able to copy these helper objects into multiple lambdas + }; + + std::pair ComputeCacheView(MemoryLayout cacheLayout, + const std::vector& cacheLogicalDimensionMapping, + int logicalDimensionCount) + { + std::vector cacheViewSizes; + cacheViewSizes.reserve(logicalDimensionCount); + std::vector dimensionOrdering(logicalDimensionCount, -1); // initialize the dimensionOrdering since we will be filling it out-of-order + + // Iterate the cacheLogicalDimensionMapping from back to front in order to walk the shape + // from the inner splits to the outer splits + int previousLogicalDimension = -1; // -1 == sentinel uninitialized value, not any of the logical dimensions + unsigned cacheViewThresholdIdx = cacheLogicalDimensionMapping.size() - 1; // index in the logical dimension mapping / split indices vector to start the cache view at + unsigned currentDimensionOrderingIdx = dimensionOrdering.size() - 1; + std::set seenLogicalDimensions; + std::map 
logicalDimensionToCacheViewSize; // maps from logical dimension to the cache view size + for (unsigned idx = cacheLogicalDimensionMapping.size() - 1; cacheLogicalDimensionMapping.size() > idx; --idx) + { + int logicalDimension = cacheLogicalDimensionMapping[idx]; + if (previousLogicalDimension != logicalDimension) + { + // This is different from the previous logical dimension that we were collapsing + if (seenLogicalDimensions.find(logicalDimension) != seenLogicalDimensions.end()) + { + // If we've seen this logical dimension before and we aren't currently collapsing it then this is a repeat + // that prompts us to stop building up the cache view + break; + } + else + { + // this is the first time we've seen this dimension, so insert it into the dimension ordering outside of the + // dimensions we've already seen + dimensionOrdering[currentDimensionOrderingIdx--] = logicalDimension; + seenLogicalDimensions.insert(logicalDimension); + previousLogicalDimension = logicalDimension; + } + } + cacheViewThresholdIdx = idx; + } + + // Now we know the sizes of all the dimensions in the view and we need to fill the remainder of + // the dimension ordering with any logical dimensions in the input that aren't part of the cache view. + // Any logical dimensions that aren't part of the cache view have a cache view size of 1, and thus + // the ordering of them doesn't really matter since we'll re-view the cache before changing the + // index in that dimension we examine + for (int logicalDimension = 0; logicalDimension < logicalDimensionCount; ++logicalDimension) + { + // Set the size to 1, for logical dimensions that are also in the view, we will multiply + // this value by the shard sizes in the view + logicalDimensionToCacheViewSize[logicalDimension] = 1; + if (seenLogicalDimensions.find(logicalDimension) == seenLogicalDimensions.end()) + { + // This dimension isn't part of the view, so insert it in the dimension ordering + // outside of the dimensions that are in the view + dimensionOrdering[currentDimensionOrderingIdx--] = logicalDimension; + } + } + + // Now we need to build up the sizes of the view dimensions by taking the product of cache dimension + // sizes within each logical dimension after the point in the cache hierarchy where the view starts. + // We take the product of the sizes because the active sizes at each cache dimension represent the + // number of shards in that split dimension, not necessarily element count in that logical dimension. 
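The back-to-front walk above collapses immediately repeated logical dimensions and stops the view at the first non-adjacent repeat. A standalone sketch of just that collapse-and-stop logic, with a hypothetical `FindViewThreshold` helper over plain ints (a non-empty mapping is assumed):

```cpp
#include <cassert>
#include <cstddef>
#include <set>
#include <vector>

// Walk a cache's logical-dimension mapping from the innermost split outward,
// collapsing immediate repeats; stop when an already-seen dimension reappears.
// Returns the index where the contiguous "view" portion of the cache starts.
size_t FindViewThreshold(const std::vector<int>& mapping)
{
    std::set<int> seen;
    int previous = -1; // sentinel: no dimension seen yet
    size_t threshold = mapping.size() - 1;
    for (size_t i = mapping.size(); i-- > 0;)
    {
        int dim = mapping[i];
        if (dim != previous)
        {
            if (seen.count(dim) > 0)
            {
                break; // non-adjacent repeat: the view cannot extend further out
            }
            seen.insert(dim);
            previous = dim;
        }
        threshold = i;
    }
    return threshold;
}

int main()
{
    // { 0, 0, 1, 1, 1, 0, 0 } collapses to { 0, 1, 0 }; the view breaks at the
    // inner { 1, 0 }, i.e. at index 2 of the uncollapsed mapping.
    assert(FindViewThreshold({ 0, 0, 1, 1, 1, 0, 0 }) == 2);
}
```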
+    // In the innermost split in each logical dimension the shards are all of size 1 and therefore
+    // shard count == element count
+    for (unsigned idx = cacheViewThresholdIdx; idx < cacheLogicalDimensionMapping.size(); ++idx)
+    {
+        int logicalDimension = cacheLogicalDimensionMapping[idx];
+        logicalDimensionToCacheViewSize[logicalDimension] *= cacheLayout.GetActiveSize(idx);
+    }
+
+    // Now that we have the full view dimension ordering and a map from logical dimension to view size,
+    // fill out the ordered view sizes vector
+    for (unsigned idx = 0; idx < dimensionOrdering.size(); ++idx)
+    {
+        int logicalDimension = dimensionOrdering[idx];
+        cacheViewSizes.push_back(logicalDimensionToCacheViewSize[logicalDimension]);
+    }
+
+    return { MemoryLayout{ MemoryShape{ cacheViewSizes }, DimensionOrder{ dimensionOrdering } }, cacheViewThresholdIdx };
+}
+
+void GeneralCachingStrategy::HandleCachingImpl(LoopNest& nest)
+{
+    // General caching strategy:
+    // Given:
+    // - input value
+    // - top level indices that the input uses
+    // - name for the cache
+    // - size of the cache to use in # of elements
+    // - # of elements to cache at a time ( < size of cache for progressive caching, > size of cache is an error)
+    // - Input / InputOutput / Output designation
+    // - Reduce function operating on individual Scalars
+    //
+    // Set up 3-4 kernels:
+    // - Cache flushing kernel
+    // - Cache filling kernel if Input/InputOutput
+    // - Cache viewing kernel (based on the shape of the input value)
+    // - Cache reduce kernel if InputOutput/Output
+
+    auto extraParams = std::any_cast<std::tuple<value::ArgumentType,
+                                                std::string,
+                                                size_t,
+                                                size_t,
+                                                std::function<void(value::Scalar, value::Scalar)>,
+                                                bool>>(_extra);
+    value::ArgumentType argType;
+    std::string baseName;
+    size_t maxCacheElts;
+    size_t fillThreshold; // fillThreshold <= maxCacheElts
+    std::function<void(value::Scalar, value::Scalar)> reduceFunction;
+    bool accumulateReduce;
+    std::tie(argType,
+             baseName,
+             maxCacheElts,
+             fillThreshold,
+             reduceFunction,
+             accumulateReduce) = extraParams;
+
+    // Read target machine characteristics for number of SIMD registers and the size of the registers
+    RegisterCharacteristics registerCharacteristics = GetRegisterCharacteristics(_value.GetBaseType());
+
+    // Determine kernels needed
+    bool useFillKernel = (argType == value::ArgumentType::Input || argType == value::ArgumentType::InputOutput);
+    bool useViewKernel = true; // always include view kernel for simplicity for now, even if the re-viewing winds up being redundant
+    bool useReduceKernel = (argType == value::ArgumentType::Output || argType == value::ArgumentType::InputOutput);
+
+    size_t bufferAlignment = 16 * sizeof(float);
+
+    InvokeForContext<CppEmitterContext>([&] {
+        // TODO : Support buffer alignment in CppEmitterContext
+        bufferAlignment = 0;
+    });
+
+    auto inputArray = Array(_value);
+    int logicalDimensionCount = _value.GetLayout().NumDimensions();
+    int compositeIndexCount = _kernelIndices.size();
+    auto& underlyingNest = nest.GetUnderlyingLoopNest();
+
+    const auto& loopSequence = underlyingNest.GetLoopSequence();
+    std::vector<loopnests::Index> orderedIndices;
+    for (const auto& index : loopSequence)
+    {
+        const auto& dimensionIndex = underlyingNest.GetDimensionRange(index).GetDimensionIndex();
+        auto indexIter = std::find(_kernelIndices.begin(), _kernelIndices.end(), dimensionIndex);
+        if (indexIter != _kernelIndices.end())
+        {
+            orderedIndices.push_back(index);
+        }
+    }
+
+    // Ensure we have some indices
+    if (orderedIndices.empty())
+    {
+        throw InputException(InputExceptionErrors::invalidSize, "Don't have any indices relevant to this input for this loop nest");
+    }
+
+    // If there are no _atIndices specified, default to the outermost orderedIndices index
+    if (_atIndices.empty())
+    {
+        _atIndices.push_back(orderedIndices.front());
+    }
+
+    // Compute the mapping between the orderedIndices list and the logical input dimensions
+    std::vector<int> logicalDimensionMapping;
+    logicalDimensionMapping.reserve(orderedIndices.size());
+
+    // Determine the size for each split for each logical dimension
+    // We only care about the split indices that are passed in as part of
+    // orderedIndices, so instead of recording the sizes of those indices,
+    // we record the size of the full index range followed by the increments
+    // of each of the orderedIndices
+    std::map<int, std::vector<int>> logicalDimensionSplitSizes;
+    for (int logicalDimension = 0; logicalDimension < logicalDimensionCount; ++logicalDimension)
+    {
+        logicalDimensionSplitSizes[logicalDimension].push_back(_value.GetLayout().GetActiveSize(logicalDimension));
+    }
+
+    // Determine the increments for each split index in the orderedIndices
+    // The cache dimensions all operate with logical increments of 1, so when we are mapping between input space and cache space
+    // we need to scale appropriately by the split index increments for each split index
+    std::vector<int> orderedIndexIncrements;
+    orderedIndexIncrements.reserve(orderedIndices.size());
+
+    for (const auto& index : orderedIndices)
+    {
+        // Compute the logical dimension mapping
+        const auto& dimensionIndex = underlyingNest.GetDimensionRange(index).GetDimensionIndex();
+        auto indexIter = std::find(_kernelIndices.begin(), _kernelIndices.end(), dimensionIndex);
+        // Here we assume:
+        // - _kernelIndices is a vector or similar, so (iterator - begin) == idx of iterator
+        // - _kernelIndices is arranged in logical dimension ordering for this input
+        int logicalDimension = indexIter - _kernelIndices.begin();
+        logicalDimensionMapping.push_back(logicalDimension);
+
+        // Find the index increment for this index to use for scaling index values to
+        // convert between cache dimensions and input indices
+        // Also use this for the logical dimension split sizes
+        auto increment = underlyingNest.GetIndexRange(index).Increment();
+        orderedIndexIncrements.push_back(increment);
+        logicalDimensionSplitSizes[logicalDimension].push_back(increment);
+    }
+
+    // Compute the memory shape for the cache based on the index sizes in each logical
+    // dimension. Each MemoryShape dimension counts the number of shards of the cache
+    // that dimension indexes over, so the size of each MemoryShape dimension ought to be
+    // the size of the index divided by the size of the next split index in the same
+    // logical input dimension.
+    // e.g. if Index i ranges over [0,64), and is split by 32, then by 16, then by 4
+    // we will have split indices [0,64,32), [0,32,16), [0,16,4), and [0,4,1),
+    // but suppose a cache doesn't use the second index, i.e. it only uses
+    // [0,64,32), [0,16,4), and [0,4,1), then the memory shape (for split dimensions
+    // in the i logical dimension) should be { 4, 4, 4 } since the outer index
+    // ranging from 0 to 64 accounts for 4 shards of 16
+    // and the next index ranging from 0 to 16 accounts for 4 shards of 4
+    // and the next index ranging from 0 to 4 accounts for 4 shards of 1
+    //
+    // Now that we have the base dimension size and all the increments for the indices we're using
+    // we can compute the shard sizes for each logical dimension by dividing each dimension split
+    // size we accumulated above by the size that comes after it, indicating how many instances of
+    // the next shard occur within the current shard
+    std::map<int, std::queue<int>> logicalIndexToShardSizes;
+    std::map<int, std::queue<int>> logicalIndexToSizes; // Full element counts, not shard counts
+    for (int logicalDimension = 0; logicalDimension < logicalDimensionCount; ++logicalDimension)
+    {
+        const auto& splitSizes = logicalDimensionSplitSizes[logicalDimension];
+        for (unsigned splitIdx = 0; splitIdx < splitSizes.size() - 1; ++splitIdx)
+        {
+            int currentSize = splitSizes[splitIdx];
+            int nextSize = splitSizes[splitIdx + 1];
+            int shardSize = currentSize / nextSize;
+            if (currentSize % nextSize != 0)
+            {
+                // Round up to account for partial shards
+                shardSize++;
+            }
+            logicalIndexToShardSizes[logicalDimension].push(shardSize);
+            logicalIndexToSizes[logicalDimension].push(currentSize);
+        }
+    }
+
+    // Now that we have the shard sizes grouped by logical dimension, arrange them to match
+    // the orderedIndices
+    std::vector<int> orderedIndexShardSizes;
+    std::vector<int> orderedIndexSizes; // Full element counts, not shard counts
+    orderedIndexShardSizes.reserve(orderedIndices.size());
+    orderedIndexSizes.reserve(orderedIndices.size());
+    for (unsigned idx = 0; idx < logicalDimensionMapping.size(); ++idx)
+    {
+        int logicalDimension = logicalDimensionMapping[idx];
+
+        orderedIndexShardSizes.push_back(logicalIndexToShardSizes[logicalDimension].front());
+        logicalIndexToShardSizes[logicalDimension].pop();
+
+        orderedIndexSizes.push_back(logicalIndexToSizes[logicalDimension].front());
+        logicalIndexToSizes[logicalDimension].pop();
+    }
+
+    // Create a MemoryShape for the cache based on the shard counts
+    // This isn't the final cache shape and layout yet - we may need to shrink it to fit the number
+    // of elements requested in the cache
+    MemoryShape fullInputShape = { orderedIndexShardSizes };
+    MemoryLayout fullInputLayout = { fullInputShape };
+
+    // Physical Cache
+    // Determine how large the physical cache ought to be by trying to cover complete view
+    // dimensions without exceeding maxCacheElts elements in size.
+    // e.g. if the full view has 5 dimensions, and our maxCacheElts only covers the innermost two dimensions,
+    // then the cache size is set to that size and we create our "fill" and "reduce" kernels accordingly
+    // To achieve this, start from the base full cache layout and slice off physical dimensions going from the
+    // outermost to the innermost until the full extent has no more than maxCacheElts elements
+    MemoryLayout cacheLayout = fullInputLayout;
+    unsigned cacheThresholdIdx = 0;
+    while (cacheLayout.GetMemorySize() > maxCacheElts)
+    {
+        cacheLayout = cacheLayout.GetSliceLayout(0);
+        cacheThresholdIdx++;
+    }
+    if (cacheLayout.NumElements() == 0)
+    {
+        throw InputException(InputExceptionErrors::invalidSize, "Specified cache size isn't large enough to cover the smallest dimension of the cache layout");
+    }
+    std::vector<int> cacheOrderedIndexSizes(orderedIndexSizes.begin() + cacheThresholdIdx, orderedIndexSizes.end());
+    std::vector<int> cacheLogicalDimensionMapping(logicalDimensionMapping.begin() + cacheThresholdIdx, logicalDimensionMapping.end());
+    std::vector<int> cacheOrderedIndexIncrements(orderedIndexIncrements.begin() + cacheThresholdIdx, orderedIndexIncrements.end());
+    auto cacheName = UniqueName(baseName);
+    _rawCache = StaticAllocate(cacheName, _value.GetBaseType(), cacheLayout);
+
+    // Progressive Caching
+    // To enable progressive caching, where a subset of the full physical cache is
+    // filled and used, then later the next chunk of the physical cache is filled
+    // and used, we need to find the dimension split at which fillThreshold elements
+    // is surpassed and set up a fill kernel at that point
+    // If fillThreshold == maxCacheElts or they are both exceeded in the same
+    // split, then ensure that the fill kernel occurs after the cache emptying kernel
+    if (fillThreshold > maxCacheElts)
+    {
+        throw InputException(InputExceptionErrors::invalidArgument, "Fill threshold can't be larger than the max cache size");
+    }
+    unsigned cacheFillThresholdIdx = cacheThresholdIdx;
+    MemoryLayout cacheFillLayout = cacheLayout;
+    while (cacheFillLayout.GetMemorySize() > fillThreshold)
+    {
+        cacheFillLayout = cacheFillLayout.GetSliceLayout(0);
+        cacheFillThresholdIdx++;
+    }
+    if (cacheFillLayout.NumElements() == 0)
+    {
+        throw InputException(InputExceptionErrors::invalidSize, "Specified cache fill threshold size isn't large enough to cover the smallest dimension of the cache layout");
+    }
+    std::vector<int> cacheFillOrderedIndexSizes(orderedIndexSizes.begin() + cacheFillThresholdIdx, orderedIndexSizes.end());
+    std::vector<int> cacheFillLogicalDimensionMapping(logicalDimensionMapping.begin() + cacheFillThresholdIdx, logicalDimensionMapping.end());
+    std::vector<int> cacheFillOrderedIndexIncrements(orderedIndexIncrements.begin() + cacheFillThresholdIdx, orderedIndexIncrements.end());
+
+    // Cache View
+    // The cache view needs to have the same number of dimensions as the input value
+    // but cover an area that is a subset of the full cache and represents one cache
+    // dimension per logical input dimension.
+    // This may mean that for some of the logical input dimensions, the cache view
+    // size is 1, e.g. suppose a 3-D input is cached where the inner 3 dimensions of
+    // the cache only operate over two of the logical dimensions of the input while the
+    // two innermost dimensions of those operate over the two distinct input logical
+    // dimensions. In that case the cache view would cover the inner two cache dimensions
+    // and have a 1 for the third dimension size.
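The two slicing loops above share one shape: treat the layout as a list of dimension extents and drop the outermost until the product of what remains fits the element budget. A sketch with plain integer shapes in place of `MemoryLayout` (the real code throws when nothing fits; this sketch simply stops):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Drop outermost dimensions until the remaining extent fits the element budget.
// Returns how many leading dimensions were sliced away (the "threshold index").
size_t FitToBudget(std::vector<int> shape, size_t maxElts)
{
    auto numElements = [](const std::vector<int>& s) {
        size_t n = 1;
        for (int d : s) n *= d;
        return n;
    };
    size_t thresholdIdx = 0;
    while (!shape.empty() && numElements(shape) > maxElts)
    {
        shape.erase(shape.begin()); // analogous to GetSliceLayout(0)
        ++thresholdIdx;
    }
    return thresholdIdx;
}

int main()
{
    // A { 4, 4, 4 } shard shape holds 64 shards; a budget of 20 keeps { 4, 4 }.
    assert(FitToBudget({ 4, 4, 4 }, 20) == 1);
}
```

Running the same loop twice, once with `maxCacheElts` and once with the smaller `fillThreshold`, is what yields the two nested thresholds used below.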
+ // In general, the cache view needs to cover an area of the cache that can be + // contiguously represented like the logical input value. + + // To build up the cache view layout, start from the innermost dimension of the + // cache layout and accumulate dimensions going outward until either all of the + // logical input dimensions are accounted for or one of the logical input dimensions + // repeats. However, when a single dimension is repeated multiple times in a row, + // those repeats can be collapsed into a single visiting of that dimension. These + // can be collapsed because the logical behavior is the same regardless of whether + // the split that produced the repeated dimension was made or not. + // E.g. suppose your dimensions are { 0, 0, 1, 1, 1, 0, 0 }, then the first pair of + // 0's can be collapsed and treated like a single visiting of that dimension, + // the set of 3 1's can be collapsed, and the final pair of 0's can be collapsed, + // producing a collapsed dimension ordering of { 0, 1, 0 }. With a collapsed + // dimension ordering of { 0, 1, 0 }, the cache view needs to break at the inner + // { 1, 0 }, because after that a dimension (the 0 dimension) will repeat. + MemoryLayout baseCacheViewLayout; + unsigned cacheViewThresholdIdxOffset; + std::tie(baseCacheViewLayout, cacheViewThresholdIdxOffset) = ComputeCacheView(cacheFillLayout, + cacheFillLogicalDimensionMapping, + logicalDimensionCount); + unsigned cacheViewThresholdIdx = cacheFillThresholdIdx + cacheViewThresholdIdxOffset; + + auto cacheRef = _rawCache.Reference(); + cacheRef.SetLayout(baseCacheViewLayout); + + // Boundary Conditions + // Boundary conditions occur when the region of the input value that we want + // to cache does not fill the physical cache, + // e.g. for a matrix cache there are 4 cases, 3 of which are considered boundary condition cases: + // Suppose the matrix is M x N and the physical cache is sized to hold M' x N' elements, + // where M / 2 < M' < M, N / 2 < N' < N + // |-------N-------| + // |----N'---|----N'---| + // _ _ *---------------* + // | | | | | + // | M'| 1 | 2 | + // | | | | | + // M _ |_________|_____| + // | | | 3 | 4 | + // | M'| | | + // _ | *---------------* + // _ + // 1 : The cache has exactly as many rows and columns as the input matrix chunk + // 2 : The cache has more columns than the matrix chunk but just as many rows + // 3 : The cache has more rows than the matrix chunk but just as many columns + // 4 : The cache has more rows and columns than the matrix chunk + // + // One possible solution is to zero-pad the cache and keep the layout as-is. This would certainly work + // + // However, in order to maximize data locality in the cache (which is the purpose of the cache), + // we would prefer it if the cache were reshaped such that the input value chunk + // fills the cache from the beginning until the end of the chunk without any gaps. 
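The four cases in the diagram fall directly out of comparing the cache extents against the elements remaining in each dimension, which is the per-level test `EmitBoundarySwitches` emits. A standalone sketch for the two-dimensional case (hypothetical helper, plain ints):

```cpp
#include <cassert>

// Classify which of the four diagram cases applies, given the rows/columns
// remaining in the input and the physical cache extents M' x N'.
// Case 1 is the only non-boundary case.
int BoundaryCase(int remainingRows, int remainingCols, int cacheRows, int cacheCols)
{
    bool rowBoundary = cacheRows > remainingRows; // cache taller than the chunk
    bool colBoundary = cacheCols > remainingCols; // cache wider than the chunk
    if (!rowBoundary && !colBoundary) return 1;
    if (!rowBoundary && colBoundary) return 2;
    if (rowBoundary && !colBoundary) return 3;
    return 4;
}

int main()
{
    // 10x10 input, 6x6 cache: the last chunk in each direction is only 4 wide/tall.
    assert(BoundaryCase(10, 10, 6, 6) == 1); // full chunk
    assert(BoundaryCase(10, 4, 6, 6) == 2);  // short on columns
    assert(BoundaryCase(4, 10, 6, 6) == 3);  // short on rows
    assert(BoundaryCase(4, 4, 6, 6) == 4);   // short on both
}
```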
+    // This reshape amounts to shrinking the cache sizes in some dimensions, however to preserve
+    // vectorization behavior we avoid shrinking the innermost dimension and instead zero-pad
+    // that dimension
+    unsigned cacheFillThresholdIdxOffset = cacheFillThresholdIdx - cacheThresholdIdx;
+    unsigned cacheViewThresholdIdxCacheOffset = cacheViewThresholdIdxOffset + cacheFillThresholdIdxOffset;
+    BoundaryConditionMemoryLayoutHelper boundaryConditionCacheHelper(_value.GetLayout().GetActiveSize(), cacheOrderedIndexSizes, cacheLogicalDimensionMapping, cacheOrderedIndexIncrements, cacheFillThresholdIdxOffset, cacheViewThresholdIdxCacheOffset);
+
+    std::vector<loopnests::Kernel> cachingKernels;
+
+    {
+        // Flush the cache to implicitly zero-pad any regions of the cache we don't fill later
+        std::vector<loopnests::Index> cacheFlushPosition(orderedIndices.begin(), orderedIndices.begin() + cacheThresholdIdx);
+        auto cacheEmptyKernel = loopnests::Kernel(cacheName + "_Empty_Cache_Kernel")
+                                    .Inputs(_rawCache)
+                                    .Indices()
+                                    .Define([](Value cache) {
+                                        // TODO : determine if a vectorized approach is worthwhile here
+                                        ZeroMemory(cache);
+                                    });
+
+        underlyingNest.AddKernel(cacheEmptyKernel, loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::prologue, cacheFlushPosition, {} });
+        cachingKernels.push_back(cacheEmptyKernel);
+    }
+    if (useFillKernel)
+    {
+        std::vector<loopnests::Index> cacheFillPosition(orderedIndices.begin(), orderedIndices.begin() + cacheFillThresholdIdx);
+        std::vector<loopnests::Index> cacheFillIndices(_kernelIndices.begin(), _kernelIndices.end());
+        cacheFillIndices.insert(cacheFillIndices.end(), cacheFillPosition.begin(), cacheFillPosition.end());
+
+        auto cacheFillKernel = loopnests::Kernel(cacheName + "_Fill_Cache_Kernel")
+                                   .Inputs(_value, _rawCache)
+                                   .Indices(cacheFillIndices)
+                                   .DefineEx([=](std::vector<Value> values, std::vector<Scalar> indices) {
+                                       auto& input = values[0];
+                                       auto& cache = values[1];
+                                       std::vector<Scalar> compositeIndexValues(indices.begin(), indices.begin() + compositeIndexCount);
+                                       std::vector<Scalar> splitIndexValues(indices.begin() + compositeIndexCount, indices.end());
+
+                                       auto offsetInput = input.Offset(compositeIndexValues);
+                                       offsetInput.SetLayout(input.GetLayout());
+                                       auto offsetInputArrayView = Array(offsetInput);
+
+                                       boundaryConditionCacheHelper.EmitBoundarySwitches(compositeIndexValues, [=](MemoryLayout inputRegionShape, MemoryLayout inputRegionFillShape, MemoryLayout boundaryCacheLayout, MemoryLayout boundaryCacheFillLayout) {
+                                           // Offset the cache write head based on where we are in the progressive caching
+                                           // Since fillThreshold <= maxCacheElts, we may run this kernel multiple times filling
+                                           // different portions of the cache, so we look at the indices between the
+                                           // cacheThresholdIdx and the cacheFillThresholdIdx to find what position we need to
+                                           // offset to
+                                           // these indices all map in order to the dimensions that are in the cache and outside
+                                           // the fill region since the cache memory ordering is based on these indices in this order
+
+                                           auto cacheView = cache;
+                                           cacheView.SetLayout(boundaryCacheLayout);
+                                           std::vector<Scalar> cacheOffsetIndices;
+                                           cacheOffsetIndices.reserve(boundaryCacheLayout.NumDimensions());
+
+                                           // Note: if cacheThresholdIdx == cacheFillThresholdIdx (i.e.
if there is no progressive caching) + // Then the first loop is skipped and no offsetting occurs, and therefore filling the cache from + // the beginning every time this kernel is run + for (unsigned idx = cacheThresholdIdx; idx < cacheFillThresholdIdx; ++idx) + { + // Mapping loopnest indices (input space) -> cache offsets (cache space) so divide by split index increment + cacheOffsetIndices.push_back(splitIndexValues[idx] / orderedIndexIncrements[idx]); + } + for (unsigned idx = cacheFillThresholdIdx; idx < static_cast(fullInputLayout.NumDimensions()); ++idx) + { + cacheOffsetIndices.push_back(Scalar{ 0 }); + } + auto offsetCache = cacheView.Offset(cacheOffsetIndices); + offsetCache.SetLayout(boundaryCacheFillLayout); + auto cacheFillArrayView = Array(offsetCache); + + // Prefer input-oriented loops to maximize locality as the input + // is likely to be larger than the cache in most cases + // Based on the element size and counts in different dimensions, + // we will split and unroll some of the inner loops in order to maximize + // vectorization. + // In order to get appropriate utilization of all the SIMD + // registers, we will need to use a temporary buffer (which we expect + // the compiler to optimize away) with a size equal to the total number + // of elements that can be held in all of the SIMD registers. + // The filling of this temporary buffer from the input needs to be an + // unrolled operation and the filling of the cache from the temporary + // buffer also needs to be an unrolled operation that happens after + // the full temporary buffer has been filled. + // Therefore, we need multiple levels of loopnests so that the area + // outside of the temporary buffer's addressable region can be looped + // over, and the area inside the temporary buffer region can have two + // sequential fully unrolled loopnests. + // new loopnest (outer): + // For ... { + // For ... { + // // start of outer loopnest prologue kernel + // // Fill temp buf + // new loopnest (inner #1): + // For ... (unroll) { + // For ... (unroll) { + // ... { + // // start of inner loopnest #1 kernel + // tempBuf(tempBufIndices) = input(inputIndices) + // // end of inner loopnest #1 kernel + // } + // ... + // } + // } + // // Fill cache + // new loopnest (inner #2): + // For ... (unroll) { + // For ... (unroll) { + // ... { + // // start of inner loopnest #2 kernel + // cache(cacheIndices) = tempBuf(tempBufIndices) + // // end of inner loopnest #2 kernel + // } + // ... 
+ // } + // } + // // end of outer loopnest kernel + // } + // } + + std::vector fillIndices; + fillIndices.reserve(inputRegionFillShape.NumDimensions()); + for (int idx = 0; idx < inputRegionFillShape.NumDimensions(); ++idx) + { + fillIndices.push_back(loopnests::Index("fillIdx_" + std::to_string(idx))); + } + + // Define LoopNest + auto fillNest = Using({ offsetInputArrayView }, ArgumentType::Input) + .Using({ cacheFillArrayView }, ArgumentType::Output); + for (int idx = 0; idx < inputRegionFillShape.NumDimensions(); ++idx) + { + fillNest.ForAll(fillIndices[idx], 0, inputRegionFillShape.GetActiveSize(idx)); + } + + const int VectorizationSize = registerCharacteristics.NumberOfElementsPerSIMDRegister; + int maximumElementsInTempBuf = registerCharacteristics.NumberOfSIMDRegisters * VectorizationSize; + std::vector indexSplitSizes(fillIndices.size()); + std::vector tmpBufDimensionMapping(indexSplitSizes.size()); + + // Handle the innermost input dimension differently since we'll be counting elements there instead of shards of a memory layout + int shardSize = VectorizationSize; + int totalElementsPerShard = VectorizationSize; + for (unsigned idx = fillIndices.size() - 1; fillIndices.size() > idx; --idx) + { + int availableShardsInTmpBuf = maximumElementsInTempBuf / totalElementsPerShard; + int inputDimAvailableShards = inputRegionFillShape.GetActiveSize(idx) / shardSize; + int numShards = std::min(availableShardsInTmpBuf, inputDimAvailableShards); + tmpBufDimensionMapping[idx] = inputRegionFillShape.GetLogicalDimension(idx); + if (numShards > 1) + { + indexSplitSizes[idx] = numShards * shardSize; + shardSize = 1; // After the initial vectorization size, we target units of entire memory layout shards + totalElementsPerShard *= numShards; // The number of elements represented by a target scales with the number of inner targets it represents + } + else + { + indexSplitSizes[idx] = 1; + } + } + // The index split sizes are measured in input-space, so no scaling is needed + std::vector tmpBufScaleFactors(indexSplitSizes.size(), 1); + + BoundaryConditionMemoryLayoutHelper fillKernelBoundaryHelper(inputRegionFillShape.GetActiveSize(), + indexSplitSizes, + tmpBufDimensionMapping, + tmpBufScaleFactors, + 0, // Fill index doesn't matter for this usage + tmpBufDimensionMapping.size()); // Shrink any index split sizes needed since we don't have a "view" to worry about + + auto cacheFillInternalKernel = loopnests::Kernel("Internal_Fill_Cache_Outer_Kernel") + .Inputs(offsetInputArrayView, cacheFillArrayView) + .Indices(fillIndices) + .DefineEx([=](std::vector values, std::vector innerIndices) { + Array offsetInput = values[0]; + Array cacheFillView = values[1]; + + Value offsetInputInnerVal = offsetInput.GetValue().Offset(innerIndices); + offsetInputInnerVal.SetLayout(offsetInput.GetValue().GetLayout()); + Array offsetInputInner = offsetInputInnerVal; + + std::vector cacheIndices; + cacheIndices.reserve(boundaryCacheFillLayout.NumDimensions()); + for (int cacheDimIdx = 0; cacheDimIdx < boundaryCacheFillLayout.NumDimensions(); ++cacheDimIdx) + { + unsigned baseDimIdx = cacheFillThresholdIdx + cacheDimIdx; + int logicalDimension = logicalDimensionMapping[baseDimIdx]; + // Mapping loopnest indices (input space) -> cache indices (cache space) so divide by split index increment + cacheIndices.push_back((innerIndices[logicalDimension] / orderedIndexIncrements[baseDimIdx]) % boundaryCacheFillLayout.GetActiveSize(cacheDimIdx)); + } + Value offsetCacheInnerVal = cacheFillView.GetValue().Offset(cacheIndices); + 
offsetCacheInnerVal.SetLayout(cacheFillView.GetValue().GetLayout()); + Array offsetCacheInner = offsetCacheInnerVal; + + fillKernelBoundaryHelper.EmitBoundarySwitches(innerIndices, [=](MemoryLayout fillRegionShape, MemoryLayout, MemoryLayout boundaryTempBufLayout, MemoryLayout) { + Array tmpBuf = Allocate(offsetInput.Type(), boundaryTempBufLayout, bufferAlignment); + + std::vector tmpBufInputIndices; + + tmpBufInputIndices.reserve(fillRegionShape.NumDimensions()); + for (int idx = 0; idx < fillRegionShape.NumDimensions(); ++idx) + { + tmpBufInputIndices.push_back(loopnests::Index("tmpBuf_FillIdx_" + std::to_string(idx))); + } + + auto tmpBufFillNest = Using({ offsetInputInner }, ArgumentType::Input) + .Using({ tmpBuf }, ArgumentType::Output); + for (int idx = 0; idx < fillRegionShape.NumDimensions(); ++idx) + { + tmpBufFillNest.ForAll(tmpBufInputIndices[idx], 0, fillRegionShape.GetActiveSize(idx)); + } + + auto tmpBufFill = loopnests::Kernel("Internal_TmpBuf_FillTmpBuf_Kernel") + .Inputs(offsetInputInner, tmpBuf) + .Indices(tmpBufInputIndices) + .DefineEx([=](std::vector tmpBufValues, std::vector tmpBufInputIndices) { + Array offsetInputInner = tmpBufValues[0]; + Array tmpBuf = tmpBufValues[1]; + + tmpBuf(tmpBufInputIndices) = offsetInputInner(tmpBufInputIndices); + }); + tmpBufFillNest.Do(tmpBufFill); + auto& tmpBufFillSchedule = tmpBufFillNest.GetSchedule(); + // unroll everything + for (unsigned idx = 0; idx < tmpBufInputIndices.size(); ++idx) + { + tmpBufFillSchedule.Unroll(tmpBufInputIndices[idx]); + } + tmpBufFillNest.Run(); + + // Cache fill from tmp buf + auto cacheFillNest = Using({ tmpBuf }, ArgumentType::Input) + .Using({ offsetCacheInner }, ArgumentType::Output); + for (int idx = 0; idx < tmpBuf.GetValue().GetLayout().NumDimensions(); ++idx) + { + cacheFillNest.ForAll(tmpBufInputIndices[idx], 0, tmpBuf.GetValue().GetLayout().GetActiveSize(idx)); + } + + auto cacheFill = loopnests::Kernel("Internal_TmpBuf_FillCache_Kernel") + .Inputs(tmpBuf, offsetCacheInner) + .Indices(tmpBufInputIndices) + .DefineEx([=](std::vector tmpBufValues, std::vector tmpBufIndices) { + Array tmpBuf = tmpBufValues[0]; + Array offsetCacheInner = tmpBufValues[1]; + + int cacheDimensions = offsetCacheInner.GetValue().GetLayout().NumDimensions(); + std::vector cacheIndices; + cacheIndices.reserve(cacheDimensions); + for (int cacheDimIdx = 0; cacheDimIdx < cacheDimensions; ++cacheDimIdx) + { + unsigned baseDimIdx = cacheFillThresholdIdx + cacheDimIdx; + int logicalDimension = logicalDimensionMapping[baseDimIdx]; + // Mapping loopnest indices (input space) -> cache indices (cache space) so divide by split index increment + cacheIndices.push_back((tmpBufIndices[logicalDimension] / orderedIndexIncrements[baseDimIdx]) % boundaryCacheFillLayout.GetActiveSize(cacheDimIdx)); + } + offsetCacheInner(cacheIndices) = tmpBuf(tmpBufIndices); + }); + cacheFillNest.Do(cacheFill); + auto& cacheFillSchedule = cacheFillNest.GetSchedule(); + for (unsigned idx = 0; idx < tmpBufInputIndices.size(); ++idx) + { + cacheFillSchedule.Unroll(tmpBufInputIndices[idx]); + } + cacheFillNest.Run(); + }); + }); + + auto& schedule = fillNest.GetSchedule(); + std::vector splitOuterIndices; + for (unsigned idx = 0; idx < fillIndices.size(); ++idx) + { + if (indexSplitSizes[idx] > 1) + { + splitOuterIndices.push_back(schedule.Split(fillIndices[idx], indexSplitSizes[idx])); + } + else + { + splitOuterIndices.push_back(fillIndices[idx]); + } + } + + fillNest.Do(cacheFillInternalKernel, splitOuterIndices); + + fillNest.Run(); + }); + }); + + 
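Stripped of the loop-nest and boundary machinery, the structure this fill kernel emits is a copy-through-registers pattern: fill a register-file-sized temporary from the input, then store it into the cache, with both inner loops fully unrolled. A scalar sketch of that pattern (hypothetical names; a fixed buffer size standing in for registers times elements-per-register):

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

// Copy input into the cache through a small temporary buffer sized to the SIMD
// register file; in the generated code both inner loops are fully unrolled so
// the temporary is expected to be optimized into registers.
void TwoStageFill(const std::vector<float>& input, std::vector<float>& cache)
{
    constexpr size_t tmpBufElts = 32; // e.g. 8 registers x 4 floats
    float tmpBuf[tmpBufElts];
    for (size_t base = 0; base < input.size(); base += tmpBufElts)
    {
        size_t chunk = std::min(tmpBufElts, input.size() - base);
        for (size_t i = 0; i < chunk; ++i) // inner loopnest #1: input -> tmpBuf
        {
            tmpBuf[i] = input[base + i];
        }
        for (size_t i = 0; i < chunk; ++i) // inner loopnest #2: tmpBuf -> cache
        {
            cache[base + i] = tmpBuf[i];
        }
    }
}

int main()
{
    std::vector<float> input(100, 1.0f);
    std::vector<float> cache(100, 0.0f);
    TwoStageFill(input, cache);
    assert(cache == input);
}
```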
+        underlyingNest.AddKernel(cacheFillKernel, loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::prologue, cacheFillPosition, {} });
+        cachingKernels.push_back(cacheFillKernel);
+    }
+
+    if (useViewKernel)
+    {
+        // The cache view indices are all of the indices that occur before the cacheViewThresholdIdx
+        std::vector<loopnests::Index> cacheViewPosition(orderedIndices.begin(), orderedIndices.begin() + cacheViewThresholdIdx);
+        std::vector<loopnests::Index> cacheViewIndices(_kernelIndices.begin(), _kernelIndices.end());
+        cacheViewIndices.insert(cacheViewIndices.end(), cacheViewPosition.begin(), cacheViewPosition.end());
+
+        auto cacheViewKernel = loopnests::Kernel(cacheName + "_View_Cache_Kernel")
+                                   .Inputs(_rawCache, cacheRef)
+                                   .Indices(cacheViewIndices)
+                                   .DefineEx([boundaryConditionCacheHelper, compositeIndexCount, fullInputLayout, cacheLayout, baseCacheViewLayout, cacheLogicalDimensionMapping, logicalDimensionMapping, orderedIndices, orderedIndexIncrements, cacheThresholdIdx, cacheViewThresholdIdx, logicalDimensionCount](std::vector<Value> values, std::vector<Scalar> indices) {
+                                       auto& cache = values[0];
+                                       auto& cacheRef = values[1];
+                                       std::vector<Scalar> compositeIndexValues(indices.begin(), indices.begin() + compositeIndexCount);
+                                       std::vector<Scalar> splitIndexValues(indices.begin() + compositeIndexCount, indices.end());
+
+                                       boundaryConditionCacheHelper.EmitBoundarySwitches(compositeIndexValues, [&](MemoryLayout inputRegionShape, MemoryLayout inputRegionFillShape, MemoryLayout boundaryCacheLayout, MemoryLayout boundaryCacheFillLayout) {
+                                           // Find the view slice in the cache for this offset
+                                           // The indices in [cacheThresholdIdx, cacheViewThresholdIdx) in the indices determine which slice to use
+                                           std::vector<Scalar> cacheOffsetIndices;
+                                           cacheOffsetIndices.reserve(cacheLayout.NumDimensions());
+
+                                           // Note: if cacheThresholdIdx == cacheViewThresholdIdx (i.e.
if there is no repeated re-viewing of the cache) + // Then the first loop is skipped and no offsetting occurs + auto cacheView = cache; + for (unsigned idx = cacheThresholdIdx; idx < cacheViewThresholdIdx; ++idx) + { + // Mapping loopnest indices (input space) -> cache offsets (cache space) so divide by split index increment + cacheOffsetIndices.push_back(splitIndexValues[idx] / orderedIndexIncrements[idx]); + } + for (unsigned idx = cacheViewThresholdIdx; idx < static_cast(fullInputLayout.NumDimensions()); ++idx) + { + cacheOffsetIndices.push_back(Scalar{ 0 }); + } + + cacheView.SetLayout(boundaryCacheLayout); + auto offsetCache = cacheView.Offset(cacheOffsetIndices); + offsetCache.SetLayout(baseCacheViewLayout); + + // Offset the cache ref from the base cache such that indexing with the current loop values + // would offset a pointer to the beginning of this view of the cache + std::vector offsetIndices(logicalDimensionCount); + for (int idx = 0; idx < logicalDimensionCount; ++idx) + { + offsetIndices[idx] -= compositeIndexValues[idx]; + } + + auto offsetCacheView = offsetCache.Offset(offsetIndices); + offsetCacheView.SetLayout(baseCacheViewLayout); + cacheRef.SetLayout(baseCacheViewLayout); + cacheRef = offsetCacheView.Reference(); + }); + }); + + underlyingNest.AddKernel(cacheViewKernel, loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::prologue, cacheViewPosition, {} }); + cachingKernels.push_back(cacheViewKernel); + } + + if (useReduceKernel) + { + // The cache reduce indices are all of the indices that occur before the cacheThresholdIdx + // Because the reduce is symmetric with the cache non-progressive fill / flush level of a loop nest + std::vector cacheReducePosition(orderedIndices.begin(), orderedIndices.begin() + cacheThresholdIdx); + std::vector cacheReduceIndices(_kernelIndices.begin(), _kernelIndices.end()); + cacheReduceIndices.insert(cacheReduceIndices.end(), cacheReducePosition.begin(), cacheReducePosition.end()); + + auto cacheReduceKernel = loopnests::Kernel(cacheName + "_Reduce_Kernel") + .Inputs(_value, _rawCache) + .Indices(cacheReduceIndices) + .DefineEx([=](std::vector values, std::vector indices) { + auto& input = values[0]; + auto& cache = values[1]; + std::vector compositeIndexValues(indices.begin(), indices.begin() + compositeIndexCount); + std::vector splitIndexValues(indices.begin() + compositeIndexCount, indices.end()); + + auto offsetInput = input.Offset(compositeIndexValues); + offsetInput.SetLayout(input.GetLayout()); + auto offsetInputArrayView = Array(offsetInput); + + boundaryConditionCacheHelper.EmitBoundarySwitches(compositeIndexValues, [=](MemoryLayout inputRegionShape, MemoryLayout, MemoryLayout boundaryCacheLayout, MemoryLayout) { + auto cacheArrayView = Array(cache); + + // Prefer input-oriented loops to maximize locality as the input + // is likely to be larger than the cache in most cases + // Based on the element size and counts in different dimensions, + // we will split and unroll some of the inner loops in order to maximize + // vectorization. + // In order to get appropriate utilization of all the SIMD + // registers, we will need to use a temporary buffer (which we expect + // the compiler to optimize away) with a size equal to the total number + // of elements that can be held in all of the SIMD registers. 
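The `indexSplitSizes` computation that follows is the same one used in the fill kernel: walk the dimensions from innermost to outermost, give the innermost split one SIMD register's worth of elements, and let outer splits absorb whole inner shards until the register file is spent. A standalone sketch with plain ints (`ComputeSplitSizes` is a hypothetical name):

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

// Decide per-dimension split sizes for the unrolled temp-buffer loops, working
// from the innermost dimension outward. A split of 1 means "don't split".
std::vector<int> ComputeSplitSizes(const std::vector<int>& dimSizes, int simdRegisters, int eltsPerRegister)
{
    int maxElts = simdRegisters * eltsPerRegister; // capacity of the register file
    std::vector<int> splits(dimSizes.size(), 1);
    int shardSize = eltsPerRegister;     // innermost unit is one register
    int totalPerShard = eltsPerRegister; // elements covered by one outer shard
    for (size_t i = dimSizes.size(); i-- > 0;)
    {
        int availableShards = maxElts / totalPerShard;
        int dimShards = dimSizes[i] / shardSize;
        int numShards = std::min(availableShards, dimShards);
        if (numShards > 1)
        {
            splits[i] = numShards * shardSize;
            shardSize = 1;               // outer dimensions count whole shards
            totalPerShard *= numShards;
        }
    }
    return splits;
}

int main()
{
    // 8 registers of 4 floats = 32 elements: a 16x16 tile gets an inner split
    // of 16 (four registers of four) and an outer split of 2 rows, so one
    // 2x16 block exactly fills the register file.
    assert((ComputeSplitSizes({ 16, 16 }, 8, 4) == std::vector<int>{ 2, 16 }));
}
```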
+ // The filling of this temporary buffer from the cache needs to be an + // unrolled operation and the reducing of the output from the temporary + // buffer also needs to be an unrolled operation that happens after + // the full temporary buffer has been filled. + // If the reduce operation is a SumReduce operation, then we need + // a third loop in the middle which accumulates the current value + // from the output into the temporary buffer, then have the + // third loop copy the temporary buffer to the output + // Therefore, we need multiple levels of loopnests so that the area + // outside of the temporary buffer's addressable region can be looped + // over, and the area inside the temporary buffer region can have two + // or three sequential fully unrolled loopnests. + // new loopnest (outer): + // For ... { + // For ... { + // // start of outer loopnest prologue kernel + // // Fill temp buf with cache data + // new loopnest (inner #1): + // For ... (unroll) { + // For ... (unroll) { + // ... { + // // start of inner loopnest #1 kernel + // tempBuf(tempBufIndices) = cache(cacheIndices) + // // end of inner loopnest #1 kernel + // } + // ... + // } + // } + // // if reduceFunction == SumReduce + // // Apply the reduce function to reduce elements of the output into the temp buf + // new loopnest (inner #2): + // For ... (unroll) { + // For ... (unroll) { + // ... { + // // start of inner loopnest #2 kernel + // tempBuf(tempBufIndices) += input(inputIndices) + // // end of inner loopnest #2 kernel + // } + // ... + // } + // } + // // Copy temp buf to output + // new loopnest (inner #3): + // For ... (unroll) { + // For ... (unroll) { + // ... { + // // start of inner loopnest #3 kernel + // input(inputIndices) = tempBuf(tempBufIndices) + // // end of inner loopnest #3 kernel + // } + // ... 
+ // } + // } + // // end of outer loopnest kernel + // } + // } + + std::vector reduceIndices; + reduceIndices.reserve(inputRegionShape.NumDimensions()); + for (int idx = 0; idx < inputRegionShape.NumDimensions(); ++idx) + { + reduceIndices.push_back(loopnests::Index("reduceIdx_" + std::to_string(idx))); + } + + // Define LoopNest + auto reduceNest = Using({ offsetInputArrayView }, ArgumentType::Input) + .Using({ cacheArrayView }, ArgumentType::Output); + for (int idx = 0; idx < inputRegionShape.NumDimensions(); ++idx) + { + reduceNest.ForAll(reduceIndices[idx], 0, inputRegionShape.GetActiveSize(idx)); + } + + const int VectorizationSize = registerCharacteristics.NumberOfElementsPerSIMDRegister; + int maximumElementsInTempBuf = registerCharacteristics.NumberOfSIMDRegisters * VectorizationSize; + std::vector indexSplitSizes(reduceIndices.size()); + std::vector tmpBufDimensionMapping(indexSplitSizes.size()); + + // Handle the innermost input dimension differently since we'll be counting elements there instead of shards of a memory layout + int shardSize = VectorizationSize; + int totalElementsPerShard = VectorizationSize; + for (unsigned idx = reduceIndices.size() - 1; reduceIndices.size() > idx; --idx) + { + int availableShardsInTmpBuf = maximumElementsInTempBuf / totalElementsPerShard; + int inputDimAvailableShards = inputRegionShape.GetActiveSize(idx) / shardSize; + int numShards = std::min(availableShardsInTmpBuf, inputDimAvailableShards); + tmpBufDimensionMapping[idx] = inputRegionShape.GetLogicalDimension(idx); + if (numShards > 1) + { + indexSplitSizes[idx] = numShards * shardSize; + shardSize = 1; // After the initial vectorization size, we target units of entire memory layout shards + totalElementsPerShard *= numShards; // The number of elements represented by a target scales with the number of inner targets it represents + } + else + { + indexSplitSizes[idx] = 1; + } + } + // The index split sizes are measured in input-space, so no scaling is needed + std::vector tmpBufScaleFactors(indexSplitSizes.size(), 1); + + BoundaryConditionMemoryLayoutHelper reduceKernelBoundaryHelper(inputRegionShape.GetActiveSize(), + indexSplitSizes, + tmpBufDimensionMapping, + tmpBufScaleFactors, + 0, // Fill index doesn't matter for this usage + tmpBufDimensionMapping.size()); // Shrink any index split sizes needed since we don't have a "view" to worry about + + auto cacheReduceInternalKernel = loopnests::Kernel("Internal_Reduce_Cache_Outer_Kernel") + .Inputs(offsetInputArrayView, cacheArrayView) + .Indices(reduceIndices) + .DefineEx([=](std::vector values, std::vector innerIndices) { + Array offsetInput = values[0]; + Array cacheView = values[1]; + + Value offsetInputInnerVal = offsetInput.GetValue().Offset(innerIndices); + offsetInputInnerVal.SetLayout(offsetInput.GetValue().GetLayout()); + Array offsetInputInner = offsetInputInnerVal; + + std::vector cacheIndices; + cacheIndices.reserve(boundaryCacheLayout.NumDimensions()); + for (int cacheDimIdx = 0; cacheDimIdx < boundaryCacheLayout.NumDimensions(); ++cacheDimIdx) + { + unsigned baseDimIdx = cacheThresholdIdx + cacheDimIdx; + int logicalDimension = logicalDimensionMapping[baseDimIdx]; + // Mapping loopnest indices (input space) -> cache indices (cache space) so divide by split index increment + cacheIndices.push_back((innerIndices[logicalDimension] / orderedIndexIncrements[baseDimIdx]) % boundaryCacheLayout.GetActiveSize(cacheDimIdx)); + } + Value offsetCacheInnerVal = cacheView.GetValue().Offset(cacheIndices); + 
offsetCacheInnerVal.SetLayout(cacheView.GetValue().GetLayout()); + Array offsetCacheInner = offsetCacheInnerVal; + + reduceKernelBoundaryHelper.EmitBoundarySwitches(innerIndices, [=](MemoryLayout reduceRegionShape, MemoryLayout, MemoryLayout boundaryTempBufLayout, MemoryLayout) { + Array tmpBuf = Allocate(offsetInput.Type(), boundaryTempBufLayout, bufferAlignment); + + std::vector tmpBufInputIndices; + + tmpBufInputIndices.reserve(reduceRegionShape.NumDimensions()); + for (int idx = 0; idx < reduceRegionShape.NumDimensions(); ++idx) + { + tmpBufInputIndices.push_back(loopnests::Index("tmpBuf_ReduceIdx_" + std::to_string(idx))); + } + + auto tmpBufFillFromCacheNest = Using({ offsetCacheInner }, ArgumentType::Input) + .Using({ tmpBuf }, ArgumentType::Output); + for (int idx = 0; idx < reduceRegionShape.NumDimensions(); ++idx) + { + tmpBufFillFromCacheNest.ForAll(tmpBufInputIndices[idx], 0, reduceRegionShape.GetActiveSize(idx)); + } + + // Fill tmp buf from cache + auto tmpBufFillFromCache = loopnests::Kernel("Internal_TmpBuf_FillTmpBuf_Kernel") + .Inputs(offsetCacheInner, tmpBuf) + .Indices(tmpBufInputIndices) + .DefineEx([=](std::vector tmpBufValues, std::vector tmpBufInputIndices) { + Array offsetCacheInner = tmpBufValues[0]; + Array tmpBuf = tmpBufValues[1]; + + int cacheDimensions = offsetCacheInner.GetValue().GetLayout().NumDimensions(); + std::vector cacheIndices; + cacheIndices.reserve(cacheDimensions); + for (int cacheDimIdx = 0; cacheDimIdx < cacheDimensions; ++cacheDimIdx) + { + unsigned baseDimIdx = cacheFillThresholdIdx + cacheDimIdx; + int logicalDimension = logicalDimensionMapping[baseDimIdx]; + // Mapping loopnest indices (input space) -> cache indices (cache space) so divide by split index increment + cacheIndices.push_back((tmpBufInputIndices[logicalDimension] / orderedIndexIncrements[baseDimIdx]) % boundaryCacheLayout.GetActiveSize(cacheDimIdx)); + } + tmpBuf(tmpBufInputIndices) = offsetCacheInner(cacheIndices); + }); + tmpBufFillFromCacheNest.Do(tmpBufFillFromCache); + auto& tmpBufFillSchedule = tmpBufFillFromCacheNest.GetSchedule(); + // unroll everything + for (unsigned idx = 0; idx < tmpBufInputIndices.size(); ++idx) + { + tmpBufFillSchedule.Unroll(tmpBufInputIndices[idx]); + } + tmpBufFillFromCacheNest.Run(); + + if (accumulateReduce) + { + // Reduce the current input/output contents into the temp buffer + auto tmpBufReduceNest = Using({ offsetInputInner }, ArgumentType::Input) + .Using({ tmpBuf }, ArgumentType::Output); + for (int idx = 0; idx < tmpBuf.GetValue().GetLayout().NumDimensions(); ++idx) + { + tmpBufReduceNest.ForAll(tmpBufInputIndices[idx], 0, tmpBuf.GetValue().GetLayout().GetActiveSize(idx)); + } + + auto tmpBufReduce = loopnests::Kernel("Internal_TmpBuf_ReduceOutput_Kernel") + .Inputs(tmpBuf, offsetInputInner) + .Indices(tmpBufInputIndices) + .DefineEx([=](std::vector tmpBufValues, std::vector tmpBufInputIndices) { + Array tmpBuf = tmpBufValues[0]; + Array offsetInputInner = tmpBufValues[1]; + + reduceFunction(tmpBuf(tmpBufInputIndices), offsetInputInner(tmpBufInputIndices)); + }); + tmpBufReduceNest.Do(tmpBufReduce); + auto& tmpBufReduceSchedule = tmpBufReduceNest.GetSchedule(); + for (unsigned idx = 0; idx < tmpBufInputIndices.size(); ++idx) + { + tmpBufReduceSchedule.Unroll(tmpBufInputIndices[idx]); + } + tmpBufReduceNest.Run(); + + // Copy temp buffer contents to input/output + auto storeOutNest = Using({ tmpBuf }, ArgumentType::Input) + .Using({ offsetInputInner }, ArgumentType::Output); + for (int idx = 0; idx < 
tmpBuf.GetValue().GetLayout().NumDimensions(); ++idx) + { + storeOutNest.ForAll(tmpBufInputIndices[idx], 0, tmpBuf.GetValue().GetLayout().GetActiveSize(idx)); + } + + auto storeOut = loopnests::Kernel("Internal_TmpBuf_CopyOutput_Kernel") + .Inputs(tmpBuf, offsetInputInner) + .Indices(tmpBufInputIndices) + .DefineEx([=](std::vector tmpBufValues, std::vector tmpBufInputIndices) { + Array tmpBuf = tmpBufValues[0]; + Array offsetInputInner = tmpBufValues[1]; + + offsetInputInner(tmpBufInputIndices) = tmpBuf(tmpBufInputIndices); + }); + storeOutNest.Do(storeOut); + auto& storeOutSchedule = storeOutNest.GetSchedule(); + for (unsigned idx = 0; idx < tmpBufInputIndices.size(); ++idx) + { + storeOutSchedule.Unroll(tmpBufInputIndices[idx]); + } + storeOutNest.Run(); + } + else + { + // Reduce the temp buffer into input/output + auto outputReduceNest = Using({ tmpBuf }, ArgumentType::Input) + .Using({ offsetInputInner }, ArgumentType::Output); + for (int idx = 0; idx < tmpBuf.GetValue().GetLayout().NumDimensions(); ++idx) + { + outputReduceNest.ForAll(tmpBufInputIndices[idx], 0, tmpBuf.GetValue().GetLayout().GetActiveSize(idx)); + } + + auto outputReduce = loopnests::Kernel("Internal_TmpBuf_ReduceOutput_Kernel") + .Inputs(tmpBuf, offsetInputInner) + .Indices(tmpBufInputIndices) + .DefineEx([=](std::vector tmpBufValues, std::vector tmpBufInputIndices) { + Array tmpBuf = tmpBufValues[0]; + Array offsetInputInner = tmpBufValues[1]; + + reduceFunction(offsetInputInner(tmpBufInputIndices), tmpBuf(tmpBufInputIndices)); + }); + outputReduceNest.Do(outputReduce); + auto& outputReduceSchedule = outputReduceNest.GetSchedule(); + for (unsigned idx = 0; idx < tmpBufInputIndices.size(); ++idx) + { + outputReduceSchedule.Unroll(tmpBufInputIndices[idx]); + } + outputReduceNest.Run(); + } + }); + }); + + auto& schedule = reduceNest.GetSchedule(); + std::vector splitOuterIndices; + for (unsigned idx = 0; idx < reduceIndices.size(); ++idx) + { + if (indexSplitSizes[idx] > 1) + { + splitOuterIndices.push_back(schedule.Split(reduceIndices[idx], indexSplitSizes[idx])); + } + } + + reduceNest.Do(cacheReduceInternalKernel, splitOuterIndices); + + reduceNest.Run(); + }); + }); + + underlyingNest.AddKernel(cacheReduceKernel, loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::epilogue, cacheReducePosition, {} }); + cachingKernels.push_back(cacheReduceKernel); + } + + underlyingNest.RenameVariable(_value, cacheRef, _atIndices, cachingKernels); + } + +} // namespace value +} // namespace ell diff --git a/libraries/value/src/ComputeContext.cpp b/libraries/value/src/ComputeContext.cpp index 25970ca72..ca482bde2 100644 --- a/libraries/value/src/ComputeContext.cpp +++ b/libraries/value/src/ComputeContext.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -30,21 +31,34 @@ namespace value using namespace detail; using namespace utilities; - struct ComputeContext::FunctionScope + namespace { - FunctionScope(ComputeContext& context, std::string fnName) : - context(context) + struct { - context._stack.push({ fnName, {} }); - } + int Current() + { + std::lock_guard lock{ _mutex }; - ~FunctionScope() { context._stack.pop(); } + auto it = _idMap.find(std::this_thread::get_id()); + if (it == _idMap.end()) + { + it = _idMap.emplace_hint(it, std::this_thread::get_id(), ++_nextThreadId); + } - ComputeContext& context; - }; + return it->second; + } - namespace - { + void Clear() + { + std::lock_guard lock{ _mutex }; + _idMap.clear(); + _nextThreadId = 0; + } + + std::mutex _mutex; + 
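+            // _idMap / _nextThreadId: lazily hand each calling thread a small sequential
+            // id under the mutex; GlobalAllocateImpl below appends this id to a global's
+            // name so that ThreadLocal allocations get a distinct per-thread copy.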
std::unordered_map _idMap; + int _nextThreadId = 0; + } ThreadIds; // TODO: Make this the basis of an iterator for MemoryLayout bool IncrementMemoryCoordinateImpl(int dimension, std::vector& coordinate, const std::vector& maxCoordinate) @@ -351,6 +365,10 @@ namespace value { throw InputException(InputExceptionErrors::invalidArgument); } + else if constexpr (!std::is_same_v) + { + throw InputException(InputExceptionErrors::typeMismatch); + } else { return Value(std::copysign(*data1, *data2)); @@ -361,9 +379,185 @@ namespace value } }; + struct FmaFunctionIntrinsic + { + auto operator()(std::vector args) const -> Value + { + if (args.size() != 3) + { + throw InputException(InputExceptionErrors::invalidSize); + } + + if (std::any_of(args.begin(), args.end(), [](Value& value) { return value.IsConstrained() && value.GetLayout() != ScalarLayout; })) + { + throw InputException(InputExceptionErrors::invalidSize); + } + + const auto& value1 = args[0]; + const auto& value2 = args[1]; + const auto& value3 = args[2]; + + return std::visit( + [](auto&& data1, auto&& data2, auto&& data3) -> Value { + using Type1 = std::decay_t; + using Type2 = std::decay_t; + using Type3 = std::decay_t; + using DataType1 = std::remove_pointer_t; + using DataType2 = std::remove_pointer_t; + using DataType3 = std::remove_pointer_t; + + if constexpr (IsOneOf || + IsOneOf || + IsOneOf) + { + throw InputException(InputExceptionErrors::invalidArgument); + } + else if constexpr (!utilities::AllSame) + { + throw InputException(InputExceptionErrors::typeMismatch); + } + else + { + return Value(static_cast(std::fma(*data1, *data2, *data3))); + } + }, + value1.GetUnderlyingData(), + value2.GetUnderlyingData(), + value3.GetUnderlyingData()); + } + }; + + enum class MemIntrinsicOp + { + Copy, + Move, + Set + }; + template + struct MemOpFunctionIntrinsic + { + auto operator()(std::vector args) const -> Value + { + if (args.size() != 3) + { + throw InputException(InputExceptionErrors::invalidSize); + } + + if (!std::all_of(args.begin(), args.end(), [](const Value& value) { return value.IsConstant(); })) + { + throw InputException(InputExceptionErrors::invalidArgument); + } + + const auto& value1 = args[0]; + const auto& value2 = args[1]; + const auto& value3 = args[2]; + + if (!value3.IsConstrained() || value3.GetLayout() != ScalarLayout) + { + throw InputException(InputExceptionErrors::invalidArgument); + } + + if constexpr (MemIntrinsicOp::Set == op) + { + assert((value2.IsConstrained() && value2.GetLayout() == ScalarLayout && value2.GetType() == std::pair{ ValueType::Char8, 1 })); + } + + std::visit( + [](auto&& data1, auto&& data2, auto&& data3) { + using Type1 = std::decay_t; + using Type2 = std::decay_t; + using Type3 = std::decay_t; + if constexpr (utilities::IsOneOf) + { + assert(false); + return; + } + else + { + // Once we move away from VS 2017, this code can be uncommented and the code following can be simplified (lines 496-523) + + //constexpr auto memFn = [] { + // // static_casts needed because MSVC in VS 2017 can't handle the code without it + // if constexpr (static_cast(MemIntrinsicOp::Set) == static_cast(op)) + // { + // return &std::memset; + // } + // else if constexpr (static_cast(MemIntrinsicOp::Copy) == static_cast(op)) + // { + // return &std::memcpy; + // } + // else if constexpr (static_cast(MemIntrinsicOp::Move) == static_cast(op)) + // { + // return &std::memmove; + // } + // else + // { + // static_assert(utilities::FalseType{}, "Unknown enum value"); + // } + //}(); + + constexpr bool isSet = op == 
MemIntrinsicOp::Set; + std::decay_t> real2ndParam; + std::conditional_t memFn; + switch (op) + { + case MemIntrinsicOp::Set: + if constexpr (isSet) + { + memFn = &std::memset; + real2ndParam = *data2; + } + break; + case MemIntrinsicOp::Copy: + if constexpr (!isSet) + { + memFn = &std::memcpy; + real2ndParam = data2; + } + break; + case MemIntrinsicOp::Move: + if constexpr (!isSet) + { + memFn = &std::memmove; + real2ndParam = data2; + } + break; + default: + assert(false); + } + + memFn(data1, real2ndParam, *data3 * sizeof(data1[0])); + } + }, + value1.GetUnderlyingData(), + value2.GetUnderlyingData(), + value3.GetUnderlyingData()); + + return {}; // ignored + } + }; } // namespace + struct ComputeContext::FunctionScope + { + FunctionScope(ComputeContext& context, std::string fnName) : + context(context) + { + std::lock_guard lock{ context._mutex }; + context._stack.push({ fnName, {} }); + } + + ~FunctionScope() + { + std::lock_guard lock{ context._mutex }; + context._stack.pop(); + } + + ComputeContext& context; + }; + ComputeContext::ComputeContext(std::string moduleName) : + EmitterContext(emitters::GetTargetDevice("host")), _moduleName(std::move(moduleName)) { // we always have at least one stack entry, in case the top level function needs to return something @@ -379,42 +573,49 @@ namespace value using Iterator = ConstantDataList::const_iterator; - auto it = - std::visit(VariantVisitor{ [](Emittable) -> Iterator { return {}; }, - [this](auto&& data) -> Iterator { - using Type = std::decay_t; - using RealType = std::remove_pointer_t; - using VectorType = std::vector; - - const auto& frame = GetTopFrame(); - auto it = - std::find_if(frame.second.begin(), - frame.second.end(), - [data](const ConstantData& constData) { - if (auto ptr = std::get_if(&constData)) - { - return ptr->data() <= data && - data < (ptr->data() + ptr->size()); - } - - return false; - }); - - return it; - } }, - value.GetUnderlyingData()); + auto it = std::visit( + VariantVisitor{ + [](Emittable) -> Iterator { return {}; }, + [this](auto&& data) -> Iterator { + using Type = std::decay_t; + using RealType = std::remove_pointer_t; + using VectorType = std::vector; + + const auto& frame = GetTopFrame(); + auto it = + std::find_if(frame.second.begin(), + frame.second.end(), + [data](const ConstantData& constData) { + if (auto ptr = std::get_if(&constData)) + { + return ptr->data() <= data && + data < (ptr->data() + ptr->size()); + } + + return false; + }); + + return it; + } }, + value.GetUnderlyingData()); return *it; } - Value ComputeContext::AllocateImpl(ValueType type, MemoryLayout layout) + Value ComputeContext::AllocateImpl(ValueType type, MemoryLayout layout, size_t /* alignment */, AllocateFlags flags) { + if (flags != AllocateFlags::None) + { + throw LogicException(LogicExceptionErrors::notImplemented); + } + // special case the scalar case auto size = layout == ScalarLayout ? 
1u : layout.GetMemorySize(); auto constantData = AllocateConstantData(type, size); Value value = StoreConstantData(std::move(constantData)); value.SetLayout(layout); + return value; } @@ -430,8 +631,13 @@ namespace value return std::nullopt; } - Value ComputeContext::GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ConstantData data, MemoryLayout layout) + Value ComputeContext::GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ConstantData data, MemoryLayout layout, AllocateFlags flags) { + if ((flags & AllocateFlags::ThreadLocal) == AllocateFlags::ThreadLocal) + { + throw LogicException(LogicExceptionErrors::illegalState, "Thread local storage cannot be specified for constant data"); + } + std::string adjustedName = GetScopeAdjustedName(scope, name); if (_globals.find(adjustedName) != _globals.end()) @@ -440,26 +646,45 @@ namespace value "Unexpected collision in global data allocation"); } - auto& globalData = _globals[adjustedName]; - globalData.first = std::move(data); - globalData.second = std::move(layout); + auto& globalData = [&]() -> decltype(auto) { + std::lock_guard lock{ _mutex }; + auto& globalData = _globals[adjustedName]; + globalData.first = std::move(data); + globalData.second = std::move(layout); + return globalData; + }(); return ConstantDataToValue(globalData.first, globalData.second); } - Value ComputeContext::GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ValueType type, MemoryLayout layout) + Value ComputeContext::GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ValueType type, MemoryLayout layout, AllocateFlags flags) { // special case the scalar case auto size = layout == ScalarLayout ? 1u : layout.GetMemorySize(); auto constantData = AllocateConstantData(type, size); - return GlobalAllocateImpl(scope, name, constantData, layout); + if ((flags & AllocateFlags::ThreadLocal) == AllocateFlags::ThreadLocal) + { + name += std::to_string(ThreadIds.Current()); + + if (auto globalValue = EmitterContext::GetGlobalValue(scope, name, layout)) + { + return *globalValue; + } + + flags &= ~AllocateFlags::ThreadLocal; + } + return GlobalAllocateImpl(scope, name, constantData, layout, flags); } Value ComputeContext::StoreConstantDataImpl(ConstantData data) { Value value = ConstantDataToValue(data); - GetTopFrame().second.push_front(std::move(data)); + { + std::lock_guard lock{ _mutex }; + GetTopFrame().second.push_front(std::move(data)); + } + return value; } @@ -507,68 +732,58 @@ namespace value { ConstantData movedOutOfScope; - std::visit(VariantVisitor{ [](Emittable) {}, - [&movedOutOfScope, this](auto&& data) { - using Type = std::decay_t; - using RealType = std::remove_pointer_t; - using VectorType = std::vector; - - const auto& frame = GetTopFrame(); - if (auto stackFrameIt = - std::find_if(frame.second.begin(), - frame.second.end(), - [data](const ConstantData& constData) { - if (auto ptr = std::get_if(&constData)) - { - return ptr->data() <= data && - data < (ptr->data() + ptr->size()); - } - - return false; - }); - stackFrameIt == frame.second.end()) - { - throw LogicException(LogicExceptionErrors::illegalState, - "Could not extract expected data"); - } - else - { - movedOutOfScope = std::move(*stackFrameIt); - } - } }, - value.GetUnderlyingData()); + std::lock_guard lock{ _mutex }; + + std::visit( + VariantVisitor{ + [](Emittable) {}, + [&movedOutOfScope, + size = (value.IsConstrained() ? 
value.GetLayout().GetMemorySize() : 1)](auto&& data) { + using Type = std::decay_t; + using RealType = std::remove_pointer_t; + using VectorType = std::vector; + + movedOutOfScope = VectorType(data, data + size); + } }, + value.GetUnderlyingData()); return movedOutOfScope; } bool ComputeContext::IsGlobalValue(Value value) const { - return std::visit(VariantVisitor{ [](Emittable) -> bool { - throw LogicException(LogicExceptionErrors::illegalState); - }, - [this](auto&& data) -> bool { - using Type = std::decay_t; - using RealType = std::remove_pointer_t; - using VectorType = std::vector; - - return std::find_if(_globals.begin(), - _globals.end(), - [data](const auto& kvp) { - if (auto ptr = std::get_if( - &kvp.second.first)) - { - return ptr->data() <= data && - data < (ptr->data() + ptr->size()); - } - return false; - }) != _globals.end(); - } }, - value.GetUnderlyingData()); + std::lock_guard lock{ _mutex }; + + return std::visit( + VariantVisitor{ + [](Emittable) -> bool { + throw LogicException(LogicExceptionErrors::illegalState); + }, + + [this](auto&& data) -> bool { + using Type = std::decay_t; + using RealType = std::remove_pointer_t; + using VectorType = std::vector; + + return std::find_if( + _globals.begin(), + _globals.end(), + [data](const auto& kvp) { + if (auto ptr = std::get_if( + &kvp.second.first)) + { + return ptr->data() <= data && + data < (ptr->data() + ptr->size()); + } + return false; + }) != _globals.end(); + } }, + value.GetUnderlyingData()); } detail::ValueTypeDescription ComputeContext::GetTypeImpl(Emittable) { - throw LogicException(LogicExceptionErrors::notImplemented); + throw LogicException(LogicExceptionErrors::illegalState); } EmitterContext::DefinedFunction ComputeContext::CreateFunctionImpl(FunctionDeclaration decl, EmitterContext::DefinedFunction fn) @@ -579,6 +794,8 @@ namespace value throw InputException(InputExceptionErrors::invalidArgument, "Specified function is an intrinsic"); } + std::lock_guard lock{ _mutex }; + if (auto it = _definedFunctions.find(decl); it != _definedFunctions.end()) { return it->second; @@ -591,7 +808,9 @@ namespace value const auto& fnName = decl.GetFunctionName(); assert(expectedArgs.size() == args.size()); - if (const auto& returnType = decl.GetReturnType(); returnType) + auto fnArgs = NormalizeReferenceLevels(args, expectedArgs); + + if (const auto& returnType = decl.GetReturnType(); returnType.has_value()) { Value expectedReturn = *returnType; @@ -599,18 +818,10 @@ namespace value std::optional maybeGlobal; { FunctionScope scope(*this, fnName); - std::vector fnArgs; - fnArgs.reserve(expectedArgs.size()); - for (auto arg : expectedArgs) - { - fnArgs.push_back(Value(arg.GetBaseType(), arg.GetLayout())); - } - - std::copy(args.begin(), args.end(), fnArgs.begin()); Value returnValue = expectedReturn; - auto fnReturn = fn(args); - if (!fnReturn) + auto fnReturn = fn(fnArgs); + if (!fnReturn.has_value()) { throw LogicException(LogicExceptionErrors::illegalState, "Function definition was expected to return a value, but optional was empty"); } @@ -641,8 +852,10 @@ namespace value } else { + FunctionScope scope(*this, fnName); + // equivalent of a void return type - (void)fn(args); + (void)fn(fnArgs); return std::nullopt; } @@ -660,40 +873,64 @@ namespace value return true; } + std::lock_guard lock{ _mutex }; return _definedFunctions.find(decl) != _definedFunctions.end(); } void ComputeContext::CopyDataImpl(const Value& source, Value& destination) { - std::visit(VariantVisitor{ [](Emittable) {}, - [&destination, &source](auto&& 
sourceData) { - using SourceDataType = std::decay_t; - - auto& destinationData = - std::get(destination.GetUnderlyingData()); - if (source.GetLayout().IsContiguous() && destination.GetLayout().IsContiguous()) - { - auto numElements = destination.GetLayout().NumElements(); - std::copy(sourceData, sourceData + numElements, destinationData); - } - else - { - auto& sourceLayout = source.GetLayout(); - auto maxCoordinate = sourceLayout.GetActiveSize().ToVector(); - decltype(maxCoordinate) coordinate(maxCoordinate.size()); - - do - { - auto logicalCoordinates = sourceLayout.GetLogicalCoordinates(coordinate); - auto sourceOffset = - sourceLayout.GetLogicalEntryOffset(logicalCoordinates); - auto destinationOffset = - destination.GetLayout().GetLogicalEntryOffset(logicalCoordinates); - *(destinationData + destinationOffset) = *(sourceData + sourceOffset); - } while (IncrementMemoryCoordinate(coordinate, maxCoordinate)); - } - } }, - source.GetUnderlyingData()); + std::visit( + VariantVisitor{ + [](Emittable) {}, + [&destination, &source](auto&& sourceData) { + using SourceDataType = std::decay_t; + + if (source.PointerLevel() == destination.PointerLevel()) + { + if (source.PointerLevel() == 1) + { + auto& destinationData = std::get(destination.GetUnderlyingData()); + if (source.GetLayout().IsContiguous() && destination.GetLayout().IsContiguous()) + { + auto numElements = destination.GetLayout().NumElements(); + std::copy(sourceData, sourceData + numElements, destinationData); + } + else + { + auto& sourceLayout = source.GetLayout(); + auto maxCoordinate = sourceLayout.GetActiveSize().ToVector(); + decltype(maxCoordinate) coordinate(maxCoordinate.size()); + + do + { + auto logicalCoordinates = sourceLayout.GetLogicalCoordinates(coordinate); + auto sourceOffset = + sourceLayout.GetLogicalEntryOffset(logicalCoordinates); + auto destinationOffset = + destination.GetLayout().GetLogicalEntryOffset(logicalCoordinates); + *(destinationData + destinationOffset) = *(sourceData + sourceOffset); + } while (IncrementMemoryCoordinate(coordinate, maxCoordinate)); + } + } + else + { + std::get(destination.GetUnderlyingData())[0] = std::get(source.GetUnderlyingData())[0]; + if (source.IsConstrained()) + { + destination.SetLayout(source.GetLayout()); + } + else + { + destination.ClearLayout(); + } + } + } + else + { + throw LogicException(LogicExceptionErrors::illegalState); + } + } }, + source.GetUnderlyingData()); } void ComputeContext::MoveDataImpl(Value& source, Value& destination) @@ -705,7 +942,7 @@ namespace value source.Reset(); } - void ComputeContext::ForImpl(MemoryLayout layout, std::function)> fn) + void ComputeContext::ForImpl(MemoryLayout layout, std::function)> fn, [[maybe_unused]] const std::string& name) { auto maxCoordinate = layout.GetActiveSize().ToVector(); decltype(maxCoordinate) coordinate(maxCoordinate.size()); @@ -717,7 +954,7 @@ namespace value } while (IncrementMemoryCoordinate(coordinate, maxCoordinate)); } - void ComputeContext::ForImpl(Scalar start, Scalar stop, Scalar step, std::function fn) + void ComputeContext::ForImpl(Scalar start, Scalar stop, Scalar step, std::function fn, [[maybe_unused]] const std::string& name) { if (!(start.GetValue().IsConstant() && stop.GetValue().IsConstant() && step.GetValue().IsConstant())) { @@ -736,6 +973,7 @@ namespace value auto startNum = start.Get(); auto stopNum = stop.Get(); auto stepNum = step.Get(); + for (; startNum < stopNum; startNum += stepNum) { fn(startNum); @@ -821,7 +1059,11 @@ namespace value } else { - throw 
LogicException(LogicExceptionErrors::illegalState); + detail::ValueTypeDescription typeDesc{ source.GetBaseType(), 0 }; + Value value{ typeDesc, utilities::ScalarLayout }; + + value.SetData(*data); + return value; } }, source.GetUnderlyingData()); @@ -856,42 +1098,58 @@ std::visit( VariantVisitor{ [](Emittable) {}, - [](Boolean*) {}, [&destination, &source, op](auto&& destinationData) { using DestinationDataType = std::remove_pointer_t>; std::function opFn; - switch (op) + if constexpr (!std::is_same_v) { - case ValueBinaryOperation::add: - opFn = [](auto dst, auto src) { return dst + src; }; - break; - case ValueBinaryOperation::subtract: - opFn = [](auto dst, auto src) { return dst - src; }; - break; - case ValueBinaryOperation::multiply: - opFn = [](auto dst, auto src) { return dst * src; }; - break; - case ValueBinaryOperation::divide: - opFn = [](auto dst, auto src) { return dst / src; }; - break; - - default: - if constexpr (std::is_integral_v) { - switch (op) + case ValueBinaryOperation::add: + opFn = [](auto dst, auto src) { return dst + src; }; + break; + case ValueBinaryOperation::subtract: + opFn = [](auto dst, auto src) { return dst - src; }; + break; + case ValueBinaryOperation::multiply: + opFn = [](auto dst, auto src) { return dst * src; }; + break; + case ValueBinaryOperation::divide: + opFn = [](auto dst, auto src) { return dst / src; }; + break; + + default: + if constexpr (std::is_integral_v) + { + switch (op) + { + case ValueBinaryOperation::modulus: + opFn = [](auto dst, auto src) { return dst % src; }; + break; + default: + throw LogicException(LogicExceptionErrors::illegalState); + } + } + else { - case ValueBinaryOperation::modulus: - opFn = [](auto dst, auto src) { return dst % src; }; - break; - default: throw LogicException(LogicExceptionErrors::illegalState); } } - else + } + else + { + switch (op) { + case ValueBinaryOperation::logicalAnd: + opFn = [](auto dst, auto src) { return dst && src; }; + break; + case ValueBinaryOperation::logicalOr: + opFn = [](auto dst, auto src) { return dst || src; }; + break; + default: throw LogicException(LogicExceptionErrors::illegalState); } } @@ -1110,6 +1368,34 @@ return { std::make_unique(state) }; } + void ComputeContext::WhileImpl(Scalar test, std::function fn) + { + if (!(test.GetValue().IsConstant())) + { + throw InputException(InputExceptionErrors::invalidArgument, "test value must be constant for ComputeContext"); + } + + std::visit( + [&](auto&& data) { + using Type = std::remove_pointer_t>; + if constexpr (IsOneOf) + { + auto testVal = test.Get(); + + while (testVal) + { + fn(); + testVal = test.Get(); + } + } + else + { + // unsupported test type; arguably this should throw LogicException(LogicExceptionErrors::illegalState) rather than silently doing nothing 
+ } + }, + test.GetValue().GetUnderlyingData()); + } + std::optional ComputeContext::CallImpl(FunctionDeclaration func, std::vector args) { if (!std::all_of(args.begin(), args.end(), [this](const Value& value) { return ValidateValue(value); })) @@ -1123,9 +1409,18 @@ namespace value return IntrinsicCall(func, args); } - if (auto it = _definedFunctions.find(func); it != _definedFunctions.end()) + if (func.IsPointerSet()) { - return it->second(args); + auto ptr = func.GetPointer(); + return (*reinterpret_cast(reinterpret_cast(ptr.Get())))(args); + } + + { + std::lock_guard lock{ _mutex }; + if (auto it = _definedFunctions.find(func); it != _definedFunctions.end()) + { + return it->second(args); + } } throw InputException(InputExceptionErrors::invalidArgument, "Specified function is not defined for this context"); @@ -1136,6 +1431,8 @@ namespace value void ComputeContext::ParallelizeImpl(int numTasks, std::vector captured, std::function)> fn) { + ThreadIds.Clear(); + std::vector> futures; futures.reserve(numTasks); for (int i = 0; i < numTasks; ++i) @@ -1178,6 +1475,11 @@ namespace value } } // namespace + void ComputeContext::DebugBreakImpl() + { + throw 0; // TODO: throw a real exception (of type value::Exception::DebugTrapException, perhaps) + } + void ComputeContext::DebugDumpImpl(Value value, std::string tag, std::ostream& stream) const { PrintValue(value, stream); @@ -1219,11 +1521,13 @@ namespace value void ComputeContext::SetNameImpl(const Value& value, const std::string& name) { + std::lock_guard lock{ _mutex }; _namedValues[value] = name; } std::string ComputeContext::GetNameImpl(const Value& value) const { + std::lock_guard lock{ _mutex }; if (auto it = _namedValues.find(value); it != _namedValues.end()) { return it->second; @@ -1232,6 +1536,21 @@ namespace value return {}; } + void ComputeContext::ImportCodeFileImpl(std::string) { throw LogicException(LogicExceptionErrors::notImplemented); } + + Scalar ComputeContext::GetFunctionAddressImpl(const FunctionDeclaration& fn) + { + { + std::lock_guard lock{ _mutex }; + if (auto it = _definedFunctions.find(fn); it != _definedFunctions.end()) + { + return reinterpret_cast(reinterpret_cast(&(it->second))); + } + } + + throw InputException(InputExceptionErrors::invalidArgument, "ComputeContext can't take address of function that hasn't been defined"); + } + Value ComputeContext::IntrinsicCall(FunctionDeclaration intrinsic, std::vector args) { static std::unordered_map)>> intrinsics = { @@ -1251,6 +1570,10 @@ namespace value { FloorFunctionDeclaration, SimpleNumericalFunctionIntrinsic{}([](auto n) { return std::floor(n); }) }, { CeilFunctionDeclaration, SimpleNumericalFunctionIntrinsic{}([](auto n) { return std::ceil(n); }) }, { CopySignFunctionDeclaration, CopySignFunctionIntrinsic{} }, + { FmaFunctionDeclaration, FmaFunctionIntrinsic{} }, + { MemCopyFunctionDeclaration, MemOpFunctionIntrinsic{} }, + { MemMoveFunctionDeclaration, MemOpFunctionIntrinsic{} }, + { MemSetFunctionDeclaration, MemOpFunctionIntrinsic{} }, }; if (auto it = intrinsics.find(intrinsic); it != intrinsics.end()) @@ -1273,10 +1596,13 @@ namespace value auto pointerLevel1 = value1.PointerLevel(); auto pointerLevel2 = value2.PointerLevel(); - if (value1.GetBaseType() == value2.GetBaseType() && - pointerLevel1 == pointerLevel2 && + if (pointerLevel1 == pointerLevel2 && pointerLevel1 == 1) { + if (value1.GetBaseType() != value2.GetBaseType()) + { + throw InputException(InputExceptionErrors::typeMismatch); + } return true; } @@ -1322,6 +1648,7 @@ namespace value // Our stack 
always has one empty "scope" pushed to it, which we // can use to create our global prefix. + std::lock_guard lock{ _mutex }; return GetGlobalScopedName(GetTopFrame().first + "_" + name); } @@ -1329,5 +1656,16 @@ namespace value const ComputeContext::Frame& ComputeContext::GetTopFrame() const { return _stack.top(); } + void swap(ComputeContext& l, ComputeContext& r) noexcept + { + using std::swap; + + swap(static_cast(l), static_cast(r)); + swap(l._stack, r._stack); + swap(l._globals, r._globals); + swap(l._definedFunctions, r._definedFunctions); + swap(l._namedValues, r._namedValues); + swap(l._moduleName, r._moduleName); + } } // namespace value } // namespace ell diff --git a/libraries/value/src/CppEmitterContext.cpp b/libraries/value/src/CppEmitterContext.cpp new file mode 100644 index 000000000..e3b8ce406 --- /dev/null +++ b/libraries/value/src/CppEmitterContext.cpp @@ -0,0 +1,1753 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: CppEmitterContext.cpp (value) +// Authors: Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "CppEmitterContext.h" +#include "FunctionDeclaration.h" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace ell::utilities; + +namespace ell +{ +namespace value +{ + struct CppEmitterContext::FunctionScope + { + FunctionScope(CppEmitterContext& context, const std::string& fnName) : + _context(context), + _guard(*this) + { + _context._fnStacks.push({ {}, fnName }); + _context._promotedConstantStack.push({}); + } + + ~FunctionScope() + { + _context._fnStacks.pop(); + _context._promotedConstantStack.pop(); + } + + private: + struct StreamGuard + { + StreamGuard(FunctionScope& context) : + _context(context), + _oldStream(_context._context._stream), + _oldIndent(_context._context._indent) + { + _context._context._indent = 0; + _context._context._stream = _context._sstr; + } + + ~StreamGuard() + { + _context._context._stream = _oldStream; + _context._context._indent = _oldIndent; + + // write our contents directly to the expression stream because the old stream + // might point to someone else's stream + _context._context._expressionStream << _context._sstr.str(); + } + + private: + FunctionScope& _context; + std::reference_wrapper _oldStream; + decltype(CppEmitterContext::_indent) _oldIndent; + }; + + CppEmitterContext& _context; + std::stringstream _sstr; + StreamGuard _guard; + }; + + CppEmitterContext::CppEmitterContext(std::string moduleName, std::ostream& stream) : + CppEmitterContext(emitters::GetTargetDevice("host"), moduleName, stream) + { + } + + CppEmitterContext::CppEmitterContext(std::string moduleName, std::unique_ptr stream) : + CppEmitterContext(emitters::GetTargetDevice("host"), moduleName, std::move(stream)) + { + } + + CppEmitterContext::CppEmitterContext(const TargetDevice& target, std::string moduleName, std::unique_ptr stream) : + CppEmitterContext(target, moduleName, *stream) + { + _ownedStream = std::move(stream); + } + + CppEmitterContext::CppEmitterContext(const TargetDevice& target, std::string moduleName, std::ostream& stream) : + EmitterContext(target), + _computeContext(moduleName), + _stream(_expressionStream), + _outputStream(stream), + _moduleName(std::move(moduleName)) + { + Global() << "// Instantiating CppEmitterContext\n" + "// Writing " + << _moduleName << 
".cpp\n" + << "\n" + "#include \n" + "#include \n" + "#include \n" + "#include \n" + "#include \n" + "#include \n" + "#include \n" + "#include \n" + "\n" + "\n" + "#if !defined(VALUE_CPP_EMITTER_HELPERS_DEFINED)\n" + "#define VALUE_CPP_EMITTER_HELPERS_DEFINED\n" + "template using Scalar = std::array;\n" + "#endif // VALUE_CPP_EMITTER_HELPERS_DEFINED\n" + "\n" + "namespace {\n"; + } + + CppEmitterContext::~CppEmitterContext() + { + Out() << "\n// Cleaning up CppEmitterContext" << std::endl; + + _outputStream.get() << _globalStream.str() + << "} // namespace \n" + << _fnDeclStream.str() << "\n" + << _expressionStream.str() << std::endl; + + _outputStream.get().flush(); + } + + namespace + { + std::string ValueTypeToCTypeString(ValueType type) + { + switch (type) + { + case ValueType::Void: + return "void"; + case ValueType::Boolean: + return "bool"; + case ValueType::Byte: + return "uint8_t"; + case ValueType::Char8: + return "int8_t"; + case ValueType::Double: + return "double"; + case ValueType::Float: + return "float"; + case ValueType::Int16: + return "int16_t"; + case ValueType::Int32: + return "int32_t"; + case ValueType::Int64: + return "int64_t"; + default: + throw LogicException(LogicExceptionErrors::illegalState); + } + } + + std::string ValueTypeToCTypeString(detail::ValueTypeDescription desc, size_t size, bool forcePointer = false) + { + std::string str; + if (!forcePointer && desc.second == 1) + { + if (size == 1) + { + str = "Scalar<" + ValueTypeToCTypeString(desc.first) + ">"; + } + else + { + str = "std::array<" + ValueTypeToCTypeString(desc.first) + ", " + std::to_string(size) + ">"; + } + } + else + { + str = ValueTypeToCTypeString(desc.first); + str.insert(str.end(), desc.second, '*'); + } + return str; + } + + std::string ValueToCString(const Value& value, bool forcePointer = false) + { + size_t size{}; + if (!value.IsConstrained()) + { + if (!forcePointer) + { + throw LogicException(LogicExceptionErrors::illegalState, "Can't create concrete allocation for value with no known layout"); + } + } + else + { + size = value.GetLayout().GetMemorySize(); + } + auto str = ValueTypeToCTypeString(value.GetType(), size, forcePointer); + return str; + } + + template + std::string TypeToCTypeString(T) + { + +#define BEGIN_TYPE_TO_CTYPE_STRING_MAP \ + if constexpr (false) \ + { \ + } +#define ADD_TYPE_TO_CTYPE_STRING_STRING(TYPE, STR) \ + else if constexpr (std::is_same_v) { return #STR; } +#define ADD_TYPE_TO_CTYPE_STRING(TYPE) ADD_TYPE_TO_CTYPE_STRING_STRING(TYPE, TYPE) +#define END_TYPE_TO_CTYPE_STRING_MAP \ + else { static_assert(utilities::FalseType{}, "Unknown type"); } + + BEGIN_TYPE_TO_CTYPE_STRING_MAP + ADD_TYPE_TO_CTYPE_STRING(bool) + ADD_TYPE_TO_CTYPE_STRING_STRING(char, int8_t) + ADD_TYPE_TO_CTYPE_STRING(uint8_t) + ADD_TYPE_TO_CTYPE_STRING(int16_t) + ADD_TYPE_TO_CTYPE_STRING(int32_t) + ADD_TYPE_TO_CTYPE_STRING(int64_t) + ADD_TYPE_TO_CTYPE_STRING(float) + ADD_TYPE_TO_CTYPE_STRING(double) + ADD_TYPE_TO_CTYPE_STRING(void) + END_TYPE_TO_CTYPE_STRING_MAP + } + + template > + void PrintVector(StreamType&& stream, const std::vector& v, const std::string& delim = ", ") + { + using RealT = std::conditional_t; + if (!v.empty()) + { + std::copy(v.begin(), v.end() - 1, std::ostream_iterator{ stream, delim.c_str() }); + stream << static_cast(v.back()); + } + } + } // namespace + + Value CppEmitterContext::AllocateImpl(ValueType type, MemoryLayout layout, size_t alignment, AllocateFlags flags) + { + if (alignment != 0 || flags != AllocateFlags::None) + { + throw 
LogicException(LogicExceptionErrors::notImplemented); + } + + // TODO: add alignment directive + return AllocateImpl({ type, 1 }, layout, "{}; // " + layout.ToString() + "\n"); + } + + Value CppEmitterContext::AllocateImpl( + detail::ValueTypeDescription type, + std::optional layout, + std::string initializationString, + std::optional name, + bool forcePointer) + { + CppEmitterContext::ValueImpl data{ name.value_or(UniqueName("v")), type }; + + auto& dataList = _fnStacks.top().dataList; + + dataList.push_front(std::move(data)); + auto& front = dataList.front(); + + Emittable emittable{ &front }; + + Value value(emittable, layout); + + Out() << ValueToCString(value, forcePointer) << " " << front.name << initializationString; + + return value; + } + + std::optional CppEmitterContext::GetGlobalValue(GlobalAllocationScope scope, std::string name) + { + std::string adjustedName = GetScopeAdjustedName(scope, name); + if (auto it = _globals.find(adjustedName); it != _globals.end()) + { + return Value(it->second.first, it->second.second); + } + + return std::nullopt; + } + + Value CppEmitterContext::GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ConstantData data, MemoryLayout layout, AllocateFlags flags) + { + std::string adjustedName = GetScopeAdjustedName(scope, name); + + auto it = _globals.find(adjustedName); + if (it != _globals.end()) + { + throw InputException(InputExceptionErrors::invalidArgument, + "Unexpected collision in global data allocation"); + } + + auto [type, definitionString] = std::visit( + [](auto&& vectorData) -> std::pair { + using Type = std::decay_t; + using VectorElementType = typename Type::value_type; + using ElementType = std::conditional_t, bool, VectorElementType>; + + std::stringstream sstr; + + sstr << " = { "; + PrintVector(sstr, vectorData); + sstr << " };\n"; + + return { GetValueType(), sstr.str() }; + }, + data); + + CppEmitterContext::ValueImpl valueDesc{ adjustedName, { type, 1 } }; + + _globalsList.push_front(std::move(valueDesc)); + auto& front = _globalsList.front(); + + Emittable emittable{ &front }; + _globals.insert(it, { adjustedName, { emittable, layout } }); + + Value value(emittable, layout); + + std::string prefix = [flags] { + switch (flags) + { + case AllocateFlags::ThreadLocal: + return "thread_local "; + default: + return ""; + } + }(); + + Global() << prefix << ValueToCString(value) << " " << adjustedName << definitionString; + + return value; + } + + Value CppEmitterContext::GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ValueType type, MemoryLayout layout, AllocateFlags flags) + { + std::string adjustedName = GetScopeAdjustedName(scope, name); + + auto it = _globals.find(adjustedName); + if (it != _globals.end()) + { + throw InputException(InputExceptionErrors::invalidArgument, + "Unexpected collision in global data allocation"); + } + + CppEmitterContext::ValueImpl valueDesc{ adjustedName, { type, 1 } }; + + _globalsList.push_front(std::move(valueDesc)); + auto& front = _globalsList.front(); + + Emittable emittable{ &front }; + _globals.insert(it, { adjustedName, { emittable, layout } }); + + Value value(emittable, layout); + + std::string prefix = [flags] { + switch (flags) + { + case AllocateFlags::ThreadLocal: + return "thread_local "; + default: + return ""; + } + }(); + + Global() << prefix << ValueToCString(value) << " " << adjustedName << "{}; // " << layout << "\n"; + + return value; + } + + detail::ValueTypeDescription CppEmitterContext::GetTypeImpl(Emittable emittable) + { + return 
emittable.GetDataAs()->typeDesc; + } + + void CppEmitterContext::DeclareFunction(FunctionDeclaration decl) + { + auto [it, inserted] = _declaredFunctions.emplace(decl.GetFunctionName()); + if (!inserted) + { + // already declared + return; + } + + WriteFunctionSignature(FnDecl(), decl) << ";\n"; + } + + std::ostream& CppEmitterContext::WriteFunctionSignature(std::ostream& stream, FunctionDeclaration decl) + { + const auto& argValues = decl.GetParameterTypes(); + const auto& returnValue = decl.GetReturnType(); + const auto& fnName = decl.GetFunctionName(); + const auto isPublic = decl.IsPublic(); + + std::vector functionArgs; + functionArgs.reserve(argValues.size()); + for (auto index = 0u; index < argValues.size(); ++index) + { + auto& arg = argValues[index]; + functionArgs.push_back( + ValueTypeToCTypeString( + arg.GetType(), + arg.IsConstrained() ? arg.GetLayout().GetMemorySize() : 0, + true) + + " arg_" + std::to_string(index) + "/* " + (arg.IsConstrained() ? arg.GetLayout().ToString() : std::string{ "unconstrained" }) + " */"); + } + + std::string returnType = returnValue ? ValueToCString(*returnValue) : ValueTypeToCTypeString(ValueType::Void); + + stream << (isPublic ? "" : "static ") << returnType << " " << fnName << "("; + PrintVector(stream, functionArgs); + stream << ")"; + + return stream; + } + + EmitterContext::DefinedFunction + CppEmitterContext::CreateFunctionImpl(FunctionDeclaration decl, DefinedFunction fn) + { + if (const auto& intrinsics = GetIntrinsics(); + std::find(intrinsics.begin(), intrinsics.end(), decl) != intrinsics.end()) + { + throw InputException(InputExceptionErrors::invalidArgument, "Specified function is an intrinsic"); + } + + if (auto it = _definedFunctions.find(decl); it != _definedFunctions.end()) + { + return it->second; + } + + DeclareFunction(decl); + + const auto& argValues = decl.GetParameterTypes(); + const auto& fnName = decl.GetFunctionName(); + + { + FunctionScope scope(*this, fnName); + + // create function sig + WriteFunctionSignature(Out(), decl) << " {\n"; + + auto params = argValues; + for (auto index = 0u; index < params.size(); ++index) + { + auto& param = params[index]; + CppEmitterContext::ValueImpl data{ "arg_" + std::to_string(index), param.GetType() }; + + auto& dataList = _fnStacks.top().dataList; + + dataList.push_front(std::move(data)); + auto& front = dataList.front(); + + Emittable emittable{ &front }; + + param.SetData(emittable); + } + + Indented([&] { + auto fnReturnValue = fn(params); + if (fnReturnValue) + { + auto emittableReturn = EnsureEmittable(*fnReturnValue); + Out() << "return " << emittableReturn.GetName() << ";\n"; + } + }); + + Out() << "} \n" + << std::endl; + } + + DefinedFunction returnFn = [this, decl](std::vector args) -> std::optional { + const auto& argValues = decl.GetParameterTypes(); + const auto& returnValue = decl.GetReturnType(); + const auto& fnName = decl.GetFunctionName(); + + if (!std::equal(args.begin(), + args.end(), + argValues.begin(), + argValues.end(), + [](Value suppliedValue, Value fnValue) { + return suppliedValue.GetBaseType() == fnValue.GetBaseType() && + (suppliedValue.PointerLevel() == fnValue.PointerLevel() || + suppliedValue.PointerLevel() == fnValue.PointerLevel() + 1); + })) + { + throw InputException(InputExceptionErrors::invalidArgument); + } + + std::vector params; + for (auto index = 0u; index < args.size(); ++index) + { + auto arg = EnsureEmittable(args[index]); + auto& expected = argValues[index]; + + std::string param = "&" + ScalarToString(arg); + if (arg.PointerLevel() 
== expected.PointerLevel() + 1) + { + param = "*(" + param + ")"; + } + params.push_back(param); + } + + std::stringstream funcCallStream; + funcCallStream << fnName << "("; + PrintVector(funcCallStream, params); + funcCallStream << ")"; + auto funCallString = funcCallStream.str(); + + std::optional fnReturnValue; + if (returnValue) + { + auto typeDesc = returnValue->GetType(); + auto layout = returnValue->IsConstrained() ? std::optional{ returnValue->GetLayout() } : std::optional{}; + bool originalScalar = false; + if (typeDesc.second == 0) + { + typeDesc.second = 1; + originalScalar = true; + } + std::string initStr = std::string{ " = " } + + (originalScalar ? "{ " : "") + funCallString + (originalScalar ? " }" : "") + + "; // " + (layout ? layout->ToString() : std::string{ "unconstrained" }) + "\n\n"; + + fnReturnValue = AllocateImpl(typeDesc, originalScalar ? ScalarLayout : layout, initStr); + } + else + { + Out() << funCallString << ";\n\n"; + } + + return fnReturnValue; + }; + + _definedFunctions[decl] = returnFn; + + return returnFn; + } + + bool CppEmitterContext::IsFunctionDefinedImpl(FunctionDeclaration decl) const + { + if (const auto& intrinsics = GetIntrinsics(); + std::find(intrinsics.begin(), intrinsics.end(), decl) != intrinsics.end()) + { + return true; + } + + return _definedFunctions.find(decl) != _definedFunctions.end(); + } + + Value CppEmitterContext::StoreConstantDataImpl(ConstantData data) + { + return _computeContext.StoreConstantData(data); + } + + void CppEmitterContext::ForImpl(MemoryLayout layout, std::function)> fn, const std::string& name) + { + struct Range + { + Scalar start; + Scalar stop; + Scalar step; + }; + + using LooperFn = std::function, std::vector, std::function)>)>; + + const auto& logicalOrder = layout.GetLogicalDimensionOrder(); + + LooperFn looper = [this, &looper, &logicalOrder, &name](std::vector ranges, std::vector indices, std::function)> bodyFn) { + if (ranges.empty()) + { + std::vector logicalIndices(indices.size()); + for (auto index = 0u; index < indices.size(); ++index) + { + logicalIndices[logicalOrder[index]] = indices[index]; + } + bodyFn(logicalIndices); + } + else + { + Range range = std::move(ranges.front()); + ranges.erase(ranges.begin()); + + ForImpl( + range.start, range.stop, range.step, [=, &looper, &bodyFn](Scalar index) mutable { + indices.push_back(index); + looper(ranges, indices, bodyFn); + }, + name); + } + }; + + std::vector ranges; + ranges.reserve(layout.NumDimensions()); + for (auto index = 0; index < layout.NumDimensions(); ++index) + { + ranges.push_back({ Value(0), Value(layout.GetActiveSize(index)), Value(1) }); + } + + looper(ranges, {}, fn); + } + + void CppEmitterContext::ForImpl(Scalar start, Scalar stop, Scalar step, std::function fn, const std::string& name) + { + auto startStr = ScalarToString(start); + auto index = AllocateImpl({ ValueType::Int32, 1 }, ScalarLayout, "{ " + startStr + " };\n"); + auto indexStr = index.GetName(); + auto stopStr = ScalarToString(stop); + auto stepStr = ScalarToString(step); + std::string optionalTag; + if (!name.empty()) + { + optionalTag = " // " + UniqueName(name + " loop"); + } + + Out() << "for (;" << indexStr << "[0] < " << stopStr + << "; " << indexStr << "[0] += " << stepStr << ") {" << optionalTag << "\n"; + + Indented([&] { fn(index); }); + + Out() << "}" << optionalTag << "\n\n"; + } + + void CppEmitterContext::MoveDataImpl(Value& source, Value& destination) + { + // we treat a move the same as a copy, except we clear out the source + CopyDataImpl(source, 
destination); + + // data has been "moved", so clear the source + source.Reset(); + } + + void CppEmitterContext::CopyDataImpl(const Value& source, Value& destination) + { + if (destination.IsConstant()) + { + if (source.IsConstant()) + { + return _computeContext.CopyData(source, destination); + } + else + { + destination.SetData(AllocateImpl(source.GetType(), source.GetLayout(), "{ " + source.GetName() + " };\n")); + } + } + else + { + if (!source.IsConstant() && source.Get().GetDataAs() == destination.Get().GetDataAs()) + { + return; + } + + if (auto& layout = source.GetLayout(); layout == destination.GetLayout()) + { + if (layout == ScalarLayout) + { + Out() << ScalarToString(destination) << " = " << ScalarToString(source) << ";\n"; + } + else + { + auto realizedSource = EnsureEmittable(source); + Out() << "std::copy_n(&" << GetNameImpl(realizedSource) << "[0], " + << realizedSource.GetLayout().GetMemorySize() << ", &" + << GetNameImpl(destination) << "[0]);\n"; + } + } + else + { + throw LogicException(LogicExceptionErrors::notImplemented); + } + } + } + + Value CppEmitterContext::ReferenceImpl(Value sourceValue) + { + auto source = Realize(sourceValue); + if (source.IsConstant()) + { + return _computeContext.Reference(source); + } + + auto typeDesc = source.GetType(); + ++typeDesc.second; + + auto sourceName = source.GetName(); + auto tempOffsetValue = Offset(sourceValue, 0); + + auto value = AllocateImpl( + typeDesc, + sourceValue.IsConstrained() ? sourceValue.GetLayout() : ScalarLayout, + " = &" + tempOffsetValue.GetName() + ";\n", + UniqueName(sourceName + "_ref"), + true); + + if (!sourceValue.IsConstrained()) + { + value.ClearLayout(); + } + + return value; + } + + Value CppEmitterContext::DereferenceImpl(Value sourceValue) + { + auto source = Realize(sourceValue); + if (source.IsConstant()) + { + return _computeContext.Dereference(source); + } + + auto typeDesc = source.GetType(); + --typeDesc.second; + + auto sourceName = source.GetName(); + + auto value = AllocateImpl( + typeDesc, + sourceValue.IsConstrained() ? 
sourceValue.GetLayout() : ScalarLayout, + " = " + sourceName + "[0];\n", + UniqueName(sourceName + "_ref"), + true); + + if (!sourceValue.IsConstrained()) + { + value.ClearLayout(); + } + + return value; + } + + Value CppEmitterContext::OffsetImpl(Value source, Value offset) + { + if (offset.GetLayout() != ScalarLayout) + { + throw LogicException(LogicExceptionErrors::illegalState); + } + + if (source.IsConstant() && offset.IsConstant()) + { + return _computeContext.Offset(source, offset); + } + else + { + auto emittableSource = EnsureEmittable(source); + auto sourceName = emittableSource.GetName(); + + std::string initString = " = &" + sourceName + "[0]"; + if (auto offsetString = ScalarToString(offset); offsetString != "0") + { + initString += " + " + offsetString; + } + initString += ";\n"; + + auto value = AllocateImpl(source.GetType(), std::nullopt, initString, UniqueName(sourceName + "_offset"), true); + + return value; + } + } + + Value CppEmitterContext::UnaryOperationImpl(ValueUnaryOperation op, Value destination) + { + throw LogicException(LogicExceptionErrors::notImplemented); + } + Value CppEmitterContext::BinaryOperationImpl(ValueBinaryOperation op, Value destination, Value source) + { + if (destination.IsConstant() && source.IsConstant()) + { + return _computeContext.BinaryOperation(op, destination, source); + } + + std::string opStr; + bool canSelfAssign; + std::tie(opStr, canSelfAssign) = [op]() -> std::pair { + switch (op) + { + case ValueBinaryOperation::add: + return { " += ", true }; + case ValueBinaryOperation::divide: + return { " /= ", true }; + case ValueBinaryOperation::modulus: + return { " %= ", true }; + case ValueBinaryOperation::multiply: + return { " *= ", true }; + case ValueBinaryOperation::subtract: + return { " -= ", true }; + case ValueBinaryOperation::logicalAnd: + return { " && ", false }; + case ValueBinaryOperation::logicalOr: + return { " || ", false }; + default: + throw LogicException(LogicExceptionErrors::illegalState); + } + }(); + + if (destination.IsDefined()) + { + if (destination.GetLayout() != source.GetLayout()) + { + throw LogicException(LogicExceptionErrors::illegalState); + } + if (destination.GetBaseType() != source.GetBaseType()) + { + throw LogicException(LogicExceptionErrors::illegalState); + } + } + else + { + if (!source.IsConstrained()) + { + throw LogicException(LogicExceptionErrors::illegalState); + } + return source; + } + + const auto& layout = destination.GetLayout(); + auto destStr = GetNameImpl(destination); + auto srcStr = GetNameImpl(source); + if (layout == ScalarLayout) + { + auto sourceString = ScalarToString(source); + switch (op) + { + case ValueBinaryOperation::add: + [[fallthrough]]; + case ValueBinaryOperation::subtract: + if (sourceString == "0") // destination = destination { +, - } 0 + { + return destination; + } + break; + case ValueBinaryOperation::divide: + [[fallthrough]]; + case ValueBinaryOperation::multiply: + if (sourceString == "1") // destination = destination { /, * } 1 + { + return destination; + } + break; + default: + break; + } + if (canSelfAssign) + { + Out() << destStr << "[0]" << opStr << sourceString << ";\n"; + } + else + { + Out() << destStr << "[0] = " << destStr << "[0]" << opStr << sourceString << ";\n"; + } + } + else + { + auto emittableSource = EnsureEmittable(source); + srcStr = emittableSource.GetName(); + + auto iterationVariable = UniqueName("index"); + Out() << "for (size_t " << iterationVariable << " = 0; " << iterationVariable << " < " << layout.GetMemorySize() << "; " << 
iterationVariable << " += " << layout.GetCumulativeIncrement(layout.NumDimensions() - 1) << ") {\n"; + + Indented([&] { + if (canSelfAssign) + { + Out() << destStr << "[" << iterationVariable << "]" << opStr << srcStr << "[" << iterationVariable << "];\n"; + } + else + { + Out() << destStr << "[" << iterationVariable << "] = " << destStr << "[" << iterationVariable << "]" << opStr << srcStr << "[" << iterationVariable << "];\n"; + } + }); + + Out() << "}\n\n"; + } + return destination; + } + + Value CppEmitterContext::LogicalOperationImpl(ValueLogicalOperation op, Value source1, Value source2) + { + if (source1.IsConstant() && source2.IsConstant()) + { + return _computeContext.LogicalOperation(op, source1, source2); + } + + auto opStr = [op]() -> std::string { + switch (op) + { + case ValueLogicalOperation::equality: + return " == "; + case ValueLogicalOperation::inequality: + return " != "; + case ValueLogicalOperation::greaterthan: + return " > "; + case ValueLogicalOperation::greaterthanorequal: + return " >= "; + case ValueLogicalOperation::lessthan: + return " < "; + case ValueLogicalOperation::lessthanorequal: + return " <= "; + default: + throw LogicException(LogicExceptionErrors::illegalState); + } + }(); + + if (source1.GetLayout() == source2.GetLayout()) + { + std::string initString; + if (source1.GetLayout() == ScalarLayout) + { + initString = ScalarToString(source1) + opStr + ScalarToString(source2); + } + else + { + auto emittableSource1 = EnsureEmittable(source1); + auto emittableSource2 = EnsureEmittable(source2); + + initString = GetNameImpl(emittableSource1) + opStr + GetNameImpl(emittableSource2); + } + + return AllocateImpl({ ValueType::Boolean, 1 }, ScalarLayout, "{ " + initString + " };\n"); + } + else + { + throw LogicException(LogicExceptionErrors::notImplemented); + } + } + + Value CppEmitterContext::CastImpl(Value value, ValueType type) + { + if (value.IsConstant()) + { + return _computeContext.Cast(value, type); + } + + if (value.PointerLevel() != 1) + { + throw LogicException(LogicExceptionErrors::illegalState); + } + + if (value.GetBaseType() == type) + { + return value; + } + + if (auto& layout = value.GetLayout(); layout == ScalarLayout) + { + std::string initString = "{ static_cast<" + ValueTypeToCTypeString(type) + ">(" + ScalarToString(value) + ") };\n"; + return AllocateImpl({ type, 1 }, layout, initString); + } + else + { + auto returnValue = Allocate(type, value.GetLayout()); + For(0, + static_cast(layout.GetMemorySize()), + 1, + [&](Scalar index) { + Out() << returnValue.GetName() << "[" << ScalarToString(index) << "] = static_cast<" << ValueTypeToCTypeString(type) + << ">(" << value.GetName() << "[" << ScalarToString(index) << "]);\n"; + }); + return returnValue; + } + } + + class CppEmitterContext::IfContextImpl : public EmitterContext::IfContextImpl + { + public: + IfContextImpl(CppEmitterContext& context, Scalar test, std::function fn) : + _context(context) + { + StreamGuard guard{ *this }; + _context.Out() << "if (" << _context.ScalarToString(test) << ") {\n"; + _context.Indented(fn); + _context.Out() << "}"; + } + + ~IfContextImpl() + { + _sstr << "\n"; + _context._stream.get() << _sstr.str(); + } + + void ElseIf(Scalar test, std::function fn) override + { + StreamGuard guard{ *this }; + _context._stream.get() << " else if (" << _context.ScalarToString(test) << ") {\n"; + _context.Indented(fn); + _context.Out() << "}"; + } + + void Else(std::function fn) override + { + StreamGuard guard{ *this }; + _context._stream.get() << " else {\n"; + 
_context.Indented(fn); + _context.Out() << "}\n"; + } + + private: + struct StreamGuard + { + StreamGuard(IfContextImpl& context) : + _context(context), + _oldStream(_context._context._stream) + { + _context._context._stream = _context._sstr; + } + + ~StreamGuard() + { + _context._context._stream = _oldStream; + } + + private: + IfContextImpl& _context; + std::reference_wrapper _oldStream; + }; + + CppEmitterContext& _context; + std::stringstream _sstr; + }; + + EmitterContext::IfContext CppEmitterContext::IfImpl(Scalar test, std::function fn) + { + return EmitterContext::IfContext{ std::make_unique(*this, test, fn) }; + } + + void CppEmitterContext::WhileImpl(Scalar test, std::function fn) + { + auto testStr = ScalarToString(test); + std::string optionalTag; + std::string name; + if (!name.empty()) + { + optionalTag = " // " + UniqueName(name + " loop"); + } + + Out() << "while (" << testStr << ") {" << optionalTag << "\n"; + + Indented(fn); + + Out() << "}" << optionalTag << "\n\n"; + } + + std::optional CppEmitterContext::CallImpl(FunctionDeclaration func, std::vector args) + { + if (std::any_of(args.begin(), args.end(), [](const auto& value) { return value.IsEmpty(); })) + { + throw InputException(InputExceptionErrors::invalidArgument); + } + + const auto& intrinsics = GetIntrinsics(); + if (std::find(intrinsics.begin(), intrinsics.end(), func) != intrinsics.end()) + { + return IntrinsicCall(func, args); + } + + if (auto it = _definedFunctions.find(func); it != _definedFunctions.end()) + { + return it->second(args); + } + + return EmitExternalCall(func, args); + } + + Value CppEmitterContext::SimpleNumericIntrinsic(FunctionDeclaration intrinsic, std::vector args) + { + if (args.size() != 1) + { + throw InputException(InputExceptionErrors::invalidSize); + } + + const auto& value = args[0]; + if (value.GetBaseType() == ValueType::Boolean) + { + throw InputException(InputExceptionErrors::typeMismatch); + } + + auto typeDesc = value.GetType(); + if (typeDesc.first != ValueType::Float) + { + typeDesc.first = ValueType::Double; + } + + auto returnLayout = value.IsConstrained() ? 
value.GetLayout() : ScalarLayout; + auto fnName = ToLowercase(intrinsic.GetFunctionName()); + + if (returnLayout == ScalarLayout) + { + return AllocateImpl(typeDesc, returnLayout, "{ std::" + fnName + "(" + ScalarToString(value) + ") };\n"); + } + else + { + auto result = Allocate(typeDesc.first, returnLayout); + auto valueStr = ScalarToString(value); + Out() << "std::transform(&" << valueStr + << ", &" << valueStr << " + " << returnLayout.GetMemorySize() + << ", &" << ScalarToString(result) + << ", [](decltype(" << valueStr << ") x) { return std::" << fnName << "(x); });\n"; + return result; + } + } + + Value CppEmitterContext::MaxMinIntrinsic(FunctionDeclaration intrinsic, std::vector args) + { + if (args.size() == 1) + { + const auto& value = args[0]; + if (value.GetBaseType() == ValueType::Boolean) + { + throw InputException(InputExceptionErrors::typeMismatch); + } + + std::string fnName; + if (intrinsic == MaxNumFunctionDeclaration) + { + fnName = "std::max_element"; + } + else if (intrinsic == MinNumFunctionDeclaration) + { + fnName = "std::min_element"; + } + else + { + throw LogicException(LogicExceptionErrors::illegalState); + } + + auto valueStr = ScalarToString(value); + return AllocateImpl( + { value.GetBaseType(), 1 }, + ScalarLayout, + "{ *" + fnName + "(&" + valueStr + + ", &" + valueStr + " + " + std::to_string(value.GetLayout().GetMemorySize()) + + ") };\n"); + } + else if (args.size() == 2) + { + const auto& value1 = args[0]; + const auto& value2 = args[1]; + if (value1.GetBaseType() != value2.GetBaseType()) + { + throw InputException(InputExceptionErrors::typeMismatch); + } + + if (value1.GetBaseType() == ValueType::Boolean) + { + throw InputException(InputExceptionErrors::typeMismatch); + } + + if ((value1.IsConstrained() && value1.GetLayout() != ScalarLayout) || + (value2.IsConstrained() && value2.GetLayout() != ScalarLayout)) + { + throw InputException(InputExceptionErrors::invalidSize); + } + + std::string fnName; + if (intrinsic == MaxNumFunctionDeclaration) + { + fnName = "std::max"; + } + else if (intrinsic == MinNumFunctionDeclaration) + { + fnName = "std::min"; + } + else + { + throw LogicException(LogicExceptionErrors::illegalState); + } + + return AllocateImpl( + { value1.GetBaseType(), 1 }, + ScalarLayout, + "{ " + fnName + "(" + ScalarToString(value1) + + ", " + ScalarToString(value2) + ") };\n"); + } + else + { + throw InputException(InputExceptionErrors::invalidSize); + } + } + + Value CppEmitterContext::PowIntrinsic(FunctionDeclaration, std::vector args) + { + if (args.size() != 2) + { + throw InputException(InputExceptionErrors::invalidSize); + } + + const auto& value1 = args[0]; + const auto& value2 = args[1]; + if (value1.GetBaseType() != value2.GetBaseType()) + { + throw InputException(InputExceptionErrors::typeMismatch); + } + + if (value1.GetBaseType() == ValueType::Boolean) + { + throw InputException(InputExceptionErrors::typeMismatch); + } + + if (value2.IsConstrained() && value2.GetLayout() != ScalarLayout) + { + throw InputException(InputExceptionErrors::invalidSize); + } + + auto typeDesc = value1.GetType(); + if (typeDesc.first != ValueType::Float) + { + typeDesc.first = ValueType::Double; + } + + auto returnLayout = value1.IsConstrained() ? 
value1.GetLayout() : ScalarLayout; + std::string fnName = "std::pow"; + + if (returnLayout == ScalarLayout) + { + return AllocateImpl(typeDesc, returnLayout, "{ " + fnName + "(" + ScalarToString(value1) + ", " + ScalarToString(value2) + ") };\n"); + } + else + { + auto result = Allocate(typeDesc.first, returnLayout); + auto valueStr = ScalarToString(value1); + Out() << "std::transform(&" << valueStr + << ", &" << valueStr << " + " << returnLayout.GetMemorySize() + << ", &" << ScalarToString(result) + << ", [&" << value2.GetName() << "](decltype(" << valueStr + << ") x) { return " << fnName << "(x, " << ScalarToString(value2) << "); });\n"; + return result; + } + } + + Value CppEmitterContext::CopySignIntrinsic(FunctionDeclaration, std::vector args) + { + if (args.size() != 2) + { + throw InputException(InputExceptionErrors::invalidSize); + } + + const auto& value1 = args[0]; + const auto& value2 = args[1]; + if (value1.GetBaseType() != value2.GetBaseType()) + { + throw InputException(InputExceptionErrors::typeMismatch); + } + + if (value1.GetBaseType() == ValueType::Boolean) + { + throw InputException(InputExceptionErrors::typeMismatch); + } + + if ((value1.IsConstrained() && value1.GetLayout() != ScalarLayout) || + (value2.IsConstrained() && value2.GetLayout() != ScalarLayout)) + { + throw InputException(InputExceptionErrors::invalidSize); + } + + auto typeDesc = value1.GetType(); + if (typeDesc.first != ValueType::Float) + { + typeDesc.first = ValueType::Double; + } + + return AllocateImpl(typeDesc, ScalarLayout, "{ std::copysign(" + ScalarToString(value1) + ", " + ScalarToString(value2) + ") };\n"); + } + + Value CppEmitterContext::FmaIntrinsic(FunctionDeclaration, std::vector args) + { + if (args.size() != 3) + { + throw InputException(InputExceptionErrors::invalidSize); + } + + if (std::any_of(args.begin(), args.end(), [](Value& value) { return value.IsConstrained() && value.GetLayout() != ScalarLayout; })) + { + throw InputException(InputExceptionErrors::invalidSize); + } + + const auto& value1 = args[0]; + const auto& value2 = args[1]; + const auto& value3 = args[2]; + if (value1.GetBaseType() != value2.GetBaseType() || value1.GetBaseType() != value3.GetBaseType()) + { + throw InputException(InputExceptionErrors::typeMismatch); + } + + if (value1.GetBaseType() == ValueType::Boolean) + { + throw InputException(InputExceptionErrors::typeMismatch); + } + + auto typeDesc = value1.GetType(); + + return AllocateImpl( + typeDesc, + ScalarLayout, + "{ static_cast<" + ValueTypeToCTypeString(typeDesc.first) + ">(std::fma(" + ScalarToString(value1) + ", " + ScalarToString(value2) + ", " + ScalarToString(value3) + ")) };\n"); + } + + Value CppEmitterContext::MemFnIntrinsic(FunctionDeclaration intrinsic, std::vector args) + { + if (args.size() != 3) + { + throw InputException(InputExceptionErrors::invalidSize); + } + + const auto& value1 = args[0]; + const auto& value2 = args[1]; + const auto& value3 = args[2]; + + if (!value3.IsConstrained() || value3.GetLayout() != ScalarLayout) + { + throw InputException(InputExceptionErrors::invalidArgument); + } + + std::string secondValuePrefix; + if (intrinsic == MemSetFunctionDeclaration) + { + assert((value2.IsConstrained() && value2.GetLayout() == ScalarLayout && value2.GetType() == std::pair{ ValueType::Char8, 1 })); + } + else + { + secondValuePrefix = "&"; + } + + auto fnName = ToLowercase(intrinsic.GetFunctionName()); + auto value1Str = ScalarToString(value1); + Out() << "std::" << fnName << "(&" << value1Str << ", " + << secondValuePrefix << 
ScalarToString(value2) + << ", sizeof(" << value1Str << ") * " << ScalarToString(value3) << ");\n"; + + return {}; // unused + } + + Value CppEmitterContext::IntrinsicCall(FunctionDeclaration intrinsic, std::vector args) + { + if (std::all_of(args.begin(), args.end(), [](const auto& value) { return value.IsConstant(); })) + { + // Compute context can handle intrinsic calls with constant data + return *_computeContext.Call(intrinsic, std::vector(args.begin(), args.end())); + } + + static std::unordered_map)> intrinsics{ + { AbsFunctionDeclaration, &CppEmitterContext::SimpleNumericIntrinsic }, + { CosFunctionDeclaration, &CppEmitterContext::SimpleNumericIntrinsic }, + { ExpFunctionDeclaration, &CppEmitterContext::SimpleNumericIntrinsic }, + { LogFunctionDeclaration, &CppEmitterContext::SimpleNumericIntrinsic }, + { Log10FunctionDeclaration, &CppEmitterContext::SimpleNumericIntrinsic }, + { Log2FunctionDeclaration, &CppEmitterContext::SimpleNumericIntrinsic }, + { MaxNumFunctionDeclaration, &CppEmitterContext::MaxMinIntrinsic }, + { MinNumFunctionDeclaration, &CppEmitterContext::MaxMinIntrinsic }, + { PowFunctionDeclaration, &CppEmitterContext::PowIntrinsic }, + { SinFunctionDeclaration, &CppEmitterContext::SimpleNumericIntrinsic }, + { SqrtFunctionDeclaration, &CppEmitterContext::SimpleNumericIntrinsic }, + { TanhFunctionDeclaration, &CppEmitterContext::SimpleNumericIntrinsic }, + { RoundFunctionDeclaration, &CppEmitterContext::SimpleNumericIntrinsic }, + { FloorFunctionDeclaration, &CppEmitterContext::SimpleNumericIntrinsic }, + { CeilFunctionDeclaration, &CppEmitterContext::SimpleNumericIntrinsic }, + { CopySignFunctionDeclaration, &CppEmitterContext::CopySignIntrinsic }, + { FmaFunctionDeclaration, &CppEmitterContext::FmaIntrinsic }, + { MemCopyFunctionDeclaration, &CppEmitterContext::MemFnIntrinsic }, + { MemMoveFunctionDeclaration, &CppEmitterContext::MemFnIntrinsic }, + { MemSetFunctionDeclaration, &CppEmitterContext::MemFnIntrinsic }, + }; + + std::vector emittableArgs; + emittableArgs.reserve(args.size()); + std::transform(args.begin(), args.end(), std::back_inserter(emittableArgs), [this](const auto& value) { return EnsureEmittable(value); }); + + return std::invoke(intrinsics.at(intrinsic), this, intrinsic, emittableArgs); + } + + std::optional CppEmitterContext::EmitExternalCall(FunctionDeclaration externalFunc, std::vector args) + { + DeclareFunction(externalFunc); + + const auto& argTypes = externalFunc.GetParameterTypes(); + + if (args.size() != argTypes.size()) + { + throw InputException(InputExceptionErrors::sizeMismatch); + } + if (!std::equal(args.begin(), + args.end(), + argTypes.begin(), + argTypes.end(), + [](Value suppliedValue, Value fnValue) { + return suppliedValue.GetBaseType() == fnValue.GetBaseType() && + (suppliedValue.PointerLevel() == fnValue.PointerLevel() || + suppliedValue.PointerLevel() == fnValue.PointerLevel() + 1); + })) + { + throw InputException(InputExceptionErrors::invalidArgument); + } + + // explicitly making a copy + auto returnType = externalFunc.GetReturnType(); + const auto& fnName = externalFunc.GetFunctionName(); + + std::vector params; + for (auto index = 0u; index < args.size(); ++index) + { + auto arg = EnsureEmittable(args[index]); + auto& expected = argTypes[index]; + + std::string param = "&" + ScalarToString(arg); + if (arg.PointerLevel() == expected.PointerLevel() + 1) + { + param = "*(" + param + ")"; + } + params.push_back(param); + } + + std::stringstream funcCallStream; + funcCallStream << fnName << "("; + 
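+            // Each entry in params was rendered above as "&<name>[0]", with an extra "*(...)"
+            // dereference when the supplied value's pointer level is one above the declared
+            // parameter's, so the emitted call passes pointers into the generated arrays.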
+        PrintVector(funcCallStream, params);
+        funcCallStream << ")";
+
+        std::string returnValueString;
+        if (returnType)
+        {
+            auto typeDesc = returnType->GetType();
+            auto layout = returnType->GetLayout();
+            returnType.reset();
+            bool originalScalar = false;
+
+            if (typeDesc.second == 0)
+            {
+                typeDesc.second = 1;
+                originalScalar = true;
+            }
+            std::string initStr = std::string{ " = " } +
+                                  (originalScalar ? "{ " : "") + funcCallStream.str() + (originalScalar ? " }" : "") +
+                                  "; // " + layout.ToString() + "\n\n";
+
+            returnType = AllocateImpl(typeDesc, originalScalar ? ScalarLayout : layout, initStr);
+        }
+        else
+        {
+            Out() << funcCallStream.str() << ";\n\n";
+        }
+
+        return returnType;
+    }
+
+    void CppEmitterContext::PrefetchImpl(Value data, PrefetchType type, PrefetchLocality locality)
+    {
+        // no-op for now
+    }
+
+    void CppEmitterContext::ParallelizeImpl(int numTasks, std::vector<Value> captured, std::function<void(Scalar, std::vector<Value>)> fn)
+    {
+        auto futuresName = UniqueName("futures");
+        Out() << "std::vector<std::future<void>> " << futuresName << ";\n";
+        Out() << futuresName << ".reserve(" << numTasks << ");\n";
+        std::vector<std::string> capturedParams;
+        std::transform(captured.begin(), captured.end(), std::back_inserter(capturedParams), [this](const Value& value) {
+            auto emittableValue = EnsureEmittable(value);
+            return "&" + emittableValue.GetName();
+        });
+
+        ForRange(numTasks, [&](Scalar index) {
+            auto& outStream = Out() << futuresName << ".emplace_back(std::async([";
+            PrintVector(outStream, capturedParams);
+            auto parallelizedIndexName = UniqueName("parallelized_index");
+            outStream << "](int " << parallelizedIndexName << ") {\n";
+            Indented([&] {
+                Scalar parallelizedIndex = AllocateImpl({ ValueType::Int32, 1 }, ScalarLayout, " = { " + parallelizedIndexName + " };\n\n");
+                fn(parallelizedIndex, captured);
+            });
+            Out() << "}, " << ScalarToString(index) << "));\n";
+        });
+
+        Out() << "for (auto& " << futuresName << "_temp : " << futuresName << ") {\n";
+        Indented([&] {
+            Out() << futuresName << "_temp.wait();\n";
+        });
+        Out() << "}\n\n";
+    }
+
+    void CppEmitterContext::DebugBreakImpl()
+    {
+        // no-op for now
+    }
+
+    void CppEmitterContext::DebugDumpImpl(Value value, std::string tag, std::ostream& stream) const
+    {
+        // no-op for now
+    }
+
+    void CppEmitterContext::DebugDumpImpl(FunctionDeclaration fn, std::string tag, std::ostream& stream) const
+    {
+        // no-op for now
+    }
+
+    void CppEmitterContext::DebugPrintImpl(std::string message)
+    {
+        std::string::iterator it;
+        char tempBuffer[10] = {};
+        while ((it = std::find_if_not(message.begin(), message.end(), [](auto c) {
+                    return static_cast<bool>(::isprint(static_cast<unsigned char>(c)));
+                })) != message.end())
+        {
+            snprintf(tempBuffer, sizeof(tempBuffer), "\\x%02x", static_cast<unsigned char>(*it));
+            message.replace(it, it + 1, tempBuffer);
+        }
+        Out() << "std::cout << \"" << message << "\";\n";
+    }
+
+    void CppEmitterContext::SetNameImpl(const Value& value, const std::string& name)
+    {
+        // TODO: fix
+        // value.Get<Emittable>().GetDataAs<ValueImpl*>()->name = name;
+    }
+
+    std::string CppEmitterContext::GetNameImpl(const Value& value) const
+    {
+        return value.IsConstant() ?
_computeContext.GetName(value) : value.Get<Emittable>().GetDataAs<ValueImpl*>()->name;
+    }
+
+    void CppEmitterContext::ImportCodeFileImpl(std::string) { throw LogicException(LogicExceptionErrors::notImplemented); }
+
+    Scalar CppEmitterContext::GetFunctionAddressImpl(const FunctionDeclaration& fn) { throw LogicException(LogicExceptionErrors::notImplemented); }
+
+    std::string CppEmitterContext::GetScopeAdjustedName(GlobalAllocationScope scope, std::string name) const
+    {
+        switch (scope)
+        {
+        case GlobalAllocationScope::Global:
+            return GetGlobalScopedName(name);
+        case GlobalAllocationScope::Function:
+            return GetCurrentFunctionScopedName(name);
+        }
+
+        throw LogicException(LogicExceptionErrors::illegalState);
+    }
+
+    std::string CppEmitterContext::GetGlobalScopedName(std::string name) const
+    {
+        return _moduleName + "_" + name;
+    }
+
+    std::string CppEmitterContext::GetCurrentFunctionScopedName(std::string name) const
+    {
+        if (_fnStacks.empty())
+        {
+            throw LogicException(LogicExceptionErrors::illegalState);
+        }
+
+        return GetGlobalScopedName(_fnStacks.top().name + "_" + name);
+    }
+
+    std::ostream& CppEmitterContext::Out()
+    {
+        return _stream.get() << std::string(2 * _indent, ' ');
+    }
+
+    std::ostream& CppEmitterContext::Global()
+    {
+        return _globalStream;
+    }
+
+    std::ostream& CppEmitterContext::FnDecl() { return _fnDeclStream; }
+
+    Value CppEmitterContext::PromoteConstantData(Value value)
+    {
+        assert(value.IsConstant() && value.IsDefined() && !value.IsEmpty());
+
+        const auto& constantData = _computeContext.GetConstantData(value);
+
+        auto [offset, size] = std::visit(
+            [&value](auto&& data) -> std::pair<ptrdiff_t, int> {
+                using Type = std::decay_t<decltype(data)>;
+                using DataType = typename Type::value_type;
+
+                auto ptrData = std::get<DataType*>(value.GetUnderlyingData());
+                ptrdiff_t offset = ptrData - data.data();
+                return { offset, static_cast<int>(data.size()) };
+            },
+            constantData);
+
+        auto type = value.GetBaseType();
+        auto promotedBaseValue = GlobalAllocateImpl(_fnStacks.empty() ?
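+                                                        // Sketch of the promotion (names illustrative): the constant data becomes
+                                                        // a C++ global definition, e.g. `int32_t module_c_0[] = { ... };`, and any
+                                                        // Value aliasing into the middle of it gets a companion pointer global
+                                                        // `int32_t* module_c_0_offset = &module_c_0[offset];` emitted below.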
GlobalAllocationScope::Global : GlobalAllocationScope::Function,
+                                                    UniqueName("_"),
+                                                    constantData,
+                                                    MemoryLayout{ { size } },
+                                                    AllocateFlags::None);
+
+        _promotedConstantStack.top().push_back({ &constantData, promotedBaseValue.Get<Emittable>() });
+
+        if (offset == 0)
+        {
+            promotedBaseValue.SetLayout(value.GetLayout());
+            return promotedBaseValue;
+        }
+        else
+        {
+            CppEmitterContext::ValueImpl valueDesc{ UniqueName(promotedBaseValue.GetName() + "_offset"), { type, 1 } };
+
+            _globalsList.push_front(std::move(valueDesc));
+            auto& front = _globalsList.front();
+
+            Emittable emittable{ &front };
+            const auto& layout = value.GetLayout();
+
+            _globals[front.name] = { emittable, layout };
+
+            Value value(emittable, layout);
+
+            Global() << ValueTypeToCTypeString(type) << "* " << front.name
+                     << " = &"
+                     << promotedBaseValue.GetName() << "[" << offset << "];\n";
+
+            return value;
+        }
+    }
+
+    std::optional<CppEmitterContext::PromotedConstantDataDescription> CppEmitterContext::HasBeenPromoted(Value value) const
+    {
+        if (!value.IsDefined() || value.IsEmpty() || !value.IsConstant())
+        {
+            return std::nullopt;
+        }
+
+        const auto& constantData = _computeContext.GetConstantData(value);
+        const auto& promotedStack = _promotedConstantStack.top();
+
+        if (auto it = std::find_if(promotedStack.begin(),
+                                   promotedStack.end(),
+                                   [&constantData](const auto& desc) { return desc.data == &constantData; });
+            it != promotedStack.end())
+        {
+            return *it;
+        }
+        else
+        {
+            return std::nullopt;
+        }
+    }
+
+    Value CppEmitterContext::Realize(Value value)
+    {
+        if (auto desc = HasBeenPromoted(value); !desc)
+        {
+            return value;
+        }
+        else
+        {
+            const auto& promotionalDesc = *desc;
+            auto offset = std::visit(
+                [&value](auto&& data) -> ptrdiff_t {
+                    using Type = std::decay_t<decltype(data)>;
+                    using DataType = typename Type::value_type;
+
+                    auto ptrData = std::get<DataType*>(value.GetUnderlyingData());
+
+                    return ptrData - data.data();
+                },
+                *promotionalDesc.data);
+
+            Value newValue = value;
+            auto emittable = promotionalDesc.realValue;
+            if (offset == 0)
+            {
+                newValue.SetData(emittable);
+            }
+            else
+            {
+                auto type = value.GetBaseType();
+                auto valueImpl = emittable.GetDataAs<ValueImpl*>();
+                CppEmitterContext::ValueImpl valueDesc{ UniqueName(valueImpl->name + "_offset"), { type, 1 } };
+
+                _globalsList.push_front(std::move(valueDesc));
+                auto& front = _globalsList.front();
+
+                Emittable offsetEmittable{ &front };
+                const auto& layout = value.GetLayout();
+
+                _globals[front.name] = { offsetEmittable, layout };
+
+                newValue.SetData(offsetEmittable);
+
+                Global() << ValueTypeToCTypeString(type) << "* " << front.name
+                         << " = &"
+                         << valueImpl->name << "[" << offset << "];\n";
+            }
+
+            return newValue;
+        }
+    }
+
+    Value CppEmitterContext::EnsureEmittable(Value value)
+    {
+        if (!value.IsConstant())
+        {
+            return value;
+        }
+        else if (Value newValue = Realize(value); !newValue.IsConstant())
+        {
+            return newValue;
+        }
+        else
+        {
+            return PromoteConstantData(newValue);
+        }
+    }
+
+    // Despite the name, the function does not actually try to ensure the param passed in is a Scalar
+    std::string CppEmitterContext::ScalarToString(ViewAdapter scalar) const
+    {
+        return std::visit(
+            [](auto&& data) {
+                using Type = std::decay_t<decltype(data)>;
+                if constexpr (std::is_same_v<Type, Emittable>)
+                {
+                    return data.template GetDataAs<ValueImpl*>()->name + "[0]";
+                }
+                else
+                {
+                    using RealType = RemoveAllPointersT<Type>;
+                    if constexpr (std::is_same_v<RealType, utilities::Boolean>)
+                    {
+                        return std::to_string(static_cast<bool>(data[0]));
+                    }
+                    else if constexpr (std::is_floating_point_v<RealType>)
+                    {
+                        if (std::trunc(data[0]) == data[0])
+                        {
+                            return std::to_string(static_cast<int64_t>(data[0]));
+                        }
+                        return std::to_string(data[0]);
+                    }
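+                    // Worth noting (an assumption about the intent): std::to_string on a
+                    // floating-point value always prints six decimals ("1.000000"), so
+                    // integral-valued constants are detoured through int64_t above to keep
+                    // the emitted text compact.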
+                    else
+                    {
+                        return std::to_string(data[0]);
+                    }
+                }
+            },
+            scalar.GetValue().GetUnderlyingData());
+    }
+
+    template <typename Fn>
+    void CppEmitterContext::Indented(Fn&& fn)
+    {
+        ++_indent;
+        fn();
+        --_indent;
+    }
+
+} // namespace value
+} // namespace ell
diff --git a/libraries/value/src/EmitterContext.cpp b/libraries/value/src/EmitterContext.cpp
index 7f428f9f4..07a903a71 100644
--- a/libraries/value/src/EmitterContext.cpp
+++ b/libraries/value/src/EmitterContext.cpp
@@ -72,34 +72,61 @@ namespace value
         return std::move(*this);
     }
 
-    void EmitterContext::IfContext::Else(std::function<void()> fn) && { _impl->Else(fn); }
+    void EmitterContext::IfContext::ElseIf(Scalar test, std::function<void()> fn) &
+    {
+        if (test.GetType() != ValueType::Boolean)
+        {
+            throw InputException(InputExceptionErrors::typeMismatch);
+        }
+
+        _impl->ElseIf(test, fn);
+    }
+
+    void EmitterContext::IfContext::Else(std::function<void()> fn) &&
+    {
+        _impl->Else(fn);
+    }
+
+    void EmitterContext::IfContext::Else(std::function<void()> fn) &
+    {
+        _impl->Else(fn);
+    }
 
     EmitterContext::~EmitterContext() = default;
 
-    Value EmitterContext::Allocate(ValueType type, size_t size) { return Allocate(type, MemoryLayout({ (int)size })); }
+    Value EmitterContext::Allocate(ValueType type, size_t size, size_t align, AllocateFlags flags)
+    {
+        return Allocate(type, MemoryLayout({ (int)size }), align, flags);
+    }
 
-    Value EmitterContext::Allocate(ValueType type, MemoryLayout layout) { return AllocateImpl(type, layout); }
+    Value EmitterContext::Allocate(ValueType type, MemoryLayout layout, size_t align, AllocateFlags flags)
+    {
+        return AllocateImpl(type, layout, align, flags);
+    }
 
-    Value EmitterContext::StaticAllocate(std::string name, ValueType type, utilities::MemoryLayout layout)
+    Value EmitterContext::StaticAllocate(std::string name, ValueType type, utilities::MemoryLayout layout, AllocateFlags flags)
     {
-        if (auto globalValue = GetGlobalValue(GlobalAllocationScope::Function, name))
+        if (auto globalValue = GetGlobalValue(GlobalAllocationScope::Function, name, layout))
         {
-            Value value = globalValue.value();
-            if (layout.GetMemorySize() > value.GetLayout().GetMemorySize())
-            {
-                throw InputException(InputExceptionErrors::invalidSize);
-            }
-            value.SetLayout(layout);
+            return *globalValue;
+        }
 
-            return value;
+        return GlobalAllocateImpl(GlobalAllocationScope::Function, name, type, layout, flags);
+    }
+
+    Value EmitterContext::GlobalAllocate(std::string name, ValueType type, utilities::MemoryLayout layout, AllocateFlags flags)
+    {
+        if (auto globalValue = GetGlobalValue(GlobalAllocationScope::Global, name, layout))
+        {
+            return *globalValue;
         }
 
-        return GlobalAllocateImpl(GlobalAllocationScope::Global, name, type, layout);
+        return GlobalAllocateImpl(GlobalAllocationScope::Global, name, type, layout, flags);
     }
 
-    Value EmitterContext::GlobalAllocate(std::string name, ValueType type, utilities::MemoryLayout layout)
+    std::optional<Value> EmitterContext::GetGlobalValue(GlobalAllocationScope scope, std::string name, MemoryLayout layout)
     {
-        if (auto globalValue = GetGlobalValue(GlobalAllocationScope::Global, name))
+        if (auto globalValue = GetGlobalValue(scope, name))
        {
             Value value = globalValue.value();
             if (layout.GetMemorySize() > value.GetLayout().GetMemorySize())
@@ -111,7 +138,7 @@ namespace value
             return value;
         }
 
-        return GlobalAllocateImpl(GlobalAllocationScope::Global, name, type, layout);
+        return std::nullopt;
     }
 
     detail::ValueTypeDescription EmitterContext::GetType(Emittable emittable) { return GetTypeImpl(emittable); }
@@ -128,17 +155,17 @@ namespace value
     Value
EmitterContext::StoreConstantData(ConstantData data) { return StoreConstantDataImpl(data); } - void EmitterContext::For(MemoryLayout layout, std::function)> fn) + void EmitterContext::For(MemoryLayout layout, std::function)> fn, const std::string& name) { if (layout.NumElements() == 0) { return; } - return ForImpl(layout, fn); + return ForImpl(layout, fn, name); } - void EmitterContext::For(Scalar start, Scalar stop, Scalar step, std::function fn) + void EmitterContext::For(Scalar start, Scalar stop, Scalar step, std::function fn, const std::string& name) { if (!(start.GetType() == stop.GetType() && start.GetType() == step.GetType())) { @@ -150,7 +177,7 @@ namespace value throw InputException(InputExceptionErrors::invalidArgument, "start/stop/step must not be boolean"); } - return ForImpl(start, stop, step, fn); + return ForImpl(start, stop, step, fn, name); } void EmitterContext::MoveData(Value& source, Value& destination) { return MoveDataImpl(source, destination); } @@ -161,11 +188,11 @@ namespace value Value EmitterContext::Dereference(Value source) { - if (source.PointerLevel() < 1) + if (source.PointerLevel() < 0) { - throw LogicException(LogicExceptionErrors::illegalState, "Pointer level is less than the expected minimum of 1"); + throw LogicException(LogicExceptionErrors::illegalState, "Pointer level is less than the minimum of 0"); } - else if (source.PointerLevel() == 1) + else if (source.PointerLevel() == 0) { throw LogicException(LogicExceptionErrors::illegalState, "Attempted to dereference Value that is not a reference"); } @@ -225,9 +252,20 @@ namespace value return IfImpl(test, fn); } - std::optional EmitterContext::Call(FunctionDeclaration func, std::vector args) + void EmitterContext::While(Scalar test, std::function fn) + { + if (test.GetType() != ValueType::Boolean) + { + throw InputException(InputExceptionErrors::typeMismatch); + } + + return WhileImpl(test, fn); + } + + std::optional EmitterContext::Call(FunctionDeclaration func, std::vector args) { - return CallImpl(func, args); + std::vector valueArgs(args.begin(), args.end()); + return CallImpl(func, valueArgs); } void EmitterContext::Prefetch(Value data, PrefetchType type, PrefetchLocality locality) @@ -237,9 +275,16 @@ namespace value void EmitterContext::Parallelize(int numTasks, std::vector captured, std::function)> fn) { + if (numTasks == 0) return; + return ParallelizeImpl(numTasks, captured, fn); } + void EmitterContext::DebugBreak() + { + DebugBreakImpl(); + } + void EmitterContext::DebugDump(Value value, std::string tag, std::ostream* stream) const { std::ostream& outStream = stream != nullptr ? 
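+        // Aside, an illustrative use of the ViewAdapter-based Call above: call sites can
+        // now pass Scalar/Vector views directly, e.g.
+        //   GetContext().Call(PowFunctionDeclaration, { base, exponent });
+        // (base/exponent are hypothetical Scalars; no explicit .GetValue() is needed.)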
*stream : std::cerr;
@@ -281,6 +326,11 @@ namespace value
         return GetNameImpl(value);
     }
 
+    void EmitterContext::ImportCodeFile(std::string file)
+    {
+        ImportCodeFileImpl(file);
+    }
+
     const std::vector<std::reference_wrapper<FunctionDeclaration>>& EmitterContext::GetIntrinsics() const
     {
         static std::vector<std::reference_wrapper<FunctionDeclaration>> intrinsics = {
@@ -299,12 +349,71 @@ namespace value
             std::ref(TanhFunctionDeclaration),
             std::ref(RoundFunctionDeclaration),
             std::ref(FloorFunctionDeclaration),
-            std::ref(CeilFunctionDeclaration)
+            std::ref(CeilFunctionDeclaration),
+            std::ref(FmaFunctionDeclaration),
+            std::ref(MemCopyFunctionDeclaration),
+            std::ref(MemMoveFunctionDeclaration),
+            std::ref(MemSetFunctionDeclaration),
         };
 
         return intrinsics;
     }
 
+    std::vector<Value> EmitterContext::NormalizeReferenceLevels(const std::vector<Value>& args, const std::vector<Value>& expected) const
+    {
+        if (args.size() != expected.size())
+        {
+            throw InputException(InputExceptionErrors::sizeMismatch);
+        }
+        std::vector<Value> normalizedArgs;
+        normalizedArgs.reserve(args.size());
+        for (unsigned index = 0; index < args.size(); ++index)
+        {
+            auto& expectedValue = expected[index];
+            auto& arg = args[index];
+            Value value{
+                { expectedValue.GetBaseType(), expectedValue.PointerLevel() },
+                expectedValue.IsConstrained() ? std::optional<MemoryLayout>{ expectedValue.GetLayout() } : std::optional<MemoryLayout>{ std::nullopt }
+            };
+            if (expectedValue.PointerLevel() == arg.PointerLevel())
+            {
+                value.SetData(arg, true);
+            }
+            else if (expectedValue.PointerLevel() == (arg.PointerLevel() - 1))
+            {
+                value.SetData(arg.Dereference(), true);
+            }
+            else
+            {
+                throw LogicException(LogicExceptionErrors::illegalState);
+            }
+            normalizedArgs.push_back(value);
+        }
+        return normalizedArgs;
+    }
+
+    std::string EmitterContext::UniqueName(const std::string& prefix)
+    {
+        auto uniqueId = _uniqueNames[prefix]++;
+        return prefix + "_" + std::to_string(uniqueId);
+    }
+
+    Scalar EmitterContext::GetFunctionAddress(const FunctionDeclaration& decl)
+    {
+        if (const auto& intrinsics = GetIntrinsics();
+            std::find(intrinsics.begin(), intrinsics.end(), decl) != intrinsics.end())
+        {
+            throw InputException(InputExceptionErrors::invalidArgument, "Cannot get function address of intrinsic");
+        }
+
+        return GetFunctionAddressImpl(decl);
+    }
+
+    void swap(EmitterContext& l, EmitterContext& r) noexcept
+    {
+        std::swap(l._uniqueNames, r._uniqueNames);
+    }
+
     namespace
     {
         EmitterContext* s_context = nullptr;
@@ -332,35 +441,66 @@ namespace value
     ContextGuard<>::~ContextGuard() { _oldContext ?
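+    // Usage sketch (hypothetical): `ContextGuard<ComputeContext> guard("module");` swaps
+    // in a fresh context for the enclosing scope; the destructor below then restores
+    // whichever context was active before, or clears it if there was none.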
SetContext(*_oldContext) : ClearContext(); } - Value Allocate(ValueType type, size_t size) { return GetContext().Allocate(type, size); } + Value Allocate(ValueType type, size_t size, size_t align, AllocateFlags flags) + { + return GetContext().Allocate(type, size, align, flags); + } - Value Allocate(ValueType type, MemoryLayout layout) { return GetContext().Allocate(type, layout); } + Value Allocate(ValueType type, MemoryLayout layout, size_t align, AllocateFlags flags) + { + return GetContext().Allocate(type, layout, align, flags); + } - Value StaticAllocate(std::string name, ValueType type, utilities::MemoryLayout layout) + Value StaticAllocate(std::string name, ValueType type, utilities::MemoryLayout layout, AllocateFlags flags) { - return GetContext().StaticAllocate(name, type, layout); + return GetContext().StaticAllocate(name, type, layout, flags); } - Value GlobalAllocate(std::string name, ValueType type, utilities::MemoryLayout layout) + Value GlobalAllocate(std::string name, ValueType type, utilities::MemoryLayout layout, AllocateFlags flags) { - return GetContext().GlobalAllocate(name, type, layout); + return GetContext().GlobalAllocate(name, type, layout, flags); } EmitterContext::IfContext If(Scalar test, std::function fn) { return GetContext().If(test, fn); } + void While(Scalar test, std::function fn) + { + return GetContext().While(test, fn); + } + void ForRange(Scalar end, std::function fn) { - ForRange(0, end, fn); + ForRange(std::string{}, end, fn); + } + + void ForRange(const std::string& name, Scalar end, std::function fn) + { + ForRange(name, Scalar{ 0 }, end, fn); } void ForRange(Scalar start, Scalar end, std::function fn) { - ForRange(start, end, 1, fn); + ForRange(std::string{}, start, end, fn); + } + + void ForRange(const std::string& name, Scalar start, Scalar end, std::function fn) + { + ForRange(name, start, end, 1, fn); } void ForRange(Scalar start, Scalar end, Scalar step, std::function fn) { - GetContext().For(start, end, step, fn); + ForRange(std::string{}, start, end, step, fn); + } + + void ForRange(const std::string& name, Scalar start, Scalar end, Scalar step, std::function fn) + { + GetContext().For(start, end, step, fn, name); + } + + void DebugBreak() + { + GetContext().DebugBreak(); } void DebugDump(FunctionDeclaration fn, std::string tag, std::ostream* stream) @@ -373,6 +513,11 @@ namespace value GetContext().DebugDump(value, tag, stream); } + void DebugPrint(std::string message) + { + GetContext().DebugPrint(message); + } + void Parallelize(int numTasks, std::vector captured, std::function)> fn) { GetContext().Parallelize(numTasks, captured, fn); @@ -458,6 +603,11 @@ namespace value return *GetContext().Call(CeilFunctionDeclaration, { s.GetValue() }); } + Scalar Fma(Scalar a, Scalar b, Scalar c) + { + return *GetContext().Call(FmaFunctionDeclaration, { a.GetValue(), b.GetValue(), c.GetValue() }); + } + Scalar Sign(Scalar s) { return *GetContext().Call(CopySignFunctionDeclaration, { Cast(1, s.GetType()).GetValue(), s.GetValue() }); @@ -541,9 +691,10 @@ namespace value { If(v == Cast(0, v.GetType()), [&] { r = Cast(1, v.GetType()); - }).Else([&] { - r = Cast(0, v.GetType()); - }); + }) + .Else([&] { + r = Cast(0, v.GetType()); + }); } return r; } @@ -562,5 +713,38 @@ namespace value return *GetContext().Call(CeilFunctionDeclaration, { v.GetValue() }); } + void MemCopy(ViewAdapter dest, ViewAdapter source, std::optional length) + { + (void)GetContext().Call(MemCopyFunctionDeclaration, { dest, source, 
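+            // When `length` is omitted, the element count defaults to the source layout's
+            // full memory size, so (hypothetically) MemCopy(dst, src) copies all of src,
+            // while MemCopy(dst, src, 4) copies four elements' worth of bytes.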
length.value_or(static_cast(source.GetValue().GetLayout().GetMemorySize())).GetValue() }); + } + + void MemMove(ViewAdapter dest, ViewAdapter source, std::optional length) + { + (void)GetContext().Call(MemMoveFunctionDeclaration, { dest, source, length.value_or(static_cast(source.GetValue().GetLayout().GetMemorySize())).GetValue() }); + } + + void MemSet(ViewAdapter dest, Scalar data, std::optional length) + { + if (data.GetType() != ValueType::Char8) + { + throw InputException(InputExceptionErrors::typeMismatch, "Memory pattern specified by data is expected to be of type Char8"); + } + + (void)GetContext().Call(MemSetFunctionDeclaration, { dest, data.GetValue(), length.value_or(static_cast(dest.GetValue().GetLayout().GetMemorySize())).GetValue() }); + } + + void ZeroMemory(ViewAdapter dest, std::optional length) + { + // As of 9/11/2019, when compiling with C++17, `Value{ char{} }` in place of `Value(char{})` + // triggers the `std::initializer_list` ctor for Value, instead of the `Value(T)` ctor. + // This might change in C++20. + MemSet(dest, Value(char{}), length); + } + + std::string UniqueName(const std::string& prefix) + { + return GetContext().UniqueName(prefix); + } + } // namespace value } // namespace ell diff --git a/libraries/value/src/FunctionDeclaration.cpp b/libraries/value/src/FunctionDeclaration.cpp index e68d88200..467583375 100644 --- a/libraries/value/src/FunctionDeclaration.cpp +++ b/libraries/value/src/FunctionDeclaration.cpp @@ -10,6 +10,8 @@ #include +#include + namespace ell { namespace value @@ -19,9 +21,22 @@ namespace value FunctionDeclaration::FunctionDeclaration(std::string name) : _originalFunctionName(name), _isEmpty(false) - {} + { + if (!std::isalpha(_originalFunctionName[0]) && _originalFunctionName[0] != '_') + { + throw InputException(InputExceptionErrors::invalidArgument, "Function names must begin with an _ or alphabetical character"); + } + } + + FunctionDeclaration& FunctionDeclaration::DefineFromFile(std::string file) + { + CheckNonEmpty(); + + _importedSource = file; + return *this; + } - FunctionDeclaration& FunctionDeclaration::Returns(Value returnType) + FunctionDeclaration& FunctionDeclaration::Returns(ViewAdapter returnType) { CheckNonEmpty(); @@ -29,26 +44,45 @@ namespace value return *this; } - FunctionDeclaration& FunctionDeclaration::Parameters(std::vector paramTypes) + FunctionDeclaration& FunctionDeclaration::Parameters(std::vector paramTypes) { CheckNonEmpty(); - _paramTypes = paramTypes; + _paramTypes.assign(paramTypes.begin(), paramTypes.end()); return *this; } - FunctionDeclaration& FunctionDeclaration::Decorated(FunctionDecorated shouldDecorate) + FunctionDeclaration& FunctionDeclaration::Decorated(bool shouldDecorate) { CheckNonEmpty(); - _isDecorated = shouldDecorate == FunctionDecorated::Yes; + _isDecorated = shouldDecorate; return *this; } - std::optional FunctionDeclaration::Call(std::vector arguments) const + FunctionDeclaration& FunctionDeclaration::Public(bool isPublic) + { + _isPublic = isPublic; + return *this; + } + + FunctionDeclaration& FunctionDeclaration::Inlined(FunctionInlining shouldInline) { CheckNonEmpty(); + _inlineState = shouldInline; + return *this; + } + + std::optional FunctionDeclaration::Call(std::vector arguments) const + { + CheckNonEmpty(); + + if (!_importedSource.empty() && !IsDefined()) + { + GetContext().ImportCodeFile(_importedSource); + } + return GetContext().Call(*this, arguments); } @@ -61,9 +95,25 @@ namespace value if (!_decoratedFunctionName) { size_t hash = 0; - HashCombine(hash, 
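+        // Sketch of the new naming scheme introduced below: the decorated symbol becomes
+        // originalName_<hash of return and parameter type descriptions>, so two
+        // declarations of "Foo" with different signatures emit distinct C symbols.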
_returnType); - HashCombine(hash, _paramTypes); - _decoratedFunctionName = _originalFunctionName + std::to_string(hash); + if(_returnType) + { + HashCombine(hash, static_cast(_returnType->GetBaseType())); + HashCombine(hash, _returnType->PointerLevel()); + if (_returnType->IsConstrained()) + { + HashCombine(hash, _returnType->GetLayout()); + } + } + for(auto p: _paramTypes) + { + HashCombine(hash, static_cast(p.GetBaseType())); + HashCombine(hash, p.PointerLevel()); + if (p.IsConstrained()) + { + HashCombine(hash, p.GetLayout()); + } + } + _decoratedFunctionName = _originalFunctionName + "_" + std::to_string(hash); } return *_decoratedFunctionName; } @@ -87,6 +137,13 @@ namespace value return _returnType; } + bool FunctionDeclaration::IsPublic() const + { + CheckNonEmpty(); + + return _isPublic; + } + bool FunctionDeclaration::IsDefined() const { CheckNonEmpty(); @@ -94,8 +151,21 @@ namespace value return GetContext().IsFunctionDefined(*this); } + bool FunctionDeclaration::IsImported() const + { + CheckNonEmpty(); + + return !_importedSource.empty(); + } + bool FunctionDeclaration::IsEmpty() const { return _isEmpty; } + FunctionInlining FunctionDeclaration::InlineState() const + { + CheckNonEmpty(); + return _inlineState; + } + void FunctionDeclaration::CheckNonEmpty() const { if (_isEmpty) @@ -104,27 +174,41 @@ namespace value } } + Scalar FunctionDeclaration::GetPointer() const + { + if (_pointer) + { + return *_pointer; + } + + return GetContext().GetFunctionAddress(*this); + } + FunctionDeclaration DeclareFunction(std::string name) { return FunctionDeclaration(name); } - /*extern*/ FunctionDeclaration AbsFunctionDeclaration = DeclareFunction("Abs"); - /*extern*/ FunctionDeclaration CosFunctionDeclaration = DeclareFunction("Cos"); - /*extern*/ FunctionDeclaration CopySignFunctionDeclaration = DeclareFunction("CopySign"); - /*extern*/ FunctionDeclaration ExpFunctionDeclaration = DeclareFunction("Exp"); - /*extern*/ FunctionDeclaration LogFunctionDeclaration = DeclareFunction("Log"); - /*extern*/ FunctionDeclaration Log10FunctionDeclaration = DeclareFunction("Log10"); - /*extern*/ FunctionDeclaration Log2FunctionDeclaration = DeclareFunction("Log2"); - /*extern*/ FunctionDeclaration MaxNumFunctionDeclaration = DeclareFunction("MaxNum"); - /*extern*/ FunctionDeclaration MinNumFunctionDeclaration = DeclareFunction("MinNum"); - /*extern*/ FunctionDeclaration PowFunctionDeclaration = DeclareFunction("Pow"); - /*extern*/ FunctionDeclaration SinFunctionDeclaration = DeclareFunction("Sin"); - /*extern*/ FunctionDeclaration SqrtFunctionDeclaration = DeclareFunction("Sqrt"); - /*extern*/ FunctionDeclaration TanhFunctionDeclaration = DeclareFunction("Tanh"); - /*extern*/ FunctionDeclaration RoundFunctionDeclaration = DeclareFunction("Round"); - /*extern*/ FunctionDeclaration FloorFunctionDeclaration = DeclareFunction("Floor"); - /*extern*/ FunctionDeclaration CeilFunctionDeclaration = DeclareFunction("Ceil"); + /*extern*/ FunctionDeclaration AbsFunctionDeclaration = DeclareFunction("Abs").Decorated(false); + /*extern*/ FunctionDeclaration CosFunctionDeclaration = DeclareFunction("Cos").Decorated(false); + /*extern*/ FunctionDeclaration CopySignFunctionDeclaration = DeclareFunction("CopySign").Decorated(false); + /*extern*/ FunctionDeclaration ExpFunctionDeclaration = DeclareFunction("Exp").Decorated(false); + /*extern*/ FunctionDeclaration LogFunctionDeclaration = DeclareFunction("Log").Decorated(false); + /*extern*/ FunctionDeclaration Log10FunctionDeclaration = 
DeclareFunction("Log10").Decorated(false); + /*extern*/ FunctionDeclaration Log2FunctionDeclaration = DeclareFunction("Log2").Decorated(false); + /*extern*/ FunctionDeclaration MaxNumFunctionDeclaration = DeclareFunction("MaxNum").Decorated(false); + /*extern*/ FunctionDeclaration MinNumFunctionDeclaration = DeclareFunction("MinNum").Decorated(false); + /*extern*/ FunctionDeclaration PowFunctionDeclaration = DeclareFunction("Pow").Decorated(false); + /*extern*/ FunctionDeclaration SinFunctionDeclaration = DeclareFunction("Sin").Decorated(false); + /*extern*/ FunctionDeclaration SqrtFunctionDeclaration = DeclareFunction("Sqrt").Decorated(false); + /*extern*/ FunctionDeclaration TanhFunctionDeclaration = DeclareFunction("Tanh").Decorated(false); + /*extern*/ FunctionDeclaration RoundFunctionDeclaration = DeclareFunction("Round").Decorated(false); + /*extern*/ FunctionDeclaration FloorFunctionDeclaration = DeclareFunction("Floor").Decorated(false); + /*extern*/ FunctionDeclaration CeilFunctionDeclaration = DeclareFunction("Ceil").Decorated(false); + /*extern*/ FunctionDeclaration FmaFunctionDeclaration = DeclareFunction("Fma").Decorated(false); + /*extern*/ FunctionDeclaration MemCopyFunctionDeclaration = DeclareFunction("MemCpy").Decorated(false); + /*extern*/ FunctionDeclaration MemMoveFunctionDeclaration = DeclareFunction("MemMove").Decorated(false); + /*extern*/ FunctionDeclaration MemSetFunctionDeclaration = DeclareFunction("MemSet").Decorated(false); } // namespace value } // namespace ell diff --git a/libraries/value/src/LLVMContext.cpp b/libraries/value/src/LLVMContext.cpp index 8a3f88bb8..848b14537 100644 --- a/libraries/value/src/LLVMContext.cpp +++ b/libraries/value/src/LLVMContext.cpp @@ -11,13 +11,13 @@ #include "Scalar.h" #include "Value.h" -#include - +#include #include -#include - #include +#include + +#include using namespace std::string_literals; @@ -131,7 +131,7 @@ namespace value return type; } - VariableType ValueTypeToVariableType(ValueType type) + VariableType ValueTypeToVariableType(ValueType type, int ptrLevel = 1) { // clang-format off @@ -141,7 +141,12 @@ namespace value #define VALUE_TYPE_TO_VARIABLE_TYPE_PTR(x) \ case ValueType::x: \ - return VariableType::x##Pointer + if (ptrLevel == 0) \ + return VariableType::x; \ + else if (ptrLevel == 1) \ + return VariableType::x##Pointer; \ + else \ + return VariableType::x##PointerPointer // clang-format on @@ -183,14 +188,25 @@ namespace value return true; } + emitters::FunctionInlining GetEmittersFunctionInlining(FunctionInlining inlining) + { + switch (inlining) + { + case FunctionInlining::always: + return emitters::FunctionInlining::always; + case FunctionInlining::never: + return emitters::FunctionInlining::never; + default: + return emitters::FunctionInlining::defaultInline; + } + } + bool IncrementMemoryCoordinate(std::vector& coordinate, const std::vector& maxCoordinate) { assert(coordinate.size() == maxCoordinate.size()); return IncrementMemoryCoordinateImpl(static_cast(maxCoordinate.size()) - 1, coordinate, maxCoordinate); } - LLVMValue ToLLVMValue(Value value) { return value.Get().GetDataAs(); } - auto SimpleNumericalFunctionIntrinsic(LLVMFunction (IRRuntime::*intrinsicFn)(VariableType)) -> std::function)> { return [intrinsicFn](IRFunctionEmitter& fnEmitter, std::vector args) -> Value { @@ -370,6 +386,68 @@ namespace value }; } + auto FmaFunctionIntrinsic() -> std::function)> + { + return [](IRFunctionEmitter& fnEmitter, std::vector args) -> Value { + if (args.size() != 3) + { + throw 
InputException(InputExceptionErrors::invalidSize); + } + + if (std::any_of(args.begin(), args.end(), [](Value& value) { return value.IsConstrained() && value.GetLayout() != ScalarLayout; })) + { + throw InputException(InputExceptionErrors::invalidSize); + } + + const auto& value1 = args[0]; + const auto& value2 = args[1]; + const auto& value3 = args[2]; + if (value1.GetBaseType() != value2.GetBaseType() || value1.GetBaseType() != value3.GetBaseType()) + { + throw InputException(InputExceptionErrors::typeMismatch); + } + + if (value1.GetBaseType() == ValueType::Boolean) + { + throw InputException(InputExceptionErrors::typeMismatch); + } + + Value result = value::Allocate(value1.GetBaseType(), ScalarLayout); + + auto llvmValue1 = fnEmitter.ValueAt(ToLLVMValue(value1), 0); + auto llvmValue2 = fnEmitter.ValueAt(ToLLVMValue(value2), 0); + auto llvmValue3 = fnEmitter.ValueAt(ToLLVMValue(value3), 0); + + auto variableType = [type = value1.GetBaseType()] { + switch (type) + { + case ValueType::Double: + return VariableType::Double; + default: + return VariableType::Float; + } + }(); + + // TODO: fix this so that GetNonPointerType call isn't needed + auto value1VariableType = GetNonPointerType(ValueTypeToVariableType(value1.GetBaseType(), value1.PointerLevel())); + if (variableType != value1VariableType) + { + llvmValue1 = fnEmitter.CastValue(llvmValue1, variableType); + llvmValue2 = fnEmitter.CastValue(llvmValue2, variableType); + llvmValue3 = fnEmitter.CastValue(llvmValue3, variableType); + } + auto llvmFunc = fnEmitter.GetModule().GetRuntime().GetFmaFunction(variableType); + auto callResult = fnEmitter.Call(llvmFunc, { llvmValue1, llvmValue2, llvmValue3 }); + if (variableType != value1VariableType) + { + callResult = fnEmitter.CastValue(callResult, value1VariableType); + } + auto resultValue = ToLLVMValue(result); + fnEmitter.SetValueAt(resultValue, 0, callResult); + return result; + }; + } + enum class MaxMinIntrinsic { Max, @@ -498,6 +576,71 @@ namespace value }; } + enum class MemIntrinsicOp + { + Copy, + Move, + Set + }; + auto MemOpFunctionIntrinsic(MemIntrinsicOp intrinsic) -> std::function)> + { + return [intrinsic](IRFunctionEmitter& fnEmitter, std::vector args) -> Value { + if (args.size() != 3) + { + throw InputException(InputExceptionErrors::invalidSize); + } + + const auto& value1 = args[0]; + const auto& value2 = args[1]; + const auto& value3 = args[2]; + + if (!value3.IsConstrained() || value3.GetLayout() != ScalarLayout) + { + throw InputException(InputExceptionErrors::invalidArgument); + } + + if (intrinsic == MemIntrinsicOp::Set) + { + assert((value2.IsConstrained() && value2.GetLayout() == ScalarLayout && value2.GetType() == std::pair{ ValueType::Char8, 1 })); + } + + llvm::CallInst* (IREmitter::*memFn)(LLVMValue, LLVMValue, LLVMValue){}; + switch (intrinsic) + { + case MemIntrinsicOp::Copy: + memFn = &IREmitter::MemoryCopy; + break; + case MemIntrinsicOp::Move: + memFn = &IREmitter::MemoryMove; + break; + case MemIntrinsicOp::Set: + memFn = &IREmitter::MemorySet; + break; + default: + assert(false); + } + + auto llvmValue1 = ToLLVMValue(value1); + auto llvmValue2 = ToLLVMValue(value2); + auto llvmValue3 = fnEmitter.Load(ToLLVMValue(value3)); + auto llvmType = llvmValue1->getType()->getContainedType(0); + llvmValue3 = fnEmitter.LocalScalar(static_cast(fnEmitter.GetEmitter().SizeOf(llvmType))) * llvmValue3; + + if (intrinsic == MemIntrinsicOp::Set) + { + llvmValue2 = fnEmitter.Load(llvmValue2); + + // we're going to swap the first two params because MemorySet DOES have 
destination first... + std::swap(llvmValue1, llvmValue2); + } + + // IREmitter takes the first two parameters in the opposite order of everyone else! + (void)std::invoke(memFn, fnEmitter.GetEmitter(), llvmValue2, llvmValue1, llvmValue3); + + return {}; // ignored + }; + } + void ConstantForLoop(const MemoryLayout& layout, std::function fn) { auto maxCoordinate = layout.GetActiveSize().ToVector(); @@ -520,7 +663,9 @@ namespace value struct LLVMContext::FunctionScope { - FunctionScope(LLVMContext& context, IRFunctionEmitter& emitter) : + FunctionScope( + LLVMContext& context, + IRFunctionEmitter& emitter) : context(context) { context._functionStack.push(emitter); @@ -545,15 +690,32 @@ namespace value }; LLVMContext::LLVMContext(IRModuleEmitter& emitter) : + EmitterContext(emitter.GetCompilerOptions().targetDevice), _emitter(emitter), _computeContext(_emitter.GetModuleName()) { _promotedConstantStack.push({}); } - IRModuleEmitter& LLVMContext::GetModuleEmitter() const { return _emitter; } + LLVMContext::LLVMContext(std::unique_ptr&& emitter) : + EmitterContext(emitter->GetCompilerOptions().targetDevice), + _ownedEmitter(std::move(emitter)), + _emitter(*_ownedEmitter), + _computeContext(_emitter.GetModuleName()) + { + _promotedConstantStack.push({}); + } + + LLVMContext::LLVMContext(const std::string& moduleName, const CompilerOptions& parameters) : + LLVMContext(std::make_unique(moduleName, parameters)) + {} + + IRModuleEmitter& LLVMContext::GetModuleEmitter() const + { + return _emitter; + } - Value LLVMContext::AllocateImpl(ValueType type, MemoryLayout layout) + Value LLVMContext::AllocateImpl(ValueType type, MemoryLayout layout, size_t alignment, AllocateFlags flags) { auto& fn = GetFunctionEmitter(); auto& irEmitter = fn.GetEmitter(); @@ -561,8 +723,11 @@ namespace value auto llvmType = ValueTypeToLLVMType(irEmitter, { type, 0 }); assert(!llvmType->isPointerTy()); auto allocatedVariable = fn.Variable(llvmType, layout.GetMemorySize()); + if (alignment != 0) + { + allocatedVariable->setAlignment(alignment); + } fn.StoreZero(allocatedVariable, layout.GetMemorySize()); - return { Emittable{ allocatedVariable }, layout }; } @@ -577,7 +742,7 @@ namespace value return std::nullopt; } - Value LLVMContext::GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ConstantData data, MemoryLayout layout) + Value LLVMContext::GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ConstantData data, MemoryLayout layout, AllocateFlags flags) { std::string adjustedName = GetScopeAdjustedName(scope, name); @@ -588,9 +753,10 @@ namespace value } llvm::GlobalVariable* global = std::visit( - [this, &adjustedName](auto&& vectorData) { + [this, flags, &adjustedName](auto&& vectorData) { using Type = std::decay_t; + bool isThreadLocal = (flags & AllocateFlags::ThreadLocal) == AllocateFlags::ThreadLocal; if constexpr (std::is_same_v>) { // IREmitter stores a vector of bool values as a bitvector, which @@ -601,14 +767,15 @@ namespace value // originally, this was a vector of bools. This will be rectified // in the near future. 
(2018-11-08) std::vector transformedData(vectorData.begin(), vectorData.end()); - return _emitter.GlobalArray(adjustedName, transformedData); + return _emitter.GlobalArray(adjustedName, transformedData, isThreadLocal); } else { - return _emitter.GlobalArray(adjustedName, vectorData); + return _emitter.GlobalArray(adjustedName, vectorData, isThreadLocal); } }, data); + auto dereferencedGlobal = _emitter.GetIREmitter().PointerOffset(global, _emitter.GetIREmitter().Literal(0)); Emittable emittable{ dereferencedGlobal }; @@ -617,7 +784,7 @@ namespace value return Value(emittable, layout); } - Value LLVMContext::GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ValueType type, MemoryLayout layout) + Value LLVMContext::GlobalAllocateImpl(GlobalAllocationScope scope, std::string name, ValueType type, MemoryLayout layout, AllocateFlags flags) { std::string adjustedName = GetScopeAdjustedName(scope, name); @@ -629,7 +796,8 @@ namespace value auto global = _emitter.GlobalArray(adjustedName, ValueTypeToLLVMType(_emitter.GetIREmitter(), { type, 0 }), - layout.GetMemorySize()); + layout.GetMemorySize(), + (flags & AllocateFlags::ThreadLocal) == AllocateFlags::ThreadLocal); auto dereferencedGlobal = _emitter.GetIREmitter().PointerOffset(global, _emitter.GetIREmitter().Literal(0)); @@ -664,23 +832,40 @@ namespace value std::vector variableArgTypes(argValues.size()); std::transform(argValues.begin(), argValues.end(), variableArgTypes.begin(), [](Value value) { - return ValueTypeToVariableType(value.GetBaseType()); + return ValueTypeToVariableType(value.GetBaseType(), value.PointerLevel()); }); const auto& fnName = decl.GetFunctionName(); + auto argValuesCopy = argValues; { ValueType returnValueType = returnValue ? returnValue->GetBaseType() : ValueType::Void; FunctionScope scope(*this, fnName, ValueTypeToVariableType(returnValueType), variableArgTypes); - GetFunctionEmitter().SetAttributeForArguments(IRFunctionEmitter::Attributes::NoAlias); - auto functionArgs = GetFunctionEmitter().Arguments(); - auto argValuesCopy = argValues; + auto& fnEmitter = GetFunctionEmitter(); + if (decl.IsPublic()) + { + fnEmitter.IncludeInHeader(); + } + fnEmitter.SetAttributeForArguments(IRFunctionEmitter::Attributes::NoAlias); + fnEmitter.SetInlineState(GetEmittersFunctionInlining(decl.InlineState())); + + auto functionArgs = fnEmitter.Arguments(); auto returnValueCopy = returnValue; for (std::pair idx{ 0u, functionArgs.begin() }; idx.first < argValuesCopy.size(); ++idx.first, ++idx.second) { - idx.second->setName(std::string{ "arg" } + std::to_string(idx.first)); - argValuesCopy[idx.first].SetData(Emittable{ idx.second }); + auto& argValueCopy = argValuesCopy[idx.first]; + auto argName = std::string{ "arg" } + std::to_string(idx.first); + auto inputName = argValueCopy.GetName(); + idx.second->setName(argName); + argValueCopy.SetData(Emittable{ idx.second }); + + if (auto argType = idx.second->getType(); argValueCopy.IsConstrained() && argType->isPointerTy()) + { + auto innerType = argType->getPointerElementType(); + uint64_t bytes = argValueCopy.GetLayout().GetMemorySize() * fnEmitter.GetEmitter().SizeOf(innerType); + fnEmitter.SetAttributeForArgument(idx.first, IRFunctionEmitter::Attributes::Dereferenceable, bytes); + } } returnValueCopy = fn(argValuesCopy); @@ -694,7 +879,7 @@ namespace value } } - DefinedFunction returnFn = [this, decl](std::vector args) -> std::optional { + DefinedFunction returnFn = [this, decl, llvmExpectedValues = argValuesCopy](std::vector args) -> std::optional { const auto& 
argValues = decl.GetParameterTypes(); const auto& returnValue = decl.GetReturnType(); const auto& fnName = decl.GetFunctionName(); @@ -710,13 +895,12 @@ namespace value throw InputException(InputExceptionErrors::invalidArgument); } - std::vector llvmArgs(args.size()); - std::transform(args.begin(), args.end(), llvmArgs.begin(), [this](Value& arg) { - return EnsureEmittable(arg).Get().GetDataAs(); - }); + std::vector emittableArgs = EnsureEmittable(args); + auto normalizedArgs = NormalizeReferenceLevels(emittableArgs, llvmExpectedValues); + std::vector llvmArgs = ToLLVMValue(normalizedArgs); auto returnValueCopy = returnValue; - LLVMValue fnReturnValue = _emitter.GetCurrentFunction().Call(fnName, llvmArgs); + LLVMValue fnReturnValue = GetFunctionEmitter().Call(fnName, llvmArgs); if (returnValueCopy) { returnValueCopy->SetData(Emittable{ fnReturnValue }); @@ -737,12 +921,17 @@ namespace value return true; } - return _definedFunctions.find(decl) != _definedFunctions.end(); + if (_definedFunctions.find(decl) != _definedFunctions.end()) + { + return true; + } + + return _emitter.HasFunction(decl.GetFunctionName()); } Value LLVMContext::StoreConstantDataImpl(ConstantData data) { return _computeContext.StoreConstantData(data); } - void LLVMContext::ForImpl(MemoryLayout layout, std::function)> fn) + void LLVMContext::ForImpl(MemoryLayout layout, std::function)> fn, const std::string& name) { std::vector ranges(layout.NumDimensions()); for (auto index = 0u; index < ranges.size(); ++index) @@ -754,6 +943,7 @@ namespace value auto logicalOrder = layout.GetLogicalDimensionOrder(); GetFunctionEmitter().For( + name, ranges, [&fn, logicalOrder](emitters::IRFunctionEmitter&, std::vector indices) { std::vector logicalIndices(indices.size()); @@ -765,7 +955,7 @@ namespace value }); } - void LLVMContext::ForImpl(Scalar start, Scalar stop, Scalar step, std::function fn) + void LLVMContext::ForImpl(Scalar start, Scalar stop, Scalar step, std::function fn, const std::string& name) { auto startValue = EnsureEmittable(start.GetValue()); auto stopValue = EnsureEmittable(stop.GetValue()); @@ -780,6 +970,7 @@ namespace value Scalar index = value::Allocate(ScalarLayout); fnEmitter.For( + name, fnEmitter.Load(ToLLVMValue(startValue)), fnEmitter.Load(ToLLVMValue(stopValue)), fnEmitter.Load(ToLLVMValue(stepValue)), @@ -815,15 +1006,74 @@ namespace value } else { - if (!TypeCompatible(destination, source) && - (destination.PointerLevel() == source.PointerLevel() || - destination.PointerLevel() == (1 + source.PointerLevel()))) + if (!TypeCompatible(destination, source)) { throw InputException(InputExceptionErrors::typeMismatch); } + enum class CopyType + { + DirectScalarPassThrough, + DirectScalarCopy, + Memory, + Reference + }; + CopyType copyType{}; + + if (auto srcPtrLevel = source.PointerLevel(); srcPtrLevel < 0) + { + throw LogicException(LogicExceptionErrors::illegalState); + } + else if (srcPtrLevel == 0) + { + switch (destination.PointerLevel()) + { + case 0: + copyType = CopyType::DirectScalarPassThrough; + break; + case 1: + copyType = CopyType::DirectScalarCopy; + break; + default: + throw LogicException(LogicExceptionErrors::illegalState); + } + if (source.GetLayout() != ScalarLayout || destination.GetLayout() != ScalarLayout) + { + throw InputException(InputExceptionErrors::invalidSize); + } + } + else if (srcPtrLevel == 1) + { + if (destination.PointerLevel() != 1) + { + throw LogicException(LogicExceptionErrors::illegalState); + } + else + { + if (destination.GetLayout() != source.GetLayout()) + { + throw 
InputException(InputExceptionErrors::sizeMismatch); + } + copyType = CopyType::Memory; + } + } + else if (destination.PointerLevel() == srcPtrLevel) + { + assert(srcPtrLevel > 1); + if (source.IsConstant()) + { + throw LogicException(LogicExceptionErrors::illegalState); + } + copyType = CopyType::Reference; + } + else + { + throw LogicException(LogicExceptionErrors::illegalState); + } + auto& irEmitter = _emitter.GetIREmitter(); auto destValue = ToLLVMValue(destination); + if (source.IsConstant()) { // we're only copying active areas below. should we copy padded too? @@ -847,10 +1097,21 @@ namespace value { return; } - if (auto& layout = source.GetLayout(); layout.IsContiguous()) + switch (copyType) { - if (destination.PointerLevel() == source.PointerLevel()) + case CopyType::DirectScalarPassThrough: + destination.SetData(Emittable{ srcValue }); + break; + case CopyType::DirectScalarCopy: + { + auto destAtOffset = irEmitter.PointerOffset(destValue, irEmitter.Zero(VariableType::Int32)); + irEmitter.Store(destAtOffset, srcValue); + break; + } + case CopyType::Memory: + if (auto& layout = source.GetLayout(); layout.IsContiguous()) { + assert(copyType == CopyType::Memory); auto llvmType = srcValue->getType()->getContainedType(0); auto primSize = irEmitter.SizeOf(llvmType); auto memorySize = irEmitter.Literal(static_cast(layout.GetMemorySize() * primSize)); @@ -860,17 +1121,24 @@ namespace value } else { - auto destAtOffset = irEmitter.PointerOffset(destValue, irEmitter.Zero(VariableType::Int32)); - irEmitter.Store(destAtOffset, srcValue); + ForImpl( + layout, [&](std::vector index) { + auto offsetSource = source.Offset(detail::CalculateOffset(source.GetLayout(), index)); + auto offsetDest = destination.Offset(detail::CalculateOffset(destination.GetLayout(), index)); + (void)irEmitter.Store(ToLLVMValue(offsetDest), irEmitter.Load(ToLLVMValue(offsetSource))); + }, + ""); } - } - else + break; + case CopyType::Reference: { - ForImpl(layout, [&](std::vector index) { - auto offsetSource = source.Offset(detail::CalculateOffset(source.GetLayout(), index)); - auto offsetDest = destination.Offset(detail::CalculateOffset(destination.GetLayout(), index)); - (void)irEmitter.Store(ToLLVMValue(offsetDest), irEmitter.Load(ToLLVMValue(offsetSource))); - }); + auto srcAtOffset = irEmitter.Load(srcValue); + irEmitter.Store(destValue, srcAtOffset); + destination.SetLayout(source.GetLayout()); + break; + } + default: + throw LogicException(LogicExceptionErrors::illegalState); } } } @@ -963,13 +1231,17 @@ namespace value destination = Allocate(source.GetBaseType(), source.GetLayout()); } - if (!TypeCompatible(destination, source) && - (destination.PointerLevel() == source.PointerLevel() || - destination.PointerLevel() == (1 + source.PointerLevel()))) + if (!TypeCompatible(destination, source)) { throw InputException(InputExceptionErrors::typeMismatch); } + if (!((source.PointerLevel() == 0 || source.PointerLevel() == 1) && + (destination.PointerLevel() == 0 || destination.PointerLevel() == 1))) + { + throw InputException(InputExceptionErrors::invalidArgument); + } + if (destination.GetLayout() != source.GetLayout()) { throw InputException(InputExceptionErrors::sizeMismatch); @@ -1018,6 +1290,17 @@ namespace value } opFn = [&fn](auto dst, auto src) { return fn.Operator(TypedOperator::moduloSigned, dst, src); }; break; + case ValueBinaryOperation::logicalAnd: + [[fallthrough]]; + case ValueBinaryOperation::logicalOr: + if (destination.GetBaseType() != ValueType::Boolean) + { + throw 
InputException(InputExceptionErrors::invalidArgument); + } + opFn = [&fn, op](auto dst, auto src) { + return fn.Operator(op == ValueBinaryOperation::logicalAnd ? TypedOperator::logicalAnd : TypedOperator::logicalOr, dst, src); + }; + break; default: throw LogicException(LogicExceptionErrors::illegalState); } @@ -1031,22 +1314,43 @@ namespace value // If the pointer levels don't match, it means the source is not a pointer (logically) // and we just need to do an assignment of the value to the value pointed to by // destintion - bool scalarLLVMSource = source.PointerLevel() != destination.PointerLevel(); - ForImpl(layout, [&](std::vector index) { - LLVMValue srcValue = nullptr; - if (scalarLLVMSource) - { - srcValue = ToLLVMValue(source); - } - else - { - auto offsetSource = source.Offset(detail::CalculateOffset(source.GetLayout(), index)); - srcValue = fn.Load(ToLLVMValue(offsetSource)); - } - auto offsetDest = destination.Offset(detail::CalculateOffset(destination.GetLayout(), index)); - auto destValue = ToLLVMValue(offsetDest); - fn.Store(destValue, opFn(fn.Load(destValue), srcValue)); - }); + bool scalarLLVMSource = source.PointerLevel() == 0; + bool scalarLLVMDestination = destination.PointerLevel() == 0; + ForImpl( + layout, [&](std::vector index) { + LLVMValue srcValue = nullptr; + if (scalarLLVMSource) + { + srcValue = ToLLVMValue(source); + } + else + { + auto offsetSource = source.Offset(detail::CalculateOffset(source.GetLayout(), index)); + srcValue = fn.Load(ToLLVMValue(offsetSource)); + } + + LLVMValue destValue = nullptr; + LLVMValue destValueOffset = nullptr; + if (scalarLLVMDestination) + { + destValue = ToLLVMValue(destination); + } + else + { + destValueOffset = ToLLVMValue(destination.Offset(detail::CalculateOffset(destination.GetLayout(), index))); + destValue = fn.Load(destValueOffset); + } + auto result = opFn(destValue, srcValue); + if (!scalarLLVMDestination) + { + fn.Store(destValueOffset, result); + } + else + { + const_cast(destination).SetData(Emittable{ result }); + } + }, + ""); } else { @@ -1113,16 +1417,19 @@ namespace value auto& fn = this->GetFunctionEmitter(); auto result = fn.TrueBit(); - ForImpl(source1.GetLayout(), [&](std::vector index) { - auto offsetSource1 = source1.Offset(detail::CalculateOffset(source1.GetLayout(), index)); - auto offsetSource2 = source2.Offset(detail::CalculateOffset(source2.GetLayout(), index)); - result = fn.LogicalAnd( - result, - fn.Comparison( - comparisonOp, - fn.Load(ToLLVMValue(offsetSource1)), - fn.Load(ToLLVMValue(offsetSource2)))); - }); + ForImpl( + source1.GetLayout(), + [&](std::vector index) { + auto offsetSource1 = source1.Offset(detail::CalculateOffset(source1.GetLayout(), index)); + auto offsetSource2 = source2.Offset(detail::CalculateOffset(source2.GetLayout(), index)); + result = fn.LogicalAnd( + result, + fn.Comparison( + comparisonOp, + fn.Load(ToLLVMValue(offsetSource1)), + fn.Load(ToLLVMValue(offsetSource2)))); + }, + ""); return { Emittable{ result }, ScalarLayout }; }, @@ -1131,20 +1438,30 @@ namespace value using CastType = std::conditional_t, bool, Type>; auto& fn = this->GetFunctionEmitter(); - auto result = fn.TrueBit(); + LLVMValue result = nullptr; auto llvmOp1 = source1Data.GetDataAs(); - - ConstantForLoop(source1.GetLayout(), [&](const MemoryCoordinates& logicalCoordinates) { - auto source1Offset = source1.GetLayout().GetLogicalEntryOffset(logicalCoordinates); - auto source2Offset = source2.GetLayout().GetLogicalEntryOffset(logicalCoordinates); - - result = fn.LogicalAnd( - result, - 
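+                        // Note on the scalar path above: a pointer-level-0 destination has
+                        // no storage to write through, so the computed result replaces the
+                        // destination's underlying SSA value via SetData rather than a store.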
fn.Comparison( - comparisonOp, - fn.ValueAt(llvmOp1, source1Offset), - fn.Literal(static_cast(source2Data[source2Offset])))); - }); + if (source1.PointerLevel() == 0) + { + result = fn.Comparison( + comparisonOp, + llvmOp1, + fn.Literal(static_cast(source2Data[0]))); + } + else + { + result = fn.TrueBit(); + ConstantForLoop(source1.GetLayout(), [&](const MemoryCoordinates& logicalCoordinates) { + auto source1Offset = source1.GetLayout().GetLogicalEntryOffset(logicalCoordinates); + auto source2Offset = source2.GetLayout().GetLogicalEntryOffset(logicalCoordinates); + + result = fn.LogicalAnd( + result, + fn.Comparison( + comparisonOp, + fn.ValueAt(llvmOp1, source1Offset), + fn.Literal(static_cast(source2Data[source2Offset])))); + }); + } return { Emittable{ result }, ScalarLayout }; } }, @@ -1224,7 +1541,7 @@ namespace value } else { - testValue = ToLLVMValue(value); + testValue = value::ToLLVMValue(value); } _ifEmitter.ElseIf(testValue, [fn = std::move(fn)](auto&) { fn(); }); @@ -1258,6 +1575,22 @@ namespace value return { std::make_unique(std::move(ifEmitter), fnEmitter) }; } + void LLVMContext::WhileImpl(Scalar test, std::function fn) + { + auto& fnEmitter = GetFunctionEmitter(); + LLVMValue testValue = nullptr; + if (auto value = test.GetValue(); value.IsConstant()) + { + testValue = fnEmitter.Literal(static_cast(test.Get())); + } + else + { + testValue = ToLLVMValue(value); + } + + fnEmitter.While(testValue, [fn = std::move(fn)](auto&) { fn(); }); + } + std::optional LLVMContext::CallImpl(FunctionDeclaration func, std::vector args) { if (std::any_of(args.begin(), args.end(), [](const auto& value) { return value.IsEmpty(); })) @@ -1275,9 +1608,9 @@ namespace value { return it->second(args); } - return EmitExternalCall(func, args); } + void LLVMContext::PrefetchImpl(Value data, PrefetchType type, PrefetchLocality locality) { if (data.IsConstant()) @@ -1431,8 +1764,37 @@ namespace value return _computeContext.GetName(realized); } - auto llvmValue = ToLLVMValue(realized); - return llvmValue->getName(); + auto llvmValue = *ToLLVMValue(realized); + if (llvmValue != nullptr) + return llvmValue->getName(); + else + return ""; + } + + void LLVMContext::ImportCodeFileImpl(std::string filename) + { + if (auto lowercaseFilename = utilities::ToLowercase(filename); utilities::EndsWith(lowercaseFilename, ".ll")) + { + _emitter.LoadIRFromFile(filename); + } + else if (utilities::EndsWith(lowercaseFilename, ".s")) + { + _emitter.LoadAsmFromFile(filename); + } + else + { + throw LogicException(LogicExceptionErrors::illegalState, "[LLVMContext] Don't know how to import code file " + filename); + } + } + + Scalar LLVMContext::GetFunctionAddressImpl(const FunctionDeclaration& fn) + { + auto llvmFn = DeclareFunction(fn); + + auto& fnEmitter = GetFunctionEmitter(); + auto fnAddress = fnEmitter.CastPointerToInt(llvmFn, VariableType::Int64); + fnAddress->setName(fn.GetFunctionName() + "Ptr"); + return Value(Emittable{ fnAddress }, ScalarLayout); } Value LLVMContext::IntrinsicCall(FunctionDeclaration intrinsic, std::vector args) @@ -1454,12 +1816,16 @@ namespace value { FloorFunctionDeclaration, SimpleNumericalFunctionIntrinsic(&IRRuntime::GetFloorFunction) }, { CeilFunctionDeclaration, SimpleNumericalFunctionIntrinsic(&IRRuntime::GetCeilFunction) }, { CopySignFunctionDeclaration, CopySignFunctionIntrinsic() }, + { FmaFunctionDeclaration, FmaFunctionIntrinsic() }, + { MemCopyFunctionDeclaration, MemOpFunctionIntrinsic(MemIntrinsicOp::Copy) }, + { MemMoveFunctionDeclaration, 
MemOpFunctionIntrinsic(MemIntrinsicOp::Move) }, + { MemSetFunctionDeclaration, MemOpFunctionIntrinsic(MemIntrinsicOp::Set) }, }; if (std::all_of(args.begin(), args.end(), [](const auto& value) { return value.IsConstant(); })) { // Compute context can handle intrinsic calls with constant data - return *_computeContext.Call(intrinsic, args); + return *_computeContext.Call(intrinsic, std::vector(args.begin(), args.end())); } std::vector emittableArgs; @@ -1482,39 +1848,14 @@ namespace value throw InputException(InputExceptionErrors::sizeMismatch); } - auto& irEmitter = _emitter.GetIREmitter(); - auto& fnEmitter = GetFunctionEmitter(); - const auto& returnType = externalFunc.GetReturnType(); - // Create external function declaration - const auto& fnName = externalFunc.GetFunctionName(); - if (!_emitter.HasFunction(fnName)) - { - auto resultType = [&] { - if (returnType) - { - return ValueTypeToLLVMType(irEmitter, { returnType->GetBaseType(), returnType->PointerLevel() }); - } - else - { - return ValueTypeToLLVMType(irEmitter, { ValueType::Void, 0 }); - } - }(); - - std::vector paramTypes(argTypes.size()); - std::transform(argTypes.begin(), argTypes.end(), paramTypes.begin(), [&](const auto& value) { - return ValueTypeToLLVMType(irEmitter, { value.GetBaseType(), value.PointerLevel() }); - }); - - auto fnType = llvm::FunctionType::get(resultType, paramTypes, false); - _emitter.DeclareFunction(fnName, fnType); - } - auto fn = _emitter.GetFunction(fnName); + DeclareFunction(externalFunc); // as a first approximation, if the corresponding arg type has a pointer level that's one less // than the passed in value, we dereference it. if it's the same, we pass it in as is. if it's anything else, // throw. this logic may not be sufficient for future use cases. + auto& fnEmitter = GetFunctionEmitter(); std::vector argValues; argValues.reserve(args.size()); for (auto idx = 0u; idx < args.size(); ++idx) @@ -1542,7 +1883,39 @@ namespace value } } + auto fn = [&, this]() -> LLVMFunction { + if (externalFunc.IsPointerSet()) + { + auto llvmFuncAddr = ToLLVMValue(externalFunc.GetPointer().GetValue()); + auto& fnEmitter = GetFunctionEmitter(); + auto fnType = ToLLVMFunctionType(externalFunc); + auto fnPtr = fnEmitter.CastIntToPointer(llvmFuncAddr, fnType->getPointerTo()); + return llvm::dyn_cast(fnPtr); + } + else + { + const auto& fnName = externalFunc.GetFunctionName(); + return _emitter.GetFunction(fnName); + } + }(); + assert(fn != nullptr); + auto resultValue = fnEmitter.Call(fn, argValues); + auto callInst = llvm::dyn_cast(resultValue); + if (callInst) + { + auto callingConv = fn->getCallingConv(); + callInst->setCallingConv(callingConv); + if (externalFunc.InlineState() != FunctionInlining::defaultInline) + { + IRFunctionEmitter::SetInlineState(fn, GetEmittersFunctionInlining(externalFunc.InlineState())); + + // Not sure if this is actually necessary or helpful: + llvm::InlineFunctionInfo inliner; + llvm::InlineFunction(callInst, inliner); + } + } + auto result = returnType; if (result) { @@ -1598,6 +1971,27 @@ namespace value return _functionStack.top().get(); } + emitters::LLVMFunction LLVMContext::DeclareFunction(const FunctionDeclaration& func) + { + // Create external function declaration + const auto& fnName = func.GetFunctionName(); + if (auto llvmFn = _emitter.GetFunction(fnName); llvmFn != nullptr) + { + return llvmFn; + } + + auto fnType = ToLLVMFunctionType(func); + auto fn = _emitter.DeclareFunction(fnName, fnType); + IRFunctionEmitter::SetInlineState(fn, 
GetEmittersFunctionInlining(func.InlineState())); + return fn; + } + + void LLVMContext::DebugBreakImpl() + { + auto& fn = GetFunctionEmitter(); + fn.DebugBreak(); + } + Value LLVMContext::PromoteConstantData(Value value) { assert(value.IsConstant() && value.IsDefined() && !value.IsEmpty()); @@ -1729,7 +2123,7 @@ namespace value newValue.SetData(Emittable{ fn.PointerOffset(emittable.GetDataAs(), static_cast(offset)) }); if (!name.empty()) { - ToLLVMValue(newValue)->setName(name); + (*ToLLVMValue(newValue))->setName(name); } return newValue; @@ -1752,5 +2146,86 @@ namespace value } } + std::vector LLVMContext::EnsureEmittable(std::vector values) + { + std::vector emittables(values.size()); + std::transform(values.begin(), values.end(), emittables.begin(), [this](Value& arg) -> Value { + return EnsureEmittable(arg); + }); + return emittables; + } + + std::optional LLVMContext::ToLLVMValue(Value value) const + { + if (value.IsConstant()) + { + return std::nullopt; + } + return value.Get().GetDataAs(); + } + + LLVMValue LLVMContext::ToLLVMValue(Value value) + { + return EnsureEmittable(value).Get().GetDataAs(); + } + + std::vector> LLVMContext::ToLLVMValue(std::vector values) const + { + std::vector> llvmValues(values.size()); + std::transform(values.begin(), values.end(), llvmValues.begin(), [this](Value& value) -> std::optional { + return ToLLVMValue(value); + }); + return llvmValues; + } + + std::vector LLVMContext::ToLLVMValue(std::vector values) + { + std::vector llvmValues(values.size()); + std::transform(values.begin(), values.end(), llvmValues.begin(), [this](Value& value) -> LLVMValue { + return ToLLVMValue(value); + }); + return llvmValues; + } + + LLVMFunctionType LLVMContext::ToLLVMFunctionType(const FunctionDeclaration& func) const + { + auto& irEmitter = _emitter.GetIREmitter(); + const auto& argTypes = func.GetParameterTypes(); + const auto& returnType = func.GetReturnType(); + + // Create external function declaration + auto resultType = returnType ? 
ValueTypeToLLVMType(irEmitter, { returnType->GetBaseType(), returnType->PointerLevel() }) : ValueTypeToLLVMType(irEmitter, { ValueType::Void, 0 }); + std::vector paramTypes(argTypes.size()); + std::transform(argTypes.begin(), argTypes.end(), paramTypes.begin(), [&](const auto& value) { + return ValueTypeToLLVMType(irEmitter, { value.GetBaseType(), value.PointerLevel() }); + }); + + return llvm::FunctionType::get(resultType, paramTypes, false); + } + + LLVMValue ToLLVMValue(Value value) + { + auto val = InvokeForContext([&](LLVMContext& context) { + return context.ToLLVMValue(value); + }); + return val.value_or(nullptr); + } + + LLVMValue ToLLVMValue(ViewAdapter value) + { + auto val = InvokeForContext([&](LLVMContext& context) { + return context.ToLLVMValue(value); + }); + return val.value_or(nullptr); + } + + std::vector ToLLVMValue(std::vector values) + { + std::vector llvmValues(values.size()); + std::transform(values.begin(), values.end(), llvmValues.begin(), [](Value& value) -> LLVMValue { + return ToLLVMValue(value); + }); + return llvmValues; + } } // namespace value } // namespace ell diff --git a/libraries/value/src/LoopNests.cpp b/libraries/value/src/LoopNests.cpp new file mode 100644 index 000000000..9a39f2c53 --- /dev/null +++ b/libraries/value/src/LoopNests.cpp @@ -0,0 +1,354 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: LoopNests.cpp (value) +// Authors: Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "LoopNests.h" + +#include "loopnests/CodeGenerator.h" +#include "loopnests/CodePositionConstraints.h" +#include "loopnests/IndexRange.h" +#include "loopnests/Kernel.h" +#include "loopnests/LoopNest.h" + +#include +#include +#include + +using namespace ell::utilities; + +namespace ell +{ +namespace value +{ + class LoopNestImpl + { + public: + ~LoopNestImpl() = default; + + void Using(std::initializer_list inputs, ArgumentType argType) + { + for (auto input : inputs) + { + _arguments.emplace_back(input, argType); + } + } + + void ForAll(Index index, int begin, int end) + { + _ranges.emplace_back(index, loopnests::Range{ begin, end }); + } + + void EnsureCreated() + { + if (!_nest.has_value()) + { + _nest = loopnests::LoopNest(_ranges); + } + } + + void Do(std::function)> fn, std::vector kernelOuterIndices, std::string kernelId) + { + std::vector arguments; + arguments.reserve(_arguments.size()); + for (const auto& arg : _arguments) + { + arguments.push_back(arg.first); + } + std::vector indices; + for (const auto& index : _ranges) + { + indices.push_back(index.GetIndex()); + } + auto name = UniqueName("kernel"); + if (kernelId.empty()) + { + kernelId = name; + } + auto kernel = loopnests::Kernel(name, kernelId) + .Inputs(arguments) + .Indices(indices) + .Define(fn); + + Do(kernel, kernelOuterIndices); + } + + void Do(Kernel kernelFn, std::vector kernelOuterIndices) + { + _kernels.push_back(std::move(kernelFn)); + auto& kernel = _kernels.back(); + + EnsureCreated(); + if (kernelOuterIndices.empty()) + { + _nest->AddKernel(kernel, loopnests::LoopNest::ConstraintType::constraint); + } + else + { + loopnests::CodePositionConstraints constraints{ loopnests::LoopFragmentType::body, kernelOuterIndices, {} }; + _nest->AddKernel(kernel, constraints); + } + } + + void Do(Kernel kernelFn, const loopnests::KernelPredicate& predicate, const loopnests::KernelPredicate& placement) + { + 
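// Take ownership of the kernel, then register it with the nest under the given invocation (predicate) and placement constraints. + 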
_kernels.push_back(std::move(kernelFn)); + auto& kernel = _kernels.back(); + + EnsureCreated(); + _nest->AddKernel(kernel, predicate, placement); + } + + Index Split(Index& index, int factor) + { + EnsureCreated(); + // TODO: this might not be needed in the high level api + auto it = _splits.find({ index, factor }); + if (it == _splits.end()) + { + auto splitResult = _nest->Split(index, factor); + it = _splits.insert(it, { { index, factor }, splitResult }); + } + + index = it->second.inner; + return it->second.outer; + } + + void Parallelize(Index index) + { + EnsureCreated(); + _nest->Parallelize(index); + } + + void Unroll(Index index) + { + EnsureCreated(); + _nest->Unroll(index); + } + + void SetOrder(std::vector indices) + { + EnsureCreated(); + _nest->SetLoopOrder(indices); + } + + void Run() const + { + loopnests::CodeGenerator{}.Run(*_nest); + } + + loopnests::LoopNest& GetUnderlyingLoopNest() + { + EnsureCreated(); + return *_nest; + } + + private: + std::vector> _arguments; + std::vector _ranges; + std::vector _kernels; + std::optional _nest; + std::map, loopnests::SplitIndex> _splits; + }; + + LoopNest Using(std::initializer_list inputs, ArgumentType argType) + { + return LoopNest{}.Using(inputs, argType); + } + + // + // LoopNest + // + + LoopNest::LoopNest() : + _impl(std::make_unique()), + _schedule(*this) {} + LoopNest::LoopNest(const LoopNest& other) : + _impl(std::make_unique(*other._impl)), + _schedule(*this) {} + LoopNest::LoopNest(LoopNest&& other) noexcept : + _impl(std::move(other._impl)), + _schedule(*this) {} + + LoopNest& LoopNest::operator=(const LoopNest& other) + { + if (this != &other) + { + *_impl = *other._impl; + } + + return *this; + } + + LoopNest& LoopNest::operator=(LoopNest&& other) noexcept + { + if (this != &other) + { + _impl = std::move(other._impl); + } + + return *this; + } + + LoopNest::~LoopNest() = default; + + LoopNest& LoopNest::Using(std::initializer_list inputs, ArgumentType argType) + { + _impl->Using(inputs, argType); + + return *this; + } + + LoopNest& LoopNest::ForAll(Index index, int begin, int end) + { + _impl->ForAll(index, begin, end); + + return *this; + } + + LoopNest& LoopNest::Do(std::function)> fn, std::vector kernelOuterIndices, std::string kernelId) + { + _impl->Do(fn, kernelOuterIndices, kernelId); + + return *this; + } + + LoopNest& LoopNest::Do(std::function)> fn, std::string kernelId) + { + return Do(fn, {}, kernelId); + } + + LoopNest& LoopNest::Do(Kernel kernel, std::vector kernelOuterIndices) + { + _impl->Do(kernel, kernelOuterIndices); + + return *this; + } + + LoopNest& LoopNest::Do(Kernel kernel, const loopnests::KernelPredicate& predicate, const loopnests::KernelPredicate& placement) + { + _impl->Do(kernel, predicate, placement); + + return *this; + } + + void LoopNest::Run() const + { + _impl->Run(); + } + + loopnests::LoopNest& LoopNest::GetUnderlyingLoopNest() + { + return _impl->GetUnderlyingLoopNest(); + } + + const loopnests::LoopNest& LoopNest::GetUnderlyingLoopNest() const + { + return _impl->GetUnderlyingLoopNest(); + } + + Schedule& LoopNest::GetSchedule() + { + return _schedule; + } + + void swap(LoopNest& nest1, LoopNest& nest2) noexcept + { + using std::swap; + swap(nest1._impl, nest2._impl); + } + + // + // Schedule + // + + Schedule::Schedule(LoopNest& nest) : + _nest(nest), + _impl(*nest._impl) + {} + + Schedule::Schedule(const Schedule& other) = default; + Schedule& Schedule::operator=(const Schedule& other) = default; + + Index Schedule::Split(Index& index, int factor) + { + return 
_impl.get().Split(index, factor); + } + + void Schedule::Parallelize(Index index) + { + _impl.get().Parallelize(index); + } + + Index Schedule::Parallelize(Index index, int factor) + { + auto outer = Split(index, factor); + Parallelize(outer); + return outer; + } + + void Schedule::Unroll(Index index) + { + _impl.get().Unroll(index); + } + + Index Schedule::Unroll(Index index, int factor) + { + auto outer = Split(index, factor); + Unroll(outer); + return outer; + } + + void Schedule::Cache(std::unique_ptr provider) + { + provider->HandleCaching(_nest.get()); + } + + utilities::MemoryShape Schedule::GetShapeFromIndicesIncrement(std::vector& kernelIndices) + { + std::vector sizes; + for (auto index : kernelIndices) + { + auto range = _nest.get().GetUnderlyingLoopNest().GetIndexRange(index); + sizes.push_back(range.Increment()); + } + + return { sizes }; + } + + void Schedule::Cache( + CachingProvider& provider, + ViewAdapter view, + std::vector kernelIndices, + utilities::MemoryShape size, + std::vector atIndices, + std::optional order, + std::any extras) + { + if (size.NumDimensions() == 0) + { + // Figure out size based on increment of the indices + size = GetShapeFromIndicesIncrement(kernelIndices); + }; + + provider.Initialize( + view, + size, + order.value_or(DimensionOrder(size.NumDimensions())), + kernelIndices, + atIndices.empty() ? kernelIndices : atIndices, + extras); + + provider.HandleCaching(_nest.get()); + } + + void Schedule::SetOrder(std::vector indices) + { + _impl.get().SetOrder(indices); + } + +} // namespace value +} // namespace ell diff --git a/libraries/value/src/Matrix.cpp b/libraries/value/src/Matrix.cpp index 807fe4671..ba00f16ae 100644 --- a/libraries/value/src/Matrix.cpp +++ b/libraries/value/src/Matrix.cpp @@ -62,7 +62,7 @@ namespace value Scalar Matrix::operator()(Scalar rowIndex, Scalar columnIndex) { - Value indexedValue = GetContext().Offset(_value, { rowIndex, columnIndex }); + Value indexedValue = GetContext().Offset(_value, { rowIndex, columnIndex }); indexedValue.SetLayout(ScalarLayout); return indexedValue; @@ -135,6 +135,11 @@ namespace value size_t Matrix::Columns() const { return static_cast(_value.GetLayout().GetLogicalDimensionActiveSize(1)); } + Matrix::MatrixLayout Matrix::GetMatrixLayout() const + { + return _value.GetLayout().IsCanonicalOrder() ? 
MatrixLayout::rowMajor : MatrixLayout::columnMajor; + } + ValueType Matrix::Type() const { return _value.GetBaseType(); } void Matrix::SetName(const std::string& name) { _value.SetName(name); } diff --git a/libraries/value/src/MatrixOperations.cpp b/libraries/value/src/MatrixOperations.cpp index a7833f9fa..c90150ad2 100644 --- a/libraries/value/src/MatrixOperations.cpp +++ b/libraries/value/src/MatrixOperations.cpp @@ -20,15 +20,15 @@ using namespace utilities; namespace value { - Matrix ToMatrix(Value data, int numRows, int numCols) - { - Value matrix = data; - auto size = data.GetLayout().GetActiveSize().NumElements(); - if (size != numRows * numCols || !data.GetLayout().IsContiguous()) - { - throw InputException(InputExceptionErrors::invalidArgument, - ell::utilities::FormatString("data must be contiguous and have size %zu = %d * %d", size, numRows, numCols)); - } + Matrix ToMatrix(Value data, int numRows, int numCols) + { + Value matrix = data; + auto size = data.GetLayout().GetActiveSize().NumElements(); + if (size != numRows * numCols || !data.GetLayout().IsContiguous()) + { + throw InputException(InputExceptionErrors::invalidArgument, + ell::utilities::FormatString("data must be contiguous and have size %zu = %d * %d", size, numRows, numCols)); + } matrix.SetLayout(utilities::MemoryLayout{ { numRows, numCols } }); return matrix; } @@ -45,6 +45,11 @@ namespace value } void For(Matrix matrix, std::function fn) + { + For(std::string{}, matrix, fn); + } + + void For(const std::string& name, Matrix matrix, std::function fn) { auto layout = matrix.GetValue().GetLayout(); if (layout.NumDimensions() != 2) @@ -53,9 +58,12 @@ namespace value "Layout being looped over must be two-dimensional"); } - GetContext().For(layout, [fn = std::move(fn)](std::vector coordinates) { - fn(coordinates[0], coordinates[1]); - }); + GetContext().For( + layout, + [fn = std::move(fn)](std::vector coordinates) { + fn(coordinates[0], coordinates[1]); + }, + name); } Matrix GEMM(Matrix m1, Matrix m2) { throw LogicException(LogicExceptionErrors::notImplemented); } @@ -67,7 +75,7 @@ namespace value if (m.Columns() != v.Size()) { throw InputException(InputExceptionErrors::invalidArgument, - ell::utilities::FormatString("Vector size %d must match number of columns in the matrix %d", v.Size(), m.Columns())); + ell::utilities::FormatString("Vector size %d must match number of columns in the matrix %d", v.Size(), m.Columns())); } first = 1; For(m, [&](Scalar row, Scalar col) { diff --git a/libraries/value/src/Print.cpp b/libraries/value/src/Print.cpp new file mode 100644 index 000000000..481d13b13 --- /dev/null +++ b/libraries/value/src/Print.cpp @@ -0,0 +1,65 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: Print.cpp (value) +// Authors: Chuck Jacobs +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "Print.h" +#include "LLVMContext.h" + +#include +#include + +namespace ell +{ +namespace value +{ + void Print(const std::string& text) + { + if (!InvokeForContext([&](LLVMContext& context) -> bool { + context.GetFunctionEmitter().Print(text); + return true; + })) + { + std::printf("%s", text.c_str()); + } + } + + void Printf(const std::vector& arguments) + { + if (!InvokeForContext([&](LLVMContext& context) -> bool { + std::vector args; + std::transform(arguments.begin(), arguments.end(), std::back_inserter(args), [&](auto x) { + return 
ToLLVMValue(x); + }); + + context.GetFunctionEmitter().Printf(args); + return true; + })) + { + std::printf(""); + } + } + + void Printf(const std::string& format, const std::vector& arguments) + { + if (!InvokeForContext([&](LLVMContext& context) -> bool { + std::vector args; + std::transform(arguments.begin(), arguments.end(), std::back_inserter(args), [&](auto x) { + return ToLLVMValue(x); + }); + + context.GetFunctionEmitter().Printf(format, args); + return true; + })) + { + std::printf("%s", format.c_str()); + } + } + +} // namespace value +} // namespace ell + +#pragma endregion implementation diff --git a/libraries/value/src/Scalar.cpp b/libraries/value/src/Scalar.cpp index 9f35d0f65..c5cfa7530 100644 --- a/libraries/value/src/Scalar.cpp +++ b/libraries/value/src/Scalar.cpp @@ -103,32 +103,27 @@ namespace value // Free function operator overloads Scalar operator+(Scalar s1, Scalar s2) { - Scalar copy = s1.Copy(); - return copy += s2; + return Add(s1, s2); } Scalar operator-(Scalar s1, Scalar s2) { - Scalar copy = s1.Copy(); - return copy -= s2; + return Subtract(s1, s2); } Scalar operator*(Scalar s1, Scalar s2) { - Scalar copy = s1.Copy(); - return copy *= s2; + return Multiply(s1, s2); } Scalar operator/(Scalar s1, Scalar s2) { - Scalar copy = s1.Copy(); - return copy /= s2; + return Divide(s1, s2); } Scalar operator%(Scalar s1, Scalar s2) { - Scalar copy = s1.Copy(); - return copy %= s2; + return Modulo(s1, s2); } Scalar operator-(Scalar s) diff --git a/libraries/value/src/ScalarOperations.cpp b/libraries/value/src/ScalarOperations.cpp new file mode 100644 index 000000000..026d0db46 --- /dev/null +++ b/libraries/value/src/ScalarOperations.cpp @@ -0,0 +1,58 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: ScalarOperations.cpp (value) +// Authors: Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "ScalarOperations.h" +#include "EmitterContext.h" +#include "Scalar.h" + +#include + +namespace ell +{ +using namespace utilities; + +namespace value +{ + + Scalar Add(Scalar s1, Scalar s2) + { + Scalar copy = s1.Copy(); + return copy += s2; + } + + Scalar Subtract(Scalar s1, Scalar s2) + { + Scalar copy = s1.Copy(); + return copy -= s2; + } + + Scalar Multiply(Scalar s1, Scalar s2) + { + Scalar copy = s1.Copy(); + return copy *= s2; + } + + Scalar Divide(Scalar s1, Scalar s2) + { + Scalar copy = s1.Copy(); + return copy /= s2; + } + + Scalar Modulo(Scalar s1, Scalar s2) + { + Scalar copy = s1.Copy(); + return copy %= s2; + } + + Scalar FusedMultiplyAdd(Scalar a, Scalar b, Scalar c) + { + return Fma(a, b, c); + } + +} // namespace value +} // namespace ell diff --git a/libraries/value/src/TensorOperations.cpp b/libraries/value/src/TensorOperations.cpp index 237dc4cb5..298049d3d 100644 --- a/libraries/value/src/TensorOperations.cpp +++ b/libraries/value/src/TensorOperations.cpp @@ -29,6 +29,11 @@ namespace value } void For(Tensor tensor, std::function fn) + { + For(std::string{}, tensor, fn); + } + + void For(const std::string& name, Tensor tensor, std::function fn) { auto layout = tensor.GetValue().GetLayout(); if (layout.NumDimensions() != 3) @@ -37,9 +42,12 @@ namespace value "Layout being looped over must be three-dimensional"); } - GetContext().For(layout, [fn = std::move(fn)](std::vector coordinates) { - fn(coordinates[0], coordinates[1], coordinates[2]); - }); + GetContext().For( + layout, + 
[fn = std::move(fn)](std::vector coordinates) { + fn(coordinates[0], coordinates[1], coordinates[2]); + }, + name); } } // namespace value diff --git a/libraries/value/src/Value.cpp b/libraries/value/src/Value.cpp index 4f1baa8f5..4b60ed4f5 100644 --- a/libraries/value/src/Value.cpp +++ b/libraries/value/src/Value.cpp @@ -35,6 +35,7 @@ namespace value other._data = temp; std::swap(_type, other._type); std::swap(_layout, other._layout); + std::swap(_hasName, other._hasName); } // clang-format off @@ -74,6 +75,7 @@ namespace value _layout = other._layout; _data = other._data; _type = other._type; + _hasName = other._hasName; } } else @@ -96,6 +98,7 @@ namespace value _layout = other._layout; _data = other._data; _type = other._type; + _hasName = other._hasName; } else { @@ -113,6 +116,7 @@ namespace value } _data = other._data; _type = other._type; + _hasName = other._hasName; } else { @@ -142,6 +146,7 @@ namespace value _data = std::move(other._data); _layout = std::move(other._layout); _type = std::move(other._type); + _hasName = std::move(other._hasName); } } else @@ -164,6 +169,7 @@ namespace value _data = std::move(other._data); _layout = std::move(other._layout); _type = std::move(other._type); + _hasName = std::move(other._hasName); } else { @@ -181,6 +187,7 @@ namespace value } _data = std::move(other._data); _type = other._type; + _hasName = std::move(other._hasName); } else { @@ -219,6 +226,7 @@ namespace value _type = { ValueType::Undefined, 0 }; _layout.reset(); _data = {}; + _hasName = false; } void Value::SetData(Value value, bool force) @@ -236,6 +244,10 @@ namespace value } _data = emittable; + if (!force) + { + _type = type; + } }, [this, force](auto&& arg) { if (!force && GetValueType>() != _type.first) diff --git a/libraries/value/src/ValueOperations.cpp b/libraries/value/src/ValueOperations.cpp index 6d90fb909..9bcbfd6ff 100644 --- a/libraries/value/src/ValueOperations.cpp +++ b/libraries/value/src/ValueOperations.cpp @@ -28,6 +28,11 @@ namespace value }); } + void For(Scalar start, Scalar stop, Scalar step, std::function fn) + { + GetContext().For(start, stop, step, fn); + } + Value Cast(Value value, ValueType type) { if (value.GetBaseType() == type) diff --git a/libraries/value/src/Vector.cpp b/libraries/value/src/Vector.cpp index 6c8bf108f..315e52ce2 100644 --- a/libraries/value/src/Vector.cpp +++ b/libraries/value/src/Vector.cpp @@ -286,5 +286,6 @@ namespace value value.SetLayout(value.GetLayout().Flatten()); return value; } + } // namespace value } // namespace ell diff --git a/libraries/value/src/VectorOperations.cpp b/libraries/value/src/VectorOperations.cpp index fbc495c8d..2a5b88703 100644 --- a/libraries/value/src/VectorOperations.cpp +++ b/libraries/value/src/VectorOperations.cpp @@ -92,7 +92,7 @@ namespace value result = InvokeForContext([&](LLVMContext& context) -> Scalar { if (context.GetModuleEmitter().GetCompilerOptions().useBlas) { - auto returnValue = fn.Decorated(FunctionDecorated::No) + auto returnValue = fn.Decorated(false) .Call( Scalar{ static_cast(v1.Size()) }, v1, @@ -108,7 +108,12 @@ namespace value } }); - return *result; + if (result) + { + return *result; + } + + return defaultImpl(v1, v2); } else if (v1.GetType() == ValueType::Double) { @@ -142,7 +147,7 @@ namespace value result = InvokeForContext([&](LLVMContext& context) -> Scalar { if (context.GetModuleEmitter().GetCompilerOptions().useBlas) { - auto returnValue = fn.Decorated(FunctionDecorated::No) + auto returnValue = fn.Decorated(false) .Call( Scalar{ static_cast(v1.Size()) }, v1, @@ 
-158,7 +163,12 @@ namespace value } }); - return *result; + if (result) + { + return *result; + } + + return defaultImpl(v1, v2); } else { @@ -167,6 +177,11 @@ namespace value } void For(Vector v, std::function fn) + { + For(std::string{}, v, fn); + } + + void For(const std::string& name, Vector v, std::function fn) { auto layout = v.GetValue().GetLayout(); @@ -176,7 +191,10 @@ namespace value "Layout being looped over must be one-dimensional"); } - GetContext().For(layout, [fn = std::move(fn)](std::vector coordinates) { fn(coordinates[0]); }); + GetContext().For( + layout, + [fn = std::move(fn)](std::vector coordinates) { fn(coordinates[0]); }, + name); } } // namespace value diff --git a/libraries/value/src/loopnests/CodeGenerator.cpp b/libraries/value/src/loopnests/CodeGenerator.cpp new file mode 100644 index 000000000..5a4a7caed --- /dev/null +++ b/libraries/value/src/loopnests/CodeGenerator.cpp @@ -0,0 +1,461 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: CodeGenerator.cpp (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "loopnests/CodeGenerator.h" +#include "loopnests/KernelPredicate.h" + +#include "LLVMContext.h" + +#include +#include +#include +#include + +namespace ell +{ +namespace value +{ + namespace loopnests + { + namespace + { + // computes ceil(a/b) + int CeilDiv(int a, int b) + { + return (a - 1) / b + 1; + } + } // namespace + + void CodeGenerator::Run(const LoopNest& loopNest) const + { + Visit(loopNest); + } + + void CodeGenerator::GenerateLoopRangeNew(const LoopRange& r, const RecursionStateNew& state, const LoopVisitSchedule& schedule, std::function codegenFn) const + { + const LoopNest& loopNest = schedule.GetLoopNest(); + auto loopIndex = schedule.CurrentLoopIndex(); + + bool isParallelized = loopNest.IsParallelized(loopIndex); + bool isUnrolled = loopNest.IsUnrolled(loopIndex); + assert(!(isParallelized && isUnrolled) && "An index cannot be both unrolled and parallelized"); + + const int startInt = r.start.Get(); + const int stopInt = r.stop.Get(); + const int stepInt = r.step.Get(); + + int numIterations = CeilDiv(stopInt - startInt, stepInt); + if (numIterations < 2) + { + isParallelized = false; + } + + if (!(isParallelized || isUnrolled)) + { + ForRange(UniqueName(loopNest.Name()), r.start, r.stop, r.step, codegenFn); + } + else if (isParallelized) + { + int numThreads = numIterations; + + std::vector kernelInputs; + for (const auto& g : state.kernelGroups) + { + if (g.first) + { + for (const auto& scheduledKernel : g.second.kernels) + { + auto& kernel = scheduledKernel.kernel; + const auto& inputs = kernel.GetArgs(); + kernelInputs.insert(kernelInputs.end(), inputs.begin(), inputs.end()); + } + } + } + + std::once_flag onceFlag; + GetContext().Parallelize(numThreads, kernelInputs, [&](Scalar index, std::vector captured) mutable { + assert(kernelInputs.size() == captured.size()); + + std::call_once(onceFlag, [&] { + for (unsigned i = 0; i < captured.size(); ++i) + { + // TODO: figure out what to do with the "where" parameter + // TODO: get rid of const_cast + const_cast(loopNest).RenameVariable(kernelInputs[i], captured[i], { loopIndex }); + } + }); + + codegenFn(index * stepInt); + }); + } + else if (isUnrolled) + { + for (int i = startInt; i < stopInt; i += stepInt) + { + codegenFn(i); + } + } + } + + void 
CodeGenerator::GenerateLoopRangeOld(const LoopRange& r, const RecursionState& state, const LoopVisitSchedule& schedule, std::function codegenFn) const + { + const LoopNest& loopNest = schedule.GetLoopNest(); + auto loopIndex = schedule.CurrentLoopIndex(); + + bool isParallelized = loopNest.IsParallelized(loopIndex); + bool isUnrolled = loopNest.IsUnrolled(loopIndex); + assert(!(isParallelized && isUnrolled) && "An index cannot be both unrolled and parallelized"); + + const int startInt = r.start.Get(); + const int stopInt = r.stop.Get(); + const int stepInt = r.step.Get(); + + int numIterations = CeilDiv(stopInt - startInt, stepInt); + if (numIterations < 2) + { + isParallelized = false; + } + + if (!(isParallelized || isUnrolled)) + { + ForRange(UniqueName(loopNest.Name()), r.start, r.stop, r.step, codegenFn); + } + else if (isParallelized) + { + int numThreads = numIterations; + + const auto& scheduledKernels = state.activeKernels; + std::vector kernelInputs; + for (const auto& scheduledKernel : scheduledKernels) + { + auto& kernel = scheduledKernel.kernel; + + const auto& inputs = kernel.GetArgs(); + kernelInputs.insert(kernelInputs.end(), inputs.begin(), inputs.end()); + } + + std::once_flag onceFlag; + GetContext().Parallelize(numThreads, kernelInputs, [&](Scalar index, std::vector captured) mutable { + assert(kernelInputs.size() == captured.size()); + + std::call_once(onceFlag, [&] { + for (unsigned i = 0; i < captured.size(); ++i) + { + // TODO: figure out what to do with the "where" parameter + // TODO: get rid of const_cast + const_cast(loopNest).RenameVariable(kernelInputs[i], captured[i], { loopIndex }); + } + }); + + codegenFn(index * stepInt); + }); + } + else if (isUnrolled) + { + for (int i = startInt; i < stopInt; i += stepInt) + { + codegenFn(i); + } + } + } + + Scalar CodeGenerator::EmitIndexExpression(const Index& index, const IndexExpression& expr, const LoopIndexSymbolTable& indexVariables) const + { + if (!expr.indices.empty()) + { + // We can't currently optimize away the "identity" expression, because the result (a loop's "index" Scalar) + // would be a register variable (pointer valence 0), and the generated kernel function expects a stored value + // (pointer valence 1). So, we need to call `Allocate()` to get a stored variable. + auto sum = Scalar(Allocate(utilities::ScalarLayout)); + sum = expr.begin; + for (auto scaledIndex : expr.indices) + { + if (auto it = indexVariables.find(scaledIndex.index); it != indexVariables.end()) + { + auto indexValue = it->second.value; + sum += indexValue * scaledIndex.scale; + } + } + + return sum; + } + return 0; + } + + Scalar CodeGenerator::EmitKernelPredicate(const KernelPredicate& predicate, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const + { + const auto& domain = schedule.GetLoopNest().GetDomain(); + auto predResult = MakeScalar("predResult"); + predResult = 1; // "true" + + auto emitPredicate = [&domain, &runtimeIndexVariables, &schedule](const auto& emitPredicate, const KernelPredicate& p, Scalar& result, bool defaultIsTrue) -> void { + if (p.IsAlwaysTrue()) + { + if (defaultIsTrue) + { + // nothing + } + else + { + result = Scalar(1); // "true" + } + } + else if (p.IsAlwaysFalse()) + { + if (defaultIsTrue) + { + result = Scalar(0); // "false" + } + else + { + // nothing + } + } + else if (auto simplePredicate = p.As(); simplePredicate != nullptr) + { + auto condition = simplePredicate->GetCondition(); + if (condition == Fragment::all) + { + return; // do nothing for 'all' predicates + } + + auto index = simplePredicate->GetIndex(); + const auto range = domain.GetDimensionRange(index); + + auto loopIndices = range.GetDependentLoopIndices(index); + if (loopIndices.empty()) + { + loopIndices = { index }; + } + for (auto loopIndex : loopIndices) + { + auto range = GetLoopRange(loopIndex, runtimeIndexVariables, schedule); + + int testVal = 0; + bool valid = true; + switch (condition) + { + case Fragment::first: + testVal = range.Begin(); + break; + case Fragment::last: + testVal = range.End() - (range.Size() % range.Increment()); + if (testVal == range.End()) // not a boundary + { + testVal = range.End() - range.Increment(); + } + break; + case Fragment::endBoundary: + testVal = range.End() - (range.Size() % range.Increment()); + if (testVal == range.End()) + { + valid = false; + } + break; + default: + // throw?
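+ // Unrecognized fragment condition: no test value can be derived, so mark it invalid and emit no check for this index.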
+ valid = false; + break; + } + + if (valid) + { + // if loop index not present, assume 0 + Scalar indexVal = MakeScalar("predIndexVal"); + if (runtimeIndexVariables.count(loopIndex) != 0) + { + indexVal = runtimeIndexVariables.at(loopIndex).value; + } + + if (defaultIsTrue) + { + If(indexVal != testVal, [&] { + result = Scalar(0); // "false" + }); + } + else + { + If(indexVal == testVal, [&] { + result = Scalar(1); // "true" + }); + } + } + } + } + else if (p.Is()) + { + throw utilities::LogicException(utilities::LogicExceptionErrors::notImplemented, "IsDefined predicate not implemented"); + } + else if (auto conjunction = p.As(); conjunction != nullptr) + { + auto conjResult = MakeScalar("conj"); + conjResult = Scalar(1); // "true" + for (const auto& t : conjunction->GetTerms()) + { + emitPredicate(emitPredicate, *t, conjResult, true); + } + + if (defaultIsTrue) + { + If(conjResult == 0, [&result] { + result = Scalar(0); // "false" + }); + } + else + { + If(conjResult != 0, [&result] { + result = Scalar(1); // "true" + }); + } + } + else if (auto disjunction = p.As(); disjunction != nullptr) + { + auto disjResult = MakeScalar("disj"); + disjResult = Scalar(0); // "false" + for (const auto& t : disjunction->GetTerms()) + { + emitPredicate(emitPredicate, *t, disjResult, false); + } + if (defaultIsTrue) + { + If(disjResult == 0, [&result] { + result = Scalar(0); // "false" + }); + } + else + { + If(disjResult != 0, [&result] { + result = Scalar(1); // "true" + }); + } + } + else + { + throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState, "Unknown predicate type"); + } + }; + + emitPredicate(emitPredicate, predicate, predResult, true); + return predResult; + } + + void CodeGenerator::InvokeKernel(const Kernel& kernel, const KernelPredicate& predicate, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const + { + if (predicate.IsAlwaysTrue()) + { + InvokeKernel(kernel, runtimeIndexVariables, schedule); + } + else + { + If(EmitKernelPredicate(predicate, runtimeIndexVariables, schedule) == 1, [&] { + InvokeKernel(kernel, runtimeIndexVariables, schedule); + }); + } + } + + void CodeGenerator::InvokeKernel(const Kernel& kernel, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const + { + const auto& kernelArgs = kernel.GetArgs(); + const auto& kernelIndices = kernel.GetIndices(); + + const auto& renameActions = schedule.GetLoopNest().GetRenameActions(); + + // Create argument list + std::vector kernelArgValues; + kernelArgValues.reserve(kernelArgs.size()); + + auto rename = [&](const Value& arg) { + for (const auto& action : renameActions) + { + const auto& excludedKernels = action.excludedKernels; + if (std::find(excludedKernels.begin(), excludedKernels.end(), kernel.GetId()) == excludedKernels.end() && + std::equal_to{}(arg, action.oldValue) && + AreAllFullyDefined(action.where, schedule)) + { + return action.newValue; + } + } + return arg; + }; + + for (const auto& arg : kernelArgs) + { + kernelArgValues.push_back(rename(arg)); + } + + std::vector kernelIndexValues; + kernelIndexValues.reserve(kernelIndices.size()); + for (auto index : kernelIndices) + { + kernelIndexValues.push_back(runtimeIndexVariables.at(index).value.GetValue()); + auto name = kernelIndexValues.back().GetName(); + kernelIndexValues.back().SetName(index.GetName()); + } + + kernel.Call(kernelArgValues, kernelIndexValues); + } + + bool CodeGenerator::InvokeKernelGroup(const ScheduledKernelGroup& kernelGroup, const 
LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const + { + // preprocess to get only valid kernels + auto validKernels = GetValidKernels(kernelGroup, runtimeIndexVariables, schedule); + + if (validKernels.empty()) + { + return false; + } + + std::optional ifContext; + for (const auto& kernel : validKernels) + { + auto predicate = schedule.GetKernelPredicate(kernel).Simplify(runtimeIndexVariables, schedule); + if (predicate.IsAlwaysFalse()) + { + throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState, "Always-false predicates should have been removed here"); + } + + if (predicate.IsAlwaysTrue()) + { + if (ifContext) + { + // We're already inside an 'if' cascade, so add final 'else' clause + ifContext.value().Else([&] { + InvokeKernel(kernel.kernel, runtimeIndexVariables, schedule); + }); + } + else + { + // If the first kernel's predicate is trivially 'true', just invoke the kernel and exit + InvokeKernel(kernel.kernel, runtimeIndexVariables, schedule); + } + break; + } + else + { + if (!ifContext) + { + auto predicateVal = EmitKernelPredicate(predicate, runtimeIndexVariables, schedule); + ifContext.emplace(If(predicateVal == 1, [&] { + InvokeKernel(kernel.kernel, runtimeIndexVariables, schedule); + })); + } + else + { + auto predicateVal = EmitKernelPredicate(predicate, runtimeIndexVariables, schedule); + ifContext.value().ElseIf(predicateVal == 1, [&] { + InvokeKernel(kernel.kernel, runtimeIndexVariables, schedule); + }); + } + } + } + + return true; + } + } // namespace loopnests +} // namespace value +} // namespace ell diff --git a/libraries/value/src/loopnests/CodePositionConstraints.cpp b/libraries/value/src/loopnests/CodePositionConstraints.cpp new file mode 100644 index 000000000..ba5c6dfdc --- /dev/null +++ b/libraries/value/src/loopnests/CodePositionConstraints.cpp @@ -0,0 +1,104 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: CodePositionConstraints.cpp (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "loopnests/CodePositionConstraints.h" + +#include + +#include + +namespace ell +{ +namespace value +{ + namespace loopnests + { + CodePositionConstraints::CodePositionConstraints(LoopFragmentType placement, std::vector requiredIndices, std::vector boundaryIndices) : + _placement(placement), + _requiredIndices(requiredIndices), + _boundaryIndices(boundaryIndices) + { + } + + std::vector CodePositionConstraints::GetRequiredIndices() const + { + return _requiredIndices; + } + + std::vector CodePositionConstraints::GetBoundaryIndices() const + { + return _boundaryIndices; + } + + std::ostream& operator<<(std::ostream& os, LoopFragmentType t) + { + switch (t) + { + case LoopFragmentType::prologue: + os << "prologue"; + break; + case LoopFragmentType::body: + os << "body"; + break; + case LoopFragmentType::boundary: + os << "boundary"; + break; + case LoopFragmentType::epilogue: + os << "epilogue"; + break; + case LoopFragmentType::LAST: + os << "LAST"; + break; + default: + throw std::runtime_error("Unknown enum value"); + } + return os; + } + + std::ostream& operator<<(std::ostream& os, LoopFragmentFlags f) + { + std::string sep = ""; + os << "["; + for (int i = 0; i < static_cast(LoopFragmentType::LAST); ++i) + { + if (f.GetFlag(static_cast(i))) + { + os << sep << LoopFragmentType(i); + sep = " | "; + } 
+ } + os << "]"; + return os; + } + + bool operator==(const CodePositionConstraints& i1, const CodePositionConstraints& i2) + { + return (i1.GetPlacement() == i2.GetPlacement()) && (i1.GetRequiredIndices() == i2.GetRequiredIndices()) && (i1.GetBoundaryIndices() == i2.GetBoundaryIndices()); + } + + bool operator!=(const CodePositionConstraints& i1, const CodePositionConstraints& i2) + { + return !(i1 == i2); + } + } // namespace loopnests +} // namespace value +} // namespace ell + +using namespace ell::value::loopnests; + +std::hash::result_type std::hash::operator()(const argument_type& constraints) const +{ + using ::ell::utilities::HashCombine; + + size_t hash = 0; + HashCombine(hash, constraints.GetPlacement()); + HashCombine(hash, constraints.GetRequiredIndices()); + HashCombine(hash, constraints.GetBoundaryIndices()); + + return hash; +} diff --git a/libraries/value/src/loopnests/ForAll.cpp b/libraries/value/src/loopnests/ForAll.cpp new file mode 100644 index 000000000..a991cfd00 --- /dev/null +++ b/libraries/value/src/loopnests/ForAll.cpp @@ -0,0 +1,49 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: ForAll.cpp (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "loopnests/ForAll.h" + +namespace ell +{ +namespace value::loopnests +{ + ForAll::ForAll(IterationDomain domain) : + _loops(domain) + { + } + + ForAll& ForAll::AddKernel(const Kernel& kernel) + { + _loops.AddKernel(kernel, LoopNest::ConstraintType::constraint); + return *this; + } + + ForAll& ForAll::AddKernel(const Kernel& kernel, const CodePositionConstraints& where) + { + _loops.AddKernel(kernel, where); + return *this; + } + + ForAll& ForAll::Split(const Index& dimension, int size) + { + _loops.Split(dimension, size); + return *this; + } + + ForAll& ForAll::SetLoopOrder(const std::vector& order) + { + _loops.SetLoopOrder(order); + return *this; + } + + const LoopNest& ForAll::GetNest() const + { + return _loops; + } +} // namespace value::loopnests +} // namespace ell \ No newline at end of file diff --git a/libraries/value/src/loopnests/Index.cpp b/libraries/value/src/loopnests/Index.cpp new file mode 100644 index 000000000..dd230d856 --- /dev/null +++ b/libraries/value/src/loopnests/Index.cpp @@ -0,0 +1,55 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: Index.cpp (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "loopnests/Index.h" + +namespace ell +{ +namespace value +{ + namespace loopnests + { + Index::Index(const std::string& name) : + _name(name), + _id(Index::GetNextId()) + { + } + + const std::string& Index::GetName() const + { + return _name; + } + + Index::Id Index::GetId() const + { + return _id; + } + + // TODO: Change this so that IDs are the responsibility of the EmitterContext + Index::Id Index::GetNextId() + { + static Id _nextIndex = 0; + return _nextIndex++; + } + + std::ostream& operator<<(std::ostream& os, const Index& index) + { + os << index.GetName(); + return os; + } + + } // namespace loopnests +} // namespace value +} // namespace ell + +using namespace ell::value::loopnests; + +std::hash::result_type std::hash::operator()(const argument_type& 
element) const +{ + return static_cast(std::hash()(element.GetId())); +} diff --git a/libraries/value/src/loopnests/IndexRange.cpp b/libraries/value/src/loopnests/IndexRange.cpp new file mode 100644 index 000000000..55cfa7a3b --- /dev/null +++ b/libraries/value/src/loopnests/IndexRange.cpp @@ -0,0 +1,65 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: IndexRange.cpp (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "loopnests/IndexRange.h" + +namespace ell +{ +namespace value +{ + namespace loopnests + { + IndexRange::IndexRange(const Index& index, const Range& range) : + _index(index), + _range(range) + { + } + + IndexRange::IndexRange(const std::string& name, const Range& range) : + _index({ name }), + _range(range) + { + } + + const Index& IndexRange::GetIndex() const + { + return _index; + } + + const std::string& IndexRange::GetName() const + { + return _index.GetName(); + } + + int IndexRange::Begin() const + { + return _range.Begin(); + } + + int IndexRange::End() const + { + return _range.End(); + } + + int IndexRange::Size() const + { + return _range.Size(); + } + + int IndexRange::Increment() const + { + return _range.Increment(); + } + + Range IndexRange::GetRange() const + { + return _range; + } + } // namespace loopnests +} // namespace value +} // namespace ell diff --git a/libraries/value/src/loopnests/IterationDomain.cpp b/libraries/value/src/loopnests/IterationDomain.cpp new file mode 100644 index 000000000..981e21aca --- /dev/null +++ b/libraries/value/src/loopnests/IterationDomain.cpp @@ -0,0 +1,60 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: IterationDomain.cpp (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "loopnests/IterationDomain.h" + +namespace ell +{ +namespace value +{ + namespace loopnests + { + + IterationDomain::IterationDomain(const std::vector& dimensions) : + _dimensions(dimensions) + { + for (int d = 0; d < NumDimensions(); ++d) + { + _indexToDimensionMap[_dimensions[d].GetIndex().GetId()] = d; + } + //Assert(IsUnique(Transform(dimensions, [](auto x) { return x.GetIndex().GetName(); })), "Dimensions must have unique indices"); + } + + IterationDomain::IterationDomain(const std::initializer_list& dimensions) : + IterationDomain(std::vector{ dimensions.begin(), dimensions.end() }) + {} + + int IterationDomain::NumDimensions() const + { + return static_cast(_dimensions.size()); + } + + IndexRange IterationDomain::GetDimensionRange(int dimension) const + { + return _dimensions[dimension]; + } + + IndexRange IterationDomain::GetDimensionRange(const Index& index) const + { + return _dimensions[GetDimensionRangeFromIndex(index)]; + } + + const std::vector& IterationDomain::GetRanges() const + { + return _dimensions; + } + + int IterationDomain::GetDimensionRangeFromIndex(const Index& index) const + { + auto it = _indexToDimensionMap.find(index.GetId()); + return it->second; + } + + } // namespace loopnests +} // namespace value +} // namespace ell diff --git a/libraries/value/src/loopnests/Kernel.cpp b/libraries/value/src/loopnests/Kernel.cpp new file mode 100644 index 000000000..649f27cb2 --- /dev/null +++ 
b/libraries/value/src/loopnests/Kernel.cpp @@ -0,0 +1,148 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: Kernel.cpp (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "loopnests/Kernel.h" + +namespace ell +{ +namespace value +{ + namespace loopnests + { + Kernel::Kernel(std::string name) : + _id(name), + _kernelName(name) {} + + Kernel::Kernel(std::string name, Id id) : + _id(id.empty() ? name : id), + _kernelName(name) {} + + Kernel& Kernel::Inputs(const std::vector& inputs) + { + _inputs = inputs; + return *this; + } + + Kernel& Kernel::Indices(std::vector indices) + { + _indices = indices; + return *this; + } + + const std::string& Kernel::GetName() const + { + return _kernelName; + } + + const Kernel::Id& Kernel::GetId() const + { + return _id; + } + + const std::vector& Kernel::GetArgs() const + { + return _inputs; + } + + const std::vector& Kernel::GetIndices() const + { + return _indices; + } + + void Kernel::Call(std::vector inputs, std::vector indices) const + { + assert(_kernel); + _kernel(inputs, indices); + } + + // TODO : make this a template specialization of Define(), currently lambdas and std::functions aren't + // getting matched correctly + Kernel& Kernel::DefineEx(std::function, std::vector)>&& fn) + { + _kernel = [numOriginalIndices = _indices.size(), + originalInputs = _inputs, + kernelName = UniqueName(_kernelName + "KernelFn"), + fnDefinition = std::move(fn)](std::vector arguments, std::vector indices) { + using namespace utilities; + + if (arguments.size() != originalInputs.size()) + { + throw InputException(InputExceptionErrors::invalidArgument, "Number of arguments does not match number of expected inputs"); + } + if (indices.size() != numOriginalIndices) + { + throw InputException(InputExceptionErrors::invalidArgument, "Number of indices does not match number of expected indices"); + } + + // Flatten the vectors of parameters into a single vector in order to define the emitted function + std::vector fnInputs(arguments.begin(), arguments.end()); + fnInputs.insert(fnInputs.end(), indices.begin(), indices.end()); + + std::vector fnParameters(originalInputs.begin(), originalInputs.end()); + fnParameters.insert(fnParameters.end(), indices.begin(), indices.end()); + for (auto i = 0u; i < originalInputs.size(); ++i) + { + Value& param = fnParameters[i]; + const Value& input = fnInputs[i]; + + if (!input.IsConstrained()) + { + param.ClearLayout(); + } + else + { + param.SetLayout(input.GetLayout()); + } + } + + auto fn = DeclareFunction(kernelName).Parameters(fnParameters); + fn.Inlined(FunctionInlining::always); + if (!fn.IsDefined()) + { + // Add a function layer to coalesce the vectors of parameters inside the function call + fn.Define([originalArgCount = arguments.size(), + originalIndexCount = indices.size(), + innerFn = std::move(fnDefinition)](std::vector args) { + if (args.size() != originalArgCount + originalIndexCount) + { + throw InputException(InputExceptionErrors::invalidArgument, "Number of arguments + indices does not match number of expected inputs"); + } + + std::vector inputs; + inputs.reserve(originalArgCount); + for (unsigned idx = 0; idx < originalArgCount; ++idx) + { + inputs.push_back(args[idx]); + } + + std::vector parameters; + parameters.reserve(originalIndexCount); + for (unsigned idx = originalArgCount; idx < 
(originalArgCount + originalIndexCount); ++idx) + { + parameters.push_back(args[idx]); + } + + innerFn(std::move(inputs), std::move(parameters)); + }); + } + + fn.Call(fnInputs); + }; + + return *this; + } + } // namespace loopnests +} // namespace value +} // namespace ell + +using namespace ell::value::loopnests; + +std::hash::result_type std::hash::operator()(const argument_type& kernel) const +{ + return static_cast(std::hash()(kernel.GetId())); +} diff --git a/libraries/value/src/loopnests/KernelPredicate.cpp b/libraries/value/src/loopnests/KernelPredicate.cpp new file mode 100644 index 000000000..ba3ac4db2 --- /dev/null +++ b/libraries/value/src/loopnests/KernelPredicate.cpp @@ -0,0 +1,703 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: KernelPredicate.cpp (value) +// Authors: Chuck Jacobs +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "loopnests/KernelPredicate.h" +#include "loopnests/LoopNest.h" +#include "loopnests/LoopNestVisitor.h" + +#include + +namespace ell +{ +namespace value +{ + namespace loopnests + { + namespace + { + // computes ceil(a/b) + int CeilDiv(int a, int b) + { + return (a - 1) / b + 1; + } + + bool Intersects(const Range& a, const Range& b) + { + int aIter = CeilDiv(a.End() - a.Begin(), a.Increment()); + int bIter = CeilDiv(b.End() - b.Begin(), b.Increment()); + + if (aIter == 0 || bIter == 0) + { + return false; + } + auto aLast = a.Begin() + (aIter - 1) * a.Increment(); + auto bLast = b.Begin() + (bIter - 1) * b.Increment(); + + return aLast >= b.Begin() && a.Begin() <= bLast; + } + } // namespace + + // + // ConstantPredicate + // + ConstantPredicate::ConstantPredicate(bool value) : + _value(value) + { + } + + bool ConstantPredicate::GetValue() const + { + return _value; + } + + // + // FragmentTypePredicate + // + FragmentTypePredicate::FragmentTypePredicate(const Index& index, Fragment condition) : + _index(index), + _condition(condition) + { + } + + // bool FragmentTypePredicate::IsSatisfied(const std::vector& indices) const; // Perhaps pass in current loop state? + const Index& FragmentTypePredicate::GetIndex() const + { + return _index; + } + + Fragment FragmentTypePredicate::GetCondition() const + { + return _condition; + } + + KernelPredicate FragmentTypePredicate::Simplify() const + { + if (_condition == Fragment::all) + { + return ConstantPredicate(true); + } + return *this; + } + + KernelPredicate FragmentTypePredicate::Simplify(const LoopIndexSymbolTable& indices, const LoopVisitSchedule& schedule) const + { + if (_condition == Fragment::all) + { + return ConstantPredicate(true); + } + + const auto index = GetIndex(); + const auto condition = GetCondition(); + + // Get all index variables dependent on the predicate index + const auto& domain = schedule.GetLoopNest().GetDomain(); + auto loopIndices = domain.GetDependentLoopIndices(index, true); + + // Evaluate a little equality "sub-predicate" for each dependent variable. All of them must be true for the result to be true. 
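+ // e.g. if the predicate's index has been split, First(index) holds only when every derived loop index (outer and inner) sits at the very start of its range.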
+ for (auto loopIndex : loopIndices) + { + // TODO: move `GetLoopRange` somewhere else + auto fullRange = LoopNestVisitor::GetLoopRange(loopIndex, indices, schedule); + + int testVal = 0; + bool valid = true; + switch (condition) + { + case Fragment::first: + testVal = fullRange.Begin(); + break; + case Fragment::last: + testVal = fullRange.End() - (fullRange.Size() % fullRange.Increment()); + if (testVal == fullRange.End()) // not a boundary + { + testVal = fullRange.End() - fullRange.Increment(); + } + break; + case Fragment::endBoundary: + testVal = fullRange.End() - (fullRange.Size() % fullRange.Increment()); + if (testVal == fullRange.End()) // not a boundary + { + valid = false; + } + break; + default: + valid = false; + // throw? + break; + } + + if (valid) + { + // Look up the range of the active loop + auto activeRange = fullRange; + if (const auto it = indices.find(loopIndex); it != indices.end()) + { + if (it->second.state == LoopIndexState::inProgress) + { + activeRange = it->second.loopRange; + } + } + + // Now check if testVal intersects with the loop's range + if (activeRange.Increment() == 0) // bad range + { + return *this; + } + int numIterations = CeilDiv(activeRange.End() - activeRange.Begin(), activeRange.Increment()); + if (numIterations == 0) + { + return *this; + } + + if (Intersects(activeRange, { testVal, testVal + 1 })) + { + if (numIterations == 1) + { + // true -- don't add anything to AND list + } + else + { + return *this; + // TODO: add index, testVal to AND list, later return a conjunction of equality predicates + } + } + else + { + return ConstantPredicate(false); + } + } + } + return ConstantPredicate(true); + } + + // + // PlacementPredicate + // + PlacementPredicate::PlacementPredicate(Placement placement) : + _index(std::nullopt), + _placement(placement) + { + } + + PlacementPredicate::PlacementPredicate(const Index& index, Placement placement) : + _index(index), + _placement(placement) + { + } + + // bool PlacementPredicate::IsSatisfied(const std::vector& indices) const; // Perhaps pass in current loop state?
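+ // Note: the index is optional (see the placement-only constructor above); callers should check HasIndex() before GetIndex(), which unwraps the optional.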
+ bool PlacementPredicate::HasIndex() const + { + return _index.has_value(); + } + + Index PlacementPredicate::GetIndex() const + { + return _index.value(); + } + + Placement PlacementPredicate::GetPlacement() const + { + return _placement; + } + + const PlacementPredicate& PlacementPredicate::Simplify() const + { + return *this; + } + + const PlacementPredicate& PlacementPredicate::Simplify(const LoopIndexSymbolTable& indices, const LoopVisitSchedule& schedule) const + { + return *this; + } + + // + // IndexDefinedPredicate + // + IndexDefinedPredicate::IndexDefinedPredicate(const Index& index) : + _index(index) + { + } + + const Index& IndexDefinedPredicate::GetIndex() const + { + return _index; + } + + const IndexDefinedPredicate& IndexDefinedPredicate::Simplify() const + { + return *this; + } + + const IndexDefinedPredicate& IndexDefinedPredicate::Simplify(const LoopIndexSymbolTable& indices, const LoopVisitSchedule& schedule) const + { + return *this; + } + + // + // KernelPredicateConjunction + // + KernelPredicateConjunction::KernelPredicateConjunction(const KernelPredicate& lhs, const KernelPredicate& rhs) + { + _terms.push_back(std::make_unique(lhs)); + _terms.push_back(std::make_unique(rhs)); + } + + KernelPredicateConjunction::KernelPredicateConjunction(const KernelPredicateConjunction& other) : + KernelPredicateConjunction(other._terms) + { + } + + KernelPredicateConjunction::KernelPredicateConjunction(const std::vector>& terms) + { + for (const auto& t : terms) + { + _terms.emplace_back(std::make_unique(*t)); + } + } + + KernelPredicateConjunction& KernelPredicateConjunction::operator=(const KernelPredicateConjunction& other) + { + if (this != &other) + { + _terms.clear(); // replace, rather than append to, any existing terms + for (const auto& t : other._terms) + { + _terms.emplace_back(std::make_unique(*t)); + } + } + return *this; + } + + const std::vector>& KernelPredicateConjunction::GetTerms() const + { + return _terms; + } + + KernelPredicate KernelPredicateConjunction::Simplify() const + { + std::vector> terms; + for (const auto& t : GetTerms()) + { + auto simplifiedTerm = t->Simplify(); + if (simplifiedTerm.IsAlwaysFalse()) + { + return ConstantPredicate(false); + } + if (!simplifiedTerm.IsAlwaysTrue()) + { + terms.emplace_back(std::make_unique(simplifiedTerm)); + } + } + + if (terms.empty()) + { + return KernelPredicate{}; + } + else if (terms.size() == 1) + { + return *terms[0]; + } + else + { + return KernelPredicateConjunction(terms); + } + } + + KernelPredicate KernelPredicateConjunction::Simplify(const LoopIndexSymbolTable& indices, const LoopVisitSchedule& schedule) const + { + if (GetTerms().empty()) + { + return EmptyPredicate(); + } + + std::vector> terms; + for (const auto& t : GetTerms()) + { + auto simplifiedTerm = t->Simplify(indices, schedule); + if (simplifiedTerm.IsAlwaysFalse()) + { + return ConstantPredicate(false); + } + else if (!simplifiedTerm.IsAlwaysTrue()) + { + terms.emplace_back(std::make_unique(simplifiedTerm)); + } + // If always true, do nothing + } + + if (terms.empty()) + { + return ConstantPredicate(true); + } + else if (terms.size() == 1) + { + return *terms[0]; + } + else + { + return KernelPredicateConjunction(terms); + } + } + + // + // KernelPredicateDisjunction + // + KernelPredicateDisjunction::KernelPredicateDisjunction(const KernelPredicate& lhs, const KernelPredicate& rhs) + { + _terms.push_back(std::make_unique(lhs)); + _terms.push_back(std::make_unique(rhs)); + } + + KernelPredicateDisjunction::KernelPredicateDisjunction(const KernelPredicateDisjunction& other) : + KernelPredicateDisjunction(other._terms) + { + } + + 
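// Like the conjunction above, the disjunction deep-copies each term into its own unique_ptr. + 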
KernelPredicateDisjunction::KernelPredicateDisjunction(const std::vector>& terms) + { + for (const auto& t : terms) + { + _terms.emplace_back(std::make_unique(*t)); + } + } + + KernelPredicateDisjunction& KernelPredicateDisjunction::operator=(const KernelPredicateDisjunction& other) + { + if (this != &other) + { + _terms.clear(); // replace, rather than append to, any existing terms + for (const auto& t : other._terms) + { + _terms.emplace_back(std::make_unique(*t)); + } + } + return *this; + } + + const std::vector>& KernelPredicateDisjunction::GetTerms() const + { + return _terms; + } + + KernelPredicate KernelPredicateDisjunction::Simplify() const + { + if (GetTerms().empty()) + { + return EmptyPredicate(); + } + + std::vector> terms; + for (const auto& t : GetTerms()) + { + auto simplifiedTerm = t->Simplify(); + if (simplifiedTerm.IsAlwaysTrue()) + { + return { ConstantPredicate(true) }; + } + else if (!simplifiedTerm.IsAlwaysFalse()) + { + terms.emplace_back(std::make_unique(simplifiedTerm)); + } + // If always false, do nothing + } + if (terms.empty()) + { + return ConstantPredicate(false); + } + else if (terms.size() == 1) + { + return *terms[0]; + } + else + { + return KernelPredicateDisjunction(terms); + } + } + + KernelPredicate KernelPredicateDisjunction::Simplify(const LoopIndexSymbolTable& indices, const LoopVisitSchedule& schedule) const + { + std::vector> terms; + for (const auto& t : GetTerms()) + { + auto simplifiedTerm = t->Simplify(indices, schedule); + if (simplifiedTerm.IsAlwaysTrue()) + { + return { ConstantPredicate(true) }; + } + else if (!simplifiedTerm.IsAlwaysFalse()) + { + terms.emplace_back(std::make_unique(simplifiedTerm)); + } + } + if (terms.empty()) + { + return ConstantPredicate(false); // every term simplified to always-false, so the whole disjunction is false + } + else if (terms.size() == 1) + { + return *terms[0]; + } + else + { + return KernelPredicateDisjunction(terms); + } + } + + // + // KernelPredicate + // + KernelPredicate::KernelPredicate(const EmptyPredicate& predicate) : + _expr(predicate) {} + + KernelPredicate::KernelPredicate(const ConstantPredicate& predicate) : + _expr(predicate) {} + + KernelPredicate::KernelPredicate(const FragmentTypePredicate& predicate) : + _expr(predicate) {} + + KernelPredicate::KernelPredicate(const PlacementPredicate& predicate) : + _expr(predicate) {} + + KernelPredicate::KernelPredicate(const IndexDefinedPredicate& predicate) : + _expr(predicate) {} + + KernelPredicate::KernelPredicate(const KernelPredicateConjunction& predicate) : + _expr(predicate) {} + + KernelPredicate::KernelPredicate(const KernelPredicateDisjunction& predicate) : + _expr(predicate) {} + + KernelPredicate KernelPredicate::Simplify() const + { + return std::visit( + [](auto&& pred) -> KernelPredicate { + return pred.Simplify(); + }, + _expr); + } + + KernelPredicate KernelPredicate::Simplify(const LoopIndexSymbolTable& indices, const LoopVisitSchedule& schedule) const + { + return std::visit( + [&indices, &schedule](auto&& pred) -> KernelPredicate { + return pred.Simplify(indices, schedule); + }, + _expr); + } + + bool KernelPredicate::IsAlwaysTrue() const + { + if (IsEmpty()) + { + return true; + } + auto simplePredicate = Simplify(); + if (auto constPred = simplePredicate.As(); constPred != nullptr) + { + return constPred->GetValue() == true; + } + return false; + } + + bool KernelPredicate::IsAlwaysFalse() const + { + if (IsEmpty()) + { + return false; + } + auto simplePredicate = Simplify(); + if (auto constPred = simplePredicate.As(); constPred != nullptr) + { + return constPred->GetValue() == false; + } + return false; + } + + bool KernelPredicate::IsEmpty() const + { + return std::holds_alternative(_expr); + } + + 
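// Note: an empty predicate behaves as always-true (see IsAlwaysTrue above) while remaining distinguishable from an explicit ConstantPredicate. + 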
+        //
+        // free functions
+        //
+        KernelPredicate First(const Index& index)
+        {
+            return FragmentTypePredicate{ index, Fragment::first };
+        }
+
+        KernelPredicate Last(const Index& index)
+        {
+            return FragmentTypePredicate{ index, Fragment::last };
+        }
+
+        KernelPredicate EndBoundary(const Index& index)
+        {
+            return FragmentTypePredicate{ index, Fragment::endBoundary };
+        }
+
+        KernelPredicate All(const Index& index)
+        {
+            return FragmentTypePredicate{ index, Fragment::all };
+        }
+
+        KernelPredicate Before(const Index& index)
+        {
+            return PlacementPredicate{ index, Placement::before };
+        }
+
+        KernelPredicate After(const Index& index)
+        {
+            return PlacementPredicate{ index, Placement::after };
+        }
+
+        KernelPredicate IsDefined(const Index& index)
+        {
+            return IndexDefinedPredicate{ index };
+        }
+
+        KernelPredicate operator&&(const KernelPredicate& lhs, const KernelPredicate& rhs)
+        {
+            return KernelPredicateConjunction{ lhs, rhs };
+        }
+
+        KernelPredicate operator||(const KernelPredicate& lhs, const KernelPredicate& rhs)
+        {
+            return KernelPredicateDisjunction{ lhs, rhs };
+        }
+
+        std::string ToString(Fragment cond)
+        {
+            switch (cond)
+            {
+            case Fragment::all:
+                return "all";
+            case Fragment::first:
+                return "first";
+            case Fragment::last:
+                return "last";
+            case Fragment::endBoundary:
+                return "endBoundary";
+            default:
+                return "<>";
+            }
+        }
+
+        std::string ToString(Placement where)
+        {
+            switch (where)
+            {
+            case Placement::before:
+                return "before";
+            case Placement::after:
+                return "after";
+            default:
+                return "<>";
+            }
+        }
+
+        std::ostream& operator<<(std::ostream& os, const EmptyPredicate& predicate)
+        {
+            os << "{}";
+            return os;
+        }
+
+        std::ostream& operator<<(std::ostream& os, const ConstantPredicate& predicate)
+        {
+            os << (predicate.GetValue() ? "true" : "false");
+            return os;
+        }
+
+        std::ostream& operator<<(std::ostream& os, const FragmentTypePredicate& predicate)
+        {
+            os << ToString(predicate.GetCondition()) << "(" << predicate.GetIndex() << ")";
+            return os;
+        }
+
+        std::ostream& operator<<(std::ostream& os, const PlacementPredicate& predicate)
+        {
+            if (predicate.HasIndex())
+            {
+                os << ToString(predicate.GetPlacement()) << "(" << predicate.GetIndex() << ")";
+            }
+            else
+            {
+                os << ToString(predicate.GetPlacement()) << "()";
+            }
+            return os;
+        }
+
+        std::ostream& operator<<(std::ostream& os, const IndexDefinedPredicate& predicate)
+        {
+            os << "IsDefined(" << predicate.GetIndex() << ")";
+            return os;
+        }
+
+        std::ostream& operator<<(std::ostream& os, const KernelPredicateConjunction& predicate)
+        {
+            const auto& terms = predicate.GetTerms();
+            if (terms.size() == 0)
+            {
+                os << "true";
+            }
+            else if (terms.size() == 1)
+            {
+                os << *terms[0];
+            }
+            else
+            {
+                os << "(";
+                bool first = true;
+                for (const auto& t : terms)
+                {
+                    // emit the separator before every term but the first
+                    if (!first)
+                    {
+                        os << " && ";
+                    }
+                    first = false;
+                    os << *t;
+                }
+                os << ")";
+            }
+            return os;
+        }
+
+        std::ostream& operator<<(std::ostream& os, const KernelPredicateDisjunction& predicate)
+        {
+            const auto& terms = predicate.GetTerms();
+            if (terms.size() == 0)
+            {
+                os << "true";
+            }
+            else if (terms.size() == 1)
+            {
+                os << *terms[0];
+            }
+            else
+            {
+                os << "(";
+                bool first = true;
+                for (const auto& t : terms)
+                {
+                    // emit the separator before every term but the first
+                    if (!first)
+                    {
+                        os << " || ";
+                    }
+                    first = false;
+                    os << *t;
+                }
+                os << ")";
+            }
+            return os;
+        }
+
+        std::ostream& operator<<(std::ostream& os, const KernelPredicate& predicate)
+        {
+            std::visit(
+                [&os](auto&& pred) {
+                    os << pred;
+                },
+                predicate._expr);
+            return os;
+        }
+
+    } // namespace loopnests
+} // namespace value
+} // namespace ell
diff --git a/libraries/value/src/loopnests/LoopNest.cpp b/libraries/value/src/loopnests/LoopNest.cpp
new file mode 100644
index 000000000..67a412df0
--- /dev/null
+++ b/libraries/value/src/loopnests/LoopNest.cpp
@@ -0,0 +1,900 @@
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Project:  Embedded Learning Library (ELL)
+//  File:     LoopNest.cpp (value)
+//  Authors:  Chuck Jacobs, Kern Handa
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "loopnests/LoopNest.h"
+#include "loopnests/LoopNestPrinter.h"
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace ell
+{
+namespace value
+{
+    namespace loopnests
+    {
+        using logging::EOL;
+        using logging::Log;
+
+        //
+        // LoopVisitSchedule
+        //
+
+        LoopVisitSchedule::LoopVisitSchedule(const LoopNest& nest, LoopVisitSchedule::StateQueue state) :
+            LoopVisitSchedule(nest, 0, state)
+        {}
+
+        LoopVisitSchedule::LoopVisitSchedule(const LoopNest& nest, int level, StateQueue state) :
+            _level(level),
+            _state(std::move(state)),
+            _nest(nest)
+        {}
+
+        LoopVisitSchedule::LoopVisitSchedule(const LoopVisitSchedule& other) :
+            _level(other._level),
+            _state(other._state),
+            _nest(other._nest)
+        {}
+
+        LoopVisitSchedule& LoopVisitSchedule::operator=(const LoopVisitSchedule& other)
+        {
+            _level = other._level;
+            _state = other._state;
+            _nest = other._nest;
+            return *this;
+        }
+
+        const LoopVisitSchedule::LoopInfo& LoopVisitSchedule::Front() const
+        {
+            return _state[_level];
+        }
+
+        const SplitIterationDomain& LoopVisitSchedule::GetDomain() const
+        {
+            return GetLoopNest().GetDomain();
+        }
+
+        int LoopVisitSchedule::CurrentNestLevel() const
{ + return _level; + }; + + bool LoopVisitSchedule::IsDone() const + { + return _level == static_cast(_state.size()); + } + + bool LoopVisitSchedule::IsInnermostLoop() const + { + return _level == static_cast(_state.size()) - 1; + } + + Index LoopVisitSchedule::CurrentDimension() const + { + return Front().dimension; + } + + Range LoopVisitSchedule::LoopRange() const + { + // ### debugging + assert(Front().indexRange.GetRange() == GetLoopNest().GetDomain().GetIndexRange(CurrentLoopIndex())); + return Front().indexRange.GetRange(); + } + + int LoopVisitSchedule::LoopSize() const + { + return LoopRange().Size(); + } + + int LoopVisitSchedule::DimensionSize() const + { + return GetDomain().GetDimensionSize(Front().dimension); + } + + int LoopVisitSchedule::NonBoundaryEnd() const + { + auto numFullLoopIterations = LoopSize() / LoopIncrement(); + auto nonBoundaryLoopSize = LoopIncrement() * numFullLoopIterations; + return nonBoundaryLoopSize + LoopRange().Begin(); + } + + int LoopVisitSchedule::LoopIncrement() const + { + return LoopRange().Increment(); + } + + int LoopVisitSchedule::LoopIndexScale() const + { + return Front().scale; + } + + bool LoopVisitSchedule::CurrentLoopHasFragment(std::vector activeKernels, LoopFragmentType fragmentType) const + { + auto currentIndex = CurrentLoopIndex(); + for (const auto& kernel : GetLoopNest().GetKernels()) + { + const auto& where = kernel.constraints; + if (where.GetPlacement() == fragmentType) + { + // Boundary constraints: return `true` if this loop causes all the boundary indices to be defined + // (which is to say, they're all fully-defined here but not in previous loop) + const auto& outsideIndices = where.GetBoundaryIndices(); + if (outsideIndices.size() != 0) + { + bool allFullyDefined = std::all_of(outsideIndices.begin(), outsideIndices.end(), [&](auto index) { + return IsFullyDefined(index); + }); + bool definedByThisLoop = std::any_of(outsideIndices.begin(), outsideIndices.end(), [&](auto index) { + return IsFullyDefinedByThisLoop(index); + }); + if (allFullyDefined && definedByThisLoop) + { + return true; + } + } + } + } + return false; + } + + bool LoopVisitSchedule::FragmentCanRunAlone(std::vector activeKernels, LoopFragmentType fragmentType) const + { + return true; + } + + bool LoopVisitSchedule::FutureLoopHasFragmentForThisIndex(std::vector activeKernels, LoopFragmentType fragmentType) const + { + auto currentIndex = CurrentLoopIndex(); + for (const auto& kernel : GetLoopNest().GetKernels()) + { + const auto& where = kernel.constraints; + if (where.GetPlacement() == fragmentType) + { + // Boundary constraints: return `true` if this loop causes all the boundary indices to be defined + // (which is to say, they're all fully-defined here but not in previous loop) + const auto& outsideIndices = where.GetBoundaryIndices(); + bool allFullyDefined = std::all_of(outsideIndices.begin(), outsideIndices.end(), [&](auto index) { + return IsFullyDefined(index); + }); + bool thisIndexWasUsed = std::any_of(outsideIndices.begin(), outsideIndices.end(), [&](auto index) { + auto domain = GetLoopNest().GetDomain(); + if (index == currentIndex || domain.DependsOn(index, currentIndex)) + { + return true; + } + return false; + }); + + if (!allFullyDefined && thisIndexWasUsed) + { + return true; + } + } + } + return false; + } + + int LoopVisitSchedule::CurrentIndexEndBoundarySize() const + { + return Front().boundarySize; + } + + Index LoopVisitSchedule::CurrentLoopIndex() const + { + return Front().indexRange.GetIndex(); + } + + LoopVisitSchedule 
LoopVisitSchedule::Next() const + { + if (IsDone()) + { + throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState, "Error: calling Next() at end of schedule"); + } + return { _nest.get(), _level + 1, _state }; + } + + LoopVisitSchedule LoopVisitSchedule::Prev() const + { + if (_level == 0) + { + throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState, "Error: calling Prev() on first loop level"); + } + return { _nest.get(), _level - 1, _state }; + } + + bool LoopVisitSchedule::WillVisitIndex(const Index& index) const + { + auto dependentIndices = GetDomain().GetDependentIndices(index); + + // Check if a future loop has a loop index that the query index depends on + for (auto it = _state.begin() + CurrentNestLevel(); it != _state.end(); ++it) + { + auto i = it->indexRange.GetIndex(); + if (std::find(dependentIndices.begin(), dependentIndices.end(), i) != dependentIndices.end()) + { + return true; + } + } + return false; + } + + bool LoopVisitSchedule::IsFullyDefined(const Index& index) const + { + if (index == CurrentLoopIndex()) + { + return true; + } + + for (const auto& i : GetDomain().GetDependentIndices(index)) + { + if (GetDomain().IsLoopIndex(i)) + { + if (!WasIterationVariableDefined(i)) + { + return false; + } + } + } + return true; + } + + bool LoopVisitSchedule::IsFullyDefinedByThisLoop(const Index& index) const + { + // return true if: + // 1) the given index is this loop's index variable + // 1) the given index is synthetic and one of its terms is this loop's index variable, and the rest + // of the terms have already been defined + if (index == CurrentLoopIndex()) + { + return true; + } + + // look to see if this index has been defined + if (IsFullyDefined(index)) + { + if (CurrentNestLevel() == 0) + { + return true; + } + return !Prev().IsFullyDefined(index); + } + return false; + } + + bool LoopVisitSchedule::WasIterationVariableDefined(const Index& index) const + { + for (auto it = _state.begin(); it != _state.begin() + CurrentNestLevel() + 1; ++it) + { + auto iterVar = it->indexRange.GetIndex(); + if (iterVar == index) + { + return true; + } + } + return false; + } + + KernelPredicate LoopVisitSchedule::GetKernelPredicate(const ScheduledKernel& kernel) const + { + // Convert constraints to predicate + + const auto& domain = GetDomain(); + + // Get list of conditions in existing predicate + std::set predicateConditions; + std::set predicateIndices; + std::set constrainedIndices; + + kernel.predicate.Visit([&](const auto& p) { + if (auto fragmentPred = p.template As(); fragmentPred != nullptr) + { + auto predicateIndex = fragmentPred->GetIndex(); + auto fragment = fragmentPred->GetCondition(); + + // Convert computed indices to loop indices + for (auto loopIndex : domain.GetDependentLoopIndices(predicateIndex, true)) + { + predicateConditions.insert(FragmentTypePredicate(loopIndex, fragment)); + predicateIndices.insert(loopIndex); + } + } + else if (p.template Is()) + { + throw utilities::LogicException(utilities::LogicExceptionErrors::notImplemented, "IsDefined predicate not implemented"); + } + }); + + // BEGIN CONVERT CONSTRAINTS + // Convert CodePositionConstraints + const bool convertConstraints = !kernel.newVersion; + if (convertConstraints) + { + for (auto constraintIndex : kernel.constraints.GetRequiredIndices()) + { + // Convert computed indices to loop indices + for (auto loopIndex : domain.GetDependentLoopIndices(constraintIndex, true)) + { + if (predicateIndices.count(loopIndex) != 0) + { + throw 
utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "Constraint applied to index " + loopIndex.GetName() + ", which already has a predicate"); + } + + predicateConditions.insert({ loopIndex, Fragment::all }); + predicateIndices.insert(loopIndex); + constrainedIndices.insert(loopIndex); + } + } + + // Convert the kernel's predicate + constraints into equivalent predicate + // All "body" constraints turn into "all" conditions + // All "prologue" constraints turn into "first" conditions + // All "epilogue" constraints turn into "last" conditions + + // Issue: empty "boundary indices" means everything not mentioned + + const auto placement = kernel.constraints.GetPlacement(); + auto constraintCondition = (placement == LoopFragmentType::prologue) || (placement == LoopFragmentType::body) ? Fragment::first : Fragment::last; + auto boundaryIndices = kernel.constraints.GetBoundaryIndices(); + if (boundaryIndices.empty()) + { + // add all unmentioned loop indices --- all indices not dependent on any of the already-"all"'d indices + for (auto loopIndex : GetLoopNest().GetLoopSequence()) + { + if (constrainedIndices.count(loopIndex) == 0 && predicateIndices.count(loopIndex) == 0) + { + boundaryIndices.push_back(loopIndex); + } + } + } + for (auto boundaryIndex : boundaryIndices) + { + // Convert any boundary indices into concrete loop indices + for (auto loopIndex : domain.GetDependentLoopIndices(boundaryIndex, true)) + { + constrainedIndices.insert(loopIndex); + predicateIndices.insert(loopIndex); + predicateConditions.insert({ loopIndex, constraintCondition }); + } + } + + // All unmentioned loop indices become "first" conditions + for (auto loopIndex : GetLoopNest().GetLoopSequence()) + { + if (constrainedIndices.count(loopIndex) == 0 && predicateIndices.count(loopIndex) == 0) + { + predicateConditions.insert({ loopIndex, Fragment::first }); + } + } + } + // END CONVERT CONSTRAINTS + + // new predicate == conjunction of all conditions in conditions set + if (predicateConditions.size() == 0) + { + return {}; // ? 
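+                // (Editorial note: an empty condition set means no constraint
+                // survived conversion, so the kernel runs unconditionally and
+                // the empty predicate is returned. A worked example of the
+                // rules above: a kernel with placement `prologue`, required
+                // index `i`, and no explicit boundary indices, scheduled in a
+                // nest over (i, j), converts to `all(i) && first(j)`: required
+                // indices become "all" conditions and every unmentioned loop
+                // index picks up the placement's "first" condition.)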
+ } + + auto begin = predicateConditions.begin(); + auto first = KernelPredicate(*begin); + ++begin; + auto fullPredicate = std::accumulate(begin, predicateConditions.end(), first, [](auto lhs, auto rhs) -> KernelPredicate { return { KernelPredicateConjunction(lhs, rhs) }; }); + auto result = fullPredicate.Simplify(); + return result; + } + + // + // LoopNest + // + LoopNest::LoopNest(IterationDomain domain) : + _domain(domain) + { + InitLoopSequence(); + } + + void LoopNest::InitLoopSequence() + { + // For each dimension, get a queue of loop indices + int numDimensions = _domain.NumDimensions(); + std::vector> dimensionIndices(numDimensions); + for (int d = 0; d < numDimensions; ++d) + { + const auto indices = _domain.GetLoopIndicesForDimension(_domain.GetBaseIndex(d)); + dimensionIndices[d] = std::queue({ indices.begin(), indices.end() }); + } + + for (;;) + { + bool done = true; + + // for each index + for (int d = 0; d < numDimensions; ++d) + { + if (!dimensionIndices[d].empty()) + { + _loopSequence.push_back(dimensionIndices[d].front()); + dimensionIndices[d].pop(); + done = false; + } + } + if (done) break; + } + } + + void LoopNest::ConvertKernelConstraints() + { + for (auto& k : _kernels) + { + ConvertKernelConstraints(k); + } + } + + void LoopNest::ConvertKernelConstraints(ScheduledKernel& kernel) + { + // TODO: convert first/last into inequality check (<=, >=), so they can work with boundaries + } + + void LoopNest::AddKernel(const Kernel& kernel, ConstraintType type) + { + if (type == ConstraintType::constraint) + { + AddKernel(kernel, LoopFragmentType::body); + } + else + { + CodePositionConstraints constraints{ LoopFragmentType::body, {}, {} }; // null constraints + _kernels.push_back({ true, kernel, constraints, {}, {} }); + } + } + + void LoopNest::AddKernel(const Kernel& kernel, LoopFragmentType where) + { + CodePositionConstraints constraints{ where, kernel.GetIndices(), {} }; + AddKernel(kernel, constraints); + } + + void LoopNest::AddKernel(const Kernel& kernel, const CodePositionConstraints& where) + { + // old version + _kernels.push_back({ false, kernel, where, {}, {} }); + } + + void LoopNest::AddKernel(const Kernel& kernel, const KernelPredicate& predicate) + { + AddKernel(kernel, predicate, {}); + } + + void LoopNest::AddKernel(const Kernel& kernel, const KernelPredicate& predicate, const KernelPredicate& placement) + { + // new version + CodePositionConstraints constraints{ LoopFragmentType::body, {}, {} }; // null constraints + _kernels.push_back({ true, kernel, constraints, predicate, placement }); + } + + void LoopNest::AddKernel(const Kernel& kernel, const CodePositionConstraints& where, const KernelPredicate& predicate, const KernelPredicate& placement) + { + // new version + _kernels.push_back({ true, kernel, where, predicate, {} }); + } + + const std::vector& LoopNest::GetKernels() const + { + return _kernels; + } + + std::vector LoopNest::GetKernelGroups() const + { + std::vector result; + for (const auto& kernel : _kernels) + { + auto it = std::find_if(result.begin(), result.end(), [&](const ScheduledKernelGroup& g) { + return g.id == kernel.kernel.GetId(); + }); + if (it == result.end()) + { + result.push_back({ kernel.kernel.GetId(), { kernel } }); + } + else + { + it->kernels.push_back(kernel); + } + } + return result; + } + + int LoopNest::NumDimensions() const + { + return static_cast(_domain.NumDimensions()); + } + + Range LoopNest::GetIndexRange(Index index) const + { + return _domain.GetIndexRange(index); + } + + std::vector 
LoopNest::GetLoopIndexRanges() const + { + std::vector result; + for (int d = 0; d < NumDimensions(); ++d) + { + const auto& dimRange = GetDimensionRange(d); + for (const auto& index : dimRange.GetLoopIndices()) + { + result.emplace_back(index, dimRange.GetIndexRange(index)); + } + } + return result; + } + + const SplitIndexRange& LoopNest::GetDimensionRange(int dimension) const + { + return _domain.GetDimensionRange(dimension); + } + + const SplitIndexRange& LoopNest::GetDimensionRange(const Index& dimension) const + { + return _domain.GetDimensionRange(dimension); + } + + int LoopNest::NumSplits(const Index& dimension) const + { + return GetDimensionRange(dimension).NumSplits(); + } + + const std::vector& LoopNest::GetLoopSequence() const + { + return _loopSequence; + } + + LoopVisitSchedule LoopNest::GetLoopSchedule() const + { + LoopVisitSchedule::StateQueue queue; + + std::map> availableLoopIndices; + int numDimensions = _domain.NumDimensions(); + for (int i = 0; i < numDimensions; ++i) + { + auto dimension = _domain.GetBaseIndex(i); + auto loopIndices = _domain.GetLoopIndicesForDimension(dimension); + availableLoopIndices[dimension] = { loopIndices.begin(), loopIndices.end() }; + } + + for (auto loopIndex : GetLoopSequence()) + { + auto range = _domain.GetIndexRange(loopIndex); + auto dimensionSize = _domain.GetDimensionSize(loopIndex); + int splitSize = range.Increment(); // need to keep track of the split size here, I think + int boundarySize = dimensionSize % splitSize; + + auto loopIndexScale = GetLoopIndexScale(loopIndex); + auto dimensionIndex = _domain.GetBaseIndex(loopIndex); + queue.push_back(LoopVisitSchedule::LoopInfo{ dimensionIndex, IndexRange{ loopIndex, range }, boundarySize, loopIndexScale }); + } + + return { *this, queue }; + } + + SplitIndex LoopNest::Split(Index index, int size) + { + auto result = _domain.Split(index, size); + + // Need to recompute loopSequence here (by replacing the index that got split with result.outer) + auto parent = _domain.GetParentIndex(result.outer); // this is the specific index that was split + auto it = std::find(_loopSequence.begin(), _loopSequence.end(), parent); + if (it == _loopSequence.end()) + { + throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState); + } + else + { + *it = result.outer; + } + + _loopSequence.push_back(result.inner); + return result; + } + + void LoopNest::Parallelize(Index index) + { + _parallelizedIndices.push_back(index); + } + + // TODO: Move this out to the API surface + SplitIndex LoopNest::Parallelize(Index index, int factor) + { + auto result = Split(index, factor); + Parallelize(result.outer); + return result; + } + + void LoopNest::Unroll(Index index) + { + _unrolledIndices.push_back(index); + } + + // TODO: Move this out to the API surface + SplitIndex LoopNest::Unroll(Index index, int factor) + { + auto result = Split(index, factor); + Unroll(result.outer); + return result; + } + + void LoopNest::SetLoopOrder(const std::vector& order) + { + if (order.size() != _loopSequence.size()) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "SetLoopOrder() --- new order wrong length"); + } + + std::map> availableLoopIndices; + int numDimensions = _domain.NumDimensions(); + for (int i = 0; i < numDimensions; ++i) + { + auto dimension = _domain.GetBaseIndex(i); + auto loopIndices = _domain.GetLoopIndicesForDimension(dimension); + availableLoopIndices[dimension] = { loopIndices.begin(), loopIndices.end() }; + } + + // Function to get the next 
available concrete loop index that's a child index of a given index. + // Throws if there isn't one available. + auto getNextAvailable = [this, &availableLoopIndices](const Index& specifiedIndex) { + auto dimensionIndex = _domain.GetBaseIndex(specifiedIndex); + auto possibleIndices = _domain.GetDependentLoopIndices(specifiedIndex); + if (_domain.IsLoopIndex(specifiedIndex)) + { + possibleIndices.push_back(specifiedIndex); + } + for (auto i : possibleIndices) + { + if (availableLoopIndices[dimensionIndex].count(i) != 0) + { + availableLoopIndices[dimensionIndex].erase(i); + return i; + } + } + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "SetLoopOrder() --- new order wrong length"); + }; + + std::vector newLoopSequence; + for (auto specifiedIndex : order) + { + auto loopIndex = getNextAvailable(specifiedIndex); + newLoopSequence.push_back(loopIndex); + } + + _loopSequence = newLoopSequence; + } + + void LoopNest::RenameVariable(ViewAdapter oldVariable, ViewAdapter newVariable, const std::vector& where, const std::vector& excludedKernels) + { + std::vector kernelIds; + std::transform( + excludedKernels.begin(), + excludedKernels.end(), + std::back_inserter(kernelIds), + [](const Kernel& kernel) { return kernel.GetId(); }); + + _renameActions.push_back({ oldVariable, newVariable, where, kernelIds }); + } + + int LoopNest::GetLoopIndexScale(const Index& index) const + { + // TODO: later we may normalize the loops, in which case indexScale here will be the loop increment + return 1; + } + + Index LoopNest::GetLoopIndex(const Index& dimension, int level) const + { + const auto& dim = GetDimensionRange(dimension); + return dim.GetSplitIndex(level); + } + + bool LoopNest::IsUsed(const Index& index, const std::vector& activeKernels) const + { + for (auto k : activeKernels) + { + for (auto kernelIndex : k.kernel.GetIndices()) + { + if (kernelIndex == index || GetDomain().DependsOn(kernelIndex, index)) + { + return true; + } + } + } + + return false; + } + + bool LoopNest::IsParallelized(const Index& index) const + { + return std::find(_parallelizedIndices.begin(), _parallelizedIndices.end(), index) != _parallelizedIndices.end(); + } + + bool LoopNest::IsUnrolled(const Index& index) const + { + return std::find(_unrolledIndices.begin(), _unrolledIndices.end(), index) != _unrolledIndices.end(); + } + + const std::vector& LoopNest::GetRenameActions() const + { + return _renameActions; + } + + const SplitIterationDomain& LoopNest::GetDomain() const + { + return _domain; + } + + Index LoopNest::GetBaseIndex(const Index& index) const { return _domain.GetBaseIndex(index); } + + bool LoopNest::IsLoopIndex(const Index& index) const + { + return _domain.IsLoopIndex(index); + } + + bool LoopNest::IsComputedIndex(const Index& index) const + { + return _domain.IsComputedIndex(index); + } + + IndexExpression LoopNest::GetIndexExpression(const Index& index) const + { + auto loopIndices = _domain.GetDependentLoopIndices(index); + + std::vector result; + for (auto loopIndex : loopIndices) + { + auto indexScale = GetLoopIndexScale(index); + result.push_back({ indexScale, loopIndex }); + } + + auto begin = _domain.GetDimensionBegin(index); + return { result, begin }; + } + + void LoopNest::DebugDump(std::string tag, std::ostream* stream) const + { + auto& targetStream = stream != nullptr ? 
*stream : std::cerr; + + GetDomain().Print(targetStream); + + targetStream << "Loop order: "; + for (auto i : GetLoopSequence()) + { + targetStream << i << " "; + } + targetStream << std::endl; + + LoopNestPrinter printer(targetStream); + + printer.Print(*this); + + if (!tag.empty()) + { + targetStream << "[tag = " << tag << "]"; + } + targetStream << '\n'; + } + + void DebugDump(const LoopNest& nest, std::string tag, std::ostream* stream) + { + nest.DebugDump(tag, stream); + } + + bool operator==(const ScheduledKernel& i1, const ScheduledKernel& i2) + { + return (i1.kernel == i2.kernel) && (i1.constraints == i2.constraints); + } + + bool operator!=(const ScheduledKernel& i1, const ScheduledKernel& i2) + { + return !(i1 == i2); + } + + LoopNest Fuse(const LoopNest& nest1, const LoopNest& nest2) + { + return Fuse(nest1, nest2, {}, {}); + } + + LoopNest Fuse(const LoopNest& nest1, const LoopNest& nest2, const std::vector& dependentIndexVec1, const std::vector& dependentIndexVec2) + { + // Collect all the indices for nest1 and nest2 + std::map nestIndices; + + auto makeSet = [&](const auto& container) { + std::set> result(container.begin(), container.end()); + return result; }; + + std::set dependentIndices1 = makeSet(dependentIndexVec1); + std::set dependentIndices2 = makeSet(dependentIndexVec2); + std::set nest1Indices = makeSet(nest1.GetDomain().GetAllLoopIndices()); + std::set nest2Indices = makeSet(nest2.GetDomain().GetAllLoopIndices()); + + // Collect vector of all IndexRanges, and indices in only one nest + // add indices in nest2 but not nest1 as "first" predicates for the nest1 kernels + // add indices in nest1 but not nest2 as "last" predicates for the nest2 kernels + auto domain1 = nest1.GetDomain(); + std::vector indexRanges; + for (const auto& index : nest1Indices) + { + auto range = domain1.GetIndexRange(index); + if (nest2Indices.count(index) == 0) + { + dependentIndices2.insert(index); + } + indexRanges.emplace_back(index, range); + } + + auto domain2 = nest2.GetDomain(); + for (const auto& index : nest2Indices) + { + auto range = domain2.GetIndexRange(index); + if (nest1Indices.count(index) != 0) + { + auto range1 = domain1.GetIndexRange(index); + if (range != range1) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "Fusing loops with incompatible ranges for index " + index.GetName()); + } + } + else + { + dependentIndices1.insert(index); + indexRanges.emplace_back(index, range); + } + } + + // Make new "selector" index + // Index s("sel"); + // indexRanges.emplace_back(s, Range{ 0, 2 }); + + // Create new loop nest + LoopNest result = { indexRanges }; + + // Add kernels for nest1 with "first" selector + for (const auto& kernel : nest1.GetKernels()) + { + auto fullPredicate = std::accumulate(dependentIndices1.begin(), dependentIndices1.end(), kernel.predicate, [](auto lhs, auto rhs) -> KernelPredicate { return lhs && First(rhs); }); + // result.AddKernel(kernel.kernel, kernel.constraints, fullPredicate && First(s)); // if using selector + result.AddKernel(kernel.kernel, kernel.constraints, fullPredicate, kernel.placement); + } + + // Add kernels for nest2 with "last" selector, and using the dependent indices passed in + for (const auto& kernel : nest2.GetKernels()) + { + auto fullPredicate = std::accumulate(dependentIndices2.begin(), dependentIndices2.end(), kernel.predicate, [](auto lhs, auto rhs) -> KernelPredicate { return lhs && Last(rhs); }); + // result.AddKernel(kernel.kernel, kernel.constraints, fullPredicate && Last(s)); + 
result.AddKernel(kernel.kernel, kernel.constraints, fullPredicate, kernel.placement); + } + + return result; + } + } // namespace loopnests +} // namespace value +} // namespace ell + +using namespace ell::value::loopnests; + +std::hash::result_type std::hash::operator()(const argument_type& kernel) const +{ + using ::ell::utilities::HashCombine; + + size_t hash = 0; + HashCombine(hash, kernel.kernel); + HashCombine(hash, kernel.constraints); + + return hash; +} diff --git a/libraries/value/src/loopnests/LoopNestPrinter.cpp b/libraries/value/src/loopnests/LoopNestPrinter.cpp new file mode 100644 index 000000000..8a87ccb45 --- /dev/null +++ b/libraries/value/src/loopnests/LoopNestPrinter.cpp @@ -0,0 +1,484 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: LoopNestPrinter.cpp (value) +// Authors: Chuck Jacobs +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "loopnests/LoopNestPrinter.h" + +#include +#include + +#include +#include + +namespace ell +{ +namespace value +{ + namespace loopnests + { + namespace + { + // computes ceil(a/b) + int CeilDiv(int a, int b) + { + return (a - 1) / b + 1; + } + } // namespace + + LoopNestPrinter::LoopNestPrinter(std::ostream& stream) : + _stream(stream), + _indentLevel(0) + { + } + + void LoopNestPrinter::Print(const LoopNest& loopNest) const + { + Visit(loopNest); + } + + void LoopNestPrinter::GenerateLoopRangeNew(const LoopRange& r, const RecursionStateNew& state, const LoopVisitSchedule& schedule, std::function codegenFn) const + { + const LoopNest& loopNest = schedule.GetLoopNest(); + auto loopIndex = schedule.CurrentLoopIndex(); + + bool isParallelized = loopNest.IsParallelized(loopIndex); + bool isUnrolled = loopNest.IsUnrolled(loopIndex); + assert(!(isParallelized && isUnrolled) && "An index cannot be both unrolled and parallelized"); + + const int startInt = r.start.Get(); + const int stopInt = r.stop.Get(); + const int stepInt = r.step.Get(); + + int numIterations = CeilDiv(stopInt - startInt, stepInt); + if (numIterations < 2) + { + isParallelized = false; + } + + std::vector properties; + if (isParallelized) + { + properties.push_back("parallel"); + } + if (isUnrolled) + { + properties.push_back("unrolled"); + } + if (numIterations == 1) + { + properties.push_back("single"); + } + + std::string propertiesStr; + if (!properties.empty()) + { + propertiesStr = ": (" + utilities::Join(properties, ", ") + ")"; + } + + WriteLine("For (" + GetIndexString(loopIndex, state.loopIndices) + " = " + std::to_string(startInt) + " to " + std::to_string(stopInt) + " by " + std::to_string(stepInt) + ")" + propertiesStr); + WriteLine("{"); + { + Indenter i(*this); + codegenFn(startInt); + } + WriteLine("}"); + } + + void LoopNestPrinter::GenerateLoopRangeOld(const LoopRange& r, const RecursionState& state, const LoopVisitSchedule& schedule, std::function codegenFn) const + { + const LoopNest& loopNest = schedule.GetLoopNest(); + auto loopIndex = schedule.CurrentLoopIndex(); + + bool isParallelized = loopNest.IsParallelized(loopIndex); + bool isUnrolled = loopNest.IsUnrolled(loopIndex); + assert(!(isParallelized && isUnrolled) && "An index cannot be both unrolled and parallelized"); + + const int startInt = r.start.Get(); + const int stopInt = r.stop.Get(); + const int stepInt = r.step.Get(); + + std::vector properties; + if (isParallelized) + { + properties.push_back("parallel"); + } 
+ if (isUnrolled) + { + properties.push_back("unrolled"); + } + + auto currentLoopHasPrologue = r.currentLoopFragmentFlags.GetFlag(LoopFragmentType::prologue); + auto currentLoopHasEpilogue = r.currentLoopFragmentFlags.GetFlag(LoopFragmentType::epilogue); + + if (currentLoopHasPrologue) + { + properties.push_back("prologue_kernel"); + } + + if (currentLoopHasEpilogue) + { + properties.push_back("epilogue_kernel"); + } + + std::string propertiesStr; + if (!properties.empty()) + { + propertiesStr = ": (" + utilities::Join(properties, ", ") + ")"; + } + + WriteLine("For (" + GetIndexString(loopIndex, state.loopIndices) + " = " + std::to_string(startInt) + " to " + std::to_string(stopInt) + " by " + std::to_string(stepInt) + ")" + propertiesStr); + WriteLine("{"); + { + Indenter i(*this); + codegenFn(startInt); + } + WriteLine("}"); + } + + std::string LoopNestPrinter::GetIndent() const + { + constexpr int indentSize = 4; + return std::string(static_cast(indentSize * _indentLevel), ' '); + } + + void LoopNestPrinter::WriteLine(std::string l) const + { + _stream << GetIndent() << l << "\n"; + } + + Scalar LoopNestPrinter::EmitIndexExpression(const Index& index, const IndexExpression& expr, const LoopIndexSymbolTable& indexVariables) const + { + if (!expr.indices.empty()) + { + // We can't currently optimize away the "identity" expression, because the result (a loops "index" Scalar) + // would be a register variable (pointer valence 0), and the generated kernel function expects a stored value + // (pointer valence 1). So, we need to call `Allocate()` to get a stored variable. + std::vector terms; + for (auto scaledIndex : expr.indices) + { + if (auto it = indexVariables.find(scaledIndex.index); it != indexVariables.end()) + { + auto name = GetIndexString(scaledIndex.index, indexVariables); + + if (scaledIndex.scale == 1) + { + terms.push_back(name); + } + else + { + terms.push_back(std::to_string(scaledIndex.scale) + "*" + name); + } + } + } + terms.push_back(std::to_string(expr.begin)); + + WriteLine("int " + GetIndexString(index, indexVariables) + " = " + utilities::Join(terms, " + ") + ";"); + } + return 0; // Silly but necessary according to the `EmitIndexExpression` API + } + + std::string LoopNestPrinter::GetIndexString(const Index& index, const LoopIndexSymbolTable& runtimeIndexVariables) const + { + auto name = index.GetName(); + if (auto it = runtimeIndexVariables.find(index); it != runtimeIndexVariables.end()) + { + auto range = it->second.loopRange; + if (range.Increment() > 0) + { + int numIterations = CeilDiv(range.End() - range.Begin(), range.Increment()); + if (numIterations == 1) + { + name = "[" + name + "=" + std::to_string(range.Begin()) + "]"; + } + } + } + return name; + } + + std::string LoopNestPrinter::GetPredicateString(const KernelPredicate& predicate, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const + { + if (predicate.IsAlwaysTrue()) + { + return "true"; + } + else if (predicate.IsAlwaysFalse()) + { + return "false"; + } + else if (auto fragmentPred = predicate.As(); fragmentPred != nullptr) + { + auto condition = fragmentPred->GetCondition(); + if (condition == Fragment::all) + { + return "true"; + } + + auto index = fragmentPred->GetIndex(); + const auto& domain = schedule.GetLoopNest().GetDomain(); + const auto range = domain.GetDimensionRange(index); + + auto loopIndices = range.GetDependentLoopIndices(index); + if (loopIndices.empty()) + { + loopIndices = { index }; + } + bool first = true; + std::string result = ""; + 
for (auto loopIndex : loopIndices) + { + auto range = GetLoopRange(loopIndex, runtimeIndexVariables, schedule); + + int testVal = 0; + bool valid = true; + switch (condition) + { + case Fragment::first: + testVal = range.Begin(); + break; + case Fragment::last: + testVal = range.End() - (range.Size() % range.Increment()); + if (testVal == range.End()) // not a boundary + { + testVal = range.End() - range.Increment(); + } + break; + case Fragment::endBoundary: + testVal = range.End() - (range.Size() % range.Increment()); + if (testVal == range.End()) // not a boundary + { + valid = false; + } + break; + default: + valid = false; + // throw? + break; + } + + if (valid) + { + if (first) + { + result = "("; + } + else + { + result += " && "; + } + first = false; + result += "(" + GetIndexString(loopIndex, runtimeIndexVariables) + " == " + std::to_string(testVal) + ")"; + } + } + return result.empty() ? "" : result + ")"; + } + else if (predicate.Is()) + { + throw utilities::LogicException(utilities::LogicExceptionErrors::notImplemented, "IsDefined predicate not implemented"); + } + else if (auto conjunction = predicate.As(); conjunction != nullptr) + { + const auto& terms = conjunction->GetTerms(); + if (terms.size() == 0) + { + return "true"; + } + else if (terms.size() == 1) + { + return GetPredicateString(*terms[0], runtimeIndexVariables, schedule); + } + else + { + std::string result = "("; + bool first = true; + for (const auto& t : terms) + { + if (!first) + { + result += " && "; + } + first = false; + result += GetPredicateString(*t, runtimeIndexVariables, schedule); + } + result += ")"; + return result; + } + } + else if (auto disjunction = predicate.As(); disjunction != nullptr) + { + const auto& terms = disjunction->GetTerms(); + if (terms.size() == 0) + { + return "true"; + } + else if (terms.size() == 1) + { + return GetPredicateString(*terms[0], runtimeIndexVariables, schedule); + } + else + { + std::string result = "("; + bool first = true; + for (const auto& t : terms) + { + result += GetPredicateString(*t, runtimeIndexVariables, schedule); + if (!first) + { + result += " || "; + } + first = false; + } + result += ")"; + return result; + } + } + else + { + throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState, "Unknown predicate type"); + } + } + + void LoopNestPrinter::EmitIf(const KernelPredicate& predicate, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const + { + WriteLine("If (" + GetPredicateString(predicate, runtimeIndexVariables, schedule) + ")"); + WriteLine("{"); + ++_indentLevel; + } + void LoopNestPrinter::EmitElseIf(const KernelPredicate& predicate, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const + { + EmitEndIf(); + WriteLine("ElseIf (" + GetPredicateString(predicate, runtimeIndexVariables, schedule) + ")"); + WriteLine("{"); + ++_indentLevel; + } + void LoopNestPrinter::EmitElse() const + { + EmitEndIf(); + WriteLine("Else"); + WriteLine("{"); + ++_indentLevel; + } + void LoopNestPrinter::EmitEndIf() const + { + --_indentLevel; + WriteLine("}"); + } + + void LoopNestPrinter::InvokeKernel(const Kernel& kernel, const KernelPredicate& predicate, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const + { + if (!predicate.IsEmpty()) + { + EmitIf(predicate, runtimeIndexVariables, schedule); + } + + InvokeKernel(kernel, runtimeIndexVariables, schedule); + + if (!predicate.IsEmpty()) + { + EmitEndIf(); + } + } + + void 
LoopNestPrinter::InvokeKernel(const Kernel& kernel, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const + { + const auto& renameActions = schedule.GetLoopNest().GetRenameActions(); + + auto rename = [&, this](const Value& arg) { + for (const auto& action : renameActions) + { + const auto& excludedKernels = action.excludedKernels; + if (std::find(excludedKernels.begin(), excludedKernels.end(), kernel.GetId()) == excludedKernels.end() && + std::equal_to{}(arg, action.oldValue) && + AreAllFullyDefined(action.where, schedule)) + { + auto newValue = action.newValue; + WriteLine("Using " + newValue.GetName() + " in place of " + arg.GetName()); + return newValue; + } + } + return arg; + }; + + std::vector args; + for (auto v : kernel.GetArgs()) + { + auto newV = rename(v); + if (auto name = newV.GetName(); name.empty()) + { + args.push_back(""); + } + else + { + args.push_back(name); + } + } + + for (const auto i : kernel.GetIndices()) + { + args.push_back(GetIndexString(i, runtimeIndexVariables)); + } + + WriteLine(kernel.GetName() + "(" + utilities::Join(args, ", ") + ");"); + } + + bool LoopNestPrinter::InvokeKernelGroup(const ScheduledKernelGroup& kernelGroup, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const + { + // preprocess to get only valid kernels + auto validKernels = GetValidKernels(kernelGroup, runtimeIndexVariables, schedule); + if (validKernels.empty()) + { + return false; + } + + bool first = true; + for (const auto& kernel : validKernels) + { + auto predicate = schedule.GetKernelPredicate(kernel).Simplify(runtimeIndexVariables, schedule); + if (predicate.IsAlwaysFalse()) + { + throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState, "Always-false predicates should have been removed here"); + } + + if (predicate.IsAlwaysTrue()) + { + if (!first) + { + EmitElse(); + } + } + else + { + if (first) + { + EmitIf(predicate, runtimeIndexVariables, schedule); + } + else + { + EmitElseIf(predicate, runtimeIndexVariables, schedule); + } + } + + InvokeKernel(kernel.kernel, runtimeIndexVariables, schedule); + if (predicate.IsAlwaysTrue()) + { + // Stop evaluating, we're done + break; + } + + first = false; + } + + if (!first) + { + EmitEndIf(); + } + + return true; + } + + } // namespace loopnests +} // namespace value +} // namespace ell diff --git a/libraries/value/src/loopnests/LoopNestVisitor.cpp b/libraries/value/src/loopnests/LoopNestVisitor.cpp new file mode 100644 index 000000000..46708b358 --- /dev/null +++ b/libraries/value/src/loopnests/LoopNestVisitor.cpp @@ -0,0 +1,1057 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: LoopNestVisitor.cpp (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "LLVMContext.h" + +#include "loopnests/KernelPredicate.h" +#include "loopnests/LoopNestPrinter.h" +#include "loopnests/LoopNestVisitor.h" + +#include + +#include +#include +#include +#include +#include + +namespace ell +{ +namespace value +{ + namespace loopnests + { + namespace + { + // computes ceil(a/b) + int CeilDiv(int a, int b) + { + return (a - 1) / b + 1; + } + + // check for a "placement" predicate without an index + bool IsBodyPlacementPredicate(const KernelPredicate& predicate) + { + if (auto placementPred = predicate.As(); placementPred != nullptr) + { + 
return !placementPred->HasIndex(); + } + return false; + } + } // namespace + + // + // LoopNestVisitor::RecursionState + // + LoopNestVisitor::RecursionState::RecursionState(const LoopNest& loopNest) : + currentFragment(LoopFragmentFlags::All()), + activeKernels(loopNest.GetKernels()) + { + } + + // + // LoopNestVisitor::RecursionStateNew + // + LoopNestVisitor::RecursionStateNew::RecursionStateNew(const LoopNest& loopNest) + { + kernelGroups.reserve(loopNest.GetKernelGroups().size()); + for (const auto& g : loopNest.GetKernelGroups()) + { + kernelGroups.emplace_back(true, g); + } + } + + // + // LoopNestVisitor + // + void LoopNestVisitor::Visit(const LoopNest& loopNest) const + { + auto schedule = loopNest.GetLoopSchedule(); + + if (UseNewVersion(loopNest)) + { + // 0) convert old-style constraints into new predicate model + // - have a "GetPredicate" function that appends constraint conditions to scheduled kernel's predicate + // 1) generate simple structure representing perfectly-nested loops with predicates on kernels + // - should replace old `LoopNest::GetLoopSchedule()` + // 2) unswitch conditions by splitting loops + // 3) replace constant predicates with either a simple kernel invocation or a no-op + // 4) replace single-iteration loops with simply setting the index value and evaluating the loop body + // 5) identify loops / index variable statements to omit + + // We need to create a RecursionState object, because it's passed in as a mutable (in/out) parameter + RecursionStateNew state = { loopNest }; + GenerateLoopsNew(state, schedule); + } + else + { + GenerateLoopsOld({ loopNest }, schedule); + } + } + + bool LoopNestVisitor::UseNewVersion(const LoopNest& loopNest) const + { + for (const auto& k : loopNest.GetKernels()) + { + if (k.newVersion) + { + return true; + } + } + return false; + } + + void LoopNestVisitor::GenerateLoopsNew(RecursionStateNew& state, const LoopVisitSchedule& schedule) const + { + if (schedule.IsDone()) + { + return; + } + + // We're descending into the heart of the loop + + // Find the active range for the current loop dimension and reduce our end amount if it exceeds the active range (boundary case) + auto loopIndex = schedule.CurrentLoopIndex(); + + bool hasValidKernels = false; + for (const auto& k : state.kernelGroups) + { + if (k.first) + { + hasValidKernels = true; + } + } + + // if state.kernelGroups is empty, just put all the remaining indices in the symbol table, marked "done" + if (!hasValidKernels) + { + // get each inner index and add it to state.loopIndices + auto s = schedule; + while (!s.IsDone()) + { + auto innerLoopIndex = s.CurrentLoopIndex(); + DefinePostLoopIndex(innerLoopIndex, state.loopIndices, s); + s = s.Next(); + } + return; + } + + // Alg: + + // 1) get splits/partitions + // 2) copy partition per kernel (group) + // 3) eval predicates and mark valid regions + // 4) make representation that's a list of kernels to run for each partition (e.g., [1,2 | 2 | 2, 3]) + // 5) move adjacent fully-matching suffix on left into right partition (and expand) + // 6) move adjacent fully-matching prefix on right into left partition (and expand) + + // ex, with S1: first(i), S2: all, S3: last(i): + + // step 1: partitions: (0..1), (1..N-1), (N-1..N) + // step 2: partitions w/ kernels: (0..1: S1, S2, S3), (1..N-1: S1, S2, S3), (N-1..N: S1, S2, S3) + // step 3: eval predicates and remove kernels: (0..1: S1, S2), (1..N-1: S2), (N-1..N: S2, S3) + // step 4: ... 
+ // step 5: Suffix of first partition matches entirety of second: move + // --> (0..1: S1), (0..N-1: S2), (N1-..N: S2, S3) + // step 6: prefix of last partition matches entirety of second: move + // --> (0..1: S1), (0..N: S2), (N1-..N: S3) + + auto loopRange = GetLoopRange(loopIndex, state.loopIndices, schedule); + auto partitions = GetPartitions(loopIndex, loopRange, state.kernelGroups, state.loopIndices, schedule); + std::vector ranges; + LoopFragmentFlags bodyFlags; + bodyFlags.SetFlag(LoopFragmentType::boundary, false); + for (const auto& p : partitions) + { + ranges.push_back({ p.range.Begin(), p.range.End(), p.range.Increment(), bodyFlags, LoopFragmentType::body }); + } + + for (auto r : ranges) + { +#if 1 + std::function codegenFn = GetCodegenFnNew(r, state, schedule); + GenerateLoopRangeNew(r, state, schedule, codegenFn); +#else + std::function codegenFn = GetCodegenFnNew(r, state, schedule); + const int startInt = r.start.Get(); + const int stopInt = r.stop.Get(); + const int stepInt = r.step.Get(); + auto numIterations = CeilDiv(stopInt - startInt, stepInt); + + if (numIterations == 0) + { + // throw? + } + else if (numIterations == 1) + { + // TODO: set initial value of index variable (at least in loop-nest-printing case) + // SetLoopIndexValue(loopIndex, r.start); + codegenFn(r.start); + } + else + { + GenerateLoopRangeNew(r, state, schedule, codegenFn); + } +#endif + } + + // set the loop index state to be "done" + DefinePostLoopIndex(loopIndex, state.loopIndices, schedule); + } + + void LoopNestVisitor::GenerateLoopsOld(const RecursionState& state, const LoopVisitSchedule& schedule) const + { + // Loop-unswitching / duplicating rules: + // + // Need to duplicate the outermost loop involving an index used to compute the constraint index + // Only the innermost loop involving an index used to compute the constraint index needs to start from `1` for the body case + // If all the loops with indices used to compute the constraint index are contiguous, and the kernel is run in the innermost of these loops, + // then we can omit the 'body' from the prologue (/epilogue) fragment, and allow the body loop to start from `0` + // (really, we can have the prologue (/epilogue) fragment contain only the constrained kernel) + + if (schedule.IsDone()) + { + return; + } + + // We're descending into the heart of the loop + + // If the index we're looping over in this loop has any prologue / epilogue kernels, we have to (potentially) break up the range + // into prologue / body / epilogue sections + auto currentDimension = schedule.CurrentDimension(); + + // Find the active range for the current loop dimension and reduce our end amount if it exceeds the active range (boundary case) + auto activeRangeIt = state.activeDimensionRanges.find(currentDimension); + auto loopRange = schedule.LoopRange(); + int begin = loopRange.Begin(); + int end = loopRange.End(); + int increment = schedule.LoopIncrement(); + if (activeRangeIt != state.activeDimensionRanges.end()) + { + auto activeRange = activeRangeIt->second; + if (end > activeRange.End()) + { + end = activeRange.End(); + loopRange = Range{ begin, end, increment }; + } + } + + int nonBoundaryEnd = GetMainBodyLoopEnd(state, schedule, loopRange); + + // These mean "split current loop for this fragment type" + auto currentLoopHasPrologue = schedule.CurrentLoopHasFragment(state.activeKernels, LoopFragmentType::prologue); + auto currentLoopHasEpilogue = schedule.CurrentLoopHasFragment(state.activeKernels, LoopFragmentType::epilogue); + + // check if we 
need to emit an epilogue section to handle the end boundary for this loop + auto currentLoopHasEndBoundary = schedule.CurrentIndexEndBoundarySize() != 0; + + auto futureLoopHasPrologue = schedule.FutureLoopHasFragmentForThisIndex(state.activeKernels, LoopFragmentType::prologue); + auto futureLoopHasEpilogue = schedule.FutureLoopHasFragmentForThisIndex(state.activeKernels, LoopFragmentType::epilogue); + + LoopFragmentFlags bodyFlags = state.currentFragment; + bodyFlags.SetFlag(LoopFragmentType::boundary, false); + + bool bodyInPrologue = !schedule.FragmentCanRunAlone(state.activeKernels, LoopFragmentType::prologue); + bool bodyInEpilogue = !schedule.FragmentCanRunAlone(state.activeKernels, LoopFragmentType::epilogue); + + bool generatePrologueFragment = currentLoopHasPrologue || futureLoopHasPrologue; + bool generateEpilogueFragment = currentLoopHasEpilogue || futureLoopHasEpilogue; + + std::vector ranges; + auto prologueBegin = begin; + auto prologueEnd = begin + increment; + + if (generatePrologueFragment) + { + if (bodyInPrologue) + { + begin += increment; + } + else + { + bodyFlags.SetFlag(LoopFragmentType::prologue, false); + } + } + + // adjust loop boundary to unswitch last loop iteration if we have an epilogue kernel + auto epilogueBegin = end - increment; + auto epilogueEnd = end; + if (generateEpilogueFragment) + { + if (bodyInEpilogue) + { + if (currentLoopHasEndBoundary) + { + epilogueBegin = nonBoundaryEnd; + } + else + { + end -= increment; + nonBoundaryEnd -= increment; + } + } + else + { + bodyFlags.SetFlag(LoopFragmentType::epilogue, false); + } + } + + // Add prologue section + if (generatePrologueFragment) + { + LoopFragmentFlags flags = bodyInPrologue ? LoopFragmentType::prologue | LoopFragmentType::body : LoopFragmentType::prologue; + ranges.push_back({ prologueBegin, prologueEnd, increment, flags, LoopFragmentType::prologue }); + } + + // Add main body section + if (nonBoundaryEnd > begin) + { + ranges.push_back({ begin, nonBoundaryEnd, increment, bodyFlags, LoopFragmentType::body }); + } + + // Add boundary case (unless epilogue case already handles it) + if (currentLoopHasEndBoundary && !(generateEpilogueFragment && bodyInEpilogue) && (end - nonBoundaryEnd > 0)) + { + ranges.push_back({ nonBoundaryEnd, end, increment, bodyFlags | LoopFragmentType::boundary, LoopFragmentType::body }); + } + + // Add epilogue case + if (generateEpilogueFragment) + { + LoopFragmentFlags flags = bodyInEpilogue ? LoopFragmentType::epilogue | LoopFragmentType::body : LoopFragmentType::epilogue; + if (currentLoopHasEndBoundary) + { + flags.SetFlag(LoopFragmentType::boundary, true); + } + ranges.push_back({ epilogueBegin, epilogueEnd, increment, flags, LoopFragmentType::epilogue }); + } + + for (auto r : ranges) + { + std::function codegenFn = GetCodegenFnOld(r, state, schedule); + const int startInt = r.start.Get(); + const int stopInt = r.stop.Get(); + const int stepInt = r.step.Get(); + auto numIterations = CeilDiv(stopInt - startInt, stepInt); + + if (numIterations == 0) + { + // throw? 
+ } + else if (numIterations == 1) + { + codegenFn(r.start); + } + else + { + GenerateLoopRangeOld(r, state, schedule, codegenFn); + } + } + } + + std::function LoopNestVisitor::GetCodegenFnNew(const LoopRange& r, const RecursionStateNew& state, const LoopVisitSchedule& schedule) const + { + // define the function used to do the codegen, but don't run it yet + return [this, &r, &state, &schedule](Scalar index) { + auto loopIndex = schedule.CurrentLoopIndex(); + + // TODO: deal with eventually not having an emit-time-constant range here + const int startInt = r.start.Get(); + const int stopInt = r.stop.Get(); + const int stepInt = r.step.Get(); + + // Note: it's important that this code not be moved outside of the `codegenFn` lambda, otherwise Compute will incorrectly use old info for subsequent ranges + auto newState = state; + newState.loopIndices.insert_or_assign(loopIndex, LoopIndexSymbolTableEntry{ loopIndex, index, Range(startInt, stopInt, stepInt), LoopIndexState::inProgress }); + + // define vars for all kernels + std::vector kernels; + for (const auto& g : state.kernelGroups) + { + if (g.first) + { + kernels.insert(kernels.end(), g.second.kernels.begin(), g.second.kernels.end()); + } + } + + DefineComputedIndexVariables(newState.loopIndices, kernels, schedule); + + // invoke all kernels valid before inner loops + for (auto& g : newState.kernelGroups) + { + if (g.first) + { + auto invoked = InvokeKernelGroup(g.second, newState.loopIndices, schedule); + if (invoked) + { +#if 0 + InvokeForContext([&](LLVMContext& context) { + auto& fn = context.GetFunctionEmitter(); + fn.Print("Invoking kernel " + g.second.id + " at loop index " + loopIndex.GetName() + "\n"); + }); +#endif + + g.first = false; + } + } + } + + // TODO: need to know if we're going to invoke any kernels after the inner loops, and remove them from the valid kernel groups + + if (!schedule.IsInnermostLoop()) + { + GenerateLoopsNew(newState, schedule.Next()); + + // invoke all kernels valid after inner loops + for (auto& g : newState.kernelGroups) + { + if (g.first) + { + auto invoked = InvokeKernelGroup(g.second, newState.loopIndices, schedule); + if (invoked) + { +#if 0 + InvokeForContext([&](LLVMContext& context) { + auto& fn = context.GetFunctionEmitter(); + fn.Print("Invoking after-kernel " + g.second.id + " at loop index " + loopIndex.GetName() + "\n"); + }); +#endif + g.first = false; + } + } + } + } + + // TODO: restore state of variables + // debugging + { + DefineComputedIndexVariables(newState.loopIndices, kernels, schedule); + } + }; + } + + Range LoopNestVisitor::GetLoopRange(const Index& loopIndex, const LoopIndexSymbolTable& activeRanges, const LoopVisitSchedule& schedule) + { + const auto& loopNest = schedule.GetLoopNest(); + const auto& domain = loopNest.GetDomain(); + auto loopRange = domain.GetIndexRange(loopIndex); + int begin = loopRange.Begin(); + int end = loopRange.End(); + int rangeSize = end - begin; + int increment = loopRange.Increment(); + + auto fixBoundaryRange = [&](Index index) { + // check activeRanges for parent + auto outerIndex = domain.GetOuterSplitIndex(domain.GetParentIndex(index)); + if (domain.IsLoopIndex(outerIndex) && activeRanges.count(outerIndex) != 0) + { + // check if it's a boundary --- if so, set size to its size + auto parentRange = activeRanges.at(outerIndex).loopRange; + if (parentRange.Size() < rangeSize) + { + rangeSize = parentRange.Size(); + end = begin + rangeSize; + loopRange = { begin, end, increment }; + } + } + }; + + if (domain.IsInnerSplitIndex(loopIndex)) + { 
+ fixBoundaryRange(loopIndex); + } + else if (domain.HasParentIndex(loopIndex)) + { + auto parentIndex = domain.GetParentIndex(loopIndex); + if (domain.IsInnerSplitIndex(parentIndex)) + { + fixBoundaryRange(domain.GetParentIndex(loopIndex)); + } + } + + return loopRange; + } + + LoopNestVisitor::PartitionList LoopNestVisitor::GetPartitions(const Index& loopIndex, Range loopRange, const ActiveKernelGroupList& kernels, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const + { + int begin = loopRange.Begin(); + int end = loopRange.End(); + int rangeSize = end - begin; + int increment = loopRange.Increment(); + + // Find conditions involving this index and add any relevant partition split points + std::set splits; + for (const auto& g : kernels) + { + if (g.first) + { + for (const auto& k : g.second.kernels) + { + auto predicate = schedule.GetKernelPredicate(k).Simplify(runtimeIndexVariables, schedule); + AddSplits(loopIndex, loopRange, predicate, schedule, splits); + } + } + } + + // Add boundary split point, if necessary + int extra = rangeSize % increment; + if (extra != 0) + { + splits.insert(rangeSize - extra); + } + + // get index range + PartitionList result; + for (auto partitionEnd : splits) + { + result.push_back({ loopIndex, { begin, partitionEnd, increment } }); + begin = partitionEnd; + } + result.push_back({ loopIndex, { begin, end, increment } }); + + return result; + } + + void LoopNestVisitor::AddSplits(const Index& loopIndex, Range loopRange, const KernelPredicate& predicate, const LoopVisitSchedule& schedule, std::set& splits) const + { + const auto& loopNest = schedule.GetLoopNest(); + const auto& domain = loopNest.GetDomain(); + + // visit predicate, adding testVal to splits + auto addSplits = [&splits, &domain, &loopIndex, &loopRange](const auto& addSplits, const KernelPredicate& p) -> void { + if (auto simplePredicate = p.As(); simplePredicate != nullptr) + { + auto where = simplePredicate->GetCondition(); + if (where != Fragment::all) + { + auto predIndex = simplePredicate->GetIndex(); + if (predIndex == loopIndex || (domain.SameDimension(predIndex, loopIndex) && domain.DependsOn(predIndex, loopIndex))) + { + std::optional splitVal; + switch (simplePredicate->GetCondition()) + { + case Fragment::first: + splitVal = loopRange.Begin() + loopRange.Increment(); + break; + case Fragment::last: + { + // take into account last range being a boundary condition + auto extra = loopRange.End() % loopRange.Increment(); + if (extra == 0) + { + splitVal = loopRange.End() - loopRange.Increment(); + } + else + { + splitVal = loopRange.End() - extra; + } + } + break; + case Fragment::endBoundary: + // already set by automatic boundary-handling code + break; + default: + // nothing + break; + } + + if (splitVal) + { + if (splitVal.value() > 0 && splitVal.value() < loopRange.End()) + { + splits.insert(splitVal.value()); + } + } + } + } + } + else if (p.Is()) + { + // nothing + } + else if (auto conjunction = p.As(); conjunction != nullptr) + { + for (const auto& t : conjunction->GetTerms()) + { + addSplits(addSplits, *t); + } + } + else if (auto disjunction = p.As(); disjunction != nullptr) + { + for (const auto& t : disjunction->GetTerms()) + { + addSplits(addSplits, *t); + } + } + }; + + addSplits(addSplits, predicate); + } + + std::function LoopNestVisitor::GetCodegenFnOld(const LoopRange& r, const RecursionState& state, const LoopVisitSchedule& schedule) const + { + // define the function used to do the codegen, but don't run it yet + return 
[this, &r, &state, &schedule](Scalar index) { + const LoopNest& loopNest = schedule.GetLoopNest(); + auto loopIndex = schedule.CurrentLoopIndex(); + + auto dimensionIndex = schedule.CurrentDimension(); + + LoopFragmentFlags flags = state.fragmentStates.count(dimensionIndex) == 0 ? LoopFragmentFlags::All() : state.fragmentStates.at(dimensionIndex); + flags &= r.futureLoopFragmentFlags; + if (r.futureLoopFragmentFlags.GetFlag(LoopFragmentType::boundary)) + { + flags.SetFlag(LoopFragmentType::boundary, true); + } + + // Note: it's important that this code not be moved outside of the `codegenFn` lambda, otherwise Compute will incorrectly use old info for subsequent ranges + auto newState = state; + newState.currentFragment = flags; + newState.fragmentStates[dimensionIndex] = flags; + newState.loopIndices.insert_or_assign(loopIndex, LoopIndexSymbolTableEntry{ loopIndex, index, Range{ 0, r.stop.Get() - r.start.Get() }, LoopIndexState::inProgress }); + + // set the active range for the current dimension based on the loop range given + newState.activeDimensionRanges.insert_or_assign(dimensionIndex, Range{ 0, r.stop.Get() - r.start.Get() }); + + // Should we use 'flags' or 'r.futureLoopFragmentFlags' in GetValidKernels call? + auto prologueKernels = GetValidKernels(newState.activeKernels, newState.fragmentStates, r.futureLoopFragmentFlags, LoopFragmentType::prologue, schedule); + auto bodyKernels = GetValidKernels(newState.activeKernels, newState.fragmentStates, r.futureLoopFragmentFlags, LoopFragmentType::body, schedule); + auto epilogueKernels = GetValidKernels(newState.activeKernels, newState.fragmentStates, r.futureLoopFragmentFlags, { LoopFragmentType::epilogue }, schedule); + + // Concatenate kernel lists together + std::vector thisLoopKernels; + thisLoopKernels.insert(thisLoopKernels.begin(), prologueKernels.begin(), prologueKernels.end()); + thisLoopKernels.insert(thisLoopKernels.begin(), bodyKernels.begin(), bodyKernels.end()); + thisLoopKernels.insert(thisLoopKernels.begin(), epilogueKernels.begin(), epilogueKernels.end()); + + DefineComputedIndexVariables(newState.loopIndices, thisLoopKernels, schedule); + auto indexVariables = GetRuntimeIndexVariables(newState.loopIndices, loopNest); + + // erase all kernels in newState.activeKernels with the same ID as ones we're going to execute + for (const auto& k : thisLoopKernels) + { + auto id = k.kernel.GetId(); + auto it = std::remove_if(newState.activeKernels.begin(), newState.activeKernels.end(), [&](auto el) { + return el.kernel.GetId() == id; + }); + + newState.activeKernels.erase(it, newState.activeKernels.end()); + } + + // Prologue + for (auto k : prologueKernels) + { + InvokeKernel(k.kernel, k.predicate, indexVariables, schedule); + } + + // Body + for (auto k : bodyKernels) + { + InvokeKernel(k.kernel, k.predicate, indexVariables, schedule); + } + + // Recursively generate the loops inside this one + if (!newState.activeKernels.empty()) + { + GenerateLoopsOld(newState, schedule.Next()); + } + + // TODO: restore state of variables + // debugging + { + DefineComputedIndexVariables(newState.loopIndices, thisLoopKernels, schedule); + indexVariables = GetRuntimeIndexVariables(newState.loopIndices, loopNest); + } + + for (auto k : epilogueKernels) + { + InvokeKernel(k.kernel, k.predicate, indexVariables, schedule); + } + }; + } + + int LoopNestVisitor::GetMainBodyLoopEnd(const RecursionState& state, const LoopVisitSchedule& schedule, const Range& loopRange) const + { + if (!LoopInEndBoundaryFragment(state, schedule)) + { + return 
schedule.NonBoundaryEnd(); + } + + auto rangeSize = loopRange.Size(); + auto increment = loopRange.Increment(); + int remainder = rangeSize % increment; + int nonBoundarySize = rangeSize - remainder; + return loopRange.Begin() + nonBoundarySize; + } + + bool LoopNestVisitor::LoopInEndBoundaryFragment(const RecursionState& state, const LoopVisitSchedule& schedule) const + { + auto loopIndex = schedule.CurrentLoopIndex(); + auto dimensionIndex = schedule.GetDomain().GetBaseIndex(loopIndex); + return ((state.fragmentStates.count(dimensionIndex) != 0) && state.fragmentStates.at(dimensionIndex).GetFlag(LoopFragmentType::boundary)); + } + + void LoopNestVisitor::DefineComputedIndexVariables(LoopIndexSymbolTable& indexVariables, const std::vector& activeKernels, const LoopVisitSchedule& schedule) const + { + const auto& loopNest = schedule.GetLoopNest(); + const auto& domain = schedule.GetDomain(); + int numDimensions = domain.NumDimensions(); + + // define all computed index variables (that are used) + std::set usedIndices; + for (int d = 0; d < numDimensions; ++d) + { + auto computedIndices = domain.GetComputedIndicesForDimension(domain.GetBaseIndex(d)); + for (auto index : computedIndices) + { + if (loopNest.IsUsed(index, activeKernels)) + { + usedIndices.insert(index); + } + } + } + + for (const auto& index : usedIndices) + { + auto expr = loopNest.GetIndexExpression(index); + auto indexValue = EmitIndexExpression(index, expr, indexVariables); + indexVariables.insert_or_assign(index, LoopIndexSymbolTableEntry{ schedule.CurrentLoopIndex(), indexValue, Range{ 0, 0, 0 }, LoopIndexState::inProgress }); + } + } + + bool LoopNestVisitor::IsPlacementValid(const ScheduledKernel& kernel, const LoopIndexSymbolTable& runtimeLoopIndices, const LoopVisitSchedule& schedule) const + { + const auto& domain = schedule.GetDomain(); + if (kernel.placement.IsEmpty() || IsBodyPlacementPredicate(kernel.placement)) + { + // TODO: put this in a function that preprocesses the kernel predicates when adding the kernels to the schedule + for (const auto& kernelIndex : kernel.kernel.GetIndices()) + { + for (const auto& loopIndex : domain.GetDependentLoopIndices(kernelIndex, true)) + { + // if not defined(loopIndex) return false; + if (runtimeLoopIndices.count(loopIndex) == 0 || runtimeLoopIndices.at(loopIndex).state == LoopIndexState::done) + { + return false; + } + } + } + + if (kernel.placement.IsEmpty()) + { + return true; + } + } + + auto evalPlacement = [&](const auto& evalPlacement, const KernelPredicate& p) -> bool { + if (p.IsAlwaysTrue()) + { + return true; + } + else if (p.Is()) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "Fragment predicates not valid for placement"); + } + else if (auto placementPred = p.As(); placementPred != nullptr) + { + if (schedule.IsInnermostLoop()) + { + return !placementPred->HasIndex(); + } + + auto nextLoopIndex = schedule.Next().CurrentLoopIndex(); + auto where = placementPred->GetPlacement(); + + std::vector dependentLoopIndices; + if (placementPred->HasIndex()) + { + auto testIndex = placementPred->GetIndex(); + + // get list of dependent indices + dependentLoopIndices = domain.GetDependentLoopIndices(testIndex, true); + + // First check that we're not already inside any dependent loops + for (const auto& i : dependentLoopIndices) + { + if (runtimeLoopIndices.count(i) != 0 && runtimeLoopIndices.at(i).state == LoopIndexState::inProgress) + { + return false; + } + } + } + else + { + dependentLoopIndices = { nextLoopIndex }; + } + + // 
Now check that the next loop at least partially defines the index in question + if (std::find(dependentLoopIndices.begin(), dependentLoopIndices.end(), nextLoopIndex) != dependentLoopIndices.end()) + { + // Finally, check that we're in the correct position (before vs. after) + if (where == Placement::before) + { + return (runtimeLoopIndices.count(nextLoopIndex) == 0 || runtimeLoopIndices.at(nextLoopIndex).state == LoopIndexState::notVisited); + } + else // (where == Placement::after) + { + return (runtimeLoopIndices.count(nextLoopIndex) != 0 && runtimeLoopIndices.at(nextLoopIndex).state == LoopIndexState::done); + } + } + return false; + } + else if (auto definedPred = p.As(); definedPred != nullptr) + { + auto definedIndex = definedPred->GetIndex(); + return (runtimeLoopIndices.count(definedIndex) > 0) && (runtimeLoopIndices.at(definedIndex).state != LoopIndexState::done); + } + else if (auto conjunction = p.As(); conjunction != nullptr) + { + bool result = true; + for (const auto& t : conjunction->GetTerms()) + { + result &= evalPlacement(evalPlacement, *t); + } + return result; + } + else if (auto disjunction = p.As(); disjunction != nullptr) + { + bool result = false; + for (const auto& t : disjunction->GetTerms()) + { + result |= evalPlacement(evalPlacement, *t); + } + return result; + } + else + { + return false; + } + }; + + return evalPlacement(evalPlacement, kernel.placement); + } + + std::vector LoopNestVisitor::GetValidKernels(const ScheduledKernelGroup& kernelGroup, const LoopIndexSymbolTable& runtimeIndexVariables, const LoopVisitSchedule& schedule) const + { + std::vector validKernels; + std::copy_if(kernelGroup.kernels.begin(), kernelGroup.kernels.end(), std::back_inserter(validKernels), [&](const ScheduledKernel& k) { + if (!IsPlacementValid(k, runtimeIndexVariables, schedule)) + { + return false; + } + auto predicate = schedule.GetKernelPredicate(k).Simplify(runtimeIndexVariables, schedule); + if (predicate.IsAlwaysFalse()) + { + return false; + } + return true; + }); + return validKernels; + } + + LoopIndexSymbolTable LoopNestVisitor::GetRuntimeIndexVariables(const LoopIndexSymbolTable& runtimeLoopIndices, const LoopNest& loopNest) const + { + int numDimensions = loopNest.NumDimensions(); + + // Start with the concrete loop indices + LoopIndexSymbolTable indexVariables = runtimeLoopIndices; + + // ...and add the variables we need to compute (because they represent an index that has been split) + for (int d = 0; d < numDimensions; ++d) + { + auto computedIndices = loopNest.GetDomain().GetComputedIndicesForDimension(loopNest.GetDomain().GetBaseIndex(d)); + for (auto index : computedIndices) + { + auto runtimeVarIter = runtimeLoopIndices.find(index); + if (runtimeVarIter != runtimeLoopIndices.end()) + { + indexVariables.insert_or_assign(index, runtimeVarIter->second); + } + } + } + return indexVariables; + } + + void LoopNestVisitor::DefinePostLoopIndex(const Index& loopIndex, LoopIndexSymbolTable& runtimeLoopIndices, const LoopVisitSchedule& schedule) const + { + auto loopRange = GetLoopRange(loopIndex, runtimeLoopIndices, schedule); + auto lastVal = loopRange.End(); + runtimeLoopIndices.insert_or_assign(loopIndex, LoopIndexSymbolTableEntry{ loopIndex, lastVal, loopRange, LoopIndexState::done }); + } + + std::vector LoopNestVisitor::GetValidKernels(std::vector activeKernels, const std::unordered_map& fragmentStates, LoopFragmentFlags currentLoopFlags, LoopFragmentFlags kernelFilter, const LoopVisitSchedule& schedule) const + { + std::vector result; + for (auto fragmentType 
: { LoopFragmentType::prologue, LoopFragmentType::body, LoopFragmentType::epilogue }) + { + for (const auto& kernel : activeKernels) + { + if (kernelFilter.GetFlag(kernel.constraints.GetPlacement())) + { + // This should only run in a loop fragment of type 'fragmentType' and allowed by currentLoopFlags + if (ShouldRunKernel(kernel, fragmentType, fragmentStates, currentLoopFlags, schedule)) + { + result.push_back(kernel); + } + } + } + } + + return result; + } + + bool LoopNestVisitor::ShouldRunKernel(const ScheduledKernel& kernel, LoopFragmentType kernelPlacement, const std::unordered_map& constraintIndices, LoopFragmentFlags currentLoopFlags, const LoopVisitSchedule& schedule) const + { + const auto& where = kernel.constraints; + auto placement = where.GetPlacement(); + bool isBodyKernel = where.GetBoundaryIndices().size() == 0; + if (isBodyKernel) + placement = LoopFragmentType::body; + + // if (where.GetPlacement() != kernelPlacement) + if (placement != kernelPlacement) + { + return false; + } + + // bool enforceBoundary = where.GetBoundaryIndices().size() != 0 || (currentLoopFlags.GetFlag(LoopFragmentType::prologue) || currentLoopFlags.GetFlag(LoopFragmentType::epilogue)); + bool enforceBoundary = true; + if (enforceBoundary && !currentLoopFlags.GetFlag(kernelPlacement)) + { + return false; + } + + // Are we at the correct loop level (are all the indices needed by the kernel defined)? + // TODO: We want to only fire on a loop involving a leaf child of the index + auto insideIndices = where.GetRequiredIndices(); + if (!insideIndices.empty()) + { + if (currentLoopFlags.GetFlag(kernelPlacement) && where.GetBoundaryIndices().size() != 0) + { + if (schedule.CurrentNestLevel() == 0) + { + return false; + } + if (!AreAllFullyDefined(insideIndices, schedule)) + { + return false; + } + } + else + { + if (!AreAllFullyDefined(insideIndices, schedule)) + { + return false; + } + + // We want to return true only when _this_ loop defines all the indices, so let's check that the parent + // loop wasn't also a valid candidate (but only perform this check if we're not on the first loop) + if (schedule.CurrentNestLevel() != 0) + { + if (AreAllFullyDefined(insideIndices, schedule.Prev())) + { + return false; + } + } + } + } + + // are we part of a prologue/epilogue for the indices we were constrained with? + for (const auto& outsideIndex : where.GetBoundaryIndices()) + { + auto it = constraintIndices.find(outsideIndex); + if (it == constraintIndices.end() || !it->second.GetFlag(kernelPlacement)) + { + return false; + } + + // is this the innermost loop level (or later) for the given constraint index? + // (to check, just ensure there are no more loops after this one with the same dimension index) + if (schedule.Next().WillVisitIndex(outsideIndex)) + { + return false; + } + } + + return true; + } + + bool LoopNestVisitor::WillKernelRunInThisLoop(const ScheduledKernel& kernel, LoopFragmentFlags kernelFilter, const LoopVisitSchedule& schedule) const + { + // return true if: + // 1) constraints position allowed by kernelFilter + // 2) all required indices exist + // 3) none of boundary indices exist (except perhaps for current loop?) + const auto& where = kernel.constraints; + if (!kernelFilter.GetFlag(where.GetPlacement())) + { + return false; + } + + // are we at the correct loop level (are all the indices needed by the kernel defined)? 
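+ // Illustration (hypothetical schedule): with loop order (i_outer, j, i_inner) and a kernel
+ // requiring indices { i, j }, index i only becomes fully defined once i_inner is reached,
+ // so the checks below succeed there; the schedule.Prev() test then attributes the kernel
+ // to that first qualifying loop level instead of every deeper one.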
+ // TODO: need to allow using non-"dimension" indices as well (for non-innermost kernels) + auto insideIndices = where.GetRequiredIndices(); + if (!insideIndices.empty()) + { + // If all the required indices aren't defined yet, fail + if (!AreAllFullyDefined(insideIndices, schedule)) + { + return false; + } + + // We want to return true only when _this_ loop defines all the indices, so let's check that the parent + // loop wasn't also a valid candidate (but only perform this check if we're not on the first loop) + if (schedule.CurrentNestLevel() != 0) + { + if (AreAllFullyDefined(insideIndices, schedule.Prev())) + { + return false; + } + } + } + + // are we part of a prologue/epilogue for the indices we were constrained with? + for (const auto& outsideIndex : where.GetBoundaryIndices()) + { + if (schedule.IsDone()) + { + return false; + } + + if (schedule.Next().WillVisitIndex(outsideIndex)) + { + return false; + } + } + + return true; + } + + bool LoopNestVisitor::IsIdentity(const IndexExpression& expr, const Index& index) const + { + return (expr.indices.size() == 1 && + expr.indices[0].index == index && + expr.indices[0].scale == 1 && + expr.begin == 0); + } + + bool LoopNestVisitor::AreAllFullyDefined(const std::vector<Index>& indices, const LoopVisitSchedule& schedule) const + { + for (const auto& index : indices) + { + if (!schedule.IsFullyDefined(index)) + { + return false; + } + } + return true; + } + + } // namespace loopnests +} // namespace value +} // namespace ell diff --git a/libraries/value/src/loopnests/Range.cpp b/libraries/value/src/loopnests/Range.cpp new file mode 100644 index 000000000..4960efc8f --- /dev/null +++ b/libraries/value/src/loopnests/Range.cpp @@ -0,0 +1,65 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: Range.cpp (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "loopnests/Range.h" + +#include <ostream> + +namespace ell +{ +namespace value +{ + namespace loopnests + { + Range::Range(int begin, int end, int increment) : + _begin(begin), + _end(end), + _increment(increment) {} + + int Range::Begin() const { return _begin; } + + int Range::End() const { return _end; } + + int Range::Size() const { return _end - _begin; } + + int Range::Increment() const { return _increment; } + + std::ostream& operator<<(std::ostream& os, const Range& r) + { + os << "[" << r.Begin() << "," << r.End() << ":" << r.Increment() << ")"; + return os; + } + + bool operator==(const Range& i1, const Range& i2) + { + return (i1.Begin() == i2.Begin()) && (i1.End() == i2.End()) && (i1.Increment() == i2.Increment()); + } + + bool operator!=(const Range& i1, const Range& i2) + { + return (i1.Begin() != i2.Begin()) || (i1.End() != i2.End()) || (i1.Increment() != i2.Increment()); + } + + bool operator<(const Range& i1, const Range& i2) + { + if (i1.Begin() != i2.Begin()) + { + return i1.Begin() < i2.Begin(); + } + else if (i1.End() != i2.End()) + { + return i1.End() < i2.End(); + } + else + { + return i1.Increment() < i2.Increment(); + } + } + } // namespace loopnests +} // namespace value +} // namespace ell diff --git a/libraries/value/src/loopnests/SplitIndexRange.cpp b/libraries/value/src/loopnests/SplitIndexRange.cpp new file mode 100644 index 000000000..996dba4ec --- /dev/null +++ b/libraries/value/src/loopnests/SplitIndexRange.cpp @@ -0,0 +1,506 @@
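+// A minimal usage sketch for the Range type defined above (illustrative values; assumes the
+// loopnests headers are on the include path and <iostream> is included):
+//
+//     ell::value::loopnests::Range r(0, 16, 4);
+//     std::cout << r << "\n"; // prints "[0,16:4)" via the operator<< in Range.cpp
+//     int s = r.Size();       // 16 --- note Size() is end - begin, not the trip count
+//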
+//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: SplitIndexRange.cpp (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "loopnests/SplitIndexRange.h" + +#include + +#include +#include + +namespace ell +{ +namespace value +{ + namespace loopnests + { + SplitIndexRange::SplitIndexRange(const IndexRange& indexRange) + { + auto index = indexRange.GetIndex(); + auto range = indexRange.GetRange(); + + // Add the dimension index as the root of the index tree + _indices.push_back(index); + _ranges.push_back(range); + _parentOffset.push_back(-1); // 'null' sentinel + _leftChildOffset.push_back(-1); // 'null' sentinel + _indexOffset[index] = 0; + } + + const Index& SplitIndexRange::GetDimensionIndex() const + { + return _indices[0]; + } + + int SplitIndexRange::NumSplits() const + { + auto result = static_cast(_indices.size() + 1) / 2; + return result; + } + + int SplitIndexRange::GetBegin() const + { + return _ranges[0].Begin(); + } + + int SplitIndexRange::GetSize() const + { + return _ranges[0].Size(); + } + + int SplitIndexRange::GetIncrement() const + { + return _ranges[0].Increment(); + } + + bool SplitIndexRange::Contains(const Index& index) const + { + if (IsDimension(index)) + { + return true; + } + return _indexOffset.count(index) != 0; + } + + bool SplitIndexRange::IsLoopIndex(const Index& index) const + { + auto node = GetNode(index); + return IsLeaf(node); + } + + bool SplitIndexRange::IsComputedIndex(const Index& index) const + { + auto node = GetNode(index); + return IsInteriorNode(node); + } + + bool SplitIndexRange::IsDimension(const Index& index) const + { + return index == GetDimensionIndex(); + } + + bool SplitIndexRange::IsChildOf(const Index& child, const Index& parent) const + { + auto childOffset = GetNode(child); + auto parentOffset = GetNode(parent); + return _parentOffset.at(childOffset) != -1 && _parentOffset.at(childOffset) == parentOffset; + } + + bool SplitIndexRange::IsParentOf(const Index& parent, const Index& child) const + { + return IsChildOf(parent, child); + } + + // index1 depends on index2? e.g., is index2 in the list of dependent indices? 
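+ // e.g., with dimension i split once into i_1 (outer) and i_2 (inner): DependsOn(i, i_2) is
+ // true (the dimension index depends on everything in its tree), DependsOn(i_2, i) is false,
+ // and DependsOn(i_1, i_2) is false, since neither index is an ancestor of the other.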
+ bool SplitIndexRange::DependsOn(const Index& index1, const Index& index2) const + { + // TODO: assert index1 and index2 are both in this dimension + + // The top-level dimension index depends on everything + if (IsDimension(index1)) + { + return true; + } + + // nothing else depends on the top-level dimension index + if (IsDimension(index2)) + { + return false; + } + + auto node1 = GetNode(index1); + auto node2 = GetNode(index2); + if (node1 == node2) + { + return false; + } + + while (node2 > 0) + { + auto parentNode = GetParent(node2); + if (parentNode == node1) + { + return true; + } + node2 = parentNode; + } + return false; + } + + int SplitIndexRange::GetSplitSize(int level) const + { + if (level > NumSplits()) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "GetSplitSize() --- index out of range"); + } + if (level == NumSplits()) + { + return 1; + } + auto node = GetNthLeaf(level); + return _ranges[node].Size(); + } + + Range SplitIndexRange::GetDimensionRange() const + { + return _ranges[0]; + } + + Range SplitIndexRange::GetIndexRange(const Index& index) const + { + auto node = GetNode(index); + return _ranges[node]; + } + + Index SplitIndexRange::GetSplitIndex(int level) const + { + auto node = GetNthLeaf(level); + return _indices[node]; + } + + SplitIndex SplitIndexRange::Split(int size) + { + if (size > _ranges.back().Size()) + { + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "Split size larger than smallest existing split"); + } + + auto lastLeaf = GetSmallestLeaf(0); // arbitrarily choose the last leaf --- it will be the right child of the right child of the right child... of the root (the `0` argument) + return SplitNode(lastLeaf, size); + } + + SplitIndex SplitIndexRange::Split(Index index, int size) + { + if (IsDimension(index)) + { + index = _indices[0]; + } + auto node = GetNode(index); + return SplitNode(node, size); + } + + int SplitIndexRange::GetNode(const Index& index) const + { + return _indexOffset.at(index); + } + + SplitIndex SplitIndexRange::SplitNode(int node, int size) + { + auto prefix = GetDimensionIndex().GetName() + "_"; + auto startIndex = static_cast(_indices.size()); + Index outer = { prefix + std::to_string(startIndex) }; + Index inner = { prefix + std::to_string(startIndex + 1) }; + + // `SplitNode(n, size)` splits a leaf of `n` --- the bottom-rightmost leaf. If `n` is a leaf already, split it. 
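+ // Traced example: for a fresh dimension i with range [0, 16, 1), Split(4) reaches here with
+ // node == 0, names the outer index "i_1" with range { 0, 16, 4 } and the inner index "i_2"
+ // with range { 0, 4, 1 }, and records i_1's offset as node 0's left child.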
+ auto parentOffset = GetSmallestLeaf(node); + auto parent = _indices[parentOffset]; + auto parentSize = _ranges[parentOffset].Size(); + auto parentIncrement = _ranges[parentOffset].Increment(); + auto leftChildOffset = static_cast<int>(_indices.size()); + + // Add outer index to data structure + auto offset = static_cast<int>(_indices.size()); + _indices.push_back(outer); + _ranges.push_back({ 0, parentSize, size }); + _parentOffset.push_back(parentOffset); + _leftChildOffset.push_back(-1); + _indexOffset[outer] = offset; + + // Add inner index to data structure + offset = static_cast<int>(_indices.size()); + _indices.push_back(inner); + auto thisSize = std::min(parentSize, size); // In case split is larger than original range + _ranges.push_back({ 0, thisSize, parentIncrement }); + _parentOffset.push_back(parentOffset); + _leftChildOffset.push_back(-1); + _leftChildOffset[parentOffset] = leftChildOffset; + _indexOffset[inner] = offset; + + return SplitIndex{ outer, inner }; + } + + // + // Binary-tree implementation below: + // + int SplitIndexRange::GetParent(int node) const + { + return _parentOffset.at(node); + } + + int SplitIndexRange::GetLeftChild(int node) const + { + return _leftChildOffset.at(node); + } + + int SplitIndexRange::GetRightChild(int node) const + { + auto leftChild = _leftChildOffset.at(node); + return leftChild == -1 ? leftChild : leftChild + 1; + } + + int SplitIndexRange::GetNthLeaf(int n) const + { + for (int i = 0; i < static_cast<int>(_leftChildOffset.size()); ++i) + { + auto node = _leftChildOffset[i]; + if (node == -1) // leaves have no children + { + if (n == 0) + { + return i; + } + --n; + } + } + + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "Couldn't find node"); + } + + // returns the "smallest" leaf descendant of index. If index is itself a leaf, return it, else return GetSmallestLeaf(index.rightChild) + int SplitIndexRange::GetSmallestLeaf(int node) const + { + if (IsLeaf(node)) + { + return node; + } + return GetSmallestLeaf(GetRightChild(node)); + } + + bool SplitIndexRange::IsLeaf(int node) const + { + auto isLeaf = _leftChildOffset.at(node) == -1; + if (isLeaf && (_parentOffset.at(node) == -1 && node != 0)) + { + throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState); + } + + return isLeaf; + } + + bool SplitIndexRange::IsInteriorNode(int node) const + { + return !IsLeaf(node); + } + + const std::vector<Index>& SplitIndexRange::GetIndices() const + { + return _indices; + } + + std::vector<Index> SplitIndexRange::GetLoopIndices() const + { + std::vector<Index> result; + for (int n = 0; n < static_cast<int>(_indices.size()); ++n) + { + if (IsLeaf(n)) + { + result.push_back(_indices[n]); + } + } + return result; + } + + std::vector<Index> SplitIndexRange::GetComputedIndices() const + { + std::vector<Index> result; + for (int n = 0; n < static_cast<int>(_indices.size()); ++n) + { + if (IsInteriorNode(n)) + { + result.push_back(_indices[n]); + } + } + return result; + } + + std::vector<Index> SplitIndexRange::GetDependentIndices(const Index& index, bool includeSelf) const + { + std::vector<Index> result; + if (includeSelf) + { + result.push_back(index); + } + + std::queue<int> nodesToVisit; + auto node = GetNode(index); + nodesToVisit.push(node); + // get all children, children of children, etc.
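+ // e.g., if i was split twice (i_1/i_2 under i, then i_3/i_4 under i_2), this walk from i_2
+ // collects { i_2, i_3, i_4 }; note the loop records the starting node itself, so passing
+ // includeSelf == true currently lists the starting index twice.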
+ while (!nodesToVisit.empty()) + { + auto n = nodesToVisit.front(); + nodesToVisit.pop(); + result.push_back(_indices[n]); + auto leftChild = GetLeftChild(n); + if (leftChild != -1) + { + nodesToVisit.push(leftChild); + nodesToVisit.push(leftChild + 1); + } + } + return result; + } + + std::vector SplitIndexRange::GetDependentLoopIndices(const Index& index, bool includeSelf) const + { + + std::queue nodesToVisit; + auto node = IsDimension(index) ? 0 : GetNode(index); + if (includeSelf && IsLeaf(node)) + { + // If we're a leaf, no need to check any further + return { index }; + } + + std::vector result; + nodesToVisit.push(node); + // get all children, children of children, etc. + while (!nodesToVisit.empty()) + { + auto n = nodesToVisit.front(); + nodesToVisit.pop(); + + for (auto child : { GetLeftChild(n), GetRightChild(n) }) + { + if (child != -1) + { + if (IsLeaf(child)) + { + result.push_back(_indices[child]); + } + else + { + nodesToVisit.push(child); + } + } + } + } + return result; + } + + bool SplitIndexRange::HasParentIndex(const Index& index) const + { + auto node = GetNode(index); + return node != -1 && node != 0; + } + + Index SplitIndexRange::GetParentIndex(const Index& index) const + { + auto node = GetNode(index); + if (node == -1 || node == 0) + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "GetParentIndex() --- dimension index has no parent"); + + return _indices[GetParent(node)]; + } + + bool SplitIndexRange::IsOuterSplitIndex(const Index& index) const + { + auto node = GetNode(index); + if (node == -1 || node == 0) + { + return false; + } + + auto parentNode = GetParent(node); + return node == GetLeftChild(parentNode); + } + + bool SplitIndexRange::IsInnerSplitIndex(const Index& index) const + { + auto node = GetNode(index); + if (node == -1 || node == 0) + { + return false; + } + + auto parentNode = GetParent(node); + return node == GetRightChild(parentNode); + } + + Index SplitIndexRange::GetOuterSplitIndex(const Index& parent) const + { + auto parentNode = GetNode(parent); + if (IsLeaf(parentNode)) + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "GetOuterSplitIndex() --- called on a non-split index"); + + return _indices[GetLeftChild(parentNode)]; + } + + Index SplitIndexRange::GetInnerSplitIndex(const Index& parent) const + { + auto parentNode = GetNode(parent); + if (IsLeaf(parentNode)) + throw utilities::InputException(utilities::InputExceptionErrors::invalidArgument, "GetInnerSplitIndex() --- called on a non-split index"); + + return _indices[GetRightChild(parentNode)]; + } + + std::vector SplitIndexRange::GetAllParentIndices(const Index& index) const + { + std::vector result{ GetDimensionIndex() }; + auto node = GetNode(index); + if (node == -1) + return result; + + for (;;) + { + node = GetParent(node); + if (node == -1) + { + break; + } + result.push_back(_indices[node]); + } + return result; + } + + std::vector SplitIndexRange::GetChildIndices(const Index& index) const + { + if (IsDimension(index)) + { + return { _indices[0] }; + } + + auto node = GetNode(index); + if (node == -1) + { + throw utilities::LogicException(utilities::LogicExceptionErrors::illegalState); + } + + return { _indices[GetLeftChild(node)], _indices[GetRightChild(node)] }; + } + + void SplitIndexRange::Print(std::ostream& os) const + { + os << "Dimension " << GetDimensionIndex() << " range: " << GetDimensionRange() << std::endl; + + os << " Loop variables:\t"; + for (auto i : GetLoopIndices()) + { + auto r = 
GetIndexRange(i); + os << i << ": " << r << ";\t"; + } + os << std::endl; + + os << " Comp variables:\t"; + for (auto i : GetComputedIndices()) + { + auto r = GetIndexRange(i); + os << i << ": " << r << " ("; + std::string sep = ""; + for (auto dep : GetChildIndices(i)) + { + os << sep << dep; + sep = ", "; + } + os << ");\t"; + } + os << std::endl; + } + } // namespace loopnests +} // namespace value +} // namespace ell diff --git a/libraries/value/src/loopnests/SplitIterationDomain.cpp b/libraries/value/src/loopnests/SplitIterationDomain.cpp new file mode 100644 index 000000000..1faca13a3 --- /dev/null +++ b/libraries/value/src/loopnests/SplitIterationDomain.cpp @@ -0,0 +1,254 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: SplitIterationDomain.cpp (value) +// Authors: Chuck Jacobs, Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "loopnests/SplitIterationDomain.h" + +namespace ell +{ +namespace value +{ + namespace loopnests + { + SplitIterationDomain::SplitIterationDomain(const IterationDomain& domain) + { + int numDimensions = domain.NumDimensions(); + for (int d = 0; d < numDimensions; ++d) + { + _dimensions.emplace_back(domain.GetDimensionRange(d)); + _indexToOffsetMap[_dimensions.back().GetDimensionIndex()] = d; + for (auto i : _dimensions.back().GetIndices()) + { + _baseIndices.emplace(i, _dimensions.back().GetDimensionIndex()); + } + } + //Assert(IsUnique(Transform(domain.GetRanges(), [](auto x) { return x.GetDimensionIndex().GetName(); })), "Dimensions must have unique indices"); + } + + int SplitIterationDomain::NumDimensions() const + { + return static_cast(_dimensions.size()); + } + + int SplitIterationDomain::GetDimensionSize(const Index& dimensionIndex) const + { + return GetDimensionRange(dimensionIndex).GetSize(); + } + + int SplitIterationDomain::GetDimensionBegin(const Index& dimensionIndex) const + { + return GetDimensionRange(dimensionIndex).GetBegin(); + } + + Range SplitIterationDomain::GetIndexRange(const Index& index) const + { + return GetDimensionRange(index).GetIndexRange(index); + } + + const std::vector& SplitIterationDomain::GetIndicesForDimension(const Index& dimensionIndex) const + { + return GetDimensionRange(dimensionIndex).GetIndices(); + } + + std::vector SplitIterationDomain::GetAllLoopIndices() const + { + std::vector result; + for (int i = 0; i < NumDimensions(); ++i) + { + auto dimensionIndices = GetDimensionRange(i).GetLoopIndices(); + result.insert(result.end(), dimensionIndices.begin(), dimensionIndices.end()); + } + return result; + } + + std::vector SplitIterationDomain::GetLoopIndicesForDimension(const Index& dimensionIndex) const + { + return GetDimensionRange(dimensionIndex).GetLoopIndices(); + } + + std::vector SplitIterationDomain::GetComputedIndicesForDimension(const Index& dimensionIndex) const + { + return GetDimensionRange(dimensionIndex).GetComputedIndices(); + } + + std::vector SplitIterationDomain::GetDependentIndices(const Index& index, bool includeSelf) const + { + return GetDimensionRange(index).GetDependentIndices(index, includeSelf); + } + + std::vector SplitIterationDomain::GetDependentLoopIndices(const Index& index, bool includeSelf) const + { + return GetDimensionRange(index).GetDependentLoopIndices(index, includeSelf); + } + + bool SplitIterationDomain::Contains(const Index& index) const + { + return 
GetDimensionRange(index).Contains(index); + } + + bool SplitIterationDomain::IsLoopIndex(const Index& index) const + { + return GetDimensionRange(index).IsLoopIndex(index); + } + + bool SplitIterationDomain::IsComputedIndex(const Index& index) const + { + return GetDimensionRange(index).IsComputedIndex(index); + } + + bool SplitIterationDomain::IsDimension(const Index& index) const + { + return GetBaseIndex(index) == index; + } + + bool SplitIterationDomain::SameDimension(const Index& index1, const Index& index2) const + { + return GetBaseIndex(index1) == GetBaseIndex(index2); + } + + bool SplitIterationDomain::IsParentOf(const Index& parent, const Index& child) const + { + if (!SameDimension(parent, child)) + { + return false; + } + return GetDimensionRange(parent).IsParentOf(parent, child); + } + + bool SplitIterationDomain::IsChildOf(const Index& child, const Index& parent) const + { + if (!SameDimension(child, parent)) + { + return false; + } + return GetDimensionRange(child).IsChildOf(child, parent); + } + + bool SplitIterationDomain::DependsOn(const Index& index1, const Index& index2) const + { + if (!SameDimension(index1, index2)) + { + return false; + } + return GetDimensionRange(index1).DependsOn(index1, index2); + } + + bool SplitIterationDomain::HasParentIndex(const Index& parent) const + { + return GetDimensionRange(parent).HasParentIndex(parent); + } + + Index SplitIterationDomain::GetParentIndex(const Index& parent) const + { + return GetDimensionRange(parent).GetParentIndex(parent); + } + + bool SplitIterationDomain::IsOuterSplitIndex(const Index& index) const + { + return GetDimensionRange(index).IsOuterSplitIndex(index); + } + bool SplitIterationDomain::IsInnerSplitIndex(const Index& index) const + { + return GetDimensionRange(index).IsInnerSplitIndex(index); + } + Index SplitIterationDomain::GetOuterSplitIndex(const Index& parent) const + { + return GetDimensionRange(parent).GetOuterSplitIndex(parent); + } + Index SplitIterationDomain::GetInnerSplitIndex(const Index& parent) const + { + return GetDimensionRange(parent).GetInnerSplitIndex(parent); + } + + std::vector SplitIterationDomain::GetAllParentIndices(const Index& index) const + { + return GetDimensionRange(index).GetAllParentIndices(index); + } + + std::vector SplitIterationDomain::GetChildIndices(const Index& index) const + { + return GetDimensionRange(index).GetChildIndices(index); + } + + const SplitIndexRange& SplitIterationDomain::GetDimensionRange(int offset) const + { + return _dimensions[offset]; + } + + SplitIndexRange& SplitIterationDomain::GetDimensionRange(int offset) + { + return _dimensions[offset]; + } + + const SplitIndexRange& SplitIterationDomain::GetDimensionRange(const Index& index) const + { + auto offset = GetOffsetFromIndex(index); + return _dimensions[offset]; + } + + SplitIndexRange& SplitIterationDomain::GetDimensionRange(const Index& index) + { + auto offset = GetOffsetFromIndex(index); + return _dimensions[offset]; + } + + int SplitIterationDomain::NumSplits(const Index& index) const + { + auto offset = GetOffsetFromIndex(index); + return _dimensions[offset].NumSplits(); + } + + SplitIndex SplitIterationDomain::Split(const Index& index, int splitSize) + { + auto baseIndex = GetBaseIndex(index); + auto offset = GetOffsetFromIndex(index); + auto result = _dimensions[offset].Split(index, splitSize); + _baseIndices.emplace(result.inner, baseIndex); + _baseIndices.emplace(result.outer, baseIndex); + return result; + } + + bool SplitIterationDomain::IsPrimaryDimension(const Index& index) const 
+ { + return _indexToOffsetMap.count(index) != 0; + } + + int SplitIterationDomain::GetOffsetFromIndex(const Index& index) const + { + auto baseIndex = GetBaseIndex(index); + return _indexToOffsetMap.at(baseIndex); + } + + Index SplitIterationDomain::GetBaseIndex(const Index& index) const + { + auto mapIndex = _baseIndices.find(index); + if (mapIndex != _baseIndices.end()) + { + return mapIndex->second; + } + else + { + return index; + } + } + + Index SplitIterationDomain::GetBaseIndex(int offset) const + { + return _dimensions[offset].GetDimensionIndex(); + } + + void SplitIterationDomain::Print(std::ostream& os) const + { + for (const auto& d : _dimensions) + { + d.Print(os); + } + } + + } // namespace loopnests +} // namespace value +} // namespace ell diff --git a/libraries/value/test/include/CachingStrategy_test.h b/libraries/value/test/include/CachingStrategy_test.h new file mode 100644 index 000000000..cb1a1ab1e --- /dev/null +++ b/libraries/value/test/include/CachingStrategy_test.h @@ -0,0 +1,148 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: CachingStrategy_test.h (value) +// Authors: Mason Remy +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +namespace ell +{ +namespace value +{ + class Scalar; +} + +// Simple Blas TCOPY tests +value::Scalar BLASTCOPY_ValidateOutput_Test1(); +value::Scalar BLASTCOPY_ValidateOutput_Test2(); +value::Scalar BLASTCOPY_ValidateMemory_Test1(); +value::Scalar BLASTCOPY_ValidateMemory_Test2(); +value::Scalar BLASTCOPY_ValidateMemory_Test3(); + +value::Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Test1(); +value::Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Test2(); +value::Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Test3(); +value::Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Test4(); +value::Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Test5(); +value::Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Test6(); +value::Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Test7(); +value::Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Test8(); +value::Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Test9(); + +value::Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Test1(); +value::Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Test2(); +value::Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Test3(); +value::Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Test4(); +value::Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Test5(); +value::Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Test6(); +value::Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Test7(); +value::Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Test8(); +value::Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Test9(); + +// Direct convolution caching +value::Scalar ConvolutionWeight_ValidateOutput_Test1(); +value::Scalar ConvolutionWeight_Reshape_ValidateMemory_Test1(); +value::Scalar ConvolutionInput_ValidateOutput_Test1(); +value::Scalar ConvolutionInput_ValidateOutput_Test2(); +value::Scalar ConvolutionOutput_ValidateOutput_Test1(); +value::Scalar DirectConvolution_Test1(); + +// General caching strategy +value::Scalar GeneralCachingStrategy_ValidateOutput_Test1(); +value::Scalar GeneralCachingStrategy_ValidateOutput_Test2(); +value::Scalar GeneralCachingStrategy_ValidateOutput_Test3();
+value::Scalar GeneralCachingStrategy_ValidateOutput_Test4(); +value::Scalar GeneralCachingStrategy_ValidateOutput_Test5(); +value::Scalar GeneralCachingStrategy_ValidateOutput_Test6(); +value::Scalar GeneralCachingStrategy_ValidateOutput_Test7(); +value::Scalar GeneralCachingStrategy_ValidateOutput_Test8(); +value::Scalar GeneralCachingStrategy_ValidateOutput_Test9(); +value::Scalar GeneralCachingStrategy_ValidateOutput_Test10(); +value::Scalar GeneralCachingStrategy_ValidateOutput_Test11(); +value::Scalar GeneralCachingStrategy_ValidateOutput_Test12(); +value::Scalar GeneralCachingStrategy_ValidateOutput_Test13(); + +value::Scalar GeneralCachingStrategy_ValidateMemory_Test1(); + +// General caching strategy boundary condition output tests +value::Scalar GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test1(); +value::Scalar GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test2(); +value::Scalar GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test3(); +value::Scalar GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test4(); +value::Scalar GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test5(); +value::Scalar GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test6(); +value::Scalar GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test7(); +value::Scalar GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test8(); +value::Scalar GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test9(); + +// General caching strategy BLASTCopy-style tests +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_Test1(); +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_Test2(); + +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_Test1(); +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_Test2(); +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_Test3(); + +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test1(); +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test2(); +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test3(); +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test4(); +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test5(); +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test6(); +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test7(); +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test8(); +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test9(); + +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test1(); +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test2(); +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test3(); +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test4(); +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test5(); +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test6(); +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test7(); +value::Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test8(); +value::Scalar 
GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test9(); + +// General caching strategy ProgressiveBLASNCopy-style tests + +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_Test1(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_Test2(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_Test1(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_Test2(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_Test3(); + +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test1(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test2(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test3(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test4(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test5(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test6(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test7(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test8(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test9(); + +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test1(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test2(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test3(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test4(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test5(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test6(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test7(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test8(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test9(); + +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test1(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test2(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test3(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test4(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test5(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test6(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test7(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test8(); +value::Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test9(); + +} // namespace ell diff --git 
a/libraries/value/test/include/Functions_test.h b/libraries/value/test/include/Functions_test.h new file mode 100644 index 000000000..910e60cb1 --- /dev/null +++ b/libraries/value/test/include/Functions_test.h @@ -0,0 +1,18 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: Functions_test.h (value) +// Authors: Kern Handa, Chuck Jacobs +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include + +namespace ell +{ + +value::Scalar FunctionArgType_test(); + +} // namespace ell diff --git a/libraries/value/test/include/LoopNestAPI_test.h b/libraries/value/test/include/LoopNestAPI_test.h new file mode 100644 index 000000000..65801252e --- /dev/null +++ b/libraries/value/test/include/LoopNestAPI_test.h @@ -0,0 +1,37 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: LoopNestAPI_test.h (value) +// Authors: Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +namespace ell +{ +namespace value +{ + class Scalar; +} +value::Scalar LoopNest_api_test1(); +value::Scalar LoopNest_api_test2(); +value::Scalar LoopNest_api_test3(); +value::Scalar LoopNest_api_test4(); +value::Scalar LoopNest_api_test5(); +value::Scalar LoopNest_api_Parallelized_test1(); +value::Scalar LoopNest_api_Parallelized_test2(); +value::Scalar LoopNest_api_Unrolled_test1(); +value::Scalar LoopNest_api_SetOrder_test1(); +value::Scalar LoopNest_api_CachedMatrix_test1(); +value::Scalar LoopNest_api_SlidingCachedMatrix_test(); +value::Scalar SimpleGemm_HighLevelAPI(); +value::Scalar SimpleGemm_HighLevelAPI_NoCachingHelper(); +value::Scalar MLAS_GEMM_GeneralCachingStrategy(); +value::Scalar OneSplitBoundaryTest(); +value::Scalar TwoSplitBoundaryTest(); +value::Scalar SplitLargerThanSizeBoundaryTest(); +value::Scalar TwoSplitsLargerThanSizeBoundaryTest(); +value::Scalar LoopNest_api_tunable_parameters_test1(); + +} // namespace ell diff --git a/libraries/value/test/include/LoopNest_convolution_test.h b/libraries/value/test/include/LoopNest_convolution_test.h new file mode 100644 index 000000000..c89cc41ea --- /dev/null +++ b/libraries/value/test/include/LoopNest_convolution_test.h @@ -0,0 +1,18 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: LoopNest_convolution_test.h (value) +// Authors: Mason Remy +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +namespace ell +{ +namespace value +{ + class Scalar; +} + +} // namespace ell diff --git a/libraries/value/test/include/LoopNest_kernels.h b/libraries/value/test/include/LoopNest_kernels.h new file mode 100644 index 000000000..15588fdb0 --- /dev/null +++ b/libraries/value/test/include/LoopNest_kernels.h @@ -0,0 +1,42 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: LoopNest_kernels.h (value) +// Authors: Kern Handa, Chuck Jacobs +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +namespace ell +{ +namespace value +{ + class Scalar; + class 
Matrix; + class Tensor; + class Vector; + struct ViewAdapter; +} // namespace value + +void loopnest_passthrough(value::ViewAdapter, value::Scalar i, value::Scalar j); +void loopnest_kernel(value::Matrix m, value::Scalar i, value::Scalar j); +void loopnest_kernel_2(value::Matrix m, value::Scalar i, value::Scalar j); +void loopnest_kernel_3(value::Matrix c, value::Matrix a, value::Scalar i, value::Scalar j); +void loopnest_kernel_4(value::Matrix c, value::Matrix a, value::Scalar i, value::Scalar j); +void matmul_kernel(value::Matrix A, value::Matrix B, value::Matrix C, value::Scalar i, value::Scalar j, value::Scalar k); +void initToZero(value::Matrix m, value::Scalar i, value::Scalar j); +void copyToCache(value::Matrix A, value::Matrix cache, value::Scalar i, value::Scalar j); +void copyFromCache(value::Matrix A, value::Matrix cache, value::Scalar i, value::Scalar j); +void copyToSmallCache(value::Matrix A, value::Matrix cache, value::Scalar i, value::Scalar j); +void copyFromSmallCache(value::Matrix A, value::Matrix cache, value::Scalar i, value::Scalar j); +void addOne(value::Matrix m, value::Scalar i, value::Scalar j); +void addTwo(value::Matrix m, value::Scalar i, value::Scalar j); +void set_vector_kernel(value::Vector v, value::Scalar i); +void increment_vector_kernel(value::Vector v, value::Scalar i); +void copy_vector_kernel(value::Vector v1, value::Vector v2, value::Scalar i); +void reorder_vector_kernel(value::Vector v, value::Matrix m, value::Scalar splitParam, value::Scalar i, value::Scalar iOuter, value::Scalar iInner); +void addCachedMatrixToUnchachedMatrix(value::Matrix A, value::Matrix B, value::Scalar Ai, value::Scalar Aj, value::Scalar Bi, value::Scalar Bj); +void addCachedMatrixToUnchachedMatrixUnrolled(value::Matrix A, value::Matrix B, value::Scalar Ai, value::Scalar Aj, value::Scalar Bi, value::Scalar Bj); + +} // namespace ell diff --git a/libraries/value/test/include/LoopNest_test.h b/libraries/value/test/include/LoopNest_test.h new file mode 100644 index 000000000..fd1cfbf49 --- /dev/null +++ b/libraries/value/test/include/LoopNest_test.h @@ -0,0 +1,89 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: LoopNest_test.h (value) +// Authors: Kern Handa, Chuck Jacobs +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include + +namespace ell +{ +value::Scalar SplitIterationDomain_test1(); + +// Loop nest tests +value::Scalar LoopNest_test1(); +value::Scalar LoopNest_test2(); +value::Scalar LoopNest_test3(); +value::Scalar LoopNest_test4(); +value::Scalar LoopNest_test5(); +value::Scalar LoopNest_test6(); + +value::Scalar LoopNestNonzeroStart_test(); +value::Scalar LoopNestBoundary_test1(); +value::Scalar LoopNestBoundary_test2(); +value::Scalar LoopNestBoundary_test3(); +value::Scalar LoopNestBoundary_test4(); +value::Scalar LoopNestBoundary_test5(); +value::Scalar LoopNestReorder_test1(); +value::Scalar LoopNestReorder_test2(); +value::Scalar TwoKernel_test(); + +value::Scalar LoopNestLastPredicate_test1(); +value::Scalar LoopNestLastPredicate_test2(); +value::Scalar LoopNestLastPredicate_test3(); +value::Scalar LoopNestLastPredicate_test4(); +value::Scalar LoopNestBoundaryPredicate_test1(); + +value::Scalar MissingIndex_test(); +value::Scalar RequiredIndex_test(); +value::Scalar SimpleImperfectNest_test(); +value::Scalar ImperfectNest_test_ijk(); +value::Scalar 
ImperfectNest_test_ikj(); +value::Scalar ImperfectNest_test_kij(); +value::Scalar ImperfectNest_test_ijkijk(); +value::Scalar ImperfectNest_test_kijijk(); +value::Scalar ImperfectNest_test_ijkkij(); +value::Scalar SplitIndex_test1_old(); +value::Scalar SplitIndex_test1(); +value::Scalar SplitIndex_test2(); +value::Scalar SplitIndex_test3(); +value::Scalar EpilogueIndex_test(); +value::Scalar RenameKernelArg_test(); + +value::Scalar NonInnermostKernel_test1(); +value::Scalar NonInnermostKernel_test2(); +value::Scalar NonInnermostKernel_test3(); +value::Scalar NonInnermostKernel_test4(); +value::Scalar CachedMatrix_test1(); +value::Scalar CachedMatrix_test1_new(); +value::Scalar CachedMatrix_test2(); +value::Scalar CachedMatrix_test3(); +value::Scalar CachedMatrix_test4(); +value::Scalar CachedMatrix_test5(); + +value::Scalar LoopNest_Parallelized_test1(); +value::Scalar LoopNest_Parallelized_test2(); + +value::Scalar LoopNest_Unrolled_test1(); + +value::Scalar LoopNest_DebugDump_test1(); +value::Scalar LoopNest_DebugDump_test2(); + +value::Scalar SimpleMatMult_test(); +value::Scalar GotoBLASGemm_LowLevelAPI(); +value::Scalar GotoBLASGemmWithRefDeref(); +value::Scalar YG12LowLevel_TestBoundary(); + +value::Scalar KernelPredicate_test(); +value::Scalar MatMul3_test1(); +value::Scalar MatMul3_test2(); +value::Scalar LoopNestFuse_test1(); +value::Scalar LoopNestFuse_test2(); +value::Scalar LoopNestFuse_test3(); +value::Scalar ConvertedConstraint_test1(); +value::Scalar ConvertedConstraint_test2(); +} // namespace ell diff --git a/libraries/value/test/include/Matrix_test.h b/libraries/value/test/include/Matrix_test.h index 63aad2f8f..93c5e340c 100644 --- a/libraries/value/test/include/Matrix_test.h +++ b/libraries/value/test/include/Matrix_test.h @@ -15,6 +15,7 @@ namespace ell value::Scalar Matrix_test1(); value::Scalar Matrix_test2(); value::Scalar Matrix_test3(); +value::Scalar Matrix_test4(); value::Scalar Reshape_test(); value::Scalar GEMV_test(); value::Scalar MatrixReferenceTest(); diff --git a/libraries/value/test/include/Scalar_test.h b/libraries/value/test/include/Scalar_test.h index 03bf1fed7..168cac7a6 100644 --- a/libraries/value/test/include/Scalar_test.h +++ b/libraries/value/test/include/Scalar_test.h @@ -21,4 +21,6 @@ value::Scalar RefScalarRefTest(); value::Scalar RefScalarRefCtorsTest(); value::Scalar RefScalarRefRefTest(); value::Scalar RefScalarRefRefRefTest(); +value::Scalar SequenceLogicalAndTest(); +value::Scalar SequenceLogicalAndTestWithCopy(); } // namespace ell diff --git a/libraries/value/test/include/TestUtil.h b/libraries/value/test/include/TestUtil.h index c0994c8f8..36e8e4a95 100644 --- a/libraries/value/test/include/TestUtil.h +++ b/libraries/value/test/include/TestUtil.h @@ -8,26 +8,105 @@ #pragma once +#include + +#include #include #include #include #include +#include #include namespace ell { -value::Scalar EqualEpsilon(value::Scalar x, value::Scalar y, double epsilon); -value::Scalar NotEqualEpsilon(value::Scalar x, value::Scalar y, double epsilon); +value::Scalar NotEqualEpsilon(value::Scalar x, value::Scalar y, double epsilon = 1e-7); +value::Scalar EqualEpsilon(value::Scalar x, value::Scalar y, double epsilon = 1e-7); + +value::Scalar VerifySame(value::Vector actual, value::Vector expected, double epsilon = 1e-7); +value::Scalar VerifySame(value::Matrix actual, value::Matrix expected, double epsilon = 1e-7); +value::Scalar VerifySame(value::Tensor actual, value::Tensor expected, double epsilon = 1e-7); +value::Scalar VerifySame(value::Array actual, 
value::Array expected, double epsilon = 1e-7); -value::Scalar Verify(value::Vector actual, value::Vector expected, double epsilon = 1e-7); value::Scalar VerifyDifferent(value::Vector actual, value::Vector expected, double epsilon = 1e-7); -value::Scalar Verify(value::Matrix actual, value::Matrix expected, double epsilon = 1e-7); -value::Scalar Verify(value::Tensor actual, value::Tensor expected, double epsilon = 1e-7); +value::Scalar VerifyDifferent(value::Matrix actual, value::Matrix expected, double epsilon = 1e-7); +value::Scalar VerifyDifferent(value::Tensor actual, value::Tensor expected, double epsilon = 1e-7); +value::Scalar VerifyDifferent(value::Array actual, value::Array expected, double epsilon = 1e-7); void PrintMatrix(std::string indent, value::Matrix e); -void DebugPrint(std::string message); -void DebugPrint(value::Vector message); // expecting null terminated ValueType::Char8 void DebugPrintVector(value::Vector data); -void DebugPrintScalar(value::Scalar value); +void PrintLoops(const value::loopnests::LoopNest& loop, std::string tag); + +value::Scalar GetTID(); + +template +value::Array MakeIncrementingArray(std::vector size, const std::string& name) +{ + auto array = value::MakeArray(utilities::MemoryShape(size), name); + int counter = 0; + value::For(array, [&](const std::vector& indices) { + array(indices) = counter++; + }); + return array; +} + +template +value::Tensor MakeIncrementingTensor(int rows, int columns, int channels, const std::string& name) +{ + auto tensor = value::MakeTensor(rows, columns, channels, name); + int counter = 0; + value::ForRange(channels, [&](value::Scalar channel) { + value::ForRange(rows, [&](value::Scalar row) { + value::ForRange(columns, [&](value::Scalar column) { + tensor(row, column, channel) = counter++; + }); + }); + }); + return tensor; +} + +template +value::Matrix MakeIncrementingMatrix(int rows, int cols, const std::string& name) +{ + auto matrix = value::MakeMatrix(rows, cols, name); + value::ForRange(rows, [&](value::Scalar row) { + value::ForRange(cols, [&](value::Scalar col) { + matrix(row, col) = row * cols + col; + }); + }); + return matrix; +} + +template +value::Vector MakeIncrementingVector(int elements, const std::string& name) +{ + auto vec = value::MakeVector(elements, name); + value::ForRange(elements, [&](value::Scalar element) { + vec(element) = element; + }); + return vec; +} + +// +// Matrix-multiply example helpers +// +void MultiplyMatrices(value::Matrix& A, value::Matrix& B, value::Matrix& C); + +struct MatMul3TestCaseParameters +{ + int M; + int N; + int K; + int L; + value::Matrix A; + value::Matrix B; + value::Matrix C; + value::Matrix D; + value::Matrix E; + value::Matrix expectedC; + value::Matrix expectedE; +}; + +MatMul3TestCaseParameters GetMatMul3TestCaseParameters(int M, int N, int K, int L); } // namespace ell diff --git a/libraries/value/test/include/Value_test.h b/libraries/value/test/include/Value_test.h index e90566c90..5443291cb 100644 --- a/libraries/value/test/include/Value_test.h +++ b/libraries/value/test/include/Value_test.h @@ -15,6 +15,7 @@ namespace ell value::Scalar Basic_test(); value::Scalar DebugPrint_test(); value::Scalar Value_test1(); +value::Scalar Array_test1(); value::Scalar Casting_test1(); value::Scalar If_test1(); value::Scalar Sum_test(); @@ -23,10 +24,30 @@ value::Scalar Intrinsics_test1(); value::Scalar Intrinsics_test2(); value::Scalar For_test1(); value::Scalar For_test2(); +value::Scalar ForInsideIf_test(); +value::Scalar While_test(); +value::Scalar 
WhileInsideIf_test(); value::Scalar ForRangeCasting_test1(); value::Scalar ForRangeCasting_test2(); value::Scalar Parallelized_test1(); value::Scalar Parallelized_test2(); value::Scalar Parallelized_test3(); value::Scalar Prefetch_test1(); + +value::Scalar Prefetch_parallelized_test1(); +value::Scalar Fma_test1(); +value::Scalar Fma_test2(); +value::Scalar Fma_test3(); +value::Scalar UniqueName_test1(); +value::Scalar Parallelized_ComputeContext_test1(); + +value::Scalar MemCopy_test1(); +value::Scalar MemSet_test1(); + +value::Scalar NamedLoops_test1(); + +value::Scalar ThreadLocalAllocation_test1(); + +value::Scalar FunctionPointer_test1(); + } // namespace ell diff --git a/libraries/value/test/src/CachingStrategy_test.cpp b/libraries/value/test/src/CachingStrategy_test.cpp new file mode 100644 index 000000000..048ee84fa --- /dev/null +++ b/libraries/value/test/src/CachingStrategy_test.cpp @@ -0,0 +1,6171 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: CachingStrategy_test.cpp (value) +// Authors: Mason Remy +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "CachingStrategy_test.h" +#include "TestUtil.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace ell::emitters; +using namespace ell::utilities; +using namespace ell::logging; +using namespace ell::value; +using namespace ell::value::loopnests; + +namespace ell +{ +// Tests of LoopNest caching strategies + +Scalar BLASTCOPY_ValidateOutput_Test1() +{ + int N = 8; + int cacheRows = N; + int cacheCols = N; + int stripeSize = 4; + + auto input = MakeIncrementingMatrix(N, N, "input"); + auto output = MakeMatrix(N, N, "output"); + auto expectedOutput = MakeIncrementingMatrix(N, N, "expectedOutput"); + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iCache = schedule.Split(i, cacheRows); + auto jCache = schedule.Split(j, cacheCols); + auto jStripe = schedule.Split(j, stripeSize); + + schedule.SetOrder({ iCache, jCache, jStripe, i, j }); + + BLASTCopy cachingProvider{}; + std::tuple blasTCopyExtras = { stripeSize, jStripe, BoundaryConditionHandling::ZeroPadding }; + schedule.Cache(cachingProvider, + input, + { i, j }, + { cacheRows, cacheCols }, + { iCache, jCache }, + std::nullopt, // Order isn't used by BLASTCopy + blasTCopyExtras); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + return VerifySame(output, expectedOutput); +} + +// Test with smaller cache and stripe size than previous test +Scalar BLASTCOPY_ValidateOutput_Test2() +{ + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + // input, expectedOutput + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7] + // [ 8, 9, 10, 11, 12, 13, 14, 15] + // [16, 17, 18, 19, 20, 21, 22, 23] + // [24, 25, 26, 27, 28, 29, 30, 31] + // [32, 33, 34, 35, 36, 37, 38, 39] + // [40, 41, 42, 43, 44, 45, 46, 47] + // [48, 49, 50, 51, 52, 53, 
54, 55] + // [56, 57, 58, 59, 60, 61, 62, 63] + auto input = MakeIncrementingMatrix(N, N, "input"); + auto output = MakeMatrix(N, N, "output"); + auto expectedOutput = MakeIncrementingMatrix(N, N, "expectedOutput"); + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iCache = schedule.Split(i, cacheRows); + auto jCache = schedule.Split(j, cacheCols); + auto jStripe = schedule.Split(j, stripeSize); + + schedule.SetOrder({ iCache, jCache, jStripe, i, j }); + + BLASTCopy cachingProvider{}; + std::tuple blasTCopyExtras = { stripeSize, jStripe, BoundaryConditionHandling::ZeroPadding }; + schedule.Cache(cachingProvider, + input, + { i, j }, + { cacheRows, cacheCols }, + { iCache, jCache }, + std::nullopt, // Order isn't used by BLASTCopy + blasTCopyExtras); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + return VerifySame(output, expectedOutput); +} + +Scalar BLASTCOPY_ValidateMemory_Test1() +{ + int N = 8; + int cacheRows = N; + int cacheCols = N; + int stripeSize = 4; + + auto input = MakeIncrementingMatrix(N, N, "input"); + auto output = MakeMatrix(N, N, "output"); + + // input + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7] + // [ 8, 9, 10, 11, 12, 13, 14, 15] + // [16, 17, 18, 19, 20, 21, 22, 23] + // [24, 25, 26, 27, 28, 29, 30, 31] + // [32, 33, 34, 35, 36, 37, 38, 39] + // [40, 41, 42, 43, 44, 45, 46, 47] + // [48, 49, 50, 51, 52, 53, 54, 55] + // [56, 57, 58, 59, 60, 61, 62, 63] + // clang-format off + Vector expectedCached = + { + 0, 1, 2, 3, + 8, 9, 10, 11, + 16, 17, 18, 19, + 24, 25, 26, 27, + 32, 33, 34, 35, + 40, 41, 42, 43, + 48, 49, 50, 51, + 56, 57, 58, 59, + + 4, 5, 6, 7, + 12, 13, 14, 15, + 20, 21, 22, 23, + 28, 29, 30, 31, + 36, 37, 38, 39, + 44, 45, 46, 47, + 52, 53, 54, 55, + 60, 61, 62, 63 + }; + // clang-format on + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iCache = schedule.Split(i, cacheRows); + auto jCache = schedule.Split(j, cacheCols); + auto jStripe = schedule.Split(j, stripeSize); + + schedule.SetOrder({ iCache, jCache, jStripe, i, j }); + + BLASTCopy cachingProvider{}; + std::tuple blasTCopyExtras = { stripeSize, jStripe, BoundaryConditionHandling::ZeroPadding }; + schedule.Cache(cachingProvider, + input, + { i, j }, + { cacheRows, cacheCols }, + { iCache, jCache }, + std::nullopt, // Order isn't used by BLASTCopy + blasTCopyExtras); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + // Examine the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + rawCacheValue.SetLayout({ { (int)rawCacheValue.GetLayout().GetMemorySize() } }); + auto cacheVector = Vector(rawCacheValue); + + return VerifySame(cacheVector, expectedCached); +} + +// Smaller stripe size than previous test +Scalar BLASTCOPY_ValidateMemory_Test2() +{ + int N = 8; + int cacheRows = N; + int cacheCols = N; + int stripeSize = 2; + + auto input = MakeIncrementingMatrix(N, N, "input"); + auto output = MakeMatrix(N, N, "output"); + + // input + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7] + // [ 8, 9, 10, 11, 12, 
13, 14, 15] + // [16, 17, 18, 19, 20, 21, 22, 23] + // [24, 25, 26, 27, 28, 29, 30, 31] + // [32, 33, 34, 35, 36, 37, 38, 39] + // [40, 41, 42, 43, 44, 45, 46, 47] + // [48, 49, 50, 51, 52, 53, 54, 55] + // [56, 57, 58, 59, 60, 61, 62, 63] + // clang-format off + Vector expectedCached = + { + 0, 1, + 8, 9, + 16, 17, + 24, 25, + 32, 33, + 40, 41, + 48, 49, + 56, 57, + + 2, 3, + 10, 11, + 18, 19, + 26, 27, + 34, 35, + 42, 43, + 50, 51, + 58, 59, + + 4, 5, + 12, 13, + 20, 21, + 28, 29, + 36, 37, + 44, 45, + 52, 53, + 60, 61, + + 6, 7, + 14, 15, + 22, 23, + 30, 31, + 38, 39, + 46, 47, + 54, 55, + 62, 63 + }; + // clang-format on + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iCache = schedule.Split(i, cacheRows); + auto jCache = schedule.Split(j, cacheCols); + auto jStripe = schedule.Split(j, stripeSize); + + schedule.SetOrder({ iCache, jCache, jStripe, i, j }); + + BLASTCopy cachingProvider{}; + std::tuple blasTCopyExtras = { stripeSize, jStripe, BoundaryConditionHandling::ZeroPadding }; + schedule.Cache(cachingProvider, + input, + { i, j }, + { cacheRows, cacheCols }, + { iCache, jCache }, + std::nullopt, // Order isn't used by BLASTCopy + blasTCopyExtras); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + // Examine the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + rawCacheValue.SetLayout({ { (int)rawCacheValue.GetLayout().GetMemorySize() } }); + auto cacheVector = Vector(rawCacheValue); + + return VerifySame(cacheVector, expectedCached); +} + +// Same stripe size as previous test, but don't cache entire matrix at once +Scalar BLASTCOPY_ValidateMemory_Test3() +{ + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + auto input = MakeIncrementingMatrix(N, N, "input"); + auto output = MakeMatrix(N, N, "output"); + + // input + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7] + // [ 8, 9, 10, 11, 12, 13, 14, 15] + // [16, 17, 18, 19, 20, 21, 22, 23] + // [24, 25, 26, 27, 28, 29, 30, 31] + // [32, 33, 34, 35, 36, 37, 38, 39] + // [40, 41, 42, 43, 44, 45, 46, 47] + // [48, 49, 50, 51, 52, 53, 54, 55] + // [56, 57, 58, 59, 60, 61, 62, 63] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 1, + 8, 9, + 16, 17, + 24, 25, + + 2, 3, + 10, 11, + 18, 19, + 26, 27, + }; + Vector expectedCachedUpperRight = + { + 4, 5, + 12, 13, + 20, 21, + 28, 29, + + 6, 7, + 14, 15, + 22, 23, + 30, 31 + }; + Vector expectedCachedLowerLeft = + { + 32, 33, + 40, 41, + 48, 49, + 56, 57, + + 34, 35, + 42, 43, + 50, 51, + 58, 59, + }; + Vector expectedCachedLowerRight = + { + 36, 37, + 44, 45, + 52, 53, + 60, 61, + + 38, 39, + 46, 47, + 54, 55, + 62, 63 + }; + // clang-format on + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto topLevelI = i; + auto topLevelJ = j; + + auto iCache = schedule.Split(i, cacheRows); + auto jCache = schedule.Split(j, cacheCols); + auto jStripe = schedule.Split(j, stripeSize); + + schedule.SetOrder({ iCache, jCache, jStripe, i, j }); + + BLASTCopy cachingProvider{}; + std::tuple blasTCopyExtras = 
{ stripeSize, jStripe, BoundaryConditionHandling::ZeroPadding }; + schedule.Cache(cachingProvider, + input, + { i, j }, + { cacheRows, cacheCols }, + { iCache, jCache }, + std::nullopt, // Order isn't used by BLASTCopy + blasTCopyExtras); + + // Get a handle to the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + int rawCacheSize = (int)rawCacheValue.GetLayout().NumElements(); + + auto cachedUpperLeft = MakeVector(rawCacheSize); + auto cachedUpperRight = MakeVector(rawCacheSize); + auto cachedLowerLeft = MakeVector(rawCacheSize); + auto cachedLowerRight = MakeVector(rawCacheSize); + + // Add a low level API kernel to access the underlying cache after it has been filled + auto cacheSpyKernel = loopnests::Kernel("cache_spy_kernel") + .Inputs(rawCacheValue, cachedUpperLeft, cachedUpperRight, cachedLowerLeft, cachedLowerRight) + .Indices(topLevelI, topLevelJ) + .Define([cacheRows, cacheCols](Value rawCacheValue, Vector cachedUpperLeft, Vector cachedUpperRight, Vector cachedLowerLeft, Vector cachedLowerRight, Scalar i, Scalar j) { + auto cacheView = rawCacheValue; + cacheView.SetLayout({ { (int)rawCacheValue.GetLayout().NumElements() } }); + auto vectorCacheView = Vector(cacheView); + If(i == 0, + [&]() { + // TODO : remove nested if's + If(j == 0, + [&]() { + cachedUpperLeft = vectorCacheView; + }) + .ElseIf(j == cacheCols, + [&]() { + cachedUpperRight = vectorCacheView; + }); + }) + .ElseIf(i == cacheRows, + [&]() { + If(j == 0, [&]() { + cachedLowerLeft = vectorCacheView; + }).ElseIf(j == cacheCols, [&]() { + cachedLowerRight = vectorCacheView; + }); + }); + }); + auto cacheSpyPosition = loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::body, { iCache, jCache }, {} }; + nest.GetUnderlyingLoopNest().AddKernel(cacheSpyKernel, cacheSpyPosition); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + auto ok = MakeScalar("ok"); + ok = 1; + auto printError = [&] { + DebugPrint("Upper Left:"); + DebugPrintVector(cachedUpperLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Upper Right:"); + DebugPrintVector(cachedUpperRight); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperRight); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Lower Left:"); + DebugPrintVector(cachedLowerLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedLowerLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Lower Right:"); + DebugPrintVector(cachedLowerRight); + DebugPrint("\n"); + DebugPrintVector(expectedCachedLowerRight); + DebugPrint("\n"); + DebugPrint("\n"); + }; + // TODO : replace nested if's + If(VerifySame(cachedUpperLeft, expectedCachedUpperLeft) == 0, [&]() { + If(VerifySame(cachedUpperRight, expectedCachedUpperRight) == 0, [&]() { + If(VerifySame(cachedLowerLeft, expectedCachedLowerLeft) == 0, [&]() { + If(VerifySame(cachedLowerRight, expectedCachedLowerRight) == 0, [&]() { + ok = 0; + }).Else(printError); + }).Else(printError); + }).Else(printError); + }).Else(printError); + return ok; +} + +Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(int M, int N, int cacheRows, int cacheCols, int stripeSize) +{ + auto input = MakeIncrementingMatrix(M, N, "input"); + auto output = MakeMatrix(M, N, "output"); + auto expectedOutput = MakeIncrementingMatrix(M, N, "expectedOutput"); + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, M) + .ForAll(j, 0, 
N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iCache = schedule.Split(i, cacheRows); + auto jCache = schedule.Split(j, cacheCols); + auto jStripe = schedule.Split(j, stripeSize); + + schedule.SetOrder({ iCache, jCache, jStripe, i, j }); + + BLASTCopy cachingProvider{}; + std::tuple blasTCopyExtras = { stripeSize, jStripe, BoundaryConditionHandling::ZeroPadding }; + schedule.Cache(cachingProvider, + input, + { i, j }, + { cacheRows, cacheCols }, + { iCache, jCache }, + std::nullopt, // Order isn't used by BLASTCopy + blasTCopyExtras); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + return VerifySame(output, expectedOutput); +} + +// input matrix rows evenly divides cache rows +// input matrix cols doesn't evenly divide cache cols +Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Test1() +{ + int M = 8; + int N = 7; // N doesn't evenly divide the number of cache columns + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + return BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize); +} + +// input matrix rows evenly divides cache rows +// input matrix cols doesn't evenly divide cache cols but does evenly divide stripe size +Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Test2() +{ + int M = 8; + int N = 6; // N doesn't evenly divide the number of cache columns, but does evenly divide stripe size + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + return BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize); +} + +// input matrix rows doesn't evenly divide cache rows +// input matrix cols doesn't evenly divide cache cols +Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Test3() +{ + int M = 6; + int N = 7; // N doesn't evenly divide the number of cache columns + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + return BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize); +} + +// input matrix rows doesn't evenly divide cache rows +// input matrix cols doesn't evenly divide cache cols but does evenly divide stripe size +Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Test4() +{ + int M = 6; + int N = 6; // N doesn't evenly divide the number of cache columns, but does evenly divide stripe size + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + return BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize); +} + +// input matrix rows evenly divides cache rows +// input matrix cols < cache cols, doesn't evenly divide stripe size +Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Test5() +{ + int M = 8; + int N = 3; // N < cache columns, doesn't evenly divide stripe size + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + return BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize); +} + +// input matrix rows evenly divides cache rows +// input matrix cols < cache cols, evenly divides stripe size +Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Test6() +{ + int M = 8; + int N = 2; // N < cache columns, does evenly divide stripe size + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + return BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize); +} +
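+// The remaining cases cover inputs with fewer rows or columns than a single cache
+// block. In every boundary case the effect is the same: a Split that does not
+// evenly divide its range produces a final partial block, and the ZeroPadding
+// condition fills the unused tail of the cache. A scalar sketch of the block
+// iteration these schedules reduce to (hypothetical helper, for illustration
+// only, not part of the library API):
+#if 0 // ILLUSTRATION
+void ForEachCacheBlock(int M, int N, int cacheRows, int cacheCols)
+{
+    for (int iOuter = 0; iOuter < M; iOuter += cacheRows) // iCache blocks
+    {
+        for (int jOuter = 0; jOuter < N; jOuter += cacheCols) // jCache blocks
+        {
+            // Boundary blocks are smaller than cacheRows x cacheCols; the cache
+            // itself keeps its full size and its unused tail stays zero.
+            int blockRows = std::min(cacheRows, M - iOuter);
+            int blockCols = std::min(cacheCols, N - jOuter);
+            (void)blockRows;
+            (void)blockCols;
+        }
+    }
+}
+#endif
+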
+// input matrix rows < cache rows +// input matrix cols < cache cols, doesn't evenly divide stripe size +Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Test7() +{ + int M = 3; + int N = 3; // N < cache columns, doesn't evenly divide stripe size + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + return BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize); +} + +// input matrix rows < cache rows +// input matrix cols < cache cols, does evenly divide stripe size +Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Test8() +{ + int M = 2; + int N = 2; // N < cache columns, does evenly divide stripe size + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + return BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize); +} + +// input matrix rows < cache rows +// input matrix cols multiple of cache cols +Scalar BLASTCOPY_ValidateOutput_BoundaryCondition_Test9() +{ + int M = 2; + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + return BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize); +} + +Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Runner(int M, int N, int cacheRows, int cacheCols, int stripeSize, Vector expectedCachedUpperLeft, Vector expectedCachedUpperRight, Vector expectedCachedLowerLeft, Vector expectedCachedLowerRight) +{ + auto input = MakeIncrementingMatrix(M, N, "input"); + auto output = MakeMatrix(M, N, "output"); + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, M) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto topLevelI = i; + auto topLevelJ = j; + + auto iCache = schedule.Split(i, cacheRows); + auto jCache = schedule.Split(j, cacheCols); + auto jStripe = schedule.Split(j, stripeSize); + + schedule.SetOrder({ iCache, jCache, jStripe, i, j }); + + BLASTCopy cachingProvider{}; + std::tuple blasTCopyExtras = { stripeSize, jStripe, BoundaryConditionHandling::ZeroPadding }; + schedule.Cache(cachingProvider, + input, + { i, j }, + { cacheRows, cacheCols }, + { iCache, jCache }, + std::nullopt, // Order isn't used by BLASTCopy + blasTCopyExtras); + + // Get a handle to the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + int rawCacheSize = (int)rawCacheValue.GetLayout().NumElements(); + + auto cachedUpperLeft = MakeVector(rawCacheSize); + auto cachedUpperRight = MakeVector(rawCacheSize); + auto cachedLowerLeft = MakeVector(rawCacheSize); + auto cachedLowerRight = MakeVector(rawCacheSize); + + // Add a low level API kernel to access the underlying cache after it has been filled + auto cacheSpyKernel = loopnests::Kernel("cache_spy_kernel") + .Inputs(rawCacheValue, cachedUpperLeft, cachedUpperRight, cachedLowerLeft, cachedLowerRight) + .Indices(topLevelI, topLevelJ) + .Define([cacheRows, cacheCols](Value rawCacheValue, Vector cachedUpperLeft, Vector cachedUpperRight, Vector cachedLowerLeft, Vector cachedLowerRight, Scalar i, Scalar j) { + auto cacheView = rawCacheValue; + cacheView.SetLayout({ { (int)rawCacheValue.GetLayout().NumElements() } }); + auto vectorCacheView = Vector(cacheView); + If(i == 0, + [&]() { + // TODO : remove nested if's + If(j == 0, + [&]() { + cachedUpperLeft = vectorCacheView; + }) + .ElseIf(j == cacheCols, + [&]() { + cachedUpperRight = vectorCacheView; + }); + }) + .ElseIf(i == cacheRows, + [&]() { + If(j == 0, [&]() { + cachedLowerLeft =
vectorCacheView; + }).ElseIf(j == cacheCols, [&]() { + cachedLowerRight = vectorCacheView; + }); + }); + }); + auto cacheSpyPosition = loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::body, { iCache, jCache }, {} }; + nest.GetUnderlyingLoopNest().AddKernel(cacheSpyKernel, cacheSpyPosition); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + auto ok = MakeScalar("ok"); + ok = 1; + auto printError = [&] { + DebugPrint("Upper Left:"); + DebugPrintVector(cachedUpperLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Upper Right:"); + DebugPrintVector(cachedUpperRight); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperRight); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Lower Left:"); + DebugPrintVector(cachedLowerLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedLowerLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Lower Right:"); + DebugPrintVector(cachedLowerRight); + DebugPrint("\n"); + DebugPrintVector(expectedCachedLowerRight); + DebugPrint("\n"); + DebugPrint("\n"); + }; + // TODO : replace nested if's + If(VerifySame(cachedUpperLeft, expectedCachedUpperLeft) == 0, [&]() { + If(VerifySame(cachedUpperRight, expectedCachedUpperRight) == 0, [&]() { + If(VerifySame(cachedLowerLeft, expectedCachedLowerLeft) == 0, [&]() { + If(VerifySame(cachedLowerRight, expectedCachedLowerRight) == 0, [&]() { + ok = 0; + }).Else(printError); + }).Else(printError); + }).Else(printError); + }).Else(printError); + return ok; +} + +Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Runner_LeftCachesOnly(int M, int N, int cacheRows, int cacheCols, int stripeSize, Vector expectedCachedUpperLeft, Vector expectedCachedLowerLeft) +{ + auto input = MakeIncrementingMatrix(M, N, "input"); + auto output = MakeMatrix(M, N, "output"); + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, M) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto topLevelI = i; + auto topLevelJ = j; + + auto iCache = schedule.Split(i, cacheRows); + auto jCache = schedule.Split(j, cacheCols); + auto jStripe = schedule.Split(j, stripeSize); + + schedule.SetOrder({ iCache, jCache, jStripe, i, j }); + + BLASTCopy cachingProvider{}; + std::tuple blasTCopyExtras = { stripeSize, jStripe, BoundaryConditionHandling::ZeroPadding }; + schedule.Cache(cachingProvider, + input, + { i, j }, + { cacheRows, cacheCols }, + { iCache, jCache }, + std::nullopt, // Order isn't used by BLASTCopy + blasTCopyExtras); + + // Get a handle to the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + int rawCacheSize = (int)rawCacheValue.GetLayout().NumElements(); + + // No right caches when N < cacheCols + auto cachedUpperLeft = MakeVector(rawCacheSize); + auto cachedLowerLeft = MakeVector(rawCacheSize); + + // Add a low level API kernel to access the underlying cache after it has been filled + auto cacheSpyKernel = loopnests::Kernel("cache_spy_kernel") + .Inputs(rawCacheValue, cachedUpperLeft, cachedLowerLeft) + .Indices(topLevelI, topLevelJ) + .Define([cacheRows](Value rawCacheValue, Vector cachedUpperLeft, Vector cachedLowerLeft, Scalar i, Scalar j) { + auto cacheView = rawCacheValue; + cacheView.SetLayout({ { (int)rawCacheValue.GetLayout().NumElements() } }); + auto vectorCacheView = 
Vector(cacheView); + If(i == 0, + [&]() { + // TODO : remove nested if's + If(j == 0, + [&]() { + cachedUpperLeft = vectorCacheView; + }); + }) + .ElseIf(i == cacheRows, + [&]() { + If(j == 0, [&]() { + cachedLowerLeft = vectorCacheView; + }); + }); + }); + auto cacheSpyPosition = loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::body, { iCache, jCache }, {} }; + nest.GetUnderlyingLoopNest().AddKernel(cacheSpyKernel, cacheSpyPosition); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + auto ok = MakeScalar("ok"); + ok = 1; + auto printError = [&] { + DebugPrint("Upper Left:"); + DebugPrintVector(cachedUpperLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Lower Left:"); + DebugPrintVector(cachedLowerLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedLowerLeft); + DebugPrint("\n"); + DebugPrint("\n"); + }; + // TODO : replace nested if's + If(VerifySame(cachedUpperLeft, expectedCachedUpperLeft) == 0, [&]() { + If(VerifySame(cachedLowerLeft, expectedCachedLowerLeft) == 0, [&]() { + ok = 0; + }).Else(printError); + }).Else(printError); + return ok; +} + +Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Runner_UpperCachesOnly(int M, int N, int cacheRows, int cacheCols, int stripeSize, Vector expectedCachedUpperLeft, Vector expectedCachedUpperRight) +{ + auto input = MakeIncrementingMatrix(M, N, "input"); + auto output = MakeMatrix(M, N, "output"); + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, M) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto topLevelI = i; + auto topLevelJ = j; + + auto iCache = schedule.Split(i, cacheRows); + auto jCache = schedule.Split(j, cacheCols); + auto jStripe = schedule.Split(j, stripeSize); + + schedule.SetOrder({ iCache, jCache, jStripe, i, j }); + + BLASTCopy cachingProvider{}; + std::tuple blasTCopyExtras = { stripeSize, jStripe, BoundaryConditionHandling::ZeroPadding }; + schedule.Cache(cachingProvider, + input, + { i, j }, + { cacheRows, cacheCols }, + { iCache, jCache }, + std::nullopt, // Order isn't used by BLASTCopy + blasTCopyExtras); + + // Get a handle to the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + int rawCacheSize = (int)rawCacheValue.GetLayout().NumElements(); + + auto cachedUpperLeft = MakeVector(rawCacheSize); + auto cachedUpperRight = MakeVector(rawCacheSize); + + // Add a low level API kernel to access the underlying cache after it has been filled + auto cacheSpyKernel = loopnests::Kernel("cache_spy_kernel") + .Inputs(rawCacheValue, cachedUpperLeft, cachedUpperRight) + .Indices(topLevelI, topLevelJ) + .Define([cacheCols](Value rawCacheValue, Vector cachedUpperLeft, Vector cachedUpperRight, Scalar i, Scalar j) { + auto cacheView = rawCacheValue; + cacheView.SetLayout({ { (int)rawCacheValue.GetLayout().NumElements() } }); + auto vectorCacheView = Vector(cacheView); + If(i == 0, + [&]() { + // TODO : remove nested if's + If(j == 0, + [&]() { + cachedUpperLeft = vectorCacheView; + }) + .ElseIf(j == cacheCols, + [&]() { + cachedUpperRight = vectorCacheView; + }); + }); + }); + auto cacheSpyPosition = loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::body, { iCache, jCache }, {} }; + nest.GetUnderlyingLoopNest().AddKernel(cacheSpyKernel, cacheSpyPosition); + 
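+    // As in the other runners, the spy kernel is constrained to the body of the
+    // (iCache, jCache) loops, so it runs once per cache block, after that block's
+    // cache has been filled, and snapshots the packed contents for the checks
+    // below. Only the i == 0 row of blocks is captured here because this runner
+    // is used for inputs where M <= cacheRows, so lower blocks never occur.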
+#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + auto ok = MakeScalar("ok"); + ok = 1; + auto printError = [&] { + DebugPrint("Upper Left:"); + DebugPrintVector(cachedUpperLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Upper Right:"); + DebugPrintVector(cachedUpperRight); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperRight); + DebugPrint("\n"); + DebugPrint("\n"); + }; + // TODO : replace nested if's + If(VerifySame(cachedUpperLeft, expectedCachedUpperLeft) == 0, [&]() { + If(VerifySame(cachedUpperRight, expectedCachedUpperRight) == 0, [&]() { + ok = 0; + }).Else(printError); + }).Else(printError); + return ok; +} + +Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Runner_UpperLeftCacheOnly(int M, int N, int cacheRows, int cacheCols, int stripeSize, Vector expectedCachedUpperLeft) +{ + auto input = MakeIncrementingMatrix(M, N, "input"); + auto output = MakeMatrix(M, N, "output"); + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, M) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto topLevelI = i; + auto topLevelJ = j; + + auto iCache = schedule.Split(i, cacheRows); + auto jCache = schedule.Split(j, cacheCols); + auto jStripe = schedule.Split(j, stripeSize); + + schedule.SetOrder({ iCache, jCache, jStripe, i, j }); + + BLASTCopy cachingProvider{}; + std::tuple blasTCopyExtras = { stripeSize, jStripe, BoundaryConditionHandling::ZeroPadding }; + schedule.Cache(cachingProvider, + input, + { i, j }, + { cacheRows, cacheCols }, + { iCache, jCache }, + std::nullopt, // Order isn't used by BLASTCopy + blasTCopyExtras); + + // Get a handle to the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + int rawCacheSize = (int)rawCacheValue.GetLayout().NumElements(); + + // No right caches when N < cacheCols + auto cachedUpperLeft = MakeVector(rawCacheSize); + + // Add a low level API kernel to access the underlying cache after it has been filled + auto cacheSpyKernel = loopnests::Kernel("cache_spy_kernel") + .Inputs(rawCacheValue, cachedUpperLeft) + .Indices(topLevelI, topLevelJ) + .Define([](Value rawCacheValue, Vector cachedUpperLeft, Scalar i, Scalar j) { + auto cacheView = rawCacheValue; + cacheView.SetLayout({ { (int)rawCacheValue.GetLayout().NumElements() } }); + auto vectorCacheView = Vector(cacheView); + If(i == 0, + [&]() { + // TODO : remove nested if's + If(j == 0, + [&]() { + cachedUpperLeft = vectorCacheView; + }); + }); + }); + auto cacheSpyPosition = loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::body, { iCache, jCache }, {} }; + nest.GetUnderlyingLoopNest().AddKernel(cacheSpyKernel, cacheSpyPosition); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + return VerifySame(cachedUpperLeft, expectedCachedUpperLeft); +} + +Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Test1() +{ + int M = 8; // M does evenly divide cache rows + int N = 7; // N doesn't evenly divide cache columns + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + // input + // A: + // [ 0, 1, 2, 3, 4, 5, 6], + // [ 7, 8, 9, 10, 11, 12, 13], + // [14, 15, 16, 17, 18, 19, 20], + // [21, 22, 23, 24, 25, 26, 27], + // [28, 29, 30, 31, 32, 33, 34], + // [35, 36, 37, 38, 39, 40, 41], + // [42, 43, 44, 45, 
46, 47, 48], + // [49, 50, 51, 52, 53, 54, 55] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 1, + 7, 8, + 14, 15, + 21, 22, + + 2, 3, + 9, 10, + 16, 17, + 23, 24, + }; + Vector expectedCachedUpperRight = + { + 4, 5, + 11, 12, + 18, 19, + 25, 26, + + 6, 0, + 13, 0, + 20, 0, + 27, 0 + }; + Vector expectedCachedLowerLeft = + { + 28, 29, + 35, 36, + 42, 43, + 49, 50, + + 30, 31, + 37, 38, + 44, 45, + 51, 52, + }; + Vector expectedCachedLowerRight = + { + 32, 33, + 39, 40, + 46, 47, + 53, 54, + + 34, 0, + 41, 0, + 48, 0, + 55, 0 + }; + // clang-format on + + return BLASTCOPY_ValidateMemory_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight, expectedCachedLowerLeft, expectedCachedLowerRight); +} + +Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Test2() +{ + int M = 8; // M does evenly divide cache rows + int N = 6; // N doesn't evenly divide cache columns, but does evenly divide stripe size + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + auto input = MakeIncrementingMatrix(M, N, "input"); + auto output = MakeMatrix(M, N, "output"); + + // input + // A: + // [ 0, 1, 2, 3, 4, 5], + // [ 6, 7, 8, 9, 10, 11], + // [12, 13, 14, 15, 16, 17], + // [18, 19, 20, 21, 22, 23], + // [24, 25, 26, 27, 28, 29], + // [30, 31, 32, 33, 34, 35], + // [36, 37, 38, 39, 40, 41], + // [42, 43, 44, 45, 46, 47] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 1, + 6, 7, + 12, 13, + 18, 19, + + 2, 3, + 8, 9, + 14, 15, + 20, 21, + }; + Vector expectedCachedUpperRight = + { + 4, 5, + 10, 11, + 16, 17, + 22, 23, + + 0, 0, + 0, 0, + 0, 0, + 0, 0 + }; + Vector expectedCachedLowerLeft = + { + 24, 25, + 30, 31, + 36, 37, + 42, 43, + + 26, 27, + 32, 33, + 38, 39, + 44, 45, + }; + Vector expectedCachedLowerRight = + { + 28, 29, + 34, 35, + 40, 41, + 46, 47, + + 0, 0, + 0, 0, + 0, 0, + 0, 0 + }; + // clang-format on + + return BLASTCOPY_ValidateMemory_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight, expectedCachedLowerLeft, expectedCachedLowerRight); +} + +// input matrix rows doesn't evenly divide cache rows +// input matrix cols doesn't evenly divide cache cols +Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Test3() +{ + int M = 6; + int N = 7; // N doesn't evenly divide the number of cache columns + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + // input + // A: + // [ 0, 1, 2, 3, 4, 5, 6], + // [ 7, 8, 9, 10, 11, 12, 13], + // [14, 15, 16, 17, 18, 19, 20], + // [21, 22, 23, 24, 25, 26, 27], + // [28, 29, 30, 31, 32, 33, 34], + // [35, 36, 37, 38, 39, 40, 41], + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 1, + 7, 8, + 14, 15, + 21, 22, + + 2, 3, + 9, 10, + 16, 17, + 23, 24, + }; + Vector expectedCachedUpperRight = + { + 4, 5, + 11, 12, + 18, 19, + 25, 26, + + 6, 0, + 13, 0, + 20, 0, + 27, 0 + }; + + // Check that it gets re-viewed correctly to keep the cached data contiguous + Vector expectedCachedLowerLeft = + { + 28, 29, + 35, 36, + 30, 31, + 37, 38, + + 0, 0, + 0, 0, + 0, 0, + 0, 0, + }; + Vector expectedCachedLowerRight = + { + 32, 33, + 39, 40, + 34, 0, + 41, 0, + + 0, 0, + 0, 0, + 0, 0, + 0, 0 + }; + // clang-format on + + return BLASTCOPY_ValidateMemory_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight, expectedCachedLowerLeft, expectedCachedLowerRight); +} +
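+// The expected vectors in these memory tests all follow the same rule: BLASTCopy
+// packs each stripeSize-wide column panel of a block contiguously, row-major
+// within the panel. When a block hangs off the bottom of the matrix, the panel
+// height shrinks to the number of valid rows (the re-viewing checked above) so
+// the cached data stays contiguous, and out-of-range columns are zero-padded.
+// A scalar reference that reproduces the vectors above (hypothetical helper,
+// for illustration only, not part of the library API):
+#if 0 // ILLUSTRATION
+std::vector<int> ReferenceBLASTCopyBlock(const std::vector<int>& A, int M, int N, int blockRow, int blockCol, int cacheRows, int cacheCols, int stripeSize)
+{
+    std::vector<int> cache(cacheRows * cacheCols, 0); // ZeroPadding fills the tail
+    int validRows = std::min(cacheRows, M - blockRow);
+    for (int s = 0; s < cacheCols / stripeSize; ++s) // one column panel per stripe
+    {
+        for (int r = 0; r < validRows; ++r)
+        {
+            for (int c = 0; c < stripeSize; ++c)
+            {
+                int col = blockCol + s * stripeSize + c;
+                cache[(s * validRows + r) * stripeSize + c] = col < N ? A[(blockRow + r) * N + col] : 0;
+            }
+        }
+    }
+    return cache;
+}
+#endif
+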
+// input matrix rows doesn't evenly divide cache rows +// input matrix cols doesn't evenly divide cache cols but does evenly divide stripe size +Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Test4() +{ + int M = 6; + int N = 6; // N doesn't evenly divide the number of cache columns, but does evenly divide stripe size + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + // input + // A: + // [ 0, 1, 2, 3, 4, 5], + // [ 6, 7, 8, 9, 10, 11], + // [12, 13, 14, 15, 16, 17], + // [18, 19, 20, 21, 22, 23], + // [24, 25, 26, 27, 28, 29], + // [30, 31, 32, 33, 34, 35] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 1, + 6, 7, + 12, 13, + 18, 19, + + 2, 3, + 8, 9, + 14, 15, + 20, 21, + }; + Vector expectedCachedUpperRight = + { + 4, 5, + 10, 11, + 16, 17, + 22, 23, + + 0, 0, + 0, 0, + 0, 0, + 0, 0 + }; + Vector expectedCachedLowerLeft = + { + 24, 25, + 30, 31, + 26, 27, + 32, 33, + + 0, 0, + 0, 0, + 0, 0, + 0, 0, + }; + Vector expectedCachedLowerRight = + { + 28, 29, + 34, 35, + 0, 0, + 0, 0, + + 0, 0, + 0, 0, + 0, 0, + 0, 0 + }; + // clang-format on + return BLASTCOPY_ValidateMemory_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight, expectedCachedLowerLeft, expectedCachedLowerRight); +} + +// input matrix rows evenly divides cache rows +// input matrix cols < cache cols, doesn't evenly divide stripe size +Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Test5() +{ + int M = 8; + int N = 3; // N < cache columns, doesn't evenly divide stripe size + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + // input + // A: + // [ 0, 1, 2], + // [ 3, 4, 5], + // [ 6, 7, 8], + // [ 9, 10, 11], + // [12, 13, 14], + // [15, 16, 17], + // [18, 19, 20], + // [21, 22, 23] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 1, + 3, 4, + 6, 7, + 9, 10, + + 2, 0, + 5, 0, + 8, 0, + 11, 0, + }; + Vector expectedCachedLowerLeft = + { + 12, 13, + 15, 16, + 18, 19, + 21, 22, + + 14, 0, + 17, 0, + 20, 0, + 23, 0, + }; + // clang-format on + + return BLASTCOPY_ValidateMemory_BoundaryCondition_Runner_LeftCachesOnly(M, N, cacheRows, cacheCols, stripeSize, expectedCachedUpperLeft, expectedCachedLowerLeft); +} + +// input matrix rows evenly divides cache rows +// input matrix cols < cache cols, evenly divides stripe size +Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Test6() +{ + int M = 8; + int N = 2; // N < cache columns, does evenly divide stripe size + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + // input + // A: + // [ 0, 1], + // [ 2, 3], + // [ 4, 5], + // [ 6, 7], + // [ 8, 9], + // [10, 11], + // [12, 13], + // [14, 15] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 1, + 2, 3, + 4, 5, + 6, 7, + + 0, 0, + 0, 0, + 0, 0, + 0, 0, + }; + Vector expectedCachedLowerLeft = + { + 8, 9, + 10, 11, + 12, 13, + 14, 15, + + 0, 0, + 0, 0, + 0, 0, + 0, 0, + }; + // clang-format on + return BLASTCOPY_ValidateMemory_BoundaryCondition_Runner_LeftCachesOnly(M, N, cacheRows, cacheCols, stripeSize, expectedCachedUpperLeft, expectedCachedLowerLeft); +} + +// input matrix rows < cache rows +// input matrix cols < cache cols, doesn't evenly divide stripe size +Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Test7() +{ + int M = 3; + int N = 3; // N < cache columns, doesn't evenly divide stripe size + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + // input + // A: + // [0, 1, 2], + // [3, 4, 5], + // [6, 7, 8] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 1, + 3, 4, + 6, 7, + 2, 0, + + 5, 0, +
8, 0, + 0, 0, + 0, 0 + }; + // clang-format on + return BLASTCOPY_ValidateMemory_BoundaryCondition_Runner_UpperLeftCacheOnly(M, N, cacheRows, cacheCols, stripeSize, expectedCachedUpperLeft); +} + +// input matrix rows < cache rows +// input matrix cols < cache cols, does evenly divide stripe size +Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Test8() +{ + int M = 2; + int N = 2; // N < cache columns, does evenly divide stripe size + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + // input + // A: + // [ 0, 1], + // [ 2, 3] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 1, + 2, 3, + 0, 0, + 0, 0, + + 0, 0, + 0, 0, + 0, 0, + 0, 0, + }; + // clang-format on + return BLASTCOPY_ValidateMemory_BoundaryCondition_Runner_UpperLeftCacheOnly(M, N, cacheRows, cacheCols, stripeSize, expectedCachedUpperLeft); +} + +// input matrix rows < cache rows +// input matrix cols multiple of cache cols +Scalar BLASTCOPY_ValidateMemory_BoundaryCondition_Test9() +{ + int M = 2; + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + // input + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7], + // [ 8, 9, 10, 11, 12, 13, 14, 15] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 1, + 8, 9, + 2, 3, + 10, 11, + + 0, 0, + 0, 0, + 0, 0, + 0, 0, + }; + Vector expectedCachedUpperRight = + { + 4, 5, + 12, 13, + 6, 7, + 14, 15, + + 0, 0, + 0, 0, + 0, 0, + 0, 0 + }; + // clang-format on + return BLASTCOPY_ValidateMemory_BoundaryCondition_Runner_UpperCachesOnly(M, N, cacheRows, cacheCols, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight); +} + +// Convolution caching tests + +// General Caching Strategy + +Scalar GeneralCachingStrategy_ValidateOutput_Test1() +{ + // Square matrix tiling + loopnests::Index i("i"), j("j"); + + const int Rows = 8; + const int Columns = 8; + const int SplitSize = 4; + + auto input = MakeIncrementingMatrix(Rows, Columns, "input"); + auto output = MakeMatrix(Rows, Columns, "output"); + + // Define LoopNest + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, Rows) + .ForAll(j, 0, Columns) + .Do([=](Matrix input_, Matrix output_, Scalar i_, Scalar j_) { + output_(i_, j_) = input_(i_, j_); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iBlock = schedule.Split(i, SplitSize); + auto jBlock = schedule.Split(j, SplitSize); + + std::vector orderedIndices = { iBlock, jBlock, i, j }; + schedule.SetOrder(orderedIndices); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = SplitSize * SplitSize; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + + schedule.Cache(input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + +#if 0 // DEBUGGING + auto loop = nest.GetUnderlyingLoopNest(); + DebugDump(loop); +#endif + + nest.Run(); + + return VerifySame(output, input); +} + +Scalar GeneralCachingStrategy_ValidateMemory_Test1() +{ + // Square matrix tiling + loopnests::Index i("i"), j("j"); + + const int Rows = 8; + const int Columns = 8; + const int SplitSize = 4; + + // input + // [ 0, 1, 2, 3, 4, 5, 6, 7] + // [ 8, 9, 10, 11, 12, 13, 14, 15] + // [16, 17, 18, 19, 20, 21, 22, 23] + // [24, 25, 26, 27, 28, 29, 30, 31] + // [32, 33, 34, 35, 36, 37, 38, 39] + // [40, 41, 42, 43, 44,
45, 46, 47] + // [48, 49, 50, 51, 52, 53, 54, 55] + // [56, 57, 58, 59, 60, 61, 62, 63] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 1, 2, 3, + 8, 9, 10, 11, + 16, 17, 18, 19, + 24, 25, 26, 27 + }; + Vector expectedCachedLowerLeft = + { + 32, 33, 34, 35, + 40, 41, 42, 43, + 48, 49, 50, 51, + 56, 57, 58, 59 + }; + Vector expectedCachedUpperRight = + { + 4, 5, 6, 7, + 12, 13, 14, 15, + 20, 21, 22, 23, + 28, 29, 30, 31 + }; + Vector expectedCachedLowerRight = + { + 36, 37, 38, 39, + 44, 45, 46, 47, + 52, 53, 54, 55, + 60, 61, 62, 63 + }; + // clang-format on + auto input = MakeIncrementingMatrix(Rows, Columns, "input"); + auto output = MakeMatrix(Rows, Columns, "output"); + + // Define LoopNest + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, Rows) + .ForAll(j, 0, Columns) + .Do([=](Matrix input_, Matrix output_, Scalar i_, Scalar j_) { + output_(i_, j_) = input_(i_, j_); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iBlock = schedule.Split(i, SplitSize); + auto jBlock = schedule.Split(j, SplitSize); + + schedule.SetOrder({ iBlock, jBlock, i, j }); + + GeneralCachingStrategy cachingProvider{}; + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = SplitSize * SplitSize; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + schedule.Cache(cachingProvider, + input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + + // Get a handle to the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + int rawCacheSize = (int)rawCacheValue.GetLayout().NumElements(); + + auto cachedUpperLeft = MakeVector(rawCacheSize); + auto cachedUpperRight = MakeVector(rawCacheSize); + auto cachedLowerLeft = MakeVector(rawCacheSize); + auto cachedLowerRight = MakeVector(rawCacheSize); + + // Add a low level API kernel to access the underlying cache after it has been filled + auto cacheSpyKernel = loopnests::Kernel("cache_spy_kernel") + .Inputs(rawCacheValue, cachedUpperLeft, cachedUpperRight, cachedLowerLeft, cachedLowerRight) + .Indices(iTopLevel, jTopLevel) + .Define([=](Value rawCacheValue, Vector cachedUpperLeft, Vector cachedUpperRight, Vector cachedLowerLeft, Vector cachedLowerRight, Scalar i, Scalar j) { + auto cacheView = rawCacheValue; + cacheView.SetLayout({ { (int)rawCacheValue.GetLayout().NumElements() } }); + auto vectorCacheView = Vector(cacheView); + If(i == 0, + [&]() { + // TODO : remove nested if's + If(j == 0, + [&]() { + cachedUpperLeft = vectorCacheView; + }) + .ElseIf(j == SplitSize, + [&]() { + cachedUpperRight = vectorCacheView; + }); + }) + .ElseIf(i == SplitSize, + [&]() { + If(j == 0, [&]() { + cachedLowerLeft = vectorCacheView; + }).ElseIf(j == SplitSize, [&]() { + cachedLowerRight = vectorCacheView; + }); + }); + }); + auto cacheSpyPosition = loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::epilogue, { iBlock, jBlock }, {} }; + nest.GetUnderlyingLoopNest().AddKernel(cacheSpyKernel, cacheSpyPosition); + + nest.Run(); + + auto ok = MakeScalar("ok"); + ok = 1; + auto printError = [&] { + DebugPrint("Upper Left:"); + DebugPrintVector(cachedUpperLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Upper Right:"); 
+ DebugPrintVector(cachedUpperRight); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperRight); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Lower Left:"); + DebugPrintVector(cachedLowerLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedLowerLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Lower Right:"); + DebugPrintVector(cachedLowerRight); + DebugPrint("\n"); + DebugPrintVector(expectedCachedLowerRight); + DebugPrint("\n"); + DebugPrint("\n"); + }; + // TODO : replace nested if's + If(VerifySame(cachedUpperLeft, expectedCachedUpperLeft) == 0, [&]() { + If(VerifySame(cachedUpperRight, expectedCachedUpperRight) == 0, [&]() { + If(VerifySame(cachedLowerLeft, expectedCachedLowerLeft) == 0, [&]() { + If(VerifySame(cachedLowerRight, expectedCachedLowerRight) == 0, [&]() { + ok = 0; + }).Else(printError); + }).Else(printError); + }).Else(printError); + }).Else(printError); + return ok; +} + +Scalar GeneralCachingStrategy_ValidateOutput_Test2() +{ + // BLASTCopy caching + loopnests::Index i("i"), j("j"); + + const int Rows = 16; + const int Columns = 16; + const int InputCacheRows = 8; + const int InputCacheCols = 8; + const int StripeSize = 4; + const int VecSize = 2; + + auto input = MakeIncrementingMatrix(Rows, Columns, "input"); + auto output = MakeMatrix(Rows, Columns, "output"); + + // Define LoopNest + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, Rows) + .ForAll(j, 0, Columns) + .Do([=](Matrix input_, Matrix output_, Scalar i_, Scalar j_) { + output_(i_, j_) = input_(i_, j_); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iBlock = schedule.Split(i, InputCacheRows); + auto jBlock = schedule.Split(j, InputCacheCols); + auto jStripe = schedule.Split(j, StripeSize); + auto jVec = schedule.Split(j, VecSize); + + std::vector orderedIndices = { jBlock, + iBlock, + jStripe, + i, + jVec, + j }; + schedule.SetOrder(orderedIndices); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = InputCacheRows * InputCacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + schedule.Cache(input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + +#if 0 // DEBUGGING + auto loop = nest.GetUnderlyingLoopNest(); + DebugDump(loop); +#endif + + nest.Run(); + + return VerifySame(output, input); +} + +Scalar GeneralCachingStrategy_ValidateOutput_Test3() +{ + // Progressive BLASTCopy caching + loopnests::Index i("i"), j("j"); + + const int Rows = 16; + const int Columns = 16; + const int InputCacheRows = 8; + const int InputCacheCols = 8; + const int StripeSize = 4; + const int VecSize = 2; + + auto input = MakeIncrementingMatrix(Rows, Columns, "input"); + auto output = MakeMatrix(Rows, Columns, "output"); + + // Define LoopNest + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, Rows) + .ForAll(j, 0, Columns) + .Do([=](Matrix input_, Matrix output_, Scalar i_, Scalar j_) { + output_(i_, j_) = input_(i_, j_); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iBlock = schedule.Split(i, InputCacheRows); + auto jBlock = schedule.Split(j, InputCacheCols); + auto jStripe = schedule.Split(j, StripeSize); + auto 
jVec = schedule.Split(j, VecSize); + + std::vector orderedIndices = { jBlock, + iBlock, + jStripe, + i, + jVec, + j }; + schedule.SetOrder(orderedIndices); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = InputCacheRows * InputCacheCols; + size_t fillThreshold = InputCacheRows * StripeSize; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + schedule.Cache(input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + +#if 0 // DEBUGGING + auto loop = nest.GetUnderlyingLoopNest(); + DebugDump(loop); +#endif + + nest.Run(); + + return VerifySame(output, input); +} + +Scalar GeneralCachingStrategy_ValidateOutput_Test4() +{ + // BLASTCopy caching with boundary condition on rows + loopnests::Index i("i"), j("j"); + + const int Rows = 15; + const int Columns = 16; + const int InputCacheRows = 8; + const int InputCacheCols = 8; + const int StripeSize = 4; + const int VecSize = 2; + + auto input = MakeIncrementingMatrix(Rows, Columns, "input"); + auto output = MakeMatrix(Rows, Columns, "output"); + + // Define LoopNest + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, Rows) + .ForAll(j, 0, Columns) + .Do([=](Matrix input_, Matrix output_, Scalar i_, Scalar j_) { + output_(i_, j_) = input_(i_, j_); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iBlock = schedule.Split(i, InputCacheRows); + auto jBlock = schedule.Split(j, InputCacheCols); + auto jStripe = schedule.Split(j, StripeSize); + auto jVec = schedule.Split(j, VecSize); + + std::vector orderedIndices = { jBlock, + iBlock, + jStripe, + i, + jVec, + j }; + schedule.SetOrder(orderedIndices); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = InputCacheRows * InputCacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + schedule.Cache(input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + +#if 0 // DEBUGGING + auto loop = nest.GetUnderlyingLoopNest(); + DebugDump(loop); +#endif + + nest.Run(); + + return VerifySame(output, input); +} + +Scalar GeneralCachingStrategy_ValidateOutput_Test5() +{ + // Square output cache + loopnests::Index i("i"), j("j"); + + const int Rows = 8; + const int Columns = 8; + const int OutputCacheRows = 2; + const int OutputCacheCols = 2; + + auto input = MakeIncrementingMatrix(Rows, Columns, "input"); + auto output = MakeMatrix(Rows, Columns, "output"); + + // Define LoopNest + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, Rows) + .ForAll(j, 0, Columns) + .Do([=](Matrix input_, Matrix output_, Scalar i_, Scalar j_) { + output_(i_, j_) = input_(i_, j_); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iOutput = schedule.Split(i, OutputCacheRows); + auto jOutput = schedule.Split(j, OutputCacheCols); + + std::vector orderedIndices = { iOutput, jOutput, i, j }; + schedule.SetOrder(orderedIndices); + + ArgumentType argType = ArgumentType::Output; + std::string cacheName = "cacheOutput"; + + size_t maxCacheElts = OutputCacheRows * 
OutputCacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + schedule.Cache(output, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + +#if 0 // DEBUGGING + auto loop = nest.GetUnderlyingLoopNest(); + DebugDump(loop); +#endif + + nest.Run(); + + return VerifySame(output, input); +} + +Scalar GeneralCachingStrategy_ValidateOutput_Test6() +{ + // Rectangular output cache + loopnests::Index i("i"), j("j"); + + const int Rows = 8; + const int Columns = 8; + const int OutputCacheRows = 4; + const int OutputCacheCols = 2; + + auto input = MakeIncrementingMatrix(Rows, Columns, "input"); + auto output = MakeMatrix(Rows, Columns, "output"); + + // Define LoopNest + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, Rows) + .ForAll(j, 0, Columns) + .Do([=](Matrix input_, Matrix output_, Scalar i_, Scalar j_) { + output_(i_, j_) = input_(i_, j_); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iOutput = schedule.Split(i, OutputCacheRows); + auto jOutput = schedule.Split(j, OutputCacheCols); + + std::vector orderedIndices = { iOutput, jOutput, i, j }; + schedule.SetOrder(orderedIndices); + + ArgumentType argType = ArgumentType::Output; + std::string cacheName = "cacheOutput"; + + size_t maxCacheElts = OutputCacheRows * OutputCacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + schedule.Cache(output, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + +#if 0 // DEBUGGING + auto loop = nest.GetUnderlyingLoopNest(); + DebugDump(loop); +#endif + + nest.Run(); + + return VerifySame(output, input); +} + +Scalar GeneralCachingStrategy_ValidateOutput_Test7() +{ + // Square matrix tiling with square output cache + loopnests::Index i("i"), j("j"); + + const int Rows = 8; + const int Columns = 8; + const int InputCacheRows = 4; + const int InputCacheCols = 4; + const int OutputCacheRows = 2; + const int OutputCacheCols = 2; + + auto input = MakeIncrementingMatrix(Rows, Columns, "input"); + auto output = MakeMatrix(Rows, Columns, "output"); + + // Define LoopNest + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, Rows) + .ForAll(j, 0, Columns) + .Do([=](Matrix input_, Matrix output_, Scalar i_, Scalar j_) { + output_(i_, j_) = input_(i_, j_); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iBlock = schedule.Split(i, InputCacheRows); + auto jBlock = schedule.Split(j, InputCacheCols); + auto iOutput = schedule.Split(i, OutputCacheRows); + auto jOutput = schedule.Split(j, OutputCacheCols); + + std::vector orderedIndices = { iBlock, jBlock, iOutput, jOutput, i, j }; + schedule.SetOrder(orderedIndices); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + + size_t maxCacheElts = InputCacheRows * InputCacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + schedule.Cache(input, + { iTopLevel, jTopLevel }, + {}, + 
{}, + std::nullopt, + extraCacheParams); + + ArgumentType output_argType = ArgumentType::Output; + std::string output_cacheName = "cacheOutput"; + + size_t output_maxCacheElts = OutputCacheRows * OutputCacheCols; + size_t output_fillThreshold = output_maxCacheElts; + std::function output_reduceFunction = CopyReduce; + auto output_extraCacheParams = std::make_tuple(output_argType, + output_cacheName, + output_maxCacheElts, + output_fillThreshold, + output_reduceFunction, + false); + schedule.Cache(output, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + output_extraCacheParams); + +#if 0 // DEBUGGING + auto loop = nest.GetUnderlyingLoopNest(); + DebugDump(loop); +#endif + + nest.Run(); + + return VerifySame(output, input); +} + +Scalar GeneralCachingStrategy_ValidateOutput_Test8() +{ + // Rectangular matrix input tiling with different rectangular output cache + loopnests::Index i("i"), j("j"); + + const int Rows = 8; + const int Columns = 8; + const int InputCacheRows = 4; + const int InputCacheCols = 2; + const int OutputCacheRows = 2; + const int OutputCacheCols = 4; + + auto input = MakeIncrementingMatrix(Rows, Columns, "input"); + auto output = MakeMatrix(Rows, Columns, "output"); + + // Define LoopNest + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, Rows) + .ForAll(j, 0, Columns) + .Do([=](Matrix input_, Matrix output_, Scalar i_, Scalar j_) { + output_(i_, j_) = input_(i_, j_); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iBlock = schedule.Split(i, InputCacheRows); + auto iOutput = schedule.Split(i, OutputCacheRows); + auto jOutput = schedule.Split(j, OutputCacheCols); + auto jBlock = schedule.Split(j, InputCacheCols); + + std::vector orderedIndices = { iBlock, iOutput, jOutput, jBlock, i, j }; + schedule.SetOrder(orderedIndices); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + + size_t maxCacheElts = InputCacheRows * InputCacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + schedule.Cache(input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + + ArgumentType output_argType = ArgumentType::Output; + std::string output_cacheName = "cacheOutput"; + + size_t output_maxCacheElts = OutputCacheRows * OutputCacheCols; + size_t output_fillThreshold = output_maxCacheElts; + std::function output_reduceFunction = CopyReduce; + auto output_extraCacheParams = std::make_tuple(output_argType, + output_cacheName, + output_maxCacheElts, + output_fillThreshold, + output_reduceFunction, + false); + schedule.Cache(output, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + output_extraCacheParams); + +#if 0 // DEBUGGING + auto loop = nest.GetUnderlyingLoopNest(); + DebugDump(loop); +#endif + + nest.Run(); + + return VerifySame(output, input); +} + +Scalar GeneralCachingStrategy_ValidateOutput_Test9() +{ + // BLASTCopy input caching with square output cache + loopnests::Index i("i"), j("j"); + + const int Rows = 16; + const int Columns = 16; + const int InputCacheRows = 8; + const int InputCacheCols = 8; + const int StripeSize = 4; + const int VecSize = 2; + const int OutputCacheRows = 2; + const int OutputCacheCols = 2; + + auto input = MakeIncrementingMatrix(Rows, Columns, "input"); + auto output = MakeMatrix(Rows, Columns, "output"); 
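+    // Note on the six-element extraCacheParams tuple used throughout these
+    // tests. Reading the fields off of how the tests vary them (an inference,
+    // not a documented contract), the tuple is
+    //     (argType, cacheName, maxCacheElts, fillThreshold, reduceFunction, flag)
+    // where maxCacheElts bounds the cache footprint, fillThreshold controls
+    // how many elements each fill brings in (equal to maxCacheElts for
+    // all-at-once filling, smaller in the tests that fill progressively),
+    // and reduceFunction (always CopyReduce here) merges cached values back
+    // into the source view. The trailing boolean is false in every test here.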
+ + // Define LoopNest + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, Rows) + .ForAll(j, 0, Columns) + .Do([=](Matrix input_, Matrix output_, Scalar i_, Scalar j_) { + output_(i_, j_) = input_(i_, j_); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iBlock = schedule.Split(i, InputCacheRows); + auto iOutput = schedule.Split(i, OutputCacheRows); + auto jBlock = schedule.Split(j, InputCacheCols); + auto jStripe = schedule.Split(j, StripeSize); + auto jOutput = schedule.Split(j, OutputCacheCols); + auto jVec = schedule.Split(j, VecSize); + + std::vector orderedIndices = { jBlock, + iBlock, + jStripe, + iOutput, + jOutput, + i, + jVec, + j }; + schedule.SetOrder(orderedIndices); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = InputCacheRows * InputCacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + schedule.Cache(input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + + ArgumentType output_argType = ArgumentType::Output; + std::string output_cacheName = "cacheOutput"; + + size_t output_maxCacheElts = OutputCacheRows * OutputCacheCols; + size_t output_fillThreshold = output_maxCacheElts; + std::function output_reduceFunction = CopyReduce; + auto output_extraCacheParams = std::make_tuple(output_argType, + output_cacheName, + output_maxCacheElts, + output_fillThreshold, + output_reduceFunction, + false); + schedule.Cache(output, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + output_extraCacheParams); + +#if 0 // DEBUGGING + auto loop = nest.GetUnderlyingLoopNest(); + DebugDump(loop); +#endif + + nest.Run(); + + return VerifySame(output, input); +} + +Scalar GeneralCachingStrategy_ValidateOutput_Test10() +{ + // BLASTCopy input caching with rectangular output cache + loopnests::Index i("i"), j("j"); + + const int Rows = 16; + const int Columns = 16; + const int InputCacheRows = 8; + const int InputCacheCols = 8; + const int StripeSize = 4; + const int VecSize = 2; + const int OutputCacheRows = 2; + const int OutputCacheCols = 4; + + auto input = MakeIncrementingMatrix(Rows, Columns, "input"); + auto output = MakeMatrix(Rows, Columns, "output"); + + // Define LoopNest + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, Rows) + .ForAll(j, 0, Columns) + .Do([=](Matrix input_, Matrix output_, Scalar i_, Scalar j_) { + output_(i_, j_) = input_(i_, j_); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iBlock = schedule.Split(i, InputCacheRows); + auto iOutput = schedule.Split(i, OutputCacheRows); + auto jBlock = schedule.Split(j, InputCacheCols); + auto jStripe = schedule.Split(j, StripeSize); + auto jOutput = schedule.Split(j, OutputCacheCols); + auto jVec = schedule.Split(j, VecSize); + + std::vector orderedIndices = { jBlock, + iBlock, + jStripe, + iOutput, + jOutput, + i, + jVec, + j }; + schedule.SetOrder(orderedIndices); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = InputCacheRows * InputCacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + 
cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + schedule.Cache(input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + + ArgumentType output_argType = ArgumentType::Output; + std::string output_cacheName = "cacheOutput"; + + size_t output_maxCacheElts = OutputCacheRows * OutputCacheCols; + size_t output_fillThreshold = output_maxCacheElts; + std::function output_reduceFunction = CopyReduce; + auto output_extraCacheParams = std::make_tuple(output_argType, + output_cacheName, + output_maxCacheElts, + output_fillThreshold, + output_reduceFunction, + false); + schedule.Cache(output, + { iTopLevel, jTopLevel }, + {}, + { iOutput }, + std::nullopt, + output_extraCacheParams); + +#if 0 // DEBUGGING + auto loop = nest.GetUnderlyingLoopNest(); + DebugDump(loop); +#endif + + nest.Run(); + + return VerifySame(output, input); +} + +Scalar GeneralCachingStrategy_ValidateOutput_Test11() +{ + // BLASTCopy output caching + loopnests::Index i("i"), j("j"); + + const int Rows = 16; + const int Columns = 16; + const int CacheRows = 8; + const int CacheCols = 8; + const int StripeSize = 4; + const int VecSize = 2; + + auto input = MakeIncrementingMatrix(Rows, Columns, "input"); + auto output = MakeMatrix(Rows, Columns, "output"); + + // Define LoopNest + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, Rows) + .ForAll(j, 0, Columns) + .Do([=](Matrix input_, Matrix output_, Scalar i_, Scalar j_) { + output_(i_, j_) = input_(i_, j_); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iBlock = schedule.Split(i, CacheRows); + auto jBlock = schedule.Split(j, CacheCols); + auto jStripe = schedule.Split(j, StripeSize); + auto jVec = schedule.Split(j, VecSize); + + std::vector orderedIndices = { jBlock, + iBlock, + jStripe, + i, + jVec, + j }; + schedule.SetOrder(orderedIndices); + + ArgumentType argType = ArgumentType::Output; + std::string cacheName = "cacheOutput"; + size_t maxCacheElts = CacheRows * CacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + schedule.Cache(output, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + +#if 0 // DEBUGGING + auto loop = nest.GetUnderlyingLoopNest(); + DebugDump(loop); +#endif + + nest.Run(); + + return VerifySame(output, input); +} + +Scalar GeneralCachingStrategy_ValidateOutput_Test12() +{ + // BLASTCopy input caching with same BLASTCopy output caching + loopnests::Index i("i"), j("j"); + + const int Rows = 16; + const int Columns = 16; + const int CacheRows = 8; + const int CacheCols = 8; + const int StripeSize = 4; + const int VecSize = 2; + + auto input = MakeIncrementingMatrix(Rows, Columns, "input"); + auto output = MakeMatrix(Rows, Columns, "output"); + + // Define LoopNest + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, Rows) + .ForAll(j, 0, Columns) + .Do([=](Matrix input_, Matrix output_, Scalar i_, Scalar j_) { + output_(i_, j_) = input_(i_, j_); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iBlock = schedule.Split(i, CacheRows); + + auto jBlock = schedule.Split(j, CacheCols); + auto jStripe = schedule.Split(j, StripeSize); + auto jVec = schedule.Split(j, VecSize); + + std::vector 
orderedIndices = { jBlock, + iBlock, + jStripe, + i, + jVec, + j }; + schedule.SetOrder(orderedIndices); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = CacheRows * CacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + schedule.Cache(input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + + ArgumentType output_argType = ArgumentType::Output; + std::string output_cacheName = "cacheOutput"; + auto output_extraCacheParams = std::make_tuple(output_argType, + output_cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + schedule.Cache(output, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + output_extraCacheParams); + +#if 0 // DEBUGGING + auto loop = nest.GetUnderlyingLoopNest(); + DebugDump(loop); +#endif + + nest.Run(); + + return VerifySame(output, input); +} + +Scalar GeneralCachingStrategy_ValidateOutput_Test13() +{ + // BLASTCopy input caching with different BLASTCopy output caching + loopnests::Index i("i"), j("j"); + + const int Rows = 32; + const int Columns = 32; + const int InputCacheRows = 16; + const int InputCacheCols = 16; + const int InputStripeSize = 8; + const int VecSize = 2; + const int OutputCacheRows = 8; + const int OutputCacheCols = InputStripeSize; // == InputStripeSize and in same dimension + const int OutputStripeSize = 4; + + auto input = MakeIncrementingMatrix(Rows, Columns, "input"); + auto output = MakeMatrix(Rows, Columns, "output"); + + // Define LoopNest + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, Rows) + .ForAll(j, 0, Columns) + .Do([=](Matrix input_, Matrix output_, Scalar i_, Scalar j_) { + output_(i_, j_) = input_(i_, j_); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iBlock = schedule.Split(i, InputCacheRows); + auto iOutput = schedule.Split(i, OutputCacheRows); + + auto jBlock = schedule.Split(j, InputCacheCols); + auto jOutput = schedule.Split(j, OutputCacheCols); + auto jInputStripe = jOutput; // Split by the same amount in the same dimension + auto jOutputStripe = schedule.Split(j, OutputStripeSize); + auto jVec = schedule.Split(j, VecSize); + + std::vector orderedIndices = { jBlock, + iBlock, + jOutput, + iOutput, + jOutputStripe, + i, + jVec, + j }; + schedule.SetOrder(orderedIndices); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = InputCacheRows * InputCacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + schedule.Cache(input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + + ArgumentType output_argType = ArgumentType::Output; + std::string output_cacheName = "cacheOutput"; + + size_t output_maxCacheElts = OutputCacheRows * OutputCacheCols; + size_t output_fillThreshold = output_maxCacheElts; + std::function output_reduceFunction = CopyReduce; + auto output_extraCacheParams = std::make_tuple(output_argType, + output_cacheName, + output_maxCacheElts, + output_fillThreshold, + output_reduceFunction, + false); + schedule.Cache(output, + { iTopLevel, jTopLevel }, + {}, + {}, + 
std::nullopt, + output_extraCacheParams); + +#if 0 // DEBUGGING + auto loop = nest.GetUnderlyingLoopNest(); + DebugDump(loop); +#endif + + nest.Run(); + + return VerifySame(output, input); +} + +Scalar GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput(int rows, int columns, int outputCacheRows, int outputCacheColumns) +{ + // Square output cache + loopnests::Index i("i"), j("j"); + + auto input = MakeIncrementingMatrix(rows, columns, "input"); + auto output = MakeMatrix(rows, columns, "output"); + + // Define LoopNest + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, rows) + .ForAll(j, 0, columns) + .Do([=](Matrix input_, Matrix output_, Scalar i_, Scalar j_) { + output_(i_, j_) = input_(i_, j_); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iOutput = schedule.Split(i, outputCacheRows); + auto jOutput = schedule.Split(j, outputCacheColumns); + + std::vector orderedIndices = { iOutput, jOutput, i, j }; + schedule.SetOrder(orderedIndices); + + ArgumentType argType = ArgumentType::Output; + std::string cacheName = "cacheOutput"; + + size_t maxCacheElts = outputCacheRows * outputCacheColumns; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + schedule.Cache(output, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + +#if 0 // DEBUGGING + auto loop = nest.GetUnderlyingLoopNest(); + DebugDump(loop); +#endif + + nest.Run(); + + return VerifySame(output, input); +} + +Scalar GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test1() +{ + const int Rows = 8; + const int Columns = 8; + const int CacheRows = 2; + const int CacheColumns = 3; + return GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput(Rows, Columns, CacheRows, CacheColumns); +} + +Scalar GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test2() +{ + const int Rows = 8; + const int Columns = 8; + const int CacheRows = 3; + const int CacheColumns = 2; + return GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput(Rows, Columns, CacheRows, CacheColumns); +} + +Scalar GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test3() +{ + const int Rows = 8; + const int Columns = 8; + const int CacheRows = 3; + const int CacheColumns = 3; + return GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput(Rows, Columns, CacheRows, CacheColumns); +} + +Scalar GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test4() +{ + const int Rows = 8; + const int Columns = 8; + const int CacheRows = 4; + const int CacheColumns = 5; + return GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput(Rows, Columns, CacheRows, CacheColumns); +} + +Scalar GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test5() +{ + const int Rows = 8; + const int Columns = 8; + const int CacheRows = 5; + const int CacheColumns = 4; + return GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput(Rows, Columns, CacheRows, CacheColumns); +} + +Scalar GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test6() +{ + const int Rows = 8; + const int Columns = 8; + const int CacheRows = 5; + const int CacheColumns = 5; + return GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput(Rows, Columns, CacheRows, CacheColumns); +} + +Scalar 
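+// Tests 1-6 above keep the matrix fixed at 8x8 and vary the output cache
+// dimensions (2x3, 3x2, 3x3, 4x5, 5x4, 5x5) so that the cache tile doesn't
+// evenly divide the iteration space. Tests 7-9 below fix the cache at 2x2 and
+// instead put the boundary in the matrix shape itself (7 rows and/or columns).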
GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test7() +{ + const int Rows = 8; + const int Columns = 7; + const int CacheRows = 2; + const int CacheColumns = 2; + return GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput(Rows, Columns, CacheRows, CacheColumns); +} + +Scalar GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test8() +{ + const int Rows = 7; + const int Columns = 8; + const int CacheRows = 2; + const int CacheColumns = 2; + return GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput(Rows, Columns, CacheRows, CacheColumns); +} + +Scalar GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test9() +{ + const int Rows = 7; + const int Columns = 7; + const int CacheRows = 2; + const int CacheColumns = 2; + return GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput(Rows, Columns, CacheRows, CacheColumns); +} + +// BLASTCOPY tests from above with GeneralCachingStrategy + +Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_Test1() +{ + int N = 8; + int cacheRows = N; + int cacheCols = N; + int stripeSize = 4; + + auto input = MakeIncrementingMatrix(N, N, "input"); + auto output = MakeMatrix(N, N, "output"); + auto expectedOutput = MakeIncrementingMatrix(N, N, "expectedOutput"); + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iCache = schedule.Split(i, cacheRows); + auto jCache = schedule.Split(j, cacheCols); + auto jStripe = schedule.Split(j, stripeSize); + + schedule.SetOrder({ iCache, jCache, jStripe, i, j }); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = cacheRows * cacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + schedule.Cache(input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + return VerifySame(output, expectedOutput); +} + +// Test with smaller cache and stripe size than previous test +Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_Test2() +{ + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + // input, expectedOutput + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7] + // [ 8, 9, 10, 11, 12, 13, 14, 15] + // [16, 17, 18, 19, 20, 21, 22, 23] + // [24, 25, 26, 27, 28, 29, 30, 31] + // [32, 33, 34, 35, 36, 37, 38, 39] + // [40, 41, 42, 43, 44, 45, 46, 47] + // [48, 49, 50, 51, 52, 53, 54, 55] + // [56, 57, 58, 59, 60, 61, 62, 63] + auto input = MakeIncrementingMatrix(N, N, "input"); + auto output = MakeMatrix(N, N, "output"); + auto expectedOutput = MakeIncrementingMatrix(N, N, "expectedOutput"); + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iCache = schedule.Split(i, cacheRows); + auto jCache = schedule.Split(j, cacheCols); + auto 
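+    // (jStripe, assigned just below, subdivides each cache-width block of
+    // columns; under BLASTCopy-style caching each stripe is laid out
+    // contiguously in the cache -- see the expectedCached vectors in the
+    // ValidateMemory tests that follow.)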
jStripe = schedule.Split(j, stripeSize); + + schedule.SetOrder({ iCache, jCache, jStripe, i, j }); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = cacheRows * cacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + schedule.Cache(input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + return VerifySame(output, expectedOutput); +} + +Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_Test1() +{ + int N = 8; + int cacheRows = N; + int cacheCols = N; + int stripeSize = 4; + int vecSize = stripeSize / 2; + + auto input = MakeIncrementingMatrix(N, N, "input"); + auto output = MakeMatrix(N, N, "output"); + + // input + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7] + // [ 8, 9, 10, 11, 12, 13, 14, 15] + // [16, 17, 18, 19, 20, 21, 22, 23] + // [24, 25, 26, 27, 28, 29, 30, 31] + // [32, 33, 34, 35, 36, 37, 38, 39] + // [40, 41, 42, 43, 44, 45, 46, 47] + // [48, 49, 50, 51, 52, 53, 54, 55] + // [56, 57, 58, 59, 60, 61, 62, 63] + // clang-format off + Vector expectedCached = + { + 0, 1, 2, 3, + 8, 9, 10, 11, + 16, 17, 18, 19, + 24, 25, 26, 27, + 32, 33, 34, 35, + 40, 41, 42, 43, + 48, 49, 50, 51, + 56, 57, 58, 59, + + 4, 5, 6, 7, + 12, 13, 14, 15, + 20, 21, 22, 23, + 28, 29, 30, 31, + 36, 37, 38, 39, + 44, 45, 46, 47, + 52, 53, 54, 55, + 60, 61, 62, 63 + }; + // clang-format on + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iCache = schedule.Split(i, cacheRows); + auto jCache = schedule.Split(j, cacheCols); + auto jStripe = schedule.Split(j, stripeSize); + auto jVec = schedule.Split(j, vecSize); + + schedule.SetOrder({ jCache, iCache, jStripe, i, jVec, j }); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = cacheRows * cacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + GeneralCachingStrategy cachingProvider{}; + schedule.Cache(cachingProvider, + input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + // Examine the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + rawCacheValue.SetLayout({ { (int)rawCacheValue.GetLayout().GetMemorySize() } }); + auto cacheVector = Vector(rawCacheValue); + + return VerifySame(cacheVector, expectedCached); +} + +// Smaller stripe size than previous test +Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_Test2() +{ + int N = 8; + int cacheRows = N; + int cacheCols = N; + int stripeSize = 2; + + auto input = MakeIncrementingMatrix(N, N, "input"); + auto output = MakeMatrix(N, N, "output"); + + // input + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7] + // [ 8, 9, 10, 11, 12, 13, 14, 15] + // [16, 17, 18, 19, 20, 21, 22, 23] + // [24, 25, 26, 27, 28, 29, 30, 31] + // 
[32, 33, 34, 35, 36, 37, 38, 39] + // [40, 41, 42, 43, 44, 45, 46, 47] + // [48, 49, 50, 51, 52, 53, 54, 55] + // [56, 57, 58, 59, 60, 61, 62, 63] + // clang-format off + Vector expectedCached = + { + 0, 1, + 8, 9, + 16, 17, + 24, 25, + 32, 33, + 40, 41, + 48, 49, + 56, 57, + + 2, 3, + 10, 11, + 18, 19, + 26, 27, + 34, 35, + 42, 43, + 50, 51, + 58, 59, + + 4, 5, + 12, 13, + 20, 21, + 28, 29, + 36, 37, + 44, 45, + 52, 53, + 60, 61, + + 6, 7, + 14, 15, + 22, 23, + 30, 31, + 38, 39, + 46, 47, + 54, 55, + 62, 63 + }; + // clang-format on + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iCache = schedule.Split(i, cacheRows); + auto jCache = schedule.Split(j, cacheCols); + auto jStripe = schedule.Split(j, stripeSize); + + schedule.SetOrder({ iCache, jCache, jStripe, i, j }); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = cacheRows * cacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + GeneralCachingStrategy cachingProvider{}; + schedule.Cache(cachingProvider, + input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + // Examine the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + rawCacheValue.SetLayout({ { (int)rawCacheValue.GetLayout().GetMemorySize() } }); + auto cacheVector = Vector(rawCacheValue); + + return VerifySame(cacheVector, expectedCached); +} + +// Same stripe size as previous test, but don't cache entire matrix at once +Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_Test3() +{ + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + auto input = MakeIncrementingMatrix(N, N, "input"); + auto output = MakeMatrix(N, N, "output"); + + // input + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7] + // [ 8, 9, 10, 11, 12, 13, 14, 15] + // [16, 17, 18, 19, 20, 21, 22, 23] + // [24, 25, 26, 27, 28, 29, 30, 31] + // [32, 33, 34, 35, 36, 37, 38, 39] + // [40, 41, 42, 43, 44, 45, 46, 47] + // [48, 49, 50, 51, 52, 53, 54, 55] + // [56, 57, 58, 59, 60, 61, 62, 63] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 1, + 8, 9, + 16, 17, + 24, 25, + + 2, 3, + 10, 11, + 18, 19, + 26, 27, + }; + Vector expectedCachedUpperRight = + { + 4, 5, + 12, 13, + 20, 21, + 28, 29, + + 6, 7, + 14, 15, + 22, 23, + 30, 31 + }; + Vector expectedCachedLowerLeft = + { + 32, 33, + 40, 41, + 48, 49, + 56, 57, + + 34, 35, + 42, 43, + 50, 51, + 58, 59, + }; + Vector expectedCachedLowerRight = + { + 36, 37, + 44, 45, + 52, 53, + 60, 61, + + 38, 39, + 46, 47, + 54, 55, + 62, 63 + }; + // clang-format on + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iCache = schedule.Split(i, cacheRows); + auto jCache = schedule.Split(j, 
cacheCols); + auto jStripe = schedule.Split(j, stripeSize); + + schedule.SetOrder({ iCache, jCache, jStripe, i, j }); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = cacheRows * cacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + GeneralCachingStrategy cachingProvider{}; + schedule.Cache(cachingProvider, + input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + + // Get a handle to the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + int rawCacheSize = (int)rawCacheValue.GetLayout().NumElements(); + + auto cachedUpperLeft = MakeVector(rawCacheSize); + auto cachedUpperRight = MakeVector(rawCacheSize); + auto cachedLowerLeft = MakeVector(rawCacheSize); + auto cachedLowerRight = MakeVector(rawCacheSize); + + // Add a low level API kernel to access the underlying cache after it has been filled + auto cacheSpyKernel = loopnests::Kernel("cache_spy_kernel") + .Inputs(rawCacheValue, cachedUpperLeft, cachedUpperRight, cachedLowerLeft, cachedLowerRight) + .Indices(iTopLevel, jTopLevel) + .Define([cacheRows, cacheCols](Value rawCacheValue, Vector cachedUpperLeft, Vector cachedUpperRight, Vector cachedLowerLeft, Vector cachedLowerRight, Scalar i, Scalar j) { + auto cacheView = rawCacheValue; + cacheView.SetLayout({ { (int)rawCacheValue.GetLayout().NumElements() } }); + auto vectorCacheView = Vector(cacheView); + If(i == 0, + [&]() { + // TODO : remove nested if's + If(j == 0, + [&]() { + cachedUpperLeft = vectorCacheView; + }) + .ElseIf(j == cacheCols, + [&]() { + cachedUpperRight = vectorCacheView; + }); + }) + .ElseIf(i == cacheRows, + [&]() { + If(j == 0, [&]() { + cachedLowerLeft = vectorCacheView; + }).ElseIf(j == cacheCols, [&]() { + cachedLowerRight = vectorCacheView; + }); + }); + }); + auto cacheSpyPosition = loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::body, { iCache, jCache }, {} }; + nest.GetUnderlyingLoopNest().AddKernel(cacheSpyKernel, cacheSpyPosition); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + auto ok = MakeScalar("ok"); + ok = 1; + auto printError = [&] { + DebugPrint("Upper Left:"); + DebugPrintVector(cachedUpperLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Upper Right:"); + DebugPrintVector(cachedUpperRight); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperRight); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Lower Left:"); + DebugPrintVector(cachedLowerLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedLowerLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Lower Right:"); + DebugPrintVector(cachedLowerRight); + DebugPrint("\n"); + DebugPrintVector(expectedCachedLowerRight); + DebugPrint("\n"); + DebugPrint("\n"); + }; + // TODO : replace nested if's + If(VerifySame(cachedUpperLeft, expectedCachedUpperLeft) == 0, [&]() { + If(VerifySame(cachedUpperRight, expectedCachedUpperRight) == 0, [&]() { + If(VerifySame(cachedLowerLeft, expectedCachedLowerLeft) == 0, [&]() { + If(VerifySame(cachedLowerRight, expectedCachedLowerRight) == 0, [&]() { + ok = 0; + }).Else(printError); + }).Else(printError); + }).Else(printError); + }).Else(printError); + return ok; +} + +Scalar 
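+// Note on the "cache spy" pattern used above and in the boundary-condition
+// runners below: a low-level kernel is attached at the { iCache, jCache }
+// body position via CodePositionConstraints, so it can snapshot the raw cache
+// buffer into one results vector per cache block; the snapshots are then
+// compared against the expected layouts after the nest runs.
+//
+// As an illustration (added for exposition, not part of the original tests),
+// the quadrant layouts used by these BLASTCopy tests can be reproduced on the
+// host from a row-major matrix a with N columns, assuming the block at
+// (rowBase, colBase) lies fully inside the matrix:
+//
+//     std::vector<int> quadrant;
+//     for (int js = colBase; js < colBase + cacheCols; js += stripeSize)
+//         for (int i = rowBase; i < rowBase + cacheRows; ++i)
+//             for (int j = js; j < js + stripeSize; ++j)
+//                 quadrant.push_back(a[i * N + j]);
+//
+// With N = 8, cacheRows = cacheCols = 4, stripeSize = 2, rowBase = colBase = 0
+// this yields 0, 1, 8, 9, 16, 17, 24, 25, 2, 3, ... -- exactly the
+// expectedCachedUpperLeft in the test above.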
GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(int M, int N, int cacheRows, int cacheCols, int stripeSize)
+{
+    int vecSize = stripeSize / 2;
+
+    auto input = MakeIncrementingMatrix<int>(M, N, "input");
+    auto output = MakeMatrix<int>(M, N, "output");
+    auto expectedOutput = MakeIncrementingMatrix<int>(M, N, "expectedOutput");
+
+    Index i("i"), j("j");
+    auto nest = Using({ input }, ArgumentType::Input)
+                    .Using({ output }, ArgumentType::Output)
+                    .ForAll(i, 0, M)
+                    .ForAll(j, 0, N)
+                    .Do([](Matrix input, Matrix output, Scalar i, Scalar j) {
+                        output(i, j) = input(i, j);
+                    });
+
+    auto& schedule = nest.GetSchedule();
+
+    auto iTopLevel = i;
+    auto jTopLevel = j;
+
+    auto iCache = schedule.Split(i, cacheRows);
+    auto jCache = schedule.Split(j, cacheCols);
+    auto jStripe = schedule.Split(j, stripeSize);
+    auto jVec = schedule.Split(j, vecSize);
+
+    schedule.SetOrder({ jCache, iCache, jStripe, i, jVec, j });
+
+    ArgumentType argType = ArgumentType::Input;
+    std::string cacheName = "cacheInput";
+    size_t maxCacheElts = cacheRows * cacheCols;
+    size_t fillThreshold = maxCacheElts;
+    std::function reduceFunction = CopyReduce;
+    auto extraCacheParams = std::make_tuple(argType,
+                                            cacheName,
+                                            maxCacheElts,
+                                            fillThreshold,
+                                            reduceFunction,
+                                            false);
+    GeneralCachingStrategy cachingProvider{};
+    schedule.Cache(cachingProvider,
+                   input,
+                   { iTopLevel, jTopLevel },
+                   {},
+                   {},
+                   std::nullopt,
+                   extraCacheParams);
+
+#if 0 // DEBUGGING
+    DebugDump(nest.GetUnderlyingLoopNest());
+#endif
+    nest.Run();
+
+    return VerifySame(output, expectedOutput);
+}
+
+// input matrix rows evenly divides cache rows
+// input matrix cols doesn't evenly divide cache cols
+Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test1()
+{
+    int M = 8;
+    int N = 7; // N doesn't evenly divide the number of cache columns
+    int cacheRows = 4;
+    int cacheCols = 4;
+    int stripeSize = 2;
+
+    return GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize);
+}
+
+// input matrix rows evenly divides cache rows
+// input matrix cols doesn't evenly divide cache cols but does evenly divide stripeSize
+Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test2()
+{
+    int M = 8;
+    int N = 6; // N doesn't evenly divide the number of cache columns, but does evenly divide stripe size
+    int cacheRows = 4;
+    int cacheCols = 4;
+    int stripeSize = 2;
+
+    return GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize);
+}
+
+// input matrix rows doesn't evenly divide cache rows
+// input matrix cols doesn't evenly divide cache cols
+Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test3()
+{
+    int M = 6;
+    int N = 7; // N doesn't evenly divide the number of cache columns
+    int cacheRows = 4;
+    int cacheCols = 4;
+    int stripeSize = 2;
+
+    return GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize);
+}
+
+// input matrix rows doesn't evenly divide cache rows
+// input matrix cols doesn't evenly divide cache cols but does evenly divide stripe size
+Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test4()
+{
+    int M = 6;
+    int N = 6; // N doesn't evenly divide the number of cache columns, but does evenly divide stripe size
+    int cacheRows = 4;
+    int cacheCols = 4;
+    int stripeSize = 2;
+
+    return GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize);
+}
+
+// input matrix rows evenly divides cache rows
+// input matrix cols < cache cols, doesn't evenly divide stripe size
+Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test5()
+{
+    int M = 8;
+    int N = 3; // N < cache columns, doesn't evenly divide stripe size
+    int cacheRows = 4;
+    int cacheCols = 4;
+    int stripeSize = 2;
+
+    return GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize);
+}
+
+// input matrix rows evenly divides cache rows
+// input matrix cols < cache cols, evenly divides stripe size
+Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test6()
+{
+    int M = 8;
+    int N = 2; // N < cache columns, does evenly divide stripe size
+    int cacheRows = 4;
+    int cacheCols = 4;
+    int stripeSize = 2;
+
+    return GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize);
+}
+
+// input matrix rows < cache rows
+// input matrix cols < cache cols, doesn't evenly divide stripe size
+Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test7()
+{
+    int M = 3;
+    int N = 3; // N < cache columns, doesn't evenly divide stripe size
+    int cacheRows = 4;
+    int cacheCols = 4;
+    int stripeSize = 2;
+
+    return GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize);
+}
+
+// input matrix rows < cache rows
+// input matrix cols < cache cols, does evenly divide stripe size
+Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test8()
+{
+    int M = 2;
+    int N = 2; // N < cache columns, does evenly divide stripe size
+    int cacheRows = 4;
+    int cacheCols = 4;
+    int stripeSize = 2;
+
+    return GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize);
+}
+
+// input matrix rows < cache rows
+// input matrix cols multiple of cache cols
+Scalar GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test9()
+{
+    int M = 2;
+    int N = 8;
+    int cacheRows = 4;
+    int cacheCols = 4;
+    int stripeSize = 2;
+
+    return GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize);
+}
+
+Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Runner(int M, int N, int cacheRows, int cacheCols, int stripeSize, Vector expectedCachedUpperLeft, Vector expectedCachedUpperRight, Vector expectedCachedLowerLeft, Vector expectedCachedLowerRight)
+{
+    int vecSize = stripeSize / 2;
+    auto input = MakeIncrementingMatrix<int>(M, N, "input");
+    auto output = MakeMatrix<int>(M, N, "output");
+
+    Index i("i"), j("j");
+    auto nest = Using({ input }, ArgumentType::Input)
+                    .Using({ output }, ArgumentType::Output)
+                    .ForAll(i, 0, M)
+                    .ForAll(j, 0, N)
+                    .Do([](Matrix input, Matrix output, Scalar i, Scalar j) {
+                        output(i, j) = input(i, j);
+                    });
+
+    auto& schedule = nest.GetSchedule();
+
+    auto iTopLevel = i;
+    auto jTopLevel = j;
+
+    auto iCache = schedule.Split(i, cacheRows);
+    auto jCache = schedule.Split(j, cacheCols);
+    auto jStripe = schedule.Split(j, stripeSize);
+    auto jVec = schedule.Split(j, vecSize);
+
+    schedule.SetOrder({ jCache, iCache, jStripe, i, jVec, j });
+
+    ArgumentType argType = ArgumentType::Input;
+    std::string cacheName = "cacheInput";
+    size_t maxCacheElts = cacheRows * cacheCols;
+    size_t fillThreshold = maxCacheElts;
+    std::function reduceFunction = CopyReduce;
+    auto extraCacheParams =
std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + GeneralCachingStrategy cachingProvider{}; + schedule.Cache(cachingProvider, + input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + + // Get a handle to the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + int rawCacheSize = (int)rawCacheValue.GetLayout().NumElements(); + + auto cachedUpperLeft = MakeVector(rawCacheSize); + auto cachedUpperRight = MakeVector(rawCacheSize); + auto cachedLowerLeft = MakeVector(rawCacheSize); + auto cachedLowerRight = MakeVector(rawCacheSize); + + // Add a low level API kernel to access the underlying cache after it has been filled + auto cacheSpyKernel = loopnests::Kernel("cache_spy_kernel") + .Inputs(rawCacheValue, cachedUpperLeft, cachedUpperRight, cachedLowerLeft, cachedLowerRight) + .Indices(iTopLevel, jTopLevel) + .Define([cacheRows, cacheCols](Value rawCacheValue, Vector cachedUpperLeft, Vector cachedUpperRight, Vector cachedLowerLeft, Vector cachedLowerRight, Scalar i, Scalar j) { + auto cacheView = rawCacheValue; + cacheView.SetLayout({ { (int)rawCacheValue.GetLayout().NumElements() } }); + auto vectorCacheView = Vector(cacheView); + If(i == 0, + [&]() { + // TODO : remove nested if's + If(j == 0, + [&]() { + cachedUpperLeft = vectorCacheView; + }) + .ElseIf(j == cacheCols, + [&]() { + cachedUpperRight = vectorCacheView; + }); + }) + .ElseIf(i == cacheRows, + [&]() { + If(j == 0, [&]() { + cachedLowerLeft = vectorCacheView; + }).ElseIf(j == cacheCols, [&]() { + cachedLowerRight = vectorCacheView; + }); + }); + }); + auto cacheSpyPosition = loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::body, { iCache, jCache }, {} }; + nest.GetUnderlyingLoopNest().AddKernel(cacheSpyKernel, cacheSpyPosition); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + auto ok = MakeScalar("ok"); + ok = 1; + auto printError = [&] { + DebugPrint("Upper Left:"); + DebugPrintVector(cachedUpperLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Upper Right:"); + DebugPrintVector(cachedUpperRight); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperRight); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Lower Left:"); + DebugPrintVector(cachedLowerLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedLowerLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Lower Right:"); + DebugPrintVector(cachedLowerRight); + DebugPrint("\n"); + DebugPrintVector(expectedCachedLowerRight); + DebugPrint("\n"); + DebugPrint("\n"); + }; + // TODO : replace nested if's + If(VerifySame(cachedUpperLeft, expectedCachedUpperLeft) == 0, [&]() { + If(VerifySame(cachedUpperRight, expectedCachedUpperRight) == 0, [&]() { + If(VerifySame(cachedLowerLeft, expectedCachedLowerLeft) == 0, [&]() { + If(VerifySame(cachedLowerRight, expectedCachedLowerRight) == 0, [&]() { + ok = 0; + }).Else(printError); + }).Else(printError); + }).Else(printError); + }).Else(printError); + return ok; +} + +Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Runner_LeftCachesOnly(int M, int N, int cacheRows, int cacheCols, int stripeSize, Vector expectedCachedUpperLeft, Vector expectedCachedLowerLeft) +{ + int vecSize = stripeSize / 2; + auto input = MakeIncrementingMatrix(M, N, "input"); + auto output = MakeMatrix(M, N, "output"); + + Index i("i"), j("j"); + auto nest = Using({ input }, 
ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, M) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iCache = schedule.Split(i, cacheRows); + auto jCache = schedule.Split(j, cacheCols); + auto jStripe = schedule.Split(j, stripeSize); + auto jVec = schedule.Split(j, vecSize); + + schedule.SetOrder({ jCache, iCache, jStripe, i, jVec, j }); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = cacheRows * cacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + GeneralCachingStrategy cachingProvider{}; + schedule.Cache(cachingProvider, + input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + + // Get a handle to the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + int rawCacheSize = (int)rawCacheValue.GetLayout().NumElements(); + + // No right caches when N < cacheCols + auto cachedUpperLeft = MakeVector(rawCacheSize); + auto cachedLowerLeft = MakeVector(rawCacheSize); + + // Add a low level API kernel to access the underlying cache after it has been filled + auto cacheSpyKernel = loopnests::Kernel("cache_spy_kernel") + .Inputs(rawCacheValue, cachedUpperLeft, cachedLowerLeft) + .Indices(iTopLevel, jTopLevel) + .Define([cacheRows](Value rawCacheValue, Vector cachedUpperLeft, Vector cachedLowerLeft, Scalar i, Scalar j) { + auto cacheView = rawCacheValue; + cacheView.SetLayout({ { (int)rawCacheValue.GetLayout().NumElements() } }); + auto vectorCacheView = Vector(cacheView); + If(i == 0, + [&]() { + // TODO : remove nested if's + If(j == 0, + [&]() { + cachedUpperLeft = vectorCacheView; + }); + }) + .ElseIf(i == cacheRows, + [&]() { + If(j == 0, [&]() { + cachedLowerLeft = vectorCacheView; + }); + }); + }); + auto cacheSpyPosition = loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::body, { iCache, jCache }, {} }; + nest.GetUnderlyingLoopNest().AddKernel(cacheSpyKernel, cacheSpyPosition); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + auto ok = MakeScalar("ok"); + ok = 1; + auto printError = [&] { + DebugPrint("Upper Left:"); + DebugPrintVector(cachedUpperLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Lower Left:"); + DebugPrintVector(cachedLowerLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedLowerLeft); + DebugPrint("\n"); + DebugPrint("\n"); + }; + // TODO : replace nested if's + If(VerifySame(cachedUpperLeft, expectedCachedUpperLeft) == 0, [&]() { + If(VerifySame(cachedLowerLeft, expectedCachedLowerLeft) == 0, [&]() { + ok = 0; + }).Else(printError); + }).Else(printError); + return ok; +} + +Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Runner_UpperCachesOnly(int M, int N, int cacheRows, int cacheCols, int stripeSize, Vector expectedCachedUpperLeft, Vector expectedCachedUpperRight) +{ + int vecSize = stripeSize / 2; + auto input = MakeIncrementingMatrix(M, N, "input"); + auto output = MakeMatrix(M, N, "output"); + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, M) + 
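+                    // (This UpperCachesOnly runner and the LeftCachesOnly /
+                    // UpperLeftCacheOnly variants around it exist because, as
+                    // noted above, there are no right caches when N < cacheCols
+                    // and, symmetrically, no lower caches when M < cacheRows;
+                    // only the quadrants that actually get filled are spied on.)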
.ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iCache = schedule.Split(i, cacheRows); + auto jCache = schedule.Split(j, cacheCols); + auto jStripe = schedule.Split(j, stripeSize); + auto jVec = schedule.Split(j, vecSize); + + schedule.SetOrder({ jCache, iCache, jStripe, i, jVec, j }); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = cacheRows * cacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + GeneralCachingStrategy cachingProvider{}; + schedule.Cache(cachingProvider, + input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + + // Get a handle to the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + int rawCacheSize = (int)rawCacheValue.GetLayout().NumElements(); + + auto cachedUpperLeft = MakeVector(rawCacheSize); + auto cachedUpperRight = MakeVector(rawCacheSize); + + // Add a low level API kernel to access the underlying cache after it has been filled + auto cacheSpyKernel = loopnests::Kernel("cache_spy_kernel") + .Inputs(rawCacheValue, cachedUpperLeft, cachedUpperRight) + .Indices(iTopLevel, jTopLevel) + .Define([cacheCols](Value rawCacheValue, Vector cachedUpperLeft, Vector cachedUpperRight, Scalar i, Scalar j) { + auto cacheView = rawCacheValue; + cacheView.SetLayout({ { (int)rawCacheValue.GetLayout().NumElements() } }); + auto vectorCacheView = Vector(cacheView); + If(i == 0, + [&]() { + // TODO : remove nested if's + If(j == 0, + [&]() { + cachedUpperLeft = vectorCacheView; + }) + .ElseIf(j == cacheCols, + [&]() { + cachedUpperRight = vectorCacheView; + }); + }); + }); + auto cacheSpyPosition = loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::body, { iCache, jCache }, {} }; + nest.GetUnderlyingLoopNest().AddKernel(cacheSpyKernel, cacheSpyPosition); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + auto ok = MakeScalar("ok"); + ok = 1; + auto printError = [&] { + DebugPrint("Upper Left:"); + DebugPrintVector(cachedUpperLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Upper Right:"); + DebugPrintVector(cachedUpperRight); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperRight); + DebugPrint("\n"); + DebugPrint("\n"); + }; + // TODO : replace nested if's + If(VerifySame(cachedUpperLeft, expectedCachedUpperLeft) == 0, [&]() { + If(VerifySame(cachedUpperRight, expectedCachedUpperRight) == 0, [&]() { + ok = 0; + }).Else(printError); + }).Else(printError); + return ok; +} + +Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Runner_UpperLeftCacheOnly(int M, int N, int cacheRows, int cacheCols, int stripeSize, Vector expectedCachedUpperLeft) +{ + int vecSize = stripeSize / 2; + auto input = MakeIncrementingMatrix(M, N, "input"); + auto output = MakeMatrix(M, N, "output"); + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, M) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = 
i; + auto jTopLevel = j; + + auto iCache = schedule.Split(i, cacheRows); + auto jCache = schedule.Split(j, cacheCols); + auto jStripe = schedule.Split(j, stripeSize); + auto jVec = schedule.Split(j, vecSize); + + schedule.SetOrder({ jCache, iCache, jStripe, i, jVec, j }); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = cacheRows * cacheCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + GeneralCachingStrategy cachingProvider{}; + schedule.Cache(cachingProvider, + input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + + // Get a handle to the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + int rawCacheSize = (int)rawCacheValue.GetLayout().NumElements(); + + // No right caches when N < cacheCols + auto cachedUpperLeft = MakeVector(rawCacheSize); + + // Add a low level API kernel to access the underlying cache after it has been filled + auto cacheSpyKernel = loopnests::Kernel("cache_spy_kernel") + .Inputs(rawCacheValue, cachedUpperLeft) + .Indices(iTopLevel, jTopLevel) + .Define([](Value rawCacheValue, Vector cachedUpperLeft, Scalar i, Scalar j) { + auto cacheView = rawCacheValue; + cacheView.SetLayout({ { (int)rawCacheValue.GetLayout().NumElements() } }); + auto vectorCacheView = Vector(cacheView); + If(i == 0, + [&]() { + // TODO : remove nested if's + If(j == 0, + [&]() { + cachedUpperLeft = vectorCacheView; + }); + }); + }); + auto cacheSpyPosition = loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::body, { iCache, jCache }, {} }; + nest.GetUnderlyingLoopNest().AddKernel(cacheSpyKernel, cacheSpyPosition); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + return VerifySame(cachedUpperLeft, expectedCachedUpperLeft); +} + +Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test1() +{ + int M = 8; // M does evenly divide cache rows + int N = 7; // N doesn't evenly divide cache columns + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + // input + // A: + // [ 0, 1, 2, 3, 4, 5, 6], + // [ 7, 8, 9, 10, 11, 12, 13], + // [14, 15, 16, 17, 18, 19, 20], + // [21, 22, 23, 24, 25, 26, 27], + // [28, 29, 30, 31, 32, 33, 34], + // [35, 36, 37, 38, 39, 40, 41], + // [42, 43, 44, 45, 46, 47, 48], + // [49, 50, 51, 52, 53, 54, 55] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 1, + 7, 8, + 14, 15, + 21, 22, + + 2, 3, + 9, 10, + 16, 17, + 23, 24, + }; + Vector expectedCachedUpperRight = + { + 4, 5, + 11, 12, + 18, 19, + 25, 26, + + 6, 0, + 13, 0, + 20, 0, + 27, 0 + }; + Vector expectedCachedLowerLeft = + { + 28, 29, + 35, 36, + 42, 43, + 49, 50, + + 30, 31, + 37, 38, + 44, 45, + 51, 52, + }; + Vector expectedCachedLowerRight = + { + 32, 33, + 39, 40, + 46, 47, + 53, 54, + + 34, 0, + 41, 0, + 48, 0, + 55, 0 + }; + // clang-format on + + return GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight, expectedCachedLowerLeft, expectedCachedLowerRight); +} + +Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test2() +{ + int M = 8; // M does evenly divide cache rows + int N = 6; // N doesn't evenly divide cache columns, but does evenly divide stripe size + int cacheRows = 4; + int 
cacheCols = 4;
+    int stripeSize = 2;
+
+    auto input = MakeIncrementingMatrix<int>(M, N, "input");
+    auto output = MakeMatrix<int>(M, N, "output");
+
+    // input
+    // A:
+    // [ 0,  1,  2,  3,  4,  5],
+    // [ 6,  7,  8,  9, 10, 11],
+    // [12, 13, 14, 15, 16, 17],
+    // [18, 19, 20, 21, 22, 23],
+    // [24, 25, 26, 27, 28, 29],
+    // [30, 31, 32, 33, 34, 35],
+    // [36, 37, 38, 39, 40, 41],
+    // [42, 43, 44, 45, 46, 47]
+    // clang-format off
+    Vector expectedCachedUpperLeft =
+        {
+            0, 1,
+            6, 7,
+            12, 13,
+            18, 19,
+
+            2, 3,
+            8, 9,
+            14, 15,
+            20, 21,
+        };
+    Vector expectedCachedUpperRight =
+        {
+            4, 5,
+            10, 11,
+            16, 17,
+            22, 23,
+
+            0, 0,
+            0, 0,
+            0, 0,
+            0, 0
+        };
+    Vector expectedCachedLowerLeft =
+        {
+            24, 25,
+            30, 31,
+            36, 37,
+            42, 43,
+
+            26, 27,
+            32, 33,
+            38, 39,
+            44, 45,
+        };
+    Vector expectedCachedLowerRight =
+        {
+            28, 29,
+            34, 35,
+            40, 41,
+            46, 47,
+
+            0, 0,
+            0, 0,
+            0, 0,
+            0, 0
+        };
+    // clang-format on
+
+    return GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight, expectedCachedLowerLeft, expectedCachedLowerRight);
+}
+
+// input matrix rows doesn't evenly divide cache rows
+// input matrix cols doesn't evenly divide cache cols
+Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test3()
+{
+    int M = 6;
+    int N = 7; // N doesn't evenly divide the number of cache columns
+    int cacheRows = 4;
+    int cacheCols = 4;
+    int stripeSize = 2;
+
+    // input
+    // A:
+    // [ 0,  1,  2,  3,  4,  5,  6],
+    // [ 7,  8,  9, 10, 11, 12, 13],
+    // [14, 15, 16, 17, 18, 19, 20],
+    // [21, 22, 23, 24, 25, 26, 27],
+    // [28, 29, 30, 31, 32, 33, 34],
+    // [35, 36, 37, 38, 39, 40, 41],
+    // clang-format off
+    Vector expectedCachedUpperLeft =
+        {
+            0, 1,
+            7, 8,
+            14, 15,
+            21, 22,
+
+            2, 3,
+            9, 10,
+            16, 17,
+            23, 24,
+        };
+    Vector expectedCachedUpperRight =
+        {
+            4, 5,
+            11, 12,
+            18, 19,
+            25, 26,
+
+            6, 0,
+            13, 0,
+            20, 0,
+            27, 0
+        };
+
+    // Check that the boundary block gets re-viewed correctly to keep the cached data contiguous
+    Vector expectedCachedLowerLeft =
+        {
+            28, 29,
+            35, 36,
+            30, 31,
+            37, 38,
+
+            0, 0,
+            0, 0,
+            0, 0,
+            0, 0,
+        };
+    Vector expectedCachedLowerRight =
+        {
+            32, 33,
+            39, 40,
+            34, 0,
+            41, 0,
+
+            0, 0,
+            0, 0,
+            0, 0,
+            0, 0
+        };
+    // clang-format on
+
+    return GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight, expectedCachedLowerLeft, expectedCachedLowerRight);
+}
+
+// input matrix rows doesn't evenly divide cache rows
+// input matrix cols doesn't evenly divide cache cols but does evenly divide stripe size
+Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test4()
+{
+    int M = 6;
+    int N = 6; // N doesn't evenly divide the number of cache columns, but does evenly divide stripe size
+    int cacheRows = 4;
+    int cacheCols = 4;
+    int stripeSize = 2;
+
+    // input
+    // A:
+    // [ 0,  1,  2,  3,  4,  5],
+    // [ 6,  7,  8,  9, 10, 11],
+    // [12, 13, 14, 15, 16, 17],
+    // [18, 19, 20, 21, 22, 23],
+    // [24, 25, 26, 27, 28, 29],
+    // [30, 31, 32, 33, 34, 35]
+    // clang-format off
+    Vector expectedCachedUpperLeft =
+        {
+            0, 1,
+            6, 7,
+            12, 13,
+            18, 19,
+
+            2, 3,
+            8, 9,
+            14, 15,
+            20, 21,
+        };
+    Vector expectedCachedUpperRight =
+        {
+            4, 5,
+            10, 11,
+            16, 17,
+            22, 23,
+
+            0, 0,
+            0, 0,
+            0, 0,
+            0, 0
+        };
+    Vector expectedCachedLowerLeft =
+        {
+            24, 25,
+            30, 31,
+            26, 27,
+            32, 33,
+
+            0, 0,
+            0, 0,
+            0, 0,
+            0, 0,
+        };
+    Vector expectedCachedLowerRight =
+        {
+            28, 29,
+            34, 35,
+            0, 0,
+            0, 0,
+
+            0, 0,
+            0, 0,
+            0, 0,
+            0, 0
+        };
+    // clang-format on
+    return GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight, expectedCachedLowerLeft, expectedCachedLowerRight);
+}
+
+// input matrix rows evenly divides cache rows
+// input matrix cols < cache cols, doesn't evenly divide stripe size
+Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test5()
+{
+    int M = 8;
+    int N = 3; // N < cache columns, doesn't evenly divide stripe size
+    int cacheRows = 4;
+    int cacheCols = 4;
+    int stripeSize = 2;
+
+    // input
+    // A:
+    // [ 0,  1,  2],
+    // [ 3,  4,  5],
+    // [ 6,  7,  8],
+    // [ 9, 10, 11],
+    // [12, 13, 14],
+    // [15, 16, 17],
+    // [18, 19, 20],
+    // [21, 22, 23]
+    // clang-format off
+    Vector expectedCachedUpperLeft =
+        {
+            0, 1,
+            3, 4,
+            6, 7,
+            9, 10,
+
+            2, 0,
+            5, 0,
+            8, 0,
+            11, 0,
+        };
+    Vector expectedCachedLowerLeft =
+        {
+            12, 13,
+            15, 16,
+            18, 19,
+            21, 22,
+
+            14, 0,
+            17, 0,
+            20, 0,
+            23, 0,
+        };
+    // clang-format on
+
+    return GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Runner_LeftCachesOnly(M, N, cacheRows, cacheCols, stripeSize, expectedCachedUpperLeft, expectedCachedLowerLeft);
+}
+
+// input matrix rows evenly divides cache rows
+// input matrix cols < cache cols, evenly divides stripe size
+Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test6()
+{
+    int M = 8;
+    int N = 2; // N < cache columns, does evenly divide stripe size
+    int cacheRows = 4;
+    int cacheCols = 4;
+    int stripeSize = 2;
+
+    // input
+    // A:
+    // [ 0,  1],
+    // [ 2,  3],
+    // [ 4,  5],
+    // [ 6,  7],
+    // [ 8,  9],
+    // [10, 11],
+    // [12, 13],
+    // [14, 15]
+    // clang-format off
+    Vector expectedCachedUpperLeft =
+        {
+            0, 1,
+            2, 3,
+            4, 5,
+            6, 7,
+
+            0, 0,
+            0, 0,
+            0, 0,
+            0, 0,
+        };
+    Vector expectedCachedLowerLeft =
+        {
+            8, 9,
+            10, 11,
+            12, 13,
+            14, 15,
+
+            0, 0,
+            0, 0,
+            0, 0,
+            0, 0,
+        };
+    // clang-format on
+    return GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Runner_LeftCachesOnly(M, N, cacheRows, cacheCols, stripeSize, expectedCachedUpperLeft, expectedCachedLowerLeft);
+}
+
+// input matrix rows < cache rows
+// input matrix cols < cache cols, doesn't evenly divide stripe size
+Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test7()
+{
+    int M = 3;
+    int N = 3; // N < cache columns, doesn't evenly divide stripe size
+    int cacheRows = 4;
+    int cacheCols = 4;
+    int stripeSize = 2;
+
+    // input
+    // A:
+    // [0, 1, 2],
+    // [3, 4, 5],
+    // [6, 7, 8]
+    // clang-format off
+    Vector expectedCachedUpperLeft =
+        {
+            0, 1,
+            3, 4,
+            6, 7,
+            2, 0,
+
+            5, 0,
+            8, 0,
+            0, 0,
+            0, 0
+        };
+    // clang-format on
+    return GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Runner_UpperLeftCacheOnly(M, N, cacheRows, cacheCols, stripeSize, expectedCachedUpperLeft);
+}
+
+// input matrix rows < cache rows
+// input matrix cols < cache cols, does evenly divide stripe size
+Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test8()
+{
+    int M = 2;
+    int N = 2; // N < cache columns, does evenly divide stripe size
+    int cacheRows = 4;
+    int cacheCols = 4;
+    int stripeSize = 2;
+
+    // input
+    // A:
+    // [ 0, 1],
+    // [ 2, 3]
+    // clang-format off
+    Vector expectedCachedUpperLeft =
+        {
+            0, 1,
+            2, 3,
+            0, 0,
+            0, 0,
+
+            0, 0,
+            0, 0,
+            0, 0,
+            0, 0,
+        };
+    // clang-format on
+    return
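+    // (In the boundary tests above, cache slots that the fill never touches
+    // are expected to read back as zero -- hence the zero padding in the
+    // expected vectors. This presumes the cache buffer starts out zero-filled;
+    // that is an inference from the expected data, not a stated guarantee of
+    // the caching strategy.)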
GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Runner_UpperLeftCacheOnly(M, N, cacheRows, cacheCols, stripeSize, expectedCachedUpperLeft); +} + +// input matrix rows < cache rows +// input matrix cols multiple of cache cols +Scalar GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test9() +{ + int M = 2; + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int stripeSize = 2; + + // input + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7], + // [ 8, 9, 10, 11, 12, 13, 14, 15] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 1, + 8, 9, + 2, 3, + 10, 11, + + 0, 0, + 0, 0, + 0, 0, + 0, 0, + }; + Vector expectedCachedUpperRight = + { + 4, 5, + 12, 13, + 6, 7, + 14, 15, + + 0, 0, + 0, 0, + 0, 0, + 0, 0 + }; + // clang-format on + return GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Runner_UpperCachesOnly(M, N, cacheRows, cacheCols, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight); +} + +// General caching strategy Progressive BLASNCopy-style caching +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_Test1() +{ + int N = 8; + int cacheRows = N; + int cacheCols = N; + int blockSize = 4; + int stripeSize = 2; + + auto input = MakeIncrementingMatrix(N, N, "input"); + auto output = MakeMatrix(N, N, "output"); + auto expectedOutput = MakeIncrementingMatrix(N, N, "expectedOutput"); + + // input: + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7] + // [ 8, 9, 10, 11, 12, 13, 14, 15] + // [16, 17, 18, 19, 20, 21, 22, 23] + // [24, 25, 26, 27, 28, 29, 30, 31] + // [32, 33, 34, 35, 36, 37, 38, 39] + // [40, 41, 42, 43, 44, 45, 46, 47] + // [48, 49, 50, 51, 52, 53, 54, 55] + // [56, 57, 58, 59, 60, 61, 62, 63] + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iCache = schedule.Split(i, cacheRows); + auto iBlock = schedule.Split(i, blockSize); + auto iStripe = schedule.Split(i, stripeSize); + auto jCache = schedule.Split(j, cacheCols); + + schedule.SetOrder({ iCache, jCache, iBlock, iStripe, j, i }); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = cacheRows * cacheCols; + size_t fillThreshold = blockSize * cacheCols; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + GeneralCachingStrategy cachingProvider{}; + schedule.Cache(cachingProvider, + input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + return VerifySame(output, expectedOutput); +} + +// Test with smaller cache, block, and stripe size than previous test +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_Test2() +{ + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 2; + int stripeSize = 1; + + // input, expectedOutput + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7] + // [ 8, 9, 10, 11, 12, 13, 14, 15] + // [16, 17, 18, 19, 20, 21, 22, 23] + // [24, 25, 26, 27, 28, 29, 30, 31] + // [32, 33, 34, 35, 36, 37, 38, 39] + // [40, 41, 42, 43, 44, 45, 46, 47] + // [48, 49, 50, 51, 52, 53, 54, 55] + // [56, 57, 58, 59, 60, 61, 62, 63] + auto input = 
MakeIncrementingMatrix(N, N, "input"); + auto output = MakeMatrix(N, N, "output"); + auto expectedOutput = MakeIncrementingMatrix(N, N, "expectedOutput"); + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iCache = schedule.Split(i, cacheRows); + auto iBlock = schedule.Split(i, blockSize); + auto iStripe = schedule.Split(i, stripeSize); + auto jCache = schedule.Split(j, cacheCols); + + schedule.SetOrder({ iCache, jCache, iBlock, iStripe, j, i }); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = cacheRows * cacheCols; + size_t fillThreshold = blockSize * cacheCols; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + GeneralCachingStrategy cachingProvider{}; + schedule.Cache(cachingProvider, + input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + return VerifySame(output, expectedOutput); +} + +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_Test1() +{ + int N = 8; + int cacheRows = N; + int cacheCols = N; + int blockSize = N; + int stripeSize = 4; + + auto input = MakeIncrementingMatrix(N, N, "input"); + auto output = MakeMatrix(N, N, "output"); + + // input + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7] + // [ 8, 9, 10, 11, 12, 13, 14, 15] + // [16, 17, 18, 19, 20, 21, 22, 23] + // [24, 25, 26, 27, 28, 29, 30, 31] + // [32, 33, 34, 35, 36, 37, 38, 39] + // [40, 41, 42, 43, 44, 45, 46, 47] + // [48, 49, 50, 51, 52, 53, 54, 55] + // [56, 57, 58, 59, 60, 61, 62, 63] + // clang-format off + Vector expectedCached = + { + 0, 8, 16, 24, + 1, 9, 17, 25, + 2, 10, 18, 26, + 3, 11, 19, 27, + 4, 12, 20, 28, + 5, 13, 21, 29, + 6, 14, 22, 30, + 7, 15, 23, 31, + + 32, 40, 48, 56, + 33, 41, 49, 57, + 34, 42, 50, 58, + 35, 43, 51, 59, + 36, 44, 52, 60, + 37, 45, 53, 61, + 38, 46, 54, 62, + 39, 47, 55, 63 + }; + // clang-format on + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iCache = schedule.Split(i, cacheRows); + auto iBlock = schedule.Split(i, blockSize); + auto iStripe = schedule.Split(i, stripeSize); + auto jCache = schedule.Split(j, cacheCols); + + schedule.SetOrder({ iCache, jCache, iBlock, iStripe, j, i }); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = cacheRows * cacheCols; + size_t fillThreshold = blockSize * cacheCols; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + GeneralCachingStrategy cachingProvider{}; + schedule.Cache(cachingProvider, + input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + 
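+ // Illustration of the packing this test expects (an editorial sketch, assuming
+ // row-major input with A(i, j) == i * N + j): the BLASNCopy cache splits A into
+ // stripes of stripeSize rows and stores each stripe column-major, so element
+ // (i, j) lands at cache offset
+ //     (i / stripeSize) * (stripeSize * N) + j * stripeSize + (i % stripeSize)
+ // e.g. A(0, 1) == 1 lands at offset 4, matching expectedCached above.
+ // A reference packer would look like:
+ //     for (int i = 0; i < N; ++i)
+ //         for (int j = 0; j < N; ++j)
+ //             packed[(i / stripeSize) * (stripeSize * N) + j * stripeSize + (i % stripeSize)] = a[i * N + j];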
nest.Run(); + + // Examine the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + rawCacheValue.SetLayout({ { (int)rawCacheValue.GetLayout().GetMemorySize() } }); + auto cacheVector = Vector(rawCacheValue); + + return VerifySame(cacheVector, expectedCached); +} + +// Smaller stripe size than previous test +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_Test2() +{ + int N = 8; + int cacheRows = N; + int cacheCols = N; + int blockSize = 4; + int stripeSize = 2; + + auto input = MakeIncrementingMatrix(N, N, "input"); + auto output = MakeMatrix(N, N, "output"); + + // input + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7] + // [ 8, 9, 10, 11, 12, 13, 14, 15] + // [16, 17, 18, 19, 20, 21, 22, 23] + // [24, 25, 26, 27, 28, 29, 30, 31] + // [32, 33, 34, 35, 36, 37, 38, 39] + // [40, 41, 42, 43, 44, 45, 46, 47] + // [48, 49, 50, 51, 52, 53, 54, 55] + // [56, 57, 58, 59, 60, 61, 62, 63] + // clang-format off + Vector expectedCached = + { + 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, + 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31, + + 32, 40, 33, 41, 34, 42, 35, 43, 36, 44, 37, 45, 38, 46, 39, 47, + 48, 56, 49, 57, 50, 58, 51, 59, 52, 60, 53, 61, 54, 62, 55, 63, + }; + // clang-format on + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iCache = schedule.Split(i, cacheRows); + auto iBlock = schedule.Split(i, blockSize); + auto iStripe = schedule.Split(i, stripeSize); + auto jCache = schedule.Split(j, cacheCols); + + schedule.SetOrder({ iCache, jCache, iBlock, iStripe, j, i }); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = cacheRows * cacheCols; + size_t fillThreshold = blockSize * cacheCols; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + GeneralCachingStrategy cachingProvider{}; + schedule.Cache(cachingProvider, + input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + // Examine the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + rawCacheValue.SetLayout({ { (int)rawCacheValue.GetLayout().GetMemorySize() } }); + auto cacheVector = Vector(rawCacheValue); + + return VerifySame(cacheVector, expectedCached); +} + +// Same stripe size as previous test, but don't cache entire matrix at once +// Doesn't test the progressive nature of the cache over blocks +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_Test3() +{ + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 2; + int stripeSize = 2; + + auto input = MakeIncrementingMatrix(N, N, "input"); + auto output = MakeMatrix(N, N, "output"); + + // input + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7] + // [ 8, 9, 10, 11, 12, 13, 14, 15] + // [16, 17, 18, 19, 20, 21, 22, 23] + // [24, 25, 26, 27, 28, 29, 30, 31] + // [32, 33, 34, 35, 36, 37, 38, 39] + // [40, 41, 42, 43, 44, 45, 46, 47] + // [48, 49, 50, 51, 52, 53, 54, 55] + // [56, 57, 58, 59, 60, 61, 62, 63] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 8, 1, 9, 2, 10, 
3, 11, + 16, 24, 17, 25, 18, 26, 19, 27 + }; + Vector expectedCachedUpperRight = + { + 4, 12, 5, 13, 6, 14, 7, 15, + 20, 28, 21, 29, 22, 30, 23, 31 + }; + Vector expectedCachedLowerLeft = + { + 32, 40, 33, 41, 34, 42, 35, 43, + 48, 56, 49, 57, 50, 58, 51, 59 + }; + Vector expectedCachedLowerRight = + { + 36, 44, 37, 45, 38, 46, 39, 47, + 52, 60, 53, 61, 54, 62, 55, 63 + }; + // clang-format on + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iCache = schedule.Split(i, cacheRows); + auto iBlock = schedule.Split(i, blockSize); + auto iStripe = schedule.Split(i, stripeSize); + auto jCache = schedule.Split(j, cacheCols); + + schedule.SetOrder({ iCache, jCache, iBlock, iStripe, j, i }); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = cacheRows * cacheCols; + size_t fillThreshold = blockSize * cacheCols; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + GeneralCachingStrategy cachingProvider{}; + schedule.Cache(cachingProvider, + input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + + // Get a handle to the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + int rawCacheSize = (int)rawCacheValue.GetLayout().NumElements(); + + auto cachedUpperLeft = MakeVector(rawCacheSize); + auto cachedUpperRight = MakeVector(rawCacheSize); + auto cachedLowerLeft = MakeVector(rawCacheSize); + auto cachedLowerRight = MakeVector(rawCacheSize); + + // Add a low level API kernel to access the underlying cache after it has been filled + auto cacheSpyKernel = loopnests::Kernel("cache_spy_kernel") + .Inputs(rawCacheValue, cachedUpperLeft, cachedUpperRight, cachedLowerLeft, cachedLowerRight) + .Indices(iTopLevel, jTopLevel) + .Define([cacheRows, cacheCols](Value rawCacheValue, Vector cachedUpperLeft, Vector cachedUpperRight, Vector cachedLowerLeft, Vector cachedLowerRight, Scalar i, Scalar j) { + auto cacheView = rawCacheValue; + cacheView.SetLayout({ { (int)rawCacheValue.GetLayout().NumElements() } }); + auto vectorCacheView = Vector(cacheView); + If(i == 0, + [&]() { + // TODO : remove nested if's + If(j == 0, + [&]() { + cachedUpperLeft = vectorCacheView; + }) + .ElseIf(j == cacheCols, + [&]() { + cachedUpperRight = vectorCacheView; + }); + }) + .ElseIf(i == cacheRows, + [&]() { + If(j == 0, [&]() { + cachedLowerLeft = vectorCacheView; + }).ElseIf(j == cacheCols, [&]() { + cachedLowerRight = vectorCacheView; + }); + }); + }); + auto cacheSpyPosition = loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::epilogue, { iCache, jCache }, {} }; + nest.GetUnderlyingLoopNest().AddKernel(cacheSpyKernel, cacheSpyPosition); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + auto ok = MakeScalar("ok"); + ok = 1; + auto printError = [&] { + DebugPrint("Upper Left:"); + DebugPrintVector(cachedUpperLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Upper Right:"); + DebugPrintVector(cachedUpperRight); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperRight); + 
DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Lower Left:"); + DebugPrintVector(cachedLowerLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedLowerLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Lower Right:"); + DebugPrintVector(cachedLowerRight); + DebugPrint("\n"); + DebugPrintVector(expectedCachedLowerRight); + DebugPrint("\n"); + DebugPrint("\n"); + }; + // TODO : replace nested if's + If(VerifySame(cachedUpperLeft, expectedCachedUpperLeft) == 0, [&]() { + If(VerifySame(cachedUpperRight, expectedCachedUpperRight) == 0, [&]() { + If(VerifySame(cachedLowerLeft, expectedCachedLowerLeft) == 0, [&]() { + If(VerifySame(cachedLowerRight, expectedCachedLowerRight) == 0, [&]() { + ok = 0; + }).Else(printError); + }).Else(printError); + }).Else(printError); + }).Else(printError); + return ok; +} + +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(int M, int N, int cacheRows, int cacheCols, int blockSize, int stripeSize) +{ + auto input = MakeIncrementingMatrix(M, N, "input"); + auto output = MakeMatrix(M, N, "output"); + auto expectedOutput = MakeIncrementingMatrix(M, N, "expectedOutput"); + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, M) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iCache = schedule.Split(i, cacheRows); + auto iBlock = schedule.Split(i, blockSize); + auto iStripe = schedule.Split(i, stripeSize); + auto jCache = schedule.Split(j, cacheCols); + + schedule.SetOrder({ iCache, jCache, iBlock, iStripe, j, i }); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = cacheRows * cacheCols; + size_t fillThreshold = blockSize * cacheCols; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + GeneralCachingStrategy cachingProvider{}; + schedule.Cache(cachingProvider, + input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + return VerifySame(output, expectedOutput); +} + +// input matrix rows doesn't evenly divide cache rows +// input matrix cols evenly divides cache cols +// blockSize == stripeSize == cacheRows / 2 +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test1() +{ + int M = 7; // M doesn't evenly divide the number of cache rows + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 2; + int stripeSize = 2; + + return GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, blockSize, stripeSize); +} + +// input matrix rows doesn't evenly divide cache rows, does evenly divide blocksize/stripesize +// input matrix cols evenly divides cache cols +// blockSize == stripeSize == cacheRows / 2 +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test2() +{ + int M = 6; // M doesn't evenly divide the number of cache rows + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 2; + int stripeSize = 2; + + return GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(M, 
N, cacheRows, cacheCols, blockSize, stripeSize); +} + +// input matrix rows doesn't evenly divide cache rows, does evenly divide blocksize/stripesize +// input matrix cols doesn't evenly divide cache cols +// blockSize == stripeSize == cacheRows / 2 +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test3() +{ + int M = 7; // M doesn't evenly divide the number of cache rows + int N = 6; + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 2; + int stripeSize = 2; + + return GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, blockSize, stripeSize); +} + +// input matrix rows doesn't evenly divide cache rows, does evenly divide blocksize/stripesize +// input matrix cols doesn't evenly divide cache cols +// blockSize == stripeSize == cacheRows / 2 +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test4() +{ + int M = 6; // M doesn't evenly divide the number of cache rows + int N = 6; + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 2; + int stripeSize = 2; + + return GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, blockSize, stripeSize); +} + +// input matrix rows < cache rows, doesn't evenly divide blocksize/stripesize +// input matrix cols evenly divides cache cols +// blockSize == stripeSize == cacheRows / 2 +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test5() +{ + int M = 3; // M < cache rows + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 2; + int stripeSize = 2; + + return GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, blockSize, stripeSize); +} + +// input matrix rows < cache rows, evenly divides blocksize/stripesize +// input matrix cols evenly divides cache cols +// blockSize == stripeSize == cacheRows / 2 +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test6() +{ + int M = 2; // M < cache rows, evenly divides stripesize + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 2; + int stripeSize = 2; + + return GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, blockSize, stripeSize); +} + +// input matrix rows < cache rows, doesn't evenly divide blocksize/stripesize +// input matrix cols < cache cols +// blockSize == stripeSize == cacheRows / 2 +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test7() +{ + int M = 3; // M < cache rows, doesn't evenly divide stripesize + int N = 3; + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 2; + int stripeSize = 2; + + return GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, blockSize, stripeSize); +} + +// input matrix rows < cache rows, evenly divides blocksize/stripesize +// input matrix cols < cache cols +// blockSize == stripeSize == cacheRows / 2 +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test8() +{ + int M = 2; // M < cache rows, evenly divides stripesize + int N = 2; + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 2; + int stripeSize = 2; + + return GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, 
blockSize, stripeSize); +} + +// input matrix rows multiple of cache rows +// input matrix cols < cache cols +// blockSize == stripeSize == cacheRows / 2 +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test9() +{ + int M = 8; + int N = 2; // N < cache cols + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 2; + int stripeSize = 2; + + return GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, blockSize, stripeSize); +} + +// input matrix rows doesn't evenly divide cache rows +// input matrix cols evenly divides cache cols +// stripeSize == blockSize / 2, blockSize == cacheRows +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test1() +{ + int M = 7; // M doesn't evenly divide the number of cache rows + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 4; + int stripeSize = 2; + + return GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, blockSize, stripeSize); +} + +// input matrix rows doesn't evenly divide cache rows, does evenly divide blocksize/stripesize +// input matrix cols evenly divides cache cols +// stripeSize == blockSize / 2, blockSize == cacheRows +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test2() +{ + int M = 6; // M doesn't evenly divide the number of cache rows + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 4; + int stripeSize = 2; + + return GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, blockSize, stripeSize); +} + +// input matrix rows doesn't evenly divide cache rows, does evenly divide blocksize/stripesize +// input matrix cols doesn't evenly divide cache cols +// stripeSize == blockSize / 2, blockSize == cacheRows +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test3() +{ + int M = 7; // M doesn't evenly divide the number of cache rows + int N = 6; + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 4; + int stripeSize = 2; + + return GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, blockSize, stripeSize); +} + +// input matrix rows doesn't evenly divide cache rows, does evenly divide blocksize/stripesize +// input matrix cols doesn't evenly divide cache cols +// stripeSize == blockSize / 2, blockSize == cacheRows +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test4() +{ + int M = 6; // M doesn't evenly divide the number of cache rows + int N = 6; + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 4; + int stripeSize = 2; + + return GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, blockSize, stripeSize); +} + +// input matrix rows < cache rows, doesn't evenly divide blocksize/stripesize +// input matrix cols evenly divides cache cols +// stripeSize == blockSize / 2, blockSize == cacheRows +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test5() +{ + int M = 3; // M < cache rows + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 4; + int stripeSize = 2; + + return GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, 
blockSize, stripeSize); +} + +// input matrix rows < cache rows, evenly divides blocksize/stripesize +// input matrix cols evenly divides cache cols +// stripeSize == blockSize / 2, blockSize == cacheRows +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test6() +{ + int M = 2; // M < cache rows, evenly divides stripesize + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 4; + int stripeSize = 2; + + return GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, blockSize, stripeSize); +} + +// input matrix rows < cache rows, doesn't evenly divide blocksize/stripesize +// input matrix cols < cache cols +// stripeSize == blockSize / 2, blockSize == cacheRows +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test7() +{ + int M = 3; // M < cache rows, doesn't evenly divide stripesize + int N = 3; + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 4; + int stripeSize = 2; + + return GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, blockSize, stripeSize); +} + +// input matrix rows < cache rows, evenly divides blocksize/stripesize +// input matrix cols < cache cols +// stripeSize == blockSize / 2, blockSize == cacheRows +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test8() +{ + int M = 2; // M < cache rows, evenly divides stripesize + int N = 2; + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 4; + int stripeSize = 2; + + return GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, blockSize, stripeSize); +} + +// input matrix rows multiple of cache rows +// input matrix cols < cache cols +// stripeSize == blockSize / 2, blockSize == cacheRows +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test9() +{ + int M = 8; + int N = 2; // N < cache cols + int cacheRows = 4; + int cacheCols = 4; + int blockSize = 4; + int stripeSize = 2; + + return GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, blockSize, stripeSize); +} + +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner(int M, int N, int cacheRows, int cacheCols, int blockSize, int stripeSize, Vector expectedCachedUpperLeft, Vector expectedCachedUpperRight, Vector expectedCachedLowerLeft, Vector expectedCachedLowerRight) +{ + auto input = MakeIncrementingMatrix(M, N, "input"); + auto output = MakeMatrix(M, N, "output"); + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, M) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iCache = schedule.Split(i, cacheRows); + auto iBlock = schedule.Split(i, blockSize); + auto iStripe = schedule.Split(i, stripeSize); + auto jCache = schedule.Split(j, cacheCols); + + schedule.SetOrder({ iCache, jCache, iBlock, iStripe, j, i }); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = cacheRows * cacheCols; + size_t fillThreshold = blockSize * cacheCols; + std::function reduceFunction = CopyReduce; + auto 
extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + GeneralCachingStrategy cachingProvider{}; + schedule.Cache(cachingProvider, + input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + + // Get a handle to the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + int rawCacheSize = (int)rawCacheValue.GetLayout().NumElements(); + + auto cachedUpperLeft = MakeVector(rawCacheSize); + auto cachedUpperRight = MakeVector(rawCacheSize); + auto cachedLowerLeft = MakeVector(rawCacheSize); + auto cachedLowerRight = MakeVector(rawCacheSize); + + // Add a low level API kernel to access the underlying cache after it has been filled + auto cacheSpyKernel = loopnests::Kernel("cache_spy_kernel") + .Inputs(rawCacheValue, cachedUpperLeft, cachedUpperRight, cachedLowerLeft, cachedLowerRight) + .Indices(iTopLevel, jTopLevel) + .Define([cacheRows, cacheCols](Value rawCacheValue, Vector cachedUpperLeft, Vector cachedUpperRight, Vector cachedLowerLeft, Vector cachedLowerRight, Scalar i, Scalar j) { + auto cacheView = rawCacheValue; + cacheView.SetLayout({ { (int)rawCacheValue.GetLayout().NumElements() } }); + auto vectorCacheView = Vector(cacheView); + If(i == 0, + [&]() { + // TODO : remove nested if's + If(j == 0, + [&]() { + cachedUpperLeft = vectorCacheView; + }) + .ElseIf(j == cacheCols, + [&]() { + cachedUpperRight = vectorCacheView; + }); + }) + .ElseIf(i == cacheRows, + [&]() { + If(j == 0, [&]() { + cachedLowerLeft = vectorCacheView; + }).ElseIf(j == cacheCols, [&]() { + cachedLowerRight = vectorCacheView; + }); + }); + }); + auto cacheSpyPosition = loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::epilogue, { iCache, jCache }, {} }; + nest.GetUnderlyingLoopNest().AddKernel(cacheSpyKernel, cacheSpyPosition); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + auto ok = MakeScalar("ok"); + ok = 1; + auto printError = [&] { + DebugPrint("Upper Left:"); + DebugPrintVector(cachedUpperLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Upper Right:"); + DebugPrintVector(cachedUpperRight); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperRight); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Lower Left:"); + DebugPrintVector(cachedLowerLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedLowerLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Lower Right:"); + DebugPrintVector(cachedLowerRight); + DebugPrint("\n"); + DebugPrintVector(expectedCachedLowerRight); + DebugPrint("\n"); + DebugPrint("\n"); + }; + // TODO : replace nested if's + If(VerifySame(cachedUpperLeft, expectedCachedUpperLeft) == 0, [&]() { + If(VerifySame(cachedUpperRight, expectedCachedUpperRight) == 0, [&]() { + If(VerifySame(cachedLowerLeft, expectedCachedLowerLeft) == 0, [&]() { + If(VerifySame(cachedLowerRight, expectedCachedLowerRight) == 0, [&]() { + ok = 0; + }).Else(printError); + }).Else(printError); + }).Else(printError); + }).Else(printError); + return ok; +} + +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner_LeftCachesOnly(int M, int N, int cacheRows, int cacheCols, int blockSize, int stripeSize, Vector expectedCachedUpperLeft, Vector expectedCachedLowerLeft) +{ + auto input = MakeIncrementingMatrix(M, N, "input"); + auto output = MakeMatrix(M, N, "output"); + Index i("i"), j("j"); + auto nest = Using({ input 
}, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, M) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iCache = schedule.Split(i, cacheRows); + auto iBlock = schedule.Split(i, blockSize); + auto iStripe = schedule.Split(i, stripeSize); + auto jCache = schedule.Split(j, cacheCols); + + schedule.SetOrder({ iCache, jCache, iBlock, iStripe, j, i }); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = cacheRows * cacheCols; + size_t fillThreshold = blockSize * cacheCols; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + GeneralCachingStrategy cachingProvider{}; + schedule.Cache(cachingProvider, + input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + + // Get a handle to the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + int rawCacheSize = (int)rawCacheValue.GetLayout().NumElements(); + + auto cachedUpperLeft = MakeVector(rawCacheSize); + auto cachedLowerLeft = MakeVector(rawCacheSize); + + // Add a low level API kernel to access the underlying cache after it has been filled + auto cacheSpyKernel = loopnests::Kernel("cache_spy_kernel") + .Inputs(rawCacheValue, cachedUpperLeft, cachedLowerLeft) + .Indices(iTopLevel, jTopLevel) + .Define([cacheRows](Value rawCacheValue, Vector cachedUpperLeft, Vector cachedLowerLeft, Scalar i, Scalar j) { + auto cacheView = rawCacheValue; + cacheView.SetLayout({ { (int)rawCacheValue.GetLayout().NumElements() } }); + auto vectorCacheView = Vector(cacheView); + If(i == 0, + [&]() { + // TODO : remove nested if's + If(j == 0, + [&]() { + cachedUpperLeft = vectorCacheView; + }); + }) + .ElseIf(i == cacheRows, + [&]() { + If(j == 0, [&]() { + cachedLowerLeft = vectorCacheView; + }); + }); + }); + auto cacheSpyPosition = loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::epilogue, { iCache, jCache }, {} }; + nest.GetUnderlyingLoopNest().AddKernel(cacheSpyKernel, cacheSpyPosition); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + auto ok = MakeScalar("ok"); + ok = 1; + auto printError = [&] { + DebugPrint("Upper Left:"); + DebugPrintVector(cachedUpperLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Lower Left:"); + DebugPrintVector(cachedLowerLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedLowerLeft); + DebugPrint("\n"); + DebugPrint("\n"); + }; + // TODO : replace nested if's + If(VerifySame(cachedUpperLeft, expectedCachedUpperLeft) == 0, [&]() { + If(VerifySame(cachedLowerLeft, expectedCachedLowerLeft) == 0, [&]() { + ok = 0; + }).Else(printError); + }).Else(printError); + return ok; +} + +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner_UpperCachesOnly(int M, int N, int cacheRows, int cacheCols, int blockSize, int stripeSize, Vector expectedCachedUpperLeft, Vector expectedCachedUpperRight) +{ + auto input = MakeIncrementingMatrix(M, N, "input"); + auto output = MakeMatrix(M, N, "output"); + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, M) + .ForAll(j, 0, N) + 
.Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto jTopLevel = j; + + auto iCache = schedule.Split(i, cacheRows); + auto iBlock = schedule.Split(i, blockSize); + auto iStripe = schedule.Split(i, stripeSize); + auto jCache = schedule.Split(j, cacheCols); + + schedule.SetOrder({ iCache, jCache, iBlock, iStripe, j, i }); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = cacheRows * cacheCols; + size_t fillThreshold = blockSize * cacheCols; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + GeneralCachingStrategy cachingProvider{}; + schedule.Cache(cachingProvider, + input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + + // Get a handle to the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + int rawCacheSize = (int)rawCacheValue.GetLayout().NumElements(); + + auto cachedUpperLeft = MakeVector(rawCacheSize); + auto cachedUpperRight = MakeVector(rawCacheSize); + + // Add a low level API kernel to access the underlying cache after it has been filled + auto cacheSpyKernel = loopnests::Kernel("cache_spy_kernel") + .Inputs(rawCacheValue, cachedUpperLeft, cachedUpperRight) + .Indices(iTopLevel, jTopLevel) + .Define([cacheCols](Value rawCacheValue, Vector cachedUpperLeft, Vector cachedUpperRight, Scalar i, Scalar j) { + auto cacheView = rawCacheValue; + cacheView.SetLayout({ { (int)rawCacheValue.GetLayout().NumElements() } }); + auto vectorCacheView = Vector(cacheView); + If(i == 0, + [&]() { + // TODO : remove nested if's + If(j == 0, + [&]() { + cachedUpperLeft = vectorCacheView; + }) + .ElseIf(j == cacheCols, + [&]() { + cachedUpperRight = vectorCacheView; + }); + }); + }); + auto cacheSpyPosition = loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::epilogue, { iCache, jCache }, {} }; + nest.GetUnderlyingLoopNest().AddKernel(cacheSpyKernel, cacheSpyPosition); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + auto ok = MakeScalar("ok"); + ok = 1; + auto printError = [&] { + DebugPrint("Upper Left:"); + DebugPrintVector(cachedUpperLeft); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperLeft); + DebugPrint("\n"); + DebugPrint("\n"); + DebugPrint("Upper Right:"); + DebugPrintVector(cachedUpperRight); + DebugPrint("\n"); + DebugPrintVector(expectedCachedUpperRight); + DebugPrint("\n"); + DebugPrint("\n"); + }; + // TODO : replace nested if's + If(VerifySame(cachedUpperLeft, expectedCachedUpperLeft) == 0, [&]() { + If(VerifySame(cachedUpperRight, expectedCachedUpperRight) == 0, [&]() { + ok = 0; + }).Else(printError); + }).Else(printError); + return ok; +} + +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner_UpperLeftCacheOnly(int M, int N, int cacheRows, int cacheCols, int blockSize, int stripeSize, Vector expectedCachedUpperLeft) +{ + auto input = MakeIncrementingMatrix(M, N, "input"); + auto output = MakeMatrix(M, N, "output"); + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::Output) + .ForAll(i, 0, M) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { + output(i, j) = input(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto iTopLevel = i; + auto 
jTopLevel = j; + + auto iCache = schedule.Split(i, cacheRows); + auto iBlock = schedule.Split(i, blockSize); + auto iStripe = schedule.Split(i, stripeSize); + auto jCache = schedule.Split(j, cacheCols); + + schedule.SetOrder({ iCache, jCache, iBlock, iStripe, j, i }); + + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheInput"; + size_t maxCacheElts = cacheRows * cacheCols; + size_t fillThreshold = blockSize * cacheCols; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + GeneralCachingStrategy cachingProvider{}; + schedule.Cache(cachingProvider, + input, + { iTopLevel, jTopLevel }, + {}, + {}, + std::nullopt, + extraCacheParams); + + // Get a handle to the underlying cached memory + auto rawCacheValue = cachingProvider._rawCache; + int rawCacheSize = (int)rawCacheValue.GetLayout().NumElements(); + + auto cachedUpperLeft = MakeVector(rawCacheSize); + + // Add a low level API kernel to access the underlying cache after it has been filled + auto cacheSpyKernel = loopnests::Kernel("cache_spy_kernel") + .Inputs(rawCacheValue, cachedUpperLeft) + .Indices(iTopLevel, jTopLevel) + .Define([](Value rawCacheValue, Vector cachedUpperLeft, Scalar i, Scalar j) { + auto cacheView = rawCacheValue; + cacheView.SetLayout({ { (int)rawCacheValue.GetLayout().NumElements() } }); + auto vectorCacheView = Vector(cacheView); + If(i == 0, + [&]() { + // TODO : remove nested if's + If(j == 0, + [&]() { + cachedUpperLeft = vectorCacheView; + }); + }); + }); + auto cacheSpyPosition = loopnests::CodePositionConstraints{ loopnests::LoopFragmentType::epilogue, { iCache, jCache }, {} }; + nest.GetUnderlyingLoopNest().AddKernel(cacheSpyKernel, cacheSpyPosition); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + return VerifySame(cachedUpperLeft, expectedCachedUpperLeft); +} + +// input matrix rows doesn't evenly divide cache rows +// input matrix cols evenly divides cache cols +// blockSize == stripeSize == cacheRows / 2 +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test1() +{ + int M = 7; // M doesn't evenly divide the number of cache rows + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int smallBlockSize = 2; + int largeBlockSize = 4; + int stripeSize = 2; + + // input + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7] + // [ 8, 9, 10, 11, 12, 13, 14, 15] + // [16, 17, 18, 19, 20, 21, 22, 23] + // [24, 25, 26, 27, 28, 29, 30, 31] + // [32, 33, 34, 35, 36, 37, 38, 39] + // [40, 41, 42, 43, 44, 45, 46, 47] + // [48, 49, 50, 51, 52, 53, 54, 55] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 8, 1, 9, 2, 10, 3, 11, + 16, 24, 17, 25, 18, 26, 19, 27 + }; + Vector expectedCachedUpperRight = + { + 4, 12, 5, 13, 6, 14, 7, 15, + 20, 28, 21, 29, 22, 30, 23, 31 + }; + Vector expectedCachedLowerLeft = + { + 32, 40, 33, 41, 34, 42, 35, 43, + 48, 0, 49, 0, 50, 0, 51, 0 + }; + Vector expectedCachedLowerRight = + { + 36, 44, 37, 45, 38, 46, 39, 47, + 52, 0, 53, 0, 54, 0, 55, 0 + }; + // clang-format on + + auto smallBlockResult = GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, smallBlockSize, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight, expectedCachedLowerLeft, expectedCachedLowerRight); + auto largeBlockResult = GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner(M, N, cacheRows, 
cacheCols, largeBlockSize, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight, expectedCachedLowerLeft, expectedCachedLowerRight); + return smallBlockResult + largeBlockResult; +} + +// input matrix rows doesn't evenly divide cache rows, does evenly divide blocksize/stripesize +// input matrix cols evenly divides cache cols +// blockSize == stripeSize == cacheRows / 2 +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test2() +{ + int M = 6; // M doesn't evenly divide the number of cache rows + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int smallBlockSize = 2; + int largeBlockSize = 4; + int stripeSize = 2; + + // input + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7] + // [ 8, 9, 10, 11, 12, 13, 14, 15] + // [16, 17, 18, 19, 20, 21, 22, 23] + // [24, 25, 26, 27, 28, 29, 30, 31] + // [32, 33, 34, 35, 36, 37, 38, 39] + // [40, 41, 42, 43, 44, 45, 46, 47] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 8, 1, 9, 2, 10, 3, 11, + 16, 24, 17, 25, 18, 26, 19, 27 + }; + Vector expectedCachedUpperRight = + { + 4, 12, 5, 13, 6, 14, 7, 15, + 20, 28, 21, 29, 22, 30, 23, 31 + }; + Vector expectedCachedLowerLeft = + { + 32, 40, 33, 41, 34, 42, 35, 43, + 0, 0, 0, 0, 0, 0, 0, 0 + }; + Vector expectedCachedLowerRight = + { + 36, 44, 37, 45, 38, 46, 39, 47, + 0, 0, 0, 0, 0, 0, 0, 0 + }; + // clang-format on + + auto smallBlockResult = GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, smallBlockSize, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight, expectedCachedLowerLeft, expectedCachedLowerRight); + auto largeBlockResult = GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, largeBlockSize, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight, expectedCachedLowerLeft, expectedCachedLowerRight); + return smallBlockResult + largeBlockResult; +} + +// input matrix rows doesn't evenly divide cache rows, does evenly divide blocksize/stripesize +// input matrix cols doesn't evenly divide cache cols +// blockSize == stripeSize == cacheRows / 2 +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test3() +{ + int M = 7; // M doesn't evenly divide the number of cache rows + int N = 6; + int cacheRows = 4; + int cacheCols = 4; + int smallBlockSize = 2; + int largeBlockSize = 4; + int stripeSize = 2; + + // input + // A: + // [ 0, 1, 2, 3, 4, 5] + // [ 6, 7, 8, 9, 10, 11] + // [12, 13, 14, 15, 16, 17] + // [18, 19, 20, 21, 22, 23] + // [24, 25, 26, 27, 28, 29] + // [30, 31, 32, 33, 34, 35] + // [36, 37, 38, 39, 40, 41] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 6, 1, 7, 2, 8, 3, 9, + 12, 18, 13, 19, 14, 20, 15, 21 + }; + Vector expectedCachedUpperRight = + { + 4, 10, 5, 11, 16, 22, 17, 23, + 0, 0, 0, 0, 0, 0, 0, 0 + }; + Vector expectedCachedLowerLeft = + { + 24, 30, 25, 31, 26, 32, 27, 33, + 36, 0, 37, 0, 38, 0, 39, 0 + }; + Vector expectedCachedLowerRight = + { + 28, 34, 29, 35, 40, 0, 41, 0, + 0, 0, 0, 0, 0, 0, 0, 0 + }; + // clang-format on + + auto smallBlockResult = GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, smallBlockSize, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight, expectedCachedLowerLeft, expectedCachedLowerRight); + auto largeBlockResult = GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, 
largeBlockSize, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight, expectedCachedLowerLeft, expectedCachedLowerRight); + return smallBlockResult + largeBlockResult; +} + +// input matrix rows doesn't evenly divide cache rows, does evenly divide blocksize/stripesize +// input matrix cols doesn't evenly divide cache cols +// blockSize == stripeSize == cacheRows / 2 +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test4() +{ + int M = 6; // M doesn't evenly divide the number of cache rows + int N = 6; + int cacheRows = 4; + int cacheCols = 4; + int smallBlockSize = 2; + int largeBlockSize = 4; + int stripeSize = 2; + + // input + // A: + // [ 0, 1, 2, 3, 4, 5], + // [ 6, 7, 8, 9, 10, 11], + // [12, 13, 14, 15, 16, 17], + // [18, 19, 20, 21, 22, 23], + // [24, 25, 26, 27, 28, 29], + // [30, 31, 32, 33, 34, 35] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 6, 1, 7, 2, 8, 3, 9, + 12, 18, 13, 19, 14, 20, 15, 21 + }; + Vector expectedCachedUpperRight = + { + 4, 10, 5, 11, 16, 22, 17, 23, + 0, 0, 0, 0, 0, 0, 0, 0 + }; + Vector expectedCachedLowerLeft = + { + 24, 30, 25, 31, 26, 32, 27, 33, + 0, 0, 0, 0, 0, 0, 0, 0 + }; + Vector expectedCachedLowerRight = + { + 28, 34, 29, 35, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 + }; + // clang-format on + + auto smallBlockResult = GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, smallBlockSize, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight, expectedCachedLowerLeft, expectedCachedLowerRight); + auto largeBlockResult = GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner(M, N, cacheRows, cacheCols, largeBlockSize, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight, expectedCachedLowerLeft, expectedCachedLowerRight); + return smallBlockResult + largeBlockResult; +} + +// input matrix rows < cache rows, doesn't evenly divide blocksize/stripesize +// input matrix cols evenly divides cache cols +// blockSize == stripeSize == cacheRows / 2 +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test5() +{ + int M = 3; // M < cache rows + int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int smallBlockSize = 2; + int largeBlockSize = 4; + int stripeSize = 2; + + // input + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7], + // [ 8, 9, 10, 11, 12, 13, 14, 15], + // [16, 17, 18, 19, 20, 21, 22, 23] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 8, 1, 9, 2, 10, 3, 11, + 16, 0, 17, 0, 18, 0, 19, 0 + }; + Vector expectedCachedUpperRight = + { + 4, 12, 5, 13, 6, 14, 7, 15, + 20, 0, 21, 0, 22, 0, 23, 0 + }; + // clang-format on + + auto smallBlockResult = GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner_UpperCachesOnly(M, N, cacheRows, cacheCols, smallBlockSize, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight); + auto largeBlockResult = GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner_UpperCachesOnly(M, N, cacheRows, cacheCols, largeBlockSize, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight); + return smallBlockResult + largeBlockResult; +} + +// input matrix rows < cache rows, evenly divides blocksize/stripesize +// input matrix cols evenly divides cache cols +// blockSize == stripeSize == cacheRows / 2 +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test6() +{ + int M = 2; // M < cache rows, evenly divides stripesize + 
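+ // With M == 2 a single stripe covers the whole input, so only the first
+ // M * cacheCols entries of each cache block get filled; the trailing entries
+ // stay zero, as the expected vectors below encode.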
int N = 8; + int cacheRows = 4; + int cacheCols = 4; + int smallBlockSize = 2; + int largeBlockSize = 4; + int stripeSize = 2; + + // input + // A: + // [ 0, 1, 2, 3, 4, 5, 6, 7], + // [ 8, 9, 10, 11, 12, 13, 14, 15] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 8, 1, 9, 2, 10, 3, 11, + 0, 0, 0, 0, 0, 0, 0, 0 + }; + Vector expectedCachedUpperRight = + { + 4, 12, 5, 13, 6, 14, 7, 15, + 0, 0, 0, 0, 0, 0, 0, 0 + }; + // clang-format on + + auto smallBlockResult = GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner_UpperCachesOnly(M, N, cacheRows, cacheCols, smallBlockSize, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight); + auto largeBlockResult = GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner_UpperCachesOnly(M, N, cacheRows, cacheCols, largeBlockSize, stripeSize, expectedCachedUpperLeft, expectedCachedUpperRight); + return smallBlockResult + largeBlockResult; +} + +// input matrix rows < cache rows, doesn't evenly divide blocksize/stripesize +// input matrix cols < cache cols +// blockSize == stripeSize == cacheRows / 2 +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test7() +{ + int M = 3; // M < cache rows, doesn't evenly divide stripesize + int N = 3; + int cacheRows = 4; + int cacheCols = 4; + int smallBlockSize = 2; + int largeBlockSize = 4; + int stripeSize = 2; + + // input + // A: + // [0, 1, 2], + // [3, 4, 5], + // [6, 7, 8] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 3, 1, 4, 2, 5, 6, 0, + 7, 0, 8, 0, 0, 0, 0, 0 + }; + // clang-format on + auto smallBlockResult = GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner_UpperLeftCacheOnly(M, N, cacheRows, cacheCols, smallBlockSize, stripeSize, expectedCachedUpperLeft); + auto largeBlockResult = GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner_UpperLeftCacheOnly(M, N, cacheRows, cacheCols, largeBlockSize, stripeSize, expectedCachedUpperLeft); + return smallBlockResult + largeBlockResult; +} + +// input matrix rows < cache rows, evenly divides blocksize/stripesize +// input matrix cols < cache cols +// blockSize == stripeSize == cacheRows / 2 +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test8() +{ + int M = 2; // M < cache rows, evenly divides stripesize + int N = 2; + int cacheRows = 4; + int cacheCols = 4; + int smallBlockSize = 2; + int largeBlockSize = 4; + int stripeSize = 2; + + // input + // A: + // [0, 1] + // [2, 3] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 2, 1, 3, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 + }; + // clang-format on + + auto smallBlockResult = GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner_UpperLeftCacheOnly(M, N, cacheRows, cacheCols, smallBlockSize, stripeSize, expectedCachedUpperLeft); + auto largeBlockResult = GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner_UpperLeftCacheOnly(M, N, cacheRows, cacheCols, largeBlockSize, stripeSize, expectedCachedUpperLeft); + return smallBlockResult + largeBlockResult; +} + +// input matrix rows multiple of cache rows +// input matrix cols < cache cols +// blockSize == stripeSize == cacheRows / 2 +Scalar GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test9() +{ + int M = 8; + int N = 2; // N < cache cols + int cacheRows = 4; + int cacheCols = 4; + int smallBlockSize = 2; + int largeBlockSize = 
4; + int stripeSize = 2; + + // input + // A: + // [ 0, 1], + // [ 2, 3], + // [ 4, 5], + // [ 6, 7], + // [ 8, 9], + // [10, 11], + // [12, 13], + // [14, 15] + // clang-format off + Vector expectedCachedUpperLeft = + { + 0, 2, 1, 3, 4, 6, 5, 7, + 0, 0, 0, 0, 0, 0, 0, 0 + }; + Vector expectedCachedLowerLeft = + { + 8, 10, 9, 11, 12, 14, 13, 15, + 0, 0, 0, 0, 0, 0, 0, 0 + }; + // clang-format on + + auto smallBlockResult = GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner_LeftCachesOnly(M, N, cacheRows, cacheCols, smallBlockSize, stripeSize, expectedCachedUpperLeft, expectedCachedLowerLeft); + auto largeBlockResult = GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Runner_LeftCachesOnly(M, N, cacheRows, cacheCols, largeBlockSize, stripeSize, expectedCachedUpperLeft, expectedCachedLowerLeft); + return smallBlockResult + largeBlockResult; +} + +} // namespace ell diff --git a/libraries/value/test/src/Functions_test.cpp b/libraries/value/test/src/Functions_test.cpp new file mode 100644 index 000000000..b28ecb1fc --- /dev/null +++ b/libraries/value/test/src/Functions_test.cpp @@ -0,0 +1,61 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: Functions_test.cpp (value) +// Authors: Kern Handa, Chuck Jacobs +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "Functions_test.h" +#include "TestUtil.h" + +#include +#include +#include + +#include + +#if !defined(WIN32) +#include +#include +#include +#else +#include +#endif // !defined(WIN32) + +using namespace ell::utilities; +using namespace ell::value; + +#define PRINT_IR 0 + +namespace ell +{ +Scalar FunctionArgType_test() +{ + auto fn = DeclareFunction("FunctionArgType_test") + .Parameters( + Value(ValueType::Float, ScalarLayout), + Value({ValueType::Float, 0}, ScalarLayout), + Value(ValueType::Int32, ScalarLayout), + Value({ValueType::Int32, 0}, ScalarLayout)) + .Returns(Value(ValueType::Int32, ScalarLayout)) + .Define([](Scalar f, Scalar f0, Scalar i, Scalar i0) { + auto ff = MakeScalar<float>(); + auto ff0 = MakeScalar<float>(); + auto ii = MakeScalar<int>(); + auto ii0 = MakeScalar<int>(); + + ff = f; + ff0 = f0; + ii = i; + ii0 = i0; + return Scalar(0); + }); + + auto arg1 = MakeScalar<float>(); + auto arg2 = MakeScalar<float>(); + auto arg3 = MakeScalar<int>(); + auto arg4 = MakeScalar<int>(); + return fn(arg1, arg2, arg3, arg4); +} +} // namespace ell diff --git a/libraries/value/test/src/LoopNestAPI_test.cpp b/libraries/value/test/src/LoopNestAPI_test.cpp new file mode 100644 index 000000000..4133c2c0b --- /dev/null +++ b/libraries/value/test/src/LoopNestAPI_test.cpp @@ -0,0 +1,1090 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: LoopNestAPI_test.cpp (value) +// Authors: Kern Handa +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "LoopNestAPI_test.h" +#include "LoopNest_kernels.h" +#include "TestUtil.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include + +#if 0 // DEBUGGING +#include +#endif + +using namespace ell::utilities; +using namespace ell::value; +using namespace ell::logging; + +namespace ell +{
Scalar LoopNest_api_test1() +{ + auto matrix = MakeMatrix(4, 5); + Index i("i"), j("j"); 
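+ // loopnest_kernel is declared in LoopNest_kernels.h; judging from the check
+ // below it presumably computes m(i, j) = 2 * i + 5 * j (2 * 2 + 5 * 3 == 19).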
+ + Using({ matrix }, ArgumentType::Output) + .ForAll(i, 0, 4) + .ForAll(j, 0, 5) + .Do(loopnest_kernel) + .Run(); + + return matrix(2, 3) - 19; // will return 0 if calculation is correct +} + +Scalar LoopNest_api_test2() +{ + auto matrix = MakeMatrix(4, 5); + Index i("i"), j("j"); + + auto nest = Using({ matrix }, ArgumentType::Output) + .ForAll(i, 0, 4) + .ForAll(j, 0, 5) + .Do(loopnest_kernel); + + nest.GetSchedule().Split(i, 2); + + nest.Run(); + + return matrix(2, 3) - 19; // will return 0 if calculation is correct +} + +Scalar LoopNest_api_test3() +{ + // Declare the input matrix + std::vector<std::vector<int>> dt{ + std::vector<int>{ 1, 2, 3 }, + std::vector<int>{ 4, 5, 6 }, + }; + auto matrix = Matrix(dt); + // Declare the output matrix and initialize its values to 10. + auto output = MakeMatrix(static_cast<int>(matrix.Rows()), static_cast<int>(matrix.Columns())); + For(output, [&](Scalar row, Scalar column) { + output(row, column) = 10; + }); + + Index i("i"), j("j"); + + //// Use a Loopnest to call loopnest_kernel_3 for each element of the input matrix and write the result to + //// our output. + Using({ output }, ArgumentType::Output) + .Using({ matrix }, ArgumentType::Input) + .ForAll(i, 0, static_cast<int>(matrix.Rows())) + .ForAll(j, 0, static_cast<int>(matrix.Columns())) + .Do(loopnest_kernel_3) + .Run(); + + // loopnest_kernel_3 will add the input element to the output element. + // Since we initialized the output to 10, we expect the result to be + // 10 greater than the input. + std::vector<int> expectedValues{ 11, 12, 13, 14, 15, 16 }; + auto expected = Vector(expectedValues); + + // View the result as a Vector + Vector actual = AsVector(AsFullView(output)); + + // Verify that the actual result is what we expect + return VerifySame(actual, expected); +} + +Scalar LoopNest_api_test4() +{ + // Declare the output matrix and initialize its values to 0. + auto output = MakeMatrix(2, 6); + + Index i("i"), j("j"); + + // Use a Loopnest to call loopnest_kernel_4 for each element of the input matrix and write the result to + // our output. + auto nest = Using({ output }, ArgumentType::Output) + .Using({ output }, ArgumentType::Input) // this isn't how you'd write in real life, hopefully (using the same memory for both input and output) + .ForAll(i, 0, static_cast<int>(output.Rows())) + .ForAll(j, 0, static_cast<int>(output.Columns())) + .Do(loopnest_kernel_4); + + nest.GetSchedule().Split(j, 2); + + nest.Run(); + + // loopnest_kernel_4 will multiply row by 10 and add the column. + std::vector<int> expectedValues{ 0, 1, 2, 3, 4, 5, 10, 11, 12, 13, 14, 15 }; + auto expected = Vector(expectedValues); + + // View the result as a Vector + Vector actual = AsVector(AsFullView(output)); + + // Verify that the actual result is what we expect + return VerifySame(actual, expected); +} + +Scalar LoopNest_api_test5() +{ + // Declare the output matrix and initialize its values to 0. + auto output = MakeMatrix(2, 8); + + Index i("i"), j("j"); + + // Use a Loopnest to call loopnest_kernel_4 for each element of the input matrix and write the result to + // our output. + auto nest = Using({ output }, ArgumentType::Output) + .Using({ output }, ArgumentType::Input) // this isn't how you'd write in real life, hopefully (using the same memory for both input and output) + .ForAll(i, 0, static_cast<int>(output.Rows())) + .ForAll(j, 0, static_cast<int>(output.Columns())) + .Do(loopnest_kernel_4); + + auto& schedule = nest.GetSchedule(); + schedule.Split(j, 4); + schedule.Split(j, 2); + + nest.Run(); + + // loopnest_kernel_4 will multiply row by 10 and add the column. 
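+ // The two splits partition j into blocks of 4 and sub-blocks of 2; since each
+ // element is written independently, the reordering cannot change the result,
+ // which is what the expected values below verify.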
+ std::vector expectedValues{ 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, 16, 17 }; + auto expected = Vector(expectedValues); + + // View the result as a Vector + Vector actual = AsVector(AsFullView(output)); + + // Verify that the actual result is what we expect + return VerifySame(actual, expected); +} + +Scalar LoopNest_api_Parallelized_test1() +{ + Scalar ok = Allocate(ScalarLayout); + auto matrix = MakeMatrix(4, 5); + InvokeForContext([&] { + auto v = matrix.GetValue().Get().GetDataAs(); + v->setName("matrix"); + }); + + Index i("i"), j("j"); + + auto nest = Using({ matrix }, ArgumentType::Output) + .ForAll(i, 0, 4) + .ForAll(j, 0, 5) + .Do([](Matrix m, Scalar i, Scalar j) { + Scalar tid = GetTID(); +#if 1 // Useful to turn off/on for debugging + InvokeForContext([&](auto&) { + auto iInt = i.Get(); + auto jInt = j.Get(); + Log() << "m(" << iInt << ", " << jInt << ") = " << (iInt * 2 + jInt * 5) + << " [Thread " << tid.Get() << "]" + << EOL; + }); +#endif // 1 + m(i, j) = i * 2 + j * 5; + }); + + nest.GetSchedule().Parallelize(i, 2); + + nest.Run(); + + ok = matrix(2, 3) - 19; + return ok; // will return 0 if calculation is correct +} + +Scalar LoopNest_api_Parallelized_test2() +{ + Scalar ok = Allocate(ScalarLayout); + + auto matrix = MakeMatrix(4, 5); + Index i("i"), j("j"); + + auto nest = Using({ matrix }, ArgumentType::Output) + .ForAll(i, 0, 4) + .ForAll(j, 0, 5) + .Do([](Matrix m, Scalar i, Scalar j) { + Scalar tid = GetTID(); +#if 1 // Useful to turn off/on for debugging + InvokeForContext([&](auto&) { + auto iInt = i.Get(); + auto jInt = j.Get(); + Log() << "m(" << iInt << ", " << jInt << ") = " << tid.Get() + << " [Thread " << tid.Get() << "]" << EOL; + }); +#endif // 1 + m(i, j) = tid; + }); + + nest.GetSchedule().Parallelize(i, 2); + + nest.Run(); + + auto expected = MakeMatrix(4, 5); + If( + VerifySame(matrix, expected) == 0, + [&] { + ok = 1; + }) + .Else([&] { + auto value = matrix.GetValue(); + value.SetLayout({ { (int)matrix.Size() } }); + DebugPrintVector(value); + DebugPrint("\n"); + }); + return ok; +} + +Scalar LoopNest_api_Unrolled_test1() +{ + auto matrix = MakeMatrix(20, 5); + Index i("i"), j("j"); + + auto nest = Using({ matrix }, ArgumentType::Output) + .ForAll(i, 0, 4) + .ForAll(j, 0, 5) + .Do(loopnest_kernel); + + auto& schedule = nest.GetSchedule(); + + schedule.Parallelize(i, 2); + schedule.Unroll(j); + + nest.Run(); + + return matrix(2, 3) - 19; // will return 0 if calculation is correct +} + +Scalar LoopNest_api_SetOrder_test1() +{ + auto matrix = MakeMatrix(4, 5); + Index i("i"), j("j"); + + auto nest = Using({ matrix }, ArgumentType::Output) + .ForAll(i, 0, 4) + .ForAll(j, 0, 5) + .Do(loopnest_kernel); + + auto& schedule = nest.GetSchedule(); + auto i_o = schedule.Split(i, 2); + schedule.SetOrder({ i_o, j, i }); + + nest.Run(); + + return matrix(2, 3) - 19; // will return 0 if calculation is correct +} + +Scalar LoopNest_api_CachedMatrix_test1() +{ + const int N = 4; + auto A = MakeMatrix(N, N, "A"); + For(A, [&](Scalar i, Scalar j) { + A(i, j) = i - j; + }); + + // A: + // [ 0, -1, -2, -3 ] + // [ 1, 0, -1, -2 ] + // [ 2, 1, 0, -1 ] + // [ 3, 2, 1, 0 ] + + Index i("i"), j("j"); + + auto nest = Using({ A }, ArgumentType::InputOutput) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .Do(addOne); + + nest.GetSchedule().Cache( + CreateCacheFor(A) + .Size({ N, N }) + .Using({ i, j }) + .Type(SubMatrixCopyInCopyOutCache{})); + + nest.Run(); + + return A(2, 0) + A(0, 2) - 2; // will return 0 if calculation is correct +} + +Scalar 
LoopNest_api_SlidingCachedMatrix_test() +{ + const int N = 8; + const int cacheARows = N / 2; + const int cacheACols = N / 2; + + auto A = MakeMatrix(N, N, "A"); + auto B = MakeMatrix(N, N, "B"); + + // initialize A + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + A(i, j) = i - j; + }); + }); + + // The input matrices: + // A: B: + // [ 0, -1, -2, -3, -4, -5, -6, -7] [ 0, 0, ... ] + // [ 1, 0, -1, -2, -3, -4, -5, -6] [ 0, 0, ... ] + // [ 2, 1, 0, -1, -2, -3, -4, -5] [ 0, 0, ... ] + // [ 3, 2, 1, 0, -1, -2, -3, -4] [ 0, 0, ... ] + // [ 4, 3, 2, 1, 0, -1, -2, -3] [ 0, 0, ... ] + // [ 5, 4, 3, 2, 1, 0, -1, -2] [ 0, 0, ... ] + // [ 6, 5, 4, 3, 2, 1, 0, -1] [ 0, 0, ... ] + // [ 7, 6, 5, 4, 3, 2, 1, 0] [ 0, 0, ... ] + + Index i("i"), j("j"); + auto nest = Using({ A }, ArgumentType::Input) + .Using({ B }, ArgumentType::InputOutput) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .Do([](Matrix A, Matrix B, Scalar i, Scalar j) { + B(i, j) = A(i, j); + }); + + auto& schedule = nest.GetSchedule(); + + auto i_o = schedule.Split(i, cacheARows); + auto j_o = schedule.Split(j, cacheACols); + schedule.Cache( + CreateCacheFor(A) + .Size({ cacheARows, cacheACols }) + .Using({ i, j }) + .At({ i_o, j_o }) + .Type(SubMatrixCopyIn{})); + + nest.Run(); + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + If( + VerifySame(A, B) == 0, + [&] { + ok = 0; + }) + .Else([&] { + auto value = B.GetValue(); + value.SetLayout({ { (int)B.Size() } }); + DebugPrintVector(value); + DebugPrint("\n"); + auto expectedValue = A.GetValue(); + expectedValue.SetLayout({ { (int)A.Size() } }); + DebugPrintVector(expectedValue); + DebugPrint("\n"); + }); + return ok; +} + +Scalar SimpleGemm_HighLevelAPI() +{ + const int N = 8; + const int cacheARows = 4; + const int cacheACols = 4; + const int cacheBRows = cacheACols; + const int cacheBCols = N; + const int resultCacheRows = 2; + const int resultCacheCols = 2; + + auto A = MakeMatrix(N, N, "A"); + auto B = MakeMatrix(N, N, "B"); + auto C = MakeMatrix(N, N, "C"); + auto expected = MakeMatrix(N, N, "expected"); + + // initialize A, B, and C + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + A(i, j) = i - j; + B(i, j) = i + 2 * j; + }); + }); + + // fill out expected with a simple for-loop gemm + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + ForRange(N, [&](Scalar k) { + expected(i, j) += A(i, k) * B(k, j); + }); + }); + }); + + // The input matrices: + // A: B: C: + // [ 0, -1, -2, -3, -4, -5, -6, -7] [ 0, 2, 4, 6, 8, 10, 12, 14] [ 0 0 ... ] + // [ 1, 0, -1, -2, -3, -4, -5, -6] [ 1, 3, 5, 7, 9, 11, 13, 15] [ 0 0 ... ] + // [ 2, 1, 0, -1, -2, -3, -4, -5] [ 2, 4, 6, 8, 10, 12, 14, 16] [ ... ] + // [ 3, 2, 1, 0, -1, -2, -3, -4] [ 3, 5, 7, 9, 11, 13, 15, 17] [ ... ] + // [ 4, 3, 2, 1, 0, -1, -2, -3] [ 4, 6, 8, 10, 12, 14, 16, 18] [ ... ] + // [ 5, 4, 3, 2, 1, 0, -1, -2] [ 5, 7, 9, 11, 13, 15, 17, 19] [ ... ] + // [ 6, 5, 4, 3, 2, 1, 0, -1] [ 6, 8, 10, 12, 14, 16, 18, 20] [ ... ] + // [ 7, 6, 5, 4, 3, 2, 1, 0] [ 7, 9, 11, 13, 15, 17, 19, 21] [ ... 
] + + // (A * B) (the desired result): + // [-140, -196, -252, -308, -364, -420, -476, -532] + // [-112, -152, -192, -232, -272, -312, -352, -392] + // [ -84, -108, -132, -156, -180, -204, -228, -252] + // [ -56, -64, -72, -80, -88, -96, -104, -112] + // [ -28, -20, -12, -4, 4, 12, 20, 28] + // [ 0, 24, 48, 72, 96, 120, 144, 168] + // [ 28, 68, 108, 148, 188, 228, 268, 308] + // [ 56, 112, 168, 224, 280, 336, 392, 448] + + Index i("i"), j("j"), k("k"); + auto nest = Using({ A, B }, ArgumentType::Input) + .Using({ C }, ArgumentType::InputOutput) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .ForAll(k, 0, N) + .Do([](Matrix A, Matrix B, Matrix C, Scalar i, Scalar j, Scalar k) { +#if 0 // DEBUGGING + InvokeForContext([&] { + std::cout << "C(" << i.Get() << "," << j.Get() << ") pointing at (kernel): " << std::hex << reinterpret_cast(std::get(C(i, j).GetValue().GetUnderlyingData())) << std::dec << std::endl; + }); +#endif + C(i, j) += A(i, k) * B(k, j); + }); + + auto& schedule = nest.GetSchedule(); + auto i_b_o = schedule.Split(i, cacheARows); + auto k_b_o = schedule.Split(k, cacheACols); + schedule.Cache(CreateCacheFor(A) + .Size({ cacheARows, cacheACols }, utilities::RowMajorMatrixOrder) + .Using({ i_b_o, k_b_o }) + .Type(SubMatrixCopyIn{})); + schedule.Cache(CreateCacheFor(B) + .Size({ cacheBRows, cacheBCols }, utilities::ColumnMajorMatrixOrder) + .Using({ k, j }) + .At({ k_b_o }) + .Type(SubMatrixCopyIn{})); + + auto i_o = schedule.Split(i, resultCacheRows); + auto j_o = schedule.Split(j, resultCacheCols); + schedule.Cache(CreateCacheFor(C) + .Size({ resultCacheRows, resultCacheCols }, utilities::RowMajorMatrixOrder) + .Using({ i_o, j_o }) + .Type(ZeroInputCopyOutMatrixCache{})); + + schedule.SetOrder({ k_b_o, i_b_o, j_o, i_o, k, j, i }); + schedule.Unroll(i); + schedule.Unroll(j); + nest.Run(); + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + If( + VerifySame(C, expected) == 0, + [&] { + ok = 0; + }) + .Else([&] { + auto value = C.GetValue(); + value.SetLayout({ { (int)C.Size() } }); + DebugPrintVector(value); + DebugPrint("\n"); + auto expectedValue = expected.GetValue(); + expectedValue.SetLayout({ { (int)expected.Size() } }); + DebugPrintVector(expectedValue); + DebugPrint("\n"); + }); + return ok; +} + +Scalar SimpleGemm_HighLevelAPI_NoCachingHelper() +{ + const int N = 8; + const int cacheARows = 4; + const int cacheACols = 4; + const int cacheBRows = cacheACols; + const int cacheBCols = N; + const int resultCacheRows = 2; + const int resultCacheCols = 2; + + auto A = MakeMatrix(N, N, "A"); + auto B = MakeMatrix(N, N, "B"); + auto C = MakeMatrix(N, N, "C"); + auto expected = MakeMatrix(N, N, "expected"); + + // initialize A, B, and C + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + A(i, j) = i - j; + B(i, j) = i + 2 * j; + }); + }); + + // fill out expected with a simple for-loop gemm + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + ForRange(N, [&](Scalar k) { + expected(i, j) += A(i, k) * B(k, j); + }); + }); + }); + + // The input matrices: + // A: B: C: + // [ 0, -1, -2, -3, -4, -5, -6, -7] [ 0, 2, 4, 6, 8, 10, 12, 14] [ 0 0 ... ] + // [ 1, 0, -1, -2, -3, -4, -5, -6] [ 1, 3, 5, 7, 9, 11, 13, 15] [ 0 0 ... ] + // [ 2, 1, 0, -1, -2, -3, -4, -5] [ 2, 4, 6, 8, 10, 12, 14, 16] [ ... ] + // [ 3, 2, 1, 0, -1, -2, -3, -4] [ 3, 5, 7, 9, 11, 13, 15, 17] [ ... ] + // [ 4, 3, 2, 1, 0, -1, -2, -3] [ 4, 6, 8, 10, 12, 14, 16, 18] [ ... ] + // [ 5, 4, 3, 2, 1, 0, -1, -2] [ 5, 7, 9, 11, 13, 15, 17, 19] [ ... 
] + // [ 6, 5, 4, 3, 2, 1, 0, -1] [ 6, 8, 10, 12, 14, 16, 18, 20] [ ... ] + // [ 7, 6, 5, 4, 3, 2, 1, 0] [ 7, 9, 11, 13, 15, 17, 19, 21] [ ... ] + + // (A * B) (the desired result): + // [-140, -196, -252, -308, -364, -420, -476, -532] + // [-112, -152, -192, -232, -272, -312, -352, -392] + // [ -84, -108, -132, -156, -180, -204, -228, -252] + // [ -56, -64, -72, -80, -88, -96, -104, -112] + // [ -28, -20, -12, -4, 4, 12, 20, 28] + // [ 0, 24, 48, 72, 96, 120, 144, 168] + // [ 28, 68, 108, 148, 188, 228, 268, 308] + // [ 56, 112, 168, 224, 280, 336, 392, 448] + + Index i("i"), j("j"), k("k"); + auto nest = Using({ A, B }, ArgumentType::Input) + .Using({ C }, ArgumentType::InputOutput) + .ForAll(i, 0, N) + .ForAll(j, 0, N) + .ForAll(k, 0, N) + .Do([](Matrix A, Matrix B, Matrix C, Scalar i, Scalar j, Scalar k) { +#if 0 // DEBUGGING + InvokeForContext([&] { + std::cout << "C(" << i.Get() << "," << j.Get() << ") pointing at (kernel): " << std::hex << reinterpret_cast(std::get(C(i, j).GetValue().GetUnderlyingData())) << std::dec << std::endl; + }); +#endif + C(i, j) += A(i, k) * B(k, j); + }); + + auto& schedule = nest.GetSchedule(); + auto i_b_o = schedule.Split(i, cacheARows); + auto k_b_o = schedule.Split(k, cacheACols); + schedule.Cache( + A, + { i, k }, + { cacheARows, cacheACols }, + { i_b_o, k_b_o }, + RowMajorMatrixOrder); + schedule.Cache( + B, + { k, j }, + { cacheBRows, cacheBCols }, + { k_b_o }, + ColumnMajorMatrixOrder); + + auto i_o = schedule.Split(i, resultCacheRows); + auto j_o = schedule.Split(j, resultCacheCols); + schedule.SetOrder({ k_b_o, i_b_o, j_o, i_o, k, j, i }); + schedule.Cache( + C, + { i, j }, + { resultCacheRows, resultCacheCols }, + { i_o, j_o }, + RowMajorMatrixOrder); + + schedule.Unroll(i); + schedule.Unroll(j); + nest.Run(); + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + If( + VerifySame(C, expected) == 0, + [&] { + ok = 0; + }) + .Else([&] { + auto value = C.GetValue(); + value.SetLayout({ { (int)C.Size() } }); + DebugPrintVector(value); + DebugPrint("\n"); + auto expectedValue = expected.GetValue(); + expectedValue.SetLayout({ { (int)expected.Size() } }); + DebugPrintVector(expectedValue); + DebugPrint("\n"); + }); + return ok; +} + +Scalar MLAS_GEMM_GeneralCachingStrategy() +{ + const int OutputRows = 16; + const int InnerDimension = 16; + const int OutputColumns = 16; + const int kUnroll = 4; + const int cacheBRows = InnerDimension / 2; + const int cacheBCols = OutputColumns / 2; + const int stripeSize = cacheBCols / 2; + const int vectorSize = stripeSize / 2; + const int NumRowsInKernel = OutputRows / 8; + const int NumColumnsInKernel = 2 * vectorSize; + + auto A = MakeIncrementingMatrix(OutputRows, InnerDimension, "A"); + auto B = MakeIncrementingMatrix(InnerDimension, OutputColumns, "B"); + auto C = MakeMatrix(OutputRows, OutputColumns, "C"); + + auto expected = MakeMatrix(OutputRows, OutputColumns, "expected"); + ForRange(OutputRows, [&](Scalar m) { + ForRange(OutputColumns, [&](Scalar n) { + ForRange(InnerDimension, [&](Scalar k) { + expected(m, n) += A(m, k) * B(k, n); + }); + }); + }); + + // Declare indexes + loopnests::Index i("i"), j("j"), k("k"); + // Define LoopNest + auto nest = Using({ A, B }, ArgumentType::Input) + .Using({ C }, ArgumentType::Output) + .ForAll(i, 0, OutputRows) + .ForAll(j, 0, OutputColumns) + .ForAll(k, 0, InnerDimension) + .Do([](Matrix A_, Matrix B_, Matrix C_, Scalar i_, Scalar j_, Scalar k_) { + C_(i_, j_) += B_(k_, j_) * A_(i_, k_); + }); + auto& schedule = nest.GetSchedule(); + + auto topLevelI = i; + auto 
topLevelJ = j; + auto topLevelK = k; + + // Declare splits + auto jCache = schedule.Split(j, cacheBCols); + auto kCache = schedule.Split(k, cacheBRows); + auto kBlock = schedule.Split(k, kUnroll); + auto jKernelOuter2 = schedule.Split(j, NumColumnsInKernel); + auto jKernelOuter = schedule.Split(j, vectorSize); + auto iKernelOuter = schedule.Split(i, NumRowsInKernel); + + // Set the order + schedule.SetOrder({ jCache, kCache, iKernelOuter, jKernelOuter2, kBlock, k, i, jKernelOuter, j }); + + // Set up caching + ArgumentType argType = ArgumentType::Input; + std::string cacheName = "cacheBInput"; + size_t maxCacheElts = cacheBRows * cacheBCols; + size_t fillThreshold = maxCacheElts; + std::function reduceFunction = CopyReduce; + auto extraCacheParams = std::make_tuple(argType, + cacheName, + maxCacheElts, + fillThreshold, + reduceFunction, + false); + schedule.Cache(B, + { topLevelK, topLevelJ }, + {}, + {}, + std::nullopt, + extraCacheParams); + + ArgumentType argTypeC = ArgumentType::Output; + std::string cacheNameC = "cacheCOutput"; + size_t maxCacheEltsC = NumRowsInKernel * NumColumnsInKernel; + size_t fillThresholdC = maxCacheEltsC; + std::function reduceFunctionC = SumReduce; + auto extraCacheCParams = std::make_tuple(argTypeC, + cacheNameC, + maxCacheEltsC, + fillThresholdC, + reduceFunctionC, + true); + schedule.Cache(C, + { topLevelI, topLevelJ }, + {}, + {}, + std::nullopt, + extraCacheCParams); + + // Set unrolling + schedule.Unroll(jKernelOuter); + schedule.Unroll(i); + schedule.Unroll(k); + +#if 0 // DEBUGGING + auto loop = nest.GetUnderlyingLoopNest(); + DebugDump(loop); +#endif + // Run the generator + nest.Run(); + + If( + VerifySame(C, expected) == 0, + [&] { + }) + .Else([&] { + auto value = C.GetValue(); + value.SetLayout({ { (int)C.Size() } }); + DebugPrintVector(value); + DebugPrint("\n"); + auto expectedValue = expected.GetValue(); + expectedValue.SetLayout({ { (int)expected.Size() } }); + DebugPrintVector(expectedValue); + DebugPrint("\n"); + }); + + return VerifySame(C, expected); +} + +Scalar OneSplitBoundaryTest() +{ + const int M = 4; + const int N = 3; + const int split = 2; + auto input = MakeIncrementingMatrix(M, N, "input"); + auto output = MakeMatrix(M, N, "output"); + auto expectedOutput = MakeIncrementingMatrix(M, N, "expectedOutput"); + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::InputOutput) + .ForAll(i, 0, M) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { +#if 0 // DEBUGGING + InvokeForContext([&] { + std::cout << "inner kernel, (i,j) == (" << i.Get() << ", " << j.Get() << ")" << std::endl; + }); +#endif + output(i, j) = input(i, j); + }); + + auto schedule = nest.GetSchedule(); + + auto topLevelI = i; + auto topLevelJ = j; + auto iSplit = schedule.Split(i, split); + auto jSplit = schedule.Split(j, split); + +#if 0 // DEBUGGING + auto kernel = loopnests::Kernel("split_log_kernel") + .Inputs() + .Indices(iSplit, jSplit) + .Define([](Scalar i, Scalar j) { + InvokeForContext([&] { + std::cout << "simple boundary split log kernel, (i,j) == (" << i.Get() << ", " << j.Get() << ")" << std::endl; + }); + }); + nest.GetUnderlyingLoopNest().AddKernel(kernel); +#endif + + schedule.SetOrder({ iSplit, jSplit, i, j }); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + If( + VerifySame(output, expectedOutput) == 0, + [&] { + ok = 0; + }) + .Else([&] { + auto value = output.GetValue(); + 
value.SetLayout({ { (int)output.Size() } }); + DebugPrintVector(value); + DebugPrint("\n"); + auto expectedValue = expectedOutput.GetValue(); + expectedValue.SetLayout({ { (int)expectedOutput.Size() } }); + DebugPrintVector(expectedValue); + DebugPrint("\n"); + }); + return ok; +} + +Scalar TwoSplitBoundaryTest() +{ + const int M = 8; + const int N = 7; + const int bigSplit = 4; + const int smallSplit = 2; + auto input = MakeIncrementingMatrix(M, N, "input"); + auto output = MakeMatrix(M, N, "output"); + auto expectedOutput = MakeIncrementingMatrix(M, N, "expectedOutput"); + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::InputOutput) + .ForAll(i, 0, M) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { +#if 0 // DEBUGGING + InvokeForContext([&] { + std::cout << "inner kernel, (i,j) == (" << i.Get() << ", " << j.Get() << ")" << std::endl; + }); +#endif + output(i, j) = input(i, j); + }); + + auto schedule = nest.GetSchedule(); + + auto topLevelI = i; + auto topLevelJ = j; + auto iBigSplit = schedule.Split(i, bigSplit); + auto jBigSplit = schedule.Split(j, bigSplit); + auto iSmallSplit = schedule.Split(i, smallSplit); + auto jSmallSplit = schedule.Split(j, smallSplit); + +#if 0 // DEBUGGING + auto kernel = loopnests::Kernel("big_split_log_kernel") + .Inputs() + .Indices(iBigSplit, jBigSplit) + .Define([](Scalar i, Scalar j) { + InvokeForContext([&] { + std::cout << "simple boundary big split log kernel, (i,j) == (" << i.Get() << ", " << j.Get() << ")" << std::endl; + }); + }); + nest.GetUnderlyingLoopNest().AddKernel(kernel); + auto kernel2 = loopnests::Kernel("small_split_log_kernel") + .Inputs() + .Indices(iSmallSplit, jSmallSplit) + .Define([](Scalar i, Scalar j) { + InvokeForContext([&] { + std::cout << "simple boundary Small split log kernel, (i,j) == (" << i.Get() << ", " << j.Get() << ")" << std::endl; + }); + }); + nest.GetUnderlyingLoopNest().AddKernel(kernel2); +#endif + + schedule.SetOrder({ iBigSplit, jBigSplit, iSmallSplit, jSmallSplit, i, j }); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + If( + VerifySame(output, expectedOutput) == 0, + [&] { + ok = 0; + }) + .Else([&] { + auto value = output.GetValue(); + value.SetLayout({ { (int)output.Size() } }); + DebugPrintVector(value); + DebugPrint("\n"); + auto expectedValue = expectedOutput.GetValue(); + expectedValue.SetLayout({ { (int)expectedOutput.Size() } }); + DebugPrintVector(expectedValue); + DebugPrint("\n"); + }); + return ok; +} + +Scalar SplitLargerThanSizeBoundaryTest() +{ + const int M = 8; + const int N = 3; + const int split = 4; + auto input = MakeIncrementingMatrix(M, N, "input"); + auto output = MakeMatrix(M, N, "output"); + auto expectedOutput = MakeIncrementingMatrix(M, N, "expectedOutput"); + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::InputOutput) + .ForAll(i, 0, M) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { +#if 0 // DEBUGGING + InvokeForContext([&] { + std::cout << "inner kernel, (i,j) == (" << i.Get() << ", " << j.Get() << ")" << std::endl; + }); +#endif + output(i, j) = input(i, j); + }); + + auto schedule = nest.GetSchedule(); + + auto topLevelI = i; + auto topLevelJ = j; + auto iSplit = schedule.Split(i, split); + auto jSplit = schedule.Split(j, split); + +#if 0 // DEBUGGING + auto kernel = 
loopnests::Kernel("split_log_kernel") + .Inputs() + .Indices(iSplit, jSplit) + .Define([](Scalar i, Scalar j) { + InvokeForContext([&] { + std::cout << "simple boundary split log kernel, (i,j) == (" << i.Get() << ", " << j.Get() << ")" << std::endl; + }); + }); + nest.GetUnderlyingLoopNest().AddKernel(kernel); +#endif + + schedule.SetOrder({ iSplit, jSplit, i, j }); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + If( + VerifySame(output, expectedOutput) == 0, + [&] { + ok = 0; + }) + .Else([&] { + auto value = output.GetValue(); + value.SetLayout({ { (int)output.Size() } }); + DebugPrintVector(value); + DebugPrint("\n"); + auto expectedValue = expectedOutput.GetValue(); + expectedValue.SetLayout({ { (int)expectedOutput.Size() } }); + DebugPrintVector(expectedValue); + DebugPrint("\n"); + }); + return ok; +} + +Scalar TwoSplitsLargerThanSizeBoundaryTest() +{ + const int M = 8; + const int N = 3; + const int bigSplit = 4; + const int smallSplit = 2; + auto input = MakeIncrementingMatrix(M, N, "input"); + auto output = MakeMatrix(M, N, "output"); + auto expectedOutput = MakeIncrementingMatrix(M, N, "expectedOutput"); + + Index i("i"), j("j"); + auto nest = Using({ input }, ArgumentType::Input) + .Using({ output }, ArgumentType::InputOutput) + .ForAll(i, 0, M) + .ForAll(j, 0, N) + .Do([](Matrix input, Matrix output, Scalar i, Scalar j) { +#if 0 // DEBUGGING + InvokeForContext([&] { + std::cout << "inner kernel, (i,j) == (" << i.Get() << ", " << j.Get() << ")" << std::endl; + }); +#endif + output(i, j) = input(i, j); + }); + + auto schedule = nest.GetSchedule(); + + auto topLevelI = i; + auto topLevelJ = j; + auto iBigSplit = schedule.Split(i, bigSplit); + auto jBigSplit = schedule.Split(j, bigSplit); + auto iSmallSplit = schedule.Split(i, smallSplit); + auto jSmallSplit = schedule.Split(j, smallSplit); + +#if 0 // DEBUGGING + auto kernel = loopnests::Kernel("big_split_log_kernel") + .Inputs() + .Indices(iBigSplit, jBigSplit) + .Define([](Scalar i, Scalar j) { + InvokeForContext([&] { + std::cout << "simple boundary big split log kernel, (i,j) == (" << i.Get() << ", " << j.Get() << ")" << std::endl; + }); + }); + nest.GetUnderlyingLoopNest().AddKernel(kernel); + auto kernel2 = loopnests::Kernel("small_split_log_kernel") + .Inputs() + .Indices(iSmallSplit, jSmallSplit) + .Define([](Scalar i, Scalar j) { + InvokeForContext([&] { + std::cout << "simple boundary Small split log kernel, (i,j) == (" << i.Get() << ", " << j.Get() << ")" << std::endl; + }); + }); + nest.GetUnderlyingLoopNest().AddKernel(kernel2); +#endif + + schedule.SetOrder({ iBigSplit, jBigSplit, iSmallSplit, jSmallSplit, i, j }); + +#if 0 // DEBUGGING + DebugDump(nest.GetUnderlyingLoopNest()); +#endif + nest.Run(); + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + If( + VerifySame(output, expectedOutput) == 0, + [&] { + ok = 0; + }) + .Else([&] { + auto value = output.GetValue(); + value.SetLayout({ { (int)output.Size() } }); + DebugPrintVector(value); + DebugPrint("\n"); + auto expectedValue = expectedOutput.GetValue(); + expectedValue.SetLayout({ { (int)expectedOutput.Size() } }); + DebugPrintVector(expectedValue); + DebugPrint("\n"); + }); + return ok; +} + +Scalar LoopNest_api_tunable_parameters_test1() +{ + auto ok = MakeScalar(); + + // loopnest_kernel_4 will multiply row by 10 and add the column. 
+    std::vector expectedValues{ 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, 16, 17 };
+    auto expected = Vector(expectedValues);
+
+    TunableParameter j_o{ std::vector{ 2, 4 }, "j_o" };
+    TunableParameter j_o_o{ std::vector{ 1, 2 }, "j_o_o" };
+    TuningEngine engine(j_o, j_o_o);
+    do
+    {
+        // Declare the output matrix and initialize its values to 0.
+        auto output = MakeMatrix(2, 8);
+
+        (void)DeclareFunction("LoopNest_tunable_" + engine.ToString())
+            .Decorated(false)
+            .Parameters(output)
+            .Define([&](Matrix matrix) {
+                Index i("i"), j("j");
+
+                auto nest = Using({ matrix }, ArgumentType::InputOutput)
+                                .ForAll(i, 0, static_cast(output.Rows()))
+                                .ForAll(j, 0, static_cast(output.Columns()))
+                                .Do([](Matrix m, Scalar i, Scalar j) {
+                                    Scalar v = Allocate(m.Type(), ScalarLayout);
+
+                                    v = i * 10;
+                                    v += j;
+
+                                    m(i, j) = v;
+                                });
+
+                auto& schedule = nest.GetSchedule();
+                schedule.Split(j, j_o);
+                schedule.Split(j, j_o_o);
+
+                nest.Run();
+            })(output);
+
+        // View the result as a Vector
+        Vector actual = AsVector(AsFullView(output));
+
+        // Verify that the actual result is what we expect
+        If(ok == 0, [&] { ok = VerifySame(actual, expected); });
+    } while (engine.Next());
+
+    return ok;
+}
+} // namespace ell
diff --git a/libraries/value/test/src/LoopNest_convolution_test.cpp b/libraries/value/test/src/LoopNest_convolution_test.cpp
new file mode 100644
index 000000000..a4dbe8bfe
--- /dev/null
+++ b/libraries/value/test/src/LoopNest_convolution_test.cpp
@@ -0,0 +1,198 @@
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Project: Embedded Learning Library (ELL)
+// File: LoopNest_convolution_test.cpp (value)
+// Authors: Mason Remy
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "LoopNest_convolution_test.h"
+#include "TestUtil.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+
+using namespace ell::emitters;
+using namespace ell::utilities;
+using namespace ell::logging;
+using namespace ell::value;
+using namespace ell::value::loopnests;
+
+namespace ell
+{
+// Tests of convolution via LoopNests
+
+int GetOutputDimensionSize(int inputSize, int receptiveFieldSize, int stride, int paddingSize)
+{
+    return (inputSize + 2 * paddingSize - receptiveFieldSize) / stride + 1;
+}
+
+struct ConvolutionConfig
+{
+    ConvolutionConfig(const std::vector& inputSizes,
+                      int outputFilters,
+                      const std::vector& receptiveFieldSize,
+                      const std::vector& strideSize,
+                      const std::vector& paddingSize,
+                      const std::vector& inputBlockSizes,
+                      const std::vector& outputBlockSizes)
+    {
+        outputSize[2] = outputFilters;
+        for (int dim = 0; dim < 3; dim++)
+        {
+            inputSize[dim] = inputSizes[dim];
+            inputBlockSize[dim] = inputBlockSizes[dim];
+            outputBlockSize[dim] = outputBlockSizes[dim];
+
+            // Values that are only computed in the row/column dimensions
+            if (dim < 2)
+            {
+                receptiveField[dim] = receptiveFieldSize[dim];
+                stride[dim] = strideSize[dim];
+                padding[dim] = paddingSize[dim];
+                outputSize[dim] = GetOutputDimensionSize(inputSize[dim], receptiveFieldSize[dim], strideSize[dim], paddingSize[dim]);
+            }
+
+            if (inputBlockSize[dim] > 0)
+            {
+                inputBlockCount[dim] = inputSize[dim] / inputBlockSize[dim];
+                if (inputSize[dim] % inputBlockSize[dim] != 0)
+                {
+                    inputBlockCount[dim]++;
+
} + } + + if (outputBlockSize[dim] > 0) + { + outputBlockCount[dim] = outputSize[dim] / outputBlockSize[dim]; + if (outputSize[dim] % outputBlockSize[dim] != 0) + { + outputBlockCount[dim]++; + } + } + } + + weightSize[0] = outputSize[2]; + weightSize[1] = inputSize[2]; + weightSize[2] = receptiveField[0]; + weightSize[3] = receptiveField[1]; + + MemoryShape inputPackedShape = { inputBlockCount[2], inputSize[0], inputSize[1], inputBlockSize[2] }; + MemoryShape inputPackedPadding = { 0, padding[0], padding[1], 0 }; + inputPackedPaddedLayout = { inputPackedShape, inputPackedPadding }; + MemoryShape inputLogicalPadding = { padding[0], padding[1], 0 }; + inputLogicalPaddedLayout = { MemoryShape{ inputSize[0], inputSize[1], inputSize[2] }, inputLogicalPadding }; + + outputPackedLayout = { MemoryShape{ outputBlockCount[2], outputSize[0], outputSize[1], outputBlockSize[2] } }; + outputLogicalLayout = { MemoryShape{ outputSize[0], outputSize[1], outputSize[2] } }; + + weightPackedLayout = { MemoryShape{ + outputBlockCount[2], + inputBlockCount[2], + weightSize[2], + weightSize[3], + inputBlockSize[2], + outputBlockSize[2] } }; + } + + int inputSize[3]; + int outputSize[3]; + int weightSize[4]; + int receptiveField[2]; + int stride[2]; + int padding[2]; + + int inputBlockSize[3]; + int outputBlockSize[3]; + + int inputBlockCount[3]; + int outputBlockCount[3]; + + MemoryLayout inputPackedPaddedLayout; + MemoryLayout inputLogicalPaddedLayout; + + MemoryLayout outputPackedLayout; + MemoryLayout outputLogicalLayout; + + MemoryLayout weightPackedLayout; +}; + +Tensor NaiveForLoopConvolution(const ConvolutionConfig& config, Tensor input, Array weights) +{ + auto output = MakeTensor(config.outputSize[0], config.outputSize[1], config.outputSize[2], "expectedOutput"); + ForRange(config.outputSize[2], [&](Scalar outputChannel) { + ForRange(config.inputSize[2], [&](Scalar inputChannel) { + ForRange(config.outputSize[0], [&](Scalar outputRow) { + ForRange(config.outputSize[1], [&](Scalar outputColumn) { + ForRange(config.receptiveField[0], [&](Scalar weightRow) { + ForRange(config.receptiveField[1], [&](Scalar weightColumn) { + Scalar inputRow = outputRow * config.stride[0] + weightRow - config.padding[0]; + Scalar inputColumn = outputColumn * config.stride[1] + weightColumn - config.padding[1]; + If(inputRow >= 0, [&] { + If(inputRow < Scalar{ config.inputSize[0] }, [&] { + If(inputColumn >= 0, [&] { + If(inputColumn < Scalar{ config.inputSize[1] }, [&] { + output(outputRow, outputColumn, outputChannel) += + input(inputRow, inputColumn, inputChannel) * + weights({ outputChannel, inputChannel, weightRow, weightColumn }); + }); + }); + }); + }); + }); + }); + }); + }); + }); + }); + return output; +} + +Tensor UnpackOutputTensor(const ConvolutionConfig& config, Value packedOutput) +{ + auto unpackedOutput = MakeTensor(config.outputSize[0], config.outputSize[1], config.outputSize[2], "unpackedOutput"); + packedOutput.SetLayout(config.outputPackedLayout); + auto packedOutputArray = Array(packedOutput); + + ForRange(config.outputSize[2], [&](Scalar channelIdx) { + ForRange(config.outputSize[0], [&](Scalar rowIdx) { + ForRange(config.outputSize[1], [&](Scalar columnIdx) { + unpackedOutput(rowIdx, columnIdx, channelIdx) = packedOutputArray({ channelIdx / config.outputBlockSize[2], + rowIdx, + columnIdx, + channelIdx % config.outputBlockSize[2] }); + }); + }); + }); + return unpackedOutput; +} +} // namespace ell diff --git a/libraries/value/test/src/LoopNest_kernels.cpp 
b/libraries/value/test/src/LoopNest_kernels.cpp new file mode 100644 index 000000000..2e6c30895 --- /dev/null +++ b/libraries/value/test/src/LoopNest_kernels.cpp @@ -0,0 +1,234 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: LoopNest_kernels.cpp (value) +// Authors: Kern Handa, Chuck Jacobs +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "LoopNest_kernels.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +using namespace ell::logging; +using namespace ell::utilities; +using namespace ell::value; + +namespace ell +{ + +void loopnest_passthrough(ViewAdapter, Scalar i, Scalar j) +{} + +void loopnest_kernel(Matrix m, Scalar i, Scalar j) +{ + InvokeForContext([&] { + auto iInt = i.Get(); + auto jInt = j.Get(); + Log() << "m(" << iInt << ", " << jInt << ") = " << (iInt * 2 + jInt * 5) << EOL; + }); + m(i, j) = i * 2 + j * 5; +} + +void loopnest_kernel_2(Matrix m, Scalar i, Scalar j) +{ + InvokeForContext([&] { + auto iInt = i.Get(); + auto jInt = j.Get(); + Log() << "m(" << iInt << ", " << jInt << ") += " << (iInt * 10 + jInt * 2) << EOL; + }); + m(i, j) += i * 10 + j * 2; +} + +void loopnest_kernel_3(Matrix c, Matrix a, Scalar i, Scalar j) +{ + c(i, j) += a(i, j); +} + +void loopnest_kernel_4(Matrix c, Matrix a, Scalar i, Scalar j) +{ + Scalar v = Allocate(c.GetValue().GetBaseType(), ScalarLayout); + + v = i * 10; + v += j; + + c(i, j) = a(i, j) + v; +} + +void matmul_kernel(Matrix A, Matrix B, Matrix C, Scalar i, Scalar j, Scalar k) +{ + InvokeForContext([&] { + auto iInt = i.Get(); + auto jInt = j.Get(); + auto kInt = k.Get(); + Log() << "C(" << iInt << ", " << jInt << ") += " + << "A(" << iInt << ", " << kInt << ") * B(" << kInt << ", " << jInt << ")" << EOL; + }); + C(i, j) += A(i, k) * B(k, j); +} + +void initToZero(Matrix m, Scalar i, Scalar j) +{ + InvokeForContext([&] { + auto iInt = i.Get(); + auto jInt = j.Get(); + Log() << "m(" << iInt << ", " << jInt << ") = " << 0 << EOL; + }); + m(i, j) = 0; +} + +void copyToCache(Matrix A, Matrix cache, Scalar i, Scalar j) +{ + InvokeForContext([&] { + auto iInt = i.Get(); + auto jInt = j.Get(); + Log() << cache.GetValue().GetName() << "(" << iInt << "-" << (iInt + 2) << ", " << jInt << "-" << (jInt + 2) << ") = " + << A.GetValue().GetName() << "(" << iInt << "-" << (iInt + 2) << ", " << jInt << "-" << (jInt + 2) << ")" << EOL; + }); + cache(i, j) = A(i, j); + cache(i + 1, j) = A(i + 1, j); + cache(i, j + 1) = A(i, j + 1); + cache(i + 1, j + 1) = A(i + 1, j + 1); +} + +void copyFromCache(Matrix A, Matrix cache, Scalar i, Scalar j) +{ + InvokeForContext([&] { + auto iInt = i.Get(); + auto jInt = j.Get(); + Log() << A.GetValue().GetName() << "(" << iInt << "-" << (iInt + 2) << ", " << jInt << "-" << (jInt + 2) << ") = " + << cache.GetValue().GetName() << "(" << iInt << "-" << (iInt + 2) << ", " << jInt << "-" << (jInt + 2) << ")" << EOL; + }); + A(i, j) = cache(i, j); + A(i + 1, j) = cache(i + 1, j); + A(i, j + 1) = cache(i, j + 1); + A(i + 1, j + 1) = cache(i + 1, j + 1); +} + +void copyToSmallCache(Matrix A, Matrix cache, Scalar i, Scalar j) +{ + InvokeForContext([&] { + auto iInt = i.Get(); + auto jInt = j.Get(); + Log() << "* " << cache.GetValue().GetName() << " = " + << A.GetValue().GetName() << "(" << iInt << "-" << (iInt + 2) << ", " << jInt << "-" << (jInt + 2) << ")" << EOL; + }); + cache(0, 0) = A(i, j); + 
cache(1, 0) = A(i + 1, j); + cache(0, 1) = A(i, j + 1); + cache(1, 1) = A(i + 1, j + 1); +} + +void copyFromSmallCache(Matrix A, Matrix cache, Scalar i, Scalar j) +{ + InvokeForContext([&] { + auto iInt = i.Get(); + auto jInt = j.Get(); + Log() << "* " << A.GetValue().GetName() << "(" << iInt << "-" << (iInt + 2) << ", " << jInt << "-" << (jInt + 2) << ") = " + << cache.GetValue().GetName() << EOL; + }); + A(i, j) = cache(0, 0); + A(i + 1, j) = cache(1, 0); + A(i, j + 1) = cache(0, 1); + A(i + 1, j + 1) = cache(1, 1); +} + +void addOne(Matrix m, Scalar i, Scalar j) +{ + InvokeForContext([&] { + auto iInt = i.Get(); + auto jInt = j.Get(); + Log() << m.GetValue().GetName() << "(" << iInt << ", " << jInt << ") += " << 1 << EOL; + }); + m(i, j) += 1; +} + +void addTwo(Matrix m, Scalar i, Scalar j) +{ + InvokeForContext([&] { + auto iInt = i.Get(); + auto jInt = j.Get(); + Log() << m.GetValue().GetName() << "(" << iInt << ", " << jInt << ") += " << 2 << EOL; + }); + m(i, j) += 2; +} + +void set_vector_kernel(Vector v, Scalar i) +{ + InvokeForContext([&] { + auto iInt = i.Get(); + Log() << "v(" << iInt << ") = " << iInt << EOL; + }); + v(i) = i; +} + +void increment_vector_kernel(Vector v, Scalar i) +{ + InvokeForContext([&] { + auto iInt = i.Get(); + Log() << "v(" << iInt << ") = " << iInt << EOL; + }); + v(i) += 1; +} + +void copy_vector_kernel(Vector v1, Vector v2, Scalar i) +{ + v2(i) = v1(i); +} + +void reorder_vector_kernel(Vector v, Matrix m, Scalar splitParam, Scalar i, Scalar iOuter, Scalar iInner) +{ + InvokeForContext([&] { + auto iInt = i.Get(); + auto iOuterInt = iOuter.Get(); + auto iInnerInt = iInner.Get(); + auto splitInt = splitParam.Get(); + Log() << "m(" << iOuterInt << "/" << splitInt << ", " << iInnerInt << ") = v(" << iInt << ")" << EOL; + }); + m(iOuter / splitParam, iInner) = v(i); +} + +void addCachedMatrixToUnchachedMatrix(Matrix A, Matrix B, Scalar Ai, Scalar Aj, Scalar Bi, Scalar Bj) +{ + InvokeForContext([&](auto&) { + auto AiInt = Ai.Get(); + auto AjInt = Aj.Get(); + auto BiInt = Bi.Get(); + auto BjInt = Bj.Get(); + Log() << A.GetValue().GetName() << "(" << AiInt << ", " << AjInt << ") += " << B.GetValue().GetName() << "(" << BiInt << ", " << BjInt << ")" << EOL; + }); + A(Ai, Aj) += B(Bi, Bj); +} + +void addCachedMatrixToUnchachedMatrixUnrolled(Matrix A, Matrix B, Scalar Ai, Scalar Aj, Scalar Bi, Scalar Bj) +{ + InvokeForContext([&](auto&) { + auto BiInt = Bi.Get(); + auto BjInt = Bj.Get(); + for (int i = 0; i < 2; ++i) + { + for (int j = 0; j < 2; ++j) + { + Log() << A.GetValue().GetName() << "(" << i << ", " << j << ") += " << B.GetValue().GetName() << "(" << (BiInt + i) << ", " << (BjInt + j) << ")" << EOL; + } + } + }); + for (int i = 0; i < 2; ++i) + { + for (int j = 0; j < 2; ++j) + { + A(i, j) += B(Bi + i, Bj + j); + } + } +} + +} // namespace ell diff --git a/libraries/value/test/src/LoopNest_test.cpp b/libraries/value/test/src/LoopNest_test.cpp new file mode 100644 index 000000000..e01483029 --- /dev/null +++ b/libraries/value/test/src/LoopNest_test.cpp @@ -0,0 +1,3661 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Project: Embedded Learning Library (ELL) +// File: LoopNest_test.cpp (value) +// Authors: Kern Handa, Chuck Jacobs +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "LoopNest_test.h" +#include "LoopNest_kernels.h" +#include "TestUtil.h" + +#include +#include +#include +#include +#include +#include +#include 
+#include + +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace ell::emitters; +using namespace ell::utilities; +using namespace ell::logging; +using namespace ell::value; +using namespace ell::value::loopnests; + +template +using LayoutType = std::integral_constant; + +namespace ell +{ +// LoopNest-specific test utilities +namespace +{ + template + bool InList(const ListType& list, ElementType item) + { + return std::find(list.begin(), list.end(), item) != list.end(); + } + + void SplitAndSetOrder(LoopNest& loops, const std::vector& indices, const std::vector& splitSizes, std::string order) + { + using IndexMap = std::unordered_map>>; + IndexMap indexInfo; + for (const auto& index : indices) + { + indexInfo.insert({ index.GetName()[0], { index, {} } }); + } + + std::vector splits; + for (auto ch : order) + { + auto& [index, indexSplits] = indexInfo.at(ch); + splits.push_back(index); + if (indexSplits.empty()) // first visit for this index, copy split list over instead of splitting + { + indexSplits = std::queue{ std::deque{ splitSizes.begin(), splitSizes.end() } }; + } + else + { + loops.Split(index, indexSplits.front()); + indexSplits.pop(); + } + } + + loops.SetLoopOrder(splits); + } + +} // namespace + +// Low-level tests of loop nest infrastructure +Scalar SplitIterationDomain_test1() +{ + Index i("i"), j("j"); + SplitIterationDomain domain({ { i, { 0, 120 } }, + { j, { 0, 200 } } }); + + auto [i1, i2] = domain.Split(i, 30); + auto [i3, i4] = domain.Split(i2, 15); + auto [i5, i6] = domain.Split(i4, 5); + + auto [j1, j2] = domain.Split(j, 50); + auto [j3, j4] = domain.Split(j2, 10); + + if (domain.NumDimensions() != 2) + { + return 1; + } + + // `NumSplits` returns the number of loops, not splits. 
It should be 4 for `i` and 3 for `j`.
+    if (domain.NumSplits(i) != 4 || domain.NumSplits(j) != 3)
+    {
+        return 1;
+    }
+
+    if (!domain.IsPrimaryDimension(i) || !domain.IsPrimaryDimension(j))
+        return 1;
+
+    for (Index index : { i1, i2, i3, i4, i5, i6 })
+    {
+        if (domain.GetBaseIndex(index) != i)
+            return 1;
+        if (domain.IsPrimaryDimension(index))
+            return 1;
+    }
+
+    for (Index index : { j1, j2, j3, j4 })
+    {
+        if (domain.GetBaseIndex(index) != j)
+            return 1;
+        if (domain.IsPrimaryDimension(index))
+            return 1;
+    }
+
+    const auto iRange = domain.GetDimensionRange(i);
+    for (Index index : { i1, i3, i5, i6 })
+    {
+        if (!iRange.IsLoopIndex(index))
+            return 1;
+    }
+    for (Index index : { i, i2, i4 })
+    {
+        if (!iRange.IsComputedIndex(index))
+            return 1;
+    }
+    auto parents = iRange.GetAllParentIndices(i4); // should include i and i2
+    if (!InList(parents, i) || !InList(parents, i2))
+        return 1;
+
+    auto dependents = iRange.GetDependentIndices(i4); // should be i5, i6
+    if (!InList(dependents, i5) || !InList(dependents, i6))
+        return 1;
+
+    return 0;
+}
+
+// Tests of actual loop nests
+Scalar LoopNest_test1()
+{
+    auto matrix = MakeMatrix(4, 5);
+    IndexRange i("i", { 0, 4 }), j("j", { 0, 5 });
+
+    auto kernel = Kernel("kernel")
+                      .Inputs(matrix.GetValue())
+                      .Indices(i.GetIndex(), j.GetIndex())
+                      .Define(loopnest_kernel);
+
+    LoopNest loop(std::vector{ i, j });
+    loop.AddKernel(kernel, LoopNest::ConstraintType::predicate);
+
+    CodeGenerator generator;
+    generator.Run(loop);
+
+#if 0
+    PrintLoops(loop, "LoopNest_test1");
+#endif
+
+    return matrix(2, 3) - 19; // will return 0 if calculation is correct
+}
+
+Scalar LoopNest_test2()
+{
+    auto matrix = MakeMatrix(4, 5);
+    IndexRange i("i", { 0, 4 }), j("j", { 0, 5 });
+
+    auto kernel = Kernel("kernel")
+                      .Inputs(matrix.GetValue())
+                      .Indices(i.GetIndex(), j.GetIndex())
+                      .Define(loopnest_kernel);
+
+    LoopNest loop(std::vector{ i, j });
+    loop.AddKernel(kernel, LoopNest::ConstraintType::predicate);
+    loop.Split(i.GetIndex(), 2);
+
+    CodeGenerator generator;
+    generator.Run(loop);
+
+    return matrix(2, 3) - 19; // will return 0 if calculation is correct
+}
+
+// This tests that the loopnest works with a degenerate (1x1) kernel, both for compute and compile
+Scalar LoopNest_test3()
+{
+    // Declare the input matrix
+    std::vector> dt{
+        std::vector{ 1, 2, 3 },
+        std::vector{ 4, 5, 6 },
+    };
+    auto matrix = Matrix(dt);
+    // Declare the output matrix and initialize its values to 10.
+    auto output = MakeMatrix(static_cast(matrix.Rows()), static_cast(matrix.Columns()));
+    For(output, [&](Scalar row, Scalar column) {
+        output(row, column) = 10;
+    });
+
+    // Use a LoopNest to call loopnest_kernel_3 for each element of the input matrix and write the result to
+    // our output.
+    loopnests::IndexRange i("i", { 0, static_cast(matrix.Rows()) }), j("j", { 0, static_cast(matrix.Columns()) });
+
+    auto kernel = loopnests::Kernel("kernel")
+                      .Inputs(output.GetValue(), matrix.GetValue())
+                      .Indices(i.GetIndex(), j.GetIndex())
+                      .Define(loopnest_kernel_3);
+
+    loopnests::LoopNest loop(std::vector{ i, j });
+    loop.AddKernel(kernel, LoopNest::ConstraintType::predicate);
+
+    loopnests::CodeGenerator generator;
+    generator.Run(loop);
+
+    // loopnest_kernel_3 will add the input element to the output element.
+    // Since we initialized the output to 10, we expect the result to be
+    // 10 greater than the input.
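+    // Worked example: the input is { 1, 2, 3, 4, 5, 6 } and every output
+    // element starts at 10, so the output should be { 11, 12, 13, 14, 15, 16 }
+    // whether the nest is run via the compute path or compiled.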
+    std::vector expectedValues{ 11, 12, 13, 14, 15, 16 };
+    auto expected = Vector(expectedValues);
+
+    // View the result as a Vector
+    Vector actual = AsVector(AsFullView(output));
+
+    // Verify that the actual result is what we expect
+    return VerifySame(actual, expected);
+}
+
+// This tests that the loopnest works with a degenerate (1x1) kernel, both for compute and compile,
+// when the kernel has non-trivial assignment code in it.
+Scalar LoopNest_test4()
+{
+    // Declare the output matrix and initialize its values to 0.
+    auto output = MakeMatrix(2, 6);
+    For(output, [&](Scalar row, Scalar column) {
+        output(row, column) = 0;
+    });
+
+    // Use a LoopNest to call loopnest_kernel_4 for each element of the matrix and write the result back to
+    // our output.
+    loopnests::IndexRange i("i", { 0, static_cast(output.Rows()) }), j("j", { 0, static_cast(output.Columns()) });
+
+    auto kernel = loopnests::Kernel("kernel")
+                      .Inputs(output.GetValue(), output.GetValue())
+                      .Indices(i.GetIndex(), j.GetIndex())
+                      .Define(loopnest_kernel_4);
+
+    loopnests::LoopNest loop(std::vector{ i, j });
+    loop.AddKernel(kernel, LoopNest::ConstraintType::predicate);
+    loop.Split(j.GetIndex(), 2);
+
+    loopnests::CodeGenerator generator;
+    generator.Run(loop);
+
+    // loopnest_kernel_4 will multiply row by 10 and add the column.
+    std::vector expectedValues{ 0, 1, 2, 3, 4, 5, 10, 11, 12, 13, 14, 15 };
+    auto expected = Vector(expectedValues);
+
+    // View the result as a Vector
+    Vector actual = AsVector(AsFullView(output));
+
+    // Verify that the actual result is what we expect
+    return VerifySame(actual, expected);
+}
+
+// Simple loopnest test using variable-length inputs and indices APIs
+Scalar LoopNest_test5()
+{
+    auto matrix = MakeMatrix(4, 5);
+    Index i("i"), j("j");
+    LoopNest loop({ { i, { 0, 4 } },
+                    { j, { 0, 5 } } });
+
+    auto kernel = Kernel("kernel")
+                      .Inputs({ matrix.GetValue() })
+                      .Indices({ i, j })
+                      .Define(loopnest_kernel);
+
+    loop.AddKernel(kernel, LoopNest::ConstraintType::predicate);
+    loop.Split(i, 2);
+
+    CodeGenerator generator;
+    generator.Run(loop);
+
+    return matrix(2, 3) - 19; // will return 0 if calculation is correct
+}
+
+// Simple loopnest test that loops from X to N where N > X > 0
+Scalar LoopNest_test6()
+{
+    int N = 4;
+    int X = 2;
+    auto matrix = MakeMatrix(N, N, "matrix");
+
+    ForRange(N, [&](Scalar i) {
+        ForRange(N, [&](Scalar j) {
+            matrix(i, j) = i + j;
+        });
+    });
+    // matrix:
+    // [ 0, 1, 2, 3 ]
+    // [ 1, 2, 3, 4 ]
+    // [ 2, 3, 4, 5 ]
+    // [ 3, 4, 5, 6 ]
+
+    // Sum the bottom right quadrant of the matrix and store the value in position (0, 0)
+
+    Index i("i"), j("j");
+    LoopNest loop({ { i, { X, N } },
+                    { j, { X, N } } });
+    auto kernel = Kernel("kernel")
+                      .Inputs(matrix.GetValue())
+                      .Indices(i, j)
+                      .Define([](Matrix mat, Scalar i, Scalar j) {
+                          mat(0, 0) += mat(i, j);
+                      });
+    loop.AddKernel(kernel, LoopNest::ConstraintType::predicate);
+    CodeGenerator generator;
+    generator.Run(loop);
+    return matrix(0, 0) - 20; // Will return 0 if calculation is correct
+}
+
+Scalar LoopNestNonzeroStart_test()
+{
+    const int size = 12;
+    const int begin = 2;
+    const int end = 10;
+    auto vector = MakeVector(size);
+    for (int i = 0; i < size; ++i)
+    {
+        vector(i) = 100 * i;
+    }
+    std::vector expectedValues(size);
+    for (int i = 0; i < size; ++i)
+    {
+        expectedValues[i] = 100 * i;
+    }
+    for (int i = begin; i < end; ++i)
+    {
+        expectedValues[i] = i;
+    }
+    auto expected = Vector(expectedValues);
+
+    Index i("i");
+    LoopNest loop({ { i, { begin, end } } });
+    auto kernel = Kernel("k")
+ .Inputs(vector.GetValue()) + .Indices(i) + .Define(set_vector_kernel); + + loop.AddKernel(kernel, LoopNest::ConstraintType::predicate); + + CodeGenerator generator; + generator.Run(loop); + +#if 0 + PrintLoops(loop, "LoopNestNonzeroStart_test"); +#endif + + // Verify that the actual result is what we expect + return VerifySame(vector, expected); +} + +Scalar LoopNestBoundary_test1() +{ + const int size = 12; + const int n = 10; + auto vector = MakeVector(size); + for (int i = n; i < size; ++i) + { + vector(i) = 100 * i; + } + std::vector expectedValues(size); + for (int i = 0; i < n; ++i) + { + expectedValues[i] = i; + } + for (int i = n; i < size; ++i) + { + expectedValues[i] = 100 * i; + } + auto expected = Vector(expectedValues); + + Index i("i"); + LoopNest loop({ { i, { 0, n } } }); + auto split_i = loop.Split(i, 4); + + auto kernel = Kernel("k") + .Inputs(vector.GetValue()) + .Indices(i) + .Define(set_vector_kernel); + + loop.AddKernel(kernel, LoopNest::ConstraintType::predicate); + + CodeGenerator generator; + generator.Run(loop); + +#if 0 + PrintLoops(loop, "LoopNestBoundary_test1"); +#endif + + // Verify that the actual result is what we expect + return VerifySame(vector, expected); +} + +Scalar LoopNestBoundary_test2() +{ + const int size = 12; + const int n = 10; + auto vector = MakeVector(size); + for (int i = n; i < size; ++i) + { + vector(i) = 100 * i; + } + std::vector expectedValues(size); + for (int i = 0; i < n; ++i) + { + expectedValues[i] = i; + } + for (int i = n; i < size; ++i) + { + expectedValues[i] = 100 * i; + } + auto expected = Vector(expectedValues); + + Index i("i"); + LoopNest loop({ { i, { 0, n } } }); + auto split_i = loop.Split(i, 4); + auto split_i2 = loop.Split(i, 2); + + auto kernel = Kernel("k") + .Inputs(vector.GetValue()) + .Indices(i) + .Define(set_vector_kernel); + + loop.AddKernel(kernel, LoopNest::ConstraintType::predicate); + + CodeGenerator generator; + generator.Run(loop); + +#if 0 + PrintLoops(loop, "LoopNestBoundary_test2"); +#endif + + // Verify that the actual result is what we expect + return VerifySame(vector, expected); +} + +Scalar LoopNestBoundary_test3() +{ + const int size = 12; + const int n = 8; + auto vector = MakeVector(size); + for (int i = n; i < size; ++i) + { + vector(i) = 100 * i; + } + std::vector expectedValues(size); + for (int i = 0; i < n; ++i) + { + expectedValues[i] = i; + } + for (int i = n; i < size; ++i) + { + expectedValues[i] = 100 * i; + } + auto expected = Vector(expectedValues); + + Index i("i"), j("j"); + LoopNest loop({ { i, { 0, n } }, { j, { 0, n } } }); + loop.Split(i, 4); + loop.Split(i, 2); + loop.Split(j, 4); + + loop.SetLoopOrder({ i, j, i, j, i }); + + auto kernel = Kernel("k") + .Inputs(vector.GetValue()) + .Indices(i) + .Define(set_vector_kernel); + + loop.AddKernel(kernel, LoopNest::ConstraintType::predicate); + + CodeGenerator generator; + generator.Run(loop); + +#if 0 + PrintLoops(loop, "LoopNestBoundary_test3"); +#endif + + // Verify that the actual result is what we expect + return VerifySame(vector, expected); +} + +Scalar LoopNestBoundary_test4() +{ + // The input matrices: + // A: B: C: + // [ 0, -1, -2, -3, -4, -5, -6, -7] [ 0, 2, 4, 6, 8, 10, 12, 14] [ 100 100 100 ... ] + // [ 1, 0, -1, -2, -3, -4, -5, -6] [ 1, 3, 5, 7, 9, 11, 13, 15] [ 100 100 100 ... ] + // [ 2, 1, 0, -1, -2, -3, -4, -5] [ 2, 4, 6, 8, 10, 12, 14, 16] [ ... ] + // [ 3, 2, 1, 0, -1, -2, -3, -4] [ 3, 5, 7, 9, 11, 13, 15, 17] [ ... ] + // [ 4, 3, 2, 1, 0, -1, -2, -3] [ 4, 6, 8, 10, 12, 14, 16, 18] [ ... 
] + // [ 5, 4, 3, 2, 1, 0, -1, -2] [ 5, 7, 9, 11, 13, 15, 17, 19] [ ... ] + // [ 6, 5, 4, 3, 2, 1, 0, -1] [ 6, 8, 10, 12, 14, 16, 18, 20] [ ... ] + // [ 7, 6, 5, 4, 3, 2, 1, 0] [ 7, 9, 11, 13, 15, 17, 19, 21] [ ... ] + + // (A * B) (the desired result): + // [-140, -196, -252, -308, -364, -420, -476, -532] + // [-112, -152, -192, -232, -272, -312, -352, -392] + // [ -84, -108, -132, -156, -180, -204, -228, -252] + // [ -56, -64, -72, -80, -88, -96, -104, -112] + // [ -28, -20, -12, -4, 4, 12, 20, 28] + // [ 0, 24, 48, 72, 96, 120, 144, 168] + // [ 28, 68, 108, 148, 188, 228, 268, 308] + // [ 56, 112, 168, 224, 280, 336, 392, 448] + + const int N = 8; + const int M = N; + const int K = N; + + auto A = MakeMatrix(M, K, "A"); + auto B = MakeMatrix(K, N, "B"); + auto C = MakeMatrix(M, N, "C"); + auto expected = MakeMatrix(M, N, "expected"); + + // initialize A, B, and C + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + A(i, j) = i - j; + B(i, j) = i + 2 * j; + C(i, j) = 100; + expected(i, j) = 0; + }); + }); + + // fill out expected with a simple for-loop gemm + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + ForRange(N, [&](Scalar k) { + expected(i, j) += A(i, k) * B(k, j); + }); + }); + }); + + // Do computation in blocks of k_r x k_c + const int k_r = 3; + const int k_c = 4; + Matrix temp = MakeMatrix(k_r, k_c, "temp"); + + loopnests::Index i("i"), j("j"), k("k"); + loopnests::LoopNest loop({ { i, { 0, M } }, + { j, { 0, N } }, + { k, { 0, K } } }); + + auto [i_outer, i_inner] = loop.Split(i, k_r); + auto [j_outer, j_inner] = loop.Split(j, k_c); + + auto prologueKernel = loopnests::Kernel("prologue") + .Inputs(temp.GetValue()) + .Indices(i_inner, j_inner) + .Define([](Matrix temp, Scalar i_inner, Scalar j_inner) { + temp(i_inner, j_inner) = 0; + }); + + auto bodyKernel = loopnests::Kernel("body") + .Inputs(A.GetValue(), B.GetValue(), C.GetValue(), temp.GetValue()) + .Indices(i, j, i_inner, j_inner, k) + .Define([](Matrix A, Matrix B, Matrix C, Matrix temp, Scalar i, Scalar j, Scalar i_inner, Scalar j_inner, Scalar k) { + temp(i_inner, j_inner) += A(i, k) * B(k, j); + }); + auto epilogueKernel = loopnests::Kernel("epilogue") + .Inputs(C.GetValue(), temp.GetValue()) + .Indices(i, j, j_outer, i_inner, j_inner) + .Define([](Matrix C, Matrix temp, Scalar i, Scalar j, Scalar j_outer, Scalar i_inner, Scalar j_inner) { + C(i, j) = temp(i_inner, j_inner); + }); + + loop.SetLoopOrder({ i_outer, j_outer, k, j_inner, i_inner }); + + loop.AddKernel(prologueKernel, First(k)); + loop.AddKernel(bodyKernel, LoopNest::ConstraintType::predicate); + loop.AddKernel(epilogueKernel, Last(k)); + + loopnests::CodeGenerator generator; + generator.Run(loop); + +#if 0 + PrintLoops(loop, "LoopNestBoundary_test4"); +#endif + + return VerifySame(C, expected); +} + +Scalar LoopNestBoundary_test5() +{ + const int M = 9; + const int N = 10; + const int K = 11; + + // Computes A*B + 1 + auto A = MakeMatrix(M, K, "A"); + auto B = MakeMatrix(K, N, "B"); + auto C = MakeMatrix(M, N, "C"); + auto expected = MakeMatrix(M, N, "expected"); + + // initialize A, B, and C + ForRange(M, [&](Scalar i) { + ForRange(K, [&](Scalar j) { + // A(i, j) = i - j; + A(i, j) = 1; + }); + }); + + ForRange(K, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + // B(i, j) = i + 2 * j; + B(i, j) = 1; + }); + }); + + ForRange(M, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + C(i, j) = 100; + expected(i, j) = 0; + }); + }); + + // fill out expected with a simple for-loop gemm (plus 1) + ForRange(M, [&](Scalar i) { + ForRange(N, 
[&](Scalar j) { + ForRange(K, [&](Scalar k) { + expected(i, j) += A(i, k) * B(k, j); + }); + expected(i, j) += 1; + }); + }); + + loopnests::Index i("i"), j("j"), k("k"); + loopnests::LoopNest loop({ { i, { 0, M } }, + { j, { 0, N } }, + { k, { 0, K } } }); + + auto prologueKernel = loopnests::Kernel("Prologue") + .Inputs(C.GetValue()) + .Indices(i, j) + .Define([](Matrix C, Scalar i, Scalar j) { + C(i, j) = 0; + }); + auto bodyKernel = loopnests::Kernel("Body") + .Inputs(A.GetValue(), B.GetValue(), C.GetValue()) + .Indices(i, j, k) + .Define([](Matrix A, Matrix B, Matrix C, Scalar i, Scalar j, Scalar k) { + C(i, j) += A(i, k) * B(k, j); + }); + auto epilogueKernel = loopnests::Kernel("Epilogue") + .Inputs(C.GetValue()) + .Indices(i, j) + .Define([](Matrix C, Scalar i, Scalar j) { + C(i, j) += 1; + }); + + loop.AddKernel(prologueKernel, First(k)); + loop.AddKernel(bodyKernel, LoopNest::ConstraintType::predicate); + loop.AddKernel(epilogueKernel, Last(k), PlacementPredicate{ Placement::after }); + + auto [i_outer, i_inner] = loop.Split(i, 4); + auto [j_outer, j_inner] = loop.Split(j, 4); + auto [k_outer, k_inner] = loop.Split(k, 4); + + loopnests::CodeGenerator generator; + generator.Run(loop); + +#if 0 + PrintLoops(loop, "LoopNestBoundary_test5"); +#endif + + return VerifySame(C, expected); +} + +Scalar LoopNestReorder_test1() +{ + auto matrix = MakeMatrix(4, 5); + IndexRange i("i", { 0, 4 }), j("j", { 0, 5 }); + auto iIndex = i.GetIndex(); + auto jIndex = j.GetIndex(); + + auto kernel = Kernel("kernel") + .Inputs(matrix.GetValue()) + .Indices(iIndex, jIndex) + .Define(loopnest_kernel); + + LoopNest loop(IterationDomain({ i, j })); + loop.AddKernel(kernel, LoopNest::ConstraintType::predicate); + loop.Split(i.GetIndex(), 2); + loop.SetLoopOrder({ iIndex, jIndex, iIndex }); + + CodeGenerator generator; + generator.Run(loop); + +#if 0 + PrintLoops(loop, "LoopNestReorder_test1"); +#endif + + return matrix(2, 3) - 19; // will return 0 if calculation is correct +} + +Scalar LoopNestReorder_test2() +{ + auto matrix = MakeMatrix(4, 5); + Index i("i"), j("j"); + + auto kernel = Kernel("kernel") + .Inputs(matrix.GetValue()) + .Indices(i, j) + .Define(loopnest_kernel); + + LoopNest loop(IterationDomain({ { i, { 0, 4 } }, { j, { 0, 5 } } })); + loop.AddKernel(kernel, LoopNest::ConstraintType::predicate); + auto [iOuter, iInner] = loop.Split(i, 2); + loop.SetLoopOrder({ iInner, j, iOuter }); + + CodeGenerator generator; + generator.Run(loop); + +#if 0 + PrintLoops(loop, "LoopNestReorder_test2"); +#endif + + return matrix(2, 3) - 19; // will return 0 if calculation is correct +} + +Scalar TwoKernel_test() +{ + auto matrix = MakeMatrix(4, 5); + IndexRange i("i", { 0, 4 }), j("j", { 0, 5 }); + auto iIndex = i.GetIndex(); + auto jIndex = j.GetIndex(); + + auto kernel1 = Kernel("kernel1") + .Inputs(matrix.GetValue()) + .Indices(iIndex, jIndex) + .Define(loopnest_kernel); + + auto kernel2 = Kernel("kernel2") + .Inputs(matrix.GetValue()) + .Indices(iIndex, jIndex) + .Define(loopnest_kernel_2); + + LoopNest loop(IterationDomain({ i, j })); + loop.AddKernel(kernel1, LoopNest::ConstraintType::predicate); + loop.AddKernel(kernel2, LoopNest::ConstraintType::predicate); + loop.Split(i.GetIndex(), 2); + loop.SetLoopOrder({ iIndex, jIndex, iIndex }); + + CodeGenerator generator; + generator.Run(loop); + +#if 0 + PrintLoops(loop, "TwoKernel_test"); +#endif + + auto expected = 19 + 26; // 19 == 1st kernel (2*i + 5*j), 26 == 2nd kernel (10*i+2*j) + return matrix(2, 3) - expected; // will return 0 if calculation is 
correct +} + +// Prototype for test with a kernel that runs on the last iteration of an index +// split: where to split the loop (0 if no split) +// id: id to use for body and "last" kernels ("" if they should not share an ID) +Scalar LoopNestLastPredicate_test(std::string tag, int split, std::string id) +{ + const int n = 32; + std::vector expectedValues(n); + for (int i = 0; i < n; ++i) + { + expectedValues[i] = i; + } + if (id.empty()) + { + expectedValues[n - 1] += 1; + } + else + { + expectedValues[n - 1] = 1; + } + auto expected = Vector(expectedValues); + + auto vector = MakeVector(n); + Index i("i"); + LoopNest loop({ { i, { 0, n } } }); + + if (split != 0) + { + loop.Split(i, split); + } + + auto kernel = Kernel("k", id) + .Inputs(vector.GetValue()) + .Indices(i) + .Define(set_vector_kernel); + + auto boundaryKernel = Kernel("boundary", id) + .Inputs(vector.GetValue()) + .Indices(i) + .Define(increment_vector_kernel); + + if (id.empty()) + { + loop.AddKernel(kernel, LoopNest::ConstraintType::predicate); + loop.AddKernel(boundaryKernel, { Last(i) }); + } + else + { + loop.AddKernel(boundaryKernel, { Last(i) }); + loop.AddKernel(kernel, LoopNest::ConstraintType::predicate); + } + +#if 0 + PrintLoops(loop, "LoopNestLastPredicate_test_" + tag); +#endif + + CodeGenerator generator; + generator.Run(loop); + + // Verify that the actual result is what we expect + return VerifySame(vector, expected); +} + +// Test with a kernel that runs on the last iteration of an index +Scalar LoopNestLastPredicate_test1() +{ + return LoopNestLastPredicate_test("1", 0, ""); +} + +// Test with a kernel that runs on the last iteration of an index, with a split loop +Scalar LoopNestLastPredicate_test2() +{ + return LoopNestLastPredicate_test("2", 4, ""); +} + +// Test with an alternate kernel that runs on the last iteration of an index (instead of the main kernel) +Scalar LoopNestLastPredicate_test3() +{ + return LoopNestLastPredicate_test("3", 0, "k"); +} +// Test with an alternate kernel that runs on the last iteration of an index (instead of the main kernel), with a split loop +Scalar LoopNestLastPredicate_test4() +{ + return LoopNestLastPredicate_test("4", 4, "k"); +} + +Scalar LoopNestBoundaryPredicate_test1() +{ + const int size = 12; + const int n = 10; + auto vector = MakeVector(size); + for (int i = n; i < size; ++i) + { + vector(i) = 100 * i; + } + + std::vector expectedValues(size); + int mainEnd = 4 * (n / 4); + for (int i = 0; i < mainEnd; ++i) + { + expectedValues[i] = i; + } + for (int i = mainEnd; i < n; ++i) + { + expectedValues[i] = 1; + } + for (int i = n; i < size; ++i) + { + expectedValues[i] = 100 * i; // same as initialized vector, untouched + } + auto expected = Vector(expectedValues); + + Index i("i"); + LoopNest loop({ { i, { 0, n } } }); + auto split_i = loop.Split(i, 4); + + auto kernel = Kernel("k") + .Inputs(vector.GetValue()) + .Indices(i) + .Define(set_vector_kernel); + + auto boundaryKernel = Kernel("boundary", kernel.GetId()) + .Inputs(vector.GetValue()) + .Indices(i) + .Define(increment_vector_kernel); + + loop.AddKernel(boundaryKernel, { EndBoundary(i) }); + loop.AddKernel(kernel, LoopNest::ConstraintType::predicate); + + CodeGenerator generator; + generator.Run(loop); + +#if 0 + PrintLoops(loop, "LoopNestBoundaryPredicate_test1"); +#endif + + // Verify that the actual result is what we expect + return VerifySame(vector, expected); +} + +Scalar MissingIndex_test() +{ + const int n = 12; + const int splitAmount = 4; + auto vector = MakeVector(n); + + ForRange(n, [&](Scalar 
i) { + vector(i) = Scalar(100); + }); + + std::vector expectedValues(n); + for (int i = 0; i < n; ++i) + { + expectedValues[i] = 100; + } + for (int i = 0; i < n; i += splitAmount) + { + expectedValues[i] = i; + } + auto expected = Vector(expectedValues); + + Index i("i"); + LoopNest loop({ { i, { 0, n } } }); + auto [i_outer, i_inner] = loop.Split(i, splitAmount); + + auto kernel = Kernel("k") + .Inputs(vector.GetValue()) + .Indices(i) + .Define(set_vector_kernel); // v[i] = i + + CodePositionConstraints constraint(LoopFragmentType::body, { i_outer }, {}); + loop.AddKernel(kernel, constraint); + + CodeGenerator generator; + generator.Run(loop); + +#if 0 + PrintLoops(loop, "MissingIndex_test"); +#endif + + // Verify that the actual result is what we expect + return VerifySame(vector, expected); +} + +Scalar RequiredIndex_test() +{ + std::string loopOrder = "ijk"; + const int N = 8; + auto A = MakeMatrix(N, N, "A"); + auto B = MakeMatrix(N, N, "B"); + auto C = MakeMatrix(N, N, "C"); + + // initialize A, B, and C + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + A(i, j) = i - j; + B(i, j) = i + 2 * j; + C(i, j) = 100; + }); + }); + + // The input matrices: + // A: B: C: + // [ 0, -1, -2, -3, -4, -5, -6, -7] [ 0, 2, 4, 6, 8, 10, 12, 14] [ 100 100 100 ... ] + // [ 1, 0, -1, -2, -3, -4, -5, -6] [ 1, 3, 5, 7, 9, 11, 13, 15] [ 100 100 100 ... ] + // [ 2, 1, 0, -1, -2, -3, -4, -5] [ 2, 4, 6, 8, 10, 12, 14, 16] [ ... ] + // [ 3, 2, 1, 0, -1, -2, -3, -4] [ 3, 5, 7, 9, 11, 13, 15, 17] [ ... ] + // [ 4, 3, 2, 1, 0, -1, -2, -3] [ 4, 6, 8, 10, 12, 14, 16, 18] [ ... ] + // [ 5, 4, 3, 2, 1, 0, -1, -2] [ 5, 7, 9, 11, 13, 15, 17, 19] [ ... ] + // [ 6, 5, 4, 3, 2, 1, 0, -1] [ 6, 8, 10, 12, 14, 16, 18, 20] [ ... ] + // [ 7, 6, 5, 4, 3, 2, 1, 0] [ 7, 9, 11, 13, 15, 17, 19, 21] [ ... 
] + + // (A * B) + 1 (the desired result): + // [-139, -195, -251, -307, -363, -419, -475, -531] + // [-111, -151, -191, -231, -271, -311, -351, -391] + // [ -83, -107, -131, -155, -179, -203, -227, -251] + // [ -55, -63, -71, -79, -87, -95, -103, -111] + // [ -27, -19, -11, -3, 5, 13, 21, 29] + // [ 1, 25, 49, 73, 97, 121, 145, 169] + // [ 29, 69, 109, 149, 189, 229, 269, 309] + // [ 57, 113, 169, 225, 281, 337, 393, 449] + + Index i("i"), j("j"), k("k"); + + auto initCKernel = Kernel("init") + .Inputs(C.GetValue()) + .Indices(i, j) + .Define(initToZero); + auto innerKernel = Kernel("matmul") + .Inputs(A.GetValue(), B.GetValue(), C.GetValue()) + .Indices(i, j, k) + .Define(matmul_kernel); + auto postProcessCKernel = Kernel("post") + .Inputs(C.GetValue()) + .Indices(i, j) + .Define(addOne); + + LoopNest loop({ { i, { 0, N } }, + { j, { 0, N } }, + { k, { 0, N } } }); + + CodePositionConstraints preConstraint{ LoopFragmentType::prologue, { i, j }, {} }; + loop.AddKernel(initCKernel, preConstraint); + + loop.AddKernel(innerKernel, LoopNest::ConstraintType::constraint); + + CodePositionConstraints postConstraint{ LoopFragmentType::epilogue, { i, j }, {} }; + loop.AddKernel(postProcessCKernel, postConstraint); + + // SplitAndSetOrder(loop, { i, j, k }, { 4, 2 }, loopOrder); + + CodeGenerator generator; + generator.Run(loop); + +#if 0 // DEBUGGING + PrintLoops(loop, "RequiredIndex_test"); + + InvokeForContext([&](auto&) { + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + auto val = C(i, j).Get(); + Log() << std::setw(5) << val; + }); + Log() << EOL; + }); + }); +#endif + + return C(1, 2) + C(2, 1) - (-191 + -107); // will return 0 if calculation is correct +} + +Scalar SimpleImperfectNest_test() +{ + const int N = 4; + Vector A = MakeVector(N); + Vector B = MakeVector(N); + + // initialize A, B, and C + ForRange(N, [&](Scalar i) { + A(i) = 10; + B(i) = 20; + }); + Index i("i"); + + auto prologueKernel = Kernel("prologue") + .Inputs(A.GetValue()) + .Indices(i) + .Define(set_vector_kernel); + auto bodyKernel = Kernel("body") + .Inputs(A.GetValue()) + .Indices(i) + .Define(increment_vector_kernel); + auto epilogueKernel = Kernel("epilogue") + .Inputs(A.GetValue(), B.GetValue()) + .Indices(i) + .Define(copy_vector_kernel); + + LoopNest loop({ { i, { 0, N } } }); + + CodePositionConstraints prologueConstraint{ LoopFragmentType::prologue, {}, { i } }; + loop.AddKernel(prologueKernel, prologueConstraint); + + loop.AddKernel(bodyKernel, LoopNest::ConstraintType::constraint); + + CodePositionConstraints epilogueConstraint{ LoopFragmentType::epilogue, {}, { i } }; + loop.AddKernel(epilogueKernel, epilogueConstraint); + + CodeGenerator generator; + generator.Run(loop); + + // DEBUGGING +#if 0 + PrintLoops(loop, "SimpleImperfectNest_test"); +#endif + +#if 0 + InvokeForContext([&](auto&) { + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + auto val = C(i, j).Get(); + Log() << std::setw(5) << val; + }); + Log() << EOL; + }); + }); +#endif + + std::vector expectedValues{ 20, 20, 20, 11 }; + auto expected = Vector(expectedValues); + + // Verify that the actual result is what we expect + return VerifySame(B, expected); +} + +Scalar ImperfectNest_test(std::string loopOrder) +{ + const int N = 8; + auto A = MakeMatrix(N, N, "A"); + auto B = MakeMatrix(N, N, "B"); + auto C = MakeMatrix(N, N, "C"); + + // initialize A, B, and C + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + A(i, j) = i - j; + B(i, j) = i + 2 * j; + C(i, j) = 100; + }); + }); + + // The input matrices: + // A: 
B: C: + // [ 0, -1, -2, -3, -4, -5, -6, -7] [ 0, 2, 4, 6, 8, 10, 12, 14] [ 100 100 100 ... ] + // [ 1, 0, -1, -2, -3, -4, -5, -6] [ 1, 3, 5, 7, 9, 11, 13, 15] [ 100 100 100 ... ] + // [ 2, 1, 0, -1, -2, -3, -4, -5] [ 2, 4, 6, 8, 10, 12, 14, 16] [ ... ] + // [ 3, 2, 1, 0, -1, -2, -3, -4] [ 3, 5, 7, 9, 11, 13, 15, 17] [ ... ] + // [ 4, 3, 2, 1, 0, -1, -2, -3] [ 4, 6, 8, 10, 12, 14, 16, 18] [ ... ] + // [ 5, 4, 3, 2, 1, 0, -1, -2] [ 5, 7, 9, 11, 13, 15, 17, 19] [ ... ] + // [ 6, 5, 4, 3, 2, 1, 0, -1] [ 6, 8, 10, 12, 14, 16, 18, 20] [ ... ] + // [ 7, 6, 5, 4, 3, 2, 1, 0] [ 7, 9, 11, 13, 15, 17, 19, 21] [ ... ] + + // (A * B) + 1 (the desired result): + // [-139, -195, -251, -307, -363, -419, -475, -531] + // [-111, -151, -191, -231, -271, -311, -351, -391] + // [ -83, -107, -131, -155, -179, -203, -227, -251] + // [ -55, -63, -71, -79, -87, -95, -103, -111] + // [ -27, -19, -11, -3, 5, 13, 21, 29] + // [ 1, 25, 49, 73, 97, 121, 145, 169] + // [ 29, 69, 109, 149, 189, 229, 269, 309] + // [ 57, 113, 169, 225, 281, 337, 393, 449] + + Index i("i"), j("j"), k("k"); + + auto innerKernel = Kernel("matmul") + .Inputs(A.GetValue(), B.GetValue(), C.GetValue()) + .Indices(i, j, k) + .Define(matmul_kernel); + auto initCKernel = Kernel("init") + .Inputs(C.GetValue()) + .Indices(i, j) + .Define(initToZero); + auto postProcessCKernel = Kernel("post") + .Inputs(C.GetValue()) + .Indices(i, j) + .Define(addOne); + + LoopNest loop({ { i, { 0, N } }, + { j, { 0, N } }, + { k, { 0, N } } }); + + CodePositionConstraints preConstraint{ LoopFragmentType::prologue, { i, j }, { k } }; + loop.AddKernel(initCKernel, preConstraint); + + loop.AddKernel(innerKernel, LoopNest::ConstraintType::constraint); + + CodePositionConstraints postConstraint{ LoopFragmentType::epilogue, { i, j }, { k } }; + loop.AddKernel(postProcessCKernel, postConstraint); + + SplitAndSetOrder(loop, { i, j, k }, { 4, 2 }, loopOrder); + + CodeGenerator generator; + generator.Run(loop); + +#if 0 + PrintLoops(loop, "ImperfectNest_test_" + loopOrder); +#endif + +#if 0 + InvokeForContext([&](auto&) { + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + auto val = C(i, j).Get(); + Log() << std::setw(5) << val; + }); + Log() << EOL; + }); + }); +#endif + + return C(1, 2) + C(2, 1) - (-191 + -107); // will return 0 if calculation is correct +} + +Scalar ImperfectNest_test_ijk() +{ + return ImperfectNest_test("ijk"); +} + +Scalar ImperfectNest_test_ikj() +{ + return ImperfectNest_test("ikj"); +} + +Scalar ImperfectNest_test_kij() +{ + return ImperfectNest_test("kij"); +} + +Scalar ImperfectNest_test_ijkijk() +{ + return ImperfectNest_test("ijkijk"); +} + +Scalar ImperfectNest_test_kijijk() +{ + return ImperfectNest_test("kijijk"); +} + +Scalar ImperfectNest_test_ijkkij() +{ + return ImperfectNest_test("ijkkij"); +} + +Scalar SplitIndex_test1_old() +{ + auto vector = MakeVector(4 * 5); + auto matrix = MakeMatrix(4, 5); + auto splitParam = Scalar(Allocate(utilities::ScalarLayout)); + splitParam = 5; + IndexRange i("i", { 0, 4 * 5 }); + auto iIndex = i.GetIndex(); + LoopNest loop(std::vector{ i }); + auto [i_outer, i_inner] = loop.Split(iIndex, 10); + + auto kernel1 = Kernel("set_vector") + .Inputs(vector.GetValue()) + .Indices(iIndex) + .Define(set_vector_kernel); + + auto kernel2 = Kernel("reorder_vector") + .Inputs(vector.GetValue(), matrix.GetValue(), splitParam.GetValue()) + .Indices(iIndex, i_outer, i_inner) + .Define(reorder_vector_kernel); + + loop.AddKernel(kernel1, LoopNest::ConstraintType::constraint); + loop.AddKernel(kernel2, 
LoopNest::ConstraintType::constraint); + + CodeGenerator generator; + generator.Run(loop); + +#if 0 + PrintLoops(loop, "SplitIndex_test1_old"); +#endif + + return (vector(0) - matrix(0, 0)) + (vector(13) - matrix(2, 3)) + (matrix(3, 2) - (3 * 5 + 2)); +} + +Scalar SplitIndex_test1() +{ + auto vector = MakeVector(4 * 5); + auto matrix = MakeMatrix(4, 5); + auto splitParam = Scalar(Allocate(utilities::ScalarLayout)); + splitParam = 5; + IndexRange i("i", { 0, 4 * 5 }); + auto iIndex = i.GetIndex(); + LoopNest loop(std::vector{ i }); + auto [i_outer, i_inner] = loop.Split(iIndex, 10); + + auto kernel1 = Kernel("set_vector") + .Inputs(vector.GetValue()) + .Indices(iIndex) + .Define(set_vector_kernel); + + auto kernel2 = Kernel("reorder_vector") + .Inputs(vector.GetValue(), matrix.GetValue(), splitParam.GetValue()) + .Indices(iIndex, i_outer, i_inner) + .Define(reorder_vector_kernel); + + loop.AddKernel(kernel1, LoopNest::ConstraintType::predicate); + loop.AddKernel(kernel2, LoopNest::ConstraintType::predicate); + + CodeGenerator generator; + generator.Run(loop); + +#if 0 + PrintLoops(loop, "SplitIndex_test1"); +#endif + + return (vector(0) - matrix(0, 0)) + (vector(13) - matrix(2, 3)) + (matrix(3, 2) - (3 * 5 + 2)); +} + +// Same as SplitIndex_test1, but with an extra split +Scalar SplitIndex_test2() +{ + auto vector = MakeVector(4 * 5); + auto matrix = MakeMatrix(4, 5); + auto splitParam = Scalar(Allocate(utilities::ScalarLayout)); + splitParam = 5; + + Index i("i"); + LoopNest loop({ { i, { 0, 4 * 5 } } }); + auto [i_outer, temp] = loop.Split(i, 10); + auto [i_middle, i_inner] = loop.Split(i, 5); + + auto kernel1 = Kernel("set_vector") + .Inputs(vector.GetValue()) + .Indices(i) + .Define(set_vector_kernel); + + auto kernel2 = Kernel("reorder_vector") + .Inputs(vector.GetValue(), matrix.GetValue(), splitParam.GetValue()) + .Indices(i, i_outer, temp) + .Define(reorder_vector_kernel); + + loop.AddKernel(kernel1, LoopNest::ConstraintType::predicate); + loop.AddKernel(kernel2, LoopNest::ConstraintType::predicate); + +#if 0 + PrintLoops(loop, "SplitIndex_test2"); +#endif + + CodeGenerator generator; + generator.Run(loop); + + return (vector(0) - matrix(0, 0)) + (vector(13) - matrix(2, 3)) + (matrix(3, 2) - (3 * 5 + 2)); +} + +// Same as SplitIndex_test2, but splitting an outer index +Scalar SplitIndex_test3() +{ + auto vector = MakeVector(4 * 5); + auto matrix = MakeMatrix(4, 5); + auto splitParam = Scalar(Allocate(utilities::ScalarLayout)); + splitParam = 5; + + Index i("i"); + LoopNest loop({ { i, { 0, 4 * 5 } } }); + auto [temp, i_inner] = loop.Split(i, 5); + auto [i_outer, i_middle] = loop.Split(temp, 10); + loop.SetLoopOrder({ i_outer, i_middle, i_inner }); + auto kernel1 = Kernel("set_vector") + .Inputs(vector.GetValue()) + .Indices(i) + .Define(set_vector_kernel); + + auto kernel2 = Kernel("reorder_vector") + .Inputs(vector.GetValue(), matrix.GetValue(), splitParam.GetValue()) + .Indices(i, temp, i_inner) + .Define(reorder_vector_kernel); + + loop.AddKernel(kernel1, LoopNest::ConstraintType::predicate); + loop.AddKernel(kernel2, LoopNest::ConstraintType::predicate); + +#if 0 + PrintLoops(loop, "SplitIndex_test3"); +#endif + + CodeGenerator generator; + generator.Run(loop); + + return (vector(0) - matrix(0, 0)) + (vector(13) - matrix(2, 3)) + (matrix(3, 2) - (3 * 5 + 2)); +} + +Scalar EpilogueIndex_test() +{ + const int N = 8; + auto vector = MakeVector(N); + + Index i("i"); + LoopNest loop({ { i, { 0, N } } }); + auto [i_outer, i_inner] = loop.Split(i, 4); + + auto prologueKernel = 
Kernel("prologue") + .Inputs(vector.GetValue()) + .Indices(i) + .Define([](Vector v, Scalar i) { + v[i] = i; + }); + auto bodyKernel = Kernel("body") + .Inputs(vector.GetValue()) + .Indices(i) + .Define([](Vector v, Scalar i) { + v[i] += 10; + }); + auto epilogueKernel = Kernel("epilogue") + .Inputs(vector.GetValue()) + .Indices(i) + .Define([](Vector v, Scalar i) { + v[i] += 1; + }); + + loopnests::CodePositionConstraints prologueConstraints{ loopnests::LoopFragmentType::prologue, { i_outer }, {} }; + loopnests::CodePositionConstraints epilogueConstraints{ loopnests::LoopFragmentType::epilogue, { i_outer }, {} }; + loop.AddKernel(prologueKernel, prologueConstraints); + loop.AddKernel(bodyKernel, LoopNest::ConstraintType::constraint); + loop.AddKernel(epilogueKernel, epilogueConstraints); + +#if 0 + PrintLoops(loop, "EpilogueIndex_test"); +#endif + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + + CodeGenerator generator; + generator.Run(loop); + + std::vector expectedValues{ 10, 10, 10, 11, 14, 10, 10, 11 }; + auto expected = Vector(expectedValues); + + If( + VerifySame(vector, expected) == 0, + [&] { + ok = 0; + }) + .Else([&] { + ok = 1; + }); + + return ok; +} + +Scalar RenameKernelArg_test() +{ + auto matrix = MakeMatrix(4, 5); + IndexRange i("i", { 0, 4 }), j("j", { 0, 5 }); + LoopNest loop(std::vector{ i, j }); + + auto kernel = Kernel("kernel") + .Inputs(matrix.GetValue()) + .Indices(i.GetIndex(), j.GetIndex()) + .Define(loopnest_kernel); + + loop.AddKernel(kernel, LoopNest::ConstraintType::constraint); + + auto newMatrix = MakeMatrix(4, 5); + auto [outer, inner] = loop.Split(i.GetIndex(), 2); + loop.RenameVariable(matrix, newMatrix, { inner }); + + CodeGenerator generator; + generator.Run(loop); + + return newMatrix(2, 3) - 19; // will return 0 if calculation is correct +} + +Scalar NonInnermostKernel_test1() +{ + auto matrix = MakeMatrix(4, 4); + ForRange(4, [&](Scalar i) { + ForRange(4, [&](Scalar j) { + matrix(i, j) = i - j; + }); + }); + + // matrix: + // 0 -1 -2 -3 + // 1 0 -1 -2 + // 2 1 0 -1 + // 3 2 1 0 + + // result with 1x1 only: + // 1 0 -1 -2 + // 2 1 0 -1 + // 3 2 1 0 + // 4 3 2 1 + + // (correct) result with 2x2 only ("before"): + // 2 -1 0 -3 + // 1 0 -1 -2 + // 4 1 2 -1 + // 3 2 1 0 + + // result with 2x2 invoked at the outer level, and 1x1 invoked inside ("first"): + // 2 0 0 -2 + // 2 1 0 -1 + // 4 2 2 0 + // 4 3 2 1 + + Index i("i"), j("j"); + LoopNest loop({ { i, { 0, 4 } }, + { j, { 0, 4 } } }); + + auto kernel1x1 = Kernel("kernel_1x1") + .Inputs(matrix.GetValue()) + .Indices(i, j) + .Define(addOne); + + auto [iOuter, iInner] = loop.Split(i, 2); + auto [jOuter, jInner] = loop.Split(j, 2); + + auto kernel2x2 = Kernel("kernel_2x2", kernel1x1.GetId()) + .Inputs(matrix.GetValue()) + .Indices(iOuter, jOuter) + .Define(addTwo); + loop.AddKernel(kernel2x2, LoopNest::ConstraintType::constraint); + loop.AddKernel(kernel1x1, LoopNest::ConstraintType::constraint); + +#if 0 + PrintLoops(loop, "NonInnermostKernel_test1"); +#endif + + CodeGenerator generator; + generator.Run(loop); + +#if 0 // DEBUGGING + InvokeForContext([&](auto&) { + ForRange(4, [&](Scalar i) { + ForRange(4, [&](Scalar j) { + auto val = matrix(i, j).Get(); + Log() << std::setw(4) << val; + }); + Log() << EOL; + }); + }); +#endif + return matrix(1, 1).Copy(); // will return 0 if calculation is correct +} + +Scalar NonInnermostKernel_test2() +{ + auto matrix = MakeMatrix(4, 4, "matrix"); + auto expected = MakeMatrix(4, 4, "matrix"); + ForRange(4, [&](Scalar i) { + ForRange(4, [&](Scalar j) { + matrix(i, 
j) = i - j; + expected(i, j) = i - j; + }); + }); + ForRange(2, [&](Scalar i) { + ForRange(2, [&](Scalar j) { + expected(2 * i, 2 * j) += 2; + }); + }); + + // matrix: + // 0 -1 -2 -3 + // 1 0 -1 -2 + // 2 1 0 -1 + // 3 2 1 0 + + // result with 1x1: + // 1 0 -1 -2 + // 2 1 0 -1 + // 3 2 1 0 + // 4 3 2 1 + + // result with 2x2: + // 2 -1 0 -3 + // 1 0 -1 -2 + // 4 1 2 -1 + // 3 2 1 0 + + Index i("i"), j("j"); + LoopNest loop({ { i, { 0, 4 } }, + { j, { 0, 4 } } }); + + auto kernel1x1 = Kernel("kernel_1x1") + .Inputs(matrix.GetValue()) + .Indices(i, j) + .Define(addOne); + + auto [iOuter, iInner] = loop.Split(i, 2); + auto [jOuter, jInner] = loop.Split(j, 2); + + auto kernel2x2 = Kernel("kernel_2x2", kernel1x1.GetId()) + .Inputs(matrix.GetValue()) + .Indices(iOuter, jOuter) + .Define(addTwo); + + CodePositionConstraints bodyConstraint{ LoopFragmentType::body, { iOuter, jOuter }, {} }; + loop.AddKernel(kernel2x2, bodyConstraint); + loop.AddKernel(kernel1x1, LoopNest::ConstraintType::constraint); + +#if 0 + PrintLoops(loop, "NonInnermostKernel_test2"); +#endif + + CodeGenerator generator; + generator.Run(loop); + +#if 0 // DEBUGGING + InvokeForContext([&](auto&) { + ForRange(4, [&](Scalar i) { + ForRange(4, [&](Scalar j) { + auto val = matrix(i, j).Get(); + Log() << std::setw(4) << val; + }); + Log() << EOL; + }); + }); +#endif + + Scalar ok = Allocate(ScalarLayout); + If( + VerifySame(matrix, expected) == 0, + [&] { + ok = 0; + }) + .Else([&] { + ok = 1; + }); + + return matrix(1, 1).Copy(); // will return 0 if calculation is correct +} + +Scalar NonInnermostKernel_test3() +{ + auto matrix = MakeMatrix(4, 4, "matrix"); + ForRange(4, [&](Scalar i) { + ForRange(4, [&](Scalar j) { + matrix(i, j) = i - j; + }); + }); + + // matrix: + // 0 -1 -2 -3 + // 1 0 -1 -2 + // 2 1 0 -1 + // 3 2 1 0 + + // result with 1x1: + // 1 0 -1 -2 + // 2 1 0 -1 + // 3 2 1 0 + // 4 3 2 1 + + // result with 2x2: + // 2 -1 0 -3 + // 1 0 -1 -2 + // 4 1 2 -1 + // 3 2 1 0 + + Index i("i"), j("j"); + LoopNest loop({ { i, { 0, 4 } }, + { j, { 0, 4 } } }); + + auto kernel1x1 = Kernel("kernel_1x1") + .Inputs(matrix.GetValue()) + .Indices(i, j) + .Define(addOne); + + auto [iOuter, iInner] = loop.Split(i, 2); + auto [jOuter, jInner] = loop.Split(j, 2); + + auto kernel2x2 = Kernel("kernel_2x2", kernel1x1.GetId()) + .Inputs(matrix.GetValue()) + .Indices(iOuter, jOuter) + .Define(addTwo); + + loop.AddKernel(kernel2x2, LoopFragmentType::body); + loop.AddKernel(kernel1x1, LoopFragmentType::body); + +#if 0 + PrintLoops(loop, "NonInnermostKernel_test3"); +#endif + + CodeGenerator generator; + generator.Run(loop); + +#if 0 // DEBUGGING + InvokeForContext([&](auto&) { + ForRange(4, [&](Scalar i) { + ForRange(4, [&](Scalar j) { + auto val = matrix(i, j).Get(); + Log() << std::setw(4) << val; + }); + Log() << EOL; + }); + }); +#endif + return matrix(1, 1).Copy(); // will return 0 if calculation is correct +} + +Scalar NonInnermostKernel_test4() +{ + auto matrix = MakeMatrix(4, 4); + ForRange(4, [&](Scalar i) { + ForRange(4, [&](Scalar j) { + matrix(i, j) = i - j; + }); + }); + + // matrix: + // 0 -1 -2 -3 + // 1 0 -1 -2 + // 2 1 0 -1 + // 3 2 1 0 + + // result with 1x1 only: + // 1 0 -1 -2 + // 2 1 0 -1 + // 3 2 1 0 + // 4 3 2 1 + + // result with 2x2 only ("before"): + // 2 -1 0 -3 + // 1 0 -1 -2 + // 4 1 2 -1 + // 3 2 1 0 + + // result with 2x2 invoked at the outer level, and 1x1 invoked inside ("first"): + // 2 0 0 -2 + // 2 1 0 -1 + // 4 2 2 0 + // 4 3 2 1 + + Index i("i"), j("j"); + LoopNest loop({ { i, { 0, 4 } }, + { j, { 0, 4 } } 
});
+
+    auto kernel1x1 = Kernel("kernel_1x1")
+                         .Inputs(matrix.GetValue())
+                         .Indices(i, j)
+                         .Define(addOne);
+
+    auto [iOuter, iInner] = loop.Split(i, 2);
+    auto [jOuter, jInner] = loop.Split(j, 2);
+
+    auto kernel2x2 = Kernel("kernel_2x2", kernel1x1.GetId())
+                         .Inputs(matrix.GetValue())
+                         .Indices(iOuter, jOuter)
+                         .Define(addTwo);
+
+    loop.AddKernel(kernel1x1, LoopNest::ConstraintType::predicate);
+    loop.AddKernel(kernel2x2, {}, IsDefined(iOuter) && IsDefined(jOuter));
+
+#if 0
+    PrintLoops(loop, "NonInnermostKernel_test4");
+#endif
+
+    CodeGenerator generator;
+    generator.Run(loop);
+
+#if 0 // DEBUGGING
+    InvokeForContext([&](auto&) {
+        ForRange(4, [&](Scalar i) {
+            ForRange(4, [&](Scalar j) {
+                auto val = matrix(i, j).Get();
+                Log() << std::setw(4) << val;
+            });
+            Log() << EOL;
+        });
+    });
+#endif
+    return matrix(1, 1).Copy(); // will return 0 if calculation is correct
+}
+
+// This test adds 1 to each element in a 4x4 matrix, but does all the work on a cached piece of the matrix.
+// The i and j dimensions are subdivided into 2x2 tiles, then each tile is copied into the cache, operated on,
+// and copied back.
+// In this version of the test, the cache is the same size as the original matrix. The next test shows a more useful
+// scenario, where the cache is the size of a single tile.
+Scalar CachedMatrix_test1()
+{
+    const int N = 4;
+    // auto A = MakeMatrix(N, N);
+    // A.GetValue().SetName("A"); // BUG(?): this doesn't work because GetValue() returns a copy
+    auto A = MakeMatrix(N, N, "A");
+
+    ForRange(N, [&](Scalar i) {
+        ForRange(N, [&](Scalar j) {
+            A(i, j) = i - j;
+        });
+    });
+
+    // A:
+    // [ 0, -1, -2, -3 ]
+    // [ 1, 0, -1, -2 ]
+    // [ 2, 1, 0, -1 ]
+    // [ 3, 2, 1, 0 ]
+
+    const int cacheSize = 2;
+    auto cache = MakeMatrix(N, N, "cache");
+
+    Index i("i"), j("j");
+    LoopNest loop({ { i, { 0, N } },
+                    { j, { 0, N } } });
+    auto computeKernel = Kernel("compute")
+                             .Inputs(A.GetValue())
+                             .Indices(i, j)
+                             .Define(addOne);
+
+    auto [iOuter, iInner] = loop.Split(i, cacheSize);
+    auto [jOuter, jInner] = loop.Split(j, cacheSize);
+
+    auto initCacheKernel = Kernel("cache")
+                               .Inputs(A.GetValue(), cache.GetValue())
+                               .Indices(iOuter, jOuter)
+                               .Define(copyToCache);
+
+    auto copybackKernel = Kernel("uncache")
+                              .Inputs(A.GetValue(), cache.GetValue())
+                              .Indices(iOuter, jOuter)
+                              .Define(copyFromCache);
+
+    // inside the iInner and jInner loops (and any loops nested within them), "cache" is used instead of "A"
+    loop.RenameVariable(A, cache, { iInner, jInner });
+
+    loop.AddKernel(initCacheKernel, LoopFragmentType::prologue);
+    loop.AddKernel(computeKernel, LoopNest::ConstraintType::constraint);
+    loop.AddKernel(copybackKernel, LoopFragmentType::epilogue);
+
+    CodeGenerator generator;
+    generator.Run(loop);
+
+#if 0 // DEBUGGING
+    PrintLoops(loop, "CachedMatrix_test1");
+
+    InvokeForContext([&](auto&) {
+        ForRange(N, [&](Scalar i) {
+            ForRange(N, [&](Scalar j) {
+                auto val = A(i, j).Get();
+                Log() << std::setw(4) << val;
+            });
+            Log() << EOL;
+        });
+    });
+#endif
+    return A(2, 0) + A(0, 2) - 2; // will return 0 if calculation is correct
+}
+
+// This test adds 1 to each element in a 4x4 matrix, but does all the work on a cached piece of the matrix.
+// The i and j dimensions are subdivided into 2x2 tiles, then each tile is copied into the cache, operated on,
+// and copied back.
+// In this version of the test, the cache is the same size as the original matrix. The next test shows a more useful
+// scenario, where the cache is the size of a single tile.
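+//
+// A minimal plain-C++ sketch of roughly the schedule described above (illustrative only: raw int
+// arrays, fixed N = 4 and cacheSize = 2, hypothetical helper name; kept under #if 0 so it is not compiled):
+#if 0
+void CachedAddOneFullSizeCacheSketch(int A[4][4])
+{
+    const int N = 4;
+    const int cacheSize = 2;
+    int cache[4][4]; // full-size cache: tile (io, jo) of "cache" shadows the same tile of A
+    for (int io = 0; io < N; io += cacheSize)
+    {
+        for (int jo = 0; jo < N; jo += cacheSize)
+        {
+            // prologue kernel ("cache"): copy the current tile of A into the cache
+            for (int ii = 0; ii < cacheSize; ++ii)
+                for (int ji = 0; ji < cacheSize; ++ji)
+                    cache[io + ii][jo + ji] = A[io + ii][jo + ji];
+
+            // compute kernel: RenameVariable redirects reads/writes of "A" to "cache" here
+            for (int ii = 0; ii < cacheSize; ++ii)
+                for (int ji = 0; ji < cacheSize; ++ji)
+                    cache[io + ii][jo + ji] += 1;
+
+            // epilogue kernel ("uncache"): copy the processed tile back into A
+            for (int ii = 0; ii < cacheSize; ++ii)
+                for (int ji = 0; ji < cacheSize; ++ji)
+                    A[io + ii][jo + ji] = cache[io + ii][jo + ji];
+        }
+    }
+}
+#endif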
+Scalar CachedMatrix_test1_new()
+{
+    const int N = 4;
+    // auto A = MakeMatrix(N, N);
+    // A.GetValue().SetName("A"); // BUG(?): this doesn't work because GetValue() returns a copy
+    auto A = MakeMatrix(N, N, "A");
+
+    ForRange(N, [&](Scalar i) {
+        ForRange(N, [&](Scalar j) {
+            A(i, j) = i - j;
+        });
+    });
+
+    // A:
+    // [ 0, -1, -2, -3 ]
+    // [ 1, 0, -1, -2 ]
+    // [ 2, 1, 0, -1 ]
+    // [ 3, 2, 1, 0 ]
+
+    const int cacheSize = 2;
+    auto cache = MakeMatrix(N, N, "cache");
+
+    Index i("i"), j("j");
+    LoopNest loop({ { i, { 0, N } },
+                    { j, { 0, N } } });
+    auto computeKernel = Kernel("compute")
+                             .Inputs(A.GetValue())
+                             .Indices(i, j)
+                             .Define(addOne);
+
+    auto [iOuter, iInner] = loop.Split(i, cacheSize);
+    auto [jOuter, jInner] = loop.Split(j, cacheSize);
+
+    auto initCacheKernel = Kernel("cache")
+                               .Inputs(A.GetValue(), cache.GetValue())
+                               .Indices(iOuter, jOuter)
+                               .Define(copyToCache);
+
+    auto copybackKernel = Kernel("uncache")
+                              .Inputs(A.GetValue(), cache.GetValue())
+                              .Indices(iOuter, jOuter)
+                              .Define(copyFromCache);
+
+    // inside the iInner and jInner loops (and any loops nested within them), "cache" is used instead of "A"
+    loop.RenameVariable(A, cache, { iInner, jInner });
+
+    // loop.AddKernel(initCacheKernel, {}, Before(iInner) || Before(jInner));
+    loop.AddKernel(initCacheKernel, {}, Before(iInner));
+    loop.AddKernel(computeKernel, LoopNest::ConstraintType::predicate);
+    loop.AddKernel(copybackKernel, {}, After(iInner) || After(jInner));
+
+    CodeGenerator generator;
+    generator.Run(loop);
+
+#if 0 // DEBUGGING
+    PrintLoops(loop, "CachedMatrix_test1_new");
+#endif
+
+#if 0 // DEBUGGING
+    InvokeForContext([&](auto&) {
+        ForRange(N, [&](Scalar i) {
+            ForRange(N, [&](Scalar j) {
+                auto val = A(i, j).Get();
+                Log() << std::setw(4) << val;
+            });
+            Log() << EOL;
+        });
+    });
+#endif
+    return A(2, 0) + A(0, 2) - 2; // will return 0 if calculation is correct
+}
+
+// This test adds 1 to each element in a 4x4 matrix, but does all the work on a cached piece of the matrix.
+// The i and j dimensions are subdivided into 2x2 tiles, then each tile is copied into the cache, operated on,
+// and copied back.
+// The cache here is a 2x2 matrix that gets reused for each tile. In this version of the test, we need to add the
+// compute kernel after the loops are split, so that we can have it use the inner tile indices instead of
+// the full matrix indices.
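+//
+// A minimal plain-C++ sketch of the schedule described above (illustrative only: raw int arrays,
+// fixed N = 4 and tile = 2, hypothetical helper name; kept under #if 0 so it is not compiled):
+#if 0
+void CachedAddOneSmallCacheSketch(int A[4][4])
+{
+    const int N = 4;
+    const int tile = 2;
+    int cache[2][2]; // a single tile-sized cache, reused for every (io, jo) tile
+    for (int io = 0; io < N; io += tile)
+    {
+        for (int jo = 0; jo < N; jo += tile)
+        {
+            // prologue kernel ("init"): copy tile (io, jo) of A into the cache
+            for (int ii = 0; ii < tile; ++ii)
+                for (int ji = 0; ji < tile; ++ji)
+                    cache[ii][ji] = A[io + ii][jo + ji];
+
+            // compute kernel: note the tile-relative indices (ii, ji)
+            for (int ii = 0; ii < tile; ++ii)
+                for (int ji = 0; ji < tile; ++ji)
+                    cache[ii][ji] += 1;
+
+            // epilogue kernel ("copyback"): copy the cache back into tile (io, jo) of A
+            for (int ii = 0; ii < tile; ++ii)
+                for (int ji = 0; ji < tile; ++ji)
+                    A[io + ii][jo + ji] = cache[ii][ji];
+        }
+    }
+}
+#endif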
+Scalar CachedMatrix_test2()
+{
+    // Create the 'A' matrix
+    const int N = 4;
+    auto A = MakeMatrix(N, N, "A");
+
+    // Create the small cache matrix
+    const int cacheSize = 2;
+    auto cache = MakeMatrix(cacheSize, cacheSize, "cache");
+
+    // Initialize A to this matrix:
+    // [ 0, -1, -2, -3 ]
+    // [ 1, 0, -1, -2 ]
+    // [ 2, 1, 0, -1 ]
+    // [ 3, 2, 1, 0 ]
+    ForRange(N, [&](Scalar i) {
+        ForRange(N, [&](Scalar j) {
+            A(i, j) = i - j;
+        });
+    });
+
+    // Create a loop nest to iterate over A's domain
+    Index i("i"), j("j");
+    LoopNest loop({ { i, { 0, N } },
+                    { j, { 0, N } } });
+
+    // Split the loops into tiles the size of the cache
+    auto [iOuter, iInner] = loop.Split(i, cacheSize);
+    auto [jOuter, jInner] = loop.Split(j, cacheSize);
+
+    // Tell the loop nest that kernels that run on the individual tiles should use 'cache' in place of 'A'
+    loop.RenameVariable(A, cache, { iInner, jInner });
+
+    // Add the code to initialize the cache with a tile of 'A'
+    auto initCacheKernel = Kernel("init")
+                               .Inputs(A.GetValue(), cache.GetValue())
+                               .Indices(iOuter, jOuter)
+                               .Define(copyToSmallCache);
+
+    // We use "prologue" as the placement to tell the system to run this kernel before the inner tile loops
+    loop.AddKernel(initCacheKernel, LoopFragmentType::prologue);
+
+    // Add the compute kernel, using the inner, tile-relative indices
+    auto kernel = Kernel("kernel")
+                      .Inputs(A.GetValue())
+                      .Indices(iInner, jInner)
+                      .Define(addOne);
+    loop.AddKernel(kernel, LoopNest::ConstraintType::constraint);
+
+    // ...and the code to copy the processed data back from the kernel into 'A'
+    auto copybackKernel = Kernel("copyback")
+                              .Inputs(A.GetValue(), cache.GetValue())
+                              .Indices(iOuter, jOuter)
+                              .Define(copyFromSmallCache);
+
+    // Here, we use "epilogue" as the placement to tell the system to run this kernel after the inner tile loops
+    loop.AddKernel(copybackKernel, LoopFragmentType::epilogue);
+
+    CodeGenerator generator;
+    generator.Run(loop);
+
+#if 0 // DEBUGGING
+    InvokeForContext([&](auto&) {
+        ForRange(N, [&](Scalar i) {
+            ForRange(N, [&](Scalar j) {
+                auto val = A(i, j).Get();
+                Log() << std::setw(4) << val;
+            });
+            Log() << EOL;
+        });
+    });
+#endif
+    return A(2, 0) + A(0, 2) - 2; // will return 0 if calculation is correct
+}
+
+// This test adds 1 to each element in a 4x4 matrix, but does all the work on a cached piece of the matrix.
+// The i and j dimensions are subdivided into 2x2 tiles, then each tile is copied into the cache, operated on,
+// and copied back.
+// The cache here is a 2x2 matrix that gets reused for each tile. In this version of the test, we need to add the
+// compute kernel after the loops are split, so that we can have it use the inner tile indices instead of
+// the full matrix indices.
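+// (For reference, the emitted tile loops are identical to those sketched above CachedMatrix_test2;
+// the two tests differ only in the kernel binding. Illustrative contrast, using the helpers from these tests:
+//     test2: Kernel("kernel").Inputs(A.GetValue()).Indices(iInner, jInner), plus loop.RenameVariable(A, cache, { iInner, jInner })
+//     test3: Kernel("kernel").Inputs(cache.GetValue()).Indices(iInner, jInner), binding the cache directly with no rename needed.)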
+// +// This version differs from CachedMatrix_test2 only in how the cached matrix is given to the kernel +Scalar CachedMatrix_test3() +{ + const int N = 4; + + // Create and initialize the 'A' matrix + auto A = MakeMatrix(N, N, "A"); + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + A(i, j) = i - j; + }); + }); + + // Create the small cache matrix + const int cacheSize = 2; + auto cache = MakeMatrix(cacheSize, cacheSize, "cache"); + + // Create a loop nest to iterate over A's domain + Index i("i"), j("j"); + LoopNest loop({ { i, { 0, N } }, + { j, { 0, N } } }); + + // Split the loops into tiles the size of the cache + auto [iOuter, iInner] = loop.Split(i, cacheSize); + auto [jOuter, jInner] = loop.Split(j, cacheSize); + + // Add the code to initialize the cache with a tile of 'A' + auto initCacheKernel = Kernel("init") + .Inputs(A.GetValue(), cache.GetValue()) + .Indices(iOuter, jOuter) + .Define(copyToSmallCache); + + // We use "prologue" as the placement to tell the system to run this kernel before the inner tile loops + loop.AddKernel(initCacheKernel, LoopFragmentType::prologue); + + // Add the compute kernel, using the inner, tile-relative indices, and the cached matrix + auto kernel = Kernel("kernel") + .Inputs(cache.GetValue()) + .Indices(iInner, jInner) + .Define(addOne); + loop.AddKernel(kernel, LoopNest::ConstraintType::constraint); + + // ...and the code to copy the processed data back from the kernel into 'A' + auto copybackKernel = Kernel("copyback") + .Inputs(A.GetValue(), cache.GetValue()) + .Indices(iOuter, jOuter) + .Define(copyFromSmallCache); + + // Here, we use "epilogue" as the placement to tell the system to run this kernel after the inner tile loops + loop.AddKernel(copybackKernel, LoopFragmentType::epilogue); + + CodeGenerator generator; + generator.Run(loop); + +#if 0 // DEBUGGING + InvokeForContext([&](auto&) { + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + auto val = A(i, j).Get(); + Log() << std::setw(4) << val; + }); + Log() << EOL; + }); + }); +#endif + return A(2, 0) + A(0, 2) - 2; // will return 0 if calculation is correct +} + +// This test does an element-wise sum of two 4x4 matrices, storing the result in the left matrix, +// where the left matrix is cached in 2x2 tiles and the right matrix is not. +// The i and j dimensions are subdivided into 2x2 tiles, then each tile of the left matrix is copied into the cache, +// operated on with the right matrix, and copied back. +// The cache here is a 2x2 matrix that gets reused for each tile. 
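+// The body of the schedule, in plain C++ (illustrative only; the prologue/epilogue tile copies are
+// as in the sketches above, with cacheA holding tile (io, jo) of A):
+//     for (int ii = 0; ii < tile; ++ii)
+//         for (int ji = 0; ji < tile; ++ji)
+//             cacheA[ii][ji] += B[io + ii][jo + ji]; // tile-relative (ii, ji) for the cache, global (io + ii, jo + ji) for B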
+// In this version, we need to pass in both the split indices and the global indices into the kernel since one matrix +// is a cache using the split indices, while the other is uncached and needs the global indices +Scalar CachedMatrix_test4() +{ + const int N = 4; + const int cacheSize = 2; + auto A = MakeMatrix(N, N, "A"); + auto B = MakeMatrix(N, N, "B"); + auto expected = MakeMatrix(N, N); + auto cacheA = MakeMatrix(cacheSize, cacheSize, "cacheA"); + + // Initialize the 'A' matrix + // A: + // [ 0, -1, -2, -3 ] + // [ 1, 0, -1, -2 ] + // [ 2, 1, 0, -1 ] + // [ 3, 2, 1, 0 ] + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + A(i, j) = i - j; + }); + }); + + // Initialize the 'B' matrix + // B: + // [ 0, 1, 2, 3 ] + // [ 1, 2, 3, 4 ] + // [ 2, 3, 4, 5 ] + // [ 3, 4, 5, 6 ] + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + B(i, j) = i + j; + }); + }); + + // Initialize the 'expected' matrix = A + B + // A + B: + // [ 0, 0, 0, 0 ] + // [ 2, 2, 2, 2 ] + // [ 4, 4, 4, 4 ] + // [ 6, 6, 6, 6 ] + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + expected(i, j) = (i - j) + (i + j); + }); + }); + + // Create a loop nest to iterate over A's and B's domains + Index i("i"), j("j"); + LoopNest loop({ { i, { 0, N } }, + { j, { 0, N } } }); + + // Split the loops into tiles the size of the cache + auto [iOuter, iInner] = loop.Split(i, cacheSize); + auto [jOuter, jInner] = loop.Split(j, cacheSize); + + // Add the code to initialize the cache with a tile of 'A' + auto initCacheKernel = Kernel("init") + .Inputs(A.GetValue(), cacheA.GetValue()) + .Indices(iOuter, jOuter) + .Define(copyToSmallCache); + + // We use "prologue" as the placement to tell the system to run this kernel before the inner tile loops + loop.AddKernel(initCacheKernel, LoopFragmentType::prologue); + + // Add the compute kernel, using the inner, tile-relative indices, and the cached matrix + auto kernel = Kernel("kernel") + .Inputs(cacheA.GetValue(), B.GetValue()) + .Indices(iInner, jInner, i, j) + .Define(addCachedMatrixToUnchachedMatrix); + CodePositionConstraints constraints{ LoopFragmentType::body, { iInner, jInner }, {} }; + loop.AddKernel(kernel, constraints); + + // ...and the code to copy the processed data back from the kernel into 'A' + auto copybackKernel = Kernel("copyback") + .Inputs(A.GetValue(), cacheA.GetValue()) + .Indices(iOuter, jOuter) + .Define(copyFromSmallCache); + + // Here, we use "epilogue" as the placement to tell the system to run this kernel after the inner tile loops + loop.AddKernel(copybackKernel, LoopFragmentType::epilogue); + + CodeGenerator generator; + generator.Run(loop); + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + If( + VerifySame(A, expected) == 0, + [&] { + ok = 0; + }) + .Else([&] { + auto value = A.GetValue(); + value.SetLayout({ { (int)A.Size() } }); + DebugPrintVector(value); + DebugPrint("\n"); + }); + return ok; +} + +// This test does an element-wise sum of two 4x4 matrices, storing the result in the left matrix, +// where the left matrix is cached in 2x2 tiles and the right matrix is not. +// The i and j dimensions are subdivided into 2x2 tiles, then each tile of the left matrix is copied into the cache, +// operated on with the right matrix, and copied back. +// The cache here is a 2x2 matrix that gets reused for each tile. 
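+// In the unrolled variant below, the body runs once per 2x2 tile at the outer (io, jo) level and
+// updates the whole panel with straight-line code (plain-C++ illustration only):
+//     cacheA[0][0] += B[io + 0][jo + 0];  cacheA[0][1] += B[io + 0][jo + 1];
+//     cacheA[1][0] += B[io + 1][jo + 0];  cacheA[1][1] += B[io + 1][jo + 1];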
+// In this version, we need to pass in both the split indices and the global indices into the kernel since one matrix +// is a cache using the split indices, while the other is uncached and needs the global indices +// +// The difference with the previous test is that the kernel is unrolled and operates on a panel rather than individual indices +Scalar CachedMatrix_test5() +{ + const int N = 4; + const int cacheSize = 2; + auto A = MakeMatrix(N, N, "A"); + auto B = MakeMatrix(N, N, "B"); + auto expected = MakeMatrix(N, N); + auto cacheA = MakeMatrix(cacheSize, cacheSize, "cacheA"); + + // Initialize the 'A' matrix + // A: + // [ 0, -1, -2, -3 ] + // [ 1, 0, -1, -2 ] + // [ 2, 1, 0, -1 ] + // [ 3, 2, 1, 0 ] + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + A(i, j) = i - j; + }); + }); + + // Initialize the 'B' matrix + // B: + // [ 0, 1, 2, 3 ] + // [ 1, 2, 3, 4 ] + // [ 2, 3, 4, 5 ] + // [ 3, 4, 5, 6 ] + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + B(i, j) = i + j; + }); + }); + + // A + B: + // [ 0, 0, 0, 0 ] + // [ 2, 2, 2, 2 ] + // [ 4, 4, 4, 4 ] + // [ 6, 6, 6, 6 ] + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + expected(i, j) = i * 2; + }); + }); + + // Create a loop nest to iterate over A's and B's domains + Index i("i"), j("j"); + LoopNest loop({ { i, { 0, N } }, + { j, { 0, N } } }); + + // Split the loops into tiles the size of the cache + auto [iOuter, iInner] = loop.Split(i, cacheSize); + auto [jOuter, jInner] = loop.Split(j, cacheSize); + + // Add the code to initialize the cache with a tile of 'A' + auto initCacheKernel = Kernel("init") + .Inputs(A.GetValue(), cacheA.GetValue()) + .Indices(iOuter, jOuter) + .Define(copyToSmallCache); + + // We use "prologue" as the placement to tell the system to run this kernel before the inner tile loops + loop.AddKernel(initCacheKernel, LoopFragmentType::prologue); + + // Add the compute kernel, using the inner, tile-relative indices, and the cached matrix + auto kernel = Kernel("kernel") + .Inputs(cacheA.GetValue(), B.GetValue()) + .Indices(iOuter, jOuter, i, j) + .Define(addCachedMatrixToUnchachedMatrixUnrolled); + CodePositionConstraints constraints{ LoopFragmentType::body, { iOuter, jOuter }, {} }; + loop.AddKernel(kernel, constraints); + + // ...and the code to copy the processed data back from the kernel into 'A' + auto copybackKernel = Kernel("copyback") + .Inputs(A.GetValue(), cacheA.GetValue()) + .Indices(iOuter, jOuter) + .Define(copyFromSmallCache); + + // Here, we use "epilogue" as the placement to tell the system to run this kernel after the inner tile loops + loop.AddKernel(copybackKernel, LoopFragmentType::epilogue); + + CodeGenerator generator; + generator.Run(loop); + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + If( + VerifySame(A, expected) == 0, + [&] { + ok = 0; + }) + .Else([&] { + auto value = A.GetValue(); + value.SetLayout({ { (int)A.Size() } }); + DebugPrintVector(value); + DebugPrint("\n"); + }); + return ok; +} + +Scalar LoopNest_Parallelized_test1() +{ + Scalar ok = Allocate(ScalarLayout); + auto matrix = MakeMatrix(4, 5); + InvokeForContext([&] { + auto v = matrix.GetValue().Get().GetDataAs(); + v->setName("matrix"); + }); + loopnests::IndexRange i("i", { 0, 4 }), j("j", { 0, 5 }); + + auto kernel = loopnests::Kernel("kernel") + .Inputs(matrix.GetValue()) + .Indices(i.GetIndex(), j.GetIndex()) + .Define([](Matrix m, Scalar i, Scalar j) { + Scalar tid = GetTID(); +#if 0 // Useful to turn off/on for debugging + InvokeForContext([&](auto&) { + auto iInt = i.Get(); + 
auto jInt = j.Get(); + Log() << "m(" << iInt << ", " << jInt << ") = " << (iInt * 2 + jInt * 5) + << " [Thread " << tid.Get() << "]" + << EOL; + }); +#endif // 1 + m(i, j) = i * 2 + j * 5; + }); + + loopnests::LoopNest loop(std::vector{ i, j }); + loop.AddKernel(kernel, LoopNest::ConstraintType::predicate); + loop.Parallelize(i.GetIndex(), 2); + + loopnests::CodeGenerator generator; + generator.Run(loop); + + ok = matrix(2, 3) - 19; + return ok; // will return 0 if calculation is correct +} + +Scalar LoopNest_Parallelized_test2() +{ + Scalar ok = Allocate(ScalarLayout); + + auto matrix = MakeMatrix(4, 5); + loopnests::IndexRange i("i", { 0, 4 }), j("j", { 0, 5 }); + + auto kernel = loopnests::Kernel("kernel") + .Inputs(matrix.GetValue()) + .Indices(i.GetIndex(), j.GetIndex()) + .Define([](Matrix m, Scalar i, Scalar j) { + Scalar tid = GetTID(); +#if 0 // Useful to turn off/on for debugging + InvokeForContext([&](auto&) { + auto iInt = i.Get(); + auto jInt = j.Get(); + Log() << "m(" << iInt << ", " << jInt << ") = " << tid.Get() + << " [Thread " << tid.Get() << "]" << EOL; + }); +#endif // 1 + m(i, j) = tid; + }); + + loopnests::LoopNest loop(std::vector{ i, j }); + loop.AddKernel(kernel, LoopNest::ConstraintType::predicate); + loop.Parallelize(i.GetIndex(), 2); + + loopnests::CodeGenerator generator; + generator.Run(loop); + + auto expected = MakeMatrix(4, 5); + If( + VerifySame(matrix, expected) == 0, + [&] { + ok = 1; + }) + .Else([&] { + auto value = matrix.GetValue(); + value.SetLayout({ { (int)matrix.Size() } }); + DebugPrintVector(value); + DebugPrint("\n"); + }); + return ok; +} + +Scalar LoopNest_Unrolled_test1() +{ + auto matrix = MakeMatrix(20, 5); + IndexRange i("i", { 0, 20 }), j("j", { 0, 5 }); + + auto kernel = Kernel("kernel") + .Inputs(matrix.GetValue()) + .Indices(i.GetIndex(), j.GetIndex()) + .Define(loopnest_kernel); + + LoopNest loop(std::vector{ i, j }); + loop.AddKernel(kernel, LoopNest::ConstraintType::predicate); + loop.Unroll(j.GetIndex()); + + CodeGenerator generator; + generator.Run(loop); + + return matrix(2, 3) - 19; // will return 0 if calculation is correct +} + +Scalar LoopNest_DebugDump_test1() +{ + auto matrix = MakeMatrix(4, 5); + Index i("i"), j("j"); + LoopNest loop({ { i, { 0, 4 } }, + { j, { 0, 5 } } }); + + auto kernel = Kernel("kernel") + .Inputs(matrix.GetValue()) + .Indices(i, j) + .Define(loopnest_kernel); + + loop.AddKernel(kernel, LoopNest::ConstraintType::predicate); + loop.Split(i, 2); + loop.Unroll(j); + loop.SetLoopOrder({ i, j, i }); + + InvokeForContext([&](auto&) { + std::stringstream ss; + DebugDump(loop, "DebugDump test", &ss); + Log() << ss.str() << EOL; + // TODO: verify somehow that the printing worked + }); + + return 0; +} + +Scalar LoopNest_DebugDump_test2() +{ + const int N = 8; + auto matrix = MakeMatrix(N, N); + Index i("i"), j("j"); + LoopNest loop({ { i, { 0, N } }, + { j, { 0, N } } }); + + auto kernel = Kernel("kernel") + .Inputs(matrix.GetValue()) + .Indices(i, j) + .Define(loopnest_kernel); + + loop.AddKernel(kernel, LoopNest::ConstraintType::predicate); + SplitAndSetOrder(loop, { i, j }, { 4, 2 }, "ijij"); + + InvokeForContext([&](auto&) { + std::stringstream ss; + DebugDump(loop, "DebugDump test", &ss); + Log() << ss.str() << EOL; + // TODO: verify somehow that the printing worked + }); + + return 0; +} + +Scalar SimpleMatMult_test() +{ + const int N = 8; + auto A = MakeMatrix(N, N, "A"); + auto B = MakeMatrix(N, N, "B"); + auto C = MakeMatrix(N, N, "C"); + auto expected = MakeMatrix(N, N, "expected"); + + // 
initialize A, B, and C + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + A(i, j) = i - j; + B(i, j) = i + 2 * j; + C(i, j) = 100; + }); + }); + + // fill out expected with a simple for-loop gemm + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + ForRange(N, [&](Scalar k) { + expected(i, j) += A(i, k) * B(k, j); + }); + }); + }); + + // The input matrices: + // A: B: C: + // [ 0, -1, -2, -3, -4, -5, -6, -7] [ 0, 2, 4, 6, 8, 10, 12, 14] [ 100 100 100 ... ] + // [ 1, 0, -1, -2, -3, -4, -5, -6] [ 1, 3, 5, 7, 9, 11, 13, 15] [ 100 100 100 ... ] + // [ 2, 1, 0, -1, -2, -3, -4, -5] [ 2, 4, 6, 8, 10, 12, 14, 16] [ ... ] + // [ 3, 2, 1, 0, -1, -2, -3, -4] [ 3, 5, 7, 9, 11, 13, 15, 17] [ ... ] + // [ 4, 3, 2, 1, 0, -1, -2, -3] [ 4, 6, 8, 10, 12, 14, 16, 18] [ ... ] + // [ 5, 4, 3, 2, 1, 0, -1, -2] [ 5, 7, 9, 11, 13, 15, 17, 19] [ ... ] + // [ 6, 5, 4, 3, 2, 1, 0, -1] [ 6, 8, 10, 12, 14, 16, 18, 20] [ ... ] + // [ 7, 6, 5, 4, 3, 2, 1, 0] [ 7, 9, 11, 13, 15, 17, 19, 21] [ ... ] + + // (A * B) (the desired result): + // [-140, -196, -252, -308, -364, -420, -476, -532] + // [-112, -152, -192, -232, -272, -312, -352, -392] + // [ -84, -108, -132, -156, -180, -204, -228, -252] + // [ -56, -64, -72, -80, -88, -96, -104, -112] + // [ -28, -20, -12, -4, 4, 12, 20, 28] + // [ 0, 24, 48, 72, 96, 120, 144, 168] + // [ 28, 68, 108, 148, 188, 228, 268, 308] + // [ 56, 112, 168, 224, 280, 336, 392, 448] + + Index i("i"), j("j"), k("k"); + + LoopNest loop({ { i, { 0, N } }, + { j, { 0, N } }, + { k, { 0, N } } }); + + int cacheARows = 4; + int cacheACols = 4; + int resultCacheRows = 2; + int resultCacheCols = 2; + + auto [i_panel_outer, i_panel_inner] = loop.Split(i, cacheARows); + auto [k_panel_outer, k_panel_inner] = loop.Split(k, cacheACols); + auto [i_kernel_outer, i_kernel_inner] = loop.Split(i, resultCacheRows); + auto [j_kernel_outer, j_kernel_inner] = loop.Split(j, resultCacheCols); + + auto initCKernel = Kernel("init") + .Inputs(C.GetValue()) + .Indices(i, j) + .Define([](Matrix C, Scalar i, Scalar j) { + C(i, j) = 0; + }); + + auto innerKernel = Kernel("matmul") + .Inputs(A.GetValue(), B.GetValue(), C.GetValue()) + .Indices(i, j, k) + .Define([](Matrix A, Matrix B, Matrix C, Scalar i, Scalar j, Scalar k) { + C(i, j) += A(i, k) * B(k, j); + }); + + loopnests::CodePositionConstraints initConstraints{ loopnests::LoopFragmentType::prologue, { i, j }, { k } }; + loop.AddKernel(initCKernel, initConstraints); + loop.AddKernel(innerKernel, LoopNest::ConstraintType::constraint); + + loop.SetLoopOrder({ k, i, j, i, k, j, i }); + + CodeGenerator generator; + generator.Run(loop); + +#if 0 + PrintLoops(loop, "SimpleMatMult_test"); +#endif + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + If( + VerifySame(C, expected) == 0, + [&] { + ok = 0; + }) + .Else([&] { + auto value = C.GetValue(); + value.SetLayout({ { (int)C.Size() } }); + DebugPrintVector(value); + DebugPrint("\n"); + auto expectedValue = expected.GetValue(); + expectedValue.SetLayout({ { (int)expected.Size() } }); + DebugPrintVector(expectedValue); + DebugPrint("\n"); + }); + return ok; +} + +Scalar GotoBLASGemm_LowLevelAPI() +{ + const int N = 8; + auto A = MakeMatrix(N, N, "A"); + auto B = MakeMatrix(N, N, "B"); + auto C = MakeMatrix(N, N, "C"); + auto expected = MakeMatrix(N, N, "expected"); + + // initialize A, B, and C + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + A(i, j) = i - j; + B(i, j) = i + 2 * j; + }); + }); + + // fill out expected with a simple for-loop gemm + ForRange(N, [&](Scalar i) { + ForRange(N, 
[&](Scalar j) { + ForRange(N, [&](Scalar k) { + expected(i, j) += A(i, k) * B(k, j); + }); + }); + }); + + // The input matrices: + // A: B: C: + // [ 0, -1, -2, -3, -4, -5, -6, -7] [ 0, 2, 4, 6, 8, 10, 12, 14] [ 0 0 0 ... ] + // [ 1, 0, -1, -2, -3, -4, -5, -6] [ 1, 3, 5, 7, 9, 11, 13, 15] [ 0 0 0 ... ] + // [ 2, 1, 0, -1, -2, -3, -4, -5] [ 2, 4, 6, 8, 10, 12, 14, 16] [ ... ] + // [ 3, 2, 1, 0, -1, -2, -3, -4] [ 3, 5, 7, 9, 11, 13, 15, 17] [ ... ] + // [ 4, 3, 2, 1, 0, -1, -2, -3] [ 4, 6, 8, 10, 12, 14, 16, 18] [ ... ] + // [ 5, 4, 3, 2, 1, 0, -1, -2] [ 5, 7, 9, 11, 13, 15, 17, 19] [ ... ] + // [ 6, 5, 4, 3, 2, 1, 0, -1] [ 6, 8, 10, 12, 14, 16, 18, 20] [ ... ] + // [ 7, 6, 5, 4, 3, 2, 1, 0] [ 7, 9, 11, 13, 15, 17, 19, 21] [ ... ] + + // (A * B) (the desired result): + // [-140, -196, -252, -308, -364, -420, -476, -532] + // [-112, -152, -192, -232, -272, -312, -352, -392] + // [ -84, -108, -132, -156, -180, -204, -228, -252] + // [ -56, -64, -72, -80, -88, -96, -104, -112] + // [ -28, -20, -12, -4, 4, 12, 20, 28] + // [ 0, 24, 48, 72, 96, 120, 144, 168] + // [ 28, 68, 108, 148, 188, 228, 268, 308] + // [ 56, 112, 168, 224, 280, 336, 392, 448] + + Index i("i"), j("j"), k("k"); + + LoopNest loop({ { i, { 0, N } }, + { j, { 0, N } }, + { k, { 0, N } } }); + + int cacheARows = 4; + int cacheACols = 4; + int cacheBRows = cacheACols; + int cacheBCols = N; + int resultCacheRows = 2; + int resultCacheCols = 2; + + auto [i_panel_outer, i_panel_inner] = loop.Split(i, cacheARows); + auto [k_panel_outer, k_panel_inner] = loop.Split(k, cacheACols); + auto [i_kernel_outer, i_kernel_inner] = loop.Split(i, resultCacheRows); + auto [j_kernel_outer, j_kernel_inner] = loop.Split(j, resultCacheCols); + + auto cachedResult = MakeMatrix(resultCacheRows, resultCacheCols, "cachedResult"); + + auto cacheA = MakeMatrix(cacheARows, cacheACols, "cacheA"); + auto transposeCacheB = MakeMatrix(cacheBCols, cacheBRows, "transposeCacheB"); + + auto cacheAKernel = Kernel("cacheAKernel") + .Inputs(A.GetValue(), cacheA.GetValue()) + .Indices(i_panel_outer, k_panel_outer) + .Define([&](Matrix A, Matrix cache, Scalar iPanel, Scalar kPanel) { + for (int i = 0; i < cacheARows; ++i) + { + for (int k = 0; k < cacheACols; ++k) + { + cache(i, k) = A(iPanel + i, kPanel + k); + } + } + }); + + auto transposeCacheBKernel = Kernel("transposeCacheBKernel") + .Inputs(B.GetValue(), transposeCacheB.GetValue()) + .Indices(k_panel_outer) + .Define([&](Matrix B, Matrix transposeCache, Scalar kPanel) { + for (int k = 0; k < cacheBRows; ++k) + { + for (int j = 0; j < cacheBCols; ++j) + { + transposeCache(j, k) = B(kPanel + k, j); + } + } + }); + + auto innerKernel = Kernel("matmul") + .Inputs(cacheA.GetValue(), transposeCacheB.GetValue(), cachedResult.GetValue()) + .Indices(i, j, k, i_kernel_inner, j_kernel_inner, i_kernel_outer, k_panel_inner) + .Define([](Matrix A, Matrix B, Matrix C, Scalar i, Scalar j, Scalar k, Scalar iInner, Scalar jInner, Scalar iOuter, Scalar kPanelInner) { + C(iInner, jInner) += A(iOuter + iInner, kPanelInner) * B(j, kPanelInner); + }); + + auto clearCacheKernel = Kernel("clearCacheKernel") + .Inputs(cachedResult.GetValue()) + .Indices(i_kernel_outer, j_kernel_outer) + .Define([&](Matrix cache, Scalar iOuter, Scalar jOuter) { + Scalar zeroValue = Allocate(utilities::ScalarLayout); + for (int i = 0; i < resultCacheRows; ++i) + { + for (int j = 0; j < resultCacheCols; ++j) + { + cache(i, j) = zeroValue; + } + } + }); + + auto uncacheKernel = Kernel("uncacheKernel") + .Inputs(C.GetValue(), cachedResult.GetValue()) + 
.Indices(i_panel_outer, i_kernel_outer, j_kernel_outer) + .Define([&](Matrix C, Matrix cache, Scalar iPanelOuter, Scalar iOuter, Scalar jOuter) { + for (int i = 0; i < resultCacheRows; ++i) + { + for (int j = 0; j < resultCacheCols; ++j) + { + C(iPanelOuter + iOuter + i, jOuter + j) += cache(i, j); + } + } + }); + + CodePositionConstraints cacheAConstraint{ LoopFragmentType::prologue, { i_panel_outer, k_panel_outer }, {} }; + loop.AddKernel(cacheAKernel, cacheAConstraint); + + CodePositionConstraints cacheBConstraint{ LoopFragmentType::prologue, { k_panel_outer }, {} }; + loop.AddKernel(transposeCacheBKernel, cacheBConstraint); + + CodePositionConstraints constraint{ LoopFragmentType::body, { k, i_kernel_inner, j_kernel_inner }, {} }; + loop.AddKernel(innerKernel, constraint); + + CodePositionConstraints preConstraint{ LoopFragmentType::prologue, { i_kernel_outer, j_kernel_outer }, {} }; + loop.AddKernel(clearCacheKernel, preConstraint); + + CodePositionConstraints postConstraint{ LoopFragmentType::epilogue, { i_kernel_outer, j_kernel_outer }, {} }; + loop.AddKernel(uncacheKernel, postConstraint); + loop.SetLoopOrder({ k, i, j, i, k, j, i }); + + loop.Unroll(i_kernel_inner); + loop.Unroll(j_kernel_inner); + CodeGenerator generator; + generator.Run(loop); + +#if 0 + PrintLoops(loop, "GotoBLASGemm_LowLevelAPI"); +#endif + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + If( + VerifySame(C, expected) == 0, + [&] { + ok = 0; + }) + .Else([&] { + auto value = C.GetValue(); + value.SetLayout({ { (int)C.Size() } }); + DebugPrintVector(value); + DebugPrint("\n"); + auto expectedValue = expected.GetValue(); + expectedValue.SetLayout({ { (int)expected.Size() } }); + DebugPrintVector(expectedValue); + DebugPrint("\n"); + }); + return ok; +} + +Scalar GotoBLASGemmWithRefDeref() +{ + const int N = 8; + auto A = MakeMatrix(N, N, "A"); + auto B = MakeMatrix(N, N, "B"); + auto C = MakeMatrix(N, N, "C"); + auto expected = MakeMatrix(N, N, "expected"); + + // initialize A, B, and C + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + A(i, j) = i - j; + B(i, j) = i + 2 * j; + }); + }); + + // fill out expected with a simple for-loop gemm + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + ForRange(N, [&](Scalar k) { + expected(i, j) += A(i, k) * B(k, j); + }); + }); + }); + + // The input matrices: + // A: B: C: + // [ 0, -1, -2, -3, -4, -5, -6, -7] [ 0, 2, 4, 6, 8, 10, 12, 14] [ 100 100 100 ... ] + // [ 1, 0, -1, -2, -3, -4, -5, -6] [ 1, 3, 5, 7, 9, 11, 13, 15] [ 100 100 100 ... ] + // [ 2, 1, 0, -1, -2, -3, -4, -5] [ 2, 4, 6, 8, 10, 12, 14, 16] [ ... ] + // [ 3, 2, 1, 0, -1, -2, -3, -4] [ 3, 5, 7, 9, 11, 13, 15, 17] [ ... ] + // [ 4, 3, 2, 1, 0, -1, -2, -3] [ 4, 6, 8, 10, 12, 14, 16, 18] [ ... ] + // [ 5, 4, 3, 2, 1, 0, -1, -2] [ 5, 7, 9, 11, 13, 15, 17, 19] [ ... ] + // [ 6, 5, 4, 3, 2, 1, 0, -1] [ 6, 8, 10, 12, 14, 16, 18, 20] [ ... ] + // [ 7, 6, 5, 4, 3, 2, 1, 0] [ 7, 9, 11, 13, 15, 17, 19, 21] [ ... 
]
+
+    // (A * B) (the desired result):
+    // [-140, -196, -252, -308, -364, -420, -476, -532]
+    // [-112, -152, -192, -232, -272, -312, -352, -392]
+    // [ -84, -108, -132, -156, -180, -204, -228, -252]
+    // [ -56, -64, -72, -80, -88, -96, -104, -112]
+    // [ -28, -20, -12, -4, 4, 12, 20, 28]
+    // [ 0, 24, 48, 72, 96, 120, 144, 168]
+    // [ 28, 68, 108, 148, 188, 228, 268, 308]
+    // [ 56, 112, 168, 224, 280, 336, 392, 448]
+
+    Index i("i"), j("j"), k("k");
+
+    loopnests::LoopNest loop({ { i, { 0, N } },
+                               { j, { 0, N } },
+                               { k, { 0, N } } });
+
+    int cacheARows = 4;
+    int cacheACols = 4;
+    int cacheBRows = cacheACols;
+    int cacheBCols = N;
+    int resultCacheRows = 2;
+    int resultCacheCols = 2;
+
+    auto [i_panel_outer, i_panel_inner] = loop.Split(i, cacheARows);
+    auto [k_panel_outer, k_panel_inner] = loop.Split(k, cacheACols);
+    auto [i_kernel_outer, i_kernel_inner] = loop.Split(i, resultCacheRows);
+    auto [j_kernel_outer, j_kernel_inner] = loop.Split(j, resultCacheCols);
+
+    auto cachedResult = MakeMatrix(resultCacheRows, resultCacheCols, "cachedResult");
+
+    auto cacheA = MakeMatrix(cacheARows, cacheACols, "cacheA");
+    auto cacheARef = cacheA.GetValue().Reference();
+
+    auto transposeCacheB = MakeMatrix(cacheBCols, cacheBRows, "transposeCacheB");
+
+    auto cacheAKernel = loopnests::Kernel("cacheAKernel")
+                            .Inputs(A.GetValue(), cacheARef)
+                            .Indices(i_panel_outer, k_panel_outer)
+                            .Define([cacheARows, cacheACols](Matrix A, Value cacheRef, Scalar iPanel, Scalar kPanel) {
+                                auto cache = Matrix(cacheRef.Dereference());
+                                for (int i = 0; i < cacheARows; ++i)
+                                {
+                                    for (int k = 0; k < cacheACols; ++k)
+                                    {
+                                        cache(i, k) = A(iPanel + i, kPanel + k);
+                                    }
+                                }
+                                // Update cacheRef so that the global (i, k) indices index into the correct spot in the cache
+                                auto cacheTmp = cacheRef.Dereference();
+                                cacheTmp.SetLayout(MemoryShape{ cacheARows, cacheACols });
+                                auto cacheTmpOffset = cacheTmp.Offset({ -1 * iPanel, -1 * kPanel });
+                                cacheTmpOffset.SetLayout(MemoryShape{ cacheARows, cacheACols });
+                                cacheRef = cacheTmpOffset.Reference();
+                            });
+
+    auto resetCacheAKernel = loopnests::Kernel("resetCacheAKernel")
+                                 .Inputs(cacheARef)
+                                 .Indices(i_panel_outer, k_panel_outer)
+                                 .Define([cacheARows, cacheACols](Value cacheRef, Scalar iPanel, Scalar kPanel) {
+                                     // Reset cacheRef to point to the cache while we have iPanel and kPanel in hand
+                                     auto offsetCache = cacheRef.Dereference();
+                                     offsetCache.SetLayout(MemoryShape{ cacheARows, cacheACols });
+                                     auto realCache = offsetCache.Offset({ iPanel, kPanel });
+                                     realCache.SetLayout(MemoryShape{ cacheARows, cacheACols });
+                                     cacheRef = realCache.Reference();
+                                 });
+
+    auto transposeCacheBKernel = loopnests::Kernel("transposeCacheBKernel")
+                                     .Inputs(B.GetValue(), transposeCacheB.GetValue())
+                                     .Indices(k_panel_outer)
+                                     .Define([cacheBRows, cacheBCols](Matrix B, Matrix transposeCache, Scalar kPanel) {
+                                         for (int k = 0; k < cacheBRows; ++k)
+                                         {
+                                             for (int j = 0; j < cacheBCols; ++j)
+                                             {
+                                                 transposeCache(j, k) = B(kPanel + k, j);
+                                             }
+                                         }
+                                     });
+
+    auto innerKernel = loopnests::Kernel("matmul")
+                           .Inputs(cacheARef, transposeCacheB.GetValue(), cachedResult.GetValue())
+                           .Indices(i, j, k, i_kernel_inner, j_kernel_inner, i_kernel_outer, k_panel_inner, i_panel_outer, k_panel_outer)
+                           .Define([cacheARows, cacheACols](Value Aref, Matrix B, Matrix C, Scalar i, Scalar j, Scalar k, Scalar iInner, Scalar jInner, Scalar iOuter, Scalar kPanelInner, Scalar iPanel, Scalar kPanel) {
+                               auto offsetA = Aref.Dereference();
+                               offsetA.SetLayout(MemoryShape{ cacheARows, cacheACols });
+                               auto A = Matrix(offsetA);
C(iInner, jInner) += A(i, k) * B(j, kPanelInner); + }); + + auto clearCacheKernel = loopnests::Kernel("clearCacheKernel") + .Inputs(cachedResult.GetValue()) + .Indices(i_kernel_outer, j_kernel_outer) + .Define([resultCacheRows, resultCacheCols](Matrix cache, Scalar iOuter, Scalar jOuter) { + Scalar zeroValue = Allocate(utilities::ScalarLayout); + for (int i = 0; i < resultCacheRows; ++i) + { + for (int j = 0; j < resultCacheCols; ++j) + { + cache(i, j) = zeroValue; + } + } + }); + + auto uncacheKernel = loopnests::Kernel("uncacheKernel") + .Inputs(C.GetValue(), cachedResult.GetValue()) + .Indices(i_panel_outer, i_kernel_outer, j_kernel_outer) + .Define([resultCacheRows, resultCacheCols](Matrix C, Matrix cache, Scalar iPanelOuter, Scalar iOuter, Scalar jOuter) { + for (int i = 0; i < resultCacheRows; ++i) + { + for (int j = 0; j < resultCacheCols; ++j) + { + C(iPanelOuter + iOuter + i, jOuter + j) += cache(i, j); + } + } + }); + + loopnests::CodePositionConstraints cacheAConstraint{ loopnests::LoopFragmentType::prologue, { i_panel_outer, k_panel_outer }, {} }; + loop.AddKernel(cacheAKernel, cacheAConstraint); + + loopnests::CodePositionConstraints resetCacheAConstraint{ loopnests::LoopFragmentType::epilogue, { i_panel_outer, k_panel_outer }, {} }; + loop.AddKernel(resetCacheAKernel, resetCacheAConstraint); + + loopnests::CodePositionConstraints cacheBConstraint{ loopnests::LoopFragmentType::prologue, { k_panel_outer }, {} }; + loop.AddKernel(transposeCacheBKernel, cacheBConstraint); + + loopnests::CodePositionConstraints constraint{ loopnests::LoopFragmentType::body, { k, i_kernel_inner, j_kernel_inner }, {} }; + loop.AddKernel(innerKernel, constraint); + + loopnests::CodePositionConstraints preConstraint{ loopnests::LoopFragmentType::prologue, { i_kernel_outer, j_kernel_outer }, {} }; + loop.AddKernel(clearCacheKernel, preConstraint); + + loopnests::CodePositionConstraints postConstraint{ loopnests::LoopFragmentType::epilogue, { i_kernel_outer, j_kernel_outer }, {} }; + loop.AddKernel(uncacheKernel, postConstraint); + loop.SetLoopOrder({ k, i, j, i, k, j, i }); + + loop.Unroll(i_kernel_inner); + loop.Unroll(j_kernel_inner); + loopnests::CodeGenerator generator; + generator.Run(loop); +#if 0 + PrintLoops(loop, "GotoBLASGemmWithRefDeref"); +#endif + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + If( + VerifySame(C, expected) == 0, + [&] { + ok = 0; + }) + .Else([&] { + auto value = C.GetValue(); + value.SetLayout({ { (int)C.Size() } }); + DebugPrintVector(value); + DebugPrint("\n"); + auto expectedValue = expected.GetValue(); + expectedValue.SetLayout({ { (int)expected.Size() } }); + DebugPrintVector(expectedValue); + DebugPrint("\n"); + }); + return ok; +} + +Scalar YG12LowLevel_TestBoundary() +{ + const int N = 8; + auto A = MakeMatrix(N, N, "A"); + auto B = MakeMatrix(N, N, "B"); + auto C = MakeMatrix(N, N, "C"); + auto expected = MakeMatrix(N, N, "expected"); + + // initialize A, B, and C + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + A(i, j) = i - j; + B(i, j) = i + 2 * j; + }); + }); + + // fill out expected with a simple for-loop gemm + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + ForRange(N, [&](Scalar k) { + expected(i, j) += A(i, k) * B(k, j); + }); + }); + }); + + static const int k_r = 3; + static const int k_c = 4; + + // Cache B into a columnMajor matrix + auto transposeBVal = MakeMatrix(N, N, "transposeB"); + auto transposeB = Matrix(transposeBVal); + + Index transposeK("transposeK"), transposeN("transposeN"); + LoopNest transposeLoop({ { 
transposeK, { 0, N } }, + { transposeN, { 0, N } } }); + static const int transposeRows = N; + static const int transposeCols = N; + + auto [k_transpose_outer, k_transpose_inner] = transposeLoop.Split(transposeK, transposeRows); + auto [n_transpose_outer, n_transpose_inner] = transposeLoop.Split(transposeN, transposeCols); + + auto transposeKernel = Kernel("transpose_kernel") + .Inputs(B.GetValue(), transposeB.GetValue()) + .Indices(transposeK, transposeN) + .Define([](Matrix input, Matrix output, Scalar row, Scalar col) { + output(col, row) = input(row, col); + }); + transposeLoop.Unroll(n_transpose_inner); + transposeLoop.SetLoopOrder({ transposeK, transposeN, transposeK, transposeN }); + loopnests::CodePositionConstraints transposeConstraints{ loopnests::LoopFragmentType::body, { k_transpose_inner, n_transpose_inner }, {} }; + transposeLoop.AddKernel(transposeKernel, loopnests::LoopFragmentType::body); + + loopnests::CodeGenerator transposeGenerator; + transposeGenerator.Run(transposeLoop); + + // Do computation in blocks of k_r x k_c + { + Matrix temp = MakeMatrix(k_r, k_c); + + loopnests::Index m("m"), n("n"), k("k"); + loopnests::LoopNest loop({ { m, { 0, N - (N % k_r) } }, + { n, { 0, N } }, + { k, { 0, N } } }); + + auto [n_block_outer, n_block_inner] = loop.Split(n, 4); + + auto [m_outer, m_inner] = loop.Split(m, k_r); + auto [n_outer, n_inner] = loop.Split(n, k_c); + + auto kernel = loopnests::Kernel("MatrixMatrixMultiplyNode_Kernel") + .Inputs(A.GetValue(), transposeB.GetValue(), C.GetValue(), temp.GetValue()) + .Indices(m, n, m_inner, n_inner, k) + .Define([](Matrix A, Matrix B, Matrix C, Matrix temp, Scalar i, Scalar j, Scalar i_inner, Scalar j_inner, Scalar k) { + temp(i_inner, j_inner) += A(i, k) * B(j, k); + }); + auto kernel2 = loopnests::Kernel("MatrixMatrixMultiplyNode_Reduce") + .Inputs(C.GetValue(), temp.GetValue()) + .Indices(m_outer, n_outer, n_block_outer) + .Define([](Matrix C, Matrix temp, Scalar i, Scalar j, Scalar j_outer) { + For(temp, [&](Scalar i_inner, Scalar j_inner) { + C(i + i_inner, j_outer + j + j_inner) = temp(i_inner, j_inner); + }); + }); + auto kernel3 = loopnests::Kernel("MatrixMatrixMultiplyNode_InitializeCache") + .Inputs(temp.GetValue()) + .Indices(m_outer, n_outer) + .Define([](Matrix temp, Scalar i, Scalar j) { + For(temp, [&](Scalar i_inner, Scalar j_inner) { + temp(i_inner, j_inner) = static_cast(0); + }); + }); + loop.Unroll(m_inner); + loop.Unroll(n_inner); + + loop.Unroll(n_outer); + + loop.SetLoopOrder({ n, m, n, k, n, m }); + loopnests::CodePositionConstraints constraints2{ loopnests::LoopFragmentType::epilogue, { m_outer, n_outer }, {} }; + loopnests::CodePositionConstraints constraints3{ loopnests::LoopFragmentType::prologue, { m_outer, n_outer }, {} }; + + loop.AddKernel(kernel3, constraints3); + loop.AddKernel(kernel, LoopNest::ConstraintType::constraint); + loop.AddKernel(kernel2, constraints2); + loopnests::CodeGenerator generator; +#if 0 + PrintLoops(loop, "YG12_Boundary_test_first"); +#endif + generator.Run(loop); + } + + // Do remainder + { + auto remainderRows = N % k_r; + auto startM = N - remainderRows; + Matrix temp = MakeMatrix(remainderRows, k_c); + + loopnests::Index m("m"), n("n"), k("k"); + loopnests::LoopNest loop({ { m, { startM, N } }, + { n, { 0, N } }, + { k, { 0, N } } }); + + auto [n_block_outer, n_block_inner] = loop.Split(n, 4); + auto [m_outer, m_inner] = loop.Split(m, remainderRows); + auto [n_outer, n_inner] = loop.Split(n, k_c); + + auto kernel = 
loopnests::Kernel("MatrixMatrixMultiplyNode_Kernel_remainder") + .Inputs(A.GetValue(), transposeB.GetValue(), C.GetValue(), temp.GetValue()) + .Indices(m, n, m_inner, n_inner, k) + .Define([](Matrix A, Matrix B, Matrix C, Matrix temp, Scalar i, Scalar j, Scalar i_inner, Scalar j_inner, Scalar k) { + temp(i_inner, j_inner) += A(i, k) * B(j, k); + }); + auto kernel2 = loopnests::Kernel("MatrixMatrixMultiplyNode_Reduce_remainder") + .Inputs(C.GetValue(), temp.GetValue()) + .Indices(m, m_outer, n_outer, n_block_outer) + .Define([](Matrix C, Matrix temp, Scalar i, Scalar i_outer, Scalar j_outer, Scalar j_block_outer) { + For(temp, [&](Scalar i_inner, Scalar j_inner) { + C(i + i_inner, j_block_outer + j_outer + j_inner) = temp(i_inner, j_inner); + }); + }); + auto kernel3 = loopnests::Kernel("MatrixMatrixMultiplyNode_InitializeCache") + .Inputs(temp.GetValue()) + .Indices() + .Define([](Matrix temp) { + For(temp, [&](Scalar i_inner, Scalar j_inner) { + temp(i_inner, j_inner) = static_cast(0); + }); + }); + loop.Unroll(n_inner); + loop.Unroll(n_outer); + + loop.SetLoopOrder({ n, m, n, k, n, m }); + loopnests::CodePositionConstraints constraints2{ loopnests::LoopFragmentType::epilogue, { n_outer, m_outer }, {} }; + loopnests::CodePositionConstraints constraints3{ loopnests::LoopFragmentType::prologue, { n_outer, m_outer }, {} }; + loop.AddKernel(kernel3, constraints3); + loop.AddKernel(kernel, LoopNest::ConstraintType::constraint); + loop.AddKernel(kernel2, constraints2); + loopnests::CodeGenerator generator; +#if 0 + PrintLoops(loop, "YG12_Boundary_test_remainder"); +#endif + generator.Run(loop); + } + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + If( + VerifySame(C, expected) == 0, + [&] { + ok = 0; + }) + .Else([&] { + auto value = C.GetValue(); + value.SetLayout({ { (int)C.Size() } }); + DebugPrintVector(value); + DebugPrint("\n"); + auto expectedValue = expected.GetValue(); + expectedValue.SetLayout({ { (int)expected.Size() } }); + DebugPrintVector(expectedValue); + DebugPrint("\n"); + }); + return ok; +} + +Scalar KernelPredicate_test() +{ + const int M = 8; + const int N = M; + const int K = M; + auto A = MakeMatrix(M, K, "A"); + auto B = MakeMatrix(K, N, "B"); + auto C = MakeMatrix(M, N, "C"); + auto expected = MakeMatrix(M, N, "expected"); + + // initialize A, B, and C + ForRange(M, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + A(i, j) = i - j; + B(i, j) = i + 2 * j; + C(i, j) = 100; + }); + }); + + // fill out expected with a simple for-loop gemm (plus 1) + ForRange(M, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + ForRange(K, [&](Scalar k) { + expected(i, j) += A(i, k) * B(k, j); + }); + expected(i, j) += 1; + }); + }); + + // The input matrices: + // A: B: C: + // [ 0, -1, -2, -3, -4, -5, -6, -7] [ 0, 2, 4, 6, 8, 10, 12, 14] [ 100 100 100 ... ] + // [ 1, 0, -1, -2, -3, -4, -5, -6] [ 1, 3, 5, 7, 9, 11, 13, 15] [ 100 100 100 ... ] + // [ 2, 1, 0, -1, -2, -3, -4, -5] [ 2, 4, 6, 8, 10, 12, 14, 16] [ ... ] + // [ 3, 2, 1, 0, -1, -2, -3, -4] [ 3, 5, 7, 9, 11, 13, 15, 17] [ ... ] + // [ 4, 3, 2, 1, 0, -1, -2, -3] [ 4, 6, 8, 10, 12, 14, 16, 18] [ ... ] + // [ 5, 4, 3, 2, 1, 0, -1, -2] [ 5, 7, 9, 11, 13, 15, 17, 19] [ ... ] + // [ 6, 5, 4, 3, 2, 1, 0, -1] [ 6, 8, 10, 12, 14, 16, 18, 20] [ ... ] + // [ 7, 6, 5, 4, 3, 2, 1, 0] [ 7, 9, 11, 13, 15, 17, 19, 21] [ ... 
] + + // (A * B) (the desired result): + // [-140, -196, -252, -308, -364, -420, -476, -532] + // [-112, -152, -192, -232, -272, -312, -352, -392] + // [ -84, -108, -132, -156, -180, -204, -228, -252] + // [ -56, -64, -72, -80, -88, -96, -104, -112] + // [ -28, -20, -12, -4, 4, 12, 20, 28] + // [ 0, 24, 48, 72, 96, 120, 144, 168] + // [ 28, 68, 108, 148, 188, 228, 268, 308] + // [ 56, 112, 168, 224, 280, 336, 392, 448] + + Index i("i"), j("j"), k("k"); + + LoopNest loop({ { i, { 0, N } }, + { j, { 0, N } }, + { k, { 0, N } } }); + + auto initCKernel = Kernel("init") + .Inputs(C.GetValue()) + .Indices(i, j, k) + .Define([](Matrix C, Scalar i, Scalar j, Scalar k) { + C(i, j) = 0; + }); + + auto innerKernel = Kernel("matmul") + .Inputs(A.GetValue(), B.GetValue(), C.GetValue()) + .Indices(i, j, k) + .Define([](Matrix A, Matrix B, Matrix C, Scalar i, Scalar j, Scalar k) { + C(i, j) += A(i, k) * B(k, j); + }); + + auto postKernel = Kernel("addone") + .Inputs(A.GetValue(), B.GetValue(), C.GetValue()) + .Indices(i, j, k) + .Define([](Matrix A, Matrix B, Matrix C, Scalar i, Scalar j, Scalar k) { + C(i, j) += 1; + }); + + loop.AddKernel(initCKernel, { First(k) }); + loop.AddKernel(innerKernel, LoopNest::ConstraintType::predicate); + loop.AddKernel(postKernel, { Last(k) }); + +#if 1 + auto [i_panel_outer, i_panel_inner] = loop.Split(i, 2); + auto [j_kernel_outer, j_kernel_inner] = loop.Split(j, 4); + auto [k_panel_outer, k_panel_inner] = loop.Split(k, 2); + + loop.SetLoopOrder({ k, j, i, j, i, k }); +#endif + +#if 0 + PrintLoops(loop, "KernelPredicate_test"); +#endif + + CodeGenerator generator; + generator.Run(loop); + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + + If( + VerifySame(C, expected) == 0, + [&] { + ok = 0; + }) + .Else([&] { +#if 0 + auto value = C.GetValue(); + value.SetLayout({ { (int)C.Size() } }); + DebugPrintVector(value); + DebugPrint("\n"); + auto expectedValue = expected.GetValue(); + expectedValue.SetLayout({ { (int)expected.Size() } }); + DebugPrintVector(expectedValue); + DebugPrint("\n"); +#endif + }); + + return ok; +} + +Scalar MatMul3_test1() +{ + auto p = GetMatMul3TestCaseParameters(8, 8, 8, 8); + const auto M = p.M; + const auto N = p.N; + const auto K = p.K; + const auto L = p.L; + + Index i("i"), j("j"), k("k"), l("l"); + LoopNest loop({ { i, { 0, M } }, + { j, { 0, N } }, + { k, { 0, K } }, + { l, { 0, L } } }); + + auto initCKernel = Kernel("initC") + .Inputs(p.C.GetValue()) + .Indices(i, j) + .Define([](Matrix C, Scalar i, Scalar j) { + C(i, j) = 0; + }); + + auto computeCKernel = Kernel("matmulC") + .Inputs(p.A.GetValue(), p.B.GetValue(), p.C.GetValue()) + .Indices(i, j, k) + .Define([](Matrix A, Matrix B, Matrix C, Scalar i, Scalar j, Scalar k) { + C(i, j) += A(i, k) * B(k, j); + }); + + auto initEKernel = Kernel("initE") + .Inputs(p.E.GetValue()) + .Indices(i, l) + .Define([](Matrix E, Scalar i, Scalar l) { + E(i, l) = 0; + }); + + auto computeEKernel = Kernel("matmulE") + .Inputs(p.C.GetValue(), p.D.GetValue(), p.E.GetValue()) + .Indices(i, j, l) + .Define([](Matrix C, Matrix D, Matrix E, Scalar i, Scalar j, Scalar l) { + E(i, l) += C(i, j) * D(j, l); + }); + + loop.AddKernel(initCKernel, { First(k) && First(l) }); + loop.AddKernel(computeCKernel, { First(l) }); + + loop.AddKernel(initEKernel, { Last(k) && First(j) }); + loop.AddKernel(computeEKernel, { Last(k) }); + +#if 0 + PrintLoops(loop, "MatMul3_test1"); +#endif + + CodeGenerator generator; + generator.Run(loop); + + return VerifySame(p.E, p.expectedE); +} + +Scalar MatMul3_test2() +{ + auto p = 
GetMatMul3TestCaseParameters(8, 8, 8, 8); + const auto M = p.M; + const auto N = p.N; + const auto K = p.K; + const auto L = p.L; + + Index i("i"), j("j"), k("k"), l("l"); + LoopNest loop({ { i, { 0, M } }, + { j, { 0, N } }, + { k, { 0, K } }, + { l, { 0, L } } }); + + int stepI = 4; + int stepJ = 4; + // int stepK = 4; + auto [iOuter, iInner] = loop.Split(i, stepI); + auto [jOuter, jInner] = loop.Split(j, stepJ); + // auto [kOuter, kInner] = loop.Split(k, stepK); + + loop.SetLoopOrder({ iOuter, jOuter, k, l, iInner, jInner }); + + auto initCKernel = Kernel("initC") + .Inputs(p.C.GetValue()) + .Indices(i, j) + .Define([&](Matrix C, Scalar i, Scalar j) { + ForRange(stepI, [&](Scalar ii) { + ForRange(stepJ, [&](Scalar jj) { + C(i + ii, j + jj) = 0; + }); + }); + }); + + auto computeCKernel = Kernel("matmulC") + .Inputs(p.A.GetValue(), p.B.GetValue(), p.C.GetValue()) + .Indices(i, j, k) + .Define([&](Matrix A, Matrix B, Matrix C, Scalar i, Scalar j, Scalar k) { + // accum into C(I,J) via GEMM + ForRange(stepI, [&](Scalar ii) { + ForRange(stepJ, [&](Scalar jj) { + C(i + ii, j + jj) += A(i + ii, k) * B(k, j + jj); + }); + }); + }); + + auto initEKernel = Kernel("initE") + .Inputs(p.E.GetValue()) + .Indices(i, l) + .Define([&](Matrix E, Scalar i, Scalar l) { + ForRange(stepI, [&](Scalar ii) { + E(i + ii, l) = 0; + }); + }); + + auto computeEKernel = Kernel("matmulE") + .Inputs(p.C.GetValue(), p.D.GetValue(), p.E.GetValue()) + .Indices(i, j, l) + .Define([&](Matrix C, Matrix D, Matrix E, Scalar i, Scalar j, Scalar l) { + ForRange(stepI, [&](Scalar ii) { + ForRange(stepJ, [&](Scalar jj) { + // accum into E(I,L) via GEMM + E(i + ii, l) += C(i + ii, j + jj) * D(j + jj, l); + }); + }); + }); + + loop.AddKernel(initCKernel, { First(k) && First(l) }, { Before(iInner) || Before(jInner) }); + loop.AddKernel(computeCKernel, { First(l) }, { Before(iInner) || Before(jInner) }); + + loop.AddKernel(initEKernel, { Last(k) && First(j) }, { Before(iInner) || Before(jInner) }); + loop.AddKernel(computeEKernel, { Last(k) }, { Before(iInner) || Before(jInner) }); + +#if 0 + PrintLoops(loop, "MatMul3_test2"); +#endif + + CodeGenerator generator; + generator.Run(loop); + + return VerifySame(p.E, p.expectedE); +} + +Scalar LoopNestFuse_test1() +{ + auto p = GetMatMul3TestCaseParameters(8, 8, 8, 8); + const auto M = p.M; + const auto N = p.N; + const auto K = p.K; + const auto L = p.L; + + Index i("i"), j("j"), k("k"), l("l"); + LoopNest loopC({ { i, { 0, M } }, + { j, { 0, N } }, + { k, { 0, K } } }); + LoopNest loopE({ { i, { 0, M } }, + { j, { 0, N } }, + { l, { 0, L } } }); + + auto initCKernel = Kernel("initC") + .Inputs(p.C.GetValue()) + .Indices(i, j) + .Define([](Matrix C, Scalar i, Scalar j) { + C(i, j) = 0; + }); + + auto computeCKernel = Kernel("matmulC") + .Inputs(p.A.GetValue(), p.B.GetValue(), p.C.GetValue()) + .Indices(i, j, k) + .Define([](Matrix A, Matrix B, Matrix C, Scalar i, Scalar j, Scalar k) { + C(i, j) += A(i, k) * B(k, j); + }); + + auto initEKernel = Kernel("initE") + .Inputs(p.E.GetValue()) + .Indices(i, l) + .Define([](Matrix E, Scalar i, Scalar l) { + E(i, l) = 0; + }); + + auto computeEKernel = Kernel("matmulE") + .Inputs(p.C.GetValue(), p.D.GetValue(), p.E.GetValue()) + .Indices(i, j, l) + .Define([](Matrix C, Matrix D, Matrix E, Scalar i, Scalar j, Scalar l) { + E(i, l) += C(i, j) * D(j, l); + }); + + loopC.AddKernel(initCKernel, { First(k) }); + loopC.AddKernel(computeCKernel, LoopNest::ConstraintType::predicate); + + loopE.AddKernel(initEKernel, { First(j) }); + 
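// First(j) is a kernel predicate: initEKernel fires only on the first iteration of j, while a kernel added with ConstraintType::predicate (below) runs on every iteration of the nest. + 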
loopE.AddKernel(computeEKernel, LoopNest::ConstraintType::predicate); + + // Now fuse the loops + auto fusedLoops = Fuse(loopC, loopE, { l }, { k }); + fusedLoops.SetLoopOrder({ i, j, k, l }); + +#if 0 + PrintLoops(fusedLoops, "LoopNestFuse_test1: fusedLoops"); +#endif + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + + CodeGenerator generator; + generator.Run(fusedLoops); + + return VerifySame(p.E, p.expectedE); +} + +LoopNest GetMatMulLoopNest(std::string name, const Matrix& A, const Matrix& B, const Matrix& C, const Index& i, const Index& j, const Index& k, bool initResult = true) +{ + const int M = static_cast(C.Rows()); + const int N = static_cast(C.Columns()); + const int K = static_cast(A.Columns()); + + LoopNest loop({ { i, { 0, M } }, + { j, { 0, N } }, + { k, { 0, K } } }); + + auto initCKernel = Kernel("init_" + name) + .Inputs(C.GetValue()) + .Indices(i, j) + .Define([](Matrix C, Scalar i, Scalar j) { + C(i, j) = 0; + }); + + auto innerKernel = Kernel("matmul_" + name) + .Inputs(A.GetValue(), B.GetValue(), C.GetValue()) + .Indices(i, j, k) + .Define([](Matrix A, Matrix B, Matrix C, Scalar i, Scalar j, Scalar k) { + C(i, j) += A(i, k) * B(k, j); + }); + + if (initResult) + loop.AddKernel(initCKernel, { First(k) }); + loop.AddKernel(innerKernel, LoopNest::ConstraintType::predicate); + + return loop; +} + +Scalar LoopNestFuse_test2() +{ + auto p = GetMatMul3TestCaseParameters(8, 8, 8, 8); + + Index i("i"), j("j"), k("k"), l("l"); + + LoopNest loopC = GetMatMulLoopNest("C", p.A, p.B, p.C, i, j, k); // C = A * B + LoopNest loopE = GetMatMulLoopNest("E", p.C, p.D, p.E, i, l, j); // E = C * D + +#if 0 + PrintLoops(loopC, "LoopNestFuse_test: loopC"); + PrintLoops(loopE, "LoopNestFuse_test: loopE"); +#endif + + // Now fuse the loops + auto fusedLoops = Fuse(loopC, loopE, { l }, { k }); + +#if 0 + PrintLoops(fusedLoops, "LoopNestFuse_test2: fusedLoops"); +#endif + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + + CodeGenerator generator; + generator.Run(fusedLoops); + + If( + VerifySame(p.E, p.expectedE) == 0, + [&] { + ok = 0; + }); + + return ok; +} + +Scalar LoopNestFuse_test3() +{ + auto p = GetMatMul3TestCaseParameters(8, 8, 8, 8); + + Index i("i"), j("j"), k("k"), l("l"); + + LoopNest loopC = GetMatMulLoopNest("C", p.A, p.B, p.C, i, j, k); // C = A * B + LoopNest loopE = GetMatMulLoopNest("E", p.C, p.D, p.E, i, l, j); // E = C * D + +#if 0 + PrintLoops(loopC, "LoopNestFuse_test: loopC"); + PrintLoops(loopE, "LoopNestFuse_test: loopE"); +#endif + + // Now fuse the loops + auto fusedLoops = Fuse(loopC, loopE); + +#if 0 + PrintLoops(fusedLoops, "LoopNestFuse_test2: fusedLoops"); +#endif + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + + CodeGenerator generator; + generator.Run(fusedLoops); + + If( + VerifySame(p.E, p.expectedE) == 0, + [&] { + ok = 0; + }); + + return ok; +} + +Scalar ConvertedConstraint_test1() +{ + std::string loopOrder = "ijk"; + const int N = 8; + auto A = MakeMatrix(N, N, "A"); + auto B = MakeMatrix(N, N, "B"); + auto C = MakeMatrix(N, N, "C"); + + // initialize A, B, and C + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + A(i, j) = i - j; + B(i, j) = i + 2 * j; + C(i, j) = 100; + }); + }); + + // The input matrices: + // A: B: C: + // [ 0, -1, -2, -3, -4, -5, -6, -7] [ 0, 2, 4, 6, 8, 10, 12, 14] [ 100 100 100 ... ] + // [ 1, 0, -1, -2, -3, -4, -5, -6] [ 1, 3, 5, 7, 9, 11, 13, 15] [ 100 100 100 ... ] + // [ 2, 1, 0, -1, -2, -3, -4, -5] [ 2, 4, 6, 8, 10, 12, 14, 16] [ ... ] + // [ 3, 2, 1, 0, -1, -2, -3, -4] [ 3, 5, 7, 9, 11, 13, 15, 17] [ ... 
] + // [ 4, 3, 2, 1, 0, -1, -2, -3] [ 4, 6, 8, 10, 12, 14, 16, 18] [ ... ] + // [ 5, 4, 3, 2, 1, 0, -1, -2] [ 5, 7, 9, 11, 13, 15, 17, 19] [ ... ] + // [ 6, 5, 4, 3, 2, 1, 0, -1] [ 6, 8, 10, 12, 14, 16, 18, 20] [ ... ] + // [ 7, 6, 5, 4, 3, 2, 1, 0] [ 7, 9, 11, 13, 15, 17, 19, 21] [ ... ] + + // (A * B) + 1 (the desired result): + // [-139, -195, -251, -307, -363, -419, -475, -531] + // [-111, -151, -191, -231, -271, -311, -351, -391] + // [ -83, -107, -131, -155, -179, -203, -227, -251] + // [ -55, -63, -71, -79, -87, -95, -103, -111] + // [ -27, -19, -11, -3, 5, 13, 21, 29] + // [ 1, 25, 49, 73, 97, 121, 145, 169] + // [ 29, 69, 109, 149, 189, 229, 269, 309] + // [ 57, 113, 169, 225, 281, 337, 393, 449] + + Index i("i"), j("j"), k("k"); + + auto innerKernel = Kernel("matmul") + .Inputs(A.GetValue(), B.GetValue(), C.GetValue()) + .Indices(i, j, k) + .Define(matmul_kernel); + auto initCKernel = Kernel("init") + .Inputs(C.GetValue()) + .Indices(i, j) + .Define(initToZero); + auto postProcessCKernel = Kernel("post") + .Inputs(C.GetValue()) + .Indices(i, j) + .Define(addOne); + + LoopNest loop({ { i, { 0, N } }, + { j, { 0, N } }, + { k, { 0, N } } }); + + CodePositionConstraints preConstraint{ LoopFragmentType::prologue, { i, j }, {} }; + loop.AddKernel(initCKernel, preConstraint); + + loop.AddKernel(innerKernel, LoopNest::ConstraintType::constraint); + + CodePositionConstraints postConstraint{ LoopFragmentType::epilogue, { i, j }, {} }; + loop.AddKernel(postProcessCKernel, postConstraint); + + SplitAndSetOrder(loop, { i, j, k }, { 4, 2 }, loopOrder); + + CodeGenerator generator; + generator.Run(loop); + +#if 0 // DEBUGGING + PrintLoops(loop, "ConvertedConstraint_test1"); +#endif + +#if 0 // DEBUGGING + InvokeForContext([&](auto&) { + ForRange(N, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + auto val = C(i, j).Get(); + Log() << std::setw(5) << val; + }); + Log() << EOL; + }); + }); +#endif + + return C(1, 2) + C(2, 1) - (-191 + -107); // will return 0 if calculation is correct +} + +Scalar ConvertedConstraint_test2() +{ + auto matrix = MakeMatrix(4, 5); + IndexRange i("i", { 0, 4 }), j("j", { 0, 5 }); + + auto kernel = Kernel("kernel") + .Inputs(matrix.GetValue()) + .Indices(i.GetIndex(), j.GetIndex()) + .Define(loopnest_kernel); + + LoopNest loop(std::vector{ i, j }); + loop.AddKernel(kernel, LoopNest::ConstraintType::constraint); + loop.Split(i.GetIndex(), 2); + + CodeGenerator generator; + generator.Run(loop); + +#if 0 // DEBUGGING + PrintLoops(loop, "ConvertedConstraint_test2"); +#endif + +#if 0 // DEBUGGING + InvokeForContext([&](auto&) { + ForRange(4, [&](Scalar i) { + ForRange(5, [&](Scalar j) { + auto val = matrix(i, j).Get(); + Log() << std::setw(5) << val; + }); + Log() << EOL; + }); + }); +#endif + + return matrix(2, 3) - 19; // will return 0 if calculation is correct +} +} // namespace ell diff --git a/libraries/value/test/src/Matrix_test.cpp b/libraries/value/test/src/Matrix_test.cpp index db2c965cf..2b960e07b 100644 --- a/libraries/value/test/src/Matrix_test.cpp +++ b/libraries/value/test/src/Matrix_test.cpp @@ -71,7 +71,7 @@ namespace auto mathRowVector = mathMatrix.GetRow(rowIndex); auto rowVector = matrix.Row((int)rowIndex); Vector expected = mathRowVector.ToArray(); - If(Verify(rowVector, expected) != 0, [&] { + If(VerifySame(rowVector, expected) != 0, [&] { ok2 = 1; }); } @@ -87,7 +87,7 @@ namespace auto mathColumnVector = mathMatrix.GetColumn(columnIndex); auto columnVector = matrix.Column((int)columnIndex); Vector expected = mathColumnVector.ToArray(); - 
If(Verify(columnVector, expected) != 0, [&] { + If(VerifySame(columnVector, expected) != 0, [&] { ok2 = 1; }); } @@ -169,7 +169,7 @@ Scalar Matrix_test3() std::vector{ 1.2f + 3.4f, 2.3f + 3.4f }, std::vector{ 3.4f + 3.4f, 4.5f + 3.4f } }); Matrix actual = m + testScalar; - If(0 != Verify(actual, expected), [&] { + If(0 != VerifySame(actual, expected), [&] { DebugPrint("Matrix_test3 matrix scalar addition failed \n"); ok = 1; }); @@ -179,7 +179,7 @@ Scalar Matrix_test3() std::vector{ 1.2f - 3.4f, 2.3f - 3.4f }, std::vector{ 3.4f - 3.4f, 4.5f - 3.4f } }); Matrix actual = m - testScalar; - If(0 != Verify(actual, expected), [&] { + If(0 != VerifySame(actual, expected), [&] { DebugPrint("Matrix_test3 matrix scalar subtraction failed \n"); ok = 1; }); @@ -189,7 +189,7 @@ Scalar Matrix_test3() std::vector{ 1.2f * 3.4f, 2.3f * 3.4f }, std::vector{ 3.4f * 3.4f, 4.5f * 3.4f } }); Matrix actual = m * testScalar; - If(0 != Verify(actual, expected), [&] { + If(0 != VerifySame(actual, expected), [&] { DebugPrint("Matrix_test3 matrix scalar multiplication failed \n"); ok = 1; }); @@ -199,7 +199,7 @@ Scalar Matrix_test3() std::vector{ 1.2f / 3.4f, 2.3f / 3.4f }, std::vector{ 3.4f / 3.4f, 4.5f / 3.4f } }); Matrix actual = m / testScalar; - If(0 != Verify(actual, expected), [&] { + If(0 != VerifySame(actual, expected), [&] { DebugPrint("Matrix_test3 matrix scalar division failed \n"); ok = 1; }); @@ -211,7 +211,7 @@ Scalar Matrix_test3() std::vector{ 1.2f + 0.1f, 2.3f + 1.2f }, std::vector{ 3.4f + 2.3f, 4.5f + 3.4f } }); Matrix actual = m + testMatrix; - If(0 != Verify(actual, expected), [&] { + If(0 != VerifySame(actual, expected), [&] { DebugPrint("Matrix_test3 matrix + matrix failed \n"); ok = 1; }); @@ -221,7 +221,7 @@ Scalar Matrix_test3() std::vector{ 1.2f - 0.1f, 2.3f - 1.2f }, std::vector{ 3.4f - 2.3f, 4.5f - 3.4f } }); Matrix actual = m - testMatrix; - If(0 != Verify(actual, expected), [&] { + If(0 != VerifySame(actual, expected), [&] { DebugPrint("Matrix_test3 matrix - matrix failed \n"); ok = 1; }); @@ -229,6 +229,40 @@ Scalar Matrix_test3() return ok; } +// This test verifies: +// - "For" with Matrix +// - Assignment from Matrix of one dimension order to another +// NOTE: This test currently passes for Compute but FAILS for Compile +Scalar Matrix_test4() +{ + Scalar ok = Allocate(ValueType::Int32, ScalarLayout); + ok = 0; + + std::vector> dt{ + std::vector{ 1, 2, 3 }, + std::vector{ 4, 5, 6 }, + }; + auto source = Matrix(dt); + auto destValue = Allocate(value::ValueType::Int32, source.GetValue().GetLayout().ReorderedCopy(DimensionOrder{ 1, 0 })); + auto dest = Matrix(destValue); + + For(source, [&](value::Scalar row, value::Scalar column) { + dest(row, column) = source(row, column); + }); + + std::vector expectedValues{ 1, 4, 2, 5, 3, 6 }; + auto expected = Vector(expectedValues); + + Vector actual = AsVector(AsFullView(dest)); + + If(VerifySame(actual, expected) == 1, [&] { + DebugPrint("Matrix_test4 matrix assignment to different dimension order failed \n"); + ok = 1; + }); + + return ok; +} + Scalar Reshape_test() { Scalar ok = Allocate(ValueType::Int32, ScalarLayout); @@ -241,12 +275,12 @@ Scalar Reshape_test() Vector v = std::vector{ 1, 2, 3, 4, 5, 6 }; - If(0 != Verify(ToVector(m.GetValue()), v), [&] { + If(0 != VerifySame(ToVector(m.GetValue()), v), [&] { DebugPrint("Reshape_test matrix into a vector failed \n"); ok = 1; }); - If(0 != Verify(ToMatrix(v.GetValue(), 2, 3), m), [&] { + If(0 != VerifySame(ToMatrix(v.GetValue(), 2, 3), m), [&] { DebugPrint("Reshape_test vector into a matrix 
failed \n"); ok = 1; }); @@ -270,7 +304,7 @@ Scalar GEMV_test() Vector expected(std::vector{ 9.3f, 20.3f }); - If(0 != Verify(actual, expected, 1e-5), [&] { + If(0 != VerifySame(actual, expected, 1e-5), [&] { DebugPrint("GEMV_test - failed \n"); ok = 1; }); @@ -281,8 +315,8 @@ Scalar MatrixReferenceTest() { const int N = 4; const int kernelSize = 2; - const int offsetRows = 0; - const int offsetCols = 1; + const Scalar offsetRows = 0; + const Scalar offsetCols = 1; auto A = MakeMatrix(N, N); @@ -322,7 +356,7 @@ Scalar MatrixReferenceTest() Scalar ok = Allocate(ScalarLayout); ok = 1; If( - Verify(valueCachePtr, expected) == 0, + VerifySame(valueCachePtr, expected) == 0, [&] { ok = 0; }) @@ -343,8 +377,8 @@ Scalar RefMatrixReferenceTest() { const int N = 4; const int kernelSize = 2; - const int offsetRows = 0; - const int offsetCols = 1; + const Scalar offsetRows = 0; + const Scalar offsetCols = 1; auto A = MakeMatrix(N, N, "A"); @@ -384,7 +418,7 @@ Scalar RefMatrixReferenceTest() Scalar ok = MakeScalar("ok"); ok = 1; If( - Verify(valueCachePtr, expected) == 0, + VerifySame(valueCachePtr, expected) == 0, [&] { ok = 0; }) diff --git a/libraries/value/test/src/Scalar_test.cpp b/libraries/value/test/src/Scalar_test.cpp index ffb15d410..4dbdb9de3 100644 --- a/libraries/value/test/src/Scalar_test.cpp +++ b/libraries/value/test/src/Scalar_test.cpp @@ -147,13 +147,13 @@ Scalar RefScalarRefCtorsTest() Scalar expected = Allocate(ScalarLayout); expected = 100; - testing::ProcessTest("Value initial pointer level", expected.GetValue().PointerLevel() == 1); + testing::ProcessQuietTest("Value initial pointer level", expected.GetValue().PointerLevel() == 1); Ref scalarPtr = x; - testing::ProcessTest("Ref ctor", scalarPtr.GetValue().PointerLevel() == 2); + testing::ProcessQuietTest("Ref ctor", scalarPtr.GetValue().PointerLevel() == 2); Ref scalarPtrCopy = x; - testing::ProcessTest("Ref copy semantics", scalarPtr.GetValue().PointerLevel() == scalarPtrCopy.GetValue().PointerLevel()); + testing::ProcessQuietTest("Ref copy semantics", scalarPtr.GetValue().PointerLevel() == scalarPtrCopy.GetValue().PointerLevel()); Ref scalarPtrMove = std::move(scalarPtr); - testing::ProcessTest("Ref move semantics", !scalarPtr.GetValue().IsDefined() && scalarPtrMove.GetValue().PointerLevel() == 2); + testing::ProcessQuietTest("Ref move semantics", !scalarPtr.GetValue().IsDefined() && scalarPtrMove.GetValue().PointerLevel() == 2); return result; } @@ -198,4 +198,67 @@ Scalar RefScalarRefRefRefTest() If(scalar != expected, [&] { result = 1; }); return result; } + +Scalar SequenceLogicalAndTest() +{ + int fourInt = 4; + Scalar twoScalar = 2; + Scalar fourScalar = 4; + + Scalar fourGTTwo = fourInt > twoScalar; + Scalar fourGTFour = fourInt > fourScalar; + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + If((fourGTTwo && fourGTFour), + [&]() { + DebugPrint("Error! 4 > 2 && 4 > 4\n"); + }) + .ElseIf(fourGTTwo, + [&]() { + ok = 0; + }) + .ElseIf(fourGTFour, + [&]() { + DebugPrint("Error! 4 <= 2 && 4 > 4\n"); + }) + .Else( + [&]() { + DebugPrint("Error! 4 <= 2 && 4 <= 4\n"); + }); + return ok; +} + +Scalar SequenceLogicalAndTestWithCopy() +{ + int fourInt = 4; + Scalar twoScalar = 2; + Scalar fourScalar = 4; + + Scalar fourGTTwo = fourInt > twoScalar; + Scalar copyFourGTTwo = fourInt > twoScalar; + Scalar fourGTFour = fourInt > fourScalar; + Scalar copyFourGTFour = fourInt > fourScalar; + + Scalar ok = Allocate(ScalarLayout); + ok = 1; + If((fourGTTwo && fourGTFour), + [&]() { + DebugPrint("Error! 
4 > 2 && 4 > 4\n"); + }) + .ElseIf(copyFourGTTwo, + [&]() { + ok = 0; + }) + .ElseIf(copyFourGTFour, + [&]() { + DebugPrint("Error! 4 <= 2 && 4 > 4\n"); + }) + .Else( + [&]() { + DebugPrint("Error! 4 <= 2 && 4 <= 4\n"); + }); + return ok; +} + } // namespace ell diff --git a/libraries/value/test/src/Tensor_test.cpp b/libraries/value/test/src/Tensor_test.cpp index 7ec94c047..7d2716515 100644 --- a/libraries/value/test/src/Tensor_test.cpp +++ b/libraries/value/test/src/Tensor_test.cpp @@ -154,21 +154,21 @@ Scalar Tensor_test1() { Vector mathSlicedVector = math::GetSlice(mathTensor, column, channel).ToArray(); auto slicedVector = tensor.Slice(Slice::All, column, channel); - If(Verify(slicedVector, mathSlicedVector) != 0, [&] { + If(VerifySame(slicedVector, mathSlicedVector) != 0, [&] { ok2 = 1; }); } { Vector mathSlicedVector = math::GetSlice(mathTensor, row, channel).ToArray(); auto slicedVector = tensor.Slice(row, Slice::All, channel); - If(Verify(slicedVector, mathSlicedVector) != 0, [&] { + If(VerifySame(slicedVector, mathSlicedVector) != 0, [&] { ok2 = 1; }); } } Vector mathSlicedVector = math::GetSlice(mathTensor, row, column).ToArray(); auto slicedVector = tensor.Slice(row, column, Slice::All); - If(Verify(slicedVector, mathSlicedVector) != 0, [&] { + If(VerifySame(slicedVector, mathSlicedVector) != 0, [&] { ok2 = 1; }); } @@ -251,7 +251,7 @@ Scalar Tensor_test3() std::vector{ 3.2f + s, 2.1f + s } }, }; Tensor actual = t + testScalar; - If(Verify(actual, expected) != 0, [&] { + If(VerifySame(actual, expected) != 0, [&] { ok = 1; DebugPrint("Tensor_test3: Tensor scalar addition failed\n"); }); @@ -267,7 +267,7 @@ Scalar Tensor_test3() std::vector{ 3.2f - s, 2.1f - s } }, }; Tensor actual = t - testScalar; - If(Verify(actual, expected) != 0, [&] { + If(VerifySame(actual, expected) != 0, [&] { ok = 1; DebugPrint("Tensor_test3: Tensor scalar subtraction failed\n"); }); @@ -283,7 +283,7 @@ Scalar Tensor_test3() std::vector{ 3.2f * s, 2.1f * s } }, }; Tensor actual = t * testScalar; - If(Verify(actual, expected) != 0, [&] { + If(VerifySame(actual, expected) != 0, [&] { ok = 1; DebugPrint("Tensor_test3: Tensor scalar multiplication failed\n"); }); @@ -299,7 +299,7 @@ Scalar Tensor_test3() std::vector{ 3.2f / s, 2.1f / s } }, }; Tensor actual = t / testScalar; - If(Verify(actual, expected) != 0, [&] { + If(VerifySame(actual, expected) != 0, [&] { ok = 1; DebugPrint("Tensor_test3: Tensor scalar division failed\n"); }); @@ -384,7 +384,7 @@ Scalar Tensor_slice_test1() Matrix mathMatrix = ToMatrix(mathTensor.GetSlice(0)); auto matrix = inputTensor.Slice(Slice::All, Slice::All, 0); - If(Verify(matrix, mathMatrix) != 0, [&] { + If(VerifySame(matrix, mathMatrix) != 0, [&] { ok = 1; DebugPrint("Tensor_slice_test1: Tensor row-column GetSlice failed\n"); }); @@ -396,7 +396,7 @@ Scalar Tensor_slice_test1() Matrix mathMatrix = ToMatrix(slice); auto matrix = inputTensor.Slice(0, Slice::All, Slice::All); - If(Verify(matrix, mathMatrix) != 0, [&] { + If(VerifySame(matrix, mathMatrix) != 0, [&] { ok = 1; DebugPrint("Tensor_slice_test1: Tensor column-channel GetSlice failed\n"); }); @@ -406,7 +406,7 @@ Scalar Tensor_slice_test1() Vector mathVector = mathTensor.GetSlice(0, 0).ToArray(); auto vector = inputTensor.Slice(0, 0, Slice::All); - If(Verify(mathVector, vector) != 0, [&] { + If(VerifySame(mathVector, vector) != 0, [&] { ok = 1; DebugPrint("Tensor_slice_test1: Tensor channel vector failed\n"); }); @@ -416,7 +416,7 @@ Scalar Tensor_slice_test1() Vector mathVector = mathTensor.GetSlice(0, 0).ToArray(); auto 
vector = inputTensor.Slice(0, Slice::All, 0); - If(Verify(mathVector, vector) != 0, [&] { + If(VerifySame(mathVector, vector) != 0, [&] { ok = 1; DebugPrint("Tensor_slice_test1: Tensor column vector failed"); }); @@ -426,7 +426,7 @@ Scalar Tensor_slice_test1() Vector mathVector = mathTensor.GetSlice(0, 0).ToArray(); auto vector = inputTensor.Slice(Slice::All, 0, 0); - If(Verify(mathVector, vector) != 0, [&] { + If(VerifySame(mathVector, vector) != 0, [&] { ok = 1; DebugPrint("Tensor_slice_test1: Tensor row vector failed"); }); diff --git a/libraries/value/test/src/TestUtil.cpp b/libraries/value/test/src/TestUtil.cpp index 4967e5c3c..935b2e2a1 100644 --- a/libraries/value/test/src/TestUtil.cpp +++ b/libraries/value/test/src/TestUtil.cpp @@ -8,25 +8,45 @@ #include "TestUtil.h" +#include #include +#include +#include #include #include #include #include +#include +#include + #include #include +#include -#include -#include +#include #include +#include #include #include -#include +#include +#include +#include +#include #include +#include + +#if !defined(WIN32) +#include +#include +#include +#else +#include +#endif // !defined(WIN32) +using namespace ell::emitters; using namespace ell::utilities; using namespace ell::value; namespace math = ell::math; @@ -36,13 +56,12 @@ using math::MatrixLayout; template using LayoutType = std::integral_constant; +using namespace ell::emitters; +using namespace ell::utilities; +using namespace ell::value; + namespace ell { -void DebugPrint(std::string message) -{ - GetContext().DebugPrint(message); -} - void PrintMatrix(std::string indent, Matrix e) { if (!e.GetValue().IsConstant()) @@ -109,69 +128,81 @@ void PrintMatrix(std::string indent, Matrix e) } } +void PrintLoops(const value::loopnests::LoopNest& loop, std::string tag) +{ + InvokeForContext([&](auto&) { + std::stringstream ss; + DebugDump(loop, tag, &ss); + std::cout << ss.str() << std::endl; + }); +} + Scalar EqualEpsilon(Scalar x, Scalar y, double epsilon) { - if (x.GetType() == ValueType::Int32) - { - return x == y; - } - Scalar e = Allocate(ValueType::Double, ScalarLayout); - e = epsilon; - Scalar tens = Floor(Log10(Cast(x, ValueType::Double))); - If(tens > 0.0, [&] { - // then we have some precision already on the left hand side of the decimal place, so remove that from epsilon - e *= Pow(10.0, tens); - }); - Scalar rx = Allocate(ValueType::Double, ScalarLayout); - Scalar ry = Allocate(ValueType::Double, ScalarLayout); - rx = Floor(Cast(x, ValueType::Double) / e) * e; - ry = Floor(Cast(y, ValueType::Double) / e) * e; - return rx == ry; + Scalar result = Allocate(ScalarLayout); + result = 1; + + If(x == y, [&] { +#if 0 // Useful for debugging + DebugPrint("## Scalar compare passed (exactly equal)\n"); + DebugPrint(" Expected: "); + DebugPrintVector(AsVector(x)); + DebugPrint("\n"); + DebugPrint(" Actual: "); + DebugPrintVector(AsVector(y)); + DebugPrint("\n"); +#endif // 0 + result = 1; + }) + .Else([&] { + if (auto type = x.GetType(); type == ValueType::Float || type == ValueType::Double) + { + auto tolerance = Cast(epsilon, type); + If((x - y) <= tolerance, [&] { + If((y - x) <= tolerance, [&] { +#if 0 // Useful for debugging + DebugPrint("## Scalar compare passed\n"); + DebugPrint(" Expected: "); + DebugPrintVector(AsVector(x)); + DebugPrint("\n"); + DebugPrint(" Actual: "); + DebugPrintVector(AsVector(y)); + DebugPrint("\n"); +#endif // 0 + result = 1; + }); + }); + } + else + { + result = 0; + } + }); + + return result; } Scalar NotEqualEpsilon(Scalar x, Scalar y, double epsilon) { - if 
(x.GetType() == ValueType::Int32) - { - return x != y; - } - Scalar ep = epsilon; - Scalar e = Allocate(ValueType::Double, ScalarLayout); - e = epsilon; - Scalar tens = Floor(Log10(Cast(x, ValueType::Double))); - If(tens > 0.0, [&] { - // then we have some precision already on the left hand side of the decimal place, so remove that from epsilon - e *= Pow(10.0, tens); - }); - Scalar rx = Allocate(ValueType::Double, ScalarLayout); - Scalar ry = Allocate(ValueType::Double, ScalarLayout); - rx = Floor(Cast(x, ValueType::Double) / e) * e; - ry = Floor(Cast(y, ValueType::Double) / e) * e; - Scalar diff = Abs(rx - ry); - InvokeForContext([&] { - double t = tens.Get(); - double dx = diff.Get(); - if (dx > epsilon) - { - std::cout << std::setprecision(10); - std::cout << " NotEqualEpsilon failed: t=" << t << ": " << dx << " > " << epsilon << "\n"; - } - }); - return diff > ep; + auto result = EqualEpsilon(x, y, epsilon); + + // TODO: overload the logical not operator + If(result == 1, [&] { result = 0; }).Else([&] { result = 1; }); + return result; } -Scalar Verify(Vector actual, Vector expected, double epsilon) +Scalar VerifySame(Vector actual, Vector expected, double epsilon) { - Scalar fail = 1; Scalar ok = Allocate(ValueType::Int32, ScalarLayout); - ok = 0; For(actual, [&](Scalar index) { Scalar x = actual(index); Scalar y = expected(index); - If(NotEqualEpsilon(x, y, epsilon), [&] { - ok = fail; + + If(ok == 0, [&] { + ok = NotEqualEpsilon(x, y, epsilon); }); }); + If(ok != 0, [&] { DebugPrint("## Vector compare failed\n"); DebugPrint(" Expected: "); @@ -186,14 +217,12 @@ Scalar Verify(Vector actual, Vector expected, double epsilon) Scalar VerifyDifferent(Vector actual, Vector expected, double epsilon) { - Scalar fail = 1; Scalar ok = Allocate(ValueType::Int32, ScalarLayout); - ok = 0; For(actual, [&](Scalar index) { Scalar x = actual(index); Scalar y = expected(index); - If(EqualEpsilon(x, y, epsilon), [&] { - ok = fail; + If(ok == 0, [&] { + ok = EqualEpsilon(x, y, epsilon); }); }); If(ok != 0, [&] { @@ -205,24 +234,47 @@ Scalar VerifyDifferent(Vector actual, Vector expected, double epsilon) DebugPrintVector(actual); DebugPrint("\n"); }); + return ok; } -Scalar Verify(Matrix actual, Matrix expected, double epsilon) +Scalar VerifySame(Matrix actual, Matrix expected, double epsilon) +{ + Scalar ok = Allocate(ValueType::Int32, ScalarLayout); + For(actual, [&](Scalar row, Scalar col) { + Scalar x = actual(row, col); + Scalar y = expected(row, col); + If(ok == 0, [&] { + ok = NotEqualEpsilon(x, y, epsilon); + }); + }); + If(ok != 0, [&] { + DebugPrint("## Matrices are different\n"); + InvokeForContext([&] { + std::cout << "Expected: \n"; + PrintMatrix(" ", expected); + std::cout << "\n"; + std::cout << "Actual: \n"; + PrintMatrix(" ", actual); + std::cout << "\n"; + }); + }); + return ok; +} + +Scalar VerifyDifferent(Matrix actual, Matrix expected, double epsilon) { - Scalar fail = 1; Scalar ok = Allocate(ValueType::Int32, ScalarLayout); - ok = 0; For(actual, [&](Scalar row, Scalar col) { Scalar x = actual(row, col); Scalar y = expected(row, col); - If(NotEqualEpsilon(x, y, epsilon), [&] { - ok = fail; + If(ok == 0, [&] { + ok = EqualEpsilon(x, y, epsilon); }); }); If(ok != 0, [&] { - DebugPrint("## Matrix compare failed\n"); - InvokeForContext([&](auto&) { + DebugPrint("## Matrices are not different\n"); + InvokeForContext([&] { std::cout << "Expected: \n"; PrintMatrix(" ", expected); std::cout << "\n"; @@ -234,7 +286,20 @@ Scalar Verify(Matrix actual, Matrix expected, double epsilon) return 
ok; } -Scalar Verify(Tensor actual, Tensor expected, double epsilon) +Scalar VerifySame(Tensor actual, Tensor expected, double epsilon) +{ + Scalar ok = Allocate(ValueType::Int32, ScalarLayout); + For(actual, [&](Scalar row, Scalar col, Scalar ch) { + Scalar x = actual(row, col, ch); + Scalar y = expected(row, col, ch); + If(ok == 0, [&] { + ok = NotEqualEpsilon(x, y, epsilon); + }); + }); + return ok; +} + +Scalar VerifyDifferent(Tensor actual, Tensor expected, double epsilon) { Scalar fail = Cast(1, ValueType::Int32); Scalar ok = Allocate(ValueType::Int32, ScalarLayout); @@ -242,11 +307,175 @@ Scalar Verify(Tensor actual, Tensor expected, double epsilon) For(actual, [&](Scalar row, Scalar col, Scalar ch) { Scalar x = actual(row, col, ch); Scalar y = expected(row, col, ch); - If(NotEqualEpsilon(x, y, epsilon), [&] { - DebugPrint("## Tensor compare failed\n"); - ok = fail; + If(ok == 0, [&] { + ok = EqualEpsilon(x, y, epsilon); + }); + }); + return ok; +} + +Scalar VerifySame(Array actual, Array expected, double epsilon) +{ + Scalar ok = Allocate(ValueType::Int32, ScalarLayout); + For(actual, [&](const std::vector& indices) { + Scalar x = actual(indices); + Scalar y = expected(indices); + If(ok == 0, [&] { + ok = NotEqualEpsilon(x, y, epsilon); + }); + }); + return ok; +} + +Scalar VerifyDifferent(Array actual, Array expected, double epsilon) +{ + Scalar fail = Cast(1, ValueType::Int32); + Scalar ok = Allocate(ValueType::Int32, ScalarLayout); + ok = 0; + For(actual, [&](const std::vector& indices) { + Scalar x = actual(indices); + Scalar y = expected(indices); + If(ok == 0, [&] { + ok = EqualEpsilon(x, y, epsilon); }); }); return ok; } + +Scalar GetTID() +{ + if (auto result = InvokeForContext( + [](auto&) { +#if !defined(WIN32) +#if defined(__APPLE__) +#pragma message("Note: syscall() is deprecated in macOS") + // Note: syscall() is deprecated in macOS, perhaps use pthread_self instead: + // return static_cast(reinterpret_cast(pthread_self())); +#endif // defined(__APPLE__) + return (int32_t)(pid_t)syscall(SYS_gettid); +#else + return (int32_t)GetCurrentThreadId(); +#endif // !defined(WIN32) + }); + result) + { + return *result; + } + + return Scalar( +#if !defined(WIN32) +#if defined(__APPLE__) +#pragma message("Note: syscall() is deprecated in macOS") +#endif // defined(__APPLE__) + Cast( + *DeclareFunction("syscall") + .Decorated(false) + .Returns(Value({ ValueType::Int64, 0 }, ScalarLayout)) + .Parameters( + Value({ ValueType::Int64, 0 }, ScalarLayout)) + .Call(Scalar{ (int64_t)SYS_gettid })) +#else + *DeclareFunction("GetCurrentThreadId") + .Decorated(false) + .Returns(Value({ ValueType::Int32, 0 }, ScalarLayout)) + .Call() +#endif // !defined(WIN32) + ); +} + +void MultiplyMatrices(Matrix& A, Matrix& B, Matrix& C) +{ + auto M = static_cast(C.Rows()); + auto N = static_cast(C.Columns()); + auto K = static_cast(A.Columns()); + + // fill out expected with a simple for-loop gemm + ForRange(M, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + C(i, j) = 0; + }); + }); + ForRange(M, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + ForRange(K, [&](Scalar k) { + C(i, j) += A(i, k) * B(k, j); + }); + }); + }); +} + +MatMul3TestCaseParameters GetMatMul3TestCaseParameters(int M, int N, int K, int L) +{ + auto A = MakeMatrix(M, K, "A"); + auto B = MakeMatrix(K, N, "B"); + auto C = MakeMatrix(M, N, "C"); + auto D = MakeMatrix(N, L, "D"); + auto E = MakeMatrix(M, L, "E"); + + auto expectedC = MakeMatrix(M, N, "expectedC"); + auto expectedE = MakeMatrix(M, L, "expectedE"); + + // initialize matrices + 
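// A(i, j) = i - j, B(i, j) = i + 2 * j, D(i, j) = j - i; C and E start out zeroed + 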
ForRange(M, [&](Scalar i) { + ForRange(K, [&](Scalar j) { + A(i, j) = i - j; + }); + }); + + ForRange(K, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + B(i, j) = i + 2 * j; + }); + }); + + ForRange(M, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + C(i, j) = 0; + }); + }); + + ForRange(N, [&](Scalar i) { + ForRange(L, [&](Scalar j) { + D(i, j) = j - i; + }); + }); + + ForRange(M, [&](Scalar i) { + ForRange(L, [&](Scalar j) { + E(i, j) = 0; + }); + }); + + // fill out expected results with a simple for-loop gemm + // C + ForRange(M, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + expectedC(i, j) = 0; + }); + }); + ForRange(M, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + ForRange(K, [&](Scalar k) { + expectedC(i, j) += A(i, k) * B(k, j); + }); + }); + }); + + // E + ForRange(M, [&](Scalar i) { + ForRange(L, [&](Scalar l) { + expectedE(i, l) = 0; + }); + }); + ForRange(M, [&](Scalar i) { + ForRange(N, [&](Scalar j) { + ForRange(L, [&](Scalar l) { + expectedE(i, l) += expectedC(i, j) * D(j, l); + }); + }); + }); + + return { M, N, K, L, A, B, C, D, E, expectedC, expectedE }; +} + } // namespace ell diff --git a/libraries/value/test/src/Value_test.cpp b/libraries/value/test/src/Value_test.cpp index 32b3ad4ac..99b054f25 100644 --- a/libraries/value/test/src/Value_test.cpp +++ b/libraries/value/test/src/Value_test.cpp @@ -9,23 +9,44 @@ #include "Value_test.h" #include "TestUtil.h" +#include #include +#include #include #include +#include +#include #include #include -#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include #include #include +#include #include +#include + #include +#include #include +#include #include +#include #include #include +#include #include #include #include @@ -39,55 +60,13 @@ #endif // !defined(WIN32) using namespace ell::utilities; +using namespace ell::logging; using namespace ell::value; #define PRINT_IR 0 namespace ell { -namespace -{ - Scalar GetTID() - { - if (auto result = InvokeForContext( - [](auto&) { -#if !defined(WIN32) - return (int32_t)(pid_t)syscall(SYS_gettid); -#else - return (int32_t)GetCurrentThreadId(); -#endif // !defined(WIN32) - }); - result) - { - return *result; - } - - if (auto result = InvokeForContext( - [] { - return Scalar( -#if !defined(WIN32) - *DeclareFunction("syscall") - .Decorated(FunctionDecorated::No) - .Returns(Value({ ValueType::Int64, 0 }, ScalarLayout)) - .Parameters( - Value({ ValueType::Int64, 0 }, ScalarLayout)) - .Call(Scalar{ (int64_t)SYS_gettid }) -#else - *DeclareFunction("GetCurrentThreadId") - .Decorated(FunctionDecorated::No) - .Returns(Value({ ValueType::Int32, 0 }, ScalarLayout)) - .Call() -#endif // !defined(WIN32) - ); - })) - { - return Cast(*result); - } - - throw LogicException(LogicExceptionErrors::notImplemented); - } - -} // namespace void ValueGetTests() { @@ -138,6 +117,122 @@ Scalar Basic_test() return 0; } +Scalar Array_test1() +{ + Scalar ok = Allocate(ValueType::Int32, ScalarLayout); + ok = 0; + + constexpr int rows = 3, columns = 5, channels = 7; + std::vector arrayData(rows * columns * channels); + std::generate(arrayData.begin(), arrayData.end(), [i = 0]() mutable { return ++i; }); + math::ChannelColumnRowTensor mathTensor(3, 5, 7, arrayData); + + MemoryShape physicalSize{ rows, columns, channels }; + DimensionOrder dimensionOrder = RowMajorTensorOrder; + MemoryLayout memoryLayout(physicalSize, dimensionOrder); + Array array(Value(arrayData, memoryLayout)); + + // Check shape + { + auto shape = array.GetValue().GetLayout().GetExtent(); + auto actual1 = 
static_cast(shape[0]); + auto expected1 = static_cast(mathTensor.NumRows()); + if (actual1 != expected1) + { + DebugPrint("Array_test1: value::Array and math::Tensor row check failed\n"); + ok = 1; + } + + auto actual2 = static_cast(shape[1]); + auto expected2 = static_cast(mathTensor.NumColumns()); + if (actual2 != expected2) + { + DebugPrint("Array_test1: value::Array and math::Tensor column check failed\n"); + ok = 1; + } + + auto actual3 = static_cast(shape[2]); + auto expected3 = static_cast(mathTensor.NumChannels()); + if (actual3 != expected3) + { + DebugPrint("Array_test1: value::Array and math::Tensor channel check failed\n"); + ok = 1; + } + } + + // Check for loop iterations + { + Scalar count = Allocate(ValueType::Int32, ScalarLayout); + + // test we can enumerate all items of an array. + For(array, [&](const std::vector& coordinates) { + count += 1; + }); + If(count != static_cast(mathTensor.Size()), [&] { + DebugPrint("Array_test1: for loop didn't visit all elements\n"); + ok = 1; + }); + } + + Scalar ok2 = Allocate(ValueType::Int32, ScalarLayout); + ok2 = 0; + + // Check operator(Scalar...) + InvokeForContext([&](auto&) { + // These tests use row.Get() to get the actual row,col indexes as constants, which can + // only be done during ComputeContext. + + // test we can enumerate all items of an array. + For(array, [&](const std::vector& coordinates) { + const auto& row = coordinates[0]; + const auto& col = coordinates[1]; + const auto& ch = coordinates[2]; + auto rowInt = row.Get(); + auto colInt = col.Get(); + auto chInt = ch.Get(); + auto tensorVal = mathTensor(rowInt, colInt, chInt); + Scalar expected = tensorVal; + Scalar actual = array(row, col, ch); + + If(actual != expected, [&] { + ok2 = 1; + }); + }); + If(ok2 != 0, [&] { + DebugPrint("Array_test1: value::Array and math::Tensor equality check failed\n"); + ok = 1; + }); + }); + + ok2 = 0; + // Check operator(vector) + InvokeForContext([&](auto&) { + // These tests use row.Get() to get the actual row,col indexes as constants, which can + // only be done during ComputeContext. + + // test we can enumerate all items of an array. 
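+ // operator() also accepts the whole coordinate vector at once, instead of one unpacked Scalar per dimension as in the block above.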
+ For(array, [&](const std::vector& coordinates) { + const auto& row = coordinates[0]; + const auto& col = coordinates[1]; + const auto& ch = coordinates[2]; + auto rowInt = row.Get(); + auto colInt = col.Get(); + auto chInt = ch.Get(); + Scalar expected = mathTensor(rowInt, colInt, chInt); + Scalar actual = array(coordinates); + If(actual != expected, [&] { + ok2 = 1; + }); + }); + If(ok2 != 0, [&] { + DebugPrint("Array_test1: value::Array and math::Tensor equality check failed\n"); + ok = 1; + }); + }); + + return ok; +} + Scalar DebugPrint_test() { DebugPrint("### Test that debug print is working: "); @@ -172,40 +267,43 @@ Scalar For_test1() For(input, [&](Scalar index) { actual(index) = input(index); }); - return Verify(input, actual); + return VerifySame(input, actual); } -void TripleLoop(value::Vector input, value::Vector output) +namespace { - if (input.Size() == 0) + void TripleLoop(value::Vector input, value::Vector output) { - return; - } + if (input.Size() == 0) + { + return; + } - Scalar max = Allocate(input.GetType(), ScalarLayout); - max = Cast(0, input.GetType()); - For(input, [&](Scalar index) { - Scalar v = input(index); - If(v > max, [&] { - max = v; + Scalar max = Allocate(input.GetType(), ScalarLayout); + max = Cast(0, input.GetType()); + For(input, [&](Scalar index) { + Scalar v = input(index); + If(v > max, [&] { + max = v; + }); }); - }); - Scalar sum = Allocate(input.GetType(), ScalarLayout); - sum = Cast(0, input.GetType()); - For(input, [&](Scalar index) { - Scalar v = input(index); - v -= max; - sum += v; - output(index) = v; - }); + Scalar sum = Allocate(input.GetType(), ScalarLayout); + sum = Cast(0, input.GetType()); + For(input, [&](Scalar index) { + Scalar v = input(index); + v -= max; + sum += v; + output(index) = v; + }); - For(output, [&](Scalar index) { - Scalar v = input(index); - v /= sum; - output(index) = v; - }); -} + For(output, [&](Scalar index) { + Scalar v = input(index); + v /= sum; + output(index) = v; + }); + } +} // namespace Scalar For_test2() { @@ -213,7 +311,72 @@ Scalar For_test2() Vector expected(std::vector{ 0.4, 0.3, 0.2, 0.1, 0 }); Vector output = MakeVector(input.Size()); TripleLoop(input, output); - return Verify(output, expected); + return VerifySame(output, expected); +} + +Scalar ForInsideIf_test() +{ + auto zero = MakeScalar(); + auto sum = MakeScalar(); + + If(zero == Scalar(0), [&] { + ForRange(10, [&](Scalar i) { + sum += 1; + }); + }).Else([&] { + ForRange(10, [&](Scalar i) { + sum += 2; + }); + }); + + Scalar ok = MakeScalar(); + If(sum != 10, [&] { ok = 1; }); + return ok; +} + +Scalar While_test() +{ + Scalar test = MakeScalar(); + Scalar count = MakeScalar(); + + test = (count != 5); + While(test, [&] { + count += 5; + test = (count != 5); + }); + + Scalar ok = MakeScalar(); + If(count != 5, [&] { ok = 1; }); + return ok; +} + +Scalar WhileInsideIf_test() +{ + auto zero = MakeScalar(); + auto count = MakeScalar(); + count = 10; + auto sum = MakeScalar(); + Scalar notDone = MakeScalar(); + + If(zero == Scalar(0), [&] { + notDone = count > 0; + While(notDone, [&] { + sum += 1; + count -= 1; + notDone = count > 0; + }); + }).Else([&] { + notDone = count > 0; + While(notDone, [&] { + sum += 2; + count -= 1; + notDone = count > 0; + }); + }); + + Scalar ok = MakeScalar(); + If(sum != 10, [&] { ok = 1; }); + return ok; } Scalar Casting_test1() @@ -295,7 +458,7 @@ Scalar Sum_test() If(result != expected, [&] { ok = 1; InvokeForContext([&] { - std::cout << "### Sum_test failed for size " << i << "\n"; + Log() << "### Sum_test 
failed for size " << i << "\n"; }); }); } @@ -347,13 +510,13 @@ namespace Vector input = intrinsics_data; Vector actual = f(input); Vector expected = expected_data; - If(Verify(actual, expected, 1e-5) != 0, [&] { + If(VerifySame(actual, expected, 1e-5) != 0, [&] { ok = 1; DebugPrint("Intrinsics " + fnName + " test failed\n"); }); } - // recurrsively process next item in the tuple + // recursively process next item in the tuple Scalar r = Intrinsics_test1_impl(tuple, std::integral_constant{}); If(r != 0, [&] { @@ -521,7 +684,7 @@ Scalar Parallelized_test1() } } - If(Verify(data, expected) != 0, [&] { + If(VerifySame(data, expected) != 0, [&] { ok = 1; }); @@ -556,8 +719,6 @@ Scalar Parallelized_test2() return ok; } -// Prefetches have no effect on the behavior of the program but can change its performance characteristics, so this -// test just makes sure that the code compiles/runs and behavior is not affected Scalar Parallelized_test3() { constexpr int DataPerThread = 8; @@ -588,7 +749,34 @@ Scalar Parallelized_test3() // Prefetches have no effect on the behavior of the program but can change its performance characteristics, so this // test just makes sure that the code compiles/runs and behavior is not affected +// This test is just Sum_test with prefetching added in Scalar Prefetch_test1() +{ + Scalar ok = Allocate(ScalarLayout); + for (int i = 1; i < 10; ++i) + { + Vector v = MakeVector(i); + std::vector reference(i); + std::iota(reference.begin(), reference.end(), 0); + auto expected = std::accumulate(reference.begin(), reference.end(), 0.f); + + v = reference; + + Prefetch(v); + Scalar result = Sum(v); + If(result != expected, [&] { + ok = 1; + InvokeForContext([&] { + Log() << "### Sum_test failed for size " << i << "\n"; + }); + }); + } + return ok; +} + +// Prefetches have no effect on the behavior of the program but can change its performance characteristics, so this +// test just makes sure that the code compiles/runs and behavior is not affected +Scalar Prefetch_parallelized_test1() { constexpr int DataPerThread = 8; constexpr int NumThreads = 4; @@ -615,7 +803,282 @@ Scalar Prefetch_test1() If(B[i * DataPerThread + j] != i, [&] { ok = 1; }); }); }); + + return ok; +} + +Scalar Fma_test1() +{ + Scalar ok = Allocate(ScalarLayout); + + constexpr float a_ = 3.14f, b_ = 1.8f, c_ = 8.1f, expected_ = a_ * b_ + c_; + + Scalar a = Allocate(ScalarLayout); + Scalar b = Allocate(ScalarLayout); + Scalar c = Allocate(ScalarLayout); + Scalar result = Allocate(ScalarLayout); + Scalar expected = Allocate(ScalarLayout); + + a = a_; + b = b_; + c = c_; + expected = expected_; + result = FusedMultiplyAdd(a, b, c); + + If(NotEqualEpsilon(result, expected, 1e-5) == 1, [&] { ok = 1; }); + return ok; +} + +Scalar Fma_test2() +{ + Scalar ok = Allocate(ScalarLayout); + + constexpr double a_ = 1.763, b_ = 6.182, c_ = 9.1029, expected_ = a_ * b_ + c_; + + Scalar a = Allocate(ScalarLayout); + Scalar b = Allocate(ScalarLayout); + Scalar c = Allocate(ScalarLayout); + Scalar result = Allocate(ScalarLayout); + Scalar expected = Allocate(ScalarLayout); + + a = a_; + b = b_; + c = c_; + expected = expected_; + result = FusedMultiplyAdd(a, b, c); + + If(NotEqualEpsilon(result, expected, 1e-7) == 1, [&] { ok = 1; }); return ok; } +Scalar Fma_test3() +{ + Scalar ok = Allocate(ScalarLayout); + + constexpr int a_ = 8, b_ = 5, c_ = 2, expected_ = a_ * b_ + c_; + + Scalar a = Allocate(ScalarLayout); + Scalar b = Allocate(ScalarLayout); + Scalar c = Allocate(ScalarLayout); + Scalar result = Allocate(ScalarLayout); + 
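// Integer variant of Fma_test1/2: an Int32 multiply-add is exact, so the check below uses plain equality rather than NotEqualEpsilon. + 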
Scalar expected = Allocate(ScalarLayout); + + a = a_; + b = b_; + c = c_; + expected = expected_; + result = FusedMultiplyAdd(a, b, c); + + If(expected != result, [&] { ok = 1; }); + return ok; +} + +Scalar UniqueName_test1() +{ + Scalar ok = Allocate(ScalarLayout); + + ell::testing::IsEqual(UniqueName(""), "_0"); + ell::testing::IsEqual(UniqueName(""), "_1"); + + ell::testing::IsEqual(UniqueName("foo"), "foo_0"); + ell::testing::IsEqual(UniqueName("foo"), "foo_1"); + + return ok; +} + +Scalar Parallelized_ComputeContext_test1() +{ + Scalar ok = Allocate(ScalarLayout); + + InvokeForContext([] { + constexpr auto numItems = 100000; + constexpr auto numThreads = 16; + + bool ready = false; + std::mutex m1, m2; + std::condition_variable cv1, cv2; + std::atomic_int atomicIndex = 0; + + std::array threads; + std::generate(std::begin(threads), std::end(threads), [&] { + return std::thread([&] { + std::unique_lock lock{ m1 }; + cv1.wait(lock, [&] { return ready; }); + int index{}; + while ((index = atomicIndex.fetch_add(1)) < numItems) + { + [[maybe_unused]] Scalar s{ index }; + } + + cv2.notify_one(); + }); + }); + + { + std::unique_lock lock{ m1 }; + ready = true; + } + cv1.notify_all(); + + { + std::unique_lock lock{ m2 }; + cv2.wait(lock, [&] { return atomicIndex < numItems; }); + } + std::for_each(std::begin(threads), std::end(threads), [](std::thread& thread) { thread.join(); }); + }); + + return ok; +} + +Scalar MemCopy_test1() +{ + auto vec = MakeVector(4); + + std::vector expected{ 10, 20, 30, 40 }; + MemCopy(vec, Vector(expected)); + + return VerifySame(vec, expected); +} + +Scalar MemSet_test1() +{ + auto vec = MakeVector(4); + constexpr auto fill = char{ 0x3D }; + + union + { + char c[sizeof(int)]; + int i; + } expected; + std::memset(&expected.c, fill, sizeof(expected.c)); + + MemSet(vec, fill); + + auto ok = MakeScalar(); + For(vec, [&](Scalar index) { + If(vec[index] != expected.i, [&] { + ok = 1; + }); + }); + + return ok; +} + +Scalar NamedLoops_test1() +{ + { + auto accum = MakeScalar(); + ForRange(std::string{ "ForRangeLoop" }, 10, [&](Scalar index) { accum += index; }); + } + + { + auto v = MakeVector(10); + For("ForVectorLoop", v, [&](Scalar index) { v[index] = index; }); + } + + { + auto m = MakeMatrix(10, 10); + For("ForMatrixLoop", m, [&](Scalar row, Scalar col) { m(row, col) = row + row * col; }); + } + + { + auto t = MakeTensor(10, 10, 10); + For("ForTensorLoop", t, [&](Scalar row, Scalar col, Scalar ch) { t(row, col, ch) = row + col + ch + ch * col * row; }); + } + + return MakeScalar(); +} + +Scalar ThreadLocalAllocation_test1() +{ + auto ok = MakeScalar("ok"); + +#ifdef WIN32 + // This thread is disabled for windows + LLVM due to issues with threading and TLS + if (dynamic_cast(&GetContext()) != nullptr) + { + return ok; + } +#endif // WIN32 + + constexpr int NumWorkItems = 40; + auto threadIds = MakeVector(NumWorkItems, "threadIds"); + Parallelize( + NumWorkItems, + std::tuple{ threadIds }, + std::function{ [](Scalar threadId, Vector threadIds) { + Scalar alreadySeen = StaticAllocate("AlreadySeen", ValueType::Int64, ScalarLayout, AllocateFlags::ThreadLocal); + auto tid = Cast(GetTID()); + If( + alreadySeen == int64_t{ 0 }, + [&] { + alreadySeen = tid; + threadIds[threadId] = 1; + }) + .ElseIf( + alreadySeen != tid, + [&] { + threadIds[threadId] = -1; + }); + } }); + + auto totalThreadsRan = MakeScalar("totalThreadsRan"); + auto totalErrors = MakeScalar("totalErrors"); + For(threadIds, [&](Scalar index) { + If(threadIds[index] == 1, [&] { + ++totalThreadsRan; + 
}).ElseIf(threadIds[index] == -1, [&] { + ++totalErrors; + }); + }); + + DebugPrint("Number of errors detected in TLS code: "); + DebugPrintVector(AsVector(totalErrors)); + DebugPrint("\n"); + DebugPrint("Number of actual threads used to complete " + std::to_string(NumWorkItems) + " work items: "); + DebugPrintVector(AsVector(totalThreadsRan)); + DebugPrint("\n"); + + If(totalThreadsRan < 1, [&] { ok = 1; }); + If(totalErrors > 0, [&] { ok = 1; }); + + return ok; +} + +Scalar FunctionPointer_test1() +{ + auto ok = MakeScalar("ok"); + + // This thread is disabled CppEmitterContext for now + if (dynamic_cast(&GetContext()) != nullptr) + { + return ok; + } + + auto realFnDecl = DeclareFunction("foo") + .Returns(Scalar(0)) + .Parameters(Scalar(0)); + auto realFn = realFnDecl + .Define([](Scalar x) -> Scalar { + auto r = MakeScalar(x.GetType()); + r = x + 10; + return r; + }); + + auto fnPtr = DeclareFunction("bar").Returns(Scalar(0)).Parameters(Scalar(0)); + fnPtr.SetPointer(realFnDecl.GetPointer()); + + auto in1 = MakeScalar(); + in1 = 100; + Scalar y = realFn(in1); + + If(y != 110, [&] { ok = 1; }); + + in1 = 200; + Scalar z = *fnPtr.Call(in1); + + If(z != 210, [&] { ok = 1; }); + + return ok; +} } // namespace ell diff --git a/libraries/value/test/src/Vector_test.cpp b/libraries/value/test/src/Vector_test.cpp index 1f401d4fe..223200583 100644 --- a/libraries/value/test/src/Vector_test.cpp +++ b/libraries/value/test/src/Vector_test.cpp @@ -106,7 +106,7 @@ Scalar Vector_test1() // Vector result = convolve1D(signal, filter); Vector expected(referenceResult); - return Verify(result, expected); + return VerifySame(result, expected); } Scalar Vector_test2() @@ -119,7 +119,7 @@ Scalar Vector_test2() { Vector expected(std::vector{ 1.2f + 3.4f, 2.3f + 3.4f }); Vector actual = v + testScalar; - If(Verify(actual, expected) != 0, [&] { + If(VerifySame(actual, expected) != 0, [&] { DebugPrint("## Vector_test2 vector scalar addition failed\n"); ok = 1; }); @@ -127,7 +127,7 @@ Scalar Vector_test2() { Vector expected(std::vector{ 1.2f - 3.4f, 2.3f - 3.4f }); Vector actual = v - testScalar; - If(Verify(actual, expected) != 0, [&] { + If(VerifySame(actual, expected) != 0, [&] { DebugPrint("## Vector_test2 vector scalar subtraction failed\n"); ok = 1; }); @@ -135,7 +135,7 @@ Scalar Vector_test2() { Vector expected(std::vector{ 1.2f * 3.4f, 2.3f * 3.4f }); Vector actual = v * testScalar; - If(Verify(actual, expected) != 0, [&] { + If(VerifySame(actual, expected) != 0, [&] { DebugPrint("## Vector_test2 vector scalar multiplication failed\n"); ok = 1; }); @@ -143,7 +143,7 @@ Scalar Vector_test2() { Vector expected(std::vector{ 1.2f / 3.4f, 2.3f / 3.4f }); Vector actual = v / testScalar; - If(Verify(actual, expected) != 0, [&] { + If(VerifySame(actual, expected) != 0, [&] { DebugPrint("## Vector_test2 vector scalar division failed\n"); ok = 1; }); @@ -153,7 +153,7 @@ Scalar Vector_test2() { Vector expected(std::vector{ 1.2f + 0.1f, 2.3f + 1.2f }); Vector actual = v + testVector; - If(Verify(actual, expected) != 0, [&] { + If(VerifySame(actual, expected) != 0, [&] { DebugPrint("## Vector_test2 vector+vector failed\n"); ok = 1; }); @@ -161,7 +161,7 @@ Scalar Vector_test2() { Vector expected(std::vector{ 1.2f - 0.1f, 2.3f - 1.2f }); Vector actual = v - testVector; - If(Verify(actual, expected) != 0, [&] { + If(VerifySame(actual, expected) != 0, [&] { DebugPrint("## Vector_test2 vector-vector failed\n"); ok = 1; }); @@ -185,7 +185,7 @@ Scalar Vector_test3() Vector e = std::vector{ 1, 1, 1, 1, 1, 1, 2, 2, 2 }; - 
If(Verify(v, e) != 0, [&] { + If(VerifySame(v, e) != 0, [&] { DebugPrint("## Vector_test3 subvector assignment failed\n"); ok = 1; }); diff --git a/libraries/value/test/src/main.cpp b/libraries/value/test/src/main.cpp index f84efa886..b52c361e3 100644 --- a/libraries/value/test/src/main.cpp +++ b/libraries/value/test/src/main.cpp @@ -2,10 +2,15 @@ // // Project: Embedded Learning Library (ELL) // File: main.cpp (value) -// Authors: Kern Handa, Chuck Jacobs +// Authors: Kern Handa, Chuck Jacobs, Mason Remy // //////////////////////////////////////////////////////////////////////////////////////////////////// +#include "CachingStrategy_test.h" +#include "Functions_test.h" +#include "LoopNestAPI_test.h" +#include "LoopNest_convolution_test.h" +#include "LoopNest_test.h" #include "Matrix_test.h" #include "Scalar_test.h" #include "Tensor_test.h" @@ -13,6 +18,7 @@ #include "Vector_test.h" #include +#include #include #include #include @@ -22,6 +28,7 @@ #include #include +#include #include #include @@ -31,6 +38,7 @@ #include #include +using namespace ell::logging; using namespace ell::utilities; using namespace ell::value; @@ -60,11 +68,9 @@ void PrintIR(TestLLVMContext& context) #endif // PRINT_IR } -extern "C" -{ +extern "C" { void JittedDebugPrintInts(int* ints, int* len) { - std::cout << std::setprecision(6); for (int i = 0; i < *len; i++) { if (i > 0) @@ -185,7 +191,7 @@ void DebugPrintVector(Vector message) .Parameters( Value(message.GetType(), MemoryLayout{ { size } }), Value(ValueType::Int32, ScalarLayout)) - .Decorated(FunctionDecorated::No); + .Decorated(false); printFunction.Call(message, Scalar{ size }); }); } @@ -195,19 +201,19 @@ void DebugPrintScalar(Scalar value) InvokeForContext([&] { std::visit( [](auto&& data) { - using Type = std::decay_t; - if constexpr (IsOneOf) - { - throw LogicException(LogicExceptionErrors::notImplemented); - } - else - { - std::copy( - data, - data + 1, - std::ostream_iterator>(std::cout, ", ")); - } - }, + using Type = std::decay_t; + if constexpr (IsOneOf) + { + throw LogicException(LogicExceptionErrors::notImplemented); + } + else + { + std::copy( + data, + data + 1, + std::ostream_iterator>(std::cout, ", ")); + } + }, value.GetValue().GetUnderlyingData()); }); @@ -231,10 +237,10 @@ void DebugPrintScalar(Scalar value) return; } auto printFunction = FunctionDeclaration(fnName) - .Parameters( - Value(value.GetType(), MemoryLayout{ { 1 } }), - Value(ValueType::Int32, ScalarLayout)) - .Decorated(FunctionDecorated::No); + .Parameters( + Value(value.GetType(), MemoryLayout{ { 1 } }), + Value(ValueType::Int32, ScalarLayout)) + .Decorated(false); printFunction.Call(vector, Scalar{ 1 }); }); } @@ -242,6 +248,8 @@ void DebugPrintScalar(Scalar value) void ComputeTest(std::string testName, std::function defineFunction) { + std::cout << "Running compute test " << testName << std::endl; + // Run the test in the ComputeContext ContextGuard guard("Value_test_compute"); @@ -260,6 +268,8 @@ void ComputeTest(std::string testName, std::function defineFunction) void LLVMJitTest(std::string testName, std::function defineFunction) { + std::cout << "Running LLVM JIT test " << testName << std::endl; + // Run the test in the LLVM context ell::emitters::CompilerOptions compilerSettings; compilerSettings.useBlas = false; @@ -275,6 +285,7 @@ void LLVMJitTest(std::string testName, std::function defineFunction) fn.Define(defineFunction); #if 0 // Useful for debugging, dumps to stderr +#pragma message("DEBUGGING") DebugDump(fn); #endif // 0 @@ -296,6 +307,17 @@ void 
LLVMJitTest(std::string testName, std::function defineFunction) ell::testing::ProcessTest(ell::utilities::FormatString(msg.c_str(), rc), rc == 0); } +void CppEmitterTest(std::string testName, std::function defineFunction) +{ + std::cout << "// Running CppEmitter test " << testName << std::endl; + + ContextGuard guard(testName, Log()); + auto fn = DeclareFunction(testName) + .Returns(Value(ValueType::Int32, ScalarLayout)); + + fn.Define(defineFunction); +} + void RunTest(std::string testName, std::function defineFunction) { try @@ -315,6 +337,15 @@ void RunTest(std::string testName, std::function defineFunction) { ell::testing::ProcessTest(testName + " Jitted LLVM failed with exception, " + e.what(), false); } + + try + { + CppEmitterTest(testName, defineFunction); + } + catch (const std::exception& e) + { + ell::testing::ProcessTest("/*\n" + testName + " CppEmitter test failed with exception, " + e.what() + "\n*/", false); + } } int main() @@ -323,9 +354,12 @@ int main() using namespace utilities; try { -#define ADD_TEST_FUNCTION(a) testFunctions.push_back({ #a, a }); +#define ADD_TEST_FUNCTION(a) testFunctions.push_back({ #a, a }) std::vector>> testFunctions; + // Low-level infrastructure tests + ADD_TEST_FUNCTION(SplitIterationDomain_test1); + // Value tests ADD_TEST_FUNCTION(Basic_test); ADD_TEST_FUNCTION(DebugPrint_test); @@ -344,6 +378,7 @@ int main() ADD_TEST_FUNCTION(Matrix_test1); ADD_TEST_FUNCTION(Matrix_test2); ADD_TEST_FUNCTION(Matrix_test3); + ADD_TEST_FUNCTION(Matrix_test4); ADD_TEST_FUNCTION(Reshape_test); ADD_TEST_FUNCTION(GEMV_test); ADD_TEST_FUNCTION(Tensor_test1); @@ -351,6 +386,8 @@ int main() ADD_TEST_FUNCTION(Tensor_test3); ADD_TEST_FUNCTION(Tensor_slice_test1); + ADD_TEST_FUNCTION(Array_test1); + ADD_TEST_FUNCTION(Casting_test1); ADD_TEST_FUNCTION(Sum_test); ADD_TEST_FUNCTION(Dot_test); @@ -358,6 +395,9 @@ int main() ADD_TEST_FUNCTION(Intrinsics_test2); ADD_TEST_FUNCTION(For_test1); ADD_TEST_FUNCTION(For_test2); + ADD_TEST_FUNCTION(ForInsideIf_test); + ADD_TEST_FUNCTION(While_test); + ADD_TEST_FUNCTION(WhileInsideIf_test); ADD_TEST_FUNCTION(ForRangeCasting_test1); ADD_TEST_FUNCTION(ForRangeCasting_test2); ADD_TEST_FUNCTION(Parallelized_test1); @@ -374,6 +414,247 @@ int main() ADD_TEST_FUNCTION(RefScalarRefRefRefTest); ADD_TEST_FUNCTION(RefMatrixReferenceTest); + ADD_TEST_FUNCTION(Prefetch_parallelized_test1); + ADD_TEST_FUNCTION(Fma_test1); + ADD_TEST_FUNCTION(Fma_test2); + ADD_TEST_FUNCTION(Fma_test3); + ADD_TEST_FUNCTION(UniqueName_test1); + + ADD_TEST_FUNCTION(LoopNest_test1); + ADD_TEST_FUNCTION(LoopNest_test2); + ADD_TEST_FUNCTION(LoopNest_test3); + ADD_TEST_FUNCTION(LoopNest_test4); + ADD_TEST_FUNCTION(LoopNest_test5); + ADD_TEST_FUNCTION(LoopNest_test6); + + ADD_TEST_FUNCTION(LoopNestNonzeroStart_test); + ADD_TEST_FUNCTION(LoopNestBoundary_test1); + ADD_TEST_FUNCTION(LoopNestBoundary_test2); + ADD_TEST_FUNCTION(LoopNestBoundary_test3); + ADD_TEST_FUNCTION(LoopNestBoundary_test4); + ADD_TEST_FUNCTION(LoopNestBoundary_test5); + ADD_TEST_FUNCTION(LoopNestReorder_test1); + ADD_TEST_FUNCTION(LoopNestReorder_test2); + ADD_TEST_FUNCTION(TwoKernel_test); + + ADD_TEST_FUNCTION(LoopNestLastPredicate_test1); + ADD_TEST_FUNCTION(LoopNestLastPredicate_test2); + ADD_TEST_FUNCTION(LoopNestLastPredicate_test3); + ADD_TEST_FUNCTION(LoopNestLastPredicate_test4); + ADD_TEST_FUNCTION(LoopNestBoundaryPredicate_test1); + + ADD_TEST_FUNCTION(MissingIndex_test); + ADD_TEST_FUNCTION(RequiredIndex_test); + ADD_TEST_FUNCTION(SimpleImperfectNest_test); + 
ADD_TEST_FUNCTION(ImperfectNest_test_ijk); + ADD_TEST_FUNCTION(ImperfectNest_test_ikj); + ADD_TEST_FUNCTION(ImperfectNest_test_kij); + ADD_TEST_FUNCTION(ImperfectNest_test_ijkijk); + ADD_TEST_FUNCTION(ImperfectNest_test_kijijk); + ADD_TEST_FUNCTION(ImperfectNest_test_ijkkij); + ADD_TEST_FUNCTION(SplitIndex_test1); + ADD_TEST_FUNCTION(SplitIndex_test2); + ADD_TEST_FUNCTION(SplitIndex_test3); + // ADD_TEST_FUNCTION(EpilogueIndex_test); // ill-defined test + ADD_TEST_FUNCTION(RenameKernelArg_test); + + ADD_TEST_FUNCTION(NonInnermostKernel_test1); + ADD_TEST_FUNCTION(NonInnermostKernel_test2); + ADD_TEST_FUNCTION(NonInnermostKernel_test3); + + // ADD_TEST_FUNCTION(FunctionArgType_test); // currently fails + + ADD_TEST_FUNCTION(CachedMatrix_test1); + ADD_TEST_FUNCTION(CachedMatrix_test1_new); + ADD_TEST_FUNCTION(CachedMatrix_test2); + ADD_TEST_FUNCTION(CachedMatrix_test3); + ADD_TEST_FUNCTION(CachedMatrix_test4); + ADD_TEST_FUNCTION(CachedMatrix_test5); + + ADD_TEST_FUNCTION(LoopNest_Parallelized_test1); + ADD_TEST_FUNCTION(LoopNest_Parallelized_test2); + + ADD_TEST_FUNCTION(LoopNest_Unrolled_test1); + + ADD_TEST_FUNCTION(LoopNest_DebugDump_test1); + ADD_TEST_FUNCTION(LoopNest_DebugDump_test2); + + ADD_TEST_FUNCTION(SimpleMatMult_test); + + ADD_TEST_FUNCTION(LoopNest_api_test1); + ADD_TEST_FUNCTION(LoopNest_api_test2); + ADD_TEST_FUNCTION(LoopNest_api_test3); + ADD_TEST_FUNCTION(LoopNest_api_test4); + ADD_TEST_FUNCTION(LoopNest_api_test5); + ADD_TEST_FUNCTION(LoopNest_api_Parallelized_test1); + ADD_TEST_FUNCTION(LoopNest_api_Parallelized_test2); + ADD_TEST_FUNCTION(LoopNest_api_Unrolled_test1); + ADD_TEST_FUNCTION(LoopNest_api_SetOrder_test1); + // ADD_TEST_FUNCTION(LoopNest_api_CachedMatrix_test1); // Fails + ADD_TEST_FUNCTION(GotoBLASGemmWithRefDeref); + ADD_TEST_FUNCTION(YG12LowLevel_TestBoundary); + + ADD_TEST_FUNCTION(Parallelized_ComputeContext_test1); + + ADD_TEST_FUNCTION(MemCopy_test1); + ADD_TEST_FUNCTION(MemSet_test1); + + // ADD_TEST_FUNCTION(GotoBLASGemm_HighLevelAPI_NoCachingHelper); // currently fails due to unimplemented caching strategy + + ADD_TEST_FUNCTION(NamedLoops_test1); + + // ADD_TEST_FUNCTION(SequenceLogicalAndTest); // Currently fails due to known bug + ADD_TEST_FUNCTION(SequenceLogicalAndTestWithCopy); + ADD_TEST_FUNCTION(OneSplitBoundaryTest); + ADD_TEST_FUNCTION(TwoSplitBoundaryTest); + ADD_TEST_FUNCTION(SplitLargerThanSizeBoundaryTest); + ADD_TEST_FUNCTION(TwoSplitsLargerThanSizeBoundaryTest); + + ADD_TEST_FUNCTION(BLASTCOPY_ValidateOutput_Test1); + ADD_TEST_FUNCTION(BLASTCOPY_ValidateOutput_Test2); + + ADD_TEST_FUNCTION(BLASTCOPY_ValidateMemory_Test1); + ADD_TEST_FUNCTION(BLASTCOPY_ValidateMemory_Test2); + ADD_TEST_FUNCTION(BLASTCOPY_ValidateMemory_Test3); + + ADD_TEST_FUNCTION(BLASTCOPY_ValidateOutput_BoundaryCondition_Test1); + ADD_TEST_FUNCTION(BLASTCOPY_ValidateOutput_BoundaryCondition_Test2); + + ADD_TEST_FUNCTION(BLASTCOPY_ValidateOutput_BoundaryCondition_Test3); + ADD_TEST_FUNCTION(BLASTCOPY_ValidateOutput_BoundaryCondition_Test4); + ADD_TEST_FUNCTION(BLASTCOPY_ValidateOutput_BoundaryCondition_Test5); + ADD_TEST_FUNCTION(BLASTCOPY_ValidateOutput_BoundaryCondition_Test6); + ADD_TEST_FUNCTION(BLASTCOPY_ValidateOutput_BoundaryCondition_Test7); + ADD_TEST_FUNCTION(BLASTCOPY_ValidateOutput_BoundaryCondition_Test8); + ADD_TEST_FUNCTION(BLASTCOPY_ValidateOutput_BoundaryCondition_Test9); + + ADD_TEST_FUNCTION(BLASTCOPY_ValidateMemory_BoundaryCondition_Test1); + ADD_TEST_FUNCTION(BLASTCOPY_ValidateMemory_BoundaryCondition_Test2); + 
ADD_TEST_FUNCTION(BLASTCOPY_ValidateMemory_BoundaryCondition_Test3); + ADD_TEST_FUNCTION(BLASTCOPY_ValidateMemory_BoundaryCondition_Test4); + ADD_TEST_FUNCTION(BLASTCOPY_ValidateMemory_BoundaryCondition_Test5); + ADD_TEST_FUNCTION(BLASTCOPY_ValidateMemory_BoundaryCondition_Test6); + ADD_TEST_FUNCTION(BLASTCOPY_ValidateMemory_BoundaryCondition_Test7); + ADD_TEST_FUNCTION(BLASTCOPY_ValidateMemory_BoundaryCondition_Test8); + ADD_TEST_FUNCTION(BLASTCOPY_ValidateMemory_BoundaryCondition_Test9); + + // Bug: these tests with compile-time constant buffers of input data fail in LLVM JIT / CppEmitter cases only but pass for Compute + // ADD_TEST_FUNCTION(ConvolutionOutput_ValidateOutput_Test1); + // ADD_TEST_FUNCTION(EfficientDirectConvolution_Test1); + // Bug: these tests fail in LLVM JIT case only + // ADD_TEST_FUNCTION(ConvolutionOutput_ValidateOutput_Test1); + // ADD_TEST_FUNCTION(DirectConvolution_Test1); + // ADD_TEST_FUNCTION(ConvolutionInput_ValidateOutput_Test1); + // ADD_TEST_FUNCTION(ConvolutionInput_ValidateOutput_Test2); + // ADD_TEST_FUNCTION(ConvolutionWeight_ValidateOutput_Test1); + // ADD_TEST_FUNCTION(ConvolutionWeight_Reshape_ValidateMemory_Test1); + + ADD_TEST_FUNCTION(GeneralCachingStrategy_ValidateOutput_Test1); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ValidateOutput_Test2); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ValidateOutput_Test3); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ValidateOutput_Test4); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ValidateOutput_Test5); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ValidateOutput_Test6); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ValidateOutput_Test7); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ValidateOutput_Test8); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ValidateOutput_Test9); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ValidateOutput_Test10); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ValidateOutput_Test11); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ValidateOutput_Test12); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ValidateOutput_Test13); + + ADD_TEST_FUNCTION(GeneralCachingStrategy_ValidateMemory_Test1); + + ADD_TEST_FUNCTION(GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test1); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test2); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test3); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test4); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test5); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test6); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test7); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test8); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BoundaryConditionOutput_ValidateOutput_Test9); + + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateOutput_Test1); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateOutput_Test2); + + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateMemory_Test1); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateMemory_Test2); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateMemory_Test3); + + ADD_TEST_FUNCTION(MLAS_GEMM_GeneralCachingStrategy); + + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test1); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test2); + 
ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test3); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test4); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test5); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test6); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test7); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test8); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateOutput_BoundaryCondition_Test9); + + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test1); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test2); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test3); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test4); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test5); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test6); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test7); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test8); + ADD_TEST_FUNCTION(GeneralCachingStrategy_BLASTCOPY_ValidateMemory_BoundaryCondition_Test9); + + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_Test1); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_Test2); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_Test1); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_Test2); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_Test3); + + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test1); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test2); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test3); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test4); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test5); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test6); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test7); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test8); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_SmallBlocks_Test9); + + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test1); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test2); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test3); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test4); + 
ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test5); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test6); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test7); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test8); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateOutput_BoundaryCondition_LargeBlocks_Test9); + + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test1); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test2); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test3); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test4); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test5); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test6); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test7); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test8); + ADD_TEST_FUNCTION(GeneralCachingStrategy_ProgressiveBLASNCopy_ValidateMemory_BoundaryCondition_Test9); + + ADD_TEST_FUNCTION(LoopNest_api_tunable_parameters_test1); +#if !defined(__APPLE__) + ADD_TEST_FUNCTION(ThreadLocalAllocation_test1); +#endif + ADD_TEST_FUNCTION(KernelPredicate_test); + ADD_TEST_FUNCTION(MatMul3_test1); + ADD_TEST_FUNCTION(MatMul3_test2); + ADD_TEST_FUNCTION(LoopNestFuse_test1); + ADD_TEST_FUNCTION(LoopNestFuse_test2); + ADD_TEST_FUNCTION(LoopNestFuse_test3); + ADD_TEST_FUNCTION(ConvertedConstraint_test1); + ADD_TEST_FUNCTION(ConvertedConstraint_test2); + + ADD_TEST_FUNCTION(FunctionPointer_test1); + for (auto [name, fn] : testFunctions) { RunTest(name, fn); diff --git a/tools/importers/CNTK/cntk_to_ell.py b/tools/importers/CNTK/cntk_to_ell.py index 0da4c5f3a..6af8cc2b5 100644 --- a/tools/importers/CNTK/cntk_to_ell.py +++ b/tools/importers/CNTK/cntk_to_ell.py @@ -200,7 +200,7 @@ def get_node_output_in_ell_order(cntk_node_results): np.float).reshape(original_shape[1], 1, original_shape[0]) elif len(original_shape) == 1: ordered_weights = ordered_weights.ravel().astype( - np.float).reshape(1, 1, original_shape.size) + np.float).reshape(1, 1, cntk_node_results.size) else: raise NotImplementedError( "Unsupported tensor dimensions {}".format(len(original_shape))) @@ -242,7 +242,7 @@ def verify_ell_nodes_in_vision_model(ell_map, cntk_model, cntk_nodes, ordered_im # Feed input to the ELL model _logger.info("Getting computed ELL results") - ell_map.Compute(ell_input_tensor, dtype=np.float32) + ell_map.Compute(ell_input_tensor) # Walk list of importer nodes for importer_node in ordered_importer_nodes: @@ -355,16 +355,15 @@ def verify_ell_output_in_vision_model(ell_map, cntk_model, testing_info): # Get computed ELL result _logger.info("Getting computed ELL results") - result_from_compute = np.array(ell_map.Compute(ell_input_tensor, dtype=np.float32)) + result_from_compute = np.array(ell_map.Compute(ell_input_tensor)) # Get compiled ELL result _logger.info("Getting compiled ELL results") compiler_options = ell.model.MapCompilerOptions() compiler_options.useBlas = True - 
compiled_ell_map = ell_map.Compile("host", "model", "predict", compilerOptions=compiler_options, - dtype=np.float32) + compiled_ell_map = ell_map.Compile("host", "model", "predict", compilerOptions=compiler_options) - result_from_compiled = np.array(compiled_ell_map.Compute(ell_input_tensor, dtype=np.float32)) + result_from_compiled = np.array(compiled_ell_map.Compute(ell_input_tensor)) # Verify the computed result against the cntk result np.testing.assert_array_almost_equal( @@ -406,7 +405,7 @@ def verify_ell_output_in_vision_model(ell_map, cntk_model, testing_info): def verify_compiled_ell_nodes_in_vision_model(modelFile, cntk_model, model_cntk_nodes, ordered_importer_nodes, - step_interval_msec=0, lag_threshold_msec=0, plot_model=False, + step_interval_msec=None, lag_threshold_msec=None, plot_model=False, verify_model={"audio": False, "vision": False}): _logger = logger.get() @@ -450,7 +449,7 @@ def verify_compiled_ell_nodes_in_vision_model(modelFile, cntk_model, model_cntk_ # Feed input to the ELL model _logger.info("Getting computed ELL results") - ell_map.Compute(ell_input_tensor, dtype=np.float32) + ell_map.Compute(ell_input_tensor) model_clone = None if cntk_node.op_name != "UserFunction": model_clone = cntk_node.clone(CloneMethod.clone) @@ -470,10 +469,9 @@ def verify_compiled_ell_nodes_in_vision_model(modelFile, cntk_model, model_cntk_ _logger.info("Getting compiled ELL results") compiler_options = ell.model.MapCompilerOptions() compiler_options.useBlas = True - compiled_ell_map = ell_map.Compile("host", "model", "predict", compilerOptions=compiler_options, - dtype=np.float32) + compiled_ell_map = ell_map.Compile("host", "model", "predict", compilerOptions=compiler_options) - result_from_compiled = np.array(compiled_ell_map.Compute(ell_input_tensor, dtype=np.float32)) + result_from_compiled = np.array(compiled_ell_map.Compute(ell_input_tensor)) output_shape = cntk_output.shape if (len(output_shape) == 3): if cntk_output.size == result_from_compiled.size: @@ -488,6 +486,13 @@ def verify_compiled_ell_nodes_in_vision_model(modelFile, cntk_model, model_cntk_ result_from_compiled = result_from_compiled[padding:output_shape[0] + padding, padding:output_shape[1] + padding, :] + # Put the ELL results into same order as CNTK + # if prefix_ordered_importer_nodes[-1].output_shapes[0][1] == "channel_row_column": + print(result_from_compiled.shape, cntk_output.shape) + # result_from_compiled = memory_shapes.get_tensor_in_ell_order(result_from_compiled, "xyz") + # print(result_from_compiled) + # print(cntk_output) + # Compare results. Some layers have large numbers (e.g > 500.734) and some small numbers # (e.g. 0.0038453). To make the comparison more resilient and meaningful for large numbers, # normalize before comparing, since comparison is being done on significant digits. 
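A minimal sketch of that normalize-then-compare step (an illustrative helper, not the importer's actual code; assumes numpy arrays):

```
import numpy as np

def assert_almost_equal_normalized(expected, actual, decimals=4):
    # Scale both arrays by the largest magnitude in the expected output, so the
    # significant-digit comparison is meaningful for layers producing values
    # larger than 500 as well as layers producing values smaller than 0.01.
    scale = max(np.abs(expected).max(), 1e-12)  # guard against an all-zero output
    np.testing.assert_array_almost_equal(expected / scale, actual / scale, decimals)
```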
@@ -553,8 +558,9 @@ def map_from_cntk_model_using_new_engine(modelFile, step_interval_msec=None, lag testing_info = {"apply_softmax": False} try: ordered_importer_nodes, node_mapping = importer_engine.get_importer_node_to_ell_mapping() - verify_ell_nodes_in_vision_model(ell_map, cntk_model, cntk_nodes, ordered_importer_nodes, node_mapping, - testing_info) + # RESTORE: + # verify_ell_nodes_in_vision_model(ell_map, cntk_model, cntk_nodes, ordered_importer_nodes, node_mapping, + # testing_info) verify_compiled_ell_nodes_in_vision_model(modelFile, cntk_model, cntk_nodes, ordered_importer_nodes, verify_model=verify_model) verify_ell_output_in_vision_model(ell_map, cntk_model, testing_info) diff --git a/tools/importers/torch/test/CMakeLists.txt b/tools/importers/torch/test/CMakeLists.txt index f32079e46..a0a9c2ea0 100644 --- a/tools/importers/torch/test/CMakeLists.txt +++ b/tools/importers/torch/test/CMakeLists.txt @@ -21,6 +21,7 @@ if(${PYTHON_ENABLED}) # copy files copy_newer_files(${test_name} test_src) - add_test(NAME ${test_name} COMMAND ${PYTHON_EXECUTABLE} -m unittest torch_importer_test.py) + # disabled until we merge with master + # add_test(NAME ${test_name} COMMAND ${PYTHON_EXECUTABLE} -m unittest torch_importer_test.py) -endif() # PYTHON_ENABLED \ No newline at end of file +endif() # PYTHON_ENABLED diff --git a/tools/trainers/forestTrainer/CMakeLists.txt b/tools/trainers/forestTrainer/CMakeLists.txt index c80391081..ac109e4df 100644 --- a/tools/trainers/forestTrainer/CMakeLists.txt +++ b/tools/trainers/forestTrainer/CMakeLists.txt @@ -27,5 +27,5 @@ set (GLOBAL_BIN_DIR ${CMAKE_BINARY_DIR}/bin) set (test_name ${tool_name}_test) add_test(NAME ${test_name} WORKING_DIRECTORY ${GLOBAL_BIN_DIR} - COMMAND ${tool_name} -idf ${CMAKE_SOURCE_DIR}/examples/data/tinyTestData.txt -dd auto -omf null -v) + COMMAND ${tool_name} -idf ${ELL_ROOT}/examples/data/tinyTestData.txt -dd auto -omf null -v) set_test_library_path(${test_name}) diff --git a/tools/trainers/linearTrainer/CMakeLists.txt b/tools/trainers/linearTrainer/CMakeLists.txt index 42f09cb8b..5ac6e250c 100644 --- a/tools/trainers/linearTrainer/CMakeLists.txt +++ b/tools/trainers/linearTrainer/CMakeLists.txt @@ -31,65 +31,65 @@ set (GLOBAL_BIN_DIR ${CMAKE_BINARY_DIR}/bin) set (test_name ${tool_name}_test_0) add_test(NAME ${test_name} WORKING_DIRECTORY ${GLOBAL_BIN_DIR} - COMMAND ${tool_name} --inputDataFilename ${CMAKE_SOURCE_DIR}/examples/data/tinyTestData.txt -dd 3 -r 0.01 --outputModelFilename linearTrainer_model_1.model -v -ne 20 --lossFunction log) + COMMAND ${tool_name} --inputDataFilename ${ELL_ROOT}/examples/data/tinyTestData.txt -dd 3 -r 0.01 --outputModelFilename linearTrainer_model_1.model -v -ne 20 --lossFunction log) set_test_library_path(${test_name}) set (test_name ${tool_name}_test_1) add_test(NAME ${test_name} WORKING_DIRECTORY ${GLOBAL_BIN_DIR} - COMMAND ${tool_name} --inputDataFilename ${CMAKE_SOURCE_DIR}/examples/data/testData.txt --inputModelFilename ${CMAKE_BINARY_DIR}/examples/models/model_3.model --modelInputs 1053 --modelOutputs 1060.output -dd 3 -r 0.01 --outputModelFilename linearTrainer_model_3.model -v -ne 20 --lossFunction log) + COMMAND ${tool_name} --inputDataFilename ${ELL_ROOT}/examples/data/testData.txt --inputModelFilename ${CMAKE_BINARY_DIR}/examples/models/model_3.model --modelInputs 1053 --modelOutputs 1060.output -dd 3 -r 0.01 --outputModelFilename linearTrainer_model_3.model -v -ne 20 --lossFunction log) set_test_library_path(${test_name}) set (test_name ${tool_name}_test_2) add_test(NAME ${test_name} 
WORKING_DIRECTORY ${GLOBAL_BIN_DIR} - COMMAND ${tool_name} --inputDataFilename ${CMAKE_SOURCE_DIR}/examples/data/testData.txt -dd 3 -lf log -v -ne 30 -r 0.001 -a SGD) + COMMAND ${tool_name} --inputDataFilename ${ELL_ROOT}/examples/data/testData.txt -dd 3 -lf log -v -ne 30 -r 0.001 -a SGD) set_test_library_path(${test_name}) set (test_name ${tool_name}_test_3) add_test(NAME ${test_name} WORKING_DIRECTORY ${GLOBAL_BIN_DIR} - COMMAND ${tool_name} --inputDataFilename ${CMAKE_SOURCE_DIR}/examples/data/testData.txt -dd 3 -lf hinge -v -ne 30 -r 0.001 -a SGD) + COMMAND ${tool_name} --inputDataFilename ${ELL_ROOT}/examples/data/testData.txt -dd 3 -lf hinge -v -ne 30 -r 0.001 -a SGD) set_test_library_path(${test_name}) set (test_name ${tool_name}_test_4) add_test(NAME ${test_name} WORKING_DIRECTORY ${GLOBAL_BIN_DIR} - COMMAND ${tool_name} --inputDataFilename ${CMAKE_SOURCE_DIR}/examples/data/testData.txt -dd 3 -lf squared -v -ne 30 -r 1 -a SGD) + COMMAND ${tool_name} --inputDataFilename ${ELL_ROOT}/examples/data/testData.txt -dd 3 -lf squared -v -ne 30 -r 1 -a SGD) set_test_library_path(${test_name}) set (test_name ${tool_name}_test_5) add_test(NAME ${test_name} WORKING_DIRECTORY ${GLOBAL_BIN_DIR} - COMMAND ${tool_name} --inputDataFilename ${CMAKE_SOURCE_DIR}/examples/data/testData.txt -dd 3 -lf log -v -ne 30 -r 0.001 -a SparseDataSGD) + COMMAND ${tool_name} --inputDataFilename ${ELL_ROOT}/examples/data/testData.txt -dd 3 -lf log -v -ne 30 -r 0.001 -a SparseDataSGD) set_test_library_path(${test_name}) set (test_name ${tool_name}_test_6) add_test(NAME ${test_name} WORKING_DIRECTORY ${GLOBAL_BIN_DIR} - COMMAND ${tool_name} --inputDataFilename ${CMAKE_SOURCE_DIR}/examples/data/testData.txt -dd 3 -lf hinge -v -ne 30 -r 0.001 -a SparseDataSGD) + COMMAND ${tool_name} --inputDataFilename ${ELL_ROOT}/examples/data/testData.txt -dd 3 -lf hinge -v -ne 30 -r 0.001 -a SparseDataSGD) set_test_library_path(${test_name}) set (test_name ${tool_name}_test_7) add_test(NAME ${test_name} WORKING_DIRECTORY ${GLOBAL_BIN_DIR} - COMMAND ${tool_name} --inputDataFilename ${CMAKE_SOURCE_DIR}/examples/data/testData.txt -dd 3 -lf squared -v -ne 30 -r 1 -a SparseDataSGD) + COMMAND ${tool_name} --inputDataFilename ${ELL_ROOT}/examples/data/testData.txt -dd 3 -lf squared -v -ne 30 -r 1 -a SparseDataSGD) set_test_library_path(${test_name}) set (test_name ${tool_name}_test_8) add_test(NAME ${test_name} WORKING_DIRECTORY ${GLOBAL_BIN_DIR} - COMMAND ${tool_name} --inputDataFilename ${CMAKE_SOURCE_DIR}/examples/data/testData.txt -dd 3 -lf log -v -ne 30 -r 0.01 -a SparseDataCenteredSGD) + COMMAND ${tool_name} --inputDataFilename ${ELL_ROOT}/examples/data/testData.txt -dd 3 -lf log -v -ne 30 -r 0.01 -a SparseDataCenteredSGD) set_test_library_path(${test_name}) set (test_name ${tool_name}_test_9) add_test(NAME ${test_name} WORKING_DIRECTORY ${GLOBAL_BIN_DIR} - COMMAND ${tool_name} --inputDataFilename ${CMAKE_SOURCE_DIR}/examples/data/testData.txt -dd 3 -lf hinge -v -ne 30 -r 0.01 -a SparseDataCenteredSGD) + COMMAND ${tool_name} --inputDataFilename ${ELL_ROOT}/examples/data/testData.txt -dd 3 -lf hinge -v -ne 30 -r 0.01 -a SparseDataCenteredSGD) set_test_library_path(${test_name}) set (test_name ${tool_name}_test_10) add_test(NAME ${test_name} WORKING_DIRECTORY ${GLOBAL_BIN_DIR} - COMMAND ${tool_name} --inputDataFilename ${CMAKE_SOURCE_DIR}/examples/data/testData.txt -dd 3 -lf squared -v -ne 30 -r 1 -a SparseDataCenteredSGD) + COMMAND ${tool_name} --inputDataFilename ${ELL_ROOT}/examples/data/testData.txt -dd 3 -lf squared -v -ne 30 -r 1 
-a SparseDataCenteredSGD) set_test_library_path(${test_name}) diff --git a/tools/trainers/protoNNTrainer/CMakeLists.txt b/tools/trainers/protoNNTrainer/CMakeLists.txt index e5815a3c2..26c92e8d8 100644 --- a/tools/trainers/protoNNTrainer/CMakeLists.txt +++ b/tools/trainers/protoNNTrainer/CMakeLists.txt @@ -31,5 +31,5 @@ set (GLOBAL_BIN_DIR ${CMAKE_BINARY_DIR}/bin) set (test_name ${tool_name}_test_0) add_test(NAME ${test_name} WORKING_DIRECTORY ${GLOBAL_BIN_DIR} - COMMAND ${tool_name} --inputDataFilename ${CMAKE_SOURCE_DIR}/examples/data/protonnTestData.txt -dd auto -sw 1 -sb 1 -sz 1 -pd 10 -l 2 -mp 5 -v --evaluationFrequency 1 -plf L2) + COMMAND ${tool_name} --inputDataFilename ${ELL_ROOT}/examples/data/protonnTestData.txt -dd auto -sw 1 -sb 1 -sz 1 -pd 10 -l 2 -mp 5 -v --evaluationFrequency 1 -plf L2) set_test_library_path(${test_name}) diff --git a/tools/trainers/sweepingSGDTrainer/CMakeLists.txt b/tools/trainers/sweepingSGDTrainer/CMakeLists.txt index 207bfa1ac..464ceb8ee 100644 --- a/tools/trainers/sweepingSGDTrainer/CMakeLists.txt +++ b/tools/trainers/sweepingSGDTrainer/CMakeLists.txt @@ -27,5 +27,5 @@ set (GLOBAL_BIN_DIR ${CMAKE_BINARY_DIR}/bin) set (test_name ${tool_name}_test) add_test(NAME ${test_name} WORKING_DIRECTORY ${GLOBAL_BIN_DIR} - COMMAND ${tool_name} -idf ${CMAKE_SOURCE_DIR}/examples/data/testData.txt -dd 21 -omf sweepingSgdTrainer_model.xml -v -lf log) + COMMAND ${tool_name} -idf ${ELL_ROOT}/examples/data/testData.txt -dd 21 -omf sweepingSgdTrainer_model.xml -v -lf log) set_test_library_path(${test_name}) diff --git a/tools/utilities/finetune/CMakeLists.txt b/tools/utilities/finetune/CMakeLists.txt index 62a7a4a44..fbfa8db5a 100644 --- a/tools/utilities/finetune/CMakeLists.txt +++ b/tools/utilities/finetune/CMakeLists.txt @@ -5,7 +5,7 @@ # finetune tool set(tool_name finetune) -set(common_src +set(common_src src/DataStatistics.cpp src/DataUtils.cpp src/FineTuneArguments.cpp @@ -43,13 +43,13 @@ source_group("include" FILES ${include}) # create executable in build\bin set (GLOBAL_BIN_DIR ${CMAKE_BINARY_DIR}/bin) -set (EXECUTABLE_OUTPUT_PATH ${GLOBAL_BIN_DIR}) +set (EXECUTABLE_OUTPUT_PATH ${GLOBAL_BIN_DIR}) add_executable(${tool_name} ${docs} ${src} ${main_src} ${include}) target_include_directories(${tool_name} PRIVATE include ${ELL_LIBRARIES_DIR}) target_link_libraries(${tool_name} common data dsp emitters evaluators functions model nodes optimization passes predictors utilities) copy_shared_libraries(${tool_name}) -# put this project in the utilities folder in the IDE +# put this project in the utilities folder in the IDE set_property(TARGET ${tool_name} PROPERTY FOLDER "tools/trainers") # @@ -80,9 +80,6 @@ set(test_include test/include/TestModelUtils.h test/include/TestOptimizationUtils.h test/include/TestTransformData.h -) - -set(test_tcc ) source_group("src" FILES ${test_src}) diff --git a/tools/utilities/nodeTiming/gemmCodeNode/.gitignore b/tools/utilities/nodeTiming/gemmCodeNode/.gitignore new file mode 100644 index 000000000..c28f134eb --- /dev/null +++ b/tools/utilities/nodeTiming/gemmCodeNode/.gitignore @@ -0,0 +1,2 @@ +models/ +test_*/ diff --git a/tools/utilities/nodeTiming/gemmCodeNode/README.md b/tools/utilities/nodeTiming/gemmCodeNode/README.md new file mode 100644 index 000000000..3db2bf5cd --- /dev/null +++ b/tools/utilities/nodeTiming/gemmCodeNode/README.md @@ -0,0 +1,46 @@ +The `nodeTiming/gemmCodeNode` directory contains tools for building ELL models with different Matrix-Matrix multiplication implementations and measuring the performance of those 
implementations.
+
+Layout of gemmCodeNode:
+
+```
+deploy/ -- contains files to be copied without alteration into implementation timing directories
+    full_pass.(cmd|sh) -- runs import_all.(cmd|sh), build_all.(cmd|sh), and run_all.(cmd|sh) in a timing directory
+    run.py -c N -- runs run_all.(cmd|sh) N times (default 1) and runs the timing_aggregator processing on the results
+    timing_aggregator.py -f filename -- reads filename and parses it for gemm time output. For each uniquely named gemm impl it finds, it aggregates the times and reports the range, the average, and the ratio of that impl's average time against the best (fastest) average (a sketch of this parsing follows below)
+
+scripts/ -- contains scripts for building ELL models with various GEMM impls and generating the test projects for building, running, and timing the performance of the models
+    build_gemm_models.py -- builds ELL models with the specified GEMM implementation and panel/kernel parameters
+    make_default_models.py -- builds ELL models for all of the different GEMM implementations, plus the OpenBLAS, naive for-loop, and (if -mkl is specified and MKL is installed) MKL implementations
+    special_model_args.py -- contains a dictionary mapping model file names to lists of additional arguments to pass to wrap.py for those models when importing them
+    build_tests.py -- generates the testing and timing projects for the given models and implementations (specified with the -v option)
+
+src/ -- contains source files and template source files for the testing and timing projects
+    CMakeLists.txt.in -- CMakeLists.txt template file for the timing project; scripts/build_tests.py reads this template and produces CMakeLists.txt.
+    Runner.cpp.in -- Runner.cpp template file for the timing project; scripts/build_tests.py reads this template and produces Runner.cpp, the main cpp file that runs and times the model.
+```
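For illustration, a minimal sketch of timing_aggregator.py's parse-and-compare logic (it uses the same regex and ratio-against-best computation as the script itself; results.txt is a stand-in for a captured run log):

```
import re

# Each timed run prints lines like "gemm5 time = 0.0123"; bucket the times by
# implementation name, then compare each implementation's average to the best.
# Assumes at least one matching line is present in the log.
times = {}
with open("results.txt") as f:
    for line in f:
        match = re.match("(gemm.*) time = (.*)", line)
        if match:
            times.setdefault(match.group(1), []).append(float(match.group(2)))

best = min(sum(v) / len(v) for v in times.values())
for key, values in times.items():
    avg = sum(values) / len(values)
    print(key, "avg:", avg, "range:", (min(values), max(values)), "ratio vs best:", avg / best)
```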
+
+General workflow with these tools:
+```
+# Modify and introduce a new GEMM implementation
+<< build ELL >>
+
+cd <ELL root>/tools/utilities/nodeTiming/gemmCodeNode
+
+# Generate ELL models for each GEMM implementation in the ./models/ directory
+# This only needs to be done whenever a new GEMM implementation is added, but not when an existing implementation is modified
+python scripts/make_default_models.py
+
+# Generate the testing projects for the GEMM implementations being tested
+# Suppose implementations 2, 5, 7, and 14 are being tested and the directory to put the test projects in is named my_testing_dir
+# To include mkl, provide the -mkl flag
+python scripts/build_tests.py -v 2 5 7 14 -o my_testing_dir
+
+# cd into the test directory for the data type and size that you want to test
+cd my_testing_dir/float/256x256
+
+# Run import_all, build_all, and run_all:
+./full_pass.(cmd|sh)
+
+# To get aggregate timing over 20 runs:
+python ./run.py -c 20
+```
\ No newline at end of file
diff --git a/tools/utilities/nodeTiming/gemmCodeNode/deploy/full_pass.cmd b/tools/utilities/nodeTiming/gemmCodeNode/deploy/full_pass.cmd
new file mode 100644
index 000000000..444e95208
--- /dev/null
+++ b/tools/utilities/nodeTiming/gemmCodeNode/deploy/full_pass.cmd
@@ -0,0 +1,12 @@
+@echo off
+REM ####################################################################################################
+REM #
+REM # Project: Embedded Learning Library (ELL)
+REM # File: full_pass.cmd
+REM # Authors: Mason Remy
+REM #
+REM ####################################################################################################
+
+call import_all.cmd
+call build_all.cmd
+call run_all.cmd
\ No newline at end of file
diff --git a/tools/utilities/nodeTiming/gemmCodeNode/deploy/full_pass.sh b/tools/utilities/nodeTiming/gemmCodeNode/deploy/full_pass.sh
new file mode 100755
index 000000000..b94f1746c
--- /dev/null
+++ b/tools/utilities/nodeTiming/gemmCodeNode/deploy/full_pass.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+####################################################################################################
+#
+# Project: Embedded Learning Library (ELL)
+# File: full_pass.sh
+# Authors: Mason Remy
+#
+####################################################################################################
+
+chmod +x import_all.sh
+./import_all.sh
+
+chmod +x build_all.sh
+./build_all.sh
+
+chmod +x run_all.sh
+./run_all.sh
diff --git a/tools/utilities/nodeTiming/gemmCodeNode/deploy/run.py b/tools/utilities/nodeTiming/gemmCodeNode/deploy/run.py
new file mode 100755
index 000000000..bf486757d
--- /dev/null
+++ b/tools/utilities/nodeTiming/gemmCodeNode/deploy/run.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python3
+####################################################################################################
+#
+# Project: Embedded Learning Library (ELL)
+# File: run.py
+# Authors: Mason Remy
+#
+# Requires: Python 3.x
+#
+####################################################################################################
+
+import argparse
+import subprocess
+import platform
+import os
+import timing_aggregator
+
+script_path = os.path.dirname(os.path.realpath(__file__))
+
+def run():
+    platform_run_script = "run_all.sh"
+    if platform.system() == "Windows":
+        platform_run_script = "run_all.cmd"
+    run_full_path = os.path.join(script_path, platform_run_script)
+    results = subprocess.run([run_full_path], stdout=subprocess.PIPE)
+    split_lines = results.stdout.decode("utf-8").split(os.linesep)
+    return split_lines
+
+if __name__ == "__main__":
+    parser
= argparse.ArgumentParser() + parser.add_argument("-c", "--count", default=1, type=int) + args = parser.parse_args() + + accumulated_result_lines = [] + print("Running {} time(s)...".format(args.count)) + for i in range(args.count): + accumulated_result_lines.extend(run()) + print("{}/{} complete".format(i + 1, args.count)) + + results_dict = timing_aggregator.parse_output(accumulated_result_lines) + timing_aggregator.print_results(results_dict) \ No newline at end of file diff --git a/tools/utilities/nodeTiming/gemmCodeNode/deploy/timing_aggregator.py b/tools/utilities/nodeTiming/gemmCodeNode/deploy/timing_aggregator.py new file mode 100755 index 000000000..72b437f90 --- /dev/null +++ b/tools/utilities/nodeTiming/gemmCodeNode/deploy/timing_aggregator.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python3 +#################################################################################################### +# +# Project: Embedded Learning Library (ELL) +# File: timing_aggregator.py +# Authors: Mason Remy +# +# Requires: Python 3.x +# +#################################################################################################### + +import re +import argparse +import operator + +class TimingResult: + key = "" + count = 0 + result_values = [] + average = 0 + result_range = () + + def __init__(self, key="", raw_results=[]): + self.key = key + self.result_values = raw_results + self.count = len(raw_results) + if self.count > 0: + self.average = sum(raw_results) / self.count + self.result_range = (min(raw_results), max(raw_results)) + + def print_summary(self, indenting=0): + indent = "\t" * indenting + print("{}{}".format(indent, self.key)) + print("{}\tAvg = {}".format(indent, self.average)) + print("{}\tRange = {}".format(indent, self.result_range)) + print("{}\tCount = {}".format(indent, self.count)) + + def print_raw_results(self, indenting=0): + indent = "\t" * indenting + print("{}{} times:".format(indent, self.key)) + for result in self.result_values: + print("{}\t{}".format(indent, result)) + +def parse_output(output_lines): + timing_pattern = "(gemm.*) time = (.*)" + timing_regex_matcher = re.compile(timing_pattern) + time_dict = {} + for line in output_lines: + match = timing_regex_matcher.match(line) + if match: + key = match.group(1) + value = float(match.group(2)) + if key in time_dict: + time_dict[key].append(value) + else: + time_dict[key] = [value] + results = {} + for key in time_dict: + results[key] = TimingResult(key, time_dict[key]) + return results + +def get_ratios_against_best(timing_results_dict): + best_time = None + for key in timing_results_dict: + if best_time == None or best_time > timing_results_dict[key].average: + best_time = timing_results_dict[key].average + ratios_dict = {} + for key in timing_results_dict: + ratios_dict[key] = timing_results_dict[key].average / best_time + ordered_key_ratio_pairs = sorted(ratios_dict.items(), key=operator.itemgetter(1)) + return ordered_key_ratio_pairs + +def print_results(results_dict, include_raw_results=True, include_statistics=True, include_ratios=True): + for key in results_dict: + if include_raw_results: + results_dict[key].print_raw_results() + print() + if include_statistics: + results_dict[key].print_summary() + print() + if include_ratios: + ratios_list = get_ratios_against_best(results_dict) + print("Ratios:") + for (key, ratio) in ratios_list: + print("{} : {}".format(key, ratio)) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-f", "--file", required=True) + args = 
parser.parse_args() + + with open(args.file, 'r') as f: + lines = f.readlines() + results_dict = parse_output(lines) + print_results(results_dict) diff --git a/tools/utilities/nodeTiming/gemmCodeNode/scripts/build_gemm_models.py b/tools/utilities/nodeTiming/gemmCodeNode/scripts/build_gemm_models.py new file mode 100755 index 000000000..ff12131da --- /dev/null +++ b/tools/utilities/nodeTiming/gemmCodeNode/scripts/build_gemm_models.py @@ -0,0 +1,128 @@ +#!/usr/bin/env python3 +#################################################################################################### +# +# Project: Embedded Learning Library (ELL) +# File: build_gemm_models.py +# Authors: Mason Remy +# +# Requires: Python 3.x +# +#################################################################################################### + +import os +import sys + +script_path = os.path.dirname(os.path.realpath(__file__)) +sys.path += [os.path.join(script_path, "../../../pythonlibs")] + +import find_ell + +ell_build_root = find_ell.find_ell_build() +sys.path += [os.path.join(ell_build_root, "interfaces", "python", "package")] + +import ell +import random +import argparse +import numpy as np + +type_mapping = { + "double": ell.nodes.PortType.real, + "float": ell.nodes.PortType.smallReal +} + +def make_matrix(rows, cols, use_fixed_seed, data_type_str): + if use_fixed_seed: + np.random.seed(0) + return ell.math.DoubleVector(np.random.rand(rows * cols)) + +def build_model(output_filename, use_fallback, gemm_impl, M=256, N=256, K=256, kernel_size=[1, 1, 1], cache_sizes=[64, 64, 64], data_type_str="double", ignore_correctness=False): + model = ell.model.Model() + mb = ell.model.ModelBuilder() + + data_type = type_mapping[data_type_str] + + # input node is the left matrix in the matrix multiplication + input_layout = ell.model.PortMemoryLayout([M, K]) + input_node = mb.AddInputNode(model, input_layout, data_type) + input_node_output = ell.nodes.PortElements(input_node.GetOutputPort("output")) + + realign_node = mb.AddOutputNode(model, input_layout, input_node_output) + realign_node_output = ell.nodes.PortElements(realign_node.GetOutputPort("output")) + + right_matrix = make_matrix(K, N, not ignore_correctness, data_type_str) + right_constant_memory_layout = ell.model.PortMemoryLayout([K, N]) + right_constant_node = mb.AddConstantNode(model, right_matrix, right_constant_memory_layout, data_type) + right_node_output = ell.nodes.PortElements(right_constant_node.GetOutputPort("output")) + + gemm_output = None + if use_fallback: + fallback_gemm_node = mb.AddMatrixMatrixMultiplyNode(model, realign_node_output, right_node_output) + gemm_output = ell.nodes.PortElements(fallback_gemm_node.GetOutputPort("output")) + else: + gemm_node = mb.AddMatrixMatrixMultiplyCodeNode(model, realign_node_output, right_node_output, cache_sizes[0], cache_sizes[1], cache_sizes[2], kernel_size[0], kernel_size[1], kernel_size[2], gemm_impl) + gemm_output = ell.nodes.PortElements(gemm_node.GetOutputPort("output")) + + # add output node + output_node = mb.AddOutputNode(model, ell.model.PortMemoryLayout([M, N]), gemm_output) + + ell_map = ell.model.Map(model, input_node, ell.nodes.PortElements(output_node.GetOutputPort("output"))) + ell_map.Save(output_filename) + +def build_all_models(output_dir, M=256, N=256, K=256, kernel_size=[4, 4, 4], cache_sizes=[64, 64, 64], data_type_str="double", base_filename="gemm", file_extension="ell", ignore_correctness=False): + gemm_impl_count = ell.nodes.MatrixMatrixMultiplyImplementation.ImplementationCount + for gemm_impl in 
range(gemm_impl_count): + output_filename = os.path.join(output_dir, "{}{}.{}".format(base_filename, gemm_impl, file_extension)) + build_model(output_filename=output_filename, + use_fallback=False, + gemm_impl=gemm_impl, + M=M, + N=N, + K=K, + kernel_size=kernel_size, + cache_sizes=cache_sizes, + data_type_str=data_type_str, + ignore_correctness=ignore_correctness) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-o", "--output_dir", required=True) + parser.add_argument("-M", type=int, default=256) + parser.add_argument("-N", type=int, default=256) + parser.add_argument("-K", type=int, default=256) + parser.add_argument("--type", "-t", choices=list(type_mapping), default="double") + parser.add_argument("--ignore_correctness", help="Don't use a fixed random seed. A fixed random seed is used to validate GEMM results between different implementations", action="store_true") + parser.add_argument("--panel_size", "-ps", type=int, nargs=3, default=[64, 64, 64], help="Panel size values for M, N, and K dimensions") + parser.add_argument("--kernel_size", "-ks", type=int, nargs=3, default=[1, 1, 1], help="Kernel size values for M, N, and K dimensions") + parser.add_argument("--base_filename", default="gemm") + parser.add_argument("--file_extension", default="ell") + + group = parser.add_mutually_exclusive_group() + group.add_argument("--all_impls", "-a", action="store_true") + group.add_argument("--impl", "-i", type=int, default=0) + group.add_argument("--fallback", "-f", action="store_true", help="Use ELL naive for-loops or BLAS impl via MatrixMultiplyNode (non-code-node)") + + args = parser.parse_args() + + if args.all_impls: + build_all_models(output_dir=args.output_dir, + M=args.M, + N=args.N, + K=args.K, + kernel_size=args.kernel_size, + cache_sizes=args.panel_size, + data_type_str=args.type, + base_filename=args.base_filename, + file_extension=args.file_extension, + ignore_correctness=args.ignore_correctness) + else: + output_filename = os.path.join(args.output_dir, "{}.{}".format(args.base_filename, args.file_extension)) + build_model(output_filename=output_filename, + use_fallback=args.fallback, + gemm_impl=args.impl, + M=args.M, + N=args.N, + K=args.K, + kernel_size=args.kernel_size, + cache_sizes=args.panel_size, + data_type_str=args.type, + ignore_correctness=args.ignore_correctness) diff --git a/tools/utilities/nodeTiming/gemmCodeNode/scripts/build_tests.py b/tools/utilities/nodeTiming/gemmCodeNode/scripts/build_tests.py new file mode 100755 index 000000000..3e26db8b2 --- /dev/null +++ b/tools/utilities/nodeTiming/gemmCodeNode/scripts/build_tests.py @@ -0,0 +1,289 @@ +#!/usr/bin/env python3 +#################################################################################################### +# +# Project: Embedded Learning Library (ELL) +# File: build_tests.py +# Authors: Mason Remy +# +# Requires: Python 3.x +# +#################################################################################################### + +import os +import sys +import platform +import argparse +import shutil +from special_model_args import special_model_args +from make_default_models import default_model_dir + +script_path = os.path.dirname(os.path.realpath(__file__)) +sys.path += [os.path.join(script_path, "../../pythonlibs")] + +import find_ell + +win_script_header = "@echo off\n\n" + +def make_cmakelists(srcdir, outdir, dir_name, model_name, run_count=1000, warmup_count=100, additional_libraries=[]): + add_subdirectory_str = "add_subdirectory({})" + + 
diff --git a/tools/utilities/nodeTiming/gemmCodeNode/scripts/build_tests.py b/tools/utilities/nodeTiming/gemmCodeNode/scripts/build_tests.py
new file mode 100755
index 000000000..3e26db8b2
--- /dev/null
+++ b/tools/utilities/nodeTiming/gemmCodeNode/scripts/build_tests.py
@@ -0,0 +1,289 @@
+#!/usr/bin/env python3
+####################################################################################################
+#
+#   Project:  Embedded Learning Library (ELL)
+#   File:     build_tests.py
+#   Authors:  Mason Remy
+#
+#   Requires: Python 3.x
+#
+####################################################################################################
+
+import os
+import sys
+import platform
+import argparse
+import shutil
+from special_model_args import special_model_args
+from make_default_models import default_model_dir
+
+script_path = os.path.dirname(os.path.realpath(__file__))
+sys.path += [os.path.join(script_path, "../../../pythonlibs")]
+
+import find_ell
+
+win_script_header = "@echo off\n\n"
+
+def make_cmakelists(srcdir, outdir, dir_name, model_name, run_count=1000, warmup_count=100, additional_libraries=[]):
+    add_subdirectory_str = "add_subdirectory({})"
+    add_subdirectory_tag = "@ADD_SUBDIRECTORIES@"
+    link_libraries_tag = "@LINK_LIBRARIES@"
+    run_count_tag = "@RUN_COUNT@"
+    warmup_count_tag = "@WARMUP_COUNT@"
+
+    cmake_template_file = os.path.join(srcdir, "CMakeLists.txt.in")
+    output_dir = os.path.join(outdir, dir_name)
+    os.makedirs(output_dir, exist_ok=True)
+    cmake_outfile = os.path.join(outdir, "{}/CMakeLists.txt".format(dir_name))
+    with open(cmake_template_file) as f:
+        template = f.read()
+
+    link_libraries = [model_name] + additional_libraries
+    template = template.replace(add_subdirectory_tag, add_subdirectory_str.format(dir_name))
+    template = template.replace(link_libraries_tag, "{}".format(" ".join(link_libraries)))
+    template = template.replace(run_count_tag, str(run_count))
+    template = template.replace(warmup_count_tag, str(warmup_count))
+    with open(cmake_outfile, 'w', newline='\n') as f:
+        f.write(template)
+
+def make_runner_cpp(srcdir, outdir, dir_name, model_name, data_type):
+    capitalized_model_name = model_name[0].upper() + model_name[1:]
+
+    model_name_tag = "@MODEL_NAME@"
+    model_dir_tag = "@MODEL_DIR@"
+    allcaps_model_name_tag = "@ALLCAPS_MODEL_NAME@"
+    data_type_tag = "@DATA_TYPE@"
+
+    cpp_template_file = os.path.join(srcdir, "Runner.cpp.in")
+    cpp_outfile = os.path.join(outdir, "{}/Runner.cpp".format(dir_name))
+    with open(cpp_template_file) as f:
+        template = f.read()
+    tags_list = [
+        (model_name_tag, model_name),
+        (model_dir_tag, dir_name),
+        (allcaps_model_name_tag, capitalized_model_name),
+        (data_type_tag, data_type)
+    ]
+    for tag, replacement in tags_list:
+        template = template.replace(tag, replacement)
+
+    with open(cpp_outfile, 'w', newline='\n') as f:
+        f.write(template)
+
+def make_build_script(outdir, dir_name, model_name, use_mkl):
+    win_build_script_str = ('mkdir build_{}\n'
+                            'cd build_{}\n'
+                            'cmake -G "Visual Studio 15 2017 Win64" -Thost=x64 {} .. && cmake --build . --config release -- /m /verbosity:minimal\n'
+                            'cd ..')
+    unix_build_script_str = ('mkdir build_{}\n'
+                             'cd build_{}\n'
+                             'cmake .. -DCMAKE_BUILD_TYPE=Release {}\n'
+                             'make -j\n'
+                             'if [[ "$OSTYPE" == "darwin"* ]]; then\n'
+                             '    objdump -d --no-show-raw-insn ./Runner > Runner.s\n'
+                             'else\n'
+                             '    objdump -d -w --no-show-raw-insn -rRSC ./Runner > Runner.s\n'
+                             'fi\n'
+                             'cd ..')
+    use_mkl_str = '-DUSE_MKL=1' if use_mkl else ''
+    win_build_str = win_build_script_str.format(model_name, model_name, use_mkl_str)
+    unix_build_str = unix_build_script_str.format(model_name, model_name, use_mkl_str)
+    win_outfile = os.path.join(outdir, "{}/build.cmd".format(dir_name))
+    unix_outfile = os.path.join(outdir, "{}/build.sh".format(dir_name))
+    with open(win_outfile, 'w') as f:
+        f.write(win_build_str)
+    with open(unix_outfile, 'w', newline='\n') as f:
+        f.write(unix_build_str)
+
+def make_build_all_script(ell_models, outdir):
+    win_build_script_str = ('cd {}\n'
+                            'call build.cmd\n'
+                            'cd ..\n\n')
+    unix_build_script_str = ('cd {}\n'
+                             'chmod +x build.sh\n'
+                             './build.sh\n'
+                             'cd ..\n\n')
+    win_str_list = [win_build_script_str.format(model_name) for (model_name, model_filename, model_relpath, model_path) in ell_models]
+    unix_str_list = [unix_build_script_str.format(model_name) for (model_name, model_filename, model_relpath, model_path) in ell_models]
+    win_outfile = os.path.join(outdir, "build_all.cmd")
+    unix_outfile = os.path.join(outdir, "build_all.sh")
+    with open(win_outfile, 'w') as f:
+        f.write(win_script_header)
+        f.write("\n".join(win_str_list))
+    with open(unix_outfile, 'w', newline='\n') as f:
+        f.write("\n".join(unix_str_list))
+
+def make_run_all_script(ell_models, outdir, target):
+    win_run_script_str = ('cd {}\\build_{}\\Release\n'
+                          'call Runner.exe\n'
+                          'cd ..\\..\\..\n')
+    unix_run_script_str = ('cd {}/build_{}\n'
+                           '{}./Runner\n'
+                           'cd ../..\n')
+    win_str_list = [win_run_script_str.format(model_name, model_name) for (model_name, model_filename, model_relpath, model_path) in ell_models]
+
+    preload_str = ""
+    if target == "pi3":
+        preload_str = "LD_PRELOAD=~/miniconda3/envs/py34/lib/libopenblas.so "
+    unix_str_list = [unix_run_script_str.format(model_name, model_name, preload_str) for (model_name, model_filename, model_relpath, model_path) in ell_models]
+
+    win_outfile = os.path.join(outdir, "run_all.cmd")
+    unix_outfile = os.path.join(outdir, "run_all.sh")
+    with open(win_outfile, 'w') as f:
+        f.write(win_script_header)
+        f.write("\n".join(win_str_list))
+    with open(unix_outfile, 'w', newline='\n') as f:
+        f.write("\n".join(unix_str_list))
+
+def make_clean_all_script(ell_models, outdir):
+    win_clean_str = "rd /s /q {}"
+    unix_clean_str = "rm -rf {}"
+    win_str_list = [win_clean_str.format(model_name) for (model_name, model_filename, model_relpath, model_path) in ell_models]
+    unix_str_list = [unix_clean_str.format(model_name) for (model_name, model_filename, model_relpath, model_path) in ell_models]
+    win_outfile = os.path.join(outdir, "clean_all.cmd")
+    unix_outfile = os.path.join(outdir, "clean_all.sh")
+    with open(win_outfile, 'w') as f:
+        f.write(win_script_header)
+        f.write("\n".join(win_str_list))
+    with open(unix_outfile, 'w', newline='\n') as f:
+        f.write("\n".join(unix_str_list))
+
+def make_import_all_script(ell_models, outdir, language="cpp", target="host", profile=False):
+    mkdir_str = "mkdir {}"
+    ell_root = find_ell.get_ell_root()
+    wrap_path = os.path.abspath(os.path.join(ell_root, "tools", "wrap", "wrap.py"))
+    wrap_str = "python {} -t {} -l {} --llvm_format ir -od {}/{} {} -f {} \n"
+    profile_str = "--profile" if profile else ""
+    str_list = []
+    for (model_name, model_filename, model_relpath, model_path) in ell_models:
+        str_list.append(mkdir_str.format(model_name))
+        last_str = model_path
+        if model_filename in special_model_args:
+            last_str += " " + " ".join(special_model_args[model_filename])
+        str_list.append(wrap_str.format(wrap_path, target, language, model_name, model_name, profile_str, last_str))
+
+    win_outfile = os.path.join(outdir, "import_all.cmd")
+    unix_outfile = os.path.join(outdir, "import_all.sh")
+    with open(win_outfile, 'w') as f:
+        f.write(win_script_header)
+        f.write("\n".join(str_list))
+    with open(unix_outfile, 'w', newline='\n') as f:
+        f.write("\n".join(str_list))
+
+def find_ell_models_under_path(path_to_walk, suffix=".ell"):
+    found_ell_models = []    # List of tuples (name, filename, relpath, fullpath)
+    for root, dirs, files in os.walk(path_to_walk):
+        for filename in files:
+            if filename.endswith(suffix):
+                relpath = os.path.relpath(root, start=path_to_walk)
+                name = filename[:-len(suffix)]
+                found_ell_models.append((name, filename, relpath, os.path.abspath(os.path.join(root, filename))))
+    return found_ell_models
+
+def copy_static_files(input_path, output_path):
+    for file_name in os.listdir(input_path):
+        full_path = os.path.join(input_path, file_name)
+        if os.path.isfile(full_path):
+            shutil.copy(full_path, output_path)
+
+def get_first_path_element(path):
+    first_path_element = None
+    while len(path) > 0:
+        split_relpath = os.path.split(path)
+        path = split_relpath[0]
+        first_path_element = split_relpath[1]
+    return first_path_element
+
+def create_gitignore(path, ignore_contents=["*"]):
+    output_file = os.path.join(path, ".gitignore")
+    with open(output_file, 'w', newline='\n') as f:
+        f.writelines(ignore_contents)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_path", "-m", default=default_model_dir, help="Path to search under for ELL models")
+    parser.add_argument("--variants", "-v", nargs="*", help="Which implementation variants of GEMM to build. Default is to build all", default=None)
+    parser.add_argument("--target", "-t", default="host")
+    parser.add_argument("--language", "-l", default="cpp")
+    parser.add_argument("--outdir", "-o", default="test_output")
+    parser.add_argument("--profile", "-p", action="store_true")
+    parser.add_argument("--run_count", "-r", type=int, default=1000)
+    parser.add_argument("--warmup_count", "-w", type=int, default=100)
+    parser.add_argument("--include_mkl", "-mkl", action="store_true")
+    parser.add_argument("--include_for_loops", action="store_true")
+    parser.add_argument("--additional_libraries", nargs="*", default=[])
+    parser.add_argument("--data_type", "-dt", choices=["float", "double"], default=None)
+    args = parser.parse_args()
+
+    mkl_name = "gemmMKL"
+    blas_name = "gemmBLAS"
+    naive_for_loops_name = "gemmELL"
+    models_using_mkl = [mkl_name]
+    order_precedence = [mkl_name, blas_name, naive_for_loops_name]
+
+    ell_models = find_ell_models_under_path(args.model_path)
+    if len(ell_models) == 0:
+        print("No ELL models found at {}".format(args.model_path))
+        sys.exit()
+
+    # Re-order ell models for better result printing
+    # Sort by the order_precedence list, then sort by N as an integer from the "gemmN" model name
+    ell_models.sort(key=lambda x: order_precedence.index(x[0]) if x[0] in order_precedence else (len(order_precedence) + int(x[0].lstrip("gemm"))))
+
+    if args.variants and len(args.variants) > 0:
+        models_to_include = []
+        if args.include_mkl:
+            # Add MKL first so other scripts can default to comparing against the first results
+            models_to_include.append(mkl_name)
+        models_to_include.append(blas_name)
+        if args.include_for_loops:
+            models_to_include.append(naive_for_loops_name)
+        additional_models_to_include = ["gemm{}".format(value) for value in args.variants]
+        models_to_include.extend(additional_models_to_include)
+        ell_models = list(filter(lambda ell_model_info: ell_model_info[0] in models_to_include, ell_models))
+    else:
+        if not args.include_mkl:
+            ell_models = list(filter(lambda ell_model_info: ell_model_info[0] != mkl_name, ell_models))
+        if not args.include_for_loops:
+            ell_models = list(filter(lambda ell_model_info: ell_model_info[0] != naive_for_loops_name, ell_models))
+
+    # Make the testing output directory and add a .gitignore file to it
+    os.makedirs(args.outdir, exist_ok=True)
+    create_gitignore(args.outdir)
+
+    # Group ELL models by their relative paths to separate the same implementation for different sizes/types
+    rel_path_to_ell_models = {}
+    for (model_name, model_filename, model_relpath, model_path) in ell_models:
+        if model_relpath not in rel_path_to_ell_models:
+            rel_path_to_ell_models[model_relpath] = []
+        rel_path_to_ell_models[model_relpath].append((model_name, model_filename, model_relpath, model_path))
+
+    for rel_path in rel_path_to_ell_models:
+        data_type = args.data_type
+        if data_type is None:
+            # Try to get the data type from the name of the first directory in the model relpath
+            first_path_elt = get_first_path_element(rel_path)
+            if first_path_elt in ["float", "double"]:
+                data_type = first_path_elt
+            else:
+                data_type = "double"    # Default
+
+        outdir = os.path.join(args.outdir, rel_path)
+        current_ell_models = rel_path_to_ell_models[rel_path]
+        os.makedirs(outdir, exist_ok=True)
+
+        copy_static_files(os.path.join(script_path, "..", "deploy"), outdir)
+        make_import_all_script(current_ell_models, outdir, args.language, args.target, args.profile)
+        make_build_all_script(current_ell_models, outdir)
+        make_run_all_script(current_ell_models, outdir, args.target)
+        make_clean_all_script(current_ell_models, outdir)
+        src_dir = os.path.join(script_path, "..", "src")
+        for (model_name, model_filename, model_relpath, model_path) in current_ell_models:
+            os.makedirs(os.path.join(outdir, model_name), exist_ok=True)
+            make_cmakelists(src_dir, outdir, model_name, model_name, args.run_count, args.warmup_count, additional_libraries=args.additional_libraries)
+            make_runner_cpp(src_dir, outdir, model_name, model_name, data_type)
+            make_build_script(outdir, model_name, model_name, model_name in models_using_mkl)

+    print("Created testing utilities at {}".format(args.outdir))
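The generators above all rely on plain @TAG@ string substitution into checked-in template files. A minimal sketch of that pattern, using tag names from the templates (the helper name is illustrative):

def fill_template(template_text, tags):
    # tags maps "@TAG@" placeholders to their replacement strings
    for tag, replacement in tags.items():
        template_text = template_text.replace(tag, replacement)
    return template_text

print(fill_template("add_definitions(-DRUN_COUNT=@RUN_COUNT@ -DWARMUP_COUNT=@WARMUP_COUNT@)",
                    {"@RUN_COUNT@": "1000", "@WARMUP_COUNT@": "100"}))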
diff --git a/tools/utilities/nodeTiming/gemmCodeNode/scripts/make_default_models.py b/tools/utilities/nodeTiming/gemmCodeNode/scripts/make_default_models.py
new file mode 100755
index 000000000..44a394f1b
--- /dev/null
+++ b/tools/utilities/nodeTiming/gemmCodeNode/scripts/make_default_models.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+####################################################################################################
+#
+#   Project:  Embedded Learning Library (ELL)
+#   File:     make_default_models.py
+#   Authors:  Mason Remy
+#
+#   Requires: Python 3.x
+#
+####################################################################################################
+
+import os
+import shutil
+import argparse
+
+import build_gemm_models
+
+script_path = os.path.dirname(os.path.realpath(__file__))
+default_model_dir = os.path.join(script_path, "..", "models")
+
+def make_dirs(base_output_dir, data_types, sizes):
+    os.makedirs(base_output_dir, exist_ok=True)
+    dir_map = {}
+    def add_path(path_dict, base_dir, dir, key):
+        path = os.path.join(base_dir, dir)
+        path_dict[key] = {
+            "path": path
+        }
+        os.makedirs(path, exist_ok=True)
+        return path
+
+    for datatype in data_types:
+        datatype_dir = add_path(dir_map, base_output_dir, datatype, datatype)
+        for size in sizes:
+            size_tag = "{}x{}".format(size, size)
+            add_path(dir_map[datatype], datatype_dir, size_tag, size)
+    return dir_map
+
+def build_loopnest_models(output_dir_map, data_types, sizes):
+    for datatype in data_types:
+        for size in sizes:
+            output_dir = output_dir_map[datatype][size]["path"]
+            build_gemm_models.build_all_models(output_dir=output_dir,
+                                               data_type_str=datatype,
+                                               M=size,
+                                               N=size,
+                                               K=size)
+
+def build_fallback_models(output_dir_map, data_types, sizes):
+    for datatype in data_types:
+        for size in sizes:
+            output_dir = output_dir_map[datatype][size]["path"]
+            output_filename = os.path.join(output_dir, "gemmELL.ell")
+            build_gemm_models.build_model(output_filename=output_filename,
+                                          use_fallback=True,
+                                          gemm_impl=0,
+                                          data_type_str=datatype,
+                                          M=size,
+                                          N=size,
+                                          K=size)
+            blas_filename = os.path.join(output_dir, "gemmBLAS.ell")
+            shutil.copy(output_filename, blas_filename)
+            mkl_filename = os.path.join(output_dir, "gemmMKL.ell")
+            shutil.copy(output_filename, mkl_filename)
+
+if __name__ == "__main__":
+    data_types = ["float", "double"]
+    sizes = [256]
+    dir_map = make_dirs(default_model_dir, data_types, sizes)
+    build_fallback_models(dir_map, data_types, sizes)
+    build_loopnest_models(dir_map, data_types, sizes)
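Note the fallback trick above: gemmBLAS.ell and gemmMKL.ell start as byte-for-byte copies of gemmELL.ell and only diverge at build time (special_model_args disables BLAS for gemmELL; gemmMKL is linked against MKL via -DUSE_MKL=1). A quick sanity check of that invariant, assuming the default output layout:

import filecmp

# The three fallback models are written as copies, so they should be
# byte-identical on disk; only their build/link configuration differs.
base = "models/double/256x256/"
assert filecmp.cmp(base + "gemmELL.ell", base + "gemmBLAS.ell", shallow=False)
assert filecmp.cmp(base + "gemmELL.ell", base + "gemmMKL.ell", shallow=False)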
diff --git a/tools/utilities/nodeTiming/gemmCodeNode/scripts/special_model_args.py b/tools/utilities/nodeTiming/gemmCodeNode/scripts/special_model_args.py
new file mode 100755
index 000000000..5235f993f
--- /dev/null
+++ b/tools/utilities/nodeTiming/gemmCodeNode/scripts/special_model_args.py
@@ -0,0 +1,13 @@
+####################################################################################################
+#
+#   Project:  Embedded Learning Library (ELL)
+#   File:     special_model_args.py
+#   Authors:  Mason Remy
+#
+#   Requires: Python 3.x
+#
+####################################################################################################
+
+special_model_args = {
+    "gemmELL.ell": ["--blas false"]
+}
\ No newline at end of file
diff --git a/tools/utilities/nodeTiming/gemmCodeNode/src/CMakeLists.txt.in b/tools/utilities/nodeTiming/gemmCodeNode/src/CMakeLists.txt.in
new file mode 100644
index 000000000..55dbfc274
--- /dev/null
+++ b/tools/utilities/nodeTiming/gemmCodeNode/src/CMakeLists.txt.in
@@ -0,0 +1,21 @@
+#
+# cmake file for MatrixMatrixMultiplyCodeNode timing scripts
+#
+
+cmake_minimum_required(VERSION 2.8)
+project(MatrixMatrixMultiplyCodeNodeRunner)
+set(CMAKE_CXX_STANDARD 11)
+
+@ADD_SUBDIRECTORIES@
+
+add_definitions(-DRUN_COUNT=@RUN_COUNT@)
+add_definitions(-DWARMUP_COUNT=@WARMUP_COUNT@)
+add_executable(Runner Runner.cpp)
+
+if(MSVC)
+  target_link_libraries(Runner @LINK_LIBRARIES@)
+elseif(${CMAKE_CXX_COMPILER_ID} STREQUAL Clang)
+  target_link_libraries(Runner @LINK_LIBRARIES@ -Wl)
+else()
+  target_link_libraries(Runner @LINK_LIBRARIES@ -Wl,--gc-sections)
+endif()
diff --git a/tools/utilities/nodeTiming/gemmCodeNode/src/Runner.cpp.in b/tools/utilities/nodeTiming/gemmCodeNode/src/Runner.cpp.in
new file mode 100644
index 000000000..24cb5801d
--- /dev/null
+++ b/tools/utilities/nodeTiming/gemmCodeNode/src/Runner.cpp.in
@@ -0,0 +1,134 @@
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Project:  Embedded Learning Library (ELL)
+//  File:     Runner.template.cpp / Runner.cpp
+//  Authors:  Mason Remy
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <chrono>
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#define @MODEL_NAME@_MAIN
+
+#include "@MODEL_DIR@/@MODEL_NAME@.h"
+
+#ifndef RUN_COUNT
+#define RUN_COUNT 1000
+#endif
+
+#ifndef WARMUP_COUNT
+#define WARMUP_COUNT 100
+#endif
+
+using dtype = @DATA_TYPE@;
+
+const unsigned int BUFFER_ROWS = 0;
+
+// makes a row major matrix
+static inline void randomInitMatrix(dtype* A, unsigned int rows, unsigned int cols)
+{
+    for (unsigned int i = 0; i < rows; ++i)
+    {
+        for (unsigned int j = 0; j < cols; ++j)
+        {
+            A[i * cols + j] = (dtype)(rand()) / RAND_MAX;
+        }
+    }
+    for (unsigned int i = rows; i < rows + BUFFER_ROWS; ++i)
+    {
+        for (unsigned int j = 0; j < cols; ++j)
+        {
+            A[i * cols + j] = 0;
+        }
+    }
+}
+
+class Timer
+{
+private:
+    using clock_t = std::chrono::high_resolution_clock;
+    using second_t = std::chrono::duration<double, std::ratio<1> >;
+    std::chrono::time_point<clock_t> _start;
+public:
+    Timer() : _start(clock_t::now()) {}
+    double elapsed() const
+    {
+        return std::chrono::duration_cast<second_t>(clock_t::now() - _start).count();
+    }
+};
+
+void PrintMat(const std::vector<dtype>& data, int rows, int cols)
+{
+    for (int r = 0; r < rows; ++r)
+    {
+        for (int c = 0; c < cols; ++c)
+        {
+            std::cout << std::fixed << data[r * cols + c] << "\t";
+        }
+        std::cout << std::endl;
+    }
+}
+
+template <typename WrapperType>
+double RunCheck(WrapperType& wrapper)
+{
+    TensorShape inputShape = wrapper.GetInputShape();
+    int M = inputShape.columns;
+    int N = inputShape.channels;
+
+    std::vector<dtype> A(M * N);
+    std::vector<dtype> results(M * N);
+
+    // Return the sum of the first result so algorithm correctness can be verified (if the model was built without a fixed seed this is meaningless)
+    randomInitMatrix(A.data(), M, N);
+    results = wrapper.Predict(A);
+    dtype firstValSum = std::accumulate(results.begin(), results.end(), static_cast<dtype>(0));
+    return firstValSum;
+}
+
+template <typename WrapperType>
+double RunTiming(WrapperType& wrapper)
+{
+    TensorShape inputShape = wrapper.GetInputShape();
+    int M = inputShape.columns;
+    int N = inputShape.channels;
+
+    std::vector<dtype> A(M * N);
+    std::vector<dtype> results(M * N);
+
+    randomInitMatrix(A.data(), M, N);
+    for (int i = 0; i < WARMUP_COUNT; i++)
+    {
+        randomInitMatrix(A.data(), M, N);
+        results = wrapper.Predict(A);
+    }
+
+    Timer t;
+    for (int i = 0; i < RUN_COUNT; i++)
+    {
+        randomInitMatrix(A.data(), M, N);
+        results = wrapper.Predict(A);
+    }
+    double duration = t.elapsed();
+
+    return duration;
+}
+
+int main(int argc, char** argv)
+{
+    srand(0);
+
+    @ALLCAPS_MODEL_NAME@Wrapper modelWrapper;
+
+    double correctnessCheck = RunCheck(modelWrapper);
+    double duration = RunTiming(modelWrapper);
+
+    std::cout << "@MODEL_NAME@ time = " << std::fixed << duration << "\tcheck = " << std::fixed << correctnessCheck << std::endl;
+
+    return 0;
+}
\ No newline at end of file
diff --git a/tools/utilities/pitest/drivetest.py b/tools/utilities/pitest/drivetest.py
index d49433f5e..33c8c2626 100644
--- a/tools/utilities/pitest/drivetest.py
+++ b/tools/utilities/pitest/drivetest.py
@@ -43,7 +43,7 @@ def __init__(self, ipaddress=None, cluster=None, outdir=None, profile=False,
                  model=None, labels=None, target="pi3", target_dir="/home/pi/test",
                  username="pi", password="raspberry", iterations=1, expected=None,
                  blas=True, compile=COMPILE_INCREMENTAL, test=True, timeout=None, apikey=None,
-                 gitrepo=None, wrap_options=None):
+                 skip_ellcode=False, gitrepo=None, wrap_options=None):
         self.ipaddress = ipaddress
         self.build_root = find_ell.find_ell_build()
         self.ell_root = os.path.dirname(self.build_root)
@@ -63,6 +63,7 @@ def __init__(self, ipaddress=None, cluster=None, outdir=None, profile=False,
         self.compile = compile
         self.test = test
         self.prediction_time = None
+        self.skip_ellcode = skip_ellcode
         self.logger = logger.get()
         self.rePlatform = "ARMv7.*"
         if target == "pi0":
@@ -295,6 +296,9 @@ def wrap_project(self):
         if self.wrap_options:
             builder_args += ['--'] + self.wrap_options
 
+        if self.skip_ellcode:
+            builder_args.append("--skip_ellcode")
+
         builder.parse_command_line(builder_args)
         builder.run()
 
@@ -370,7 +374,6 @@ def run_test(self):
             sys.path.append(os.path.join(current_path, "..", "..", "wrap", "test"))
             mpp = __import__("wrap_test")
             mpp.make_project(self.output_dir)
-
             cmd = ["python",
                    os.path.join(current_path, "..", "pythonlibs", "vision", "demo.py"),
                    self.labels_file,
diff --git a/tools/utilities/print/src/PrintGraph.cpp b/tools/utilities/print/src/PrintGraph.cpp
index 6e084286a..7fde0a6b7 100644
--- a/tools/utilities/print/src/PrintGraph.cpp
+++ b/tools/utilities/print/src/PrintGraph.cpp
@@ -115,6 +115,8 @@ std::string ToShortString(BinaryOperationType op)
         return "||";
     case BinaryOperationType::logicalXor:
         return "^";
+    case BinaryOperationType::modulo:
+        return "%";
     }
     return "";
 }
diff --git a/tools/utilities/profile/CMakeLists-device-parallel.txt.in b/tools/utilities/profile/CMakeLists-device-parallel.txt.in
index 8212d2676..80e4efd6d 100644
--- a/tools/utilities/profile/CMakeLists-device-parallel.txt.in
+++ b/tools/utilities/profile/CMakeLists-device-parallel.txt.in
@@ -8,7 +8,8 @@ project(profiler)
 
 set(CMAKE_CXX_STANDARD 14)
 
-set(PACKAGE_ROOT @EXTERNAL_DIR@)
+set(PACKAGE_ROOT @ELL_EXTERNAL_DIR@)
+
 include(./OpenBLASSetup.cmake)
 
 if(MSVC)
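The Runner template above measures steady-state throughput: WARMUP_COUNT untimed iterations, then RUN_COUNT timed ones, with input regeneration inside both loops so its cost is incurred uniformly. The same pattern in Python, for reference (predict and make_input are placeholders for a compiled model wrapper and an input generator):

import time

def time_predict(predict, make_input, warmup_count=100, run_count=1000):
    # Untimed warmup to reach steady state (caches, lazy initialization)
    for _ in range(warmup_count):
        predict(make_input())
    # Timed runs; input generation stays inside the loop, as in the C++ runner
    start = time.perf_counter()
    for _ in range(run_count):
        predict(make_input())
    return time.perf_counter() - start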
diff --git a/tools/utilities/profile/CMakeLists.txt b/tools/utilities/profile/CMakeLists.txt
index 5a162ef90..c27d1d59d 100644
--- a/tools/utilities/profile/CMakeLists.txt
+++ b/tools/utilities/profile/CMakeLists.txt
@@ -109,7 +109,7 @@ configure_file(build_and_run.sh.in build_and_run.sh @ONLY NEWLINE_STYLE UNIX)
 configure_file(build_and_run.cmd.in build_and_run.cmd @ONLY NEWLINE_STYLE WIN32)
 configure_file(remote_test.sh.in remote_test.sh @ONLY NEWLINE_STYLE UNIX)
 configure_file(remote_test.cmd.in remote_test.cmd @ONLY NEWLINE_STYLE WIN32)
-configure_file(${CMAKE_SOURCE_DIR}/CMake/OpenBLASSetup.cmake OpenBLASSetup.cmake COPYONLY)
+configure_file(${ELL_ROOT}/CMake/OpenBLASSetup.cmake OpenBLASSetup.cmake COPYONLY)
 
 if(WIN32)
diff --git a/tools/utilities/pythonlibs/audio/view_audio.py b/tools/utilities/pythonlibs/audio/view_audio.py
index eae5ddc9d..62d1fe0e7 100644
--- a/tools/utilities/pythonlibs/audio/view_audio.py
+++ b/tools/utilities/pythonlibs/audio/view_audio.py
@@ -138,6 +138,11 @@ def __init__(self, featurizer_model=None, classifier_model=None, auto_scale=True
         elif self.CLASSIFIER_MODEL_KEY in self.settings:
             self.classifier_model = self.settings[self.CLASSIFIER_MODEL_KEY]
 
+        self.vad = None
+        if vad_model:
+            self.vad = vad.VoiceActivityDetector(vad_model)
+        self.previous_vad = 0
+
         self.wav_filename = wav_file
         if self.wav_filename is None and self.WAV_FILE_KEY in self.settings:
             self.wav_filename = self.settings[self.WAV_FILE_KEY]
@@ -394,7 +399,9 @@ def evaluate_classifier(self):
         prediction, probability, label, _ = self.classifier.predict(self.classifier_feature_data.ravel())
         if prediction is not None:
             percent = int(100 * probability)
-            if self.last_prediction != prediction or self.probability < probability:
+            if label == "silence":
+                self.classifier.reset()
+            elif self.last_prediction != prediction or self.probability < probability:
                 self.last_prediction = prediction
                 self.probability = probability
                 self.show_output(" DETECTED ({}) {}% {}".format(prediction, percent, label))
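The view_audio.py change gates the classifier on silence: when the predicted label is "silence", the classifier state is reset instead of reported, so stale recurrent state does not leak across utterances. A standalone sketch of the same idea (the VAD predict API here is assumed, not taken from the patch):

# Sketch of VAD-gated classification (assumed API: the detector returns
# 1 for voice and 0 for silence; the classifier has reset()/predict()).
def process_frame(vad, classifier, features):
    if vad.predict(features) == 0:
        classifier.reset()    # drop stale state between utterances
        return None
    return classifier.predict(features)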
diff --git a/tools/utilities/pythonlibs/buildtools.py b/tools/utilities/pythonlibs/buildtools.py
index 5ef177534..7efa17fbb 100644
--- a/tools/utilities/pythonlibs/buildtools.py
+++ b/tools/utilities/pythonlibs/buildtools.py
@@ -113,23 +113,25 @@ def logstream(self, stream):
             if "closed file" not in msg:
                 self.logger.info(msg)
 
-    def run(self, command, print_output=True, shell=False):
+    def run(self, command, print_output=True, shell=False, cwd=None):
         cmdstr = command if isinstance(command, str) else " ".join(command)
         if self.verbose:
             self.logger.info(cmdstr)
         try:
-            with subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=0,
-                                  universal_newlines=True, shell=shell) as proc:
+            output_target = subprocess.PIPE if print_output else subprocess.DEVNULL
+            with subprocess.Popen(command, stdout=output_target, stderr=output_target, bufsize=0,
+                                  universal_newlines=True, shell=shell, cwd=cwd) as proc:
                 self.output = ''
-                stdout_thread = Thread(target=self.logstream, args=(proc.stdout,))
-                stderr_thread = Thread(target=self.logstream, args=(proc.stderr,))
+                if print_output:
+                    stdout_thread = Thread(target=self.logstream, args=(proc.stdout,))
+                    stderr_thread = Thread(target=self.logstream, args=(proc.stderr,))
 
-                stdout_thread.start()
-                stderr_thread.start()
+                    stdout_thread.start()
+                    stderr_thread.start()
 
-                while stdout_thread.isAlive() or stderr_thread.isAlive():
-                    pass
+                    while stdout_thread.isAlive() or stderr_thread.isAlive():
+                        pass
 
                 proc.wait()
 
@@ -191,7 +193,8 @@ def llc(self, output_dir, input_file, target, optimization_level="3", objext=".o
         args = [self.llcexe,
                 input_file,
                 "-o", out_file,
-                "-O" + optimization_level
+                "-O" + optimization_level,
+                '' if optimization_level == '0' else "-fp-contract=fast"
                 ]
         args = args + self.get_llc_options(target)
         # Save the parameters passed to llc. This is used for archiving purposes.
@@ -202,25 +205,26 @@ def llc(self, output_dir, input_file, target, optimization_level="3", objext=".o
         return out_file
 
-    def opt(self, output_dir, input_file, optimization_level="3"):
+    def opt(self, output_dir, input_file, optimization_level="3", print_output=True):
         # opt compiled_model.ll -o compiled_model_opt.ll -O3
         model_name = os.path.splitext(os.path.basename(input_file))[0]
         out_file = os.path.join(output_dir, model_name + ".opt.bc")
         args = [self.optexe,
                 input_file,
                 "-o", out_file,
-                "-O" + optimization_level]
+                "-O" + optimization_level,
+                '' if optimization_level == '0' else "-fp-contract=fast"]
         # Save the parameters passed to opt. This is used for archiving purposes.
         self.log_command_arguments(args, log_dir=output_dir)
         self.logger.info("running opt ...")
-        self.run(args)
+        self.run(args, print_output=print_output)
         return out_file
 
-    def compile(self, model_file, func_name, model_name, target, output_dir,
+    def compile(self, model_file, func_name, model_name, target, output_dir, skip_ellcode=False,
                 use_blas=False, fuse_linear_ops=True, optimize_reorder_data_nodes=True, profile=False,
                 llvm_format="bc", optimize=True, parallelize=True, vectorize=True,
                 debug=False, is_model_file=False, swig=True,
-                header=False, objext=".o", extra_options=[]):
+                header=False, objext=".o", global_value_alignment=32, extra_options=[]):
         file_arg = "-imf" if is_model_file else "-imap"
         format_flag = {
             "bc": "--bitcode",
@@ -244,7 +248,8 @@ def compile(self, model_file, func_name, model_name, target, output_dir,
             "--target", target,
             "-od", output_dir,
             "--fuseLinearOps", str(fuse_linear_ops),
-            "--optimizeReorderDataNodes", str(optimize_reorder_data_nodes)
+            "--optimizeReorderDataNodes", str(optimize_reorder_data_nodes),
+            "--globalValueAlignment", str(global_value_alignment)
         ]
         if swig:
             args.append("--swig")
@@ -270,6 +275,9 @@ def compile(self, model_file, func_name, model_name, target, output_dir,
         if profile:
             args.append("--profile")
 
+        if skip_ellcode:
+            args.append("--skip_ellcode")
+
        args += extra_options
 
         # Save the parameters passed to compile. This is used for archiving purposes.
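The buildtools.run() change avoids spawning log-reader threads when output is unwanted by pointing the child's streams at DEVNULL up front. The core of that pattern, for reference (the helper name is illustrative):

import subprocess

def run_quiet(command, cwd=None, print_output=False):
    # When output is not wanted, send both streams to DEVNULL rather than
    # opening pipes that reader threads would otherwise have to drain.
    target = None if print_output else subprocess.DEVNULL
    subprocess.run(command, stdout=target, stderr=target, cwd=cwd, check=True)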
diff --git a/tools/wrap/CMakeLists.txt b/tools/wrap/CMakeLists.txt
index 880f4df93..213d63173 100644
--- a/tools/wrap/CMakeLists.txt
+++ b/tools/wrap/CMakeLists.txt
@@ -45,7 +45,7 @@ if(${PYTHON_ENABLED})
     message(ERROR "LLVM not found, please check that LLVM is installed.")
     return()
   endif()
-  
+
   set(OUTPUT_DIR ${CMAKE_BINARY_DIR})
   set(JSON "{ \"llc\": \"${LLC_EXECUTABLE}\", \"swig\": \"${SWIG_EXECUTABLE}\", \"compile\": \"${GLOBAL_BIN_DIR}/compile\", \"blas\": \"${BLAS_LIBS}\", \"opt\": \"${OPT_EXECUTABLE}\", \"cmake_generator\": \"${CMAKE_GENERATOR}\", \"cmake_version\": \"${CMAKE_VERSION}\" }")
diff --git a/tools/wrap/templates/CMakeLists.cpp.txt.in b/tools/wrap/templates/CMakeLists.cpp.txt.in
index c0ce95a3d..a88d86a20 100644
--- a/tools/wrap/templates/CMakeLists.cpp.txt.in
+++ b/tools/wrap/templates/CMakeLists.cpp.txt.in
@@ -33,12 +33,12 @@ if(WIN32)
     # path to the OpenBLAS Nuget
     set(PACKAGE_ROOT "@ELL_ROOT@")
 endif()
+include(OpenBLASSetup.cmake)
 
 add_library(${target_name} STATIC IMPORTED GLOBAL)
+set_property(TARGET ${target_name} APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES "${CMAKE_CURRENT_SOURCE_DIR};${CMAKE_CURRENT_SOURCE_DIR}/include")
 set_target_properties(${target_name} PROPERTIES IMPORTED_LOCATION ${CMAKE_CURRENT_SOURCE_DIR}/@ELL_model@.@OBJECT_EXTENSION@)
-if(BLAS_LIBS)
-    target_link_libraries(${target_name} INTERFACE ${BLAS_LIBS})
-endif()
+target_link_libraries(${target_name} INTERFACE ${BLAS_LIBS})
diff --git a/tools/wrap/templates/CMakeLists.python.txt.in b/tools/wrap/templates/CMakeLists.python.txt.in
index c355b1386..ba5ee03d9 100644
--- a/tools/wrap/templates/CMakeLists.python.txt.in
+++ b/tools/wrap/templates/CMakeLists.python.txt.in
@@ -41,6 +41,7 @@ else()
     # SWIG has this GCC 7 warning.
     add_compile_options("-Wno-psabi")
 endif()
+include(OpenBLASSetup.cmake)
 
 find_package(PythonInterp 3.4)
arg, "-" + argdef["short"], help=argdef["help"], action="store_true", default=False) else: arg_parser.add_argument("--" + arg, "-" + argdef["short"], - help=argdef["help"], default=argdef["default"]) + help=argdef["help"], type=arg_type, default=argdef["default"]) compile_args = [] if '--' in args: @@ -247,6 +265,7 @@ def parse_command_line(self, args=None): self.model_name = self.model_file_base.replace('-', '_') self.language = args.language self.target = args.target + self.skip_ellcode = args.skip_ellcode self.objext = self.get_objext(self.target) self.output_dir = args.outdir if self.output_dir is None: @@ -273,18 +292,23 @@ def parse_command_line(self, args=None): self.swig = self.language != "cpp" self.cpp_header = self.language == "cpp" self.compile_args = compile_args + self.global_value_alignment = args.global_value_alignment self.stats = args.stats self.times = {} def find_files(self): __script_path = os.path.dirname(os.path.abspath(__file__)) self.cmake_template = os.path.join(__script_path, "templates/CMakeLists.%s.txt.in" % (self.language)) - if (not os.path.isfile(self.cmake_template)): + + if not os.path.isfile(self.cmake_template): raise Exception("Could not find CMakeLists template: %s" % (self.cmake_template)) + if self.language == "python": self.module_init_template = os.path.join(__script_path, "templates/__init__.py.in") + if not os.path.isfile(self.module_init_template): raise Exception("Could not find __init__.py template: %s" % (self.module_init_template)) + self.files.append(os.path.join(self.ell_root, "CMake/OpenBLASSetup.cmake")) def copy_files(self, filelist, folder): @@ -292,14 +316,19 @@ def copy_files(self, filelist, folder): target_dir = self.output_dir else: target_dir = os.path.join(self.output_dir, folder) + os.makedirs(target_dir, exist_ok=True) + for path in filelist: if not os.path.isfile(path): raise Exception("expected file not found: " + path) + _, file_name = os.path.split(path) dest = os.path.join(target_dir, file_name) + if self.verbose: self.logger.info("copy \"%s\" \"%s\"" % (path, dest)) + copyfile(path, dest) def create_template_file(self, template_filename, output_filename): @@ -312,6 +341,10 @@ def create_template_file(self, template_filename, output_filename): template = template.replace("@Arch@", self.target) template = template.replace("@OBJECT_EXTENSION@", self.objext) template = template.replace("@ELL_ROOT@", os.path.join(self.ell_root, "external").replace("\\", "/")) + shell_type = "UNIX" + if self.target == "host" and platform.system() == "Windows": + shell_type = "WINDOWS" + template = template.replace("@SHELL_TYPE@", shell_type) output_template = os.path.join(self.output_dir, output_filename) with open(output_template, 'w') as f: f.write(template) @@ -358,6 +391,7 @@ def run(self): func_name=self.func_name, model_name=self.model_name, target=self.target, + skip_ellcode=self.skip_ellcode, output_dir=self.output_dir, use_blas=self.blas, fuse_linear_ops=self.fuse_linear_ops, @@ -372,6 +406,7 @@ def run(self): swig=self.swig, header=self.cpp_header, objext="." + self.objext, + global_value_alignment=self.global_value_alignment, extra_options=self.compile_args) self.stop_timer("compile") if self.swig: