diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml
index ade377c06a..f5e8065ea8 100644
--- a/.github/workflows/build-and-test.yml
+++ b/.github/workflows/build-and-test.yml
@@ -437,6 +437,12 @@ jobs:
         source env/activate
         pytest -ssv runtime/tools/python/test/test_perf.py
 
+    - name: ttrt api tests
+      shell: bash
+      run: |
+        source env/activate
+        pytest -ssv runtime/tools/python/test/test_api.py
+
   build-and-test-explorer:
     needs: build-image
     timeout-minutes: 60
diff --git a/runtime/include/tt/runtime/detail/ttmetal.h b/runtime/include/tt/runtime/detail/ttmetal.h
index 7a68a7e944..ac04d87a9d 100644
--- a/runtime/include/tt/runtime/detail/ttmetal.h
+++ b/runtime/include/tt/runtime/detail/ttmetal.h
@@ -39,8 +39,8 @@ void closeDevice(Device device);
 
 void deallocateBuffers(Device device);
 
-Event submit(Device device, Binary executable, std::uint32_t programIndex,
-             std::vector<Tensor> const &inputs,
+Event submit(Device deviceHandle, Binary executableHandle,
+             std::uint32_t programIndex, std::vector<Tensor> const &inputs,
              std::vector<Tensor> const &outputs);
 
 void wait(Event event);
diff --git a/runtime/include/tt/runtime/detail/ttnn.h b/runtime/include/tt/runtime/detail/ttnn.h
index 6c55ac1de7..d76ba67f4b 100644
--- a/runtime/include/tt/runtime/detail/ttnn.h
+++ b/runtime/include/tt/runtime/detail/ttnn.h
@@ -75,11 +75,16 @@ void closeDevice(Device device);
 
 void deallocateBuffers(Device device);
 
-Event submit(Device device, Binary executable, std::uint32_t programIndex,
-             std::vector<Tensor> const &inputs,
-             std::vector<Tensor> const &outputs);
+Tensor toHost(Tensor tensor, bool untilize = false);
 
-void wait(Event event);
+Tensor toDevice(Tensor tensor, Device device);
+
+Tensor toDevice(Tensor tensor, Device device, Layout layout);
+
+Tensor toLayout(Tensor tensor, Layout layout);
+
+Layout getLayout(Binary executableHandle, std::uint32_t programIndex,
+                 std::uint32_t inputIndex);
 
 std::string getOpDebugString(OpContext opContextHandle);
 
@@ -88,10 +93,34 @@ Tensor getOpOutputTensor(OpContext opContextHandle,
 
 std::vector<float> getTensorData(Tensor tensor);
 
+void memcpy(void *dst, Tensor src);
+
+void memcpy(Tensor dst, Tensor src);
+
+void deallocateTensor(Tensor &tensor, bool force = false);
+
+namespace legacy {
+/* Will be deprecated soon once FEs migrate to new API */
+void wait(Event event);
+
+Event submit(Device deviceHandle, Binary executableHandle,
+             std::uint32_t programIndex, std::vector<Tensor> const &inputs,
+             std::vector<Tensor> const &outputs);
+
 void runProgram(::ttnn::MeshDevice &meshDevice, Binary &executableHandle,
                 std::uint32_t programIndex,
                 std::vector<::ttnn::Tensor *> const &inputs,
                 std::vector<::ttnn::Tensor *> const &outputs);
+} // namespace legacy
+
+std::vector<Tensor> submit(Device deviceHandle, Binary executableHandle,
+                           std::uint32_t programIndex,
+                           std::vector<Tensor> const &inputs);
+
+std::vector<Tensor> runProgram(::ttnn::MeshDevice &meshDevice,
+                               Binary executableHandle,
+                               std::uint32_t programIndex,
+                               std::vector<::ttnn::Tensor *> const &inputs);
 
 } // namespace tt::runtime::ttnn
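Worth calling out for frontend integrators: the TTNN backend now exposes two submit flavors, and the output-preallocating one has moved under `tt::runtime::ttnn::legacy`. A minimal sketch of the two call shapes at the top-level API (device, binary, and tensor setup elided; the function and variable names here are illustrative, not part of this patch):

```cpp
// Sketch only: contrasts the two submit flavors declared above. Assumes
// `device`, `binary`, `inputs`, and `outputs` were created elsewhere
// through the runtime API.
#include "tt/runtime/runtime.h"

void runBothWays(tt::runtime::Device device, tt::runtime::Binary binary,
                 std::vector<tt::runtime::Tensor> &inputs,
                 std::vector<tt::runtime::Tensor> &outputs) {
  // Legacy flavor: caller preallocates outputs and waits on an Event.
  tt::runtime::Event event =
      tt::runtime::submit(device, binary, /*programIndex=*/0, inputs, outputs);
  tt::runtime::wait(event);

  // New flavor: the runtime allocates and returns the outputs itself.
  std::vector<tt::runtime::Tensor> results =
      tt::runtime::submit(device, binary, /*programIndex=*/0, inputs);
}
```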
diff --git a/runtime/include/tt/runtime/runtime.h b/runtime/include/tt/runtime/runtime.h
index 1dc721f662..7a8a92aec1 100644
--- a/runtime/include/tt/runtime/runtime.h
+++ b/runtime/include/tt/runtime/runtime.h
@@ -63,12 +63,19 @@ Device openDevice(DeviceIds const &deviceIds, size_t numHWCQs = 1);
 
 void closeDevice(Device device);
 
-Event submit(Device device, Binary executable, std::uint32_t programIndex,
-             std::vector<Tensor> const &inputs,
-             std::vector<Tensor> const &outputs);
-
 void wait(Event event);
 
+Tensor toHost(Tensor tensor, bool untilize = false);
+
+Tensor toDevice(Tensor tensor, Device device);
+
+Tensor toDevice(Tensor tensor, Device device, Layout layout);
+
+Tensor toLayout(Tensor tensor, Layout layout);
+
+Layout getLayout(Binary executableHandle, std::uint32_t programIndex,
+                 std::uint32_t inputIndex);
+
 std::string getOpDebugString(OpContext opContextHandle);
 
 Tensor getOpOutputTensor(OpContext opContextHandle,
@@ -76,6 +83,20 @@ Tensor getOpOutputTensor(OpContext opContextHandle,
 
 std::vector<float> getTensorData(Tensor tensor);
 
+void memcpy(void *dst, Tensor src);
+
+void memcpy(Tensor dst, Tensor src);
+
+void deallocateTensor(Tensor &tensor, bool force = false);
+
+std::vector<Tensor> submit(Device deviceHandle, Binary executableHandle,
+                           std::uint32_t programIndex,
+                           std::vector<Tensor> const &inputs);
+
+Event submit(Device deviceHandle, Binary executableHandle,
+             std::uint32_t programIndex, std::vector<Tensor> const &inputs,
+             std::vector<Tensor> const &outputs);
+
 } // namespace tt::runtime
 
 #endif
diff --git a/runtime/include/tt/runtime/types.h b/runtime/include/tt/runtime/types.h
index 8fd641195f..cc2791e237 100644
--- a/runtime/include/tt/runtime/types.h
+++ b/runtime/include/tt/runtime/types.h
@@ -122,10 +122,20 @@ struct Event : public detail::RuntimeCheckedObjectImpl {
 
 struct Tensor : public detail::RuntimeCheckedObjectImpl {
   std::shared_ptr<void> data;
-
+  Event event;
   Tensor(std::shared_ptr<void> handle, std::shared_ptr<void> data,
          DeviceRuntime runtime)
-      : detail::RuntimeCheckedObjectImpl(handle, runtime), data(data) {}
+      : detail::RuntimeCheckedObjectImpl(handle, runtime), data(data),
+        event(nullptr, runtime) {}
+
+  Tensor(std::shared_ptr<void> handle, std::shared_ptr<void> data,
+         std::shared_ptr<void> eventHandle, DeviceRuntime runtime)
+      : detail::RuntimeCheckedObjectImpl(handle, runtime), data(data),
+        event(eventHandle, runtime) {}
+};
+
+struct Layout : public detail::RuntimeCheckedObjectImpl {
+  using detail::RuntimeCheckedObjectImpl::RuntimeCheckedObjectImpl;
 };
 
 struct CallbackContext : public detail::RuntimeCheckedObjectImpl {
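Putting the new `runtime.h` surface together, a hedged end-to-end sketch (assumes a TTNN flatbuffer and a host tensor created elsewhere; `runProgramZero` and its arguments are illustrative, not part of this patch):

```cpp
// Sketch of a full round trip through the new API. The host input is assumed
// to have been built with createTensor(); shapes and sizes are placeholders.
#include "tt/runtime/runtime.h"

std::vector<float> runProgramZero(tt::runtime::Device device,
                                  tt::runtime::Binary binary,
                                  tt::runtime::Tensor hostInput,
                                  size_t numOutputElements) {
  // Query the layout the program expects for input 0, then move the host
  // tensor onto the device in that layout.
  tt::runtime::Layout layout = tt::runtime::getLayout(binary, 0, 0);
  tt::runtime::Tensor deviceInput =
      tt::runtime::toDevice(hostInput, device, layout);

  std::vector<tt::runtime::Tensor> outputs =
      tt::runtime::submit(device, binary, 0, {deviceInput});

  // Untilize on the way back so the host sees row-major data, then copy out.
  tt::runtime::Tensor hostOutput =
      tt::runtime::toHost(outputs.at(0), /*untilize=*/true);
  std::vector<float> result(numOutputElements);
  tt::runtime::memcpy(result.data(), hostOutput);

  tt::runtime::deallocateTensor(deviceInput, /*force=*/true);
  return result;
}
```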
diff --git a/runtime/lib/binary.cpp b/runtime/lib/binary.cpp
index 92be39d27f..b81bf1a770 100644
--- a/runtime/lib/binary.cpp
+++ b/runtime/lib/binary.cpp
@@ -27,15 +27,12 @@ static std::string asJson(void const *fbb, uint8_t const *binarySchema,
   flatbuffers::Parser parser(opts);
 
   if (not parser.Deserialize(binarySchema, schemaSize)) {
-    throw std::runtime_error("Failed to deserialize schema");
+    LOG_FATAL("Failed to deserialize schema");
   }
 
   std::string text;
   const char *err = ::flatbuffers::GenerateText(parser, fbb, &text);
-  if (err) {
-    throw std::runtime_error("Failed to generate JSON: " + std::string(err));
-  }
-
+  LOG_ASSERT(not err, "Failed to generate JSON: ", err);
   return text;
 }
 
@@ -44,9 +41,7 @@ namespace ttnn {
 ::tt::target::ttnn::TTNNBinary const *getBinary(Flatbuffer binary) {
   bool isTTNN = ::tt::target::ttnn::SizePrefixedTTNNBinaryBufferHasIdentifier(
       binary.handle.get());
-  if (not isTTNN) {
-    throw std::runtime_error("Unsupported binary format");
-  }
+  LOG_ASSERT(isTTNN, "Unsupported binary format");
   return ::tt::target::ttnn::GetSizePrefixedTTNNBinary(binary.handle.get());
 }
 
@@ -128,9 +123,7 @@ ::tt::target::metal::TTMetalBinary const *getBinary(Flatbuffer binary) {
   bool isTTMetal =
       ::tt::target::metal::SizePrefixedTTMetalBinaryBufferHasIdentifier(
           binary.handle.get());
-  if (not isTTMetal) {
-    throw std::runtime_error("Unsupported binary format");
-  }
+  LOG_ASSERT(isTTMetal, "Unsupported binary format");
   return ::tt::target::metal::GetSizePrefixedTTMetalBinary(binary.handle.get());
 }
 
@@ -207,7 +200,7 @@ namespace system_desc {
 ::tt::target::SystemDescRoot const *getBinary(Flatbuffer binary) {
   if (!::tt::target::SizePrefixedSystemDescRootBufferHasIdentifier(
           binary.handle.get())) {
-    throw std::runtime_error("Unsupported binary format");
+    LOG_FATAL("Unsupported binary format");
   }
   return ::tt::target::GetSizePrefixedSystemDescRoot(binary.handle.get());
 }
@@ -234,10 +227,7 @@ std::string asJson(Flatbuffer binary) {
 Flatbuffer Flatbuffer::loadFromPath(char const *path) {
   // load a flatbuffer from path
   std::ifstream fbb(path, std::ios::binary | std::ios::ate);
-  if (!fbb.is_open()) {
-    throw std::runtime_error("Failed to open file: " + std::string(path));
-  }
-
+  LOG_ASSERT(fbb.is_open(), "Failed to open file: ", path);
   std::streampos size = fbb.tellg();
   fbb.seekg(0, std::ios::beg);
   auto buffer = ::tt::runtime::utils::malloc_shared(size);
@@ -269,7 +259,7 @@ std::string_view Flatbuffer::getFileIdentifier() const {
     return ::tt::target::SystemDescRootIdentifier();
   }
 
-  throw std::runtime_error("Unsupported binary format");
+  LOG_FATAL("Unsupported binary format");
 }
 
 std::string Flatbuffer::getVersion() const {
@@ -288,7 +278,7 @@ std::string Flatbuffer::getVersion() const {
     return system_desc::getVersion(*this);
   }
 
-  throw std::runtime_error("Unsupported binary format");
+  LOG_FATAL("Unsupported binary format");
 }
 
 std::string_view Flatbuffer::getTTMLIRGitHash() const {
@@ -307,7 +297,7 @@ std::string_view Flatbuffer::getTTMLIRGitHash() const {
     return system_desc::getTTMLIRGitHash(*this);
   }
 
-  throw std::runtime_error("Unsupported binary format");
+  LOG_FATAL("Unsupported binary format");
 }
 
 std::string Flatbuffer::asJson() const {
@@ -326,7 +316,7 @@ std::string Flatbuffer::asJson() const {
     return system_desc::asJson(*this);
   }
 
-  throw std::runtime_error("Unsupported binary format");
+  LOG_FATAL("Unsupported binary format");
 }
 
 SystemDesc SystemDesc::loadFromPath(char const *path) {
@@ -349,7 +339,7 @@ Binary::getProgramInputs(std::uint32_t programIndex) const {
     return metal::getProgramInputs(*this, programIndex);
   }
 
-  throw std::runtime_error("Unsupported binary format");
+  LOG_FATAL("Unsupported binary format");
 }
 
 std::vector<TensorDesc>
@@ -364,7 +354,7 @@ Binary::getProgramOutputs(std::uint32_t programIndex) const {
     return metal::getProgramOutputs(*this, programIndex);
   }
 
-  throw std::runtime_error("Unsupported binary format");
+  LOG_FATAL("Unsupported binary format");
 }
 
 const ::tt::target::GoldenTensor *
diff --git a/runtime/lib/common/system_desc.cpp b/runtime/lib/common/system_desc.cpp
index f1210d00aa..3b4685901a 100644
--- a/runtime/lib/common/system_desc.cpp
+++ b/runtime/lib/common/system_desc.cpp
@@ -32,7 +32,7 @@ static ::tt::target::Arch toFlatbuffer(::tt::ARCH arch) {
     break;
   }
 
-  throw std::runtime_error("Unsupported arch");
+  LOG_FATAL("Unsupported arch");
 }
 
 static std::vector<::tt::target::ChipChannel>
@@ -246,7 +246,7 @@ static std::unique_ptr<::tt::runtime::SystemDesc> getCurrentSystemDescImpl(
   ::tt::target::FinishSizePrefixedSystemDescRootBuffer(fbb, root);
   ::flatbuffers::Verifier verifier(fbb.GetBufferPointer(), fbb.GetSize());
   if (!::tt::target::VerifySizePrefixedSystemDescRootBuffer(verifier)) {
-    throw std::runtime_error("Failed to verify system desc root buffer");
+    LOG_FATAL("Failed to verify system desc root buffer");
   }
   uint8_t *buf = fbb.GetBufferPointer();
   auto size = fbb.GetSize();
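These conversions lean on `LOG_FATAL`/`LOG_ASSERT` from `tt/runtime/detail/logger.h`. The following stand-ins are purely illustrative (the real macros differ, e.g. they take variadic message arguments), but they show the one contract the call sites above depend on: `LOG_FATAL` never returns, so functions like `Flatbuffer::getFileIdentifier()` stay well-formed without a trailing `return`:

```cpp
// Illustrative only; not the real logger implementation.
#include <cstdio>
#include <cstdlib>

[[noreturn]] inline void logFatalImpl(const char *msg) {
  std::fprintf(stderr, "FATAL: %s\n", msg);
  std::abort(); // never returns, which is what makes LOG_FATAL usable
                // as the last statement of a non-void function
}

#define LOG_FATAL(msg) logFatalImpl(msg)
#define LOG_ASSERT(cond, ...)                                                  \
  do {                                                                         \
    if (!(cond)) {                                                             \
      logFatalImpl(#cond);                                                     \
    }                                                                          \
  } while (0)
```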
diff --git a/runtime/lib/runtime.cpp b/runtime/lib/runtime.cpp
index 586b8394ea..591133acee 100644
--- a/runtime/lib/runtime.cpp
+++ b/runtime/lib/runtime.cpp
@@ -42,7 +42,7 @@ void deallocateBuffers(Device device) {
     return ::tt::runtime::ttmetal::deallocateBuffers(device);
   }
 #endif
-  throw std::runtime_error("runtime is not enabled");
+  LOG_FATAL("runtime is not enabled");
 }
 
 } // namespace detail
@@ -91,15 +91,14 @@ void setCompatibleRuntime(const Binary &binary) {
     return setCurrentRuntime(DeviceRuntime::TTMetal);
   }
 #endif
-  throw std::runtime_error(
-      "Unsupported binary file identifier or runtime not enabled");
+  LOG_FATAL("Unsupported binary file identifier or runtime not enabled");
 }
 
 std::pair<SystemDesc, DeviceIds> getCurrentSystemDesc() {
 #if defined(TT_RUNTIME_ENABLE_TTNN) || defined(TT_RUNTIME_ENABLE_TTMETAL)
   return system_desc::getCurrentSystemDesc();
 #endif
-  throw std::runtime_error("runtime is not enabled");
+  LOG_FATAL("runtime is not enabled");
 }
 
 Tensor createTensor(std::shared_ptr<void> data,
@@ -122,7 +121,7 @@ Tensor createTensor(std::shared_ptr<void> data,
                                                 dataType);
   }
 #endif
-  throw std::runtime_error("runtime is not enabled");
+  LOG_FATAL("runtime is not enabled");
 }
 
 Tensor
@@ -143,10 +142,10 @@ createTensor(std::vector<std::shared_ptr<void>> &data,
 
 #if defined(TT_RUNTIME_ENABLE_TTMETAL)
   if (getCurrentRuntime() == DeviceRuntime::TTMetal) {
-    throw std::runtime_error("Not implemented");
+    LOG_FATAL("Not implemented");
   }
 #endif
-  throw std::runtime_error("runtime is not enabled");
+  LOG_FATAL("runtime is not enabled");
 }
 
 tt::target::DataType getTensorDataType(Tensor tensor) {
@@ -161,7 +160,7 @@ tt::target::DataType getTensorDataType(Tensor tensor) {
     return ::tt::runtime::ttmetal::getTensorDataType(tensor);
   }
 #endif
-  throw std::runtime_error("runtime is not enabled");
+  LOG_FATAL("runtime is not enabled");
 }
 
 size_t getNumAvailableDevices() {
@@ -176,7 +175,7 @@ size_t getNumAvailableDevices() {
     return ::tt::runtime::ttmetal::getNumAvailableDevices();
   }
 #endif
-  throw std::runtime_error("runtime is not enabled");
+  LOG_FATAL("runtime is not enabled");
 }
 
 Device openDevice(DeviceIds const &deviceIds, size_t numHWCQs) {
@@ -191,7 +190,7 @@ Device openDevice(DeviceIds const &deviceIds, size_t numHWCQs) {
     return ::tt::runtime::ttmetal::openDevice(deviceIds, numHWCQs);
  }
 #endif
-  throw std::runtime_error("runtime is not enabled");
+  LOG_FATAL("runtime is not enabled");
 }
 
 void closeDevice(Device device) {
@@ -206,44 +205,100 @@ void closeDevice(Device device) {
     return ::tt::runtime::ttmetal::closeDevice(device);
   }
 #endif
-  throw std::runtime_error("runtime is not enabled");
+  LOG_FATAL("runtime is not enabled");
 }
 
-Event submit(Device deviceHandle, Binary executableHandle,
-             std::uint32_t programIndex,
-             std::vector<Tensor> const &inputHandles,
-             std::vector<Tensor> const &outputHandles) {
+void wait(Event event) {
 #if defined(TT_RUNTIME_ENABLE_TTNN)
   if (getCurrentRuntime() == DeviceRuntime::TTNN) {
-    return ::tt::runtime::ttnn::submit(deviceHandle, executableHandle,
-                                       programIndex, inputHandles,
-                                       outputHandles);
+    LOG_WARNING("wait API will be deprecated for TTNN runtime.");
+    return ::tt::runtime::ttnn::legacy::wait(event);
   }
 #endif
 
 #if defined(TT_RUNTIME_ENABLE_TTMETAL)
   if (getCurrentRuntime() == DeviceRuntime::TTMetal) {
-    return ::tt::runtime::ttmetal::submit(deviceHandle, executableHandle,
-                                          programIndex, inputHandles,
-                                          outputHandles);
+    return ::tt::runtime::ttmetal::wait(event);
   }
 #endif
-  throw std::runtime_error("runtime is not enabled");
+  LOG_FATAL("runtime is not enabled");
 }
 
-void wait(Event event) {
+Tensor toHost(Tensor tensor, bool untilize) {
 #if defined(TT_RUNTIME_ENABLE_TTNN)
   if (getCurrentRuntime() == DeviceRuntime::TTNN) {
-    return ::tt::runtime::ttnn::wait(event);
+    return ::tt::runtime::ttnn::toHost(tensor, untilize);
   }
 #endif
 
 #if defined(TT_RUNTIME_ENABLE_TTMETAL)
   if (getCurrentRuntime() == DeviceRuntime::TTMetal) {
-    return ::tt::runtime::ttmetal::wait(event);
+    LOG_FATAL("not implemented");
+  }
+#endif
+  LOG_FATAL("runtime is not enabled");
+}
+
+Tensor toDevice(Tensor tensor, Device device) {
+#if defined(TT_RUNTIME_ENABLE_TTNN)
+  if (getCurrentRuntime() == DeviceRuntime::TTNN) {
+    return ::tt::runtime::ttnn::toDevice(tensor, device);
+  }
+#endif
+
+#if defined(TT_RUNTIME_ENABLE_TTMETAL)
+  if (getCurrentRuntime() == DeviceRuntime::TTMetal) {
+    LOG_FATAL("not implemented");
   }
 #endif
-  throw std::runtime_error("runtime is not enabled");
+  LOG_FATAL("runtime is not enabled");
+}
+
+Tensor toDevice(Tensor tensor, Device device, Layout layout) {
+#if defined(TT_RUNTIME_ENABLE_TTNN)
+  if (getCurrentRuntime() == DeviceRuntime::TTNN) {
+    return ::tt::runtime::ttnn::toDevice(tensor, device, layout);
+  }
+#endif
+
+#if defined(TT_RUNTIME_ENABLE_TTMETAL)
+  if (getCurrentRuntime() == DeviceRuntime::TTMetal) {
+    LOG_FATAL("not implemented");
+  }
+#endif
+  LOG_FATAL("runtime is not enabled");
+}
+
+Tensor toLayout(Tensor tensor, Layout layout) {
+#if defined(TT_RUNTIME_ENABLE_TTNN)
+  if (getCurrentRuntime() == DeviceRuntime::TTNN) {
+    return ::tt::runtime::ttnn::toLayout(tensor, layout);
+  }
+#endif
+
+#if defined(TT_RUNTIME_ENABLE_TTMETAL)
+  if (getCurrentRuntime() == DeviceRuntime::TTMetal) {
+    LOG_FATAL("not implemented");
+  }
+#endif
+  LOG_FATAL("runtime is not enabled");
+}
+
+Layout getLayout(Binary executableHandle, std::uint32_t programIndex,
+                 std::uint32_t inputIndex) {
+#if defined(TT_RUNTIME_ENABLE_TTNN)
+  if (getCurrentRuntime() == DeviceRuntime::TTNN) {
+    return ::tt::runtime::ttnn::getLayout(executableHandle, programIndex,
+                                          inputIndex);
+  }
+#endif
+
+#if defined(TT_RUNTIME_ENABLE_TTMETAL)
+  if (getCurrentRuntime() == DeviceRuntime::TTMetal) {
+    LOG_FATAL("not implemented");
+  }
+#endif
+  LOG_FATAL("runtime is not enabled");
 }
 
 std::string getOpDebugString(OpContext opContextHandle) {
@@ -258,7 +313,7 @@ std::string getOpDebugString(OpContext opContextHandle) {
     return ::tt::runtime::ttmetal::getOpDebugString(opContextHandle);
   }
 #endif
-  throw std::runtime_error("runtime is not enabled");
+  LOG_FATAL("runtime is not enabled");
 }
 
 Tensor getOpOutputTensor(OpContext opContextHandle,
@@ -276,7 +331,7 @@ Tensor getOpOutputTensor(OpContext opContextHandle,
                                                    programContextHandle);
   }
 #endif
-  throw std::runtime_error("runtime is not enabled");
+  LOG_FATAL("runtime is not enabled");
 }
 
 std::vector<float> getTensorData(Tensor tensor) {
@@ -292,7 +347,93 @@ std::vector<float> getTensorData(Tensor tensor) {
   }
 #endif
 
-  throw std::runtime_error("runtime is not enabled");
+  LOG_FATAL("runtime is not enabled");
+}
+
+void memcpy(void *dst, Tensor src) {
+#if defined(TT_RUNTIME_ENABLE_TTNN)
+  if (getCurrentRuntime() == DeviceRuntime::TTNN) {
+    return ::tt::runtime::ttnn::memcpy(dst, src);
+  }
+#endif
+
+#if defined(TT_RUNTIME_ENABLE_TTMETAL)
+  if (getCurrentRuntime() == DeviceRuntime::TTMetal) {
+    LOG_FATAL("not implemented");
+  }
+#endif
+  LOG_FATAL("runtime is not enabled");
+}
+
+void memcpy(Tensor dst, Tensor src) {
+#if defined(TT_RUNTIME_ENABLE_TTNN)
+  if (getCurrentRuntime() == DeviceRuntime::TTNN) {
+    return ::tt::runtime::ttnn::memcpy(dst, src);
+  }
+#endif
+
+#if defined(TT_RUNTIME_ENABLE_TTMETAL)
+  if (getCurrentRuntime() == DeviceRuntime::TTMetal) {
+    LOG_FATAL("not implemented");
+  }
+#endif
+  LOG_FATAL("runtime is not enabled");
+}
+
+void deallocateTensor(Tensor &tensor, bool force) {
+#if defined(TT_RUNTIME_ENABLE_TTNN)
+  if (getCurrentRuntime() == DeviceRuntime::TTNN) {
+    return ::tt::runtime::ttnn::deallocateTensor(tensor, force);
+  }
+#endif
+
+#if defined(TT_RUNTIME_ENABLE_TTMETAL)
+  if (getCurrentRuntime() == DeviceRuntime::TTMetal) {
+    LOG_FATAL("not implemented");
+  }
+#endif
+  LOG_FATAL("runtime is not enabled");
 }
 
+std::vector<Tensor> submit(Device deviceHandle, Binary executableHandle,
+                           std::uint32_t programIndex,
+                           std::vector<Tensor> const &inputHandles) {
+#if defined(TT_RUNTIME_ENABLE_TTNN)
+  if (getCurrentRuntime() == DeviceRuntime::TTNN) {
+    return ::tt::runtime::ttnn::submit(deviceHandle, executableHandle,
+                                       programIndex, inputHandles);
+  }
+#endif
+
+#if defined(TT_RUNTIME_ENABLE_TTMETAL)
+  if (getCurrentRuntime() == DeviceRuntime::TTMetal) {
+    LOG_FATAL("not implemented");
+  }
+#endif
+  LOG_FATAL("runtime is not enabled");
+}
+
+Event submit(Device deviceHandle, Binary executableHandle,
+             std::uint32_t programIndex,
+             std::vector<Tensor> const &inputHandles,
+             std::vector<Tensor> const &outputHandles) {
+#if defined(TT_RUNTIME_ENABLE_TTNN)
+  if (getCurrentRuntime() == DeviceRuntime::TTNN) {
+    LOG_WARNING("This submit API will soon be deprecated. Please switch to the "
+                "new API.");
+    return ::tt::runtime::ttnn::legacy::submit(deviceHandle, executableHandle,
+                                               programIndex, inputHandles,
+                                               outputHandles);
+  }
+#endif
+
+#if defined(TT_RUNTIME_ENABLE_TTMETAL)
+  if (getCurrentRuntime() == DeviceRuntime::TTMetal) {
+    return ::tt::runtime::ttmetal::submit(deviceHandle, executableHandle,
+                                          programIndex, inputHandles,
+                                          outputHandles);
+  }
+#endif
+  LOG_FATAL("runtime is not enabled");
+}
 } // namespace tt::runtime
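Every entry point in `runtime.cpp` now follows the same shape. A hypothetical stub showing the pattern for adding the next API (`newApi` is not part of this patch; the snippet assumes it sits inside `namespace tt::runtime` next to the functions above):

```cpp
// Pattern sketch: compile-time guards select which backends were built in,
// getCurrentRuntime() dispatches at runtime, and the [[noreturn]] LOG_FATAL
// fall-through keeps the function well-formed for non-void return types.
Tensor newApi(Tensor tensor) {
#if defined(TT_RUNTIME_ENABLE_TTNN)
  if (getCurrentRuntime() == DeviceRuntime::TTNN) {
    return ::tt::runtime::ttnn::newApi(tensor); // hypothetical backend hook
  }
#endif
#if defined(TT_RUNTIME_ENABLE_TTMETAL)
  if (getCurrentRuntime() == DeviceRuntime::TTMetal) {
    LOG_FATAL("not implemented");
  }
#endif
  LOG_FATAL("runtime is not enabled");
}
```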
- throw std::runtime_error("Unsupported EnqueueReadBufferCommand"); + LOG_FATAL("Unsupported EnqueueReadBufferCommand"); } void CQExecutor::execute( diff --git a/runtime/lib/ttmetal/runtime.cpp b/runtime/lib/ttmetal/runtime.cpp index ab343554ed..a695e28075 100644 --- a/runtime/lib/ttmetal/runtime.cpp +++ b/runtime/lib/ttmetal/runtime.cpp @@ -24,7 +24,7 @@ static ::tt::target::metal::TTMetalBinary const *getBinary(Flatbuffer binary) { ::tt::target::metal::SizePrefixedTTMetalBinaryBufferHasIdentifier( binary.handle.get()); if (not isTTMetal) { - throw std::runtime_error("Unsupported binary format"); + LOG_FATAL("Unsupported binary format"); } return ::tt::target::metal::GetSizePrefixedTTMetalBinary(binary.handle.get()); } @@ -56,7 +56,7 @@ tt::target::DataType getTensorDataType(Tensor tensor) { } if (std::holds_alternative>( metalTensor)) { - throw std::runtime_error("Datatype mapping from buffer not supported yet."); + LOG_FATAL("Datatype mapping from buffer not supported yet."); } LOG_ASSERT(false, "Unsupported tensor type"); return ::tt::target::DataType::Float32; @@ -117,7 +117,7 @@ prepareInput(::tt::tt_metal::Device *device, MetalTensor const &metalTensor, metalTensor)) { std::shared_ptr<::tt::tt_metal::Buffer> buffer = std::get>(metalTensor); - throw std::runtime_error("Input from buffer not supported yet"); + LOG_FATAL("Input from buffer not supported yet"); } LOG_ASSERT(false, "Unsupported tensor type"); return std::make_pair(nullptr, nullptr); diff --git a/runtime/lib/ttnn/CMakeLists.txt b/runtime/lib/ttnn/CMakeLists.txt index 92581cf46f..c50701d992 100644 --- a/runtime/lib/ttnn/CMakeLists.txt +++ b/runtime/lib/ttnn/CMakeLists.txt @@ -1,3 +1,4 @@ +add_subdirectory(include) add_subdirectory(operations) add_library(TTRuntimeTTNN STATIC @@ -11,5 +12,5 @@ target_include_directories(TTRuntimeTTNN PUBLIC ${PROJECT_BINARY_DIR}/include/ttmlir/Target/Common ) target_include_directories(TTRuntimeTTNN SYSTEM PUBLIC "$") -target_link_libraries(TTRuntimeTTNN PUBLIC TTRuntimeTTNNOps) +target_link_libraries(TTRuntimeTTNN PUBLIC TTRuntimeTTNNOps TTRuntimeTTNNHelpers) add_dependencies(TTRuntimeTTNN TTRuntimeTTNNOps) diff --git a/runtime/lib/ttnn/include/CMakeLists.txt b/runtime/lib/ttnn/include/CMakeLists.txt new file mode 100644 index 0000000000..a5cbd0e08f --- /dev/null +++ b/runtime/lib/ttnn/include/CMakeLists.txt @@ -0,0 +1,16 @@ +add_library(TTRuntimeTTNNHelpers + STATIC + tt/runtime/ttnn/utils.cpp + tt/runtime/ttnn/types.cpp +) +set_property(TARGET TTRuntimeTTNNHelpers PROPERTY CXX_STANDARD 20) +target_compile_options(TTRuntimeTTNNHelpers PUBLIC -mavx -mavx2 -fsized-deallocation) +target_include_directories(TTRuntimeTTNNHelpers PUBLIC + ${PROJECT_SOURCE_DIR}/runtime/include + ${PROJECT_SOURCE_DIR}/runtime/lib/ttnn/include + ${PROJECT_SOURCE_DIR}/runtime/lib/ttnn/operations/include + ${PROJECT_BINARY_DIR}/include/ttmlir/Target/Common +) +target_include_directories(TTRuntimeTTNNHelpers SYSTEM PUBLIC "$") +add_dependencies(TTRuntimeTTNNHelpers TTNN_LIBRARY tt-metal FBS_GENERATION) +target_link_libraries(TTRuntimeTTNNHelpers PUBLIC TTNN_LIBRARY) diff --git a/runtime/lib/ttnn/include/tt/runtime/ttnn/types.cpp b/runtime/lib/ttnn/include/tt/runtime/ttnn/types.cpp new file mode 100644 index 0000000000..bac739c140 --- /dev/null +++ b/runtime/lib/ttnn/include/tt/runtime/ttnn/types.cpp @@ -0,0 +1,112 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "tt/runtime/ttnn/types.h" +#include "tt/runtime/detail/logger.h" +#include 
"tt/runtime/ttnn/utils.h" + +namespace tt::runtime::ttnn { + +// +// ProgramTensorPool APIs +// +std::pair::iterator, bool> +ProgramTensorPool::try_emplace(std::uint32_t globalId, + const ::ttnn::Tensor &tensor) { + auto it = liveTensors.find(globalId); + if (it != liveTensors.end()) { + return std::make_pair(it, false); + } + LOG_ASSERT(!intermedTensors.contains(globalId)); + intermedTensors.try_emplace(globalId, tensor); + return liveTensors.try_emplace(globalId, &intermedTensors.at(globalId)); +} + +std::pair::iterator, bool> +ProgramTensorPool::insert_or_assign(std::uint32_t globalId, + const ::ttnn::Tensor &tensor) { + intermedTensors.insert_or_assign(globalId, tensor); + return liveTensors.insert_or_assign(globalId, &intermedTensors.at(globalId)); +} + +::ttnn::Tensor &ProgramTensorPool::at(std::uint32_t globalId) { + LOG_ASSERT(liveTensors.contains(globalId)); + return *liveTensors.at(globalId); +} + +const ::ttnn::Tensor &ProgramTensorPool::at(std::uint32_t globalId) const { + LOG_ASSERT(liveTensors.contains(globalId)); + return *liveTensors.at(globalId); +} + +size_t ProgramTensorPool::erase(std::uint32_t globalId) { + LOG_ASSERT(liveTensors.contains(globalId) && + intermedTensors.contains(globalId)); + intermedTensors.erase(globalId); + return liveTensors.erase(globalId); +} + +std::vector ProgramTensorPool::gatherOutputTensors() { + std::vector outputTensors; + outputTensors.reserve(programOutputs.size()); + std::transform( + programOutputs.begin(), programOutputs.end(), + std::back_inserter(outputTensors), [this](uint32_t outputGlobalId) { + return utils::createRuntimeTensorFromTTNN(this->at(outputGlobalId)); + }); + return outputTensors; +} + +// +// ProgramContext APIs +// +ProgramContext::ProgramContext( + const std::unordered_map &liveTensors, + const std::vector &programInputs, + const std::vector &programOutputs, ::ttnn::MeshDevice *parentMesh) + : tensorPool(ProgramTensorPool(liveTensors, programInputs, programOutputs)), + parentMesh(parentMesh) { + LOG_ASSERT(parentMesh, "Parent mesh cannot be null"); +} + +void ProgramContext::addSubMesh(uint32_t meshId, + std::shared_ptr<::ttnn::MeshDevice> subMesh) { + auto [it, inserted] = subMeshes.try_emplace(meshId, subMesh); + LOG_ASSERT(inserted, "Submesh already exists"); +} + +::ttnn::MeshDevice &ProgramContext::getSubMesh(uint32_t meshId) { + LOG_ASSERT(subMeshes.contains(meshId)); + return *subMeshes.at(meshId); +} + +size_t ProgramContext::subMeshSize(uint32_t meshId) const { + LOG_ASSERT(subMeshes.contains(meshId)); + return subMeshes.at(meshId)->num_devices(); +} + +::ttnn::Device &ProgramContext::getDeviceFromSubMesh(uint32_t meshId, + int physicalDeviceId) { + LOG_ASSERT(subMeshes.contains(meshId)); + auto &subMesh = *subMeshes.at(meshId); + return *subMesh.get_device(physicalDeviceId); +} + +::ttnn::Device &ProgramContext::getDeviceIndexFromSubMesh(uint32_t meshId, + int deviceIndex) { + LOG_ASSERT(subMeshes.contains(meshId)); + auto &subMesh = *subMeshes.at(meshId); + return *subMesh.get_device_index(deviceIndex); +} + +DeviceVariant ProgramContext::getTargetDevice(uint32_t meshId) { + LOG_ASSERT(subMeshes.contains(meshId)); + auto &subMesh = *subMeshes.at(meshId); + if (subMesh.num_devices() == 1) { + return std::ref(*subMesh.get_device_index(0)); + } + return std::ref(subMesh); +} + +} // namespace tt::runtime::ttnn diff --git a/runtime/lib/ttnn/include/tt/runtime/ttnn/types.h b/runtime/lib/ttnn/include/tt/runtime/ttnn/types.h index 5cd08c7ed0..49ae3cda33 100644 --- 
diff --git a/runtime/lib/ttnn/include/tt/runtime/ttnn/types.h b/runtime/lib/ttnn/include/tt/runtime/ttnn/types.h
index 5cd08c7ed0..49ae3cda33 100644
--- a/runtime/lib/ttnn/include/tt/runtime/ttnn/types.h
+++ b/runtime/lib/ttnn/include/tt/runtime/ttnn/types.h
@@ -6,18 +6,37 @@
 #define TT_RUNTIME_TTNN_TYPES_H
 
 #include "tt/runtime/detail/ttnn.h"
+#include <optional>
+#include <unordered_map>
 
 namespace tt::runtime::ttnn {
-
-using TensorMap = std::unordered_map<uint32_t, ::ttnn::Tensor *>;
 using DeviceVariant = std::variant<std::reference_wrapper<::ttnn::Device>,
                                    std::reference_wrapper<::ttnn::MeshDevice>>;
 
+struct LayoutDesc {
+  ::ttnn::BufferType bufferType;
+  ::ttnn::Layout layout;
+  ::ttnn::DataType dataType;
+  std::optional<::ttnn::MemoryConfig> memoryConfig;
+
+  LayoutDesc(const ::ttnn::BufferType &bufferType, const ::ttnn::Layout &layout,
+             const ::ttnn::DataType &dataType,
+             const std::optional<::ttnn::MemoryConfig> &memoryConfig)
+      : bufferType(bufferType), layout(layout), dataType(dataType),
+        memoryConfig(memoryConfig) {}
+
+  bool isOnHost() const {
+    return bufferType == ::ttnn::BufferType::SYSTEM_MEMORY;
+  }
+  bool isOnDevice() const { return !isOnHost(); }
+};
+
 class ProgramTensorPool {
 public:
-  ProgramTensorPool(const TensorMap &liveTensors,
-                    const std::unordered_set<uint32_t> &programInputs,
-                    const std::unordered_set<uint32_t> &programOutputs)
+  ProgramTensorPool(
+      const std::unordered_map<std::uint32_t, ::ttnn::Tensor *> &liveTensors,
+      const std::vector<std::uint32_t> &programInputs,
+      const std::vector<std::uint32_t> &programOutputs)
       : programInputs(programInputs), programOutputs(programOutputs),
         liveTensors(liveTensors) {}
   ProgramTensorPool(const ProgramTensorPool &) = delete;
@@ -25,72 +44,38 @@ class ProgramTensorPool {
   ProgramTensorPool(ProgramTensorPool &&) = default;
   ProgramTensorPool &operator=(ProgramTensorPool &&) = default;
 
-  auto try_emplace(std::uint32_t globalId, const ::ttnn::Tensor &tensor) {
-    auto it = liveTensors.find(globalId);
-    if (it != liveTensors.end()) {
-      return std::make_pair(it, false);
-    }
-    assert(!intermedTensors.contains(globalId));
-    intermedTensors.try_emplace(globalId, tensor);
-    return liveTensors.try_emplace(globalId, &intermedTensors.at(globalId));
-  }
+  std::pair<std::unordered_map<std::uint32_t, ::ttnn::Tensor *>::iterator, bool>
+  try_emplace(std::uint32_t globalId, const ::ttnn::Tensor &tensor);
 
-  auto insert_or_assign(std::uint32_t globalId, const ::ttnn::Tensor &tensor) {
-    intermedTensors.insert_or_assign(globalId, tensor);
-    return liveTensors.insert_or_assign(globalId,
-                                        &intermedTensors.at(globalId));
-  }
+  std::pair<std::unordered_map<std::uint32_t, ::ttnn::Tensor *>::iterator, bool>
+  insert_or_assign(std::uint32_t globalId, const ::ttnn::Tensor &tensor);
 
-  ::ttnn::Tensor &at(std::uint32_t globalId) {
-    assert(liveTensors.contains(globalId));
-    return *liveTensors.at(globalId);
-  }
+  ::ttnn::Tensor &at(std::uint32_t globalId);
 
-  const ::ttnn::Tensor &at(std::uint32_t globalId) const {
-    assert(liveTensors.contains(globalId));
-    return *liveTensors.at(globalId);
-  }
+  const ::ttnn::Tensor &at(std::uint32_t globalId) const;
 
-  size_t erase(std::uint32_t globalId) {
-    assert(liveTensors.contains(globalId) &&
-           intermedTensors.contains(globalId));
-    intermedTensors.erase(globalId);
-    return liveTensors.erase(globalId);
-  }
+  size_t erase(std::uint32_t globalId);
 
-  void copyTensorToUserOutput(std::uint32_t outputGlobalId,
-                              const ::ttnn::Tensor &srcTensor) {
-    assert(liveTensors.contains(outputGlobalId));
-    assert(isUserOutput(outputGlobalId));
-    ::ttnn::Tensor &outputTensor = *liveTensors.at(outputGlobalId);
-    void *src = ::tt::tt_metal::get_raw_host_data_ptr(srcTensor);
-    void *dst = ::tt::tt_metal::get_raw_host_data_ptr(outputTensor);
-    size_t size = outputTensor.volume() * outputTensor.element_size();
-    std::memcpy(dst, src, size);
-  }
+  std::vector<Tensor> gatherOutputTensors();
 
   bool contains(std::uint32_t globalId) const {
     return liveTensors.contains(globalId);
   }
 
-  bool isUserOutput(std::uint32_t globalId) const {
-    return programOutputs.contains(globalId);
-  }
-
-  const std::unordered_set<uint32_t> &getProgramInputs() const {
+  const std::vector<std::uint32_t> &getProgramInputs() const {
     return programInputs;
   }
 
-  const std::unordered_set<uint32_t> &getProgramOutputs() const {
+  const std::vector<std::uint32_t> &getProgramOutputs() const {
     return programOutputs;
   }
 
 private:
-  std::unordered_set<uint32_t> programInputs;
-  std::unordered_set<uint32_t> programOutputs;
+  std::vector<std::uint32_t> programInputs;
+  std::vector<std::uint32_t> programOutputs;
   // A superset of intermedTensors, containing pointers to all tensors created
-  // by the program and the input/output tensors passed in by the user
-  TensorMap liveTensors;
+  // by the program and the input tensors passed in by the user
+  std::unordered_map<std::uint32_t, ::ttnn::Tensor *> liveTensors;
 
   // A subset of liveTensors, containing values of any intermediate tensors
   // created by the program
@@ -99,15 +84,11 @@ class ProgramTensorPool {
 
 class ProgramContext {
 public:
-  ProgramContext(const TensorMap &liveTensors,
-                 const std::unordered_set<uint32_t> &programInputs,
-                 const std::unordered_set<uint32_t> &programOutputs,
-                 ::ttnn::MeshDevice *parentMesh)
-      : tensorPool(
-            ProgramTensorPool(liveTensors, programInputs, programOutputs)),
-        parentMesh(parentMesh) {
-    assert(parentMesh && "Parent mesh cannot be null");
-  }
+  ProgramContext(
+      const std::unordered_map<std::uint32_t, ::ttnn::Tensor *> &liveTensors,
+      const std::vector<std::uint32_t> &programInputs,
+      const std::vector<std::uint32_t> &programOutputs,
+      ::ttnn::MeshDevice *parentMesh);
   ProgramContext(const ProgramContext &) = delete;
   ProgramContext &operator=(const ProgramContext &) = delete;
   ProgramContext(ProgramContext &&) = default;
@@ -125,42 +106,17 @@ class ProgramContext {
   //
   // Sub Mesh Operations
   //
-  void addSubMesh(uint32_t meshId,
-                  std::shared_ptr<::ttnn::MeshDevice> subMesh) {
-    auto [it, inserted] = subMeshes.try_emplace(meshId, subMesh);
-    assert(inserted && "Submesh already exists");
-  }
+  void addSubMesh(uint32_t meshId, std::shared_ptr<::ttnn::MeshDevice> subMesh);
 
-  ::ttnn::MeshDevice &getSubMesh(uint32_t meshId) {
-    assert(subMeshes.contains(meshId));
-    return *subMeshes.at(meshId);
-  }
+  ::ttnn::MeshDevice &getSubMesh(uint32_t meshId);
 
-  size_t subMeshSize(uint32_t meshId) const {
-    assert(subMeshes.contains(meshId));
-    return subMeshes.at(meshId)->num_devices();
-  }
+  size_t subMeshSize(uint32_t meshId) const;
 
-  ::ttnn::Device &getDeviceFromSubMesh(uint32_t meshId, int physicalDeviceId) {
-    assert(subMeshes.contains(meshId));
-    auto &subMesh = *subMeshes.at(meshId);
-    return *subMesh.get_device(physicalDeviceId);
-  }
+  ::ttnn::Device &getDeviceFromSubMesh(uint32_t meshId, int physicalDeviceId);
 
-  ::ttnn::Device &getDeviceIndexFromSubMesh(uint32_t meshId, int deviceIndex) {
-    assert(subMeshes.contains(meshId));
-    auto &subMesh = *subMeshes.at(meshId);
-    return *subMesh.get_device_index(deviceIndex);
-  }
+  ::ttnn::Device &getDeviceIndexFromSubMesh(uint32_t meshId, int deviceIndex);
 
-  DeviceVariant getTargetDevice(uint32_t meshId) {
-    assert(subMeshes.contains(meshId));
-    auto &subMesh = *subMeshes.at(meshId);
-    if (subMesh.num_devices() == 1) {
-      return std::ref(*subMesh.get_device_index(0));
-    }
-    return std::ref(subMesh);
-  }
+  DeviceVariant getTargetDevice(uint32_t meshId);
 
   //
   // Tensor Pool Operations
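For reference, a minimal construction of the new `LayoutDesc` (the values below are invented, not read from a flatbuffer; a real instance would be derived from the binary's memory descriptor for a given program input):

```cpp
#include <optional>

tt::runtime::ttnn::LayoutDesc makeExampleDesc() {
  tt::runtime::ttnn::LayoutDesc desc(::ttnn::BufferType::DRAM,
                                     ::ttnn::Layout::TILE,
                                     ::ttnn::DataType::BFLOAT16,
                                     /*memoryConfig=*/std::nullopt);
  // Anything whose buffer type is not SYSTEM_MEMORY counts as on-device.
  bool onDevice = desc.isOnDevice(); // true
  (void)onDevice;
  return desc;
}
```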
diff --git a/runtime/lib/ttnn/include/tt/runtime/ttnn/utils.cpp b/runtime/lib/ttnn/include/tt/runtime/ttnn/utils.cpp
new file mode 100644
index 0000000000..914c80cf38
--- /dev/null
+++ b/runtime/lib/ttnn/include/tt/runtime/ttnn/utils.cpp
@@ -0,0 +1,216 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "tt/runtime/ttnn/utils.h"
+#include "tt/runtime/detail/logger.h"
+
+namespace tt::runtime::ttnn::utils {
+
+// TODO (bug #701)
+// Currently the memory layout/location in flatbuffer is incorrect
+// These methods are workarounds for operations such that we query the info
+// directly from the TTNN tensor. Ideally, we should be able to get all of this
+// info directly from the flatbuffer
+bool isOnHost(const ::ttnn::StorageType &storageType) {
+  return storageType == ::tt::tt_metal::StorageType::BORROWED or
+         storageType == ::tt::tt_metal::StorageType::OWNED or
+         storageType == ::tt::tt_metal::StorageType::MULTI_DEVICE_HOST;
+}
+
+bool isOnDevice(const ::ttnn::StorageType &storageType) {
+  return storageType == ::tt::tt_metal::StorageType::DEVICE or
+         storageType == ::tt::tt_metal::StorageType::MULTI_DEVICE;
+}
+
+bool isValidTileShape(const ::tt::target::Dim2d *shape) {
+  return (shape->x() == 1 and shape->y() == 1) or
+         (shape->x() == 32 and shape->y() == 32);
+}
+
+::ttnn::DataType toTTNNDataType(::tt::target::DataType dataType) {
+  switch (dataType) {
+  case ::tt::target::DataType::Float32:
+    return ::ttnn::DataType::FLOAT32;
+  case ::tt::target::DataType::BFloat16:
+    return ::ttnn::DataType::BFLOAT16;
+  case ::tt::target::DataType::BFP_BFloat8:
+    return ::ttnn::DataType::BFLOAT8_B;
+  case ::tt::target::DataType::BFP_BFloat4:
+    return ::ttnn::DataType::BFLOAT4_B;
+  case ::tt::target::DataType::UInt32:
+    return ::ttnn::DataType::UINT32;
+  case ::tt::target::DataType::UInt16:
+    return ::ttnn::DataType::UINT16;
+
+  default:
+    LOG_FATAL("Unsupported data type");
+  }
+}
+
+::tt::target::DataType fromTTNNDataType(::ttnn::DataType dataType) {
+  switch (dataType) {
+  case ::ttnn::DataType::FLOAT32:
+    return ::tt::target::DataType::Float32;
+  case ::ttnn::DataType::BFLOAT16:
+    return ::tt::target::DataType::BFloat16;
+  case ::ttnn::DataType::BFLOAT8_B:
+    return ::tt::target::DataType::BFP_BFloat8;
+  case ::ttnn::DataType::BFLOAT4_B:
+    return ::tt::target::DataType::BFP_BFloat4;
+  case ::ttnn::DataType::UINT32:
+    return ::tt::target::DataType::UInt32;
+  case ::ttnn::DataType::UINT16:
+    return ::tt::target::DataType::UInt16;
+
+  default:
+    LOG_FATAL("Unsupported data type");
+  }
+}
+
+::ttnn::Layout toTTNNLayout(::tt::target::TensorLayout layout) {
+  switch (layout) {
+  case ::tt::target::TensorLayout::Tile:
+    return ::ttnn::Layout::TILE;
+  case ::tt::target::TensorLayout::RowMajor:
+    return ::ttnn::Layout::ROW_MAJOR;
+  default:
+    LOG_FATAL("Unsupported layout");
+  }
+}
+
+::ttnn::TensorMemoryLayout
+toTTNNTensorMemoryLayout(::tt::target::TensorMemoryLayout tensorMemoryLayout) {
+
+  switch (tensorMemoryLayout) {
+  case ::tt::target::TensorMemoryLayout::Interleaved:
+    return ::ttnn::TensorMemoryLayout::INTERLEAVED;
+  case ::tt::target::TensorMemoryLayout::SingleBank:
+    return ::ttnn::TensorMemoryLayout::SINGLE_BANK;
+  case ::tt::target::TensorMemoryLayout::HeightSharded:
+    return ::ttnn::TensorMemoryLayout::HEIGHT_SHARDED;
+  case ::tt::target::TensorMemoryLayout::WidthSharded:
+    return ::ttnn::TensorMemoryLayout::WIDTH_SHARDED;
+  case ::tt::target::TensorMemoryLayout::BlockSharded:
+    return ::ttnn::TensorMemoryLayout::BLOCK_SHARDED;
+  case ::tt::target::TensorMemoryLayout::None:
+    LOG_FATAL("Unsupported tensor memory layout None");
+  }
+}
+
+// This method will be deprecated in favor of method below
+//
+::tt::tt_metal::BufferType
+toTTNNBufferType(::tt::target::MemorySpace memorySpace) {
+  switch (memorySpace) {
+  case ::tt::target::MemorySpace::System:
+  case ::tt::target::MemorySpace::SystemMMIO:
+    return ::tt::tt_metal::BufferType::SYSTEM_MEMORY;
+  case ::tt::target::MemorySpace::DeviceDRAM:
+    return ::tt::tt_metal::BufferType::DRAM;
+  case ::tt::target::MemorySpace::DeviceL1:
+    return ::tt::tt_metal::BufferType::L1;
+  }
+}
+
+// Prefer to use this method
+//
+::ttnn::BufferType toTTNNBufferType(::tt::target::BufferType bufferType) {
+
+  switch (bufferType) {
+  case ::tt::target::BufferType::DRAM:
+    return ::ttnn::BufferType::DRAM;
+  case ::tt::target::BufferType::L1:
+    return ::ttnn::BufferType::L1;
+  case ::tt::target::BufferType::SystemMemory:
+    return ::ttnn::BufferType::SYSTEM_MEMORY;
+  case ::tt::target::BufferType::L1Small:
+    return ::ttnn::BufferType::L1_SMALL;
+  case ::tt::target::BufferType::Trace:
+    return ::ttnn::BufferType::TRACE;
+  }
+};
+
+std::vector<uint32_t>
+toShapeFromFBShape(const flatbuffers::Vector<int32_t> &vec) {
+  return std::vector<uint32_t>(vec.begin(), vec.end());
+}
+
+::ttnn::Layout
+inferLayoutFromTileShape(const ::tt::target::TensorRef *tensorRef) {
+  const ::tt::target::Dim2d *tileShape =
+      tensorRef->desc()->layout()->memory_desc()->tile_shape();
+  LOG_ASSERT(isValidTileShape(tileShape));
+  if (tileShape->x() == 1 and tileShape->y() == 1) {
+    return ::ttnn::Layout::ROW_MAJOR;
+  }
+  return ::ttnn::Layout::TILE;
+}
+
+CoreRangeSet
+toCoreRangeSet(const ::flatbuffers::Vector<const tt::target::Dim2dRange *>
+                   *coreRangeSet) {
+  std::set<CoreRange> coreRanges;
+  for (::tt::target::Dim2dRange const *coreRange : *coreRangeSet) {
+    CoreCoord start(coreRange->loc().x(), coreRange->loc().y());
+    // End is inclusive
+    CoreCoord end(coreRange->loc().x() + coreRange->size().x() - 1,
+                  coreRange->loc().y() + coreRange->size().y() - 1);
+
+    coreRanges.emplace(start, end);
+  }
+  return CoreRangeSet(coreRanges);
+}
+
+::tt::tt_metal::MemoryConfig
+createMemoryConfig(const ::tt::target::TensorRef *tensorRef) {
+  const ::tt::target::LayoutDesc *layout = tensorRef->desc()->layout();
+  const ::tt::target::TensorMemoryLayout targetMemoryLayout =
+      layout->memory_desc()->memory_layout();
+  const ::tt::target::MemorySpace targetMemorySpace =
+      layout->memory_desc()->memory_space();
+  const ::flatbuffers::Vector<const ::tt::target::Dim2dRange *>
+      *targetCoreRangeSet = layout->core_range_set();
+  const ::flatbuffers::Vector<int32_t> *targetShardShape =
+      layout->memory_desc()->shape();
+  const ::tt::target::Dim2d *tileShape = layout->memory_desc()->tile_shape();
+
+  LOG_ASSERT(targetCoreRangeSet->size() == 1,
+             "Currently only single core range/grid is supported");
+
+  LOG_ASSERT(targetShardShape->size() == 2,
+             "Only 2D shard shape is supported in TTNN backend");
+
+  LOG_ASSERT(::tt::runtime::ttnn::utils::isValidTileShape(tileShape),
+             "Invalid tile shape");
+
+  CoreRangeSet ttnnCoreRangeSet = toCoreRangeSet(targetCoreRangeSet);
+  std::array<uint32_t, 2> ttnnShardShape;
+  std::copy(targetShardShape->begin(), targetShardShape->end(),
+            ttnnShardShape.begin());
+
+  ttnnShardShape[0] *= tileShape->y();
+  ttnnShardShape[1] *= tileShape->x();
+
+  ::tt::tt_metal::ShardSpec shardSpec(
+      ttnnCoreRangeSet, ttnnShardShape,
+      ::tt::tt_metal::ShardOrientation::ROW_MAJOR, false);
+
+  ::tt::tt_metal::TensorMemoryLayout ttnnMemLayout =
+      toTTNNTensorMemoryLayout(targetMemoryLayout);
+
+  ::tt::tt_metal::BufferType ttnnBufferType =
+      toTTNNBufferType(targetMemorySpace);
+
+  ::tt::tt_metal::MemoryConfig memoryConfig{.memory_layout = ttnnMemLayout,
+                                            .buffer_type = ttnnBufferType,
+                                            .shard_spec = shardSpec};
+  return memoryConfig;
+}
std::make_shared<::ttnn::Tensor>(tensor); + return Tensor(std::static_pointer_cast(tensorPtr), nullptr, + DeviceRuntime::TTNN); +} + +} // namespace tt::runtime::ttnn::utils diff --git a/runtime/lib/ttnn/include/tt/runtime/ttnn/utils.h b/runtime/lib/ttnn/include/tt/runtime/ttnn/utils.h index ca50ad58b3..353195b8df 100644 --- a/runtime/lib/ttnn/include/tt/runtime/ttnn/utils.h +++ b/runtime/lib/ttnn/include/tt/runtime/ttnn/utils.h @@ -6,126 +6,50 @@ #define TT_RUNTIME_TTNN_UTILS_H #include "flatbuffers/vector.h" +#include "tt/runtime/detail/ttnn.h" #include "ttmlir/Target/Common/types_generated.h" #include "ttmlir/Target/TTNN/Target.h" -#include "ttnn/types.hpp" namespace tt::runtime::ttnn::utils { -inline bool isValidTileShape(const ::tt::target::Dim2d *shape) { - return (shape->x() == 1 and shape->y() == 1) or - (shape->x() == 32 and shape->y() == 32); -} - -inline ::ttnn::DataType toTTNNDataType(::tt::target::DataType dataType) { - switch (dataType) { - case ::tt::target::DataType::Float32: - return ::ttnn::DataType::FLOAT32; - case ::tt::target::DataType::BFloat16: - return ::ttnn::DataType::BFLOAT16; - case ::tt::target::DataType::BFP_BFloat8: - return ::ttnn::DataType::BFLOAT8_B; - case ::tt::target::DataType::BFP_BFloat4: - return ::ttnn::DataType::BFLOAT4_B; - case ::tt::target::DataType::UInt32: - return ::ttnn::DataType::UINT32; - case ::tt::target::DataType::UInt16: - return ::ttnn::DataType::UINT16; - - default: - throw std::runtime_error("Unsupported data type"); - } -} - -inline ::tt::target::DataType fromTTNNDataType(::ttnn::DataType dataType) { - switch (dataType) { - case ::ttnn::DataType::FLOAT32: - return ::tt::target::DataType::Float32; - case ::ttnn::DataType::BFLOAT16: - return ::tt::target::DataType::BFloat16; - case ::ttnn::DataType::BFLOAT8_B: - return ::tt::target::DataType::BFP_BFloat8; - case ::ttnn::DataType::BFLOAT4_B: - return ::tt::target::DataType::BFP_BFloat4; - case ::ttnn::DataType::UINT32: - return ::tt::target::DataType::UInt32; - case ::ttnn::DataType::UINT16: - return ::tt::target::DataType::UInt16; - - default: - throw std::runtime_error("Unsupported data type"); - } -} - -inline ::ttnn::Layout toTTNNLayout(::tt::target::TensorLayout layout) { - switch (layout) { - case ::tt::target::TensorLayout::Tile: - return ::ttnn::Layout::TILE; - case ::tt::target::TensorLayout::RowMajor: - return ::ttnn::Layout::ROW_MAJOR; - default: - throw std::runtime_error("Unsupported layout"); - } -} - -inline ::ttnn::TensorMemoryLayout -toTTNNTensorMemoryLayout(::tt::target::TensorMemoryLayout tensorMemoryLayout) { - - switch (tensorMemoryLayout) { - case ::tt::target::TensorMemoryLayout::Interleaved: - return ::ttnn::TensorMemoryLayout::INTERLEAVED; - case ::tt::target::TensorMemoryLayout::SingleBank: - return ::ttnn::TensorMemoryLayout::SINGLE_BANK; - case ::tt::target::TensorMemoryLayout::HeightSharded: - return ::ttnn::TensorMemoryLayout::HEIGHT_SHARDED; - case ::tt::target::TensorMemoryLayout::WidthSharded: - return ::ttnn::TensorMemoryLayout::WIDTH_SHARDED; - case ::tt::target::TensorMemoryLayout::BlockSharded: - return ::ttnn::TensorMemoryLayout::BLOCK_SHARDED; - case ::tt::target::TensorMemoryLayout::None: - assert(false && - "Unsupported tensor memory layout TensorMemoryLayout::None"); - } -} +bool isOnHost(const ::ttnn::StorageType &storageType); + +bool isOnDevice(const ::ttnn::StorageType &storageType); + +bool isValidTileShape(const ::tt::target::Dim2d *shape); + +::ttnn::DataType toTTNNDataType(::tt::target::DataType dataType); + +::tt::target::DataType 
diff --git a/runtime/lib/ttnn/include/tt/runtime/ttnn/utils.h b/runtime/lib/ttnn/include/tt/runtime/ttnn/utils.h
index ca50ad58b3..353195b8df 100644
--- a/runtime/lib/ttnn/include/tt/runtime/ttnn/utils.h
+++ b/runtime/lib/ttnn/include/tt/runtime/ttnn/utils.h
@@ -6,126 +6,50 @@
 #define TT_RUNTIME_TTNN_UTILS_H
 
 #include "flatbuffers/vector.h"
+#include "tt/runtime/detail/ttnn.h"
 #include "ttmlir/Target/Common/types_generated.h"
 #include "ttmlir/Target/TTNN/Target.h"
-#include "ttnn/types.hpp"
 
 namespace tt::runtime::ttnn::utils {
 
-inline bool isValidTileShape(const ::tt::target::Dim2d *shape) {
-  return (shape->x() == 1 and shape->y() == 1) or
-         (shape->x() == 32 and shape->y() == 32);
-}
-
-inline ::ttnn::DataType toTTNNDataType(::tt::target::DataType dataType) {
-  switch (dataType) {
-  case ::tt::target::DataType::Float32:
-    return ::ttnn::DataType::FLOAT32;
-  case ::tt::target::DataType::BFloat16:
-    return ::ttnn::DataType::BFLOAT16;
-  case ::tt::target::DataType::BFP_BFloat8:
-    return ::ttnn::DataType::BFLOAT8_B;
-  case ::tt::target::DataType::BFP_BFloat4:
-    return ::ttnn::DataType::BFLOAT4_B;
-  case ::tt::target::DataType::UInt32:
-    return ::ttnn::DataType::UINT32;
-  case ::tt::target::DataType::UInt16:
-    return ::ttnn::DataType::UINT16;
-
-  default:
-    throw std::runtime_error("Unsupported data type");
-  }
-}
-
-inline ::tt::target::DataType fromTTNNDataType(::ttnn::DataType dataType) {
-  switch (dataType) {
-  case ::ttnn::DataType::FLOAT32:
-    return ::tt::target::DataType::Float32;
-  case ::ttnn::DataType::BFLOAT16:
-    return ::tt::target::DataType::BFloat16;
-  case ::ttnn::DataType::BFLOAT8_B:
-    return ::tt::target::DataType::BFP_BFloat8;
-  case ::ttnn::DataType::BFLOAT4_B:
-    return ::tt::target::DataType::BFP_BFloat4;
-  case ::ttnn::DataType::UINT32:
-    return ::tt::target::DataType::UInt32;
-  case ::ttnn::DataType::UINT16:
-    return ::tt::target::DataType::UInt16;
-
-  default:
-    throw std::runtime_error("Unsupported data type");
-  }
-}
-
-inline ::ttnn::Layout toTTNNLayout(::tt::target::TensorLayout layout) {
-  switch (layout) {
-  case ::tt::target::TensorLayout::Tile:
-    return ::ttnn::Layout::TILE;
-  case ::tt::target::TensorLayout::RowMajor:
-    return ::ttnn::Layout::ROW_MAJOR;
-  default:
-    throw std::runtime_error("Unsupported layout");
-  }
-}
-
-inline ::ttnn::TensorMemoryLayout
-toTTNNTensorMemoryLayout(::tt::target::TensorMemoryLayout tensorMemoryLayout) {
-
-  switch (tensorMemoryLayout) {
-  case ::tt::target::TensorMemoryLayout::Interleaved:
-    return ::ttnn::TensorMemoryLayout::INTERLEAVED;
-  case ::tt::target::TensorMemoryLayout::SingleBank:
-    return ::ttnn::TensorMemoryLayout::SINGLE_BANK;
-  case ::tt::target::TensorMemoryLayout::HeightSharded:
-    return ::ttnn::TensorMemoryLayout::HEIGHT_SHARDED;
-  case ::tt::target::TensorMemoryLayout::WidthSharded:
-    return ::ttnn::TensorMemoryLayout::WIDTH_SHARDED;
-  case ::tt::target::TensorMemoryLayout::BlockSharded:
-    return ::ttnn::TensorMemoryLayout::BLOCK_SHARDED;
-  case ::tt::target::TensorMemoryLayout::None:
-    assert(false &&
-           "Unsupported tensor memory layout TensorMemoryLayout::None");
-  }
-}
+bool isOnHost(const ::ttnn::StorageType &storageType);
+
+bool isOnDevice(const ::ttnn::StorageType &storageType);
+
+bool isValidTileShape(const ::tt::target::Dim2d *shape);
+
+::ttnn::DataType toTTNNDataType(::tt::target::DataType dataType);
+
+::tt::target::DataType fromTTNNDataType(::ttnn::DataType dataType);
+
+::ttnn::Layout toTTNNLayout(::tt::target::TensorLayout layout);
+
+::ttnn::TensorMemoryLayout
+toTTNNTensorMemoryLayout(::tt::target::TensorMemoryLayout tensorMemoryLayout);
 
 // This method will be deprecated in favor of method below
 //
-inline ::tt::tt_metal::BufferType
-toTTNNBufferType(::tt::target::MemorySpace memorySpace) {
-  switch (memorySpace) {
-  case ::tt::target::MemorySpace::System:
-  case ::tt::target::MemorySpace::SystemMMIO:
-    return ::tt::tt_metal::BufferType::SYSTEM_MEMORY;
-  case ::tt::target::MemorySpace::DeviceDRAM:
-    return ::tt::tt_metal::BufferType::DRAM;
-  case ::tt::target::MemorySpace::DeviceL1:
-    return ::tt::tt_metal::BufferType::L1;
-  }
-}
+::tt::tt_metal::BufferType
+toTTNNBufferType(::tt::target::MemorySpace memorySpace);
 
 // Prefer to use this method
 //
-inline ::ttnn::BufferType
-toTTNNBufferType(::tt::target::BufferType bufferType) {
-
-  switch (bufferType) {
-  case ::tt::target::BufferType::DRAM:
-    return ::ttnn::BufferType::DRAM;
-  case ::tt::target::BufferType::L1:
-    return ::ttnn::BufferType::L1;
-  case ::tt::target::BufferType::SystemMemory:
-    return ::ttnn::BufferType::SYSTEM_MEMORY;
-  case ::tt::target::BufferType::L1Small:
-    return ::ttnn::BufferType::L1_SMALL;
-  case ::tt::target::BufferType::Trace:
-    return ::ttnn::BufferType::TRACE;
-  }
-};
-
-inline std::vector<uint32_t>
-toShapeFromFBShape(const flatbuffers::Vector<int32_t> &vec) {
-  return std::vector<uint32_t>(vec.begin(), vec.end());
-}
+::ttnn::BufferType toTTNNBufferType(::tt::target::BufferType bufferType);
+
+std::vector<uint32_t>
+toShapeFromFBShape(const flatbuffers::Vector<int32_t> &vec);
+
+::ttnn::Layout
+inferLayoutFromTileShape(const ::tt::target::TensorRef *tensorRef);
+
+CoreRangeSet
+toCoreRangeSet(const ::flatbuffers::Vector<const tt::target::Dim2dRange *>
+                   *coreRangeSet);
+
+::tt::tt_metal::MemoryConfig
+createMemoryConfig(const ::tt::target::TensorRef *tensorRef);
+
+Tensor createRuntimeTensorFromTTNN(const ::ttnn::Tensor &tensor);
 
 } // namespace tt::runtime::ttnn::utils
diff --git a/runtime/lib/ttnn/operations/CMakeLists.txt b/runtime/lib/ttnn/operations/CMakeLists.txt
index 4edc4780b9..df32e2c759 100644
--- a/runtime/lib/ttnn/operations/CMakeLists.txt
+++ b/runtime/lib/ttnn/operations/CMakeLists.txt
@@ -45,12 +45,13 @@ target_include_directories(TTRuntimeTTNNOps PUBLIC
   ${PROJECT_SOURCE_DIR}/runtime/lib/ttnn/operations/include
   ${PROJECT_BINARY_DIR}/include/ttmlir/Target/Common
 )
+
 target_include_directories(TTRuntimeTTNNOps SYSTEM PUBLIC "$<TARGET_PROPERTY:Metalium::Metal,INTERFACE_INCLUDE_DIRECTORIES>")
-target_link_libraries(TTRuntimeTTNNOps PUBLIC TTNN_LIBRARY)
+target_link_libraries(TTRuntimeTTNNOps PUBLIC TTNN_LIBRARY TTRuntimeTTNNHelpers)
 
 if (TT_RUNTIME_ENABLE_PERF_TRACE)
   target_link_libraries(TTRuntimeTTNNOps PUBLIC TRACY_LIBRARY)
 endif()
 
-add_dependencies(TTRuntimeTTNNOps TTNN_LIBRARY tt-metal FBS_GENERATION)
+add_dependencies(TTRuntimeTTNNOps TTNN_LIBRARY tt-metal FBS_GENERATION TTRuntimeTTNNHelpers)
num_links = op->num_links(); ::tt::tt_metal::MemoryConfig outputMemoryConfig = - utils::createMemoryConfig(op->out()); + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); ::ttnn::Tensor out = ::ttnn::all_gather(input, dim, num_links, outputMemoryConfig); tensorPool.insert_or_assign(op->out()->global_id(), out); diff --git a/runtime/lib/ttnn/operations/conv/conv2d.cpp b/runtime/lib/ttnn/operations/conv/conv2d.cpp index e6670c1131..5e00b929e7 100644 --- a/runtime/lib/ttnn/operations/conv/conv2d.cpp +++ b/runtime/lib/ttnn/operations/conv/conv2d.cpp @@ -6,6 +6,7 @@ #include "tt/runtime/detail/logger.h" #include "tt/runtime/detail/ttnn.h" #include "tt/runtime/ttnn/operations/utils.h" +#include "tt/runtime/ttnn/utils.h" #include "ttmlir/Target/TTNN/program_generated.h" #include "ttnn/types.hpp" @@ -23,7 +24,8 @@ void run(const ::tt::target::ttnn::Conv2dOp *op, ProgramContext &context) { auto config = ::ttnn::operations::conv::conv2d::Conv2dConfig(); config.dtype = utils::getDataType(op->input()); config.weights_dtype = utils::getDataType(op->weight()); - ::ttnn::MemoryConfig outMemConfig = utils::createMemoryConfig(op->out()); + ::ttnn::MemoryConfig outMemConfig = + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); DeviceVariant targetDevice = context.getTargetDevice(op->device()->global_id()); ::ttnn::Tensor out = std::visit( diff --git a/runtime/lib/ttnn/operations/creation/empty.cpp b/runtime/lib/ttnn/operations/creation/empty.cpp index bed68e0f14..d504a798b2 100644 --- a/runtime/lib/ttnn/operations/creation/empty.cpp +++ b/runtime/lib/ttnn/operations/creation/empty.cpp @@ -62,11 +62,12 @@ createEmptyOnMultiDevice(ProgramContext &context, EmptyTensorConfig &config, ::tt::tt_metal::DistributedTensorConfig strategy = config.distributedTensorConfig(); std::vector<::ttnn::Tensor> tensorShards; - tensorShards.resize(config.numShards); - std::generate_n( - tensorShards.begin(), config.numShards, [&config]() -> ::ttnn::Tensor { - return ::ttnn::zeros(config.shape, config.dtype, config.layout); - }); + tensorShards.reserve(config.numShards); + std::generate_n(std::back_inserter(tensorShards), config.numShards, + [&config]() -> ::ttnn::Tensor { + return ::ttnn::zeros(config.shape, config.dtype, + config.layout); + }); ::ttnn::Tensor out = ::ttnn::distributed::api::create_multi_device_tensor( tensorShards, ::tt::tt_metal::StorageType::MULTI_DEVICE_HOST, strategy); if (deviceRef) { @@ -101,6 +102,6 @@ void run(const ::tt::target::ttnn::EmptyOp *op, ProgramContext &context) { } else { LOG_FATAL("Unsupported num shards"); } - utils::updateTensorPool(tensorPool, out, op->out()->global_id()); + tensorPool.insert_or_assign(op->out()->global_id(), out); } } // namespace tt::runtime::ttnn::operations::creation diff --git a/runtime/lib/ttnn/operations/creation/full.cpp b/runtime/lib/ttnn/operations/creation/full.cpp index 6a224f935d..7f6a6c0b6f 100644 --- a/runtime/lib/ttnn/operations/creation/full.cpp +++ b/runtime/lib/ttnn/operations/creation/full.cpp @@ -26,7 +26,7 @@ struct FullTensorConfig { fillValue(op->fill_value()), numShards(op->num_shards()), strategy(op->strategy()) { - layout = utils::inferLayoutFromTileShape(op->out()); + layout = ::tt::runtime::ttnn::utils::inferLayoutFromTileShape(op->out()); // TODO(bug #272), determine correct layout by tile shape in the future // currently tile shape is not set correctly, so as a workaround, hardcode @@ -42,8 +42,7 @@ struct FullTensorConfig { } if (!utils::inSystemMemory(op->out())) { - memoryConfig = - 
::tt::runtime::ttnn::operations::utils::createMemoryConfig(op->out()); + memoryConfig = ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); } validate(); } @@ -72,8 +71,8 @@ createFullOnMultiDevice(ProgramContext &context, FullTensorConfig &config, ::tt::tt_metal::DistributedTensorConfig strategy = config.distributedTensorConfig(); std::vector<::ttnn::Tensor> tensorShards; - tensorShards.resize(config.numShards); - std::generate_n(tensorShards.begin(), config.numShards, + tensorShards.reserve(config.numShards); + std::generate_n(std::back_inserter(tensorShards), config.numShards, [&config]() -> ::ttnn::Tensor { return ::ttnn::full(config.shape, config.fillValue, config.dtype, config.layout); @@ -116,6 +115,6 @@ void run(const ::tt::target::ttnn::FullOp *op, ProgramContext &context) { } else { LOG_FATAL("Unsupported num shards"); } - utils::updateTensorPool(tensorPool, out, op->out()->global_id()); + tensorPool.insert_or_assign(op->out()->global_id(), out); } } // namespace tt::runtime::ttnn::operations::creation diff --git a/runtime/lib/ttnn/operations/data_movement/transpose.cpp b/runtime/lib/ttnn/operations/data_movement/transpose.cpp index ef8dcf1b13..c86c0ee10a 100644 --- a/runtime/lib/ttnn/operations/data_movement/transpose.cpp +++ b/runtime/lib/ttnn/operations/data_movement/transpose.cpp @@ -6,6 +6,7 @@ #include "tt/runtime/detail/logger.h" #include "tt/runtime/detail/ttnn.h" #include "tt/runtime/ttnn/operations/utils.h" +#include "tt/runtime/ttnn/utils.h" namespace tt::runtime::ttnn::operations::data_movement { void run(const ::tt::target::ttnn::TransposeOp *op, ProgramContext &context) { @@ -15,7 +16,7 @@ void run(const ::tt::target::ttnn::TransposeOp *op, ProgramContext &context) { int32_t dim0 = op->dim0(); int32_t dim1 = op->dim1(); ::tt::tt_metal::MemoryConfig outputMemoryConfig = - utils::createMemoryConfig(op->out()); + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); ::ttnn::Tensor out = ::ttnn::transpose(in, dim0, dim1, outputMemoryConfig); tensorPool.insert_or_assign(op->out()->global_id(), out); } diff --git a/runtime/lib/ttnn/operations/deletion/deallocate.cpp b/runtime/lib/ttnn/operations/deletion/deallocate.cpp index 6204945b34..e871a9ea64 100644 --- a/runtime/lib/ttnn/operations/deletion/deallocate.cpp +++ b/runtime/lib/ttnn/operations/deletion/deallocate.cpp @@ -11,13 +11,6 @@ void run(const ::tt::target::ttnn::DeallocateOp *op, ProgramContext &context) { ::ttnn::Tensor &tensor = tensorPool.at(op->in()->global_id()); DEBUG_ASSERT(tensor.is_allocated()); ::ttnn::deallocate(tensor, op->force()); - - // The tensor should be deallocated after the deallocate call. - // Still this assert may be hit in the future for multidevice/async ttnn - // support. In that case, we will reevaluate the assert/dealloc behaviour and - // adjust it accordingly. 
diff --git a/runtime/lib/ttnn/operations/deletion/deallocate.cpp b/runtime/lib/ttnn/operations/deletion/deallocate.cpp
index 6204945b34..e871a9ea64 100644
--- a/runtime/lib/ttnn/operations/deletion/deallocate.cpp
+++ b/runtime/lib/ttnn/operations/deletion/deallocate.cpp
@@ -11,13 +11,6 @@ void run(const ::tt::target::ttnn::DeallocateOp *op, ProgramContext &context) {
   ::ttnn::Tensor &tensor = tensorPool.at(op->in()->global_id());
   DEBUG_ASSERT(tensor.is_allocated());
   ::ttnn::deallocate(tensor, op->force());
-
-  // The tensor should be deallocated after the deallocate call.
-  // Still this assert may be hit in the future for multidevice/async ttnn
-  // support. In that case, we will reevaluate the assert/dealloc behaviour and
-  // adjust it accordingly.
-  //
-  DEBUG_ASSERT(!tensor.is_allocated());
   tensorPool.erase(op->in()->global_id());
 }
 } // namespace tt::runtime::ttnn::operations::deletion
diff --git a/runtime/lib/ttnn/operations/eltwise/binary/binary.cpp b/runtime/lib/ttnn/operations/eltwise/binary/binary.cpp
index 5913971192..ff47bdcdd8 100644
--- a/runtime/lib/ttnn/operations/eltwise/binary/binary.cpp
+++ b/runtime/lib/ttnn/operations/eltwise/binary/binary.cpp
@@ -6,6 +6,7 @@
 #include "tt/runtime/detail/ttnn.h"
 #include "tt/runtime/ttnn/operations/eltwise/binary/utils.h"
 #include "tt/runtime/ttnn/operations/utils.h"
+#include "tt/runtime/ttnn/utils.h"
 #include "ttnn/operations/eltwise/binary/binary_composite.hpp"
 
 namespace tt::runtime::ttnn::operations::binary {
@@ -26,7 +27,7 @@ static void runEltwiseBinaryOp(
 
   ::ttnn::DataType outputDataType = utils::getDataType(op->out());
   ::tt::tt_metal::MemoryConfig outputMemoryConfig =
-      utils::createMemoryConfig(op->out());
+      ::tt::runtime::ttnn::utils::createMemoryConfig(op->out());
 
   ::ttnn::Tensor out = ttnnOp(*lhs, *rhs, outputDataType, outputMemoryConfig,
                               std::nullopt, std::nullopt, std::nullopt);
diff --git a/runtime/lib/ttnn/operations/eltwise/binary/binary_composite.cpp b/runtime/lib/ttnn/operations/eltwise/binary/binary_composite.cpp
index 2a05d6246f..5ed4418f96 100644
--- a/runtime/lib/ttnn/operations/eltwise/binary/binary_composite.cpp
+++ b/runtime/lib/ttnn/operations/eltwise/binary/binary_composite.cpp
@@ -6,6 +6,7 @@
 #include "tt/runtime/detail/ttnn.h"
 #include "tt/runtime/ttnn/operations/eltwise/binary/utils.h"
 #include "tt/runtime/ttnn/operations/utils.h"
+#include "tt/runtime/ttnn/utils.h"
 
 namespace tt::runtime::ttnn::operations::binary::composite {
 
@@ -20,7 +21,7 @@ static void runEltwiseBinaryCompositeOp(
   getEltwiseBinaryOpInputTensors(op, tensorPool, &lhs, &rhs);
 
   ::tt::tt_metal::MemoryConfig outputMemoryConfig =
-      utils::createMemoryConfig(op->out());
+      ::tt::runtime::ttnn::utils::createMemoryConfig(op->out());
 
   ::ttnn::Tensor out = ttnnOp(*lhs, *rhs, outputMemoryConfig);
   tensorPool.insert_or_assign(op->out()->global_id(), out);
diff --git a/runtime/lib/ttnn/operations/eltwise/ternary/ternary.cpp b/runtime/lib/ttnn/operations/eltwise/ternary/ternary.cpp
index 6afde5d663..44f1413898 100644
--- a/runtime/lib/ttnn/operations/eltwise/ternary/ternary.cpp
+++ b/runtime/lib/ttnn/operations/eltwise/ternary/ternary.cpp
@@ -22,7 +22,7 @@ static void runEltwiseTernaryWhereOp(
   getEltwiseTernaryOpInputTensors(op, tensorPool, &first, &second, &third);
 
   ::tt::tt_metal::MemoryConfig outputMemoryConfig =
-      utils::createMemoryConfig(op->out());
+      ::tt::runtime::ttnn::utils::createMemoryConfig(op->out());
 
   ::ttnn::Tensor out = ttnnOp(*first, *second, *third, outputMemoryConfig);
   tensorPool.insert_or_assign(op->out()->global_id(), out);
::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); ::ttnn::Tensor out = ttnnOp(*in, outputMemoryConfig, std::nullopt); tensorPool.insert_or_assign(op->out()->global_id(), out); @@ -39,7 +40,7 @@ static void runEltwiseUnaryWithFastAndApproximateModeOp( getEltwiseUnaryOpInputTensor(op, tensorPool, &in); ::tt::tt_metal::MemoryConfig outputMemoryConfig = - utils::createMemoryConfig(op->out()); + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); ::ttnn::Tensor out = ttnnOp(*in, false /* parameter */, outputMemoryConfig, std::nullopt); @@ -56,7 +57,7 @@ static void runEltwiseUnaryWithFloatParameterOp( float parameter = op->params_as_EltwiseOpWithFloatParams()->parameter(); ::tt::tt_metal::MemoryConfig outputMemoryConfig = - utils::createMemoryConfig(op->out()); + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); ::ttnn::Tensor out = ttnnOp(*in, parameter, outputMemoryConfig); tensorPool.insert_or_assign(op->out()->global_id(), out); } diff --git a/runtime/lib/ttnn/operations/eltwise/unary/unary_composite.cpp b/runtime/lib/ttnn/operations/eltwise/unary/unary_composite.cpp index fd378d5a26..31514f0fe5 100644 --- a/runtime/lib/ttnn/operations/eltwise/unary/unary_composite.cpp +++ b/runtime/lib/ttnn/operations/eltwise/unary/unary_composite.cpp @@ -6,6 +6,7 @@ #include "tt/runtime/detail/ttnn.h" #include "tt/runtime/ttnn/operations/eltwise/unary/utils.h" #include "tt/runtime/ttnn/operations/utils.h" +#include "tt/runtime/ttnn/utils.h" #include "ttnn/operations/eltwise/unary/unary_composite.hpp" namespace tt::runtime::ttnn::operations::unary::composite { @@ -20,27 +21,26 @@ static void runEltwiseUnaryCompositeOp( getEltwiseUnaryOpInputTensor(op, tensorPool, &in); ::tt::tt_metal::MemoryConfig outputMemoryConfig = - utils::createMemoryConfig(op->out()); + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); ::ttnn::Tensor out = ttnnOp(*in, outputMemoryConfig); tensorPool.insert_or_assign(op->out()->global_id(), out); } -static void runEltwiseUnaryCompositeClampOP( +static void runEltwiseUnaryCompositeClampOp( const ::tt::target::ttnn::EltwiseOp *op, ProgramTensorPool &tensorPool, - std::function<::ttnn::Tensor(const ::ttnn::Tensor &, float, float, - const ::tt::tt_metal::MemoryConfig &)> - ttnnOp) { + const std::function<::ttnn::Tensor(const ::ttnn::Tensor &, float, float, + const ::tt::tt_metal::MemoryConfig &)> + &ttnnOp) { ::ttnn::Tensor *in = nullptr; getEltwiseUnaryOpInputTensor(op, tensorPool, &in); float min = op->params_as_ClampOpParams()->min(); float max = op->params_as_ClampOpParams()->max(); ::tt::tt_metal::MemoryConfig outputMemoryConfig = - utils::createMemoryConfig(op->out()); + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); ::ttnn::Tensor out = ttnnOp(*in, min, max, outputMemoryConfig); tensorPool.insert_or_assign(op->out()->global_id(), out); - return; } void run(const ::tt::target::ttnn::EltwiseOp *op, ProgramContext &context) { @@ -51,7 +51,7 @@ void run(const ::tt::target::ttnn::EltwiseOp *op, ProgramContext &context) { break; } case ::tt::target::ttnn::EltwiseOpType::Clamp: { - runEltwiseUnaryCompositeClampOP(op, tensorPool, ::ttnn::clamp); + runEltwiseUnaryCompositeClampOp(op, tensorPool, ::ttnn::clamp); break; } case ::tt::target::ttnn::EltwiseOpType::Log1p: { diff --git a/runtime/lib/ttnn/operations/embedding/embedding.cpp b/runtime/lib/ttnn/operations/embedding/embedding.cpp index 47b27ca9ac..511d8256de 100644 --- a/runtime/lib/ttnn/operations/embedding/embedding.cpp +++ b/runtime/lib/ttnn/operations/embedding/embedding.cpp @@ -6,6 
+6,7 @@ #include "tt/runtime/detail/logger.h" #include "tt/runtime/detail/ttnn.h" #include "tt/runtime/ttnn/operations/utils.h" +#include "tt/runtime/ttnn/utils.h" namespace tt::runtime::ttnn::operations::embedding { void run(const ::tt::target::ttnn::EmbeddingOp *op, ProgramContext &context) { @@ -24,7 +25,7 @@ void run(const ::tt::target::ttnn::EmbeddingOp *op, ProgramContext &context) { auto embeddingsType = ::ttnn::operations::embedding::EmbeddingsType::GENERIC; ::ttnn::DataType outputDataType = utils::getDataType(op->out()); ::ttnn::MemoryConfig outputMemoryConfig = - utils::createMemoryConfig(op->out()); + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); ::ttnn::Tensor out = ::ttnn::embedding(input, weight, padToken, layout, embeddingsType, outputDataType, outputMemoryConfig); diff --git a/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.cpp b/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.cpp index 435607b87e..df3a34cf89 100644 --- a/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.cpp +++ b/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.cpp @@ -7,25 +7,6 @@ namespace tt::runtime::ttnn::operations::utils { -// TODO (bug #701) -// Currently the memory layout/location in flatbuffer is incorrect -// These methods are workarounds such that we query the info directly from the -// TTNN tensor Ideally, we should be able to get all of this info directly from -// the flatbuffer -bool isOnHost(const ::ttnn::Tensor &tensor) { - // Currently only supports borrowed or owned host storage - return tensor.storage_type() == ::tt::tt_metal::StorageType::BORROWED or - tensor.storage_type() == ::tt::tt_metal::StorageType::OWNED or - tensor.storage_type() == - ::tt::tt_metal::StorageType::MULTI_DEVICE_HOST; -} - -bool isOnDevice(const ::ttnn::Tensor &tensor) { - // Currently only supports single device storage - return tensor.storage_type() == ::tt::tt_metal::StorageType::DEVICE or - tensor.storage_type() == ::tt::tt_metal::StorageType::MULTI_DEVICE; -} - bool isTilized(const ::tt::target::TensorRef *tensorRef) { const ::tt::target::Dim2d *tileShape = tensorRef->desc()->layout()->memory_desc()->tile_shape(); @@ -43,93 +24,11 @@ bool inSystemMemory(const ::tt::target::TensorRef *tensorRef) { targetMemorySpace == ::tt::target::MemorySpace::SystemMMIO; } -void updateTensorPool(ProgramTensorPool &tensorPool, - const ::ttnn::Tensor &tensor, uint32_t outputGlobalId) { - if (tensorPool.isUserOutput(outputGlobalId)) { - tensorPool.copyTensorToUserOutput(outputGlobalId, tensor); - } else { - tensorPool.insert_or_assign(outputGlobalId, tensor); - } -} - ::ttnn::DataType getDataType(const ::tt::target::TensorRef *tensorRef) { return ::tt::runtime::ttnn::utils::toTTNNDataType( tensorRef->desc()->layout()->memory_desc()->data_type()); } -::ttnn::Layout -inferLayoutFromTileShape(const ::tt::target::TensorRef *tensorRef) { - const ::tt::target::Dim2d *tileShape = - tensorRef->desc()->layout()->memory_desc()->tile_shape(); - LOG_ASSERT(::tt::runtime::ttnn::utils::isValidTileShape(tileShape)); - if (tileShape->x() == 1 and tileShape->y() == 1) { - return ::ttnn::Layout::ROW_MAJOR; - } - return ::ttnn::Layout::TILE; -} - -CoreRangeSet -toCoreRangeSet(const ::flatbuffers::Vector - *coreRangeSet) { - std::set coreRanges; - for (::tt::target::Dim2dRange const *coreRange : *coreRangeSet) { - CoreCoord start(coreRange->loc().x(), coreRange->loc().y()); - // End is inclusive - CoreCoord end(coreRange->loc().x() + coreRange->size().x() 
- 1, - coreRange->loc().y() + coreRange->size().y() - 1); - - coreRanges.emplace(start, end); - } - return CoreRangeSet(coreRanges); -} - -// This method will soon be deprecated, prefer to use the method below -// -::tt::tt_metal::MemoryConfig -createMemoryConfig(const ::tt::target::TensorRef *tensorRef) { - const ::tt::target::LayoutDesc *layout = tensorRef->desc()->layout(); - const ::tt::target::TensorMemoryLayout targetMemoryLayout = - layout->memory_desc()->memory_layout(); - const ::tt::target::MemorySpace targetMemorySpace = - layout->memory_desc()->memory_space(); - const ::flatbuffers::Vector - *targetCoreRangeSet = layout->core_range_set(); - const ::flatbuffers::Vector *targetShardShape = - layout->memory_desc()->shape(); - const ::tt::target::Dim2d *tileShape = layout->memory_desc()->tile_shape(); - - LOG_ASSERT(targetCoreRangeSet->size() == 1, - "Currently only single core range/grid is supported"); - - LOG_ASSERT(targetShardShape->size() == 2, - "Only 2D shard shape is supported in TTNN backend"); - - LOG_ASSERT(::tt::runtime::ttnn::utils::isValidTileShape(tileShape), - "Invalid tile shape"); - - CoreRangeSet ttnnCoreRangeSet = toCoreRangeSet(targetCoreRangeSet); - std::array ttnnShardShape; - std::copy(targetShardShape->begin(), targetShardShape->end(), - ttnnShardShape.begin()); - - ttnnShardShape[0] *= tileShape->y(); - ttnnShardShape[1] *= tileShape->x(); - - ::tt::tt_metal::ShardSpec shardSpec( - ttnnCoreRangeSet, ttnnShardShape, - ::tt::tt_metal::ShardOrientation::ROW_MAJOR, false); - - ::tt::tt_metal::TensorMemoryLayout ttnnMemLayout = - ::tt::runtime::ttnn::utils::toTTNNTensorMemoryLayout(targetMemoryLayout); - - ::tt::tt_metal::BufferType ttnnBufferType = - ::tt::runtime::ttnn::utils::toTTNNBufferType(targetMemorySpace); - - return {ttnnMemLayout, ttnnBufferType, shardSpec}; -} - -// Prefer to use this method over the one above -// ::tt::tt_metal::MemoryConfig createMemoryConfig(const ::tt::target::MemoryConfigDesc *memcfg, const ::tt::target::TensorRef *tensorRef) { @@ -144,7 +43,8 @@ createMemoryConfig(const ::tt::target::MemoryConfigDesc *memcfg, const ::tt::target::LayoutDesc *layout = tensorRef->desc()->layout(); const ::flatbuffers::Vector *targetCoreRangeSet = layout->core_range_set(); - CoreRangeSet ttnnCoreRangeSet = toCoreRangeSet(targetCoreRangeSet); + CoreRangeSet ttnnCoreRangeSet = + ::tt::runtime::ttnn::utils::toCoreRangeSet(targetCoreRangeSet); const ::flatbuffers::Vector *shardShape = memcfg->shard_spec()->shard_shape(); const ::tt::target::Dim2d *tileShape = layout->memory_desc()->tile_shape(); diff --git a/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.h b/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.h index b922e120a6..269e0328f9 100644 --- a/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.h +++ b/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.h @@ -13,32 +13,15 @@ namespace tt::runtime::ttnn::operations::utils { -bool isOnHost(const ::ttnn::Tensor &tensor); - -bool isOnDevice(const ::ttnn::Tensor &tensor); - bool isTilized(const ::tt::target::TensorRef *tensorRef); bool inSystemMemory(const ::tt::target::TensorRef *tensorRef); -void updateTensorPool(ProgramTensorPool &tensorPool, - const ::ttnn::Tensor &tensor, uint32_t outputGlobalId); - ::tt::target::MemorySpace getMemorySpace(const ::tt::target::TensorRef *tensorRef); ::ttnn::DataType getDataType(const ::tt::target::TensorRef *tensorRef); -::ttnn::Layout -inferLayoutFromTileShape(const 
::tt::target::TensorRef *tensorRef); - -CoreRangeSet -toCoreRangeSet(const ::flatbuffers::Vector - *coreRangeSet); - -::tt::tt_metal::MemoryConfig -createMemoryConfig(const ::tt::target::TensorRef *tensorRef); - ::tt::tt_metal::MemoryConfig createMemoryConfig(const ::tt::target::MemoryConfigDesc *memcfg, const ::tt::target::TensorRef *tensorRef); diff --git a/runtime/lib/ttnn/operations/layout/from_device.cpp b/runtime/lib/ttnn/operations/layout/from_device.cpp index b6820b6ec7..e26e3be2a3 100644 --- a/runtime/lib/ttnn/operations/layout/from_device.cpp +++ b/runtime/lib/ttnn/operations/layout/from_device.cpp @@ -12,10 +12,11 @@ void run(const ::tt::target::ttnn::FromDeviceOp *op, ProgramContext &context) { ProgramTensorPool &tensorPool = context.getTensorPool(); const ::ttnn::Tensor &inputTensor = tensorPool.at(op->in()->global_id()); DEBUG_ASSERT(inputTensor.is_allocated()); - LOG_ASSERT(utils::isOnDevice(inputTensor), - "Calling ttnn::from_device on a host tensor"); + DEBUG_ASSERT( + ::tt::runtime::ttnn::utils::isOnDevice(inputTensor.storage_type()), + "Calling ttnn::from_device on a host tensor"); ::ttnn::Tensor out = ::ttnn::from_device(inputTensor); - utils::updateTensorPool(tensorPool, out, op->out()->global_id()); + tensorPool.insert_or_assign(op->out()->global_id(), out); } } // namespace tt::runtime::ttnn::operations::layout diff --git a/runtime/lib/ttnn/operations/layout/to_device.cpp b/runtime/lib/ttnn/operations/layout/to_device.cpp index 34af89f504..414afc9f05 100644 --- a/runtime/lib/ttnn/operations/layout/to_device.cpp +++ b/runtime/lib/ttnn/operations/layout/to_device.cpp @@ -14,7 +14,7 @@ void run(const ::tt::target::ttnn::ToDeviceOp *op, ProgramContext &context) { ProgramTensorPool &tensorPool = context.getTensorPool(); const ::ttnn::Tensor &inputTensor = tensorPool.at(op->in()->global_id()); DEBUG_ASSERT(inputTensor.is_allocated()); - DEBUG_ASSERT(utils::isOnHost(inputTensor), + DEBUG_ASSERT(::tt::runtime::ttnn::utils::isOnHost(inputTensor.storage_type()), "Calling ttnn::to_device on a device tensor"); std::optional<::ttnn::MemoryConfig> memoryConfig = std::nullopt; diff --git a/runtime/lib/ttnn/operations/layout/to_layout.cpp b/runtime/lib/ttnn/operations/layout/to_layout.cpp index 5e78a67187..bf80ef292e 100644 --- a/runtime/lib/ttnn/operations/layout/to_layout.cpp +++ b/runtime/lib/ttnn/operations/layout/to_layout.cpp @@ -57,7 +57,7 @@ void run(const ::tt::target::ttnn::ToLayoutOp *op, ProgramContext &context) { out = ::ttnn::to_layout(inputTensor, layout, dtype, memoryConfig, static_cast<::ttnn::Device *>(nullptr)); } - utils::updateTensorPool(tensorPool, out, op->out()->global_id()); + tensorPool.insert_or_assign(op->out()->global_id(), out); } } // namespace tt::runtime::ttnn::operations::layout diff --git a/runtime/lib/ttnn/operations/layout/typecast.cpp b/runtime/lib/ttnn/operations/layout/typecast.cpp index 5529c6112c..e59a64a401 100644 --- a/runtime/lib/ttnn/operations/layout/typecast.cpp +++ b/runtime/lib/ttnn/operations/layout/typecast.cpp @@ -17,7 +17,7 @@ void run(const ::tt::target::ttnn::TypecastOp *op, ProgramContext &context) { ::tt::runtime::ttnn::utils::toTTNNDataType(op->dtype()); ::ttnn::Tensor out = ::ttnn::typecast(inputTensor, targetDataType); - utils::updateTensorPool(tensorPool, out, op->out()->global_id()); + tensorPool.insert_or_assign(op->out()->global_id(), out); } } // namespace tt::runtime::ttnn::operations::layout diff --git a/runtime/lib/ttnn/operations/matmul/matmul.cpp b/runtime/lib/ttnn/operations/matmul/matmul.cpp index 
abe71f9707..dc72ce7b9e 100644 --- a/runtime/lib/ttnn/operations/matmul/matmul.cpp +++ b/runtime/lib/ttnn/operations/matmul/matmul.cpp @@ -6,6 +6,7 @@ #include "tt/runtime/detail/logger.h" #include "tt/runtime/detail/ttnn.h" #include "tt/runtime/ttnn/operations/utils.h" +#include "tt/runtime/ttnn/utils.h" #include // ANCHOR: adding_an_op_matmul_runtime_operations @@ -18,7 +19,7 @@ void run(const ::tt::target::ttnn::MatmulOp *op, ProgramContext &context) { DEBUG_ASSERT(rhs.is_allocated()); ::ttnn::DataType outputDataType = utils::getDataType(op->out()); ::tt::tt_metal::MemoryConfig outputMemoryConfig = - utils::createMemoryConfig(op->out()); + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); std::optional< ::ttnn::operations::matmul::MatmulMultiCoreReuseMultiCast1DProgramConfig> diff --git a/runtime/lib/ttnn/operations/normalization/softmax.cpp b/runtime/lib/ttnn/operations/normalization/softmax.cpp index a83358567c..432f920956 100644 --- a/runtime/lib/ttnn/operations/normalization/softmax.cpp +++ b/runtime/lib/ttnn/operations/normalization/softmax.cpp @@ -6,6 +6,7 @@ #include "tt/runtime/detail/logger.h" #include "tt/runtime/detail/ttnn.h" #include "tt/runtime/ttnn/operations/utils.h" +#include "tt/runtime/ttnn/utils.h" namespace tt::runtime::ttnn::operations::normalization { void run(const ::tt::target::ttnn::SoftmaxOp *op, ProgramContext &context) { @@ -14,7 +15,7 @@ void run(const ::tt::target::ttnn::SoftmaxOp *op, ProgramContext &context) { DEBUG_ASSERT(in.is_allocated()); int32_t dimension = op->dimension(); ::tt::tt_metal::MemoryConfig outputMemoryConfig = - utils::createMemoryConfig(op->out()); + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); ::ttnn::Tensor out = ::ttnn::softmax(in, dimension, outputMemoryConfig); tensorPool.insert_or_assign(op->out()->global_id(), out); } diff --git a/runtime/lib/ttnn/operations/pool/maxpool2d.cpp b/runtime/lib/ttnn/operations/pool/maxpool2d.cpp index dfd8b9375e..23a935334f 100644 --- a/runtime/lib/ttnn/operations/pool/maxpool2d.cpp +++ b/runtime/lib/ttnn/operations/pool/maxpool2d.cpp @@ -58,7 +58,8 @@ void run(const ::tt::target::ttnn::MaxPool2dOp *op, ProgramContext &context) { }, targetDevice); } - ::ttnn::MemoryConfig outMemConfig = utils::createMemoryConfig(op->out()); + ::ttnn::MemoryConfig outMemConfig = + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); ::ttnn::Tensor out = operation.invoke( 0, input, op->batch_size(), op->input_height(), op->input_width(), op->channels(), {op->kernel_height(), op->kernel_width()}, diff --git a/runtime/lib/ttnn/operations/reduction/reduction.cpp b/runtime/lib/ttnn/operations/reduction/reduction.cpp index 3af46efc9c..a74373ee9f 100644 --- a/runtime/lib/ttnn/operations/reduction/reduction.cpp +++ b/runtime/lib/ttnn/operations/reduction/reduction.cpp @@ -6,6 +6,7 @@ #include "tt/runtime/detail/logger.h" #include "tt/runtime/detail/ttnn.h" #include "tt/runtime/ttnn/operations/utils.h" +#include "tt/runtime/ttnn/utils.h" namespace tt::runtime::ttnn::operations::reduction { static void runReductionOp( @@ -17,7 +18,7 @@ static void runReductionOp( const std::optional<::ttnn::DeviceComputeKernelConfig> &, float)> &ttnnOp) { ::tt::tt_metal::MemoryConfig outputMemoryConfig = - utils::createMemoryConfig(op->out()); + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); const ::ttnn::Tensor &in = tensorPool.at(op->in()->global_id()); DEBUG_ASSERT(in.is_allocated()); diff --git a/runtime/lib/ttnn/program.cpp b/runtime/lib/ttnn/program.cpp index 8cfa013891..b22bb26d2a 100644 --- 
a/runtime/lib/ttnn/program.cpp +++ b/runtime/lib/ttnn/program.cpp @@ -29,6 +29,7 @@ #include "tt/runtime/detail/debug.h" #include "tt/runtime/detail/logger.h" #include "tt/runtime/ttnn/types.h" +#include "tt/runtime/ttnn/utils.h" #include "tt/runtime/utils.h" #include "ttmlir/Target/TTNN/program_generated.h" @@ -46,28 +47,19 @@ static ::tt::target::ttnn::TTNNBinary const *getBinary(Flatbuffer binary) { class ProgramExecutor { public: - ProgramExecutor(Binary &executableHandle, const TensorMap &liveTensors, - const std::unordered_set &programInputs, - const std::unordered_set &programOutputs, - ::ttnn::MeshDevice *meshDevice) + ProgramExecutor( + const Binary &executableHandle, + const std::unordered_map &liveTensors, + const std::vector &programInputs, + const std::vector &programOutputs, + ::ttnn::MeshDevice *meshDevice) : executableHandle(executableHandle), context(ProgramContext(liveTensors, programInputs, programOutputs, meshDevice)) {} void runCallback(Binary &executableHandle, const ::tt::target::ttnn::Operation *opContext, - ProgramContext *programContext) { - if (auto callback = debug::Hooks::get().getOperatorCallback(); callback) { - std::shared_ptr programContextPtr = - ::tt::runtime::utils::unsafe_borrow_shared(programContext); - std::shared_ptr opContextPtr = - ::tt::runtime::utils::unsafe_borrow_shared( - const_cast<::tt::target::ttnn::Operation *>(opContext)); - (*callback)(executableHandle, - CallbackContext(programContextPtr, DeviceRuntime::TTNN), - OpContext(opContextPtr, DeviceRuntime::TTNN)); - } - } + ProgramContext *programContext); void execute(const ::tt::target::ttnn::Program *program) { for (const ::tt::target::ttnn::Operation *op : *program->operations()) { @@ -79,6 +71,9 @@ class ProgramExecutor { } ProgramContext &getContext() { return context; } + std::vector gatherOutputTensors() { + return context.getTensorPool().gatherOutputTensors(); + } private: Binary executableHandle; @@ -87,6 +82,21 @@ class ProgramExecutor { void runEltwiseOperation(const ::tt::target::ttnn::EltwiseOp *op); }; +void ProgramExecutor::runCallback( + Binary &executableHandle, const ::tt::target::ttnn::Operation *opContext, + ProgramContext *programContext) { + if (auto callback = debug::Hooks::get().getOperatorCallback(); callback) { + std::shared_ptr programContextPtr = + ::tt::runtime::utils::unsafe_borrow_shared(programContext); + std::shared_ptr opContextPtr = + ::tt::runtime::utils::unsafe_borrow_shared( + const_cast<::tt::target::ttnn::Operation *>(opContext)); + (*callback)(executableHandle, + CallbackContext(programContextPtr, DeviceRuntime::TTNN), + OpContext(opContextPtr, DeviceRuntime::TTNN)); + } +} + void ProgramExecutor::runEltwiseOperation( const ::tt::target::ttnn::EltwiseOp *op) { auto runUnaryOp = [&]() { @@ -193,6 +203,26 @@ void ProgramExecutor::runOperation(const ::tt::target::ttnn::Operation *op) { } // Nop is single input, output tensor where input is returned as output. 
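For orientation on the callback hook factored out of the class above: runCallback invokes a frontend-registered function with the (Binary, CallbackContext, OpContext) triple. A minimal sketch of what such a callback could look like, using only the public runtime API that appears in this change (the registration path through debug::Hooks is not shown here, and the function name is hypothetical):

// Hedged sketch, not part of this change: a frontend-side operator callback
// matching the invocation in ProgramExecutor::runCallback. It logs each op's
// debug string through the public getOpDebugString helper.
#include "tt/runtime/runtime.h"
#include <iostream>

static void logEveryOp(tt::runtime::Binary binary,
                       tt::runtime::CallbackContext programContext,
                       tt::runtime::OpContext opContext) {
  std::cout << tt::runtime::getOpDebugString(opContext) << std::endl;
}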
+static bool isNopProgram(const ::tt::target::ttnn::Program *program) { + return program->inputs()->size() == 1 && program->outputs()->size() == 1 && + program->inputs()->Get(0)->global_id() == + program->outputs()->Get(0)->global_id(); +} + +static ::ttnn::Tensor +handleNopProgram(::tt::target::ttnn::Program const *program, + std::vector<::ttnn::Tensor *> const &inputs) { + const ::ttnn::Tensor &input = *inputs[0]; + ::ttnn::Tensor output = + ::ttnn::zeros(input.get_shape(), input.get_dtype(), input.get_layout()); + const void *src = ::tt::tt_metal::get_raw_host_data_ptr(input); + void *dst = ::tt::tt_metal::get_raw_host_data_ptr(output); + std::memcpy(dst, src, input.volume() * input.element_size()); + return output; +} + +namespace legacy { + static bool handleNopProgram(::tt::target::ttnn::Program const *program, std::vector<::ttnn::Tensor *> const &inputs, std::vector<::ttnn::Tensor *> const &outputs) { @@ -221,8 +251,8 @@ void runProgram(::ttnn::MeshDevice &meshDevice, Binary &executableHandle, if (handleNopProgram(program, inputs, outputs)) { return; } - TensorMap liveTensors; - std::unordered_set programInputs; + std::unordered_map liveTensors; + std::vector programInputs; int inputIndex = 0; LOG_ASSERT(program->inputs()->size() == inputs.size(), "Program input size mismatch: ", program->inputs()->size(), @@ -231,21 +261,68 @@ void runProgram(::ttnn::MeshDevice &meshDevice, Binary &executableHandle, auto [iter, inserted] = liveTensors.try_emplace(input->global_id(), inputs[inputIndex++]); LOG_ASSERT(inserted, "Duplicate input tensor"); - programInputs.emplace(input->global_id()); + programInputs.push_back(input->global_id()); } int outputIndex = 0; - std::unordered_set programOutputs; + std::vector programOutputs; LOG_ASSERT(program->outputs()->size() == outputs.size()); for (::tt::target::TensorRef const *output : *program->outputs()) { auto [iter, inserted] = liveTensors.try_emplace(output->global_id(), outputs[outputIndex++]); LOG_ASSERT(inserted, "Duplicate output tensor"); - programOutputs.emplace(output->global_id()); + programOutputs.push_back(output->global_id()); + } + ProgramExecutor executor(executableHandle, liveTensors, programInputs, + programOutputs, &meshDevice); + executor.execute(program); + outputIndex = 0; + for (uint32_t outputId : programOutputs) { + const ::ttnn::Tensor &src = + executor.getContext().getTensorPool().at(outputId); + const ::ttnn::Tensor &dst = *outputs[outputIndex++]; + size_t srcSize = src.volume() * src.element_size(); + size_t dstSize = dst.volume() * dst.element_size(); + LOG_ASSERT(srcSize == dstSize, "Output tensor size mismatch"); + void *srcPtr = ::tt::tt_metal::get_raw_host_data_ptr(src); + void *dstPtr = ::tt::tt_metal::get_raw_host_data_ptr(dst); + std::memcpy(dstPtr, srcPtr, dstSize); + } +} +} // namespace legacy + +std::vector runProgram(::ttnn::MeshDevice &meshDevice, + Binary executableHandle, + std::uint32_t programIndex, + std::vector<::ttnn::Tensor *> const &inputs) { + ::tt::target::ttnn::TTNNBinary const &fbb = *getBinary(executableHandle); + ::tt::target::ttnn::Program const *program = + fbb.programs()->Get(programIndex); + if (isNopProgram(program)) { + Tensor out = + utils::createRuntimeTensorFromTTNN(handleNopProgram(program, inputs)); + return {out}; + } + std::unordered_map liveTensors; + std::vector programInputs; + int inputIndex = 0; + LOG_ASSERT(program->inputs()->size() == inputs.size(), + "Program input size mismatch: ", program->inputs()->size(), + " != ", inputs.size()); + for (::tt::target::TensorRef const *input : 
*program->inputs()) { + auto [iter, inserted] = + liveTensors.try_emplace(input->global_id(), inputs[inputIndex++]); + LOG_ASSERT(inserted, "Duplicate input tensor"); + programInputs.push_back(input->global_id()); + } + std::vector programOutputs; + for (::tt::target::TensorRef const *output : *program->outputs()) { + programOutputs.push_back(output->global_id()); } ProgramExecutor executor(executableHandle, liveTensors, programInputs, programOutputs, &meshDevice); executor.execute(program); + std::vector outputTensors = executor.gatherOutputTensors(); + return outputTensors; } } // namespace tt::runtime::ttnn diff --git a/runtime/lib/ttnn/runtime.cpp b/runtime/lib/ttnn/runtime.cpp index 86fd2d25c6..44d49638e3 100644 --- a/runtime/lib/ttnn/runtime.cpp +++ b/runtime/lib/ttnn/runtime.cpp @@ -21,6 +21,267 @@ using ::tt::tt_metal::DistributedTensorConfig; using ::tt::tt_metal::OwnedStorage; using ::tt::tt_metal::raise_unsupported_storage; +namespace detail { +class LayoutConverter { +public: + LayoutDesc inputDesc; + LayoutDesc outputDesc; + bool shouldTilize = false; + bool shouldUntilize = false; + bool shouldTypecast = false; + bool shouldToDevice = false; + bool shouldToMemoryConfig = false; + LayoutConverter(const LayoutDesc &inputDesc, const LayoutDesc &outputDesc) + : inputDesc(inputDesc), outputDesc(outputDesc) { + shouldTilize = (inputDesc.layout == ::ttnn::Layout::ROW_MAJOR and + outputDesc.layout == ::ttnn::Layout::TILE); + shouldUntilize = (inputDesc.layout == ::ttnn::Layout::TILE and + outputDesc.layout == ::ttnn::Layout::ROW_MAJOR); + + shouldTypecast = (inputDesc.dataType != outputDesc.dataType); + + shouldToDevice = (inputDesc.isOnHost() and outputDesc.isOnDevice()); + + shouldToMemoryConfig = + (not shouldToDevice and outputDesc.isOnDevice() and + (inputDesc.memoryConfig != outputDesc.memoryConfig)); + } + + ::ttnn::Tensor + convertTensorLayout(const ::ttnn::Tensor &input, + std::optional targetDevice) { + if (inputDesc.isOnHost()) { + return convertHostTensorLayout(input, targetDevice); + } + return convertDeviceTensorLayout(input); + } + +private: + ::ttnn::Tensor toLayoutIfNeeded(const ::ttnn::Tensor &input) { + if (shouldTilize) { + return ::ttnn::to_layout(input, ::ttnn::Layout::TILE, std::nullopt, + std::nullopt, + static_cast<::ttnn::Device *>(nullptr)); + } + if (shouldUntilize) { + return ::ttnn::to_layout(input, ::ttnn::Layout::ROW_MAJOR, std::nullopt, + std::nullopt, + static_cast<::ttnn::Device *>(nullptr)); + } + return input; + } + + ::ttnn::Tensor typecastIfNeeded(const ::ttnn::Tensor &input) { + if (shouldTypecast) { + return ::ttnn::typecast(input, outputDesc.dataType); + } + return input; + } + + ::ttnn::Tensor toDeviceIfNeeded(const ::ttnn::Tensor &input, + std::optional targetDevice, + bool force = false) { + if (shouldToDevice or force) { + LOG_ASSERT(targetDevice.has_value()); + LOG_ASSERT(outputDesc.memoryConfig.has_value()); + return std::visit( + [&](auto &&targetDevice) -> ::ttnn::Tensor { + return ::ttnn::to_device(input, &(targetDevice.get()), + outputDesc.memoryConfig.value()); + }, + targetDevice.value()); + } + return input; + } + + ::ttnn::Tensor toMemoryConfigIfNeeded(const ::ttnn::Tensor &input) { + if (shouldToMemoryConfig) { + LOG_ASSERT(outputDesc.memoryConfig.has_value()); + return ::ttnn::to_memory_config(input, outputDesc.memoryConfig.value()); + } + return input; + } + + // + // Host input tensor APIs + // + ::ttnn::Tensor + handleHostInputNoLayoutNoTypecast(const ::ttnn::Tensor &input, + std::optional targetDevice) { + ::ttnn::Tensor 
out = toDeviceIfNeeded(input, targetDevice); + out = toMemoryConfigIfNeeded(out); + return out; + } + + ::ttnn::Tensor + handleHostInputLayoutNoTypecast(const ::ttnn::Tensor &input, + std::optional targetDevice) { + if (shouldUntilize) { + ::ttnn::Tensor out = toLayoutIfNeeded(input); + out = toDeviceIfNeeded(out, targetDevice); + out = toMemoryConfigIfNeeded(out); + return out; + } + + if (shouldTilize and outputDesc.dataType == ::ttnn::DataType::BFLOAT16) { + ::ttnn::Tensor out = toDeviceIfNeeded(input, targetDevice); + out = toLayoutIfNeeded(out); + out = toMemoryConfigIfNeeded(out); + return out; + } + + if (shouldTilize and outputDesc.dataType != ::ttnn::DataType::BFLOAT16) { + ::ttnn::Tensor out = toLayoutIfNeeded(input); + out = toDeviceIfNeeded(out, targetDevice); + out = toMemoryConfigIfNeeded(out); + return out; + } + LOG_FATAL("Unreachable code path"); + } + + ::ttnn::Tensor + handleHostInputNoLayoutTypecast(const ::ttnn::Tensor &input, + std::optional targetDevice) { + if (outputDesc.layout == ::ttnn::Layout::TILE) { + ::ttnn::Tensor out = toDeviceIfNeeded(input, targetDevice); + out = typecastIfNeeded(out); + out = toMemoryConfigIfNeeded(out); + return out; + } + + if (outputDesc.layout != ::ttnn::Layout::TILE) { + ::ttnn::Tensor out = typecastIfNeeded(input); + out = toDeviceIfNeeded(out, targetDevice); + out = toMemoryConfigIfNeeded(out); + return out; + } + LOG_FATAL("Unreachable code path"); + } + + ::ttnn::Tensor + handleHostInputLayoutTypecast(const ::ttnn::Tensor &input, + std::optional targetDevice) { + if (shouldUntilize) { + ::ttnn::Tensor out = typecastIfNeeded(input); + out = toLayoutIfNeeded(out); + out = toDeviceIfNeeded(out, targetDevice); + out = toMemoryConfigIfNeeded(out); + return out; + } + + if (shouldTilize and inputDesc.dataType == ::ttnn::DataType::BFLOAT16) { + ::ttnn::Tensor out = toDeviceIfNeeded(input, targetDevice); + out = toLayoutIfNeeded(out); + out = typecastIfNeeded(out); + out = toMemoryConfigIfNeeded(out); + return out; + } + + if (shouldTilize and outputDesc.dataType == ::ttnn::DataType::BFLOAT16) { + ::ttnn::Tensor out = typecastIfNeeded(input); + out = toDeviceIfNeeded(out, targetDevice); + out = toLayoutIfNeeded(out); + out = toMemoryConfigIfNeeded(out); + return out; + } + + if (shouldTilize and inputDesc.dataType != ::ttnn::DataType::BFLOAT16 and + outputDesc.dataType != ::ttnn::DataType::BFLOAT16) { + ::ttnn::Tensor out = typecastIfNeeded(input); + out = toLayoutIfNeeded(out); + out = toDeviceIfNeeded(out, targetDevice); + out = toMemoryConfigIfNeeded(out); + return out; + } + + LOG_FATAL("Unreachable code path"); + } + + ::ttnn::Tensor + convertHostTensorLayout(const ::ttnn::Tensor &input, + std::optional targetDevice) { + bool shouldToLayout = (shouldTilize or shouldUntilize); + LOG_ASSERT(not shouldToDevice or targetDevice.has_value(), + "Target device must be provided for ToDevice"); + if (not shouldToLayout and not shouldTypecast) { + return handleHostInputNoLayoutNoTypecast(input, targetDevice); + } + if (shouldToLayout and not shouldTypecast) { + return handleHostInputLayoutNoTypecast(input, targetDevice); + } + if (not shouldToLayout and shouldTypecast) { + return handleHostInputNoLayoutTypecast(input, targetDevice); + } + if (shouldToLayout and shouldTypecast) { + return handleHostInputLayoutTypecast(input, targetDevice); + } + LOG_FATAL("Unreachable code path"); + } + + // + // Device input tensor APIs + // + ::ttnn::Tensor + handleDeviceInputNoLayoutNoTypecast(const ::ttnn::Tensor &input) { + return 
toMemoryConfigIfNeeded(input); + } + + ::ttnn::Tensor + handleDeviceInputLayoutNoTypecast(const ::ttnn::Tensor &input) { + if (shouldUntilize) { + LOG_WARNING( + "Device to device tilize is not fully supported on the ttnn side"); + } + ::ttnn::Tensor out = toLayoutIfNeeded(input); + out = toMemoryConfigIfNeeded(out); + return out; + } + + ::ttnn::Tensor + handleDeviceInputNoLayoutTypecast(const ::ttnn::Tensor &input) { + ::ttnn::Tensor out = typecastIfNeeded(input); + out = toMemoryConfigIfNeeded(out); + return out; + } + + ::ttnn::Tensor handleDeviceInputLayoutTypecast(const ::ttnn::Tensor &input) { + if (shouldUntilize) { + LOG_WARNING( + "Device to device untilize is not fully supported on the ttnn side"); + ::ttnn::Tensor out = typecastIfNeeded(input); + out = toLayoutIfNeeded(out); + out = toMemoryConfigIfNeeded(out); + return out; + } + if (shouldTilize) { + ::ttnn::Tensor out = toLayoutIfNeeded(input); + out = typecastIfNeeded(out); + out = toMemoryConfigIfNeeded(out); + return out; + } + LOG_FATAL("Unreachable code path"); + } + + ::ttnn::Tensor convertDeviceTensorLayout(const ::ttnn::Tensor &input) { + bool shouldToLayout = (shouldTilize or shouldUntilize); + if (not shouldToLayout and not shouldTypecast) { + return handleDeviceInputNoLayoutNoTypecast(input); + } + if (shouldToLayout and not shouldTypecast) { + return handleDeviceInputLayoutNoTypecast(input); + } + if (not shouldToLayout and shouldTypecast) { + return handleDeviceInputNoLayoutTypecast(input); + } + if (shouldToLayout and shouldTypecast) { + return handleDeviceInputLayoutTypecast(input); + } + LOG_FATAL("Unreachable code path"); + } +}; + +} // namespace detail + template static StorageType createStorage(ElementType *ptr, std::uint32_t numElements) { if constexpr (std::is_same_v) { @@ -76,6 +337,20 @@ static Tensor createNullTensor() { return Tensor(nullptr, nullptr, DeviceRuntime::TTNN); } +static DeviceVariant getTargetDevice(::ttnn::MeshDevice &meshDevice) { + if (meshDevice.num_devices() == 1) { + return std::ref(*(meshDevice.get_device_index(0))); + } + return std::ref(meshDevice); +} + +static ::tt::target::ttnn::TTNNBinary const *getBinary(Flatbuffer binary) { + bool isTTNN = ::tt::target::ttnn::SizePrefixedTTNNBinaryBufferHasIdentifier( + binary.handle.get()); + LOG_ASSERT(isTTNN, "Unsupported binary format"); + return ::tt::target::ttnn::GetSizePrefixedTTNNBinary(binary.handle.get()); +} + Tensor createTensor(std::shared_ptr data, std::vector const &shape, std::vector const &stride, @@ -89,7 +364,7 @@ Tensor createTensor(std::shared_ptr data, createStorage(data.get(), numElements, dataType), ::ttnn::Shape(small_vector_shape), utils::toTTNNDataType(dataType), ::ttnn::Layout::ROW_MAJOR); - return Tensor(std::static_pointer_cast(tensor), data, + return Tensor(std::static_pointer_cast(tensor), nullptr, DeviceRuntime::TTNN); } @@ -100,8 +375,8 @@ createTensor(std::vector> &data, ::tt::target::DataType dataType, std::unordered_map const &strategy) { std::vector<::ttnn::Tensor> tensorShards; - tensorShards.resize(data.size()); - std::transform(data.begin(), data.end(), tensorShards.begin(), + tensorShards.reserve(data.size()); + std::transform(data.begin(), data.end(), std::back_inserter(tensorShards), [&](std::shared_ptr &dataShard) -> ::ttnn::Tensor { return createOwnedTensor(dataShard, shape, stride, itemsize, dataType); @@ -112,10 +387,7 @@ createTensor(std::vector> &data, ::ttnn::distributed::api::create_multi_device_tensor( tensorShards, ::tt::tt_metal::StorageType::MULTI_DEVICE_HOST, 
distributionStrategy)); - std::shared_ptr>> borrowedData = - std::make_shared>>(data); - return Tensor(std::static_pointer_cast(tensor), - std::static_pointer_cast(borrowedData), + return Tensor(std::static_pointer_cast(tensor), nullptr, DeviceRuntime::TTNN); } @@ -166,34 +438,112 @@ void deallocateBuffers(Device deviceHandle) { } } -Event submit(Device deviceHandle, Binary executableHandle, - std::uint32_t programIndex, - std::vector const &inputHandles, - std::vector const &outputHandles) { +Tensor toHost(Tensor tensor, bool untilize) { + const ::ttnn::Tensor &deviceTensor = + tensor.as<::ttnn::Tensor>(DeviceRuntime::TTNN); + std::shared_ptr<::ttnn::Tensor> hostTensor = + std::make_shared<::ttnn::Tensor>(::ttnn::from_device(deviceTensor)); + if (untilize) { + hostTensor = std::make_shared<::ttnn::Tensor>(::ttnn::to_layout( + *hostTensor, ::ttnn::Layout::ROW_MAJOR, std::nullopt, std::nullopt, + static_cast<::ttnn::Device *>(nullptr))); + } + return Tensor(std::static_pointer_cast(hostTensor), nullptr, + DeviceRuntime::TTNN); +} + +Tensor toDevice(Tensor tensor, Device device) { + const ::ttnn::Tensor &hostTensor = + tensor.as<::ttnn::Tensor>(DeviceRuntime::TTNN); + auto &meshDevice = device.as<::ttnn::MeshDevice>(DeviceRuntime::TTNN); + DeviceVariant targetDevice = getTargetDevice(meshDevice); + std::shared_ptr<::ttnn::Tensor> deviceTensor = std::visit( + [&hostTensor](auto &&device) -> std::shared_ptr<::ttnn::Tensor> { + return std::make_shared<::ttnn::Tensor>( + ::ttnn::to_device(hostTensor, &(device.get()), std::nullopt)); + }, + targetDevice); + return Tensor(std::static_pointer_cast(deviceTensor), nullptr, + DeviceRuntime::TTNN); +} + +Tensor toDevice(Tensor tensor, Device device, Layout layout) { + const ::ttnn::Tensor &hostTensor = + tensor.as<::ttnn::Tensor>(DeviceRuntime::TTNN); + LOG_ASSERT(utils::isOnHost(hostTensor.storage_type()), + "Input tensor to ToDevice must be on host"); + const ::ttnn::Layout &inputLayout = hostTensor.get_layout(); + const ::ttnn::DataType &inputDataType = hostTensor.get_dtype(); + LayoutDesc inputLayoutDesc(::ttnn::BufferType::SYSTEM_MEMORY, inputLayout, + inputDataType, std::nullopt); + const LayoutDesc &outputLayoutDesc = + layout.as(DeviceRuntime::TTNN); + LOG_ASSERT(outputLayoutDesc.isOnDevice(), "Output layout must be on device"); + LOG_ASSERT(outputLayoutDesc.memoryConfig.has_value(), + "Output layout must have memory config"); ::ttnn::MeshDevice &meshDevice = - deviceHandle.as<::ttnn::MeshDevice>(DeviceRuntime::TTNN); - std::vector<::ttnn::Tensor *> inputs; - inputs.reserve(inputHandles.size()); - for (auto &input : inputHandles) { - LOG_ASSERT(input.matchesRuntime(DeviceRuntime::TTNN)); - inputs.push_back(static_cast<::ttnn::Tensor *>(input.handle.get())); - } + device.as<::ttnn::MeshDevice>(DeviceRuntime::TTNN); + DeviceVariant targetDevice = getTargetDevice(meshDevice); - std::vector<::ttnn::Tensor *> outputs; - outputs.reserve(outputHandles.size()); - for (auto &output : outputHandles) { - LOG_ASSERT(output.matchesRuntime(DeviceRuntime::TTNN)); - outputs.push_back(static_cast<::ttnn::Tensor *>(output.handle.get())); - } + detail::LayoutConverter converter(inputLayoutDesc, outputLayoutDesc); + std::shared_ptr<::ttnn::Tensor> out = std::make_shared<::ttnn::Tensor>( + converter.convertTensorLayout(hostTensor, targetDevice)); - tt::runtime::ttnn::runProgram(meshDevice, executableHandle, programIndex, - inputs, outputs); - return Event(nullptr, DeviceRuntime::TTNN); + return Tensor(std::static_pointer_cast(out), nullptr, + DeviceRuntime::TTNN); 
} -void wait(Event event) { - // Not implemented - LOG_ASSERT(event.matchesRuntime(DeviceRuntime::TTNN)); +Tensor toLayout(Tensor tensor, Layout layout) { + const ::ttnn::Tensor &inputTensor = + tensor.as<::ttnn::Tensor>(DeviceRuntime::TTNN); + const ::ttnn::StorageType &inputStorageType = inputTensor.storage_type(); + const ::ttnn::BufferType &inputBufferType = + utils::isOnHost(inputStorageType) + ? ::ttnn::BufferType::SYSTEM_MEMORY + : inputTensor.memory_config().buffer_type; + const ::ttnn::Layout &inputLayout = inputTensor.get_layout(); + const ::ttnn::DataType &inputDataType = inputTensor.get_dtype(); + std::optional<::ttnn::MemoryConfig> inputMemoryConfig = std::nullopt; + if (utils::isOnDevice(inputStorageType)) { + inputMemoryConfig = + std::make_optional<::ttnn::MemoryConfig>(inputTensor.memory_config()); + } + LayoutDesc inputLayoutDesc(inputBufferType, inputLayout, inputDataType, + inputMemoryConfig); + const LayoutDesc &outputLayoutDesc = + layout.as(DeviceRuntime::TTNN); + LOG_ASSERT(inputLayoutDesc.isOnHost() == outputLayoutDesc.isOnHost(), + "Input and output layout must be on the same memory space"); + + detail::LayoutConverter converter(inputLayoutDesc, outputLayoutDesc); + std::shared_ptr<::ttnn::Tensor> out = std::make_shared<::ttnn::Tensor>( + converter.convertTensorLayout(inputTensor, std::nullopt)); + + return Tensor(std::static_pointer_cast(out), nullptr, + DeviceRuntime::TTNN); +} + +Layout getLayout(Binary executableHandle, std::uint32_t programIndex, + std::uint32_t inputIndex) { + const ::tt::target::ttnn::TTNNBinary &fbb = *getBinary(executableHandle); + LOG_ASSERT(programIndex < fbb.programs()->size(), "Invalid program index"); + const ::tt::target::ttnn::Program *program = + fbb.programs()->Get(programIndex); + LOG_ASSERT(inputIndex < program->inputs()->size(), "Invalid input index"); + const ::tt::target::TensorRef *input = program->inputs()->Get(inputIndex); + ::ttnn::BufferType inputBufferType = utils::toTTNNBufferType( + input->desc()->layout()->memory_desc()->memory_space()); + ::ttnn::Layout inputLayout = utils::inferLayoutFromTileShape(input); + ::ttnn::DataType inputDataType = utils::toTTNNDataType( + input->desc()->layout()->memory_desc()->data_type()); + std::optional<::ttnn::MemoryConfig> inputMemoryConfig = std::nullopt; + if (inputBufferType != ::ttnn::BufferType::SYSTEM_MEMORY) { + inputMemoryConfig = utils::createMemoryConfig(input); + } + std::shared_ptr layoutDesc = std::make_shared( + inputBufferType, inputLayout, inputDataType, inputMemoryConfig); + return Layout(std::static_pointer_cast(layoutDesc), + DeviceRuntime::TTNN); } std::string getOpDebugString(OpContext opContextHandle) { @@ -299,7 +649,7 @@ Tensor getOpOutputTensor(OpContext opContextHandle, return createNullTensor(); } default: { - throw std::runtime_error("Unsupported operation type"); + LOG_FATAL("Unsupported operation type"); } } @@ -326,12 +676,13 @@ Tensor getOpOutputTensor(OpContext opContextHandle, outCopy.shape().value, ::ttnn::DataType::FLOAT32, ::ttnn::Layout::ROW_MAJOR); - return Tensor(std::static_pointer_cast(tensor), data, + return Tensor(std::static_pointer_cast(tensor), nullptr, DeviceRuntime::TTNN); } std::vector getTensorData(Tensor tensor) { - ::ttnn::Tensor *nnTensor = static_cast<::ttnn::Tensor *>(tensor.handle.get()); + const ::ttnn::Tensor *nnTensor = + static_cast<::ttnn::Tensor *>(tensor.handle.get()); if (nnTensor == nullptr) { return {}; } @@ -341,4 +692,79 @@ std::vector getTensorData(Tensor tensor) { static_cast(dataPtr) + nnTensor->volume()); } 
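The memcpy and deallocateTensor definitions below complete the new tensor API. End to end, the intended host-side flow is roughly the following; a minimal sketch under stated assumptions: binary is a loaded TTNN flatbuffer, input is a host tensor already built with createTensor, and hostBuffer is a caller-owned allocation of matching size (all three names are hypothetical).

// Hedged sketch of the non-legacy submit flow; not a drop-in program.
#include "tt/runtime/runtime.h"
#include <vector>

void runOnce(tt::runtime::Binary binary, tt::runtime::Tensor input,
             void *hostBuffer) {
  tt::runtime::Device device = tt::runtime::openDevice({0});
  // Query the layout program 0 expects for its first input.
  tt::runtime::Layout layout = tt::runtime::getLayout(binary, 0, 0);
  // Move the host tensor to device, tilizing/typecasting as needed.
  tt::runtime::Tensor onDevice = tt::runtime::toDevice(input, device, layout);
  // submit now returns output tensors instead of filling caller buffers.
  std::vector<tt::runtime::Tensor> outputs =
      tt::runtime::submit(device, binary, 0, {onDevice});
  // Bring the first output back to host, untilized to row-major.
  tt::runtime::Tensor host = tt::runtime::toHost(outputs[0], true);
  // Copy the bytes out, then release the runtime-owned tensor.
  tt::runtime::memcpy(hostBuffer, host);
  tt::runtime::deallocateTensor(host, true);
  tt::runtime::closeDevice(device);
}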
+void memcpy(void *dst, Tensor src) { + const ::ttnn::Tensor &srcTensor = src.as<::ttnn::Tensor>(DeviceRuntime::TTNN); + void *srcPtr = ::tt::tt_metal::get_raw_host_data_ptr(srcTensor); + size_t size = srcTensor.volume() * srcTensor.element_size(); + std::memcpy(dst, srcPtr, size); +} + +void memcpy(Tensor dst, Tensor src) { + ::ttnn::Tensor &dstTensor = dst.as<::ttnn::Tensor>(DeviceRuntime::TTNN); + const ::ttnn::Tensor &srcTensor = src.as<::ttnn::Tensor>(DeviceRuntime::TTNN); + LOG_ASSERT(srcTensor.volume() * srcTensor.element_size() == + dstTensor.volume() * dstTensor.element_size(), + "Input output tensor size mismatch in memcpy: ", + srcTensor.volume(), " * ", srcTensor.element_size(), + " != ", dstTensor.volume(), " * ", dstTensor.element_size()); + void *dstPtr = ::tt::tt_metal::get_raw_host_data_ptr(dstTensor); + ::tt::runtime::ttnn::memcpy(dstPtr, src); +} + +void deallocateTensor(Tensor &tensor, bool force) { + ::ttnn::Tensor &ttnnTensor = tensor.as<::ttnn::Tensor>(DeviceRuntime::TTNN); + ::ttnn::deallocate(ttnnTensor, force); +} + +namespace legacy { +void wait(Event event) { + // Not implemented + LOG_ASSERT(event.matchesRuntime(DeviceRuntime::TTNN)); +} + +Event submit(Device deviceHandle, Binary executableHandle, + std::uint32_t programIndex, + std::vector const &inputHandles, + std::vector const &outputHandles) { + ::ttnn::MeshDevice &meshDevice = + deviceHandle.as<::ttnn::MeshDevice>(DeviceRuntime::TTNN); + std::vector<::ttnn::Tensor *> inputs; + inputs.reserve(inputHandles.size()); + for (auto &input : inputHandles) { + LOG_ASSERT(input.matchesRuntime(DeviceRuntime::TTNN)); + inputs.push_back(static_cast<::ttnn::Tensor *>(input.handle.get())); + } + + std::vector<::ttnn::Tensor *> outputs; + outputs.reserve(outputHandles.size()); + for (auto &output : outputHandles) { + LOG_ASSERT(output.matchesRuntime(DeviceRuntime::TTNN)); + outputs.push_back(static_cast<::ttnn::Tensor *>(output.handle.get())); + } + + tt::runtime::ttnn::legacy::runProgram(meshDevice, executableHandle, + programIndex, inputs, outputs); + return Event(nullptr, DeviceRuntime::TTNN); +} +} // namespace legacy + +std::vector submit(Device deviceHandle, Binary executableHandle, + std::uint32_t programIndex, + std::vector const &inputHandles) { + ::ttnn::MeshDevice &meshDevice = + deviceHandle.as<::ttnn::MeshDevice>(DeviceRuntime::TTNN); + std::vector<::ttnn::Tensor *> inputs; + inputs.reserve(inputHandles.size()); + std::transform(inputHandles.begin(), inputHandles.end(), + std::back_inserter(inputs), + [](const Tensor &input) -> ::ttnn::Tensor * { + LOG_ASSERT(input.matchesRuntime(DeviceRuntime::TTNN)); + return static_cast<::ttnn::Tensor *>(input.handle.get()); + }); + std::vector outputs = ::tt::runtime::ttnn::runProgram( + meshDevice, executableHandle, programIndex, inputs); + + return outputs; +} + } // namespace tt::runtime::ttnn diff --git a/runtime/test/ttnn/test_subtract.cpp b/runtime/test/ttnn/test_subtract.cpp index 00aebe20fb..995b95665c 100644 --- a/runtime/test/ttnn/test_subtract.cpp +++ b/runtime/test/ttnn/test_subtract.cpp @@ -21,12 +21,13 @@ TEST(TTNNSubtract, Equal) { const char *fbPath = std::getenv("TTMLIR_SUBTRACT_FB_PATH"); assert(fbPath && "Path to subtract flatbuffer must be provided"); ::tt::runtime::Binary fbb = ::tt::runtime::Binary::loadFromPath(fbPath); - EXPECT_EQ(fbb.getFileIdentifier(), "TTNN"); + ASSERT_EQ(fbb.getFileIdentifier(), "TTNN"); ::tt::runtime::setCompatibleRuntime(fbb); std::vector<::tt::runtime::TensorDesc> inputDescs = fbb.getProgramInputs(0); + 
assert(inputDescs.size() == 2); std::vector<::tt::runtime::TensorDesc> outputDescs = fbb.getProgramOutputs(0); - std::vector<::tt::runtime::Tensor> inputTensors, outputTensors; - + assert(outputDescs.size() == 1); + std::vector<::tt::runtime::Tensor> inputTensors; std::uint32_t tensorSize = inputDescs[0].itemsize; for (const int dim : inputDescs[0].shape) { tensorSize *= dim; @@ -38,26 +39,27 @@ TEST(TTNNSubtract, Equal) { std::memset(data.get(), 1, tensorSize); inputTensors.emplace_back(::tt::runtime::createTensor(data, desc)); } - for (const auto &desc : outputDescs) { - std::shared_ptr data = - ::tt::runtime::utils::malloc_shared(tensorSize); - // Set to wrong value on purpose here - std::memset(data.get(), 1, tensorSize); - outputTensors.emplace_back(::tt::runtime::createTensor(data, desc)); - } + + std::shared_ptr outputDataPtr = + ::tt::runtime::utils::malloc_shared(tensorSize); + // Set to wrong value on purpose here + std::memset(outputDataPtr.get(), 1, tensorSize); + ::tt::runtime::Tensor outputTensor = + ::tt::runtime::createTensor(outputDataPtr, outputDescs[0]); size_t numDevices = ::tt::runtime::getNumAvailableDevices(); std::vector deviceIds(numDevices); std::iota(deviceIds.begin(), deviceIds.end(), 0); - auto device = ::tt::runtime::openDevice(deviceIds); - auto ev = ::tt::runtime::submit(device, fbb, 0, inputTensors, outputTensors); + auto device = ::tt::runtime::openDevice({deviceIds[0]}); + std::vector<::tt::runtime::Tensor> output = + ::tt::runtime::submit(device, fbb, 0, inputTensors); ::tt::runtime::closeDevice(device); - + assert(output.size() == 1); std::shared_ptr expected = ::tt::runtime::utils::malloc_shared(tensorSize); std::memset(expected.get(), 0, tensorSize); - for (const auto &outputTensor : outputTensors) { - EXPECT_EQ(std::memcmp(outputTensor.data.get(), expected.get(), tensorSize), - 0); - } + ::tt::runtime::Tensor submitOutput = output[0]; + ASSERT_NE(std::memcmp(outputDataPtr.get(), expected.get(), tensorSize), 0); + ::tt::runtime::memcpy(outputTensor, submitOutput); + ASSERT_EQ(std::memcmp(outputDataPtr.get(), expected.get(), tensorSize), 0); } diff --git a/runtime/tools/python/setup.py b/runtime/tools/python/setup.py index ddbe3da9fe..50d6b18392 100644 --- a/runtime/tools/python/setup.py +++ b/runtime/tools/python/setup.py @@ -59,7 +59,12 @@ linklibs = ["TTBinary"] if enable_ttnn: runlibs += ["_ttnn.so"] - linklibs += ["TTRuntimeTTNN", "TTRuntimeTTNNOps", ":_ttnn.so"] + linklibs += [ + "TTRuntimeTTNN", + "TTRuntimeTTNNOps", + "TTRuntimeTTNNHelpers", + ":_ttnn.so", + ] if enable_ttmetal: runlibs += ["libtt_metal.so"] @@ -231,6 +236,7 @@ def package_files(directory): f"{src_dir}/build/runtime/lib/common", f"{src_dir}/build/runtime/lib/ttnn", f"{src_dir}/build/runtime/lib/ttnn/operations", + f"{src_dir}/build/runtime/lib/ttnn/include", f"{src_dir}/build/runtime/lib/ttmetal", f"{toolchain}/lib", f"{src_dir}/build/runtime/tools/python/ttrt/runtime", diff --git a/runtime/tools/python/test/test_api.py b/runtime/tools/python/test/test_api.py new file mode 100644 index 0000000000..a0ddb087b8 --- /dev/null +++ b/runtime/tools/python/test/test_api.py @@ -0,0 +1,150 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +import os +import pytest +import ttrt +import torch +from ttrt.common.run import Run +from ttrt.common.query import Query +from ttrt.common.util import * +from ttrt.common.api import API +from util import * + + +def log_debug(logger, message, prefix="------", suffix=""): + 
logger.debug(f"{prefix}{message}{suffix}") + + +class TestRunner: + def __init__(self, logger=None): + self.artifacts_dir = f"{os.getcwd()}/ttrt-artifacts" + self.logger = logger if logger is not None else Logger() + self.file_manager = FileManager(self.logger) + self.results = Results(self.logger, self.file_manager) + self.artifacts = Artifacts( + self.logger, self.file_manager, artifacts_folder_path=self.artifacts_dir + ) + self.query = Query({"--quiet": True}, self.logger, self.artifacts) + self.query() + self.results_path = None + self.results = None + + def initialize(self, binary_path, results_path): + self.binary_path = binary_path + self.binary = Binary(self.logger, self.file_manager, binary_path) + self.results_path = results_path + self.results = Results(self.logger, self.file_manager) + + def teardown(self): + self.binary_path = None + self.binary = None + self.results_path = None + self.results = None + + def create_and_save_result( + self, status: str, exception: str = "", program_index: int = 0 + ): + test_result = { + "file_path": self.binary_path, + "result": status, + "exception": exception, + "log_file": self.logger.file_name, + "artifacts": self.artifacts.artifacts_folder_path, + "program_index": program_index, + } + self.results.add_result(test_result) + self.results.save_results(self.results_path) + + def check_constraints(self): + self.binary.check_version() + self.binary.check_system_desc(self.query) + + def test_to_device_to_host(self, shape, dtype): + tensor = torch.randn(shape, dtype=dtype) + result_tensor = torch.zeros(shape, dtype=dtype) + runtime_input_tensor = ttrt.runtime.create_tensor( + tensor.data_ptr(), + list(tensor.shape), + list(tensor.stride()), + tensor.element_size(), + Binary.Program.to_data_type(tensor.dtype), + ) + device = ttrt.runtime.open_device([self.query.device_ids[0]]) + device_tensor = ttrt.runtime.to_device(runtime_input_tensor, device) + host_tensor = ttrt.runtime.to_host(device_tensor) + ttrt.runtime.close_device(device) + ttrt.runtime.memcpy(result_tensor.data_ptr(), host_tensor) + assert torch.allclose(tensor, result_tensor) + + def test_host_layout_conversion(self): + program_indices = list(range(self.binary.get_num_programs())) + for program_index in program_indices: + program = self.binary.get_program(program_index) + program.populate_inputs(Run.TorchInitializer.get_initilizer("randn")) + for input_index, input_tensor in enumerate(program.input_tensors): + layout = ttrt.runtime.get_layout( + self.binary.fbb, program_index, input_index + ) + input_tensor_runtime = ttrt.runtime.create_tensor( + input_tensor.data_ptr(), + list(input_tensor.shape), + list(input_tensor.stride()), + input_tensor.element_size(), + Binary.Program.to_data_type(input_tensor.dtype), + ) + converted_tensor = ttrt.runtime.to_layout(input_tensor_runtime, layout) + result_tensor = torch.zeros( + input_tensor.shape, dtype=input_tensor.dtype + ) + ttrt.runtime.memcpy(result_tensor.data_ptr(), converted_tensor) + assert torch.allclose(input_tensor, result_tensor) + + # TODO: Add test for to_device with layout and on-device layout conversion + # once compiler supports non-host input tensor layouts + + +class TestContext: + def __init__(self, test_runner: TestRunner): + self.test_runner = test_runner + + def __enter__(self): + return self.test_runner + + def __exit__(self, exc_type, exc_val, exc_tb): + try: + if exc_type is None: + self.test_runner.create_and_save_result("pass") + else: + self.test_runner.create_and_save_result("error", str(exc_val)) + finally: + 
self.test_runner.teardown() + return False + + +@pytest.fixture(scope="module") +def test_runner(): + API.initialize_apis() + runner = TestRunner() + yield runner + + +@pytest.mark.parametrize("shape", [(64, 128)]) +@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16]) +def test_to_device_to_host(test_runner: TestRunner, shape, dtype, request): + results_path = f"ttrt-results/{request.node.name}.json" + binary_path = BINARY_FILE_PATH + test_runner.initialize(binary_path, results_path) + with TestContext(test_runner): + test_runner.check_constraints() + test_runner.test_to_device_to_host(shape, dtype) + + +def test_host_layout_conversion(test_runner: TestRunner, request): + results_path = f"ttrt-results/{request.node.name}.json" + binary_path = BINARY_FILE_PATH + test_runner.initialize(binary_path, results_path) + with TestContext(test_runner): + test_runner.check_constraints() + test_runner.test_host_layout_conversion() diff --git a/runtime/tools/python/ttrt/common/run.py b/runtime/tools/python/ttrt/common/run.py index c2ae10ac9a..ff77de75ed 100644 --- a/runtime/tools/python/ttrt/common/run.py +++ b/runtime/tools/python/ttrt/common/run.py @@ -380,6 +380,7 @@ def _execute(binaries): self.logging.debug(f"setting torch manual seed={self['--seed']}") torch.manual_seed(self["--seed"]) ttrt.runtime.set_compatible_runtime(binaries[0].fbb) + current_runtime = ttrt.runtime.get_current_runtime() self.logging.debug(f"opening devices={self.query.device_ids}") device = ttrt.runtime.open_device(self.query.device_ids) @@ -459,20 +460,43 @@ def _execute(binaries): self.logging.debug( f"starting loop={loop+1}/{self['--loops']} for binary={bin.file_path}" ) + event = None + if ( + current_runtime + == ttrt.runtime.DeviceRuntime.TTMetal + ): + event = ttrt.runtime.submit( + device, + bin.fbb, + program_index, + total_inputs[loop], + total_outputs[loop], + ) - event = ttrt.runtime.submit( - device, - bin.fbb, - program_index, - total_inputs[loop], - total_outputs[loop], - ) + elif current_runtime == ttrt.runtime.DeviceRuntime.TTNN: + runtime_outputs = ttrt.runtime.submit( + device, + bin.fbb, + program_index, + total_inputs[loop], + ) + for i, runtime_output_tensor in enumerate( + runtime_outputs + ): + ttrt.runtime.memcpy( + total_outputs[loop][i], + runtime_output_tensor, + ) + ttrt.runtime.deallocate_tensor( + runtime_output_tensor, force=True + ) self.logging.debug( f"finished loop={loop+1}/{self['--loops']} for binary={bin.file_path}" ) - ttrt.runtime.wait(event) + if event is not None: + ttrt.runtime.wait(event) if self["--identity"]: self.logging.debug( diff --git a/runtime/tools/python/ttrt/runtime/__init__.py b/runtime/tools/python/ttrt/runtime/__init__.py index 642b0401f5..dc8ac5c35c 100644 --- a/runtime/tools/python/ttrt/runtime/__init__.py +++ b/runtime/tools/python/ttrt/runtime/__init__.py @@ -20,8 +20,14 @@ create_tensor, create_multi_device_tensor, wait, + to_host, + to_device, + to_layout, + get_layout, get_op_output_tensor, get_op_debug_str, + memcpy, + deallocate_tensor, WorkaroundEnv, ) except ModuleNotFoundError: diff --git a/runtime/tools/python/ttrt/runtime/module.cpp b/runtime/tools/python/ttrt/runtime/module.cpp index dfc4a68201..e64c752ba5 100644 --- a/runtime/tools/python/ttrt/runtime/module.cpp +++ b/runtime/tools/python/ttrt/runtime/module.cpp @@ -22,6 +22,7 @@ PYBIND11_MODULE(_C, m) { .def("deallocate_buffers", &tt::runtime::detail::deallocateBuffers); py::class_(m, "Event"); py::class_(m, "Tensor"); + py::class_(m, "Layout"); py::class_(m, "OpContext"); py::class_(m, 
"CallbackContext"); py::enum_<::tt::target::DataType>(m, "DataType") @@ -69,8 +70,8 @@ PYBIND11_MODULE(_C, m) { ::tt::target::DataType dataType, std::unordered_map const &strategy) { std::vector> data; - data.resize(ptrs.size()); - std::transform(ptrs.begin(), ptrs.end(), data.begin(), + data.reserve(ptrs.size()); + std::transform(ptrs.begin(), ptrs.end(), std::back_inserter(data), [](std::uintptr_t ptr) { return ::tt::runtime::utils::unsafe_borrow_shared( reinterpret_cast(ptr)); @@ -85,9 +86,53 @@ PYBIND11_MODULE(_C, m) { py::arg("num_hw_cqs") = size_t{1}, "Open a mesh of devices for execution"); m.def("close_device", &tt::runtime::closeDevice, "Close a mesh device"); - m.def("submit", &tt::runtime::submit, py::arg("device"), - py::arg("executable"), py::arg("program_index"), py::arg("inputs"), - py::arg("outputs"), "Submit a binary for execution"); + m.def("to_host", &tt::runtime::toHost, py::arg("tensor"), + py::arg("utilize") = false, "Copy the tensor to the host"); + m.def( + "to_device", + [](const ::tt::runtime::Tensor tensor, + ::tt::runtime::Device device) -> ::tt::runtime::Tensor { + return ::tt::runtime::toDevice(tensor, device); + }, + py::arg("tensor"), py::arg("device"), "Copy the tensor to the device"); + m.def( + "to_device", + [](::tt::runtime::Tensor tensor, ::tt::runtime::Device device, + ::tt::runtime::Layout layout) -> ::tt::runtime::Tensor { + return ::tt::runtime::toDevice(tensor, device, layout); + }, + py::arg("tensor"), py::arg("device"), py::arg("layout"), + "Copy the tensor to the device with the specified layout"); + m.def("to_layout", &tt::runtime::toLayout, py::arg("tensor"), + py::arg("layout"), + "Create a copy of the tensor with the specified layout"); + m.def("get_layout", &tt::runtime::getLayout, py::arg("executable"), + py::arg("program_index"), py::arg("input_index"), + "Get the layout of the input tensor"); + m.def( + "submit", + [](::tt::runtime::Device device, ::tt::runtime::Binary executable, + std::uint32_t programIndex, + const std::vector<::tt::runtime::Tensor> &inputs) + -> std::vector<::tt::runtime::Tensor> { + return ::tt::runtime::submit(device, executable, programIndex, inputs); + }, + py::arg("device"), py::arg("executable"), py::arg("program_index"), + py::arg("inputs"), + "Submit a ttnn binary for execution, returns a vector of output tensors"); + m.def( + "submit", + [](::tt::runtime::Device device, ::tt::runtime::Binary executable, + std::uint32_t programIndex, + const std::vector<::tt::runtime::Tensor> &inputs, + const std::vector<::tt::runtime::Tensor> &outputs) + -> ::tt::runtime::Event { + return ::tt::runtime::submit(device, executable, programIndex, inputs, + outputs); + }, + py::arg("device"), py::arg("executable"), py::arg("program_index"), + py::arg("inputs"), py::arg("outputs"), + "Submit a ttmetal binary for execution. 
returns event wrapper"); m.def("wait", &tt::runtime::wait, py::arg("event")); m.def( "get_op_output_tensor", @@ -100,7 +145,22 @@ PYBIND11_MODULE(_C, m) { "Get the input tensor of the op"); m.def("get_op_debug_str", &tt::runtime::getOpDebugString, "Get the debug string of the op"); - + m.def( + "memcpy", + [](std::uintptr_t dst, ::tt::runtime::Tensor src) { + ::tt::runtime::memcpy(reinterpret_cast(dst), src); + }, + py::arg("dst"), py::arg("src"), + "Copy the data from src tensor to dst memory location"); + m.def( + "memcpy", + [](::tt::runtime::Tensor dst, ::tt::runtime::Tensor src) { + ::tt::runtime::memcpy(dst, src); + }, + py::arg("dst"), py::arg("src"), + "Copy the data from src tensor to dst tensor"); + m.def("deallocate_tensor", &tt::runtime::deallocateTensor, py::arg("tensor"), + py::arg("force") = false, "Deallocate the tensor memory"); py::class_(m, "DebugEnv") .def_static("get", &tt::runtime::debug::Env::get) .def("__str__", [](const tt::runtime::debug::Env &env) { diff --git a/test/ttmlir/Dialect/TTNN/eltwise/unary/isfinite/simple_isfinite.mlir b/test/ttmlir/Dialect/TTNN/eltwise/unary/isfinite/simple_isfinite.mlir index e819e68f4b..3089da6692 100644 --- a/test/ttmlir/Dialect/TTNN/eltwise/unary/isfinite/simple_isfinite.mlir +++ b/test/ttmlir/Dialect/TTNN/eltwise/unary/isfinite/simple_isfinite.mlir @@ -1,15 +1,15 @@ // RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline %s | FileCheck %s #any_device = #tt.operand_constraint module attributes {} { - func.func @is_finite(%arg0: tensor<64x128xf32>) -> tensor<64x128xbf16> { + func.func @is_finite(%arg0: tensor<64x128xbf16>) -> tensor<64x128xbf16> { // CHECK: %[[C:.*]] = "ttnn.empty" // CHECK-SAME: [[TENSOR:tensor<64x128xbf16,]] %0 = tensor.empty() : tensor<64x128xbf16> // CHECK: %[[C:.*]] = "ttnn.isfinite" - // CHECK-SAME: tensor<64x128xf32, + // CHECK-SAME: tensor<64x128xbf16, // CHECK-SAME: [[TENSOR]] // CHECK-SAME: -> [[TENSOR]] - %1 = "ttir.isfinite"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xbf16>) -> tensor<64x128xbf16> + %1 = "ttir.isfinite"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xbf16>, tensor<64x128xbf16>) -> tensor<64x128xbf16> return %1 : tensor<64x128xbf16> } } diff --git a/test/ttmlir/Silicon/StableHLO/Unary/isfinite_op.mlir b/test/ttmlir/Silicon/StableHLO/Unary/isfinite_op.mlir index 04b9f1fefb..35682c8c0b 100644 --- a/test/ttmlir/Silicon/StableHLO/Unary/isfinite_op.mlir +++ b/test/ttmlir/Silicon/StableHLO/Unary/isfinite_op.mlir @@ -7,14 +7,14 @@ // RUN: FileCheck --input-file=%t.mlir %s module @jit_eltwise_isfinite attributes {} { - func.func public @test_isfinite(%arg0: tensor<64x128xf32>) -> tensor<64x128xi1> { + func.func public @test_isfinite(%arg0: tensor<64x128xbf16>) -> tensor<64x128xi1> { // CHECK-LABEL: func.func public @test_isfinite // CHECK: ttnn.empty // CHECK: ttnn.isfinite - // CHECK-SAME: tensor<64x128xf32, + // CHECK-SAME: tensor<64x128xbf16, // CHECK-SAME: tensor<64x128xbf16, // CHECK-SAME: -> tensor<64x128xbf16, - %0 = stablehlo.is_finite %arg0 : (tensor<64x128xf32>) -> tensor<64x128xi1> + %0 = stablehlo.is_finite %arg0 : (tensor<64x128xbf16>) -> tensor<64x128xi1> return %0 : tensor<64x128xi1> } } diff --git a/test/ttmlir/Silicon/StableHLO/select_op.mlir b/test/ttmlir/Silicon/StableHLO/select_op.mlir index 23b7182ce0..1cdc5e9d05 100644 --- a/test/ttmlir/Silicon/StableHLO/select_op.mlir +++ b/test/ttmlir/Silicon/StableHLO/select_op.mlir @@ -6,23 +6,23 
diff --git a/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_isfinite.mlir b/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_isfinite.mlir
index ce0146be40..f1489a5ebd 100644
--- a/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_isfinite.mlir
+++ b/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_isfinite.mlir
@@ -4,14 +4,14 @@
 #any_device = #tt.operand_constraint<any_device>
 #any_device_tile = #tt.operand_constraint<any_device_tile>
 
-func.func @is_finite(%arg0: tensor<64x128xf32>) -> tensor<64x128xbf16> {
+func.func @is_finite(%arg0: tensor<64x128xbf16>) -> tensor<64x128xbf16> {
   // CHECK: %[[C:.*]] = "ttnn.empty"
   // CHECK-SAME: [[TENSOR:tensor<64x128xbf16,]]
   %0 = tensor.empty() : tensor<64x128xbf16>
   // CHECK: %[[C:.*]] = "ttnn.isfinite"
-  // CHECK-SAME: tensor<64x128xf32,
+  // CHECK-SAME: tensor<64x128xbf16,
   // CHECK-SAME: [[TENSOR]]
   // CHECK-SAME: -> [[TENSOR]]
-  %1 = "ttir.isfinite"(%arg0, %0) <{operandSegmentSizes = array<i32: 1, 1>, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xbf16>) -> tensor<64x128xbf16>
+  %1 = "ttir.isfinite"(%arg0, %0) <{operandSegmentSizes = array<i32: 1, 1>, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xbf16>, tensor<64x128xbf16>) -> tensor<64x128xbf16>
   return %1 : tensor<64x128xbf16>
 }
diff --git a/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_le.mlir b/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_le.mlir
deleted file mode 100644
index 79de8c062d..0000000000
--- a/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_le.mlir
+++ /dev/null
@@ -1,21 +0,0 @@
-// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir
-// RUN: FileCheck %s --input-file=%t.mlir
-// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn
-
-#any_device = #tt.operand_constraint<any_device>
-#any_device_tile = #tt.operand_constraint<any_device_tile>
-
-module attributes {} {
-  func.func @less_equal(%arg0: tensor<13x31xf32>, %arg1: tensor<13x31xf32>) -> tensor<13x31xf32> {
-    // CHECK: %[[C:.*]] = "ttnn.empty"
-    // CHECK-SAME: [[TENSOR:tensor<13x31xf32,]]
-    %0 = tensor.empty() : tensor<13x31xf32>
-    // CHECK: %[[C:.*]] = "ttnn.le"
-    // CHECK-SAME: [[TENSOR]]
-    // CHECK-SAME: [[TENSOR]]
-    // CHECK-SAME: [[TENSOR]]
-    // CHECK-SAME: -> [[TENSOR]]
-    %1 = "ttir.le"(%arg0, %arg1, %0) <{operandSegmentSizes = array<i32: 2, 1>, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<13x31xf32>, tensor<13x31xf32>, tensor<13x31xf32>) -> tensor<13x31xf32>
-    return %1 : tensor<13x31xf32>
-  }
-}
diff --git a/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_where.mlir b/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_where.mlir
index 3bed0528c6..647f94e61e 100644
--- a/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_where.mlir
+++ b/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_where.mlir
@@ -4,13 +4,13 @@
 #any_device = #tt.operand_constraint<any_device>
 #any_device_tile = #tt.operand_constraint<any_device_tile>
 
-func.func @test_where(%arg0: tensor<13x37xf32>, %arg1: tensor<13x37xf32>) -> tensor<13x37xf32> {
+func.func @test_where(%arg0: tensor<13x37xbf16>, %arg1: tensor<13x37xbf16>) -> tensor<13x37xbf16> {
   %0 = tensor.empty() : tensor<13x37xbf16>
-  %1 = "ttir.eq"(%arg0, %arg1, %0) <{operandSegmentSizes = array<i32: 2, 1>, operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<13x37xf32>, tensor<13x37xf32>, tensor<13x37xbf16>) -> tensor<13x37xbf16>
-  %2 = tensor.empty() : tensor<13x37xf32>
-  %3 = "ttir.where"(%1, %arg0, %arg1, %2) <{operandSegmentSizes = array<i32: 3, 1>, operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<13x37xbf16>, tensor<13x37xf32>, tensor<13x37xf32>, tensor<13x37xf32>) -> tensor<13x37xf32>
+  %1 = "ttir.eq"(%arg0, %arg1, %0) <{operandSegmentSizes = array<i32: 2, 1>, operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<13x37xbf16>, tensor<13x37xbf16>, tensor<13x37xbf16>) -> tensor<13x37xbf16>
+  %2 = tensor.empty() : tensor<13x37xbf16>
+  %3 = "ttir.where"(%1, %arg0, %arg1, %2) <{operandSegmentSizes = array<i32: 3, 1>, operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<13x37xbf16>, tensor<13x37xbf16>, tensor<13x37xbf16>, tensor<13x37xbf16>) -> tensor<13x37xbf16>
   // CHECK: %[[EMPTY:.*]] = "ttnn.empty"{{.*}}
   // CHECK: %[[VAL1:[0-9]+]] = "ttnn.eq"(%{{[0-9]+}}, %{{[0-9]+}}, %[[EMPTY]])
   // CHECK: %{{[0-9]+}} = "ttnn.where"(%[[VAL1]], %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}})
-  return %3 : tensor<13x37xf32>
+  return %3 : tensor<13x37xbf16>
 }
diff --git a/test/ttmlir/Silicon/TTNN/simple_broadcast.mlir b/test/ttmlir/Silicon/TTNN/simple_broadcast.mlir
index 1d88725d1d..db02d1ba13 100644
--- a/test/ttmlir/Silicon/TTNN/simple_broadcast.mlir
+++ b/test/ttmlir/Silicon/TTNN/simple_broadcast.mlir
@@ -1,6 +1,7 @@
 // RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir
 // RUN: FileCheck %s --input-file=%t.mlir
 // RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn
+// UNSUPPORTED: true
 
 #any_device = #tt.operand_constraint<any_device>
 func.func public @broadcast() -> (tensor<32xf32>) {
diff --git a/test/ttmlir/Silicon/TTNN/simple_eltwise.mlir b/test/ttmlir/Silicon/TTNN/simple_eltwise.mlir
index 976f2867db..0bb337e7f9 100644
--- a/test/ttmlir/Silicon/TTNN/simple_eltwise.mlir
+++ b/test/ttmlir/Silicon/TTNN/simple_eltwise.mlir
@@ -65,15 +65,15 @@ func.func @floor(%arg0: tensor<64x128xf32>) -> tensor<64x128xf32> {
   return %1 : tensor<64x128xf32>
 }
 
-func.func @is_finite(%arg0: tensor<64x128xf32>) -> tensor<64x128xbf16> {
+func.func @is_finite(%arg0: tensor<64x128xbf16>) -> tensor<64x128xbf16> {
   // CHECK: %[[C:.*]] = "ttnn.empty"
   // CHECK-SAME: [[TENSOR:tensor<64x128xbf16,]]
   %0 = tensor.empty() : tensor<64x128xbf16>
   // CHECK: %[[C:.*]] = "ttnn.isfinite"
-  // CHECK-SAME: tensor<64x128xf32,
+  // CHECK-SAME: tensor<64x128xbf16,
   // CHECK-SAME: [[TENSOR]]
   // CHECK-SAME: -> [[TENSOR]]
-  %1 = "ttir.isfinite"(%arg0, %0) <{operandSegmentSizes = array<i32: 1, 1>, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xbf16>) -> tensor<64x128xbf16>
+  %1 = "ttir.isfinite"(%arg0, %0) <{operandSegmentSizes = array<i32: 1, 1>, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xbf16>, tensor<64x128xbf16>) -> tensor<64x128xbf16>
   return %1 : tensor<64x128xbf16>
 }
 
@@ -278,15 +278,15 @@ func.func @get_dimension_size(%arg0: tensor<13x21x3xf32>) -> tensor<1xi32> {
   // CHECK: return [[VAL]] : tensor<1xi32, {{.*}}>
 }
 
-func.func @test_where(%arg0: tensor<13x37xf32>, %arg1: tensor<13x37xf32>) -> tensor<13x37xf32> {
+func.func @test_where(%arg0: tensor<13x37xbf16>, %arg1: tensor<13x37xbf16>) -> tensor<13x37xbf16> {
   %0 = tensor.empty() : tensor<13x37xbf16>
-  %1 = "ttir.eq"(%arg0, %arg1, %0) <{operandSegmentSizes = array<i32: 2, 1>, operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<13x37xf32>, tensor<13x37xf32>, tensor<13x37xbf16>) -> tensor<13x37xbf16>
-  %2 = tensor.empty() : tensor<13x37xf32>
-  %3 = "ttir.where"(%1, %arg0, %arg1, %2) <{operandSegmentSizes = array<i32: 3, 1>, operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<13x37xbf16>, tensor<13x37xf32>, tensor<13x37xf32>, tensor<13x37xf32>) -> tensor<13x37xf32>
+  %1 = "ttir.eq"(%arg0, %arg1, %0) <{operandSegmentSizes = array<i32: 2, 1>, operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<13x37xbf16>, tensor<13x37xbf16>, tensor<13x37xbf16>) -> tensor<13x37xbf16>
+  %2 = tensor.empty() : tensor<13x37xbf16>
+  %3 = "ttir.where"(%1, %arg0, %arg1, %2) <{operandSegmentSizes = array<i32: 3, 1>, operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<13x37xbf16>, tensor<13x37xbf16>, tensor<13x37xbf16>, tensor<13x37xbf16>) -> tensor<13x37xbf16>
   // CHECK: %[[EMPTY:.*]] = "ttnn.empty"{{.*}}
   // CHECK: %[[VAL1:[0-9]+]] = "ttnn.eq"(%{{[0-9]+}}, %{{[0-9]+}}, %[[EMPTY]])
   // CHECK: %{{[0-9]+}} = "ttnn.where"(%[[VAL1]], %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}})
-  return %3 : tensor<13x37xf32>
+  return %3 : tensor<13x37xbf16>
 }
 
 func.func @gelu(%arg0: tensor<64x128xf32>) -> tensor<64x128xf32> {