diff --git a/.github/actions/build-toolchain/action.yml b/.github/actions/build-toolchain/action.yml
index ee75b1e9d3..6fe6cc3444 100644
--- a/.github/actions/build-toolchain/action.yml
+++ b/.github/actions/build-toolchain/action.yml
@@ -4,6 +4,10 @@ inputs:
   os:
     description: 'Operating System'
     required: true
+  sdk:
+    description: 'macOS SDK, if applicable'
+    required: true
+    default: '0'
 runs:
   using: 'composite'
   steps:
@@ -18,7 +22,7 @@ runs:
       uses: actions/cache@v4
       with:
         path: /opt/ttmlir-toolchain
-        key: ${{ inputs.os }}-ttmlir-toolchain-${{ hashFiles('env/**') }}
+        key: ${{ inputs.os }}-ttmlir-toolchain-${{ hashFiles('env/**') }}-${{ inputs.sdk }}

     - name: 'Build ttmlir-toolchain'
       if: steps.cache-toolchain.outputs.cache-hit != 'true'
diff --git a/.github/workflows/macos-build.yml b/.github/workflows/macos-build.yml
index 43dd21370c..82c464244d 100644
--- a/.github/workflows/macos-build.yml
+++ b/.github/workflows/macos-build.yml
@@ -4,6 +4,9 @@ on:
   workflow_dispatch:
   workflow_call:

+env:
+  SDK_VERSION: "0"
+
 jobs:
   build:
     strategy:
@@ -23,16 +26,23 @@ jobs:
         with:
           os: ${{ matrix.build.runs-on }}

+      - name: Get macOS SDK version
+        if: startsWith(matrix.build.runs-on, 'macos')
+        shell: bash
+        run: |
+          echo "SDK_VERSION=$(xcrun --show-sdk-version)" >> $GITHUB_ENV
+
       - name: Build and cache ttmlir-toolchain
         uses: ./.github/actions/build-toolchain
         with:
           os: ${{ matrix.build.runs-on }}
+          sdk: ${{ env.SDK_VERSION }}

       - name: ccache
         uses: hendrikmuhs/ccache-action@v1.2
         with:
           create-symlink: true
-          key: ${{ matrix.build.runs-on }}-runtime-${{ matrix.build.enable_runtime }}
+          key: ${{ matrix.build.runs-on }}-runtime-${{ matrix.build.enable_runtime }}-${{ env.SDK_VERSION }}

       - name: Set reusable strings
         id: strings
@@ -72,6 +82,12 @@ jobs:
           source env/activate
           cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build.build_type }} -- check-ttmlir

+      - name: Build ttrt
+        shell: bash
+        run: |
+          source env/activate
+          cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build.build_type }} -- ttrt
+
       - name: Upload Test Report
         uses: actions/upload-artifact@v4
         with:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e1226581d9..374a31d442 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,6 +5,8 @@ if (NOT DEFINED ENV{TTMLIR_ENV_ACTIVATED})
   message(FATAL_ERROR "tt-mlir environment not activated. Please run 'source env/activate'.")
 endif()

+option(TT_RUNTIME_ENABLE_PERF_TRACE "Enable performance mode" OFF)
+
 set(CMAKE_BUILD_WITH_INSTALL_NAME_DIR ON)

 set(CMAKE_CXX_STANDARD 17 CACHE STRING "C++ standard to conform to")
diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt
index 7452829c41..1520f5faf2 100644
--- a/docs/CMakeLists.txt
+++ b/docs/CMakeLists.txt
@@ -1,6 +1,6 @@
 add_custom_target(copy-docs-dir
   COMMAND
-    cp -r ${CMAKE_CURRENT_SOURCE_DIR}/ ${CMAKE_CURRENT_BINARY_DIR}
+    cp -r ${CMAKE_CURRENT_SOURCE_DIR}/* ${CMAKE_CURRENT_BINARY_DIR}
 )

 add_custom_target(autogen-summary
diff --git a/docs/src/SUMMARY.md b/docs/src/SUMMARY.md
index 09036280a6..29eec9d1f7 100644
--- a/docs/src/SUMMARY.md
+++ b/docs/src/SUMMARY.md
@@ -25,3 +25,4 @@
   - [Runtime Stitching](./specs/runtime-stitching.md)
   - [Tensor Layout](./specs/tensor-layout.md)
   - [TTNN Interactive Visualizer](./specs/tensor-layout-interactive.md)
+  - [Device](./specs/device.md)
diff --git a/docs/src/adding-an-op.md b/docs/src/adding-an-op.md
index 883b4e5069..0eea465126 100644
--- a/docs/src/adding-an-op.md
+++ b/docs/src/adding-an-op.md
@@ -7,13 +7,19 @@ reference the diff alongside this guide to see the changes in full.

 This guide will cover the following steps:

-1. [Define the Op in the TTIR frontend dialect](#1-define-the-op-in-the-ttir-frontend-dialect)
-2. [Define the Op in the TTNN backend dialect](#2-define-the-op-in-the-ttnn-backend-dialect)
-3. [Convert / Implement the Op in the TTNN passes](#3-convert--implement-the-op-in-the-ttnn-passes)
-4. [Add a unit test for the Op](#4-add-a-unit-test-for-the-op)
-5. [Define flatbuffer schema for the Op](#5-define-flatbuffer-schema-for-the-op)
-6. [Serialize the Op in the flatbuffer format](#6-serialize-the-op-in-the-flatbuffer-format)
-7. [Add runtime support for the Op](#7-add-runtime-support-for-the-op)
+- [Adding an Op](#adding-an-op)
+  - [1. Define the Op in the TTIR frontend dialect](#1-define-the-op-in-the-ttir-frontend-dialect)
+  - [2. Define the Op in the TTNN backend dialect](#2-define-the-op-in-the-ttnn-backend-dialect)
+    - [`TTNNOps.td`](#ttnnopstd)
+    - [`TTNNOps.cpp`](#ttnnopscpp)
+  - [3. Convert / Implement the Op in the TTNN passes](#3-convert--implement-the-op-in-the-ttnn-passes)
+  - [4. Add a unit test for the Op](#4-add-a-unit-test-for-the-op)
+    - [`test/ttmlir/Dialect/TTNN/simple_matmul.mlir`](#testttmlirdialectttnnsimple_matmulmlir)
+  - [5. Define flatbuffer schema for the Op](#5-define-flatbuffer-schema-for-the-op)
+    - [`include/ttmlir/Target/TTNN/program.fbs`](#includettmlirtargetttnnprogramfbs)
+  - [6. Serialize the Op in the flatbuffer format](#6-serialize-the-op-in-the-flatbuffer-format)
+  - [7. Add runtime support for the Op](#7-add-runtime-support-for-the-op)
+    - [`runtime/lib/ttnn/program.cpp`](#runtimelibttnnprogramcpp)

 ## 1. Define the Op in the TTIR frontend dialect
@@ -99,13 +105,13 @@ section for details, the process is the same.
 Next we will implement the conversion from the TTIR `matmul` Op to the TTNN
 `matmul` Op. This is a trivial conversion, as the Ops are identical in their
 semantics, so the changeset isn't going to be very instructive, but will at
 least point to the
-files involved. The conversion is implemented in the `ConvertTTIRToTNN` pass in
-file `lib/Dialect/TTNN/Transforms/Passes.cpp`.
+files involved. The conversion is implemented in the `ConvertTTIRToTTNNPass` pass in
+file `lib/Conversion/TTIRToTTNN/TTIRToTTNNPass.cpp`.

-Zooming into `class ConvertTTIRToTNN` we can see we implement the pass interface
+Zooming into `class ConvertTTIRToTTNNPass` we can see that we implement the pass interface
 via member function `void runOnOperation() final`. This function will be called
 for every operation matching the type specified in the pass tablegen file. A
-quick look at `include/ttmlir/Dialect/TTNN/Passes.td` we can see:
+quick look at `include/ttmlir/Conversion/Passes.td` shows:

 ```
 def ConvertTTIRToTTNN: Pass<"convert-ttir-to-ttnn", "::mlir::ModuleOp"> {
@@ -121,22 +127,21 @@ can match much more complicated patterns (nested inside of the `ModuleOp`'s
 than just a single operation.

 ```cpp
-{{#include ../../../lib/Dialect/TTNN/Transforms/Passes.cpp:adding_an_op_matmul_rewrite_pattern_set}}
+{{#include ../../../lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp:adding_an_op_matmul_rewrite_pattern_set}}
 ```

-> More information on rewrite patterns and their capabilities can be found in the [MLIR
-> documentation](https://mlir.llvm.org/docs/PatternRewriter/).
+> More information on rewrite patterns and their capabilities can be found in the MLIR documentation [here](https://mlir.llvm.org/docs/PatternRewriter/) and [here](https://mlir.llvm.org/docs/DialectConversion/).

-For matmul, we defined a new pattern rewriter that's generic to all binary ops
+For matmul, we defined a new conversion pattern that's generic to all binary ops
 with arguments named `a` and `b`:

 ```cpp
-{{#include ../../../lib/Dialect/TTNN/Transforms/Passes.cpp:adding_an_op_matmul_op_rewriter}}
+{{#include ../../../lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp:adding_an_op_matmul_op_rewriter}}
 ```

 Invoked as part of the rewrite set:

 ```cpp
-TTIRToTTNNBinaryOpRewriter
+MatmulOpConversionPattern
 ```

 We also need to add this op to the C++ emitter,
@@ -149,7 +154,7 @@ So far we have defined the Op in the TTIR and TTNN dialects, implemented
 verifiers, and have conversion passes. Now we need to add a unit test to ensure
 that the pass is working correctly. The unit tests are located in
 `test/ttmlir/Dialect` area. In this case we'll add a test under the `TTNN`
-subdirectory since we are testing the `ConvertTTIRToTTNN` pass.
+subdirectory since we are testing the `ConvertTTIRToTTNNPass`.

 #### `test/ttmlir/Dialect/TTNN/simple_matmul.mlir`

@@ -215,11 +220,11 @@ to a program called `flatc` which generates C++ code (or any language for
 that matter) for serializing and deserializing the schema. This generated code
 can be found in `build/include/ttmlir/Target/TTNN/program_generated.h`.

-Let's head over to `lib/Dialect/TTNN/Transforms/TTNNToSerializedBinary.cpp` to define
+Let's head over to `lib/Target/TTNN/TTNNToFlatbuffer.cpp` to define
 a `createOp` overloaded function that does the conversion from MLIR to flatbuffer:

 ```cpp
-{{#include ../../../lib/Dialect/TTNN/Transforms/TTNNToSerializedBinary.cpp:adding_an_op_matmul_serialize_to_binary}}
+{{#include ../../../lib/Target/TTNN/TTNNToFlatbuffer.cpp:adding_an_op_matmul_serialize_to_binary}}
 ```

 Lots of things are happening here, let's break it down:
@@ -241,7 +246,7 @@ Lots of things are happening here, let's break it down:

 We can finally generate a binary with our new Op!
 We can use the following command:

 ```bash
-./build/bin/ttmlir-opt --ttir-layout --ttnn-open-device --convert-ttir-to-ttnn --ttnn-serialize-to-binary="output=out.ttnn" test/ttmlir/Dialect/TTNN/simple_matmul.mlir
+./build/bin/ttmlir-opt --ttir-to-ttnn-backend-pipeline test/ttmlir/Dialect/TTNN/simple_matmul.mlir | ./build/bin/ttmlir-translate --ttnn-to-flatbuffer -o out.ttnn
 ```

 And we can inspect the binary with [`ttrt`](./ttrt.md):
diff --git a/docs/src/build.md b/docs/src/build.md
index 4d3bbae99d..b075e54aca 100644
--- a/docs/src/build.md
+++ b/docs/src/build.md
@@ -33,6 +33,7 @@ cmake --build build
 ```

 > - To enable the ttnn/metal runtime add `-DTTMLIR_ENABLE_RUNTIME=ON`
+> - To enable the ttnn/metal perf runtime add `-DTT_RUNTIME_ENABLE_PERF_TRACE=ON` and set `export ENABLE_TRACY=1` in your environment before building
 > - To accelerate the builds with ccache use `-DCMAKE_CXX_COMPILER_LAUNCHER=ccache`
 > - To accelerate builds further, if python bindings aren't needed, `-DTTMLIR_ENABLE_BINDINGS_PYTHON=OFF`. For some reason the python bindings link step is very slow.
 > - TTNN build is automatically integrated / handled by tt-mlir cmake build system. For debugging and further information regarding the TTNN backend build step, please refer to [TTNN Documentation](https://tenstorrent.github.io/tt-metal/latest/ttnn/ttnn/installing.html).
@@ -45,11 +46,11 @@ cmake --build build
 > For more information, please refer to
 > [TT-NN and TT-Metalium installation documentation](https://tenstorrent.github.io/tt-metal/latest/ttnn/ttnn/installing.html#step-4-install-and-start-using-tt-nn-and-tt-metalium).

-| OS | Offline Compiler Only | Runtime Enabled Build |
-|----|-----------------------|-----------------------|
-| Ubuntu 22.04 | ✅ | ❌ |
-| Ubuntu 20.04 | ✅ | ✅ |
-| MacOS | ✅ | ❌ |
+| OS | Offline Compiler Only | Runtime Enabled Build | Runtime + Perf Enabled Build |
+|----|-----------------------|-----------------------|------------------------------|
+| Ubuntu 22.04 | ✅ | ❌ | ❌ |
+| Ubuntu 20.04 | ✅ | ✅ | ✅ |
+| macOS | ✅ | ❌ | ❌ |

 ## Test

@@ -104,10 +105,11 @@ For more information visit [pre-commit](https://pre-commit.com/)

 ```bash
 source env/activate
 cmake --build build -- docs
-mdbook serve build/docs/book
+mdbook serve build/docs
 ```

 > - `mdbook` can be installed with the system's package manager.
+> - `mdbook serve` starts a local server at `http://localhost:3000` by default.

 ## Dependencies

@@ -119,6 +121,15 @@ Make sure to have Git LFS installed. You can install it with the following command

 ```bash
 sudo apt-get install git-lfs
 ```

+If you are building with the performance trace enabled (`-DTT_RUNTIME_ENABLE_PERF_TRACE=ON`), you will also need to install the following packages:
+
+```bash
+pip install loguru
+pip install torch
+pip install pandas
+pip install seaborn
+```
+
 ### Ubuntu 22.04

 We need to install Ninja which can be done with the following command
diff --git a/docs/src/dialects-overview.md b/docs/src/dialects-overview.md
index 9856655cf9..e886fb90c1 100644
--- a/docs/src/dialects-overview.md
+++ b/docs/src/dialects-overview.md
@@ -6,7 +6,7 @@ individual dialect documentation for more details.:
 - `tt`: Common types such as, `tt.tile`, `tt.layout`, `tt.grid`, etc. and enums such as, data formats, memory spaces, iterator types etc.
 - `ttir`: A high level dialect that models the tensor compute graph on tenstorrent devices. Accepts `tosa` and `linalg` input.
   - `ttir.generic`: Generically describe compute work.
-  - `ttir.layout`: Convert between different tensor memory layouts and transfer between different memory spaces.
+  - `ttir.to_layout`: Convert between different tensor memory layouts and transfer between different memory spaces.
   - `tensor.pad`: Pad a tensor with a value (ie. convs)
   - `ttir.yield`: return result memref of computation in dispatch region body, lowers to `ttkernel.yield`
   - `ttir.kernel`: lowers to some backend kernel
diff --git a/docs/src/specs/device.md b/docs/src/specs/device.md
new file mode 100644
index 0000000000..ac0e490e4d
--- /dev/null
+++ b/docs/src/specs/device.md
@@ -0,0 +1,295 @@
+# Device
+
+Device in tt-mlir is somewhat of an overloaded term and can refer to different
+things depending on the context. This document will only speak to the compiler's
+abstract representation of a device captured by attribute `#tt.device`.
+
+## Terms
+
+There are many overloaded terms when talking about devices and grids; this
+document will use the following definitions:
+
+- **Physical Grid**: A 2D array of tensix cores on a chip.
+- **Chip**: A single physical chip with a **Physical Grid** of cores.
+- **Card**: A PCIE or Ethernet card that may contain multiple **Chips**.
+- **System**: A collection of **Cards** that are usually connected together on the
+  same host via PCIE or networked via ethernet. A system is represented by
+  `SystemDesc` in the compiler.
+- **Device**: Device is always presented as a single entity to the enclosing
+  scope, but it may be virtualized to abstract a multi-card **System** and
+  part of its encoding carries a **Logical Grid**. Another way to think of device
+  is a view over the system.
+- **Logical Grid** or just **Grid**: A logical shape that abstracts one or
+  more **Physical Grids**.
+
+## Motivation
+
+The device attribute strives to achieve the following goals:
+- Provide a convenient representation of a physical grid that decouples the
+  logical division of tensors from the physical layout of the hardware. This not
+  only simplifies reasoning about how tensors get divided into shards, but can also
+  enable reinterpretations of the device grid for data layout optimization decoupled
+  from the existing encoding of the tensor layouts.
+- Following the first point, the device attribute should be able to represent
+  many different forms of logical grids, from simple 2D grids, to more complex
+  topologies like extra-wide grids or higher dimensional grids.
+- Encode both single chip and multi-chip systems
+  under a single, virtualized representation.
+- Enable many forms of data parallel execution strategies for single and
+  multi chip systems under a single representation.
+
+## Examples
+
+All of the following examples will assume the physical hardware has an 8x8 physical
+grid of cores. We will use notation `[N, 8x8]` to represent an `N` chip system,
+each with an 8x8 physical grid.
+
+`#tt.device` in its simplest, single chip form `[1, 8x8]`, just maps directly 1-1 to the
+underlying physical hardware device.
+
+```mlir
+#tt.device<#tt.grid<8x8, (d0, d1) -> (0, d0, d1)>, [0]>
+```
+
+Let's break down what each of these attributes means:
+- `#tt.grid<8x8, (d0, d1) -> (0, d0, d1)>`: This is a 2D logical grid with dim 8x8.
+  It's followed by an affine map `(d0, d1) -> (0, d0, d1)` that provides a mapping
+  from the logical grid to the physical grid. In this case, the logical grid is the same
+  as the physical grid, so the mapping is the identity function. The logical
+  grid can have any rank, but the physical mapping is always 3D, with the first
+  dimension being the chip index, followed by the 2D physical core index within the chip.
+- `[0]`: This is a list of chip indices. These chip indices directly reference
+  the same chip indices in the system descriptor. The `SystemDesc` attribute
+  that this is in reference to is tagged on the top level `ModuleOp`.
+
+Specific examples that this document will cover:
+- [Data Parallel Over Batch](#data-parallel-over-batch)
+- [Data Parallel Over 2d](#data-parallel-over-2d)
+- [Data Parallel Over 2d and Batch](#data-parallel-over-2d-and-batch)
+- [Pipeline Parallel](#pipeline-parallel)
+- [Reinterpreted Grids (Transpose)](#reinterpreted-grids-transpose)
+- [Reinterpreted Grids (Training Usecase)](#reinterpreted-grids-training-usecase)
+- [Reinterpreted Grids (Extra)](#reinterpreted-grids-extra)
+
+> Before we move on to more complex examples, it's worth having on hand:
+> - The python test `test/python/device_attr.py` which shows how all of these examples
+>   can actually be programmed for the device attribute.
+> - The [Tensor Layout](./tensor-layout.md) spec as the following examples
+>   will demonstrate how tensor layout interacts with the logical device grid.
+
+> **Note on Data Parallel**: There is existing literature that explicitly distinguishes
+> between data parallel and tensor parallel, oftentimes describing data parallel
+> as duplicating the model across multiple devices and trivially dividing up the batch,
+> whereas tensor parallel refers to tensor data being distributed and potentially
+> communicated between devices during execution. While this is true for multi-GPU/CPU
+> systems, it is somewhat of an implementation detail, and given the flexibility
+> of Tenstorrent hardware there is an opportunity to generalize this concept. In this
+> document we will use the term data parallel to refer to any form of parallelism that
+> divides any dimension of the tensor across multiple cores/chips.
+
+> **Note on Constraints**: Many of the examples below require careful virtualization
+> of the underlying physical system, i.e. some device configurations might
+> only work if the chips are connected via ethernet and with a particular
+> topology, but these constraints are
+> outside the scope of the examples and will be discussed further in the
+> [Backend Lowering and Constraints](#backend-lowering-and-constraints) section.
+
+### Data Parallel Over Batch
+
+Given a 2 chip system, `[2, 8x8]`, we can represent a simple data parallel
+logical grid that divides the batch dimension in half across the two chips.
+
+```mlir
+#tt.device<#tt.grid<2x8x8, (d0, d1, d2) -> (d0, d1, d2)>, [0, 1]>
+```
+
+The affine map here is just identity, so dims `d1` and `d2` directly index
+the physical grid and `d0` indexes the chip.
+
+Now we can consider some tensor that, importantly, has a grid of the same rank as
+the logical device grid:
+
+```mlir
+tensor<16x3x64x128xf32,
+  #tt.layout<(d0, d1, d2, d3) -> (d0, d1 * 64 + d2, d3),
+    undef,
+    <2x2x4>,
+    memref<8x3x1x!tt.tile<32 x 32, bfp_bf8>, #tt.memory_space<l1>>
+  >
+>
+```
+
+If we map this tensor onto the above device, it will span across both chips,
+half of the batch dimension on each chip. Within each chip the tensor occupies
+a 2x4 grid out of the 8x8 physical grid available.
+
+### Data Parallel Over 2d
+
+In this example we will consider a 2 chip system, `[2, 8x8]`, and view it as
+though the two chips are concatenated together side by side to form a single
+`8x16` grid.
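+
+> As a reading aid for the affine map in the attribute below (a hypothetical,
+> standalone sketch, not part of tt-mlir), the wrap-around mapping
+> `(d0, d1) -> ((d0 floordiv 8) * 2 + d1 floordiv 8, d0, d1 mod 8)` can be
+> evaluated with plain integer arithmetic:
+
+```cpp
+#include <cstdio>
+
+int main() {
+  // Logical core (3, 12) on the 8x16 logical grid.
+  long long d0 = 3, d1 = 12;
+  long long chip = (d0 / 8) * 2 + d1 / 8; // columns 8..15 wrap onto chip 1
+  long long y = d0;                       // physical row on that chip
+  long long x = d1 % 8;                   // physical column on that chip
+  std::printf("chip=%lld y=%lld x=%lld\n", chip, y, x); // chip=1 y=3 x=4
+  return 0;
+}
+```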
+
+```mlir
+#tt.device<#tt.grid<8x16, (d0, d1) -> ((d0 floordiv 8) * 2 + d1 floordiv 8, d0, d1 mod 8)>, [0, 1]>
+```
+
+Here we can see that the affine map encodes an indexing pattern such that when
+we extend past 8 cores in the second dimension, we wrap around to the next chip.
+
+Now we can consider some tensor that, importantly, has a grid of the same rank as
+the logical device grid:
+
+```mlir
+tensor<256x1024xf32,
+  #tt.layout<(d0, d1) -> (d0, d1),
+    undef,
+    <4x16>,
+    memref<2x2x!tt.tile<32 x 32, bfp_bf8>, #tt.memory_space<l1>>
+  >
+>
+```
+
+This single tensor maps trivially onto the logical grid, spanning the upper
+half. Decoupled from the tensor's layout, under the hood the tensor is actually
+physically spanning across two chips.
+
+### Data Parallel Over 2d and Batch
+
+The previous 2 examples can be composed together to form a logical grid that
+divides tensors across multiple dimensions. Here we will consider a 4 chip
+system `[4, 8x8]` and view it as a `2x8x16` grid.
+
+```mlir
+#tt.device<#tt.grid<2x8x16, (d0, d1, d2) -> (d0 * 2 + (d1 floordiv 8) * 2 + d2 floordiv 8, d1, d2 mod 8)>, [0, 1, 2, 3]>
+```
+
+We can evaluate the affine map to see that the chips are interpreted in chunks of
+two, where groups `[0, 1]` and `[2, 3]` each form 8x16 grids and these 2 groups
+concatenate to form a 2x8x16 grid.
+
+We can consider the following tensor to map onto this grid:
+
+```mlir
+tensor<64x256x1024xf32,
+  #tt.layout<(d0, d1, d2) -> (d0, d1, d2),
+    undef,
+    <2x4x16>,
+    memref<32x2x2x!tt.tile<32 x 32, bfp_bf8>, #tt.memory_space<l1>>
+  >
+>
+```
+
+### Pipeline Parallel
+
+Pipeline parallel in the scope of this spec isn't particularly interesting; it
+is intended to be used in conjunction with the `ttir.pipeline` operation which
+will group sections of the module's operations into groups to form pipeline regions
+and will be covered in a separate spec.
+
+What we can demonstrate here is how we can take multiple non-overlapping views
+of the system descriptor to form distinct virtual devices.
+
+Given an 8 chip system `[8, 8x8]`, we can form two virtual devices that each
+take 4 chips and interpret them differently (though they could take the same
+logical grid).
+
+```mlir
+#tt.device<#tt.grid<2x8x16, (d0, d1, d2) -> (d0 * 2 + (d1 floordiv 8) * 2 + d2 floordiv 8, d1, d2 mod 8)>, [0, 1, 2, 3]>
+#tt.device<#tt.grid<16x16, (d0, d1) -> ((d0 floordiv 8) * 2 + d1 floordiv 8, d0 mod 8, d1 mod 8)>, [4, 5, 6, 7]>
+```
+
+### Reinterpreted Grids (Transpose)
+
+One particularly interesting use case that logical grids could enable is to
+reinterpret the grid as a form of data layout optimization. For example, if we
+wanted to transpose a tensor, instead of having to move the data around to
+implement transpose, we could instead reinterpret the grid as being transposed,
+leveraging the fact that the relevant data is already located on the correct
+cores/chips.
+
+To keep things simple, let's consider a 1 chip system `[1, 8x8]`, but it's not
+too big a leap to see how this could map to multi-chip where the cost of moving
+data is even higher.
+
+Let's also consider a simple (totally contrived) eltwise unary graph:
+
+```python
+a = exp(a)
+aT = transpose(a)
+relu(aT)
+```
+
+1. We'll establish a regular, single chip, identity logical grid:
+```mlir
+#tt.device<#tt.grid<8x8, (d0, d1) -> (0, d0, d1)>, [0]>
+```
+2. Execute `exp`.
+3. We'll reinterpret the grid as transposed:
+```mlir
+#tt.device<#tt.grid<8x8, (d0, d1) -> (0, d1, d0)>, [0]>
+```
+4. _Execute_ `transpose`. Note that each core only needs to transpose its
+   data locally. Eventually this could be implemented as a no-op by reindexing
+   the tile visitation order of the successive operation.
+5. Execute `relu`.
+
+It's important to note that we effectively implemented transpose without moving
+data anywhere.
+
+### Reinterpreted Grids (Extra)
+
+For the sake of examples, here are a few more ways of reinterpreting the logical grid.
+
+#### Extra Wide Grid
+```mlir
+#tt.device<#tt.grid<1x64, (d0, d1) -> (0, d0 * 8 + d1 floordiv 8, d1 mod 8)>, [0]>
+```
+
+#### Extra Tall + Transposed Grid
+```mlir
+#tt.device<#tt.grid<64x1, (d0, d1) -> (0, d1 * 8 + d0 floordiv 8, d0 mod 8)>, [0]>
+```
+
+#### Staircase
+```mlir
+#tt.device<#tt.grid<8x8, (d0, d1) -> (0, d0, (d0 + d1) mod 8)>, [0]>
+```
+
+This could be an interesting starting position for data in implementing matmul as a
+systolic array in a ring topology.
+
+## Backend Lowering and Constraints
+
+While the above device attribute encoding is quite flexible, this does not
+necessarily mean the target backend can actually support all of these
+interpretations. The TTNN backend will be relatively constrained to support only
+the specialized grid topologies that are supported by the API.
+
+### TTNN
+
+TODO:
+
+- Multi-device
+- Grid orientation
+- Height / Width sharded
+- TTNN Generic
+
+### TTMetal
+
+In the TTMetal dialect we are only constrained by what we've implemented in the
+tt-mlir compiler; this means it is much more flexible and can theoretically
+support any of the grid interpretations above.
+
+## Test Plan
+
+- `test/python/device_attr.py` covers all of the examples above and asserts the
+  IR is correctly generated.
+- Additional functional unit tests will be added as op and runtime support is
+  added.
+
+## Concerns
+
+- `tt.device` is very flexible, but with this flexibility comes the potential
+  for misuse. It's important that the compiler is able to validate the legal
+  configurations of this attribute for the target backend.
diff --git a/docs/src/ttmlir-opt.md b/docs/src/ttmlir-opt.md
index fd58e57187..fe14d2ff46 100644
--- a/docs/src/ttmlir-opt.md
+++ b/docs/src/ttmlir-opt.md
@@ -2,12 +2,6 @@
 The `ttmlir` optimizer driver. This tool is used to run the `ttmlir` compiler
 passes on `.mlir` source files and is central to developing and testing the compiler.

-## Generate a flatbuffer file
-
-```bash
-./build/bin/ttmlir-opt --ttir-to-ttnn-backend-pipeline --ttnn-serialize-to-binary="output=out.ttnn" test/ttmlir/Dialect/TTNN/simple_multiply.mlir
-```
-
 ## Simple Test

 ```bash
diff --git a/docs/src/ttmlir-translate.md b/docs/src/ttmlir-translate.md
index fa9fd8287a..fa6f0bef50 100644
--- a/docs/src/ttmlir-translate.md
+++ b/docs/src/ttmlir-translate.md
@@ -11,6 +11,15 @@ The `ttmlir-translate` translation utility. Unlike `ttmlir-opt` tool which is us
Unlike `ttmlir-opt` tool which is us ./build/bin/ttmlir-translate -mlir-to-cpp c.mlir -allow-unregistered-dialect ``` +## Generate flatbuffer file from MLIR +```bash +# First run `ttmlir-opt` to convert to proper dialect +./build/bin/ttmlir-opt --ttir-to-ttnn-backend-pipeline test/ttmlir/Dialect/TTNN/simple_multiply.mlir -o ttnn.mlir + +# Now run `ttmlir-translate` to produce flatbuffer file +./build/bin/ttmlir-translate --ttnn-to-flatbuffer ttnn.mlir -o out.ttnn +``` + Bonus: These two commands can be piped, to avoid writing a `mlir` file to disk, like so: ```bash ./build/bin/ttmlir-opt --ttir-layout --ttnn-open-device --convert-ttir-to-ttnn --convert-ttnn-to-emitc test/ttmlir/Dialect/TTNN/simple_multiply.mlir | ./build/bin/ttmlir-translate -mlir-to-cpp -allow-unregistered-dialect diff --git a/docs/src/ttrt.md b/docs/src/ttrt.md index b062af9109..c9a0da3522 100644 --- a/docs/src/ttrt.md +++ b/docs/src/ttrt.md @@ -16,34 +16,71 @@ ttrt --help See the [ttmlir-opt](./ttmlir-opt.md) documentation for more information on how to generate a flatbuffer file. -## Read sections from the flatbuffer +## APIs +```bash +ttrt --help +``` +### read ```bash +ttrt read --help +ttrt read --section mlir out.ttnn +ttrt read --section cpp out.ttnn ttrt read --section version out.ttnn ttrt read --section system-desc out.ttnn -ttrt read out.ttnn # Dump the whole thing as json -ttrt read --help +ttrt read --section inputs out.ttnn +ttrt read --section outputs out.ttnn +ttrt read --section all out.ttnn +ttrt read --section all out.ttnn --clean-artifacts +ttrt read --section all out.ttnn --save-artifacts +ttrt read --section all /dir/of/flatbuffers ``` -## Query information about the current system +### run +Note: It's required to be on a system with silicon and to have a runtime enabled +build `-DTTMLIR_ENABLE_RUNTIME=ON`. + +```bash +ttrt run --help +ttrt run out.ttnn +ttrt run out.ttnn --clean-artifacts +ttrt run out.ttnn --save-artifacts +ttrt run out.ttnn --loops 10 +ttrt run --program-index all out.ttnn +ttrt run --program-index 0 out.ttnn +ttrt run /dir/of/flatbuffers +ttrt run /dir/of/flatbuffers --loops 10 +``` +### query Note: It's required to be on a system with silicon and to have a runtime enabled build `-DTTMLIR_ENABLE_RUNTIME=ON`. ```bash +ttrt query --help ttrt query --system-desc -ttrt query --save-system-desc n300.ttsys +ttrt query --system-desc-as-json +ttrt query --system-desc-as-dict +ttrt query --save-artifacts +ttrt query --clean-artifacts ``` -## Execute flatbuffer files - -Note: -- It's required to be on a system with silicon and to have a runtime enabled -build `-DTTMLIR_ENABLE_RUNTIME=ON`. -- It's required to have installed `torch` in your python environment. +### perf +Note: It's required to be on a system with silicon and to have a runtime enabled +build `-DTTMLIR_ENABLE_RUNTIME=ON`. Also need perf enabled build `-DTT_RUNTIME_ENABLE_PERF_TRACE=ON` with `export ENABLE_TRACY=1`. 
 ```bash
-ttrt run out.ttnn
+ttrt perf --help
+ttrt perf out.ttnn
+ttrt perf out.ttnn --clean-artifacts
+ttrt perf out.ttnn --save-artifacts
+ttrt perf out.ttnn --loops 10
+ttrt perf --program-index all out.ttnn
+ttrt perf --program-index 0 out.ttnn
+ttrt perf --device out.ttnn
+ttrt perf --generate-params --perf-csv trace.csv
+ttrt perf /dir/of/flatbuffers
+ttrt perf /dir/of/flatbuffers --loops 10
 ```

 ## ttrt is written as a python library, so it can be used in custom python scripts
@@ -54,3 +91,6 @@ import ttrt.binary
 fbb = ttrt.binary.load_from_path("out.ttnn")
 d = ttrt.binary.as_dict(fbb)
 ```
+
+## Bonus
+Artifacts are saved in the `ttrt-artifacts` directory if the `--save-artifacts` option is provided.
diff --git a/env/CMakeLists.txt b/env/CMakeLists.txt
index 6df12416aa..f19b60ee47 100644
--- a/env/CMakeLists.txt
+++ b/env/CMakeLists.txt
@@ -1,8 +1,8 @@
 cmake_minimum_required(VERSION 3.20.0)
 project(ttmlir-toolchain LANGUAGES CXX C)

-set(FLATBUFFERS_VERSION "v24.3.7")
-set(LLVM_PROJECT_VERSION "llvmorg-18.1.0")
+set(FLATBUFFERS_VERSION "fb9afbafc7dfe226b9db54d4923bfb8839635274")
+set(LLVM_PROJECT_VERSION "9ddfe62f5c11e3f65f444209f514029ded2d58b9")

 include(ExternalProject)
diff --git a/env/activate b/env/activate
index f21adff2fa..26de77394e 100644
--- a/env/activate
+++ b/env/activate
@@ -12,5 +12,7 @@ fi
 export TTMLIR_ENV_ACTIVATED=1
 export PATH=$TTMLIR_TOOLCHAIN_DIR/bin:$TTMLIR_TOOLCHAIN_DIR/venv/bin:$PATH
 export TT_METAL_HOME="$(pwd)/third_party/tt-metal/src/tt-metal"
-export PYTHONPATH="$(pwd)/build/python_packages:$(pwd)/.local/toolchain/python_packages/mlir_core"
+export TT_METAL_BUILD_HOME="$(pwd)/third_party/tt-metal/src/tt-metal-build"
+export TT_MLIR_HOME="$(pwd)"
+export PYTHONPATH="$(pwd)/build/python_packages:$(pwd)/.local/toolchain/python_packages/mlir_core:${TT_METAL_HOME}:${TT_METAL_HOME}/tt_eager:${TT_METAL_BUILD_HOME}/tools/profiler/bin"
 export ARCH_NAME="${ARCH_NAME:-wormhole_b0}"
diff --git a/include/ttmlir-c/TTAttrs.h b/include/ttmlir-c/TTAttrs.h
new file mode 100644
index 0000000000..750c201b09
--- /dev/null
+++ b/include/ttmlir-c/TTAttrs.h
@@ -0,0 +1,74 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTMLIR_C_TTATTRS_H
+#define TTMLIR_C_TTATTRS_H
+
+#include "mlir-c/AffineMap.h"
+#include "ttmlir-c/Dialects.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+MLIR_CAPI_EXPORTED MlirAttribute ttmlirTTGridAttrGet(MlirContext ctx,
+                                                     int64_t *shape,
+                                                     size_t shapeSize);
+
+MLIR_CAPI_EXPORTED MlirAttribute
+ttmlirTTChipCapabilityAttrGet(MlirContext ctx, uint32_t chipCapability);
+
+MLIR_CAPI_EXPORTED MlirAttribute ttmlirTTArchAttrGet(MlirContext ctx,
+                                                     uint32_t arch);
+
+MLIR_CAPI_EXPORTED MlirAttribute ttmlirTTChipDescAttrGet(
+    MlirContext ctx, MlirAttribute arch, int64_t *grid, size_t gridSize,
+    unsigned l1Size, unsigned numDramChannels, unsigned dramChannelSize,
+    unsigned nocL1AddressAlignBytes, unsigned pcieAddressAlignBytes,
+    unsigned nocDRAMAddressAlignBytes);
+
+MLIR_CAPI_EXPORTED MlirAttribute ttmlirTTChipCoordAttrGet(
+    MlirContext ctx, unsigned rack, unsigned shelf, unsigned y, unsigned x);
+
+MLIR_CAPI_EXPORTED MlirAttribute ttmlirTTChipChannelAttrGet(MlirContext ctx,
+                                                            unsigned endpoint0,
+                                                            unsigned endpoint1);
+
+MLIR_CAPI_EXPORTED MlirAttribute ttmlirTTSystemDescAttrGet(
+    MlirContext ctx, MlirAttribute *chipDescs, size_t chipDescsSize,
+    unsigned *chipDescIndices, size_t chipDescIndicesSize,
+    MlirAttribute *chipCapabilities, size_t chipCapabilitiesSize,
+    MlirAttribute *chipCoords, size_t chipCoordsSize,
+    MlirAttribute *chipChannels, size_t chipChannelsSize);
+
+MLIR_CAPI_EXPORTED MlirAttribute ttmlirTTLayoutAttrGet(MlirContext ctx,
+                                                       MlirAffineMap linear,
+                                                       unsigned oobVal,
+                                                       MlirAttribute grid,
+                                                       MlirType memref);
+
+MLIR_CAPI_EXPORTED MlirAttribute
+ttmlirTTMemorySpaceAttrGet(MlirContext ctx, uint32_t memorySpace);
+
+MLIR_CAPI_EXPORTED MlirAttribute ttmlirTTOOBValAttrGet(MlirContext ctx,
+                                                       uint32_t oobVal);
+
+MLIR_CAPI_EXPORTED MlirAttribute
+ttmlirTTIteratorTypeAttrGet(MlirContext ctx, uint32_t iteratorType);
+
+MLIR_CAPI_EXPORTED MlirAttribute ttmlirTTIteratorTypeArrayAttrGet(
+    MlirContext ctx, uint32_t *iteratorTypes, size_t iteratorTypesSize);
+
+MLIR_CAPI_EXPORTED MlirAttribute
+ttmlirTTOperandConstraintAttrGet(MlirContext ctx, uint32_t OperandConstraint);
+
+MLIR_CAPI_EXPORTED MlirAttribute ttmlirTTOperandConstraintArrayAttrGet(
+    MlirContext ctx, uint32_t *OperandConstraints,
+    size_t OperandConstraintsSize);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // TTMLIR_C_TTATTRS_H
diff --git a/include/ttmlir-c/TTTypes.h b/include/ttmlir-c/TTTypes.h
new file mode 100644
index 0000000000..8038a2d465
--- /dev/null
+++ b/include/ttmlir-c/TTTypes.h
@@ -0,0 +1,25 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTMLIR_C_TTTYPES_H
+#define TTMLIR_C_TTTYPES_H
+
+#include "ttmlir-c/Dialects.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+MLIR_CAPI_EXPORTED MlirType ttmlirTTTileTypeGet(MlirContext ctx,
+                                                unsigned height, unsigned width,
+                                                uint32_t dataType);
+
+MLIR_CAPI_EXPORTED MlirType ttmlirTTDeviceTypeGet(MlirContext ctx,
+                                                  MlirAttribute deviceAttr);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // TTMLIR_C_TTTYPES_H
diff --git a/include/ttmlir/Conversion/Passes.h b/include/ttmlir/Conversion/Passes.h
index 9ed050d512..bd4ee2753b 100644
--- a/include/ttmlir/Conversion/Passes.h
+++ b/include/ttmlir/Conversion/Passes.h
@@ -5,7 +5,10 @@
 #ifndef TTMLIR_CONVERSION_PASSES_H
 #define TTMLIR_CONVERSION_PASSES_H

+#include "ttmlir/Conversion/TTIRToTTNN/TTIRToTTNN.h"
 #include "ttmlir/Conversion/TTNNToEmitC/TTNNToEmitC.h"
+#include "ttmlir/Conversion/TosaToTTIR/TosaToTTIR.h"
+#include "ttmlir/Dialect/TTIR/IR/TTIR.h"
 #include "ttmlir/Dialect/TTNN/IR/TTNN.h"
 #include "mlir/Dialect/EmitC/IR/EmitC.h"
diff --git a/include/ttmlir/Conversion/Passes.td b/include/ttmlir/Conversion/Passes.td
index 3d8e3bdcb8..42f2b267db 100644
--- a/include/ttmlir/Conversion/Passes.td
+++ b/include/ttmlir/Conversion/Passes.td
@@ -7,7 +7,19 @@

 include "mlir/Pass/PassBase.td"

-def ConvertTTNNToEmitC : Pass<"convert-ttnn-to-emitc", "::mlir::func::FuncOp"> {
+def ConvertTosaToTTIR : Pass<"convert-tosa-to-ttir", "::mlir::ModuleOp"> {
+  let summary = "Convert TOSA dialect to TTIR dialect.";
+  let constructor = "createConvertTosaToTTIRPass()";
+  let dependentDialects = ["mlir::tt::ttir::TTIRDialect"];
+}
+
+def ConvertTTIRToTTNN: Pass<"convert-ttir-to-ttnn", "::mlir::ModuleOp"> {
+  let summary = "Convert TTIR dialect to TTNN dialect.";
+  let constructor = "createConvertTTIRToTTNNPass()";
+  let dependentDialects = ["mlir::tt::ttir::TTIRDialect", "mlir::tt::ttnn::TTNNDialect"];
+}
+
+def ConvertTTNNToEmitC : Pass<"convert-ttnn-to-emitc", "::mlir::ModuleOp"> {
   let summary = "Convert TTNN dialect to EmitC dialect.";
   let constructor = "createConvertTTNNToEmitCPass()";
   let dependentDialects = ["mlir::emitc::EmitCDialect", "mlir::tt::ttnn::TTNNDialect"];
diff --git a/include/ttmlir/Conversion/TTIRToTTNN/TTIRToTTNN.h b/include/ttmlir/Conversion/TTIRToTTNN/TTIRToTTNN.h
new file mode 100644
index 0000000000..5163188047
--- /dev/null
+++ b/include/ttmlir/Conversion/TTIRToTTNN/TTIRToTTNN.h
@@ -0,0 +1,20 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTMLIR_CONVERSION_TTIRTOTTNN_TTIRTOTTNN_H
+#define TTMLIR_CONVERSION_TTIRTOTTNN_TTIRTOTTNN_H
+
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/DialectConversion.h"
+
+namespace mlir::tt {
+
+void populateTTIRToTTNNPatterns(MLIRContext *ctx, RewritePatternSet &patterns,
+                                TypeConverter &typeConverter);
+
+std::unique_ptr<OperationPass<ModuleOp>> createConvertTTIRToTTNNPass();
+
+} // namespace mlir::tt
+
+#endif // TTMLIR_CONVERSION_TTIRTOTTNN_TTIRTOTTNN_H
diff --git a/include/ttmlir/Conversion/TTNNToEmitC/TTNNToEmitC.h b/include/ttmlir/Conversion/TTNNToEmitC/TTNNToEmitC.h
index 8211d65ee2..9d412144ea 100644
--- a/include/ttmlir/Conversion/TTNNToEmitC/TTNNToEmitC.h
+++ b/include/ttmlir/Conversion/TTNNToEmitC/TTNNToEmitC.h
@@ -5,13 +5,15 @@
 #ifndef TTMLIR_CONVERSION_TTNNTOEMITC_TTNNTOEMITC_H
 #define TTMLIR_CONVERSION_TTNNTOEMITC_TTNNTOEMITC_H

-#include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/IR/BuiltinOps.h"
 #include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/DialectConversion.h"

 namespace mlir::tt {

-std::unique_ptr<OperationPass<func::FuncOp>> createConvertTTNNToEmitCPass();
+void populateTTNNToEmitCPatterns(MLIRContext *ctx, RewritePatternSet &patterns,
+                                 TypeConverter &typeConverter);
+
+std::unique_ptr<OperationPass<ModuleOp>> createConvertTTNNToEmitCPass();

 } // namespace mlir::tt
diff --git a/include/ttmlir/Conversion/TosaToTTIR/TosaToTTIR.h b/include/ttmlir/Conversion/TosaToTTIR/TosaToTTIR.h
new file mode 100644
index 0000000000..acd5373c90
--- /dev/null
+++ b/include/ttmlir/Conversion/TosaToTTIR/TosaToTTIR.h
@@ -0,0 +1,17 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTMLIR_CONVERSION_TOSATOTTIR_TOSATOTTIR_H
+#define TTMLIR_CONVERSION_TOSATOTTIR_TOSATOTTIR_H
+
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/Pass/Pass.h"
+
+namespace mlir::tt {
+
+std::unique_ptr<OperationPass<ModuleOp>> createConvertTosaToTTIRPass();
+
+} // namespace mlir::tt
+
+#endif
diff --git a/include/ttmlir/Dialect/TT/IR/TTOpsTypes.h b/include/ttmlir/Dialect/TT/IR/TTOpsTypes.h
index 4ea14969f6..5b90a9a742 100644
--- a/include/ttmlir/Dialect/TT/IR/TTOpsTypes.h
+++ b/include/ttmlir/Dialect/TT/IR/TTOpsTypes.h
@@ -28,4 +28,8 @@ inline bool isDeviceMemorySpace(MemorySpace memorySpace) {
 #define GET_TYPEDEF_CLASSES
 #include "ttmlir/Dialect/TT/IR/TTOpsTypes.h.inc"

+namespace mlir::tt {
+DeviceAttr getCurrentScopeDevice(Operation *op);
+} // namespace mlir::tt
+
 #endif
diff --git a/include/ttmlir/Dialect/TT/IR/TTOpsTypes.td b/include/ttmlir/Dialect/TT/IR/TTOpsTypes.td
index f2f0d52ce9..27a64d7861 100644
--- a/include/ttmlir/Dialect/TT/IR/TTOpsTypes.td
+++ b/include/ttmlir/Dialect/TT/IR/TTOpsTypes.td
@@ -33,8 +33,8 @@ def TT_GridAttr : TT_Attr<"Grid", "grid"> {
   let parameters = (ins
     ArrayRefParameter<"int64_t">:$shape,
     DefaultValuedParameter<
      "AffineMap",
-      "$_builder.getEmptyAffineMap()">:$physical_grid_mapping);
-  let assemblyFormat = "`<` custom<DimensionList>($shape) (`,` $physical_grid_mapping^)? `>`";
+      "$_builder.getEmptyAffineMap()">:$mapping);
+  let assemblyFormat = "`<` custom<DimensionList>($shape) (`,` $mapping^)? `>`";

   let extraClassDeclaration = [{
     static GridAttr get(::mlir::MLIRContext *context) {
@@ -61,8 +61,8 @@ def TT_ChipDescAttr : TT_Attr<"ChipDesc", "chip_desc"> {
     TT chip_desc attribute
   }];

-  let parameters = (ins "ArchAttr":$arch, TT_GridAttr:$grid, "unsigned":$l1Size, "unsigned":$numDramChannels, "unsigned":$dramChannelSize, "unsigned":$nocL1AddressAlignBytes, "unsigned":$pcieAddressAlignBytes, "unsigned":$nocDRAMAddressAlignBytes);
-  let assemblyFormat = "`{` `arch` `=` $arch `,` `grid` `=` $grid `,` `l1_size` `=` $l1Size `,` `num_dram_channels` `=` $numDramChannels `,` `dram_channel_size` `=` $dramChannelSize `,` `noc_l1_address_align_bytes` `=` $nocL1AddressAlignBytes `,` `pcie_address_align_bytes` `=` $pcieAddressAlignBytes `,` `noc_dram_address_align_bytes` `=` $nocDRAMAddressAlignBytes `}`";
+  let parameters = (ins "ArchAttr":$arch, ArrayRefParameter<"int64_t">:$grid, "unsigned":$l1Size, "unsigned":$numDramChannels, "unsigned":$dramChannelSize, "unsigned":$nocL1AddressAlignBytes, "unsigned":$pcieAddressAlignBytes, "unsigned":$nocDRAMAddressAlignBytes);
+  let assemblyFormat = "`{` `arch` `=` $arch `,` `grid` `=` custom<DimensionList>($grid) `,` `l1_size` `=` $l1Size `,` `num_dram_channels` `=` $numDramChannels `,` `dram_channel_size` `=` $dramChannelSize `,` `noc_l1_address_align_bytes` `=` $nocL1AddressAlignBytes `,` `pcie_address_align_bytes` `=` $pcieAddressAlignBytes `,` `noc_dram_address_align_bytes` `=` $nocDRAMAddressAlignBytes `}`";
 }

 def TT_ChipCoordAttr : TT_Attr<"ChipCoord", "chip_coord"> {
@@ -100,6 +100,7 @@ def TT_SystemDescAttr : TT_Attr<"SystemDesc", "system_desc"> {

   let extraClassDeclaration = [{
     static tt::SystemDescAttr getDefault(MLIRContext *context);
+    static tt::SystemDescAttr getFromPath(MLIRContext *context, std::string& path);
   }];
 }

@@ -201,6 +202,24 @@ def TT_LayoutAttr : TT_Attr<"Layout", "layout"> {
   }];
 }

+def TT_DeviceAttr : TT_Attr<"Device", "device", []> {
+  let summary = "Device attribute in TT dialect";
+  let description = [{
+  }];
+  let parameters = (ins TT_GridAttr:$grid, ArrayRefParameter<"unsigned">:$chipIds);
+  let assemblyFormat = "`<` qualified($grid) `,` `[` $chipIds `]` `>`";
+
+  let extraClassDeclaration = [{
+    static DeviceAttr get(::mlir::MLIRContext *context, ArrayRef<int64_t> shape, AffineMap physicalGridMapping, ArrayRef<unsigned> chipIds) {
+      return DeviceAttr::get(context, GridAttr::get(context, shape, physicalGridMapping), chipIds);
+    }
+    static DeviceAttr get(::mlir::MLIRContext *context, SystemDescAttr systemDesc, ArrayRef<unsigned> chipIds);
+    static DeviceAttr get(::mlir::MLIRContext *context, SystemDescAttr systemDesc);
+  }];
+
+  let genVerifyDecl = 1;
+}
+
 def TT_MemorySpaceAttr : EnumAttr<TT_Dialect, TT_MemorySpace, "memory_space"> {
   let assemblyFormat = "`<` $value `>`";
 }
@@ -239,14 +258,15 @@ def TT_Tile : TT_Type<"Tile", "tile", [MemRefElementTypeInterface]> {
   let extraClassDeclaration = [{
     SmallVector<int64_t> getScalarShape(SmallVector<int64_t> tiledShape) const;
     SmallVector<int64_t> getTiledShape(SmallVector<int64_t> scalarShape) const;
+    uint64_t getSizeBytes() const;
   }];
 }

 def TT_Device : TT_Type<"Device", "device", []> {
   let summary = "TT device";
   let description = "Device type in TT dialect";
-  let parameters = (ins TT_GridAttr:$mesh, ArrayRefParameter<"unsigned">:$chipIds);
-  let assemblyFormat = "`<` qualified($mesh) `,` `[` $chipIds `]` `>`";
+  let parameters = (ins TT_DeviceAttr:$desc);
+  let assemblyFormat = "`<` $desc `>`";
 }

 #endif
diff --git a/include/ttmlir/Dialect/TTIR/Analysis/GridAnalysis.h b/include/ttmlir/Dialect/TTIR/Analysis/GridAnalysis.h
deleted file mode 100644
index 6a306d4de7..0000000000
--- a/include/ttmlir/Dialect/TTIR/Analysis/GridAnalysis.h
+++ /dev/null
@@ -1,58 +0,0 @@
-// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
-//
-// SPDX-License-Identifier: Apache-2.0
-
-#ifndef TTMLIR_DIALECT_TTIR_ANALYSIS_GRIDANALYSIS_H
-#define TTMLIR_DIALECT_TTIR_ANALYSIS_GRIDANALYSIS_H
-
-#include "ttmlir/Dialect/TTIR/Analysis/TTIRAnalysis.h"
-#include "llvm/ADT/StringMap.h"
-
-namespace mlir::tt::ttir {
-
-struct GridAnalysisResult {
-  int target_rows = 1;
-  int target_columns = 1;
-};
-
-struct GridAnalysisInput {
-  int max_supported_rows;
-  int max_supported_columns;
-  llvm::StringMap<SmallVector<int64_t, 2>> *grid_size_overrides;
-
-  GridAnalysisInput()
-      : max_supported_rows(1), max_supported_columns(1),
-        grid_size_overrides(nullptr) {}
-
-  GridAnalysisInput(int max_supported_rows, int max_supported_columns,
-                    llvm::StringMap<SmallVector<int64_t, 2>> *grid_size_overrides)
-      : max_supported_rows(max_supported_rows),
-        max_supported_columns(max_supported_columns),
-        grid_size_overrides(grid_size_overrides) {}
-
-  bool operator==(const GridAnalysisInput &rhs) const {
-    return max_supported_rows == rhs.max_supported_rows &&
-           max_supported_columns == rhs.max_supported_columns &&
-           grid_size_overrides == rhs.grid_size_overrides;
-  }
-
-  bool operator!=(const GridAnalysisInput &rhs) const {
-    return !(*this == rhs);
-  }
-};
-
-// Determine target grid size for each op.
-//
-class GridAnalysis
-    : public TTIRAnalysis<GridAnalysisInput, GridAnalysisResult> {
-
-private:
-  void analysisImplementation() override;
-  bool applyOverrides() override;
-
-public:
-  GridAnalysis(Operation *op) : TTIRAnalysis(op) {}
-};
-} // namespace mlir::tt::ttir
-
-#endif
diff --git a/include/ttmlir/Dialect/TTIR/Analysis/LegalGridAnalysis.h b/include/ttmlir/Dialect/TTIR/Analysis/LegalGridAnalysis.h
new file mode 100644
index 0000000000..b2ffede5ef
--- /dev/null
+++ b/include/ttmlir/Dialect/TTIR/Analysis/LegalGridAnalysis.h
@@ -0,0 +1,53 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTMLIR_DIALECT_TTIR_ANALYSIS_LEGALGRIDANALYSIS_H
+#define TTMLIR_DIALECT_TTIR_ANALYSIS_LEGALGRIDANALYSIS_H
+
+#include "ttmlir/Dialect/TT/IR/TTOpsTypes.h"
+#include "ttmlir/Dialect/TTIR/Analysis/TTIRAnalysis.h"
+#include "llvm/ADT/StringMap.h"
+
+namespace mlir::tt::ttir {
+
+struct LegalGridAnalysisInput {
+  ChipDescAttr chipDesc;
+  GridAttr maxGrid;
+  RankedTensorType tensorType;
+  llvm::StringMap<SmallVector<int64_t, 2>> *gridSizeOverrides;
+
+  LegalGridAnalysisInput()
+      : chipDesc(nullptr), maxGrid(nullptr), tensorType(nullptr),
+        gridSizeOverrides(nullptr) {}
+
+  LegalGridAnalysisInput(
+      ChipDescAttr chipDesc, GridAttr maxGrid, RankedTensorType tensorType,
+      llvm::StringMap<SmallVector<int64_t, 2>> *gridSizeOverrides)
+      : chipDesc(chipDesc), maxGrid(maxGrid), tensorType(tensorType),
+        gridSizeOverrides(gridSizeOverrides) {}
+
+  bool operator==(const LegalGridAnalysisInput &rhs) const {
+    return chipDesc == rhs.chipDesc && maxGrid == rhs.maxGrid &&
+           tensorType == rhs.tensorType &&
+           gridSizeOverrides == rhs.gridSizeOverrides;
+  }
+
+  bool operator!=(const LegalGridAnalysisInput &rhs) const {
+    return !(*this == rhs);
+  }
+};
+
+class LegalGridAnalysis
+    : public TTIRAnalysis<LegalGridAnalysisInput, std::vector<GridAttr>> {
+private:
+  void analysisImplementation() override;
+  bool applyOverrides() override;
+
+public:
+  LegalGridAnalysis(Operation *op) : TTIRAnalysis(op) {}
+};
+
+} // namespace mlir::tt::ttir
+
+#endif // TTMLIR_DIALECT_TTIR_ANALYSIS_LEGALGRIDANALYSIS_H
diff --git a/include/ttmlir/Dialect/TTIR/Analysis/OptimalTargetGridAnalysis.h b/include/ttmlir/Dialect/TTIR/Analysis/OptimalTargetGridAnalysis.h
new file mode 100644
index 0000000000..cfba2ca039
--- /dev/null
+++ b/include/ttmlir/Dialect/TTIR/Analysis/OptimalTargetGridAnalysis.h
@@ -0,0 +1,46 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTMLIR_DIALECT_TTIR_ANALYSIS_OPTIMALTARGETGRIDANALYSIS_H
+#define TTMLIR_DIALECT_TTIR_ANALYSIS_OPTIMALTARGETGRIDANALYSIS_H
+
+#include "ttmlir/Dialect/TT/IR/TTOpsTypes.h"
+#include "ttmlir/Dialect/TTIR/Analysis/TTIRAnalysis.h"
+
+namespace mlir::tt::ttir {
+
+struct OptimalTargetGridAnalysisInput {
+  llvm::DenseMap<Operation *, std::vector<GridAttr>> legalGrids;
+
+  OptimalTargetGridAnalysisInput() : legalGrids() {}
+
+  OptimalTargetGridAnalysisInput(
+      const llvm::DenseMap<Operation *, std::vector<GridAttr>> &&legalGrids)
+      : legalGrids(std::move(legalGrids)) {}
+
+  bool operator==(const OptimalTargetGridAnalysisInput &rhs) const {
+    return legalGrids == rhs.legalGrids;
+  }
+
+  bool operator!=(const OptimalTargetGridAnalysisInput &rhs) const {
+    return !(*this == rhs);
+  }
+};
+
+// Determine optimal target grid size for each op.
+//
+class OptimalTargetGridAnalysis
+    : public TTIRAnalysis<OptimalTargetGridAnalysisInput,
+                          llvm::DenseMap<Operation *, GridAttr>> {
+
+private:
+  void analysisImplementation() override;
+  bool applyOverrides() override;
+
+public:
+  OptimalTargetGridAnalysis(Operation *op) : TTIRAnalysis(op) {}
+};
+} // namespace mlir::tt::ttir
+
+#endif // TTMLIR_DIALECT_TTIR_ANALYSIS_OPTIMALTARGETGRIDANALYSIS_H
diff --git a/include/ttmlir/Dialect/TTIR/Analysis/TTIRAnalysis.h b/include/ttmlir/Dialect/TTIR/Analysis/TTIRAnalysis.h
index 95180f935b..1c0bb13f47 100644
--- a/include/ttmlir/Dialect/TTIR/Analysis/TTIRAnalysis.h
+++ b/include/ttmlir/Dialect/TTIR/Analysis/TTIRAnalysis.h
@@ -13,9 +13,9 @@ namespace mlir::tt::ttir {
 template <typename I, typename R> class TTIRAnalysis {
 protected:
   Operation *op;
-  bool is_valid = false;
-  R analysis_result;
-  I analysis_input;
+  bool isValid = false;
+  R analysisResult;
+  I analysisInput;

   TTIRAnalysis(Operation *op) : op(op) {}

@@ -38,9 +38,9 @@ template <typename I, typename R> class TTIRAnalysis {
   void init(const I &input) {
     // Analysis can be cached and reused. Check that input remained the same.
     //
-    if (analysis_input != input) {
-      analysis_input = input;
-      is_valid = false;
+    if (analysisInput != input) {
+      analysisInput = input;
+      isValid = false;
     }
   }

@@ -48,7 +48,7 @@ template <typename I, typename R> class TTIRAnalysis {
   //
   const R &getResult() {
     runAnalysis();
-    return analysis_result;
+    return analysisResult;
   }

private:
@@ -57,16 +57,16 @@ template <typename I, typename R> class TTIRAnalysis {
   void runAnalysis() {
     // Skip the analysis if it was already run and input params haven't changed.
     //
-    if (!is_valid) {
+    if (!isValid) {
       // Apply overrides if needed.
       //
-      bool skip_analysis = applyOverrides();
+      bool skipAnalysis = applyOverrides();

-      if (!skip_analysis) {
+      if (!skipAnalysis) {
         analysisImplementation();
       }

-      is_valid = true;
+      isValid = true;
     }
   }
 };
diff --git a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td
index 69e329bc07..99cae3ed29 100644
--- a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td
+++ b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td
@@ -60,10 +60,10 @@ def TTIR_GenericOp : TTIR_DPSOp<"generic", [AttrSizedOperandSegments]> {
   let regions = (region AnyRegion:$region);
 }

-def TTIR_LayoutOp : TTIR_Op<"layout", [DestinationStyleOpInterface]> {
-  let summary = "Layout op.";
+def TTIR_ToLayoutOp : TTIR_Op<"to_layout", [DestinationStyleOpInterface]> {
+  let summary = "ToLayout op.";
   let description = [{
-    Layout operation, transition tensors from one layout to another. Some examples include:
+    ToLayout operation, transition tensors from one layout to another. Some examples include:
      - Transitioning between different memory spaces, e.g. DRAM to L1.
      - Transitioning between different data types, e.g. f32 to f16.
      - Transitioning between different tile sizes, e.g. 1x16 to 32x32
@@ -73,7 +73,7 @@ def TTIR_LayoutOp : TTIR_Op<"layout", [DestinationStyleOpInterface]> {
     ```llvm
     #layout = #tt.layout<8192x128x1, undef, <1x1>, memref<64x128xf32, #system>>
     #layout1 = #tt.layout<8192x128x1, undef, <1x1>, memref<64x128xf32, #l1_>>
-    %1 = "ttir.layout"(%arg0, %0) : (tensor<64x128xf32, #layout>, tensor<64x128xf32, #layout1>) -> tensor<64x128xf32, #layout1>
+    %1 = "ttir.to_layout"(%arg0, %0) : (tensor<64x128xf32, #layout>, tensor<64x128xf32, #layout1>) -> tensor<64x128xf32, #layout1>
     ```
   }];

@@ -217,6 +217,13 @@ def TTIR_SumOp : TTIR_ReductionOp<"sum"> {
   }];
 }

+def TTIR_MeanOp : TTIR_ReductionOp<"mean"> {
+  let summary = "Mean reduction op.";
+  let description = [{
+    Mean reduction op.
+  }];
+}
+
 def TTIR_SoftmaxOp : TTIR_DPSOp<"softmax"> {
   let summary = "Softmax operation.";
   let description = [{
@@ -225,7 +232,7 @@ def TTIR_SoftmaxOp : TTIR_DPSOp<"softmax"> {

   let arguments = (ins AnyRankedTensor:$input,
                        AnyRankedTensor:$output,
-                       I32Attr:$dimension,
+                       SI32Attr:$dimension,
                        TT_OperandConstraintArrayAttr:$operand_constraints);

   let results = (outs AnyRankedTensor:$result);
diff --git a/include/ttmlir/Dialect/TTIR/IR/TTIROpsInterfaces.h b/include/ttmlir/Dialect/TTIR/IR/TTIROpsInterfaces.h
index aad064211f..1d88e8a657 100644
--- a/include/ttmlir/Dialect/TTIR/IR/TTIROpsInterfaces.h
+++ b/include/ttmlir/Dialect/TTIR/IR/TTIROpsInterfaces.h
@@ -5,6 +5,7 @@
 #ifndef TTMLIR_DIALECT_TTIR_IR_TTIROPSINTERFACES_H
 #define TTMLIR_DIALECT_TTIR_IR_TTIROPSINTERFACES_H

+#include "ttmlir/Dialect/TT/IR/TTOpsTypes.h"
 #include "ttmlir/Dialect/TTIR/IR/TTIR.h"

 namespace mlir {
diff --git a/include/ttmlir/Dialect/TTIR/IR/TTIROpsInterfaces.td b/include/ttmlir/Dialect/TTIR/IR/TTIROpsInterfaces.td
index 6b6d2dda1c..0c24b685ad 100644
--- a/include/ttmlir/Dialect/TTIR/IR/TTIROpsInterfaces.td
+++ b/include/ttmlir/Dialect/TTIR/IR/TTIROpsInterfaces.td
@@ -8,7 +8,7 @@
 include "mlir/IR/OpBase.td"
 include "ttmlir/Dialect/TT/IR/TTOpsTypes.td"

-def TTIROpInterface : OpInterface<"TTIROpInterface"> {
+def TTIROpInterface : OpInterface<"TTIROp"> {
   let cppNamespace = "::mlir::tt::ttir";
   let methods = [
     InterfaceMethod<
@@ -21,6 +21,16 @@ def TTIROpInterface : OpInterface<"TTIROp"> {
       /*methodBody=*/"",
      /*defaultImplementation=*/""
     >,
+    InterfaceMethod<
+      /*desc=*/[{
+        Get the device of the current scope.
+      }],
+      /*retTy=*/"::mlir::tt::DeviceAttr",
+      /*methodName=*/"getDevice",
+      /*args=*/(ins),
+      /*methodBody=*/"",
+      /*defaultImplementation=*/"return ::mlir::tt::getCurrentScopeDevice($_op);"
+    >,
   ];
 }
diff --git a/include/ttmlir/Dialect/TTIR/Transforms/Passes.td b/include/ttmlir/Dialect/TTIR/Transforms/Passes.td
index 987ce57c16..8c1391af64 100644
--- a/include/ttmlir/Dialect/TTIR/Transforms/Passes.td
+++ b/include/ttmlir/Dialect/TTIR/Transforms/Passes.td
@@ -7,10 +7,11 @@

 include "mlir/Pass/PassBase.td"

-def ConvertTosaToTTIR: Pass<"convert-tosa-to-ttir", "::mlir::ModuleOp"> {
-  let summary = "";
+def TTIRImplicitDevice: Pass<"ttir-implicit-device", "::mlir::ModuleOp"> {
+  let summary = "Create an implicit device";
   let description = [{
-    Convert TOSA ops to TTIR ops.
+    This pass will take a view of the system descriptor and create an implicit
+    device around it.
   }];
 }

@@ -50,10 +51,21 @@ def TTIRGridSet: Pass<"ttir-grid-set", "::mlir::ModuleOp"> {
   }];
   let options = [
     Option<"overrideGridSizes", "override-grid-sizes",
-           "llvm::StringMap>",
-           /*default=*/"llvm::StringMap>()",
+           "llvm::StringMap>",
+           /*default=*/"llvm::StringMap>()",
            "Override grid sizes for specific ops.">,
   ];
 }

+def TTIRLoadSystemDesc: Pass<"ttir-load-system-desc", "::mlir::ModuleOp"> {
+  let summary = "Load system desc.";
+  let description = [{
+    Load system descriptor as a compiler pass.
+  }];
+
+  list