-
Notifications
You must be signed in to change notification settings - Fork 11.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] Introduce a pseudo mnemonic for S_DELAY_ALU in MIR. #96004
Conversation
@llvm/pr-subscribers-backend-amdgpu Author: Michael Bedy (mjbedy) Changes: Patch is 22.16 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/96004.diff 5 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp
index 6ec4178053b20..7bca5b3bb9ef6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp
@@ -17,6 +17,160 @@
using namespace llvm;
+/// Print the immediate operand \p Imm of \p MI, using a target specific
+/// mnemonic where one is defined and the generic MIRFormatter output
+/// otherwise.
+///
+/// \param OS    Stream that receives the text.
+/// \param MI    Instruction that owns the immediate; its opcode selects the
+///              printing scheme.
+/// \param OpIdx Index of the operand within \p MI, or std::nullopt when the
+///              caller does not know it.
+/// \param Imm   Immediate value to print.
+void AMDGPUMIRFormatter::printImm(raw_ostream &OS, const MachineInstr &MI,
+                                  std::optional<unsigned int> OpIdx,
+                                  int64_t Imm) const {
+  switch (MI.getOpcode()) {
+  case AMDGPU::S_DELAY_ALU:
+    // An unknown operand index is a legal input, so only a *known* index
+    // other than 0 indicates a caller bug.
+    assert((!OpIdx || *OpIdx == 0) &&
+           "S_DELAY_ALU immediate must be operand 0");
+    printSDelayAluImm(Imm, OS);
+    break;
+  default:
+    MIRFormatter::printImm(OS, MI, OpIdx, Imm);
+    break;
+  }
+}
+
+/// Implement target specific parsing of immediate mnemonics. The mnemonic is
+/// dot separated strings.
+///
+/// \param OpCode        Opcode of the instruction being parsed; selects the
+///                      mnemonic grammar.
+/// \param OpIdx         Index of the operand the mnemonic belongs to.
+/// \param Src           Text of the mnemonic.
+/// \param Imm           Receives the decoded immediate on success.
+/// \param ErrorCallback Invoked with a location and message on failure.
+/// \returns false when \p Src was decoded into \p Imm, true on error.
+bool AMDGPUMIRFormatter::parseImmMnemonic(const unsigned OpCode,
+                                          const unsigned OpIdx,
+                                          StringRef Src, int64_t &Imm,
+                                          ErrorCallbackType ErrorCallback) const {
+  switch (OpCode) {
+  case AMDGPU::S_DELAY_ALU:
+    return parseSDelayAluImmMnemonic(OpIdx, Imm, Src, ErrorCallback);
+  default:
+    break;
+  }
+
+  // No mnemonic grammar is defined for this opcode. Emit a diagnostic so the
+  // failure is not silent, and keep reporting it as an error to the caller.
+  ErrorCallback(Src.begin(), "Unsupported immediate mnemonic");
+  return true;
+}
+
+/// Write the textual mnemonic for an s_delay_alu immediate to \p OS.
+///
+/// The emitted form is
+///   .id0_<dep>[_skip_<count>_id1_<dep>]
+/// where the bracketed tail is left out when both the skip field and the
+/// second dependency field are zero.
+///
+/// \param Imm Encoded immediate; only bits [10:0] are inspected.
+/// \param OS  Stream that receives the mnemonic.
+void AMDGPUMIRFormatter::printSDelayAluImm(int64_t Imm,
+                                           llvm::raw_ostream &OS) const {
+  // Field layout: bits [3:0] first dependency, bits [6:4] skip count,
+  // bits [10:7] second dependency.
+  const uint64_t FirstDep = Imm & 0xF;
+  const uint64_t SkipCount = (Imm >> 4) & 0x7;
+  const uint64_t SecondDep = (Imm >> 7) & 0xF;
+
+  // Spell one 4-bit dependency field: 0 is NONE, 1-4 are VALU_DEP_n,
+  // 5-7 are TRANS32_DEP_(n-4), and everything above is SALU_CYCLE_(n-8).
+  const auto PrintDep = [&OS](uint64_t Dep) {
+    if (Dep == 0) {
+      OS << "NONE";
+      return;
+    }
+    if (Dep < 5) {
+      OS << "VALU_DEP_" << Dep;
+      return;
+    }
+    if (Dep < 8) {
+      OS << "TRANS32_DEP_" << Dep - 4;
+      return;
+    }
+    OS << "SALU_CYCLE_" << Dep - 8;
+  };
+
+  OS << ".id0_";
+  PrintDep(FirstDep);
+
+  // A second specification of "same instruction, no dependency" carries no
+  // information, so the short form is enough.
+  if (SkipCount == 0 && SecondDep == 0)
+    return;
+
+  // Long form: skip count followed by the second dependency.
+  OS << "_skip_";
+  switch (SkipCount) {
+  case 0:
+    OS << "SAME";
+    break;
+  case 1:
+    OS << "NEXT";
+    break;
+  default:
+    OS << "SKIP_" << SkipCount - 1;
+    break;
+  }
+  OS << "_id1_";
+  PrintDep(SecondDep);
+}
+
+/// Parse the pseudo mnemonic of an s_delay_alu immediate into \p Imm.
+///
+/// Accepted syntax, mirroring what printSDelayAluImm emits:
+///   .id0_<dep>[_skip_<count>_id1_<dep>]
+///   <dep>   := NONE | VALU_DEP_<1-4> | TRANS32_DEP_<1-3> | SALU_CYCLE_<0-7>
+///   <count> := SAME | NEXT | SKIP_<1-6>
+///
+/// \param OpIdx         Operand index; must be 0.
+/// \param Imm           Receives the encoded immediate. It is reset to 0 on
+///                      entry and holds the first dependency as soon as that
+///                      has been decoded.
+/// \param Src           Mnemonic text; consumed from the front while parsing.
+/// \param ErrorCallback Invoked with the failing location and a message.
+/// \returns false on success, otherwise the value returned by
+///          \p ErrorCallback.
+bool AMDGPUMIRFormatter::parseSDelayAluImmMnemonic(
+    const unsigned int OpIdx, int64_t &Imm, llvm::StringRef &Src,
+    llvm::MIRFormatter::ErrorCallbackType &ErrorCallback) const {
+  assert(OpIdx == 0);
+
+  Imm = 0;
+  if (!Src.consume_front(".id0_"))
+    return ErrorCallback(Src.begin(), "Expected .id0_");
+
+  // Consume a decimal number in [Min, Max] and bias it by Offset. Returns -1
+  // when no number is present or it lies outside the range, so that a bad
+  // value can never spill into a neighbouring bit field.
+  auto ExpectInt = [](StringRef &Str, int64_t Offset, int64_t Min,
+                      int64_t Max) -> int64_t {
+    int64_t Dep;
+    // consumeInteger returns true on failure.
+    if (Str.consumeInteger(10, Dep) || Dep < Min || Dep > Max)
+      return -1;
+    return Dep + Offset;
+  };
+
+  // Decode one dependency name into its 4-bit field value, or -1 on error.
+  auto DecodeDelay = [&](StringRef &Str) -> int64_t {
+    if (Str.consume_front("NONE"))
+      return 0;
+    if (Str.consume_front("VALU_DEP_"))
+      return ExpectInt(Str, 0, 1, 4);
+    if (Str.consume_front("TRANS32_DEP_"))
+      return ExpectInt(Str, 4, 1, 3);
+    if (Str.consume_front("SALU_CYCLE_"))
+      return ExpectInt(Str, 8, 0, 7);
+    return -1;
+  };
+
+  const int64_t Delay0 = DecodeDelay(Src);
+  if (Delay0 == -1)
+    return ErrorCallback(Src.begin(), "Could not decode delay0");
+
+  // Set the Imm so far, so that the early return has the correct value.
+  Imm = Delay0;
+
+  // If that was the end of the string, the second instruction is "same" and
+  // "none".
+  if (Src.empty())
+    return false;
+
+  if (!Src.consume_front("_skip_"))
+    return ErrorCallback(Src.begin(), "Expected _skip_");
+
+  int64_t Skip = 0;
+  if (Src.consume_front("SAME")) {
+    Skip = 0;
+  } else if (Src.consume_front("NEXT")) {
+    Skip = 1;
+  } else if (Src.consume_front("SKIP_")) {
+    // The 3-bit field holds at most 7, i.e. SKIP_6 after the bias below.
+    if (Src.consumeInteger(10, Skip) || Skip < 1 || Skip > 6)
+      return ErrorCallback(Src.begin(), "Expected integer Skip value");
+    // The printer writes SKIP_<field - 1>, so undo that bias here.
+    Skip += 1;
+  } else {
+    return ErrorCallback(Src.begin(), "Unexpected Skip Value");
+  }
+
+  if (!Src.consume_front("_id1_"))
+    return ErrorCallback(Src.begin(), "Expected _id1_");
+
+  const int64_t Delay1 = DecodeDelay(Src);
+  if (Delay1 == -1)
+    return ErrorCallback(Src.begin(), "Could not decode delay1");
+
+  // Anything left over is not part of the mnemonic grammar.
+  if (!Src.empty())
+    return ErrorCallback(Src.begin(), "Unexpected trailing characters");
+
+  Imm = Imm | (Skip << 4) | (Delay1 << 7);
+  return false;
+}
+
bool AMDGPUMIRFormatter::parseCustomPseudoSourceValue(
StringRef Src, MachineFunction &MF, PerFunctionMIParsingState &PFS,
const PseudoSourceValue *&PSV, ErrorCallbackType ErrorCallback) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
index 98b5031071cf4..80bb3dfe7a364 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
@@ -28,12 +28,35 @@ class AMDGPUMIRFormatter final : public MIRFormatter {
AMDGPUMIRFormatter() = default;
virtual ~AMDGPUMIRFormatter() = default;
+ /// Implement target specific printing for machine operand immediate value, so
+ /// that we can have more meaningful mnemonic than a 64-bit integer. Passing
+ /// std::nullopt as OpIdx means the index is unknown.
+ virtual void printImm(raw_ostream &OS, const MachineInstr &MI,
+ std::optional<unsigned> OpIdx,
+ int64_t Imm) const override;
+
+ /// Implement target specific parsing of immediate mnemonics. The mnemonic is
+ /// dot separated strings.
+ virtual bool parseImmMnemonic(const unsigned OpCode, const unsigned OpIdx,
+ StringRef Src, int64_t &Imm,
+ ErrorCallbackType ErrorCallback) const override;
+
/// Implement target specific parsing of target custom pseudo source value.
bool
parseCustomPseudoSourceValue(StringRef Src, MachineFunction &MF,
PerFunctionMIParsingState &PFS,
const PseudoSourceValue *&PSV,
ErrorCallbackType ErrorCallback) const override;
+
+private:
+ /// Print the string to represent s_delay_alu immediate value
+ void printSDelayAluImm(int64_t Imm, llvm::raw_ostream &OS) const;
+
+ /// Parse the immediate pseudo literal for s_delay_alu
+ bool parseSDelayAluImmMnemonic(
+ const unsigned int OpIdx, int64_t &Imm, llvm::StringRef &Src,
+ llvm::MIRFormatter::ErrorCallbackType &ErrorCallback) const;
+
};
} // end namespace llvm
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-literal.mir b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-literal.mir
new file mode 100644
index 0000000000000..7788e50ed4d24
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-literal.mir
@@ -0,0 +1,175 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass=amdgpu-insert-delay-alu %s -o - | FileCheck %s
+
+---
+name: valu_dep_1
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: valu_dep_1
+ ; CHECK: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ ; CHECK-NEXT: S_DELAY_ALU .id0_VALU_DEP_1
+ ; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: valu_dep_2
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: valu_dep_2
+ ; CHECK: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ ; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+ ; CHECK-NEXT: S_DELAY_ALU .id0_VALU_DEP_2
+ ; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: valu_dep_3
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: valu_dep_3
+ ; CHECK: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ ; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+ ; CHECK-NEXT: $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
+ ; CHECK-NEXT: S_DELAY_ALU .id0_VALU_DEP_3
+ ; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+ $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: valu_dep_4
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: valu_dep_4
+ ; CHECK: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ ; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+ ; CHECK-NEXT: $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
+ ; CHECK-NEXT: $vgpr3 = V_ADD_U32_e32 $vgpr3, $vgpr3, implicit $exec
+ ; CHECK-NEXT: S_DELAY_ALU .id0_VALU_DEP_4
+ ; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+ $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
+ $vgpr3 = V_ADD_U32_e32 $vgpr3, $vgpr3, implicit $exec
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: trans32_dep_1
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: trans32_dep_1
+ ; CHECK: $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
+ ; CHECK-NEXT: S_DELAY_ALU .id0_TRANS32_DEP_1
+ ; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: trans32_dep_2
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: trans32_dep_2
+ ; CHECK: $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
+ ; CHECK-NEXT: $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
+ ; CHECK-NEXT: S_DELAY_ALU .id0_TRANS32_DEP_2
+ ; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
+ $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: trans32_dep_3
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: trans32_dep_3
+ ; CHECK: $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
+ ; CHECK-NEXT: $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
+ ; CHECK-NEXT: $vgpr2 = V_EXP_F32_e32 $vgpr2, implicit $exec, implicit $mode
+ ; CHECK-NEXT: S_DELAY_ALU .id0_TRANS32_DEP_3
+ ; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
+ $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
+ $vgpr2 = V_EXP_F32_e32 $vgpr2, implicit $exec, implicit $mode
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: salu_cycle_1
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: salu_cycle_1
+ ; CHECK: $sgpr0 = S_MOV_B32 0
+ ; CHECK-NEXT: S_DELAY_ALU .id0_SALU_CYCLE_1
+ ; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
+ $sgpr0 = S_MOV_B32 0
+ $vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: valu_dep_1_same_trans32_dep_1
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: valu_dep_1_same_trans32_dep_1
+ ; CHECK: $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
+ ; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+ ; CHECK-NEXT: S_DELAY_ALU .id0_TRANS32_DEP_1_skip_SAME_id1_VALU_DEP_1
+ ; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
+ $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
+ $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
+...
+
+---
+name: valu_dep_1_same_salu_cycle_1
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: valu_dep_1_same_salu_cycle_1
+ ; CHECK: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ ; CHECK-NEXT: $sgpr0 = S_MOV_B32 0
+ ; CHECK-NEXT: S_DELAY_ALU .id0_VALU_DEP_1_skip_SAME_id1_SALU_CYCLE_1
+ ; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ $sgpr0 = S_MOV_B32 0
+ $vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: valu_dep_1_next_valu_dep_1
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: valu_dep_1_next_valu_dep_1
+ ; CHECK: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ ; CHECK-NEXT: S_DELAY_ALU .id0_VALU_DEP_1_skip_NEXT_id1_VALU_DEP_1
+ ; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ ; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: valu_dep_2_next_valu_dep_2
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: valu_dep_2_next_valu_dep_2
+ ; CHECK: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ ; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+ ; CHECK-NEXT: S_DELAY_ALU .id0_VALU_DEP_2_skip_NEXT_id1_VALU_DEP_2
+ ; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ ; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+...
+
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-parse.mir b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-parse.mir
new file mode 100644
index 0000000000000..0d264629d4b72
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-parse.mir
@@ -0,0 +1,198 @@
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -start-after=amdgpu-insert-delay-alu %s -o - | FileCheck %s
+
+---
+name: valu_dep_1
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}valu_dep_1:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ S_DELAY_ALU .id0_VALU_DEP_1
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: valu_dep_2
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}valu_dep_2:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+ S_DELAY_ALU .id0_VALU_DEP_2
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: valu_dep_3
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}valu_dep_3:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
+ ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+ $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
+ S_DELAY_ALU .id0_VALU_DEP_3
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: valu_dep_4
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}valu_dep_4:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
+ ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2
+ ; CHECK-NEXT: v_add_nc_u32_e32 v3, v3, v3
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_4)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+ $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
+ $vgpr3 = V_ADD_U32_e32 $vgpr3, $vgpr3, implicit $exec
+ S_DELAY_ALU .id0_VALU_DEP_4
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: trans32_dep_1
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}trans32_dep_1:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: v_exp_f32_e32 v0, v0
+ ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
+ S_DELAY_ALU .id0_TRANS32_DEP_1
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: trans32_dep_2
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}trans32_dep_2:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: v_exp_f32_e32 v0, v0
+ ; CHECK-NEXT: v_exp_f32_e32 v1, v1
+ ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_2)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
+ $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
+ S_DELAY_ALU .id0_TRANS32_DEP_2
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: trans32_dep_3
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}trans32_dep_3:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: v_exp_f32_e32 v0, v0
+ ; CHECK-NEXT: v_exp_f32_e32 v1, v1
+ ; CHECK-NEXT: v_exp_f32_e32 v2, v2
+ ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_3)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
+ $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
+ $vgpr2 = V_EXP_F32_e32 $vgpr2, implicit $exec, implicit $mode
+ S_DELAY_ALU .id0_TRANS32_DEP_3
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: salu_cycle_1
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}salu_cycle_1:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: s_mov_b32 s0, 0
+ ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0
+ $sgpr0 = S_MOV_B32 0
+ S_DELAY_ALU .id0_SALU_CYCLE_1
+ $vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: valu_dep_1_same_trans32_dep_1
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}valu_dep_1_same_trans32_dep_1:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: v_exp_f32_e32 v0, v0
+ ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
+ ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v1
+ $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
+ $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+ S_DELAY_ALU .id0_TRANS32_DEP_1_skip_SAME_id1_VALU_DEP_1
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
+...
+
+---
+name: valu_dep_1_same_salu_cycle_1
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}valu_dep_1_same_salu_cycle_1:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: s_mov_b32 s0, 0
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ $sgpr0 = S_MOV_B32 0
+ S_DELAY_ALU .id0_VALU_DEP_1_skip_SAME_id1_SALU_CYCLE_1
+ $vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: valu_dep_1_next_valu_dep_1
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}valu_dep_1_next_valu_dep_1:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ S_DELAY_ALU .id0_VALU_DEP_1_skip_NEXT_id1_VALU_DEP_1
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: valu_dep_2_next_valu_dep_2
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}valu_dep_2_next_valu_dep_2:
+ ; CHECK: %bb.0:
+ ; CHEC...
[truncated]
|
I think this looks pretty nice, unless there are any principled objections to using custom formatted MIR operands? Doing a similar thing for |
There are a few other literals that I think could benefit as well. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM with a few more formatting and spelling fixes.
@@ -0,0 +1,192 @@ | |||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 | |||
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass=amdgpu-insert-delay-alu %s -o - | FileCheck %s |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There's no reason to specify -wavefrontsize32
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If you don't you end up with both +wavefrontsize32 and +wavefrontsize64 set which can lead to weird inconsistencies. Since #86957 we check for the inconsistency, but only during isel so the check never runs on MIR test cases. I'd love to hear ideas for better places to put the check.
@@ -0,0 +1,217 @@ | |||
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -start-after=amdgpu-insert-delay-alu %s -o - | FileCheck %s |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Parser tests should go in test/CodeGen/MIR/AMDGPU, and use run-pass=none
No description provided.