Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AMDGPU] Introduce a pseudo mnemonic for S_DELAY_ALU in MIR. #96004

Merged
merged 5 commits into from
Jun 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
151 changes: 151 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,157 @@

using namespace llvm;

/// Print the immediate \p Imm of \p MI to \p OS. For S_DELAY_ALU the value is
/// rendered via printSDelayAluImm; for every other opcode the request is
/// forwarded unchanged to MIRFormatter::printImm.
void AMDGPUMIRFormatter::printImm(raw_ostream &OS, const MachineInstr &MI,
                                  std::optional<unsigned int> OpIdx,
                                  int64_t Imm) const {
  if (MI.getOpcode() != AMDGPU::S_DELAY_ALU) {
    MIRFormatter::printImm(OS, MI, OpIdx, Imm);
    return;
  }

  // The S_DELAY_ALU immediate is expected at operand index 0.
  assert(OpIdx == 0);
  printSDelayAluImm(Imm, OS);
}

/// Target hook for parsing an immediate written as a mnemonic (a string with a
/// leading dot). Only S_DELAY_ALU is handled here, by delegating to
/// parseSDelayAluImmMnemonic; for any other opcode the function returns true
/// without touching \p Imm.
bool AMDGPUMIRFormatter::parseImmMnemonic(
    const unsigned OpCode, const unsigned OpIdx, StringRef Src, int64_t &Imm,
    ErrorCallbackType ErrorCallback) const {
  if (OpCode == AMDGPU::S_DELAY_ALU)
    return parseSDelayAluImmMnemonic(OpIdx, Imm, Src, ErrorCallback);

  // No mnemonic form is known for this opcode.
  return true;
}

void AMDGPUMIRFormatter::printSDelayAluImm(int64_t Imm,
llvm::raw_ostream &OS) const {
// Construct an immediate string to represent the information encoded in the
// s_delay_alu immediate.
// .id0_<dep>[_skip_<count>_id1<dep>]
constexpr int64_t None = 0;
constexpr int64_t Same = 0;

uint64_t Id0 = (Imm & 0xF);
uint64_t Skip = ((Imm >> 4) & 0x7);
uint64_t Id1 = ((Imm >> 7) & 0xF);
auto Outdep = [&](uint64_t Id) {
if (Id == None)
OS << "NONE";
else if (Id < 5)
OS << "VALU_DEP_" << Id;
else if (Id < 8)
OS << "TRANS32_DEP_" << Id - 4;
else
OS << "SALU_CYCLE_" << Id - 8;
};

OS << ".id0_";
Outdep(Id0);

// If the second inst is "same" and "none", no need to print the rest of the
// string.
if (Skip == Same && Id1 == None)
return;

// Encode the second delay specification.
OS << "_skip_";
if (Skip == 0)
OS << "SAME";
else if (Skip == 1)
OS << "NEXT";
else
OS << "SKIP_" << Skip - 1;

OS << "_id1_";
Outdep(Id1);
}

bool AMDGPUMIRFormatter::parseSDelayAluImmMnemonic(
const unsigned int OpIdx, int64_t &Imm, llvm::StringRef &Src,
llvm::MIRFormatter::ErrorCallbackType &ErrorCallback) const
{
assert(OpIdx == 0);

Imm = 0;
bool Expected = Src.consume_front(".id0_");
if (!Expected)
return ErrorCallback(Src.begin(), "Expected .id0_");

auto ExpectInt = [&](StringRef &Src, int64_t Offset) -> int64_t {
int64_t Dep;
if (!Src.consumeInteger(10, Dep))
return Dep + Offset;

return -1;
};

auto DecodeDelay = [&](StringRef &Src) -> int64_t {
if (Src.consume_front("NONE"))
return 0;
if (Src.consume_front("VALU_DEP_"))
return ExpectInt(Src, 0);
if (Src.consume_front("TRANS32_DEP_"))
return ExpectInt(Src, 4);
if (Src.consume_front("SALU_CYCLE_"))
return ExpectInt(Src, 8);

return -1;
};

int64_t Delay0 = DecodeDelay(Src);
int64_t Skip = 0;
int64_t Delay1 = 0;
if (Delay0 == -1)
return ErrorCallback(Src.begin(), "Could not decode delay0");


// Set the Imm so far, to that early return has the correct value.
Imm = Delay0;

// If that was the end of the string, the second instruction is "same" and
// "none"
if (Src.begin() == Src.end())
return false;

Expected = Src.consume_front("_skip_");
if (!Expected)
return ErrorCallback(Src.begin(), "Expected _skip_");


if (Src.consume_front("SAME")) {
Skip = 0;
} else if (Src.consume_front("NEXT")) {
Skip = 1;
} else if (Src.consume_front("SKIP_")) {
if (Src.consumeInteger(10, Skip)) {
return ErrorCallback(Src.begin(), "Expected integer Skip value");
}
Skip += 1;
} else {
ErrorCallback(Src.begin(), "Unexpected Skip Value");
}

Expected = Src.consume_front("_id1_");
if (!Expected)
return ErrorCallback(Src.begin(), "Expected _id1_");

Delay1 = DecodeDelay(Src);
if (Delay1 == -1)
return ErrorCallback(Src.begin(), "Could not decode delay1");

Imm = Imm | (Skip << 4) | (Delay1 << 7);
return false;
}

bool AMDGPUMIRFormatter::parseCustomPseudoSourceValue(
StringRef Src, MachineFunction &MF, PerFunctionMIParsingState &PFS,
const PseudoSourceValue *&PSV, ErrorCallbackType ErrorCallback) const {
Expand Down
23 changes: 23 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,35 @@ class AMDGPUMIRFormatter final : public MIRFormatter {
AMDGPUMIRFormatter() = default;
virtual ~AMDGPUMIRFormatter() = default;

/// Implement target specific printing for machine operand immediate value, so
/// that we can have more meaningful mnemonic than a 64-bit integer. Passing
/// None to OpIdx means the index is unknown.
virtual void printImm(raw_ostream &OS, const MachineInstr &MI,
std::optional<unsigned> OpIdx,
int64_t Imm) const override;

/// Implement target specific parsing of immediate mnemonics. The mnemonic is
/// a string with a leading dot.
virtual bool parseImmMnemonic(const unsigned OpCode, const unsigned OpIdx,
StringRef Src, int64_t &Imm,
ErrorCallbackType ErrorCallback) const override;

/// Implement target specific parsing of target custom pseudo source value.
bool
parseCustomPseudoSourceValue(StringRef Src, MachineFunction &MF,
PerFunctionMIParsingState &PFS,
const PseudoSourceValue *&PSV,
ErrorCallbackType ErrorCallback) const override;

private:
/// Print the string to represent s_delay_alu immediate value
void printSDelayAluImm(int64_t Imm, llvm::raw_ostream &OS) const;

/// Parse the immediate pseudo literal for s_delay_alu
bool parseSDelayAluImmMnemonic(
const unsigned int OpIdx, int64_t &Imm, llvm::StringRef &Src,
llvm::MIRFormatter::ErrorCallbackType &ErrorCallback) const;

};

} // end namespace llvm
Expand Down
192 changes: 192 additions & 0 deletions llvm/test/CodeGen/AMDGPU/insert-delay-alu-literal.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass=amdgpu-insert-delay-alu %s -o - | FileCheck %s
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's no reason to specify -wavefrontsize32

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you don't, you end up with both +wavefrontsize32 and +wavefrontsize64 set, which can lead to weird inconsistencies. Since #86957 we check for the inconsistency, but only during isel, so the check never runs on MIR test cases. I'd love to hear ideas for better places to put the check.


---
name: valu_dep_1
body: |
bb.0:
; CHECK-LABEL: name: valu_dep_1
; CHECK: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
; CHECK-NEXT: S_DELAY_ALU .id0_VALU_DEP_1
; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...

---
name: valu_dep_2
body: |
bb.0:
; CHECK-LABEL: name: valu_dep_2
; CHECK: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
; CHECK-NEXT: S_DELAY_ALU .id0_VALU_DEP_2
; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...

---
name: valu_dep_3
body: |
bb.0:
; CHECK-LABEL: name: valu_dep_3
; CHECK: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
; CHECK-NEXT: $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
; CHECK-NEXT: S_DELAY_ALU .id0_VALU_DEP_3
; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
$vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...

---
name: valu_dep_4
body: |
bb.0:
; CHECK-LABEL: name: valu_dep_4
; CHECK: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
; CHECK-NEXT: $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
; CHECK-NEXT: $vgpr3 = V_ADD_U32_e32 $vgpr3, $vgpr3, implicit $exec
; CHECK-NEXT: S_DELAY_ALU .id0_VALU_DEP_4
; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
$vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
$vgpr3 = V_ADD_U32_e32 $vgpr3, $vgpr3, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...

---
name: trans32_dep_1
body: |
bb.0:
; CHECK-LABEL: name: trans32_dep_1
; CHECK: $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
; CHECK-NEXT: S_DELAY_ALU .id0_TRANS32_DEP_1
; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...

---
name: trans32_dep_2
body: |
bb.0:
; CHECK-LABEL: name: trans32_dep_2
; CHECK: $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
; CHECK-NEXT: $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
; CHECK-NEXT: S_DELAY_ALU .id0_TRANS32_DEP_2
; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
$vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...

---
name: trans32_dep_3
body: |
bb.0:
; CHECK-LABEL: name: trans32_dep_3
; CHECK: $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
; CHECK-NEXT: $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
; CHECK-NEXT: $vgpr2 = V_EXP_F32_e32 $vgpr2, implicit $exec, implicit $mode
; CHECK-NEXT: S_DELAY_ALU .id0_TRANS32_DEP_3
; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
$vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
$vgpr2 = V_EXP_F32_e32 $vgpr2, implicit $exec, implicit $mode
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...

---
name: salu_cycle_1
body: |
bb.0:
; CHECK-LABEL: name: salu_cycle_1
; CHECK: $sgpr0 = S_MOV_B32 0
; CHECK-NEXT: S_DELAY_ALU .id0_SALU_CYCLE_1
; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
$sgpr0 = S_MOV_B32 0
$vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
...

---
name: valu_dep_1_same_trans32_dep_1
body: |
bb.0:
; CHECK-LABEL: name: valu_dep_1_same_trans32_dep_1
; CHECK: $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
; CHECK-NEXT: S_DELAY_ALU .id0_TRANS32_DEP_1_skip_SAME_id1_VALU_DEP_1
; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
$vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
...

---
name: valu_dep_1_same_salu_cycle_1
body: |
bb.0:
; CHECK-LABEL: name: valu_dep_1_same_salu_cycle_1
; CHECK: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
; CHECK-NEXT: $sgpr0 = S_MOV_B32 0
; CHECK-NEXT: S_DELAY_ALU .id0_VALU_DEP_1_skip_SAME_id1_SALU_CYCLE_1
; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$sgpr0 = S_MOV_B32 0
$vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
...

---
name: valu_dep_1_next_valu_dep_1
body: |
bb.0:
; CHECK-LABEL: name: valu_dep_1_next_valu_dep_1
; CHECK: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
; CHECK-NEXT: S_DELAY_ALU .id0_VALU_DEP_1_skip_NEXT_id1_VALU_DEP_1
; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...

---
name: valu_dep_2_next_valu_dep_2
body: |
bb.0:
; CHECK-LABEL: name: valu_dep_2_next_valu_dep_2
; CHECK: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
; CHECK-NEXT: S_DELAY_ALU .id0_VALU_DEP_2_skip_NEXT_id1_VALU_DEP_2
; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
...

---
name: valu_dep_2_skip_valu_dep_2
body: |
bb.0:
; CHECK-LABEL: name: valu_dep_2_skip_valu_dep_2
; CHECK: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
; CHECK-NEXT: $vgpr2 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
; CHECK-NEXT: S_DELAY_ALU .id0_VALU_DEP_2_skip_SKIP_1_id1_VALU_DEP_2
; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
; CHECK-NEXT: $vgpr4 = V_ADD_U32_e32 $vgpr3, $vgpr3, implicit $exec
; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr2 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
$vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
$vgpr4 = V_ADD_U32_e32 $vgpr3, $vgpr3, implicit $exec
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
...
Loading
Loading