Skip to content

Commit

Permalink
[AMDGPU] Introduce a pseudo mnemonic for S_DELAY_ALU in MIR. (#96004)
Browse files Browse the repository at this point in the history
  • Loading branch information
mbedya committed Jun 20, 2024
1 parent 9473e16 commit edf2d0a
Show file tree
Hide file tree
Showing 5 changed files with 585 additions and 2 deletions.
151 changes: 151 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,157 @@

using namespace llvm;

/// Print the immediate operand \p Imm of \p MI to \p OS.
///
/// S_DELAY_ALU immediates are rendered through printSDelayAluImm; every other
/// opcode is forwarded unchanged to the generic MIRFormatter::printImm.
void AMDGPUMIRFormatter::printImm(raw_ostream &OS, const MachineInstr &MI,
                                  std::optional<unsigned int> OpIdx,
                                  int64_t Imm) const {
  // Anything that is not S_DELAY_ALU has no target-specific textual form here.
  if (MI.getOpcode() != AMDGPU::S_DELAY_ALU) {
    MIRFormatter::printImm(OS, MI, OpIdx, Imm);
    return;
  }

  // The delay immediate is expected to be operand 0 of S_DELAY_ALU.
  assert(OpIdx == 0);
  printSDelayAluImm(Imm, OS);
}

/// Target hook for parsing immediate mnemonics (strings with a leading dot).
///
/// Only S_DELAY_ALU is handled; its mnemonic is delegated to
/// parseSDelayAluImmMnemonic, whose result is returned as-is. For every other
/// opcode the function returns true without touching \p Imm.
bool AMDGPUMIRFormatter::parseImmMnemonic(const unsigned OpCode,
                                          const unsigned OpIdx, StringRef Src,
                                          int64_t &Imm,
                                          ErrorCallbackType ErrorCallback) const {
  if (OpCode == AMDGPU::S_DELAY_ALU)
    return parseSDelayAluImmMnemonic(OpIdx, Imm, Src, ErrorCallback);

  // No mnemonic is known for this opcode.
  return true;
}

void AMDGPUMIRFormatter::printSDelayAluImm(int64_t Imm,
llvm::raw_ostream &OS) const {
// Construct an immediate string to represent the information encoded in the
// s_delay_alu immediate.
// .id0_<dep>[_skip_<count>_id1<dep>]
constexpr int64_t None = 0;
constexpr int64_t Same = 0;

uint64_t Id0 = (Imm & 0xF);
uint64_t Skip = ((Imm >> 4) & 0x7);
uint64_t Id1 = ((Imm >> 7) & 0xF);
auto Outdep = [&](uint64_t Id) {
if (Id == None)
OS << "NONE";
else if (Id < 5)
OS << "VALU_DEP_" << Id;
else if (Id < 8)
OS << "TRANS32_DEP_" << Id - 4;
else
OS << "SALU_CYCLE_" << Id - 8;
};

OS << ".id0_";
Outdep(Id0);

// If the second inst is "same" and "none", no need to print the rest of the
// string.
if (Skip == Same && Id1 == None)
return;

// Encode the second delay specification.
OS << "_skip_";
if (Skip == 0)
OS << "SAME";
else if (Skip == 1)
OS << "NEXT";
else
OS << "SKIP_" << Skip - 1;

OS << "_id1_";
Outdep(Id1);
}

bool AMDGPUMIRFormatter::parseSDelayAluImmMnemonic(
const unsigned int OpIdx, int64_t &Imm, llvm::StringRef &Src,
llvm::MIRFormatter::ErrorCallbackType &ErrorCallback) const
{
assert(OpIdx == 0);

Imm = 0;
bool Expected = Src.consume_front(".id0_");
if (!Expected)
return ErrorCallback(Src.begin(), "Expected .id0_");

auto ExpectInt = [&](StringRef &Src, int64_t Offset) -> int64_t {
int64_t Dep;
if (!Src.consumeInteger(10, Dep))
return Dep + Offset;

return -1;
};

auto DecodeDelay = [&](StringRef &Src) -> int64_t {
if (Src.consume_front("NONE"))
return 0;
if (Src.consume_front("VALU_DEP_"))
return ExpectInt(Src, 0);
if (Src.consume_front("TRANS32_DEP_"))
return ExpectInt(Src, 4);
if (Src.consume_front("SALU_CYCLE_"))
return ExpectInt(Src, 8);

return -1;
};

int64_t Delay0 = DecodeDelay(Src);
int64_t Skip = 0;
int64_t Delay1 = 0;
if (Delay0 == -1)
return ErrorCallback(Src.begin(), "Could not decode delay0");


// Set the Imm so far, to that early return has the correct value.
Imm = Delay0;

// If that was the end of the string, the second instruction is "same" and
// "none"
if (Src.begin() == Src.end())
return false;

Expected = Src.consume_front("_skip_");
if (!Expected)
return ErrorCallback(Src.begin(), "Expected _skip_");


if (Src.consume_front("SAME")) {
Skip = 0;
} else if (Src.consume_front("NEXT")) {
Skip = 1;
} else if (Src.consume_front("SKIP_")) {
if (Src.consumeInteger(10, Skip)) {
return ErrorCallback(Src.begin(), "Expected integer Skip value");
}
Skip += 1;
} else {
ErrorCallback(Src.begin(), "Unexpected Skip Value");
}

Expected = Src.consume_front("_id1_");
if (!Expected)
return ErrorCallback(Src.begin(), "Expected _id1_");

Delay1 = DecodeDelay(Src);
if (Delay1 == -1)
return ErrorCallback(Src.begin(), "Could not decode delay1");

Imm = Imm | (Skip << 4) | (Delay1 << 7);
return false;
}

bool AMDGPUMIRFormatter::parseCustomPseudoSourceValue(
StringRef Src, MachineFunction &MF, PerFunctionMIParsingState &PFS,
const PseudoSourceValue *&PSV, ErrorCallbackType ErrorCallback) const {
Expand Down
23 changes: 23 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,35 @@ class AMDGPUMIRFormatter final : public MIRFormatter {
AMDGPUMIRFormatter() = default;
virtual ~AMDGPUMIRFormatter() = default;

/// Implement target specific printing for machine operand immediate value, so
/// that we can have more meaningful mnemonic than a 64-bit integer. Passing
/// std::nullopt to OpIdx means the index is unknown.
virtual void printImm(raw_ostream &OS, const MachineInstr &MI,
std::optional<unsigned> OpIdx,
int64_t Imm) const override;

/// Implement target specific parsing of immediate mnemonics. The mnemonic is
/// a string with a leading dot.
virtual bool parseImmMnemonic(const unsigned OpCode, const unsigned OpIdx,
StringRef Src, int64_t &Imm,
ErrorCallbackType ErrorCallback) const override;

/// Implement target specific parsing of target custom pseudo source value.
bool
parseCustomPseudoSourceValue(StringRef Src, MachineFunction &MF,
PerFunctionMIParsingState &PFS,
const PseudoSourceValue *&PSV,
ErrorCallbackType ErrorCallback) const override;

private:
/// Print the string that represents the s_delay_alu immediate value \p Imm to
/// \p OS, in the form ".id0_<dep>[_skip_<count>_id1_<dep>]".
void printSDelayAluImm(int64_t Imm, llvm::raw_ostream &OS) const;

/// Parse the immediate pseudo literal for s_delay_alu from \p Src and store
/// the encoded value in \p Imm. Returns false on success; on a malformed
/// literal, returns the result of \p ErrorCallback.
bool parseSDelayAluImmMnemonic(
const unsigned int OpIdx, int64_t &Imm, llvm::StringRef &Src,
llvm::MIRFormatter::ErrorCallbackType &ErrorCallback) const;

};

} // end namespace llvm
Expand Down
192 changes: 192 additions & 0 deletions llvm/test/CodeGen/AMDGPU/insert-delay-alu-literal.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass=amdgpu-insert-delay-alu %s -o - | FileCheck %s

---
name: valu_dep_1
body: |
bb.0:
; CHECK-LABEL: name: valu_dep_1
; CHECK: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
; CHECK-NEXT: S_DELAY_ALU .id0_VALU_DEP_1
; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...

---
name: valu_dep_2
body: |
bb.0:
; CHECK-LABEL: name: valu_dep_2
; CHECK: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
; CHECK-NEXT: S_DELAY_ALU .id0_VALU_DEP_2
; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...

---
name: valu_dep_3
body: |
bb.0:
; CHECK-LABEL: name: valu_dep_3
; CHECK: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
; CHECK-NEXT: $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
; CHECK-NEXT: S_DELAY_ALU .id0_VALU_DEP_3
; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
$vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...

---
name: valu_dep_4
body: |
bb.0:
; CHECK-LABEL: name: valu_dep_4
; CHECK: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
; CHECK-NEXT: $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
; CHECK-NEXT: $vgpr3 = V_ADD_U32_e32 $vgpr3, $vgpr3, implicit $exec
; CHECK-NEXT: S_DELAY_ALU .id0_VALU_DEP_4
; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
$vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
$vgpr3 = V_ADD_U32_e32 $vgpr3, $vgpr3, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...

---
name: trans32_dep_1
body: |
bb.0:
; CHECK-LABEL: name: trans32_dep_1
; CHECK: $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
; CHECK-NEXT: S_DELAY_ALU .id0_TRANS32_DEP_1
; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...

---
name: trans32_dep_2
body: |
bb.0:
; CHECK-LABEL: name: trans32_dep_2
; CHECK: $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
; CHECK-NEXT: $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
; CHECK-NEXT: S_DELAY_ALU .id0_TRANS32_DEP_2
; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
$vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...

---
name: trans32_dep_3
body: |
bb.0:
; CHECK-LABEL: name: trans32_dep_3
; CHECK: $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
; CHECK-NEXT: $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
; CHECK-NEXT: $vgpr2 = V_EXP_F32_e32 $vgpr2, implicit $exec, implicit $mode
; CHECK-NEXT: S_DELAY_ALU .id0_TRANS32_DEP_3
; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
$vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
$vgpr2 = V_EXP_F32_e32 $vgpr2, implicit $exec, implicit $mode
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...

---
name: salu_cycle_1
body: |
bb.0:
; CHECK-LABEL: name: salu_cycle_1
; CHECK: $sgpr0 = S_MOV_B32 0
; CHECK-NEXT: S_DELAY_ALU .id0_SALU_CYCLE_1
; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
$sgpr0 = S_MOV_B32 0
$vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
...

---
name: valu_dep_1_same_trans32_dep_1
body: |
bb.0:
; CHECK-LABEL: name: valu_dep_1_same_trans32_dep_1
; CHECK: $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
; CHECK-NEXT: S_DELAY_ALU .id0_TRANS32_DEP_1_skip_SAME_id1_VALU_DEP_1
; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
$vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
...

---
name: valu_dep_1_same_salu_cycle_1
body: |
bb.0:
; CHECK-LABEL: name: valu_dep_1_same_salu_cycle_1
; CHECK: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
; CHECK-NEXT: $sgpr0 = S_MOV_B32 0
; CHECK-NEXT: S_DELAY_ALU .id0_VALU_DEP_1_skip_SAME_id1_SALU_CYCLE_1
; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$sgpr0 = S_MOV_B32 0
$vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
...

---
name: valu_dep_1_next_valu_dep_1
body: |
bb.0:
; CHECK-LABEL: name: valu_dep_1_next_valu_dep_1
; CHECK: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
; CHECK-NEXT: S_DELAY_ALU .id0_VALU_DEP_1_skip_NEXT_id1_VALU_DEP_1
; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...

---
name: valu_dep_2_next_valu_dep_2
body: |
bb.0:
; CHECK-LABEL: name: valu_dep_2_next_valu_dep_2
; CHECK: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
; CHECK-NEXT: S_DELAY_ALU .id0_VALU_DEP_2_skip_NEXT_id1_VALU_DEP_2
; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
...

---
name: valu_dep_2_skip_valu_dep_2
body: |
bb.0:
; CHECK-LABEL: name: valu_dep_2_skip_valu_dep_2
; CHECK: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
; CHECK-NEXT: $vgpr2 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
; CHECK-NEXT: S_DELAY_ALU .id0_VALU_DEP_2_skip_SKIP_1_id1_VALU_DEP_2
; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
; CHECK-NEXT: $vgpr4 = V_ADD_U32_e32 $vgpr3, $vgpr3, implicit $exec
; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr2 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
$vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
$vgpr4 = V_ADD_U32_e32 $vgpr3, $vgpr3, implicit $exec
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
...
Loading

0 comments on commit edf2d0a

Please sign in to comment.