diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 8b0b05c0ea424..b7097308f6e89 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -319,6 +319,64 @@ used in the '``llvm.nvvm.idp4a.[us].u``' variants, while sign-extension is used
 with '``llvm.nvvm.idp4a.[us].s``' variants. The dot product of these 4-element
 vectors is added to ``%c`` to produce the return.
 
+Bit Manipulation Intrinsics
+---------------------------
+
+'``llvm.nvvm.fshl.clamp.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare i32 @llvm.nvvm.fshl.clamp.i32(i32 %hi, i32 %lo, i32 %n)
+
+Overview:
+"""""""""
+
+The '``llvm.nvvm.fshl.clamp``' family of intrinsics performs a clamped funnel
+shift left. These intrinsics are very similar to '``llvm.fshl``', except the
+shift amount is clamped at the integer width (instead of taken modulo it).
+Currently, only ``i32`` is supported.
+
+Semantics:
+""""""""""
+
+The '``llvm.nvvm.fshl.clamp``' family of intrinsic functions performs a clamped
+funnel shift left: the first two values are concatenated as { %hi : %lo } (%hi
+holds the most significant bits of the wide value), the combined value is
+shifted left, and the most significant bits are extracted to produce a result
+that is the same size as the original arguments. The shift amount is the
+minimum of the value of %n and the bit width of the integer type.
+
+'``llvm.nvvm.fshr.clamp.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare i32 @llvm.nvvm.fshr.clamp.i32(i32 %hi, i32 %lo, i32 %n)
+
+Overview:
+"""""""""
+
+The '``llvm.nvvm.fshr.clamp``' family of intrinsics performs a clamped funnel
+shift right. These intrinsics are very similar to '``llvm.fshr``', except the
+shift amount is clamped at the integer width (instead of taken modulo it).
+Currently, only ``i32`` is supported.
+
+Semantics:
+""""""""""
+
+The '``llvm.nvvm.fshr.clamp``' family of intrinsic functions performs a clamped
+funnel shift right: the first two values are concatenated as { %hi : %lo } (%hi
+holds the most significant bits of the wide value), the combined value is
+shifted right, and the least significant bits are extracted to produce a result
+that is the same size as the original arguments. The shift amount is the
+minimum of the value of %n and the bit width of the integer type.
+
 Other Intrinsics
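
To make the clamping behavior concrete, here is a minimal IR sketch of the
difference (illustrative only; the results follow from the semantics documented
above, using the ``i32`` declarations from that section):

.. code-block:: llvm

   ; The amount 40 exceeds the 32-bit width, so the clamped form shifts the
   ; wide value { %hi : %lo } left by min(40, 32) = 32 and returns %lo.
   %clamped = call i32 @llvm.nvvm.fshl.clamp.i32(i32 %hi, i32 %lo, i32 40)
   ; The generic funnel shift instead shifts by 40 urem 32 = 8, producing
   ; (%hi << 8) | (%lo >> 24).
   %wrapped = call i32 @llvm.fshl.i32(i32 %hi, i32 %lo, i32 40)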
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 7b8ffe417fccd..b4a06f583f2c9 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1080,6 +1080,16 @@ let TargetPrefix = "nvvm" in {
     }
   }
 
+//
+// Funnel-shift
+//
+  foreach direction = ["l", "r"] in
+    def int_nvvm_fsh # direction # _clamp :
+      DefaultAttrsIntrinsic<[llvm_anyint_ty],
+          [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
+          [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;
+
+
 //
 // Convert
 //
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index b5478b8f09ceb..5f6cba397c535 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3535,6 +3535,15 @@ let hasSideEffects = false in {
   defm SHF_R_WRAP : ShfInst<"r.wrap", fshr>;
 }
 
+def : Pat<(i32 (int_nvvm_fshl_clamp (i32 Int32Regs:$hi), (i32 Int32Regs:$lo), (i32 Int32Regs:$amt))),
+          (SHF_L_CLAMP_r (i32 Int32Regs:$lo), (i32 Int32Regs:$hi), (i32 Int32Regs:$amt))>;
+def : Pat<(i32 (int_nvvm_fshl_clamp (i32 Int32Regs:$hi), (i32 Int32Regs:$lo), (i32 imm:$amt))),
+          (SHF_L_CLAMP_i (i32 Int32Regs:$lo), (i32 Int32Regs:$hi), (i32 imm:$amt))>;
+def : Pat<(i32 (int_nvvm_fshr_clamp (i32 Int32Regs:$hi), (i32 Int32Regs:$lo), (i32 Int32Regs:$amt))),
+          (SHF_R_CLAMP_r (i32 Int32Regs:$lo), (i32 Int32Regs:$hi), (i32 Int32Regs:$amt))>;
+def : Pat<(i32 (int_nvvm_fshr_clamp (i32 Int32Regs:$hi), (i32 Int32Regs:$lo), (i32 imm:$amt))),
+          (SHF_R_CLAMP_i (i32 Int32Regs:$lo), (i32 Int32Regs:$hi), (i32 imm:$amt))>;
+
 // Count leading zeros
 let hasSideEffects = false in {
   def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index b141229dcfc73..e35ba25b47880 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -14,8 +14,12 @@
 #include "llvm/CodeGen/BasicTTIImpl.h"
 #include "llvm/CodeGen/CostTable.h"
 #include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsNVPTX.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Transforms/InstCombine/InstCombiner.h"
 #include <optional>
 
 using namespace llvm;
@@ -134,6 +138,7 @@ static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) {
   // simplify.
   enum SpecialCase {
     SPC_Reciprocal,
+    SCP_FunnelShiftClamp,
   };
 
   // SimplifyAction is a poor-man's variant (plus an additional flag) that
@@ -314,6 +319,10 @@ static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) {
   case Intrinsic::nvvm_rcp_rn_d:
     return {SPC_Reciprocal, FTZ_Any};
 
+  case Intrinsic::nvvm_fshl_clamp:
+  case Intrinsic::nvvm_fshr_clamp:
+    return {SCP_FunnelShiftClamp, FTZ_Any};
+
   // We do not currently simplify intrinsics that give an approximate
   // answer. These include:
   //
@@ -384,6 +393,22 @@ static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) {
     return BinaryOperator::Create(
         Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1),
         II->getArgOperand(0), II->getName());
+
+  case SCP_FunnelShiftClamp: {
+    // Canonicalize a clamping funnel shift to the generic llvm funnel shift
+    // when possible, as this is easier for llvm to optimize further.
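In IR terms, the InstCombine hook above performs roughly the following rewrite.
This is a sketch of the two constant-amount cases only; a variable amount is
left untouched and handled by the ISel patterns earlier in the patch:

.. code-block:: llvm

   ; A constant amount below the bit width is canonicalized to the generic
   ; funnel shift, which the rest of the optimizer understands:
   ;   %r = call i32 @llvm.nvvm.fshl.clamp.i32(i32 %hi, i32 %lo, i32 3)
   ; becomes
   ;   %r = call i32 @llvm.fshl.i32(i32 %hi, i32 %lo, i32 3)
   ;
   ; A constant amount of 32 or more saturates the shift, so the call folds
   ; to an operand outright: fshl.clamp yields %lo and fshr.clamp yields %hi,
   ; matching getArgOperand(IsLeft ? 1 : 0) in the code above.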
+    if (const auto *ShiftConst = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
+      const bool IsLeft = II->getIntrinsicID() == Intrinsic::nvvm_fshl_clamp;
+      if (ShiftConst->getZExtValue() >= II->getType()->getIntegerBitWidth())
+        return IC.replaceInstUsesWith(*II, II->getArgOperand(IsLeft ? 1 : 0));
+
+      const unsigned FshIID = IsLeft ? Intrinsic::fshl : Intrinsic::fshr;
+      return CallInst::Create(Intrinsic::getOrInsertDeclaration(
+                                  II->getModule(), FshIID, II->getType()),
+                              SmallVector<Value *>(II->args()));
+    }
+    return nullptr;
+  }
   }
   llvm_unreachable("All SpecialCase enumerators should be handled in switch.");
 }
diff --git a/llvm/test/CodeGen/NVPTX/funnel-shift-clamp.ll b/llvm/test/CodeGen/NVPTX/funnel-shift-clamp.ll
new file mode 100644
index 0000000000000..b0d04805e7d3f
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/funnel-shift-clamp.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx -mcpu=sm_61 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_61 | FileCheck %s
+
+target triple = "nvptx-nvidia-cuda"
+
+declare i32 @llvm.nvvm.fshr.clamp.i32(i32, i32, i32)
+declare i32 @llvm.nvvm.fshl.clamp.i32(i32, i32, i32)
+
+define i32 @fshr_clamp_r(i32 %hi, i32 %lo, i32 %n) {
+; CHECK-LABEL: fshr_clamp_r(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [fshr_clamp_r_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [fshr_clamp_r_param_1];
+; CHECK-NEXT:    ld.param.u32 %r3, [fshr_clamp_r_param_2];
+; CHECK-NEXT:    shf.r.clamp.b32 %r4, %r2, %r1, %r3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT:    ret;
+  %call = call i32 @llvm.nvvm.fshr.clamp.i32(i32 %hi, i32 %lo, i32 %n)
+  ret i32 %call
+}
+
+define i32 @fshl_clamp_r(i32 %hi, i32 %lo, i32 %n) {
+; CHECK-LABEL: fshl_clamp_r(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [fshl_clamp_r_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [fshl_clamp_r_param_1];
+; CHECK-NEXT:    ld.param.u32 %r3, [fshl_clamp_r_param_2];
+; CHECK-NEXT:    shf.l.clamp.b32 %r4, %r2, %r1, %r3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT:    ret;
+  %call = call i32 @llvm.nvvm.fshl.clamp.i32(i32 %hi, i32 %lo, i32 %n)
+  ret i32 %call
+}
+
+define i32 @fshr_clamp_i(i32 %hi, i32 %lo) {
+; CHECK-LABEL: fshr_clamp_i(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [fshr_clamp_i_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [fshr_clamp_i_param_1];
+; CHECK-NEXT:    shf.r.clamp.b32 %r3, %r2, %r1, 3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
+  %call = call i32 @llvm.nvvm.fshr.clamp.i32(i32 %hi, i32 %lo, i32 3)
+  ret i32 %call
+}
+
+define i32 @fshl_clamp_i(i32 %hi, i32 %lo) {
+; CHECK-LABEL: fshl_clamp_i(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [fshl_clamp_i_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [fshl_clamp_i_param_1];
+; CHECK-NEXT:    shf.l.clamp.b32 %r3, %r2, %r1, 3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
+  %call = call i32 @llvm.nvvm.fshl.clamp.i32(i32 %hi, i32 %lo, i32 3)
+  ret i32 %call
+}
diff --git a/llvm/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll b/llvm/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll
index 35a81fffac3a7..a1517409e1ee1 100644
--- a/llvm/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll
+++ b/llvm/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll
@@ -384,6 +384,48 @@ define float @test_sqrt_rn_f_ftz(float %a) #0 {
   ret float %ret
 }
 
+; CHECK-LABEL: @test_fshl_clamp_1
+define i32 @test_fshl_clamp_1(i32 %a, i32 %b) {
+; CHECK: call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 3)
+  %call = call i32 @llvm.nvvm.fshl.clamp.i32(i32 %a, i32 %b, i32 3)
+  ret i32 %call
+}
+
+; CHECK-LABEL: @test_fshl_clamp_2
+define i32 @test_fshl_clamp_2(i32 %a, i32 %b) {
+; CHECK: ret i32 %b
+  %call = call i32 @llvm.nvvm.fshl.clamp.i32(i32 %a, i32 %b, i32 300)
+  ret i32 %call
+}
+
+; CHECK-LABEL: @test_fshl_clamp_3
+define i32 @test_fshl_clamp_3(i32 %a, i32 %b, i32 %c) {
+; CHECK: call i32 @llvm.nvvm.fshl.clamp.i32(i32 %a, i32 %b, i32 %c)
+  %call = call i32 @llvm.nvvm.fshl.clamp.i32(i32 %a, i32 %b, i32 %c)
+  ret i32 %call
+}
+
+; CHECK-LABEL: @test_fshr_clamp_1
+define i32 @test_fshr_clamp_1(i32 %a, i32 %b) {
+; CHECK: call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 29)
+  %call = call i32 @llvm.nvvm.fshr.clamp.i32(i32 %a, i32 %b, i32 3)
+  ret i32 %call
+}
+
+; CHECK-LABEL: @test_fshr_clamp_2
+define i32 @test_fshr_clamp_2(i32 %a, i32 %b) {
+; CHECK: ret i32 %a
+  %call = call i32 @llvm.nvvm.fshr.clamp.i32(i32 %a, i32 %b, i32 300)
+  ret i32 %call
+}
+
+; CHECK-LABEL: @test_fshr_clamp_3
+define i32 @test_fshr_clamp_3(i32 %a, i32 %b, i32 %c) {
+; CHECK: call i32 @llvm.nvvm.fshr.clamp.i32(i32 %a, i32 %b, i32 %c)
+  %call = call i32 @llvm.nvvm.fshr.clamp.i32(i32 %a, i32 %b, i32 %c)
+  ret i32 %call
+}
+
 declare double @llvm.nvvm.add.rn.d(double, double)
 declare float @llvm.nvvm.add.rn.f(float, float)
 declare float @llvm.nvvm.add.rn.ftz.f(float, float)
@@ -454,3 +496,5 @@ declare double @llvm.nvvm.ui2d.rn(i32)
 declare float @llvm.nvvm.ui2f.rn(i32)
 declare double @llvm.nvvm.ull2d.rn(i64)
 declare float @llvm.nvvm.ull2f.rn(i64)
+declare i32 @llvm.nvvm.fshr.clamp.i32(i32, i32, i32)
+declare i32 @llvm.nvvm.fshl.clamp.i32(i32, i32, i32)
\ No newline at end of file
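
For reference, the clamped semantics exercised by these tests can also be
written in portable IR as a compare-and-select around the generic funnel shift.
A minimal sketch, assuming ``i32``; the function name is illustrative and not
part of the patch:

.. code-block:: llvm

   declare i32 @llvm.fshl.i32(i32, i32, i32)

   define i32 @fshl_clamp_expanded(i32 %hi, i32 %lo, i32 %n) {
     ; For %n >= 32 the entire wide value { %hi : %lo } is shifted past the
     ; extracted bits, leaving %lo; for %n < 32 the wrapping and clamping
     ; forms agree, since %n urem 32 == %n.
     %sat = icmp uge i32 %n, 32
     %fsh = call i32 @llvm.fshl.i32(i32 %hi, i32 %lo, i32 %n)
     %res = select i1 %sat, i32 %lo, i32 %fsh
     ret i32 %res
   }

This mirrors the constant-amount folds checked in the InstCombine tests above.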