diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 5445746ab2a1b..7f41fb08944c5 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -8211,14 +8211,11 @@ isImpliedCondMatchingOperands(CmpInst::Predicate LPred,
   return std::nullopt;
 }
 
-/// Return true if "icmp LPred X, LC" implies "icmp RPred X, RC" is true.
-/// Return false if "icmp LPred X, LC" implies "icmp RPred X, RC" is false.
+/// Return true if `X in DomCR` implies `X in CR` is true.
+/// Return false if `X in DomCR` implies `X in CR` is false.
 /// Otherwise, return std::nullopt if we can't infer anything.
-static std::optional<bool> isImpliedCondCommonOperandWithConstants(
-    CmpInst::Predicate LPred, const APInt &LC, CmpInst::Predicate RPred,
-    const APInt &RC) {
-  ConstantRange DomCR = ConstantRange::makeExactICmpRegion(LPred, LC);
-  ConstantRange CR = ConstantRange::makeExactICmpRegion(RPred, RC);
+static std::optional<bool> isImpliedCondWithRange(const ConstantRange &DomCR,
+                                                  const ConstantRange &CR) {
   ConstantRange Intersection = DomCR.intersectWith(CR);
   ConstantRange Difference = DomCR.difference(CR);
   if (Intersection.isEmptySet())
@@ -8228,6 +8225,17 @@ static std::optional<bool> isImpliedCondCommonOperandWithConstants(
   return std::nullopt;
 }
 
+/// Return true if "icmp LPred X, LC" implies "icmp RPred X, RC" is true.
+/// Return false if "icmp LPred X, LC" implies "icmp RPred X, RC" is false.
+/// Otherwise, return std::nullopt if we can't infer anything.
+static std::optional<bool> isImpliedCondCommonOperandWithConstants(
+    CmpInst::Predicate LPred, const APInt &LC, CmpInst::Predicate RPred,
+    const APInt &RC) {
+  ConstantRange DomCR = ConstantRange::makeExactICmpRegion(LPred, LC);
+  ConstantRange CR = ConstantRange::makeExactICmpRegion(RPred, RC);
+  return isImpliedCondWithRange(DomCR, CR);
+}
+
 /// Return true if LHS implies RHS (expanded to its components as "R0 RPred R1")
 /// is true. Return false if LHS implies RHS is false. Otherwise, return
 /// std::nullopt if we can't infer anything.
@@ -8247,8 +8255,36 @@ static std::optional<bool> isImpliedCondICmps(const ICmpInst *LHS,
   // Can we infer anything when the 0-operands match and the 1-operands are
   // constants (not necessarily matching)?
   const APInt *LC, *RC;
-  if (L0 == R0 && match(L1, m_APInt(LC)) && match(R1, m_APInt(RC)))
-    return isImpliedCondCommonOperandWithConstants(LPred, *LC, RPred, *RC);
+  if (match(L1, m_APInt(LC)) && match(R1, m_APInt(RC))) {
+    if (L0 == R0)
+      return isImpliedCondCommonOperandWithConstants(LPred, *LC, RPred, *RC);
+
+    // Handle R0 = L0 binop V and R0 = V binop L0.
+    Value *R0Op1 = nullptr;
+    if (match(R0, m_c_BinOp(m_Specific(L0), m_Value(R0Op1)))) {
+      ConstantRange LHSRange = ConstantRange::makeExactICmpRegion(LPred, *LC);
+      ConstantRange CR = ConstantRange::makeExactICmpRegion(RPred, *RC);
+      // TODO: Use contextual information from SimplifyQuery.
+      ConstantRange RHSRange =
+          computeConstantRange(R0Op1, ICmpInst::isSigned(RPred),
+                               /*UseInstrInfo*/ true, /*AC*/ nullptr,
+                               /*CtxI*/ nullptr, /*DT*/ nullptr, Depth);
+      auto *BO = cast<BinaryOperator>(R0);
+      if (BO->getOperand(0) != L0)
+        std::swap(LHSRange, RHSRange);
+      unsigned NoWrapKind = 0;
+      if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(BO)) {
+        if (OBO->hasNoUnsignedWrap())
+          NoWrapKind |= OverflowingBinaryOperator::NoUnsignedWrap;
+        if (OBO->hasNoSignedWrap())
+          NoWrapKind |= OverflowingBinaryOperator::NoSignedWrap;
+      }
+      ConstantRange Range =
+          LHSRange.overflowingBinaryOp(BO->getOpcode(), RHSRange, NoWrapKind);
+      if (auto Res = isImpliedCondWithRange(Range, CR))
+        return Res;
+    }
+  }
 
   // Can we infer anything when the two compares have matching operands?
   bool AreSwappedOps;
diff --git a/llvm/test/Analysis/ValueTracking/implied-icmp-binop.ll b/llvm/test/Analysis/ValueTracking/implied-icmp-binop.ll
new file mode 100644
index 0000000000000..b9cc3f9a1b161
--- /dev/null
+++ b/llvm/test/Analysis/ValueTracking/implied-icmp-binop.ll
@@ -0,0 +1,206 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+
+; Tests from PR68799
+
+define i1 @f_and(i32 %x, i32 %y) {
+; CHECK-LABEL: define i1 @f_and(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret i1 false
+;
+entry:
+  %cmp = icmp ne i32 %x, 0
+  %0 = or i32 %x, %y
+  %and14 = icmp eq i32 %0, 0
+  %and1115 = and i1 %cmp, %and14
+  ret i1 %and1115
+}
+
+define i1 @f_or(i32 %x, i32 %y) {
+; CHECK-LABEL: define i1 @f_or(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret i1 true
+;
+entry:
+  %cmp.not = icmp eq i32 %x, 0
+  %0 = or i32 %x, %y
+  %or14 = icmp ne i32 %0, 0
+  %or1115 = or i1 %cmp.not, %or14
+  ret i1 %or1115
+}
+
+; Tests for more binops
+
+define i1 @f_add(i32 %x, i32 %y) {
+; CHECK-LABEL: define i1 @f_add(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret i1 false
+;
+entry:
+  %yr = and i32 %y, 7
+  %cmp = icmp ult i32 %x, 8
+  %0 = add i32 %yr, %x
+  %cmp2 = icmp ugt i32 %0, 16
+  %and = and i1 %cmp, %cmp2
+  ret i1 %and
+}
+
+define i1 @f_add_nsw(i32 %x, i32 %y) {
+; CHECK-LABEL: define i1 @f_add_nsw(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret i1 false
+;
+entry:
+  %yr = and i32 %y, 2147483647
+  %cmp = icmp sgt i32 %x, 5
+  %0 = add nsw i32 %yr, %x
+  %cmp2 = icmp slt i32 %0, 5
+  %and = and i1 %cmp, %cmp2
+  ret i1 %and
+}
+
+define i1 @f_add_nuw(i32 %x, i32 %y) {
+; CHECK-LABEL: define i1 @f_add_nuw(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret i1 false
+;
+entry:
+  %cmp = icmp ugt i32 %x, 1
+  %0 = add nuw i32 %x, %y
+  %cmp2 = icmp eq i32 %0, 1
+  %and = and i1 %cmp, %cmp2
+  ret i1 %and
+}
+
+define i1 @f_sub_nsw(i32 %x, i32 %y) {
+; CHECK-LABEL: define i1 @f_sub_nsw(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X]], 5
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+entry:
+  %yr = and i32 %y, 2147483647
+  %cmp = icmp slt i32 %x, 5
+  %0 = sub nsw i32 %x, %yr
+  %cmp2 = icmp slt i32 %0, 5
+  %and = and i1 %cmp, %cmp2
+  ret i1 %and
+}
+
+define i1 @f_sub_nuw(i32 %x, i32 %y) {
+; CHECK-LABEL: define i1 @f_sub_nuw(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret i1 false
+;
+entry:
+  %cmp = icmp ult i32 %x, 5
+  %0 = sub nuw i32 %x, %y
+  %cmp2 = icmp eq i32 %0, 6
+  %and = and i1 %cmp, %cmp2
+  ret i1 %and
+}
+
+; Negative tests
+
+; non-constant range
+define i1 @f_add_nofold1(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: define i1 @f_add_nofold1(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], i32 [[Z:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[YR:%.*]] = and i32 [[Y]], 7
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[X]], [[Z]]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[YR]], [[X]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ugt i32 [[TMP0]], 16
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+entry:
+  %yr = and i32 %y, 7
+  %cmp = icmp ult i32 %x, %z
+  %0 = add i32 %yr, %x
+  %cmp2 = icmp ugt i32 %0, 16
+  %and = and i1 %cmp, %cmp2
+  ret i1 %and
+}
+
+define i1 @f_add_nofold2(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: define i1 @f_add_nofold2(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], i32 [[Z:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[YR:%.*]] = and i32 [[Y]], 7
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[X]], 8
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[YR]], [[X]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ugt i32 [[TMP0]], [[Z]]
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+entry:
+  %yr = and i32 %y, 7
+  %cmp = icmp ult i32 %x, 8
+  %0 = add i32 %yr, %x
+  %cmp2 = icmp ugt i32 %0, %z
+  %and = and i1 %cmp, %cmp2
+  ret i1 %and
+}
+
+; narrower range
+define i1 @f_add_nofold3(i32 %x, i32 %y) {
+; CHECK-LABEL: define i1 @f_add_nofold3(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[YR:%.*]] = and i32 [[Y]], 7
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[X]], 8
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[YR]], [[X]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ugt i32 [[TMP0]], 10
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+entry:
+  %yr = and i32 %y, 7
+  %cmp = icmp ult i32 %x, 8
+  %0 = add i32 %yr, %x
+  %cmp2 = icmp ugt i32 %0, 10
+  %and = and i1 %cmp, %cmp2
+  ret i1 %and
+}
+
+; sub is not commutative
+define i1 @f_sub_nsw_nofold(i32 %x, i32 %y) {
+; CHECK-LABEL: define i1 @f_sub_nsw_nofold(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[YR:%.*]] = and i32 [[Y]], 2147483647
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X]], 5
+; CHECK-NEXT:    [[TMP0:%.*]] = sub nsw i32 [[YR]], [[X]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[TMP0]], 5
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+entry:
+  %yr = and i32 %y, 2147483647
+  %cmp = icmp slt i32 %x, 5
+  %0 = sub nsw i32 %yr, %x
+  %cmp2 = icmp slt i32 %0, 5
+  %and = and i1 %cmp, %cmp2
+  ret i1 %and
+}
+
+define i1 @pr69038(i32 %a, i32 %b) {
+; CHECK-LABEL: define i1 @pr69038(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[A]], 0
+; CHECK-NEXT:    ret i1 [[TOBOOL]]
+;
+  %tobool = icmp ne i32 %a, 0
+  %or = or i32 %a, %b
+  %tobool1 = icmp ne i32 %or, 0
+
%and = and i1 %tobool, %tobool1 + ret i1 %and +} diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index c48370a9c6c75..b08cfb2b5d257 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -142,7 +142,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; GCN-IR-NEXT: s_mov_b32 s15, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_ashr_i32 s0, s7, 31 ; GCN-IR-NEXT: s_mov_b32 s1, s0 @@ -156,16 +155,16 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_subb_u32 s7, s7, s2 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[12:13], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[6:7], 0 +; GCN-IR-NEXT: s_flbit_i32_b32 s14, s13 ; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[8:9] ; GCN-IR-NEXT: s_flbit_i32_b32 s8, s6 ; GCN-IR-NEXT: s_add_i32 s8, s8, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s9, s7 -; GCN-IR-NEXT: s_min_u32 s14, s8, s9 -; GCN-IR-NEXT: s_flbit_i32_b32 s8, s12 -; GCN-IR-NEXT: s_add_i32 s8, s8, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s9, s13 -; GCN-IR-NEXT: s_min_u32 s20, s8, s9 -; GCN-IR-NEXT: s_sub_u32 s16, s14, s20 +; GCN-IR-NEXT: s_min_u32 s8, s8, s9 +; GCN-IR-NEXT: s_flbit_i32_b32 s9, s12 +; GCN-IR-NEXT: s_add_i32 s9, s9, 32 +; GCN-IR-NEXT: s_min_u32 s20, s9, s14 +; GCN-IR-NEXT: s_sub_u32 s16, s8, s20 ; GCN-IR-NEXT: s_subb_u32 s17, 0, 0 ; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[18:19], s[16:17], 63 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[22:23], s[16:17], 63 @@ -174,27 +173,21 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_cselect_b32 s11, 0, s13 ; GCN-IR-NEXT: s_cselect_b32 s10, 0, s12 ; GCN-IR-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23] -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[18:19] -; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 -; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s18, s16, 1 -; GCN-IR-NEXT: s_addc_u32 s19, s17, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[18:19], 0 -; GCN-IR-NEXT: s_sub_i32 s16, 63, s16 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11] -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[12:13], s16 +; GCN-IR-NEXT: s_mov_b32 s9, 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4 -; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[16:17], s[12:13], s18 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: s_add_i32 s17, s16, 1 +; GCN-IR-NEXT: s_sub_i32 s10, 63, s16 +; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[12:13], s10 +; GCN-IR-NEXT: s_lshr_b64 s[16:17], s[12:13], s17 ; GCN-IR-NEXT: s_add_u32 s18, s6, -1 ; GCN-IR-NEXT: s_addc_u32 s19, s7, -1 -; GCN-IR-NEXT: s_not_b64 s[8:9], s[14:15] -; GCN-IR-NEXT: s_add_u32 s12, s8, s20 -; GCN-IR-NEXT: s_addc_u32 s13, s9, 0 -; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 -; GCN-IR-NEXT: s_mov_b32 s9, 0 -; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while +; GCN-IR-NEXT: s_not_b64 s[12:13], s[8:9] +; GCN-IR-NEXT: s_add_u32 s12, s12, s20 +; GCN-IR-NEXT: s_addc_u32 s13, s13, 0 +; GCN-IR-NEXT: .LBB0_2: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: s_lshl_b64 s[16:17], s[16:17], 1 ; GCN-IR-NEXT: s_lshr_b32 s8, s11, 31 @@ -214,11 +207,11 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[20:21], s[12:13], 0 ; GCN-IR-NEXT: s_mov_b64 s[14:15], s[8:9] ; 
GCN-IR-NEXT: s_and_b64 vcc, exec, s[20:21] -; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 -; GCN-IR-NEXT: .LBB0_4: ; %Flow7 +; GCN-IR-NEXT: s_cbranch_vccz .LBB0_2 +; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit ; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[10:11], 1 ; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[6:7] -; GCN-IR-NEXT: .LBB0_5: ; %udiv-end +; GCN-IR-NEXT: .LBB0_4: ; %udiv-end ; GCN-IR-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] ; GCN-IR-NEXT: s_xor_b64 s[2:3], s[10:11], s[0:1] ; GCN-IR-NEXT: s_sub_u32 s0, s2, s0 @@ -372,86 +365,75 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_xor_b32_e32 v1, v13, v3 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v13 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v13, vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[6:7] ; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 -; GCN-IR-NEXT: v_add_i32_e64 v2, s[6:7], 32, v2 +; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 ; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3 ; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v6 -; GCN-IR-NEXT: v_add_i32_e64 v2, s[6:7], 32, v2 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v7 -; GCN-IR-NEXT: v_min_u32_e32 v11, v2, v3 -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[6:7], v10, v11 -; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[6:7] -; GCN-IR-NEXT: v_subb_u32_e64 v3, s[6:7], 0, 0, s[6:7] -; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[2:3] -; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GCN-IR-NEXT: v_min_u32_e32 v2, v2, v3 +; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v10, v2 +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[8:9], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[8:9] ; GCN-IR-NEXT: v_mov_b32_e32 v14, v12 +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[8:9] +; GCN-IR-NEXT: s_xor_b64 s[8:9], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v15, v13 +; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 ; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v7, 0, s[4:5] ; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v6, 0, s[4:5] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB1_6 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], vcc +; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz .LBB1_4 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v2 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[6:7], v2 -; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB1_5 -; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v8 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 63, v8 ; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, -1, v0 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[6:7], v4 +; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[6:7], v9 ; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_not_b32_e32 v5, v10 -; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[6:7], v8 -; GCN-IR-NEXT: v_not_b32_e32 v4, 0 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, v5, v11 +; GCN-IR-NEXT: 
v_not_b32_e32 v6, v10 +; GCN-IR-NEXT: v_not_b32_e32 v7, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v4, vcc -; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, v6, v2 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: .LBB1_3: ; %udiv-do-while +; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; GCN-IR-NEXT: .LBB1_2: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 -; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v16, v8 -; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v17, v9, vcc -; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v16, v8 +; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v17, v9, vcc +; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v2 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 -; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 -; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 +; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v10 ; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1 ; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 ; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 -; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 -; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB1_5: ; %Flow4 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 -; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v1 -; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v0 -; GCN-IR-NEXT: .LBB1_6: ; %Flow5 +; GCN-IR-NEXT: v_mov_b32_e32 v11, v3 +; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN-IR-NEXT: v_mov_b32_e32 v10, v2 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: s_cbranch_execnz .LBB1_2 +; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v4, v2, v4 +; GCN-IR-NEXT: .LBB1_4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_xor_b32_e32 v0, v13, v12 ; GCN-IR-NEXT: v_xor_b32_e32 v1, v15, v14 ; GCN-IR-NEXT: v_xor_b32_e32 v3, v4, v0 @@ -971,7 +953,6 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-IR-LABEL: s_test_sdiv24_48: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GCN-IR-NEXT: s_mov_b32 s15, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sext_i32_i16 s5, s5 ; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[4:5], 24 @@ -993,16 +974,16 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-IR-NEXT: s_subb_u32 s7, s7, s4 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[12:13], 0 +; GCN-IR-NEXT: s_flbit_i32_b32 s14, s13 ; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] ; GCN-IR-NEXT: s_flbit_i32_b32 s8, s6 ; GCN-IR-NEXT: s_add_i32 s8, 
s8, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s9, s7 -; GCN-IR-NEXT: s_min_u32 s14, s8, s9 -; GCN-IR-NEXT: s_flbit_i32_b32 s8, s12 -; GCN-IR-NEXT: s_add_i32 s8, s8, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s9, s13 -; GCN-IR-NEXT: s_min_u32 s20, s8, s9 -; GCN-IR-NEXT: s_sub_u32 s16, s14, s20 +; GCN-IR-NEXT: s_min_u32 s8, s8, s9 +; GCN-IR-NEXT: s_flbit_i32_b32 s9, s12 +; GCN-IR-NEXT: s_add_i32 s9, s9, 32 +; GCN-IR-NEXT: s_min_u32 s20, s9, s14 +; GCN-IR-NEXT: s_sub_u32 s16, s8, s20 ; GCN-IR-NEXT: s_subb_u32 s17, 0, 0 ; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[18:19], s[16:17], 63 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[22:23], s[16:17], 63 @@ -1011,27 +992,21 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-IR-NEXT: s_cselect_b32 s11, 0, s13 ; GCN-IR-NEXT: s_cselect_b32 s10, 0, s12 ; GCN-IR-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23] -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[18:19] -; GCN-IR-NEXT: s_cbranch_vccz .LBB9_5 -; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s18, s16, 1 -; GCN-IR-NEXT: s_addc_u32 s19, s17, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[18:19], 0 -; GCN-IR-NEXT: s_sub_i32 s16, 63, s16 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11] -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[12:13], s16 +; GCN-IR-NEXT: s_mov_b32 s9, 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB9_4 -; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[16:17], s[12:13], s18 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: s_add_i32 s17, s16, 1 +; GCN-IR-NEXT: s_sub_i32 s10, 63, s16 +; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[12:13], s10 +; GCN-IR-NEXT: s_lshr_b64 s[16:17], s[12:13], s17 ; GCN-IR-NEXT: s_add_u32 s18, s6, -1 ; GCN-IR-NEXT: s_addc_u32 s19, s7, -1 -; GCN-IR-NEXT: s_not_b64 s[8:9], s[14:15] -; GCN-IR-NEXT: s_add_u32 s12, s8, s20 -; GCN-IR-NEXT: s_addc_u32 s13, s9, 0 -; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 -; GCN-IR-NEXT: s_mov_b32 s9, 0 -; GCN-IR-NEXT: .LBB9_3: ; %udiv-do-while +; GCN-IR-NEXT: s_not_b64 s[12:13], s[8:9] +; GCN-IR-NEXT: s_add_u32 s12, s12, s20 +; GCN-IR-NEXT: s_addc_u32 s13, s13, 0 +; GCN-IR-NEXT: .LBB9_2: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: s_lshl_b64 s[16:17], s[16:17], 1 ; GCN-IR-NEXT: s_lshr_b32 s8, s11, 31 @@ -1051,11 +1026,11 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[20:21], s[12:13], 0 ; GCN-IR-NEXT: s_mov_b64 s[14:15], s[8:9] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[20:21] -; GCN-IR-NEXT: s_cbranch_vccz .LBB9_3 -; GCN-IR-NEXT: .LBB9_4: ; %Flow4 +; GCN-IR-NEXT: s_cbranch_vccz .LBB9_2 +; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit ; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[10:11], 1 ; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[6:7] -; GCN-IR-NEXT: .LBB9_5: ; %udiv-end +; GCN-IR-NEXT: .LBB9_4: ; %udiv-end ; GCN-IR-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 ; GCN-IR-NEXT: s_xor_b64 s[0:1], s[4:5], s[2:3] ; GCN-IR-NEXT: s_xor_b64 s[2:3], s[10:11], s[0:1] @@ -1196,7 +1171,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-LABEL: s_test_sdiv_k_num_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_ashr_i32 s4, s3, 31 ; GCN-IR-NEXT: s_mov_b32 s5, s4 @@ -1206,61 +1181,54 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; 
GCN-IR-NEXT: s_flbit_i32_b32 s10, s2 ; GCN-IR-NEXT: s_add_i32 s10, s10, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s11, s3 -; GCN-IR-NEXT: s_min_u32 s14, s10, s11 -; GCN-IR-NEXT: s_add_u32 s10, s14, 0xffffffc5 +; GCN-IR-NEXT: s_min_u32 s12, s10, s11 +; GCN-IR-NEXT: s_add_u32 s10, s12, 0xffffffc5 ; GCN-IR-NEXT: s_addc_u32 s11, 0, -1 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[2:3], 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[12:13], s[10:11], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[2:3], 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[14:15], s[10:11], 63 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[10:11], 63 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[8:9], s[12:13] -; GCN-IR-NEXT: s_and_b64 s[8:9], s[12:13], exec -; GCN-IR-NEXT: s_cselect_b32 s8, 0, 24 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[16:17] -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[12:13] -; GCN-IR-NEXT: s_mov_b32 s9, 0 -; GCN-IR-NEXT: s_cbranch_vccz .LBB10_5 +; GCN-IR-NEXT: s_or_b64 s[14:15], s[6:7], s[14:15] +; GCN-IR-NEXT: s_and_b64 s[6:7], s[14:15], exec +; GCN-IR-NEXT: s_cselect_b32 s6, 0, 24 +; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] +; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[14:15] +; GCN-IR-NEXT: s_mov_b32 s7, 0 +; GCN-IR-NEXT: s_cbranch_vccz .LBB10_4 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s12, s10, 1 -; GCN-IR-NEXT: s_addc_u32 s13, s11, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[12:13], 0 +; GCN-IR-NEXT: s_add_i32 s6, s10, 1 ; GCN-IR-NEXT: s_sub_i32 s10, 63, s10 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; GCN-IR-NEXT: s_lshl_b64 s[8:9], 24, s10 -; GCN-IR-NEXT: s_cbranch_vccz .LBB10_4 -; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[12:13], 24, s12 +; GCN-IR-NEXT: s_lshl_b64 s[10:11], 24, s10 +; GCN-IR-NEXT: s_lshr_b64 s[14:15], 24, s6 ; GCN-IR-NEXT: s_add_u32 s16, s2, -1 ; GCN-IR-NEXT: s_addc_u32 s17, s3, -1 -; GCN-IR-NEXT: s_sub_u32 s10, 58, s14 -; GCN-IR-NEXT: s_subb_u32 s11, 0, 0 -; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 -; GCN-IR-NEXT: s_mov_b32 s7, 0 -; GCN-IR-NEXT: .LBB10_3: ; %udiv-do-while +; GCN-IR-NEXT: s_sub_u32 s12, 58, s12 +; GCN-IR-NEXT: s_subb_u32 s13, 0, 0 +; GCN-IR-NEXT: .LBB10_2: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 -; GCN-IR-NEXT: s_lshr_b32 s6, s9, 31 -; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[6:7] -; GCN-IR-NEXT: s_or_b64 s[8:9], s[14:15], s[8:9] -; GCN-IR-NEXT: s_sub_u32 s6, s16, s12 -; GCN-IR-NEXT: s_subb_u32 s6, s17, s13 -; GCN-IR-NEXT: s_ashr_i32 s14, s6, 31 -; GCN-IR-NEXT: s_mov_b32 s15, s14 -; GCN-IR-NEXT: s_and_b32 s6, s14, 1 -; GCN-IR-NEXT: s_and_b64 s[14:15], s[14:15], s[2:3] -; GCN-IR-NEXT: s_sub_u32 s12, s12, s14 -; GCN-IR-NEXT: s_subb_u32 s13, s13, s15 -; GCN-IR-NEXT: s_add_u32 s10, s10, 1 -; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[10:11], 0 -; GCN-IR-NEXT: s_mov_b64 s[14:15], s[6:7] +; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1 +; GCN-IR-NEXT: s_lshr_b32 s6, s11, 31 +; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 +; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[6:7] +; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] +; GCN-IR-NEXT: s_sub_u32 s6, s16, s14 +; GCN-IR-NEXT: s_subb_u32 s6, s17, s15 +; GCN-IR-NEXT: s_ashr_i32 s8, s6, 31 +; GCN-IR-NEXT: s_mov_b32 s9, s8 +; GCN-IR-NEXT: s_and_b32 s6, s8, 1 +; GCN-IR-NEXT: s_and_b64 s[8:9], s[8:9], s[2:3] +; GCN-IR-NEXT: s_sub_u32 s14, s14, s8 +; GCN-IR-NEXT: s_subb_u32 s15, s15, s9 +; GCN-IR-NEXT: s_add_u32 s12, s12, 1 +; GCN-IR-NEXT: 
s_addc_u32 s13, s13, 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[12:13], 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], s[6:7] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19] -; GCN-IR-NEXT: s_cbranch_vccz .LBB10_3 -; GCN-IR-NEXT: .LBB10_4: ; %Flow6 -; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[8:9], 1 -; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[2:3] -; GCN-IR-NEXT: .LBB10_5: ; %udiv-end -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[8:9], s[4:5] +; GCN-IR-NEXT: s_cbranch_vccz .LBB10_2 +; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit +; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[10:11], 1 +; GCN-IR-NEXT: s_or_b64 s[6:7], s[6:7], s[2:3] +; GCN-IR-NEXT: .LBB10_4: ; %udiv-end +; GCN-IR-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] ; GCN-IR-NEXT: s_sub_u32 s4, s6, s4 ; GCN-IR-NEXT: s_subb_u32 s5, s7, s5 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 @@ -1388,82 +1356,72 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v1 ; GCN-IR-NEXT: v_xor_b32_e32 v0, v12, v0 ; GCN-IR-NEXT: v_xor_b32_e32 v1, v12, v1 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 -; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc -; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 -; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 -; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v0, v12 +; GCN-IR-NEXT: v_subb_u32_e32 v3, vcc, v1, v12, vcc +; GCN-IR-NEXT: v_ffbh_u32_e32 v0, v2 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN-IR-NEXT: v_ffbh_u32_e32 v1, v3 +; GCN-IR-NEXT: v_min_u32_e32 v6, v0, v1 ; GCN-IR-NEXT: s_movk_i32 s6, 0xffc5 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v10 -; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] -; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[2:3] +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, s6, v6 +; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3] +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5] ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_cndmask_b32_e64 v4, 24, 0, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 24, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v13, v12 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB11_6 +; GCN-IR-NEXT: s_cbranch_execz .LBB11_4 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GCN-IR-NEXT: v_lshl_b64 v[2:3], 24, v2 -; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB11_5 -; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_lshr_b64 v[8:9], 24, v6 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 58, v10 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v4 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 63, v4 +; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v3, vcc +; GCN-IR-NEXT: v_lshl_b64 v[4:5], 24, v4 +; GCN-IR-NEXT: v_lshr_b64 v[8:9], 
24, v0 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc -; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 58, v6 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: .LBB11_3: ; %udiv-do-while +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc +; GCN-IR-NEXT: .LBB11_2: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 -; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v14, v8 -; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v15, v9, vcc -; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 31, v5 +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v0 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v14, v8 +; GCN-IR-NEXT: v_subb_u32_e32 v0, vcc, v15, v9, vcc +; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v0 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 -; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 -; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 -; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1 -; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0 +; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5 +; GCN-IR-NEXT: v_and_b32_e32 v0, 1, v10 +; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v3 +; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 ; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 -; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB11_3 -; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB11_5: ; %Flow4 +; GCN-IR-NEXT: v_mov_b32_e32 v11, v1 +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v10, v0 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_execnz .LBB11_2 +; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 -; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v1 -; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v0 -; GCN-IR-NEXT: .LBB11_6: ; %Flow5 +; GCN-IR-NEXT: v_lshl_b64 v[1:2], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v1, v0, v1 +; GCN-IR-NEXT: v_mov_b32_e32 v0, v1 +; GCN-IR-NEXT: v_mov_b32_e32 v1, v2 +; GCN-IR-NEXT: .LBB11_4: ; %Flow ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_xor_b32_e32 v0, v4, v12 -; GCN-IR-NEXT: v_xor_b32_e32 v1, v5, v13 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v12 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v13 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v13, vcc ; GCN-IR-NEXT: s_setpc_b64 s[30:31] @@ -1581,84 +1539,74 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v1 ; GCN-IR-NEXT: v_xor_b32_e32 v0, v12, v0 ; GCN-IR-NEXT: v_xor_b32_e32 v1, v12, v1 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 -; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc -; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 -; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 -; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v0, v12 +; GCN-IR-NEXT: 
v_subb_u32_e32 v3, vcc, v1, v12, vcc +; GCN-IR-NEXT: v_ffbh_u32_e32 v0, v2 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN-IR-NEXT: v_ffbh_u32_e32 v1, v3 +; GCN-IR-NEXT: v_min_u32_e32 v6, v0, v1 ; GCN-IR-NEXT: s_movk_i32 s6, 0xffd0 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v10 -; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] -; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[2:3] -; GCN-IR-NEXT: v_mov_b32_e32 v4, 0x8000 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, s6, v6 +; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3] +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5] +; GCN-IR-NEXT: v_mov_b32_e32 v0, 0x8000 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v13, v12 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB12_6 +; GCN-IR-NEXT: s_cbranch_execz .LBB12_4 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2 -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v4 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 63, v4 +; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v2 ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[4:5], v2 -; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GCN-IR-NEXT: s_cbranch_execz .LBB12_5 -; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v6 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v10 +; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v3, vcc +; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[4:5], v4 +; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v0 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc -; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: .LBB12_3: ; %udiv-do-while +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc +; GCN-IR-NEXT: .LBB12_2: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 -; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v14, v8 -; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v15, v9, vcc -; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 31, v5 +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v0 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v14, v8 +; GCN-IR-NEXT: v_subb_u32_e32 v0, vcc, v15, v9, vcc +; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v0 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 -; GCN-IR-NEXT: 
v_or_b32_e32 v3, v11, v3 -; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 -; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1 -; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0 +; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5 +; GCN-IR-NEXT: v_and_b32_e32 v0, 1, v10 +; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v3 +; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 ; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 -; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB12_3 -; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB12_5: ; %Flow4 +; GCN-IR-NEXT: v_mov_b32_e32 v11, v1 +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v10, v0 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_execnz .LBB12_2 +; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 -; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v1 -; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v0 -; GCN-IR-NEXT: .LBB12_6: ; %Flow5 +; GCN-IR-NEXT: v_lshl_b64 v[1:2], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v1, v0, v1 +; GCN-IR-NEXT: v_mov_b32_e32 v0, v1 +; GCN-IR-NEXT: v_mov_b32_e32 v1, v2 +; GCN-IR-NEXT: .LBB12_4: ; %Flow ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_xor_b32_e32 v0, v4, v12 -; GCN-IR-NEXT: v_xor_b32_e32 v1, v5, v13 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v12 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v13 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v13, vcc ; GCN-IR-NEXT: s_setpc_b64 s[30:31] @@ -1688,72 +1636,61 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_ffbh_u32_e32 v0, v4 ; GCN-IR-NEXT: v_add_i32_e64 v0, s[4:5], 32, v0 ; GCN-IR-NEXT: v_ffbh_u32_e32 v1, v5 -; GCN-IR-NEXT: v_min_u32_e32 v8, v0, v1 -; GCN-IR-NEXT: v_sub_i32_e64 v0, s[4:5], 48, v8 -; GCN-IR-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5] +; GCN-IR-NEXT: v_min_u32_e32 v0, v0, v1 +; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], 48, v0 +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, s[4:5] ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] -; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[0:1] +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[6:7] ; GCN-IR-NEXT: v_mov_b32_e32 v11, v10 ; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[0:1] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[6:7] +; GCN-IR-NEXT: s_xor_b64 s[8:9], s[4:5], -1 +; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v5, 0, s[4:5] ; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[4:5] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB13_6 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], vcc +; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz .LBB13_4 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v0, s[4:5], 63, v0 -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], v0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 -; 
GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB13_5 -; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: v_lshr_b64 v[6:7], v[4:5], v6 -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 0xffffffcf, v8 +; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v6 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 63, v6 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[4:5], v2 +; GCN-IR-NEXT: v_lshr_b64 v[6:7], v[4:5], v7 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 0xffffffcf, v0 ; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 ; GCN-IR-NEXT: v_addc_u32_e64 v5, s[4:5], 0, -1, vcc -; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 -; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff -; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 -; GCN-IR-NEXT: .LBB13_3: ; %udiv-do-while +; GCN-IR-NEXT: s_movk_i32 s10, 0x7fff +; GCN-IR-NEXT: .LBB13_2: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v1 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2 -; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s12, v6 -; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 31, v3 +; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s10, v6 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; GCN-IR-NEXT: v_subb_u32_e32 v0, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 1, v4 -; GCN-IR-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v2 +; GCN-IR-NEXT: v_or_b32_e32 v2, v8, v2 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8 +; GCN-IR-NEXT: v_and_b32_e32 v0, 1, v8 ; GCN-IR-NEXT: v_and_b32_e32 v8, 0x8000, v8 ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] -; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1 +; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3 ; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], v6, v8 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v1 ; GCN-IR-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v7, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v8, v2 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB13_3 -; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB13_5: ; %Flow4 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GCN-IR-NEXT: v_or_b32_e32 v3, v3, v1 -; GCN-IR-NEXT: v_or_b32_e32 v2, v2, v0 -; GCN-IR-NEXT: .LBB13_6: ; %Flow5 +; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN-IR-NEXT: v_mov_b32_e32 v8, v0 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: s_cbranch_execnz .LBB13_2 +; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; GCN-IR-NEXT: v_or_b32_e32 v2, v0, v2 +; GCN-IR-NEXT: .LBB13_4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_xor_b32_e32 v0, v2, v10 ; GCN-IR-NEXT: v_xor_b32_e32 v1, v3, v11 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index ac212d22e9cfa..27a56bae6b7a8 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -124,73 +124,66 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0xd ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b64 s[12:13], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[4:5], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[2:3], 0 ; GCN-IR-NEXT: s_flbit_i32_b32 s10, s4 -; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] -; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2 ; GCN-IR-NEXT: s_flbit_i32_b32 s11, s5 ; GCN-IR-NEXT: s_add_i32 s10, s10, 32 -; GCN-IR-NEXT: s_add_i32 s6, s6, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3 -; GCN-IR-NEXT: s_min_u32 s10, s10, s11 -; GCN-IR-NEXT: s_min_u32 s18, s6, s7 -; GCN-IR-NEXT: s_sub_u32 s12, s10, s18 -; GCN-IR-NEXT: s_subb_u32 s13, 0, 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[14:15], s[12:13], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[12:13], 63 +; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] +; GCN-IR-NEXT: s_flbit_i32_b32 s7, s2 +; GCN-IR-NEXT: s_min_u32 s6, s10, s11 +; GCN-IR-NEXT: s_add_i32 s7, s7, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s10, s3 +; GCN-IR-NEXT: s_min_u32 s18, s7, s10 +; GCN-IR-NEXT: s_sub_u32 s10, s6, s18 +; GCN-IR-NEXT: s_subb_u32 s11, 0, 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[14:15], s[10:11], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[10:11], 63 ; GCN-IR-NEXT: s_or_b64 s[14:15], s[8:9], s[14:15] ; GCN-IR-NEXT: s_and_b64 s[8:9], s[14:15], exec ; GCN-IR-NEXT: s_cselect_b32 s9, 0, s3 ; GCN-IR-NEXT: s_cselect_b32 s8, 0, s2 ; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] -; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[14:15] -; GCN-IR-NEXT: s_mov_b32 s11, 0 -; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 -; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s14, s12, 1 -; GCN-IR-NEXT: s_addc_u32 s15, s13, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[14:15], 0 -; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s12 +; GCN-IR-NEXT: s_mov_b32 s7, 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4 -; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[2:3], s14 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: s_add_i32 s11, s10, 1 +; GCN-IR-NEXT: s_sub_i32 s8, 63, s10 +; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 +; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[2:3], s11 ; GCN-IR-NEXT: s_add_u32 s16, s4, -1 ; GCN-IR-NEXT: s_addc_u32 s17, s5, -1 -; GCN-IR-NEXT: s_not_b64 s[6:7], s[10:11] -; GCN-IR-NEXT: s_add_u32 s10, s6, s18 -; GCN-IR-NEXT: s_addc_u32 s11, s7, 0 -; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 -; GCN-IR-NEXT: s_mov_b32 s7, 0 -; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while +; GCN-IR-NEXT: s_not_b64 s[10:11], s[6:7] +; GCN-IR-NEXT: s_add_u32 s10, s10, s18 +; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 +; GCN-IR-NEXT: .LBB0_2: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 +; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1 ; GCN-IR-NEXT: s_lshr_b32 s6, s9, 31 ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[6:7] -; GCN-IR-NEXT: s_or_b64 s[8:9], s[14:15], s[8:9] -; GCN-IR-NEXT: s_sub_u32 s6, s16, s12 -; GCN-IR-NEXT: s_subb_u32 s6, s17, s13 -; GCN-IR-NEXT: s_ashr_i32 s14, s6, 31 -; GCN-IR-NEXT: s_mov_b32 s15, s14 -; GCN-IR-NEXT: s_and_b32 s6, s14, 1 -; GCN-IR-NEXT: s_and_b64 s[14:15], s[14:15], s[4:5] -; GCN-IR-NEXT: s_sub_u32 s12, s12, s14 -; GCN-IR-NEXT: s_subb_u32 s13, s13, s15 +; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[6:7] +; GCN-IR-NEXT: s_or_b64 s[8:9], s[12:13], s[8:9] +; GCN-IR-NEXT: s_sub_u32 s6, s16, s14 +; GCN-IR-NEXT: 
s_subb_u32 s6, s17, s15 +; GCN-IR-NEXT: s_ashr_i32 s12, s6, 31 +; GCN-IR-NEXT: s_mov_b32 s13, s12 +; GCN-IR-NEXT: s_and_b32 s6, s12, 1 +; GCN-IR-NEXT: s_and_b64 s[12:13], s[12:13], s[4:5] +; GCN-IR-NEXT: s_sub_u32 s14, s14, s12 +; GCN-IR-NEXT: s_subb_u32 s15, s15, s13 ; GCN-IR-NEXT: s_add_u32 s10, s10, 1 ; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[10:11], 0 -; GCN-IR-NEXT: s_mov_b64 s[14:15], s[6:7] +; GCN-IR-NEXT: s_mov_b64 s[12:13], s[6:7] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19] -; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 -; GCN-IR-NEXT: .LBB0_4: ; %Flow7 +; GCN-IR-NEXT: s_cbranch_vccz .LBB0_2 +; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 ; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] -; GCN-IR-NEXT: .LBB0_5: ; %udiv-end +; GCN-IR-NEXT: .LBB0_4: ; %udiv-end ; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 ; GCN-IR-NEXT: v_mul_hi_u32 v0, s4, v0 ; GCN-IR-NEXT: s_mov_b32 s12, s0 @@ -349,85 +342,74 @@ define i64 @v_test_srem(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_xor_b32_e32 v3, v3, v4 ; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 ; GCN-IR-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v2 -; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 +; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v3 ; GCN-IR-NEXT: v_min_u32_e32 v12, v4, v5 ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0 -; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1 -; GCN-IR-NEXT: v_min_u32_e32 v13, v4, v5 -; GCN-IR-NEXT: v_sub_i32_e64 v4, s[6:7], v12, v13 -; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_subb_u32_e64 v5, s[6:7], 0, 0, s[6:7] -; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[4:5] -; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GCN-IR-NEXT: v_min_u32_e32 v4, v4, v5 +; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v12, v4 +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[8:9], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[8:9] ; GCN-IR-NEXT: v_mov_b32_e32 v15, v14 +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[8:9] +; GCN-IR-NEXT: s_xor_b64 s[8:9], s[4:5], -1 +; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: v_cndmask_b32_e64 v7, v1, 0, s[4:5] ; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v0, 0, s[4:5] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB1_6 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], vcc +; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz .LBB1_4 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4 -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] -; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4 -; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB1_5 -; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v8 +; 
GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 63, v8 ; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, -1, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, -1, v3, vcc -; GCN-IR-NEXT: v_not_b32_e32 v7, v12 -; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[0:1], v8 -; GCN-IR-NEXT: v_not_b32_e32 v6, 0 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v7, v13 +; GCN-IR-NEXT: v_not_b32_e32 v8, v12 +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[0:1], v6 +; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[0:1], v9 +; GCN-IR-NEXT: v_not_b32_e32 v9, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v6, vcc -; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v8, v4 ; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 -; GCN-IR-NEXT: .LBB1_3: ; %udiv-do-while +; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; GCN-IR-NEXT: .LBB1_2: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5 -; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6 -; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v16, v10 -; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v17, v11, vcc -; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v7 +; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4 +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v16, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v17, v11, vcc +; GCN-IR-NEXT: v_or_b32_e32 v6, v12, v6 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v4 ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v8 -; GCN-IR-NEXT: v_or_b32_e32 v5, v13, v5 -; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12 +; GCN-IR-NEXT: v_or_b32_e32 v7, v13, v7 +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v12 ; GCN-IR-NEXT: v_and_b32_e32 v13, v12, v3 ; GCN-IR-NEXT: v_and_b32_e32 v12, v12, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] ; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v12 ; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v13, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v13, v7 -; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v12, v6 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 -; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB1_5: ; %Flow4 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 -; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v5 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4 -; GCN-IR-NEXT: .LBB1_6: ; %Flow5 +; GCN-IR-NEXT: v_mov_b32_e32 v13, v5 +; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN-IR-NEXT: v_mov_b32_e32 v12, v4 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: s_cbranch_execnz .LBB1_2 +; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_or_b32_e32 v6, v4, v6 +; GCN-IR-NEXT: .LBB1_4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_mul_lo_u32 v4, v2, v7 ; GCN-IR-NEXT: v_mul_hi_u32 v5, v2, v6 ; GCN-IR-NEXT: v_mul_lo_u32 v3, v3, v6 @@ -1013,7 +995,7 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GCN-IR-NEXT: s_mov_b32 s13, 0 +; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; 
GCN-IR-NEXT: s_ashr_i64 s[2:3], s[6:7], 31 ; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[0:1], 31 @@ -1029,69 +1011,62 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_subb_u32 s9, s7, s10 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[8:9], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[2:3], 0 +; GCN-IR-NEXT: s_flbit_i32_b32 s12, s3 ; GCN-IR-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] ; GCN-IR-NEXT: s_flbit_i32_b32 s6, s8 ; GCN-IR-NEXT: s_add_i32 s6, s6, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s7, s9 -; GCN-IR-NEXT: s_min_u32 s12, s6, s7 -; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2 -; GCN-IR-NEXT: s_add_i32 s6, s6, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3 -; GCN-IR-NEXT: s_min_u32 s20, s6, s7 -; GCN-IR-NEXT: s_sub_u32 s14, s12, s20 -; GCN-IR-NEXT: s_subb_u32 s15, 0, 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[14:15], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[14:15], 63 +; GCN-IR-NEXT: s_min_u32 s6, s6, s7 +; GCN-IR-NEXT: s_flbit_i32_b32 s7, s2 +; GCN-IR-NEXT: s_add_i32 s7, s7, 32 +; GCN-IR-NEXT: s_min_u32 s20, s7, s12 +; GCN-IR-NEXT: s_sub_u32 s12, s6, s20 +; GCN-IR-NEXT: s_subb_u32 s13, 0, 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[12:13], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[12:13], 63 ; GCN-IR-NEXT: s_or_b64 s[16:17], s[10:11], s[16:17] ; GCN-IR-NEXT: s_and_b64 s[10:11], s[16:17], exec ; GCN-IR-NEXT: s_cselect_b32 s11, 0, s3 ; GCN-IR-NEXT: s_cselect_b32 s10, 0, s2 ; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] -; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[16:17] -; GCN-IR-NEXT: s_cbranch_vccz .LBB8_5 -; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s16, s14, 1 -; GCN-IR-NEXT: s_addc_u32 s17, s15, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[16:17], 0 -; GCN-IR-NEXT: s_sub_i32 s14, 63, s14 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11] -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[2:3], s14 +; GCN-IR-NEXT: s_mov_b32 s7, 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB8_4 -; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[2:3], s16 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: s_add_i32 s13, s12, 1 +; GCN-IR-NEXT: s_sub_i32 s10, 63, s12 +; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 +; GCN-IR-NEXT: s_lshr_b64 s[16:17], s[2:3], s13 ; GCN-IR-NEXT: s_add_u32 s18, s8, -1 ; GCN-IR-NEXT: s_addc_u32 s19, s9, -1 -; GCN-IR-NEXT: s_not_b64 s[6:7], s[12:13] -; GCN-IR-NEXT: s_add_u32 s12, s6, s20 -; GCN-IR-NEXT: s_addc_u32 s13, s7, 0 -; GCN-IR-NEXT: s_mov_b64 s[16:17], 0 -; GCN-IR-NEXT: s_mov_b32 s7, 0 -; GCN-IR-NEXT: .LBB8_3: ; %udiv-do-while +; GCN-IR-NEXT: s_not_b64 s[12:13], s[6:7] +; GCN-IR-NEXT: s_add_u32 s12, s12, s20 +; GCN-IR-NEXT: s_addc_u32 s13, s13, 0 +; GCN-IR-NEXT: .LBB8_2: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1 +; GCN-IR-NEXT: s_lshl_b64 s[16:17], s[16:17], 1 ; GCN-IR-NEXT: s_lshr_b32 s6, s11, 31 ; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 -; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[6:7] -; GCN-IR-NEXT: s_or_b64 s[10:11], s[16:17], s[10:11] -; GCN-IR-NEXT: s_sub_u32 s6, s18, s14 -; GCN-IR-NEXT: s_subb_u32 s6, s19, s15 -; GCN-IR-NEXT: s_ashr_i32 s16, s6, 31 -; GCN-IR-NEXT: s_mov_b32 s17, s16 -; GCN-IR-NEXT: s_and_b32 s6, s16, 1 -; GCN-IR-NEXT: s_and_b64 s[16:17], s[16:17], s[8:9] -; GCN-IR-NEXT: s_sub_u32 s14, s14, s16 -; GCN-IR-NEXT: s_subb_u32 s15, s15, s17 +; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[6:7] +; GCN-IR-NEXT: s_or_b64 s[10:11], s[14:15], s[10:11] +; GCN-IR-NEXT: 
s_sub_u32 s6, s18, s16 +; GCN-IR-NEXT: s_subb_u32 s6, s19, s17 +; GCN-IR-NEXT: s_ashr_i32 s14, s6, 31 +; GCN-IR-NEXT: s_mov_b32 s15, s14 +; GCN-IR-NEXT: s_and_b32 s6, s14, 1 +; GCN-IR-NEXT: s_and_b64 s[14:15], s[14:15], s[8:9] +; GCN-IR-NEXT: s_sub_u32 s16, s16, s14 +; GCN-IR-NEXT: s_subb_u32 s17, s17, s15 ; GCN-IR-NEXT: s_add_u32 s12, s12, 1 ; GCN-IR-NEXT: s_addc_u32 s13, s13, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[20:21], s[12:13], 0 -; GCN-IR-NEXT: s_mov_b64 s[16:17], s[6:7] +; GCN-IR-NEXT: s_mov_b64 s[14:15], s[6:7] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[20:21] -; GCN-IR-NEXT: s_cbranch_vccz .LBB8_3 -; GCN-IR-NEXT: .LBB8_4: ; %Flow7 +; GCN-IR-NEXT: s_cbranch_vccz .LBB8_2 +; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit ; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 ; GCN-IR-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] -; GCN-IR-NEXT: .LBB8_5: ; %udiv-end +; GCN-IR-NEXT: .LBB8_4: ; %udiv-end ; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 ; GCN-IR-NEXT: v_mul_hi_u32 v0, s8, v0 ; GCN-IR-NEXT: s_mul_i32 s11, s8, s11 @@ -1158,7 +1133,7 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-IR-LABEL: s_test_srem24_48: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GCN-IR-NEXT: s_mov_b32 s13, 0 +; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sext_i32_i16 s5, s5 ; GCN-IR-NEXT: s_sext_i32_i16 s7, s7 @@ -1180,69 +1155,62 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-IR-NEXT: s_subb_u32 s7, s7, s10 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[4:5], 0 +; GCN-IR-NEXT: s_flbit_i32_b32 s12, s5 ; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] ; GCN-IR-NEXT: s_flbit_i32_b32 s8, s6 ; GCN-IR-NEXT: s_add_i32 s8, s8, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s9, s7 -; GCN-IR-NEXT: s_min_u32 s12, s8, s9 -; GCN-IR-NEXT: s_flbit_i32_b32 s8, s4 -; GCN-IR-NEXT: s_add_i32 s8, s8, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s9, s5 -; GCN-IR-NEXT: s_min_u32 s20, s8, s9 -; GCN-IR-NEXT: s_sub_u32 s14, s12, s20 -; GCN-IR-NEXT: s_subb_u32 s15, 0, 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[14:15], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[14:15], 63 +; GCN-IR-NEXT: s_min_u32 s8, s8, s9 +; GCN-IR-NEXT: s_flbit_i32_b32 s9, s4 +; GCN-IR-NEXT: s_add_i32 s9, s9, 32 +; GCN-IR-NEXT: s_min_u32 s20, s9, s12 +; GCN-IR-NEXT: s_sub_u32 s12, s8, s20 +; GCN-IR-NEXT: s_subb_u32 s13, 0, 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[12:13], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[12:13], 63 ; GCN-IR-NEXT: s_or_b64 s[16:17], s[10:11], s[16:17] ; GCN-IR-NEXT: s_and_b64 s[10:11], s[16:17], exec ; GCN-IR-NEXT: s_cselect_b32 s11, 0, s5 ; GCN-IR-NEXT: s_cselect_b32 s10, 0, s4 ; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[16:17] -; GCN-IR-NEXT: s_cbranch_vccz .LBB9_5 -; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s16, s14, 1 -; GCN-IR-NEXT: s_addc_u32 s17, s15, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[16:17], 0 -; GCN-IR-NEXT: s_sub_i32 s14, 63, s14 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11] -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[4:5], s14 +; GCN-IR-NEXT: s_mov_b32 s9, 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB9_4 -; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[4:5], s16 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: s_add_i32 s13, s12, 1 +; GCN-IR-NEXT: s_sub_i32 s10, 63, s12 +; GCN-IR-NEXT: s_lshl_b64 
s[10:11], s[4:5], s10 +; GCN-IR-NEXT: s_lshr_b64 s[16:17], s[4:5], s13 ; GCN-IR-NEXT: s_add_u32 s18, s6, -1 ; GCN-IR-NEXT: s_addc_u32 s19, s7, -1 -; GCN-IR-NEXT: s_not_b64 s[8:9], s[12:13] -; GCN-IR-NEXT: s_add_u32 s12, s8, s20 -; GCN-IR-NEXT: s_addc_u32 s13, s9, 0 -; GCN-IR-NEXT: s_mov_b64 s[16:17], 0 -; GCN-IR-NEXT: s_mov_b32 s9, 0 -; GCN-IR-NEXT: .LBB9_3: ; %udiv-do-while +; GCN-IR-NEXT: s_not_b64 s[12:13], s[8:9] +; GCN-IR-NEXT: s_add_u32 s12, s12, s20 +; GCN-IR-NEXT: s_addc_u32 s13, s13, 0 +; GCN-IR-NEXT: .LBB9_2: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1 +; GCN-IR-NEXT: s_lshl_b64 s[16:17], s[16:17], 1 ; GCN-IR-NEXT: s_lshr_b32 s8, s11, 31 ; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 -; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[8:9] -; GCN-IR-NEXT: s_or_b64 s[10:11], s[16:17], s[10:11] -; GCN-IR-NEXT: s_sub_u32 s8, s18, s14 -; GCN-IR-NEXT: s_subb_u32 s8, s19, s15 -; GCN-IR-NEXT: s_ashr_i32 s16, s8, 31 -; GCN-IR-NEXT: s_mov_b32 s17, s16 -; GCN-IR-NEXT: s_and_b32 s8, s16, 1 -; GCN-IR-NEXT: s_and_b64 s[16:17], s[16:17], s[6:7] -; GCN-IR-NEXT: s_sub_u32 s14, s14, s16 -; GCN-IR-NEXT: s_subb_u32 s15, s15, s17 +; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[8:9] +; GCN-IR-NEXT: s_or_b64 s[10:11], s[14:15], s[10:11] +; GCN-IR-NEXT: s_sub_u32 s8, s18, s16 +; GCN-IR-NEXT: s_subb_u32 s8, s19, s17 +; GCN-IR-NEXT: s_ashr_i32 s14, s8, 31 +; GCN-IR-NEXT: s_mov_b32 s15, s14 +; GCN-IR-NEXT: s_and_b32 s8, s14, 1 +; GCN-IR-NEXT: s_and_b64 s[14:15], s[14:15], s[6:7] +; GCN-IR-NEXT: s_sub_u32 s16, s16, s14 +; GCN-IR-NEXT: s_subb_u32 s17, s17, s15 ; GCN-IR-NEXT: s_add_u32 s12, s12, 1 ; GCN-IR-NEXT: s_addc_u32 s13, s13, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[20:21], s[12:13], 0 -; GCN-IR-NEXT: s_mov_b64 s[16:17], s[8:9] +; GCN-IR-NEXT: s_mov_b64 s[14:15], s[8:9] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[20:21] -; GCN-IR-NEXT: s_cbranch_vccz .LBB9_3 -; GCN-IR-NEXT: .LBB9_4: ; %Flow4 +; GCN-IR-NEXT: s_cbranch_vccz .LBB9_2 +; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit ; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 ; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] -; GCN-IR-NEXT: .LBB9_5: ; %udiv-end +; GCN-IR-NEXT: .LBB9_4: ; %udiv-end ; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 ; GCN-IR-NEXT: v_mul_hi_u32 v0, s6, v0 ; GCN-IR-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 @@ -1386,76 +1354,69 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-LABEL: s_test_srem_k_num_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i32 s8, s3, 31 -; GCN-IR-NEXT: s_mov_b32 s9, s8 -; GCN-IR-NEXT: s_xor_b64 s[2:3], s[2:3], s[8:9] -; GCN-IR-NEXT: s_sub_u32 s4, s2, s8 -; GCN-IR-NEXT: s_subb_u32 s5, s3, s8 +; GCN-IR-NEXT: s_ashr_i32 s6, s3, 31 +; GCN-IR-NEXT: s_mov_b32 s7, s6 +; GCN-IR-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7] +; GCN-IR-NEXT: s_sub_u32 s4, s2, s6 +; GCN-IR-NEXT: s_subb_u32 s5, s3, s6 ; GCN-IR-NEXT: s_flbit_i32_b32 s2, s4 ; GCN-IR-NEXT: s_add_i32 s2, s2, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s3, s5 -; GCN-IR-NEXT: s_min_u32 s12, s2, s3 -; GCN-IR-NEXT: s_add_u32 s2, s12, 0xffffffc5 +; GCN-IR-NEXT: s_min_u32 s10, s2, s3 +; GCN-IR-NEXT: s_add_u32 s2, s10, 0xffffffc5 ; GCN-IR-NEXT: s_addc_u32 s3, 0, -1 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[4:5], 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[10:11], s[2:3], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], 
s[4:5], 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[12:13], s[2:3], 63 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[2:3], 63 -; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] -; GCN-IR-NEXT: s_and_b64 s[8:9], s[10:11], exec -; GCN-IR-NEXT: s_cselect_b32 s8, 0, 24 -; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11] -; GCN-IR-NEXT: s_mov_b32 s9, 0 -; GCN-IR-NEXT: s_cbranch_vccz .LBB10_5 +; GCN-IR-NEXT: s_or_b64 s[12:13], s[6:7], s[12:13] +; GCN-IR-NEXT: s_and_b64 s[6:7], s[12:13], exec +; GCN-IR-NEXT: s_cselect_b32 s6, 0, 24 +; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[12:13] +; GCN-IR-NEXT: s_mov_b32 s7, 0 +; GCN-IR-NEXT: s_cbranch_vccz .LBB10_4 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s8, s2, 1 -; GCN-IR-NEXT: s_addc_u32 s9, s3, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[8:9], 0 +; GCN-IR-NEXT: s_add_i32 s6, s2, 1 ; GCN-IR-NEXT: s_sub_i32 s2, 63, s2 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11] ; GCN-IR-NEXT: s_lshl_b64 s[2:3], 24, s2 -; GCN-IR-NEXT: s_cbranch_vccz .LBB10_4 -; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[10:11], 24, s8 +; GCN-IR-NEXT: s_lshr_b64 s[12:13], 24, s6 ; GCN-IR-NEXT: s_add_u32 s14, s4, -1 ; GCN-IR-NEXT: s_addc_u32 s15, s5, -1 -; GCN-IR-NEXT: s_sub_u32 s8, 58, s12 -; GCN-IR-NEXT: s_subb_u32 s9, 0, 0 -; GCN-IR-NEXT: s_mov_b64 s[12:13], 0 -; GCN-IR-NEXT: s_mov_b32 s7, 0 -; GCN-IR-NEXT: .LBB10_3: ; %udiv-do-while +; GCN-IR-NEXT: s_sub_u32 s10, 58, s10 +; GCN-IR-NEXT: s_subb_u32 s11, 0, 0 +; GCN-IR-NEXT: .LBB10_2: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 +; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 ; GCN-IR-NEXT: s_lshr_b32 s6, s3, 31 ; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[6:7] -; GCN-IR-NEXT: s_or_b64 s[2:3], s[12:13], s[2:3] -; GCN-IR-NEXT: s_sub_u32 s6, s14, s10 -; GCN-IR-NEXT: s_subb_u32 s6, s15, s11 -; GCN-IR-NEXT: s_ashr_i32 s12, s6, 31 -; GCN-IR-NEXT: s_mov_b32 s13, s12 -; GCN-IR-NEXT: s_and_b32 s6, s12, 1 -; GCN-IR-NEXT: s_and_b64 s[12:13], s[12:13], s[4:5] -; GCN-IR-NEXT: s_sub_u32 s10, s10, s12 -; GCN-IR-NEXT: s_subb_u32 s11, s11, s13 -; GCN-IR-NEXT: s_add_u32 s8, s8, 1 -; GCN-IR-NEXT: s_addc_u32 s9, s9, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[8:9], 0 -; GCN-IR-NEXT: s_mov_b64 s[12:13], s[6:7] +; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[6:7] +; GCN-IR-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3] +; GCN-IR-NEXT: s_sub_u32 s6, s14, s12 +; GCN-IR-NEXT: s_subb_u32 s6, s15, s13 +; GCN-IR-NEXT: s_ashr_i32 s8, s6, 31 +; GCN-IR-NEXT: s_mov_b32 s9, s8 +; GCN-IR-NEXT: s_and_b32 s6, s8, 1 +; GCN-IR-NEXT: s_and_b64 s[8:9], s[8:9], s[4:5] +; GCN-IR-NEXT: s_sub_u32 s12, s12, s8 +; GCN-IR-NEXT: s_subb_u32 s13, s13, s9 +; GCN-IR-NEXT: s_add_u32 s10, s10, 1 +; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[10:11], 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], s[6:7] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[16:17] -; GCN-IR-NEXT: s_cbranch_vccz .LBB10_3 -; GCN-IR-NEXT: .LBB10_4: ; %Flow6 +; GCN-IR-NEXT: s_cbranch_vccz .LBB10_2 +; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit ; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[2:3] -; GCN-IR-NEXT: .LBB10_5: ; %udiv-end -; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 +; GCN-IR-NEXT: s_or_b64 s[6:7], s[6:7], s[2:3] +; GCN-IR-NEXT: .LBB10_4: ; %udiv-end +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 ; GCN-IR-NEXT: v_mul_hi_u32 
v0, s4, v0 -; GCN-IR-NEXT: s_mul_i32 s6, s4, s9 -; GCN-IR-NEXT: s_mul_i32 s5, s5, s8 -; GCN-IR-NEXT: s_mul_i32 s4, s4, s8 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; GCN-IR-NEXT: s_mul_i32 s7, s4, s7 +; GCN-IR-NEXT: s_mul_i32 s5, s5, s6 +; GCN-IR-NEXT: s_mul_i32 s4, s4, s6 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s7, v0 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, s5, v0 ; GCN-IR-NEXT: v_sub_i32_e64 v0, vcc, 24, s4 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 @@ -1582,80 +1543,70 @@ define i64 @v_test_srem_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 -; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3 +; GCN-IR-NEXT: v_min_u32_e32 v6, v2, v3 ; GCN-IR-NEXT: s_movk_i32 s6, 0xffc5 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v10 -; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, s6, v6 +; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] -; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[2:3] +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5] ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_cndmask_b32_e64 v4, 24, 0, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v2, 24, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB11_6 +; GCN-IR-NEXT: s_cbranch_execz .LBB11_4 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GCN-IR-NEXT: v_lshl_b64 v[2:3], 24, v2 -; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB11_5 -; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 1, v4 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 63, v4 ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_lshr_b64 v[8:9], 24, v6 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 58, v10 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], 24, v4 +; GCN-IR-NEXT: v_lshr_b64 v[8:9], 24, v2 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc -; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 58, v6 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: .LBB11_3: ; %udiv-do-while +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc +; GCN-IR-NEXT: .LBB11_2: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 -; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8 -; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc -; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v12, v8 +; 
GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v13, v9, vcc +; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v2 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 -; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 -; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 +; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v10 ; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1 ; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 ; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 -; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB11_3 -; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB11_5: ; %Flow4 +; GCN-IR-NEXT: v_mov_b32_e32 v11, v3 +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v10, v2 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_execnz .LBB11_2 +; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 -; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2 -; GCN-IR-NEXT: .LBB11_6: ; %Flow5 +; GCN-IR-NEXT: v_lshl_b64 v[3:4], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v3, v2, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v2, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v3, v4 +; GCN-IR-NEXT: .LBB11_4: ; %Flow ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v5 -; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v4 -; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v4 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v4 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-IR-NEXT: v_mul_lo_u32 v3, v0, v3 +; GCN-IR-NEXT: v_mul_hi_u32 v4, v0, v2 +; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v2 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v2 +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc ; GCN-IR-NEXT: s_setpc_b64 s[30:31] @@ -1773,82 +1724,72 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 -; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3 +; GCN-IR-NEXT: v_min_u32_e32 v6, v2, v3 ; GCN-IR-NEXT: s_movk_i32 s6, 0xffd0 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v10 -; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, s6, v6 +; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] -; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[2:3] -; GCN-IR-NEXT: v_mov_b32_e32 v4, 0x8000 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5] +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0x8000 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: 
s_cbranch_execz .LBB12_6 +; GCN-IR-NEXT: s_cbranch_execz .LBB12_4 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2 -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[4:5], v2 -; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GCN-IR-NEXT: s_cbranch_execz .LBB12_5 -; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 1, v4 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 63, v4 ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 +; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v6 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v10 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[4:5], v4 +; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc -; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: .LBB12_3: ; %udiv-do-while +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc +; GCN-IR-NEXT: .LBB12_2: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 -; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8 -; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc -; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v12, v8 +; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v13, v9, vcc +; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v2 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 -; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 -; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 +; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v10 ; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1 ; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 ; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 -; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB12_3 -; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB12_5: ; %Flow4 +; GCN-IR-NEXT: v_mov_b32_e32 v11, v3 +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v10, v2 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_execnz .LBB12_2 +; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 -; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2 -; GCN-IR-NEXT: .LBB12_6: ; %Flow5 +; GCN-IR-NEXT: v_lshl_b64 v[3:4], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v3, v2, v3 +; GCN-IR-NEXT: 
v_mov_b32_e32 v2, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v3, v4 +; GCN-IR-NEXT: .LBB12_4: ; %Flow ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v5 -; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v4 -; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v4 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v4 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-IR-NEXT: v_mul_lo_u32 v3, v0, v3 +; GCN-IR-NEXT: v_mul_hi_u32 v4, v0, v2 +; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v2 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v2 +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc ; GCN-IR-NEXT: s_setpc_b64 s[30:31] @@ -1880,72 +1821,61 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 ; GCN-IR-NEXT: v_add_i32_e64 v2, s[4:5], 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 -; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3 -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 48, v10 -; GCN-IR-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5] +; GCN-IR-NEXT: v_min_u32_e32 v2, v2, v3 +; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], 48, v2 +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, s[4:5] ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[2:3] +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[6:7] ; GCN-IR-NEXT: v_mov_b32_e32 v13, v12 ; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[6:7] +; GCN-IR-NEXT: s_xor_b64 s[8:9], s[4:5], -1 +; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 ; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v1, 0, s[4:5] ; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v0, 0, s[4:5] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB13_6 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], vcc +; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz .LBB13_4 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2 -; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB13_5 -; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v6 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 0xffffffcf, v10 +; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v6 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 63, v6 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4 +; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v7 +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 0xffffffcf, v2 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_addc_u32_e64 v7, s[4:5], 0, -1, vcc -; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 -; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: .LBB13_3: ; %udiv-do-while +; GCN-IR-NEXT: s_movk_i32 s10, 0x7fff +; GCN-IR-NEXT: .LBB13_2: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 -; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 -; GCN-IR-NEXT: 
v_sub_i32_e32 v4, vcc, s12, v8 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v9, vcc +; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s10, v8 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, 0, v9, vcc ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 -; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 +; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v10 ; GCN-IR-NEXT: v_and_b32_e32 v10, 0x8000, v10 ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] -; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 +; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5 ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 +; GCN-IR-NEXT: v_mov_b32_e32 v11, v3 ; GCN-IR-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v9, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB13_3 -; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB13_5: ; %Flow4 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 -; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2 -; GCN-IR-NEXT: .LBB13_6: ; %Flow5 +; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN-IR-NEXT: v_mov_b32_e32 v10, v2 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: s_cbranch_execnz .LBB13_2 +; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v4, v2, v4 +; GCN-IR-NEXT: .LBB13_4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[4:5], 15 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index 9301170c034d8..652fbb3306a48 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -129,15 +129,15 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[4:5], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[2:3], 0 ; GCN-IR-NEXT: s_flbit_i32_b32 s10, s4 -; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] -; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2 ; GCN-IR-NEXT: s_flbit_i32_b32 s11, s5 ; GCN-IR-NEXT: s_add_i32 s10, s10, 32 -; GCN-IR-NEXT: s_add_i32 s6, s6, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3 -; GCN-IR-NEXT: s_min_u32 s10, s10, s11 -; GCN-IR-NEXT: s_min_u32 s16, s6, s7 -; GCN-IR-NEXT: s_sub_u32 s12, s10, s16 +; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] +; GCN-IR-NEXT: s_flbit_i32_b32 s7, s2 +; GCN-IR-NEXT: s_min_u32 s6, s10, s11 +; GCN-IR-NEXT: s_add_i32 s7, s7, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s10, s3 +; GCN-IR-NEXT: s_min_u32 s16, s7, s10 +; GCN-IR-NEXT: s_sub_u32 s12, s6, s16 ; GCN-IR-NEXT: s_subb_u32 s13, 0, 0 ; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[14:15], s[12:13], 63 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[12:13], 63 @@ -146,28 +146,21 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_cselect_b32 s9, 0, s3 ; GCN-IR-NEXT: s_cselect_b32 s8, 0, s2 ; GCN-IR-NEXT: 
s_or_b64 s[14:15], s[14:15], s[18:19] -; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 +; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[14:15] -; GCN-IR-NEXT: s_mov_b32 s11, 0 -; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 -; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s14, s12, 1 -; GCN-IR-NEXT: s_addc_u32 s15, s13, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[14:15], 0 -; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s12 +; GCN-IR-NEXT: s_mov_b32 s7, 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4 -; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[2:3], s14 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: s_add_i32 s13, s12, 1 +; GCN-IR-NEXT: s_sub_i32 s8, 63, s12 +; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 +; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[2:3], s13 ; GCN-IR-NEXT: s_add_u32 s14, s4, -1 ; GCN-IR-NEXT: s_addc_u32 s15, s5, -1 -; GCN-IR-NEXT: s_not_b64 s[2:3], s[10:11] +; GCN-IR-NEXT: s_not_b64 s[2:3], s[6:7] ; GCN-IR-NEXT: s_add_u32 s2, s2, s16 ; GCN-IR-NEXT: s_addc_u32 s3, s3, 0 -; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 -; GCN-IR-NEXT: s_mov_b32 s7, 0 -; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while +; GCN-IR-NEXT: .LBB0_2: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 ; GCN-IR-NEXT: s_lshr_b32 s6, s9, 31 @@ -187,11 +180,11 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[2:3], 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[6:7] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[16:17] -; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 -; GCN-IR-NEXT: .LBB0_4: ; %Flow7 +; GCN-IR-NEXT: s_cbranch_vccz .LBB0_2 +; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit ; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[8:9], 1 ; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[2:3] -; GCN-IR-NEXT: .LBB0_5: ; %udiv-end +; GCN-IR-NEXT: .LBB0_4: ; %udiv-end ; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 @@ -316,52 +309,44 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) { ; GCN-IR-LABEL: v_test_udiv_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v2 -; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 +; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v3 ; GCN-IR-NEXT: v_min_u32_e32 v10, v4, v5 ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0 -; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1 -; GCN-IR-NEXT: v_min_u32_e32 v11, v4, v5 -; GCN-IR-NEXT: v_sub_i32_e64 v6, s[6:7], v10, v11 -; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[6:7], 0, 0, s[6:7] -; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[6:7] -; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[6:7] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v1, 0, s[4:5] -; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v0, 0, s[4:5] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB1_6 -; GCN-IR-NEXT: ; 
%bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v6 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v7, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v6 -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] -; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4 -; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_min_u32_e32 v6, v4, v5 +; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v10, v6 +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[8:9], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[8:9] +; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[8:9] +; GCN-IR-NEXT: s_xor_b64 s[8:9], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB1_5 -; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v1, 0, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v0, 0, s[4:5] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], vcc +; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz .LBB1_4 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v8 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 63, v8 ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v2 -; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v8 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4 +; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v9 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v3, vcc ; GCN-IR-NEXT: v_not_b32_e32 v0, v10 ; GCN-IR-NEXT: v_not_b32_e32 v1, 0 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v11 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 -; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 -; GCN-IR-NEXT: .LBB1_3: ; %udiv-do-while +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-IR-NEXT: .LBB1_2: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5 @@ -381,21 +366,18 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 ; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5] ; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 -; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN-IR-NEXT: v_mov_b32_e32 v10, v6 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 -; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB1_5: ; %Flow4 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 -; GCN-IR-NEXT: v_or_b32_e32 v4, v7, v1 -; GCN-IR-NEXT: v_or_b32_e32 v5, v6, v0 -; GCN-IR-NEXT: .LBB1_6: ; %Flow5 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: s_cbranch_execnz .LBB1_2 +; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_mov_b32_e32 v0, v5 -; GCN-IR-NEXT: v_mov_b32_e32 v1, v4 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: .LBB1_4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v0, v4 +; GCN-IR-NEXT: v_mov_b32_e32 v1, v5 ; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = udiv i64 %x, %y ret i64 %result @@ -784,7 +766,6 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; GCN-IR-LABEL: 
s_test_udiv24_i48: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_and_b32 s3, s5, 0xffff ; GCN-IR-NEXT: s_and_b32 s2, s4, 0xff000000 @@ -796,16 +777,16 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; GCN-IR-NEXT: s_and_b32 s3, s3, 0xffff ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[2:3], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[8:9], 0 +; GCN-IR-NEXT: s_flbit_i32_b32 s10, s9 ; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7] ; GCN-IR-NEXT: s_flbit_i32_b32 s4, s2 ; GCN-IR-NEXT: s_add_i32 s4, s4, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s5, s3 -; GCN-IR-NEXT: s_min_u32 s10, s4, s5 -; GCN-IR-NEXT: s_flbit_i32_b32 s4, s8 -; GCN-IR-NEXT: s_add_i32 s4, s4, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s5, s9 -; GCN-IR-NEXT: s_min_u32 s16, s4, s5 -; GCN-IR-NEXT: s_sub_u32 s12, s10, s16 +; GCN-IR-NEXT: s_min_u32 s4, s4, s5 +; GCN-IR-NEXT: s_flbit_i32_b32 s5, s8 +; GCN-IR-NEXT: s_add_i32 s5, s5, 32 +; GCN-IR-NEXT: s_min_u32 s16, s5, s10 +; GCN-IR-NEXT: s_sub_u32 s12, s4, s16 ; GCN-IR-NEXT: s_subb_u32 s13, 0, 0 ; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[14:15], s[12:13], 63 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[12:13], 63 @@ -814,27 +795,21 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; GCN-IR-NEXT: s_cselect_b32 s7, 0, s9 ; GCN-IR-NEXT: s_cselect_b32 s6, 0, s8 ; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[18:19] -; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 +; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[14:15] -; GCN-IR-NEXT: s_cbranch_vccz .LBB7_5 -; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s14, s12, 1 -; GCN-IR-NEXT: s_addc_u32 s15, s13, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[14:15], 0 -; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[6:7] -; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[8:9], s12 +; GCN-IR-NEXT: s_mov_b32 s5, 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB7_4 -; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[8:9], s14 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: s_add_i32 s13, s12, 1 +; GCN-IR-NEXT: s_sub_i32 s6, 63, s12 +; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[8:9], s6 +; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[8:9], s13 ; GCN-IR-NEXT: s_add_u32 s14, s2, -1 ; GCN-IR-NEXT: s_addc_u32 s15, s3, -1 -; GCN-IR-NEXT: s_not_b64 s[4:5], s[10:11] -; GCN-IR-NEXT: s_add_u32 s8, s4, s16 -; GCN-IR-NEXT: s_addc_u32 s9, s5, 0 -; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 -; GCN-IR-NEXT: s_mov_b32 s5, 0 -; GCN-IR-NEXT: .LBB7_3: ; %udiv-do-while +; GCN-IR-NEXT: s_not_b64 s[8:9], s[4:5] +; GCN-IR-NEXT: s_add_u32 s8, s8, s16 +; GCN-IR-NEXT: s_addc_u32 s9, s9, 0 +; GCN-IR-NEXT: .LBB7_2: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 ; GCN-IR-NEXT: s_lshr_b32 s4, s7, 31 @@ -854,11 +829,11 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[8:9], 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[16:17] -; GCN-IR-NEXT: s_cbranch_vccz .LBB7_3 -; GCN-IR-NEXT: .LBB7_4: ; %Flow4 +; GCN-IR-NEXT: s_cbranch_vccz .LBB7_2 +; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit ; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[6:7], 1 ; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[2:3] -; GCN-IR-NEXT: .LBB7_5: ; %udiv-end +; GCN-IR-NEXT: .LBB7_4: ; %udiv-end ; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; 
GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 @@ -984,69 +959,62 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-LABEL: s_test_udiv_k_num_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 +; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_flbit_i32_b32 s8, s2 ; GCN-IR-NEXT: s_flbit_i32_b32 s9, s3 ; GCN-IR-NEXT: s_add_i32 s8, s8, 32 -; GCN-IR-NEXT: s_min_u32 s12, s8, s9 -; GCN-IR-NEXT: s_add_u32 s8, s12, 0xffffffc5 +; GCN-IR-NEXT: s_min_u32 s10, s8, s9 +; GCN-IR-NEXT: s_add_u32 s8, s10, 0xffffffc5 ; GCN-IR-NEXT: s_addc_u32 s9, 0, -1 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[2:3], 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[10:11], s[8:9], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[2:3], 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[12:13], s[8:9], 63 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[8:9], 63 -; GCN-IR-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] -; GCN-IR-NEXT: s_and_b64 s[6:7], s[10:11], exec -; GCN-IR-NEXT: s_cselect_b32 s6, 0, 24 -; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11] -; GCN-IR-NEXT: s_mov_b32 s7, 0 -; GCN-IR-NEXT: s_cbranch_vccz .LBB8_5 +; GCN-IR-NEXT: s_or_b64 s[12:13], s[4:5], s[12:13] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[12:13], exec +; GCN-IR-NEXT: s_cselect_b32 s4, 0, 24 +; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[12:13] +; GCN-IR-NEXT: s_mov_b32 s5, 0 +; GCN-IR-NEXT: s_cbranch_vccz .LBB8_4 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s10, s8, 1 -; GCN-IR-NEXT: s_addc_u32 s11, s9, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], 0 +; GCN-IR-NEXT: s_add_i32 s4, s8, 1 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[6:7] -; GCN-IR-NEXT: s_lshl_b64 s[6:7], 24, s8 -; GCN-IR-NEXT: s_cbranch_vccz .LBB8_4 -; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[10:11], 24, s10 +; GCN-IR-NEXT: s_lshl_b64 s[8:9], 24, s8 +; GCN-IR-NEXT: s_lshr_b64 s[12:13], 24, s4 ; GCN-IR-NEXT: s_add_u32 s14, s2, -1 ; GCN-IR-NEXT: s_addc_u32 s15, s3, -1 -; GCN-IR-NEXT: s_sub_u32 s8, 58, s12 -; GCN-IR-NEXT: s_subb_u32 s9, 0, 0 -; GCN-IR-NEXT: s_mov_b64 s[12:13], 0 -; GCN-IR-NEXT: s_mov_b32 s5, 0 -; GCN-IR-NEXT: .LBB8_3: ; %udiv-do-while +; GCN-IR-NEXT: s_sub_u32 s10, 58, s10 +; GCN-IR-NEXT: s_subb_u32 s11, 0, 0 +; GCN-IR-NEXT: .LBB8_2: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 -; GCN-IR-NEXT: s_lshr_b32 s4, s7, 31 -; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[4:5] -; GCN-IR-NEXT: s_or_b64 s[6:7], s[12:13], s[6:7] -; GCN-IR-NEXT: s_sub_u32 s4, s14, s10 -; GCN-IR-NEXT: s_subb_u32 s4, s15, s11 -; GCN-IR-NEXT: s_ashr_i32 s12, s4, 31 -; GCN-IR-NEXT: s_mov_b32 s13, s12 -; GCN-IR-NEXT: s_and_b32 s4, s12, 1 -; GCN-IR-NEXT: s_and_b64 s[12:13], s[12:13], s[2:3] -; GCN-IR-NEXT: s_sub_u32 s10, s10, s12 -; GCN-IR-NEXT: s_subb_u32 s11, s11, s13 -; GCN-IR-NEXT: s_add_u32 s8, s8, 1 -; GCN-IR-NEXT: s_addc_u32 s9, s9, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[8:9], 0 -; GCN-IR-NEXT: s_mov_b64 s[12:13], s[4:5] +; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 +; GCN-IR-NEXT: s_lshr_b32 s4, s9, 31 +; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 +; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[4:5] +; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] +; 
GCN-IR-NEXT: s_sub_u32 s4, s14, s12 +; GCN-IR-NEXT: s_subb_u32 s4, s15, s13 +; GCN-IR-NEXT: s_ashr_i32 s6, s4, 31 +; GCN-IR-NEXT: s_mov_b32 s7, s6 +; GCN-IR-NEXT: s_and_b32 s4, s6, 1 +; GCN-IR-NEXT: s_and_b64 s[6:7], s[6:7], s[2:3] +; GCN-IR-NEXT: s_sub_u32 s12, s12, s6 +; GCN-IR-NEXT: s_subb_u32 s13, s13, s7 +; GCN-IR-NEXT: s_add_u32 s10, s10, 1 +; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[10:11], 0 +; GCN-IR-NEXT: s_mov_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[16:17] -; GCN-IR-NEXT: s_cbranch_vccz .LBB8_3 -; GCN-IR-NEXT: .LBB8_4: ; %Flow6 -; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[2:3] -; GCN-IR-NEXT: .LBB8_5: ; %udiv-end -; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 +; GCN-IR-NEXT: s_cbranch_vccz .LBB8_2 +; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit +; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[8:9], 1 +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[2:3] +; GCN-IR-NEXT: .LBB8_4: ; %udiv-end +; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s5 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %result = udiv i64 24, %x @@ -1157,80 +1125,70 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) { ; GCN-IR-LABEL: v_test_udiv_pow2_k_num_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 -; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 -; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3 -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 0xffffffd0, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v2, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, v1 +; GCN-IR-NEXT: v_ffbh_u32_e32 v0, v2 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN-IR-NEXT: v_ffbh_u32_e32 v1, v3 +; GCN-IR-NEXT: v_min_u32_e32 v6, v0, v1 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 0xffffffd0, v6 ; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] ; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v3, 0x8000 +; GCN-IR-NEXT: v_mov_b32_e32 v0, 0x8000 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB9_6 +; GCN-IR-NEXT: s_cbranch_execz .LBB9_4 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v4 -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v4 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 63, v4 +; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v2 ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[4:5], v2 -; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GCN-IR-NEXT: s_cbranch_execz .LBB9_5 -; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 
-; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v6 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v10 +; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v3, vcc +; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[4:5], v4 +; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v0 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc -; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: .LBB9_3: ; %udiv-do-while +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc +; GCN-IR-NEXT: .LBB9_2: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 -; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8 -; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc -; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 31, v5 +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v0 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v12, v8 +; GCN-IR-NEXT: v_subb_u32_e32 v0, vcc, v13, v9, vcc +; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v0 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 -; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 -; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 -; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1 -; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0 +; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5 +; GCN-IR-NEXT: v_and_b32_e32 v0, 1, v10 +; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v3 +; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 ; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 -; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB9_3 -; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB9_5: ; %Flow4 +; GCN-IR-NEXT: v_mov_b32_e32 v11, v1 +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v10, v0 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_execnz .LBB9_2 +; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 -; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v1 -; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v0 -; GCN-IR-NEXT: .LBB9_6: ; %Flow5 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_mov_b32_e32 v0, v3 +; GCN-IR-NEXT: v_lshl_b64 v[1:2], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v1, v0, v1 +; GCN-IR-NEXT: v_mov_b32_e32 v0, v1 ; GCN-IR-NEXT: v_mov_b32_e32 v1, v2 +; GCN-IR-NEXT: .LBB9_4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = udiv i64 32768, %x ret i64 %result @@ -1250,45 +1208,37 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 ; GCN-IR-NEXT: v_add_i32_e64 v2, s[4:5], 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 -; GCN-IR-NEXT: v_min_u32_e32 v8, v2, v3 -; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 48, v8 -; GCN-IR-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, s[4:5] +; GCN-IR-NEXT: v_min_u32_e32 v4, v2, v3 +; 
GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], 48, v4 +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, s[4:5] ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[4:5] +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[6:7] +; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v1, 0, s[4:5] -; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB10_6 +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[6:7] +; GCN-IR-NEXT: s_xor_b64 s[8:9], s[4:5], -1 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v1, 0, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[4:5] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], vcc +; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz .LBB10_4 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v4 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4 -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v6 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 63, v6 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2 -; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB10_5 -; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: v_lshr_b64 v[6:7], v[0:1], v6 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 0xffffffcf, v8 +; GCN-IR-NEXT: v_lshr_b64 v[6:7], v[0:1], v7 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 0xffffffcf, v4 ; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 ; GCN-IR-NEXT: v_addc_u32_e64 v1, s[4:5], 0, -1, vcc -; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 -; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: .LBB10_3: ; %udiv-do-while +; GCN-IR-NEXT: s_movk_i32 s10, 0x7fff +; GCN-IR-NEXT: .LBB10_2: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 ; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s12, v6 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s10, v6 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v0 @@ -1302,21 +1252,18 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], v6, v8 ; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 ; GCN-IR-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v7, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB10_3 -; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB10_5: ; %Flow4 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 -; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v1 -; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v0 -; GCN-IR-NEXT: .LBB10_6: ; %Flow5 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: s_cbranch_execnz .LBB10_2 +; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] 
-; GCN-IR-NEXT: v_mov_b32_e32 v0, v3 -; GCN-IR-NEXT: v_mov_b32_e32 v1, v2 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; GCN-IR-NEXT: v_or_b32_e32 v2, v4, v2 +; GCN-IR-NEXT: .LBB10_4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v0, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v1, v3 ; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = udiv i64 %x, 32768 ret i64 %result @@ -1405,66 +1352,60 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-LABEL: s_test_udiv_k_den_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2 ; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3 ; GCN-IR-NEXT: s_add_i32 s6, s6, 32 ; GCN-IR-NEXT: s_min_u32 s12, s6, s7 -; GCN-IR-NEXT: s_sub_u32 s8, 59, s12 -; GCN-IR-NEXT: s_subb_u32 s9, 0, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[2:3], 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[6:7], s[8:9], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[8:9], 63 -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: s_and_b64 s[6:7], s[4:5], exec -; GCN-IR-NEXT: s_cselect_b32 s7, 0, s3 -; GCN-IR-NEXT: s_cselect_b32 s6, 0, s2 -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[10:11] -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 -; GCN-IR-NEXT: s_cbranch_vccz .LBB11_5 -; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s10, s8, 1 -; GCN-IR-NEXT: s_addc_u32 s11, s9, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], 0 -; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[6:7] -; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[2:3], s8 +; GCN-IR-NEXT: s_sub_u32 s6, 59, s12 +; GCN-IR-NEXT: s_subb_u32 s7, 0, 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[2:3], 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[10:11], s[6:7], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[6:7], 63 +; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] +; GCN-IR-NEXT: s_and_b64 s[8:9], s[10:11], exec +; GCN-IR-NEXT: s_cselect_b32 s9, 0, s3 +; GCN-IR-NEXT: s_cselect_b32 s8, 0, s2 +; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] +; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11] +; GCN-IR-NEXT: s_mov_b32 s7, 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB11_4 -; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[8:9], s[2:3], s10 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: s_add_i32 s10, s6, 1 +; GCN-IR-NEXT: s_sub_i32 s6, 63, s6 +; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s6 +; GCN-IR-NEXT: s_lshr_b64 s[10:11], s[2:3], s10 ; GCN-IR-NEXT: s_add_u32 s2, s12, 0xffffffc4 ; GCN-IR-NEXT: s_addc_u32 s3, 0, -1 -; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 -; GCN-IR-NEXT: s_mov_b32 s5, 0 -; GCN-IR-NEXT: .LBB11_3: ; %udiv-do-while +; GCN-IR-NEXT: .LBB11_2: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 +; GCN-IR-NEXT: s_lshr_b32 s6, s9, 31 ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 -; GCN-IR-NEXT: s_lshr_b32 s4, s7, 31 -; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[8:9], s[8:9], s[4:5] -; GCN-IR-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] -; GCN-IR-NEXT: s_sub_u32 s4, 23, s8 -; GCN-IR-NEXT: s_subb_u32 s4, 0, s9 -; GCN-IR-NEXT: s_ashr_i32 s10, s4, 31 -; GCN-IR-NEXT: s_and_b32 s4, s10, 1 -; GCN-IR-NEXT: s_and_b32 s10, s10, 24 -; GCN-IR-NEXT: s_sub_u32 s8, s8, s10 -; GCN-IR-NEXT: s_subb_u32 s9, s9, 0 +; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[6:7] +; GCN-IR-NEXT: s_or_b64 s[8:9], 
s[4:5], s[8:9] +; GCN-IR-NEXT: s_sub_u32 s4, 23, s10 +; GCN-IR-NEXT: s_subb_u32 s4, 0, s11 +; GCN-IR-NEXT: s_ashr_i32 s4, s4, 31 +; GCN-IR-NEXT: s_and_b32 s6, s4, 1 +; GCN-IR-NEXT: s_and_b32 s4, s4, 24 +; GCN-IR-NEXT: s_sub_u32 s10, s10, s4 +; GCN-IR-NEXT: s_subb_u32 s11, s11, 0 ; GCN-IR-NEXT: s_add_u32 s2, s2, 1 ; GCN-IR-NEXT: s_addc_u32 s3, s3, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[2:3], 0 -; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] +; GCN-IR-NEXT: s_mov_b64 s[4:5], s[6:7] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[12:13] -; GCN-IR-NEXT: s_cbranch_vccz .LBB11_3 -; GCN-IR-NEXT: .LBB11_4: ; %Flow6 -; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[2:3] -; GCN-IR-NEXT: .LBB11_5: ; %udiv-end -; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 +; GCN-IR-NEXT: s_cbranch_vccz .LBB11_2 +; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit +; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[8:9], 1 +; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[2:3] +; GCN-IR-NEXT: .LBB11_4: ; %udiv-end +; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %result = udiv i64 %x, 24 @@ -1551,39 +1492,31 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 ; GCN-IR-NEXT: v_add_i32_e64 v2, s[4:5], 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 -; GCN-IR-NEXT: v_min_u32_e32 v8, v2, v3 -; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 59, v8 -; GCN-IR-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, s[4:5] +; GCN-IR-NEXT: v_min_u32_e32 v4, v2, v3 +; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], 59, v4 +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, s[4:5] ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[4:5] +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[6:7] +; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v1, 0, s[4:5] -; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB12_6 +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[6:7] +; GCN-IR-NEXT: s_xor_b64 s[8:9], s[4:5], -1 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v1, 0, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[4:5] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], vcc +; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz .LBB12_4 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v4 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4 -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v6 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 63, v6 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2 -; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB12_5 -; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: v_lshr_b64 v[6:7], v[0:1], v6 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc4, v8 +; GCN-IR-NEXT: v_lshr_b64 v[6:7], v[0:1], v7 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc4, v4 ; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 -; 
GCN-IR-NEXT: v_addc_u32_e64 v1, s[4:5], 0, -1, vcc -; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: .LBB12_3: ; %udiv-do-while +; GCN-IR-NEXT: v_addc_u32_e64 v1, s[4:5], 0, -1, vcc +; GCN-IR-NEXT: .LBB12_2: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 @@ -1602,21 +1535,18 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], v6, v8 ; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 ; GCN-IR-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v7, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB12_3 -; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB12_5: ; %Flow4 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 -; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v1 -; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v0 -; GCN-IR-NEXT: .LBB12_6: ; %Flow5 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: s_cbranch_execnz .LBB12_2 +; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_mov_b32_e32 v0, v3 -; GCN-IR-NEXT: v_mov_b32_e32 v1, v2 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; GCN-IR-NEXT: v_or_b32_e32 v2, v4, v2 +; GCN-IR-NEXT: .LBB12_4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v0, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v1, v3 ; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = udiv i64 %x, 24 ret i64 %result diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index 784993ccd3bd1..6cf9e63043187 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -124,73 +124,66 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b64 s[12:13], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[4:5], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[2:3], 0 ; GCN-IR-NEXT: s_flbit_i32_b32 s10, s4 -; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] -; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2 ; GCN-IR-NEXT: s_flbit_i32_b32 s11, s5 ; GCN-IR-NEXT: s_add_i32 s10, s10, 32 -; GCN-IR-NEXT: s_add_i32 s6, s6, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3 -; GCN-IR-NEXT: s_min_u32 s10, s10, s11 -; GCN-IR-NEXT: s_min_u32 s18, s6, s7 -; GCN-IR-NEXT: s_sub_u32 s12, s10, s18 -; GCN-IR-NEXT: s_subb_u32 s13, 0, 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[14:15], s[12:13], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[12:13], 63 +; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] +; GCN-IR-NEXT: s_flbit_i32_b32 s7, s2 +; GCN-IR-NEXT: s_min_u32 s6, s10, s11 +; GCN-IR-NEXT: s_add_i32 s7, s7, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s10, s3 +; GCN-IR-NEXT: s_min_u32 s18, s7, s10 +; GCN-IR-NEXT: s_sub_u32 s10, s6, s18 +; GCN-IR-NEXT: s_subb_u32 s11, 0, 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[14:15], s[10:11], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[10:11], 63 ; GCN-IR-NEXT: s_or_b64 s[14:15], s[8:9], s[14:15] ; GCN-IR-NEXT: s_and_b64 s[8:9], s[14:15], exec ; GCN-IR-NEXT: s_cselect_b32 s9, 0, s3 ; GCN-IR-NEXT: s_cselect_b32 s8, 0, s2 ; GCN-IR-NEXT: 
s_or_b64 s[14:15], s[14:15], s[16:17] -; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[14:15] -; GCN-IR-NEXT: s_mov_b32 s11, 0 -; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 -; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s14, s12, 1 -; GCN-IR-NEXT: s_addc_u32 s15, s13, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[14:15], 0 -; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s12 +; GCN-IR-NEXT: s_mov_b32 s7, 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4 -; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[2:3], s14 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: s_add_i32 s11, s10, 1 +; GCN-IR-NEXT: s_sub_i32 s8, 63, s10 +; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 +; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[2:3], s11 ; GCN-IR-NEXT: s_add_u32 s16, s4, -1 ; GCN-IR-NEXT: s_addc_u32 s17, s5, -1 -; GCN-IR-NEXT: s_not_b64 s[6:7], s[10:11] -; GCN-IR-NEXT: s_add_u32 s10, s6, s18 -; GCN-IR-NEXT: s_addc_u32 s11, s7, 0 -; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 -; GCN-IR-NEXT: s_mov_b32 s7, 0 -; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while +; GCN-IR-NEXT: s_not_b64 s[10:11], s[6:7] +; GCN-IR-NEXT: s_add_u32 s10, s10, s18 +; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 +; GCN-IR-NEXT: .LBB0_2: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 +; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1 ; GCN-IR-NEXT: s_lshr_b32 s6, s9, 31 ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[6:7] -; GCN-IR-NEXT: s_or_b64 s[8:9], s[14:15], s[8:9] -; GCN-IR-NEXT: s_sub_u32 s6, s16, s12 -; GCN-IR-NEXT: s_subb_u32 s6, s17, s13 -; GCN-IR-NEXT: s_ashr_i32 s14, s6, 31 -; GCN-IR-NEXT: s_mov_b32 s15, s14 -; GCN-IR-NEXT: s_and_b32 s6, s14, 1 -; GCN-IR-NEXT: s_and_b64 s[14:15], s[14:15], s[4:5] -; GCN-IR-NEXT: s_sub_u32 s12, s12, s14 -; GCN-IR-NEXT: s_subb_u32 s13, s13, s15 +; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[6:7] +; GCN-IR-NEXT: s_or_b64 s[8:9], s[12:13], s[8:9] +; GCN-IR-NEXT: s_sub_u32 s6, s16, s14 +; GCN-IR-NEXT: s_subb_u32 s6, s17, s15 +; GCN-IR-NEXT: s_ashr_i32 s12, s6, 31 +; GCN-IR-NEXT: s_mov_b32 s13, s12 +; GCN-IR-NEXT: s_and_b32 s6, s12, 1 +; GCN-IR-NEXT: s_and_b64 s[12:13], s[12:13], s[4:5] +; GCN-IR-NEXT: s_sub_u32 s14, s14, s12 +; GCN-IR-NEXT: s_subb_u32 s15, s15, s13 ; GCN-IR-NEXT: s_add_u32 s10, s10, 1 ; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[10:11], 0 -; GCN-IR-NEXT: s_mov_b64 s[14:15], s[6:7] +; GCN-IR-NEXT: s_mov_b64 s[12:13], s[6:7] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19] -; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 -; GCN-IR-NEXT: .LBB0_4: ; %Flow7 +; GCN-IR-NEXT: s_cbranch_vccz .LBB0_2 +; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 ; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] -; GCN-IR-NEXT: .LBB0_5: ; %udiv-end +; GCN-IR-NEXT: .LBB0_4: ; %udiv-end ; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 ; GCN-IR-NEXT: v_mul_hi_u32 v0, s4, v0 ; GCN-IR-NEXT: s_mov_b32 s12, s0 @@ -325,84 +318,73 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) { ; GCN-IR-LABEL: v_test_urem_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v2 -; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 +; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-IR-NEXT: 
v_add_i32_e32 v4, vcc, 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v3 ; GCN-IR-NEXT: v_min_u32_e32 v12, v4, v5 ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0 -; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1 -; GCN-IR-NEXT: v_min_u32_e32 v13, v4, v5 -; GCN-IR-NEXT: v_sub_i32_e64 v4, s[6:7], v12, v13 -; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_subb_u32_e64 v5, s[6:7], 0, 0, s[6:7] -; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[4:5] -; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GCN-IR-NEXT: v_min_u32_e32 v4, v4, v5 +; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v12, v4 +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[8:9], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[8:9] +; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[8:9] +; GCN-IR-NEXT: s_xor_b64 s[8:9], s[4:5], -1 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: v_cndmask_b32_e64 v7, v1, 0, s[4:5] ; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v0, 0, s[4:5] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB1_6 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], vcc +; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz .LBB1_4 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4 -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] -; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4 -; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB1_5 -; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v8 +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 63, v8 ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v3, vcc -; GCN-IR-NEXT: v_not_b32_e32 v7, v12 -; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[0:1], v8 -; GCN-IR-NEXT: v_not_b32_e32 v6, 0 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v7, v13 +; GCN-IR-NEXT: v_not_b32_e32 v8, v12 +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[0:1], v6 +; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[0:1], v9 +; GCN-IR-NEXT: v_not_b32_e32 v9, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v6, vcc -; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v8, v4 ; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 -; GCN-IR-NEXT: .LBB1_3: ; %udiv-do-while +; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; GCN-IR-NEXT: .LBB1_2: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5 -; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6 -; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v14, v10 -; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v15, v11, vcc -; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v7 +; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4 +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_sub_i32_e32 v4, 
vcc, v14, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v15, v11, vcc +; GCN-IR-NEXT: v_or_b32_e32 v6, v12, v6 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v4 ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v8 -; GCN-IR-NEXT: v_or_b32_e32 v5, v13, v5 -; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12 +; GCN-IR-NEXT: v_or_b32_e32 v7, v13, v7 +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v12 ; GCN-IR-NEXT: v_and_b32_e32 v13, v12, v3 ; GCN-IR-NEXT: v_and_b32_e32 v12, v12, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] ; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v12 ; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v13, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v13, v7 -; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v12, v6 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 -; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB1_5: ; %Flow4 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 -; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v5 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4 -; GCN-IR-NEXT: .LBB1_6: ; %Flow5 +; GCN-IR-NEXT: v_mov_b32_e32 v13, v5 +; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN-IR-NEXT: v_mov_b32_e32 v12, v4 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: s_cbranch_execnz .LBB1_2 +; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_or_b32_e32 v6, v4, v6 +; GCN-IR-NEXT: .LBB1_4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_mul_lo_u32 v4, v2, v7 ; GCN-IR-NEXT: v_mul_hi_u32 v5, v2, v6 ; GCN-IR-NEXT: v_mul_lo_u32 v3, v3, v6 @@ -812,74 +794,67 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-LABEL: s_test_urem_k_num_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 +; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_flbit_i32_b32 s8, s2 ; GCN-IR-NEXT: s_flbit_i32_b32 s9, s3 ; GCN-IR-NEXT: s_add_i32 s8, s8, 32 -; GCN-IR-NEXT: s_min_u32 s12, s8, s9 -; GCN-IR-NEXT: s_add_u32 s8, s12, 0xffffffc5 +; GCN-IR-NEXT: s_min_u32 s10, s8, s9 +; GCN-IR-NEXT: s_add_u32 s8, s10, 0xffffffc5 ; GCN-IR-NEXT: s_addc_u32 s9, 0, -1 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[2:3], 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[10:11], s[8:9], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[2:3], 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[12:13], s[8:9], 63 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[8:9], 63 -; GCN-IR-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] -; GCN-IR-NEXT: s_and_b64 s[6:7], s[10:11], exec -; GCN-IR-NEXT: s_cselect_b32 s6, 0, 24 -; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11] -; GCN-IR-NEXT: s_mov_b32 s7, 0 -; GCN-IR-NEXT: s_cbranch_vccz .LBB6_5 +; GCN-IR-NEXT: s_or_b64 s[12:13], s[4:5], s[12:13] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[12:13], exec +; GCN-IR-NEXT: s_cselect_b32 s4, 0, 24 +; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[12:13] +; GCN-IR-NEXT: s_mov_b32 s5, 0 +; GCN-IR-NEXT: s_cbranch_vccz .LBB6_4 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s10, s8, 1 -; GCN-IR-NEXT: s_addc_u32 s11, s9, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], 0 +; GCN-IR-NEXT: s_add_i32 s4, s8, 1 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 -; 
GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[6:7] -; GCN-IR-NEXT: s_lshl_b64 s[6:7], 24, s8 -; GCN-IR-NEXT: s_cbranch_vccz .LBB6_4 -; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[10:11], 24, s10 +; GCN-IR-NEXT: s_lshl_b64 s[8:9], 24, s8 +; GCN-IR-NEXT: s_lshr_b64 s[12:13], 24, s4 ; GCN-IR-NEXT: s_add_u32 s14, s2, -1 ; GCN-IR-NEXT: s_addc_u32 s15, s3, -1 -; GCN-IR-NEXT: s_sub_u32 s8, 58, s12 -; GCN-IR-NEXT: s_subb_u32 s9, 0, 0 -; GCN-IR-NEXT: s_mov_b64 s[12:13], 0 -; GCN-IR-NEXT: s_mov_b32 s5, 0 -; GCN-IR-NEXT: .LBB6_3: ; %udiv-do-while +; GCN-IR-NEXT: s_sub_u32 s10, 58, s10 +; GCN-IR-NEXT: s_subb_u32 s11, 0, 0 +; GCN-IR-NEXT: .LBB6_2: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 -; GCN-IR-NEXT: s_lshr_b32 s4, s7, 31 -; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[4:5] -; GCN-IR-NEXT: s_or_b64 s[6:7], s[12:13], s[6:7] -; GCN-IR-NEXT: s_sub_u32 s4, s14, s10 -; GCN-IR-NEXT: s_subb_u32 s4, s15, s11 -; GCN-IR-NEXT: s_ashr_i32 s12, s4, 31 -; GCN-IR-NEXT: s_mov_b32 s13, s12 -; GCN-IR-NEXT: s_and_b32 s4, s12, 1 -; GCN-IR-NEXT: s_and_b64 s[12:13], s[12:13], s[2:3] -; GCN-IR-NEXT: s_sub_u32 s10, s10, s12 -; GCN-IR-NEXT: s_subb_u32 s11, s11, s13 -; GCN-IR-NEXT: s_add_u32 s8, s8, 1 -; GCN-IR-NEXT: s_addc_u32 s9, s9, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[8:9], 0 -; GCN-IR-NEXT: s_mov_b64 s[12:13], s[4:5] +; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 +; GCN-IR-NEXT: s_lshr_b32 s4, s9, 31 +; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 +; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[4:5] +; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] +; GCN-IR-NEXT: s_sub_u32 s4, s14, s12 +; GCN-IR-NEXT: s_subb_u32 s4, s15, s13 +; GCN-IR-NEXT: s_ashr_i32 s6, s4, 31 +; GCN-IR-NEXT: s_mov_b32 s7, s6 +; GCN-IR-NEXT: s_and_b32 s4, s6, 1 +; GCN-IR-NEXT: s_and_b64 s[6:7], s[6:7], s[2:3] +; GCN-IR-NEXT: s_sub_u32 s12, s12, s6 +; GCN-IR-NEXT: s_subb_u32 s13, s13, s7 +; GCN-IR-NEXT: s_add_u32 s10, s10, 1 +; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[10:11], 0 +; GCN-IR-NEXT: s_mov_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[16:17] -; GCN-IR-NEXT: s_cbranch_vccz .LBB6_3 -; GCN-IR-NEXT: .LBB6_4: ; %Flow6 -; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7] -; GCN-IR-NEXT: .LBB6_5: ; %udiv-end -; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 +; GCN-IR-NEXT: s_cbranch_vccz .LBB6_2 +; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit +; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[8:9], 1 +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GCN-IR-NEXT: .LBB6_4: ; %udiv-end +; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 ; GCN-IR-NEXT: v_mul_hi_u32 v0, s2, v0 ; GCN-IR-NEXT: s_mov_b32 s8, s0 -; GCN-IR-NEXT: s_mul_i32 s0, s2, s7 +; GCN-IR-NEXT: s_mul_i32 s0, s2, s5 ; GCN-IR-NEXT: s_mov_b32 s11, 0xf000 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GCN-IR-NEXT: s_mul_i32 s0, s3, s6 +; GCN-IR-NEXT: s_mul_i32 s0, s3, s4 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, s0, v0 -; GCN-IR-NEXT: s_mul_i32 s0, s2, s6 +; GCN-IR-NEXT: s_mul_i32 s0, s2, s4 ; GCN-IR-NEXT: v_sub_i32_e64 v0, vcc, 24, s0 ; GCN-IR-NEXT: s_mov_b32 s10, -1 ; GCN-IR-NEXT: s_mov_b32 s9, s1 @@ -972,75 +947,69 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-LABEL: s_test_urem_k_den_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; 
GCN-IR-NEXT: s_flbit_i32_b32 s6, s2 ; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3 ; GCN-IR-NEXT: s_add_i32 s6, s6, 32 -; GCN-IR-NEXT: s_min_u32 s12, s6, s7 -; GCN-IR-NEXT: s_sub_u32 s8, 59, s12 -; GCN-IR-NEXT: s_subb_u32 s9, 0, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[2:3], 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[6:7], s[8:9], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[8:9], 63 -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: s_and_b64 s[6:7], s[4:5], exec -; GCN-IR-NEXT: s_cselect_b32 s7, 0, s3 -; GCN-IR-NEXT: s_cselect_b32 s6, 0, s2 -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[10:11] -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 -; GCN-IR-NEXT: s_cbranch_vccz .LBB7_5 -; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s10, s8, 1 -; GCN-IR-NEXT: s_addc_u32 s11, s9, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], 0 -; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[6:7] -; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[2:3], s8 +; GCN-IR-NEXT: s_min_u32 s10, s6, s7 +; GCN-IR-NEXT: s_sub_u32 s6, 59, s10 +; GCN-IR-NEXT: s_subb_u32 s7, 0, 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[2:3], 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[12:13], s[6:7], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[6:7], 63 +; GCN-IR-NEXT: s_or_b64 s[12:13], s[8:9], s[12:13] +; GCN-IR-NEXT: s_and_b64 s[8:9], s[12:13], exec +; GCN-IR-NEXT: s_cselect_b32 s9, 0, s3 +; GCN-IR-NEXT: s_cselect_b32 s8, 0, s2 +; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[12:13] +; GCN-IR-NEXT: s_mov_b32 s7, 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB7_4 -; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[10:11], s[2:3], s10 -; GCN-IR-NEXT: s_add_u32 s8, s12, 0xffffffc4 -; GCN-IR-NEXT: s_addc_u32 s9, 0, -1 -; GCN-IR-NEXT: s_mov_b64 s[12:13], 0 -; GCN-IR-NEXT: s_mov_b32 s5, 0 -; GCN-IR-NEXT: .LBB7_3: ; %udiv-do-while +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: s_add_i32 s11, s6, 1 +; GCN-IR-NEXT: s_sub_i32 s6, 63, s6 +; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s6 +; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[2:3], s11 +; GCN-IR-NEXT: s_add_u32 s10, s10, 0xffffffc4 +; GCN-IR-NEXT: s_addc_u32 s11, 0, -1 +; GCN-IR-NEXT: .LBB7_2: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 -; GCN-IR-NEXT: s_lshr_b32 s4, s7, 31 -; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[4:5] -; GCN-IR-NEXT: s_or_b64 s[6:7], s[12:13], s[6:7] -; GCN-IR-NEXT: s_sub_u32 s4, 23, s10 -; GCN-IR-NEXT: s_subb_u32 s4, 0, s11 -; GCN-IR-NEXT: s_ashr_i32 s12, s4, 31 -; GCN-IR-NEXT: s_and_b32 s4, s12, 1 -; GCN-IR-NEXT: s_and_b32 s12, s12, 24 -; GCN-IR-NEXT: s_sub_u32 s10, s10, s12 -; GCN-IR-NEXT: s_subb_u32 s11, s11, 0 -; GCN-IR-NEXT: s_add_u32 s8, s8, 1 -; GCN-IR-NEXT: s_addc_u32 s9, s9, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[8:9], 0 -; GCN-IR-NEXT: s_mov_b64 s[12:13], s[4:5] +; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 +; GCN-IR-NEXT: s_lshr_b32 s6, s9, 31 +; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 +; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[6:7] +; GCN-IR-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] +; GCN-IR-NEXT: s_sub_u32 s4, 23, s12 +; GCN-IR-NEXT: s_subb_u32 s4, 0, s13 +; GCN-IR-NEXT: s_ashr_i32 s4, s4, 31 +; GCN-IR-NEXT: s_and_b32 s6, s4, 1 +; GCN-IR-NEXT: s_and_b32 s4, s4, 24 +; GCN-IR-NEXT: s_sub_u32 s12, s12, s4 +; GCN-IR-NEXT: s_subb_u32 s13, s13, 0 +; GCN-IR-NEXT: s_add_u32 s10, s10, 1 +; GCN-IR-NEXT: s_addc_u32 s11, 
s11, 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[10:11], 0 +; GCN-IR-NEXT: s_mov_b64 s[4:5], s[6:7] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[14:15] -; GCN-IR-NEXT: s_cbranch_vccz .LBB7_3 -; GCN-IR-NEXT: .LBB7_4: ; %Flow6 -; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7] -; GCN-IR-NEXT: .LBB7_5: ; %udiv-end -; GCN-IR-NEXT: v_mul_hi_u32 v0, s6, 24 -; GCN-IR-NEXT: s_mov_b32 s8, s0 -; GCN-IR-NEXT: s_mul_i32 s0, s7, 24 +; GCN-IR-NEXT: s_cbranch_vccz .LBB7_2 +; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit +; GCN-IR-NEXT: s_lshl_b64 s[4:5], s[8:9], 1 +; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[4:5] +; GCN-IR-NEXT: .LBB7_4: ; %udiv-end +; GCN-IR-NEXT: v_mul_hi_u32 v0, s8, 24 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_mul_i32 s0, s9, 24 ; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, s0, v0 -; GCN-IR-NEXT: s_mul_i32 s0, s6, 24 +; GCN-IR-NEXT: s_mul_i32 s0, s8, 24 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GCN-IR-NEXT: s_mov_b32 s11, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s10, -1 -; GCN-IR-NEXT: s_mov_b32 s9, s1 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %result = urem i64 %x, 24 store i64 %result, ptr addrspace(1) %out @@ -1153,81 +1122,71 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 -; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 0xffffffd0, v10 -; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_min_u32_e32 v6, v2, v3 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 0xffffffd0, v6 +; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] -; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[2:3] -; GCN-IR-NEXT: v_mov_b32_e32 v4, 0x8000 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5] +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0x8000 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB8_6 +; GCN-IR-NEXT: s_cbranch_execz .LBB8_4 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2 -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[4:5], v2 -; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GCN-IR-NEXT: s_cbranch_execz .LBB8_5 -; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 1, v4 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 63, v4 ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 +; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: 
v_addc_u32_e32 v13, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v6 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v10 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[4:5], v4 +; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc -; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: .LBB8_3: ; %udiv-do-while +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc +; GCN-IR-NEXT: .LBB8_2: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 -; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8 -; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc -; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v12, v8 +; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v13, v9, vcc +; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v2 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 -; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 -; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 +; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v10 ; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1 ; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 ; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 -; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB8_3 -; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB8_5: ; %Flow4 +; GCN-IR-NEXT: v_mov_b32_e32 v11, v3 +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v10, v2 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_execnz .LBB8_2 +; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 -; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2 -; GCN-IR-NEXT: .LBB8_6: ; %Flow5 +; GCN-IR-NEXT: v_lshl_b64 v[3:4], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v3, v2, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v2, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v3, v4 +; GCN-IR-NEXT: .LBB8_4: ; %Flow ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v5 -; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v4 -; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v4 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v4 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-IR-NEXT: v_mul_lo_u32 v3, v0, v3 +; GCN-IR-NEXT: v_mul_hi_u32 v4, v0, v2 +; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v2 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v2 +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc ; GCN-IR-NEXT: s_setpc_b64 s[30:31] @@ -1249,71 +1208,60 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 
%x) { ; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 ; GCN-IR-NEXT: v_add_i32_e64 v2, s[4:5], 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 -; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3 -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 48, v10 -; GCN-IR-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5] +; GCN-IR-NEXT: v_min_u32_e32 v2, v2, v3 +; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], 48, v2 +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, s[4:5] ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[2:3] +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[6:7] +; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[6:7] +; GCN-IR-NEXT: s_xor_b64 s[8:9], s[4:5], -1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 ; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v1, 0, s[4:5] ; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v0, 0, s[4:5] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB9_6 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], vcc +; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz .LBB9_4 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2 -; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB9_5 -; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v6 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 0xffffffcf, v10 +; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v6 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 63, v6 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4 +; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v7 +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 0xffffffcf, v2 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_addc_u32_e64 v7, s[4:5], 0, -1, vcc -; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 -; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: .LBB9_3: ; %udiv-do-while +; GCN-IR-NEXT: s_movk_i32 s10, 0x7fff +; GCN-IR-NEXT: .LBB9_2: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 -; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s12, v8 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v9, vcc +; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s10, v8 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, 0, v9, vcc ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 -; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 +; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v10 ; GCN-IR-NEXT: v_and_b32_e32 v10, 0x8000, v10 ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] -; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 +; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5 ; 
GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 +; GCN-IR-NEXT: v_mov_b32_e32 v11, v3 ; GCN-IR-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v9, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB9_3 -; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB9_5: ; %Flow4 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 -; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2 -; GCN-IR-NEXT: .LBB9_6: ; %Flow5 +; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN-IR-NEXT: v_mov_b32_e32 v10, v2 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: s_cbranch_execnz .LBB9_2 +; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v4, v2, v4 +; GCN-IR-NEXT: .LBB9_4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[4:5], 15 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc diff --git a/llvm/test/CodeGen/PowerPC/reduce_cr.ll b/llvm/test/CodeGen/PowerPC/reduce_cr.ll index 7491d13c53010..e5761de12670f 100644 --- a/llvm/test/CodeGen/PowerPC/reduce_cr.ll +++ b/llvm/test/CodeGen/PowerPC/reduce_cr.ll @@ -7,7 +7,7 @@ target triple = "powerpc64le-grtev4-linux-gnu" ;CHECK-NEXT: - BB0[entry]: float = 1.0, int = {{.*}} ;CHECK-NEXT: - BB1[for.check]: float = 2.6667, int = {{.*}} ;CHECK-NEXT: - BB2[test1]: float = 1.6667, int = {{.*}} -;CHECK-NEXT: - BB3[optional1]: float = 0.625, int = {{.*}} +;CHECK-NEXT: - BB3[optional2]: float = 0.625, int = {{.*}} ;CHECK: block-frequency-info: loop_test ;CHECK: block-frequency-info: loop_test @@ -19,7 +19,7 @@ target triple = "powerpc64le-grtev4-linux-gnu" ;CHECK-NEXT: - BB1[for.check]: float = 2.6667, int = {{.*}} ;CHECK-NEXT: - BB2[for.check]: float = 2.1667, int = {{.*}} ;CHECK-NEXT: - BB3[test1]: float = 1.6667, int = {{.*}} -;CHECK-NEXT: - BB4[optional1]: float = 0.625, int = {{.*}} +;CHECK-NEXT: - BB4[optional2]: float = 0.625, int = {{.*}} define void @loop_test(ptr %tags, i32 %count) { diff --git a/llvm/test/CodeGen/PowerPC/tail-dup-layout.ll b/llvm/test/CodeGen/PowerPC/tail-dup-layout.ll index 77d861ad0599c..ee911e001b79d 100644 --- a/llvm/test/CodeGen/PowerPC/tail-dup-layout.ll +++ b/llvm/test/CodeGen/PowerPC/tail-dup-layout.ll @@ -262,10 +262,8 @@ exit: ; for.latch ; for.check ; test1 -; test2 ; test3 ; test4 -; optional1 ; optional2 ; optional3 ; optional4 @@ -282,9 +280,6 @@ exit: ;CHECK: lwz [[TAGREG:[0-9]+]], 0([[TAGPTRREG]]) ;CHECK-O3: .[[CHECKLABEL:[._0-9A-Za-z]+]]: # %for.check ;CHECK: # %bb.{{[0-9]+}}: # %test1 -;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1 -;CHECK-NEXT: bc 12, 1, .[[OPT1LABEL:[._0-9A-Za-z]+]] -;CHECK-NEXT: # %test2 ;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 2 ;CHECK-NEXT: bne 0, .[[OPT2LABEL:[._0-9A-Za-z]+]] ;CHECK-NEXT: .[[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3 @@ -294,10 +289,7 @@ exit: ;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 8 ;CHECK-NEXT: beq 0, .[[LATCHLABEL]] ;CHECK-NEXT: b .[[OPT4LABEL:[._0-9A-Za-z]+]] -;CHECK: [[OPT1LABEL]] -;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 2 -;CHECK-NEXT: beq 0, .[[TEST3LABEL]] -;CHECK-NEXT: .[[OPT2LABEL]] +;CHECK: .[[OPT2LABEL]] ;CHECK: andi. 
{{[0-9]+}}, [[TAGREG]], 4 ;CHECK-NEXT: beq 0, .[[TEST4LABEL]] ;CHECK-NEXT: .[[OPT3LABEL]] diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll index 1c95d28b5eed1..84ef6b1a02750 100644 --- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll @@ -997,14 +997,12 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no ; CHECK-NEXT: sub sp, #24 ; CHECK-NEXT: cmp r3, #8 ; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill -; CHECK-NEXT: blo.w .LBB16_12 +; CHECK-NEXT: blo.w .LBB16_11 ; CHECK-NEXT: @ %bb.1: @ %if.then -; CHECK-NEXT: lsrs.w r12, r3, #2 -; CHECK-NEXT: beq.w .LBB16_12 -; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph ; CHECK-NEXT: ldrh r4, [r0] ; CHECK-NEXT: movs r1, #1 -; CHECK-NEXT: ldrd r5, r3, [r0, #4] +; CHECK-NEXT: ldrd r5, r12, [r0, #4] +; CHECK-NEXT: lsr.w r9, r3, #2 ; CHECK-NEXT: sub.w r0, r4, #8 ; CHECK-NEXT: add.w r7, r0, r0, lsr #29 ; CHECK-NEXT: and r0, r0, #7 @@ -1017,43 +1015,43 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no ; CHECK-NEXT: subs r1, r7, #2 ; CHECK-NEXT: rsbs r7, r4, #0 ; CHECK-NEXT: str r7, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: add.w r7, r3, #16 +; CHECK-NEXT: add.w r7, r12, #16 ; CHECK-NEXT: str r4, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: str r7, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill -; CHECK-NEXT: b .LBB16_6 -; CHECK-NEXT: .LBB16_3: @ %while.end.loopexit -; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1 +; CHECK-NEXT: b .LBB16_5 +; CHECK-NEXT: .LBB16_2: @ %while.end.loopexit +; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1 ; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: add.w r5, r5, r0, lsl #1 -; CHECK-NEXT: b .LBB16_5 -; CHECK-NEXT: .LBB16_4: @ %for.end -; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1 +; CHECK-NEXT: b .LBB16_4 +; CHECK-NEXT: .LBB16_3: @ %for.end +; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1 ; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: wls lr, r0, .LBB16_5 -; CHECK-NEXT: b .LBB16_10 -; CHECK-NEXT: .LBB16_5: @ %while.end -; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1 +; CHECK-NEXT: wls lr, r0, .LBB16_4 +; CHECK-NEXT: b .LBB16_9 +; CHECK-NEXT: .LBB16_4: @ %while.end +; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1 ; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: subs.w r12, r12, #1 +; CHECK-NEXT: subs.w r9, r9, #1 ; CHECK-NEXT: vstrb.8 q0, [r2], #8 ; CHECK-NEXT: add.w r0, r5, r0, lsl #1 ; CHECK-NEXT: add.w r5, r0, #8 -; CHECK-NEXT: beq.w .LBB16_12 -; CHECK-NEXT: .LBB16_6: @ %while.body +; CHECK-NEXT: beq.w .LBB16_11 +; CHECK-NEXT: .LBB16_5: @ %while.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 -; CHECK-NEXT: @ Child Loop BB16_8 Depth 2 -; CHECK-NEXT: @ Child Loop BB16_11 Depth 2 +; CHECK-NEXT: @ Child Loop BB16_7 Depth 2 +; CHECK-NEXT: @ Child Loop BB16_10 Depth 2 ; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: ldrh.w lr, [r3, #14] +; CHECK-NEXT: ldrh.w lr, [r12, #14] ; CHECK-NEXT: vldrw.u32 q0, [r0], #8 -; CHECK-NEXT: ldrh.w r8, [r3, #12] -; CHECK-NEXT: ldrh r7, [r3, #10] -; CHECK-NEXT: ldrh r4, [r3, #8] -; CHECK-NEXT: ldrh r6, [r3, #6] -; CHECK-NEXT: ldrh.w r9, [r3, #4] -; CHECK-NEXT: ldrh.w r11, [r3, #2] -; CHECK-NEXT: ldrh.w r10, [r3] +; CHECK-NEXT: ldrh.w r8, [r12, #12] +; CHECK-NEXT: ldrh.w r7, [r12, #10] +; CHECK-NEXT: ldrh.w r4, [r12, #8] +; CHECK-NEXT: ldrh.w r3, [r12, #6] +; CHECK-NEXT: ldrh.w r6, [r12, #4] +; CHECK-NEXT: ldrh.w r11, [r12, #2] +; CHECK-NEXT: ldrh.w r10, 
[r12] ; CHECK-NEXT: vstrb.8 q0, [r1], #8 ; CHECK-NEXT: vldrw.u32 q0, [r5] ; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill @@ -1063,10 +1061,10 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no ; CHECK-NEXT: adds r0, r5, #6 ; CHECK-NEXT: vfma.f16 q0, q1, r11 ; CHECK-NEXT: vldrw.u32 q1, [r5, #4] -; CHECK-NEXT: vfma.f16 q0, q1, r9 +; CHECK-NEXT: vfma.f16 q0, q1, r6 ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: add.w r0, r5, #10 -; CHECK-NEXT: vfma.f16 q0, q1, r6 +; CHECK-NEXT: vfma.f16 q0, q1, r3 ; CHECK-NEXT: vldrw.u32 q1, [r5, #8] ; CHECK-NEXT: vfma.f16 q0, q1, r4 ; CHECK-NEXT: vldrw.u32 q1, [r0] @@ -1079,36 +1077,36 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no ; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: vfma.f16 q0, q1, lr ; CHECK-NEXT: cmp r0, #16 -; CHECK-NEXT: blo .LBB16_9 -; CHECK-NEXT: @ %bb.7: @ %for.body.preheader -; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1 +; CHECK-NEXT: blo .LBB16_8 +; CHECK-NEXT: @ %bb.6: @ %for.body.preheader +; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1 ; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload ; CHECK-NEXT: dls lr, r0 ; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: .LBB16_8: @ %for.body -; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1 +; CHECK-NEXT: .LBB16_7: @ %for.body +; CHECK-NEXT: @ Parent Loop BB16_5 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: ldrh r0, [r6], #16 ; CHECK-NEXT: vldrw.u32 q1, [r5] -; CHECK-NEXT: adds r4, r5, #2 +; CHECK-NEXT: adds r3, r5, #2 ; CHECK-NEXT: vfma.f16 q0, q1, r0 -; CHECK-NEXT: vldrw.u32 q1, [r4] +; CHECK-NEXT: vldrw.u32 q1, [r3] ; CHECK-NEXT: ldrh r0, [r6, #-14] -; CHECK-NEXT: adds r4, r5, #6 +; CHECK-NEXT: adds r3, r5, #6 ; CHECK-NEXT: vfma.f16 q0, q1, r0 ; CHECK-NEXT: ldrh r0, [r6, #-12] ; CHECK-NEXT: vldrw.u32 q1, [r5, #4] ; CHECK-NEXT: vfma.f16 q0, q1, r0 -; CHECK-NEXT: vldrw.u32 q1, [r4] +; CHECK-NEXT: vldrw.u32 q1, [r3] ; CHECK-NEXT: ldrh r0, [r6, #-10] -; CHECK-NEXT: add.w r4, r5, #10 +; CHECK-NEXT: add.w r3, r5, #10 ; CHECK-NEXT: vfma.f16 q0, q1, r0 ; CHECK-NEXT: ldrh r0, [r6, #-8] ; CHECK-NEXT: vldrw.u32 q1, [r5, #8] ; CHECK-NEXT: vfma.f16 q0, q1, r0 -; CHECK-NEXT: vldrw.u32 q1, [r4] +; CHECK-NEXT: vldrw.u32 q1, [r3] ; CHECK-NEXT: ldrh r0, [r6, #-6] -; CHECK-NEXT: ldrh r4, [r6, #-2] +; CHECK-NEXT: ldrh r3, [r6, #-2] ; CHECK-NEXT: vfma.f16 q0, q1, r0 ; CHECK-NEXT: ldrh r0, [r6, #-4] ; CHECK-NEXT: vldrw.u32 q1, [r5, #12] @@ -1116,24 +1114,24 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no ; CHECK-NEXT: add.w r0, r5, #14 ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: adds r5, #16 -; CHECK-NEXT: vfma.f16 q0, q1, r4 -; CHECK-NEXT: le lr, .LBB16_8 -; CHECK-NEXT: b .LBB16_4 -; CHECK-NEXT: .LBB16_9: @ in Loop: Header=BB16_6 Depth=1 +; CHECK-NEXT: vfma.f16 q0, q1, r3 +; CHECK-NEXT: le lr, .LBB16_7 +; CHECK-NEXT: b .LBB16_3 +; CHECK-NEXT: .LBB16_8: @ in Loop: Header=BB16_5 Depth=1 ; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: b .LBB16_4 -; CHECK-NEXT: .LBB16_10: @ %while.body76.preheader -; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1 +; CHECK-NEXT: b .LBB16_3 +; CHECK-NEXT: .LBB16_9: @ %while.body76.preheader +; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1 ; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: .LBB16_11: @ %while.body76 -; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1 +; CHECK-NEXT: .LBB16_10: @ %while.body76 +; CHECK-NEXT: @ Parent Loop BB16_5 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldrh r4, [r6], #2 +; 
CHECK-NEXT: ldrh r3, [r6], #2 ; CHECK-NEXT: vldrh.u16 q1, [r0], #2 -; CHECK-NEXT: vfma.f16 q0, q1, r4 -; CHECK-NEXT: le lr, .LBB16_11 -; CHECK-NEXT: b .LBB16_3 -; CHECK-NEXT: .LBB16_12: @ %if.end +; CHECK-NEXT: vfma.f16 q0, q1, r3 +; CHECK-NEXT: le lr, .LBB16_10 +; CHECK-NEXT: b .LBB16_2 +; CHECK-NEXT: .LBB16_11: @ %if.end ; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll index 808626d9a0aeb..394a8ba8f53d0 100644 --- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll @@ -983,12 +983,9 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no ; CHECK-LABEL: fir: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: cmp r3, #8 -; CHECK-NEXT: blo.w .LBB16_13 -; CHECK-NEXT: @ %bb.1: @ %if.then -; CHECK-NEXT: lsrs.w r12, r3, #2 -; CHECK-NEXT: it eq -; CHECK-NEXT: bxeq lr -; CHECK-NEXT: .LBB16_2: @ %while.body.lr.ph +; CHECK-NEXT: it lo +; CHECK-NEXT: bxlo lr +; CHECK-NEXT: .LBB16_1: @ %if.then ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 @@ -997,83 +994,84 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: .pad #32 ; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: ldrh r6, [r0] -; CHECK-NEXT: movs r5, #1 -; CHECK-NEXT: ldrd r4, r10, [r0, #4] -; CHECK-NEXT: sub.w r0, r6, #8 -; CHECK-NEXT: add.w r3, r0, r0, lsr #29 +; CHECK-NEXT: ldrh r5, [r0] +; CHECK-NEXT: movs r6, #1 +; CHECK-NEXT: ldrd r4, r12, [r0, #4] +; CHECK-NEXT: lsr.w r10, r3, #2 +; CHECK-NEXT: sub.w r0, r5, #8 +; CHECK-NEXT: add.w r7, r0, r0, lsr #29 ; CHECK-NEXT: and r0, r0, #7 -; CHECK-NEXT: asrs r7, r3, #3 -; CHECK-NEXT: cmp r7, #1 +; CHECK-NEXT: asr.w lr, r7, #3 +; CHECK-NEXT: cmp.w lr, #1 ; CHECK-NEXT: it gt -; CHECK-NEXT: asrgt r5, r3, #3 -; CHECK-NEXT: add.w r3, r4, r6, lsl #2 -; CHECK-NEXT: sub.w r9, r3, #4 -; CHECK-NEXT: rsbs r3, r6, #0 -; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: add.w r3, r10, #32 -; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: str r6, [sp, #16] @ 4-byte Spill -; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: asrgt r6, r7, #3 +; CHECK-NEXT: add.w r7, r4, r5, lsl #2 +; CHECK-NEXT: sub.w r9, r7, #4 +; CHECK-NEXT: rsbs r7, r5, #0 +; CHECK-NEXT: str r7, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: add.w r7, r12, #32 +; CHECK-NEXT: str r6, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: str r5, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: str r7, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill -; CHECK-NEXT: b .LBB16_6 -; CHECK-NEXT: .LBB16_3: @ %while.end.loopexit -; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1 +; CHECK-NEXT: b .LBB16_5 +; CHECK-NEXT: .LBB16_2: @ %while.end.loopexit +; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1 ; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: add.w r4, r4, r0, lsl #2 -; CHECK-NEXT: b .LBB16_5 -; CHECK-NEXT: .LBB16_4: @ %for.end -; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1 +; CHECK-NEXT: b .LBB16_4 +; CHECK-NEXT: .LBB16_3: @ %for.end +; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1 ; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: ldrd r0, r9, [sp, #20] @ 8-byte Folded Reload -; CHECK-NEXT: wls lr, r0, .LBB16_5 -; CHECK-NEXT: b .LBB16_10 -; CHECK-NEXT: .LBB16_5: @ %while.end -; CHECK-NEXT: @ in Loop: 
Header=BB16_6 Depth=1 +; CHECK-NEXT: wls lr, r0, .LBB16_4 +; CHECK-NEXT: b .LBB16_9 +; CHECK-NEXT: .LBB16_4: @ %while.end +; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1 ; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: subs.w r12, r12, #1 +; CHECK-NEXT: subs.w r10, r10, #1 ; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: add.w r0, r4, r0, lsl #2 ; CHECK-NEXT: add.w r4, r0, #16 -; CHECK-NEXT: beq .LBB16_12 -; CHECK-NEXT: .LBB16_6: @ %while.body +; CHECK-NEXT: beq .LBB16_11 +; CHECK-NEXT: .LBB16_5: @ %while.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 -; CHECK-NEXT: @ Child Loop BB16_8 Depth 2 -; CHECK-NEXT: @ Child Loop BB16_11 Depth 2 -; CHECK-NEXT: add.w lr, r10, #8 +; CHECK-NEXT: @ Child Loop BB16_7 Depth 2 +; CHECK-NEXT: @ Child Loop BB16_10 Depth 2 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 -; CHECK-NEXT: ldrd r3, r7, [r10] -; CHECK-NEXT: ldm.w lr, {r0, r5, r6, lr} -; CHECK-NEXT: ldrd r11, r8, [r10, #24] +; CHECK-NEXT: ldrd r7, r6, [r12] +; CHECK-NEXT: ldrd r0, r5, [r12, #8] +; CHECK-NEXT: ldrd r3, lr, [r12, #16] +; CHECK-NEXT: ldrd r11, r8, [r12, #24] ; CHECK-NEXT: vstrb.8 q0, [r9], #16 ; CHECK-NEXT: vldrw.u32 q0, [r4], #32 ; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill ; CHECK-NEXT: str.w r9, [sp, #24] @ 4-byte Spill ; CHECK-NEXT: vldrw.u32 q1, [r4, #-28] -; CHECK-NEXT: vmul.f32 q0, q0, r3 +; CHECK-NEXT: vmul.f32 q0, q0, r7 ; CHECK-NEXT: vldrw.u32 q6, [r4, #-24] ; CHECK-NEXT: vldrw.u32 q4, [r4, #-20] -; CHECK-NEXT: vfma.f32 q0, q1, r7 +; CHECK-NEXT: vfma.f32 q0, q1, r6 ; CHECK-NEXT: vldrw.u32 q5, [r4, #-16] ; CHECK-NEXT: vfma.f32 q0, q6, r0 ; CHECK-NEXT: vldrw.u32 q2, [r4, #-12] ; CHECK-NEXT: vfma.f32 q0, q4, r5 ; CHECK-NEXT: vldrw.u32 q3, [r4, #-8] -; CHECK-NEXT: vfma.f32 q0, q5, r6 +; CHECK-NEXT: vfma.f32 q0, q5, r3 ; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: vfma.f32 q0, q2, lr ; CHECK-NEXT: vldrw.u32 q1, [r4, #-4] ; CHECK-NEXT: vfma.f32 q0, q3, r11 ; CHECK-NEXT: cmp r0, #16 ; CHECK-NEXT: vfma.f32 q0, q1, r8 -; CHECK-NEXT: blo .LBB16_9 -; CHECK-NEXT: @ %bb.7: @ %for.body.preheader -; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1 +; CHECK-NEXT: blo .LBB16_8 +; CHECK-NEXT: @ %bb.6: @ %for.body.preheader +; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1 ; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: dls lr, r0 ; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: .LBB16_8: @ %for.body -; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1 +; CHECK-NEXT: .LBB16_7: @ %for.body +; CHECK-NEXT: @ Parent Loop BB16_5 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: ldm.w r7, {r0, r3, r5, r6, r8, r11} ; CHECK-NEXT: vldrw.u32 q1, [r4], #32 @@ -1094,28 +1092,27 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no ; CHECK-NEXT: vfma.f32 q0, q2, r11 ; CHECK-NEXT: vfma.f32 q0, q3, r9 ; CHECK-NEXT: vfma.f32 q0, q1, r1 -; CHECK-NEXT: le lr, .LBB16_8 -; CHECK-NEXT: b .LBB16_4 -; CHECK-NEXT: .LBB16_9: @ in Loop: Header=BB16_6 Depth=1 +; CHECK-NEXT: le lr, .LBB16_7 +; CHECK-NEXT: b .LBB16_3 +; CHECK-NEXT: .LBB16_8: @ in Loop: Header=BB16_5 Depth=1 ; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: b .LBB16_4 -; CHECK-NEXT: .LBB16_10: @ %while.body76.preheader -; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1 -; CHECK-NEXT: mov r3, r4 -; CHECK-NEXT: .LBB16_11: @ %while.body76 -; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1 +; CHECK-NEXT: b .LBB16_3 +; CHECK-NEXT: .LBB16_9: @ %while.body76.preheader +; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1 +; CHECK-NEXT: mov r6, r4 +; CHECK-NEXT: .LBB16_10: @ 
%while.body76 +; CHECK-NEXT: @ Parent Loop BB16_5 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: ldr r0, [r7], #4 -; CHECK-NEXT: vldrw.u32 q1, [r3], #4 +; CHECK-NEXT: vldrw.u32 q1, [r6], #4 ; CHECK-NEXT: vfma.f32 q0, q1, r0 -; CHECK-NEXT: le lr, .LBB16_11 -; CHECK-NEXT: b .LBB16_3 -; CHECK-NEXT: .LBB16_12: +; CHECK-NEXT: le lr, .LBB16_10 +; CHECK-NEXT: b .LBB16_2 +; CHECK-NEXT: .LBB16_11: ; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .LBB16_13: @ %if.end ; CHECK-NEXT: bx lr entry: %pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 1 diff --git a/llvm/test/Transforms/InstCombine/icmp-or.ll b/llvm/test/Transforms/InstCombine/icmp-or.ll index 922845c1e7e2d..a96341f311329 100644 --- a/llvm/test/Transforms/InstCombine/icmp-or.ll +++ b/llvm/test/Transforms/InstCombine/icmp-or.ll @@ -430,13 +430,8 @@ define i1 @icmp_or_xor_2_ne_fail(i64 %x1, i64 %y1, i64 %x2, i64 %y2) { define i1 @icmp_or_xor_2_3_fail(i64 %x1, i64 %y1, i64 %x2, i64 %y2) { ; CHECK-LABEL: @icmp_or_xor_2_3_fail( -; CHECK-NEXT: [[XOR:%.*]] = xor i64 [[X1:%.*]], [[Y1:%.*]] -; CHECK-NEXT: [[XOR1:%.*]] = xor i64 [[X2:%.*]], [[Y2:%.*]] -; CHECK-NEXT: [[OR:%.*]] = or i64 [[XOR]], [[XOR1]] -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[OR]], 0 -; CHECK-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[XOR]], 0 -; CHECK-NEXT: [[OR1:%.*]] = or i1 [[CMP]], [[CMP_1]] -; CHECK-NEXT: ret i1 [[OR1]] +; CHECK-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[X1:%.*]], [[Y1:%.*]] +; CHECK-NEXT: ret i1 [[CMP_1]] ; %xor = xor i64 %x1, %y1 %xor1 = xor i64 %x2, %y2 @@ -451,13 +446,8 @@ define i1 @icmp_or_xor_2_3_fail(i64 %x1, i64 %y1, i64 %x2, i64 %y2) { define i1 @icmp_or_xor_2_4_fail(i64 %x1, i64 %y1, i64 %x2, i64 %y2) { ; CHECK-LABEL: @icmp_or_xor_2_4_fail( -; CHECK-NEXT: [[XOR:%.*]] = xor i64 [[X1:%.*]], [[Y1:%.*]] -; CHECK-NEXT: [[XOR1:%.*]] = xor i64 [[X2:%.*]], [[Y2:%.*]] -; CHECK-NEXT: [[OR:%.*]] = or i64 [[XOR]], [[XOR1]] -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[OR]], 0 -; CHECK-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[XOR1]], 0 -; CHECK-NEXT: [[OR1:%.*]] = or i1 [[CMP]], [[CMP_1]] -; CHECK-NEXT: ret i1 [[OR1]] +; CHECK-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[X2:%.*]], [[Y2:%.*]] +; CHECK-NEXT: ret i1 [[CMP_1]] ; %xor = xor i64 %x1, %y1 %xor1 = xor i64 %x2, %y2 diff --git a/llvm/test/Transforms/InstCombine/icmp-power2-and-icmp-shifted-mask.ll b/llvm/test/Transforms/InstCombine/icmp-power2-and-icmp-shifted-mask.ll index 82fcca07a00ac..27ecc5686066c 100644 --- a/llvm/test/Transforms/InstCombine/icmp-power2-and-icmp-shifted-mask.ll +++ b/llvm/test/Transforms/InstCombine/icmp-power2-and-icmp-shifted-mask.ll @@ -250,10 +250,7 @@ define i1 @icmp_power2_and_icmp_shifted_mask_swapped_256_239_gap_in_mask_fail(i3 define i1 @icmp_power2_and_icmp_shifted_mask_8_112_mask_to_left_fail(i32 %x) { ; CHECK-LABEL: @icmp_power2_and_icmp_shifted_mask_8_112_mask_to_left_fail( ; CHECK-NEXT: [[T1:%.*]] = icmp ult i32 [[X:%.*]], 8 -; CHECK-NEXT: [[T2:%.*]] = and i32 [[X]], 112 -; CHECK-NEXT: [[T3:%.*]] = icmp ne i32 [[T2]], 112 -; CHECK-NEXT: [[T4:%.*]] = and i1 [[T1]], [[T3]] -; CHECK-NEXT: ret i1 [[T4]] +; CHECK-NEXT: ret i1 [[T1]] ; %t1 = icmp ult i32 %x, 8 %t2 = and i32 %x, 112 @@ -265,10 +262,7 @@ define i1 @icmp_power2_and_icmp_shifted_mask_8_112_mask_to_left_fail(i32 %x) { define i1 @icmp_power2_and_icmp_shifted_mask_swapped_8_112_mask_to_left_fail(i32 %x) { ; CHECK-LABEL: 
@icmp_power2_and_icmp_shifted_mask_swapped_8_112_mask_to_left_fail( ; CHECK-NEXT: [[T1:%.*]] = icmp ult i32 [[X:%.*]], 8 -; CHECK-NEXT: [[T2:%.*]] = and i32 [[X]], 112 -; CHECK-NEXT: [[T3:%.*]] = icmp ne i32 [[T2]], 112 -; CHECK-NEXT: [[T4:%.*]] = and i1 [[T3]], [[T1]] -; CHECK-NEXT: ret i1 [[T4]] +; CHECK-NEXT: ret i1 [[T1]] ; %t1 = icmp ult i32 %x, 8 %t2 = and i32 %x, 112 @@ -281,10 +275,7 @@ define i1 @icmp_power2_and_icmp_shifted_mask_swapped_8_112_mask_to_left_fail(i32 define i1 @icmp_power2_and_icmp_shifted_mask_8_56_mask_overlap_fail(i32 %x) { ; CHECK-LABEL: @icmp_power2_and_icmp_shifted_mask_8_56_mask_overlap_fail( ; CHECK-NEXT: [[T1:%.*]] = icmp ult i32 [[X:%.*]], 8 -; CHECK-NEXT: [[T2:%.*]] = and i32 [[X]], 56 -; CHECK-NEXT: [[T3:%.*]] = icmp ne i32 [[T2]], 56 -; CHECK-NEXT: [[T4:%.*]] = and i1 [[T1]], [[T3]] -; CHECK-NEXT: ret i1 [[T4]] +; CHECK-NEXT: ret i1 [[T1]] ; %t1 = icmp ult i32 %x, 8 %t2 = and i32 %x, 56 @@ -296,10 +287,7 @@ define i1 @icmp_power2_and_icmp_shifted_mask_8_56_mask_overlap_fail(i32 %x) { define i1 @icmp_power2_and_icmp_shifted_mask_swapped_8_56_mask_overlap_fail(i32 %x) { ; CHECK-LABEL: @icmp_power2_and_icmp_shifted_mask_swapped_8_56_mask_overlap_fail( ; CHECK-NEXT: [[T1:%.*]] = icmp ult i32 [[X:%.*]], 8 -; CHECK-NEXT: [[T2:%.*]] = and i32 [[X]], 56 -; CHECK-NEXT: [[T3:%.*]] = icmp ne i32 [[T2]], 56 -; CHECK-NEXT: [[T4:%.*]] = and i1 [[T3]], [[T1]] -; CHECK-NEXT: ret i1 [[T4]] +; CHECK-NEXT: ret i1 [[T1]] ; %t1 = icmp ult i32 %x, 8 %t2 = and i32 %x, 56 @@ -312,10 +300,7 @@ define i1 @icmp_power2_and_icmp_shifted_mask_swapped_8_56_mask_overlap_fail(i32 define i1 @icmp_power2_and_icmp_shifted_mask_8_24_mask_overlap_fail(i32 %x) { ; CHECK-LABEL: @icmp_power2_and_icmp_shifted_mask_8_24_mask_overlap_fail( ; CHECK-NEXT: [[T1:%.*]] = icmp ult i32 [[X:%.*]], 8 -; CHECK-NEXT: [[T2:%.*]] = and i32 [[X]], 24 -; CHECK-NEXT: [[T3:%.*]] = icmp ne i32 [[T2]], 24 -; CHECK-NEXT: [[T4:%.*]] = and i1 [[T1]], [[T3]] -; CHECK-NEXT: ret i1 [[T4]] +; CHECK-NEXT: ret i1 [[T1]] ; %t1 = icmp ult i32 %x, 8 %t2 = and i32 %x, 24 @@ -327,10 +312,7 @@ define i1 @icmp_power2_and_icmp_shifted_mask_8_24_mask_overlap_fail(i32 %x) { define i1 @icmp_power2_and_icmp_shifted_mask_swapped_8_24_mask_overlap_fail(i32 %x) { ; CHECK-LABEL: @icmp_power2_and_icmp_shifted_mask_swapped_8_24_mask_overlap_fail( ; CHECK-NEXT: [[T1:%.*]] = icmp ult i32 [[X:%.*]], 8 -; CHECK-NEXT: [[T2:%.*]] = and i32 [[X]], 24 -; CHECK-NEXT: [[T3:%.*]] = icmp ne i32 [[T2]], 24 -; CHECK-NEXT: [[T4:%.*]] = and i1 [[T3]], [[T1]] -; CHECK-NEXT: ret i1 [[T4]] +; CHECK-NEXT: ret i1 [[T1]] ; %t1 = icmp ult i32 %x, 8 %t2 = and i32 %x, 24 @@ -343,10 +325,7 @@ define i1 @icmp_power2_and_icmp_shifted_mask_swapped_8_24_mask_overlap_fail(i32 define i1 @icmp_power2_and_icmp_shifted_mask_8_12_mask_overlap_fail(i32 %x) { ; CHECK-LABEL: @icmp_power2_and_icmp_shifted_mask_8_12_mask_overlap_fail( ; CHECK-NEXT: [[T1:%.*]] = icmp ult i32 [[X:%.*]], 8 -; CHECK-NEXT: [[T2:%.*]] = and i32 [[X]], 12 -; CHECK-NEXT: [[T3:%.*]] = icmp ne i32 [[T2]], 12 -; CHECK-NEXT: [[T4:%.*]] = and i1 [[T1]], [[T3]] -; CHECK-NEXT: ret i1 [[T4]] +; CHECK-NEXT: ret i1 [[T1]] ; %t1 = icmp ult i32 %x, 8 %t2 = and i32 %x, 12 @@ -358,10 +337,7 @@ define i1 @icmp_power2_and_icmp_shifted_mask_8_12_mask_overlap_fail(i32 %x) { define i1 @icmp_power2_and_icmp_shifted_mask_swapped_8_12_mask_overlap_fail(i32 %x) { ; CHECK-LABEL: @icmp_power2_and_icmp_shifted_mask_swapped_8_12_mask_overlap_fail( ; CHECK-NEXT: [[T1:%.*]] = icmp ult i32 [[X:%.*]], 8 -; CHECK-NEXT: [[T2:%.*]] = and 
i32 [[X]], 12 -; CHECK-NEXT: [[T3:%.*]] = icmp ne i32 [[T2]], 12 -; CHECK-NEXT: [[T4:%.*]] = and i1 [[T3]], [[T1]] -; CHECK-NEXT: ret i1 [[T4]] +; CHECK-NEXT: ret i1 [[T1]] ; %t1 = icmp ult i32 %x, 8 %t2 = and i32 %x, 12