diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 3bd086230cbec5..3c1f2e79d74feb 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1250,6 +1250,86 @@ static Instruction *foldToUnsignedSaturatedAdd(BinaryOperator &I) {
   return nullptr;
 }
 
+/// Try to fold the add \p I as one of two "ceil" idioms: a ceil(log2(A))
+/// computation built from ctpop/ctlz, or an unsigned ceiling division built
+/// from udiv (or an equivalent lshr by the log2 of a power of two) plus a
+/// zero/one bias. Replacement instructions are created through IC.Builder.
+/// \returns the value to replace \p I with, or nullptr if nothing matched.
+static Value *foldCeilIdioms(BinaryOperator &I, InstCombinerImpl &IC) {
+  assert(I.getOpcode() == Instruction::Add && "Expecting add instruction.");
+  Value *A, *B;
+  auto &ICB = IC.Builder;
+
+  // Fold the log2 ceil idiom:
+  // zext (ctpop(A) >u/!= 1) + (ctlz (A, true) ^ (BW - 1))
+  // -> BW - ctlz (A - 1, false)
+  const APInt *XorC;
+  ICmpInst::Predicate Pred;
+  if (match(&I,
+            m_c_Add(
+                m_ZExt(m_ICmp(Pred, m_Intrinsic<Intrinsic::ctpop>(m_Value(A)),
+                              m_One())),
+                m_OneUse(m_ZExtOrSelf(m_OneUse(m_Xor(
+                    m_OneUse(m_TruncOrSelf(m_OneUse(
+                        m_Intrinsic<Intrinsic::ctlz>(m_Deferred(A), m_One())))),
+                    m_APInt(XorC))))))) &&
+      (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_NE) &&
+      *XorC == A->getType()->getScalarSizeInBits() - 1) {
+    Value *Sub = ICB.CreateAdd(A, Constant::getAllOnesValue(A->getType()));
+    Value *Ctlz = ICB.CreateIntrinsic(Intrinsic::ctlz, {A->getType()},
+                                      {Sub, ICB.getFalse()});
+    Value *Ret = ICB.CreateSub(
+        ConstantInt::get(A->getType(), A->getType()->getScalarSizeInBits()),
+        Ctlz, "", /*HasNUW*/ true, /*HasNSW*/ true);
+    return ICB.CreateZExtOrTrunc(Ret, I.getType());
+  }
+
+  // Fold the ceil division idiom:
+  // add (udiv (sub A, Bias), B), Bias
+  // -> udiv (add A, (B - 1)), B
+  // with Bias = zext (A != 0), provided A + B cannot overflow (unsigned).
+  // Recognise the division either as a plain udiv, or as
+  // lshr X, (BW - 1 - ctlz(B, false)) where B is known to be a power of two.
+  // On success DivOp0/DivOp1 receive the dividend and the divisor.
+  auto MatchDivision = [&IC](Instruction *Div, Value *&DivOp0, Value *&DivOp1) {
+    if (match(Div, m_UDiv(m_Value(DivOp0), m_Value(DivOp1))))
+      return true;
+
+    Value *N;
+    if (match(Div, m_LShr(m_Value(DivOp0), m_Value(N))) &&
+        match(N,
+              m_Sub(m_SpecificInt(Div->getType()->getScalarSizeInBits() - 1),
+                    m_Intrinsic<Intrinsic::ctlz>(m_Value(DivOp1), m_Zero()))) &&
+        IC.isKnownToBeAPowerOfTwo(DivOp1, /*OrZero*/ false, 0, Div))
+      return true;
+
+    return false;
+  };
+
+  Instruction *Div;
+  Value *Bias, *Sub;
+  // The sub must subtract the very same Bias value that is added back, so it
+  // is matched with m_Specific; rebinding it with m_Value would let an
+  // unrelated addend through. hasNUses(2) then requires Bias to have exactly
+  // two uses, i.e. the sub and this add.
+  if (match(&I, m_c_Add(m_Instruction(Div), m_Value(Bias))) &&
+      MatchDivision(Div, Sub, B) &&
+      match(Sub, m_Sub(m_Value(A), m_Specific(Bias))) &&
+      match(Bias, m_ZExt(m_SpecificICmp(ICmpInst::ICMP_NE, m_Specific(A),
+                                        m_ZeroInt()))) &&
+      Bias->hasNUses(2)) {
+    WithCache<const Value *> LHSCache(A), RHSCache(B);
+    auto OR = IC.computeOverflowForUnsignedAdd(LHSCache, RHSCache, &I);
+    if (OR == OverflowResult::NeverOverflows) {
+      auto *BMinusOne =
+          ICB.CreateAdd(B, Constant::getAllOnesValue(I.getType()));
+      return ICB.CreateUDiv(ICB.CreateAdd(A, BMinusOne), B);
+    }
+  }
+
+  return nullptr;
+}
+
 // Transform:
 //   (add A, (shl (neg B), Y))
 //   -> (sub A, (shl B, Y))
@@ -1785,30 +1865,8 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
         I, Builder.CreateIntrinsic(Intrinsic::ctpop, {I.getType()},
                                    {Builder.CreateOr(A, B)}));
 
-  // Fold the log2_ceil idiom:
-  // zext(ctpop(A) >u/!= 1) + (ctlz(A, true) ^ (BW - 1))
-  // -->
-  // BW - ctlz(A - 1, false)
-  const APInt *XorC;
-  ICmpInst::Predicate Pred;
-  if (match(&I,
-            m_c_Add(
-                m_ZExt(m_ICmp(Pred, m_Intrinsic<Intrinsic::ctpop>(m_Value(A)),
-                              m_One())),
-                m_OneUse(m_ZExtOrSelf(m_OneUse(m_Xor(
-                    m_OneUse(m_TruncOrSelf(m_OneUse(
-                        m_Intrinsic<Intrinsic::ctlz>(m_Deferred(A), m_One())))),
-                    m_APInt(XorC))))))) &&
-      (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_NE) &&
-      *XorC == A->getType()->getScalarSizeInBits() - 1) {
-    Value *Sub = Builder.CreateAdd(A, Constant::getAllOnesValue(A->getType()));
-    Value *Ctlz = Builder.CreateIntrinsic(Intrinsic::ctlz, {A->getType()},
-                                          {Sub, Builder.getFalse()});
-    Value *Ret = Builder.CreateSub(
-        ConstantInt::get(A->getType(), A->getType()->getScalarSizeInBits()),
-        Ctlz, "", /*HasNUW*/ true, /*HasNSW*/ true);
-    return replaceInstUsesWith(I, Builder.CreateZExtOrTrunc(Ret, I.getType()));
-  }
+  if (Value *V = foldCeilIdioms(I, *this))
+    return replaceInstUsesWith(I, V);
 
   if (Instruction
*Res = foldSquareSumInt(I)) return Res; diff --git a/llvm/test/Transforms/InstCombine/fold-ceil-div-idiom.ll b/llvm/test/Transforms/InstCombine/fold-ceil-div-idiom.ll new file mode 100644 index 00000000000000..b0aaa7cd6c991a --- /dev/null +++ b/llvm/test/Transforms/InstCombine/fold-ceil-div-idiom.ll @@ -0,0 +1,213 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +define i8 @ceil_div_idiom(i8 %x, i8 %y) { +; CHECK-LABEL: define i8 @ceil_div_idiom( +; CHECK-SAME: i8 [[X:%.*]], i8 [[Y:%.*]]) { +; CHECK-NEXT: [[WO:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[X]], i8 [[Y]]) +; CHECK-NEXT: [[OV:%.*]] = extractvalue { i8, i1 } [[WO]], 1 +; CHECK-NEXT: [[OV_NOT:%.*]] = xor i1 [[OV]], true +; CHECK-NEXT: call void @llvm.assume(i1 [[OV_NOT]]) +; CHECK-NEXT: [[NONZERO:%.*]] = icmp ne i8 [[X]], 0 +; CHECK-NEXT: [[BIAS:%.*]] = zext i1 [[NONZERO]] to i8 +; CHECK-NEXT: [[SUB:%.*]] = sub i8 [[X]], [[BIAS]] +; CHECK-NEXT: [[DIV:%.*]] = udiv i8 [[SUB]], [[Y]] +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[DIV]], [[BIAS]] +; CHECK-NEXT: ret i8 [[ADD]] +; + %wo = call {i8, i1} @llvm.uadd.with.overflow(i8 %x, i8 %y) + %ov = extractvalue {i8, i1} %wo, 1 + %ov.not = xor i1 %ov, true + call void @llvm.assume(i1 %ov.not) + + %nonzero = icmp ne i8 %x, 0 + %bias = zext i1 %nonzero to i8 + %sub = sub i8 %x, %bias + %div = udiv i8 %sub, %y + %add = add i8 %div, %bias + ret i8 %add +} + +define i8 @ceil_div_idiom_2(i8 %x, i8 %y) { +; CHECK-LABEL: define i8 @ceil_div_idiom_2( +; CHECK-SAME: i8 [[X:%.*]], i8 [[Y:%.*]]) { +; CHECK-NEXT: [[OV_NOT:%.*]] = add nuw i8 [[X]], [[Y]] +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i8 [[OV_NOT]] to i1 +; CHECK-NEXT: call void @llvm.assume(i1 [[TRUNC]]) +; CHECK-NEXT: [[NONZERO:%.*]] = icmp ne i8 [[X]], 0 +; CHECK-NEXT: [[BIAS:%.*]] = zext i1 [[NONZERO]] to i8 +; CHECK-NEXT: [[SUB:%.*]] = sub i8 [[X]], [[BIAS]] +; CHECK-NEXT: [[DIV:%.*]] = udiv i8 [[SUB]], 
[[Y]] +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[DIV]], [[BIAS]] +; CHECK-NEXT: ret i8 [[ADD]] +; + %ov.not = add nuw i8 %x, %y + %trunc = trunc i8 %ov.not to i1 + call void @llvm.assume(i1 %trunc) + + %nonzero = icmp ne i8 %x, 0 + %bias = zext i1 %nonzero to i8 + %sub = sub i8 %x, %bias + %div = udiv i8 %sub, %y + %add = add i8 %div, %bias + ret i8 %add +} + +define i8 @ceil_div_idiom_with_lshr(i8 %x, i8 %y) { +; CHECK-LABEL: define i8 @ceil_div_idiom_with_lshr( +; CHECK-SAME: i8 [[X:%.*]], i8 [[Y:%.*]]) { +; CHECK-NEXT: [[WO:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[X]], i8 [[Y]]) +; CHECK-NEXT: [[OV:%.*]] = extractvalue { i8, i1 } [[WO]], 1 +; CHECK-NEXT: [[OV_NOT:%.*]] = xor i1 [[OV]], true +; CHECK-NEXT: call void @llvm.assume(i1 [[OV_NOT]]) +; CHECK-NEXT: [[CTPOPULATION:%.*]] = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 [[Y]]) +; CHECK-NEXT: [[IS_POW_2:%.*]] = icmp eq i8 [[CTPOPULATION]], 1 +; CHECK-NEXT: call void @llvm.assume(i1 [[IS_POW_2]]) +; CHECK-NEXT: [[NONZERO:%.*]] = icmp ne i8 [[X]], 0 +; CHECK-NEXT: [[BIAS:%.*]] = zext i1 [[NONZERO]] to i8 +; CHECK-NEXT: [[SUB:%.*]] = sub i8 [[X]], [[BIAS]] +; CHECK-NEXT: [[CTLZ:%.*]] = tail call range(i8 0, 9) i8 @llvm.ctlz.i8(i8 [[Y]], i1 true) +; CHECK-NEXT: [[N:%.*]] = xor i8 [[CTLZ]], 7 +; CHECK-NEXT: [[DIV:%.*]] = lshr i8 [[SUB]], [[N]] +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[DIV]], [[BIAS]] +; CHECK-NEXT: ret i8 [[ADD]] +; + %wo = call {i8, i1} @llvm.uadd.with.overflow(i8 %x, i8 %y) + %ov = extractvalue {i8, i1} %wo, 1 + %ov.not = xor i1 %ov, true + call void @llvm.assume(i1 %ov.not) + + %ctpopulation = call i8 @llvm.ctpop.i8(i8 %y) + %is_pow_2 = icmp eq i8 %ctpopulation, 1 + call void @llvm.assume(i1 %is_pow_2) + + %nonzero = icmp ne i8 %x, 0 + %bias = zext i1 %nonzero to i8 + %sub = sub i8 %x, %bias + %ctlz = tail call i8 @llvm.ctlz.i8(i8 %y, i1 true) + %n = sub i8 7, %ctlz + %div = lshr i8 %sub, %n + %add = add i8 %div, %bias + ret i8 %add +} + +define i8 @ceil_div_idiom_add_may_overflow(i8 %x, i8 
%y) { +; CHECK-LABEL: define i8 @ceil_div_idiom_add_may_overflow( +; CHECK-SAME: i8 [[X:%.*]], i8 [[Y:%.*]]) { +; CHECK-NEXT: [[NONZERO:%.*]] = icmp ne i8 [[X]], 0 +; CHECK-NEXT: [[BIAS:%.*]] = zext i1 [[NONZERO]] to i8 +; CHECK-NEXT: [[SUB:%.*]] = sub i8 [[X]], [[BIAS]] +; CHECK-NEXT: [[DIV:%.*]] = udiv i8 [[SUB]], [[Y]] +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[DIV]], [[BIAS]] +; CHECK-NEXT: ret i8 [[ADD]] +; + %nonzero = icmp ne i8 %x, 0 + %bias = zext i1 %nonzero to i8 + %sub = sub i8 %x, %bias + %div = udiv i8 %sub, %y + %add = add i8 %div, %bias + ret i8 %add +} + +define i8 @ceil_div_idiom_multiuse_bias(i8 %x, i8 %y) { +; CHECK-LABEL: define i8 @ceil_div_idiom_multiuse_bias( +; CHECK-SAME: i8 [[X:%.*]], i8 [[Y:%.*]]) { +; CHECK-NEXT: [[WO:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[X]], i8 [[Y]]) +; CHECK-NEXT: [[OV:%.*]] = extractvalue { i8, i1 } [[WO]], 1 +; CHECK-NEXT: [[OV_NOT:%.*]] = xor i1 [[OV]], true +; CHECK-NEXT: call void @llvm.assume(i1 [[OV_NOT]]) +; CHECK-NEXT: [[NONZERO:%.*]] = icmp ne i8 [[X]], 0 +; CHECK-NEXT: [[BIAS:%.*]] = zext i1 [[NONZERO]] to i8 +; CHECK-NEXT: [[SUB:%.*]] = sub i8 [[X]], [[BIAS]] +; CHECK-NEXT: [[DIV:%.*]] = udiv i8 [[SUB]], [[Y]] +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[DIV]], [[BIAS]] +; CHECK-NEXT: call void @use(i8 [[BIAS]]) +; CHECK-NEXT: ret i8 [[ADD]] +; + %wo = call {i8, i1} @llvm.uadd.with.overflow(i8 %x, i8 %y) + %ov = extractvalue {i8, i1} %wo, 1 + %ov.not = xor i1 %ov, true + call void @llvm.assume(i1 %ov.not) + + %nonzero = icmp ne i8 %x, 0 + %bias = zext i1 %nonzero to i8 + %sub = sub i8 %x, %bias + %div = udiv i8 %sub, %y + %add = add i8 %div, %bias + call void @use(i8 %bias) + ret i8 %add +} + +define i8 @ceil_div_idiom_with_lshr_not_power_2(i8 %x, i8 %y) { +; CHECK-LABEL: define i8 @ceil_div_idiom_with_lshr_not_power_2( +; CHECK-SAME: i8 [[X:%.*]], i8 [[Y:%.*]]) { +; CHECK-NEXT: [[WO:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[X]], i8 [[Y]]) +; CHECK-NEXT: [[OV:%.*]] = extractvalue { 
i8, i1 } [[WO]], 1 +; CHECK-NEXT: [[OV_NOT:%.*]] = xor i1 [[OV]], true +; CHECK-NEXT: call void @llvm.assume(i1 [[OV_NOT]]) +; CHECK-NEXT: [[NONZERO:%.*]] = icmp ne i8 [[X]], 0 +; CHECK-NEXT: [[BIAS:%.*]] = zext i1 [[NONZERO]] to i8 +; CHECK-NEXT: [[SUB:%.*]] = sub i8 [[X]], [[BIAS]] +; CHECK-NEXT: [[CTLZ:%.*]] = tail call range(i8 0, 9) i8 @llvm.ctlz.i8(i8 [[Y]], i1 true) +; CHECK-NEXT: [[N:%.*]] = xor i8 [[CTLZ]], 7 +; CHECK-NEXT: [[DIV:%.*]] = lshr i8 [[SUB]], [[N]] +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[DIV]], [[BIAS]] +; CHECK-NEXT: ret i8 [[ADD]] +; + %wo = call {i8, i1} @llvm.uadd.with.overflow(i8 %x, i8 %y) + %ov = extractvalue {i8, i1} %wo, 1 + %ov.not = xor i1 %ov, true + call void @llvm.assume(i1 %ov.not) + + %nonzero = icmp ne i8 %x, 0 + %bias = zext i1 %nonzero to i8 + %sub = sub i8 %x, %bias + %ctlz = tail call i8 @llvm.ctlz.i8(i8 %y, i1 true) + %n = sub i8 7, %ctlz + %div = lshr i8 %sub, %n + %add = add i8 %div, %bias + ret i8 %add +} + +define i8 @ceil_div_idiom_with_lshr_wrong_bw(i8 %x, i8 %y) { +; CHECK-LABEL: define i8 @ceil_div_idiom_with_lshr_wrong_bw( +; CHECK-SAME: i8 [[X:%.*]], i8 [[Y:%.*]]) { +; CHECK-NEXT: [[WO:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[X]], i8 [[Y]]) +; CHECK-NEXT: [[OV:%.*]] = extractvalue { i8, i1 } [[WO]], 1 +; CHECK-NEXT: [[OV_NOT:%.*]] = xor i1 [[OV]], true +; CHECK-NEXT: call void @llvm.assume(i1 [[OV_NOT]]) +; CHECK-NEXT: [[CTPOPULATION:%.*]] = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 [[Y]]) +; CHECK-NEXT: [[IS_POW_2:%.*]] = icmp eq i8 [[CTPOPULATION]], 1 +; CHECK-NEXT: call void @llvm.assume(i1 [[IS_POW_2]]) +; CHECK-NEXT: [[NONZERO:%.*]] = icmp ne i8 [[X]], 0 +; CHECK-NEXT: [[BIAS:%.*]] = zext i1 [[NONZERO]] to i8 +; CHECK-NEXT: [[SUB:%.*]] = sub i8 [[X]], [[BIAS]] +; CHECK-NEXT: [[CTLZ:%.*]] = tail call range(i8 0, 9) i8 @llvm.ctlz.i8(i8 [[Y]], i1 true) +; CHECK-NEXT: [[N:%.*]] = sub nuw nsw i8 8, [[CTLZ]] +; CHECK-NEXT: [[DIV:%.*]] = lshr i8 [[SUB]], [[N]] +; CHECK-NEXT: [[ADD:%.*]] = add i8 
[[DIV]], [[BIAS]] +; CHECK-NEXT: ret i8 [[ADD]] +; + %wo = call {i8, i1} @llvm.uadd.with.overflow(i8 %x, i8 %y) + %ov = extractvalue {i8, i1} %wo, 1 + %ov.not = xor i1 %ov, true + call void @llvm.assume(i1 %ov.not) + + %ctpopulation = call i8 @llvm.ctpop.i8(i8 %y) + %is_pow_2 = icmp eq i8 %ctpopulation, 1 + call void @llvm.assume(i1 %is_pow_2) + + %nonzero = icmp ne i8 %x, 0 + %bias = zext i1 %nonzero to i8 + %sub = sub i8 %x, %bias + %ctlz = tail call i8 @llvm.ctlz.i8(i8 %y, i1 true) + %n = sub i8 8, %ctlz + %div = lshr i8 %sub, %n + %add = add i8 %div, %bias + ret i8 %add +} + +declare { i8, i1 } @llvm.uadd.with.overflow.i8(i8, i8) +declare i8 @llvm.ctpop.i8(i8) +declare void @llvm.assume(i1) +declare void @use(i8)