-
Notifications
You must be signed in to change notification settings - Fork 12.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Reland "[CodeGenPrepare] Convert `ctpop(X) ==/!= 1` into `ctpop(X) u</u> 2/1` (#111284)"
#111998
Conversation
…llvm#111284) Some targets have better codegen for `ctpop(X) u< 2` than `ctpop(X) == 1`. After llvm#100899, we set the range of ctpop's return value to indicate the argument/result is non-zero. This patch converts `ctpop(X) ==/!= 1` into `ctpop(X) u</u> 2/1` in CGP to fix llvm#95255.
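@llvm/pr-subscribers-backend-aarch64 @llvm/pr-subscribers-backend-x86
Author: Yingwei Zheng (dtcxzyw)
Changes: Relands #111284. Test failure with stage2 build has been fixed by #111946. Some targets have better codegen for `ctpop(X) u< 2` than `ctpop(X) == 1`. After #100899, we set the range of ctpop's return value to indicate the argument/result is non-zero. This patch converts `ctpop(X) ==/!= 1` into `ctpop(X) u</u> 2/1` in CGP to fix #95255.
Full diff: https://github.com/llvm/llvm-project/pull/111998.diff
6 Files Affected: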
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 3e09fbad6ab198..86f28293ba9ff8 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -2111,6 +2111,31 @@ bool CodeGenPrepare::optimizeURem(Instruction *Rem) {
return false;
}
+/// Some targets have better codegen for `ctpop(X) u< 2` than `ctpop(X) == 1`.
+/// Rewrites `ctpop(X) == 1` to `ctpop(X) u< 2` and `ctpop(X) != 1` to
+/// `ctpop(X) u> 1` when ctpop(X) is known non-zero. Returns true if changed.
+static bool adjustIsPower2Test(CmpInst *Cmp, const TargetLowering &TLI,
+ const TargetTransformInfo &TTI, // NOTE(review): TLI/TTI unused in this body.
+ const DataLayout &DL) {
+ ICmpInst::Predicate Pred;
+ if (!match(Cmp, m_ICmp(Pred, m_Intrinsic<Intrinsic::ctpop>(), m_One())))
+ return false; // Not an icmp of a ctpop call against one.
+ if (!ICmpInst::isEquality(Pred))
+ return false; // Only the eq/ne predicates are rewritten.
+ auto *II = cast<IntrinsicInst>(Cmp->getOperand(0)); // The matched ctpop call.
+
+ if (isKnownNonZero(II, DL)) { // Result is never 0: ==1 <=> u<2, !=1 <=> u>1.
+ if (Pred == ICmpInst::ICMP_EQ) {
+ Cmp->setOperand(1, ConstantInt::get(II->getType(), 2)); // RHS becomes 2.
+ Cmp->setPredicate(ICmpInst::ICMP_ULT);
+ } else {
+ Cmp->setPredicate(ICmpInst::ICMP_UGT); // RHS stays 1: ctpop(X) u> 1.
+ }
+ return true; // Cmp was modified in place.
+ }
+ return false; // ctpop may be zero; leave the compare untouched.
+}
+
bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
if (sinkCmpExpression(Cmp, *TLI))
return true;
@@ -2130,6 +2155,9 @@ bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
if (foldFCmpToFPClassTest(Cmp, *TLI, *DL))
return true;
+ if (adjustIsPower2Test(Cmp, *TLI, *TTI, *DL))
+ return true;
+
return false;
}
diff --git a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
index f5ce73a366125b..0030e9ce80abb4 100644
--- a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
@@ -15,7 +15,7 @@ define i32 @cnt32_advsimd(i32 %x) nounwind readnone {
; CHECK-NONEON-LABEL: cnt32_advsimd:
; CHECK-NONEON: // %bb.0:
; CHECK-NONEON-NEXT: lsr w9, w0, #1
-; CHECK-NONEON-NEXT: mov w8, #16843009
+; CHECK-NONEON-NEXT: mov w8, #16843009 // =0x1010101
; CHECK-NONEON-NEXT: and w9, w9, #0x55555555
; CHECK-NONEON-NEXT: sub w9, w0, w9
; CHECK-NONEON-NEXT: lsr w10, w9, #2
@@ -50,7 +50,7 @@ define i32 @cnt32_advsimd_2(<2 x i32> %x) {
; CHECK-NONEON-LABEL: cnt32_advsimd_2:
; CHECK-NONEON: // %bb.0:
; CHECK-NONEON-NEXT: lsr w9, w0, #1
-; CHECK-NONEON-NEXT: mov w8, #16843009
+; CHECK-NONEON-NEXT: mov w8, #16843009 // =0x1010101
; CHECK-NONEON-NEXT: and w9, w9, #0x55555555
; CHECK-NONEON-NEXT: sub w9, w0, w9
; CHECK-NONEON-NEXT: lsr w10, w9, #2
@@ -86,7 +86,7 @@ define i64 @cnt64_advsimd(i64 %x) nounwind readnone {
; CHECK-NONEON-LABEL: cnt64_advsimd:
; CHECK-NONEON: // %bb.0:
; CHECK-NONEON-NEXT: lsr x9, x0, #1
-; CHECK-NONEON-NEXT: mov x8, #72340172838076673
+; CHECK-NONEON-NEXT: mov x8, #72340172838076673 // =0x101010101010101
; CHECK-NONEON-NEXT: and x9, x9, #0x5555555555555555
; CHECK-NONEON-NEXT: sub x9, x0, x9
; CHECK-NONEON-NEXT: lsr x10, x9, #2
@@ -114,7 +114,7 @@ define i32 @cnt32(i32 %x) nounwind readnone noimplicitfloat {
; CHECK-LABEL: cnt32:
; CHECK: // %bb.0:
; CHECK-NEXT: lsr w9, w0, #1
-; CHECK-NEXT: mov w8, #16843009
+; CHECK-NEXT: mov w8, #16843009 // =0x1010101
; CHECK-NEXT: and w9, w9, #0x55555555
; CHECK-NEXT: sub w9, w0, w9
; CHECK-NEXT: lsr w10, w9, #2
@@ -130,7 +130,7 @@ define i32 @cnt32(i32 %x) nounwind readnone noimplicitfloat {
; CHECK-NONEON-LABEL: cnt32:
; CHECK-NONEON: // %bb.0:
; CHECK-NONEON-NEXT: lsr w9, w0, #1
-; CHECK-NONEON-NEXT: mov w8, #16843009
+; CHECK-NONEON-NEXT: mov w8, #16843009 // =0x1010101
; CHECK-NONEON-NEXT: and w9, w9, #0x55555555
; CHECK-NONEON-NEXT: sub w9, w0, w9
; CHECK-NONEON-NEXT: lsr w10, w9, #2
@@ -155,7 +155,7 @@ define i64 @cnt64(i64 %x) nounwind readnone noimplicitfloat {
; CHECK-LABEL: cnt64:
; CHECK: // %bb.0:
; CHECK-NEXT: lsr x9, x0, #1
-; CHECK-NEXT: mov x8, #72340172838076673
+; CHECK-NEXT: mov x8, #72340172838076673 // =0x101010101010101
; CHECK-NEXT: and x9, x9, #0x5555555555555555
; CHECK-NEXT: sub x9, x0, x9
; CHECK-NEXT: lsr x10, x9, #2
@@ -171,7 +171,7 @@ define i64 @cnt64(i64 %x) nounwind readnone noimplicitfloat {
; CHECK-NONEON-LABEL: cnt64:
; CHECK-NONEON: // %bb.0:
; CHECK-NONEON-NEXT: lsr x9, x0, #1
-; CHECK-NONEON-NEXT: mov x8, #72340172838076673
+; CHECK-NONEON-NEXT: mov x8, #72340172838076673 // =0x101010101010101
; CHECK-NONEON-NEXT: and x9, x9, #0x5555555555555555
; CHECK-NONEON-NEXT: sub x9, x0, x9
; CHECK-NONEON-NEXT: lsr x10, x9, #2
@@ -278,5 +278,59 @@ define i1 @ctpop32_ne_one(i32 %x) nounwind readnone {
ret i1 %cmp
}
+define i1 @ctpop32_eq_one_nonzero(i32 %x) {
+; CHECK-LABEL: ctpop32_eq_one_nonzero:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub w8, w0, #1
+; CHECK-NEXT: tst w0, w8
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+;
+; CHECK-NONEON-LABEL: ctpop32_eq_one_nonzero:
+; CHECK-NONEON: // %bb.0: // %entry
+; CHECK-NONEON-NEXT: sub w8, w0, #1
+; CHECK-NONEON-NEXT: tst w0, w8
+; CHECK-NONEON-NEXT: cset w0, eq
+; CHECK-NONEON-NEXT: ret
+;
+; CHECK-CSSC-LABEL: ctpop32_eq_one_nonzero:
+; CHECK-CSSC: // %bb.0: // %entry
+; CHECK-CSSC-NEXT: sub w8, w0, #1
+; CHECK-CSSC-NEXT: tst w0, w8
+; CHECK-CSSC-NEXT: cset w0, eq
+; CHECK-CSSC-NEXT: ret
+entry:
+ %popcnt = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x)
+ %cmp = icmp eq i32 %popcnt, 1
+ ret i1 %cmp
+}
+
+define i1 @ctpop32_ne_one_nonzero(i32 %x) {
+; CHECK-LABEL: ctpop32_ne_one_nonzero:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub w8, w0, #1
+; CHECK-NEXT: tst w0, w8
+; CHECK-NEXT: cset w0, ne
+; CHECK-NEXT: ret
+;
+; CHECK-NONEON-LABEL: ctpop32_ne_one_nonzero:
+; CHECK-NONEON: // %bb.0: // %entry
+; CHECK-NONEON-NEXT: sub w8, w0, #1
+; CHECK-NONEON-NEXT: tst w0, w8
+; CHECK-NONEON-NEXT: cset w0, ne
+; CHECK-NONEON-NEXT: ret
+;
+; CHECK-CSSC-LABEL: ctpop32_ne_one_nonzero:
+; CHECK-CSSC: // %bb.0: // %entry
+; CHECK-CSSC-NEXT: sub w8, w0, #1
+; CHECK-CSSC-NEXT: tst w0, w8
+; CHECK-CSSC-NEXT: cset w0, ne
+; CHECK-CSSC-NEXT: ret
+entry:
+ %popcnt = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x)
+ %cmp = icmp ne i32 %popcnt, 1
+ ret i1 %cmp
+}
+
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll
index e24b1b41645cdf..4c52047b928f4d 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll
@@ -1441,3 +1441,42 @@ define i32 @srai_slli2(i16 signext %0) {
%3 = sext i16 %sext to i32
ret i32 %3
}
+
+define i1 @ctpop32_eq_one_nonzero(i32 %x) {
+; RV32I-LABEL: ctpop32_eq_one_nonzero:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: addi a1, a0, -1
+; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: seqz a0, a0
+; RV32I-NEXT: ret
+;
+; RV32ZBB-LABEL: ctpop32_eq_one_nonzero:
+; RV32ZBB: # %bb.0: # %entry
+; RV32ZBB-NEXT: cpop a0, a0
+; RV32ZBB-NEXT: sltiu a0, a0, 2
+; RV32ZBB-NEXT: ret
+entry:
+ %popcnt = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x)
+ %cmp = icmp eq i32 %popcnt, 1
+ ret i1 %cmp
+}
+
+define i1 @ctpop32_ne_one_nonzero(i32 %x) {
+; RV32I-LABEL: ctpop32_ne_one_nonzero:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: addi a1, a0, -1
+; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: snez a0, a0
+; RV32I-NEXT: ret
+;
+; RV32ZBB-LABEL: ctpop32_ne_one_nonzero:
+; RV32ZBB: # %bb.0: # %entry
+; RV32ZBB-NEXT: cpop a0, a0
+; RV32ZBB-NEXT: sltiu a0, a0, 2
+; RV32ZBB-NEXT: xori a0, a0, 1
+; RV32ZBB-NEXT: ret
+entry:
+ %popcnt = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x)
+ %cmp = icmp ne i32 %popcnt, 1
+ ret i1 %cmp
+}
diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll
index 43a499806ab5ae..1e7814d588e4c0 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll
@@ -1618,3 +1618,84 @@ entry:
%5 = add nsw i32 %4, %0
ret i32 %5
}
+
+define i1 @ctpop32_eq_one_nonzero(i32 %x) {
+; RV64I-LABEL: ctpop32_eq_one_nonzero:
+; RV64I: # %bb.0: # %entry
+; RV64I-NEXT: addi a1, a0, -1
+; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: sext.w a0, a0
+; RV64I-NEXT: seqz a0, a0
+; RV64I-NEXT: ret
+;
+; RV64ZBB-LABEL: ctpop32_eq_one_nonzero:
+; RV64ZBB: # %bb.0: # %entry
+; RV64ZBB-NEXT: cpopw a0, a0
+; RV64ZBB-NEXT: sltiu a0, a0, 2
+; RV64ZBB-NEXT: ret
+entry:
+ %popcnt = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x)
+ %cmp = icmp eq i32 %popcnt, 1
+ ret i1 %cmp
+}
+
+define i1 @ctpop32_ne_one_nonzero(i32 %x) {
+; RV64I-LABEL: ctpop32_ne_one_nonzero:
+; RV64I: # %bb.0: # %entry
+; RV64I-NEXT: addi a1, a0, -1
+; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: sext.w a0, a0
+; RV64I-NEXT: snez a0, a0
+; RV64I-NEXT: ret
+;
+; RV64ZBB-LABEL: ctpop32_ne_one_nonzero:
+; RV64ZBB: # %bb.0: # %entry
+; RV64ZBB-NEXT: cpopw a0, a0
+; RV64ZBB-NEXT: sltiu a0, a0, 2
+; RV64ZBB-NEXT: xori a0, a0, 1
+; RV64ZBB-NEXT: ret
+entry:
+ %popcnt = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x)
+ %cmp = icmp ne i32 %popcnt, 1
+ ret i1 %cmp
+}
+
+define i1 @ctpop64_eq_one_nonzero(i64 %x) {
+; RV64I-LABEL: ctpop64_eq_one_nonzero:
+; RV64I: # %bb.0: # %entry
+; RV64I-NEXT: addi a1, a0, -1
+; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: seqz a0, a0
+; RV64I-NEXT: ret
+;
+; RV64ZBB-LABEL: ctpop64_eq_one_nonzero:
+; RV64ZBB: # %bb.0: # %entry
+; RV64ZBB-NEXT: cpop a0, a0
+; RV64ZBB-NEXT: sltiu a0, a0, 2
+; RV64ZBB-NEXT: ret
+entry:
+ %popcnt = call range(i64 1, 65) i64 @llvm.ctpop.i64(i64 %x)
+ %cmp = icmp eq i64 %popcnt, 1
+ ret i1 %cmp
+}
+
+define i1 @ctpop32_eq_one_maybezero(i32 %x) {
+; RV64I-LABEL: ctpop32_eq_one_maybezero:
+; RV64I: # %bb.0: # %entry
+; RV64I-NEXT: addiw a1, a0, -1
+; RV64I-NEXT: xor a0, a0, a1
+; RV64I-NEXT: sext.w a0, a0
+; RV64I-NEXT: sltu a0, a1, a0
+; RV64I-NEXT: ret
+;
+; RV64ZBB-LABEL: ctpop32_eq_one_maybezero:
+; RV64ZBB: # %bb.0: # %entry
+; RV64ZBB-NEXT: cpopw a0, a0
+; RV64ZBB-NEXT: addi a0, a0, -1
+; RV64ZBB-NEXT: seqz a0, a0
+; RV64ZBB-NEXT: ret
+entry:
+ %popcnt = call range(i32 0, 16) i32 @llvm.ctpop.i32(i32 %x)
+ %cmp = icmp eq i32 %popcnt, 1
+ ret i1 %cmp
+}
diff --git a/llvm/test/CodeGen/X86/ispow2.ll b/llvm/test/CodeGen/X86/ispow2.ll
index 8723432de8b6b0..649d257b28d762 100644
--- a/llvm/test/CodeGen/X86/ispow2.ll
+++ b/llvm/test/CodeGen/X86/ispow2.ll
@@ -102,7 +102,7 @@ define <4 x i1> @is_pow2_non_zero_4xv64(<4 x i64> %xin) {
; CHECK-AVX512: # %bb.0:
; CHECK-AVX512-NEXT: vporq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
; CHECK-AVX512-NEXT: vpopcntq %ymm0, %ymm0
-; CHECK-AVX512-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1
+; CHECK-AVX512-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1
; CHECK-AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; CHECK-AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; CHECK-AVX512-NEXT: vzeroupper
@@ -155,7 +155,7 @@ define <4 x i1> @neither_pow2_non_zero_4xv64(<4 x i64> %xin) {
; CHECK-AVX512: # %bb.0:
; CHECK-AVX512-NEXT: vporq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
; CHECK-AVX512-NEXT: vpopcntq %ymm0, %ymm0
-; CHECK-AVX512-NEXT: vpcmpneqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1
+; CHECK-AVX512-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1
; CHECK-AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; CHECK-AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; CHECK-AVX512-NEXT: vzeroupper
@@ -220,3 +220,44 @@ define <4 x i1> @neither_pow2_non_zero_4xv64_x_maybe_z(<4 x i64> %x) {
%r = icmp ne <4 x i64> %cnt, <i64 1, i64 1, i64 1, i64 1>
ret <4 x i1> %r
}
+
+
+define i1 @ctpop32_eq_one_nonzero(i32 %x) {
+; CHECK-NOBMI-LABEL: ctpop32_eq_one_nonzero:
+; CHECK-NOBMI: # %bb.0: # %entry
+; CHECK-NOBMI-NEXT: # kill: def $edi killed $edi def $rdi
+; CHECK-NOBMI-NEXT: leal -1(%rdi), %eax
+; CHECK-NOBMI-NEXT: testl %eax, %edi
+; CHECK-NOBMI-NEXT: sete %al
+; CHECK-NOBMI-NEXT: retq
+;
+; CHECK-BMI2-LABEL: ctpop32_eq_one_nonzero:
+; CHECK-BMI2: # %bb.0: # %entry
+; CHECK-BMI2-NEXT: blsrl %edi, %eax
+; CHECK-BMI2-NEXT: sete %al
+; CHECK-BMI2-NEXT: retq
+entry:
+ %popcnt = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x)
+ %cmp = icmp eq i32 %popcnt, 1
+ ret i1 %cmp
+}
+
+define i1 @ctpop32_ne_one_nonzero(i32 %x) {
+; CHECK-NOBMI-LABEL: ctpop32_ne_one_nonzero:
+; CHECK-NOBMI: # %bb.0: # %entry
+; CHECK-NOBMI-NEXT: # kill: def $edi killed $edi def $rdi
+; CHECK-NOBMI-NEXT: leal -1(%rdi), %eax
+; CHECK-NOBMI-NEXT: testl %eax, %edi
+; CHECK-NOBMI-NEXT: setne %al
+; CHECK-NOBMI-NEXT: retq
+;
+; CHECK-BMI2-LABEL: ctpop32_ne_one_nonzero:
+; CHECK-BMI2: # %bb.0: # %entry
+; CHECK-BMI2-NEXT: blsrl %edi, %eax
+; CHECK-BMI2-NEXT: setne %al
+; CHECK-BMI2-NEXT: retq
+entry:
+ %popcnt = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x)
+ %cmp = icmp ne i32 %popcnt, 1
+ ret i1 %cmp
+}
diff --git a/llvm/test/CodeGen/X86/known-never-zero.ll b/llvm/test/CodeGen/X86/known-never-zero.ll
index ac41a3fe6bb7e4..6c0aaeb451e14a 100644
--- a/llvm/test/CodeGen/X86/known-never-zero.ll
+++ b/llvm/test/CodeGen/X86/known-never-zero.ll
@@ -555,9 +555,9 @@ define <4 x i32> @smax_known_zero_vec(<4 x i32> %x, <4 x i32> %y) {
; X86-NEXT: por %xmm2, %xmm0
; X86-NEXT: pcmpeqd %xmm1, %xmm1
; X86-NEXT: paddd %xmm0, %xmm1
-; X86-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-NEXT: pxor %xmm1, %xmm0
-; X86-NEXT: pcmpgtd %xmm1, %xmm0
+; X86-NEXT: pand %xmm1, %xmm0
+; X86-NEXT: pxor %xmm1, %xmm1
+; X86-NEXT: pcmpeqd %xmm1, %xmm0
; X86-NEXT: psrld $31, %xmm0
; X86-NEXT: retl
;
@@ -566,10 +566,10 @@ define <4 x i32> @smax_known_zero_vec(<4 x i32> %x, <4 x i32> %y) {
; X64-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; X64-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; X64-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; X64-NEXT: vpminud %xmm1, %xmm0, %xmm1
+; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; X64-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vpsrld $31, %xmm0, %xmm0
; X64-NEXT: retq
%z = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %x, <4 x i32> <i32 54, i32 23, i32 12, i32 1>)
%r = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %z)
|
@@ -155,7 +155,7 @@ define <4 x i1> @neither_pow2_non_zero_4xv64(<4 x i64> %xin) { | |||
; CHECK-AVX512: # %bb.0: | |||
; CHECK-AVX512-NEXT: vporq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 | |||
; CHECK-AVX512-NEXT: vpopcntq %ymm0, %ymm0 | |||
; CHECK-AVX512-NEXT: vpcmpneqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1 | |||
; CHECK-AVX512-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just noticing: this is probably not profitable for vectors on x86-64, particularly if there is no AVX512.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you explain it further? I have checked the Intel Intrinsics Guide. Both `vpcmpneqq` and `vpcmpgtq` take 3 cycles.
I have decided to block this PR until #112078 is fixed.
…/u> 2/1` (llvm#111284)` (llvm#111998) Relands llvm#111284. Test failure with stage2 build has been fixed by llvm#111946. Some targets have better codegen for `ctpop(X) u< 2` than `ctpop(X) == 1`. After llvm#100899, we set the range of ctpop's return value to indicate the argument/result is non-zero. This patch converts `ctpop(X) ==/!= 1` into `ctpop(X) u</u> 2/1` in CGP to fix llvm#95255.
…/u> 2/1` (llvm#111284)` (llvm#111998) Relands llvm#111284. Test failure with stage2 build has been fixed by llvm#111946. Some targets have better codegen for `ctpop(X) u< 2` than `ctpop(X) == 1`. After llvm#100899, we set the range of ctpop's return value to indicate the argument/result is non-zero. This patch converts `ctpop(X) ==/!= 1` into `ctpop(X) u</u> 2/1` in CGP to fix llvm#95255.
…/u> 2/1` (llvm#111284)` (llvm#111998) Relands llvm#111284. Test failure with stage2 build has been fixed by llvm#111946. Some targets have better codegen for `ctpop(X) u< 2` than `ctpop(X) == 1`. After llvm#100899, we set the range of ctpop's return value to indicate the argument/result is non-zero. This patch converts `ctpop(X) ==/!= 1` into `ctpop(X) u</u> 2/1` in CGP to fix llvm#95255.
Relands #111284. Test failure with stage2 build has been fixed by #111946.
Some targets have better codegen for `ctpop(X) u< 2` than `ctpop(X) == 1`. After #100899, we set the range of ctpop's return value to indicate the argument/result is non-zero.
This patch converts `ctpop(X) ==/!= 1` into `ctpop(X) u</u> 2/1` in CGP to fix #95255.