From bda93eeb690e2888aeed1051f1b10a95875d5049 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 22 Aug 2024 11:11:00 +0100 Subject: [PATCH] [X86] Allow speculative BSR/BSF instructions on targets with CMOV (#102885) Currently targets without LZCNT/TZCNT won't speculate with BSR/BSF instructions in case they have a zero value input, meaning we always insert a test+branch for the zero-input case. This patch proposes we allow speculation if the target has CMOV, and perform a branchless select instead to handle the zero input case. This will predominately help x86-64 targets where we haven't set any particular cpu target. We already always perform BSR/BSF instructions if we were lowering a CTLZ/CTTZ_ZERO_UNDEF instruction. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 4 +- .../lib/Target/X86/X86TargetTransformInfo.cpp | 10 +- .../Analysis/CostModel/X86/ctlz-codesize.ll | 8 +- .../CostModel/X86/ctlz-sizelatency.ll | 8 +- llvm/test/Analysis/CostModel/X86/ctlz.ll | 8 +- .../Analysis/CostModel/X86/cttz-codesize.ll | 2 +- .../CostModel/X86/cttz-sizelatency.ll | 2 +- llvm/test/CodeGen/X86/atomic-bit-test.ll | 1 - llvm/test/CodeGen/X86/bit_ceil.ll | 53 +-- llvm/test/CodeGen/X86/combine-or.ll | 47 ++- llvm/test/CodeGen/X86/ctlo.ll | 161 ++++++---- llvm/test/CodeGen/X86/ctlz.ll | 304 +++++++++--------- llvm/test/CodeGen/X86/cttz.ll | 37 ++- llvm/test/CodeGen/X86/known-never-zero.ll | 269 +++++----------- llvm/test/CodeGen/X86/lzcnt-cmp.ll | 52 +-- llvm/test/CodeGen/X86/pr57673.ll | 50 +-- llvm/test/CodeGen/X86/pr89877.ll | 8 +- llvm/test/CodeGen/X86/pr92569.ll | 16 +- .../CodeGenPrepare/X86/cttz-ctlz.ll | 80 ++--- .../test/Transforms/SLPVectorizer/X86/ctlz.ll | 78 ++++- 20 files changed, 516 insertions(+), 682 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index da5ea50f80ce04c..97775ce40aee4f9 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -3239,14 +3239,14 @@ bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT, bool X86TargetLowering::isCheapToSpeculateCttz(Type *Ty) const { // Speculate cttz only if we can directly use TZCNT or can promote to i32/i64. - return Subtarget.hasBMI() || + return Subtarget.hasBMI() || Subtarget.canUseCMOV() || (!Ty->isVectorTy() && Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u)); } bool X86TargetLowering::isCheapToSpeculateCtlz(Type *Ty) const { // Speculate ctlz only if we can directly use LZCNT. - return Subtarget.hasLZCNT(); + return Subtarget.hasLZCNT() || Subtarget.canUseCMOV(); } bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const { diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 9a11c33386fd0b9..cb9ee64a677a7eb 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -4210,9 +4210,9 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, { ISD::ABS, MVT::i64, { 1, 2, 3, 3 } }, // SUB+CMOV { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } }, { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } }, - { ISD::CTLZ, MVT::i64, { 3, 2, 6, 6 } }, // BSR+XOR or BSR+XOR+CMOV + { ISD::CTLZ, MVT::i64, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 2, 2 } }, // BSR+XOR - { ISD::CTTZ, MVT::i64, { 2, 2, 5, 5 } }, // TEST+BSF+CMOV/BRANCH + { ISD::CTTZ, MVT::i64, { 2, 2, 3, 4 } }, // TEST+BSF+CMOV/BRANCH { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 1, 2 } }, // BSF { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } }, { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } }, @@ -4241,9 +4241,9 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, { ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } }, { ISD::BSWAP, MVT::i32, { 1, 1, 1, 1 } }, { ISD::BSWAP, MVT::i16, { 1, 2, 1, 2 } }, // ROL - { ISD::CTLZ, MVT::i32, { 3, 2, 6, 6 } }, // BSR+XOR or BSR+XOR+CMOV - { ISD::CTLZ, MVT::i16, { 3, 2, 6, 6 } }, // BSR+XOR or BSR+XOR+CMOV - { ISD::CTLZ, MVT::i8, { 3, 2, 7, 7 } }, // BSR+XOR or BSR+XOR+CMOV + { ISD::CTLZ, MVT::i32, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV + { ISD::CTLZ, MVT::i16, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV + { ISD::CTLZ, MVT::i8, { 2, 2, 5, 6 } }, // BSR+XOR or BSR+XOR+CMOV { ISD::CTLZ_ZERO_UNDEF, MVT::i32,{ 1, 2, 2, 2 } }, // BSR+XOR { ISD::CTLZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 2, 2 } }, // BSR+XOR { ISD::CTLZ_ZERO_UNDEF, MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR diff --git a/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll b/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll index ae0f1a3cfad307e..da0f71c63ef80ed 100644 --- a/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll +++ b/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll @@ -17,7 +17,7 @@ declare i8 @llvm.ctlz.i8(i8, i1) define i64 @var_ctlz_i64(i64 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i64' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i64' @@ -43,7 +43,7 @@ define i64 @var_ctlz_i64u(i64 %a) { define i32 @var_ctlz_i32(i32 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i32' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i32' @@ -69,7 +69,7 @@ define i32 @var_ctlz_i32u(i32 %a) { define i16 @var_ctlz_i16(i16 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i16' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i16' @@ -95,7 +95,7 @@ define i16 @var_ctlz_i16u(i16 %a) { define i8 @var_ctlz_i8(i8 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i8' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i8' diff --git a/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll index 8c6c3228d8fc6e9..2425e7286265b06 100644 --- a/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll +++ b/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll @@ -17,7 +17,7 @@ declare i8 @llvm.ctlz.i8(i8, i1) define i64 @var_ctlz_i64(i64 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i64' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i64' @@ -43,7 +43,7 @@ define i64 @var_ctlz_i64u(i64 %a) { define i32 @var_ctlz_i32(i32 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i32' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i32' @@ -69,7 +69,7 @@ define i32 @var_ctlz_i32u(i32 %a) { define i16 @var_ctlz_i16(i16 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i16' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i16' @@ -95,7 +95,7 @@ define i16 @var_ctlz_i16u(i16 %a) { define i8 @var_ctlz_i8(i8 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i8' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i8' diff --git a/llvm/test/Analysis/CostModel/X86/ctlz.ll b/llvm/test/Analysis/CostModel/X86/ctlz.ll index 99e682b8e17826e..fa7982ce09e9cef 100644 --- a/llvm/test/Analysis/CostModel/X86/ctlz.ll +++ b/llvm/test/Analysis/CostModel/X86/ctlz.ll @@ -17,7 +17,7 @@ declare i8 @llvm.ctlz.i8(i8, i1) define i64 @var_ctlz_i64(i64 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i64' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i64' @@ -43,7 +43,7 @@ define i64 @var_ctlz_i64u(i64 %a) { define i32 @var_ctlz_i32(i32 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i32' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i32' @@ -69,7 +69,7 @@ define i32 @var_ctlz_i32u(i32 %a) { define i16 @var_ctlz_i16(i16 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i16' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i16' @@ -95,7 +95,7 @@ define i16 @var_ctlz_i16u(i16 %a) { define i8 @var_ctlz_i8(i8 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i8' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i8' diff --git a/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll b/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll index 1d40debb7ab8166..07bf1dd7a2ff6cb 100644 --- a/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll +++ b/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll @@ -18,7 +18,7 @@ declare i8 @llvm.cttz.i8(i8, i1) define i64 @var_cttz_i64(i64 %a) { ; NOBMI-LABEL: 'var_cttz_i64' -; NOBMI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false) +; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false) ; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %cttz ; ; BMI-LABEL: 'var_cttz_i64' diff --git a/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll index 351e863f1320674..afe5cb8c55fe653 100644 --- a/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll +++ b/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll @@ -18,7 +18,7 @@ declare i8 @llvm.cttz.i8(i8, i1) define i64 @var_cttz_i64(i64 %a) { ; NOBMI-LABEL: 'var_cttz_i64' -; NOBMI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false) +; NOBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false) ; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %cttz ; ; BMI-LABEL: 'var_cttz_i64' diff --git a/llvm/test/CodeGen/X86/atomic-bit-test.ll b/llvm/test/CodeGen/X86/atomic-bit-test.ll index f39c4b5e620d0e8..10b6605c3fb05e4 100644 --- a/llvm/test/CodeGen/X86/atomic-bit-test.ll +++ b/llvm/test/CodeGen/X86/atomic-bit-test.ll @@ -582,7 +582,6 @@ define i32 @split_hoist_and(i32 %0) nounwind { ; X64-NEXT: lock btsl $3, v32(%rip) ; X64-NEXT: setb %al ; X64-NEXT: shll $3, %eax -; X64-NEXT: testl %edi, %edi ; X64-NEXT: retq %2 = atomicrmw or ptr @v32, i32 8 monotonic, align 4 %3 = tail call i32 @llvm.ctlz.i32(i32 %0, i1 false) diff --git a/llvm/test/CodeGen/X86/bit_ceil.ll b/llvm/test/CodeGen/X86/bit_ceil.ll index 4641c114238f8f9..823453087f6180e 100644 --- a/llvm/test/CodeGen/X86/bit_ceil.ll +++ b/llvm/test/CodeGen/X86/bit_ceil.ll @@ -8,16 +8,12 @@ define i32 @bit_ceil_i32(i32 %x) { ; NOBMI-LABEL: bit_ceil_i32: ; NOBMI: # %bb.0: -; NOBMI-NEXT: movl %edi, %eax -; NOBMI-NEXT: decl %eax -; NOBMI-NEXT: je .LBB0_1 -; NOBMI-NEXT: # %bb.2: # %cond.false -; NOBMI-NEXT: bsrl %eax, %ecx +; NOBMI-NEXT: # kill: def $edi killed $edi def $rdi +; NOBMI-NEXT: leal -1(%rdi), %eax +; NOBMI-NEXT: bsrl %eax, %eax +; NOBMI-NEXT: movl $63, %ecx +; NOBMI-NEXT: cmovnel %eax, %ecx ; NOBMI-NEXT: xorl $31, %ecx -; NOBMI-NEXT: jmp .LBB0_3 -; NOBMI-NEXT: .LBB0_1: -; NOBMI-NEXT: movl $32, %ecx -; NOBMI-NEXT: .LBB0_3: # %cond.end ; NOBMI-NEXT: negb %cl ; NOBMI-NEXT: movl $1, %edx ; NOBMI-NEXT: movl $1, %eax @@ -51,15 +47,10 @@ define i32 @bit_ceil_i32(i32 %x) { define i32 @bit_ceil_i32_plus1(i32 noundef %x) { ; NOBMI-LABEL: bit_ceil_i32_plus1: ; NOBMI: # %bb.0: # %entry -; NOBMI-NEXT: testl %edi, %edi -; NOBMI-NEXT: je .LBB1_1 -; NOBMI-NEXT: # %bb.2: # %cond.false -; NOBMI-NEXT: bsrl %edi, %ecx +; NOBMI-NEXT: bsrl %edi, %eax +; NOBMI-NEXT: movl $63, %ecx +; NOBMI-NEXT: cmovnel %eax, %ecx ; NOBMI-NEXT: xorl $31, %ecx -; NOBMI-NEXT: jmp .LBB1_3 -; NOBMI-NEXT: .LBB1_1: -; NOBMI-NEXT: movl $32, %ecx -; NOBMI-NEXT: .LBB1_3: # %cond.end ; NOBMI-NEXT: negb %cl ; NOBMI-NEXT: movl $1, %edx ; NOBMI-NEXT: movl $1, %eax @@ -94,16 +85,11 @@ entry: define i64 @bit_ceil_i64(i64 %x) { ; NOBMI-LABEL: bit_ceil_i64: ; NOBMI: # %bb.0: -; NOBMI-NEXT: movq %rdi, %rax -; NOBMI-NEXT: decq %rax -; NOBMI-NEXT: je .LBB2_1 -; NOBMI-NEXT: # %bb.2: # %cond.false -; NOBMI-NEXT: bsrq %rax, %rcx -; NOBMI-NEXT: xorq $63, %rcx -; NOBMI-NEXT: jmp .LBB2_3 -; NOBMI-NEXT: .LBB2_1: -; NOBMI-NEXT: movl $64, %ecx -; NOBMI-NEXT: .LBB2_3: # %cond.end +; NOBMI-NEXT: leaq -1(%rdi), %rax +; NOBMI-NEXT: bsrq %rax, %rax +; NOBMI-NEXT: movl $127, %ecx +; NOBMI-NEXT: cmovneq %rax, %rcx +; NOBMI-NEXT: xorl $63, %ecx ; NOBMI-NEXT: negb %cl ; NOBMI-NEXT: movl $1, %edx ; NOBMI-NEXT: movl $1, %eax @@ -136,15 +122,10 @@ define i64 @bit_ceil_i64(i64 %x) { define i64 @bit_ceil_i64_plus1(i64 noundef %x) { ; NOBMI-LABEL: bit_ceil_i64_plus1: ; NOBMI: # %bb.0: # %entry -; NOBMI-NEXT: testq %rdi, %rdi -; NOBMI-NEXT: je .LBB3_1 -; NOBMI-NEXT: # %bb.2: # %cond.false -; NOBMI-NEXT: bsrq %rdi, %rcx -; NOBMI-NEXT: xorq $63, %rcx -; NOBMI-NEXT: jmp .LBB3_3 -; NOBMI-NEXT: .LBB3_1: -; NOBMI-NEXT: movl $64, %ecx -; NOBMI-NEXT: .LBB3_3: # %cond.end +; NOBMI-NEXT: bsrq %rdi, %rax +; NOBMI-NEXT: movl $127, %ecx +; NOBMI-NEXT: cmovneq %rax, %rcx +; NOBMI-NEXT: xorl $63, %ecx ; NOBMI-NEXT: negb %cl ; NOBMI-NEXT: movl $1, %edx ; NOBMI-NEXT: movl $1, %eax diff --git a/llvm/test/CodeGen/X86/combine-or.ll b/llvm/test/CodeGen/X86/combine-or.ll index 3b2102f46a297a6..4060355495eb3b5 100644 --- a/llvm/test/CodeGen/X86/combine-or.ll +++ b/llvm/test/CodeGen/X86/combine-or.ll @@ -213,21 +213,18 @@ define i64 @PR89533(<64 x i8> %a0) { ; SSE-NEXT: shll $16, %ecx ; SSE-NEXT: orl %eax, %ecx ; SSE-NEXT: pcmpeqb %xmm4, %xmm2 -; SSE-NEXT: pmovmskb %xmm2, %edx -; SSE-NEXT: xorl $65535, %edx # imm = 0xFFFF +; SSE-NEXT: pmovmskb %xmm2, %eax +; SSE-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE-NEXT: pcmpeqb %xmm4, %xmm3 -; SSE-NEXT: pmovmskb %xmm3, %eax -; SSE-NEXT: notl %eax -; SSE-NEXT: shll $16, %eax -; SSE-NEXT: orl %edx, %eax -; SSE-NEXT: shlq $32, %rax -; SSE-NEXT: orq %rcx, %rax -; SSE-NEXT: je .LBB11_2 -; SSE-NEXT: # %bb.1: # %cond.false -; SSE-NEXT: rep bsfq %rax, %rax -; SSE-NEXT: retq -; SSE-NEXT: .LBB11_2: # %cond.end +; SSE-NEXT: pmovmskb %xmm3, %edx +; SSE-NEXT: notl %edx +; SSE-NEXT: shll $16, %edx +; SSE-NEXT: orl %eax, %edx +; SSE-NEXT: shlq $32, %rdx +; SSE-NEXT: orq %rcx, %rdx +; SSE-NEXT: bsfq %rdx, %rcx ; SSE-NEXT: movl $64, %eax +; SSE-NEXT: cmovneq %rcx, %rax ; SSE-NEXT: retq ; ; AVX1-LABEL: PR89533: @@ -243,23 +240,19 @@ define i64 @PR89533(<64 x i8> %a0) { ; AVX1-NEXT: shll $16, %ecx ; AVX1-NEXT: orl %eax, %ecx ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %edx -; AVX1-NEXT: xorl $65535, %edx # imm = 0xFFFF +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: xorl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: notl %eax -; AVX1-NEXT: shll $16, %eax -; AVX1-NEXT: orl %edx, %eax -; AVX1-NEXT: shlq $32, %rax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: je .LBB11_2 -; AVX1-NEXT: # %bb.1: # %cond.false -; AVX1-NEXT: rep bsfq %rax, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; AVX1-NEXT: .LBB11_2: # %cond.end +; AVX1-NEXT: vpmovmskb %xmm0, %edx +; AVX1-NEXT: notl %edx +; AVX1-NEXT: shll $16, %edx +; AVX1-NEXT: orl %eax, %edx +; AVX1-NEXT: shlq $32, %rdx +; AVX1-NEXT: orq %rcx, %rdx +; AVX1-NEXT: bsfq %rdx, %rcx ; AVX1-NEXT: movl $64, %eax +; AVX1-NEXT: cmovneq %rcx, %rax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/ctlo.ll b/llvm/test/CodeGen/X86/ctlo.ll index bb80279e28f3d3a..f383c9a2544fca0 100644 --- a/llvm/test/CodeGen/X86/ctlo.ll +++ b/llvm/test/CodeGen/X86/ctlo.ll @@ -13,36 +13,44 @@ declare i32 @llvm.ctlz.i32(i32, i1) declare i64 @llvm.ctlz.i64(i64, i1) define i8 @ctlo_i8(i8 %x) { -; X86-LABEL: ctlo_i8: -; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorb $-1, %al -; X86-NEXT: je .LBB0_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: bsrl %eax, %eax -; X86-NEXT: xorl $7, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; X86-NEXT: .LBB0_1: -; X86-NEXT: movb $8, %al -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl +; X86-NOCMOV-LABEL: ctlo_i8: +; X86-NOCMOV: # %bb.0: +; X86-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NOCMOV-NEXT: xorb $-1, %al +; X86-NOCMOV-NEXT: je .LBB0_1 +; X86-NOCMOV-NEXT: # %bb.2: # %cond.false +; X86-NOCMOV-NEXT: movzbl %al, %eax +; X86-NOCMOV-NEXT: bsrl %eax, %eax +; X86-NOCMOV-NEXT: xorl $7, %eax +; X86-NOCMOV-NEXT: # kill: def $al killed $al killed $eax +; X86-NOCMOV-NEXT: retl +; X86-NOCMOV-NEXT: .LBB0_1: +; X86-NOCMOV-NEXT: movb $8, %al +; X86-NOCMOV-NEXT: # kill: def $al killed $al killed $eax +; X86-NOCMOV-NEXT: retl +; +; X86-CMOV-LABEL: ctlo_i8: +; X86-CMOV: # %bb.0: +; X86-CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-CMOV-NEXT: notb %al +; X86-CMOV-NEXT: movzbl %al, %eax +; X86-CMOV-NEXT: bsrl %eax, %ecx +; X86-CMOV-NEXT: movl $15, %eax +; X86-CMOV-NEXT: cmovnel %ecx, %eax +; X86-CMOV-NEXT: xorl $7, %eax +; X86-CMOV-NEXT: # kill: def $al killed $al killed $eax +; X86-CMOV-NEXT: retl ; ; X64-LABEL: ctlo_i8: ; X64: # %bb.0: -; X64-NEXT: xorb $-1, %dil -; X64-NEXT: je .LBB0_1 -; X64-NEXT: # %bb.2: # %cond.false +; X64-NEXT: notb %dil ; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: bsrl %eax, %eax +; X64-NEXT: bsrl %eax, %ecx +; X64-NEXT: movl $15, %eax +; X64-NEXT: cmovnel %ecx, %eax ; X64-NEXT: xorl $7, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq -; X64-NEXT: .LBB0_1: -; X64-NEXT: movb $8, %al -; X64-NEXT: # kill: def $al killed $al killed $eax -; X64-NEXT: retq ; ; X86-CLZ-LABEL: ctlo_i8: ; X86-CLZ: # %bb.0: @@ -111,34 +119,41 @@ define i8 @ctlo_i8_undef(i8 %x) { } define i16 @ctlo_i16(i16 %x) { -; X86-LABEL: ctlo_i16: -; X86: # %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorw $-1, %ax -; X86-NEXT: je .LBB2_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: bsrw %ax, %ax -; X86-NEXT: xorl $15, %eax -; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: retl -; X86-NEXT: .LBB2_1: -; X86-NEXT: movw $16, %ax -; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: retl +; X86-NOCMOV-LABEL: ctlo_i16: +; X86-NOCMOV: # %bb.0: +; X86-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NOCMOV-NEXT: xorw $-1, %ax +; X86-NOCMOV-NEXT: je .LBB2_1 +; X86-NOCMOV-NEXT: # %bb.2: # %cond.false +; X86-NOCMOV-NEXT: bsrw %ax, %ax +; X86-NOCMOV-NEXT: xorl $15, %eax +; X86-NOCMOV-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NOCMOV-NEXT: retl +; X86-NOCMOV-NEXT: .LBB2_1: +; X86-NOCMOV-NEXT: movw $16, %ax +; X86-NOCMOV-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NOCMOV-NEXT: retl +; +; X86-CMOV-LABEL: ctlo_i16: +; X86-CMOV: # %bb.0: +; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-CMOV-NEXT: notl %eax +; X86-CMOV-NEXT: bsrw %ax, %cx +; X86-CMOV-NEXT: movw $31, %ax +; X86-CMOV-NEXT: cmovnew %cx, %ax +; X86-CMOV-NEXT: xorl $15, %eax +; X86-CMOV-NEXT: # kill: def $ax killed $ax killed $eax +; X86-CMOV-NEXT: retl ; ; X64-LABEL: ctlo_i16: ; X64: # %bb.0: -; X64-NEXT: xorw $-1, %di -; X64-NEXT: je .LBB2_1 -; X64-NEXT: # %bb.2: # %cond.false -; X64-NEXT: bsrw %di, %ax +; X64-NEXT: notl %edi +; X64-NEXT: bsrw %di, %cx +; X64-NEXT: movw $31, %ax +; X64-NEXT: cmovnew %cx, %ax ; X64-NEXT: xorl $15, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq -; X64-NEXT: .LBB2_1: -; X64-NEXT: movw $16, %ax -; X64-NEXT: # kill: def $ax killed $ax killed $eax -; X64-NEXT: retq ; ; X86-CLZ-LABEL: ctlo_i16: ; X86-CLZ: # %bb.0: @@ -193,30 +208,37 @@ define i16 @ctlo_i16_undef(i16 %x) { } define i32 @ctlo_i32(i32 %x) { -; X86-LABEL: ctlo_i32: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl $-1, %eax -; X86-NEXT: je .LBB4_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: bsrl %eax, %eax -; X86-NEXT: xorl $31, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB4_1: -; X86-NEXT: movl $32, %eax -; X86-NEXT: retl +; X86-NOCMOV-LABEL: ctlo_i32: +; X86-NOCMOV: # %bb.0: +; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOCMOV-NEXT: xorl $-1, %eax +; X86-NOCMOV-NEXT: je .LBB4_1 +; X86-NOCMOV-NEXT: # %bb.2: # %cond.false +; X86-NOCMOV-NEXT: bsrl %eax, %eax +; X86-NOCMOV-NEXT: xorl $31, %eax +; X86-NOCMOV-NEXT: retl +; X86-NOCMOV-NEXT: .LBB4_1: +; X86-NOCMOV-NEXT: movl $32, %eax +; X86-NOCMOV-NEXT: retl +; +; X86-CMOV-LABEL: ctlo_i32: +; X86-CMOV: # %bb.0: +; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-CMOV-NEXT: notl %eax +; X86-CMOV-NEXT: bsrl %eax, %ecx +; X86-CMOV-NEXT: movl $63, %eax +; X86-CMOV-NEXT: cmovnel %ecx, %eax +; X86-CMOV-NEXT: xorl $31, %eax +; X86-CMOV-NEXT: retl ; ; X64-LABEL: ctlo_i32: ; X64: # %bb.0: -; X64-NEXT: xorl $-1, %edi -; X64-NEXT: je .LBB4_1 -; X64-NEXT: # %bb.2: # %cond.false -; X64-NEXT: bsrl %edi, %eax +; X64-NEXT: notl %edi +; X64-NEXT: bsrl %edi, %ecx +; X64-NEXT: movl $63, %eax +; X64-NEXT: cmovnel %ecx, %eax ; X64-NEXT: xorl $31, %eax ; X64-NEXT: retq -; X64-NEXT: .LBB4_1: -; X64-NEXT: movl $32, %eax -; X64-NEXT: retq ; ; X86-CLZ-LABEL: ctlo_i32: ; X86-CLZ: # %bb.0: @@ -314,15 +336,12 @@ define i64 @ctlo_i64(i64 %x) { ; ; X64-LABEL: ctlo_i64: ; X64: # %bb.0: -; X64-NEXT: xorq $-1, %rdi -; X64-NEXT: je .LBB6_1 -; X64-NEXT: # %bb.2: # %cond.false -; X64-NEXT: bsrq %rdi, %rax +; X64-NEXT: notq %rdi +; X64-NEXT: bsrq %rdi, %rcx +; X64-NEXT: movl $127, %eax +; X64-NEXT: cmovneq %rcx, %rax ; X64-NEXT: xorq $63, %rax ; X64-NEXT: retq -; X64-NEXT: .LBB6_1: -; X64-NEXT: movl $64, %eax -; X64-NEXT: retq ; ; X86-CLZ-LABEL: ctlo_i64: ; X86-CLZ: # %bb.0: diff --git a/llvm/test/CodeGen/X86/ctlz.ll b/llvm/test/CodeGen/X86/ctlz.ll index d8f83502bd849a7..6635be18b0f7a75 100644 --- a/llvm/test/CodeGen/X86/ctlz.ll +++ b/llvm/test/CodeGen/X86/ctlz.ll @@ -218,36 +218,41 @@ define i64 @ctlz_i64(i64 %x) { ; Generate a test and branch to handle zero inputs because bsr/bsf are very slow. define i8 @ctlz_i8_zero_test(i8 %n) { -; X86-LABEL: ctlz_i8_zero_test: -; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: testb %al, %al -; X86-NEXT: je .LBB4_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: bsrl %eax, %eax -; X86-NEXT: xorl $7, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; X86-NEXT: .LBB4_1: -; X86-NEXT: movb $8, %al -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl +; X86-NOCMOV-LABEL: ctlz_i8_zero_test: +; X86-NOCMOV: # %bb.0: +; X86-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NOCMOV-NEXT: testb %al, %al +; X86-NOCMOV-NEXT: je .LBB4_1 +; X86-NOCMOV-NEXT: # %bb.2: # %cond.false +; X86-NOCMOV-NEXT: movzbl %al, %eax +; X86-NOCMOV-NEXT: bsrl %eax, %eax +; X86-NOCMOV-NEXT: xorl $7, %eax +; X86-NOCMOV-NEXT: # kill: def $al killed $al killed $eax +; X86-NOCMOV-NEXT: retl +; X86-NOCMOV-NEXT: .LBB4_1: +; X86-NOCMOV-NEXT: movb $8, %al +; X86-NOCMOV-NEXT: # kill: def $al killed $al killed $eax +; X86-NOCMOV-NEXT: retl +; +; X86-CMOV-LABEL: ctlz_i8_zero_test: +; X86-CMOV: # %bb.0: +; X86-CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-CMOV-NEXT: bsrl %eax, %ecx +; X86-CMOV-NEXT: movl $15, %eax +; X86-CMOV-NEXT: cmovnel %ecx, %eax +; X86-CMOV-NEXT: xorl $7, %eax +; X86-CMOV-NEXT: # kill: def $al killed $al killed $eax +; X86-CMOV-NEXT: retl ; ; X64-LABEL: ctlz_i8_zero_test: ; X64: # %bb.0: -; X64-NEXT: testb %dil, %dil -; X64-NEXT: je .LBB4_1 -; X64-NEXT: # %bb.2: # %cond.false ; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: bsrl %eax, %eax +; X64-NEXT: bsrl %eax, %ecx +; X64-NEXT: movl $15, %eax +; X64-NEXT: cmovnel %ecx, %eax ; X64-NEXT: xorl $7, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq -; X64-NEXT: .LBB4_1: -; X64-NEXT: movb $8, %al -; X64-NEXT: # kill: def $al killed $al killed $eax -; X64-NEXT: retq ; ; X86-CLZ-LABEL: ctlz_i8_zero_test: ; X86-CLZ: # %bb.0: @@ -286,34 +291,38 @@ define i8 @ctlz_i8_zero_test(i8 %n) { ; Generate a test and branch to handle zero inputs because bsr/bsf are very slow. define i16 @ctlz_i16_zero_test(i16 %n) { -; X86-LABEL: ctlz_i16_zero_test: -; X86: # %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: testw %ax, %ax -; X86-NEXT: je .LBB5_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: bsrw %ax, %ax -; X86-NEXT: xorl $15, %eax -; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: retl -; X86-NEXT: .LBB5_1: -; X86-NEXT: movw $16, %ax -; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: retl +; X86-NOCMOV-LABEL: ctlz_i16_zero_test: +; X86-NOCMOV: # %bb.0: +; X86-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NOCMOV-NEXT: testw %ax, %ax +; X86-NOCMOV-NEXT: je .LBB5_1 +; X86-NOCMOV-NEXT: # %bb.2: # %cond.false +; X86-NOCMOV-NEXT: bsrw %ax, %ax +; X86-NOCMOV-NEXT: xorl $15, %eax +; X86-NOCMOV-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NOCMOV-NEXT: retl +; X86-NOCMOV-NEXT: .LBB5_1: +; X86-NOCMOV-NEXT: movw $16, %ax +; X86-NOCMOV-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NOCMOV-NEXT: retl +; +; X86-CMOV-LABEL: ctlz_i16_zero_test: +; X86-CMOV: # %bb.0: +; X86-CMOV-NEXT: bsrw {{[0-9]+}}(%esp), %cx +; X86-CMOV-NEXT: movw $31, %ax +; X86-CMOV-NEXT: cmovnew %cx, %ax +; X86-CMOV-NEXT: xorl $15, %eax +; X86-CMOV-NEXT: # kill: def $ax killed $ax killed $eax +; X86-CMOV-NEXT: retl ; ; X64-LABEL: ctlz_i16_zero_test: ; X64: # %bb.0: -; X64-NEXT: testw %di, %di -; X64-NEXT: je .LBB5_1 -; X64-NEXT: # %bb.2: # %cond.false -; X64-NEXT: bsrw %di, %ax +; X64-NEXT: bsrw %di, %cx +; X64-NEXT: movw $31, %ax +; X64-NEXT: cmovnew %cx, %ax ; X64-NEXT: xorl $15, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq -; X64-NEXT: .LBB5_1: -; X64-NEXT: movw $16, %ax -; X64-NEXT: # kill: def $ax killed $ax killed $eax -; X64-NEXT: retq ; ; X86-CLZ-LABEL: ctlz_i16_zero_test: ; X86-CLZ: # %bb.0: @@ -340,30 +349,34 @@ define i16 @ctlz_i16_zero_test(i16 %n) { ; Generate a test and branch to handle zero inputs because bsr/bsf are very slow. define i32 @ctlz_i32_zero_test(i32 %n) { -; X86-LABEL: ctlz_i32_zero_test: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB6_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: bsrl %eax, %eax -; X86-NEXT: xorl $31, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB6_1: -; X86-NEXT: movl $32, %eax -; X86-NEXT: retl +; X86-NOCMOV-LABEL: ctlz_i32_zero_test: +; X86-NOCMOV: # %bb.0: +; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOCMOV-NEXT: testl %eax, %eax +; X86-NOCMOV-NEXT: je .LBB6_1 +; X86-NOCMOV-NEXT: # %bb.2: # %cond.false +; X86-NOCMOV-NEXT: bsrl %eax, %eax +; X86-NOCMOV-NEXT: xorl $31, %eax +; X86-NOCMOV-NEXT: retl +; X86-NOCMOV-NEXT: .LBB6_1: +; X86-NOCMOV-NEXT: movl $32, %eax +; X86-NOCMOV-NEXT: retl +; +; X86-CMOV-LABEL: ctlz_i32_zero_test: +; X86-CMOV: # %bb.0: +; X86-CMOV-NEXT: bsrl {{[0-9]+}}(%esp), %ecx +; X86-CMOV-NEXT: movl $63, %eax +; X86-CMOV-NEXT: cmovnel %ecx, %eax +; X86-CMOV-NEXT: xorl $31, %eax +; X86-CMOV-NEXT: retl ; ; X64-LABEL: ctlz_i32_zero_test: ; X64: # %bb.0: -; X64-NEXT: testl %edi, %edi -; X64-NEXT: je .LBB6_1 -; X64-NEXT: # %bb.2: # %cond.false -; X64-NEXT: bsrl %edi, %eax +; X64-NEXT: bsrl %edi, %ecx +; X64-NEXT: movl $63, %eax +; X64-NEXT: cmovnel %ecx, %eax ; X64-NEXT: xorl $31, %eax ; X64-NEXT: retq -; X64-NEXT: .LBB6_1: -; X64-NEXT: movl $32, %eax -; X64-NEXT: retq ; ; X86-CLZ-LABEL: ctlz_i32_zero_test: ; X86-CLZ: # %bb.0: @@ -429,15 +442,11 @@ define i64 @ctlz_i64_zero_test(i64 %n) { ; ; X64-LABEL: ctlz_i64_zero_test: ; X64: # %bb.0: -; X64-NEXT: testq %rdi, %rdi -; X64-NEXT: je .LBB7_1 -; X64-NEXT: # %bb.2: # %cond.false -; X64-NEXT: bsrq %rdi, %rax +; X64-NEXT: bsrq %rdi, %rcx +; X64-NEXT: movl $127, %eax +; X64-NEXT: cmovneq %rcx, %rax ; X64-NEXT: xorq $63, %rax ; X64-NEXT: retq -; X64-NEXT: .LBB7_1: -; X64-NEXT: movl $64, %eax -; X64-NEXT: retq ; ; X86-CLZ-LABEL: ctlz_i64_zero_test: ; X86-CLZ: # %bb.0: @@ -580,33 +589,33 @@ define i32 @ctlz_bsr(i32 %n) { ; FIXME: The compare and branch are produced late in IR (by CodeGenPrepare), and ; codegen doesn't know how to combine the $32 and $31 into $63. define i32 @ctlz_bsr_zero_test(i32 %n) { -; X86-LABEL: ctlz_bsr_zero_test: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB10_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: bsrl %eax, %eax -; X86-NEXT: xorl $31, %eax -; X86-NEXT: xorl $31, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB10_1: -; X86-NEXT: movl $32, %eax -; X86-NEXT: xorl $31, %eax -; X86-NEXT: retl +; X86-NOCMOV-LABEL: ctlz_bsr_zero_test: +; X86-NOCMOV: # %bb.0: +; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOCMOV-NEXT: testl %eax, %eax +; X86-NOCMOV-NEXT: je .LBB10_1 +; X86-NOCMOV-NEXT: # %bb.2: # %cond.false +; X86-NOCMOV-NEXT: bsrl %eax, %eax +; X86-NOCMOV-NEXT: xorl $31, %eax +; X86-NOCMOV-NEXT: xorl $31, %eax +; X86-NOCMOV-NEXT: retl +; X86-NOCMOV-NEXT: .LBB10_1: +; X86-NOCMOV-NEXT: movl $32, %eax +; X86-NOCMOV-NEXT: xorl $31, %eax +; X86-NOCMOV-NEXT: retl +; +; X86-CMOV-LABEL: ctlz_bsr_zero_test: +; X86-CMOV: # %bb.0: +; X86-CMOV-NEXT: bsrl {{[0-9]+}}(%esp), %ecx +; X86-CMOV-NEXT: movl $63, %eax +; X86-CMOV-NEXT: cmovnel %ecx, %eax +; X86-CMOV-NEXT: retl ; ; X64-LABEL: ctlz_bsr_zero_test: ; X64: # %bb.0: -; X64-NEXT: testl %edi, %edi -; X64-NEXT: je .LBB10_1 -; X64-NEXT: # %bb.2: # %cond.false -; X64-NEXT: bsrl %edi, %eax -; X64-NEXT: xorl $31, %eax -; X64-NEXT: xorl $31, %eax -; X64-NEXT: retq -; X64-NEXT: .LBB10_1: -; X64-NEXT: movl $32, %eax -; X64-NEXT: xorl $31, %eax +; X64-NEXT: bsrl %edi, %ecx +; X64-NEXT: movl $63, %eax +; X64-NEXT: cmovnel %ecx, %eax ; X64-NEXT: retq ; ; X86-CLZ-LABEL: ctlz_bsr_zero_test: @@ -945,38 +954,39 @@ define i8 @ctlz_xor7_i8_true(i8 %x) { } define i8 @ctlz_xor7_i8_false(i8 %x) { -; X86-LABEL: ctlz_xor7_i8_false: -; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: testb %al, %al -; X86-NEXT: je .LBB16_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: bsrl %eax, %eax -; X86-NEXT: xorl $7, %eax -; X86-NEXT: xorb $7, %al -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; X86-NEXT: .LBB16_1: -; X86-NEXT: movb $8, %al -; X86-NEXT: xorb $7, %al -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl +; X86-NOCMOV-LABEL: ctlz_xor7_i8_false: +; X86-NOCMOV: # %bb.0: +; X86-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NOCMOV-NEXT: testb %al, %al +; X86-NOCMOV-NEXT: je .LBB16_1 +; X86-NOCMOV-NEXT: # %bb.2: # %cond.false +; X86-NOCMOV-NEXT: movzbl %al, %eax +; X86-NOCMOV-NEXT: bsrl %eax, %eax +; X86-NOCMOV-NEXT: xorl $7, %eax +; X86-NOCMOV-NEXT: xorb $7, %al +; X86-NOCMOV-NEXT: # kill: def $al killed $al killed $eax +; X86-NOCMOV-NEXT: retl +; X86-NOCMOV-NEXT: .LBB16_1: +; X86-NOCMOV-NEXT: movb $8, %al +; X86-NOCMOV-NEXT: xorb $7, %al +; X86-NOCMOV-NEXT: # kill: def $al killed $al killed $eax +; X86-NOCMOV-NEXT: retl +; +; X86-CMOV-LABEL: ctlz_xor7_i8_false: +; X86-CMOV: # %bb.0: +; X86-CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-CMOV-NEXT: bsrl %eax, %ecx +; X86-CMOV-NEXT: movl $15, %eax +; X86-CMOV-NEXT: cmovnel %ecx, %eax +; X86-CMOV-NEXT: # kill: def $al killed $al killed $eax +; X86-CMOV-NEXT: retl ; ; X64-LABEL: ctlz_xor7_i8_false: ; X64: # %bb.0: -; X64-NEXT: testb %dil, %dil -; X64-NEXT: je .LBB16_1 -; X64-NEXT: # %bb.2: # %cond.false ; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: bsrl %eax, %eax -; X64-NEXT: xorl $7, %eax -; X64-NEXT: xorb $7, %al -; X64-NEXT: # kill: def $al killed $al killed $eax -; X64-NEXT: retq -; X64-NEXT: .LBB16_1: -; X64-NEXT: movb $8, %al -; X64-NEXT: xorb $7, %al +; X64-NEXT: bsrl %eax, %ecx +; X64-NEXT: movl $15, %eax +; X64-NEXT: cmovnel %ecx, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; @@ -1060,33 +1070,33 @@ define i16 @ctlz_xor15_i16_true(i16 %x) { } define i32 @ctlz_xor31_i32_false(i32 %x) { -; X86-LABEL: ctlz_xor31_i32_false: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB18_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: bsrl %eax, %eax -; X86-NEXT: xorl $31, %eax -; X86-NEXT: xorl $31, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB18_1: -; X86-NEXT: movl $32, %eax -; X86-NEXT: xorl $31, %eax -; X86-NEXT: retl +; X86-NOCMOV-LABEL: ctlz_xor31_i32_false: +; X86-NOCMOV: # %bb.0: +; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOCMOV-NEXT: testl %eax, %eax +; X86-NOCMOV-NEXT: je .LBB18_1 +; X86-NOCMOV-NEXT: # %bb.2: # %cond.false +; X86-NOCMOV-NEXT: bsrl %eax, %eax +; X86-NOCMOV-NEXT: xorl $31, %eax +; X86-NOCMOV-NEXT: xorl $31, %eax +; X86-NOCMOV-NEXT: retl +; X86-NOCMOV-NEXT: .LBB18_1: +; X86-NOCMOV-NEXT: movl $32, %eax +; X86-NOCMOV-NEXT: xorl $31, %eax +; X86-NOCMOV-NEXT: retl +; +; X86-CMOV-LABEL: ctlz_xor31_i32_false: +; X86-CMOV: # %bb.0: +; X86-CMOV-NEXT: bsrl {{[0-9]+}}(%esp), %ecx +; X86-CMOV-NEXT: movl $63, %eax +; X86-CMOV-NEXT: cmovnel %ecx, %eax +; X86-CMOV-NEXT: retl ; ; X64-LABEL: ctlz_xor31_i32_false: ; X64: # %bb.0: -; X64-NEXT: testl %edi, %edi -; X64-NEXT: je .LBB18_1 -; X64-NEXT: # %bb.2: # %cond.false -; X64-NEXT: bsrl %edi, %eax -; X64-NEXT: xorl $31, %eax -; X64-NEXT: xorl $31, %eax -; X64-NEXT: retq -; X64-NEXT: .LBB18_1: -; X64-NEXT: movl $32, %eax -; X64-NEXT: xorl $31, %eax +; X64-NEXT: bsrl %edi, %ecx +; X64-NEXT: movl $63, %eax +; X64-NEXT: cmovnel %ecx, %eax ; X64-NEXT: retq ; ; X86-CLZ-LABEL: ctlz_xor31_i32_false: diff --git a/llvm/test/CodeGen/X86/cttz.ll b/llvm/test/CodeGen/X86/cttz.ll index b35a1b72fcb6f12..27f229b18bf057b 100644 --- a/llvm/test/CodeGen/X86/cttz.ll +++ b/llvm/test/CodeGen/X86/cttz.ll @@ -303,17 +303,24 @@ define i16 @cttz_i16_zero_test(i16 %n) { ; Generate a test and branch to handle zero inputs because bsr/bsf are very slow. define i32 @cttz_i32_zero_test(i32 %n) { -; X86-LABEL: cttz_i32_zero_test: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB6_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB6_1: -; X86-NEXT: movl $32, %eax -; X86-NEXT: retl +; X86-NOCMOV-LABEL: cttz_i32_zero_test: +; X86-NOCMOV: # %bb.0: +; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOCMOV-NEXT: testl %eax, %eax +; X86-NOCMOV-NEXT: je .LBB6_1 +; X86-NOCMOV-NEXT: # %bb.2: # %cond.false +; X86-NOCMOV-NEXT: rep bsfl %eax, %eax +; X86-NOCMOV-NEXT: retl +; X86-NOCMOV-NEXT: .LBB6_1: +; X86-NOCMOV-NEXT: movl $32, %eax +; X86-NOCMOV-NEXT: retl +; +; X86-CMOV-LABEL: cttz_i32_zero_test: +; X86-CMOV: # %bb.0: +; X86-CMOV-NEXT: bsfl {{[0-9]+}}(%esp), %ecx +; X86-CMOV-NEXT: movl $32, %eax +; X86-CMOV-NEXT: cmovnel %ecx, %eax +; X86-CMOV-NEXT: retl ; ; X64-LABEL: cttz_i32_zero_test: ; X64: # %bb.0: @@ -386,13 +393,9 @@ define i64 @cttz_i64_zero_test(i64 %n) { ; ; X64-LABEL: cttz_i64_zero_test: ; X64: # %bb.0: -; X64-NEXT: testq %rdi, %rdi -; X64-NEXT: je .LBB7_1 -; X64-NEXT: # %bb.2: # %cond.false -; X64-NEXT: rep bsfq %rdi, %rax -; X64-NEXT: retq -; X64-NEXT: .LBB7_1: +; X64-NEXT: bsfq %rdi, %rcx ; X64-NEXT: movl $64, %eax +; X64-NEXT: cmovneq %rcx, %rax ; X64-NEXT: retq ; ; X86-CLZ-LABEL: cttz_i64_zero_test: diff --git a/llvm/test/CodeGen/X86/known-never-zero.ll b/llvm/test/CodeGen/X86/known-never-zero.ll index d5d604a138a719c..ac41a3fe6bb7e44 100644 --- a/llvm/test/CodeGen/X86/known-never-zero.ll +++ b/llvm/test/CodeGen/X86/known-never-zero.ll @@ -44,12 +44,9 @@ define i32 @or_maybe_zero(i32 %x, i32 %y) { ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax -; X86-NEXT: je .LBB1_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB1_1: +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: or_maybe_zero: @@ -94,18 +91,14 @@ define i32 @select_known_nonzero(i1 %c, i32 %x) { define i32 @select_maybe_zero(i1 %c, i32 %x) { ; X86-LABEL: select_maybe_zero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: orl $1, %ecx -; X86-NEXT: xorl %eax, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl $1, %eax +; X86-NEXT: xorl %ecx, %ecx ; X86-NEXT: testb $1, {{[0-9]+}}(%esp) -; X86-NEXT: cmovnel %ecx, %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB3_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB3_1: +; X86-NEXT: cmovnel %eax, %ecx +; X86-NEXT: bsfl %ecx, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: select_maybe_zero: @@ -201,13 +194,9 @@ define i32 @shl_maybe_zero(i32 %x, i32 %y) { ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shll %cl, %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB7_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB7_1: +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: shl_maybe_zero: @@ -251,17 +240,13 @@ define i32 @uaddsat_known_nonzero(i32 %x) { define i32 @uaddsat_maybe_zero(i32 %x, i32 %y) { ; X86-LABEL: uaddsat_maybe_zero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $-1, %eax -; X86-NEXT: cmovael %ecx, %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB9_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB9_1: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: cmovael %eax, %ecx +; X86-NEXT: bsfl %ecx, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: uaddsat_maybe_zero: @@ -314,13 +299,9 @@ define i32 @umax_maybe_zero(i32 %x, i32 %y) { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: cmpl %eax, %ecx ; X86-NEXT: cmoval %ecx, %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB11_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB11_1: +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: umax_maybe_zero: @@ -372,17 +353,13 @@ define i32 @umin_known_nonzero(i32 %xx, i32 %yy) { define i32 @umin_maybe_zero(i32 %x, i32 %y) { ; X86-LABEL: umin_maybe_zero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpl $54, %ecx -; X86-NEXT: movl $54, %eax -; X86-NEXT: cmovbl %ecx, %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB13_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB13_1: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $54, %eax +; X86-NEXT: movl $54, %ecx +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: bsfl %ecx, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: umin_maybe_zero: @@ -490,17 +467,13 @@ define <4 x i32> @smin_known_zero_vec(<4 x i32> %x, <4 x i32> %y) { define i32 @smin_maybe_zero(i32 %x, i32 %y) { ; X86-LABEL: smin_maybe_zero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpl $54, %ecx -; X86-NEXT: movl $54, %eax -; X86-NEXT: cmovll %ecx, %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB17_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB17_1: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $54, %eax +; X86-NEXT: movl $54, %ecx +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: bsfl %ecx, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: smin_maybe_zero: @@ -608,17 +581,13 @@ define <4 x i32> @smax_known_zero_vec(<4 x i32> %x, <4 x i32> %y) { define i32 @smax_known_zero(i32 %x, i32 %y) { ; X86-LABEL: smax_known_zero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: testl %ecx, %ecx -; X86-NEXT: movl $-1, %eax -; X86-NEXT: cmovnsl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB21_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB21_1: +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: cmovnsl %eax, %ecx +; X86-NEXT: bsfl %ecx, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: smax_known_zero: @@ -643,14 +612,8 @@ define i32 @rotr_known_nonzero(i32 %xx, i32 %y) { ; X86-NEXT: movl $256, %eax # imm = 0x100 ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ; X86-NEXT: rorl %cl, %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB22_1 -; X86-NEXT: # %bb.2: # %cond.false ; X86-NEXT: rep bsfl %eax, %eax ; X86-NEXT: retl -; X86-NEXT: .LBB22_1: -; X86-NEXT: movl $32, %eax -; X86-NEXT: retl ; ; X64-LABEL: rotr_known_nonzero: ; X64: # %bb.0: @@ -675,13 +638,9 @@ define i32 @rotr_maybe_zero(i32 %x, i32 %y) { ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: rorl %cl, %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB23_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB23_1: +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: rotr_maybe_zero: @@ -733,13 +692,9 @@ define i32 @rotr_with_fshr_maybe_zero(i32 %x, i32 %y) { ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: rorl %cl, %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB25_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB25_1: +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: rotr_with_fshr_maybe_zero: @@ -765,14 +720,8 @@ define i32 @rotl_known_nonzero(i32 %xx, i32 %y) { ; X86-NEXT: movl $256, %eax # imm = 0x100 ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ; X86-NEXT: roll %cl, %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB26_1 -; X86-NEXT: # %bb.2: # %cond.false ; X86-NEXT: rep bsfl %eax, %eax ; X86-NEXT: retl -; X86-NEXT: .LBB26_1: -; X86-NEXT: movl $32, %eax -; X86-NEXT: retl ; ; X64-LABEL: rotl_known_nonzero: ; X64: # %bb.0: @@ -797,13 +746,9 @@ define i32 @rotl_maybe_zero(i32 %x, i32 %y) { ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: roll %cl, %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB27_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB27_1: +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: rotl_maybe_zero: @@ -855,13 +800,9 @@ define i32 @rotl_with_fshl_maybe_zero(i32 %x, i32 %y) { ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: roll %cl, %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB29_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB29_1: +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: rotl_with_fshl_maybe_zero: @@ -932,13 +873,9 @@ define i32 @sra_maybe_zero(i32 %x, i32 %y) { ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: sarl %cl, %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB32_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB32_1: +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: sra_maybe_zero: @@ -1009,13 +946,9 @@ define i32 @srl_maybe_zero(i32 %x, i32 %y) { ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shrl %cl, %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB35_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB35_1: +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: srl_maybe_zero: @@ -1064,13 +997,9 @@ define i32 @udiv_maybe_zero(i32 %x, i32 %y) { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: divl {{[0-9]+}}(%esp) -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB37_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB37_1: +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: udiv_maybe_zero: @@ -1119,13 +1048,9 @@ define i32 @sdiv_maybe_zero(i32 %x, i32 %y) { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cltd ; X86-NEXT: idivl {{[0-9]+}}(%esp) -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB39_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB39_1: +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: sdiv_maybe_zero: @@ -1171,12 +1096,9 @@ define i32 @add_maybe_zero(i32 %xx, i32 %y) { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: orl $1, %eax ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax -; X86-NEXT: je .LBB41_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB41_1: +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: add_maybe_zero: @@ -1249,16 +1171,13 @@ define i32 @sub_known_nonzero_ne_case(i32 %xx, i32 %yy) { define i32 @sub_maybe_zero(i32 %x) { ; X86-LABEL: sub_maybe_zero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: orl $64, %eax -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: je .LBB44_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB44_1: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: orl $64, %ecx +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: bsfl %ecx, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: sub_maybe_zero: @@ -1280,14 +1199,11 @@ define i32 @sub_maybe_zero(i32 %x) { define i32 @sub_maybe_zero2(i32 %x) { ; X86-LABEL: sub_maybe_zero2: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: negl %eax -; X86-NEXT: je .LBB45_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB45_1: +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: subl {{[0-9]+}}(%esp), %eax +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: sub_maybe_zero2: @@ -1310,13 +1226,9 @@ define i32 @mul_known_nonzero_nsw(i32 %x, i32 %yy) { ; X86-NEXT: movl $256, %eax # imm = 0x100 ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ; X86-NEXT: imull {{[0-9]+}}(%esp), %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB46_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB46_1: +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: mul_known_nonzero_nsw: @@ -1341,13 +1253,9 @@ define i32 @mul_known_nonzero_nuw(i32 %x, i32 %yy) { ; X86-NEXT: movl $256, %eax # imm = 0x100 ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ; X86-NEXT: imull {{[0-9]+}}(%esp), %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB47_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB47_1: +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: mul_known_nonzero_nuw: @@ -1371,13 +1279,9 @@ define i32 @mul_maybe_zero(i32 %x, i32 %y) { ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: imull {{[0-9]+}}(%esp), %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB48_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB48_1: +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: mul_maybe_zero: @@ -1433,13 +1337,9 @@ define i32 @bitcast_maybe_zero(<2 x i16> %x) { ; X86-LABEL: bitcast_maybe_zero: ; X86: # %bb.0: ; X86-NEXT: movd %xmm0, %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB50_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB50_1: +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: bitcast_maybe_zero: @@ -1458,15 +1358,9 @@ define i32 @bitcast_maybe_zero(<2 x i16> %x) { define i32 @bitcast_from_float(float %x) { ; X86-LABEL: bitcast_from_float: ; X86: # %bb.0: -; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: movd %xmm0, %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB51_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB51_1: +; X86-NEXT: bsfl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: bitcast_from_float: @@ -1511,14 +1405,9 @@ define i32 @zext_maybe_zero(i16 %x) { ; X86-LABEL: zext_maybe_zero: ; X86: # %bb.0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: testw %ax, %ax -; X86-NEXT: je .LBB53_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: movzwl %ax, %eax -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB53_1: +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: zext_maybe_zero: @@ -1563,13 +1452,9 @@ define i32 @sext_maybe_zero(i16 %x) { ; X86-LABEL: sext_maybe_zero: ; X86: # %bb.0: ; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB55_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB55_1: +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: sext_maybe_zero: diff --git a/llvm/test/CodeGen/X86/lzcnt-cmp.ll b/llvm/test/CodeGen/X86/lzcnt-cmp.ll index a9513a373661f49..4f65739cc70dd1a 100644 --- a/llvm/test/CodeGen/X86/lzcnt-cmp.ll +++ b/llvm/test/CodeGen/X86/lzcnt-cmp.ll @@ -12,27 +12,11 @@ define i1 @lshr_ctlz_cmpeq_one_i64(i64 %in) nounwind { ; X86-NEXT: sete %al ; X86-NEXT: retl ; -; X64-BSR-LABEL: lshr_ctlz_cmpeq_one_i64: -; X64-BSR: # %bb.0: -; X64-BSR-NEXT: testq %rdi, %rdi -; X64-BSR-NEXT: je .LBB0_1 -; X64-BSR-NEXT: # %bb.2: # %cond.false -; X64-BSR-NEXT: bsrq %rdi, %rax -; X64-BSR-NEXT: xorq $63, %rax -; X64-BSR-NEXT: shrl $6, %eax -; X64-BSR-NEXT: # kill: def $al killed $al killed $rax -; X64-BSR-NEXT: retq -; X64-BSR-NEXT: .LBB0_1: -; X64-BSR-NEXT: movl $64, %eax -; X64-BSR-NEXT: shrl $6, %eax -; X64-BSR-NEXT: # kill: def $al killed $al killed $rax -; X64-BSR-NEXT: retq -; -; X64-LZCNT-LABEL: lshr_ctlz_cmpeq_one_i64: -; X64-LZCNT: # %bb.0: -; X64-LZCNT-NEXT: testq %rdi, %rdi -; X64-LZCNT-NEXT: sete %al -; X64-LZCNT-NEXT: retq +; X64-LABEL: lshr_ctlz_cmpeq_one_i64: +; X64: # %bb.0: +; X64-NEXT: testq %rdi, %rdi +; X64-NEXT: sete %al +; X64-NEXT: retq %ctlz = call i64 @llvm.ctlz.i64(i64 %in, i1 0) %lshr = lshr i64 %ctlz, 6 %icmp = icmp eq i64 %lshr, 1 @@ -81,27 +65,11 @@ define i1 @lshr_ctlz_cmpne_zero_i64(i64 %in) nounwind { ; X86-NEXT: sete %al ; X86-NEXT: retl ; -; X64-BSR-LABEL: lshr_ctlz_cmpne_zero_i64: -; X64-BSR: # %bb.0: -; X64-BSR-NEXT: testq %rdi, %rdi -; X64-BSR-NEXT: je .LBB2_1 -; X64-BSR-NEXT: # %bb.2: # %cond.false -; X64-BSR-NEXT: bsrq %rdi, %rax -; X64-BSR-NEXT: xorq $63, %rax -; X64-BSR-NEXT: shrl $6, %eax -; X64-BSR-NEXT: # kill: def $al killed $al killed $rax -; X64-BSR-NEXT: retq -; X64-BSR-NEXT: .LBB2_1: -; X64-BSR-NEXT: movl $64, %eax -; X64-BSR-NEXT: shrl $6, %eax -; X64-BSR-NEXT: # kill: def $al killed $al killed $rax -; X64-BSR-NEXT: retq -; -; X64-LZCNT-LABEL: lshr_ctlz_cmpne_zero_i64: -; X64-LZCNT: # %bb.0: -; X64-LZCNT-NEXT: testq %rdi, %rdi -; X64-LZCNT-NEXT: sete %al -; X64-LZCNT-NEXT: retq +; X64-LABEL: lshr_ctlz_cmpne_zero_i64: +; X64: # %bb.0: +; X64-NEXT: testq %rdi, %rdi +; X64-NEXT: sete %al +; X64-NEXT: retq %ctlz = call i64 @llvm.ctlz.i64(i64 %in, i1 0) %lshr = lshr i64 %ctlz, 6 %icmp = icmp ne i64 %lshr, 0 diff --git a/llvm/test/CodeGen/X86/pr57673.ll b/llvm/test/CodeGen/X86/pr57673.ll index d0ae6cea068dc07..cf7717f420480b5 100644 --- a/llvm/test/CodeGen/X86/pr57673.ll +++ b/llvm/test/CodeGen/X86/pr57673.ll @@ -24,35 +24,24 @@ define void @foo() { ; NORMAL-NEXT: [[COPY:%[0-9]+]]:gr8 = COPY [[MOV32r0_]].sub_8bit ; NORMAL-NEXT: [[LEA64r:%[0-9]+]]:gr64 = LEA64r %stack.1.i, 1, $noreg, 0, $noreg ; NORMAL-NEXT: [[DEF:%[0-9]+]]:gr64 = IMPLICIT_DEF - ; NORMAL-NEXT: [[DEF1:%[0-9]+]]:gr64 = IMPLICIT_DEF ; NORMAL-NEXT: {{ $}} ; NORMAL-NEXT: bb.1.bb_8: - ; NORMAL-NEXT: successors: %bb.5(0x40000000), %bb.2(0x40000000) + ; NORMAL-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) ; NORMAL-NEXT: {{ $}} ; NORMAL-NEXT: TEST8rr [[COPY]], [[COPY]], implicit-def $eflags - ; NORMAL-NEXT: JCC_1 %bb.5, 5, implicit $eflags + ; NORMAL-NEXT: JCC_1 %bb.3, 5, implicit $eflags ; NORMAL-NEXT: JMP_1 %bb.2 ; NORMAL-NEXT: {{ $}} ; NORMAL-NEXT: bb.2.bb_mid: - ; NORMAL-NEXT: successors: %bb.4(0x30000000), %bb.3(0x50000000) + ; NORMAL-NEXT: successors: %bb.3(0x80000000) ; NORMAL-NEXT: {{ $}} - ; NORMAL-NEXT: TEST64rr [[DEF1]], [[DEF1]], implicit-def $eflags - ; NORMAL-NEXT: JCC_1 %bb.4, 4, implicit $eflags - ; NORMAL-NEXT: JMP_1 %bb.3 - ; NORMAL-NEXT: {{ $}} - ; NORMAL-NEXT: bb.3.cond.false: - ; NORMAL-NEXT: successors: %bb.4(0x80000000) - ; NORMAL-NEXT: {{ $}} - ; NORMAL-NEXT: bb.4.cond.end: - ; NORMAL-NEXT: successors: %bb.5(0x80000000) - ; NORMAL-NEXT: {{ $}} - ; NORMAL-NEXT: [[MOVUPSrm:%[0-9]+]]:vr128 = MOVUPSrm [[LEA64r]], 1, $noreg, 40, $noreg :: (load (s128) from %ir.i4, align 8) + ; NORMAL-NEXT: [[MOVUPSrm:%[0-9]+]]:vr128 = MOVUPSrm %stack.1.i, 1, $noreg, 40, $noreg :: (load (s128) from %ir.i4, align 8) ; NORMAL-NEXT: MOVUPSmr $noreg, 1, $noreg, 0, $noreg, killed [[MOVUPSrm]] :: (store (s128) into `ptr null`, align 8) - ; NORMAL-NEXT: DBG_VALUE_LIST !3, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_plus_uconst, 40), [[LEA64r]], [[LEA64r]], debug-location !8 - ; NORMAL-NEXT: [[MOVUPSrm1:%[0-9]+]]:vr128 = MOVUPSrm [[LEA64r]], 1, $noreg, 40, $noreg :: (load (s128) from %ir.i6, align 8) + ; NORMAL-NEXT: DBG_VALUE $noreg, $noreg, !3, !DIExpression(), debug-location !8 + ; NORMAL-NEXT: [[MOVUPSrm1:%[0-9]+]]:vr128 = MOVUPSrm %stack.1.i, 1, $noreg, 40, $noreg :: (load (s128) from %ir.i6, align 8) ; NORMAL-NEXT: MOVUPSmr $noreg, 1, $noreg, 0, $noreg, killed [[MOVUPSrm1]] :: (store (s128) into `ptr null`, align 8) ; NORMAL-NEXT: {{ $}} - ; NORMAL-NEXT: bb.5.bb_last: + ; NORMAL-NEXT: bb.3.bb_last: ; NORMAL-NEXT: successors: %bb.1(0x80000000) ; NORMAL-NEXT: {{ $}} ; NORMAL-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp @@ -74,35 +63,24 @@ define void @foo() { ; INSTRREF-NEXT: [[COPY:%[0-9]+]]:gr8 = COPY [[MOV32r0_]].sub_8bit ; INSTRREF-NEXT: [[LEA64r:%[0-9]+]]:gr64 = LEA64r %stack.1.i, 1, $noreg, 0, $noreg ; INSTRREF-NEXT: [[DEF:%[0-9]+]]:gr64 = IMPLICIT_DEF - ; INSTRREF-NEXT: [[DEF1:%[0-9]+]]:gr64 = IMPLICIT_DEF ; INSTRREF-NEXT: {{ $}} ; INSTRREF-NEXT: bb.1.bb_8: - ; INSTRREF-NEXT: successors: %bb.5(0x40000000), %bb.2(0x40000000) + ; INSTRREF-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) ; INSTRREF-NEXT: {{ $}} ; INSTRREF-NEXT: TEST8rr [[COPY]], [[COPY]], implicit-def $eflags - ; INSTRREF-NEXT: JCC_1 %bb.5, 5, implicit $eflags + ; INSTRREF-NEXT: JCC_1 %bb.3, 5, implicit $eflags ; INSTRREF-NEXT: JMP_1 %bb.2 ; INSTRREF-NEXT: {{ $}} ; INSTRREF-NEXT: bb.2.bb_mid: - ; INSTRREF-NEXT: successors: %bb.4(0x30000000), %bb.3(0x50000000) - ; INSTRREF-NEXT: {{ $}} - ; INSTRREF-NEXT: TEST64rr [[DEF1]], [[DEF1]], implicit-def $eflags - ; INSTRREF-NEXT: JCC_1 %bb.4, 4, implicit $eflags - ; INSTRREF-NEXT: JMP_1 %bb.3 - ; INSTRREF-NEXT: {{ $}} - ; INSTRREF-NEXT: bb.3.cond.false: - ; INSTRREF-NEXT: successors: %bb.4(0x80000000) - ; INSTRREF-NEXT: {{ $}} - ; INSTRREF-NEXT: bb.4.cond.end: - ; INSTRREF-NEXT: successors: %bb.5(0x80000000) + ; INSTRREF-NEXT: successors: %bb.3(0x80000000) ; INSTRREF-NEXT: {{ $}} - ; INSTRREF-NEXT: [[MOVUPSrm:%[0-9]+]]:vr128 = MOVUPSrm [[LEA64r]], 1, $noreg, 40, $noreg :: (load (s128) from %ir.i4, align 8) + ; INSTRREF-NEXT: [[MOVUPSrm:%[0-9]+]]:vr128 = MOVUPSrm %stack.1.i, 1, $noreg, 40, $noreg :: (load (s128) from %ir.i4, align 8) ; INSTRREF-NEXT: MOVUPSmr $noreg, 1, $noreg, 0, $noreg, killed [[MOVUPSrm]] :: (store (s128) into `ptr null`, align 8) - ; INSTRREF-NEXT: DBG_INSTR_REF !3, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), dbg-instr-ref(1, 0), debug-location !8 - ; INSTRREF-NEXT: [[MOVUPSrm1:%[0-9]+]]:vr128 = MOVUPSrm [[LEA64r]], 1, $noreg, 40, $noreg :: (load (s128) from %ir.i6, align 8) + ; INSTRREF-NEXT: DBG_VALUE $noreg, $noreg, !3, !DIExpression(), debug-location !8 + ; INSTRREF-NEXT: [[MOVUPSrm1:%[0-9]+]]:vr128 = MOVUPSrm %stack.1.i, 1, $noreg, 40, $noreg :: (load (s128) from %ir.i6, align 8) ; INSTRREF-NEXT: MOVUPSmr $noreg, 1, $noreg, 0, $noreg, killed [[MOVUPSrm1]] :: (store (s128) into `ptr null`, align 8) ; INSTRREF-NEXT: {{ $}} - ; INSTRREF-NEXT: bb.5.bb_last: + ; INSTRREF-NEXT: bb.3.bb_last: ; INSTRREF-NEXT: successors: %bb.1(0x80000000) ; INSTRREF-NEXT: {{ $}} ; INSTRREF-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp diff --git a/llvm/test/CodeGen/X86/pr89877.ll b/llvm/test/CodeGen/X86/pr89877.ll index fdbe75b467d992a..19baad26583ada5 100644 --- a/llvm/test/CodeGen/X86/pr89877.ll +++ b/llvm/test/CodeGen/X86/pr89877.ll @@ -9,13 +9,9 @@ define i32 @sext_known_nonzero(i16 %xx) { ; X86-NEXT: movl $256, %eax # imm = 0x100 ; X86-NEXT: shll %cl, %eax ; X86-NEXT: cwtl -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB0_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB0_1: +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: sext_known_nonzero: diff --git a/llvm/test/CodeGen/X86/pr92569.ll b/llvm/test/CodeGen/X86/pr92569.ll index f91063089e3a909..0fb4ed7905287cb 100644 --- a/llvm/test/CodeGen/X86/pr92569.ll +++ b/llvm/test/CodeGen/X86/pr92569.ll @@ -4,17 +4,13 @@ define void @PR92569(i64 %arg, <8 x i8> %arg1) { ; CHECK-LABEL: PR92569: ; CHECK: # %bb.0: -; CHECK-NEXT: testq %rdi, %rdi -; CHECK-NEXT: je .LBB0_1 -; CHECK-NEXT: # %bb.2: # %cond.false -; CHECK-NEXT: rep bsfq %rdi, %rax -; CHECK-NEXT: jmp .LBB0_3 -; CHECK-NEXT: .LBB0_1: -; CHECK-NEXT: movl $64, %eax -; CHECK-NEXT: .LBB0_3: # %cond.end -; CHECK-NEXT: shrb $3, %al +; CHECK-NEXT: bsfq %rdi, %rax +; CHECK-NEXT: movl $64, %ecx +; CHECK-NEXT: cmovneq %rax, %rcx +; CHECK-NEXT: shrb $3, %cl ; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: movzbl %cl, %eax +; CHECK-NEXT: andl $15, %eax ; CHECK-NEXT: movzbl -24(%rsp,%rax), %eax ; CHECK-NEXT: movl %eax, 0 ; CHECK-NEXT: retq diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/cttz-ctlz.ll b/llvm/test/Transforms/CodeGenPrepare/X86/cttz-ctlz.ll index 06909d950addb6c..2c2923440bf7c20 100644 --- a/llvm/test/Transforms/CodeGenPrepare/X86/cttz-ctlz.ll +++ b/llvm/test/Transforms/CodeGenPrepare/X86/cttz-ctlz.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -passes='require,function(codegenprepare)' < %s | FileCheck %s --check-prefix=SLOW -; RUN: opt -S -passes='require,function(codegenprepare)' -mattr=+bmi < %s | FileCheck %s --check-prefix=FAST_TZ -; RUN: opt -S -passes='require,function(codegenprepare)' -mattr=+lzcnt < %s | FileCheck %s --check-prefix=FAST_LZ +; RUN: opt -S -passes="require,function(codegenprepare)" < %s | FileCheck %s --check-prefix=SLOW +; RUN: opt -S -passes="require,function(codegenprepare)" -mattr=+bmi < %s | FileCheck %s --check-prefix=FAST_TZ +; RUN: opt -S -passes="require,function(codegenprepare)" -mattr=+lzcnt < %s | FileCheck %s --check-prefix=FAST_LZ -; RUN: opt -S -enable-debugify -passes='require,function(codegenprepare)' < %s | FileCheck %s --check-prefix=DEBUGINFO -; RUN: opt -S -enable-debugify -passes='require,function(codegenprepare)' --try-experimental-debuginfo-iterators < %s | FileCheck %s --check-prefix=DEBUGINFO +; RUN: opt -S -enable-debugify -passes="require,function(codegenprepare)" < %s | FileCheck %s --check-prefix=DEBUGINFO +; RUN: opt -S -enable-debugify -passes="require,function(codegenprepare)" --try-experimental-debuginfo-iterators < %s | FileCheck %s --check-prefix=DEBUGINFO target triple = "x86_64-unknown-unknown" target datalayout = "e-n32:64" @@ -16,15 +16,8 @@ target datalayout = "e-n32:64" define i64 @cttz(i64 %A) { ; SLOW-LABEL: @cttz( ; SLOW-NEXT: entry: -; SLOW-NEXT: [[A_FR:%.*]] = freeze i64 [[A:%.*]] -; SLOW-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[A_FR]], 0 -; SLOW-NEXT: br i1 [[CMPZ]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]] -; SLOW: cond.false: -; SLOW-NEXT: [[Z:%.*]] = call i64 @llvm.cttz.i64(i64 [[A_FR]], i1 true) -; SLOW-NEXT: br label [[COND_END]] -; SLOW: cond.end: -; SLOW-NEXT: [[CTZ:%.*]] = phi i64 [ 64, [[ENTRY:%.*]] ], [ [[Z]], [[COND_FALSE]] ] -; SLOW-NEXT: ret i64 [[CTZ]] +; SLOW-NEXT: [[Z:%.*]] = call i64 @llvm.cttz.i64(i64 [[A:%.*]], i1 false) +; SLOW-NEXT: ret i64 [[Z]] ; ; FAST_TZ-LABEL: @cttz( ; FAST_TZ-NEXT: entry: @@ -33,28 +26,14 @@ define i64 @cttz(i64 %A) { ; ; FAST_LZ-LABEL: @cttz( ; FAST_LZ-NEXT: entry: -; FAST_LZ-NEXT: [[A_FR:%.*]] = freeze i64 [[A:%.*]] -; FAST_LZ-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[A_FR]], 0 -; FAST_LZ-NEXT: br i1 [[CMPZ]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]] -; FAST_LZ: cond.false: -; FAST_LZ-NEXT: [[Z:%.*]] = call i64 @llvm.cttz.i64(i64 [[A_FR]], i1 true) -; FAST_LZ-NEXT: br label [[COND_END]] -; FAST_LZ: cond.end: -; FAST_LZ-NEXT: [[CTZ:%.*]] = phi i64 [ 64, [[ENTRY:%.*]] ], [ [[Z]], [[COND_FALSE]] ] -; FAST_LZ-NEXT: ret i64 [[CTZ]] +; FAST_LZ-NEXT: [[Z:%.*]] = call i64 @llvm.cttz.i64(i64 [[A:%.*]], i1 false) +; FAST_LZ-NEXT: ret i64 [[Z]] ; ; DEBUGINFO-LABEL: @cttz( ; DEBUGINFO-NEXT: entry: -; DEBUGINFO-NEXT: [[A_FR:%.*]] = freeze i64 [[A:%.*]], !dbg [[DBG11:![0-9]+]] -; DEBUGINFO-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[A_FR]], 0, !dbg [[DBG11]] -; DEBUGINFO-NEXT: br i1 [[CMPZ]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG11]] -; DEBUGINFO: cond.false: -; DEBUGINFO-NEXT: [[Z:%.*]] = call i64 @llvm.cttz.i64(i64 [[A_FR]], i1 true), !dbg [[DBG11]] -; DEBUGINFO-NEXT: br label [[COND_END]], !dbg [[DBG12:![0-9]+]] -; DEBUGINFO: cond.end: -; DEBUGINFO-NEXT: [[CTZ:%.*]] = phi i64 [ 64, [[ENTRY:%.*]] ], [ [[Z]], [[COND_FALSE]] ], !dbg [[DBG12]] -; DEBUGINFO-NEXT: #dbg_value(i64 [[CTZ]], [[META9:![0-9]+]], !DIExpression(), [[DBG11]]) -; DEBUGINFO-NEXT: ret i64 [[CTZ]], !dbg [[DBG12]] +; DEBUGINFO-NEXT: [[Z:%.*]] = call i64 @llvm.cttz.i64(i64 [[A:%.*]], i1 false), !dbg [[DBG11:![0-9]+]] +; DEBUGINFO-NEXT: #dbg_value(i64 [[Z]], [[META9:![0-9]+]], !DIExpression(), [[DBG11]]) +; DEBUGINFO-NEXT: ret i64 [[Z]], !dbg [[DBG12:![0-9]+]] ; entry: %z = call i64 @llvm.cttz.i64(i64 %A, i1 false) @@ -64,27 +43,13 @@ entry: define i64 @ctlz(i64 %A) { ; SLOW-LABEL: @ctlz( ; SLOW-NEXT: entry: -; SLOW-NEXT: [[A_FR:%.*]] = freeze i64 [[A:%.*]] -; SLOW-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[A_FR]], 0 -; SLOW-NEXT: br i1 [[CMPZ]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]] -; SLOW: cond.false: -; SLOW-NEXT: [[Z:%.*]] = call i64 @llvm.ctlz.i64(i64 [[A_FR]], i1 true) -; SLOW-NEXT: br label [[COND_END]] -; SLOW: cond.end: -; SLOW-NEXT: [[CTZ:%.*]] = phi i64 [ 64, [[ENTRY:%.*]] ], [ [[Z]], [[COND_FALSE]] ] -; SLOW-NEXT: ret i64 [[CTZ]] +; SLOW-NEXT: [[Z:%.*]] = call i64 @llvm.ctlz.i64(i64 [[A:%.*]], i1 false) +; SLOW-NEXT: ret i64 [[Z]] ; ; FAST_TZ-LABEL: @ctlz( ; FAST_TZ-NEXT: entry: -; FAST_TZ-NEXT: [[A_FR:%.*]] = freeze i64 [[A:%.*]] -; FAST_TZ-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[A_FR]], 0 -; FAST_TZ-NEXT: br i1 [[CMPZ]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]] -; FAST_TZ: cond.false: -; FAST_TZ-NEXT: [[Z:%.*]] = call i64 @llvm.ctlz.i64(i64 [[A_FR]], i1 true) -; FAST_TZ-NEXT: br label [[COND_END]] -; FAST_TZ: cond.end: -; FAST_TZ-NEXT: [[CTZ:%.*]] = phi i64 [ 64, [[ENTRY:%.*]] ], [ [[Z]], [[COND_FALSE]] ] -; FAST_TZ-NEXT: ret i64 [[CTZ]] +; FAST_TZ-NEXT: [[Z:%.*]] = call i64 @llvm.ctlz.i64(i64 [[A:%.*]], i1 false) +; FAST_TZ-NEXT: ret i64 [[Z]] ; ; FAST_LZ-LABEL: @ctlz( ; FAST_LZ-NEXT: entry: @@ -93,16 +58,9 @@ define i64 @ctlz(i64 %A) { ; ; DEBUGINFO-LABEL: @ctlz( ; DEBUGINFO-NEXT: entry: -; DEBUGINFO-NEXT: [[A_FR:%.*]] = freeze i64 [[A:%.*]], !dbg [[DBG16:![0-9]+]] -; DEBUGINFO-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[A_FR]], 0, !dbg [[DBG16]] -; DEBUGINFO-NEXT: br i1 [[CMPZ]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG16]] -; DEBUGINFO: cond.false: -; DEBUGINFO-NEXT: [[Z:%.*]] = call i64 @llvm.ctlz.i64(i64 [[A_FR]], i1 true), !dbg [[DBG16]] -; DEBUGINFO-NEXT: br label [[COND_END]], !dbg [[DBG17:![0-9]+]] -; DEBUGINFO: cond.end: -; DEBUGINFO-NEXT: [[CTZ:%.*]] = phi i64 [ 64, [[ENTRY:%.*]] ], [ [[Z]], [[COND_FALSE]] ], !dbg [[DBG17]] -; DEBUGINFO-NEXT: #dbg_value(i64 [[CTZ]], [[META15:![0-9]+]], !DIExpression(), [[DBG16]]) -; DEBUGINFO-NEXT: ret i64 [[CTZ]], !dbg [[DBG17]] +; DEBUGINFO-NEXT: [[Z:%.*]] = call i64 @llvm.ctlz.i64(i64 [[A:%.*]], i1 false), !dbg [[DBG16:![0-9]+]] +; DEBUGINFO-NEXT: #dbg_value(i64 [[Z]], [[META15:![0-9]+]], !DIExpression(), [[DBG16]]) +; DEBUGINFO-NEXT: ret i64 [[Z]], !dbg [[DBG17:![0-9]+]] ; entry: %z = call i64 @llvm.ctlz.i64(i64 %A, i1 false) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll b/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll index 0462f125955bf43..8a22e45fe1ca570 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE4 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=icelake-server -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512 @@ -136,11 +136,32 @@ define void @ctlz_4i64() #0 { } define void @ctlz_4i32() #0 { -; CHECK-LABEL: @ctlz_4i32( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false) -; CHECK-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4 -; CHECK-NEXT: ret void +; SSE2-LABEL: @ctlz_4i32( +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4 +; SSE2-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false) +; SSE2-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4 +; SSE2-NEXT: ret void +; +; SSE4-LABEL: @ctlz_4i32( +; SSE4-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 4 +; SSE4-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4 +; SSE4-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4 +; SSE4-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 4 +; SSE4-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false) +; SSE4-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false) +; SSE4-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false) +; SSE4-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false) +; SSE4-NEXT: store i32 [[CTLZ0]], ptr @dst32, align 4 +; SSE4-NEXT: store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 4 +; SSE4-NEXT: store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 4 +; SSE4-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4 +; SSE4-NEXT: ret void +; +; AVX-LABEL: @ctlz_4i32( +; AVX-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4 +; AVX-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false) +; AVX-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4 +; AVX-NEXT: ret void ; %ld0 = load i32, ptr @src32, align 4 %ld1 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4 @@ -158,14 +179,41 @@ define void @ctlz_4i32() #0 { } define void @ctlz_8i32() #0 { -; SSE-LABEL: @ctlz_8i32( -; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 2 -; SSE-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false) -; SSE-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2 -; SSE-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP3]], i1 false) -; SSE-NEXT: store <4 x i32> [[TMP4]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2 -; SSE-NEXT: ret void +; SSE2-LABEL: @ctlz_8i32( +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 2 +; SSE2-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false) +; SSE2-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 2 +; SSE2-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2 +; SSE2-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP3]], i1 false) +; SSE2-NEXT: store <4 x i32> [[TMP4]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2 +; SSE2-NEXT: ret void +; +; SSE4-LABEL: @ctlz_8i32( +; SSE4-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 2 +; SSE4-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2 +; SSE4-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 2 +; SSE4-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 2 +; SSE4-NEXT: [[LD4:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2 +; SSE4-NEXT: [[LD5:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 5), align 2 +; SSE4-NEXT: [[LD6:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 6), align 2 +; SSE4-NEXT: [[LD7:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 7), align 2 +; SSE4-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false) +; SSE4-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false) +; SSE4-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false) +; SSE4-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false) +; SSE4-NEXT: [[CTLZ4:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD4]], i1 false) +; SSE4-NEXT: [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 false) +; SSE4-NEXT: [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 false) +; SSE4-NEXT: [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 false) +; SSE4-NEXT: store i32 [[CTLZ0]], ptr @dst32, align 2 +; SSE4-NEXT: store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 2 +; SSE4-NEXT: store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 2 +; SSE4-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 2 +; SSE4-NEXT: store i32 [[CTLZ4]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2 +; SSE4-NEXT: store i32 [[CTLZ5]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 5), align 2 +; SSE4-NEXT: store i32 [[CTLZ6]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 6), align 2 +; SSE4-NEXT: store i32 [[CTLZ7]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 7), align 2 +; SSE4-NEXT: ret void ; ; AVX-LABEL: @ctlz_8i32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2