From bda93eeb690e2888aeed1051f1b10a95875d5049 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 22 Aug 2024 11:11:00 +0100
Subject: [PATCH] [X86] Allow speculative BSR/BSF instructions on targets with
 CMOV (#102885)

Currently, targets without LZCNT/TZCNT won't speculate BSR/BSF instructions because the input value may be zero, meaning we always insert a test+branch to handle the zero-input case.

This patch allows speculation if the target has CMOV, performing a branchless select instead to handle the zero-input case. This will predominantly help x86-64 targets where we haven't set any particular CPU target. We already always emit BSR/BSF instructions when lowering a CTLZ/CTTZ_ZERO_UNDEF instruction.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |   4 +-
 .../lib/Target/X86/X86TargetTransformInfo.cpp |  10 +-
 .../Analysis/CostModel/X86/ctlz-codesize.ll   |   8 +-
 .../CostModel/X86/ctlz-sizelatency.ll         |   8 +-
 llvm/test/Analysis/CostModel/X86/ctlz.ll      |   8 +-
 .../Analysis/CostModel/X86/cttz-codesize.ll   |   2 +-
 .../CostModel/X86/cttz-sizelatency.ll         |   2 +-
 llvm/test/CodeGen/X86/atomic-bit-test.ll      |   1 -
 llvm/test/CodeGen/X86/bit_ceil.ll             |  53 +--
 llvm/test/CodeGen/X86/combine-or.ll           |  47 ++-
 llvm/test/CodeGen/X86/ctlo.ll                 | 161 ++++++----
 llvm/test/CodeGen/X86/ctlz.ll                 | 304 +++++++++---------
 llvm/test/CodeGen/X86/cttz.ll                 |  37 ++-
 llvm/test/CodeGen/X86/known-never-zero.ll     | 269 +++++-----------
 llvm/test/CodeGen/X86/lzcnt-cmp.ll            |  52 +--
 llvm/test/CodeGen/X86/pr57673.ll              |  50 +--
 llvm/test/CodeGen/X86/pr89877.ll              |   8 +-
 llvm/test/CodeGen/X86/pr92569.ll              |  16 +-
 .../CodeGenPrepare/X86/cttz-ctlz.ll           |  80 ++---
 .../test/Transforms/SLPVectorizer/X86/ctlz.ll |  78 ++++-
 20 files changed, 516 insertions(+), 682 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index da5ea50f80ce04c..97775ce40aee4f9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3239,14 +3239,14 @@ bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
 
 bool X86TargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
   // Speculate cttz only if we can directly use TZCNT or can promote to i32/i64.
-  return Subtarget.hasBMI() ||
+  return Subtarget.hasBMI() || Subtarget.canUseCMOV() ||
          (!Ty->isVectorTy() &&
           Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u));
 }
 
 bool X86TargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
   // Speculate ctlz only if we can directly use LZCNT.
-  return Subtarget.hasLZCNT();
+  return Subtarget.hasLZCNT() || Subtarget.canUseCMOV();
 }
 
 bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const {
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 9a11c33386fd0b9..cb9ee64a677a7eb 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -4210,9 +4210,9 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::ABS,        MVT::i64,     {  1,  2,  3,  3 } }, // SUB+CMOV
     { ISD::BITREVERSE, MVT::i64,     { 10, 12, 20, 22 } },
     { ISD::BSWAP,      MVT::i64,     {  1,  2,  1,  2 } },
-    { ISD::CTLZ,       MVT::i64,     {  3,  2,  6,  6 } }, // BSR+XOR or BSR+XOR+CMOV
+    { ISD::CTLZ,       MVT::i64,     {  2,  2,  4,  5 } }, // BSR+XOR or BSR+XOR+CMOV
     { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{  1,  2,  2,  2 } }, // BSR+XOR
-    { ISD::CTTZ,       MVT::i64,     {  2,  2,  5,  5 } }, // TEST+BSF+CMOV/BRANCH
+    { ISD::CTTZ,       MVT::i64,     {  2,  2,  3,  4 } }, // TEST+BSF+CMOV/BRANCH
     { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{  1,  2,  1,  2 } }, // BSF
     { ISD::CTPOP,      MVT::i64,     { 10,  6, 19, 19 } },
     { ISD::ROTL,       MVT::i64,     {  2,  3,  1,  3 } },
@@ -4241,9 +4241,9 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::BITREVERSE, MVT::i8,      {  7,  9, 13, 14 } },
     { ISD::BSWAP,      MVT::i32,     {  1,  1,  1,  1 } },
     { ISD::BSWAP,      MVT::i16,     {  1,  2,  1,  2 } }, // ROL
-    { ISD::CTLZ,       MVT::i32,     {  3,  2,  6,  6 } }, // BSR+XOR or BSR+XOR+CMOV
-    { ISD::CTLZ,       MVT::i16,     {  3,  2,  6,  6 } }, // BSR+XOR or BSR+XOR+CMOV
-    { ISD::CTLZ,       MVT::i8,      {  3,  2,  7,  7 } }, // BSR+XOR or BSR+XOR+CMOV
+    { ISD::CTLZ,       MVT::i32,     {  2,  2,  4,  5 } }, // BSR+XOR or BSR+XOR+CMOV
+    { ISD::CTLZ,       MVT::i16,     {  2,  2,  4,  5 } }, // BSR+XOR or BSR+XOR+CMOV
+    { ISD::CTLZ,       MVT::i8,      {  2,  2,  5,  6 } }, // BSR+XOR or BSR+XOR+CMOV
     { ISD::CTLZ_ZERO_UNDEF, MVT::i32,{  1,  2,  2,  2 } }, // BSR+XOR
     { ISD::CTLZ_ZERO_UNDEF, MVT::i16,{  2,  2,  2,  2 } }, // BSR+XOR
     { ISD::CTLZ_ZERO_UNDEF, MVT::i8, {  2,  2,  3,  3 } }, // BSR+XOR
diff --git a/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll b/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll
index ae0f1a3cfad307e..da0f71c63ef80ed 100644
--- a/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll
@@ -17,7 +17,7 @@ declare  i8 @llvm.ctlz.i8(i8, i1)
 
 define i64 @var_ctlz_i64(i64 %a) {
 ; NOLZCNT-LABEL: 'var_ctlz_i64'
-; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
+; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
 ; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ctlz
 ;
 ; LZCNT-LABEL: 'var_ctlz_i64'
@@ -43,7 +43,7 @@ define i64 @var_ctlz_i64u(i64 %a) {
 
 define i32 @var_ctlz_i32(i32 %a) {
 ; NOLZCNT-LABEL: 'var_ctlz_i32'
-; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
+; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
 ; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %ctlz
 ;
 ; LZCNT-LABEL: 'var_ctlz_i32'
@@ -69,7 +69,7 @@ define i32 @var_ctlz_i32u(i32 %a) {
 
 define i16 @var_ctlz_i16(i16 %a) {
 ; NOLZCNT-LABEL: 'var_ctlz_i16'
-; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
+; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
 ; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %ctlz
 ;
 ; LZCNT-LABEL: 'var_ctlz_i16'
@@ -95,7 +95,7 @@ define i16 @var_ctlz_i16u(i16 %a) {
 
 define i8 @var_ctlz_i8(i8 %a) {
 ; NOLZCNT-LABEL: 'var_ctlz_i8'
-; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
+; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
 ; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %ctlz
 ;
 ; LZCNT-LABEL: 'var_ctlz_i8'
diff --git a/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll
index 8c6c3228d8fc6e9..2425e7286265b06 100644
--- a/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll
@@ -17,7 +17,7 @@ declare  i8 @llvm.ctlz.i8(i8, i1)
 
 define i64 @var_ctlz_i64(i64 %a) {
 ; NOLZCNT-LABEL: 'var_ctlz_i64'
-; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
+; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
 ; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ctlz
 ;
 ; LZCNT-LABEL: 'var_ctlz_i64'
@@ -43,7 +43,7 @@ define i64 @var_ctlz_i64u(i64 %a) {
 
 define i32 @var_ctlz_i32(i32 %a) {
 ; NOLZCNT-LABEL: 'var_ctlz_i32'
-; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
+; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
 ; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %ctlz
 ;
 ; LZCNT-LABEL: 'var_ctlz_i32'
@@ -69,7 +69,7 @@ define i32 @var_ctlz_i32u(i32 %a) {
 
 define i16 @var_ctlz_i16(i16 %a) {
 ; NOLZCNT-LABEL: 'var_ctlz_i16'
-; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
+; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
 ; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %ctlz
 ;
 ; LZCNT-LABEL: 'var_ctlz_i16'
@@ -95,7 +95,7 @@ define i16 @var_ctlz_i16u(i16 %a) {
 
 define i8 @var_ctlz_i8(i8 %a) {
 ; NOLZCNT-LABEL: 'var_ctlz_i8'
-; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
+; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
 ; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %ctlz
 ;
 ; LZCNT-LABEL: 'var_ctlz_i8'
diff --git a/llvm/test/Analysis/CostModel/X86/ctlz.ll b/llvm/test/Analysis/CostModel/X86/ctlz.ll
index 99e682b8e17826e..fa7982ce09e9cef 100644
--- a/llvm/test/Analysis/CostModel/X86/ctlz.ll
+++ b/llvm/test/Analysis/CostModel/X86/ctlz.ll
@@ -17,7 +17,7 @@ declare  i8 @llvm.ctlz.i8(i8, i1)
 
 define i64 @var_ctlz_i64(i64 %a) {
 ; NOLZCNT-LABEL: 'var_ctlz_i64'
-; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
+; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
 ; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %ctlz
 ;
 ; LZCNT-LABEL: 'var_ctlz_i64'
@@ -43,7 +43,7 @@ define i64 @var_ctlz_i64u(i64 %a) {
 
 define i32 @var_ctlz_i32(i32 %a) {
 ; NOLZCNT-LABEL: 'var_ctlz_i32'
-; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
+; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
 ; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %ctlz
 ;
 ; LZCNT-LABEL: 'var_ctlz_i32'
@@ -69,7 +69,7 @@ define i32 @var_ctlz_i32u(i32 %a) {
 
 define i16 @var_ctlz_i16(i16 %a) {
 ; NOLZCNT-LABEL: 'var_ctlz_i16'
-; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
+; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
 ; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %ctlz
 ;
 ; LZCNT-LABEL: 'var_ctlz_i16'
@@ -95,7 +95,7 @@ define i16 @var_ctlz_i16u(i16 %a) {
 
 define i8 @var_ctlz_i8(i8 %a) {
 ; NOLZCNT-LABEL: 'var_ctlz_i8'
-; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
+; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
 ; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i8 %ctlz
 ;
 ; LZCNT-LABEL: 'var_ctlz_i8'
diff --git a/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll b/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll
index 1d40debb7ab8166..07bf1dd7a2ff6cb 100644
--- a/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll
@@ -18,7 +18,7 @@ declare  i8 @llvm.cttz.i8(i8, i1)
 
 define i64 @var_cttz_i64(i64 %a) {
 ; NOBMI-LABEL: 'var_cttz_i64'
-; NOBMI-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
+; NOBMI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
 ; NOBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %cttz
 ;
 ; BMI-LABEL: 'var_cttz_i64'
diff --git a/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll
index 351e863f1320674..afe5cb8c55fe653 100644
--- a/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll
@@ -18,7 +18,7 @@ declare  i8 @llvm.cttz.i8(i8, i1)
 
 define i64 @var_cttz_i64(i64 %a) {
 ; NOBMI-LABEL: 'var_cttz_i64'
-; NOBMI-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
+; NOBMI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
 ; NOBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %cttz
 ;
 ; BMI-LABEL: 'var_cttz_i64'
diff --git a/llvm/test/CodeGen/X86/atomic-bit-test.ll b/llvm/test/CodeGen/X86/atomic-bit-test.ll
index f39c4b5e620d0e8..10b6605c3fb05e4 100644
--- a/llvm/test/CodeGen/X86/atomic-bit-test.ll
+++ b/llvm/test/CodeGen/X86/atomic-bit-test.ll
@@ -582,7 +582,6 @@ define i32 @split_hoist_and(i32 %0) nounwind {
 ; X64-NEXT:    lock btsl $3, v32(%rip)
 ; X64-NEXT:    setb %al
 ; X64-NEXT:    shll $3, %eax
-; X64-NEXT:    testl %edi, %edi
 ; X64-NEXT:    retq
   %2 = atomicrmw or ptr @v32, i32 8 monotonic, align 4
   %3 = tail call i32 @llvm.ctlz.i32(i32 %0, i1 false)
diff --git a/llvm/test/CodeGen/X86/bit_ceil.ll b/llvm/test/CodeGen/X86/bit_ceil.ll
index 4641c114238f8f9..823453087f6180e 100644
--- a/llvm/test/CodeGen/X86/bit_ceil.ll
+++ b/llvm/test/CodeGen/X86/bit_ceil.ll
@@ -8,16 +8,12 @@
 define i32 @bit_ceil_i32(i32 %x) {
 ; NOBMI-LABEL: bit_ceil_i32:
 ; NOBMI:       # %bb.0:
-; NOBMI-NEXT:    movl %edi, %eax
-; NOBMI-NEXT:    decl %eax
-; NOBMI-NEXT:    je .LBB0_1
-; NOBMI-NEXT:  # %bb.2: # %cond.false
-; NOBMI-NEXT:    bsrl %eax, %ecx
+; NOBMI-NEXT:    # kill: def $edi killed $edi def $rdi
+; NOBMI-NEXT:    leal -1(%rdi), %eax
+; NOBMI-NEXT:    bsrl %eax, %eax
+; NOBMI-NEXT:    movl $63, %ecx
+; NOBMI-NEXT:    cmovnel %eax, %ecx
 ; NOBMI-NEXT:    xorl $31, %ecx
-; NOBMI-NEXT:    jmp .LBB0_3
-; NOBMI-NEXT:  .LBB0_1:
-; NOBMI-NEXT:    movl $32, %ecx
-; NOBMI-NEXT:  .LBB0_3: # %cond.end
 ; NOBMI-NEXT:    negb %cl
 ; NOBMI-NEXT:    movl $1, %edx
 ; NOBMI-NEXT:    movl $1, %eax
@@ -51,15 +47,10 @@ define i32 @bit_ceil_i32(i32 %x) {
 define i32 @bit_ceil_i32_plus1(i32 noundef %x) {
 ; NOBMI-LABEL: bit_ceil_i32_plus1:
 ; NOBMI:       # %bb.0: # %entry
-; NOBMI-NEXT:    testl %edi, %edi
-; NOBMI-NEXT:    je .LBB1_1
-; NOBMI-NEXT:  # %bb.2: # %cond.false
-; NOBMI-NEXT:    bsrl %edi, %ecx
+; NOBMI-NEXT:    bsrl %edi, %eax
+; NOBMI-NEXT:    movl $63, %ecx
+; NOBMI-NEXT:    cmovnel %eax, %ecx
 ; NOBMI-NEXT:    xorl $31, %ecx
-; NOBMI-NEXT:    jmp .LBB1_3
-; NOBMI-NEXT:  .LBB1_1:
-; NOBMI-NEXT:    movl $32, %ecx
-; NOBMI-NEXT:  .LBB1_3: # %cond.end
 ; NOBMI-NEXT:    negb %cl
 ; NOBMI-NEXT:    movl $1, %edx
 ; NOBMI-NEXT:    movl $1, %eax
@@ -94,16 +85,11 @@ entry:
 define i64 @bit_ceil_i64(i64 %x) {
 ; NOBMI-LABEL: bit_ceil_i64:
 ; NOBMI:       # %bb.0:
-; NOBMI-NEXT:    movq %rdi, %rax
-; NOBMI-NEXT:    decq %rax
-; NOBMI-NEXT:    je .LBB2_1
-; NOBMI-NEXT:  # %bb.2: # %cond.false
-; NOBMI-NEXT:    bsrq %rax, %rcx
-; NOBMI-NEXT:    xorq $63, %rcx
-; NOBMI-NEXT:    jmp .LBB2_3
-; NOBMI-NEXT:  .LBB2_1:
-; NOBMI-NEXT:    movl $64, %ecx
-; NOBMI-NEXT:  .LBB2_3: # %cond.end
+; NOBMI-NEXT:    leaq -1(%rdi), %rax
+; NOBMI-NEXT:    bsrq %rax, %rax
+; NOBMI-NEXT:    movl $127, %ecx
+; NOBMI-NEXT:    cmovneq %rax, %rcx
+; NOBMI-NEXT:    xorl $63, %ecx
 ; NOBMI-NEXT:    negb %cl
 ; NOBMI-NEXT:    movl $1, %edx
 ; NOBMI-NEXT:    movl $1, %eax
@@ -136,15 +122,10 @@ define i64 @bit_ceil_i64(i64 %x) {
 define i64 @bit_ceil_i64_plus1(i64 noundef %x) {
 ; NOBMI-LABEL: bit_ceil_i64_plus1:
 ; NOBMI:       # %bb.0: # %entry
-; NOBMI-NEXT:    testq %rdi, %rdi
-; NOBMI-NEXT:    je .LBB3_1
-; NOBMI-NEXT:  # %bb.2: # %cond.false
-; NOBMI-NEXT:    bsrq %rdi, %rcx
-; NOBMI-NEXT:    xorq $63, %rcx
-; NOBMI-NEXT:    jmp .LBB3_3
-; NOBMI-NEXT:  .LBB3_1:
-; NOBMI-NEXT:    movl $64, %ecx
-; NOBMI-NEXT:  .LBB3_3: # %cond.end
+; NOBMI-NEXT:    bsrq %rdi, %rax
+; NOBMI-NEXT:    movl $127, %ecx
+; NOBMI-NEXT:    cmovneq %rax, %rcx
+; NOBMI-NEXT:    xorl $63, %ecx
 ; NOBMI-NEXT:    negb %cl
 ; NOBMI-NEXT:    movl $1, %edx
 ; NOBMI-NEXT:    movl $1, %eax
diff --git a/llvm/test/CodeGen/X86/combine-or.ll b/llvm/test/CodeGen/X86/combine-or.ll
index 3b2102f46a297a6..4060355495eb3b5 100644
--- a/llvm/test/CodeGen/X86/combine-or.ll
+++ b/llvm/test/CodeGen/X86/combine-or.ll
@@ -213,21 +213,18 @@ define i64 @PR89533(<64 x i8> %a0) {
 ; SSE-NEXT:    shll $16, %ecx
 ; SSE-NEXT:    orl %eax, %ecx
 ; SSE-NEXT:    pcmpeqb %xmm4, %xmm2
-; SSE-NEXT:    pmovmskb %xmm2, %edx
-; SSE-NEXT:    xorl $65535, %edx # imm = 0xFFFF
+; SSE-NEXT:    pmovmskb %xmm2, %eax
+; SSE-NEXT:    xorl $65535, %eax # imm = 0xFFFF
 ; SSE-NEXT:    pcmpeqb %xmm4, %xmm3
-; SSE-NEXT:    pmovmskb %xmm3, %eax
-; SSE-NEXT:    notl %eax
-; SSE-NEXT:    shll $16, %eax
-; SSE-NEXT:    orl %edx, %eax
-; SSE-NEXT:    shlq $32, %rax
-; SSE-NEXT:    orq %rcx, %rax
-; SSE-NEXT:    je .LBB11_2
-; SSE-NEXT:  # %bb.1: # %cond.false
-; SSE-NEXT:    rep bsfq %rax, %rax
-; SSE-NEXT:    retq
-; SSE-NEXT:  .LBB11_2: # %cond.end
+; SSE-NEXT:    pmovmskb %xmm3, %edx
+; SSE-NEXT:    notl %edx
+; SSE-NEXT:    shll $16, %edx
+; SSE-NEXT:    orl %eax, %edx
+; SSE-NEXT:    shlq $32, %rdx
+; SSE-NEXT:    orq %rcx, %rdx
+; SSE-NEXT:    bsfq %rdx, %rcx
 ; SSE-NEXT:    movl $64, %eax
+; SSE-NEXT:    cmovneq %rcx, %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: PR89533:
@@ -243,23 +240,19 @@ define i64 @PR89533(<64 x i8> %a0) {
 ; AVX1-NEXT:    shll $16, %ecx
 ; AVX1-NEXT:    orl %eax, %ecx
 ; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm0
-; AVX1-NEXT:    vpmovmskb %xmm0, %edx
-; AVX1-NEXT:    xorl $65535, %edx # imm = 0xFFFF
+; AVX1-NEXT:    vpmovmskb %xmm0, %eax
+; AVX1-NEXT:    xorl $65535, %eax # imm = 0xFFFF
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
 ; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpmovmskb %xmm0, %eax
-; AVX1-NEXT:    notl %eax
-; AVX1-NEXT:    shll $16, %eax
-; AVX1-NEXT:    orl %edx, %eax
-; AVX1-NEXT:    shlq $32, %rax
-; AVX1-NEXT:    orq %rcx, %rax
-; AVX1-NEXT:    je .LBB11_2
-; AVX1-NEXT:  # %bb.1: # %cond.false
-; AVX1-NEXT:    rep bsfq %rax, %rax
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-; AVX1-NEXT:  .LBB11_2: # %cond.end
+; AVX1-NEXT:    vpmovmskb %xmm0, %edx
+; AVX1-NEXT:    notl %edx
+; AVX1-NEXT:    shll $16, %edx
+; AVX1-NEXT:    orl %eax, %edx
+; AVX1-NEXT:    shlq $32, %rdx
+; AVX1-NEXT:    orq %rcx, %rdx
+; AVX1-NEXT:    bsfq %rdx, %rcx
 ; AVX1-NEXT:    movl $64, %eax
+; AVX1-NEXT:    cmovneq %rcx, %rax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/ctlo.ll b/llvm/test/CodeGen/X86/ctlo.ll
index bb80279e28f3d3a..f383c9a2544fca0 100644
--- a/llvm/test/CodeGen/X86/ctlo.ll
+++ b/llvm/test/CodeGen/X86/ctlo.ll
@@ -13,36 +13,44 @@ declare i32 @llvm.ctlz.i32(i32, i1)
 declare i64 @llvm.ctlz.i64(i64, i1)
 
 define i8 @ctlo_i8(i8 %x) {
-; X86-LABEL: ctlo_i8:
-; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorb $-1, %al
-; X86-NEXT:    je .LBB0_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    bsrl %eax, %eax
-; X86-NEXT:    xorl $7, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB0_1:
-; X86-NEXT:    movb $8, %al
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
+; X86-NOCMOV-LABEL: ctlo_i8:
+; X86-NOCMOV:       # %bb.0:
+; X86-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NOCMOV-NEXT:    xorb $-1, %al
+; X86-NOCMOV-NEXT:    je .LBB0_1
+; X86-NOCMOV-NEXT:  # %bb.2: # %cond.false
+; X86-NOCMOV-NEXT:    movzbl %al, %eax
+; X86-NOCMOV-NEXT:    bsrl %eax, %eax
+; X86-NOCMOV-NEXT:    xorl $7, %eax
+; X86-NOCMOV-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NOCMOV-NEXT:    retl
+; X86-NOCMOV-NEXT:  .LBB0_1:
+; X86-NOCMOV-NEXT:    movb $8, %al
+; X86-NOCMOV-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NOCMOV-NEXT:    retl
+;
+; X86-CMOV-LABEL: ctlo_i8:
+; X86-CMOV:       # %bb.0:
+; X86-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-CMOV-NEXT:    notb %al
+; X86-CMOV-NEXT:    movzbl %al, %eax
+; X86-CMOV-NEXT:    bsrl %eax, %ecx
+; X86-CMOV-NEXT:    movl $15, %eax
+; X86-CMOV-NEXT:    cmovnel %ecx, %eax
+; X86-CMOV-NEXT:    xorl $7, %eax
+; X86-CMOV-NEXT:    # kill: def $al killed $al killed $eax
+; X86-CMOV-NEXT:    retl
 ;
 ; X64-LABEL: ctlo_i8:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorb $-1, %dil
-; X64-NEXT:    je .LBB0_1
-; X64-NEXT:  # %bb.2: # %cond.false
+; X64-NEXT:    notb %dil
 ; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    bsrl %eax, %eax
+; X64-NEXT:    bsrl %eax, %ecx
+; X64-NEXT:    movl $15, %eax
+; X64-NEXT:    cmovnel %ecx, %eax
 ; X64-NEXT:    xorl $7, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB0_1:
-; X64-NEXT:    movb $8, %al
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
 ;
 ; X86-CLZ-LABEL: ctlo_i8:
 ; X86-CLZ:       # %bb.0:
@@ -111,34 +119,41 @@ define i8 @ctlo_i8_undef(i8 %x) {
 }
 
 define i16 @ctlo_i16(i16 %x) {
-; X86-LABEL: ctlo_i16:
-; X86:       # %bb.0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorw $-1, %ax
-; X86-NEXT:    je .LBB2_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    bsrw %ax, %ax
-; X86-NEXT:    xorl $15, %eax
-; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB2_1:
-; X86-NEXT:    movw $16, %ax
-; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    retl
+; X86-NOCMOV-LABEL: ctlo_i16:
+; X86-NOCMOV:       # %bb.0:
+; X86-NOCMOV-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NOCMOV-NEXT:    xorw $-1, %ax
+; X86-NOCMOV-NEXT:    je .LBB2_1
+; X86-NOCMOV-NEXT:  # %bb.2: # %cond.false
+; X86-NOCMOV-NEXT:    bsrw %ax, %ax
+; X86-NOCMOV-NEXT:    xorl $15, %eax
+; X86-NOCMOV-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NOCMOV-NEXT:    retl
+; X86-NOCMOV-NEXT:  .LBB2_1:
+; X86-NOCMOV-NEXT:    movw $16, %ax
+; X86-NOCMOV-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NOCMOV-NEXT:    retl
+;
+; X86-CMOV-LABEL: ctlo_i16:
+; X86-CMOV:       # %bb.0:
+; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-CMOV-NEXT:    notl %eax
+; X86-CMOV-NEXT:    bsrw %ax, %cx
+; X86-CMOV-NEXT:    movw $31, %ax
+; X86-CMOV-NEXT:    cmovnew %cx, %ax
+; X86-CMOV-NEXT:    xorl $15, %eax
+; X86-CMOV-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-CMOV-NEXT:    retl
 ;
 ; X64-LABEL: ctlo_i16:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorw $-1, %di
-; X64-NEXT:    je .LBB2_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    bsrw %di, %ax
+; X64-NEXT:    notl %edi
+; X64-NEXT:    bsrw %di, %cx
+; X64-NEXT:    movw $31, %ax
+; X64-NEXT:    cmovnew %cx, %ax
 ; X64-NEXT:    xorl $15, %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB2_1:
-; X64-NEXT:    movw $16, %ax
-; X64-NEXT:    # kill: def $ax killed $ax killed $eax
-; X64-NEXT:    retq
 ;
 ; X86-CLZ-LABEL: ctlo_i16:
 ; X86-CLZ:       # %bb.0:
@@ -193,30 +208,37 @@ define i16 @ctlo_i16_undef(i16 %x) {
 }
 
 define i32 @ctlo_i32(i32 %x) {
-; X86-LABEL: ctlo_i32:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl $-1, %eax
-; X86-NEXT:    je .LBB4_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    bsrl %eax, %eax
-; X86-NEXT:    xorl $31, %eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB4_1:
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    retl
+; X86-NOCMOV-LABEL: ctlo_i32:
+; X86-NOCMOV:       # %bb.0:
+; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOCMOV-NEXT:    xorl $-1, %eax
+; X86-NOCMOV-NEXT:    je .LBB4_1
+; X86-NOCMOV-NEXT:  # %bb.2: # %cond.false
+; X86-NOCMOV-NEXT:    bsrl %eax, %eax
+; X86-NOCMOV-NEXT:    xorl $31, %eax
+; X86-NOCMOV-NEXT:    retl
+; X86-NOCMOV-NEXT:  .LBB4_1:
+; X86-NOCMOV-NEXT:    movl $32, %eax
+; X86-NOCMOV-NEXT:    retl
+;
+; X86-CMOV-LABEL: ctlo_i32:
+; X86-CMOV:       # %bb.0:
+; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-CMOV-NEXT:    notl %eax
+; X86-CMOV-NEXT:    bsrl %eax, %ecx
+; X86-CMOV-NEXT:    movl $63, %eax
+; X86-CMOV-NEXT:    cmovnel %ecx, %eax
+; X86-CMOV-NEXT:    xorl $31, %eax
+; X86-CMOV-NEXT:    retl
 ;
 ; X64-LABEL: ctlo_i32:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorl $-1, %edi
-; X64-NEXT:    je .LBB4_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    bsrl %edi, %eax
+; X64-NEXT:    notl %edi
+; X64-NEXT:    bsrl %edi, %ecx
+; X64-NEXT:    movl $63, %eax
+; X64-NEXT:    cmovnel %ecx, %eax
 ; X64-NEXT:    xorl $31, %eax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB4_1:
-; X64-NEXT:    movl $32, %eax
-; X64-NEXT:    retq
 ;
 ; X86-CLZ-LABEL: ctlo_i32:
 ; X86-CLZ:       # %bb.0:
@@ -314,15 +336,12 @@ define i64 @ctlo_i64(i64 %x) {
 ;
 ; X64-LABEL: ctlo_i64:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorq $-1, %rdi
-; X64-NEXT:    je .LBB6_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    bsrq %rdi, %rax
+; X64-NEXT:    notq %rdi
+; X64-NEXT:    bsrq %rdi, %rcx
+; X64-NEXT:    movl $127, %eax
+; X64-NEXT:    cmovneq %rcx, %rax
 ; X64-NEXT:    xorq $63, %rax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB6_1:
-; X64-NEXT:    movl $64, %eax
-; X64-NEXT:    retq
 ;
 ; X86-CLZ-LABEL: ctlo_i64:
 ; X86-CLZ:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/ctlz.ll b/llvm/test/CodeGen/X86/ctlz.ll
index d8f83502bd849a7..6635be18b0f7a75 100644
--- a/llvm/test/CodeGen/X86/ctlz.ll
+++ b/llvm/test/CodeGen/X86/ctlz.ll
@@ -218,36 +218,41 @@ define i64 @ctlz_i64(i64 %x) {
 
 ; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
 define i8 @ctlz_i8_zero_test(i8 %n) {
-; X86-LABEL: ctlz_i8_zero_test:
-; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    testb %al, %al
-; X86-NEXT:    je .LBB4_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    bsrl %eax, %eax
-; X86-NEXT:    xorl $7, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB4_1:
-; X86-NEXT:    movb $8, %al
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
+; X86-NOCMOV-LABEL: ctlz_i8_zero_test:
+; X86-NOCMOV:       # %bb.0:
+; X86-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NOCMOV-NEXT:    testb %al, %al
+; X86-NOCMOV-NEXT:    je .LBB4_1
+; X86-NOCMOV-NEXT:  # %bb.2: # %cond.false
+; X86-NOCMOV-NEXT:    movzbl %al, %eax
+; X86-NOCMOV-NEXT:    bsrl %eax, %eax
+; X86-NOCMOV-NEXT:    xorl $7, %eax
+; X86-NOCMOV-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NOCMOV-NEXT:    retl
+; X86-NOCMOV-NEXT:  .LBB4_1:
+; X86-NOCMOV-NEXT:    movb $8, %al
+; X86-NOCMOV-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NOCMOV-NEXT:    retl
+;
+; X86-CMOV-LABEL: ctlz_i8_zero_test:
+; X86-CMOV:       # %bb.0:
+; X86-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-CMOV-NEXT:    bsrl %eax, %ecx
+; X86-CMOV-NEXT:    movl $15, %eax
+; X86-CMOV-NEXT:    cmovnel %ecx, %eax
+; X86-CMOV-NEXT:    xorl $7, %eax
+; X86-CMOV-NEXT:    # kill: def $al killed $al killed $eax
+; X86-CMOV-NEXT:    retl
 ;
 ; X64-LABEL: ctlz_i8_zero_test:
 ; X64:       # %bb.0:
-; X64-NEXT:    testb %dil, %dil
-; X64-NEXT:    je .LBB4_1
-; X64-NEXT:  # %bb.2: # %cond.false
 ; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    bsrl %eax, %eax
+; X64-NEXT:    bsrl %eax, %ecx
+; X64-NEXT:    movl $15, %eax
+; X64-NEXT:    cmovnel %ecx, %eax
 ; X64-NEXT:    xorl $7, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB4_1:
-; X64-NEXT:    movb $8, %al
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
 ;
 ; X86-CLZ-LABEL: ctlz_i8_zero_test:
 ; X86-CLZ:       # %bb.0:
@@ -286,34 +291,38 @@ define i8 @ctlz_i8_zero_test(i8 %n) {
 
 ; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
 define i16 @ctlz_i16_zero_test(i16 %n) {
-; X86-LABEL: ctlz_i16_zero_test:
-; X86:       # %bb.0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    testw %ax, %ax
-; X86-NEXT:    je .LBB5_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    bsrw %ax, %ax
-; X86-NEXT:    xorl $15, %eax
-; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB5_1:
-; X86-NEXT:    movw $16, %ax
-; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    retl
+; X86-NOCMOV-LABEL: ctlz_i16_zero_test:
+; X86-NOCMOV:       # %bb.0:
+; X86-NOCMOV-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NOCMOV-NEXT:    testw %ax, %ax
+; X86-NOCMOV-NEXT:    je .LBB5_1
+; X86-NOCMOV-NEXT:  # %bb.2: # %cond.false
+; X86-NOCMOV-NEXT:    bsrw %ax, %ax
+; X86-NOCMOV-NEXT:    xorl $15, %eax
+; X86-NOCMOV-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NOCMOV-NEXT:    retl
+; X86-NOCMOV-NEXT:  .LBB5_1:
+; X86-NOCMOV-NEXT:    movw $16, %ax
+; X86-NOCMOV-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NOCMOV-NEXT:    retl
+;
+; X86-CMOV-LABEL: ctlz_i16_zero_test:
+; X86-CMOV:       # %bb.0:
+; X86-CMOV-NEXT:    bsrw {{[0-9]+}}(%esp), %cx
+; X86-CMOV-NEXT:    movw $31, %ax
+; X86-CMOV-NEXT:    cmovnew %cx, %ax
+; X86-CMOV-NEXT:    xorl $15, %eax
+; X86-CMOV-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-CMOV-NEXT:    retl
 ;
 ; X64-LABEL: ctlz_i16_zero_test:
 ; X64:       # %bb.0:
-; X64-NEXT:    testw %di, %di
-; X64-NEXT:    je .LBB5_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    bsrw %di, %ax
+; X64-NEXT:    bsrw %di, %cx
+; X64-NEXT:    movw $31, %ax
+; X64-NEXT:    cmovnew %cx, %ax
 ; X64-NEXT:    xorl $15, %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB5_1:
-; X64-NEXT:    movw $16, %ax
-; X64-NEXT:    # kill: def $ax killed $ax killed $eax
-; X64-NEXT:    retq
 ;
 ; X86-CLZ-LABEL: ctlz_i16_zero_test:
 ; X86-CLZ:       # %bb.0:
@@ -340,30 +349,34 @@ define i16 @ctlz_i16_zero_test(i16 %n) {
 
 ; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
 define i32 @ctlz_i32_zero_test(i32 %n) {
-; X86-LABEL: ctlz_i32_zero_test:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    je .LBB6_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    bsrl %eax, %eax
-; X86-NEXT:    xorl $31, %eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB6_1:
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    retl
+; X86-NOCMOV-LABEL: ctlz_i32_zero_test:
+; X86-NOCMOV:       # %bb.0:
+; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOCMOV-NEXT:    testl %eax, %eax
+; X86-NOCMOV-NEXT:    je .LBB6_1
+; X86-NOCMOV-NEXT:  # %bb.2: # %cond.false
+; X86-NOCMOV-NEXT:    bsrl %eax, %eax
+; X86-NOCMOV-NEXT:    xorl $31, %eax
+; X86-NOCMOV-NEXT:    retl
+; X86-NOCMOV-NEXT:  .LBB6_1:
+; X86-NOCMOV-NEXT:    movl $32, %eax
+; X86-NOCMOV-NEXT:    retl
+;
+; X86-CMOV-LABEL: ctlz_i32_zero_test:
+; X86-CMOV:       # %bb.0:
+; X86-CMOV-NEXT:    bsrl {{[0-9]+}}(%esp), %ecx
+; X86-CMOV-NEXT:    movl $63, %eax
+; X86-CMOV-NEXT:    cmovnel %ecx, %eax
+; X86-CMOV-NEXT:    xorl $31, %eax
+; X86-CMOV-NEXT:    retl
 ;
 ; X64-LABEL: ctlz_i32_zero_test:
 ; X64:       # %bb.0:
-; X64-NEXT:    testl %edi, %edi
-; X64-NEXT:    je .LBB6_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    bsrl %edi, %eax
+; X64-NEXT:    bsrl %edi, %ecx
+; X64-NEXT:    movl $63, %eax
+; X64-NEXT:    cmovnel %ecx, %eax
 ; X64-NEXT:    xorl $31, %eax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB6_1:
-; X64-NEXT:    movl $32, %eax
-; X64-NEXT:    retq
 ;
 ; X86-CLZ-LABEL: ctlz_i32_zero_test:
 ; X86-CLZ:       # %bb.0:
@@ -429,15 +442,11 @@ define i64 @ctlz_i64_zero_test(i64 %n) {
 ;
 ; X64-LABEL: ctlz_i64_zero_test:
 ; X64:       # %bb.0:
-; X64-NEXT:    testq %rdi, %rdi
-; X64-NEXT:    je .LBB7_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    bsrq %rdi, %rax
+; X64-NEXT:    bsrq %rdi, %rcx
+; X64-NEXT:    movl $127, %eax
+; X64-NEXT:    cmovneq %rcx, %rax
 ; X64-NEXT:    xorq $63, %rax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB7_1:
-; X64-NEXT:    movl $64, %eax
-; X64-NEXT:    retq
 ;
 ; X86-CLZ-LABEL: ctlz_i64_zero_test:
 ; X86-CLZ:       # %bb.0:
@@ -580,33 +589,33 @@ define i32 @ctlz_bsr(i32 %n) {
 ; FIXME: The compare and branch are produced late in IR (by CodeGenPrepare), and
 ;        codegen doesn't know how to combine the $32 and $31 into $63.
 define i32 @ctlz_bsr_zero_test(i32 %n) {
-; X86-LABEL: ctlz_bsr_zero_test:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    je .LBB10_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    bsrl %eax, %eax
-; X86-NEXT:    xorl $31, %eax
-; X86-NEXT:    xorl $31, %eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB10_1:
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    xorl $31, %eax
-; X86-NEXT:    retl
+; X86-NOCMOV-LABEL: ctlz_bsr_zero_test:
+; X86-NOCMOV:       # %bb.0:
+; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOCMOV-NEXT:    testl %eax, %eax
+; X86-NOCMOV-NEXT:    je .LBB10_1
+; X86-NOCMOV-NEXT:  # %bb.2: # %cond.false
+; X86-NOCMOV-NEXT:    bsrl %eax, %eax
+; X86-NOCMOV-NEXT:    xorl $31, %eax
+; X86-NOCMOV-NEXT:    xorl $31, %eax
+; X86-NOCMOV-NEXT:    retl
+; X86-NOCMOV-NEXT:  .LBB10_1:
+; X86-NOCMOV-NEXT:    movl $32, %eax
+; X86-NOCMOV-NEXT:    xorl $31, %eax
+; X86-NOCMOV-NEXT:    retl
+;
+; X86-CMOV-LABEL: ctlz_bsr_zero_test:
+; X86-CMOV:       # %bb.0:
+; X86-CMOV-NEXT:    bsrl {{[0-9]+}}(%esp), %ecx
+; X86-CMOV-NEXT:    movl $63, %eax
+; X86-CMOV-NEXT:    cmovnel %ecx, %eax
+; X86-CMOV-NEXT:    retl
 ;
 ; X64-LABEL: ctlz_bsr_zero_test:
 ; X64:       # %bb.0:
-; X64-NEXT:    testl %edi, %edi
-; X64-NEXT:    je .LBB10_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    bsrl %edi, %eax
-; X64-NEXT:    xorl $31, %eax
-; X64-NEXT:    xorl $31, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB10_1:
-; X64-NEXT:    movl $32, %eax
-; X64-NEXT:    xorl $31, %eax
+; X64-NEXT:    bsrl %edi, %ecx
+; X64-NEXT:    movl $63, %eax
+; X64-NEXT:    cmovnel %ecx, %eax
 ; X64-NEXT:    retq
 ;
 ; X86-CLZ-LABEL: ctlz_bsr_zero_test:
@@ -945,38 +954,39 @@ define i8 @ctlz_xor7_i8_true(i8 %x) {
 }
 
 define i8 @ctlz_xor7_i8_false(i8 %x) {
-; X86-LABEL: ctlz_xor7_i8_false:
-; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    testb %al, %al
-; X86-NEXT:    je .LBB16_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    bsrl %eax, %eax
-; X86-NEXT:    xorl $7, %eax
-; X86-NEXT:    xorb $7, %al
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB16_1:
-; X86-NEXT:    movb $8, %al
-; X86-NEXT:    xorb $7, %al
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
+; X86-NOCMOV-LABEL: ctlz_xor7_i8_false:
+; X86-NOCMOV:       # %bb.0:
+; X86-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NOCMOV-NEXT:    testb %al, %al
+; X86-NOCMOV-NEXT:    je .LBB16_1
+; X86-NOCMOV-NEXT:  # %bb.2: # %cond.false
+; X86-NOCMOV-NEXT:    movzbl %al, %eax
+; X86-NOCMOV-NEXT:    bsrl %eax, %eax
+; X86-NOCMOV-NEXT:    xorl $7, %eax
+; X86-NOCMOV-NEXT:    xorb $7, %al
+; X86-NOCMOV-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NOCMOV-NEXT:    retl
+; X86-NOCMOV-NEXT:  .LBB16_1:
+; X86-NOCMOV-NEXT:    movb $8, %al
+; X86-NOCMOV-NEXT:    xorb $7, %al
+; X86-NOCMOV-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NOCMOV-NEXT:    retl
+;
+; X86-CMOV-LABEL: ctlz_xor7_i8_false:
+; X86-CMOV:       # %bb.0:
+; X86-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-CMOV-NEXT:    bsrl %eax, %ecx
+; X86-CMOV-NEXT:    movl $15, %eax
+; X86-CMOV-NEXT:    cmovnel %ecx, %eax
+; X86-CMOV-NEXT:    # kill: def $al killed $al killed $eax
+; X86-CMOV-NEXT:    retl
 ;
 ; X64-LABEL: ctlz_xor7_i8_false:
 ; X64:       # %bb.0:
-; X64-NEXT:    testb %dil, %dil
-; X64-NEXT:    je .LBB16_1
-; X64-NEXT:  # %bb.2: # %cond.false
 ; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    bsrl %eax, %eax
-; X64-NEXT:    xorl $7, %eax
-; X64-NEXT:    xorb $7, %al
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB16_1:
-; X64-NEXT:    movb $8, %al
-; X64-NEXT:    xorb $7, %al
+; X64-NEXT:    bsrl %eax, %ecx
+; X64-NEXT:    movl $15, %eax
+; X64-NEXT:    cmovnel %ecx, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 ;
@@ -1060,33 +1070,33 @@ define i16 @ctlz_xor15_i16_true(i16 %x) {
 }
 
 define i32 @ctlz_xor31_i32_false(i32 %x) {
-; X86-LABEL: ctlz_xor31_i32_false:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    je .LBB18_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    bsrl %eax, %eax
-; X86-NEXT:    xorl $31, %eax
-; X86-NEXT:    xorl $31, %eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB18_1:
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    xorl $31, %eax
-; X86-NEXT:    retl
+; X86-NOCMOV-LABEL: ctlz_xor31_i32_false:
+; X86-NOCMOV:       # %bb.0:
+; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOCMOV-NEXT:    testl %eax, %eax
+; X86-NOCMOV-NEXT:    je .LBB18_1
+; X86-NOCMOV-NEXT:  # %bb.2: # %cond.false
+; X86-NOCMOV-NEXT:    bsrl %eax, %eax
+; X86-NOCMOV-NEXT:    xorl $31, %eax
+; X86-NOCMOV-NEXT:    xorl $31, %eax
+; X86-NOCMOV-NEXT:    retl
+; X86-NOCMOV-NEXT:  .LBB18_1:
+; X86-NOCMOV-NEXT:    movl $32, %eax
+; X86-NOCMOV-NEXT:    xorl $31, %eax
+; X86-NOCMOV-NEXT:    retl
+;
+; X86-CMOV-LABEL: ctlz_xor31_i32_false:
+; X86-CMOV:       # %bb.0:
+; X86-CMOV-NEXT:    bsrl {{[0-9]+}}(%esp), %ecx
+; X86-CMOV-NEXT:    movl $63, %eax
+; X86-CMOV-NEXT:    cmovnel %ecx, %eax
+; X86-CMOV-NEXT:    retl
 ;
 ; X64-LABEL: ctlz_xor31_i32_false:
 ; X64:       # %bb.0:
-; X64-NEXT:    testl %edi, %edi
-; X64-NEXT:    je .LBB18_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    bsrl %edi, %eax
-; X64-NEXT:    xorl $31, %eax
-; X64-NEXT:    xorl $31, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB18_1:
-; X64-NEXT:    movl $32, %eax
-; X64-NEXT:    xorl $31, %eax
+; X64-NEXT:    bsrl %edi, %ecx
+; X64-NEXT:    movl $63, %eax
+; X64-NEXT:    cmovnel %ecx, %eax
 ; X64-NEXT:    retq
 ;
 ; X86-CLZ-LABEL: ctlz_xor31_i32_false:
diff --git a/llvm/test/CodeGen/X86/cttz.ll b/llvm/test/CodeGen/X86/cttz.ll
index b35a1b72fcb6f12..27f229b18bf057b 100644
--- a/llvm/test/CodeGen/X86/cttz.ll
+++ b/llvm/test/CodeGen/X86/cttz.ll
@@ -303,17 +303,24 @@ define i16 @cttz_i16_zero_test(i16 %n) {
 
 ; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
 define i32 @cttz_i32_zero_test(i32 %n) {
-; X86-LABEL: cttz_i32_zero_test:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    je .LBB6_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    rep bsfl %eax, %eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB6_1:
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    retl
+; X86-NOCMOV-LABEL: cttz_i32_zero_test:
+; X86-NOCMOV:       # %bb.0:
+; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOCMOV-NEXT:    testl %eax, %eax
+; X86-NOCMOV-NEXT:    je .LBB6_1
+; X86-NOCMOV-NEXT:  # %bb.2: # %cond.false
+; X86-NOCMOV-NEXT:    rep bsfl %eax, %eax
+; X86-NOCMOV-NEXT:    retl
+; X86-NOCMOV-NEXT:  .LBB6_1:
+; X86-NOCMOV-NEXT:    movl $32, %eax
+; X86-NOCMOV-NEXT:    retl
+;
+; X86-CMOV-LABEL: cttz_i32_zero_test:
+; X86-CMOV:       # %bb.0:
+; X86-CMOV-NEXT:    bsfl {{[0-9]+}}(%esp), %ecx
+; X86-CMOV-NEXT:    movl $32, %eax
+; X86-CMOV-NEXT:    cmovnel %ecx, %eax
+; X86-CMOV-NEXT:    retl
 ;
 ; X64-LABEL: cttz_i32_zero_test:
 ; X64:       # %bb.0:
@@ -386,13 +393,9 @@ define i64 @cttz_i64_zero_test(i64 %n) {
 ;
 ; X64-LABEL: cttz_i64_zero_test:
 ; X64:       # %bb.0:
-; X64-NEXT:    testq %rdi, %rdi
-; X64-NEXT:    je .LBB7_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    rep bsfq %rdi, %rax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB7_1:
+; X64-NEXT:    bsfq %rdi, %rcx
 ; X64-NEXT:    movl $64, %eax
+; X64-NEXT:    cmovneq %rcx, %rax
 ; X64-NEXT:    retq
 ;
 ; X86-CLZ-LABEL: cttz_i64_zero_test:
diff --git a/llvm/test/CodeGen/X86/known-never-zero.ll b/llvm/test/CodeGen/X86/known-never-zero.ll
index d5d604a138a719c..ac41a3fe6bb7e44 100644
--- a/llvm/test/CodeGen/X86/known-never-zero.ll
+++ b/llvm/test/CodeGen/X86/known-never-zero.ll
@@ -44,12 +44,9 @@ define i32 @or_maybe_zero(i32 %x, i32 %y) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    je .LBB1_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    rep bsfl %eax, %eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB1_1:
+; X86-NEXT:    bsfl %eax, %ecx
 ; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: or_maybe_zero:
@@ -94,18 +91,14 @@ define i32 @select_known_nonzero(i1 %c, i32 %x) {
 define i32 @select_maybe_zero(i1 %c, i32 %x) {
 ; X86-LABEL: select_maybe_zero:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    orl $1, %ecx
-; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    orl $1, %eax
+; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
-; X86-NEXT:    cmovnel %ecx, %eax
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    je .LBB3_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    rep bsfl %eax, %eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB3_1:
+; X86-NEXT:    cmovnel %eax, %ecx
+; X86-NEXT:    bsfl %ecx, %ecx
 ; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: select_maybe_zero:
@@ -201,13 +194,9 @@ define i32 @shl_maybe_zero(i32 %x, i32 %y) {
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shll %cl, %eax
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    je .LBB7_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    rep bsfl %eax, %eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB7_1:
+; X86-NEXT:    bsfl %eax, %ecx
 ; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: shl_maybe_zero:
@@ -251,17 +240,13 @@ define i32 @uaddsat_known_nonzero(i32 %x) {
 define i32 @uaddsat_maybe_zero(i32 %x, i32 %y) {
 ; X86-LABEL: uaddsat_maybe_zero:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $-1, %eax
-; X86-NEXT:    cmovael %ecx, %eax
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    je .LBB9_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    rep bsfl %eax, %eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB9_1:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl $-1, %ecx
+; X86-NEXT:    cmovael %eax, %ecx
+; X86-NEXT:    bsfl %ecx, %ecx
 ; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: uaddsat_maybe_zero:
@@ -314,13 +299,9 @@ define i32 @umax_maybe_zero(i32 %x, i32 %y) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    cmoval %ecx, %eax
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    je .LBB11_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    rep bsfl %eax, %eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB11_1:
+; X86-NEXT:    bsfl %eax, %ecx
 ; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: umax_maybe_zero:
@@ -372,17 +353,13 @@ define i32 @umin_known_nonzero(i32 %xx, i32 %yy) {
 define i32 @umin_maybe_zero(i32 %x, i32 %y) {
 ; X86-LABEL: umin_maybe_zero:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl $54, %ecx
-; X86-NEXT:    movl $54, %eax
-; X86-NEXT:    cmovbl %ecx, %eax
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    je .LBB13_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    rep bsfl %eax, %eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB13_1:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $54, %eax
+; X86-NEXT:    movl $54, %ecx
+; X86-NEXT:    cmovbl %eax, %ecx
+; X86-NEXT:    bsfl %ecx, %ecx
 ; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: umin_maybe_zero:
@@ -490,17 +467,13 @@ define <4 x i32> @smin_known_zero_vec(<4 x i32> %x, <4 x i32> %y) {
 define i32 @smin_maybe_zero(i32 %x, i32 %y) {
 ; X86-LABEL: smin_maybe_zero:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl $54, %ecx
-; X86-NEXT:    movl $54, %eax
-; X86-NEXT:    cmovll %ecx, %eax
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    je .LBB17_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    rep bsfl %eax, %eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB17_1:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $54, %eax
+; X86-NEXT:    movl $54, %ecx
+; X86-NEXT:    cmovll %eax, %ecx
+; X86-NEXT:    bsfl %ecx, %ecx
 ; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: smin_maybe_zero:
@@ -608,17 +581,13 @@ define <4 x i32> @smax_known_zero_vec(<4 x i32> %x, <4 x i32> %y) {
 define i32 @smax_known_zero(i32 %x, i32 %y) {
 ; X86-LABEL: smax_known_zero:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    testl %ecx, %ecx
-; X86-NEXT:    movl $-1, %eax
-; X86-NEXT:    cmovnsl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    je .LBB21_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    rep bsfl %eax, %eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB21_1:
+; X86-NEXT:    movl $-1, %ecx
+; X86-NEXT:    cmovnsl %eax, %ecx
+; X86-NEXT:    bsfl %ecx, %ecx
 ; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: smax_known_zero:
@@ -643,14 +612,8 @@ define i32 @rotr_known_nonzero(i32 %xx, i32 %y) {
 ; X86-NEXT:    movl $256, %eax # imm = 0x100
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    rorl %cl, %eax
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    je .LBB22_1
-; X86-NEXT:  # %bb.2: # %cond.false
 ; X86-NEXT:    rep bsfl %eax, %eax
 ; X86-NEXT:    retl
-; X86-NEXT:  .LBB22_1:
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    retl
 ;
 ; X64-LABEL: rotr_known_nonzero:
 ; X64:       # %bb.0:
@@ -675,13 +638,9 @@ define i32 @rotr_maybe_zero(i32 %x, i32 %y) {
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    rorl %cl, %eax
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    je .LBB23_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    rep bsfl %eax, %eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB23_1:
+; X86-NEXT:    bsfl %eax, %ecx
 ; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: rotr_maybe_zero:
@@ -733,13 +692,9 @@ define i32 @rotr_with_fshr_maybe_zero(i32 %x, i32 %y) {
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    rorl %cl, %eax
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    je .LBB25_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    rep bsfl %eax, %eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB25_1:
+; X86-NEXT:    bsfl %eax, %ecx
 ; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: rotr_with_fshr_maybe_zero:
@@ -765,14 +720,8 @@ define i32 @rotl_known_nonzero(i32 %xx, i32 %y) {
 ; X86-NEXT:    movl $256, %eax # imm = 0x100
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    roll %cl, %eax
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    je .LBB26_1
-; X86-NEXT:  # %bb.2: # %cond.false
 ; X86-NEXT:    rep bsfl %eax, %eax
 ; X86-NEXT:    retl
-; X86-NEXT:  .LBB26_1:
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    retl
 ;
 ; X64-LABEL: rotl_known_nonzero:
 ; X64:       # %bb.0:
@@ -797,13 +746,9 @@ define i32 @rotl_maybe_zero(i32 %x, i32 %y) {
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    roll %cl, %eax
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    je .LBB27_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    rep bsfl %eax, %eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB27_1:
+; X86-NEXT:    bsfl %eax, %ecx
 ; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: rotl_maybe_zero:
@@ -855,13 +800,9 @@ define i32 @rotl_with_fshl_maybe_zero(i32 %x, i32 %y) {
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    roll %cl, %eax
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    je .LBB29_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    rep bsfl %eax, %eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB29_1:
+; X86-NEXT:    bsfl %eax, %ecx
 ; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: rotl_with_fshl_maybe_zero:
@@ -932,13 +873,9 @@ define i32 @sra_maybe_zero(i32 %x, i32 %y) {
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    sarl %cl, %eax
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    je .LBB32_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    rep bsfl %eax, %eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB32_1:
+; X86-NEXT:    bsfl %eax, %ecx
 ; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: sra_maybe_zero:
@@ -1009,13 +946,9 @@ define i32 @srl_maybe_zero(i32 %x, i32 %y) {
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shrl %cl, %eax
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    je .LBB35_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    rep bsfl %eax, %eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB35_1:
+; X86-NEXT:    bsfl %eax, %ecx
 ; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: srl_maybe_zero:
@@ -1064,13 +997,9 @@ define i32 @udiv_maybe_zero(i32 %x, i32 %y) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    divl {{[0-9]+}}(%esp)
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    je .LBB37_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    rep bsfl %eax, %eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB37_1:
+; X86-NEXT:    bsfl %eax, %ecx
 ; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: udiv_maybe_zero:
@@ -1119,13 +1048,9 @@ define i32 @sdiv_maybe_zero(i32 %x, i32 %y) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cltd
 ; X86-NEXT:    idivl {{[0-9]+}}(%esp)
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    je .LBB39_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    rep bsfl %eax, %eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB39_1:
+; X86-NEXT:    bsfl %eax, %ecx
 ; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: sdiv_maybe_zero:
@@ -1171,12 +1096,9 @@ define i32 @add_maybe_zero(i32 %xx, i32 %y) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    orl $1, %eax
 ; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    je .LBB41_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    rep bsfl %eax, %eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB41_1:
+; X86-NEXT:    bsfl %eax, %ecx
 ; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: add_maybe_zero:
@@ -1249,16 +1171,13 @@ define i32 @sub_known_nonzero_ne_case(i32 %xx, i32 %yy) {
 define i32 @sub_maybe_zero(i32 %x) {
 ; X86-LABEL: sub_maybe_zero:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    orl $64, %eax
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    je .LBB44_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    rep bsfl %eax, %eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB44_1:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    orl $64, %ecx
+; X86-NEXT:    subl %eax, %ecx
+; X86-NEXT:    bsfl %ecx, %ecx
 ; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: sub_maybe_zero:
@@ -1280,14 +1199,11 @@ define i32 @sub_maybe_zero(i32 %x) {
 define i32 @sub_maybe_zero2(i32 %x) {
 ; X86-LABEL: sub_maybe_zero2:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    negl %eax
-; X86-NEXT:    je .LBB45_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    rep bsfl %eax, %eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB45_1:
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    bsfl %eax, %ecx
 ; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: sub_maybe_zero2:
@@ -1310,13 +1226,9 @@ define i32 @mul_known_nonzero_nsw(i32 %x, i32 %yy) {
 ; X86-NEXT:    movl $256, %eax # imm = 0x100
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    imull {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    je .LBB46_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    rep bsfl %eax, %eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB46_1:
+; X86-NEXT:    bsfl %eax, %ecx
 ; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: mul_known_nonzero_nsw:
@@ -1341,13 +1253,9 @@ define i32 @mul_known_nonzero_nuw(i32 %x, i32 %yy) {
 ; X86-NEXT:    movl $256, %eax # imm = 0x100
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    imull {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    je .LBB47_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    rep bsfl %eax, %eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB47_1:
+; X86-NEXT:    bsfl %eax, %ecx
 ; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: mul_known_nonzero_nuw:
@@ -1371,13 +1279,9 @@ define i32 @mul_maybe_zero(i32 %x, i32 %y) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    imull {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    je .LBB48_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    rep bsfl %eax, %eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB48_1:
+; X86-NEXT:    bsfl %eax, %ecx
 ; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: mul_maybe_zero:
@@ -1433,13 +1337,9 @@ define i32 @bitcast_maybe_zero(<2 x i16> %x) {
 ; X86-LABEL: bitcast_maybe_zero:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movd %xmm0, %eax
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    je .LBB50_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    rep bsfl %eax, %eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB50_1:
+; X86-NEXT:    bsfl %eax, %ecx
 ; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: bitcast_maybe_zero:
@@ -1458,15 +1358,9 @@ define i32 @bitcast_maybe_zero(<2 x i16> %x) {
 define i32 @bitcast_from_float(float %x) {
 ; X86-LABEL: bitcast_from_float:
 ; X86:       # %bb.0:
-; X86-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT:    movd %xmm0, %eax
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    je .LBB51_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    rep bsfl %eax, %eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB51_1:
+; X86-NEXT:    bsfl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: bitcast_from_float:
@@ -1511,14 +1405,9 @@ define i32 @zext_maybe_zero(i16 %x) {
 ; X86-LABEL: zext_maybe_zero:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    testw %ax, %ax
-; X86-NEXT:    je .LBB53_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    movzwl %ax, %eax
-; X86-NEXT:    rep bsfl %eax, %eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB53_1:
+; X86-NEXT:    bsfl %eax, %ecx
 ; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: zext_maybe_zero:
@@ -1563,13 +1452,9 @@ define i32 @sext_maybe_zero(i16 %x) {
 ; X86-LABEL: sext_maybe_zero:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movswl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    je .LBB55_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    rep bsfl %eax, %eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB55_1:
+; X86-NEXT:    bsfl %eax, %ecx
 ; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: sext_maybe_zero:
diff --git a/llvm/test/CodeGen/X86/lzcnt-cmp.ll b/llvm/test/CodeGen/X86/lzcnt-cmp.ll
index a9513a373661f49..4f65739cc70dd1a 100644
--- a/llvm/test/CodeGen/X86/lzcnt-cmp.ll
+++ b/llvm/test/CodeGen/X86/lzcnt-cmp.ll
@@ -12,27 +12,11 @@ define i1 @lshr_ctlz_cmpeq_one_i64(i64 %in) nounwind {
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
 ;
-; X64-BSR-LABEL: lshr_ctlz_cmpeq_one_i64:
-; X64-BSR:       # %bb.0:
-; X64-BSR-NEXT:    testq %rdi, %rdi
-; X64-BSR-NEXT:    je .LBB0_1
-; X64-BSR-NEXT:  # %bb.2: # %cond.false
-; X64-BSR-NEXT:    bsrq %rdi, %rax
-; X64-BSR-NEXT:    xorq $63, %rax
-; X64-BSR-NEXT:    shrl $6, %eax
-; X64-BSR-NEXT:    # kill: def $al killed $al killed $rax
-; X64-BSR-NEXT:    retq
-; X64-BSR-NEXT:  .LBB0_1:
-; X64-BSR-NEXT:    movl $64, %eax
-; X64-BSR-NEXT:    shrl $6, %eax
-; X64-BSR-NEXT:    # kill: def $al killed $al killed $rax
-; X64-BSR-NEXT:    retq
-;
-; X64-LZCNT-LABEL: lshr_ctlz_cmpeq_one_i64:
-; X64-LZCNT:       # %bb.0:
-; X64-LZCNT-NEXT:    testq %rdi, %rdi
-; X64-LZCNT-NEXT:    sete %al
-; X64-LZCNT-NEXT:    retq
+; X64-LABEL: lshr_ctlz_cmpeq_one_i64:
+; X64:       # %bb.0:
+; X64-NEXT:    testq %rdi, %rdi
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
   %ctlz = call i64 @llvm.ctlz.i64(i64 %in, i1 0)
   %lshr = lshr i64 %ctlz, 6
   %icmp = icmp eq i64 %lshr, 1
@@ -81,27 +65,11 @@ define i1 @lshr_ctlz_cmpne_zero_i64(i64 %in) nounwind {
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
 ;
-; X64-BSR-LABEL: lshr_ctlz_cmpne_zero_i64:
-; X64-BSR:       # %bb.0:
-; X64-BSR-NEXT:    testq %rdi, %rdi
-; X64-BSR-NEXT:    je .LBB2_1
-; X64-BSR-NEXT:  # %bb.2: # %cond.false
-; X64-BSR-NEXT:    bsrq %rdi, %rax
-; X64-BSR-NEXT:    xorq $63, %rax
-; X64-BSR-NEXT:    shrl $6, %eax
-; X64-BSR-NEXT:    # kill: def $al killed $al killed $rax
-; X64-BSR-NEXT:    retq
-; X64-BSR-NEXT:  .LBB2_1:
-; X64-BSR-NEXT:    movl $64, %eax
-; X64-BSR-NEXT:    shrl $6, %eax
-; X64-BSR-NEXT:    # kill: def $al killed $al killed $rax
-; X64-BSR-NEXT:    retq
-;
-; X64-LZCNT-LABEL: lshr_ctlz_cmpne_zero_i64:
-; X64-LZCNT:       # %bb.0:
-; X64-LZCNT-NEXT:    testq %rdi, %rdi
-; X64-LZCNT-NEXT:    sete %al
-; X64-LZCNT-NEXT:    retq
+; X64-LABEL: lshr_ctlz_cmpne_zero_i64:
+; X64:       # %bb.0:
+; X64-NEXT:    testq %rdi, %rdi
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
   %ctlz = call i64 @llvm.ctlz.i64(i64 %in, i1 0)
   %lshr = lshr i64 %ctlz, 6
   %icmp = icmp ne i64 %lshr, 0
diff --git a/llvm/test/CodeGen/X86/pr57673.ll b/llvm/test/CodeGen/X86/pr57673.ll
index d0ae6cea068dc07..cf7717f420480b5 100644
--- a/llvm/test/CodeGen/X86/pr57673.ll
+++ b/llvm/test/CodeGen/X86/pr57673.ll
@@ -24,35 +24,24 @@ define void @foo() {
   ; NORMAL-NEXT:   [[COPY:%[0-9]+]]:gr8 = COPY [[MOV32r0_]].sub_8bit
   ; NORMAL-NEXT:   [[LEA64r:%[0-9]+]]:gr64 = LEA64r %stack.1.i, 1, $noreg, 0, $noreg
   ; NORMAL-NEXT:   [[DEF:%[0-9]+]]:gr64 = IMPLICIT_DEF
-  ; NORMAL-NEXT:   [[DEF1:%[0-9]+]]:gr64 = IMPLICIT_DEF
   ; NORMAL-NEXT: {{  $}}
   ; NORMAL-NEXT: bb.1.bb_8:
-  ; NORMAL-NEXT:   successors: %bb.5(0x40000000), %bb.2(0x40000000)
+  ; NORMAL-NEXT:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
   ; NORMAL-NEXT: {{  $}}
   ; NORMAL-NEXT:   TEST8rr [[COPY]], [[COPY]], implicit-def $eflags
-  ; NORMAL-NEXT:   JCC_1 %bb.5, 5, implicit $eflags
+  ; NORMAL-NEXT:   JCC_1 %bb.3, 5, implicit $eflags
   ; NORMAL-NEXT:   JMP_1 %bb.2
   ; NORMAL-NEXT: {{  $}}
   ; NORMAL-NEXT: bb.2.bb_mid:
-  ; NORMAL-NEXT:   successors: %bb.4(0x30000000), %bb.3(0x50000000)
+  ; NORMAL-NEXT:   successors: %bb.3(0x80000000)
   ; NORMAL-NEXT: {{  $}}
-  ; NORMAL-NEXT:   TEST64rr [[DEF1]], [[DEF1]], implicit-def $eflags
-  ; NORMAL-NEXT:   JCC_1 %bb.4, 4, implicit $eflags
-  ; NORMAL-NEXT:   JMP_1 %bb.3
-  ; NORMAL-NEXT: {{  $}}
-  ; NORMAL-NEXT: bb.3.cond.false:
-  ; NORMAL-NEXT:   successors: %bb.4(0x80000000)
-  ; NORMAL-NEXT: {{  $}}
-  ; NORMAL-NEXT: bb.4.cond.end:
-  ; NORMAL-NEXT:   successors: %bb.5(0x80000000)
-  ; NORMAL-NEXT: {{  $}}
-  ; NORMAL-NEXT:   [[MOVUPSrm:%[0-9]+]]:vr128 = MOVUPSrm [[LEA64r]], 1, $noreg, 40, $noreg :: (load (s128) from %ir.i4, align 8)
+  ; NORMAL-NEXT:   [[MOVUPSrm:%[0-9]+]]:vr128 = MOVUPSrm %stack.1.i, 1, $noreg, 40, $noreg :: (load (s128) from %ir.i4, align 8)
   ; NORMAL-NEXT:   MOVUPSmr $noreg, 1, $noreg, 0, $noreg, killed [[MOVUPSrm]] :: (store (s128) into `ptr null`, align 8)
-  ; NORMAL-NEXT:   DBG_VALUE_LIST !3, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_plus_uconst, 40), [[LEA64r]], [[LEA64r]], debug-location !8
-  ; NORMAL-NEXT:   [[MOVUPSrm1:%[0-9]+]]:vr128 = MOVUPSrm [[LEA64r]], 1, $noreg, 40, $noreg :: (load (s128) from %ir.i6, align 8)
+  ; NORMAL-NEXT:   DBG_VALUE $noreg, $noreg, !3, !DIExpression(), debug-location !8
+  ; NORMAL-NEXT:   [[MOVUPSrm1:%[0-9]+]]:vr128 = MOVUPSrm %stack.1.i, 1, $noreg, 40, $noreg :: (load (s128) from %ir.i6, align 8)
   ; NORMAL-NEXT:   MOVUPSmr $noreg, 1, $noreg, 0, $noreg, killed [[MOVUPSrm1]] :: (store (s128) into `ptr null`, align 8)
   ; NORMAL-NEXT: {{  $}}
-  ; NORMAL-NEXT: bb.5.bb_last:
+  ; NORMAL-NEXT: bb.3.bb_last:
   ; NORMAL-NEXT:   successors: %bb.1(0x80000000)
   ; NORMAL-NEXT: {{  $}}
   ; NORMAL-NEXT:   ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
@@ -74,35 +63,24 @@ define void @foo() {
   ; INSTRREF-NEXT:   [[COPY:%[0-9]+]]:gr8 = COPY [[MOV32r0_]].sub_8bit
   ; INSTRREF-NEXT:   [[LEA64r:%[0-9]+]]:gr64 = LEA64r %stack.1.i, 1, $noreg, 0, $noreg
   ; INSTRREF-NEXT:   [[DEF:%[0-9]+]]:gr64 = IMPLICIT_DEF
-  ; INSTRREF-NEXT:   [[DEF1:%[0-9]+]]:gr64 = IMPLICIT_DEF
   ; INSTRREF-NEXT: {{  $}}
   ; INSTRREF-NEXT: bb.1.bb_8:
-  ; INSTRREF-NEXT:   successors: %bb.5(0x40000000), %bb.2(0x40000000)
+  ; INSTRREF-NEXT:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
   ; INSTRREF-NEXT: {{  $}}
   ; INSTRREF-NEXT:   TEST8rr [[COPY]], [[COPY]], implicit-def $eflags
-  ; INSTRREF-NEXT:   JCC_1 %bb.5, 5, implicit $eflags
+  ; INSTRREF-NEXT:   JCC_1 %bb.3, 5, implicit $eflags
   ; INSTRREF-NEXT:   JMP_1 %bb.2
   ; INSTRREF-NEXT: {{  $}}
   ; INSTRREF-NEXT: bb.2.bb_mid:
-  ; INSTRREF-NEXT:   successors: %bb.4(0x30000000), %bb.3(0x50000000)
-  ; INSTRREF-NEXT: {{  $}}
-  ; INSTRREF-NEXT:   TEST64rr [[DEF1]], [[DEF1]], implicit-def $eflags
-  ; INSTRREF-NEXT:   JCC_1 %bb.4, 4, implicit $eflags
-  ; INSTRREF-NEXT:   JMP_1 %bb.3
-  ; INSTRREF-NEXT: {{  $}}
-  ; INSTRREF-NEXT: bb.3.cond.false:
-  ; INSTRREF-NEXT:   successors: %bb.4(0x80000000)
-  ; INSTRREF-NEXT: {{  $}}
-  ; INSTRREF-NEXT: bb.4.cond.end:
-  ; INSTRREF-NEXT:   successors: %bb.5(0x80000000)
+  ; INSTRREF-NEXT:   successors: %bb.3(0x80000000)
   ; INSTRREF-NEXT: {{  $}}
-  ; INSTRREF-NEXT:   [[MOVUPSrm:%[0-9]+]]:vr128 = MOVUPSrm [[LEA64r]], 1, $noreg, 40, $noreg :: (load (s128) from %ir.i4, align 8)
+  ; INSTRREF-NEXT:   [[MOVUPSrm:%[0-9]+]]:vr128 = MOVUPSrm %stack.1.i, 1, $noreg, 40, $noreg :: (load (s128) from %ir.i4, align 8)
   ; INSTRREF-NEXT:   MOVUPSmr $noreg, 1, $noreg, 0, $noreg, killed [[MOVUPSrm]] :: (store (s128) into `ptr null`, align 8)
-  ; INSTRREF-NEXT:   DBG_INSTR_REF !3, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), dbg-instr-ref(1, 0), debug-location !8
-  ; INSTRREF-NEXT:   [[MOVUPSrm1:%[0-9]+]]:vr128 = MOVUPSrm [[LEA64r]], 1, $noreg, 40, $noreg :: (load (s128) from %ir.i6, align 8)
+  ; INSTRREF-NEXT:   DBG_VALUE $noreg, $noreg, !3, !DIExpression(), debug-location !8
+  ; INSTRREF-NEXT:   [[MOVUPSrm1:%[0-9]+]]:vr128 = MOVUPSrm %stack.1.i, 1, $noreg, 40, $noreg :: (load (s128) from %ir.i6, align 8)
   ; INSTRREF-NEXT:   MOVUPSmr $noreg, 1, $noreg, 0, $noreg, killed [[MOVUPSrm1]] :: (store (s128) into `ptr null`, align 8)
   ; INSTRREF-NEXT: {{  $}}
-  ; INSTRREF-NEXT: bb.5.bb_last:
+  ; INSTRREF-NEXT: bb.3.bb_last:
   ; INSTRREF-NEXT:   successors: %bb.1(0x80000000)
   ; INSTRREF-NEXT: {{  $}}
   ; INSTRREF-NEXT:   ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
diff --git a/llvm/test/CodeGen/X86/pr89877.ll b/llvm/test/CodeGen/X86/pr89877.ll
index fdbe75b467d992a..19baad26583ada5 100644
--- a/llvm/test/CodeGen/X86/pr89877.ll
+++ b/llvm/test/CodeGen/X86/pr89877.ll
@@ -9,13 +9,9 @@ define i32 @sext_known_nonzero(i16 %xx) {
 ; X86-NEXT:    movl $256, %eax # imm = 0x100
 ; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    cwtl
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    je .LBB0_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    rep bsfl %eax, %eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB0_1:
+; X86-NEXT:    bsfl %eax, %ecx
 ; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: sext_known_nonzero:
diff --git a/llvm/test/CodeGen/X86/pr92569.ll b/llvm/test/CodeGen/X86/pr92569.ll
index f91063089e3a909..0fb4ed7905287cb 100644
--- a/llvm/test/CodeGen/X86/pr92569.ll
+++ b/llvm/test/CodeGen/X86/pr92569.ll
@@ -4,17 +4,13 @@
 define void @PR92569(i64 %arg, <8 x i8> %arg1) {
 ; CHECK-LABEL: PR92569:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    testq %rdi, %rdi
-; CHECK-NEXT:    je .LBB0_1
-; CHECK-NEXT:  # %bb.2: # %cond.false
-; CHECK-NEXT:    rep bsfq %rdi, %rax
-; CHECK-NEXT:    jmp .LBB0_3
-; CHECK-NEXT:  .LBB0_1:
-; CHECK-NEXT:    movl $64, %eax
-; CHECK-NEXT:  .LBB0_3: # %cond.end
-; CHECK-NEXT:    shrb $3, %al
+; CHECK-NEXT:    bsfq %rdi, %rax
+; CHECK-NEXT:    movl $64, %ecx
+; CHECK-NEXT:    cmovneq %rax, %rcx
+; CHECK-NEXT:    shrb $3, %cl
 ; CHECK-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movzbl %al, %eax
+; CHECK-NEXT:    movzbl %cl, %eax
+; CHECK-NEXT:    andl $15, %eax
 ; CHECK-NEXT:    movzbl -24(%rsp,%rax), %eax
 ; CHECK-NEXT:    movl %eax, 0
 ; CHECK-NEXT:    retq
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/cttz-ctlz.ll b/llvm/test/Transforms/CodeGenPrepare/X86/cttz-ctlz.ll
index 06909d950addb6c..2c2923440bf7c20 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/cttz-ctlz.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/cttz-ctlz.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' < %s | FileCheck %s --check-prefix=SLOW
-; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mattr=+bmi < %s | FileCheck %s --check-prefix=FAST_TZ
-; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mattr=+lzcnt < %s | FileCheck %s --check-prefix=FAST_LZ
+; RUN: opt -S -passes="require<profile-summary>,function(codegenprepare)" < %s | FileCheck %s --check-prefix=SLOW
+; RUN: opt -S -passes="require<profile-summary>,function(codegenprepare)" -mattr=+bmi < %s | FileCheck %s --check-prefix=FAST_TZ
+; RUN: opt -S -passes="require<profile-summary>,function(codegenprepare)" -mattr=+lzcnt < %s | FileCheck %s --check-prefix=FAST_LZ
 
-; RUN: opt -S -enable-debugify -passes='require<profile-summary>,function(codegenprepare)' < %s | FileCheck %s --check-prefix=DEBUGINFO
-; RUN: opt -S -enable-debugify -passes='require<profile-summary>,function(codegenprepare)' --try-experimental-debuginfo-iterators < %s | FileCheck %s --check-prefix=DEBUGINFO
+; RUN: opt -S -enable-debugify -passes="require<profile-summary>,function(codegenprepare)" < %s | FileCheck %s --check-prefix=DEBUGINFO
+; RUN: opt -S -enable-debugify -passes="require<profile-summary>,function(codegenprepare)" --try-experimental-debuginfo-iterators < %s | FileCheck %s --check-prefix=DEBUGINFO
 
 target triple = "x86_64-unknown-unknown"
 target datalayout = "e-n32:64"
@@ -16,15 +16,8 @@ target datalayout = "e-n32:64"
 define i64 @cttz(i64 %A) {
 ; SLOW-LABEL: @cttz(
 ; SLOW-NEXT:  entry:
-; SLOW-NEXT:    [[A_FR:%.*]] = freeze i64 [[A:%.*]]
-; SLOW-NEXT:    [[CMPZ:%.*]] = icmp eq i64 [[A_FR]], 0
-; SLOW-NEXT:    br i1 [[CMPZ]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]]
-; SLOW:       cond.false:
-; SLOW-NEXT:    [[Z:%.*]] = call i64 @llvm.cttz.i64(i64 [[A_FR]], i1 true)
-; SLOW-NEXT:    br label [[COND_END]]
-; SLOW:       cond.end:
-; SLOW-NEXT:    [[CTZ:%.*]] = phi i64 [ 64, [[ENTRY:%.*]] ], [ [[Z]], [[COND_FALSE]] ]
-; SLOW-NEXT:    ret i64 [[CTZ]]
+; SLOW-NEXT:    [[Z:%.*]] = call i64 @llvm.cttz.i64(i64 [[A:%.*]], i1 false)
+; SLOW-NEXT:    ret i64 [[Z]]
 ;
 ; FAST_TZ-LABEL: @cttz(
 ; FAST_TZ-NEXT:  entry:
@@ -33,28 +26,14 @@ define i64 @cttz(i64 %A) {
 ;
 ; FAST_LZ-LABEL: @cttz(
 ; FAST_LZ-NEXT:  entry:
-; FAST_LZ-NEXT:    [[A_FR:%.*]] = freeze i64 [[A:%.*]]
-; FAST_LZ-NEXT:    [[CMPZ:%.*]] = icmp eq i64 [[A_FR]], 0
-; FAST_LZ-NEXT:    br i1 [[CMPZ]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]]
-; FAST_LZ:       cond.false:
-; FAST_LZ-NEXT:    [[Z:%.*]] = call i64 @llvm.cttz.i64(i64 [[A_FR]], i1 true)
-; FAST_LZ-NEXT:    br label [[COND_END]]
-; FAST_LZ:       cond.end:
-; FAST_LZ-NEXT:    [[CTZ:%.*]] = phi i64 [ 64, [[ENTRY:%.*]] ], [ [[Z]], [[COND_FALSE]] ]
-; FAST_LZ-NEXT:    ret i64 [[CTZ]]
+; FAST_LZ-NEXT:    [[Z:%.*]] = call i64 @llvm.cttz.i64(i64 [[A:%.*]], i1 false)
+; FAST_LZ-NEXT:    ret i64 [[Z]]
 ;
 ; DEBUGINFO-LABEL: @cttz(
 ; DEBUGINFO-NEXT:  entry:
-; DEBUGINFO-NEXT:    [[A_FR:%.*]] = freeze i64 [[A:%.*]], !dbg [[DBG11:![0-9]+]]
-; DEBUGINFO-NEXT:    [[CMPZ:%.*]] = icmp eq i64 [[A_FR]], 0, !dbg [[DBG11]]
-; DEBUGINFO-NEXT:    br i1 [[CMPZ]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG11]]
-; DEBUGINFO:       cond.false:
-; DEBUGINFO-NEXT:    [[Z:%.*]] = call i64 @llvm.cttz.i64(i64 [[A_FR]], i1 true), !dbg [[DBG11]]
-; DEBUGINFO-NEXT:    br label [[COND_END]], !dbg [[DBG12:![0-9]+]]
-; DEBUGINFO:       cond.end:
-; DEBUGINFO-NEXT:    [[CTZ:%.*]] = phi i64 [ 64, [[ENTRY:%.*]] ], [ [[Z]], [[COND_FALSE]] ], !dbg [[DBG12]]
-; DEBUGINFO-NEXT:      #dbg_value(i64 [[CTZ]], [[META9:![0-9]+]], !DIExpression(), [[DBG11]])
-; DEBUGINFO-NEXT:    ret i64 [[CTZ]], !dbg [[DBG12]]
+; DEBUGINFO-NEXT:    [[Z:%.*]] = call i64 @llvm.cttz.i64(i64 [[A:%.*]], i1 false), !dbg [[DBG11:![0-9]+]]
+; DEBUGINFO-NEXT:      #dbg_value(i64 [[Z]], [[META9:![0-9]+]], !DIExpression(), [[DBG11]])
+; DEBUGINFO-NEXT:    ret i64 [[Z]], !dbg [[DBG12:![0-9]+]]
 ;
 entry:
   %z = call i64 @llvm.cttz.i64(i64 %A, i1 false)
@@ -64,27 +43,13 @@ entry:
 define i64 @ctlz(i64 %A) {
 ; SLOW-LABEL: @ctlz(
 ; SLOW-NEXT:  entry:
-; SLOW-NEXT:    [[A_FR:%.*]] = freeze i64 [[A:%.*]]
-; SLOW-NEXT:    [[CMPZ:%.*]] = icmp eq i64 [[A_FR]], 0
-; SLOW-NEXT:    br i1 [[CMPZ]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]]
-; SLOW:       cond.false:
-; SLOW-NEXT:    [[Z:%.*]] = call i64 @llvm.ctlz.i64(i64 [[A_FR]], i1 true)
-; SLOW-NEXT:    br label [[COND_END]]
-; SLOW:       cond.end:
-; SLOW-NEXT:    [[CTZ:%.*]] = phi i64 [ 64, [[ENTRY:%.*]] ], [ [[Z]], [[COND_FALSE]] ]
-; SLOW-NEXT:    ret i64 [[CTZ]]
+; SLOW-NEXT:    [[Z:%.*]] = call i64 @llvm.ctlz.i64(i64 [[A:%.*]], i1 false)
+; SLOW-NEXT:    ret i64 [[Z]]
 ;
 ; FAST_TZ-LABEL: @ctlz(
 ; FAST_TZ-NEXT:  entry:
-; FAST_TZ-NEXT:    [[A_FR:%.*]] = freeze i64 [[A:%.*]]
-; FAST_TZ-NEXT:    [[CMPZ:%.*]] = icmp eq i64 [[A_FR]], 0
-; FAST_TZ-NEXT:    br i1 [[CMPZ]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]]
-; FAST_TZ:       cond.false:
-; FAST_TZ-NEXT:    [[Z:%.*]] = call i64 @llvm.ctlz.i64(i64 [[A_FR]], i1 true)
-; FAST_TZ-NEXT:    br label [[COND_END]]
-; FAST_TZ:       cond.end:
-; FAST_TZ-NEXT:    [[CTZ:%.*]] = phi i64 [ 64, [[ENTRY:%.*]] ], [ [[Z]], [[COND_FALSE]] ]
-; FAST_TZ-NEXT:    ret i64 [[CTZ]]
+; FAST_TZ-NEXT:    [[Z:%.*]] = call i64 @llvm.ctlz.i64(i64 [[A:%.*]], i1 false)
+; FAST_TZ-NEXT:    ret i64 [[Z]]
 ;
 ; FAST_LZ-LABEL: @ctlz(
 ; FAST_LZ-NEXT:  entry:
@@ -93,16 +58,9 @@ define i64 @ctlz(i64 %A) {
 ;
 ; DEBUGINFO-LABEL: @ctlz(
 ; DEBUGINFO-NEXT:  entry:
-; DEBUGINFO-NEXT:    [[A_FR:%.*]] = freeze i64 [[A:%.*]], !dbg [[DBG16:![0-9]+]]
-; DEBUGINFO-NEXT:    [[CMPZ:%.*]] = icmp eq i64 [[A_FR]], 0, !dbg [[DBG16]]
-; DEBUGINFO-NEXT:    br i1 [[CMPZ]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG16]]
-; DEBUGINFO:       cond.false:
-; DEBUGINFO-NEXT:    [[Z:%.*]] = call i64 @llvm.ctlz.i64(i64 [[A_FR]], i1 true), !dbg [[DBG16]]
-; DEBUGINFO-NEXT:    br label [[COND_END]], !dbg [[DBG17:![0-9]+]]
-; DEBUGINFO:       cond.end:
-; DEBUGINFO-NEXT:    [[CTZ:%.*]] = phi i64 [ 64, [[ENTRY:%.*]] ], [ [[Z]], [[COND_FALSE]] ], !dbg [[DBG17]]
-; DEBUGINFO-NEXT:      #dbg_value(i64 [[CTZ]], [[META15:![0-9]+]], !DIExpression(), [[DBG16]])
-; DEBUGINFO-NEXT:    ret i64 [[CTZ]], !dbg [[DBG17]]
+; DEBUGINFO-NEXT:    [[Z:%.*]] = call i64 @llvm.ctlz.i64(i64 [[A:%.*]], i1 false), !dbg [[DBG16:![0-9]+]]
+; DEBUGINFO-NEXT:      #dbg_value(i64 [[Z]], [[META15:![0-9]+]], !DIExpression(), [[DBG16]])
+; DEBUGINFO-NEXT:    ret i64 [[Z]], !dbg [[DBG17:![0-9]+]]
 ;
 entry:
   %z = call i64 @llvm.ctlz.i64(i64 %A, i1 false)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll b/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll
index 0462f125955bf43..8a22e45fe1ca570 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=icelake-server -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
@@ -136,11 +136,32 @@ define void @ctlz_4i64() #0 {
 }
 
 define void @ctlz_4i32() #0 {
-; CHECK-LABEL: @ctlz_4i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
-; CHECK-NEXT:    store <4 x i32> [[TMP2]], ptr @dst32, align 4
-; CHECK-NEXT:    ret void
+; SSE2-LABEL: @ctlz_4i32(
+; SSE2-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4
+; SSE2-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
+; SSE2-NEXT:    store <4 x i32> [[TMP2]], ptr @dst32, align 4
+; SSE2-NEXT:    ret void
+;
+; SSE4-LABEL: @ctlz_4i32(
+; SSE4-NEXT:    [[LD0:%.*]] = load i32, ptr @src32, align 4
+; SSE4-NEXT:    [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4
+; SSE4-NEXT:    [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4
+; SSE4-NEXT:    [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 4
+; SSE4-NEXT:    [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false)
+; SSE4-NEXT:    [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false)
+; SSE4-NEXT:    [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false)
+; SSE4-NEXT:    [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false)
+; SSE4-NEXT:    store i32 [[CTLZ0]], ptr @dst32, align 4
+; SSE4-NEXT:    store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 4
+; SSE4-NEXT:    store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 4
+; SSE4-NEXT:    store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4
+; SSE4-NEXT:    ret void
+;
+; AVX-LABEL: @ctlz_4i32(
+; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4
+; AVX-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
+; AVX-NEXT:    store <4 x i32> [[TMP2]], ptr @dst32, align 4
+; AVX-NEXT:    ret void
 ;
   %ld0 = load i32, ptr @src32, align 4
   %ld1 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4
@@ -158,14 +179,41 @@ define void @ctlz_4i32() #0 {
 }
 
 define void @ctlz_8i32() #0 {
-; SSE-LABEL: @ctlz_8i32(
-; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 2
-; SSE-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
-; SSE-NEXT:    store <4 x i32> [[TMP2]], ptr @dst32, align 2
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2
-; SSE-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP3]], i1 false)
-; SSE-NEXT:    store <4 x i32> [[TMP4]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2
-; SSE-NEXT:    ret void
+; SSE2-LABEL: @ctlz_8i32(
+; SSE2-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 2
+; SSE2-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
+; SSE2-NEXT:    store <4 x i32> [[TMP2]], ptr @dst32, align 2
+; SSE2-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2
+; SSE2-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP3]], i1 false)
+; SSE2-NEXT:    store <4 x i32> [[TMP4]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2
+; SSE2-NEXT:    ret void
+;
+; SSE4-LABEL: @ctlz_8i32(
+; SSE4-NEXT:    [[LD0:%.*]] = load i32, ptr @src32, align 2
+; SSE4-NEXT:    [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2
+; SSE4-NEXT:    [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 2
+; SSE4-NEXT:    [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 2
+; SSE4-NEXT:    [[LD4:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2
+; SSE4-NEXT:    [[LD5:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 5), align 2
+; SSE4-NEXT:    [[LD6:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 6), align 2
+; SSE4-NEXT:    [[LD7:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 7), align 2
+; SSE4-NEXT:    [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false)
+; SSE4-NEXT:    [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false)
+; SSE4-NEXT:    [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false)
+; SSE4-NEXT:    [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false)
+; SSE4-NEXT:    [[CTLZ4:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD4]], i1 false)
+; SSE4-NEXT:    [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 false)
+; SSE4-NEXT:    [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 false)
+; SSE4-NEXT:    [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 false)
+; SSE4-NEXT:    store i32 [[CTLZ0]], ptr @dst32, align 2
+; SSE4-NEXT:    store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 2
+; SSE4-NEXT:    store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 2
+; SSE4-NEXT:    store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 2
+; SSE4-NEXT:    store i32 [[CTLZ4]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2
+; SSE4-NEXT:    store i32 [[CTLZ5]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 5), align 2
+; SSE4-NEXT:    store i32 [[CTLZ6]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 6), align 2
+; SSE4-NEXT:    store i32 [[CTLZ7]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 7), align 2
+; SSE4-NEXT:    ret void
 ;
 ; AVX-LABEL: @ctlz_8i32(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2