[InstCombine] Reduce range of ctpop for non zero argument #100899

andjo403 · 2024-07-27T21:22:13Z

Part of solution for #95255

llvmbot · 2024-07-27T21:22:46Z

@llvm/pr-subscribers-llvm-transforms

Author: Andreas Jonson (andjo403)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/100899.diff

5 Files Affected:

(modified) llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp (+7-2)
(modified) llvm/test/Transforms/InstCombine/ctpop-pow2.ll (+2-2)
(modified) llvm/test/Transforms/InstCombine/ctpop.ll (+11-2)
(modified) llvm/test/Transforms/InstCombine/icmp-ne-pow2.ll (+2-2)
(modified) llvm/test/Transforms/InstCombine/ispow2.ll (+4-4)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index f6c4b6e180937..52932f19512f3 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -694,8 +694,13 @@ static Instruction *foldCtpop(IntrinsicInst &II, InstCombinerImpl &IC) {
   // Add range attribute since known bits can't completely reflect what we know.
   if (BitWidth != 1 && !II.hasRetAttr(Attribute::Range) &&
       !II.getMetadata(LLVMContext::MD_range)) {
-    ConstantRange Range(APInt(BitWidth, Known.countMinPopulation()),
-                        APInt(BitWidth, Known.countMaxPopulation() + 1));
+    unsigned KnownMinPop = Known.countMinPopulation();
+    unsigned Lower =
+        KnownMinPop ? KnownMinPop
+                    : isKnownNonZero(
+                          Op0, IC.getSimplifyQuery().getWithInstruction(&II));
+    unsigned Upper = Known.countMaxPopulation() + 1;
+    ConstantRange Range(APInt(BitWidth, Lower), APInt(BitWidth, Upper));
     II.addRangeRetAttr(Range);
     return &II;
   }
diff --git a/llvm/test/Transforms/InstCombine/ctpop-pow2.ll b/llvm/test/Transforms/InstCombine/ctpop-pow2.ll
index 7facdaf7590d3..4ef1ed0ec4976 100644
--- a/llvm/test/Transforms/InstCombine/ctpop-pow2.ll
+++ b/llvm/test/Transforms/InstCombine/ctpop-pow2.ll
@@ -60,7 +60,7 @@ define i8 @ctpop_imin_plus1_lshr_nz(i8 %x) {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i8 [[X:%.*]], 0
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP]])
 ; CHECK-NEXT:    [[V:%.*]] = lshr i8 -127, [[X]]
-; CHECK-NEXT:    [[CNT:%.*]] = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 [[V]])
+; CHECK-NEXT:    [[CNT:%.*]] = call range(i8 1, 9) i8 @llvm.ctpop.i8(i8 [[V]])
 ; CHECK-NEXT:    ret i8 [[CNT]]
 ;
   %cmp = icmp ne i8 %x, 0
@@ -104,7 +104,7 @@ define <2 x i32> @ctpop_lshr_intmin_intmin_plus1_vec_nz(<2 x i32> %x) {
 ; CHECK-LABEL: @ctpop_lshr_intmin_intmin_plus1_vec_nz(
 ; CHECK-NEXT:    [[X1:%.*]] = or <2 x i32> [[X:%.*]], <i32 1, i32 1>
 ; CHECK-NEXT:    [[SHR:%.*]] = lshr <2 x i32> <i32 -2147483648, i32 -2147483647>, [[X1]]
-; CHECK-NEXT:    [[CNT:%.*]] = call range(i32 0, 17) <2 x i32> @llvm.ctpop.v2i32(<2 x i32> [[SHR]])
+; CHECK-NEXT:    [[CNT:%.*]] = call range(i32 1, 17) <2 x i32> @llvm.ctpop.v2i32(<2 x i32> [[SHR]])
 ; CHECK-NEXT:    ret <2 x i32> [[CNT]]
 ;
   %x1 = or <2 x i32> %x, <i32 1 ,i32 1>
diff --git a/llvm/test/Transforms/InstCombine/ctpop.ll b/llvm/test/Transforms/InstCombine/ctpop.ll
index 83700e72de080..56ac2cad752a3 100644
--- a/llvm/test/Transforms/InstCombine/ctpop.ll
+++ b/llvm/test/Transforms/InstCombine/ctpop.ll
@@ -169,8 +169,8 @@ define <2 x i32> @_parity_of_not_poison(<2 x i32> %x) {
 
 define <2 x i32> @_parity_of_not_poison2(<2 x i32> %x) {
 ; CHECK-LABEL: @_parity_of_not_poison2(
-; CHECK-NEXT:    [[CNT:%.*]] = call range(i32 0, 33) <2 x i32> @llvm.ctpop.v2i32(<2 x i32> [[X:%.*]])
-; CHECK-NEXT:    [[R:%.*]] = and <2 x i32> [[CNT]], <i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = call range(i32 0, 33) <2 x i32> @llvm.ctpop.v2i32(<2 x i32> [[X:%.*]])
+; CHECK-NEXT:    [[R:%.*]] = and <2 x i32> [[TMP1]], <i32 1, i32 poison>
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
   %neg = xor <2 x i32> %x, <i32 -1 ,i32 -1>
@@ -485,3 +485,12 @@ define i32 @select_ctpop_zero(i32 %x) {
   %res = select i1 %cmp, i32 0, i32 %ctpop
   ret i32 %res
 }
+
+define i32 @ctpop_non_zero(i32 range(i32 1, 255) %x) {
+; CHECK-LABEL: @ctpop_non_zero(
+; CHECK-NEXT:    [[CTPOP:%.*]] = call range(i32 1, 9) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
+; CHECK-NEXT:    ret i32 [[CTPOP]]
+;
+  %ctpop = call i32 @llvm.ctpop.i32(i32 %x)
+  ret i32 %ctpop
+}
diff --git a/llvm/test/Transforms/InstCombine/icmp-ne-pow2.ll b/llvm/test/Transforms/InstCombine/icmp-ne-pow2.ll
index e9ec6b415d462..618f5d641dc1a 100644
--- a/llvm/test/Transforms/InstCombine/icmp-ne-pow2.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-ne-pow2.ll
@@ -306,7 +306,7 @@ define i32 @pow2_32_nonconst_assume(i32 %x, i32 %y) {
 
 define i32 @pow2_32_gtnonconst_assume(i32 %x, i32 %y) {
 ; CHECK-LABEL: @pow2_32_gtnonconst_assume(
-; CHECK-NEXT:    [[CTPOP:%.*]] = call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[Y:%.*]])
+; CHECK-NEXT:    [[CTPOP:%.*]] = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 [[Y:%.*]])
 ; CHECK-NEXT:    [[YP2:%.*]] = icmp eq i32 [[CTPOP]], 1
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[YP2]])
 ; CHECK-NEXT:    [[YGT:%.*]] = icmp ugt i32 [[Y]], [[X:%.*]]
@@ -513,7 +513,7 @@ define i32 @maybe_pow2_32_noncont(i32 %x, i32 %y) {
 ; CHECK-NEXT:    [[YGT8:%.*]] = icmp ugt i32 [[Y:%.*]], 8
 ; CHECK-NEXT:    br i1 [[YGT8]], label [[CONT1:%.*]], label [[CONT2:%.*]]
 ; CHECK:       Cont1:
-; CHECK-NEXT:    [[CTPOP:%.*]] = call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[Y]])
+; CHECK-NEXT:    [[CTPOP:%.*]] = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 [[Y]])
 ; CHECK-NEXT:    [[YP2:%.*]] = icmp eq i32 [[CTPOP]], 1
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[YP2]])
 ; CHECK-NEXT:    br i1 true, label [[CONT2]], label [[FALSE:%.*]]
diff --git a/llvm/test/Transforms/InstCombine/ispow2.ll b/llvm/test/Transforms/InstCombine/ispow2.ll
index a143b1347ccee..3f2c31d05f3ed 100644
--- a/llvm/test/Transforms/InstCombine/ispow2.ll
+++ b/llvm/test/Transforms/InstCombine/ispow2.ll
@@ -197,7 +197,7 @@ define i1 @is_pow2_non_zero_ult_2(i32 %x) {
 ; CHECK-LABEL: @is_pow2_non_zero_ult_2(
 ; CHECK-NEXT:    [[NOTZERO:%.*]] = icmp ne i32 [[X:%.*]], 0
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[NOTZERO]])
-; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X]])
+; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 [[X]])
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[T0]], 2
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
@@ -212,7 +212,7 @@ define i1 @is_pow2_non_zero_eq_1(i32 %x) {
 ; CHECK-LABEL: @is_pow2_non_zero_eq_1(
 ; CHECK-NEXT:    [[NOTZERO:%.*]] = icmp ne i32 [[X:%.*]], 0
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[NOTZERO]])
-; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X]])
+; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 [[X]])
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[T0]], 1
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
@@ -227,7 +227,7 @@ define i1 @is_pow2_non_zero_ugt_1(i32 %x) {
 ; CHECK-LABEL: @is_pow2_non_zero_ugt_1(
 ; CHECK-NEXT:    [[NOTZERO:%.*]] = icmp ne i32 [[X:%.*]], 0
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[NOTZERO]])
-; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X]])
+; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 [[X]])
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[T0]], 1
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
@@ -242,7 +242,7 @@ define i1 @is_pow2_non_zero_ne_1(i32 %x) {
 ; CHECK-LABEL: @is_pow2_non_zero_ne_1(
 ; CHECK-NEXT:    [[NOTZERO:%.*]] = icmp ne i32 [[X:%.*]], 0
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[NOTZERO]])
-; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X]])
+; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 [[X]])
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[T0]], 1
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;

dtcxzyw · 2024-07-28T03:03:19Z

llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp

@@ -694,8 +694,13 @@ static Instruction *foldCtpop(IntrinsicInst &II, InstCombinerImpl &IC) {
  // Add range attribute since known bits can't completely reflect what we know.
  if (BitWidth != 1 && !II.hasRetAttr(Attribute::Range) &&
      !II.getMetadata(LLVMContext::MD_range)) {
-    ConstantRange Range(APInt(BitWidth, Known.countMinPopulation()),
-                        APInt(BitWidth, Known.countMaxPopulation() + 1));
+    unsigned KnownMinPop = Known.countMinPopulation();


There is a phase ordering problem. We may get a more precise result later. See my previous comment #100870 (comment).

Can you add the following test?

define i32 @ctpop_non_zero_with_existing_range_attr(i32 range(i32 1, 255) %x) { %ctpop = call i32 range(i32 0, 9) @llvm.ctpop.i32(i32 %x) ret i32 %ctpop }

did not think that far, forgets that the checks id done multiple times.
but then it fells like the draft that you have is better then this solution so we do not need to execute isKnownNonZero if not needed.

added the test and when for a combination of our solutions that reduced number of calls to intersectWith also used full set range if old is missing as it is faster in intersectWith

PR Link: llvm/llvm-project#100899

dtcxzyw

LGTM. Thank you!

goldsteinn · 2024-07-29T05:14:13Z

LGTM

nikic

LGTM

…#111284) Some targets have better codegen for `ctpop(X) u< 2` than `ctpop(X) == 1`. After #100899, we set the range of ctpop's return value to indicate the argument/result is non-zero. This patch converts `ctpop(X) ==/!= 1` into `ctpop(X) u 2/1` in CGP to fix #95255.

…llvm#111284) Some targets have better codegen for `ctpop(X) u< 2` than `ctpop(X) == 1`. After llvm#100899, we set the range of ctpop's return value to indicate the argument/result is non-zero. This patch converts `ctpop(X) ==/!= 1` into `ctpop(X) u 2/1` in CGP to fix llvm#95255.

…/u> 2/1` (#111284)` (#111998) Relands #111284. Test failure with stage2 build has been fixed by #111946. Some targets have better codegen for `ctpop(X) u< 2` than `ctpop(X) == 1`. After #100899, we set the range of ctpop's return value to indicate the argument/result is non-zero. This patch converts `ctpop(X) ==/!= 1` into `ctpop(X) u 2/1` in CGP to fix #95255.

…llvm#111284) Some targets have better codegen for `ctpop(X) u< 2` than `ctpop(X) == 1`. After llvm#100899, we set the range of ctpop's return value to indicate the argument/result is non-zero. This patch converts `ctpop(X) ==/!= 1` into `ctpop(X) u 2/1` in CGP to fix llvm#95255.

…/u> 2/1` (llvm#111284)` (llvm#111998) Relands llvm#111284. Test failure with stage2 build has been fixed by llvm#111946. Some targets have better codegen for `ctpop(X) u< 2` than `ctpop(X) == 1`. After llvm#100899, we set the range of ctpop's return value to indicate the argument/result is non-zero. This patch converts `ctpop(X) ==/!= 1` into `ctpop(X) u 2/1` in CGP to fix llvm#95255.

…llvm#111284) Some targets have better codegen for `ctpop(X) u< 2` than `ctpop(X) == 1`. After llvm#100899, we set the range of ctpop's return value to indicate the argument/result is non-zero. This patch converts `ctpop(X) ==/!= 1` into `ctpop(X) u 2/1` in CGP to fix llvm#95255.

…/u> 2/1` (llvm#111284)` (llvm#111998) Relands llvm#111284. Test failure with stage2 build has been fixed by llvm#111946. Some targets have better codegen for `ctpop(X) u< 2` than `ctpop(X) == 1`. After llvm#100899, we set the range of ctpop's return value to indicate the argument/result is non-zero. This patch converts `ctpop(X) ==/!= 1` into `ctpop(X) u 2/1` in CGP to fix llvm#95255.

pranavk · 2024-10-28T22:19:07Z

This seems to cause miscompiles for one of our Arm targets. Here's the reproducer:
https://godbolt.org/z/689eKhcnh

The problem seems to be the range attribute that doesn't contain "0" even though smax could return 0 in this case. This then combines with https://github.com/llvm/llvm-project/pull/111998/files which causes incorrect value of b on Arm machines.

andjo403 · 2024-10-28T23:18:34Z

this pass of simplifycfg https://godbolt.org/z/PhqdMcWG3 looks strange to me starts to call ctpop for zero but do not drop the range attribute

dtcxzyw · 2024-10-29T00:05:52Z

this pass of simplifycfg https://godbolt.org/z/PhqdMcWG3 looks strange to me starts to call ctpop for zero but do not drop the range attribute

It is correct. Even if ctpop returns a poison value due to range violation, %6 = select i1 %3, i1 %5, i1 false still returns false.

dtcxzyw · 2024-10-29T00:11:54Z

This seems to cause miscompiles for one of our Arm targets. Here's the reproducer: https://godbolt.org/z/689eKhcnh

The problem seems to be the range attribute that doesn't contain "0" even though smax could return 0 in this case. This then combines with https://github.com/llvm/llvm-project/pull/111998/files which causes incorrect value of b on Arm machines.

The result looks OK? https://godbolt.org/z/T5E1oYvse
bit_ceil(0) = 1, is_power_of_two(0) = 0

pranavk · 2024-10-29T00:15:42Z

In the original repro, I am using absl and c++17. https://godbolt.org/z/35c9xfn6h but I can't make executor run this because it can't find absl libraries.

pranavk · 2024-10-29T00:17:46Z

When I try to execute it on an Arm machine, I get b = INT_MAX.

dtcxzyw · 2024-10-29T00:30:01Z

In the original repro, I am using absl and c++17. https://godbolt.org/z/35c9xfn6h but I can't make executor run this because it can't find absl libraries.

define dso_local noundef range(i32 1, -2147483647) i32 @bucket(int)(i32 noundef %0) local_unnamed_addr #4 { %0 -> 0
  %2 = tail call i32 @llvm.smax.i32(i32 %0, i32 0) -> 0
  %3 = icmp sgt i32 %0, 0 -> false
  %4 = tail call range(i32 1, 32) i32 @llvm.ctpop.i32(i32 %2) -> poison
  %5 = icmp samesign ult i32 %4, 2 -> poison
  %6 = select i1 %3, i1 %5, i1 false -> false
  %7 = tail call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 %2, i1 true) -> poison
  %8 = add nsw i32 %2, -1 -> -1
  %9 = tail call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 %8, i1 false) -> 0
  %10 = select i1 %6, i32 %7, i32 %9 -> 0
  %11 = sub nsw i32 0, %10 -> 0
  %12 = and i32 %11, 31 -> 0
  %13 = shl nuw i32 1, %12 -> 1
  ret i32 %13 -> 1
}

Looks like it is a codegen bug in ARM backend.

dtcxzyw · 2024-10-29T01:34:22Z

I will file an issue later. It seems to be a SDAG bug: https://godbolt.org/z/W6cdshn7q

dtcxzyw · 2024-10-29T03:20:26Z

this pass of simplifycfg https://godbolt.org/z/PhqdMcWG3 looks strange to me starts to call ctpop for zero but do not drop the range attribute

You are correct. But it should be handled by GVNPass: https://godbolt.org/z/Gf9Mob8Kz

andjo403 requested a review from dtcxzyw July 27, 2024 21:22

andjo403 requested a review from nikic as a code owner July 27, 2024 21:22

llvmbot added the llvm:transforms label Jul 27, 2024

dtcxzyw reviewed Jul 28, 2024

View reviewed changes

nikic mentioned this pull request Jul 28, 2024

Allow empty range attribute and add assert for full range #100601

Merged

andjo403 added 2 commits July 28, 2024 23:16

[InstCombine] Add pre-commit tests. NFC.

e113ae8

[InstCombine] Reduce range of ctpop for non zero argument

5a671f5

andjo403 force-pushed the nonZeroCtpop branch from 1aeba28 to 5a671f5 Compare July 28, 2024 21:26

dtcxzyw added a commit to dtcxzyw/llvm-opt-benchmark that referenced this pull request Jul 29, 2024

pre-commit: test PR100899

28551b8

PR Link: llvm/llvm-project#100899

dtcxzyw mentioned this pull request Jul 29, 2024

pre-commit: test PR100899 dtcxzyw/llvm-opt-benchmark#1101

Closed

dtcxzyw approved these changes Jul 29, 2024

View reviewed changes

nikic approved these changes Jul 29, 2024

View reviewed changes

andjo403 merged commit f7491f5 into llvm:main Jul 29, 2024
7 checks passed

andjo403 deleted the nonZeroCtpop branch July 29, 2024 12:07

banach-space pushed a commit to banach-space/llvm-project that referenced this pull request Aug 7, 2024

[InstCombine] Reduce range of ctpop for non zero argument (llvm#100899)

383ffca

dtcxzyw mentioned this pull request Oct 6, 2024

[CodeGenPrepare] Convert ctpop(X) ==/!= 1 into ctpop(X) u 2/1 #111284

Merged

dtcxzyw mentioned this pull request Oct 11, 2024

Backport "[InstCombine] Drop range attributes in foldIsPowerOf2 (#111946)" #111984

Merged

dtcxzyw mentioned this pull request Oct 11, 2024

Reland [CodeGenPrepare] Convert ctpop(X) ==/!= 1 into ctpop(X) u 2/1 (#111284) #111998

Merged

dtcxzyw mentioned this pull request Oct 29, 2024

[GVNPass] Range attribute should be handled after CSE #113997

Closed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[InstCombine] Reduce range of ctpop for non zero argument #100899

[InstCombine] Reduce range of ctpop for non zero argument #100899

andjo403 commented Jul 27, 2024 •

edited

Loading

llvmbot commented Jul 27, 2024

dtcxzyw Jul 28, 2024 •

edited

Loading

andjo403 Jul 28, 2024

andjo403 Jul 28, 2024

dtcxzyw left a comment

goldsteinn commented Jul 29, 2024

nikic left a comment

pranavk commented Oct 28, 2024

andjo403 commented Oct 28, 2024

dtcxzyw commented Oct 29, 2024

dtcxzyw commented Oct 29, 2024

pranavk commented Oct 29, 2024 •

edited

Loading

pranavk commented Oct 29, 2024

dtcxzyw commented Oct 29, 2024

dtcxzyw commented Oct 29, 2024

dtcxzyw commented Oct 29, 2024 •

edited

Loading

[InstCombine] Reduce range of ctpop for non zero argument #100899

[InstCombine] Reduce range of ctpop for non zero argument #100899

Conversation

andjo403 commented Jul 27, 2024 • edited Loading

llvmbot commented Jul 27, 2024

dtcxzyw Jul 28, 2024 • edited Loading

Choose a reason for hiding this comment

andjo403 Jul 28, 2024

Choose a reason for hiding this comment

andjo403 Jul 28, 2024

Choose a reason for hiding this comment

dtcxzyw left a comment

Choose a reason for hiding this comment

goldsteinn commented Jul 29, 2024

nikic left a comment

Choose a reason for hiding this comment

pranavk commented Oct 28, 2024

andjo403 commented Oct 28, 2024

dtcxzyw commented Oct 29, 2024

dtcxzyw commented Oct 29, 2024

pranavk commented Oct 29, 2024 • edited Loading

pranavk commented Oct 29, 2024

dtcxzyw commented Oct 29, 2024

dtcxzyw commented Oct 29, 2024

dtcxzyw commented Oct 29, 2024 • edited Loading

andjo403 commented Jul 27, 2024 •

edited

Loading

dtcxzyw Jul 28, 2024 •

edited

Loading

pranavk commented Oct 29, 2024 •

edited

Loading

dtcxzyw commented Oct 29, 2024 •

edited

Loading