From 3b7f84d97fa5be1fcd178a5450362bfbe3ff0234 Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Sun, 15 Nov 2020 20:29:53 +0100
Subject: [PATCH 1/4] [AA] Add missing AAQI parameter

This alias() call did not pass on the AAQueryInfo.
---
 llvm/lib/Analysis/AliasAnalysis.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Analysis/AliasAnalysis.cpp b/llvm/lib/Analysis/AliasAnalysis.cpp
index 456100975ac5e9..515aecdf1e8da9 100644
--- a/llvm/lib/Analysis/AliasAnalysis.cpp
+++ b/llvm/lib/Analysis/AliasAnalysis.cpp
@@ -214,7 +214,7 @@ ModRefInfo AAResults::getModRefInfo(const CallBase *Call,
       unsigned ArgIdx = std::distance(Call->arg_begin(), AI);
       MemoryLocation ArgLoc =
           MemoryLocation::getForArgument(Call, ArgIdx, TLI);
-      AliasResult ArgAlias = alias(ArgLoc, Loc);
+      AliasResult ArgAlias = alias(ArgLoc, Loc, AAQI);
       if (ArgAlias != NoAlias) {
         ModRefInfo ArgMask = getArgModRefInfo(Call, ArgIdx);
         AllArgsMask = unionModRef(AllArgsMask, ArgMask);

From 9bcef58b63776c490fd902290f0efc580e3970bc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?=
Date: Sat, 14 Nov 2020 09:33:36 +0200
Subject: [PATCH 2/4] [OpenMP] Fix building for windows after adding omp_calloc

Differential Revision: https://reviews.llvm.org/D91478
---
 openmp/runtime/tools/generate-def.pl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/openmp/runtime/tools/generate-def.pl b/openmp/runtime/tools/generate-def.pl
index 0298d723c3a3e1..b245387c0fc036 100755
--- a/openmp/runtime/tools/generate-def.pl
+++ b/openmp/runtime/tools/generate-def.pl
@@ -108,8 +108,8 @@ (\%)
     foreach my $entry ( keys( %$entries ) ) {
         if ( not $entries->{ $entry }->{ obsolete } ) {
             my $ordinal = $entries->{ $entry }->{ ordinal };
-            # omp_alloc and omp_free are C/C++ only functions, skip "1000+ordinal" for them
-            if ( $entry =~ m{\A[ok]mp_} and $entry ne "omp_alloc" and $entry ne "omp_free" ) {
+            # omp_alloc, omp_calloc and omp_free are C/C++ only functions, skip "1000+ordinal" for them
+            if ( $entry =~ m{\A[ok]mp_} and $entry ne "omp_alloc" and $entry ne "omp_calloc" and $entry ne "omp_free" ) {
                 if ( not defined( $ordinal ) ) {
                     runtime_error(
                         "Bad entry \"$entry\": ordinal number is not specified."

From 91aa211ea168306ba5d13830806f44aa41e1b5bc Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Sun, 15 Nov 2020 14:57:01 -0500
Subject: [PATCH 3/4] [InstCombine] add vector tests for multi-use demanded
 bits; NFC

See D91415.
---
 llvm/test/Transforms/InstCombine/and.ll | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/llvm/test/Transforms/InstCombine/and.ll b/llvm/test/Transforms/InstCombine/and.ll
index 7bbc66990c4393..3642f50547984c 100644
--- a/llvm/test/Transforms/InstCombine/and.ll
+++ b/llvm/test/Transforms/InstCombine/and.ll
@@ -1100,3 +1100,29 @@ define <2 x i8> @lowmask_add_splat(<2 x i8> %x, <2 x i8>* %p) {
   %r = and <2 x i8> %a, <i8 32, i8 32> ; 0x20
   ret <2 x i8> %r
 }
+
+define <2 x i8> @lowmask_add_splat_undef(<2 x i8> %x, <2 x i8>* %p) {
+; CHECK-LABEL: @lowmask_add_splat_undef(
+; CHECK-NEXT:    [[A:%.*]] = add <2 x i8> [[X:%.*]], <i8 -64, i8 undef>
+; CHECK-NEXT:    store <2 x i8> [[A]], <2 x i8>* [[P:%.*]], align 2
+; CHECK-NEXT:    [[R:%.*]] = and <2 x i8> [[A]], <i8 undef, i8 32>
+; CHECK-NEXT:    ret <2 x i8> [[R]]
+;
+  %a = add <2 x i8> %x, <i8 -64, i8 undef> ; 0xc0
+  store <2 x i8> %a, <2 x i8>* %p
+  %r = and <2 x i8> %a, <i8 undef, i8 32> ; 0x20
+  ret <2 x i8> %r
+}
+
+define <2 x i8> @lowmask_add_vec(<2 x i8> %x, <2 x i8>* %p) {
+; CHECK-LABEL: @lowmask_add_vec(
+; CHECK-NEXT:    [[A:%.*]] = add <2 x i8> [[X:%.*]], <i8 -32, i8 -64>
+; CHECK-NEXT:    store <2 x i8> [[A]], <2 x i8>* [[P:%.*]], align 2
+; CHECK-NEXT:    [[R:%.*]] = and <2 x i8> [[A]], <i8 16, i8 32>
+; CHECK-NEXT:    ret <2 x i8> [[R]]
+;
+  %a = add <2 x i8> %x, <i8 -32, i8 -64> ; 0xe0, 0xc0
+  store <2 x i8> %a, <2 x i8>* %p
+  %r = and <2 x i8> %a, <i8 16, i8 32> ; 0x10, 0x20
+  ret <2 x i8> %r
+}

From e56103d25016c9ce4e98f652ac1a09379793ccf5 Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Sun, 15 Nov 2020 15:08:41 -0500
Subject: [PATCH 4/4] [InstCombine] add multi-use demanded bits fold for add
 with low-bit mask

I noticed an add example like the one from D91343, so here's a similar patch.
The logic is based on the existing code for the single-use demanded bits fold,
but I only matched a constant instead of using computeKnownBits on the
operands because that was the motivating pattern that I noticed.

I think this will allow removing a special-case (but incomplete) dedicated
fold within visitAnd(), but I need to untangle the existing code to be sure.

https://rise4fun.com/Alive/V6fP

  Name: add with low mask
  Pre: (C1 & (-1 u>> countLeadingZeros(C2))) == 0
  %a = add i8 %x, C1
  %r = and i8 %a, C2
  =>
  %r = and i8 %x, C2

Differential Revision: https://reviews.llvm.org/D91415
---
 .../InstCombine/InstCombineSimplifyDemanded.cpp | 15 +++++++++++++++
 llvm/test/Transforms/InstCombine/and.ll         | 12 ++++++++++--
 .../LoopVectorize/X86/float-induction-x86.ll    |  4 ++--
 .../LoopVectorize/if-conversion-nest.ll         |  2 +-
 .../Transforms/LoopVectorize/runtime-check.ll   |  4 ++--
 llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll  |  2 +-
 6 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 3f2a6f8eb2ea97..78621ab16151a9 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -826,6 +826,21 @@ Value *InstCombinerImpl::SimplifyMultipleUseDemandedBits(
   // do simplifications that apply to *just* the one user if we know that
   // this instruction has a simpler value in that context.
   switch (I->getOpcode()) {
+  case Instruction::Add: {
+    // TODO: Allow undefs and/or non-splat vectors.
+    const APInt *C;
+    if (match(I->getOperand(1), m_APInt(C))) {
+      // Right fill the demanded bits for this add to demand the most
+      // significant demanded bit and all those below it.
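+      // For example, with the i8 values from the lowmask_add test below:
+      // DemandedMask = 0x20 has 2 leading zeros, so LowMask = 0x3f; the
+      // add constant 0xc0 has no bits inside 0x3f, so the add can never
+      // carry into the demanded bits.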
+      unsigned Ctlz = DemandedMask.countLeadingZeros();
+      APInt LowMask(APInt::getLowBitsSet(BitWidth, BitWidth - Ctlz));
+      // If we are adding zeros to every bit below the highest demanded bit,
+      // just return the add's variable operand.
+      if ((*C & LowMask).isNullValue())
+        return I->getOperand(0);
+    }
+    break;
+  }
   case Instruction::And: {
     // If either the LHS or the RHS are Zero, the result is zero.
     computeKnownBits(I->getOperand(1), RHSKnown, Depth + 1, CxtI);
diff --git a/llvm/test/Transforms/InstCombine/and.ll b/llvm/test/Transforms/InstCombine/and.ll
index 3642f50547984c..f4e0e01e6fedc2 100644
--- a/llvm/test/Transforms/InstCombine/and.ll
+++ b/llvm/test/Transforms/InstCombine/and.ll
@@ -1049,11 +1049,13 @@ define <2 x i32> @lowmask_sext_in_reg_splat(<2 x i32> %x, <2 x i32>* %p) {
   ret <2 x i32> %and
 }
 
+; Multi-use demanded bits - 'add' doesn't change 'and'
+
 define i8 @lowmask_add(i8 %x) {
 ; CHECK-LABEL: @lowmask_add(
 ; CHECK-NEXT:    [[A:%.*]] = add i8 [[X:%.*]], -64
 ; CHECK-NEXT:    call void @use8(i8 [[A]])
-; CHECK-NEXT:    [[R:%.*]] = and i8 [[A]], 32
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[X]], 32
 ; CHECK-NEXT:    ret i8 [[R]]
 ;
   %a = add i8 %x, -64 ; 0xc0
@@ -1062,6 +1064,8 @@ define i8 @lowmask_add(i8 %x) {
   ret i8 %r
 }
 
+; Negative test - mask overlaps low bit of add
+
 define i8 @not_lowmask_add(i8 %x) {
 ; CHECK-LABEL: @not_lowmask_add(
 ; CHECK-NEXT:    [[A:%.*]] = add i8 [[X:%.*]], -64
@@ -1075,6 +1079,8 @@ define i8 @not_lowmask_add(i8 %x) {
   ret i8 %r
 }
 
+; Negative test - mask overlaps low bit of add
+
 define i8 @not_lowmask_add2(i8 %x) {
 ; CHECK-LABEL: @not_lowmask_add2(
 ; CHECK-NEXT:    [[A:%.*]] = add i8 [[X:%.*]], -96
@@ -1088,11 +1094,13 @@ define i8 @not_lowmask_add2(i8 %x) {
   ret i8 %r
 }
 
+; Multi-use demanded bits - 'add' doesn't change 'and'
+
 define <2 x i8> @lowmask_add_splat(<2 x i8> %x, <2 x i8>* %p) {
 ; CHECK-LABEL: @lowmask_add_splat(
 ; CHECK-NEXT:    [[A:%.*]] = add <2 x i8> [[X:%.*]], <i8 -64, i8 -64>
 ; CHECK-NEXT:    store <2 x i8> [[A]], <2 x i8>* [[P:%.*]], align 2
-; CHECK-NEXT:    [[R:%.*]] = and <2 x i8> [[A]], <i8 32, i8 32>
+; CHECK-NEXT:    [[R:%.*]] = and <2 x i8> [[X]], <i8 32, i8 32>
 ; CHECK-NEXT:    ret <2 x i8> [[R]]
 ;
   %a = add <2 x i8> %x, <i8 -64, i8 -64> ; 0xc0
diff --git a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
index ced3f482561b61..eaa41cd6ed61b5 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
@@ -35,7 +35,7 @@ define void @fp_iv_loop1(float* noalias nocapture %A, i32 %N) #0 {
 ; AUTO_VEC-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP1]], 96
 ; AUTO_VEC-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]]
 ; AUTO_VEC:       vector.ph.new:
-; AUTO_VEC-NEXT:    [[UNROLL_ITER:%.*]] = and i64 [[TMP3]], 1152921504606846972
+; AUTO_VEC-NEXT:    [[UNROLL_ITER:%.*]] = and i64 [[TMP3]], -4
 ; AUTO_VEC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; AUTO_VEC:       vector.body:
 ; AUTO_VEC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ]
@@ -306,7 +306,7 @@ define double @external_use_with_fast_math(double* %a, i64 %n) {
 ; AUTO_VEC-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP2]], 48
 ; AUTO_VEC-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]]
 ; AUTO_VEC:       vector.ph.new:
-; AUTO_VEC-NEXT:    [[UNROLL_ITER:%.*]] = and i64 [[TMP4]], 2305843009213693948
+; AUTO_VEC-NEXT:    [[UNROLL_ITER:%.*]] = and i64 [[TMP4]], -4
 ; AUTO_VEC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; AUTO_VEC:       vector.body:
 ; AUTO_VEC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
index 0cba3fc20ed92b..a96a35e416f13e 100644
--- a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
@@ -25,7 +25,7 @@ define i32 @foo(i32* nocapture %A, i32* nocapture %B, i32 %n) {
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934588
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], -4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check.ll b/llvm/test/Transforms/LoopVectorize/runtime-check.ll
index 4ba898d2d31f2a..1eb090f838bed1 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function foo
 ; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 ; RUN: opt < %s -loop-vectorize -disable-basic-aa -S -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s -check-prefix=FORCED_OPTSIZE
@@ -32,7 +32,7 @@ define i32 @foo(float* nocapture %a, float* nocapture %b, i32 %n) nounwind uwtable
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]], [[DBG9]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]], [[DBG9]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934588, [[DBG9]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], -4, [[DBG9]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]], [[DBG9]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [[DBG9]]
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
index 4eb4c5ac1ed52a..e2b96212fcabf6 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
@@ -38,7 +38,7 @@ define void @vdiv(double* %x, double* %y, double %a, i32 %N) #0 {
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], 12
 ; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]]
 ; CHECK:       vector.ph.new:
-; CHECK-NEXT:    [[UNROLL_ITER:%.*]] = and i64 [[TMP2]], 9223372036854775804
+; CHECK-NEXT:    [[UNROLL_ITER:%.*]] = and i64 [[TMP2]], -4
 ; CHECK-NEXT:    [[TMP4:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
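
The bit test at the heart of patch 4 can also be checked in isolation. The sketch below is illustrative only and not part of the patch series; the helper name foldableLowMaskedAdd is made up for this example, and it assumes only LLVM's APInt. It mirrors the new Instruction::Add case: (X + C) & Mask can become X & Mask when C has no set bits at or below the highest demanded bit, so the add can never carry into the bits the mask keeps. That is the same precondition as in the Alive proof above.

    #include "llvm/ADT/APInt.h"
    #include <cassert>

    using llvm::APInt;

    // Hypothetical helper mirroring the new Instruction::Add case:
    // the fold is legal iff the add constant C has no set bits at or
    // below the highest demanded bit of DemandedMask.
    static bool foldableLowMaskedAdd(const APInt &C, const APInt &DemandedMask) {
      unsigned BitWidth = DemandedMask.getBitWidth();
      unsigned Ctlz = DemandedMask.countLeadingZeros();
      APInt LowMask(APInt::getLowBitsSet(BitWidth, BitWidth - Ctlz));
      return (C & LowMask).isNullValue();
    }

    int main() {
      // lowmask_add: (x + 0xc0) & 0x20 --> x & 0x20; 0xc0 & 0x3f == 0.
      assert(foldableLowMaskedAdd(APInt(8, 0xc0), APInt(8, 0x20)));
      // Like not_lowmask_add2: an add constant 0xa0 overlaps a 0x20 mask
      // (0xa0 & 0x3f == 0x20), so the fold is blocked.
      assert(!foldableLowMaskedAdd(APInt(8, 0xa0), APInt(8, 0x20)));
      return 0;
    }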