From 3b7f84d97fa5be1fcd178a5450362bfbe3ff0234 Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Sun, 15 Nov 2020 20:29:53 +0100
Subject: [PATCH 1/4] [AA] Add missing AAQI parameter

This alias() call did not pass on the AAQueryInfo.
---
 llvm/lib/Analysis/AliasAnalysis.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Analysis/AliasAnalysis.cpp b/llvm/lib/Analysis/AliasAnalysis.cpp
index 456100975ac5e9..515aecdf1e8da9 100644
--- a/llvm/lib/Analysis/AliasAnalysis.cpp
+++ b/llvm/lib/Analysis/AliasAnalysis.cpp
@@ -214,7 +214,7 @@ ModRefInfo AAResults::getModRefInfo(const CallBase *Call,
       unsigned ArgIdx = std::distance(Call->arg_begin(), AI);
       MemoryLocation ArgLoc =
           MemoryLocation::getForArgument(Call, ArgIdx, TLI);
-      AliasResult ArgAlias = alias(ArgLoc, Loc);
+      AliasResult ArgAlias = alias(ArgLoc, Loc, AAQI);
       if (ArgAlias != NoAlias) {
         ModRefInfo ArgMask = getArgModRefInfo(Call, ArgIdx);
         AllArgsMask = unionModRef(AllArgsMask, ArgMask);

From 9bcef58b63776c490fd902290f0efc580e3970bc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?=
Date: Sat, 14 Nov 2020 09:33:36 +0200
Subject: [PATCH 2/4] [OpenMP] Fix building for windows after adding omp_calloc

Differential Revision: https://reviews.llvm.org/D91478
---
 openmp/runtime/tools/generate-def.pl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/openmp/runtime/tools/generate-def.pl b/openmp/runtime/tools/generate-def.pl
index 0298d723c3a3e1..b245387c0fc036 100755
--- a/openmp/runtime/tools/generate-def.pl
+++ b/openmp/runtime/tools/generate-def.pl
@@ -108,8 +108,8 @@ (\%)
     foreach my $entry ( keys( %$entries ) ) {
         if ( not $entries->{ $entry }->{ obsolete } ) {
             my $ordinal = $entries->{ $entry }->{ ordinal };
-            # omp_alloc and omp_free are C/C++ only functions, skip "1000+ordinal" for them
-            if ( $entry =~ m{\A[ok]mp_} and $entry ne "omp_alloc" and $entry ne "omp_free" ) {
+            # omp_alloc, omp_calloc and omp_free are C/C++ only functions, skip "1000+ordinal" for them
+            if ( $entry =~ m{\A[ok]mp_} and $entry ne "omp_alloc" and $entry ne "omp_calloc" and $entry ne "omp_free" ) {
                 if ( not defined( $ordinal ) ) {
                     runtime_error(
                         "Bad entry \"$entry\": ordinal number is not specified."

From 91aa211ea168306ba5d13830806f44aa41e1b5bc Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Sun, 15 Nov 2020 14:57:01 -0500
Subject: [PATCH 3/4] [InstCombine] add vector tests for multi-use demanded
 bits; NFC

See D91415.
---
 llvm/test/Transforms/InstCombine/and.ll | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/llvm/test/Transforms/InstCombine/and.ll b/llvm/test/Transforms/InstCombine/and.ll
index 7bbc66990c4393..3642f50547984c 100644
--- a/llvm/test/Transforms/InstCombine/and.ll
+++ b/llvm/test/Transforms/InstCombine/and.ll
@@ -1100,3 +1100,29 @@ define <2 x i8> @lowmask_add_splat(<2 x i8> %x, <2 x i8>* %p) {
   %r = and <2 x i8> %a, <i8 32, i8 32> ; 0x20
   ret <2 x i8> %r
 }
+
+define <2 x i8> @lowmask_add_splat_undef(<2 x i8> %x, <2 x i8>* %p) {
+; CHECK-LABEL: @lowmask_add_splat_undef(
+; CHECK-NEXT:    [[A:%.*]] = add <2 x i8> [[X:%.*]], <i8 -64, i8 undef>
+; CHECK-NEXT:    store <2 x i8> [[A]], <2 x i8>* [[P:%.*]], align 2
+; CHECK-NEXT:    [[R:%.*]] = and <2 x i8> [[A]], <i8 undef, i8 32>
+; CHECK-NEXT:    ret <2 x i8> [[R]]
+;
+  %a = add <2 x i8> %x, <i8 -64, i8 undef> ; 0xc0
+  store <2 x i8> %a, <2 x i8>* %p
+  %r = and <2 x i8> %a, <i8 undef, i8 32> ; 0x20
+  ret <2 x i8> %r
+}
+
+define <2 x i8> @lowmask_add_vec(<2 x i8> %x, <2 x i8>* %p) {
+; CHECK-LABEL: @lowmask_add_vec(
+; CHECK-NEXT:    [[A:%.*]] = add <2 x i8> [[X:%.*]], <i8 -32, i8 -64>
+; CHECK-NEXT:    store <2 x i8> [[A]], <2 x i8>* [[P:%.*]], align 2
+; CHECK-NEXT:    [[R:%.*]] = and <2 x i8> [[A]], <i8 16, i8 32>
+; CHECK-NEXT:    ret <2 x i8> [[R]]
+;
+  %a = add <2 x i8> %x, <i8 -32, i8 -64> ; 0xe0, 0xc0
+  store <2 x i8> %a, <2 x i8>* %p
+  %r = and <2 x i8> %a, <i8 16, i8 32> ; 0x10, 0x20
+  ret <2 x i8> %r
+}

From e56103d25016c9ce4e98f652ac1a09379793ccf5 Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Sun, 15 Nov 2020 15:08:41 -0500
Subject: [PATCH 4/4] [InstCombine] add multi-use demanded bits fold for add
 with low-bit mask

I noticed an add example like the one from D91343, so here's a similar patch.
The logic is based on the existing code for the single-use demanded bits fold,
but I only matched a constant instead of using computeKnownBits on the
operands because that was the motivating pattern that I noticed.

I think this will allow removing a special-case (but incomplete) dedicated
fold within visitAnd(), but I need to untangle the existing code to be sure.

https://rise4fun.com/Alive/V6fP

  Name: add with low mask
  Pre: (C1 & (-1 u>> countLeadingZeros(C2))) == 0
  %a = add i8 %x, C1
  %r = and i8 %a, C2
  =>
  %r = and i8 %x, C2

Differential Revision: https://reviews.llvm.org/D91415
---
 .../InstCombine/InstCombineSimplifyDemanded.cpp | 15 +++++++++++++++
 llvm/test/Transforms/InstCombine/and.ll         | 12 ++++++++++--
 .../LoopVectorize/X86/float-induction-x86.ll    |  4 ++--
 .../LoopVectorize/if-conversion-nest.ll         |  2 +-
 .../Transforms/LoopVectorize/runtime-check.ll   |  4 ++--
 llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll  |  2 +-
 6 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 3f2a6f8eb2ea97..78621ab16151a9 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -826,6 +826,21 @@ Value *InstCombinerImpl::SimplifyMultipleUseDemandedBits(
   // do simplifications that apply to *just* the one user if we know that
   // this instruction has a simpler value in that context.
   switch (I->getOpcode()) {
+  case Instruction::Add: {
+    // TODO: Allow undefs and/or non-splat vectors.
+    const APInt *C;
+    if (match(I->getOperand(1), m_APInt(C))) {
+      // Right fill the demanded bits for this add to demand the most
+      // significant demanded bit and all those below it.
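+      // For example, with the i8 values from the lowmask_add test below:
+      // DemandedMask = 0x20 has 2 leading zeros, so LowMask = 0x3f; the
+      // add constant 0xc0 has no bits inside 0x3f, so the add can never
+      // carry into the demanded bits.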
+      unsigned Ctlz = DemandedMask.countLeadingZeros();
+      APInt LowMask(APInt::getLowBitsSet(BitWidth, BitWidth - Ctlz));
+      // If we are adding zeros to every bit below the highest demanded bit,
+      // just return the add's variable operand.
+      if ((*C & LowMask).isNullValue())
+        return I->getOperand(0);
+    }
+    break;
+  }
   case Instruction::And: {
     // If either the LHS or the RHS are Zero, the result is zero.
     computeKnownBits(I->getOperand(1), RHSKnown, Depth + 1, CxtI);
diff --git a/llvm/test/Transforms/InstCombine/and.ll b/llvm/test/Transforms/InstCombine/and.ll
index 3642f50547984c..f4e0e01e6fedc2 100644
--- a/llvm/test/Transforms/InstCombine/and.ll
+++ b/llvm/test/Transforms/InstCombine/and.ll
@@ -1049,11 +1049,13 @@ define <2 x i32> @lowmask_sext_in_reg_splat(<2 x i32> %x, <2 x i32>* %p) {
   ret <2 x i32> %and
 }
 
+; Multi-use demanded bits - 'add' doesn't change 'and'
+
 define i8 @lowmask_add(i8 %x) {
 ; CHECK-LABEL: @lowmask_add(
 ; CHECK-NEXT:    [[A:%.*]] = add i8 [[X:%.*]], -64
 ; CHECK-NEXT:    call void @use8(i8 [[A]])
-; CHECK-NEXT:    [[R:%.*]] = and i8 [[A]], 32
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[X]], 32
 ; CHECK-NEXT:    ret i8 [[R]]
 ;
   %a = add i8 %x, -64 ; 0xc0
@@ -1062,6 +1064,8 @@ define i8 @lowmask_add(i8 %x) {
   ret i8 %r
 }
 
+; Negative test - mask overlaps low bit of add
+
 define i8 @not_lowmask_add(i8 %x) {
 ; CHECK-LABEL: @not_lowmask_add(
 ; CHECK-NEXT:    [[A:%.*]] = add i8 [[X:%.*]], -64
@@ -1075,6 +1079,8 @@ define i8 @not_lowmask_add(i8 %x) {
   ret i8 %r
 }
 
+; Negative test - mask overlaps low bit of add
+
 define i8 @not_lowmask_add2(i8 %x) {
 ; CHECK-LABEL: @not_lowmask_add2(
 ; CHECK-NEXT:    [[A:%.*]] = add i8 [[X:%.*]], -96
@@ -1088,11 +1094,13 @@ define i8 @not_lowmask_add2(i8 %x) {
   ret i8 %r
 }
 
+; Multi-use demanded bits - 'add' doesn't change 'and'
+
 define <2 x i8> @lowmask_add_splat(<2 x i8> %x, <2 x i8>* %p) {
 ; CHECK-LABEL: @lowmask_add_splat(
 ; CHECK-NEXT:    [[A:%.*]] = add <2 x i8> [[X:%.*]], <i8 -64, i8 -64>
 ; CHECK-NEXT:    store <2 x i8> [[A]], <2 x i8>* [[P:%.*]], align 2
-; CHECK-NEXT:    [[R:%.*]] = and <2 x i8> [[A]], <i8 32, i8 32>
+; CHECK-NEXT:    [[R:%.*]] = and <2 x i8> [[X]], <i8 32, i8 32>
 ; CHECK-NEXT:    ret <2 x i8> [[R]]
 ;
   %a = add <2 x i8> %x, <i8 -64, i8 -64> ; 0xc0
diff --git a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
index ced3f482561b61..eaa41cd6ed61b5 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
@@ -35,7 +35,7 @@ define void @fp_iv_loop1(float* noalias nocapture %A, i32 %N) #0 {
 ; AUTO_VEC-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP1]], 96
 ; AUTO_VEC-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]]
 ; AUTO_VEC:       vector.ph.new:
-; AUTO_VEC-NEXT:    [[UNROLL_ITER:%.*]] = and i64 [[TMP3]], 1152921504606846972
+; AUTO_VEC-NEXT:    [[UNROLL_ITER:%.*]] = and i64 [[TMP3]], -4
 ; AUTO_VEC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; AUTO_VEC:       vector.body:
 ; AUTO_VEC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ]
@@ -306,7 +306,7 @@ define double @external_use_with_fast_math(double* %a, i64 %n) {
 ; AUTO_VEC-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP2]], 48
 ; AUTO_VEC-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]]
 ; AUTO_VEC:       vector.ph.new:
-; AUTO_VEC-NEXT:    [[UNROLL_ITER:%.*]] = and i64 [[TMP4]], 2305843009213693948
+; AUTO_VEC-NEXT:    [[UNROLL_ITER:%.*]] = and i64 [[TMP4]], -4
 ; AUTO_VEC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; AUTO_VEC:       vector.body:
 ; AUTO_VEC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
index 0cba3fc20ed92b..a96a35e416f13e 100644
--- a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
@@ -25,7 +25,7 @@ define i32 @foo(i32* nocapture %A, i32* nocapture %B, i32 %n) {
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934588
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], -4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check.ll b/llvm/test/Transforms/LoopVectorize/runtime-check.ll
index 4ba898d2d31f2a..1eb090f838bed1 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function foo
 ; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 ; RUN: opt < %s -loop-vectorize -disable-basic-aa -S -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s -check-prefix=FORCED_OPTSIZE
@@ -32,7 +32,7 @@ define i32 @foo(float* nocapture %a, float* nocapture %b, i32 %n) nounwind uwtable
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]], [[DBG9]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]], [[DBG9]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934588, [[DBG9]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], -4, [[DBG9]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]], [[DBG9]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [[DBG9]]
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
index 4eb4c5ac1ed52a..e2b96212fcabf6 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
@@ -38,7 +38,7 @@ define void @vdiv(double* %x, double* %y, double %a, i32 %N) #0 {
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], 12
 ; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]]
 ; CHECK:       vector.ph.new:
-; CHECK-NEXT:    [[UNROLL_ITER:%.*]] = and i64 [[TMP2]], 9223372036854775804
+; CHECK-NEXT:    [[UNROLL_ITER:%.*]] = and i64 [[TMP2]], -4
 ; CHECK-NEXT:    [[TMP4:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
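
The bit test at the heart of patch 4 can also be checked in isolation. The sketch below is illustrative only and not part of the patch series; the helper name foldableLowMaskedAdd is made up for this example, and it assumes only LLVM's APInt. It mirrors the new Instruction::Add case: (X + C) & Mask can become X & Mask when C has no set bits at or below the highest demanded bit, so the add can never carry into the bits the mask keeps. That is the same precondition as in the Alive proof above.

    #include "llvm/ADT/APInt.h"
    #include <cassert>

    using llvm::APInt;

    // Hypothetical helper mirroring the new Instruction::Add case:
    // the fold is legal iff the add constant C has no set bits at or
    // below the highest demanded bit of DemandedMask.
    static bool foldableLowMaskedAdd(const APInt &C, const APInt &DemandedMask) {
      unsigned BitWidth = DemandedMask.getBitWidth();
      unsigned Ctlz = DemandedMask.countLeadingZeros();
      APInt LowMask(APInt::getLowBitsSet(BitWidth, BitWidth - Ctlz));
      return (C & LowMask).isNullValue();
    }

    int main() {
      // lowmask_add: (x + 0xc0) & 0x20 --> x & 0x20; 0xc0 & 0x3f == 0.
      assert(foldableLowMaskedAdd(APInt(8, 0xc0), APInt(8, 0x20)));
      // Like not_lowmask_add2: an add constant 0xa0 overlaps a 0x20 mask
      // (0xa0 & 0x3f == 0x20), so the fold is blocked.
      assert(!foldableLowMaskedAdd(APInt(8, 0xa0), APInt(8, 0x20)));
      return 0;
    }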