Merged master:4a2673d79fd into amd-gfx:e87ce7d5df8

Local branch amd-gfx: e87ce7d Merged master:45ebe38ffc4 into amd-gfx:86103e348b6. Remote branch master: 4a2673d [X86][AVX] Add SimplifyMultipleUseDemandedBits VBROADCAST handling to SimplifyDemandedVectorElts.
jaebaek · May 31, 2020 · 563eb59 · 563eb59
2 parents e87ce7d + 4a2673d
commit 563eb59
Show file tree

Hide file tree

Showing 4 changed files with 801 additions and 14 deletions.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7271,7 +7271,7 @@ static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
 // TODO: Use DemandedElts variant.
 static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
                                    SmallVectorImpl<int> &Mask,
-                                   SelectionDAG &DAG, unsigned Depth,
+                                   const SelectionDAG &DAG, unsigned Depth,
                                    bool ResolveKnownElts);
 
 // Attempt to decode ops that could be represented as a shuffle mask.
@@ -7280,7 +7280,7 @@ static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
 static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
                                SmallVectorImpl<int> &Mask,
                                SmallVectorImpl<SDValue> &Ops,
-                               SelectionDAG &DAG, unsigned Depth,
+                               const SelectionDAG &DAG, unsigned Depth,
                                bool ResolveKnownElts) {
   Mask.clear();
   Ops.clear();
@@ -7438,9 +7438,9 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
                                 SubMask, DAG, Depth + 1, ResolveKnownElts))
       return false;
 
-    // Shuffle inputs must be the same size as the subvector.
+    // Subvector shuffle inputs must not be larger than the subvector.
     if (llvm::any_of(SubInputs, [SubVT](SDValue Op) {
-          return SubVT.getSizeInBits() != Op.getValueSizeInBits();
+          return SubVT.getSizeInBits() < Op.getValueSizeInBits();
         }))
       return false;
 
@@ -7460,14 +7460,7 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
       }
     }
     Ops.push_back(Src);
-    for (SDValue &SubInput : SubInputs) {
-      EVT SubSVT = SubInput.getValueType().getScalarType();
-      EVT AltVT = EVT::getVectorVT(*DAG.getContext(), SubSVT,
-                                   NumSizeInBits / SubSVT.getSizeInBits());
-      Ops.push_back(DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), AltVT,
-                                DAG.getUNDEF(AltVT), SubInput,
-                                DAG.getIntPtrConstant(0, SDLoc(N))));
-    }
+    Ops.append(SubInputs.begin(), SubInputs.end());
     for (int i = 0; i != (int)NumElts; ++i)
       Mask.push_back(i);
     for (int i = 0; i != (int)NumSubElts; ++i) {
@@ -7741,7 +7734,7 @@ static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
                                    SmallVectorImpl<SDValue> &Inputs,
                                    SmallVectorImpl<int> &Mask,
                                    APInt &KnownUndef, APInt &KnownZero,
-                                   SelectionDAG &DAG, unsigned Depth,
+                                   const SelectionDAG &DAG, unsigned Depth,
                                    bool ResolveKnownElts) {
   EVT VT = Op.getValueType();
   if (!VT.isSimple() || !VT.isVector())
@@ -7762,7 +7755,7 @@ static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
 
 static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
                                    SmallVectorImpl<int> &Mask,
-                                   SelectionDAG &DAG, unsigned Depth = 0,
+                                   const SelectionDAG &DAG, unsigned Depth = 0,
                                    bool ResolveKnownElts = true) {
   EVT VT = Op.getValueType();
   if (!VT.isSimple() || !VT.isVector())
@@ -36837,6 +36830,12 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
     if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
                                    Depth + 1))
       return true;
+    // Aggressively peek through src to get at the demanded elt.
+    // TODO - we should do this for all target/faux shuffles ops.
+    APInt SrcBits = APInt::getAllOnesValue(SrcVT.getScalarSizeInBits());
+    if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(Src, SrcBits, SrcElts,
+                                                         TLO.DAG, Depth + 1))
+      return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
     break;
   }
   case X86ISD::VPERMV: {

diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -1977,6 +1977,73 @@ define void @splat3_256(<32 x i8> %a0, <96 x i8> *%a1) {
   ret void
 }
 
+; D79987
+define <16 x i32> @splat_v3i32(<3 x i32>* %ptr) {
+; SSE2-LABEL: splat_v3i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,0,1]
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    xorps %xmm3, %xmm3
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: splat_v3i32:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE42-NEXT:    pxor %xmm1, %xmm1
+; SSE42-NEXT:    pxor %xmm2, %xmm2
+; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,0,1]
+; SSE42-NEXT:    pxor %xmm1, %xmm1
+; SSE42-NEXT:    xorps %xmm3, %xmm3
+; SSE42-NEXT:    retq
+;
+; AVX1-LABEL: splat_v3i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT:    vpinsrd $2, 8(%rdi), %xmm0, %xmm1
+; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: splat_v3i32:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX2-SLOW-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2,3,4,5,6,7]
+; AVX2-SLOW-NEXT:    vbroadcastss %xmm1, %ymm1
+; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6,7]
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: splat_v3i32:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-FAST-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FAST-NEXT:    retq
+;
+; XOP-LABEL: splat_v3i32:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; XOP-NEXT:    vpinsrd $2, 8(%rdi), %xmm0, %xmm1
+; XOP-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; XOP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4,5,6,7]
+; XOP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; XOP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6,7]
+; XOP-NEXT:    retq
+  %1 = load <3 x i32>, <3 x i32>* %ptr, align 1
+  %2 = shufflevector <3 x i32> %1, <3 x i32> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %3 = shufflevector <16 x i32> <i32 0, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0>, <16 x i32> %2, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i32 > %3
+}
+
 define <2 x double> @wrongorder(<4 x double> %A, <8 x double>* %P) #0 {
 ; SSE2-LABEL: wrongorder:
 ; SSE2:       # %bb.0:

diff --git a/llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll b/llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -O3                   -S < %s  | FileCheck %s
+; RUN: opt -passes='default<O3>' -S < %s  | FileCheck %s
+
+target triple = "x86_64--"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; PR42174 - https://bugs.llvm.org/show_bug.cgi?id=42174
+; This test should match the IR produced by clang after running -mem2reg.
+; All math before the final 'add' should be scalarized.
+
+define <4 x i32> @square(<4 x i32> %num, i32 %y, i32 %x, i32 %h, i32 %k, i32 %w, i32 %p, i32 %j, i32 %u) {
+; CHECK-LABEL: @square(
+; CHECK-NEXT:    [[DIV:%.*]] = sdiv i32 [[K:%.*]], 2
+; CHECK-NEXT:    [[SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[DIV]], i32 0
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[P:%.*]], 6234
+; CHECK-NEXT:    [[SPLATINSERT2:%.*]] = insertelement <4 x i32> undef, i32 [[MUL]], i32 0
+; CHECK-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[H:%.*]], 75
+; CHECK-NEXT:    [[SPLATINSERT6:%.*]] = insertelement <4 x i32> undef, i32 [[MUL5]], i32 0
+; CHECK-NEXT:    [[DIV9:%.*]] = sdiv i32 [[J:%.*]], 3452
+; CHECK-NEXT:    [[SPLATINSERT10:%.*]] = insertelement <4 x i32> undef, i32 [[DIV9]], i32 0
+; CHECK-NEXT:    [[MUL13:%.*]] = mul nsw i32 [[W:%.*]], 53
+; CHECK-NEXT:    [[SPLATINSERT14:%.*]] = insertelement <4 x i32> undef, i32 [[MUL13]], i32 0
+; CHECK-NEXT:    [[DIV17:%.*]] = sdiv i32 [[X:%.*]], 820
+; CHECK-NEXT:    [[SPLATINSERT18:%.*]] = insertelement <4 x i32> undef, i32 [[DIV17]], i32 0
+; CHECK-NEXT:    [[MUL21:%.*]] = shl nsw i32 [[U:%.*]], 2
+; CHECK-NEXT:    [[SPLATINSERT22:%.*]] = insertelement <4 x i32> undef, i32 [[MUL21]], i32 0
+; CHECK-NEXT:    [[SPLATINSERT25:%.*]] = insertelement <4 x i32> undef, i32 [[Y:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[SPLATINSERT25]], <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[SPLATINSERT18]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[SPLATINSERT6]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP3]], [[SPLATINSERT]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[SPLATINSERT14]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP5]], [[SPLATINSERT2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i32> [[TMP6]], [[SPLATINSERT10]]
+; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]], [[SPLATINSERT22]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP8]], <i32 317425, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[ADD29:%.*]] = add <4 x i32> [[TMP10]], [[NUM:%.*]]
+; CHECK-NEXT:    ret <4 x i32> [[ADD29]]
+;
+  %add = add <4 x i32> %num, <i32 1, i32 1, i32 1, i32 1>
+  %div = sdiv i32 %k, 2
+  %splatinsert = insertelement <4 x i32> undef, i32 %div, i32 0
+  %splat = shufflevector <4 x i32> %splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %add1 = add <4 x i32> %add, %splat
+  %mul = mul nsw i32 %p, 6234
+  %splatinsert2 = insertelement <4 x i32> undef, i32 %mul, i32 0
+  %splat3 = shufflevector <4 x i32> %splatinsert2, <4 x i32> undef, <4 x i32> zeroinitializer
+  %add4 = add <4 x i32> %add1, %splat3
+  %mul5 = mul nsw i32 75, %h
+  %splatinsert6 = insertelement <4 x i32> undef, i32 %mul5, i32 0
+  %splat7 = shufflevector <4 x i32> %splatinsert6, <4 x i32> undef, <4 x i32> zeroinitializer
+  %add8 = add <4 x i32> %add4, %splat7
+  %div9 = sdiv i32 %j, 3452
+  %splatinsert10 = insertelement <4 x i32> undef, i32 %div9, i32 0
+  %splat11 = shufflevector <4 x i32> %splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
+  %add12 = add <4 x i32> %add8, %splat11
+  %mul13 = mul nsw i32 53, %w
+  %splatinsert14 = insertelement <4 x i32> undef, i32 %mul13, i32 0
+  %splat15 = shufflevector <4 x i32> %splatinsert14, <4 x i32> undef, <4 x i32> zeroinitializer
+  %add16 = add <4 x i32> %add12, %splat15
+  %div17 = sdiv i32 %x, 820
+  %splatinsert18 = insertelement <4 x i32> undef, i32 %div17, i32 0
+  %splat19 = shufflevector <4 x i32> %splatinsert18, <4 x i32> undef, <4 x i32> zeroinitializer
+  %add20 = add <4 x i32> %add16, %splat19
+  %mul21 = mul nsw i32 4, %u
+  %splatinsert22 = insertelement <4 x i32> undef, i32 %mul21, i32 0
+  %splat23 = shufflevector <4 x i32> %splatinsert22, <4 x i32> undef, <4 x i32> zeroinitializer
+  %add24 = add <4 x i32> %add20, %splat23
+  %splatinsert25 = insertelement <4 x i32> undef, i32 %y, i32 0
+  %splat26 = shufflevector <4 x i32> %splatinsert25, <4 x i32> undef, <4 x i32> zeroinitializer
+  %add27 = add <4 x i32> %add24, %splat26
+  %add28 = add <4 x i32> %add27, <i32 25, i32 25, i32 25, i32 25>
+  %add29 = add <4 x i32> %add28, <i32 317400, i32 317400, i32 317400, i32 317400>
+  ret <4 x i32> %add29
+}
+