From 22e50833e9564f6be75fcbbabe9d75ca745e778d Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 31 May 2020 20:19:24 +0100 Subject: [PATCH 1/5] [X86][AVX] Reduce unary target shuffles width if the upper elements aren't demanded. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 5 +++ .../X86/avx512-intrinsics-fast-isel.ll | 32 +++++++++---------- llvm/test/CodeGen/X86/vector-reduce-mul.ll | 27 ++++++---------- .../X86/vector-shuffle-combining-avx.ll | 3 +- 4 files changed, 31 insertions(+), 36 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 1cbfd41dcbc324..86825ce8a446c9 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -36908,6 +36908,11 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Src, 0, TLO.DAG, DL, ExtSizeInBits)); } + // Target unary shuffles by immediate: + case X86ISD::PSHUFD: + case X86ISD::PSHUFLW: + case X86ISD::PSHUFHW: + case X86ISD::VPERMILPI: // Byte shifts by immediate. case X86ISD::VSHLDQ: case X86ISD::VSRLDQ: diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll index fee195ae121fd3..295b5271ed0b4e 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -7742,7 +7742,7 @@ define i64 @test_mm512_reduce_max_epi64(<8 x i64> %__W) { ; X86-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0 ; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] ; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vpextrd $1, %xmm0, %edx @@ -7755,7 +7755,7 @@ define i64 @test_mm512_reduce_max_epi64(<8 x i64> %__W) { ; X64-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0 ; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] ; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; X64-NEXT: vmovq %xmm0, %rax ; X64-NEXT: vzeroupper @@ -7781,7 +7781,7 @@ define i64 @test_mm512_reduce_max_epu64(<8 x i64> %__W) { ; X86-NEXT: vpmaxuq %zmm0, %zmm1, %zmm0 ; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] ; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 -; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vpextrd $1, %xmm0, %edx @@ -7794,7 +7794,7 @@ define i64 @test_mm512_reduce_max_epu64(<8 x i64> %__W) { ; X64-NEXT: vpmaxuq %zmm0, %zmm1, %zmm0 ; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] ; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 -; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; X64-NEXT: vmovq %xmm0, %rax ; X64-NEXT: vzeroupper @@ -7867,7 +7867,7 @@ define i64 @test_mm512_reduce_min_epi64(<8 x i64> %__W) { ; X86-NEXT: vpminsq %zmm0, %zmm1, %zmm0 ; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] ; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; X86-NEXT: vpshufd {{.*#+}} 
xmm1 = xmm0[2,3,0,1] ; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vpextrd $1, %xmm0, %edx @@ -7880,7 +7880,7 @@ define i64 @test_mm512_reduce_min_epi64(<8 x i64> %__W) { ; X64-NEXT: vpminsq %zmm0, %zmm1, %zmm0 ; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] ; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; X64-NEXT: vmovq %xmm0, %rax ; X64-NEXT: vzeroupper @@ -7906,7 +7906,7 @@ define i64 @test_mm512_reduce_min_epu64(<8 x i64> %__W) { ; X86-NEXT: vpminuq %zmm0, %zmm1, %zmm0 ; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] ; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0 -; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vpextrd $1, %xmm0, %edx @@ -7919,7 +7919,7 @@ define i64 @test_mm512_reduce_min_epu64(<8 x i64> %__W) { ; X64-NEXT: vpminuq %zmm0, %zmm1, %zmm0 ; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] ; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0 -; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; X64-NEXT: vmovq %xmm0, %rax ; X64-NEXT: vzeroupper @@ -7996,7 +7996,7 @@ define i64 @test_mm512_mask_reduce_max_epi64(i8 zeroext %__M, <8 x i64> %__W) { ; X86-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0 ; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] ; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vpextrd $1, %xmm0, %edx @@ -8012,7 +8012,7 @@ define i64 @test_mm512_mask_reduce_max_epi64(i8 zeroext %__M, <8 x i64> %__W) { ; X64-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0 ; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] ; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; X64-NEXT: vmovq %xmm0, %rax ; X64-NEXT: vzeroupper @@ -8043,7 +8043,7 @@ define i64 @test_mm512_mask_reduce_max_epu64(i8 zeroext %__M, <8 x i64> %__W) { ; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] ; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 -; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vpextrd $1, %xmm0, %edx @@ -8058,7 +8058,7 @@ define i64 @test_mm512_mask_reduce_max_epu64(i8 zeroext %__M, <8 x i64> %__W) { ; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] ; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 -; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; X64-NEXT: vmovq %xmm0, %rax ; X64-NEXT: vzeroupper @@ -8146,7 +8146,7 @@ define i64 @test_mm512_mask_reduce_min_epi64(i8 zeroext %__M, <8 x i64> %__W) { ; X86-NEXT: vpminsq %zmm0, %zmm1, %zmm0 ; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] ; 
X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vpextrd $1, %xmm0, %edx @@ -8162,7 +8162,7 @@ define i64 @test_mm512_mask_reduce_min_epi64(i8 zeroext %__M, <8 x i64> %__W) { ; X64-NEXT: vpminsq %zmm0, %zmm1, %zmm0 ; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] ; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; X64-NEXT: vmovq %xmm0, %rax ; X64-NEXT: vzeroupper @@ -8194,7 +8194,7 @@ define i64 @test_mm512_mask_reduce_min_epu64(i8 zeroext %__M, <8 x i64> %__W) { ; X86-NEXT: vpminuq %zmm0, %zmm1, %zmm0 ; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] ; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0 -; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vpextrd $1, %xmm0, %edx @@ -8210,7 +8210,7 @@ define i64 @test_mm512_mask_reduce_min_epu64(i8 zeroext %__M, <8 x i64> %__W) { ; X64-NEXT: vpminuq %zmm0, %zmm1, %zmm0 ; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] ; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0 -; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; X64-NEXT: vmovq %xmm0, %rax ; X64-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul.ll b/llvm/test/CodeGen/X86/vector-reduce-mul.ll index 09d1472c39e98f..e6f9bb597a225d 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-mul.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-mul.ll @@ -1969,8 +1969,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX2-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -2052,8 +2051,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512DQ-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpand %xmm3, %xmm0, %xmm1 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -2079,8 +2077,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} 
xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpand %xmm3, %xmm0, %xmm1 -; AVX512DQVL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX512DQVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -2238,8 +2235,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-NEXT: vpmullw %xmm0, %xmm3, %xmm0 ; AVX2-NEXT: vpmullw %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero ; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero ; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 @@ -2274,8 +2270,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512BW-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm1 -; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -2311,8 +2306,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BWVL-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] +; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -2608,8 +2602,7 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-NEXT: vpmullw %xmm1, %xmm3, %xmm1 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero ; 
AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -2647,8 +2640,7 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512BW-NEXT: vpmullw %xmm0, %xmm1, %xmm0 ; AVX512BW-NEXT: vpmullw %xmm0, %xmm2, %xmm0 -; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm1 -; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -2687,8 +2679,7 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BWVL-NEXT: vpmullw %xmm0, %xmm1, %xmm0 ; AVX512BWVL-NEXT: vpmullw %xmm0, %xmm2, %xmm0 -; AVX512BWVL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] +; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll index 50a250ba1adf04..6ffbe095c39baf 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll @@ -145,8 +145,7 @@ define <8 x float> @combine_vpermilvar_vperm2f128_zero_8f32(<8 x float> %a0) { define <4 x double> @combine_vperm2f128_vpermilvar_as_vpblendpd(<4 x double> %a0) { ; CHECK-LABEL: combine_vperm2f128_vpermilvar_as_vpblendpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] -; CHECK-NEXT: vmovapd %xmm0, %xmm0 +; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; CHECK-NEXT: ret{{[l|q]}} %1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> ) From 8abe830093f65a0fc6ba398ee1786d4d96607fdf Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sun, 31 May 2020 12:39:14 -0700 Subject: [PATCH 2/5] [X86] Rewrite how X86PartialReduction finds candidates to consider optimizing. Previously we walked the users of any vector binop looking for more binops with the same opcode or phis that eventually ended up in a reduction. While this is simple it also means visiting the same nodes many times since we'll do a forward walk for each BinaryOperator in the chain. It was also far more general than what we have tests for or expect to see. This patch replaces the algorithm with a new method that starts at extract elements looking for a horizontal reduction. Once we find a reduction we walk through backwards through phis and adds to collect leaves that we can consider for rewriting. We only consider single use adds and phis. Except for a special case if the Add is used by a phi that forms a loop back to the Add. Including other single use Adds to support unrolled loops. 
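For illustration only (this snippet is not part of the patch or its tests; the function and value names are made up), the shape the new code starts from is an extract of lane 0 at the bottom of a log2-depth shuffle/add pyramid, roughly:

define i32 @reduce_v8i32(<8 x i32> %v) {
  ; stage 1: add the high half onto the low half
  %s1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %a1 = add <8 x i32> %v, %s1
  ; stage 2: add elements 2..3 onto elements 0..1
  %s2 = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %a2 = add <8 x i32> %a1, %s2
  ; stage 3: add element 1 onto element 0
  %s3 = shufflevector <8 x i32> %a2, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %a3 = add <8 x i32> %a2, %s3
  ; the backwards walk starts at this extract of element 0
  %r = extractelement <8 x i32> %a3, i32 0
  ret i32 %r
}

Here %v is the root returned by the reduction match; in real code it is typically an add/phi chain whose leaves (for example a mul of sign-extended i16 values) are what the replacement routines then consider rewriting.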
Ultimately, I want to narrow the Adds, Phis, and final reduction based on the partial reduction we're doing. I still haven't figured out exactly what that looks like yet. But restricting the types of graphs we expect to handle seemed like a good first step. As does having all the leaves and the reduction at once. Differential Revision: https://reviews.llvm.org/D79971 --- llvm/lib/Target/X86/X86PartialReduction.cpp | 367 ++++++++++---------- llvm/test/CodeGen/X86/madd.ll | 4 +- llvm/test/CodeGen/X86/sad.ll | 2 +- 3 files changed, 190 insertions(+), 183 deletions(-) diff --git a/llvm/lib/Target/X86/X86PartialReduction.cpp b/llvm/lib/Target/X86/X86PartialReduction.cpp index 16108bd1928f60..65caeab1d1cf27 100644 --- a/llvm/lib/Target/X86/X86PartialReduction.cpp +++ b/llvm/lib/Target/X86/X86PartialReduction.cpp @@ -49,11 +49,8 @@ class X86PartialReduction : public FunctionPass { } private: - bool tryMAddPattern(BinaryOperator *BO); - bool tryMAddReplacement(Value *Op, BinaryOperator *Add); - - bool trySADPattern(BinaryOperator *BO); - bool trySADReplacement(Value *Op, BinaryOperator *Add); + bool tryMAddReplacement(Instruction *Op); + bool trySADReplacement(Instruction *Op); }; } @@ -66,139 +63,24 @@ char X86PartialReduction::ID = 0; INITIALIZE_PASS(X86PartialReduction, DEBUG_TYPE, "X86 Partial Reduction", false, false) -static bool isVectorReductionOp(const BinaryOperator &BO) { - if (!BO.getType()->isVectorTy()) +bool X86PartialReduction::tryMAddReplacement(Instruction *Op) { + if (!ST->hasSSE2()) return false; - unsigned Opcode = BO.getOpcode(); - - switch (Opcode) { - case Instruction::Add: - case Instruction::Mul: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: - break; - case Instruction::FAdd: - case Instruction::FMul: - if (auto *FPOp = dyn_cast(&BO)) - if (FPOp->getFastMathFlags().isFast()) - break; - LLVM_FALLTHROUGH; - default: + // Need at least 8 elements. + if (cast(Op->getType())->getNumElements() < 8) return false; - } - unsigned ElemNum = cast(BO.getType())->getNumElements(); - // Ensure the reduction size is a power of 2. - if (!isPowerOf2_32(ElemNum)) + // Element type should be i32. + if (!cast(Op->getType())->getElementType()->isIntegerTy(32)) return false; - unsigned ElemNumToReduce = ElemNum; - - // Do DFS search on the def-use chain from the given instruction. We only - // allow four kinds of operations during the search until we reach the - // instruction that extracts the first element from the vector: - // - // 1. The reduction operation of the same opcode as the given instruction. - // - // 2. PHI node. - // - // 3. ShuffleVector instruction together with a reduction operation that - // does a partial reduction. - // - // 4. ExtractElement that extracts the first element from the vector, and we - // stop searching the def-use chain here. - // - // 3 & 4 above perform a reduction on all elements of the vector. We push defs - // from 1-3 to the stack to continue the DFS. The given instruction is not - // a reduction operation if we meet any other instructions other than those - // listed above. 
- - SmallVector UsersToVisit{&BO}; - SmallPtrSet Visited; - bool ReduxExtracted = false; - - while (!UsersToVisit.empty()) { - auto User = UsersToVisit.back(); - UsersToVisit.pop_back(); - if (!Visited.insert(User).second) - continue; - - for (const auto *U : User->users()) { - auto *Inst = dyn_cast(U); - if (!Inst) - return false; - - if (Inst->getOpcode() == Opcode || isa(U)) { - if (auto *FPOp = dyn_cast(Inst)) - if (!isa(FPOp) && !FPOp->getFastMathFlags().isFast()) - return false; - UsersToVisit.push_back(U); - } else if (auto *ShufInst = dyn_cast(U)) { - // Detect the following pattern: A ShuffleVector instruction together - // with a reduction that do partial reduction on the first and second - // ElemNumToReduce / 2 elements, and store the result in - // ElemNumToReduce / 2 elements in another vector. - - unsigned ResultElements = ShufInst->getType()->getNumElements(); - if (ResultElements < ElemNum) - return false; - - if (ElemNumToReduce == 1) - return false; - if (!isa(U->getOperand(1))) - return false; - for (unsigned i = 0; i < ElemNumToReduce / 2; ++i) - if (ShufInst->getMaskValue(i) != int(i + ElemNumToReduce / 2)) - return false; - for (unsigned i = ElemNumToReduce / 2; i < ElemNum; ++i) - if (ShufInst->getMaskValue(i) != -1) - return false; - - // There is only one user of this ShuffleVector instruction, which - // must be a reduction operation. - if (!U->hasOneUse()) - return false; - - auto *U2 = dyn_cast(*U->user_begin()); - if (!U2 || U2->getOpcode() != Opcode) - return false; - - // Check operands of the reduction operation. - if ((U2->getOperand(0) == U->getOperand(0) && U2->getOperand(1) == U) || - (U2->getOperand(1) == U->getOperand(0) && U2->getOperand(0) == U)) { - UsersToVisit.push_back(U2); - ElemNumToReduce /= 2; - } else - return false; - } else if (isa(U)) { - // At this moment we should have reduced all elements in the vector. - if (ElemNumToReduce != 1) - return false; - - auto *Val = dyn_cast(U->getOperand(1)); - if (!Val || !Val->isZero()) - return false; - - ReduxExtracted = true; - } else - return false; - } - } - return ReduxExtracted; -} - -bool X86PartialReduction::tryMAddReplacement(Value *Op, BinaryOperator *Add) { - BasicBlock *BB = Add->getParent(); - - auto *BO = dyn_cast(Op); - if (!BO || BO->getOpcode() != Instruction::Mul || !BO->hasOneUse() || - BO->getParent() != BB) + auto *Mul = dyn_cast(Op); + if (!Mul || Mul->getOpcode() != Instruction::Mul) return false; - Value *LHS = BO->getOperand(0); - Value *RHS = BO->getOperand(1); + Value *LHS = Mul->getOperand(0); + Value *RHS = Mul->getOperand(1); // LHS and RHS should be only used once or if they are the same then only // used twice. Only check this when SSE4.1 is enabled and we have zext/sext @@ -219,7 +101,7 @@ bool X86PartialReduction::tryMAddReplacement(Value *Op, BinaryOperator *Add) { auto CanShrinkOp = [&](Value *Op) { auto IsFreeTruncation = [&](Value *Op) { if (auto *Cast = dyn_cast(Op)) { - if (Cast->getParent() == BB && + if (Cast->getParent() == Mul->getParent() && (Cast->getOpcode() == Instruction::SExt || Cast->getOpcode() == Instruction::ZExt) && Cast->getOperand(0)->getType()->getScalarSizeInBits() <= 16) @@ -232,16 +114,16 @@ bool X86PartialReduction::tryMAddReplacement(Value *Op, BinaryOperator *Add) { // If the operation can be freely truncated and has enough sign bits we // can shrink. 
if (IsFreeTruncation(Op) && - ComputeNumSignBits(Op, *DL, 0, nullptr, BO) > 16) + ComputeNumSignBits(Op, *DL, 0, nullptr, Mul) > 16) return true; // SelectionDAG has limited support for truncating through an add or sub if // the inputs are freely truncatable. if (auto *BO = dyn_cast(Op)) { - if (BO->getParent() == BB && + if (BO->getParent() == Mul->getParent() && IsFreeTruncation(BO->getOperand(0)) && IsFreeTruncation(BO->getOperand(1)) && - ComputeNumSignBits(Op, *DL, 0, nullptr, BO) > 16) + ComputeNumSignBits(Op, *DL, 0, nullptr, Mul) > 16) return true; } @@ -252,7 +134,7 @@ bool X86PartialReduction::tryMAddReplacement(Value *Op, BinaryOperator *Add) { if (!CanShrinkOp(LHS) && !CanShrinkOp(RHS)) return false; - IRBuilder<> Builder(Add); + IRBuilder<> Builder(Mul); auto *MulTy = cast(Op->getType()); unsigned NumElts = MulTy->getNumElements(); @@ -266,8 +148,11 @@ bool X86PartialReduction::tryMAddReplacement(Value *Op, BinaryOperator *Add) { EvenMask[i] = i * 2; OddMask[i] = i * 2 + 1; } - Value *EvenElts = Builder.CreateShuffleVector(BO, BO, EvenMask); - Value *OddElts = Builder.CreateShuffleVector(BO, BO, OddMask); + // Creating a new mul so the replaceAllUsesWith below doesn't replace the + // uses in the shuffles we're creating. + Value *NewMul = Builder.CreateMul(Mul->getOperand(0), Mul->getOperand(1)); + Value *EvenElts = Builder.CreateShuffleVector(NewMul, NewMul, EvenMask); + Value *OddElts = Builder.CreateShuffleVector(NewMul, NewMul, OddMask); Value *MAdd = Builder.CreateAdd(EvenElts, OddElts); // Concatenate zeroes to extend back to the original type. @@ -276,34 +161,21 @@ bool X86PartialReduction::tryMAddReplacement(Value *Op, BinaryOperator *Add) { Value *Zero = Constant::getNullValue(MAdd->getType()); Value *Concat = Builder.CreateShuffleVector(MAdd, Zero, ConcatMask); - // Replaces the use of mul in the original Add with the pmaddwd and zeroes. - Add->replaceUsesOfWith(BO, Concat); - Add->setHasNoSignedWrap(false); - Add->setHasNoUnsignedWrap(false); + Mul->replaceAllUsesWith(Concat); + Mul->eraseFromParent(); return true; } -// Try to replace operans of this add with pmaddwd patterns. -bool X86PartialReduction::tryMAddPattern(BinaryOperator *BO) { +bool X86PartialReduction::trySADReplacement(Instruction *Op) { if (!ST->hasSSE2()) return false; - // Need at least 8 elements. - if (cast(BO->getType())->getNumElements() < 8) - return false; - - // Element type should be i32. - if (!cast(BO->getType())->getElementType()->isIntegerTy(32)) + // TODO: There's nothing special about i32, any integer type above i16 should + // work just as well. + if (!cast(Op->getType())->getElementType()->isIntegerTy(32)) return false; - bool Changed = false; - Changed |= tryMAddReplacement(BO->getOperand(0), BO); - Changed |= tryMAddReplacement(BO->getOperand(1), BO); - return Changed; -} - -bool X86PartialReduction::trySADReplacement(Value *Op, BinaryOperator *Add) { // Operand should be a select. auto *SI = dyn_cast(Op); if (!SI) @@ -337,7 +209,7 @@ bool X86PartialReduction::trySADReplacement(Value *Op, BinaryOperator *Add) { if (!Op0 || !Op1) return false; - IRBuilder<> Builder(Add); + IRBuilder<> Builder(SI); auto *OpTy = cast(Op->getType()); unsigned NumElts = OpTy->getNumElements(); @@ -355,7 +227,7 @@ bool X86PartialReduction::trySADReplacement(Value *Op, BinaryOperator *Add) { IntrinsicNumElts = 16; } - Function *PSADBWFn = Intrinsic::getDeclaration(Add->getModule(), IID); + Function *PSADBWFn = Intrinsic::getDeclaration(SI->getModule(), IID); if (NumElts < 16) { // Pad input with zeroes. 
@@ -419,27 +291,155 @@ bool X86PartialReduction::trySADReplacement(Value *Op, BinaryOperator *Add) { Ops[0] = Builder.CreateShuffleVector(Ops[0], Zero, ConcatMask); } - // Replaces the uses of Op in Add with the new sequence. - Add->replaceUsesOfWith(Op, Ops[0]); - Add->setHasNoSignedWrap(false); - Add->setHasNoUnsignedWrap(false); + SI->replaceAllUsesWith(Ops[0]); + SI->eraseFromParent(); return true; } -bool X86PartialReduction::trySADPattern(BinaryOperator *BO) { - if (!ST->hasSSE2()) - return false; +// Walk backwards from the ExtractElementInst and determine if it is the end of +// a horizontal reduction. Return the input to the reduction if we find one. +static Value *matchAddReduction(const ExtractElementInst &EE) { + // Make sure we're extracting index 0. + auto *Index = dyn_cast(EE.getIndexOperand()); + if (!Index || !Index->isNullValue()) + return nullptr; - // TODO: There's nothing special about i32, any integer type above i16 should - // work just as well. - if (!cast(BO->getType())->getElementType()->isIntegerTy(32)) + const auto *BO = dyn_cast(EE.getVectorOperand()); + if (!BO || BO->getOpcode() != Instruction::Add || !BO->hasOneUse()) + return nullptr; + + unsigned NumElems = cast(BO->getType())->getNumElements(); + // Ensure the reduction size is a power of 2. + if (!isPowerOf2_32(NumElems)) + return nullptr; + + const Value *Op = BO; + unsigned Stages = Log2_32(NumElems); + for (unsigned i = 0; i != Stages; ++i) { + const auto *BO = dyn_cast(Op); + if (!BO || BO->getOpcode() != Instruction::Add) + return nullptr; + + // If this isn't the first add, then it should only have 2 users, the + // shuffle and another add which we checked in the previous iteration. + if (i != 0 && !BO->hasNUses(2)) + return nullptr; + + Value *LHS = BO->getOperand(0); + Value *RHS = BO->getOperand(1); + + auto *Shuffle = dyn_cast(LHS); + if (Shuffle) { + Op = RHS; + } else { + Shuffle = dyn_cast(RHS); + Op = LHS; + } + + // The first operand of the shuffle should be the same as the other operand + // of the bin op. + if (!Shuffle || Shuffle->getOperand(0) != Op) + return nullptr; + + // Verify the shuffle has the expected (at this stage of the pyramid) mask. + unsigned MaskEnd = 1 << i; + for (unsigned Index = 0; Index < MaskEnd; ++Index) + if (Shuffle->getMaskValue(Index) != (int)(MaskEnd + Index)) + return nullptr; + } + + return const_cast(Op); +} + +// See if this BO is reachable from this Phi by walking forward through single +// use BinaryOperators with the same opcode. If we get back then we know we've +// found a loop and it is safe to step through this Add to find more leaves. +static bool isReachableFromPHI(PHINode *Phi, BinaryOperator *BO) { + // The PHI itself should only have one use. + if (!Phi->hasOneUse()) return false; - bool Changed = false; - Changed |= trySADReplacement(BO->getOperand(0), BO); - Changed |= trySADReplacement(BO->getOperand(1), BO); - return Changed; + Instruction *U = cast(*Phi->user_begin()); + if (U == BO) + return true; + + while (U->hasOneUse() && U->getOpcode() == BO->getOpcode()) + U = cast(*U->user_begin()); + + return U == BO; +} + +// Collect all the leaves of the tree of adds that feeds into the horizontal +// reduction. Root is the Value that is used by the horizontal reduction. +// We look through single use phis, single use adds, or adds that are used by +// a phi that forms a loop with the add. 
+static void collectLeaves(Value *Root, SmallVectorImpl &Leaves) { + SmallPtrSet Visited; + SmallVector Worklist; + Worklist.push_back(Root); + + while (!Worklist.empty()) { + Value *V = Worklist.pop_back_val(); + if (!Visited.insert(V).second) + continue; + + if (auto *PN = dyn_cast(V)) { + // PHI node should have single use unless it is the root node, then it + // has 2 uses. + if (!PN->hasNUses(PN == Root ? 2 : 1)) + break; + + // Push incoming values to the worklist. + for (Value *InV : PN->incoming_values()) + Worklist.push_back(InV); + + continue; + } + + if (auto *BO = dyn_cast(V)) { + if (BO->getOpcode() == Instruction::Add) { + // Simple case. Single use, just push its operands to the worklist. + if (BO->hasNUses(BO == Root ? 2 : 1)) { + for (Value *Op : BO->operands()) + Worklist.push_back(Op); + continue; + } + + // If there is additional use, make sure it is an unvisited phi that + // gets us back to this node. + if (BO->hasNUses(BO == Root ? 3 : 2)) { + PHINode *PN = nullptr; + for (auto *U : Root->users()) + if (auto *P = dyn_cast(U)) + if (!Visited.count(P)) + PN = P; + + // If we didn't find a 2-input PHI then this isn't a case we can + // handle. + if (!PN || PN->getNumIncomingValues() != 2) + continue; + + // Walk forward from this phi to see if it reaches back to this add. + if (!isReachableFromPHI(PN, BO)) + continue; + + // The phi forms a loop with this Add, push its operands. + for (Value *Op : BO->operands()) + Worklist.push_back(Op); + } + } + } + + // Not an add or phi, make it a leaf. + if (auto *I = dyn_cast(V)) { + if (!V->hasNUses(I == Root ? 2 : 1)) + continue; + + // Add this as a leaf. + Leaves.push_back(I); + } + } } bool X86PartialReduction::runOnFunction(Function &F) { @@ -458,22 +458,29 @@ bool X86PartialReduction::runOnFunction(Function &F) { bool MadeChange = false; for (auto &BB : F) { for (auto &I : BB) { - auto *BO = dyn_cast(&I); - if (!BO) + auto *EE = dyn_cast(&I); + if (!EE) continue; - if (!isVectorReductionOp(*BO)) + // First find a reduction tree. + // FIXME: Do we need to handle other opcodes than Add? + Value *Root = matchAddReduction(*EE); + if (!Root) continue; - if (BO->getOpcode() == Instruction::Add) { - if (tryMAddPattern(BO)) { + SmallVector Leaves; + collectLeaves(Root, Leaves); + + for (Instruction *I : Leaves) { + if (tryMAddReplacement(I)) { MadeChange = true; continue; } - if (trySADPattern(BO)) { + + // Don't do SAD matching on the root node. SelectionDAG already + // has support for that and currently generates better code. 
+ if (I != Root && trySADReplacement(I)) MadeChange = true; - continue; - } } } } diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll index d6d04d9b128412..6109bd25c69e2a 100644 --- a/llvm/test/CodeGen/X86/madd.ll +++ b/llvm/test/CodeGen/X86/madd.ll @@ -2657,9 +2657,9 @@ define i32 @madd_double_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>* ; AVX-LABEL: madd_double_reduction: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqu (%rdi), %xmm0 +; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqu (%rdx), %xmm1 ; AVX-NEXT: vpmaddwd (%rcx), %xmm1, %xmm1 -; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -2720,9 +2720,9 @@ define i32 @madd_quad_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>* %a ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: vmovdqu (%rdi), %xmm0 +; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqu (%rdx), %xmm1 ; AVX-NEXT: vpmaddwd (%rcx), %xmm1, %xmm1 -; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqu (%r8), %xmm2 ; AVX-NEXT: vpmaddwd (%r9), %xmm2, %xmm2 ; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll index 006dd3d5ff1788..f55a58048e227a 100644 --- a/llvm/test/CodeGen/X86/sad.ll +++ b/llvm/test/CodeGen/X86/sad.ll @@ -1061,9 +1061,9 @@ define i32 @sad_double_reduction(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x i8>* % ; AVX-LABEL: sad_double_reduction: ; AVX: # %bb.0: # %bb ; AVX-NEXT: vmovdqu (%rdi), %xmm0 +; AVX-NEXT: vpsadbw (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqu (%rdx), %xmm1 ; AVX-NEXT: vpsadbw (%rcx), %xmm1, %xmm1 -; AVX-NEXT: vpsadbw (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 From 403d5a5e351956e950fdb8bba07f804fb7d52742 Mon Sep 17 00:00:00 2001 From: Hubert Tong Date: Sun, 31 May 2020 16:33:42 -0400 Subject: [PATCH 3/5] [test][compiler-rt] Avoid LD_PRELOAD for "outer" dynamic linkers Summary: This patch moves the setting of `LD_PRELOAD` "inwards" to avoid issues where the built library needs to be loaded with the dynamic linker that was configured with the build (and cannot, for example, be loaded by the dynamic linker associated with the `env` utility). 
Reviewed By: vitalybuka, nemanjai, jsji Differential Revision: https://reviews.llvm.org/D79695 --- .../asan/TestCases/Linux/preinstalled_signal.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/compiler-rt/test/asan/TestCases/Linux/preinstalled_signal.cpp b/compiler-rt/test/asan/TestCases/Linux/preinstalled_signal.cpp index 2b50944c6f2f6e..71929fdd9b37fe 100644 --- a/compiler-rt/test/asan/TestCases/Linux/preinstalled_signal.cpp +++ b/compiler-rt/test/asan/TestCases/Linux/preinstalled_signal.cpp @@ -1,16 +1,16 @@ // RUN: %clangxx -std=c++11 %s -o %t -// RUN: env LD_PRELOAD=%shared_libasan %env_asan_opts=handle_segv=1 not %run %t 2>&1 | FileCheck %s -// RUN: env LD_PRELOAD=%shared_libasan %env_asan_opts=handle_segv=2 not %run %t 2>&1 | FileCheck %s +// RUN: %env_asan_opts=handle_segv=1 LD_PRELOAD=%shared_libasan not %run %t 2>&1 | FileCheck %s +// RUN: %env_asan_opts=handle_segv=2 LD_PRELOAD=%shared_libasan not %run %t 2>&1 | FileCheck %s // RUN: %clangxx -std=c++11 -DTEST_INSTALL_SIG_HANDLER %s -o %t -// RUN: env LD_PRELOAD=%shared_libasan %env_asan_opts=handle_segv=0 not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-HANDLER -// RUN: env LD_PRELOAD=%shared_libasan %env_asan_opts=handle_segv=1 not %run %t 2>&1 | FileCheck %s -// RUN: env LD_PRELOAD=%shared_libasan %env_asan_opts=handle_segv=2 not %run %t 2>&1 | FileCheck %s +// RUN: %env_asan_opts=handle_segv=0 LD_PRELOAD=%shared_libasan not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-HANDLER +// RUN: %env_asan_opts=handle_segv=1 LD_PRELOAD=%shared_libasan not %run %t 2>&1 | FileCheck %s +// RUN: %env_asan_opts=handle_segv=2 LD_PRELOAD=%shared_libasan not %run %t 2>&1 | FileCheck %s // RUN: %clangxx -std=c++11 -DTEST_INSTALL_SIG_ACTION %s -o %t -// RUN: env LD_PRELOAD=%shared_libasan %env_asan_opts=handle_segv=0 not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-ACTION -// RUN: env LD_PRELOAD=%shared_libasan %env_asan_opts=handle_segv=1 not %run %t 2>&1 | FileCheck %s -// RUN: env LD_PRELOAD=%shared_libasan %env_asan_opts=handle_segv=2 not %run %t 2>&1 | FileCheck %s +// RUN: %env_asan_opts=handle_segv=0 LD_PRELOAD=%shared_libasan not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-ACTION +// RUN: %env_asan_opts=handle_segv=1 LD_PRELOAD=%shared_libasan not %run %t 2>&1 | FileCheck %s +// RUN: %env_asan_opts=handle_segv=2 LD_PRELOAD=%shared_libasan not %run %t 2>&1 | FileCheck %s // REQUIRES: asan-dynamic-runtime From c15d5d12c625df52bf82828a6af5ef2dfb6b4533 Mon Sep 17 00:00:00 2001 From: Hubert Tong Date: Sun, 31 May 2020 16:38:10 -0400 Subject: [PATCH 4/5] [Driver] NFC: Use Twine temp to replace std::string local This patch replaces a `std::string` local used for a concatenation with a `Twine` where the string was being passed into a call. 
--- clang/lib/Driver/ToolChains/Gnu.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp index 9a340142a24281..ac9eb46dacb512 100644 --- a/clang/lib/Driver/ToolChains/Gnu.cpp +++ b/clang/lib/Driver/ToolChains/Gnu.cpp @@ -449,10 +449,9 @@ void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-export-dynamic"); if (!Args.hasArg(options::OPT_shared) && !IsStaticPIE) { - const std::string Loader = - D.DyldPrefix + ToolChain.getDynamicLinker(Args); CmdArgs.push_back("-dynamic-linker"); - CmdArgs.push_back(Args.MakeArgString(Loader)); + CmdArgs.push_back(Args.MakeArgString(Twine(D.DyldPrefix) + + ToolChain.getDynamicLinker(Args))); } } From 77e1181df446b54391acad08512b540e174cf6e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirst=C3=B3f=20Umann?= Date: Sun, 31 May 2020 21:22:35 +0200 Subject: [PATCH 5/5] [analyzer] Add dumps to CheckerRegistry --- .../StaticAnalyzer/Frontend/CheckerRegistry.h | 10 +++ .../Frontend/CheckerRegistry.cpp | 61 +++++++++++++++++++ 2 files changed, 71 insertions(+) diff --git a/clang/include/clang/StaticAnalyzer/Frontend/CheckerRegistry.h b/clang/include/clang/StaticAnalyzer/Frontend/CheckerRegistry.h index 4e98ba2e10d233..c3494d0ebeefd7 100644 --- a/clang/include/clang/StaticAnalyzer/Frontend/CheckerRegistry.h +++ b/clang/include/clang/StaticAnalyzer/Frontend/CheckerRegistry.h @@ -13,6 +13,7 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/raw_ostream.h" #include #include @@ -133,6 +134,9 @@ class CheckerRegistry { DevelopmentStatus == "released") && "Invalid development status!"); } + + LLVM_DUMP_METHOD void dump() const { dumpToStream(llvm::errs()); } + LLVM_DUMP_METHOD void dumpToStream(llvm::raw_ostream &Out) const; }; using CmdLineOptionList = llvm::SmallVector; @@ -189,6 +193,9 @@ class CheckerRegistry { // Used for lower_bound. explicit CheckerInfo(StringRef FullName) : FullName(FullName) {} + + LLVM_DUMP_METHOD void dump() const { dumpToStream(llvm::errs()); } + LLVM_DUMP_METHOD void dumpToStream(llvm::raw_ostream &Out) const; }; using StateFromCmdLine = CheckerInfo::StateFromCmdLine; @@ -206,6 +213,9 @@ class CheckerRegistry { } explicit PackageInfo(StringRef FullName) : FullName(FullName) {} + + LLVM_DUMP_METHOD void dump() const { dumpToStream(llvm::errs()); } + LLVM_DUMP_METHOD void dumpToStream(llvm::raw_ostream &Out) const; }; using PackageInfoList = llvm::SmallVector; diff --git a/clang/lib/StaticAnalyzer/Frontend/CheckerRegistry.cpp b/clang/lib/StaticAnalyzer/Frontend/CheckerRegistry.cpp index 62ac1ed252dd1e..f4d5db1e7a4b03 100644 --- a/clang/lib/StaticAnalyzer/Frontend/CheckerRegistry.cpp +++ b/clang/lib/StaticAnalyzer/Frontend/CheckerRegistry.cpp @@ -27,6 +27,10 @@ using namespace clang; using namespace ento; using llvm::sys::DynamicLibrary; +//===----------------------------------------------------------------------===// +// Utilities. +//===----------------------------------------------------------------------===// + using RegisterCheckersFn = void (*)(CheckerRegistry &); static bool isCompatibleAPIVersion(const char *VersionString) { @@ -86,6 +90,63 @@ static bool isInPackage(const CheckerRegistry::CheckerInfo &Checker, return false; } +//===----------------------------------------------------------------------===// +// Methods of CmdLineOption, PackageInfo and CheckerInfo. 
+//===----------------------------------------------------------------------===// + +LLVM_DUMP_METHOD void +CheckerRegistry::CmdLineOption::dumpToStream(llvm::raw_ostream &Out) const { + // The description can be just checked in Checkers.inc, the point here is to + // debug whether we succeeded in parsing it. + Out << OptionName << " (" << OptionType << ", " + << (IsHidden ? "hidden, " : "") << DevelopmentStatus << ") default: \"" + << DefaultValStr; +} + +static StringRef toString(CheckerRegistry::StateFromCmdLine Kind) { + switch (Kind) { + case CheckerRegistry::StateFromCmdLine::State_Disabled: + return "Disabled"; + case CheckerRegistry::StateFromCmdLine::State_Enabled: + return "Enabled"; + case CheckerRegistry::StateFromCmdLine::State_Unspecified: + return "Unspecified"; + } +} + +LLVM_DUMP_METHOD void +CheckerRegistry::CheckerInfo::dumpToStream(llvm::raw_ostream &Out) const { + // The description can be just checked in Checkers.inc, the point here is to + // debug whether we succeeded in parsing it. Same with documentation uri. + Out << FullName << " (" << toString(State) << (IsHidden ? ", hidden" : "") + << ")\n"; + Out << " Options:\n"; + for (const CmdLineOption &Option : CmdLineOptions) { + Out << " "; + Option.dumpToStream(Out); + Out << '\n'; + } + Out << " Dependencies:\n"; + for (const CheckerInfo *Dependency : Dependencies) { + Out << " " << Dependency->FullName << '\n'; + } +} + +LLVM_DUMP_METHOD void +CheckerRegistry::PackageInfo::dumpToStream(llvm::raw_ostream &Out) const { + Out << FullName << "\n"; + Out << " Options:\n"; + for (const CmdLineOption &Option : CmdLineOptions) { + Out << " "; + Option.dumpToStream(Out); + Out << '\n'; + } +} + +//===----------------------------------------------------------------------===// +// Methods of CheckerRegistry. +//===----------------------------------------------------------------------===// + CheckerRegistry::CheckerInfoListRange CheckerRegistry::getMutableCheckersForCmdLineArg(StringRef CmdLineArg) { auto It = binaryFind(Checkers, CmdLineArg);