diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 24f50b87c4cf2f..d9c6c28d5dac96 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -24463,6 +24463,23 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
       if (!LegalOperations || TLI.isOperationLegal(ISD::SPLAT_VECTOR, NVT))
         return DAG.getSplatVector(NVT, DL, V.getOperand(0));
 
+  // extract_subvector(insert_subvector(x,y,c1),c2)
+  // --> extract_subvector(y,c2-c1)
+  // iff we're just extracting from the inserted subvector.
+  if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
+    SDValue InsSub = V.getOperand(1);
+    EVT InsSubVT = InsSub.getValueType();
+    unsigned NumInsElts = InsSubVT.getVectorMinNumElements();
+    unsigned InsIdx = V.getConstantOperandVal(2);
+    unsigned NumSubElts = NVT.getVectorMinNumElements();
+    if (InsIdx <= ExtIdx && (ExtIdx + NumSubElts) <= (InsIdx + NumInsElts) &&
+        TLI.isExtractSubvectorCheap(NVT, InsSubVT, ExtIdx - InsIdx)) {
+      SDLoc DL(N);
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, InsSub,
+                         DAG.getVectorIdxConstant(ExtIdx - InsIdx, DL));
+    }
+  }
+
   // Try to move vector bitcast after extract_subv by scaling extraction index:
   // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index')
   if (V.getOpcode() == ISD::BITCAST &&
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
index 4242d8483e7233..39c7ce1413d1b3 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
@@ -314,8 +314,8 @@ define void @vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2(ptr %in.v
 ;
 ; AVX512F-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u]
 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
@@ -324,8 +324,8 @@ define void @vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2(ptr %in.v
 ;
 ; AVX512DQ-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
 ; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u]
 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
@@ -981,7 +981,7 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in.
 ; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7]
 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
 ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX512F-NEXT: vpermd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpermd %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
 ; AVX512F-NEXT: vzeroupper
@@ -992,7 +992,7 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in.
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7] ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper @@ -3507,13 +3507,12 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i ; ; AVX512F-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm2 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) @@ -3523,13 +3522,12 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i ; ; AVX512DQ-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) @@ -3768,10 +3766,10 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. ; ; AVX512F-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm2 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] ; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 @@ -3784,10 +3782,10 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. 
; ; AVX512DQ-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm2 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] ; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0 @@ -4147,9 +4145,9 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; ; AVX512F-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX512F-NEXT: vpbroadcastw %xmm0, %xmm0 @@ -4161,9 +4159,9 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; ; AVX512DQ-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpbroadcastw %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/dpbusd.ll b/llvm/test/CodeGen/X86/dpbusd.ll index fbea08eb1e5502..04d7a9691b645f 100644 --- a/llvm/test/CodeGen/X86/dpbusd.ll +++ b/llvm/test/CodeGen/X86/dpbusd.ll @@ -26,7 +26,7 @@ define i32 @no_dpbusd(ptr%a, ptr%b, i32 %c, i32 %n) { ; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] diff --git a/llvm/test/CodeGen/X86/dpbusd_i4.ll b/llvm/test/CodeGen/X86/dpbusd_i4.ll index 906fead7f8db53..a212f99680ef4d 100644 --- a/llvm/test/CodeGen/X86/dpbusd_i4.ll +++ b/llvm/test/CodeGen/X86/dpbusd_i4.ll @@ -86,7 +86,7 @@ define i32 @mul_sext_i4i4(<16 x i4> %a, <16 x i4> %b, i32 %c) { ; CHECK-NEXT: vpsraw $12, %ymm0, %ymm0 ; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll index 1436922f9dd114..6d5fc9ed0ab5b6 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll @@ -1828,22 +1828,22 @@ 
define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-LABEL: load_i16_stride3_vf32: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm5 ; AVX512-NEXT: vmovdqa 160(%rdi), %ymm6 -; AVX512-NEXT: vmovdqa %ymm1, %ymm0 -; AVX512-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm0 -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7],ymm0[8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14],ymm2[15] -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] -; AVX512-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm1 +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] +; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512-NEXT: vmovdqa 112(%rdi), %xmm1 ; AVX512-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7] +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX512-NEXT: vmovdqa (%rdi), %ymm8 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX512-NEXT: vmovdqa %ymm1, %ymm3 +; AVX512-NEXT: vmovdqa %ymm0, %ymm3 ; AVX512-NEXT: vpternlogq $202, %ymm9, %ymm8, %ymm3 ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15] @@ -1857,14 +1857,14 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7 -; AVX512-NEXT: vmovdqa %ymm1, %ymm10 +; AVX512-NEXT: vmovdqa %ymm0, %ymm10 ; AVX512-NEXT: vpternlogq $202, %ymm6, %ymm5, %ymm10 ; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] ; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] ; AVX512-NEXT: vpshufb %ymm11, %ymm10, %ymm10 -; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] -; AVX512-NEXT: vpshufb %xmm11, %xmm12, %xmm12 +; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] ; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] @@ -1885,21 +1885,19 @@ 
define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7],ymm5[8],ymm12[9,10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] ; AVX512-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX512-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm1 -; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7],ymm8[8],ymm1[9,10],ymm8[11],ymm1[12,13],ymm8[14],ymm1[15] -; AVX512-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm0 -; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm0 +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] +; AVX512-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX512-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -1908,22 +1906,22 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-FCP-LABEL: load_i16_stride3_vf32: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm5 ; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm6 -; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm0 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm0 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7],ymm0[8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14],ymm2[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] -; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512-FCP-NEXT: vpternlogq $202, %ymm5, %ymm6, 
%ymm1 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm1 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm8 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm3 +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm3 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm9, %ymm8, %ymm3 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15] @@ -1937,14 +1935,14 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7 -; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm10 +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm10 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm6, %ymm5, %ymm10 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm10 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm12, %xmm12 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] @@ -1965,21 +1963,19 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7],ymm5[8],ymm12[9,10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm1 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7],ymm8[8],ymm1[9,10],ymm8[11],ymm1[12,13],ymm8[14],ymm1[15] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, 
%xmm3 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512-FCP-NEXT: vextracti32x4 $2, %zmm0, %xmm0 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512-FCP-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm0 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -1988,22 +1984,22 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512DQ-LABEL: load_i16_stride3_vf32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm5 ; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm6 -; AVX512DQ-NEXT: vmovdqa %ymm1, %ymm0 -; AVX512DQ-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm0 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7],ymm0[8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14],ymm2[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] -; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512DQ-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm1 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm1 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7] 
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm8 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX512DQ-NEXT: vmovdqa %ymm1, %ymm3 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm3 ; AVX512DQ-NEXT: vpternlogq $202, %ymm9, %ymm8, %ymm3 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15] @@ -2017,14 +2013,14 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7 -; AVX512DQ-NEXT: vmovdqa %ymm1, %ymm10 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm10 ; AVX512DQ-NEXT: vpternlogq $202, %ymm6, %ymm5, %ymm10 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm10, %ymm10 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] -; AVX512DQ-NEXT: vpshufb %xmm11, %xmm12, %xmm12 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] @@ -2045,21 +2041,19 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7],ymm5[8],ymm12[9,10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX512DQ-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm1 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7],ymm8[8],ymm1[9,10],ymm8[11],ymm1[12,13],ymm8[14],ymm1[15] -; AVX512DQ-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX512DQ-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, %xmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] +; 
AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm0 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] +; AVX512DQ-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -2068,22 +2062,22 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512DQ-FCP-LABEL: load_i16_stride3_vf32: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm5 ; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm0 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7],ymm0[8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14],ymm2[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] -; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm1 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm8 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm3 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm9, %ymm8, %ymm3 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15] @@ -2097,14 +2091,14 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm10 +; 
AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm10 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm6, %ymm5, %ymm10 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm10 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm12, %xmm12 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] @@ -2125,21 +2119,19 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7],ymm5[8],ymm12[9,10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm1 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7],ymm8[8],ymm1[9,10],ymm8[11],ymm1[12,13],ymm8[14],ymm1[15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vextracti32x4 $2, %zmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm0 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; 
AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -3500,688 +3492,668 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-LABEL: load_i16_stride3_vf64: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512-NEXT: vmovdqa64 224(%rdi), %ymm20 -; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm21 +; AVX512-NEXT: vmovdqa64 224(%rdi), %ymm18 +; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm20 ; AVX512-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512-NEXT: vpternlogq $202, %ymm20, %ymm21, %ymm1 -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7],ymm1[8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13,14],ymm3[15] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX512-NEXT: vpshufb %ymm3, %ymm1, %ymm5 -; AVX512-NEXT: vmovdqa 272(%rdi), %xmm8 +; AVX512-NEXT: vpternlogq $202, %ymm18, %ymm20, %ymm1 +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512-NEXT: vpshufb %ymm7, %ymm2, %ymm5 +; AVX512-NEXT: vmovdqa 272(%rdi), %xmm1 ; AVX512-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm8[2],xmm2[3,4],xmm8[5],xmm2[6,7] -; AVX512-NEXT: vmovdqa %xmm2, %xmm14 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX512-NEXT: vpshufb %xmm9, %xmm6, %xmm6 +; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] +; AVX512-NEXT: vmovdqa %xmm2, %xmm3 +; AVX512-NEXT: vmovdqa64 %xmm1, %xmm19 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] +; AVX512-NEXT: vpshufb %xmm13, %xmm6, %xmm6 ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512-NEXT: vmovdqa64 320(%rdi), %ymm22 -; AVX512-NEXT: vmovdqa64 352(%rdi), %ymm23 -; AVX512-NEXT: vmovdqa %ymm0, %ymm6 -; AVX512-NEXT: vpternlogq $202, %ymm22, %ymm23, %ymm6 -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6],ymm7[7],ymm6[8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13,14],ymm7[15] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] -; AVX512-NEXT: vpshufb %ymm11, %ymm6, %ymm12 +; AVX512-NEXT: vmovdqa64 320(%rdi), %ymm21 +; AVX512-NEXT: vmovdqa64 352(%rdi), %ymm22 +; AVX512-NEXT: vmovdqa %ymm0, %ymm8 +; AVX512-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm8 +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = 
[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512-NEXT: vpshufb %ymm10, %ymm8, %ymm11 ; AVX512-NEXT: vmovdqa 304(%rdi), %xmm1 ; AVX512-NEXT: vmovdqa 288(%rdi), %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm13 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] +; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] ; AVX512-NEXT: vmovdqa %xmm2, %xmm4 -; AVX512-NEXT: vmovdqa %xmm1, %xmm6 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] -; AVX512-NEXT: vpshufb %xmm15, %xmm13, %xmm13 -; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm16 -; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm24 -; AVX512-NEXT: vmovdqa 160(%rdi), %ymm13 +; AVX512-NEXT: vmovdqa %xmm1, %xmm8 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] +; AVX512-NEXT: vpshufb %xmm14, %xmm12, %xmm12 +; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm16 +; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm23 +; AVX512-NEXT: vmovdqa 160(%rdi), %ymm11 ; AVX512-NEXT: vmovdqa %ymm0, %ymm5 -; AVX512-NEXT: vpternlogq $202, %ymm24, %ymm13, %ymm5 +; AVX512-NEXT: vpternlogq $202, %ymm23, %ymm11, %ymm5 ; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1] ; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15] -; AVX512-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX512-NEXT: vmovdqa 112(%rdi), %xmm11 -; AVX512-NEXT: vmovdqa 96(%rdi), %xmm12 -; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6],xmm11[7] -; AVX512-NEXT: vpshufb %xmm15, %xmm10, %xmm10 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm5[3,4,5,6,7] -; AVX512-NEXT: vmovdqa64 (%rdi), %ymm17 -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512-NEXT: vpshufb %ymm10, %ymm5, %ymm10 +; AVX512-NEXT: vmovdqa 112(%rdi), %xmm15 +; AVX512-NEXT: vmovdqa 96(%rdi), %xmm5 +; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7] +; AVX512-NEXT: vpshufb %xmm14, %xmm12, %xmm12 +; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7] +; AVX512-NEXT: vmovdqa64 (%rdi), %ymm24 +; AVX512-NEXT: vmovdqa 32(%rdi), %ymm12 ; AVX512-NEXT: vmovdqa %ymm0, %ymm10 -; AVX512-NEXT: vpternlogq $202, %ymm5, %ymm17, %ymm10 -; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3],ymm15[4],ymm10[5,6],ymm15[7],ymm10[8],ymm15[9],ymm10[10,11],ymm15[12],ymm10[13,14],ymm15[15] -; AVX512-NEXT: vpshufb %ymm3, %ymm10, %ymm2 +; AVX512-NEXT: vpternlogq $202, %ymm12, %ymm24, %ymm10 +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15] +; AVX512-NEXT: vpshufb %ymm7, %ymm1, %ymm7 ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX512-NEXT: vmovdqa 64(%rdi), %xmm15 -; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm10[2],xmm15[3,4],xmm10[5],xmm15[6,7] -; AVX512-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm18 -; AVX512-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512-NEXT: vpternlogq $202, %ymm23, %ymm22, %ymm1 -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] -; AVX512-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512-NEXT: vmovdqa64 %ymm2, %ymm28 -; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm6[2],xmm4[3,4],xmm6[5],xmm4[6,7] -; AVX512-NEXT: vmovdqa64 %xmm6, %xmm25 +; AVX512-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7] +; AVX512-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm17 +; AVX512-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm2 +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] +; AVX512-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7] +; AVX512-NEXT: vmovdqa64 %xmm8, %xmm25 ; AVX512-NEXT: vmovdqa64 %xmm4, %xmm26 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] -; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512-NEXT: vmovdqa %ymm9, %ymm1 -; AVX512-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm1 -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] +; AVX512-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512-NEXT: vmovdqa %ymm13, %ymm2 +; AVX512-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm2 +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX512-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX512-NEXT: vmovdqa %xmm14, %xmm7 -; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm14[2],xmm8[3,4],xmm14[5],xmm8[6,7] -; AVX512-NEXT: vmovdqa64 %xmm8, %xmm27 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] -; AVX512-NEXT: vpshufb %xmm2, %xmm14, %xmm14 -; AVX512-NEXT: 
vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm1[0,1,2],ymm14[3,4,5,6,7],ymm1[8,9,10],ymm14[11,12,13,14,15] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm19 -; AVX512-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512-NEXT: vpternlogq $202, %ymm13, %ymm24, %ymm1 -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] -; AVX512-NEXT: vmovdqa64 %ymm28, %ymm3 -; AVX512-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] -; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa %ymm9, %ymm3 -; AVX512-NEXT: vpternlogq $202, %ymm17, %ymm5, %ymm3 -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15] -; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7] -; AVX512-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,4] -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512-NEXT: vpternlogq $226, %ymm24, %ymm9, %ymm13 -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm13[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm13[1,2],ymm2[3],ymm13[4,5],ymm2[6],ymm13[7],ymm2[8],ymm13[9,10],ymm2[11],ymm13[12,13],ymm2[14],ymm13[15] -; AVX512-NEXT: vpternlogq $226, %ymm17, %ymm0, %ymm5 -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7],ymm3[8],ymm5[9,10],ymm3[11],ymm5[12,13],ymm3[14],ymm5[15] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] -; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0],xmm10[1],xmm15[2,3],xmm10[4],xmm15[5,6],xmm10[7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX512-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] ; AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1],xmm12[2],xmm11[3,4],xmm12[5],xmm11[6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX512-NEXT: vpshufb %xmm8, %xmm5, %xmm5 -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm5 -; AVX512-NEXT: vextracti32x4 $2, %zmm5, %xmm5 -; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm2[5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %xmm19, %xmm8 +; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7] +; AVX512-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; 
AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4] +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm19 +; AVX512-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512-NEXT: vpternlogq $202, %ymm11, %ymm23, %ymm2 +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15] +; AVX512-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7] +; AVX512-NEXT: vpshufb %xmm6, %xmm7, %xmm6 +; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vmovdqa %ymm13, %ymm6 +; AVX512-NEXT: vpternlogq $202, %ymm24, %ymm12, %ymm6 +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] +; AVX512-NEXT: vpshufb %ymm4, %ymm6, %ymm4 +; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7] +; AVX512-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512-NEXT: vpternlogq $202, %ymm22, %ymm23, %ymm9 -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm9[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1,2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7],ymm3[8],ymm9[9,10],ymm3[11],ymm9[12,13],ymm3[14],ymm9[15] -; AVX512-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm0 -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7],ymm5[8],ymm0[9,10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15] -; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX512-NEXT: vpternlogq $226, %ymm23, %ymm13, %ymm11 +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] +; AVX512-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-NEXT: vpternlogq $226, %ymm24, %ymm0, %ymm12 +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15] +; AVX512-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] +; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm13 +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15] +; AVX512-NEXT: vmovdqa64 %xmm25, %xmm4 +; AVX512-NEXT: vmovdqa64 %xmm26, %xmm7 +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7] +; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX512-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm0 +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] +; AVX512-NEXT: vpshufb %ymm11, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa64 %xmm27, %xmm4 -; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3],xmm4[4],xmm7[5,6],xmm4[7] +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7] ; AVX512-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] -; AVX512-NEXT: vmovdqa64 %xmm25, %xmm4 -; AVX512-NEXT: vmovdqa64 %xmm26, %xmm5 -; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] -; AVX512-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512-NEXT: vextracti32x4 $2, %zmm4, %xmm4 -; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm17, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm16, 64(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm19, 64(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm2, (%rdx) ; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm2, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm1, (%rcx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i16_stride3_vf64: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm20 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm21 +; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm18 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm20 ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm20, %ymm21, %ymm1 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7],ymm1[8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13,14],ymm3[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm5 -; AVX512-FCP-NEXT: vmovdqa 272(%rdi), %xmm8 +; AVX512-FCP-NEXT: 
vpternlogq $202, %ymm18, %ymm20, %ymm1 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm5 +; AVX512-FCP-NEXT: vmovdqa 272(%rdi), %xmm1 ; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm8[2],xmm2[3,4],xmm8[5],xmm2[6,7] -; AVX512-FCP-NEXT: vmovdqa %xmm2, %xmm14 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm6 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] +; AVX512-FCP-NEXT: vmovdqa %xmm2, %xmm3 +; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm19 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm6 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15] ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %ymm22 -; AVX512-FCP-NEXT: vmovdqa64 352(%rdi), %ymm23 -; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm6 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm22, %ymm23, %ymm6 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6],ymm7[7],ymm6[8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13,14],ymm7[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm12 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %ymm21 +; AVX512-FCP-NEXT: vmovdqa64 352(%rdi), %ymm22 +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm8 +; AVX512-FCP-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm8 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm11 ; AVX512-FCP-NEXT: vmovdqa 304(%rdi), %xmm1 ; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %xmm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] ; AVX512-FCP-NEXT: vmovdqa %xmm2, %xmm4 -; AVX512-FCP-NEXT: vmovdqa %xmm1, %xmm6 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm13 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm24 -; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm13 +; AVX512-FCP-NEXT: vmovdqa %xmm1, %xmm8 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = 
ymm12[0,1,2],ymm11[3,4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23 +; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm11 ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm5 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm24, %ymm13, %ymm5 +; AVX512-FCP-NEXT: vpternlogq $202, %ymm23, %ymm11, %ymm5 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15] -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm11 -; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm12 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6],xmm11[7] -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm10, %xmm10 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm5[3,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm17 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm10 +; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm15 +; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm5 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7] +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm24 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm12 ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm10 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm5, %ymm17, %ymm10 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3],ymm15[4],ymm10[5,6],ymm15[7],ymm10[8],ymm15[9],ymm10[10,11],ymm15[12],ymm10[13,14],ymm15[15] -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm2 +; AVX512-FCP-NEXT: vpternlogq $202, %ymm12, %ymm24, %ymm10 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15] +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm7 ; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm15 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm10[2],xmm15[3,4],xmm10[5],xmm15[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm18 -; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm23, %ymm22, %ymm1 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm28 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm6[2],xmm4[3,4],xmm6[5],xmm4[6,7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm6, %xmm25 +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm17 +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512-FCP-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm2 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm8, %xmm25 ; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm26 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm1 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm1 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm2 +; AVX512-FCP-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm2 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vmovdqa %xmm14, %xmm7 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm14[2],xmm8[3,4],xmm14[5],xmm8[6,7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm8, %xmm27 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm14, %xmm14 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm1[0,1,2],ymm14[3,4,5,6,7],ymm1[8,9,10],ymm14[11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm19 -; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm13, %ymm24, %ymm1 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm3 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm17, %ymm5, %ymm3 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15] -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,4] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vpternlogq $226, %ymm24, %ymm9, %ymm13 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm13[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm13[1,2],ymm2[3],ymm13[4,5],ymm2[6],ymm13[7],ymm2[8],ymm13[9,10],ymm2[11],ymm13[12,13],ymm2[14],ymm13[15] -; AVX512-FCP-NEXT: vpternlogq $226, %ymm17, %ymm0, %ymm5 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7],ymm3[8],ymm5[9,10],ymm3[11],ymm5[12,13],ymm3[14],ymm5[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0],xmm10[1],xmm15[2,3],xmm10[4],xmm15[5,6],xmm10[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1],xmm12[2],xmm11[3,4],xmm12[5],xmm11[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm5 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm5 -; AVX512-FCP-NEXT: vextracti32x4 $2, %zmm5, %xmm5 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm2[5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm8 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm19 +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512-FCP-NEXT: vpternlogq $202, %ymm11, %ymm23, %ymm2 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15] +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm6 +; AVX512-FCP-NEXT: vpternlogq $202, %ymm24, %ymm12, %ymm6 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm4 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm22, %ymm23, %ymm9 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm9[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1,2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7],ymm3[8],ymm9[9,10],ymm3[11],ymm9[12,13],ymm3[14],ymm9[15] -; AVX512-FCP-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm0 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7],ymm5[8],ymm0[9,10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15] -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpternlogq $226, %ymm23, %ymm13, %ymm11 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vpternlogq $226, %ymm24, %ymm0, %ymm12 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] +; 
AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512-FCP-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm13 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15] +; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm4 +; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm7 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm0 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm4 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3],xmm4[4],xmm7[5,6],xmm4[7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7] ; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm4 -; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm5 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vextracti32x4 $2, %zmm4, %xmm4 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 64(%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 64(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rcx) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i16_stride3_vf64: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512DQ-NEXT: vmovdqa64 224(%rdi), %ymm20 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm21 +; AVX512DQ-NEXT: vmovdqa64 224(%rdi), %ymm18 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm20 ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512DQ-NEXT: vpternlogq $202, %ymm20, %ymm21, %ymm1 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7],ymm1[8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13,14],ymm3[15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm5 -; AVX512DQ-NEXT: vmovdqa 272(%rdi), %xmm8 +; 
AVX512DQ-NEXT: vpternlogq $202, %ymm18, %ymm20, %ymm1 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512DQ-NEXT: vpshufb %ymm7, %ymm2, %ymm5 +; AVX512DQ-NEXT: vmovdqa 272(%rdi), %xmm1 ; AVX512DQ-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm8[2],xmm2[3,4],xmm8[5],xmm2[6,7] -; AVX512DQ-NEXT: vmovdqa %xmm2, %xmm14 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX512DQ-NEXT: vpshufb %xmm9, %xmm6, %xmm6 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] +; AVX512DQ-NEXT: vmovdqa %xmm2, %xmm3 +; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm19 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] +; AVX512DQ-NEXT: vpshufb %xmm13, %xmm6, %xmm6 ; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %ymm22 -; AVX512DQ-NEXT: vmovdqa64 352(%rdi), %ymm23 -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm6 -; AVX512DQ-NEXT: vpternlogq $202, %ymm22, %ymm23, %ymm6 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6],ymm7[7],ymm6[8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13,14],ymm7[15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] -; AVX512DQ-NEXT: vpshufb %ymm11, %ymm6, %ymm12 +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %ymm21 +; AVX512DQ-NEXT: vmovdqa64 352(%rdi), %ymm22 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm8 +; AVX512DQ-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm8 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512DQ-NEXT: vpshufb %ymm10, %ymm8, %ymm11 ; AVX512DQ-NEXT: vmovdqa 304(%rdi), %xmm1 ; AVX512DQ-NEXT: vmovdqa 288(%rdi), %xmm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm13 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] ; AVX512DQ-NEXT: vmovdqa %xmm2, %xmm4 -; AVX512DQ-NEXT: vmovdqa %xmm1, %xmm6 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] -; AVX512DQ-NEXT: vpshufb %xmm15, %xmm13, %xmm13 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm16 -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm24 -; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm13 +; AVX512DQ-NEXT: vmovdqa %xmm1, %xmm8 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] +; AVX512DQ-NEXT: vpshufb %xmm14, %xmm12, %xmm12 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm16 +; AVX512DQ-NEXT: 
vmovdqa64 128(%rdi), %ymm23 +; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm11 ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm5 -; AVX512DQ-NEXT: vpternlogq $202, %ymm24, %ymm13, %ymm5 +; AVX512DQ-NEXT: vpternlogq $202, %ymm23, %ymm11, %ymm5 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15] -; AVX512DQ-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm11 -; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm12 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6],xmm11[7] -; AVX512DQ-NEXT: vpshufb %xmm15, %xmm10, %xmm10 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm5[3,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm17 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512DQ-NEXT: vpshufb %ymm10, %ymm5, %ymm10 +; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm15 +; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm5 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7] +; AVX512DQ-NEXT: vpshufb %xmm14, %xmm12, %xmm12 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm24 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm12 ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm10 -; AVX512DQ-NEXT: vpternlogq $202, %ymm5, %ymm17, %ymm10 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3],ymm15[4],ymm10[5,6],ymm15[7],ymm10[8],ymm15[9],ymm10[10,11],ymm15[12],ymm10[13,14],ymm15[15] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm10, %ymm2 +; AVX512DQ-NEXT: vpternlogq $202, %ymm12, %ymm24, %ymm10 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15] +; AVX512DQ-NEXT: vpshufb %ymm7, %ymm1, %ymm7 ; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm15 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm10[2],xmm15[3,4],xmm10[5],xmm15[6,7] -; AVX512DQ-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm18 -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512DQ-NEXT: vpternlogq $202, %ymm23, %ymm22, %ymm1 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] -; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm28 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm6[2],xmm4[3,4],xmm6[5],xmm4[6,7] -; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm25 +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7] +; AVX512DQ-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15] 
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm17 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512DQ-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm2 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] +; AVX512DQ-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7] +; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm25 ; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm26 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] -; AVX512DQ-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm1 -; AVX512DQ-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm1 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm2 +; AVX512DQ-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm2 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX512DQ-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa %xmm14, %xmm7 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm14[2],xmm8[3,4],xmm14[5],xmm8[6,7] -; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm27 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] -; AVX512DQ-NEXT: vpshufb %xmm2, %xmm14, %xmm14 -; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm1[0,1,2],ymm14[3,4,5,6,7],ymm1[8,9,10],ymm14[11,12,13,14,15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm19 -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512DQ-NEXT: vpternlogq $202, %ymm13, %ymm24, %ymm1 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] -; AVX512DQ-NEXT: vmovdqa64 %ymm28, %ymm3 -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] -; AVX512DQ-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7] -; AVX512DQ-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm3 -; AVX512DQ-NEXT: vpternlogq $202, %ymm17, %ymm5, %ymm3 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15] -; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7] -; AVX512DQ-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,4] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512DQ-NEXT: vpternlogq $226, %ymm24, %ymm9, %ymm13 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm13[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm13[1,2],ymm2[3],ymm13[4,5],ymm2[6],ymm13[7],ymm2[8],ymm13[9,10],ymm2[11],ymm13[12,13],ymm2[14],ymm13[15] -; AVX512DQ-NEXT: vpternlogq $226, %ymm17, %ymm0, %ymm5 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7],ymm3[8],ymm5[9,10],ymm3[11],ymm5[12,13],ymm3[14],ymm5[15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] -; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0],xmm10[1],xmm15[2,3],xmm10[4],xmm15[5,6],xmm10[7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX512DQ-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1],xmm12[2],xmm11[3,4],xmm12[5],xmm11[6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX512DQ-NEXT: vpshufb %xmm8, %xmm5, %xmm5 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm5 -; AVX512DQ-NEXT: vextracti32x4 $2, %zmm5, %xmm5 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm2[5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm8 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7] +; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX512DQ-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm19 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512DQ-NEXT: vpternlogq $202, %ymm11, %ymm23, %ymm2 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15] +; AVX512DQ-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7] +; AVX512DQ-NEXT: vpshufb 
%xmm6, %xmm7, %xmm6 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm6 +; AVX512DQ-NEXT: vpternlogq $202, %ymm24, %ymm12, %ymm6 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm6, %ymm4 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7] +; AVX512DQ-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512DQ-NEXT: vpternlogq $202, %ymm22, %ymm23, %ymm9 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm9[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1,2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7],ymm3[8],ymm9[9,10],ymm3[11],ymm9[12,13],ymm3[14],ymm9[15] -; AVX512DQ-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm0 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7],ymm5[8],ymm0[9,10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15] -; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpternlogq $226, %ymm23, %ymm13, %ymm11 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] +; AVX512DQ-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512DQ-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-NEXT: vpternlogq $226, %ymm24, %ymm0, %ymm12 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15] +; AVX512DQ-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm13 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15] +; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm4 +; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm7 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = 
xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7] +; AVX512DQ-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX512DQ-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm0 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] +; AVX512DQ-NEXT: vpshufb %ymm11, %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm4 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3],xmm4[4],xmm7[5,6],xmm4[7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7] ; AVX512DQ-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm4 -; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm5 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] -; AVX512DQ-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512DQ-NEXT: vextracti32x4 $2, %zmm4, %xmm4 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm16, 64(%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm19, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i16_stride3_vf64: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm20 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm20, %ymm21, %ymm1 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7],ymm1[8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13,14],ymm3[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa 272(%rdi), %xmm8 +; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm18, %ymm20, %ymm1 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa 272(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm8[2],xmm2[3,4],xmm8[5],xmm2[6,7] -; 
AVX512DQ-FCP-NEXT: vmovdqa %xmm2, %xmm14 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm6 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm19 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm6 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %ymm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 352(%rdi), %ymm23 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm22, %ymm23, %ymm6 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6],ymm7[7],ymm6[8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13,14],ymm7[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %ymm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 352(%rdi), %ymm22 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm8 +; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm8 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm11 ; AVX512DQ-FCP-NEXT: vmovdqa 304(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm13 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm24 -; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm13 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, %xmm8 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23 +; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm11 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm5 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm24, %ymm13, %ymm5 +; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm23, %ymm11, %ymm5 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = 
ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm11 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm12 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6],xmm11[7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm10, %xmm10 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm5[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm17 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm15 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm5 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm24 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm12 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm10 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm5, %ymm17, %ymm10 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3],ymm15[4],ymm10[5,6],ymm15[7],ymm10[8],ymm15[9],ymm10[10,11],ymm15[12],ymm10[13,14],ymm15[15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm12, %ymm24, %ymm10 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm7 ; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm15 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm10[2],xmm15[3,4],xmm10[5],xmm15[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm23, %ymm22, %ymm1 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm28 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm6[2],xmm4[3,4],xmm6[5],xmm4[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm6, %xmm25 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm2 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm8, %xmm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm26 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm1 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm2 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm14, %xmm7 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm14[2],xmm8[3,4],xmm14[5],xmm8[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm8, %xmm27 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm14, %xmm14 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm1[0,1,2],ymm14[3,4,5,6,7],ymm1[8,9,10],ymm14[11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm13, %ymm24, %ymm1 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: 
vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm17, %ymm5, %ymm3 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,4] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm24, %ymm9, %ymm13 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm13[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm13[1,2],ymm2[3],ymm13[4,5],ymm2[6],ymm13[7],ymm2[8],ymm13[9,10],ymm2[11],ymm13[12,13],ymm2[14],ymm13[15] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm17, %ymm0, %ymm5 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7],ymm3[8],ymm5[9,10],ymm3[11],ymm5[12,13],ymm3[14],ymm5[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0],xmm10[1],xmm15[2,3],xmm10[4],xmm15[5,6],xmm10[7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1],xmm12[2],xmm11[3,4],xmm12[5],xmm11[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm5 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vextracti32x4 $2, %zmm5, %xmm5 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm2[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm8 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm11, %ymm23, %ymm2 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw 
{{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm6 +; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm24, %ymm12, %ymm6 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm4 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm22, %ymm23, %ymm9 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm9[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1,2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7],ymm3[8],ymm9[9,10],ymm3[11],ymm9[12,13],ymm3[14],ymm9[15] -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm0 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7],ymm5[8],ymm0[9,10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm23, %ymm13, %ymm11 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm24, %ymm0, %ymm12 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] +; AVX512DQ-FCP-NEXT: 
vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm13 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm7 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm0 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm4 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3],xmm4[4],xmm7[5,6],xmm4[7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7] ; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm5 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vextracti32x4 $2, %zmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 64(%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 64(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rcx) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll index 8b6ba51506ab79..8091afbbfd70c3 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll @@ -1246,29 +1246,28 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0] ; AVX512BW-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,3,5,7,1,3,5,7] -; AVX512BW-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6,10,14],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = 
[1,3,5,0,5,1,3,0] -; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,4,8],zero,zero,zero,zero,ymm3[1,5,9],zero,zero,zero,zero,ymm3[2,6,18],zero,zero,zero,zero,ymm3[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8],zero,zero,zero,zero,zero,ymm3[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[19,27],zero,zero,zero,zero,zero,ymm3[20,28],zero,zero ; AVX512BW-FCP-NEXT: vpor %ymm0, %ymm3, %ymm0 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28] -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero -; AVX512BW-FCP-NEXT: vpor %ymm3, %ymm2, %ymm2 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6] ; AVX512BW-FCP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 ; AVX512BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k1} = ymm1[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rax) -; AVX512BW-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) -; AVX512BW-FCP-NEXT: vextracti32x4 $3, %zmm0, %xmm0 -; AVX512BW-FCP-NEXT: vmovq %xmm0, 48(%rax) +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm3[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u] +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7] +; AVX512BW-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm2[1,5,9,13],zero,zero,zero,ymm2[2,6,10,14],zero,zero,zero,ymm2[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0] +; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[2,6,18],zero,zero,zero,zero,ymm1[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-FCP-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512BW-FCP-NEXT: vmovq %xmm2, 48(%rax) +; AVX512BW-FCP-NEXT: vmovdqa %xmm1, 32(%rax) +; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -1326,29 +1325,28 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,3,5,7,1,3,5,7] -; AVX512DQ-BW-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm0 -; 
AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6,10,14],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,4,8],zero,zero,zero,zero,ymm3[1,5,9],zero,zero,zero,zero,ymm3[2,6,18],zero,zero,zero,zero,ymm3[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8],zero,zero,zero,zero,zero,ymm3[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[19,27],zero,zero,zero,zero,zero,ymm3[20,28],zero,zero ; AVX512DQ-BW-FCP-NEXT: vpor %ymm0, %ymm3, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28] -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %ymm3, %ymm2, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6] ; AVX512DQ-BW-FCP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 ; AVX512DQ-BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k1} = ymm1[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rax) -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $3, %zmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, 48(%rax) +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm3[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7] +; AVX512DQ-BW-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm2[1,5,9,13],zero,zero,zero,ymm2[2,6,10,14],zero,zero,zero,ymm2[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[2,6,18],zero,zero,zero,zero,ymm1[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, 48(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, 32(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <8 x i8>, ptr %in.vecptr0, align 64 @@ -2053,77 +2051,76 @@ define void 
@store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512-NEXT: vmovdqa (%rdx), %xmm6 -; AVX512-NEXT: vmovdqa (%rcx), %xmm7 -; AVX512-NEXT: vmovdqa (%r8), %xmm0 -; AVX512-NEXT: vmovdqa (%r10), %xmm1 -; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm3 -; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm2 -; AVX512-NEXT: vinserti128 $1, (%r9), %ymm0, %ymm0 -; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[12,13,u,u,u],zero,zero,xmm6[14,15,u,u,u] -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,12,13],zero,zero,xmm4[u,u,u,14,15],zero,zero,xmm4[u,u,u] -; AVX512-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10],zero,xmm5[u,u,u,u,13,12],zero,xmm5[u,u,u,u,15,14],zero -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm1[13,u,u,u,u],zero,zero,xmm1[14,u,u,u,u],zero,zero,xmm1[15] -; AVX512-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm5 -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm2[3,1,1,3] -; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[1],zero,zero,ymm4[u,u,u,10,2],zero,zero,ymm4[u,u,u,11,3],zero,zero,ymm4[u,u,u,20,28],zero,zero,ymm4[u,u,u,21,29],zero,zero,ymm4[u] -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm3[1,3,3,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm6[1,9,u,u,u],zero,zero,ymm6[2,10,u,u,u],zero,zero,ymm6[3,19,u,u,u],zero,zero,ymm6[28,20,u,u,u],zero,zero,ymm6[29,21,u] -; AVX512-NEXT: vpor %ymm4, %ymm6, %ymm4 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5,5,6] -; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,3,1,3] +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512-NEXT: vmovdqa (%r8), %xmm3 +; AVX512-NEXT: vmovdqa (%r9), %xmm4 +; AVX512-NEXT: vmovdqa (%r10), %xmm2 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 +; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm9 +; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm7 +; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm9[u,u,u,u,u,5],zero,ymm9[u,u,u,u,u,6],zero,ymm9[u,u,u,u,u],zero,ymm9[23,u,u,u,u,u],zero,ymm9[24,u,u,u,u] +; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1] +; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u],zero,ymm11[5,u,u,u,u,u],zero,ymm11[6,u,u,u,u,u,23],zero,ymm11[u,u,u,u,u,24],zero,ymm11[u,u,u,u] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; AVX512-NEXT: vpternlogq $50, %ymm10, %ymm12, %ymm11 +; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2] +; 
AVX512-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,ymm10[0,8,u,u,u],zero,zero,ymm10[1,9,u,u,u],zero,zero,ymm10[18,26,u,u,u],zero,zero,ymm10[19,27,u,u,u],zero,zero,ymm10[20,28] +; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 +; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm8[u,u,u,5],zero,ymm8[u,u,u,u,u,6],zero,ymm8[u,u,u,u,u],zero,ymm8[23,u,u,u,u,u],zero,ymm8[24,u,u,u,u,u],zero +; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1] +; AVX512-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u],zero,ymm13[5,u,u,u,u,u],zero,ymm13[6,u,u,u,u,u,23],zero,ymm13[u,u,u,u,u,24],zero,ymm13[u,u,u,u,u,25] +; AVX512-NEXT: vpternlogq $200, %ymm11, %ymm12, %ymm13 +; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm8[0,2,0,2] +; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,8],zero,zero,ymm11[u,u,u,1,9],zero,zero,ymm11[u,u,u,2,10],zero,zero,ymm11[u,u,u,19,27],zero,zero,ymm11[u,u,u,20,28],zero,zero +; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm11, %zmm11 +; AVX512-NEXT: vporq %zmm10, %zmm11, %zmm10 +; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm7[4],zero,ymm7[u,u,u,u,u,5],zero,ymm7[u,u,u,u,u,6],zero,ymm7[u,u,u,u,u],zero,ymm7[23,u,u,u,u,u],zero,ymm7[24,u,u] +; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm7[2,3,0,1] +; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = zero,ymm12[4,u,u,u,u,u],zero,ymm12[5,u,u,u,u,u],zero,ymm12[6,u,u,u,u,u,23],zero,ymm12[u,u,u,u,u,24],zero,ymm12[u,u] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] +; AVX512-NEXT: vpternlogq $200, %ymm11, %ymm13, %ymm12 +; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm7[0,2,0,2] +; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,0,8],zero,ymm11[u,u,u,u,1,9],zero,ymm11[u,u,u,u,18,26],zero,ymm11[u,u,u,u,19,27],zero,ymm11[u,u,u,u] +; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 +; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0] +; AVX512-NEXT: vpandn %ymm12, %ymm13, %ymm12 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[1,1,0,0,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,0] +; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0] +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm13 +; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 +; AVX512-NEXT: vporq %zmm12, %zmm11, %zmm11 +; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm11 +; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,1,1,3] +; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,ymm8[u,u,u,10,2],zero,zero,ymm8[u,u,u,11,3],zero,zero,ymm8[u,u,u,20,28],zero,zero,ymm8[u,u,u,21,29],zero,zero,ymm8[u] +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1] +; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[1,9,u,u,u],zero,zero,ymm9[2,10,u,u,u],zero,zero,ymm9[3,19,u,u,u],zero,zero,ymm9[28,20,u,u,u],zero,zero,ymm9[29,21,u] +; AVX512-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,5,5,6] +; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3] ; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,9],zero,ymm7[u,u,u,u,2,10],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u,20,28],zero,ymm7[u,u,u,u,21] -; AVX512-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm7 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm7 -; AVX512-NEXT: vinserti32x4 $2, %xmm5, %zmm7, %zmm4 -; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = 
ymm3[u,u,u,u,u,5],zero,ymm3[u,u,u,u,u,6],zero,ymm3[u,u,u,u,u],zero,ymm3[23,u,u,u,u,u],zero,ymm3[24,u,u,u,u] -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u,u,u] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; AVX512-NEXT: vpternlogq $50, %ymm6, %ymm8, %ymm7 -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,0,2] -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u,u],zero,zero,ymm3[1,9,u,u,u],zero,zero,ymm3[18,26,u,u,u],zero,zero,ymm3[19,27,u,u,u],zero,zero,ymm3[20,28] -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm3 -; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[u,u,u,5],zero,ymm2[u,u,u,u,u,6],zero,ymm2[u,u,u,u,u],zero,ymm2[23,u,u,u,u,u],zero,ymm2[24,u,u,u,u,u],zero -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u,u,u,u,25] -; AVX512-NEXT: vpternlogq $200, %ymm6, %ymm8, %ymm7 -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] -; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,8],zero,zero,ymm2[u,u,u,1,9],zero,zero,ymm2[u,u,u,2,10],zero,zero,ymm2[u,u,u,19,27],zero,zero,ymm2[u,u,u,20,28],zero,zero -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 -; AVX512-NEXT: vporq %zmm3, %zmm2, %zmm2 -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,0] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] -; AVX512-NEXT: vpandn %ymm3, %ymm6, %ymm3 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,0,0,4,5,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,0] -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[4],zero,ymm0[u,u,u,u,u,5],zero,ymm0[u,u,u,u,u,6],zero,ymm0[u,u,u,u,u],zero,ymm0[23,u,u,u,u,u],zero,ymm0[24,u,u] -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm0[2,3,0,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[4,u,u,u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u] -; AVX512-NEXT: vpternlogq $200, %ymm3, %ymm6, %ymm7 -; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,8],zero,ymm0[u,u,u,u,1,9],zero,ymm0[u,u,u,u,18,26],zero,ymm0[u,u,u,u,19,27],zero,ymm0[u,u,u,u] -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 -; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 -; AVX512-NEXT: vmovdqa %xmm5, 96(%rax) -; AVX512-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512-NEXT: vmovdqa %ymm4, 64(%rax) +; AVX512-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm7 +; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm7 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[12,13,u,u,u],zero,zero,xmm5[14,15,u,u,u] +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13],zero,zero,xmm0[u,u,u,14,15],zero,zero,xmm0[u,u,u] +; AVX512-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[13,u,u,u,u],zero,zero,xmm2[14,u,u,u,u],zero,zero,xmm2[15] +; AVX512-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm7, %zmm0 +; AVX512-NEXT: vmovdqa %xmm1, 96(%rax) +; AVX512-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm11, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -2131,70 +2128,69 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm5 -; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm1 -; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm0 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm6 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm7 -; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm1 -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[12,13,u,u,u],zero,zero,xmm4[14,15,u,u,u] -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,12,13],zero,zero,xmm2[u,u,u,14,15],zero,zero,xmm2[u,u,u] -; AVX512-FCP-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10],zero,xmm3[u,u,u,u,13,12],zero,xmm3[u,u,u,u,15,14],zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15] -; AVX512-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm7[3,1,1,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1],zero,zero,ymm2[u,u,u,10,2],zero,zero,ymm2[u,u,u,11,3],zero,zero,ymm2[u,u,u,20,28],zero,zero,ymm2[u,u,u,21,29],zero,zero,ymm2[u] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm6[1,3,3,1] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm4[1,9,u,u,u],zero,zero,ymm4[2,10,u,u,u],zero,zero,ymm4[3,19,u,u,u],zero,zero,ymm4[28,20,u,u,u],zero,zero,ymm4[29,21,u] -; AVX512-FCP-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,5,5,6] -; AVX512-FCP-NEXT: 
vbroadcasti128 {{.*#+}} ymm5 = [2,2,3,3,2,2,3,3] -; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm4 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,1,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9],zero,ymm5[u,u,u,u,2,10],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u,20,28],zero,ymm5[u,u,u,u,21] -; AVX512-FCP-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm5 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm5 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm5, %zmm2 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm6[0,2,0,2] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8,u,u,u],zero,zero,ymm4[1,9,u,u,u],zero,zero,ymm4[18,26,u,u,u],zero,zero,ymm4[19,27,u,u,u],zero,zero,ymm4[20,28] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,5,2,6,1,5,2,6] -; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermd %ymm6, %ymm5, %ymm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u],zero,zero,ymm6[1,5,u,u,u],zero,zero,ymm6[2,6,u,u,u],zero,zero,ymm6[19,23,u,u,u],zero,zero,ymm6[24,28,u,u,u],zero -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm7[0,2,0,2] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,8],zero,zero,ymm6[u,u,u,1,9],zero,zero,ymm6[u,u,u,2,10],zero,zero,ymm6[u,u,u,19,27],zero,zero,ymm6[u,u,u,20,28],zero,zero -; AVX512-FCP-NEXT: vpermd %ymm7, %ymm5, %ymm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,5],zero,zero,ymm7[u,u,u,2,6],zero,zero,ymm7[u,u,u,19,23],zero,zero,ymm7[u,u,u,24,28],zero,zero,ymm7[u,u,u,25] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512-FCP-NEXT: vporq %zmm4, %zmm6, %zmm4 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[1,1,0,0,4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,0,1,0,0,0,0] -; AVX512-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,2,0,2] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,0,8],zero,ymm6[u,u,u,u,1,9],zero,ymm6[u,u,u,u,18,26],zero,ymm6[u,u,u,u,19,27],zero,ymm6[u,u,u,u] -; AVX512-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,4],zero,ymm1[u,u,u,u,1,5],zero,ymm1[u,u,u,u,2,6],zero,ymm1[u,u,u,u,19,23],zero,ymm1[u,u,u,u,24,28],zero,ymm1[u] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm1 -; AVX512-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 -; AVX512-FCP-NEXT: vmovdqa %xmm3, 96(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512-FCP-NEXT: vmovdqa %ymm2, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm3 +; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm4 +; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm2 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm8 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm9 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,0,2] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = 
zero,zero,ymm10[0,8,u,u,u],zero,zero,ymm10[1,9,u,u,u],zero,zero,ymm10[18,26,u,u,u],zero,zero,ymm10[19,27,u,u,u],zero,zero,ymm10[20,28] +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [1,5,2,6,1,5,2,6] +; AVX512-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpermd %ymm8, %ymm11, %ymm12 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u],zero,zero,ymm12[1,5,u,u,u],zero,zero,ymm12[2,6,u,u,u],zero,zero,ymm12[19,23,u,u,u],zero,zero,ymm12[24,28,u,u,u],zero +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm7[0,2,0,2] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,8],zero,zero,ymm12[u,u,u,1,9],zero,zero,ymm12[u,u,u,2,10],zero,zero,ymm12[u,u,u,19,27],zero,zero,ymm12[u,u,u,20,28],zero,zero +; AVX512-FCP-NEXT: vpermd %ymm7, %ymm11, %ymm13 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,1,5],zero,zero,ymm13[u,u,u,2,6],zero,zero,ymm13[u,u,u,19,23],zero,zero,ymm13[u,u,u,24,28],zero,zero,ymm13[u,u,u,25] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 +; AVX512-FCP-NEXT: vporq %zmm10, %zmm12, %zmm10 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[1,1,0,0,4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,1,0,1,0,0,0,0] +; AVX512-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm12 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm9[0,2,0,2] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,0,8],zero,ymm13[u,u,u,u,1,9],zero,ymm13[u,u,u,u,18,26],zero,ymm13[u,u,u,u,19,27],zero,ymm13[u,u,u,u] +; AVX512-FCP-NEXT: vpermd %ymm9, %ymm11, %ymm11 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,4],zero,ymm11[u,u,u,u,1,5],zero,ymm11[u,u,u,u,2,6],zero,ymm11[u,u,u,u,19,23],zero,ymm11[u,u,u,u,24,28],zero,ymm11[u] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11 +; AVX512-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm11 +; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm11 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[3,1,1,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,ymm7[u,u,u,10,2],zero,zero,ymm7[u,u,u,11,3],zero,zero,ymm7[u,u,u,20,28],zero,zero,ymm7[u,u,u,21,29],zero,zero,ymm7[u] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,3,3,1] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u] +; AVX512-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,4,5,5,6] +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,2,3,3,2,2,3,3] +; AVX512-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpermd %ymm8, %ymm10, %ymm8 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,1,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,1,9],zero,ymm9[u,u,u,u,2,10],zero,ymm9[u,u,u,u,19,27],zero,ymm9[u,u,u,u,20,28],zero,ymm9[u,u,u,u,21] +; AVX512-FCP-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm9 +; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm9 +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = 
xmm5[u,u],zero,zero,xmm5[12,13,u,u,u],zero,zero,xmm5[14,15,u,u,u] +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13],zero,zero,xmm0[u,u,u,14,15],zero,zero,xmm0[u,u,u] +; AVX512-FCP-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[13,u,u,u,u],zero,zero,xmm2[14,u,u,u,u],zero,zero,xmm2[15] +; AVX512-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm9, %zmm0 +; AVX512-FCP-NEXT: vmovdqa %xmm1, 96(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512-FCP-NEXT: vmovdqa %ymm0, 64(%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -2202,77 +2198,76 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm6 -; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm7 -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm0 -; AVX512DQ-NEXT: vmovdqa (%r10), %xmm1 -; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm3 -; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm2 -; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm0, %ymm0 -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[12,13,u,u,u],zero,zero,xmm6[14,15,u,u,u] -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,12,13],zero,zero,xmm4[u,u,u,14,15],zero,zero,xmm4[u,u,u] -; AVX512DQ-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10],zero,xmm5[u,u,u,u,13,12],zero,xmm5[u,u,u,u,15,14],zero -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm1[13,u,u,u,u],zero,zero,xmm1[14,u,u,u,u],zero,zero,xmm1[15] -; AVX512DQ-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm5 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm2[3,1,1,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[1],zero,zero,ymm4[u,u,u,10,2],zero,zero,ymm4[u,u,u,11,3],zero,zero,ymm4[u,u,u,20,28],zero,zero,ymm4[u,u,u,21,29],zero,zero,ymm4[u] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm3[1,3,3,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm6[1,9,u,u,u],zero,zero,ymm6[2,10,u,u,u],zero,zero,ymm6[3,19,u,u,u],zero,zero,ymm6[28,20,u,u,u],zero,zero,ymm6[29,21,u] -; AVX512DQ-NEXT: vpor %ymm4, %ymm6, 
%ymm4 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5,5,6] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,3,1,3] +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm3 +; AVX512DQ-NEXT: vmovdqa (%r9), %xmm4 +; AVX512DQ-NEXT: vmovdqa (%r10), %xmm2 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 +; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm9 +; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm7 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm9[u,u,u,u,u,5],zero,ymm9[u,u,u,u,u,6],zero,ymm9[u,u,u,u,u],zero,ymm9[23,u,u,u,u,u],zero,ymm9[24,u,u,u,u] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u],zero,ymm11[5,u,u,u,u,u],zero,ymm11[6,u,u,u,u,u,23],zero,ymm11[u,u,u,u,u,24],zero,ymm11[u,u,u,u] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; AVX512DQ-NEXT: vpternlogq $50, %ymm10, %ymm12, %ymm11 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,ymm10[0,8,u,u,u],zero,zero,ymm10[1,9,u,u,u],zero,zero,ymm10[18,26,u,u,u],zero,zero,ymm10[19,27,u,u,u],zero,zero,ymm10[20,28] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm8[u,u,u,5],zero,ymm8[u,u,u,u,u,6],zero,ymm8[u,u,u,u,u],zero,ymm8[23,u,u,u,u,u],zero,ymm8[24,u,u,u,u,u],zero +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u],zero,ymm13[5,u,u,u,u,u],zero,ymm13[6,u,u,u,u,u,23],zero,ymm13[u,u,u,u,u,24],zero,ymm13[u,u,u,u,u,25] +; AVX512DQ-NEXT: vpternlogq $200, %ymm11, %ymm12, %ymm13 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm8[0,2,0,2] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,8],zero,zero,ymm11[u,u,u,1,9],zero,zero,ymm11[u,u,u,2,10],zero,zero,ymm11[u,u,u,19,27],zero,zero,ymm11[u,u,u,20,28],zero,zero +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm11, %zmm11 +; AVX512DQ-NEXT: vporq %zmm10, %zmm11, %zmm10 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm7[4],zero,ymm7[u,u,u,u,u,5],zero,ymm7[u,u,u,u,u,6],zero,ymm7[u,u,u,u,u],zero,ymm7[23,u,u,u,u,u],zero,ymm7[24,u,u] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm7[2,3,0,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = zero,ymm12[4,u,u,u,u,u],zero,ymm12[5,u,u,u,u,u],zero,ymm12[6,u,u,u,u,u,23],zero,ymm12[u,u,u,u,u,24],zero,ymm12[u,u] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] +; AVX512DQ-NEXT: vpternlogq $200, %ymm11, %ymm13, %ymm12 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm7[0,2,0,2] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,0,8],zero,ymm11[u,u,u,u,1,9],zero,ymm11[u,u,u,u,18,26],zero,ymm11[u,u,u,u,19,27],zero,ymm11[u,u,u,u] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0] +; AVX512DQ-NEXT: vpandn %ymm12, %ymm13, %ymm12 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[1,1,0,0,4,5,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,0] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0] +; 
AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm13 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 +; AVX512DQ-NEXT: vporq %zmm12, %zmm11, %zmm11 +; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm11 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,1,1,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,ymm8[u,u,u,10,2],zero,zero,ymm8[u,u,u,11,3],zero,zero,ymm8[u,u,u,20,28],zero,zero,ymm8[u,u,u,21,29],zero,zero,ymm8[u] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[1,9,u,u,u],zero,zero,ymm9[2,10,u,u,u],zero,zero,ymm9[3,19,u,u,u],zero,zero,ymm9[28,20,u,u,u],zero,zero,ymm9[29,21,u] +; AVX512DQ-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,5,5,6] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,9],zero,ymm7[u,u,u,u,2,10],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u,20,28],zero,ymm7[u,u,u,u,21] -; AVX512DQ-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm7 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm7 -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm5, %zmm7, %zmm4 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm3[u,u,u,u,u,5],zero,ymm3[u,u,u,u,u,6],zero,ymm3[u,u,u,u,u],zero,ymm3[23,u,u,u,u,u],zero,ymm3[24,u,u,u,u] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u,u,u] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; AVX512DQ-NEXT: vpternlogq $50, %ymm6, %ymm8, %ymm7 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,0,2] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u,u],zero,zero,ymm3[1,9,u,u,u],zero,zero,ymm3[18,26,u,u,u],zero,zero,ymm3[19,27,u,u,u],zero,zero,ymm3[20,28] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm3 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[u,u,u,5],zero,ymm2[u,u,u,u,u,6],zero,ymm2[u,u,u,u,u],zero,ymm2[23,u,u,u,u,u],zero,ymm2[24,u,u,u,u,u],zero -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u,u,u,u,25] -; AVX512DQ-NEXT: vpternlogq $200, %ymm6, %ymm8, %ymm7 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,8],zero,zero,ymm2[u,u,u,1,9],zero,zero,ymm2[u,u,u,2,10],zero,zero,ymm2[u,u,u,19,27],zero,zero,ymm2[u,u,u,20,28],zero,zero -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 -; AVX512DQ-NEXT: vporq %zmm3, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,0] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] -; AVX512DQ-NEXT: vpandn %ymm3, %ymm6, %ymm3 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,0,0,4,5,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,0] -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512DQ-NEXT: 
vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[4],zero,ymm0[u,u,u,u,u,5],zero,ymm0[u,u,u,u,u,6],zero,ymm0[u,u,u,u,u],zero,ymm0[23,u,u,u,u,u],zero,ymm0[24,u,u] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm0[2,3,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[4,u,u,u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u] -; AVX512DQ-NEXT: vpternlogq $200, %ymm3, %ymm6, %ymm7 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,8],zero,ymm0[u,u,u,u,1,9],zero,ymm0[u,u,u,u,18,26],zero,ymm0[u,u,u,u,19,27],zero,ymm0[u,u,u,u] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 -; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 -; AVX512DQ-NEXT: vmovdqa %xmm5, 96(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512DQ-NEXT: vmovdqa %ymm4, 64(%rax) +; AVX512DQ-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm7 +; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm7 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[12,13,u,u,u],zero,zero,xmm5[14,15,u,u,u] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13],zero,zero,xmm0[u,u,u,14,15],zero,zero,xmm0[u,u,u] +; AVX512DQ-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[13,u,u,u,u],zero,zero,xmm2[14,u,u,u,u],zero,zero,xmm2[15] +; AVX512DQ-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm7, %zmm0 +; AVX512DQ-NEXT: vmovdqa %xmm1, 96(%rax) +; AVX512DQ-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2280,70 +2275,69 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm6 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm7 -; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[12,13,u,u,u],zero,zero,xmm4[14,15,u,u,u] -; 
AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,12,13],zero,zero,xmm2[u,u,u,14,15],zero,zero,xmm2[u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10],zero,xmm3[u,u,u,u,13,12],zero,xmm3[u,u,u,u,15,14],zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15] -; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm7[3,1,1,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1],zero,zero,ymm2[u,u,u,10,2],zero,zero,ymm2[u,u,u,11,3],zero,zero,ymm2[u,u,u,20,28],zero,zero,ymm2[u,u,u,21,29],zero,zero,ymm2[u] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm6[1,3,3,1] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm4[1,9,u,u,u],zero,zero,ymm4[2,10,u,u,u],zero,zero,ymm4[3,19,u,u,u],zero,zero,ymm4[28,20,u,u,u],zero,zero,ymm4[29,21,u] -; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,5,5,6] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,2,3,3,2,2,3,3] -; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm4 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,1,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9],zero,ymm5[u,u,u,u,2,10],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u,20,28],zero,ymm5[u,u,u,u,21] -; AVX512DQ-FCP-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm5 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm5 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm5, %zmm2 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm6[0,2,0,2] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8,u,u,u],zero,zero,ymm4[1,9,u,u,u],zero,zero,ymm4[18,26,u,u,u],zero,zero,ymm4[19,27,u,u,u],zero,zero,ymm4[20,28] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,5,2,6,1,5,2,6] -; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm5, %ymm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u],zero,zero,ymm6[1,5,u,u,u],zero,zero,ymm6[2,6,u,u,u],zero,zero,ymm6[19,23,u,u,u],zero,zero,ymm6[24,28,u,u,u],zero -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm7[0,2,0,2] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,8],zero,zero,ymm6[u,u,u,1,9],zero,zero,ymm6[u,u,u,2,10],zero,zero,ymm6[u,u,u,19,27],zero,zero,ymm6[u,u,u,20,28],zero,zero -; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm5, %ymm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,5],zero,zero,ymm7[u,u,u,2,6],zero,zero,ymm7[u,u,u,19,23],zero,zero,ymm7[u,u,u,24,28],zero,zero,ymm7[u,u,u,25] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512DQ-FCP-NEXT: vporq %zmm4, %zmm6, %zmm4 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[1,1,0,0,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,0,1,0,0,0,0] -; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 
= xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,2,0,2] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,0,8],zero,ymm6[u,u,u,u,1,9],zero,ymm6[u,u,u,u,18,26],zero,ymm6[u,u,u,u,19,27],zero,ymm6[u,u,u,u] -; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,4],zero,ymm1[u,u,u,u,1,5],zero,ymm1[u,u,u,u,2,6],zero,ymm1[u,u,u,u,19,23],zero,ymm1[u,u,u,u,24,28],zero,ymm1[u] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, 96(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm2 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm8 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm9 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,0,2] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,ymm10[0,8,u,u,u],zero,zero,ymm10[1,9,u,u,u],zero,zero,ymm10[18,26,u,u,u],zero,zero,ymm10[19,27,u,u,u],zero,zero,ymm10[20,28] +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [1,5,2,6,1,5,2,6] +; AVX512DQ-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm11, %ymm12 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u],zero,zero,ymm12[1,5,u,u,u],zero,zero,ymm12[2,6,u,u,u],zero,zero,ymm12[19,23,u,u,u],zero,zero,ymm12[24,28,u,u,u],zero +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm7[0,2,0,2] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,8],zero,zero,ymm12[u,u,u,1,9],zero,zero,ymm12[u,u,u,2,10],zero,zero,ymm12[u,u,u,19,27],zero,zero,ymm12[u,u,u,20,28],zero,zero +; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm11, %ymm13 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,1,5],zero,zero,ymm13[u,u,u,2,6],zero,zero,ymm13[u,u,u,19,23],zero,zero,ymm13[u,u,u,24,28],zero,zero,ymm13[u,u,u,25] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 +; AVX512DQ-FCP-NEXT: vporq %zmm10, %zmm12, %zmm10 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[1,1,0,0,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,1,0,1,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm12 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm9[0,2,0,2] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,0,8],zero,ymm13[u,u,u,u,1,9],zero,ymm13[u,u,u,u,18,26],zero,ymm13[u,u,u,u,19,27],zero,ymm13[u,u,u,u] +; AVX512DQ-FCP-NEXT: vpermd %ymm9, %ymm11, %ymm11 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,4],zero,ymm11[u,u,u,u,1,5],zero,ymm11[u,u,u,u,2,6],zero,ymm11[u,u,u,u,19,23],zero,ymm11[u,u,u,u,24,28],zero,ymm11[u] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, 
%zmm13, %zmm11 +; AVX512DQ-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm11 +; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm11 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[3,1,1,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,ymm7[u,u,u,10,2],zero,zero,ymm7[u,u,u,11,3],zero,zero,ymm7[u,u,u,20,28],zero,zero,ymm7[u,u,u,21,29],zero,zero,ymm7[u] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,3,3,1] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u] +; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,4,5,5,6] +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,2,3,3,2,2,3,3] +; AVX512DQ-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm10, %ymm8 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,1,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,1,9],zero,ymm9[u,u,u,u,2,10],zero,ymm9[u,u,u,u,19,27],zero,ymm9[u,u,u,u,20,28],zero,ymm9[u,u,u,u,21] +; AVX512DQ-FCP-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm9 +; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm9 +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[12,13,u,u,u],zero,zero,xmm5[14,15,u,u,u] +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13],zero,zero,xmm0[u,u,u,14,15],zero,zero,xmm0[u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[13,u,u,u,u],zero,zero,xmm2[14,u,u,u,u],zero,zero,xmm2[15] +; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm9, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 96(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, 64(%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll index 11f422d671541a..99e8cdb179c8dc 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll @@ -314,8 +314,8 @@ define void @vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2(ptr %in.v ; ; AVX512F-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] ; 
AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) @@ -324,8 +324,8 @@ define void @vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2(ptr %in.v ; ; AVX512DQ-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) @@ -981,7 +981,7 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. ; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7] ; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpermd %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper @@ -992,7 +992,7 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7] ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper @@ -4026,10 +4026,10 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. ; ; AVX512F-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-FAST-NEXT: vpbroadcastq %xmm0, %ymm2 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] ; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 @@ -4062,10 +4062,10 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. 
; ; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm0, %ymm2 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] ; AVX512DQ-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 @@ -4541,9 +4541,9 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; ; AVX512F-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 @@ -4559,9 +4559,9 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; ; AVX512DQ-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0