[X86] LowerShift - lower vXi8 shifts of a uniform constant using PSHUFB #112175
Conversation
If each 128-bit vXi8 lane is shifting the same constant value, we can pre-compute the 8 valid shift results and use PSHUFB to act as a LUT with the shift amount. Fixes llvm#110317
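To illustrate the idea, here is a minimal scalar sketch of the LUT construction and lookup. It is not code from the patch: the helper names (`buildShlLUT`, `pshufbByte`) and the example constant (1, matching the SHL test below) are assumptions for illustration only.

```cpp
// Scalar model of the PSHUFB-as-LUT lowering for a uniform vXi8 shift.
#include <array>
#include <cstdint>
#include <cstdio>

// Build the 16-entry per-lane table: entries 0..7 hold C shifted left by
// 0..7, entries 8..15 are zero (shift amounts >= 8 are undefined for i8,
// and they simply select the appended zero entries).
std::array<uint8_t, 16> buildShlLUT(uint8_t C) {
  std::array<uint8_t, 16> LUT{};
  for (unsigned I = 0; I != 8; ++I)
    LUT[I] = uint8_t(C << I);
  return LUT;
}

// Model of PSHUFB for one byte: if the index byte has its MSB set the
// result is 0, otherwise the low 4 bits select a table entry.
uint8_t pshufbByte(const std::array<uint8_t, 16> &Table, uint8_t Idx) {
  return (Idx & 0x80) ? 0 : Table[Idx & 0x0F];
}

int main() {
  auto LUT = buildShlLUT(1); // the [1,2,4,8,16,32,64,128,0,...] constant in the tests
  for (uint8_t Amt = 0; Amt != 8; ++Amt)
    std::printf("1 << %u = %u\n", Amt, pshufbByte(LUT, Amt));
}
```

In the vector lowering, that lookup collapses to a single PSHUFB with the shift amounts as the shuffle mask, which is what the updated test checks below demonstrate.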
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

If each 128-bit vXi8 lane is shifting the same constant value, we can pre-compute the 8 valid shift results and use PSHUFB to act as a LUT with the shift amount. Fixes #110317

Patch is 128.65 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/112175.diff

2 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e57ca7a31dce2a..4c16d2eaac4cd2 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -30143,6 +30143,39 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
}
+ // If we're shifting (per-lane) uniform vXi8 constants, we can use PSHUFB to
+ // look up the pre-computed shift values.
+ if ((VT == MVT::v16i8 && Subtarget.hasSSSE3()) ||
+ (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
+ (VT == MVT::v64i8 && Subtarget.hasBWI())) {
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumLanes = VT.getSizeInBits() / 128u;
+ unsigned NumEltsPerLane = NumElts / NumLanes;
+ SmallVector<APInt, 16> LUT;
+ for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+ unsigned LoElt = Lane * NumEltsPerLane;
+ APInt EltMask = APInt::getBitsSet(NumElts, LoElt, LoElt + NumEltsPerLane);
+ KnownBits KnownLane = DAG.computeKnownBits(R, EltMask);
+ if (!KnownLane.isConstant())
+ break;
+ const APInt &LaneSplat = KnownLane.getConstant();
+ for (unsigned I = 0; I != 8; ++I) {
+ if (Opc == ISD::SHL)
+ LUT.push_back(LaneSplat.shl(I));
+ else if (Opc == ISD::SRL)
+ LUT.push_back(LaneSplat.lshr(I));
+ else if (Opc == ISD::SRA)
+ LUT.push_back(LaneSplat.ashr(I));
+ }
+ LUT.append(8, APInt::getZero(8));
+ }
+ if (LUT.size() == NumElts) {
+ APInt Undefs = APInt::getSplat(NumElts, APInt(16, 0xFF00));
+ SDValue Mask = getConstVector(LUT, Undefs, VT, DAG, dl);
+ return DAG.getNode(X86ISD::PSHUFB, dl, VT, Mask, Amt);
+ }
+ }
+
// It's worth extending once and using the vXi16/vXi32 shifts for smaller
// types, but without AVX512 the extra overheads to get from vXi8 to vXi32
// make the existing SSE solution better.
diff --git a/llvm/test/CodeGen/X86/vector-shift-lut.ll b/llvm/test/CodeGen/X86/vector-shift-lut.ll
index 4b6021f6f16b36..0bf2006090893c 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lut.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lut.ll
@@ -43,48 +43,16 @@ define <16 x i8> @uniform_shl_v16i8(<16 x i8> %a) nounwind {
;
; SSE41-LABEL: uniform_shl_v16i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: psllw $5, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; SSE41-NEXT: pblendvb %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psllw $2, %xmm2
-; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE41-NEXT: paddb %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: paddb %xmm1, %xmm2
-; SSE41-NEXT: paddb %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movq {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: pshufb %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: uniform_shl_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllw $5, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX1-NEXT: vpblendvb %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpsllw $2, %xmm1, %xmm2
-; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpblendvb %xmm0, %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpblendvb %xmm0, %xmm2, %xmm1, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: uniform_shl_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllw $5, %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX2-NEXT: vpblendvb %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vpsllw $2, %xmm1, %xmm2
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpblendvb %xmm0, %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm2
-; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpblendvb %xmm0, %xmm2, %xmm1, %xmm0
-; AVX2-NEXT: retq
+; AVX-LABEL: uniform_shl_v16i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovq {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
;
; XOP-LABEL: uniform_shl_v16i8:
; XOP: # %bb.0:
@@ -93,20 +61,14 @@ define <16 x i8> @uniform_shl_v16i8(<16 x i8> %a) nounwind {
;
; AVX512DQ-LABEL: uniform_shl_v16i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512DQ-NEXT: vpsllvd %zmm0, %zmm1, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: uniform_shl_v16i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512BW-NEXT: vpsllvw %ymm0, %ymm1, %ymm0
-; AVX512BW-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: retq
%shift = shl <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, %a
ret <16 x i8> %shift
@@ -137,54 +99,16 @@ define <16 x i8> @uniform_lshr_v16i8(<16 x i8> %a) nounwind {
;
; SSE41-LABEL: uniform_lshr_v16i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: psllw $5, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $2, %xmm2
-; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE41-NEXT: paddb %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $1, %xmm2
-; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE41-NEXT: paddb %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movd {{.*#+}} xmm1 = [2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: pshufb %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: uniform_lshr_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllw $5, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
-; AVX1-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2
-; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpblendvb %xmm0, %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm2
-; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpblendvb %xmm0, %xmm2, %xmm1, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: uniform_lshr_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllw $5, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
-; AVX2-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpsrlw $2, %xmm1, %xmm2
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpblendvb %xmm0, %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm2
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpblendvb %xmm0, %xmm2, %xmm1, %xmm0
-; AVX2-NEXT: retq
+; AVX-LABEL: uniform_lshr_v16i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovd {{.*#+}} xmm1 = [2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
;
; XOP-LABEL: uniform_lshr_v16i8:
; XOP: # %bb.0:
@@ -195,20 +119,14 @@ define <16 x i8> @uniform_lshr_v16i8(<16 x i8> %a) nounwind {
;
; AVX512DQ-LABEL: uniform_lshr_v16i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
-; AVX512DQ-NEXT: vpsrlvd %zmm0, %zmm1, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: vmovd {{.*#+}} xmm1 = [2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: uniform_lshr_v16i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
-; AVX512BW-NEXT: vpsrlvw %ymm0, %ymm1, %ymm0
-; AVX512BW-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: retq
%shift = lshr <16 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>, %a
ret <16 x i8> %shift
@@ -239,54 +157,16 @@ define <16 x i8> @uniform_ashr_v16i8(<16 x i8> %a) nounwind {
;
; SSE41-LABEL: uniform_ashr_v16i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: psllw $5, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $2, %xmm2
-; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE41-NEXT: paddb %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $1, %xmm2
-; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE41-NEXT: paddb %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movd {{.*#+}} xmm1 = [3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: pshufb %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: uniform_ashr_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllw $5, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
-; AVX1-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2
-; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpblendvb %xmm0, %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm2
-; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpblendvb %xmm0, %xmm2, %xmm1, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: uniform_ashr_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllw $5, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
-; AVX2-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpsrlw $2, %xmm1, %xmm2
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpblendvb %xmm0, %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm2
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpblendvb %xmm0, %xmm2, %xmm1, %xmm0
-; AVX2-NEXT: retq
+; AVX-LABEL: uniform_ashr_v16i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovd {{.*#+}} xmm1 = [3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
;
; XOP-LABEL: uniform_ashr_v16i8:
; XOP: # %bb.0:
@@ -297,20 +177,14 @@ define <16 x i8> @uniform_ashr_v16i8(<16 x i8> %a) nounwind {
;
; AVX512DQ-LABEL: uniform_ashr_v16i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} zmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
-; AVX512DQ-NEXT: vpsrlvd %zmm0, %zmm1, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: vmovd {{.*#+}} xmm1 = [3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: uniform_ashr_v16i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
-; AVX512BW-NEXT: vpsrlvw %ymm0, %ymm1, %ymm0
-; AVX512BW-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: retq
%shift = ashr <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>, %a
ret <16 x i8> %shift
@@ -376,78 +250,27 @@ define <32 x i8> @uniform_shl_v32i8(<32 x i8> %a) nounwind {
;
; SSE41-LABEL: uniform_shl_v32i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psllw $5, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm5
-; SSE41-NEXT: psllw $2, %xmm5
-; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; SSE41-NEXT: pand %xmm6, %xmm5
-; SSE41-NEXT: paddb %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm5, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm5
-; SSE41-NEXT: paddb %xmm3, %xmm5
-; SSE41-NEXT: paddb %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm5, %xmm3
-; SSE41-NEXT: psllw $5, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: psllw $2, %xmm4
-; SSE41-NEXT: pand %xmm6, %xmm4
-; SSE41-NEXT: paddb %xmm2, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: paddb %xmm1, %xmm4
-; SSE41-NEXT: paddb %xmm2, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1
+; SSE41-NEXT: movq {{.*#+}} xmm2 = [4,8,16,32,64,128,0,0,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: pshufb %xmm0, %xmm3
+; SSE41-NEXT: pshufb %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: uniform_shl_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
-; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm3, %xmm4
-; AVX1-NEXT: vpsllw $2, %xmm4, %xmm5
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm5
-; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm4, %xmm1
-; AVX1-NEXT: vpsllw $5, %xmm0, %xmm0
-; AVX1-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsllw $2, %xmm2, %xmm3
-; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
-; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpblendvb %xmm0, %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm3
-; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpblendvb %xmm0, %xmm3, %xmm2, %xmm0
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [4,8,16,32,64,128,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uniform_shl_v32i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllw $5, %ymm0, %ymm0
-; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
-; AVX2-NEXT: vpblendvb %ymm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpsllw $2, %ymm1, %ymm2
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0
-; AVX2-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm2
-; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0
-; AVX2-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4,8,16,32,64,128,0,0,4,8,16,32,64,128,0,0,4,8,16,32,64,128,0,0,4,8,16,32,64,128,0,0]
+; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: uniform_shl_v32i8:
@@ -461,33 +284,20 @@ define <32 x i8> @uniform_shl_v32i8(<32 x i8> %a) nounwind {
;
; XOPAVX2-LABEL: uniform_shl_v32i8:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
-; XOPAVX2-NEXT: vpshlb %xmm1, %xmm2, %xmm1
-; XOPAVX2-NEXT: vpshlb %xmm0, %xmm2, %xmm0
-; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4,8,16,32,64,128,0,0,4,8,16,32,64,128,0,0,4,8,16,32,64,128,0,0,4,8,16,32,64,128,0,0]
+; XOPAVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: uniform_shl_v32i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpsllw $5, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
-; AVX512DQ-NEXT: vpblendvb %ymm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512DQ-NEXT: vpsllw $2, %ymm1, %ymm2
-; AVX512DQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpaddb %ymm0, %ymm0, %ymm...
[truncated]
LGTM.