Skip to content

Commit

Permalink
[X86][SSE1] Add support for logic+movmsk patterns (PR42870)
Browse files Browse the repository at this point in the history
rL368506 handled the basic case, but we need to account for boolean logic patterns as well.
  • Loading branch information
RKSimon committed Mar 24, 2020
1 parent 10bd842 commit 7144021
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 92 deletions.
73 changes: 55 additions & 18 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37027,6 +37027,51 @@ static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) {
return false;
}

// Map an integer bitwise-logic opcode (ISD::AND/OR/XOR or X86ISD::ANDNP) to
// the corresponding X86ISD floating-point logic node, used when rewriting a
// v4i32 logic tree into v4f32 ops for SSE1-only targets.
static unsigned getAltBitOpcode(unsigned Opcode) {
  switch (Opcode) {
  case ISD::AND:
    return X86ISD::FAND;
  case ISD::OR:
    return X86ISD::FOR;
  case ISD::XOR:
    return X86ISD::FXOR;
  case X86ISD::ANDNP:
    return X86ISD::FANDN;
  default:
    // Callers only pass the four opcodes above.
    llvm_unreachable("Unknown bitwise opcode");
  }
}

// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
// Recognizes a v4i1 tree of (setcc v4i32 X, 0, setlt) leaves combined with
// AND/OR/XOR, and rewrites it as the equivalent v4f32 value (bitcasting
// loads / reusing existing f32 bitcasts at the leaves, and FAND/FOR/FXOR at
// the interior nodes) so a single MOVMSK can consume it. Returns an empty
// SDValue if the tree does not match.
static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
                                          const SDLoc &DL) {
  // Only v4i1 trees feed the v4i32 MOVMSK pattern.
  if (Src.getValueType() != MVT::v4i1)
    return SDValue();

  unsigned Opc = Src.getOpcode();

  // Leaf: sign-bit test "setlt X, 0" on a v4i32 value whose bits we can
  // reinterpret as v4f32 without scalarization.
  if (Opc == ISD::SETCC) {
    SDValue CmpLHS = Src.getOperand(0);
    if (CmpLHS.getValueType() == MVT::v4i32 &&
        ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
        cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
      // A plain load can simply be reloaded as v4f32.
      if (ISD::isNormalLoad(CmpLHS.getNode()))
        return DAG.getBitcast(MVT::v4f32, CmpLHS);
      // A bitcast from v4f32 lets us reuse the original f32 vector.
      if (CmpLHS.getOpcode() == ISD::BITCAST &&
          CmpLHS.getOperand(0).getValueType() == MVT::v4f32)
        return CmpLHS.getOperand(0);
    }
    return SDValue();
  }

  // Interior node: recurse into both sides and rebuild with the FP logic op.
  if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR) {
    SDValue LHS = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
    SDValue RHS = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
    if (LHS && RHS)
      return DAG.getNode(getAltBitOpcode(Opc), DL, MVT::v4f32, LHS, RHS);
  }
  return SDValue();
}

// Helper to push sign extension of vXi1 SETCC result through bitops.
static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
SDValue Src, const SDLoc &DL) {
Expand Down Expand Up @@ -37057,6 +37102,16 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
return SDValue();

// Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
// legalization destroys the v4i32 type.
if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
DAG.getBitcast(MVT::v4f32, V));
return DAG.getZExtOrTrunc(V, DL, VT);
}
}

// If the input is a truncate from v16i8 or v32i8 go ahead and use a
// movmskb even with avx512. This will be better than truncating to vXi1 and
// using a kmov. This can especially help KNL if the input is a v16i8/v32i8
Expand Down Expand Up @@ -37319,24 +37374,6 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
return V;

// Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
// legalization destroys the v4i32 type.
if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && SrcVT == MVT::v4i1 &&
VT.isScalarInteger() && N0.getOpcode() == ISD::SETCC &&
N0.getOperand(0).getValueType() == MVT::v4i32 &&
ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) &&
cast<CondCodeSDNode>(N0.getOperand(2))->get() == ISD::SETLT) {
SDValue N00 = N0.getOperand(0);
// Only do this if we can avoid scalarizing the input.
if (ISD::isNormalLoad(N00.getNode()) ||
(N00.getOpcode() == ISD::BITCAST &&
N00.getOperand(0).getValueType() == MVT::v4f32)) {
SDValue V = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32,
DAG.getBitcast(MVT::v4f32, N00));
return DAG.getZExtOrTrunc(V, dl, VT);
}
}

// If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
// type, widen both sides to avoid a trip through memory.
if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
Expand Down
78 changes: 4 additions & 74 deletions llvm/test/CodeGen/X86/pr42870.ll
Original file line number Diff line number Diff line change
Expand Up @@ -33,26 +33,8 @@ start:
define i32 @test_and(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: test_and:
; CHECK: ## %bb.0: ## %start
; CHECK-NEXT: subl $28, %esp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: andps %xmm1, %xmm0
; CHECK-NEXT: movaps %xmm0, (%esp)
; CHECK-NEXT: cmpl $0, (%esp)
; CHECK-NEXT: sets %al
; CHECK-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: sets %cl
; CHECK-NEXT: addb %cl, %cl
; CHECK-NEXT: orb %al, %cl
; CHECK-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: sets %al
; CHECK-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: sets %dl
; CHECK-NEXT: addb %dl, %dl
; CHECK-NEXT: orb %al, %dl
; CHECK-NEXT: shlb $2, %dl
; CHECK-NEXT: orb %cl, %dl
; CHECK-NEXT: movzbl %dl, %eax
; CHECK-NEXT: addl $28, %esp
; CHECK-NEXT: movmskps %xmm0, %eax
; CHECK-NEXT: retl
start:
%0 = bitcast <4 x float> %a to <4 x i32>
Expand All @@ -68,26 +50,8 @@ start:
define i32 @test_or(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: test_or:
; CHECK: ## %bb.0: ## %start
; CHECK-NEXT: subl $28, %esp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: orps %xmm1, %xmm0
; CHECK-NEXT: movaps %xmm0, (%esp)
; CHECK-NEXT: cmpl $0, (%esp)
; CHECK-NEXT: sets %al
; CHECK-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: sets %cl
; CHECK-NEXT: addb %cl, %cl
; CHECK-NEXT: orb %al, %cl
; CHECK-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: sets %al
; CHECK-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: sets %dl
; CHECK-NEXT: addb %dl, %dl
; CHECK-NEXT: orb %al, %dl
; CHECK-NEXT: shlb $2, %dl
; CHECK-NEXT: orb %cl, %dl
; CHECK-NEXT: movzbl %dl, %eax
; CHECK-NEXT: addl $28, %esp
; CHECK-NEXT: movmskps %xmm0, %eax
; CHECK-NEXT: retl
start:
%0 = bitcast <4 x float> %a to <4 x i32>
Expand All @@ -103,42 +67,8 @@ start:
define i32 @test_xor(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: test_xor:
; CHECK: ## %bb.0: ## %start
; CHECK-NEXT: pushl %ebx
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: subl $40, %esp
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: .cfi_offset %ebx, -8
; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movaps %xmm1, (%esp)
; CHECK-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: sets %al
; CHECK-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: sets %cl
; CHECK-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: sets %dl
; CHECK-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: sets %ah
; CHECK-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: sets %ch
; CHECK-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: sets %dh
; CHECK-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: sets %bl
; CHECK-NEXT: cmpl $0, (%esp)
; CHECK-NEXT: sets %bh
; CHECK-NEXT: xorb %ah, %bh
; CHECK-NEXT: xorb %dl, %bl
; CHECK-NEXT: addb %bl, %bl
; CHECK-NEXT: orb %bh, %bl
; CHECK-NEXT: xorb %cl, %dh
; CHECK-NEXT: xorb %al, %ch
; CHECK-NEXT: addb %ch, %ch
; CHECK-NEXT: orb %dh, %ch
; CHECK-NEXT: shlb $2, %ch
; CHECK-NEXT: orb %bl, %ch
; CHECK-NEXT: movzbl %ch, %eax
; CHECK-NEXT: addl $40, %esp
; CHECK-NEXT: popl %ebx
; CHECK-NEXT: xorps %xmm1, %xmm0
; CHECK-NEXT: movmskps %xmm0, %eax
; CHECK-NEXT: retl
start:
%0 = bitcast <4 x float> %a to <4 x i32>
Expand Down

0 comments on commit 7144021

Please sign in to comment.