diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 017993399c143d..3828de5edd6833 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -37027,6 +37027,51 @@ static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) {
   return false;
 }
 
+// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
+static unsigned getAltBitOpcode(unsigned Opcode) {
+  switch(Opcode) {
+  case ISD::AND: return X86ISD::FAND;
+  case ISD::OR: return X86ISD::FOR;
+  case ISD::XOR: return X86ISD::FXOR;
+  case X86ISD::ANDNP: return X86ISD::FANDN;
+  }
+  llvm_unreachable("Unknown bitwise opcode");
+}
+
+// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
+static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
+                                          const SDLoc &DL) {
+  EVT SrcVT = Src.getValueType();
+  if (SrcVT != MVT::v4i1)
+    return SDValue();
+
+  switch (Src.getOpcode()) {
+  case ISD::SETCC:
+    if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
+        ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
+        cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
+      SDValue Op0 = Src.getOperand(0);
+      if (ISD::isNormalLoad(Op0.getNode()))
+        return DAG.getBitcast(MVT::v4f32, Op0);
+      if (Op0.getOpcode() == ISD::BITCAST &&
+          Op0.getOperand(0).getValueType() == MVT::v4f32)
+        return Op0.getOperand(0);
+    }
+    break;
+  case ISD::AND:
+  case ISD::XOR:
+  case ISD::OR: {
+    SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
+    SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
+    if (Op0 && Op1)
+      return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
+                         Op1);
+    break;
+  }
+  }
+  return SDValue();
+}
+
 // Helper to push sign extension of vXi1 SETCC result through bitops.
 static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
                                           SDValue Src, const SDLoc &DL) {
@@ -37057,6 +37102,16 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
   if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
     return SDValue();
 
+  // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
+  // legalization destroys the v4i32 type.
+  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
+    if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
+      V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
+                      DAG.getBitcast(MVT::v4f32, V));
+      return DAG.getZExtOrTrunc(V, DL, VT);
+    }
+  }
+
   // If the input is a truncate from v16i8 or v32i8 go ahead and use a
   // movmskb even with avx512. This will be better than truncating to vXi1 and
   // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
@@ -37319,24 +37374,6 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
     if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
       return V;
 
-  // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
-  // legalization destroys the v4i32 type.
-  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && SrcVT == MVT::v4i1 &&
-      VT.isScalarInteger() && N0.getOpcode() == ISD::SETCC &&
-      N0.getOperand(0).getValueType() == MVT::v4i32 &&
-      ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) &&
-      cast<CondCodeSDNode>(N0.getOperand(2))->get() == ISD::SETLT) {
-    SDValue N00 = N0.getOperand(0);
-    // Only do this if we can avoid scalarizing the input.
-    if (ISD::isNormalLoad(N00.getNode()) ||
-        (N00.getOpcode() == ISD::BITCAST &&
-         N00.getOperand(0).getValueType() == MVT::v4f32)) {
-      SDValue V = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32,
-                              DAG.getBitcast(MVT::v4f32, N00));
-      return DAG.getZExtOrTrunc(V, dl, VT);
-    }
-  }
-
   // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
   // type, widen both sides to avoid a trip through memory.
   if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
diff --git a/llvm/test/CodeGen/X86/pr42870.ll b/llvm/test/CodeGen/X86/pr42870.ll
index e4ffcb4787e868..c42cb7cb8b2865 100644
--- a/llvm/test/CodeGen/X86/pr42870.ll
+++ b/llvm/test/CodeGen/X86/pr42870.ll
@@ -33,26 +33,8 @@ start:
 define i32 @test_and(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: test_and:
 ; CHECK:       ## %bb.0: ## %start
-; CHECK-NEXT:    subl $28, %esp
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    andps %xmm1, %xmm0
-; CHECK-NEXT:    movaps %xmm0, (%esp)
-; CHECK-NEXT:    cmpl $0, (%esp)
-; CHECK-NEXT:    sets %al
-; CHECK-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    sets %cl
-; CHECK-NEXT:    addb %cl, %cl
-; CHECK-NEXT:    orb %al, %cl
-; CHECK-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    sets %al
-; CHECK-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    sets %dl
-; CHECK-NEXT:    addb %dl, %dl
-; CHECK-NEXT:    orb %al, %dl
-; CHECK-NEXT:    shlb $2, %dl
-; CHECK-NEXT:    orb %cl, %dl
-; CHECK-NEXT:    movzbl %dl, %eax
-; CHECK-NEXT:    addl $28, %esp
+; CHECK-NEXT:    movmskps %xmm0, %eax
 ; CHECK-NEXT:    retl
 start:
   %0 = bitcast <4 x float> %a to <4 x i32>
@@ -68,26 +50,8 @@ start:
 define i32 @test_or(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: test_or:
 ; CHECK:       ## %bb.0: ## %start
-; CHECK-NEXT:    subl $28, %esp
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    orps %xmm1, %xmm0
-; CHECK-NEXT:    movaps %xmm0, (%esp)
-; CHECK-NEXT:    cmpl $0, (%esp)
-; CHECK-NEXT:    sets %al
-; CHECK-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    sets %cl
-; CHECK-NEXT:    addb %cl, %cl
-; CHECK-NEXT:    orb %al, %cl
-; CHECK-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    sets %al
-; CHECK-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    sets %dl
-; CHECK-NEXT:    addb %dl, %dl
-; CHECK-NEXT:    orb %al, %dl
-; CHECK-NEXT:    shlb $2, %dl
-; CHECK-NEXT:    orb %cl, %dl
-; CHECK-NEXT:    movzbl %dl, %eax
-; CHECK-NEXT:    addl $28, %esp
+; CHECK-NEXT:    movmskps %xmm0, %eax
 ; CHECK-NEXT:    retl
 start:
   %0 = bitcast <4 x float> %a to <4 x i32>
@@ -103,42 +67,8 @@ start:
 define i32 @test_xor(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: test_xor:
 ; CHECK:       ## %bb.0: ## %start
-; CHECK-NEXT:    pushl %ebx
-; CHECK-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-NEXT:    subl $40, %esp
-; CHECK-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-NEXT:    .cfi_offset %ebx, -8
-; CHECK-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movaps %xmm1, (%esp)
-; CHECK-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    sets %al
-; CHECK-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    sets %cl
-; CHECK-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    sets %dl
-; CHECK-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    sets %ah
-; CHECK-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    sets %ch
-; CHECK-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    sets %dh
-; CHECK-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    sets %bl
-; CHECK-NEXT:    cmpl $0, (%esp)
-; CHECK-NEXT:    sets %bh
-; CHECK-NEXT:    xorb %ah, %bh
-; CHECK-NEXT:    xorb %dl, %bl
-; CHECK-NEXT:    addb %bl, %bl
-; CHECK-NEXT:    orb %bh, %bl
-; CHECK-NEXT:    xorb %cl, %dh
-; CHECK-NEXT:    xorb %al, %ch
-; CHECK-NEXT:    addb %ch, %ch
-; CHECK-NEXT:    orb %dh, %ch
-; CHECK-NEXT:    shlb $2, %ch
-; CHECK-NEXT:    orb %bl, %ch
-; CHECK-NEXT:    movzbl %ch, %eax
-; CHECK-NEXT:    addl $40, %esp
-; CHECK-NEXT:    popl %ebx
+; CHECK-NEXT:    xorps %xmm1, %xmm0
+; CHECK-NEXT:    movmskps %xmm0, %eax
 ; CHECK-NEXT:    retl
 start:
   %0 = bitcast <4 x float> %a to <4 x i32>
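
The RUN line and the full IR bodies of pr42870.ll fall outside these hunks, so as an illustration only, here is a minimal LLVM IR sketch of the shape the new adjustBitcastSrcVectorSSE1 path handles: each <4 x float> input is bitcast to <4 x i32>, its sign bits are extracted with an slt compare against zero, the v4i1 masks are combined with a logic op, and the mask is bitcast to i4 and zero-extended. Function and value names below are hypothetical, not copied from the test:

; Sketch of the test_and-style pattern (hypothetical names).
define i32 @movmsk_and_sketch(<4 x float> %a, <4 x float> %b) {
start:
  %ia = bitcast <4 x float> %a to <4 x i32>
  %ib = bitcast <4 x float> %b to <4 x i32>
  ; slt against zero is true exactly when a lane's sign bit is set.
  %ma = icmp slt <4 x i32> %ia, zeroinitializer
  %mb = icmp slt <4 x i32> %ib, zeroinitializer
  ; Logic op on the v4i1 masks; adjustBitcastSrcVectorSSE1 rewrites this as
  ; X86ISD::FAND on v4f32, so the chain can fold to andps + movmskps.
  %m  = and <4 x i1> %ma, %mb
  %i4 = bitcast <4 x i1> %m to i4
  %r  = zext i4 %i4 to i32
  ret i32 %r
}

On an SSE1-only target (SSE2 disabled, as in the CHECK prefixes above), the whole sequence should now lower to andps %xmm1, %xmm0 followed by movmskps %xmm0, %eax rather than spilling the vector and testing each lane, which is exactly the change the updated test_and/test_or/test_xor CHECK lines capture.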