From 0b637a639e8b8972b0a17322cecb50bf0d99b7c6 Mon Sep 17 00:00:00 2001 From: Mike Danes Date: Sun, 17 Sep 2017 12:49:10 +0300 Subject: [PATCH 1/4] Simplify SIMD EQ/NE optimization --- src/jit/codegenxarch.cpp | 21 ----------- src/jit/lower.cpp | 50 --------------------------- src/jit/lowerxarch.cpp | 67 ++++++++++++++++++++++++++++++++++++ src/jit/lsra.cpp | 2 +- src/jit/lsraxarch.cpp | 51 ++++++--------------------- src/jit/simdcodegenxarch.cpp | 45 ++++-------------------- 6 files changed, 85 insertions(+), 151 deletions(-) diff --git a/src/jit/codegenxarch.cpp b/src/jit/codegenxarch.cpp index 58c6c1022226..43327afc55ae 100644 --- a/src/jit/codegenxarch.cpp +++ b/src/jit/codegenxarch.cpp @@ -6282,27 +6282,6 @@ void CodeGen::genCompareInt(GenTreePtr treeNode) return; } -#ifdef FEATURE_SIMD - // If we have GT_JTRUE(GT_EQ/NE(GT_SIMD((in)Equality, v1, v2), true/false)), - // then we don't need to generate code for GT_EQ/GT_NE, since SIMD (in)Equality intrinsic - // would set or clear Zero flag. - if ((targetReg == REG_NA) && tree->OperIs(GT_EQ, GT_NE)) - { - // Is it a SIMD (in)Equality that doesn't need to materialize result into a register? - if ((op1->gtRegNum == REG_NA) && op1->IsSIMDEqualityOrInequality()) - { - // Must be comparing against true or false. - assert(op2->IsIntegralConst(0) || op2->IsIntegralConst(1)); - assert(op2->isContainedIntOrIImmed()); - - // In this case SIMD (in)Equality will set or clear - // Zero flag, based on which GT_JTRUE would generate - // the right conditional jump. - return; - } - } -#endif // FEATURE_SIMD - genConsumeOperands(tree); assert(!op1->isContainedIntOrIImmed()); diff --git a/src/jit/lower.cpp b/src/jit/lower.cpp index d82676e540a5..0b9f9b22f7e6 100644 --- a/src/jit/lower.cpp +++ b/src/jit/lower.cpp @@ -5606,56 +5606,6 @@ void Lowering::ContainCheckJTrue(GenTreeOp* node) // The compare does not need to be generated into a register. GenTree* cmp = node->gtGetOp1(); cmp->gtLsraInfo.isNoRegCompare = true; - -#ifdef FEATURE_SIMD - assert(node->OperIs(GT_JTRUE)); - - // Say we have the following IR - // simdCompareResult = GT_SIMD((In)Equality, v1, v2) - // integerCompareResult = GT_EQ/NE(simdCompareResult, true/false) - // GT_JTRUE(integerCompareResult) - // - // In this case we don't need to generate code for GT_EQ_/NE, since SIMD (In)Equality - // intrinsic will set or clear the Zero flag. - genTreeOps cmpOper = cmp->OperGet(); - if (cmpOper == GT_EQ || cmpOper == GT_NE) - { - GenTree* cmpOp1 = cmp->gtGetOp1(); - GenTree* cmpOp2 = cmp->gtGetOp2(); - - if (cmpOp1->IsSIMDEqualityOrInequality() && (cmpOp2->IsIntegralConst(0) || cmpOp2->IsIntegralConst(1))) - { - // We always generate code for a SIMD equality comparison, though it produces no value. - // Neither the GT_JTRUE nor the immediate need to be evaluated. - MakeSrcContained(cmp, cmpOp2); - cmpOp1->gtLsraInfo.isNoRegCompare = true; - // We have to reverse compare oper in the following cases: - // 1) SIMD Equality: Sets Zero flag on equal otherwise clears it. - // Therefore, if compare oper is == or != against false(0), we will - // be checking opposite of what is required. - // - // 2) SIMD inEquality: Clears Zero flag on true otherwise sets it. - // Therefore, if compare oper is == or != against true(1), we will - // be checking opposite of what is required. 
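// [Editor's illustration -- not part of the patch.] The hunks above delete the old
// special case for GT_JTRUE(GT_EQ/GT_NE(SIMD (in)equality, 0/1)). The machine-level
// fact being exploited is that a SIMD equality test ends in a scalar compare that
// already sets ZF, so the 0/1 result never has to be materialized when it only feeds
// a branch. A minimal sketch with SSE2 intrinsics; AllLanesEqual is an illustrative
// name, not a JIT or library API.

#include <emmintrin.h> // SSE2
#include <cstdio>

static bool AllLanesEqual(__m128i a, __m128i b)
{
    __m128i eq = _mm_cmpeq_epi32(a, b); // per lane: all-ones if equal, all-zeros otherwise
    // pmovmskb gathers one bit per byte; 16 set bits (0xFFFF) means every lane matched.
    // The compare against 0xFFFF sets ZF, which a following branch can consume directly.
    return _mm_movemask_epi8(eq) == 0xFFFF;
}

int main()
{
    __m128i x = _mm_set_epi32(1, 2, 3, 4);
    __m128i y = _mm_set_epi32(1, 2, 3, 4);
    if (AllLanesEqual(x, y)) // branch driven by the flags from the compare above
    {
        std::printf("equal\n");
    }
    return 0;
}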
- GenTreeSIMD* simdNode = cmpOp1->AsSIMD(); - if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) - { - if (cmpOp2->IsIntegralConst(0)) - { - cmp->SetOper(GenTree::ReverseRelop(cmpOper)); - } - } - else - { - assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpInEquality); - if (cmpOp2->IsIntegralConst(1)) - { - cmp->SetOper(GenTree::ReverseRelop(cmpOper)); - } - } - } - } -#endif // FEATURE_SIMD } #endif // !LEGACY_BACKEND diff --git a/src/jit/lowerxarch.cpp b/src/jit/lowerxarch.cpp index bb45279887e9..834cd34fda3f 100644 --- a/src/jit/lowerxarch.cpp +++ b/src/jit/lowerxarch.cpp @@ -794,6 +794,73 @@ void Lowering::LowerSIMD(GenTreeSIMD* simdNode) // the addr of SIMD vector with the given index. simdNode->gtOp1->gtFlags |= GTF_IND_REQ_ADDR_IN_REG; } + else if (simdNode->IsSIMDEqualityOrInequality()) + { + LIR::Use simdUse; + + if (BlockRange().TryGetUse(simdNode, &simdUse)) + { + // + // Try to transform JTRUE(EQ|NE(SIMD(x, y), 0|1)) into + // JCC(SIMD(x, y)). SIMD(x, y) + // is expected to set the Zero flag appropriately. + // All the involved nodes must form a continuous range, there's no other way to + // guarantee that condition flags aren't changed between the SIMD node and the JCC + // node. + // + + bool transformed = false; + GenTree* simdUser = simdUse.User(); + + if (simdUser->OperIs(GT_EQ, GT_NE) && simdUser->gtGetOp2()->IsCnsIntOrI() && + (simdNode->gtNext == simdUser->gtGetOp2()) && (simdUser->gtGetOp2()->gtNext == simdUser)) + { + ssize_t relopOp2Value = simdUser->gtGetOp2()->AsIntCon()->IconValue(); + + if ((relopOp2Value == 0) || (relopOp2Value == 1)) + { + GenTree* jtrue = simdUser->gtNext; + + if ((jtrue != nullptr) && jtrue->OperIs(GT_JTRUE) && (jtrue->gtGetOp1() == simdUser)) + { + if ((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) != simdUser->OperIs(GT_EQ)) + { + relopOp2Value ^= 1; + } + + jtrue->ChangeOper(GT_JCC); + GenTreeCC* jcc = jtrue->AsCC(); + jcc->gtFlags |= GTF_USE_FLAGS; + jcc->gtCondition = (relopOp2Value == 0) ? GT_NE : GT_EQ; + + BlockRange().Remove(simdUser->gtGetOp2()); + BlockRange().Remove(simdUser); + transformed = true; + } + } + } + + if (!transformed) + { + // + // The code generated for SIMD SIMD(x, y) nodes sets + // the Zero flag like integer compares do so we can simply use SETCC + // to produce the desired result. This avoids the need for subsequent phases + // to have to handle 2 cases (set flags/set destination register). + // + + genTreeOps condition = (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) ? 
GT_EQ : GT_NE; + GenTreeCC* setcc = new (comp, GT_SETCC) GenTreeCC(GT_SETCC, condition, simdNode->TypeGet()); + setcc->gtFlags |= GTF_USE_FLAGS; + BlockRange().InsertAfter(simdNode, setcc); + simdUse.ReplaceWith(comp, setcc); + } + } + + simdNode->gtFlags |= GTF_SET_FLAGS; + simdNode->SetUnusedValue(); + simdNode->gtLsraInfo.isNoRegCompare = true; + } #endif ContainCheckSIMD(simdNode); } diff --git a/src/jit/lsra.cpp b/src/jit/lsra.cpp index db6e69913445..1ace2f0da60b 100644 --- a/src/jit/lsra.cpp +++ b/src/jit/lsra.cpp @@ -4842,7 +4842,7 @@ void LinearScan::buildIntervals() TreeNodeInfoInit(node); // If the node produces an unused value, mark it as a local def-use - if (node->IsValue() && node->IsUnusedValue()) + if (node->IsValue() && node->IsUnusedValue() && !node->gtLsraInfo.isNoRegCompare) { node->gtLsraInfo.isLocalDefUse = true; node->gtLsraInfo.dstCount = 0; diff --git a/src/jit/lsraxarch.cpp b/src/jit/lsraxarch.cpp index 8ac8e235b536..4fff1115065b 100644 --- a/src/jit/lsraxarch.cpp +++ b/src/jit/lsraxarch.cpp @@ -254,20 +254,6 @@ void LinearScan::TreeNodeInfoInit(GenTree* tree) GenTree* cmp = tree->gtGetOp1(); assert(cmp->gtLsraInfo.dstCount == 0); - -#ifdef FEATURE_SIMD - GenTree* cmpOp1 = cmp->gtGetOp1(); - GenTree* cmpOp2 = cmp->gtGetOp2(); - if (cmpOp1->IsSIMDEqualityOrInequality() && cmpOp2->isContained()) - { - genTreeOps cmpOper = cmp->OperGet(); - - // We always generate code for a SIMD equality comparison, but the compare itself produces no value. - // Neither the SIMD node nor the immediate need to be evaluated into a register. - assert(cmpOp1->gtLsraInfo.dstCount == 0); - assert(cmpOp2->gtLsraInfo.dstCount == 0); - } -#endif // FEATURE_SIMD } break; @@ -2136,41 +2122,26 @@ void LinearScan::TreeNodeInfoInitSIMD(GenTreeSIMD* simdTree) case SIMDIntrinsicOpEquality: case SIMDIntrinsicOpInEquality: - - // On SSE4/AVX, we can generate optimal code for (in)equality - // against zero using ptest. We can safely do this optimization - // for integral vectors but not for floating-point for the reason - // that we have +0.0 and -0.0 and +0.0 == -0.0 if (simdTree->gtGetOp2()->isContained()) { + // If the second operand is contained then ContainCheckSIMD has determined + // that PTEST can be used. We only need a single source register and no + // internal registers. info->srcCount = 1; } else { - info->srcCount = 2; - // Need one SIMD register as scratch. - // See genSIMDIntrinsicRelOp() for details on code sequence generated and - // the need for one scratch register. - // - // Note these intrinsics produce a BOOL result, hence internal float - // registers reserved are guaranteed to be different from target - // integer register without explicitly specifying. + // Can't use PTEST so we need 2 source registers, 1 internal SIMD register + // (to hold the result of PCMPEQD or other similar SIMD compare instruction) + // and one internal INT register (to hold the result of PMOVMSKB). + info->srcCount = 2; info->internalFloatCount = 1; info->setInternalCandidates(this, allSIMDRegs()); + info->internalIntCount = 1; + info->addInternalCandidates(this, allRegs(TYP_INT)); } - if (info->isNoRegCompare) - { - info->dstCount = 0; - // Codegen of SIMD (in)Equality uses target integer reg only for setting flags. - // A target reg is not needed on AVX when comparing against Vector Zero. - // In all other cases we need to reserve an int type internal register if we - // don't have a target register on the compare. 
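// [Editor's illustration -- not part of the patch.] The LSRA change above reserves
// registers for two different instruction sequences: the PTEST path (contained zero
// operand, one source register, no internals) and the general path (PCMPEQ* followed
// by PMOVMSKB, which needs a scratch SIMD register and a scratch integer register).
// A hedged sketch of both sequences with intrinsics; assumes SSE4.1 for ptest, and
// the function names are illustrative only.

#include <smmintrin.h> // SSE4.1 (_mm_testz_si128 == ptest); pulls in SSE2 as well

// (In)equality against the zero vector: ptest v, v sets ZF exactly when v is all
// zeros, so one source register is enough and no scratch registers are needed.
static bool IsZeroVector(__m128i v)
{
    return _mm_testz_si128(v, v) != 0;
}

// General equality: compare the lanes, then move the byte mask into an integer
// register (the scratch SIMD + scratch INT registers reserved above) and compare
// the mask against 0xFFFF.
static bool VectorsEqual(__m128i a, __m128i b)
{
    __m128i eq = _mm_cmpeq_epi32(a, b);
    return _mm_movemask_epi8(eq) == 0xFFFF;
}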
- if (!compiler->canUseAVX() || !simdTree->gtGetOp2()->IsIntegralConstVector(0)) - { - info->internalIntCount = 1; - info->addInternalCandidates(this, allRegs(TYP_INT)); - } - } + // These SIMD nodes only set the condition flags. + info->dstCount = 0; break; case SIMDIntrinsicDotProduct: diff --git a/src/jit/simdcodegenxarch.cpp b/src/jit/simdcodegenxarch.cpp index b7c9b290884e..429cff0bb3b4 100644 --- a/src/jit/simdcodegenxarch.cpp +++ b/src/jit/simdcodegenxarch.cpp @@ -2049,6 +2049,9 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode) case SIMDIntrinsicOpEquality: case SIMDIntrinsicOpInEquality: { + // We're only setting condition flags, if a 0/1 value is desired then Lowering should have inserted a SETCC. + assert(targetReg == REG_NA); + var_types simdType = op1->TypeGet(); // TODO-1stClassStructs: Temporary to minimize asmDiffs if (simdType == TYP_DOUBLE) @@ -2064,9 +2067,9 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode) } // On SSE4/AVX, we can generate optimal code for (in)equality against zero using ptest. - if ((compiler->getSIMDInstructionSet() >= InstructionSet_SSE3_4) && op2->IsIntegralConstVector(0)) + if (op2->isContained()) { - assert(op2->isContained()); + assert((compiler->getSIMDInstructionSet() >= InstructionSet_SSE3_4) && op2->IsIntegralConstVector(0)); inst_RV_RV(INS_ptest, op1->gtRegNum, op1->gtRegNum, simdType, emitActualTypeSize(simdType)); } else @@ -2103,22 +2106,7 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode) inst_RV_RV(ins, tmpReg1, otherReg, simdType, emitActualTypeSize(simdType)); } - regNumber intReg; - if (targetReg == REG_NA) - { - // If we are not materializing result into a register, - // we would have reserved an int type internal register. - intReg = simdNode->GetSingleTempReg(RBM_ALLINT); - } - else - { - // We can use targetReg for setting flags. - intReg = targetReg; - - // Must have not reserved any int type internal registers. - assert(simdNode->AvailableTempRegCount(RBM_ALLINT) == 0); - } - + regNumber intReg = simdNode->GetSingleTempReg(RBM_ALLINT); inst_RV_RV(INS_pmovmskb, intReg, tmpReg1, simdType, emitActualTypeSize(simdType)); // There's no pmovmskw/pmovmskd/pmovmskq but they're not needed anyway. Vector compare // instructions produce "all ones"/"all zeroes" components and pmovmskb extracts a @@ -2147,27 +2135,6 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode) } getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, intReg, mask); } - - if (targetReg != REG_NA) - { - // If we need to materialize result into a register, targetReg needs to - // be set to 1 on true and zero on false. - // Equality: - // cmp targetReg, 0xFFFFFFFF or 0xFFFF - // sete targetReg - // movzx targetReg, targetReg - // - // InEquality: - // cmp targetReg, 0xFFFFFFFF or 0xFFFF - // setne targetReg - // movzx targetReg, targetReg - // - assert(simdNode->TypeGet() == TYP_INT); - inst_RV((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) ? 
INS_sete : INS_setne, targetReg, - TYP_INT, EA_1BYTE); - // Set the higher bytes to 0 - inst_RV_RV(ins_Move_Extend(TYP_UBYTE, true), targetReg, targetReg, TYP_UBYTE, emitTypeSize(TYP_UBYTE)); - } } break; From f1f8cd25e0b79bfdf4e3b7a595fa22bbf7bdae26 Mon Sep 17 00:00:00 2001 From: Mike Danes Date: Mon, 18 Sep 2017 00:26:58 +0300 Subject: [PATCH 2/4] Reimplement compare flags reuse using SETCC/JCC --- src/jit/codegenarm64.cpp | 34 ------------- src/jit/codegenxarch.cpp | 39 --------------- src/jit/compiler.cpp | 2 +- src/jit/gentree.h | 4 +- src/jit/lower.cpp | 105 +++++++++++++++++++++++++++++---------- src/jit/lower.h | 2 +- src/jit/lowerarmarch.cpp | 24 +-------- src/jit/lowerxarch.cpp | 38 -------------- 8 files changed, 84 insertions(+), 164 deletions(-) diff --git a/src/jit/codegenarm64.cpp b/src/jit/codegenarm64.cpp index 8d21ae3b56ae..cf2d858d3d0d 100644 --- a/src/jit/codegenarm64.cpp +++ b/src/jit/codegenarm64.cpp @@ -3298,40 +3298,6 @@ void CodeGen::genCodeForCompare(GenTreeOp* tree) assert(!op1->isUsedFromMemory()); assert(!op2->isUsedFromMemory()); - // Case of op1 == 0 or op1 != 0: - // Optimize generation of 'test' instruction if op1 sets flags. - // - // This behavior is designed to match the unexpected behavior - // of XARCH genCompareInt(); - // - // TODO-Cleanup Review GTF_USE_FLAGS usage - // https://github.com/dotnet/coreclr/issues/14093 - if ((tree->gtFlags & GTF_USE_FLAGS) != 0) - { - // op1 must set flags - assert(op1->gtSetFlags()); - - // Must be compare against zero. - assert(!tree->OperIs(GT_TEST_EQ, GT_TEST_NE)); - assert(op2->IsIntegralConst(0)); - assert(op2->isContained()); - - // Just consume the operands - genConsumeOperands(tree); - - // No need to generate compare instruction since - // op1 sets flags - - // Are we evaluating this into a register? - if (targetReg != REG_NA) - { - genSetRegToCond(targetReg, tree); - genProduceReg(tree); - } - - return; - } - genConsumeOperands(tree); emitAttr cmpSize = EA_ATTR(genTypeSize(op1Type)); diff --git a/src/jit/codegenxarch.cpp b/src/jit/codegenxarch.cpp index 43327afc55ae..28db51162dee 100644 --- a/src/jit/codegenxarch.cpp +++ b/src/jit/codegenxarch.cpp @@ -6243,45 +6243,6 @@ void CodeGen::genCompareInt(GenTreePtr treeNode) var_types op2Type = op2->TypeGet(); regNumber targetReg = tree->gtRegNum; - // Case of op1 == 0 or op1 != 0: - // Optimize generation of 'test' instruction if op1 sets flags. - // - // TODO-Cleanup Review GTF_USE_FLAGS usage - // https://github.com/dotnet/coreclr/issues/14093 - // - // Note that if LSRA has inserted any GT_RELOAD/GT_COPY before - // op1, it will not modify the flags set by codegen of op1. - // Similarly op1 could also be reg-optional at its use and - // it was spilled after producing its result in a register. - // Spill code too will not modify the flags set by op1. - GenTree* realOp1 = op1->gtSkipReloadOrCopy(); - if ((tree->gtFlags & GTF_USE_FLAGS) != 0) - { - // op1 must set ZF and SF flags - assert(realOp1->gtSetZSFlags()); - assert(realOp1->gtSetFlags()); - - // Must be (in)equality against zero. - assert(tree->OperIs(GT_EQ, GT_NE)); - assert(op2->IsIntegralConst(0)); - assert(op2->isContained()); - - // Just consume the operands - genConsumeOperands(tree); - - // No need to generate test instruction since - // op1 sets flags - - // Are we evaluating this into a register? 
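// [Editor's illustration -- not part of the patch.] The code being removed above (and
// its ARM64 counterpart earlier in this patch) special-cased integer relops against
// zero whose first operand already set the flags; the rest of the patch reimplements
// that in Lowering using SETCC/JCC. A sketch of the kind of source pattern involved;
// whether the flags are actually reused depends on the IR shape reaching lowering
// (the arithmetic node and the compare must be adjacent), so this is illustrative only.

static int SumUntilZero(const int* a, const int* b, int count)
{
    int total = 0;
    for (int i = 0; i < count; i++)
    {
        int s = a[i] + b[i];
        if (s == 0) // the add already set ZF; ideally no separate test/cmp is emitted
        {
            break;
        }
        total += s;
    }
    return total;
}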
- if (targetReg != REG_NA) - { - genSetRegToCond(targetReg, tree); - genProduceReg(tree); - } - - return; - } - genConsumeOperands(tree); assert(!op1->isContainedIntOrIImmed()); diff --git a/src/jit/compiler.cpp b/src/jit/compiler.cpp index 552a6dff9480..6118e4548968 100644 --- a/src/jit/compiler.cpp +++ b/src/jit/compiler.cpp @@ -9781,11 +9781,11 @@ int cTreeFlagsIR(Compiler* comp, GenTree* tree) { chars += printf("[SPILLED_OP2]"); } -#endif if (tree->gtFlags & GTF_ZSF_SET) { chars += printf("[ZSF_SET]"); } +#endif #if FEATURE_SET_FLAGS if (tree->gtFlags & GTF_SET_FLAGS) { diff --git a/src/jit/gentree.h b/src/jit/gentree.h index 82e5d70b94ef..e8803e0e1d48 100644 --- a/src/jit/gentree.h +++ b/src/jit/gentree.h @@ -856,11 +856,11 @@ struct GenTree #ifdef LEGACY_BACKEND #define GTF_SPILLED_OPER 0x00000100 // op1 has been spilled #define GTF_SPILLED_OP2 0x00000200 // op2 has been spilled +#define GTF_ZSF_SET 0x00000400 // the zero(ZF) and sign(SF) flags set to the operand #else // !LEGACY_BACKEND #define GTF_NOREG_AT_USE 0x00000100 // tree node is in memory at the point of use #endif // !LEGACY_BACKEND -#define GTF_ZSF_SET 0x00000400 // the zero(ZF) and sign(SF) flags set to the operand #define GTF_SET_FLAGS 0x00000800 // Requires that codegen for this node set the flags. Use gtSetFlags() to check this flag. #define GTF_USE_FLAGS 0x00001000 // Indicates that this node uses the flags bits. @@ -2126,12 +2126,14 @@ struct GenTree bool gtSetFlags() const; bool gtRequestSetFlags(); +#ifdef LEGACY_BACKEND // Returns true if the codegen of this tree node // sets ZF and SF flags. bool gtSetZSFlags() const { return (gtFlags & GTF_ZSF_SET) != 0; } +#endif #ifdef DEBUG bool gtIsValid64RsltMul(); diff --git a/src/jit/lower.cpp b/src/jit/lower.cpp index 0b9f9b22f7e6..f6592f67a4c8 100644 --- a/src/jit/lower.cpp +++ b/src/jit/lower.cpp @@ -142,16 +142,6 @@ GenTree* Lowering::LowerNode(GenTree* node) ContainCheckBinary(node->AsOp()); break; -#ifdef _TARGET_XARCH_ - case GT_NEG: - // Codegen of this tree node sets ZF and SF flags. - if (!varTypeIsFloating(node)) - { - node->gtFlags |= GTF_ZSF_SET; - } - break; -#endif // _TARGET_XARCH_ - case GT_MUL: case GT_MULHI: #if defined(_TARGET_X86_) && !defined(LEGACY_BACKEND) @@ -189,8 +179,7 @@ GenTree* Lowering::LowerNode(GenTree* node) case GT_TEST_NE: case GT_CMP: case GT_JCMP: - LowerCompare(node); - break; + return LowerCompare(node); case GT_JTRUE: ContainCheckJTrue(node->AsOp()); @@ -2148,6 +2137,9 @@ GenTree* Lowering::LowerTailCallViaHelper(GenTreeCall* call, GenTree* callTarget // Arguments: // cmp - the compare node // +// Return Value: +// The next node to lower. +// // Notes: // - Decomposes long comparisons that feed a GT_JTRUE (32 bit specific). // - Decomposes long comparisons that produce a value (X86 specific). @@ -2156,8 +2148,11 @@ GenTree* Lowering::LowerTailCallViaHelper(GenTreeCall* call, GenTree* callTarget // - Transform cmp(and(x, y), 0) into test(x, y) (XARCH/Arm64 specific but could // be used for ARM as well if support for GT_TEST_EQ/GT_TEST_NE is added). // - Transform TEST(x, LSH(1, y)) into BT(x, y) (XARCH specific) +// - Transform RELOP(OP, 0) into SETCC(OP) or JCC(OP) if OP can set the +// condition flags appropriately (XARCH/ARM64 specific but could be extended +// to ARM32 as well if ARM32 codegen supports GTF_SET_FLAGS). 
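// [Editor's illustration -- not part of the patch.] Of the transforms listed in the
// comment above, TEST(x, LSH(1, y)) -> BT(x, y) is the bit-test peephole. A sketch of
// the source shape it targets; hedged, since the shift must actually reach lowering
// as GT_LSH(1, y) feeding a GT_TEST_EQ/GT_TEST_NE for the BT form to be chosen.

static bool IsBitSet(unsigned value, unsigned bit)
{
    return (value & (1u << bit)) != 0; // candidate for a single "bt value, bit" on xarch
}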
-void Lowering::LowerCompare(GenTree* cmp) +GenTree* Lowering::LowerCompare(GenTree* cmp) { #ifndef _TARGET_64BIT_ if (cmp->gtGetOp1()->TypeGet() == TYP_LONG) @@ -2363,7 +2358,7 @@ void Lowering::LowerCompare(GenTree* cmp) cmp->AsCC()->gtCondition = condition; } - return; + return cmp->gtNext; } #endif @@ -2567,11 +2562,10 @@ void Lowering::LowerCompare(GenTree* cmp) } } } -#endif // defined(_TARGET_XARCH_) || defined(_TARGET_ARM64_) -#ifdef _TARGET_XARCH_ if (cmp->OperIs(GT_TEST_EQ, GT_TEST_NE)) { +#ifdef _TARGET_XARCH_ // // Transform TEST_EQ|NE(x, LSH(1, y)) into BT(x, y) when possible. Using BT // results in smaller and faster code. It also doesn't have special register @@ -2614,10 +2608,76 @@ void Lowering::LowerCompare(GenTree* cmp) cc->gtFlags |= GTF_USE_FLAGS | GTF_UNSIGNED; - return; + return cmp->gtNext; + } +#endif // _TARGET_XARCH_ + } +#ifdef _TARGET_XARCH_ + else if (cmp->OperIs(GT_EQ, GT_NE)) +#else // _TARGET_ARM64_ + else +#endif + { + GenTree* op1 = cmp->gtGetOp1(); + GenTree* op2 = cmp->gtGetOp2(); + + // TODO-CQ: right now the below peep is inexpensive and gets the benefit in most + // cases because in majority of cases op1, op2 and cmp would be in that order in + // execution. In general we should be able to check that all the nodes that come + // after op1 do not modify the flags so that it is safe to avoid generating a + // test instruction. + + if (op2->IsIntegralConst(0) && (op1->gtNext == op2) && (op2->gtNext == cmp) && +#ifdef _TARGET_XARCH_ + op1->OperIs(GT_AND, GT_OR, GT_XOR, GT_ADD, GT_SUB, GT_NEG)) +#else // _TARGET_ARM64_ + op1->OperIs(GT_AND, GT_ADD, GT_SUB)) +#endif + { + op1->gtFlags |= GTF_SET_FLAGS; + op1->SetUnusedValue(); + + BlockRange().Remove(op2); + + GenTree* next = cmp->gtNext; + GenTree* cc; + genTreeOps ccOp; + LIR::Use cmpUse; + + // Fast check for the common case - relop used by a JTRUE that immediately follows it. + if ((next != nullptr) && next->OperIs(GT_JTRUE) && (next->gtGetOp1() == cmp)) + { + cc = next; + ccOp = GT_JCC; + next = nullptr; + BlockRange().Remove(cmp); + } + else if (BlockRange().TryGetUse(cmp, &cmpUse) && cmpUse.User()->OperIs(GT_JTRUE)) + { + cc = cmpUse.User(); + ccOp = GT_JCC; + next = nullptr; + BlockRange().Remove(cmp); + } + else // The relop is not used by a JTRUE or it is not used at all. + { + // Transform the relop node it into a SETCC. If it's not used we could remove + // it completely but that means doing more work to handle a rare case. + cc = cmp; + ccOp = GT_SETCC; + } + + genTreeOps condition = cmp->OperGet(); + cc->ChangeOper(ccOp); + cc->AsCC()->gtCondition = condition; + cc->gtFlags |= GTF_USE_FLAGS | (cmp->gtFlags & GTF_UNSIGNED); + + return next; } } +#endif // defined(_TARGET_XARCH_) || defined(_TARGET_ARM64_) +#ifdef _TARGET_XARCH_ if (cmp->gtGetOp1()->TypeGet() == cmp->gtGetOp2()->TypeGet()) { if (varTypeIsSmall(cmp->gtGetOp1()->TypeGet()) && varTypeIsUnsigned(cmp->gtGetOp1()->TypeGet())) @@ -2635,6 +2695,7 @@ void Lowering::LowerCompare(GenTree* cmp) } #endif // _TARGET_XARCH_ ContainCheckCompare(cmp->AsOp()); + return cmp->gtNext; } // Lower "jmp " tail call to insert PInvoke method epilog if required. @@ -5377,16 +5438,6 @@ void Lowering::ContainCheckNode(GenTree* node) ContainCheckBinary(node->AsOp()); break; -#ifdef _TARGET_XARCH_ - case GT_NEG: - // Codegen of this tree node sets ZF and SF flags. 
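// [Editor's illustration -- not part of the patch.] The new LowerCompare code above
// covers two consumers of a flags-setting relop: a JTRUE user becomes GT_JCC, and any
// other use turns the relop itself into GT_SETCC so codegen can materialize the bool
// with setcc/movzx off the flags already set by op1. A sketch of the SETCC-shaped
// source pattern; the "sub; sete; movzx" sequence is the intent, not a guarantee,
// since it depends on the IR that reaches lowering.

static bool Balanced(int credits, int debits)
{
    // SUB feeding EQ against 0, consumed by a return rather than a branch:
    // roughly the shape that is rewritten to SETCC(EQ) reusing the SUB's flags.
    return (credits - debits) == 0;
}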
- if (!varTypeIsFloating(node)) - { - node->gtFlags |= GTF_ZSF_SET; - } - break; -#endif // _TARGET_XARCH_ - #if defined(_TARGET_X86_) case GT_MUL_LONG: #endif diff --git a/src/jit/lower.h b/src/jit/lower.h index ce6bb948313e..2f8219eb9d10 100644 --- a/src/jit/lower.h +++ b/src/jit/lower.h @@ -137,7 +137,7 @@ class Lowering : public Phase // Call Lowering // ------------------------------ void LowerCall(GenTree* call); - void LowerCompare(GenTree* tree); + GenTree* LowerCompare(GenTree* tree); void LowerJmpMethod(GenTree* jmp); void LowerRet(GenTree* ret); GenTree* LowerDelegateInvoke(GenTreeCall* call); diff --git a/src/jit/lowerarmarch.cpp b/src/jit/lowerarmarch.cpp index 8986f2f23e82..22573407b40c 100644 --- a/src/jit/lowerarmarch.cpp +++ b/src/jit/lowerarmarch.cpp @@ -700,29 +700,7 @@ void Lowering::ContainCheckCompare(GenTreeOp* cmp) GenTreePtr op1 = cmp->gtOp.gtOp1; GenTreePtr op2 = cmp->gtOp.gtOp2; - // If op1 codegen can set flags op2 is an immediate 0 - // we don't need to generate cmp instruction, - // provided we don't have another GenTree node between op1 - // and cmp that could potentially modify flags. - // - // TODO-CQ: right now the below peep is inexpensive and - // gets the benefit in most of cases because in majority - // of cases op1, op2 and cmp would be in that order in - // execution. In general we should be able to check that all - // the nodes that come after op1 in execution order do not - // modify the flags so that it is safe to avoid generating a - // test instruction. Such a check requires that on each - // GenTree node we need to set the info whether its codegen - // will modify flags. - if (op2->IsIntegralConst(0) && (op1->gtNext == op2) && (op2->gtNext == cmp) && - !cmp->OperIs(GT_TEST_EQ, GT_TEST_NE) && op1->OperIs(GT_ADD, GT_AND, GT_SUB)) - { - assert(!op1->gtSetFlags()); - op1->gtFlags |= GTF_SET_FLAGS; - cmp->gtFlags |= GTF_USE_FLAGS; - } - - if (!varTypeIsFloating(cmp) && op2->IsCnsIntOrI() && ((cmp->gtFlags & GTF_USE_FLAGS) == 0)) + if (!varTypeIsFloating(cmp) && op2->IsCnsIntOrI()) { LIR::Use cmpUse; bool useJCMP = false; diff --git a/src/jit/lowerxarch.cpp b/src/jit/lowerxarch.cpp index 834cd34fda3f..415c10619b44 100644 --- a/src/jit/lowerxarch.cpp +++ b/src/jit/lowerxarch.cpp @@ -1870,41 +1870,6 @@ void Lowering::ContainCheckCompare(GenTreeOp* cmp) { MakeSrcContained(cmp, op1); } - // If op1 codegen sets ZF and SF flags and ==/!= against - // zero, we don't need to generate test instruction, - // provided we don't have another GenTree node between op1 - // and cmp that could potentially modify flags. - // - // TODO-CQ: right now the below peep is inexpensive and - // gets the benefit in most of cases because in majority - // of cases op1, op2 and cmp would be in that order in - // execution. In general we should be able to check that all - // the nodes that come after op1 in execution order do not - // modify the flags so that it is safe to avoid generating a - // test instruction. Such a check requires that on each - // GenTree node we need to set the info whether its codegen - // will modify flags. - // - // TODO-CQ: We can optimize compare against zero in the - // following cases by generating the branch as indicated - // against each case. 
- // 1) unsigned compare - // < 0 - always FALSE - // <= 0 - ZF=1 and jne - // > 0 - ZF=0 and je - // >= 0 - always TRUE - // - // 2) signed compare - // < 0 - SF=1 and js - // >= 0 - SF=0 and jns - else if (cmp->OperIs(GT_EQ, GT_NE) && op1->gtSetZSFlags() && op2->IsIntegralConst(0) && - (op1->gtNext == op2) && (op2->gtNext == cmp)) - { - // Require codegen of op1 to set the flags. - assert(!op1->gtSetFlags()); - op1->gtFlags |= GTF_SET_FLAGS; - cmp->gtFlags |= GTF_USE_FLAGS; - } else { SetRegOptional(op1); @@ -2077,9 +2042,6 @@ void Lowering::ContainCheckBinary(GenTreeOp* node) return; } - // Codegen of these tree nodes sets ZF and SF flags. - node->gtFlags |= GTF_ZSF_SET; - // We're not marking a constant hanging on the left of an add // as containable so we assign it to a register having CQ impact. // TODO-XArch-CQ: Detect this case and support both generating a single instruction From 848fd7427f4abc718ece9a7d74deb7ea70720c3b Mon Sep 17 00:00:00 2001 From: Mike Danes Date: Sat, 30 Sep 2017 00:34:07 +0300 Subject: [PATCH 3/4] Extend flag reuse optimization to all relops --- src/jit/codegenxarch.cpp | 3 ++- src/jit/lower.cpp | 6 ++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/jit/codegenxarch.cpp b/src/jit/codegenxarch.cpp index 28db51162dee..a51243626bdd 100644 --- a/src/jit/codegenxarch.cpp +++ b/src/jit/codegenxarch.cpp @@ -887,7 +887,8 @@ void CodeGen::genCodeForBinary(GenTree* treeNode) } // try to use an inc or dec - if (oper == GT_ADD && !varTypeIsFloating(treeNode) && src->isContainedIntOrIImmed() && !treeNode->gtOverflowEx()) + if (oper == GT_ADD && !varTypeIsFloating(treeNode) && src->isContainedIntOrIImmed() && !treeNode->gtOverflowEx() && + !treeNode->gtSetFlags()) { if (src->IsIntegralConst(1)) { diff --git a/src/jit/lower.cpp b/src/jit/lower.cpp index f6592f67a4c8..1856c2d2fc43 100644 --- a/src/jit/lower.cpp +++ b/src/jit/lower.cpp @@ -2612,12 +2612,10 @@ GenTree* Lowering::LowerCompare(GenTree* cmp) } #endif // _TARGET_XARCH_ } -#ifdef _TARGET_XARCH_ - else if (cmp->OperIs(GT_EQ, GT_NE)) -#else // _TARGET_ARM64_ else -#endif { + assert(cmp->OperIs(GT_EQ, GT_NE, GT_LE, GT_LT, GT_GE, GT_GT)); + GenTree* op1 = cmp->gtGetOp1(); GenTree* op2 = cmp->gtGetOp2(); From 5402ec55b132188d9a04f30af9e814695409f913 Mon Sep 17 00:00:00 2001 From: Mike Danes Date: Sun, 1 Oct 2017 13:53:42 +0300 Subject: [PATCH 4/4] Move JCMP transform to LowerJTrue Unlike many other relop transforms we do this one is only triggerred by the presence of a conditional branch (JTRUE) so it makes more sense to do it when lowering JTRUE nodes, avoids unnecessary calls to TryGetUse. --- src/jit/lower.cpp | 64 ++++++++++++++++++++++++++++++++++++++-- src/jit/lower.h | 1 + src/jit/lowerarmarch.cpp | 41 +------------------------ 3 files changed, 63 insertions(+), 43 deletions(-) diff --git a/src/jit/lower.cpp b/src/jit/lower.cpp index 1856c2d2fc43..e445baf5f12c 100644 --- a/src/jit/lower.cpp +++ b/src/jit/lower.cpp @@ -178,12 +178,10 @@ GenTree* Lowering::LowerNode(GenTree* node) case GT_TEST_EQ: case GT_TEST_NE: case GT_CMP: - case GT_JCMP: return LowerCompare(node); case GT_JTRUE: - ContainCheckJTrue(node->AsOp()); - break; + return LowerJTrue(node->AsOp()); case GT_JMP: LowerJmpMethod(node); @@ -2696,6 +2694,66 @@ GenTree* Lowering::LowerCompare(GenTree* cmp) return cmp->gtNext; } +//------------------------------------------------------------------------ +// Lowering::LowerJTrue: Lowers a JTRUE node. 
+// +// Arguments: +// jtrue - the JTRUE node +// +// Return Value: +// The next node to lower (usually nullptr). +// +// Notes: +// On ARM64 this may remove the JTRUE node and transform its associated +// relop into a JCMP node. +// +GenTree* Lowering::LowerJTrue(GenTreeOp* jtrue) +{ +#ifdef _TARGET_ARM64_ + GenTree* relop = jtrue->gtGetOp1(); + GenTree* relopOp2 = relop->gtOp.gtGetOp2(); + + if ((relop->gtNext == jtrue) && relopOp2->IsCnsIntOrI()) + { + bool useJCMP = false; + unsigned flags = 0; + + if (relop->OperIs(GT_EQ, GT_NE) && relopOp2->IsIntegralConst(0)) + { + // Codegen will use cbz or cbnz in codegen which do not affect the flag register + flags = relop->OperIs(GT_EQ) ? GTF_JCMP_EQ : 0; + useJCMP = true; + } + else if (relop->OperIs(GT_TEST_EQ, GT_TEST_NE) && isPow2(relopOp2->AsIntCon()->IconValue())) + { + // Codegen will use tbz or tbnz in codegen which do not affect the flag register + flags = GTF_JCMP_TST | (relop->OperIs(GT_TEST_EQ) ? GTF_JCMP_EQ : 0); + useJCMP = true; + } + + if (useJCMP) + { + relop->SetOper(GT_JCMP); + relop->gtFlags &= ~(GTF_JCMP_TST | GTF_JCMP_EQ); + relop->gtFlags |= flags; + relop->gtLsraInfo.isNoRegCompare = true; + + relopOp2->SetContained(); + + BlockRange().Remove(jtrue); + + assert(relop->gtNext == nullptr); + return nullptr; + } + } +#endif // _TARGET_ARM64_ + + ContainCheckJTrue(jtrue); + + assert(jtrue->gtNext == nullptr); + return nullptr; +} + // Lower "jmp " tail call to insert PInvoke method epilog if required. void Lowering::LowerJmpMethod(GenTree* jmp) { diff --git a/src/jit/lower.h b/src/jit/lower.h index 2f8219eb9d10..fd0cb20e2a52 100644 --- a/src/jit/lower.h +++ b/src/jit/lower.h @@ -138,6 +138,7 @@ class Lowering : public Phase // ------------------------------ void LowerCall(GenTree* call); GenTree* LowerCompare(GenTree* tree); + GenTree* LowerJTrue(GenTreeOp* jtrue); void LowerJmpMethod(GenTree* jmp); void LowerRet(GenTree* ret); GenTree* LowerDelegateInvoke(GenTreeCall* call); diff --git a/src/jit/lowerarmarch.cpp b/src/jit/lowerarmarch.cpp index 22573407b40c..bbc879f6b6b9 100644 --- a/src/jit/lowerarmarch.cpp +++ b/src/jit/lowerarmarch.cpp @@ -694,46 +694,7 @@ void Lowering::ContainCheckCast(GenTreeCast* node) // void Lowering::ContainCheckCompare(GenTreeOp* cmp) { - if (CheckImmedAndMakeContained(cmp, cmp->gtOp2)) - { -#ifdef _TARGET_ARM64_ - GenTreePtr op1 = cmp->gtOp.gtOp1; - GenTreePtr op2 = cmp->gtOp.gtOp2; - - if (!varTypeIsFloating(cmp) && op2->IsCnsIntOrI()) - { - LIR::Use cmpUse; - bool useJCMP = false; - uint64_t flags = 0; - - if (cmp->OperIs(GT_EQ, GT_NE) && op2->IsIntegralConst(0) && BlockRange().TryGetUse(cmp, &cmpUse) && - cmpUse.User()->OperIs(GT_JTRUE)) - { - // Codegen will use cbz or cbnz in codegen which do not affect the flag register - flags = cmp->OperIs(GT_EQ) ? GTF_JCMP_EQ : 0; - useJCMP = true; - } - else if (cmp->OperIs(GT_TEST_EQ, GT_TEST_NE) && isPow2(op2->gtIntCon.IconValue()) && - BlockRange().TryGetUse(cmp, &cmpUse) && cmpUse.User()->OperIs(GT_JTRUE)) - { - // Codegen will use tbz or tbnz in codegen which do not affect the flag register - flags = GTF_JCMP_TST | (cmp->OperIs(GT_TEST_EQ) ? GTF_JCMP_EQ : 0); - useJCMP = true; - } - - if (useJCMP) - { - cmp->gtLsraInfo.isNoRegCompare = true; - cmp->SetOper(GT_JCMP); - - cmp->gtFlags &= ~(GTF_JCMP_TST | GTF_JCMP_EQ); - cmp->gtFlags |= flags; - - BlockRange().Remove(cmpUse.User()); - } - } -#endif // _TARGET_ARM64_ - } + CheckImmedAndMakeContained(cmp, cmp->gtOp2); } //------------------------------------------------------------------------
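// [Editor's illustration -- not part of the patch.] On ARM64 the LowerJTrue code moved
// above rewrites JTRUE(EQ/NE(x, 0)) and JTRUE(TEST_EQ/TEST_NE(x, 2^n)) into GT_JCMP so
// that codegen can use cbz/cbnz and tbz/tbnz, which do not touch the flag register.
// A sketch of the source shapes involved; the instruction choices are the ones named
// in the comments above, but actual selection is up to the JIT.

static int Classify(unsigned x)
{
    if (x == 0) // JTRUE(EQ(x, 0)): cbz/cbnz candidate
    {
        return 0;
    }
    if ((x & 0x80u) != 0) // JTRUE(TEST_NE(x, 1 << 7)): tbz/tbnz candidate
    {
        return 1;
    }
    return 2;
}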