diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index dc5c67ec347c8..94d67a06ace08 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -46,6 +46,9 @@ class CodeGen final : public CodeGenInterface CORINFO_FIELD_HANDLE absBitmaskFlt; CORINFO_FIELD_HANDLE absBitmaskDbl; + // Bit mask used in zeroing the 3rd element of a SIMD12 + CORINFO_FIELD_HANDLE zroSimd12Elm3; + // Bit mask used in U8 -> double conversion to adjust the result. CORINFO_FIELD_HANDLE u8ToDblBitmask; @@ -925,6 +928,8 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX void genSimdUpperSave(GenTreeIntrinsic* node); void genSimdUpperRestore(GenTreeIntrinsic* node); + void genSimd12UpperClear(regNumber tgtReg); + // TYP_SIMD12 (i.e Vector3 of size 12 bytes) is not a hardware supported size and requires // two reads/writes on 64-bit targets. These routines abstract reading/writing of Vector3 // values through an indirection. Note that Vector3 locals allocated on stack would have @@ -1532,6 +1537,8 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX void inst_RV_RV_IV(instruction ins, emitAttr size, regNumber reg1, regNumber reg2, unsigned ival); void inst_RV_TT_IV(instruction ins, emitAttr attr, regNumber reg1, GenTree* rmOp, int ival); void inst_RV_RV_TT(instruction ins, emitAttr size, regNumber targetReg, regNumber op1Reg, GenTree* op2, bool isRMW); + void inst_RV_RV_TT_IV( + instruction ins, emitAttr size, regNumber targetReg, regNumber op1Reg, GenTree* op2, int8_t ival, bool isRMW); #endif void inst_set_SV_var(GenTree* tree); diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp index 0133175b5df32..721594e93e926 100644 --- a/src/coreclr/jit/codegencommon.cpp +++ b/src/coreclr/jit/codegencommon.cpp @@ -4474,7 +4474,7 @@ void CodeGen::genZeroInitFltRegs(const regMaskTP& initFltRegs, const regMaskTP& } #elif defined(TARGET_XARCH) // XORPS is the fastest and smallest way to 
initialize a XMM register to zero. - inst_RV_RV(INS_xorps, reg, reg, TYP_DOUBLE); + GetEmitter()->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, reg, reg, reg); dblInitReg = reg; #elif defined(TARGET_ARM64) // We will just zero out the entire vector register. This sets it to a double/float zero value @@ -4514,7 +4514,7 @@ void CodeGen::genZeroInitFltRegs(const regMaskTP& initFltRegs, const regMaskTP& } #elif defined(TARGET_XARCH) // XORPS is the fastest and smallest way to initialize a XMM register to zero. - inst_RV_RV(INS_xorps, reg, reg, TYP_DOUBLE); + GetEmitter()->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, reg, reg, reg); fltInitReg = reg; #elif defined(TARGET_ARM64) // We will just zero out the entire vector register. This sets it to a double/float zero value diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index a864529c9e004..c0b2404cefb58 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -465,12 +465,12 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre if (tree->IsFloatPositiveZero()) { // A faster/smaller way to generate Zero - emit->emitIns_R_R(INS_xorps, size, targetReg, targetReg); + emit->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, targetReg, targetReg, targetReg); } else if (tree->IsFloatAllBitsSet()) { // A faster/smaller way to generate AllBitsSet - emit->emitIns_R_R(INS_pcmpeqd, size, targetReg, targetReg); + emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, EA_16BYTE, targetReg, targetReg, targetReg); } else { @@ -496,9 +496,10 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre case EA_8BYTE: case EA_16BYTE: { - emit->emitIns_R_R(INS_pcmpeqd, attr, targetReg, targetReg); + emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, EA_16BYTE, targetReg, targetReg, targetReg); return; } + #if defined(FEATURE_SIMD) case EA_32BYTE: { @@ -528,27 +529,32 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre if (vecCon->IsZero()) { 
- bool isSupported; - switch (attr) { - case EA_32BYTE: + case EA_8BYTE: + case EA_16BYTE: { - isSupported = compiler->compOpportunisticallyDependsOn(InstructionSet_AVX); - break; + emit->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, targetReg, targetReg, targetReg); + return; } - case EA_64BYTE: + case EA_32BYTE: { - isSupported = compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512F); + if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX)) + { + emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg); + return; + } break; } - case EA_8BYTE: - case EA_16BYTE: + case EA_64BYTE: { - assert((attr == EA_8BYTE) || (attr == EA_16BYTE)); - isSupported = true; + if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512F)) + { + emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg); + return; + } break; } @@ -557,16 +563,6 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre unreached(); } } - - if (isSupported) - { -#if defined(FEATURE_SIMD) - emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg); -#else - emit->emitIns_R_R(INS_xorps, attr, targetReg, targetReg); -#endif // FEATURE_SIMD - break; - } } switch (tree->TypeGet()) @@ -1250,37 +1246,46 @@ void CodeGen::genSIMDSplitReturn(GenTree* src, ReturnTypeDesc* retTypeDesc) #ifdef TARGET_AMD64 assert(src->TypeIs(TYP_SIMD16)); assert(srcIsFloatReg == dstIsFloatReg); - if (opReg != reg0 && opReg != reg1) - { - // Operand reg is different from return regs. - // Copy opReg to reg0 and let it to be handled by one of the - // two cases below. - inst_Mov(TYP_SIMD16, reg0, opReg, /* canSkip */ false); - opReg = reg0; - } - - if (opReg == reg0) - { - assert(opReg != reg1); - // reg1 = opReg. - inst_Mov(TYP_SIMD16, reg1, opReg, /* canSkip */ false); - } - else - { - assert(opReg == reg1); - // reg0 = opReg. 
- inst_Mov(TYP_SIMD16, reg0, opReg, /* canSkip */ false); - } - // reg0 - already has required 8-byte in bit position [63:0]. - // swap upper and lower 8-bytes of reg1 so that desired 8-byte is in bit position [63:0]. - inst_RV_RV_IV(INS_shufpd, EA_16BYTE, reg1, reg1, 0x01); + assert(reg0 != reg1); + // We can have one of three scenarios here. + // + // First, all three registers are different: + // opReg = xmm0 + // reg0 = xmm1 + // reg1 = xmm2 + // We can then generate two instructions: + // movaps xmm1, xmm0 ; reg0[63:00] = opReg[ 63:00] + // movhlps xmm2, xmm0 ; reg1[63:00] = opReg[127:64] + // + // Second we have opReg and reg0 as the same register: + // opReg = xmm0 + // reg0 = xmm0 + // reg1 = xmm2 + // We can then generate one instruction: + // movhlps xmm2, xmm0 ; reg1[63:00] = opReg[127:64] + // + // Third we have opReg and reg1 as the same register: + // opReg = xmm0 + // reg0 = xmm1 + // reg1 = xmm0 + // We can then generate two instructions: + // movaps xmm1, xmm0 ; reg0[63:00] = opReg[ 63:00] + // movhlps xmm0, xmm0 ; reg1[63:00] = opReg[127:64] + + // Move opReg into reg0, if not already there + inst_Mov(TYP_SIMD16, reg0, opReg, /* canSkip */ true); + + // Move upper 64-bits of opReg into reg1 + GetEmitter()->emitIns_SIMD_R_R_R(INS_movhlps, EA_16BYTE, reg1, reg1, opReg); #else // TARGET_X86 assert(src->TypeIs(TYP_SIMD8)); assert(srcIsFloatReg != dstIsFloatReg); assert((reg0 == REG_EAX) && (reg1 == REG_EDX)); + // reg0 = opReg[31:0] inst_Mov(TYP_INT, reg0, opReg, /* canSkip */ false); + // reg1 = opReg[63:32] if (compiler->compOpportunisticallyDependsOn(InstructionSet_SSE41)) { @@ -1288,8 +1293,10 @@ void CodeGen::genSIMDSplitReturn(GenTree* src, ReturnTypeDesc* retTypeDesc) } else { + bool isRMW = !compiler->canUseVexEncoding(); int8_t shuffleMask = 1; // we only need [61:32]->[31:0], the rest is not read.
- inst_RV_TT_IV(INS_pshufd, EA_8BYTE, opReg, src, shuffleMask); + + inst_RV_RV_TT_IV(INS_pshufd, EA_8BYTE, opReg, opReg, src, shuffleMask, isRMW); inst_Mov(TYP_INT, reg1, opReg, /* canSkip */ false); } #endif // TARGET_X86 @@ -2192,36 +2199,28 @@ void CodeGen::genMultiRegStoreToSIMDLocal(GenTreeLclVar* lclNode) // This is a case where the two 8-bytes that comprise the operand are in // two different xmm registers and need to be assembled into a single // xmm register. - if (targetReg != reg0 && targetReg != reg1) - { - // targetReg = reg0; - // targetReg[127:64] = reg1[127:64] - inst_Mov(TYP_DOUBLE, targetReg, reg0, /* canSkip */ false); - inst_RV_RV_IV(INS_shufpd, EA_16BYTE, targetReg, reg1, 0x00); - } - else if (targetReg == reg0) + + if (targetReg != reg1) { - // (elided) targetReg = reg0 - // targetReg[127:64] = reg1[127:64] - inst_RV_RV_IV(INS_shufpd, EA_16BYTE, targetReg, reg1, 0x00); + GetEmitter()->emitIns_SIMD_R_R_R(INS_movlhps, EA_16BYTE, targetReg, reg0, reg1); } else { - assert(targetReg == reg1); // We need two shuffles to achieve this // First: - // targetReg[63:0] = targetReg[63:0] + // targetReg[ 63:00] = reg1[63:0] // targetReg[127:64] = reg0[63:0] // // Second: - // targetReg[63:0] = targetReg[127:64] - // targetReg[127:64] = targetReg[63:0] + // targetReg[ 63:00] = targetReg[127:64] + // targetReg[127:64] = targetReg[ 63:00] // // Essentially copy low 8-bytes from reg0 to high 8-bytes of targetReg // and next swap low and high 8-bytes of targetReg to have them // rearranged in the right order. 
- inst_RV_RV_IV(INS_shufpd, EA_16BYTE, targetReg, reg0, 0x00); - inst_RV_RV_IV(INS_shufpd, EA_16BYTE, targetReg, targetReg, 0x01); + + GetEmitter()->emitIns_SIMD_R_R_R(INS_movlhps, EA_16BYTE, targetReg, reg1, reg0); + GetEmitter()->emitIns_SIMD_R_R_R_I(INS_shufpd, EA_16BYTE, targetReg, targetReg, reg1, 0x01); } genProduceReg(lclNode); #elif defined(TARGET_X86) @@ -3184,7 +3183,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) { // If the source is constant 0 then always use xorps, it's faster // than copying the constant from a GPR to a XMM register. - emit->emitIns_R_R(INS_xorps, EA_ATTR(regSize), srcXmmReg, srcXmmReg); + emit->emitIns_SIMD_R_R_R(INS_xorps, EA_ATTR(regSize), srcXmmReg, srcXmmReg, srcXmmReg); zeroing = true; } else @@ -3192,16 +3191,31 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) // TODO-AVX512-ARCH: Enable AVX-512 for non-zeroing initblk. regSize = min(regSize, YMM_REGSIZE_BYTES); - emit->emitIns_Mov(INS_movd, EA_PTRSIZE, srcXmmReg, srcIntReg, /* canSkip */ false); - emit->emitIns_R_R(INS_punpckldq, EA_16BYTE, srcXmmReg, srcXmmReg); + if (compiler->compOpportunisticallyDependsOn(InstructionSet_Vector512)) + { + emit->emitIns_R_R(INS_vpbroadcastd_gpr, EA_ATTR(regSize), srcXmmReg, srcIntReg); + } + else if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX2)) + { + emit->emitIns_Mov(INS_movd, EA_PTRSIZE, srcXmmReg, srcIntReg, /* canSkip */ false); + emit->emitIns_R_R(INS_vpbroadcastd, EA_ATTR(regSize), srcXmmReg, srcXmmReg); + } + else + { + emit->emitIns_Mov(INS_movd, EA_PTRSIZE, srcXmmReg, srcIntReg, /* canSkip */ false); + + emit->emitIns_SIMD_R_R_R(INS_punpckldq, EA_16BYTE, srcXmmReg, srcXmmReg, srcXmmReg); + #ifdef TARGET_X86 - // For x86, we need one more to convert it from 8 bytes to 16 bytes. - emit->emitIns_R_R(INS_punpckldq, EA_16BYTE, srcXmmReg, srcXmmReg); + // For x86, we need one more to convert it from 8 bytes to 16 bytes. 
+ emit->emitIns_SIMD_R_R_R(INS_punpckldq, EA_16BYTE, srcXmmReg, srcXmmReg, srcXmmReg); #endif - if (regSize == YMM_REGSIZE_BYTES) - { - // Extend the bytes in the lower lanes to the upper lanes - emit->emitIns_R_R_R_I(INS_vinsertf128, EA_32BYTE, srcXmmReg, srcXmmReg, srcXmmReg, 1); + + if (regSize == YMM_REGSIZE_BYTES) + { + // Extend the bytes in the lower lanes to the upper lanes + emit->emitIns_R_R_R_I(INS_vinsertf128, EA_32BYTE, srcXmmReg, srcXmmReg, srcXmmReg, 1); + } } } @@ -4080,12 +4094,7 @@ void CodeGen::genClearStackVec3ArgUpperBits() // Assume that for x64 linux, an argument is fully in registers // or fully on stack. regNumber argReg = varDsc->GetOtherArgReg(); - - // Clear the upper 32 bits by two shift instructions. - // argReg = argReg << 96 - GetEmitter()->emitIns_R_I(INS_pslldq, emitActualTypeSize(TYP_SIMD12), argReg, 12); - // argReg = argReg >> 96 - GetEmitter()->emitIns_R_I(INS_psrldq, emitActualTypeSize(TYP_SIMD12), argReg, 12); + genSimd12UpperClear(argReg); } } } @@ -6119,18 +6128,14 @@ void CodeGen::genCall(GenTreeCall* call) inst_Mov(regType, allocatedReg, returnReg, /* canSkip */ true); } -#ifdef FEATURE_SIMD +#if defined(FEATURE_SIMD) // A Vector3 return value is stored in xmm0 and xmm1. // RyuJIT assumes that the upper unused bits of xmm1 are cleared but // the native compiler doesn't guarantee it. if (call->IsUnmanaged() && (returnType == TYP_SIMD12)) { returnReg = retTypeDesc->GetABIReturnReg(1); - // Clear the upper 32 bits by two shift instructions. 
- // retReg = retReg << 96 - // retReg = retReg >> 96 - GetEmitter()->emitIns_R_I(INS_pslldq, emitActualTypeSize(TYP_SIMD12), returnReg, 12); - GetEmitter()->emitIns_R_I(INS_psrldq, emitActualTypeSize(TYP_SIMD12), returnReg, 12); + genSimd12UpperClear(returnReg); } #endif // FEATURE_SIMD } @@ -7455,7 +7460,12 @@ void CodeGen::genFloatToFloatCast(GenTree* treeNode) else { instruction ins = ins_FloatConv(dstType, srcType, emitTypeSize(dstType)); - GetEmitter()->emitInsBinary(ins, emitTypeSize(dstType), treeNode, op1); + + // floating-point to floating-point conversions all have RMW semantics if VEX support + // is not available + + bool isRMW = !compiler->canUseVexEncoding(); + inst_RV_RV_TT(ins, emitTypeSize(dstType), targetReg, targetReg, op1, isRMW); } genProduceReg(treeNode); @@ -7544,12 +7554,18 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode) // cvtsi2ss/sd instruction. genConsumeOperands(treeNode->AsOp()); - GetEmitter()->emitIns_R_R(INS_xorps, EA_4BYTE, treeNode->GetRegNum(), treeNode->GetRegNum()); + GetEmitter()->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, treeNode->GetRegNum(), treeNode->GetRegNum(), + treeNode->GetRegNum()); // Note that here we need to specify srcType that will determine // the size of source reg/mem operand and rex.w prefix. instruction ins = ins_FloatConv(dstType, TYP_INT, emitTypeSize(srcType)); - GetEmitter()->emitInsBinary(ins, emitTypeSize(srcType), treeNode, op1); + + // integral to floating-point conversions all have RMW semantics if VEX support + // is not available + + bool isRMW = !compiler->canUseVexEncoding(); + inst_RV_RV_TT(ins, emitTypeSize(srcType), targetReg, targetReg, op1, isRMW); // Handle the case of srcType = TYP_ULONG. SSE2 conversion instruction // will interpret ULONG value as LONG.
Hence we need to adjust the @@ -7584,7 +7600,7 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode) *cns = GetEmitter()->emitFltOrDblConst(d, EA_8BYTE); } - GetEmitter()->emitIns_R_C(INS_addsd, EA_8BYTE, treeNode->GetRegNum(), *cns, 0); + GetEmitter()->emitIns_SIMD_R_R_C(INS_addsd, EA_8BYTE, targetReg, targetReg, *cns, 0); genDefineTempLabel(label); } @@ -7747,7 +7763,7 @@ void CodeGen::genCkfinite(GenTree* treeNode) if (targetType == TYP_DOUBLE) { inst_Mov(targetType, targetReg, op1->GetRegNum(), /* canSkip */ true); - inst_RV_RV_IV(INS_shufps, EA_16BYTE, targetReg, targetReg, (int8_t)0xb1); + GetEmitter()->emitIns_SIMD_R_R_R_I(INS_shufps, EA_16BYTE, targetReg, targetReg, targetReg, (int8_t)0xB1); copyToTmpSrcReg = targetReg; } else @@ -7769,7 +7785,7 @@ void CodeGen::genCkfinite(GenTree* treeNode) if ((targetType == TYP_DOUBLE) && (targetReg == op1->GetRegNum())) { // We need to re-shuffle the targetReg to get the correct result. - inst_RV_RV_IV(INS_shufps, EA_16BYTE, targetReg, targetReg, (int8_t)0xb1); + GetEmitter()->emitIns_SIMD_R_R_R_I(INS_shufps, EA_16BYTE, targetReg, targetReg, targetReg, (int8_t)0xB1); } else { @@ -7935,7 +7951,7 @@ void CodeGen::genSSE2BitwiseOp(GenTree* treeNode) // Abs(f) = f & 0x7fffffff x4 (packed) // Abs(d) = d & 0x7fffffffffffffff x2 (packed) ins = INS_andps; - mask = treeNode->TypeIs(TYP_FLOAT) ? 0x7fffffff7fffffffUL : 0x7fffffffffffffffUL; + mask = treeNode->TypeIs(TYP_FLOAT) ? 0x7FFFFFFF7FFFFFFFUL : 0x7FFFFFFFFFFFFFFFUL; maskFld = treeNode->TypeIs(TYP_FLOAT) ? 
&absBitmaskFlt : &absBitmaskDbl; } else @@ -7945,11 +7961,19 @@ void CodeGen::genSSE2BitwiseOp(GenTree* treeNode) if (*maskFld == nullptr) { - UINT64 maskPack[] = {mask, mask}; - *maskFld = GetEmitter()->emitBlkConst(&maskPack, 16, 16, treeNode->TypeGet()); + simd16_t constValue; + + constValue.u64[0] = mask; + constValue.u64[1] = mask; + +#if defined(FEATURE_SIMD) + *maskFld = GetEmitter()->emitSimd16Const(constValue); +#else + *maskFld = GetEmitter()->emitBlkConst(&constValue, 16, 16, treeNode->TypeGet()); +#endif } - GetEmitter()->emitIns_SIMD_R_R_C(ins, size, targetReg, operandReg, *maskFld, 0); + GetEmitter()->emitIns_SIMD_R_R_C(ins, EA_16BYTE, targetReg, operandReg, *maskFld, 0); } //----------------------------------------------------------------------------------------- @@ -7992,7 +8016,7 @@ void CodeGen::genSSE41RoundOp(GenTreeOp* treeNode) regNumber dstReg = treeNode->GetRegNum(); - unsigned ival = 0; + int8_t ival = 0; // v) tree oper is NI_System_Math{F}_Round, _Ceiling, _Floor, or _Truncate switch (treeNode->AsIntrinsic()->gtIntrinsicName) @@ -8019,104 +8043,8 @@ void CodeGen::genSSE41RoundOp(GenTreeOp* treeNode) unreached(); } - if (srcNode->isContained() || srcNode->isUsedFromSpillTemp()) - { - emitter* emit = GetEmitter(); - - TempDsc* tmpDsc = nullptr; - unsigned varNum = BAD_VAR_NUM; - unsigned offset = (unsigned)-1; - - if (srcNode->isUsedFromSpillTemp()) - { - assert(srcNode->IsRegOptional()); - - tmpDsc = getSpillTempDsc(srcNode); - varNum = tmpDsc->tdTempNum(); - offset = 0; - - regSet.tmpRlsTemp(tmpDsc); - } - else if (srcNode->isIndir()) - { - GenTreeIndir* memIndir = srcNode->AsIndir(); - GenTree* memBase = memIndir->gtOp1; - - switch (memBase->OperGet()) - { - case GT_LCL_ADDR: - { - assert(memBase->isContained()); - varNum = memBase->AsLclFld()->GetLclNum(); - offset = memBase->AsLclFld()->GetLclOffs(); - - // Ensure that all the GenTreeIndir values are set to their defaults. 
- assert(memBase->GetRegNum() == REG_NA); - assert(!memIndir->HasIndex()); - assert(memIndir->Scale() == 1); - assert(memIndir->Offset() == 0); - - break; - } - - case GT_CLS_VAR_ADDR: - { - emit->emitIns_R_C_I(ins, size, dstReg, memBase->AsClsVar()->gtClsVarHnd, 0, ival); - return; - } - - default: - { - emit->emitIns_R_A_I(ins, size, dstReg, memIndir, ival); - return; - } - } - } - else - { - switch (srcNode->OperGet()) - { - case GT_CNS_DBL: - { - GenTreeDblCon* dblConst = srcNode->AsDblCon(); - CORINFO_FIELD_HANDLE hnd = emit->emitFltOrDblConst(dblConst->DconValue(), emitTypeSize(dblConst)); - - emit->emitIns_R_C_I(ins, size, dstReg, hnd, 0, ival); - return; - } - - case GT_LCL_FLD: - varNum = srcNode->AsLclFld()->GetLclNum(); - offset = srcNode->AsLclFld()->GetLclOffs(); - break; - - case GT_LCL_VAR: - { - assert(srcNode->IsRegOptional() || !compiler->lvaGetDesc(srcNode->AsLclVar())->lvIsRegCandidate()); - - varNum = srcNode->AsLclVar()->GetLclNum(); - offset = 0; - break; - } - - default: - unreached(); - break; - } - } - - // Ensure we got a good varNum and offset. - // We also need to check for `tmpDsc != nullptr` since spill temp numbers - // are negative and start with -1, which also happens to be BAD_VAR_NUM. - assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr)); - assert(offset != (unsigned)-1); - - emit->emitIns_R_S_I(ins, size, dstReg, varNum, offset, ival); - } - else - { - inst_RV_RV_IV(ins, size, dstReg, srcNode->GetRegNum(), ival); - } + bool isRMW = !compiler->canUseVexEncoding(); + inst_RV_RV_TT_IV(ins, size, dstReg, dstReg, srcNode, ival, isRMW); } //--------------------------------------------------------------------- @@ -8154,7 +8082,11 @@ void CodeGen::genIntrinsic(GenTreeIntrinsic* treeNode) genConsumeOperands(treeNode->AsOp()); const instruction ins = (treeNode->TypeGet() == TYP_FLOAT) ? 
INS_sqrtss : INS_sqrtsd; - GetEmitter()->emitInsBinary(ins, emitTypeSize(treeNode), treeNode, srcNode); + + regNumber targetReg = treeNode->GetRegNum(); + bool isRMW = !compiler->canUseVexEncoding(); + + inst_RV_RV_TT(ins, emitTypeSize(treeNode), targetReg, targetReg, srcNode, isRMW); break; } @@ -11109,7 +11041,7 @@ void CodeGen::genZeroInitFrameUsingBlockInit(int untrLclHi, int untrLclLo, regNu // movups xmmword ptr [ebp/esp-OFFS], xmm4 // mov qword ptr [ebp/esp-OFFS], rax - emit->emitIns_R_R(INS_xorps, EA_ATTR(XMM_REGSIZE_BYTES), zeroSIMDReg, zeroSIMDReg); + emit->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, zeroSIMDReg, zeroSIMDReg, zeroSIMDReg); int i = 0; for (; i < blkSize; i += XMM_REGSIZE_BYTES) @@ -11133,7 +11065,7 @@ void CodeGen::genZeroInitFrameUsingBlockInit(int untrLclHi, int untrLclLo, regNu // add rax, 48 ; | // jne SHORT -5 instr ; ---+ - emit->emitIns_R_R(INS_xorps, EA_ATTR(XMM_REGSIZE_BYTES), zeroSIMDReg, zeroSIMDReg); + emit->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, zeroSIMDReg, zeroSIMDReg, zeroSIMDReg); // How many extra don't fit into the 3x unroll int extraSimd = (blkSize % (XMM_REGSIZE_BYTES * 3)) / XMM_REGSIZE_BYTES; diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 97de8d503606c..91e6276321d4b 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -3673,6 +3673,33 @@ const BYTE emitter::emitFmtToOps[] = { const unsigned emitter::emitFmtCount = ArrLen(emitFmtToOps); #endif +#if defined(TARGET_XARCH) +//------------------------------------------------------------------------ +// emitGetSchedInfo: Gets the scheduling information for a given insFmt +// +// Arguments: +// insFmt - format for which to query scheduling information +// +// Return Value: +// the scheduling information for insFmt +// +const IS_INFO emitter::emitGetSchedInfo(insFormat insFmt) +{ + static const IS_INFO emitFmtToSchedInfo[] = { +#define IF_DEF(en, op1, op2) static_cast<IS_INFO>(op1), +#include "emitfmts.h" }; + if (insFmt <
ArrLen(emitFmtToSchedInfo)) + { + return emitFmtToSchedInfo[insFmt]; + } + + assert(!"Unsupported insFmt"); + return IS_NONE; +} +#endif // TARGET_XARCH + //------------------------------------------------------------------------ // Interleaved GC info dumping. // We'll attempt to line this up with the opcode, which indented differently for diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 4c078777964cd..e858a886029d5 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -1204,6 +1204,7 @@ class emitter idAddr()->_idReg3 = reg; assert(reg == idAddr()->_idReg3); } + regNumber idReg4() const { assert(!idIsSmallDsc()); @@ -1215,27 +1216,117 @@ class emitter idAddr()->_idReg4 = reg; assert(reg == idAddr()->_idReg4); } + + bool idHasReg1() const + { + IS_INFO isInfo = emitGetSchedInfo(idInsFmt()); + return (isInfo & (IS_R1_RD | IS_R1_RW | IS_R1_WR)) != 0; + } + bool idIsReg1Read() const + { + IS_INFO isInfo = emitGetSchedInfo(idInsFmt()); + return (isInfo & (IS_R1_RD | IS_R1_RW)) != 0; + } + bool idIsReg1Write() const + { + IS_INFO isInfo = emitGetSchedInfo(idInsFmt()); + return (isInfo & (IS_R1_RW | IS_R1_WR)) != 0; + } + + bool idHasReg2() const + { + IS_INFO isInfo = emitGetSchedInfo(idInsFmt()); + return (isInfo & (IS_R2_RD | IS_R2_RW | IS_R2_WR)) != 0; + } + bool idIsReg2Read() const + { + IS_INFO isInfo = emitGetSchedInfo(idInsFmt()); + return (isInfo & (IS_R2_RD | IS_R2_RW)) != 0; + } + bool idIsReg2Write() const + { + IS_INFO isInfo = emitGetSchedInfo(idInsFmt()); + return (isInfo & (IS_R2_RW | IS_R2_WR)) != 0; + } + bool idHasReg3() const { - switch (idInsFmt()) - { - case IF_RWR_RRD_RRD: - case IF_RWR_RRD_RRD_CNS: - case IF_RWR_RRD_RRD_RRD: - return true; - default: - return false; - } + IS_INFO isInfo = emitGetSchedInfo(idInsFmt()); + return (isInfo & (IS_R3_RD | IS_R3_RW | IS_R3_WR)) != 0; } + bool idIsReg3Read() const + { + IS_INFO isInfo = emitGetSchedInfo(idInsFmt()); + return (isInfo & (IS_R3_RD | IS_R3_RW)) != 0; + } + 
bool idIsReg3Write() const + { + IS_INFO isInfo = emitGetSchedInfo(idInsFmt()); + return (isInfo & (IS_R3_RW | IS_R3_WR)) != 0; + } + bool idHasReg4() const { - switch (idInsFmt()) - { - case IF_RWR_RRD_RRD_RRD: - return true; - default: - return false; - } + IS_INFO isInfo = emitGetSchedInfo(idInsFmt()); + return (isInfo & (IS_R4_RD | IS_R4_RW | IS_R4_WR)) != 0; + } + bool idIsReg4Read() const + { + IS_INFO isInfo = emitGetSchedInfo(idInsFmt()); + return (isInfo & (IS_R4_RD | IS_R4_RW)) != 0; + } + bool idIsReg4Write() const + { + IS_INFO isInfo = emitGetSchedInfo(idInsFmt()); + return (isInfo & (IS_R4_RW | IS_R4_WR)) != 0; + } + + bool idHasMemGen() const + { + IS_INFO isInfo = emitGetSchedInfo(idInsFmt()); + return (isInfo & (IS_GM_RD | IS_GM_RW | IS_GM_WR)) != 0; + } + bool idHasMemGenRead() const + { + IS_INFO isInfo = emitGetSchedInfo(idInsFmt()); + return (isInfo & (IS_GM_RD | IS_GM_RW)) != 0; + } + bool idHasMemGenWrite() const + { + IS_INFO isInfo = emitGetSchedInfo(idInsFmt()); + return (isInfo & (IS_GM_RW | IS_GM_WR)) != 0; + } + + bool idHasMemStk() const + { + IS_INFO isInfo = emitGetSchedInfo(idInsFmt()); + return (isInfo & (IS_SF_RD | IS_SF_RW | IS_SF_WR)) != 0; + } + bool idHasMemStkRead() const + { + IS_INFO isInfo = emitGetSchedInfo(idInsFmt()); + return (isInfo & (IS_SF_RD | IS_SF_RW)) != 0; + } + bool idHasMemStkWrite() const + { + IS_INFO isInfo = emitGetSchedInfo(idInsFmt()); + return (isInfo & (IS_SF_RW | IS_SF_WR)) != 0; + } + + bool idHasMemAdr() const + { + IS_INFO isInfo = emitGetSchedInfo(idInsFmt()); + return (isInfo & (IS_AM_RD | IS_AM_RW | IS_AM_WR)) != 0; + } + bool idHasMemAdrRead() const + { + IS_INFO isInfo = emitGetSchedInfo(idInsFmt()); + return (isInfo & (IS_AM_RD | IS_AM_RW)) != 0; + } + bool idHasMemAdrWrite() const + { + IS_INFO isInfo = emitGetSchedInfo(idInsFmt()); + return (isInfo & (IS_AM_RW | IS_AM_WR)) != 0; } #endif // defined(TARGET_XARCH) #ifdef TARGET_ARMARCH @@ -1495,7 +1586,7 @@ class emitter { _idDspReloc = val; 
} - bool idIsReloc() + bool idIsReloc() const { return idIsDspReloc() || idIsCnsReloc(); } @@ -1547,7 +1638,8 @@ class emitter }; // End of struct instrDesc #if defined(TARGET_XARCH) - insFormat getMemoryOperation(instrDesc* id); + insFormat getMemoryOperation(instrDesc* id) const; + insFormat ExtractMemoryFormat(insFormat insFmt) const; #elif defined(TARGET_ARM64) void getMemoryOperation(instrDesc* id, unsigned* pMemAccessKind, bool* pIsLocalAccess); #endif @@ -1937,6 +2029,7 @@ class emitter // Return the argument count for a direct call "id". int emitGetInsCDinfo(instrDesc* id); + static const IS_INFO emitGetSchedInfo(insFormat f); #endif // TARGET_XARCH cnsval_ssize_t emitGetInsSC(instrDesc* id); @@ -2742,9 +2835,9 @@ class emitter static const unsigned emitFmtCount; #endif - bool emitIsScnsInsDsc(instrDesc* id); + bool emitIsSmallInsDsc(instrDesc* id) const; - size_t emitSizeOfInsDsc(instrDesc* id); + size_t emitSizeOfInsDsc(instrDesc* id) const; /************************************************************************/ /* The following keeps track of stack-based GC values */ @@ -3127,7 +3220,7 @@ inline void emitter::instrDesc::checkSizes() * fields allocated). */ -inline bool emitter::emitIsScnsInsDsc(instrDesc* id) +inline bool emitter::emitIsSmallInsDsc(instrDesc* id) const { return id->idIsSmallDsc(); } diff --git a/src/coreclr/jit/emitarm.cpp b/src/coreclr/jit/emitarm.cpp index a82f37421d0dc..13b9c37ab529e 100644 --- a/src/coreclr/jit/emitarm.cpp +++ b/src/coreclr/jit/emitarm.cpp @@ -83,9 +83,9 @@ const emitJumpKind emitReverseJumpKinds[] = { * Return the allocated size (in bytes) of the given instruction descriptor. 
*/ -size_t emitter::emitSizeOfInsDsc(instrDesc* id) +size_t emitter::emitSizeOfInsDsc(instrDesc* id) const { - if (emitIsScnsInsDsc(id)) + if (emitIsSmallInsDsc(id)) return SMALL_IDSC_SIZE; assert((unsigned)id->idInsFmt() < emitFmtCount); diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index b537146be5b40..c35b9c6573256 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -86,9 +86,9 @@ const emitJumpKind emitReverseJumpKinds[] = { * Return the allocated size (in bytes) of the given instruction descriptor. */ -size_t emitter::emitSizeOfInsDsc(instrDesc* id) +size_t emitter::emitSizeOfInsDsc(instrDesc* id) const { - if (emitIsScnsInsDsc(id)) + if (emitIsSmallInsDsc(id)) return SMALL_IDSC_SIZE; assert((unsigned)id->idInsFmt() < emitFmtCount); diff --git a/src/coreclr/jit/emitfmtsxarch.h b/src/coreclr/jit/emitfmtsxarch.h index 7e90ca3b417de..45ddb9bb188cf 100644 --- a/src/coreclr/jit/emitfmtsxarch.h +++ b/src/coreclr/jit/emitfmtsxarch.h @@ -31,6 +31,39 @@ enum ID_OPS ID_OP_SPEC, // special handling required }; +enum IS_INFO +{ + IS_NONE = 0, // no scheduling information + + IS_R1_RD = 1 << 0, // has a reg1 op that is read-only + IS_R1_WR = 1 << 1, // has a reg1 op that is write-only + IS_R1_RW = 1 << 2, // has a reg1 op that is read-write + + IS_R2_RD = 1 << 3, // has a reg2 op that is read-only + IS_R2_WR = 1 << 4, // has a reg2 op that is write-only + IS_R2_RW = 1 << 5, // has a reg2 op that is read-write + + IS_R3_RD = 1 << 6, // has a reg3 op that is read-only + IS_R3_WR = 1 << 7, // has a reg3 op that is write-only + IS_R3_RW = 1 << 8, // has a reg3 op that is read-write + + IS_R4_RD = 1 << 9, // has a reg4 op that is read-only + IS_R4_WR = 1 << 10, // has a reg4 op that is write-only + IS_R4_RW = 1 << 11, // has a reg4 op that is read-write + + IS_GM_RD = 1 << 12, // has a [mem] op that is read-only + IS_GM_WR = 1 << 13, // has a [mem] op that is write-only + IS_GM_RW = 1 << 14, // has a [mem] op that is 
read-write + + IS_SF_RD = 1 << 15, // has a [stk] op that is read-only + IS_SF_WR = 1 << 16, // has a [stk] op that is write-only + IS_SF_RW = 1 << 17, // has a [stk] op that is read-write + + IS_AM_RD = 1 << 18, // has a [adr] op that is read-only + IS_AM_WR = 1 << 19, // has a [adr] op that is write-only + IS_AM_RW = 1 << 20, // has a [adr] op that is read-write +}; + ////////////////////////////////////////////////////////////////////////////// #else // !DEFINE_ID_OPS ////////////////////////////////////////////////////////////////////////////// @@ -71,141 +104,186 @@ enum ID_OPS ////////////////////////////////////////////////////////////////////////////// // -// enum insFormat instruction enum ID_OPS -// scheduling -// (unused) +// enum insFormat enum IS_INFO enum ID_OPS ////////////////////////////////////////////////////////////////////////////// -IF_DEF(NONE, IS_NONE, NONE) // no operands +IF_DEF(NONE, IS_NONE, NONE) // no operands -IF_DEF(LABEL, IS_NONE, JMP ) // label -IF_DEF(RWR_LABEL, IS_R1_WR, JMP ) // write label to register -IF_DEF(SWR_LABEL, IS_SF_WR, LBL ) // write label to stack +IF_DEF(LABEL, IS_NONE, JMP ) // label +IF_DEF(RWR_LABEL, IS_R1_WR, JMP ) // write label to register +IF_DEF(SWR_LABEL, IS_SF_WR, LBL ) // write label to stack -IF_DEF(METHOD, IS_NONE, CALL) // method -IF_DEF(METHPTR, IS_NONE, CALL) // method ptr (glbl) +IF_DEF(METHOD, IS_NONE, CALL) // method +IF_DEF(METHPTR, IS_NONE, CALL) // method ptr (glbl) -IF_DEF(CNS, IS_NONE, SCNS) // const +IF_DEF(CNS, IS_NONE, SCNS) // const //---------------------------------------------------------------------------- // NOTE: The order of the "RD/WR/RW" varieties must match that of // the "insUpdateModes" enum in "instr.h". 
//---------------------------------------------------------------------------- -IF_DEF(RRD, IS_R1_RD, NONE) // read reg -IF_DEF(RWR, IS_R1_WR, NONE) // write reg -IF_DEF(RRW, IS_R1_RW, NONE) // r/w reg +IF_DEF(RRD, IS_R1_RD, NONE) // read reg1 +IF_DEF(RWR, IS_R1_WR, NONE) // write reg1 +IF_DEF(RRW, IS_R1_RW, NONE) // r/w reg1 + +IF_DEF(RRD_CNS, IS_R1_RD, SCNS) // read reg1, const +IF_DEF(RWR_CNS, IS_R1_WR, SCNS) // write reg1, const +IF_DEF(RRW_CNS, IS_R1_RW, SCNS) // r/w reg1, const +IF_DEF(RRW_SHF, IS_R1_RW, SCNS) // r/w reg1, shift + +IF_DEF(RRD_RRD, IS_R1_RD|IS_R2_RD, NONE) // read reg1, read reg2 +IF_DEF(RWR_RRD, IS_R1_WR|IS_R2_RD, NONE) // write reg1, read reg2 +IF_DEF(RRW_RRD, IS_R1_RW|IS_R2_RD, NONE) // r/w reg1, read reg2 +IF_DEF(RRW_RRW, IS_R1_RW|IS_R2_RW, NONE) // r/w reg1, r/w reg2 - for XCHG reg, reg2 -IF_DEF(RRD_CNS, IS_R1_RD, SCNS) // read reg , const -IF_DEF(RWR_CNS, IS_R1_WR, SCNS) // write reg , const -IF_DEF(RRW_CNS, IS_R1_RW, SCNS) // r/w reg , const -IF_DEF(RRW_SHF, IS_R1_RW, SCNS) // r/w reg , shift-const +IF_DEF(RRD_RRD_CNS, IS_R1_RD|IS_R2_RD, SCNS) // read reg1, read reg2, const +IF_DEF(RWR_RRD_CNS, IS_R1_WR|IS_R2_RD, SCNS) // write reg1, read reg2, const +IF_DEF(RRW_RRD_CNS, IS_R1_RW|IS_R2_RD, SCNS) // r/w reg1, read reg2, const -IF_DEF(RRD_RRD, IS_R1_RD|IS_R2_RD, NONE) // read reg , read reg2 -IF_DEF(RWR_RRD, IS_R1_WR|IS_R2_RD, NONE) // write reg , read reg2 -IF_DEF(RRW_RRD, IS_R1_RW|IS_R2_RD, NONE) // r/w reg , read reg2 -IF_DEF(RRW_RRW, IS_R1_RW|IS_R2_RW, NONE) // r/w reg , r/w reg2 - for XCHG reg, reg2 -IF_DEF(RRW_RRW_CNS, IS_R1_RW|IS_R2_RW, SCNS) // r/w reg , r/w reg2 , const +IF_DEF(RRD_RRD_RRD, IS_R1_RD|IS_R2_RD|IS_R3_RD, NONE) // read reg1, read reg2, read reg3 +IF_DEF(RWR_RRD_RRD, IS_R1_WR|IS_R2_RD|IS_R3_RD, NONE) // write reg1, read reg2, read reg3 +IF_DEF(RRW_RRD_RRD, IS_R1_RW|IS_R2_RD|IS_R3_RD, NONE) // r/w reg1, read reg2, read reg3 -IF_DEF(RWR_RRD_RRD, IS_R1_WR|IS_R2_RD|IS_R3_RD, NONE) // write reg , read reg2 , read reg3 
-IF_DEF(RWR_RRD_RRD_CNS, IS_R1_WR|IS_R2_RD|IS_R3_RD, SCNS) // write reg , read reg2 , read reg3, const +IF_DEF(RWR_RWR_RRD, IS_R1_WR|IS_R2_WR|IS_R3_RD, NONE) // write reg1, write reg2, read reg3 + +IF_DEF(RWR_RRD_RRD_CNS, IS_R1_WR|IS_R2_RD|IS_R3_RD, SCNS) // write reg1, read reg2, read reg3, const +IF_DEF(RWR_RRD_RRD_RRD, IS_R1_WR|IS_R2_RD|IS_R3_RD|IS_R4_RD, SCNS) // write reg1, read reg2, read reg3, read reg4 -IF_DEF(RWR_RRD_RRD_RRD, IS_R1_WR|IS_R2_RD|IS_R3_RD|IS_R4_RD, CNS) // write reg , read reg2 , read reg3 , read reg4 //---------------------------------------------------------------------------- // The following formats are used for direct addresses (e.g. static data members) //---------------------------------------------------------------------------- -IF_DEF(MRD, IS_GM_RD, SPEC) // read [mem] (indirect call req. SPEC) -IF_DEF(MWR, IS_GM_WR, DSP) // write [mem] -IF_DEF(MRW, IS_GM_RW, DSP) // r/w [mem] -IF_DEF(MRD_OFF, IS_GM_RD, DSP) // offset mem +IF_DEF(MRD, IS_GM_RD, SPEC) // read [mem] (indirect call req. 
SPEC) +IF_DEF(MWR, IS_GM_WR, DSP) // write [mem] +IF_DEF(MRW, IS_GM_RW, DSP) // r/w [mem] + +IF_DEF(MRD_CNS, IS_GM_RD, DSP_CNS) // read [mem], const +IF_DEF(MWR_CNS, IS_GM_WR, DSP_CNS) // write [mem], const +IF_DEF(MRW_CNS, IS_GM_RW, DSP_CNS) // r/w [mem], const +IF_DEF(MRW_SHF, IS_GM_RW, DSP_CNS) // r/w [mem], shift + +IF_DEF(MRD_RRD, IS_GM_RD|IS_R1_RD, DSP) // read [mem], read reg1 +IF_DEF(MWR_RRD, IS_GM_WR|IS_R1_RD, DSP) // write [mem], read reg1 +IF_DEF(MRW_RRD, IS_GM_RW|IS_R1_RD, DSP) // r/w [mem], read reg1 + +IF_DEF(MRD_RRD_CNS, IS_GM_RD|IS_R1_RD, DSP_CNS) // read [mem], read reg1, const +IF_DEF(MWR_RRD_CNS, IS_GM_WR|IS_R1_RD, DSP_CNS) // write [mem], read reg1, const +IF_DEF(MRW_RRD_CNS, IS_GM_RW|IS_R1_RD, DSP_CNS) // r/w [mem], read reg1, const + +IF_DEF(MWR_RRD_RRD, IS_GM_WR|IS_R1_RD|IS_R2_RD, DSP) // write [mem], read reg1, read reg2 + +IF_DEF(RRD_MRD, IS_R1_RD|IS_GM_RD, DSP) // read reg1, read [mem] +IF_DEF(RWR_MRD, IS_R1_WR|IS_GM_RD, DSP) // write reg1, read [mem] +IF_DEF(RRW_MRD, IS_R1_RW|IS_GM_RD, DSP) // r/w reg1, read [mem] -IF_DEF(RRD_MRD, IS_GM_RD|IS_R1_RD, DSP) // read reg , read [mem] -IF_DEF(RWR_MRD, IS_GM_RD|IS_R1_WR, DSP) // write reg , read [mem] -IF_DEF(RRW_MRD, IS_GM_RD|IS_R1_RW, DSP) // r/w reg , read [mem] -IF_DEF(RRW_MRD_CNS, IS_GM_RD|IS_R1_RW, DSP_CNS) // r/w reg , read [mem], const +IF_DEF(RRD_MRD_CNS, IS_R1_RD|IS_GM_RD, DSP_CNS) // read reg1, read [mem], const +IF_DEF(RWR_MRD_CNS, IS_R1_WR|IS_GM_RD, DSP_CNS) // write reg1, read [mem], const +IF_DEF(RRW_MRD_CNS, IS_R1_RW|IS_GM_RD, DSP_CNS) // r/w reg1, read [mem], const -IF_DEF(RWR_RRD_MRD, IS_GM_RD|IS_R1_WR|IS_R2_RD, DSP) // write reg , read reg2 , read [mem] -IF_DEF(RWR_MRD_CNS, IS_GM_RD|IS_R1_WR, DSP_CNS) // write reg , read [mem], const -IF_DEF(RWR_RRD_MRD_CNS, IS_GM_RD|IS_R1_WR|IS_R2_RD, DSP_CNS) // write reg , read reg2 , read [mem], const -IF_DEF(RWR_RRD_MRD_RRD, IS_GM_RD|IS_R1_WR|IS_R2_RD|IS_R3_RD, DSP_CNS) // write reg , read reg2 , read [mem], read reg3 -IF_DEF(RWR_MRD_OFF, 
IS_GM_RD|IS_R1_WR, DSP) // write reg , offset mem +IF_DEF(RRD_MRD_RRD, IS_R1_RD|IS_GM_RD|IS_R2_RD, DSP) // read reg1, read [mem], read reg2 +IF_DEF(RWR_MRD_RRD, IS_R1_WR|IS_GM_RD|IS_R2_RD, DSP) // write reg1, read [mem], read reg2 +IF_DEF(RRW_MRD_RRD, IS_R1_RW|IS_GM_RD|IS_R2_RD, DSP) // r/w reg1, read [mem], read reg2 -IF_DEF(MRD_RRD, IS_GM_RD|IS_R1_RD, DSP) // read [mem], read reg -IF_DEF(MWR_RRD, IS_GM_WR|IS_R1_RD, DSP) // write [mem], read reg -IF_DEF(MRW_RRD, IS_GM_RW|IS_R1_RD, DSP) // r/w [mem], read reg +IF_DEF(RRD_RRD_MRD, IS_R1_RD|IS_R2_RD|IS_GM_RD, DSP) // read reg1, read reg2, read [mem] +IF_DEF(RWR_RRD_MRD, IS_R1_WR|IS_R2_RD|IS_GM_RD, DSP) // write reg1, read reg2, read [mem] +IF_DEF(RRW_RRD_MRD, IS_R1_RW|IS_R2_RD|IS_GM_RD, DSP) // r/w reg1, read reg2, read [mem] -IF_DEF(MRD_CNS, IS_GM_RD, DSP_CNS) // read [mem], const -IF_DEF(MWR_CNS, IS_GM_WR, DSP_CNS) // write [mem], const -IF_DEF(MRW_CNS, IS_GM_RW, DSP_CNS) // r/w [mem], const +IF_DEF(RWR_RWR_MRD, IS_R1_WR|IS_R2_WR|IS_GM_RD, DSP) // write reg1, write reg2, read [mem] -IF_DEF(MWR_RRD_CNS, IS_GM_WR|IS_R1_RD, DSP_CNS) // write [mem], read reg, const +IF_DEF(RWR_RRD_MRD_CNS, IS_R1_WR|IS_R2_RD|IS_GM_RD, DSP_CNS) // write reg1, read reg2, read [mem], const +IF_DEF(RWR_RRD_MRD_RRD, IS_R1_WR|IS_R2_RD|IS_GM_RD|IS_R3_RD, DSP_CNS) // write reg1, read reg2, read [mem], read reg3 -IF_DEF(MRW_SHF, IS_GM_RW, DSP_CNS) // shift [mem], const +IF_DEF(MRD_OFF, IS_GM_RD, DSP) // read [mem + offset] +IF_DEF(RWR_MRD_OFF, IS_R1_WR|IS_GM_RD, DSP) // write reg1, read [mem + offset] //---------------------------------------------------------------------------- // The following formats are used for stack frame refs //---------------------------------------------------------------------------- -IF_DEF(SRD, IS_SF_RD, SPEC) // read [stk] (indirect call req. SPEC) -IF_DEF(SWR, IS_SF_WR, NONE) // write [stk] -IF_DEF(SRW, IS_SF_RW, NONE) // r/w [stk] +IF_DEF(SRD, IS_SF_RD, SPEC) // read [stk] (indirect call req. 
SPEC) +IF_DEF(SWR, IS_SF_WR, NONE) // write [stk] +IF_DEF(SRW, IS_SF_RW, NONE) // r/w [stk] -IF_DEF(RRD_SRD, IS_SF_RD|IS_R1_RD, NONE) // read reg , read [stk] -IF_DEF(RWR_SRD, IS_SF_RD|IS_R1_WR, NONE) // write reg , read [stk] -IF_DEF(RRW_SRD, IS_SF_RD|IS_R1_RW, NONE) // r/w reg , read [stk] -IF_DEF(RRW_SRD_CNS, IS_SF_RD|IS_R1_RW, CNS ) // r/w reg , read [stk], const +IF_DEF(SRD_CNS, IS_SF_RD, CNS) // read [stk], const +IF_DEF(SWR_CNS, IS_SF_WR, CNS) // write [stk], const +IF_DEF(SRW_CNS, IS_SF_RW, CNS) // r/w [stk], const +IF_DEF(SRW_SHF, IS_SF_RW, CNS) // r/w [stk], shift -IF_DEF(RWR_RRD_SRD, IS_SF_RD|IS_R1_WR|IS_R2_RD, NONE) // write reg , read reg2, read [stk] -IF_DEF(RWR_SRD_CNS, IS_SF_RD|IS_R1_WR, CNS ) // write reg , read [stk], const -IF_DEF(RWR_RRD_SRD_CNS, IS_SF_RD|IS_R1_WR|IS_R2_RD, CNS ) // write reg , read reg2, read [stk], const -IF_DEF(RWR_RRD_SRD_RRD, IS_SF_RD|IS_R1_WR|IS_R2_RD|IS_R3_RD, CNS ) // write reg , read reg2, read [stk], read reg3 +IF_DEF(SRD_RRD, IS_SF_RD|IS_R1_RD, NONE) // read [stk], read reg1 +IF_DEF(SWR_RRD, IS_SF_WR|IS_R1_RD, NONE) // write [stk], read reg1 +IF_DEF(SRW_RRD, IS_SF_RW|IS_R1_RD, NONE) // r/w [stk], read reg1 -IF_DEF(SRD_RRD, IS_SF_RD|IS_R1_RD, NONE) // read [stk], read reg -IF_DEF(SWR_RRD, IS_SF_WR|IS_R1_RD, NONE) // write [stk], read reg -IF_DEF(SRW_RRD, IS_SF_RW|IS_R1_RD, NONE) // r/w [stk], read reg +IF_DEF(SRD_RRD_CNS, IS_SF_RD|IS_R1_RD, CNS) // read [stk], read reg1, const +IF_DEF(SWR_RRD_CNS, IS_SF_WR|IS_R1_RD, CNS) // write [stk], read reg1, const +IF_DEF(SRW_RRD_CNS, IS_SF_RW|IS_R1_RD, CNS) // r/w [stk], read reg1, const -IF_DEF(SRD_CNS, IS_SF_RD, CNS ) // read [stk], const -IF_DEF(SWR_CNS, IS_SF_WR, CNS ) // write [stk], const -IF_DEF(SRW_CNS, IS_SF_RW, CNS ) // r/w [stk], const +IF_DEF(SWR_RRD_RRD, IS_SF_WR|IS_R1_RD|IS_R2_RD, NONE) // write [stk], read reg1, read reg2 -IF_DEF(SWR_RRD_CNS, IS_AM_WR|IS_R1_RD, AMD_CNS) // write [stk], read reg, const +IF_DEF(RRD_SRD, IS_R1_RD|IS_SF_RD, NONE) // read reg1, read 
[stk] +IF_DEF(RWR_SRD, IS_R1_WR|IS_SF_RD, NONE) // write reg1, read [stk] +IF_DEF(RRW_SRD, IS_R1_RW|IS_SF_RD, NONE) // r/w reg1, read [stk] -IF_DEF(SRW_SHF, IS_SF_RW, CNS ) // shift [stk], const +IF_DEF(RRD_SRD_CNS, IS_R1_RD|IS_SF_RD, CNS) // read reg1, read [stk], const +IF_DEF(RWR_SRD_CNS, IS_R1_WR|IS_SF_RD, CNS) // write reg1, read [stk], const +IF_DEF(RRW_SRD_CNS, IS_R1_RW|IS_SF_RD, CNS) // r/w reg1, read [stk], const + +IF_DEF(RRD_SRD_RRD, IS_R1_RD|IS_SF_RD|IS_R2_RD, NONE) // read reg1, read [stk], read reg2 +IF_DEF(RWR_SRD_RRD, IS_R1_WR|IS_SF_RD|IS_R2_RD, NONE) // write reg1, read [stk], read reg2 +IF_DEF(RRW_SRD_RRD, IS_R1_RW|IS_SF_RD|IS_R2_RD, NONE) // r/w reg1, read [stk], read reg2 + +IF_DEF(RRD_RRD_SRD, IS_R1_RD|IS_R2_RD|IS_SF_RD, NONE) // read reg1, read reg2, read [stk] +IF_DEF(RWR_RRD_SRD, IS_R1_WR|IS_R2_RD|IS_SF_RD, NONE) // write reg1, read reg2, read [stk] +IF_DEF(RRW_RRD_SRD, IS_R1_RW|IS_R2_RD|IS_SF_RD, NONE) // r/w reg1, read reg2, read [stk] + +IF_DEF(RWR_RWR_SRD, IS_R1_WR|IS_R2_WR|IS_SF_RD, NONE) // write reg1, write reg2, read [stk] + +IF_DEF(RWR_RRD_SRD_CNS, IS_R1_WR|IS_R2_RD|IS_SF_RD, CNS) // write reg1, read reg2, read [stk], const +IF_DEF(RWR_RRD_SRD_RRD, IS_R1_WR|IS_R2_RD|IS_SF_RD|IS_R3_RD, CNS) // write reg1, read reg2, read [stk], read reg3 //---------------------------------------------------------------------------- // The following formats are used for indirect address modes //---------------------------------------------------------------------------- +IF_DEF(ARD, IS_AM_RD, SPEC) // read [adr] (indirect call req. 
SPEC) +IF_DEF(AWR, IS_AM_WR, AMD) // write [adr] +IF_DEF(ARW, IS_AM_RW, AMD) // r/w [adr] + +IF_DEF(ARD_CNS, IS_AM_RD, AMD_CNS) // read [adr], const +IF_DEF(AWR_CNS, IS_AM_WR, AMD_CNS) // write [adr], const +IF_DEF(ARW_CNS, IS_AM_RW, AMD_CNS) // r/w [adr], const +IF_DEF(ARW_SHF, IS_AM_RW, AMD_CNS) // r/w [adr], shift + +IF_DEF(ARD_RRD, IS_AM_RD|IS_R1_RD, AMD) // read [adr], read reg1 +IF_DEF(AWR_RRD, IS_AM_WR|IS_R1_RD, AMD) // write [adr], read reg1 +IF_DEF(ARW_RRD, IS_AM_RW|IS_R1_RD, AMD) // r/w [adr], read reg1 -IF_DEF(ARD, IS_AM_RD, SPEC) // read [adr] (indirect call req. SPEC) -IF_DEF(AWR, IS_AM_WR, AMD ) // write [adr] -IF_DEF(ARW, IS_AM_RW, AMD ) // r/w [adr] +IF_DEF(ARD_RRD_CNS, IS_AM_RD|IS_R1_RD, AMD_CNS) // read [adr], read reg1, const +IF_DEF(AWR_RRD_CNS, IS_AM_WR|IS_R1_RD, AMD_CNS) // write [adr], read reg1, const +IF_DEF(ARW_RRD_CNS, IS_AM_RW|IS_R1_RD, AMD_CNS) // r/w [adr], read reg1, const -IF_DEF(RRD_ARD, IS_AM_RD|IS_R1_RD, AMD ) // read reg , read [adr] -IF_DEF(RWR_ARD, IS_AM_RD|IS_R1_WR, AMD ) // write reg , read [adr] -IF_DEF(RRW_ARD, IS_AM_RD|IS_R1_RW, AMD ) // r/w reg , read [adr] -IF_DEF(RRW_ARD_CNS, IS_AM_RD|IS_R1_RW, AMD_CNS) // r/w reg , read [adr], const +IF_DEF(AWR_RRD_RRD, IS_AM_WR|IS_R1_RD|IS_R2_RD, AMD_CNS) // write [adr], read reg1, read reg2 -IF_DEF(RWR_RRD_ARD, IS_AM_RD|IS_R1_WR|IS_R2_RD, AMD ) // write reg , read reg2, read [adr] -IF_DEF(RWR_ARD_CNS, IS_AM_RD|IS_R1_WR, AMD_CNS) // write reg , read [adr], const -IF_DEF(RWR_ARD_RRD, IS_AM_RD|IS_R1_WR|IS_R2_RD, AMD) // write reg , read [adr], read reg2 -IF_DEF(RWR_RRD_ARD_CNS, IS_AM_RD|IS_R1_WR|IS_R2_RD, AMD_CNS) // write reg , read reg2, read [adr], const -IF_DEF(RWR_RRD_ARD_RRD, IS_AM_RD|IS_R1_WR|IS_R2_RD|IS_R3_RD, AMD_CNS) // write reg , read reg2, read [adr], read reg3 +IF_DEF(RRD_ARD, IS_R1_RD|IS_AM_RD, AMD) // read reg1, read [adr] +IF_DEF(RWR_ARD, IS_R1_WR|IS_AM_RD, AMD) // write reg1, read [adr] +IF_DEF(RRW_ARD, IS_R1_RW|IS_AM_RD, AMD) // r/w reg1, read [adr] -IF_DEF(ARD_RRD, 
IS_AM_RD|IS_R1_RD, AMD ) // read [adr], read reg -IF_DEF(AWR_RRD, IS_AM_WR|IS_R1_RD, AMD ) // write [adr], read reg -IF_DEF(ARW_RRD, IS_AM_RW|IS_R1_RD, AMD ) // r/w [adr], read reg +IF_DEF(RRD_ARD_CNS, IS_R1_RD|IS_AM_RD, AMD_CNS) // read reg1, read [adr], const +IF_DEF(RWR_ARD_CNS, IS_R1_WR|IS_AM_RD, AMD_CNS) // write reg1, read [adr], const +IF_DEF(RRW_ARD_CNS, IS_R1_RW|IS_AM_RD, AMD_CNS) // r/w reg1, read [adr], const -IF_DEF(AWR_RRD_RRD, IS_AM_WR|IS_R1_RD|IS_R2_RD, AMD ) // write [adr], read reg, read reg +IF_DEF(RRD_ARD_RRD, IS_R1_RD|IS_AM_RD|IS_R2_RD, AMD) // read reg1, read [adr], read reg2 +IF_DEF(RWR_ARD_RRD, IS_R1_WR|IS_AM_RD|IS_R2_RD, AMD) // write reg1, read [adr], read reg2 +IF_DEF(RRW_ARD_RRD, IS_R1_RW|IS_AM_RD|IS_R2_RD, AMD) // r/w reg1, read [adr], read reg2 -IF_DEF(ARD_CNS, IS_AM_RD, AMD_CNS) // read [adr], const -IF_DEF(AWR_CNS, IS_AM_WR, AMD_CNS) // write [adr], const -IF_DEF(ARW_CNS, IS_AM_RW, AMD_CNS) // r/w [adr], const +IF_DEF(RRD_RRD_ARD, IS_R1_RD|IS_R2_RD|IS_AM_RD, AMD) // read reg1, read reg2, read [adr] +IF_DEF(RWR_RRD_ARD, IS_R1_WR|IS_R2_RD|IS_AM_RD, AMD) // write reg1, read reg2, read [adr] +IF_DEF(RRW_RRD_ARD, IS_R1_RW|IS_R2_RD|IS_AM_RD, AMD) // r/w reg1, read reg2, read [adr] -IF_DEF(AWR_RRD_CNS, IS_AM_WR|IS_R1_RD, AMD_CNS) // write [adr], read reg, const +IF_DEF(RWR_RWR_ARD, IS_R1_WR|IS_R2_WR|IS_AM_RD, AMD) // write reg1, write reg2, read [adr] -IF_DEF(ARW_SHF, IS_AM_RW, AMD_CNS) // shift [adr], const +IF_DEF(RWR_RRD_ARD_CNS, IS_R1_WR|IS_R2_RD|IS_AM_RD, AMD_CNS) // write reg1, read reg2, read [adr], const +IF_DEF(RWR_RRD_ARD_RRD, IS_R1_WR|IS_R2_RD|IS_AM_RD|IS_R3_RD, AMD_CNS) // write reg1, read reg2, read [adr], read reg3 ////////////////////////////////////////////////////////////////////////////// diff --git a/src/coreclr/jit/emitloongarch64.cpp b/src/coreclr/jit/emitloongarch64.cpp index 4e1e02ae838d9..53a19b45770b6 100644 --- a/src/coreclr/jit/emitloongarch64.cpp +++ b/src/coreclr/jit/emitloongarch64.cpp @@ -420,9 +420,9 @@ const 
emitJumpKind emitReverseJumpKinds[] = { * Return the allocated size (in bytes) of the given instruction descriptor. */ -size_t emitter::emitSizeOfInsDsc(instrDesc* id) +size_t emitter::emitSizeOfInsDsc(instrDesc* id) const { - if (emitIsScnsInsDsc(id)) + if (emitIsSmallInsDsc(id)) return SMALL_IDSC_SIZE; insOpts insOp = id->idInsOpt(); diff --git a/src/coreclr/jit/emitriscv64.cpp b/src/coreclr/jit/emitriscv64.cpp index 414b2cd68964d..df007bfc27034 100644 --- a/src/coreclr/jit/emitriscv64.cpp +++ b/src/coreclr/jit/emitriscv64.cpp @@ -76,9 +76,9 @@ const emitJumpKind emitReverseJumpKinds[] = { * Return the allocated size (in bytes) of the given instruction descriptor. */ -size_t emitter::emitSizeOfInsDsc(instrDesc* id) +size_t emitter::emitSizeOfInsDsc(instrDesc* id) const { - if (emitIsScnsInsDsc(id)) + if (emitIsSmallInsDsc(id)) return SMALL_IDSC_SIZE; insOpts insOp = id->idInsOpt(); diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 5c79baed593d7..5677952e08448 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -170,18 +170,25 @@ regNumber emitter::getSseShiftRegNumber(instruction ins) } } +bool emitter::HasVexEncoding(instruction ins) const +{ + insFlags flags = CodeGenInterface::instInfo[ins]; + return (flags & Encoding_VEX) != 0; +} + +bool emitter::HasEvexEncoding(instruction ins) const +{ + insFlags flags = CodeGenInterface::instInfo[ins]; + return (flags & Encoding_EVEX) != 0; +} + bool emitter::IsVexEncodableInstruction(instruction ins) const { if (!UseVEXEncoding()) { return false; } - - // There is also Translate_VEX, however we expect codegen to have handled - // this difference and given us the VEX exclusive version of the instruction - - insFlags flags = CodeGenInterface::instInfo[ins]; - return (flags & Encoding_VEX) != 0; + return HasVexEncoding(ins); } //------------------------------------------------------------------------ @@ -199,12 +206,7 @@ bool 
emitter::IsEvexEncodableInstruction(instruction ins) const { return false; } - - // There is also Translate_EVEX, however we expect codegen to have handled - // this difference and given us the EVEX exclusive version of the instruction - - insFlags flags = CodeGenInterface::instInfo[ins]; - return (flags & Encoding_EVEX) != 0; + return HasEvexEncoding(ins); } //------------------------------------------------------------------------ @@ -752,85 +754,19 @@ bool emitter::emitIsInstrWritingToReg(instrDesc* id, regNumber reg) } #endif // TARGET_64BIT - switch (id->idInsFmt()) + if (id->idIsReg1Write() && (id->idReg1() == reg)) { - case IF_RWR: - case IF_RRW: - - case IF_RWR_CNS: - case IF_RRW_CNS: - case IF_RRW_SHF: - - case IF_RWR_RRD: - case IF_RRW_RRD: - case IF_RRW_RRW: - case IF_RRW_RRW_CNS: - - case IF_RWR_RRD_RRD: - case IF_RWR_RRD_RRD_CNS: - - case IF_RWR_RRD_RRD_RRD: - - case IF_RWR_MRD: - case IF_RRW_MRD: - case IF_RRW_MRD_CNS: - - case IF_RWR_RRD_MRD: - case IF_RWR_MRD_CNS: - case IF_RWR_RRD_MRD_CNS: - case IF_RWR_RRD_MRD_RRD: - case IF_RWR_MRD_OFF: - - case IF_RWR_SRD: - case IF_RRW_SRD: - case IF_RRW_SRD_CNS: - - case IF_RWR_RRD_SRD: - case IF_RWR_SRD_CNS: - case IF_RWR_RRD_SRD_CNS: - case IF_RWR_RRD_SRD_RRD: - - case IF_RWR_ARD: - case IF_RRW_ARD: - case IF_RRW_ARD_CNS: - - case IF_RWR_RRD_ARD: - case IF_RWR_ARD_CNS: - case IF_RWR_ARD_RRD: - case IF_RWR_RRD_ARD_CNS: - case IF_RWR_RRD_ARD_RRD: - { - if (id->idReg1() != reg) - { - switch (id->idInsFmt()) - { - // Handles instructions who write to two registers. 
- case IF_RRW_RRW: - case IF_RRW_RRW_CNS: - { - if (id->idReg2() == reg) - { - return true; - } - break; - } - - default: - break; - } - - return false; - } - - return true; - } + return true; + } - default: - { - return false; - } + if (id->idIsReg2Write() && (id->idReg2() == reg)) + { + return true; } + assert(!id->idIsReg3Write()); + assert(!id->idIsReg4Write()); + return false; } @@ -936,34 +872,8 @@ bool emitter::AreFlagsSetToZeroCmp(regNumber reg, emitAttr opSize, GenCondition instrDesc* id = emitLastIns; instruction lastIns = id->idIns(); - insFormat fmt = id->idInsFmt(); - - // make sure op1 is a reg - switch (fmt) - { - case IF_RWR_CNS: - case IF_RRW_CNS: - case IF_RRW_SHF: - case IF_RWR_RRD: - case IF_RRW_RRD: - case IF_RWR_MRD: - case IF_RWR_SRD: - case IF_RRW_SRD: - case IF_RWR_ARD: - case IF_RRW_ARD: - case IF_RWR: - case IF_RRD: - case IF_RRW: - case IF_RWR_RRD_RRD: - case IF_RWR_RRD_MRD: - case IF_RWR_RRD_ARD: - case IF_RWR_RRD_SRD: - break; - default: - return false; - } - if (id->idReg1() != reg) + if (!id->idHasReg1() || (id->idReg1() != reg)) { return false; } @@ -1022,28 +932,7 @@ bool emitter::AreFlagsSetForSignJumpOpt(regNumber reg, emitAttr opSize, GenCondi instruction lastIns = id->idIns(); insFormat fmt = id->idInsFmt(); - // make sure op1 is a reg - switch (fmt) - { - case IF_RWR_CNS: - case IF_RRW_CNS: - case IF_RRW_SHF: - case IF_RWR_RRD: - case IF_RRW_RRD: - case IF_RWR_MRD: - case IF_RWR_SRD: - case IF_RRW_SRD: - case IF_RWR_ARD: - case IF_RRW_ARD: - case IF_RWR: - case IF_RRD: - case IF_RRW: - break; - default: - return false; - } - - if (id->idReg1() != reg) + if (!id->idHasReg1() || (id->idReg1() != reg)) { return false; } @@ -1957,7 +1846,7 @@ emitter::code_t emitter::emitExtractVexPrefix(instruction ins, code_t& code) con { // 3-byte opcode: with the bytes ordered as 0x2211RM33 or // 4-byte opcode: with the bytes ordered as 0x22114433 - + // // check for a prefix in the 11 position BYTE sizePrefix = (code >> 16) & 0xFF; @@ 
-2855,105 +2744,63 @@ unsigned emitter::emitGetVexPrefixSize(instrDesc* id) const return 3; } - regNumber regFor012Bits = REG_NA; - regNumber regForSibBits = REG_NA; + regNumber regFor012Bits; - switch (id->idInsFmt()) + if (id->idHasMemAdr()) { - case IF_ARD: - case IF_AWR_RRD: - case IF_RRD_ARD: - case IF_RRW_ARD: - case IF_RRW_ARD_CNS: - case IF_RWR_ARD: - case IF_RWR_ARD_CNS: - case IF_RWR_RRD_ARD: - case IF_RWR_RRD_ARD_CNS: - { - regFor012Bits = id->idAddr()->iiaAddrMode.amBaseReg; - regForSibBits = id->idAddr()->iiaAddrMode.amIndxReg; - break; - } + regNumber regForSibBits = id->idAddr()->iiaAddrMode.amIndxReg; - case IF_MRD: - case IF_MWR_RRD: - case IF_RRD_MRD: - case IF_RRD_SRD: - case IF_RRW_MRD: - case IF_RRW_MRD_CNS: - case IF_RRW_SRD: - case IF_RRW_SRD_CNS: - case IF_RWR_MRD: - case IF_RWR_MRD_CNS: - case IF_RWR_RRD_MRD: - case IF_RWR_RRD_MRD_CNS: - case IF_RWR_RRD_SRD: - case IF_RWR_RRD_SRD_CNS: - case IF_RWR_SRD: - case IF_RWR_SRD_CNS: - case IF_SRD: - case IF_SWR_RRD: + if (IsExtendedReg(regForSibBits)) { - // Nothing is encoded in a way to prevent the 2-byte encoding - break; + // When the REX.X bit is present, we must use the 3-byte encoding + // - REX.X is used to encode the extended index field for SIB addressing + return 3; } - case IF_RRD_CNS: - case IF_RRW_CNS: - case IF_RWR_CNS: - { - regFor012Bits = id->idReg1(); - break; - } + regFor012Bits = id->idAddr()->iiaAddrMode.amBaseReg; + } + else if (id->idHasMemGen() || id->idHasMemStk()) + { + // Nothing is encoded in a way to prevent the 2-byte encoding + // - We don't encode an index or base field so can't use REX.X or REX.B + return 2; + } + else if (id->idHasReg3()) + { + // All instructions which have 3 registers encode reg3 in the r/m byte + regFor012Bits = id->idReg3(); + } + else if (id->idHasReg2()) + { + // Most instructions which have 2 registers encode reg2 in the r/m byte + regFor012Bits = id->idReg2(); - case IF_RRD_RRD: - case IF_RRW_RRD: - case IF_RWR_RRD: - { - regFor012Bits = 
id->idReg2(); + // However, there are a couple with MR variants (such as the extract instructions) + // and movd which uses both float and general registers which may use op1 + ID_OPS idOp = static_cast(emitFmtToOps[id->idInsFmt()]); - if ((ins == INS_movd) && isFloatReg(regFor012Bits)) + if (idOp == ID_OP_SCNS) + { + if (hasCodeMR(ins)) { regFor012Bits = id->idReg1(); } - break; } - - case IF_RRW_RRW_CNS: + else if (ins == INS_movd) { - if (hasCodeMR(ins)) + if (isFloatReg(regFor012Bits)) { regFor012Bits = id->idReg1(); } - else - { - regFor012Bits = id->idReg2(); - } - break; - } - - case IF_RWR_RRD_RRD: - case IF_RWR_RRD_RRD_CNS: - case IF_RWR_RRD_RRD_RRD: - { - regFor012Bits = id->idReg3(); - break; - } - - default: - { - assert(!"Unhandled insFmt for emitGetVexPrefixSize"); - return 3; } } - - if ((regForSibBits != REG_NA) && IsExtendedReg(regForSibBits)) + else { - // When the REX.X bit is present, we must use the 3-byte encoding - return 3; + assert(id->idHasReg1()); + regFor012Bits = id->idReg1(); } - if ((regFor012Bits != REG_NA) && IsExtendedReg(regFor012Bits)) + if (IsExtendedReg(regFor012Bits)) { // When the REX.B bit is present, we must use the 3-byte encoding return 3; @@ -3929,26 +3776,42 @@ UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code) regNumber rgx; // The idAddr field is a union and only some of the instruction formats use the iiaAddrMode variant - // these are IF_AWR_*, IF_ARD_*, IF_ARW_* and IF_*_ARD + // these are IF_ARD_*, IF_ARW_*, and IF_AWR_* // ideally these should really be the only idInsFmts that we see here // but we have some outliers to deal with: // emitIns_R_L adds IF_RWR_LABEL and calls emitInsSizeAM // emitInsRMW adds IF_MRW_CNS, IF_MRW_RRD, IF_MRW_SHF, and calls emitInsSizeAM - switch (id->idInsFmt()) + if (id->idHasMemAdr()) { - case IF_RWR_LABEL: - case IF_MRW_CNS: - case IF_MRW_RRD: - case IF_MRW_SHF: - reg = REG_NA; - rgx = REG_NA; - break; + reg = id->idAddr()->iiaAddrMode.amBaseReg; + rgx = 
id->idAddr()->iiaAddrMode.amIndxReg; + } + else + { + reg = REG_NA; + rgx = REG_NA; - default: - reg = id->idAddr()->iiaAddrMode.amBaseReg; - rgx = id->idAddr()->iiaAddrMode.amIndxReg; - break; +#if defined(DEBUG) + switch (id->idInsFmt()) + { + case IF_RWR_LABEL: + case IF_MRW_CNS: + case IF_MRW_RRD: + case IF_MRW_SHF: + { + break; + } + + default: + { + assert(!"Unexpected insFormat in emitInsSizeAMD"); + reg = id->idAddr()->iiaAddrMode.amBaseReg; + rgx = id->idAddr()->iiaAddrMode.amIndxReg; + break; + } + } +#endif // DEBUG } if (id->idIsDspReloc()) @@ -4496,7 +4359,9 @@ emitter::insFormat emitter::emitMapFmtForIns(insFormat fmt, instruction ins) unreached(); } } + default: + { if (IsMovInstruction(ins)) { // A `mov` instruction is always "write" @@ -4507,6 +4372,7 @@ emitter::insFormat emitter::emitMapFmtForIns(insFormat fmt, instruction ins) } } return fmt; + } } } @@ -4521,55 +4387,28 @@ emitter::insFormat emitter::emitMapFmtForIns(insFormat fmt, instruction ins) // emitter::insFormat emitter::emitMapFmtAtoM(insFormat fmt) { - switch (fmt) - { - case IF_ARD: - return IF_MRD; - case IF_AWR: - return IF_MWR; - case IF_ARW: - return IF_MRW; + // We should only get here for AM formats + assert((fmt >= IF_ARD) && (fmt <= IF_RWR_RRD_ARD_RRD)); - case IF_RRD_ARD: - return IF_RRD_MRD; - case IF_RWR_ARD: - return IF_RWR_MRD; - case IF_RWR_ARD_CNS: - return IF_RWR_MRD_CNS; - case IF_RRW_ARD: - return IF_RRW_MRD; - case IF_RRW_ARD_CNS: - return IF_RRW_MRD_CNS; - case IF_RWR_RRD_ARD: - return IF_RWR_RRD_MRD; - case IF_RWR_RRD_ARD_CNS: - return IF_RWR_RRD_MRD_CNS; - case IF_RWR_RRD_ARD_RRD: - return IF_RWR_RRD_MRD_RRD; + // We should have the same number of AM and GM formats + static_assert_no_msg((IF_RWR_RRD_ARD_RRD - IF_ARD) == (IF_RWR_RRD_MRD_RRD - IF_MRD)); - case IF_ARD_RRD: - return IF_MRD_RRD; - case IF_AWR_RRD: - return IF_MWR_RRD; - case IF_ARW_RRD: - return IF_MRW_RRD; + // GM should precede AM in the list + static_assert_no_msg(IF_MRD < IF_ARD); - case 
IF_ARD_CNS: - return IF_MRD_CNS; - case IF_AWR_CNS: - return IF_MWR_CNS; - case IF_ARW_CNS: - return IF_MRW_CNS; - - case IF_AWR_RRD_CNS: - return IF_MWR_RRD_CNS; + const unsigned delta = IF_ARD - IF_MRD; - case IF_ARW_SHF: - return IF_MRW_SHF; + // Spot check a few entries + static_assert_no_msg((IF_ARD - delta) == IF_MRD); + static_assert_no_msg((IF_ARD_CNS - delta) == IF_MRD_CNS); + static_assert_no_msg((IF_ARD_RRD - delta) == IF_MRD_RRD); + static_assert_no_msg((IF_RRD_ARD - delta) == IF_RRD_MRD); + static_assert_no_msg((IF_RRD_ARD_CNS - delta) == IF_RRD_MRD_CNS); + static_assert_no_msg((IF_RRD_ARD_RRD - delta) == IF_RRD_MRD_RRD); + static_assert_no_msg((IF_RRD_RRD_ARD - delta) == IF_RRD_RRD_MRD); + static_assert_no_msg((IF_RWR_RRD_ARD_RRD - delta) == IF_RWR_RRD_MRD_RRD); - default: - unreached(); - } + return static_cast(fmt - delta); } //------------------------------------------------------------------------ @@ -4713,7 +4552,7 @@ void emitter::spillIntArgRegsToShadowSlots() id = emitNewInstrAmd(EA_PTRSIZE, offset); id->idIns(INS_mov); - id->idInsFmt(IF_AWR_RRD); + id->idInsFmt(emitInsModeFormat(INS_mov, IF_ARD_RRD)); id->idAddr()->iiaAddrMode.amBaseReg = REG_SPBASE; id->idAddr()->iiaAddrMode.amIndxReg = REG_NA; id->idAddr()->iiaAddrMode.amScale = emitEncodeScale(1); @@ -4772,7 +4611,7 @@ void emitter::emitInsLoadInd(instruction ins, emitAttr attr, regNumber dstReg, G instrDesc* id = emitNewInstrAmd(attr, offset); id->idIns(ins); id->idReg1(dstReg); - emitHandleMemOp(mem, id, IF_RWR_ARD, ins); + emitHandleMemOp(mem, id, emitInsModeFormat(ins, IF_RRD_ARD), ins); UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins)); id->idCodeSize(sz); dispIns(id); @@ -4886,7 +4725,7 @@ void emitter::emitInsStoreInd(instruction ins, emitAttr attr, GenTreeStoreInd* m int icon = (int)data->AsIntConCommon()->IconValue(); id = emitNewInstrAmdCns(attr, offset, icon); id->idIns(ins); - emitHandleMemOp(mem, id, IF_AWR_CNS, ins); + emitHandleMemOp(mem, id, emitInsModeFormat(ins, 
IF_ARD_CNS), ins); sz = emitInsSizeAM(id, insCodeMI(ins), icon); id->idCodeSize(sz); } @@ -4902,7 +4741,7 @@ void emitter::emitInsStoreInd(instruction ins, emitAttr attr, GenTreeStoreInd* m { id = emitNewInstrAmd(attr, offset); id->idIns(ins); - emitHandleMemOp(mem, id, IF_AWR_RRD, ins); + emitHandleMemOp(mem, id, emitInsModeFormat(ins, IF_ARD_RRD), ins); id->idReg1(op1->GetRegNum()); sz = emitInsSizeAM(id, insCodeMR(ins)); id->idCodeSize(sz); @@ -4915,7 +4754,7 @@ void emitter::emitInsStoreInd(instruction ins, emitAttr attr, GenTreeStoreInd* m id = emitNewInstrAmdCns(attr, offset, icon); id->idIns(ins); id->idReg1(op1->GetRegNum()); - emitHandleMemOp(mem, id, IF_AWR_RRD_CNS, ins); + emitHandleMemOp(mem, id, emitInsModeFormat(ins, IF_ARD_RRD_CNS), ins); sz = emitInsSizeAM(id, insCodeMR(ins), icon); id->idCodeSize(sz); } @@ -4926,7 +4765,7 @@ void emitter::emitInsStoreInd(instruction ins, emitAttr attr, GenTreeStoreInd* m assert(!data->isContained()); id = emitNewInstrAmd(attr, offset); id->idIns(ins); - emitHandleMemOp(mem, id, IF_AWR_RRD, ins); + emitHandleMemOp(mem, id, emitInsModeFormat(ins, IF_ARD_RRD), ins); id->idReg1(data->GetRegNum()); sz = emitInsSizeAM(id, insCodeMR(ins)); id->idCodeSize(sz); @@ -5430,7 +5269,7 @@ void emitter::emitInsRMW(instruction ins, emitAttr attr, GenTreeStoreInd* storeI else { id = emitNewInstrAmdCns(attr, offset, iconVal); - emitHandleMemOp(storeInd, id, IF_ARW_CNS, ins); + emitHandleMemOp(storeInd, id, emitInsModeFormat(ins, IF_ARD_CNS), ins); id->idIns(ins); sz = emitInsSizeAM(id, insCodeMI(ins), iconVal); } @@ -5441,7 +5280,7 @@ void emitter::emitInsRMW(instruction ins, emitAttr attr, GenTreeStoreInd* storeI // ind, reg id = emitNewInstrAmd(attr, offset); - emitHandleMemOp(storeInd, id, IF_ARW_RRD, ins); + emitHandleMemOp(storeInd, id, emitInsModeFormat(ins, IF_ARD_RRD), ins); id->idReg1(src->GetRegNum()); id->idIns(ins); sz = emitInsSizeAM(id, insCodeMR(ins)); @@ -5495,7 +5334,7 @@ void emitter::emitInsRMW(instruction ins, 
emitAttr attr, GenTreeStoreInd* storeI } instrDesc* id = emitNewInstrAmd(attr, offset); - emitHandleMemOp(storeInd, id, IF_ARW, ins); + emitHandleMemOp(storeInd, id, emitInsModeFormat(ins, IF_ARD), ins); id->idIns(ins); UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeMR(ins)); id->idCodeSize(sz); @@ -5843,7 +5682,7 @@ void emitter::emitIns_IJ(emitAttr attr, regNumber reg, unsigned base) instrDesc* id = emitNewInstrAmd(attr, base); id->idIns(ins); - id->idInsFmt(IF_ARD); + id->idInsFmt(emitInsModeFormat(ins, IF_ARD)); id->idAddr()->iiaAddrMode.amBaseReg = REG_NA; id->idAddr()->iiaAddrMode.amIndxReg = reg; id->idAddr()->iiaAddrMode.amScale = emitter::OPSZP; @@ -6470,7 +6309,7 @@ void emitter::emitIns_R_R_I(instruction ins, emitAttr attr, regNumber reg1, regN instrDesc* id = emitNewInstrSC(attr, ival); id->idIns(ins); - id->idInsFmt(IF_RRW_RRW_CNS); + id->idInsFmt(emitInsModeFormat(ins, IF_RRD_RRD_CNS)); id->idReg1(reg1); id->idReg2(reg2); @@ -6527,7 +6366,7 @@ void emitter::emitIns_AR(instruction ins, emitAttr attr, regNumber base, int off id->idIns(ins); - id->idInsFmt(IF_ARD); + id->idInsFmt(emitInsModeFormat(ins, IF_ARD)); id->idAddr()->iiaAddrMode.amBaseReg = base; id->idAddr()->iiaAddrMode.amIndxReg = REG_NA; @@ -6582,7 +6421,7 @@ void emitter::emitIns_R_A(instruction ins, emitAttr attr, regNumber reg1, GenTre id->idIns(ins); id->idReg1(reg1); - emitHandleMemOp(indir, id, IF_RRW_ARD, ins); + emitHandleMemOp(indir, id, emitInsModeFormat(ins, IF_RRD_ARD), ins); UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins)); id->idCodeSize(sz); @@ -6602,7 +6441,7 @@ void emitter::emitIns_R_A_I(instruction ins, emitAttr attr, regNumber reg1, GenT id->idIns(ins); id->idReg1(reg1); - emitHandleMemOp(indir, id, IF_RRW_ARD_CNS, ins); + emitHandleMemOp(indir, id, emitInsModeFormat(ins, IF_RRD_ARD_CNS), ins); UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins), ival); id->idCodeSize(sz); @@ -6621,7 +6460,7 @@ void emitter::emitIns_R_AR_I(instruction ins, emitAttr attr, regNumber 
reg1, reg id->idIns(ins); id->idReg1(reg1); - id->idInsFmt(IF_RRW_ARD_CNS); + id->idInsFmt(emitInsModeFormat(ins, IF_RRD_ARD_CNS)); id->idAddr()->iiaAddrMode.amBaseReg = base; id->idAddr()->iiaAddrMode.amIndxReg = REG_NA; @@ -6647,7 +6486,7 @@ void emitter::emitIns_R_C_I( instrDesc* id = emitNewInstrCnsDsp(attr, ival, offs); id->idIns(ins); - id->idInsFmt(IF_RRW_MRD_CNS); + id->idInsFmt(emitInsModeFormat(ins, IF_RRD_MRD_CNS)); id->idReg1(reg1); id->idAddr()->iiaFieldHnd = fldHnd; @@ -6666,7 +6505,7 @@ void emitter::emitIns_R_S_I(instruction ins, emitAttr attr, regNumber reg1, int instrDesc* id = emitNewInstrCns(attr, ival); id->idIns(ins); - id->idInsFmt(IF_RRW_SRD_CNS); + id->idInsFmt(emitInsModeFormat(ins, IF_RRD_SRD_CNS)); id->idReg1(reg1); id->idAddr()->iiaLclVar.initLclVarAddr(varx, offs); @@ -6693,7 +6532,7 @@ void emitter::emitIns_R_R_A(instruction ins, emitAttr attr, regNumber reg1, regN id->idReg1(reg1); id->idReg2(reg2); - emitHandleMemOp(indir, id, IF_RWR_RRD_ARD, ins); + emitHandleMemOp(indir, id, (ins == INS_mulx) ? 
IF_RWR_RWR_ARD : emitInsModeFormat(ins, IF_RRD_RRD_ARD), ins); UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins)); id->idCodeSize(sz); @@ -6713,7 +6552,7 @@ void emitter::emitIns_R_R_AR(instruction ins, emitAttr attr, regNumber reg1, reg id->idReg1(reg1); id->idReg2(reg2); - id->idInsFmt(IF_RWR_RRD_ARD); + id->idInsFmt(emitInsModeFormat(ins, IF_RRD_RRD_ARD)); id->idAddr()->iiaAddrMode.amBaseReg = base; id->idAddr()->iiaAddrMode.amIndxReg = REG_NA; @@ -6780,7 +6619,7 @@ void emitter::emitIns_R_AR_R(instruction ins, id->idReg1(reg1); id->idReg2(reg2); - id->idInsFmt(IF_RWR_ARD_RRD); + id->idInsFmt(emitInsModeFormat(ins, IF_RRD_ARD_RRD)); id->idAddr()->iiaAddrMode.amBaseReg = base; id->idAddr()->iiaAddrMode.amIndxReg = index; id->idAddr()->iiaAddrMode.amScale = emitEncodeSize((emitAttr)scale); @@ -6807,7 +6646,7 @@ void emitter::emitIns_R_R_C( instrDesc* id = emitNewInstrDsp(attr, offs); id->idIns(ins); - id->idInsFmt(IF_RWR_RRD_MRD); + id->idInsFmt((ins == INS_mulx) ? IF_RWR_RWR_MRD : emitInsModeFormat(ins, IF_RRD_RRD_MRD)); id->idReg1(reg1); id->idReg2(reg2); id->idAddr()->iiaFieldHnd = fldHnd; @@ -6831,7 +6670,7 @@ void emitter::emitIns_R_R_R(instruction ins, emitAttr attr, regNumber targetReg, instrDesc* id = emitNewInstr(attr); id->idIns(ins); - id->idInsFmt(IF_RWR_RRD_RRD); + id->idInsFmt((ins == INS_mulx) ? IF_RWR_RWR_RRD : emitInsModeFormat(ins, IF_RRD_RRD_RRD)); id->idReg1(targetReg); id->idReg2(reg1); id->idReg3(reg2); @@ -6851,7 +6690,7 @@ void emitter::emitIns_R_R_S(instruction ins, emitAttr attr, regNumber reg1, regN instrDesc* id = emitNewInstr(attr); id->idIns(ins); - id->idInsFmt(IF_RWR_RRD_SRD); + id->idInsFmt((ins == INS_mulx) ? 
IF_RWR_RWR_SRD : emitInsModeFormat(ins, IF_RRD_RRD_SRD)); id->idReg1(reg1); id->idReg2(reg2); id->idAddr()->iiaLclVar.initLclVarAddr(varx, offs); @@ -7729,7 +7568,7 @@ void emitter::emitIns_C_R_I( instrDesc* id = emitNewInstrCnsDsp(attr, ival, offs); id->idIns(ins); - id->idInsFmt(IF_MWR_RRD_CNS); + id->idInsFmt(emitInsModeFormat(ins, IF_MRD_RRD_CNS)); id->idReg1(reg); id->idAddr()->iiaFieldHnd = fldHnd; @@ -7760,7 +7599,7 @@ void emitter::emitIns_S_R_I(instruction ins, emitAttr attr, int varNum, int offs instrDesc* id = emitNewInstrAmdCns(attr, 0, ival); id->idIns(ins); - id->idInsFmt(IF_SWR_RRD_CNS); + id->idInsFmt(emitInsModeFormat(ins, IF_SRD_RRD_CNS)); id->idReg1(reg); id->idAddr()->iiaLclVar.initLclVarAddr(varNum, offs); #ifdef DEBUG @@ -7793,7 +7632,7 @@ void emitter::emitIns_A_R_I(instruction ins, emitAttr attr, GenTreeIndir* indir, instrDesc* id = emitNewInstrAmdCns(attr, indir->Offset(), imm); id->idIns(ins); id->idReg1(reg); - emitHandleMemOp(indir, id, IF_AWR_RRD_CNS, ins); + emitHandleMemOp(indir, id, emitInsModeFormat(ins, IF_ARD_RRD_CNS), ins); UNATIVE_OFFSET size = emitInsSizeAM(id, insCodeMR(ins), imm); id->idCodeSize(size); dispIns(id); @@ -8189,32 +8028,6 @@ void emitter::emitIns_SIMD_R_R_A( } } -//------------------------------------------------------------------------ -// emitIns_SIMD_R_R_AR: emits the code for a SIMD instruction that takes a register operand, a base memory register, -// and that returns a value in register -// -// Arguments: -// ins -- The instruction being emitted -// attr -- The emit attribute -// targetReg -- The target register -// op1Reg -- The register of the first operand -// base -- The base register used for the memory address -// offset -- The memory offset -// -void emitter::emitIns_SIMD_R_R_AR( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber base, int offset) -{ - if (UseSimdEncoding()) - { - emitIns_R_R_AR(ins, attr, targetReg, op1Reg, base, offset); - } - else - { - 
emitIns_Mov(INS_movaps, attr, targetReg, op1Reg, /* canSkip */ true); - emitIns_R_AR(ins, attr, targetReg, base, offset); - } -} - //------------------------------------------------------------------------ // emitIns_SIMD_R_R_C: emits the code for a SIMD instruction that takes a register operand, a field handle + offset, // and that returns a value in register @@ -8303,7 +8116,6 @@ void emitter::emitIns_SIMD_R_R_S( } } -#ifdef FEATURE_HW_INTRINSICS //------------------------------------------------------------------------ // emitIns_SIMD_R_R_A_I: emits the code for a SIMD instruction that takes a register operand, a GenTreeIndir address, // an immediate operand, and that returns a value in register @@ -8330,32 +8142,6 @@ void emitter::emitIns_SIMD_R_R_A_I( } } -//------------------------------------------------------------------------ -// emitIns_SIMD_R_R_AR_I: emits the code for a SIMD instruction that takes a register operand, a base memory register, -// an immediate operand, and that returns a value in register -// -// Arguments: -// ins -- The instruction being emitted -// attr -- The emit attribute -// targetReg -- The target register -// op1Reg -- The register of the first operand -// base -- The base register used for the memory address -// ival -- The immediate value -// -void emitter::emitIns_SIMD_R_R_AR_I( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber base, int ival) -{ - if (UseSimdEncoding()) - { - emitIns_R_R_AR_I(ins, attr, targetReg, op1Reg, base, 0, ival); - } - else - { - emitIns_Mov(INS_movaps, attr, targetReg, op1Reg, /* canSkip */ true); - emitIns_R_AR_I(ins, attr, targetReg, base, 0, ival); - } -} - //------------------------------------------------------------------------ // emitIns_SIMD_R_R_C_I: emits the code for a SIMD instruction that takes a register operand, a field handle + offset, // an immediate operand, and that returns a value in register @@ -8444,6 +8230,7 @@ void emitter::emitIns_SIMD_R_R_S_I( 
} } +#ifdef FEATURE_HW_INTRINSICS //------------------------------------------------------------------------ // emitIns_SIMD_R_R_R_A: emits the code for a SIMD instruction that takes two register operands, a GenTreeIndir address, // and that returns a value in register @@ -9523,7 +9310,7 @@ void emitter::emitIns_Call(EmitCallType callType, // The function is "ireg" if id->idIsCallRegPtr(), // else [ireg+xmul*xreg+disp] - id->idInsFmt(IF_ARD); + id->idInsFmt(emitInsModeFormat(ins, IF_ARD)); id->idAddr()->iiaAddrMode.amBaseReg = ireg; id->idAddr()->iiaAddrMode.amIndxReg = xreg; @@ -9666,20 +9453,202 @@ void emitter::emitInsSanityCheck(instrDesc* id) } #endif -/***************************************************************************** - * - * Return the allocated size (in bytes) of the given instruction descriptor. - */ +//------------------------------------------------------------------------ +// emitSizeOfInsDsc_AMD: The allocated size, in bytes, of the AMD or AMD_CNS instrDesc +// +// Arguments: +// id - The instrDesc for which to get the size +// +// Returns: +// The allocated size, in bytes, of id +// +size_t emitter::emitSizeOfInsDsc_AMD(instrDesc* id) const +{ + assert(!emitIsSmallInsDsc(id)); + +#if defined(DEBUG) + assert((unsigned)id->idInsFmt() < emitFmtCount); + ID_OPS idOp = (ID_OPS)emitFmtToOps[id->idInsFmt()]; + assert((idOp == ID_OP_AMD) || (idOp == ID_OP_AMD_CNS)); +#endif // DEBUG + + if (id->idIsLargeCns()) + { + if (id->idIsLargeDsp()) + { + return sizeof(instrDescCnsAmd); + } + else + { + return sizeof(instrDescCns); + } + } + else if (id->idIsLargeDsp()) + { + return sizeof(instrDescAmd); + } + else + { + return sizeof(instrDesc); + } +} + +//------------------------------------------------------------------------ +// emitSizeOfInsDsc_CNS: The allocated size, in bytes, of the CNS or SCNS instrDesc +// +// Arguments: +// id - The instrDesc for which to get the size +// +// Returns: +// The allocated size, in bytes, of id +// +size_t 
emitter::emitSizeOfInsDsc_CNS(instrDesc* id) const +{ +#if defined(DEBUG) + assert((unsigned)id->idInsFmt() < emitFmtCount); + ID_OPS idOp = (ID_OPS)emitFmtToOps[id->idInsFmt()]; + assert((idOp == ID_OP_CNS) || (idOp == ID_OP_SCNS)); +#endif // DEBUG -size_t emitter::emitSizeOfInsDsc(instrDesc* id) + if (emitIsSmallInsDsc(id)) + { + return SMALL_IDSC_SIZE; + } + else if (id->idIsLargeCns()) + { + return sizeof(instrDescCns); + } + else + { + return sizeof(instrDesc); + } +} + +//------------------------------------------------------------------------ +// emitSizeOfInsDsc_NONE: The allocated size, in bytes, of the NONE instrDesc +// +// Arguments: +// id - The instrDesc for which to get the size +// +// Returns: +// The allocated size, in bytes, of id +// +size_t emitter::emitSizeOfInsDsc_NONE(instrDesc* id) const { - if (emitIsScnsInsDsc(id)) +#if defined(DEBUG) + assert((unsigned)id->idInsFmt() < emitFmtCount); + ID_OPS idOp = (ID_OPS)emitFmtToOps[id->idInsFmt()]; + assert(idOp == ID_OP_NONE); +#endif // DEBUG + + if (emitIsSmallInsDsc(id)) { return SMALL_IDSC_SIZE; } +#if FEATURE_LOOP_ALIGN + else if (id->idIns() == INS_align) + { + return sizeof(instrDescAlign); + } +#endif + else + { + return sizeof(instrDesc); + } +} +//------------------------------------------------------------------------ +// emitSizeOfInsDsc_SPEC: The allocated size, in bytes, of the CALL or SPEC instrDesc +// +// Arguments: +// id - The instrDesc for which to get the size +// +// Returns: +// The allocated size, in bytes, of id +// +size_t emitter::emitSizeOfInsDsc_SPEC(instrDesc* id) const +{ + assert(!emitIsSmallInsDsc(id)); + +#if defined(DEBUG) assert((unsigned)id->idInsFmt() < emitFmtCount); + ID_OPS idOp = (ID_OPS)emitFmtToOps[id->idInsFmt()]; + assert((idOp == ID_OP_CALL) || (idOp == ID_OP_SPEC)); +#endif // DEBUG + if (id->idIsLargeCall()) + { + /* Must be a "fat" indirect call descriptor */ + return sizeof(instrDescCGCA); + } + else if (id->idIsLargeCns()) + { + if 
(id->idIsLargeDsp()) + { + return sizeof(instrDescCnsDsp); + } + else + { + return sizeof(instrDescCns); + } + } + else if (id->idIsLargeDsp()) + { + return sizeof(instrDescDsp); + } + else + { + return sizeof(instrDesc); + } +} + +//------------------------------------------------------------------------ +// emitSizeOfInsDsc_DSP: The allocated size, in bytes, of the DSP or DSP_CNS instrDesc +// +// Arguments: +// id - The instrDesc for which to get the size +// +// Returns: +// The allocated size, in bytes, of id +// +size_t emitter::emitSizeOfInsDsc_DSP(instrDesc* id) const +{ + assert(!emitIsSmallInsDsc(id)); + +#if defined(DEBUG) + assert((unsigned)id->idInsFmt() < emitFmtCount); + ID_OPS idOp = (ID_OPS)emitFmtToOps[id->idInsFmt()]; + assert((idOp == ID_OP_DSP) || (idOp == ID_OP_DSP_CNS)); +#endif // DEBUG + + if (id->idIsLargeCns()) + { + if (id->idIsLargeDsp()) + { + return sizeof(instrDescCnsDsp); + } + else + { + return sizeof(instrDescCns); + } + } + else if (id->idIsLargeDsp()) + { + return sizeof(instrDescDsp); + } + else + { + return sizeof(instrDesc); + } +} + +/***************************************************************************** + * + * Return the allocated size (in bytes) of the given instruction descriptor. 
+ */ +size_t emitter::emitSizeOfInsDsc(instrDesc* id) const +{ + assert((unsigned)id->idInsFmt() < emitFmtCount); ID_OPS idOp = (ID_OPS)emitFmtToOps[id->idInsFmt()]; // An INS_call instruction may use a "fat" direct/indirect call descriptor @@ -9689,95 +9658,58 @@ size_t emitter::emitSizeOfInsDsc(instrDesc* id) if (id->idIns() == INS_call) { - assert(idOp == ID_OP_CALL || // is a direct call - idOp == ID_OP_SPEC || // is a indirect call - idOp == ID_OP_JMP); // is a local call to finally clause + assert((idOp == ID_OP_CALL) || // is a direct call + (idOp == ID_OP_SPEC) || // is a indirect call + (idOp == ID_OP_JMP)); // is a local call to finally clause } switch (idOp) { case ID_OP_NONE: -#if FEATURE_LOOP_ALIGN - if (id->idIns() == INS_align) - { - return sizeof(instrDescAlign); - } -#endif - break; + { + return emitSizeOfInsDsc_NONE(id); + } case ID_OP_LBL: + { return sizeof(instrDescLbl); + } case ID_OP_JMP: + { return sizeof(instrDescJmp); + } case ID_OP_CALL: case ID_OP_SPEC: - if (id->idIsLargeCall()) - { - /* Must be a "fat" indirect call descriptor */ - return sizeof(instrDescCGCA); - } - - FALLTHROUGH; + { + return emitSizeOfInsDsc_SPEC(id); + } case ID_OP_SCNS: case ID_OP_CNS: + { + return emitSizeOfInsDsc_CNS(id); + } + case ID_OP_DSP: case ID_OP_DSP_CNS: - if (id->idIsLargeCns()) - { - if (id->idIsLargeDsp()) - { - return sizeof(instrDescCnsDsp); - } - else - { - return sizeof(instrDescCns); - } - } - else - { - if (id->idIsLargeDsp()) - { - return sizeof(instrDescDsp); - } - else - { - return sizeof(instrDesc); - } - } + { + return emitSizeOfInsDsc_DSP(id); + } + case ID_OP_AMD: case ID_OP_AMD_CNS: - if (id->idIsLargeCns()) - { - if (id->idIsLargeDsp()) - { - return sizeof(instrDescCnsAmd); - } - else - { - return sizeof(instrDescCns); - } - } - else - { - if (id->idIsLargeDsp()) - { - return sizeof(instrDescAmd); - } - else - { - return sizeof(instrDesc); - } - } + { + return emitSizeOfInsDsc_AMD(id); + } default: + { NO_WAY("unexpected instruction 
descriptor format"); - break; + return sizeof(instrDesc); + } } - - return sizeof(instrDesc); } /***************************************************************************** @@ -10550,102 +10482,73 @@ void emitter::emitDispIns( /* Display a constant value if the instruction references one */ - if (!isNew) + if (!isNew && id->idHasMemGen()) { - switch (id->idInsFmt()) - { - int offs; - - case IF_MRD_RRD: - case IF_MWR_RRD: - case IF_MRW_RRD: - - case IF_RRD_MRD: - case IF_RWR_MRD: - case IF_RRW_MRD: + /* Is this actually a reference to a data section? */ + int offs = Compiler::eeGetJitDataOffs(id->idAddr()->iiaFieldHnd); - case IF_MRD_CNS: - case IF_MWR_CNS: - case IF_MRW_CNS: - case IF_MRW_SHF: + if (offs >= 0) + { + void* addr; - case IF_MRD: - case IF_MWR: - case IF_MRW: + /* Display a data section reference */ - case IF_MRD_OFF: + assert((unsigned)offs < emitConsDsc.dsdOffs); + addr = emitConsBlock ? emitConsBlock + offs : nullptr; - /* Is this actually a reference to a data section? */ +#if 0 + // TODO-XArch-Cleanup: Fix or remove this code. + /* Is the operand an integer or floating-point value? */ - offs = Compiler::eeGetJitDataOffs(id->idAddr()->iiaFieldHnd); + bool isFP = false; - if (offs >= 0) + if (CodeGen::instIsFP(id->idIns())) + { + switch (id->idIns()) { - void* addr; + case INS_fild: + case INS_fildl: + break; - /* Display a data section reference */ + default: + isFP = true; + break; + } + } - assert((unsigned)offs < emitConsDsc.dsdOffs); - addr = emitConsBlock ? emitConsBlock + offs : nullptr; + if (offs & 1) + printf("@CNS%02u", offs); + else + printf("@RWD%02u", offs); -#if 0 - // TODO-XArch-Cleanup: Fix or remove this code. - /* Is the operand an integer or floating-point value? */ + printf(" "); - bool isFP = false; + if (addr) + { + addr = 0; + // TODO-XArch-Bug?: + // This was busted by switching the order + // in which we output the code block vs. 
+ // the data blocks -- when we get here, + // the data block has not been filled in + // yet, so we'll display garbage. - if (CodeGen::instIsFP(id->idIns())) + if (isFP) { - switch (id->idIns()) - { - case INS_fild: - case INS_fildl: - break; - - default: - isFP = true; - break; - } + if (id->idOpSize() == EA_4BYTE) + printf("DF %f \n", addr ? *(float *)addr : 0); + else + printf("DQ %lf\n", addr ? *(double *)addr : 0); } - - if (offs & 1) - printf("@CNS%02u", offs); else - printf("@RWD%02u", offs); - - printf(" "); - - if (addr) { - addr = 0; - // TODO-XArch-Bug?: - // This was busted by switching the order - // in which we output the code block vs. - // the data blocks -- when we get here, - // the data block has not been filled in - // yet, so we'll display garbage. - - if (isFP) - { - if (id->idOpSize() == EA_4BYTE) - printf("DF %f \n", addr ? *(float *)addr : 0); - else - printf("DQ %lf\n", addr ? *(double *)addr : 0); - } + if (id->idOpSize() <= EA_4BYTE) + printf("DD %d \n", addr ? *(int *)addr : 0); else - { - if (id->idOpSize() <= EA_4BYTE) - printf("DD %d \n", addr ? *(int *)addr : 0); - else - printf("DQ %D \n", addr ? *(__int64 *)addr : 0); - } + printf("DQ %D \n", addr ? *(__int64 *)addr : 0); } + } #endif - } - break; - - default: - break; } } @@ -10747,6 +10650,7 @@ void emitter::emitDispIns( const char* methodName; case IF_CNS: + { val = emitGetInsSC(id); #ifdef TARGET_AMD64 // no 8-byte immediates allowed here! 
@@ -10784,11 +10688,12 @@ void emitter::emitDispIns( emitDispCommentForHandle(srcVal, id->idDebugOnlyInfo()->idMemCookie, id->idDebugOnlyInfo()->idFlags); } break; + } case IF_ARD: case IF_AWR: case IF_ARW: - + { if (id->idIsCallRegPtr()) { printf("%s", emitRegName(id->idAddr()->iiaAddrMode.amBaseReg)); @@ -10829,10 +10734,12 @@ void emitter::emitDispIns( printf("%s", methodName); } break; + } case IF_RRD_ARD: case IF_RWR_ARD: case IF_RRW_ARD: + { #ifdef TARGET_AMD64 if (ins == INS_movsxd) { @@ -10855,7 +10762,9 @@ void emitter::emitDispIns( emitDispCommentForHandle(id->idDebugOnlyInfo()->idMemCookie, 0, id->idDebugOnlyInfo()->idFlags); break; + } + case IF_RRD_ARD_CNS: case IF_RRW_ARD_CNS: case IF_RWR_ARD_CNS: { @@ -10878,7 +10787,9 @@ void emitter::emitDispIns( break; } + case IF_ARD_RRD_CNS: case IF_AWR_RRD_CNS: + case IF_ARW_RRD_CNS: { switch (ins) { @@ -10929,12 +10840,20 @@ void emitter::emitDispIns( break; } + case IF_RRD_RRD_ARD: case IF_RWR_RRD_ARD: + case IF_RRW_RRD_ARD: + case IF_RWR_RWR_ARD: + { printf("%s, %s, %s", emitRegName(id->idReg1(), attr), emitRegName(id->idReg2(), attr), sstr); emitDispAddrMode(id); break; + } + case IF_RRD_ARD_RRD: case IF_RWR_ARD_RRD: + case IF_RRW_ARD_RRD: + { if (ins == INS_vpgatherqd || ins == INS_vgatherqps) { attr = EA_16BYTE; @@ -10944,6 +10863,7 @@ void emitter::emitDispIns( emitDispAddrMode(id); printf(", %s", emitRegName(id->idReg2(), attr)); break; + } case IF_RWR_RRD_ARD_CNS: { @@ -10981,11 +10901,12 @@ void emitter::emitDispIns( case IF_ARD_RRD: case IF_AWR_RRD: case IF_ARW_RRD: - + { printf("%s", sstr); emitDispAddrMode(id); printf(", %s", emitRegName(id->idReg1(), attr)); break; + } case IF_AWR_RRD_RRD: { @@ -11000,7 +10921,7 @@ void emitter::emitDispIns( case IF_AWR_CNS: case IF_ARW_CNS: case IF_ARW_SHF: - + { printf("%s", sstr); emitDispAddrMode(id); emitGetInsAmdCns(id, &cnsVal); @@ -11026,11 +10947,12 @@ void emitter::emitDispIns( } } break; + } case IF_SRD: case IF_SWR: case IF_SRW: - + { printf("%s", 
sstr); #if !FEATURE_FIXED_OUT_ARGS @@ -11048,11 +10970,12 @@ void emitter::emitDispIns( emitDispShift(ins); break; + } case IF_SRD_RRD: case IF_SWR_RRD: case IF_SRW_RRD: - + { printf("%s", sstr); emitDispFrameRef(id->idAddr()->iiaLclVar.lvaVarNum(), id->idAddr()->iiaLclVar.lvaOffset(), @@ -11060,12 +10983,13 @@ void emitter::emitDispIns( printf(", %s", emitRegName(id->idReg1(), attr)); break; + } case IF_SRD_CNS: case IF_SWR_CNS: case IF_SRW_CNS: case IF_SRW_SHF: - + { printf("%s", sstr); emitDispFrameRef(id->idAddr()->iiaLclVar.lvaVarNum(), id->idAddr()->iiaLclVar.lvaOffset(), @@ -11094,8 +11018,12 @@ void emitter::emitDispIns( } } break; + } + case IF_SRD_RRD_CNS: case IF_SWR_RRD_CNS: + case IF_SRW_RRD_CNS: + { assert(IsSSEOrAVXInstruction(ins)); emitGetInsAmdCns(id, &cnsVal); @@ -11118,10 +11046,12 @@ void emitter::emitDispIns( goto PRINT_CONSTANT; } break; + } case IF_RRD_SRD: case IF_RWR_SRD: case IF_RRW_SRD: + { #ifdef TARGET_AMD64 if (ins == INS_movsxd) { @@ -11145,9 +11075,11 @@ void emitter::emitDispIns( id->idDebugOnlyInfo()->idVarRefOffs, asmfm); break; + } - case IF_RRW_SRD_CNS: + case IF_RRD_SRD_CNS: case IF_RWR_SRD_CNS: + case IF_RRW_SRD_CNS: { printf("%s, %s", emitRegName(id->idReg1(), attr), sstr); emitDispFrameRef(id->idAddr()->iiaLclVar.lvaVarNum(), id->idAddr()->iiaLclVar.lvaOffset(), @@ -11168,11 +11100,16 @@ void emitter::emitDispIns( break; } + case IF_RRD_RRD_SRD: case IF_RWR_RRD_SRD: + case IF_RRW_RRD_SRD: + case IF_RWR_RWR_SRD: + { printf("%s, %s, %s", emitRegName(id->idReg1(), attr), emitRegName(id->idReg2(), attr), sstr); emitDispFrameRef(id->idAddr()->iiaLclVar.lvaVarNum(), id->idAddr()->iiaLclVar.lvaOffset(), id->idDebugOnlyInfo()->idVarRefOffs, asmfm); break; + } case IF_RWR_RRD_SRD_CNS: { @@ -11208,9 +11145,30 @@ void emitter::emitDispIns( break; } + case IF_RRD_SRD_RRD: + case IF_RWR_SRD_RRD: + case IF_RRW_SRD_RRD: + { + printf("%s, %s", emitRegName(id->idReg1(), attr), sstr); + emitDispFrameRef(id->idAddr()->iiaLclVar.lvaVarNum(), 
id->idAddr()->iiaLclVar.lvaOffset(), + id->idDebugOnlyInfo()->idVarRefOffs, asmfm); + printf(", %s", emitRegName(id->idReg2(), attr)); + break; + } + + case IF_SWR_RRD_RRD: + { + printf("%s", sstr); + emitDispFrameRef(id->idAddr()->iiaLclVar.lvaVarNum(), id->idAddr()->iiaLclVar.lvaOffset(), + id->idDebugOnlyInfo()->idVarRefOffs, asmfm); + printf(", %s, %s", emitRegName(id->idReg1(), attr), emitRegName(id->idReg2(), attr)); + break; + } + case IF_RRD_RRD: case IF_RWR_RRD: case IF_RRW_RRD: + case IF_RRW_RRW: { switch (ins) { @@ -11302,13 +11260,10 @@ void emitter::emitDispIns( break; } - case IF_RRW_RRW: - assert(ins == INS_xchg); - printf("%s,", emitRegName(id->idReg1(), attr)); - printf(" %s", emitRegName(id->idReg2(), attr)); - break; - + case IF_RRD_RRD_RRD: case IF_RWR_RRD_RRD: + case IF_RRW_RRD_RRD: + case IF_RWR_RWR_RRD: { assert(IsVexOrEvexEncodableInstruction(ins)); assert(IsThreeOperandAVXInstruction(ins)); @@ -11333,6 +11288,7 @@ void emitter::emitDispIns( } case IF_RWR_RRD_RRD_CNS: + { assert(IsVexOrEvexEncodableInstruction(ins)); assert(IsThreeOperandAVXInstruction(ins)); printf("%s, ", emitRegName(id->idReg1(), attr)); @@ -11382,7 +11338,10 @@ void emitter::emitDispIns( val = emitGetInsSC(id); goto PRINT_CONSTANT; break; + } + case IF_RWR_RRD_RRD_RRD: + { assert(IsAVXOnlyInstruction(ins)); assert(UseVEXEncoding()); printf("%s, ", emitRegName(id->idReg1(), attr)); @@ -11390,7 +11349,11 @@ void emitter::emitDispIns( printf("%s, ", emitRegName(id->idReg3(), attr)); printf("%s", emitRegName(id->idReg4(), attr)); break; - case IF_RRW_RRW_CNS: + } + + case IF_RRD_RRD_CNS: + case IF_RWR_RRD_CNS: + case IF_RRW_RRD_CNS: { emitAttr tgtAttr = attr; @@ -11472,19 +11435,46 @@ void emitter::emitDispIns( case IF_RRD: case IF_RWR: case IF_RRW: + { printf("%s", emitRegName(id->idReg1(), attr)); emitDispShift(ins); break; + } + case IF_RRD_CNS: + case IF_RWR_CNS: + case IF_RRW_CNS: case IF_RRW_SHF: + { printf("%s", emitRegName(id->idReg1(), attr)); - emitDispShift(ins, 
(BYTE)emitGetInsSC(id)); + + emitGetInsCns(id, &cnsVal); + val = cnsVal.cnsVal; + + if (id->idInsFmt() == IF_RRW_SHF) + { + emitDispShift(ins, (BYTE)val); + } + else + { + printf(", "); + + if (cnsVal.cnsReloc) + { + emitDispReloc(val); + } + else + { + goto PRINT_CONSTANT; + } + } break; + } case IF_RRD_MRD: case IF_RWR_MRD: case IF_RRW_MRD: - + { if (ins == INS_movsx || ins == INS_movzx) { attr = EA_PTRSIZE; @@ -11505,9 +11495,11 @@ void emitter::emitDispIns( offs = emitGetInsDsp(id); emitDispClsVar(id->idAddr()->iiaFieldHnd, offs, ID_INFO_DSP_RELOC); break; + } - case IF_RRW_MRD_CNS: + case IF_RRD_MRD_CNS: case IF_RWR_MRD_CNS: + case IF_RRW_MRD_CNS: { printf("%s, %s", emitRegName(id->idReg1(), attr), sstr); offs = emitGetInsDsp(id); @@ -11528,7 +11520,9 @@ void emitter::emitDispIns( break; } + case IF_MRD_RRD_CNS: case IF_MWR_RRD_CNS: + case IF_MRW_RRD_CNS: { switch (ins) { @@ -11579,11 +11573,16 @@ void emitter::emitDispIns( break; } + case IF_RRD_RRD_MRD: case IF_RWR_RRD_MRD: + case IF_RRW_RRD_MRD: + case IF_RWR_RWR_MRD: + { printf("%s, %s, %s", emitRegName(id->idReg1(), attr), emitRegName(id->idReg2(), attr), sstr); offs = emitGetInsDsp(id); emitDispClsVar(id->idAddr()->iiaFieldHnd, offs, ID_INFO_DSP_RELOC); break; + } case IF_RWR_RRD_MRD_CNS: { @@ -11621,27 +11620,29 @@ void emitter::emitDispIns( } case IF_RWR_MRD_OFF: - + { printf("%s, %s", emitRegName(id->idReg1(), attr), "offset"); offs = emitGetInsDsp(id); emitDispClsVar(id->idAddr()->iiaFieldHnd, offs, ID_INFO_DSP_RELOC); break; + } case IF_MRD_RRD: case IF_MWR_RRD: case IF_MRW_RRD: - + { printf("%s", sstr); offs = emitGetInsDsp(id); emitDispClsVar(id->idAddr()->iiaFieldHnd, offs, ID_INFO_DSP_RELOC); printf(", %s", emitRegName(id->idReg1(), attr)); break; + } case IF_MRD_CNS: case IF_MWR_CNS: case IF_MRW_CNS: case IF_MRW_SHF: - + { printf("%s", sstr); offs = emitGetInsDsp(id); emitDispClsVar(id->idAddr()->iiaFieldHnd, offs, ID_INFO_DSP_RELOC); @@ -11665,43 +11666,51 @@ void emitter::emitDispIns( goto 
PRINT_CONSTANT; } break; + } case IF_MRD: case IF_MWR: case IF_MRW: - + { printf("%s", sstr); offs = emitGetInsDsp(id); emitDispClsVar(id->idAddr()->iiaFieldHnd, offs, ID_INFO_DSP_RELOC); emitDispShift(ins); break; + } case IF_MRD_OFF: - + { printf("offset "); offs = emitGetInsDsp(id); emitDispClsVar(id->idAddr()->iiaFieldHnd, offs, ID_INFO_DSP_RELOC); break; + } - case IF_RRD_CNS: - case IF_RWR_CNS: - case IF_RRW_CNS: - printf("%s, ", emitRegName(id->idReg1(), attr)); - val = emitGetInsSC(id); - if (id->idIsCnsReloc()) - { - emitDispReloc(val); - } - else - { - goto PRINT_CONSTANT; - } + case IF_RRD_MRD_RRD: + case IF_RWR_MRD_RRD: + case IF_RRW_MRD_RRD: + { + printf("%s, %s", emitRegName(id->idReg1(), attr), sstr); + offs = emitGetInsDsp(id); + emitDispClsVar(id->idAddr()->iiaFieldHnd, offs, ID_INFO_DSP_RELOC); + printf(", %s", emitRegName(id->idReg2(), attr)); break; + } + + case IF_MWR_RRD_RRD: + { + printf("%s", sstr); + offs = emitGetInsDsp(id); + emitDispClsVar(id->idAddr()->iiaFieldHnd, offs, ID_INFO_DSP_RELOC); + printf(", %s, %s", emitRegName(id->idReg1(), attr), emitRegName(id->idReg2(), attr)); + break; + } case IF_LABEL: case IF_RWR_LABEL: case IF_SWR_LABEL: - + { if (ins == INS_lea) { printf("%s, ", emitRegName(id->idReg1(), attr)); @@ -11738,9 +11747,11 @@ void emitter::emitDispIns( printf("L_M%03u_" FMT_BB, emitComp->compMethodID, id->idAddr()->iiaBBlabel->bbNum); } break; + } case IF_METHOD: case IF_METHPTR: + { methodName = emitComp->eeGetMethodFullName((CORINFO_METHOD_HANDLE)id->idDebugOnlyInfo()->idMemCookie); if (id->idInsFmt() == IF_METHPTR) @@ -11756,8 +11767,10 @@ void emitter::emitDispIns( } break; + } case IF_NONE: + { #if FEATURE_LOOP_ALIGN if (ins == INS_align) { @@ -11772,6 +11785,7 @@ void emitter::emitDispIns( } #endif break; + } default: printf("unexpected format %s", emitIfName(id->idInsFmt())); @@ -12031,6 +12045,8 @@ BYTE* emitter::emitOutputAlign(insGroup* ig, instrDesc* id, BYTE* dst) BYTE* emitter::emitOutputAM(BYTE* dst, 
instrDesc* id, code_t code, CnsVal* addc) { + assert(id->idHasMemAdr()); + regNumber reg; regNumber rgx; ssize_t dsp; @@ -12122,28 +12138,23 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // relative addressing which results in smaller instruction size. if ((ins == INS_mov) && (id->idReg1() == REG_EAX) && (reg == REG_NA) && (rgx == REG_NA)) { - switch (id->idInsFmt()) - { - case IF_RWR_ARD: + insFormat insFmt = id->idInsFmt(); - assert(code == (insCodeRM(ins) | (insEncodeReg345(id, REG_EAX, EA_PTRSIZE, NULL) << 8))); - - code &= ~((code_t)0xFFFFFFFF); - code |= 0xA0; - isMoffset = true; - break; - - case IF_AWR_RRD: - - assert(code == (insCodeMR(ins) | (insEncodeReg345(id, REG_EAX, EA_PTRSIZE, NULL) << 8))); + if (insFmt == IF_RWR_ARD) + { + assert(code == (insCodeRM(ins) | (insEncodeReg345(id, REG_EAX, EA_PTRSIZE, NULL) << 8))); - code &= ~((code_t)0xFFFFFFFF); - code |= 0xA2; - isMoffset = true; - break; + code &= ~((code_t)0xFFFFFFFF); + code |= 0xA0; + isMoffset = true; + } + else if (insFmt == IF_AWR_RRD) + { + assert(code == (insCodeMR(ins) | (insEncodeReg345(id, REG_EAX, EA_PTRSIZE, NULL) << 8))); - default: - break; + code &= ~((code_t)0xFFFFFFFF); + code |= 0xA2; + isMoffset = true; } } } @@ -12163,8 +12174,13 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) switch (id->idInsFmt()) { + case IF_RRD_RRD_ARD: case IF_RWR_RRD_ARD: + case IF_RRW_RRD_ARD: + case IF_RWR_RWR_ARD: + case IF_RRD_ARD_RRD: case IF_RWR_ARD_RRD: + case IF_RRW_ARD_RRD: case IF_RWR_RRD_ARD_CNS: case IF_RWR_RRD_ARD_RRD: { @@ -12172,8 +12188,18 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) break; } + case IF_RRD_ARD: + case IF_RWR_ARD: + case IF_RRW_ARD: + case IF_AWR_RRD_RRD: + { + src1 = id->idReg1(); + break; + } + default: { + assert(!"Unhandled insFmt in emitOutputAM"); src1 = id->idReg1(); break; } @@ -12184,7 +12210,10 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, 
code_t code, CnsVal* addc) } else if (IsDstSrcSrcAVXInstruction(ins)) { - code = insEncodeReg3456(id, id->idReg2(), size, code); + if (id->idHasReg2()) + { + code = insEncodeReg3456(id, id->idReg2(), size, code); + } } } @@ -12919,6 +12948,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) case IF_ARW_RRD: case IF_ARW_CNS: + case IF_ARW_SHF: if (id->idGCref() == GCT_BYREF) { assert(ins == INS_add || ins == INS_sub || ins == INS_sub_hide); @@ -12949,8 +12979,19 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) case IF_RWR_ARD: case IF_RRW_ARD: case IF_RWR_RRD_ARD: + case IF_RRW_RRD_ARD: + { + emitGCregDeadUpd(id->idReg1(), dst); + break; + } + + case IF_RWR_RWR_ARD: + { emitGCregDeadUpd(id->idReg1(), dst); + emitGCregDeadUpd(id->idReg2(), dst); break; + } + default: break; } @@ -12982,6 +13023,8 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) { + assert(id->idHasMemStk()); + int adr; int dsp; bool EBPbased; @@ -13408,8 +13451,19 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) case IF_RWR_SRD: // Register Write, Stack Read case IF_RRW_SRD: // Register Read/Write, Stack Read case IF_RWR_RRD_SRD: + case IF_RRW_RRD_SRD: + { + emitGCregDeadUpd(id->idReg1(), dst); + break; + } + + case IF_RWR_RWR_SRD: + { emitGCregDeadUpd(id->idReg1(), dst); + emitGCregDeadUpd(id->idReg2(), dst); break; + } + default: break; } @@ -13441,6 +13495,8 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) { + assert(id->idHasMemGen()); + BYTE* addr; CORINFO_FIELD_HANDLE fldh; ssize_t offs; @@ -13505,28 +13561,23 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // relative addressing which results in smaller instruction size. 
if (ins == INS_mov && id->idReg1() == REG_EAX) { - switch (id->idInsFmt()) - { - case IF_RWR_MRD: - - assert(code == (insCodeRM(ins) | (insEncodeReg345(id, REG_EAX, EA_PTRSIZE, NULL) << 8) | 0x0500)); - - code &= ~((code_t)0xFFFFFFFF); - code |= 0xA0; - isMoffset = true; - break; - - case IF_MWR_RRD: + insFormat insFmt = id->idInsFmt(); - assert(code == (insCodeMR(ins) | (insEncodeReg345(id, REG_EAX, EA_PTRSIZE, NULL) << 8) | 0x0500)); + if (insFmt == IF_RWR_MRD) + { + assert(code == (insCodeRM(ins) | (insEncodeReg345(id, REG_EAX, EA_PTRSIZE, NULL) << 8) | 0x0500)); - code &= ~((code_t)0xFFFFFFFF); - code |= 0xA2; - isMoffset = true; - break; + code &= ~((code_t)0xFFFFFFFF); + code |= 0xA0; + isMoffset = true; + } + else if (insFmt == IF_MWR_RRD) + { + assert(code == (insCodeMR(ins) | (insEncodeReg345(id, REG_EAX, EA_PTRSIZE, NULL) << 8) | 0x0500)); - default: - break; + code &= ~((code_t)0xFFFFFFFF); + code |= 0xA2; + isMoffset = true; } } } @@ -13854,6 +13905,7 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) case IF_MRD_CNS: case IF_MWR_CNS: case IF_MRW_CNS: + case IF_MRW_SHF: break; case IF_RRW_MRD: @@ -13881,8 +13933,19 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) case IF_RWR_MRD: case IF_RRW_MRD: case IF_RWR_RRD_MRD: + case IF_RRW_RRD_MRD: + { emitGCregDeadUpd(id->idReg1(), dst); break; + } + + case IF_RWR_RWR_MRD: + { + emitGCregDeadUpd(id->idReg1(), dst); + emitGCregDeadUpd(id->idReg2(), dst); + break; + } + default: break; } @@ -13920,6 +13983,8 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) regNumber reg = id->idReg1(); emitAttr size = id->idOpSize(); + assert(!id->idHasReg2()); + // We would to update GC info correctly assert(!IsSSEInstruction(ins)); assert(!IsVexOrEvexEncodableInstruction(ins)); @@ -14160,6 +14225,8 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) regNumber reg2 = id->idReg2(); emitAttr size = id->idOpSize(); + assert(!id->idHasReg3()); + if 
(IsAvx512OrPriorInstruction(ins)) { assert((ins != INS_movd) || (isFloatReg(reg1) != isFloatReg(reg2))); @@ -14383,7 +14450,7 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) break; case IF_RWR_RRD: - + { if (emitSyncThisObjReg != REG_NA && emitIGisInProlog(emitCurIG) && reg2 == (int)REG_ARG_0) { // We're relocating "this" in the prolog @@ -14407,9 +14474,10 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) emitGCregLiveUpd(id->idGCref(), reg1, dst); break; + } case IF_RRW_RRD: - + { switch (id->idIns()) { /* @@ -14475,8 +14543,10 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) } break; + } case IF_RRW_RRW: + { // This must be "xchg reg1, reg2" assert(id->idIns() == INS_xchg); @@ -14515,6 +14585,7 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) } } break; + } default: #ifdef DEBUG @@ -14530,6 +14601,7 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) switch (id->idInsFmt()) { case IF_RRD_CNS: + { // INS_mulEAX can not be used with any of these formats assert(ins != INS_mulEAX && ins != INS_imulEAX); @@ -14542,12 +14614,14 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) emitGCregDeadUpd(tgtReg, dst); } break; + } case IF_RWR_RRD: case IF_RRW_RRD: - case IF_RWR_RRD_RRD: + { emitGCregDeadUpd(reg1, dst); break; + } default: break; @@ -14627,10 +14701,20 @@ BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id) switch (id->idInsFmt()) { case IF_RWR_RRD_RRD: + case IF_RRW_RRD_RRD: case IF_RWR_RRD_RRD_CNS: case IF_RWR_RRD_RRD_RRD: + { + emitGCregDeadUpd(id->idReg1(), dst); + break; + } + + case IF_RWR_RWR_RRD: + { emitGCregDeadUpd(id->idReg1(), dst); + emitGCregDeadUpd(id->idReg2(), dst); break; + } default: break; @@ -14654,6 +14738,8 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id) ssize_t val = emitGetInsSC(id); bool valInByte = ((signed char)val == (target_ssize_t)val) && (ins != INS_mov) && (ins != INS_test); + assert(!id->idHasReg2()); + // BT reg,imm might be useful but it requires special handling of the 
immediate value // (it is always encoded in a byte). Let's not complicate things until this is needed. assert(ins != INS_bt); @@ -15327,7 +15413,7 @@ BYTE* emitter::emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* i) insGroup* tmpIGlabel = id->idAddr()->iiaIGlabel; bool tmpDspReloc = id->idIsDspReloc(); - id->idInsFmt(IF_SWR_CNS); + id->idInsFmt(emitInsModeFormat(ins, IF_SRD_CNS)); id->idAddr()->iiaLclVar = ((instrDescLbl*)id)->dstLclVar; id->idSetIsDspReloc(false); @@ -15353,7 +15439,7 @@ BYTE* emitter::emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* i) idAmd->idDebugOnlyInfo(id->idDebugOnlyInfo()); } - idAmd->idInsFmt(IF_RWR_ARD); + idAmd->idInsFmt(emitInsModeFormat(ins, IF_RRD_ARD)); idAmd->idAddr()->iiaAddrMode.amBaseReg = REG_NA; idAmd->idAddr()->iiaAddrMode.amIndxReg = REG_NA; emitSetAmdDisp(idAmd, distVal); // set the displacement @@ -15652,6 +15738,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) /* No operands */ /********************************************************************/ case IF_NONE: + { // the loop alignment pseudo instruction if (ins == INS_align) { @@ -15726,19 +15813,23 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } break; + } /********************************************************************/ /* Simple constant, local label, method */ /********************************************************************/ case IF_CNS: + { dst = emitOutputIV(dst, id); - sz = emitSizeOfInsDsc(id); + sz = emitSizeOfInsDsc_CNS(id); break; + } case IF_LABEL: case IF_RWR_LABEL: case IF_SWR_LABEL: + { assert(id->idGCref() == GCT_NONE); assert(id->idIsBound() || emitJmpInstHasNoCode(id)); @@ -15753,9 +15844,11 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } sz = (id->idInsFmt() == IF_SWR_LABEL ? 
sizeof(instrDescLbl) : sizeof(instrDescJmp)); break; + } case IF_METHOD: case IF_METHPTR: + { // Assume we'll be recording this call recCall = true; @@ -15950,6 +16043,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) #endif // DEBUG break; + } /********************************************************************/ /* One register operand */ @@ -15958,15 +16052,18 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_RRD: case IF_RWR: case IF_RRW: + { dst = emitOutputR(dst, id); sz = SMALL_IDSC_SIZE; break; + } /********************************************************************/ /* Register and register/constant */ /********************************************************************/ case IF_RRW_SHF: + { code = insCodeMR(ins); // Emit the VEX prefix if it exists code = AddSimdPrefixIfNeeded(id, code, size); @@ -15993,40 +16090,56 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); dst += emitOutputWord(dst, code); dst += emitOutputByte(dst, emitGetInsSC(id)); - sz = emitSizeOfInsDsc(id); + sz = emitSizeOfInsDsc_CNS(id); // Update GC info. 
assert(!id->idGCref()); emitGCregDeadUpd(id->idReg1(), dst); break; + } case IF_RRD_RRD: case IF_RWR_RRD: case IF_RRW_RRD: case IF_RRW_RRW: + { dst = emitOutputRR(dst, id); sz = SMALL_IDSC_SIZE; break; + } case IF_RRD_CNS: case IF_RWR_CNS: case IF_RRW_CNS: + { dst = emitOutputRI(dst, id); - sz = emitSizeOfInsDsc(id); + sz = emitSizeOfInsDsc_CNS(id); break; + } + case IF_RRD_RRD_RRD: case IF_RWR_RRD_RRD: + case IF_RRW_RRD_RRD: + case IF_RWR_RWR_RRD: + { dst = emitOutputRRR(dst, id); - sz = emitSizeOfInsDsc(id); + sz = sizeof(instrDesc); break; + } + case IF_RWR_RRD_RRD_CNS: case IF_RWR_RRD_RRD_RRD: + { dst = emitOutputRRR(dst, id); - sz = emitSizeOfInsDsc(id); dst += emitOutputByte(dst, emitGetInsSC(id)); + sz = emitSizeOfInsDsc_CNS(id); break; + } - case IF_RRW_RRW_CNS: + case IF_RRD_RRD_CNS: + case IF_RWR_RRD_CNS: + case IF_RRW_RRD_CNS: + { assert(id->idGCref() == GCT_NONE); // Get the 'base' opcode (it's a big one) @@ -16137,7 +16250,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } dst += emitOutputByte(dst, emitGetInsSC(id)); - sz = emitSizeOfInsDsc(id); + sz = emitSizeOfInsDsc_CNS(id); // Kill any GC ref in the destination register if necessary. 
if (!emitInsCanOnlyWriteSSE2OrAVXReg(id)) @@ -16145,6 +16258,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) emitGCregDeadUpd(id->idReg1(), dst); } break; + } /********************************************************************/ /* Address mode operand */ @@ -16153,7 +16267,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_ARD: case IF_AWR: case IF_ARW: - + { dst = emitCodeWithInstructionSize(dst, emitOutputAM(dst, id, insCodeMR(ins)), &callInstrSize); switch (ins) @@ -16190,13 +16304,24 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) goto DONE_CALL; default: - sz = emitSizeOfInsDsc(id); + { + if (id->idInsFmt() == IF_ARD) + { + sz = emitSizeOfInsDsc_SPEC(id); + } + else + { + sz = emitSizeOfInsDsc_AMD(id); + } break; + } } break; + } - case IF_RRW_ARD_CNS: + case IF_RRD_ARD_CNS: case IF_RWR_ARD_CNS: + case IF_RRW_ARD_CNS: { assert(IsAvx512OrPriorInstruction(ins)); emitGetInsAmdCns(id, &cnsVal); @@ -16215,21 +16340,28 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) dst = emitOutputAM(dst, id, code | regcode, &cnsVal); } - sz = emitSizeOfInsDsc(id); + sz = emitSizeOfInsDsc_AMD(id); break; } + case IF_ARD_RRD_CNS: case IF_AWR_RRD_CNS: + case IF_ARW_RRD_CNS: + { assert(IsAvx512OrPriorInstruction(ins)); emitGetInsAmdCns(id, &cnsVal); dst = emitOutputAM(dst, id, insCodeMR(ins), &cnsVal); - sz = emitSizeOfInsDsc(id); + sz = emitSizeOfInsDsc_AMD(id); break; + } case IF_RRD_ARD: case IF_RWR_ARD: case IF_RRW_ARD: + case IF_RRD_RRD_ARD: case IF_RWR_RRD_ARD: + case IF_RRW_RRD_ARD: + case IF_RWR_RWR_ARD: { code = insCodeRM(ins); @@ -16246,15 +16378,17 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) dst = emitOutputAM(dst, id, code | regcode); } - sz = emitSizeOfInsDsc(id); + sz = emitSizeOfInsDsc_AMD(id); break; } + case IF_RRD_ARD_RRD: case IF_RWR_ARD_RRD: + case IF_RRW_ARD_RRD: { assert(IsAVX2GatherInstruction(ins)); dst = 
emitOutputAM(dst, id, insCodeRM(ins)); - sz = emitSizeOfInsDsc(id); + sz = emitSizeOfInsDsc_AMD(id); break; } @@ -16278,7 +16412,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) dst = emitOutputAM(dst, id, code | regcode, &cnsVal); } - sz = emitSizeOfInsDsc(id); + sz = emitSizeOfInsDsc_AMD(id); break; } @@ -16301,7 +16435,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) dst = emitOutputAM(dst, id, code | regcode); } - sz = emitSizeOfInsDsc(id); + sz = emitSizeOfInsDsc_AMD(id); break; } @@ -16310,23 +16444,27 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) code = insCodeMR(ins); code = AddSimdPrefixIfNeeded(id, code, size); dst = emitOutputAM(dst, id, code); - sz = emitSizeOfInsDsc(id); + sz = emitSizeOfInsDsc_AMD(id); break; } case IF_ARD_CNS: case IF_AWR_CNS: case IF_ARW_CNS: + { emitGetInsAmdCns(id, &cnsVal); dst = emitOutputAM(dst, id, insCodeMI(ins), &cnsVal); - sz = emitSizeOfInsDsc(id); + sz = emitSizeOfInsDsc_AMD(id); break; + } case IF_ARW_SHF: + { emitGetInsAmdCns(id, &cnsVal); dst = emitOutputAM(dst, id, insCodeMR(ins), &cnsVal); - sz = emitSizeOfInsDsc(id); + sz = emitSizeOfInsDsc_AMD(id); break; + } /********************************************************************/ /* Stack-based operand */ @@ -16335,7 +16473,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_SRD: case IF_SWR: case IF_SRW: - + { assert(ins != INS_pop_hide); if (ins == INS_pop) { @@ -16359,32 +16497,41 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) { goto IND_CALL; } - break; + } case IF_SRD_CNS: case IF_SWR_CNS: case IF_SRW_CNS: + { emitGetInsCns(id, &cnsVal); dst = emitOutputSV(dst, id, insCodeMI(ins), &cnsVal); - sz = emitSizeOfInsDsc(id); + sz = emitSizeOfInsDsc_CNS(id); break; + } case IF_SRW_SHF: + { emitGetInsCns(id, &cnsVal); dst = emitOutputSV(dst, id, insCodeMR(ins), &cnsVal); - sz = emitSizeOfInsDsc(id); + sz = 
emitSizeOfInsDsc_CNS(id); break; + } + case IF_SRD_RRD_CNS: case IF_SWR_RRD_CNS: + case IF_SRW_RRD_CNS: + { assert(IsAvx512OrPriorInstruction(ins)); emitGetInsAmdCns(id, &cnsVal); dst = emitOutputSV(dst, id, insCodeMR(ins), &cnsVal); - sz = emitSizeOfInsDsc(id); + sz = emitSizeOfInsDsc_CNS(id); break; + } - case IF_RRW_SRD_CNS: + case IF_RRD_SRD_CNS: case IF_RWR_SRD_CNS: + case IF_RRW_SRD_CNS: { assert(IsAvx512OrPriorInstruction(ins)); emitGetInsCns(id, &cnsVal); @@ -16416,7 +16563,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) dst = emitOutputSV(dst, id, code | regcode, &cnsVal); } - sz = emitSizeOfInsDsc(id); + sz = emitSizeOfInsDsc_CNS(id); break; } @@ -16445,12 +16592,14 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputSV(dst, id, code | regcode); } - - sz = emitSizeOfInsDsc(id); + sz = sizeof(instrDesc); break; } + case IF_RRD_RRD_SRD: case IF_RWR_RRD_SRD: + case IF_RRW_RRD_SRD: + case IF_RWR_RWR_SRD: { assert(IsVexOrEvexEncodableInstruction(ins)); @@ -16470,6 +16619,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputSV(dst, id, code | regcode); } + sz = sizeof(instrDesc); break; } @@ -16496,8 +16646,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputSV(dst, id, code | regcode, &cnsVal); } - - sz = emitSizeOfInsDsc(id); + sz = emitSizeOfInsDsc_CNS(id); break; } @@ -16532,9 +16681,24 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputSV(dst, id, code | regcode); } + + sz = sizeof(instrDesc); break; } + case IF_RRD_SRD_RRD: + case IF_RWR_SRD_RRD: + case IF_RRW_SRD_RRD: + { + assert(IsAVX2GatherInstruction(ins)); + 
unreached(); + } + + case IF_SWR_RRD_RRD: + { + unreached(); + } + /********************************************************************/ /* Direct memory address */ /********************************************************************/ @@ -16542,18 +16706,30 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_MRD: case IF_MRW: case IF_MWR: - + { noway_assert(ins != INS_call); dst = emitOutputCV(dst, id, insCodeMR(ins) | 0x0500); - sz = emitSizeOfInsDsc(id); + if (id->idInsFmt() == IF_MRD) + { + sz = emitSizeOfInsDsc_SPEC(id); + } + else + { + sz = emitSizeOfInsDsc_DSP(id); + } break; + } case IF_MRD_OFF: + { dst = emitOutputCV(dst, id, insCodeMI(ins)); + sz = sizeof(instrDesc); break; + } - case IF_RRW_MRD_CNS: + case IF_RRD_MRD_CNS: case IF_RWR_MRD_CNS: + case IF_RRW_MRD_CNS: { assert(IsAvx512OrPriorInstruction(ins)); emitGetInsDcmCns(id, &cnsVal); @@ -16585,11 +16761,13 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) dst = emitOutputCV(dst, id, code | regcode | 0x0500, &cnsVal); } - sz = emitSizeOfInsDsc(id); + sz = emitSizeOfInsDsc_DSP(id); break; } + case IF_MRD_RRD_CNS: case IF_MWR_RRD_CNS: + case IF_MRW_RRD_CNS: { assert((ins == INS_vextractf128) || (ins == INS_vextractf32x8) || (ins == INS_vextractf64x2) || (ins == INS_vextractf64x4) || (ins == INS_vextracti128) || (ins == INS_vextracti32x8) || @@ -16598,7 +16776,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) emitGetInsDcmCns(id, &cnsVal); // we do not need VEX.vvvv to encode the register operand dst = emitOutputCV(dst, id, insCodeMR(ins), &cnsVal); - sz = emitSizeOfInsDsc(id); + sz = emitSizeOfInsDsc_DSP(id); break; } @@ -16643,11 +16821,14 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } } - sz = emitSizeOfInsDsc(id); + sz = emitSizeOfInsDsc_DSP(id); break; } + case IF_RRD_RRD_MRD: case IF_RWR_RRD_MRD: + case IF_RRW_RRD_MRD: + case IF_RWR_RWR_MRD: { // This should only be called on AVX 
instructions assert(IsVexOrEvexEncodableInstruction(ins)); @@ -16668,7 +16849,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputCV(dst, id, code | regcode | 0x0500); } - sz = emitSizeOfInsDsc(id); + sz = emitSizeOfInsDsc_DSP(id); break; } @@ -16695,7 +16876,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputCV(dst, id, code | regcode | 0x0500, &cnsVal); } - sz = emitSizeOfInsDsc(id); + sz = emitSizeOfInsDsc_DSP(id); break; } @@ -16719,7 +16900,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) regcode = insEncodeReg012(id, id->idReg1(), size, &code); dst = emitOutputCV(dst, id, code | 0x30 | regcode); - sz = emitSizeOfInsDsc(id); + sz = emitSizeOfInsDsc_DSP(id); break; } @@ -16754,23 +16935,41 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputCV(dst, id, code | regcode | 0x0500); } - sz = emitSizeOfInsDsc(id); + + sz = emitSizeOfInsDsc_DSP(id); break; } case IF_MRD_CNS: case IF_MWR_CNS: case IF_MRW_CNS: + { emitGetInsDcmCns(id, &cnsVal); dst = emitOutputCV(dst, id, insCodeMI(ins) | 0x0500, &cnsVal); - sz = emitSizeOfInsDsc(id); + sz = emitSizeOfInsDsc_DSP(id); break; + } case IF_MRW_SHF: + { emitGetInsDcmCns(id, &cnsVal); dst = emitOutputCV(dst, id, insCodeMR(ins) | 0x0500, &cnsVal); - sz = emitSizeOfInsDsc(id); + sz = emitSizeOfInsDsc_DSP(id); break; + } + + case IF_RRD_MRD_RRD: + case IF_RWR_MRD_RRD: + case IF_RRW_MRD_RRD: + { + assert(IsAVX2GatherInstruction(ins)); + unreached(); + } + + case IF_MWR_RRD_RRD: + { + unreached(); + } /********************************************************************/ /* oops */ @@ -16942,150 +17141,61 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) #pragma 
warning(pop) #endif -emitter::insFormat emitter::getMemoryOperation(instrDesc* id) +emitter::insFormat emitter::getMemoryOperation(instrDesc* id) const { - insFormat result = IF_NONE; instruction ins = id->idIns(); insFormat insFmt = id->idInsFmt(); if (ins == INS_lea) { // an INS_lea instruction doesn't actually read memory - insFmt = IF_NONE; + return IF_NONE; } - switch (insFmt) - { - case IF_NONE: - case IF_LABEL: - case IF_RWR_LABEL: - case IF_METHOD: - case IF_CNS: - - case IF_RRD: - case IF_RWR: - case IF_RRW: - case IF_RRD_CNS: - case IF_RWR_CNS: - case IF_RRW_CNS: - case IF_RRW_SHF: - case IF_RRD_RRD: - case IF_RWR_RRD: - case IF_RRW_RRD: - case IF_RRW_RRW: - case IF_RRW_RRW_CNS: - case IF_RWR_RRD_RRD: - case IF_RWR_RRD_RRD_CNS: - case IF_RWR_RRD_RRD_RRD: - // none, or register only - result = IF_NONE; - break; - - case IF_ARD: - case IF_RRD_ARD: - case IF_RWR_ARD: - case IF_RRW_ARD: - case IF_RWR_ARD_CNS: - case IF_RWR_RRD_ARD: - case IF_RRW_ARD_CNS: - case IF_RWR_ARD_RRD: - case IF_RWR_RRD_ARD_CNS: - case IF_RWR_RRD_ARD_RRD: - case IF_ARD_CNS: - case IF_ARD_RRD: - // Address [reg+reg*scale+cns] - read - result = IF_ARD; - break; - - case IF_AWR: - case IF_AWR_RRD: - case IF_AWR_CNS: - case IF_AWR_RRD_CNS: - case IF_AWR_RRD_RRD: - // Address [reg+reg*scale+cns] - write - result = IF_AWR; - break; - - case IF_ARW: - case IF_ARW_RRD: - case IF_ARW_CNS: - case IF_ARW_SHF: - // Address [reg+reg*scale+cns] - read and write - result = IF_ARW; - break; - - case IF_MRD: - case IF_MRD_CNS: - case IF_MRD_OFF: - case IF_MRD_RRD: - case IF_RRD_MRD: - case IF_RRW_MRD: - case IF_RWR_MRD: - case IF_RWR_MRD_CNS: - case IF_RWR_MRD_OFF: - case IF_RWR_RRD_MRD: - case IF_RRW_MRD_CNS: - case IF_RWR_RRD_MRD_CNS: - case IF_RWR_RRD_MRD_RRD: - case IF_METHPTR: - // Address [cns] - read - result = IF_MRD; - break; + return ExtractMemoryFormat(insFmt); +} - case IF_MWR: - case IF_MWR_CNS: - case IF_MWR_RRD: - case IF_MWR_RRD_CNS: - // Address [cns] - write - result = IF_MWR; - 
break; +emitter::insFormat emitter::ExtractMemoryFormat(insFormat insFmt) const +{ + IS_INFO isInfo = emitGetSchedInfo(insFmt); - case IF_MRW: - case IF_MRW_CNS: - case IF_MRW_RRD: - case IF_MRW_SHF: - // Address [cns] - read and write - result = IF_MWR; - break; + IS_INFO mask = static_cast(isInfo & (IS_GM_RD | IS_GM_RW | IS_GM_WR)); + if (mask != 0) + { + static_assert_no_msg(0 == (IS_GM_RD >> 13)); + static_assert_no_msg(1 == (IS_GM_WR >> 13)); + static_assert_no_msg(2 == (IS_GM_RW >> 13)); - case IF_SRD: - case IF_SRD_CNS: - case IF_SRD_RRD: + insFormat result = static_cast(IF_MRD + (mask >> 13)); + assert((result == IF_MRD) || (result == IF_MWR) || (result == IF_MRW)); + return result; + } - case IF_RRD_SRD: - case IF_RRW_SRD: - case IF_RWR_SRD: - case IF_RWR_SRD_CNS: - case IF_RWR_RRD_SRD: - case IF_RRW_SRD_CNS: - case IF_RWR_RRD_SRD_CNS: - case IF_RWR_RRD_SRD_RRD: - // Stack [RSP] - read - result = IF_SRD; - break; + mask = static_cast(isInfo & (IS_SF_RD | IS_SF_RW | IS_SF_WR)); + if (mask != 0) + { + static_assert_no_msg(0 == (IS_SF_RD >> 16)); + static_assert_no_msg(1 == (IS_SF_WR >> 16)); + static_assert_no_msg(2 == (IS_SF_RW >> 16)); - case IF_SWR: - case IF_SWR_CNS: - case IF_SWR_RRD: - case IF_SWR_RRD_CNS: - case IF_SWR_LABEL: - // Stack [RSP] - write - result = IF_SWR; - break; + insFormat result = static_cast(IF_SRD + (mask >> 16)); + assert((result == IF_SRD) || (result == IF_SWR) || (result == IF_SRW)); + return result; + } - case IF_SRW: - case IF_SRW_CNS: - case IF_SRW_RRD: - case IF_SRW_SHF: - // Stack [RSP] - read and write - result = IF_SWR; - break; + mask = static_cast(isInfo & (IS_AM_RD | IS_AM_RW | IS_AM_WR)); + if (mask != 0) + { + static_assert_no_msg(0 == (IS_AM_RD >> 19)); + static_assert_no_msg(1 == (IS_AM_WR >> 19)); + static_assert_no_msg(2 == (IS_AM_RW >> 19)); - default: - result = IF_NONE; - break; + insFormat result = static_cast(IF_ARD + (mask >> 19)); + assert((result == IF_ARD) || (result == IF_AWR) || (result == IF_ARW)); + 
return result; } - return result; + + return IF_NONE; } #if defined(DEBUG) || defined(LATE_DISASM) @@ -17563,7 +17673,7 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_shld: case INS_shrd: result.insLatency += PERFSCORE_LATENCY_3C; - if (insFmt == IF_RRW_RRW_CNS) + if (insFmt == IF_RRW_RRD_CNS) { // ins reg, reg, cns result.insThroughput = PERFSCORE_THROUGHPUT_1C; @@ -17681,10 +17791,8 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins break; case IF_SRD: - result.insThroughput = PERFSCORE_THROUGHPUT_3C; - break; - case IF_ARD: + case IF_MRD: result.insThroughput = PERFSCORE_THROUGHPUT_3C; break; @@ -18116,6 +18224,13 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_blendvps: case INS_blendvpd: case INS_pblendvb: + case INS_vpcmpeqb: + case INS_vpcmpeqw: + case INS_vpcmpeqd: + case INS_vpcmpeqq: + case INS_vpcmpgtb: + case INS_vpcmpgtw: + case INS_vpcmpgtd: case INS_vpsllvd: case INS_vpsllvq: case INS_vpsllvw: @@ -18318,6 +18433,10 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_cmppd: case INS_cmpss: case INS_cmpsd: + case INS_vcmpps: + case INS_vcmppd: + case INS_vcmpss: + case INS_vcmpsd: result.insThroughput = PERFSCORE_THROUGHPUT_2X; result.insLatency = PERFSCORE_LATENCY_4C; break; diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index 0c01f9aaca7b3..865847f22a497 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -105,6 +105,8 @@ static bool IsKInstruction(instruction ins); static regNumber getBmiRegNumber(instruction ins); static regNumber getSseShiftRegNumber(instruction ins); +bool HasVexEncoding(instruction ins) const; +bool HasEvexEncoding(instruction ins) const; bool IsVexEncodableInstruction(instruction ins) const; bool IsEvexEncodableInstruction(instruction ins) const; bool IsVexOrEvexEncodableInstruction(instruction ins) const; @@ -493,6 
+495,12 @@ void emitAdjustStackDepthPushPop(instruction ins); void emitAdjustStackDepth(instruction ins, ssize_t val); #endif // !FEATURE_FIXED_OUT_ARGS +size_t emitSizeOfInsDsc_AMD(instrDesc* id) const; +size_t emitSizeOfInsDsc_CNS(instrDesc* id) const; +size_t emitSizeOfInsDsc_DSP(instrDesc* id) const; +size_t emitSizeOfInsDsc_NONE(instrDesc* id) const; +size_t emitSizeOfInsDsc_SPEC(instrDesc* id) const; + /***************************************************************************** * * Convert between an index scale in bytes to a smaller encoding used for @@ -682,18 +690,13 @@ void emitIns_AX_R(instruction ins, emitAttr attr, regNumber ireg, regNumber reg, void emitIns_SIMD_R_R_I(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int ival); void emitIns_SIMD_R_R_A(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTreeIndir* indir); -void emitIns_SIMD_R_R_AR( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber base, int offset); void emitIns_SIMD_R_R_C( instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, CORINFO_FIELD_HANDLE fldHnd, int offs); void emitIns_SIMD_R_R_R(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg); void emitIns_SIMD_R_R_S(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int varx, int offs); -#ifdef FEATURE_HW_INTRINSICS void emitIns_SIMD_R_R_A_I( instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTreeIndir* indir, int ival); -void emitIns_SIMD_R_R_AR_I( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber base, int ival); void emitIns_SIMD_R_R_C_I(instruction ins, emitAttr attr, regNumber targetReg, @@ -706,6 +709,7 @@ void emitIns_SIMD_R_R_R_I( void emitIns_SIMD_R_R_S_I( instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int varx, int offs, int ival); +#ifdef FEATURE_HW_INTRINSICS void emitIns_SIMD_R_R_R_A( instruction 
ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, GenTreeIndir* indir); void emitIns_SIMD_R_R_R_AR( diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index 5bffbe41c839a..d61679b27f80f 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -681,12 +681,15 @@ void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, regNumber targetReg = node->GetRegNum(); GenTree* op1 = node->Op(1); GenTree* op2 = node->Op(2); - emitter* emit = GetEmitter(); + regNumber op1Reg = op1->GetRegNum(); - // TODO-XArch-CQ: Commutative operations can have op1 be contained - // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained + assert(targetReg != REG_NA); - regNumber op1Reg = op1->GetRegNum(); + if (op2->isContained() || op2->isUsedFromSpillTemp()) + { + assert(HWIntrinsicInfo::SupportsContainment(node->GetHWIntrinsicId())); + assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2); + } if (ins == INS_insertps) { @@ -702,68 +705,15 @@ void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, // insertps can also contain op2 when it is zero in which case // we just reuse op1Reg since ival specifies the entry to zero - emit->emitIns_SIMD_R_R_R_I(ins, simdSize, targetReg, op1Reg, op1Reg, ival); + GetEmitter()->emitIns_SIMD_R_R_R_I(ins, simdSize, targetReg, op1Reg, op1Reg, ival); return; } } - assert(targetReg != REG_NA); assert(op1Reg != REG_NA); - OperandDesc op2Desc = genOperandDesc(op2); - - if (op2Desc.IsContained()) - { - assert(HWIntrinsicInfo::SupportsContainment(node->GetHWIntrinsicId())); - assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2); - } - - switch (op2Desc.GetKind()) - { - case OperandKind::ClsVar: - emit->emitIns_SIMD_R_R_C_I(ins, simdSize, targetReg, op1Reg, op2Desc.GetFieldHnd(), 0, ival); - break; - - case OperandKind::Local: - 
emit->emitIns_SIMD_R_R_S_I(ins, simdSize, targetReg, op1Reg, op2Desc.GetVarNum(), op2Desc.GetLclOffset(), - ival); - break; - - case OperandKind::Indir: - { - // Until we improve the handling of addressing modes in the emitter, we'll create a - // temporary GT_IND to generate code with. - GenTreeIndir indirForm; - GenTreeIndir* indir = op2Desc.GetIndirForm(&indirForm); - emit->emitIns_SIMD_R_R_A_I(ins, simdSize, targetReg, op1Reg, indir, ival); - } - break; - - case OperandKind::Reg: - { - regNumber op2Reg = op2Desc.GetReg(); - - if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler)) - { - // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic. - // - // For non-commutative intrinsics, we should have ensured that op2 was marked - // delay free in order to prevent it from getting assigned the same register - // as target. However, for commutative intrinsics, we can just swap the operands - // in order to have "reg2 = reg2 op reg1" which will end up producing the right code. 
- - noway_assert(node->OperIsCommutative()); - op2Reg = op1Reg; - op1Reg = targetReg; - } - - emit->emitIns_SIMD_R_R_R_I(ins, simdSize, targetReg, op1Reg, op2Reg, ival); - } - break; - - default: - unreached(); - } + bool isRMW = node->isRMWHWIntrinsic(compiler); + inst_RV_RV_TT_IV(ins, simdSize, targetReg, op1Reg, op2, ival, isRMW); } //------------------------------------------------------------------------ diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index c8e81f69f5050..0c677837b8cf2 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -1205,11 +1205,11 @@ HARDWARE_INTRINSIC(SSE41, PTEST, HARDWARE_INTRINSIC(AVX, PTEST, 0, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX512F, MoveMaskSpecial, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(AVX512F, CompareEqualSpecial, 64, 2, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_cmpps, INS_cmppd}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(AVX512F, CompareGreaterThanOrEqualSpecial, 64, 2, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_cmpps, INS_cmppd}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(AVX512F, CompareGreaterThanSpecial, 64, 2, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, 
INS_cmpps, INS_cmppd}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(AVX512F, CompareLessThanOrEqualSpecial, 64, 2, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_cmpps, INS_cmppd}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(AVX512F, CompareLessThanSpecial, 64, 2, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_cmpps, INS_cmppd}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(AVX512F, CompareEqualSpecial, 64, 2, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(AVX512F, CompareGreaterThanOrEqualSpecial, 64, 2, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(AVX512F, CompareGreaterThanSpecial, 64, 2, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(AVX512F, CompareLessThanOrEqualSpecial, 64, 2, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) 
+HARDWARE_INTRINSIC(AVX512F, CompareLessThanSpecial, 64, 2, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(AVX512F, MoveMaskToVectorSpecial, 64, 1, {INS_vpmovm2b, INS_vpmovm2b, INS_vpmovm2w, INS_vpmovm2w, INS_vpmovm2d, INS_vpmovm2d, INS_vpmovm2q, INS_vpmovm2q, INS_vpmovm2d, INS_vpmovm2q}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(AVX512F, KORTEST, 0, 1, {INS_kortestq, INS_kortestq, INS_kortestd, INS_kortestd, INS_kortestw, INS_kortestw, INS_kortestb, INS_kortestb, INS_kortestw, INS_kortestb}, HW_Category_Special, HW_Flag_NoRMWSemantics) diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index a36807b426cf8..b942ddd6d878d 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -1164,6 +1164,76 @@ void CodeGen::inst_RV_RV_TT( unreached(); } } + +//------------------------------------------------------------------------ +// inst_RV_RV_TT_IV: Generates an instruction that takes 3 operands: +// a register operand, an operand that may be in memory or register, +// and an immediate value. 
The result is returned in register +// +// Arguments: +// ins -- The instruction being emitted +// size -- The emit size attribute +// targetReg -- The target register +// op1Reg -- The first operand register +// op2 -- The second operand, which may be a memory node or a node producing a register +// ival -- The immediate operand +// isRMW -- true if the instruction is RMW; otherwise, false +// +void CodeGen::inst_RV_RV_TT_IV( + instruction ins, emitAttr size, regNumber targetReg, regNumber op1Reg, GenTree* op2, int8_t ival, bool isRMW) +{ + emitter* emit = GetEmitter(); + noway_assert(emit->emitVerifyEncodable(ins, EA_SIZE(size), op1Reg)); + + // TODO-XArch-CQ: Commutative operations can have op1 be contained + // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained + + OperandDesc op2Desc = genOperandDesc(op2); + switch (op2Desc.GetKind()) + { + case OperandKind::ClsVar: + emit->emitIns_SIMD_R_R_C_I(ins, size, targetReg, op1Reg, op2Desc.GetFieldHnd(), 0, ival); + break; + + case OperandKind::Local: + emit->emitIns_SIMD_R_R_S_I(ins, size, targetReg, op1Reg, op2Desc.GetVarNum(), op2Desc.GetLclOffset(), ival); + break; + + case OperandKind::Indir: + { + // Until we improve the handling of addressing modes in the emitter, we'll create a + // temporary GT_IND to generate code with. + GenTreeIndir indirForm; + GenTreeIndir* indir = op2Desc.GetIndirForm(&indirForm); + emit->emitIns_SIMD_R_R_A_I(ins, size, targetReg, op1Reg, indir, ival); + } + break; + + case OperandKind::Reg: + { + regNumber op2Reg = op2Desc.GetReg(); + + if ((op1Reg != targetReg) && (op2Reg == targetReg) && isRMW) + { + // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic. + // + // For non-commutative intrinsics, we should have ensured that op2 was marked + // delay free in order to prevent it from getting assigned the same register + // as target. 
However, for commutative intrinsics, we can just swap the operands + // in order to have "reg2 = reg2 op reg1" which will end up producing the right code. + + op2Reg = op1Reg; + op1Reg = targetReg; + } + + emit->emitIns_SIMD_R_R_R_I(ins, size, targetReg, op1Reg, op2Reg, ival); + } + break; + + default: + unreached(); + } +} #endif // TARGET_XARCH /***************************************************************************** diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index 73f6dd45e704b..589df2cd7bdeb 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -195,8 +195,8 @@ INST3(addps, "addps", IUM_WR, BAD_CODE, BAD_CODE, INST3(addss, "addss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x58), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add scalar singles INST3(andnps, "andnps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x55), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // And-Not packed singles INST3(andps, "andps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x54), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // AND packed singles -INST3(cmpps, "cmpps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0xC2), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // compare packed singles -INST3(cmpss, "cmpss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xC2), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // compare scalar singles +INST3(cmpps, "cmpps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0xC2), INS_TT_NONE, Input_32Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // compare packed singles +INST3(cmpss, "cmpss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xC2), INS_TT_NONE, Input_32Bit | REX_WIG | Encoding_VEX | 
INS_Flags_IsDstDstSrcAVXInstruction) // compare scalar singles INST3(comiss, "comiss", IUM_RD, BAD_CODE, BAD_CODE, PCKFLT(0x2F), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Writes_PF | Writes_CF) // ordered compare singles INST3(cvtsi2ss32, "cvtsi2ss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x2A), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // cvt DWORD to scalar single INST3(cvtsi2ss64, "cvtsi2ss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x2A), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // cvt QWORD to scalar single @@ -244,8 +244,8 @@ INST3(addpd, "addpd", IUM_WR, BAD_CODE, BAD_CODE, INST3(addsd, "addsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x58), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add scalar doubles INST3(andnpd, "andnpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x55), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // And-Not packed doubles INST3(andpd, "andpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x54), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // AND packed doubles -INST3(cmppd, "cmppd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC2), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // compare packed doubles -INST3(cmpsd, "cmpsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xC2), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // compare scalar doubles +INST3(cmppd, "cmppd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC2), INS_TT_NONE, Input_64Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // compare packed doubles 
+INST3(cmpsd, "cmpsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xC2), INS_TT_NONE, Input_64Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // compare scalar doubles INST3(comisd, "comisd", IUM_RD, BAD_CODE, BAD_CODE, PCKDBL(0x2F), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Writes_PF | Writes_CF) // ordered compare doubles INST3(cvtdq2pd, "cvtdq2pd", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xE6), INS_TT_HALF, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // cvt packed DWORDs to doubles INST3(cvtdq2ps, "cvtdq2ps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5B), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // cvt packed DWORDs to singles @@ -301,12 +301,12 @@ INST3(pand, "pand", IUM_WR, BAD_CODE, BAD_CODE, INST3(pandn, "pandn", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND NOT of two xmm regs INST3(pavgb, "pavgb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE0), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Average of packed byte integers INST3(pavgw, "pavgw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE3), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Average of packed word integers -INST3(pcmpeqb, "pcmpeqb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x74), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 8-bit integers for equality -INST3(pcmpeqd, "pcmpeqd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x76), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 32-bit integers for equality -INST3(pcmpeqw, "pcmpeqw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x75), 
INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 16-bit integers for equality -INST3(pcmpgtb, "pcmpgtb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x64), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 8-bit signed integers for greater than -INST3(pcmpgtd, "pcmpgtd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x66), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 32-bit signed integers for greater than -INST3(pcmpgtw, "pcmpgtw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x65), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 16-bit signed integers for greater than +INST3(pcmpeqb, "pcmpeqb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x74), INS_TT_NONE, Input_8Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 8-bit integers for equality +INST3(pcmpeqd, "pcmpeqd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x76), INS_TT_NONE, Input_32Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 32-bit integers for equality +INST3(pcmpeqw, "pcmpeqw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x75), INS_TT_NONE, Input_16Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 16-bit integers for equality +INST3(pcmpgtb, "pcmpgtb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x64), INS_TT_NONE, Input_8Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 8-bit signed integers for greater than +INST3(pcmpgtd, "pcmpgtd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x66), INS_TT_NONE, Input_32Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 32-bit signed integers for greater than +INST3(pcmpgtw, "pcmpgtw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x65), INS_TT_NONE, Input_16Bit 
| REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 16-bit signed integers for greater than INST3(pextrw, "pextrw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC5), INS_TT_TUPLE1_SCALAR, Input_16Bit | REX_W0 | Encoding_VEX | Encoding_EVEX) // Extract 16-bit value into a r32 with zero extended to 32-bits INST3(pinsrw, "pinsrw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC4), INS_TT_TUPLE1_SCALAR, Input_16Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert word at index INST3(pmaddwd, "pmaddwd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF5), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst @@ -335,8 +335,8 @@ INST3(psrldq, "psrldq", IUM_WR, BAD_CODE, PCKDBL(0x73), INST3(psrlq, "psrlq", IUM_WR, BAD_CODE, PCKDBL(0x73), PCKDBL(0xD3), INS_TT_FULL | INS_TT_MEM128, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift right logical of 64-bit integers INST3(psrlw, "psrlw", IUM_WR, BAD_CODE, PCKDBL(0x71), PCKDBL(0xD1), INS_TT_FULL_MEM | INS_TT_MEM128, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift right logical of 16-bit integers INST3(psubb, "psubb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF8), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed word (16-bit) integers -INST3(psubd, "psubd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFA), INS_TT_FULL_MEM, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed double-word (32-bit) integers -INST3(psubq, "psubq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFB), INS_TT_FULL_MEM, Input_64Bit | 
REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // subtract packed quad-word (64-bit) integers +INST3(psubd, "psubd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFA), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed double-word (32-bit) integers +INST3(psubq, "psubq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFB), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // subtract packed quad-word (64-bit) integers INST3(psubw, "psubw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF9), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed word (16-bit) integers INST3(psubsb, "psubsb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE8), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed 8-bit integers in b from packed 8-bit integers in a using saturation INST3(psubsw, "psubsw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE9), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed 16-bit integers in b from packed 16-bit integers in a using saturation @@ -414,7 +414,7 @@ INST3(mpsadbw, "mpsadbw", IUM_WR, BAD_CODE, BAD_CODE, INST3(packusdw, "packusdw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x2B), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) int to unsigned short with saturation INST3(pblendvb, "pblendvb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x10), INS_TT_NONE, Input_8Bit | REX_W0) // Variable Blend Packed Bytes INST3(pblendw, "pblendw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0E), INS_TT_NONE, Input_16Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed Words -INST3(pcmpeqq, "pcmpeqq", IUM_WR, BAD_CODE, BAD_CODE, 
SSE38(0x29), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 64-bit integers for equality +INST3(pcmpeqq, "pcmpeqq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x29), INS_TT_NONE, Input_64Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 64-bit integers for equality INST3(pextrb, "pextrb", IUM_WR, SSE3A(0x14), BAD_CODE, BAD_CODE, INS_TT_TUPLE1_SCALAR, Input_8Bit | REX_W0 | Encoding_VEX | Encoding_EVEX) // Extract Byte INST3(pextrd, "pextrd", IUM_WR, SSE3A(0x16), BAD_CODE, BAD_CODE, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX) // Extract Dword INST3(pextrq, "pextrq", IUM_WR, SSE3A(0x16), BAD_CODE, BAD_CODE, INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX) // Extract Qword @@ -445,14 +445,14 @@ INST3(pmovzxwd, "pmovzxwd", IUM_WR, BAD_CODE, BAD_CODE, INST3(pmovzxwq, "pmovzxwq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x34), INS_TT_QUARTER_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed zero extend short to long INST3(pmuldq, "pmuldq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x28), INS_TT_FULL, Input_32Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed multiply 32-bit signed integers and store 64-bit result INST3(pmulld, "pmulld", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x40), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed multiply 32 bit unsigned integers and store lower 32 bits of each result -INST3(ptest, "ptest", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x17), INS_TT_NONE, REX_WIG | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF) // Packed logical compare +INST3(ptest, "ptest", IUM_RD, BAD_CODE, BAD_CODE, SSE38(0x17), INS_TT_NONE, REX_WIG | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF) // Packed logical compare 
INST3(roundpd, "roundpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x09), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX) // Round packed double precision floating-point values INST3(roundps, "roundps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x08), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Round packed single precision floating-point values INST3(roundsd, "roundsd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0B), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Round scalar double precision floating-point values INST3(roundss, "roundss", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0A), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Round scalar single precision floating-point values // SSE4.2 -INST3(pcmpgtq, "pcmpgtq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x37), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 64-bit integers for equality +INST3(pcmpgtq, "pcmpgtq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x37), INS_TT_NONE, Input_64Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 64-bit integers for equality INST3(LAST_SSE_INSTRUCTION, "LAST_SSE_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) @@ -474,8 +474,8 @@ INST3(vpermilpd, "permilpd", IUM_WR, BAD_CODE, BAD_CODE, INST3(vpermilpdvar, "permilpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0D), INS_TT_FULL, Input_64Bit | REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Permute In-Lane of Quadruples of Double-Precision Floating-Point Values INST3(vpermilps, "permilps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x04), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX) // Permute In-Lane of Quadruples of Single-Precision Floating-Point Values INST3(vpermilpsvar, "permilps", 
IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0C), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Permute In-Lane of Quadruples of Single-Precision Floating-Point Values -INST3(vtestpd, "testpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0F), INS_TT_NONE, Input_64Bit | REX_W0 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF) // Packed Bit Test -INST3(vtestps, "testps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0E), INS_TT_NONE, Input_32Bit | REX_W0 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF) // Packed Bit Test +INST3(vtestpd, "testpd", IUM_RD, BAD_CODE, BAD_CODE, SSE38(0x0F), INS_TT_NONE, Input_64Bit | REX_W0 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF) // Packed Bit Test +INST3(vtestps, "testps", IUM_RD, BAD_CODE, BAD_CODE, SSE38(0x0E), INS_TT_NONE, Input_32Bit | REX_W0 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF) // Packed Bit Test INST3(vzeroupper, "zeroupper", IUM_WR, 0xC577F8, BAD_CODE, BAD_CODE, INS_TT_NONE, REX_WIG | Encoding_VEX) // Zero upper 128-bits of all YMM regs (includes 2-byte fixed VEX prefix) // AVX2 @@ -510,66 +510,66 @@ INST3(vpsrlvq, "psrlvq", IUM_WR, BAD_CODE, BAD_CODE, INST3(FIRST_FMA_INSTRUCTION, "FIRST_FMA_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) // id nm um mr mi rm flags -INST3(vfmadd132pd, "fmadd132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x98), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Add of Packed Double-Precision Floating-Point Values -INST3(vfmadd213pd, "fmadd213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA8), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmadd231pd, "fmadd231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB8), INS_TT_FULL, Input_64Bit | 
REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmadd132ps, "fmadd132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x98), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Add of Packed Single-Precision Floating-Point Values -INST3(vfmadd213ps, "fmadd213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA8), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmadd231ps, "fmadd231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB8), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmadd132sd, "fmadd132sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x99), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Add of Scalar Double-Precision Floating-Point Values -INST3(vfmadd213sd, "fmadd213sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA9), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmadd231sd, "fmadd231sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB9), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmadd132ss, "fmadd132ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x99), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Add of Scalar Single-Precision Floating-Point Values -INST3(vfmadd213ss, "fmadd213ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA9), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmadd231ss, "fmadd231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB9), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmaddsub132pd, 
"fmaddsub132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x96), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Alternating Add/Subtract of Packed Double-Precision Floating-Point Values -INST3(vfmaddsub213pd, "fmaddsub213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA6), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmaddsub231pd, "fmaddsub231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB6), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmaddsub132ps, "fmaddsub132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x96), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Alternating Add/Subtract of Packed Single-Precision Floating-Point Values -INST3(vfmaddsub213ps, "fmaddsub213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA6), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmaddsub231ps, "fmaddsub231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB6), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsubadd132pd, "fmsubadd132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x97), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Alternating Subtract/Add of Packed Double-Precision Floating-Point Values -INST3(vfmsubadd213pd, "fmsubadd213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA7), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsubadd231pd, "fmsubadd231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB7), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsubadd132ps, "fmsubadd132ps", IUM_WR, BAD_CODE, 
BAD_CODE, SSE38(0x97), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Alternating Subtract/Add of Packed Single-Precision Floating-Point Values -INST3(vfmsubadd213ps, "fmsubadd213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA7), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsubadd231ps, "fmsubadd231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB7), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsub132pd, "fmsub132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9A), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Subtract of Packed Double-Precision Floating-Point Values -INST3(vfmsub213pd, "fmsub213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAA), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsub231pd, "fmsub231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBA), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsub132ps, "fmsub132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9A), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Subtract of Packed Single-Precision Floating-Point Values -INST3(vfmsub213ps, "fmsub213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAA), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsub231ps, "fmsub231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBA), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsub132sd, "fmsub132sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9B), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | 
INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Subtract of Scalar Double-Precision Floating-Point Values -INST3(vfmsub213sd, "fmsub213sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAB), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsub231sd, "fmsub231sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBB), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsub132ss, "fmsub132ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9B), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Subtract of Scalar Single-Precision Floating-Point Values -INST3(vfmsub213ss, "fmsub213ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAB), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsub231ss, "fmsub231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBB), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd132pd, "fnmadd132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9C), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Packed Double-Precision Floating-Point Values -INST3(vfnmadd213pd, "fnmadd213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAC), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd231pd, "fnmadd231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBC), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd132ps, "fnmadd132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9C), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Packed 
Single-Precision Floating-Point Values -INST3(vfnmadd213ps, "fnmadd213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAC), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd231ps, "fnmadd231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBC), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd132sd, "fnmadd132sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9D), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Scalar Double-Precision Floating-Point Values -INST3(vfnmadd213sd, "fnmadd213sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAD), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd231sd, "fnmadd231sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBD), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd132ss, "fnmadd132ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9D), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Scalar Single-Precision Floating-Point Values -INST3(vfnmadd213ss, "fnmadd213ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAD), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd231ss, "fnmadd231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBD), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub132pd, "fnmsub132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9E), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Packed Double-Precision Floating-Point Values 
-INST3(vfnmsub213pd, "fnmsub213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAE), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub231pd, "fnmsub231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBE), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub132ps, "fnmsub132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9E), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Packed Single-Precision Floating-Point Values -INST3(vfnmsub213ps, "fnmsub213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAE), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub231ps, "fnmsub231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBE), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub132sd, "fnmsub132sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9F), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Scalar Double-Precision Floating-Point Values -INST3(vfnmsub213sd, "fnmsub213sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAF), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub231sd, "fnmsub231sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBF), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub132ss, "fnmsub132ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9F), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Scalar Single-Precision Floating-Point Values -INST3(vfnmsub213ss, "fnmsub213ss", IUM_WR, BAD_CODE, 
BAD_CODE, SSE38(0xAF), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub231ss, "fnmsub231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBF), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmadd132pd, "fmadd132pd", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0x98), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Add of Packed Double-Precision Floating-Point Values +INST3(vfmadd213pd, "fmadd213pd", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xA8), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmadd231pd, "fmadd231pd", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xB8), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmadd132ps, "fmadd132ps", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0x98), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Add of Packed Single-Precision Floating-Point Values +INST3(vfmadd213ps, "fmadd213ps", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xA8), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmadd231ps, "fmadd231ps", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xB8), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmadd132sd, "fmadd132sd", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0x99), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Add of Scalar Double-Precision Floating-Point Values +INST3(vfmadd213sd, "fmadd213sd", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xA9), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | 
INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmadd231sd, "fmadd231sd", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xB9), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmadd132ss, "fmadd132ss", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0x99), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Add of Scalar Single-Precision Floating-Point Values +INST3(vfmadd213ss, "fmadd213ss", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xA9), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmadd231ss, "fmadd231ss", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xB9), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmaddsub132pd, "fmaddsub132pd", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0x96), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Alternating Add/Subtract of Packed Double-Precision Floating-Point Values +INST3(vfmaddsub213pd, "fmaddsub213pd", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xA6), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmaddsub231pd, "fmaddsub231pd", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xB6), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmaddsub132ps, "fmaddsub132ps", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0x96), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Alternating Add/Subtract of Packed Single-Precision Floating-Point Values +INST3(vfmaddsub213ps, "fmaddsub213ps", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xA6), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // 
+INST3(vfmaddsub231ps, "fmaddsub231ps", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xB6), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsubadd132pd, "fmsubadd132pd", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0x97), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Alternating Subtract/Add of Packed Double-Precision Floating-Point Values +INST3(vfmsubadd213pd, "fmsubadd213pd", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xA7), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsubadd231pd, "fmsubadd231pd", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xB7), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsubadd132ps, "fmsubadd132ps", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0x97), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Alternating Subtract/Add of Packed Single-Precision Floating-Point Values +INST3(vfmsubadd213ps, "fmsubadd213ps", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xA7), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsubadd231ps, "fmsubadd231ps", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xB7), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsub132pd, "fmsub132pd", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0x9A), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Subtract of Packed Double-Precision Floating-Point Values +INST3(vfmsub213pd, "fmsub213pd", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xAA), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsub231pd, "fmsub231pd", IUM_RW, BAD_CODE, BAD_CODE, 
SSE38(0xBA), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsub132ps, "fmsub132ps", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0x9A), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Subtract of Packed Single-Precision Floating-Point Values +INST3(vfmsub213ps, "fmsub213ps", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xAA), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsub231ps, "fmsub231ps", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xBA), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsub132sd, "fmsub132sd", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0x9B), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Subtract of Scalar Double-Precision Floating-Point Values +INST3(vfmsub213sd, "fmsub213sd", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xAB), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsub231sd, "fmsub231sd", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xBB), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsub132ss, "fmsub132ss", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0x9B), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Subtract of Scalar Single-Precision Floating-Point Values +INST3(vfmsub213ss, "fmsub213ss", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xAB), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsub231ss, "fmsub231ss", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xBB), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | 
INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd132pd, "fnmadd132pd", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0x9C), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Packed Double-Precision Floating-Point Values +INST3(vfnmadd213pd, "fnmadd213pd", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xAC), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd231pd, "fnmadd231pd", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xBC), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd132ps, "fnmadd132ps", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0x9C), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Packed Single-Precision Floating-Point Values +INST3(vfnmadd213ps, "fnmadd213ps", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xAC), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd231ps, "fnmadd231ps", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xBC), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd132sd, "fnmadd132sd", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0x9D), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Scalar Double-Precision Floating-Point Values +INST3(vfnmadd213sd, "fnmadd213sd", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xAD), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd231sd, "fnmadd231sd", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xBD), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd132ss, "fnmadd132ss", IUM_RW, 
BAD_CODE, BAD_CODE, SSE38(0x9D), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Scalar Single-Precision Floating-Point Values +INST3(vfnmadd213ss, "fnmadd213ss", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xAD), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd231ss, "fnmadd231ss", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xBD), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub132pd, "fnmsub132pd", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0x9E), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Packed Double-Precision Floating-Point Values +INST3(vfnmsub213pd, "fnmsub213pd", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xAE), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub231pd, "fnmsub231pd", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xBE), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub132ps, "fnmsub132ps", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0x9E), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Packed Single-Precision Floating-Point Values +INST3(vfnmsub213ps, "fnmsub213ps", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xAE), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub231ps, "fnmsub231ps", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xBE), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub132sd, "fnmsub132sd", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0x9F), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 
| Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Scalar Double-Precision Floating-Point Values +INST3(vfnmsub213sd, "fnmsub213sd", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xAF), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub231sd, "fnmsub231sd", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xBF), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub132ss, "fnmsub132ss", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0x9F), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Scalar Single-Precision Floating-Point Values +INST3(vfnmsub213ss, "fnmsub213ss", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xAF), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub231ss, "fnmsub231ss", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xBF), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // INST3(LAST_FMA_INSTRUCTION, "LAST_FMA_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) INST3(FIRST_AVXVNNI_INSTRUCTION, "FIRST_AVXVNNI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) @@ -610,135 +610,147 @@ INST3(FIRST_AVX512_INSTRUCTION, "FIRST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BA // AVX512F INST3(kmovw_gpr, "kmovw", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x92), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) INST3(kmovw_msk, "kmovw", IUM_WR, PCKFLT(0x91), BAD_CODE, PCKFLT(0x90), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) -INST3(kortestw, "kortestw", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x98), INS_TT_NONE, REX_W0 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | 
KInstruction) +INST3(kortestw, "kortestw", IUM_RD, BAD_CODE, BAD_CODE, PCKFLT(0x98), INS_TT_NONE, REX_W0 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction) INST3(vbroadcastf64x2, "broadcastf64x2", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1A), INS_TT_TUPLE2, Input_64Bit | REX_W1 | Encoding_EVEX) // Broadcast packed float values read from memory to entire register INST3(vbroadcasti64x2, "broadcasti64x2", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x5A), INS_TT_TUPLE2, Input_64Bit | REX_W1 | Encoding_EVEX) // Broadcast packed integer values read from memory to entire register INST3(vbroadcastf64x4, "broadcastf64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1B), INS_TT_TUPLE2, Input_64Bit | REX_W1 | Encoding_EVEX) // Broadcast packed float values read from memory to entire register INST3(vbroadcasti64x4, "broadcasti64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x5B), INS_TT_TUPLE2, Input_64Bit | REX_W1 | Encoding_EVEX) // Broadcast packed integer values read from memory to entire register -INST3(vcvtpd2udq, "cvtpd2udq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x79), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX) // cvt packed doubles to unsigned DWORDs -INST3(vcvtps2udq, "cvtps2udq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x79), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_EVEX) // cvt packed singles to unsigned DWORDs +INST3(vcmpps, "cmpps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0xC2), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) // compare packed singles +INST3(vcmpss, "cmpss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xC2), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) // compare scalar singles +INST3(vcmppd, "cmppd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC2), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) // compare packed doubles +INST3(vcmpsd, "cmpsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xC2), 
INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) // compare scalar doubles +INST3(vcvtpd2udq, "cvtpd2udq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x79), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX) // cvt packed doubles to unsigned DWORDs +INST3(vcvtps2udq, "cvtps2udq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x79), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX) // cvt packed singles to unsigned DWORDs INST3(vcvtsd2usi, "cvtsd2usi", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x79), INS_TT_TUPLE1_FIXED, Input_64Bit | REX_WX | Encoding_EVEX) // cvt scalar double to unsigned DWORD/QWORD INST3(vcvtss2usi, "cvtss2usi", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x79), INS_TT_TUPLE1_FIXED, Input_32Bit | REX_WX | Encoding_EVEX) // cvt scalar single to unsigned DWORD/QWORD -INST3(vcvttpd2udq, "cvttpd2udq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x78), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX) // cvt w/ truncation packed doubles to unsigned DWORDs -INST3(vcvttps2udq, "cvttps2udq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x78), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_EVEX) // cvt w/ truncation packed singles to unsigned DWORDs +INST3(vcvttpd2udq, "cvttpd2udq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x78), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX) // cvt w/ truncation packed doubles to unsigned DWORDs +INST3(vcvttps2udq, "cvttps2udq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x78), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX) // cvt w/ truncation packed singles to unsigned DWORDs INST3(vcvttsd2usi, "cvttsd2usi", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x78), INS_TT_TUPLE1_FIXED, Input_64Bit | REX_WX | Encoding_EVEX) // cvt w/ truncation scalar double to unsigned DWORD/QWORD INST3(vcvttss2usi, "cvttss2usi", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x78), INS_TT_TUPLE1_FIXED, Input_32Bit | REX_WX | Encoding_EVEX) // cvt w/ truncation scalar single to unsigned DWORD/QWORD -INST3(vcvtudq2pd, "cvtudq2pd", IUM_WR, BAD_CODE, BAD_CODE, 
SSEFLT(0x7A), INS_TT_HALF, Input_32Bit | REX_W0_EVEX | Encoding_EVEX) // cvt packed unsigned DWORDs to doubles -INST3(vcvtudq2ps, "cvtudq2ps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x7A), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_EVEX) // cvt packed unsigned DWORDs to singles -INST3(vcvtusi2sd32, "cvtusi2sd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x7B), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // cvt scalar unsigned DWORD to double -INST3(vcvtusi2sd64, "cvtusi2sd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x7B), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // cvt scalar unsigned QWORD to double -INST3(vcvtusi2ss32, "cvtusi2ss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x7B), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // cvt scalar unsigned DWORD to single -INST3(vcvtusi2ss64, "cvtusi2ss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x7B), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // cvt scalar unsigned QWORD to single -INST3(vextractf64x4, "extractf64x4", IUM_WR, SSE3A(0x1B), BAD_CODE, BAD_CODE, INS_TT_TUPLE4, Input_64Bit | REX_W1_EVEX | Encoding_EVEX) // Extract 256-bit packed double-precision floating point values -INST3(vextracti64x4, "extracti64x4", IUM_WR, SSE3A(0x3B), BAD_CODE, BAD_CODE, INS_TT_TUPLE4, Input_64Bit | REX_W1_EVEX | Encoding_EVEX) // Extract 256-bit packed quadword integer values -INST3(vinsertf64x4, "insertf64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE4, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed double-precision floating point values -INST3(vinserti64x4, "inserti64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3A), INS_TT_TUPLE4, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed quadword integer 
values -INST3(vmovdqa64, "movdqa64", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), INS_TT_FULL_MEM, Input_64Bit | REX_W1_EVEX | Encoding_EVEX) -INST3(vmovdqu64, "movdqu64", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_64Bit | REX_W1_EVEX | Encoding_EVEX) -INST3(vpabsq, "pabsq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1F), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX) // Packed absolute value of 64-bit integers -INST3(vpandq, "pandq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDB), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND of two xmm regs -INST3(vpandnq, "pandnq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND NOT of two xmm regs +INST3(vcvtudq2pd, "cvtudq2pd", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x7A), INS_TT_HALF, Input_32Bit | REX_W0 | Encoding_EVEX) // cvt packed unsigned DWORDs to doubles +INST3(vcvtudq2ps, "cvtudq2ps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x7A), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX) // cvt packed unsigned DWORDs to singles +INST3(vcvtusi2sd32, "cvtusi2sd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x7B), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // cvt scalar unsigned DWORD to double +INST3(vcvtusi2sd64, "cvtusi2sd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x7B), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // cvt scalar unsigned QWORD to double +INST3(vcvtusi2ss32, "cvtusi2ss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x7B), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // cvt scalar unsigned DWORD to single +INST3(vcvtusi2ss64, "cvtusi2ss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x7B), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // cvt 
scalar unsigned QWORD to single +INST3(vextractf64x4, "extractf64x4", IUM_WR, SSE3A(0x1B), BAD_CODE, BAD_CODE, INS_TT_TUPLE4, Input_64Bit | REX_W1 | Encoding_EVEX) // Extract 256-bit packed double-precision floating point values +INST3(vextracti64x4, "extracti64x4", IUM_WR, SSE3A(0x3B), BAD_CODE, BAD_CODE, INS_TT_TUPLE4, Input_64Bit | REX_W1 | Encoding_EVEX) // Extract 256-bit packed quadword integer values +INST3(vinsertf64x4, "insertf64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE4, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed double-precision floating point values +INST3(vinserti64x4, "inserti64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3A), INS_TT_TUPLE4, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed quadword integer values +INST3(vmovdqa64, "movdqa64", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), INS_TT_FULL_MEM, Input_64Bit | REX_W1 | Encoding_EVEX) +INST3(vmovdqu64, "movdqu64", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_64Bit | REX_W1 | Encoding_EVEX) +INST3(vpabsq, "pabsq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1F), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX) // Packed absolute value of 64-bit integers +INST3(vpandq, "pandq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDB), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND of two xmm regs +INST3(vpandnq, "pandnq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND NOT of two xmm regs INST3(vpbroadcastd_gpr, "pbroadcastd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x7C), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_EVEX) // Broadcast int32 value from gpr to entire register INST3(vpbroadcastq_gpr, "pbroadcastq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x7C), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | 
Encoding_EVEX) // Broadcast int64 value from gpr to entire register -INST3(vpermq_reg, "permq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x36), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Permute 64-bit of input register -INST3(vpermpd_reg, "permpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x16), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Permute 64-bit of input register -INST3(vpmaxsq, "pmaxsq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3D), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 64-bit signed integers -INST3(vpmaxuq, "pmaxuq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3F), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 64-bit unsigned integers -INST3(vpminsq, "pminsq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x39), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum 64-bit signed integers -INST3(vpminuq, "pminuq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3B), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum 64-bit unsigned integers -INST3(vpmovdb, "pmovdb", IUM_WR, PSSE38(0xF3, 0x31), BAD_CODE, PSSE38(0xF3, 0x31), INS_TT_QUARTER_MEM, Input_32Bit | REX_W0_EVEX | Encoding_EVEX) -INST3(vpmovdw, "pmovdw", IUM_WR, PSSE38(0xF3, 0x33), BAD_CODE, PSSE38(0xF3, 0x33), INS_TT_HALF_MEM, Input_32Bit | REX_W0_EVEX | Encoding_EVEX) -INST3(vpmovqb, "pmovqb", IUM_WR, PSSE38(0xF3, 0x32), BAD_CODE, PSSE38(0xF3, 0x32), INS_TT_EIGHTH_MEM, Input_64Bit | REX_W0_EVEX | Encoding_EVEX) -INST3(vpmovqd, "pmovqd", IUM_WR, PSSE38(0xF3, 0x35), BAD_CODE, PSSE38(0xF3, 0x35), INS_TT_HALF_MEM, Input_64Bit | REX_W0_EVEX | Encoding_EVEX) -INST3(vpmovqw, "pmovqw", IUM_WR, PSSE38(0xF3, 0x34), BAD_CODE, PSSE38(0xF3, 0x34), INS_TT_QUARTER_MEM, Input_64Bit | REX_W0_EVEX | 
Encoding_EVEX) -INST3(vpmovsdb, "pmovsdb", IUM_WR, PSSE38(0xF3, 0x21), BAD_CODE, PSSE38(0xF3, 0x21), INS_TT_QUARTER_MEM, Input_32Bit | REX_W0_EVEX | Encoding_EVEX) -INST3(vpmovsdw, "pmovsdw", IUM_WR, PSSE38(0xF3, 0x23), BAD_CODE, PSSE38(0xF3, 0x23), INS_TT_HALF_MEM, Input_32Bit | REX_W0_EVEX | Encoding_EVEX) -INST3(vpmovsqb, "pmovsqb", IUM_WR, PSSE38(0xF3, 0x22), BAD_CODE, PSSE38(0xF3, 0x22), INS_TT_EIGHTH_MEM, Input_64Bit | REX_W0_EVEX | Encoding_EVEX) -INST3(vpmovsqd, "pmovsqd", IUM_WR, PSSE38(0xF3, 0x25), BAD_CODE, PSSE38(0xF3, 0x25), INS_TT_HALF_MEM, Input_64Bit | REX_W0_EVEX | Encoding_EVEX) -INST3(vpmovsqw, "pmovsqw", IUM_WR, PSSE38(0xF3, 0x24), BAD_CODE, PSSE38(0xF3, 0x24), INS_TT_QUARTER_MEM, Input_64Bit | REX_W0_EVEX | Encoding_EVEX) -INST3(vpmovusdb, "pmovusdb", IUM_WR, PSSE38(0xF3, 0x11), BAD_CODE, PSSE38(0xF3, 0x11), INS_TT_QUARTER_MEM, Input_32Bit | REX_W0_EVEX | Encoding_EVEX) -INST3(vpmovusdw, "pmovusdw", IUM_WR, PSSE38(0xF3, 0x13), BAD_CODE, PSSE38(0xF3, 0x13), INS_TT_HALF_MEM, Input_32Bit | REX_W0_EVEX | Encoding_EVEX) -INST3(vpmovusqb, "pmovusqb", IUM_WR, PSSE38(0xF3, 0x12), BAD_CODE, PSSE38(0xF3, 0x12), INS_TT_EIGHTH_MEM, Input_64Bit | REX_W0_EVEX | Encoding_EVEX) -INST3(vpmovusqd, "pmovusqd", IUM_WR, PSSE38(0xF3, 0x15), BAD_CODE, PSSE38(0xF3, 0x15), INS_TT_HALF_MEM, Input_64Bit | REX_W0_EVEX | Encoding_EVEX) -INST3(vpmovusqw, "pmovusqw", IUM_WR, PSSE38(0xF3, 0x14), BAD_CODE, PSSE38(0xF3, 0x14), INS_TT_QUARTER_MEM, Input_64Bit | REX_W0_EVEX | Encoding_EVEX) -INST3(vporq, "porq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEB), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise OR of two xmm regs -INST3(vpsraq, "psraq", IUM_WR, BAD_CODE, PCKDBL(0x72), PCKDBL(0xE2), INS_TT_FULL | INS_TT_MEM128, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift right arithmetic of 64-bit integers -INST3(vpsravq, "psravq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x46), 
INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Right Arithmetic -INST3(vpternlogd, "pternlogd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x25), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) -INST3(vpxorq, "pxorq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEF), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise XOR of two xmm regs +INST3(vpcmpeqd, "pcmpeqd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x76), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) // Packed compare 32-bit integers for equality +INST3(vpcmpgtd, "pcmpgtd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x66), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) // Packed compare 32-bit signed integers for greater than +INST3(vpcmpeqq, "pcmpeqq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x29), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) // Packed compare 64-bit integers for equality +INST3(vpcmpgtq, "pcmpgtq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x37), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) // Packed compare 64-bit signed integers for greater than +INST3(vpermq_reg, "permq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x36), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Permute 64-bit of input register +INST3(vpermpd_reg, "permpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x16), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Permute 64-bit of input register +INST3(vpmaxsq, "pmaxsq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3D), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 64-bit signed integers +INST3(vpmaxuq, "pmaxuq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3F),
INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 64-bit unsigned integers +INST3(vpminsq, "pminsq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x39), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum 64-bit signed integers +INST3(vpminuq, "pminuq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3B), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum 64-bit unsigned integers +INST3(vpmovdb, "pmovdb", IUM_WR, PSSE38(0xF3, 0x31), BAD_CODE, PSSE38(0xF3, 0x31), INS_TT_QUARTER_MEM, Input_32Bit | REX_W0 | Encoding_EVEX) +INST3(vpmovdw, "pmovdw", IUM_WR, PSSE38(0xF3, 0x33), BAD_CODE, PSSE38(0xF3, 0x33), INS_TT_HALF_MEM, Input_32Bit | REX_W0 | Encoding_EVEX) +INST3(vpmovqb, "pmovqb", IUM_WR, PSSE38(0xF3, 0x32), BAD_CODE, PSSE38(0xF3, 0x32), INS_TT_EIGHTH_MEM, Input_64Bit | REX_W0 | Encoding_EVEX) +INST3(vpmovqd, "pmovqd", IUM_WR, PSSE38(0xF3, 0x35), BAD_CODE, PSSE38(0xF3, 0x35), INS_TT_HALF_MEM, Input_64Bit | REX_W0 | Encoding_EVEX) +INST3(vpmovqw, "pmovqw", IUM_WR, PSSE38(0xF3, 0x34), BAD_CODE, PSSE38(0xF3, 0x34), INS_TT_QUARTER_MEM, Input_64Bit | REX_W0 | Encoding_EVEX) +INST3(vpmovsdb, "pmovsdb", IUM_WR, PSSE38(0xF3, 0x21), BAD_CODE, PSSE38(0xF3, 0x21), INS_TT_QUARTER_MEM, Input_32Bit | REX_W0 | Encoding_EVEX) +INST3(vpmovsdw, "pmovsdw", IUM_WR, PSSE38(0xF3, 0x23), BAD_CODE, PSSE38(0xF3, 0x23), INS_TT_HALF_MEM, Input_32Bit | REX_W0 | Encoding_EVEX) +INST3(vpmovsqb, "pmovsqb", IUM_WR, PSSE38(0xF3, 0x22), BAD_CODE, PSSE38(0xF3, 0x22), INS_TT_EIGHTH_MEM, Input_64Bit | REX_W0 | Encoding_EVEX) +INST3(vpmovsqd, "pmovsqd", IUM_WR, PSSE38(0xF3, 0x25), BAD_CODE, PSSE38(0xF3, 0x25), INS_TT_HALF_MEM, Input_64Bit | REX_W0 | Encoding_EVEX) +INST3(vpmovsqw, "pmovsqw", IUM_WR, PSSE38(0xF3, 0x24), BAD_CODE, PSSE38(0xF3, 0x24), INS_TT_QUARTER_MEM, Input_64Bit | REX_W0 | Encoding_EVEX) +INST3(vpmovusdb, "pmovusdb", IUM_WR, PSSE38(0xF3, 
0x11), BAD_CODE, PSSE38(0xF3, 0x11), INS_TT_QUARTER_MEM, Input_32Bit | REX_W0 | Encoding_EVEX) +INST3(vpmovusdw, "pmovusdw", IUM_WR, PSSE38(0xF3, 0x13), BAD_CODE, PSSE38(0xF3, 0x13), INS_TT_HALF_MEM, Input_32Bit | REX_W0 | Encoding_EVEX) +INST3(vpmovusqb, "pmovusqb", IUM_WR, PSSE38(0xF3, 0x12), BAD_CODE, PSSE38(0xF3, 0x12), INS_TT_EIGHTH_MEM, Input_64Bit | REX_W0 | Encoding_EVEX) +INST3(vpmovusqd, "pmovusqd", IUM_WR, PSSE38(0xF3, 0x15), BAD_CODE, PSSE38(0xF3, 0x15), INS_TT_HALF_MEM, Input_64Bit | REX_W0 | Encoding_EVEX) +INST3(vpmovusqw, "pmovusqw", IUM_WR, PSSE38(0xF3, 0x14), BAD_CODE, PSSE38(0xF3, 0x14), INS_TT_QUARTER_MEM, Input_64Bit | REX_W0 | Encoding_EVEX) +INST3(vporq, "porq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEB), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise OR of two xmm regs +INST3(vpsraq, "psraq", IUM_WR, BAD_CODE, PCKDBL(0x72), PCKDBL(0xE2), INS_TT_FULL | INS_TT_MEM128, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift right arithmetic of 64-bit integers +INST3(vpsravq, "psravq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x46), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Right Arithmetic +INST3(vpternlogd, "pternlogd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x25), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) +INST3(vpxorq, "pxorq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEF), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise XOR of two xmm regs // AVX512BW INST3(kmovd_gpr, "kmovd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x92), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) INST3(kmovd_msk, "kmovd", IUM_WR, PCKDBL(0x91), BAD_CODE, PCKDBL(0x90), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) INST3(kmovq_gpr, "kmovq", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x92), INS_TT_NONE, REX_W1 | Encoding_VEX | 
KInstruction) INST3(kmovq_msk, "kmovq", IUM_WR, PCKFLT(0x91), BAD_CODE, PCKFLT(0x90), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) -INST3(kortestd, "kortestd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x98), INS_TT_NONE, REX_W1 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction) -INST3(kortestq, "kortestq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x98), INS_TT_NONE, REX_W1 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction) -INST3(vmovdqu8, "movdqu8", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_8Bit | REX_W0_EVEX | Encoding_EVEX) -INST3(vmovdqu16, "movdqu16", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_16Bit | REX_W1_EVEX | Encoding_EVEX) +INST3(kortestd, "kortestd", IUM_RD, BAD_CODE, BAD_CODE, PCKDBL(0x98), INS_TT_NONE, REX_W1 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction) +INST3(kortestq, "kortestq", IUM_RD, BAD_CODE, BAD_CODE, PCKFLT(0x98), INS_TT_NONE, REX_W1 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction) +INST3(vmovdqu8, "movdqu8", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_8Bit | REX_W0 | Encoding_EVEX) +INST3(vmovdqu16, "movdqu16", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_16Bit | REX_W1 | Encoding_EVEX) INST3(vpbroadcastb_gpr, "pbroadcastb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x7A), INS_TT_TUPLE1_SCALAR, Input_8Bit | REX_W0 | Encoding_EVEX) // Broadcast int8 value from gpr to entire register INST3(vpbroadcastw_gpr, "pbroadcastw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x7B), INS_TT_TUPLE1_SCALAR, Input_16Bit | REX_W0 | Encoding_EVEX) // Broadcast int16 value from gpr to entire register -INST3(vpcmpb, "pcmpb", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3F), INS_TT_FULL_MEM, Input_8Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) 
-INST3(vpcmpw, "pcmpw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3F), INS_TT_FULL_MEM, Input_16Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) -INST3(vpcmpub, "pcmpub", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3E), INS_TT_FULL_MEM, Input_8Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) -INST3(vpcmpuw, "pcmpuw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3E), INS_TT_FULL_MEM, Input_16Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) -INST3(vpermw, "permw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x8D), INS_TT_FULL_MEM, Input_16Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Permute Packed Doublewords Elements -INST3(vpmovb2m, "pmovb2m", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x29), INS_TT_NONE, Input_8Bit | REX_W0_EVEX | Encoding_EVEX) -INST3(vpmovm2b, "pmovm2b", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x28), INS_TT_NONE, Input_8Bit | REX_W0_EVEX | Encoding_EVEX) -INST3(vpmovm2w, "pmovm2w", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x28), INS_TT_NONE, Input_16Bit | REX_W1_EVEX | Encoding_EVEX) -INST3(vpmovw2m, "pmovw2m", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x29), INS_TT_NONE, Input_16Bit | REX_W1_EVEX | Encoding_EVEX) -INST3(vpmovwb, "pmovwb", IUM_WR, PSSE38(0xF3, 0x30), BAD_CODE, PSSE38(0xF3, 0x30), INS_TT_HALF_MEM, Input_16Bit | REX_W0_EVEX | Encoding_EVEX) -INST3(vpmovswb, "pmovswb", IUM_WR, PSSE38(0xF3, 0x20), BAD_CODE, PSSE38(0xF3, 0x20), INS_TT_HALF_MEM, Input_16Bit | REX_W0_EVEX | Encoding_EVEX) -INST3(vpmovuswb, "pmovuswb", IUM_WR, PSSE38(0xF3, 0x10), BAD_CODE, PSSE38(0xF3, 0x10), INS_TT_HALF_MEM, Input_16Bit | REX_W0_EVEX | Encoding_EVEX) -INST3(vpsllvw, "psllvw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x12), INS_TT_FULL_MEM, Input_16Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Left Logical -INST3(vpsravw, "psravw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x11), INS_TT_FULL_MEM, Input_16Bit | REX_W1_EVEX | Encoding_EVEX | 
INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Right Arithmetic -INST3(vpsrlvw, "psrlvw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x10), INS_TT_FULL_MEM, Input_16Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Right Logical +INST3(vpcmpb, "pcmpb", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3F), INS_TT_FULL_MEM, Input_8Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) +INST3(vpcmpeqb, "pcmpeqb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x74), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) // Packed compare 8-bit integers for equality +INST3(vpcmpeqw, "pcmpeqw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x75), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) // Packed compare 16-bit integers for equality +INST3(vpcmpgtb, "pcmpgtb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x64), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) // Packed compare 8-bit signed integers for greater than +INST3(vpcmpgtw, "pcmpgtw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x65), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) // Packed compare 16-bit signed integers for greater than +INST3(vpcmpw, "pcmpw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3F), INS_TT_FULL_MEM, Input_16Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) +INST3(vpcmpub, "pcmpub", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3E), INS_TT_FULL_MEM, Input_8Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) +INST3(vpcmpuw, "pcmpuw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3E), INS_TT_FULL_MEM, Input_16Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) +INST3(vpermw, "permw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x8D), INS_TT_FULL_MEM, Input_16Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Permute Packed Doublewords Elements +INST3(vpmovb2m, 
"pmovb2m", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x29), INS_TT_NONE, Input_8Bit | REX_W0 | Encoding_EVEX) +INST3(vpmovm2b, "pmovm2b", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x28), INS_TT_NONE, Input_8Bit | REX_W0 | Encoding_EVEX) +INST3(vpmovm2w, "pmovm2w", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x28), INS_TT_NONE, Input_16Bit | REX_W1 | Encoding_EVEX) +INST3(vpmovw2m, "pmovw2m", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x29), INS_TT_NONE, Input_16Bit | REX_W1 | Encoding_EVEX) +INST3(vpmovwb, "pmovwb", IUM_WR, PSSE38(0xF3, 0x30), BAD_CODE, PSSE38(0xF3, 0x30), INS_TT_HALF_MEM, Input_16Bit | REX_W0 | Encoding_EVEX) +INST3(vpmovswb, "pmovswb", IUM_WR, PSSE38(0xF3, 0x20), BAD_CODE, PSSE38(0xF3, 0x20), INS_TT_HALF_MEM, Input_16Bit | REX_W0 | Encoding_EVEX) +INST3(vpmovuswb, "pmovuswb", IUM_WR, PSSE38(0xF3, 0x10), BAD_CODE, PSSE38(0xF3, 0x10), INS_TT_HALF_MEM, Input_16Bit | REX_W0 | Encoding_EVEX) +INST3(vpsllvw, "psllvw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x12), INS_TT_FULL_MEM, Input_16Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Left Logical +INST3(vpsravw, "psravw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x11), INS_TT_FULL_MEM, Input_16Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Right Arithmetic +INST3(vpsrlvw, "psrlvw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x10), INS_TT_FULL_MEM, Input_16Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Right Logical // AVX512DQ -INST3(kortestb, "kortestb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x98), INS_TT_NONE, REX_W0 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction) +INST3(kortestb, "kortestb", IUM_RD, BAD_CODE, BAD_CODE, PCKDBL(0x98), INS_TT_NONE, REX_W0 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction) INST3(kmovb_gpr, "kmovb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x92), INS_TT_NONE, REX_W0 | 
Encoding_VEX | KInstruction) INST3(kmovb_msk, "kmovb", IUM_WR, PCKDBL(0x91), BAD_CODE, PCKDBL(0x90), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) INST3(vbroadcastf32x2, "broadcastf32x2", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x19), INS_TT_TUPLE2, Input_32Bit | REX_W0 | Encoding_EVEX) // Broadcast packed float values read from memory to entire register INST3(vbroadcasti32x2, "broadcasti32x2", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x59), INS_TT_TUPLE2, Input_32Bit | REX_W0 | Encoding_EVEX) // Broadcast packed integer values read from memory to entire register INST3(vbroadcastf32x8, "broadcastf32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1B), INS_TT_TUPLE8, Input_32Bit | REX_W0 | Encoding_EVEX) // Broadcast packed float values read from memory to entire register INST3(vbroadcasti32x8, "broadcasti32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x5B), INS_TT_TUPLE8, Input_32Bit | REX_W0 | Encoding_EVEX) // Broadcast packed integer values read from memory to entire register -INST3(vcvtpd2qq, "cvtpd2qq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x7B), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX) // cvt packed doubles to signed QWORDs -INST3(vcvtpd2uqq, "cvtpd2uqq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x79), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX) // cvt packed doubles to unsigned QWORDs -INST3(vcvtps2qq, "cvtps2qq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x7B), INS_TT_HALF, Input_32Bit | REX_W0_EVEX | Encoding_EVEX) // cvt packed singles to signed QWORDs -INST3(vcvtps2uqq, "cvtps2uqq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x79), INS_TT_HALF, Input_32Bit | REX_W0_EVEX | Encoding_EVEX) // cvt packed singles to unsigned QWORDs -INST3(vcvtqq2pd, "cvtqq2pd", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xE6), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX) // cvt packed signed QWORDs to doubles -INST3(vcvtqq2ps, "cvtqq2ps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5B), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX) // cvt packed signed QWORDs to singles 
-INST3(vcvttpd2qq, "cvttpd2qq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x7A), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX) // cvt w/ truncation packed doubles to signed QWORDs -INST3(vcvttpd2uqq, "cvttpd2uqq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x78), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX) // cvt w/ truncation packed doubles to unsigned QWORDs -INST3(vcvttps2qq, "cvttps2qq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x7A), INS_TT_HALF, Input_32Bit | REX_W0_EVEX | Encoding_EVEX) // cvt w/ truncation packed singles to signed QWORDs -INST3(vcvttps2uqq, "cvttps2uqq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x78), INS_TT_HALF, Input_32Bit | REX_W0_EVEX | Encoding_EVEX) // cvt w/ truncation packed singles to unsigned QWORDs -INST3(vcvtuqq2pd, "cvtuqq2pd", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x7A), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX) // cvt packed signed QWORDs to doubles -INST3(vcvtuqq2ps, "cvtuqq2ps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x7A), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX) // cvt packed signed QWORDs to singles -INST3(vextractf32x8, "extractf32x8", IUM_WR, SSE3A(0x1B), BAD_CODE, BAD_CODE, INS_TT_TUPLE8, Input_32Bit | REX_W0_EVEX | Encoding_EVEX) // Extract 256-bit packed double-precision floating point values -INST3(vextractf64x2, "extractf64x2", IUM_WR, SSE3A(0x19), BAD_CODE, BAD_CODE, INS_TT_TUPLE2, Input_64Bit | REX_W1_EVEX | Encoding_EVEX) // Extract 256-bit packed double-precision floating point values -INST3(vextracti32x8, "extracti32x8", IUM_WR, SSE3A(0x3B), BAD_CODE, BAD_CODE, INS_TT_TUPLE8, Input_32Bit | REX_W0_EVEX | Encoding_EVEX) // Extract 256-bit packed quadword integer values -INST3(vextracti64x2, "extracti64x2", IUM_WR, SSE3A(0x39), BAD_CODE, BAD_CODE, INS_TT_TUPLE2, Input_64Bit | REX_W1_EVEX | Encoding_EVEX) // Extract 256-bit packed quadword integer values -INST3(vinsertf32x8, "insertf32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE8, Input_32Bit | REX_W0_EVEX | Encoding_EVEX | 
INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed double-precision floating point values -INST3(vinsertf64x2, "insertf64x2", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x18), INS_TT_TUPLE2, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed double-precision floating point values -INST3(vinserti32x8, "inserti32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3A), INS_TT_TUPLE8, Input_32Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed quadword integer values -INST3(vinserti64x2, "inserti64x2", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x38), INS_TT_TUPLE2, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed quadword integer values -INST3(vpcmpd, "pcmpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1F), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_Is3OperandInstructionMask) -INST3(vpcmpq, "pcmpq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1F), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_Is3OperandInstructionMask) -INST3(vpcmpud, "pcmpud", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1E), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_Is3OperandInstructionMask) -INST3(vpcmpuq, "pcmpuq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1E), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_Is3OperandInstructionMask) -INST3(vpmovd2m, "pmovd2m", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x39), INS_TT_NONE, Input_32Bit | REX_W0_EVEX | Encoding_EVEX) -INST3(vpmovm2d, "pmovm2d", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x38), INS_TT_NONE, Input_32Bit | REX_W0_EVEX | Encoding_EVEX) -INST3(vpmovm2q, "pmovm2q", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x38), INS_TT_NONE, Input_64Bit | REX_W1_EVEX | Encoding_EVEX) -INST3(vpmovq2m, "pmovq2m", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x39), INS_TT_NONE, Input_64Bit | REX_W1_EVEX | Encoding_EVEX) -INST3(vpmullq, "pmullq", IUM_WR, BAD_CODE, 
BAD_CODE, SSE38(0x40), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed multiply 64 bit unsigned integers and store lower 64 bits of each result +INST3(vcvtpd2qq, "cvtpd2qq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x7B), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX) // cvt packed doubles to signed QWORDs +INST3(vcvtpd2uqq, "cvtpd2uqq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x79), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX) // cvt packed doubles to unsigned QWORDs +INST3(vcvtps2qq, "cvtps2qq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x7B), INS_TT_HALF, Input_32Bit | REX_W0 | Encoding_EVEX) // cvt packed singles to signed QWORDs +INST3(vcvtps2uqq, "cvtps2uqq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x79), INS_TT_HALF, Input_32Bit | REX_W0 | Encoding_EVEX) // cvt packed singles to unsigned QWORDs +INST3(vcvtqq2pd, "cvtqq2pd", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xE6), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX) // cvt packed signed QWORDs to doubles +INST3(vcvtqq2ps, "cvtqq2ps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5B), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX) // cvt packed signed QWORDs to singles +INST3(vcvttpd2qq, "cvttpd2qq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x7A), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX) // cvt w/ truncation packed doubles to signed QWORDs +INST3(vcvttpd2uqq, "cvttpd2uqq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x78), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX) // cvt w/ truncation packed doubles to unsigned QWORDs +INST3(vcvttps2qq, "cvttps2qq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x7A), INS_TT_HALF, Input_32Bit | REX_W0 | Encoding_EVEX) // cvt w/ truncation packed singles to signed QWORDs +INST3(vcvttps2uqq, "cvttps2uqq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x78), INS_TT_HALF, Input_32Bit | REX_W0 | Encoding_EVEX) // cvt w/ truncation packed singles to unsigned QWORDs +INST3(vcvtuqq2pd, "cvtuqq2pd", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x7A), INS_TT_FULL, Input_64Bit 
| REX_W1 | Encoding_EVEX) // cvt packed signed QWORDs to doubles +INST3(vcvtuqq2ps, "cvtuqq2ps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x7A), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX) // cvt packed signed QWORDs to singles +INST3(vextractf32x8, "extractf32x8", IUM_WR, SSE3A(0x1B), BAD_CODE, BAD_CODE, INS_TT_TUPLE8, Input_32Bit | REX_W0 | Encoding_EVEX) // Extract 256-bit packed double-precision floating point values +INST3(vextractf64x2, "extractf64x2", IUM_WR, SSE3A(0x19), BAD_CODE, BAD_CODE, INS_TT_TUPLE2, Input_64Bit | REX_W1 | Encoding_EVEX) // Extract 256-bit packed double-precision floating point values +INST3(vextracti32x8, "extracti32x8", IUM_WR, SSE3A(0x3B), BAD_CODE, BAD_CODE, INS_TT_TUPLE8, Input_32Bit | REX_W0 | Encoding_EVEX) // Extract 256-bit packed quadword integer values +INST3(vextracti64x2, "extracti64x2", IUM_WR, SSE3A(0x39), BAD_CODE, BAD_CODE, INS_TT_TUPLE2, Input_64Bit | REX_W1 | Encoding_EVEX) // Extract 256-bit packed quadword integer values +INST3(vinsertf32x8, "insertf32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE8, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed double-precision floating point values +INST3(vinsertf64x2, "insertf64x2", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x18), INS_TT_TUPLE2, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed double-precision floating point values +INST3(vinserti32x8, "inserti32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3A), INS_TT_TUPLE8, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed quadword integer values +INST3(vinserti64x2, "inserti64x2", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x38), INS_TT_TUPLE2, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed quadword integer values +INST3(vpcmpd, "pcmpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1F), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX 
| INS_Flags_Is3OperandInstructionMask) +INST3(vpcmpq, "pcmpq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1F), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_Is3OperandInstructionMask) +INST3(vpcmpud, "pcmpud", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1E), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_Is3OperandInstructionMask) +INST3(vpcmpuq, "pcmpuq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1E), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_Is3OperandInstructionMask) +INST3(vpmovd2m, "pmovd2m", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x39), INS_TT_NONE, Input_32Bit | REX_W0 | Encoding_EVEX) +INST3(vpmovm2d, "pmovm2d", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x38), INS_TT_NONE, Input_32Bit | REX_W0 | Encoding_EVEX) +INST3(vpmovm2q, "pmovm2q", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x38), INS_TT_NONE, Input_64Bit | REX_W1 | Encoding_EVEX) +INST3(vpmovq2m, "pmovq2m", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x39), INS_TT_NONE, Input_64Bit | REX_W1 | Encoding_EVEX) +INST3(vpmullq, "pmullq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x40), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed multiply 64 bit unsigned integers and store lower 64 bits of each result // AVX512VBMI -INST3(vpermb, "permb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x8D), INS_TT_FULL_MEM, Input_8Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Permute Packed Byte Elements +INST3(vpermb, "permb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x8D), INS_TT_FULL_MEM, Input_8Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Permute Packed Byte Elements INST3(LAST_AVX512_INSTRUCTION, "LAST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) // Scalar instructions in SSE4.2 -INST3(crc32, "crc32", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF2, 0xF0), INS_TT_NONE, INS_FLAGS_None) +INST3(crc32, "crc32", IUM_RW, BAD_CODE, BAD_CODE, PSSE38(0xF2, 0xF0), INS_TT_NONE, 
INS_FLAGS_None) // BMI1 INST3(tzcnt, "tzcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xBC), INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF) // Count the Number of Trailing Zero Bits diff --git a/src/coreclr/jit/simdcodegenxarch.cpp b/src/coreclr/jit/simdcodegenxarch.cpp index c01b59bb8606c..055afc98ece80 100644 --- a/src/coreclr/jit/simdcodegenxarch.cpp +++ b/src/coreclr/jit/simdcodegenxarch.cpp @@ -511,5 +511,46 @@ void CodeGen::genSimdUpperRestore(GenTreeIntrinsic* node) } } +//----------------------------------------------------------------------------- +// genSimd12UpperClear: Clears the upper 32-bits of a TYP_SIMD12 vector +// +// Arguments: +// tgtReg - The target register for which to clear the upper bits +// +// Return Value: +// None. +// +void CodeGen::genSimd12UpperClear(regNumber tgtReg) +{ + assert(genIsValidFloatReg(tgtReg)); + + if (compiler->compOpportunisticallyDependsOn(InstructionSet_SSE41)) + { + // ZMASK: 0b1000 - Preserve element 0, 1, and 2; Zero element 3 + // COUNT_D: 0b11 - Insert into element 3 + // COUNT_S: 0b11 - Insert from element 3 + + GetEmitter()->emitIns_SIMD_R_R_R_I(INS_insertps, EA_16BYTE, tgtReg, tgtReg, tgtReg, static_cast<int8_t>(0xF8)); + } + else + { + // Preserve element 0, 1, and 2; Zero element 3 + + if (zroSimd12Elm3 == NO_FIELD_HANDLE) + { + simd16_t constValue; + + constValue.u32[0] = 0xFFFFFFFF; + constValue.u32[1] = 0xFFFFFFFF; + constValue.u32[2] = 0xFFFFFFFF; + constValue.u32[3] = 0x00000000; + + zroSimd12Elm3 = GetEmitter()->emitSimd16Const(constValue); + } + + GetEmitter()->emitIns_SIMD_R_R_C(INS_andpd, EA_16BYTE, tgtReg, tgtReg, zroSimd12Elm3, 0); + } +} + #endif // FEATURE_SIMD #endif // TARGET_XARCH