From ec5ac992e5f5c23a5435930aa66b5dd0f622523d Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Thu, 25 Jul 2024 11:21:10 -0700 Subject: [PATCH] Update mono to support shuffle for constant inputs (#105299) * Support mono creating xconst in a few more places * Update mono to support shuffle for constant inputs * Ensure that arm64 also accelerates shuffle for non-constant inputs * Ensure OP_XZERO and OP_XONES are recognized as being constant * Ensure shuffle creates a correct instruction when the fsig doesn't match the necessary parameter count * Ensure that getting the index for floating-point shuffle is possible * Ensure the right class handle is passed down to LLVM so overload resolution can function * Make sure we update the original xconst if we mutate it * Return a new constant instead of mutating the existing one * Insert relevant xcast nodes * Add some asserts around the ecount * Ensure we get the right element type * Ensure we don't create nodes unnecessarily for create_elementwise * Ensure that create_elementwise still works for other vector sizes * Ensure indentation of switch cases is correct for Mono --- src/mono/mono/mini/simd-intrinsics.c | 628 +++++++++++++++++++++++++-- 1 file changed, 585 insertions(+), 43 deletions(-) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 44fc12ac0a232..54588a2f2dfdf 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -83,6 +83,31 @@ mono_emit_simd_field_load (MonoCompile *cfg, MonoClassField *field, MonoInst *ad return NULL; } +static gboolean +is_const (const MonoInst* ins) +{ + switch (ins->opcode) { + case OP_ICONST: + case OP_I8CONST: + case OP_R4CONST: + case OP_R8CONST: + return TRUE; + } + return FALSE; +} + +static gboolean +is_xconst (const MonoInst* ins) +{ + switch (ins->opcode) { + case OP_XCONST: + case OP_XZERO: + case OP_XONES: + return TRUE; + } + return FALSE; +} + static gboolean is_zero_const (const MonoInst* ins) { @@ -647,6 +672,60 @@ emit_xconst_v128 (MonoCompile *cfg, MonoClass *klass, guint8 value[16]) return ins; } +static guint64 +get_xconst_int_elem (MonoCompile *cfg, MonoInst *ins, MonoTypeEnum etype, int index) +{ + guint8 cns_vec[16]; + if (ins->opcode == OP_XZERO) { + memset (cns_vec, 0x00, 16); + } else if (ins->opcode == OP_XONES) { + memset (cns_vec, 0xFF, 16); + } else { + g_assert (ins->opcode == OP_XCONST); + memcpy (cns_vec, ins->inst_p0, 16); + } + g_assert (index >= 0); + switch (etype) { + case MONO_TYPE_I1: { + g_assert (index < 16); + return ((gint8*)cns_vec) [index]; + } + case MONO_TYPE_U1: { + g_assert (index < 16); + return ((guint8*)cns_vec) [index]; + } + case MONO_TYPE_I2: { + g_assert (index < 8); + return ((gint16*)cns_vec) [index]; + } + case MONO_TYPE_U2: { + g_assert (index < 8); + return ((guint16*)cns_vec) [index]; + } + case MONO_TYPE_I4: + case MONO_TYPE_R4: { + g_assert (index < 4); + return ((gint32*)cns_vec) [index]; + } + case MONO_TYPE_U4: { + g_assert (index < 4); + return ((guint32*)cns_vec) [index]; + } + case MONO_TYPE_I8: + case MONO_TYPE_R8: { + g_assert (index < 2); + return ((gint64*)cns_vec) [index]; + } + case MONO_TYPE_U8: { + g_assert (index < 2); + return ((guint64*)cns_vec) [index]; + } + default: { + g_assert_not_reached (); + } + } +} + #ifdef TARGET_ARM64 static int type_to_extract_op (MonoTypeEnum type); static MonoType* get_vector_t_elem_type (MonoType *vector_type); @@ -1076,34 +1155,393 @@ emit_vector_insert_element ( int op = type_to_insert_op (type); if
(is_zero_inited && is_zero_const (element)) { - // element already set to zero + // element already set to zero + return ins; + } + + if (is_xconst (ins) && is_const (element)) { + // Specially handle insertion of a constant into a constant + int vector_size = mono_class_value_size (vklass, NULL); + if (vector_size == 16) { + guint8 cns_vec[16]; + if (ins->opcode == OP_XZERO) { + memset (cns_vec, 0x00, 16); + } else if (ins->opcode == OP_XONES) { + memset (cns_vec, 0xFF, 16); + } else { + g_assert (ins->opcode == OP_XCONST); + memcpy (cns_vec, ins->inst_p0, 16); + } + if (type_enum_is_float (type)) { + double cns_val; + if (element->opcode == OP_R4CONST) { + cns_val = *(const float*)(element->inst_p0); + } else { + g_assert (element->opcode == OP_R8CONST); + cns_val = *(const double*)(element->inst_p0); + } + switch (type) { + case MONO_TYPE_R4: { + ((float*)cns_vec) [index] = (float)cns_val; + break; + } + case MONO_TYPE_R8: { + ((double*)cns_vec) [index] = (double)cns_val; + break; + } + default: { + g_assert_not_reached (); + } + } + } else { + gint64 cns_val; + if (element->opcode == OP_ICONST) { + cns_val = GTMREG_TO_INT (element->inst_c0); + } else { + g_assert (element->opcode == OP_I8CONST); + cns_val = element->inst_l; + } + switch (type) { + case MONO_TYPE_I1: + case MONO_TYPE_U1: { + ((guint8*)cns_vec) [index] = (guint8)cns_val; + break; + } + case MONO_TYPE_I2: + case MONO_TYPE_U2: { + ((guint16*)cns_vec) [index] = (guint16)cns_val; + break; + } + case MONO_TYPE_I4: + case MONO_TYPE_U4: { + ((guint32*)cns_vec) [index] = (guint32)cns_val; + break; + } + case MONO_TYPE_I8: + case MONO_TYPE_U8: { + ((guint64*)cns_vec) [index] = (guint64)cns_val; + break; + } + default: { + g_assert_not_reached (); + } + } + } + return emit_xconst_v128 (cfg, vklass, cns_vec); + } + } + #ifdef TARGET_ARM64 - } else if (!COMPILE_LLVM (cfg) && element->opcode == type_to_extract_op (type) && + if (!COMPILE_LLVM (cfg) && element->opcode == type_to_extract_op (type) && (type == MONO_TYPE_R4 || type == MONO_TYPE_R8)) { // OP_INSERT_Ix inserts from GP reg, not SIMD. Cannot optimize for int types. ins = emit_simd_ins (cfg, vklass, op, ins->dreg, element->sreg1); ins->inst_c0 = index | ((element->inst_c0) << 8); ins->inst_c1 = type; + return ins; + } #endif + + ins = emit_simd_ins (cfg, vklass, op, ins->dreg, element->dreg); + ins->inst_c0 = index; + ins->inst_c1 = type; + + return ins; +} + +static MonoInst * +emit_vector_create_broadcast ( + MonoCompile *cfg, MonoClass *vklass, MonoType *etype, MonoInst *arg0) +{ + int vector_size = mono_class_value_size (vklass, NULL); + if (vector_size == 16) { + // We want to handle constant inputs and create constant nodes so other import + // optimizations can be enabled. 
+ if (is_const (arg0)) { + guint8 cns_vec[16]; + if (type_enum_is_float (etype->type)) { + double cns_val; + if (arg0->opcode == OP_R4CONST) { + cns_val = *(const float*)(arg0->inst_p0); + } else { + g_assert (arg0->opcode == OP_R8CONST); + cns_val = *(const double*)(arg0->inst_p0); + } + switch (etype->type) { + case MONO_TYPE_R4: { + for (int i = 0; i < vector_size / 4; i++) { + ((float*)cns_vec) [i] = (float)cns_val; + } + break; + } + case MONO_TYPE_R8: { + for (int i = 0; i < vector_size / 8; i++) { + ((double*)cns_vec) [i] = (double)cns_val; + } + break; + } + default: { + g_assert_not_reached (); + } + } + } else { + gint64 cns_val; + if (arg0->opcode == OP_ICONST) { + cns_val = GTMREG_TO_INT (arg0->inst_c0); + } else { + g_assert (arg0->opcode == OP_I8CONST); + cns_val = arg0->inst_l; + } + switch (etype->type) { + case MONO_TYPE_I1: + case MONO_TYPE_U1: { + for (int i = 0; i < vector_size / 1; i++) { + ((guint8*)cns_vec) [i] = (guint8)cns_val; + } + break; + } + case MONO_TYPE_I2: + case MONO_TYPE_U2: { + for (int i = 0; i < vector_size / 2; i++) { + ((guint16*)cns_vec) [i] = (guint16)cns_val; + } + break; + } + case MONO_TYPE_I4: + case MONO_TYPE_U4: { + for (int i = 0; i < vector_size / 4; i++) { + ((guint32*)cns_vec) [i] = (guint32)cns_val; + } + break; + } + case MONO_TYPE_I8: + case MONO_TYPE_U8: { + for (int i = 0; i < vector_size / 8; i++) { + ((guint64*)cns_vec) [i] = (guint64)cns_val; + } + break; + } + default: { + g_assert_not_reached (); + } + } + } + return emit_xconst_v128 (cfg, vklass, (guint8*)cns_vec); + } + } + MonoInst* ins = emit_simd_ins (cfg, vklass, type_to_expand_op (etype->type), arg0->dreg, -1); + ins->inst_c1 = etype->type; + return ins; +} + +static MonoInst * +emit_vector_create_elementwise ( + MonoCompile *cfg, MonoClass *vklass, MonoType *etype, MonoInst **args, int param_count) +{ + // We want to handle constant inputs and create constant nodes so other import + // optimizations can be enabled. 
This includes recognizing partial constants + // and only performing the minimal number of inserts required + + gboolean all_const = true; + gboolean some_const = false; + + guint8 cns_vec[16]; + memset (cns_vec, 0x00, 16); + + int vector_size = mono_class_value_size (vklass, NULL); + if (vector_size == 16) { + for (int i = 0; i < param_count; ++i) { + if (!is_const (args[i])) { + all_const = false; + break; + } + + some_const = true; + + if (type_enum_is_float (etype->type)) { + double cns_val; + if (args[i]->opcode == OP_R4CONST) { + cns_val = *(const float*)(args[i]->inst_p0); + } else { + g_assert (args[i]->opcode == OP_R8CONST); + cns_val = *(const double*)(args[i]->inst_p0); + } + + switch (etype->type) { + case MONO_TYPE_R4: { + ((float*)cns_vec) [i] = (float)cns_val; + break; + } + case MONO_TYPE_R8: { + ((double*)cns_vec) [i] = (double)cns_val; + break; + } + default: { + g_assert_not_reached (); + } + } + } else { + gint64 cns_val; + if (args[i]->opcode == OP_ICONST) { + cns_val = GTMREG_TO_INT (args[i]->inst_c0); + } else { + g_assert (args[i]->opcode == OP_I8CONST); + cns_val = args[i]->inst_l; + } + + switch (etype->type) { + case MONO_TYPE_I1: + case MONO_TYPE_U1: { + ((guint8*)cns_vec) [i] = (guint8)cns_val; + break; + } + case MONO_TYPE_I2: + case MONO_TYPE_U2: { + ((guint16*)cns_vec) [i] = (guint16)cns_val; + break; + } + case MONO_TYPE_I4: + case MONO_TYPE_U4: { + ((guint32*)cns_vec) [i] = (guint32)cns_val; + break; + } + case MONO_TYPE_I8: + case MONO_TYPE_U8: { + ((guint64*)cns_vec) [i] = (guint64)cns_val; + break; + } + default: { + g_assert_not_reached (); + } + } + } + } + } + + if (all_const) { + return emit_xconst_v128 (cfg, vklass, (guint8*)cns_vec); + } + + MonoInst *ins; + + if (some_const) { + ins = emit_xconst_v128 (cfg, vklass, (guint8*)cns_vec); } else { - ins = emit_simd_ins (cfg, vklass, op, ins->dreg, element->dreg); - ins->inst_c0 = index; - ins->inst_c1 = type; + ins = emit_xzero (cfg, vklass); + } + + for (int i = 0; i < param_count; ++i) { + if (!is_const (args[i]) || (vector_size != 16)) { + ins = emit_vector_insert_element (cfg, vklass, ins, etype->type, args[i], i, TRUE); + } } return ins; } static MonoInst * -emit_vector_create_elementwise ( - MonoCompile *cfg, MonoMethodSignature *fsig, MonoType *vtype, - MonoTypeEnum type, MonoInst **args) +emit_vector_create_scalar ( + MonoCompile *cfg, MonoClass *vklass, MonoType *etype, MonoInst *arg0, gboolean is_unsafe) { - MonoClass *vklass = mono_class_from_mono_type_internal (vtype); - MonoInst *ins = emit_xzero (cfg, vklass); - for (int i = 0; i < fsig->param_count; ++i) - ins = emit_vector_insert_element (cfg, vklass, ins, type, args[i], i, TRUE); + int vector_size = mono_class_value_size (vklass, NULL); + if (vector_size == 16) { + // We want to handle constant inputs and create constant nodes so other import + // optimizations can be enabled. 
For is_unsafe, we treat it the same as broadcast + if (is_const (arg0)) { + if (is_unsafe) { + return emit_vector_create_broadcast (cfg, vklass, etype, arg0); + } + + guint8 cns_vec[16]; + memset (cns_vec, 0x00, 16); + + if (type_enum_is_float (etype->type)) { + double cns_val; + if (arg0->opcode == OP_R4CONST) { + cns_val = *(const float*)(arg0->inst_p0); + } else { + g_assert (arg0->opcode == OP_R8CONST); + cns_val = *(const double*)(arg0->inst_p0); + } + + switch (etype->type) { + case MONO_TYPE_R4: { + ((float*)cns_vec) [0] = (float)cns_val; + break; + } + case MONO_TYPE_R8: { + ((double*)cns_vec) [0] = (double)cns_val; + break; + } + default: { + g_assert_not_reached (); + } + } + } else { + gint64 cns_val; + if (arg0->opcode == OP_ICONST) { + cns_val = GTMREG_TO_INT (arg0->inst_c0); + } else { + g_assert (arg0->opcode == OP_I8CONST); + cns_val = arg0->inst_l; + +} + switch (etype->type) { + case MONO_TYPE_I1: + case MONO_TYPE_U1: { + ((guint8*)cns_vec) [0] = (guint8)cns_val; + break; + } + case MONO_TYPE_I2: + case MONO_TYPE_U2: { + ((guint16*)cns_vec) [0] = (guint16)cns_val; + break; + } + case MONO_TYPE_I4: + case MONO_TYPE_U4: { + ((guint32*)cns_vec) [0] = (guint32)cns_val; + break; + } + case MONO_TYPE_I8: + case MONO_TYPE_U8: { + ((guint64*)cns_vec) [0] = (guint64)cns_val; + break; + } + default: { + g_assert_not_reached (); + } + } + } + return emit_xconst_v128 (cfg, vklass, (guint8*)cns_vec); + } + } + + int opcode = 0; + if (COMPILE_LLVM (cfg)) { + opcode = is_unsafe ? OP_CREATE_SCALAR_UNSAFE : OP_CREATE_SCALAR; + } else { +#ifdef TARGET_AMD64 + MonoInst *ins; + + ins = emit_xzero (cfg, vklass); + if (!is_zero_const (arg0)) { + ins = emit_simd_ins (cfg, vklass, type_to_insert_op (etype->type), ins->dreg, arg0->dreg); + ins->inst_c0 = 0; + ins->inst_c1 = etype->type; + } + return ins; +#else + if (type_enum_is_float (etype->type)) { + opcode = is_unsafe ? OP_CREATE_SCALAR_UNSAFE_FLOAT : OP_CREATE_SCALAR_FLOAT; + } else { + opcode = is_unsafe ? 
OP_CREATE_SCALAR_UNSAFE_INT : OP_CREATE_SCALAR_INT; + } +#endif + } + g_assert (opcode != 0); + MonoInst* ins = emit_simd_ins (cfg, vklass, opcode, arg0->dreg, -1); + ins->inst_c1 = etype->type; return ins; } @@ -1867,9 +2305,8 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi if (!MONO_TYPE_IS_VECTOR_PRIMITIVE (etype)) return NULL; if (fsig->param_count == 1 && mono_metadata_type_equal (fsig->params [0], etype)) { - MonoInst* ins = emit_simd_ins (cfg, klass, type_to_expand_op (etype->type), args [0]->dreg, -1); - ins->inst_c1 = arg0_type; - return ins; + MonoClass *vklass = mono_class_from_mono_type_internal(fsig->ret); + return emit_vector_create_broadcast (cfg, vklass, etype, args [0]); } else if (is_create_from_half_vectors_overload (fsig)) { #if defined(TARGET_AMD64) // Require Vector64 SIMD support @@ -1890,8 +2327,10 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi return emit_simd_ins (cfg, klass, OP_XCONCAT, args [0]->dreg, args [1]->dreg); } - else if (is_elementwise_create_overload (fsig, etype)) - return emit_vector_create_elementwise (cfg, fsig, fsig->ret, arg0_type, args); + else if (is_elementwise_create_overload (fsig, etype)) { + MonoClass *vklass = mono_class_from_mono_type_internal(fsig->ret); + return emit_vector_create_elementwise (cfg, vklass, etype, args, fsig->param_count); + } break; } case SN_CreateScalar: @@ -1900,27 +2339,8 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi if (!MONO_TYPE_IS_VECTOR_PRIMITIVE (etype)) return NULL; gboolean is_unsafe = id == SN_CreateScalarUnsafe; - if (COMPILE_LLVM (cfg)) { - return emit_simd_ins_for_sig (cfg, klass, is_unsafe ? OP_CREATE_SCALAR_UNSAFE : OP_CREATE_SCALAR, -1, arg0_type, fsig, args); - } else { -#ifdef TARGET_AMD64 - MonoInst *ins; - - ins = emit_xzero (cfg, klass); - if (!is_zero_const (args [0])) { - ins = emit_simd_ins (cfg, klass, type_to_insert_op (arg0_type), ins->dreg, args [0]->dreg); - ins->inst_c0 = 0; - ins->inst_c1 = arg0_type; - } - return ins; -#else - if (type_enum_is_float (arg0_type)) { - return emit_simd_ins_for_sig (cfg, klass, is_unsafe ? OP_CREATE_SCALAR_UNSAFE_FLOAT : OP_CREATE_SCALAR_FLOAT, -1, arg0_type, fsig, args); - } else { - return emit_simd_ins_for_sig (cfg, klass, is_unsafe ? 
OP_CREATE_SCALAR_UNSAFE_INT : OP_CREATE_SCALAR_INT, -1, arg0_type, fsig, args); - } -#endif - } + MonoClass *vklass = mono_class_from_mono_type_internal(fsig->ret); + return emit_vector_create_scalar (cfg, vklass, etype, args [0], is_unsafe); } case SN_Dot: { return emit_dot (cfg, klass, fsig->params [0], arg0_type, args [0]->dreg, args [1]->dreg); @@ -2418,18 +2838,140 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi return emit_simd_ins_for_unary_op (cfg, klass, fsig, args, arg0_type, id); } case SN_Shuffle: { - if (!is_element_type_primitive (fsig->params [0])) + MonoType *etype = get_vector_t_elem_type (fsig->ret); + if (!MONO_TYPE_IS_VECTOR_PRIMITIVE (etype)) return NULL; #ifdef TARGET_WASM return emit_simd_ins_for_sig (cfg, klass, OP_WASM_SIMD_SWIZZLE, -1, -1, fsig, args); #elif defined(TARGET_ARM64) - if (vector_size == 128 && (arg0_type == MONO_TYPE_I1 || arg0_type == MONO_TYPE_U1)) + if (vector_size != 128) { + return NULL; + } + if ((arg0_type == MONO_TYPE_I1 || arg0_type == MONO_TYPE_U1)) { return emit_simd_ins_for_sig (cfg, klass, OP_XOP_OVR_X_X_X, INTRINS_AARCH64_ADV_SIMD_TBL1, 0, fsig, args); - return NULL; + } + if (!is_xconst (args [1])) { + return NULL; + } + int vsize = mono_class_value_size (klass, NULL); + int esize = mono_class_value_size (mono_class_from_mono_type_internal (etype), NULL); + int ecount = vsize / esize; + g_assert ((ecount == 2) || (ecount == 4) || (ecount == 8) || (ecount == 16)); + guint64 value = 0; + guint8 vec_cns[16]; + for (int index = 0; index < ecount; index++) { + value = get_xconst_int_elem (cfg, args [1], arg0_type, index); + if (value < ecount) { + for (int i = 0; i < esize; i++) { + vec_cns [(index * esize) + i] = (guint8)((value * esize) + i); + } + } else { + for (int i = 0; i < esize; i++) { + vec_cns [(index * esize) + i] = (guint8)0xFF; + } + } + } + MonoType *op_etype = m_class_get_byval_arg (mono_defaults.byte_class); + MonoClass *op_klass = create_class_instance ("System.Runtime.Intrinsics", "Vector128`1", op_etype); + MonoInst *new_args[2]; + new_args [0] = emit_simd_ins (cfg, op_klass, OP_XCAST, args [0]->dreg, -1); + new_args [1] = emit_xconst_v128 (cfg, op_klass, vec_cns); + MonoInst *ins = emit_simd_ins_for_sig (cfg, op_klass, OP_XOP_OVR_X_X_X, INTRINS_AARCH64_ADV_SIMD_TBL1, 0, fsig, new_args); + return emit_simd_ins (cfg, klass, OP_XCAST, ins->dreg, -1); #elif defined(TARGET_AMD64) if (COMPILE_LLVM (cfg)) { - if (is_SIMD_feature_supported (cfg, MONO_CPU_X86_SSSE3) && vector_size == 128 && (arg0_type == MONO_TYPE_I1 || arg0_type == MONO_TYPE_U1)) - return emit_simd_ins_for_sig (cfg, klass, OP_XOP_X_X_X, INTRINS_SSE_PSHUFB, 0, fsig, args); + if (vector_size != 128) { + return NULL; + } + if (!is_xconst (args [1])) { + return NULL; + } + MonoType *op_etype = etype; + MonoClass *op_klass = klass; + int vsize = mono_class_value_size (klass, NULL); + int esize = mono_class_value_size (mono_class_from_mono_type_internal (etype), NULL); + int ecount = vsize / esize; + g_assert ((ecount == 2) || (ecount == 4) || (ecount == 8) || (ecount == 16)); + guint8 control = 0; + gboolean needs_zero = false; + guint64 value = 0; + guint8 vec_cns[16]; + if ((arg0_type == MONO_TYPE_I1) || (arg0_type == MONO_TYPE_U1)) { + needs_zero = true; + } else if ((arg0_type == MONO_TYPE_I2) || (arg0_type == MONO_TYPE_U2)) { + needs_zero = true; + } + for (int index = 0; index < ecount; index++) { + value = get_xconst_int_elem (cfg, args [1], arg0_type, index); + if (value < ecount) { + // Setting the control for byte/sbyte 
and short/ushort is unnecessary + // and will actually compute an incorrect control word. But it simplifies + // the overall logic needed here and will remain unused. + + control |= (value << (index * (ecount / 2))); + + // When Ssse3 is supported, we may need vecCns to accurately select the relevant + // bytes if some index is outside the valid range. Since x86/x64 is little-endian + // we can simplify this down to a for loop that scales the value and selects count + // sequential bytes. + + for (int i = 0; i < esize; i++) { + vec_cns [(index * esize) + i] = (guint8)((value * esize) + i); + } + } else { + needs_zero = true; + + // When Ssse3 is supported, we may need vecCns to accurately select the relevant + // bytes if some index is outside the valid range. We can do this by just zeroing + // out each byte in the element. This only requires the most significant bit to be + // set, but we use 0xFF instead since that will be the equivalent of AllBitsSet + + for (int i = 0; i < esize; i++) { + vec_cns [(index * esize) + i] = (guint8)0xFF; + } + } + } + MonoInst *new_args[3]; + if (needs_zero) { + if (!is_SIMD_feature_supported (cfg, MONO_CPU_X86_SSSE3)) { + return NULL; + } + op_etype = m_class_get_byval_arg (mono_defaults.byte_class); + op_klass = create_class_instance ("System.Runtime.Intrinsics", "Vector128`1", op_etype); + new_args [0] = emit_simd_ins (cfg, op_klass, OP_XCAST, args [0]->dreg, -1); + new_args [1] = emit_xconst_v128 (cfg, op_klass, vec_cns); + MonoInst *ins = emit_simd_ins_for_sig (cfg, op_klass, OP_XOP_X_X_X, INTRINS_SSE_PSHUFB, 0, fsig, new_args); + return emit_simd_ins (cfg, klass, OP_XCAST, ins->dreg, -1); + } + if ((arg0_type == MONO_TYPE_I8) || (arg0_type == MONO_TYPE_U8)) { + // TYP_LONG and TYP_ULONG don't have their own shuffle/permute instructions and so we'll + // just utilize the path for TYP_DOUBLE for simplicity. We could alternatively break this + // down into a TYP_INT or TYP_UINT based shuffle, but that's additional complexity for no + // real benefit since shuffle gets its own port rather than using the fp specific ports. + arg0_type = MONO_TYPE_R8; + op_etype = m_class_get_byval_arg (mono_defaults.double_class); + op_klass = create_class_instance ("System.Runtime.Intrinsics", "Vector128`1", etype); + } + if ((arg0_type == MONO_TYPE_R4) || (arg0_type == MONO_TYPE_R8)) { + int opcode = (arg0_type == MONO_TYPE_R4) ? OP_SSE_SHUFPS : OP_SSE2_SHUFPD; + new_args [0] = args [0]; + if (op_klass != klass) { + new_args [0] = emit_simd_ins (cfg, op_klass, OP_XCAST, new_args [0]->dreg, -1); + } + new_args [1] = new_args [0]; + EMIT_NEW_ICONST (cfg, new_args [2], control); + MonoInst* ins = emit_simd_ins (cfg, op_klass, opcode, new_args [0]->dreg, new_args [1]->dreg); + ins->inst_c0 = 0; + ins->inst_c1 = arg0_type; + ins->sreg3 = new_args [2]->dreg; + if (op_klass != klass) { + ins = emit_simd_ins (cfg, klass, OP_XCAST, ins->dreg, -1); + } + return ins; + } else { + g_assert ((arg0_type == MONO_TYPE_I4) || (arg0_type == MONO_TYPE_U4)); + return emit_simd_ins_for_sig (cfg, klass, OP_SSE2_PSHUFD, 0, arg0_type, fsig, new_args); + } } // There is no variable shuffle until avx512 return NULL;
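
For readers following the new create paths above: emit_vector_create_elementwise now folds every compile-time-constant argument into a single 16-byte vector literal and only emits insert-element instructions for the lanes that are not constant. Below is a minimal standalone sketch of that planning step for 32-bit lanes; the Arg struct and plan_vector128_create helper are stand-ins invented for this example and are not Mono types or APIs.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Stand-in for MonoInst: either a known 32-bit constant or a runtime value. */
typedef struct { bool is_const; int32_t value; } Arg;

/* Bake the constant lanes of Vector128.Create(e0..e3) into cns_vec and report
 * how many insert-element operations are still needed for the runtime lanes. */
static int
plan_vector128_create (const Arg *args, int count, uint8_t cns_vec[16])
{
	int inserts_needed = 0;
	memset (cns_vec, 0x00, 16);
	for (int i = 0; i < count; i++) {
		if (args [i].is_const)
			memcpy (cns_vec + i * sizeof (int32_t), &args [i].value, sizeof (int32_t));
		else
			inserts_needed++; /* lane stays zero; an insert fills it at runtime */
	}
	return inserts_needed;
}

int
main (void)
{
	/* Vector128.Create(1, x, 3, 4): three constant lanes, one runtime lane. */
	Arg args [4] = { { true, 1 }, { false, 0 }, { true, 3 }, { true, 4 } };
	uint8_t cns_vec [16];
	printf ("inserts still needed: %d\n", plan_vector128_create (args, 4, cns_vec)); /* 1 */
	return 0;
}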
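
On the ARM64 side, the new SN_Shuffle lowering handles a constant shuffle mask by widening each lane index into esize consecutive byte indices (or 0xFF for an out-of-range lane, which makes TBL1 produce zero) and feeding the resulting 16-byte vector to TBL1 through a byte-typed XCAST. A minimal sketch of that expansion for a 128-bit vector follows; build_tbl1_byte_indices and its signature are invented for the example.

#include <stdint.h>
#include <stdio.h>

/* Expand per-lane shuffle indices into the 16 byte selectors consumed by
 * AArch64 TBL1, mirroring the loop added to the ARM64 SN_Shuffle path. */
static void
build_tbl1_byte_indices (const uint64_t *lane_indices, int ecount, uint8_t out[16])
{
	int esize = 16 / ecount; /* element size in bytes for a 128-bit vector */
	for (int index = 0; index < ecount; index++) {
		uint64_t value = lane_indices [index];
		if (value < (uint64_t)ecount) {
			for (int i = 0; i < esize; i++)
				out [(index * esize) + i] = (uint8_t)((value * esize) + i);
		} else {
			/* Out-of-range index: 0xFF makes TBL1 write zero for the lane. */
			for (int i = 0; i < esize; i++)
				out [(index * esize) + i] = 0xFF;
		}
	}
}

int
main (void)
{
	/* Shuffle a Vector128<int> (4 lanes) by the constant mask <2, 3, 0, 9>. */
	uint64_t mask [4] = { 2, 3, 0, 9 };
	uint8_t tbl [16];
	build_tbl1_byte_indices (mask, 4, tbl);
	for (int i = 0; i < 16; i++)
		printf ("%02X ", tbl [i]); /* 08 09 0A 0B 0C 0D 0E 0F 00 01 02 03 FF FF FF FF */
	printf ("\n");
	return 0;
}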
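
On x64 under LLVM, the same constant-mask information drives two lowerings: element types with 2 or 4 lanes whose indices are all in range can use the immediate forms (SHUFPD, SHUFPS, PSHUFD), while byte/short element types or any out-of-range lane require the SSSE3 PSHUFB byte mask so lanes can be zeroed. The following is a sketch of the immediate control-word computation under those assumptions; compute_shuffle_control is an illustrative helper, not part of the patch.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Compute the immediate used by SHUFPD (2 lanes, 1 bit each) or
 * SHUFPS/PSHUFD (4 lanes, 2 bits each) for a constant shuffle mask.
 * needs_zero is set when some lane index is out of range, in which case
 * the immediate forms cannot be used and the PSHUFB byte-mask path is
 * required instead (byte/short elements always take that path). */
static uint8_t
compute_shuffle_control (const uint64_t *lane_indices, int ecount, bool *needs_zero)
{
	uint8_t control = 0;
	*needs_zero = false;
	for (int index = 0; index < ecount; index++) {
		uint64_t value = lane_indices [index];
		if (value < (uint64_t)ecount)
			control |= (uint8_t)(value << (index * (ecount / 2)));
		else
			*needs_zero = true; /* out-of-range lane must be zeroed */
	}
	return control;
}

int
main (void)
{
	bool needs_zero;
	/* Reverse a Vector128<int>/<float>: mask <3, 2, 1, 0>. */
	uint64_t mask [4] = { 3, 2, 1, 0 };
	uint8_t control = compute_shuffle_control (mask, 4, &needs_zero);
	printf ("control=0x%02X needs_zero=%d\n", control, needs_zero); /* 0x1B, 0 */
	return 0;
}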