From 4820a216cf081d268cd059a853c17de9cdd415aa Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Mon, 22 Jul 2024 23:23:39 -0700 Subject: [PATCH 01/15] Support mono creating xconst in a few more places --- src/mono/mono/mini/simd-intrinsics.c | 267 +++++++++++++++++++++++---- 1 file changed, 229 insertions(+), 38 deletions(-) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 44fc12ac0a232..06de318599964 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -83,6 +83,19 @@ mono_emit_simd_field_load (MonoCompile *cfg, MonoClassField *field, MonoInst *ad return NULL; } +static gboolean +is_const (const MonoInst* ins) +{ + switch (ins->opcode) { + case OP_ICONST: + case OP_I8CONST: + case OP_R4CONST: + case OP_R8CONST: + return TRUE; + } + return FALSE; +} + static gboolean is_zero_const (const MonoInst* ins) { @@ -1076,37 +1089,233 @@ emit_vector_insert_element ( int op = type_to_insert_op (type); if (is_zero_inited && is_zero_const (element)) { - // element already set to zero + // element already set to zero + return ins; + } + + if ((ins->opcode == OP_XCONST) && is_const (element)) { + // Specially handle insertion of a constant into a constant + int vector_size = mono_class_value_size (vklass, NULL); + if (vector_size == 16) { + guint8* cns_vec = (guint8*)ins->inst_p0; + if (type_enum_is_float (type)) { + double cns_val; + if (element->opcode == OP_R4CONST) { + cns_val = *(const float*)(element->inst_p0); + } else { + g_assert (element->opcode == OP_R8CONST); + cns_val = *(const double*)(element->inst_p0); + } + switch (type) { + case MONO_TYPE_R4: { + ((float*)cns_vec) [index] = (float)cns_val; + break; + } + case MONO_TYPE_R8: { + ((double*)cns_vec) [index] = (double)cns_val; + break; + } + default: { + g_assert_not_reached (); + } + } + } else { + gint64 cns_val; + if (element->opcode == OP_ICONST) { + cns_val = GTMREG_TO_INT (element->inst_c0); + } else { + g_assert (element->opcode == OP_I8CONST); + cns_val = element->inst_l; + } + switch (type) { + case MONO_TYPE_I1: + case MONO_TYPE_U1: { + ((guint8*)cns_vec) [index] = (guint8)cns_val; + break; + } + case MONO_TYPE_I2: + case MONO_TYPE_U2: { + ((guint16*)cns_vec) [index] = (guint16)cns_val; + break; + } + case MONO_TYPE_I4: + case MONO_TYPE_U4: { + ((guint32*)cns_vec) [index] = (guint32)cns_val; + break; + } + case MONO_TYPE_I8: + case MONO_TYPE_U8: { + ((guint64*)cns_vec) [index] = (guint64)cns_val; + break; + } + default: { + g_assert_not_reached (); + } + } + } + return ins; + } + } + #ifdef TARGET_ARM64 - } else if (!COMPILE_LLVM (cfg) && element->opcode == type_to_extract_op (type) && + if (!COMPILE_LLVM (cfg) && element->opcode == type_to_extract_op (type) && (type == MONO_TYPE_R4 || type == MONO_TYPE_R8)) { // OP_INSERT_Ix inserts from GP reg, not SIMD. Cannot optimize for int types. ins = emit_simd_ins (cfg, vklass, op, ins->dreg, element->sreg1); ins->inst_c0 = index | ((element->inst_c0) << 8); ins->inst_c1 = type; -#endif - } else { - ins = emit_simd_ins (cfg, vklass, op, ins->dreg, element->dreg); - ins->inst_c0 = index; - ins->inst_c1 = type; + return ins; } +#endif + + ins = emit_simd_ins (cfg, vklass, op, ins->dreg, element->dreg); + ins->inst_c0 = index; + ins->inst_c1 = type; + + return ins; +} +static MonoInst * +emit_vector_create_broadcast ( + MonoCompile *cfg, MonoClass *vklass, MonoType *etype, MonoInst *arg0) +{ + int vector_size = mono_class_value_size (vklass, NULL); + if (vector_size == 16) { + // We want to handle constant inputs and create constant nodes so other import + // optimizations can be enabled. + if (is_const (arg0)) { + guint8 cns_vec[16]; + if (type_enum_is_float (etype->type)) { + double cns_val; + if (arg0->opcode == OP_R4CONST) { + cns_val = *(const float*)(arg0->inst_p0); + } else { + g_assert (arg0->opcode == OP_R8CONST); + cns_val = *(const double*)(arg0->inst_p0); + } + switch (etype->type) { + case MONO_TYPE_R4: { + for (int i = 0; i < vector_size / 4; i++) { + ((float*)cns_vec) [i] = (float)cns_val; + } + break; + } + case MONO_TYPE_R8: { + for (int i = 0; i < vector_size / 8; i++) { + ((double*)cns_vec) [i] = (double)cns_val; + } + break; + } + default: { + g_assert_not_reached (); + } + } + } else { + gint64 cns_val; + if (arg0->opcode == OP_ICONST) { + cns_val = GTMREG_TO_INT (arg0->inst_c0); + } else { + g_assert (arg0->opcode == OP_I8CONST); + cns_val = arg0->inst_l; + } + switch (etype->type) { + case MONO_TYPE_I1: + case MONO_TYPE_U1: { + for (int i = 0; i < vector_size / 1; i++) { + ((guint8*)cns_vec) [i] = (guint8)cns_val; + } + break; + } + case MONO_TYPE_I2: + case MONO_TYPE_U2: { + for (int i = 0; i < vector_size / 2; i++) { + ((guint16*)cns_vec) [i] = (guint16)cns_val; + } + break; + } + case MONO_TYPE_I4: + case MONO_TYPE_U4: { + for (int i = 0; i < vector_size / 4; i++) { + ((guint32*)cns_vec) [i] = (guint32)cns_val; + } + break; + } + case MONO_TYPE_I8: + case MONO_TYPE_U8: { + for (int i = 0; i < vector_size / 8; i++) { + ((guint64*)cns_vec) [i] = (guint64)cns_val; + } + break; + } + default: { + g_assert_not_reached (); + } + } + } + return emit_xconst_v128 (cfg, vklass, (guint8*)cns_vec); + } + } + MonoInst* ins = emit_simd_ins (cfg, vklass, type_to_expand_op (etype->type), arg0->dreg, -1); + ins->inst_c1 = etype->type; return ins; } static MonoInst * emit_vector_create_elementwise ( - MonoCompile *cfg, MonoMethodSignature *fsig, MonoType *vtype, - MonoTypeEnum type, MonoInst **args) + MonoCompile *cfg, MonoClass *vklass, MonoType *etype, MonoInst **args, int param_count) { - MonoClass *vklass = mono_class_from_mono_type_internal (vtype); MonoInst *ins = emit_xzero (cfg, vklass); - for (int i = 0; i < fsig->param_count; ++i) - ins = emit_vector_insert_element (cfg, vklass, ins, type, args[i], i, TRUE); + for (int i = 0; i < param_count; ++i) + ins = emit_vector_insert_element (cfg, vklass, ins, etype->type, args[i], i, TRUE); return ins; } +static MonoInst * +emit_vector_create_scalar ( + MonoCompile *cfg, MonoClass *vklass, MonoType *etype, MonoInst *arg0, gboolean is_unsafe) +{ + int vector_size = mono_class_value_size (vklass, NULL); + if (vector_size == 16) { + // We want to handle constant inputs and create constant nodes so other import + // optimizations can be enabled. For is_unsafe, we treat it the same as broadcast + if (is_const (arg0)) { + if (is_unsafe) { + return emit_vector_create_broadcast (cfg, vklass, etype, arg0); + } + MonoInst *ins = emit_xzero (cfg, vklass); + ins = emit_vector_insert_element (cfg, vklass, ins, etype->type, arg0, 0, TRUE); + return ins; + } + } + int opcode = 0; + if (COMPILE_LLVM (cfg)) { + opcode = is_unsafe ? OP_CREATE_SCALAR_UNSAFE : OP_CREATE_SCALAR; + } else { +#ifdef TARGET_AMD64 + MonoInst *ins; + + ins = emit_xzero (cfg, vklass); + if (!is_zero_const (arg0)) { + ins = emit_simd_ins (cfg, vklass, type_to_insert_op (etype->type), ins->dreg, arg0->dreg); + ins->inst_c0 = 0; + ins->inst_c1 = etype->type; + } + return ins; +#else + if (type_enum_is_float (etype->type)) { + opcode = is_unsafe ? OP_CREATE_SCALAR_UNSAFE_FLOAT : OP_CREATE_SCALAR_FLOAT; + } else { + opcode = is_unsafe ? OP_CREATE_SCALAR_UNSAFE_INT : OP_CREATE_SCALAR_INT; + } +#endif + } + g_assert (opcode != 0); + MonoInst* ins = emit_simd_ins (cfg, vklass, opcode, arg0->dreg, -1); + ins->inst_c1 = etype->type; + return ins; +} + static int type_to_xinsert_op (MonoTypeEnum type) { @@ -1867,9 +2076,8 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi if (!MONO_TYPE_IS_VECTOR_PRIMITIVE (etype)) return NULL; if (fsig->param_count == 1 && mono_metadata_type_equal (fsig->params [0], etype)) { - MonoInst* ins = emit_simd_ins (cfg, klass, type_to_expand_op (etype->type), args [0]->dreg, -1); - ins->inst_c1 = arg0_type; - return ins; + MonoClass *vklass = mono_class_from_mono_type_internal(fsig->ret); + return emit_vector_create_broadcast (cfg, vklass, etype, args [0]); } else if (is_create_from_half_vectors_overload (fsig)) { #if defined(TARGET_AMD64) // Require Vector64 SIMD support @@ -1890,8 +2098,10 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi return emit_simd_ins (cfg, klass, OP_XCONCAT, args [0]->dreg, args [1]->dreg); } - else if (is_elementwise_create_overload (fsig, etype)) - return emit_vector_create_elementwise (cfg, fsig, fsig->ret, arg0_type, args); + else if (is_elementwise_create_overload (fsig, etype)) { + MonoClass *vklass = mono_class_from_mono_type_internal(fsig->ret); + return emit_vector_create_elementwise (cfg, vklass, etype, args, fsig->param_count); + } break; } case SN_CreateScalar: @@ -1900,27 +2110,8 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi if (!MONO_TYPE_IS_VECTOR_PRIMITIVE (etype)) return NULL; gboolean is_unsafe = id == SN_CreateScalarUnsafe; - if (COMPILE_LLVM (cfg)) { - return emit_simd_ins_for_sig (cfg, klass, is_unsafe ? OP_CREATE_SCALAR_UNSAFE : OP_CREATE_SCALAR, -1, arg0_type, fsig, args); - } else { -#ifdef TARGET_AMD64 - MonoInst *ins; - - ins = emit_xzero (cfg, klass); - if (!is_zero_const (args [0])) { - ins = emit_simd_ins (cfg, klass, type_to_insert_op (arg0_type), ins->dreg, args [0]->dreg); - ins->inst_c0 = 0; - ins->inst_c1 = arg0_type; - } - return ins; -#else - if (type_enum_is_float (arg0_type)) { - return emit_simd_ins_for_sig (cfg, klass, is_unsafe ? OP_CREATE_SCALAR_UNSAFE_FLOAT : OP_CREATE_SCALAR_FLOAT, -1, arg0_type, fsig, args); - } else { - return emit_simd_ins_for_sig (cfg, klass, is_unsafe ? OP_CREATE_SCALAR_UNSAFE_INT : OP_CREATE_SCALAR_INT, -1, arg0_type, fsig, args); - } -#endif - } + MonoClass *vklass = mono_class_from_mono_type_internal(fsig->ret); + return emit_vector_create_scalar (cfg, vklass, etype, args [0], is_unsafe); } case SN_Dot: { return emit_dot (cfg, klass, fsig->params [0], arg0_type, args [0]->dreg, args [1]->dreg); From fcdbd18aaf711629a3c16dcafbc6c07e5f89a667 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Mon, 22 Jul 2024 23:58:57 -0700 Subject: [PATCH 02/15] Update mono to support shuffle for constant inputs --- src/mono/mono/mini/simd-intrinsics.c | 121 ++++++++++++++++++++++++++- 1 file changed, 118 insertions(+), 3 deletions(-) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 06de318599964..d567b7d89cbec 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -660,6 +660,50 @@ emit_xconst_v128 (MonoCompile *cfg, MonoClass *klass, guint8 value[16]) return ins; } +static guint64 +get_xconst_int_elem (MonoCompile *cfg, MonoInst *ins, MonoTypeEnum etype, int index) +{ + g_assert (ins->opcode == OP_XCONST); + g_assert (index >= 0); + switch (etype) { + case MONO_TYPE_I1: { + g_assert (index < 16); + return ((gint8*)ins->inst_p0) [index]; + } + case MONO_TYPE_U1: { + g_assert (index < 16); + return ((guint8*)ins->inst_p0) [index]; + } + case MONO_TYPE_I2: { + g_assert (index < 8); + return ((gint16*)ins->inst_p0) [index]; + } + case MONO_TYPE_U2: { + g_assert (index < 8); + return ((guint16*)ins->inst_p0) [index]; + } + case MONO_TYPE_I4: { + g_assert (index < 4); + return ((gint32*)ins->inst_p0) [index]; + } + case MONO_TYPE_U4: { + g_assert (index < 4); + return ((guint32*)ins->inst_p0) [index]; + } + case MONO_TYPE_I8: { + g_assert (index < 2); + return ((gint64*)ins->inst_p0) [index]; + } + case MONO_TYPE_U8: { + g_assert (index < 2); + return ((guint64*)ins->inst_p0) [index]; + } + default: { + g_assert_not_reached (); + } + } +} + #ifdef TARGET_ARM64 static int type_to_extract_op (MonoTypeEnum type); static MonoType* get_vector_t_elem_type (MonoType *vector_type); @@ -2609,7 +2653,8 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi return emit_simd_ins_for_unary_op (cfg, klass, fsig, args, arg0_type, id); } case SN_Shuffle: { - if (!is_element_type_primitive (fsig->params [0])) + MonoType *etype = fsig->params [0]; + if (!is_element_type_primitive (etype)) return NULL; #ifdef TARGET_WASM return emit_simd_ins_for_sig (cfg, klass, OP_WASM_SIMD_SWIZZLE, -1, -1, fsig, args); @@ -2619,8 +2664,78 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi return NULL; #elif defined(TARGET_AMD64) if (COMPILE_LLVM (cfg)) { - if (is_SIMD_feature_supported (cfg, MONO_CPU_X86_SSSE3) && vector_size == 128 && (arg0_type == MONO_TYPE_I1 || arg0_type == MONO_TYPE_U1)) - return emit_simd_ins_for_sig (cfg, klass, OP_XOP_X_X_X, INTRINS_SSE_PSHUFB, 0, fsig, args); + if (vector_size != 128) { + return NULL; + } + if (args [1]->opcode != OP_XCONST) { + return NULL; + } + int esize = mono_class_value_size (mono_class_from_mono_type_internal (etype), NULL); + int ecount = (vector_size / 8) / esize; + guint8 control = 0; + gboolean needs_zero = false; + guint64 value = 0; + guint8 vec_cns[16]; + if ((arg0_type == MONO_TYPE_I1) || (arg0_type == MONO_TYPE_U1)) { + needs_zero = true; + } else if ((arg0_type == MONO_TYPE_I2) || (arg0_type == MONO_TYPE_U2)) { + needs_zero = true; + } + for (int index = 0; index < ecount; index++) { + value = get_xconst_int_elem (cfg, args [1], etype->type, index); + if (value < ecount) { + // Setting the control for byte/sbyte and short/ushort is unnecessary + // and will actually compute an incorrect control word. But it simplifies + // the overall logic needed here and will remain unused. + + control |= (value << (index * (ecount / 2))); + + // When Ssse3 is supported, we may need vecCns to accurately select the relevant + // bytes if some index is outside the valid range. Since x86/x64 is little-endian + // we can simplify this down to a for loop that scales the value and selects count + // sequential bytes. + + for (int i = 0; i < esize; i++) { + vec_cns[(index * esize) + i] = (guint8)((value * esize) + i); + } + } else { + needs_zero = true; + + // When Ssse3 is supported, we may need vecCns to accurately select the relevant + // bytes if some index is outside the valid range. We can do this by just zeroing + // out each byte in the element. This only requires the most significant bit to be + // set, but we use 0xFF instead since that will be the equivalent of AllBitsSet + + for (int i = 0; i < esize; i++) { + vec_cns[(index * esize) + i] = 0xFF; + } + } + } + MonoInst *new_args[2]; + new_args [0] = args[0]; + if (needs_zero) { + if (!is_SIMD_feature_supported (cfg, MONO_CPU_X86_SSSE3)) { + return NULL; + } + new_args [1] = emit_xconst_v128 (cfg, klass, vec_cns); + return emit_simd_ins_for_sig (cfg, klass, OP_XOP_X_X_X, INTRINS_SSE_PSHUFB, 0, fsig, new_args); + } + if ((arg0_type == MONO_TYPE_I8) || (arg0_type == MONO_TYPE_U8)) { + // TYP_LONG and TYP_ULONG don't have their own shuffle/permute instructions and so we'll + // just utilize the path for TYP_DOUBLE for simplicity. We could alternatively break this + // down into a TYP_INT or TYP_UINT based shuffle, but that's additional complexity for no + // real benefit since shuffle gets its own port rather than using the fp specific ports. + arg0_type = MONO_TYPE_R8; + } + EMIT_NEW_ICONST (cfg, new_args [1], control); + if (arg0_type == MONO_TYPE_R4) { + return emit_simd_ins_for_sig (cfg, klass, OP_SSE_SHUFPS, 0, arg0_type, fsig, new_args); + } else if (arg0_type == MONO_TYPE_R8) { + return emit_simd_ins_for_sig (cfg, klass, OP_SSE2_SHUFPD, 0, arg0_type, fsig, new_args); + } else { + g_assert ((arg0_type == MONO_TYPE_I4) || (arg0_type == MONO_TYPE_U4)); + return emit_simd_ins_for_sig (cfg, klass, OP_SSE2_PSHUFD, 0, arg0_type, fsig, new_args); + } } // There is no variable shuffle until avx512 return NULL; From 095873b426c9fb36829c7e121edb35062bf28f84 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Tue, 23 Jul 2024 06:32:25 -0700 Subject: [PATCH 03/15] Ensure that arm64 also accelerates shuffle for non-constant inputs --- src/mono/mono/mini/simd-intrinsics.c | 44 ++++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index d567b7d89cbec..bd93b7127a9d9 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -2659,9 +2659,43 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi #ifdef TARGET_WASM return emit_simd_ins_for_sig (cfg, klass, OP_WASM_SIMD_SWIZZLE, -1, -1, fsig, args); #elif defined(TARGET_ARM64) - if (vector_size == 128 && (arg0_type == MONO_TYPE_I1 || arg0_type == MONO_TYPE_U1)) + if (vector_size != 128) { + return NULL; + } + if ((arg0_type == MONO_TYPE_I1 || arg0_type == MONO_TYPE_U1)) { return emit_simd_ins_for_sig (cfg, klass, OP_XOP_OVR_X_X_X, INTRINS_AARCH64_ADV_SIMD_TBL1, 0, fsig, args); - return NULL; + } + if (args [1]->opcode != OP_XCONST) { + return NULL; + } + MonoInst *new_args[2]; + new_args [0] = args [0]; + if (COMPILE_LLVM (cfg)) { + if ((get_xconst_int_elem (cfg, args [1], MONO_TYPE_U8, 0) == 0x300000002) && + (get_xconst_int_elem (cfg, args [1], MONO_TYPE_U8, 1) == 0x100000000)) { + new_args [1] = args [0]; + return emit_simd_ins_for_sig (cfg, klass, OP_ARM64_EXT, 0, MONO_TYPE_U8, fsig, new_args); + } + } + int esize = mono_class_value_size (mono_class_from_mono_type_internal (etype), NULL); + int ecount = (vector_size / 8) / esize; + guint64 value = 0; + guint8 vec_cns[16]; + for (int index = 0; index < ecount; index++) { + value = get_xconst_int_elem (cfg, args [1], arg0_type, index); + + if (value < ecount) { + for (int i = 0; i < esize; i++) { + vec_cns [(index * esize) + i] = (guint8)((value * esize) + i); + } + } else { + for (int i = 0; i < esize; i++) { + vec_cns [(index * esize) + i] = 0xFF; + } + } + } + new_args [1] = emit_xconst_v128 (cfg, klass, vec_cns); + return emit_simd_ins_for_sig (cfg, klass, OP_XOP_OVR_X_X_X, INTRINS_AARCH64_ADV_SIMD_TBL1, 0, fsig, new_args); #elif defined(TARGET_AMD64) if (COMPILE_LLVM (cfg)) { if (vector_size != 128) { @@ -2682,7 +2716,7 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi needs_zero = true; } for (int index = 0; index < ecount; index++) { - value = get_xconst_int_elem (cfg, args [1], etype->type, index); + value = get_xconst_int_elem (cfg, args [1], arg0_type, index); if (value < ecount) { // Setting the control for byte/sbyte and short/ushort is unnecessary // and will actually compute an incorrect control word. But it simplifies @@ -2696,7 +2730,7 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi // sequential bytes. for (int i = 0; i < esize; i++) { - vec_cns[(index * esize) + i] = (guint8)((value * esize) + i); + vec_cns [(index * esize) + i] = (guint8)((value * esize) + i); } } else { needs_zero = true; @@ -2707,7 +2741,7 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi // set, but we use 0xFF instead since that will be the equivalent of AllBitsSet for (int i = 0; i < esize; i++) { - vec_cns[(index * esize) + i] = 0xFF; + vec_cns [(index * esize) + i] = 0xFF; } } } From d87a444600d63033b8746ada56eb16fcda7921fd Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Tue, 23 Jul 2024 08:22:30 -0700 Subject: [PATCH 04/15] Ensure OP_XZERO and OP_XONES are recognized as being constant --- src/mono/mono/mini/simd-intrinsics.c | 59 +++++++++++++++++++++------- 1 file changed, 45 insertions(+), 14 deletions(-) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index bd93b7127a9d9..a6b1b9c057a7e 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -96,6 +96,18 @@ is_const (const MonoInst* ins) return FALSE; } +static gboolean +is_xconst (const MonoInst* ins) +{ + switch (ins->opcode) { + case OP_XCONST: + case OP_XZERO: + case OP_XONES: + return TRUE; + } + return FALSE; +} + static gboolean is_zero_const (const MonoInst* ins) { @@ -663,40 +675,48 @@ emit_xconst_v128 (MonoCompile *cfg, MonoClass *klass, guint8 value[16]) static guint64 get_xconst_int_elem (MonoCompile *cfg, MonoInst *ins, MonoTypeEnum etype, int index) { - g_assert (ins->opcode == OP_XCONST); + guint8 cns_vec[16]; + if (ins->opcode == OP_XZERO) { + memset (cns_vec, 0x00, 16); + } else if (ins->opcode == OP_XONES) { + memset (cns_vec, 0xFF, 16); + } else { + g_assert (ins->opcode == OP_XCONST); + memcpy (cns_vec, ins->inst_p0, 16); + } g_assert (index >= 0); switch (etype) { case MONO_TYPE_I1: { g_assert (index < 16); - return ((gint8*)ins->inst_p0) [index]; + return ((gint8*)cns_vec) [index]; } case MONO_TYPE_U1: { g_assert (index < 16); - return ((guint8*)ins->inst_p0) [index]; + return ((guint8*)cns_vec) [index]; } case MONO_TYPE_I2: { g_assert (index < 8); - return ((gint16*)ins->inst_p0) [index]; + return ((gint16*)cns_vec) [index]; } case MONO_TYPE_U2: { g_assert (index < 8); - return ((guint16*)ins->inst_p0) [index]; + return ((guint16*)cns_vec) [index]; } case MONO_TYPE_I4: { g_assert (index < 4); - return ((gint32*)ins->inst_p0) [index]; + return ((gint32*)cns_vec) [index]; } case MONO_TYPE_U4: { g_assert (index < 4); - return ((guint32*)ins->inst_p0) [index]; + return ((guint32*)cns_vec) [index]; } case MONO_TYPE_I8: { g_assert (index < 2); - return ((gint64*)ins->inst_p0) [index]; + return ((gint64*)cns_vec) [index]; } case MONO_TYPE_U8: { g_assert (index < 2); - return ((guint64*)ins->inst_p0) [index]; + return ((guint64*)cns_vec) [index]; } default: { g_assert_not_reached (); @@ -1137,11 +1157,19 @@ emit_vector_insert_element ( return ins; } - if ((ins->opcode == OP_XCONST) && is_const (element)) { + if (is_xconst (ins) && is_const (element)) { // Specially handle insertion of a constant into a constant int vector_size = mono_class_value_size (vklass, NULL); if (vector_size == 16) { - guint8* cns_vec = (guint8*)ins->inst_p0; + guint8 cns_vec[16]; + if (ins->opcode == OP_XZERO) { + memset (cns_vec, 0x00, 16); + } else if (ins->opcode == OP_XONES) { + memset (cns_vec, 0xFF, 16); + } else { + g_assert (ins->opcode == OP_XCONST); + memcpy (cns_vec, ins->inst_p0, 16); + } if (type_enum_is_float (type)) { double cns_val; if (element->opcode == OP_R4CONST) { @@ -1197,7 +1225,10 @@ emit_vector_insert_element ( } } } - return ins; + if (ins->opcode == OP_XCONST) { + return ins; + } + return emit_xconst_v128 (cfg, vklass, cns_vec); } } @@ -2665,7 +2696,7 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi if ((arg0_type == MONO_TYPE_I1 || arg0_type == MONO_TYPE_U1)) { return emit_simd_ins_for_sig (cfg, klass, OP_XOP_OVR_X_X_X, INTRINS_AARCH64_ADV_SIMD_TBL1, 0, fsig, args); } - if (args [1]->opcode != OP_XCONST) { + if (!is_xconst (args [1])) { return NULL; } MonoInst *new_args[2]; @@ -2701,7 +2732,7 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi if (vector_size != 128) { return NULL; } - if (args [1]->opcode != OP_XCONST) { + if (!is_xconst (args [1])) { return NULL; } int esize = mono_class_value_size (mono_class_from_mono_type_internal (etype), NULL); From 71708fe958366ea0eb4d39b6b122ac1e64a3bb41 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Tue, 23 Jul 2024 09:38:31 -0700 Subject: [PATCH 05/15] Ensure shuffle creates a correct instruction when the fsig doesn't match the necessary parameter count --- src/mono/mono/mini/simd-intrinsics.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index a6b1b9c057a7e..75117971b7051 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -2699,13 +2699,18 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi if (!is_xconst (args [1])) { return NULL; } - MonoInst *new_args[2]; + MonoInst *new_args[3]; new_args [0] = args [0]; if (COMPILE_LLVM (cfg)) { if ((get_xconst_int_elem (cfg, args [1], MONO_TYPE_U8, 0) == 0x300000002) && (get_xconst_int_elem (cfg, args [1], MONO_TYPE_U8, 1) == 0x100000000)) { new_args [1] = args [0]; - return emit_simd_ins_for_sig (cfg, klass, OP_ARM64_EXT, 0, MONO_TYPE_U8, fsig, new_args); + EMIT_NEW_ICONST (cfg, new_args [2], 1); + MonoInst* ins = emit_simd_ins (cfg, klass, OP_ARM64_EXT, new_args [0]->dreg, new_args [1]->dreg); + ins->inst_c0 = 0; + ins->inst_c1 = MONO_TYPE_U8; + ins->sreg3 = new_args [2]->dreg; + return ins; } } int esize = mono_class_value_size (mono_class_from_mono_type_internal (etype), NULL); @@ -2776,8 +2781,8 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi } } } - MonoInst *new_args[2]; - new_args [0] = args[0]; + MonoInst *new_args[3]; + new_args [0] = args [0]; if (needs_zero) { if (!is_SIMD_feature_supported (cfg, MONO_CPU_X86_SSSE3)) { return NULL; @@ -2792,11 +2797,15 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi // real benefit since shuffle gets its own port rather than using the fp specific ports. arg0_type = MONO_TYPE_R8; } - EMIT_NEW_ICONST (cfg, new_args [1], control); - if (arg0_type == MONO_TYPE_R4) { - return emit_simd_ins_for_sig (cfg, klass, OP_SSE_SHUFPS, 0, arg0_type, fsig, new_args); - } else if (arg0_type == MONO_TYPE_R8) { - return emit_simd_ins_for_sig (cfg, klass, OP_SSE2_SHUFPD, 0, arg0_type, fsig, new_args); + if ((arg0_type == MONO_TYPE_R4) || (arg0_type == MONO_TYPE_R8)) { + int opcode = (arg0_type == MONO_TYPE_R4) ? OP_SSE_SHUFPS : OP_SSE2_SHUFPD; + new_args [1] = args [0]; + EMIT_NEW_ICONST (cfg, new_args [2], control); + MonoInst* ins = emit_simd_ins (cfg, klass, opcode, new_args [0]->dreg, new_args [1]->dreg); + ins->inst_c0 = 0; + ins->inst_c1 = arg0_type; + ins->sreg3 = new_args [2]->dreg; + return ins; } else { g_assert ((arg0_type == MONO_TYPE_I4) || (arg0_type == MONO_TYPE_U4)); return emit_simd_ins_for_sig (cfg, klass, OP_SSE2_PSHUFD, 0, arg0_type, fsig, new_args); From f85e513f6e6fcbd01e04491811a0fb6f95c5537d Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Tue, 23 Jul 2024 10:28:45 -0700 Subject: [PATCH 06/15] Ensure that getting the index for floating-point shuffle is possible --- src/mono/mono/mini/simd-intrinsics.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 75117971b7051..8974e9f172617 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -702,7 +702,8 @@ get_xconst_int_elem (MonoCompile *cfg, MonoInst *ins, MonoTypeEnum etype, int in g_assert (index < 8); return ((guint16*)cns_vec) [index]; } - case MONO_TYPE_I4: { + case MONO_TYPE_I4: + case MONO_TYPE_R4: { g_assert (index < 4); return ((gint32*)cns_vec) [index]; } @@ -710,7 +711,8 @@ get_xconst_int_elem (MonoCompile *cfg, MonoInst *ins, MonoTypeEnum etype, int in g_assert (index < 4); return ((guint32*)cns_vec) [index]; } - case MONO_TYPE_I8: { + case MONO_TYPE_I8: + case MONO_TYPE_R8: { g_assert (index < 2); return ((gint64*)cns_vec) [index]; } From 2995754f734c841cc324d5191c164d2928317278 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Tue, 23 Jul 2024 12:04:15 -0700 Subject: [PATCH 07/15] Ensure the right class handle is passed down to LLVM so overload resolution can function --- src/mono/mono/mini/simd-intrinsics.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 8974e9f172617..6456109724909 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -2706,6 +2706,8 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi if (COMPILE_LLVM (cfg)) { if ((get_xconst_int_elem (cfg, args [1], MONO_TYPE_U8, 0) == 0x300000002) && (get_xconst_int_elem (cfg, args [1], MONO_TYPE_U8, 1) == 0x100000000)) { + etype = m_class_get_byval_arg (mono_defaults.uint64_class); + klass = create_class_instance ("System.Runtime.Intrinsics", "Vector128`1", etype); new_args [1] = args [0]; EMIT_NEW_ICONST (cfg, new_args [2], 1); MonoInst* ins = emit_simd_ins (cfg, klass, OP_ARM64_EXT, new_args [0]->dreg, new_args [1]->dreg); @@ -2732,6 +2734,8 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi } } } + etype = m_class_get_byval_arg (mono_defaults.byte_class); + klass = create_class_instance ("System.Runtime.Intrinsics", "Vector128`1", etype); new_args [1] = emit_xconst_v128 (cfg, klass, vec_cns); return emit_simd_ins_for_sig (cfg, klass, OP_XOP_OVR_X_X_X, INTRINS_AARCH64_ADV_SIMD_TBL1, 0, fsig, new_args); #elif defined(TARGET_AMD64) @@ -2789,6 +2793,8 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi if (!is_SIMD_feature_supported (cfg, MONO_CPU_X86_SSSE3)) { return NULL; } + etype = m_class_get_byval_arg (mono_defaults.byte_class); + klass = create_class_instance ("System.Runtime.Intrinsics", "Vector128`1", etype); new_args [1] = emit_xconst_v128 (cfg, klass, vec_cns); return emit_simd_ins_for_sig (cfg, klass, OP_XOP_X_X_X, INTRINS_SSE_PSHUFB, 0, fsig, new_args); } @@ -2798,6 +2804,8 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi // down into a TYP_INT or TYP_UINT based shuffle, but that's additional complexity for no // real benefit since shuffle gets its own port rather than using the fp specific ports. arg0_type = MONO_TYPE_R8; + etype = m_class_get_byval_arg (mono_defaults.double_class); + klass = create_class_instance ("System.Runtime.Intrinsics", "Vector128`1", etype); } if ((arg0_type == MONO_TYPE_R4) || (arg0_type == MONO_TYPE_R8)) { int opcode = (arg0_type == MONO_TYPE_R4) ? OP_SSE_SHUFPS : OP_SSE2_SHUFPD; From b35b1e362f39874b7a06d70560a2a164ddd2e734 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Tue, 23 Jul 2024 13:45:57 -0700 Subject: [PATCH 08/15] Make sure we update the original xconst if we mutate it --- src/mono/mono/mini/simd-intrinsics.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 6456109724909..a94dba0bf9dbf 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -1154,7 +1154,7 @@ emit_vector_insert_element ( { int op = type_to_insert_op (type); - if (is_zero_inited && is_zero_const (element)) { + if (((is_zero_inited) || (ins->opcode == OP_XZERO)) && is_zero_const (element)) { // element already set to zero return ins; } @@ -1228,6 +1228,7 @@ emit_vector_insert_element ( } } if (ins->opcode == OP_XCONST) { + memcpy (ins->inst_p0, cns_vec, 16); return ins; } return emit_xconst_v128 (cfg, vklass, cns_vec); From 1d55132189ce11f24bd47d89d838fa2aed52f4d9 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Tue, 23 Jul 2024 18:19:04 -0700 Subject: [PATCH 09/15] Return a new constant and instead of mutating the existing one --- src/mono/mono/mini/simd-intrinsics.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index a94dba0bf9dbf..c303bec495da2 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -1228,8 +1228,7 @@ emit_vector_insert_element ( } } if (ins->opcode == OP_XCONST) { - memcpy (ins->inst_p0, cns_vec, 16); - return ins; + return emit_xconst_v128 (cfg, vklass, cns_vec); } return emit_xconst_v128 (cfg, vklass, cns_vec); } From 3c327b022cd791712c44563e652a03c9d4659292 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Tue, 23 Jul 2024 19:28:49 -0700 Subject: [PATCH 10/15] Insert relevant xcast nodes --- src/mono/mono/mini/simd-intrinsics.c | 61 ++++++++++++++++------------ 1 file changed, 36 insertions(+), 25 deletions(-) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index c303bec495da2..9495528319994 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -1154,7 +1154,7 @@ emit_vector_insert_element ( { int op = type_to_insert_op (type); - if (((is_zero_inited) || (ins->opcode == OP_XZERO)) && is_zero_const (element)) { + if (is_zero_inited && is_zero_const (element)) { // element already set to zero return ins; } @@ -1227,9 +1227,6 @@ emit_vector_insert_element ( } } } - if (ins->opcode == OP_XCONST) { - return emit_xconst_v128 (cfg, vklass, cns_vec); - } return emit_xconst_v128 (cfg, vklass, cns_vec); } } @@ -2702,23 +2699,24 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi return NULL; } MonoInst *new_args[3]; - new_args [0] = args [0]; if (COMPILE_LLVM (cfg)) { if ((get_xconst_int_elem (cfg, args [1], MONO_TYPE_U8, 0) == 0x300000002) && (get_xconst_int_elem (cfg, args [1], MONO_TYPE_U8, 1) == 0x100000000)) { - etype = m_class_get_byval_arg (mono_defaults.uint64_class); - klass = create_class_instance ("System.Runtime.Intrinsics", "Vector128`1", etype); - new_args [1] = args [0]; + MonoType *op_etype = m_class_get_byval_arg (mono_defaults.uint64_class); + MonoClass *op_klass = create_class_instance ("System.Runtime.Intrinsics", "Vector128`1", op_etype); + new_args [0] = emit_simd_ins (cfg, op_klass, OP_XCAST, args [0]->dreg, -1); + new_args [1] = new_args [0]; EMIT_NEW_ICONST (cfg, new_args [2], 1); - MonoInst* ins = emit_simd_ins (cfg, klass, OP_ARM64_EXT, new_args [0]->dreg, new_args [1]->dreg); + MonoInst *ins = emit_simd_ins (cfg, op_klass, OP_ARM64_EXT, new_args [0]->dreg, new_args [1]->dreg); ins->inst_c0 = 0; ins->inst_c1 = MONO_TYPE_U8; ins->sreg3 = new_args [2]->dreg; - return ins; + return emit_simd_ins (cfg, klass, OP_XCAST, ins->dreg, -1); } } + int vsize = mono_class_value_size (klass, NULL); int esize = mono_class_value_size (mono_class_from_mono_type_internal (etype), NULL); - int ecount = (vector_size / 8) / esize; + int ecount = vsize / esize; guint64 value = 0; guint8 vec_cns[16]; for (int index = 0; index < ecount; index++) { @@ -2734,10 +2732,12 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi } } } - etype = m_class_get_byval_arg (mono_defaults.byte_class); - klass = create_class_instance ("System.Runtime.Intrinsics", "Vector128`1", etype); - new_args [1] = emit_xconst_v128 (cfg, klass, vec_cns); - return emit_simd_ins_for_sig (cfg, klass, OP_XOP_OVR_X_X_X, INTRINS_AARCH64_ADV_SIMD_TBL1, 0, fsig, new_args); + MonoType *op_etype = m_class_get_byval_arg (mono_defaults.byte_class); + MonoClass *op_klass = create_class_instance ("System.Runtime.Intrinsics", "Vector128`1", op_etype); + new_args [0] = emit_simd_ins (cfg, op_klass, OP_XCAST, args [0]->dreg, -1); + new_args [1] = emit_xconst_v128 (cfg, op_klass, vec_cns); + MonoInst *ins = emit_simd_ins_for_sig (cfg, op_klass, OP_XOP_OVR_X_X_X, INTRINS_AARCH64_ADV_SIMD_TBL1, 0, fsig, new_args); + return emit_simd_ins (cfg, klass, OP_XCAST, ins->dreg, -1); #elif defined(TARGET_AMD64) if (COMPILE_LLVM (cfg)) { if (vector_size != 128) { @@ -2746,8 +2746,11 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi if (!is_xconst (args [1])) { return NULL; } + MonoType *op_etype = etype; + MonoClass *op_klass = klass; + int vsize = mono_class_value_size (klass, NULL); int esize = mono_class_value_size (mono_class_from_mono_type_internal (etype), NULL); - int ecount = (vector_size / 8) / esize; + int ecount = vsize / esize; guint8 control = 0; gboolean needs_zero = false; guint64 value = 0; @@ -2788,15 +2791,16 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi } } MonoInst *new_args[3]; - new_args [0] = args [0]; if (needs_zero) { if (!is_SIMD_feature_supported (cfg, MONO_CPU_X86_SSSE3)) { return NULL; } - etype = m_class_get_byval_arg (mono_defaults.byte_class); - klass = create_class_instance ("System.Runtime.Intrinsics", "Vector128`1", etype); - new_args [1] = emit_xconst_v128 (cfg, klass, vec_cns); - return emit_simd_ins_for_sig (cfg, klass, OP_XOP_X_X_X, INTRINS_SSE_PSHUFB, 0, fsig, new_args); + op_etype = m_class_get_byval_arg (mono_defaults.byte_class); + op_klass = create_class_instance ("System.Runtime.Intrinsics", "Vector128`1", op_etype); + new_args [0] = emit_simd_ins (cfg, op_klass, OP_XCAST, args [0]->dreg, -1); + new_args [1] = emit_xconst_v128 (cfg, op_klass, vec_cns); + MonoInst *ins = emit_simd_ins_for_sig (cfg, op_klass, OP_XOP_X_X_X, INTRINS_SSE_PSHUFB, 0, fsig, new_args); + return emit_simd_ins (cfg, klass, OP_XCAST, ins->dreg, -1); } if ((arg0_type == MONO_TYPE_I8) || (arg0_type == MONO_TYPE_U8)) { // TYP_LONG and TYP_ULONG don't have their own shuffle/permute instructions and so we'll @@ -2804,17 +2808,24 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi // down into a TYP_INT or TYP_UINT based shuffle, but that's additional complexity for no // real benefit since shuffle gets its own port rather than using the fp specific ports. arg0_type = MONO_TYPE_R8; - etype = m_class_get_byval_arg (mono_defaults.double_class); - klass = create_class_instance ("System.Runtime.Intrinsics", "Vector128`1", etype); + op_etype = m_class_get_byval_arg (mono_defaults.double_class); + op_klass = create_class_instance ("System.Runtime.Intrinsics", "Vector128`1", etype); } if ((arg0_type == MONO_TYPE_R4) || (arg0_type == MONO_TYPE_R8)) { int opcode = (arg0_type == MONO_TYPE_R4) ? OP_SSE_SHUFPS : OP_SSE2_SHUFPD; - new_args [1] = args [0]; + new_args [0] = args [0]; + if (op_klass != klass) { + new_args [0] = emit_simd_ins (cfg, op_klass, OP_XCAST, new_args [0]->dreg, -1); + } + new_args [1] = new_args [0]; EMIT_NEW_ICONST (cfg, new_args [2], control); - MonoInst* ins = emit_simd_ins (cfg, klass, opcode, new_args [0]->dreg, new_args [1]->dreg); + MonoInst* ins = emit_simd_ins (cfg, op_klass, opcode, new_args [0]->dreg, new_args [1]->dreg); ins->inst_c0 = 0; ins->inst_c1 = arg0_type; ins->sreg3 = new_args [2]->dreg; + if (op_klass != klass) { + ins = emit_simd_ins (cfg, klass, OP_XCAST, ins->dreg, -1); + } return ins; } else { g_assert ((arg0_type == MONO_TYPE_I4) || (arg0_type == MONO_TYPE_U4)); From 50cec20adae403f4d3457f67ce51a927755b222e Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Wed, 24 Jul 2024 08:49:05 -0700 Subject: [PATCH 11/15] Add some asserts around the ecount --- src/mono/mono/mini/simd-intrinsics.c | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 9495528319994..e19e01339de34 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -2698,42 +2698,27 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi if (!is_xconst (args [1])) { return NULL; } - MonoInst *new_args[3]; - if (COMPILE_LLVM (cfg)) { - if ((get_xconst_int_elem (cfg, args [1], MONO_TYPE_U8, 0) == 0x300000002) && - (get_xconst_int_elem (cfg, args [1], MONO_TYPE_U8, 1) == 0x100000000)) { - MonoType *op_etype = m_class_get_byval_arg (mono_defaults.uint64_class); - MonoClass *op_klass = create_class_instance ("System.Runtime.Intrinsics", "Vector128`1", op_etype); - new_args [0] = emit_simd_ins (cfg, op_klass, OP_XCAST, args [0]->dreg, -1); - new_args [1] = new_args [0]; - EMIT_NEW_ICONST (cfg, new_args [2], 1); - MonoInst *ins = emit_simd_ins (cfg, op_klass, OP_ARM64_EXT, new_args [0]->dreg, new_args [1]->dreg); - ins->inst_c0 = 0; - ins->inst_c1 = MONO_TYPE_U8; - ins->sreg3 = new_args [2]->dreg; - return emit_simd_ins (cfg, klass, OP_XCAST, ins->dreg, -1); - } - } int vsize = mono_class_value_size (klass, NULL); int esize = mono_class_value_size (mono_class_from_mono_type_internal (etype), NULL); int ecount = vsize / esize; + g_assert ((ecount == 2) || (ecount == 4) || (ecount == 8) || (ecount == 16)); guint64 value = 0; guint8 vec_cns[16]; for (int index = 0; index < ecount; index++) { value = get_xconst_int_elem (cfg, args [1], arg0_type, index); - if (value < ecount) { for (int i = 0; i < esize; i++) { vec_cns [(index * esize) + i] = (guint8)((value * esize) + i); } } else { for (int i = 0; i < esize; i++) { - vec_cns [(index * esize) + i] = 0xFF; + vec_cns [(index * esize) + i] = (guint8)0xFF; } } } MonoType *op_etype = m_class_get_byval_arg (mono_defaults.byte_class); MonoClass *op_klass = create_class_instance ("System.Runtime.Intrinsics", "Vector128`1", op_etype); + MonoInst *new_args[2]; new_args [0] = emit_simd_ins (cfg, op_klass, OP_XCAST, args [0]->dreg, -1); new_args [1] = emit_xconst_v128 (cfg, op_klass, vec_cns); MonoInst *ins = emit_simd_ins_for_sig (cfg, op_klass, OP_XOP_OVR_X_X_X, INTRINS_AARCH64_ADV_SIMD_TBL1, 0, fsig, new_args); @@ -2751,6 +2736,7 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi int vsize = mono_class_value_size (klass, NULL); int esize = mono_class_value_size (mono_class_from_mono_type_internal (etype), NULL); int ecount = vsize / esize; + g_assert ((ecount == 2) || (ecount == 4) || (ecount == 8) || (ecount == 16)); guint8 control = 0; gboolean needs_zero = false; guint64 value = 0; @@ -2786,7 +2772,7 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi // set, but we use 0xFF instead since that will be the equivalent of AllBitsSet for (int i = 0; i < esize; i++) { - vec_cns [(index * esize) + i] = 0xFF; + vec_cns [(index * esize) + i] = (guint8)0xFF; } } } From fe451d7b3e13bcdcbac8263b2634b33496b7a92f Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Wed, 24 Jul 2024 10:19:02 -0700 Subject: [PATCH 12/15] Ensure we get the right element type --- src/mono/mono/mini/simd-intrinsics.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index e19e01339de34..1b83a909858b8 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -2683,8 +2683,8 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi return emit_simd_ins_for_unary_op (cfg, klass, fsig, args, arg0_type, id); } case SN_Shuffle: { - MonoType *etype = fsig->params [0]; - if (!is_element_type_primitive (etype)) + MonoType *etype = get_vector_t_elem_type (fsig->ret); + if (!MONO_TYPE_IS_VECTOR_PRIMITIVE (etype)) return NULL; #ifdef TARGET_WASM return emit_simd_ins_for_sig (cfg, klass, OP_WASM_SIMD_SWIZZLE, -1, -1, fsig, args); From 08aa47757c27241cbb9160f6f691730bc82788f6 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Wed, 24 Jul 2024 10:44:13 -0700 Subject: [PATCH 13/15] Ensure we don't create nodes unnecessarily for create_elementwise --- src/mono/mono/mini/simd-intrinsics.c | 167 ++++++++++++++++++++++++++- 1 file changed, 161 insertions(+), 6 deletions(-) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 1b83a909858b8..4814dfb135f49 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -1338,9 +1338,103 @@ static MonoInst * emit_vector_create_elementwise ( MonoCompile *cfg, MonoClass *vklass, MonoType *etype, MonoInst **args, int param_count) { - MonoInst *ins = emit_xzero (cfg, vklass); - for (int i = 0; i < param_count; ++i) - ins = emit_vector_insert_element (cfg, vklass, ins, etype->type, args[i], i, TRUE); + // We want to handle constant inputs and create constant nodes so other import + // optimizations can be enabled. This includes recognizing partial constants + // and only performing the minimal number of inserts required + + gboolean all_const = true; + gboolean some_const = false; + + guint8 cns_vec[16]; + memset (cns_vec, 0x00, 16); + + int vector_size = mono_class_value_size (vklass, NULL); + if (vector_size == 16) { + for (int i = 0; i < param_count; ++i) { + if (!is_const (args[i])) { + all_const = false; + break; + } + + some_const = true; + + if (type_enum_is_float (etype->type)) { + double cns_val; + if (args[i]->opcode == OP_R4CONST) { + cns_val = *(const float*)(args[i]->inst_p0); + } else { + g_assert (args[i]->opcode == OP_R8CONST); + cns_val = *(const double*)(args[i]->inst_p0); + } + + switch (etype->type) { + case MONO_TYPE_R4: { + ((float*)cns_vec) [i] = (float)cns_val; + break; + } + case MONO_TYPE_R8: { + ((double*)cns_vec) [i] = (double)cns_val; + break; + } + default: { + g_assert_not_reached (); + } + } + } else { + gint64 cns_val; + if (args[i]->opcode == OP_ICONST) { + cns_val = GTMREG_TO_INT (args[i]->inst_c0); + } else { + g_assert (args[i]->opcode == OP_I8CONST); + cns_val = args[i]->inst_l; + } + + switch (etype->type) { + case MONO_TYPE_I1: + case MONO_TYPE_U1: { + ((guint8*)cns_vec) [i] = (guint8)cns_val; + break; + } + case MONO_TYPE_I2: + case MONO_TYPE_U2: { + ((guint16*)cns_vec) [i] = (guint16)cns_val; + break; + } + case MONO_TYPE_I4: + case MONO_TYPE_U4: { + ((guint32*)cns_vec) [i] = (guint32)cns_val; + break; + } + case MONO_TYPE_I8: + case MONO_TYPE_U8: { + ((guint64*)cns_vec) [i] = (guint64)cns_val; + break; + } + default: { + g_assert_not_reached (); + } + } + } + } + } + + if (all_const) { + return emit_xconst_v128 (cfg, vklass, (guint8*)cns_vec); + } + + MonoInst *ins; + + if (some_const) { + ins = emit_xconst_v128 (cfg, vklass, (guint8*)cns_vec); + } else { + ins = emit_xzero (cfg, vklass); + } + + for (int i = 0; i < param_count; ++i) { + if (!is_const (args[i])) { + ins = emit_vector_insert_element (cfg, vklass, ins, etype->type, args[i], i, TRUE); + } + } return ins; } @@ -1357,11 +1451,72 @@ emit_vector_create_scalar ( if (is_unsafe) { return emit_vector_create_broadcast (cfg, vklass, etype, arg0); } - MonoInst *ins = emit_xzero (cfg, vklass); - ins = emit_vector_insert_element (cfg, vklass, ins, etype->type, arg0, 0, TRUE); - return ins; + + guint8 cns_vec[16]; + memset (cns_vec, 0x00, 16); + + if (type_enum_is_float (etype->type)) { + double cns_val; + if (arg0->opcode == OP_R4CONST) { + cns_val = *(const float*)(arg0->inst_p0); + } else { + g_assert (arg0->opcode == OP_R8CONST); + cns_val = *(const double*)(arg0->inst_p0); + } + + switch (etype->type) { + case MONO_TYPE_R4: { + ((float*)cns_vec) [0] = (float)cns_val; + break; + } + case MONO_TYPE_R8: { + ((double*)cns_vec) [0] = (double)cns_val; + break; + } + default: { + g_assert_not_reached (); + } + } + } else { + gint64 cns_val; + if (arg0->opcode == OP_ICONST) { + cns_val = GTMREG_TO_INT (arg0->inst_c0); + } else { + g_assert (arg0->opcode == OP_I8CONST); + cns_val = arg0->inst_l; + +} + switch (etype->type) { + case MONO_TYPE_I1: + case MONO_TYPE_U1: { + ((guint8*)cns_vec) [0] = (guint8)cns_val; + break; + } + case MONO_TYPE_I2: + case MONO_TYPE_U2: { + ((guint16*)cns_vec) [0] = (guint16)cns_val; + break; + } + case MONO_TYPE_I4: + case MONO_TYPE_U4: { + ((guint32*)cns_vec) [0] = (guint32)cns_val; + break; + } + case MONO_TYPE_I8: + case MONO_TYPE_U8: { + ((guint64*)cns_vec) [0] = (guint64)cns_val; + break; + } + default: { + g_assert_not_reached (); + } + } + } + + return emit_xconst_v128 (cfg, vklass, (guint8*)cns_vec); } } + int opcode = 0; if (COMPILE_LLVM (cfg)) { opcode = is_unsafe ? OP_CREATE_SCALAR_UNSAFE : OP_CREATE_SCALAR; From 4a491729f4fefc8c042e4fc6a6092cb572457ac6 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Wed, 24 Jul 2024 10:48:40 -0700 Subject: [PATCH 14/15] Ensure that create_elementwise still works for other vector sizes --- src/mono/mono/mini/simd-intrinsics.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 4814dfb135f49..f72bd3608b609 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -1431,7 +1431,7 @@ emit_vector_create_elementwise ( } for (int i = 0; i < param_count; ++i) { - if (!is_const (args[i])) { + if (!is_const (args[i]) || (vector_size != 16)) { ins = emit_vector_insert_element (cfg, vklass, ins, etype->type, args[i], i, TRUE); } } From 5a8a22199cc8e5294beb0069d290f978ef328437 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Thu, 25 Jul 2024 07:57:48 -0700 Subject: [PATCH 15/15] Ensure indentation of switch cases is correct for Mono --- src/mono/mono/mini/simd-intrinsics.c | 358 +++++++++++++-------------- 1 file changed, 179 insertions(+), 179 deletions(-) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index f72bd3608b609..54588a2f2dfdf 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -686,43 +686,43 @@ get_xconst_int_elem (MonoCompile *cfg, MonoInst *ins, MonoTypeEnum etype, int in } g_assert (index >= 0); switch (etype) { - case MONO_TYPE_I1: { - g_assert (index < 16); - return ((gint8*)cns_vec) [index]; - } - case MONO_TYPE_U1: { - g_assert (index < 16); - return ((guint8*)cns_vec) [index]; - } - case MONO_TYPE_I2: { - g_assert (index < 8); - return ((gint16*)cns_vec) [index]; - } - case MONO_TYPE_U2: { - g_assert (index < 8); - return ((guint16*)cns_vec) [index]; - } - case MONO_TYPE_I4: - case MONO_TYPE_R4: { - g_assert (index < 4); - return ((gint32*)cns_vec) [index]; - } - case MONO_TYPE_U4: { - g_assert (index < 4); - return ((guint32*)cns_vec) [index]; - } - case MONO_TYPE_I8: - case MONO_TYPE_R8: { - g_assert (index < 2); - return ((gint64*)cns_vec) [index]; - } - case MONO_TYPE_U8: { - g_assert (index < 2); - return ((guint64*)cns_vec) [index]; - } - default: { - g_assert_not_reached (); - } + case MONO_TYPE_I1: { + g_assert (index < 16); + return ((gint8*)cns_vec) [index]; + } + case MONO_TYPE_U1: { + g_assert (index < 16); + return ((guint8*)cns_vec) [index]; + } + case MONO_TYPE_I2: { + g_assert (index < 8); + return ((gint16*)cns_vec) [index]; + } + case MONO_TYPE_U2: { + g_assert (index < 8); + return ((guint16*)cns_vec) [index]; + } + case MONO_TYPE_I4: + case MONO_TYPE_R4: { + g_assert (index < 4); + return ((gint32*)cns_vec) [index]; + } + case MONO_TYPE_U4: { + g_assert (index < 4); + return ((guint32*)cns_vec) [index]; + } + case MONO_TYPE_I8: + case MONO_TYPE_R8: { + g_assert (index < 2); + return ((gint64*)cns_vec) [index]; + } + case MONO_TYPE_U8: { + g_assert (index < 2); + return ((guint64*)cns_vec) [index]; + } + default: { + g_assert_not_reached (); + } } } @@ -1181,17 +1181,17 @@ emit_vector_insert_element ( cns_val = *(const double*)(element->inst_p0); } switch (type) { - case MONO_TYPE_R4: { - ((float*)cns_vec) [index] = (float)cns_val; - break; - } - case MONO_TYPE_R8: { - ((double*)cns_vec) [index] = (double)cns_val; - break; - } - default: { - g_assert_not_reached (); - } + case MONO_TYPE_R4: { + ((float*)cns_vec) [index] = (float)cns_val; + break; + } + case MONO_TYPE_R8: { + ((double*)cns_vec) [index] = (double)cns_val; + break; + } + default: { + g_assert_not_reached (); + } } } else { gint64 cns_val; @@ -1202,29 +1202,29 @@ emit_vector_insert_element ( cns_val = element->inst_l; } switch (type) { - case MONO_TYPE_I1: - case MONO_TYPE_U1: { - ((guint8*)cns_vec) [index] = (guint8)cns_val; - break; - } - case MONO_TYPE_I2: - case MONO_TYPE_U2: { - ((guint16*)cns_vec) [index] = (guint16)cns_val; - break; - } - case MONO_TYPE_I4: - case MONO_TYPE_U4: { - ((guint32*)cns_vec) [index] = (guint32)cns_val; - break; - } - case MONO_TYPE_I8: - case MONO_TYPE_U8: { - ((guint64*)cns_vec) [index] = (guint64)cns_val; - break; - } - default: { - g_assert_not_reached (); - } + case MONO_TYPE_I1: + case MONO_TYPE_U1: { + ((guint8*)cns_vec) [index] = (guint8)cns_val; + break; + } + case MONO_TYPE_I2: + case MONO_TYPE_U2: { + ((guint16*)cns_vec) [index] = (guint16)cns_val; + break; + } + case MONO_TYPE_I4: + case MONO_TYPE_U4: { + ((guint32*)cns_vec) [index] = (guint32)cns_val; + break; + } + case MONO_TYPE_I8: + case MONO_TYPE_U8: { + ((guint64*)cns_vec) [index] = (guint64)cns_val; + break; + } + default: { + g_assert_not_reached (); + } } } return emit_xconst_v128 (cfg, vklass, cns_vec); @@ -1268,21 +1268,21 @@ emit_vector_create_broadcast ( cns_val = *(const double*)(arg0->inst_p0); } switch (etype->type) { - case MONO_TYPE_R4: { - for (int i = 0; i < vector_size / 4; i++) { - ((float*)cns_vec) [i] = (float)cns_val; - } - break; - } - case MONO_TYPE_R8: { - for (int i = 0; i < vector_size / 8; i++) { - ((double*)cns_vec) [i] = (double)cns_val; - } - break; + case MONO_TYPE_R4: { + for (int i = 0; i < vector_size / 4; i++) { + ((float*)cns_vec) [i] = (float)cns_val; } - default: { - g_assert_not_reached (); + break; + } + case MONO_TYPE_R8: { + for (int i = 0; i < vector_size / 8; i++) { + ((double*)cns_vec) [i] = (double)cns_val; } + break; + } + default: { + g_assert_not_reached (); + } } } else { gint64 cns_val; @@ -1293,37 +1293,37 @@ emit_vector_create_broadcast ( cns_val = arg0->inst_l; } switch (etype->type) { - case MONO_TYPE_I1: - case MONO_TYPE_U1: { - for (int i = 0; i < vector_size / 1; i++) { - ((guint8*)cns_vec) [i] = (guint8)cns_val; - } - break; - } - case MONO_TYPE_I2: - case MONO_TYPE_U2: { - for (int i = 0; i < vector_size / 2; i++) { - ((guint16*)cns_vec) [i] = (guint16)cns_val; - } - break; + case MONO_TYPE_I1: + case MONO_TYPE_U1: { + for (int i = 0; i < vector_size / 1; i++) { + ((guint8*)cns_vec) [i] = (guint8)cns_val; } - case MONO_TYPE_I4: - case MONO_TYPE_U4: { - for (int i = 0; i < vector_size / 4; i++) { - ((guint32*)cns_vec) [i] = (guint32)cns_val; - } - break; + break; + } + case MONO_TYPE_I2: + case MONO_TYPE_U2: { + for (int i = 0; i < vector_size / 2; i++) { + ((guint16*)cns_vec) [i] = (guint16)cns_val; } - case MONO_TYPE_I8: - case MONO_TYPE_U8: { - for (int i = 0; i < vector_size / 8; i++) { - ((guint64*)cns_vec) [i] = (guint64)cns_val; - } - break; + break; + } + case MONO_TYPE_I4: + case MONO_TYPE_U4: { + for (int i = 0; i < vector_size / 4; i++) { + ((guint32*)cns_vec) [i] = (guint32)cns_val; } - default: { - g_assert_not_reached (); + break; + } + case MONO_TYPE_I8: + case MONO_TYPE_U8: { + for (int i = 0; i < vector_size / 8; i++) { + ((guint64*)cns_vec) [i] = (guint64)cns_val; } + break; + } + default: { + g_assert_not_reached (); + } } } return emit_xconst_v128 (cfg, vklass, (guint8*)cns_vec); @@ -1368,17 +1368,17 @@ emit_vector_create_elementwise ( } switch (etype->type) { - case MONO_TYPE_R4: { - ((float*)cns_vec) [i] = (float)cns_val; - break; - } - case MONO_TYPE_R8: { - ((double*)cns_vec) [i] = (double)cns_val; - break; - } - default: { - g_assert_not_reached (); - } + case MONO_TYPE_R4: { + ((float*)cns_vec) [i] = (float)cns_val; + break; + } + case MONO_TYPE_R8: { + ((double*)cns_vec) [i] = (double)cns_val; + break; + } + default: { + g_assert_not_reached (); + } } } else { gint64 cns_val; @@ -1390,29 +1390,29 @@ emit_vector_create_elementwise ( } switch (etype->type) { - case MONO_TYPE_I1: - case MONO_TYPE_U1: { - ((guint8*)cns_vec) [i] = (guint8)cns_val; - break; - } - case MONO_TYPE_I2: - case MONO_TYPE_U2: { - ((guint16*)cns_vec) [i] = (guint16)cns_val; - break; - } - case MONO_TYPE_I4: - case MONO_TYPE_U4: { - ((guint32*)cns_vec) [i] = (guint32)cns_val; - break; - } - case MONO_TYPE_I8: - case MONO_TYPE_U8: { - ((guint64*)cns_vec) [i] = (guint64)cns_val; - break; - } - default: { - g_assert_not_reached (); - } + case MONO_TYPE_I1: + case MONO_TYPE_U1: { + ((guint8*)cns_vec) [i] = (guint8)cns_val; + break; + } + case MONO_TYPE_I2: + case MONO_TYPE_U2: { + ((guint16*)cns_vec) [i] = (guint16)cns_val; + break; + } + case MONO_TYPE_I4: + case MONO_TYPE_U4: { + ((guint32*)cns_vec) [i] = (guint32)cns_val; + break; + } + case MONO_TYPE_I8: + case MONO_TYPE_U8: { + ((guint64*)cns_vec) [i] = (guint64)cns_val; + break; + } + default: { + g_assert_not_reached (); + } } } } @@ -1465,17 +1465,17 @@ emit_vector_create_scalar ( } switch (etype->type) { - case MONO_TYPE_R4: { - ((float*)cns_vec) [0] = (float)cns_val; - break; - } - case MONO_TYPE_R8: { - ((double*)cns_vec) [0] = (double)cns_val; - break; - } - default: { - g_assert_not_reached (); - } + case MONO_TYPE_R4: { + ((float*)cns_vec) [0] = (float)cns_val; + break; + } + case MONO_TYPE_R8: { + ((double*)cns_vec) [0] = (double)cns_val; + break; + } + default: { + g_assert_not_reached (); + } } } else { gint64 cns_val; @@ -1487,29 +1487,29 @@ emit_vector_create_scalar ( } switch (etype->type) { - case MONO_TYPE_I1: - case MONO_TYPE_U1: { - ((guint8*)cns_vec) [0] = (guint8)cns_val; - break; - } - case MONO_TYPE_I2: - case MONO_TYPE_U2: { - ((guint16*)cns_vec) [0] = (guint16)cns_val; - break; - } - case MONO_TYPE_I4: - case MONO_TYPE_U4: { - ((guint32*)cns_vec) [0] = (guint32)cns_val; - break; - } - case MONO_TYPE_I8: - case MONO_TYPE_U8: { - ((guint64*)cns_vec) [0] = (guint64)cns_val; - break; - } - default: { - g_assert_not_reached (); - } + case MONO_TYPE_I1: + case MONO_TYPE_U1: { + ((guint8*)cns_vec) [0] = (guint8)cns_val; + break; + } + case MONO_TYPE_I2: + case MONO_TYPE_U2: { + ((guint16*)cns_vec) [0] = (guint16)cns_val; + break; + } + case MONO_TYPE_I4: + case MONO_TYPE_U4: { + ((guint32*)cns_vec) [0] = (guint32)cns_val; + break; + } + case MONO_TYPE_I8: + case MONO_TYPE_U8: { + ((guint64*)cns_vec) [0] = (guint64)cns_val; + break; + } + default: { + g_assert_not_reached (); + } } }