From c439344eb7f55de81b685fbcf44bc81de05b833f Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Mon, 9 Jan 2023 17:27:25 +0200 Subject: [PATCH 1/9] [mono][interp] Move defines from transform.c --- src/mono/mono/mini/interp/mintops.h | 115 ++++++++++++++++++++++++++ src/mono/mono/mini/interp/transform.c | 115 -------------------------- 2 files changed, 115 insertions(+), 115 deletions(-) diff --git a/src/mono/mono/mini/interp/mintops.h b/src/mono/mono/mini/interp/mintops.h index 411d0a0091e7d..b5afbc18a9286 100644 --- a/src/mono/mono/mini/interp/mintops.h +++ b/src/mono/mono/mini/interp/mintops.h @@ -57,6 +57,121 @@ typedef enum { #define READ64(x) (*(guint64 *)(x)) #endif +#if SIZEOF_VOID_P == 8 +#define MINT_NEG_P MINT_NEG_I8 +#define MINT_NOT_P MINT_NOT_I8 + +#define MINT_NEG_FP MINT_NEG_R8 + +#define MINT_ADD_P MINT_ADD_I8 +#define MINT_ADD_P_IMM MINT_ADD_I8_IMM +#define MINT_SUB_P MINT_SUB_I8 +#define MINT_MUL_P MINT_MUL_I8 +#define MINT_DIV_P MINT_DIV_I8 +#define MINT_DIV_UN_P MINT_DIV_UN_I8 +#define MINT_REM_P MINT_REM_I8 +#define MINT_REM_UN_P MINT_REM_UN_I8 +#define MINT_AND_P MINT_AND_I8 +#define MINT_OR_P MINT_OR_I8 +#define MINT_XOR_P MINT_XOR_I8 +#define MINT_SHL_P MINT_SHL_I8 +#define MINT_SHR_P MINT_SHR_I8 +#define MINT_SHR_UN_P MINT_SHR_UN_I8 + +#define MINT_CEQ_P MINT_CEQ_I8 +#define MINT_CNE_P MINT_CNE_I8 +#define MINT_CLT_P MINT_CLT_I8 +#define MINT_CLT_UN_P MINT_CLT_UN_I8 +#define MINT_CGT_P MINT_CGT_I8 +#define MINT_CGT_UN_P MINT_CGT_UN_I8 +#define MINT_CLE_P MINT_CLE_I8 +#define MINT_CLE_UN_P MINT_CLE_UN_I8 +#define MINT_CGE_P MINT_CGE_I8 +#define MINT_CGE_UN_P MINT_CGE_UN_I8 + +#define MINT_ADD_FP MINT_ADD_R8 +#define MINT_SUB_FP MINT_SUB_R8 +#define MINT_MUL_FP MINT_MUL_R8 +#define MINT_DIV_FP MINT_DIV_R8 +#define MINT_REM_FP MINT_REM_R8 + +#define MINT_CNE_FP MINT_CNE_R8 +#define MINT_CEQ_FP MINT_CEQ_R8 +#define MINT_CGT_FP MINT_CGT_R8 +#define MINT_CGE_FP MINT_CGE_R8 +#define MINT_CLT_FP MINT_CLT_R8 +#define MINT_CLE_FP MINT_CLE_R8 + 
+#define MINT_CONV_OVF_U4_P MINT_CONV_OVF_U4_I8 +#else + +#define MINT_NEG_P MINT_NEG_I4 +#define MINT_NOT_P MINT_NOT_I4 + +#define MINT_NEG_FP MINT_NEG_R4 + +#define MINT_ADD_P MINT_ADD_I4 +#define MINT_ADD_P_IMM MINT_ADD_I4_IMM +#define MINT_SUB_P MINT_SUB_I4 +#define MINT_MUL_P MINT_MUL_I4 +#define MINT_DIV_P MINT_DIV_I4 +#define MINT_DIV_UN_P MINT_DIV_UN_I4 +#define MINT_REM_P MINT_REM_I4 +#define MINT_REM_UN_P MINT_REM_UN_I4 +#define MINT_AND_P MINT_AND_I4 +#define MINT_OR_P MINT_OR_I4 +#define MINT_XOR_P MINT_XOR_I4 +#define MINT_SHL_P MINT_SHL_I4 +#define MINT_SHR_P MINT_SHR_I4 +#define MINT_SHR_UN_P MINT_SHR_UN_I4 + +#define MINT_CEQ_P MINT_CEQ_I4 +#define MINT_CNE_P MINT_CNE_I4 +#define MINT_CLT_P MINT_CLT_I4 +#define MINT_CLT_UN_P MINT_CLT_UN_I4 +#define MINT_CGT_P MINT_CGT_I4 +#define MINT_CGT_UN_P MINT_CGT_UN_I4 +#define MINT_CLE_P MINT_CLE_I4 +#define MINT_CLE_UN_P MINT_CLE_UN_I4 +#define MINT_CGE_P MINT_CGE_I4 +#define MINT_CGE_UN_P MINT_CGE_UN_I4 + +#define MINT_ADD_FP MINT_ADD_R4 +#define MINT_SUB_FP MINT_SUB_R4 +#define MINT_MUL_FP MINT_MUL_R4 +#define MINT_DIV_FP MINT_DIV_R4 +#define MINT_REM_FP MINT_REM_R4 + +#define MINT_CNE_FP MINT_CNE_R4 +#define MINT_CEQ_FP MINT_CEQ_R4 +#define MINT_CGT_FP MINT_CGT_R4 +#define MINT_CGE_FP MINT_CGE_R4 +#define MINT_CLT_FP MINT_CLT_R4 +#define MINT_CLE_FP MINT_CLE_R4 + +#define MINT_CONV_OVF_U4_P MINT_CONV_OVF_U4_I4 +#endif + +#if SIZEOF_VOID_P == 8 +#define MINT_MOV_P MINT_MOV_8 +#define MINT_LDNULL MINT_LDC_I8_0 +#define MINT_LDIND_I MINT_LDIND_I8 +#define MINT_STIND_I MINT_STIND_I8 +#define MINT_LDELEM_I MINT_LDELEM_I8 +#define MINT_STELEM_I MINT_STELEM_I8 +#define MINT_MUL_P_IMM MINT_MUL_I8_IMM +#define MINT_ADD_MUL_P_IMM MINT_ADD_MUL_I8_IMM +#else +#define MINT_MOV_P MINT_MOV_4 +#define MINT_LDNULL MINT_LDC_I4_0 +#define MINT_LDIND_I MINT_LDIND_I4 +#define MINT_STIND_I MINT_STIND_I4 +#define MINT_LDELEM_I MINT_LDELEM_I4 +#define MINT_STELEM_I MINT_STELEM_I4 +#define MINT_MUL_P_IMM MINT_MUL_I4_IMM +#define 
MINT_ADD_MUL_P_IMM MINT_ADD_MUL_I4_IMM +#endif + #define MINT_SWITCH_LEN(n) (4 + (n) * 2) #define MINT_IS_NOP(op) ((op) == MINT_NOP || (op) == MINT_DEF || (op) == MINT_DUMMY_USE || (op) == MINT_IL_SEQ_POINT) diff --git a/src/mono/mono/mini/interp/transform.c b/src/mono/mono/mini/interp/transform.c index b1b819bf84941..7bf1024e1c961 100644 --- a/src/mono/mono/mini/interp/transform.c +++ b/src/mono/mono/mini/interp/transform.c @@ -43,121 +43,6 @@ MonoInterpStats mono_interp_stats; #define DEBUG 0 -#if SIZEOF_VOID_P == 8 -#define MINT_NEG_P MINT_NEG_I8 -#define MINT_NOT_P MINT_NOT_I8 - -#define MINT_NEG_FP MINT_NEG_R8 - -#define MINT_ADD_P MINT_ADD_I8 -#define MINT_ADD_P_IMM MINT_ADD_I8_IMM -#define MINT_SUB_P MINT_SUB_I8 -#define MINT_MUL_P MINT_MUL_I8 -#define MINT_DIV_P MINT_DIV_I8 -#define MINT_DIV_UN_P MINT_DIV_UN_I8 -#define MINT_REM_P MINT_REM_I8 -#define MINT_REM_UN_P MINT_REM_UN_I8 -#define MINT_AND_P MINT_AND_I8 -#define MINT_OR_P MINT_OR_I8 -#define MINT_XOR_P MINT_XOR_I8 -#define MINT_SHL_P MINT_SHL_I8 -#define MINT_SHR_P MINT_SHR_I8 -#define MINT_SHR_UN_P MINT_SHR_UN_I8 - -#define MINT_CEQ_P MINT_CEQ_I8 -#define MINT_CNE_P MINT_CNE_I8 -#define MINT_CLT_P MINT_CLT_I8 -#define MINT_CLT_UN_P MINT_CLT_UN_I8 -#define MINT_CGT_P MINT_CGT_I8 -#define MINT_CGT_UN_P MINT_CGT_UN_I8 -#define MINT_CLE_P MINT_CLE_I8 -#define MINT_CLE_UN_P MINT_CLE_UN_I8 -#define MINT_CGE_P MINT_CGE_I8 -#define MINT_CGE_UN_P MINT_CGE_UN_I8 - -#define MINT_ADD_FP MINT_ADD_R8 -#define MINT_SUB_FP MINT_SUB_R8 -#define MINT_MUL_FP MINT_MUL_R8 -#define MINT_DIV_FP MINT_DIV_R8 -#define MINT_REM_FP MINT_REM_R8 - -#define MINT_CNE_FP MINT_CNE_R8 -#define MINT_CEQ_FP MINT_CEQ_R8 -#define MINT_CGT_FP MINT_CGT_R8 -#define MINT_CGE_FP MINT_CGE_R8 -#define MINT_CLT_FP MINT_CLT_R8 -#define MINT_CLE_FP MINT_CLE_R8 - -#define MINT_CONV_OVF_U4_P MINT_CONV_OVF_U4_I8 -#else - -#define MINT_NEG_P MINT_NEG_I4 -#define MINT_NOT_P MINT_NOT_I4 - -#define MINT_NEG_FP MINT_NEG_R4 - -#define MINT_ADD_P 
MINT_ADD_I4 -#define MINT_ADD_P_IMM MINT_ADD_I4_IMM -#define MINT_SUB_P MINT_SUB_I4 -#define MINT_MUL_P MINT_MUL_I4 -#define MINT_DIV_P MINT_DIV_I4 -#define MINT_DIV_UN_P MINT_DIV_UN_I4 -#define MINT_REM_P MINT_REM_I4 -#define MINT_REM_UN_P MINT_REM_UN_I4 -#define MINT_AND_P MINT_AND_I4 -#define MINT_OR_P MINT_OR_I4 -#define MINT_XOR_P MINT_XOR_I4 -#define MINT_SHL_P MINT_SHL_I4 -#define MINT_SHR_P MINT_SHR_I4 -#define MINT_SHR_UN_P MINT_SHR_UN_I4 - -#define MINT_CEQ_P MINT_CEQ_I4 -#define MINT_CNE_P MINT_CNE_I4 -#define MINT_CLT_P MINT_CLT_I4 -#define MINT_CLT_UN_P MINT_CLT_UN_I4 -#define MINT_CGT_P MINT_CGT_I4 -#define MINT_CGT_UN_P MINT_CGT_UN_I4 -#define MINT_CLE_P MINT_CLE_I4 -#define MINT_CLE_UN_P MINT_CLE_UN_I4 -#define MINT_CGE_P MINT_CGE_I4 -#define MINT_CGE_UN_P MINT_CGE_UN_I4 - -#define MINT_ADD_FP MINT_ADD_R4 -#define MINT_SUB_FP MINT_SUB_R4 -#define MINT_MUL_FP MINT_MUL_R4 -#define MINT_DIV_FP MINT_DIV_R4 -#define MINT_REM_FP MINT_REM_R4 - -#define MINT_CNE_FP MINT_CNE_R4 -#define MINT_CEQ_FP MINT_CEQ_R4 -#define MINT_CGT_FP MINT_CGT_R4 -#define MINT_CGE_FP MINT_CGE_R4 -#define MINT_CLT_FP MINT_CLT_R4 -#define MINT_CLE_FP MINT_CLE_R4 - -#define MINT_CONV_OVF_U4_P MINT_CONV_OVF_U4_I4 -#endif - -#if SIZEOF_VOID_P == 8 -#define MINT_MOV_P MINT_MOV_8 -#define MINT_LDNULL MINT_LDC_I8_0 -#define MINT_LDIND_I MINT_LDIND_I8 -#define MINT_STIND_I MINT_STIND_I8 -#define MINT_LDELEM_I MINT_LDELEM_I8 -#define MINT_STELEM_I MINT_STELEM_I8 -#define MINT_MUL_P_IMM MINT_MUL_I8_IMM -#define MINT_ADD_MUL_P_IMM MINT_ADD_MUL_I8_IMM -#else -#define MINT_MOV_P MINT_MOV_4 -#define MINT_LDNULL MINT_LDC_I4_0 -#define MINT_LDIND_I MINT_LDIND_I4 -#define MINT_STIND_I MINT_STIND_I4 -#define MINT_LDELEM_I MINT_LDELEM_I4 -#define MINT_STELEM_I MINT_STELEM_I4 -#define MINT_MUL_P_IMM MINT_MUL_I4_IMM -#define MINT_ADD_MUL_P_IMM MINT_ADD_MUL_I4_IMM -#endif - static const char *stack_type_string [] = { "I4", "I8", "R4", "R8", "O ", "VT", "MP", "F " }; static int stack_type [] = { From 
d4eb2aa65b290fef3aed8725a1826561ba8b8351 Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Mon, 9 Jan 2023 18:05:55 +0200 Subject: [PATCH 2/9] [mono][interp] Move more defines from transform.c --- src/mono/mono/mini/interp/transform.c | 71 -------------------------- src/mono/mono/mini/interp/transform.h | 72 +++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 71 deletions(-) diff --git a/src/mono/mono/mini/interp/transform.c b/src/mono/mono/mini/interp/transform.c index 7bf1024e1c961..09878270c91ea 100644 --- a/src/mono/mono/mini/interp/transform.c +++ b/src/mono/mono/mini/interp/transform.c @@ -62,25 +62,6 @@ static GENERATE_TRY_GET_CLASS_WITH_CACHE (intrinsic_klass, "System.Runtime.Compi static gboolean generate_code (TransformData *td, MonoMethod *method, MonoMethodHeader *header, MonoGenericContext *generic_context, MonoError *error); -#define interp_ins_set_dreg(ins,dr) do { \ - ins->dreg = dr; \ -} while (0) - -#define interp_ins_set_sreg(ins,s1) do { \ - ins->sregs [0] = s1; \ -} while (0) - -#define interp_ins_set_sregs2(ins,s1,s2) do { \ - ins->sregs [0] = s1; \ - ins->sregs [1] = s2; \ -} while (0) - -#define interp_ins_set_sregs3(ins,s1,s2,s3) do { \ - ins->sregs [0] = s1; \ - ins->sregs [1] = s2; \ - ins->sregs [2] = s3; \ -} while (0) - static gboolean has_intrinsic_attribute (MonoMethod *method) { @@ -272,58 +253,6 @@ interp_last_ins (InterpBasicBlock *bb) } \ } while (0) -#if NO_UNALIGNED_ACCESS -#define WRITE32(ip, v) \ - do { \ - * (ip) = * (guint16 *)(v); \ - * ((ip) + 1) = * ((guint16 *)(v) + 1); \ - (ip) += 2; \ - } while (0) - -#define WRITE32_INS(ins, index, v) \ - do { \ - (ins)->data [index] = * (guint16 *)(v); \ - (ins)->data [index + 1] = * ((guint16 *)(v) + 1); \ - } while (0) - -#define WRITE64(ins, v) \ - do { \ - *((ins) + 0) = * ((guint16 *)(v) + 0); \ - *((ins) + 1) = * ((guint16 *)(v) + 1); \ - *((ins) + 2) = * ((guint16 *)(v) + 2); \ - *((ins) + 3) = * ((guint16 *)(v) + 3); \ - } while (0) - -#define WRITE64_INS(ins, 
index, v) \ - do { \ - (ins)->data [index] = * (guint16 *)(v); \ - (ins)->data [index + 1] = * ((guint16 *)(v) + 1); \ - (ins)->data [index + 2] = * ((guint16 *)(v) + 2); \ - (ins)->data [index + 3] = * ((guint16 *)(v) + 3); \ - } while (0) -#else -#define WRITE32(ip, v) \ - do { \ - * (guint32*)(ip) = * (guint32 *)(v); \ - (ip) += 2; \ - } while (0) -#define WRITE32_INS(ins, index, v) \ - do { \ - * (guint32 *)(&(ins)->data [index]) = * (guint32 *)(v); \ - } while (0) - -#define WRITE64(ip, v) \ - do { \ - * (guint64*)(ip) = * (guint64 *)(v); \ - (ip) += 4; \ - } while (0) -#define WRITE64_INS(ins, index, v) \ - do { \ - * (guint64 *)(&(ins)->data [index]) = * (guint64 *)(v); \ - } while (0) - -#endif - static void realloc_stack (TransformData *td) { diff --git a/src/mono/mono/mini/interp/transform.h b/src/mono/mono/mini/interp/transform.h index c385429a74875..eaa05a137865f 100644 --- a/src/mono/mono/mini/interp/transform.h +++ b/src/mono/mono/mini/interp/transform.h @@ -282,6 +282,78 @@ typedef struct #define STACK_TYPE_I STACK_TYPE_I4 #endif + +#define interp_ins_set_dreg(ins,dr) do { \ + ins->dreg = dr; \ +} while (0) + +#define interp_ins_set_sreg(ins,s1) do { \ + ins->sregs [0] = s1; \ +} while (0) + +#define interp_ins_set_sregs2(ins,s1,s2) do { \ + ins->sregs [0] = s1; \ + ins->sregs [1] = s2; \ +} while (0) + +#define interp_ins_set_sregs3(ins,s1,s2,s3) do { \ + ins->sregs [0] = s1; \ + ins->sregs [1] = s2; \ + ins->sregs [2] = s3; \ +} while (0) + +#if NO_UNALIGNED_ACCESS +#define WRITE32(ip, v) \ + do { \ + * (ip) = * (guint16 *)(v); \ + * ((ip) + 1) = * ((guint16 *)(v) + 1); \ + (ip) += 2; \ + } while (0) + +#define WRITE32_INS(ins, index, v) \ + do { \ + (ins)->data [index] = * (guint16 *)(v); \ + (ins)->data [index + 1] = * ((guint16 *)(v) + 1); \ + } while (0) + +#define WRITE64(ins, v) \ + do { \ + *((ins) + 0) = * ((guint16 *)(v) + 0); \ + *((ins) + 1) = * ((guint16 *)(v) + 1); \ + *((ins) + 2) = * ((guint16 *)(v) + 2); \ + *((ins) + 3) = * 
((guint16 *)(v) + 3); \ + } while (0) + +#define WRITE64_INS(ins, index, v) \ + do { \ + (ins)->data [index] = * (guint16 *)(v); \ + (ins)->data [index + 1] = * ((guint16 *)(v) + 1); \ + (ins)->data [index + 2] = * ((guint16 *)(v) + 2); \ + (ins)->data [index + 3] = * ((guint16 *)(v) + 3); \ + } while (0) +#else +#define WRITE32(ip, v) \ + do { \ + * (guint32 *)(ip) = * (guint32 *)(v); \ + (ip) += 2; \ + } while (0) +#define WRITE32_INS(ins, index, v) \ + do { \ + * (guint32 *)(&(ins)->data [index]) = * (guint32 *)(v); \ + } while (0) + +#define WRITE64(ip, v) \ + do { \ + * (guint64 *)(ip) = * (guint64 *)(v); \ + (ip) += 4; \ + } while (0) +#define WRITE64_INS(ins, index, v) \ + do { \ + * (guint64 *)(&(ins)->data [index]) = * (guint64 *)(v); \ + } while (0) + +#endif + /* test exports for white box testing */ void mono_test_interp_cprop (TransformData *td); From 7e4a3858209398ff0d1b2dd103e12eab5556e527 Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Fri, 2 Dec 2022 19:59:09 +0200 Subject: [PATCH 3/9] [mono][interp] Add intrinsics for most common V128 operations We add intrinsics for the Vector128 methods that are actively used within our BCL. We declare a set of simd method names, the same way we do it with jit, in `simd-methods.def`. In `transform-simd.c` we look up method names in the list of supported intrinsics for `Vector128` and `Vector128<T>`. Once we find a supported intrinsic, we generate code for it, typically a `MINT_SIMD_INTRINS_*` opcode. In order to avoid adding too many new opcodes to the interpreter, simd intrinsics are grouped by signature. So all simd intrinsics that receive a single argument and return a value will be called through `MINT_SIMD_INTRINS_P_P`. This instruction receives an index that selects the intrinsic implementation and calls it indirectly. Some of the intrinsics are implemented using the standard vector intrinsics, supported by gcc and clang.
These do not fully expose the SIMD capabilities, so some intrinsics are implemented naively. This should still be faster than using a non-vectorized approach from managed code. In the future we can add better implementations on platforms where we have low-level support. This would both be faster and reduce code size. --- src/mono/mono/mini/CMakeLists.txt | 1 + src/mono/mono/mini/interp/interp-internals.h | 1 + .../mono/mini/interp/interp-simd-intrins.def | 88 +++ src/mono/mono/mini/interp/interp-simd.c | 597 ++++++++++++++++++ src/mono/mono/mini/interp/interp-simd.h | 16 + src/mono/mono/mini/interp/interp.c | 69 ++ src/mono/mono/mini/interp/mintops.def | 10 + src/mono/mono/mini/interp/mintops.h | 33 + src/mono/mono/mini/interp/simd-methods.def | 39 ++ src/mono/mono/mini/interp/transform-simd.c | 474 ++++++++++++++ src/mono/mono/mini/interp/transform.c | 5 + src/mono/mono/mini/interp/transform.h | 4 + 12 files changed, 1337 insertions(+) create mode 100644 src/mono/mono/mini/interp/interp-simd-intrins.def create mode 100644 src/mono/mono/mini/interp/interp-simd.c create mode 100644 src/mono/mono/mini/interp/interp-simd.h create mode 100644 src/mono/mono/mini/interp/simd-methods.def create mode 100644 src/mono/mono/mini/interp/transform-simd.c diff --git a/src/mono/mono/mini/CMakeLists.txt b/src/mono/mono/mini/CMakeLists.txt index ef4e755161860..1cb0895431d4a 100644 --- a/src/mono/mono/mini/CMakeLists.txt +++ b/src/mono/mono/mini/CMakeLists.txt @@ -278,6 +278,7 @@ set(interp_sources interp/interp.h interp/interp-internals.h interp/interp.c + interp/interp-simd.c interp/interp-intrins.h interp/interp-intrins.c interp/mintops.h diff --git a/src/mono/mono/mini/interp/interp-internals.h b/src/mono/mono/mini/interp/interp-internals.h index 920a6f3b38899..cecefced4be3f 100644 --- a/src/mono/mono/mini/interp/interp-internals.h +++ b/src/mono/mono/mini/interp/interp-internals.h @@ -30,6 +30,7 @@ // This alignment provides us with straight forward support for Vector128 #define
MINT_STACK_ALIGNMENT (2 * MINT_STACK_SLOT_SIZE) #define MINT_SIMD_ALIGNMENT (MINT_STACK_ALIGNMENT) +#define SIZEOF_V128 16 #define INTERP_STACK_SIZE (1024*1024) #define INTERP_REDZONE_SIZE (8*1024) diff --git a/src/mono/mono/mini/interp/interp-simd-intrins.def b/src/mono/mono/mini/interp/interp-simd-intrins.def new file mode 100644 index 0000000000000..3e933168662f0 --- /dev/null +++ b/src/mono/mono/mini/interp/interp-simd-intrins.def @@ -0,0 +1,88 @@ +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_ADD, interp_v128_i1_op_addition) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_ADD, interp_v128_i2_op_addition) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_ADD, interp_v128_i4_op_addition) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_SUB, interp_v128_i1_op_subtraction) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_SUB, interp_v128_i2_op_subtraction) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_SUB, interp_v128_i4_op_subtraction) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_BITWISE_AND, interp_v128_op_bitwise_and) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_BITWISE_OR, interp_v128_op_bitwise_or) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_DIVISION, interp_v128_i1_op_division) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_U1_DIVISION, interp_v128_u1_op_division) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_DIVISION, interp_v128_i2_op_division) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_U2_DIVISION, interp_v128_u2_op_division) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_DIVISION, interp_v128_i4_op_division) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_U4_DIVISION, interp_v128_u4_op_division) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_BITWISE_EQUALITY, interp_v128_op_bitwise_equality) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_BITWISE_INEQUALITY, 
interp_v128_op_bitwise_inequality) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_EXCLUSIVE_OR, interp_v128_op_exclusive_or) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_MULTIPLY, interp_v128_i1_op_multiply) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_MULTIPLY, interp_v128_i2_op_multiply) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_MULTIPLY, interp_v128_i4_op_multiply) + +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I1_NEGATION, interp_v128_i1_op_negation) +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I2_NEGATION, interp_v128_i2_op_negation) +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I4_NEGATION, interp_v128_i4_op_negation) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_LEFT_SHIFT, interp_v128_i1_op_left_shift) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_LEFT_SHIFT, interp_v128_i2_op_left_shift) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_LEFT_SHIFT, interp_v128_i4_op_left_shift) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I8_LEFT_SHIFT, interp_v128_i8_op_left_shift) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_RIGHT_SHIFT, interp_v128_i1_op_right_shift) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_RIGHT_SHIFT, interp_v128_i2_op_right_shift) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_RIGHT_SHIFT, interp_v128_i4_op_right_shift) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_URIGHT_SHIFT, interp_v128_i1_op_uright_shift) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_URIGHT_SHIFT, interp_v128_i2_op_uright_shift) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_URIGHT_SHIFT, interp_v128_i4_op_uright_shift) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I8_URIGHT_SHIFT, interp_v128_i8_op_uright_shift) + +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_ONES_COMPLEMENT, interp_v128_op_ones_complement) + +INTERP_SIMD_INTRINSIC_P_P 
(INTERP_SIMD_INTRINSIC_V128_U2_WIDEN_LOWER, interp_v128_u2_widen_lower) +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_U2_WIDEN_UPPER, interp_v128_u2_widen_upper) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_U1_NARROW, interp_v128_u1_narrow) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_U1_GREATER_THAN, interp_v128_u1_greater_than) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_LESS_THAN, interp_v128_i1_less_than) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_U1_LESS_THAN, interp_v128_u1_less_than) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_LESS_THAN, interp_v128_i2_less_than) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_EQUALS, interp_v128_i1_equals) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_EQUALS, interp_v128_i2_equals) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_EQUALS, interp_v128_i4_equals) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I8_EQUALS, interp_v128_i8_equals) + +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I1_CREATE_SCALAR, interp_v128_i1_create_scalar) +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I2_CREATE_SCALAR, interp_v128_i2_create_scalar) +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I4_CREATE_SCALAR, interp_v128_i4_create_scalar) +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I8_CREATE_SCALAR, interp_v128_i8_create_scalar) + +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I1_EXTRACT_MSB, interp_v128_i1_extract_msb) +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I2_EXTRACT_MSB, interp_v128_i2_extract_msb) +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I4_EXTRACT_MSB, interp_v128_i4_extract_msb) +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I8_EXTRACT_MSB, interp_v128_i8_extract_msb) + +INTERP_SIMD_INTRINSIC_P_PPP (INTERP_SIMD_INTRINSIC_V128_CONDITIONAL_SELECT, interp_v128_conditional_select) + +INTERP_SIMD_INTRINSIC_P_P 
(INTERP_SIMD_INTRINSIC_V128_I1_CREATE, interp_v128_i1_create) +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I2_CREATE, interp_v128_i2_create) +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I4_CREATE, interp_v128_i4_create) +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I8_CREATE, interp_v128_i8_create) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_AND_NOT, interp_v128_and_not) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_U2_LESS_THAN_EQUAL, interp_v128_u2_less_than_equal) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_SHUFFLE, interp_v128_i1_shuffle) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_SHUFFLE, interp_v128_i2_shuffle) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_SHUFFLE, interp_v128_i4_shuffle) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I8_SHUFFLE, interp_v128_i8_shuffle) diff --git a/src/mono/mono/mini/interp/interp-simd.c b/src/mono/mono/mini/interp/interp-simd.c new file mode 100644 index 0000000000000..a6966e192af89 --- /dev/null +++ b/src/mono/mono/mini/interp/interp-simd.c @@ -0,0 +1,597 @@ +#include "interp-simd.h" + +typedef gint64 v128_i8 __attribute__ ((vector_size (SIZEOF_V128))); +typedef guint64 v128_u8 __attribute__ ((vector_size (SIZEOF_V128))); +typedef gint32 v128_i4 __attribute__ ((vector_size (SIZEOF_V128))); +typedef guint32 v128_u4 __attribute__ ((vector_size (SIZEOF_V128))); +typedef gint16 v128_i2 __attribute__ ((vector_size (SIZEOF_V128))); +typedef guint16 v128_u2 __attribute__ ((vector_size (SIZEOF_V128))); +typedef gint8 v128_i1 __attribute__ ((vector_size (SIZEOF_V128))); +typedef guint8 v128_u1 __attribute__ ((vector_size (SIZEOF_V128))); + +// get_AllBitsSet +static void +interp_v128_i4_all_bits_set (gpointer res) +{ + memset (res, 0xff, SIZEOF_V128); +} + +// op_Addition +static void +interp_v128_i1_op_addition (gpointer res, gpointer v1, gpointer v2) +{ + *(v128_i1*)res = *(v128_i1*)v1 + *(v128_i1*)v2; +} + +static void 
+interp_v128_i2_op_addition (gpointer res, gpointer v1, gpointer v2) +{ + *(v128_i2*)res = *(v128_i2*)v1 + *(v128_i2*)v2; +} + +static void +interp_v128_i4_op_addition (gpointer res, gpointer v1, gpointer v2) +{ + *(v128_i4*)res = *(v128_i4*)v1 + *(v128_i4*)v2; +} + +// op_Subtraction +static void +interp_v128_i1_op_subtraction (gpointer res, gpointer v1, gpointer v2) +{ + *(v128_i1*)res = *(v128_i1*)v1 - *(v128_i1*)v2; +} + +static void +interp_v128_i2_op_subtraction (gpointer res, gpointer v1, gpointer v2) +{ + *(v128_i2*)res = *(v128_i2*)v1 - *(v128_i2*)v2; +} + +static void +interp_v128_i4_op_subtraction (gpointer res, gpointer v1, gpointer v2) +{ + *(v128_i4*)res = *(v128_i4*)v1 - *(v128_i4*)v2; +} + +// op_BitwiseAnd +static void +interp_v128_op_bitwise_and (gpointer res, gpointer v1, gpointer v2) +{ + *(v128_i4*)res = *(v128_i4*)v1 & *(v128_i4*)v2; +} + +// op_BitwiseOr +static void +interp_v128_op_bitwise_or (gpointer res, gpointer v1, gpointer v2) +{ + *(v128_i4*)res = *(v128_i4*)v1 | *(v128_i4*)v2; +} + +// op_Division +static void +interp_v128_i1_op_division (gpointer res, gpointer v1, gpointer v2) +{ + *(v128_i1*)res = *(v128_i1*)v1 / *(v128_i1*)v2; +} + +static void +interp_v128_u1_op_division (gpointer res, gpointer v1, gpointer v2) +{ + *(v128_u1*)res = *(v128_u1*)v1 / *(v128_u1*)v2; +} + +static void +interp_v128_i2_op_division (gpointer res, gpointer v1, gpointer v2) +{ + *(v128_i2*)res = *(v128_i2*)v1 / *(v128_i2*)v2; +} + +static void +interp_v128_u2_op_division (gpointer res, gpointer v1, gpointer v2) +{ + *(v128_u2*)res = *(v128_u2*)v1 / *(v128_u2*)v2; +} + +static void +interp_v128_i4_op_division (gpointer res, gpointer v1, gpointer v2) +{ + *(v128_i4*)res = *(v128_i4*)v1 / *(v128_i4*)v2; +} + +static void +interp_v128_u4_op_division (gpointer res, gpointer v1, gpointer v2) +{ + *(v128_u4*)res = *(v128_u4*)v1 / *(v128_u4*)v2; +} + +// op_Equality +static void +interp_v128_op_bitwise_equality (gpointer res, gpointer v1, gpointer v2) +{ + gint64 
*v1_cast = (gint64*)v1; + gint64 *v2_cast = (gint64*)v2; + + if (*v1_cast == *v2_cast && *(v1_cast + 1) == *(v2_cast + 1)) + *(gint32*)res = 1; + else + *(gint32*)res = 0; +} + +// op_ExclusiveOr +static void +interp_v128_op_exclusive_or (gpointer res, gpointer v1, gpointer v2) +{ + *(v128_i4*)res = *(v128_i4*)v1 ^ *(v128_i4*)v2; +} + +// op_Inequality +static void +interp_v128_op_bitwise_inequality (gpointer res, gpointer v1, gpointer v2) +{ + gint64 *v1_cast = (gint64*)v1; + gint64 *v2_cast = (gint64*)v2; + + if (*v1_cast == *v2_cast && *(v1_cast + 1) == *(v2_cast + 1)) + *(gint32*)res = 0; + else + *(gint32*)res = 1; +} + +// op_Multiply +static void +interp_v128_i1_op_multiply (gpointer res, gpointer v1, gpointer v2) +{ + *(v128_i1*)res = *(v128_i1*)v1 * *(v128_i1*)v2; +} + +static void +interp_v128_i2_op_multiply (gpointer res, gpointer v1, gpointer v2) +{ + *(v128_i2*)res = *(v128_i2*)v1 * *(v128_i2*)v2; +} + +static void +interp_v128_i4_op_multiply (gpointer res, gpointer v1, gpointer v2) +{ + *(v128_i4*)res = *(v128_i4*)v1 * *(v128_i4*)v2; +} + +// op_UnaryNegation +static void +interp_v128_i1_op_negation (gpointer res, gpointer v1) +{ + *(v128_i1*)res = - (*(v128_i1*)v1); +} + +static void +interp_v128_i2_op_negation (gpointer res, gpointer v1) +{ + *(v128_i2*)res = - (*(v128_i2*)v1); +} + +static void +interp_v128_i4_op_negation (gpointer res, gpointer v1) +{ + *(v128_i4*)res = - (*(v128_i4*)v1); +} + +// op_LeftShift +static void +interp_v128_i1_op_left_shift (gpointer res, gpointer v1, gpointer s1) +{ + *(v128_i1*)res = *(v128_i1*)v1 << (*(gint32*)s1 & 0x7); +} + +static void +interp_v128_i2_op_left_shift (gpointer res, gpointer v1, gpointer s1) +{ + *(v128_i2*)res = *(v128_i2*)v1 << (*(gint32*)s1 & 0xf); +} + +static void +interp_v128_i4_op_left_shift (gpointer res, gpointer v1, gpointer s1) +{ + *(v128_i4*)res = *(v128_i4*)v1 << (*(gint32*)s1 & 0x1f); +} + +static void +interp_v128_i8_op_left_shift (gpointer res, gpointer v1, gpointer s1) +{ + *(v128_i8*)res
= *(v128_i8*)v1 << (*(gint32*)s1 & 0x3f); +} + +// op_RightShift (shift count is masked to the element bit width) +static void +interp_v128_i1_op_right_shift (gpointer res, gpointer v1, gpointer s1) +{ + *(v128_i1*)res = *(v128_i1*)v1 >> (*(gint32*)s1 & 0x7); +} + +static void +interp_v128_i2_op_right_shift (gpointer res, gpointer v1, gpointer s1) +{ + *(v128_i2*)res = *(v128_i2*)v1 >> (*(gint32*)s1 & 0xf); +} + +static void +interp_v128_i4_op_right_shift (gpointer res, gpointer v1, gpointer s1) +{ + *(v128_i4*)res = *(v128_i4*)v1 >> (*(gint32*)s1 & 0x1f); +} + +// op_UnsignedRightShift (shift count is masked to the element bit width) +static void +interp_v128_i1_op_uright_shift (gpointer res, gpointer v1, gpointer s1) +{ + *(v128_u1*)res = *(v128_u1*)v1 >> (*(gint32*)s1 & 0x7); +} + +static void +interp_v128_i2_op_uright_shift (gpointer res, gpointer v1, gpointer s1) +{ + *(v128_u2*)res = *(v128_u2*)v1 >> (*(gint32*)s1 & 0xf); +} + +static void +interp_v128_i4_op_uright_shift (gpointer res, gpointer v1, gpointer s1) +{ + *(v128_u4*)res = *(v128_u4*)v1 >> (*(gint32*)s1 & 0x1f); +} + +static void +interp_v128_i8_op_uright_shift (gpointer res, gpointer v1, gpointer s1) +{ + *(v128_u8*)res = *(v128_u8*)v1 >> (*(gint32*)s1 & 0x3f); +} + +// op_OnesComplement +static void +interp_v128_op_ones_complement (gpointer res, gpointer v1) +{ + *(v128_i4*)res = ~(*(v128_i4*)v1); +} + +// WidenLower +static void +interp_v128_u2_widen_lower (gpointer res, gpointer v1) +{ + guint16 *res_typed = (guint16*)res; + guint64 lower_copy = *(guint64*)v1; + guint8 *v1_typed = (guint8*)&lower_copy; + + res_typed [0] = v1_typed [0]; + res_typed [1] = v1_typed [1]; + res_typed [2] = v1_typed [2]; + res_typed [3] = v1_typed [3]; + res_typed [4] = v1_typed [4]; + res_typed [5] = v1_typed [5]; + res_typed [6] = v1_typed [6]; + res_typed [7] = v1_typed [7]; +} + +// WidenUpper +static void +interp_v128_u2_widen_upper (gpointer res, gpointer v1) +{ + guint16 *res_typed = (guint16*)res; + guint64 upper_copy = *((guint64*)v1 + 1); + guint8 *v1_typed = (guint8*)&upper_copy; + + res_typed [0] = v1_typed [0]; + res_typed [1] = v1_typed [1]; + res_typed [2] =
v1_typed [2]; + res_typed [3] = v1_typed [3]; + res_typed [4] = v1_typed [4]; + res_typed [5] = v1_typed [5]; + res_typed [6] = v1_typed [6]; + res_typed [7] = v1_typed [7]; +} + +// Narrow +static void +interp_v128_u1_narrow (gpointer res, gpointer v1, gpointer v2) +{ + guint8 *res_typed = (guint8*)res; + guint16 *v1_typed = (guint16*)v1; + guint16 *v2_typed = (guint16*)v2; + + if (res != v2) { + res_typed [0] = v1_typed [0]; + res_typed [1] = v1_typed [1]; + res_typed [2] = v1_typed [2]; + res_typed [3] = v1_typed [3]; + res_typed [4] = v1_typed [4]; + res_typed [5] = v1_typed [5]; + res_typed [6] = v1_typed [6]; + res_typed [7] = v1_typed [7]; + + res_typed [8] = v2_typed [0]; + res_typed [9] = v2_typed [1]; + res_typed [10] = v2_typed [2]; + res_typed [11] = v2_typed [3]; + res_typed [12] = v2_typed [4]; + res_typed [13] = v2_typed [5]; + res_typed [14] = v2_typed [6]; + res_typed [15] = v2_typed [7]; + } else { + res_typed [15] = v2_typed [7]; + res_typed [14] = v2_typed [6]; + res_typed [13] = v2_typed [5]; + res_typed [12] = v2_typed [4]; + res_typed [11] = v2_typed [3]; + res_typed [10] = v2_typed [2]; + res_typed [9] = v2_typed [1]; + res_typed [8] = v2_typed [0]; + + res_typed [0] = v1_typed [0]; + res_typed [1] = v1_typed [1]; + res_typed [2] = v1_typed [2]; + res_typed [3] = v1_typed [3]; + res_typed [4] = v1_typed [4]; + res_typed [5] = v1_typed [5]; + res_typed [6] = v1_typed [6]; + res_typed [7] = v1_typed [7]; + } +} + +// GreaterThan +static void +interp_v128_u1_greater_than (gpointer res, gpointer v1, gpointer v2) +{ + *(v128_u1*)res = *(v128_u1*)v1 > *(v128_u1*)v2; +} + +// LessThan +static void +interp_v128_i1_less_than (gpointer res, gpointer v1, gpointer v2) +{ + *(v128_i1*)res = *(v128_i1*)v1 < *(v128_i1*)v2; +} + +static void +interp_v128_u1_less_than (gpointer res, gpointer v1, gpointer v2) +{ + *(v128_u1*)res = *(v128_u1*)v1 < *(v128_u1*)v2; +} + +static void +interp_v128_i2_less_than (gpointer res, gpointer v1, gpointer v2) +{ + 
*(v128_i2*)res = *(v128_i2*)v1 < *(v128_i2*)v2; +} + +// Equals +static void +interp_v128_i1_equals (gpointer res, gpointer v1, gpointer v2) +{ + *(v128_i1*)res = *(v128_i1*)v1 == *(v128_i1*)v2; +} + +static void +interp_v128_i2_equals (gpointer res, gpointer v1, gpointer v2) +{ + *(v128_i2*)res = *(v128_i2*)v1 == *(v128_i2*)v2; +} + +static void +interp_v128_i4_equals (gpointer res, gpointer v1, gpointer v2) +{ + *(v128_i4*)res = *(v128_i4*)v1 == *(v128_i4*)v2; +} + +static void +interp_v128_i8_equals (gpointer res, gpointer v1, gpointer v2) +{ + *(v128_i8*)res = *(v128_i8*)v1 == *(v128_i8*)v2; +} + +// CreateScalar +static void +interp_v128_i1_create_scalar (gpointer res, gpointer v1) +{ + gint8 val = *(gint8*)v1; + memset (res, 0, SIZEOF_V128); + *(gint8*)res = val; +} + +static void +interp_v128_i2_create_scalar (gpointer res, gpointer v1) +{ + gint16 val = *(gint16*)v1; + memset (res, 0, SIZEOF_V128); + *(gint16*)res = val; +} +static void +interp_v128_i4_create_scalar (gpointer res, gpointer v1) +{ + gint32 val = *(gint32*)v1; + memset (res, 0, SIZEOF_V128); + *(gint32*)res = val; +} +static void +interp_v128_i8_create_scalar (gpointer res, gpointer v1) +{ + gint64 val = *(gint64*)v1; + memset (res, 0, SIZEOF_V128); + *(gint64*)res = val; +} + +// ExtractMostSignificantBits +static void +interp_v128_i1_extract_msb (gpointer res, gpointer v1) +{ + guint32 val = 0; + gint8 *v1_typed = (gint8*)v1; + for (int i = 0; i < SIZEOF_V128 / sizeof (gint8); i++) { + if (*v1_typed & (1 << 7)) + val |= 1 << i; + v1_typed++; + } + *(guint32*)res = val; +} + +static void +interp_v128_i2_extract_msb (gpointer res, gpointer v1) +{ + guint32 val = 0; + gint16 *v1_typed = (gint16*)v1; + for (int i = 0; i < SIZEOF_V128 / sizeof (gint16); i++) { + if (*v1_typed & (1 << 15)) + val |= 1 << i; + v1_typed++; + } + *(guint32*)res = val; +} + +static void +interp_v128_i4_extract_msb (gpointer res, gpointer v1) +{ + guint32 val = 0; + gint32 *v1_typed = (gint32*)v1; + for (int i = 0; i < 
SIZEOF_V128 / sizeof (gint32); i++) { + if (*v1_typed & (1u << 31)) /* 1u: shifting a signed 1 into the sign bit is undefined */ + val |= 1 << i; + v1_typed++; + } + *(guint32*)res = val; +} + +static void +interp_v128_i8_extract_msb (gpointer res, gpointer v1) +{ + guint32 val = 0; + gint64 *v1_typed = (gint64*)v1; + for (int i = 0; i < SIZEOF_V128 / sizeof (gint64); i++) { + if (*v1_typed & ((guint64)1 << 63)) /* unsigned mask: signed 1 << 63 is undefined */ + val |= 1 << i; + v1_typed++; + } + *(guint32*)res = val; +} + +// ConditionalSelect +static void +interp_v128_conditional_select (gpointer res, gpointer v1, gpointer v2, gpointer v3) +{ + v128_i8 cond = *(v128_i8*)v1; + *(v128_i8*)res = (*(v128_i8*)v2 & cond) | (*(v128_i8*)v3 & ~cond); +} + +// Create +static void +interp_v128_i1_create (gpointer res, gpointer v1) +{ + gint8 val = *(gint8*)v1; + v128_i1 v = { val, val, val, val, + val, val, val, val, + val, val, val, val, + val, val, val, val }; + *(v128_i1*)res = v; +} + +static void +interp_v128_i2_create (gpointer res, gpointer v1) +{ + gint16 val = *(gint16*)v1; + v128_i2 v = { val, val, val, val, + val, val, val, val }; + *(v128_i2*)res = v; +} + +static void +interp_v128_i4_create (gpointer res, gpointer v1) +{ + gint32 val = *(gint32*)v1; + v128_i4 v = { val, val, val, val }; + *(v128_i4*)res = v; +} + +static void +interp_v128_i8_create (gpointer res, gpointer v1) +{ + gint64 val = *(gint64*)v1; + v128_i8 v = { val, val }; + *(v128_i8*)res = v; +} + +// AndNot +static void +interp_v128_and_not (gpointer res, gpointer v1, gpointer v2) +{ + *(v128_i4*)res = *(v128_i4*)v1 & ~(*(v128_i4*)v2); +} + +// LessThanOrEqual +static void +interp_v128_u2_less_than_equal (gpointer res, gpointer v1, gpointer v2) +{ + *(v128_u2*)res = *(v128_u2*)v1 <= *(v128_u2*)v2; +} + +// Shuffle + +#define V128_SHUFFLE(eltype, itype) do { \ + eltype result[16]; \ + eltype *v1_typed = (eltype*)v1; \ + itype *v2_typed = (itype*)v2; \ + for (int i = 0; i < SIZEOF_V128 / sizeof (eltype); i++) { \ + itype index = v2_typed [i]; \ + if (index < (SIZEOF_V128 / sizeof (eltype))) \ + result [i] = 
v1_typed [index]; \ + else \ + result [i] = 0; \ + } \ + memcpy (res, result, SIZEOF_V128); \ + } while (0) +static void +interp_v128_i1_shuffle (gpointer res, gpointer v1, gpointer v2) +{ + V128_SHUFFLE (gint8, guint8); +} + +static void +interp_v128_i2_shuffle (gpointer res, gpointer v1, gpointer v2) +{ + V128_SHUFFLE (gint16, guint16); +} + +static void +interp_v128_i4_shuffle (gpointer res, gpointer v1, gpointer v2) +{ + V128_SHUFFLE (gint32, guint32); +} + +static void +interp_v128_i8_shuffle (gpointer res, gpointer v1, gpointer v2) +{ + V128_SHUFFLE (gint64, guint64); +} + +#define INTERP_SIMD_INTRINSIC_P_P(a,b) +#define INTERP_SIMD_INTRINSIC_P_PP(a,b) +#define INTERP_SIMD_INTRINSIC_P_PPP(a,b) + +#undef INTERP_SIMD_INTRINSIC_P_P +#define INTERP_SIMD_INTRINSIC_P_P(a,b) b, +PP_SIMD_Method interp_simd_p_p_table [] = { +#include "interp-simd-intrins.def" +}; +#undef INTERP_SIMD_INTRINSIC_P_P +#define INTERP_SIMD_INTRINSIC_P_P(a,b) + +#undef INTERP_SIMD_INTRINSIC_P_PP +#define INTERP_SIMD_INTRINSIC_P_PP(a,b) b, +PPP_SIMD_Method interp_simd_p_pp_table [] = { +#include "interp-simd-intrins.def" +}; +#undef INTERP_SIMD_INTRINSIC_P_PP +#define INTERP_SIMD_INTRINSIC_P_PP(a,b) + +#undef INTERP_SIMD_INTRINSIC_P_PPP +#define INTERP_SIMD_INTRINSIC_P_PPP(a,b) b, +PPPP_SIMD_Method interp_simd_p_ppp_table [] = { +#include "interp-simd-intrins.def" +}; +#undef INTERP_SIMD_INTRINSIC_P_PPP +#define INTERP_SIMD_INTRINSIC_P_PPP(a,b) diff --git a/src/mono/mono/mini/interp/interp-simd.h b/src/mono/mono/mini/interp/interp-simd.h new file mode 100644 index 0000000000000..3763c571069ba --- /dev/null +++ b/src/mono/mono/mini/interp/interp-simd.h @@ -0,0 +1,16 @@ +#ifndef __MONO_MINI_INTERP_SIMD_H__ +#define __MONO_MINI_INTERP_SIMD_H__ + +#include + +typedef void (*PP_SIMD_Method) (gpointer, gpointer); +typedef void (*PPP_SIMD_Method) (gpointer, gpointer, gpointer); +typedef void (*PPPP_SIMD_Method) (gpointer, gpointer, gpointer, gpointer); + +extern PP_SIMD_Method interp_simd_p_p_table 
[]; +extern PPP_SIMD_Method interp_simd_p_pp_table []; +extern PPPP_SIMD_Method interp_simd_p_ppp_table []; + +#endif /* __MONO_MINI_INTERP_SIMD_H__ */ + + diff --git a/src/mono/mono/mini/interp/interp.c b/src/mono/mono/mini/interp/interp.c index 81ccb95ed937e..f28afbc957992 100644 --- a/src/mono/mono/mini/interp/interp.c +++ b/src/mono/mono/mini/interp/interp.c @@ -65,6 +65,7 @@ #include "mintops.h" #include "interp-intrins.h" #include "tiering.h" +#include "interp-simd.h" #include #include @@ -5791,6 +5792,74 @@ MINT_IN_CASE(MINT_BRTRUE_I8_SP) ZEROP_SP(gint64, !=); MINT_IN_BREAK; ip += 7; goto call; } + + MINT_IN_CASE(MINT_SIMD_V128_LDC) { + memcpy (locals + ip [1], ip + 2, SIZEOF_V128); + ip += 10; + MINT_IN_BREAK; + } + MINT_IN_CASE(MINT_SIMD_V128_I1_CREATE) { + const int num_elements = SIZEOF_V128 / sizeof (gint8); + gint8 res_buffer [num_elements]; + gint8 *args = (gint8*)(locals + ip [2]); + for (int i = 0; i < num_elements; i++) { + res_buffer [i] = *args; + args += MINT_STACK_SLOT_SIZE / sizeof (gint8); + } + memcpy (locals + ip [1], res_buffer, SIZEOF_V128); + ip += 3; + MINT_IN_BREAK; + } + MINT_IN_CASE(MINT_SIMD_V128_I2_CREATE) { + const int num_elements = SIZEOF_V128 / sizeof (gint16); + gint16 res_buffer [num_elements]; + gint16 *args = (gint16*)(locals + ip [2]); + for (int i = 0; i < num_elements; i++) { + res_buffer [i] = *args; + args += MINT_STACK_SLOT_SIZE / sizeof (gint16); + } + memcpy (locals + ip [1], res_buffer, SIZEOF_V128); + ip += 3; + MINT_IN_BREAK; + } + MINT_IN_CASE(MINT_SIMD_V128_I4_CREATE) { + const int num_elements = SIZEOF_V128 / sizeof (gint32); + gint32 res_buffer [num_elements]; + gint32 *args = (gint32*)(locals + ip [2]); + for (int i = 0; i < num_elements; i++) { + res_buffer [i] = *args; + args += MINT_STACK_SLOT_SIZE / sizeof (gint32); + } + memcpy (locals + ip [1], res_buffer, SIZEOF_V128); + ip += 3; + MINT_IN_BREAK; + } + MINT_IN_CASE(MINT_SIMD_V128_I8_CREATE) { + const int num_elements = SIZEOF_V128 / sizeof (gint64); + 
gint64 res_buffer [num_elements]; + gint64 *args = (gint64*)(locals + ip [2]); + for (int i = 0; i < num_elements; i++) { + res_buffer [i] = *args; + args += MINT_STACK_SLOT_SIZE / sizeof (gint64); + } + memcpy (locals + ip [1], res_buffer, SIZEOF_V128); + ip += 3; + MINT_IN_BREAK; + } + + MINT_IN_CASE(MINT_SIMD_INTRINS_P_P) + interp_simd_p_p_table [ip [3]] (locals + ip [1], locals + ip [2]); + ip += 4; + MINT_IN_BREAK; + MINT_IN_CASE(MINT_SIMD_INTRINS_P_PP) + interp_simd_p_pp_table [ip [4]] (locals + ip [1], locals + ip [2], locals + ip [3]); + ip += 5; + MINT_IN_BREAK; + MINT_IN_CASE(MINT_SIMD_INTRINS_P_PPP) + interp_simd_p_ppp_table [ip [5]] (locals + ip [1], locals + ip [2], locals + ip [3], locals + ip [4]); + ip += 6; + MINT_IN_BREAK; + MINT_IN_CASE(MINT_INTRINS_SPAN_CTOR) { gpointer ptr = LOCAL_VAR (ip [2], gpointer); int len = LOCAL_VAR (ip [3], gint32); diff --git a/src/mono/mono/mini/interp/mintops.def b/src/mono/mono/mini/interp/mintops.def index c0034017930f8..cb957a5ee0f9a 100644 --- a/src/mono/mono/mini/interp/mintops.def +++ b/src/mono/mono/mini/interp/mintops.def @@ -794,6 +794,16 @@ OPDEF(MINT_PROF_COVERAGE_STORE, "prof_coverage_store", 5, 0, 0, MintOpLongInt) OPDEF(MINT_TIER_ENTER_METHOD, "tier_enter_method", 1, 0, 0, MintOpNoArgs) OPDEF(MINT_TIER_PATCHPOINT, "tier_patchpoint", 2, 0, 0, MintOpShortInt) +OPDEF(MINT_SIMD_V128_LDC, "simd_v128_ldc", 10, 1, 0, MintOpNoArgs) +OPDEF(MINT_SIMD_V128_I1_CREATE, "simd_v128_i1_create", 3, 1, 1, MintOpNoArgs) +OPDEF(MINT_SIMD_V128_I2_CREATE, "simd_v128_i2_create", 3, 1, 1, MintOpNoArgs) +OPDEF(MINT_SIMD_V128_I4_CREATE, "simd_v128_i4_create", 3, 1, 1, MintOpNoArgs) +OPDEF(MINT_SIMD_V128_I8_CREATE, "simd_v128_i8_create", 3, 1, 1, MintOpNoArgs) + +OPDEF(MINT_SIMD_INTRINS_P_P, "simd_intrins_p_p", 4, 1, 1, MintOpShortInt) +OPDEF(MINT_SIMD_INTRINS_P_PP, "simd_intrins_p_pp", 5, 1, 2, MintOpShortInt) +OPDEF(MINT_SIMD_INTRINS_P_PPP, "simd_intrins_p_ppp", 6, 1, 3, MintOpShortInt) + OPDEF(MINT_INTRINS_ENUM_HASFLAG, 
"intrins_enum_hasflag", 5, 1, 2, MintOpClassToken) OPDEF(MINT_INTRINS_GET_HASHCODE, "intrins_get_hashcode", 3, 1, 1, MintOpNoArgs) OPDEF(MINT_INTRINS_TRY_GET_HASHCODE, "intrins_try_get_hashcode", 3, 1, 1, MintOpNoArgs) diff --git a/src/mono/mono/mini/interp/mintops.h b/src/mono/mono/mini/interp/mintops.h index b5afbc18a9286..10b021698bc36 100644 --- a/src/mono/mono/mini/interp/mintops.h +++ b/src/mono/mono/mini/interp/mintops.h @@ -5,6 +5,7 @@ #ifndef __INTERPRETER_MINTOPS_H #define __INTERPRETER_MINTOPS_H +#include #include typedef enum @@ -38,6 +39,38 @@ typedef enum { } MintOpcode; #undef OPDEF +/* SIMD opcodes, grouped by signature */ + +#define INTERP_SIMD_INTRINSIC_P_P(a,b) +#define INTERP_SIMD_INTRINSIC_P_PP(a,b) +#define INTERP_SIMD_INTRINSIC_P_PPP(a,b) + +#undef INTERP_SIMD_INTRINSIC_P_P +#define INTERP_SIMD_INTRINSIC_P_P(a,b) a, +typedef enum { +#include "interp-simd-intrins.def" +} MintSIMDOpsPP; +#undef INTERP_SIMD_INTRINSIC_P_P +#define INTERP_SIMD_INTRINSIC_P_P(a,b) + +#undef INTERP_SIMD_INTRINSIC_P_PP +#define INTERP_SIMD_INTRINSIC_P_PP(a,b) a, +typedef enum { +#include "interp-simd-intrins.def" + INTERP_SIMD_INTRINSIC_P_PP_LAST +} MintSIMDOpsPPP; +#undef INTERP_SIMD_INTRINSIC_P_PP +#define INTERP_SIMD_INTRINSIC_P_PP(a,b) + +#undef INTERP_SIMD_INTRINSIC_P_PPP +#define INTERP_SIMD_INTRINSIC_P_PPP(a,b) a, +typedef enum { +#include "interp-simd-intrins.def" + INTERP_SIMD_INTRINSIC_P_PPP_LAST +} MintSIMDOpsPPPP; +#undef INTERP_SIMD_INTRINSIC_P_PPP +#define INTERP_SIMD_INTRINSIC_P_PPP(a,b) + #if NO_UNALIGNED_ACCESS # if G_BYTE_ORDER == G_LITTLE_ENDIAN #define READ32(x) (((guint16 *)(x)) [0] | ((guint16 *)(x)) [1] << 16) diff --git a/src/mono/mono/mini/interp/simd-methods.def b/src/mono/mono/mini/interp/simd-methods.def new file mode 100644 index 0000000000000..6b3ef8e5d9c61 --- /dev/null +++ b/src/mono/mono/mini/interp/simd-methods.def @@ -0,0 +1,39 @@ +SIMD_METHOD(get_Count) +SIMD_METHOD(get_AllBitsSet) +SIMD_METHOD(get_IsHardwareAccelerated) 
+SIMD_METHOD(get_Item) +SIMD_METHOD(get_One) +SIMD_METHOD(get_Zero) +SIMD_METHOD(op_Addition) +SIMD_METHOD(op_BitwiseAnd) +SIMD_METHOD(op_BitwiseOr) +SIMD_METHOD(op_Division) +SIMD_METHOD(op_Equality) +SIMD_METHOD(op_ExclusiveOr) +SIMD_METHOD(op_Explicit) +SIMD_METHOD(op_Inequality) +SIMD_METHOD(op_LeftShift) +SIMD_METHOD(op_Multiply) +SIMD_METHOD(op_OnesComplement) +SIMD_METHOD(op_RightShift) +SIMD_METHOD(op_Subtraction) +SIMD_METHOD(op_UnaryNegation) +SIMD_METHOD(op_UnsignedRightShift) + +SIMD_METHOD(AndNot) +SIMD_METHOD(ConditionalSelect) +SIMD_METHOD(Create) +SIMD_METHOD(CreateScalar) +SIMD_METHOD(CreateScalarUnsafe) +SIMD_METHOD(Equals) +SIMD_METHOD(ExtractMostSignificantBits) +SIMD_METHOD(GreaterThan) +SIMD_METHOD(LessThan) +SIMD_METHOD(LessThanOrEqual) +SIMD_METHOD(Narrow) +SIMD_METHOD(ShiftLeft) +SIMD_METHOD(ShiftRightArithmetic) +SIMD_METHOD(ShiftRightLogical) +SIMD_METHOD(Shuffle) +SIMD_METHOD(WidenLower) +SIMD_METHOD(WidenUpper) diff --git a/src/mono/mono/mini/interp/transform-simd.c b/src/mono/mono/mini/interp/transform-simd.c new file mode 100644 index 0000000000000..1b2c94b53436b --- /dev/null +++ b/src/mono/mono/mini/interp/transform-simd.c @@ -0,0 +1,474 @@ +/* + * SIMD Intrinsics support for interpreter + */ + +#include + +// We use the same approach as jit/aot for identifying simd methods. 
+// FIXME Consider sharing the code + +#define MSGSTRFIELD(line) MSGSTRFIELD1(line) +#define MSGSTRFIELD1(line) str##line +static const struct msgstr_t { +#define SIMD_METHOD(name) char MSGSTRFIELD(__LINE__) [sizeof (#name)]; +#include "simd-methods.def" +#undef SIMD_METHOD +} method_names = { +#define SIMD_METHOD(name) #name, +#include "simd-methods.def" +#undef SIMD_METHOD +}; + +enum { +#define SIMD_METHOD(name) SN_ ## name = offsetof (struct msgstr_t, MSGSTRFIELD(__LINE__)), +#include "simd-methods.def" +}; + +#define method_name(idx) ((const char*)&method_names + (idx)) + +static int +simd_intrinsic_compare_by_name (const void *key, const void *value) +{ + return strcmp ((const char*)key, method_name (*(guint16*)value)); +} + +static int +lookup_intrins (guint16 *intrinsics, int size, MonoMethod *cmethod) +{ + guint16 *result = mono_binary_search (cmethod->name, intrinsics, size / sizeof (guint16), sizeof (guint16), &simd_intrinsic_compare_by_name); + + if (result == NULL) + return -1; + else + return (int)*result; +} + +static guint16 sri_vector128_methods [] = { + SN_AndNot, + SN_ConditionalSelect, + SN_Create, + SN_CreateScalar, + SN_CreateScalarUnsafe, + SN_Equals, + SN_ExtractMostSignificantBits, + SN_GreaterThan, + SN_LessThan, + SN_LessThanOrEqual, + SN_Narrow, + SN_ShiftLeft, + SN_ShiftRightArithmetic, + SN_ShiftRightLogical, + SN_Shuffle, + SN_WidenLower, + SN_WidenUpper, + SN_get_IsHardwareAccelerated +}; + +static guint16 sri_vector128_t_methods [] = { + SN_get_AllBitsSet, + SN_get_Count, + SN_get_One, + SN_get_Zero, + SN_op_Addition, + SN_op_BitwiseAnd, + SN_op_BitwiseOr, + SN_op_Division, + SN_op_Equality, + SN_op_ExclusiveOr, + SN_op_Inequality, + SN_op_LeftShift, + SN_op_Multiply, + SN_op_OnesComplement, + SN_op_RightShift, + SN_op_Subtraction, + SN_op_UnaryNegation, + SN_op_UnsignedRightShift +}; + +static gboolean +emit_sri_vector128 (TransformData *td, MonoMethod *cmethod, MonoMethodSignature *csignature) +{ + int id = lookup_intrins 
(sri_vector128_methods, sizeof (sri_vector128_methods), cmethod); + if (id == -1) + return FALSE; + + if (id == SN_get_IsHardwareAccelerated) { + interp_add_ins (td, MINT_LDC_I4_1); + goto opcode_added; + } + + gint16 simd_opcode = -1; + gint16 simd_intrins = -1; + + MonoClass *vector_klass = mono_class_from_mono_type_internal (csignature->ret); + if (!m_class_is_simd_type (vector_klass)) + vector_klass = mono_class_from_mono_type_internal (csignature->params [0]); + if (!m_class_is_simd_type (vector_klass)) + return FALSE; + MonoType *arg_type = mono_class_get_context (vector_klass)->class_inst->type_argv [0]; + MonoTypeEnum atype = arg_type->type; + int vector_size = mono_class_value_size (vector_klass, NULL); + int arg_size = mono_class_value_size (mono_class_from_mono_type_internal (arg_type), NULL); + g_assert (vector_size == SIZEOF_V128); + + int scalar_arg = -1; + for (int i = 0; i < csignature->param_count; i++) { + if (csignature->params [i]->type != MONO_TYPE_GENERICINST) + scalar_arg = i; + } + + switch (id) { + case SN_AndNot: + simd_opcode = MINT_SIMD_INTRINS_P_PP; + simd_intrins = INTERP_SIMD_INTRINSIC_V128_AND_NOT; + break; + case SN_ConditionalSelect: + simd_opcode = MINT_SIMD_INTRINS_P_PPP; + simd_intrins = INTERP_SIMD_INTRINSIC_V128_CONDITIONAL_SELECT; + break; + case SN_Create: + if (csignature->param_count == 1 && atype == csignature->params [0]->type) { + simd_opcode = MINT_SIMD_INTRINS_P_P; + if (arg_size == 1) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I1_CREATE; + else if (arg_size == 2) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I2_CREATE; + else if (arg_size == 4) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I4_CREATE; + else if (arg_size == 8) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I8_CREATE; + } else if (csignature->param_count == vector_size / arg_size && atype == csignature->params [0]->type) { + int num_args = csignature->param_count; + if (num_args == 16) interp_add_ins (td, MINT_SIMD_V128_I1_CREATE); + else if (num_args == 8) 
interp_add_ins (td, MINT_SIMD_V128_I2_CREATE); + else if (num_args == 4) interp_add_ins (td, MINT_SIMD_V128_I4_CREATE); + else if (num_args == 2) interp_add_ins (td, MINT_SIMD_V128_I8_CREATE); + else g_assert_not_reached (); + + // We use call args machinery since we have too many args + interp_ins_set_sreg (td->last_ins, MINT_CALL_ARGS_SREG); + int *call_args = (int*)mono_mempool_alloc (td->mempool, (num_args + 1) * sizeof (int)); + td->sp -= csignature->param_count; + for (int i = 0; i < num_args; i++) + call_args [i] = td->sp [i].local; + call_args [num_args] = -1; + init_last_ins_call (td); + td->last_ins->info.call_info->call_args = call_args; + if (!td->optimized) + td->last_ins->info.call_info->call_offset = get_tos_offset (td); + push_type_vt (td, vector_klass, vector_size); + interp_ins_set_dreg (td->last_ins, td->sp [-1].local); + td->ip += 5; + return TRUE; + } + break; + case SN_CreateScalar: + case SN_CreateScalarUnsafe: + simd_opcode = MINT_SIMD_INTRINS_P_P; + if (arg_size == 1) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I1_CREATE_SCALAR; + else if (arg_size == 2) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I2_CREATE_SCALAR; + else if (arg_size == 4) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I4_CREATE_SCALAR; + else if (arg_size == 8) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I8_CREATE_SCALAR; + break; + case SN_Equals: + simd_opcode = MINT_SIMD_INTRINS_P_PP; + if (atype == MONO_TYPE_I1 || atype == MONO_TYPE_U1) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I1_EQUALS; + else if (atype == MONO_TYPE_I2 || atype == MONO_TYPE_U2) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I2_EQUALS; + else if (atype == MONO_TYPE_I4 || atype == MONO_TYPE_U4) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I4_EQUALS; + else if (atype == MONO_TYPE_I8 || atype == MONO_TYPE_U8) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I8_EQUALS; + break; + case SN_ExtractMostSignificantBits: + simd_opcode = MINT_SIMD_INTRINS_P_P; + if (arg_size == 1) simd_intrins = 
INTERP_SIMD_INTRINSIC_V128_I1_EXTRACT_MSB; + else if (arg_size == 2) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I2_EXTRACT_MSB; + else if (arg_size == 4) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I4_EXTRACT_MSB; + else if (arg_size == 8) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I8_EXTRACT_MSB; + break; + case SN_GreaterThan: + simd_opcode = MINT_SIMD_INTRINS_P_PP; + if (atype == MONO_TYPE_U1) simd_intrins = INTERP_SIMD_INTRINSIC_V128_U1_GREATER_THAN; + break; + case SN_LessThan: + simd_opcode = MINT_SIMD_INTRINS_P_PP; + if (atype == MONO_TYPE_I1) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I1_LESS_THAN; + else if (atype == MONO_TYPE_U1) simd_intrins = INTERP_SIMD_INTRINSIC_V128_U1_LESS_THAN; + else if (atype == MONO_TYPE_I2) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I2_LESS_THAN; + break; + case SN_LessThanOrEqual: + simd_opcode = MINT_SIMD_INTRINS_P_PP; + if (atype == MONO_TYPE_U2) simd_intrins = INTERP_SIMD_INTRINSIC_V128_U2_LESS_THAN_EQUAL; + break; + case SN_Narrow: + simd_opcode = MINT_SIMD_INTRINS_P_PP; + if (atype == MONO_TYPE_U1) simd_intrins = INTERP_SIMD_INTRINSIC_V128_U1_NARROW; + break; + case SN_ShiftLeft: + g_assert (scalar_arg == 1); + simd_opcode = MINT_SIMD_INTRINS_P_PP; + if (arg_size == 1) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I1_LEFT_SHIFT; + else if (arg_size == 2) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I2_LEFT_SHIFT; + else if (arg_size == 4) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I4_LEFT_SHIFT; + else if (arg_size == 8) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I8_LEFT_SHIFT; + break; + case SN_ShiftRightLogical: + g_assert (scalar_arg == 1); + simd_opcode = MINT_SIMD_INTRINS_P_PP; + if (arg_size == 1) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I1_URIGHT_SHIFT; + else if (arg_size == 2) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I2_URIGHT_SHIFT; + else if (arg_size == 4) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I4_URIGHT_SHIFT; + else if (arg_size == 8) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I8_URIGHT_SHIFT; + break; + case 
SN_ShiftRightArithmetic: + g_assert (scalar_arg == 1); + simd_opcode = MINT_SIMD_INTRINS_P_PP; + if (atype == MONO_TYPE_I1) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I1_RIGHT_SHIFT; + else if (atype == MONO_TYPE_I2) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I2_RIGHT_SHIFT; + else if (atype == MONO_TYPE_I4) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I4_RIGHT_SHIFT; + else if (atype == MONO_TYPE_U1) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I1_URIGHT_SHIFT; + else if (atype == MONO_TYPE_U2) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I2_URIGHT_SHIFT; + else if (atype == MONO_TYPE_U4) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I4_URIGHT_SHIFT; + break; + case SN_Shuffle: + simd_opcode = MINT_SIMD_INTRINS_P_PP; + if (arg_size == 1) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I1_SHUFFLE; + else if (arg_size == 2) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I2_SHUFFLE; + else if (arg_size == 4) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I4_SHUFFLE; + else if (arg_size == 8) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I8_SHUFFLE; + break; + case SN_WidenLower: + simd_opcode = MINT_SIMD_INTRINS_P_P; + if (atype == MONO_TYPE_U2) simd_intrins = INTERP_SIMD_INTRINSIC_V128_U2_WIDEN_LOWER; + break; + case SN_WidenUpper: + simd_opcode = MINT_SIMD_INTRINS_P_P; + if (atype == MONO_TYPE_U2) simd_intrins = INTERP_SIMD_INTRINSIC_V128_U2_WIDEN_UPPER; + break; + default: + return FALSE; + } + + if (simd_opcode == -1 || simd_intrins == -1) { + return FALSE; + } + + interp_add_ins (td, simd_opcode); + td->last_ins->data [0] = simd_intrins; + +opcode_added: + td->sp -= csignature->param_count; + for (int i = 0; i < csignature->param_count; i++) + td->last_ins->sregs [i] = td->sp [i].local; + + g_assert (csignature->ret->type != MONO_TYPE_VOID); + int ret_mt = mint_type (csignature->ret); + if (ret_mt == MINT_TYPE_VT) { + // For these intrinsics, if we return a VT then it is a V128 + push_type_vt (td, vector_klass, vector_size); + } else { + push_simple_type (td, stack_type [ret_mt]); + } + 
interp_ins_set_dreg (td->last_ins, td->sp [-1].local); + td->ip += 5; + return TRUE; +} + +static gboolean +emit_sri_vector128_t (TransformData *td, MonoMethod *cmethod, MonoMethodSignature *csignature) +{ + int id = lookup_intrins (sri_vector128_t_methods, sizeof (sri_vector128_t_methods), cmethod); + if (id == -1) + return FALSE; + + gint16 simd_opcode = -1; + gint16 simd_intrins = -1; + + // First argument is always vector + MonoClass *vector_klass = cmethod->klass; + MonoType *arg_type = mono_class_get_context (vector_klass)->class_inst->type_argv [0]; + MonoTypeEnum atype = arg_type->type; + int vector_size = mono_class_value_size (vector_klass, NULL); + int arg_size = mono_class_value_size (mono_class_from_mono_type_internal (arg_type), NULL); + g_assert (vector_size == SIZEOF_V128); + + int scalar_arg = -1; + for (int i = 0; i < csignature->param_count; i++) { + if (csignature->params [i]->type != MONO_TYPE_GENERICINST) + scalar_arg = i; + } + + switch (id) { + case SN_get_AllBitsSet: { + interp_add_ins (td, MINT_SIMD_V128_LDC); + guint16 *data = &td->last_ins->data [0]; + for (int i = 0; i < vector_size / sizeof (guint16); i++) + data [i] = 0xffff; + goto opcode_added; + } + case SN_get_Count: + interp_add_ins (td, MINT_LDC_I4_S); + td->last_ins->data [0] = vector_size / arg_size; + goto opcode_added; + case SN_get_One: + if (atype == MONO_TYPE_I1 || atype == MONO_TYPE_U1) { + interp_add_ins (td, MINT_SIMD_V128_LDC); + gint8 *data = (gint8*)&td->last_ins->data [0]; + for (int i = 0; i < vector_size / arg_size; i++) + data [i] = 1; + goto opcode_added; + } else if (atype == MONO_TYPE_I2 || atype == MONO_TYPE_U2) { + interp_add_ins (td, MINT_SIMD_V128_LDC); + gint16 *data = (gint16*)&td->last_ins->data [0]; + for (int i = 0; i < vector_size / arg_size; i++) + data [i] = 1; + goto opcode_added; + } else if (atype == MONO_TYPE_I4 || atype == MONO_TYPE_U4) { + interp_add_ins (td, MINT_SIMD_V128_LDC); + gint32 *data = (gint32*)&td->last_ins->data [0]; + for (int 
i = 0; i < vector_size / arg_size; i++) + data [i] = 1; + goto opcode_added; + } else if (atype == MONO_TYPE_I8 || atype == MONO_TYPE_U8) { + interp_add_ins (td, MINT_SIMD_V128_LDC); + gint64 *data = (gint64*)&td->last_ins->data [0]; + for (int i = 0; i < vector_size / arg_size; i++) + data [i] = 1; + goto opcode_added; + } + break; + case SN_get_Zero: + interp_add_ins (td, MINT_INITLOCAL); + td->last_ins->data [0] = SIZEOF_V128; + goto opcode_added; + case SN_op_Addition: + simd_opcode = MINT_SIMD_INTRINS_P_PP; + if (atype == MONO_TYPE_I1 || atype == MONO_TYPE_U1) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I1_ADD; + else if (atype == MONO_TYPE_I2 || atype == MONO_TYPE_U2) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I2_ADD; + else if (atype == MONO_TYPE_I4 || atype == MONO_TYPE_U4) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I4_ADD; + break; + case SN_op_BitwiseAnd: + simd_opcode = MINT_SIMD_INTRINS_P_PP; + simd_intrins = INTERP_SIMD_INTRINSIC_V128_BITWISE_AND; + break; + case SN_op_BitwiseOr: + simd_opcode = MINT_SIMD_INTRINS_P_PP; + simd_intrins = INTERP_SIMD_INTRINSIC_V128_BITWISE_OR; + break; + case SN_op_Division: + g_assert (scalar_arg == -1); + simd_opcode = MINT_SIMD_INTRINS_P_PP; + if (atype == MONO_TYPE_I1) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I1_DIVISION; + else if (atype == MONO_TYPE_U1) simd_intrins = INTERP_SIMD_INTRINSIC_V128_U1_DIVISION; + else if (atype == MONO_TYPE_I2) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I2_DIVISION; + else if (atype == MONO_TYPE_U2) simd_intrins = INTERP_SIMD_INTRINSIC_V128_U2_DIVISION; + else if (atype == MONO_TYPE_I4) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I4_DIVISION; + else if (atype == MONO_TYPE_U4) simd_intrins = INTERP_SIMD_INTRINSIC_V128_U4_DIVISION; + break; + case SN_op_Equality: + if (atype != MONO_TYPE_R4 && atype != MONO_TYPE_R8) { + simd_opcode = MINT_SIMD_INTRINS_P_PP; + simd_intrins = INTERP_SIMD_INTRINSIC_V128_BITWISE_EQUALITY; + } + break; + case SN_op_ExclusiveOr: + simd_opcode = 
MINT_SIMD_INTRINS_P_PP; + simd_intrins = INTERP_SIMD_INTRINSIC_V128_EXCLUSIVE_OR; + break; + case SN_op_Inequality: + if (atype != MONO_TYPE_R4 && atype != MONO_TYPE_R8) { + simd_opcode = MINT_SIMD_INTRINS_P_PP; + simd_intrins = INTERP_SIMD_INTRINSIC_V128_BITWISE_INEQUALITY; + } + break; + case SN_op_LeftShift: + g_assert (scalar_arg == 1); + simd_opcode = MINT_SIMD_INTRINS_P_PP; + if (arg_size == 1) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I1_LEFT_SHIFT; + else if (arg_size == 2) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I2_LEFT_SHIFT; + else if (arg_size == 4) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I4_LEFT_SHIFT; + else if (arg_size == 8) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I8_LEFT_SHIFT; + break; + case SN_op_Multiply: + g_assert (scalar_arg == -1); + simd_opcode = MINT_SIMD_INTRINS_P_PP; + if (atype == MONO_TYPE_I1 || atype == MONO_TYPE_U1) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I1_MULTIPLY; + else if (atype == MONO_TYPE_I2 || atype == MONO_TYPE_U2) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I2_MULTIPLY; + else if (atype == MONO_TYPE_I4 || atype == MONO_TYPE_U4) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I4_MULTIPLY; + break; + case SN_op_OnesComplement: + simd_opcode = MINT_SIMD_INTRINS_P_P; + simd_intrins = INTERP_SIMD_INTRINSIC_V128_ONES_COMPLEMENT; + break; + case SN_op_RightShift: + g_assert (scalar_arg == 1); + simd_opcode = MINT_SIMD_INTRINS_P_PP; + if (atype == MONO_TYPE_I1) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I1_RIGHT_SHIFT; + else if (atype == MONO_TYPE_I2) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I2_RIGHT_SHIFT; + else if (atype == MONO_TYPE_I4) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I4_RIGHT_SHIFT; + else if (atype == MONO_TYPE_U1) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I1_URIGHT_SHIFT; + else if (atype == MONO_TYPE_U2) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I2_URIGHT_SHIFT; + else if (atype == MONO_TYPE_U4) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I4_URIGHT_SHIFT; + break; + case SN_op_Subtraction: + simd_opcode = 
MINT_SIMD_INTRINS_P_PP; + if (atype == MONO_TYPE_I1 || atype == MONO_TYPE_U1) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I1_SUB; + else if (atype == MONO_TYPE_I2 || atype == MONO_TYPE_U2) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I2_SUB; + else if (atype == MONO_TYPE_I4 || atype == MONO_TYPE_U4) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I4_SUB; + break; + case SN_op_UnaryNegation: + simd_opcode = MINT_SIMD_INTRINS_P_P; + if (atype == MONO_TYPE_I1 || atype == MONO_TYPE_U1) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I1_NEGATION; + else if (atype == MONO_TYPE_I2 || atype == MONO_TYPE_U2) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I2_NEGATION; + else if (atype == MONO_TYPE_I4 || atype == MONO_TYPE_U4) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I4_NEGATION; + break; + case SN_op_UnsignedRightShift: + g_assert (scalar_arg == 1); + simd_opcode = MINT_SIMD_INTRINS_P_PP; + if (arg_size == 1) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I1_URIGHT_SHIFT; + else if (arg_size == 2) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I2_URIGHT_SHIFT; + else if (arg_size == 4) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I4_URIGHT_SHIFT; + else if (arg_size == 8) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I8_URIGHT_SHIFT; + break; + } + + if (simd_opcode == -1 || simd_intrins == -1) { + return FALSE; + } + + interp_add_ins (td, simd_opcode); + td->last_ins->data [0] = simd_intrins; + +opcode_added: + td->sp -= csignature->param_count; + for (int i = 0; i < csignature->param_count; i++) + td->last_ins->sregs [i] = td->sp [i].local; + + g_assert (csignature->ret->type != MONO_TYPE_VOID); + int ret_mt = mint_type (csignature->ret); + if (ret_mt == MINT_TYPE_VT) { + // For these intrinsics, if we return a VT then it is a V128 + push_type_vt (td, vector_klass, vector_size); + } else { + push_simple_type (td, stack_type [ret_mt]); + } + interp_ins_set_dreg (td->last_ins, td->sp [-1].local); + td->ip += 5; + return TRUE; +} + +static gboolean +interp_emit_simd_intrinsics (TransformData *td, MonoMethod 
*cmethod, MonoMethodSignature *csignature) +{ + const char *class_name; + const char *class_ns; + MonoImage *image = m_class_get_image (cmethod->klass); + + if (image != mono_get_corlib ()) + return FALSE; + + class_ns = m_class_get_name_space (cmethod->klass); + class_name = m_class_get_name (cmethod->klass); + + if (!strcmp (class_ns, "System.Runtime.Intrinsics")) { + if (!strcmp (class_name, "Vector128")) + return emit_sri_vector128 (td, cmethod, csignature); + else if (!strcmp (class_name, "Vector128`1")) + return emit_sri_vector128_t (td, cmethod, csignature); + } + return FALSE; +} diff --git a/src/mono/mono/mini/interp/transform.c b/src/mono/mono/mini/interp/transform.c index 09878270c91ea..47e263137e595 100644 --- a/src/mono/mono/mini/interp/transform.c +++ b/src/mono/mono/mini/interp/transform.c @@ -1884,6 +1884,9 @@ interp_handle_intrinsics (TransformData *td, MonoMethod *target_method, MonoClas klass_name_space = m_class_get_name_space (target_method->klass); const char *klass_name = m_class_get_name (target_method->klass); + if (interp_emit_simd_intrinsics (td, target_method, csignature)) + return TRUE; + if (target_method->klass == mono_defaults.string_class) { if (tm [0] == 'g') { if (strcmp (tm, "get_Chars") == 0) @@ -10831,3 +10834,5 @@ mono_jiterp_insert_ins (TransformData *td, InterpInst *prev_ins, int opcode) } #endif + +#include "transform-simd.c" diff --git a/src/mono/mono/mini/interp/transform.h b/src/mono/mono/mini/interp/transform.h index eaa05a137865f..362f8c9afafb0 100644 --- a/src/mono/mono/mini/interp/transform.h +++ b/src/mono/mono/mini/interp/transform.h @@ -371,4 +371,8 @@ mono_jiterp_insert_ins (TransformData *td, InterpInst *prev_ins, int opcode); void mono_interp_print_td_code (TransformData *td); +/* Forward definitions for simd methods */ +static gboolean +interp_emit_simd_intrinsics (TransformData *td, MonoMethod *cmethod, MonoMethodSignature *csignature); + #endif /* __MONO_MINI_INTERP_TRANSFORM_H__ */ From 
d42c15cf3a0b4f505f16c31dd00f4ebd68debeb3 Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Tue, 7 Feb 2023 16:55:10 +0200 Subject: [PATCH 4/9] [mono][interp] Add option to disable simd intrinsics --- src/mono/mono/mini/interp/interp.c | 2 ++ src/mono/mono/mini/interp/interp.h | 3 ++- src/mono/mono/mini/interp/transform.c | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/mono/mono/mini/interp/interp.c b/src/mono/mono/mini/interp/interp.c index f28afbc957992..aa5887c19c1bf 100644 --- a/src/mono/mono/mini/interp/interp.c +++ b/src/mono/mono/mini/interp/interp.c @@ -7858,6 +7858,8 @@ interp_parse_options (const char *options) opt = INTERP_OPT_BBLOCKS; else if (strncmp (arg, "tiering", 7) == 0) opt = INTERP_OPT_TIERING; + else if (strncmp (arg, "simd", 4) == 0) + opt = INTERP_OPT_SIMD; else if (strncmp (arg, "all", 3) == 0) opt = ~INTERP_OPT_NONE; diff --git a/src/mono/mono/mini/interp/interp.h b/src/mono/mono/mini/interp/interp.h index df813b3fead99..6e39f7de763df 100644 --- a/src/mono/mono/mini/interp/interp.h +++ b/src/mono/mono/mini/interp/interp.h @@ -35,7 +35,8 @@ enum { INTERP_OPT_SUPER_INSTRUCTIONS = 4, INTERP_OPT_BBLOCKS = 8, INTERP_OPT_TIERING = 16, - INTERP_OPT_DEFAULT = INTERP_OPT_INLINE | INTERP_OPT_CPROP | INTERP_OPT_SUPER_INSTRUCTIONS | INTERP_OPT_BBLOCKS | INTERP_OPT_TIERING + INTERP_OPT_SIMD = 32, + INTERP_OPT_DEFAULT = INTERP_OPT_INLINE | INTERP_OPT_CPROP | INTERP_OPT_SUPER_INSTRUCTIONS | INTERP_OPT_BBLOCKS | INTERP_OPT_TIERING | INTERP_OPT_SIMD }; typedef struct _InterpMethodArguments InterpMethodArguments; diff --git a/src/mono/mono/mini/interp/transform.c b/src/mono/mono/mini/interp/transform.c index 47e263137e595..dfcdb7ea34791 100644 --- a/src/mono/mono/mini/interp/transform.c +++ b/src/mono/mono/mini/interp/transform.c @@ -1884,7 +1884,7 @@ interp_handle_intrinsics (TransformData *td, MonoMethod *target_method, MonoClas klass_name_space = m_class_get_name_space (target_method->klass); const char *klass_name = 
m_class_get_name (target_method->klass); - if (interp_emit_simd_intrinsics (td, target_method, csignature)) + if ((mono_interp_opt & INTERP_OPT_SIMD) && interp_emit_simd_intrinsics (td, target_method, csignature)) return TRUE; if (target_method->klass == mono_defaults.string_class) { From f69a049d2b557089fcc24168ea7a9d949427f508 Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Tue, 7 Feb 2023 17:45:12 +0200 Subject: [PATCH 5/9] [mono][interp] Disable simd intrinsics by default on wasm These intrinsics are not yet implemented on jiterpreter, making it slightly slower instead. --- src/mono/mono/mini/interp/interp-internals.h | 4 ++++ src/mono/mono/mini/interp/interp-simd.c | 6 ++++++ src/mono/mono/mini/interp/interp.c | 17 ++++++++++++++++- src/mono/mono/mini/interp/transform.c | 4 ++++ 4 files changed, 30 insertions(+), 1 deletion(-) diff --git a/src/mono/mono/mini/interp/interp-internals.h b/src/mono/mono/mini/interp/interp-internals.h index cecefced4be3f..2ab6f22e19ccd 100644 --- a/src/mono/mono/mini/interp/interp-internals.h +++ b/src/mono/mono/mini/interp/interp-internals.h @@ -102,6 +102,10 @@ typedef enum { #define PROFILE_INTERP 0 +#ifndef HOST_BROWSER +#define INTERP_ENABLE_SIMD +#endif + #define INTERP_IMETHOD_TAG_1(im) ((gpointer)((mono_u)(im) | 1)) #define INTERP_IMETHOD_IS_TAGGED_1(im) ((mono_u)(im) & 1) #define INTERP_IMETHOD_UNTAG_1(im) ((InterpMethod*)((mono_u)(im) & ~1)) diff --git a/src/mono/mono/mini/interp/interp-simd.c b/src/mono/mono/mini/interp/interp-simd.c index a6966e192af89..d84ef28a46cb5 100644 --- a/src/mono/mono/mini/interp/interp-simd.c +++ b/src/mono/mono/mini/interp/interp-simd.c @@ -1,5 +1,9 @@ + +#include "interp-internals.h" #include "interp-simd.h" +#ifdef INTERP_ENABLE_SIMD + typedef gint64 v128_i8 __attribute__ ((vector_size (SIZEOF_V128))); typedef guint64 v128_u8 __attribute__ ((vector_size (SIZEOF_V128))); typedef gint32 v128_i4 __attribute__ ((vector_size (SIZEOF_V128))); @@ -595,3 +599,5 @@ PPPP_SIMD_Method
interp_simd_p_ppp_table [] = { }; #undef INTERP_SIMD_INTRINSIC_P_PPP #define INTERP_SIMD_INTRINSIC_P_PPP(a,b) + +#endif // INTERP_ENABLE_SIMD diff --git a/src/mono/mono/mini/interp/interp.c b/src/mono/mono/mini/interp/interp.c index aa5887c19c1bf..c9fc2d947e5e1 100644 --- a/src/mono/mono/mini/interp/interp.c +++ b/src/mono/mono/mini/interp/interp.c @@ -65,7 +65,10 @@ #include "mintops.h" #include "interp-intrins.h" #include "tiering.h" + +#ifdef INTERP_ENABLE_SIMD #include "interp-simd.h" +#endif #include #include @@ -5792,7 +5795,7 @@ MINT_IN_CASE(MINT_BRTRUE_I8_SP) ZEROP_SP(gint64, !=); MINT_IN_BREAK; ip += 7; goto call; } - +#ifdef INTERP_ENABLE_SIMD MINT_IN_CASE(MINT_SIMD_V128_LDC) { memcpy (locals + ip [1], ip + 2, SIZEOF_V128); ip += 10; @@ -5859,6 +5862,18 @@ MINT_IN_CASE(MINT_BRTRUE_I8_SP) ZEROP_SP(gint64, !=); MINT_IN_BREAK; interp_simd_p_ppp_table [ip [5]] (locals + ip [1], locals + ip [2], locals + ip [3], locals + ip [4]); ip += 6; MINT_IN_BREAK; +#else + MINT_IN_CASE(MINT_SIMD_V128_LDC) + MINT_IN_CASE(MINT_SIMD_V128_I1_CREATE) + MINT_IN_CASE(MINT_SIMD_V128_I2_CREATE) + MINT_IN_CASE(MINT_SIMD_V128_I4_CREATE) + MINT_IN_CASE(MINT_SIMD_V128_I8_CREATE) + MINT_IN_CASE(MINT_SIMD_INTRINS_P_P) + MINT_IN_CASE(MINT_SIMD_INTRINS_P_PP) + MINT_IN_CASE(MINT_SIMD_INTRINS_P_PPP) + g_assert_not_reached (); + MINT_IN_BREAK; +#endif MINT_IN_CASE(MINT_INTRINS_SPAN_CTOR) { gpointer ptr = LOCAL_VAR (ip [2], gpointer); diff --git a/src/mono/mono/mini/interp/transform.c b/src/mono/mono/mini/interp/transform.c index dfcdb7ea34791..1d05d402d5fa5 100644 --- a/src/mono/mono/mini/interp/transform.c +++ b/src/mono/mono/mini/interp/transform.c @@ -1884,8 +1884,10 @@ interp_handle_intrinsics (TransformData *td, MonoMethod *target_method, MonoClas klass_name_space = m_class_get_name_space (target_method->klass); const char *klass_name = m_class_get_name (target_method->klass); +#ifdef INTERP_ENABLE_SIMD if ((mono_interp_opt & INTERP_OPT_SIMD) && interp_emit_simd_intrinsics (td, 
target_method, csignature)) return TRUE; +#endif if (target_method->klass == mono_defaults.string_class) { if (tm [0] == 'g') { @@ -10835,4 +10837,6 @@ mono_jiterp_insert_ins (TransformData *td, InterpInst *prev_ins, int opcode) #endif +#ifdef INTERP_ENABLE_SIMD #include "transform-simd.c" +#endif From db74dd589392e7de047b9755f9324fb01efccb83 Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Tue, 7 Feb 2023 22:11:40 +0200 Subject: [PATCH 6/9] [mono][interp] Replace v128_create with v128_ldc if possible v128_create receives as an argument every single element of the vector. This method is typically used with constants. For a Vector128<short> this means that creating a constant vector required 8 ldc.i4 and a v128_create. We can instead use a single instruction and embed the vector value in the code stream directly. --- src/mono/mono/mini/interp/mintops.h | 1 + src/mono/mono/mini/interp/transform.c | 66 +++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) diff --git a/src/mono/mono/mini/interp/mintops.h b/src/mono/mono/mini/interp/mintops.h index 10b021698bc36..a1ae2fda8bad6 100644 --- a/src/mono/mono/mini/interp/mintops.h +++ b/src/mono/mono/mini/interp/mintops.h @@ -229,6 +229,7 @@ typedef enum { #define MINT_IS_STIND_INT(op) ((op) >= MINT_STIND_I1 && (op) <= MINT_STIND_I8) #define MINT_IS_STIND(op) ((op) >= MINT_STIND_I1 && (op) <= MINT_STIND_REF) #define MINT_IS_LDIND_OFFSET(op) ((op) >= MINT_LDIND_OFFSET_I1 && (op) <= MINT_LDIND_OFFSET_I8) +#define MINT_IS_SIMD_CREATE(op) ((op) >= MINT_SIMD_V128_I1_CREATE && (op) <= MINT_SIMD_V128_I8_CREATE) // TODO Add more #define MINT_NO_SIDE_EFFECTS(op) (MINT_IS_MOV (op) || MINT_IS_LDC_I4 (op) || MINT_IS_LDC_I8 (op) || op == MINT_MONO_LDPTR) diff --git a/src/mono/mono/mini/interp/transform.c b/src/mono/mono/mini/interp/transform.c index 1d05d402d5fa5..aa43b70f2de54 100644 --- a/src/mono/mono/mini/interp/transform.c +++ b/src/mono/mono/mini/interp/transform.c @@ -8925,6 +8925,70 @@ interp_fold_binop_cond_br (TransformData
*td, InterpBasicBlock *cbb, LocalValue return ins; } +static void +write_v128_element (gpointer v128_addr, LocalValue *val, int index, int el_size) +{ + gpointer el_addr = (gint8*)v128_addr + index * el_size; + g_assert ((gint8*)el_addr + el_size <= ((gint8*)v128_addr + 16)); /* the whole element, not just its first byte, must lie inside the 16-byte vector */ + switch (el_size) { + case 1: *(gint8*)el_addr = (gint8)val->i; break; + case 2: *(gint16*)el_addr = (gint16)val->i; break; + case 4: *(gint32*)el_addr = val->i; break; + case 8: *(gint64*)el_addr = val->l; break; + default: + g_assert_not_reached (); + } +} + +static InterpInst* +interp_fold_simd_create (TransformData *td, InterpBasicBlock *cbb, LocalValue *local_defs, InterpInst *ins) +{ + int *local_ref_count = td->local_ref_count; + + int *args = ins->info.call_info->call_args; + int index = 0; + int var = args [index]; + while (var != -1) { + LocalValue *val = &local_defs [var]; + if (val->type != LOCAL_VALUE_I4 && val->type != LOCAL_VALUE_I8) + return ins; + index++; + var = args [index]; + } + + // If we reached this point, it means that all args of the simd_create are constants + // We can replace the simd_create with simd_ldc + int el_size = 16 / index; + int dreg = ins->dreg; + + ins = interp_insert_ins (td, ins, MINT_SIMD_V128_LDC); + interp_clear_ins (ins->prev); + interp_ins_set_dreg (ins, dreg); + + gpointer v128_addr = &ins->data [0]; + + index = 0; + var = args [index]; + while (var != -1) { + LocalValue *val = &local_defs [var]; + write_v128_element (v128_addr, val, index, el_size); + val->ref_count--; + local_ref_count [var]--; + index++; + var = args [index]; + } + + if (td->verbose_level) { + g_print ("Fold simd create:\n\t"); + dump_interp_inst (ins); + } + + local_defs [dreg].ins = ins; + local_defs [dreg].type = LOCAL_VALUE_NONE; + + return ins; +} + static void cprop_sreg (TransformData *td, InterpInst *ins, int *psreg, LocalValue *local_defs) { @@ -9171,6 +9235,8 @@ interp_cprop (TransformData *td) ins = interp_fold_unop (td, local_defs, ins); } else if
(MINT_IS_UNOP_CONDITIONAL_BRANCH (opcode)) { ins = interp_fold_unop_cond_br (td, bb, local_defs, ins); + } else if (MINT_IS_SIMD_CREATE (opcode)) { + ins = interp_fold_simd_create (td, bb, local_defs, ins); } else if (MINT_IS_BINOP (opcode)) { gboolean folded; ins = interp_fold_binop (td, local_defs, ins, &folded); From 8ffac0787d3c6a71bd6f8ae08c2c11861e970c51 Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Thu, 9 Feb 2023 10:44:55 +0200 Subject: [PATCH 7/9] [mono][interp] Remove op_Division It is actually not used in bcl, it is not really vectorized on any platforms and the codegen for the interp implementation is massive and inefficient. --- .../mono/mini/interp/interp-simd-intrins.def | 7 ---- src/mono/mono/mini/interp/interp-simd.c | 37 ------------------- src/mono/mono/mini/interp/simd-methods.def | 1 - src/mono/mono/mini/interp/transform-simd.c | 11 ------ 4 files changed, 56 deletions(-) diff --git a/src/mono/mono/mini/interp/interp-simd-intrins.def b/src/mono/mono/mini/interp/interp-simd-intrins.def index 3e933168662f0..57bbba1717d7b 100644 --- a/src/mono/mono/mini/interp/interp-simd-intrins.def +++ b/src/mono/mono/mini/interp/interp-simd-intrins.def @@ -9,13 +9,6 @@ INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_SUB, interp_v128_i4_op INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_BITWISE_AND, interp_v128_op_bitwise_and) INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_BITWISE_OR, interp_v128_op_bitwise_or) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_DIVISION, interp_v128_i1_op_division) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_U1_DIVISION, interp_v128_u1_op_division) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_DIVISION, interp_v128_i2_op_division) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_U2_DIVISION, interp_v128_u2_op_division) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_DIVISION, interp_v128_i4_op_division) -INTERP_SIMD_INTRINSIC_P_PP 
(INTERP_SIMD_INTRINSIC_V128_U4_DIVISION, interp_v128_u4_op_division) - INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_BITWISE_EQUALITY, interp_v128_op_bitwise_equality) INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_BITWISE_INEQUALITY, interp_v128_op_bitwise_inequality) diff --git a/src/mono/mono/mini/interp/interp-simd.c b/src/mono/mono/mini/interp/interp-simd.c index d84ef28a46cb5..2d5bb6b50ae27 100644 --- a/src/mono/mono/mini/interp/interp-simd.c +++ b/src/mono/mono/mini/interp/interp-simd.c @@ -72,43 +72,6 @@ interp_v128_op_bitwise_or (gpointer res, gpointer v1, gpointer v2) *(v128_i4*)res = *(v128_i4*)v1 | *(v128_i4*)v2; } -// op_Division -static void -interp_v128_i1_op_division (gpointer res, gpointer v1, gpointer v2) -{ - *(v128_i1*)res = *(v128_i1*)v1 / *(v128_i1*)v2; -} - -static void -interp_v128_u1_op_division (gpointer res, gpointer v1, gpointer v2) -{ - *(v128_u1*)res = *(v128_u1*)v1 / *(v128_u1*)v2; -} - -static void -interp_v128_i2_op_division (gpointer res, gpointer v1, gpointer v2) -{ - *(v128_i2*)res = *(v128_i2*)v1 / *(v128_i2*)v2; -} - -static void -interp_v128_u2_op_division (gpointer res, gpointer v1, gpointer v2) -{ - *(v128_u2*)res = *(v128_u2*)v1 / *(v128_u2*)v2; -} - -static void -interp_v128_i4_op_division (gpointer res, gpointer v1, gpointer v2) -{ - *(v128_i4*)res = *(v128_i4*)v1 / *(v128_i4*)v2; -} - -static void -interp_v128_u4_op_division (gpointer res, gpointer v1, gpointer v2) -{ - *(v128_u4*)res = *(v128_u4*)v1 / *(v128_u4*)v2; -} - // op_Equality static void interp_v128_op_bitwise_equality (gpointer res, gpointer v1, gpointer v2) diff --git a/src/mono/mono/mini/interp/simd-methods.def b/src/mono/mono/mini/interp/simd-methods.def index 6b3ef8e5d9c61..57b87d028de94 100644 --- a/src/mono/mono/mini/interp/simd-methods.def +++ b/src/mono/mono/mini/interp/simd-methods.def @@ -7,7 +7,6 @@ SIMD_METHOD(get_Zero) SIMD_METHOD(op_Addition) SIMD_METHOD(op_BitwiseAnd) SIMD_METHOD(op_BitwiseOr) -SIMD_METHOD(op_Division) 
SIMD_METHOD(op_Equality) SIMD_METHOD(op_ExclusiveOr) SIMD_METHOD(op_Explicit) diff --git a/src/mono/mono/mini/interp/transform-simd.c b/src/mono/mono/mini/interp/transform-simd.c index 1b2c94b53436b..9f78b8db34296 100644 --- a/src/mono/mono/mini/interp/transform-simd.c +++ b/src/mono/mono/mini/interp/transform-simd.c @@ -72,7 +72,6 @@ static guint16 sri_vector128_t_methods [] = { SN_op_Addition, SN_op_BitwiseAnd, SN_op_BitwiseOr, - SN_op_Division, SN_op_Equality, SN_op_ExclusiveOr, SN_op_Inequality, @@ -349,16 +348,6 @@ emit_sri_vector128_t (TransformData *td, MonoMethod *cmethod, MonoMethodSignatur simd_opcode = MINT_SIMD_INTRINS_P_PP; simd_intrins = INTERP_SIMD_INTRINSIC_V128_BITWISE_OR; break; - case SN_op_Division: - g_assert (scalar_arg == -1); - simd_opcode = MINT_SIMD_INTRINS_P_PP; - if (atype == MONO_TYPE_I1) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I1_DIVISION; - else if (atype == MONO_TYPE_U1) simd_intrins = INTERP_SIMD_INTRINSIC_V128_U1_DIVISION; - else if (atype == MONO_TYPE_I2) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I2_DIVISION; - else if (atype == MONO_TYPE_U2) simd_intrins = INTERP_SIMD_INTRINSIC_V128_U2_DIVISION; - else if (atype == MONO_TYPE_I4) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I4_DIVISION; - else if (atype == MONO_TYPE_U4) simd_intrins = INTERP_SIMD_INTRINSIC_V128_U4_DIVISION; - break; case SN_op_Equality: if (atype != MONO_TYPE_R4 && atype != MONO_TYPE_R8) { simd_opcode = MINT_SIMD_INTRINS_P_PP; From 629cd94b141e26c5c39074c456d2e68b69f35937 Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Mon, 13 Feb 2023 10:59:26 +0200 Subject: [PATCH 8/9] [mono][interp] Don't emit intrinsics for unsupported vector types --- src/mono/mono/mini/interp/transform-simd.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/mono/mono/mini/interp/transform-simd.c b/src/mono/mono/mini/interp/transform-simd.c index 9f78b8db34296..ed2ef76c880ef 100644 --- a/src/mono/mono/mini/interp/transform-simd.c +++ b/src/mono/mono/mini/interp/transform-simd.c 
@@ -104,8 +104,13 @@ emit_sri_vector128 (TransformData *td, MonoMethod *cmethod, MonoMethodSignature vector_klass = mono_class_from_mono_type_internal (csignature->params [0]); if (!m_class_is_simd_type (vector_klass)) return FALSE; + MonoType *arg_type = mono_class_get_context (vector_klass)->class_inst->type_argv [0]; + if (!mono_type_is_primitive (arg_type)) + return FALSE; MonoTypeEnum atype = arg_type->type; + if (atype == MONO_TYPE_BOOLEAN) + return FALSE; int vector_size = mono_class_value_size (vector_klass, NULL); int arg_size = mono_class_value_size (mono_class_from_mono_type_internal (arg_type), NULL); g_assert (vector_size == SIZEOF_V128); @@ -280,7 +285,11 @@ emit_sri_vector128_t (TransformData *td, MonoMethod *cmethod, MonoMethodSignatur // First argument is always vector MonoClass *vector_klass = cmethod->klass; MonoType *arg_type = mono_class_get_context (vector_klass)->class_inst->type_argv [0]; + if (!mono_type_is_primitive (arg_type)) + return FALSE; MonoTypeEnum atype = arg_type->type; + if (atype == MONO_TYPE_BOOLEAN) + return FALSE; int vector_size = mono_class_value_size (vector_klass, NULL); int arg_size = mono_class_value_size (mono_class_from_mono_type_internal (arg_type), NULL); g_assert (vector_size == SIZEOF_V128); From 719523d68bd1d8f2a075795fcfb6fd8adc39d018 Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Mon, 13 Feb 2023 11:14:22 +0200 Subject: [PATCH 9/9] [mono][interp] Vector extensions used in these intrinsics are a GNUC extension --- src/mono/mono/mini/interp/interp-internals.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mono/mono/mini/interp/interp-internals.h b/src/mono/mono/mini/interp/interp-internals.h index 2ab6f22e19ccd..303bb1af21c9c 100644 --- a/src/mono/mono/mini/interp/interp-internals.h +++ b/src/mono/mono/mini/interp/interp-internals.h @@ -102,7 +102,7 @@ typedef enum { #define PROFILE_INTERP 0 -#ifndef HOST_BROWSER +#if !HOST_BROWSER && __GNUC__ #define INTERP_ENABLE_SIMD #endif