From ec5ac992e5f5c23a5435930aa66b5dd0f622523d Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Thu, 25 Jul 2024 11:21:10 -0700 Subject: [PATCH] Update mono to support shuffle for constant inputs (#105299) * Support mono creating xconst in a few more places * Update mono to support shuffle for constant inputs * Ensure that arm64 also accelerates shuffle for non-constant inputs * Ensure OP_XZERO and OP_XONES are recognized as being constant * Ensure shuffle creates a correct instruction when the fsig doesn't match the necessary parameter count * Ensure that getting the index for floating-point shuffle is possible * Ensure the right class handle is passed down to LLVM so overload resolution can function * Make sure we update the original xconst if we mutate it * Return a new constant instead of mutating the existing one * Insert relevant xcast nodes * Add some asserts around the ecount * Ensure we get the right element type * Ensure we don't create nodes unnecessarily for create_elementwise * Ensure that create_elementwise still works for other vector sizes * Ensure indentation of switch cases is correct for Mono --- src/mono/mono/mini/simd-intrinsics.c | 628 +++++++++++++++++++++++++-- 1 file changed, 585 insertions(+), 43 deletions(-) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 44fc12ac0a232..54588a2f2dfdf 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -83,6 +83,31 @@ mono_emit_simd_field_load (MonoCompile *cfg, MonoClassField *field, MonoInst *ad return NULL; } +static gboolean +is_const (const MonoInst* ins) +{ + switch (ins->opcode) { + case OP_ICONST: + case OP_I8CONST: + case OP_R4CONST: + case OP_R8CONST: + return TRUE; + } + return FALSE; +} + +static gboolean +is_xconst (const MonoInst* ins) +{ + switch (ins->opcode) { + case OP_XCONST: + case OP_XZERO: + case OP_XONES: + return TRUE; + } + return FALSE; +} + static gboolean is_zero_const (const MonoInst* ins) { @@ -647,6 +672,60 @@ emit_xconst_v128 (MonoCompile *cfg, MonoClass *klass, guint8 value[16]) return ins; } +static guint64 +get_xconst_int_elem (MonoCompile *cfg, MonoInst *ins, MonoTypeEnum etype, int index) +{ + guint8 cns_vec[16]; + if (ins->opcode == OP_XZERO) { + memset (cns_vec, 0x00, 16); + } else if (ins->opcode == OP_XONES) { + memset (cns_vec, 0xFF, 16); + } else { + g_assert (ins->opcode == OP_XCONST); + memcpy (cns_vec, ins->inst_p0, 16); + } + g_assert (index >= 0); + switch (etype) { + case MONO_TYPE_I1: { + g_assert (index < 16); + return ((gint8*)cns_vec) [index]; + } + case MONO_TYPE_U1: { + g_assert (index < 16); + return ((guint8*)cns_vec) [index]; + } + case MONO_TYPE_I2: { + g_assert (index < 8); + return ((gint16*)cns_vec) [index]; + } + case MONO_TYPE_U2: { + g_assert (index < 8); + return ((guint16*)cns_vec) [index]; + } + case MONO_TYPE_I4: + case MONO_TYPE_R4: { + g_assert (index < 4); + return ((gint32*)cns_vec) [index]; + } + case MONO_TYPE_U4: { + g_assert (index < 4); + return ((guint32*)cns_vec) [index]; + } + case MONO_TYPE_I8: + case MONO_TYPE_R8: { + g_assert (index < 2); + return ((gint64*)cns_vec) [index]; + } + case MONO_TYPE_U8: { + g_assert (index < 2); + return ((guint64*)cns_vec) [index]; + } + default: { + g_assert_not_reached (); + } + } +} + #ifdef TARGET_ARM64 static int type_to_extract_op (MonoTypeEnum type); static MonoType* get_vector_t_elem_type (MonoType *vector_type); @@ -1076,34 +1155,393 @@ emit_vector_insert_element ( int op = type_to_insert_op (type); if
(is_zero_inited && is_zero_const (element)) { - // element already set to zero + // element already set to zero + return ins; + } + + if (is_xconst (ins) && is_const (element)) { + // Specially handle insertion of a constant into a constant + int vector_size = mono_class_value_size (vklass, NULL); + if (vector_size == 16) { + guint8 cns_vec[16]; + if (ins->opcode == OP_XZERO) { + memset (cns_vec, 0x00, 16); + } else if (ins->opcode == OP_XONES) { + memset (cns_vec, 0xFF, 16); + } else { + g_assert (ins->opcode == OP_XCONST); + memcpy (cns_vec, ins->inst_p0, 16); + } + if (type_enum_is_float (type)) { + double cns_val; + if (element->opcode == OP_R4CONST) { + cns_val = *(const float*)(element->inst_p0); + } else { + g_assert (element->opcode == OP_R8CONST); + cns_val = *(const double*)(element->inst_p0); + } + switch (type) { + case MONO_TYPE_R4: { + ((float*)cns_vec) [index] = (float)cns_val; + break; + } + case MONO_TYPE_R8: { + ((double*)cns_vec) [index] = (double)cns_val; + break; + } + default: { + g_assert_not_reached (); + } + } + } else { + gint64 cns_val; + if (element->opcode == OP_ICONST) { + cns_val = GTMREG_TO_INT (element->inst_c0); + } else { + g_assert (element->opcode == OP_I8CONST); + cns_val = element->inst_l; + } + switch (type) { + case MONO_TYPE_I1: + case MONO_TYPE_U1: { + ((guint8*)cns_vec) [index] = (guint8)cns_val; + break; + } + case MONO_TYPE_I2: + case MONO_TYPE_U2: { + ((guint16*)cns_vec) [index] = (guint16)cns_val; + break; + } + case MONO_TYPE_I4: + case MONO_TYPE_U4: { + ((guint32*)cns_vec) [index] = (guint32)cns_val; + break; + } + case MONO_TYPE_I8: + case MONO_TYPE_U8: { + ((guint64*)cns_vec) [index] = (guint64)cns_val; + break; + } + default: { + g_assert_not_reached (); + } + } + } + return emit_xconst_v128 (cfg, vklass, cns_vec); + } + } + #ifdef TARGET_ARM64 - } else if (!COMPILE_LLVM (cfg) && element->opcode == type_to_extract_op (type) && + if (!COMPILE_LLVM (cfg) && element->opcode == type_to_extract_op (type) && (type == MONO_TYPE_R4 || type == MONO_TYPE_R8)) { // OP_INSERT_Ix inserts from GP reg, not SIMD. Cannot optimize for int types. ins = emit_simd_ins (cfg, vklass, op, ins->dreg, element->sreg1); ins->inst_c0 = index | ((element->inst_c0) << 8); ins->inst_c1 = type; + return ins; + } #endif + + ins = emit_simd_ins (cfg, vklass, op, ins->dreg, element->dreg); + ins->inst_c0 = index; + ins->inst_c1 = type; + + return ins; +} + +static MonoInst * +emit_vector_create_broadcast ( + MonoCompile *cfg, MonoClass *vklass, MonoType *etype, MonoInst *arg0) +{ + int vector_size = mono_class_value_size (vklass, NULL); + if (vector_size == 16) { + // We want to handle constant inputs and create constant nodes so other import + // optimizations can be enabled. 
+ if (is_const (arg0)) { + guint8 cns_vec[16]; + if (type_enum_is_float (etype->type)) { + double cns_val; + if (arg0->opcode == OP_R4CONST) { + cns_val = *(const float*)(arg0->inst_p0); + } else { + g_assert (arg0->opcode == OP_R8CONST); + cns_val = *(const double*)(arg0->inst_p0); + } + switch (etype->type) { + case MONO_TYPE_R4: { + for (int i = 0; i < vector_size / 4; i++) { + ((float*)cns_vec) [i] = (float)cns_val; + } + break; + } + case MONO_TYPE_R8: { + for (int i = 0; i < vector_size / 8; i++) { + ((double*)cns_vec) [i] = (double)cns_val; + } + break; + } + default: { + g_assert_not_reached (); + } + } + } else { + gint64 cns_val; + if (arg0->opcode == OP_ICONST) { + cns_val = GTMREG_TO_INT (arg0->inst_c0); + } else { + g_assert (arg0->opcode == OP_I8CONST); + cns_val = arg0->inst_l; + } + switch (etype->type) { + case MONO_TYPE_I1: + case MONO_TYPE_U1: { + for (int i = 0; i < vector_size / 1; i++) { + ((guint8*)cns_vec) [i] = (guint8)cns_val; + } + break; + } + case MONO_TYPE_I2: + case MONO_TYPE_U2: { + for (int i = 0; i < vector_size / 2; i++) { + ((guint16*)cns_vec) [i] = (guint16)cns_val; + } + break; + } + case MONO_TYPE_I4: + case MONO_TYPE_U4: { + for (int i = 0; i < vector_size / 4; i++) { + ((guint32*)cns_vec) [i] = (guint32)cns_val; + } + break; + } + case MONO_TYPE_I8: + case MONO_TYPE_U8: { + for (int i = 0; i < vector_size / 8; i++) { + ((guint64*)cns_vec) [i] = (guint64)cns_val; + } + break; + } + default: { + g_assert_not_reached (); + } + } + } + return emit_xconst_v128 (cfg, vklass, (guint8*)cns_vec); + } + } + MonoInst* ins = emit_simd_ins (cfg, vklass, type_to_expand_op (etype->type), arg0->dreg, -1); + ins->inst_c1 = etype->type; + return ins; +} + +static MonoInst * +emit_vector_create_elementwise ( + MonoCompile *cfg, MonoClass *vklass, MonoType *etype, MonoInst **args, int param_count) +{ + // We want to handle constant inputs and create constant nodes so other import + // optimizations can be enabled. 
This includes recognizing partial constants + // and only performing the minimal number of inserts required + + gboolean all_const = true; + gboolean some_const = false; + + guint8 cns_vec[16]; + memset (cns_vec, 0x00, 16); + + int vector_size = mono_class_value_size (vklass, NULL); + if (vector_size == 16) { + for (int i = 0; i < param_count; ++i) { + if (!is_const (args[i])) { + all_const = false; + break; + } + + some_const = true; + + if (type_enum_is_float (etype->type)) { + double cns_val; + if (args[i]->opcode == OP_R4CONST) { + cns_val = *(const float*)(args[i]->inst_p0); + } else { + g_assert (args[i]->opcode == OP_R8CONST); + cns_val = *(const double*)(args[i]->inst_p0); + } + + switch (etype->type) { + case MONO_TYPE_R4: { + ((float*)cns_vec) [i] = (float)cns_val; + break; + } + case MONO_TYPE_R8: { + ((double*)cns_vec) [i] = (double)cns_val; + break; + } + default: { + g_assert_not_reached (); + } + } + } else { + gint64 cns_val; + if (args[i]->opcode == OP_ICONST) { + cns_val = GTMREG_TO_INT (args[i]->inst_c0); + } else { + g_assert (args[i]->opcode == OP_I8CONST); + cns_val = args[i]->inst_l; + } + + switch (etype->type) { + case MONO_TYPE_I1: + case MONO_TYPE_U1: { + ((guint8*)cns_vec) [i] = (guint8)cns_val; + break; + } + case MONO_TYPE_I2: + case MONO_TYPE_U2: { + ((guint16*)cns_vec) [i] = (guint16)cns_val; + break; + } + case MONO_TYPE_I4: + case MONO_TYPE_U4: { + ((guint32*)cns_vec) [i] = (guint32)cns_val; + break; + } + case MONO_TYPE_I8: + case MONO_TYPE_U8: { + ((guint64*)cns_vec) [i] = (guint64)cns_val; + break; + } + default: { + g_assert_not_reached (); + } + } + } + } + } + + if (all_const) { + return emit_xconst_v128 (cfg, vklass, (guint8*)cns_vec); + } + + MonoInst *ins; + + if (some_const) { + ins = emit_xconst_v128 (cfg, vklass, (guint8*)cns_vec); } else { - ins = emit_simd_ins (cfg, vklass, op, ins->dreg, element->dreg); - ins->inst_c0 = index; - ins->inst_c1 = type; + ins = emit_xzero (cfg, vklass); + } + + for (int i = 0; i < param_count; ++i) { + if (!is_const (args[i]) || (vector_size != 16)) { + ins = emit_vector_insert_element (cfg, vklass, ins, etype->type, args[i], i, TRUE); + } } return ins; } static MonoInst * -emit_vector_create_elementwise ( - MonoCompile *cfg, MonoMethodSignature *fsig, MonoType *vtype, - MonoTypeEnum type, MonoInst **args) +emit_vector_create_scalar ( + MonoCompile *cfg, MonoClass *vklass, MonoType *etype, MonoInst *arg0, gboolean is_unsafe) { - MonoClass *vklass = mono_class_from_mono_type_internal (vtype); - MonoInst *ins = emit_xzero (cfg, vklass); - for (int i = 0; i < fsig->param_count; ++i) - ins = emit_vector_insert_element (cfg, vklass, ins, type, args[i], i, TRUE); + int vector_size = mono_class_value_size (vklass, NULL); + if (vector_size == 16) { + // We want to handle constant inputs and create constant nodes so other import + // optimizations can be enabled. 
For is_unsafe, we treat it the same as broadcast + if (is_const (arg0)) { + if (is_unsafe) { + return emit_vector_create_broadcast (cfg, vklass, etype, arg0); + } + + guint8 cns_vec[16]; + memset (cns_vec, 0x00, 16); + + if (type_enum_is_float (etype->type)) { + double cns_val; + if (arg0->opcode == OP_R4CONST) { + cns_val = *(const float*)(arg0->inst_p0); + } else { + g_assert (arg0->opcode == OP_R8CONST); + cns_val = *(const double*)(arg0->inst_p0); + } + + switch (etype->type) { + case MONO_TYPE_R4: { + ((float*)cns_vec) [0] = (float)cns_val; + break; + } + case MONO_TYPE_R8: { + ((double*)cns_vec) [0] = (double)cns_val; + break; + } + default: { + g_assert_not_reached (); + } + } + } else { + gint64 cns_val; + if (arg0->opcode == OP_ICONST) { + cns_val = GTMREG_TO_INT (arg0->inst_c0); + } else { + g_assert (arg0->opcode == OP_I8CONST); + cns_val = arg0->inst_l; + +} + switch (etype->type) { + case MONO_TYPE_I1: + case MONO_TYPE_U1: { + ((guint8*)cns_vec) [0] = (guint8)cns_val; + break; + } + case MONO_TYPE_I2: + case MONO_TYPE_U2: { + ((guint16*)cns_vec) [0] = (guint16)cns_val; + break; + } + case MONO_TYPE_I4: + case MONO_TYPE_U4: { + ((guint32*)cns_vec) [0] = (guint32)cns_val; + break; + } + case MONO_TYPE_I8: + case MONO_TYPE_U8: { + ((guint64*)cns_vec) [0] = (guint64)cns_val; + break; + } + default: { + g_assert_not_reached (); + } + } + } + return emit_xconst_v128 (cfg, vklass, (guint8*)cns_vec); + } + } + + int opcode = 0; + if (COMPILE_LLVM (cfg)) { + opcode = is_unsafe ? OP_CREATE_SCALAR_UNSAFE : OP_CREATE_SCALAR; + } else { +#ifdef TARGET_AMD64 + MonoInst *ins; + + ins = emit_xzero (cfg, vklass); + if (!is_zero_const (arg0)) { + ins = emit_simd_ins (cfg, vklass, type_to_insert_op (etype->type), ins->dreg, arg0->dreg); + ins->inst_c0 = 0; + ins->inst_c1 = etype->type; + } + return ins; +#else + if (type_enum_is_float (etype->type)) { + opcode = is_unsafe ? OP_CREATE_SCALAR_UNSAFE_FLOAT : OP_CREATE_SCALAR_FLOAT; + } else { + opcode = is_unsafe ? 
OP_CREATE_SCALAR_UNSAFE_INT : OP_CREATE_SCALAR_INT; + } +#endif + } + g_assert (opcode != 0); + MonoInst* ins = emit_simd_ins (cfg, vklass, opcode, arg0->dreg, -1); + ins->inst_c1 = etype->type; return ins; } @@ -1867,9 +2305,8 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi if (!MONO_TYPE_IS_VECTOR_PRIMITIVE (etype)) return NULL; if (fsig->param_count == 1 && mono_metadata_type_equal (fsig->params [0], etype)) { - MonoInst* ins = emit_simd_ins (cfg, klass, type_to_expand_op (etype->type), args [0]->dreg, -1); - ins->inst_c1 = arg0_type; - return ins; + MonoClass *vklass = mono_class_from_mono_type_internal(fsig->ret); + return emit_vector_create_broadcast (cfg, vklass, etype, args [0]); } else if (is_create_from_half_vectors_overload (fsig)) { #if defined(TARGET_AMD64) // Require Vector64 SIMD support @@ -1890,8 +2327,10 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi return emit_simd_ins (cfg, klass, OP_XCONCAT, args [0]->dreg, args [1]->dreg); } - else if (is_elementwise_create_overload (fsig, etype)) - return emit_vector_create_elementwise (cfg, fsig, fsig->ret, arg0_type, args); + else if (is_elementwise_create_overload (fsig, etype)) { + MonoClass *vklass = mono_class_from_mono_type_internal(fsig->ret); + return emit_vector_create_elementwise (cfg, vklass, etype, args, fsig->param_count); + } break; } case SN_CreateScalar: @@ -1900,27 +2339,8 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi if (!MONO_TYPE_IS_VECTOR_PRIMITIVE (etype)) return NULL; gboolean is_unsafe = id == SN_CreateScalarUnsafe; - if (COMPILE_LLVM (cfg)) { - return emit_simd_ins_for_sig (cfg, klass, is_unsafe ? OP_CREATE_SCALAR_UNSAFE : OP_CREATE_SCALAR, -1, arg0_type, fsig, args); - } else { -#ifdef TARGET_AMD64 - MonoInst *ins; - - ins = emit_xzero (cfg, klass); - if (!is_zero_const (args [0])) { - ins = emit_simd_ins (cfg, klass, type_to_insert_op (arg0_type), ins->dreg, args [0]->dreg); - ins->inst_c0 = 0; - ins->inst_c1 = arg0_type; - } - return ins; -#else - if (type_enum_is_float (arg0_type)) { - return emit_simd_ins_for_sig (cfg, klass, is_unsafe ? OP_CREATE_SCALAR_UNSAFE_FLOAT : OP_CREATE_SCALAR_FLOAT, -1, arg0_type, fsig, args); - } else { - return emit_simd_ins_for_sig (cfg, klass, is_unsafe ? 
OP_CREATE_SCALAR_UNSAFE_INT : OP_CREATE_SCALAR_INT, -1, arg0_type, fsig, args); - } -#endif - } + MonoClass *vklass = mono_class_from_mono_type_internal(fsig->ret); + return emit_vector_create_scalar (cfg, vklass, etype, args [0], is_unsafe); } case SN_Dot: { return emit_dot (cfg, klass, fsig->params [0], arg0_type, args [0]->dreg, args [1]->dreg); @@ -2418,18 +2838,140 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi return emit_simd_ins_for_unary_op (cfg, klass, fsig, args, arg0_type, id); } case SN_Shuffle: { - if (!is_element_type_primitive (fsig->params [0])) + MonoType *etype = get_vector_t_elem_type (fsig->ret); + if (!MONO_TYPE_IS_VECTOR_PRIMITIVE (etype)) return NULL; #ifdef TARGET_WASM return emit_simd_ins_for_sig (cfg, klass, OP_WASM_SIMD_SWIZZLE, -1, -1, fsig, args); #elif defined(TARGET_ARM64) - if (vector_size == 128 && (arg0_type == MONO_TYPE_I1 || arg0_type == MONO_TYPE_U1)) + if (vector_size != 128) { + return NULL; + } + if ((arg0_type == MONO_TYPE_I1 || arg0_type == MONO_TYPE_U1)) { return emit_simd_ins_for_sig (cfg, klass, OP_XOP_OVR_X_X_X, INTRINS_AARCH64_ADV_SIMD_TBL1, 0, fsig, args); - return NULL; + } + if (!is_xconst (args [1])) { + return NULL; + } + int vsize = mono_class_value_size (klass, NULL); + int esize = mono_class_value_size (mono_class_from_mono_type_internal (etype), NULL); + int ecount = vsize / esize; + g_assert ((ecount == 2) || (ecount == 4) || (ecount == 8) || (ecount == 16)); + guint64 value = 0; + guint8 vec_cns[16]; + for (int index = 0; index < ecount; index++) { + value = get_xconst_int_elem (cfg, args [1], arg0_type, index); + if (value < ecount) { + for (int i = 0; i < esize; i++) { + vec_cns [(index * esize) + i] = (guint8)((value * esize) + i); + } + } else { + for (int i = 0; i < esize; i++) { + vec_cns [(index * esize) + i] = (guint8)0xFF; + } + } + } + MonoType *op_etype = m_class_get_byval_arg (mono_defaults.byte_class); + MonoClass *op_klass = create_class_instance ("System.Runtime.Intrinsics", "Vector128`1", op_etype); + MonoInst *new_args[2]; + new_args [0] = emit_simd_ins (cfg, op_klass, OP_XCAST, args [0]->dreg, -1); + new_args [1] = emit_xconst_v128 (cfg, op_klass, vec_cns); + MonoInst *ins = emit_simd_ins_for_sig (cfg, op_klass, OP_XOP_OVR_X_X_X, INTRINS_AARCH64_ADV_SIMD_TBL1, 0, fsig, new_args); + return emit_simd_ins (cfg, klass, OP_XCAST, ins->dreg, -1); #elif defined(TARGET_AMD64) if (COMPILE_LLVM (cfg)) { - if (is_SIMD_feature_supported (cfg, MONO_CPU_X86_SSSE3) && vector_size == 128 && (arg0_type == MONO_TYPE_I1 || arg0_type == MONO_TYPE_U1)) - return emit_simd_ins_for_sig (cfg, klass, OP_XOP_X_X_X, INTRINS_SSE_PSHUFB, 0, fsig, args); + if (vector_size != 128) { + return NULL; + } + if (!is_xconst (args [1])) { + return NULL; + } + MonoType *op_etype = etype; + MonoClass *op_klass = klass; + int vsize = mono_class_value_size (klass, NULL); + int esize = mono_class_value_size (mono_class_from_mono_type_internal (etype), NULL); + int ecount = vsize / esize; + g_assert ((ecount == 2) || (ecount == 4) || (ecount == 8) || (ecount == 16)); + guint8 control = 0; + gboolean needs_zero = false; + guint64 value = 0; + guint8 vec_cns[16]; + if ((arg0_type == MONO_TYPE_I1) || (arg0_type == MONO_TYPE_U1)) { + needs_zero = true; + } else if ((arg0_type == MONO_TYPE_I2) || (arg0_type == MONO_TYPE_U2)) { + needs_zero = true; + } + for (int index = 0; index < ecount; index++) { + value = get_xconst_int_elem (cfg, args [1], arg0_type, index); + if (value < ecount) { + // Setting the control for byte/sbyte 
and short/ushort is unnecessary + // and will actually compute an incorrect control word. But it simplifies + // the overall logic needed here and will remain unused. + + control |= (value << (index * (ecount / 2))); + + // When Ssse3 is supported, we may need vecCns to accurately select the relevant + // bytes if some index is outside the valid range. Since x86/x64 is little-endian + // we can simplify this down to a for loop that scales the value and selects count + // sequential bytes. + + for (int i = 0; i < esize; i++) { + vec_cns [(index * esize) + i] = (guint8)((value * esize) + i); + } + } else { + needs_zero = true; + + // When Ssse3 is supported, we may need vecCns to accurately select the relevant + // bytes if some index is outside the valid range. We can do this by just zeroing + // out each byte in the element. This only requires the most significant bit to be + // set, but we use 0xFF instead since that will be the equivalent of AllBitsSet + + for (int i = 0; i < esize; i++) { + vec_cns [(index * esize) + i] = (guint8)0xFF; + } + } + } + MonoInst *new_args[3]; + if (needs_zero) { + if (!is_SIMD_feature_supported (cfg, MONO_CPU_X86_SSSE3)) { + return NULL; + } + op_etype = m_class_get_byval_arg (mono_defaults.byte_class); + op_klass = create_class_instance ("System.Runtime.Intrinsics", "Vector128`1", op_etype); + new_args [0] = emit_simd_ins (cfg, op_klass, OP_XCAST, args [0]->dreg, -1); + new_args [1] = emit_xconst_v128 (cfg, op_klass, vec_cns); + MonoInst *ins = emit_simd_ins_for_sig (cfg, op_klass, OP_XOP_X_X_X, INTRINS_SSE_PSHUFB, 0, fsig, new_args); + return emit_simd_ins (cfg, klass, OP_XCAST, ins->dreg, -1); + } + if ((arg0_type == MONO_TYPE_I8) || (arg0_type == MONO_TYPE_U8)) { + // TYP_LONG and TYP_ULONG don't have their own shuffle/permute instructions and so we'll + // just utilize the path for TYP_DOUBLE for simplicity. We could alternatively break this + // down into a TYP_INT or TYP_UINT based shuffle, but that's additional complexity for no + // real benefit since shuffle gets its own port rather than using the fp specific ports. + arg0_type = MONO_TYPE_R8; + op_etype = m_class_get_byval_arg (mono_defaults.double_class); + op_klass = create_class_instance ("System.Runtime.Intrinsics", "Vector128`1", etype); + } + if ((arg0_type == MONO_TYPE_R4) || (arg0_type == MONO_TYPE_R8)) { + int opcode = (arg0_type == MONO_TYPE_R4) ? OP_SSE_SHUFPS : OP_SSE2_SHUFPD; + new_args [0] = args [0]; + if (op_klass != klass) { + new_args [0] = emit_simd_ins (cfg, op_klass, OP_XCAST, new_args [0]->dreg, -1); + } + new_args [1] = new_args [0]; + EMIT_NEW_ICONST (cfg, new_args [2], control); + MonoInst* ins = emit_simd_ins (cfg, op_klass, opcode, new_args [0]->dreg, new_args [1]->dreg); + ins->inst_c0 = 0; + ins->inst_c1 = arg0_type; + ins->sreg3 = new_args [2]->dreg; + if (op_klass != klass) { + ins = emit_simd_ins (cfg, klass, OP_XCAST, ins->dreg, -1); + } + return ins; + } else { + g_assert ((arg0_type == MONO_TYPE_I4) || (arg0_type == MONO_TYPE_U4)); + return emit_simd_ins_for_sig (cfg, klass, OP_SSE2_PSHUFD, 0, arg0_type, fsig, new_args); + } } // There is no variable shuffle until avx512 return NULL;
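
For readers following the new create paths above: emit_vector_create_elementwise now folds every compile-time-constant argument into a single 16-byte vector literal and only emits insert-element instructions for the lanes that are not constant. Below is a minimal standalone sketch of that planning step for 32-bit lanes; the Arg struct and plan_vector128_create helper are stand-ins invented for this example and are not Mono types or APIs.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Stand-in for MonoInst: either a known 32-bit constant or a runtime value. */
typedef struct { bool is_const; int32_t value; } Arg;

/* Bake the constant lanes of Vector128.Create(e0..e3) into cns_vec and report
 * how many insert-element operations are still needed for the runtime lanes. */
static int
plan_vector128_create (const Arg *args, int count, uint8_t cns_vec[16])
{
	int inserts_needed = 0;
	memset (cns_vec, 0x00, 16);
	for (int i = 0; i < count; i++) {
		if (args [i].is_const)
			memcpy (cns_vec + i * sizeof (int32_t), &args [i].value, sizeof (int32_t));
		else
			inserts_needed++; /* lane stays zero; an insert fills it at runtime */
	}
	return inserts_needed;
}

int
main (void)
{
	/* Vector128.Create(1, x, 3, 4): three constant lanes, one runtime lane. */
	Arg args [4] = { { true, 1 }, { false, 0 }, { true, 3 }, { true, 4 } };
	uint8_t cns_vec [16];
	printf ("inserts still needed: %d\n", plan_vector128_create (args, 4, cns_vec)); /* 1 */
	return 0;
}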
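
On the ARM64 side, the new SN_Shuffle lowering handles a constant shuffle mask by widening each lane index into esize consecutive byte indices (or 0xFF for an out-of-range lane, which makes TBL1 produce zero) and feeding the resulting 16-byte vector to TBL1 through a byte-typed XCAST. A minimal sketch of that expansion for a 128-bit vector follows; build_tbl1_byte_indices and its signature are invented for the example.

#include <stdint.h>
#include <stdio.h>

/* Expand per-lane shuffle indices into the 16 byte selectors consumed by
 * AArch64 TBL1, mirroring the loop added to the ARM64 SN_Shuffle path. */
static void
build_tbl1_byte_indices (const uint64_t *lane_indices, int ecount, uint8_t out[16])
{
	int esize = 16 / ecount; /* element size in bytes for a 128-bit vector */
	for (int index = 0; index < ecount; index++) {
		uint64_t value = lane_indices [index];
		if (value < (uint64_t)ecount) {
			for (int i = 0; i < esize; i++)
				out [(index * esize) + i] = (uint8_t)((value * esize) + i);
		} else {
			/* Out-of-range index: 0xFF makes TBL1 write zero for the lane. */
			for (int i = 0; i < esize; i++)
				out [(index * esize) + i] = 0xFF;
		}
	}
}

int
main (void)
{
	/* Shuffle a Vector128<int> (4 lanes) by the constant mask <2, 3, 0, 9>. */
	uint64_t mask [4] = { 2, 3, 0, 9 };
	uint8_t tbl [16];
	build_tbl1_byte_indices (mask, 4, tbl);
	for (int i = 0; i < 16; i++)
		printf ("%02X ", tbl [i]); /* 08 09 0A 0B 0C 0D 0E 0F 00 01 02 03 FF FF FF FF */
	printf ("\n");
	return 0;
}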
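
On x64 under LLVM, the same constant-mask information drives two lowerings: element types with 2 or 4 lanes whose indices are all in range can use the immediate forms (SHUFPD, SHUFPS, PSHUFD), while byte/short element types or any out-of-range lane require the SSSE3 PSHUFB byte mask so lanes can be zeroed. The following is a sketch of the immediate control-word computation under those assumptions; compute_shuffle_control is an illustrative helper, not part of the patch.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Compute the immediate used by SHUFPD (2 lanes, 1 bit each) or
 * SHUFPS/PSHUFD (4 lanes, 2 bits each) for a constant shuffle mask.
 * needs_zero is set when some lane index is out of range, in which case
 * the immediate forms cannot be used and the PSHUFB byte-mask path is
 * required instead (byte/short elements always take that path). */
static uint8_t
compute_shuffle_control (const uint64_t *lane_indices, int ecount, bool *needs_zero)
{
	uint8_t control = 0;
	*needs_zero = false;
	for (int index = 0; index < ecount; index++) {
		uint64_t value = lane_indices [index];
		if (value < (uint64_t)ecount)
			control |= (uint8_t)(value << (index * (ecount / 2)));
		else
			*needs_zero = true; /* out-of-range lane must be zeroed */
	}
	return control;
}

int
main (void)
{
	bool needs_zero;
	/* Reverse a Vector128<int>/<float>: mask <3, 2, 1, 0>. */
	uint64_t mask [4] = { 3, 2, 1, 0 };
	uint8_t control = compute_shuffle_control (mask, 4, &needs_zero);
	printf ("control=0x%02X needs_zero=%d\n", control, needs_zero); /* 0x1B, 0 */
	return 0;
}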