diff --git a/cranelift-codegen/meta/src/cdsl/ast.rs b/cranelift-codegen/meta/src/cdsl/ast.rs index 141ffd84c..798ee29d0 100644 --- a/cranelift-codegen/meta/src/cdsl/ast.rs +++ b/cranelift-codegen/meta/src/cdsl/ast.rs @@ -8,6 +8,7 @@ use cranelift_entity::{entity_impl, PrimaryMap}; use std::fmt; +#[derive(Debug)] pub enum Expr { Var(VarIndex), Literal(Literal), @@ -363,6 +364,7 @@ impl VarPool { /// /// An `Apply` AST expression is created by using function call syntax on instructions. This /// applies to both bound and unbound polymorphic instructions. +#[derive(Debug)] pub struct Apply { pub inst: Instruction, pub args: Vec, diff --git a/cranelift-codegen/meta/src/cdsl/instructions.rs b/cranelift-codegen/meta/src/cdsl/instructions.rs index 1689f2a5c..f061be94e 100644 --- a/cranelift-codegen/meta/src/cdsl/instructions.rs +++ b/cranelift-codegen/meta/src/cdsl/instructions.rs @@ -14,7 +14,7 @@ use crate::cdsl::type_inference::Constraint; use crate::cdsl::types::{LaneType, ReferenceType, ValueType, VectorType}; use crate::cdsl::typevar::TypeVar; -#[derive(Copy, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] pub struct OpcodeNumber(u32); entity_impl!(OpcodeNumber); @@ -79,12 +79,14 @@ impl InstructionGroup { } } +#[derive(Debug)] pub struct PolymorphicInfo { pub use_typevar_operand: bool, pub ctrl_typevar: TypeVar, pub other_typevars: Vec, } +#[derive(Debug)] pub struct InstructionContent { /// Instruction mnemonic, also becomes opcode name. pub name: String, @@ -139,7 +141,7 @@ pub struct InstructionContent { pub writes_cpu_flags: bool, } -#[derive(Clone)] +#[derive(Clone, Debug)] pub struct Instruction { content: Rc, } @@ -1125,6 +1127,11 @@ fn bind_vector( mut value_types: Vec, ) -> BoundInstruction { let num_lanes = vector_size_in_bits / lane_type.lane_bits(); + assert!( + num_lanes >= 2, + "Minimum lane number for bind_vector is 2, found {}.", + num_lanes, + ); let vector_type = ValueType::Vector(VectorType::new(lane_type, num_lanes)); value_types.push(ValueTypeOrAny::ValueType(vector_type)); verify_polymorphic_binding(&inst, &value_types); diff --git a/cranelift-codegen/meta/src/cdsl/type_inference.rs b/cranelift-codegen/meta/src/cdsl/type_inference.rs index 101cfa410..a56d81463 100644 --- a/cranelift-codegen/meta/src/cdsl/type_inference.rs +++ b/cranelift-codegen/meta/src/cdsl/type_inference.rs @@ -4,7 +4,7 @@ use crate::cdsl::typevar::{DerivedFunc, TypeSet, TypeVar}; use std::collections::{HashMap, HashSet}; use std::iter::FromIterator; -#[derive(Hash, PartialEq, Eq)] +#[derive(Debug, Hash, PartialEq, Eq)] pub enum Constraint { /// Constraint specifying that a type var tv1 must be wider than or equal to type var tv2 at /// runtime. 
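A note on the new assertion in `bind_vector` above: binding a vector whose lane width equals the vector width would yield a single lane, which is not a vector. The sketch below is a standalone illustration of that check only; the free function `num_lanes` is invented for the example, the real code computes the same quotient inside `bind_vector`.

```rust
// Standalone sketch of the lane-count check added to bind_vector.
fn num_lanes(vector_size_in_bits: u64, lane_bits: u64) -> u64 {
    let num_lanes = vector_size_in_bits / lane_bits;
    assert!(
        num_lanes >= 2,
        "Minimum lane number for bind_vector is 2, found {}.",
        num_lanes,
    );
    num_lanes
}

fn main() {
    assert_eq!(num_lanes(128, 8), 16); // i8x16 is fine
    assert_eq!(num_lanes(128, 64), 2); // i64x2 is fine
    // num_lanes(128, 128) would panic: a one-lane "vector" of i128 is rejected.
}
```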
This requires that: diff --git a/cranelift-codegen/meta/src/cdsl/types.rs b/cranelift-codegen/meta/src/cdsl/types.rs index eba239d1d..f431bb3ed 100644 --- a/cranelift-codegen/meta/src/cdsl/types.rs +++ b/cranelift-codegen/meta/src/cdsl/types.rs @@ -215,12 +215,14 @@ impl LaneType { LaneType::BoolType(shared_types::Bool::B16) => 2, LaneType::BoolType(shared_types::Bool::B32) => 3, LaneType::BoolType(shared_types::Bool::B64) => 4, - LaneType::IntType(shared_types::Int::I8) => 5, - LaneType::IntType(shared_types::Int::I16) => 6, - LaneType::IntType(shared_types::Int::I32) => 7, - LaneType::IntType(shared_types::Int::I64) => 8, - LaneType::FloatType(shared_types::Float::F32) => 9, - LaneType::FloatType(shared_types::Float::F64) => 10, + LaneType::BoolType(shared_types::Bool::B128) => 5, + LaneType::IntType(shared_types::Int::I8) => 6, + LaneType::IntType(shared_types::Int::I16) => 7, + LaneType::IntType(shared_types::Int::I32) => 8, + LaneType::IntType(shared_types::Int::I64) => 9, + LaneType::IntType(shared_types::Int::I128) => 10, + LaneType::FloatType(shared_types::Float::F32) => 11, + LaneType::FloatType(shared_types::Float::F64) => 12, } } @@ -231,6 +233,7 @@ impl LaneType { 16 => shared_types::Bool::B16, 32 => shared_types::Bool::B32, 64 => shared_types::Bool::B64, + 128 => shared_types::Bool::B128, _ => unreachable!("unxpected num bits for bool"), }) } @@ -241,6 +244,7 @@ impl LaneType { 16 => shared_types::Int::I16, 32 => shared_types::Int::I32, 64 => shared_types::Int::I64, + 128 => shared_types::Int::I128, _ => unreachable!("unxpected num bits for int"), }) } diff --git a/cranelift-codegen/meta/src/cdsl/typevar.rs b/cranelift-codegen/meta/src/cdsl/typevar.rs index 9ae4c33fd..71c2fd2e2 100644 --- a/cranelift-codegen/meta/src/cdsl/typevar.rs +++ b/cranelift-codegen/meta/src/cdsl/typevar.rs @@ -9,7 +9,8 @@ use std::rc::Rc; use crate::cdsl::types::{BVType, LaneType, ReferenceType, SpecialType, ValueType}; const MAX_LANES: u16 = 256; -const MAX_BITS: u16 = 64; +const MAX_BITS: u16 = 128; +const MAX_FLOAT_BITS: u16 = 64; const MAX_BITVEC: u16 = MAX_BITS * MAX_LANES; /// Type variables can be used in place of concrete types when defining @@ -177,7 +178,7 @@ impl TypeVar { "can't double all integer types" ); assert!( - ts.floats.len() == 0 || *ts.floats.iter().max().unwrap() < MAX_BITS, + ts.floats.len() == 0 || *ts.floats.iter().max().unwrap() < MAX_FLOAT_BITS, "can't double all float types" ); assert!( @@ -503,7 +504,7 @@ impl TypeSet { copy.floats = NumSet::from_iter( self.floats .iter() - .filter(|&&x| x < MAX_BITS) + .filter(|&&x| x < MAX_FLOAT_BITS) .map(|&x| x * 2), ); copy.bools = NumSet::from_iter( @@ -621,7 +622,7 @@ impl TypeSet { let mut copy = self.clone(); copy.bitvecs = NumSet::new(); if self.bools.contains(&1) { - copy.ints = NumSet::from_iter(vec![8, 16, 32, 64]); + copy.ints = NumSet::from_iter(vec![8, 16, 32, 64, 128]); copy.floats = NumSet::from_iter(vec![32, 64]); } else { copy.ints = &self.bools - &NumSet::from_iter(vec![1]); @@ -950,7 +951,7 @@ fn test_typevar_builder() { let type_set = TypeSetBuilder::new().ints(Interval::All).build(); assert_eq!(type_set.lanes, num_set![1]); assert!(type_set.floats.is_empty()); - assert_eq!(type_set.ints, num_set![8, 16, 32, 64]); + assert_eq!(type_set.ints, num_set![8, 16, 32, 64, 128]); assert!(type_set.bools.is_empty()); assert!(type_set.bitvecs.is_empty()); assert!(type_set.specials.is_empty()); @@ -959,7 +960,7 @@ fn test_typevar_builder() { assert_eq!(type_set.lanes, num_set![1]); assert!(type_set.floats.is_empty()); 
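The `MAX_BITS`/`MAX_FLOAT_BITS` split above exists because integers now double up to 128 bits while floats still stop at 64. A minimal sketch of the intended `double_width` behaviour, using a plain `BTreeSet<u16>` instead of the real `TypeSet`/`NumSet` machinery:

```rust
use std::collections::BTreeSet;

const MAX_BITS: u16 = 128;
const MAX_FLOAT_BITS: u16 = 64;

// Widths are filtered against their own maximum before doubling.
fn double_width(widths: &BTreeSet<u16>, max: u16) -> BTreeSet<u16> {
    widths.iter().filter(|&&x| x < max).map(|&x| x * 2).collect()
}

fn main() {
    let ints: BTreeSet<u16> = [8, 16, 32, 64, 128].iter().copied().collect();
    let floats: BTreeSet<u16> = [32, 64].iter().copied().collect();

    // i64 now doubles to i128; i128 itself has no double.
    let doubled_ints: BTreeSet<u16> = [16, 32, 64, 128].iter().copied().collect();
    assert_eq!(double_width(&ints, MAX_BITS), doubled_ints);

    // Floats are still capped at 64 bits, hence the separate MAX_FLOAT_BITS.
    let doubled_floats: BTreeSet<u16> = [64].iter().copied().collect();
    assert_eq!(double_width(&floats, MAX_FLOAT_BITS), doubled_floats);
}
```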
assert!(type_set.ints.is_empty()); - assert_eq!(type_set.bools, num_set![1, 8, 16, 32, 64]); + assert_eq!(type_set.bools, num_set![1, 8, 16, 32, 64, 128]); assert!(type_set.bitvecs.is_empty()); assert!(type_set.specials.is_empty()); @@ -1101,7 +1102,7 @@ fn test_forward_images() { ); assert_eq!( TypeSetBuilder::new().ints(32..64).build().double_width(), - TypeSetBuilder::new().ints(64..64).build() + TypeSetBuilder::new().ints(64..128).build() ); assert_eq!( TypeSetBuilder::new().floats(32..32).build().double_width(), @@ -1117,7 +1118,7 @@ fn test_forward_images() { ); assert_eq!( TypeSetBuilder::new().bools(32..64).build().double_width(), - TypeSetBuilder::new().bools(64..64).build() + TypeSetBuilder::new().bools(64..128).build() ); } @@ -1145,7 +1146,7 @@ fn test_backward_images() { assert_eq!( TypeSetBuilder::new() .simd_lanes(1..4) - .bools(1..64) + .bools(1..128) .build() .preimage(DerivedFunc::AsBool), TypeSetBuilder::new() @@ -1205,9 +1206,9 @@ fn test_backward_images() { // Half width. assert_eq!( TypeSetBuilder::new() - .ints(64..64) + .ints(128..128) .floats(64..64) - .bools(64..64) + .bools(128..128) .build() .preimage(DerivedFunc::HalfWidth) .size(), @@ -1221,7 +1222,7 @@ fn test_backward_images() { .preimage(DerivedFunc::HalfWidth), TypeSetBuilder::new() .simd_lanes(64..256) - .bools(16..64) + .bools(16..128) .build(), ); diff --git a/cranelift-codegen/meta/src/cdsl/xform.rs b/cranelift-codegen/meta/src/cdsl/xform.rs index b1a0234cd..b90d552b9 100644 --- a/cranelift-codegen/meta/src/cdsl/xform.rs +++ b/cranelift-codegen/meta/src/cdsl/xform.rs @@ -183,7 +183,14 @@ fn rewrite_expr( assert_eq!( apply_target.inst().operands_in.len(), dummy_args.len(), - "number of arguments in instruction is incorrect" + "number of arguments in instruction {} is incorrect\nexpected: {:?}", + apply_target.inst().name, + apply_target + .inst() + .operands_in + .iter() + .map(|operand| format!("{}: {}", operand.name, operand.kind.name)) + .collect::>(), ); let mut args = Vec::new(); diff --git a/cranelift-codegen/meta/src/gen_legalizer.rs b/cranelift-codegen/meta/src/gen_legalizer.rs index 7b59844e6..da1fb1f58 100644 --- a/cranelift-codegen/meta/src/gen_legalizer.rs +++ b/cranelift-codegen/meta/src/gen_legalizer.rs @@ -61,10 +61,10 @@ fn unwrap_inst( fmtln!(fmt, "{},", field.member); } - if iform.num_value_operands == 1 { - fmt.line("arg,"); - } else if iform.has_value_list || iform.num_value_operands > 1 { + if iform.has_value_list || iform.num_value_operands > 1 { fmt.line("ref args,"); + } else if iform.num_value_operands == 1 { + fmt.line("arg,"); } fmt.line(".."); @@ -87,6 +87,13 @@ fn unwrap_inst( } else if op.is_value() { let n = inst.value_opnums.iter().position(|&i| i == op_num).unwrap(); fmtln!(fmt, "func.dfg.resolve_aliases(args[{}]),", n); + } else if op.is_varargs() { + let n = inst.imm_opnums.iter().chain(inst.value_opnums.iter()).max().map(|n| n + 1).unwrap_or(0); + // We need to create a `Vec` here, as using a slice would result in a borrowck + // error later on. + fmtln!(fmt, "\ + args.iter().skip({}).map(|&arg| func.dfg.resolve_aliases(arg)).collect::>(),\ + ", n); } } @@ -104,6 +111,19 @@ fn unwrap_inst( }); fmtln!(fmt, "};"); + assert_eq!(inst.operands_in.len(), apply.args.len()); + for (i, op) in inst.operands_in.iter().enumerate() { + if op.is_varargs() { + let name = var_pool + .get(apply.args[i].maybe_var().expect("vararg without name")) + .name; + + // Above name is set to an `Vec` representing the varargs. However it is expected to be + // `&[Value]` below, so we borrow it. 
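On the vararg handling in `unwrap_inst` above: the generated code collects the variable arguments into an owned `Vec` first, so the borrow of the data-flow graph ends before the function is mutated, and then re-borrows it as a slice because later code expects `&[Value]`. The toy `Dfg`/`Value` types below are invented stand-ins that only demonstrate the borrow pattern.

```rust
#[derive(Clone, Copy, Debug, PartialEq)]
struct Value(u32);

struct Dfg {
    args: Vec<Value>,
}

impl Dfg {
    fn resolve_aliases(&self, v: Value) -> Value {
        v // identity here; the real DFG follows value aliases
    }
}

fn main() {
    let mut dfg = Dfg { args: vec![Value(1), Value(2), Value(3)] };

    // Own the variable arguments (everything after the fixed operands) in a Vec,
    // so the shared borrow of the DFG ends before anything is mutated.
    let varargs: Vec<Value> = dfg
        .args
        .iter()
        .skip(1)
        .map(|&arg| dfg.resolve_aliases(arg))
        .collect();

    dfg.args.clear(); // a later mutable use is now fine

    // Downstream code expects `&[Value]`, so the Vec is simply re-borrowed.
    let varargs: &[Value] = &varargs;
    assert_eq!(varargs, &[Value(2), Value(3)][..]);
}
```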
+ fmtln!(fmt, "let {} = &{};", name, name); + } + } + for &op_num in &inst.value_opnums { let arg = &apply.args[op_num]; if let Some(var_index) = arg.maybe_var() { @@ -402,6 +422,13 @@ fn gen_transform<'a>( fmt.line("let removed = pos.remove_inst();"); fmt.line("debug_assert_eq!(removed, inst);"); } + + if transform.def_pool.get(transform.src).apply.inst.is_branch { + // A branch might have been legalized into multiple branches, so we need to recompute + // the cfg. + fmt.line("cfg.recompute_ebb(pos.func, pos.current_ebb().unwrap());"); + } + fmt.line("return true;"); }); fmt.line("}"); diff --git a/cranelift-codegen/meta/src/isa/x86/encodings.rs b/cranelift-codegen/meta/src/isa/x86/encodings.rs index 71f104210..253491f01 100644 --- a/cranelift-codegen/meta/src/isa/x86/encodings.rs +++ b/cranelift-codegen/meta/src/isa/x86/encodings.rs @@ -9,7 +9,7 @@ use crate::cdsl::instructions::{ }; use crate::cdsl::recipes::{EncodingRecipe, EncodingRecipeNumber, Recipes}; use crate::cdsl::settings::{SettingGroup, SettingPredicateNumber}; -use crate::cdsl::types::ValueType; +use crate::cdsl::types::{LaneType, ValueType}; use crate::shared::types::Bool::{B1, B16, B32, B64, B8}; use crate::shared::types::Float::{F32, F64}; use crate::shared::types::Int::{I16, I32, I64, I8}; @@ -1735,6 +1735,8 @@ pub(crate) fn define( // legalize.rs for how this is done; once there, x86_pshuf* (below) is used for broadcasting the // value across the register + let allowed_simd_type = |t: &LaneType| t.lane_bits() >= 8 && t.lane_bits() < 128; + // PSHUFB, 8-bit shuffle using two XMM registers for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) { let instruction = x86_pshufb.bind_vector_from_lane(ty, sse_vector_size); @@ -1756,7 +1758,7 @@ pub(crate) fn define( // SIMD scalar_to_vector; this uses MOV to copy the scalar value to an XMM register; according // to the Intel manual: "When the destination operand is an XMM register, the source operand is // written to the low doubleword of the register and the regiser is zero-extended to 128 bits." 
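The `allowed_simd_type` closure above keeps the existing lower bound of 8-bit lanes and adds an upper bound below 128 bits, since a 128-bit lane in a 128-bit XMM register would be a single-lane "vector". A trivial sketch of the predicate on raw lane widths rather than `LaneType` values:

```rust
// Sketch of the allowed_simd_type filter used by the x86 SIMD encoding loops.
fn allowed_simd_lane(lane_bits: u64) -> bool {
    lane_bits >= 8 && lane_bits < 128
}

fn main() {
    assert!(allowed_simd_lane(8));    // i8x16
    assert!(allowed_simd_lane(64));   // i64x2 / f64x2
    assert!(!allowed_simd_lane(1));   // 1-bit bool lanes are excluded
    assert!(!allowed_simd_lane(128)); // i128/b128 stay scalar-only here
}
```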
- for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() >= 8) { + for ty in ValueType::all_lane_types().filter(allowed_simd_type) { let instruction = scalar_to_vector.bind_vector_from_lane(ty, sse_vector_size); let template = rec_frurm.opcodes(vec![0x66, 0x0f, 0x6e]); // MOVD/MOVQ if ty.lane_bits() < 64 { @@ -1774,7 +1776,7 @@ pub(crate) fn define( insertlane_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x22], Some(use_sse41_simd))); // PINSRD insertlane_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x22], Some(use_sse41_simd))); // PINSRQ, only x86_64 - for ty in ValueType::all_lane_types() { + for ty in ValueType::all_lane_types().filter(allowed_simd_type) { if let Some((opcode, isap)) = insertlane_mapping.get(&ty.lane_bits()) { let instruction = insertlane.bind_vector_from_lane(ty, sse_vector_size); let template = rec_r_ib_unsigned_r.opcodes(opcode.clone()); @@ -1795,7 +1797,7 @@ pub(crate) fn define( extractlane_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41_simd))); // PEXTRD extractlane_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41_simd))); // PEXTRQ, only x86_64 - for ty in ValueType::all_lane_types() { + for ty in ValueType::all_lane_types().filter(allowed_simd_type) { if let Some((opcode, isap)) = extractlane_mapping.get(&ty.lane_bits()) { let instruction = extractlane.bind_vector_from_lane(ty, sse_vector_size); let template = rec_r_ib_unsigned_gpr.opcodes(opcode.clone()); @@ -1816,8 +1818,9 @@ pub(crate) fn define( } // SIMD bitcast all 128-bit vectors to each other (for legalizing splat.x16x8) - for from_type in ValueType::all_lane_types().filter(|t| t.lane_bits() >= 8) { - for to_type in ValueType::all_lane_types().filter(|t| t.lane_bits() >= 8 && *t != from_type) + for from_type in ValueType::all_lane_types().filter(allowed_simd_type) { + for to_type in + ValueType::all_lane_types().filter(|t| allowed_simd_type(t) && *t != from_type) { let instruction = raw_bitcast .bind_vector_from_lane(to_type, sse_vector_size) @@ -1833,7 +1836,7 @@ pub(crate) fn define( // for that; alternately, constants could be loaded into XMM registers using a sequence like: // MOVQ + MOVHPD + MOVQ + MOVLPD (this allows the constants to be immediates instead of stored // in memory) but some performance measurements are needed. 
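For the insertlane/extractlane loops above, the opcode lookup is keyed by lane width, and the new `allowed_simd_type` filter keeps 128-bit lanes from ever reaching it. A schematic sketch; only the two table entries visible in this hunk are reproduced, the real tables also cover 8- and 16-bit lanes:

```rust
use std::collections::HashMap;

fn main() {
    // Only widths present in the table get an encoding.
    let mut insertlane_mapping: HashMap<u64, Vec<u8>> = HashMap::new();
    insertlane_mapping.insert(32, vec![0x66, 0x0f, 0x3a, 0x22]); // PINSRD
    insertlane_mapping.insert(64, vec![0x66, 0x0f, 0x3a, 0x22]); // PINSRQ, x86_64 only

    // The loop over lane types is now pre-filtered, so 128-bit lanes never get here.
    for lane_bits in [8u64, 16, 32, 64].iter() {
        match insertlane_mapping.get(lane_bits) {
            Some(opcode) => println!("{}-bit lanes -> {:02x?}", lane_bits, opcode),
            None => println!("{}-bit lanes -> skipped (no entry)", lane_bits),
        }
    }
}
```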
- for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() >= 8) { + for ty in ValueType::all_lane_types().filter(allowed_simd_type) { let instruction = vconst.bind_vector_from_lane(ty, sse_vector_size); let template = rec_vconst.nonrex().opcodes(vec![0x0f, 0x10]); e.enc_32_64_maybe_isap(instruction, template, None); // from SSE diff --git a/cranelift-codegen/meta/src/shared/instructions.rs b/cranelift-codegen/meta/src/shared/instructions.rs index 67e289106..843347ce9 100644 --- a/cranelift-codegen/meta/src/shared/instructions.rs +++ b/cranelift-codegen/meta/src/shared/instructions.rs @@ -3143,7 +3143,7 @@ pub(crate) fn define( "WideInt", "An integer type with lanes from `i16` upwards", TypeSetBuilder::new() - .ints(16..64) + .ints(16..128) .simd_lanes(Interval::All) .build(), ); @@ -3171,9 +3171,9 @@ pub(crate) fn define( let NarrowInt = &TypeVar::new( "NarrowInt", - "An integer type with lanes type to `i32`", + "An integer type with lanes type to `i64`", TypeSetBuilder::new() - .ints(8..32) + .ints(8..64) .simd_lanes(Interval::All) .build(), ); diff --git a/cranelift-codegen/meta/src/shared/legalize.rs b/cranelift-codegen/meta/src/shared/legalize.rs index 90fe47d0f..958827270 100644 --- a/cranelift-codegen/meta/src/shared/legalize.rs +++ b/cranelift-codegen/meta/src/shared/legalize.rs @@ -4,7 +4,7 @@ use crate::cdsl::xform::{TransformGroupBuilder, TransformGroups}; use crate::shared::immediates::Immediates; use crate::shared::types::Float::{F32, F64}; -use crate::shared::types::Int::{I16, I32, I64, I8}; +use crate::shared::types::Int::{I128, I16, I32, I64, I8}; pub(crate) fn define(insts: &InstructionGroup, imm: &Immediates) -> TransformGroups { let mut narrow = TransformGroupBuilder::new( @@ -49,6 +49,8 @@ pub(crate) fn define(insts: &InstructionGroup, imm: &Immediates) -> TransformGro let bor = insts.by_name("bor"); let bor_imm = insts.by_name("bor_imm"); let bor_not = insts.by_name("bor_not"); + let brnz = insts.by_name("brnz"); + let brz = insts.by_name("brz"); let br_icmp = insts.by_name("br_icmp"); let br_table = insts.by_name("br_table"); let bxor = insts.by_name("bxor"); @@ -177,9 +179,14 @@ pub(crate) fn define(insts: &InstructionGroup, imm: &Immediates) -> TransformGro let al = var("al"); let ah = var("ah"); let cc = var("cc"); + let ebb = var("ebb"); let ptr = var("ptr"); let flags = var("flags"); let offset = var("off"); + let vararg = var("vararg"); + + narrow.custom_legalize(load, "narrow_load"); + narrow.custom_legalize(store, "narrow_store"); narrow.legalize( def!(a = iadd(x, y)), @@ -203,7 +210,7 @@ pub(crate) fn define(insts: &InstructionGroup, imm: &Immediates) -> TransformGro ], ); - for &bin_op in &[band, bor, bxor] { + for &bin_op in &[band, bor, bxor, band_not, bor_not, bxor_not] { narrow.legalize( def!(a = bin_op(x, y)), vec![ @@ -216,6 +223,16 @@ pub(crate) fn define(insts: &InstructionGroup, imm: &Immediates) -> TransformGro ); } + narrow.legalize( + def!(a = bnot(x)), + vec![ + def!((xl, xh) = isplit(x)), + def!(al = bnot(xl)), + def!(ah = bnot(xh)), + def!(a = iconcat(al, ah)), + ], + ); + narrow.legalize( def!(a = select(c, x, y)), vec![ @@ -227,6 +244,38 @@ pub(crate) fn define(insts: &InstructionGroup, imm: &Immediates) -> TransformGro ], ); + narrow.legalize( + def!(brz.I128(x, ebb, vararg)), + vec![ + def!((xl, xh) = isplit(x)), + def!( + a = icmp_imm( + Literal::enumerator_for(&imm.intcc, "eq"), + xl, + Literal::constant(&imm.imm64, 0) + ) + ), + def!( + b = icmp_imm( + Literal::enumerator_for(&imm.intcc, "eq"), + xh, + 
Literal::constant(&imm.imm64, 0) + ) + ), + def!(c = band(a, b)), + def!(brz(c, ebb, vararg)), + ], + ); + + narrow.legalize( + def!(brnz.I128(x, ebb, vararg)), + vec![ + def!((xl, xh) = isplit(x)), + def!(brnz(xl, ebb, vararg)), + def!(brnz(xh, ebb, vararg)), + ], + ); + // Widen instructions with one input operand. for &op in &[bnot, popcnt] { for &int_ty in &[I8, I16] { diff --git a/cranelift-codegen/meta/src/shared/types.rs b/cranelift-codegen/meta/src/shared/types.rs index 266c30b3d..52fa9545c 100644 --- a/cranelift-codegen/meta/src/shared/types.rs +++ b/cranelift-codegen/meta/src/shared/types.rs @@ -12,6 +12,8 @@ pub enum Bool { B32 = 32, /// 64-bit bool. B64 = 64, + /// 128-bit bool. + B128 = 128, } /// This provides an iterator through all of the supported bool variants. @@ -34,6 +36,7 @@ impl Iterator for BoolIterator { 2 => Some(Bool::B16), 3 => Some(Bool::B32), 4 => Some(Bool::B64), + 5 => Some(Bool::B128), _ => return None, }; self.index += 1; @@ -51,6 +54,8 @@ pub enum Int { I32 = 32, /// 64-bit int. I64 = 64, + /// 128-bit int. + I128 = 128, } /// This provides an iterator through all of the supported int variants. @@ -72,6 +77,7 @@ impl Iterator for IntIterator { 1 => Some(Int::I16), 2 => Some(Int::I32), 3 => Some(Int::I64), + 4 => Some(Int::I128), _ => return None, }; self.index += 1; @@ -189,6 +195,7 @@ mod iter_tests { assert_eq!(bool_iter.next(), Some(Bool::B16)); assert_eq!(bool_iter.next(), Some(Bool::B32)); assert_eq!(bool_iter.next(), Some(Bool::B64)); + assert_eq!(bool_iter.next(), Some(Bool::B128)); assert_eq!(bool_iter.next(), None); } @@ -199,6 +206,7 @@ mod iter_tests { assert_eq!(int_iter.next(), Some(Int::I16)); assert_eq!(int_iter.next(), Some(Int::I32)); assert_eq!(int_iter.next(), Some(Int::I64)); + assert_eq!(int_iter.next(), Some(Int::I128)); assert_eq!(int_iter.next(), None); } diff --git a/cranelift-codegen/src/ir/types.rs b/cranelift-codegen/src/ir/types.rs index 4eb72f3fc..10fca8aaa 100644 --- a/cranelift-codegen/src/ir/types.rs +++ b/cranelift-codegen/src/ir/types.rs @@ -10,11 +10,11 @@ use target_lexicon::{PointerWidth, Triple}; /// field is present put no type is needed, such as the controlling type variable for a /// non-polymorphic instruction. /// -/// Basic integer types: `I8`, `I16`, `I32`, and `I64`. These types are sign-agnostic. +/// Basic integer types: `I8`, `I16`, `I32`, `I64`, and `I128`. These types are sign-agnostic. /// /// Basic floating point types: `F32` and `F64`. IEEE single and double precision. /// -/// Boolean types: `B1`, `B8`, `B16`, `B32`, and `B64`. These all encode 'true' or 'false'. The +/// Boolean types: `B1`, `B8`, `B16`, `B32`, `B64`, and `B128`. These all encode 'true' or 'false'. The /// larger types use redundant bits. /// /// SIMD vector types have power-of-two lanes, up to 256. Lanes can be any int/float/bool type. 
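The new `narrow` rules in shared/legalize.rs reduce 128-bit operations to their 64-bit halves: bitwise ops (now including the `*_not` forms and `bnot`) are applied per half, `brz.i128` tests that both halves compare equal to zero, and `brnz.i128` becomes two consecutive branches. Below is a standalone sketch of those semantics on plain `u128` values, not the meta-language DSL; the function names are illustrative only.

```rust
fn isplit(x: u128) -> (u64, u64) {
    (x as u64, (x >> 64) as u64) // (low half, high half)
}

fn iconcat(lo: u64, hi: u64) -> u128 {
    (lo as u128) | ((hi as u128) << 64)
}

// band/bor/bxor (and the *_not variants) narrow by operating on each half.
fn narrow_band(x: u128, y: u128) -> u128 {
    let (xl, xh) = isplit(x);
    let (yl, yh) = isplit(y);
    iconcat(xl & yl, xh & yh)
}

// brz.i128 is taken iff both halves are zero (icmp_imm eq 0 on each, then band).
fn brz_taken(x: u128) -> bool {
    let (xl, xh) = isplit(x);
    (xl == 0) & (xh == 0)
}

// brnz.i128 is taken iff either half is non-zero (two consecutive brnz).
fn brnz_taken(x: u128) -> bool {
    let (xl, xh) = isplit(x);
    xl != 0 || xh != 0
}

fn main() {
    let x = 0x1234_5678_9abc_def0_0fed_cba9_8765_4321u128;
    let y = u128::from(u64::MAX) << 64;
    assert_eq!(narrow_band(x, y), x & y);
    assert!(brz_taken(0) && !brz_taken(1u128 << 100));
    assert!(brnz_taken(1u128 << 100) && !brnz_taken(0));
}
```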
@@ -63,6 +63,7 @@ impl Type { B16 | I16 => 4, B32 | I32 | F32 | R32 => 5, B64 | I64 | F64 | R64 => 6, + B128 | I128 => 7, _ => 0, } } @@ -75,6 +76,7 @@ impl Type { B16 | I16 => 16, B32 | I32 | F32 | R32 => 32, B64 | I64 | F64 | R64 => 64, + B128 | I128 => 128, _ => 0, } } @@ -86,6 +88,7 @@ impl Type { 16 => Some(I16), 32 => Some(I32), 64 => Some(I64), + 128 => Some(I128), _ => None, } } @@ -109,6 +112,7 @@ impl Type { B32 | I32 | F32 => B32, B64 | I64 | F64 => B64, R32 | R64 => panic!("Reference types should not convert to bool"), + B128 | I128 => B128, _ => B1, }) } @@ -132,10 +136,12 @@ impl Type { I16 => I8, I32 => I16, I64 => I32, + I128 => I64, F64 => F32, B16 => B8, B32 => B16, B64 => B32, + B128 => B64, _ => return None, })) } @@ -147,10 +153,12 @@ impl Type { I8 => I16, I16 => I32, I32 => I64, + I64 => I128, F32 => F64, B8 => B16, B16 => B32, B32 => B64, + B64 => B128, _ => return None, })) } @@ -182,7 +190,7 @@ impl Type { /// Is this a scalar boolean type? pub fn is_bool(self) -> bool { match self { - B1 | B8 | B16 | B32 | B64 => true, + B1 | B8 | B16 | B32 | B64 | B128 => true, _ => false, } } @@ -190,7 +198,7 @@ impl Type { /// Is this a scalar integer type? pub fn is_int(self) -> bool { match self { - I8 | I16 | I32 | I64 => true, + I8 | I16 | I32 | I64 | I128 => true, _ => false, } } @@ -370,10 +378,12 @@ mod tests { assert_eq!(B16, B16.lane_type()); assert_eq!(B32, B32.lane_type()); assert_eq!(B64, B64.lane_type()); + assert_eq!(B128, B128.lane_type()); assert_eq!(I8, I8.lane_type()); assert_eq!(I16, I16.lane_type()); assert_eq!(I32, I32.lane_type()); assert_eq!(I64, I64.lane_type()); + assert_eq!(I128, I128.lane_type()); assert_eq!(F32, F32.lane_type()); assert_eq!(F64, F64.lane_type()); assert_eq!(B1, B1.by(8).unwrap().lane_type()); @@ -390,10 +400,12 @@ mod tests { assert_eq!(B16.lane_bits(), 16); assert_eq!(B32.lane_bits(), 32); assert_eq!(B64.lane_bits(), 64); + assert_eq!(B128.lane_bits(), 128); assert_eq!(I8.lane_bits(), 8); assert_eq!(I16.lane_bits(), 16); assert_eq!(I32.lane_bits(), 32); assert_eq!(I64.lane_bits(), 64); + assert_eq!(I128.lane_bits(), 128); assert_eq!(F32.lane_bits(), 32); assert_eq!(F64.lane_bits(), 64); assert_eq!(R32.lane_bits(), 32); @@ -410,11 +422,13 @@ mod tests { assert_eq!(B16.half_width(), Some(B8)); assert_eq!(B32.half_width(), Some(B16)); assert_eq!(B64.half_width(), Some(B32)); + assert_eq!(B128.half_width(), Some(B64)); assert_eq!(I8.half_width(), None); assert_eq!(I16.half_width(), Some(I8)); assert_eq!(I32.half_width(), Some(I16)); assert_eq!(I32X4.half_width(), Some(I16X4)); assert_eq!(I64.half_width(), Some(I32)); + assert_eq!(I128.half_width(), Some(I64)); assert_eq!(F32.half_width(), None); assert_eq!(F64.half_width(), Some(F32)); @@ -425,12 +439,14 @@ mod tests { assert_eq!(B8.double_width(), Some(B16)); assert_eq!(B16.double_width(), Some(B32)); assert_eq!(B32.double_width(), Some(B64)); - assert_eq!(B64.double_width(), None); + assert_eq!(B64.double_width(), Some(B128)); + assert_eq!(B128.double_width(), None); assert_eq!(I8.double_width(), Some(I16)); assert_eq!(I16.double_width(), Some(I32)); assert_eq!(I32.double_width(), Some(I64)); assert_eq!(I32X4.double_width(), Some(I64X4)); - assert_eq!(I64.double_width(), None); + assert_eq!(I64.double_width(), Some(I128)); + assert_eq!(I128.double_width(), None); assert_eq!(F32.double_width(), Some(F64)); assert_eq!(F64.double_width(), None); } @@ -461,10 +477,12 @@ mod tests { assert_eq!(B16.to_string(), "b16"); assert_eq!(B32.to_string(), "b32"); assert_eq!(B64.to_string(), "b64"); 
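The `B128 | I128 => 7` arm added in the first `impl Type` hunk above follows the existing pattern: the stored value is log2 of the lane width in bits (8 maps to 3, 64 to 6, and so on). A one-liner making that relationship explicit:

```rust
// log2 of a power-of-two lane width; a 128-bit lane encodes as 7.
fn log2_lane_bits(lane_bits: u32) -> u32 {
    lane_bits.trailing_zeros() // exact for power-of-two widths
}

fn main() {
    assert_eq!(log2_lane_bits(8), 3);
    assert_eq!(log2_lane_bits(64), 6);
    assert_eq!(log2_lane_bits(128), 7); // the newly added case
}
```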
+ assert_eq!(B128.to_string(), "b128"); assert_eq!(I8.to_string(), "i8"); assert_eq!(I16.to_string(), "i16"); assert_eq!(I32.to_string(), "i32"); assert_eq!(I64.to_string(), "i64"); + assert_eq!(I128.to_string(), "i128"); assert_eq!(F32.to_string(), "f32"); assert_eq!(F64.to_string(), "f64"); assert_eq!(R32.to_string(), "r32"); diff --git a/cranelift-codegen/src/legalizer/mod.rs b/cranelift-codegen/src/legalizer/mod.rs index e6f7bcb00..0f24689d8 100644 --- a/cranelift-codegen/src/legalizer/mod.rs +++ b/cranelift-codegen/src/legalizer/mod.rs @@ -21,6 +21,8 @@ use crate::ir::{self, InstBuilder, MemFlags}; use crate::isa::TargetIsa; use crate::predicates; use crate::timing; +use std::collections::BTreeSet; +use std::vec::Vec; mod boundary; mod call; @@ -36,31 +38,78 @@ use self::heap::expand_heap_addr; use self::libcall::expand_as_libcall; use self::table::expand_table_addr; -/// Legalize `inst` for `isa`. Return true if any changes to the code were -/// made; return false if the instruction was successfully encoded as is. +enum LegalizeInstResult { + Done, + Legalized, + SplitLegalizePending, +} + +/// Legalize `inst` for `isa`. fn legalize_inst( inst: ir::Inst, pos: &mut FuncCursor, cfg: &mut ControlFlowGraph, isa: &dyn TargetIsa, -) -> bool { +) -> LegalizeInstResult { let opcode = pos.func.dfg[inst].opcode(); // Check for ABI boundaries that need to be converted to the legalized signature. if opcode.is_call() { if boundary::handle_call_abi(inst, pos.func, cfg) { - return true; + return LegalizeInstResult::Legalized; } } else if opcode.is_return() { if boundary::handle_return_abi(inst, pos.func, cfg) { - return true; + return LegalizeInstResult::Legalized; } } else if opcode.is_branch() { split::simplify_branch_arguments(&mut pos.func.dfg, inst); + } else if opcode == ir::Opcode::Isplit { + pos.use_srcloc(inst); + + let arg = match pos.func.dfg[inst] { + ir::InstructionData::Unary { arg, .. } => pos.func.dfg.resolve_aliases(arg), + _ => panic!("Expected isplit: {}", pos.func.dfg.display_inst(inst, None)), + }; + + match pos.func.dfg.value_def(arg) { + ir::ValueDef::Result(inst, _num) => { + if let ir::InstructionData::Binary { + opcode: ir::Opcode::Iconcat, + .. + } = pos.func.dfg[inst] + { + // `arg` was created by an `iconcat` instruction. + } else { + // `arg` was not created by an `iconcat` instruction. Don't try to resolve it, + // as otherwise `split::isplit` will re-insert the original `isplit`, causing + // an endless loop. + return LegalizeInstResult::SplitLegalizePending; + } + } + ir::ValueDef::Param(_ebb, _num) => {} + } + + let res = pos.func.dfg.inst_results(inst).to_vec(); + assert_eq!(res.len(), 2); + let (resl, resh) = (res[0], res[1]); // Prevent borrowck error + + // Remove old isplit + pos.func.dfg.clear_results(inst); + pos.remove_inst(); + + let curpos = pos.position(); + let srcloc = pos.srcloc(); + let (xl, xh) = split::isplit(pos.func, cfg, curpos, srcloc, arg); + + pos.func.dfg.change_to_alias(resl, xl); + pos.func.dfg.change_to_alias(resh, xh); + + return LegalizeInstResult::Legalized; } match pos.func.update_encoding(inst, isa) { - Ok(()) => false, + Ok(()) => LegalizeInstResult::Done, Err(action) => { // We should transform the instruction into legal equivalents. // If the current instruction was replaced, we need to double back and revisit @@ -69,12 +118,16 @@ fn legalize_inst( // There's a risk of infinite looping here if the legalization patterns are // unsound. Should we attempt to detect that? 
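On the new `Opcode::Isplit` handling in `legalize_inst` above: when the operand was produced by an `iconcat`, the split can simply cancel against it; when it was not, expanding immediately would re-insert the same `isplit` and loop forever, so the instruction is deferred via `SplitLegalizePending` and retried later. A schematic sketch of that decision, with a made-up `ValueDef` enum standing in for the DFG query:

```rust
#[derive(Debug)]
enum ValueDef {
    Iconcat { lo: u64, hi: u64 },
    Other(u128),
}

// If the 128-bit value was built as `iconcat lo, hi`, reuse the halves; otherwise
// defer, so the split is not endlessly re-inserted.
fn try_resolve_isplit(def: &ValueDef) -> Option<(u64, u64)> {
    match def {
        ValueDef::Iconcat { lo, hi } => Some((*lo, *hi)),
        ValueDef::Other(_) => None, // SplitLegalizePending in the real legalizer
    }
}

fn main() {
    assert_eq!(try_resolve_isplit(&ValueDef::Iconcat { lo: 1, hi: 2 }), Some((1, 2)));
    assert_eq!(try_resolve_isplit(&ValueDef::Other(42)), None);
}
```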
if action(inst, pos.func, cfg, isa) { - return true; + return LegalizeInstResult::Legalized; } // We don't have any pattern expansion for this instruction either. // Try converting it to a library call as a last resort. - expand_as_libcall(inst, pos.func, isa) + if expand_as_libcall(inst, pos.func, isa) { + LegalizeInstResult::Legalized + } else { + LegalizeInstResult::Done + } } } } @@ -94,24 +147,42 @@ pub fn legalize_function(func: &mut ir::Function, cfg: &mut ControlFlowGraph, is let mut pos = FuncCursor::new(func); + // This must be a set to prevent trying to legalize `isplit` and `vsplit` twice in certain cases. + let mut pending_splits = BTreeSet::new(); + // Process EBBs in layout order. Some legalization actions may split the current EBB or append // new ones to the end. We need to make sure we visit those new EBBs too. - while let Some(_ebb) = pos.next_ebb() { + while let Some(ebb) = pos.next_ebb() { + split::split_ebb_params(pos.func, cfg, ebb); + // Keep track of the cursor position before the instruction being processed, so we can // double back when replacing instructions. let mut prev_pos = pos.position(); while let Some(inst) = pos.next_inst() { - if legalize_inst(inst, &mut pos, cfg, isa) { - // Go back and legalize the inserted return value conversion instructions. - pos.set_position(prev_pos); - } else { + match legalize_inst(inst, &mut pos, cfg, isa) { // Remember this position in case we need to double back. - prev_pos = pos.position(); + LegalizeInstResult::Done => prev_pos = pos.position(), + + // Go back and legalize the inserted return value conversion instructions. + LegalizeInstResult::Legalized => pos.set_position(prev_pos), + + // The argument of a `isplit` or `vsplit` instruction didn't resolve to a + // `iconcat` or `vconcat` instruction. Try again after legalizing the rest of + // the instructions. + LegalizeInstResult::SplitLegalizePending => { + pending_splits.insert(inst); + } } } } + // Try legalizing `isplit` and `vsplit` instructions, which could not previously be legalized. + for inst in pending_splits { + pos.goto_inst(inst); + legalize_inst(inst, &mut pos, cfg, isa); + } + // Now that we've lowered all br_tables, we don't need the jump tables anymore. if !isa.flags().jump_tables_enabled() { pos.func.jump_tables.clear(); @@ -498,3 +569,67 @@ fn expand_stack_store( mflags.set_aligned(); pos.func.dfg.replace(inst).store(mflags, val, addr, 0); } + +/// Split a load into two parts before `iconcat`ing the result together. +fn narrow_load( + inst: ir::Inst, + func: &mut ir::Function, + _cfg: &mut ControlFlowGraph, + _isa: &dyn TargetIsa, +) { + let mut pos = FuncCursor::new(func).at_inst(inst); + pos.use_srcloc(inst); + + let (ptr, offset, flags) = match pos.func.dfg[inst] { + ir::InstructionData::Load { + opcode: ir::Opcode::Load, + arg, + offset, + flags, + } => (arg, offset, flags), + _ => panic!("Expected load: {}", pos.func.dfg.display_inst(inst, None)), + }; + + let res_ty = pos.func.dfg.ctrl_typevar(inst); + let small_ty = res_ty.half_width().expect("Can't narrow load"); + + let al = pos.ins().load(small_ty, flags, ptr, offset); + let ah = pos.ins().load( + small_ty, + flags, + ptr, + offset.try_add_i64(8).expect("load offset overflow"), + ); + pos.func.dfg.replace(inst).iconcat(al, ah); +} + +/// Split a store into two parts after `isplit`ing the value. 
+fn narrow_store( + inst: ir::Inst, + func: &mut ir::Function, + _cfg: &mut ControlFlowGraph, + _isa: &dyn TargetIsa, +) { + let mut pos = FuncCursor::new(func).at_inst(inst); + pos.use_srcloc(inst); + + let (val, ptr, offset, flags) = match pos.func.dfg[inst] { + ir::InstructionData::Store { + opcode: ir::Opcode::Store, + args, + offset, + flags, + } => (args[0], args[1], offset, flags), + _ => panic!("Expected store: {}", pos.func.dfg.display_inst(inst, None)), + }; + + let (al, ah) = pos.ins().isplit(val); + pos.ins().store(flags, al, ptr, offset); + pos.ins().store( + flags, + ah, + ptr, + offset.try_add_i64(8).expect("store offset overflow"), + ); + pos.remove_inst(); +} diff --git a/cranelift-codegen/src/legalizer/split.rs b/cranelift-codegen/src/legalizer/split.rs index 773df1321..f16dae161 100644 --- a/cranelift-codegen/src/legalizer/split.rs +++ b/cranelift-codegen/src/legalizer/split.rs @@ -124,6 +124,35 @@ fn split_any( let pos = &mut FuncCursor::new(func).at_position(pos).with_srcloc(srcloc); let result = split_value(pos, value, concat, &mut repairs); + perform_repairs(pos, cfg, repairs); + + result +} + +pub fn split_ebb_params(func: &mut ir::Function, cfg: &ControlFlowGraph, ebb: Ebb) { + let mut repairs = Vec::new(); + let pos = &mut FuncCursor::new(func).at_top(ebb); + + for (num, ebb_param) in pos + .func + .dfg + .ebb_params(ebb) + .to_vec() + .into_iter() + .enumerate() + { + let ty = pos.func.dfg.value_type(ebb_param); + if ty != ir::types::I128 { + continue; + } + + split_ebb_param(pos, ebb, num, ebb_param, Opcode::Iconcat, &mut repairs); + } + + perform_repairs(pos, cfg, repairs); +} + +fn perform_repairs(pos: &mut FuncCursor, cfg: &ControlFlowGraph, mut repairs: Vec) { // We have split the value requested, and now we may need to fix some EBB predecessors. while let Some(repair) = repairs.pop() { for BasicBlock { inst, .. } in cfg.pred_iter(repair.ebb) { @@ -181,8 +210,6 @@ fn split_any( pos.func.dfg[inst].put_value_list(args); } } - - result } /// Split a single value using the integer or vector semantics given by the `concat` opcode. @@ -215,40 +242,7 @@ fn split_value( // This is an EBB parameter. We can split the parameter value unless this is the entry // block. if pos.func.layout.entry_block() != Some(ebb) { - // We are going to replace the parameter at `num` with two new arguments. - // Determine the new value types. - let ty = pos.func.dfg.value_type(value); - let split_type = match concat { - Opcode::Iconcat => ty.half_width().expect("Invalid type for isplit"), - Opcode::Vconcat => ty.half_vector().expect("Invalid type for vsplit"), - _ => panic!("Unhandled concat opcode: {}", concat), - }; - - // Since the `repairs` stack potentially contains other parameter numbers for - // `ebb`, avoid shifting and renumbering EBB parameters. It could invalidate other - // `repairs` entries. - // - // Replace the original `value` with the low part, and append the high part at the - // end of the argument list. - let lo = pos.func.dfg.replace_ebb_param(value, split_type); - let hi_num = pos.func.dfg.num_ebb_params(ebb); - let hi = pos.func.dfg.append_ebb_param(ebb, split_type); - reuse = Some((lo, hi)); - - // Now the original value is dangling. Insert a concatenation instruction that can - // compute it from the two new parameters. This also serves as a record of what we - // did so a future call to this function doesn't have to redo the work. 
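The `narrow_load`/`narrow_store` custom legalizations above turn a 128-bit memory access into two 64-bit accesses, with the high half at byte offset +8 (a little-endian layout, as on the x86 targets this is aimed at). A byte-level round-trip sketch of the same idea, independent of Cranelift's APIs:

```rust
fn load64(mem: &[u8], offset: usize) -> u64 {
    let mut bytes = [0u8; 8];
    bytes.copy_from_slice(&mem[offset..offset + 8]);
    u64::from_le_bytes(bytes)
}

fn store64(mem: &mut [u8], offset: usize, value: u64) {
    mem[offset..offset + 8].copy_from_slice(&value.to_le_bytes());
}

// narrow_load: two 64-bit loads, then iconcat(lo, hi).
fn narrow_load128(mem: &[u8], offset: usize) -> u128 {
    let lo = load64(mem, offset);
    let hi = load64(mem, offset + 8); // offset.try_add_i64(8) in the real code
    (lo as u128) | ((hi as u128) << 64)
}

// narrow_store: isplit the value, then two 64-bit stores.
fn narrow_store128(mem: &mut [u8], offset: usize, value: u128) {
    store64(mem, offset, value as u64);
    store64(mem, offset + 8, (value >> 64) as u64);
}

fn main() {
    let value = 0x0011_2233_4455_6677_8899_aabb_ccdd_eeffu128;
    let mut mem = [0u8; 16];
    narrow_store128(&mut mem, 0, value);
    assert_eq!(narrow_load128(&mem, 0), value);
}
```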
- // - // Note that it is safe to move `pos` here since `reuse` was set above, so we don't - // need to insert a split instruction before returning. - pos.goto_first_inst(ebb); - pos.ins() - .with_result(value) - .Binary(concat, split_type, lo, hi); - - // Finally, splitting the EBB parameter is not enough. We also have to repair all - // of the predecessor instructions that branch here. - add_repair(concat, split_type, ebb, num, hi_num, repairs); + reuse = Some(split_ebb_param(pos, ebb, num, value, concat, repairs)); } } } @@ -267,6 +261,51 @@ fn split_value( } } +fn split_ebb_param( + pos: &mut FuncCursor, + ebb: Ebb, + param_num: usize, + value: Value, + concat: Opcode, + repairs: &mut Vec, +) -> (Value, Value) { + // We are going to replace the parameter at `num` with two new arguments. + // Determine the new value types. + let ty = pos.func.dfg.value_type(value); + let split_type = match concat { + Opcode::Iconcat => ty.half_width().expect("Invalid type for isplit"), + Opcode::Vconcat => ty.half_vector().expect("Invalid type for vsplit"), + _ => panic!("Unhandled concat opcode: {}", concat), + }; + + // Since the `repairs` stack potentially contains other parameter numbers for + // `ebb`, avoid shifting and renumbering EBB parameters. It could invalidate other + // `repairs` entries. + // + // Replace the original `value` with the low part, and append the high part at the + // end of the argument list. + let lo = pos.func.dfg.replace_ebb_param(value, split_type); + let hi_num = pos.func.dfg.num_ebb_params(ebb); + let hi = pos.func.dfg.append_ebb_param(ebb, split_type); + + // Now the original value is dangling. Insert a concatenation instruction that can + // compute it from the two new parameters. This also serves as a record of what we + // did so a future call to this function doesn't have to redo the work. + // + // Note that it is safe to move `pos` here since `reuse` was set above, so we don't + // need to insert a split instruction before returning. + pos.goto_first_inst(ebb); + pos.ins() + .with_result(value) + .Binary(concat, split_type, lo, hi); + + // Finally, splitting the EBB parameter is not enough. We also have to repair all + // of the predecessor instructions that branch here. + add_repair(concat, split_type, ebb, param_num, hi_num, repairs); + + (lo, hi) +} + // Add a repair entry to the work list. fn add_repair( concat: Opcode, diff --git a/cranelift-codegen/src/regalloc/reload.rs b/cranelift-codegen/src/regalloc/reload.rs index fb6b61ec6..bbc198c45 100644 --- a/cranelift-codegen/src/regalloc/reload.rs +++ b/cranelift-codegen/src/regalloc/reload.rs @@ -233,7 +233,7 @@ impl<'a> Context<'a> { let dst_ty = self.cur.func.dfg.value_type(dst_val); debug_assert!(src_ty == dst_ty); // This limits the transformation to copies of the - // types: I64 I32 I16 I8 F64 and F32, since that's + // types: I128 I64 I32 I16 I8 F64 and F32, since that's // the set of `copy_nop` encodings available. 
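The extracted `split_ebb_param` above replaces the i128 parameter in place with its low half and appends the high half at the end of the parameter list, so pending repair entries keep their indices; every predecessor is then queued for repair. A tiny model of just that bookkeeping, omitting the `iconcat` that is inserted at the top of the EBB to rebuild the original value:

```rust
#[derive(Clone, Debug, PartialEq)]
enum Ty {
    I64,
    I128,
}

struct Repair {
    param_num: usize,
    hi_num: usize,
}

fn split_ebb_param(params: &mut Vec<Ty>, num: usize, repairs: &mut Vec<Repair>) {
    assert_eq!(params[num], Ty::I128);
    params[num] = Ty::I64; // the low half replaces the original parameter in place
    let hi_num = params.len();
    params.push(Ty::I64); // the high half is appended, so earlier indices stay valid
    repairs.push(Repair { param_num: num, hi_num });
}

fn main() {
    let mut params = vec![Ty::I64, Ty::I128];
    let mut repairs = Vec::new();
    split_ebb_param(&mut params, 1, &mut repairs);

    assert_eq!(params, vec![Ty::I64, Ty::I64, Ty::I64]);
    // Each predecessor branching to this EBB still has to be repaired to pass
    // an extra argument for the appended high half.
    assert_eq!((repairs[0].param_num, repairs[0].hi_num), (1, 2));
}
```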
src_ty.is_int() || src_ty.is_float() } diff --git a/cranelift-reader/src/lexer.rs b/cranelift-reader/src/lexer.rs index 465e79e97..2432ab055 100644 --- a/cranelift-reader/src/lexer.rs +++ b/cranelift-reader/src/lexer.rs @@ -365,6 +365,7 @@ impl<'a> Lexer<'a> { "i16" => types::I16, "i32" => types::I32, "i64" => types::I64, + "i128" => types::I128, "f32" => types::F32, "f64" => types::F64, "b1" => types::B1, @@ -372,6 +373,7 @@ impl<'a> Lexer<'a> { "b16" => types::B16, "b32" => types::B32, "b64" => types::B64, + "b128" => types::B128, "r32" => types::R32, "r64" => types::R64, _ => return None, diff --git a/filetests/isa/x86/br-i128.clif b/filetests/isa/x86/br-i128.clif new file mode 100644 index 000000000..a09db3f41 --- /dev/null +++ b/filetests/isa/x86/br-i128.clif @@ -0,0 +1,24 @@ +test compile +target x86_64 + +function u0:0(i128) -> i8 fast { +ebb0(v0: i128): + brz v0, ebb1 + v1 = iconst.i8 0 + return v1 + +ebb1: + v2 = iconst.i8 1 + return v2 +} + +function u0:1(i128) -> i8 fast { +ebb0(v0: i128): + brnz v0, ebb1 + v1 = iconst.i8 0 + return v1 + +ebb1: + v2 = iconst.i8 1 + return v2 +} diff --git a/filetests/isa/x86/i128.clif b/filetests/isa/x86/i128.clif new file mode 100644 index 000000000..b710a7430 --- /dev/null +++ b/filetests/isa/x86/i128.clif @@ -0,0 +1,46 @@ +test compile +target x86_64 + +function u0:0(i64, i64) -> i128 fast { +ebb0(v0: i64, v1: i64): +;check: ebb0(v0: i64 [%rdi], v1: i64 [%rsi], v3: i64 [%rbp]): + + v2 = iconcat.i64 v0, v1 + ; check: regmove v0, %rdi -> %rax + ; check: regmove v1, %rsi -> %rdx + + return v2 + ; check: v4 = x86_pop.i64 + ; check: return v0, v1, v4 +} + +function u0:1(i128) -> i64, i64 fast { +ebb0(v0: i128): +; check: ebb0(v3: i64 [%rdi], v4: i64 [%rsi], v5: i64 [%rbp]): + + v1, v2 = isplit v0 + ; check: regmove v3, %rdi -> %rax + ; check: regmove v4, %rsi -> %rdx + + return v1, v2 + ; check: v6 = x86_pop.i64 + ; check: return v3, v4, v6 +} + +function u0:2(i64, i128) fast { +; check: ebb0(v0: i64 [%rdi], v2: i64 [%rsi], v3: i64 [%rdx], v6: i64 [%rbp]): +ebb0(v0: i64, v1: i128): + ; check: store v2, v0+8 + ; check: store v3, v0+16 + store v1, v0+8 + return +} + +function u0:3(i64) -> i128 fast { +ebb0(v0: i64): + ; check: v2 = load.i64 v0+8 + ; check: v3 = load.i64 v0+16 + v1 = load.i128 v0+8 + ; check: return v2, v3, v5 + return v1 +} diff --git a/filetests/isa/x86/isplit-not-legalized-twice.clif b/filetests/isa/x86/isplit-not-legalized-twice.clif new file mode 100644 index 000000000..4b81a186d --- /dev/null +++ b/filetests/isa/x86/isplit-not-legalized-twice.clif @@ -0,0 +1,20 @@ +test compile +target x86_64 + +function u0:0(i64, i64) -> i128 system_v { +ebb0(v0: i64, v1: i64): + trap user0 + +ebb30: + v245 = iconst.i64 0 + v246 = iconcat v245, v245 + ; The next instruction used to be legalized twice, causing a panic the second time. 
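The lexer change above is purely table-driven: the type-keyword map gains `i128` and `b128` entries. A minimal stand-in, where the string results replace the real `types::I128`/`types::B128` constants:

```rust
// Sketch of the keyword-to-type lookup with the two new entries.
fn type_for_keyword(word: &str) -> Option<&'static str> {
    Some(match word {
        "i64" => "I64",
        "i128" => "I128", // new
        "b64" => "B64",
        "b128" => "B128", // new
        _ => return None,
    })
}

fn main() {
    assert_eq!(type_for_keyword("i128"), Some("I128"));
    assert_eq!(type_for_keyword("i256"), None);
}
```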
+ v250, v251 = isplit.i128 v370 + v252, v253 = isplit v246 + trap user0 + +ebb45: + v369 = iconst.i64 0 + v370 = load.i128 v369 + trap user0 +} diff --git a/filetests/isa/x86/jump_i128_param_unused.clif b/filetests/isa/x86/jump_i128_param_unused.clif new file mode 100644 index 000000000..9d96fcbe3 --- /dev/null +++ b/filetests/isa/x86/jump_i128_param_unused.clif @@ -0,0 +1,10 @@ +test compile +target x86_64 + +function u0:0(i128) system_v { +ebb0(v0: i128): + jump ebb1(v0) + +ebb1(v1: i128): + return +} diff --git a/filetests/isa/x86/legalize-isplit-backwards.clif b/filetests/isa/x86/legalize-isplit-backwards.clif new file mode 100644 index 000000000..43881fe09 --- /dev/null +++ b/filetests/isa/x86/legalize-isplit-backwards.clif @@ -0,0 +1,24 @@ +test compile +target x86_64 + +function u0:0(i128) -> i64, i64 fast { +; check: ebb0(v4: i64 [%rdi], v5: i64 [%rsi], v8: i64 [%rbp]): +ebb0(v0: i128): + jump ebb2 + +ebb1: + ; When this `isplit` is legalized, the bnot below is not yet legalized, + ; so there isn't a corresponding `iconcat` yet. We should try legalization + ; for this `isplit` again once all instrucions have been legalized. + v2, v3 = isplit.i128 v1 + ; return v6, v7 + return v2, v3 + +ebb2: + ; check: v6 = bnot.i64 v4 + ; check: v2 -> v6 + ; check: v7 = bnot.i64 v5 + ; check: v3 -> v7 + v1 = bnot.i128 v0 + jump ebb1 +} diff --git a/filetests/isa/x86/load-store-narrow.clif b/filetests/isa/x86/load-store-narrow.clif new file mode 100644 index 000000000..5f95b92fc --- /dev/null +++ b/filetests/isa/x86/load-store-narrow.clif @@ -0,0 +1,16 @@ +test compile +target i686 + +function u0:0(i64, i32) system_v { +ebb0(v0: i64, v1: i32): + v2 = bor v0, v0 + store v2, v1 + return +} + +function u0:1(i32) -> i64 system_v { +ebb0(v1: i32): + v0 = load.i64 v1 + v2 = bor v0, v0 + return v2 +}