Skip to content

Commit

Permalink
[WebAssembly] Add wasm-specific vector shuffle builtin and intrinsic
Browse files Browse the repository at this point in the history
Summary:

Although using `__builtin_shufflevector` and the `shufflevector`
instruction works fine, they are not opaque to the optimizer. As a
result, DAGCombine can potentially reduce the number of shuffles and
change the shuffle masks. This is unexpected behavior for users of the
WebAssembly SIMD intrinsics who have crafted their shuffles to
optimize the code generated by engines. This patch solves the problem
by adding a new shuffle intrinsic that is opaque to the optimizers in
line with the decision of the WebAssembly SIMD contributors at
WebAssembly/simd#196 (comment). In
the future we may implement custom DAG combines to properly optimize
shuffles and replace this solution.

Reviewers: aheejin, dschuff

Subscribers: sbc100, jgravelle-google, hiraditya, sunfish, cfe-commits, llvm-commits

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D66983
  • Loading branch information
tlively authored and arichardson committed Jul 2, 2020
2 parents 1879266 + 8e3e56f commit 076d0ed
Show file tree
Hide file tree
Showing 7 changed files with 97 additions and 12 deletions.
1 change: 1 addition & 0 deletions clang/include/clang/Basic/BuiltinsWebAssembly.def
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ TARGET_BUILTIN(__builtin_wasm_avgr_u_i8x16, "V16cV16cV16c", "nc", "simd128")
TARGET_BUILTIN(__builtin_wasm_avgr_u_i16x8, "V8sV8sV8s", "nc", "simd128")

TARGET_BUILTIN(__builtin_wasm_bitselect, "V4iV4iV4iV4i", "nc", "simd128")
TARGET_BUILTIN(__builtin_wasm_shuffle_v8x16, "V16cV16cV16cIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIi", "nc", "simd128")

TARGET_BUILTIN(__builtin_wasm_any_true_i8x16, "iV16c", "nc", "simd128")
TARGET_BUILTIN(__builtin_wasm_any_true_i16x8, "iV8s", "nc", "simd128")
Expand Down
14 changes: 14 additions & 0 deletions clang/lib/CodeGen/CGBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16374,6 +16374,20 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
CGM.getIntrinsic(IntNo, {ConvertType(E->getType()), Vec->getType()});
return Builder.CreateCall(Callee, Vec);
}
case WebAssembly::BI__builtin_wasm_shuffle_v8x16: {
  // Emit a call to the wasm.shuffle intrinsic. The operand list is the two
  // vector arguments followed by the sixteen lane-index arguments, each of
  // which is folded to an integer constant before being passed on.
  constexpr size_t NumShuffleArgs = 18;
  Value *Ops[NumShuffleArgs];
  Ops[0] = EmitScalarExpr(E->getArg(0));
  Ops[1] = EmitScalarExpr(E->getArg(1));
  for (size_t ArgNo = 2; ArgNo != NumShuffleArgs; ++ArgNo) {
    llvm::APSInt LaneConst;
    const bool IsConstant =
        E->getArg(ArgNo)->isIntegerConstantExpr(LaneConst, getContext());
    // A lane argument that fails constant evaluation is treated as
    // unreachable here.
    if (!IsConstant)
      llvm_unreachable("Constant arg isn't actually constant?");
    Ops[ArgNo] = llvm::ConstantInt::get(getLLVMContext(), LaneConst);
  }
  Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_shuffle);
  return Builder.CreateCall(Callee, Ops);
}
default:
return nullptr;
}
Expand Down
28 changes: 18 additions & 10 deletions clang/lib/Headers/wasm_simd128.h
Original file line number Diff line number Diff line change
Expand Up @@ -1020,23 +1020,31 @@ wasm_f32x4_convert_u32x4(v128_t __a) {
// Byte-granularity shuffle of two 128-bit vectors. __a and __b are cast to
// 16-lane byte vectors and __c0..__c15 are forwarded unchanged as the sixteen
// lane selectors. NOTE(review): this hunk is a flattened diff, so two bodies
// appear back to back; the __builtin_shufflevector form is the one being
// replaced and the __builtin_wasm_shuffle_v8x16 form is its replacement.
#define wasm_v8x16_shuffle(__a, __b, __c0, __c1, __c2, __c3, __c4, __c5, __c6, \
__c7, __c8, __c9, __c10, __c11, __c12, __c13, \
__c14, __c15) \
((v128_t)(__builtin_shufflevector( \
(__u8x16)(__a), (__u8x16)(__b), __c0, __c1, __c2, __c3, __c4, __c5, \
__c6, __c7, __c8, __c9, __c10, __c11, __c12, __c13, __c14, __c15)))
((v128_t)__builtin_wasm_shuffle_v8x16( \
(__i8x16)(__a), (__i8x16)(__b), __c0, __c1, __c2, __c3, __c4, __c5, \
__c6, __c7, __c8, __c9, __c10, __c11, __c12, __c13, __c14, __c15))

// Shuffle the 16-bit lanes of two 128-bit vectors. Each 16-bit lane selector
// __cN is widened into the two byte selectors (__cN) * 2 and (__cN) * 2 + 1
// and handed to the byte-level shuffle builtin. Every selector is wrapped in
// parentheses so that an expression argument such as `1 + 1` is scaled as a
// whole instead of being split by operator precedence.
#define wasm_v16x8_shuffle(__a, __b, __c0, __c1, __c2, __c3, __c4, __c5, __c6, \
                           __c7)                                               \
  ((v128_t)__builtin_wasm_shuffle_v8x16(                                       \
      (__i8x16)(__a), (__i8x16)(__b), (__c0) * 2, (__c0) * 2 + 1, (__c1) * 2,  \
      (__c1) * 2 + 1, (__c2) * 2, (__c2) * 2 + 1, (__c3) * 2, (__c3) * 2 + 1,  \
      (__c4) * 2, (__c4) * 2 + 1, (__c5) * 2, (__c5) * 2 + 1, (__c6) * 2,      \
      (__c6) * 2 + 1, (__c7) * 2, (__c7) * 2 + 1))

// Shuffle the 32-bit lanes of two 128-bit vectors. Each 32-bit lane selector
// __cN is widened into the four byte selectors (__cN) * 4 .. (__cN) * 4 + 3
// and handed to the byte-level shuffle builtin. Every selector is wrapped in
// parentheses so that an expression argument is scaled as a whole instead of
// being split by operator precedence.
#define wasm_v32x4_shuffle(__a, __b, __c0, __c1, __c2, __c3)                   \
  ((v128_t)__builtin_wasm_shuffle_v8x16(                                       \
      (__i8x16)(__a), (__i8x16)(__b), (__c0) * 4, (__c0) * 4 + 1,              \
      (__c0) * 4 + 2, (__c0) * 4 + 3, (__c1) * 4, (__c1) * 4 + 1,              \
      (__c1) * 4 + 2, (__c1) * 4 + 3, (__c2) * 4, (__c2) * 4 + 1,              \
      (__c2) * 4 + 2, (__c2) * 4 + 3, (__c3) * 4, (__c3) * 4 + 1,              \
      (__c3) * 4 + 2, (__c3) * 4 + 3))

// Shuffle the 64-bit lanes of two 128-bit vectors. Each 64-bit lane selector
// __cN is widened into the eight byte selectors (__cN) * 8 .. (__cN) * 8 + 7
// and handed to the byte-level shuffle builtin. Every selector is wrapped in
// parentheses so that an expression argument is scaled as a whole instead of
// being split by operator precedence.
#define wasm_v64x2_shuffle(__a, __b, __c0, __c1)                               \
  ((v128_t)__builtin_wasm_shuffle_v8x16(                                       \
      (__i8x16)(__a), (__i8x16)(__b), (__c0) * 8, (__c0) * 8 + 1,              \
      (__c0) * 8 + 2, (__c0) * 8 + 3, (__c0) * 8 + 4, (__c0) * 8 + 5,          \
      (__c0) * 8 + 6, (__c0) * 8 + 7, (__c1) * 8, (__c1) * 8 + 1,              \
      (__c1) * 8 + 2, (__c1) * 8 + 3, (__c1) * 8 + 4, (__c1) * 8 + 5,          \
      (__c1) * 8 + 6, (__c1) * 8 + 7))

#ifdef __wasm_unimplemented_simd128__

Expand Down
9 changes: 9 additions & 0 deletions clang/test/CodeGen/builtins-wasm.c
Original file line number Diff line number Diff line change
Expand Up @@ -724,5 +724,14 @@ i32x4 widen_high_u_i32x4_i16x8(i16x8 v) {
// Codegen check: the swizzle builtin is expected to become a single call to
// @llvm.wasm.swizzle taking both vector arguments in order.
i8x16 swizzle_v8x16(i8x16 x, i8x16 y) {
return __builtin_wasm_swizzle_v8x16(x, y);
// WEBASSEMBLY: call <16 x i8> @llvm.wasm.swizzle(<16 x i8> %x, <16 x i8> %y)
}

// Codegen check: the shuffle builtin with lane indices 0 through 15 is
// expected to become a call to @llvm.wasm.shuffle that carries both vectors
// followed by the same sixteen indices as i32 constants, and whose result is
// returned directly.
i8x16 shuffle(i8x16 x, i8x16 y) {
return __builtin_wasm_shuffle_v8x16(x, y, 0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15);
// WEBASSEMBLY: call <16 x i8> @llvm.wasm.shuffle(<16 x i8> %x, <16 x i8> %y,
// WEBASSEMBLY-SAME: i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
// WEBASSEMBLY-SAME: i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14,
// WEBASSEMBLY-SAME: i32 15
// WEBASSEMBLY-NEXT: ret
}
9 changes: 7 additions & 2 deletions llvm/include/llvm/IR/IntrinsicsWebAssembly.td
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,13 @@ def int_wasm_swizzle :
Intrinsic<[llvm_v16i8_ty],
[llvm_v16i8_ty, llvm_v16i8_ty],
[IntrNoMem, IntrSpeculatable]>;
// Byte shuffle intrinsic: returns a v16i8 and takes two v16i8 inputs followed
// by sixteen i32 operands, one per result lane. Declared IntrNoMem and
// IntrSpeculatable like the neighbouring swizzle intrinsic.
def int_wasm_shuffle :
Intrinsic<[llvm_v16i8_ty],
[llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty, llvm_i32_ty,
llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrSpeculatable]>;
def int_wasm_sub_saturate_signed :
Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMMatchType<0>],
Expand All @@ -116,7 +123,6 @@ def int_wasm_avgr_unsigned :
Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMMatchType<0>],
[IntrNoMem, IntrSpeculatable]>;

def int_wasm_bitselect :
Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
Expand Down Expand Up @@ -170,7 +176,6 @@ def int_wasm_widen_high_unsigned :
[llvm_anyvector_ty],
[IntrNoMem, IntrSpeculatable]>;


//===----------------------------------------------------------------------===//
// Bulk memory intrinsics
//===----------------------------------------------------------------------===//
Expand Down
18 changes: 18 additions & 0 deletions llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1354,6 +1354,24 @@ SDValue WebAssemblyTargetLowering::LowerIntrinsic(SDValue Op,
Op.getOperand(3) // thrown value
});
}

case Intrinsic::wasm_shuffle: {
  // Rebuild the operand list for a SHUFFLE node: the two input vectors come
  // first, then the sixteen lane indices. Operand 0 of the incoming node is
  // not forwarded. A lane index that is undef or >= 32 is replaced with the
  // constant 0; every other index is passed through as-is.
  constexpr size_t NumShuffleOps = 18;
  SDValue Ops[NumShuffleOps];
  Ops[0] = Op.getOperand(1);
  Ops[1] = Op.getOperand(2);
  for (size_t Slot = 2; Slot != NumShuffleOps; ++Slot) {
    // Incoming operands sit one position later than their slot in Ops.
    const SDValue &MaskIdx = Op.getOperand(Slot + 1);
    const bool KeepLane =
        !MaskIdx.isUndef() &&
        cast<ConstantSDNode>(MaskIdx.getNode())->getZExtValue() < 32;
    if (KeepLane)
      Ops[Slot] = MaskIdx;
    else
      Ops[Slot] = DAG.getConstant(0, DL, MVT::i32);
  }
  return DAG.getNode(WebAssemblyISD::SHUFFLE, DL, Op.getValueType(), Ops);
}
}
}

Expand Down
30 changes: 30 additions & 0 deletions llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,36 @@ define <16 x i8> @narrow_unsigned_v16i8(<8 x i16> %low, <8 x i16> %high) {
ret <16 x i8> %a
}

; Calls the shuffle intrinsic with lanes 0 through 14 followed by lane index
; 35. The expected instruction lists 0 in the final position, i.e. the
; too-large index is emitted as 0 while the other lanes are unchanged.
; CHECK-LABEL: shuffle_v16i8:
; NO-SIMD128-NOT: v8x16
; SIMD128-NEXT: .functype shuffle_v16i8 (v128, v128) -> (v128){{$}}
; SIMD128-NEXT: v8x16.shuffle $push[[R:[0-9]+]]=, $0, $1,
; SIMD128-SAME: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0{{$}}
; SIMD128-NEXT: return $pop[[R]]{{$}}
declare <16 x i8> @llvm.wasm.shuffle(
<16 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
i32, i32, i32, i32, i32)
define <16 x i8> @shuffle_v16i8(<16 x i8> %x, <16 x i8> %y) {
%res = call <16 x i8> @llvm.wasm.shuffle(<16 x i8> %x, <16 x i8> %y,
i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 35)
ret <16 x i8> %res
}

; Calls the shuffle intrinsic with lane 1 first, lane 2 last, and undef for
; the fourteen lanes in between. The expected instruction lists 0 for every
; undef lane and keeps the two defined lanes as written.
; CHECK-LABEL: shuffle_undef_v16i8:
; NO-SIMD128-NOT: v8x16
; SIMD128-NEXT: .functype shuffle_undef_v16i8 (v128, v128) -> (v128){{$}}
; SIMD128-NEXT: v8x16.shuffle $push[[R:[0-9]+]]=, $0, $1,
; SIMD128-SAME: 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2{{$}}
; SIMD128-NEXT: return $pop[[R]]{{$}}
define <16 x i8> @shuffle_undef_v16i8(<16 x i8> %x, <16 x i8> %y) {
%res = call <16 x i8> @llvm.wasm.shuffle(<16 x i8> %x, <16 x i8> %y,
i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 2)
ret <16 x i8> %res
}

; ==============================================================================
; 8 x i16
; ==============================================================================
Expand Down

0 comments on commit 076d0ed

Please sign in to comment.