From 91e09715dd4dd5c6120cbac97c10ae146409b0f3 Mon Sep 17 00:00:00 2001 From: sayantn Date: Mon, 8 Jul 2024 20:00:07 +0530 Subject: [PATCH] AVX512_FP16 Part 2: Complex Multiplication Using `(f16, f16)` as `_Float16 _Complex` --- crates/core_arch/missing-x86.md | 79 - crates/core_arch/src/x86/avx512fp16.rs | 4618 ++++++++++++++++------ crates/stdarch-verify/tests/x86-intel.rs | 2 + 3 files changed, 3467 insertions(+), 1232 deletions(-) diff --git a/crates/core_arch/missing-x86.md b/crates/core_arch/missing-x86.md index 7bc2456ddd..8e24308fe2 100644 --- a/crates/core_arch/missing-x86.md +++ b/crates/core_arch/missing-x86.md @@ -54,12 +54,9 @@
["AVX512_FP16"]

* [ ] [`_mm256_cvtsh_h`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsh_h) - * [ ] [`_mm256_set1_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_pch) * [ ] [`_mm512_abs_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_abs_ph) * [ ] [`_mm512_cmp_ph_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_ph_mask) * [ ] [`_mm512_cmp_round_ph_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_round_ph_mask) - * [ ] [`_mm512_cmul_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmul_pch) - * [ ] [`_mm512_cmul_round_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmul_round_pch) * [ ] [`_mm512_conj_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_conj_pch) * [ ] [`_mm512_cvt_roundepi16_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi16_ph) * [ ] [`_mm512_cvt_roundepi32_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi32_ph) @@ -108,8 +105,6 @@ * [ ] [`_mm512_cvtxps_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtxps_ph) * [ ] [`_mm512_fcmadd_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmadd_pch) * [ ] [`_mm512_fcmadd_round_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmadd_round_pch) - * [ ] [`_mm512_fcmul_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmul_pch) - * [ ] [`_mm512_fcmul_round_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmul_round_pch) * [ ] [`_mm512_fmadd_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_pch) * [ ] [`_mm512_fmadd_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_ph) * [ ] [`_mm512_fmadd_round_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_round_pch) @@ -120,8 +115,6 @@ * [ ] [`_mm512_fmsub_round_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsub_round_ph) * [ ] [`_mm512_fmsubadd_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsubadd_ph) * [ ] [`_mm512_fmsubadd_round_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsubadd_round_ph) - * [ ] [`_mm512_fmul_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmul_pch) - * [ ] [`_mm512_fmul_round_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmul_round_pch) * [ ] [`_mm512_fnmadd_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmadd_ph) * [ ] [`_mm512_fnmadd_round_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmadd_round_ph) * [ ] [`_mm512_fnmsub_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmsub_ph) @@ -150,8 +143,6 @@ * [ ] [`_mm512_mask_blend_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_ph) * [ ] [`_mm512_mask_cmp_ph_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_ph_mask) * [ ] [`_mm512_mask_cmp_round_ph_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_round_ph_mask) - * [ ] 
[`_mm512_mask_cmul_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmul_pch) - * [ ] [`_mm512_mask_cmul_round_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmul_round_pch) * [ ] [`_mm512_mask_conj_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_conj_pch) * [ ] [`_mm512_mask_cvt_roundepi16_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi16_ph) * [ ] [`_mm512_mask_cvt_roundepi32_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi32_ph) @@ -199,8 +190,6 @@ * [ ] [`_mm512_mask_cvtxps_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtxps_ph) * [ ] [`_mm512_mask_fcmadd_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmadd_pch) * [ ] [`_mm512_mask_fcmadd_round_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmadd_round_pch) - * [ ] [`_mm512_mask_fcmul_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmul_pch) - * [ ] [`_mm512_mask_fcmul_round_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmul_round_pch) * [ ] [`_mm512_mask_fmadd_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_pch) * [ ] [`_mm512_mask_fmadd_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_ph) * [ ] [`_mm512_mask_fmadd_round_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_round_pch) @@ -211,8 +200,6 @@ * [ ] [`_mm512_mask_fmsub_round_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsub_round_ph) * [ ] [`_mm512_mask_fmsubadd_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsubadd_ph) * [ ] [`_mm512_mask_fmsubadd_round_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsubadd_round_ph) - * [ ] [`_mm512_mask_fmul_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmul_pch) - * [ ] [`_mm512_mask_fmul_round_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmul_round_pch) * [ ] [`_mm512_mask_fnmadd_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmadd_ph) * [ ] [`_mm512_mask_fnmadd_round_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmadd_round_ph) * [ ] [`_mm512_mask_fnmsub_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmsub_ph) @@ -226,8 +213,6 @@ * [ ] [`_mm512_mask_max_round_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_round_ph) * [ ] [`_mm512_mask_min_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_ph) * [ ] [`_mm512_mask_min_round_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_round_ph) - * [ ] [`_mm512_mask_mul_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_pch) - * [ ] [`_mm512_mask_mul_round_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_round_pch) * [ ] [`_mm512_mask_rcp_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rcp_ph) * [ ] 
[`_mm512_mask_reduce_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_ph) * [ ] [`_mm512_mask_reduce_round_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_round_ph) @@ -238,8 +223,6 @@ * [ ] [`_mm512_mask_scalef_round_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_ph) * [ ] [`_mm512_mask_sqrt_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sqrt_ph) * [ ] [`_mm512_mask_sqrt_round_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sqrt_round_ph) - * [ ] [`_mm512_maskz_cmul_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cmul_pch) - * [ ] [`_mm512_maskz_cmul_round_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cmul_round_pch) * [ ] [`_mm512_maskz_conj_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_conj_pch) * [ ] [`_mm512_maskz_cvt_roundepi16_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi16_ph) * [ ] [`_mm512_maskz_cvt_roundepi32_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi32_ph) @@ -287,8 +270,6 @@ * [ ] [`_mm512_maskz_cvtxps_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtxps_ph) * [ ] [`_mm512_maskz_fcmadd_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmadd_pch) * [ ] [`_mm512_maskz_fcmadd_round_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmadd_round_pch) - * [ ] [`_mm512_maskz_fcmul_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmul_pch) - * [ ] [`_mm512_maskz_fcmul_round_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmul_round_pch) * [ ] [`_mm512_maskz_fmadd_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_pch) * [ ] [`_mm512_maskz_fmadd_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_ph) * [ ] [`_mm512_maskz_fmadd_round_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_round_pch) @@ -299,8 +280,6 @@ * [ ] [`_mm512_maskz_fmsub_round_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsub_round_ph) * [ ] [`_mm512_maskz_fmsubadd_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsubadd_ph) * [ ] [`_mm512_maskz_fmsubadd_round_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsubadd_round_ph) - * [ ] [`_mm512_maskz_fmul_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmul_pch) - * [ ] [`_mm512_maskz_fmul_round_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmul_round_pch) * [ ] [`_mm512_maskz_fnmadd_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmadd_ph) * [ ] [`_mm512_maskz_fnmadd_round_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmadd_round_ph) * [ ] [`_mm512_maskz_fnmsub_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmsub_ph) @@ -313,8 +292,6 @@ * [ ] 
[`_mm512_maskz_max_round_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_round_ph) * [ ] [`_mm512_maskz_min_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_ph) * [ ] [`_mm512_maskz_min_round_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_round_ph) - * [ ] [`_mm512_maskz_mul_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_pch) - * [ ] [`_mm512_maskz_mul_round_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_round_pch) * [ ] [`_mm512_maskz_rcp_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rcp_ph) * [ ] [`_mm512_maskz_reduce_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_reduce_ph) * [ ] [`_mm512_maskz_reduce_round_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_reduce_round_ph) @@ -329,8 +306,6 @@ * [ ] [`_mm512_max_round_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_round_ph) * [ ] [`_mm512_min_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_ph) * [ ] [`_mm512_min_round_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_round_ph) - * [ ] [`_mm512_mul_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_pch) - * [ ] [`_mm512_mul_round_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_round_pch) * [ ] [`_mm512_permutex2var_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex2var_ph) * [ ] [`_mm512_permutexvar_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutexvar_ph) * [ ] [`_mm512_rcp_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rcp_ph) @@ -345,13 +320,8 @@ * [ ] [`_mm512_rsqrt_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rsqrt_ph) * [ ] [`_mm512_scalef_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_ph) * [ ] [`_mm512_scalef_round_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_round_ph) - * [ ] [`_mm512_set1_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_pch) * [ ] [`_mm512_sqrt_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sqrt_ph) * [ ] [`_mm512_sqrt_round_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sqrt_round_ph) - * [ ] [`_mm_cmp_round_sh_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_round_sh_mask) - * [ ] [`_mm_cmp_sh_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_sh_mask) - * [ ] [`_mm_cmul_round_sch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_round_sch) - * [ ] [`_mm_cmul_sch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_sch) * [ ] [`_mm_cvt_roundi32_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundi32_sh) * [ ] [`_mm_cvt_roundi64_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundi64_sh) * [ ] [`_mm_cvt_roundsd_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsd_sh) @@ -389,16 +359,12 @@ * [ ] 
[`_mm_cvtu64_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu64_sh) * [ ] [`_mm_fcmadd_round_sch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_round_sch) * [ ] [`_mm_fcmadd_sch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_sch) - * [ ] [`_mm_fcmul_round_sch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_round_sch) - * [ ] [`_mm_fcmul_sch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_sch) * [ ] [`_mm_fmadd_round_sch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_sch) * [ ] [`_mm_fmadd_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_sh) * [ ] [`_mm_fmadd_sch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_sch) * [ ] [`_mm_fmadd_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_sh) * [ ] [`_mm_fmsub_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_round_sh) * [ ] [`_mm_fmsub_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_sh) - * [ ] [`_mm_fmul_round_sch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_round_sch) - * [ ] [`_mm_fmul_sch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_sch) * [ ] [`_mm_fnmadd_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_round_sh) * [ ] [`_mm_fnmadd_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_sh) * [ ] [`_mm_fnmsub_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_round_sh) @@ -420,10 +386,6 @@ * [ ] [`_mm_mask3_fnmadd_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_sh) * [ ] [`_mm_mask3_fnmsub_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_round_sh) * [ ] [`_mm_mask3_fnmsub_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_sh) - * [ ] [`_mm_mask_cmp_round_sh_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_round_sh_mask) - * [ ] [`_mm_mask_cmp_sh_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_sh_mask) - * [ ] [`_mm_mask_cmul_round_sch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_round_sch) - * [ ] [`_mm_mask_cmul_sch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_sch) * [ ] [`_mm_mask_cvt_roundsd_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsd_sh) * [ ] [`_mm_mask_cvt_roundsh_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsh_sd) * [ ] [`_mm_mask_cvt_roundsh_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsh_ss) @@ -434,16 +396,12 @@ * [ ] [`_mm_mask_cvtss_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtss_sh) * [ ] [`_mm_mask_fcmadd_round_sch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_round_sch) * [ ] [`_mm_mask_fcmadd_sch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_sch) - * [ ] 
[`_mm_mask_fcmul_round_sch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_round_sch) - * [ ] [`_mm_mask_fcmul_sch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_sch) * [ ] [`_mm_mask_fmadd_round_sch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_sch) * [ ] [`_mm_mask_fmadd_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_sh) * [ ] [`_mm_mask_fmadd_sch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_sch) * [ ] [`_mm_mask_fmadd_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_sh) * [ ] [`_mm_mask_fmsub_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_round_sh) * [ ] [`_mm_mask_fmsub_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_sh) - * [ ] [`_mm_mask_fmul_round_sch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_round_sch) - * [ ] [`_mm_mask_fmul_sch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_sch) * [ ] [`_mm_mask_fnmadd_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_round_sh) * [ ] [`_mm_mask_fnmadd_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_sh) * [ ] [`_mm_mask_fnmsub_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_round_sh) @@ -453,8 +411,6 @@ * [ ] [`_mm_mask_getexp_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_sh) * [ ] [`_mm_mask_getmant_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_round_sh) * [ ] [`_mm_mask_getmant_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_sh) - * [ ] [`_mm_mask_mul_round_sch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sch) - * [ ] [`_mm_mask_mul_sch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sch) * [ ] [`_mm_mask_rcp_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp_sh) * [ ] [`_mm_mask_reduce_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_round_sh) * [ ] [`_mm_mask_reduce_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_sh) @@ -465,8 +421,6 @@ * [ ] [`_mm_mask_scalef_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_sh) * [ ] [`_mm_mask_sqrt_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_round_sh) * [ ] [`_mm_mask_sqrt_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_sh) - * [ ] [`_mm_maskz_cmul_round_sch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_round_sch) - * [ ] [`_mm_maskz_cmul_sch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_sch) * [ ] [`_mm_maskz_cvt_roundsd_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsd_sh) * [ ] [`_mm_maskz_cvt_roundsh_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsh_sd) * [ ] 
[`_mm_maskz_cvt_roundsh_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsh_ss) @@ -477,16 +431,12 @@ * [ ] [`_mm_maskz_cvtss_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtss_sh) * [ ] [`_mm_maskz_fcmadd_round_sch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_round_sch) * [ ] [`_mm_maskz_fcmadd_sch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_sch) - * [ ] [`_mm_maskz_fcmul_round_sch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_round_sch) - * [ ] [`_mm_maskz_fcmul_sch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_sch) * [ ] [`_mm_maskz_fmadd_round_sch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_sch) * [ ] [`_mm_maskz_fmadd_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_sh) * [ ] [`_mm_maskz_fmadd_sch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_sch) * [ ] [`_mm_maskz_fmadd_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_sh) * [ ] [`_mm_maskz_fmsub_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_round_sh) * [ ] [`_mm_maskz_fmsub_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_sh) - * [ ] [`_mm_maskz_fmul_round_sch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_round_sch) - * [ ] [`_mm_maskz_fmul_sch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_sch) * [ ] [`_mm_maskz_fnmadd_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_round_sh) * [ ] [`_mm_maskz_fnmadd_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_sh) * [ ] [`_mm_maskz_fnmsub_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_round_sh) @@ -495,8 +445,6 @@ * [ ] [`_mm_maskz_getexp_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_sh) * [ ] [`_mm_maskz_getmant_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_round_sh) * [ ] [`_mm_maskz_getmant_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_sh) - * [ ] [`_mm_maskz_mul_round_sch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sch) - * [ ] [`_mm_maskz_mul_sch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sch) * [ ] [`_mm_maskz_rcp_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp_sh) * [ ] [`_mm_maskz_reduce_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_round_sh) * [ ] [`_mm_maskz_reduce_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_sh) @@ -507,8 +455,6 @@ * [ ] [`_mm_maskz_scalef_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_sh) * [ ] [`_mm_maskz_sqrt_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_round_sh) * [ ] [`_mm_maskz_sqrt_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_sh) - * [ ] 
[`_mm_mul_round_sch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sch) - * [ ] [`_mm_mul_sch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_sch) * [ ] [`_mm_rcp_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_sh) * [ ] [`_mm_reduce_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_round_sh) * [ ] [`_mm_reduce_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_sh) @@ -517,7 +463,6 @@ * [ ] [`_mm_rsqrt_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_sh) * [ ] [`_mm_scalef_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_round_sh) * [ ] [`_mm_scalef_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_sh) - * [ ] [`_mm_set1_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pch) * [ ] [`_mm_sqrt_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_sh) * [ ] [`_mm_sqrt_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_sh)

@@ -527,7 +472,6 @@ * [ ] [`_mm256_abs_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_ph) * [ ] [`_mm256_cmp_ph_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_ph_mask) - * [ ] [`_mm256_cmul_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmul_pch) * [ ] [`_mm256_conj_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_conj_pch) * [ ] [`_mm256_cvtepi16_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi16_ph) * [ ] [`_mm256_cvtepi32_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi32_ph) @@ -552,13 +496,11 @@ * [ ] [`_mm256_cvtxph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtxph_ps) * [ ] [`_mm256_cvtxps_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtxps_ph) * [ ] [`_mm256_fcmadd_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fcmadd_pch) - * [ ] [`_mm256_fcmul_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fcmul_pch) * [ ] [`_mm256_fmadd_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmadd_pch) * [ ] [`_mm256_fmadd_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmadd_ph) * [ ] [`_mm256_fmaddsub_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmaddsub_ph) * [ ] [`_mm256_fmsub_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsub_ph) * [ ] [`_mm256_fmsubadd_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsubadd_ph) - * [ ] [`_mm256_fmul_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmul_pch) * [ ] [`_mm256_fnmadd_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmadd_ph) * [ ] [`_mm256_fnmsub_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmsub_ph) * [ ] [`_mm256_fpclass_ph_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fpclass_ph_mask) @@ -574,7 +516,6 @@ * [ ] [`_mm256_mask3_fnmsub_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fnmsub_ph) * [ ] [`_mm256_mask_blend_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_blend_ph) * [ ] [`_mm256_mask_cmp_ph_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmp_ph_mask) - * [ ] [`_mm256_mask_cmul_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmul_pch) * [ ] [`_mm256_mask_conj_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_conj_pch) * [ ] [`_mm256_mask_cvtepi16_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi16_ph) * [ ] [`_mm256_mask_cvtepi32_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi32_ph) @@ -599,13 +540,11 @@ * [ ] [`_mm256_mask_cvtxph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtxph_ps) * [ ] [`_mm256_mask_cvtxps_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtxps_ph) * [ ] [`_mm256_mask_fcmadd_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fcmadd_pch) - * [ ] 
[`_mm256_mask_fcmul_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fcmul_pch) * [ ] [`_mm256_mask_fmadd_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmadd_pch) * [ ] [`_mm256_mask_fmadd_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmadd_ph) * [ ] [`_mm256_mask_fmaddsub_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmaddsub_ph) * [ ] [`_mm256_mask_fmsub_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmsub_ph) * [ ] [`_mm256_mask_fmsubadd_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmsubadd_ph) - * [ ] [`_mm256_mask_fmul_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmul_pch) * [ ] [`_mm256_mask_fnmadd_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fnmadd_ph) * [ ] [`_mm256_mask_fnmsub_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fnmsub_ph) * [ ] [`_mm256_mask_fpclass_ph_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fpclass_ph_mask) @@ -613,14 +552,12 @@ * [ ] [`_mm256_mask_getmant_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_getmant_ph) * [ ] [`_mm256_mask_max_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_max_ph) * [ ] [`_mm256_mask_min_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_min_ph) - * [ ] [`_mm256_mask_mul_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mul_pch) * [ ] [`_mm256_mask_rcp_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rcp_ph) * [ ] [`_mm256_mask_reduce_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_ph) * [ ] [`_mm256_mask_roundscale_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_roundscale_ph) * [ ] [`_mm256_mask_rsqrt_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rsqrt_ph) * [ ] [`_mm256_mask_scalef_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_scalef_ph) * [ ] [`_mm256_mask_sqrt_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sqrt_ph) - * [ ] [`_mm256_maskz_cmul_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cmul_pch) * [ ] [`_mm256_maskz_conj_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_conj_pch) * [ ] [`_mm256_maskz_cvtepi16_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi16_ph) * [ ] [`_mm256_maskz_cvtepi32_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi32_ph) @@ -645,20 +582,17 @@ * [ ] [`_mm256_maskz_cvtxph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtxph_ps) * [ ] [`_mm256_maskz_cvtxps_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtxps_ph) * [ ] [`_mm256_maskz_fcmadd_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fcmadd_pch) - * [ ] [`_mm256_maskz_fcmul_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fcmul_pch) * [ ] 
[`_mm256_maskz_fmadd_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmadd_pch) * [ ] [`_mm256_maskz_fmadd_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmadd_ph) * [ ] [`_mm256_maskz_fmaddsub_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmaddsub_ph) * [ ] [`_mm256_maskz_fmsub_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmsub_ph) * [ ] [`_mm256_maskz_fmsubadd_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmsubadd_ph) - * [ ] [`_mm256_maskz_fmul_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmul_pch) * [ ] [`_mm256_maskz_fnmadd_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fnmadd_ph) * [ ] [`_mm256_maskz_fnmsub_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fnmsub_ph) * [ ] [`_mm256_maskz_getexp_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getexp_ph) * [ ] [`_mm256_maskz_getmant_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getmant_ph) * [ ] [`_mm256_maskz_max_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_max_ph) * [ ] [`_mm256_maskz_min_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_min_ph) - * [ ] [`_mm256_maskz_mul_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mul_pch) * [ ] [`_mm256_maskz_rcp_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rcp_ph) * [ ] [`_mm256_maskz_reduce_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_reduce_ph) * [ ] [`_mm256_maskz_roundscale_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_roundscale_ph) @@ -667,7 +601,6 @@ * [ ] [`_mm256_maskz_sqrt_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sqrt_ph) * [ ] [`_mm256_max_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_ph) * [ ] [`_mm256_min_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_ph) - * [ ] [`_mm256_mul_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_pch) * [ ] [`_mm256_permutex2var_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutex2var_ph) * [ ] [`_mm256_permutexvar_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutexvar_ph) * [ ] [`_mm256_rcp_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rcp_ph) @@ -682,7 +615,6 @@ * [ ] [`_mm256_sqrt_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sqrt_ph) * [ ] [`_mm_abs_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_ph) * [ ] [`_mm_cmp_ph_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_ph_mask) - * [ ] [`_mm_cmul_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_pch) * [ ] [`_mm_conj_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_conj_pch) * [ ] [`_mm_cvtepi16_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_ph) * [ ] [`_mm_cvtepi32_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_ph) 
@@ -707,13 +639,11 @@ * [ ] [`_mm_cvtxph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtxph_ps) * [ ] [`_mm_cvtxps_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtxps_ph) * [ ] [`_mm_fcmadd_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_pch) - * [ ] [`_mm_fcmul_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_pch) * [ ] [`_mm_fmadd_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_pch) * [ ] [`_mm_fmadd_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_ph) * [ ] [`_mm_fmaddsub_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmaddsub_ph) * [ ] [`_mm_fmsub_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_ph) * [ ] [`_mm_fmsubadd_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsubadd_ph) - * [ ] [`_mm_fmul_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_pch) * [ ] [`_mm_fnmadd_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_ph) * [ ] [`_mm_fnmsub_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_ph) * [ ] [`_mm_fpclass_ph_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fpclass_ph_mask) @@ -729,7 +659,6 @@ * [ ] [`_mm_mask3_fnmsub_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_ph) * [ ] [`_mm_mask_blend_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_blend_ph) * [ ] [`_mm_mask_cmp_ph_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_ph_mask) - * [ ] [`_mm_mask_cmul_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_pch) * [ ] [`_mm_mask_conj_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_conj_pch) * [ ] [`_mm_mask_cvtepi16_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi16_ph) * [ ] [`_mm_mask_cvtepi32_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi32_ph) @@ -754,13 +683,11 @@ * [ ] [`_mm_mask_cvtxph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtxph_ps) * [ ] [`_mm_mask_cvtxps_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtxps_ph) * [ ] [`_mm_mask_fcmadd_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_pch) - * [ ] [`_mm_mask_fcmul_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_pch) * [ ] [`_mm_mask_fmadd_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_pch) * [ ] [`_mm_mask_fmadd_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_ph) * [ ] [`_mm_mask_fmaddsub_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmaddsub_ph) * [ ] [`_mm_mask_fmsub_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_ph) * [ ] [`_mm_mask_fmsubadd_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsubadd_ph) - * [ ] [`_mm_mask_fmul_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_pch) * [ ] 
[`_mm_mask_fnmadd_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_ph) * [ ] [`_mm_mask_fnmsub_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_ph) * [ ] [`_mm_mask_fpclass_ph_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fpclass_ph_mask) @@ -772,14 +699,12 @@ * [ ] [`_mm_mask_min_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_ph) * [ ] [`_mm_mask_min_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_round_sh) * [ ] [`_mm_mask_min_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_sh) - * [ ] [`_mm_mask_mul_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_pch) * [ ] [`_mm_mask_rcp_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp_ph) * [ ] [`_mm_mask_reduce_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_ph) * [ ] [`_mm_mask_roundscale_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_ph) * [ ] [`_mm_mask_rsqrt_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt_ph) * [ ] [`_mm_mask_scalef_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_ph) * [ ] [`_mm_mask_sqrt_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_ph) - * [ ] [`_mm_maskz_cmul_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_pch) * [ ] [`_mm_maskz_conj_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_conj_pch) * [ ] [`_mm_maskz_cvtepi16_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi16_ph) * [ ] [`_mm_maskz_cvtepi32_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi32_ph) @@ -804,13 +729,11 @@ * [ ] [`_mm_maskz_cvtxph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtxph_ps) * [ ] [`_mm_maskz_cvtxps_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtxps_ph) * [ ] [`_mm_maskz_fcmadd_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_pch) - * [ ] [`_mm_maskz_fcmul_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_pch) * [ ] [`_mm_maskz_fmadd_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_pch) * [ ] [`_mm_maskz_fmadd_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_ph) * [ ] [`_mm_maskz_fmaddsub_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmaddsub_ph) * [ ] [`_mm_maskz_fmsub_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_ph) * [ ] [`_mm_maskz_fmsubadd_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsubadd_ph) - * [ ] [`_mm_maskz_fmul_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_pch) * [ ] [`_mm_maskz_fnmadd_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_ph) * [ ] [`_mm_maskz_fnmsub_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_ph) * [ ] 
[`_mm_maskz_getexp_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_ph) @@ -821,7 +744,6 @@ * [ ] [`_mm_maskz_min_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_ph) * [ ] [`_mm_maskz_min_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_round_sh) * [ ] [`_mm_maskz_min_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_sh) - * [ ] [`_mm_maskz_mul_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_pch) * [ ] [`_mm_maskz_rcp_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp_ph) * [ ] [`_mm_maskz_reduce_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_ph) * [ ] [`_mm_maskz_roundscale_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_ph) @@ -834,7 +756,6 @@ * [ ] [`_mm_min_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_ph) * [ ] [`_mm_min_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_round_sh) * [ ] [`_mm_min_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sh) - * [ ] [`_mm_mul_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pch) * [ ] [`_mm_permutex2var_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutex2var_ph) * [ ] [`_mm_permutexvar_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutexvar_ph) * [ ] [`_mm_rcp_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ph) diff --git a/crates/core_arch/src/x86/avx512fp16.rs b/crates/core_arch/src/x86/avx512fp16.rs index c6eeff1904..3d12be8c6d 100644 --- a/crates/core_arch/src/x86/avx512fp16.rs +++ b/crates/core_arch/src/x86/avx512fp16.rs @@ -232,6 +232,41 @@ pub unsafe fn _mm512_setr_ph( ) } +/// Broadcast half-precision (16-bit) complex floating-point value a to all elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_set1_pch(a: (f16, f16)) -> __m128h { + __m128h(a.0, a.1, a.0, a.1, a.0, a.1, a.0, a.1) +} + +/// Broadcast half-precision (16-bit) complex floating-point value a to all elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm256_set1_pch(a: (f16, f16)) -> __m256h { + __m256h( + a.0, a.1, a.0, a.1, a.0, a.1, a.0, a.1, a.0, a.1, a.0, a.1, a.0, a.1, a.0, a.1, + ) +} + +/// Broadcast half-precision (16-bit) complex floating-point value a to all elements of dst. 
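+///
+/// The real part of `a` goes into the even-indexed `f16` lanes and the imaginary part into the
+/// odd-indexed lanes. A usage sketch (illustrative only, not compiled as a doctest):
+///
+/// ```ignore
+/// // Every one of the 16 complex lanes holds the value 1.0 + 2.0i,
+/// // i.e. the 32 f16 lanes read [1.0, 2.0, 1.0, 2.0, ...].
+/// let v = _mm512_set1_pch((1.0, 2.0));
+/// ```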
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm512_set1_pch(a: (f16, f16)) -> __m512h { + __m512h( + a.0, a.1, a.0, a.1, a.0, a.1, a.0, a.1, a.0, a.1, a.0, a.1, a.0, a.1, a.0, a.1, a.0, a.1, + a.0, a.1, a.0, a.1, a.0, a.1, a.0, a.1, a.0, a.1, a.0, a.1, a.0, a.1, + ) +} + /// Return vector of type __m128h with all elements set to zero. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_ph) @@ -615,6 +650,69 @@ pub unsafe fn _mm512_zextph128_ph512(a: __m128h) -> __m512h { ) } +/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison +/// operand specified by imm8, and store the result in mask vector k. Exceptions can be suppressed by +/// passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_round_sh_mask) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[rustc_legacy_const_generics(2, 3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_cmp_round_sh_mask<const IMM8: i32, const SAE: i32>( + a: __m128h, + b: __m128h, +) -> __mmask8 { + static_assert_sae!(SAE); + _mm_mask_cmp_round_sh_mask::<IMM8, SAE>(0xff, a, b) +} + +/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison +/// operand specified by imm8, and store the result in mask vector k using zeromask k1. Exceptions can be +/// suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_round_sh_mask) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[rustc_legacy_const_generics(3, 4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_mask_cmp_round_sh_mask<const IMM8: i32, const SAE: i32>( + k1: __mmask8, + a: __m128h, + b: __m128h, +) -> __mmask8 { + static_assert_sae!(SAE); + vcmpsh(a, b, IMM8, k1, SAE) +} + +/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison +/// operand specified by imm8, and store the result in mask vector k. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_sh_mask) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_cmp_sh_mask<const IMM8: i32>(a: __m128h, b: __m128h) -> __mmask8 { + _mm_cmp_round_sh_mask::<IMM8, _MM_FROUND_CUR_DIRECTION>(a, b) +} + +/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison +/// operand specified by imm8, and store the result in mask vector k using zeromask k1. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_sh_mask) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_mask_cmp_sh_mask<const IMM8: i32>( + k1: __mmask8, + a: __m128h, + b: __m128h, +) -> __mmask8 { + _mm_mask_cmp_round_sh_mask::<IMM8, _MM_FROUND_CUR_DIRECTION>(k1, a, b) +} + /// Cast vector of type `__m256h` to type `__m512h`. The upper 16 elements of the result are zeroed.
/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate /// any instructions. @@ -1236,7 +1334,7 @@ pub unsafe fn _mm512_maskz_add_round_ph( /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions /// _MM_FROUND_CUR_DIRECTION /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_ph) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_sh) #[inline] #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))] @@ -2227,1778 +2325,3992 @@ pub unsafe fn _mm_maskz_div_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { _mm_maskz_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b) } -#[allow(improper_ctypes)] -extern "C" { - #[link_name = "llvm.x86.avx512fp16.vcomi.sh"] - fn vcomish(a: __m128h, b: __m128h, imm8: i32, sae: i32) -> i32; +/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is +/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex +/// number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_mul_pch(a: __m128h, b: __m128h) -> __m128h { + _mm_mask_mul_pch(_mm_undefined_ph(), 0xff, a, b) +} - #[link_name = "llvm.x86.avx512fp16.add.ph.512"] - fn vaddph(a: __m512h, b: __m512h, rounding: i32) -> __m512h; - #[link_name = "llvm.x86.avx512fp16.sub.ph.512"] - fn vsubph(a: __m512h, b: __m512h, rounding: i32) -> __m512h; - #[link_name = "llvm.x86.avx512fp16.mul.ph.512"] - fn vmulph(a: __m512h, b: __m512h, rounding: i32) -> __m512h; - #[link_name = "llvm.x86.avx512fp16.div.ph.512"] - fn vdivph(a: __m512h, b: __m512h, rounding: i32) -> __m512h; +/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element +/// is copied from src when mask bit 0 is not set). Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. 
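+///
+/// Per complex lane this is the usual complex product, and each bit of `k` covers one whole
+/// complex number (a pair of adjacent `f16` lanes). A scalar sketch of the per-lane arithmetic,
+/// using `f32` purely for illustration:
+///
+/// ```
+/// let (a_re, a_im, b_re, b_im) = (1.0f32, 2.0, 3.0, 4.0);
+/// let re = a_re * b_re - a_im * b_im; // real part: 1*3 - 2*4 = -5
+/// let im = a_re * b_im + a_im * b_re; // imaginary part: 1*4 + 2*3 = 10
+/// assert_eq!((re, im), (-5.0, 10.0));
+/// ```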
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_mask_mul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + transmute(vfmulcph_128(transmute(a), transmute(b), transmute(src), k)) +} - #[link_name = "llvm.x86.avx512fp16.mask.add.sh.round"] - fn vaddsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h; - #[link_name = "llvm.x86.avx512fp16.mask.sub.sh.round"] - fn vsubsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h; - #[link_name = "llvm.x86.avx512fp16.mask.mul.sh.round"] - fn vmulsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h; - #[link_name = "llvm.x86.avx512fp16.mask.div.sh.round"] - fn vdivsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h; +/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element +/// is zeroed out when mask bit 0 is not set). Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_maskz_mul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_mul_pch(_mm_setzero_ph(), k, a, b) +} +/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is +/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex +/// number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm256_mul_pch(a: __m256h, b: __m256h) -> __m256h { + _mm256_mask_mul_pch(_mm256_undefined_ph(), 0xff, a, b) } -#[cfg(test)] -mod tests { - use crate::core_arch::x86::*; - use crate::mem::transmute; - use crate::ptr::{addr_of, addr_of_mut}; - use stdarch_test::simd_test; +/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element +/// is copied from src when mask bit 0 is not set). Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm256_mask_mul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h { + transmute(vfmulcph_256(transmute(a), transmute(b), transmute(src), k)) +} - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_set_ph() { - let r = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let e = _mm_setr_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); - assert_eq_m128h(r, e); - } +/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element +/// is zeroed out when mask bit 0 is not set). Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm256_maskz_mul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h { + _mm256_mask_mul_pch(_mm256_setzero_ph(), k, a, b) +} - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm256_set_ph() { - let r = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let e = _mm256_setr_ph( - 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, - ); - assert_eq_m256h(r, e); - } +/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is +/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex +/// number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm512_mul_pch(a: __m512h, b: __m512h) -> __m512h { + _mm512_mask_mul_pch(_mm512_undefined_ph(), 0xffff, a, b) +} - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_set_ph() { - let r = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let e = _mm512_setr_ph( - 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, - 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, - 3.0, 2.0, 1.0, - ); - assert_eq_m512h(r, e); - } +/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element +/// is copied from src when mask bit 0 is not set). Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. 
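+///
+/// Each bit of `k` selects a whole complex number, i.e. a pair of adjacent `f16` lanes, rather
+/// than an individual 16-bit lane. A usage sketch (illustrative only, `src`, `a` and `b` are
+/// assumed to be `__m512h` values):
+///
+/// ```ignore
+/// // Complex lane 0 is taken from a * b; the other 15 complex lanes are copied from src.
+/// let r = _mm512_mask_mul_pch(src, 0b0000_0000_0000_0001, a, b);
+/// ```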
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmulcph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_mask_mul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
+    _mm512_mask_mul_round_pch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
+}
-    #[simd_test(enable = "avx512fp16")]
-    unsafe fn test_mm_set_sh() {
-        let r = _mm_set_sh(1.0);
-        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0);
-        assert_eq_m128h(r, e);
-    }
+/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
+/// is zeroed out when mask bit 0 is not set). Each complex number is composed of two adjacent
+/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmulcph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_maskz_mul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
+    _mm512_mask_mul_pch(_mm512_setzero_ph(), k, a, b)
+}
-    #[simd_test(enable = "avx512fp16")]
-    unsafe fn test_mm_set1_ph() {
-        let r = _mm_set1_ph(1.0);
-        let e = _mm_set_ph(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0);
-        assert_eq_m128h(r, e);
-    }
+/// Multiply the packed complex numbers in a and b, and store the results in dst. Each complex number is
+/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
+/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_round_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_mul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_mul_round_pch::<ROUNDING>(_mm512_undefined_ph(), 0xffff, a, b)
+}
-    #[simd_test(enable = "avx512fp16")]
-    unsafe fn test_mm256_set1_ph() {
-        let r = _mm256_set1_ph(1.0);
-        let e = _mm256_set_ph(
-            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
-        );
-        assert_eq_m256h(r, e);
-    }
+/// Multiply the packed complex numbers in a and b, and store the results in dst using writemask k (the element
+/// is copied from src when mask bit 0 is not set). Each complex number is composed of two adjacent
+/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_round_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_mask_mul_round_pch<const ROUNDING: i32>(
+    src: __m512h,
+    k: __mmask16,
+    a: __m512h,
+    b: __m512h,
+) -> __m512h {
+    static_assert_rounding!(ROUNDING);
+    transmute(vfmulcph_512(
+        transmute(a),
+        transmute(b),
+        transmute(src),
+        k,
+        ROUNDING,
+    ))
+}
-    #[simd_test(enable = "avx512fp16")]
-    unsafe fn test_mm512_set1_ph() {
-        let r = _mm512_set1_ph(1.0);
-        let e = _mm512_set_ph(
-            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
-            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
-        );
-        assert_eq_m512h(r, e);
-    }
+/// Multiply the packed complex numbers in a and b, and store the results in dst using zeromask k (the element
+/// is zeroed out when mask bit 0 is not set). Each complex number is composed of two adjacent
+/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_round_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_maskz_mul_round_pch<const ROUNDING: i32>(
+    k: __mmask16,
+    a: __m512h,
+    b: __m512h,
+) -> __m512h {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_mul_round_pch::<ROUNDING>(_mm512_setzero_ph(), k, a, b)
+}
-    #[simd_test(enable = "avx512fp16")]
-    unsafe fn test_mm_setr_ph() {
-        let r = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
-        let e = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
-        assert_eq_m128h(r, e);
-    }
+/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst,
+/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
+/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
+/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
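The `_round` variants above take the rounding mode as a const generic (exposed to C-style callers through `rustc_legacy_const_generics`). A usage sketch, assuming a nightly toolchain with the `stdarch_x86_avx512_f16` feature enabled and an AVX512-FP16 target:

    // Usage sketch (assumptions: nightly Rust, unstable feature gate enabled,
    // and a CPU with AVX512-FP16). Picks an explicit rounding mode instead of
    // the current MXCSR direction by passing it as the const generic.
    #![feature(stdarch_x86_avx512_f16)]
    use core::arch::x86_64::*;

    #[target_feature(enable = "avx512fp16")]
    unsafe fn mul_round_to_nearest(a: __m512h, b: __m512h) -> __m512h {
        _mm512_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b)
    }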
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmulcsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_mul_sch(a: __m128h, b: __m128h) -> __m128h { + _mm_mask_mul_sch(_mm_undefined_ph(), 0xff, a, b) +} - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm256_setr_ph() { - let r = _mm256_setr_ph( +/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using +/// writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 6 packed +/// elements from a to the upper elements of dst. Each complex number is composed of two adjacent +/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmulcsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_mask_mul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_mul_round_sch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) +} + +/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using +/// zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements +/// from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision +/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmulcsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_maskz_mul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_mul_sch(_mm_setzero_ph(), k, a, b) +} + +/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst, +/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is +/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex +/// number `complex = vec.fp16[0] + i * vec.fp16[1]`. 
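The writemask/zeromask behaviour described in these doc comments can be summarised with a small reference loop (illustration only, one complex pair per mask bit; the scalar `sch` forms only consult bit 0):

    // Reference model (illustration only) of mask semantics: bit i of k selects
    // whether complex pair i receives the product, the pair from src (writemask),
    // or zeros (zeromask).
    fn apply_mask(product: &[f32], src: &[f32], k: u8, zero_masking: bool, dst: &mut [f32]) {
        for (i, d) in dst.chunks_exact_mut(2).enumerate() {
            if k & (1 << i) != 0 {
                d.copy_from_slice(&product[2 * i..2 * i + 2]);
            } else if zero_masking {
                d.fill(0.0);
            } else {
                d.copy_from_slice(&src[2 * i..2 * i + 2]);
            }
        }
    }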
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_mul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_mask_mul_round_sch::<ROUNDING>(_mm_undefined_ph(), 0xff, a, b)
+}
+
+/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
+/// writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 6 packed
+/// elements from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision
+/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_mask_mul_round_sch<const ROUNDING: i32>(
+    src: __m128h,
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    transmute(vfmulcsh(
+        transmute(a),
+        transmute(b),
+        transmute(src),
+        k,
+        ROUNDING,
+    ))
+}
+
+/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
+/// zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements
+/// from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision
+/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_maskz_mul_round_sch<const ROUNDING: i32>(
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_mask_mul_round_sch::<ROUNDING>(_mm_setzero_ph(), k, a, b)
+}
+
+/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
+/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
+/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmulcph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_fmul_pch(a: __m128h, b: __m128h) -> __m128h {
+    _mm_mul_pch(a, b)
+}
+
+/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
+/// is copied from src when mask bit 0 is not set). Each complex number is composed of two adjacent
+/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmulcph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_mask_fmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+    _mm_mask_mul_pch(src, k, a, b)
+}
+
+/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
+/// is zeroed out when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
+/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vfmulcph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_maskz_fmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+    _mm_maskz_mul_pch(k, a, b)
+}
+
+/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
+/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
+/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm256_fmul_pch(a: __m256h, b: __m256h) -> __m256h { + _mm256_mul_pch(a, b) +} + +/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element +/// is copied from src when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision +/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm256_mask_fmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h { + _mm256_mask_mul_pch(src, k, a, b) +} + +/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element +/// is zeroed out when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision +/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm256_maskz_fmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h { + _mm256_maskz_mul_pch(k, a, b) +} + +/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is composed +/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm512_fmul_pch(a: __m512h, b: __m512h) -> __m512h { + _mm512_mul_pch(a, b) +} + +/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element +/// is copied from src when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision +/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm512_mask_fmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h { + _mm512_mask_mul_pch(src, k, a, b) +} + +/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element +/// is zeroed out when mask bit 0 is not set). 
Each complex number is composed of two adjacent half-precision +/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm512_maskz_fmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h { + _mm512_maskz_mul_pch(k, a, b) +} + +/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is composed +/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmul_round_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm512_fmul_round_pch(a: __m512h, b: __m512h) -> __m512h { + static_assert_rounding!(ROUNDING); + _mm512_mul_round_pch::(a, b) +} + +/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element +/// is copied from src when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision +/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmul_round_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm512_mask_fmul_round_pch( + src: __m512h, + k: __mmask16, + a: __m512h, + b: __m512h, +) -> __m512h { + static_assert_rounding!(ROUNDING); + _mm512_mask_mul_round_pch::(src, k, a, b) +} + +/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element +/// is zeroed out when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision +/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. 
+/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmul_round_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm512_maskz_fmul_round_pch( + k: __mmask16, + a: __m512h, + b: __m512h, +) -> __m512h { + static_assert_rounding!(ROUNDING); + _mm512_maskz_mul_round_pch::(k, a, b) +} + +/// Multiply the lower complex numbers in a and b, and store the results in dst. Each complex number is +/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex +/// number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmulcsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_fmul_sch(a: __m128h, b: __m128h) -> __m128h { + _mm_mul_sch(a, b) +} + +/// Multiply the lower complex numbers in a and b, and store the results in dst using writemask k (the element +/// is copied from src when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision +/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmulcsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_mask_fmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_mul_sch(src, k, a, b) +} + +/// Multiply the lower complex numbers in a and b, and store the results in dst using zeromask k (the element +/// is zeroed out when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision +/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmulcsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_maskz_fmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_maskz_mul_sch(k, a, b) +} + +/// Multiply the lower complex numbers in a and b, and store the results in dst. Each complex number is composed +/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. 
+/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_round_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_fmul_round_sch(a: __m128h, b: __m128h) -> __m128h { + static_assert_rounding!(ROUNDING); + _mm_mul_round_sch::(a, b) +} + +/// Multiply the lower complex numbers in a and b, and store the results in dst using writemask k (the element +/// is copied from src when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision +/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. +/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_round_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_mask_fmul_round_sch( + src: __m128h, + k: __mmask8, + a: __m128h, + b: __m128h, +) -> __m128h { + static_assert_rounding!(ROUNDING); + _mm_mask_mul_round_sch::(src, k, a, b) +} + +/// Multiply the lower complex numbers in a and b, and store the results in dst using zeromask k (the element +/// is zeroed out when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision +/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`. 
+/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_round_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_maskz_fmul_round_sch( + k: __mmask8, + a: __m128h, + b: __m128h, +) -> __m128h { + static_assert_rounding!(ROUNDING); + _mm_maskz_mul_round_sch::(k, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_cmul_pch(a: __m128h, b: __m128h) -> __m128h { + _mm_mask_cmul_pch(_mm_undefined_ph(), 0xff, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_mask_cmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + transmute(vfcmulcph_128(transmute(a), transmute(b), transmute(src), k)) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
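Analogous to the earlier sketch, the conjugate (`cmul`/vfcmulcph) forms multiply `a` by `conj(b)`. A minimal reference model of one lane (illustration only, `f32` for readability):

    // Reference model (illustration only): multiply a by the conjugate of b over
    // interleaved [re, im] pairs, i.e. (ar + i*ai) * (br - i*bi).
    fn cmul_pch_reference(a: &[f32], b: &[f32], dst: &mut [f32]) {
        for ((ca, cb), d) in a
            .chunks_exact(2)
            .zip(b.chunks_exact(2))
            .zip(dst.chunks_exact_mut(2))
        {
            d[0] = ca[0] * cb[0] + ca[1] * cb[1]; // real part
            d[1] = ca[1] * cb[0] - ca[0] * cb[1]; // imaginary part
        }
    }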
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_maskz_cmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_cmul_pch(_mm_setzero_ph(), k, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm256_cmul_pch(a: __m256h, b: __m256h) -> __m256h { + _mm256_mask_cmul_pch(_mm256_undefined_ph(), 0xff, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm256_mask_cmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h { + transmute(vfcmulcph_256(transmute(a), transmute(b), transmute(src), k)) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm256_maskz_cmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h { + _mm256_mask_cmul_pch(_mm256_setzero_ph(), k, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm512_cmul_pch(a: __m512h, b: __m512h) -> __m512h { + _mm512_mask_cmul_pch(_mm512_undefined_ph(), 0xffff, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm512_mask_cmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h { + _mm512_mask_cmul_round_pch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm512_maskz_cmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h { + _mm512_mask_cmul_pch(_mm512_setzero_ph(), k, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
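One handy consequence, useful when sanity-checking the conjugate intrinsics in tests: multiplying a complex number by its own conjugate yields its squared magnitude with a zero imaginary part. A quick check against the `cmul_pch_reference` sketch shown earlier (assumed to be in scope):

    fn main() {
        let z = [3.0_f32, 4.0]; // 3 + 4i
        let mut out = [0.0_f32; 2];
        cmul_pch_reference(&z, &z, &mut out);
        assert_eq!(out, [25.0, 0.0]); // |3 + 4i|^2 = 25, imaginary part 0
    }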
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmul_round_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_cmul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cmul_round_pch::<ROUNDING>(_mm512_undefined_ph(), 0xffff, a, b)
+}
+
+/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
+/// store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
+/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
+/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmul_round_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_mask_cmul_round_pch<const ROUNDING: i32>(
+    src: __m512h,
+    k: __mmask16,
+    a: __m512h,
+    b: __m512h,
+) -> __m512h {
+    static_assert_rounding!(ROUNDING);
+    transmute(vfcmulcph_512(
+        transmute(a),
+        transmute(b),
+        transmute(src),
+        k,
+        ROUNDING,
+    ))
+}
+
+/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
+/// store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
+/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
+/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cmul_round_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm512_maskz_cmul_round_pch( + k: __mmask16, + a: __m512h, + b: __m512h, +) -> __m512h { + static_assert_rounding!(ROUNDING); + _mm512_mask_cmul_round_pch::(_mm512_setzero_ph(), k, a, b) +} + +/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b, +/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmulcsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_cmul_sch(a: __m128h, b: __m128h) -> __m128h { + _mm_mask_cmul_sch(_mm_undefined_ph(), 0xff, a, b) +} + +/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b, +/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmulcsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_mask_cmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_cmul_round_sch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) +} + +/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b, +/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmulcsh))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_maskz_cmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_cmul_sch(_mm_setzero_ph(), k, a, b) +} + +/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b, +/// and store the results in dst. 
Each complex number is composed of two adjacent half-precision (16-bit)
+/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_round_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_cmul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_mask_cmul_round_sch::<ROUNDING>(_mm_undefined_ph(), 0xff, a, b)
+}
+
+/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
+/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
+/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
+/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_round_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_mask_cmul_round_sch<const ROUNDING: i32>(
+    src: __m128h,
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    transmute(vfcmulcsh(
+        transmute(a),
+        transmute(b),
+        transmute(src),
+        k,
+        ROUNDING,
+    ))
+}
+
+/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
+/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
+/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
+/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+/// +/// Rounding is done according to the rounding parameter, which can be one of: +/// +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_round_sch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_maskz_cmul_round_sch( + k: __mmask8, + a: __m128h, + b: __m128h, +) -> __m128h { + static_assert_rounding!(ROUNDING); + _mm_mask_cmul_round_sch::(_mm_setzero_ph(), k, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_fcmul_pch(a: __m128h, b: __m128h) -> __m128h { + _mm_cmul_pch(a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_mask_fcmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_mask_cmul_pch(src, k, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm_maskz_fcmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + _mm_maskz_cmul_pch(k, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fcmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm256_fcmul_pch(a: __m256h, b: __m256h) -> __m256h { + _mm256_cmul_pch(a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fcmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm256_mask_fcmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h { + _mm256_mask_cmul_pch(src, k, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fcmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16,avx512vl")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm256_maskz_fcmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h { + _mm256_maskz_cmul_pch(k, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit) +/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, +/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm512_fcmul_pch(a: __m512h, b: __m512h) -> __m512h { + _mm512_cmul_pch(a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm512_mask_fcmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h { + _mm512_mask_cmul_pch(src, k, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set). +/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which +/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmul_pch) +#[inline] +#[target_feature(enable = "avx512fp16")] +#[cfg_attr(test, assert_instr(vfcmulcph))] +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub unsafe fn _mm512_maskz_fcmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h { + _mm512_maskz_cmul_pch(k, a, b) +} + +/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and +/// store the results in dst. 
Each complex number is composed of two adjacent half-precision (16-bit)
+/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
+/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmul_round_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_fcmul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
+    static_assert_rounding!(ROUNDING);
+    _mm512_cmul_round_pch::<ROUNDING>(a, b)
+}
+
+/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
+/// store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
+/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
+/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmul_round_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_mask_fcmul_round_pch<const ROUNDING: i32>(
+    src: __m512h,
+    k: __mmask16,
+    a: __m512h,
+    b: __m512h,
+) -> __m512h {
+    static_assert_rounding!(ROUNDING);
+    _mm512_mask_cmul_round_pch::<ROUNDING>(src, k, a, b)
+}
+
+/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
+/// store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
+/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
+/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmul_round_pch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_maskz_fcmul_round_pch<const ROUNDING: i32>(
+    k: __mmask16,
+    a: __m512h,
+    b: __m512h,
+) -> __m512h {
+    static_assert_rounding!(ROUNDING);
+    _mm512_maskz_cmul_round_pch::<ROUNDING>(k, a, b)
+}
+
+/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
+/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
+/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
+/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmulcsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_fcmul_sch(a: __m128h, b: __m128h) -> __m128h {
+    _mm_cmul_sch(a, b)
+}
+
+/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
+/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
+/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
+/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmulcsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_mask_fcmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+    _mm_mask_cmul_sch(src, k, a, b)
+}
+
+/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
+/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
+/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
+/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmulcsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_maskz_fcmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+    _mm_maskz_cmul_sch(k, a, b)
+}
+
+/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
+/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
+/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
+/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_round_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_fcmul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_cmul_round_sch::<ROUNDING>(a, b)
+}
+
+/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
+/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
+/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
+/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_round_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_mask_fcmul_round_sch<const ROUNDING: i32>(
+    src: __m128h,
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_mask_cmul_round_sch::<ROUNDING>(src, k, a, b)
+}
+
+/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
+/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
+/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
+/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_round_sch)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_maskz_fcmul_round_sch<const ROUNDING: i32>(
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __m128h {
+    static_assert_rounding!(ROUNDING);
+    _mm_maskz_cmul_round_sch::<ROUNDING>(k, a, b)
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.x86.avx512fp16.mask.cmp.sh"]
+    fn vcmpsh(a: __m128h, b: __m128h, imm8: i32, mask: __mmask8, sae: i32) -> __mmask8;
+    #[link_name = "llvm.x86.avx512fp16.vcomi.sh"]
+    fn vcomish(a: __m128h, b: __m128h, imm8: i32, sae: i32) -> i32;
+
+    #[link_name = "llvm.x86.avx512fp16.add.ph.512"]
+    fn vaddph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
+    #[link_name = "llvm.x86.avx512fp16.sub.ph.512"]
+    fn vsubph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
+    #[link_name = "llvm.x86.avx512fp16.mul.ph.512"]
+    fn vmulph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
+    #[link_name = "llvm.x86.avx512fp16.div.ph.512"]
+    fn vdivph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
+
+    #[link_name = "llvm.x86.avx512fp16.mask.add.sh.round"]
+    fn vaddsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
+    #[link_name = "llvm.x86.avx512fp16.mask.sub.sh.round"]
+    fn vsubsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
+    #[link_name = "llvm.x86.avx512fp16.mask.mul.sh.round"]
+    fn vmulsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
+    #[link_name = "llvm.x86.avx512fp16.mask.div.sh.round"]
+    fn vdivsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
+
+    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.128"]
+    fn vfmulcph_128(a: __m128, b: __m128, src: __m128, k: __mmask8) -> __m128;
+    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.256"]
+    fn vfmulcph_256(a: __m256, b: __m256, src: __m256, k: __mmask8) -> __m256;
+    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.512"]
+    fn vfmulcph_512(a: __m512, b: __m512, src: __m512, k: __mmask16, rounding: i32) -> __m512;
+    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.csh"]
+    fn vfmulcsh(a: __m128, b: __m128, src: __m128, k: __mmask8, rounding: i32) -> __m128;
+
+    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.128"]
+    fn vfcmulcph_128(a: __m128, b: __m128, src: __m128, k: __mmask8) -> __m128;
+    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.256"]
+    fn vfcmulcph_256(a: __m256, b: __m256, src: __m256, k: __mmask8) -> __m256;
+    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.512"]
+    fn vfcmulcph_512(a: __m512, b: __m512, src: __m512, k: __mmask16,
rounding: i32) -> __m512; + #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.csh"] + fn vfcmulcsh(a: __m128, b: __m128, src: __m128, k: __mmask8, rounding: i32) -> __m128; + +} + +#[cfg(test)] +mod tests { + use crate::core_arch::x86::*; + use crate::mem::transmute; + use crate::ptr::{addr_of, addr_of_mut}; + use stdarch_test::simd_test; + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_set_ph() { + let r = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let e = _mm_setr_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm256_set_ph() { + let r = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let e = _mm256_setr_ph( + 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_set_ph() { + let r = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let e = _mm512_setr_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_set_sh() { + let r = _mm_set_sh(1.0); + let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_set1_ph() { + let r = _mm_set1_ph(1.0); + let e = _mm_set_ph(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm256_set1_ph() { + let r = _mm256_set1_ph(1.0); + let e = _mm256_set_ph( + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_set1_ph() { + let r = _mm512_set1_ph(1.0); + let e = _mm512_set_ph( + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_setr_ph() { + let r = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let e = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm256_setr_ph() { + let r = _mm256_setr_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let e = _mm256_set_ph( + 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_setr_ph() { + let r = _mm512_setr_ph( 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let e = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + 
unsafe fn test_mm_set1_pch() { + let r = _mm_set1_pch((1.0, 2.0)); + let e = _mm_setr_ph(1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm256_set1_pch() { + let r = _mm256_set1_pch((1.0, 2.0)); + let e = _mm256_setr_ph( + 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_set1_pch() { + let r = _mm512_set1_pch((1.0, 2.0)); + let e = _mm512_setr_ph( + 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, + 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_setzero_ph() { + let r = _mm_setzero_ph(); + let e = _mm_set1_ph(0.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_setzero_ph() { + let r = _mm256_setzero_ph(); + let e = _mm256_set1_ph(0.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_setzero_ph() { + let r = _mm512_setzero_ph(); + let e = _mm512_set1_ph(0.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_castsi128_ph() { + let a = _mm_set1_epi16(0x3c00); + let r = _mm_castsi128_ph(a); + let e = _mm_set1_ph(1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm256_castsi256_ph() { + let a = _mm256_set1_epi16(0x3c00); + let r = _mm256_castsi256_ph(a); + let e = _mm256_set1_ph(1.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_castsi512_ph() { + let a = _mm512_set1_epi16(0x3c00); + let r = _mm512_castsi512_ph(a); + let e = _mm512_set1_ph(1.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_castph_si128() { + let a = _mm_set1_ph(1.0); + let r = _mm_castph_si128(a); + let e = _mm_set1_epi16(0x3c00); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm256_castph_si256() { + let a = _mm256_set1_ph(1.0); + let r = _mm256_castph_si256(a); + let e = _mm256_set1_epi16(0x3c00); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_castph_si512() { + let a = _mm512_set1_ph(1.0); + let r = _mm512_castph_si512(a); + let e = _mm512_set1_epi16(0x3c00); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_castps_ph() { + let a = _mm_castsi128_ps(_mm_set1_epi16(0x3c00)); + let r = _mm_castps_ph(a); + let e = _mm_set1_ph(1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm256_castps_ph() { + let a = _mm256_castsi256_ps(_mm256_set1_epi16(0x3c00)); + let r = _mm256_castps_ph(a); + let e = _mm256_set1_ph(1.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_castps_ph() { + let a = _mm512_castsi512_ps(_mm512_set1_epi16(0x3c00)); + let r = _mm512_castps_ph(a); + let e = _mm512_set1_ph(1.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_castph_ps() { + let a = _mm_castsi128_ph(_mm_set1_epi32(0x3f800000)); + let r = _mm_castph_ps(a); + let e = _mm_set1_ps(1.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm256_castph_ps() { + let a = _mm256_castsi256_ph(_mm256_set1_epi32(0x3f800000)); + let r = 
_mm256_castph_ps(a); + let e = _mm256_set1_ps(1.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_castph_ps() { + let a = _mm512_castsi512_ph(_mm512_set1_epi32(0x3f800000)); + let r = _mm512_castph_ps(a); + let e = _mm512_set1_ps(1.0); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_castpd_ph() { + let a = _mm_castsi128_pd(_mm_set1_epi16(0x3c00)); + let r = _mm_castpd_ph(a); + let e = _mm_set1_ph(1.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm256_castpd_ph() { + let a = _mm256_castsi256_pd(_mm256_set1_epi16(0x3c00)); + let r = _mm256_castpd_ph(a); + let e = _mm256_set1_ph(1.0); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_castpd_ph() { + let a = _mm512_castsi512_pd(_mm512_set1_epi16(0x3c00)); + let r = _mm512_castpd_ph(a); + let e = _mm512_set1_ph(1.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_castph_pd() { + let a = _mm_castsi128_ph(_mm_set1_epi64x(0x3ff0000000000000)); + let r = _mm_castph_pd(a); + let e = _mm_set1_pd(1.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm256_castph_pd() { + let a = _mm256_castsi256_ph(_mm256_set1_epi64x(0x3ff0000000000000)); + let r = _mm256_castph_pd(a); + let e = _mm256_set1_pd(1.0); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_castph_pd() { + let a = _mm512_castsi512_ph(_mm512_set1_epi64(0x3ff0000000000000)); + let r = _mm512_castph_pd(a); + let e = _mm512_set1_pd(1.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm256_castph256_ph128() { + let a = _mm256_setr_ph( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm256_castph256_ph128(a); + let e = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_castph512_ph128() { + let a = _mm512_setr_ph( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., + 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., + ); + let r = _mm512_castph512_ph128(a); + let e = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_castph512_ph256() { + let a = _mm512_setr_ph( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., + 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., + ); + let r = _mm512_castph512_ph256(a); + let e = _mm256_setr_ph( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., ); - let e = _mm256_set_ph( - 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm256_castph128_ph256() { + let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm256_castph128_ph256(a); + assert_eq_m128h(_mm256_castph256_ph128(r), a); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_castph128_ph512() { + let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_castph128_ph512(a); + assert_eq_m128h(_mm512_castph512_ph128(r), a); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_castph256_ph512() { + let a = _mm256_setr_ph( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 
16., + ); + let r = _mm512_castph256_ph512(a); + assert_eq_m256h(_mm512_castph512_ph256(r), a); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm256_zextph128_ph256() { + let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm256_zextph128_ph256(a); + let e = _mm256_setr_ph( + 1., 2., 3., 4., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0., ); assert_eq_m256h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_setr_ph() { - let r = _mm512_setr_ph( + unsafe fn test_mm512_zextph128_ph512() { + let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_zextph128_ph512(a); + let e = _mm512_setr_ph( + 1., 2., 3., 4., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., + 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_zextph256_ph512() { + let a = _mm256_setr_ph( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_zextph256_ph512(a); + let e = _mm512_setr_ph( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 0., 0., 0., 0., + 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cmp_round_sh_mask() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(1.0); + let r = _mm_cmp_round_sh_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_cmp_round_sh_mask() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(1.0); + let r = _mm_mask_cmp_round_sh_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(0, a, b); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_cmp_sh_mask() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(1.0); + let r = _mm_cmp_sh_mask::<_CMP_EQ_OQ>(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_cmp_sh_mask() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(1.0); + let r = _mm_mask_cmp_sh_mask::<_CMP_EQ_OQ>(0, a, b); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_comi_round_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(1.0); + let r = _mm_comi_round_sh::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_comi_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(1.0); + let r = _mm_comi_sh::<_CMP_EQ_OQ>(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_comieq_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(1.0); + let r = _mm_comieq_sh(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_comige_sh() { + let a = _mm_set_sh(2.0); + let b = _mm_set_sh(1.0); + let r = _mm_comige_sh(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_comigt_sh() { + let a = _mm_set_sh(2.0); + let b = _mm_set_sh(1.0); + let r = _mm_comigt_sh(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_comile_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_comile_sh(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_comilt_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_comilt_sh(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_comineq_sh() { + let a = 
_mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_comineq_sh(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_ucomieq_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(1.0); + let r = _mm_ucomieq_sh(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_ucomige_sh() { + let a = _mm_set_sh(2.0); + let b = _mm_set_sh(1.0); + let r = _mm_ucomige_sh(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_ucomigt_sh() { + let a = _mm_set_sh(2.0); + let b = _mm_set_sh(1.0); + let r = _mm_ucomigt_sh(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_ucomile_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_ucomile_sh(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_ucomilt_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_ucomilt_sh(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_ucomineq_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_ucomineq_sh(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_load_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_load_ph(addr_of!(a).cast()); + assert_eq_m128h(a, b); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_load_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_load_ph(addr_of!(a).cast()); + assert_eq_m256h(a, b); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_load_ph() { + let a = _mm512_set_ph( 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, ); - let e = _mm512_set_ph( - 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, - 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, - 3.0, 2.0, 1.0, - ); - assert_eq_m512h(r, e); + let b = _mm512_load_ph(addr_of!(a).cast()); + assert_eq_m512h(a, b); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_load_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_load_sh(addr_of!(a).cast()); + assert_eq_m128h(a, b); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_load_sh() { + let a = _mm_set_sh(1.0); + let src = _mm_set_sh(2.); + let b = _mm_mask_load_sh(src, 1, addr_of!(a).cast()); + assert_eq_m128h(a, b); + let b = _mm_mask_load_sh(src, 0, addr_of!(a).cast()); + assert_eq_m128h(src, b); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_load_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_maskz_load_sh(1, addr_of!(a).cast()); + assert_eq_m128h(a, b); + let b = _mm_maskz_load_sh(0, addr_of!(a).cast()); + assert_eq_m128h(_mm_setzero_ph(), b); } #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_setzero_ph() { - let r = _mm_setzero_ph(); - let e = _mm_set1_ph(0.0); + unsafe fn test_mm_loadu_ph() { + let array = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; + let r = _mm_loadu_ph(array.as_ptr()); + let e = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_setzero_ph() { - let r = _mm256_setzero_ph(); - let e = _mm256_set1_ph(0.0); 
+ unsafe fn test_mm256_loadu_ph() { + let array = [ + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ]; + let r = _mm256_loadu_ph(array.as_ptr()); + let e = _mm256_setr_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); assert_eq_m256h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_setzero_ph() { - let r = _mm512_setzero_ph(); - let e = _mm512_set1_ph(0.0); + unsafe fn test_mm512_loadu_ph() { + let array = [ + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ]; + let r = _mm512_loadu_ph(array.as_ptr()); + let e = _mm512_setr_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); assert_eq_m512h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_castsi128_ph() { - let a = _mm_set1_epi16(0x3c00); - let r = _mm_castsi128_ph(a); - let e = _mm_set1_ph(1.0); + unsafe fn test_mm_move_sh() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_sh(9.0); + let r = _mm_move_sh(a, b); + let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 9.0); assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm256_castsi256_ph() { - let a = _mm256_set1_epi16(0x3c00); - let r = _mm256_castsi256_ph(a); - let e = _mm256_set1_ph(1.0); - assert_eq_m256h(r, e); + unsafe fn test_mm_mask_move_sh() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_sh(9.0); + let src = _mm_set_sh(10.0); + let r = _mm_mask_move_sh(src, 0, a, b); + let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 10.0); + assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_castsi512_ph() { - let a = _mm512_set1_epi16(0x3c00); - let r = _mm512_castsi512_ph(a); - let e = _mm512_set1_ph(1.0); - assert_eq_m512h(r, e); + unsafe fn test_mm_maskz_move_sh() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_sh(9.0); + let r = _mm_maskz_move_sh(0, a, b); + let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 0.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_store_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let mut b = _mm_setzero_ph(); + _mm_store_ph(addr_of_mut!(b).cast(), a); + assert_eq_m128h(a, b); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_store_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let mut b = _mm256_setzero_ph(); + _mm256_store_ph(addr_of_mut!(b).cast(), a); + assert_eq_m256h(a, b); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_castph_si128() { - let a = _mm_set1_ph(1.0); - let r = _mm_castph_si128(a); - let e = _mm_set1_epi16(0x3c00); - assert_eq_m128i(r, e); + unsafe fn test_mm512_store_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let mut b = _mm512_setzero_ph(); + _mm512_store_ph(addr_of_mut!(b).cast(), a); + assert_eq_m512h(a, b); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm256_castph_si256() { - let a 
= _mm256_set1_ph(1.0); - let r = _mm256_castph_si256(a); - let e = _mm256_set1_epi16(0x3c00); - assert_eq_m256i(r, e); + unsafe fn test_mm_store_sh() { + let a = _mm_set_sh(1.0); + let mut b = _mm_setzero_ph(); + _mm_store_sh(addr_of_mut!(b).cast(), a); + assert_eq_m128h(a, b); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_castph_si512() { - let a = _mm512_set1_ph(1.0); - let r = _mm512_castph_si512(a); - let e = _mm512_set1_epi16(0x3c00); - assert_eq_m512i(r, e); + unsafe fn test_mm_mask_store_sh() { + let a = _mm_set_sh(1.0); + let mut b = _mm_setzero_ph(); + _mm_mask_store_sh(addr_of_mut!(b).cast(), 0, a); + assert_eq_m128h(_mm_setzero_ph(), b); + _mm_mask_store_sh(addr_of_mut!(b).cast(), 1, a); + assert_eq_m128h(a, b); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_storeu_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let mut array = [0.0; 8]; + _mm_storeu_ph(array.as_mut_ptr(), a); + assert_eq_m128h(a, _mm_loadu_ph(array.as_ptr())); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_storeu_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let mut array = [0.0; 16]; + _mm256_storeu_ph(array.as_mut_ptr(), a); + assert_eq_m256h(a, _mm256_loadu_ph(array.as_ptr())); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_castps_ph() { - let a = _mm_castsi128_ps(_mm_set1_epi16(0x3c00)); - let r = _mm_castps_ph(a); - let e = _mm_set1_ph(1.0); + unsafe fn test_mm512_storeu_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let mut array = [0.0; 32]; + _mm512_storeu_ph(array.as_mut_ptr(), a); + assert_eq_m512h(a, _mm512_loadu_ph(array.as_ptr())); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_add_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); + let r = _mm_add_ph(a, b); + let e = _mm_set1_ph(9.0); assert_eq_m128h(r, e); } - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm256_castps_ph() { - let a = _mm256_castsi256_ps(_mm256_set1_epi16(0x3c00)); - let r = _mm256_castps_ph(a); - let e = _mm256_set1_ph(1.0); + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_add_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm_mask_add_ph(src, 0b01010101, a, b); + let e = _mm_set_ph(10., 9., 12., 9., 14., 9., 16., 9.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_add_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); + let r = _mm_maskz_add_ph(0b01010101, a, b); + let e = _mm_set_ph(0., 9., 0., 9., 0., 9., 0., 9.); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_add_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_set_ph( + 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, + ); + let r = _mm256_add_ph(a, b); + let e = _mm256_set1_ph(17.0); + assert_eq_m256h(r, 
e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_add_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_set_ph( + 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, + ); + let src = _mm256_set_ph( + 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33., + ); + let r = _mm256_mask_add_ph(src, 0b0101010101010101, a, b); + let e = _mm256_set_ph( + 18., 17., 20., 17., 22., 17., 24., 17., 26., 17., 28., 17., 30., 17., 32., 17., + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_add_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_set_ph( + 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, + ); + let r = _mm256_maskz_add_ph(0b0101010101010101, a, b); + let e = _mm256_set_ph( + 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., + ); assert_eq_m256h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_castps_ph() { - let a = _mm512_castsi512_ps(_mm512_set1_epi16(0x3c00)); - let r = _mm512_castps_ph(a); - let e = _mm512_set1_ph(1.0); + unsafe fn test_mm512_add_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let r = _mm512_add_ph(a, b); + let e = _mm512_set1_ph(33.0); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_add_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let src = _mm512_set_ph( + 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., + 51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65., + ); + let r = _mm512_mask_add_ph(src, 0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 34., 33., 36., 33., 38., 33., 40., 33., 42., 33., 44., 33., 46., 33., 48., 33., 50., + 33., 52., 33., 54., 33., 56., 33., 58., 33., 60., 33., 62., 33., 64., 33., + ); assert_eq_m512h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_castph_ps() { - let a = _mm_castsi128_ph(_mm_set1_epi32(0x3f800000)); - let r = _mm_castph_ps(a); - let e = _mm_set1_ps(1.0); - assert_eq_m128(r, e); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm256_castph_ps() { - let a = _mm256_castsi256_ph(_mm256_set1_epi32(0x3f800000)); - let r = _mm256_castph_ps(a); - let e = _mm256_set1_ps(1.0); - assert_eq_m256(r, e); + unsafe fn test_mm512_maskz_add_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 
22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let r = _mm512_maskz_add_ph(0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., + 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., + ); + assert_eq_m512h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_castph_ps() { - let a = _mm512_castsi512_ph(_mm512_set1_epi32(0x3f800000)); - let r = _mm512_castph_ps(a); - let e = _mm512_set1_ps(1.0); - assert_eq_m512(r, e); + unsafe fn test_mm512_add_round_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let r = _mm512_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_set1_ph(33.0); + assert_eq_m512h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_castpd_ph() { - let a = _mm_castsi128_pd(_mm_set1_epi16(0x3c00)); - let r = _mm_castpd_ph(a); - let e = _mm_set1_ph(1.0); - assert_eq_m128h(r, e); + unsafe fn test_mm512_mask_add_round_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let src = _mm512_set_ph( + 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., + 51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65., + ); + let r = _mm512_mask_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b01010101010101010101010101010101, + a, + b, + ); + let e = _mm512_set_ph( + 34., 33., 36., 33., 38., 33., 40., 33., 42., 33., 44., 33., 46., 33., 48., 33., 50., + 33., 52., 33., 54., 33., 56., 33., 58., 33., 60., 33., 62., 33., 64., 33., + ); + assert_eq_m512h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm256_castpd_ph() { - let a = _mm256_castsi256_pd(_mm256_set1_epi16(0x3c00)); - let r = _mm256_castpd_ph(a); - let e = _mm256_set1_ph(1.0); - assert_eq_m256h(r, e); + unsafe fn test_mm512_maskz_add_round_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let r = _mm512_maskz_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101010101010101010101010101, + a, + b, + ); + let e = _mm512_set_ph( + 0., 33., 0., 
33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., + 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., + ); + assert_eq_m512h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_castpd_ph() { - let a = _mm512_castsi512_pd(_mm512_set1_epi16(0x3c00)); - let r = _mm512_castpd_ph(a); - let e = _mm512_set1_ph(1.0); - assert_eq_m512h(r, e); + unsafe fn test_mm_add_round_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_sh(3.0); + assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_castph_pd() { - let a = _mm_castsi128_ph(_mm_set1_epi64x(0x3ff0000000000000)); - let r = _mm_castph_pd(a); - let e = _mm_set1_pd(1.0); - assert_eq_m128d(r, e); + unsafe fn test_mm_mask_add_round_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let src = _mm_set_sh(4.0); + let r = _mm_mask_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, b, + ); + let e = _mm_set_sh(4.0); + assert_eq_m128h(r, e); + let r = _mm_mask_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 1, a, b, + ); + let e = _mm_set_sh(3.0); + assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm256_castph_pd() { - let a = _mm256_castsi256_ph(_mm256_set1_epi64x(0x3ff0000000000000)); - let r = _mm256_castph_pd(a); - let e = _mm256_set1_pd(1.0); - assert_eq_m256d(r, e); + unsafe fn test_mm_maskz_add_round_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = + _mm_maskz_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_set_sh(0.0); + assert_eq_m128h(r, e); + let r = + _mm_maskz_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b); + let e = _mm_set_sh(3.0); + assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_castph_pd() { - let a = _mm512_castsi512_ph(_mm512_set1_epi64(0x3ff0000000000000)); - let r = _mm512_castph_pd(a); - let e = _mm512_set1_pd(1.0); - assert_eq_m512d(r, e); + unsafe fn test_mm_add_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_add_sh(a, b); + let e = _mm_set_sh(3.0); + assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm256_castph256_ph128() { - let a = _mm256_setr_ph( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - let r = _mm256_castph256_ph128(a); - let e = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.); + unsafe fn test_mm_mask_add_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let src = _mm_set_sh(4.0); + let r = _mm_mask_add_sh(src, 0, a, b); + let e = _mm_set_sh(4.0); + assert_eq_m128h(r, e); + let r = _mm_mask_add_sh(src, 1, a, b); + let e = _mm_set_sh(3.0); assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_castph512_ph128() { - let a = _mm512_setr_ph( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., - 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., - ); - let r = _mm512_castph512_ph128(a); - let e = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.); + unsafe fn test_mm_maskz_add_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_maskz_add_sh(0, a, b); + let e = _mm_set_sh(0.0); + assert_eq_m128h(r, e); + let r = _mm_maskz_add_sh(1, a, b); + let e = _mm_set_sh(3.0); assert_eq_m128h(r, e); } - #[simd_test(enable = "avx512fp16")] - 
unsafe fn test_mm512_castph512_ph256() { - let a = _mm512_setr_ph( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., - 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., - ); - let r = _mm512_castph512_ph256(a); - let e = _mm256_setr_ph( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - assert_eq_m256h(r, e); + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_sub_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); + let r = _mm_sub_ph(a, b); + let e = _mm_set_ph(-7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0); + assert_eq_m128h(r, e); } - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm256_castph128_ph256() { - let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm256_castph128_ph256(a); - assert_eq_m128h(_mm256_castph256_ph128(r), a); + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_sub_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm_mask_sub_ph(src, 0b01010101, a, b); + let e = _mm_set_ph(10., -5., 12., -1., 14., 3., 16., 7.); + assert_eq_m128h(r, e); } - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_castph128_ph512() { - let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_castph128_ph512(a); - assert_eq_m128h(_mm512_castph512_ph128(r), a); + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_sub_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); + let r = _mm_maskz_sub_ph(0b01010101, a, b); + let e = _mm_set_ph(0., -5., 0., -1., 0., 3., 0., 7.); + assert_eq_m128h(r, e); } - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_castph256_ph512() { - let a = _mm256_setr_ph( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_sub_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, ); - let r = _mm512_castph256_ph512(a); - assert_eq_m256h(_mm512_castph512_ph256(r), a); + let b = _mm256_set_ph( + 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, + ); + let r = _mm256_sub_ph(a, b); + let e = _mm256_set_ph( + -15.0, -13.0, -11.0, -9.0, -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, + 15.0, + ); + assert_eq_m256h(r, e); } - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm256_zextph128_ph256() { - let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm256_zextph128_ph256(a); - let e = _mm256_setr_ph( - 1., 2., 3., 4., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0., + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_sub_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_set_ph( + 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, + ); + let src = _mm256_set_ph( + 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33., + ); + let r = _mm256_mask_sub_ph(src, 0b0101010101010101, a, b); + let e = _mm256_set_ph( + 18., -13., 20., -9., 22., -5., 24., -1., 26., 3., 28., 7., 30., 11., 32., 15., ); 
assert_eq_m256h(r, e); } - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_zextph128_ph512() { - let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.); - let r = _mm512_zextph128_ph512(a); - let e = _mm512_setr_ph( - 1., 2., 3., 4., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., - 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_sub_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, ); - assert_eq_m512h(r, e); + let b = _mm256_set_ph( + 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, + ); + let r = _mm256_maskz_sub_ph(0b0101010101010101, a, b); + let e = _mm256_set_ph( + 0., -13., 0., -9., 0., -5., 0., -1., 0., 3., 0., 7., 0., 11., 0., 15., + ); + assert_eq_m256h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_zextph256_ph512() { - let a = _mm256_setr_ph( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + unsafe fn test_mm512_sub_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, ); - let r = _mm512_zextph256_ph512(a); - let e = _mm512_setr_ph( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 0., 0., 0., 0., - 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let r = _mm512_sub_ph(a, b); + let e = _mm512_set_ph( + -31.0, -29.0, -27.0, -25.0, -23.0, -21.0, -19.0, -17.0, -15.0, -13.0, -11.0, -9.0, + -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, + 23.0, 25.0, 27.0, 29.0, 31.0, ); assert_eq_m512h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_comi_round_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(1.0); - let r = _mm_comi_round_sh::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b); - assert_eq!(r, 1); + unsafe fn test_mm512_mask_sub_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let src = _mm512_set_ph( + 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., + 51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65., + ); + let r = _mm512_mask_sub_ph(src, 0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 34., -29., 36., -25., 38., -21., 40., -17., 42., -13., 44., -9., 46., -5., 48., -1., + 50., 3., 52., 7., 54., 11., 56., 15., 58., 19., 60., 23., 62., 27., 64., 31., + ); + assert_eq_m512h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_comi_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(1.0); - let r = _mm_comi_sh::<_CMP_EQ_OQ>(a, b); - assert_eq!(r, 1); + unsafe fn test_mm512_maskz_sub_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 
16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let r = _mm512_maskz_sub_ph(0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 0., -29., 0., -25., 0., -21., 0., -17., 0., -13., 0., -9., 0., -5., 0., -1., 0., 3., + 0., 7., 0., 11., 0., 15., 0., 19., 0., 23., 0., 27., 0., 31., + ); + assert_eq_m512h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_comieq_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(1.0); - let r = _mm_comieq_sh(a, b); - assert_eq!(r, 1); + unsafe fn test_mm512_sub_round_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let r = _mm512_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_set_ph( + -31.0, -29.0, -27.0, -25.0, -23.0, -21.0, -19.0, -17.0, -15.0, -13.0, -11.0, -9.0, + -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, + 23.0, 25.0, 27.0, 29.0, 31.0, + ); + assert_eq_m512h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_comige_sh() { - let a = _mm_set_sh(2.0); - let b = _mm_set_sh(1.0); - let r = _mm_comige_sh(a, b); - assert_eq!(r, 1); + unsafe fn test_mm512_mask_sub_round_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let src = _mm512_set_ph( + 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., + 51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65., + ); + let r = _mm512_mask_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b01010101010101010101010101010101, + a, + b, + ); + let e = _mm512_set_ph( + 34., -29., 36., -25., 38., -21., 40., -17., 42., -13., 44., -9., 46., -5., 48., -1., + 50., 3., 52., 7., 54., 11., 56., 15., 58., 19., 60., 23., 62., 27., 64., 31., + ); + assert_eq_m512h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_comigt_sh() { - let a = _mm_set_sh(2.0); - let b = _mm_set_sh(1.0); - let r = _mm_comigt_sh(a, b); - assert_eq!(r, 1); + unsafe fn test_mm512_maskz_sub_round_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let r = _mm512_maskz_sub_round_ph::<{ 
_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101010101010101010101010101, + a, + b, + ); + let e = _mm512_set_ph( + 0., -29., 0., -25., 0., -21., 0., -17., 0., -13., 0., -9., 0., -5., 0., -1., 0., 3., + 0., 7., 0., 11., 0., 15., 0., 19., 0., 23., 0., 27., 0., 31., + ); + assert_eq_m512h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_comile_sh() { + unsafe fn test_mm_sub_round_sh() { let a = _mm_set_sh(1.0); let b = _mm_set_sh(2.0); - let r = _mm_comile_sh(a, b); - assert_eq!(r, 1); + let r = _mm_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_sh(-1.0); + assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_comilt_sh() { + unsafe fn test_mm_mask_sub_round_sh() { let a = _mm_set_sh(1.0); let b = _mm_set_sh(2.0); - let r = _mm_comilt_sh(a, b); - assert_eq!(r, 1); + let src = _mm_set_sh(4.0); + let r = _mm_mask_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, b, + ); + let e = _mm_set_sh(4.0); + assert_eq_m128h(r, e); + let r = _mm_mask_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 1, a, b, + ); + let e = _mm_set_sh(-1.0); + assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_comineq_sh() { + unsafe fn test_mm_maskz_sub_round_sh() { let a = _mm_set_sh(1.0); let b = _mm_set_sh(2.0); - let r = _mm_comineq_sh(a, b); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_ucomieq_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(1.0); - let r = _mm_ucomieq_sh(a, b); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_ucomige_sh() { - let a = _mm_set_sh(2.0); - let b = _mm_set_sh(1.0); - let r = _mm_ucomige_sh(a, b); - assert_eq!(r, 1); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_ucomigt_sh() { - let a = _mm_set_sh(2.0); - let b = _mm_set_sh(1.0); - let r = _mm_ucomigt_sh(a, b); - assert_eq!(r, 1); + let r = + _mm_maskz_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_set_sh(0.0); + assert_eq_m128h(r, e); + let r = + _mm_maskz_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b); + let e = _mm_set_sh(-1.0); + assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_ucomile_sh() { + unsafe fn test_mm_sub_sh() { let a = _mm_set_sh(1.0); let b = _mm_set_sh(2.0); - let r = _mm_ucomile_sh(a, b); - assert_eq!(r, 1); + let r = _mm_sub_sh(a, b); + let e = _mm_set_sh(-1.0); + assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_ucomilt_sh() { + unsafe fn test_mm_mask_sub_sh() { let a = _mm_set_sh(1.0); let b = _mm_set_sh(2.0); - let r = _mm_ucomilt_sh(a, b); - assert_eq!(r, 1); + let src = _mm_set_sh(4.0); + let r = _mm_mask_sub_sh(src, 0, a, b); + let e = _mm_set_sh(4.0); + assert_eq_m128h(r, e); + let r = _mm_mask_sub_sh(src, 1, a, b); + let e = _mm_set_sh(-1.0); + assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_ucomineq_sh() { + unsafe fn test_mm_maskz_sub_sh() { let a = _mm_set_sh(1.0); let b = _mm_set_sh(2.0); - let r = _mm_ucomineq_sh(a, b); - assert_eq!(r, 1); + let r = _mm_maskz_sub_sh(0, a, b); + let e = _mm_set_sh(0.0); + assert_eq_m128h(r, e); + let r = _mm_maskz_sub_sh(1, a, b); + let e = _mm_set_sh(-1.0); + assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_load_ph() { + unsafe fn test_mm_mul_ph() { let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 
5.0, 6.0, 7.0, 8.0); - let b = _mm_load_ph(addr_of!(a).cast()); - assert_eq_m128h(a, b); + let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); + let r = _mm_mul_ph(a, b); + let e = _mm_set_ph(8.0, 14.0, 18.0, 20.0, 20.0, 18.0, 14.0, 8.0); + assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_load_ph() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let b = _mm256_load_ph(addr_of!(a).cast()); - assert_eq_m256h(a, b); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_load_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let b = _mm512_load_ph(addr_of!(a).cast()); - assert_eq_m512h(a, b); - } - - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_load_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_load_sh(addr_of!(a).cast()); - assert_eq_m128h(a, b); + unsafe fn test_mm_mask_mul_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); + let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); + let r = _mm_mask_mul_ph(src, 0b01010101, a, b); + let e = _mm_set_ph(10., 14., 12., 20., 14., 18., 16., 8.); + assert_eq_m128h(r, e); } - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_load_sh() { - let a = _mm_set_sh(1.0); - let src = _mm_set_sh(2.); - let b = _mm_mask_load_sh(src, 1, addr_of!(a).cast()); - assert_eq_m128h(a, b); - let b = _mm_mask_load_sh(src, 0, addr_of!(a).cast()); - assert_eq_m128h(src, b); + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_mul_ph() { + let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); + let r = _mm_maskz_mul_ph(0b01010101, a, b); + let e = _mm_set_ph(0., 14., 0., 20., 0., 18., 0., 8.); + assert_eq_m128h(r, e); } - #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_load_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_maskz_load_sh(1, addr_of!(a).cast()); - assert_eq_m128h(a, b); - let b = _mm_maskz_load_sh(0, addr_of!(a).cast()); - assert_eq_m128h(_mm_setzero_ph(), b); + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mul_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_set_ph( + 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, + ); + let r = _mm256_mul_ph(a, b); + let e = _mm256_set_ph( + 16.0, 30.0, 42.0, 52.0, 60.0, 66.0, 70.0, 72.0, 72.0, 70.0, 66.0, 60.0, 52.0, 42.0, + 30.0, 16.0, + ); + assert_eq_m256h(r, e); } #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_loadu_ph() { - let array = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; - let r = _mm_loadu_ph(array.as_ptr()); - let e = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - assert_eq_m128h(r, e); + unsafe fn test_mm256_mask_mul_ph() { + let a = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_set_ph( + 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, + ); + let src = _mm256_set_ph( + 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33., + ); + let r = _mm256_mask_mul_ph(src, 
0b0101010101010101, a, b); + let e = _mm256_set_ph( + 18., 30., 20., 52., 22., 66., 24., 72., 26., 70., 28., 60., 30., 42., 32., 16., + ); + assert_eq_m256h(r, e); } #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_loadu_ph() { - let array = [ - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ]; - let r = _mm256_loadu_ph(array.as_ptr()); - let e = _mm256_setr_ph( + unsafe fn test_mm256_maskz_mul_ph() { + let a = _mm256_set_ph( 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, ); + let b = _mm256_set_ph( + 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, + ); + let r = _mm256_maskz_mul_ph(0b0101010101010101, a, b); + let e = _mm256_set_ph( + 0., 30., 0., 52., 0., 66., 0., 72., 0., 70., 0., 60., 0., 42., 0., 16., + ); assert_eq_m256h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_loadu_ph() { - let array = [ + unsafe fn test_mm512_mul_ph() { + let a = _mm512_set_ph( 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, - ]; - let r = _mm512_loadu_ph(array.as_ptr()); - let e = _mm512_setr_ph( + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let r = _mm512_mul_ph(a, b); + let e = _mm512_set_ph( + 32.0, 62.0, 90.0, 116.0, 140.0, 162.0, 182.0, 200.0, 216.0, 230.0, 242.0, 252.0, 260.0, + 266.0, 270.0, 272.0, 272.0, 270.0, 266.0, 260.0, 252.0, 242.0, 230.0, 216.0, 200.0, + 182.0, 162.0, 140.0, 116.0, 90.0, 62.0, 32.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_mul_ph() { + let a = _mm512_set_ph( 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let src = _mm512_set_ph( + 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., + 51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65., + ); + let r = _mm512_mask_mul_ph(src, 0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 34., 62., 36., 116., 38., 162., 40., 200., 42., 230., 44., 252., 46., 266., 48., 272., + 50., 270., 52., 260., 54., 242., 56., 216., 58., 182., 60., 140., 62., 90., 64., 32., + ); assert_eq_m512h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_move_sh() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let b = _mm_set_sh(9.0); - let r = _mm_move_sh(a, b); - let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 9.0); - assert_eq_m128h(r, e); + unsafe fn test_mm512_maskz_mul_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 
7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let r = _mm512_maskz_mul_ph(0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 0., 62., 0., 116., 0., 162., 0., 200., 0., 230., 0., 252., 0., 266., 0., 272., 0., + 270., 0., 260., 0., 242., 0., 216., 0., 182., 0., 140., 0., 90., 0., 32., + ); + assert_eq_m512h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_move_sh() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let b = _mm_set_sh(9.0); - let src = _mm_set_sh(10.0); - let r = _mm_mask_move_sh(src, 0, a, b); - let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 10.0); - assert_eq_m128h(r, e); + unsafe fn test_mm512_mul_round_ph() { + let a = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let r = _mm512_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_set_ph( + 32.0, 62.0, 90.0, 116.0, 140.0, 162.0, 182.0, 200.0, 216.0, 230.0, 242.0, 252.0, 260.0, + 266.0, 270.0, 272.0, 272.0, 270.0, 266.0, 260.0, 252.0, 242.0, 230.0, 216.0, 200.0, + 182.0, 162.0, 140.0, 116.0, 90.0, 62.0, 32.0, + ); + assert_eq_m512h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_move_sh() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let b = _mm_set_sh(9.0); - let r = _mm_maskz_move_sh(0, a, b); - let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 0.0); - assert_eq_m128h(r, e); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_store_ph() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let mut b = _mm_setzero_ph(); - _mm_store_ph(addr_of_mut!(b).cast(), a); - assert_eq_m128h(a, b); - } - - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_store_ph() { - let a = _mm256_set_ph( + unsafe fn test_mm512_mask_mul_round_ph() { + let a = _mm512_set_ph( 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let src = _mm512_set_ph( + 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., + 51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65., + ); + let r = _mm512_mask_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b01010101010101010101010101010101, + a, + b, ); - let mut b = _mm256_setzero_ph(); - _mm256_store_ph(addr_of_mut!(b).cast(), a); - assert_eq_m256h(a, b); + let e = _mm512_set_ph( + 34., 62., 36., 116., 38., 162., 40., 200., 42., 230., 44., 252., 46., 266., 48., 272., + 50., 270., 52., 260., 54., 242., 56., 216., 58., 182., 60., 140., 62., 90., 64., 32., + ); + assert_eq_m512h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_store_ph() { + unsafe fn test_mm512_maskz_mul_round_ph() { let a = _mm512_set_ph( 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 
18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, ); - let mut b = _mm512_setzero_ph(); - _mm512_store_ph(addr_of_mut!(b).cast(), a); - assert_eq_m512h(a, b); + let b = _mm512_set_ph( + 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, + 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, + 3.0, 2.0, 1.0, + ); + let r = _mm512_maskz_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101010101010101010101010101, + a, + b, + ); + let e = _mm512_set_ph( + 0., 62., 0., 116., 0., 162., 0., 200., 0., 230., 0., 252., 0., 266., 0., 272., 0., + 270., 0., 260., 0., 242., 0., 216., 0., 182., 0., 140., 0., 90., 0., 32., + ); + assert_eq_m512h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_store_sh() { + unsafe fn test_mm_mul_round_sh() { let a = _mm_set_sh(1.0); - let mut b = _mm_setzero_ph(); - _mm_store_sh(addr_of_mut!(b).cast(), a); - assert_eq_m128h(a, b); + let b = _mm_set_sh(2.0); + let r = _mm_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_sh(2.0); + assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_store_sh() { + unsafe fn test_mm_mask_mul_round_sh() { let a = _mm_set_sh(1.0); - let mut b = _mm_setzero_ph(); - _mm_mask_store_sh(addr_of_mut!(b).cast(), 0, a); - assert_eq_m128h(_mm_setzero_ph(), b); - _mm_mask_store_sh(addr_of_mut!(b).cast(), 1, a); - assert_eq_m128h(a, b); + let b = _mm_set_sh(2.0); + let src = _mm_set_sh(4.0); + let r = _mm_mask_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, b, + ); + let e = _mm_set_sh(4.0); + assert_eq_m128h(r, e); + let r = _mm_mask_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 1, a, b, + ); + let e = _mm_set_sh(2.0); + assert_eq_m128h(r, e); } - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_storeu_ph() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let mut array = [0.0; 8]; - _mm_storeu_ph(array.as_mut_ptr(), a); - assert_eq_m128h(a, _mm_loadu_ph(array.as_ptr())); + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_mul_round_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = + _mm_maskz_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_set_sh(0.0); + assert_eq_m128h(r, e); + let r = + _mm_maskz_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b); + let e = _mm_set_sh(2.0); + assert_eq_m128h(r, e); } - #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_storeu_ph() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let mut array = [0.0; 16]; - _mm256_storeu_ph(array.as_mut_ptr(), a); - assert_eq_m256h(a, _mm256_loadu_ph(array.as_ptr())); + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mul_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_mul_sh(a, b); + let e = _mm_set_sh(2.0); + assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_storeu_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let mut array = [0.0; 32]; - _mm512_storeu_ph(array.as_mut_ptr(), a); - assert_eq_m512h(a, _mm512_loadu_ph(array.as_ptr())); + 
unsafe fn test_mm_mask_mul_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let src = _mm_set_sh(4.0); + let r = _mm_mask_mul_sh(src, 0, a, b); + let e = _mm_set_sh(4.0); + assert_eq_m128h(r, e); + let r = _mm_mask_mul_sh(src, 1, a, b); + let e = _mm_set_sh(2.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_mul_sh() { + let a = _mm_set_sh(1.0); + let b = _mm_set_sh(2.0); + let r = _mm_maskz_mul_sh(0, a, b); + let e = _mm_set_sh(0.0); + assert_eq_m128h(r, e); + let r = _mm_maskz_mul_sh(1, a, b); + let e = _mm_set_sh(2.0); + assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_add_ph() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); - let r = _mm_add_ph(a, b); - let e = _mm_set1_ph(9.0); + unsafe fn test_mm_div_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let r = _mm_div_ph(a, b); + let e = _mm_set1_ph(0.5); assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_add_ph() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); - let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); - let r = _mm_mask_add_ph(src, 0b01010101, a, b); - let e = _mm_set_ph(10., 9., 12., 9., 14., 9., 16., 9.); + unsafe fn test_mm_mask_div_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let src = _mm_set_ph(4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0); + let r = _mm_mask_div_ph(src, 0b01010101, a, b); + let e = _mm_set_ph(4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5); assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_add_ph() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); - let r = _mm_maskz_add_ph(0b01010101, a, b); - let e = _mm_set_ph(0., 9., 0., 9., 0., 9., 0., 9.); + unsafe fn test_mm_maskz_div_ph() { + let a = _mm_set1_ph(1.0); + let b = _mm_set1_ph(2.0); + let r = _mm_maskz_div_ph(0b01010101, a, b); + let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5); assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_add_ph() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let b = _mm256_set_ph( - 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, - ); - let r = _mm256_add_ph(a, b); - let e = _mm256_set1_ph(17.0); + unsafe fn test_mm256_div_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let r = _mm256_div_ph(a, b); + let e = _mm256_set1_ph(0.5); assert_eq_m256h(r, e); } #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_add_ph() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let b = _mm256_set_ph( - 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, - ); + unsafe fn test_mm256_mask_div_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); let src = _mm256_set_ph( - 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33., + 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, + 19.0, ); - let r = _mm256_mask_add_ph(src, 0b0101010101010101, a, b); + let r = _mm256_mask_div_ph(src, 0b0101010101010101, a, 
b); let e = _mm256_set_ph( - 18., 17., 20., 17., 22., 17., 24., 17., 26., 17., 28., 17., 30., 17., 32., 17., + 4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5, ); assert_eq_m256h(r, e); } #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_add_ph() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let b = _mm256_set_ph( - 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, - ); - let r = _mm256_maskz_add_ph(0b0101010101010101, a, b); + unsafe fn test_mm256_maskz_div_ph() { + let a = _mm256_set1_ph(1.0); + let b = _mm256_set1_ph(2.0); + let r = _mm256_maskz_div_ph(0b0101010101010101, a, b); let e = _mm256_set_ph( - 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., + 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, ); assert_eq_m256h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_add_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let b = _mm512_set_ph( - 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, - 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, - 3.0, 2.0, 1.0, - ); - let r = _mm512_add_ph(a, b); - let e = _mm512_set1_ph(33.0); + unsafe fn test_mm512_div_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let r = _mm512_div_ph(a, b); + let e = _mm512_set1_ph(0.5); assert_eq_m512h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_add_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let b = _mm512_set_ph( - 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, - 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, - 3.0, 2.0, 1.0, - ); + unsafe fn test_mm512_mask_div_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); let src = _mm512_set_ph( - 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., - 51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65., + 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, + 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, + 33.0, 34.0, 35.0, ); - let r = _mm512_mask_add_ph(src, 0b01010101010101010101010101010101, a, b); + let r = _mm512_mask_div_ph(src, 0b01010101010101010101010101010101, a, b); let e = _mm512_set_ph( - 34., 33., 36., 33., 38., 33., 40., 33., 42., 33., 44., 33., 46., 33., 48., 33., 50., - 33., 52., 33., 54., 33., 56., 33., 58., 33., 60., 33., 62., 33., 64., 33., + 4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5, + 20.0, 0.5, 22.0, 0.5, 24.0, 0.5, 26.0, 0.5, 28.0, 0.5, 30.0, 0.5, 32.0, 0.5, 34.0, 0.5, ); assert_eq_m512h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_add_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 
32.0, - ); - let b = _mm512_set_ph( - 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, - 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, - 3.0, 2.0, 1.0, - ); - let r = _mm512_maskz_add_ph(0b01010101010101010101010101010101, a, b); + unsafe fn test_mm512_maskz_div_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let r = _mm512_maskz_div_ph(0b01010101010101010101010101010101, a, b); let e = _mm512_set_ph( - 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., - 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., + 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, + 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, ); assert_eq_m512h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_add_round_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let b = _mm512_set_ph( - 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, - 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, - 3.0, 2.0, 1.0, - ); - let r = _mm512_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm512_set1_ph(33.0); + unsafe fn test_mm512_div_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let r = _mm512_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_set1_ph(0.5); assert_eq_m512h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_add_round_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let b = _mm512_set_ph( - 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, - 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, - 3.0, 2.0, 1.0, - ); + unsafe fn test_mm512_mask_div_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); let src = _mm512_set_ph( - 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., - 51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65., + 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, + 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, + 33.0, 34.0, 35.0, ); - let r = _mm512_mask_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + let r = _mm512_mask_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( src, 0b01010101010101010101010101010101, a, b, ); let e = _mm512_set_ph( - 34., 33., 36., 33., 38., 33., 40., 33., 42., 33., 44., 33., 46., 33., 48., 33., 50., - 33., 52., 33., 54., 33., 56., 33., 58., 33., 60., 33., 62., 33., 64., 33., + 4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5, + 20.0, 0.5, 22.0, 0.5, 24.0, 0.5, 26.0, 0.5, 28.0, 0.5, 30.0, 0.5, 32.0, 0.5, 34.0, 0.5, ); assert_eq_m512h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_add_round_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 
17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let b = _mm512_set_ph( - 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, - 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, - 3.0, 2.0, 1.0, - ); - let r = _mm512_maskz_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + unsafe fn test_mm512_maskz_div_round_ph() { + let a = _mm512_set1_ph(1.0); + let b = _mm512_set1_ph(2.0); + let r = _mm512_maskz_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( 0b01010101010101010101010101010101, a, b, ); let e = _mm512_set_ph( - 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., - 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., + 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, + 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, ); assert_eq_m512h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_add_round_sh() { + unsafe fn test_mm_div_round_sh() { let a = _mm_set_sh(1.0); let b = _mm_set_sh(2.0); - let r = _mm_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm_set_sh(3.0); + let r = _mm_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_set_sh(0.5); assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_add_round_sh() { + unsafe fn test_mm_mask_div_round_sh() { let a = _mm_set_sh(1.0); let b = _mm_set_sh(2.0); let src = _mm_set_sh(4.0); - let r = _mm_mask_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + let r = _mm_mask_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( src, 0, a, b, ); let e = _mm_set_sh(4.0); assert_eq_m128h(r, e); - let r = _mm_mask_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + let r = _mm_mask_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( src, 1, a, b, ); - let e = _mm_set_sh(3.0); + let e = _mm_set_sh(0.5); assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_add_round_sh() { + unsafe fn test_mm_maskz_div_round_sh() { let a = _mm_set_sh(1.0); let b = _mm_set_sh(2.0); let r = - _mm_maskz_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + _mm_maskz_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); let e = _mm_set_sh(0.0); assert_eq_m128h(r, e); let r = - _mm_maskz_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b); - let e = _mm_set_sh(3.0); + _mm_maskz_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b); + let e = _mm_set_sh(0.5); assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_add_sh() { + unsafe fn test_mm_div_sh() { let a = _mm_set_sh(1.0); let b = _mm_set_sh(2.0); - let r = _mm_add_sh(a, b); - let e = _mm_set_sh(3.0); + let r = _mm_div_sh(a, b); + let e = _mm_set_sh(0.5); assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_add_sh() { + unsafe fn test_mm_mask_div_sh() { let a = _mm_set_sh(1.0); let b = _mm_set_sh(2.0); let src = _mm_set_sh(4.0); - let r = _mm_mask_add_sh(src, 0, a, b); + let r = _mm_mask_div_sh(src, 0, a, b); let e = _mm_set_sh(4.0); assert_eq_m128h(r, e); - let r = _mm_mask_add_sh(src, 1, a, b); - let e = _mm_set_sh(3.0); + let r = _mm_mask_div_sh(src, 1, a, b); + let e = _mm_set_sh(0.5); assert_eq_m128h(r, 
e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_add_sh() { + unsafe fn test_mm_maskz_div_sh() { let a = _mm_set_sh(1.0); let b = _mm_set_sh(2.0); - let r = _mm_maskz_add_sh(0, a, b); + let r = _mm_maskz_div_sh(0, a, b); let e = _mm_set_sh(0.0); assert_eq_m128h(r, e); - let r = _mm_maskz_add_sh(1, a, b); - let e = _mm_set_sh(3.0); + let r = _mm_maskz_div_sh(1, a, b); + let e = _mm_set_sh(0.5); assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_sub_ph() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); - let r = _mm_sub_ph(a, b); - let e = _mm_set_ph(-7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0); + unsafe fn test_mm_mul_pch() { + let a = _mm_set1_pch((0.0, 1.0)); + let b = _mm_set1_pch((0.0, 1.0)); + let r = _mm_mul_pch(a, b); + let e = _mm_set1_pch((-1.0, 0.0)); assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_sub_ph() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); - let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); - let r = _mm_mask_sub_ph(src, 0b01010101, a, b); - let e = _mm_set_ph(10., -5., 12., -1., 14., 3., 16., 7.); + unsafe fn test_mm_mask_mul_pch() { + let a = _mm_set1_pch((0.0, 1.0)); + let b = _mm_set1_pch((0.0, 1.0)); + let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0); + let r = _mm_mask_mul_pch(src, 0b0101, a, b); + let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0); assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_sub_ph() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); - let r = _mm_maskz_sub_ph(0b01010101, a, b); - let e = _mm_set_ph(0., -5., 0., -1., 0., 3., 0., 7.); + unsafe fn test_mm_maskz_mul_pch() { + let a = _mm_set1_pch((0.0, 1.0)); + let b = _mm_set1_pch((0.0, 1.0)); + let r = _mm_maskz_mul_pch(0b0101, a, b); + let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0); assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_sub_ph() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let b = _mm256_set_ph( - 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, - ); - let r = _mm256_sub_ph(a, b); - let e = _mm256_set_ph( - -15.0, -13.0, -11.0, -9.0, -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, - 15.0, - ); + unsafe fn test_mm256_mul_pch() { + let a = _mm256_set1_pch((0.0, 1.0)); + let b = _mm256_set1_pch((0.0, 1.0)); + let r = _mm256_mul_pch(a, b); + let e = _mm256_set1_pch((-1.0, 0.0)); assert_eq_m256h(r, e); } #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_sub_ph() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let b = _mm256_set_ph( - 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, - ); - let src = _mm256_set_ph( - 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33., + unsafe fn test_mm256_mask_mul_pch() { + let a = _mm256_set1_pch((0.0, 1.0)); + let b = _mm256_set1_pch((0.0, 1.0)); + let src = _mm256_setr_ph( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, ); - let r = 
_mm256_mask_sub_ph(src, 0b0101010101010101, a, b); - let e = _mm256_set_ph( - 18., -13., 20., -9., 22., -5., 24., -1., 26., 3., 28., 7., 30., 11., 32., 15., + let r = _mm256_mask_mul_pch(src, 0b01010101, a, b); + let e = _mm256_setr_ph( + -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, ); assert_eq_m256h(r, e); } #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_sub_ph() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + unsafe fn test_mm256_maskz_mul_pch() { + let a = _mm256_set1_pch((0.0, 1.0)); + let b = _mm256_set1_pch((0.0, 1.0)); + let r = _mm256_maskz_mul_pch(0b01010101, a, b); + let e = _mm256_setr_ph( + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, ); - let b = _mm256_set_ph( - 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mul_pch() { + let a = _mm512_set1_pch((0.0, 1.0)); + let b = _mm512_set1_pch((0.0, 1.0)); + let r = _mm512_mul_pch(a, b); + let e = _mm512_set1_pch((-1.0, 0.0)); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_mul_pch() { + let a = _mm512_set1_pch((0.0, 1.0)); + let b = _mm512_set1_pch((0.0, 1.0)); + let src = _mm512_setr_ph( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, + 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, + 32.0, 33.0, + ); + let r = _mm512_mask_mul_pch(src, 0b0101010101010101, a, b); + let e = _mm512_setr_ph( + -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, + -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0, + 33.0, ); - let r = _mm256_maskz_sub_ph(0b0101010101010101, a, b); - let e = _mm256_set_ph( - 0., -13., 0., -9., 0., -5., 0., -1., 0., 3., 0., 7., 0., 11., 0., 15., + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_mul_pch() { + let a = _mm512_set1_pch((0.0, 1.0)); + let b = _mm512_set1_pch((0.0, 1.0)); + let r = _mm512_maskz_mul_pch(0b0101010101010101, a, b); + let e = _mm512_setr_ph( + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, ); - assert_eq_m256h(r, e); + assert_eq_m512h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_sub_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, + unsafe fn test_mm512_mul_round_pch() { + let a = _mm512_set1_pch((0.0, 1.0)); + let b = _mm512_set1_pch((0.0, 1.0)); + let r = _mm512_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_set1_pch((-1.0, 0.0)); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_mask_mul_round_pch() { + let a = _mm512_set1_pch((0.0, 1.0)); + let b = _mm512_set1_pch((0.0, 1.0)); + let src = _mm512_setr_ph( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, + 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, + 32.0, 33.0, + ); + let r = _mm512_mask_mul_round_pch::<{ 
_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b0101010101010101, + a, + b, ); - let b = _mm512_set_ph( - 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, - 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, - 3.0, 2.0, 1.0, + let e = _mm512_setr_ph( + -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, + -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0, + 33.0, ); - let r = _mm512_sub_ph(a, b); - let e = _mm512_set_ph( - -31.0, -29.0, -27.0, -25.0, -23.0, -21.0, -19.0, -17.0, -15.0, -13.0, -11.0, -9.0, - -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, - 23.0, 25.0, 27.0, 29.0, 31.0, + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_maskz_mul_round_pch() { + let a = _mm512_set1_pch((0.0, 1.0)); + let b = _mm512_set1_pch((0.0, 1.0)); + let r = _mm512_maskz_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b0101010101010101, + a, + b, + ); + let e = _mm512_setr_ph( + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mul_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let r = _mm_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_mul_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0); + let r = _mm_mask_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, b, + ); + let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_mul_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let r = + _mm_maskz_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mul_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let r = _mm_mul_sch(a, b); + let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_mask_mul_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0); + let r = _mm_mask_mul_sch(src, 0, a, b); + let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm_maskz_mul_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 
12.0, 13.0); + let r = _mm_maskz_mul_sch(0, a, b); + let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_fmul_pch() { + let a = _mm_set1_pch((0.0, 1.0)); + let b = _mm_set1_pch((0.0, 1.0)); + let r = _mm_fmul_pch(a, b); + let e = _mm_set1_pch((-1.0, 0.0)); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_mask_fmul_pch() { + let a = _mm_set1_pch((0.0, 1.0)); + let b = _mm_set1_pch((0.0, 1.0)); + let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0); + let r = _mm_mask_fmul_pch(src, 0b0101, a, b); + let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm_maskz_fmul_pch() { + let a = _mm_set1_pch((0.0, 1.0)); + let b = _mm_set1_pch((0.0, 1.0)); + let r = _mm_maskz_fmul_pch(0b0101, a, b); + let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_fmul_pch() { + let a = _mm256_set1_pch((0.0, 1.0)); + let b = _mm256_set1_pch((0.0, 1.0)); + let r = _mm256_fmul_pch(a, b); + let e = _mm256_set1_pch((-1.0, 0.0)); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_mask_fmul_pch() { + let a = _mm256_set1_pch((0.0, 1.0)); + let b = _mm256_set1_pch((0.0, 1.0)); + let src = _mm256_setr_ph( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, ); + let r = _mm256_mask_fmul_pch(src, 0b01010101, a, b); + let e = _mm256_setr_ph( + -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16,avx512vl")] + unsafe fn test_mm256_maskz_fmul_pch() { + let a = _mm256_set1_pch((0.0, 1.0)); + let b = _mm256_set1_pch((0.0, 1.0)); + let r = _mm256_maskz_fmul_pch(0b01010101, a, b); + let e = _mm256_setr_ph( + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx512fp16")] + unsafe fn test_mm512_fmul_pch() { + let a = _mm512_set1_pch((0.0, 1.0)); + let b = _mm512_set1_pch((0.0, 1.0)); + let r = _mm512_fmul_pch(a, b); + let e = _mm512_set1_pch((-1.0, 0.0)); assert_eq_m512h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_sub_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let b = _mm512_set_ph( - 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, - 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, - 3.0, 2.0, 1.0, - ); - let src = _mm512_set_ph( - 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., - 51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65., - ); - let r = _mm512_mask_sub_ph(src, 0b01010101010101010101010101010101, a, b); - let e = _mm512_set_ph( - 34., -29., 36., -25., 38., -21., 40., -17., 42., -13., 44., -9., 46., -5., 48., -1., - 50., 3., 52., 7., 54., 11., 56., 15., 58., 19., 60., 23., 62., 27., 64., 31., + unsafe fn test_mm512_mask_fmul_pch() { + let a = _mm512_set1_pch((0.0, 1.0)); + let b = _mm512_set1_pch((0.0, 1.0)); + let src = 
_mm512_setr_ph( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, + 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, + 32.0, 33.0, + ); + let r = _mm512_mask_fmul_pch(src, 0b0101010101010101, a, b); + let e = _mm512_setr_ph( + -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, + -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0, + 33.0, ); assert_eq_m512h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_sub_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let b = _mm512_set_ph( - 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, - 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, - 3.0, 2.0, 1.0, - ); - let r = _mm512_maskz_sub_ph(0b01010101010101010101010101010101, a, b); - let e = _mm512_set_ph( - 0., -29., 0., -25., 0., -21., 0., -17., 0., -13., 0., -9., 0., -5., 0., -1., 0., 3., - 0., 7., 0., 11., 0., 15., 0., 19., 0., 23., 0., 27., 0., 31., + unsafe fn test_mm512_maskz_fmul_pch() { + let a = _mm512_set1_pch((0.0, 1.0)); + let b = _mm512_set1_pch((0.0, 1.0)); + let r = _mm512_maskz_fmul_pch(0b0101010101010101, a, b); + let e = _mm512_setr_ph( + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, ); assert_eq_m512h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_sub_round_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let b = _mm512_set_ph( - 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, - 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, - 3.0, 2.0, 1.0, - ); - let r = _mm512_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm512_set_ph( - -31.0, -29.0, -27.0, -25.0, -23.0, -21.0, -19.0, -17.0, -15.0, -13.0, -11.0, -9.0, - -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, - 23.0, 25.0, 27.0, 29.0, 31.0, - ); + unsafe fn test_mm512_fmul_round_pch() { + let a = _mm512_set1_pch((0.0, 1.0)); + let b = _mm512_set1_pch((0.0, 1.0)); + let r = _mm512_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_set1_pch((-1.0, 0.0)); assert_eq_m512h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mask_sub_round_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let b = _mm512_set_ph( - 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, - 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, - 3.0, 2.0, 1.0, - ); - let src = _mm512_set_ph( - 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., - 51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65., - ); - let r = _mm512_mask_sub_round_ph::<{ 
_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + unsafe fn test_mm512_mask_fmul_round_pch() { + let a = _mm512_set1_pch((0.0, 1.0)); + let b = _mm512_set1_pch((0.0, 1.0)); + let src = _mm512_setr_ph( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, + 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, + 32.0, 33.0, + ); + let r = _mm512_mask_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( src, - 0b01010101010101010101010101010101, + 0b0101010101010101, a, b, ); - let e = _mm512_set_ph( - 34., -29., 36., -25., 38., -21., 40., -17., 42., -13., 44., -9., 46., -5., 48., -1., - 50., 3., 52., 7., 54., 11., 56., 15., 58., 19., 60., 23., 62., 27., 64., 31., + let e = _mm512_setr_ph( + -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, + -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0, + 33.0, ); assert_eq_m512h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_maskz_sub_round_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, - ); - let b = _mm512_set_ph( - 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, - 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, - 3.0, 2.0, 1.0, - ); - let r = _mm512_maskz_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b01010101010101010101010101010101, + unsafe fn test_mm512_maskz_fmul_round_pch() { + let a = _mm512_set1_pch((0.0, 1.0)); + let b = _mm512_set1_pch((0.0, 1.0)); + let r = _mm512_maskz_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b0101010101010101, a, b, ); - let e = _mm512_set_ph( - 0., -29., 0., -25., 0., -21., 0., -17., 0., -13., 0., -9., 0., -5., 0., -1., 0., 3., - 0., 7., 0., 11., 0., 15., 0., 19., 0., 23., 0., 27., 0., 31., + let e = _mm512_setr_ph( + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, ); assert_eq_m512h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_sub_round_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(2.0); - let r = _mm_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); - let e = _mm_set_sh(-1.0); + unsafe fn test_mm_fmul_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let r = _mm_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_sub_round_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(2.0); - let src = _mm_set_sh(4.0); - let r = _mm_mask_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + unsafe fn test_mm_mask_fmul_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0); + let r = _mm_mask_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( src, 0, a, b, ); - let e = _mm_set_sh(4.0); - assert_eq_m128h(r, e); - let r = 
_mm_mask_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 1, a, b, - ); - let e = _mm_set_sh(-1.0); + let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_sub_round_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(2.0); - let r = - _mm_maskz_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); - let e = _mm_set_sh(0.0); - assert_eq_m128h(r, e); + unsafe fn test_mm_maskz_fmul_round_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); let r = - _mm_maskz_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b); - let e = _mm_set_sh(-1.0); + _mm_maskz_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); + let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_sub_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(2.0); - let r = _mm_sub_sh(a, b); - let e = _mm_set_sh(-1.0); + unsafe fn test_mm_fmul_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let r = _mm_fmul_sch(a, b); + let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_mask_sub_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(2.0); - let src = _mm_set_sh(4.0); - let r = _mm_mask_sub_sh(src, 0, a, b); - let e = _mm_set_sh(4.0); - assert_eq_m128h(r, e); - let r = _mm_mask_sub_sh(src, 1, a, b); - let e = _mm_set_sh(-1.0); + unsafe fn test_mm_mask_fmul_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0); + let r = _mm_mask_fmul_sch(src, 0, a, b); + let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm_maskz_sub_sh() { - let a = _mm_set_sh(1.0); - let b = _mm_set_sh(2.0); - let r = _mm_maskz_sub_sh(0, a, b); - let e = _mm_set_sh(0.0); - assert_eq_m128h(r, e); - let r = _mm_maskz_sub_sh(1, a, b); - let e = _mm_set_sh(-1.0); + unsafe fn test_mm_maskz_fmul_sch() { + let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); + let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0); + let r = _mm_maskz_fmul_sch(0, a, b); + let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0); assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mul_ph() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); - let r = _mm_mul_ph(a, b); - let e = _mm_set_ph(8.0, 14.0, 18.0, 20.0, 20.0, 18.0, 14.0, 8.0); + unsafe fn test_mm_cmul_pch() { + let a = _mm_set1_pch((0.0, 1.0)); + let b = _mm_set1_pch((0.0, -1.0)); + let r = _mm_cmul_pch(a, b); + let e = _mm_set1_pch((-1.0, 0.0)); assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_mask_mul_ph() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); - let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.); - let r = _mm_mask_mul_ph(src, 0b01010101, a, b); - let e = _mm_set_ph(10., 14., 12., 
20., 14., 18., 16., 8.); + unsafe fn test_mm_mask_cmul_pch() { + let a = _mm_set1_pch((0.0, 1.0)); + let b = _mm_set1_pch((0.0, -1.0)); + let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0); + let r = _mm_mask_cmul_pch(src, 0b0101, a, b); + let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0); assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm_maskz_mul_ph() { - let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); - let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); - let r = _mm_maskz_mul_ph(0b01010101, a, b); - let e = _mm_set_ph(0., 14., 0., 20., 0., 18., 0., 8.); + unsafe fn test_mm_maskz_cmul_pch() { + let a = _mm_set1_pch((0.0, 1.0)); + let b = _mm_set1_pch((0.0, -1.0)); + let r = _mm_maskz_cmul_pch(0b0101, a, b); + let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0); assert_eq_m128h(r, e); } #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mul_ph() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let b = _mm256_set_ph( - 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, - ); - let r = _mm256_mul_ph(a, b); - let e = _mm256_set_ph( - 16.0, 30.0, 42.0, 52.0, 60.0, 66.0, 70.0, 72.0, 72.0, 70.0, 66.0, 60.0, 52.0, 42.0, - 30.0, 16.0, - ); + unsafe fn test_mm256_cmul_pch() { + let a = _mm256_set1_pch((0.0, 1.0)); + let b = _mm256_set1_pch((0.0, -1.0)); + let r = _mm256_cmul_pch(a, b); + let e = _mm256_set1_pch((-1.0, 0.0)); assert_eq_m256h(r, e); } #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_mask_mul_ph() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let b = _mm256_set_ph( - 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, + unsafe fn test_mm256_mask_cmul_pch() { + let a = _mm256_set1_pch((0.0, 1.0)); + let b = _mm256_set1_pch((0.0, -1.0)); + let src = _mm256_setr_ph( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, ); - let src = _mm256_set_ph( - 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33., - ); - let r = _mm256_mask_mul_ph(src, 0b0101010101010101, a, b); - let e = _mm256_set_ph( - 18., 30., 20., 52., 22., 66., 24., 72., 26., 70., 28., 60., 30., 42., 32., 16., + let r = _mm256_mask_cmul_pch(src, 0b01010101, a, b); + let e = _mm256_setr_ph( + -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0, ); assert_eq_m256h(r, e); } #[simd_test(enable = "avx512fp16,avx512vl")] - unsafe fn test_mm256_maskz_mul_ph() { - let a = _mm256_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - ); - let b = _mm256_set_ph( - 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, - ); - let r = _mm256_maskz_mul_ph(0b0101010101010101, a, b); - let e = _mm256_set_ph( - 0., 30., 0., 52., 0., 66., 0., 72., 0., 70., 0., 60., 0., 42., 0., 16., + unsafe fn test_mm256_maskz_cmul_pch() { + let a = _mm256_set1_pch((0.0, 1.0)); + let b = _mm256_set1_pch((0.0, -1.0)); + let r = _mm256_maskz_cmul_pch(0b01010101, a, b); + let e = _mm256_setr_ph( + -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, ); assert_eq_m256h(r, e); } #[simd_test(enable = "avx512fp16")] - unsafe fn test_mm512_mul_ph() { - let a = _mm512_set_ph( - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 
9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
-            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
-            31.0, 32.0,
-        );
-        let b = _mm512_set_ph(
-            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
-            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
-            3.0, 2.0, 1.0,
-        );
-        let r = _mm512_mul_ph(a, b);
-        let e = _mm512_set_ph(
-            32.0, 62.0, 90.0, 116.0, 140.0, 162.0, 182.0, 200.0, 216.0, 230.0, 242.0, 252.0, 260.0,
-            266.0, 270.0, 272.0, 272.0, 270.0, 266.0, 260.0, 252.0, 242.0, 230.0, 216.0, 200.0,
-            182.0, 162.0, 140.0, 116.0, 90.0, 62.0, 32.0,
-        );
+    unsafe fn test_mm512_cmul_pch() {
+        let a = _mm512_set1_pch((0.0, 1.0));
+        let b = _mm512_set1_pch((0.0, -1.0));
+        let r = _mm512_cmul_pch(a, b);
+        let e = _mm512_set1_pch((-1.0, 0.0));
         assert_eq_m512h(r, e);
     }

     #[simd_test(enable = "avx512fp16")]
-    unsafe fn test_mm512_mask_mul_ph() {
-        let a = _mm512_set_ph(
-            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
-            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
-            31.0, 32.0,
-        );
-        let b = _mm512_set_ph(
-            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
-            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
-            3.0, 2.0, 1.0,
-        );
-        let src = _mm512_set_ph(
-            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
-            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
-        );
-        let r = _mm512_mask_mul_ph(src, 0b01010101010101010101010101010101, a, b);
-        let e = _mm512_set_ph(
-            34., 62., 36., 116., 38., 162., 40., 200., 42., 230., 44., 252., 46., 266., 48., 272.,
-            50., 270., 52., 260., 54., 242., 56., 216., 58., 182., 60., 140., 62., 90., 64., 32.,
+    unsafe fn test_mm512_mask_cmul_pch() {
+        let a = _mm512_set1_pch((0.0, 1.0));
+        let b = _mm512_set1_pch((0.0, -1.0));
+        let src = _mm512_setr_ph(
+            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
+            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
+            32.0, 33.0,
+        );
+        let r = _mm512_mask_cmul_pch(src, 0b0101010101010101, a, b);
+        let e = _mm512_setr_ph(
+            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
+            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
+            33.0,
         );
         assert_eq_m512h(r, e);
     }
-
-    #[simd_test(enable = "avx512fp16")]
-    unsafe fn test_mm512_maskz_mul_ph() {
-        let a = _mm512_set_ph(
-            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
-            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
-            31.0, 32.0,
-        );
-        let b = _mm512_set_ph(
-            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
-            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
-            3.0, 2.0, 1.0,
-        );
-        let r = _mm512_maskz_mul_ph(0b01010101010101010101010101010101, a, b);
-        let e = _mm512_set_ph(
-            0., 62., 0., 116., 0., 162., 0., 200., 0., 230., 0., 252., 0., 266., 0., 272., 0.,
-            270., 0., 260., 0., 242., 0., 216., 0., 182., 0., 140., 0., 90., 0., 32.,
+
+    #[simd_test(enable = "avx512fp16")]
+    unsafe fn test_mm512_maskz_cmul_pch() {
+        let a = _mm512_set1_pch((0.0, 1.0));
+        let b = _mm512_set1_pch((0.0, -1.0));
+        let r = _mm512_maskz_cmul_pch(0b0101010101010101, a, b);
+        let e = _mm512_setr_ph(
+            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
+            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
         );
         assert_eq_m512h(r, e);
     }

     #[simd_test(enable = "avx512fp16")]
-    unsafe fn test_mm512_mul_round_ph() {
-        let a = _mm512_set_ph(
-            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
-            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
-            31.0, 32.0,
-        );
-        let b = _mm512_set_ph(
-            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
-            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
-            3.0, 2.0, 1.0,
-        );
-        let r = _mm512_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
-        let e = _mm512_set_ph(
-            32.0, 62.0, 90.0, 116.0, 140.0, 162.0, 182.0, 200.0, 216.0, 230.0, 242.0, 252.0, 260.0,
-            266.0, 270.0, 272.0, 272.0, 270.0, 266.0, 260.0, 252.0, 242.0, 230.0, 216.0, 200.0,
-            182.0, 162.0, 140.0, 116.0, 90.0, 62.0, 32.0,
-        );
+    unsafe fn test_mm512_cmul_round_pch() {
+        let a = _mm512_set1_pch((0.0, 1.0));
+        let b = _mm512_set1_pch((0.0, -1.0));
+        let r = _mm512_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
+        let e = _mm512_set1_pch((-1.0, 0.0));
         assert_eq_m512h(r, e);
     }

     #[simd_test(enable = "avx512fp16")]
-    unsafe fn test_mm512_mask_mul_round_ph() {
-        let a = _mm512_set_ph(
-            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
-            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
-            31.0, 32.0,
-        );
-        let b = _mm512_set_ph(
-            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
-            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
-            3.0, 2.0, 1.0,
-        );
-        let src = _mm512_set_ph(
-            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
-            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
-        );
-        let r = _mm512_mask_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+    unsafe fn test_mm512_mask_cmul_round_pch() {
+        let a = _mm512_set1_pch((0.0, 1.0));
+        let b = _mm512_set1_pch((0.0, -1.0));
+        let src = _mm512_setr_ph(
+            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
+            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
+            32.0, 33.0,
+        );
+        let r = _mm512_mask_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
             src,
-            0b01010101010101010101010101010101,
+            0b0101010101010101,
             a,
             b,
         );
-        let e = _mm512_set_ph(
-            34., 62., 36., 116., 38., 162., 40., 200., 42., 230., 44., 252., 46., 266., 48., 272.,
-            50., 270., 52., 260., 54., 242., 56., 216., 58., 182., 60., 140., 62., 90., 64., 32.,
+        let e = _mm512_setr_ph(
+            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
+            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
+            33.0,
         );
         assert_eq_m512h(r, e);
     }

     #[simd_test(enable = "avx512fp16")]
-    unsafe fn test_mm512_maskz_mul_round_ph() {
-        let a = _mm512_set_ph(
-            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
-            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
-            31.0, 32.0,
-        );
-        let b = _mm512_set_ph(
-            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
-            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
-            3.0, 2.0, 1.0,
-        );
-        let r = _mm512_maskz_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
-            0b01010101010101010101010101010101,
+    unsafe fn test_mm512_maskz_cmul_round_pch() {
+        let a = _mm512_set1_pch((0.0, 1.0));
+        let b = _mm512_set1_pch((0.0, -1.0));
+        let r = _mm512_maskz_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+            0b0101010101010101,
             a,
             b,
         );
-        let e = _mm512_set_ph(
-            0., 62., 0., 116., 0., 162., 0., 200., 0., 230., 0., 252., 0., 266., 0., 272., 0.,
-            270., 0., 260., 0., 242., 0., 216., 0., 182., 0., 140., 0., 90., 0., 32.,
+        let e = _mm512_setr_ph(
+            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
+            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
         );
         assert_eq_m512h(r, e);
     }

     #[simd_test(enable = "avx512fp16")]
-    unsafe fn test_mm_mul_round_sh() {
-        let a = _mm_set_sh(1.0);
-        let b = _mm_set_sh(2.0);
-        let r = _mm_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
-        let e = _mm_set_sh(2.0);
+    unsafe fn test_mm_cmul_sch() {
+        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
+        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
+        let r = _mm_cmul_sch(a, b);
+        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
         assert_eq_m128h(r, e);
     }

     #[simd_test(enable = "avx512fp16")]
-    unsafe fn test_mm_mask_mul_round_sh() {
-        let a = _mm_set_sh(1.0);
-        let b = _mm_set_sh(2.0);
-        let src = _mm_set_sh(4.0);
-        let r = _mm_mask_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
-            src, 0, a, b,
-        );
-        let e = _mm_set_sh(4.0);
-        assert_eq_m128h(r, e);
-        let r = _mm_mask_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
-            src, 1, a, b,
-        );
-        let e = _mm_set_sh(2.0);
+    unsafe fn test_mm_mask_cmul_sch() {
+        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
+        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
+        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
+        let r = _mm_mask_cmul_sch(src, 0, a, b);
+        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
         assert_eq_m128h(r, e);
     }

     #[simd_test(enable = "avx512fp16")]
-    unsafe fn test_mm_maskz_mul_round_sh() {
-        let a = _mm_set_sh(1.0);
-        let b = _mm_set_sh(2.0);
-        let r =
-            _mm_maskz_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
-        let e = _mm_set_sh(0.0);
-        assert_eq_m128h(r, e);
-        let r =
-            _mm_maskz_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
-        let e = _mm_set_sh(2.0);
+    unsafe fn test_mm_maskz_cmul_sch() {
+        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
+        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
+        let r = _mm_maskz_cmul_sch(0, a, b);
+        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
         assert_eq_m128h(r, e);
     }

     #[simd_test(enable = "avx512fp16")]
-    unsafe fn test_mm_mul_sh() {
-        let a = _mm_set_sh(1.0);
-        let b = _mm_set_sh(2.0);
-        let r = _mm_mul_sh(a, b);
-        let e = _mm_set_sh(2.0);
+    unsafe fn test_mm_cmul_round_sch() {
+        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
+        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
+        let r = _mm_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
+        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
         assert_eq_m128h(r, e);
     }

     #[simd_test(enable = "avx512fp16")]
-    unsafe fn test_mm_mask_mul_sh() {
-        let a = _mm_set_sh(1.0);
-        let b = _mm_set_sh(2.0);
-        let src = _mm_set_sh(4.0);
-        let r = _mm_mask_mul_sh(src, 0, a, b);
-        let e = _mm_set_sh(4.0);
-        assert_eq_m128h(r, e);
-        let r = _mm_mask_mul_sh(src, 1, a, b);
-        let e = _mm_set_sh(2.0);
+    unsafe fn test_mm_mask_cmul_round_sch() {
+        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
+        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
+        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
+        let r = _mm_mask_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+            src, 0, a, b,
+        );
+        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
         assert_eq_m128h(r, e);
     }

     #[simd_test(enable = "avx512fp16")]
-    unsafe fn test_mm_maskz_mul_sh() {
-        let a = _mm_set_sh(1.0);
-        let b = _mm_set_sh(2.0);
-        let r = _mm_maskz_mul_sh(0, a, b);
-        let e = _mm_set_sh(0.0);
-        assert_eq_m128h(r, e);
-        let r = _mm_maskz_mul_sh(1, a, b);
-        let e = _mm_set_sh(2.0);
+    unsafe fn test_mm_maskz_cmul_round_sch() {
+        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
+        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
+        let r =
+            _mm_maskz_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
+        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
         assert_eq_m128h(r, e);
     }

     #[simd_test(enable = "avx512fp16,avx512vl")]
-    unsafe fn test_mm_div_ph() {
-        let a = _mm_set1_ph(1.0);
-        let b = _mm_set1_ph(2.0);
-        let r = _mm_div_ph(a, b);
-        let e = _mm_set1_ph(0.5);
+    unsafe fn test_mm_fcmul_pch() {
+        let a = _mm_set1_pch((0.0, 1.0));
+        let b = _mm_set1_pch((0.0, -1.0));
+        let r = _mm_fcmul_pch(a, b);
+        let e = _mm_set1_pch((-1.0, 0.0));
         assert_eq_m128h(r, e);
     }

     #[simd_test(enable = "avx512fp16,avx512vl")]
-    unsafe fn test_mm_mask_div_ph() {
-        let a = _mm_set1_ph(1.0);
-        let b = _mm_set1_ph(2.0);
-        let src = _mm_set_ph(4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0);
-        let r = _mm_mask_div_ph(src, 0b01010101, a, b);
-        let e = _mm_set_ph(4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5);
+    unsafe fn test_mm_mask_fcmul_pch() {
+        let a = _mm_set1_pch((0.0, 1.0));
+        let b = _mm_set1_pch((0.0, -1.0));
+        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
+        let r = _mm_mask_fcmul_pch(src, 0b0101, a, b);
+        let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0);
         assert_eq_m128h(r, e);
     }

     #[simd_test(enable = "avx512fp16,avx512vl")]
-    unsafe fn test_mm_maskz_div_ph() {
-        let a = _mm_set1_ph(1.0);
-        let b = _mm_set1_ph(2.0);
-        let r = _mm_maskz_div_ph(0b01010101, a, b);
-        let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5);
+    unsafe fn test_mm_maskz_fcmul_pch() {
+        let a = _mm_set1_pch((0.0, 1.0));
+        let b = _mm_set1_pch((0.0, -1.0));
+        let r = _mm_maskz_fcmul_pch(0b0101, a, b);
+        let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0);
         assert_eq_m128h(r, e);
     }

     #[simd_test(enable = "avx512fp16,avx512vl")]
-    unsafe fn test_mm256_div_ph() {
-        let a = _mm256_set1_ph(1.0);
-        let b = _mm256_set1_ph(2.0);
-        let r = _mm256_div_ph(a, b);
-        let e = _mm256_set1_ph(0.5);
+    unsafe fn test_mm256_fcmul_pch() {
+        let a = _mm256_set1_pch((0.0, 1.0));
+        let b = _mm256_set1_pch((0.0, -1.0));
+        let r = _mm256_fcmul_pch(a, b);
+        let e = _mm256_set1_pch((-1.0, 0.0));
         assert_eq_m256h(r, e);
     }

     #[simd_test(enable = "avx512fp16,avx512vl")]
-    unsafe fn test_mm256_mask_div_ph() {
-        let a = _mm256_set1_ph(1.0);
-        let b = _mm256_set1_ph(2.0);
-        let src = _mm256_set_ph(
-            4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
-            19.0,
+    unsafe fn test_mm256_mask_fcmul_pch() {
+        let a = _mm256_set1_pch((0.0, 1.0));
+        let b = _mm256_set1_pch((0.0, -1.0));
+        let src = _mm256_setr_ph(
+            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
         );
-        let r = _mm256_mask_div_ph(src, 0b0101010101010101, a, b);
-        let e = _mm256_set_ph(
-            4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5,
+        let r = _mm256_mask_fcmul_pch(src, 0b01010101, a, b);
+        let e = _mm256_setr_ph(
+            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
         );
         assert_eq_m256h(r, e);
     }

     #[simd_test(enable = "avx512fp16,avx512vl")]
-    unsafe fn test_mm256_maskz_div_ph() {
-        let a = _mm256_set1_ph(1.0);
-        let b = _mm256_set1_ph(2.0);
-        let r = _mm256_maskz_div_ph(0b0101010101010101, a, b);
-        let e = _mm256_set_ph(
-            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
+    unsafe fn test_mm256_maskz_fcmul_pch() {
+        let a = _mm256_set1_pch((0.0, 1.0));
+        let b = _mm256_set1_pch((0.0, -1.0));
+        let r = _mm256_maskz_fcmul_pch(0b01010101, a, b);
+        let e = _mm256_setr_ph(
+            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
         );
         assert_eq_m256h(r, e);
     }

     #[simd_test(enable = "avx512fp16")]
-    unsafe fn test_mm512_div_ph() {
-        let a = _mm512_set1_ph(1.0);
-        let b = _mm512_set1_ph(2.0);
-        let r = _mm512_div_ph(a, b);
-        let e = _mm512_set1_ph(0.5);
+    unsafe fn test_mm512_fcmul_pch() {
+        let a = _mm512_set1_pch((0.0, 1.0));
+        let b = _mm512_set1_pch((0.0, -1.0));
+        let r = _mm512_fcmul_pch(a, b);
+        let e = _mm512_set1_pch((-1.0, 0.0));
         assert_eq_m512h(r, e);
     }

     #[simd_test(enable = "avx512fp16")]
-    unsafe fn test_mm512_mask_div_ph() {
-        let a = _mm512_set1_ph(1.0);
-        let b = _mm512_set1_ph(2.0);
-        let src = _mm512_set_ph(
-            4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
-            19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0,
-            33.0, 34.0, 35.0,
-        );
-        let r = _mm512_mask_div_ph(src, 0b01010101010101010101010101010101, a, b);
-        let e = _mm512_set_ph(
-            4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5,
-            20.0, 0.5, 22.0, 0.5, 24.0, 0.5, 26.0, 0.5, 28.0, 0.5, 30.0, 0.5, 32.0, 0.5, 34.0, 0.5,
+    unsafe fn test_mm512_mask_fcmul_pch() {
+        let a = _mm512_set1_pch((0.0, 1.0));
+        let b = _mm512_set1_pch((0.0, -1.0));
+        let src = _mm512_setr_ph(
+            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
+            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
+            32.0, 33.0,
+        );
+        let r = _mm512_mask_fcmul_pch(src, 0b0101010101010101, a, b);
+        let e = _mm512_setr_ph(
+            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
+            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
+            33.0,
        );
         assert_eq_m512h(r, e);
     }

     #[simd_test(enable = "avx512fp16")]
-    unsafe fn test_mm512_maskz_div_ph() {
-        let a = _mm512_set1_ph(1.0);
-        let b = _mm512_set1_ph(2.0);
-        let r = _mm512_maskz_div_ph(0b01010101010101010101010101010101, a, b);
-        let e = _mm512_set_ph(
-            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
-            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
+    unsafe fn test_mm512_maskz_fcmul_pch() {
+        let a = _mm512_set1_pch((0.0, 1.0));
+        let b = _mm512_set1_pch((0.0, -1.0));
+        let r = _mm512_maskz_fcmul_pch(0b0101010101010101, a, b);
+        let e = _mm512_setr_ph(
+            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
+            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
         );
         assert_eq_m512h(r, e);
     }

     #[simd_test(enable = "avx512fp16")]
-    unsafe fn test_mm512_div_round_ph() {
-        let a = _mm512_set1_ph(1.0);
-        let b = _mm512_set1_ph(2.0);
-        let r = _mm512_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
-        let e = _mm512_set1_ph(0.5);
+    unsafe fn test_mm512_fcmul_round_pch() {
+        let a = _mm512_set1_pch((0.0, 1.0));
+        let b = _mm512_set1_pch((0.0, -1.0));
+        let r = _mm512_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
+        let e = _mm512_set1_pch((-1.0, 0.0));
         assert_eq_m512h(r, e);
     }

     #[simd_test(enable = "avx512fp16")]
-    unsafe fn test_mm512_mask_div_round_ph() {
-        let a = _mm512_set1_ph(1.0);
-        let b = _mm512_set1_ph(2.0);
-        let src = _mm512_set_ph(
-            4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
-            19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0,
-            33.0, 34.0, 35.0,
-        );
-        let r = _mm512_mask_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+    unsafe fn test_mm512_mask_fcmul_round_pch() {
+        let a = _mm512_set1_pch((0.0, 1.0));
+        let b = _mm512_set1_pch((0.0, -1.0));
+        let src = _mm512_setr_ph(
+            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
+            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
+            32.0, 33.0,
+        );
+        let r = _mm512_mask_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
             src,
-            0b01010101010101010101010101010101,
+            0b0101010101010101,
             a,
             b,
         );
-        let e = _mm512_set_ph(
-            4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5,
-            20.0, 0.5, 22.0, 0.5, 24.0, 0.5, 26.0, 0.5, 28.0, 0.5, 30.0, 0.5, 32.0, 0.5, 34.0, 0.5,
+        let e = _mm512_setr_ph(
+            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
+            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
+            33.0,
         );
         assert_eq_m512h(r, e);
     }

     #[simd_test(enable = "avx512fp16")]
-    unsafe fn test_mm512_maskz_div_round_ph() {
-        let a = _mm512_set1_ph(1.0);
-        let b = _mm512_set1_ph(2.0);
-        let r = _mm512_maskz_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
-            0b01010101010101010101010101010101,
+    unsafe fn test_mm512_maskz_fcmul_round_pch() {
+        let a = _mm512_set1_pch((0.0, 1.0));
+        let b = _mm512_set1_pch((0.0, -1.0));
+        let r = _mm512_maskz_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+            0b0101010101010101,
             a,
             b,
         );
-        let e = _mm512_set_ph(
-            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
-            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
+        let e = _mm512_setr_ph(
+            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
+            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
         );
         assert_eq_m512h(r, e);
     }

     #[simd_test(enable = "avx512fp16")]
-    unsafe fn test_mm_div_round_sh() {
-        let a = _mm_set_sh(1.0);
-        let b = _mm_set_sh(2.0);
-        let r = _mm_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
-        let e = _mm_set_sh(0.5);
+    unsafe fn test_mm_fcmul_sch() {
+        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
+        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
+        let r = _mm_fcmul_sch(a, b);
+        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
         assert_eq_m128h(r, e);
     }

     #[simd_test(enable = "avx512fp16")]
-    unsafe fn test_mm_mask_div_round_sh() {
-        let a = _mm_set_sh(1.0);
-        let b = _mm_set_sh(2.0);
-        let src = _mm_set_sh(4.0);
-        let r = _mm_mask_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
-            src, 0, a, b,
-        );
-        let e = _mm_set_sh(4.0);
-        assert_eq_m128h(r, e);
-        let r = _mm_mask_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
-            src, 1, a, b,
-        );
-        let e = _mm_set_sh(0.5);
+    unsafe fn test_mm_mask_fcmul_sch() {
+        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
+        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
+        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
+        let r = _mm_mask_fcmul_sch(src, 0, a, b);
+        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
         assert_eq_m128h(r, e);
     }

     #[simd_test(enable = "avx512fp16")]
-    unsafe fn test_mm_maskz_div_round_sh() {
-        let a = _mm_set_sh(1.0);
-        let b = _mm_set_sh(2.0);
-        let r =
-            _mm_maskz_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
-        let e = _mm_set_sh(0.0);
-        assert_eq_m128h(r, e);
-        let r =
-            _mm_maskz_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
-        let e = _mm_set_sh(0.5);
+    unsafe fn test_mm_maskz_fcmul_sch() {
+        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
+        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
+        let r = _mm_maskz_fcmul_sch(0, a, b);
+        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
         assert_eq_m128h(r, e);
     }

     #[simd_test(enable = "avx512fp16")]
-    unsafe fn test_mm_div_sh() {
-        let a = _mm_set_sh(1.0);
-        let b = _mm_set_sh(2.0);
-        let r = _mm_div_sh(a, b);
-        let e = _mm_set_sh(0.5);
+    unsafe fn test_mm_fcmul_round_sch() {
+        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
+        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
+        let r = _mm_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
+        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
         assert_eq_m128h(r, e);
     }

     #[simd_test(enable = "avx512fp16")]
-    unsafe fn test_mm_mask_div_sh() {
-        let a = _mm_set_sh(1.0);
-        let b = _mm_set_sh(2.0);
-        let src = _mm_set_sh(4.0);
-        let r = _mm_mask_div_sh(src, 0, a, b);
-        let e = _mm_set_sh(4.0);
-        assert_eq_m128h(r, e);
-        let r = _mm_mask_div_sh(src, 1, a, b);
-        let e = _mm_set_sh(0.5);
+    unsafe fn test_mm_mask_fcmul_round_sch() {
+        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
+        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
+        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
+        let r = _mm_mask_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+            src, 0, a, b,
+        );
+        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
         assert_eq_m128h(r, e);
     }

     #[simd_test(enable = "avx512fp16")]
-    unsafe fn test_mm_maskz_div_sh() {
-        let a = _mm_set_sh(1.0);
-        let b = _mm_set_sh(2.0);
-        let r = _mm_maskz_div_sh(0, a, b);
-        let e = _mm_set_sh(0.0);
-        assert_eq_m128h(r, e);
-        let r = _mm_maskz_div_sh(1, a, b);
-        let e = _mm_set_sh(0.5);
+    unsafe fn test_mm_maskz_fcmul_round_sch() {
+        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
+        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
+        let r =
+            _mm_maskz_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
+        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
         assert_eq_m128h(r, e);
     }
 }
diff --git a/crates/stdarch-verify/tests/x86-intel.rs b/crates/stdarch-verify/tests/x86-intel.rs
index fadaa6a4b1..511223a8da 100644
--- a/crates/stdarch-verify/tests/x86-intel.rs
+++ b/crates/stdarch-verify/tests/x86-intel.rs
@@ -823,6 +823,8 @@ fn equate(
         (&Type::MM_MANTISSA_SIGN_ENUM, "_MM_MANTISSA_SIGN_ENUM") => {}
         (&Type::MM_PERM_ENUM, "_MM_PERM_ENUM") => {}

+        (&Type::Tuple, "_Float16 _Complex") if intrinsic.contains("_set1_pch") => {}
+
         // This is a macro (?) in C which seems to mutate its arguments, but
         // that means that we're taking pointers to arguments in rust
         // as we're not exposing it as a macro.