Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add AVX512-IFMA intrinsics. #676

Merged
merged 6 commits into from
Feb 11, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions crates/core_arch/src/simd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -191,3 +191,7 @@ simd_ty!(i32x16[i32]:
i32, i32, i32, i32, i32, i32, i32, i32
| x0, x1, x2, x3, x4, x5, x6, x7,
x8, x9, x10, x11, x12, x13, x14, x15);

simd_ty!(i64x8[i64]:
i64, i64, i64, i64, i64, i64, i64, i64
| x0, x1, x2, x3, x4, x5, x6, x7);
7 changes: 7 additions & 0 deletions crates/core_arch/src/x86/avx512f.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,13 @@ pub unsafe fn _mm512_setr_epi32(
mem::transmute(r)
}

/// Broadcast 64-bit integer `a` to all elements of `dst`.
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_set1_epi64(a: i64) -> __m512i {
mem::transmute(i64x8::splat(a))
}

#[cfg(test)]
mod tests {
use std;
Expand Down
196 changes: 196 additions & 0 deletions crates/core_arch/src/x86/avx512ifma.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
use core_arch::x86::*;

#[cfg(test)]
use stdsimd_test::assert_instr;

/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
/// unsigned integer from the intermediate result with the
/// corresponding unsigned 64-bit integer in `a`, and store the
/// results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512IFMA52&expand=3488)
#[inline]
#[target_feature(enable = "avx512ifma")]
#[cfg_attr(test, assert_instr(vpmadd52huq))]
pub unsafe fn _mm512_madd52hi_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
vpmadd52huq_512(a, b, c)
}

/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
/// unsigned integer from the intermediate result with the
/// corresponding unsigned 64-bit integer in `a`, and store the
/// results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=3497&avx512techs=AVX512IFMA52)
#[inline]
#[target_feature(enable = "avx512ifma")]
#[cfg_attr(test, assert_instr(vpmadd52luq))]
pub unsafe fn _mm512_madd52lo_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
vpmadd52luq_512(a, b, c)
}

/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
/// unsigned integer from the intermediate result with the
/// corresponding unsigned 64-bit integer in `a`, and store the
/// results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL&expand=3485)
#[inline]
#[target_feature(enable = "avx512ifma,avx512vl")]
#[cfg_attr(test, assert_instr(vpmadd52huq))]
pub unsafe fn _mm256_madd52hi_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
vpmadd52huq_256(a, b, c)
}

/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
/// unsigned integer from the intermediate result with the
/// corresponding unsigned 64-bit integer in `a`, and store the
/// results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL&expand=3494)
#[inline]
#[target_feature(enable = "avx512ifma,avx512vl")]
#[cfg_attr(test, assert_instr(vpmadd52luq))]
pub unsafe fn _mm256_madd52lo_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
vpmadd52luq_256(a, b, c)
}

/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
/// unsigned integer from the intermediate result with the
/// corresponding unsigned 64-bit integer in `a`, and store the
/// results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=3488,3482&text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL)
#[inline]
#[target_feature(enable = "avx512ifma,avx512vl")]
#[cfg_attr(test, assert_instr(vpmadd52huq))]
pub unsafe fn _mm_madd52hi_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
vpmadd52huq_128(a, b, c)
}

/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
/// unsigned integer from the intermediate result with the
/// corresponding unsigned 64-bit integer in `a`, and store the
/// results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=3488,3491&text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL)
#[inline]
#[target_feature(enable = "avx512ifma,avx512vl")]
#[cfg_attr(test, assert_instr(vpmadd52luq))]
pub unsafe fn _mm_madd52lo_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
vpmadd52luq_128(a, b, c)
}

#[allow(improper_ctypes)]
extern "C" {
#[link_name = "llvm.x86.avx512.vpmadd52l.uq.128"]
fn vpmadd52luq_128(z: __m128i, x: __m128i, y: __m128i) -> __m128i;
#[link_name = "llvm.x86.avx512.vpmadd52h.uq.128"]
fn vpmadd52huq_128(z: __m128i, x: __m128i, y: __m128i) -> __m128i;
#[link_name = "llvm.x86.avx512.vpmadd52l.uq.256"]
fn vpmadd52luq_256(z: __m256i, x: __m256i, y: __m256i) -> __m256i;
#[link_name = "llvm.x86.avx512.vpmadd52h.uq.256"]
fn vpmadd52huq_256(z: __m256i, x: __m256i, y: __m256i) -> __m256i;
#[link_name = "llvm.x86.avx512.vpmadd52l.uq.512"]
fn vpmadd52luq_512(z: __m512i, x: __m512i, y: __m512i) -> __m512i;
#[link_name = "llvm.x86.avx512.vpmadd52h.uq.512"]
fn vpmadd52huq_512(z: __m512i, x: __m512i, y: __m512i) -> __m512i;
}

#[cfg(test)]
mod tests {
use std;
use stdsimd_test::simd_test;

use core_arch::x86::*;

#[simd_test(enable = "avx512ifma")]
unsafe fn test_mm512_madd52hi_epu64() {
let mut a = _mm512_set1_epi64(10 << 40);
let b = _mm512_set1_epi64((11 << 40) + 4);
let c = _mm512_set1_epi64((12 << 40) + 3);

a = _mm512_madd52hi_epu64(a, b, c);

// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
let expected = _mm512_set1_epi64(11030549757952);

assert_eq_m512i(a, expected);
}

#[simd_test(enable = "avx512ifma")]
unsafe fn test_mm512_madd52lo_epu64() {
let mut a = _mm512_set1_epi64(10 << 40);
let b = _mm512_set1_epi64((11 << 40) + 4);
let c = _mm512_set1_epi64((12 << 40) + 3);

a = _mm512_madd52lo_epu64(a, b, c);

// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
let expected = _mm512_set1_epi64(100055558127628);

assert_eq_m512i(a, expected);
}

#[simd_test(enable = "avx512ifma,avx512vl")]
unsafe fn test_mm256_madd52hi_epu64() {
let mut a = _mm256_set1_epi64x(10 << 40);
let b = _mm256_set1_epi64x((11 << 40) + 4);
let c = _mm256_set1_epi64x((12 << 40) + 3);

a = _mm256_madd52hi_epu64(a, b, c);

// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
let expected = _mm256_set1_epi64x(11030549757952);

assert_eq_m256i(a, expected);
}

#[simd_test(enable = "avx512ifma,avx512vl")]
unsafe fn test_mm256_madd52lo_epu64() {
let mut a = _mm256_set1_epi64x(10 << 40);
let b = _mm256_set1_epi64x((11 << 40) + 4);
let c = _mm256_set1_epi64x((12 << 40) + 3);

a = _mm256_madd52lo_epu64(a, b, c);

// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
let expected = _mm256_set1_epi64x(100055558127628);

assert_eq_m256i(a, expected);
}

#[simd_test(enable = "avx512ifma,avx512vl")]
unsafe fn test_mm_madd52hi_epu64() {
let mut a = _mm_set1_epi64x(10 << 40);
let b = _mm_set1_epi64x((11 << 40) + 4);
let c = _mm_set1_epi64x((12 << 40) + 3);

a = _mm_madd52hi_epu64(a, b, c);

// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
let expected = _mm_set1_epi64x(11030549757952);

assert_eq_m128i(a, expected);
}

#[simd_test(enable = "avx512ifma,avx512vl")]
unsafe fn test_mm_madd52lo_epu64() {
let mut a = _mm_set1_epi64x(10 << 40);
let b = _mm_set1_epi64x((11 << 40) + 4);
let c = _mm_set1_epi64x((12 << 40) + 3);

a = _mm_madd52hi_epu64(a, b, c);

// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
let expected = _mm_set1_epi64x(11030549757952);

assert_eq_m128i(a, expected);
}
}
3 changes: 3 additions & 0 deletions crates/core_arch/src/x86/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -560,3 +560,6 @@ pub unsafe fn ud2() -> ! {

mod avx512f;
pub use self::avx512f::*;

mod avx512ifma;
pub use self::avx512ifma::*;
16 changes: 13 additions & 3 deletions crates/stdsimd-verify/tests/x86-intel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -273,15 +273,25 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> {
.flat_map(|c| c.to_lowercase())
.collect::<String>();

// The XML file names IFMA as "avx512ifma52", while Rust calls
// it "avx512ifma". Fix this mismatch by replacing the Intel
// name with the Rust name.
let fixup_cpuid = |cpuid: String| match cpuid.as_ref() {
"avx512ifma52" => String::from("avx512ifma"),
_ => cpuid,
};
let fixed_cpuid = fixup_cpuid(cpuid);

let rust_feature = rust
.target_feature
.expect(&format!("no target feature listed for {}", rust.name));
if rust_feature.contains(&cpuid) {

if rust_feature.contains(&fixed_cpuid) {
continue;
}
bail!(
"intel cpuid `{}` not in `{}` for {}",
cpuid,
fixed_cpuid,
rust_feature,
rust.name
)
Expand Down Expand Up @@ -359,7 +369,7 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> {
// Apparently all of clang/msvc/gcc accept these intrinsics on
// 32-bit, so let's do the same
"_mm_set_epi64x" | "_mm_set1_epi64x" | "_mm256_set_epi64x" | "_mm256_setr_epi64x"
| "_mm256_set1_epi64x" => true,
| "_mm256_set1_epi64x" | "_mm512_set1_epi64" => true,

// These return a 64-bit argument but they're assembled from other
// 32-bit registers, so these work on 32-bit just fine. See #308 for
Expand Down