Redundant rounding in AVX512 exp d8? #285

Closed
chriselrod opened this issue Feb 23, 2020 · 19 comments


@chriselrod

chriselrod commented Feb 23, 2020

I haven't looked at the source code, but compiling with -DSLEEF_ENABLE_LLVM_BITCODE=TRUE let me look at some of the LLVM bitcode. Look at the first few lines (%3 through %8):

define dso_local <8 x double> @Sleef_expd8_u10avx512f(<8 x double> %0) local_unnamed_addr #1 {
  %2 = fmul <8 x double> %0, <double 0x3FF71547652B82FE, double 0x3FF71547652B82FE, double 0x3FF71547652B82FE, double 0x3FF71547652B82FE, double 0x3FF71547652B82FE, double 0x3FF71547652B82FE, double 0x3FF71547652B82FE, double 0x3FF71547652B82FE>
  %3 = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = tail call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %3, i32 8) #13
  %6 = tail call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %4, i32 8) #13
  %7 = shufflevector <4 x double> %6, <4 x double> %5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %8 = tail call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> %7, <8 x i32> zeroinitializer, i8 -1, i32 8) #13
  %9 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %7, <8 x double> <double 0xBFE62E42FEFA3000, double 0xBFE62E42FEFA3000, double 0xBFE62E42FEFA3000, double 0xBFE62E42FEFA3000, double 0xBFE62E42FEFA3000, double 0xBFE62E42FEFA3000, double 0xBFE62E42FEFA3000, double 0xBFE62E42FEFA3000>, <8 x double> %0) #13
  %10 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %7, <8 x double> <double 0xBD53DE6AF278ECE6, double 0xBD53DE6AF278ECE6, double 0xBD53DE6AF278ECE6, double 0xBD53DE6AF278ECE6, double 0xBD53DE6AF278ECE6, double 0xBD53DE6AF278ECE6, double 0xBD53DE6AF278ECE6, double 0xBD53DE6AF278ECE6>, <8 x double> %9) #13
  %11 = fmul <8 x double> %10, %10
  %12 = fmul <8 x double> %11, %11
  %13 = fmul <8 x double> %12, %12
  %14 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %10, <8 x double> <double 0x3E21E0C670AFFF06, double 0x3E21E0C670AFFF06, double 0x3E21E0C670AFFF06, double 0x3E21E0C670AFFF06, double 0x3E21E0C670AFFF06, double 0x3E21E0C670AFFF06, double 0x3E21E0C670AFFF06, double 0x3E21E0C670AFFF06>, <8 x double> <double 0x3E5AF6C36F75740C, double 0x3E5AF6C36F75740C, double 0x3E5AF6C36F75740C, double 0x3E5AF6C36F75740C, double 0x3E5AF6C36F75740C, double 0x3E5AF6C36F75740C, double 0x3E5AF6C36F75740C, double 0x3E5AF6C36F75740C>) #13
  %15 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %10, <8 x double> <double 0x3E927E5D38A23654, double 0x3E927E5D38A23654, double 0x3E927E5D38A23654, double 0x3E927E5D38A23654, double 0x3E927E5D38A23654, double 0x3E927E5D38A23654, double 0x3E927E5D38A23654, double 0x3E927E5D38A23654>, <8 x double> <double 0x3EC71DDEF633FB47, double 0x3EC71DDEF633FB47, double 0x3EC71DDEF633FB47, double 0x3EC71DDEF633FB47, double 0x3EC71DDEF633FB47, double 0x3EC71DDEF633FB47, double 0x3EC71DDEF633FB47, double 0x3EC71DDEF633FB47>) #13
  %16 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %10, <8 x double> <double 0x3EFA01A0127F883A, double 0x3EFA01A0127F883A, double 0x3EFA01A0127F883A, double 0x3EFA01A0127F883A, double 0x3EFA01A0127F883A, double 0x3EFA01A0127F883A, double 0x3EFA01A0127F883A, double 0x3EFA01A0127F883A>, <8 x double> <double 0x3F2A01A01B4421FD, double 0x3F2A01A01B4421FD, double 0x3F2A01A01B4421FD, double 0x3F2A01A01B4421FD, double 0x3F2A01A01B4421FD, double 0x3F2A01A01B4421FD, double 0x3F2A01A01B4421FD, double 0x3F2A01A01B4421FD>) #13
  %17 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %11, <8 x double> %15, <8 x double> %16) #13
  %18 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %10, <8 x double> <double 0x3F56C16C16C3396B, double 0x3F56C16C16C3396B, double 0x3F56C16C16C3396B, double 0x3F56C16C16C3396B, double 0x3F56C16C16C3396B, double 0x3F56C16C16C3396B, double 0x3F56C16C16C3396B, double 0x3F56C16C16C3396B>, <8 x double> <double 0x3F8111111110E7A5, double 0x3F8111111110E7A5, double 0x3F8111111110E7A5, double 0x3F8111111110E7A5, double 0x3F8111111110E7A5, double 0x3F8111111110E7A5, double 0x3F8111111110E7A5, double 0x3F8111111110E7A5>) #13
  %19 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %10, <8 x double> <double 0x3FA55555555554F9, double 0x3FA55555555554F9, double 0x3FA55555555554F9, double 0x3FA55555555554F9, double 0x3FA55555555554F9, double 0x3FA55555555554F9, double 0x3FA55555555554F9, double 0x3FA55555555554F9>, <8 x double> <double 0x3FC555555555555E, double 0x3FC555555555555E, double 0x3FC555555555555E, double 0x3FC555555555555E, double 0x3FC555555555555E, double 0x3FC555555555555E, double 0x3FC555555555555E, double 0x3FC555555555555E>) #13
  %20 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %11, <8 x double> %18, <8 x double> %19) #13
  %21 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %12, <8 x double> %17, <8 x double> %20) #13
  %22 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %13, <8 x double> %14, <8 x double> %21) #13
  %23 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %22, <8 x double> %10, <8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>) #13
  %24 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %23, <8 x double> %10, <8 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>) #13
  %25 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %24, <8 x double> %10, <8 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>) #13
  %26 = ashr <8 x i32> %8, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %27 = add nsw <8 x i32> %26, <i32 1023, i32 1023, i32 1023, i32 1023, i32 1023, i32 1023, i32 1023, i32 1023>
  %28 = bitcast <8 x i32> %27 to <4 x i64>
  %29 = shufflevector <4 x i64> %28, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %30 = bitcast <8 x i64> %29 to <16 x i32>
  %31 = shufflevector <16 x i32> %30, <16 x i32> undef, <16 x i32> <i32 undef, i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 7>
  %32 = shufflevector <16 x i32> <i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef>, <16 x i32> %31, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  %33 = shl <16 x i32> %32, <i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20>
  %34 = bitcast <16 x i32> %33 to <8 x double>
  %35 = fmul <8 x double> %25, %34
  %36 = add <8 x i32> %8, <i32 1023, i32 1023, i32 1023, i32 1023, i32 1023, i32 1023, i32 1023, i32 1023>
  %37 = sub <8 x i32> %36, %26
  %38 = bitcast <8 x i32> %37 to <4 x i64>
  %39 = shufflevector <4 x i64> %38, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %40 = bitcast <8 x i64> %39 to <16 x i32>
  %41 = shufflevector <16 x i32> %40, <16 x i32> undef, <16 x i32> <i32 undef, i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 7>
  %42 = shufflevector <16 x i32> <i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef>, <16 x i32> %41, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  %43 = shl <16 x i32> %42, <i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20>
  %44 = bitcast <16 x i32> %43 to <8 x double>
  %45 = fmul <8 x double> %35, %44
  %46 = fcmp ogt <8 x double> %0, <double 0x40862E42FE102C83, double 0x40862E42FE102C83, double 0x40862E42FE102C83, double 0x40862E42FE102C83, double 0x40862E42FE102C83, double 0x40862E42FE102C83, double 0x40862E42FE102C83, double 0x40862E42FE102C83>
  %47 = select <8 x i1> %46, <8 x double> <double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000>, <8 x double> %45
  %48 = fcmp olt <8 x double> %0, <double -1.000000e+03, double -1.000000e+03, double -1.000000e+03, double -1.000000e+03, double -1.000000e+03, double -1.000000e+03, double -1.000000e+03, double -1.000000e+03>
  %49 = select <8 x i1> %48, <8 x double> zeroinitializer, <8 x double> %47
  ret <8 x double> %49
}

It uses shufflevector to split the length-8 vector into two length-4 vectors and rounds both. Then it uses another shufflevector to recombine them, before converting to doubleword integers and rounding a second time with the same rounding mode.
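For context, the algorithm this IR implements is the standard range-reduction exp: round k = rint(x * log2(e)) once, subtract k * ln(2) in two pieces (Cody-Waite), evaluate a polynomial in the reduced argument, and scale by 2^k applied in two halves. The following scalar Python sketch uses the textbook ln(2) split rather than SLEEF's exact constants, and a Taylor polynomial stands in for SLEEF's minimax coefficients:

```python
import math

# Textbook Cody-Waite split of ln(2); SLEEF's exact constants differ slightly.
LOG2E  = 1.4426950408889634        # log2(e)
LN2_HI = 6.93145751953125e-1       # high part: trailing zero bits make k*LN2_HI exact
LN2_LO = 1.42860682030941723212e-6 # low part: ln(2) - LN2_HI

def exp_reduced(x):
    """Scalar sketch of the range-reduction exp that the IR vectorizes."""
    k = round(x * LOG2E)                  # one round-to-nearest-even, done once
    r = (x - k * LN2_HI) - k * LN2_LO     # two-step reduction, |r| <= ln(2)/2
    # Taylor series stands in for SLEEF's minimax polynomial (%14..%25 in the IR)
    p = sum(r**n / math.factorial(n) for n in range(14))
    # Scale by 2^k split into two halves, as the IR does with k>>1 and k-(k>>1)
    return p * math.ldexp(1.0, k // 2) * math.ldexp(1.0, k - k // 2)
```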
I tried modifying the code:

  %8 = tail call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> %2, <8 x i32> zeroinitializer, i8 -1, i32 8) #13

That is, I replaced the %7 with %2 above, to use the unrounded result, and then relied on the compiler to remove the now-dead code defining %3 through %7 (it would be a pain to manually renumber all the SSA values by actually deleting those lines).
Using llvmcall from Julia on this IR to define vexp and vexpv2 (the latter using the modified IR to eliminate the redundant 256-bit rounds):

julia> @benchmark vexp($x)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     7.351 ns (0.00% GC)
  median time:      7.411 ns (0.00% GC)
  mean time:        7.429 ns (0.00% GC)
  maximum time:     21.115 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     999

julia> @benchmark vexpv2($x)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     6.864 ns (0.00% GC)
  median time:      6.885 ns (0.00% GC)
  mean time:        6.894 ns (0.00% GC)
  maximum time:     17.627 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1000

This shows a sizeable performance improvement. (Note that each sample consists of "evals/sample" evaluations, for more accurate timing of fast functions.)

Answers appear to be identical:

julia> SVec(x) # prints nicer
SVec{8,Float64}<0.8603146724594573, 0.6915463820339747, 0.14366360122161304, 0.5372294767804786, 0.4625001495441958, 0.3517674577629759, 0.37599478692090504, 0.5328044070990636>

julia> SVec(vexp(x))
SVec{8,Float64}<2.363904432304266, 1.996800964137148, 1.1544956722600328, 1.7112592051122344, 1.5880393617979949, 1.4215779083721343, 1.4564395412167703, 1.7037034934910387>

julia> SVec(vexpv2(x))
SVec{8,Float64}<2.363904432304266, 1.996800964137148, 1.1544956722600328, 1.7112592051122344, 1.5880393617979949, 1.4215779083721343, 1.4564395412167703, 1.7037034934910387>

julia> SVec(y)
SVec{8,Float64}<30.74182981567884, 22.610568325702403, -23.373675873567066, 46.079961631637985, 7.4123845633532, -34.8233789346198, 17.38425314982699, 18.964492286633444>

julia> SVec(vexp(y))
SVec{8,Float64}<2.2439183627359965e13, 6.601536921097034e9, 7.062224941426471e-11, 1.0286628672771097e20, 1656.3713638772492, 7.523132095311339e-16, 3.547196236382924e7, 1.722559976384776e8>

julia> SVec(vexpv2(y))
SVec{8,Float64}<2.2439183627359965e13, 6.601536921097034e9, 7.062224941426471e-11, 1.0286628672771097e20, 1656.3713638772492, 7.523132095311339e-16, 3.547196236382924e7, 1.722559976384776e8>

If anyone happens to have Julia installed and a system with AVX512, you can run the above benchmarks via

const Vec{W,T} = NTuple{W,Core.VecElement{T}}
@inline function vexp(v::Vec{8,Float64}) # unmodified IR; vexpv2 below uses the modified version
    Base.llvmcall(("""
declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32)
declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double>, <8 x i32>, i8, i32)
declare <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>)
""","""
  %2 = fmul <8 x double> %0, <double 0x3FF71547652B82FE, double 0x3FF71547652B82FE, double 0x3FF71547652B82FE, double 0x3FF71547652B82FE, double 0x3FF71547652B82FE, double 0x3FF71547652B82FE, double 0x3FF71547652B82FE, double 0x3FF71547652B82FE>
  %3 = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = tail call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %3, i32 8) #13
  %6 = tail call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %4, i32 8) #13
  %7 = shufflevector <4 x double> %6, <4 x double> %5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %8 = tail call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> %7, <8 x i32> zeroinitializer, i8 -1, i32 8) #13
  %9 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %7, <8 x double> <double 0xBFE62E42FEFA3000, double 0xBFE62E42FEFA3000, double 0xBFE62E42FEFA3000, double 0xBFE62E42FEFA3000, double 0xBFE62E42FEFA3000, double 0xBFE62E42FEFA3000, double 0xBFE62E42FEFA3000, double 0xBFE62E42FEFA3000>, <8 x double> %0) #13
  %10 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %7, <8 x double> <double 0xBD53DE6AF278ECE6, double 0xBD53DE6AF278ECE6, double 0xBD53DE6AF278ECE6, double 0xBD53DE6AF278ECE6, double 0xBD53DE6AF278ECE6, double 0xBD53DE6AF278ECE6, double 0xBD53DE6AF278ECE6, double 0xBD53DE6AF278ECE6>, <8 x double> %9) #13
  %11 = fmul <8 x double> %10, %10
  %12 = fmul <8 x double> %11, %11
  %13 = fmul <8 x double> %12, %12
  %14 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %10, <8 x double> <double 0x3E21E0C670AFFF06, double 0x3E21E0C670AFFF06, double 0x3E21E0C670AFFF06, double 0x3E21E0C670AFFF06, double 0x3E21E0C670AFFF06, double 0x3E21E0C670AFFF06, double 0x3E21E0C670AFFF06, double 0x3E21E0C670AFFF06>, <8 x double> <double 0x3E5AF6C36F75740C, double 0x3E5AF6C36F75740C, double 0x3E5AF6C36F75740C, double 0x3E5AF6C36F75740C, double 0x3E5AF6C36F75740C, double 0x3E5AF6C36F75740C, double 0x3E5AF6C36F75740C, double 0x3E5AF6C36F75740C>) #13
  %15 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %10, <8 x double> <double 0x3E927E5D38A23654, double 0x3E927E5D38A23654, double 0x3E927E5D38A23654, double 0x3E927E5D38A23654, double 0x3E927E5D38A23654, double 0x3E927E5D38A23654, double 0x3E927E5D38A23654, double 0x3E927E5D38A23654>, <8 x double> <double 0x3EC71DDEF633FB47, double 0x3EC71DDEF633FB47, double 0x3EC71DDEF633FB47, double 0x3EC71DDEF633FB47, double 0x3EC71DDEF633FB47, double 0x3EC71DDEF633FB47, double 0x3EC71DDEF633FB47, double 0x3EC71DDEF633FB47>) #13
  %16 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %10, <8 x double> <double 0x3EFA01A0127F883A, double 0x3EFA01A0127F883A, double 0x3EFA01A0127F883A, double 0x3EFA01A0127F883A, double 0x3EFA01A0127F883A, double 0x3EFA01A0127F883A, double 0x3EFA01A0127F883A, double 0x3EFA01A0127F883A>, <8 x double> <double 0x3F2A01A01B4421FD, double 0x3F2A01A01B4421FD, double 0x3F2A01A01B4421FD, double 0x3F2A01A01B4421FD, double 0x3F2A01A01B4421FD, double 0x3F2A01A01B4421FD, double 0x3F2A01A01B4421FD, double 0x3F2A01A01B4421FD>) #13
  %17 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %11, <8 x double> %15, <8 x double> %16) #13
  %18 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %10, <8 x double> <double 0x3F56C16C16C3396B, double 0x3F56C16C16C3396B, double 0x3F56C16C16C3396B, double 0x3F56C16C16C3396B, double 0x3F56C16C16C3396B, double 0x3F56C16C16C3396B, double 0x3F56C16C16C3396B, double 0x3F56C16C16C3396B>, <8 x double> <double 0x3F8111111110E7A5, double 0x3F8111111110E7A5, double 0x3F8111111110E7A5, double 0x3F8111111110E7A5, double 0x3F8111111110E7A5, double 0x3F8111111110E7A5, double 0x3F8111111110E7A5, double 0x3F8111111110E7A5>) #13
  %19 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %10, <8 x double> <double 0x3FA55555555554F9, double 0x3FA55555555554F9, double 0x3FA55555555554F9, double 0x3FA55555555554F9, double 0x3FA55555555554F9, double 0x3FA55555555554F9, double 0x3FA55555555554F9, double 0x3FA55555555554F9>, <8 x double> <double 0x3FC555555555555E, double 0x3FC555555555555E, double 0x3FC555555555555E, double 0x3FC555555555555E, double 0x3FC555555555555E, double 0x3FC555555555555E, double 0x3FC555555555555E, double 0x3FC555555555555E>) #13
  %20 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %11, <8 x double> %18, <8 x double> %19) #13
  %21 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %12, <8 x double> %17, <8 x double> %20) #13
  %22 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %13, <8 x double> %14, <8 x double> %21) #13
  %23 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %22, <8 x double> %10, <8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>) #13
  %24 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %23, <8 x double> %10, <8 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>) #13
  %25 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %24, <8 x double> %10, <8 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>) #13
  %26 = ashr <8 x i32> %8, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %27 = add nsw <8 x i32> %26, <i32 1023, i32 1023, i32 1023, i32 1023, i32 1023, i32 1023, i32 1023, i32 1023>
  %28 = bitcast <8 x i32> %27 to <4 x i64>
  %29 = shufflevector <4 x i64> %28, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %30 = bitcast <8 x i64> %29 to <16 x i32>
  %31 = shufflevector <16 x i32> %30, <16 x i32> undef, <16 x i32> <i32 undef, i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 7>
  %32 = shufflevector <16 x i32> <i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef>, <16 x i32> %31, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  %33 = shl <16 x i32> %32, <i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20>
  %34 = bitcast <16 x i32> %33 to <8 x double>
  %35 = fmul <8 x double> %25, %34
  %36 = add <8 x i32> %8, <i32 1023, i32 1023, i32 1023, i32 1023, i32 1023, i32 1023, i32 1023, i32 1023>
  %37 = sub <8 x i32> %36, %26
  %38 = bitcast <8 x i32> %37 to <4 x i64>
  %39 = shufflevector <4 x i64> %38, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %40 = bitcast <8 x i64> %39 to <16 x i32>
  %41 = shufflevector <16 x i32> %40, <16 x i32> undef, <16 x i32> <i32 undef, i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 7>
  %42 = shufflevector <16 x i32> <i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef>, <16 x i32> %41, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  %43 = shl <16 x i32> %42, <i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20>
  %44 = bitcast <16 x i32> %43 to <8 x double>
  %45 = fmul <8 x double> %35, %44
  %46 = fcmp ogt <8 x double> %0, <double 0x40862E42FE102C83, double 0x40862E42FE102C83, double 0x40862E42FE102C83, double 0x40862E42FE102C83, double 0x40862E42FE102C83, double 0x40862E42FE102C83, double 0x40862E42FE102C83, double 0x40862E42FE102C83>
  %47 = select <8 x i1> %46, <8 x double> <double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000>, <8 x double> %45
  %48 = fcmp olt <8 x double> %0, <double -1.000000e+03, double -1.000000e+03, double -1.000000e+03, double -1.000000e+03, double -1.000000e+03, double -1.000000e+03, double -1.000000e+03, double -1.000000e+03>
  %49 = select <8 x i1> %48, <8 x double> zeroinitializer, <8 x double> %47
  ret <8 x double> %49
"""), Vec{8,Float64}, Tuple{Vec{8,Float64}}, v)
end
@inline function vexpv2(v::Vec{8,Float64}) # modified IR: %8 converts the unrounded %2
    Base.llvmcall(("""
declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32)
declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double>, <8 x i32>, i8, i32)
declare <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>)
""","""
  %2 = fmul <8 x double> %0, <double 0x3FF71547652B82FE, double 0x3FF71547652B82FE, double 0x3FF71547652B82FE, double 0x3FF71547652B82FE, double 0x3FF71547652B82FE, double 0x3FF71547652B82FE, double 0x3FF71547652B82FE, double 0x3FF71547652B82FE>
  %3 = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = tail call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %3, i32 8) #13
  %6 = tail call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %4, i32 8) #13
  %7 = shufflevector <4 x double> %6, <4 x double> %5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %8 = tail call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> %2, <8 x i32> zeroinitializer, i8 -1, i32 8) #13
  %9 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %7, <8 x double> <double 0xBFE62E42FEFA3000, double 0xBFE62E42FEFA3000, double 0xBFE62E42FEFA3000, double 0xBFE62E42FEFA3000, double 0xBFE62E42FEFA3000, double 0xBFE62E42FEFA3000, double 0xBFE62E42FEFA3000, double 0xBFE62E42FEFA3000>, <8 x double> %0) #13
  %10 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %7, <8 x double> <double 0xBD53DE6AF278ECE6, double 0xBD53DE6AF278ECE6, double 0xBD53DE6AF278ECE6, double 0xBD53DE6AF278ECE6, double 0xBD53DE6AF278ECE6, double 0xBD53DE6AF278ECE6, double 0xBD53DE6AF278ECE6, double 0xBD53DE6AF278ECE6>, <8 x double> %9) #13
  %11 = fmul <8 x double> %10, %10
  %12 = fmul <8 x double> %11, %11
  %13 = fmul <8 x double> %12, %12
  %14 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %10, <8 x double> <double 0x3E21E0C670AFFF06, double 0x3E21E0C670AFFF06, double 0x3E21E0C670AFFF06, double 0x3E21E0C670AFFF06, double 0x3E21E0C670AFFF06, double 0x3E21E0C670AFFF06, double 0x3E21E0C670AFFF06, double 0x3E21E0C670AFFF06>, <8 x double> <double 0x3E5AF6C36F75740C, double 0x3E5AF6C36F75740C, double 0x3E5AF6C36F75740C, double 0x3E5AF6C36F75740C, double 0x3E5AF6C36F75740C, double 0x3E5AF6C36F75740C, double 0x3E5AF6C36F75740C, double 0x3E5AF6C36F75740C>) #13
  %15 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %10, <8 x double> <double 0x3E927E5D38A23654, double 0x3E927E5D38A23654, double 0x3E927E5D38A23654, double 0x3E927E5D38A23654, double 0x3E927E5D38A23654, double 0x3E927E5D38A23654, double 0x3E927E5D38A23654, double 0x3E927E5D38A23654>, <8 x double> <double 0x3EC71DDEF633FB47, double 0x3EC71DDEF633FB47, double 0x3EC71DDEF633FB47, double 0x3EC71DDEF633FB47, double 0x3EC71DDEF633FB47, double 0x3EC71DDEF633FB47, double 0x3EC71DDEF633FB47, double 0x3EC71DDEF633FB47>) #13
  %16 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %10, <8 x double> <double 0x3EFA01A0127F883A, double 0x3EFA01A0127F883A, double 0x3EFA01A0127F883A, double 0x3EFA01A0127F883A, double 0x3EFA01A0127F883A, double 0x3EFA01A0127F883A, double 0x3EFA01A0127F883A, double 0x3EFA01A0127F883A>, <8 x double> <double 0x3F2A01A01B4421FD, double 0x3F2A01A01B4421FD, double 0x3F2A01A01B4421FD, double 0x3F2A01A01B4421FD, double 0x3F2A01A01B4421FD, double 0x3F2A01A01B4421FD, double 0x3F2A01A01B4421FD, double 0x3F2A01A01B4421FD>) #13
  %17 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %11, <8 x double> %15, <8 x double> %16) #13
  %18 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %10, <8 x double> <double 0x3F56C16C16C3396B, double 0x3F56C16C16C3396B, double 0x3F56C16C16C3396B, double 0x3F56C16C16C3396B, double 0x3F56C16C16C3396B, double 0x3F56C16C16C3396B, double 0x3F56C16C16C3396B, double 0x3F56C16C16C3396B>, <8 x double> <double 0x3F8111111110E7A5, double 0x3F8111111110E7A5, double 0x3F8111111110E7A5, double 0x3F8111111110E7A5, double 0x3F8111111110E7A5, double 0x3F8111111110E7A5, double 0x3F8111111110E7A5, double 0x3F8111111110E7A5>) #13
  %19 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %10, <8 x double> <double 0x3FA55555555554F9, double 0x3FA55555555554F9, double 0x3FA55555555554F9, double 0x3FA55555555554F9, double 0x3FA55555555554F9, double 0x3FA55555555554F9, double 0x3FA55555555554F9, double 0x3FA55555555554F9>, <8 x double> <double 0x3FC555555555555E, double 0x3FC555555555555E, double 0x3FC555555555555E, double 0x3FC555555555555E, double 0x3FC555555555555E, double 0x3FC555555555555E, double 0x3FC555555555555E, double 0x3FC555555555555E>) #13
  %20 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %11, <8 x double> %18, <8 x double> %19) #13
  %21 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %12, <8 x double> %17, <8 x double> %20) #13
  %22 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %13, <8 x double> %14, <8 x double> %21) #13
  %23 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %22, <8 x double> %10, <8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>) #13
  %24 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %23, <8 x double> %10, <8 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>) #13
  %25 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %24, <8 x double> %10, <8 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>) #13
  %26 = ashr <8 x i32> %8, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %27 = add nsw <8 x i32> %26, <i32 1023, i32 1023, i32 1023, i32 1023, i32 1023, i32 1023, i32 1023, i32 1023>
  %28 = bitcast <8 x i32> %27 to <4 x i64>
  %29 = shufflevector <4 x i64> %28, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %30 = bitcast <8 x i64> %29 to <16 x i32>
  %31 = shufflevector <16 x i32> %30, <16 x i32> undef, <16 x i32> <i32 undef, i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 7>
  %32 = shufflevector <16 x i32> <i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef>, <16 x i32> %31, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  %33 = shl <16 x i32> %32, <i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20>
  %34 = bitcast <16 x i32> %33 to <8 x double>
  %35 = fmul <8 x double> %25, %34
  %36 = add <8 x i32> %8, <i32 1023, i32 1023, i32 1023, i32 1023, i32 1023, i32 1023, i32 1023, i32 1023>
  %37 = sub <8 x i32> %36, %26
  %38 = bitcast <8 x i32> %37 to <4 x i64>
  %39 = shufflevector <4 x i64> %38, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %40 = bitcast <8 x i64> %39 to <16 x i32>
  %41 = shufflevector <16 x i32> %40, <16 x i32> undef, <16 x i32> <i32 undef, i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 7>
  %42 = shufflevector <16 x i32> <i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef>, <16 x i32> %41, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  %43 = shl <16 x i32> %42, <i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20>
  %44 = bitcast <16 x i32> %43 to <8 x double>
  %45 = fmul <8 x double> %35, %44
  %46 = fcmp ogt <8 x double> %0, <double 0x40862E42FE102C83, double 0x40862E42FE102C83, double 0x40862E42FE102C83, double 0x40862E42FE102C83, double 0x40862E42FE102C83, double 0x40862E42FE102C83, double 0x40862E42FE102C83, double 0x40862E42FE102C83>
  %47 = select <8 x i1> %46, <8 x double> <double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000>, <8 x double> %45
  %48 = fcmp olt <8 x double> %0, <double -1.000000e+03, double -1.000000e+03, double -1.000000e+03, double -1.000000e+03, double -1.000000e+03, double -1.000000e+03, double -1.000000e+03, double -1.000000e+03>
  %49 = select <8 x i1> %48, <8 x double> zeroinitializer, <8 x double> %47
  ret <8 x double> %49
"""), Vec{8,Float64}, Tuple{Vec{8,Float64}}, v)
end
x = ntuple(Val(8)) do w Core.VecElement(rand()) end;
using BenchmarkTools
@benchmark vexp($x)
@benchmark vexpv2($x)

FWIW, the 8-wide double-precision exp from my system's GLIBC takes only 4.2 nanoseconds.

@shibatch
Owner

shibatch commented Feb 23, 2020

This is because there is no AVX512 intrinsic function for rounding a 512-bit vector.

@chriselrod
Author

chriselrod commented Feb 23, 2020

What about the cvtpd2dq instruction you're already using (and that the compiler is also generating)?

Converts packed double-precision floating-point values in the source operand (second operand) to packed signed doubleword integers in the destination operand (first operand).

When a conversion is inexact, the value returned is rounded according to the rounding control bits in the MXCSR register or the embedded rounding control bits.

Does this not support the needed/correct rounding modes?

@shibatch
Owner

That's an instruction.
What I am asking is which intrinsic function to use.

https://software.intel.com/sites/landingpage/IntrinsicsGuide/

@chriselrod
Author

_mm512_cvt_roundpd_epi32?

@shibatch
Owner

What is needed here is rounding from double to double.

@chriselrod
Author

chriselrod commented Feb 23, 2020

Ah, of course. I'm closing this issue.
The reason for the performance improvement I saw is, I believe, better out-of-order execution: the conversion to integers (used later) does not have to wait for the doubles to be rounded.

@shibatch
Owner

Intrinsic functions for AVX-512 are weird.

@chriselrod
Author

chriselrod commented Feb 23, 2020

Because it looks like you need both integer and double versions of the rounded numbers, what about:

  1. To integer and round: _mm512_cvt_roundpd_epi32
  2. Integer to double: _mm512_cvtepi32_pd

?

This is probably more efficient than splitting the vector into two 256-bit vectors and applying the AVX instructions.
Trying this with exp2 d8, I get 5.4 ns vs the 6.2 ns I get with the original version.
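The suggested two-step conversion can be emulated per lane to see what it computes. This is a sketch, not the intrinsics themselves; note the overflow handling is only an approximation of the hardware (the real conversion returns a 0x80000000 sentinel rather than trapping):

```python
INT32_MIN, INT32_MAX = -2**31, 2**31 - 1

def round_via_int32(x):
    """Emulate _mm512_cvt_roundpd_epi32 (round-to-nearest-even) followed by
    _mm512_cvtepi32_pd for one lane: round into an int32, convert back to double."""
    k = round(x)  # Python's round() is round-half-to-even, like rounding mode 8 in the IR
    if not (INT32_MIN <= k <= INT32_MAX):
        # the real instruction would return the 0x80000000 sentinel here instead
        raise OverflowError("outside int32 range")
    return float(k)
```

Inside the int32 range, the detour through integers reproduces round-to-nearest-even exactly.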

@shibatch
Owner

That method does not cover the whole range of double-precision numbers.
SLEEF has a path to adapt to such a method of rounding, but it will make other functions slower.

@shibatch
Owner

Can you write inline assembly?
If so, please contribute an alternative rounding function.

@chriselrod
Author

For what it's worth, exp2(typemin(Int32)) == 0.0 and exp2(typemax(Int32)) == Inf, so using 32-bit integers shouldn't cause a loss in range (and exp and exp10 would reach 0/Inf even more quickly).
But AVX512DQ provides vcvtpd2qq and vcvtqq2pd, so 64-bit integers can be used in place of 32-bit ones.
That is, these can be used instead:

  1. _mm512_cvt_roundpd_epi64
  2. _mm512_cvtepi64_pd
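A scalar sketch of why the 64-bit round-trip should be safe as a double-to-double rounder: any double with magnitude at or above 2^52 is already an integer, and anything smaller rounds to an integer that fits exactly in both an int64 and a double. (This is an emulation, not the intrinsics; the safety caveat in the comment is the open question in this thread.)

```python
def round_via_int64(x):
    """Emulate rounding a double by converting to int64 and back
    (vcvtpd2qq then vcvtqq2pd), with round-to-nearest-even."""
    if abs(x) >= 2.0**52:
        # Every double this large is already an integer, so rounding is a no-op.
        # Note the real instruction would overflow past 2**63, which is the
        # "is it safe?" question being discussed here.
        return x
    return float(round(x))  # |round(x)| < 2**52 fits exactly in int64 and double
```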

Can you write inline assembly?

Only in the context of using call [returntype] asm when writing LLVM for use with Julia's llvmcall.
But I think I could modify a larger block by starting with the asm from current implementations.

What sort of format do you have in mind?

@shibatch
Owner

Ah, there is no 512-bit version of vroundpd.
I am not fully sure if converting back and forth to epi64 is safe.

@chriselrod
Author

I am not fully sure if converting back and forth to epi64 is safe.

What problem could arise for abs(x) <= 1 << 53?

Being able to strip out all the conversions between 256-bit and 512-bit vectors (and making the other necessary adjustments, like replacing the 20-bit shifts with 52-bit shifts) improves performance a little more, so it would be great if this were safe.
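The 52-bit-shift adjustment refers to the exponent trick: with 64-bit integer lanes, 2^k can be built by placing k + 1023 directly in the double's exponent field, instead of the IR's 20-bit shift into the upper half of each 32-bit lane pair. A scalar sketch of that construction:

```python
import struct

def pow2_via_bits(k):
    """Construct 2.0**k by writing k + 1023 into the exponent field of a
    double: the 52-bit shift that a 64-bit integer path enables."""
    assert -1022 <= k <= 1023, "normal-number exponent range only"
    bits = (k + 1023) << 52
    return struct.unpack("<d", struct.pack("<Q", bits))[0]
```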

@shibatch
Owner

Okay, I will try it.

@shibatch shibatch reopened this Feb 23, 2020
@shibatch
Owner

_mm512_cvt_roundpd_epi64 requires AVX512DQ.

@chriselrod
Author

chriselrod commented Feb 24, 2020

Knights Landing and Knights Mill (the defunct Xeon Phi CPUs) don't have DQ, but Skylake-X and later do.
If you want to divide AVX512 CPUs into two categories, Knights Mill/Landing vs. the others would be a reasonable divide. The Knights Mill/Landing CPUs were many-core, featuring up to 72 cores at 1.5 GHz, and some models fit in PCIe slots like a graphics card.

While lacking many of the AVX512 extensions (such as DQ), they did have ER, which all the others lack and which provides accurate reciprocal, reciprocal square root, and exp2 instructions. I don't have access to a Knights * CPU, but for those functions the native implementations are going to be hard to beat on that architecture, according to Agner Fog's instruction tables (page 347), so optimized implementations will probably want to take advantage of them.

@shibatch
Owner

So, my plan is to add support for avx512fcdvwdqvl.
It is not as easy as just replacing the helper function, so this will take time. I will take care of this after PR #283 is merged.

@shibatch
Owner

It seems that _mm512_roundscale_pd can be used for rounding.

https://stackoverflow.com/questions/50854991/instrinsic-mm512-round-ps-is-missing-for-avx512
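_mm512_roundscale_pd rounds each lane to a 2^-M grid; with M = 0 and round-to-nearest it behaves like a 512-bit vroundpd. A scalar Python emulation of the semantics (one lane; the imm8 layout here follows my reading of the docs, so treat it as a sketch rather than a definitive reference):

```python
import math

def roundscale_pd(x, imm8):
    """Emulate _mm512_roundscale_pd for one lane: round x to 2**-M fractional
    precision, where M = imm8 >> 4; the low nibble selects rounding behavior."""
    M = imm8 >> 4
    assert imm8 & 0x0F == 0, "only round-to-nearest-even handled in this sketch"
    # Scale up by 2**M (exact), round to an integer, scale back down.
    return math.ldexp(round(math.ldexp(x, M)), -M)
```

With imm8 = 0 this is a plain round-to-nearest-even, i.e. the 512-bit replacement for the missing vroundpd.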

@shibatch
Owner

Merged.
