diff --git a/bench/x86-64/peakflops_amd_zen_avx512_fma_add.ptt b/bench/x86-64/peakflops_amd_zen_avx512_fma_add.ptt
new file mode 100644
index 000000000..9a6851ffb
--- /dev/null
+++ b/bench/x86-64/peakflops_amd_zen_avx512_fma_add.ptt
@@ -0,0 +1,59 @@
+STREAMS 1
+TYPE DOUBLE
+FLOPS 45
+BYTES 8
+DESC Double-precision FMA and ADD with a single load. Designed for architectures that can perform AVX-512 FMA and AVX-512 ADD simultaneously.
+LOADS 1
+STORES 0
+INSTR_CONST 32
+INSTR_LOOP 34
+UOPS 33
+vmovapd zmm0, [rip+SCALAR]
+vmovapd zmm1, [rip+SCALAR]
+vmovapd zmm2, [rip+SCALAR]
+vmovapd zmm3, [rip+SCALAR]
+vmovapd zmm4, [rip+SCALAR]
+vmovapd zmm5, [rip+SCALAR]
+vmovapd zmm6, [rip+SCALAR]
+vmovapd zmm7, [rip+SCALAR]
+vmovapd zmm8, [rip+SCALAR]
+vmovapd zmm9, [rip+SCALAR]
+vmovapd zmm10, [rip+SCALAR]
+vmovapd zmm11, [rip+SCALAR]
+vmovapd zmm12, [rip+SCALAR]
+vmovapd zmm13, [rip+SCALAR]
+vmovapd zmm14, [rip+SCALAR]
+vmovapd zmm15, [rip+SCALAR]
+.align 32
+LOOP 8
+vmovapd zmm1, [STR0 + GPR1 * 8 ]
+vfmadd213pd zmm0, zmm0, zmm1
+vfmadd213pd zmm2, zmm2, zmm1
+vfmadd213pd zmm3, zmm3, zmm1
+vfmadd213pd zmm4, zmm4, zmm1
+vfmadd213pd zmm5, zmm5, zmm1
+vfmadd213pd zmm6, zmm6, zmm1
+vfmadd213pd zmm7, zmm7, zmm1
+vfmadd213pd zmm8, zmm8, zmm1
+vfmadd213pd zmm9, zmm9, zmm1
+vfmadd213pd zmm10, zmm10, zmm1
+vfmadd213pd zmm11, zmm11, zmm1
+vfmadd213pd zmm12, zmm12, zmm1
+vfmadd213pd zmm13, zmm13, zmm1
+vfmadd213pd zmm14, zmm14, zmm1
+vfmadd213pd zmm15, zmm15, zmm1
+vaddpd zmm0, zmm0, zmm1
+vaddpd zmm2, zmm2, zmm1
+vaddpd zmm3, zmm3, zmm1
+vaddpd zmm4, zmm4, zmm1
+vaddpd zmm5, zmm5, zmm1
+vaddpd zmm6, zmm6, zmm1
+vaddpd zmm7, zmm7, zmm1
+vaddpd zmm8, zmm8, zmm1
+vaddpd zmm9, zmm9, zmm1
+vaddpd zmm10, zmm10, zmm1
+vaddpd zmm11, zmm11, zmm1
+vaddpd zmm12, zmm12, zmm1
+vaddpd zmm13, zmm13, zmm1
+vaddpd zmm14, zmm14, zmm1
+vaddpd zmm15, zmm15, zmm1
diff --git a/bench/x86-64/peakflops_amd_zen_avx_fma_add.ptt b/bench/x86-64/peakflops_amd_zen_avx_fma_add.ptt
new file mode 100644
index 000000000..8c61fdbed
--- /dev/null
+++ b/bench/x86-64/peakflops_amd_zen_avx_fma_add.ptt
@@ -0,0 +1,59 @@
+STREAMS 1
+TYPE DOUBLE
+FLOPS 45
+BYTES 8
+DESC Double-precision FMA and ADD with a single load. Designed for architectures that can perform AVX2 FMA and AVX ADD simultaneously.
+LOADS 1
+STORES 0
+INSTR_CONST 32
+INSTR_LOOP 34
+UOPS 33
+vmovapd ymm0, [rip+SCALAR]
+vmovapd ymm1, [rip+SCALAR]
+vmovapd ymm2, [rip+SCALAR]
+vmovapd ymm3, [rip+SCALAR]
+vmovapd ymm4, [rip+SCALAR]
+vmovapd ymm5, [rip+SCALAR]
+vmovapd ymm6, [rip+SCALAR]
+vmovapd ymm7, [rip+SCALAR]
+vmovapd ymm8, [rip+SCALAR]
+vmovapd ymm9, [rip+SCALAR]
+vmovapd ymm10, [rip+SCALAR]
+vmovapd ymm11, [rip+SCALAR]
+vmovapd ymm12, [rip+SCALAR]
+vmovapd ymm13, [rip+SCALAR]
+vmovapd ymm14, [rip+SCALAR]
+vmovapd ymm15, [rip+SCALAR]
+.align 32
+LOOP 4
+vmovapd ymm1, [STR0 + GPR1 * 8 ]
+vfmadd213pd ymm0, ymm0, ymm1
+vfmadd213pd ymm2, ymm2, ymm1
+vfmadd213pd ymm3, ymm3, ymm1
+vfmadd213pd ymm4, ymm4, ymm1
+vfmadd213pd ymm5, ymm5, ymm1
+vfmadd213pd ymm6, ymm6, ymm1
+vfmadd213pd ymm7, ymm7, ymm1
+vfmadd213pd ymm8, ymm8, ymm1
+vfmadd213pd ymm9, ymm9, ymm1
+vfmadd213pd ymm10, ymm10, ymm1
+vfmadd213pd ymm11, ymm11, ymm1
+vfmadd213pd ymm12, ymm12, ymm1
+vfmadd213pd ymm13, ymm13, ymm1
+vfmadd213pd ymm14, ymm14, ymm1
+vfmadd213pd ymm15, ymm15, ymm1
+vaddpd ymm0, ymm0, ymm1
+vaddpd ymm2, ymm2, ymm1
+vaddpd ymm3, ymm3, ymm1
+vaddpd ymm4, ymm4, ymm1
+vaddpd ymm5, ymm5, ymm1
+vaddpd ymm6, ymm6, ymm1
+vaddpd ymm7, ymm7, ymm1
+vaddpd ymm8, ymm8, ymm1
+vaddpd ymm9, ymm9, ymm1
+vaddpd ymm10, ymm10, ymm1
+vaddpd ymm11, ymm11, ymm1
+vaddpd ymm12, ymm12, ymm1
+vaddpd ymm13, ymm13, ymm1
+vaddpd ymm14, ymm14, ymm1
+vaddpd ymm15, ymm15, ymm1