From f548c4d83cdded0c19ca02ca9c071d8ced9ea4fd Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 7 May 2024 15:46:15 +0200 Subject: [PATCH] AMDGPU: Add mode register use to s_getreg_b32 This should fix reading the wrong mode after setting the mode. Ideally we would have separate pseudos for the case that we know does not read mode. --- llvm/lib/Target/AMDGPU/SOPInstructions.td | 5 +- llvm/test/CodeGen/AMDGPU/fdiv.ll | 44 +++--- llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll | 127 ++++++++++++++++++ 3 files changed, 152 insertions(+), 24 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 93b7e86b5f2973..b05d0018201b85 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -1110,14 +1110,15 @@ def S_CBRANCH_I_FORK : SOPK_Pseudo < // This is hasSideEffects to allow its use in readcyclecounter selection. // FIXME: Need to truncate immediate to 16-bits. -// FIXME: Missing mode register use. Should have separate pseudos for -// known may read MODE and only read MODE. +// FIXME: Should have separate pseudos for the known-may-read-MODE and +// only-read-MODE cases. 
def S_GETREG_B32 : SOPK_Pseudo < "s_getreg_b32", (outs SReg_32:$sdst), (ins hwreg:$simm16), "$sdst, $simm16", [(set i32:$sdst, (int_amdgcn_s_getreg (i32 timm:$simm16)))]> { let hasSideEffects = 1; + let Uses = [MODE]; } let Defs = [MODE], Uses = [MODE] in { diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll index 1e5f4c08c7a005..0468175c5df50d 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll @@ -2417,12 +2417,12 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #2 { ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 @@ -2455,12 +2455,12 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #2 { ; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX7-NEXT: v_rcp_f32_e32 v3, v2 ; GFX7-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX7-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX7-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 @@ -2727,12 +2727,12 @@ define float @v_fdiv_f32_dynamic(float %x, 
float %y) #2 { ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 @@ -2765,12 +2765,12 @@ define float @v_fdiv_f32_dynamic(float %x, float %y) #2 { ; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX7-NEXT: v_rcp_f32_e32 v3, v2 ; GFX7-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX7-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX7-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 @@ -3294,12 +3294,12 @@ define float @v_fdiv_f32_dynamic_contractable_user(float %x, float %y, float %z) ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 ; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0 +; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v3, v4, 1.0 ; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v6, v4, v4 ; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v6, v5, v4 ; GFX6-FASTFMA-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-FASTFMA-NEXT: 
s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FASTFMA-NEXT: v_fma_f32 v6, v7, v4, v6 ; GFX6-FASTFMA-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 @@ -3334,12 +3334,12 @@ define float @v_fdiv_f32_dynamic_contractable_user(float %x, float %y, float %z) ; GFX7-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 ; GFX7-NEXT: v_rcp_f32_e32 v4, v3 ; GFX7-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0 +; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX7-NEXT: v_fma_f32 v6, -v3, v4, 1.0 ; GFX7-NEXT: v_fma_f32 v4, v6, v4, v4 ; GFX7-NEXT: v_mul_f32_e32 v6, v5, v4 ; GFX7-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX7-NEXT: v_fma_f32 v6, v7, v4, v6 ; GFX7-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 @@ -3868,12 +3868,12 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #2 { ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 @@ -3906,12 +3906,12 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #2 { ; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX7-NEXT: v_rcp_f32_e32 v3, v2 ; GFX7-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; 
GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX7-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX7-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 @@ -4434,12 +4434,12 @@ define float @v_fdiv_f32_dynamic__nnan_ninf_contractable_user(float %x, float %y ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 ; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0 +; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v3, v4, 1.0 ; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v6, v4, v4 ; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v6, v5, v4 ; GFX6-FASTFMA-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FASTFMA-NEXT: v_fma_f32 v6, v7, v4, v6 ; GFX6-FASTFMA-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 @@ -4474,12 +4474,12 @@ define float @v_fdiv_f32_dynamic__nnan_ninf_contractable_user(float %x, float %y ; GFX7-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 ; GFX7-NEXT: v_rcp_f32_e32 v4, v3 ; GFX7-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0 +; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX7-NEXT: v_fma_f32 v6, -v3, v4, 1.0 ; GFX7-NEXT: v_fma_f32 v4, v6, v4, v4 ; GFX7-NEXT: v_mul_f32_e32 v6, v5, v4 ; GFX7-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX7-NEXT: v_fma_f32 v6, v7, v4, v6 ; GFX7-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 @@ -5010,12 +5010,12 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #2 { ; 
GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -v0 ; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, -v0, v1, -v0 +; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 @@ -5048,12 +5048,12 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #2 { ; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -v0 ; GFX7-NEXT: v_rcp_f32_e32 v3, v2 ; GFX7-NEXT: v_div_scale_f32 v4, vcc, -v0, v1, -v0 +; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX7-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX7-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 @@ -5569,12 +5569,12 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #2 { ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], -v1, -v1, v0 ; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, -v1, v0 +; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, 
hwreg(HW_REG_MODE, 4, 2) ; GFX6-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 @@ -5607,12 +5607,12 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #2 { ; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], -v1, -v1, v0 ; GFX7-NEXT: v_rcp_f32_e32 v3, v2 ; GFX7-NEXT: v_div_scale_f32 v4, vcc, v0, -v1, v0 +; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX7-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX7-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 @@ -6113,12 +6113,12 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #2 { ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v1, s[4:5], s6, s6, v0 ; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v2, v1 ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, vcc, v0, s6, v0 +; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v4, -v1, v2, 1.0 ; GFX6-FASTFMA-NEXT: v_fma_f32 v2, v4, v2, v2 ; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v4, v3, v2 ; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v5, v2, v4 ; GFX6-FASTFMA-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX6-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 @@ -6153,12 +6153,12 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #2 { ; GFX7-NEXT: v_div_scale_f32 v1, s[4:5], s6, s6, v0 ; GFX7-NEXT: v_rcp_f32_e32 v2, v1 ; GFX7-NEXT: v_div_scale_f32 v3, vcc, v0, s6, v0 +; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX7-NEXT: 
v_fma_f32 v4, -v1, v2, 1.0 ; GFX7-NEXT: v_fma_f32 v2, v4, v2, v2 ; GFX7-NEXT: v_mul_f32_e32 v4, v3, v2 ; GFX7-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX7-NEXT: v_fma_f32 v4, v5, v2, v4 ; GFX7-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 @@ -6619,12 +6619,12 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #2 { ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, s6 ; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v2, v1 ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, vcc, s6, v0, s6 +; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v4, -v1, v2, 1.0 ; GFX6-FASTFMA-NEXT: v_fma_f32 v2, v4, v2, v2 ; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v4, v3, v2 ; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v5, v2, v4 ; GFX6-FASTFMA-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX6-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 @@ -6659,12 +6659,12 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #2 { ; GFX7-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, s6 ; GFX7-NEXT: v_rcp_f32_e32 v2, v1 ; GFX7-NEXT: v_div_scale_f32 v3, vcc, s6, v0, s6 +; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX7-NEXT: v_fma_f32 v4, -v1, v2, 1.0 ; GFX7-NEXT: v_fma_f32 v2, v4, v2, v2 ; GFX7-NEXT: v_mul_f32_e32 v4, v3, v2 ; GFX7-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX7-NEXT: v_fma_f32 v4, v5, v2, v4 ; GFX7-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 @@ -7168,12 +7168,12 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) # ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 ; 
GFX6-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 @@ -7206,12 +7206,12 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) # ; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX7-NEXT: v_rcp_f32_e32 v3, v2 ; GFX7-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX7-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX7-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 @@ -7721,12 +7721,12 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) # ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX6-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5 ; 
GFX6-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 @@ -7759,12 +7759,12 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) # ; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX7-NEXT: v_rcp_f32_e32 v3, v2 ; GFX7-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX7-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX7-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3 ; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2) ; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll index 48abc49c41ae0a..6a9c4c8d41c202 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll @@ -1661,5 +1661,132 @@ define amdgpu_gfx void @s_set_rounding_select_3_5(i32 inreg %cond) { ret void } +define amdgpu_kernel void @get_rounding_after_set_rounding_1() { +; GFX6-LABEL: get_rounding_after_set_rounding_1: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_nop 0 +; GFX6-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 4) +; GFX6-NEXT: s_lshl_b32 s2, s0, 2 +; GFX6-NEXT: s_mov_b32 s0, 0xeb24da71 +; GFX6-NEXT: s_mov_b32 s1, 0xc96f385 +; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX6-NEXT: s_and_b32 s0, s0, 15 +; GFX6-NEXT: s_add_i32 s1, s0, 4 +; GFX6-NEXT: s_cmp_lt_u32 s0, 4 +; GFX6-NEXT: s_cselect_b32 s4, s0, s1 +; GFX6-NEXT: s_mov_b32 s0, 0 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: 
s_endpgm +; +; GFX7-LABEL: get_rounding_after_set_rounding_1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_nop 0 +; GFX7-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 4) +; GFX7-NEXT: s_lshl_b32 s2, s0, 2 +; GFX7-NEXT: s_mov_b32 s0, 0xeb24da71 +; GFX7-NEXT: s_mov_b32 s1, 0xc96f385 +; GFX7-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX7-NEXT: s_and_b32 s0, s0, 15 +; GFX7-NEXT: s_add_i32 s1, s0, 4 +; GFX7-NEXT: s_cmp_lt_u32 s0, 4 +; GFX7-NEXT: s_cselect_b32 s4, s0, s1 +; GFX7-NEXT: s_mov_b32 s0, 0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: get_rounding_after_set_rounding_1: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 4) +; GFX8-NEXT: s_lshl_b32 s2, s0, 2 +; GFX8-NEXT: s_mov_b32 s0, 0xeb24da71 +; GFX8-NEXT: s_mov_b32 s1, 0xc96f385 +; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX8-NEXT: s_and_b32 s0, s0, 15 +; GFX8-NEXT: s_add_i32 s1, s0, 4 +; GFX8-NEXT: s_cmp_lt_u32 s0, 4 +; GFX8-NEXT: s_cselect_b32 s0, s0, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: get_rounding_after_set_rounding_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 4) +; GFX9-NEXT: s_lshl_b32 s2, s0, 2 +; GFX9-NEXT: s_mov_b32 s0, 0xeb24da71 +; GFX9-NEXT: s_mov_b32 s1, 0xc96f385 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX9-NEXT: s_and_b32 s0, s0, 15 +; GFX9-NEXT: s_add_i32 s1, s0, 4 +; GFX9-NEXT: s_cmp_lt_u32 s0, 4 +; GFX9-NEXT: 
s_cselect_b32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: get_rounding_after_set_rounding_1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_round_mode 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 4) +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_lshl_b32 s2, s0, 2 +; GFX10-NEXT: s_mov_b32 s0, 0xeb24da71 +; GFX10-NEXT: s_mov_b32 s1, 0xc96f385 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX10-NEXT: s_and_b32 s0, s0, 15 +; GFX10-NEXT: s_add_i32 s1, s0, 4 +; GFX10-NEXT: s_cmp_lt_u32 s0, 4 +; GFX10-NEXT: s_cselect_b32 s0, s0, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: get_rounding_after_set_rounding_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 4) +; GFX11-NEXT: s_lshl_b32 s2, s0, 2 +; GFX11-NEXT: s_mov_b32 s0, 0xeb24da71 +; GFX11-NEXT: s_mov_b32 s1, 0xc96f385 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX11-NEXT: s_and_b32 s0, s0, 15 +; GFX11-NEXT: s_add_i32 s1, s0, 4 +; GFX11-NEXT: s_cmp_lt_u32 s0, 4 +; GFX11-NEXT: s_cselect_b32 s0, s0, s1 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm + tail call void @llvm.set.rounding(i32 1) + %set.mode = tail call i32 @llvm.get.rounding() + store volatile i32 %set.mode, ptr addrspace(1) null + ret void +} + ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GCN: {{.*}}