diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 34a89a907e64873..5be0a049cc5827f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3532,6 +3532,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
       return true;
     return selectImpl(I, *CoverageInfo);
   case TargetOpcode::G_LOAD:
+  case TargetOpcode::G_ZEXTLOAD:
+  case TargetOpcode::G_SEXTLOAD:
   case TargetOpcode::G_STORE:
   case TargetOpcode::G_ATOMIC_CMPXCHG:
   case TargetOpcode::G_ATOMICRMW_XCHG:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sextload-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sextload-local.mir
index 6bac125c0309bac..37958480d28a5f8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sextload-local.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sextload-local.mir
@@ -19,14 +19,18 @@ body: |
     ; GFX6: liveins: $vgpr0
     ; GFX6-NEXT: {{  $}}
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6-NEXT: $m0 = S_MOV_B32 -1
     ; GFX6-NEXT: [[DS_READ_I8_:%[0-9]+]]:vgpr_32 = DS_READ_I8 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
     ; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_I8_]]
+    ;
     ; GFX7-LABEL: name: sextload_local_s32_from_s8_align1
     ; GFX7: liveins: $vgpr0
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX7-NEXT: $m0 = S_MOV_B32 -1
     ; GFX7-NEXT: [[DS_READ_I8_:%[0-9]+]]:vgpr_32 = DS_READ_I8 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
     ; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_I8_]]
+    ;
     ; GFX9-LABEL: name: sextload_local_s32_from_s8_align1
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
@@ -53,14 +57,18 @@ body: |
    ; GFX6: liveins: $vgpr0
    ; GFX6-NEXT: {{  $}}
    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6-NEXT: $m0 = S_MOV_B32 -1
    ; GFX6-NEXT: [[DS_READ_I16_:%[0-9]+]]:vgpr_32 = DS_READ_I16 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s16), addrspace 3)
    ; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_I16_]]
+    ;
    ; GFX7-LABEL: name: sextload_local_s32_from_s16_align2
    ; GFX7: liveins: $vgpr0
    ; GFX7-NEXT: {{  $}}
    ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX7-NEXT: $m0 = S_MOV_B32 -1
    ; GFX7-NEXT: [[DS_READ_I16_:%[0-9]+]]:vgpr_32 = DS_READ_I16 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s16), addrspace 3)
    ; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_I16_]]
+    ;
    ; GFX9-LABEL: name: sextload_local_s32_from_s16_align2
    ; GFX9: liveins: $vgpr0
    ; GFX9-NEXT: {{  $}}
@@ -105,15 +113,19 @@ body: |
    ; GFX6-NEXT: {{  $}}
    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
-    ; GFX6-NEXT: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6-NEXT: [[DS_READ_I8_:%[0-9]+]]:vgpr_32 = DS_READ_I8 %2, 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
+    ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
+    ; GFX6-NEXT: $m0 = S_MOV_B32 -1
+    ; GFX6-NEXT: [[DS_READ_I8_:%[0-9]+]]:vgpr_32 = DS_READ_I8 [[V_ADD_CO_U32_e64_]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
    ; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_I8_]]
+    ;
    ; GFX7-LABEL: name: sextload_local_s32_from_s8_align1_offset4095
    ; GFX7: liveins: $vgpr0
    ; GFX7-NEXT: {{  $}}
    ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX7-NEXT: $m0 = S_MOV_B32 -1
    ; GFX7-NEXT: [[DS_READ_I8_:%[0-9]+]]:vgpr_32 = DS_READ_I8 [[COPY]], 4095, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
    ; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_I8_]]
+    ;
    ; GFX9-LABEL: name: sextload_local_s32_from_s8_align1_offset4095
    ; GFX9: liveins: $vgpr0
    ; GFX9-NEXT: {{  $}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zextload-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zextload-local.mir
index 63e5d061f8c372f..29671c13e173f51 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zextload-local.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zextload-local.mir
@@ -19,14 +19,18 @@ body: |
    ; GFX6: liveins: $vgpr0
    ; GFX6-NEXT: {{  $}}
    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6-NEXT: $m0 = S_MOV_B32 -1
    ; GFX6-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
    ; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_U8_]]
+    ;
    ; GFX7-LABEL: name: zextload_local_s32_from_s8_align1
    ; GFX7: liveins: $vgpr0
    ; GFX7-NEXT: {{  $}}
    ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX7-NEXT: $m0 = S_MOV_B32 -1
    ; GFX7-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
    ; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_U8_]]
+    ;
    ; GFX9-LABEL: name: zextload_local_s32_from_s8_align1
    ; GFX9: liveins: $vgpr0
    ; GFX9-NEXT: {{  $}}
@@ -53,14 +57,18 @@ body: |
    ; GFX6: liveins: $vgpr0
    ; GFX6-NEXT: {{  $}}
    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6-NEXT: $m0 = S_MOV_B32 -1
    ; GFX6-NEXT: [[DS_READ_U16_:%[0-9]+]]:vgpr_32 = DS_READ_U16 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s16), addrspace 3)
    ; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_U16_]]
+    ;
    ; GFX7-LABEL: name: zextload_local_s32_from_s16_align2
    ; GFX7: liveins: $vgpr0
    ; GFX7-NEXT: {{  $}}
    ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX7-NEXT: $m0 = S_MOV_B32 -1
    ; GFX7-NEXT: [[DS_READ_U16_:%[0-9]+]]:vgpr_32 = DS_READ_U16 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s16), addrspace 3)
    ; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_U16_]]
+    ;
    ; GFX9-LABEL: name: zextload_local_s32_from_s16_align2
    ; GFX9: liveins: $vgpr0
    ; GFX9-NEXT: {{  $}}
@@ -105,15 +113,19 @@ body: |
    ; GFX6-NEXT: {{  $}}
    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
-    ; GFX6-NEXT: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 %2, 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
+    ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
+    ; GFX6-NEXT: $m0 = S_MOV_B32 -1
+    ; GFX6-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[V_ADD_CO_U32_e64_]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
    ; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_U8_]]
+    ;
    ; GFX7-LABEL: name: zextload_local_s32_from_s8_align1_offset4095
    ; GFX7: liveins: $vgpr0
    ; GFX7-NEXT: {{  $}}
    ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX7-NEXT: $m0 = S_MOV_B32 -1
    ; GFX7-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[COPY]], 4095, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
    ; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_U8_]]
+    ;
    ; GFX9-LABEL: name: zextload_local_s32_from_s8_align1_offset4095
    ; GFX9: liveins: $vgpr0
    ; GFX9-NEXT: {{  $}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
index fef672570352c37..21f1af1feb4a06c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
@@ -90,54 +90,53 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
 ; GFX7-LABEL: load_lds_v4i32_align1:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    ds_read_u8 v1, v0 offset:1
-; GFX7-NEXT:    ds_read_u8 v2, v0
-; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT:    ds_read_u8 v2, v0 offset:3
+; GFX7-NEXT:    ds_read_u8 v1, v0
+; GFX7-NEXT:    ds_read_u8 v2, v0 offset:1
+; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
+; GFX7-NEXT:    ds_read_u8 v4, v0 offset:3
 ; GFX7-NEXT:    ds_read_u8 v5, v0 offset:4
 ; GFX7-NEXT:    ds_read_u8 v6, v0 offset:5
 ; GFX7-NEXT:    ds_read_u8 v7, v0 offset:6
 ; GFX7-NEXT:    ds_read_u8 v8, v0 offset:7
-; GFX7-NEXT:    ds_read_u8 v9, v0 offset:8
-; GFX7-NEXT:    ds_read_u8 v10, v0 offset:9
-; GFX7-NEXT:    ds_read_u8 v11, v0 offset:10
-; GFX7-NEXT:    s_waitcnt lgkmcnt(7)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX7-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v6
-; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT:    ds_read_u8 v3, v0 offset:11
-; GFX7-NEXT:    ds_read_u8 v5, v0 offset:12
-; GFX7-NEXT:    ds_read_u8 v6, v0 offset:13
-; GFX7-NEXT:    ds_read_u8 v7, v0 offset:14
-; GFX7-NEXT:    ds_read_u8 v0, v0 offset:15
 ; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX7-NEXT:    ds_read_u8 v2, v0 offset:8
+; GFX7-NEXT:    ds_read_u8 v3, v0 offset:9
+; GFX7-NEXT:    ds_read_u8 v5, v0 offset:10
+; GFX7-NEXT:    ds_read_u8 v6, v0 offset:11
+; GFX7-NEXT:    ds_read_u8 v7, v0 offset:12
+; GFX7-NEXT:    ds_read_u8 v8, v0 offset:13
+; GFX7-NEXT:    ds_read_u8 v9, v0 offset:14
+; GFX7-NEXT:    ds_read_u8 v0, v0 offset:15
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v10
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v9
-; GFX7-NEXT:    v_or_b32_e32 v3, v3, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v5
 ; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v6
-; GFX7-NEXT:    v_or_b32_e32 v3, v3, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v8
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v9
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v7
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
 ; GFX7-NEXT:    v_or_b32_e32 v3, v0, v3
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v4
@@ -270,8 +269,8 @@ define <4 x i32> @load_lds_v4i32_align2(ptr addrspace(3) %ptr) {
 ; GFX7-LABEL: load_lds_v4i32_align2:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    ds_read_u16 v1, v0
 ; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    ds_read_u16 v1, v0
 ; GFX7-NEXT:    ds_read_u16 v2, v0 offset:2
 ; GFX7-NEXT:    ds_read_u16 v3, v0 offset:4
 ; GFX7-NEXT:    ds_read_u16 v4, v0 offset:6
@@ -281,11 +280,12 @@ define <4 x i32> @load_lds_v4i32_align2(ptr addrspace(3) %ptr) {
 ; GFX7-NEXT:    ds_read_u16 v8, v0 offset:14
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
index 225f2165977b3c8..67a089b5cd17dc5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
@@ -81,42 +81,42 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
 ; GFX7-LABEL: load_lds_v3i32_align1:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    ds_read_u8 v1, v0 offset:1
-; GFX7-NEXT:    ds_read_u8 v2, v0
-; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT:    ds_read_u8 v2, v0 offset:3
-; GFX7-NEXT:    ds_read_u8 v4, v0 offset:4
-; GFX7-NEXT:    ds_read_u8 v5, v0 offset:5
-; GFX7-NEXT:    ds_read_u8 v6, v0 offset:6
-; GFX7-NEXT:    ds_read_u8 v7, v0 offset:7
-; GFX7-NEXT:    ds_read_u8 v8, v0 offset:8
-; GFX7-NEXT:    ds_read_u8 v9, v0 offset:9
-; GFX7-NEXT:    ds_read_u8 v10, v0 offset:10
-; GFX7-NEXT:    s_waitcnt lgkmcnt(7)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    ds_read_u8 v1, v0
+; GFX7-NEXT:    ds_read_u8 v2, v0 offset:1
+; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
+; GFX7-NEXT:    ds_read_u8 v4, v0 offset:3
+; GFX7-NEXT:    ds_read_u8 v5, v0 offset:4
+; GFX7-NEXT:    ds_read_u8 v6, v0 offset:5
+; GFX7-NEXT:    ds_read_u8 v7, v0 offset:6
+; GFX7-NEXT:    ds_read_u8 v8, v0 offset:7
+; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    ds_read_u8 v0, v0 offset:11
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX7-NEXT:    v_or_b32_e32 v3, v2, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v5
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v6
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
+; GFX7-NEXT:    ds_read_u8 v5, v0 offset:8
+; GFX7-NEXT:    ds_read_u8 v6, v0 offset:9
+; GFX7-NEXT:    ds_read_u8 v7, v0 offset:10
+; GFX7-NEXT:    ds_read_u8 v0, v0 offset:11
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v7
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v8
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v9
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v6
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v10
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX7-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v3
@@ -223,8 +223,8 @@ define <3 x i32> @load_lds_v3i32_align2(ptr addrspace(3) %ptr) {
 ; GFX7-LABEL: load_lds_v3i32_align2:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    ds_read_u16 v1, v0
 ; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    ds_read_u16 v1, v0
 ; GFX7-NEXT:    ds_read_u16 v2, v0 offset:2
 ; GFX7-NEXT:    ds_read_u16 v3, v0 offset:4
 ; GFX7-NEXT:    ds_read_u16 v4, v0 offset:6
@@ -235,9 +235,9 @@ define <3 x i32> @load_lds_v3i32_align2(ptr addrspace(3) %ptr) {
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
index 06bf71f5e122cc9..c595c939e8d1394 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
@@ -19,54 +19,53 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
 ; GFX7-LABEL: load_lds_v4i32_align1:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    ds_read_u8 v1, v0 offset:1
-; GFX7-NEXT:    ds_read_u8 v2, v0
-; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT:    ds_read_u8 v2, v0 offset:3
+; GFX7-NEXT:    ds_read_u8 v1, v0
+; GFX7-NEXT:    ds_read_u8 v2, v0 offset:1
+; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
+; GFX7-NEXT:    ds_read_u8 v4, v0 offset:3
 ; GFX7-NEXT:    ds_read_u8 v5, v0 offset:4
 ; GFX7-NEXT:    ds_read_u8 v6, v0 offset:5
 ; GFX7-NEXT:    ds_read_u8 v7, v0 offset:6
 ; GFX7-NEXT:    ds_read_u8 v8, v0 offset:7
-; GFX7-NEXT:    ds_read_u8 v9, v0 offset:8
-; GFX7-NEXT:    ds_read_u8 v10, v0 offset:9
-; GFX7-NEXT:    ds_read_u8 v11, v0 offset:10
-; GFX7-NEXT:    s_waitcnt lgkmcnt(7)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX7-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v6
-; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT:    ds_read_u8 v3, v0 offset:11
-; GFX7-NEXT:    ds_read_u8 v5, v0 offset:12
-; GFX7-NEXT:    ds_read_u8 v6, v0 offset:13
-; GFX7-NEXT:    ds_read_u8 v7, v0 offset:14
-; GFX7-NEXT:    ds_read_u8 v0, v0 offset:15
 ; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX7-NEXT:    ds_read_u8 v2, v0 offset:8
+; GFX7-NEXT:    ds_read_u8 v3, v0 offset:9
+; GFX7-NEXT:    ds_read_u8 v5, v0 offset:10
+; GFX7-NEXT:    ds_read_u8 v6, v0 offset:11
+; GFX7-NEXT:    ds_read_u8 v7, v0 offset:12
+; GFX7-NEXT:    ds_read_u8 v8, v0 offset:13
+; GFX7-NEXT:    ds_read_u8 v9, v0 offset:14
+; GFX7-NEXT:    ds_read_u8 v0, v0 offset:15
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v10
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v9
-; GFX7-NEXT:    v_or_b32_e32 v3, v3, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v5
 ; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v6
-; GFX7-NEXT:    v_or_b32_e32 v3, v3, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v8
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v9
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v7
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
 ; GFX7-NEXT:    v_or_b32_e32 v3, v0, v3
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v4
@@ -102,42 +101,42 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
 ; GFX7-LABEL: load_lds_v3i32_align1:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    ds_read_u8 v1, v0 offset:1
-; GFX7-NEXT:    ds_read_u8 v2, v0
-; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT:    ds_read_u8 v2, v0 offset:3
-; GFX7-NEXT:    ds_read_u8 v4, v0 offset:4
-; GFX7-NEXT:    ds_read_u8 v5, v0 offset:5
-; GFX7-NEXT:    ds_read_u8 v6, v0 offset:6
-; GFX7-NEXT:    ds_read_u8 v7, v0 offset:7
-; GFX7-NEXT:    ds_read_u8 v8, v0 offset:8
-; GFX7-NEXT:    ds_read_u8 v9, v0 offset:9
-; GFX7-NEXT:    ds_read_u8 v10, v0 offset:10
-; GFX7-NEXT:    s_waitcnt lgkmcnt(7)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    ds_read_u8 v1, v0
+; GFX7-NEXT:    ds_read_u8 v2, v0 offset:1
+; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
+; GFX7-NEXT:    ds_read_u8 v4, v0 offset:3
+; GFX7-NEXT:    ds_read_u8 v5, v0 offset:4
+; GFX7-NEXT:    ds_read_u8 v6, v0 offset:5
+; GFX7-NEXT:    ds_read_u8 v7, v0 offset:6
+; GFX7-NEXT:    ds_read_u8 v8, v0 offset:7
+; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    ds_read_u8 v0, v0 offset:11
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX7-NEXT:    v_or_b32_e32 v3, v2, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v5
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v6
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
+; GFX7-NEXT:    ds_read_u8 v5, v0 offset:8
+; GFX7-NEXT:    ds_read_u8 v6, v0 offset:9
+; GFX7-NEXT:    ds_read_u8 v7, v0 offset:10
+; GFX7-NEXT:    ds_read_u8 v0, v0 offset:11
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v7
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v8
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v9
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v6
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v10
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX7-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v3
diff --git a/llvm/test/CodeGen/AMDGPU/med3-knownbits.ll b/llvm/test/CodeGen/AMDGPU/med3-knownbits.ll
index fe0ab81bffe3101..01e1e56aea8882b 100644
--- a/llvm/test/CodeGen/AMDGPU/med3-knownbits.ll
+++ b/llvm/test/CodeGen/AMDGPU/med3-knownbits.ll
@@ -11,26 +11,16 @@ declare i32 @llvm.umax.i32(i32, i32)
 ; 0 sign bit only after umed3 is formed. The DS instruction offset can
 ; only be folded on SI with a positive base address.
 define i32 @v_known_bits_umed3(i8 %a) {
-; SI-SDAG-LABEL: v_known_bits_umed3:
-; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x80
-; SI-SDAG-NEXT:    v_med3_u32 v0, v0, 32, v1
-; SI-SDAG-NEXT:    s_mov_b32 m0, -1
-; SI-SDAG-NEXT:    ds_read_u8 v0, v0 offset:128
-; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; SI-GISEL-LABEL: v_known_bits_umed3:
-; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x80
-; SI-GISEL-NEXT:    v_med3_u32 v0, v0, 32, v1
-; SI-GISEL-NEXT:    ds_read_u8 v0, v0 offset:128
-; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-LABEL: v_known_bits_umed3:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0x80
+; SI-NEXT:    v_med3_u32 v0, v0, 32, v1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read_u8 v0, v0 offset:128
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_setpc_b64 s[30:31]
   %ext.a = zext i8 %a to i32
   %max.a = call i32 @llvm.umax.i32(i32 %ext.a, i32 32)
   %umed3 = call i32 @llvm.umin.i32(i32 %max.a, i32 128)
@@ -120,5 +110,3 @@ define i32 @v_known_signbits_smed3(i16 %a, i16 %b) {
   %mul = sdiv i32 %smed3.a, %smed3.b
   ret i32 %mul
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; SI: {{.*}}