[AArch64] Disable consecutive store merging when Neon is unavailable (#111519)

Lowering fixed-size BUILD_VECTORs without Neon may introduce stack spills, leading to more stores/reloads than if the stores were not merged. In some cases, it can also prevent using paired store instructions.

In the future, we may want to relax this when SVE is available, but currently the SVE lowerings for BUILD_VECTOR are limited to a few specific cases.
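For context, a minimal sketch of how such a gate could look. The TargetLowering::canMergeStoresTo hook (which DAGCombiner::mergeConsecutiveStores consults) and AArch64Subtarget::isNeonAvailable() are real LLVM APIs, but the condition shown here is illustrative, not the verbatim patch:

// Sketch only, not the verbatim patch: reject vector merge candidates when
// Neon is unavailable (e.g. in streaming-mode SVE functions). Merging
// consecutive scalar stores produces a store of a fixed-size BUILD_VECTOR,
// which without Neon may be lowered through the stack.
bool AArch64TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
                                             const MachineFunction &MF) const {
  if (MemVT.isVector() && !Subtarget->isNeonAvailable())
    return false;
  // ... existing target-specific checks would remain here.
  return true;
}

Rejecting the merge keeps the scalar stores intact, which the AArch64 load/store optimizer can then combine into paired stp instructions, as the streaming-mode tests below check.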
Showing 3 changed files with 108 additions and 10 deletions.
@@ -0,0 +1,92 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sme -O3 < %s -o - | FileCheck %s --check-prefixes=CHECK

; Tests consecutive stores of @llvm.aarch64.sve.faddv. Within SDAG faddv is
; lowered as a FADDV + EXTRACT_VECTOR_ELT (of lane 0). Stores of extracts can
; be matched by DAGCombiner::mergeConsecutiveStores(), which we want to avoid in
; some cases as it can lead to worse codegen.

; TODO: A single `stp s0, s1, [x0]` may be preferred here.
define void @consecutive_stores_pair(ptr %dest0, <vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1) {
; CHECK-LABEL: consecutive_stores_pair:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    faddv s0, p0, z0.s
; CHECK-NEXT:    faddv s1, p0, z1.s
; CHECK-NEXT:    mov v0.s[1], v1.s[0]
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %dest1 = getelementptr inbounds i8, ptr %dest0, i64 4
  %reduce0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec0)
  %reduce1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec1)
  store float %reduce0, ptr %dest0, align 4
  store float %reduce1, ptr %dest1, align 4
  ret void
}

define void @consecutive_stores_quadruple(ptr %dest0, <vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1, <vscale x 4 x float> %vec2, <vscale x 4 x float> %vec3) {
; CHECK-LABEL: consecutive_stores_quadruple:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    faddv s0, p0, z0.s
; CHECK-NEXT:    faddv s1, p0, z1.s
; CHECK-NEXT:    faddv s2, p0, z2.s
; CHECK-NEXT:    mov v0.s[1], v1.s[0]
; CHECK-NEXT:    faddv s3, p0, z3.s
; CHECK-NEXT:    mov v2.s[1], v3.s[0]
; CHECK-NEXT:    stp d0, d2, [x0]
; CHECK-NEXT:    ret
  %dest1 = getelementptr inbounds i8, ptr %dest0, i64 4
  %dest2 = getelementptr inbounds i8, ptr %dest1, i64 4
  %dest3 = getelementptr inbounds i8, ptr %dest2, i64 4
  %reduce0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec0)
  %reduce1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec1)
  %reduce2 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec2)
  %reduce3 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec3)
  store float %reduce0, ptr %dest0, align 4
  store float %reduce1, ptr %dest1, align 4
  store float %reduce2, ptr %dest2, align 4
  store float %reduce3, ptr %dest3, align 4
  ret void
}

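; Note (added commentary): the functions below are streaming functions
; ("aarch64_pstate_sm_enabled"), where Neon is unavailable, so the consecutive
; stores are not merged into a BUILD_VECTOR; the scalar stores are instead
; paired into `stp` instructions, as the CHECK lines show.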
define void @consecutive_stores_pair_streaming_function(ptr %dest0, <vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1) "aarch64_pstate_sm_enabled" {
; CHECK-LABEL: consecutive_stores_pair_streaming_function:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    faddv s0, p0, z0.s
; CHECK-NEXT:    faddv s1, p0, z1.s
; CHECK-NEXT:    stp s0, s1, [x0]
; CHECK-NEXT:    ret
  %dest1 = getelementptr inbounds i8, ptr %dest0, i64 4
  %reduce0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec0)
  %reduce1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec1)
  store float %reduce0, ptr %dest0, align 4
  store float %reduce1, ptr %dest1, align 4
  ret void
}

define void @consecutive_stores_quadruple_streaming_function(ptr %dest0, <vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1, <vscale x 4 x float> %vec2, <vscale x 4 x float> %vec3) "aarch64_pstate_sm_enabled" {
; CHECK-LABEL: consecutive_stores_quadruple_streaming_function:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    faddv s0, p0, z0.s
; CHECK-NEXT:    faddv s1, p0, z1.s
; CHECK-NEXT:    faddv s2, p0, z2.s
; CHECK-NEXT:    stp s0, s1, [x0]
; CHECK-NEXT:    faddv s3, p0, z3.s
; CHECK-NEXT:    stp s2, s3, [x0, #8]
; CHECK-NEXT:    ret
  %dest1 = getelementptr inbounds i8, ptr %dest0, i64 4
  %dest2 = getelementptr inbounds i8, ptr %dest1, i64 4
  %dest3 = getelementptr inbounds i8, ptr %dest2, i64 4
  %reduce0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec0)
  %reduce1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec1)
  %reduce2 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec2)
  %reduce3 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec3)
  store float %reduce0, ptr %dest0, align 4
  store float %reduce1, ptr %dest1, align 4
  store float %reduce2, ptr %dest2, align 4
  store float %reduce3, ptr %dest3, align 4
  ret void
}