-
Notifications
You must be signed in to change notification settings - Fork 12.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[DAG][RISCV] Use vp_reduce_fadd/fmul when widening types for FP reductions #105840
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -791,12 +791,7 @@ define float @reduce_fadd_16xi32_prefix5(ptr %p) { | |
; CHECK-NEXT: vle32.v v8, (a0) | ||
; CHECK-NEXT: lui a0, 524288 | ||
; CHECK-NEXT: vmv.s.x v10, a0 | ||
; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, ma | ||
; CHECK-NEXT: vslideup.vi v8, v10, 5 | ||
; CHECK-NEXT: vsetivli zero, 7, e32, m2, tu, ma | ||
; CHECK-NEXT: vslideup.vi v8, v10, 6 | ||
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma | ||
; CHECK-NEXT: vslideup.vi v8, v10, 7 | ||
; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Once we add reductions to tryToReduceVL, this VL toggle will be removed, as we'll narrow the load above. I'm deliberately leaving out that part of the change because a post-commit concern was raised on that review, and I want that to be settled before extending it. |
||
; CHECK-NEXT: vfredusum.vs v8, v8, v10 | ||
; CHECK-NEXT: vfmv.f.s fa0, v8 | ||
; CHECK-NEXT: ret | ||
|
@@ -880,7 +875,7 @@ define float @reduce_fadd_4xi32_non_associative(ptr %p) { | |
; CHECK-NEXT: vfmv.f.s fa5, v9 | ||
; CHECK-NEXT: lui a0, 524288 | ||
; CHECK-NEXT: vmv.s.x v9, a0 | ||
; CHECK-NEXT: vslideup.vi v8, v9, 3 | ||
; CHECK-NEXT: vsetivli zero, 3, e32, m1, ta, ma | ||
; CHECK-NEXT: vfredusum.vs v8, v8, v9 | ||
; CHECK-NEXT: vfmv.f.s fa4, v8 | ||
; CHECK-NEXT: fadd.s fa0, fa4, fa5 | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -889,17 +889,12 @@ define half @vreduce_ord_fadd_nxv3f16(<vscale x 3 x half> %v, half %s) { | |
; CHECK-NEXT: csrr a0, vlenb | ||
; CHECK-NEXT: srli a0, a0, 3 | ||
; CHECK-NEXT: slli a1, a0, 1 | ||
; CHECK-NEXT: add a1, a1, a0 | ||
; CHECK-NEXT: add a0, a1, a0 | ||
; CHECK-NEXT: lui a2, 1048568 | ||
; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, ma | ||
; CHECK-NEXT: vmv.v.x v9, a2 | ||
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma | ||
; CHECK-NEXT: vslideup.vx v8, v9, a1 | ||
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma | ||
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma | ||
; CHECK-NEXT: vfmv.s.f v9, fa0 | ||
; CHECK-NEXT: vfredosum.vs v8, v8, v9 | ||
; CHECK-NEXT: vfmv.f.s fa0, v8 | ||
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma | ||
; CHECK-NEXT: vfredosum.vs v9, v8, v9 | ||
; CHECK-NEXT: vfmv.f.s fa0, v9 | ||
; CHECK-NEXT: ret | ||
%red = call half @llvm.vector.reduce.fadd.nxv3f16(half %s, <vscale x 3 x half> %v) | ||
ret half %red | ||
|
@@ -910,18 +905,15 @@ declare half @llvm.vector.reduce.fadd.nxv6f16(half, <vscale x 6 x half>) | |
define half @vreduce_ord_fadd_nxv6f16(<vscale x 6 x half> %v, half %s) { | ||
; CHECK-LABEL: vreduce_ord_fadd_nxv6f16: | ||
; CHECK: # %bb.0: | ||
; CHECK-NEXT: lui a0, 1048568 | ||
; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma | ||
; CHECK-NEXT: vmv.v.x v10, a0 | ||
; CHECK-NEXT: csrr a0, vlenb | ||
; CHECK-NEXT: srli a0, a0, 2 | ||
; CHECK-NEXT: add a1, a0, a0 | ||
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma | ||
; CHECK-NEXT: vslideup.vx v9, v10, a0 | ||
; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma | ||
; CHECK-NEXT: srli a1, a0, 3 | ||
; CHECK-NEXT: slli a1, a1, 1 | ||
; CHECK-NEXT: sub a0, a0, a1 | ||
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma | ||
; CHECK-NEXT: vfmv.s.f v10, fa0 | ||
; CHECK-NEXT: vfredosum.vs v8, v8, v10 | ||
; CHECK-NEXT: vfmv.f.s fa0, v8 | ||
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma | ||
; CHECK-NEXT: vfredosum.vs v10, v8, v10 | ||
; CHECK-NEXT: vfmv.f.s fa0, v10 | ||
; CHECK-NEXT: ret | ||
%red = call half @llvm.vector.reduce.fadd.nxv6f16(half %s, <vscale x 6 x half> %v) | ||
ret half %red | ||
|
@@ -932,22 +924,15 @@ declare half @llvm.vector.reduce.fadd.nxv10f16(half, <vscale x 10 x half>) | |
define half @vreduce_ord_fadd_nxv10f16(<vscale x 10 x half> %v, half %s) { | ||
; CHECK-LABEL: vreduce_ord_fadd_nxv10f16: | ||
; CHECK: # %bb.0: | ||
; CHECK-NEXT: lui a0, 1048568 | ||
; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma | ||
; CHECK-NEXT: vmv.v.x v12, a0 | ||
; CHECK-NEXT: csrr a0, vlenb | ||
; CHECK-NEXT: srli a0, a0, 2 | ||
; CHECK-NEXT: add a1, a0, a0 | ||
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma | ||
; CHECK-NEXT: vslideup.vx v10, v12, a0 | ||
; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma | ||
; CHECK-NEXT: vmv.v.v v11, v12 | ||
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma | ||
; CHECK-NEXT: vslideup.vx v11, v12, a0 | ||
; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma | ||
; CHECK-NEXT: srli a0, a0, 3 | ||
; CHECK-NEXT: li a1, 10 | ||
; CHECK-NEXT: mul a0, a0, a1 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There's something going wrong in our lowering here. We should be simplifying this multiply. (vlen/8 * 10 should be vlen/4 * 5, and we should be expanding the multiply.) Not a blocker, but something I want to follow up on (low priority). |
||
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma | ||
; CHECK-NEXT: vfmv.s.f v12, fa0 | ||
; CHECK-NEXT: vfredosum.vs v8, v8, v12 | ||
; CHECK-NEXT: vfmv.f.s fa0, v8 | ||
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma | ||
; CHECK-NEXT: vfredosum.vs v12, v8, v12 | ||
; CHECK-NEXT: vfmv.f.s fa0, v12 | ||
; CHECK-NEXT: ret | ||
%red = call half @llvm.vector.reduce.fadd.nxv10f16(half %s, <vscale x 10 x half> %v) | ||
ret half %red | ||
|
@@ -958,13 +943,16 @@ declare half @llvm.vector.reduce.fadd.nxv12f16(half, <vscale x 12 x half>) | |
define half @vreduce_ord_fadd_nxv12f16(<vscale x 12 x half> %v, half %s) { | ||
; CHECK-LABEL: vreduce_ord_fadd_nxv12f16: | ||
; CHECK: # %bb.0: | ||
; CHECK-NEXT: lui a0, 1048568 | ||
; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma | ||
; CHECK-NEXT: vmv.v.x v11, a0 | ||
; CHECK-NEXT: csrr a0, vlenb | ||
; CHECK-NEXT: srli a0, a0, 3 | ||
; CHECK-NEXT: slli a1, a0, 2 | ||
; CHECK-NEXT: slli a0, a0, 4 | ||
; CHECK-NEXT: sub a0, a0, a1 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same as above, except that the simplified result should also be equal to VLMAX, and we're failing to prove that for some reason. |
||
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma | ||
; CHECK-NEXT: vfmv.s.f v12, fa0 | ||
; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma | ||
; CHECK-NEXT: vfredosum.vs v8, v8, v12 | ||
; CHECK-NEXT: vfmv.f.s fa0, v8 | ||
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma | ||
; CHECK-NEXT: vfredosum.vs v12, v8, v12 | ||
; CHECK-NEXT: vfmv.f.s fa0, v12 | ||
; CHECK-NEXT: ret | ||
%red = call half @llvm.vector.reduce.fadd.nxv12f16(half %s, <vscale x 12 x half> %v) | ||
ret half %red | ||
|
@@ -977,17 +965,14 @@ define half @vreduce_fadd_nxv3f16(<vscale x 3 x half> %v, half %s) { | |
; CHECK-NEXT: csrr a0, vlenb | ||
; CHECK-NEXT: srli a0, a0, 3 | ||
; CHECK-NEXT: slli a1, a0, 1 | ||
; CHECK-NEXT: add a1, a1, a0 | ||
; CHECK-NEXT: add a0, a1, a0 | ||
; CHECK-NEXT: lui a2, 1048568 | ||
; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, ma | ||
; CHECK-NEXT: vmv.v.x v9, a2 | ||
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma | ||
; CHECK-NEXT: vslideup.vx v8, v9, a1 | ||
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma | ||
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma | ||
; CHECK-NEXT: vfmv.s.f v9, fa0 | ||
; CHECK-NEXT: vfredusum.vs v8, v8, v9 | ||
; CHECK-NEXT: vfmv.f.s fa0, v8 | ||
; CHECK-NEXT: lui a1, 1048568 | ||
; CHECK-NEXT: vmv.s.x v10, a1 | ||
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma | ||
; CHECK-NEXT: vfredusum.vs v10, v8, v9 | ||
; CHECK-NEXT: vfmv.f.s fa0, v10 | ||
; CHECK-NEXT: ret | ||
%red = call reassoc half @llvm.vector.reduce.fadd.nxv3f16(half %s, <vscale x 3 x half> %v) | ||
ret half %red | ||
|
@@ -996,18 +981,17 @@ define half @vreduce_fadd_nxv3f16(<vscale x 3 x half> %v, half %s) { | |
define half @vreduce_fadd_nxv6f16(<vscale x 6 x half> %v, half %s) { | ||
; CHECK-LABEL: vreduce_fadd_nxv6f16: | ||
; CHECK: # %bb.0: | ||
; CHECK-NEXT: lui a0, 1048568 | ||
; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma | ||
; CHECK-NEXT: vmv.v.x v10, a0 | ||
; CHECK-NEXT: csrr a0, vlenb | ||
; CHECK-NEXT: srli a0, a0, 2 | ||
; CHECK-NEXT: add a1, a0, a0 | ||
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma | ||
; CHECK-NEXT: vslideup.vx v9, v10, a0 | ||
; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma | ||
; CHECK-NEXT: srli a1, a0, 3 | ||
; CHECK-NEXT: slli a1, a1, 1 | ||
; CHECK-NEXT: sub a0, a0, a1 | ||
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma | ||
; CHECK-NEXT: vfmv.s.f v10, fa0 | ||
; CHECK-NEXT: vfredusum.vs v8, v8, v10 | ||
; CHECK-NEXT: vfmv.f.s fa0, v8 | ||
; CHECK-NEXT: lui a1, 1048568 | ||
; CHECK-NEXT: vmv.s.x v11, a1 | ||
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma | ||
; CHECK-NEXT: vfredusum.vs v11, v8, v10 | ||
; CHECK-NEXT: vfmv.f.s fa0, v11 | ||
; CHECK-NEXT: ret | ||
%red = call reassoc half @llvm.vector.reduce.fadd.nxv6f16(half %s, <vscale x 6 x half> %v) | ||
ret half %red | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How come this built before without having VP_PROPERTY_NO_FUNCTIONAL?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That property is only used when checking whether the VP operation is marked as having a functional IR-level representation. It doesn't apply to the SDNode representation at all.