[RISCV][VLOPT] Add support for widening integer mul-add instructions #112219
Conversation
@llvm/pr-subscribers-backend-risc-v

Author: Michael Maitland (michaelmaitland)

Changes

This adds support for the widening integer mul-add instructions and also tests getOperandInfo for them. I think the VL on the using add instruction can be optimized further once we add support for optimizing non-VLMAX VLs.

Full diff: https://github.com/llvm/llvm-project/pull/112219.diff

2 Files Affected:
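Condensed from the vwmacc_vv test added below (the function name here is illustrative, and iXLen follows the test file's RUN-line substitution), the pattern being optimized looks like this: the widening mul-add runs at VLMAX while its only user, the vadd, runs at %vl, so the pass can reduce the mul-add's VL to %vl.

; Sketch of the pattern, condensed from the new vwmacc_vv test. Before this
; patch the vwmacc kept VLMAX (vsetvli a1, zero); with it, the VL optimizer
; reduces the vwmacc's VL to the vadd's VL (vsetvli zero, a0).
define <vscale x 4 x i32> @vwmacc_pattern(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, iXLen %vl) {
  %mac = call <vscale x 4 x i32> @llvm.riscv.vwmacc.nxv4i32.nxv4i16(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
  %sum = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %mac, <vscale x 4 x i32> %mac, iXLen %vl)
  ret <vscale x 4 x i32> %sum
}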
diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
index 90af9ef898d951..7c5ce23ad72fc3 100644
--- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
@@ -421,8 +421,8 @@ static OperandInfo getOperandInfo(const MachineInstr &MI,
case RISCV::VWSUB_WX:
// 11.14. Vector Widening Integer Multiply-Add Instructions
// Destination EEW=2*SEW and EMUL=2*LMUL. Source EEW=SEW and EMUL=LMUL.
- // Even though the add is a 2*SEW addition, the operands of the add are the
- // Dest which is 2*SEW and the result of the multiply which is 2*SEW.
+ // A SEW-bit*SEW-bit multiply of the sources forms a 2*SEW-bit value, which
+ // is then added to the 2*SEW-bit Dest.
case RISCV::VWMACCU_VV:
case RISCV::VWMACCU_VX:
case RISCV::VWMACC_VV:
@@ -552,9 +552,13 @@ static bool isSupportedInstr(const MachineInstr &MI) {
// 11.13. Vector Single-Width Integer Multiply-Add Instructions
// FIXME: Add support for 11.13 instructions
// 11.14. Vector Widening Integer Multiply-Add Instructions
- // FIXME: Add support for 11.14 instructions
- case RISCV::VWMACC_VX:
+ case RISCV::VWMACCU_VV:
case RISCV::VWMACCU_VX:
+ case RISCV::VWMACC_VV:
+ case RISCV::VWMACC_VX:
+ case RISCV::VWMACCSU_VV:
+ case RISCV::VWMACCSU_VX:
+ case RISCV::VWMACCUS_VX:
// 11.15. Vector Integer Merge Instructions
// FIXME: Add support for 11.15 instructions
// 11.16. Vector Integer Move Instructions
diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll
index 107252338829bd..7be3964f2ae3c6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll
@@ -804,44 +804,149 @@ define <vscale x 4 x i32> @vmulhsu_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
-define <vscale x 4 x i32> @vwmacc_vx(<vscale x 4 x i16> %a, i16 %b, iXLen %vl) {
+define <vscale x 4 x i32> @vwmacc_vv(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, iXLen %vl) {
+; NOVLOPT-LABEL: vwmacc_vv:
+; NOVLOPT: # %bb.0:
+; NOVLOPT-NEXT: vsetvli a1, zero, e16, m1, tu, ma
+; NOVLOPT-NEXT: vwmacc.vv v8, v10, v11
+; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; NOVLOPT-NEXT: vadd.vv v8, v8, v8
+; NOVLOPT-NEXT: ret
+;
+; VLOPT-LABEL: vwmacc_vv:
+; VLOPT: # %bb.0:
+; VLOPT-NEXT: vsetvli zero, a0, e16, m1, tu, ma
+; VLOPT-NEXT: vwmacc.vv v8, v10, v11
+; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; VLOPT-NEXT: vadd.vv v8, v8, v8
+; VLOPT-NEXT: ret
+ %1 = call <vscale x 4 x i32> @llvm.riscv.vwmacc.nxv4i32.nxv4i16(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
+ %2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
+ ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 4 x i32> @vwmacc_vx(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, iXLen %vl) {
; NOVLOPT-LABEL: vwmacc_vx:
; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e16, m1, ta, ma
-; NOVLOPT-NEXT: vwmacc.vx v10, a0, v8
+; NOVLOPT-NEXT: vsetvli a2, zero, e16, m1, tu, ma
+; NOVLOPT-NEXT: vwmacc.vx v8, a0, v10
; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v10
+; NOVLOPT-NEXT: vadd.vv v8, v8, v8
; NOVLOPT-NEXT: ret
;
; VLOPT-LABEL: vwmacc_vx:
; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; VLOPT-NEXT: vwmacc.vx v10, a0, v8
+; VLOPT-NEXT: vsetvli zero, a1, e16, m1, tu, ma
+; VLOPT-NEXT: vwmacc.vx v8, a0, v10
; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v10, v10
+; VLOPT-NEXT: vadd.vv v8, v8, v8
; VLOPT-NEXT: ret
- %1 = call <vscale x 4 x i32> @llvm.riscv.vwmacc.nxv4i32.i16(<vscale x 4 x i32> poison, i16 %b, <vscale x 4 x i16> %a, iXLen -1, iXLen 0)
+ %1 = call <vscale x 4 x i32> @llvm.riscv.vwmacc.nxv4i32.i16(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
ret <vscale x 4 x i32> %2
}
-define <vscale x 4 x i32> @vwmaccu_vx(<vscale x 4 x i16> %a, i16 %b, iXLen %vl) {
+define <vscale x 4 x i32> @vwmaccu_vv(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, iXLen %vl) {
+; NOVLOPT-LABEL: vwmaccu_vv:
+; NOVLOPT: # %bb.0:
+; NOVLOPT-NEXT: vsetvli a1, zero, e16, m1, tu, ma
+; NOVLOPT-NEXT: vwmaccu.vv v8, v10, v11
+; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; NOVLOPT-NEXT: vadd.vv v8, v8, v8
+; NOVLOPT-NEXT: ret
+;
+; VLOPT-LABEL: vwmaccu_vv:
+; VLOPT: # %bb.0:
+; VLOPT-NEXT: vsetvli zero, a0, e16, m1, tu, ma
+; VLOPT-NEXT: vwmaccu.vv v8, v10, v11
+; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; VLOPT-NEXT: vadd.vv v8, v8, v8
+; VLOPT-NEXT: ret
+ %1 = call <vscale x 4 x i32> @llvm.riscv.vwmaccu.nxv4i32.nxv4i16(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
+ %2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
+ ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 4 x i32> @vwmaccu_vx(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, iXLen %vl) {
; NOVLOPT-LABEL: vwmaccu_vx:
; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e16, m1, ta, ma
-; NOVLOPT-NEXT: vwmaccu.vx v10, a0, v8
+; NOVLOPT-NEXT: vsetvli a2, zero, e16, m1, tu, ma
+; NOVLOPT-NEXT: vwmaccu.vx v8, a0, v10
; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v10
+; NOVLOPT-NEXT: vadd.vv v8, v8, v8
; NOVLOPT-NEXT: ret
;
; VLOPT-LABEL: vwmaccu_vx:
; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; VLOPT-NEXT: vwmaccu.vx v10, a0, v8
+; VLOPT-NEXT: vsetvli zero, a1, e16, m1, tu, ma
+; VLOPT-NEXT: vwmaccu.vx v8, a0, v10
; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v10, v10
+; VLOPT-NEXT: vadd.vv v8, v8, v8
+; VLOPT-NEXT: ret
+ %1 = call <vscale x 4 x i32> @llvm.riscv.vwmaccu.nxv4i32.i16(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
+ %2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
+ ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 4 x i32> @vwmaccsu_vv(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, iXLen %vl) {
+; NOVLOPT-LABEL: vwmaccsu_vv:
+; NOVLOPT: # %bb.0:
+; NOVLOPT-NEXT: vsetvli a1, zero, e16, m1, tu, ma
+; NOVLOPT-NEXT: vwmaccsu.vv v8, v10, v11
+; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; NOVLOPT-NEXT: vadd.vv v8, v8, v8
+; NOVLOPT-NEXT: ret
+;
+; VLOPT-LABEL: vwmaccsu_vv:
+; VLOPT: # %bb.0:
+; VLOPT-NEXT: vsetvli zero, a0, e16, m1, tu, ma
+; VLOPT-NEXT: vwmaccsu.vv v8, v10, v11
+; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; VLOPT-NEXT: vadd.vv v8, v8, v8
+; VLOPT-NEXT: ret
+ %1 = call <vscale x 4 x i32> @llvm.riscv.vwmaccsu.nxv4i32.nxv4i16(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
+ %2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
+ ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 4 x i32> @vwmaccsu_vx(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, iXLen %vl) {
+; NOVLOPT-LABEL: vwmaccsu_vx:
+; NOVLOPT: # %bb.0:
+; NOVLOPT-NEXT: vsetvli a2, zero, e16, m1, tu, ma
+; NOVLOPT-NEXT: vwmaccsu.vx v8, a0, v10
+; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; NOVLOPT-NEXT: vadd.vv v8, v8, v8
+; NOVLOPT-NEXT: ret
+;
+; VLOPT-LABEL: vwmaccsu_vx:
+; VLOPT: # %bb.0:
+; VLOPT-NEXT: vsetvli zero, a1, e16, m1, tu, ma
+; VLOPT-NEXT: vwmaccsu.vx v8, a0, v10
+; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; VLOPT-NEXT: vadd.vv v8, v8, v8
+; VLOPT-NEXT: ret
+ %1 = call <vscale x 4 x i32> @llvm.riscv.vwmaccsu.nxv4i32.i16(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
+ %2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
+ ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 4 x i32> @vwmaccus_vx(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, iXLen %vl) {
+; NOVLOPT-LABEL: vwmaccus_vx:
+; NOVLOPT: # %bb.0:
+; NOVLOPT-NEXT: vsetvli a2, zero, e16, m1, tu, ma
+; NOVLOPT-NEXT: vwmaccus.vx v8, a0, v10
+; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; NOVLOPT-NEXT: vadd.vv v8, v8, v8
+; NOVLOPT-NEXT: ret
+;
+; VLOPT-LABEL: vwmaccus_vx:
+; VLOPT: # %bb.0:
+; VLOPT-NEXT: vsetvli zero, a1, e16, m1, tu, ma
+; VLOPT-NEXT: vwmaccus.vx v8, a0, v10
+; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; VLOPT-NEXT: vadd.vv v8, v8, v8
; VLOPT-NEXT: ret
- %1 = call <vscale x 4 x i32> @llvm.riscv.vwmaccu.nxv4i32.i16(<vscale x 4 x i32> poison, i16 %b, <vscale x 4 x i16> %a, iXLen -1, iXLen 0)
+ %1 = call <vscale x 4 x i32> @llvm.riscv.vwmaccus.nxv4i32.i16(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
ret <vscale x 4 x i32> %2
}
This does not test getOperandInfo for the multiplicands. Should we have tests where these instructions are the ones providing the VL?
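For example (a hypothetical test in the style of the existing ones, not taken from this patch), making the widening mul-add the instruction that supplies the VL would exercise getOperandInfo for its multiplicand operands:

; Hypothetical sketch: the vwmacc is the user and supplies the VL, so the vadd
; feeding its multiplicand is the instruction whose VL could be reduced.
define <vscale x 4 x i32> @vwmacc_vv_user(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, iXLen %vl) {
  %mul1 = call <vscale x 4 x i16> @llvm.riscv.vadd.nxv4i16.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, iXLen -1)
  %res = call <vscale x 4 x i32> @llvm.riscv.vwmacc.nxv4i32.nxv4i16(<vscale x 4 x i32> %a, <vscale x 4 x i16> %mul1, <vscale x 4 x i16> %c, iXLen %vl, iXLen 0)
  ret <vscale x 4 x i32> %res
}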
; VLOPT-NEXT: vmv4r.v v8, v12
; VLOPT-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vwmaccu.nxv4i32.i16(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
%2 = call <vscale x 4 x i64> @llvm.riscv.vwmaccu.nxv4i64.i32(<vscale x 4 x i64> %d, i32 %e, <vscale x 4 x i32> %1, iXLen %vl, iXLen 0)
Nit, should the user instruction for this test and the tests above also be vadd.vv for consistency with the other added tests?
updated.
…lvm#112219) This adds support for these instructions and also tests getOperandInfo for these instructions as well. I think the VL on the using add instruction can be optimized further, once we add support for optimizing non-vlmax.