[AArch64] Add SchedReadAdvance to Neoverse-V1 scheduling model. #111538
Conversation
@llvm/pr-subscribers-backend-aarch64

Author: Rin Dobrescu (Rin18)

Changes

Introduce a description of late forwarding to the Neoverse-V1 Scheduling model.

Patch is 108.18 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/111538.diff

3 Files Affected:
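As background on the pattern the diff applies repeatedly: SchedReadAdvance tells the machine scheduler that a consumer may read a forwarded operand some cycles before the producer's nominal latency has elapsed. The sketch below shows the shape of one such pairing; the Example* names are placeholders for illustration only, while the patch's own definitions (V1Wr_VMA/V1Rd_VMA and friends) follow the same structure.

// Illustrative only -- not definitions from this patch.
// Producer: a 4-cycle multiply-accumulate class on the V pipeline.
def ExampleWr_MLA : SchedWriteRes<[V1UnitV]> { let Latency = 4; }
// Consumers chained through the forwarded operand see the result
// 3 cycles early, i.e. an effective producer-to-consumer latency of 1.
def ExampleRd_MLA : SchedReadAdvance<3, [ExampleWr_MLA]>;
// Bind both to the accumulating instructions; the first listed read
// corresponds to the tied accumulator operand, remaining reads default.
def : InstRW<[ExampleWr_MLA, ExampleRd_MLA], (instregex "^MLAv", "^MLSv")>;

The new V1-forwarding.s llvm-mca test added later in the diff is meant to exercise exactly this kind of accumulate chain with --timeline.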
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td
index f7e6545f0dd386..1d7cb699f731aa 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td
@@ -469,6 +469,87 @@ def V1Write_11c_9L01_9S_9V : SchedWriteRes<[V1UnitL01, V1UnitL01, V1UnitL01,
V1UnitV, V1UnitV, V1UnitV,
V1UnitV, V1UnitV, V1UnitV]>;
+//===----------------------------------------------------------------------===//
+// Define forwarded types
+
+// NOTE: SOG, p. 20, n. 2: Accumulator forwarding is not supported for
+// consumers of 64 bit multiply high operations?
+def V1WriteIM : SchedWriteVariant<
+ [SchedVar<NeoverseMULIdiomPred, [V1Write_2c_1M]>,
+ SchedVar<NoSchedPred, [V1Write_2c_1M0]>]>;
+def V1Rd_MA : SchedReadAdvance<1, [V1Write_2c_1M0]>;
+
+def V1Wr_FMA : SchedWriteRes<[V1UnitV]> { let Latency = 4; }
+def V1Rd_FMA : SchedReadAdvance<2, [WriteFMul, V1Wr_FMA]>;
+
+def V1Wr_ADA : SchedWriteRes<[V1UnitV13]> { let Latency = 4; }
+def V1Rd_ADA : SchedReadAdvance<3, [V1Wr_ADA]>;
+
+def V1Wr_VDOT : SchedWriteRes<[V1UnitV]> { let Latency = 3; }
+def V1Rd_VDOT : SchedReadAdvance<2, [V1Wr_VDOT]>;
+
+def V1Wr_VMMA : SchedWriteRes<[V1UnitV]> { let Latency = 3; }
+def V1Rd_VMMA : SchedReadAdvance<2, [V1Wr_VMMA]>;
+
+def V1Wr_VMA : SchedWriteRes<[V1UnitV02]> { let Latency = 4; }
+def V1Rd_VMA : SchedReadAdvance<3, [V1Wr_VMA]>;
+
+def V1Wr_VMAL : SchedWriteRes<[V1UnitV02]> { let Latency = 4; }
+def V1Rd_VMAL : SchedReadAdvance<3, [V1Wr_VMAL]>;
+
+def V1Wr_VSA : SchedWriteRes<[V1UnitV13]> { let Latency = 4; }
+def V1Rd_VSA : SchedReadAdvance<3, [V1Wr_VSA]>;
+
+def V1Wr_FCMA : SchedWriteRes<[V1UnitV]> { let Latency = 4; }
+def V1Rd_FCMA : SchedReadAdvance<2, [V1Wr_FCMA]>;
+
+def V1Wr_FPM : SchedWriteRes<[V1UnitV]> { let Latency = 3; }
+def V1Wr_FPMA : SchedWriteRes<[V1UnitV]> { let Latency = 4; }
+def V1Rd_FPMA : SchedReadAdvance<2, [V1Wr_FPM, V1Wr_FPMA]>;
+
+def V1Wr_FPMAL : SchedWriteRes<[V1UnitV]> { let Latency = 5; }
+def V1Rd_FPMAL : SchedReadAdvance<3, [V1Wr_FPMAL]>;
+
+def V1Wr_BFD : SchedWriteRes<[V1UnitV]> { let Latency = 4; }
+def V1Rd_BFD : SchedReadAdvance<2, [V1Wr_BFD]>;
+
+def V1Wr_BFMMA : SchedWriteRes<[V1UnitV]> { let Latency = 5; }
+def V1Rd_BFMMA : SchedReadAdvance<2, [V1Wr_BFMMA]>;
+
+def V1Wr_BFMLA : SchedWriteRes<[V1UnitV]> { let Latency = 4; }
+def V1Rd_BFMLA : SchedReadAdvance<2, [V1Wr_BFMLA]>;
+
+def V1Wr_CRC : SchedWriteRes<[V1UnitM0]> { let Latency = 2; }
+def V1Rd_CRC : SchedReadAdvance<1, [V1Wr_CRC]>;
+
+def V1Wr_ZDOTB : SchedWriteRes<[V1UnitV01]> { let Latency = 3; }
+def V1Rd_ZDOTB : SchedReadAdvance<2, [V1Wr_ZDOTB]>;
+
+def V1Wr_ZUDOTB : SchedWriteRes<[V1UnitV]> { let Latency = 3; }
+def V1Rd_ZUDOTB : SchedReadAdvance<2, [V1Wr_ZUDOTB]>;
+
+def V1Wr_ZDOTH : SchedWriteRes<[V1UnitV0]> { let Latency = 4; }
+def V1Rd_ZDOTH : SchedReadAdvance<3, [V1Wr_ZDOTH]>;
+
+def V1Wr_ZMMA : SchedWriteRes<[V1UnitV01]> { let Latency = 3; }
+def V1Rd_ZMMA : SchedReadAdvance<2, [V1Wr_ZMMA]>;
+
+let Latency = 5, NumMicroOps = 2 in
+def V1Wr_ZMAD : SchedWriteRes<[V1UnitV0, V1UnitV0]>;
+def V1Rd_ZMAD : SchedReadAdvance<3, [V1Wr_ZMAD]>;
+
+def V1Wr_ZFCMA : SchedWriteRes<[V1UnitV01]> { let Latency = 5; }
+def V1Rd_ZFCMA : SchedReadAdvance<3, [V1Wr_ZFCMA]>;
+
+def V1Wr_ZFMA : SchedWriteRes<[V1UnitV01]> { let Latency = 4; }
+def V1Rd_ZFMA : SchedReadAdvance<2, [V1Wr_ZFMA]>;
+
+def V1Wr_ZBFDOT : SchedWriteRes<[V1UnitV01]> { let Latency = 4; }
+def V1Rd_ZBFDOT : SchedReadAdvance<2, [V1Wr_ZBFDOT]>;
+def V1Wr_ZBFMMA : SchedWriteRes<[V1UnitV01]> { let Latency = 5; }
+def V1Rd_ZBFMMA : SchedReadAdvance<2, [V1Wr_ZBFMMA]>;
+def V1Wr_ZBFMAL : SchedWriteRes<[V1UnitV01]> { let Latency = 5; }
+def V1Rd_ZBFMAL : SchedReadAdvance<3, [V1Wr_ZBFMAL]>;
// Miscellaneous Instructions
// -----------------------------------------------------------------------------
@@ -553,16 +634,22 @@ def : InstRW<[V1Write_1c_1J], (instrs SETF8, SETF16, RMIF, CFINV)>;
def : SchedAlias<WriteID32, V1Write_12c5_1M0>;
def : SchedAlias<WriteID64, V1Write_20c5_1M0>;
+def : SchedAlias<WriteIM32, V1Write_2c_1M>;
+def : SchedAlias<WriteIM64, V1Write_2c_1M>;
+
// Multiply
-// Multiply accumulate
-// Multiply accumulate, long
-// Multiply long
-def V1WriteIM : SchedWriteVariant<
+// Multiply accumulate, W-form
+// Multiply accumulate, X-form
+/*def V1WriteIM : SchedWriteVariant<
[SchedVar<NeoverseMULIdiomPred, [V1Write_2c_1M]>,
SchedVar<NoSchedPred, [V1Write_2c_1M0]>]>;
-def : SchedAlias<WriteIM32, V1WriteIM>;
-def : SchedAlias<WriteIM64, V1WriteIM>;
+def V1Rd_MA : SchedReadAdvance<1, [V1Write_2c_1M0]>;*/
+def : InstRW<[V1WriteIM, ReadIM, ReadIM, V1Rd_MA], (instregex "^M(ADD|SUB)[WX]rrr$")>;
+// Multiply accumulate long
+// Multiply long
+def : InstRW<[V1WriteIM, ReadIM, ReadIM, V1Rd_MA],
+ (instregex "^(S|U)M(ADD|SUB)Lrrr$")>;
// Multiply high
def : InstRW<[V1Write_3c_1M, ReadIM, ReadIM], (instrs SMULHrr, UMULHrr)>;
@@ -680,10 +767,10 @@ def : InstRW<[V1Write_15c7_1V02], (instrs FDIVDrr)>;
def : InstRW<[V1Write_16c7_1V02], (instrs FSQRTDr)>;
// FP multiply
-def : SchedAlias<WriteFMul, V1Write_3c_1V>;
+def : WriteRes<WriteFMul, [V1UnitV]> { let Latency = 3; }
// FP multiply accumulate
-def : InstRW<[V1Write_4c_1V], (instregex "^FN?M(ADD|SUB)[HSD]rrr$")>;
+def : InstRW<[V1Wr_FMA, ReadDefault, ReadDefault, V1Rd_FMA], (instregex "^FN?M(ADD|SUB)[HSD]rrr$")>;
// FP round to integral
def : InstRW<[V1Write_3c_1V02], (instregex "^FRINT[AIMNPXZ][HSD]r$",
@@ -824,7 +911,7 @@ def : SchedAlias<WriteVq, V1Write_2c_1V>;
// ASIMD absolute diff accum
// ASIMD absolute diff accum long
// ASIMD pairwise add and accumulate long
-def : InstRW<[V1Write_4c_1V13], (instregex "^[SU]ABAL?v", "^[SU]ADALPv")>;
+def : InstRW<[V1Wr_ADA, V1Rd_ADA], (instregex "^[SU]ABAL?v", "^[SU]ADALPv")>;
// ASIMD arith, reduce, 4H/4S
// ASIMD max/min, reduce, 4H/4S
@@ -843,23 +930,25 @@ def : InstRW<[V1Write_4c_2V13], (instregex "^(ADD|[SU]ADDL)Vv16i8v$",
// ASIMD dot product
// ASIMD dot product using signed and unsigned integers
-def : InstRW<[V1Write_2c_1V], (instregex "^([SU]|SU|US)DOT(lane)?v(8|16)i8$")>;
+def : InstRW<[V1Wr_VDOT, V1Rd_VDOT], (instregex "^([SU]|SU|US)DOT(lane)?v(8|16)i8$")>;
-// ASIMD matrix multiply- accumulate
-def : InstRW<[V1Write_3c_1V], (instrs SMMLA, UMMLA, USMMLA)>;
+// ASIMD matrix multiply-accumulate
+def : InstRW<[V1Wr_VMMA, V1Rd_VMMA], (instrs SMMLA, UMMLA, USMMLA)>;
// ASIMD multiply
+def : InstRW<[V1Write_4c_1V02], (instregex "^MULv", "^SQ(R)?DMULHv")>;
+
// ASIMD multiply accumulate
+def : InstRW<[V1Wr_VMA, V1Rd_VMA], (instregex "^MLAv", "^MLSv")>;
+
// ASIMD multiply accumulate long
+def : InstRW<[V1Wr_VMAL, V1Rd_VMAL], (instregex "^[SU]MLALv", "^[SU]MLSLv")>;
+
// ASIMD multiply accumulate high
+def : InstRW<[V1Write_4c_1V02], (instregex "^SQRDMLAHv", "^SQRDMLSHv")>;
+
// ASIMD multiply accumulate saturating long
-def : InstRW<[V1Write_4c_1V02],
- (instregex "^MUL(v[148]i16|v[124]i32)$",
- "^SQR?DMULH(v[48]i16|v[24]i32)$",
- "^ML[AS](v[148]i16|v[124]i32)$",
- "^[SU]ML[AS]Lv",
- "^SQRDML[AS]H(v[148]i16|v[124]i32)$",
- "^SQDML[AS]Lv")>;
+def : InstRW<[V1Write_4c_1V02], (instregex "^SQDML[AS]L[iv]")>;
// ASIMD multiply/multiply long (8x8) polynomial
def : InstRW<[V1Write_3c_1V01], (instregex "^PMULL?v(8|16)i8$")>;
@@ -868,11 +957,12 @@ def : InstRW<[V1Write_3c_1V01], (instregex "^PMULL?v(8|16)i8$")>;
def : InstRW<[V1Write_3c_1V02], (instregex "^([SU]|SQD)MULLv")>;
// ASIMD shift accumulate
+def : InstRW<[V1Wr_VSA, V1Rd_VSA], (instregex "^[SU]SRAv", "^[SU]RSRAv")>;
+
// ASIMD shift by immed, complex
// ASIMD shift by register, complex
def : InstRW<[V1Write_4c_1V13],
- (instregex "^[SU]R?SRAv",
- "^RSHRNv", "^SQRSHRU?Nv", "^(SQSHLU?|UQSHL)[bhsd]$",
+ (instregex "^RSHRNv", "^SQRSHRU?Nv", "^(SQSHLU?|UQSHL)[bhsd]$",
"^(SQSHLU?|UQSHL)(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v2i64)_shift$",
"^SQSHU?RNv", "^[SU]RSHRv", "^UQR?SHRNv",
"^[SU]Q?RSHLv", "^[SU]QSHLv")>;
@@ -890,16 +980,25 @@ def : InstRW<[V1Write_2c_1V13], (instregex "^SHLL?v", "^SHRNv", "^[SU]SHLLv",
// ASIMD FP absolute value/difference
// ASIMD FP arith, normal
// ASIMD FP compare
-// ASIMD FP complex add
// ASIMD FP max/min, normal
// ASIMD FP max/min, pairwise
// ASIMD FP negate
// Covered by "SchedAlias (WriteV[dq]...)" above
+// ASIMD FP complex add
+def : InstRW<[V1Write_4c_1V], (instregex "^FCADD(v[48]f16|v[24]f32|v2f64)$")>;
+
// ASIMD FP complex multiply add
+def : InstRW<[V1Wr_FCMA, V1Rd_FCMA], (instregex "^FCMLAv")>;
+
+// ASIMD FP multiply
+def : InstRW<[V1Wr_FPM], (instregex "^FMULX?v")>;
+
// ASIMD FP multiply accumulate
-def : InstRW<[V1Write_4c_1V], (instregex "^FCADD(v[48]f16|v[24]f32|v2f64)$",
- "^FML[AS]v")>;
+def : InstRW<[V1Wr_FPMA, V1Rd_FPMA], (instregex "^FML[AS]v")>;
+
+// ASIMD FP multiply accumulate long
+def : InstRW<[V1Wr_FPMAL, V1Rd_FPMAL], (instregex "^FML[AS]L2?v")>;
// ASIMD FP convert, long (F16 to F32)
def : InstRW<[V1Write_4c_2V02], (instregex "^FCVTLv[48]i16$")>;
@@ -953,12 +1052,6 @@ def : InstRW<[V1Write_4c_2V], (instregex "^F(MAX|MIN)(NM)?Vv4(i16|i32)v$")>;
// ASIMD FP max/min, reduce, Q-form F16
def : InstRW<[V1Write_6c_3V], (instregex "^F(MAX|MIN)(NM)?Vv8i16v$")>;
-// ASIMD FP multiply
-def : InstRW<[V1Write_3c_1V], (instregex "^FMULX?v")>;
-
-// ASIMD FP multiply accumulate long
-def : InstRW<[V1Write_5c_1V], (instregex "^FML[AS]L2?v")>;
-
// ASIMD FP round, D-form F32 and Q-form F64
def : InstRW<[V1Write_3c_1V02], (instregex "^FRINT[AIMNPXZ]v2f(32|64)$")>;
@@ -976,13 +1069,13 @@ def : InstRW<[V1Write_6c_4V02], (instregex "^FRINT[AIMNPXZ]v8f16$")>;
def : InstRW<[V1Write_4c_1V02], (instrs BFCVTN, BFCVTN2)>;
// ASIMD dot product
-def : InstRW<[V1Write_4c_1V], (instregex "^BF(DOT|16DOTlane)v[48]bf16$")>;
+def : InstRW<[V1Wr_BFD, V1Rd_BFD], (instregex "^BF(DOT|16DOTlane)v[48]bf16$")>;
// ASIMD matrix multiply accumulate
-def : InstRW<[V1Write_5c_1V], (instrs BFMMLA)>;
+def : InstRW<[V1Wr_BFMMA, V1Rd_BFMMA], (instrs BFMMLA)>;
// ASIMD multiply accumulate long
-def : InstRW<[V1Write_4c_1V], (instregex "^BFMLAL[BT](Idx)?$")>;
+def : InstRW<[V1Wr_BFMLA, V1Rd_BFMLA], (instregex "^BFMLAL[BT](Idx)?$")>;
// Scalar convert, F32 to BF16
def : InstRW<[V1Write_3c_1V02], (instrs BFCVT)>;
@@ -1300,7 +1393,7 @@ def : InstRW<[V1Write_2c_1V0], (instrs BCAX, EOR3, RAX1, XAR)>;
// -----------------------------------------------------------------------------
// CRC checksum ops
-def : InstRW<[V1Write_2c_1M0], (instregex "^CRC32C?[BHWX]rr$")>;
+def : InstRW<[V1Wr_CRC, V1Rd_CRC], (instregex "^CRC32C?[BHWX]rr$")>;
// SVE Predicate instructions
@@ -1440,13 +1533,13 @@ def : InstRW<[V1Write_20c7_1V0], (instregex "^[SU]DIVR?_ZPmZ_D",
"^[SU]DIV_ZPZZ_D")>;
// Dot product, 8 bit
-def : InstRW<[V1Write_3c_1V01], (instregex "^[SU]DOT_ZZZI?_S$")>;
+def : InstRW<[V1Wr_ZDOTB, V1Rd_ZDOTB], (instregex "^[SU]DOT_ZZZI?_S$")>;
// Dot product, 8 bit, using signed and unsigned integers
-def : InstRW<[V1Write_3c_1V], (instrs SUDOT_ZZZI, USDOT_ZZZ, USDOT_ZZZI)>;
+def : InstRW<[V1Wr_ZUDOTB, V1Rd_ZUDOTB], (instrs SUDOT_ZZZI, USDOT_ZZZ, USDOT_ZZZI)>;
// Dot product, 16 bit
-def : InstRW<[V1Write_4c_1V0], (instregex "^[SU]DOT_ZZZI?_D$")>;
+def : InstRW<[V1Wr_ZDOTH, V1Rd_ZDOTH], (instregex "^[SU]DOT_ZZZI?_D$")>;
// Duplicate, immediate and indexed form
def : InstRW<[V1Write_2c_1V01], (instregex "^DUP_ZI_[BHSD]$",
@@ -1488,7 +1581,7 @@ def : InstRW<[V1Write_2c_1V01], (instregex "^MOVPRFX_ZP[mz]Z_[BHSD]$",
"^MOVPRFX_ZZ$")>;
// Matrix multiply-accumulate
-def : InstRW<[V1Write_3c_1V01], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>;
+def : InstRW<[V1Wr_ZMMA, V1Rd_ZMMA], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>;
// Multiply, B, H, S element size
def : InstRW<[V1Write_4c_1V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_[BHS]",
@@ -1497,12 +1590,16 @@ def : InstRW<[V1Write_4c_1V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_[BHS]",
"^[SU]MULH_ZPZZ_[BHS]")>;
// Multiply, D element size
-// Multiply accumulate, D element size
def : InstRW<[V1Write_5c_2V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_D",
"^MUL_ZPZZ_D",
"^[SU]MULH_(ZPmZ|ZZZ)_D",
- "^[SU]MULH_ZPZZ_D",
- "^(MLA|MLS|MAD|MSB)_(ZPmZZ|ZPZZZ)_D")>;
+ "^[SU]MULH_ZPZZ_D")>;
+
+// Multiply accumulate, D element size
+def : InstRW<[V1Wr_ZMAD, V1Rd_ZMAD],
+ (instregex "^ML[AS]_ZPZZZ_D")>;
+def : InstRW<[V1Wr_ZMAD, ReadDefault, V1Rd_ZMAD],
+ (instregex "^(ML[AS]|MAD|MSB)_ZPmZZ_D")>;
// Multiply accumulate, B, H, S element size
// NOTE: This is not specified in the SOG.
@@ -1583,8 +1680,10 @@ def : InstRW<[V1Write_2c_1V0], (instregex "^FAC(GE|GT)_PPzZZ_[HSD]$",
def : InstRW<[V1Write_3c_1V01], (instregex "^FCADD_ZPmZ_[HSD]$")>;
// Floating point complex multiply add
-def : InstRW<[V1Write_5c_1V01], (instregex "^FCMLA_ZPmZZ_[HSD]$",
- "^FCMLA_ZZZI_[HS]$")>;
+/*def : InstRW<[V1Write_5c_1V01], (instregex "^FCMLA_ZPmZZ_[HSD]$",
+ "^FCMLA_ZZZI_[HS]$")>;*/
+def : InstRW<[V1Wr_ZFCMA, ReadDefault, V1Rd_ZFCMA], (instregex "^FCMLA_ZPmZZ_[HSD]")>;
+def : InstRW<[V1Wr_ZFCMA, V1Rd_ZFCMA], (instregex "^FCMLA_ZZZI_[HS]")>;
// Floating point convert, long or narrow (F16 to F32 or F32 to F16)
// Floating point convert to integer, F32
@@ -1623,11 +1722,15 @@ def : InstRW<[V1Write_3c_1V01], (instregex "^(FSCALE|FMULX)_ZPmZ_[HSD]",
"^FMUL_ZPZ[IZ]_[HSD]")>;
// Floating point multiply accumulate
+def : InstRW<[V1Wr_ZFMA, ReadDefault, V1Rd_ZFMA],
+ (instregex "^FN?ML[AS]_ZPmZZ_[HSD]",
+ "^FN?(MAD|MSB)_ZPmZZ_[HSD]")>;
+def : InstRW<[V1Wr_ZFMA, V1Rd_ZFMA],
+ (instregex "^FML[AS]_ZZZI_[HSD]",
+ "^FN?ML[AS]_ZPZZZ_[HSD]")>;
+
// Floating point reciprocal step
-def : InstRW<[V1Write_4c_1V01], (instregex "^F(N?M(AD|SB)|N?ML[AS])_ZPmZZ_[HSD]$",
- "^FN?ML[AS]_ZPZZZ_[HSD]",
- "^FML[AS]_ZZZI_[HSD]$",
- "^F(RECPS|RSQRTS)_ZZZ_[HSD]$")>;
+def : InstRW<[V1Write_4c_1V01], (instregex "^F(RECPS|RSQRTS)_ZZZ_[HSD]")>;
// Floating point reciprocal estimate, F16
def : InstRW<[V1Write_6c_4V0], (instrs FRECPE_ZZ_H, FRSQRTE_ZZ_H)>;
@@ -1681,13 +1784,13 @@ def : InstRW<[V1Write_3c_1V01], (instregex "^FEXPA_ZZ_[HSD]$",
def : InstRW<[V1Write_4c_1V0], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>;
// Dot product
-def : InstRW<[V1Write_4c_1V01], (instrs BFDOT_ZZI, BFDOT_ZZZ)>;
+def : InstRW<[V1Wr_ZBFDOT, V1Rd_ZBFDOT], (instrs BFDOT_ZZI, BFDOT_ZZZ)>;
// Matrix multiply accumulate
-def : InstRW<[V1Write_5c_1V01], (instrs BFMMLA_ZZZ)>;
+def : InstRW<[V1Wr_ZBFMMA, V1Rd_ZBFMMA], (instrs BFMMLA_ZZZ)>;
// Multiply accumulate long
-def : InstRW<[V1Write_5c_1V01], (instregex "^BFMLAL[BT]_ZZZ(I)?$")>;
+def : InstRW<[V1Wr_ZBFMAL, V1Rd_ZBFMAL], (instregex "^BFMLAL[BT]_ZZZ(I)?$")>;
// SVE Load instructions
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-forwarding.s
new file mode 100644
index 00000000000000..4de37f96000520
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-forwarding.s
@@ -0,0 +1,1421 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-v1 -mattr=+sve --instruction-info=0 --resource-pressure=0 --timeline --timeline-max-iterations=2 < %s | FileCheck %s
+
+# LLVM-MCA-BEGIN madd
+mul x0, x0, x0
+madd x0, x1, x2, x0
+madd x0, x1, x2, x0
+madd x0, x0, x0, x0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN smaddl
+mul x0, x0, x0
+smaddl x0, w1, w2, x0
+smaddl x0, w1, w2, x0
+smaddl x0, w0, w0, x0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN fmadd
+fadd d0, d0, d0
+fmadd d0, d1, d2, d0
+fmul d0, d0, d0
+fmadd d0, d1, d2, d0
+fmadd d0, d1, d2, d0
+fmadd d0, d0, d1, d2
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN saba
+mul v0.4s, v0.4s, v0.4s
+saba v0.4s, v1.4s, v2.4s
+saba v0.4s, v1.4s, v2.4s
+saba v0.4s, v0.4s, v1.4s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN sadalp
+mul v0.4s, v0.4s, v0.4s
+sadalp v0.2d, v1.4s
+sadalp v0.2d, v1.4s
+sadalp v0.2d, v0.4s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN sdot
+mul v0.4s, v0.4s, v0.4s
+sdot v0.4s, v1.16b, v2.16b
+sdot v0.4s, v1.16b, v2.16b
+sdot v0.4s, v0.16b, v1.16b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN smmla
+mul v0.4s, v0.4s, v0.4s
+smmla v0.4s, v1.16b, v2.16b
+smmla v0.4s, v1.16b, v2.16b
+smmla v0.4s, v0.16b, v1.16b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN mla
+mul v0.4s, v0.4s, v0.4s
+mla v0.4s, v1.4s, v2.4s
+mla v0.4s, v1.4s, v2.4s
+mla v0.4s, v0.4s, v1.4s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN smlal2
+mul v0.4s, v0.4s, v0.4s
+smlal2 v0.4s, v1.8h, v2.8h
+smlal2 v0.4s, v1.8h, v2.8h
+smlal2 v0.4s, v0.8h, v1.8h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN ssra
+mul v0.4s, v0.4s, v0.4s
+ssra v0.2d, v1.2d, #1
+ssra v0.2d, v1.2d, #1
+ssra v0.2d, v0.2d, #1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN fcmla
+fmul v0.4s, v0.4s, v0.4s
+fcmla v0.2d, v1.2d, v2.2d, #90
+fcmla v0.2d, v1.2d, v2.2d, #90
+fcmla v0.2d, v0.2d, v1.2d, #90
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN fmla
+fmul v0.2d, v0.2d, v0.2d
+fmla v0.2d, v1.2d, v2.2d
+fadd v0.2d, v0.2d, v0.2d
+fmla v0.2d, v1.2d, v2.2d
+fmla v0.2d, v1.2d, v2.2d
+fmla v0.2d, v0.2d, v1.2d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN fmlal
+fmul v0.2d, v0.2d, v0.2d
+fmlal v0.4s, v1.4h, v2.4h
+fadd v0.2d, v0.2d, v0.2d
+fmlal v0.4s, v1.4h, v2.4h
+fmlal v0.4s, v1.4h, v2.4h
+fmlal v0.4s, v0.4h, v1.4h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN bfdot
+fmul v0.2d, v0.2d, v0.2d
+bfdot v0.4s, v1.8h, v2.8h
+bfdot v0.4s, v1.8h, v2.8h
+bfdot v0.4s, v0.8h, v1.8h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN bfmmla
+fmul v0.2d, v0.2d, v0.2d
+bfmmla v0.4s, v1.8h, v2.8h
+bfmmla v0.4s, v1.8h, v2.8h
+bfmmla v0.4s, v0.8h, v1.8h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN bfmlalb
+fmul v0.2d, v0.2d, v0.2d
+bfmlalb v0.4s, v1.8h, v2.8h
+bfmlalb v0.4s, v1.8h, v2.8h
+bfmlalb v0.4s, v0.8h, v1.8h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN crc32cb
+mul w0, w0, w0
+crc32cb w0, w0, w1
+crc32cb w0, w0, w1
+crc32cb w0, w0, w0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z sdot.s
+mul z0.d, p0/m, z0.d, z0.d
+sdot z0.s, z1.b, z2.b
+sdot z0.s, z1.b, z2.b
+sdot z0.s, z0.b, z1.b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z sudot
+mul z0.d, p0/m, z0.d, z0.d
+sdot z0.s, z1.b, z2.b[1]
+sdot z0.s, z1.b, z2.b[1]
+sdot z0.s, z0.b, z1.b[1]
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z sdot.d
+mul z0.d, p0/m, z0.d, z0.d
+sdot z0.d, z1.h, z2.h
+sdot z0.d, z1.h, z2.h
+sdot z0.d, z0.h, z1.h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z smmla
+mul z0.d, p0/m, z0.d, z0.d
+smmla z0.s, z1.b, z2.b
+smmla z0.s, z1.b, z2.b
+smmla z0.s, z0.b, z1.b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z mla.d
+mul z0.d, p0/m, z0.d, z0.d
+mla z0.d, p0/m, z1.d, z2.d
+mla z0.d, p0/m, z1.d, z2.d
+mla z0.d, p0/m, z0.d, z1.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z mad.d
+mul z0.d, p0/m, z0.d, z0.d
+mad z0.d, p0/m, z1.d, z2.d
+mad z0.d, p0/m, z1.d, z2.d
+mad z0.d, p0/m, z0.d, z1.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z msb.d
+mul z0.d, p0/m, z0.d, z0.d
+msb z0.d, p0/m, z1.d, z2.d
+msb z0.d, p0/m, z1.d, z2.d
+msb z0.d, p0/m, z0.d, z1.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z fcmla ZPmZZ
+fmul z0.d, z0.d, z0.d
+fcmla z0.d, p0/m, z1.d, z2.d, 90
+fcmla z0.d, p0/m, z1.d, z2.d, 90
+fcmla z0.d, p0/m, z0.d, z1.d, 90
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z fcmla ZZZI
+fmul z0.d, z0.d, z0.d
+fcmla z0.s, z1.s, z2.s[1], 90
+fcmla z0.s, z1.s, z2.s[1], 90
+fcmla z0.s, z0.s, z1.s[1], 90
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z fmla ZPmZZ
+fmul z0.d, z0.d, z0.d
+fmla z0.d, p0/m, z1.d, z2....
[truncated]
def V1WriteIM : SchedWriteVariant<
    [SchedVar<NeoverseMULIdiomPred, [V1Write_2c_1M]>,
     SchedVar<NoSchedPred, [V1Write_2c_1M0]>]>;
def V1Rd_MA : SchedReadAdvance<1, [V1Write_2c_1M0]>;
If this will forward from any V1Write_2c_1M0, could that include some instructions that it should not?
And should a mul forward, considering they are the same instruction?
> If this will forward from any V1Write_2c_1M0

I've defined another resource so it will no longer forward from any V1Write_2c_1M0.

> And should a mul forward

I don't think a mul should forward.
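A rough sketch of that "separate resource" shape, assuming the intent is for the read-advance to name only a write class used by the integer multiply-accumulate forms rather than the shared V1Write_2c_1M0 (the *Ex names below are hypothetical, not the final definitions in the patch):

// Hypothetical dedicated write for the non-idiom MADD/MSUB case.
def V1Wr_IMAEx : SchedWriteRes<[V1UnitM0]> { let Latency = 2; }
def V1WriteIMEx : SchedWriteVariant<
  [SchedVar<NeoverseMULIdiomPred, [V1Write_2c_1M]>,
   SchedVar<NoSchedPred,          [V1Wr_IMAEx]>]>;
// Forwarding is advertised only from the dedicated write, so other users
// of the shared V1Write_2c_1M0 are not treated as forwarding producers.
def V1Rd_MAEx : SchedReadAdvance<1, [V1Wr_IMAEx]>;
def : InstRW<[V1WriteIMEx, ReadIM, ReadIM, V1Rd_MAEx],
             (instregex "^M(ADD|SUB)[WX]rrr$")>;

The V1-forwarding.s llvm-mca timeline test added by this patch runs the madd/smaddl sequences where a change like this becomes visible.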
Thanks. LGTM
…#111538) Introduce a description of late forwarding to the Neoverse-V1 Scheduling model.