llvm · RKSimon · May 16, 2024 · May 7, 2024
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver3.td b/llvm/lib/Target/X86/X86ScheduleZnver3.td
@@ -33,13 +33,10 @@ def Znver3Model : SchedMachineModel {
   // The op cache is organized as an associative cache with 64 sets and 8 ways.
   // At each set-way intersection is an entry containing up to 8 macro ops.
   // The maximum capacity of the op cache is 4K ops.
-  // Agner, 22.5 µop cache
-  // The size of the µop cache is big enough for holding most critical loops.
-  // FIXME: PR50584: MachineScheduler/PostRAScheduler have quadradic complexity,
-  //        with large values here the compilation of certain loops
-  //        ends up taking way too long.
-  // let LoopMicroOpBufferSize = 4096;
-  let LoopMicroOpBufferSize = 512;
+  // Assuming a maximum dispatch of 8 ops/cy and a mispredict cost of 12cy from
+  // the op-cache, we limit the loop buffer to 8*12 = 96 to avoid loop unrolling
+  // leading to excessive filling of the op-cache from frontend.
+  let LoopMicroOpBufferSize = 96;
   // AMD SOG 19h, 2.6.2 L1 Data Cache
   // The L1 data cache has a 4- or 5- cycle integer load-to-use latency.
   // AMD SOG 19h, 2.12 L1 Data Cache

diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td
@@ -28,17 +28,11 @@ def Znver4Model : SchedMachineModel {
   // AMD SOG 19h, 2.9.1 Op Cache
   // The op cache is organized as an associative cache with 64 sets and 8 ways.
   // At each set-way intersection is an entry containing up to 8 macro ops.
-  // The maximum capacity of the op cache is 4K ops.
-  // Agner, 22.5 µop cache
-  // The size of the µop cache is big enough for holding most critical loops.
-  // FIXME: PR50584: MachineScheduler/PostRAScheduler have quadradic complexity,
-  //        with large values here the compilation of certain loops
-  //        ends up taking way too long.
-  // Ideally for znver4, we should have 6.75K. However we don't add that
-  // considerting the impact compile time and prefer using default values 
-  // instead.
-  // Retaining minimal value to influence unrolling as we did for znver3.
-  let LoopMicroOpBufferSize = 512;
+  // The maximum capacity of the op cache is 6.75K ops.
+  // Assuming a maximum dispatch of 9 ops/cy and a mispredict cost of 12cy from
+  // the op-cache, we limit the loop buffer to 9*12 = 108 to avoid loop
+  // unrolling leading to excessive filling of the op-cache from frontend.
+  let LoopMicroOpBufferSize = 108;
   // AMD SOG 19h, 2.6.2 L1 Data Cache
   // The L1 data cache has a 4- or 5- cycle integer load-to-use latency.
   // AMD SOG 19h, 2.12 L1 Data Cache