-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path0001-MultiQueue-Skiplist-Scheduler-version-v0.180_reworked.patch
10413 lines (10334 loc) · 296 KB
/
0001-MultiQueue-Skiplist-Scheduler-version-v0.180_reworked.patch
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
From dc988b5f353c28a846da592e31b504c32d95ed5e Mon Sep 17 00:00:00 2001
From: Con Kolivas <kernel@kolivas.org>
Date: Tue, 13 Nov 2018 17:18:04 +1100
Subject: [PATCH 01/16] MultiQueue Skiplist Scheduler version v0.180.
---
.../admin-guide/kernel-parameters.txt | 8 +
Documentation/scheduler/sched-BFS.txt | 351 +
Documentation/scheduler/sched-MuQSS.txt | 373 +
Documentation/sysctl/kernel.txt | 37 +
arch/powerpc/platforms/cell/spufs/sched.c | 5 -
arch/x86/Kconfig | 92 +-
fs/proc/base.c | 2 +-
include/linux/init_task.h | 4 +
include/linux/ioprio.h | 2 +
include/linux/sched.h | 60 +-
include/linux/sched/nohz.h | 4 +-
include/linux/sched/prio.h | 12 +
include/linux/sched/rt.h | 2 +
include/linux/sched/task.h | 2 +-
include/linux/skip_list.h | 33 +
include/uapi/linux/sched.h | 9 +-
init/Kconfig | 23 +-
init/init_task.c | 10 +
init/main.c | 2 +
kernel/Makefile | 2 +-
kernel/delayacct.c | 2 +-
kernel/exit.c | 4 +-
kernel/kthread.c | 30 +-
kernel/livepatch/transition.c | 8 +-
kernel/rcu/Kconfig | 2 +-
kernel/sched/Makefile | 12 +
kernel/sched/MuQSS.c | 7366 +++++++++++++++++
kernel/sched/MuQSS.h | 881 ++
kernel/sched/cpufreq_schedutil.c | 12 +-
kernel/sched/cpupri.h | 2 +
kernel/sched/cputime.c | 22 +-
kernel/sched/idle.c | 12 +-
kernel/sched/sched.h | 40 +
kernel/sched/topology.c | 8 +
kernel/skip_list.c | 148 +
kernel/sysctl.c | 52 +-
kernel/time/clockevents.c | 5 +
kernel/time/posix-cpu-timers.c | 8 +-
kernel/time/timer.c | 7 +-
kernel/trace/trace_selftest.c | 5 +
40 files changed, 9605 insertions(+), 54 deletions(-)
create mode 100644 Documentation/scheduler/sched-BFS.txt
create mode 100644 Documentation/scheduler/sched-MuQSS.txt
create mode 100644 include/linux/skip_list.h
create mode 100644 kernel/sched/MuQSS.c
create mode 100644 kernel/sched/MuQSS.h
create mode 100644 kernel/skip_list.c
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 92eb1f42240d..627365ae5499 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4001,6 +4001,14 @@
Memory area to be used by remote processor image,
managed by CMA.
+ rqshare= [X86] Select the MuQSS scheduler runqueue sharing type.
+ Format: <string>
+ smt -- Share SMT (hyperthread) sibling runqueues
+ mc -- Share MC (multicore) sibling runqueues
+ smp -- Share SMP runqueues
+ none -- So not share any runqueues
+ Default value is mc
+
rw [KNL] Mount root device read-write on boot
S [KNL] Run init in single mode
diff --git a/Documentation/scheduler/sched-BFS.txt b/Documentation/scheduler/sched-BFS.txt
new file mode 100644
index 000000000000..c0282002a079
--- /dev/null
+++ b/Documentation/scheduler/sched-BFS.txt
@@ -0,0 +1,351 @@
+BFS - The Brain Fuck Scheduler by Con Kolivas.
+
+Goals.
+
+The goal of the Brain Fuck Scheduler, referred to as BFS from here on, is to
+completely do away with the complex designs of the past for the cpu process
+scheduler and instead implement one that is very simple in basic design.
+The main focus of BFS is to achieve excellent desktop interactivity and
+responsiveness without heuristics and tuning knobs that are difficult to
+understand, impossible to model and predict the effect of, and when tuned to
+one workload cause massive detriment to another.
+
+
+Design summary.
+
+BFS is best described as a single runqueue, O(n) lookup, earliest effective
+virtual deadline first design, loosely based on EEVDF (earliest eligible virtual
+deadline first) and my previous Staircase Deadline scheduler. Each component
+shall be described in order to understand the significance of, and reasoning for
+it. The codebase when the first stable version was released was approximately
+9000 lines less code than the existing mainline linux kernel scheduler (in
+2.6.31). This does not even take into account the removal of documentation and
+the cgroups code that is not used.
+
+Design reasoning.
+
+The single runqueue refers to the queued but not running processes for the
+entire system, regardless of the number of CPUs. The reason for going back to
+a single runqueue design is that once multiple runqueues are introduced,
+per-CPU or otherwise, there will be complex interactions as each runqueue will
+be responsible for the scheduling latency and fairness of the tasks only on its
+own runqueue, and to achieve fairness and low latency across multiple CPUs, any
+advantage in throughput of having CPU local tasks causes other disadvantages.
+This is due to requiring a very complex balancing system to at best achieve some
+semblance of fairness across CPUs and can only maintain relatively low latency
+for tasks bound to the same CPUs, not across them. To increase said fairness
+and latency across CPUs, the advantage of local runqueue locking, which makes
+for better scalability, is lost due to having to grab multiple locks.
+
+A significant feature of BFS is that all accounting is done purely based on CPU
+used and nowhere is sleep time used in any way to determine entitlement or
+interactivity. Interactivity "estimators" that use some kind of sleep/run
+algorithm are doomed to fail to detect all interactive tasks, and to falsely tag
+tasks that aren't interactive as being so. The reason for this is that it is
+close to impossible to determine that when a task is sleeping, whether it is
+doing it voluntarily, as in a userspace application waiting for input in the
+form of a mouse click or otherwise, or involuntarily, because it is waiting for
+another thread, process, I/O, kernel activity or whatever. Thus, such an
+estimator will introduce corner cases, and more heuristics will be required to
+cope with those corner cases, introducing more corner cases and failed
+interactivity detection and so on. Interactivity in BFS is built into the design
+by virtue of the fact that tasks that are waking up have not used up their quota
+of CPU time, and have earlier effective deadlines, thereby making it very likely
+they will preempt any CPU bound task of equivalent nice level. See below for
+more information on the virtual deadline mechanism. Even if they do not preempt
+a running task, because the rr interval is guaranteed to have a bound upper
+limit on how long a task will wait for, it will be scheduled within a timeframe
+that will not cause visible interface jitter.
+
+
+Design details.
+
+Task insertion.
+
+BFS inserts tasks into each relevant queue as an O(1) insertion into a double
+linked list. On insertion, *every* running queue is checked to see if the newly
+queued task can run on any idle queue, or preempt the lowest running task on the
+system. This is how the cross-CPU scheduling of BFS achieves significantly lower
+latency per extra CPU the system has. In this case the lookup is, in the worst
+case scenario, O(n) where n is the number of CPUs on the system.
+
+Data protection.
+
+BFS has one single lock protecting the process local data of every task in the
+global queue. Thus every insertion, removal and modification of task data in the
+global runqueue needs to grab the global lock. However, once a task is taken by
+a CPU, the CPU has its own local data copy of the running process' accounting
+information which only that CPU accesses and modifies (such as during a
+timer tick) thus allowing the accounting data to be updated lockless. Once a
+CPU has taken a task to run, it removes it from the global queue. Thus the
+global queue only ever has, at most,
+
+ (number of tasks requesting cpu time) - (number of logical CPUs) + 1
+
+tasks in the global queue. This value is relevant for the time taken to look up
+tasks during scheduling. This will increase if many tasks with CPU affinity set
+in their policy to limit which CPUs they're allowed to run on if they outnumber
+the number of CPUs. The +1 is because when rescheduling a task, the CPU's
+currently running task is put back on the queue. Lookup will be described after
+the virtual deadline mechanism is explained.
+
+Virtual deadline.
+
+The key to achieving low latency, scheduling fairness, and "nice level"
+distribution in BFS is entirely in the virtual deadline mechanism. The one
+tunable in BFS is the rr_interval, or "round robin interval". This is the
+maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy)
+tasks of the same nice level will be running for, or looking at it the other
+way around, the longest duration two tasks of the same nice level will be
+delayed for. When a task requests cpu time, it is given a quota (time_slice)
+equal to the rr_interval and a virtual deadline. The virtual deadline is
+offset from the current time in jiffies by this equation:
+
+ jiffies + (prio_ratio * rr_interval)
+
+The prio_ratio is determined as a ratio compared to the baseline of nice -20
+and increases by 10% per nice level. The deadline is a virtual one only in that
+no guarantee is placed that a task will actually be scheduled by this time, but
+it is used to compare which task should go next. There are three components to
+how a task is next chosen. First is time_slice expiration. If a task runs out
+of its time_slice, it is descheduled, the time_slice is refilled, and the
+deadline reset to that formula above. Second is sleep, where a task no longer
+is requesting CPU for whatever reason. The time_slice and deadline are _not_
+adjusted in this case and are just carried over for when the task is next
+scheduled. Third is preemption, and that is when a newly waking task is deemed
+higher priority than a currently running task on any cpu by virtue of the fact
+that it has an earlier virtual deadline than the currently running task. The
+earlier deadline is the key to which task is next chosen for the first and
+second cases. Once a task is descheduled, it is put back on the queue, and an
+O(n) lookup of all queued-but-not-running tasks is done to determine which has
+the earliest deadline and that task is chosen to receive CPU next.
+
+The CPU proportion of different nice tasks works out to be approximately the
+
+ (prio_ratio difference)^2
+
+The reason it is squared is that a task's deadline does not change while it is
+running unless it runs out of time_slice. Thus, even if the time actually
+passes the deadline of another task that is queued, it will not get CPU time
+unless the current running task deschedules, and the time "base" (jiffies) is
+constantly moving.
+
+Task lookup.
+
+BFS has 103 priority queues. 100 of these are dedicated to the static priority
+of realtime tasks, and the remaining 3 are, in order of best to worst priority,
+SCHED_ISO (isochronous), SCHED_NORMAL, and SCHED_IDLEPRIO (idle priority
+scheduling). When a task of these priorities is queued, a bitmap of running
+priorities is set showing which of these priorities has tasks waiting for CPU
+time. When a CPU is made to reschedule, the lookup for the next task to get
+CPU time is performed in the following way:
+
+First the bitmap is checked to see what static priority tasks are queued. If
+any realtime priorities are found, the corresponding queue is checked and the
+first task listed there is taken (provided CPU affinity is suitable) and lookup
+is complete. If the priority corresponds to a SCHED_ISO task, they are also
+taken in FIFO order (as they behave like SCHED_RR). If the priority corresponds
+to either SCHED_NORMAL or SCHED_IDLEPRIO, then the lookup becomes O(n). At this
+stage, every task in the runlist that corresponds to that priority is checked
+to see which has the earliest set deadline, and (provided it has suitable CPU
+affinity) it is taken off the runqueue and given the CPU. If a task has an
+expired deadline, it is taken and the rest of the lookup aborted (as they are
+chosen in FIFO order).
+
+Thus, the lookup is O(n) in the worst case only, where n is as described
+earlier, as tasks may be chosen before the whole task list is looked over.
+
+
+Scalability.
+
+The major limitations of BFS will be that of scalability, as the separate
+runqueue designs will have less lock contention as the number of CPUs rises.
+However they do not scale linearly even with separate runqueues as multiple
+runqueues will need to be locked concurrently on such designs to be able to
+achieve fair CPU balancing, to try and achieve some sort of nice-level fairness
+across CPUs, and to achieve low enough latency for tasks on a busy CPU when
+other CPUs would be more suited. BFS has the advantage that it requires no
+balancing algorithm whatsoever, as balancing occurs by proxy simply because
+all CPUs draw off the global runqueue, in priority and deadline order. Despite
+the fact that scalability is _not_ the prime concern of BFS, it both shows very
+good scalability to smaller numbers of CPUs and is likely a more scalable design
+at these numbers of CPUs.
+
+It also has some very low overhead scalability features built into the design
+when it has been deemed their overhead is so marginal that they're worth adding.
+The first is the local copy of the running process' data to the CPU it's running
+on to allow that data to be updated lockless where possible. Then there is
+deference paid to the last CPU a task was running on, by trying that CPU first
+when looking for an idle CPU to use the next time it's scheduled. Finally there
+is the notion of cache locality beyond the last running CPU. The sched_domains
+information is used to determine the relative virtual "cache distance" that
+other CPUs have from the last CPU a task was running on. CPUs with shared
+caches, such as SMT siblings, or multicore CPUs with shared caches, are treated
+as cache local. CPUs without shared caches are treated as not cache local, and
+CPUs on different NUMA nodes are treated as very distant. This "relative cache
+distance" is used by modifying the virtual deadline value when doing lookups.
+Effectively, the deadline is unaltered between "cache local" CPUs, doubled for
+"cache distant" CPUs, and quadrupled for "very distant" CPUs. The reasoning
+behind the doubling of deadlines is as follows. The real cost of migrating a
+task from one CPU to another is entirely dependant on the cache footprint of
+the task, how cache intensive the task is, how long it's been running on that
+CPU to take up the bulk of its cache, how big the CPU cache is, how fast and
+how layered the CPU cache is, how fast a context switch is... and so on. In
+other words, it's close to random in the real world where we do more than just
+one sole workload. The only thing we can be sure of is that it's not free. So
+BFS uses the principle that an idle CPU is a wasted CPU and utilising idle CPUs
+is more important than cache locality, and cache locality only plays a part
+after that. Doubling the effective deadline is based on the premise that the
+"cache local" CPUs will tend to work on the same tasks up to double the number
+of cache local CPUs, and once the workload is beyond that amount, it is likely
+that none of the tasks are cache warm anywhere anyway. The quadrupling for NUMA
+is a value I pulled out of my arse.
+
+When choosing an idle CPU for a waking task, the cache locality is determined
+according to where the task last ran and then idle CPUs are ranked from best
+to worst to choose the most suitable idle CPU based on cache locality, NUMA
+node locality and hyperthread sibling business. They are chosen in the
+following preference (if idle):
+
+* Same core, idle or busy cache, idle threads
+* Other core, same cache, idle or busy cache, idle threads.
+* Same node, other CPU, idle cache, idle threads.
+* Same node, other CPU, busy cache, idle threads.
+* Same core, busy threads.
+* Other core, same cache, busy threads.
+* Same node, other CPU, busy threads.
+* Other node, other CPU, idle cache, idle threads.
+* Other node, other CPU, busy cache, idle threads.
+* Other node, other CPU, busy threads.
+
+This shows the SMT or "hyperthread" awareness in the design as well which will
+choose a real idle core first before a logical SMT sibling which already has
+tasks on the physical CPU.
+
+Early benchmarking of BFS suggested scalability dropped off at the 16 CPU mark.
+However this benchmarking was performed on an earlier design that was far less
+scalable than the current one so it's hard to know how scalable it is in terms
+of both CPUs (due to the global runqueue) and heavily loaded machines (due to
+O(n) lookup) at this stage. Note that in terms of scalability, the number of
+_logical_ CPUs matters, not the number of _physical_ CPUs. Thus, a dual (2x)
+quad core (4X) hyperthreaded (2X) machine is effectively a 16X. Newer benchmark
+results are very promising indeed, without needing to tweak any knobs, features
+or options. Benchmark contributions are most welcome.
+
+
+Features
+
+As the initial prime target audience for BFS was the average desktop user, it
+was designed to not need tweaking, tuning or have features set to obtain benefit
+from it. Thus the number of knobs and features has been kept to an absolute
+minimum and should not require extra user input for the vast majority of cases.
+There are precisely 2 tunables, and 2 extra scheduling policies. The rr_interval
+and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO policies. In addition
+to this, BFS also uses sub-tick accounting. What BFS does _not_ now feature is
+support for CGROUPS. The average user should neither need to know what these
+are, nor should they need to be using them to have good desktop behaviour.
+
+rr_interval
+
+There is only one "scheduler" tunable, the round robin interval. This can be
+accessed in
+
+ /proc/sys/kernel/rr_interval
+
+The value is in milliseconds, and the default value is set to 6 on a
+uniprocessor machine, and automatically set to a progressively higher value on
+multiprocessor machines. The reasoning behind increasing the value on more CPUs
+is that the effective latency is decreased by virtue of there being more CPUs on
+BFS (for reasons explained above), and increasing the value allows for less
+cache contention and more throughput. Valid values are from 1 to 1000
+Decreasing the value will decrease latencies at the cost of decreasing
+throughput, while increasing it will improve throughput, but at the cost of
+worsening latencies. The accuracy of the rr interval is limited by HZ resolution
+of the kernel configuration. Thus, the worst case latencies are usually slightly
+higher than this actual value. The default value of 6 is not an arbitrary one.
+It is based on the fact that humans can detect jitter at approximately 7ms, so
+aiming for much lower latencies is pointless under most circumstances. It is
+worth noting this fact when comparing the latency performance of BFS to other
+schedulers. Worst case latencies being higher than 7ms are far worse than
+average latencies not being in the microsecond range.
+
+Isochronous scheduling.
+
+Isochronous scheduling is a unique scheduling policy designed to provide
+near-real-time performance to unprivileged (ie non-root) users without the
+ability to starve the machine indefinitely. Isochronous tasks (which means
+"same time") are set using, for example, the schedtool application like so:
+
+ schedtool -I -e amarok
+
+This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works
+is that it has a priority level between true realtime tasks and SCHED_NORMAL
+which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie,
+if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval
+rate). However if ISO tasks run for more than a tunable finite amount of time,
+they are then demoted back to SCHED_NORMAL scheduling. This finite amount of
+time is the percentage of _total CPU_ available across the machine, configurable
+as a percentage in the following "resource handling" tunable (as opposed to a
+scheduler tunable):
+
+ /proc/sys/kernel/iso_cpu
+
+and is set to 70% by default. It is calculated over a rolling 5 second average
+Because it is the total CPU available, it means that on a multi CPU machine, it
+is possible to have an ISO task running as realtime scheduling indefinitely on
+just one CPU, as the other CPUs will be available. Setting this to 100 is the
+equivalent of giving all users SCHED_RR access and setting it to 0 removes the
+ability to run any pseudo-realtime tasks.
+
+A feature of BFS is that it detects when an application tries to obtain a
+realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the
+appropriate privileges to use those policies. When it detects this, it will
+give the task SCHED_ISO policy instead. Thus it is transparent to the user.
+Because some applications constantly set their policy as well as their nice
+level, there is potential for them to undo the override specified by the user
+on the command line of setting the policy to SCHED_ISO. To counter this, once
+a task has been set to SCHED_ISO policy, it needs superuser privileges to set
+it back to SCHED_NORMAL. This will ensure the task remains ISO and all child
+processes and threads will also inherit the ISO policy.
+
+Idleprio scheduling.
+
+Idleprio scheduling is a scheduling policy designed to give out CPU to a task
+_only_ when the CPU would be otherwise idle. The idea behind this is to allow
+ultra low priority tasks to be run in the background that have virtually no
+effect on the foreground tasks. This is ideally suited to distributed computing
+clients (like setiathome, folding, mprime etc) but can also be used to start
+a video encode or so on without any slowdown of other tasks. To avoid this
+policy from grabbing shared resources and holding them indefinitely, if it
+detects a state where the task is waiting on I/O, the machine is about to
+suspend to ram and so on, it will transiently schedule them as SCHED_NORMAL. As
+per the Isochronous task management, once a task has been scheduled as IDLEPRIO,
+it cannot be put back to SCHED_NORMAL without superuser privileges. Tasks can
+be set to start as SCHED_IDLEPRIO with the schedtool command like so:
+
+ schedtool -D -e ./mprime
+
+Subtick accounting.
+
+It is surprisingly difficult to get accurate CPU accounting, and in many cases,
+the accounting is done by simply determining what is happening at the precise
+moment a timer tick fires off. This becomes increasingly inaccurate as the
+timer tick frequency (HZ) is lowered. It is possible to create an application
+which uses almost 100% CPU, yet by being descheduled at the right time, records
+zero CPU usage. While the main problem with this is that there are possible
+security implications, it is also difficult to determine how much CPU a task
+really does use. BFS tries to use the sub-tick accounting from the TSC clock,
+where possible, to determine real CPU usage. This is not entirely reliable, but
+is far more likely to produce accurate CPU usage data than the existing designs
+and will not show tasks as consuming no CPU usage when they actually are. Thus,
+the amount of CPU reported as being used by BFS will more accurately represent
+how much CPU the task itself is using (as is shown for example by the 'time'
+application), so the reported values may be quite different to other schedulers.
+Values reported as the 'load' are more prone to problems with this design, but
+per process values are closer to real usage. When comparing throughput of BFS
+to other designs, it is important to compare the actual completed work in terms
+of total wall clock time taken and total work done, rather than the reported
+"cpu usage".
+
+
+Con Kolivas <kernel@kolivas.org> Fri Aug 27 2010
diff --git a/Documentation/scheduler/sched-MuQSS.txt b/Documentation/scheduler/sched-MuQSS.txt
new file mode 100644
index 000000000000..ae28b85c9995
--- /dev/null
+++ b/Documentation/scheduler/sched-MuQSS.txt
@@ -0,0 +1,373 @@
+MuQSS - The Multiple Queue Skiplist Scheduler by Con Kolivas.
+
+MuQSS is a per-cpu runqueue variant of the original BFS scheduler with
+one 8 level skiplist per runqueue, and fine grained locking for much more
+scalability.
+
+
+Goals.
+
+The goal of the Multiple Queue Skiplist Scheduler, referred to as MuQSS from
+here on (pronounced mux) is to completely do away with the complex designs of
+the past for the cpu process scheduler and instead implement one that is very
+simple in basic design. The main focus of MuQSS is to achieve excellent desktop
+interactivity and responsiveness without heuristics and tuning knobs that are
+difficult to understand, impossible to model and predict the effect of, and when
+tuned to one workload cause massive detriment to another, while still being
+scalable to many CPUs and processes.
+
+
+Design summary.
+
+MuQSS is best described as per-cpu multiple runqueue, O(log n) insertion, O(1)
+lookup, earliest effective virtual deadline first tickless design, loosely based
+on EEVDF (earliest eligible virtual deadline first) and my previous Staircase
+Deadline scheduler, and evolved from the single runqueue O(n) BFS scheduler.
+Each component shall be described in order to understand the significance of,
+and reasoning for it.
+
+
+Design reasoning.
+
+In BFS, the use of a single runqueue across all CPUs meant that each CPU would
+need to scan the entire runqueue looking for the process with the earliest
+deadline and schedule that next, regardless of which CPU it originally came
+from. This made BFS deterministic with respect to latency and provided
+guaranteed latencies dependent on number of processes and CPUs. The single
+runqueue, however, meant that all CPUs would compete for the single lock
+protecting it, which would lead to increasing lock contention as the number of
+CPUs rose and appeared to limit scalability of common workloads beyond 16
+logical CPUs. Additionally, the O(n) lookup of the runqueue list obviously
+increased overhead proportionate to the number of queued proecesses and led to
+cache thrashing while iterating over the linked list.
+
+MuQSS is an evolution of BFS, designed to maintain the same scheduling
+decision mechanism and be virtually deterministic without relying on the
+constrained design of the single runqueue by splitting out the single runqueue
+to be per-CPU and use skiplists instead of linked lists.
+
+The original reason for going back to a single runqueue design for BFS was that
+once multiple runqueues are introduced, per-CPU or otherwise, there will be
+complex interactions as each runqueue will be responsible for the scheduling
+latency and fairness of the tasks only on its own runqueue, and to achieve
+fairness and low latency across multiple CPUs, any advantage in throughput of
+having CPU local tasks causes other disadvantages. This is due to requiring a
+very complex balancing system to at best achieve some semblance of fairness
+across CPUs and can only maintain relatively low latency for tasks bound to the
+same CPUs, not across them. To increase said fairness and latency across CPUs,
+the advantage of local runqueue locking, which makes for better scalability, is
+lost due to having to grab multiple locks.
+
+MuQSS works around the problems inherent in multiple runqueue designs by
+making its skip lists priority ordered and through novel use of lockless
+examination of each other runqueue it can decide if it should take the earliest
+deadline task from another runqueue for latency reasons, or for CPU balancing
+reasons. It still does not have a balancing system, choosing to allow the
+next task scheduling decision and task wakeup CPU choice to allow balancing to
+happen by virtue of its choices.
+
+As a further evolution of the design, MuQSS normally configures sharing of
+runqueues in a logical fashion for when CPU resources are shared for improved
+latency and throughput. By default it shares runqueues and locks between
+multicore siblings. Optionally it can be configured to run with sharing of
+SMT siblings only, all SMP packages or no sharing at all. Additionally it can
+be selected at boot time.
+
+
+Design details.
+
+Custom skip list implementation:
+
+To avoid the overhead of building up and tearing down skip list structures,
+the variant used by MuQSS has a number of optimisations making it specific for
+its use case in the scheduler. It uses static arrays of 8 'levels' instead of
+building up and tearing down structures dynamically. This makes each runqueue
+only scale O(log N) up to 64k tasks. However as there is one runqueue per CPU
+it means that it scales O(log N) up to 64k x number of logical CPUs which is
+far beyond the realistic task limits each CPU could handle. By being 8 levels
+it also makes the array exactly one cacheline in size. Additionally, each
+skip list node is bidirectional making insertion and removal amortised O(1),
+being O(k) where k is 1-8. Uniquely, we are only ever interested in the very
+first entry in each list at all times with MuQSS, so there is never a need to
+do a search and thus look up is always O(1). In interactive mode, the queues
+will be searched beyond their first entry if the first task is not suitable
+for affinity or SMT nice reasons.
+
+Task insertion:
+
+MuQSS inserts tasks into a per CPU runqueue as an O(log N) insertion into
+a custom skip list as described above (based on the original design by William
+Pugh). Insertion is ordered in such a way that there is never a need to do a
+search by ordering tasks according to static priority primarily, and then
+virtual deadline at the time of insertion.
+
+Niffies:
+
+Niffies are a monotonic forward moving timer not unlike the "jiffies" but are
+of nanosecond resolution. Niffies are calculated per-runqueue from the high
+resolution TSC timers, and in order to maintain fairness are synchronised
+between CPUs whenever both runqueues are locked concurrently.
+
+Virtual deadline:
+
+The key to achieving low latency, scheduling fairness, and "nice level"
+distribution in MuQSS is entirely in the virtual deadline mechanism. The one
+tunable in MuQSS is the rr_interval, or "round robin interval". This is the
+maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy)
+tasks of the same nice level will be running for, or looking at it the other
+way around, the longest duration two tasks of the same nice level will be
+delayed for. When a task requests cpu time, it is given a quota (time_slice)
+equal to the rr_interval and a virtual deadline. The virtual deadline is
+offset from the current time in niffies by this equation:
+
+ niffies + (prio_ratio * rr_interval)
+
+The prio_ratio is determined as a ratio compared to the baseline of nice -20
+and increases by 10% per nice level. The deadline is a virtual one only in that
+no guarantee is placed that a task will actually be scheduled by this time, but
+it is used to compare which task should go next. There are three components to
+how a task is next chosen. First is time_slice expiration. If a task runs out
+of its time_slice, it is descheduled, the time_slice is refilled, and the
+deadline reset to that formula above. Second is sleep, where a task no longer
+is requesting CPU for whatever reason. The time_slice and deadline are _not_
+adjusted in this case and are just carried over for when the task is next
+scheduled. Third is preemption, and that is when a newly waking task is deemed
+higher priority than a currently running task on any cpu by virtue of the fact
+that it has an earlier virtual deadline than the currently running task. The
+earlier deadline is the key to which task is next chosen for the first and
+second cases.
+
+The CPU proportion of different nice tasks works out to be approximately the
+
+ (prio_ratio difference)^2
+
+The reason it is squared is that a task's deadline does not change while it is
+running unless it runs out of time_slice. Thus, even if the time actually
+passes the deadline of another task that is queued, it will not get CPU time
+unless the current running task deschedules, and the time "base" (niffies) is
+constantly moving.
+
+Task lookup:
+
+As tasks are already pre-ordered according to anticipated scheduling order in
+the skip lists, lookup for the next suitable task per-runqueue is always a
+matter of simply selecting the first task in the 0th level skip list entry.
+In order to maintain optimal latency and fairness across CPUs, MuQSS does a
+novel examination of every other runqueue in cache locality order, choosing the
+best task across all runqueues. This provides near-determinism of how long any
+task across the entire system may wait before receiving CPU time. The other
+runqueues are first examine lockless and then trylocked to minimise the
+potential lock contention if they are likely to have a suitable better task.
+Each other runqueue lock is only held for as long as it takes to examine the
+entry for suitability. In "interactive" mode, the default setting, MuQSS will
+look for the best deadline task across all CPUs, while in !interactive mode,
+it will only select a better deadline task from another CPU if it is more
+heavily laden than the current one.
+
+Lookup is therefore O(k) where k is number of CPUs.
+
+
+Latency.
+
+Through the use of virtual deadlines to govern the scheduling order of normal
+tasks, queue-to-activation latency per runqueue is guaranteed to be bound by
+the rr_interval tunable which is set to 6ms by default. This means that the
+longest a CPU bound task will wait for more CPU is proportional to the number
+of running tasks and in the common case of 0-2 running tasks per CPU, will be
+under the 7ms threshold for human perception of jitter. Additionally, as newly
+woken tasks will have an early deadline from their previous runtime, the very
+tasks that are usually latency sensitive will have the shortest interval for
+activation, usually preempting any existing CPU bound tasks.
+
+Tickless expiry:
+
+A feature of MuQSS is that it is not tied to the resolution of the chosen tick
+rate in Hz, instead depending entirely on the high resolution timers where
+possible for sub-millisecond accuracy on timeouts regarless of the underlying
+tick rate. This allows MuQSS to be run with the low overhead of low Hz rates
+such as 100 by default, benefiting from the improved throughput and lower
+power usage it provides. Another advantage of this approach is that in
+combination with the Full No HZ option, which disables ticks on running task
+CPUs instead of just idle CPUs, the tick can be disabled at all times
+regardless of how many tasks are running instead of being limited to just one
+running task. Note that this option is NOT recommended for regular desktop
+users.
+
+
+Scalability and balancing.
+
+Unlike traditional approaches where balancing is a combination of CPU selection
+at task wakeup and intermittent balancing based on a vast array of rules set
+according to architecture, busyness calculations and special case management,
+MuQSS indirectly balances on the fly at task wakeup and next task selection.
+During initialisation, MuQSS creates a cache coherency ordered list of CPUs for
+each logical CPU and uses this to aid task/CPU selection when CPUs are busy.
+Additionally it selects any idle CPUs, if they are available, at any time over
+busy CPUs according to the following preference:
+
+ * Same thread, idle or busy cache, idle or busy threads
+ * Other core, same cache, idle or busy cache, idle threads.
+ * Same node, other CPU, idle cache, idle threads.
+ * Same node, other CPU, busy cache, idle threads.
+ * Other core, same cache, busy threads.
+ * Same node, other CPU, busy threads.
+ * Other node, other CPU, idle cache, idle threads.
+ * Other node, other CPU, busy cache, idle threads.
+ * Other node, other CPU, busy threads.
+
+Mux is therefore SMT, MC and Numa aware without the need for extra
+intermittent balancing to maintain CPUs busy and make the most of cache
+coherency.
+
+
+Features
+
+As the initial prime target audience for MuQSS was the average desktop user, it
+was designed to not need tweaking, tuning or have features set to obtain benefit
+from it. Thus the number of knobs and features has been kept to an absolute
+minimum and should not require extra user input for the vast majority of cases.
+There are 3 optional tunables, and 2 extra scheduling policies. The rr_interval,
+interactive, and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO
+policies. In addition to this, MuQSS also uses sub-tick accounting. What MuQSS
+does _not_ now feature is support for CGROUPS. The average user should neither
+need to know what these are, nor should they need to be using them to have good
+desktop behaviour. However since some applications refuse to work without
+cgroups, one can enable them with MuQSS as a stub and the filesystem will be
+created which will allow the applications to work.
+
+rr_interval:
+
+ /proc/sys/kernel/rr_interval
+
+The value is in milliseconds, and the default value is set to 6. Valid values
+are from 1 to 1000 Decreasing the value will decrease latencies at the cost of
+decreasing throughput, while increasing it will improve throughput, but at the
+cost of worsening latencies. It is based on the fact that humans can detect
+jitter at approximately 7ms, so aiming for much lower latencies is pointless
+under most circumstances. It is worth noting this fact when comparing the
+latency performance of MuQSS to other schedulers. Worst case latencies being
+higher than 7ms are far worse than average latencies not being in the
+microsecond range.
+
+interactive:
+
+ /proc/sys/kernel/interactive
+
+The value is a simple boolean of 1 for on and 0 for off and is set to on by
+default. Disabling this will disable the near-determinism of MuQSS when
+selecting the next task by not examining all CPUs for the earliest deadline
+task, or which CPU to wake to, instead prioritising CPU balancing for improved
+throughput. Latency will still be bound by rr_interval, but on a per-CPU basis
+instead of across the whole system.
+
+Runqueue sharing.
+
+By default MuQSS chooses to share runqueue resources (specifically the skip
+list and locking) between multicore siblings. It is configurable at build time
+to select between None, SMT, MC and SMP, corresponding to no sharing, sharing
+only between simultaneous mulithreading siblings, multicore siblings, or
+symmetric multiprocessing physical packages. Additionally it can be se at
+bootime with the use of the rqshare parameter. The reason for configurability
+is that some architectures have CPUs with many multicore siblings (>= 16)
+where it may be detrimental to throughput to share runqueues and another
+sharing option may be desirable. Additionally, more sharing than usual can
+improve latency on a system-wide level at the expense of throughput if desired.
+
+The options are:
+none, smt, mc, smp
+
+eg:
+ rqshare=mc
+
+Isochronous scheduling:
+
+Isochronous scheduling is a unique scheduling policy designed to provide
+near-real-time performance to unprivileged (ie non-root) users without the
+ability to starve the machine indefinitely. Isochronous tasks (which means
+"same time") are set using, for example, the schedtool application like so:
+
+ schedtool -I -e amarok
+
+This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works
+is that it has a priority level between true realtime tasks and SCHED_NORMAL
+which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie,
+if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval
+rate). However if ISO tasks run for more than a tunable finite amount of time,
+they are then demoted back to SCHED_NORMAL scheduling. This finite amount of
+time is the percentage of CPU available per CPU, configurable as a percentage in
+the following "resource handling" tunable (as opposed to a scheduler tunable):
+
+iso_cpu:
+
+ /proc/sys/kernel/iso_cpu
+
+and is set to 70% by default. It is calculated over a rolling 5 second average
+Because it is the total CPU available, it means that on a multi CPU machine, it
+is possible to have an ISO task running as realtime scheduling indefinitely on
+just one CPU, as the other CPUs will be available. Setting this to 100 is the
+equivalent of giving all users SCHED_RR access and setting it to 0 removes the
+ability to run any pseudo-realtime tasks.
+
+A feature of MuQSS is that it detects when an application tries to obtain a
+realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the
+appropriate privileges to use those policies. When it detects this, it will
+give the task SCHED_ISO policy instead. Thus it is transparent to the user.
+
+
+Idleprio scheduling:
+
+Idleprio scheduling is a scheduling policy designed to give out CPU to a task
+_only_ when the CPU would be otherwise idle. The idea behind this is to allow
+ultra low priority tasks to be run in the background that have virtually no
+effect on the foreground tasks. This is ideally suited to distributed computing
+clients (like setiathome, folding, mprime etc) but can also be used to start a
+video encode or so on without any slowdown of other tasks. To avoid this policy
+from grabbing shared resources and holding them indefinitely, if it detects a
+state where the task is waiting on I/O, the machine is about to suspend to ram
+and so on, it will transiently schedule them as SCHED_NORMAL. Once a task has
+been scheduled as IDLEPRIO, it cannot be put back to SCHED_NORMAL without
+superuser privileges since it is effectively a lower scheduling policy. Tasks
+can be set to start as SCHED_IDLEPRIO with the schedtool command like so:
+
+schedtool -D -e ./mprime
+
+Subtick accounting:
+
+It is surprisingly difficult to get accurate CPU accounting, and in many cases,
+the accounting is done by simply determining what is happening at the precise
+moment a timer tick fires off. This becomes increasingly inaccurate as the timer
+tick frequency (HZ) is lowered. It is possible to create an application which
+uses almost 100% CPU, yet by being descheduled at the right time, records zero
+CPU usage. While the main problem with this is that there are possible security
+implications, it is also difficult to determine how much CPU a task really does
+use. Mux uses sub-tick accounting from the TSC clock to determine real CPU
+usage. Thus, the amount of CPU reported as being used by MuQSS will more
+accurately represent how much CPU the task itself is using (as is shown for
+example by the 'time' application), so the reported values may be quite
+different to other schedulers. When comparing throughput of MuQSS to other
+designs, it is important to compare the actual completed work in terms of total
+wall clock time taken and total work done, rather than the reported "cpu usage".
+
+Symmetric MultiThreading (SMT) aware nice:
+
+SMT, a.k.a. hyperthreading, is a very common feature on modern CPUs. While the
+logical CPU count rises by adding thread units to each CPU core, allowing more
+than one task to be run simultaneously on the same core, the disadvantage of it
+is that the CPU power is shared between the tasks, not summating to the power
+of two CPUs. The practical upshot of this is that two tasks running on
+separate threads of the same core run significantly slower than if they had one
+core each to run on. While smart CPU selection allows each task to have a core
+to itself whenever available (as is done on MuQSS), it cannot offset the
+slowdown that occurs when the cores are all loaded and only a thread is left.
+Most of the time this is harmless as the CPU is effectively overloaded at this
+point and the extra thread is of benefit. However when running a niced task in
+the presence of an un-niced task (say nice 19 v nice 0), the nice task gets
+precisely the same amount of CPU power as the unniced one. MuQSS has an
+optional configuration feature known as SMT-NICE which selectively idles the
+secondary niced thread for a period proportional to the nice difference,
+allowing CPU distribution according to nice level to be maintained, at the
+expense of a small amount of extra overhead. If this is configured in on a
+machine without SMT threads, the overhead is minimal.
+
+
+Con Kolivas <kernel@kolivas.org> Sat, 29th October 2016
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 37a679501ddc..e78109cf3458 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -41,6 +41,7 @@ show up in /proc/sys/kernel:
- hung_task_check_interval_secs
- hung_task_warnings
- hyperv_record_panic_msg
+- iso_cpu
- kexec_load_disabled
- kptr_restrict
- l2cr [ PPC only ]
@@ -76,6 +77,7 @@ show up in /proc/sys/kernel:
- randomize_va_space
- real-root-dev ==> Documentation/admin-guide/initrd.rst
- reboot-cmd [ SPARC only ]
+- rr_interval
- rtsig-max
- rtsig-nr
- seccomp/ ==> Documentation/userspace-api/seccomp_filter.rst
@@ -97,6 +99,7 @@ show up in /proc/sys/kernel:
- unknown_nmi_panic
- watchdog
- watchdog_thresh
+- yield_type
- version
==============================================================
@@ -435,6 +438,16 @@ When kptr_restrict is set to (2), kernel pointers printed using
==============================================================
+iso_cpu: (MuQSS CPU scheduler only).
+
+This sets the percentage cpu that the unprivileged SCHED_ISO tasks can
+run effectively at realtime priority, averaged over a rolling five
+seconds over the -whole- system, meaning all cpus.
+
+Set to 70 (percent) by default.
+
+==============================================================
+
l2cr: (PPC only)
This flag controls the L2 cache of G3 processor boards. If
@@ -862,6 +875,20 @@ rebooting. ???
==============================================================
+rr_interval: (MuQSS CPU scheduler only)
+
+This is the smallest duration that any cpu process scheduling unit
+will run for. Increasing this value can increase throughput of cpu
+bound tasks substantially but at the expense of increased latencies
+overall. Conversely decreasing it will decrease average and maximum
+latencies but at the expense of throughput. This value is in
+milliseconds and the default value chosen depends on the number of
+cpus available at scheduler initialisation with a minimum of 6.
+
+Valid values are from 1-1000.
+
+==============================================================
+
rtsig-max & rtsig-nr:
The file rtsig-max can be used to tune the maximum number
@@ -1102,3 +1129,13 @@ The softlockup threshold is (2 * watchdog_thresh). Setting this
tunable to zero will disable lockup detection altogether.
==============================================================
+
+yield_type: (MuQSS CPU scheduler only)
+
+This determines what type of yield calls to sched_yield will perform.
+
+ 0: No yield.
+ 1: Yield only to better priority/deadline tasks. (default)
+ 2: Expire timeslice and recalculate deadline.
+
+==============================================================
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index c9ef3c532169..1298454c0499 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -64,11 +64,6 @@ static struct task_struct *spusched_task;
static struct timer_list spusched_timer;
static struct timer_list spuloadavg_timer;
-/*
- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0).
- */
-#define NORMAL_PRIO 120
-
/*
* Frequency of the spu scheduler tick. By default we do one SPU scheduler
* tick for every 10 CPU scheduler ticks.
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1a0be022f91d..88bfccabcf3b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1003,6 +1003,21 @@ config NR_CPUS
config SCHED_SMT
def_bool y if SMP
+config SMT_NICE
+ bool "SMT (Hyperthreading) aware nice priority and policy support"
+ depends on SCHED_MUQSS && SCHED_SMT
+ default y
+ ---help---
+ Enabling Hyperthreading on Intel CPUs decreases the effectiveness
+ of the use of 'nice' levels and different scheduling policies
+ (e.g. realtime) due to sharing of CPU power between hyperthreads.
+ SMT nice support makes each logical CPU aware of what is running on
+ its hyperthread siblings, maintaining appropriate distribution of
+ CPU according to nice levels and scheduling policies at the expense
+ of slightly increased overhead.
+
+ If unsure say Y here.
+
config SCHED_MC
def_bool y
prompt "Multi-core scheduler support"
@@ -1043,6 +1059,80 @@ config SCHED_MC_PRIO
If unsure say Y here.
+choice
+ prompt "CPU scheduler runqueue sharing"
+ default RQ_MC if SCHED_MUQSS
+ default RQ_NONE
+
+config RQ_NONE
+ bool "No sharing"
+ help
+ This is the default behaviour where the CPU scheduler has one runqueue
+ per CPU, whether it is a physical or logical CPU (hyperthread).
+
+ This can still be enabled runtime with the boot parameter
+ rqshare=none
+
+ If unsure, say N.
+
+config RQ_SMT
+ bool "SMT (hyperthread) siblings"
+ depends on SCHED_SMT && SCHED_MUQSS
+
+ help
+ With this option enabled, the CPU scheduler will have one runqueue
+ shared by SMT (hyperthread) siblings. As these logical cores share
+ one physical core, sharing the runqueue resource can lead to decreased
+ overhead, lower latency and higher throughput.
+
+ This can still be enabled runtime with the boot parameter
+ rqshare=smt
+
+ If unsure, say N.
+
+config RQ_MC
+ bool "Multicore siblings"
+ depends on SCHED_MC && SCHED_MUQSS
+ help
+ With this option enabled, the CPU scheduler will have one runqueue
+ shared by multicore siblings in addition to any SMT siblings.
+ As these physical cores share caches, sharing the runqueue resource
+ will lead to lower latency, but its effects on overhead and throughput
+ are less predictable. As a general rule, 6 or fewer cores will likely
+ benefit from this, while larger CPUs will only derive a latency
+ benefit. If your workloads are primarily single threaded, this will
+ possibly worsen throughput. If you are only concerned about latency
+ then enable this regardless of how many cores you have.
+
+ This can still be enabled runtime with the boot parameter
+ rqshare=mc
+
+ If unsure, say Y.
+
+config RQ_SMP
+ bool "Symmetric Multi-Processing"
+ depends on SMP && SCHED_MUQSS
+ help
+ With this option enabled, the CPU scheduler will have one runqueue
+ shared by all physical CPUs unless they are on separate NUMA nodes.
+ As physical CPUs usually do not share resources, sharing the runqueue
+ will normally worsen throughput but improve latency. If you only
+ care about latency enable this.
+
+ This can still be enabled runtime with the boot parameter
+ rqshare=smp
+
+ If unsure, say N.
+endchoice