Recent kernels have more jitter in nanosleep(). Account for that.

o Use a larger allowance for jitter; the final microseconds are then manually
  inched towards with a busy wait. Only do this on a Pi 2 or 3, where we have
  a core to burn.
o Always disable realtime throttling on the Pi 2 and 3 now (this used to be an
  optional compile-time define), because we are now more likely to hit 100%
  core utilization.

Should fix #483
hzeller · Jan 22, 2018 · 181e3bf · 181e3bf
1 parent 9e54dc9
commit 181e3bf
Show file tree

Hide file tree

Showing 2 changed files with 35 additions and 16 deletions.
diff --git a/lib/Makefile b/lib/Makefile
@@ -117,10 +117,6 @@ HARDWARE_DESC?=regular
 # some oddball old (typically one-colored) display, such as Hub12.
 #DEFINES+=-DONLY_SINGLE_SUB_PANEL
 
-# Experimental: Disable realtime throttling of the Linux kernel. Uncomment
-# if you see periodic flickering in particular with longer display chains.
-#DEFINES+=-DDISABLE_RT_THROTTLE
-
 # If someone gives additional values on the make commandline e.g.
 # make USER_DEFINES="-DSHOW_REFRESH_RATE"
 DEFINES+=$(USER_DEFINES)

diff --git a/lib/gpio.cc b/lib/gpio.cc
@@ -38,14 +38,25 @@
  *
  * Note: A higher value here will result in more CPU use because of more busy
  * waiting inching towards the real value (for all the cases that nanosleep()
- * actually was better than this overhead) so you might consider defining
- * DISABLE_RT_THROTTLE as well (see lib/Makefile)
+ * actually was better than this overhead).
  *
  * This might be interesting to tweak in particular if you have a realtime
  * kernel with different characteristics.
  */
 #define EMPIRICAL_NANOSLEEP_OVERHEAD_US 25
 
+/*
+ * In a few cases on a standard kernel, we see that the overhead is actually
+ * even longer; these additional 35usec cover the 99.999%-ile.
+ * So ideally, we would always use this additional time and also busy-wait
+ * through it, right ?
+ * However, that would take away a lot of CPU on older, one-core Raspberry Pis
+ * or Pi Zeros. They rely on us sleeping whenever possible so they can do work.
+ * So we only enable it if we have a newer Pi, where we burn away one core
+ * anyway (and are isolated there with isolcpus=3).
+ */
+#define EMPIRICAL_NANOSLEEP_EXTRA_OVERHEAD_US 35
+
 /* In order to determine useful values for above, set this to 1 and use the
  * hardware pin-pulser.
  * It will output a histogram atexit() of how much how often we were over
@@ -152,7 +163,7 @@ uint32_t GPIO::InitOutputs(uint32_t outputs) {
   return output_bits_;
 }
 
-static bool IsRaspberryPi2() {
+static bool DetermineIsRaspberryPi2() {
   // TODO: there must be a better, more robust way. Can we ask the processor ?
   char buffer[2048];
   const int fd = open("/proc/cmdline", O_RDONLY);
@@ -169,6 +180,19 @@ static bool IsRaspberryPi2() {
   return false;
 }
 
+static bool IsRaspberryPi2() {
+  static bool ispi2 = DetermineIsRaspberryPi2();
+  return ispi2;
+}
+
+static uint32_t JitterAllowanceMicroseconds() {
+  // If this is a Raspberry Pi 2 or 3, we can afford to burn a few more
+  // busy-wait CPU cycles to get the timing accurate, as we have CPU to spare.
+  static int allowance_us = EMPIRICAL_NANOSLEEP_OVERHEAD_US
+    + (IsRaspberryPi2() ? EMPIRICAL_NANOSLEEP_EXTRA_OVERHEAD_US : 0);
+  return allowance_us;
+}
+
 static uint32_t *mmap_bcm_register(bool isRPi2, off_t register_offset) {
   const off_t base = (isRPi2 ? BCM2709_PERI_BASE : BCM2708_PERI_BASE);
 
@@ -271,14 +295,12 @@ static void (*busy_sleep_impl)(long) = sleep_nanos_rpi_1;
 // really want all we can get iff the machine has more cores and
 // our RT-thread is locked onto one of these.
 // So let's tell it not to do that.
-// Only call if there is more than one core available.
 static void DisableRealtimeThrottling() {
-#ifdef DISABLE_RT_THROTTLE
+  if (!IsRaspberryPi2()) return;   // Not safe if we don't have > 1 core.
   const int out = open("/proc/sys/kernel/sched_rt_runtime_us", O_WRONLY);
   if (out < 0) return;
   write(out, "-1", 2);
   close(out);
-#endif
 }
 
 bool Timers::Init() {
@@ -308,10 +330,11 @@ void Timers::sleep_nanos(long nanos) {
   // We use the global 1Mhz hardware timer to measure the actual time period
   // that has passed, and then inch forward for the remaining time with
   // busy wait.
-  if (nanos > EMPIRICAL_NANOSLEEP_OVERHEAD_US * 1000 + 5000) {
+  static long kJitterAllowanceNanos = JitterAllowanceMicroseconds() * 1000;
+  if (nanos > kJitterAllowanceNanos + 5000) {
     const uint32_t before = *timer1Mhz;
     struct timespec sleep_time
-      = { 0, nanos - EMPIRICAL_NANOSLEEP_OVERHEAD_US * 1000 };
+      = { 0, nanos - kJitterAllowanceNanos };
     nanosleep(&sleep_time, NULL);
     const uint32_t after = *timer1Mhz;
     const long nanoseconds_passed = 1000 * (uint32_t)(after - before);
@@ -344,9 +367,9 @@ static void sleep_nanos_rpi_2(long nanos) {
 #if DEBUG_SLEEP_JITTER
 static int overshoot_histogram_us[256] = {0};
 static void print_overshoot_histogram() {
-  fprintf(stderr, "Overshoot histogram >= EMPIRICAL_NANOSLEEP_OVERHEAD_US=%d\n"
+  fprintf(stderr, "Overshoot histogram >= empirical overhead of %dus\n"
           "%6s | %7s | %7s\n",
-          EMPIRICAL_NANOSLEEP_OVERHEAD_US, "usec", "count", "accum");
+          JitterAllowanceMicroseconds(), "usec", "count", "accum");
   int total_count = 0;
   for (int i = 0; i < 256; ++i) total_count += overshoot_histogram_us[i];
   int running_count = 0;
@@ -395,7 +418,7 @@ class HardwarePinPulser : public PinPulser {
 
     for (size_t i = 0; i < specs.size(); ++i) {
       // Hints how long to nanosleep, already corrected for system overhead.
-      sleep_hints_.push_back(specs[i] / 1000 - EMPIRICAL_NANOSLEEP_OVERHEAD_US);
+      sleep_hints_.push_back(specs[i] / 1000 - JitterAllowanceMicroseconds());
     }
 
     const int base = specs[0];
@@ -480,7 +503,7 @@ class HardwarePinPulser : public PinPulser {
           // took.
           const int total_us = *timer1Mhz - start_time_;
           const int nanoslept = total_us - already_elapsed_usec;
-          int overshoot = nanoslept - (to_sleep + EMPIRICAL_NANOSLEEP_OVERHEAD_US);
+          int overshoot = nanoslept - (to_sleep + JitterAllowanceMicroseconds());
           if (overshoot < 0) overshoot = 0;
           if (overshoot > 255) overshoot = 255;
           overshoot_histogram_us[overshoot]++;