Merge PR #108: Make retries after failures configurable
smarr authored Mar 6, 2019
2 parents e6979bf + ca83bb3 commit e9789c3
Showing 11 changed files with 54 additions and 13 deletions.
18 changes: 18 additions & 0 deletions docs/config.md
@@ -316,6 +316,23 @@ runs:
execute_exclusively: false
```

---

**retries_after_failure:**

Some experiments may fail non-deterministically. For these, it may be
convenient to simply retry them a few times.
This setting indicates how many times execution is retried after a failure.

Default: `0`

Example:

```yaml
runs:
retries_after_failure: 3
```

## Reporting

Currently, [Codespeed] is the only supported system for continuous
@@ -480,6 +497,7 @@ Thus, one can use:
- `max_invocation_time`
- `parallel_interference_factor`
- `execute_exclusively`
- `retries_after_failure`

As well as:

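As a hypothetical illustration (not part of this commit), and assuming the run settings listed above, including `retries_after_failure`, are also accepted on a suite definition, a particularly flaky suite could override the global default like this:

```yaml
# Hypothetical sketch: override the global default for a single suite.
benchmark_suites:
  FlakySuite:
    command: "Harness %(benchmark)s"
    retries_after_failure: 2   # retry failed executions of this suite's runs
    benchmarks:
      - NondeterministicBench
```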
3 changes: 1 addition & 2 deletions rebench/configurator.py
@@ -140,8 +140,7 @@ def load_config(file_name):
assert err.strerror == "No such file or directory"
raise UIError("The requested config file (%s) could not be opened. %s.\n"
% (file_name, err.strerror), err)
else:
raise UIError(str(err) + "\n", err)
raise UIError(str(err) + "\n", err)
except yaml.YAMLError as err:
raise UIError("Parsing of the config file "
+ file_name + " failed.\nError " + str(err) + "\n", err)
7 changes: 3 additions & 4 deletions rebench/executor.py
@@ -234,8 +234,7 @@ def _process_remaining_runs(self, runs):
if exceptions:
if len(exceptions) == 1:
raise exceptions[0]
else:
raise BenchmarkThreadExceptions(exceptions)
raise BenchmarkThreadExceptions(exceptions)

def _determine_num_work_items_to_take(self):
# use a simple and naive scheduling strategy that still allows for
@@ -383,8 +382,8 @@ def _keep_alive(seconds):
self._ui.error("{ind}stderr:\n\n{ind}{ind}"
+ "\n{ind}{ind}".join(lines) + "\n")
raise FailedBuilding(name, build_command)
else:
build_command.mark_succeeded()

build_command.mark_succeeded()

def process_output(self, name, stdout_result, stderr_result):
with open_with_enc(self._build_log, 'a', encoding='utf8') as log_file:
14 changes: 12 additions & 2 deletions rebench/model/exp_run_details.py
@@ -38,21 +38,26 @@ def compile(cls, config, defaults):
execute_exclusively = none_or_bool(config.get('execute_exclusively',
defaults.execute_exclusively))

retries_after_failure = none_or_int(config.get('retries_after_failure',
defaults.retries_after_failure))

return ExpRunDetails(invocations, iterations, warmup, min_iteration_time,
max_invocation_time, parallel_interference_factor, execute_exclusively,
retries_after_failure,
defaults.invocations_override, defaults.iterations_override)

@classmethod
def empty(cls):
return ExpRunDetails(None, None, None, None, None, None, None, None, None)
return ExpRunDetails(None, None, None, None, None, None, None, None, None, None)

@classmethod
def default(cls, invocations_override, iterations_override):
return ExpRunDetails(1, 1, None, 50, -1, None, True,
return ExpRunDetails(1, 1, None, 50, -1, None, True, 0,
invocations_override, iterations_override)

def __init__(self, invocations, iterations, warmup, min_iteration_time,
max_invocation_time, parallel_interference_factor, execute_exclusively,
retries_after_failure,
invocations_override, iterations_override):
self._invocations = invocations
self._iterations = iterations
@@ -62,6 +67,7 @@ def __init__(self, invocations, iterations, warmup, min_iteration_time,
self._max_invocation_time = max_invocation_time
self._parallel_interference_factor = parallel_interference_factor
self._execute_exclusively = execute_exclusively
self._retries_after_failure = retries_after_failure

self._invocations_override = invocations_override
self._iterations_override = iterations_override
@@ -101,3 +107,7 @@ def parallel_interference_factor(self):
@property
def execute_exclusively(self):
return self._execute_exclusively

@property
def retries_after_failure(self):
return self._retries_after_failure
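A minimal sketch (not part of the commit) of how `compile` resolves the new value: a value given at the current config level wins, otherwise the enclosing defaults are used; `none_or_int` is assumed here to pass `None` through and coerce everything else to `int`:

```python
def none_or_int(value):
    # Assumed behaviour: pass None through, coerce everything else to int.
    return int(value) if value is not None else None

class Defaults:
    retries_after_failure = 3      # value inherited from an outer level

config = {'invocations': 10}       # no retries_after_failure at this level

retries = none_or_int(config.get('retries_after_failure',
                                 Defaults.retries_after_failure))
assert retries == 3                # falls back to the enclosing default
```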
4 changes: 4 additions & 0 deletions rebench/model/run_id.py
@@ -56,6 +56,10 @@ def min_iteration_time(self):
def max_invocation_time(self):
return self._benchmark.run_details.max_invocation_time

@property
def retries_after_failure(self):
return self._benchmark.run_details.retries_after_failure

@property
def iterations(self):
run_details = self._benchmark.run_details
3 changes: 2 additions & 1 deletion rebench/model/termination_check.py
@@ -39,7 +39,8 @@ def indicate_successful_execution(self):

def fails_consecutively(self):
return (self._fail_immediately or
self._consecutive_erroneous_executions >= 3)
(self._consecutive_erroneous_executions > 0 and
self._consecutive_erroneous_executions >= self._run_id.retries_after_failure))

def has_too_many_failures(self, number_of_data_points):
return (self._fail_immediately or
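A minimal sketch (not part of the commit) of the new check's behaviour, assuming `_fail_immediately` is false and `consecutive_erroneous_executions` counts failed executions so far:

```python
def fails_consecutively(consecutive_erroneous_executions, retries_after_failure):
    return (consecutive_erroneous_executions > 0 and
            consecutive_erroneous_executions >= retries_after_failure)

# With the default of 0, the check trips on the first failed execution.
assert fails_consecutively(1, 0) is True
# With retries_after_failure: 3, it only trips once three consecutive
# executions have failed.
assert fails_consecutively(2, 3) is False
assert fails_consecutively(3, 3) is True
```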
3 changes: 1 addition & 2 deletions rebench/persistence.py
@@ -79,8 +79,7 @@ def register_config(self, cfg):
if key in self._bench_cfgs:
raise ValueError("Two identical BenchmarkConfig tried to " +
"register. This seems to be wrong: " + str(key))
else:
self._bench_cfgs[key] = cfg
self._bench_cfgs[key] = cfg
return cfg


9 changes: 9 additions & 0 deletions rebench/rebench-schema.yml
@@ -47,6 +47,15 @@ schema;runs_type:
TODO: probably needs to be removed, not sure. parallel exec of
benchmarks introduced a lot of noise
retries_after_failure:
type: int
# default: 0 # can't specify this here, because the defaults override settings
desc: |
Some experiments may fail non-deterministically. For these, it may be
convenient to simply retry them a few times.
This value indicates how many times execution is retried after a failure.
schema;reporting_type:
type: map
mapping:
2 changes: 1 addition & 1 deletion rebench/subprocess_with_timeout.py
@@ -97,7 +97,7 @@ def process_output(self, proc):


def _print_keep_alive(seconds_since_start):
print("Keep alive, current job runs for %dmin" % (seconds_since_start / 60))
print("Keep alive, current job runs for %dmin\n" % (seconds_since_start / 60))


def run(args, cwd=None, shell=False, kill_tree=True, timeout=-1,
1 change: 1 addition & 0 deletions rebench/tests/small.conf
@@ -8,6 +8,7 @@ default_data_file: 'tests/small.data'
# general configuration for runs
runs:
invocations: 10
retries_after_failure: 3

benchmark_suites:
Suite:
3 changes: 2 additions & 1 deletion rebench/ui.py
@@ -214,7 +214,8 @@ def step(self, progress=0, label=None):
label = label or self.label
if not label:
raise Exception("No label set for spinner!")
elif self.total:

if self.total:
label = "%s: %.2f%%\n" % (label, progress / (self.total / 100.0))
elif self.timer and self.timer.elapsed_time > 2:
label = "%s (%s)\n" % (label, self.timer.rounded)
