Merge PR #108: Make retries after failures configurable
smarr authored Mar 6, 2019
2 parents e6979bf + ca83bb3 commit e9789c3
Showing 11 changed files with 54 additions and 13 deletions.
18 changes: 18 additions & 0 deletions docs/config.md
@@ -316,6 +316,23 @@ runs:
execute_exclusively: false
```

---

**retries_after_failure:**

Some experiments may fail non-deterministically. For these, it may be
convenient to simply retry them a few times.
This setting indicates how many times execution is retried after a failure.

Default: `0`

Example:

```yaml
runs:
retries_after_failure: 3
```

## Reporting

Currently, [Codespeed] is the only supported system for continuous
@@ -480,6 +497,7 @@ Thus, one can use:
- `max_invocation_time`
- `parallel_interference_factor`
- `execute_exclusively`
- `retries_after_failure`

As well as:

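As a hypothetical illustration (not part of this commit), and assuming the run settings listed above, including `retries_after_failure`, are also accepted on a suite definition, a particularly flaky suite could override the global default like this:

```yaml
# Hypothetical sketch: override the global default for a single suite.
benchmark_suites:
  FlakySuite:
    command: "Harness %(benchmark)s"
    retries_after_failure: 2   # retry failed executions of this suite's runs
    benchmarks:
      - NondeterministicBench
```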
3 changes: 1 addition & 2 deletions rebench/configurator.py
@@ -140,8 +140,7 @@ def load_config(file_name):
assert err.strerror == "No such file or directory"
raise UIError("The requested config file (%s) could not be opened. %s.\n"
% (file_name, err.strerror), err)
else:
raise UIError(str(err) + "\n", err)
raise UIError(str(err) + "\n", err)
except yaml.YAMLError as err:
raise UIError("Parsing of the config file "
+ file_name + " failed.\nError " + str(err) + "\n", err)
7 changes: 3 additions & 4 deletions rebench/executor.py
@@ -234,8 +234,7 @@ def _process_remaining_runs(self, runs):
if exceptions:
if len(exceptions) == 1:
raise exceptions[0]
else:
raise BenchmarkThreadExceptions(exceptions)
raise BenchmarkThreadExceptions(exceptions)

def _determine_num_work_items_to_take(self):
# use a simple and naive scheduling strategy that still allows for
@@ -383,8 +382,8 @@ def _keep_alive(seconds):
self._ui.error("{ind}stderr:\n\n{ind}{ind}"
+ "\n{ind}{ind}".join(lines) + "\n")
raise FailedBuilding(name, build_command)
else:
build_command.mark_succeeded()

build_command.mark_succeeded()

def process_output(self, name, stdout_result, stderr_result):
with open_with_enc(self._build_log, 'a', encoding='utf8') as log_file:
14 changes: 12 additions & 2 deletions rebench/model/exp_run_details.py
@@ -38,21 +38,26 @@ def compile(cls, config, defaults):
execute_exclusively = none_or_bool(config.get('execute_exclusively',
defaults.execute_exclusively))

retries_after_failure = none_or_int(config.get('retries_after_failure',
defaults.retries_after_failure))

return ExpRunDetails(invocations, iterations, warmup, min_iteration_time,
max_invocation_time, parallel_interference_factor, execute_exclusively,
retries_after_failure,
defaults.invocations_override, defaults.iterations_override)

@classmethod
def empty(cls):
return ExpRunDetails(None, None, None, None, None, None, None, None, None)
return ExpRunDetails(None, None, None, None, None, None, None, None, None, None)

@classmethod
def default(cls, invocations_override, iterations_override):
return ExpRunDetails(1, 1, None, 50, -1, None, True,
return ExpRunDetails(1, 1, None, 50, -1, None, True, 0,
invocations_override, iterations_override)

def __init__(self, invocations, iterations, warmup, min_iteration_time,
max_invocation_time, parallel_interference_factor, execute_exclusively,
retries_after_failure,
invocations_override, iterations_override):
self._invocations = invocations
self._iterations = iterations
@@ -62,6 +67,7 @@ def __init__(self, invocations, iterations, warmup, min_iteration_time,
self._max_invocation_time = max_invocation_time
self._parallel_interference_factor = parallel_interference_factor
self._execute_exclusively = execute_exclusively
self._retries_after_failure = retries_after_failure

self._invocations_override = invocations_override
self._iterations_override = iterations_override
@@ -101,3 +107,7 @@ def parallel_interference_factor(self):
@property
def execute_exclusively(self):
return self._execute_exclusively

@property
def retries_after_failure(self):
return self._retries_after_failure
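A minimal sketch (not part of the commit) of how `compile` resolves the new value: a value given at the current config level wins, otherwise the enclosing defaults are used; `none_or_int` is assumed here to pass `None` through and coerce everything else to `int`:

```python
def none_or_int(value):
    # Assumed behaviour: pass None through, coerce everything else to int.
    return int(value) if value is not None else None

class Defaults:
    retries_after_failure = 3      # value inherited from an outer level

config = {'invocations': 10}       # no retries_after_failure at this level

retries = none_or_int(config.get('retries_after_failure',
                                 Defaults.retries_after_failure))
assert retries == 3                # falls back to the enclosing default
```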
4 changes: 4 additions & 0 deletions rebench/model/run_id.py
@@ -56,6 +56,10 @@ def min_iteration_time(self):
def max_invocation_time(self):
return self._benchmark.run_details.max_invocation_time

@property
def retries_after_failure(self):
return self._benchmark.run_details.retries_after_failure

@property
def iterations(self):
run_details = self._benchmark.run_details
3 changes: 2 additions & 1 deletion rebench/model/termination_check.py
@@ -39,7 +39,8 @@ def indicate_successful_execution(self):

def fails_consecutively(self):
return (self._fail_immediately or
self._consecutive_erroneous_executions >= 3)
(self._consecutive_erroneous_executions > 0 and
self._consecutive_erroneous_executions >= self._run_id.retries_after_failure))

def has_too_many_failures(self, number_of_data_points):
return (self._fail_immediately or
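A minimal sketch (not part of the commit) of the new check's behaviour, assuming `_fail_immediately` is false and `consecutive_erroneous_executions` counts failed executions so far:

```python
def fails_consecutively(consecutive_erroneous_executions, retries_after_failure):
    return (consecutive_erroneous_executions > 0 and
            consecutive_erroneous_executions >= retries_after_failure)

# With the default of 0, the check trips on the first failed execution.
assert fails_consecutively(1, 0) is True
# With retries_after_failure: 3, it only trips once three consecutive
# executions have failed.
assert fails_consecutively(2, 3) is False
assert fails_consecutively(3, 3) is True
```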
3 changes: 1 addition & 2 deletions rebench/persistence.py
@@ -79,8 +79,7 @@ def register_config(self, cfg):
if key in self._bench_cfgs:
raise ValueError("Two identical BenchmarkConfig tried to " +
"register. This seems to be wrong: " + str(key))
else:
self._bench_cfgs[key] = cfg
self._bench_cfgs[key] = cfg
return cfg


9 changes: 9 additions & 0 deletions rebench/rebench-schema.yml
@@ -47,6 +47,15 @@ schema;runs_type:
TODO: probably needs to be removed, not sure. parallel exec of
benchmarks introduced a lot of noise
retries_after_failure:
type: int
# default: 0 # can't specify this here, because the defaults override settings
desc: |
Some experiments may fail non-deterministically. For these, it may be
convenient to simply retry them a few times.
This value indicates how many times execution is retried after a failure.
schema;reporting_type:
type: map
mapping:
2 changes: 1 addition & 1 deletion rebench/subprocess_with_timeout.py
@@ -97,7 +97,7 @@ def process_output(self, proc):


def _print_keep_alive(seconds_since_start):
print("Keep alive, current job runs for %dmin" % (seconds_since_start / 60))
print("Keep alive, current job runs for %dmin\n" % (seconds_since_start / 60))


def run(args, cwd=None, shell=False, kill_tree=True, timeout=-1,
1 change: 1 addition & 0 deletions rebench/tests/small.conf
@@ -8,6 +8,7 @@ default_data_file: 'tests/small.data'
# general configuration for runs
runs:
invocations: 10
retries_after_failure: 3

benchmark_suites:
Suite:
3 changes: 2 additions & 1 deletion rebench/ui.py
@@ -214,7 +214,8 @@ def step(self, progress=0, label=None):
label = label or self.label
if not label:
raise Exception("No label set for spinner!")
elif self.total:

if self.total:
label = "%s: %.2f%%\n" % (label, progress / (self.total / 100.0))
elif self.timer and self.timer.elapsed_time > 2:
label = "%s (%s)\n" % (label, self.timer.rounded)
