diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index bfd4056b2..5b03ccae9 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -11,6 +11,7 @@ jobs:
       with:
         python-version: '3.8'
     - name: Install Tox
-      run: pip install tox==3.28.0
+      run: pip install tox==4.11.0
+    - name: Run Tox
       run: tox -e style,py38,cover
diff --git a/.gitignore b/.gitignore
index 7c4220e5b..fb0bacc60 100644
--- a/.gitignore
+++ b/.gitignore
@@ -97,6 +97,7 @@ venv/
 ENV/
 env.bak/
 venv.bak/
+.virtualenvs/
 
 # Spyder project settings
 .spyderproject
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
new file mode 100644
index 000000000..15e7f5f4f
--- /dev/null
+++ b/.readthedocs.yaml
@@ -0,0 +1,20 @@
+# .readthedocs.yaml
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+# Required
+version: 2
+
+build:
+  os: ubuntu-20.04
+  tools:
+    python: "3.7"
+
+sphinx:
+  configuration: docs/conf.py
+
+python:
+  install:
+    - requirements: docs/requirements.txt
+    - method: setuptools
+      path: .
diff --git a/README.md b/README.md
index 2ad3345de..eb7c30c9f 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,7 @@
+[![Documentation Status](https://readthedocs.org/projects/ducktape/badge/?version=0.8.x)](https://ducktape.readthedocs.io/en/0.8.x/?badge=0.8.x)
+
+
+
 Distributed System Integration & Performance Testing Library
 =============================================================
 
@@ -17,7 +21,7 @@ Ducktape contains tools for running system integration and performance tests. It
 Documentation
 -------------
 
-For detailed documentation on how to install, run, create new tests please refer to: http://ducktape-docs.readthedocs.io/
+For detailed documentation on how to install, run, and create new tests, please refer to: http://ducktape.readthedocs.io/
 
 Contribute
 ----------
diff --git a/Vagrantfile b/Vagrantfile
index 1191624ac..fc29167b9 100644
--- a/Vagrantfile
+++ b/Vagrantfile
@@ -23,7 +23,7 @@ VAGRANTFILE_API_VERSION = "2"
 enable_dns = false
 num_workers = 3
 ram_megabytes = 300
-base_box = "ubuntu/trusty64"
+base_box = "ubuntu/focal64"
 
 local_config_file = File.join(File.dirname(__FILE__), "Vagrantfile.local")
 if File.exists?(local_config_file) then
@@ -51,7 +51,7 @@ Vagrant.configure(VAGRANTFILE_API_VERSION) do |config|
     name = "ducktape" + i.to_s
     config.vm.define name do |worker|
       worker.vm.hostname = name
-      worker.vm.network :private_network, ip: "192.168.50." + (150 + i).to_s
+      worker.vm.network :private_network, ip: "192.168.56." + (150 + i).to_s
     end
   }
diff --git a/docs/README.md b/docs/README.md
index 2f38dba4c..8dcc4456a 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -1,23 +1,35 @@
 Ducktape documentation quick start guide
 ========================================
 
-This file provides a quick guide on how to compile the Ducktape documentation.
+Build the documentation
+-----------------------
 
-Setup the environment
----------------------
+To render the pages, run:
+```shell
+tox -e docs
+```
+
+The rendered pages will be in ``docs/_build/html``
 
-To compile the documentation you need Sphinx Python library. To install it and all its dependencies run::
-    pip install -r requirements.txt
+Specify documentation format
+----------------------------
+Documentation is built using the [sphinx-build](https://www.sphinx-doc.org/en/master/man/sphinx-build.html) command.
+You can select which builder to use via the SPHINX_BUILDER environment variable:
+```shell
+SPHINX_BUILDER=man tox -e docs
+```
+All available values: https://www.sphinx-doc.org/en/master/man/sphinx-build.html#cmdoption-sphinx-build-M
 
-Build the documentation
------------------------
 
-To render the pages run::
+Pass options to sphinx-build
+----------------------------
+Any argument after `--` will be passed to the
+[sphinx-build](https://www.sphinx-doc.org/en/master/man/sphinx-build.html) command directly:
+```shell
+tox -e docs -- -E
+```
 
-    make html
-
-The rendered pages will be in ``docs/_build/html``
diff --git a/docs/changelog.rst b/docs/changelog.rst
new file mode 100644
index 000000000..7f0dd0b83
--- /dev/null
+++ b/docs/changelog.rst
@@ -0,0 +1,27 @@
+.. _topics-changelog:
+
+=========
+Changelog
+=========
+
+0.8.18
+======
+Friday, August 18th, 2023
+-------------------------
+- Updated `requests` version to `2.31.0`
+
+0.8.17
+======
+- Removed `tox` from requirements. It was not used, but was breaking our builds due to recent pushes to `virtualenv`.
+
+0.8.x
+=====
+- Support test suites
+- Easier way to rerun failed tests - generate a test suite with all the failed tests, and also print them in the log so that the user can copy them and paste them as ducktape command-line arguments
+- Python 2 is no longer supported; the minimum supported version is 3.6
+- Added `--deflake N` flag - if provided, it will attempt to rerun each failed test up to N times, and if it eventually passes, it will be marked as Flaky - `#299 <https://github.com/confluentinc/ducktape/pull/299>`_
+- [backport, also in 0.9.1] - use a generic network device based on the devices found on the remote machine rather than a hardcoded one - `#314 <https://github.com/confluentinc/ducktape/pull/314>`_ and `#328 <https://github.com/confluentinc/ducktape/pull/328>`_
+- [backport, also in 0.9.1] - clean up process properly after an exception during test runner execution - `#323 <https://github.com/confluentinc/ducktape/pull/323>`_
+- [backport, also in 0.9.1] - log ssh errors - `#319 <https://github.com/confluentinc/ducktape/pull/319>`_
+- [backport, also in 0.9.1] - update vagrant tests to use ubuntu20 - `#328 <https://github.com/confluentinc/ducktape/pull/328>`_
+- [backport, also in 0.9.1] - added command to print the total number of nodes the tests run will require - `#320 <https://github.com/confluentinc/ducktape/pull/320>`_
\ No newline at end of file
diff --git a/docs/debug_tests.rst b/docs/debug_tests.rst
index 71df91d6c..7b17066dc 100644
--- a/docs/debug_tests.rst
+++ b/docs/debug_tests.rst
@@ -97,3 +97,13 @@ Tools for Managing Logs
 =======================
 
 Analyzing and matching up logs from a distributed service could be time consuming. There are many good tools for working with logs. Examples include http://lnav.org/, http://list.xmodulo.com/multitail.html, and http://glogg.bonnefon.org/.
+
+Validating SSH Issues
+=====================
+
+Ducktape supports running custom validators when an SSH error occurs, allowing you to run your own validation against a host.
+This is done by running ducktape with the `--ssh-checker-function` flag, followed by the module path to your function, for instance::
+
+    ducktape my-test.py --ssh-checker-function my.module.validator.validate_ssh
+
+This function takes the raised SSH error as its first argument and the remote account object as its second.
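A minimal sketch of such a checker, assuming it lives at `my/module/validator.py` (the function name and the logging are illustrative; ducktape only requires the two-argument signature, and it re-raises the original error after all checkers have run):

```python
import logging

logger = logging.getLogger(__name__)


def validate_ssh(error, remote_account):
    """Called by ducktape whenever an ssh operation raises an exception.

    :param error: the exception raised by the ssh layer
    :param remote_account: the RemoteAccount the operation ran against
    """
    # Log enough context to triage the failure. Raising here surfaces a
    # clearer error; returning normally lets ducktape re-raise the original.
    logger.warning("ssh failure on %s: %s", remote_account, error)
```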
diff --git a/docs/index.rst b/docs/index.rst
index ada5ed625..834b00fe8 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -21,6 +21,7 @@ Ducktape contains tools for running system integration and performance tests. It
    debug_tests
    api
    misc
+   changelog
 
 Contribute
 ==========
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 389884184..618ea9840 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,6 +1,8 @@
-Sphinx==1.5.3
+Sphinx<1.7
 sphinx-argparse==0.1.17
 sphinx-rtd-theme==0.2.4
 boto3==1.15.9
 pycryptodome==3.9.8
 pywinrm==0.2.2
+jinja2==2.11.2
+MarkupSafe<2.0.0
diff --git a/docs/run_tests.rst b/docs/run_tests.rst
index 3ea89b335..a9941f956 100644
--- a/docs/run_tests.rst
+++ b/docs/run_tests.rst
@@ -8,9 +8,9 @@ Running Tests
 =============
 
 ducktape discovers and runs tests in the path(s) provided.
-You can specify a folder with tests, a specific test file or even a specific class or test method, via absolute or
-relative paths. You can optionally specify a specific set of parameters
-for tests with ``@parametrize`` or ``@matrix`` annotations::
+You can specify a folder with tests (all tests in Python modules named with a "test\_" prefix or "_test" suffix will be
+run), a specific test file (with any name), or even a specific class or test method, via absolute or relative paths.
+You can optionally specify a particular set of parameters for tests with ``@parametrize`` or ``@matrix`` annotations::
 
     ducktape <path_to_test_directory>   # e.g. ducktape dir/tests
     ducktape <path_to_test_file>        # e.g. ducktape dir/tests/my_test.py
diff --git a/ducktape/__init__.py b/ducktape/__init__.py
index a02276f75..b14fb08a8 100644
--- a/ducktape/__init__.py
+++ b/ducktape/__init__.py
@@ -1 +1 @@
-__version__ = '0.8.8'
+__version__ = '0.8.18'
diff --git a/ducktape/cluster/json.py b/ducktape/cluster/json.py
index 8f4b72d63..8e03b503d 100644
--- a/ducktape/cluster/json.py
+++ b/ducktape/cluster/json.py
@@ -90,7 +90,9 @@ def __init__(self, cluster_json=None, *args, **kwargs):
                     "Cluster json has a node without a ssh_config field: %s\n Cluster json: %s" % (ninfo, cluster_json)
 
             ssh_config = RemoteAccountSSHConfig(**ninfo.get("ssh_config", {}))
-            remote_account = JsonCluster.make_remote_account(ssh_config, ninfo.get("externally_routable_ip"))
+            remote_account = \
+                JsonCluster.make_remote_account(ssh_config, ninfo.get("externally_routable_ip"),
+                                                ssh_exception_checks=kwargs.get("ssh_exception_checks"))
             if remote_account.externally_routable_ip is None:
                 remote_account.externally_routable_ip = self._externally_routable_ip(remote_account)
             self._available_accounts.add_node(remote_account)
@@ -100,15 +102,13 @@ def __init__(self, cluster_json=None, *args, **kwargs):
         self._id_supplier = 0
 
     @staticmethod
-    def make_remote_account(ssh_config, externally_routable_ip=None):
+    def make_remote_account(ssh_config, *args, **kwargs):
        """Factory function for creating the correct RemoteAccount implementation."""
 
         if ssh_config.host and WINDOWS in ssh_config.host:
-            return WindowsRemoteAccount(ssh_config=ssh_config,
-                                        externally_routable_ip=externally_routable_ip)
+            return WindowsRemoteAccount(ssh_config, *args, **kwargs)
         else:
-            return LinuxRemoteAccount(ssh_config=ssh_config,
-                                      externally_routable_ip=externally_routable_ip)
+            return LinuxRemoteAccount(ssh_config, *args, **kwargs)
 
     def do_alloc(self, cluster_spec):
         allocated_accounts = self._available_accounts.remove_spec(cluster_spec)
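Since the factory now just forwards `*args`/`**kwargs`, checkers supplied when the cluster is constructed reach every remote account it creates; a hedged sketch (the host name and checker are illustrative, and constructing the cluster would actually try to reach the node):

```python
from ducktape.cluster.json import JsonCluster


def log_checker(error, remote_account):
    # any callable taking (exception, remote_account) works here
    print("ssh to {} failed: {}".format(remote_account, error))


# ssh_exception_checks is read out of **kwargs and handed to each
# LinuxRemoteAccount/WindowsRemoteAccount the factory builds
cluster = JsonCluster(
    cluster_json={"nodes": [{"ssh_config": {"host": "worker1"}}]},
    ssh_exception_checks=[log_checker],
)
```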
 
 from ducktape.cluster.cluster_spec import LINUX
-from ducktape.cluster.remoteaccount import RemoteAccount
+from ducktape.cluster.remoteaccount import RemoteAccount, RemoteAccountError
 
 
 class LinuxRemoteAccount(RemoteAccount):
 
-    def __init__(self, ssh_config, externally_routable_ip=None, logger=None):
-        super(LinuxRemoteAccount, self).__init__(ssh_config, externally_routable_ip=externally_routable_ip,
-                                                 logger=logger)
+    def __init__(self, *args, **kwargs):
+        super(LinuxRemoteAccount, self).__init__(*args, **kwargs)
         self._ssh_client = None
         self._sftp_client = None
         self.os = LINUX
@@ -31,11 +30,54 @@ def local(self):
         This is an imperfect heuristic, but should work for simple local testing."""
         return self.hostname == "localhost" and self.user is None and self.ssh_config is None
 
-    def fetch_externally_routable_ip(self, is_aws):
-        if is_aws:
-            cmd = "/sbin/ifconfig eth0 "
-        else:
-            cmd = "/sbin/ifconfig eth1 "
-        cmd += r"| grep 'inet ' | tail -n 1 | egrep -o '[0-9\.]+' | head -n 1 2>&1"
-        output = "".join(self.ssh_capture(cmd))
-        return output.strip()
+    def get_network_devices(self):
+        """Utility to get all network devices on a Linux account."""
+        return self.sftp_client.listdir('/sys/class/net')
+
+    def get_external_accessible_network_devices(self):
+        """Gets the subset of devices accessible through an external connection."""
+        return [
+            device
+            for device in self.get_network_devices()
+            if device != 'lo'  # do not include the local device
+            and (device.startswith("en") or device.startswith('eth'))  # filter out other devices; "en" means ethernet
+            # eth0 can also sometimes happen, see https://unix.stackexchange.com/q/134483
+        ]
+
+    # deprecated, please use the self.externally_routable_ip that is set in your cluster;
+    # not explicitly deprecating it as it's used by vagrant cluster
+    def fetch_externally_routable_ip(self, is_aws=None):
+        if is_aws is not None:
+            self.logger.warning("fetch_externally_routable_ip: is_aws is a deprecated flag, and does nothing")
+
+        devices = self.get_external_accessible_network_devices()
+
+        self.logger.debug("found devices: {}".format(devices))
+
+        if not devices:
+            raise RemoteAccountError(self, "Couldn't find any network devices")
+
+        fmt_cmd = (
+            "/sbin/ifconfig {device} | "
+            "grep 'inet ' | "
+            "tail -n 1 | "
+            r"egrep -o '[0-9\.]+' | "
+            "head -n 1 2>&1"
+        )
+
+        ips = [
+            "".join(
+                self.ssh_capture(fmt_cmd.format(device=device))
+            ).strip()
+            for device in devices
+        ]
+        self.logger.debug("found ips: {}".format(ips))
+        self.logger.debug("returning the first ip found")
+        return next(iter(ips))
diff --git a/ducktape/cluster/localhost.py b/ducktape/cluster/localhost.py
index b76c76673..b502a5624 100644
--- a/ducktape/cluster/localhost.py
+++ b/ducktape/cluster/localhost.py
@@ -32,7 +32,9 @@ def __init__(self, *args, **kwargs):
         self._available_nodes = NodeContainer()
         for i in range(num_nodes):
             ssh_config = RemoteAccountSSHConfig("localhost%d" % i, hostname="localhost", port=22)
-            self._available_nodes.add_node(ClusterNode(LinuxRemoteAccount(ssh_config)))
+            self._available_nodes.add_node(ClusterNode(
+                LinuxRemoteAccount(ssh_config,
+                                   ssh_exception_checks=kwargs.get("ssh_exception_checks"))))
         self._in_use_nodes = NodeContainer()
 
     def do_alloc(self, cluster_spec):
diff --git a/ducktape/cluster/remoteaccount.py b/ducktape/cluster/remoteaccount.py
index 95dc514cd..0b630cd37 100644
--- a/ducktape/cluster/remoteaccount.py
+++ b/ducktape/cluster/remoteaccount.py
@@ -18,20 +18,36 @@
 # Changing it due to https://github.com/redpanda-data/redpanda/issues/6792
 paramiko.packet.Packetizer.REKEY_BYTES = pow(2, 32)  # noqa
 
-from contextlib import contextmanager
-import logging
-import os
-from paramiko import SSHClient, SSHConfig, MissingHostKeyPolicy
-import shutil
-import signal
-import socket
-import stat
-import tempfile
-import warnings
-
-from ducktape.utils.http_utils import HttpMixin
-from ducktape.utils.util import wait_until
-from ducktape.errors import DucktapeError
+from contextlib import contextmanager  # noqa
+import logging  # noqa
+import os  # noqa
+from paramiko import SSHClient, SSHConfig, MissingHostKeyPolicy  # noqa
+from paramiko.ssh_exception import SSHException, NoValidConnectionsError  # noqa
+import shutil  # noqa
+import signal  # noqa
+import socket  # noqa
+import stat  # noqa
+import tempfile  # noqa
+import warnings  # noqa
+
+from ducktape.utils.http_utils import HttpMixin  # noqa
+from ducktape.utils.util import wait_until  # noqa
+from ducktape.errors import DucktapeError  # noqa
+
+
+def check_ssh(method):
+    def wrapper(self, *args, **kwargs):
+        try:
+            return method(self, *args, **kwargs)
+        except (SSHException, NoValidConnectionsError, socket.error) as e:
+            if self._custom_ssh_exception_checks:
+                self._log(logging.DEBUG, "caught ssh error", exc_info=True)
+                self._log(logging.DEBUG, "starting ssh checks:")
+                self._log(logging.DEBUG, "\n".join(repr(f) for f in self._custom_ssh_exception_checks))
+                for func in self._custom_ssh_exception_checks:
+                    func(e, self)
+            raise
+    return wrapper
 
 
 class RemoteAccountSSHConfig(object):
@@ -126,7 +142,7 @@ class RemoteAccount(HttpMixin):
     Each operating system has its own RemoteAccount implementation.
     """
 
-    def __init__(self, ssh_config, externally_routable_ip=None, logger=None):
+    def __init__(self, ssh_config, externally_routable_ip=None, logger=None, ssh_exception_checks=None):
         # Instance of RemoteAccountSSHConfig - use this instead of a dict, because we need the entire object to
         # be hashable
         self.ssh_config = ssh_config
@@ -145,6 +161,7 @@ def __init__(self, ssh_config, externally_routable_ip=None, logger=None):
         self.os = None
         self._ssh_client = None
         self._sftp_client = None
+        self._custom_ssh_exception_checks = ssh_exception_checks
 
     @property
     def operating_system(self):
@@ -165,6 +182,7 @@ def _log(self, level, msg, *args, **kwargs):
         msg = "%s: %s" % (str(self), msg)
         self.logger.log(level, msg, *args, **kwargs)
 
+    @check_ssh
     def _set_ssh_client(self):
         client = SSHClient()
         client.set_missing_host_key_policy(IgnoreMissingHostKeyPolicy())
@@ -256,6 +274,7 @@ def _can_ping_url(self, url, headers):
         except Exception:
             return False
 
+    @check_ssh
     def ssh(self, cmd, allow_fail=False):
         """Run the given command on the remote host, and block until the command has finished running.
@@ -289,6 +308,7 @@ def ssh(self, cmd, allow_fail=False):
 
         return exit_status
 
+    @check_ssh
     def ssh_capture(self, cmd, allow_fail=False, callback=None, combine_stderr=True, timeout_sec=None):
         """Run the given command asynchronously via ssh, and return an SSHOutputIter object.
@@ -342,6 +362,7 @@ def output_generator():
 
         return SSHOutputIter(output_generator, stdout)
 
+    @check_ssh
     def ssh_output(self, cmd, allow_fail=False, combine_stderr=True, timeout_sec=None):
         """Runs the command via SSH and captures the output, returning it as a string.
@@ -381,7 +402,7 @@ def ssh_output(self, cmd, allow_fail=False, combine_stderr=True, timeout_sec=Non
             stdin.close()
             stdout.close()
             stderr.close()
-
+        self._log(logging.DEBUG, "Returning ssh command output:\n%s" % stdoutdata)
         return stdoutdata
 
     def alive(self, pid):
@@ -493,6 +514,7 @@ def _re_anchor_basename(self, path, directory):
 
         return os.path.join(directory, path_basename)
 
+    @check_ssh
     def copy_from(self, src, dest):
         if os.path.isdir(dest):
             # dest is an existing directory, so assuming src looks like path/to/src_name,
@@ -519,6 +541,7 @@ def scp_to(self, src, dest, recursive=False):
         warnings.warn("scp_to is now deprecated. Please use copy_to")
         self.copy_to(src, dest)
 
+    @check_ssh
     def copy_to(self, src, dest):
 
         if self.isdir(dest):
@@ -543,6 +566,7 @@ def copy_to(self, src, dest):
                 # TODO what about uncopyable file types?
                 pass
 
+    @check_ssh
     def islink(self, path):
         try:
             # stat should follow symlinks
@@ -551,6 +575,7 @@ def islink(self, path):
         except Exception:
             return False
 
+    @check_ssh
     def isdir(self, path):
         try:
             # stat should follow symlinks
@@ -559,6 +584,7 @@ def isdir(self, path):
         except Exception:
             return False
 
+    @check_ssh
     def exists(self, path):
         """Test that the path exists, but don't follow symlinks."""
         try:
@@ -568,6 +594,7 @@ def exists(self, path):
         except IOError:
             return False
 
+    @check_ssh
     def isfile(self, path):
         """Imitates semantics of os.path.isfile
@@ -584,6 +611,7 @@ def isfile(self, path):
     def open(self, path, mode='r'):
         return self.sftp_client.open(path, mode)
 
+    @check_ssh
     def create_file(self, path, contents):
         """Create file at path, with the given contents.
@@ -591,13 +619,14 @@ def create_file(self, path, contents):
         """
         # TODO: what should semantics be if path exists? what actually happens if it already exists?
         # TODO: what happens if the base part of the path does not exist?
+
         with self.sftp_client.open(path, "w") as f:
             f.write(contents)
 
     _DEFAULT_PERMISSIONS = int('755', 8)
 
+    @check_ssh
     def mkdir(self, path, mode=_DEFAULT_PERMISSIONS):
-
         self.sftp_client.mkdir(path, mode)
 
     def mkdirs(self, path, mode=_DEFAULT_PERMISSIONS):
diff --git a/ducktape/cluster/vagrant.py b/ducktape/cluster/vagrant.py
index 462131a1a..71d72dc27 100644
--- a/ducktape/cluster/vagrant.py
+++ b/ducktape/cluster/vagrant.py
@@ -34,9 +34,8 @@ class VagrantCluster(JsonCluster):
     """
 
     def __init__(self, *args, **kwargs):
-        self._is_aws = None
         is_read_from_file = False
-
+        self.ssh_exception_checks = kwargs.get("ssh_exception_checks")
         cluster_file = kwargs.get("cluster_file")
         if cluster_file is not None:
             try:
@@ -51,7 +50,7 @@ def __init__(self, *args, **kwargs):
                 "nodes": self._get_nodes_from_vagrant()
             }
 
-        super(VagrantCluster, self).__init__(cluster_json)
+        super(VagrantCluster, self).__init__(cluster_json, *args, **kwargs)
 
         # If cluster file is specified but the cluster info is not read from it, write the cluster info into the file
         if not is_read_from_file and cluster_file is not None:
@@ -82,8 +81,8 @@ def _get_nodes_from_vagrant(self):
 
             account = None
             try:
-                account = JsonCluster.make_remote_account(ssh_config)
-                externally_routable_ip = account.fetch_externally_routable_ip(self.is_aws)
+                account = JsonCluster.make_remote_account(ssh_config, ssh_exception_checks=self.ssh_exception_checks)
+                externally_routable_ip = account.fetch_externally_routable_ip()
             finally:
                 if account:
                     account.close()
@@ -102,18 +101,3 @@ def _vagrant_ssh_config(self):
                                  # Force to text mode in py2/3 compatible way
                                  universal_newlines=True).communicate()
         return ssh_config_info, error
-
-    @property
-    def is_aws(self):
-        """Heuristic to detect whether the slave nodes are local or aws.
-
-        Return true if they are running on aws.
-        """
-        if self._is_aws is None:
-            proc = subprocess.Popen("vagrant status", shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
-                                    close_fds=True,
-                                    # Force to text mode in py2/3 compatible way
-                                    universal_newlines=True)
-            output, _ = proc.communicate()
-            self._is_aws = output.find("aws") >= 0
-        return self._is_aws
diff --git a/ducktape/cluster/windows_remoteaccount.py b/ducktape/cluster/windows_remoteaccount.py
index 68c7197ec..7979054c2 100644
--- a/ducktape/cluster/windows_remoteaccount.py
+++ b/ducktape/cluster/windows_remoteaccount.py
@@ -37,9 +37,8 @@ class WindowsRemoteAccount(RemoteAccount):
 
     WINRM_USERNAME = "Administrator"
 
-    def __init__(self, ssh_config, externally_routable_ip=None, logger=None):
-        super(WindowsRemoteAccount, self).__init__(ssh_config, externally_routable_ip=externally_routable_ip,
-                                                   logger=logger)
+    def __init__(self, *args, **kwargs):
+        super(WindowsRemoteAccount, self).__init__(*args, **kwargs)
 
         self.os = WINDOWS
         self._winrm_client = None
@@ -97,10 +96,7 @@ def winrm_client(self):
 
         return self._winrm_client
 
-    def fetch_externally_routable_ip(self, is_aws):
-        if not is_aws:
-            raise NotImplementedError("Windows is only supported in AWS.")
-
+    def fetch_externally_routable_ip(self, is_aws=None):
         # EC2 windows machines aren't given an externally routable IP. Use the hostname instead.
         return self.ssh_config.hostname
diff --git a/ducktape/command_line/main.py b/ducktape/command_line/main.py
index 4ff4ff613..2237fba8c 100644
--- a/ducktape/command_line/main.py
+++ b/ducktape/command_line/main.py
@@ -27,12 +27,13 @@
 from ducktape.tests.loader import TestLoader, LoaderException
 from ducktape.tests.loggermaker import close_logger
 from ducktape.tests.reporter import SimpleStdoutSummaryReporter, SimpleFileSummaryReporter, \
-    HTMLSummaryReporter, JSONReporter, JUnitReporter
+    HTMLSummaryReporter, JSONReporter, JUnitReporter, FailedTestSymbolReporter
 from ducktape.tests.runner import TestRunner
 from ducktape.tests.session import SessionContext, SessionLoggerMaker
 from ducktape.tests.session import generate_session_id, generate_results_dir
 from ducktape.utils.local_filesystem_utils import mkdir_p
 from ducktape.utils import persistence
+from ducktape.utils.util import load_function
 
 
 def get_user_defined_globals(globals_str):
@@ -143,6 +144,11 @@ def main():
             print("  " + str(test))
         sys.exit(0)
 
+    if args_dict["collect_num_nodes"]:
+        total_nodes = sum(test.expected_num_nodes for test in tests)
+        print(total_nodes)
+        sys.exit(0)
+
    if args_dict["sample"]:
         print("Running a sample of %d tests" % args_dict["sample"])
         try:
@@ -160,7 +166,14 @@ def main():
         (cluster_mod_name, cluster_class_name) = args_dict["cluster"].rsplit('.', 1)
         cluster_mod = importlib.import_module(cluster_mod_name)
         cluster_class = getattr(cluster_mod, cluster_class_name)
-        cluster = cluster_class(cluster_file=args_dict["cluster_file"])
+
+        cluster_kwargs = {"cluster_file": args_dict["cluster_file"]}
+        checker_function_names = args_dict['ssh_checker_function']
+        if checker_function_names:
+            checkers = [load_function(func_path) for func_path in checker_function_names]
+            if checkers:
+                cluster_kwargs['ssh_exception_checks'] = checkers
+        cluster = cluster_class(**cluster_kwargs)
         for ctx in tests:
             # Note that we're attaching a reference to cluster
             # only after test context objects have been instantiated
@@ -171,7 +184,11 @@ def main():
         sys.exit(1)
 
     # Run the tests
-    runner = TestRunner(cluster, session_context, session_logger, tests)
+    deflake_num = args_dict['deflake']
+    if deflake_num < 1:
+        session_logger.warning("the number of deflake runs was specified to be less than 1, running without deflake.")
+        deflake_num = max(1, deflake_num)
+    runner = TestRunner(cluster, session_context, session_logger, tests, deflake_num)
     test_results = runner.run_all_tests()
     test_results.command_line = " ".join(sys.argv)
 
     # Report results
@@ -180,7 +197,8 @@ def main():
         SimpleFileSummaryReporter(test_results),
         HTMLSummaryReporter(test_results),
         JSONReporter(test_results),
-        JUnitReporter(test_results)
+        JUnitReporter(test_results),
+        FailedTestSymbolReporter(test_results)
     ]
 
     for r in reporters:
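Together with the parser changes below, the new behavior is driven entirely from the command line; a usage sketch (the test path is illustrative):

```shell
# print the total number of nodes the collected tests would need, then exit
ducktape tests/ --collect-num-nodes

# rerun each failed test up to 3 times in total; a pass on a rerun is reported as flaky
ducktape tests/ --deflake 3

# run custom diagnostics whenever ssh to a worker fails
ducktape tests/ --ssh-checker-function my.module.validator.validate_ssh
```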
parser.add_argument("--config-file", action="store", default=ConsoleDefaults.USER_CONFIG_FILE, help="path to project-specific configuration file.") @@ -81,6 +83,16 @@ def create_ducktape_parser(): parser.add_argument("--test-runner-timeout", action="store", type=int, default=1800000, help="Amount of time in milliseconds between test communicating between the test runner" " before a timeout error occurs. Default is 30 minutes") + parser.add_argument("--ssh-checker-function", action="store", type=str, nargs="+", + help="Python module path(s) to a function that takes an exception and a remote account" + " that will be called when an ssh error occurs, this can give some " + "validation or better logging when an ssh error occurs. Specify any " + "number of module paths after this flag to be called."), + parser.add_argument("--deflake", action="store", type=int, default=1, + help="the number of times a failed test should be ran in total (including its initial run) " + "to determine flakyness. When not present, deflake will not be used, " + "and a test will be marked as either passed or failed. " + "When enabled tests will be marked as flaky if it passes on any of the reruns") return parser diff --git a/ducktape/services/service.py b/ducktape/services/service.py index cad8c810a..3400784ee 100644 --- a/ducktape/services/service.py +++ b/ducktape/services/service.py @@ -23,6 +23,31 @@ import time +class ServiceIdFactory: + def generate_service_id(self, service): + return "{service_name}-{service_number}-{service_id}".format( + service_name=service.__class__.__name__, + service_number=service._order, + service_id=id(service) + ) + + +class MultiRunServiceIdFactory: + def __init__(self, run_number=1): + self.run_number = run_number + + def generate_service_id(self, service): + return "{run_number}-{service_name}-{service_number}-{service_id}".format( + run_number=self.run_number, + service_name=service.__class__.__name__, + service_number=service._order, + service_id=id(service) + ) + + +service_id_factory = ServiceIdFactory() + + class Service(TemplateRenderer): """Service classes know how to deploy a service onto a set of nodes and then clean up after themselves. @@ -72,6 +97,7 @@ def __init__(self, context, num_nodes=None, cluster_spec=None, *args, **kwargs): self._clean_time = -1 self._initialized = False + self.service_id_factory = service_id_factory self.cluster_spec = Service.setup_cluster_spec(num_nodes=num_nodes, cluster_spec=cluster_spec) self.context = context @@ -125,7 +151,7 @@ def local_scratch_dir(self): @property def service_id(self): """Human-readable identifier (almost certainly) unique within a test run.""" - return "%s-%d-%d" % (self.__class__.__name__, self._order, id(self)) + return self.service_id_factory.generate_service_id(self) @property def _order(self): @@ -307,13 +333,12 @@ def clean_node(self, node, **kwargs): def free(self): """Free each node. 
        This 'deallocates' the nodes so the cluster can assign them to other services."""
-        for node in self.nodes:
+        while self.nodes:
+            node = self.nodes.pop()
             self.logger.info("%s: freeing node" % self.who_am_i(node))
             node.account.logger = None
             self.cluster.free(node)
 
-        self.nodes = []
-
     def run(self):
         """Helper that executes start(), wait(), and stop() in sequence."""
         self.start()
diff --git a/ducktape/services/service_registry.py b/ducktape/services/service_registry.py
index ce2960018..24d3d315a 100644
--- a/ducktape/services/service_registry.py
+++ b/ducktape/services/service_registry.py
@@ -38,7 +38,7 @@ def append(self, service):
         self._nodes[id(service)] = [str(n.account) for n in service.nodes]
 
     def to_json(self):
-        return [self._services[k].to_json() for k in self._services]
+        return [service.to_json() for service in self._services.values()]
 
     def stop_all(self):
         """Stop all currently registered services in the reverse of the order in which they were added.
@@ -84,6 +84,8 @@ def free_all(self):
 
         if keyboard_interrupt is not None:
             raise keyboard_interrupt
+        self._services.clear()
+        self._nodes.clear()
 
     def min_cluster_spec(self):
         """
@@ -99,8 +101,8 @@ def errors(self):
         """
         Gets a printable string containing any errors produced by the services.
         """
-        all_errors = []
-        for service in self._services.values():
-            if hasattr(service, 'error') and service.error:
-                all_errors.append("%s: %s" % (service.who_am_i(), service.error))
-        return '\n\n'.join(all_errors)
+        return '\n\n'.join(
+            "{}: {}".format(service.who_am_i(), service.error)
+            for service in self._services.values()
+            if hasattr(service, 'error') and service.error
+        )
diff --git a/ducktape/templates/report/report.css b/ducktape/templates/report/report.css
index 27587df4a..8d41cbde2 100644
--- a/ducktape/templates/report/report.css
+++ b/ducktape/templates/report/report.css
@@ -33,6 +33,13 @@ h1, h2, h3, h4, h5, h6 {
     padding: 2px;
 }
 
+.pre_stack_trace {
+    white-space: pre-wrap;       /* Since CSS 2.1 */
+    white-space: -moz-pre-wrap;  /* Mozilla, since 1999 */
+    white-space: -o-pre-wrap;    /* Opera 7 */
+    word-wrap: break-word;       /* Internet Explorer 5.5+ */
+}
+
 .header_row {
     font-weight: bold;
     color: white;
@@ -59,6 +66,10 @@ h1, h2, h3, h4, h5, h6 {
     background-color: #6c6;
 }
 
+.flaky {
+    background-color: #dd2;
+}
+
 .fail {
     background-color: #c60;
 }
diff --git a/ducktape/templates/report/report.html b/ducktape/templates/report/report.html
index 12b262331..dafa34c71 100644
--- a/ducktape/templates/report/report.html
+++ b/ducktape/templates/report/report.html
@@ -11,6 +11,8 @@
+    <div id="flaky_test_panel"></div>
+
@@ -41,6 +43,7 @@
       <td>{this.props.summary_prop.tests}</td>
       <td>{this.props.summary_prop.passes}</td>
+      <td>{this.props.summary_prop.flaky}</td>
       <td>{this.props.summary_prop.failures}</td>
       <td>{this.props.summary_prop.ignored}</td>
       <td>{this.props.summary_prop.opassed}</td>
@@ -59,6 +62,7 @@
       <th>Tests</th>
       <th>Passes</th>
+      <th>Flaky</th>
       <th>Failures</th>
       <th>Ignored</th>
       <th>OPassed</th>
@@ -94,6 +98,7 @@
       <td>{this.props.test.description}</td>
       <td>{this.props.test.run_time}</td>
       <td>{this.props.test.data}</td>
+      <td>{this.props.test.summary}</td>
       {detailCol}
   );
@@ -112,6 +117,7 @@
       <th>Description</th>
       <th>Time</th>
       <th>Data</th>
+      <th>Summary</th>
       <th>Detail</th>
@@ -182,6 +188,7 @@
   {this.props.title}
   SUMMARY=[{
     "tests": %(num_tests)d,
     "passes": %(num_passes)d,
+    "flaky": %(num_flaky)d,
     "failures": %(num_failures)d,
     "ignored": %(num_ignored)d,
     "opassed": %(num_opassed)d,
@@ -198,6 +205,7 @@
   {this.props.title}
   COLOR_KEYS=[%(test_status_names)s];
   PASSED_TESTS=[%(passed_tests)s];
+  FLAKY_TESTS=[%(flaky_tests)s];
   FAILED_TESTS=[%(failed_tests)s];
   IGNORED_TESTS=[%(ignored_tests)s];
   OPASSED_TESTS=[%(opassed_tests)s];
@@ -208,6 +216,7 @@
   {this.props.title}
   React.render(, document.getElementById('summary_panel'));
   React.render(, document.getElementById('failed_test_panel'));
   React.render(, document.getElementById('ignored_test_panel'));
+  React.render(, document.getElementById('flaky_test_panel'));
   React.render(, document.getElementById('opassed_test_panel'));
   React.render(, document.getElementById('ofailed_test_panel'));
   React.render(, document.getElementById('passed_test_panel'));
diff --git a/ducktape/tests/loader.py b/ducktape/tests/loader.py
index a7419be92..df2d1314f 100644
--- a/ducktape/tests/loader.py
+++ b/ducktape/tests/loader.py
@@ -389,18 +389,22 @@ def _find_test_files(self, path_or_glob):
         """
         test_files = []
         self.logger.debug('Looking for test files in {}'.format(path_or_glob))
+        # glob is safe to be called on a non-glob path - it would just return that same path wrapped in a list
         expanded_glob = glob.glob(path_or_glob)
         self.logger.debug('Expanded {} into {}'.format(path_or_glob, expanded_glob))
-        # glob is safe to be called on non-glob path - it would just return that same path wrapped in a list
+
+        def maybe_add_test_file(f):
+            if self._is_test_file(f):
+                test_files.append(f)
+            else:
+                self.logger.debug("Skipping {} because it isn't a test file".format(f))
+
         for path in expanded_glob:
             if not os.path.exists(path):
                 raise LoaderException('Path {} does not exist'.format(path))
             self.logger.debug('Checking {}'.format(path))
             if os.path.isfile(path):
-                if self._is_test_file(path):
-                    test_files.append(os.path.abspath(path))
-                else:
-                    self.logger.debug("Skipping {} because it isn't a test file".format(path))
+                maybe_add_test_file(path)
             elif os.path.isdir(path):
                 for pwd, dirs, files in os.walk(path):
                     if "__init__.py" not in files:
@@ -408,10 +412,7 @@ def _find_test_files(self, path_or_glob):
                         continue
                     for f in files:
                         file_path = os.path.abspath(os.path.join(pwd, f))
-                        if self._is_test_file(file_path):
-                            test_files.append(file_path)
-                        else:
-                            self.logger.debug("Skipping {} because it isn't a test file".format(file_path))
+                        maybe_add_test_file(file_path)
             else:
                 raise LoaderException("Got a path that we don't understand: " + path)
@@ -559,7 +560,13 @@ def _load_test_contexts(self, test_discovery_symbols, base_dir=None):
             path_or_glob = os.path.abspath(path_or_glob)
 
             # TODO: consider adding a check to ensure glob or dir is not used together with cls_name and method
-            test_files = self._find_test_files(path_or_glob)
+            if os.path.isfile(path_or_glob):
+                # if it is a single file, just add it directly - https://github.com/confluentinc/ducktape/issues/284
+                test_files = [path_or_glob]
+            else:
+                # otherwise, when dealing with a dir or a glob, apply pattern matching rules
+                test_files = self._find_test_files(path_or_glob)
 
             self._add_top_level_dirs_to_sys_path(test_files)
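The loader change above means an explicit file path now bypasses the module-name filter, while directories still apply it; a usage sketch (paths illustrative):

```shell
ducktape dir/tests                 # directory: only modules named test_*.py or *_test.py are discovered
ducktape dir/tests/my_checks.py    # single file: loaded even though its name does not match the pattern
```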
run: %d" % len(self.results), "passed: %d" % self.results.num_passed, + "flaky: %d" % self.results.num_flaky, "failed: %d" % self.results.num_failed, "ignored: %d" % self.results.num_ignored, "opassed: %d" % self.results.num_opassed, @@ -203,7 +207,7 @@ def report(self): testsuite['skipped'] += 1 total = self.results.num_failed + self.results.num_ignored + self.results.num_ofailed + \ - self.results.num_opassed + self.results.num_passed + self.results.num_opassed + self.results.num_passed + self.results.num_flaky # Now start building XML document root = ET.Element('testsuites', attrib=dict( name="ducktape", time=str(self.results.run_time_seconds), @@ -265,6 +269,7 @@ def format_result(self, result): "run_time": format_time(result.run_time_seconds), "data": "" if result.data is None else json.dumps(result.data, sort_keys=True, indent=2, separators=(',', ': ')), + "summary": result.summary, "test_log": self.test_results_dir(result) } return result_json @@ -286,48 +291,57 @@ def format_report(self): num_tests = len(self.results) num_passes = 0 - failed_result_string = "" - passed_result_string = "" - ignored_result_string = "" - opassed_result_string = "" - ofailed_result_string = "" + failed_result_string = [] + passed_result_string = [] + ignored_result_string = [] + flaky_result_string = [] + opassed_result_string = [] + ofailed_result_string = [] for result in self.results: json_string = json.dumps(self.format_result(result)) if result.test_status == PASS: num_passes += 1 - passed_result_string += json_string - passed_result_string += "," + passed_result_string.append(json_string) + passed_result_string.append(",") elif result.test_status == FAIL: - failed_result_string += json_string - failed_result_string += "," + failed_result_string.append(json_string) + failed_result_string.append(",") + elif result.test_status == IGNORE: + ignored_result_string.append(json_string) + ignored_result_string.append(",") + elif result.test_status == FLAKY: + flaky_result_string.append(json_string) + flaky_result_string.append(",") elif result.test_status == OPASS: - opassed_result_string += json_string - opassed_result_string += "," + opassed_result_string.append(json_string) + opassed_result_string.append(",") elif result.test_status == OFAIL: - ofailed_result_string += json_string - ofailed_result_string += "," + ofailed_result_string.append(json_string) + ofailed_result_string.append(",") else: - ignored_result_string += json_string - ignored_result_string += "," + raise Exception("Unknown test status in report: {}".format(result.test_status.to_json())) args = { 'ducktape_version': ducktape_version(), 'command_line': self.results.command_line, 'num_tests': num_tests, 'num_passes': self.results.num_passed, + 'num_flaky': self.results.num_flaky, 'num_failures': self.results.num_failed, 'num_ignored': self.results.num_ignored, 'num_opassed': self.results.num_opassed, 'num_ofailed': self.results.num_ofailed, 'run_time': format_time(self.results.run_time_seconds), 'session': self.results.session_context.session_id, - 'passed_tests': passed_result_string, - 'failed_tests': failed_result_string, - 'ignored_tests': ignored_result_string, - 'ofailed_tests': ofailed_result_string, - 'opassed_tests': opassed_result_string, - 'test_status_names': ",".join(["\'%s\'" % str(status) for status in [PASS, FAIL, IGNORE, OPASS, OFAIL]]) + 'passed_tests': "".join(passed_result_string), + 'flaky_tests': "".join(flaky_result_string), + 'failed_tests': "".join(failed_result_string), + 'ignored_tests': 
"".join(ignored_result_string), + 'ofailed_tests': "".join(ofailed_result_string), + 'opassed_tests': "".join(opassed_result_string), + 'test_status_names': ",".join(["\'%s\'" % str(status) for status in + [PASS, FAIL, IGNORE, FLAKY, OPASS, OFAIL]]) } html = template % args @@ -342,3 +356,43 @@ def format_report(self): def report(self): self.format_report() + + +class FailedTestSymbolReporter(SummaryReporter): + + def __init__(self, results): + super().__init__(results) + self.working_dir = Path().absolute() + self.separator = "=" * self.width + + def to_symbol(self, result): + p = Path(result.file_name).relative_to(self.working_dir) + line = f'{p}::{result.cls_name}.{result.function_name}' + if result.injected_args: + injected_args_str = json.dumps(result.injected_args, separators=(',', ':')) + line += f'@{injected_args_str}' + return line + + def dump_test_suite(self, lines): + print(self.separator) + print('FAILED TEST SUITE') + suite = {self.results.session_context.session_id: lines} + file_path = Path(self.results.session_context.results_dir) / "rerun-failed.yml" + with file_path.open('w') as fp: + print(f'Test suite to rerun failed tests: {file_path}') + yaml.dump(suite, stream=fp, indent=4) + + def print_test_symbols_string(self, lines): + print(self.separator) + print('FAILED TEST SYMBOLS') + print('Pass the test symbols below to your ducktape run') + # quote the symbol because json parameters will be processed by shell otherwise, making it not copy-pasteable + print(' '.join([f"'{line}'" for line in lines])) + + def report(self): + symbols = [self.to_symbol(result) for result in self.results if result.test_status == FAIL] + if not symbols: + return + + self.dump_test_suite(symbols) + self.print_test_symbols_string(symbols) diff --git a/ducktape/tests/result.py b/ducktape/tests/result.py index 9c75815eb..bf9e5ebcb 100644 --- a/ducktape/tests/result.py +++ b/ducktape/tests/result.py @@ -21,7 +21,7 @@ from ducktape.tests.reporter import SingleResultFileReporter from ducktape.utils.local_filesystem_utils import mkdir_p from ducktape.utils.util import ducktape_version -from ducktape.tests.status import PASS, FAIL, IGNORE, OPASS, OFAIL +from ducktape.tests.status import FLAKY, PASS, FAIL, IGNORE, OPASS, OFAIL class TestResult(object): @@ -64,6 +64,7 @@ def __init__(self, self.test_status = test_status self.summary = summary self.data = data + self.file_name = test_context.file self.base_results_dir = session_context.results_dir if not self.results_dir.endswith(os.path.sep): @@ -162,6 +163,11 @@ def num_ignored(self): return len([r for r in self._results if r.test_status == IGNORE]) @property + def num_flaky(self): + return len([r for r in self._results if r.test_status == FLAKY]) + + @property + def num_opassed(self): return len([r for r in self._results if r.test_status == OPASS]) @@ -211,8 +217,7 @@ def to_json(self): cluster_utilization = (1.0 / len(self.cluster)) * (1.0 / self.run_time_seconds) * \ sum([r.nodes_used * r.run_time_seconds for r in self]) parallelism = sum([r.run_time_seconds for r in self._results]) / self.run_time_seconds - - return { + result = { "ducktape_version": ducktape_version(), "session_context": self.session_context, "run_time_seconds": self.run_time_seconds, @@ -231,3 +236,6 @@ def to_json(self): "parallelism": parallelism, "results": [r for r in self._results] } + if self.num_flaky: + result['num_flaky'] = self.num_flaky + return result diff --git a/ducktape/tests/runner.py b/ducktape/tests/runner.py index 8d1f0b33f..466581533 100644 --- 
a/ducktape/tests/runner.py +++ b/ducktape/tests/runner.py @@ -84,7 +84,7 @@ class TestRunner(object): # When set to True, the test runner will finish running/cleaning the current test, but it will not run any more stop_testing = False - def __init__(self, cluster, session_context, session_logger, tests, + def __init__(self, cluster, session_context, session_logger, tests, deflake_num, min_port=ConsoleDefaults.TEST_DRIVER_MIN_PORT, max_port=ConsoleDefaults.TEST_DRIVER_MAX_PORT): @@ -101,6 +101,8 @@ def __init__(self, cluster, session_context, session_logger, tests, self.hostname = "localhost" self.receiver = Receiver(min_port, max_port) + self.deflake_num = deflake_num + self.session_context = session_context self.max_parallel = session_context.max_parallel self.results = TestResults(self.session_context, self.cluster) @@ -249,7 +251,8 @@ def _run_single_test(self, test_context): TestContext.logger_name(test_context, current_test_counter), TestContext.results_dir(test_context, current_test_counter), self.session_context.debug, - self.session_context.fail_bad_cluster_utilization + self.session_context.fail_bad_cluster_utilization, + self.deflake_num ]) self._client_procs[test_key] = proc diff --git a/ducktape/tests/runner_client.py b/ducktape/tests/runner_client.py index 7332d289e..8abb70423 100644 --- a/ducktape/tests/runner_client.py +++ b/ducktape/tests/runner_client.py @@ -20,10 +20,13 @@ import zmq from six import iteritems +from ducktape.services.service import MultiRunServiceIdFactory, service_id_factory +from ducktape.services.service_registry import ServiceRegistry from ducktape.tests.event import ClientEventFactory from ducktape.tests.loader import TestLoader from ducktape.tests.serde import SerDe +from ducktape.tests.status import FLAKY from ducktape.tests.test import test_logger, TestContext from ducktape.tests.result import TestResult, IGNORE, PASS, FAIL, OPASS, OFAIL @@ -39,7 +42,7 @@ class RunnerClient(object): """Run a single test""" def __init__(self, server_hostname, server_port, test_id, - test_index, logger_name, log_dir, debug, fail_bad_cluster_utilization): + test_index, logger_name, log_dir, debug, fail_bad_cluster_utilization, deflake_num): signal.signal(signal.SIGTERM, self._sigterm_handler) # register a SIGTERM handler self.serde = SerDe() @@ -58,9 +61,12 @@ def __init__(self, server_hostname, server_port, test_id, self.test_metadata = ready_reply["test_metadata"] self.cluster = ready_reply["cluster"] + self.deflake_num = deflake_num + # Wait to instantiate the test object until running the test self.test = None self.test_context = None + self.all_services = None def send(self, event): return self.sender.send(event) @@ -102,70 +108,40 @@ def run(self): start_time = -1 stop_time = -1 - test_status = PASS - summary = "" + test_status = FAIL + summary = [] data = None + self.all_services = ServiceRegistry() + + num_runs = 0 try: - # Results from this test, as well as logs will be dumped here - mkdir_p(TestContext.results_dir(self.test_context, self.test_index)) - # Instantiate test - self.test = self.test_context.cls(self.test_context) + while test_status == FAIL and num_runs < self.deflake_num: + num_runs += 1 + self.log(logging.INFO, "on run {}/{}".format(num_runs, self.deflake_num)) + start_time = time.time() + test_status, summary, data = self._do_run(num_runs) - self.log(logging.DEBUG, "Checking if there are enough nodes...") - min_cluster_spec = self.test.min_cluster_spec() - os_to_num_nodes = {} - for node_spec in min_cluster_spec: - if not 
os_to_num_nodes.get(node_spec.operating_system): - os_to_num_nodes[node_spec.operating_system] = 1 - else: - os_to_num_nodes[node_spec.operating_system] = os_to_num_nodes[node_spec.operating_system] + 1 - for (operating_system, node_count) in iteritems(os_to_num_nodes): - num_avail = len(list(self.cluster.all().nodes.elements(operating_system=operating_system))) - if node_count > num_avail: - raise RuntimeError( - "There are not enough nodes available in the cluster to run this test. " - "Cluster size for %s: %d, Need at least: %d. Services currently registered: %s" % - (operating_system, num_avail, node_count, self.test_context.services)) + if test_status == PASS and num_runs > 1: + test_status = FLAKY - # Run the test unit - start_time = time.time() - self.setup_test() - - data = self.run_test() + msg = str(test_status.to_json()) + if summary: + msg += ": {}".format(summary) + if num_runs != self.deflake_num: + msg += "\n" + "~" * max(len(line) for line in summary.split('\n')) - if self.test_context.ok_to_fail: - test_status = OPASS - self.log(logging.INFO, "OPASS") - else: - test_status = PASS - self.log(logging.INFO, "PASS") - - except BaseException as e: - if self.test_context.ok_to_fail: - test_status = OFAIL - err_trace = self._exc_msg(e) - summary += err_trace - self.log(logging.INFO, "OFAIL: " + err_trace) - else: - # mark the test as failed before doing anything else - test_status = FAIL - err_trace = self._exc_msg(e) - summary += err_trace - self.log(logging.INFO, "FAIL: " + err_trace) + self.log(logging.INFO, msg) finally: - self.teardown_test(teardown_services=not self.session_context.no_teardown, test_status=test_status) - stop_time = time.time() - if hasattr(self, "services"): - service_errors = self.test_context.services.errors() - if service_errors: - summary += "\n\n" + service_errors - test_status, summary = self._check_cluster_utilization(test_status, summary) + if num_runs > 1: + # for reporting purposes report all services + self.test_context.services = self.all_services + # for flaky tests, we report the start and end time of the successful run, and not the whole run period result = TestResult( self.test_context, self.test_index, @@ -176,7 +152,6 @@ def run(self): start_time, stop_time) - self.log(logging.INFO, "Summary: %s" % str(result.summary)) self.log(logging.INFO, "Data: %s" % str(result.data)) result.report() @@ -186,9 +161,76 @@ def run(self): # Release test_context resources only after creating the result and finishing logging activity # The Sender object uses the same logger, so we postpone closing until after the finished message is sent self.test_context.close() + self.all_services = None self.test_context = None self.test = None + def _do_run(self, num_runs): + test_status = FAIL + summary = [] + data = None + sid_factory = MultiRunServiceIdFactory(num_runs) if self.deflake_num > 1 else service_id_factory + try: + # Results from this test, as well as logs will be dumped here + mkdir_p(TestContext.results_dir(self.test_context, self.test_index)) + # Instantiate test + self.test = self.test_context.cls(self.test_context) + # Check if there are enough nodes + self._check_min_cluster_spec() + # Run the test unit + + self.setup_test() + + data = self.run_test() + + if self.test_context.ok_to_fail: + test_status = OPASS + else: + test_status = PASS + + except BaseException as e: + if self.test_context.ok_to_fail: + test_status = OFAIL + else: + test_status = FAIL + err_trace = self._exc_msg(e) + summary.append(err_trace) + + finally: + for service in 
+                service.service_id_factory = sid_factory
+                self.all_services.append(service)
+
+            self.teardown_test(teardown_services=not self.session_context.no_teardown, test_status=test_status)
+
+            if hasattr(self.test_context, "services"):
+                service_errors = self.test_context.services.errors()
+                if service_errors:
+                    summary.extend(["\n\n", service_errors])
+
+            # free nodes
+            if self.test:
+                self.log(logging.DEBUG, "Freeing nodes...")
+                self._do_safely(self.test.free_nodes, "Error freeing nodes:")
+
+        return test_status, "".join(summary), data
+
+    def _check_min_cluster_spec(self):
+        self.log(logging.DEBUG, "Checking if there are enough nodes...")
+        min_cluster_spec = self.test.min_cluster_spec()
+        os_to_num_nodes = {}
+        for node_spec in min_cluster_spec:
+            if not os_to_num_nodes.get(node_spec.operating_system):
+                os_to_num_nodes[node_spec.operating_system] = 1
+            else:
+                os_to_num_nodes[node_spec.operating_system] = os_to_num_nodes[node_spec.operating_system] + 1
+        for (operating_system, node_count) in iteritems(os_to_num_nodes):
+            num_avail = len(list(self.cluster.all().nodes.elements(operating_system=operating_system)))
+            if node_count > num_avail:
+                raise RuntimeError(
+                    "There are not enough nodes available in the cluster to run this test. "
+                    "Cluster size for %s: %d, Need at least: %d. Services currently registered: %s" %
+                    (operating_system, num_avail, node_count, self.test_context.services))
+
     def _check_cluster_utilization(self, result, summary):
         """Checks if the number of nodes used by a test is less than the number
         of nodes requested by the test. If this is the case and we wish to fail
@@ -201,7 +243,7 @@ def _check_cluster_utilization(self, result, summary):
             message = "Test requested %d nodes, used only %d" % (total, max_used)
             if self.fail_bad_cluster_utilization:
                 # only check node utilization on test pass
-                if result == PASS:
+                if result == PASS or result == FLAKY:
                     self.log(logging.INFO, "FAIL: " + message)
                     result = FAIL
                 elif result == OPASS:
@@ -264,9 +306,6 @@ def teardown_test(self, teardown_services=True, test_status=None):
             self.log(logging.DEBUG, "Cleaning up services...")
             self._do_safely(services.clean_all, "Error cleaning services:")
 
-        self.log(logging.DEBUG, "Freeing nodes...")
-        self._do_safely(self.test.free_nodes, "Error freeing nodes:")
-
     def log(self, log_level, msg, *args, **kwargs):
         """Log to the service log and the test log of the current test."""
diff --git a/ducktape/tests/status.py b/ducktape/tests/status.py
index 56c1a0dbf..0264af2e9 100644
--- a/ducktape/tests/status.py
+++ b/ducktape/tests/status.py
@@ -28,6 +28,7 @@ def to_json(self):
 
 
 PASS = TestStatus("pass")
+FLAKY = TestStatus("flaky")
 FAIL = TestStatus("fail")
 IGNORE = TestStatus("ignore")
 OPASS = TestStatus("opass")
diff --git a/ducktape/utils/util.py b/ducktape/utils/util.py
index c9563a64c..2b2c606c3 100644
--- a/ducktape/utils/util.py
+++ b/ducktape/utils/util.py
@@ -69,3 +69,13 @@ def package_is_installed(package_name):
 def ducktape_version():
     """Return string representation of current ducktape version."""
     return __ducktape_version__
+
+
+def load_function(func_module_path):
+    """Loads and returns a function from a module path separated by '.'s"""
+    module, function = func_module_path.rsplit(".", 1)
+    try:
+        return getattr(importlib.import_module(module), function)
+    except AttributeError:
+        raise Exception("Function could not be loaded from the module path {}, "
+                        "verify that it is '.' separated".format(func_module_path))
diff --git a/requirements-test.txt b/requirements-test.txt
new file mode 100644
index 000000000..af59e6420
--- /dev/null
+++ b/requirements-test.txt
@@ -0,0 +1,10 @@
+pytest==6.2.0
+# 4.0 drops py27 support
+mock==4.0.2
+psutil==5.7.2
+memory_profiler==0.57
+statistics==1.0.3.5
+requests-testadapter==0.3.0
+flake8~=4.0.0
+pytest-cov~=3.0
+pytest-xdist~=2.5
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 000000000..9b34c2226
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,16 @@
+jinja2==2.11.2
+boto3==1.26.62
+# jinja2 pulls in MarkupSafe with a > constraint, but we need to constrain it for compatibility
+MarkupSafe<2.0.0
+pyparsing<3.0.0
+zipp<2.0.0
+pywinrm==0.2.2
+requests==2.24.0
+paramiko~=2.11.0
+pyzmq==19.0.2
+pycryptodome==3.9.8
+# > 5.0 drops py27 support
+more-itertools==5.0.0
+tox==3.20.0
+six==1.15.0
+PyYAML==6.0
diff --git a/setup.py b/setup.py
index f40bd102f..6c3389b28 100644
--- a/setup.py
+++ b/setup.py
@@ -32,6 +32,9 @@ def run_tests(self):
         sys.exit(errno)
 
 
+test_req = open('requirements-test.txt').read()
+
+
 setup(name="ducktape",
       version=version,
       description="Distributed system test tools",
@@ -45,29 +48,9 @@ def run_tests(self):
       packages=find_packages(),
       package_data={'ducktape': ['templates/report/*']},
       python_requires='>= 3.6',
-      install_requires=['jinja2==2.11.2',
-                        'boto3==1.26.62',
-                        # jinja2 pulls in MarkupSafe with a > constraint, but we need to constrain it for compatibility
-                        'MarkupSafe<2.0.0',
-                        'pyparsing<3.0.0',
-                        'zipp<2.0.0',
-                        'pywinrm==0.2.2',
-                        'requests==2.24.0',
-                        'paramiko~=2.11.0',
-                        'pyzmq==19.0.2',
-                        'pycryptodome==3.9.8',
-                        # > 5.0 drops py27 support
-                        'more-itertools==5.0.0',
-                        'tox==3.20.0',
-                        'six==1.15.0',
-                        'PyYAML==6.0'],
-      tests_require=['pytest==6.1.0',
-                     # 4.0 drops py27 support
-                     'mock==4.0.2',
-                     'psutil==5.7.2',
-                     'memory_profiler==0.57',
-                     'statistics==1.0.3.5',
-                     'requests-testadapter==0.3.0'],
+      install_requires=open('requirements.txt').read(),
+      tests_require=test_req,
+      extras_require={'test': test_req},
       setup_requires=['flake8==3.8.3'],
       cmdclass={'test': PyTest},
       )
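Because the test-only requirements are now exposed through `extras_require`, they can be installed together with the package itself; a sketch, run from a checkout of the repository:

```shell
# editable install of ducktape plus the dependencies from requirements-test.txt
pip install -e '.[test]'
```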
+ """ + def setup(self): + self.service = GenericService(self.test_context, 1) + + @cluster(num_nodes=1) + @matrix(string_param=['success-first', 'fail-second', 'fail-third'], int_param=[10, 20, -30]) + def matrix_test(self, string_param, int_param): + assert not string_param.startswith('fail') and int_param > 0 + + @cluster(num_nodes=1) + @parametrize(string_param='success-first', int_param=10) + @parametrize(string_param='fail-second', int_param=-10) + def parametrized_test(self, string_param, int_param): + assert not string_param.startswith('fail') and int_param > 0 + + @cluster(num_nodes=1) + def failing_test(self): + assert False + + @cluster(num_nodes=1) + def successful_test(self): + assert True + + class FileSystemTest(Test): """ Note that in an attempt to isolate the file system methods, validation should be done with ssh/shell commands. @@ -420,6 +448,18 @@ def test_create_two_node_service(self): for node in self.service.nodes: node.account.ssh("echo hi") + @cluster(cluster_spec=ClusterSpec.from_nodes( + [ + NodeSpec(operating_system=WINDOWS), + NodeSpec(operating_system=LINUX), + NodeSpec() # this one is also linux + ] + )) + def three_nodes_test(self): + self.service = GenericService(self.test_context, 3) + for node in self.service.nodes: + node.account.ssh("echo hi") + class RemoteAccountTest(Test): def __init__(self, test_context): @@ -429,6 +469,10 @@ def __init__(self, test_context): def setup(self): self.account_service.start() + @cluster(num_nodes=1) + def test_flaky(self): + assert random.choice([True, False, False]) + @cluster(num_nodes=1) def test_ssh_capture_combine_stderr(self): """Test that ssh_capture correctly captures stderr and stdout from remote process. @@ -535,9 +579,10 @@ def test_monitor_log_exception(self): @cluster(num_nodes=1) def test_kill_process(self): """Tests that kill_process correctly works""" + grep_str = '"nc -l -p 5000"' def get_pids(): - pid_cmd = "ps ax | grep -i nc | grep -v grep | awk '{print $1}'" + pid_cmd = f"ps ax | grep -i {grep_str} | grep -v grep | awk '{{print $1}}'" return list(node.account.ssh_capture(pid_cmd, callback=int)) @@ -550,7 +595,7 @@ def get_pids(): err_msg="Failed to start process within %d sec" % 10) # Kill service. 
diff --git a/tests/cluster/check_remoteaccount.py b/tests/cluster/check_remoteaccount.py
index 57c1c2f6a..e08b39f34 100644
--- a/tests/cluster/check_remoteaccount.py
+++ b/tests/cluster/check_remoteaccount.py
@@ -17,6 +17,7 @@
 from tests.test_utils import find_available_port
 from ducktape.cluster.remoteaccount import RemoteAccount
 from ducktape.cluster.remoteaccount import RemoteAccountSSHConfig
+import pytest
 
 import logging
 from threading import Thread
@@ -26,6 +27,18 @@
 import time
 
 
+class DummyException(Exception):
+    pass
+
+
+def raise_error_checker(error, remote_account):
+    raise DummyException("dummy raise: {}\nfrom: {}".format(error, remote_account))
+
+
+def raise_no_error_checker(error, remote_account):
+    pass
+
+
 class SimpleServer(object):
     """Helper class which starts a simple server listening on localhost at the specified port
     """
@@ -86,6 +99,21 @@ def check_wait_for_http_timeout(self):
         actual_timeout = time.time() - start
         assert abs(actual_timeout - timeout) / timeout < 1
 
+    @pytest.mark.parametrize("checkers", [[raise_error_checker],
+                                          [raise_no_error_checker, raise_error_checker],
+                                          [raise_error_checker, raise_no_error_checker]])
+    def check_ssh_checker(self, checkers):
+        self.server.start()
+        self.account = RemoteAccount(RemoteAccountSSHConfig.from_string(
+            """
+            Host dummy_host.com
+                Hostname dummy_host.name.com
+                Port 22
+                User dummy
+            """), ssh_exception_checks=checkers)
+        with pytest.raises(DummyException):
+            self.account.ssh('echo test')
+
     def teardown(self):
         self.server.stop()
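The parametrized check above feeds plain two-argument callables to `ssh_exception_checks`, the same shape a user-supplied ssh checker function takes. A hedged sketch of such a checker follows; the module path and message are illustrative only:

```python
# e.g. my/module/validator.py -- hypothetical module path
def validate_ssh(error, remote_account):
    """Called with the raised ssh error and the remote account it occurred on.

    Raising here surfaces a richer diagnosis in the logs; returning normally
    lets the framework's own error handling proceed.
    """
    raise RuntimeError("ssh to {} failed: {}".format(remote_account, error))
```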
"""Mock node.account object. It's Linux because tests are run in Linux.""" - def __init__(self): + def __init__(self, **kwargs): ssh_config = RemoteAccountSSHConfig( host="localhost", user=None, hostname="localhost", port=22) - super(MockAccount, self).__init__(ssh_config, externally_routable_ip="localhost", logger=None) + super(MockAccount, self).__init__(ssh_config, externally_routable_ip="localhost", logger=None, **kwargs) diff --git a/tests/loader/check_loader.py b/tests/loader/check_loader.py index af41cb277..aa5f66f9b 100644 --- a/tests/loader/check_loader.py +++ b/tests/loader/check_loader.py @@ -195,10 +195,14 @@ def check_test_loader_with_directory(self): tests = loader.load([discover_dir()]) assert len(tests) == num_tests_in_dir(discover_dir()) - def check_test_loader_with_file(self): + @pytest.mark.parametrize(['dir_', 'file_name'], [ + pytest.param(discover_dir(), 'test_a.py'), + pytest.param(resources_dir(), 'a.py') + ]) + def check_test_loader_with_file(self, dir_, file_name): """Check discovery on a file. """ loader = TestLoader(self.SESSION_CONTEXT, logger=Mock()) - module_path = os.path.join(discover_dir(), "test_a.py") + module_path = os.path.join(dir_, file_name) tests = loader.load([module_path]) assert len(tests) == num_tests_in_file(module_path) diff --git a/tests/loader/resources/a.py b/tests/loader/resources/a.py new file mode 120000 index 000000000..bc2ea9ccd --- /dev/null +++ b/tests/loader/resources/a.py @@ -0,0 +1 @@ +loader_test_directory/test_a.py \ No newline at end of file diff --git a/tests/loader/resources/loader_test_directory/name_does_not_match_pattern.py b/tests/loader/resources/loader_test_directory/name_does_not_match_pattern.py new file mode 100644 index 000000000..ebd954920 --- /dev/null +++ b/tests/loader/resources/loader_test_directory/name_does_not_match_pattern.py @@ -0,0 +1,8 @@ +from ducktape.tests.test import Test + + +class TestNotLoaded(Test): + """Loader should not discover this - module name does not match default pattern.""" + + def test_a(self): + pass diff --git a/tests/reporter/check_symbol_reporter.py b/tests/reporter/check_symbol_reporter.py new file mode 100644 index 000000000..b78a604cb --- /dev/null +++ b/tests/reporter/check_symbol_reporter.py @@ -0,0 +1,29 @@ +from pathlib import Path +from unittest.mock import Mock + +from ducktape.tests.reporter import FailedTestSymbolReporter + + +def check_to_symbol_no_args(tmp_path): + result = Mock(file_name='/test_folder/test_file', cls_name='TestClass', function_name='test_func', + injected_args=None) + reporter = FailedTestSymbolReporter(Mock()) + reporter.working_dir = Path('/') + assert reporter.to_symbol(result) == 'test_folder/test_file::TestClass.test_func' + + +def check_to_symbol_relative_path(tmp_path): + result = Mock(file_name='/test_folder/test_file', cls_name='TestClass', function_name='test_func', + injected_args=None) + reporter = FailedTestSymbolReporter(Mock()) + reporter.working_dir = Path('/test_folder') + assert reporter.to_symbol(result) == 'test_file::TestClass.test_func' + + +def check_to_symbol_with_args(): + result = Mock(file_name='/test_folder/test_file', cls_name='TestClass', function_name='test_func', + injected_args={'arg': 'val'}) + + reporter = FailedTestSymbolReporter(Mock()) + reporter.working_dir = Path('/') + assert reporter.to_symbol(result) == 'test_folder/test_file::TestClass.test_func@{"arg":"val"}' diff --git a/tests/runner/check_runner.py b/tests/runner/check_runner.py index fbe880ca3..0dc84b9d5 100644 --- a/tests/runner/check_runner.py +++ 
diff --git a/tests/reporter/check_symbol_reporter.py b/tests/reporter/check_symbol_reporter.py
new file mode 100644
index 000000000..b78a604cb
--- /dev/null
+++ b/tests/reporter/check_symbol_reporter.py
@@ -0,0 +1,29 @@
+from pathlib import Path
+from unittest.mock import Mock
+
+from ducktape.tests.reporter import FailedTestSymbolReporter
+
+
+def check_to_symbol_no_args(tmp_path):
+    result = Mock(file_name='/test_folder/test_file', cls_name='TestClass', function_name='test_func',
+                  injected_args=None)
+    reporter = FailedTestSymbolReporter(Mock())
+    reporter.working_dir = Path('/')
+    assert reporter.to_symbol(result) == 'test_folder/test_file::TestClass.test_func'
+
+
+def check_to_symbol_relative_path(tmp_path):
+    result = Mock(file_name='/test_folder/test_file', cls_name='TestClass', function_name='test_func',
+                  injected_args=None)
+    reporter = FailedTestSymbolReporter(Mock())
+    reporter.working_dir = Path('/test_folder')
+    assert reporter.to_symbol(result) == 'test_file::TestClass.test_func'
+
+
+def check_to_symbol_with_args():
+    result = Mock(file_name='/test_folder/test_file', cls_name='TestClass', function_name='test_func',
+                  injected_args={'arg': 'val'})
+
+    reporter = FailedTestSymbolReporter(Mock())
+    reporter.working_dir = Path('/')
+    assert reporter.to_symbol(result) == 'test_folder/test_file::TestClass.test_func@{"arg":"val"}'
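These checks pin down the failed-test symbol format: the test file path relative to the working directory, then `::Class.method`, with `@` plus compact JSON appended when arguments were injected. A standalone sketch of that formatting rule, assuming plain string/dict inputs (this mirrors the asserted outputs, not ducktape's implementation):

```python
import json
from pathlib import Path


def to_symbol(file_name, cls_name, function_name, injected_args=None, working_dir="/"):
    """Build a rerun symbol like 'dir/file::Class.method@{"arg":"val"}'."""
    path = Path(file_name)
    try:
        path = path.relative_to(working_dir)
    except ValueError:
        pass  # keep the absolute path if it lies outside the working dir
    symbol = "{}::{}.{}".format(path, cls_name, function_name)
    if injected_args:
        symbol += "@" + json.dumps(injected_args, separators=(",", ":"))
    return symbol


# Matches the expectations above:
assert to_symbol("/test_folder/test_file", "TestClass", "test_func") == "test_folder/test_file::TestClass.test_func"
```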
diff --git a/tests/runner/check_runner.py b/tests/runner/check_runner.py
index fbe880ca3..0dc84b9d5 100644
--- a/tests/runner/check_runner.py
+++ b/tests/runner/check_runner.py
@@ -53,7 +53,7 @@ def check_insufficient_cluster_resources(self):
         test_context = TestContext(session_context=session_context, module=None, cls=TestThingy,
                                    function=TestThingy.test_pi, file=TEST_THINGY_FILE, cluster=mock_cluster)
-        runner = TestRunner(mock_cluster, session_context, Mock(), [test_context])
+        runner = TestRunner(mock_cluster, session_context, Mock(), [test_context], 1)
 
         # Even though the cluster is too small, the test runner should handle this gracefully without raising an error
         results = runner.run_all_tests()
@@ -79,10 +79,11 @@ def check_simple_run(self):
         test_methods = [TestThingy.test_pi, TestThingy.test_ignore1, TestThingy.test_ignore2, TestThingy.test_failure]
         ctx_list = self._do_expand(test_file=TEST_THINGY_FILE, test_class=TestThingy, test_methods=test_methods,
                                    cluster=mock_cluster, session_context=session_context)
-        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list)
+        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1)
 
         results = runner.run_all_tests()
         assert len(results) == 4
+        assert results.num_flaky == 0
         assert results.num_failed == 1
         assert results.num_passed == 1
         assert results.num_ignored == 2
@@ -90,6 +91,23 @@ def check_simple_run(self):
         result_with_data = [r for r in results if r.data is not None][0]
         assert result_with_data.data == {"data": 3.14159}
 
+    def check_deflake_run(self):
+        """Check expected behavior when rerunning failed tests with deflake enabled."""
+        mock_cluster = LocalhostCluster(num_nodes=1000)
+        session_context = tests.ducktape_mock.session_context()
+
+        test_methods = [TestThingy.test_flaky, TestThingy.test_failure]
+        ctx_list = self._do_expand(test_file=TEST_THINGY_FILE, test_class=TestThingy, test_methods=test_methods,
+                                   cluster=mock_cluster, session_context=session_context)
+        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 2)
+
+        results = runner.run_all_tests()
+        assert len(results) == 2
+        assert results.num_flaky == 1
+        assert results.num_failed == 1
+        assert results.num_passed == 0
+        assert results.num_ignored == 0
+
     def check_runner_report_junit(self):
         """Check we can serialize results into an xunit xml format.
         Also ensures that the XML report adheres to the Junit spec using xpath queries"""
@@ -98,7 +116,7 @@ def check_runner_report_junit(self):
         test_methods = [TestThingy.test_pi, TestThingy.test_ignore1, TestThingy.test_ignore2, TestThingy.test_failure]
         ctx_list = self._do_expand(test_file=TEST_THINGY_FILE, test_class=TestThingy, test_methods=test_methods,
                                    cluster=mock_cluster, session_context=session_context)
-        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list)
+        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1)
         results = runner.run_all_tests()
         JUnitReporter(results).report()
@@ -137,7 +155,7 @@ def check_exit_first(self):
         test_methods = [FailingTest.test_fail]
         ctx_list = self._do_expand(test_file=FAILING_TEST_FILE, test_class=FailingTest, test_methods=test_methods,
                                    cluster=mock_cluster, session_context=session_context)
-        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list)
+        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1)
         results = runner.run_all_tests()
         assert len(ctx_list) > 1
         assert len(results) == 1
@@ -155,10 +173,11 @@ def check_exits_if_failed_to_initialize(self):
                                          test_methods=[FailsToInitInSetupTest.test_nothing],
                                          cluster=mock_cluster, session_context=session_context))
-        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list)
+        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1)
         results = runner.run_all_tests()
         # These tests fail to initialize, each class has two test methods, so should have 4 results, all failed
         assert len(results) == 4
+        assert results.num_flaky == 0
         assert results.num_failed == 4
         assert results.num_passed == 0
         assert results.num_ignored == 0
@@ -174,10 +193,11 @@ def check_sends_result_when_error_reporting_exception(self, exc_msg_mock):
         test_methods = [TestThingy.test_failure, TestThingy.test_pi]
         ctx_list = self._do_expand(test_file=TEST_THINGY_FILE, test_class=TestThingy, test_methods=test_methods,
                                    cluster=mock_cluster, session_context=session_context)
-        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list)
+        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1)
         results = runner.run_all_tests()
 
         assert len(results) == 2
+        assert results.num_flaky == 0
         assert results.num_failed == 1
         assert results.num_passed == 1
         assert results.num_ignored == 0
@@ -192,11 +212,12 @@ def check_run_failure_with_bad_cluster_allocation(self):
         ctx_list = self._do_expand(test_file=TEST_THINGY_FILE, test_class=ClusterTestThingy,
                                    test_methods=test_methods, cluster=mock_cluster,
                                    session_context=session_context)
-        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list)
+        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1)
         results = runner.run_all_tests()
 
         assert len(results) == 2
+        assert results.num_flaky == 0
         assert results.num_failed == 1
         assert results.num_passed == 1
         assert results.num_ignored == 0
diff --git a/tests/runner/check_runner_memory.py b/tests/runner/check_runner_memory.py
index 5958d2d05..c1ca80479 100644
--- a/tests/runner/check_runner_memory.py
+++ b/tests/runner/check_runner_memory.py
@@ -87,7 +87,7 @@ def check_for_inter_test_memory_leak(self):
         assert len(ctx_list) == N_TEST_CASES  # Sanity check
 
         q = queue.Queue()
-        runner = InstrumentedTestRunner(self.cluster, self.session_context, Mock(), ctx_list, queue=q)
+        runner = InstrumentedTestRunner(self.cluster, self.session_context, Mock(), ctx_list, 1, queue=q)
         runner.run_all_tests()
 
         measurements = []
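Every runner construction in these checks now carries a fifth positional argument: the per-test attempt budget, where 1 means run once with no deflake reruns. A hedged wiring sketch; the `deflake` parameter name and the import path are assumptions, only the positional slot is taken from the patch:

```python
from unittest.mock import Mock


def make_runner(cluster, session_context, test_contexts, deflake=1):
    """Build a TestRunner with an attempt budget; 1 = run each test exactly once."""
    from ducktape.tests.runner import TestRunner  # import path assumed from these checks' subject
    return TestRunner(cluster, session_context, Mock(), test_contexts, max(1, deflake))
```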
diff --git a/tests/runner/resources/test_thingy.py b/tests/runner/resources/test_thingy.py
index 79104cce3..ecb140609 100644
--- a/tests/runner/resources/test_thingy.py
+++ b/tests/runner/resources/test_thingy.py
@@ -12,13 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from time import time
+
+import time
 
 from ducktape.cluster.cluster_spec import ClusterSpec
 from ducktape.tests.test import Test
 from ducktape.mark import ignore, parametrize
 from ducktape.mark.resource import cluster
 
+_flake = False
+
+
 class TestThingy(Test):
     """Fake ducktape test class"""
 
@@ -44,6 +48,11 @@ def test_ignore2(self, x=2):
     def test_failure(self):
         raise Exception("This failed")
 
+    def test_flaky(self):
+        global _flake
+        flake, _flake = _flake, not _flake
+        assert flake
+
 
 class ClusterTestThingy(Test):
     """Fake ducktape test class"""
diff --git a/tox.ini b/tox.ini
index fad0e430c..a76e58a4d 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,20 +1,12 @@
 [tox]
-envlist = py36, py37, py38, cover, style
+envlist = py36, py37, py38, cover, style, docs
 
 [testenv]
 # Consolidate all deps here instead of separately in test/style/cover so we
 # have a single env to work with, which makes debugging easier (like which env?).
 # Not as clean but easier to work with during development, which is better.
 deps =
-    flake8==3.7.*
-    mock==2.0.*
-    pytest==4.4.*
-    pytest-cov==2.6.*
-    pytest-xdist==1.28.*
-    psutil==4.1.0
-    memory_profiler==0.41
-    statistics==1.0.3.5
-    requests-testadapter==0.3.0
+    -r requirements-test.txt
 install_command = pip install -U {packages}
 recreate = False
@@ -24,33 +16,41 @@
 setenv =
     PIP_PROCESS_DEPENDENCY_LINKS=1
     PIP_DEFAULT_TIMEOUT=60
     ARCHFLAGS=-Wno-error=unused-command-line-argument-hard-error-in-future
-envdir = {homedir}/.virtualenvs/ducktape_{envname}
+envdir = {package_root}/.virtualenvs/ducktape_{envname}
 commands = pytest {env:PYTESTARGS:} {posargs}
 
 [testenv:py36]
-envdir = {homedir}/.virtualenvs/ducktape-py36
+envdir = {package_root}/.virtualenvs/ducktape-py36
 
 [testenv:py37]
-envdir = {homedir}/.virtualenvs/ducktape-py37
+envdir = {package_root}/.virtualenvs/ducktape-py37
 
 [testenv:py38]
-envdir = {homedir}/.virtualenvs/ducktape-py38
+envdir = {package_root}/.virtualenvs/ducktape-py38
 
 [testenv:style]
 basepython = python3.8
-envdir = {homedir}/.virtualenvs/ducktape
+envdir = {package_root}/.virtualenvs/ducktape
 commands = flake8 --config tox.ini
 
 [testenv:cover]
 basepython = python3.8
-envdir = {homedir}/.virtualenvs/ducktape
+envdir = {package_root}/.virtualenvs/ducktape
 commands = pytest {env:PYTESTARGS:} --cov ducktape --cov-report=xml --cov-report=html --cov-report=term --cov-report=annotate:textcov \
     --cov-fail-under=70
 
+[testenv:docs]
+basepython = python3.8
+deps =
+    -r {toxinidir}/docs/requirements.txt
+changedir = {toxinidir}/docs
+commands = sphinx-build -M {env:SPHINX_BUILDER:html} . _build {posargs}
+
+
 [flake8]
-exclude = .git,.tox,.eggs,__pycache__,docs,build,dist
+exclude = .git,.tox,.eggs,__pycache__,docs,build,dist,.virtualenvs
 ignore = E111,E121,W292,E123,E226,W503
 max-line-length = 120