diff --git a/c-deps/libroach/CMakeLists.txt b/c-deps/libroach/CMakeLists.txt
index fe43dcf50f1a..b6d86f44b7a4 100644
--- a/c-deps/libroach/CMakeLists.txt
+++ b/c-deps/libroach/CMakeLists.txt
@@ -54,6 +54,7 @@ add_library(roach
   protos/util/log/log.pb.cc
   protos/util/unresolved_addr.pb.cc
   rocksdbutils/env_encryption.cc
+  rocksdbutils/env_sync_fault_injection.cc
 )
 target_include_directories(roach
   PUBLIC  ./include
diff --git a/c-deps/libroach/db.cc b/c-deps/libroach/db.cc
index 298da9c7e001..c4f43d82fe10 100644
--- a/c-deps/libroach/db.cc
+++ b/c-deps/libroach/db.cc
@@ -19,6 +19,7 @@
 #include <rocksdb/sst_file_writer.h>
 #include <rocksdb/table.h>
 #include <rocksdb/utilities/checkpoint.h>
+#include <rocksdb/utilities/object_registry.h>
 #include <stdarg.h>
 #include "batch.h"
 #include "cache.h"
@@ -34,6 +35,7 @@
 #include "iterator.h"
 #include "merge.h"
 #include "options.h"
+#include "rocksdbutils/env_sync_fault_injection.h"
 #include "snapshot.h"
 #include "status.h"
 #include "table_props.h"
@@ -151,7 +153,41 @@ static DBOpenHook* db_open_hook = DBOpenHookOSS;
 
 void DBSetOpenHook(void* hook) { db_open_hook = (DBOpenHook*)hook; }
 
+void DBRegisterTestingEnvs() {
+  // Have a couple `SyncFaultInjectionEnv`s registered with RocksDB for our test
+  // cases to use, which they can select using any of RocksDB's text-based options
+  // mechanisms. Currently the only one Cockroach exposes is options string which
+  // is passed the value from `DBOptions::rocksdb_options`.
+  static rocksdb::Registrar<rocksdb::Env> sync_failure_env_reg(
+      "sync-failure-injection-wrapping-default-env",
+      [](const std::string& /* name */,
+         std::unique_ptr<rocksdb::Env>* /* guard */) {
+        static rocksdb_utils::SyncFaultInjectionEnv env(
+            rocksdb::Env::Default(),
+            0 /* crash_failure_one_in */,
+            500000 /* sync_failure_one_in */,
+            true /* crash_after_sync_failure */);
+        return &env;
+      }
+  );
+
+  static rocksdb::Registrar<rocksdb::Env> crash_failure_env_reg(
+      "crash-failure-injection-wrapping-default-env",
+      [](const std::string& /* name */,
+         std::unique_ptr<rocksdb::Env>* /* guard */) {
+        static rocksdb_utils::SyncFaultInjectionEnv env(
+            rocksdb::Env::Default(),
+            500000 /* crash_failure_one_in */,
+            0 /* sync_failure_one_in */,
+            false /* crash_after_sync_failure */);
+        return &env;
+      }
+  );
+}
+
 DBStatus DBOpen(DBEngine** db, DBSlice dir, DBOptions db_opts) {
+  DBRegisterTestingEnvs();
+
   rocksdb::Options options = DBMakeOptions(db_opts);
 
   const std::string additional_options = ToString(db_opts.rocksdb_options);
diff --git a/c-deps/libroach/rocksdbutils/env_sync_fault_injection.cc b/c-deps/libroach/rocksdbutils/env_sync_fault_injection.cc
new file mode 100644
index 000000000000..6d7e4d371e8b
--- /dev/null
+++ b/c-deps/libroach/rocksdbutils/env_sync_fault_injection.cc
@@ -0,0 +1,132 @@
+// Copyright 2019 The Cockroach Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied.  See the License for the specific language governing
+// permissions and limitations under the License.
+
+#include "env_sync_fault_injection.h"
+#include "rocksdb/utilities/object_registry.h"
+
+namespace rocksdb_utils {
+
+// See comment above `SyncFaultInjectionEnv` class definition.
+class SyncFaultInjectionWritableFile : public rocksdb::WritableFileWrapper {
+ public:
+  SyncFaultInjectionWritableFile(std::unique_ptr<rocksdb::WritableFile> target,
+                                 int crash_failure_one_in,
+                                 int sync_failure_one_in,
+                                 bool crash_after_sync_failure);
+
+  rocksdb::Status Append(const rocksdb::Slice& data) override;
+  rocksdb::Status Sync() override;
+
+ private:
+  std::unique_ptr<rocksdb::WritableFile> target_;
+  const int crash_failure_one_in_;
+  const int sync_failure_one_in_;
+  const bool crash_after_sync_failure_;
+  // Countdown until crash if a sync failure already happened.
+  int num_syncs_until_crash_;
+  // Lock needed to handle concurrent writes and syncs.
+  std::mutex mu_;
+  // A buffer of written but unsynced data.
+  std::string buffer_;
+
+  // Some constants for use with `num_syncs_until_crash_`.
+  const static int kNoCountdown = -1;
+  const static int kStartCountdown = 10;
+};
+
+SyncFaultInjectionWritableFile::SyncFaultInjectionWritableFile(
+    std::unique_ptr<rocksdb::WritableFile> target,
+    int crash_failure_one_in,
+    int sync_failure_one_in,
+    bool crash_after_sync_failure) :
+  rocksdb::WritableFileWrapper(target.get()),
+  target_(std::move(target)),
+  crash_failure_one_in_(crash_failure_one_in),
+  sync_failure_one_in_(sync_failure_one_in),
+  crash_after_sync_failure_(crash_after_sync_failure),
+  num_syncs_until_crash_(kNoCountdown) {}
+
+rocksdb::Status SyncFaultInjectionWritableFile::Append(
+    const rocksdb::Slice& data) {
+  std::unique_lock<std::mutex> lock(mu_);
+  buffer_.append(data.data(), data.size());
+  return rocksdb::Status::OK();
+}
+
+// We are using process crash to simulate system crash for tests and don't
+// expect these tests to face actual system crashes. So for "syncing" it is
+// sufficient to push data into page cache via the underlying `WritableFile`'s
+// `Append()`. That should be enough for the file data to survive a process
+// crash.
+rocksdb::Status SyncFaultInjectionWritableFile::Sync() {
+  std::unique_lock<std::mutex> lock(mu_);
+  if (num_syncs_until_crash_ > kNoCountdown) {
+    --num_syncs_until_crash_;
+    if (num_syncs_until_crash_ == 0) {
+      exit(0);
+    }
+    // On Linux the behavior after a sync failure occurred is to clear the error
+    // state and continue accepting writes/syncs. To simulate that behavior, we
+    // do not return early here, even though the file is known to have lost writes.
+  }
+
+  if (crash_failure_one_in_ > 0 && random() % crash_failure_one_in_ == 0) {
+    exit(0);
+  } else if (sync_failure_one_in_ > 0 && random() % sync_failure_one_in_ == 0) {
+    if (num_syncs_until_crash_ == kNoCountdown && crash_after_sync_failure_) {
+      // This was the first failure. Start the countdown.
+      num_syncs_until_crash_ = kStartCountdown;
+    }
+    // As mentioned above, after a sync failure we allow continued writes and syncs
+    // to the same file. To make sure those new writes are written at the proper offset,
+    // we cannot drop unsynced writes simply by clearing the buffer. Instead we drop
+    // unsynced writes by overwriting the buffer with all zeros (well, this assumes
+    // the buffer didn't have all zeros to begin with).
+    buffer_.replace(0, buffer_.size(), buffer_.size(), '\0');
+    return rocksdb::Status::IOError();
+  }
+  std::string old_buffer;
+  buffer_.swap(old_buffer);
+  // It should be fine to buffer new writes while we're syncing old ones, so unlock.
+  lock.unlock();
+  return target_->Append(old_buffer);
+}
+
+SyncFaultInjectionEnv::SyncFaultInjectionEnv(
+    Env* target,
+    int crash_failure_one_in,
+    int sync_failure_one_in,
+    bool crash_after_sync_failure) :
+  rocksdb::EnvWrapper(target),
+  crash_failure_one_in_(crash_failure_one_in),
+  sync_failure_one_in_(sync_failure_one_in),
+  crash_after_sync_failure_(crash_after_sync_failure) {}
+
+rocksdb::Status SyncFaultInjectionEnv::NewWritableFile(
+    const std::string& filename,
+    std::unique_ptr<rocksdb::WritableFile>* result,
+    const rocksdb::EnvOptions& env_options) {
+  std::unique_ptr<rocksdb::WritableFile> underlying_file;
+  rocksdb::Status s = EnvWrapper::NewWritableFile(filename, &underlying_file, env_options);
+  if (s.ok()) {
+    result->reset(new SyncFaultInjectionWritableFile(
+          std::move(underlying_file),
+          crash_failure_one_in_,
+          sync_failure_one_in_,
+          crash_after_sync_failure_));
+  }
+  return s;
+}
+
+}  // rocksdb_utils
diff --git a/c-deps/libroach/rocksdbutils/env_sync_fault_injection.h b/c-deps/libroach/rocksdbutils/env_sync_fault_injection.h
new file mode 100644
index 000000000000..a07b66a66dbe
--- /dev/null
+++ b/c-deps/libroach/rocksdbutils/env_sync_fault_injection.h
@@ -0,0 +1,61 @@
+// Copyright 2019 The Cockroach Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied.  See the License for the specific language governing
+// permissions and limitations under the License.
+
+#pragma once
+
+#include <mutex>
+#include <string>
+
+#include "rocksdb/env.h"
+
+namespace rocksdb_utils {
+
+// `SyncFaultInjectionEnv` creates files that buffer `Append()`s in process memory
+// until `Sync()` is called. Such files enable us to simulate machine crashes by only
+// crashing the process. This works since, unlike normal files whose writes survive
+// process crash in page cache, these files' unsynced writes are dropped on the floor.
+//
+// Such files also enable us to simulate sync failure by dropping unsynced writes at
+// the same time we inject a sync error. This is more comprehensive than the available
+// fault injection tools I looked at (like libfiu and charybdefs), as those ones only
+// inject errors without dropping unsynced writes.
+class SyncFaultInjectionEnv : public rocksdb::EnvWrapper {
+ public:
+  // - `target`: A pointer to the underlying `Env`.
+  // - `crash_failure_one_in`: During a sync operation, crash the process immediately
+  //   with a probability of 1/n. All unsynced writes are lost since they are buffered
+  //   in process memory.
+  // - `sync_failure_one_in`: A sync operation will return failure with a probability
+  //   of 1/n. All unsynced writes for the file are dropped to simulate the failure.
+  // - `crash_after_sync_failure`: If set to true, the program will crash itself some
+  //   time after the first simulated sync failure. It does not happen immediately to
+  //   allow the system to get itself into a weird state in case it doesn't handle sync
+  //   failures properly.
+  SyncFaultInjectionEnv(
+      Env* target,
+      int crash_failure_one_in,
+      int sync_failure_one_in,
+      bool crash_after_sync_failure);
+
+  rocksdb::Status NewWritableFile(const std::string& filename,
+                                  std::unique_ptr<rocksdb::WritableFile>* result,
+                                  const rocksdb::EnvOptions& env_options) override;
+
+ private:
+  const int crash_failure_one_in_;
+  const int sync_failure_one_in_;
+  const bool crash_after_sync_failure_;
+};
+
+}  // rocksdb_utils
diff --git a/c-deps/rocksdb b/c-deps/rocksdb
index 47e4f25d7b30..90a02d472277 160000
--- a/c-deps/rocksdb
+++ b/c-deps/rocksdb
@@ -1 +1 @@
-Subproject commit 47e4f25d7b30c5cdf077e4f96b945a2c6b790d7b
+Subproject commit 90a02d47227787436016e5d22c5287d2210d7da0
diff --git a/pkg/cmd/roachprod/install/cockroach.go b/pkg/cmd/roachprod/install/cockroach.go
index bc5382de59c3..15c207acbd4d 100644
--- a/pkg/cmd/roachprod/install/cockroach.go
+++ b/pkg/cmd/roachprod/install/cockroach.go
@@ -386,6 +386,10 @@ tar cvf certs.tar certs
 		cmd += `echo ">>> roachprod start: $(date)" >> ` + logDir + "/roachprod.log; " +
 			`ps axeww -o pid -o command >> ` + logDir + "/roachprod.log; " +
 			`[ -x /usr/bin/lslocks ] && /usr/bin/lslocks >> ` + logDir + "/roachprod.log; "
+		if c.IsLocal() {
+			// This is consistent with the working directory used by `roachprod run`.
+			cmd += fmt.Sprintf("cd ${HOME}/local/%d ; ", nodes[i])
+		}
 		cmd += keyCmd +
 			fmt.Sprintf(" export ROACHPROD=%d%s && ", nodes[i], c.Tag) +
 			"GOTRACEBACK=crash " +
diff --git a/pkg/cmd/roachprod/vm/gce/gcloud.go b/pkg/cmd/roachprod/vm/gce/gcloud.go
index 1b54189679d8..e1d113b729e0 100644
--- a/pkg/cmd/roachprod/vm/gce/gcloud.go
+++ b/pkg/cmd/roachprod/vm/gce/gcloud.go
@@ -51,7 +51,7 @@ var projectsWithGC = []string{defaultProject, "andrei-jepsen"}
 
 // init will inject the GCE provider into vm.Providers, but only if the gcloud tool is available on the local path.
 func init() {
-	var p vm.Provider
+	var p vm.Provider = &Provider{}
 	if _, err := exec.LookPath("gcloud"); err != nil {
 		p = flagstub.New(p, "please install the gcloud CLI utilities "+
 			"(https://cloud.google.com/sdk/downloads)")
diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go
index c11f4b4af784..07b0e3da7f04 100644
--- a/pkg/cmd/roachtest/cluster.go
+++ b/pkg/cmd/roachtest/cluster.go
@@ -925,7 +925,7 @@ func (c *cluster) All() nodeListOption {
 	return c.Range(1, c.nodes)
 }
 
-// All returns a node list containing the nodes [begin,end].
+// Range returns a node list containing the nodes [begin,end].
 func (c *cluster) Range(begin, end int) nodeListOption {
 	if begin < 1 || end > c.nodes {
 		c.t.Fatalf("invalid node range: %d-%d (1-%d)", begin, end, c.nodes)
@@ -937,7 +937,7 @@ func (c *cluster) Range(begin, end int) nodeListOption {
 	return r
 }
 
-// All returns a node list containing only the node i.
+// Node returns a node list containing only the node i.
 func (c *cluster) Node(i int) nodeListOption {
 	return c.Range(i, i)
 }
diff --git a/pkg/cmd/roachtest/registry.go b/pkg/cmd/roachtest/registry.go
index 5d06a68fa737..5a808e4ad70d 100644
--- a/pkg/cmd/roachtest/registry.go
+++ b/pkg/cmd/roachtest/registry.go
@@ -69,6 +69,7 @@ func registerTests(r *registry) {
 	registerSQLsmith(r)
 	registerSyncTest(r)
 	registerSysbench(r)
+	registerSystemCrashTest(r)
 	registerTPCC(r)
 	registerTypeORM(r)
 	registerLoadSplits(r)
diff --git a/pkg/cmd/roachtest/system_crash.go b/pkg/cmd/roachtest/system_crash.go
new file mode 100644
index 000000000000..e4d08c270846
--- /dev/null
+++ b/pkg/cmd/roachtest/system_crash.go
@@ -0,0 +1,199 @@
+// Copyright 2019 The Cockroach Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the License. See the AUTHORS file
+// for names of contributors.
+
+package main
+
+import (
+	"bufio"
+	"context"
+	"fmt"
+	"math"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/cockroachdb/cockroach/pkg/util/contextutil"
+)
+
+func registerSystemCrashTest(r *registry) {
+	// Two variants of this test:
+	//
+	// - `syncErrors == false`: The unreliable cockroach processes (i.e., the ones running
+	//   with fault injection `Env`s) will crash themselves during a randomly chosen sync
+	//   operation. The crash occurs before that sync operation writes out data.
+	//
+	// - `syncErrors == true`: The unreliable cockroach processes will return an error
+	//   from a randomly chosen sync operation, and also drop the writes corresponding
+	//   to that sync. If cockroach does not crash itself, the fault injection `Env`
+	//   will crash for us shortly afterwards.
+	for _, syncErrors := range []bool{false, true} {
+		syncErrors := syncErrors
+		r.Add(testSpec{
+			Name:    fmt.Sprintf("system-crash/sync-errors=%t", syncErrors),
+			Cluster: makeClusterSpec(6 /* nodeCount */),
+			Run: func(ctx context.Context, t *test, c *cluster) {
+				runSystemCrashTest(ctx, t, c, syncErrors)
+			},
+		})
+	}
+}
+
+func runSystemCrashTest(ctx context.Context, t *test, c *cluster, syncErrors bool) {
+	// Currently setup is as follows:
+	// - Node 1 runs a reliable (i.e., no fault injection) Cockroach process
+	// - Node 2..n-1 run unreliable Cockroach processes
+	// - Node n runs a `workload kv` process. It targets the reliable Cockroach on node 1.
+	getCockroachNodes := func(c *cluster) nodeListOption {
+		return c.Range(1, c.nodes-1)
+	}
+
+	getReliableCockroachNode := func(c *cluster) nodeListOption {
+		cockroachNodes := getCockroachNodes(c)
+		return cockroachNodes[0:1]
+	}
+
+	getUnreliableCockroachNodes := func(c *cluster) nodeListOption {
+		cockroachNodes := getCockroachNodes(c)
+		return cockroachNodes[1:]
+	}
+
+	getWorkloadNode := func(c *cluster) nodeListOption {
+		return c.Node(c.nodes)
+	}
+
+	t.Status("installing binaries")
+	c.Put(ctx, cockroach, "./cockroach", getCockroachNodes(c))
+	c.Put(ctx, workload, "./workload", getWorkloadNode(c))
+
+	startCockroachNodes := func(
+		ctx context.Context, t *test, c *cluster, nodes nodeListOption, reliable, syncErrors bool,
+	) {
+		envVars := "COCKROACH_CONSISTENCY_CHECK_INTERVAL=10s"
+		args := "--store=path=./data"
+		if !reliable {
+			if syncErrors {
+				args += ",rocksdb=env=sync-failure-injection-wrapping-default-env"
+			} else {
+				args += ",rocksdb=env=crash-failure-injection-wrapping-default-env"
+			}
+		}
+		c.Start(ctx, t, nodes, startArgs(
+			"--env", envVars,
+			"--args", args,
+		))
+	}
+
+	t.Status("starting cockroach")
+	startCockroachNodes(ctx, t, c, getReliableCockroachNode(c), true /* reliable */, syncErrors)
+	startCockroachNodes(ctx, t, c, getUnreliableCockroachNodes(c), false /* reliable */, syncErrors)
+
+	// There are two goroutines run under the monitor `m`:
+	//
+	// (1) A goroutine that runs `workload` synchronously.
+	// (2) A goroutine that loops running `roachprod monitor`, looking for dead nodes and
+	//     restarting them.
+	//
+	// When nothing goes wrong, (1) finishes first. It then invokes `cancel()` to tell (2)
+	// to exit its loop. Then once both goroutines have completed, `m.Wait()` returns.
+	ctx, cancel := context.WithCancel(ctx)
+	m := newMonitor(ctx, c, getCockroachNodes(c))
+
+	t.Status("running kv workload")
+	// Launch goroutine running kv workload
+	m.Go(func(ctx context.Context) error {
+		c.Run(
+			ctx, getWorkloadNode(c), "./workload", "run", "kv", "--init", "--splits=1000",
+			"--histograms=logs/stats.json", "--concurrency=12", "--duration=1h",
+			"--read-percent=0", "--batch=32", "--min-block-bytes=128", "--max-block-bytes=128",
+			"--tolerate-errors", fmt.Sprintf("{pgurl%s}", getReliableCockroachNode(c)),
+		)
+		cancel()
+		return nil
+	})
+
+	// Launch goroutine for monitoring and restarting dead cockroach nodes
+	m.Go(func(ctx context.Context) error {
+		for {
+			select {
+			case <-ctx.Done():
+				return nil
+			case <-time.After(1 * time.Second):
+			}
+
+			var restartNodeIds []int
+			// Running `roachprod monitor` under a timeout is a hack copied from
+			// `FailOnDeadNodes`. It is necessary because sometimes that command
+			// gets stuck in an infinite loop when run with the `--oneshot` option.
+			_ = contextutil.RunWithTimeout(
+				ctx, "restart dead nodes", 1*time.Second,
+				func(ctx context.Context) error {
+					output, err := execCmdWithBuffer(
+						ctx, t.l, roachprod, "monitor", c.name, "--oneshot", "--ignore-empty-nodes",
+					)
+					// If there's an error, it means either that the monitor command failed
+					// completely, or that it found a dead node worth complaining about.
+					if err != nil {
+						if ctx.Err() != nil {
+							// Don't fail if we timed out. Could be the known infinite loop bug.
+							return nil
+						}
+
+						// Figure out which nodes are dead and restart them. Output looks like below
+						// when there are dead nodes.
+						//
+						// 3: dead
+						// 1: 6890
+						// 4: 7075
+						// 2: 6951
+						// Error:  3: dead
+						scanner := bufio.NewScanner(strings.NewReader(string(output)))
+						for scanner.Scan() {
+							line := scanner.Text()
+							if strings.HasPrefix(line, "Error:") {
+								// We already passed over all the nodes' statuses once. The
+								// remaining lines are redundant (see example output above).
+								break
+							}
+							fields := strings.Split(line, ": ")
+							if len(fields) != 2 {
+								t.Fatalf("unexpected `roachprod monitor` output line: %s", line)
+							}
+							nodeId, err := strconv.Atoi(fields[0])
+							if err != nil {
+								t.Fatalf("unexpected `roachprod monitor` output line: %s", line)
+							}
+							nodeStatus := fields[1]
+							if nodeStatus == "dead" {
+								restartNodeIds = append(restartNodeIds, nodeId)
+							}
+						}
+					}
+					return nil
+				},
+			)
+			for i := range restartNodeIds {
+				startCockroachNodes(
+					ctx, t, c, c.Node(restartNodeIds[i]), false, /* reliable */
+					syncErrors,
+				)
+			}
+		}
+	})
+
+	// Permit any number of node crashes/restarts since this test causes them intentionally.
+	m.ExpectDeaths(math.MaxInt32)
+
+	m.Wait()
+}