From 1c540424696a73adcafd5a773d220fbafb68bcbf Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Sun, 21 Apr 2019 14:55:17 -0400 Subject: [PATCH] Add roachtest that simulates system crash and sync failures There is an existing synctest that verifies the database is correct and usable after a crash triggered by an I/O error. The charybdefs dependency it uses does error injection by manipulating return values. When it injects an error into a sync operation, that sync does no work and returns an error, but unsynced writes still survive in page cache. Then after process crash-recovery, the DB's state is the same as if the failed sync had succeeded. This new test attempts to simulate the effects of a failed sync more completely, in particular by ensuring unsynced writes are dropped. The approach taken in this new test is to buffer unsynced writes in process memory. This is achieved by providing our own implementation of a few C syscall wrappers via `LD_PRELOAD`. By buffering in process memory instead of page cache, we can easily drop unsynced writes. In this new test, sync failure injection (`system-crash/sync-errors=true`) involves both returning an error and deleting unsynced data. Assuming error handling is correct the process will crash itself shortly afterwards. There is also some logic in the failure injector to force crash a little while later in case there's ever a bug in RocksDB or Cockroach where we ignore the failure. We can also use this approach to simulate machine crash (`system-crash/sync-errors=false`). Simply killing the process will drop writes that aren't yet synced, which is the same as what would happen if a machine crashed. Right now the test relies on frequent consistency checks to find errors like missing writes. It hits the DB heavily with KV queries to try to trigger enough flushes/WAL changes/compactions in case there are bugs in those code paths. But I am open to suggestions for alternative workloads/verification mechanisms. Release note: None --- c-deps/libroach/CMakeLists.txt | 1 + c-deps/libroach/db.cc | 36 ++++ .../rocksdbutils/env_sync_fault_injection.cc | 132 ++++++++++++ .../rocksdbutils/env_sync_fault_injection.h | 61 ++++++ c-deps/rocksdb | 2 +- pkg/cmd/roachprod/install/cockroach.go | 4 + pkg/cmd/roachprod/vm/gce/gcloud.go | 2 +- pkg/cmd/roachtest/cluster.go | 4 +- pkg/cmd/roachtest/registry.go | 1 + pkg/cmd/roachtest/system_crash.go | 199 ++++++++++++++++++ 10 files changed, 438 insertions(+), 4 deletions(-) create mode 100644 c-deps/libroach/rocksdbutils/env_sync_fault_injection.cc create mode 100644 c-deps/libroach/rocksdbutils/env_sync_fault_injection.h create mode 100644 pkg/cmd/roachtest/system_crash.go diff --git a/c-deps/libroach/CMakeLists.txt b/c-deps/libroach/CMakeLists.txt index fe43dcf50f1a..b6d86f44b7a4 100644 --- a/c-deps/libroach/CMakeLists.txt +++ b/c-deps/libroach/CMakeLists.txt @@ -54,6 +54,7 @@ add_library(roach protos/util/log/log.pb.cc protos/util/unresolved_addr.pb.cc rocksdbutils/env_encryption.cc + rocksdbutils/env_sync_fault_injection.cc ) target_include_directories(roach PUBLIC ./include diff --git a/c-deps/libroach/db.cc b/c-deps/libroach/db.cc index 298da9c7e001..c4f43d82fe10 100644 --- a/c-deps/libroach/db.cc +++ b/c-deps/libroach/db.cc @@ -19,6 +19,7 @@ #include #include #include +#include #include #include "batch.h" #include "cache.h" @@ -34,6 +35,7 @@ #include "iterator.h" #include "merge.h" #include "options.h" +#include "rocksdbutils/env_sync_fault_injection.h" #include "snapshot.h" #include "status.h" #include "table_props.h" @@ -151,7 +153,41 @@ static DBOpenHook* db_open_hook = DBOpenHookOSS; void DBSetOpenHook(void* hook) { db_open_hook = (DBOpenHook*)hook; } +void DBRegisterTestingEnvs() { + // Have a couple `SyncFaultInjectionEnv`s registered with RocksDB for our test + // cases to use, which they can select using any of RocksDB's text-based options + // mechanisms. Currently the only one Cockroach exposes is options string which + // is passed the value from `DBOptions::rocksdb_options`. + static rocksdb::Registrar sync_failure_env_reg( + "sync-failure-injection-wrapping-default-env", + [](const std::string& /* name */, + std::unique_ptr* /* guard */) { + static rocksdb_utils::SyncFaultInjectionEnv env( + rocksdb::Env::Default(), + 0 /* crash_failure_one_in */, + 500000 /* sync_failure_one_in */, + true /* crash_after_sync_failure */); + return &env; + } + ); + + static rocksdb::Registrar crash_failure_env_reg( + "crash-failure-injection-wrapping-default-env", + [](const std::string& /* name */, + std::unique_ptr* /* guard */) { + static rocksdb_utils::SyncFaultInjectionEnv env( + rocksdb::Env::Default(), + 500000 /* crash_failure_one_in */, + 0 /* sync_failure_one_in */, + false /* crash_after_sync_failure */); + return &env; + } + ); +} + DBStatus DBOpen(DBEngine** db, DBSlice dir, DBOptions db_opts) { + DBRegisterTestingEnvs(); + rocksdb::Options options = DBMakeOptions(db_opts); const std::string additional_options = ToString(db_opts.rocksdb_options); diff --git a/c-deps/libroach/rocksdbutils/env_sync_fault_injection.cc b/c-deps/libroach/rocksdbutils/env_sync_fault_injection.cc new file mode 100644 index 000000000000..6d7e4d371e8b --- /dev/null +++ b/c-deps/libroach/rocksdbutils/env_sync_fault_injection.cc @@ -0,0 +1,132 @@ +// Copyright 2019 The Cockroach Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. + +#include "env_sync_fault_injection.h" +#include "rocksdb/utilities/object_registry.h" + +namespace rocksdb_utils { + +// See comment above `SyncFaultInjectionEnv` class definition. +class SyncFaultInjectionWritableFile : public rocksdb::WritableFileWrapper { + public: + SyncFaultInjectionWritableFile(std::unique_ptr target, + int crash_failure_one_in, + int sync_failure_one_in, + bool crash_after_sync_failure); + + rocksdb::Status Append(const rocksdb::Slice& data) override; + rocksdb::Status Sync() override; + + private: + std::unique_ptr target_; + const int crash_failure_one_in_; + const int sync_failure_one_in_; + const bool crash_after_sync_failure_; + // Countdown until crash if a sync failure already happened. + int num_syncs_until_crash_; + // Lock needed to handle concurrent writes and syncs. + std::mutex mu_; + // A buffer of written but unsynced data. + std::string buffer_; + + // Some constants for use with `num_syncs_until_crash_`. + const static int kNoCountdown = -1; + const static int kStartCountdown = 10; +}; + +SyncFaultInjectionWritableFile::SyncFaultInjectionWritableFile( + std::unique_ptr target, + int crash_failure_one_in, + int sync_failure_one_in, + bool crash_after_sync_failure) : + rocksdb::WritableFileWrapper(target.get()), + target_(std::move(target)), + crash_failure_one_in_(crash_failure_one_in), + sync_failure_one_in_(sync_failure_one_in), + crash_after_sync_failure_(crash_after_sync_failure), + num_syncs_until_crash_(kNoCountdown) {} + +rocksdb::Status SyncFaultInjectionWritableFile::Append( + const rocksdb::Slice& data) { + std::unique_lock lock(mu_); + buffer_.append(data.data(), data.size()); + return rocksdb::Status::OK(); +} + +// We are using process crash to simulate system crash for tests and don't +// expect these tests to face actual system crashes. So for "syncing" it is +// sufficient to push data into page cache via the underlying `WritableFile`'s +// `Append()`. That should be enough for the file data to survive a process +// crash. +rocksdb::Status SyncFaultInjectionWritableFile::Sync() { + std::unique_lock lock(mu_); + if (num_syncs_until_crash_ > kNoCountdown) { + --num_syncs_until_crash_; + if (num_syncs_until_crash_ == 0) { + exit(0); + } + // On Linux the behavior after a sync failure occurred is to clear the error + // state and continue accepting writes/syncs. To simulate that behavior, we + // do not return early here, even though the file is known to have lost writes. + } + + if (crash_failure_one_in_ > 0 && random() % crash_failure_one_in_ == 0) { + exit(0); + } else if (sync_failure_one_in_ > 0 && random() % sync_failure_one_in_ == 0) { + if (num_syncs_until_crash_ == kNoCountdown && crash_after_sync_failure_) { + // This was the first failure. Start the countdown. + num_syncs_until_crash_ = kStartCountdown; + } + // As mentioned above, after a sync failure we allow continued writes and syncs + // to the same file. To make sure those new writes are written at the proper offset, + // we cannot drop unsynced writes simply by clearing the buffer. Instead we drop + // unsynced writes by overwriting the buffer with all zeros (well, this assumes + // the buffer didn't have all zeros to begin with). + buffer_.replace(0, buffer_.size(), buffer_.size(), '\0'); + return rocksdb::Status::IOError(); + } + std::string old_buffer; + buffer_.swap(old_buffer); + // It should be fine to buffer new writes while we're syncing old ones, so unlock. + lock.unlock(); + return target_->Append(old_buffer); +} + +SyncFaultInjectionEnv::SyncFaultInjectionEnv( + Env* target, + int crash_failure_one_in, + int sync_failure_one_in, + bool crash_after_sync_failure) : + rocksdb::EnvWrapper(target), + crash_failure_one_in_(crash_failure_one_in), + sync_failure_one_in_(sync_failure_one_in), + crash_after_sync_failure_(crash_after_sync_failure) {} + +rocksdb::Status SyncFaultInjectionEnv::NewWritableFile( + const std::string& filename, + std::unique_ptr* result, + const rocksdb::EnvOptions& env_options) { + std::unique_ptr underlying_file; + rocksdb::Status s = EnvWrapper::NewWritableFile(filename, &underlying_file, env_options); + if (s.ok()) { + result->reset(new SyncFaultInjectionWritableFile( + std::move(underlying_file), + crash_failure_one_in_, + sync_failure_one_in_, + crash_after_sync_failure_)); + } + return s; +} + +} // rocksdb_utils diff --git a/c-deps/libroach/rocksdbutils/env_sync_fault_injection.h b/c-deps/libroach/rocksdbutils/env_sync_fault_injection.h new file mode 100644 index 000000000000..a07b66a66dbe --- /dev/null +++ b/c-deps/libroach/rocksdbutils/env_sync_fault_injection.h @@ -0,0 +1,61 @@ +// Copyright 2019 The Cockroach Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. + +#pragma once + +#include +#include + +#include "rocksdb/env.h" + +namespace rocksdb_utils { + +// `SyncFaultInjectionEnv` creates files that buffer `Append()`s in process memory +// until `Sync()` is called. Such files enable us to simulate machine crashes by only +// crashing the process. This works since, unlike normal files whose writes survive +// process crash in page cache, these files' unsynced writes are dropped on the floor. +// +// Such files also enable us to simulate sync failure by dropping unsynced writes at +// the same time we inject a sync error. This is more comprehensive than the available +// fault injection tools I looked at (like libfiu and charybdefs), as those ones only +// inject errors without dropping unsynced writes. +class SyncFaultInjectionEnv : public rocksdb::EnvWrapper { + public: + // - `target`: A pointer to the underlying `Env`. + // - `crash_failure_one_in`: During a sync operation, crash the process immediately + // with a probability of 1/n. All unsynced writes are lost since they are buffered + // in process memory. + // - `sync_failure_one_in`: A sync operation will return failure with a probability + // of 1/n. All unsynced writes for the file are dropped to simulate the failure. + // - `crash_after_sync_failure`: If set to true, the program will crash itself some + // time after the first simulated sync failure. It does not happen immediately to + // allow the system to get itself into a weird state in case it doesn't handle sync + // failures properly. + SyncFaultInjectionEnv( + Env* target, + int crash_failure_one_in, + int sync_failure_one_in, + bool crash_after_sync_failure); + + rocksdb::Status NewWritableFile(const std::string& filename, + std::unique_ptr* result, + const rocksdb::EnvOptions& env_options) override; + + private: + const int crash_failure_one_in_; + const int sync_failure_one_in_; + const bool crash_after_sync_failure_; +}; + +} // rocksdb_utils diff --git a/c-deps/rocksdb b/c-deps/rocksdb index 47e4f25d7b30..90a02d472277 160000 --- a/c-deps/rocksdb +++ b/c-deps/rocksdb @@ -1 +1 @@ -Subproject commit 47e4f25d7b30c5cdf077e4f96b945a2c6b790d7b +Subproject commit 90a02d47227787436016e5d22c5287d2210d7da0 diff --git a/pkg/cmd/roachprod/install/cockroach.go b/pkg/cmd/roachprod/install/cockroach.go index bc5382de59c3..15c207acbd4d 100644 --- a/pkg/cmd/roachprod/install/cockroach.go +++ b/pkg/cmd/roachprod/install/cockroach.go @@ -386,6 +386,10 @@ tar cvf certs.tar certs cmd += `echo ">>> roachprod start: $(date)" >> ` + logDir + "/roachprod.log; " + `ps axeww -o pid -o command >> ` + logDir + "/roachprod.log; " + `[ -x /usr/bin/lslocks ] && /usr/bin/lslocks >> ` + logDir + "/roachprod.log; " + if c.IsLocal() { + // This is consistent with the working directory used by `roachprod run`. + cmd += fmt.Sprintf("cd ${HOME}/local/%d ; ", nodes[i]) + } cmd += keyCmd + fmt.Sprintf(" export ROACHPROD=%d%s && ", nodes[i], c.Tag) + "GOTRACEBACK=crash " + diff --git a/pkg/cmd/roachprod/vm/gce/gcloud.go b/pkg/cmd/roachprod/vm/gce/gcloud.go index 1b54189679d8..e1d113b729e0 100644 --- a/pkg/cmd/roachprod/vm/gce/gcloud.go +++ b/pkg/cmd/roachprod/vm/gce/gcloud.go @@ -51,7 +51,7 @@ var projectsWithGC = []string{defaultProject, "andrei-jepsen"} // init will inject the GCE provider into vm.Providers, but only if the gcloud tool is available on the local path. func init() { - var p vm.Provider + var p vm.Provider = &Provider{} if _, err := exec.LookPath("gcloud"); err != nil { p = flagstub.New(p, "please install the gcloud CLI utilities "+ "(https://cloud.google.com/sdk/downloads)") diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go index c11f4b4af784..07b0e3da7f04 100644 --- a/pkg/cmd/roachtest/cluster.go +++ b/pkg/cmd/roachtest/cluster.go @@ -925,7 +925,7 @@ func (c *cluster) All() nodeListOption { return c.Range(1, c.nodes) } -// All returns a node list containing the nodes [begin,end]. +// Range returns a node list containing the nodes [begin,end]. func (c *cluster) Range(begin, end int) nodeListOption { if begin < 1 || end > c.nodes { c.t.Fatalf("invalid node range: %d-%d (1-%d)", begin, end, c.nodes) @@ -937,7 +937,7 @@ func (c *cluster) Range(begin, end int) nodeListOption { return r } -// All returns a node list containing only the node i. +// Node returns a node list containing only the node i. func (c *cluster) Node(i int) nodeListOption { return c.Range(i, i) } diff --git a/pkg/cmd/roachtest/registry.go b/pkg/cmd/roachtest/registry.go index 5d06a68fa737..5a808e4ad70d 100644 --- a/pkg/cmd/roachtest/registry.go +++ b/pkg/cmd/roachtest/registry.go @@ -69,6 +69,7 @@ func registerTests(r *registry) { registerSQLsmith(r) registerSyncTest(r) registerSysbench(r) + registerSystemCrashTest(r) registerTPCC(r) registerTypeORM(r) registerLoadSplits(r) diff --git a/pkg/cmd/roachtest/system_crash.go b/pkg/cmd/roachtest/system_crash.go new file mode 100644 index 000000000000..e4d08c270846 --- /dev/null +++ b/pkg/cmd/roachtest/system_crash.go @@ -0,0 +1,199 @@ +// Copyright 2019 The Cockroach Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. See the AUTHORS file +// for names of contributors. + +package main + +import ( + "bufio" + "context" + "fmt" + "math" + "strconv" + "strings" + "time" + + "github.com/cockroachdb/cockroach/pkg/util/contextutil" +) + +func registerSystemCrashTest(r *registry) { + // Two variants of this test: + // + // - `syncErrors == false`: The unreliable cockroach processes (i.e., the ones running + // with fault injection `Env`s) will crash themselves during a randomly chosen sync + // operation. The crash occurs before that sync operation writes out data. + // + // - `syncErrors == true`: The unreliable cockroach processes will return an error + // from a randomly chosen sync operation, and also drop the writes corresponding + // to that sync. If cockroach does not crash itself, the fault injection `Env` + // will crash for us shortly afterwards. + for _, syncErrors := range []bool{false, true} { + syncErrors := syncErrors + r.Add(testSpec{ + Name: fmt.Sprintf("system-crash/sync-errors=%t", syncErrors), + Cluster: makeClusterSpec(6 /* nodeCount */), + Run: func(ctx context.Context, t *test, c *cluster) { + runSystemCrashTest(ctx, t, c, syncErrors) + }, + }) + } +} + +func runSystemCrashTest(ctx context.Context, t *test, c *cluster, syncErrors bool) { + // Currently setup is as follows: + // - Node 1 runs a reliable (i.e., no fault injection) Cockroach process + // - Node 2..n-1 run unreliable Cockroach processes + // - Node n runs a `workload kv` process. It targets the reliable Cockroach on node 1. + getCockroachNodes := func(c *cluster) nodeListOption { + return c.Range(1, c.nodes-1) + } + + getReliableCockroachNode := func(c *cluster) nodeListOption { + cockroachNodes := getCockroachNodes(c) + return cockroachNodes[0:1] + } + + getUnreliableCockroachNodes := func(c *cluster) nodeListOption { + cockroachNodes := getCockroachNodes(c) + return cockroachNodes[1:] + } + + getWorkloadNode := func(c *cluster) nodeListOption { + return c.Node(c.nodes) + } + + t.Status("installing binaries") + c.Put(ctx, cockroach, "./cockroach", getCockroachNodes(c)) + c.Put(ctx, workload, "./workload", getWorkloadNode(c)) + + startCockroachNodes := func( + ctx context.Context, t *test, c *cluster, nodes nodeListOption, reliable, syncErrors bool, + ) { + envVars := "COCKROACH_CONSISTENCY_CHECK_INTERVAL=10s" + args := "--store=path=./data" + if !reliable { + if syncErrors { + args += ",rocksdb=env=sync-failure-injection-wrapping-default-env" + } else { + args += ",rocksdb=env=crash-failure-injection-wrapping-default-env" + } + } + c.Start(ctx, t, nodes, startArgs( + "--env", envVars, + "--args", args, + )) + } + + t.Status("starting cockroach") + startCockroachNodes(ctx, t, c, getReliableCockroachNode(c), true /* reliable */, syncErrors) + startCockroachNodes(ctx, t, c, getUnreliableCockroachNodes(c), false /* reliable */, syncErrors) + + // There are two goroutines run under the monitor `m`: + // + // (1) A goroutine that runs `workload` synchronously. + // (2) A goroutine that loops running `roachprod monitor`, looking for dead nodes and + // restarting them. + // + // When nothing goes wrong, (1) finishes first. It then invokes `cancel()` to tell (2) + // to exit its loop. Then once both goroutines have completed, `m.Wait()` returns. + ctx, cancel := context.WithCancel(ctx) + m := newMonitor(ctx, c, getCockroachNodes(c)) + + t.Status("running kv workload") + // Launch goroutine running kv workload + m.Go(func(ctx context.Context) error { + c.Run( + ctx, getWorkloadNode(c), "./workload", "run", "kv", "--init", "--splits=1000", + "--histograms=logs/stats.json", "--concurrency=12", "--duration=1h", + "--read-percent=0", "--batch=32", "--min-block-bytes=128", "--max-block-bytes=128", + "--tolerate-errors", fmt.Sprintf("{pgurl%s}", getReliableCockroachNode(c)), + ) + cancel() + return nil + }) + + // Launch goroutine for monitoring and restarting dead cockroach nodes + m.Go(func(ctx context.Context) error { + for { + select { + case <-ctx.Done(): + return nil + case <-time.After(1 * time.Second): + } + + var restartNodeIds []int + // Running `roachprod monitor` under a timeout is a hack copied from + // `FailOnDeadNodes`. It is necessary because sometimes that command + // gets stuck in an infinite loop when run with the `--oneshot` option. + _ = contextutil.RunWithTimeout( + ctx, "restart dead nodes", 1*time.Second, + func(ctx context.Context) error { + output, err := execCmdWithBuffer( + ctx, t.l, roachprod, "monitor", c.name, "--oneshot", "--ignore-empty-nodes", + ) + // If there's an error, it means either that the monitor command failed + // completely, or that it found a dead node worth complaining about. + if err != nil { + if ctx.Err() != nil { + // Don't fail if we timed out. Could be the known infinite loop bug. + return nil + } + + // Figure out which nodes are dead and restart them. Output looks like below + // when there are dead nodes. + // + // 3: dead + // 1: 6890 + // 4: 7075 + // 2: 6951 + // Error: 3: dead + scanner := bufio.NewScanner(strings.NewReader(string(output))) + for scanner.Scan() { + line := scanner.Text() + if strings.HasPrefix(line, "Error:") { + // We already passed over all the nodes' statuses once. The + // remaining lines are redundant (see example output above). + break + } + fields := strings.Split(line, ": ") + if len(fields) != 2 { + t.Fatalf("unexpected `roachprod monitor` output line: %s", line) + } + nodeId, err := strconv.Atoi(fields[0]) + if err != nil { + t.Fatalf("unexpected `roachprod monitor` output line: %s", line) + } + nodeStatus := fields[1] + if nodeStatus == "dead" { + restartNodeIds = append(restartNodeIds, nodeId) + } + } + } + return nil + }, + ) + for i := range restartNodeIds { + startCockroachNodes( + ctx, t, c, c.Node(restartNodeIds[i]), false, /* reliable */ + syncErrors, + ) + } + } + }) + + // Permit any number of node crashes/restarts since this test causes them intentionally. + m.ExpectDeaths(math.MaxInt32) + + m.Wait() +}