diff --git a/c-deps/libroach/CMakeLists.txt b/c-deps/libroach/CMakeLists.txt index fe43dcf50f1a..b6d86f44b7a4 100644 --- a/c-deps/libroach/CMakeLists.txt +++ b/c-deps/libroach/CMakeLists.txt @@ -54,6 +54,7 @@ add_library(roach protos/util/log/log.pb.cc protos/util/unresolved_addr.pb.cc rocksdbutils/env_encryption.cc + rocksdbutils/env_sync_fault_injection.cc ) target_include_directories(roach PUBLIC ./include diff --git a/c-deps/libroach/db.cc b/c-deps/libroach/db.cc index 298da9c7e001..c4f43d82fe10 100644 --- a/c-deps/libroach/db.cc +++ b/c-deps/libroach/db.cc @@ -19,6 +19,7 @@ #include #include #include +#include #include #include "batch.h" #include "cache.h" @@ -34,6 +35,7 @@ #include "iterator.h" #include "merge.h" #include "options.h" +#include "rocksdbutils/env_sync_fault_injection.h" #include "snapshot.h" #include "status.h" #include "table_props.h" @@ -151,7 +153,41 @@ static DBOpenHook* db_open_hook = DBOpenHookOSS; void DBSetOpenHook(void* hook) { db_open_hook = (DBOpenHook*)hook; } +void DBRegisterTestingEnvs() { + // Have a couple `SyncFaultInjectionEnv`s registered with RocksDB for our test + // cases to use, which they can select using any of RocksDB's text-based options + // mechanisms. Currently the only one Cockroach exposes is options string which + // is passed the value from `DBOptions::rocksdb_options`. + static rocksdb::Registrar sync_failure_env_reg( + "sync-failure-injection-wrapping-default-env", + [](const std::string& /* name */, + std::unique_ptr* /* guard */) { + static rocksdb_utils::SyncFaultInjectionEnv env( + rocksdb::Env::Default(), + 0 /* crash_failure_one_in */, + 500000 /* sync_failure_one_in */, + true /* crash_after_sync_failure */); + return &env; + } + ); + + static rocksdb::Registrar crash_failure_env_reg( + "crash-failure-injection-wrapping-default-env", + [](const std::string& /* name */, + std::unique_ptr* /* guard */) { + static rocksdb_utils::SyncFaultInjectionEnv env( + rocksdb::Env::Default(), + 500000 /* crash_failure_one_in */, + 0 /* sync_failure_one_in */, + false /* crash_after_sync_failure */); + return &env; + } + ); +} + DBStatus DBOpen(DBEngine** db, DBSlice dir, DBOptions db_opts) { + DBRegisterTestingEnvs(); + rocksdb::Options options = DBMakeOptions(db_opts); const std::string additional_options = ToString(db_opts.rocksdb_options); diff --git a/c-deps/libroach/rocksdbutils/env_sync_fault_injection.cc b/c-deps/libroach/rocksdbutils/env_sync_fault_injection.cc new file mode 100644 index 000000000000..6d7e4d371e8b --- /dev/null +++ b/c-deps/libroach/rocksdbutils/env_sync_fault_injection.cc @@ -0,0 +1,132 @@ +// Copyright 2019 The Cockroach Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. + +#include "env_sync_fault_injection.h" +#include "rocksdb/utilities/object_registry.h" + +namespace rocksdb_utils { + +// See comment above `SyncFaultInjectionEnv` class definition. +class SyncFaultInjectionWritableFile : public rocksdb::WritableFileWrapper { + public: + SyncFaultInjectionWritableFile(std::unique_ptr target, + int crash_failure_one_in, + int sync_failure_one_in, + bool crash_after_sync_failure); + + rocksdb::Status Append(const rocksdb::Slice& data) override; + rocksdb::Status Sync() override; + + private: + std::unique_ptr target_; + const int crash_failure_one_in_; + const int sync_failure_one_in_; + const bool crash_after_sync_failure_; + // Countdown until crash if a sync failure already happened. + int num_syncs_until_crash_; + // Lock needed to handle concurrent writes and syncs. + std::mutex mu_; + // A buffer of written but unsynced data. + std::string buffer_; + + // Some constants for use with `num_syncs_until_crash_`. + const static int kNoCountdown = -1; + const static int kStartCountdown = 10; +}; + +SyncFaultInjectionWritableFile::SyncFaultInjectionWritableFile( + std::unique_ptr target, + int crash_failure_one_in, + int sync_failure_one_in, + bool crash_after_sync_failure) : + rocksdb::WritableFileWrapper(target.get()), + target_(std::move(target)), + crash_failure_one_in_(crash_failure_one_in), + sync_failure_one_in_(sync_failure_one_in), + crash_after_sync_failure_(crash_after_sync_failure), + num_syncs_until_crash_(kNoCountdown) {} + +rocksdb::Status SyncFaultInjectionWritableFile::Append( + const rocksdb::Slice& data) { + std::unique_lock lock(mu_); + buffer_.append(data.data(), data.size()); + return rocksdb::Status::OK(); +} + +// We are using process crash to simulate system crash for tests and don't +// expect these tests to face actual system crashes. So for "syncing" it is +// sufficient to push data into page cache via the underlying `WritableFile`'s +// `Append()`. That should be enough for the file data to survive a process +// crash. +rocksdb::Status SyncFaultInjectionWritableFile::Sync() { + std::unique_lock lock(mu_); + if (num_syncs_until_crash_ > kNoCountdown) { + --num_syncs_until_crash_; + if (num_syncs_until_crash_ == 0) { + exit(0); + } + // On Linux the behavior after a sync failure occurred is to clear the error + // state and continue accepting writes/syncs. To simulate that behavior, we + // do not return early here, even though the file is known to have lost writes. + } + + if (crash_failure_one_in_ > 0 && random() % crash_failure_one_in_ == 0) { + exit(0); + } else if (sync_failure_one_in_ > 0 && random() % sync_failure_one_in_ == 0) { + if (num_syncs_until_crash_ == kNoCountdown && crash_after_sync_failure_) { + // This was the first failure. Start the countdown. + num_syncs_until_crash_ = kStartCountdown; + } + // As mentioned above, after a sync failure we allow continued writes and syncs + // to the same file. To make sure those new writes are written at the proper offset, + // we cannot drop unsynced writes simply by clearing the buffer. Instead we drop + // unsynced writes by overwriting the buffer with all zeros (well, this assumes + // the buffer didn't have all zeros to begin with). + buffer_.replace(0, buffer_.size(), buffer_.size(), '\0'); + return rocksdb::Status::IOError(); + } + std::string old_buffer; + buffer_.swap(old_buffer); + // It should be fine to buffer new writes while we're syncing old ones, so unlock. + lock.unlock(); + return target_->Append(old_buffer); +} + +SyncFaultInjectionEnv::SyncFaultInjectionEnv( + Env* target, + int crash_failure_one_in, + int sync_failure_one_in, + bool crash_after_sync_failure) : + rocksdb::EnvWrapper(target), + crash_failure_one_in_(crash_failure_one_in), + sync_failure_one_in_(sync_failure_one_in), + crash_after_sync_failure_(crash_after_sync_failure) {} + +rocksdb::Status SyncFaultInjectionEnv::NewWritableFile( + const std::string& filename, + std::unique_ptr* result, + const rocksdb::EnvOptions& env_options) { + std::unique_ptr underlying_file; + rocksdb::Status s = EnvWrapper::NewWritableFile(filename, &underlying_file, env_options); + if (s.ok()) { + result->reset(new SyncFaultInjectionWritableFile( + std::move(underlying_file), + crash_failure_one_in_, + sync_failure_one_in_, + crash_after_sync_failure_)); + } + return s; +} + +} // rocksdb_utils diff --git a/c-deps/libroach/rocksdbutils/env_sync_fault_injection.h b/c-deps/libroach/rocksdbutils/env_sync_fault_injection.h new file mode 100644 index 000000000000..a07b66a66dbe --- /dev/null +++ b/c-deps/libroach/rocksdbutils/env_sync_fault_injection.h @@ -0,0 +1,61 @@ +// Copyright 2019 The Cockroach Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. + +#pragma once + +#include +#include + +#include "rocksdb/env.h" + +namespace rocksdb_utils { + +// `SyncFaultInjectionEnv` creates files that buffer `Append()`s in process memory +// until `Sync()` is called. Such files enable us to simulate machine crashes by only +// crashing the process. This works since, unlike normal files whose writes survive +// process crash in page cache, these files' unsynced writes are dropped on the floor. +// +// Such files also enable us to simulate sync failure by dropping unsynced writes at +// the same time we inject a sync error. This is more comprehensive than the available +// fault injection tools I looked at (like libfiu and charybdefs), as those ones only +// inject errors without dropping unsynced writes. +class SyncFaultInjectionEnv : public rocksdb::EnvWrapper { + public: + // - `target`: A pointer to the underlying `Env`. + // - `crash_failure_one_in`: During a sync operation, crash the process immediately + // with a probability of 1/n. All unsynced writes are lost since they are buffered + // in process memory. + // - `sync_failure_one_in`: A sync operation will return failure with a probability + // of 1/n. All unsynced writes for the file are dropped to simulate the failure. + // - `crash_after_sync_failure`: If set to true, the program will crash itself some + // time after the first simulated sync failure. It does not happen immediately to + // allow the system to get itself into a weird state in case it doesn't handle sync + // failures properly. + SyncFaultInjectionEnv( + Env* target, + int crash_failure_one_in, + int sync_failure_one_in, + bool crash_after_sync_failure); + + rocksdb::Status NewWritableFile(const std::string& filename, + std::unique_ptr* result, + const rocksdb::EnvOptions& env_options) override; + + private: + const int crash_failure_one_in_; + const int sync_failure_one_in_; + const bool crash_after_sync_failure_; +}; + +} // rocksdb_utils diff --git a/c-deps/rocksdb b/c-deps/rocksdb index 47e4f25d7b30..90a02d472277 160000 --- a/c-deps/rocksdb +++ b/c-deps/rocksdb @@ -1 +1 @@ -Subproject commit 47e4f25d7b30c5cdf077e4f96b945a2c6b790d7b +Subproject commit 90a02d47227787436016e5d22c5287d2210d7da0 diff --git a/pkg/cmd/roachprod/install/cockroach.go b/pkg/cmd/roachprod/install/cockroach.go index bc5382de59c3..15c207acbd4d 100644 --- a/pkg/cmd/roachprod/install/cockroach.go +++ b/pkg/cmd/roachprod/install/cockroach.go @@ -386,6 +386,10 @@ tar cvf certs.tar certs cmd += `echo ">>> roachprod start: $(date)" >> ` + logDir + "/roachprod.log; " + `ps axeww -o pid -o command >> ` + logDir + "/roachprod.log; " + `[ -x /usr/bin/lslocks ] && /usr/bin/lslocks >> ` + logDir + "/roachprod.log; " + if c.IsLocal() { + // This is consistent with the working directory used by `roachprod run`. + cmd += fmt.Sprintf("cd ${HOME}/local/%d ; ", nodes[i]) + } cmd += keyCmd + fmt.Sprintf(" export ROACHPROD=%d%s && ", nodes[i], c.Tag) + "GOTRACEBACK=crash " + diff --git a/pkg/cmd/roachprod/vm/gce/gcloud.go b/pkg/cmd/roachprod/vm/gce/gcloud.go index 1b54189679d8..e1d113b729e0 100644 --- a/pkg/cmd/roachprod/vm/gce/gcloud.go +++ b/pkg/cmd/roachprod/vm/gce/gcloud.go @@ -51,7 +51,7 @@ var projectsWithGC = []string{defaultProject, "andrei-jepsen"} // init will inject the GCE provider into vm.Providers, but only if the gcloud tool is available on the local path. func init() { - var p vm.Provider + var p vm.Provider = &Provider{} if _, err := exec.LookPath("gcloud"); err != nil { p = flagstub.New(p, "please install the gcloud CLI utilities "+ "(https://cloud.google.com/sdk/downloads)") diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go index c11f4b4af784..07b0e3da7f04 100644 --- a/pkg/cmd/roachtest/cluster.go +++ b/pkg/cmd/roachtest/cluster.go @@ -925,7 +925,7 @@ func (c *cluster) All() nodeListOption { return c.Range(1, c.nodes) } -// All returns a node list containing the nodes [begin,end]. +// Range returns a node list containing the nodes [begin,end]. func (c *cluster) Range(begin, end int) nodeListOption { if begin < 1 || end > c.nodes { c.t.Fatalf("invalid node range: %d-%d (1-%d)", begin, end, c.nodes) @@ -937,7 +937,7 @@ func (c *cluster) Range(begin, end int) nodeListOption { return r } -// All returns a node list containing only the node i. +// Node returns a node list containing only the node i. func (c *cluster) Node(i int) nodeListOption { return c.Range(i, i) } diff --git a/pkg/cmd/roachtest/registry.go b/pkg/cmd/roachtest/registry.go index 5d06a68fa737..5a808e4ad70d 100644 --- a/pkg/cmd/roachtest/registry.go +++ b/pkg/cmd/roachtest/registry.go @@ -69,6 +69,7 @@ func registerTests(r *registry) { registerSQLsmith(r) registerSyncTest(r) registerSysbench(r) + registerSystemCrashTest(r) registerTPCC(r) registerTypeORM(r) registerLoadSplits(r) diff --git a/pkg/cmd/roachtest/system_crash.go b/pkg/cmd/roachtest/system_crash.go new file mode 100644 index 000000000000..e4d08c270846 --- /dev/null +++ b/pkg/cmd/roachtest/system_crash.go @@ -0,0 +1,199 @@ +// Copyright 2019 The Cockroach Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. See the AUTHORS file +// for names of contributors. + +package main + +import ( + "bufio" + "context" + "fmt" + "math" + "strconv" + "strings" + "time" + + "github.com/cockroachdb/cockroach/pkg/util/contextutil" +) + +func registerSystemCrashTest(r *registry) { + // Two variants of this test: + // + // - `syncErrors == false`: The unreliable cockroach processes (i.e., the ones running + // with fault injection `Env`s) will crash themselves during a randomly chosen sync + // operation. The crash occurs before that sync operation writes out data. + // + // - `syncErrors == true`: The unreliable cockroach processes will return an error + // from a randomly chosen sync operation, and also drop the writes corresponding + // to that sync. If cockroach does not crash itself, the fault injection `Env` + // will crash for us shortly afterwards. + for _, syncErrors := range []bool{false, true} { + syncErrors := syncErrors + r.Add(testSpec{ + Name: fmt.Sprintf("system-crash/sync-errors=%t", syncErrors), + Cluster: makeClusterSpec(6 /* nodeCount */), + Run: func(ctx context.Context, t *test, c *cluster) { + runSystemCrashTest(ctx, t, c, syncErrors) + }, + }) + } +} + +func runSystemCrashTest(ctx context.Context, t *test, c *cluster, syncErrors bool) { + // Currently setup is as follows: + // - Node 1 runs a reliable (i.e., no fault injection) Cockroach process + // - Node 2..n-1 run unreliable Cockroach processes + // - Node n runs a `workload kv` process. It targets the reliable Cockroach on node 1. + getCockroachNodes := func(c *cluster) nodeListOption { + return c.Range(1, c.nodes-1) + } + + getReliableCockroachNode := func(c *cluster) nodeListOption { + cockroachNodes := getCockroachNodes(c) + return cockroachNodes[0:1] + } + + getUnreliableCockroachNodes := func(c *cluster) nodeListOption { + cockroachNodes := getCockroachNodes(c) + return cockroachNodes[1:] + } + + getWorkloadNode := func(c *cluster) nodeListOption { + return c.Node(c.nodes) + } + + t.Status("installing binaries") + c.Put(ctx, cockroach, "./cockroach", getCockroachNodes(c)) + c.Put(ctx, workload, "./workload", getWorkloadNode(c)) + + startCockroachNodes := func( + ctx context.Context, t *test, c *cluster, nodes nodeListOption, reliable, syncErrors bool, + ) { + envVars := "COCKROACH_CONSISTENCY_CHECK_INTERVAL=10s" + args := "--store=path=./data" + if !reliable { + if syncErrors { + args += ",rocksdb=env=sync-failure-injection-wrapping-default-env" + } else { + args += ",rocksdb=env=crash-failure-injection-wrapping-default-env" + } + } + c.Start(ctx, t, nodes, startArgs( + "--env", envVars, + "--args", args, + )) + } + + t.Status("starting cockroach") + startCockroachNodes(ctx, t, c, getReliableCockroachNode(c), true /* reliable */, syncErrors) + startCockroachNodes(ctx, t, c, getUnreliableCockroachNodes(c), false /* reliable */, syncErrors) + + // There are two goroutines run under the monitor `m`: + // + // (1) A goroutine that runs `workload` synchronously. + // (2) A goroutine that loops running `roachprod monitor`, looking for dead nodes and + // restarting them. + // + // When nothing goes wrong, (1) finishes first. It then invokes `cancel()` to tell (2) + // to exit its loop. Then once both goroutines have completed, `m.Wait()` returns. + ctx, cancel := context.WithCancel(ctx) + m := newMonitor(ctx, c, getCockroachNodes(c)) + + t.Status("running kv workload") + // Launch goroutine running kv workload + m.Go(func(ctx context.Context) error { + c.Run( + ctx, getWorkloadNode(c), "./workload", "run", "kv", "--init", "--splits=1000", + "--histograms=logs/stats.json", "--concurrency=12", "--duration=1h", + "--read-percent=0", "--batch=32", "--min-block-bytes=128", "--max-block-bytes=128", + "--tolerate-errors", fmt.Sprintf("{pgurl%s}", getReliableCockroachNode(c)), + ) + cancel() + return nil + }) + + // Launch goroutine for monitoring and restarting dead cockroach nodes + m.Go(func(ctx context.Context) error { + for { + select { + case <-ctx.Done(): + return nil + case <-time.After(1 * time.Second): + } + + var restartNodeIds []int + // Running `roachprod monitor` under a timeout is a hack copied from + // `FailOnDeadNodes`. It is necessary because sometimes that command + // gets stuck in an infinite loop when run with the `--oneshot` option. + _ = contextutil.RunWithTimeout( + ctx, "restart dead nodes", 1*time.Second, + func(ctx context.Context) error { + output, err := execCmdWithBuffer( + ctx, t.l, roachprod, "monitor", c.name, "--oneshot", "--ignore-empty-nodes", + ) + // If there's an error, it means either that the monitor command failed + // completely, or that it found a dead node worth complaining about. + if err != nil { + if ctx.Err() != nil { + // Don't fail if we timed out. Could be the known infinite loop bug. + return nil + } + + // Figure out which nodes are dead and restart them. Output looks like below + // when there are dead nodes. + // + // 3: dead + // 1: 6890 + // 4: 7075 + // 2: 6951 + // Error: 3: dead + scanner := bufio.NewScanner(strings.NewReader(string(output))) + for scanner.Scan() { + line := scanner.Text() + if strings.HasPrefix(line, "Error:") { + // We already passed over all the nodes' statuses once. The + // remaining lines are redundant (see example output above). + break + } + fields := strings.Split(line, ": ") + if len(fields) != 2 { + t.Fatalf("unexpected `roachprod monitor` output line: %s", line) + } + nodeId, err := strconv.Atoi(fields[0]) + if err != nil { + t.Fatalf("unexpected `roachprod monitor` output line: %s", line) + } + nodeStatus := fields[1] + if nodeStatus == "dead" { + restartNodeIds = append(restartNodeIds, nodeId) + } + } + } + return nil + }, + ) + for i := range restartNodeIds { + startCockroachNodes( + ctx, t, c, c.Node(restartNodeIds[i]), false, /* reliable */ + syncErrors, + ) + } + } + }) + + // Permit any number of node crashes/restarts since this test causes them intentionally. + m.ExpectDeaths(math.MaxInt32) + + m.Wait() +}