Skip to content

Commit

Permalink
cli,base: surface stored critical errors at the right moment
Browse files Browse the repository at this point in the history
If/when storage detects an important error, the text for this error is
stored in a file named `_CRITICAL_ERROR.txt` in the auxiliary
directory. The intention is to block subsequent server restarts until
the error is investigated and the file (manually) removed.

Prior to this patch, this check was done in the startup sequence 1)
before logging was fully initialized 2) using a `log.Fatal` to
announce the critical error.

The first aspect is problematic because it logs before logging flags
are applied. The second is problematic because it makes the failure
super-verbose and buries the lede.

This patch simplifies the code and makes the error reported at the
right place.

Example, before:
```
kena@kenax ....com/cockroachdb/cockroach % ./cockroach start-single-node
F200421 14:24:02.303675 1 cli/start.go:478  From .../auxiliary/_CRITICAL_ALERT.txt:

boom

goroutine 1 [running]:
github.com/cockroachdb/cockroach/pkg/util/log.getStacks(0x6a7ca00, 0xed630f902, 0x0, 0x1000)
        ...//src/github.com/cockroachdb/cockroach/pkg/util/log/get_stacks.go:25 +0xb8
github.com/cockroachdb/cockroach/pkg/util/log.(*loggerT).outputLogEntry(0x6a79800, 0xc000000004, 0x33eb7b9, 0xc, 0x1de, 0xc000ba8180, 0x76)
        ...//src/github.com/cockroachdb/cockroach/pkg/util/log/clog.go:210 +0xa92
github.com/cockroachdb/cockroach/pkg/util/log.addStructured(0x15e3420, 0xc000078168, 0x4, 0x2, 0x0, 0x0, 0xc00063f7e8, 0x1, 0x1)
        ...//src/github.com/cockroachdb/cockroach/pkg/util/log/structured.go:66 +0x2c9
...
[147/245]
****************************************************************************

This node experienced a fatal error (printed above), and as a result the
process is terminating.

Fatal errors can occur due to faulty hardware (disks, memory, clocks) or a
...
    support@cockroachlabs.com

The Cockroach Labs team appreciates your feedback.
```

Example, after:

```
kena@kenax ....com/cockroachdb/cockroach % ./cockroach start-single-node
*
* ERROR: ERROR: startup forbidden by prior critical alert
* DETAIL: From /data/home/kena/src/go/src/github.com/cockroachdb/cockroach/cockroach-data/auxiliary/_CRITICAL_ALERT.txt:
* boom
*
```

Release note (cli change): The error message displayed upon `cockroach
start` / `cockroach start-single-node` when manual intervention is
needed in the store directory is now clearer.
  • Loading branch information
knz committed Apr 21, 2020
1 parent 5ad50d8 commit cd63487
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 25 deletions.
33 changes: 21 additions & 12 deletions pkg/base/store_spec.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ import (
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
"github.com/cockroachdb/cockroach/pkg/util/netutil"
"github.com/cockroachdb/errors"
humanize "github.com/dustin/go-humanize"
"github.com/pkg/errors"
"github.com/spf13/pflag"
)

Expand Down Expand Up @@ -369,12 +369,23 @@ func PreventedStartupFile(dir string) string {
return filepath.Join(dir, "_CRITICAL_ALERT.txt")
}

// GetPreventedStartupMessage attempts to read the PreventedStartupFile for each
// store directory and returns their concatenated contents. These files
// typically request operator intervention after a corruption event by
// preventing the affected node(s) from starting back up.
func (ssl StoreSpecList) GetPreventedStartupMessage() (string, error) {
var buf strings.Builder
// AssertNoPriorCriticalAlert attempts to read the
// PreventedStartupFile for each store directory and returns their
// contents as a structured error.
//
// These files typically request operator intervention after a
// corruption event by preventing the affected node(s) from starting
// back up.
func (ssl StoreSpecList) AssertNoPriorCriticalAlert() (err error) {
addError := func(newErr error) {
if err == nil {
err = errors.New("startup forbidden by prior critical alert")
}
// We use WithDetailf here instead of errors.CombineErrors
// because we want the details to be printed to the screen
// (combined errors only show up via %+v).
err = errors.WithDetailf(err, "%v", newErr)
}
for _, ss := range ssl.Specs {
path := ss.PreventedStartupFile()
if path == "" {
Expand All @@ -383,15 +394,13 @@ func (ssl StoreSpecList) GetPreventedStartupMessage() (string, error) {
b, err := ioutil.ReadFile(path)
if err != nil {
if !os.IsNotExist(err) {
return "", err
addError(errors.Wrapf(err, "%s", path))
}
continue
}
fmt.Fprintf(&buf, "From %s:\n\n", path)
_, _ = buf.Write(b)
fmt.Fprintln(&buf)
addError(errors.Newf("From %s:\n\n%s\n", path, b))
}
return buf.String(), nil
return err
}

// PreventedStartupFile returns the path to a file which, if it exists, should
Expand Down
11 changes: 6 additions & 5 deletions pkg/base/store_spec_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"github.com/cockroachdb/errors"
"github.com/stretchr/testify/require"
)

Expand Down Expand Up @@ -245,13 +246,13 @@ func TestStoreSpecListPreventedStartupMessage(t *testing.T) {
},
}

msg, err := ssl.GetPreventedStartupMessage()
err := ssl.AssertNoPriorCriticalAlert()
require.NoError(t, err)
require.Empty(t, msg)

require.NoError(t, ioutil.WriteFile(ssl.Specs[2].PreventedStartupFile(), []byte("boom"), 0644))

msg, err = ssl.GetPreventedStartupMessage()
require.NoError(t, err)
require.Contains(t, msg, "boom")
err = ssl.AssertNoPriorCriticalAlert()
require.Error(t, err)
require.Contains(t, err.Error(), "startup forbidden by prior critical alert")
require.Contains(t, errors.FlattenDetails(err), "boom")
}
14 changes: 6 additions & 8 deletions pkg/cli/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -470,14 +470,6 @@ func runStart(cmd *cobra.Command, args []string, disableReplication bool) error
// not be world-readable.
disableOtherPermissionBits()

// TODO(knz): the following call is not in the right place.
// See: https://github.com/cockroachdb/cockroach/issues/44041
if s, err := serverCfg.Stores.GetPreventedStartupMessage(); err != nil {
return err
} else if s != "" {
log.Fatal(context.Background(), s)
}

// Set up the signal handlers. This also ensures that any of these
// signals received beyond this point do not interrupt the startup
// sequence until the point signals are checked below.
Expand Down Expand Up @@ -521,6 +513,12 @@ func runStart(cmd *cobra.Command, args []string, disableReplication bool) error
return err
}

// If any store has something to say against a server start-up
// (e.g. previously detected corruption), listen to them now.
if err := serverCfg.Stores.AssertNoPriorCriticalAlert(); err != nil {
return err
}

// We don't care about GRPCs fairly verbose logs in most client commands,
// but when actually starting a server, we enable them.
grpcutil.SetSeverity(log.Severity_WARNING)
Expand Down

0 comments on commit cd63487

Please sign in to comment.