pkg/cmd/roachtest/restart.go

// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package main

import (
	"context"
	"fmt"
	"time"

	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
)

// runRestart checks that a node which was briefly down, while the rest of the
// cluster kept receiving traffic, can serve queries again soon after it is
// restarted.
//
// It starts cockroach on every node, imports a tpcc fixture, stops node
// restartNode, runs the tpcc workload from node 1 for downDuration, and then
// restarts the stopped node. The test fails if connecting to that node and
// running a full count over tpcc.order_line takes longer than downDuration.
func runRestart(ctx context.Context, t *test, c *cluster, downDuration time.Duration) {
	crdbNodes := c.Range(1, c.nodes)
	workloadNode := c.Node(1)
	const restartNode = 3

	t.Status("installing cockroach")
	c.Put(ctx, cockroach, "./cockroach", crdbNodes)
	c.Start(ctx, t, crdbNodes, startArgs(`--args=--vmodule=raft_log_queue=3`))

	// We don't really need tpcc, we just need a good amount of traffic and a good
	// amount of data.
	t.Status("importing tpcc fixture")
	c.Run(ctx, workloadNode,
		"./cockroach workload fixtures import tpcc --warehouses=100 --fks=false --checks=false")

	// Wait a full scanner cycle (10m) for the raft log queue to truncate the
	// sstable entries from the import. They're huge and are not representative of
	// normal traffic.
	//
	// TODO(dan): It seems like some part of `fixtures import` should do this for
	// you. It's odd that a cluster that's freshly initialized with fixtures
	// import behaves so differently than it does 10m after.
	t.Status("waiting for addsstable truncations")
	// Use a timer instead of time.Sleep so that a cancelled context ends the
	// wait immediately rather than blocking for the full 11 minutes.
	truncationTimer := time.NewTimer(11 * time.Minute)
	defer truncationTimer.Stop()
	select {
	case <-truncationTimer.C:
	case <-ctx.Done():
		t.Fatal(ctx.Err())
	}

	// Stop a node.
	c.Stop(ctx, c.Node(restartNode))

	// Wait for between 10s and `server.time_until_store_dead` while sending
	// traffic to one of the nodes that are not down. This used to cause lots of
	// raft log truncation, which caused node 3 to need lots of snapshots when it
	// came back up.
	c.Run(ctx, workloadNode, "./cockroach workload run tpcc --warehouses=100 "+
		fmt.Sprintf("--tolerate-errors --wait=false --duration=%s", downDuration))

	// Bring it back up and make sure it can serve a query within a reasonable
	// time limit. For now, less time than it was down for.
	c.Start(ctx, t, c.Node(restartNode))
	start := timeutil.Now()
	restartNodeDB := c.Conn(ctx, restartNode)
	// Release the connection when the test finishes; it was previously leaked.
	defer restartNodeDB.Close()
	if _, err := restartNodeDB.Exec(`SELECT count(*) FROM tpcc.order_line`); err != nil {
		t.Fatal(err)
	}
	// The measured time covers both establishing the connection and running the
	// query, since start is taken before c.Conn.
	if took := timeutil.Since(start); took > downDuration {
		t.Fatalf(`expected to recover within %s took %s`, downDuration, took)
	} else {
		c.l.Printf(`connecting and query finished in %s`, took)
	}
}

// registerRestart adds the "restart/down-for-2m" test to the registry. The
// test runs runRestart on a 3-node cluster with a down duration of two
// minutes.
func registerRestart(r *registry) {
	r.Add(testSpec{
		// A plain literal: the name has no formatting verbs, so wrapping it in
		// fmt.Sprintf was unnecessary.
		Name:    "restart/down-for-2m",
		Cluster: makeClusterSpec(3),
		// The "cockroach workload" subcommand is only in 19.1+.
		MinVersion: "v19.1.0",
		Run: func(ctx context.Context, t *test, c *cluster) {
			runRestart(ctx, t, c, 2*time.Minute)
		},
	})
}