-
Notifications
You must be signed in to change notification settings - Fork 3.8k
/
restart.go
81 lines (70 loc) · 2.78 KB
/
restart.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
package main
import (
"context"
"fmt"
"time"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
)
// runRestart checks that a node which was stopped for downDuration, while
// load kept running against the cluster, can be restarted and answer a query
// in less time than it was down for.
//
// It starts cockroach on nodes 1 through c.nodes, imports a 100-warehouse tpcc
// fixture from node 1, sleeps for 11 minutes, stops node 3, runs the tpcc
// workload from node 1 for downDuration, restarts node 3 and then times a
// count(*) over tpcc.order_line issued through a connection to node 3. The
// test fails if that query returns an error or takes longer than downDuration.
func runRestart(ctx context.Context, t *test, c *cluster, downDuration time.Duration) {
	crdbNodes := c.Range(1, c.nodes)
	workloadNode := c.Node(1)
	const restartNode = 3

	t.Status("installing cockroach")
	c.Put(ctx, cockroach, "./cockroach", crdbNodes)
	c.Start(ctx, t, crdbNodes, startArgs(`--args=--vmodule=raft_log_queue=3`))

	// We don't really need tpcc, we just need a good amount of traffic and a good
	// amount of data.
	t.Status("importing tpcc fixture")
	c.Run(ctx, workloadNode,
		"./cockroach workload fixtures import tpcc --warehouses=100 --fks=false --checks=false")

	// Wait a full scanner cycle (10m) for the raft log queue to truncate the
	// sstable entries from the import. They're huge and are not representative of
	// normal traffic.
	//
	// TODO(dan): It seems like some part of `fixtures import` should do this for
	// you. It's odd that a cluster that's freshly initialized with fixtures
	// import behaves so differently than it does 10m after.
	t.Status("waiting for addsstable truncations")
	time.Sleep(11 * time.Minute)

	// Stop a node.
	c.Stop(ctx, c.Node(restartNode))

	// Wait for between 10s and `server.time_until_store_dead` while sending
	// traffic to one of the nodes that are not down. This used to cause lots of
	// raft log truncation, which caused node 3 to need lots of snapshots when it
	// came back up.
	c.Run(ctx, workloadNode, "./cockroach workload run tpcc --warehouses=100 "+
		fmt.Sprintf("--tolerate-errors --wait=false --duration=%s", downDuration))

	// Bring it back up and make sure it can serve a query within a reasonable
	// time limit. For now, less time than it was down for.
	c.Start(ctx, t, c.Node(restartNode))

	// The clock starts only after c.Start returns, so the measured time covers
	// connecting and running the query, not the node start itself.
	start := timeutil.Now()
	restartNodeDB := c.Conn(ctx, restartNode)
	// NOTE(review): assumes c.Conn returns a database/sql handle (*sql.DB) -
	// confirm. Close it so the connection is not leaked past this test, and run
	// the query with ctx so it is abandoned if the test context is cancelled.
	defer restartNodeDB.Close()
	if _, err := restartNodeDB.ExecContext(ctx, `SELECT count(*) FROM tpcc.order_line`); err != nil {
		t.Fatal(err)
	}
	if took := timeutil.Since(start); took > downDuration {
		t.Fatalf(`expected to recover within %s took %s`, downDuration, took)
	} else {
		c.l.Printf(`connecting and query finished in %s`, took)
	}
}
// registerRestart adds the "restart/down-for-2m" test to r. The test uses the
// cluster spec built by makeClusterSpec(3), requires at least v19.1.0 and
// calls runRestart with a down duration of two minutes.
func registerRestart(r *registry) {
	r.Add(testSpec{
		// A plain literal: the name has no format verbs, so fmt.Sprintf is not
		// needed to build it.
		Name:    "restart/down-for-2m",
		Cluster: makeClusterSpec(3),
		// `cockroach workload` is only in 19.1+.
		MinVersion: "v19.1.0",
		Run: func(ctx context.Context, t *test, c *cluster) {
			runRestart(ctx, t, c, 2*time.Minute)
		},
	})
}