-
Notifications
You must be signed in to change notification settings - Fork 3.8k
/
rebalance_objective.go
280 lines (255 loc) · 12.3 KB
/
rebalance_objective.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
// Copyright 2023 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
package kvserver
import (
"context"
"github.com/cockroachdb/cockroach/pkg/clusterversion"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/allocator/load"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/allocator/storepool"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/settings"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/util/grunning"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
)
// LBRebalancingObjective controls the objective of load based rebalancing.
// This is used to both (1) define the types of load considered when
// determining how balanced the cluster is, and (2) select actions that improve
// balancing the given objective. Currently there are only two possible
// objectives:
//   - qps, which is the original default setting and looks at the number of
//     batch requests on a range and store.
//   - cpu, which is added in 23.1 and looks at the cpu usage of a range and
//     store.
//
// The concrete values are the LBRebalancingQueries and LBRebalancingCPU
// constants.
type LBRebalancingObjective int64
const (
// LBRebalancingQueries is a rebalancing objective that aims to balance
// queries (QPS) among stores in the cluster. The QPS per-store is
// calculated as the sum of every replica's QPS on the store. The QPS value
// per-replica is calculated as the average number of batch requests per
// second the replica received over the last 30 minutes, or replica
// lifetime, whichever is shorter. A special case for the QPS calculation
// of a batch request exists for requests that contain AddSST requests,
// which are weighted by the size of the SST to be added (see #76252). When
// there are multiple stores per-node, the behavior doesn't change in
// comparison to single store per-node.
//
// When searching for rebalance actions, this objective estimates the
// impact of an action by using the QPS of the leaseholder replica involved,
// e.g. the impact of lease transfers on the stores involved is
// +leaseholder replica QPS on the store that receives the lease and
// -leaseholder replica QPS on the store that removes the lease.
//
// This rebalancing objective tends to work well when the load of
// different batch requests in the cluster is uniform, e.g. there are only
// a few types of requests which all exert approximately the same load on
// the system. This rebalancing objective tends to perform poorly when the
// load of different batch requests in the cluster is non-uniform, as
// balancing QPS does not correlate well with balancing load.
//
// As the first constant in this iota block its value is 0.
LBRebalancingQueries LBRebalancingObjective = iota
// LBRebalancingCPU is a rebalance objective that aims to balance the store
// CPU usage. The store CPU usage is calculated as the sum of replicas' cpu
// usage on the store. The CPU value per-replica is calculated as the
// average cpu usage per second the replica used in processing over the
// last 30 minutes, or replica lifetime, whichever is shorter. When there
// are multiple stores per-node, the behavior doesn't change in comparison
// to single store per-node. That is, despite multiple stores sharing the
// same underlying CPU, the objective attempts to balance CPU usage of each
// store on a node, e.g. in a cluster where there is 1 node and 8 stores on
// the 1 node, the rebalance objective will rebalance leases and replicas
// so that the CPU usage is balanced between the 8 stores.
//
// When searching for rebalance actions, this objective estimates the
// impact of an action by either using all of the leaseholder replicas' CPU
// usage for transfer+rebalance and the foreground request cpu usage for
// just lease transfers. See allocator/range_usage_info.go.
//
// One alternative approach that was considered for the LBRebalancingCPU
// objective was to use the process CPU usage and balance each store's
// process usage. The measured replica cpu usage is used only to determine
// which replica to rebalance, but not when to rebalance or who to
// rebalance to. This approach benefits from observing the "true" cpu
// usage, rather than just the sum of replicas' usage. However, unlike the
// implemented approach, the estimated impact of actions was less reliable
// and had to be scaled to account for multi-store and missing cpu
// attribution. The implemented approach composes well in comparison to the
// process cpu approach. The sum of impact over available actions is equal
// to the store value being balanced, similar to LBRebalancingQueries.
//
// As the second constant in this iota block its value is 1.
LBRebalancingCPU
)
// LoadBasedRebalancingObjective is a cluster setting that defines the load
// balancing objective of the cluster. It is registered as a system-only,
// public enum setting that accepts `qps` (LBRebalancingQueries) or `cpu`
// (LBRebalancingCPU), with `qps` as the default.
var LoadBasedRebalancingObjective = settings.RegisterEnumSetting(
settings.SystemOnly,
"kv.allocator.load_based_rebalancing.objective",
"what objective does the cluster use to rebalance; if set to `qps` "+
"the cluster will attempt to balance qps among stores, if set to "+
"`cpu` the cluster will attempt to balance cpu usage among stores",
// Default value of the setting.
"qps",
// Mapping from the enum's integer value to its user-facing name.
map[int64]string{
int64(LBRebalancingQueries): "qps",
int64(LBRebalancingCPU): "cpu",
},
).WithPublic()
// ToDimension returns the equivalent allocator load dimension of a rebalancing
// objective: LBRebalancingQueries maps to load.Queries and LBRebalancingCPU
// maps to load.CPU. Any other value panics.
//
// TODO(kvoli): It is currently the case that every LBRebalancingObjective maps
// uniquely to a load.Dimension. However, in the future it is foreseeable that
// LBRebalancingObjective could be a value that encompasses many different
// dimensions within a single objective e.g. bytes written, cpu usage and
// storage availability. If this occurs, this ToDimension fn will no longer be
// appropriate for multi-dimension objectives.
func (d LBRebalancingObjective) ToDimension() load.Dimension {
	if d == LBRebalancingQueries {
		return load.Queries
	}
	if d == LBRebalancingCPU {
		return load.CPU
	}
	// Only the two objectives above are defined; anything else is a
	// programming error.
	panic("unknown dimension")
}
// RebalanceObjectiveProvider provides a method to get the rebalance objective
// of the cluster. It is possible that the cluster setting objective may not be
// the objective returned, when the cluster environment is unsupported or mixed
// versions exist.
type RebalanceObjectiveProvider interface {
// Objective returns the current rebalance objective.
Objective() LBRebalancingObjective
}
// gossipStoreDescriptorProvider provides a method to get the store descriptors
// from the storepool, received via gossip. It exposes a thin interface for the
// objective manager to use, for easier testing.
type gossipStoreDescriptorProvider interface {
// GetStores returns information on all the stores with descriptors that
// have been recently seen in gossip, keyed by store ID.
GetStores() map[roachpb.StoreID]roachpb.StoreDescriptor
}
// gossipStoreCapacityChangeNotifier provides a method to install a callback
// that will be called whenever the capacity of a store changes. It exposes a
// thin interface for the objective manager to use, for easier testing.
type gossipStoreCapacityChangeNotifier interface {
// SetOnCapacityChange installs a callback to be called when the store
// capacity changes.
SetOnCapacityChange(fn storepool.CapacityChangeFn)
}
// RebalanceObjectiveManager implements the RebalanceObjectiveProvider
// interface and registers a callback at creation time, that will be called on
// a rebalance objective change.
type RebalanceObjectiveManager struct {
// st is the cluster settings handle from which the configured objective
// and the active cluster version are read when resolving the objective.
st *cluster.Settings
// storeDescProvider supplies the gossiped store descriptors that are
// inspected when resolving the objective.
storeDescProvider gossipStoreDescriptorProvider
// mu guards the fields declared inside it.
mu struct {
syncutil.RWMutex
// obj is the most recently resolved objective; it is what Objective
// returns.
obj LBRebalancingObjective
// onChange callback registered will execute synchronously on the
// cluster settings thread that triggers an objective check. This is
// not good for large blocking operations. It is invoked while mu is
// held for writing (see maybeUpdateRebalanceObjective).
onChange func(ctx context.Context, obj LBRebalancingObjective)
}
}
// newRebalanceObjectiveManager returns a RebalanceObjectiveManager whose
// initial objective is resolved from st and the store descriptors currently
// returned by storeDescProvider. It installs three triggers that re-resolve
// the objective and, when the result differs from the current one, invoke
// onChange with the new objective:
//   - a change of the LoadBasedRebalancingObjective cluster setting,
//   - a change of the cluster version,
//   - a store capacity change reported by capacityChangeNotifier in which
//     CPUPerSecond crosses between negative and non-negative.
//
// NOTE(review): the ctx passed here is captured by the capacity change
// callback and reused on every invocation, so it needs to stay usable for as
// long as the manager is alive - confirm at call sites.
func newRebalanceObjectiveManager(
	ctx context.Context,
	st *cluster.Settings,
	onChange func(ctx context.Context, obj LBRebalancingObjective),
	storeDescProvider gossipStoreDescriptorProvider,
	capacityChangeNotifier gossipStoreCapacityChangeNotifier,
) *RebalanceObjectiveManager {
	rom := &RebalanceObjectiveManager{
		st:                st,
		storeDescProvider: storeDescProvider,
	}
	rom.mu.onChange = onChange
	rom.mu.obj = ResolveLBRebalancingObjective(ctx, st, storeDescProvider.GetStores())

	// Re-check the objective whenever the setting itself is changed.
	LoadBasedRebalancingObjective.SetOnChange(&st.SV, rom.maybeUpdateRebalanceObjective)

	// Re-check the objective whenever the cluster version changes; the new
	// version itself is not needed, it is read again during resolution.
	st.Version.SetOnChange(func(ctx context.Context, _ clusterversion.ClusterVersion) {
		rom.maybeUpdateRebalanceObjective(ctx)
	})

	// Rather than caching each capacity locally, use the callback as a trigger
	// to recalculate the objective. This is less expensive than recalculating
	// the objective on every call to Objective, which would need to be done
	// otherwise, just in case a new capacity has come in. This approach does
	// have the downside of using the gossip callback goroutine to trigger the
	// onChange callback, which iterates through every replica on the store. It
	// is unlikely though that the conditions are satisfied (some node begins
	// not supporting grunning or begins supporting grunning) to trigger the
	// onChange callback here.
	capacityChangeNotifier.SetOnCapacityChange(
		func(_ roachpb.StoreID, old, cur roachpb.StoreCapacity) {
			// A negative CPUPerSecond marks cpu accounting as unavailable on
			// the store; only a transition in either direction matters.
			becameSupported := old.CPUPerSecond < 0 && cur.CPUPerSecond >= 0
			becameUnsupported := cur.CPUPerSecond < 0 && old.CPUPerSecond >= 0
			if !becameSupported && !becameUnsupported {
				return
			}
			rom.maybeUpdateRebalanceObjective(ctx)
		})

	return rom
}
// Objective returns the current rebalance objective. The value is read under
// the manager's read lock.
func (rom *RebalanceObjectiveManager) Objective() LBRebalancingObjective {
	rom.mu.RLock()
	obj := rom.mu.obj
	rom.mu.RUnlock()
	return obj
}
// maybeUpdateRebalanceObjective re-resolves the rebalance objective from the
// cluster settings and the currently gossiped store descriptors. When the
// resolved objective differs from the stored one, it logs the transition,
// stores the new objective and invokes the registered onChange callback.
//
// The manager's write lock is held for the whole call, including while the
// onChange callback runs.
func (rom *RebalanceObjectiveManager) maybeUpdateRebalanceObjective(ctx context.Context) {
	rom.mu.Lock()
	defer rom.mu.Unlock()

	resolved := ResolveLBRebalancingObjective(ctx, rom.st, rom.storeDescProvider.GetStores())
	current := rom.mu.obj
	if resolved == current {
		// The objective is unchanged, there is nothing to do.
		return
	}

	log.Infof(ctx, "Updating the rebalance objective from %s to %s",
		current.ToDimension(), resolved.ToDimension())

	rom.mu.obj = resolved
	rom.mu.onChange(ctx, resolved)
}
// ResolveLBRebalancingObjective returns the load based rebalancing objective
// for the cluster. The objective configured through the
// LoadBasedRebalancingObjective cluster setting is returned unless it cannot
// be used, in which case LBRebalancingQueries is returned as a fallback.
//
// The fallback is taken when any of the following holds:
//   - the cluster version V23_1AllocatorCPUBalancing is not yet active,
//   - cpu timekeeping (grunning) is not supported on the local host,
//   - any store descriptor in descs reports a negative CPUPerSecond, which
//     is how a store signals that cpu timekeeping is unavailable to it.
//
// descs may be nil or empty, in which case no remote store can veto the
// configured objective.
func ResolveLBRebalancingObjective(
	ctx context.Context, st *cluster.Settings, descs map[roachpb.StoreID]roachpb.StoreDescriptor,
) LBRebalancingObjective {
	set := LoadBasedRebalancingObjective.Get(&st.SV)

	// Queries should always be supported, return early if set.
	if set == int64(LBRebalancingQueries) {
		return LBRebalancingQueries
	}

	// When the cluster version hasn't finalized to 23.1, some unupgraded
	// stores will not be populating additional fields in their StoreCapacity,
	// in such cases we cannot balance another objective since the data may not
	// exist. Fall back to QPS balancing.
	if !st.Version.IsActive(ctx, clusterversion.V23_1AllocatorCPUBalancing) {
		log.Infof(ctx, "version doesn't support cpu objective, reverting to qps balance objective")
		return LBRebalancingQueries
	}

	// When the cpu timekeeping utility is unsupported on this architecture,
	// the cpu usage cannot be gathered. Fall back to QPS balancing.
	if !grunning.Supported() {
		log.Infof(ctx, "cpu timekeeping unavailable on host, reverting to qps balance objective")
		return LBRebalancingQueries
	}

	// It is possible that the cputime utility isn't supported on a remote
	// node's architecture, yet is supported locally on this node. If that is
	// the case, the stores on the node will publish the cpu per second as -1
	// for their capacity to gossip. Any negative value is treated as that
	// sentinel rather than matching -1 exactly, so that this check agrees
	// with the capacity change callback installed by
	// newRebalanceObjectiveManager, which re-resolves the objective whenever
	// CPUPerSecond crosses zero. A single such store disallows every other
	// store from using the cpu balancing objective.
	for _, desc := range descs {
		if desc.Capacity.CPUPerSecond < 0 {
			log.Warningf(ctx,
				"cpu timekeeping unavailable on node %d but available locally, reverting to qps balance objective",
				desc.Node.NodeID)
			return LBRebalancingQueries
		}
	}

	// The cluster is on a supported version and every known store, including
	// the local one, is on an architecture which supports the cpu timekeeping
	// utility: return the cluster setting as is.
	return LBRebalancingObjective(set)
}