Skip to content

Commit

Permalink
vtctld/vtorc: improve reparenting stats (#13723)
Browse files Browse the repository at this point in the history
Signed-off-by: Tim Vaillancourt <tim@timvaillancourt.com>
Signed-off-by: Manan Gupta <manan@planetscale.com>
Co-authored-by: Manan Gupta <manan@planetscale.com>
  • Loading branch information
timvaillancourt and GuptaManan100 authored Sep 28, 2023
1 parent 94281fd commit 3038580
Showing 10 changed files with 261 additions and 20 deletions.
21 changes: 20 additions & 1 deletion changelog/18.0/18.0.0/summary.md
Original file line number Diff line number Diff line change
@@ -14,17 +14,19 @@
- [Updated to node v18.16.0](#update-node)
- **[Deprecations and Deletions](#deprecations-and-deletions)**
- [Deprecated Flags](#deprecated-flags)
- [Deprecated Stats](#deprecated-stats)
- [Deleted `V3` planner](#deleted-v3)
- [Deleted `k8stopo`](#deleted-k8stopo)
- [Deleted `vtgr`](#deleted-vtgr)
- [Deleted `query_analyzer`](#deleted-query_analyzer)
- [Deprecated VTBackup stat `DurationByPhase`](#deprecated-vtbackup-stat-duration-by-phase)
- [Deprecated VDiff v1](#deprecated-vdiff-v1)
- **[New stats](#new-stats)**
- **[New Stats](#new-stats)**
- [VTGate Vindex unknown parameters](#vtgate-vindex-unknown-parameters)
- [VTBackup stat `Phase`](#vtbackup-stat-phase)
- [VTBackup stat `PhaseStatus`](#vtbackup-stat-phase-status)
- [Backup and restore metrics for AWS S3](#backup-restore-metrics-aws-s3)
- [VTCtld and VTOrc reparenting stats](#vtctld-and-vtorc-reparenting-stats)
- **[VTTablet](#vttablet)**
- [VTTablet: New ResetSequences RPC](#vttablet-new-rpc-reset-sequences)
- **[Docker](#docker)**
@@ -116,6 +118,15 @@ VTGate flag:

- `--schema_change_signal_user` is deprecated and will be removed in `v19.0`

#### <a id="deprecated-stats"/>Deprecated Stats

The following Emergency Reparent Shard stats are deprecated in `v18.0` and will be removed in `v19.0`:
- `ers_counter`
- `ers_success_counter`
- `ers_failure_counter`

These metrics are replaced by [new reparenting stats introduced in `v18.0`](#vtctld-and-vtorc-reparenting-stats).

#### <a id="deleted-v3"/>Deleted `v3` planner

The `Gen4` planner has been the default planner since Vitess 14. The `v3` planner was deprecated in Vitess 15 and has now been removed in this release.
@@ -182,6 +193,14 @@ vtbackup_restore_count{component="BackupStorage",implementation="S3",operation="
vtbackup_restore_count{component="BackupStorage",implementation="S3",operation="AWS:Request:Send"} 165
```

#### <a id="vtctld-and-vtorc-reparenting-stats"/>VTCtld and VTOrc reparenting stats

New VTCtld and VTorc stats were added to measure frequency of reparents by keyspace/shard:
- `emergency_reparent_counts` - Number of times Emergency Reparent Shard has been run. It is further subdivided by the keyspace, shard and the result of the operation.
- `planned_reparent_counts` - Number of times Planned Reparent Shard has been run. It is further subdivided by the keyspace, shard and the result of the operation.

Also, the `reparent_shard_operation_timings` stat was added to provide per-operation timings of reparent operations.

### <a id="vttablet"/>VTTablet

#### <a id="vttablet-new-rpc-reset-sequences"/>New ResetSequences rpc
4 changes: 3 additions & 1 deletion go/stats/timings.go
Original file line number Diff line number Diff line change
@@ -61,10 +61,12 @@ func NewTimings(name, help, label string, categories ...string) *Timings {
return t
}

// Reset will clearStats histograms: used during testing
// Reset will clear histograms and counters: used during testing
func (t *Timings) Reset() {
t.mu.RLock()
t.histograms = make(map[string]*Histogram)
t.totalCount.Store(0)
t.totalTime.Store(0)
t.mu.RUnlock()
}

6 changes: 6 additions & 0 deletions go/test/endtoend/vtorc/general/vtorc_test.go
Original file line number Diff line number Diff line change
@@ -76,6 +76,7 @@ func TestSingleKeyspace(t *testing.T) {
utils.CheckPrimaryTablet(t, clusterInfo, shard0.Vttablets[0], true)
utils.CheckReplication(t, clusterInfo, shard0.Vttablets[0], shard0.Vttablets[1:], 10*time.Second)
utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], keyspace.Name, shard0.Name, 1)
}

// Cases to test:
@@ -94,6 +95,7 @@ func TestKeyspaceShard(t *testing.T) {
utils.CheckPrimaryTablet(t, clusterInfo, shard0.Vttablets[0], true)
utils.CheckReplication(t, clusterInfo, shard0.Vttablets[0], shard0.Vttablets[1:], 10*time.Second)
utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], keyspace.Name, shard0.Name, 1)
}

// Cases to test:
@@ -116,6 +118,7 @@ func TestVTOrcRepairs(t *testing.T) {
assert.NotNil(t, curPrimary, "should have elected a primary")
vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)

var replica, otherReplica *cluster.Vttablet
for _, tablet := range shard0.Vttablets {
@@ -344,6 +347,7 @@ func TestVTOrcWithPrs(t *testing.T) {
assert.NotNil(t, curPrimary, "should have elected a primary")
vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)

// find any replica tablet other than the current primary
var replica *cluster.Vttablet
@@ -371,7 +375,9 @@ func TestVTOrcWithPrs(t *testing.T) {
utils.CheckPrimaryTablet(t, clusterInfo, replica, true)
// Verify that VTOrc didn't run any other recovery
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 0)
utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 0)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixPrimaryRecoveryName, 0)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 0)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryHasPrimaryRecoveryName, 0)
6 changes: 6 additions & 0 deletions go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go
Original file line number Diff line number Diff line change
@@ -53,6 +53,7 @@ func TestDownPrimary(t *testing.T) {
assert.NotNil(t, curPrimary, "should have elected a primary")
vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)

// find the replica and rdonly tablets
var replica, rdonly *cluster.Vttablet
@@ -99,6 +100,7 @@ func TestDownPrimary(t *testing.T) {
// also check that the replication is working correctly after failover
utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica}, 10*time.Second)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1)
utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)
}

// bring down primary before VTOrc has started, let vtorc repair.
@@ -154,6 +156,7 @@ func TestDownPrimaryBeforeVTOrc(t *testing.T) {
// also check that the replication is working correctly after failover
utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{rdonly}, 10*time.Second)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1)
utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)
}

// delete the primary record and let vtorc repair.
@@ -168,6 +171,7 @@ func TestDeletedPrimaryTablet(t *testing.T) {
assert.NotNil(t, curPrimary, "should have elected a primary")
vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)

// find the replica and rdonly tablets
var replica, rdonly *cluster.Vttablet
@@ -239,6 +243,7 @@ func TestDeadPrimaryRecoversImmediately(t *testing.T) {
assert.NotNil(t, curPrimary, "should have elected a primary")
vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)

// find the replica and rdonly tablets
var replica, rdonly *cluster.Vttablet
@@ -275,6 +280,7 @@ func TestDeadPrimaryRecoversImmediately(t *testing.T) {
// also check that the replication is working correctly after failover
utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica}, 10*time.Second)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1)
utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)

// Parse log file and find out how much time it took for DeadPrimary to recover.
logFile := path.Join(vtOrcProcess.LogDir, vtOrcProcess.LogFileName)
62 changes: 60 additions & 2 deletions go/test/endtoend/vtorc/utils/utils.go
Original file line number Diff line number Diff line change
@@ -20,9 +20,11 @@ import (
"context"
"encoding/json"
"fmt"
"math"
"os"
"os/exec"
"path"
"reflect"
"strconv"
"strings"
"testing"
@@ -957,18 +959,74 @@ func WaitForSuccessfulRecoveryCount(t *testing.T, vtorcInstance *cluster.VTOrcPr
for time.Since(startTime) < timeout {
vars := vtorcInstance.GetVars()
successfulRecoveriesMap := vars["SuccessfulRecoveries"].(map[string]interface{})
successCount := successfulRecoveriesMap[recoveryName]
successCount := getIntFromValue(successfulRecoveriesMap[recoveryName])
if successCount == countExpected {
return
}
time.Sleep(time.Second)
}
vars := vtorcInstance.GetVars()
successfulRecoveriesMap := vars["SuccessfulRecoveries"].(map[string]interface{})
successCount := successfulRecoveriesMap[recoveryName]
successCount := getIntFromValue(successfulRecoveriesMap[recoveryName])
assert.EqualValues(t, countExpected, successCount)
}

// WaitForSuccessfulPRSCount waits until the given keyspace-shard's count of successful prs runs matches the count expected.
func WaitForSuccessfulPRSCount(t *testing.T, vtorcInstance *cluster.VTOrcProcess, keyspace, shard string, countExpected int) {
t.Helper()
timeout := 15 * time.Second
startTime := time.Now()
mapKey := fmt.Sprintf("%v.%v.success", keyspace, shard)
for time.Since(startTime) < timeout {
vars := vtorcInstance.GetVars()
prsCountsMap := vars["planned_reparent_counts"].(map[string]interface{})
successCount := getIntFromValue(prsCountsMap[mapKey])
if successCount == countExpected {
return
}
time.Sleep(time.Second)
}
vars := vtorcInstance.GetVars()
prsCountsMap := vars["planned_reparent_counts"].(map[string]interface{})
successCount := getIntFromValue(prsCountsMap[mapKey])
assert.EqualValues(t, countExpected, successCount)
}

// WaitForSuccessfulERSCount waits until the given keyspace-shard's count of successful ers runs matches the count expected.
func WaitForSuccessfulERSCount(t *testing.T, vtorcInstance *cluster.VTOrcProcess, keyspace, shard string, countExpected int) {
t.Helper()
timeout := 15 * time.Second
startTime := time.Now()
mapKey := fmt.Sprintf("%v.%v.success", keyspace, shard)
for time.Since(startTime) < timeout {
vars := vtorcInstance.GetVars()
ersCountsMap := vars["emergency_reparent_counts"].(map[string]interface{})
successCount := getIntFromValue(ersCountsMap[mapKey])
if successCount == countExpected {
return
}
time.Sleep(time.Second)
}
vars := vtorcInstance.GetVars()
ersCountsMap := vars["emergency_reparent_counts"].(map[string]interface{})
successCount := getIntFromValue(ersCountsMap[mapKey])
assert.EqualValues(t, countExpected, successCount)
}

// getIntFromValue is a helper function to get an integer from the given value.
// If it is convertible to a float, then we round the number to the nearest integer.
// If the value is not numeric at all, we return 0.
func getIntFromValue(val any) int {
value := reflect.ValueOf(val)
if value.CanFloat() {
return int(math.Round(value.Float()))
}
if value.CanInt() {
return int(value.Int())
}
return 0
}

// WaitForTabletType waits for the tablet to reach a certain type.
func WaitForTabletType(t *testing.T, tablet *cluster.Vttablet, expectedTabletType string) {
t.Helper()
24 changes: 18 additions & 6 deletions go/vt/vtctl/reparentutil/emergency_reparenter.go
Original file line number Diff line number Diff line change
@@ -69,9 +69,14 @@ type EmergencyReparentOptions struct {

// counters for Emergency Reparent Shard
var (
ersCounter = stats.NewGauge("ers_counter", "Number of times Emergency Reparent Shard has been run")
ersSuccessCounter = stats.NewGauge("ers_success_counter", "Number of times Emergency Reparent Shard has succeeded")
ersFailureCounter = stats.NewGauge("ers_failure_counter", "Number of times Emergency Reparent Shard has failed")
// TODO(timvaillancourt): remove legacyERS* gauges in v19+.
legacyERSCounter = stats.NewGauge("ers_counter", "Number of times Emergency Reparent Shard has been run")
legacyERSSuccessCounter = stats.NewGauge("ers_success_counter", "Number of times Emergency Reparent Shard has succeeded")
legacyERSFailureCounter = stats.NewGauge("ers_failure_counter", "Number of times Emergency Reparent Shard has failed")

ersCounter = stats.NewCountersWithMultiLabels("emergency_reparent_counts", "Number of times Emergency Reparent Shard has been run",
[]string{"Keyspace", "Shard", "Result"},
)
)

// NewEmergencyReparenter returns a new EmergencyReparenter object, ready to
@@ -99,26 +104,33 @@ func NewEmergencyReparenter(ts *topo.Server, tmc tmclient.TabletManagerClient, l
// keyspace and shard.
func (erp *EmergencyReparenter) ReparentShard(ctx context.Context, keyspace string, shard string, opts EmergencyReparentOptions) (*events.Reparent, error) {
var err error
statsLabels := []string{keyspace, shard}

opts.lockAction = erp.getLockAction(opts.NewPrimaryAlias)
// First step is to lock the shard for the given operation, if not already locked
if err = topo.CheckShardLocked(ctx, keyspace, shard); err != nil {
var unlock func(*error)
ctx, unlock, err = erp.ts.LockShard(ctx, keyspace, shard, opts.lockAction)
if err != nil {
ersCounter.Add(append(statsLabels, failureResult), 1)
return nil, err
}
defer unlock(&err)
}

// dispatch success or failure of ERS
startTime := time.Now()
ev := &events.Reparent{}
defer func() {
reparentShardOpTimings.Add("EmergencyReparentShard", time.Since(startTime))
switch err {
case nil:
ersSuccessCounter.Add(1)
legacyERSSuccessCounter.Add(1)
ersCounter.Add(append(statsLabels, successResult), 1)
event.DispatchUpdate(ev, "finished EmergencyReparentShard")
default:
ersFailureCounter.Add(1)
legacyERSFailureCounter.Add(1)
ersCounter.Add(append(statsLabels, failureResult), 1)
event.DispatchUpdate(ev, "failed EmergencyReparentShard: "+err.Error())
}
}()
@@ -142,7 +154,7 @@ func (erp *EmergencyReparenter) getLockAction(newPrimaryAlias *topodatapb.Tablet
func (erp *EmergencyReparenter) reparentShardLocked(ctx context.Context, ev *events.Reparent, keyspace, shard string, opts EmergencyReparentOptions) (err error) {
// log the starting of the operation and increment the counter
erp.logger.Infof("will initiate emergency reparent shard in keyspace - %s, shard - %s", keyspace, shard)
ersCounter.Add(1)
legacyERSCounter.Add(1)

var (
stoppedReplicationSnapshot *replicationSnapshot
30 changes: 20 additions & 10 deletions go/vt/vtctl/reparentutil/emergency_reparenter_test.go
Original file line number Diff line number Diff line change
@@ -2733,10 +2733,12 @@ func TestEmergencyReparenter_waitForAllRelayLogsToApply(t *testing.T) {
}
}

func TestEmergencyReparenterCounters(t *testing.T) {
ersCounter.Set(0)
ersSuccessCounter.Set(0)
ersFailureCounter.Set(0)
func TestEmergencyReparenterStats(t *testing.T) {
ersCounter.ResetAll()
legacyERSCounter.Reset()
legacyERSSuccessCounter.Reset()
legacyERSFailureCounter.Reset()
reparentShardOpTimings.Reset()

emergencyReparentOps := EmergencyReparentOptions{}
tmc := &testutil.TabletManagerClient{
@@ -2865,9 +2867,13 @@ func TestEmergencyReparenterCounters(t *testing.T) {
require.NoError(t, err)

// check the counter values
require.EqualValues(t, 1, ersCounter.Get())
require.EqualValues(t, 1, ersSuccessCounter.Get())
require.EqualValues(t, 0, ersFailureCounter.Get())
require.EqualValues(t, map[string]int64{"testkeyspace.-.success": 1}, ersCounter.Counts())
require.EqualValues(t, map[string]int64{"All": 1, "EmergencyReparentShard": 1}, reparentShardOpTimings.Counts())

// check the legacy counter values
require.EqualValues(t, 1, legacyERSCounter.Get())
require.EqualValues(t, 1, legacyERSSuccessCounter.Get())
require.EqualValues(t, 0, legacyERSFailureCounter.Get())

// set emergencyReparentOps to request a non existent tablet
emergencyReparentOps.NewPrimaryAlias = &topodatapb.TabletAlias{
@@ -2880,9 +2886,13 @@ func TestEmergencyReparenterCounters(t *testing.T) {
require.Error(t, err)

// check the counter values
require.EqualValues(t, 2, ersCounter.Get())
require.EqualValues(t, 1, ersSuccessCounter.Get())
require.EqualValues(t, 1, ersFailureCounter.Get())
require.EqualValues(t, map[string]int64{"testkeyspace.-.success": 1, "testkeyspace.-.failure": 1}, ersCounter.Counts())
require.EqualValues(t, map[string]int64{"All": 2, "EmergencyReparentShard": 2}, reparentShardOpTimings.Counts())

// check the legacy counter values
require.EqualValues(t, 2, legacyERSCounter.Get())
require.EqualValues(t, 1, legacyERSSuccessCounter.Get())
require.EqualValues(t, 1, legacyERSFailureCounter.Get())
}

func TestEmergencyReparenter_findMostAdvanced(t *testing.T) {
Loading

0 comments on commit 3038580

Please sign in to comment.