From 499ff49009b885f96cfdaf580fcf388dcf0e37a2 Mon Sep 17 00:00:00 2001 From: Florent Poinsard Date: Fri, 17 Jun 2022 18:21:51 +0200 Subject: [PATCH 01/15] Changed the txthrottler to use the new healthcheck implementation Signed-off-by: Florent Poinsard --- .../txthrottler/mock_healthcheck_test.go | 141 ++++++++++++++---- .../txthrottler/mock_throttler_test.go | 12 +- .../txthrottler/mock_topology_watcher_test.go | 26 ++-- .../tabletserver/txthrottler/tx_throttler.go | 54 ++++--- .../txthrottler/tx_throttler_test.go | 24 ++- 5 files changed, 175 insertions(+), 82 deletions(-) diff --git a/go/vt/vttablet/tabletserver/txthrottler/mock_healthcheck_test.go b/go/vt/vttablet/tabletserver/txthrottler/mock_healthcheck_test.go index cf098a7b3e0..17f21f7690b 100644 --- a/go/vt/vttablet/tabletserver/txthrottler/mock_healthcheck_test.go +++ b/go/vt/vttablet/tabletserver/txthrottler/mock_healthcheck_test.go @@ -1,20 +1,22 @@ // Code generated by MockGen. DO NOT EDIT. -// Source: vitess.io/vitess/go/vt/discovery (interfaces: LegacyHealthCheck) +// Source: vitess.io/vitess/go/vt/discovery (interfaces: HealthCheck) // Package txthrottler is a generated GoMock package. package txthrottler import ( + context "context" reflect "reflect" gomock "github.com/golang/mock/gomock" discovery "vitess.io/vitess/go/vt/discovery" + query "vitess.io/vitess/go/vt/proto/query" topodata "vitess.io/vitess/go/vt/proto/topodata" queryservice "vitess.io/vitess/go/vt/vttablet/queryservice" ) -// MockHealthCheck is a mock of LegacyHealthCheck interface. +// MockHealthCheck is a mock of HealthCheck interface. type MockHealthCheck struct { ctrl *gomock.Controller recorder *MockHealthCheckMockRecorder @@ -38,22 +40,22 @@ func (m *MockHealthCheck) EXPECT() *MockHealthCheckMockRecorder { } // AddTablet mocks base method. -func (m *MockHealthCheck) AddTablet(arg0 *topodata.Tablet, arg1 string) { +func (m *MockHealthCheck) AddTablet(arg0 *topodata.Tablet) { m.ctrl.T.Helper() - m.ctrl.Call(m, "AddTablet", arg0, arg1) + m.ctrl.Call(m, "AddTablet", arg0) } // AddTablet indicates an expected call of AddTablet. -func (mr *MockHealthCheckMockRecorder) AddTablet(arg0, arg1 any) *gomock.Call { +func (mr *MockHealthCheckMockRecorder) AddTablet(arg0 interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "AddTablet", reflect.TypeOf((*MockHealthCheck)(nil).AddTablet), arg0, arg1) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "AddTablet", reflect.TypeOf((*MockHealthCheck)(nil).AddTablet), arg0) } // CacheStatus mocks base method. -func (m *MockHealthCheck) CacheStatus() discovery.LegacyTabletsCacheStatusList { +func (m *MockHealthCheck) CacheStatus() discovery.TabletsCacheStatusList { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "CacheStatus") - ret0, _ := ret[0].(discovery.LegacyTabletsCacheStatusList) + ret0, _ := ret[0].(discovery.TabletsCacheStatusList) return ret0 } @@ -63,6 +65,20 @@ func (mr *MockHealthCheckMockRecorder) CacheStatus() *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CacheStatus", reflect.TypeOf((*MockHealthCheck)(nil).CacheStatus)) } +// CacheStatusMap mocks base method. +func (m *MockHealthCheck) CacheStatusMap() map[string]*discovery.TabletsCacheStatus { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "CacheStatusMap") + ret0, _ := ret[0].(map[string]*discovery.TabletsCacheStatus) + return ret0 +} + +// CacheStatusMap indicates an expected call of CacheStatusMap. +func (mr *MockHealthCheckMockRecorder) CacheStatusMap() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CacheStatusMap", reflect.TypeOf((*MockHealthCheck)(nil).CacheStatusMap)) +} + // Close mocks base method. func (m *MockHealthCheck) Close() error { m.ctrl.T.Helper() @@ -77,18 +93,48 @@ func (mr *MockHealthCheckMockRecorder) Close() *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Close", reflect.TypeOf((*MockHealthCheck)(nil).Close)) } -// GetConnection mocks base method. -func (m *MockHealthCheck) GetConnection(arg0 string) queryservice.QueryService { +// GetHealthyTabletStats mocks base method. +func (m *MockHealthCheck) GetHealthyTabletStats(arg0 *query.Target) []*discovery.TabletHealth { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "GetConnection", arg0) - ret0, _ := ret[0].(queryservice.QueryService) + ret := m.ctrl.Call(m, "GetHealthyTabletStats", arg0) + ret0, _ := ret[0].([]*discovery.TabletHealth) return ret0 } -// GetConnection indicates an expected call of GetConnection. -func (mr *MockHealthCheckMockRecorder) GetConnection(arg0 any) *gomock.Call { +// GetHealthyTabletStats indicates an expected call of GetHealthyTabletStats. +func (mr *MockHealthCheckMockRecorder) GetHealthyTabletStats(arg0 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetHealthyTabletStats", reflect.TypeOf((*MockHealthCheck)(nil).GetHealthyTabletStats), arg0) +} + +// GetTabletHealth mocks base method. +func (m *MockHealthCheck) GetTabletHealth(arg0 discovery.KeyspaceShardTabletType, arg1 *topodata.TabletAlias) (*discovery.TabletHealth, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetTabletHealth", arg0, arg1) + ret0, _ := ret[0].(*discovery.TabletHealth) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetTabletHealth indicates an expected call of GetTabletHealth. +func (mr *MockHealthCheckMockRecorder) GetTabletHealth(arg0, arg1 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetTabletHealth", reflect.TypeOf((*MockHealthCheck)(nil).GetTabletHealth), arg0, arg1) +} + +// GetTabletHealthByAlias mocks base method. +func (m *MockHealthCheck) GetTabletHealthByAlias(arg0 *topodata.TabletAlias) (*discovery.TabletHealth, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetTabletHealthByAlias", arg0) + ret0, _ := ret[0].(*discovery.TabletHealth) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetTabletHealthByAlias indicates an expected call of GetTabletHealthByAlias. +func (mr *MockHealthCheckMockRecorder) GetTabletHealthByAlias(arg0 interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetConnection", reflect.TypeOf((*MockHealthCheck)(nil).GetConnection), arg0) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetTabletHealthByAlias", reflect.TypeOf((*MockHealthCheck)(nil).GetTabletHealthByAlias), arg0) } // RegisterStats mocks base method. @@ -110,43 +156,74 @@ func (m *MockHealthCheck) RemoveTablet(arg0 *topodata.Tablet) { } // RemoveTablet indicates an expected call of RemoveTablet. -func (mr *MockHealthCheckMockRecorder) RemoveTablet(arg0 any) *gomock.Call { +func (mr *MockHealthCheckMockRecorder) RemoveTablet(arg0 interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RemoveTablet", reflect.TypeOf((*MockHealthCheck)(nil).RemoveTablet), arg0) } // ReplaceTablet mocks base method. -func (m *MockHealthCheck) ReplaceTablet(arg0, arg1 *topodata.Tablet, arg2 string) { +func (m *MockHealthCheck) ReplaceTablet(arg0, arg1 *topodata.Tablet) { m.ctrl.T.Helper() - m.ctrl.Call(m, "ReplaceTablet", arg0, arg1, arg2) + m.ctrl.Call(m, "ReplaceTablet", arg0, arg1) } // ReplaceTablet indicates an expected call of ReplaceTablet. -func (mr *MockHealthCheckMockRecorder) ReplaceTablet(arg0, arg1, arg2 any) *gomock.Call { +func (mr *MockHealthCheckMockRecorder) ReplaceTablet(arg0, arg1 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ReplaceTablet", reflect.TypeOf((*MockHealthCheck)(nil).ReplaceTablet), arg0, arg1) +} + +// Subscribe mocks base method. +func (m *MockHealthCheck) Subscribe() chan *discovery.TabletHealth { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Subscribe") + ret0, _ := ret[0].(chan *discovery.TabletHealth) + return ret0 +} + +// Subscribe indicates an expected call of Subscribe. +func (mr *MockHealthCheckMockRecorder) Subscribe() *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ReplaceTablet", reflect.TypeOf((*MockHealthCheck)(nil).ReplaceTablet), arg0, arg1, arg2) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Subscribe", reflect.TypeOf((*MockHealthCheck)(nil).Subscribe)) } -// SetListener mocks base method. -func (m *MockHealthCheck) SetListener(arg0 discovery.LegacyHealthCheckStatsListener, arg1 bool) { +// TabletConnection mocks base method. +func (m *MockHealthCheck) TabletConnection(arg0 *topodata.TabletAlias, arg1 *query.Target) (queryservice.QueryService, error) { m.ctrl.T.Helper() - m.ctrl.Call(m, "SetListener", arg0, arg1) + ret := m.ctrl.Call(m, "TabletConnection", arg0, arg1) + ret0, _ := ret[0].(queryservice.QueryService) + ret1, _ := ret[1].(error) + return ret0, ret1 } -// SetListener indicates an expected call of SetListener. -func (mr *MockHealthCheckMockRecorder) SetListener(arg0, arg1 any) *gomock.Call { +// TabletConnection indicates an expected call of TabletConnection. +func (mr *MockHealthCheckMockRecorder) TabletConnection(arg0, arg1 interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SetListener", reflect.TypeOf((*MockHealthCheck)(nil).SetListener), arg0, arg1) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "TabletConnection", reflect.TypeOf((*MockHealthCheck)(nil).TabletConnection), arg0, arg1) } -// WaitForInitialStatsUpdates mocks base method. -func (m *MockHealthCheck) WaitForInitialStatsUpdates() { +// Unsubscribe mocks base method. +func (m *MockHealthCheck) Unsubscribe(arg0 chan *discovery.TabletHealth) { m.ctrl.T.Helper() - m.ctrl.Call(m, "WaitForInitialStatsUpdates") + m.ctrl.Call(m, "Unsubscribe", arg0) +} + +// Unsubscribe indicates an expected call of Unsubscribe. +func (mr *MockHealthCheckMockRecorder) Unsubscribe(arg0 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Unsubscribe", reflect.TypeOf((*MockHealthCheck)(nil).Unsubscribe), arg0) +} + +// WaitForAllServingTablets mocks base method. +func (m *MockHealthCheck) WaitForAllServingTablets(arg0 context.Context, arg1 []*query.Target) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "WaitForAllServingTablets", arg0, arg1) + ret0, _ := ret[0].(error) + return ret0 } -// WaitForInitialStatsUpdates indicates an expected call of WaitForInitialStatsUpdates. -func (mr *MockHealthCheckMockRecorder) WaitForInitialStatsUpdates() *gomock.Call { +// WaitForAllServingTablets indicates an expected call of WaitForAllServingTablets. +func (mr *MockHealthCheckMockRecorder) WaitForAllServingTablets(arg0, arg1 interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "WaitForInitialStatsUpdates", reflect.TypeOf((*MockHealthCheck)(nil).WaitForInitialStatsUpdates)) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "WaitForAllServingTablets", reflect.TypeOf((*MockHealthCheck)(nil).WaitForAllServingTablets), arg0, arg1) } diff --git a/go/vt/vttablet/tabletserver/txthrottler/mock_throttler_test.go b/go/vt/vttablet/tabletserver/txthrottler/mock_throttler_test.go index a3da535037e..53b827d591a 100644 --- a/go/vt/vttablet/tabletserver/txthrottler/mock_throttler_test.go +++ b/go/vt/vttablet/tabletserver/txthrottler/mock_throttler_test.go @@ -78,13 +78,13 @@ func (mr *MockThrottlerInterfaceMockRecorder) MaxRate() *gomock.Call { } // RecordReplicationLag mocks base method. -func (m *MockThrottlerInterface) RecordReplicationLag(arg0 time.Time, arg1 *discovery.LegacyTabletStats) { +func (m *MockThrottlerInterface) RecordReplicationLag(arg0 time.Time, arg1 *discovery.TabletHealth) { m.ctrl.T.Helper() m.ctrl.Call(m, "RecordReplicationLag", arg0, arg1) } // RecordReplicationLag indicates an expected call of RecordReplicationLag. -func (mr *MockThrottlerInterfaceMockRecorder) RecordReplicationLag(arg0, arg1 any) *gomock.Call { +func (mr *MockThrottlerInterfaceMockRecorder) RecordReplicationLag(arg0, arg1 interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RecordReplicationLag", reflect.TypeOf((*MockThrottlerInterface)(nil).RecordReplicationLag), arg0, arg1) } @@ -108,7 +108,7 @@ func (m *MockThrottlerInterface) SetMaxRate(arg0 int64) { } // SetMaxRate indicates an expected call of SetMaxRate. -func (mr *MockThrottlerInterfaceMockRecorder) SetMaxRate(arg0 any) *gomock.Call { +func (mr *MockThrottlerInterfaceMockRecorder) SetMaxRate(arg0 interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SetMaxRate", reflect.TypeOf((*MockThrottlerInterface)(nil).SetMaxRate), arg0) } @@ -120,7 +120,7 @@ func (m *MockThrottlerInterface) ThreadFinished(arg0 int) { } // ThreadFinished indicates an expected call of ThreadFinished. -func (mr *MockThrottlerInterfaceMockRecorder) ThreadFinished(arg0 any) *gomock.Call { +func (mr *MockThrottlerInterfaceMockRecorder) ThreadFinished(arg0 interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ThreadFinished", reflect.TypeOf((*MockThrottlerInterface)(nil).ThreadFinished), arg0) } @@ -134,7 +134,7 @@ func (m *MockThrottlerInterface) Throttle(arg0 int) time.Duration { } // Throttle indicates an expected call of Throttle. -func (mr *MockThrottlerInterfaceMockRecorder) Throttle(arg0 any) *gomock.Call { +func (mr *MockThrottlerInterfaceMockRecorder) Throttle(arg0 interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Throttle", reflect.TypeOf((*MockThrottlerInterface)(nil).Throttle), arg0) } @@ -148,7 +148,7 @@ func (m *MockThrottlerInterface) UpdateConfiguration(arg0 *throttlerdata.Configu } // UpdateConfiguration indicates an expected call of UpdateConfiguration. -func (mr *MockThrottlerInterfaceMockRecorder) UpdateConfiguration(arg0, arg1 any) *gomock.Call { +func (mr *MockThrottlerInterfaceMockRecorder) UpdateConfiguration(arg0, arg1 interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateConfiguration", reflect.TypeOf((*MockThrottlerInterface)(nil).UpdateConfiguration), arg0, arg1) } diff --git a/go/vt/vttablet/tabletserver/txthrottler/mock_topology_watcher_test.go b/go/vt/vttablet/tabletserver/txthrottler/mock_topology_watcher_test.go index 3e2e6f803c2..5afb16d3473 100644 --- a/go/vt/vttablet/tabletserver/txthrottler/mock_topology_watcher_test.go +++ b/go/vt/vttablet/tabletserver/txthrottler/mock_topology_watcher_test.go @@ -33,28 +33,26 @@ func (m *MockTopologyWatcherInterface) EXPECT() *MockTopologyWatcherInterfaceMoc return m.recorder } -// Stop mocks base method. -func (m *MockTopologyWatcherInterface) Stop() { +// Start mocks base method. +func (m *MockTopologyWatcherInterface) Start() { m.ctrl.T.Helper() - m.ctrl.Call(m, "Stop") + m.ctrl.Call(m, "Start") } -// Stop indicates an expected call of Stop. -func (mr *MockTopologyWatcherInterfaceMockRecorder) Stop() *gomock.Call { +// Start indicates an expected call of Start. +func (mr *MockTopologyWatcherInterfaceMockRecorder) Start() *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Stop", reflect.TypeOf((*MockTopologyWatcherInterface)(nil).Stop)) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Start", reflect.TypeOf((*MockTopologyWatcherInterface)(nil).Start)) } -// WaitForInitialTopology mocks base method. -func (m *MockTopologyWatcherInterface) WaitForInitialTopology() error { +// Stop mocks base method. +func (m *MockTopologyWatcherInterface) Stop() { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "WaitForInitialTopology") - ret0, _ := ret[0].(error) - return ret0 + m.ctrl.Call(m, "Stop") } -// WaitForInitialTopology indicates an expected call of WaitForInitialTopology. -func (mr *MockTopologyWatcherInterfaceMockRecorder) WaitForInitialTopology() *gomock.Call { +// Stop indicates an expected call of Stop. +func (mr *MockTopologyWatcherInterfaceMockRecorder) Stop() *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "WaitForInitialTopology", reflect.TypeOf((*MockTopologyWatcherInterface)(nil).WaitForInitialTopology)) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Stop", reflect.TypeOf((*MockTopologyWatcherInterface)(nil).Stop)) } diff --git a/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go b/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go index 217c3ac1f1e..11f04a26811 100644 --- a/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go +++ b/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go @@ -18,6 +18,7 @@ package txthrottler import ( "fmt" + "strings" "sync" "time" @@ -148,7 +149,7 @@ type ThrottlerInterface interface { Close() MaxRate() int64 SetMaxRate(rate int64) - RecordReplicationLag(time time.Time, ts *discovery.LegacyTabletStats) + RecordReplicationLag(time time.Time, th *discovery.TabletHealth) GetConfiguration() *throttlerdatapb.Configuration UpdateConfiguration(configuration *throttlerdatapb.Configuration, copyZeroValues bool) error ResetConfiguration() @@ -158,7 +159,7 @@ type ThrottlerInterface interface { // discovery.LegacyTopologyWatcher. It is only used here to allow mocking out // go/vt/discovery.LegacyTopologyWatcher. type TopologyWatcherInterface interface { - WaitForInitialTopology() error + Start() Stop() } @@ -166,18 +167,19 @@ type TopologyWatcherInterface interface { type txThrottlerState struct { // throttleMu serializes calls to throttler.Throttler.Throttle(threadId). // That method is required to be called in serial for each threadId. - throttleMu sync.Mutex - throttler ThrottlerInterface + throttleMu sync.Mutex + throttler ThrottlerInterface + stopHealthCheck context.CancelFunc - healthCheck discovery.LegacyHealthCheck + healthCheck discovery.HealthCheck topologyWatchers []TopologyWatcherInterface } // These vars store the functions used to create the topo server, healthcheck, // topology watchers and go/vt/throttler. These are provided here so that they can be overridden // in tests to generate mocks. -type healthCheckFactoryFunc func() discovery.LegacyHealthCheck -type topologyWatcherFactoryFunc func(topoServer *topo.Server, tr discovery.LegacyTabletRecorder, cell, keyspace, shard string, refreshInterval time.Duration, topoReadConcurrency int) TopologyWatcherInterface +type healthCheckFactoryFunc func(topoServer *topo.Server, cell string, cellsToWatch []string) discovery.HealthCheck +type topologyWatcherFactoryFunc func(topoServer *topo.Server, tr discovery.TabletRecorder, cell, keyspace, shard string, refreshInterval time.Duration, topoReadConcurrency int) TopologyWatcherInterface type throttlerFactoryFunc func(name, unit string, threadCount int, maxRate, maxReplicationLag int64) (ThrottlerInterface, error) var ( @@ -191,9 +193,11 @@ func init() { } func resetTxThrottlerFactories() { - healthCheckFactory = discovery.NewLegacyDefaultHealthCheck - topologyWatcherFactory = func(topoServer *topo.Server, tr discovery.LegacyTabletRecorder, cell, keyspace, shard string, refreshInterval time.Duration, topoReadConcurrency int) TopologyWatcherInterface { - return discovery.NewLegacyShardReplicationWatcher(context.Background(), topoServer, tr, cell, keyspace, shard, refreshInterval, topoReadConcurrency) + healthCheckFactory = func(topoServer *topo.Server, cell string, cellsToWatch []string) discovery.HealthCheck { + return discovery.NewHealthCheck(context.Background(), discovery.DefaultHealthCheckRetryDelay, discovery.DefaultHealthCheckTimeout, topoServer, cell, strings.Join(cellsToWatch, ",")) + } + topologyWatcherFactory = func(topoServer *topo.Server, tr discovery.TabletRecorder, cell, keyspace, shard string, refreshInterval time.Duration, topoReadConcurrency int) TopologyWatcherInterface { + return discovery.NewCellTabletsWatcher(context.Background(), topoServer, tr, discovery.NewFilterByKeyspace([]string{keyspace}), cell, refreshInterval, true, topoReadConcurrency) } throttlerFactory = func(name, unit string, threadCount int, maxRate, maxReplicationLag int64) (ThrottlerInterface, error) { return throttler.NewThrottler(name, unit, threadCount, maxRate, maxReplicationLag) @@ -230,7 +234,7 @@ func (t *TxThrottler) Open() error { } log.Info("TxThrottler: opening") var err error - t.state, err = newTxThrottlerState(t.config, t.target.Keyspace, t.target.Shard) + t.state, err = newTxThrottlerState(t.config, t.target.Keyspace, t.target.Shard, t.target.Cell) return err } @@ -263,8 +267,7 @@ func (t *TxThrottler) Throttle() (result bool) { return t.state.throttle() } -func newTxThrottlerState(config *txThrottlerConfig, keyspace, shard string, -) (*txThrottlerState, error) { +func newTxThrottlerState(config *txThrottlerConfig, keyspace, shard, cell string) (*txThrottlerState, error) { t, err := throttlerFactory( TxThrottlerName, "TPS", /* unit */ @@ -281,8 +284,8 @@ func newTxThrottlerState(config *txThrottlerConfig, keyspace, shard string, result := &txThrottlerState{ throttler: t, } - result.healthCheck = healthCheckFactory() - result.healthCheck.SetListener(result, false /* sendDownEvents */) + createTxThrottlerHealthCheck(config, result, cell) + result.topologyWatchers = make( []TopologyWatcherInterface, 0, len(config.healthCheckCells)) for _, cell := range config.healthCheckCells { @@ -290,7 +293,7 @@ func newTxThrottlerState(config *txThrottlerConfig, keyspace, shard string, result.topologyWatchers, topologyWatcherFactory( config.topoServer, - result.healthCheck, /* LegacyTabletRecorder */ + result.healthCheck, cell, keyspace, shard, @@ -300,6 +303,23 @@ func newTxThrottlerState(config *txThrottlerConfig, keyspace, shard string, return result, nil } +func createTxThrottlerHealthCheck(config *txThrottlerConfig, result *txThrottlerState, cell string) { + ctx, cancel := context.WithCancel(context.Background()) + result.stopHealthCheck = cancel + result.healthCheck = healthCheckFactory(config.topoServer, cell, config.healthCheckCells) + ch := result.healthCheck.Subscribe() + go func(ctx context.Context) { + for { + select { + case <-ctx.Done(): + return + case th := <-ch: + result.StatsUpdate(th) + } + } + }(ctx) +} + func (ts *txThrottlerState) throttle() bool { if ts.throttler == nil { panic("BUG: throttle called after deallocateResources was called.") @@ -329,7 +349,7 @@ func (ts *txThrottlerState) deallocateResources() { } // StatsUpdate is part of the LegacyHealthCheckStatsListener interface. -func (ts *txThrottlerState) StatsUpdate(tabletStats *discovery.LegacyTabletStats) { +func (ts *txThrottlerState) StatsUpdate(tabletStats *discovery.TabletHealth) { // Ignore PRIMARY and RDONLY stats. // We currently do not monitor RDONLY tablets for replication lag. RDONLY tablets are not // candidates for becoming primary during failover, and it's acceptable to serve somewhat diff --git a/go/vt/vttablet/tabletserver/txthrottler/tx_throttler_test.go b/go/vt/vttablet/tabletserver/txthrottler/tx_throttler_test.go index cefba6746d7..bdce7899370 100644 --- a/go/vt/vttablet/tabletserver/txthrottler/tx_throttler_test.go +++ b/go/vt/vttablet/tabletserver/txthrottler/tx_throttler_test.go @@ -17,7 +17,7 @@ limitations under the License. package txthrottler // Commands to generate the mocks for this test. -//go:generate mockgen -destination mock_healthcheck_test.go -package txthrottler -mock_names "LegacyHealthCheck=MockHealthCheck" vitess.io/vitess/go/vt/discovery LegacyHealthCheck +//go:generate mockgen -destination mock_healthcheck_test.go -package txthrottler -mock_names "HealthCheck=MockHealthCheck" vitess.io/vitess/go/vt/discovery HealthCheck //go:generate mockgen -destination mock_throttler_test.go -package txthrottler vitess.io/vitess/go/vt/vttablet/tabletserver/txthrottler ThrottlerInterface //go:generate mockgen -destination mock_topology_watcher_test.go -package txthrottler vitess.io/vitess/go/vt/vttablet/tabletserver/txthrottler TopologyWatcherInterface @@ -61,17 +61,15 @@ func TestEnabledThrottler(t *testing.T) { ts := memorytopo.NewServer("cell1", "cell2") mockHealthCheck := NewMockHealthCheck(mockCtrl) - var hcListener discovery.LegacyHealthCheckStatsListener - hcCall1 := mockHealthCheck.EXPECT().SetListener(gomock.Any(), false /* sendDownEvents */) - hcCall1.Do(func(listener discovery.LegacyHealthCheckStatsListener, sendDownEvents bool) { - // Record the listener we're given. - hcListener = listener - }) + hcCall1 := mockHealthCheck.EXPECT().Subscribe() + hcCall1.Do(func() {}) hcCall2 := mockHealthCheck.EXPECT().Close() hcCall2.After(hcCall1) - healthCheckFactory = func() discovery.LegacyHealthCheck { return mockHealthCheck } + healthCheckFactory = func(topoServer *topo.Server, cell string, cellsToWatch []string) discovery.HealthCheck { + return mockHealthCheck + } - topologyWatcherFactory = func(topoServer *topo.Server, tr discovery.LegacyTabletRecorder, cell, keyspace, shard string, refreshInterval time.Duration, topoReadConcurrency int) TopologyWatcherInterface { + topologyWatcherFactory = func(topoServer *topo.Server, tr discovery.TabletRecorder, cell, keyspace, shard string, refreshInterval time.Duration, topoReadConcurrency int) TopologyWatcherInterface { if ts != topoServer { t.Errorf("want: %v, got: %v", ts, topoServer) } @@ -100,7 +98,7 @@ func TestEnabledThrottler(t *testing.T) { call0 := mockThrottler.EXPECT().UpdateConfiguration(gomock.Any(), true /* copyZeroValues */) call1 := mockThrottler.EXPECT().Throttle(0) call1.Return(0 * time.Second) - tabletStats := &discovery.LegacyTabletStats{ + tabletStats := &discovery.TabletHealth{ Target: &querypb.Target{ TabletType: topodatapb.TabletType_REPLICA, }, @@ -132,14 +130,14 @@ func TestEnabledThrottler(t *testing.T) { if result := throttler.Throttle(); result != false { t.Errorf("want: false, got: %v", result) } - hcListener.StatsUpdate(tabletStats) - rdonlyTabletStats := &discovery.LegacyTabletStats{ + throttler.state.StatsUpdate(tabletStats) + rdonlyTabletStats := &discovery.TabletHealth{ Target: &querypb.Target{ TabletType: topodatapb.TabletType_RDONLY, }, } // This call should not be forwarded to the go/vt/throttler.Throttler object. - hcListener.StatsUpdate(rdonlyTabletStats) + throttler.state.StatsUpdate(rdonlyTabletStats) // The second throttle call should reject. if result := throttler.Throttle(); result != true { t.Errorf("want: true, got: %v", result) From a809a13dbb5d94861b04ac5c1e866559b5b67e33 Mon Sep 17 00:00:00 2001 From: Florent Poinsard Date: Mon, 20 Jun 2022 11:02:59 +0200 Subject: [PATCH 02/15] Removed the use of the legacy healthcheck in the tx_throttler Signed-off-by: Florent Poinsard --- go/vt/discovery/healthcheck.go | 19 ++++++++++-- go/vt/discovery/legacy_healthcheck.go | 14 +-------- go/vt/throttler/demo/throttler_demo.go | 30 +++++++++++-------- go/vt/throttler/max_replication_lag_module.go | 16 +++++----- .../max_replication_lag_module_test.go | 14 ++++----- go/vt/throttler/replication_lag_cache.go | 16 +++++----- go/vt/throttler/replication_lag_cache_test.go | 5 ++-- go/vt/throttler/replication_lag_record.go | 2 +- go/vt/throttler/throttler.go | 4 +-- 9 files changed, 64 insertions(+), 56 deletions(-) diff --git a/go/vt/discovery/healthcheck.go b/go/vt/discovery/healthcheck.go index c724ddb2870..fc68ff17d9c 100644 --- a/go/vt/discovery/healthcheck.go +++ b/go/vt/discovery/healthcheck.go @@ -44,6 +44,7 @@ import ( "strings" "sync" "time" + "vitess.io/vitess/go/netutil" "vitess.io/vitess/go/flagutil" "vitess.io/vitess/go/stats" @@ -85,8 +86,8 @@ var ( // See the documentation for NewHealthCheck below for an explanation of these parameters. const ( - defaultHealthCheckRetryDelay = 5 * time.Second - defaultHealthCheckTimeout = 1 * time.Minute + DefaultHealthCheckRetryDelay = 5 * time.Second + DefaultHealthCheckTimeout = 1 * time.Minute // DefaultTopoReadConcurrency is used as the default value for the topoReadConcurrency parameter of a TopologyWatcher. DefaultTopoReadConcurrency int = 5 @@ -166,6 +167,8 @@ type tabletAliasString string // HealthCheck declares what the TabletGateway needs from the HealthCheck type HealthCheck interface { + TabletRecorder + // CacheStatus returns a displayable version of the health check cache. CacheStatus() TabletsCacheStatusList @@ -899,3 +902,15 @@ func (hc *HealthCheckImpl) stateChecksum() int64 { return int64(crc32.ChecksumIEEE(buf.Bytes())) } + +// TabletToMapKey creates a key to the map from tablet's host and ports. +// It should only be used in discovery and related module. +func TabletToMapKey(tablet *topodata.Tablet) string { + parts := make([]string, 0, 1) + for name, port := range tablet.PortMap { + parts = append(parts, netutil.JoinHostPort(name, port)) + } + sort.Strings(parts) + parts = append([]string{tablet.Hostname}, parts...) + return strings.Join(parts, ",") +} diff --git a/go/vt/discovery/legacy_healthcheck.go b/go/vt/discovery/legacy_healthcheck.go index 3154736f60d..a1025ace145 100644 --- a/go/vt/discovery/legacy_healthcheck.go +++ b/go/vt/discovery/legacy_healthcheck.go @@ -372,7 +372,7 @@ type legacyTabletHealth struct { // NewLegacyDefaultHealthCheck creates a new LegacyHealthCheck object with a default configuration. func NewLegacyDefaultHealthCheck() LegacyHealthCheck { - return NewLegacyHealthCheck(defaultHealthCheckRetryDelay, defaultHealthCheckTimeout) + return NewLegacyHealthCheck(DefaultHealthCheckRetryDelay, DefaultHealthCheckTimeout) } // NewLegacyHealthCheck creates a new LegacyHealthCheck object. @@ -960,15 +960,3 @@ func (hc *LegacyHealthCheckImpl) Close() error { return nil } - -// TabletToMapKey creates a key to the map from tablet's host and ports. -// It should only be used in discovery and related module. -func TabletToMapKey(tablet *topodatapb.Tablet) string { - parts := make([]string, 0, 1) - for name, port := range tablet.PortMap { - parts = append(parts, netutil.JoinHostPort(name, port)) - } - sort.Strings(parts) - parts = append([]string{tablet.Hostname}, parts...) - return strings.Join(parts, ",") -} diff --git a/go/vt/throttler/demo/throttler_demo.go b/go/vt/throttler/demo/throttler_demo.go index 9d6cc7b93ba..7ccad70506a 100644 --- a/go/vt/throttler/demo/throttler_demo.go +++ b/go/vt/throttler/demo/throttler_demo.go @@ -17,12 +17,14 @@ limitations under the License. package main import ( + "context" "flag" "math/rand" "net/http" "sync" "testing" "time" + "vitess.io/vitess/go/vt/topo" "vitess.io/vitess/go/vt/discovery" "vitess.io/vitess/go/vt/logutil" @@ -103,9 +105,8 @@ type replica struct { wg sync.WaitGroup } -func newReplica(lagUpdateInterval, degrationInterval, degrationDuration time.Duration) *replica { +func newReplica(lagUpdateInterval, degrationInterval, degrationDuration time.Duration, ts *topo.Server) *replica { t := &testing.T{} - ts := memorytopo.NewServer("cell1") wr := wrangler.New(logutil.NewConsoleLogger(), ts, tmclient.NewTabletManagerClient()) fakeTablet := testlib.NewFakeTablet(t, wr, "cell1", 0, topodatapb.TabletType_REPLICA, nil, testlib.TabletKeyspaceShard(t, "ks", "-80")) @@ -213,28 +214,30 @@ func (r *replica) stop() { type client struct { primary *primary - healthCheck discovery.LegacyHealthCheck + healthCheck discovery.HealthCheck throttler *throttler.Throttler - stopChan chan struct{} - wg sync.WaitGroup + stopChan chan struct{} + wg sync.WaitGroup + healthcheckCh chan *discovery.TabletHealth } -func newClient(primary *primary, replica *replica) *client { +func newClient(primary *primary, replica *replica, ts *topo.Server) *client { t, err := throttler.NewThrottler("client", "TPS", 1, throttler.MaxRateModuleDisabled, 5 /* seconds */) if err != nil { log.Fatal(err) } - healthCheck := discovery.NewLegacyHealthCheck(5*time.Second, 1*time.Minute) + healthCheck := discovery.NewHealthCheck(context.Background(), 5*time.Second, 1*time.Minute, ts, "cell1", "") c := &client{ primary: primary, healthCheck: healthCheck, throttler: t, stopChan: make(chan struct{}), } - c.healthCheck.SetListener(c, false /* sendDownEvents */) - c.healthCheck.AddTablet(replica.fakeTablet.Tablet, "name") + healthcheckCh := c.healthCheck.Subscribe() + c.healthcheckCh = healthcheckCh + c.healthCheck.AddTablet(replica.fakeTablet.Tablet) return c } @@ -250,6 +253,8 @@ func (c *client) loop() { select { case <-c.stopChan: return + case th := <-c.healthcheckCh: + c.StatsUpdate(th) default: } @@ -276,7 +281,7 @@ func (c *client) stop() { // StatsUpdate implements discovery.LegacyHealthCheckStatsListener. // It gets called by the healthCheck instance every time a tablet broadcasts // a health update. -func (c *client) StatsUpdate(ts *discovery.LegacyTabletStats) { +func (c *client) StatsUpdate(ts *discovery.TabletHealth) { // Ignore unless REPLICA or RDONLY. if ts.Target.TabletType != topodatapb.TabletType_REPLICA && ts.Target.TabletType != topodatapb.TabletType_RDONLY { return @@ -294,9 +299,10 @@ func main() { }) log.Infof("start rate set to: %v", *rate) - replica := newReplica(*lagUpdateInterval, *replicaDegrationInterval, *replicaDegrationDuration) + ts := memorytopo.NewServer("cell1") + replica := newReplica(*lagUpdateInterval, *replicaDegrationInterval, *replicaDegrationDuration, ts) primary := &primary{replica: replica} - client := newClient(primary, replica) + client := newClient(primary, replica, ts) client.run() time.Sleep(*duration) diff --git a/go/vt/throttler/max_replication_lag_module.go b/go/vt/throttler/max_replication_lag_module.go index 68b31147965..7aa2b868ab0 100644 --- a/go/vt/throttler/max_replication_lag_module.go +++ b/go/vt/throttler/max_replication_lag_module.go @@ -238,7 +238,7 @@ func (m *MaxReplicationLagModule) resetConfiguration() { } // RecordReplicationLag records the current replication lag for processing. -func (m *MaxReplicationLagModule) RecordReplicationLag(t time.Time, ts *discovery.LegacyTabletStats) { +func (m *MaxReplicationLagModule) RecordReplicationLag(t time.Time, th *discovery.TabletHealth) { m.mutableConfigMu.Lock() if m.mutableConfig.MaxReplicationLagSec == ReplicationLagModuleDisabled { m.mutableConfigMu.Unlock() @@ -248,7 +248,7 @@ func (m *MaxReplicationLagModule) RecordReplicationLag(t time.Time, ts *discover // Buffer data point for now to unblock the LegacyHealthCheck listener and process // it asynchronously in ProcessRecords(). - m.lagRecords <- replicationLagRecord{t, *ts} + m.lagRecords <- replicationLagRecord{t, *th} } // ProcessRecords is the main loop, run in a separate Go routine, which @@ -331,7 +331,7 @@ func (m *MaxReplicationLagModule) recalculateRate(lagRecordNow replicationLagRec var clear bool var clearReason string - if m.lagCache(lagRecordNow).ignoreSlowReplica(lagRecordNow.Key) { + if m.lagCache(lagRecordNow).ignoreSlowReplica(discovery.TabletToMapKey(lagRecordNow.Tablet)) { r.Reason = fmt.Sprintf("skipping this replica because it's among the %d slowest %v tablets", m.getNSlowestReplicasConfig(lagRecordNow), lagRecordNow.Target.TabletType.String()) goto logResult } @@ -394,7 +394,7 @@ func (m *MaxReplicationLagModule) clearReplicaUnderTest(now time.Time, testedSta // Verify that the current replica under test is not in an error state. lr := lagRecordNow - if m.replicaUnderTest.key != lr.Key { + if m.replicaUnderTest.key != discovery.TabletToMapKey(lr.Tablet) { lr = m.lagCacheByType(m.replicaUnderTest.tabletType).latest(m.replicaUnderTest.key) } if lr.isZero() { @@ -445,7 +445,7 @@ func (m *MaxReplicationLagModule) isReplicaUnderTest(r *result, now time.Time, t return true } - if m.replicaUnderTest.key != lagRecordNow.Key { + if m.replicaUnderTest.key != discovery.TabletToMapKey(lagRecordNow.Tablet) { r.Reason = fmt.Sprintf("skipping this replica because we're waiting for the next lag record from the 'replica under test': %v", m.replicaUnderTest.alias) return false } @@ -557,7 +557,7 @@ func (m *MaxReplicationLagModule) minTestDurationUntilNextIncrease(increase floa func (m *MaxReplicationLagModule) decreaseAndGuessRate(r *result, now time.Time, lagRecordNow replicationLagRecord) { // Guess replication rate based on the difference in the replication lag of this // particular replica. - lagRecordBefore := m.lagCache(lagRecordNow).atOrAfter(lagRecordNow.Key, m.lastRateChange) + lagRecordBefore := m.lagCache(lagRecordNow).atOrAfter(discovery.TabletToMapKey(lagRecordNow.Tablet), m.lastRateChange) if lagRecordBefore.isZero() { // We should see at least "lagRecordNow" here because we did just insert it // in processRecord(). @@ -592,7 +592,7 @@ func (m *MaxReplicationLagModule) decreaseAndGuessRate(r *result, now time.Time, if replicationLagChange == equal { // The replication lag did not change. Keep going at the current rate. - r.Reason = fmt.Sprintf("did not decrease the rate because the lag did not change (assuming a 1s error margin)") //nolint + r.Reason = fmt.Sprintf("did not decrease the rate because the lag did not change (assuming a 1s error margin)") // nolint return } @@ -705,7 +705,7 @@ func (m *MaxReplicationLagModule) updateRate(r *result, newState state, rate int } m.lastRateChange = now - m.replicaUnderTest = &replicaUnderTest{lagRecordNow.Key, topoproto.TabletAliasString(lagRecordNow.Tablet.Alias), lagRecordNow.Target.TabletType, newState, now.Add(testDuration)} + m.replicaUnderTest = &replicaUnderTest{discovery.TabletToMapKey(lagRecordNow.Tablet), topoproto.TabletAliasString(lagRecordNow.Tablet.Alias), lagRecordNow.Target.TabletType, newState, now.Add(testDuration)} if rate == oldRate { return diff --git a/go/vt/throttler/max_replication_lag_module_test.go b/go/vt/throttler/max_replication_lag_module_test.go index 1cb74c79e65..082eb6ee1d7 100644 --- a/go/vt/throttler/max_replication_lag_module_test.go +++ b/go/vt/throttler/max_replication_lag_module_test.go @@ -244,7 +244,7 @@ func TestMaxReplicationLagModule_ReplicaUnderTest_LastErrorOrNotUp(t *testing.T) tf.ratesHistory.add(sinceZero(110*time.Second), 200) tf.ratesHistory.add(sinceZero(114*time.Second), 400) rNotUp := lagRecord(sinceZero(115*time.Second), r1, 0) - rNotUp.Up = false + rNotUp.Serving = false tf.m.replicaLagCache.add(rNotUp) // r2 @ 150s, 0s lag (lastError no longer set) @@ -453,7 +453,7 @@ func TestMaxReplicationLagModule_Increase_BadRateUpperBound(t *testing.T) { t.Fatal(err) } - //Assume that a bad value of 150 was set @ 30s and log error + // Assume that a bad value of 150 was set @ 30s and log error if err := tf.m.memory.markBad(150, sinceZero(30*time.Second)); err != nil { log.Errorf("tf.m.memory.markBad(150, sinceZero(30*time.Second)) falied : %v", err) } @@ -955,7 +955,7 @@ func lagRecord(t time.Time, uid, lag uint32) replicationLagRecord { } // tabletStats creates fake tablet health data. -func tabletStats(uid, lag uint32) discovery.LegacyTabletStats { +func tabletStats(uid, lag uint32) discovery.TabletHealth { typ := topodatapb.TabletType_REPLICA if uid == rdonly1 || uid == rdonly2 { typ = topodatapb.TabletType_RDONLY @@ -967,21 +967,19 @@ func tabletStats(uid, lag uint32) discovery.LegacyTabletStats { Type: typ, PortMap: map[string]int32{"vt": int32(uid)}, } - return discovery.LegacyTabletStats{ + return discovery.TabletHealth{ Tablet: tablet, - Key: discovery.TabletToMapKey(tablet), Target: &querypb.Target{ Keyspace: "ks1", Shard: "-80", TabletType: typ, }, - Up: true, Serving: true, Stats: &querypb.RealtimeStats{ ReplicationLagSeconds: lag, }, - TabletExternallyReparentedTimestamp: 22, - LastError: nil, + PrimaryTermStartTime: 22, + LastError: nil, } } diff --git a/go/vt/throttler/replication_lag_cache.go b/go/vt/throttler/replication_lag_cache.go index ee4a5b18377..3b1a8685245 100644 --- a/go/vt/throttler/replication_lag_cache.go +++ b/go/vt/throttler/replication_lag_cache.go @@ -60,17 +60,17 @@ func newReplicationLagCache(historyCapacityPerReplica int) *replicationLagCache // add inserts or updates "r" in the cache for the replica with the key "r.Key". func (c *replicationLagCache) add(r replicationLagRecord) { - if !r.Up { + if !r.Serving { // Tablet is down. Do no longer track it. - delete(c.entries, r.Key) - delete(c.ignoredSlowReplicasInARow, r.Key) + delete(c.entries, discovery.TabletToMapKey(r.Tablet)) + delete(c.ignoredSlowReplicasInARow, discovery.TabletToMapKey(r.Tablet)) return } - entry, ok := c.entries[r.Key] + entry, ok := c.entries[discovery.TabletToMapKey(r.Tablet)] if !ok { entry = newReplicationLagHistory(c.historyCapacityPerReplica) - c.entries[r.Key] = entry + c.entries[discovery.TabletToMapKey(r.Tablet)] = entry } entry.add(r) @@ -114,7 +114,7 @@ func (c *replicationLagCache) sortByLag(ignoreNSlowestReplicas int, minimumRepli for _, v := range c.entries { record := v.latest() if int64(record.Stats.ReplicationLagSeconds) >= minimumReplicationLag { - list = append(list, record.LegacyTabletStats) + list = append(list, record.TabletHealth) i++ } } @@ -122,13 +122,13 @@ func (c *replicationLagCache) sortByLag(ignoreNSlowestReplicas int, minimumRepli // Now remember the N slowest replicas. for i := len(list) - 1; len(list) > 0 && i >= len(list)-ignoreNSlowestReplicas; i-- { - c.slowReplicas[list[i].Key] = true + c.slowReplicas[discovery.TabletToMapKey(list[i].Tablet)] = true } } // byLagAndTabletUID is a slice of discovery.LegacyTabletStats elements that // implements sort.Interface to sort by replication lag and tablet Uid. -type byLagAndTabletUID []discovery.LegacyTabletStats +type byLagAndTabletUID []discovery.TabletHealth func (a byLagAndTabletUID) Swap(i, j int) { a[i], a[j] = a[j], a[i] } func (a byLagAndTabletUID) Len() int { return len(a) } diff --git a/go/vt/throttler/replication_lag_cache_test.go b/go/vt/throttler/replication_lag_cache_test.go index c2381f05246..8964db17c05 100644 --- a/go/vt/throttler/replication_lag_cache_test.go +++ b/go/vt/throttler/replication_lag_cache_test.go @@ -19,6 +19,7 @@ package throttler import ( "testing" "time" + "vitess.io/vitess/go/vt/discovery" ) // TestReplicationLagCache tests that the ring buffer in "replicationLagHistory" @@ -27,7 +28,7 @@ import ( // max_replication_lag_module_test.go. func TestReplicationLagCache(t *testing.T) { c := newReplicationLagCache(2) - r1Key := tabletStats(r1, 1).Key + r1Key := discovery.TabletToMapKey(tabletStats(r1, 1).Tablet) // If there is no entry yet, a zero struct is returned. zeroEntry := c.atOrAfter(r1Key, sinceZero(0*time.Second)) @@ -73,7 +74,7 @@ func TestReplicationLagCache(t *testing.T) { func TestReplicationLagCache_SortByLag(t *testing.T) { c := newReplicationLagCache(2) - r1Key := tabletStats(r1, 1).Key + r1Key := discovery.TabletToMapKey(tabletStats(r1, 1).Tablet) c.add(lagRecord(sinceZero(1*time.Second), r1, 30)) c.sortByLag(1 /* ignoreNSlowestReplicas */, 30 /* minimumReplicationLag */) diff --git a/go/vt/throttler/replication_lag_record.go b/go/vt/throttler/replication_lag_record.go index 734cb3df43a..1a079aab325 100644 --- a/go/vt/throttler/replication_lag_record.go +++ b/go/vt/throttler/replication_lag_record.go @@ -29,7 +29,7 @@ type replicationLagRecord struct { time time.Time // LegacyTabletStats holds a copy of the current health data of the tablet. - discovery.LegacyTabletStats + discovery.TabletHealth } func (r replicationLagRecord) isZero() bool { diff --git a/go/vt/throttler/throttler.go b/go/vt/throttler/throttler.go index ea0096bc537..c20b0329338 100644 --- a/go/vt/throttler/throttler.go +++ b/go/vt/throttler/throttler.go @@ -295,8 +295,8 @@ func (t *Throttler) SetMaxRate(rate int64) { // RecordReplicationLag must be called by users to report the "ts" tablet health // data observed at "time". // Note: After Close() is called, this method must not be called anymore. -func (t *Throttler) RecordReplicationLag(time time.Time, ts *discovery.LegacyTabletStats) { - t.maxReplicationLagModule.RecordReplicationLag(time, ts) +func (t *Throttler) RecordReplicationLag(time time.Time, th *discovery.TabletHealth) { + t.maxReplicationLagModule.RecordReplicationLag(time, th) } // GetConfiguration returns the configuration of the MaxReplicationLag module. From 860c6fd0c305583da0c6ec3da2ed9a064d4ac501 Mon Sep 17 00:00:00 2001 From: Florent Poinsard Date: Mon, 20 Jun 2022 15:23:06 +0200 Subject: [PATCH 03/15] Started the removal of the legacy healthcheck in vtworker Signed-off-by: Florent Poinsard --- go/vt/discovery/healthcheck.go | 2 +- go/vt/discovery/utils.go | 14 +++++-- go/vt/discovery/utils_test.go | 52 +++++++++++++------------- go/vt/worker/executor.go | 14 +++---- go/vt/worker/split_clone.go | 58 +++++++++++++++++------------ go/vt/worker/tablet_provider.go | 10 +++-- go/vt/worker/tablet_tracker.go | 2 +- go/vt/worker/tablet_tracker_test.go | 10 ++--- go/vt/worker/topo_utils.go | 25 +++++++------ 9 files changed, 104 insertions(+), 83 deletions(-) diff --git a/go/vt/discovery/healthcheck.go b/go/vt/discovery/healthcheck.go index fc68ff17d9c..d4c7cb54191 100644 --- a/go/vt/discovery/healthcheck.go +++ b/go/vt/discovery/healthcheck.go @@ -44,9 +44,9 @@ import ( "strings" "sync" "time" - "vitess.io/vitess/go/netutil" "vitess.io/vitess/go/flagutil" + "vitess.io/vitess/go/netutil" "vitess.io/vitess/go/stats" "vitess.io/vitess/go/vt/log" "vitess.io/vitess/go/vt/proto/query" diff --git a/go/vt/discovery/utils.go b/go/vt/discovery/utils.go index e7793731463..ba5e782ddee 100644 --- a/go/vt/discovery/utils.go +++ b/go/vt/discovery/utils.go @@ -27,17 +27,25 @@ import ( // tablets returned by LegacyHealthCheck.GetTabletStatsFrom*. // See also legacy_replicationlag.go for a more sophisicated filter used by vtgate. +func TabletHealthReferenceListToValue(thl []*TabletHealth) []TabletHealth { + newTh := []TabletHealth{} + for _, th := range thl { + newTh = append(newTh, *th) + } + return newTh +} + // RemoveUnhealthyTablets filters all unhealthy tablets out. // NOTE: Non-serving tablets are considered healthy. -func RemoveUnhealthyTablets(tabletStatsList []LegacyTabletStats) []LegacyTabletStats { - result := make([]LegacyTabletStats, 0, len(tabletStatsList)) +func RemoveUnhealthyTablets(tabletStatsList []TabletHealth) []TabletHealth { + result := make([]TabletHealth, 0, len(tabletStatsList)) for _, ts := range tabletStatsList { // Note we do not check the 'Serving' flag here. // This is mainly to avoid the case where we run a vtworker Diff between a // source and destination, and the source is not serving (disabled by // TabletControl). When we switch the tablet to 'worker', it will // go back to serving state. - if ts.Stats == nil || ts.Stats.HealthError != "" || ts.LastError != nil || LegacyIsReplicationLagHigh(&ts) { + if ts.Stats == nil || ts.Stats.HealthError != "" || ts.LastError != nil || IsReplicationLagHigh(&ts) { continue } result = append(result, ts) diff --git a/go/vt/discovery/utils_test.go b/go/vt/discovery/utils_test.go index 501c3a03832..97cf14d7f17 100644 --- a/go/vt/discovery/utils_test.go +++ b/go/vt/discovery/utils_test.go @@ -27,36 +27,36 @@ import ( func TestRemoveUnhealthyTablets(t *testing.T) { var testcases = []struct { desc string - input []LegacyTabletStats - want []LegacyTabletStats + input []TabletHealth + want []TabletHealth }{{ desc: "tablets missing Stats", - input: []LegacyTabletStats{replica(1), replica(2)}, - want: []LegacyTabletStats{}, + input: []TabletHealth{replica(1), replica(2)}, + want: []TabletHealth{}, }, { desc: "all tablets healthy", - input: []LegacyTabletStats{healthy(replica(1)), healthy(replica(2))}, - want: []LegacyTabletStats{healthy(replica(1)), healthy(replica(2))}, + input: []TabletHealth{healthy(replica(1)), healthy(replica(2))}, + want: []TabletHealth{healthy(replica(1)), healthy(replica(2))}, }, { desc: "one unhealthy tablet (error)", - input: []LegacyTabletStats{healthy(replica(1)), unhealthyError(replica(2))}, - want: []LegacyTabletStats{healthy(replica(1))}, + input: []TabletHealth{healthy(replica(1)), unhealthyError(replica(2))}, + want: []TabletHealth{healthy(replica(1))}, }, { desc: "one error tablet", - input: []LegacyTabletStats{healthy(replica(1)), unhealthyLastError(replica(2))}, - want: []LegacyTabletStats{healthy(replica(1))}, + input: []TabletHealth{healthy(replica(1)), unhealthyLastError(replica(2))}, + want: []TabletHealth{healthy(replica(1))}, }, { desc: "one unhealthy tablet (lag)", - input: []LegacyTabletStats{healthy(replica(1)), unhealthyLag(replica(2))}, - want: []LegacyTabletStats{healthy(replica(1))}, + input: []TabletHealth{healthy(replica(1)), unhealthyLag(replica(2))}, + want: []TabletHealth{healthy(replica(1))}, }, { desc: "no filtering by tablet type", - input: []LegacyTabletStats{healthy(primary(1)), healthy(replica(2)), healthy(rdonly(3))}, - want: []LegacyTabletStats{healthy(primary(1)), healthy(replica(2)), healthy(rdonly(3))}, + input: []TabletHealth{healthy(primary(1)), healthy(replica(2)), healthy(rdonly(3))}, + want: []TabletHealth{healthy(primary(1)), healthy(replica(2)), healthy(rdonly(3))}, }, { desc: "non-serving tablets won't be removed", - input: []LegacyTabletStats{notServing(healthy(replica(1)))}, - want: []LegacyTabletStats{notServing(healthy(replica(1)))}, + input: []TabletHealth{notServing(healthy(replica(1)))}, + want: []TabletHealth{notServing(healthy(replica(1)))}, }} for _, tc := range testcases { @@ -73,20 +73,20 @@ func TestRemoveUnhealthyTablets(t *testing.T) { } } -func primary(uid uint32) LegacyTabletStats { +func primary(uid uint32) TabletHealth { return minimalTabletStats(uid, topodatapb.TabletType_PRIMARY) } -func replica(uid uint32) LegacyTabletStats { +func replica(uid uint32) TabletHealth { return minimalTabletStats(uid, topodatapb.TabletType_REPLICA) } -func rdonly(uid uint32) LegacyTabletStats { +func rdonly(uid uint32) TabletHealth { return minimalTabletStats(uid, topodatapb.TabletType_RDONLY) } -func minimalTabletStats(uid uint32, tabletType topodatapb.TabletType) LegacyTabletStats { - return LegacyTabletStats{ +func minimalTabletStats(uid uint32, tabletType topodatapb.TabletType) TabletHealth { + return TabletHealth{ Tablet: &topodatapb.Tablet{ Alias: &topodatapb.TabletAlias{ Uid: uid}, @@ -100,33 +100,33 @@ func minimalTabletStats(uid uint32, tabletType topodatapb.TabletType) LegacyTabl } } -func healthy(ts LegacyTabletStats) LegacyTabletStats { +func healthy(ts TabletHealth) TabletHealth { ts.Stats = &querypb.RealtimeStats{ ReplicationLagSeconds: uint32(1), } return ts } -func unhealthyLag(ts LegacyTabletStats) LegacyTabletStats { +func unhealthyLag(ts TabletHealth) TabletHealth { ts.Stats = &querypb.RealtimeStats{ ReplicationLagSeconds: uint32(3600), } return ts } -func unhealthyError(ts LegacyTabletStats) LegacyTabletStats { +func unhealthyError(ts TabletHealth) TabletHealth { ts.Stats = &querypb.RealtimeStats{ HealthError: "unhealthy", } return ts } -func unhealthyLastError(ts LegacyTabletStats) LegacyTabletStats { +func unhealthyLastError(ts TabletHealth) TabletHealth { ts.LastError = errors.New("err") return ts } -func notServing(ts LegacyTabletStats) LegacyTabletStats { +func notServing(ts TabletHealth) TabletHealth { ts.Serving = false return ts } diff --git a/go/vt/worker/executor.go b/go/vt/worker/executor.go index 2e2b30d5d8e..04201e4994b 100644 --- a/go/vt/worker/executor.go +++ b/go/vt/worker/executor.go @@ -41,7 +41,7 @@ import ( // executor is also used for executing vreplication and RefreshState commands. type executor struct { wr *wrangler.Wrangler - tsc *discovery.LegacyTabletStatsCache + hc *discovery.HealthCheckImpl throttler *throttler.Throttler keyspace string shard string @@ -51,10 +51,10 @@ type executor struct { statsKey []string } -func newExecutor(wr *wrangler.Wrangler, tsc *discovery.LegacyTabletStatsCache, throttler *throttler.Throttler, keyspace, shard string, threadID int) *executor { +func newExecutor(wr *wrangler.Wrangler, hc *discovery.HealthCheckImpl, throttler *throttler.Throttler, keyspace, shard string, threadID int) *executor { return &executor{ wr: wr, - tsc: tsc, + hc: hc, throttler: throttler, keyspace: keyspace, shard: shard, @@ -122,18 +122,18 @@ func (e *executor) fetchWithRetries(ctx context.Context, action func(ctx context // Is this current attempt a retry of a previous attempt? isRetry := false for { - var primary *discovery.LegacyTabletStats + var primary *discovery.TabletHealth var err error // Get the current primary from the LegacyTabletStatsCache. - primaries := e.tsc.GetHealthyTabletStats(e.keyspace, e.shard, topodatapb.TabletType_PRIMARY) + primaries := e.hc.GetHealthyTabletStats(&querypb.Target{Keyspace: e.keyspace, Shard: e.shard, TabletType: topodatapb.TabletType_PRIMARY}) if len(primaries) == 0 { e.wr.Logger().Warningf("ExecuteFetch failed for keyspace/shard %v/%v because no PRIMARY is available; will retry until there is PRIMARY again", e.keyspace, e.shard) statsRetryCount.Add(1) statsRetryCounters.Add(retryCategoryNoPrimaryAvailable, 1) goto retry } - primary = &primaries[0] + primary = primaries[0] // Block if we are throttled. if e.throttler != nil { @@ -194,7 +194,7 @@ func (e *executor) fetchWithRetries(ctx context.Context, action func(ctx context // checkError returns true if the error can be ignored and the command // succeeded, false if the error is retryable and a non-nil error if the // command must not be retried. -func (e *executor) checkError(ctx context.Context, err error, isRetry bool, primary *discovery.LegacyTabletStats) (bool, error) { +func (e *executor) checkError(ctx context.Context, err error, isRetry bool, primary *discovery.TabletHealth) (bool, error) { tabletString := fmt.Sprintf("%v (%v/%v)", topoproto.TabletAliasString(primary.Tablet.Alias), e.keyspace, e.shard) // first see if it was a context timeout. diff --git a/go/vt/worker/split_clone.go b/go/vt/worker/split_clone.go index b8c871e2d59..811afd569b0 100644 --- a/go/vt/worker/split_clone.go +++ b/go/vt/worker/split_clone.go @@ -24,6 +24,8 @@ import ( "sync" "time" + querypb "vitess.io/vitess/go/vt/proto/query" + "vitess.io/vitess/go/event" "vitess.io/vitess/go/stats" "vitess.io/vitess/go/vt/binlog/binlogplayer" @@ -97,8 +99,8 @@ type SplitCloneWorker struct { // PRIMARY tablet, b) get the list of healthy RDONLY tablets and c) track the // replication lag of all REPLICA tablets. // It must be closed at the end of the command. - healthCheck discovery.LegacyHealthCheck - tsc *discovery.LegacyTabletStatsCache + healthCheck *discovery.HealthCheckImpl + healthCheckCh chan *discovery.TabletHealth // populated during WorkerStateFindTargets, read-only after that sourceTablets []*topodatapb.Tablet @@ -109,7 +111,7 @@ type SplitCloneWorker struct { // shard. It updates the list of tablets in the healthcheck if replicas are // added/removed. // Each watcher must be stopped at the end of the command. - shardWatchers []*discovery.LegacyTopologyWatcher + shardWatchers []*discovery.TopologyWatcher // destinationDbNames stores for each destination keyspace/shard the MySQL // database name. // Example Map Entry: test_keyspace/-80 => vt_test_keyspace @@ -558,17 +560,25 @@ func (scw *SplitCloneWorker) init(ctx context.Context) error { } // Initialize healthcheck and add destination shards to it. - scw.healthCheck = discovery.NewLegacyHealthCheck(*healthcheckRetryDelay, *healthCheckTimeout) - scw.tsc = discovery.NewTabletStatsCacheDoNotSetListener(scw.wr.TopoServer(), scw.cell) - // We set sendDownEvents=true because it's required by LegacyTabletStatsCache. - scw.healthCheck.SetListener(scw, true /* sendDownEvents */) + scw.healthCheck = discovery.NewHealthCheck(ctx, *healthcheckRetryDelay, *healthCheckTimeout, scw.wr.TopoServer(), scw.cell, "") + scw.healthCheckCh = scw.healthCheck.Subscribe() + go func() { + for { + select { + case th := <-scw.healthCheckCh: + scw.StatsUpdate(th) + default: + return + } + } + }() // Start watchers to get tablets added automatically to healthCheck. allShards := append(scw.sourceShards, scw.destinationShards...) for _, si := range allShards { - watcher := discovery.NewLegacyShardReplicationWatcher(ctx, scw.wr.TopoServer(), scw.healthCheck, - scw.cell, si.Keyspace(), si.ShardName(), - *healthCheckTopologyRefresh, discovery.DefaultTopoReadConcurrency) + watcher := discovery.NewCellTabletsWatcher(ctx, scw.wr.TopoServer(), scw.healthCheck, + discovery.NewFilterByKeyspace([]string{si.Keyspace()}), scw.cell, *healthCheckTopologyRefresh, + true, discovery.DefaultTopoReadConcurrency) scw.shardWatchers = append(scw.shardWatchers, watcher) } @@ -761,7 +771,7 @@ func (scw *SplitCloneWorker) findOfflineSourceTablets(ctx context.Context) error scw.sourceAliases = make([]*topodatapb.TabletAlias, len(scw.sourceShards)) for i, si := range scw.sourceShards { var err error - scw.sourceAliases[i], err = FindWorkerTablet(ctx, scw.wr, scw.cleaner, scw.tsc, scw.cell, si.Keyspace(), si.ShardName(), scw.minHealthyTablets, scw.tabletType) + scw.sourceAliases[i], err = FindWorkerTablet(ctx, scw.wr, scw.cleaner, scw.healthCheck, scw.cell, si.Keyspace(), si.ShardName(), scw.minHealthyTablets, scw.tabletType) if err != nil { return vterrors.Wrapf(err, "FindWorkerTablet() failed for %v/%v/%v", scw.cell, si.Keyspace(), si.ShardName()) } @@ -806,7 +816,7 @@ func (scw *SplitCloneWorker) findTransactionalSources(ctx context.Context) error // find an appropriate tablet in the source shard si := scw.sourceShards[0] scw.sourceAliases = make([]*topodatapb.TabletAlias, 1) - scw.sourceAliases[0], err = FindHealthyTablet(ctx, scw.wr, scw.tsc, scw.cell, si.Keyspace(), si.ShardName(), scw.minHealthyTablets, scw.tabletType) + scw.sourceAliases[0], err = FindHealthyTablet(ctx, scw.wr, scw.healthCheck, scw.cell, si.Keyspace(), si.ShardName(), scw.minHealthyTablets, scw.tabletType) if err != nil { return vterrors.Wrapf(err, "FindHealthyTablet() failed for %v/%v/%v", scw.cell, si.Keyspace(), si.ShardName()) } @@ -842,12 +852,12 @@ func (scw *SplitCloneWorker) findDestinationPrimarys(ctx context.Context) error scw.wr.Logger().Infof("Finding a PRIMARY tablet for each destination shard...") for _, si := range scw.destinationShards { waitCtx, waitCancel := context.WithTimeout(ctx, *waitForHealthyTabletsTimeout) - err := scw.tsc.WaitForTablets(waitCtx, si.Keyspace(), si.ShardName(), topodatapb.TabletType_PRIMARY) + err := scw.healthCheck.WaitForTablets(waitCtx, si.Keyspace(), si.ShardName(), topodatapb.TabletType_PRIMARY) waitCancel() if err != nil { return vterrors.Wrapf(err, "cannot find PRIMARY tablet for destination shard for %v/%v (in cell: %v)", si.Keyspace(), si.ShardName(), scw.cell) } - primarys := scw.tsc.GetHealthyTabletStats(si.Keyspace(), si.ShardName(), topodatapb.TabletType_PRIMARY) + primarys := scw.healthCheck.GetHealthyTabletStats(&querypb.Target{Keyspace: si.Keyspace(), Shard: si.ShardName(), TabletType: topodatapb.TabletType_PRIMARY}) if len(primarys) == 0 { return vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "cannot find PRIMARY tablet for destination shard for %v/%v (in cell: %v) in LegacyHealthCheck: empty LegacyTabletStats list", si.Keyspace(), si.ShardName(), scw.cell) } @@ -881,7 +891,7 @@ func (scw *SplitCloneWorker) waitForTablets(ctx context.Context, shardInfos []*t // We wait for --min_healthy_tablets because we will use several // tablets per shard to spread reading the chunks of rows across as many // tablets as possible. - if _, err := waitForHealthyTablets(ctx, scw.wr, scw.tsc, scw.cell, keyspace, shard, scw.minHealthyTablets, timeout, scw.tabletType); err != nil { + if _, err := waitForHealthyTablets(ctx, scw.wr, scw.healthCheck, scw.cell, keyspace, shard, scw.minHealthyTablets, timeout, scw.tabletType); err != nil { rec.RecordError(err) } }(si.Keyspace(), si.ShardName()) @@ -898,7 +908,7 @@ func (scw *SplitCloneWorker) findFirstSourceTablet(ctx context.Context, state St // Pick any healthy serving source tablet. si := scw.sourceShards[0] - tablets := scw.tsc.GetHealthyTabletStats(si.Keyspace(), si.ShardName(), scw.tabletType) + tablets := scw.healthCheck.GetHealthyTabletStats(&querypb.Target{Keyspace: si.Keyspace(), Shard: si.ShardName(), TabletType: scw.tabletType}) if len(tablets) == 0 { // We fail fast on this problem and don't retry because at the start all tablets should be healthy. return nil, vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "no healthy %v tablet in source shard (%v) available (required to find out the schema)", topodatapb.TabletType_name[int32(scw.tabletType)], topoproto.KeyspaceShardString(si.Keyspace(), si.ShardName())) @@ -922,9 +932,9 @@ func (scw *SplitCloneWorker) getCounters(state StatusWorkerState) ([]*stats.Coun func (scw *SplitCloneWorker) startExecutor(ctx context.Context, wg *sync.WaitGroup, keyspace, shard string, insertChannel chan string, threadID int, processError func(string, ...any)) { defer wg.Done() t := scw.getThrottler(keyspace, shard) - //defer t.ThreadFinished(threadID) + // defer t.ThreadFinished(threadID) - executor := newExecutor(scw.wr, scw.tsc, t, keyspace, shard, threadID) + executor := newExecutor(scw.wr, scw.healthCheck, t, keyspace, shard, threadID) if err := executor.fetchLoop(ctx, insertChannel); err != nil { processError("executer.FetchLoop failed: %v", err) } @@ -961,7 +971,7 @@ func (scw *SplitCloneWorker) getSourceResultReader(ctx context.Context, td *tabl if txID < 1 { return nil, vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "tried using consistent snapshot without a valid transaction") } - tp := newShardTabletProvider(scw.tsc, scw.tabletTracker, si.Keyspace(), si.ShardName(), scw.tabletType) + tp := newShardTabletProvider(scw.healthCheck, scw.tabletTracker, si.Keyspace(), si.ShardName(), scw.tabletType) sourceResultReader, err = NewTransactionalRestartableResultReader(ctx, scw.wr.Logger(), tp, td, chunk, false, txID) if err != nil { closeReaders(ctx, sourceReaders) @@ -980,7 +990,7 @@ func (scw *SplitCloneWorker) getSourceResultReader(ctx context.Context, td *tabl // longer stopped at the same point as we took it offline initially. allowMultipleRetries = false } else { - tp = newShardTabletProvider(scw.tsc, scw.tabletTracker, si.Keyspace(), si.ShardName(), scw.tabletType) + tp = newShardTabletProvider(scw.healthCheck, scw.tabletTracker, si.Keyspace(), si.ShardName(), scw.tabletType) } sourceResultReader, err = NewRestartableResultReader(ctx, scw.wr.Logger(), tp, td, chunk, allowMultipleRetries) if err != nil { @@ -1002,7 +1012,7 @@ func (scw *SplitCloneWorker) getDestinationResultReader(ctx context.Context, td destReaders := make([]ResultReader, len(scw.destinationShards)) for shardIndex, si := range scw.destinationShards { - tp := newShardTabletProvider(scw.tsc, scw.tabletTracker, si.Keyspace(), si.ShardName(), topodatapb.TabletType_PRIMARY) + tp := newShardTabletProvider(scw.healthCheck, scw.tabletTracker, si.Keyspace(), si.ShardName(), topodatapb.TabletType_PRIMARY) destResultReader, err := NewRestartableResultReader(ctx, scw.wr.Logger(), tp, td, chunk, true /* allowMultipleRetries */) if err != nil { closeReaders(ctx, destReaders) @@ -1268,7 +1278,7 @@ func (scw *SplitCloneWorker) setUpVReplication(ctx context.Context) error { defer wg.Done() scw.wr.Logger().Infof("Making and populating vreplication table") - exc := newExecutor(scw.wr, scw.tsc, nil, keyspace, shard, 0) + exc := newExecutor(scw.wr, scw.healthCheck, nil, keyspace, shard, 0) for shardIndex, src := range scw.sourceShards { // Check if any error occurred in any other gorouties: select { @@ -1351,8 +1361,8 @@ func (scw *SplitCloneWorker) createKeyResolver(td *tabletmanagerdatapb.TableDefi // and forwards them to the respective throttler instance. // It also forwards any update to the LegacyTabletStatsCache to keep it up to date. // It is part of the discovery.LegacyHealthCheckStatsListener interface. -func (scw *SplitCloneWorker) StatsUpdate(ts *discovery.LegacyTabletStats) { - scw.tsc.StatsUpdate(ts) +func (scw *SplitCloneWorker) StatsUpdate(ts *discovery.TabletHealth) { + // scw.tsc.StatsUpdate(ts) // Ignore unless REPLICA or RDONLY. if ts.Target.TabletType != topodatapb.TabletType_REPLICA && ts.Target.TabletType != topodatapb.TabletType_RDONLY { diff --git a/go/vt/worker/tablet_provider.go b/go/vt/worker/tablet_provider.go index 8c07e18407f..9b46fc7de85 100644 --- a/go/vt/worker/tablet_provider.go +++ b/go/vt/worker/tablet_provider.go @@ -19,6 +19,8 @@ package worker import ( "fmt" + querypb "vitess.io/vitess/go/vt/proto/query" + "vitess.io/vitess/go/vt/vterrors" "context" @@ -75,20 +77,20 @@ func (p *singleTabletProvider) description() string { // shardTabletProvider returns a random healthy RDONLY tablet for a given // keyspace and shard. It uses the LegacyHealthCheck module to retrieve the tablets. type shardTabletProvider struct { - tsc *discovery.LegacyTabletStatsCache + hc *discovery.HealthCheckImpl tracker *TabletTracker keyspace string shard string tabletType topodatapb.TabletType } -func newShardTabletProvider(tsc *discovery.LegacyTabletStatsCache, tracker *TabletTracker, keyspace, shard string, tabletType topodatapb.TabletType) *shardTabletProvider { - return &shardTabletProvider{tsc, tracker, keyspace, shard, tabletType} +func newShardTabletProvider(healthCheck *discovery.HealthCheckImpl, tracker *TabletTracker, keyspace, shard string, tabletType topodatapb.TabletType) *shardTabletProvider { + return &shardTabletProvider{healthCheck, tracker, keyspace, shard, tabletType} } func (p *shardTabletProvider) getTablet() (*topodatapb.Tablet, error) { // Pick any healthy serving tablet. - tablets := p.tsc.GetHealthyTabletStats(p.keyspace, p.shard, p.tabletType) + tablets := p.hc.GetHealthyTabletStats(&querypb.Target{Keyspace: p.keyspace, Shard: p.shard, TabletType: p.tabletType}) if len(tablets) == 0 { return nil, fmt.Errorf("%v: no healthy %v tablets available", p.description(), p.tabletType) } diff --git a/go/vt/worker/tablet_tracker.go b/go/vt/worker/tablet_tracker.go index 3e2dee1ba61..51ca8abe1b0 100644 --- a/go/vt/worker/tablet_tracker.go +++ b/go/vt/worker/tablet_tracker.go @@ -50,7 +50,7 @@ func NewTabletTracker() *TabletTracker { // Track will pick the least used tablet from "stats", increment its usage by 1 // and return it. // "stats" must not be empty. -func (t *TabletTracker) Track(stats []discovery.LegacyTabletStats) *topodatapb.Tablet { +func (t *TabletTracker) Track(stats []*discovery.TabletHealth) *topodatapb.Tablet { if len(stats) == 0 { panic("stats must not be empty") } diff --git a/go/vt/worker/tablet_tracker_test.go b/go/vt/worker/tablet_tracker_test.go index 17375b6a11a..dd2740012f6 100644 --- a/go/vt/worker/tablet_tracker_test.go +++ b/go/vt/worker/tablet_tracker_test.go @@ -28,25 +28,25 @@ import ( topodatapb "vitess.io/vitess/go/vt/proto/topodata" ) -var ts1 = discovery.LegacyTabletStats{ +var ts1 = discovery.TabletHealth{ Tablet: topo.NewTablet(10, "cell", "host1"), Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, } -var ts2 = discovery.LegacyTabletStats{ +var ts2 = discovery.TabletHealth{ Tablet: topo.NewTablet(20, "cell", "host1"), Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, } -var allTs = []discovery.LegacyTabletStats{ts1, ts2} +var allTs = []*discovery.TabletHealth{&ts1, &ts2} func TestTabletsInUse(t *testing.T) { tt := NewTabletTracker() - tt.Track([]discovery.LegacyTabletStats{ts1}) + tt.Track([]*discovery.TabletHealth{&ts1}) if got, want := tt.TabletsInUse(), "cell-0000000010"; got != want { t.Fatalf("TabletsInUse() = %v, want = %v", got, want) } - tt.Track([]discovery.LegacyTabletStats{ts2}) + tt.Track([]*discovery.TabletHealth{&ts2}) if got, want := tt.TabletsInUse(), "cell-0000000010 cell-0000000020"; got != want { t.Fatalf("TabletsInUse() = %v, want = %v", got, want) } diff --git a/go/vt/worker/topo_utils.go b/go/vt/worker/topo_utils.go index b707cf3955c..86601fb17b9 100644 --- a/go/vt/worker/topo_utils.go +++ b/go/vt/worker/topo_utils.go @@ -22,6 +22,8 @@ import ( "math/rand" "time" + querypb "vitess.io/vitess/go/vt/proto/query" + "vitess.io/vitess/go/vt/vterrors" "context" @@ -47,17 +49,16 @@ var ( // Since we don't want to use them all, we require at least // minHealthyRdonlyTablets servers to be healthy. // May block up to -wait_for_healthy_rdonly_tablets_timeout. -func FindHealthyTablet(ctx context.Context, wr *wrangler.Wrangler, tsc *discovery.LegacyTabletStatsCache, cell, keyspace, shard string, minHealthyRdonlyTablets int, tabletType topodatapb.TabletType) (*topodatapb.TabletAlias, error) { - if tsc == nil { +func FindHealthyTablet(ctx context.Context, wr *wrangler.Wrangler, healthCheck *discovery.HealthCheckImpl, cell, keyspace, shard string, minHealthyRdonlyTablets int, tabletType topodatapb.TabletType) (*topodatapb.TabletAlias, error) { + if healthCheck == nil { // No healthcheck instance provided. Create one. - healthCheck := discovery.NewLegacyHealthCheck(*healthcheckRetryDelay, *healthCheckTimeout) - tsc = discovery.NewLegacyTabletStatsCache(healthCheck, wr.TopoServer(), cell) - watcher := discovery.NewLegacyShardReplicationWatcher(ctx, wr.TopoServer(), healthCheck, cell, keyspace, shard, *healthCheckTopologyRefresh, discovery.DefaultTopoReadConcurrency) + healthCheck = discovery.NewHealthCheck(ctx, *healthcheckRetryDelay, *healthCheckTimeout, wr.TopoServer(), cell, "") + watcher := discovery.NewCellTabletsWatcher(ctx, wr.TopoServer(), healthCheck, discovery.NewFilterByKeyspace([]string{keyspace}), cell, *healthCheckTopologyRefresh, true, discovery.DefaultTopoReadConcurrency) defer watcher.Stop() defer healthCheck.Close() } - healthyTablets, err := waitForHealthyTablets(ctx, wr, tsc, cell, keyspace, shard, minHealthyRdonlyTablets, *waitForHealthyTabletsTimeout, tabletType) + healthyTablets, err := waitForHealthyTablets(ctx, wr, healthCheck, cell, keyspace, shard, minHealthyRdonlyTablets, *waitForHealthyTabletsTimeout, tabletType) if err != nil { return nil, err } @@ -67,7 +68,7 @@ func FindHealthyTablet(ctx context.Context, wr *wrangler.Wrangler, tsc *discover return healthyTablets[index].Tablet.Alias, nil } -func waitForHealthyTablets(ctx context.Context, wr *wrangler.Wrangler, tsc *discovery.LegacyTabletStatsCache, cell, keyspace, shard string, minHealthyRdonlyTablets int, timeout time.Duration, tabletType topodatapb.TabletType) ([]discovery.LegacyTabletStats, error) { +func waitForHealthyTablets(ctx context.Context, wr *wrangler.Wrangler, healthCheck *discovery.HealthCheckImpl, cell, keyspace, shard string, minHealthyRdonlyTablets int, timeout time.Duration, tabletType topodatapb.TabletType) ([]discovery.TabletHealth, error) { busywaitCtx, busywaitCancel := context.WithTimeout(ctx, timeout) defer busywaitCancel() @@ -77,11 +78,11 @@ func waitForHealthyTablets(ctx context.Context, wr *wrangler.Wrangler, tsc *disc cell, keyspace, shard, minHealthyRdonlyTablets, time.Until(deadlineForLog).Seconds()) // Wait for at least one RDONLY tablet initially before checking the list. - if err := tsc.WaitForTablets(busywaitCtx, keyspace, shard, tabletType); err != nil { + if err := healthCheck.WaitForTablets(busywaitCtx, keyspace, shard, tabletType); err != nil { return nil, vterrors.Wrapf(err, "error waiting for %v tablets for (%v,%v/%v)", tabletType, cell, keyspace, shard) } - var healthyTablets []discovery.LegacyTabletStats + var healthyTablets []discovery.TabletHealth for { select { case <-busywaitCtx.Done(): @@ -90,7 +91,7 @@ func waitForHealthyTablets(ctx context.Context, wr *wrangler.Wrangler, tsc *disc default: } - healthyTablets = discovery.RemoveUnhealthyTablets(tsc.GetTabletStats(keyspace, shard, tabletType)) + healthyTablets = discovery.RemoveUnhealthyTablets(discovery.TabletHealthReferenceListToValue(healthCheck.GetHealthyTabletStats(&querypb.Target{Shard: shard, Keyspace: keyspace, TabletType: tabletType}))) if len(healthyTablets) >= minHealthyRdonlyTablets { break } @@ -115,8 +116,8 @@ func waitForHealthyTablets(ctx context.Context, wr *wrangler.Wrangler, tsc *disc // - find a tabletType instance in the keyspace / shard // - mark it as worker // - tag it with our worker process -func FindWorkerTablet(ctx context.Context, wr *wrangler.Wrangler, cleaner *wrangler.Cleaner, tsc *discovery.LegacyTabletStatsCache, cell, keyspace, shard string, minHealthyTablets int, tabletType topodatapb.TabletType) (*topodatapb.TabletAlias, error) { - tabletAlias, err := FindHealthyTablet(ctx, wr, tsc, cell, keyspace, shard, minHealthyTablets, tabletType) +func FindWorkerTablet(ctx context.Context, wr *wrangler.Wrangler, cleaner *wrangler.Cleaner, healthCheck *discovery.HealthCheckImpl, cell, keyspace, shard string, minHealthyTablets int, tabletType topodatapb.TabletType) (*topodatapb.TabletAlias, error) { + tabletAlias, err := FindHealthyTablet(ctx, wr, healthCheck, cell, keyspace, shard, minHealthyTablets, tabletType) if err != nil { return nil, err } From 1b445b890c58a7c9406bfaa81c34fe022dc63836 Mon Sep 17 00:00:00 2001 From: Florent Poinsard Date: Tue, 21 Jun 2022 11:47:46 +0200 Subject: [PATCH 04/15] Remove the legacy healthcheck from vtworker Signed-off-by: Florent Poinsard --- go/vt/discovery/healthcheck.go | 19 +++++++++++++++++++ go/vt/discovery/utils.go | 2 +- go/vt/worker/topo_utils.go | 2 +- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/go/vt/discovery/healthcheck.go b/go/vt/discovery/healthcheck.go index d4c7cb54191..4b0a76e6fe6 100644 --- a/go/vt/discovery/healthcheck.go +++ b/go/vt/discovery/healthcheck.go @@ -628,6 +628,25 @@ func (hc *HealthCheckImpl) GetHealthyTabletStats(target *query.Target) []*Tablet return append(result, hc.healthy[KeyFromTarget(target)]...) } +// GetHealthyTabletStats returns only the healthy tablets. +// The returned array is owned by the caller. +// For TabletType_PRIMARY, this will only return at most one entry, +// the most recent tablet of type primary. +// This returns a copy of the data so that callers can access without +// synchronization +func (hc *HealthCheckImpl) GetTabletStats(target *query.Target) []*TabletHealth { + var result []*TabletHealth + hc.mu.Lock() + defer hc.mu.Unlock() + if target.Shard == "" { + target.Shard = "0" + } + for _, health := range hc.healthData[KeyFromTarget(target)] { + result = append(result, health) + } + return result +} + // getTabletStats returns all tablets for the given target. // The returned array is owned by the caller. // For TabletType_PRIMARY, this will only return at most one entry, diff --git a/go/vt/discovery/utils.go b/go/vt/discovery/utils.go index ba5e782ddee..b76c7409e80 100644 --- a/go/vt/discovery/utils.go +++ b/go/vt/discovery/utils.go @@ -45,7 +45,7 @@ func RemoveUnhealthyTablets(tabletStatsList []TabletHealth) []TabletHealth { // source and destination, and the source is not serving (disabled by // TabletControl). When we switch the tablet to 'worker', it will // go back to serving state. - if ts.Stats == nil || ts.Stats.HealthError != "" || ts.LastError != nil || IsReplicationLagHigh(&ts) { + if ts.LastError != nil || ts.Stats != nil && (ts.Stats.HealthError != "" || IsReplicationLagHigh(&ts)) { continue } result = append(result, ts) diff --git a/go/vt/worker/topo_utils.go b/go/vt/worker/topo_utils.go index 86601fb17b9..7ecaf4956ce 100644 --- a/go/vt/worker/topo_utils.go +++ b/go/vt/worker/topo_utils.go @@ -91,7 +91,7 @@ func waitForHealthyTablets(ctx context.Context, wr *wrangler.Wrangler, healthChe default: } - healthyTablets = discovery.RemoveUnhealthyTablets(discovery.TabletHealthReferenceListToValue(healthCheck.GetHealthyTabletStats(&querypb.Target{Shard: shard, Keyspace: keyspace, TabletType: tabletType}))) + healthyTablets = discovery.RemoveUnhealthyTablets(discovery.TabletHealthReferenceListToValue(healthCheck.GetTabletStats(&querypb.Target{Shard: shard, Keyspace: keyspace, TabletType: tabletType}))) if len(healthyTablets) >= minHealthyRdonlyTablets { break } From 80cd1934e383067d910523c39623a66cc3d40c4a Mon Sep 17 00:00:00 2001 From: Florent Poinsard Date: Tue, 21 Jun 2022 13:58:45 +0200 Subject: [PATCH 05/15] Remove the legacy healthcheck from vtworker Signed-off-by: Florent Poinsard --- go/vt/discovery/utils_test.go | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/go/vt/discovery/utils_test.go b/go/vt/discovery/utils_test.go index 97cf14d7f17..27416da44b0 100644 --- a/go/vt/discovery/utils_test.go +++ b/go/vt/discovery/utils_test.go @@ -32,7 +32,7 @@ func TestRemoveUnhealthyTablets(t *testing.T) { }{{ desc: "tablets missing Stats", input: []TabletHealth{replica(1), replica(2)}, - want: []TabletHealth{}, + want: []TabletHealth{replica(1), replica(2)}, }, { desc: "all tablets healthy", input: []TabletHealth{healthy(replica(1)), healthy(replica(2))}, @@ -60,16 +60,18 @@ func TestRemoveUnhealthyTablets(t *testing.T) { }} for _, tc := range testcases { - got := RemoveUnhealthyTablets(tc.input) - if len(got) != len(tc.want) { - t.Errorf("test case '%v' failed: RemoveUnhealthyTablets(%v) = %#v, want: %#v", tc.desc, tc.input, got, tc.want) - } else { - for i := range tc.want { - if !got[i].DeepEqual(&tc.want[i]) { - t.Errorf("test case '%v' failed: RemoveUnhealthyTablets(%v) = %#v, want: %#v", tc.desc, tc.input, got, tc.want) + t.Run(tc.desc, func(t *testing.T) { + got := RemoveUnhealthyTablets(tc.input) + if len(got) != len(tc.want) { + t.Errorf("test case '%v' failed: RemoveUnhealthyTablets(%v) = %#v, want: %#v", tc.desc, tc.input, got, tc.want) + } else { + for i := range tc.want { + if !got[i].DeepEqual(&tc.want[i]) { + t.Errorf("test case '%v' failed: RemoveUnhealthyTablets(%v) = %#v, want: %#v", tc.desc, tc.input, got, tc.want) + } } } - } + }) } } From 96932b0d7118793ce05b473acc4c4347b014012c Mon Sep 17 00:00:00 2001 From: Florent Poinsard Date: Tue, 21 Jun 2022 14:36:06 +0200 Subject: [PATCH 06/15] Remove the legacy healthcheck from the wrangler Signed-off-by: Florent Poinsard --- go/vt/vtctl/vtctl.go | 9 ++++----- go/vt/wrangler/keyspace.go | 30 +++++++++++++----------------- 2 files changed, 17 insertions(+), 22 deletions(-) diff --git a/go/vt/vtctl/vtctl.go b/go/vt/vtctl/vtctl.go index 96ec3a25a94..356b8ac9c4d 100644 --- a/go/vt/vtctl/vtctl.go +++ b/go/vt/vtctl/vtctl.go @@ -1354,8 +1354,7 @@ func commandWaitForDrain(ctx context.Context, wr *wrangler.Wrangler, subFlags *f return err } - return wr.WaitForDrain(ctx, cells, keyspace, shard, servedType, - *retryDelay, *HealthCheckTopologyRefresh, *HealthcheckRetryDelay, *HealthCheckTimeout, *initialWait) + return wr.WaitForDrain(ctx, cells, keyspace, shard, servedType, *retryDelay, *HealthcheckRetryDelay, *HealthCheckTimeout, *initialWait) } func commandSleep(ctx context.Context, wr *wrangler.Wrangler, subFlags *flag.FlagSet, args []string) error { @@ -2403,7 +2402,7 @@ func commandVRWorkflow(ctx context.Context, wr *wrangler.Wrangler, subFlags *fla return err } - //TODO: check if invalid parameters were passed in that do not apply to this action + // TODO: check if invalid parameters were passed in that do not apply to this action originalAction := action action = strings.ToLower(action) // allow users to input action in a case-insensitive manner if workflowType == wrangler.MigrateWorkflow { @@ -2649,7 +2648,7 @@ func commandVRWorkflow(ctx context.Context, wr *wrangler.Wrangler, subFlags *fla func commandCreateLookupVindex(ctx context.Context, wr *wrangler.Wrangler, subFlags *flag.FlagSet, args []string) error { cells := subFlags.String("cells", "", "Source cells to replicate from.") - //TODO: keep --cell around for backward compatibility and remove it in a future version + // TODO: keep --cell around for backward compatibility and remove it in a future version cell := subFlags.String("cell", "", "Cell to replicate from.") tabletTypes := subFlags.String("tablet_types", "", "Source tablet types to replicate from.") continueAfterCopyWithOwner := subFlags.Bool("continue_after_copy_with_owner", false, "Vindex will continue materialization after copy when an owner is provided") @@ -2749,7 +2748,7 @@ func commandVDiff(ctx context.Context, wr *wrangler.Wrangler, subFlags *flag.Fla maxRows := subFlags.Int64("limit", math.MaxInt64, "Max rows to stop comparing after") debugQuery := subFlags.Bool("debug_query", false, "Adds a mysql query to the report that can be used for further debugging") onlyPks := subFlags.Bool("only_pks", false, "When reporting missing rows, only show primary keys in the report.") - format := subFlags.String("format", "", "Format of report") //"json" or "" + format := subFlags.String("format", "", "Format of report") // "json" or "" tables := subFlags.String("tables", "", "Only run vdiff for these tables in the workflow") maxExtraRowsToCompare := subFlags.Int("max_extra_rows_to_compare", 1000, "If there are collation differences between the source and target, you can have rows that are identical but simply returned in a different order from MySQL. We will do a second pass to compare the rows for any actual differences in this case and this flag allows you to control the resources used for this operation.") if err := subFlags.Parse(args); err != nil { diff --git a/go/vt/wrangler/keyspace.go b/go/vt/wrangler/keyspace.go index e8cd498fe8f..216d31c883c 100644 --- a/go/vt/wrangler/keyspace.go +++ b/go/vt/wrangler/keyspace.go @@ -25,6 +25,8 @@ import ( "sync" "time" + querypb "vitess.io/vitess/go/vt/proto/query" + "vitess.io/vitess/go/event" "vitess.io/vitess/go/sqltypes" "vitess.io/vitess/go/vt/binlog/binlogplayer" @@ -917,7 +919,7 @@ func (wr *Wrangler) updateFrozenFlag(ctx context.Context, shards []*topo.ShardIn // the tablet was actually drained. At later times, a QPS rate > 0.0 could still // be observed. func (wr *Wrangler) WaitForDrain(ctx context.Context, cells []string, keyspace, shard string, servedType topodatapb.TabletType, - retryDelay, healthCheckTopologyRefresh, healthcheckRetryDelay, healthCheckTimeout, initialWait time.Duration) error { + retryDelay, healthcheckRetryDelay, healthCheckTimeout, initialWait time.Duration) error { var err error if len(cells) == 0 { // Retrieve list of cells for the shard from the topology. @@ -934,8 +936,7 @@ func (wr *Wrangler) WaitForDrain(ctx context.Context, cells []string, keyspace, wg.Add(1) go func(cell string) { defer wg.Done() - rec.RecordError(wr.waitForDrainInCell(ctx, cell, keyspace, shard, servedType, - retryDelay, healthCheckTopologyRefresh, healthcheckRetryDelay, healthCheckTimeout, initialWait)) + rec.RecordError(wr.waitForDrainInCell(ctx, cell, keyspace, shard, servedType, retryDelay, healthcheckRetryDelay, healthCheckTimeout, initialWait)) }(cell) } wg.Wait() @@ -944,19 +945,14 @@ func (wr *Wrangler) WaitForDrain(ctx context.Context, cells []string, keyspace, } func (wr *Wrangler) waitForDrainInCell(ctx context.Context, cell, keyspace, shard string, servedType topodatapb.TabletType, - retryDelay, healthCheckTopologyRefresh, healthcheckRetryDelay, healthCheckTimeout, initialWait time.Duration) error { + retryDelay, healthcheckRetryDelay, healthCheckTimeout, initialWait time.Duration) error { // Create the healthheck module, with a cache. - hc := discovery.NewLegacyHealthCheck(healthcheckRetryDelay, healthCheckTimeout) + hc := discovery.NewHealthCheck(ctx, healthcheckRetryDelay, healthCheckTimeout, wr.TopoServer(), cell, "") defer hc.Close() - tsc := discovery.NewLegacyTabletStatsCache(hc, wr.TopoServer(), cell) - - // Create a tablet watcher. - watcher := discovery.NewLegacyShardReplicationWatcher(ctx, wr.TopoServer(), hc, cell, keyspace, shard, healthCheckTopologyRefresh, discovery.DefaultTopoReadConcurrency) - defer watcher.Stop() // Wait for at least one tablet. - if err := tsc.WaitForTablets(ctx, keyspace, shard, servedType); err != nil { + if err := hc.WaitForTablets(ctx, keyspace, shard, servedType); err != nil { return fmt.Errorf("%v: error waiting for initial %v tablets for %v/%v: %v", cell, servedType, keyspace, shard, err) } @@ -974,15 +970,15 @@ func (wr *Wrangler) waitForDrainInCell(ctx context.Context, cell, keyspace, shar startTime := time.Now() for { // map key: tablet uid - drainedHealthyTablets := make(map[uint32]*discovery.LegacyTabletStats) - notDrainedHealtyTablets := make(map[uint32]*discovery.LegacyTabletStats) + drainedHealthyTablets := make(map[uint32]*discovery.TabletHealth) + notDrainedHealtyTablets := make(map[uint32]*discovery.TabletHealth) - healthyTablets := tsc.GetHealthyTabletStats(keyspace, shard, servedType) + healthyTablets := hc.GetTabletStats(&querypb.Target{Keyspace: keyspace, Shard: shard, TabletType: servedType}) for _, ts := range healthyTablets { if ts.Stats.Qps == 0.0 { - drainedHealthyTablets[ts.Tablet.Alias.Uid] = &ts + drainedHealthyTablets[ts.Tablet.Alias.Uid] = ts } else { - notDrainedHealtyTablets[ts.Tablet.Alias.Uid] = &ts + notDrainedHealtyTablets[ts.Tablet.Alias.Uid] = ts } } @@ -1018,7 +1014,7 @@ func (wr *Wrangler) waitForDrainInCell(ctx context.Context, cell, keyspace, shar return nil } -func formatTabletStats(ts *discovery.LegacyTabletStats) string { +func formatTabletStats(ts *discovery.TabletHealth) string { webURL := "unknown http port" if webPort, ok := ts.Tablet.PortMap["vt"]; ok { webURL = fmt.Sprintf("http://%v:%d/", ts.Tablet.Hostname, webPort) From 97400e0f1d1f733e998b0267c402e896a064c6f9 Mon Sep 17 00:00:00 2001 From: Florent Poinsard Date: Tue, 21 Jun 2022 14:53:36 +0200 Subject: [PATCH 07/15] Remove the legacy healthcheck and al. files Signed-off-by: Florent Poinsard --- go/vt/discovery/fake_legacy_healthcheck.go | 238 ----- go/vt/discovery/healthcheck.go | 3 + go/vt/discovery/healthcheck_test.go | 23 +- go/vt/discovery/legacy_healthcheck.go | 962 ------------------ .../legacy_healthcheck_flaky_test.go | 615 ----------- go/vt/discovery/legacy_replicationlag.go | 201 ---- go/vt/discovery/legacy_replicationlag_test.go | 370 ------- go/vt/discovery/legacy_tablet_stats_cache.go | 313 ------ .../legacy_tablet_stats_cache_test.go | 280 ----- .../legacy_tablet_stats_cache_wait.go | 120 --- .../legacy_tablet_stats_cache_wait_test.go | 76 -- go/vt/discovery/legacy_topology_watcher.go | 457 --------- .../discovery/legacy_topology_watcher_test.go | 490 --------- go/vt/discovery/replicationlag_test.go | 5 + 14 files changed, 24 insertions(+), 4129 deletions(-) delete mode 100644 go/vt/discovery/fake_legacy_healthcheck.go delete mode 100644 go/vt/discovery/legacy_healthcheck.go delete mode 100644 go/vt/discovery/legacy_healthcheck_flaky_test.go delete mode 100644 go/vt/discovery/legacy_replicationlag.go delete mode 100644 go/vt/discovery/legacy_replicationlag_test.go delete mode 100644 go/vt/discovery/legacy_tablet_stats_cache.go delete mode 100644 go/vt/discovery/legacy_tablet_stats_cache_test.go delete mode 100644 go/vt/discovery/legacy_tablet_stats_cache_wait.go delete mode 100644 go/vt/discovery/legacy_tablet_stats_cache_wait_test.go delete mode 100644 go/vt/discovery/legacy_topology_watcher.go delete mode 100644 go/vt/discovery/legacy_topology_watcher_test.go diff --git a/go/vt/discovery/fake_legacy_healthcheck.go b/go/vt/discovery/fake_legacy_healthcheck.go deleted file mode 100644 index b6b854b5aba..00000000000 --- a/go/vt/discovery/fake_legacy_healthcheck.go +++ /dev/null @@ -1,238 +0,0 @@ -/* -Copyright 2019 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package discovery - -import ( - "sort" - "sync" - "time" - - "vitess.io/vitess/go/vt/logutil" - - "vitess.io/vitess/go/vt/topo" - "vitess.io/vitess/go/vt/topo/topoproto" - "vitess.io/vitess/go/vt/vttablet/queryservice" - "vitess.io/vitess/go/vt/vttablet/sandboxconn" - - querypb "vitess.io/vitess/go/vt/proto/query" - topodatapb "vitess.io/vitess/go/vt/proto/topodata" -) - -// This file contains the definitions for a FakeLegacyHealthCheck class to -// simulate a LegacyHealthCheck module. Note it is not in a sub-package because -// otherwise it couldn't be used in this package's tests because of -// circular dependencies. - -// NewFakeLegacyHealthCheck returns the fake healthcheck object. -func NewFakeLegacyHealthCheck() *FakeLegacyHealthCheck { - return &FakeLegacyHealthCheck{ - items: make(map[string]*flhcItem), - } -} - -// FakeLegacyHealthCheck implements discovery.LegacyHealthCheck. -type FakeLegacyHealthCheck struct { - listener LegacyHealthCheckStatsListener - - // mu protects the items map - mu sync.RWMutex - items map[string]*flhcItem -} - -type flhcItem struct { - ts *LegacyTabletStats - conn queryservice.QueryService -} - -// -// discovery.LegacyHealthCheck interface methods -// - -// RegisterStats is not implemented. -func (fhc *FakeLegacyHealthCheck) RegisterStats() { -} - -// SetListener is not implemented. -func (fhc *FakeLegacyHealthCheck) SetListener(listener LegacyHealthCheckStatsListener, sendDownEvents bool) { - fhc.listener = listener -} - -// WaitForInitialStatsUpdates is not implemented. -func (fhc *FakeLegacyHealthCheck) WaitForInitialStatsUpdates() { -} - -// AddTablet adds the tablet and calls the listener. -func (fhc *FakeLegacyHealthCheck) AddTablet(tablet *topodatapb.Tablet, name string) { - key := TabletToMapKey(tablet) - item := &flhcItem{ - ts: &LegacyTabletStats{ - Key: key, - Tablet: tablet, - Target: &querypb.Target{ - Keyspace: tablet.Keyspace, - Shard: tablet.Shard, - TabletType: tablet.Type, - }, - Serving: true, - Up: true, - Name: name, - Stats: &querypb.RealtimeStats{}, - }, - } - - fhc.mu.Lock() - defer fhc.mu.Unlock() - fhc.items[key] = item - - if fhc.listener != nil { - fhc.listener.StatsUpdate(item.ts) - } -} - -// RemoveTablet removes the tablet. -func (fhc *FakeLegacyHealthCheck) RemoveTablet(tablet *topodatapb.Tablet) { - fhc.mu.Lock() - defer fhc.mu.Unlock() - key := TabletToMapKey(tablet) - item, ok := fhc.items[key] - if !ok { - return - } - // Make sure the key still corresponds to the tablet we want to delete. - // If it doesn't match, we should do nothing. The tablet we were asked to - // delete is already gone, and some other tablet is using the key - // (host:port) that the original tablet used to use, which is fine. - if !topoproto.TabletAliasEqual(tablet.Alias, item.ts.Tablet.Alias) { - return - } - delete(fhc.items, key) -} - -// ReplaceTablet removes the old tablet and adds the new. -func (fhc *FakeLegacyHealthCheck) ReplaceTablet(old, new *topodatapb.Tablet, name string) { - fhc.RemoveTablet(old) - fhc.AddTablet(new, name) -} - -// GetConnection returns the TabletConn of the given tablet. -func (fhc *FakeLegacyHealthCheck) GetConnection(key string) queryservice.QueryService { - fhc.mu.RLock() - defer fhc.mu.RUnlock() - if item := fhc.items[key]; item != nil { - return item.conn - } - return nil -} - -// CacheStatus returns the status for each tablet -func (fhc *FakeLegacyHealthCheck) CacheStatus() LegacyTabletsCacheStatusList { - fhc.mu.Lock() - defer fhc.mu.Unlock() - - stats := make(LegacyTabletsCacheStatusList, 0, len(fhc.items)) - for _, item := range fhc.items { - stats = append(stats, &LegacyTabletsCacheStatus{ - Cell: "FakeCell", - Target: item.ts.Target, - TabletsStats: LegacyTabletStatsList{item.ts}, - }) - } - sort.Sort(stats) - return stats -} - -// Close is not implemented. -func (fhc *FakeLegacyHealthCheck) Close() error { - return nil -} - -// -// Management methods -// - -// Reset cleans up the internal state. -func (fhc *FakeLegacyHealthCheck) Reset() { - fhc.mu.Lock() - defer fhc.mu.Unlock() - - fhc.items = make(map[string]*flhcItem) -} - -// AddFakeTablet inserts a fake entry into FakeLegacyHealthCheck. -// The Tablet can be talked to using the provided connection. -// The Listener is called, as if AddTablet had been called. -// For flexibility the connection is created via a connFactory callback -func (fhc *FakeLegacyHealthCheck) AddFakeTablet(cell, host string, port int32, keyspace, shard string, tabletType topodatapb.TabletType, serving bool, reparentTS int64, err error, connFactory func(*topodatapb.Tablet) queryservice.QueryService) queryservice.QueryService { - t := topo.NewTablet(0, cell, host) - t.Keyspace = keyspace - t.Shard = shard - t.Type = tabletType - t.PortMap["vt"] = port - // reparentTS only has precision to seconds - t.PrimaryTermStartTime = logutil.TimeToProto(time.Unix(reparentTS, 0)) - key := TabletToMapKey(t) - - fhc.mu.Lock() - defer fhc.mu.Unlock() - item := fhc.items[key] - if item == nil { - item = &flhcItem{ - ts: &LegacyTabletStats{ - Key: key, - Tablet: t, - Up: true, - }, - } - fhc.items[key] = item - } - item.ts.Target = &querypb.Target{ - Keyspace: keyspace, - Shard: shard, - TabletType: tabletType, - } - item.ts.Serving = serving - item.ts.TabletExternallyReparentedTimestamp = reparentTS - item.ts.Stats = &querypb.RealtimeStats{} - item.ts.LastError = err - conn := connFactory(t) - item.conn = conn - - if fhc.listener != nil { - fhc.listener.StatsUpdate(item.ts) - } - return conn -} - -// AddTestTablet adds a fake tablet for tests using the SandboxConn and returns -// the fake connection -func (fhc *FakeLegacyHealthCheck) AddTestTablet(cell, host string, port int32, keyspace, shard string, tabletType topodatapb.TabletType, serving bool, reparentTS int64, err error) *sandboxconn.SandboxConn { - conn := fhc.AddFakeTablet(cell, host, port, keyspace, shard, tabletType, serving, reparentTS, err, func(tablet *topodatapb.Tablet) queryservice.QueryService { - return sandboxconn.NewSandboxConn(tablet) - }) - return conn.(*sandboxconn.SandboxConn) -} - -// GetAllTablets returns all the tablets we have. -func (fhc *FakeLegacyHealthCheck) GetAllTablets() map[string]*topodatapb.Tablet { - res := make(map[string]*topodatapb.Tablet) - fhc.mu.RLock() - defer fhc.mu.RUnlock() - for key, t := range fhc.items { - res[key] = t.ts.Tablet - } - return res -} diff --git a/go/vt/discovery/healthcheck.go b/go/vt/discovery/healthcheck.go index 4b0a76e6fe6..2667f2b0128 100644 --- a/go/vt/discovery/healthcheck.go +++ b/go/vt/discovery/healthcheck.go @@ -82,6 +82,9 @@ var ( refreshKnownTablets = flag.Bool("tablet_refresh_known_tablets", true, "tablet refresh reloads the tablet address/port map from topo in case it changes") // topoReadConcurrency tells us how many topo reads are allowed in parallel topoReadConcurrency = flag.Int("topo_read_concurrency", 32, "concurrent topo reads") + + // How much to sleep between each check. + waitAvailableTabletInterval = 100 * time.Millisecond ) // See the documentation for NewHealthCheck below for an explanation of these parameters. diff --git a/go/vt/discovery/healthcheck_test.go b/go/vt/discovery/healthcheck_test.go index d2f6d25eaf9..8606eb85bd7 100644 --- a/go/vt/discovery/healthcheck_test.go +++ b/go/vt/discovery/healthcheck_test.go @@ -51,13 +51,22 @@ import ( topodatapb "vitess.io/vitess/go/vt/proto/topodata" ) -var connMap map[string]*fakeConn -var connMapMu sync.Mutex +var ( + connMap map[string]*fakeConn + connMapMu sync.Mutex +) + +func testChecksum(t *testing.T, want, got int64) { + t.Helper() + if want != got { + t.Errorf("want checksum %v, got %v", want, got) + } +} func init() { tabletconn.RegisterDialer("fake_gateway", tabletDialer) - //log error + // log error if err := flag.Set("tablet_protocol", "fake_gateway"); err != nil { log.Errorf("failed to set flag \"tablet_protocol\" to \"fake_gateway\":%v", err) } @@ -196,7 +205,7 @@ func TestHealthCheck(t *testing.T) { } input <- shr result = <-resultChan - //TODO: figure out how to compare objects that contain errors using utils.MustMatch + // TODO: figure out how to compare objects that contain errors using utils.MustMatch assert.True(t, want.DeepEqual(result), "Wrong TabletHealth data\n Expected: %v\n Actual: %v", want, result) testChecksum(t, 1027934207, hc.stateChecksum()) // unchanged @@ -257,7 +266,7 @@ func TestHealthCheckStreamError(t *testing.T) { LastError: fmt.Errorf("some stream error"), } result = <-resultChan - //TODO: figure out how to compare objects that contain errors using utils.MustMatch + // TODO: figure out how to compare objects that contain errors using utils.MustMatch assert.True(t, want.DeepEqual(result), "Wrong TabletHealth data\n Expected: %v\n Actual: %v", want, result) // tablet should be removed from healthy list a := hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}) @@ -317,7 +326,7 @@ func TestHealthCheckErrorOnPrimary(t *testing.T) { LastError: fmt.Errorf("some stream error"), } result = <-resultChan - //TODO: figure out how to compare objects that contain errors using utils.MustMatch + // TODO: figure out how to compare objects that contain errors using utils.MustMatch assert.True(t, want.DeepEqual(result), "Wrong TabletHealth data\n Expected: %v\n Actual: %v", want, result) // tablet should be removed from healthy list a := hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}) @@ -1158,7 +1167,7 @@ func TestTemplate(t *testing.T) { } func TestDebugURLFormatting(t *testing.T) { - //log error + // log error if err2 := flag.Set("tablet_url_template", "https://{{.GetHostNameLevel 0}}.bastion.{{.Tablet.Alias.Cell}}.corp"); err2 != nil { log.Errorf("flag.Set(\"tablet_url_template\", \"https://{{.GetHostNameLevel 0}}.bastion.{{.Tablet.Alias.Cell}}.corp\") failed : %v", err2) } diff --git a/go/vt/discovery/legacy_healthcheck.go b/go/vt/discovery/legacy_healthcheck.go deleted file mode 100644 index a1025ace145..00000000000 --- a/go/vt/discovery/legacy_healthcheck.go +++ /dev/null @@ -1,962 +0,0 @@ -/* -Copyright 2019 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// Package discovery provides a way to discover all tablets e.g. within a -// specific shard and monitor their current health. -// Deprecated -// Use the LegacyHealthCheck object to query for tablets and their health. -// -// For an example how to use the LegacyHealthCheck object, see worker/topo_utils.go. -// -// Tablets have to be manually added to the LegacyHealthCheck using AddTablet(). -// Alternatively, use a Watcher implementation which will constantly watch -// a source (e.g. the topology) and add and remove tablets as they are -// added or removed from the source. -// For a Watcher example have a look at NewLegacyShardReplicationWatcher(). -// -// Each LegacyHealthCheck has a LegacyHealthCheckStatsListener that will receive -// notification of when tablets go up and down. -// LegacyTabletStatsCache is one implementation, that caches the known tablets -// and the healthy ones per keyspace/shard/tabletType. -// -// Internally, the LegacyHealthCheck module is connected to each tablet and has a -// streaming RPC (StreamHealth) open to receive periodic health infos. -package discovery - -import ( - "bytes" - "context" - "encoding/json" - "fmt" - "hash/crc32" - "html/template" - "net/http" - "sort" - "strings" - "sync" - "time" - - "google.golang.org/protobuf/proto" - - "vitess.io/vitess/go/netutil" - "vitess.io/vitess/go/stats" - "vitess.io/vitess/go/sync2" - "vitess.io/vitess/go/vt/grpcclient" - "vitess.io/vitess/go/vt/log" - "vitess.io/vitess/go/vt/servenv" - "vitess.io/vitess/go/vt/topo/topoproto" - "vitess.io/vitess/go/vt/topotools" - "vitess.io/vitess/go/vt/vttablet/queryservice" - "vitess.io/vitess/go/vt/vttablet/tabletconn" - - querypb "vitess.io/vitess/go/vt/proto/query" - topodatapb "vitess.io/vitess/go/vt/proto/topodata" -) - -const ( - // LegacyHealthCheckTemplate is the HTML code to display a TabletsCacheStatusList - LegacyHealthCheckTemplate = ` - - - - - - - - - - - - - {{range $i, $ts := .}} - - - - - - - - {{end}} -
HealthCheck Tablet Cache
CellKeyspaceShardTabletTypetabletStats
{{github_com_vitessio_vitess_vtctld_srv_cell $ts.Cell}}{{github_com_vitessio_vitess_vtctld_srv_keyspace $ts.Cell $ts.Target.Keyspace}}{{$ts.Target.Shard}}{{$ts.Target.TabletType}}{{$ts.StatusAsHTML}}
-` -) - -func init() { - // Flags are not parsed at this point and the default value of the flag (just the hostname) will be used. - ParseTabletURLTemplateFromFlag() -} - -// LegacyHealthCheckStatsListener is the listener to receive health check stats update. -type LegacyHealthCheckStatsListener interface { - // StatsUpdate is called when: - // - a new tablet is known to the LegacyHealthCheck, and its first - // streaming healthcheck is returned. (then ts.Up is true). - // - a tablet is removed from the list of tablets we watch - // (then ts.Up is false). - // - a tablet dynamically changes its type. When registering the - // listener, if sendDownEvents is true, two events are generated - // (ts.Up false on the old type, ts.Up true on the new type). - // If it is false, only one event is sent (ts.Up true on the new - // type). - StatsUpdate(*LegacyTabletStats) -} - -// LegacyTabletStats is returned when getting the set of tablets. -type LegacyTabletStats struct { - // Key uniquely identifies that serving tablet. It is computed - // from the Tablet's record Hostname and PortMap. If a tablet - // is restarted on different ports, its Key will be different. - // Key is computed using the TabletToMapKey method below. - // key can be used in GetConnection(). - Key string - // Tablet is the tablet object that was sent to LegacyHealthCheck.AddTablet. - Tablet *topodatapb.Tablet - // Name is an optional tag (e.g. alternative address) for the - // tablet. It is supposed to represent the tablet as a task, - // not as a process. For instance, it can be a - // cell+keyspace+shard+tabletType+taskIndex value. - Name string - // Target is the current target as returned by the streaming - // StreamHealth RPC. - Target *querypb.Target - // Up describes whether the tablet is added or removed. - Up bool - // Serving describes if the tablet can be serving traffic. - Serving bool - // TabletExternallyReparentedTimestamp is the last timestamp - // that this tablet was either elected the primary, or received - // a TabletExternallyReparented event. It is set to 0 if the - // tablet doesn't think it's a primary. - TabletExternallyReparentedTimestamp int64 - // Stats is the current health status, as received by the - // StreamHealth RPC (replication lag, ...). - Stats *querypb.RealtimeStats - // LastError is the error we last saw when trying to get the - // tablet's healthcheck. - LastError error -} - -// String is defined because we want to print a []*LegacyTabletStats array nicely. -func (e *LegacyTabletStats) String() string { - return fmt.Sprint(*e) -} - -// DeepEqual compares two LegacyTabletStats. Since we include protos, we -// need to use proto.Equal on these. -func (e *LegacyTabletStats) DeepEqual(f *LegacyTabletStats) bool { - return e.Key == f.Key && - proto.Equal(e.Tablet, f.Tablet) && - e.Name == f.Name && - proto.Equal(e.Target, f.Target) && - e.Up == f.Up && - e.Serving == f.Serving && - e.TabletExternallyReparentedTimestamp == f.TabletExternallyReparentedTimestamp && - proto.Equal(e.Stats, f.Stats) && - ((e.LastError == nil && f.LastError == nil) || - (e.LastError != nil && f.LastError != nil && e.LastError.Error() == f.LastError.Error())) -} - -// Copy produces a copy of LegacyTabletStats. -func (e *LegacyTabletStats) Copy() *LegacyTabletStats { - ts := *e - return &ts -} - -// GetTabletHostPort formats a tablet host port address. -func (e LegacyTabletStats) GetTabletHostPort() string { - vtPort := e.Tablet.PortMap["vt"] - return netutil.JoinHostPort(e.Tablet.Hostname, vtPort) -} - -// GetHostNameLevel returns the specified hostname level. If the level does not exist it will pick the closest level. -// This seems unused but can be utilized by certain url formatting templates. See getTabletDebugURL for more details. -func (e LegacyTabletStats) GetHostNameLevel(level int) string { - chunkedHostname := strings.Split(e.Tablet.Hostname, ".") - - if level < 0 { - return chunkedHostname[0] - } else if level >= len(chunkedHostname) { - return chunkedHostname[len(chunkedHostname)-1] - } else { - return chunkedHostname[level] - } -} - -// NamedStatusURL returns the URL for the case where a tablet server is named. -func (e LegacyTabletStats) NamedStatusURL() string { - return "/" + topoproto.TabletAliasString(e.Tablet.Alias) + servenv.StatusURLPath() -} - -// getTabletDebugURL formats a debug url to the tablet. -// It uses a format string that can be passed into the app to format -// the debug URL to accommodate different network setups. It applies -// the html/template string defined to a LegacyTabletStats object. The -// format string can refer to members and functions of LegacyTabletStats -// like a regular html/template string. -// -// For instance given a tablet with hostname:port of host.dc.domain:22 -// could be configured as follows: -// http://{{.GetTabletHostPort}} -> http://host.dc.domain:22 -// https://{{.Tablet.Hostname}} -> https://host.dc.domain -// https://{{.GetHostNameLevel 0}}.bastion.corp -> https://host.bastion.corp -// {{.NamedStatusURL}} -> test-0000000001/debug/status -func (e LegacyTabletStats) getTabletDebugURL() string { - var buffer bytes.Buffer - - // Error logged - if err := tabletURLTemplate.Execute(&buffer, e); err != nil { - log.Errorf("tabletURLTemplate.Execute(&buffer, e) failed: %v", err) - } - return buffer.String() -} - -// TrivialStatsUpdate returns true iff the old and new LegacyTabletStats -// haven't changed enough to warrant re-calling FilterLegacyStatsByReplicationLag. -func (e *LegacyTabletStats) TrivialStatsUpdate(n *LegacyTabletStats) bool { - // Skip replag filter when replag remains in the low rep lag range, - // which should be the case majority of the time. - lowRepLag := lowReplicationLag.Seconds() - oldRepLag := float64(e.Stats.ReplicationLagSeconds) - newRepLag := float64(n.Stats.ReplicationLagSeconds) - if oldRepLag <= lowRepLag && newRepLag <= lowRepLag { - return true - } - - // Skip replag filter when replag remains in the high rep lag range, - // and did not change beyond +/- 10%. - // when there is a high rep lag, it takes a long time for it to reduce, - // so it is not necessary to re-calculate every time. - // In that case, we won't save the new record, so we still - // remember the original replication lag. - if oldRepLag > lowRepLag && newRepLag > lowRepLag && newRepLag < oldRepLag*1.1 && newRepLag > oldRepLag*0.9 { - return true - } - - return false -} - -// LegacyTabletRecorder is the part of the LegacyHealthCheck interface that can -// add or remove tablets. We define it as a sub-interface here so we -// can add filters on tablets if needed. -type LegacyTabletRecorder interface { - // AddTablet adds the tablet. - // Name is an alternate name, like an address. - AddTablet(tablet *topodatapb.Tablet, name string) - - // RemoveTablet removes the tablet. - RemoveTablet(tablet *topodatapb.Tablet) - - // ReplaceTablet does an AddTablet and RemoveTablet in one call, effectively replacing the old tablet with the new. - ReplaceTablet(old, new *topodatapb.Tablet, name string) -} - -// LegacyHealthCheck defines the interface of health checking module. -// The goal of this object is to maintain a StreamHealth RPC -// to a lot of tablets. Tablets are added / removed by calling the -// AddTablet / RemoveTablet methods (other discovery module objects -// can for instance watch the topology and call these). -// -// Updates to the health of all registered tablet can be watched by -// registering a listener. To get the underlying "TabletConn" object -// which is used for each tablet, use the "GetConnection()" method -// below and pass in the Key string which is also sent to the -// listener in each update (as it is part of LegacyTabletStats). -type LegacyHealthCheck interface { - // LegacyTabletRecorder interface adds AddTablet and RemoveTablet methods. - // AddTablet adds the tablet, and starts health check on it. - // RemoveTablet removes the tablet, and stops its StreamHealth RPC. - LegacyTabletRecorder - - // RegisterStats registers the connection counts and checksum stats. - // It can only be called on one Healthcheck object per process. - RegisterStats() - // SetListener sets the listener for healthcheck - // updates. sendDownEvents is used when a tablet changes type - // (from replica to primary for instance). If the listener - // wants two events (Up=false on old type, Up=True on new - // type), sendDownEvents should be set. Otherwise, the - // healthcheck will only send one event (Up=true on new type). - // - // Note that the default implementation requires to set the - // listener before any tablets are added to the healthcheck. - SetListener(listener LegacyHealthCheckStatsListener, sendDownEvents bool) - // WaitForInitialStatsUpdates waits until all tablets added via - // AddTablet() call were propagated to the listener via correspondingdiscovert - // StatsUpdate() calls. Note that code path from AddTablet() to - // corresponding StatsUpdate() is asynchronous but not cancelable, thus - // this function is also non-cancelable and can't return error. Also - // note that all AddTablet() calls should happen before calling this - // method. WaitForInitialStatsUpdates won't wait for StatsUpdate() calls - // corresponding to AddTablet() calls made during its execution. - WaitForInitialStatsUpdates() - // GetConnection returns the TabletConn of the given tablet. - GetConnection(key string) queryservice.QueryService - // CacheStatus returns a displayable version of the cache. - CacheStatus() LegacyTabletsCacheStatusList - // Close stops the healthcheck. - Close() error -} - -// LegacyHealthCheckImpl performs health checking and notifies downstream components about any changes. -// It contains a map of legacyTabletHealth objects, each of which stores the health information for -// a tablet. A checkConn goroutine is spawned for each legacyTabletHealth, which is responsible for -// keeping that legacyTabletHealth up-to-date. This is done through callbacks to updateHealth. -// If checkConn terminates for any reason, it updates legacyTabletHealth.Up as false. If a legacyTabletHealth -// gets removed from the map, its cancelFunc gets called, which ensures that the associated -// checkConn goroutine eventually terminates. -type LegacyHealthCheckImpl struct { - // Immutable fields set at construction time. - listener LegacyHealthCheckStatsListener - sendDownEvents bool - retryDelay time.Duration - healthCheckTimeout time.Duration - // connsWG keeps track of all launched Go routines that monitor tablet connections. - connsWG sync.WaitGroup - - // mu protects all the following fields. - mu sync.Mutex - - // addrToHealth maps from address to legacyTabletHealth. - addrToHealth map[string]*legacyTabletHealth - - // Wait group that's used to wait until all initial StatsUpdate() calls are made after the AddTablet() calls. - initialUpdatesWG sync.WaitGroup -} - -// legacyHealthCheckConn is a structure that lives within the scope of -// the checkConn goroutine to maintain its internal state. Therefore, -// it does not require synchronization. Changes that are relevant to -// healthcheck are transmitted through calls to LegacyHealthCheckImpl.updateHealth. -// TODO(sougou): move this and associated functions to a separate file. -type legacyHealthCheckConn struct { - ctx context.Context - - conn queryservice.QueryService - tabletStats LegacyTabletStats - loggedServingState bool - lastResponseTimestamp time.Time // timestamp of the last healthcheck response -} - -// legacyTabletHealth maintains the health status of a tablet. A map of this -// structure is maintained in LegacyHealthCheckImpl. -type legacyTabletHealth struct { - // cancelFunc must be called before discarding legacyTabletHealth. - // This will ensure that the associated checkConn goroutine will terminate. - cancelFunc context.CancelFunc - // conn is the connection associated with the tablet. - conn queryservice.QueryService - // latestTabletStats stores the latest health stats of the tablet. - latestTabletStats LegacyTabletStats -} - -// NewLegacyDefaultHealthCheck creates a new LegacyHealthCheck object with a default configuration. -func NewLegacyDefaultHealthCheck() LegacyHealthCheck { - return NewLegacyHealthCheck(DefaultHealthCheckRetryDelay, DefaultHealthCheckTimeout) -} - -// NewLegacyHealthCheck creates a new LegacyHealthCheck object. -// Parameters: -// retryDelay. -// The duration to wait before retrying to connect (e.g. after a failed connection -// attempt). -// healthCheckTimeout. -// The duration for which we consider a health check response to be 'fresh'. If we don't get -// a health check response from a tablet for more than this duration, we consider the tablet -// not healthy. -func NewLegacyHealthCheck(retryDelay, healthCheckTimeout time.Duration) LegacyHealthCheck { - hc := &LegacyHealthCheckImpl{ - addrToHealth: make(map[string]*legacyTabletHealth), - retryDelay: retryDelay, - healthCheckTimeout: healthCheckTimeout, - } - - healthcheckOnce.Do(func() { - http.Handle("/debug/gateway", hc) - }) - - return hc -} - -// RegisterStats registers the connection counts stats -func (hc *LegacyHealthCheckImpl) RegisterStats() { - stats.NewGaugesFuncWithMultiLabels( - "HealthcheckConnections", - "the number of healthcheck connections registered", - []string{"Keyspace", "ShardName", "TabletType"}, - hc.servingConnStats) - - stats.NewGaugeFunc( - "HealthcheckChecksum", - "crc32 checksum of the current healthcheck state", - hc.stateChecksum) -} - -// ServeHTTP is part of the http.Handler interface. It renders the current state of the discovery gateway tablet cache into json. -func (hc *LegacyHealthCheckImpl) ServeHTTP(w http.ResponseWriter, _ *http.Request) { - w.Header().Set("Content-Type", "application/json; charset=utf-8") - status := hc.cacheStatusMap() - b, err := json.MarshalIndent(status, "", " ") - if err != nil { - w.Write([]byte(err.Error())) - return - } - - buf := bytes.NewBuffer(nil) - json.HTMLEscape(buf, b) - w.Write(buf.Bytes()) -} - -// servingConnStats returns the number of serving tablets per keyspace/shard/tablet type. -func (hc *LegacyHealthCheckImpl) servingConnStats() map[string]int64 { - res := make(map[string]int64) - hc.mu.Lock() - defer hc.mu.Unlock() - for _, th := range hc.addrToHealth { - if !th.latestTabletStats.Up || !th.latestTabletStats.Serving || th.latestTabletStats.LastError != nil { - continue - } - key := fmt.Sprintf("%s.%s.%s", th.latestTabletStats.Target.Keyspace, th.latestTabletStats.Target.Shard, topoproto.TabletTypeLString(th.latestTabletStats.Target.TabletType)) - res[key]++ - } - return res -} - -// stateChecksum returns a crc32 checksum of the healthcheck state -func (hc *LegacyHealthCheckImpl) stateChecksum() int64 { - // CacheStatus is sorted so this should be stable across vtgates - cacheStatus := hc.CacheStatus() - var buf bytes.Buffer - for _, st := range cacheStatus { - fmt.Fprintf(&buf, - "%v%v%v%v\n", - st.Cell, - st.Target.Keyspace, - st.Target.Shard, - st.Target.TabletType.String(), - ) - sort.Sort(st.TabletsStats) - for _, ts := range st.TabletsStats { - fmt.Fprintf(&buf, "%v%v%v\n", ts.Up, ts.Serving, ts.TabletExternallyReparentedTimestamp) - } - } - - return int64(crc32.ChecksumIEEE(buf.Bytes())) -} - -// updateHealth updates the legacyTabletHealth record and transmits the tablet stats -// to the listener. -func (hc *LegacyHealthCheckImpl) updateHealth(ts *LegacyTabletStats, conn queryservice.QueryService) { - // Unconditionally send the received update at the end. - defer func() { - if hc.listener != nil { - hc.listener.StatsUpdate(ts) - } - }() - - hc.mu.Lock() - th, ok := hc.addrToHealth[ts.Key] - if !ok { - // This can happen on delete because the entry is removed first, - // or if LegacyHealthCheckImpl has been closed. - hc.mu.Unlock() - return - } - oldts := th.latestTabletStats - th.latestTabletStats = *ts - th.conn = conn - hc.mu.Unlock() - - // In the case where a tablet changes type (but not for the - // initial message), we want to log it, and maybe advertise it too. - if oldts.Target.TabletType != topodatapb.TabletType_UNKNOWN && oldts.Target.TabletType != ts.Target.TabletType { - // Log and maybe notify - log.Infof("HealthCheckUpdate(Type Change): %v, tablet: %s, target %+v => %+v, reparent time: %v", - oldts.Name, topotools.TabletIdent(oldts.Tablet), topotools.TargetIdent(oldts.Target), topotools.TargetIdent(ts.Target), ts.TabletExternallyReparentedTimestamp) - if hc.listener != nil && hc.sendDownEvents { - oldts.Up = false - hc.listener.StatsUpdate(&oldts) - } - - // Track how often a tablet gets promoted to primary. It is used for - // comparing against the variables in go/vtgate/buffer/variables.go. - if oldts.Target.TabletType != topodatapb.TabletType_PRIMARY && ts.Target.TabletType == topodatapb.TabletType_PRIMARY { - hcPrimaryPromotedCounters.Add([]string{ts.Target.Keyspace, ts.Target.Shard}, 1) - } - } -} - -// finalizeConn closes the health checking connection and sends the final -// notification about the tablet to downstream. To be called only on exit from -// checkConn(). -func (hc *LegacyHealthCheckImpl) finalizeConn(hcc *legacyHealthCheckConn) { - hcc.tabletStats.Up = false - hcc.setServingState(false, "finalizeConn closing connection") - // Note: checkConn() exits only when hcc.ctx.Done() is closed. Thus it's - // safe to simply get Err() value here and assign to LastError. - hcc.tabletStats.LastError = hcc.ctx.Err() - hc.updateHealth(hcc.tabletStats.Copy(), nil) - if hcc.conn != nil { - // Don't use hcc.ctx because it's already closed. - // Use a separate context, and add a timeout to prevent unbounded waits. - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - hcc.conn.Close(ctx) - hcc.conn = nil - } -} - -// checkConn performs health checking on the given tablet. -func (hc *LegacyHealthCheckImpl) checkConn(hcc *legacyHealthCheckConn, name string) { - defer hc.connsWG.Done() - defer hc.finalizeConn(hcc) - - // Initial notification for downstream about the tablet existence. - hc.updateHealth(hcc.tabletStats.Copy(), hcc.conn) - hc.initialUpdatesWG.Done() - - retryDelay := hc.retryDelay - for { - streamCtx, streamCancel := context.WithCancel(hcc.ctx) - - // Setup a watcher that restarts the timer every time an update is received. - // If a timeout occurs for a serving tablet, we make it non-serving and send - // a status update. The stream is also terminated so it can be retried. - // servingStatus feeds into the serving var, which keeps track of the serving - // status transmitted by the tablet. - servingStatus := make(chan bool, 1) - // timedout is accessed atomically because there could be a race - // between the goroutine that sets it and the check for its value - // later. - timedout := sync2.NewAtomicBool(false) - go func() { - for { - select { - case <-servingStatus: - continue - case <-time.After(hc.healthCheckTimeout): - timedout.Set(true) - streamCancel() - return - case <-streamCtx.Done(): - // If the stream is done, stop watching. - return - } - } - }() - - // Read stream health responses. - hcc.stream(streamCtx, hc, func(shr *querypb.StreamHealthResponse) error { - // We received a message. Reset the back-off. - retryDelay = hc.retryDelay - // Don't block on send to avoid deadlocks. - select { - case servingStatus <- shr.Serving: - default: - } - return hcc.processResponse(hc, shr) - }) - - // streamCancel to make sure the watcher goroutine terminates. - streamCancel() - - // If there was a timeout send an error. We do this after stream has returned. - // This will ensure that this update prevails over any previous message that - // stream could have sent. - if timedout.Get() { - hcc.tabletStats.LastError = fmt.Errorf("healthcheck timed out (latest %v)", hcc.lastResponseTimestamp) - hcc.setServingState(false, hcc.tabletStats.LastError.Error()) - hc.updateHealth(hcc.tabletStats.Copy(), hcc.conn) - hcErrorCounters.Add([]string{hcc.tabletStats.Target.Keyspace, hcc.tabletStats.Target.Shard, topoproto.TabletTypeLString(hcc.tabletStats.Target.TabletType)}, 1) - } - - // Streaming RPC failed e.g. because vttablet was restarted or took too long. - // Sleep until the next retry is up or the context is done/canceled. - select { - case <-hcc.ctx.Done(): - return - case <-time.After(retryDelay): - // Exponentially back-off to prevent tight-loop. - retryDelay *= 2 - // Limit the retry delay backoff to the health check timeout - if retryDelay > hc.healthCheckTimeout { - retryDelay = hc.healthCheckTimeout - } - } - } -} - -// setServingState sets the tablet state to the given value. -// -// If the state changes, it logs the change so that failures -// from the health check connection are logged the first time, -// but don't continue to log if the connection stays down. -// -// hcc.mu must be locked before calling this function -func (hcc *legacyHealthCheckConn) setServingState(serving bool, reason string) { - if !hcc.loggedServingState || (serving != hcc.tabletStats.Serving) { - // Emit the log from a separate goroutine to avoid holding - // the hcc lock while logging is happening - go log.Infof("HealthCheckUpdate(Serving State): %v, tablet: %v serving => %v for %v/%v (%v) reason: %s", - hcc.tabletStats.Name, - topotools.TabletIdent(hcc.tabletStats.Tablet), - serving, - hcc.tabletStats.Tablet.GetKeyspace(), - hcc.tabletStats.Tablet.GetShard(), - hcc.tabletStats.Target.GetTabletType(), - reason, - ) - hcc.loggedServingState = true - } - - hcc.tabletStats.Serving = serving -} - -// stream streams healthcheck responses to callback. -func (hcc *legacyHealthCheckConn) stream(ctx context.Context, hc *LegacyHealthCheckImpl, callback func(*querypb.StreamHealthResponse) error) { - if hcc.conn == nil { - conn, err := tabletconn.GetDialer()(hcc.tabletStats.Tablet, grpcclient.FailFast(true)) - if err != nil { - hcc.tabletStats.LastError = err - return - } - hcc.conn = conn - hcc.tabletStats.LastError = nil - } - - if err := hcc.conn.StreamHealth(ctx, callback); err != nil { - log.Warningf("tablet %v healthcheck stream error: %v", hcc.tabletStats.Tablet.Alias, err) - hcc.setServingState(false, err.Error()) - hcc.tabletStats.LastError = err - // Send nil because we intend to close the connection. - hc.updateHealth(hcc.tabletStats.Copy(), nil) - hcc.conn.Close(ctx) - hcc.conn = nil - } -} - -// processResponse reads one health check response, and notifies LegacyHealthCheckStatsListener. -func (hcc *legacyHealthCheckConn) processResponse(hc *LegacyHealthCheckImpl, shr *querypb.StreamHealthResponse) error { - select { - case <-hcc.ctx.Done(): - return hcc.ctx.Err() - default: - } - - // Check for invalid data, better than panicking. - if shr.Target == nil || shr.RealtimeStats == nil { - return fmt.Errorf("health stats is not valid: %v", shr) - } - - // an app-level error from tablet, force serving state. - var healthErr error - serving := shr.Serving - if shr.RealtimeStats.HealthError != "" { - healthErr = fmt.Errorf("vttablet error: %v", shr.RealtimeStats.HealthError) - serving = false - } - - // hcc.LegacyTabletStats.Tablet.Alias.Uid may be 0 because the youtube internal mechanism uses a different - // code path to initialize this value. If so, we should skip this check. - if shr.TabletAlias != nil && hcc.tabletStats.Tablet.Alias.Uid != 0 && !proto.Equal(shr.TabletAlias, hcc.tabletStats.Tablet.Alias) { - return fmt.Errorf("health stats mismatch, tablet %+v alias does not match response alias %v", hcc.tabletStats.Tablet, shr.TabletAlias) - } - - // In this case where a new tablet is initialized or a tablet type changes, we want to - // initialize the counter so the rate can be calculated correctly. - if hcc.tabletStats.Target.TabletType != shr.Target.TabletType { - hcErrorCounters.Add([]string{shr.Target.Keyspace, shr.Target.Shard, topoproto.TabletTypeLString(shr.Target.TabletType)}, 0) - } - - // Update our record, and notify downstream for tabletType and - // realtimeStats change. - hcc.lastResponseTimestamp = time.Now() - hcc.tabletStats.Target = shr.Target - hcc.tabletStats.TabletExternallyReparentedTimestamp = shr.TabletExternallyReparentedTimestamp - hcc.tabletStats.Stats = shr.RealtimeStats - hcc.tabletStats.LastError = healthErr - reason := "healthCheck update" - if healthErr != nil { - reason = "healthCheck update error: " + healthErr.Error() - } - hcc.setServingState(serving, reason) - hc.updateHealth(hcc.tabletStats.Copy(), hcc.conn) - return nil -} - -func (hc *LegacyHealthCheckImpl) deleteConn(tablet *topodatapb.Tablet) { - hc.mu.Lock() - defer hc.mu.Unlock() - - key := TabletToMapKey(tablet) - th, ok := hc.addrToHealth[key] - if !ok { - return - } - // Make sure the key still corresponds to the tablet we want to delete. - // If it doesn't match, we should do nothing. The tablet we were asked to - // delete is already gone, and some other tablet is using the key - // (host:port) that the original tablet used to use, which is fine. - if !topoproto.TabletAliasEqual(tablet.Alias, th.latestTabletStats.Tablet.Alias) { - return - } - hc.deleteConnLocked(key, th) -} - -func (hc *LegacyHealthCheckImpl) deleteConnLocked(key string, th *legacyTabletHealth) { - th.latestTabletStats.Up = false - th.cancelFunc() - delete(hc.addrToHealth, key) -} - -// SetListener sets the listener for healthcheck updates. -// It must be called after NewLegacyHealthCheck and before any tablets are added -// (either through AddTablet or through a Watcher). -func (hc *LegacyHealthCheckImpl) SetListener(listener LegacyHealthCheckStatsListener, sendDownEvents bool) { - if hc.listener != nil { - panic("must not call SetListener twice") - } - - hc.mu.Lock() - defer hc.mu.Unlock() - if len(hc.addrToHealth) > 0 { - panic("must not call SetListener after tablets were added") - } - - hc.listener = listener - hc.sendDownEvents = sendDownEvents -} - -// AddTablet adds the tablet, and starts health check. -// It does not block on making connection. -// name is an optional tag for the tablet, e.g. an alternative address. -func (hc *LegacyHealthCheckImpl) AddTablet(tablet *topodatapb.Tablet, name string) { - ctx, cancelFunc := context.WithCancel(context.Background()) - key := TabletToMapKey(tablet) - hcc := &legacyHealthCheckConn{ - ctx: ctx, - tabletStats: LegacyTabletStats{ - Key: key, - Tablet: tablet, - Name: name, - Target: &querypb.Target{}, - Up: true, - }, - } - hc.mu.Lock() - if hc.addrToHealth == nil { - // already closed. - hc.mu.Unlock() - cancelFunc() - return - } - if th, ok := hc.addrToHealth[key]; ok { - // Something already exists at this key. - // If it's the same tablet, something is wrong. - if topoproto.TabletAliasEqual(th.latestTabletStats.Tablet.Alias, tablet.Alias) { - hc.mu.Unlock() - log.Warningf("refusing to add duplicate tablet %v for %v: %+v", name, tablet.Alias.Cell, tablet) - cancelFunc() - return - } - // If it's a different tablet, then we trust this new tablet that claims - // it has taken over the host:port that the old tablet used to be on. - // Remove the old tablet to clear the way. - hc.deleteConnLocked(key, th) - } - hc.addrToHealth[key] = &legacyTabletHealth{ - cancelFunc: cancelFunc, - latestTabletStats: hcc.tabletStats, - } - hc.initialUpdatesWG.Add(1) - hc.connsWG.Add(1) - hc.mu.Unlock() - - go hc.checkConn(hcc, name) -} - -// RemoveTablet removes the tablet, and stops the health check. -// It does not block. -func (hc *LegacyHealthCheckImpl) RemoveTablet(tablet *topodatapb.Tablet) { - hc.deleteConn(tablet) -} - -// ReplaceTablet removes the old tablet and adds the new tablet. -func (hc *LegacyHealthCheckImpl) ReplaceTablet(old, new *topodatapb.Tablet, name string) { - hc.deleteConn(old) - hc.AddTablet(new, name) -} - -// WaitForInitialStatsUpdates waits until all tablets added via AddTablet() call -// were propagated to downstream via corresponding StatsUpdate() calls. -func (hc *LegacyHealthCheckImpl) WaitForInitialStatsUpdates() { - hc.initialUpdatesWG.Wait() -} - -// GetConnection returns the TabletConn of the given tablet. -func (hc *LegacyHealthCheckImpl) GetConnection(key string) queryservice.QueryService { - hc.mu.Lock() - defer hc.mu.Unlock() - - th := hc.addrToHealth[key] - if th == nil { - return nil - } - return th.conn -} - -// LegacyTabletsCacheStatus is the current tablets for a cell/target. -type LegacyTabletsCacheStatus struct { - Cell string - Target *querypb.Target - TabletsStats LegacyTabletStatsList -} - -// LegacyTabletStatsList is used for sorting. -type LegacyTabletStatsList []*LegacyTabletStats - -// Len is part of sort.Interface. -func (tsl LegacyTabletStatsList) Len() int { - return len(tsl) -} - -// Less is part of sort.Interface -func (tsl LegacyTabletStatsList) Less(i, j int) bool { - name1 := tsl[i].Name - if name1 == "" { - name1 = tsl[i].Key - } - name2 := tsl[j].Name - if name2 == "" { - name2 = tsl[j].Key - } - return name1 < name2 -} - -// Swap is part of sort.Interface -func (tsl LegacyTabletStatsList) Swap(i, j int) { - tsl[i], tsl[j] = tsl[j], tsl[i] -} - -// StatusAsHTML returns an HTML version of the status. -func (tcs *LegacyTabletsCacheStatus) StatusAsHTML() template.HTML { - tLinks := make([]string, 0, 1) - if tcs.TabletsStats != nil { - sort.Sort(tcs.TabletsStats) - } - for _, ts := range tcs.TabletsStats { - color := "green" - extra := "" - if ts.LastError != nil { - color = "red" - extra = fmt.Sprintf(" (%v)", ts.LastError) - } else if !ts.Serving { - color = "red" - extra = " (Not Serving)" - } else if !ts.Up { - color = "red" - extra = " (Down)" - } else if ts.Target.TabletType == topodatapb.TabletType_PRIMARY { - extra = fmt.Sprintf(" (PrimaryTermStartTime: %v)", ts.TabletExternallyReparentedTimestamp) - } else { - extra = fmt.Sprintf(" (RepLag: %v)", ts.Stats.ReplicationLagSeconds) - } - name := ts.Name - if name == "" { - name = ts.GetTabletHostPort() - } - tLinks = append(tLinks, fmt.Sprintf(`%v%v`, ts.getTabletDebugURL(), color, name, extra)) - } - return template.HTML(strings.Join(tLinks, "
")) -} - -// LegacyTabletsCacheStatusList is used for sorting. -type LegacyTabletsCacheStatusList []*LegacyTabletsCacheStatus - -// Len is part of sort.Interface. -func (tcsl LegacyTabletsCacheStatusList) Len() int { - return len(tcsl) -} - -// Less is part of sort.Interface -func (tcsl LegacyTabletsCacheStatusList) Less(i, j int) bool { - return tcsl[i].Cell+"."+tcsl[i].Target.Keyspace+"."+tcsl[i].Target.Shard+"."+string(tcsl[i].Target.TabletType) < - tcsl[j].Cell+"."+tcsl[j].Target.Keyspace+"."+tcsl[j].Target.Shard+"."+string(tcsl[j].Target.TabletType) -} - -// Swap is part of sort.Interface -func (tcsl LegacyTabletsCacheStatusList) Swap(i, j int) { - tcsl[i], tcsl[j] = tcsl[j], tcsl[i] -} - -// CacheStatus returns a displayable version of the cache. -func (hc *LegacyHealthCheckImpl) CacheStatus() LegacyTabletsCacheStatusList { - tcsMap := hc.cacheStatusMap() - tcsl := make(LegacyTabletsCacheStatusList, 0, len(tcsMap)) - for _, tcs := range tcsMap { - tcsl = append(tcsl, tcs) - } - sort.Sort(tcsl) - return tcsl -} - -func (hc *LegacyHealthCheckImpl) cacheStatusMap() map[string]*LegacyTabletsCacheStatus { - tcsMap := make(map[string]*LegacyTabletsCacheStatus) - hc.mu.Lock() - defer hc.mu.Unlock() - for _, th := range hc.addrToHealth { - key := fmt.Sprintf("%v.%v.%v.%v", th.latestTabletStats.Tablet.Alias.Cell, th.latestTabletStats.Target.Keyspace, th.latestTabletStats.Target.Shard, th.latestTabletStats.Target.TabletType.String()) - var tcs *LegacyTabletsCacheStatus - var ok bool - if tcs, ok = tcsMap[key]; !ok { - tcs = &LegacyTabletsCacheStatus{ - Cell: th.latestTabletStats.Tablet.Alias.Cell, - Target: th.latestTabletStats.Target, - } - tcsMap[key] = tcs - } - tabletStats := th.latestTabletStats - tcs.TabletsStats = append(tcs.TabletsStats, &tabletStats) - } - return tcsMap -} - -// Close stops the healthcheck. -// After Close() returned, it's guaranteed that the listener isn't -// currently executing and won't be called again. -func (hc *LegacyHealthCheckImpl) Close() error { - hc.mu.Lock() - for _, th := range hc.addrToHealth { - th.cancelFunc() - } - hc.addrToHealth = nil - // Release the lock early or a pending checkHealthCheckTimeout - // cannot get a read lock on it. - hc.mu.Unlock() - - // Wait for the checkHealthCheckTimeout Go routine and each Go - // routine per tablet. - hc.connsWG.Wait() - - return nil -} diff --git a/go/vt/discovery/legacy_healthcheck_flaky_test.go b/go/vt/discovery/legacy_healthcheck_flaky_test.go deleted file mode 100644 index 33ac51c3247..00000000000 --- a/go/vt/discovery/legacy_healthcheck_flaky_test.go +++ /dev/null @@ -1,615 +0,0 @@ -/* -Copyright 2019 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package discovery - -import ( - "bytes" - "flag" - "fmt" - "html/template" - "strings" - "testing" - "time" - - "vitess.io/vitess/go/test/utils" - - "context" - - querypb "vitess.io/vitess/go/vt/proto/query" - topodatapb "vitess.io/vitess/go/vt/proto/topodata" - "vitess.io/vitess/go/vt/status" - "vitess.io/vitess/go/vt/topo" -) - -func testChecksum(t *testing.T, want, got int64) { - t.Helper() - if want != got { - t.Errorf("want checksum %v, got %v", want, got) - } -} - -func TestLegacyHealthCheck(t *testing.T) { - hcErrorCounters.ResetAll() - tablet := topo.NewTablet(0, "cell", "a") - tablet.PortMap["vt"] = 1 - input := make(chan *querypb.StreamHealthResponse) - createFakeConn(tablet, input) - t.Logf(`createFakeConn({Host: "a", PortMap: {"vt": 1}}, c)`) - l := newListener() - hc := NewLegacyHealthCheck(1*time.Millisecond, time.Hour).(*LegacyHealthCheckImpl) - hc.SetListener(l, true) - testChecksum(t, 0, hc.stateChecksum()) - hc.AddTablet(tablet, "") - t.Logf(`hc = LegacyHealthCheck(); hc.AddTablet({Host: "a", PortMap: {"vt": 1}}, "")`) - - // Immediately after AddTablet() there will be the first notification. - want := &LegacyTabletStats{ - Key: "a,vt:1", - Tablet: tablet, - Target: &querypb.Target{}, - Up: true, - Serving: false, - } - res := <-l.output - utils.MustMatch(t, want, res) - testChecksum(t, 401258919, hc.stateChecksum()) - - // one tablet after receiving a StreamHealthResponse - shr := &querypb.StreamHealthResponse{ - Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}, - Serving: true, - - TabletExternallyReparentedTimestamp: 10, - RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2}, - } - want = &LegacyTabletStats{ - Key: "a,vt:1", - Tablet: tablet, - Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}, - Up: true, - Serving: true, - Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2}, - - TabletExternallyReparentedTimestamp: 10, - } - input <- shr - t.Logf(`input <- {{Keyspace: "k", Shard: "s", TabletType: PRIMARY}, Serving: true, TabletExternallyReparentedTimestamp: 10, {ReplicationLagSeconds: 1, CpuUsage: 0.2}}`) - res = <-l.output - utils.MustMatch(t, want, res) - - // Verify that the error count is initialized to 0 after the first tablet response. - if err := checkErrorCounter("k", "s", topodatapb.TabletType_PRIMARY, 0); err != nil { - t.Errorf("%v", err) - } - - tcsl := hc.CacheStatus() - tcslWant := LegacyTabletsCacheStatusList{{ - Cell: "cell", - Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}, - TabletsStats: LegacyTabletStatsList{{ - Key: "a,vt:1", - Tablet: tablet, - Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}, - Up: true, - Serving: true, - Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2}, - - TabletExternallyReparentedTimestamp: 10, - }}, - }} - utils.MustMatch(t, tcslWant, tcsl) - testChecksum(t, 4163049392, hc.stateChecksum()) - - // TabletType changed, should get both old and new event - shr = &querypb.StreamHealthResponse{ - Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, - Serving: true, - - TabletExternallyReparentedTimestamp: 0, - - RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.5}, - } - input <- shr - t.Logf(`input <- {{Keyspace: "k", Shard: "s", TabletType: REPLICA}, Serving: true, TabletExternallyReparentedTimestamp: 0, {ReplicationLagSeconds: 1, CpuUsage: 0.5}}`) - want = &LegacyTabletStats{ - Key: "a,vt:1", - Tablet: tablet, - Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}, - Up: false, - Serving: true, - Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2}, - TabletExternallyReparentedTimestamp: 10, - } - res = <-l.output - utils.MustMatch(t, want, res) - want = &LegacyTabletStats{ - Key: "a,vt:1", - Tablet: tablet, - Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, - Up: true, - Serving: true, - Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.5}, - TabletExternallyReparentedTimestamp: 0, - } - res = <-l.output - utils.MustMatch(t, want, res) - - if err := checkErrorCounter("k", "s", topodatapb.TabletType_REPLICA, 0); err != nil { - t.Errorf("%v", err) - } - testChecksum(t, 1906892404, hc.stateChecksum()) - - // Serving & RealtimeStats changed - shr = &querypb.StreamHealthResponse{ - Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, - Serving: false, - TabletExternallyReparentedTimestamp: 0, - RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.3}, - } - want = &LegacyTabletStats{ - Key: "a,vt:1", - Tablet: tablet, - Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, - Up: true, - Serving: false, - Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.3}, - TabletExternallyReparentedTimestamp: 0, - } - input <- shr - t.Logf(`input <- {{Keyspace: "k", Shard: "s", TabletType: REPLICA}, TabletExternallyReparentedTimestamp: 0, {ReplicationLagSeconds: 1, CpuUsage: 0.3}}`) - res = <-l.output - utils.MustMatch(t, want, res) - testChecksum(t, 1200695592, hc.stateChecksum()) - - // HealthError - shr = &querypb.StreamHealthResponse{ - Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, - Serving: true, - TabletExternallyReparentedTimestamp: 0, - RealtimeStats: &querypb.RealtimeStats{HealthError: "some error", ReplicationLagSeconds: 1, CpuUsage: 0.3}, - } - want = &LegacyTabletStats{ - Key: "a,vt:1", - Tablet: tablet, - Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, - Up: true, - Serving: false, - Stats: &querypb.RealtimeStats{HealthError: "some error", ReplicationLagSeconds: 1, CpuUsage: 0.3}, - TabletExternallyReparentedTimestamp: 0, - LastError: fmt.Errorf("vttablet error: some error"), - } - input <- shr - t.Logf(`input <- {{Keyspace: "k", Shard: "s", TabletType: REPLICA}, Serving: true, TabletExternallyReparentedTimestamp: 0, {HealthError: "some error", ReplicationLagSeconds: 1, CpuUsage: 0.3}}`) - res = <-l.output - utils.MustMatch(t, want, res) - testChecksum(t, 1200695592, hc.stateChecksum()) // unchanged - - // remove tablet - hc.deleteConn(tablet) - t.Logf(`hc.RemoveTablet({Host: "a", PortMap: {"vt": 1}})`) - want = &LegacyTabletStats{ - Key: "a,vt:1", - Tablet: tablet, - Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, - Up: false, - Serving: false, - Stats: &querypb.RealtimeStats{HealthError: "some error", ReplicationLagSeconds: 1, CpuUsage: 0.3}, - TabletExternallyReparentedTimestamp: 0, - LastError: context.Canceled, - } - res = <-l.output - utils.MustMatch(t, want, res) - testChecksum(t, 0, hc.stateChecksum()) - - // close healthcheck - hc.Close() -} - -func TestLegacyHealthCheckStreamError(t *testing.T) { - tablet := topo.NewTablet(0, "cell", "a") - tablet.PortMap["vt"] = 1 - input := make(chan *querypb.StreamHealthResponse) - fc := createFakeConn(tablet, input) - fc.errCh = make(chan error) - t.Logf(`createFakeConn({Host: "a", PortMap: {"vt": 1}}, c)`) - l := newListener() - hc := NewLegacyHealthCheck(1*time.Millisecond, time.Hour).(*LegacyHealthCheckImpl) - hc.SetListener(l, true) - hc.AddTablet(tablet, "") - t.Logf(`hc = LegacyHealthCheck(); hc.AddTablet({Host: "a", PortMap: {"vt": 1}}, "")`) - - // Immediately after AddTablet() there will be the first notification. - want := &LegacyTabletStats{ - Key: "a,vt:1", - Tablet: tablet, - Target: &querypb.Target{}, - Up: true, - Serving: false, - } - res := <-l.output - utils.MustMatch(t, want, res) - - // one tablet after receiving a StreamHealthResponse - shr := &querypb.StreamHealthResponse{ - Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, - Serving: true, - TabletExternallyReparentedTimestamp: 0, - RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2}, - } - want = &LegacyTabletStats{ - Key: "a,vt:1", - Tablet: tablet, - Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, - Up: true, - Serving: true, - Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2}, - TabletExternallyReparentedTimestamp: 0, - } - input <- shr - t.Logf(`input <- {{Keyspace: "k", Shard: "s", TabletType: PRIMARY}, Serving: true, TabletExternallyReparentedTimestamp: 10, {ReplicationLagSeconds: 1, CpuUsage: 0.2}}`) - res = <-l.output - utils.MustMatch(t, want, res) - - // Stream error - fc.errCh <- fmt.Errorf("some stream error") - want = &LegacyTabletStats{ - Key: "a,vt:1", - Tablet: tablet, - Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, - Up: true, - Serving: false, - Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2}, - TabletExternallyReparentedTimestamp: 0, - LastError: fmt.Errorf("some stream error"), - } - res = <-l.output - utils.MustMatch(t, want, res) - - // close healthcheck - hc.Close() -} - -func TestLegacyHealthCheckVerifiesTabletAlias(t *testing.T) { - t.Logf("starting") - tablet := topo.NewTablet(1, "cell", "a") - tablet.PortMap["vt"] = 1 - input := make(chan *querypb.StreamHealthResponse, 1) - fc := createFakeConn(tablet, input) - - t.Logf(`createFakeConn({Host: "a", PortMap: {"vt": 1}}, c)`) - - l := newListener() - hc := NewLegacyHealthCheck(1*time.Millisecond, time.Hour).(*LegacyHealthCheckImpl) - hc.SetListener(l, false) - hc.AddTablet(tablet, "") - t.Logf(`hc = LegacyHealthCheck(); hc.AddTablet({Host: "a", PortMap: {"vt": 1}}, "")`) - - // Immediately after AddTablet() there will be the first notification. - want := &LegacyTabletStats{ - Key: "a,vt:1", - Tablet: tablet, - Target: &querypb.Target{}, - Up: true, - Serving: false, - } - res := <-l.output - utils.MustMatch(t, want, res) - - input <- &querypb.StreamHealthResponse{ - Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}, - TabletAlias: &topodatapb.TabletAlias{Uid: 20, Cell: "cellb"}, - Serving: true, - TabletExternallyReparentedTimestamp: 10, - RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2}, - } - - select { - case err := <-fc.cbErrCh: - t.Logf("<-fc.cbErrCh: %v", err) - if prefix := "health stats mismatch"; !strings.HasPrefix(err.Error(), prefix) { - t.Fatalf("wrong error, got %v; want prefix %v", err, prefix) - } - case <-l.output: - t.Fatalf("StreamHealth should have returned a health stats mismatch error") - } - - input <- &querypb.StreamHealthResponse{ - Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}, - TabletAlias: &topodatapb.TabletAlias{Uid: 1, Cell: "cell"}, - Serving: true, - TabletExternallyReparentedTimestamp: 10, - RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2}, - } - - select { - case err := <-fc.cbErrCh: - t.Fatalf("wanted listener output, got error: %v", err) - case res := <-l.output: - t.Logf("<-l.output: %+v", res) - } - - // close healthcheck - hc.Close() -} - -// TestLegacyHealthCheckCloseWaitsForGoRoutines tests that Close() waits for all Go -// routines to finish and the listener won't be called anymore. -func TestLegacyHealthCheckCloseWaitsForGoRoutines(t *testing.T) { - tablet := topo.NewTablet(0, "cell", "a") - tablet.PortMap["vt"] = 1 - input := make(chan *querypb.StreamHealthResponse, 1) - createFakeConn(tablet, input) - - t.Logf(`createFakeConn({Host: "a", PortMap: {"vt": 1}}, c)`) - - l := newListener() - hc := NewLegacyHealthCheck(1*time.Millisecond, time.Hour).(*LegacyHealthCheckImpl) - hc.SetListener(l, false) - hc.AddTablet(tablet, "") - t.Logf(`hc = LegacyHealthCheck(); hc.AddTablet({Host: "a", PortMap: {"vt": 1}}, "")`) - - // Immediately after AddTablet() there will be the first notification. - want := &LegacyTabletStats{ - Key: "a,vt:1", - Tablet: tablet, - Target: &querypb.Target{}, - Up: true, - Serving: false, - } - res := <-l.output - utils.MustMatch(t, want, res) - - // Verify that the listener works in general. - shr := &querypb.StreamHealthResponse{ - Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}, - Serving: true, - TabletExternallyReparentedTimestamp: 10, - RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2}, - } - want = &LegacyTabletStats{ - Key: "a,vt:1", - Tablet: tablet, - Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}, - Up: true, - Serving: true, - Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2}, - TabletExternallyReparentedTimestamp: 10, - } - input <- shr - t.Logf(`input <- %v`, shr) - res = <-l.output - utils.MustMatch(t, want, res) - - // Change input to distinguish between stats sent before and after Close(). - shr.TabletExternallyReparentedTimestamp = 11 - // Close the healthcheck. Tablet connections are closed asynchronously and - // Close() will block until all Go routines (one per connection) are done. - hc.Close() - - // Try to send more updates. They should be ignored and the listener should - // not be called from any Go routine anymore. - // Note that this code is racy by nature. If there is a regression, it should - // fail in some cases. - input <- shr - t.Logf(`input <- %v`, shr) - - // After Close() we'll receive one or two notifications with Serving == false. - res = <-l.output - if res.Serving { - t.Errorf(`Received one more notification with Serving == true: %+v`, res) - } - - select { - case res = <-l.output: - if res.TabletExternallyReparentedTimestamp == 10 && res.LastError == context.Canceled { - // LegacyHealthCheck repeats the previous stats if there is an error. - // This is expected. - break - } - t.Fatalf("healthCheck still running after Close(): listener received: %v but should not have been called", res) - case <-time.After(1 * time.Millisecond): - // No response after timeout. Close probably closed all Go routines - // properly and won't use the listener anymore. - } - - // The last notification should have Up = false. - if res.Up || res.Serving { - t.Errorf(`Last notification doesn't have Up == false and Serving == false: %+v`, res) - } - - // Check if there are more updates than the one emitted during Close(). - select { - case res := <-l.output: - t.Fatalf("healthCheck still running after Close(): listener received: %v but should not have been called", res) - case <-time.After(1 * time.Millisecond): - // No response after timeout. Listener probably not called again. Success. - } -} - -func TestLegacyHealthCheckTimeout(t *testing.T) { - timeout := 500 * time.Millisecond - tablet := topo.NewTablet(0, "cell", "a") - tablet.PortMap["vt"] = 1 - input := make(chan *querypb.StreamHealthResponse) - fc := createFakeConn(tablet, input) - t.Logf(`createFakeConn({Host: "a", PortMap: {"vt": 1}}, c)`) - l := newListener() - hc := NewLegacyHealthCheck(1*time.Millisecond, timeout).(*LegacyHealthCheckImpl) - hc.SetListener(l, false) - hc.AddTablet(tablet, "") - t.Logf(`hc = LegacyHealthCheck(); hc.AddTablet({Host: "a", PortMap: {"vt": 1}}, "")`) - - // Immediately after AddTablet() there will be the first notification. - want := &LegacyTabletStats{ - Key: "a,vt:1", - Tablet: tablet, - Target: &querypb.Target{}, - Up: true, - Serving: false, - } - res := <-l.output - utils.MustMatch(t, want, res) - - // one tablet after receiving a StreamHealthResponse - shr := &querypb.StreamHealthResponse{ - Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}, - Serving: true, - TabletExternallyReparentedTimestamp: 10, - RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2}, - } - want = &LegacyTabletStats{ - Key: "a,vt:1", - Tablet: tablet, - Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}, - Up: true, - Serving: true, - Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2}, - TabletExternallyReparentedTimestamp: 10, - } - input <- shr - t.Logf(`input <- {{Keyspace: "k", Shard: "s", TabletType: PRIMARY}, Serving: true, TabletExternallyReparentedTimestamp: 10, {ReplicationLagSeconds: 1, CpuUsage: 0.2}}`) - res = <-l.output - utils.MustMatch(t, want, res) - - if err := checkErrorCounter("k", "s", topodatapb.TabletType_PRIMARY, 0); err != nil { - t.Errorf("%v", err) - } - - // wait for timeout period - time.Sleep(2 * timeout) - t.Logf(`Sleep(2 * timeout)`) - res = <-l.output - if res.Serving { - t.Errorf(`<-l.output: %+v; want not serving`, res) - } - - if err := checkErrorCounter("k", "s", topodatapb.TabletType_PRIMARY, 1); err != nil { - t.Errorf("%v", err) - } - - if !fc.isCanceled() { - t.Errorf("StreamHealth should be canceled after timeout, but is not") - } - - // repeat the wait. It will timeout one more time trying to get the connection. - fc.resetCanceledFlag() - time.Sleep(timeout) - t.Logf(`Sleep(2 * timeout)`) - - res = <-l.output - if res.Serving { - t.Errorf(`<-l.output: %+v; want not serving`, res) - } - - if err := checkErrorCounter("k", "s", topodatapb.TabletType_PRIMARY, 2); err != nil { - t.Errorf("%v", err) - } - - if !fc.isCanceled() { - t.Errorf("StreamHealth should be canceled again after timeout") - } - - // send a healthcheck response, it should be serving again - fc.resetCanceledFlag() - input <- shr - t.Logf(`input <- {{Keyspace: "k", Shard: "s", TabletType: PRIMARY}, Serving: true, TabletExternallyReparentedTimestamp: 10, {ReplicationLagSeconds: 1, CpuUsage: 0.2}}`) - - // wait for the exponential backoff to wear off and health monitoring to resume. - time.Sleep(timeout) - res = <-l.output - utils.MustMatch(t, want, res) - - // close healthcheck - hc.Close() -} - -func TestLegacyTemplate(t *testing.T) { - tablet := topo.NewTablet(0, "cell", "a") - ts := []*LegacyTabletStats{ - { - Key: "a", - Tablet: tablet, - Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, - Up: true, - Serving: false, - Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.3}, - TabletExternallyReparentedTimestamp: 0, - }, - } - tcs := &LegacyTabletsCacheStatus{ - Cell: "cell", - Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, - TabletsStats: ts, - } - templ := template.New("").Funcs(status.StatusFuncs) - templ, err := templ.Parse(LegacyHealthCheckTemplate) - if err != nil { - t.Fatalf("error parsing template: %v", err) - } - wr := &bytes.Buffer{} - if err := templ.Execute(wr, []*LegacyTabletsCacheStatus{tcs}); err != nil { - t.Fatalf("error executing template: %v", err) - } -} - -func TestLegacyDebugURLFormatting(t *testing.T) { - flag.Set("tablet_url_template", "https://{{.GetHostNameLevel 0}}.bastion.{{.Tablet.Alias.Cell}}.corp") - ParseTabletURLTemplateFromFlag() - - tablet := topo.NewTablet(0, "cell", "host.dc.domain") - ts := []*LegacyTabletStats{ - { - Key: "a", - Tablet: tablet, - Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, - Up: true, - Serving: false, - Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.3}, - TabletExternallyReparentedTimestamp: 0, - }, - } - tcs := &LegacyTabletsCacheStatus{ - Cell: "cell", - Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, - TabletsStats: ts, - } - templ := template.New("").Funcs(status.StatusFuncs) - templ, err := templ.Parse(LegacyHealthCheckTemplate) - if err != nil { - t.Fatalf("error parsing template: %v", err) - } - wr := &bytes.Buffer{} - if err := templ.Execute(wr, []*LegacyTabletsCacheStatus{tcs}); err != nil { - t.Fatalf("error executing template: %v", err) - } - expectedURL := `"https://host.bastion.cell.corp"` - if !strings.Contains(wr.String(), expectedURL) { - t.Fatalf("output missing formatted URL, expectedURL: %s , output: %s", expectedURL, wr.String()) - } -} - -type listener struct { - output chan *LegacyTabletStats -} - -func newListener() *listener { - return &listener{output: make(chan *LegacyTabletStats, 2)} -} - -func (l *listener) StatsUpdate(ts *LegacyTabletStats) { - l.output <- ts -} diff --git a/go/vt/discovery/legacy_replicationlag.go b/go/vt/discovery/legacy_replicationlag.go deleted file mode 100644 index 31dd5eaa4e8..00000000000 --- a/go/vt/discovery/legacy_replicationlag.go +++ /dev/null @@ -1,201 +0,0 @@ -/* -Copyright 2019 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package discovery - -import ( - "fmt" - "sort" -) - -// LegacyIsReplicationLagHigh verifies that the given LegacyTabletStats refers to a tablet with high -// replication lag, i.e. higher than the configured discovery_low_replication_lag flag. -func LegacyIsReplicationLagHigh(tabletStats *LegacyTabletStats) bool { - return float64(tabletStats.Stats.ReplicationLagSeconds) > lowReplicationLag.Seconds() -} - -// LegacyIsReplicationLagVeryHigh verifies that the given LegacyTabletStats refers to a tablet with very high -// replication lag, i.e. higher than the configured discovery_high_replication_lag_minimum_serving flag. -func LegacyIsReplicationLagVeryHigh(tabletStats *LegacyTabletStats) bool { - return float64(tabletStats.Stats.ReplicationLagSeconds) > highReplicationLagMinServing.Seconds() -} - -// FilterLegacyStatsByReplicationLag filters the list of LegacyTabletStats by LegacyTabletStats.Stats.ReplicationLagSeconds. -// Note that LegacyTabletStats that is non-serving or has error is ignored. -// -// The simplified logic: -// - Return tablets that have lag <= lowReplicationLag. -// - Make sure we return at least minNumTablets tablets, if there are enough one with lag <= highReplicationLagMinServing. -// For example, with the default of 30s / 2h / 2, this means: -// - lags of (5s, 10s, 15s, 120s) return the first three -// - lags of (30m, 35m, 40m, 45m) return the first two -// - lags of (2h, 3h, 4h, 5h) return the first one -// -// The legacy algorithm (default for now): -// - Return the list if there is 0 or 1 tablet. -// - Return the list if all tablets have <=30s lag. -// - Filter by replication lag: for each tablet, if the mean value without it is more than 0.7 of the mean value across all tablets, it is valid. -// - Make sure we return at least minNumTablets tablets (if there are enough one with only low replication lag). -// - If one tablet is removed, run above steps again in case there are two tablets with high replication lag. (It should cover most cases.) -// For example, lags of (5s, 10s, 15s, 120s) return the first three; -// lags of (30m, 35m, 40m, 45m) return all. -// -// One thing to know about this code: vttablet also has a couple flags that impact the logic here: -// * unhealthy_threshold: if replication lag is higher than this, a tablet will be reported as unhealthy. -// The default for this is 2h, same as the discovery_high_replication_lag_minimum_serving here. -// * degraded_threshold: this is only used by vttablet for display. It should match -// discovery_low_replication_lag here, so the vttablet status display matches what vtgate will do of it. -func FilterLegacyStatsByReplicationLag(tabletStatsList []*LegacyTabletStats) []*LegacyTabletStats { - if !*legacyReplicationLagAlgorithm { - return filterLegacyStatsByLag(tabletStatsList) - } - - res := filterLegacyStatsByLagWithLegacyAlgorithm(tabletStatsList) - // run the filter again if exactly one tablet is removed, - // and we have spare tablets. - if len(res) > *minNumTablets && len(res) == len(tabletStatsList)-1 { - res = filterLegacyStatsByLagWithLegacyAlgorithm(res) - } - return res -} - -func filterLegacyStatsByLag(tabletStatsList []*LegacyTabletStats) []*LegacyTabletStats { - list := make([]legacyTabletLagSnapshot, 0, len(tabletStatsList)) - // filter non-serving tablets and those with very high replication lag - for _, ts := range tabletStatsList { - if !ts.Serving || ts.LastError != nil || ts.Stats == nil || LegacyIsReplicationLagVeryHigh(ts) { - continue - } - // Pull the current replication lag for a stable sort later. - list = append(list, legacyTabletLagSnapshot{ - ts: ts, - replag: ts.Stats.ReplicationLagSeconds}) - } - - // Sort by replication lag. - sort.Sort(byLegacyReplag(list)) - - // Pick those with low replication lag, but at least minNumTablets tablets regardless. - res := make([]*LegacyTabletStats, 0, len(list)) - for i := 0; i < len(list); i++ { - if !LegacyIsReplicationLagHigh(list[i].ts) || i < *minNumTablets { - res = append(res, list[i].ts) - } - } - return res -} - -func filterLegacyStatsByLagWithLegacyAlgorithm(tabletStatsList []*LegacyTabletStats) []*LegacyTabletStats { - list := make([]*LegacyTabletStats, 0, len(tabletStatsList)) - // filter non-serving tablets - for _, ts := range tabletStatsList { - if !ts.Serving || ts.LastError != nil || ts.Stats == nil { - continue - } - list = append(list, ts) - } - if len(list) <= 1 { - return list - } - // if all have low replication lag (<=30s), return all tablets. - allLowLag := true - for _, ts := range list { - if LegacyIsReplicationLagHigh(ts) { - allLowLag = false - break - } - } - if allLowLag { - return list - } - // filter those affecting "mean" lag significantly - // calculate mean for all tablets - res := make([]*LegacyTabletStats, 0, len(list)) - m, _ := legacyMean(list, -1) - for i, ts := range list { - // calculate mean by excluding ith tablet - mi, _ := legacyMean(list, i) - if float64(mi) > float64(m)*0.7 { - res = append(res, ts) - } - } - if len(res) >= *minNumTablets { - return res - } - // return at least minNumTablets tablets to avoid over loading, - // if there is enough tablets with replication lag < highReplicationLagMinServing. - // Pull the current replication lag for a stable sort. - snapshots := make([]legacyTabletLagSnapshot, 0, len(list)) - for _, ts := range list { - if !LegacyIsReplicationLagVeryHigh(ts) { - snapshots = append(snapshots, legacyTabletLagSnapshot{ - ts: ts, - replag: ts.Stats.ReplicationLagSeconds}) - } - } - if len(snapshots) == 0 { - // We get here if all tablets are over the high - // replication lag threshold, and their lag is - // different enough that the 70% mean computation up - // there didn't find them all in a group. For - // instance, if *minNumTablets = 2, and we have two - // tablets with lag of 3h and 30h. In that case, we - // just use them all. - for _, ts := range list { - snapshots = append(snapshots, legacyTabletLagSnapshot{ - ts: ts, - replag: ts.Stats.ReplicationLagSeconds}) - } - } - - // Sort by replication lag. - sort.Sort(byLegacyReplag(snapshots)) - - // Pick the first minNumTablets tablets. - res = make([]*LegacyTabletStats, 0, *minNumTablets) - for i := 0; i < min(*minNumTablets, len(snapshots)); i++ { - res = append(res, snapshots[i].ts) - } - return res -} - -type legacyTabletLagSnapshot struct { - ts *LegacyTabletStats - replag uint32 -} -type byLegacyReplag []legacyTabletLagSnapshot - -func (a byLegacyReplag) Len() int { return len(a) } -func (a byLegacyReplag) Swap(i, j int) { a[i], a[j] = a[j], a[i] } -func (a byLegacyReplag) Less(i, j int) bool { return a[i].replag < a[j].replag } - -// mean calculates the mean value over the given list, -// while excluding the item with the specified index. -func legacyMean(tabletStatsList []*LegacyTabletStats, idxExclude int) (uint64, error) { - var sum uint64 - var count uint64 - for i, ts := range tabletStatsList { - if i == idxExclude { - continue - } - sum = sum + uint64(ts.Stats.ReplicationLagSeconds) - count++ - } - if count == 0 { - return 0, fmt.Errorf("empty list") - } - return sum / count, nil -} diff --git a/go/vt/discovery/legacy_replicationlag_test.go b/go/vt/discovery/legacy_replicationlag_test.go deleted file mode 100644 index 0033b0370ee..00000000000 --- a/go/vt/discovery/legacy_replicationlag_test.go +++ /dev/null @@ -1,370 +0,0 @@ -/* -Copyright 2019 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package discovery - -import ( - "fmt" - "testing" - - querypb "vitess.io/vitess/go/vt/proto/query" - "vitess.io/vitess/go/vt/topo" -) - -// testSetLegacyReplicationLagAlgorithm is a test helper function, if this is used by a production code path, something is wrong. -func testSetLegacyReplicationLagAlgorithm(newLegacy bool) { - *legacyReplicationLagAlgorithm = newLegacy -} - -func TestFilterLegacyStatsByReplicationLagUnhealthy(t *testing.T) { - // 1 healthy serving tablet, 1 not healhty - ts1 := &LegacyTabletStats{ - Tablet: topo.NewTablet(1, "cell", "host1"), - Serving: true, - Stats: &querypb.RealtimeStats{}, - } - ts2 := &LegacyTabletStats{ - Tablet: topo.NewTablet(2, "cell", "host2"), - Serving: false, - Stats: &querypb.RealtimeStats{}, - } - got := FilterLegacyStatsByReplicationLag([]*LegacyTabletStats{ts1, ts2}) - if len(got) != 1 { - t.Errorf("len(FilterLegacyStatsByReplicationLag([{Tablet: {Uid: 1}, Serving: true}, {Tablet: {Uid: 2}, Serving: false}])) = %v, want 1", len(got)) - } - if len(got) > 0 && !got[0].DeepEqual(ts1) { - t.Errorf("FilterLegacyStatsByReplicationLag([{Tablet: {Uid: 1}, Serving: true}, {Tablet: {Uid: 2}, Serving: false}]) = %+v, want %+v", got[0], ts1) - } -} - -func TestFilterLegacyStatsByReplicationLag(t *testing.T) { - // Use simplified logic - testSetLegacyReplicationLagAlgorithm(false) - - cases := []struct { - description string - input []uint32 - output []uint32 - }{ - { - "0 tablet", - []uint32{}, - []uint32{}, - }, - { - "lags of (1s) - return all items with low lag.", - []uint32{1}, - []uint32{1}, - }, - { - "lags of (1s, 1s, 1s, 30s) - return all items with low lag.", - []uint32{1, 1, 1, 30}, - []uint32{1, 1, 1, 30}, - }, - { - "lags of (1s, 1s, 1s, 40m, 40m, 40m) - return all items with low lag.", - []uint32{1, 1, 1, 40 * 60, 40 * 60, 40 * 60}, - []uint32{1, 1, 1}, - }, - { - "lags of (1s, 40m, 40m, 40m) - return at least 2 items if they don't have very high lag.", - []uint32{1, 40 * 60, 40 * 60, 40 * 60}, - []uint32{1, 40 * 60}, - }, - { - "lags of (30m, 35m, 40m, 45m) - return at least 2 items if they don't have very high lag.", - []uint32{30 * 60, 35 * 60, 40 * 60, 45 * 60}, - []uint32{30 * 60, 35 * 60}, - }, - { - "lags of (2h, 3h, 4h, 5h) - return <2 items if the others have very high lag.", - []uint32{2 * 60 * 60, 3 * 60 * 60, 4 * 60 * 60, 5 * 60 * 60}, - []uint32{2 * 60 * 60}, - }, - { - "lags of (3h, 30h) - return nothing if all have very high lag.", - []uint32{3 * 60 * 60, 30 * 60 * 60}, - []uint32{}, - }, - } - - for _, tc := range cases { - lts := make([]*LegacyTabletStats, len(tc.input)) - for i, lag := range tc.input { - lts[i] = &LegacyTabletStats{ - Tablet: topo.NewTablet(uint32(i+1), "cell", fmt.Sprintf("host-%vs-behind", lag)), - Serving: true, - Stats: &querypb.RealtimeStats{ReplicationLagSeconds: lag}, - } - } - got := FilterLegacyStatsByReplicationLag(lts) - if len(got) != len(tc.output) { - t.Errorf("FilterLegacyStatsByReplicationLag(%v) failed: got output:\n%v\nExpected: %v", tc.description, got, tc.output) - continue - } - for i, elag := range tc.output { - if got[i].Stats.ReplicationLagSeconds != elag { - t.Errorf("FilterLegacyStatsByReplicationLag(%v) failed: got output:\n%v\nExpected value index %v to be %v", tc.description, got, i, elag) - } - } - } - - // Reset to the default - testSetLegacyReplicationLagAlgorithm(true) -} - -func TestFilterLegacyStatysByReplicationLagWithLegacyAlgorithm(t *testing.T) { - // Use legacy algorithm by default for now - - cases := []struct { - description string - input []uint32 - output []uint32 - }{ - { - "0 tablet", - []uint32{}, - []uint32{}, - }, - { - "1 serving tablet", - []uint32{1}, - []uint32{1}, - }, - { - "lags of (1s, 1s, 1s, 30s)", - []uint32{1, 1, 1, 30}, - []uint32{1, 1, 1, 30}, - }, - { - "lags of (30m, 35m, 40m, 45m)", - []uint32{30 * 60, 35 * 60, 40 * 60, 45 * 60}, - []uint32{30 * 60, 35 * 60, 40 * 60, 45 * 60}, - }, - { - "lags of (1s, 1s, 1m, 40m, 40m) - not run filter the second time as first run removed two items.", - []uint32{1, 1, 60, 40 * 60, 40 * 60}, - []uint32{1, 1, 60}, - }, - { - "lags of (1s, 1s, 10m, 40m) - run filter twice to remove two items", - []uint32{1, 1, 10 * 60, 40 * 60}, - []uint32{1, 1}, - }, - { - "lags of (1m, 100m) - return at least 2 items to avoid overloading if the 2nd one is not delayed too much.", - []uint32{1 * 60, 100 * 60}, - []uint32{1 * 60, 100 * 60}, - }, - { - "lags of (1m, 3h) - return 1 if the 2nd one is delayed too much.", - []uint32{1 * 60, 3 * 60 * 60}, - []uint32{1 * 60}, - }, - { - "lags of (3h) - return 1 as they're all delayed too much.", - []uint32{3 * 60 * 60}, - []uint32{3 * 60 * 60}, - }, - { - "lags of (3h, 4h) - return 2 as they're all delayed too much, but still in a good group.", - []uint32{3 * 60 * 60, 4 * 60 * 60}, - []uint32{3 * 60 * 60, 4 * 60 * 60}, - }, - { - "lags of (3h, 3h, 4h) - return 3 as they're all delayed too much, but still in a good group.", - []uint32{3 * 60 * 60, 3 * 60 * 60, 4 * 60 * 60}, - []uint32{3 * 60 * 60, 3 * 60 * 60, 4 * 60 * 60}, - }, - { - "lags of (3h, 15h, 18h) - return 3 as they're all delayed too much, but still in a good group." + - "(different test case than above to show how absurb the good group logic is)", - []uint32{3 * 60 * 60, 15 * 60 * 60, 18 * 60 * 60}, - []uint32{3 * 60 * 60, 15 * 60 * 60, 18 * 60 * 60}, - }, - { - "lags of (3h, 12h, 18h) - return 2 as they're all delayed too much, but 18h is now considered an outlier." + - "(different test case than above to show how absurb the good group logic is)", - []uint32{3 * 60 * 60, 12 * 60 * 60, 18 * 60 * 60}, - []uint32{3 * 60 * 60, 12 * 60 * 60}, - }, - { - "lags of (3h, 30h) - return 2 as they're all delayed too much." + - "(different test case that before, as both tablet stats are" + - "widely different, not within 70% of eachother)", - []uint32{3 * 60 * 60, 30 * 60 * 60}, - []uint32{3 * 60 * 60, 30 * 60 * 60}, - }, - } - - for _, tc := range cases { - lts := make([]*LegacyTabletStats, len(tc.input)) - for i, lag := range tc.input { - lts[i] = &LegacyTabletStats{ - Tablet: topo.NewTablet(uint32(i+1), "cell", fmt.Sprintf("host-%vs-behind", lag)), - Serving: true, - Stats: &querypb.RealtimeStats{ReplicationLagSeconds: lag}, - } - } - got := FilterLegacyStatsByReplicationLag(lts) - if len(got) != len(tc.output) { - t.Errorf("FilterLegacyStatsByReplicationLag(%v) failed: got output:\n%v\nExpected: %v", tc.description, got, tc.output) - continue - } - for i, elag := range tc.output { - if got[i].Stats.ReplicationLagSeconds != elag { - t.Errorf("FilterLegacyStatsByReplicationLag(%v) failed: got output:\n%v\nExpected value index %v to be %v", tc.description, got, i, elag) - } - } - } -} - -func TestFilterLegacyStatsByReplicationLagThreeTabletMin(t *testing.T) { - // Use at least 3 tablets if possible - testSetMinNumTablets(3) - // lags of (1s, 1s, 10m, 11m) - returns at least32 items where the slightly delayed ones that are returned are the 10m and 11m ones. - ts1 := &LegacyTabletStats{ - Tablet: topo.NewTablet(1, "cell", "host1"), - Serving: true, - Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1}, - } - ts2 := &LegacyTabletStats{ - Tablet: topo.NewTablet(2, "cell", "host2"), - Serving: true, - Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1}, - } - ts3 := &LegacyTabletStats{ - Tablet: topo.NewTablet(3, "cell", "host3"), - Serving: true, - Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 10 * 60}, - } - ts4 := &LegacyTabletStats{ - Tablet: topo.NewTablet(4, "cell", "host4"), - Serving: true, - Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 11 * 60}, - } - got := FilterLegacyStatsByReplicationLag([]*LegacyTabletStats{ts1, ts2, ts3, ts4}) - if len(got) != 3 || !got[0].DeepEqual(ts1) || !got[1].DeepEqual(ts2) || !got[2].DeepEqual(ts3) { - t.Errorf("FilterLegacyStatsByReplicationLag([1s, 1s, 10m, 11m]) = %+v, want [1s, 1s, 10m]", got) - } - // lags of (11m, 10m, 1s, 1s) - reordered tablets returns the same 3 items where the slightly delayed one that is returned is the 10m and 11m ones. - ts1 = &LegacyTabletStats{ - Tablet: topo.NewTablet(1, "cell", "host1"), - Serving: true, - Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 11 * 60}, - } - ts2 = &LegacyTabletStats{ - Tablet: topo.NewTablet(2, "cell", "host2"), - Serving: true, - Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 10 * 60}, - } - ts3 = &LegacyTabletStats{ - Tablet: topo.NewTablet(3, "cell", "host3"), - Serving: true, - Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1}, - } - ts4 = &LegacyTabletStats{ - Tablet: topo.NewTablet(4, "cell", "host4"), - Serving: true, - Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1}, - } - got = FilterLegacyStatsByReplicationLag([]*LegacyTabletStats{ts1, ts2, ts3, ts4}) - if len(got) != 3 || !got[0].DeepEqual(ts3) || !got[1].DeepEqual(ts4) || !got[2].DeepEqual(ts2) { - t.Errorf("FilterLegacyStatsByReplicationLag([1s, 1s, 10m, 11m]) = %+v, want [1s, 1s, 10m]", got) - } - // Reset to the default - testSetMinNumTablets(2) -} - -func TestFilterByReplicationLagOneTabletMin(t *testing.T) { - // Use at least 1 tablets if possible - testSetMinNumTablets(1) - // lags of (1s, 100m) - return only healthy tablet if that is all that is available. - ts1 := &LegacyTabletStats{ - Tablet: topo.NewTablet(1, "cell", "host1"), - Serving: true, - Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1}, - } - ts2 := &LegacyTabletStats{ - Tablet: topo.NewTablet(2, "cell", "host2"), - Serving: true, - Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 100 * 60}, - } - got := FilterLegacyStatsByReplicationLag([]*LegacyTabletStats{ts1, ts2}) - if len(got) != 1 || !got[0].DeepEqual(ts1) { - t.Errorf("FilterLegacyStatsByReplicationLag([1s, 100m]) = %+v, want [1s]", got) - } - // lags of (1m, 100m) - return only healthy tablet if that is all that is healthy enough. - ts1 = &LegacyTabletStats{ - Tablet: topo.NewTablet(1, "cell", "host1"), - Serving: true, - Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1 * 60}, - } - ts2 = &LegacyTabletStats{ - Tablet: topo.NewTablet(2, "cell", "host2"), - Serving: true, - Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 100 * 60}, - } - got = FilterLegacyStatsByReplicationLag([]*LegacyTabletStats{ts1, ts2}) - if len(got) != 1 || !got[0].DeepEqual(ts1) { - t.Errorf("FilterLegacyStatsByReplicationLag([1m, 100m]) = %+v, want [1m]", got) - } - // Reset to the default - testSetMinNumTablets(2) -} - -func TestTrivialLegacyStatsUpdate(t *testing.T) { - // Note the healthy threshold is set to 30s. - cases := []struct { - o uint32 - n uint32 - expected bool - }{ - // both are under 30s - {o: 0, n: 1, expected: true}, - {o: 15, n: 20, expected: true}, - - // one is under 30s, the other isn't - {o: 2, n: 40, expected: false}, - {o: 40, n: 10, expected: false}, - - // both are over 30s, but close enough - {o: 100, n: 100, expected: true}, - {o: 100, n: 105, expected: true}, - {o: 105, n: 100, expected: true}, - - // both are over 30s, but too far - {o: 100, n: 120, expected: false}, - {o: 120, n: 100, expected: false}, - } - - for _, c := range cases { - o := &LegacyTabletStats{ - Stats: &querypb.RealtimeStats{ - ReplicationLagSeconds: c.o, - }, - } - n := &LegacyTabletStats{ - Stats: &querypb.RealtimeStats{ - ReplicationLagSeconds: c.n, - }, - } - got := o.TrivialStatsUpdate(n) - if got != c.expected { - t.Errorf("TrivialStatsUpdate(%v, %v) = %v, expected %v", c.o, c.n, got, c.expected) - } - } -} diff --git a/go/vt/discovery/legacy_tablet_stats_cache.go b/go/vt/discovery/legacy_tablet_stats_cache.go deleted file mode 100644 index fb642c596a9..00000000000 --- a/go/vt/discovery/legacy_tablet_stats_cache.go +++ /dev/null @@ -1,313 +0,0 @@ -/* -Copyright 2019 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package discovery - -import ( - "sync" - - "context" - - "vitess.io/vitess/go/vt/log" - querypb "vitess.io/vitess/go/vt/proto/query" - topodatapb "vitess.io/vitess/go/vt/proto/topodata" - "vitess.io/vitess/go/vt/topo" - "vitess.io/vitess/go/vt/topo/topoproto" -) - -// LegacyTabletStatsCache is a LegacyHealthCheckStatsListener that keeps both the -// current list of available LegacyTabletStats, and a serving list: -// - for primary tablets, only the current primary is kept. -// - for non-primary tablets, we filter the list using FilterLegacyStatsByReplicationLag. -// It keeps entries for all tablets in the cell(s) it's configured to serve for, -// and for the primary independently of which cell it's in. -// Note the healthy tablet computation is done when we receive a tablet -// update only, not at serving time. -// Also note the cache may not have the last entry received by the tablet. -// For instance, if a tablet was healthy, and is still healthy, we do not -// keep its new update. -type LegacyTabletStatsCache struct { - // cell is the cell we are keeping all tablets for. - // Note we keep track of all primary tablets in all cells. - cell string - // ts is the topo server in use. - ts *topo.Server - // mu protects the following fields. It does not protect individual - // entries in the entries map. - mu sync.RWMutex - // entries maps from keyspace/shard/tabletType to our cache. - entries map[string]map[string]map[topodatapb.TabletType]*legacyTabletStatsCacheEntry - // cellAliases is a cache of cell aliases - cellAliases map[string]string -} - -// legacyTabletStatsCacheEntry is the per keyspace/shard/tabletType -// entry of the in-memory map for LegacyTabletStatsCache. -type legacyTabletStatsCacheEntry struct { - // mu protects the rest of this structure. - mu sync.RWMutex - // all has the valid tablets, indexed by TabletToMapKey(ts.Tablet), - // as it is the index used by LegacyHealthCheck. - all map[string]*LegacyTabletStats - // healthy only has the healthy ones. - healthy []*LegacyTabletStats -} - -func (e *legacyTabletStatsCacheEntry) updateHealthyMapForPrimary(ts *LegacyTabletStats) { - if ts.Up { - // We have an Up primary. - if len(e.healthy) == 0 { - // We have a new Up server, just remember it. - e.healthy = append(e.healthy, ts) - return - } - - // We already have one up server, see if we - // need to replace it. - if ts.TabletExternallyReparentedTimestamp < e.healthy[0].TabletExternallyReparentedTimestamp { - log.Warningf("not marking healthy primary %s as Up for %s because its externally reparented timestamp is smaller than the highest known timestamp from previous MASTERs %s: %d < %d ", - topoproto.TabletAliasString(ts.Tablet.Alias), - topoproto.KeyspaceShardString(ts.Target.Keyspace, ts.Target.Shard), - topoproto.TabletAliasString(e.healthy[0].Tablet.Alias), - ts.TabletExternallyReparentedTimestamp, - e.healthy[0].TabletExternallyReparentedTimestamp) - return - } - - // Just replace it. - e.healthy[0] = ts - return - } - - // We have a Down primary, remove it only if it's exactly the same. - if len(e.healthy) != 0 { - if ts.Key == e.healthy[0].Key { - // Same guy, remove it. - e.healthy = nil - } - } -} - -// NewLegacyTabletStatsCache creates a LegacyTabletStatsCache, and registers -// it as LegacyHealthCheckStatsListener of the provided healthcheck. -// Note we do the registration in this code to guarantee we call -// SetListener with sendDownEvents=true, as we need these events -// to maintain the integrity of our cache. -func NewLegacyTabletStatsCache(hc LegacyHealthCheck, ts *topo.Server, cell string) *LegacyTabletStatsCache { - return newLegacyTabletStatsCache(hc, ts, cell, true /* setListener */) -} - -// NewTabletStatsCacheDoNotSetListener is identical to NewLegacyTabletStatsCache -// but does not automatically set the returned object as listener for "hc". -// Instead, it's up to the caller to ensure that LegacyTabletStatsCache.StatsUpdate() -// gets called properly. This is useful for chaining multiple listeners. -// When the caller sets its own listener on "hc", they must make sure that they -// set the parameter "sendDownEvents" to "true" or this cache won't properly -// remove tablets whose tablet type changes. -func NewTabletStatsCacheDoNotSetListener(ts *topo.Server, cell string) *LegacyTabletStatsCache { - return newLegacyTabletStatsCache(nil, ts, cell, false /* setListener */) -} - -func newLegacyTabletStatsCache(hc LegacyHealthCheck, ts *topo.Server, cell string, setListener bool) *LegacyTabletStatsCache { - tc := &LegacyTabletStatsCache{ - cell: cell, - ts: ts, - entries: make(map[string]map[string]map[topodatapb.TabletType]*legacyTabletStatsCacheEntry), - cellAliases: make(map[string]string), - } - - if setListener { - // We need to set sendDownEvents=true to get the deletes from the map - // upon type change. - hc.SetListener(tc, true /*sendDownEvents*/) - } - return tc -} - -// getEntry returns an existing legacyTabletStatsCacheEntry in the cache, or nil -// if the entry does not exist. It only takes a Read lock on mu. -func (tc *LegacyTabletStatsCache) getEntry(keyspace, shard string, tabletType topodatapb.TabletType) *legacyTabletStatsCacheEntry { - tc.mu.RLock() - defer tc.mu.RUnlock() - - if s, ok := tc.entries[keyspace]; ok { - if t, ok := s[shard]; ok { - if e, ok := t[tabletType]; ok { - return e - } - } - } - return nil -} - -// getOrCreateEntry returns an existing legacyTabletStatsCacheEntry from the cache, -// or creates it if it doesn't exist. -func (tc *LegacyTabletStatsCache) getOrCreateEntry(target *querypb.Target) *legacyTabletStatsCacheEntry { - // Fast path (most common path too): Read-lock, return the entry. - if e := tc.getEntry(target.Keyspace, target.Shard, target.TabletType); e != nil { - return e - } - - // Slow path: Lock, will probably have to add the entry at some level. - tc.mu.Lock() - defer tc.mu.Unlock() - - s, ok := tc.entries[target.Keyspace] - if !ok { - s = make(map[string]map[topodatapb.TabletType]*legacyTabletStatsCacheEntry) - tc.entries[target.Keyspace] = s - } - t, ok := s[target.Shard] - if !ok { - t = make(map[topodatapb.TabletType]*legacyTabletStatsCacheEntry) - s[target.Shard] = t - } - e, ok := t[target.TabletType] - if !ok { - e = &legacyTabletStatsCacheEntry{ - all: make(map[string]*LegacyTabletStats), - } - t[target.TabletType] = e - } - return e -} - -func (tc *LegacyTabletStatsCache) getAliasByCell(cell string) string { - tc.mu.Lock() - defer tc.mu.Unlock() - - if alias, ok := tc.cellAliases[cell]; ok { - return alias - } - - alias := topo.GetAliasByCell(context.Background(), tc.ts, cell) - tc.cellAliases[cell] = alias - - return alias -} - -// StatsUpdate is part of the LegacyHealthCheckStatsListener interface. -func (tc *LegacyTabletStatsCache) StatsUpdate(ts *LegacyTabletStats) { - if ts.Target.TabletType != topodatapb.TabletType_PRIMARY && - ts.Tablet.Alias.Cell != tc.cell && - tc.getAliasByCell(ts.Tablet.Alias.Cell) != tc.getAliasByCell(tc.cell) { - // this is for a non-primary tablet in a different cell and a different alias, drop it - return - } - - e := tc.getOrCreateEntry(ts.Target) - e.mu.Lock() - defer e.mu.Unlock() - - // Update our full map. - trivialNonPrimaryUpdate := false - if existing, ok := e.all[ts.Key]; ok { - if ts.Up { - // We have an existing entry, and a new entry. - // Remember if they are both good (most common case). - trivialNonPrimaryUpdate = existing.LastError == nil && existing.Serving && ts.LastError == nil && - ts.Serving && ts.Target.TabletType != topodatapb.TabletType_PRIMARY && existing.TrivialStatsUpdate(ts) - - // We already have the entry, update the - // values if necessary. (will update both - // 'all' and 'healthy' as they use pointers). - if !trivialNonPrimaryUpdate { - *existing = *ts - } - } else { - // We have an entry which we shouldn't. Remove it. - delete(e.all, ts.Key) - } - } else { - if ts.Up { - // Add the entry. - e.all[ts.Key] = ts - } else { - // We were told to remove an entry which we - // didn't have anyway, nothing should happen. - return - } - } - - // Update our healthy list. - var allArray []*LegacyTabletStats - if ts.Target.TabletType == topodatapb.TabletType_PRIMARY { - // The healthy list is different for TabletType_PRIMARY: we - // only keep the most recent one. - e.updateHealthyMapForPrimary(ts) - } else { - // For non-primary, if it is a trivial update, - // we just skip everything else. We don't even update the - // aggregate stats. - if trivialNonPrimaryUpdate { - return - } - - // Now we need to do some work. Recompute our healthy list. - allArray = make([]*LegacyTabletStats, 0, len(e.all)) - for _, s := range e.all { - allArray = append(allArray, s) - } - e.healthy = FilterLegacyStatsByReplicationLag(allArray) - } -} - -// GetTabletStats returns the full list of available targets. -// The returned array is owned by the caller. -func (tc *LegacyTabletStatsCache) GetTabletStats(keyspace, shard string, tabletType topodatapb.TabletType) []LegacyTabletStats { - e := tc.getEntry(keyspace, shard, tabletType) - if e == nil { - return nil - } - - e.mu.RLock() - defer e.mu.RUnlock() - result := make([]LegacyTabletStats, 0, len(e.all)) - for _, s := range e.all { - result = append(result, *s) - } - return result -} - -// GetHealthyTabletStats returns only the healthy targets. -// The returned array is owned by the caller. -// For TabletType_PRIMARY, this will only return at most one entry, -// the most recent tablet of type primary. -func (tc *LegacyTabletStatsCache) GetHealthyTabletStats(keyspace, shard string, tabletType topodatapb.TabletType) []LegacyTabletStats { - e := tc.getEntry(keyspace, shard, tabletType) - if e == nil { - return nil - } - - e.mu.RLock() - defer e.mu.RUnlock() - result := make([]LegacyTabletStats, len(e.healthy)) - for i, ts := range e.healthy { - result[i] = *ts - } - return result -} - -// ResetForTesting is for use in tests only. -func (tc *LegacyTabletStatsCache) ResetForTesting() { - tc.mu.Lock() - defer tc.mu.Unlock() - - tc.entries = make(map[string]map[string]map[topodatapb.TabletType]*legacyTabletStatsCacheEntry) -} - -// Compile-time interface check. -var _ LegacyHealthCheckStatsListener = (*LegacyTabletStatsCache)(nil) diff --git a/go/vt/discovery/legacy_tablet_stats_cache_test.go b/go/vt/discovery/legacy_tablet_stats_cache_test.go deleted file mode 100644 index eabbb38ffa5..00000000000 --- a/go/vt/discovery/legacy_tablet_stats_cache_test.go +++ /dev/null @@ -1,280 +0,0 @@ -/* -Copyright 2019 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package discovery - -import ( - "context" - "testing" - - "vitess.io/vitess/go/vt/log" - - "vitess.io/vitess/go/vt/topo" - "vitess.io/vitess/go/vt/topo/memorytopo" - - querypb "vitess.io/vitess/go/vt/proto/query" - topodatapb "vitess.io/vitess/go/vt/proto/topodata" -) - -// TestTabletStatsCache tests the functionality of the LegacyTabletStatsCache class. -func TestLegacyTabletStatsCache(t *testing.T) { - ts := memorytopo.NewServer("cell", "cell1", "cell2") - - cellsAlias := &topodatapb.CellsAlias{ - Cells: []string{"cell", "cell1"}, - } - - if err := ts.CreateCellsAlias(context.Background(), "region1", cellsAlias); err != nil { - log.Errorf("creating cellsAlias \"region1\" failed: %v", err) - } - - defer deleteCellsAlias(t, ts, "region1") - - cellsAlias = &topodatapb.CellsAlias{ - Cells: []string{"cell2"}, - } - - if err := ts.CreateCellsAlias(context.Background(), "region2", cellsAlias); err != nil { - log.Errorf("creating cellsAlias \"region2\" failed: %v", err) - } - - defer deleteCellsAlias(t, ts, "region2") - - // We want to unit test LegacyTabletStatsCache without a full-blown - // LegacyHealthCheck object, so we can't call NewLegacyTabletStatsCache. - // So we just construct this object here. - tsc := &LegacyTabletStatsCache{ - cell: "cell", - ts: ts, - entries: make(map[string]map[string]map[topodatapb.TabletType]*legacyTabletStatsCacheEntry), - cellAliases: make(map[string]string), - } - - // empty - a := tsc.GetTabletStats("k", "s", topodatapb.TabletType_PRIMARY) - if len(a) != 0 { - t.Errorf("wrong result, expected empty list: %v", a) - } - - // add a tablet - tablet1 := topo.NewTablet(10, "cell", "host1") - ts1 := &LegacyTabletStats{ - Key: "t1", - Tablet: tablet1, - Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, - Up: true, - Serving: true, - Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2}, - } - tsc.StatsUpdate(ts1) - - // check it's there - a = tsc.GetTabletStats("k", "s", topodatapb.TabletType_REPLICA) - if len(a) != 1 || !ts1.DeepEqual(&a[0]) { - t.Errorf("unexpected result: %v", a) - } - a = tsc.GetHealthyTabletStats("k", "s", topodatapb.TabletType_REPLICA) - if len(a) != 1 || !ts1.DeepEqual(&a[0]) { - t.Errorf("unexpected result: %v", a) - } - - // update stats with a change that won't change health array - stillHealthyTs1 := &LegacyTabletStats{ - Key: "t1", - Tablet: tablet1, - Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, - Up: true, - Serving: true, - Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 2, CpuUsage: 0.2}, - } - tsc.StatsUpdate(stillHealthyTs1) - - // check the previous ts1 is still there, as the new one is ignored. - a = tsc.GetTabletStats("k", "s", topodatapb.TabletType_REPLICA) - if len(a) != 1 || !ts1.DeepEqual(&a[0]) { - t.Errorf("unexpected result: %v", a) - } - a = tsc.GetHealthyTabletStats("k", "s", topodatapb.TabletType_REPLICA) - if len(a) != 1 || !ts1.DeepEqual(&a[0]) { - t.Errorf("unexpected result: %v", a) - } - - // update stats with a change that will change arrays - notHealthyTs1 := &LegacyTabletStats{ - Key: "t1", - Tablet: tablet1, - Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, - Up: true, - Serving: true, - Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 35, CpuUsage: 0.2}, - } - tsc.StatsUpdate(notHealthyTs1) - - // check it's there - a = tsc.GetTabletStats("k", "s", topodatapb.TabletType_REPLICA) - if len(a) != 1 || !notHealthyTs1.DeepEqual(&a[0]) { - t.Errorf("unexpected result: %v", a) - } - a = tsc.GetHealthyTabletStats("k", "s", topodatapb.TabletType_REPLICA) - if len(a) != 1 || !notHealthyTs1.DeepEqual(&a[0]) { - t.Errorf("unexpected result: %v", a) - } - - // add a second tablet - tablet2 := topo.NewTablet(11, "cell", "host2") - ts2 := &LegacyTabletStats{ - Key: "t2", - Tablet: tablet2, - Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, - Up: true, - Serving: true, - Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 10, CpuUsage: 0.2}, - } - tsc.StatsUpdate(ts2) - - // check it's there - a = tsc.GetTabletStats("k", "s", topodatapb.TabletType_REPLICA) - if len(a) != 2 { - t.Errorf("unexpected result: %v", a) - } else { - if a[0].Tablet.Alias.Uid == 11 { - a[0], a[1] = a[1], a[0] - } - if !ts1.DeepEqual(&a[0]) || !ts2.DeepEqual(&a[1]) { - t.Errorf("unexpected result: %v", a) - } - } - a = tsc.GetHealthyTabletStats("k", "s", topodatapb.TabletType_REPLICA) - if len(a) != 2 { - t.Errorf("unexpected result: %v", a) - } else { - if a[0].Tablet.Alias.Uid == 11 { - a[0], a[1] = a[1], a[0] - } - if !ts1.DeepEqual(&a[0]) || !ts2.DeepEqual(&a[1]) { - t.Errorf("unexpected result: %v", a) - } - } - - // one tablet goes unhealthy - ts2.Serving = false - tsc.StatsUpdate(ts2) - - // check we only have one left in healthy version - a = tsc.GetTabletStats("k", "s", topodatapb.TabletType_REPLICA) - if len(a) != 2 { - t.Errorf("unexpected result: %v", a) - } else { - if a[0].Tablet.Alias.Uid == 11 { - a[0], a[1] = a[1], a[0] - } - if !ts1.DeepEqual(&a[0]) || !ts2.DeepEqual(&a[1]) { - t.Errorf("unexpected result: %v", a) - } - } - a = tsc.GetHealthyTabletStats("k", "s", topodatapb.TabletType_REPLICA) - if len(a) != 1 || !ts1.DeepEqual(&a[0]) { - t.Errorf("unexpected result: %v", a) - } - - // second tablet turns into a primary, we receive down + up - ts2.Serving = true - ts2.Up = false - tsc.StatsUpdate(ts2) - ts2.Up = true - ts2.Target.TabletType = topodatapb.TabletType_PRIMARY - ts2.TabletExternallyReparentedTimestamp = 10 - tsc.StatsUpdate(ts2) - - // check we only have one replica left - a = tsc.GetTabletStats("k", "s", topodatapb.TabletType_REPLICA) - if len(a) != 1 || !ts1.DeepEqual(&a[0]) { - t.Errorf("unexpected result: %v", a) - } - - // check we have a primary now - a = tsc.GetTabletStats("k", "s", topodatapb.TabletType_PRIMARY) - if len(a) != 1 || !ts2.DeepEqual(&a[0]) { - t.Errorf("unexpected result: %v", a) - } - - // reparent: old replica goes into primary - ts1.Up = false - tsc.StatsUpdate(ts1) - ts1.Up = true - ts1.Target.TabletType = topodatapb.TabletType_PRIMARY - ts1.TabletExternallyReparentedTimestamp = 20 - tsc.StatsUpdate(ts1) - - // check we lost all replicas, and primary is new one - a = tsc.GetTabletStats("k", "s", topodatapb.TabletType_REPLICA) - if len(a) != 0 { - t.Errorf("unexpected result: %v", a) - } - a = tsc.GetHealthyTabletStats("k", "s", topodatapb.TabletType_PRIMARY) - if len(a) != 1 || !ts1.DeepEqual(&a[0]) { - t.Errorf("unexpected result: %v", a) - } - - // old primary sending an old ping should be ignored - tsc.StatsUpdate(ts2) - a = tsc.GetHealthyTabletStats("k", "s", topodatapb.TabletType_PRIMARY) - if len(a) != 1 || !ts1.DeepEqual(&a[0]) { - t.Errorf("unexpected result: %v", a) - } - - // add a third tablet as replica in diff cell, same region - tablet3 := topo.NewTablet(12, "cell1", "host3") - ts3 := &LegacyTabletStats{ - Key: "t3", - Tablet: tablet3, - Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, - Up: true, - Serving: true, - Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 10, CpuUsage: 0.2}, - } - tsc.StatsUpdate(ts3) - // check it's there - a = tsc.GetTabletStats("k", "s", topodatapb.TabletType_REPLICA) - if len(a) != 1 { - t.Errorf("unexpected result: %v", a) - } - a = tsc.GetHealthyTabletStats("k", "s", topodatapb.TabletType_REPLICA) - if len(a) != 1 { - t.Errorf("unexpected result: %v", a) - } - - // add a 4th replica tablet in a diff cell, diff region - tablet4 := topo.NewTablet(13, "cell2", "host4") - ts4 := &LegacyTabletStats{ - Key: "t4", - Tablet: tablet4, - Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, - Up: true, - Serving: true, - Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 10, CpuUsage: 0.2}, - } - tsc.StatsUpdate(ts4) - // check it's *NOT* there - a = tsc.GetTabletStats("k", "s", topodatapb.TabletType_REPLICA) - if len(a) != 1 { - t.Errorf("unexpected result: %v", a) - } - a = tsc.GetHealthyTabletStats("k", "s", topodatapb.TabletType_REPLICA) - if len(a) != 1 { - t.Errorf("unexpected result: %v", a) - } -} diff --git a/go/vt/discovery/legacy_tablet_stats_cache_wait.go b/go/vt/discovery/legacy_tablet_stats_cache_wait.go deleted file mode 100644 index 8590051be88..00000000000 --- a/go/vt/discovery/legacy_tablet_stats_cache_wait.go +++ /dev/null @@ -1,120 +0,0 @@ -/* -Copyright 2019 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package discovery - -import ( - "time" - - "context" - - "vitess.io/vitess/go/vt/log" - querypb "vitess.io/vitess/go/vt/proto/query" - topodatapb "vitess.io/vitess/go/vt/proto/topodata" -) - -var ( - // How much to sleep between each check. - waitAvailableTabletInterval = 100 * time.Millisecond -) - -// WaitForTablets waits for at least one tablet in the given -// keyspace / shard / tablet type before returning. The tablets do not -// have to be healthy. It will return ctx.Err() if the context is canceled. -func (tc *LegacyTabletStatsCache) WaitForTablets(ctx context.Context, keyspace, shard string, tabletType topodatapb.TabletType) error { - targets := []*querypb.Target{ - { - Keyspace: keyspace, - Shard: shard, - TabletType: tabletType, - }, - } - return tc.waitForTablets(ctx, targets, false) -} - -// WaitForAllServingTablets waits for at least one healthy serving tablet in -// each given target before returning. -// It will return ctx.Err() if the context is canceled. -// It will return an error if it can't read the necessary topology records. -func (tc *LegacyTabletStatsCache) WaitForAllServingTablets(ctx context.Context, targets []*querypb.Target) error { - return tc.waitForTablets(ctx, targets, true) -} - -// waitForTablets is the internal method that polls for tablets. -func (tc *LegacyTabletStatsCache) waitForTablets(ctx context.Context, targets []*querypb.Target, requireServing bool) error { - for { - // We nil targets as we find them. - allPresent := true - for i, target := range targets { - if target == nil { - continue - } - - var stats []LegacyTabletStats - if requireServing { - stats = tc.GetHealthyTabletStats(target.Keyspace, target.Shard, target.TabletType) - } else { - stats = tc.GetTabletStats(target.Keyspace, target.Shard, target.TabletType) - } - if len(stats) == 0 { - allPresent = false - } else { - targets[i] = nil - } - } - - if allPresent { - // we found everything we needed - return nil - } - - // Unblock after the sleep or when the context has expired. - timer := time.NewTimer(waitAvailableTabletInterval) - select { - case <-ctx.Done(): - for _, target := range targets { - if target != nil { - log.Infof("couldn't find tablets for target: %v", target) - } - } - timer.Stop() - return ctx.Err() - case <-timer.C: - } - } -} - -// WaitByFilter waits for at least one tablet based on the filter function. -func (tc *LegacyTabletStatsCache) WaitByFilter(ctx context.Context, keyspace, shard string, tabletTypes []topodatapb.TabletType, filter func([]LegacyTabletStats) []LegacyTabletStats) error { - for { - for _, tt := range tabletTypes { - stats := tc.GetTabletStats(keyspace, shard, tt) - stats = filter(stats) - if len(stats) > 0 { - return nil - } - } - - // Unblock after the sleep or when the context has expired. - timer := time.NewTimer(waitAvailableTabletInterval) - select { - case <-ctx.Done(): - timer.Stop() - return ctx.Err() - case <-timer.C: - } - } -} diff --git a/go/vt/discovery/legacy_tablet_stats_cache_wait_test.go b/go/vt/discovery/legacy_tablet_stats_cache_wait_test.go deleted file mode 100644 index ff29c0e3570..00000000000 --- a/go/vt/discovery/legacy_tablet_stats_cache_wait_test.go +++ /dev/null @@ -1,76 +0,0 @@ -/* -Copyright 2019 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package discovery - -import ( - "testing" - "time" - - "context" - - "vitess.io/vitess/go/vt/topo" - - querypb "vitess.io/vitess/go/vt/proto/query" - topodatapb "vitess.io/vitess/go/vt/proto/topodata" -) - -func TestWaitForTablets(t *testing.T) { - shortCtx, shortCancel := context.WithTimeout(context.Background(), 10*time.Millisecond) - defer shortCancel() - waitAvailableTabletInterval = 20 * time.Millisecond - - tablet := topo.NewTablet(0, "cell", "a") - tablet.PortMap["vt"] = 1 - input := make(chan *querypb.StreamHealthResponse) - createFakeConn(tablet, input) - - hc := NewLegacyHealthCheck(1*time.Millisecond, 1*time.Hour) - tsc := NewLegacyTabletStatsCache(hc, nil, "cell") - hc.AddTablet(tablet, "") - - // this should time out - if err := tsc.WaitForTablets(shortCtx, "keyspace", "shard", topodatapb.TabletType_REPLICA); err != context.DeadlineExceeded { - t.Errorf("got wrong error: %v", err) - } - - // this should fail, but return a non-timeout error - cancelledCtx, cancel := context.WithCancel(context.Background()) - cancel() - if err := tsc.WaitForTablets(cancelledCtx, "keyspace", "shard", topodatapb.TabletType_REPLICA); err == nil || err == context.DeadlineExceeded { - t.Errorf("want: non-timeout error, got: %v", err) - } - - // send the tablet in - shr := &querypb.StreamHealthResponse{ - Target: &querypb.Target{ - Keyspace: "keyspace", - Shard: "shard", - TabletType: topodatapb.TabletType_REPLICA, - }, - Serving: true, - RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2}, - } - input <- shr - - // and ask again, with longer time outs so it's not flaky - longCtx, longCancel := context.WithTimeout(context.Background(), 10*time.Second) - defer longCancel() - waitAvailableTabletInterval = 10 * time.Millisecond - if err := tsc.WaitForTablets(longCtx, "keyspace", "shard", topodatapb.TabletType_REPLICA); err != nil { - t.Errorf("got error: %v", err) - } -} diff --git a/go/vt/discovery/legacy_topology_watcher.go b/go/vt/discovery/legacy_topology_watcher.go deleted file mode 100644 index 194396df0c1..00000000000 --- a/go/vt/discovery/legacy_topology_watcher.go +++ /dev/null @@ -1,457 +0,0 @@ -/* -Copyright 2019 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package discovery - -import ( - "bytes" - "fmt" - "hash/crc32" - "sort" - "strings" - "sync" - "time" - - "context" - - "vitess.io/vitess/go/trace" - - "vitess.io/vitess/go/vt/key" - "vitess.io/vitess/go/vt/log" - "vitess.io/vitess/go/vt/topo" - "vitess.io/vitess/go/vt/topo/topoproto" - - topodatapb "vitess.io/vitess/go/vt/proto/topodata" -) - -// tabletInfo is used internally by the TopologyWatcher class -type legacyTabletInfo struct { - alias string - key string - tablet *topodatapb.Tablet -} - -// NewLegacyCellTabletsWatcher returns a LegacyTopologyWatcher that monitors all -// the tablets in a cell, and starts refreshing. -func NewLegacyCellTabletsWatcher(ctx context.Context, topoServer *topo.Server, tr LegacyTabletRecorder, cell string, refreshInterval time.Duration, refreshKnownTablets bool, topoReadConcurrency int) *LegacyTopologyWatcher { - return NewLegacyTopologyWatcher(ctx, topoServer, tr, cell, refreshInterval, refreshKnownTablets, topoReadConcurrency, func(tw *LegacyTopologyWatcher) ([]*topodatapb.TabletAlias, error) { - return tw.topoServer.GetTabletAliasesByCell(ctx, tw.cell) - }) -} - -// NewLegacyShardReplicationWatcher returns a LegacyTopologyWatcher that -// monitors the tablets in a cell/keyspace/shard, and starts refreshing. -func NewLegacyShardReplicationWatcher(ctx context.Context, topoServer *topo.Server, tr LegacyTabletRecorder, cell, keyspace, shard string, refreshInterval time.Duration, topoReadConcurrency int) *LegacyTopologyWatcher { - return NewLegacyTopologyWatcher(ctx, topoServer, tr, cell, refreshInterval, true /* refreshKnownTablets */, topoReadConcurrency, func(tw *LegacyTopologyWatcher) ([]*topodatapb.TabletAlias, error) { - sri, err := tw.topoServer.GetShardReplication(ctx, tw.cell, keyspace, shard) - switch { - case err == nil: - // we handle this case after this switch block - case topo.IsErrType(err, topo.NoNode): - // this is not an error - return nil, nil - default: - return nil, err - } - - result := make([]*topodatapb.TabletAlias, len(sri.Nodes)) - for i, node := range sri.Nodes { - result[i] = node.TabletAlias - } - return result, nil - }) -} - -// LegacyTopologyWatcher polls tablet from a configurable set of tablets -// periodically. When tablets are added / removed, it calls -// the LegacyTabletRecorder AddTablet / RemoveTablet interface appropriately. -type LegacyTopologyWatcher struct { - // set at construction time - topoServer *topo.Server - tr LegacyTabletRecorder - cell string - refreshInterval time.Duration - refreshKnownTablets bool - getTablets func(tw *LegacyTopologyWatcher) ([]*topodatapb.TabletAlias, error) - sem chan int - ctx context.Context - cancelFunc context.CancelFunc - // wg keeps track of all launched Go routines. - wg sync.WaitGroup - - // mu protects all variables below - mu sync.Mutex - // tablets contains a map of alias -> tabletInfo for all known tablets - tablets map[string]*legacyTabletInfo - // topoChecksum stores a crc32 of the tablets map and is exported as a metric - topoChecksum uint32 - // lastRefresh records the timestamp of the last topo refresh - lastRefresh time.Time - // firstLoadDone is true when first load of the topology data is done. - firstLoadDone bool - // firstLoadChan is closed when the initial loading of topology data is done. - firstLoadChan chan struct{} -} - -// NewLegacyTopologyWatcher returns a LegacyTopologyWatcher that monitors all -// the tablets in a cell, and starts refreshing. -func NewLegacyTopologyWatcher(ctx context.Context, topoServer *topo.Server, tr LegacyTabletRecorder, cell string, refreshInterval time.Duration, refreshKnownTablets bool, topoReadConcurrency int, getTablets func(tw *LegacyTopologyWatcher) ([]*topodatapb.TabletAlias, error)) *LegacyTopologyWatcher { - tw := &LegacyTopologyWatcher{ - topoServer: topoServer, - tr: tr, - cell: cell, - refreshInterval: refreshInterval, - refreshKnownTablets: refreshKnownTablets, - getTablets: getTablets, - sem: make(chan int, topoReadConcurrency), - tablets: make(map[string]*legacyTabletInfo), - } - tw.firstLoadChan = make(chan struct{}) - - // We want the span from the context, but not the cancelation that comes with it - spanContext := trace.CopySpan(context.Background(), ctx) - tw.ctx, tw.cancelFunc = context.WithCancel(spanContext) - tw.wg.Add(1) - go tw.watch() - return tw -} - -// watch polls all tablets and notifies LegacyTabletRecorder by adding/removing tablets. -func (tw *LegacyTopologyWatcher) watch() { - defer tw.wg.Done() - ticker := time.NewTicker(tw.refreshInterval) - defer ticker.Stop() - for { - tw.loadTablets() - select { - case <-tw.ctx.Done(): - return - case <-ticker.C: - } - } -} - -// loadTablets reads all tablets from topology, and updates LegacyTabletRecorder. -func (tw *LegacyTopologyWatcher) loadTablets() { - var wg sync.WaitGroup - newTablets := make(map[string]*legacyTabletInfo) - replacedTablets := make(map[string]*legacyTabletInfo) - - tabletAliases, err := tw.getTablets(tw) - topologyWatcherOperations.Add(topologyWatcherOpListTablets, 1) - if err != nil { - topologyWatcherErrors.Add(topologyWatcherOpListTablets, 1) - select { - case <-tw.ctx.Done(): - return - default: - } - log.Errorf("cannot get tablets for cell: %v: %v", tw.cell, err) - return - } - - // Accumulate a list of all known alias strings to use later - // when sorting - tabletAliasStrs := make([]string, 0, len(tabletAliases)) - - tw.mu.Lock() - for _, tAlias := range tabletAliases { - aliasStr := topoproto.TabletAliasString(tAlias) - tabletAliasStrs = append(tabletAliasStrs, aliasStr) - - if !tw.refreshKnownTablets { - if val, ok := tw.tablets[aliasStr]; ok { - newTablets[aliasStr] = val - continue - } - } - - wg.Add(1) - go func(alias *topodatapb.TabletAlias) { - defer wg.Done() - tw.sem <- 1 // Wait for active queue to drain. - tablet, err := tw.topoServer.GetTablet(tw.ctx, alias) - topologyWatcherOperations.Add(topologyWatcherOpGetTablet, 1) - <-tw.sem // Done; enable next request to run - if err != nil { - topologyWatcherErrors.Add(topologyWatcherOpGetTablet, 1) - select { - case <-tw.ctx.Done(): - return - default: - } - log.Errorf("cannot get tablet for alias %v: %v", alias, err) - return - } - tw.mu.Lock() - aliasStr := topoproto.TabletAliasString(alias) - newTablets[aliasStr] = &legacyTabletInfo{ - alias: aliasStr, - key: TabletToMapKey(tablet.Tablet), - tablet: tablet.Tablet, - } - tw.mu.Unlock() - }(tAlias) - } - - tw.mu.Unlock() - wg.Wait() - tw.mu.Lock() - - for alias, newVal := range newTablets { - if val, ok := tw.tablets[alias]; !ok { - // Check if there's a tablet with the same address key but a - // different alias. If so, replace it and keep track of the - // replaced alias to make sure it isn't removed later. - found := false - for _, otherVal := range tw.tablets { - if newVal.key == otherVal.key { - found = true - tw.tr.ReplaceTablet(otherVal.tablet, newVal.tablet, alias) - topologyWatcherOperations.Add(topologyWatcherOpReplaceTablet, 1) - replacedTablets[otherVal.alias] = newVal - } - } - if !found { - tw.tr.AddTablet(newVal.tablet, alias) - topologyWatcherOperations.Add(topologyWatcherOpAddTablet, 1) - } - - } else if val.key != newVal.key { - // Handle the case where the same tablet alias is now reporting - // a different address key. - replacedTablets[alias] = newVal - tw.tr.ReplaceTablet(val.tablet, newVal.tablet, alias) - topologyWatcherOperations.Add(topologyWatcherOpReplaceTablet, 1) - } - } - - for _, val := range tw.tablets { - if _, ok := newTablets[val.alias]; !ok { - if _, ok2 := replacedTablets[val.alias]; !ok2 { - tw.tr.RemoveTablet(val.tablet) - topologyWatcherOperations.Add(topologyWatcherOpRemoveTablet, 1) - } - } - } - tw.tablets = newTablets - if !tw.firstLoadDone { - tw.firstLoadDone = true - close(tw.firstLoadChan) - } - - // iterate through the tablets in a stable order and compute a - // checksum of the tablet map - sort.Strings(tabletAliasStrs) - var buf bytes.Buffer - for _, alias := range tabletAliasStrs { - tabletInfo, ok := tw.tablets[alias] - if ok { - buf.WriteString(alias) - buf.WriteString(tabletInfo.key) - } - } - tw.topoChecksum = crc32.ChecksumIEEE(buf.Bytes()) - tw.lastRefresh = time.Now() - - tw.mu.Unlock() -} - -// WaitForInitialTopology waits until the watcher reads all of the topology data -// for the first time and transfers the information to LegacyTabletRecorder via its -// AddTablet() method. -func (tw *LegacyTopologyWatcher) WaitForInitialTopology() error { - select { - case <-tw.ctx.Done(): - return tw.ctx.Err() - case <-tw.firstLoadChan: - return nil - } -} - -// Stop stops the watcher. It does not clean up the tablets added to LegacyTabletRecorder. -func (tw *LegacyTopologyWatcher) Stop() { - tw.cancelFunc() - // wait for watch goroutine to finish. - tw.wg.Wait() -} - -// RefreshLag returns the time since the last refresh -func (tw *LegacyTopologyWatcher) RefreshLag() time.Duration { - tw.mu.Lock() - defer tw.mu.Unlock() - - return time.Since(tw.lastRefresh) -} - -// TopoChecksum returns the checksum of the current state of the topo -func (tw *LegacyTopologyWatcher) TopoChecksum() uint32 { - tw.mu.Lock() - defer tw.mu.Unlock() - - return tw.topoChecksum -} - -// LegacyFilterByShard is a LegacyTabletRecorder filter that filters tablets by -// keyspace/shard. -type LegacyFilterByShard struct { - // tr is the underlying LegacyTabletRecorder to forward requests too - tr LegacyTabletRecorder - - // filters is a map of keyspace to filters for shards - filters map[string][]*filterShard -} - -// NewLegacyFilterByShard creates a new LegacyFilterByShard on top of an existing -// LegacyTabletRecorder. Each filter is a keyspace|shard entry, where shard -// can either be a shard name, or a keyrange. All tablets that match -// at least one keyspace|shard tuple will be forwarded to the -// underlying LegacyTabletRecorder. -func NewLegacyFilterByShard(tr LegacyTabletRecorder, filters []string) (*LegacyFilterByShard, error) { - m := make(map[string][]*filterShard) - for _, filter := range filters { - parts := strings.Split(filter, "|") - if len(parts) != 2 { - return nil, fmt.Errorf("invalid LegacyFilterByShard parameter: %v", filter) - } - - keyspace := parts[0] - shard := parts[1] - - // extract keyrange if it's a range - canonical, kr, err := topo.ValidateShardName(shard) - if err != nil { - return nil, fmt.Errorf("error parsing shard name %v: %v", shard, err) - } - - // check for duplicates - for _, c := range m[keyspace] { - if c.shard == canonical { - return nil, fmt.Errorf("duplicate %v/%v entry", keyspace, shard) - } - } - - m[keyspace] = append(m[keyspace], &filterShard{ - keyspace: keyspace, - shard: canonical, - keyRange: kr, - }) - } - - return &LegacyFilterByShard{ - tr: tr, - filters: m, - }, nil -} - -// AddTablet is part of the LegacyTabletRecorder interface. -func (fbs *LegacyFilterByShard) AddTablet(tablet *topodatapb.Tablet, name string) { - if fbs.isIncluded(tablet) { - fbs.tr.AddTablet(tablet, name) - } -} - -// RemoveTablet is part of the LegacyTabletRecorder interface. -func (fbs *LegacyFilterByShard) RemoveTablet(tablet *topodatapb.Tablet) { - if fbs.isIncluded(tablet) { - fbs.tr.RemoveTablet(tablet) - } -} - -// ReplaceTablet is part of the LegacyTabletRecorder interface. -func (fbs *LegacyFilterByShard) ReplaceTablet(old, new *topodatapb.Tablet, name string) { - if fbs.isIncluded(old) && fbs.isIncluded(new) { - fbs.tr.ReplaceTablet(old, new, name) - } -} - -// isIncluded returns true iff the tablet's keyspace and shard should be -// forwarded to the underlying LegacyTabletRecorder. -func (fbs *LegacyFilterByShard) isIncluded(tablet *topodatapb.Tablet) bool { - canonical, kr, err := topo.ValidateShardName(tablet.Shard) - if err != nil { - log.Errorf("Error parsing shard name %v, will ignore tablet: %v", tablet.Shard, err) - return false - } - - for _, c := range fbs.filters[tablet.Keyspace] { - if canonical == c.shard { - // Exact match (probably a non-sharded keyspace). - return true - } - if kr != nil && c.keyRange != nil && key.KeyRangeIncludes(c.keyRange, kr) { - // Our filter's KeyRange includes the provided KeyRange - return true - } - } - return false -} - -// LegacyFilterByKeyspace is a LegacyTabletRecorder filter that filters tablets by -// keyspace -type LegacyFilterByKeyspace struct { - tr LegacyTabletRecorder - - keyspaces map[string]bool -} - -// NewLegacyFilterByKeyspace creates a new LegacyFilterByKeyspace on top of an existing -// LegacyTabletRecorder. Each filter is a keyspace entry. All tablets that match -// a keyspace will be forwarded to the underlying LegacyTabletRecorder. -func NewLegacyFilterByKeyspace(tr LegacyTabletRecorder, selectedKeyspaces []string) *LegacyFilterByKeyspace { - m := make(map[string]bool) - for _, keyspace := range selectedKeyspaces { - m[keyspace] = true - } - - return &LegacyFilterByKeyspace{ - tr: tr, - keyspaces: m, - } -} - -// AddTablet is part of the LegacyTabletRecorder interface. -func (fbk *LegacyFilterByKeyspace) AddTablet(tablet *topodatapb.Tablet, name string) { - if fbk.isIncluded(tablet) { - fbk.tr.AddTablet(tablet, name) - } -} - -// RemoveTablet is part of the LegacyTabletRecorder interface. -func (fbk *LegacyFilterByKeyspace) RemoveTablet(tablet *topodatapb.Tablet) { - if fbk.isIncluded(tablet) { - fbk.tr.RemoveTablet(tablet) - } -} - -// ReplaceTablet is part of the LegacyTabletRecorder interface. -func (fbk *LegacyFilterByKeyspace) ReplaceTablet(old *topodatapb.Tablet, new *topodatapb.Tablet, name string) { - if old.Keyspace != new.Keyspace { - log.Errorf("Error replacing old tablet in %v with new tablet in %v", old.Keyspace, new.Keyspace) - return - } - - if fbk.isIncluded(new) { - fbk.tr.ReplaceTablet(old, new, name) - } -} - -// isIncluded returns true if the tablet's keyspace should be -// forwarded to the underlying LegacyTabletRecorder. -func (fbk *LegacyFilterByKeyspace) isIncluded(tablet *topodatapb.Tablet) bool { - _, exist := fbk.keyspaces[tablet.Keyspace] - return exist -} diff --git a/go/vt/discovery/legacy_topology_watcher_test.go b/go/vt/discovery/legacy_topology_watcher_test.go deleted file mode 100644 index ed831eddeda..00000000000 --- a/go/vt/discovery/legacy_topology_watcher_test.go +++ /dev/null @@ -1,490 +0,0 @@ -/* -Copyright 2019 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package discovery - -import ( - "math/rand" - "testing" - "time" - - "context" - - "google.golang.org/protobuf/proto" - - "vitess.io/vitess/go/vt/logutil" - topodatapb "vitess.io/vitess/go/vt/proto/topodata" - "vitess.io/vitess/go/vt/topo" - "vitess.io/vitess/go/vt/topo/memorytopo" -) - -func checkLegacyOpCounts(t *testing.T, tw *LegacyTopologyWatcher, prevCounts, deltas map[string]int64) map[string]int64 { - t.Helper() - newCounts := topologyWatcherOperations.Counts() - for key, prevVal := range prevCounts { - delta, ok := deltas[key] - if !ok { - delta = 0 - } - newVal, ok := newCounts[key] - if !ok { - newVal = 0 - } - - if newVal != prevVal+delta { - t.Errorf("expected %v to increase by %v, got %v -> %v", key, delta, prevVal, newVal) - } - } - return newCounts -} - -func checkLegacyChecksum(t *testing.T, tw *LegacyTopologyWatcher, want uint32) { - t.Helper() - got := tw.TopoChecksum() - if want != got { - t.Errorf("want checksum %v got %v", want, got) - } -} - -func TestLegacyCellTabletsWatcher(t *testing.T) { - checkLegacyWatcher(t, true, true) -} - -func TestLegacyCellTabletsWatcherNoRefreshKnown(t *testing.T) { - checkLegacyWatcher(t, true, false) -} - -func TestLegacyShardReplicationWatcher(t *testing.T) { - checkLegacyWatcher(t, false, true) -} - -func checkLegacyWatcher(t *testing.T, cellTablets, refreshKnownTablets bool) { - ts := memorytopo.NewServer("aa") - fhc := NewFakeLegacyHealthCheck() - logger := logutil.NewMemoryLogger() - topologyWatcherOperations.ZeroAll() - counts := topologyWatcherOperations.Counts() - var tw *LegacyTopologyWatcher - if cellTablets { - tw = NewLegacyCellTabletsWatcher(context.Background(), ts, fhc, "aa", 10*time.Minute, refreshKnownTablets, 5) - } else { - tw = NewLegacyShardReplicationWatcher(context.Background(), ts, fhc, "aa", "keyspace", "shard", 10*time.Minute, 5) - } - - // Wait for the initial topology load to finish. Otherwise we - // have a background loadTablets() that's running, and it can - // interact with our tests in weird ways. - if err := tw.WaitForInitialTopology(); err != nil { - t.Fatalf("initial WaitForInitialTopology failed") - } - counts = checkLegacyOpCounts(t, tw, counts, map[string]int64{"ListTablets": 1}) - checkLegacyChecksum(t, tw, 0) - - // Add a tablet to the topology. - tablet := &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{ - Cell: "aa", - Uid: 0, - }, - Hostname: "host1", - PortMap: map[string]int32{ - "vt": 123, - }, - Keyspace: "keyspace", - Shard: "shard", - } - if err := ts.CreateTablet(context.Background(), tablet); err != nil { - t.Fatalf("CreateTablet failed: %v", err) - } - tw.loadTablets() - counts = checkLegacyOpCounts(t, tw, counts, map[string]int64{"ListTablets": 1, "GetTablet": 1, "AddTablet": 1}) - checkLegacyChecksum(t, tw, 1261153186) - - // Check the tablet is returned by GetAllTablets(). - allTablets := fhc.GetAllTablets() - key := TabletToMapKey(tablet) - if _, ok := allTablets[key]; !ok || len(allTablets) != 1 || !proto.Equal(allTablets[key], tablet) { - t.Errorf("fhc.GetAllTablets() = %+v; want %+v", allTablets, tablet) - } - - // Add a second tablet to the topology. - tablet2 := &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{ - Cell: "aa", - Uid: 2, - }, - Hostname: "host2", - PortMap: map[string]int32{ - "vt": 789, - }, - Keyspace: "keyspace", - Shard: "shard", - } - if err := ts.CreateTablet(context.Background(), tablet2); err != nil { - t.Fatalf("CreateTablet failed: %v", err) - } - tw.loadTablets() - - // If refreshKnownTablets is disabled, only the new tablet is read - // from the topo - if refreshKnownTablets { - counts = checkLegacyOpCounts(t, tw, counts, map[string]int64{"ListTablets": 1, "GetTablet": 2, "AddTablet": 1}) - } else { - counts = checkLegacyOpCounts(t, tw, counts, map[string]int64{"ListTablets": 1, "GetTablet": 1, "AddTablet": 1}) - } - checkLegacyChecksum(t, tw, 832404892) - - // Check the new tablet is returned by GetAllTablets(). - allTablets = fhc.GetAllTablets() - key = TabletToMapKey(tablet2) - if _, ok := allTablets[key]; !ok || len(allTablets) != 2 || !proto.Equal(allTablets[key], tablet2) { - t.Errorf("fhc.GetAllTablets() = %+v; want %+v", allTablets, tablet2) - } - - // Load the tablets again to show that when refreshKnownTablets is disabled, - // only the list is read from the topo and the checksum doesn't change - tw.loadTablets() - if refreshKnownTablets { - counts = checkLegacyOpCounts(t, tw, counts, map[string]int64{"ListTablets": 1, "GetTablet": 2}) - } else { - counts = checkLegacyOpCounts(t, tw, counts, map[string]int64{"ListTablets": 1}) - } - checkLegacyChecksum(t, tw, 832404892) - - // same tablet, different port, should update (previous - // one should go away, new one be added) - // - // if refreshKnownTablets is disabled, this case is *not* - // detected and the tablet remains in the topo using the - // old key - origTablet := proto.Clone(tablet).(*topodatapb.Tablet) - origKey := TabletToMapKey(tablet) - tablet.PortMap["vt"] = 456 - if _, err := ts.UpdateTabletFields(context.Background(), tablet.Alias, func(t *topodatapb.Tablet) error { - t.PortMap["vt"] = 456 - return nil - }); err != nil { - t.Fatalf("UpdateTabletFields failed: %v", err) - } - tw.loadTablets() - allTablets = fhc.GetAllTablets() - key = TabletToMapKey(tablet) - - if refreshKnownTablets { - counts = checkLegacyOpCounts(t, tw, counts, map[string]int64{"ListTablets": 1, "GetTablet": 2, "ReplaceTablet": 1}) - - if _, ok := allTablets[key]; !ok || len(allTablets) != 2 || !proto.Equal(allTablets[key], tablet) { - t.Errorf("fhc.GetAllTablets() = %+v; want %+v", allTablets, tablet) - } - if _, ok := allTablets[origKey]; ok { - t.Errorf("fhc.GetAllTablets() = %+v; don't want %v", allTablets, origKey) - } - checkLegacyChecksum(t, tw, 698548794) - } else { - counts = checkLegacyOpCounts(t, tw, counts, map[string]int64{"ListTablets": 1}) - - if _, ok := allTablets[origKey]; !ok || len(allTablets) != 2 || !proto.Equal(allTablets[origKey], origTablet) { - t.Errorf("fhc.GetAllTablets() = %+v; want %+v", allTablets, origTablet) - } - if _, ok := allTablets[key]; ok { - t.Errorf("fhc.GetAllTablets() = %+v; don't want %v", allTablets, key) - } - checkLegacyChecksum(t, tw, 832404892) - } - - // Remove the second tablet and re-add with a new uid. This should - // trigger a ReplaceTablet in loadTablets because the uid does not - // match. - // - // This case *is* detected even if refreshKnownTablets is false - // because the delete tablet / create tablet sequence causes the - // list of tablets to change and therefore the change is detected. - if err := ts.DeleteTablet(context.Background(), tablet2.Alias); err != nil { - t.Fatalf("DeleteTablet failed: %v", err) - } - tablet2.Alias.Uid = 3 - if err := ts.CreateTablet(context.Background(), tablet2); err != nil { - t.Fatalf("CreateTablet failed: %v", err) - } - if _, err := topo.FixShardReplication(context.Background(), ts, logger, "aa", "keyspace", "shard"); err != nil { - t.Fatalf("FixShardReplication failed: %v", err) - } - tw.loadTablets() - allTablets = fhc.GetAllTablets() - - if refreshKnownTablets { - counts = checkLegacyOpCounts(t, tw, counts, map[string]int64{"ListTablets": 1, "GetTablet": 2, "ReplaceTablet": 1}) - checkLegacyChecksum(t, tw, 4097170367) - } else { - counts = checkLegacyOpCounts(t, tw, counts, map[string]int64{"ListTablets": 1, "GetTablet": 1, "ReplaceTablet": 1}) - checkLegacyChecksum(t, tw, 3960185881) - } - key = TabletToMapKey(tablet2) - if _, ok := allTablets[key]; !ok || len(allTablets) != 2 || !proto.Equal(allTablets[key], tablet2) { - t.Errorf("fhc.GetAllTablets() = %+v; want %v => %+v", allTablets, key, tablet2) - } - - // Both tablets restart on different hosts. - // tablet2 happens to land on the host:port that tablet 1 used to be on. - // This can only be tested when we refresh known tablets. - if refreshKnownTablets { - origTablet := proto.Clone(tablet).(*topodatapb.Tablet) - origTablet2 := proto.Clone(tablet2).(*topodatapb.Tablet) - - if _, err := ts.UpdateTabletFields(context.Background(), tablet2.Alias, func(t *topodatapb.Tablet) error { - t.Hostname = tablet.Hostname - t.PortMap = tablet.PortMap - tablet2 = t - return nil - }); err != nil { - t.Fatalf("UpdateTabletFields failed: %v", err) - } - if _, err := ts.UpdateTabletFields(context.Background(), tablet.Alias, func(t *topodatapb.Tablet) error { - t.Hostname = "host3" - tablet = t - return nil - }); err != nil { - t.Fatalf("UpdateTabletFields failed: %v", err) - } - tw.loadTablets() - counts = checkLegacyOpCounts(t, tw, counts, map[string]int64{"ListTablets": 1, "GetTablet": 2, "ReplaceTablet": 2}) - allTablets = fhc.GetAllTablets() - key2 := TabletToMapKey(tablet2) - if _, ok := allTablets[key2]; !ok { - t.Fatalf("tablet was lost because it's reusing an address recently used by another tablet: %v", key2) - } - - // Change tablets back to avoid altering later tests. - if _, err := ts.UpdateTabletFields(context.Background(), tablet2.Alias, func(t *topodatapb.Tablet) error { - t.Hostname = origTablet2.Hostname - t.PortMap = origTablet2.PortMap - tablet2 = t - return nil - }); err != nil { - t.Fatalf("UpdateTabletFields failed: %v", err) - } - if _, err := ts.UpdateTabletFields(context.Background(), tablet.Alias, func(t *topodatapb.Tablet) error { - t.Hostname = origTablet.Hostname - tablet = t - return nil - }); err != nil { - t.Fatalf("UpdateTabletFields failed: %v", err) - } - tw.loadTablets() - counts = checkLegacyOpCounts(t, tw, counts, map[string]int64{"ListTablets": 1, "GetTablet": 2, "ReplaceTablet": 2}) - } - - // Remove the tablet and check that it is detected as being gone. - if err := ts.DeleteTablet(context.Background(), tablet.Alias); err != nil { - t.Fatalf("DeleteTablet failed: %v", err) - } - if _, err := topo.FixShardReplication(context.Background(), ts, logger, "aa", "keyspace", "shard"); err != nil { - t.Fatalf("FixShardReplication failed: %v", err) - } - tw.loadTablets() - if refreshKnownTablets { - counts = checkLegacyOpCounts(t, tw, counts, map[string]int64{"ListTablets": 1, "GetTablet": 1, "RemoveTablet": 1}) - } else { - counts = checkLegacyOpCounts(t, tw, counts, map[string]int64{"ListTablets": 1, "RemoveTablet": 1}) - } - checkLegacyChecksum(t, tw, 1725545897) - - allTablets = fhc.GetAllTablets() - key = TabletToMapKey(tablet) - if _, ok := allTablets[key]; ok || len(allTablets) != 1 { - t.Errorf("fhc.GetAllTablets() = %+v; don't want %v", allTablets, key) - } - key = TabletToMapKey(tablet2) - if _, ok := allTablets[key]; !ok || len(allTablets) != 1 || !proto.Equal(allTablets[key], tablet2) { - t.Errorf("fhc.GetAllTablets() = %+v; want %+v", allTablets, tablet2) - } - - // Remove the other and check that it is detected as being gone. - if err := ts.DeleteTablet(context.Background(), tablet2.Alias); err != nil { - t.Fatalf("DeleteTablet failed: %v", err) - } - if _, err := topo.FixShardReplication(context.Background(), ts, logger, "aa", "keyspace", "shard"); err != nil { - t.Fatalf("FixShardReplication failed: %v", err) - } - tw.loadTablets() - checkLegacyOpCounts(t, tw, counts, map[string]int64{"ListTablets": 1, "GetTablet": 0, "RemoveTablet": 1}) - checkLegacyChecksum(t, tw, 0) - - allTablets = fhc.GetAllTablets() - key = TabletToMapKey(tablet) - if _, ok := allTablets[key]; ok || len(allTablets) != 0 { - t.Errorf("fhc.GetAllTablets() = %+v; don't want %v", allTablets, key) - } - key = TabletToMapKey(tablet2) - if _, ok := allTablets[key]; ok || len(allTablets) != 0 { - t.Errorf("fhc.GetAllTablets() = %+v; don't want %v", allTablets, key) - } - - tw.Stop() -} - -func TestLegacyFilterByShard(t *testing.T) { - testcases := []struct { - filters []string - keyspace string - shard string - included bool - }{ - // un-sharded keyspaces - { - filters: []string{"ks1|0"}, - keyspace: "ks1", - shard: "0", - included: true, - }, - { - filters: []string{"ks1|0"}, - keyspace: "ks2", - shard: "0", - included: false, - }, - // custom sharding, different shard - { - filters: []string{"ks1|0"}, - keyspace: "ks1", - shard: "1", - included: false, - }, - // keyrange based sharding - { - filters: []string{"ks1|-80"}, - keyspace: "ks1", - shard: "0", - included: false, - }, - { - filters: []string{"ks1|-80"}, - keyspace: "ks1", - shard: "-40", - included: true, - }, - { - filters: []string{"ks1|-80"}, - keyspace: "ks1", - shard: "-80", - included: true, - }, - { - filters: []string{"ks1|-80"}, - keyspace: "ks1", - shard: "80-", - included: false, - }, - { - filters: []string{"ks1|-80"}, - keyspace: "ks1", - shard: "c0-", - included: false, - }, - } - - for _, tc := range testcases { - fbs, err := NewLegacyFilterByShard(nil, tc.filters) - if err != nil { - t.Errorf("cannot create LegacyFilterByShard for filters %v: %v", tc.filters, err) - } - - tablet := &topodatapb.Tablet{ - Keyspace: tc.keyspace, - Shard: tc.shard, - } - - got := fbs.isIncluded(tablet) - if got != tc.included { - t.Errorf("isIncluded(%v,%v) for filters %v returned %v but expected %v", tc.keyspace, tc.shard, tc.filters, got, tc.included) - } - } -} - -func TestLegacyFilterByKeyspace(t *testing.T) { - hc := NewFakeLegacyHealthCheck() - tr := NewLegacyFilterByKeyspace(hc, testKeyspacesToWatch) - ts := memorytopo.NewServer(testCell) - tw := NewLegacyCellTabletsWatcher(context.Background(), ts, tr, testCell, 10*time.Minute, true, 5) - - for _, test := range testFilterByKeyspace { - // Add a new tablet to the topology. - port := rand.Int31n(1000) - tablet := &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{ - Cell: testCell, - Uid: rand.Uint32(), - }, - Hostname: testHostName, - PortMap: map[string]int32{ - "vt": port, - }, - Keyspace: test.keyspace, - Shard: testShard, - } - - got := tr.isIncluded(tablet) - if got != test.expected { - t.Errorf("isIncluded(%v) for keyspace %v returned %v but expected %v", test.keyspace, test.keyspace, got, test.expected) - } - - if err := ts.CreateTablet(context.Background(), tablet); err != nil { - t.Errorf("CreateTablet failed: %v", err) - } - - tw.loadTablets() - key := TabletToMapKey(tablet) - allTablets := hc.GetAllTablets() - - if _, ok := allTablets[key]; ok != test.expected && proto.Equal(allTablets[key], tablet) != test.expected { - t.Errorf("Error adding tablet - got %v; want %v", ok, test.expected) - } - - // Replace the tablet we added above - tabletReplacement := &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{ - Cell: testCell, - Uid: rand.Uint32(), - }, - Hostname: testHostName, - PortMap: map[string]int32{ - "vt": port, - }, - Keyspace: test.keyspace, - Shard: testShard, - } - got = tr.isIncluded(tabletReplacement) - if got != test.expected { - t.Errorf("isIncluded(%v) for keyspace %v returned %v but expected %v", test.keyspace, test.keyspace, got, test.expected) - } - if err := ts.CreateTablet(context.Background(), tabletReplacement); err != nil { - t.Errorf("CreateTablet failed: %v", err) - } - - tw.loadTablets() - key = TabletToMapKey(tabletReplacement) - allTablets = hc.GetAllTablets() - - if _, ok := allTablets[key]; ok != test.expected && proto.Equal(allTablets[key], tabletReplacement) != test.expected { - t.Errorf("Error replacing tablet - got %v; want %v", ok, test.expected) - } - - // Delete the tablet - if err := ts.DeleteTablet(context.Background(), tabletReplacement.Alias); err != nil { - t.Fatalf("DeleteTablet failed: %v", err) - } - } -} diff --git a/go/vt/discovery/replicationlag_test.go b/go/vt/discovery/replicationlag_test.go index bd879b01d3d..16354a4d54f 100644 --- a/go/vt/discovery/replicationlag_test.go +++ b/go/vt/discovery/replicationlag_test.go @@ -26,6 +26,11 @@ import ( "vitess.io/vitess/go/vt/topo" ) +// testSetLegacyReplicationLagAlgorithm is a test helper function, if this is used by a production code path, something is wrong. +func testSetLegacyReplicationLagAlgorithm(newLegacy bool) { + *legacyReplicationLagAlgorithm = newLegacy +} + // testSetMinNumTablets is a test helper function, if this is used by a production code path, something is wrong. func testSetMinNumTablets(newMin int) { *minNumTablets = newMin From 5c66c6ed481716f2d3e499d1d0ac96d0e1ba2207 Mon Sep 17 00:00:00 2001 From: Florent Poinsard Date: Tue, 21 Jun 2022 14:57:57 +0200 Subject: [PATCH 08/15] Applied goimports and linter on the throttler package Signed-off-by: Florent Poinsard --- go/vt/throttler/demo/throttler_demo.go | 5 +- go/vt/throttler/manager.go | 2 +- go/vt/throttler/max_replication_lag_module.go | 20 +++---- .../max_replication_lag_module_config.go | 54 +++++++++---------- go/vt/throttler/replication_lag_cache_test.go | 1 + go/vt/throttler/result.go | 30 +++++------ go/vt/throttler/result_test.go | 20 +++---- go/vt/throttler/throttler.go | 2 +- go/vt/throttler/throttlerlogz.go | 2 +- go/vt/throttler/throttlerlogz_test.go | 2 +- 10 files changed, 69 insertions(+), 69 deletions(-) diff --git a/go/vt/throttler/demo/throttler_demo.go b/go/vt/throttler/demo/throttler_demo.go index 7ccad70506a..e4796283f74 100644 --- a/go/vt/throttler/demo/throttler_demo.go +++ b/go/vt/throttler/demo/throttler_demo.go @@ -24,12 +24,13 @@ import ( "sync" "testing" "time" - "vitess.io/vitess/go/vt/topo" "vitess.io/vitess/go/vt/discovery" + "vitess.io/vitess/go/vt/log" "vitess.io/vitess/go/vt/logutil" "vitess.io/vitess/go/vt/servenv" "vitess.io/vitess/go/vt/throttler" + "vitess.io/vitess/go/vt/topo" "vitess.io/vitess/go/vt/topo/memorytopo" "vitess.io/vitess/go/vt/vttablet/grpcqueryservice" "vitess.io/vitess/go/vt/vttablet/queryservice/fakes" @@ -39,8 +40,6 @@ import ( querypb "vitess.io/vitess/go/vt/proto/query" topodatapb "vitess.io/vitess/go/vt/proto/topodata" - - "vitess.io/vitess/go/vt/log" ) // This file contains a demo binary that demonstrates how the resharding diff --git a/go/vt/throttler/manager.go b/go/vt/throttler/manager.go index 32790a9d601..ae575263c80 100644 --- a/go/vt/throttler/manager.go +++ b/go/vt/throttler/manager.go @@ -207,7 +207,7 @@ func (m *managerImpl) throttlerNamesLocked() []string { // Log returns the most recent changes of the MaxReplicationLag module. // There will be one result for each processed replication lag record. -func (m *managerImpl) Log(throttlerName string) ([]result, error) { +func (m *managerImpl) Log(throttlerName string) ([]Result, error) { m.mu.Lock() defer m.mu.Unlock() diff --git a/go/vt/throttler/max_replication_lag_module.go b/go/vt/throttler/max_replication_lag_module.go index 7aa2b868ab0..98402890fa6 100644 --- a/go/vt/throttler/max_replication_lag_module.go +++ b/go/vt/throttler/max_replication_lag_module.go @@ -306,7 +306,7 @@ func (m *MaxReplicationLagModule) recalculateRate(lagRecordNow replicationLagRec m.memory.ageBadRate(now) - r := result{ + r := Result{ Now: now, RateChange: unchangedRate, lastRateChange: m.lastRateChange, @@ -440,7 +440,7 @@ func stateGreater(a, b state) bool { // and we should not skip the current replica ("lagRecordNow"). // Even if it's the same replica we may skip it and return false because // we want to wait longer for the propagation of the current rate change. -func (m *MaxReplicationLagModule) isReplicaUnderTest(r *result, now time.Time, testedState state, lagRecordNow replicationLagRecord) bool { +func (m *MaxReplicationLagModule) isReplicaUnderTest(r *Result, now time.Time, testedState state, lagRecordNow replicationLagRecord) bool { if m.replicaUnderTest == nil { return true } @@ -466,7 +466,7 @@ func (m *MaxReplicationLagModule) isReplicaUnderTest(r *result, now time.Time, t return true } -func (m *MaxReplicationLagModule) increaseRate(r *result, now time.Time, lagRecordNow replicationLagRecord) { +func (m *MaxReplicationLagModule) increaseRate(r *Result, now time.Time, lagRecordNow replicationLagRecord) { m.markCurrentRateAsBadOrGood(r, now, stateIncreaseRate, unknown) oldRate := m.rate.Get() @@ -554,7 +554,7 @@ func (m *MaxReplicationLagModule) minTestDurationUntilNextIncrease(increase floa return minDuration } -func (m *MaxReplicationLagModule) decreaseAndGuessRate(r *result, now time.Time, lagRecordNow replicationLagRecord) { +func (m *MaxReplicationLagModule) decreaseAndGuessRate(r *Result, now time.Time, lagRecordNow replicationLagRecord) { // Guess replication rate based on the difference in the replication lag of this // particular replica. lagRecordBefore := m.lagCache(lagRecordNow).atOrAfter(discovery.TabletToMapKey(lagRecordNow.Tablet), m.lastRateChange) @@ -625,7 +625,7 @@ func (m *MaxReplicationLagModule) decreaseAndGuessRate(r *result, now time.Time, // guessReplicationRate guesses the actual replication rate based on the new bac // Note that "lagDifference" can be positive (lag increased) or negative (lag // decreased). -func (m *MaxReplicationLagModule) guessReplicationRate(r *result, avgPrimaryRate float64, lagBefore, lagNow int64, lagDifference, d time.Duration) (int64, string) { +func (m *MaxReplicationLagModule) guessReplicationRate(r *Result, avgPrimaryRate float64, lagBefore, lagNow int64, lagDifference, d time.Duration) (int64, string) { // avgReplicationRate is the average rate (per second) at which the replica // applied transactions from the replication stream. We infer the value // from the relative change in the replication lag. @@ -670,14 +670,14 @@ func (m *MaxReplicationLagModule) guessReplicationRate(r *result, avgPrimaryRate return int64(newRate), reason } -func (m *MaxReplicationLagModule) emergency(r *result, now time.Time, lagRecordNow replicationLagRecord) { +func (m *MaxReplicationLagModule) emergency(r *Result, now time.Time, lagRecordNow replicationLagRecord) { m.markCurrentRateAsBadOrGood(r, now, stateEmergency, unknown) decreaseReason := fmt.Sprintf("replication lag went beyond max: %d > %d", lagRecordNow.lag(), m.config.MaxReplicationLagSec) m.decreaseRateByPercentage(r, now, lagRecordNow, stateEmergency, m.config.EmergencyDecrease, decreaseReason) } -func (m *MaxReplicationLagModule) decreaseRateByPercentage(r *result, now time.Time, lagRecordNow replicationLagRecord, newState state, decrease float64, decreaseReason string) { +func (m *MaxReplicationLagModule) decreaseRateByPercentage(r *Result, now time.Time, lagRecordNow replicationLagRecord, newState state, decrease float64, decreaseReason string) { oldRate := m.rate.Get() rate := int64(float64(oldRate) - float64(oldRate)*decrease) if rate == 0 { @@ -689,7 +689,7 @@ func (m *MaxReplicationLagModule) decreaseRateByPercentage(r *result, now time.T m.updateRate(r, newState, rate, reason, now, lagRecordNow, m.config.MinDurationBetweenDecreases()) } -func (m *MaxReplicationLagModule) updateRate(r *result, newState state, rate int64, reason string, now time.Time, lagRecordNow replicationLagRecord, testDuration time.Duration) { +func (m *MaxReplicationLagModule) updateRate(r *Result, newState state, rate int64, reason string, now time.Time, lagRecordNow replicationLagRecord, testDuration time.Duration) { oldRate := m.rate.Get() m.currentState = newState @@ -717,7 +717,7 @@ func (m *MaxReplicationLagModule) updateRate(r *result, newState state, rate int // markCurrentRateAsBadOrGood determines the actual rate between the last rate // change and "now" and determines if that rate was bad or good. -func (m *MaxReplicationLagModule) markCurrentRateAsBadOrGood(r *result, now time.Time, newState state, replicationLagChange replicationLagChange) { +func (m *MaxReplicationLagModule) markCurrentRateAsBadOrGood(r *Result, now time.Time, newState state, replicationLagChange replicationLagChange) { if m.lastRateChange.IsZero() { // Module was just started. We don't have any data points yet. r.GoodOrBad = ignoredRate @@ -791,6 +791,6 @@ func (m *MaxReplicationLagModule) markCurrentRateAsBadOrGood(r *result, now time } } -func (m *MaxReplicationLagModule) log() []result { +func (m *MaxReplicationLagModule) log() []Result { return m.results.latestValues() } diff --git a/go/vt/throttler/max_replication_lag_module_config.go b/go/vt/throttler/max_replication_lag_module_config.go index e6a8e8a8494..775aa4639a4 100644 --- a/go/vt/throttler/max_replication_lag_module_config.go +++ b/go/vt/throttler/max_replication_lag_module_config.go @@ -85,51 +85,51 @@ func NewMaxReplicationLagModuleConfig(maxReplicationLag int64) MaxReplicationLag // in protobuf. // Verify returns an error if the config is invalid. -func (c MaxReplicationLagModuleConfig) Verify() error { - if c.TargetReplicationLagSec < 1 { +func (cfg MaxReplicationLagModuleConfig) Verify() error { + if cfg.TargetReplicationLagSec < 1 { return fmt.Errorf("target_replication_lag_sec must be >= 1") } - if c.MaxReplicationLagSec < 2 { + if cfg.MaxReplicationLagSec < 2 { return fmt.Errorf("max_replication_lag_sec must be >= 2") } - if c.TargetReplicationLagSec > c.MaxReplicationLagSec { + if cfg.TargetReplicationLagSec > cfg.MaxReplicationLagSec { return fmt.Errorf("target_replication_lag_sec must not be higher than max_replication_lag_sec: invalid: %v > %v", - c.TargetReplicationLagSec, c.MaxReplicationLagSec) + cfg.TargetReplicationLagSec, cfg.MaxReplicationLagSec) } - if c.InitialRate < 1 { + if cfg.InitialRate < 1 { return fmt.Errorf("initial_rate must be >= 1") } - if c.MaxIncrease <= 0 { + if cfg.MaxIncrease <= 0 { return fmt.Errorf("max_increase must be > 0") } - if c.EmergencyDecrease <= 0 { + if cfg.EmergencyDecrease <= 0 { return fmt.Errorf("emergency_decrease must be > 0") } - if c.MinDurationBetweenIncreasesSec < 1 { + if cfg.MinDurationBetweenIncreasesSec < 1 { return fmt.Errorf("min_duration_between_increases_sec must be >= 1") } - if c.MaxDurationBetweenIncreasesSec < 1 { + if cfg.MaxDurationBetweenIncreasesSec < 1 { return fmt.Errorf("max_duration_between_increases_sec must be >= 1") } - if c.MinDurationBetweenDecreasesSec < 1 { + if cfg.MinDurationBetweenDecreasesSec < 1 { return fmt.Errorf("min_duration_between_decreases_sec must be >= 1") } - if c.SpreadBacklogAcrossSec < 1 { + if cfg.SpreadBacklogAcrossSec < 1 { return fmt.Errorf("spread_backlog_across_sec must be >= 1") } - if c.IgnoreNSlowestReplicas < 0 { + if cfg.IgnoreNSlowestReplicas < 0 { return fmt.Errorf("ignore_n_slowest_replicas must be >= 0") } - if c.IgnoreNSlowestRdonlys < 0 { + if cfg.IgnoreNSlowestRdonlys < 0 { return fmt.Errorf("ignore_n_slowest_rdonlys must be >= 0") } - if c.AgeBadRateAfterSec < 1 { + if cfg.AgeBadRateAfterSec < 1 { return fmt.Errorf("age_bad_rate_after_sec must be >= 1") } - if c.MaxRateApproachThreshold < 0 { + if cfg.MaxRateApproachThreshold < 0 { return fmt.Errorf("max_rate_approach_threshold must be >=0") } - if c.MaxRateApproachThreshold > 1 { + if cfg.MaxRateApproachThreshold > 1 { return fmt.Errorf("max_rate_approach_threshold must be <=1") } return nil @@ -137,30 +137,30 @@ func (c MaxReplicationLagModuleConfig) Verify() error { // MinDurationBetweenIncreases is a helper function which returns the respective // protobuf field as native Go type. -func (c MaxReplicationLagModuleConfig) MinDurationBetweenIncreases() time.Duration { - return time.Duration(c.MinDurationBetweenIncreasesSec) * time.Second +func (cfg MaxReplicationLagModuleConfig) MinDurationBetweenIncreases() time.Duration { + return time.Duration(cfg.MinDurationBetweenIncreasesSec) * time.Second } // MaxDurationBetweenIncreases is a helper function which returns the respective // protobuf field as native Go type. -func (c MaxReplicationLagModuleConfig) MaxDurationBetweenIncreases() time.Duration { - return time.Duration(c.MaxDurationBetweenIncreasesSec) * time.Second +func (cfg MaxReplicationLagModuleConfig) MaxDurationBetweenIncreases() time.Duration { + return time.Duration(cfg.MaxDurationBetweenIncreasesSec) * time.Second } // MinDurationBetweenDecreases is a helper function which returns the respective // protobuf field as native Go type. -func (c MaxReplicationLagModuleConfig) MinDurationBetweenDecreases() time.Duration { - return time.Duration(c.MinDurationBetweenDecreasesSec) * time.Second +func (cfg MaxReplicationLagModuleConfig) MinDurationBetweenDecreases() time.Duration { + return time.Duration(cfg.MinDurationBetweenDecreasesSec) * time.Second } // SpreadBacklogAcross is a helper function which returns the respective // protobuf field as native Go type. -func (c MaxReplicationLagModuleConfig) SpreadBacklogAcross() time.Duration { - return time.Duration(c.SpreadBacklogAcrossSec) * time.Second +func (cfg MaxReplicationLagModuleConfig) SpreadBacklogAcross() time.Duration { + return time.Duration(cfg.SpreadBacklogAcrossSec) * time.Second } // AgeBadRateAfter is a helper function which returns the respective // protobuf field as native Go type. -func (c MaxReplicationLagModuleConfig) AgeBadRateAfter() time.Duration { - return time.Duration(c.AgeBadRateAfterSec) * time.Second +func (cfg MaxReplicationLagModuleConfig) AgeBadRateAfter() time.Duration { + return time.Duration(cfg.AgeBadRateAfterSec) * time.Second } diff --git a/go/vt/throttler/replication_lag_cache_test.go b/go/vt/throttler/replication_lag_cache_test.go index 8964db17c05..312f97e1999 100644 --- a/go/vt/throttler/replication_lag_cache_test.go +++ b/go/vt/throttler/replication_lag_cache_test.go @@ -19,6 +19,7 @@ package throttler import ( "testing" "time" + "vitess.io/vitess/go/vt/discovery" ) diff --git a/go/vt/throttler/result.go b/go/vt/throttler/result.go index 179711116a3..275eb01f9df 100644 --- a/go/vt/throttler/result.go +++ b/go/vt/throttler/result.go @@ -42,7 +42,7 @@ const ( ignoredRate = "ignored" ) -var resultStringTemplate = template.Must(template.New("result.String()").Parse( +var resultStringTemplate = template.Must(template.New("Result.String()").Parse( `rate was: {{.RateChange}} from: {{.OldRate}} to: {{.NewRate}} alias: {{.Alias}} lag: {{.LagRecordNow.Stats.ReplicationLagSeconds}}s last change: {{.TimeSinceLastRateChange}} rate: {{.CurrentRate}} good/bad? {{.GoodOrBad}} skipped b/c: {{.MemorySkipReason}} good/bad: {{.HighestGood}}/{{.LowestBad}} @@ -50,10 +50,10 @@ state (old/tested/new): {{.OldState}}/{{.TestedState}}/{{.NewState}} lag before: {{.LagBefore}} ({{.AgeOfBeforeLag}} ago) rates (primary/replica): {{.PrimaryRate}}/{{.GuessedReplicationRate}} backlog (old/new): {{.GuessedReplicationBacklogOld}}/{{.GuessedReplicationBacklogNew}} reason: {{.Reason}}`)) -// result is generated by the MaxReplicationLag module for each processed +// Result is generated by the MaxReplicationLag module for each processed // "replicationLagRecord". // It captures the details and the decision of the processing. -type result struct { +type Result struct { Now time.Time RateChange rateChange lastRateChange time.Time @@ -80,7 +80,7 @@ type result struct { GuessedReplicationBacklogNew int } -func (r result) String() string { +func (r Result) String() string { var b bytes.Buffer if err := resultStringTemplate.Execute(&b, r); err != nil { panic(fmt.Sprintf("failed to Execute() template: %v", err)) @@ -88,25 +88,25 @@ func (r result) String() string { return b.String() } -func (r result) Alias() string { +func (r Result) Alias() string { return topoproto.TabletAliasString(r.LagRecordNow.Tablet.Alias) } -func (r result) TimeSinceLastRateChange() string { +func (r Result) TimeSinceLastRateChange() string { if r.lastRateChange.IsZero() { return "n/a" } return fmt.Sprintf("%.1fs", r.Now.Sub(r.lastRateChange).Seconds()) } -func (r result) LagBefore() string { +func (r Result) LagBefore() string { if r.LagRecordBefore.isZero() { return "n/a" } return fmt.Sprintf("%ds", r.LagRecordBefore.Stats.ReplicationLagSeconds) } -func (r result) AgeOfBeforeLag() string { +func (r Result) AgeOfBeforeLag() string { if r.LagRecordBefore.isZero() { return "n/a" } @@ -117,24 +117,24 @@ func (r result) AgeOfBeforeLag() string { type resultRing struct { // mu guards the fields below. mu sync.Mutex - // position holds the index of the *next* result in the ring. + // position holds the index of the *next* Result in the ring. position int // wrapped becomes true when the ring buffer "wrapped" at least once and we // started reusing entries. wrapped bool // values is the underlying ring buffer. - values []result + values []Result } // newResultRing creates a new resultRing. func newResultRing(capacity int) *resultRing { return &resultRing{ - values: make([]result, capacity), + values: make([]Result, capacity), } } -// add inserts a new result into the ring buffer. -func (rr *resultRing) add(r result) { +// add inserts a new Result into the ring buffer. +func (rr *resultRing) add(r Result) { rr.mu.Lock() defer rr.mu.Unlock() @@ -148,7 +148,7 @@ func (rr *resultRing) add(r result) { // latestValues returns all values of the buffer. Entries are sorted in reverse // chronological order i.e. newer items come first. -func (rr *resultRing) latestValues() []result { +func (rr *resultRing) latestValues() []Result { rr.mu.Lock() defer rr.mu.Unlock() @@ -162,7 +162,7 @@ func (rr *resultRing) latestValues() []result { count = rr.position } - results := make([]result, count) + results := make([]Result, count) for i := 0; i < count; i++ { pos := start - i if pos < 0 { diff --git a/go/vt/throttler/result_test.go b/go/vt/throttler/result_test.go index 9efc7df9412..9eadab503e8 100644 --- a/go/vt/throttler/result_test.go +++ b/go/vt/throttler/result_test.go @@ -23,7 +23,7 @@ import ( ) var ( - resultIncreased = result{ + resultIncreased = Result{ Now: sinceZero(1234 * time.Millisecond), RateChange: increasedRate, lastRateChange: sinceZero(1 * time.Millisecond), @@ -45,7 +45,7 @@ var ( GuessedReplicationBacklogOld: 0, GuessedReplicationBacklogNew: 0, } - resultDecreased = result{ + resultDecreased = Result{ Now: sinceZero(5000 * time.Millisecond), RateChange: decreasedRate, lastRateChange: sinceZero(1234 * time.Millisecond), @@ -67,7 +67,7 @@ var ( GuessedReplicationBacklogOld: 10, GuessedReplicationBacklogNew: 20, } - resultEmergency = result{ + resultEmergency = Result{ Now: sinceZero(10123 * time.Millisecond), RateChange: decreasedRate, lastRateChange: sinceZero(5000 * time.Millisecond), @@ -93,7 +93,7 @@ var ( func TestResultString(t *testing.T) { testcases := []struct { - r result + r Result want string }{ { @@ -135,27 +135,27 @@ reason: emergency state decreased the rate`, func TestResultRing(t *testing.T) { // Test data. - r1 := result{Reason: "r1"} - r2 := result{Reason: "r2"} - r3 := result{Reason: "r3"} + r1 := Result{Reason: "r1"} + r2 := Result{Reason: "r2"} + r3 := Result{Reason: "r3"} rr := newResultRing(2) // Use the ring partially. rr.add(r1) - if got, want := rr.latestValues(), []result{r1}; !reflect.DeepEqual(got, want) { + if got, want := rr.latestValues(), []Result{r1}; !reflect.DeepEqual(got, want) { t.Fatalf("items not correctly added to resultRing. got = %v, want = %v", got, want) } // Use it fully. rr.add(r2) - if got, want := rr.latestValues(), []result{r2, r1}; !reflect.DeepEqual(got, want) { + if got, want := rr.latestValues(), []Result{r2, r1}; !reflect.DeepEqual(got, want) { t.Fatalf("items not correctly added to resultRing. got = %v, want = %v", got, want) } // Let it wrap. rr.add(r3) - if got, want := rr.latestValues(), []result{r3, r2}; !reflect.DeepEqual(got, want) { + if got, want := rr.latestValues(), []Result{r3, r2}; !reflect.DeepEqual(got, want) { t.Fatalf("resultRing did not wrap correctly. got = %v, want = %v", got, want) } } diff --git a/go/vt/throttler/throttler.go b/go/vt/throttler/throttler.go index c20b0329338..686df6c89ab 100644 --- a/go/vt/throttler/throttler.go +++ b/go/vt/throttler/throttler.go @@ -316,6 +316,6 @@ func (t *Throttler) ResetConfiguration() { } // Log returns the most recent changes of the MaxReplicationLag module. -func (t *Throttler) Log() []result { +func (t *Throttler) Log() []Result { return t.maxReplicationLagModule.log() } diff --git a/go/vt/throttler/throttlerlogz.go b/go/vt/throttler/throttlerlogz.go index 23b49bdc070..1528bc01c7a 100644 --- a/go/vt/throttler/throttlerlogz.go +++ b/go/vt/throttler/throttlerlogz.go @@ -152,7 +152,7 @@ func showThrottlerLog(w http.ResponseWriter, m *managerImpl, name string) { colorLevel = "high" } data := struct { - result + Result ColorLevel string }{r, colorLevel} diff --git a/go/vt/throttler/throttlerlogz_test.go b/go/vt/throttler/throttlerlogz_test.go index 82ebb77e7a1..2ecb76c9982 100644 --- a/go/vt/throttler/throttlerlogz_test.go +++ b/go/vt/throttler/throttlerlogz_test.go @@ -55,7 +55,7 @@ func TestThrottlerlogzHandler(t *testing.T) { testcases := []struct { desc string - r result + r Result want string }{ { From 4654a771c2868fd0a6eab8ac96af39f82e5c532c Mon Sep 17 00:00:00 2001 From: Florent Poinsard Date: Tue, 21 Jun 2022 15:10:58 +0200 Subject: [PATCH 09/15] Fixed build issue and code comments Signed-off-by: Florent Poinsard --- go/vt/discovery/utils.go | 3 +-- go/vt/throttler/demo/throttler_demo.go | 3 +-- go/vt/throttler/max_replication_lag_module.go | 10 ++++---- .../max_replication_lag_module_test.go | 2 +- go/vt/throttler/replication_lag_cache.go | 2 +- go/vt/throttler/replication_lag_record.go | 2 +- go/vt/vtctld/api.go | 2 +- go/vt/vtgate/buffer/buffer.go | 25 +------------------ go/vt/vtgate/buffer/buffer_helper_test.go | 11 -------- go/vt/vtgate/executor_test.go | 2 -- go/vt/vtgate/vtgate.go | 1 - .../tabletserver/txthrottler/tx_throttler.go | 4 +-- go/vt/worker/executor.go | 2 +- go/vt/worker/split_clone.go | 12 +++------ go/vt/worker/tablet_provider.go | 2 +- 15 files changed, 20 insertions(+), 63 deletions(-) diff --git a/go/vt/discovery/utils.go b/go/vt/discovery/utils.go index b76c7409e80..6a50c6abc7c 100644 --- a/go/vt/discovery/utils.go +++ b/go/vt/discovery/utils.go @@ -24,8 +24,7 @@ import ( ) // This file contains helper filter methods to process the unfiltered list of -// tablets returned by LegacyHealthCheck.GetTabletStatsFrom*. -// See also legacy_replicationlag.go for a more sophisicated filter used by vtgate. +// tablets returned by HealthCheckImpl.GetTabletHealth*. func TabletHealthReferenceListToValue(thl []*TabletHealth) []TabletHealth { newTh := []TabletHealth{} diff --git a/go/vt/throttler/demo/throttler_demo.go b/go/vt/throttler/demo/throttler_demo.go index e4796283f74..a098b032b67 100644 --- a/go/vt/throttler/demo/throttler_demo.go +++ b/go/vt/throttler/demo/throttler_demo.go @@ -277,8 +277,7 @@ func (c *client) stop() { c.throttler.Close() } -// StatsUpdate implements discovery.LegacyHealthCheckStatsListener. -// It gets called by the healthCheck instance every time a tablet broadcasts +// StatsUpdate gets called by the healthCheck instance every time a tablet broadcasts // a health update. func (c *client) StatsUpdate(ts *discovery.TabletHealth) { // Ignore unless REPLICA or RDONLY. diff --git a/go/vt/throttler/max_replication_lag_module.go b/go/vt/throttler/max_replication_lag_module.go index 98402890fa6..5474440bb3d 100644 --- a/go/vt/throttler/max_replication_lag_module.go +++ b/go/vt/throttler/max_replication_lag_module.go @@ -54,7 +54,7 @@ const ( // i.e. we'll ignore lag records with lower lag from other replicas while we're // waiting for the next record of this replica under test. type replicaUnderTest struct { - // key holds the discovery.LegacyTabletStats.Key value for the replica. + // key holds the key value for the replica. key string alias string tabletType topodatapb.TabletType @@ -114,8 +114,8 @@ type MaxReplicationLagModule struct { // max rate calculation has changed. The field is immutable (set in Start().) rateUpdateChan chan<- struct{} - // lagRecords buffers the replication lag records received by the LegacyHealthCheck - // listener. ProcessRecords() will process them. + // lagRecords buffers the replication lag records received by the HealthCheck + // subscriber. ProcessRecords() will process them. lagRecords chan replicationLagRecord wg sync.WaitGroup @@ -246,7 +246,7 @@ func (m *MaxReplicationLagModule) RecordReplicationLag(t time.Time, th *discover } m.mutableConfigMu.Unlock() - // Buffer data point for now to unblock the LegacyHealthCheck listener and process + // Buffer data point for now to unblock the HealthCheck subscriber and process // it asynchronously in ProcessRecords(). m.lagRecords <- replicationLagRecord{t, *th} } @@ -402,7 +402,7 @@ func (m *MaxReplicationLagModule) clearReplicaUnderTest(now time.Time, testedSta return true, "it is no longer actively tracked" } if lr.LastError != nil { - // LastError is set i.e. LegacyHealthCheck module cannot connect and the cached + // LastError is set i.e. HealthCheck module cannot connect and the cached // data for the replica might be outdated. return true, "it has LastError set i.e. is no longer correctly tracked" } diff --git a/go/vt/throttler/max_replication_lag_module_test.go b/go/vt/throttler/max_replication_lag_module_test.go index 082eb6ee1d7..f0324df192c 100644 --- a/go/vt/throttler/max_replication_lag_module_test.go +++ b/go/vt/throttler/max_replication_lag_module_test.go @@ -225,7 +225,7 @@ func TestMaxReplicationLagModule_ReplicaUnderTest_LastErrorOrNotUp(t *testing.T) // r2 @ 75s, 0s lag, LastError set rError := lagRecord(sinceZero(75*time.Second), r2, 0) - rError.LastError = errors.New("LegacyHealthCheck reporting broken") + rError.LastError = errors.New("HealthCheck reporting broken") tf.m.replicaLagCache.add(rError) // r1 @ 110s, 0s lag diff --git a/go/vt/throttler/replication_lag_cache.go b/go/vt/throttler/replication_lag_cache.go index 3b1a8685245..c9c2e94f113 100644 --- a/go/vt/throttler/replication_lag_cache.go +++ b/go/vt/throttler/replication_lag_cache.go @@ -126,7 +126,7 @@ func (c *replicationLagCache) sortByLag(ignoreNSlowestReplicas int, minimumRepli } } -// byLagAndTabletUID is a slice of discovery.LegacyTabletStats elements that +// byLagAndTabletUID is a slice of discovery.TabletHealth elements that // implements sort.Interface to sort by replication lag and tablet Uid. type byLagAndTabletUID []discovery.TabletHealth diff --git a/go/vt/throttler/replication_lag_record.go b/go/vt/throttler/replication_lag_record.go index 1a079aab325..6fdccbd8810 100644 --- a/go/vt/throttler/replication_lag_record.go +++ b/go/vt/throttler/replication_lag_record.go @@ -23,7 +23,7 @@ import ( ) // replicationLagRecord stores the tablet health data for a given point in time. -// This data is obtained via the LegacyHealthCheck module. +// This data is obtained via the HealthCheck module. type replicationLagRecord struct { // time is the time at which "value" was observed. time time.Time diff --git a/go/vt/vtctld/api.go b/go/vt/vtctld/api.go index 80cc988db58..781b4416246 100644 --- a/go/vt/vtctld/api.go +++ b/go/vt/vtctld/api.go @@ -62,7 +62,7 @@ const ( jsonContentType = "application/json; charset=utf-8" ) -// TabletStats represents realtime stats from a discovery.LegacyTabletStats struct. +// TabletStats represents realtime stats from a discovery.TabletHealth struct. type TabletStats struct { LastError string `json:"last_error,omitempty"` Realtime *querypb.RealtimeStats `json:"realtime,omitempty"` diff --git a/go/vt/vtgate/buffer/buffer.go b/go/vt/vtgate/buffer/buffer.go index ab4d5a32000..25fe4181a4a 100644 --- a/go/vt/vtgate/buffer/buffer.go +++ b/go/vt/vtgate/buffer/buffer.go @@ -93,7 +93,7 @@ type Buffer struct { // In particular, it is used to serialize the following Go routines: // - 1. Requests which may buffer (RLock, can be run in parallel) // - 2. Request which starts buffering (based on the seen error) - // - 3. LegacyHealthCheck listener ("StatsUpdate") which stops buffering + // - 3. HealthCheck subscriber ("StatsUpdate") which stops buffering // - 4. Timer which may stop buffering after -buffer_max_failover_duration mu sync.RWMutex // buffers holds a shardBuffer object per shard, even if no failover is in @@ -171,29 +171,6 @@ func (b *Buffer) HandleKeyspaceEvent(ksevent *discovery.KeyspaceEvent) { } } -// StatsUpdate keeps track of the "tablet_externally_reparented_timestamp" of -// each primary. This way we can detect the end of a failover. -// It is part of the discovery.LegacyHealthCheckStatsListener interface. -func (b *Buffer) StatsUpdate(ts *discovery.LegacyTabletStats) { - if ts.Target.TabletType != topodatapb.TabletType_PRIMARY { - panic(fmt.Sprintf("BUG: non-PRIMARY LegacyTabletStats object must not be forwarded: %#v", ts)) - } - - timestamp := ts.TabletExternallyReparentedTimestamp - if timestamp == 0 { - // Primarys where TabletExternallyReparented was never called will return 0. - // Ignore them. - return - } - - sb := b.getOrCreateBuffer(ts.Target.Keyspace, ts.Target.Shard) - if sb == nil { - // Buffer is shut down. Ignore all calls. - return - } - sb.recordExternallyReparentedTimestamp(timestamp, ts.Tablet.Alias) -} - // getOrCreateBuffer returns the ShardBuffer for the given keyspace and shard. // It returns nil if Buffer is shut down and all calls should be ignored. func (b *Buffer) getOrCreateBuffer(keyspace, shard string) *shardBuffer { diff --git a/go/vt/vtgate/buffer/buffer_helper_test.go b/go/vt/vtgate/buffer/buffer_helper_test.go index 38983426a88..442e78d08f7 100644 --- a/go/vt/vtgate/buffer/buffer_helper_test.go +++ b/go/vt/vtgate/buffer/buffer_helper_test.go @@ -31,17 +31,6 @@ func testAllImplementations(t *testing.T, runTest func(t *testing.T, fail failov }) }) - t.Run("LegacyHealthCheck", func(t *testing.T) { - t.Helper() - runTest(t, func(buf *Buffer, tablet *topodatapb.Tablet, keyspace, shard string, now time.Time) { - buf.StatsUpdate(&discovery.LegacyTabletStats{ - Tablet: tablet, - Target: &query.Target{Keyspace: keyspace, Shard: shard, TabletType: topodatapb.TabletType_PRIMARY}, - TabletExternallyReparentedTimestamp: now.Unix(), - }) - }) - }) - t.Run("KeyspaceEvent", func(t *testing.T) { t.Helper() runTest(t, func(buf *Buffer, tablet *topodatapb.Tablet, keyspace, shard string, now time.Time) { diff --git a/go/vt/vtgate/executor_test.go b/go/vt/vtgate/executor_test.go index f2af6566297..b0cd5407367 100644 --- a/go/vt/vtgate/executor_test.go +++ b/go/vt/vtgate/executor_test.go @@ -776,8 +776,6 @@ func TestExecutorShow(t *testing.T) { } utils.MustMatch(t, wantqr, qr, query) - // The FakeLegacyTablets in FakeLegacyHealthCheck don't have support for these columns/values - // So let's just be sure the statement works and we get the expected results (none) query = "show vitess_replication_status" qr, err = executor.Execute(ctx, "TestExecute", session, query, nil) require.NoError(t, err) diff --git a/go/vt/vtgate/vtgate.go b/go/vt/vtgate/vtgate.go index f45de1d4f8f..74c89a04b62 100644 --- a/go/vt/vtgate/vtgate.go +++ b/go/vt/vtgate/vtgate.go @@ -141,7 +141,6 @@ type VTGate struct { vsm *vstreamManager txConn *TxConn gw *TabletGateway - pv plancontext.PlannerVersion // stats objects. // TODO(sougou): This needs to be cleaned up. There diff --git a/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go b/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go index 11f04a26811..143840e0c21 100644 --- a/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go +++ b/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go @@ -41,7 +41,7 @@ import ( // TxThrottler throttles transactions based on replication lag. // It's a thin wrapper around the throttler found in vitess/go/vt/throttler. -// It uses a discovery.LegacyHealthCheck to send replication-lag updates to the wrapped throttler. +// It uses a discovery.HealthCheck to send replication-lag updates to the wrapped throttler. // // Intended Usage: // // Assuming topoServer is a topo.Server variable pointing to a Vitess topology server. @@ -348,7 +348,7 @@ func (ts *txThrottlerState) deallocateResources() { ts.throttler = nil } -// StatsUpdate is part of the LegacyHealthCheckStatsListener interface. +// StatsUpdate updates the health of a tablet with the given healthcheck. func (ts *txThrottlerState) StatsUpdate(tabletStats *discovery.TabletHealth) { // Ignore PRIMARY and RDONLY stats. // We currently do not monitor RDONLY tablets for replication lag. RDONLY tablets are not diff --git a/go/vt/worker/executor.go b/go/vt/worker/executor.go index 04201e4994b..ed20a704cce 100644 --- a/go/vt/worker/executor.go +++ b/go/vt/worker/executor.go @@ -185,7 +185,7 @@ func (e *executor) fetchWithRetries(ctx context.Context, action func(ctx context } return vterrors.Wrapf(err, "interrupted while trying to run a command on tablet %v", tabletString) case <-time.After(*executeFetchRetryTime): - // Retry 30s after the failure using the current primary seen by the LegacyHealthCheck. + // Retry 30s after the failure using the current primary seen by the HealthCheck. } isRetry = true } diff --git a/go/vt/worker/split_clone.go b/go/vt/worker/split_clone.go index 811afd569b0..bac4948e0db 100644 --- a/go/vt/worker/split_clone.go +++ b/go/vt/worker/split_clone.go @@ -422,7 +422,7 @@ func (scw *SplitCloneWorker) Run(ctx context.Context) error { // After Close returned, we can be sure that it won't call our listener // implementation (method StatsUpdate) anymore. if err := scw.healthCheck.Close(); err != nil { - scw.wr.Logger().Errorf2(err, "LegacyHealthCheck.Close() failed") + scw.wr.Logger().Errorf2(err, "HealthCheck.Close() failed") } } @@ -859,7 +859,7 @@ func (scw *SplitCloneWorker) findDestinationPrimarys(ctx context.Context) error } primarys := scw.healthCheck.GetHealthyTabletStats(&querypb.Target{Keyspace: si.Keyspace(), Shard: si.ShardName(), TabletType: topodatapb.TabletType_PRIMARY}) if len(primarys) == 0 { - return vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "cannot find PRIMARY tablet for destination shard for %v/%v (in cell: %v) in LegacyHealthCheck: empty LegacyTabletStats list", si.Keyspace(), si.ShardName(), scw.cell) + return vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "cannot find PRIMARY tablet for destination shard for %v/%v (in cell: %v) in HealthCheck: empty TabletHealth list", si.Keyspace(), si.ShardName(), scw.cell) } primary := primarys[0] @@ -869,7 +869,7 @@ func (scw *SplitCloneWorker) findDestinationPrimarys(ctx context.Context) error scw.wr.Logger().Infof("Using tablet %v as destination primary for %v/%v", topoproto.TabletAliasString(primary.Tablet.Alias), si.Keyspace(), si.ShardName()) } - scw.wr.Logger().Infof("NOTE: The used primary of a destination shard might change over the course of the copy e.g. due to a reparent. The LegacyHealthCheck module will track and log primary changes and any error message will always refer the actually used primary address.") + scw.wr.Logger().Infof("NOTE: The used primary of a destination shard might change over the course of the copy e.g. due to a reparent. The HealthCheck module will track and log primary changes and any error message will always refer the actually used primary address.") return nil } @@ -1357,13 +1357,9 @@ func (scw *SplitCloneWorker) createKeyResolver(td *tabletmanagerdatapb.TableDefi return newV3ResolverFromTableDefinition(scw.keyspaceSchema, td) } -// StatsUpdate receives replication lag updates for each destination primary +// StatsUpdate receives replication lag updates from the healthcheck for each destination primary // and forwards them to the respective throttler instance. -// It also forwards any update to the LegacyTabletStatsCache to keep it up to date. -// It is part of the discovery.LegacyHealthCheckStatsListener interface. func (scw *SplitCloneWorker) StatsUpdate(ts *discovery.TabletHealth) { - // scw.tsc.StatsUpdate(ts) - // Ignore unless REPLICA or RDONLY. if ts.Target.TabletType != topodatapb.TabletType_REPLICA && ts.Target.TabletType != topodatapb.TabletType_RDONLY { return diff --git a/go/vt/worker/tablet_provider.go b/go/vt/worker/tablet_provider.go index 9b46fc7de85..d38dc8e46fd 100644 --- a/go/vt/worker/tablet_provider.go +++ b/go/vt/worker/tablet_provider.go @@ -75,7 +75,7 @@ func (p *singleTabletProvider) description() string { } // shardTabletProvider returns a random healthy RDONLY tablet for a given -// keyspace and shard. It uses the LegacyHealthCheck module to retrieve the tablets. +// keyspace and shard. It uses the HealthCheck module to retrieve the tablets. type shardTabletProvider struct { hc *discovery.HealthCheckImpl tracker *TabletTracker From 7901bb799dc6c79eba901d1a5ecfa442cd09abf2 Mon Sep 17 00:00:00 2001 From: Florent Poinsard Date: Wed, 22 Jun 2022 16:34:32 +0200 Subject: [PATCH 10/15] Fixed comment of GetTabletStats Signed-off-by: Florent Poinsard --- go/vt/discovery/healthcheck.go | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/go/vt/discovery/healthcheck.go b/go/vt/discovery/healthcheck.go index 2667f2b0128..69a67168a12 100644 --- a/go/vt/discovery/healthcheck.go +++ b/go/vt/discovery/healthcheck.go @@ -631,10 +631,7 @@ func (hc *HealthCheckImpl) GetHealthyTabletStats(target *query.Target) []*Tablet return append(result, hc.healthy[KeyFromTarget(target)]...) } -// GetHealthyTabletStats returns only the healthy tablets. -// The returned array is owned by the caller. -// For TabletType_PRIMARY, this will only return at most one entry, -// the most recent tablet of type primary. +// GetTabletStats returns only the tablets that matches the given target. // This returns a copy of the data so that callers can access without // synchronization func (hc *HealthCheckImpl) GetTabletStats(target *query.Target) []*TabletHealth { From 2436bcc42852c3fe8f2e3cc386aa44c6f79fcbf9 Mon Sep 17 00:00:00 2001 From: Florent Poinsard Date: Wed, 22 Jun 2022 16:35:51 +0200 Subject: [PATCH 11/15] Removed duplicated function GetTabletStats Signed-off-by: Florent Poinsard --- go/vt/discovery/healthcheck.go | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/go/vt/discovery/healthcheck.go b/go/vt/discovery/healthcheck.go index 69a67168a12..b0da81a9111 100644 --- a/go/vt/discovery/healthcheck.go +++ b/go/vt/discovery/healthcheck.go @@ -631,9 +631,10 @@ func (hc *HealthCheckImpl) GetHealthyTabletStats(target *query.Target) []*Tablet return append(result, hc.healthy[KeyFromTarget(target)]...) } -// GetTabletStats returns only the tablets that matches the given target. -// This returns a copy of the data so that callers can access without -// synchronization +// GetTabletStats returns all tablets for the given target. +// The returned array is owned by the caller. +// For TabletType_PRIMARY, this will only return at most one entry, +// the most recent tablet of type primary. func (hc *HealthCheckImpl) GetTabletStats(target *query.Target) []*TabletHealth { var result []*TabletHealth hc.mu.Lock() @@ -641,20 +642,6 @@ func (hc *HealthCheckImpl) GetTabletStats(target *query.Target) []*TabletHealth if target.Shard == "" { target.Shard = "0" } - for _, health := range hc.healthData[KeyFromTarget(target)] { - result = append(result, health) - } - return result -} - -// getTabletStats returns all tablets for the given target. -// The returned array is owned by the caller. -// For TabletType_PRIMARY, this will only return at most one entry, -// the most recent tablet of type primary. -func (hc *HealthCheckImpl) getTabletStats(target *query.Target) []*TabletHealth { - var result []*TabletHealth - hc.mu.Lock() - defer hc.mu.Unlock() ths := hc.healthData[KeyFromTarget(target)] for _, th := range ths { result = append(result, th) @@ -720,7 +707,7 @@ func (hc *HealthCheckImpl) waitForTablets(ctx context.Context, targets []*query. if requireServing { tabletHealths = hc.GetHealthyTabletStats(target) } else { - tabletHealths = hc.getTabletStats(target) + tabletHealths = hc.GetTabletStats(target) } if len(tabletHealths) == 0 { allPresent = false From 60e6279ab0e0e9d368fa61636753115a28becfea Mon Sep 17 00:00:00 2001 From: Florent Poinsard Date: Mon, 4 Jul 2022 16:09:56 +0200 Subject: [PATCH 12/15] Reverted the changes to throttler.Result Signed-off-by: Florent Poinsard --- go/vt/throttler/manager.go | 6 ++-- go/vt/throttler/max_replication_lag_module.go | 20 ++++++------- go/vt/throttler/result.go | 30 +++++++++---------- go/vt/throttler/result_test.go | 20 ++++++------- go/vt/throttler/throttler.go | 4 +-- go/vt/throttler/throttlerlogz.go | 4 +-- go/vt/throttler/throttlerlogz_test.go | 2 +- 7 files changed, 43 insertions(+), 43 deletions(-) diff --git a/go/vt/throttler/manager.go b/go/vt/throttler/manager.go index ae575263c80..c2ee9f0a652 100644 --- a/go/vt/throttler/manager.go +++ b/go/vt/throttler/manager.go @@ -205,9 +205,9 @@ func (m *managerImpl) throttlerNamesLocked() []string { return names } -// Log returns the most recent changes of the MaxReplicationLag module. +// log returns the most recent changes of the MaxReplicationLag module. // There will be one result for each processed replication lag record. -func (m *managerImpl) Log(throttlerName string) ([]Result, error) { +func (m *managerImpl) log(throttlerName string) ([]result, error) { m.mu.Lock() defer m.mu.Unlock() @@ -216,5 +216,5 @@ func (m *managerImpl) Log(throttlerName string) ([]Result, error) { return nil, fmt.Errorf("throttler: %v does not exist", throttlerName) } - return t.Log(), nil + return t.log(), nil } diff --git a/go/vt/throttler/max_replication_lag_module.go b/go/vt/throttler/max_replication_lag_module.go index 5474440bb3d..f8037f7f975 100644 --- a/go/vt/throttler/max_replication_lag_module.go +++ b/go/vt/throttler/max_replication_lag_module.go @@ -306,7 +306,7 @@ func (m *MaxReplicationLagModule) recalculateRate(lagRecordNow replicationLagRec m.memory.ageBadRate(now) - r := Result{ + r := result{ Now: now, RateChange: unchangedRate, lastRateChange: m.lastRateChange, @@ -440,7 +440,7 @@ func stateGreater(a, b state) bool { // and we should not skip the current replica ("lagRecordNow"). // Even if it's the same replica we may skip it and return false because // we want to wait longer for the propagation of the current rate change. -func (m *MaxReplicationLagModule) isReplicaUnderTest(r *Result, now time.Time, testedState state, lagRecordNow replicationLagRecord) bool { +func (m *MaxReplicationLagModule) isReplicaUnderTest(r *result, now time.Time, testedState state, lagRecordNow replicationLagRecord) bool { if m.replicaUnderTest == nil { return true } @@ -466,7 +466,7 @@ func (m *MaxReplicationLagModule) isReplicaUnderTest(r *Result, now time.Time, t return true } -func (m *MaxReplicationLagModule) increaseRate(r *Result, now time.Time, lagRecordNow replicationLagRecord) { +func (m *MaxReplicationLagModule) increaseRate(r *result, now time.Time, lagRecordNow replicationLagRecord) { m.markCurrentRateAsBadOrGood(r, now, stateIncreaseRate, unknown) oldRate := m.rate.Get() @@ -554,7 +554,7 @@ func (m *MaxReplicationLagModule) minTestDurationUntilNextIncrease(increase floa return minDuration } -func (m *MaxReplicationLagModule) decreaseAndGuessRate(r *Result, now time.Time, lagRecordNow replicationLagRecord) { +func (m *MaxReplicationLagModule) decreaseAndGuessRate(r *result, now time.Time, lagRecordNow replicationLagRecord) { // Guess replication rate based on the difference in the replication lag of this // particular replica. lagRecordBefore := m.lagCache(lagRecordNow).atOrAfter(discovery.TabletToMapKey(lagRecordNow.Tablet), m.lastRateChange) @@ -625,7 +625,7 @@ func (m *MaxReplicationLagModule) decreaseAndGuessRate(r *Result, now time.Time, // guessReplicationRate guesses the actual replication rate based on the new bac // Note that "lagDifference" can be positive (lag increased) or negative (lag // decreased). -func (m *MaxReplicationLagModule) guessReplicationRate(r *Result, avgPrimaryRate float64, lagBefore, lagNow int64, lagDifference, d time.Duration) (int64, string) { +func (m *MaxReplicationLagModule) guessReplicationRate(r *result, avgPrimaryRate float64, lagBefore, lagNow int64, lagDifference, d time.Duration) (int64, string) { // avgReplicationRate is the average rate (per second) at which the replica // applied transactions from the replication stream. We infer the value // from the relative change in the replication lag. @@ -670,14 +670,14 @@ func (m *MaxReplicationLagModule) guessReplicationRate(r *Result, avgPrimaryRate return int64(newRate), reason } -func (m *MaxReplicationLagModule) emergency(r *Result, now time.Time, lagRecordNow replicationLagRecord) { +func (m *MaxReplicationLagModule) emergency(r *result, now time.Time, lagRecordNow replicationLagRecord) { m.markCurrentRateAsBadOrGood(r, now, stateEmergency, unknown) decreaseReason := fmt.Sprintf("replication lag went beyond max: %d > %d", lagRecordNow.lag(), m.config.MaxReplicationLagSec) m.decreaseRateByPercentage(r, now, lagRecordNow, stateEmergency, m.config.EmergencyDecrease, decreaseReason) } -func (m *MaxReplicationLagModule) decreaseRateByPercentage(r *Result, now time.Time, lagRecordNow replicationLagRecord, newState state, decrease float64, decreaseReason string) { +func (m *MaxReplicationLagModule) decreaseRateByPercentage(r *result, now time.Time, lagRecordNow replicationLagRecord, newState state, decrease float64, decreaseReason string) { oldRate := m.rate.Get() rate := int64(float64(oldRate) - float64(oldRate)*decrease) if rate == 0 { @@ -689,7 +689,7 @@ func (m *MaxReplicationLagModule) decreaseRateByPercentage(r *Result, now time.T m.updateRate(r, newState, rate, reason, now, lagRecordNow, m.config.MinDurationBetweenDecreases()) } -func (m *MaxReplicationLagModule) updateRate(r *Result, newState state, rate int64, reason string, now time.Time, lagRecordNow replicationLagRecord, testDuration time.Duration) { +func (m *MaxReplicationLagModule) updateRate(r *result, newState state, rate int64, reason string, now time.Time, lagRecordNow replicationLagRecord, testDuration time.Duration) { oldRate := m.rate.Get() m.currentState = newState @@ -717,7 +717,7 @@ func (m *MaxReplicationLagModule) updateRate(r *Result, newState state, rate int // markCurrentRateAsBadOrGood determines the actual rate between the last rate // change and "now" and determines if that rate was bad or good. -func (m *MaxReplicationLagModule) markCurrentRateAsBadOrGood(r *Result, now time.Time, newState state, replicationLagChange replicationLagChange) { +func (m *MaxReplicationLagModule) markCurrentRateAsBadOrGood(r *result, now time.Time, newState state, replicationLagChange replicationLagChange) { if m.lastRateChange.IsZero() { // Module was just started. We don't have any data points yet. r.GoodOrBad = ignoredRate @@ -791,6 +791,6 @@ func (m *MaxReplicationLagModule) markCurrentRateAsBadOrGood(r *Result, now time } } -func (m *MaxReplicationLagModule) log() []Result { +func (m *MaxReplicationLagModule) log() []result { return m.results.latestValues() } diff --git a/go/vt/throttler/result.go b/go/vt/throttler/result.go index 275eb01f9df..179711116a3 100644 --- a/go/vt/throttler/result.go +++ b/go/vt/throttler/result.go @@ -42,7 +42,7 @@ const ( ignoredRate = "ignored" ) -var resultStringTemplate = template.Must(template.New("Result.String()").Parse( +var resultStringTemplate = template.Must(template.New("result.String()").Parse( `rate was: {{.RateChange}} from: {{.OldRate}} to: {{.NewRate}} alias: {{.Alias}} lag: {{.LagRecordNow.Stats.ReplicationLagSeconds}}s last change: {{.TimeSinceLastRateChange}} rate: {{.CurrentRate}} good/bad? {{.GoodOrBad}} skipped b/c: {{.MemorySkipReason}} good/bad: {{.HighestGood}}/{{.LowestBad}} @@ -50,10 +50,10 @@ state (old/tested/new): {{.OldState}}/{{.TestedState}}/{{.NewState}} lag before: {{.LagBefore}} ({{.AgeOfBeforeLag}} ago) rates (primary/replica): {{.PrimaryRate}}/{{.GuessedReplicationRate}} backlog (old/new): {{.GuessedReplicationBacklogOld}}/{{.GuessedReplicationBacklogNew}} reason: {{.Reason}}`)) -// Result is generated by the MaxReplicationLag module for each processed +// result is generated by the MaxReplicationLag module for each processed // "replicationLagRecord". // It captures the details and the decision of the processing. -type Result struct { +type result struct { Now time.Time RateChange rateChange lastRateChange time.Time @@ -80,7 +80,7 @@ type Result struct { GuessedReplicationBacklogNew int } -func (r Result) String() string { +func (r result) String() string { var b bytes.Buffer if err := resultStringTemplate.Execute(&b, r); err != nil { panic(fmt.Sprintf("failed to Execute() template: %v", err)) @@ -88,25 +88,25 @@ func (r Result) String() string { return b.String() } -func (r Result) Alias() string { +func (r result) Alias() string { return topoproto.TabletAliasString(r.LagRecordNow.Tablet.Alias) } -func (r Result) TimeSinceLastRateChange() string { +func (r result) TimeSinceLastRateChange() string { if r.lastRateChange.IsZero() { return "n/a" } return fmt.Sprintf("%.1fs", r.Now.Sub(r.lastRateChange).Seconds()) } -func (r Result) LagBefore() string { +func (r result) LagBefore() string { if r.LagRecordBefore.isZero() { return "n/a" } return fmt.Sprintf("%ds", r.LagRecordBefore.Stats.ReplicationLagSeconds) } -func (r Result) AgeOfBeforeLag() string { +func (r result) AgeOfBeforeLag() string { if r.LagRecordBefore.isZero() { return "n/a" } @@ -117,24 +117,24 @@ func (r Result) AgeOfBeforeLag() string { type resultRing struct { // mu guards the fields below. mu sync.Mutex - // position holds the index of the *next* Result in the ring. + // position holds the index of the *next* result in the ring. position int // wrapped becomes true when the ring buffer "wrapped" at least once and we // started reusing entries. wrapped bool // values is the underlying ring buffer. - values []Result + values []result } // newResultRing creates a new resultRing. func newResultRing(capacity int) *resultRing { return &resultRing{ - values: make([]Result, capacity), + values: make([]result, capacity), } } -// add inserts a new Result into the ring buffer. -func (rr *resultRing) add(r Result) { +// add inserts a new result into the ring buffer. +func (rr *resultRing) add(r result) { rr.mu.Lock() defer rr.mu.Unlock() @@ -148,7 +148,7 @@ func (rr *resultRing) add(r Result) { // latestValues returns all values of the buffer. Entries are sorted in reverse // chronological order i.e. newer items come first. -func (rr *resultRing) latestValues() []Result { +func (rr *resultRing) latestValues() []result { rr.mu.Lock() defer rr.mu.Unlock() @@ -162,7 +162,7 @@ func (rr *resultRing) latestValues() []Result { count = rr.position } - results := make([]Result, count) + results := make([]result, count) for i := 0; i < count; i++ { pos := start - i if pos < 0 { diff --git a/go/vt/throttler/result_test.go b/go/vt/throttler/result_test.go index 9eadab503e8..9efc7df9412 100644 --- a/go/vt/throttler/result_test.go +++ b/go/vt/throttler/result_test.go @@ -23,7 +23,7 @@ import ( ) var ( - resultIncreased = Result{ + resultIncreased = result{ Now: sinceZero(1234 * time.Millisecond), RateChange: increasedRate, lastRateChange: sinceZero(1 * time.Millisecond), @@ -45,7 +45,7 @@ var ( GuessedReplicationBacklogOld: 0, GuessedReplicationBacklogNew: 0, } - resultDecreased = Result{ + resultDecreased = result{ Now: sinceZero(5000 * time.Millisecond), RateChange: decreasedRate, lastRateChange: sinceZero(1234 * time.Millisecond), @@ -67,7 +67,7 @@ var ( GuessedReplicationBacklogOld: 10, GuessedReplicationBacklogNew: 20, } - resultEmergency = Result{ + resultEmergency = result{ Now: sinceZero(10123 * time.Millisecond), RateChange: decreasedRate, lastRateChange: sinceZero(5000 * time.Millisecond), @@ -93,7 +93,7 @@ var ( func TestResultString(t *testing.T) { testcases := []struct { - r Result + r result want string }{ { @@ -135,27 +135,27 @@ reason: emergency state decreased the rate`, func TestResultRing(t *testing.T) { // Test data. - r1 := Result{Reason: "r1"} - r2 := Result{Reason: "r2"} - r3 := Result{Reason: "r3"} + r1 := result{Reason: "r1"} + r2 := result{Reason: "r2"} + r3 := result{Reason: "r3"} rr := newResultRing(2) // Use the ring partially. rr.add(r1) - if got, want := rr.latestValues(), []Result{r1}; !reflect.DeepEqual(got, want) { + if got, want := rr.latestValues(), []result{r1}; !reflect.DeepEqual(got, want) { t.Fatalf("items not correctly added to resultRing. got = %v, want = %v", got, want) } // Use it fully. rr.add(r2) - if got, want := rr.latestValues(), []Result{r2, r1}; !reflect.DeepEqual(got, want) { + if got, want := rr.latestValues(), []result{r2, r1}; !reflect.DeepEqual(got, want) { t.Fatalf("items not correctly added to resultRing. got = %v, want = %v", got, want) } // Let it wrap. rr.add(r3) - if got, want := rr.latestValues(), []Result{r3, r2}; !reflect.DeepEqual(got, want) { + if got, want := rr.latestValues(), []result{r3, r2}; !reflect.DeepEqual(got, want) { t.Fatalf("resultRing did not wrap correctly. got = %v, want = %v", got, want) } } diff --git a/go/vt/throttler/throttler.go b/go/vt/throttler/throttler.go index 686df6c89ab..b731bcb2fbe 100644 --- a/go/vt/throttler/throttler.go +++ b/go/vt/throttler/throttler.go @@ -315,7 +315,7 @@ func (t *Throttler) ResetConfiguration() { t.maxReplicationLagModule.resetConfiguration() } -// Log returns the most recent changes of the MaxReplicationLag module. -func (t *Throttler) Log() []Result { +// log returns the most recent changes of the MaxReplicationLag module. +func (t *Throttler) log() []result { return t.maxReplicationLagModule.log() } diff --git a/go/vt/throttler/throttlerlogz.go b/go/vt/throttler/throttlerlogz.go index 1528bc01c7a..6952b34feec 100644 --- a/go/vt/throttler/throttlerlogz.go +++ b/go/vt/throttler/throttlerlogz.go @@ -125,7 +125,7 @@ func throttlerlogzHandler(w http.ResponseWriter, r *http.Request, m *managerImpl } func showThrottlerLog(w http.ResponseWriter, m *managerImpl, name string) { - results, err := m.Log(name) + results, err := m.log(name) if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) return @@ -152,7 +152,7 @@ func showThrottlerLog(w http.ResponseWriter, m *managerImpl, name string) { colorLevel = "high" } data := struct { - Result + result ColorLevel string }{r, colorLevel} diff --git a/go/vt/throttler/throttlerlogz_test.go b/go/vt/throttler/throttlerlogz_test.go index 2ecb76c9982..82ebb77e7a1 100644 --- a/go/vt/throttler/throttlerlogz_test.go +++ b/go/vt/throttler/throttlerlogz_test.go @@ -55,7 +55,7 @@ func TestThrottlerlogzHandler(t *testing.T) { testcases := []struct { desc string - r Result + r result want string }{ { From a93fda66a1ea5eb28b9aac08550a36cecad6095f Mon Sep 17 00:00:00 2001 From: Florent Poinsard Date: Mon, 4 Jul 2022 16:29:41 +0200 Subject: [PATCH 13/15] Use GetHealthyTabletStats in waitForDrainInCell and removed unrequired target.shard checks Signed-off-by: Florent Poinsard --- go/vt/discovery/healthcheck.go | 6 ------ go/vt/wrangler/keyspace.go | 2 +- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/go/vt/discovery/healthcheck.go b/go/vt/discovery/healthcheck.go index b0da81a9111..73ed319e8a4 100644 --- a/go/vt/discovery/healthcheck.go +++ b/go/vt/discovery/healthcheck.go @@ -625,9 +625,6 @@ func (hc *HealthCheckImpl) GetHealthyTabletStats(target *query.Target) []*Tablet var result []*TabletHealth hc.mu.Lock() defer hc.mu.Unlock() - if target.Shard == "" { - target.Shard = "0" - } return append(result, hc.healthy[KeyFromTarget(target)]...) } @@ -639,9 +636,6 @@ func (hc *HealthCheckImpl) GetTabletStats(target *query.Target) []*TabletHealth var result []*TabletHealth hc.mu.Lock() defer hc.mu.Unlock() - if target.Shard == "" { - target.Shard = "0" - } ths := hc.healthData[KeyFromTarget(target)] for _, th := range ths { result = append(result, th) diff --git a/go/vt/wrangler/keyspace.go b/go/vt/wrangler/keyspace.go index 216d31c883c..37d706326d3 100644 --- a/go/vt/wrangler/keyspace.go +++ b/go/vt/wrangler/keyspace.go @@ -973,7 +973,7 @@ func (wr *Wrangler) waitForDrainInCell(ctx context.Context, cell, keyspace, shar drainedHealthyTablets := make(map[uint32]*discovery.TabletHealth) notDrainedHealtyTablets := make(map[uint32]*discovery.TabletHealth) - healthyTablets := hc.GetTabletStats(&querypb.Target{Keyspace: keyspace, Shard: shard, TabletType: servedType}) + healthyTablets := hc.GetHealthyTabletStats(&querypb.Target{Keyspace: keyspace, Shard: shard, TabletType: servedType}) for _, ts := range healthyTablets { if ts.Stats.Qps == 0.0 { drainedHealthyTablets[ts.Tablet.Alias.Uid] = ts From e3aba1b0e1da03cc1b7e32c84786ba5a88c89ade Mon Sep 17 00:00:00 2001 From: Florent Poinsard Date: Mon, 4 Jul 2022 16:57:42 +0200 Subject: [PATCH 14/15] Addition of a NewCellTabletsWatcher to waitForDrainInCell Signed-off-by: Florent Poinsard --- go/vt/vtctl/vtctl.go | 3 ++- go/vt/wrangler/keyspace.go | 9 ++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/go/vt/vtctl/vtctl.go b/go/vt/vtctl/vtctl.go index 857b4a16760..d7013b2078b 100644 --- a/go/vt/vtctl/vtctl.go +++ b/go/vt/vtctl/vtctl.go @@ -1354,7 +1354,8 @@ func commandWaitForDrain(ctx context.Context, wr *wrangler.Wrangler, subFlags *f return err } - return wr.WaitForDrain(ctx, cells, keyspace, shard, servedType, *retryDelay, *HealthcheckRetryDelay, *HealthCheckTimeout, *initialWait) + return wr.WaitForDrain(ctx, cells, keyspace, shard, servedType, + *retryDelay, *HealthCheckTopologyRefresh, *HealthcheckRetryDelay, *HealthCheckTimeout, *initialWait) } func commandSleep(ctx context.Context, wr *wrangler.Wrangler, subFlags *flag.FlagSet, args []string) error { diff --git a/go/vt/wrangler/keyspace.go b/go/vt/wrangler/keyspace.go index 37d706326d3..71962b38121 100644 --- a/go/vt/wrangler/keyspace.go +++ b/go/vt/wrangler/keyspace.go @@ -919,7 +919,7 @@ func (wr *Wrangler) updateFrozenFlag(ctx context.Context, shards []*topo.ShardIn // the tablet was actually drained. At later times, a QPS rate > 0.0 could still // be observed. func (wr *Wrangler) WaitForDrain(ctx context.Context, cells []string, keyspace, shard string, servedType topodatapb.TabletType, - retryDelay, healthcheckRetryDelay, healthCheckTimeout, initialWait time.Duration) error { + retryDelay, healthCheckTopologyRefresh, healthcheckRetryDelay, healthCheckTimeout, initialWait time.Duration) error { var err error if len(cells) == 0 { // Retrieve list of cells for the shard from the topology. @@ -936,7 +936,7 @@ func (wr *Wrangler) WaitForDrain(ctx context.Context, cells []string, keyspace, wg.Add(1) go func(cell string) { defer wg.Done() - rec.RecordError(wr.waitForDrainInCell(ctx, cell, keyspace, shard, servedType, retryDelay, healthcheckRetryDelay, healthCheckTimeout, initialWait)) + rec.RecordError(wr.waitForDrainInCell(ctx, cell, keyspace, shard, servedType, retryDelay, healthCheckTopologyRefresh, healthcheckRetryDelay, healthCheckTimeout, initialWait)) }(cell) } wg.Wait() @@ -945,12 +945,15 @@ func (wr *Wrangler) WaitForDrain(ctx context.Context, cells []string, keyspace, } func (wr *Wrangler) waitForDrainInCell(ctx context.Context, cell, keyspace, shard string, servedType topodatapb.TabletType, - retryDelay, healthcheckRetryDelay, healthCheckTimeout, initialWait time.Duration) error { + retryDelay, healthCheckTopologyRefresh, healthcheckRetryDelay, healthCheckTimeout, initialWait time.Duration) error { // Create the healthheck module, with a cache. hc := discovery.NewHealthCheck(ctx, healthcheckRetryDelay, healthCheckTimeout, wr.TopoServer(), cell, "") defer hc.Close() + watcher := discovery.NewCellTabletsWatcher(ctx, wr.TopoServer(), hc, discovery.NewFilterByKeyspace([]string{keyspace}), cell, healthCheckTopologyRefresh, true, discovery.DefaultTopoReadConcurrency) + defer watcher.Stop() + // Wait for at least one tablet. if err := hc.WaitForTablets(ctx, keyspace, shard, servedType); err != nil { return fmt.Errorf("%v: error waiting for initial %v tablets for %v/%v: %v", cell, servedType, keyspace, shard, err) From b89882fdd19b35a1037614bbb083d10de5da288f Mon Sep 17 00:00:00 2001 From: Florent Poinsard Date: Fri, 8 Jul 2022 16:00:23 +0200 Subject: [PATCH 15/15] Moved TabletRecorded methods inside the HealthCheck interface Signed-off-by: Florent Poinsard --- go/vt/discovery/healthcheck.go | 18 +++++++----------- go/vt/discovery/topology_watcher.go | 16 ++++++++-------- .../tabletserver/txthrottler/tx_throttler.go | 6 +++--- .../txthrottler/tx_throttler_test.go | 2 +- 4 files changed, 19 insertions(+), 23 deletions(-) diff --git a/go/vt/discovery/healthcheck.go b/go/vt/discovery/healthcheck.go index 73ed319e8a4..65bfa5aafc5 100644 --- a/go/vt/discovery/healthcheck.go +++ b/go/vt/discovery/healthcheck.go @@ -154,23 +154,19 @@ func FilteringKeyspaces() bool { return len(KeyspacesToWatch) > 0 } -// TabletRecorder is a sub interface of HealthCheck. -// It is separated out to enable unit testing. -type TabletRecorder interface { +type KeyspaceShardTabletType string +type tabletAliasString string + +// HealthCheck declares what the TabletGateway needs from the HealthCheck +type HealthCheck interface { // AddTablet adds the tablet. AddTablet(tablet *topodata.Tablet) + // RemoveTablet removes the tablet. RemoveTablet(tablet *topodata.Tablet) + // ReplaceTablet does an AddTablet and RemoveTablet in one call, effectively replacing the old tablet with the new. ReplaceTablet(old, new *topodata.Tablet) -} - -type KeyspaceShardTabletType string -type tabletAliasString string - -// HealthCheck declares what the TabletGateway needs from the HealthCheck -type HealthCheck interface { - TabletRecorder // CacheStatus returns a displayable version of the health check cache. CacheStatus() TabletsCacheStatusList diff --git a/go/vt/discovery/topology_watcher.go b/go/vt/discovery/topology_watcher.go index c510a7ee871..a2a70a1f2d0 100644 --- a/go/vt/discovery/topology_watcher.go +++ b/go/vt/discovery/topology_watcher.go @@ -66,7 +66,7 @@ type tabletInfo struct { type TopologyWatcher struct { // set at construction time topoServer *topo.Server - tabletRecorder TabletRecorder + healthcheck HealthCheck tabletFilter TabletFilter cell string refreshInterval time.Duration @@ -94,10 +94,10 @@ type TopologyWatcher struct { // NewTopologyWatcher returns a TopologyWatcher that monitors all // the tablets in a cell, and starts refreshing. -func NewTopologyWatcher(ctx context.Context, topoServer *topo.Server, tr TabletRecorder, filter TabletFilter, cell string, refreshInterval time.Duration, refreshKnownTablets bool, topoReadConcurrency int, getTablets func(tw *TopologyWatcher) ([]*topodata.TabletAlias, error)) *TopologyWatcher { +func NewTopologyWatcher(ctx context.Context, topoServer *topo.Server, hc HealthCheck, filter TabletFilter, cell string, refreshInterval time.Duration, refreshKnownTablets bool, topoReadConcurrency int, getTablets func(tw *TopologyWatcher) ([]*topodata.TabletAlias, error)) *TopologyWatcher { tw := &TopologyWatcher{ topoServer: topoServer, - tabletRecorder: tr, + healthcheck: hc, tabletFilter: filter, cell: cell, refreshInterval: refreshInterval, @@ -116,8 +116,8 @@ func NewTopologyWatcher(ctx context.Context, topoServer *topo.Server, tr TabletR // NewCellTabletsWatcher returns a TopologyWatcher that monitors all // the tablets in a cell, and starts refreshing. -func NewCellTabletsWatcher(ctx context.Context, topoServer *topo.Server, tr TabletRecorder, f TabletFilter, cell string, refreshInterval time.Duration, refreshKnownTablets bool, topoReadConcurrency int) *TopologyWatcher { - return NewTopologyWatcher(ctx, topoServer, tr, f, cell, refreshInterval, refreshKnownTablets, topoReadConcurrency, func(tw *TopologyWatcher) ([]*topodata.TabletAlias, error) { +func NewCellTabletsWatcher(ctx context.Context, topoServer *topo.Server, hc HealthCheck, f TabletFilter, cell string, refreshInterval time.Duration, refreshKnownTablets bool, topoReadConcurrency int) *TopologyWatcher { + return NewTopologyWatcher(ctx, topoServer, hc, f, cell, refreshInterval, refreshKnownTablets, topoReadConcurrency, func(tw *TopologyWatcher) ([]*topodata.TabletAlias, error) { return tw.topoServer.GetTabletAliasesByCell(ctx, tw.cell) }) } @@ -225,19 +225,19 @@ func (tw *TopologyWatcher) loadTablets() { if oldKey != newKey { // This is the case where the same tablet alias is now reporting // a different address (host:port) key. - tw.tabletRecorder.ReplaceTablet(val.tablet, newVal.tablet) + tw.healthcheck.ReplaceTablet(val.tablet, newVal.tablet) topologyWatcherOperations.Add(topologyWatcherOpReplaceTablet, 1) } } else { // This is a new tablet record, let's add it to the healthcheck - tw.tabletRecorder.AddTablet(newVal.tablet) + tw.healthcheck.AddTablet(newVal.tablet) topologyWatcherOperations.Add(topologyWatcherOpAddTablet, 1) } } for _, val := range tw.tablets { if _, ok := newTablets[val.alias]; !ok { - tw.tabletRecorder.RemoveTablet(val.tablet) + tw.healthcheck.RemoveTablet(val.tablet) topologyWatcherOperations.Add(topologyWatcherOpRemoveTablet, 1) } } diff --git a/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go b/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go index 143840e0c21..6f0a10d1776 100644 --- a/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go +++ b/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go @@ -179,7 +179,7 @@ type txThrottlerState struct { // topology watchers and go/vt/throttler. These are provided here so that they can be overridden // in tests to generate mocks. type healthCheckFactoryFunc func(topoServer *topo.Server, cell string, cellsToWatch []string) discovery.HealthCheck -type topologyWatcherFactoryFunc func(topoServer *topo.Server, tr discovery.TabletRecorder, cell, keyspace, shard string, refreshInterval time.Duration, topoReadConcurrency int) TopologyWatcherInterface +type topologyWatcherFactoryFunc func(topoServer *topo.Server, hc discovery.HealthCheck, cell, keyspace, shard string, refreshInterval time.Duration, topoReadConcurrency int) TopologyWatcherInterface type throttlerFactoryFunc func(name, unit string, threadCount int, maxRate, maxReplicationLag int64) (ThrottlerInterface, error) var ( @@ -196,8 +196,8 @@ func resetTxThrottlerFactories() { healthCheckFactory = func(topoServer *topo.Server, cell string, cellsToWatch []string) discovery.HealthCheck { return discovery.NewHealthCheck(context.Background(), discovery.DefaultHealthCheckRetryDelay, discovery.DefaultHealthCheckTimeout, topoServer, cell, strings.Join(cellsToWatch, ",")) } - topologyWatcherFactory = func(topoServer *topo.Server, tr discovery.TabletRecorder, cell, keyspace, shard string, refreshInterval time.Duration, topoReadConcurrency int) TopologyWatcherInterface { - return discovery.NewCellTabletsWatcher(context.Background(), topoServer, tr, discovery.NewFilterByKeyspace([]string{keyspace}), cell, refreshInterval, true, topoReadConcurrency) + topologyWatcherFactory = func(topoServer *topo.Server, hc discovery.HealthCheck, cell, keyspace, shard string, refreshInterval time.Duration, topoReadConcurrency int) TopologyWatcherInterface { + return discovery.NewCellTabletsWatcher(context.Background(), topoServer, hc, discovery.NewFilterByKeyspace([]string{keyspace}), cell, refreshInterval, true, topoReadConcurrency) } throttlerFactory = func(name, unit string, threadCount int, maxRate, maxReplicationLag int64) (ThrottlerInterface, error) { return throttler.NewThrottler(name, unit, threadCount, maxRate, maxReplicationLag) diff --git a/go/vt/vttablet/tabletserver/txthrottler/tx_throttler_test.go b/go/vt/vttablet/tabletserver/txthrottler/tx_throttler_test.go index bdce7899370..1606fa2cf4c 100644 --- a/go/vt/vttablet/tabletserver/txthrottler/tx_throttler_test.go +++ b/go/vt/vttablet/tabletserver/txthrottler/tx_throttler_test.go @@ -69,7 +69,7 @@ func TestEnabledThrottler(t *testing.T) { return mockHealthCheck } - topologyWatcherFactory = func(topoServer *topo.Server, tr discovery.TabletRecorder, cell, keyspace, shard string, refreshInterval time.Duration, topoReadConcurrency int) TopologyWatcherInterface { + topologyWatcherFactory = func(topoServer *topo.Server, hc discovery.HealthCheck, cell, keyspace, shard string, refreshInterval time.Duration, topoReadConcurrency int) TopologyWatcherInterface { if ts != topoServer { t.Errorf("want: %v, got: %v", ts, topoServer) }