diff --git a/changelog/18.0/18.0.0/summary.md b/changelog/18.0/18.0.0/summary.md index e0001324c26..903e6a95895 100644 --- a/changelog/18.0/18.0.0/summary.md +++ b/changelog/18.0/18.0.0/summary.md @@ -101,6 +101,14 @@ Vitess upgrade process from an earlier version if you need to use such a workflo Any MoveTables or Migrate workflow that moves a sequence table should only be run after all vitess components have been upgraded, and no upgrade should be done while such a workflow is in progress. +#### New Dry-run/monitoring-only mode for the transaction throttler + +A new CLI flag `--tx-throttler-dry-run` to set the Transaction Throttler to monitoring-only/dry-run mode has been added. +If the transaction throttler is enabled with `--enable-tx-throttler` and the new dry-run flag is also specified, the +tablet will not actually throttle any transactions; however, it will increase the counters for transactions throttled +(`vttablet_transaction_throttler_throttled`). This allows users to deploy the transaction throttler in production and +gain observability on how much throttling would take place, without actually throttling any requests. + ### Docker #### Bookworm added and made default diff --git a/go/flags/endtoend/vttablet.txt b/go/flags/endtoend/vttablet.txt index 571933310b8..ba53b392123 100644 --- a/go/flags/endtoend/vttablet.txt +++ b/go/flags/endtoend/vttablet.txt @@ -347,6 +347,7 @@ Usage of vttablet: --twopc_enable if the flag is on, 2pc is enabled. Other 2pc flags must be supplied. --tx-throttler-config string Synonym to -tx_throttler_config (default "target_replication_lag_sec:2 max_replication_lag_sec:10 initial_rate:100 max_increase:1 emergency_decrease:0.5 min_duration_between_increases_sec:40 max_duration_between_increases_sec:62 min_duration_between_decreases_sec:20 spread_backlog_across_sec:20 age_bad_rate_after_sec:180 bad_rate_increase:0.1 max_rate_approach_threshold:0.9") --tx-throttler-default-priority int Default priority assigned to queries that lack priority information (default 100) + --tx-throttler-dry-run If present, the transaction throttler only records metrics about requests received and throttled, but does not actually throttle any requests. --tx-throttler-healthcheck-cells strings Synonym to -tx_throttler_healthcheck_cells --tx-throttler-tablet-types strings A comma-separated list of tablet types. Only tablets of this type are monitored for replication lag by the transaction throttler. Supported types are replica and/or rdonly. (default replica) --tx-throttler-topo-refresh-interval duration The rate that the transaction throttler will refresh the topology to find cells. (default 5m0s) diff --git a/go/vt/vttablet/tabletserver/tabletenv/config.go b/go/vt/vttablet/tabletserver/tabletenv/config.go index 9b3a2fd7d37..cb91cf271ac 100644 --- a/go/vt/vttablet/tabletserver/tabletenv/config.go +++ b/go/vt/vttablet/tabletserver/tabletenv/config.go @@ -185,6 +185,7 @@ func registerTabletEnvFlags(fs *pflag.FlagSet) { flagutil.DualFormatStringListVar(fs, ¤tConfig.TxThrottlerHealthCheckCells, "tx_throttler_healthcheck_cells", defaultConfig.TxThrottlerHealthCheckCells, "A comma-separated list of cells. Only tabletservers running in these cells will be monitored for replication lag by the transaction throttler.") fs.IntVar(¤tConfig.TxThrottlerDefaultPriority, "tx-throttler-default-priority", defaultConfig.TxThrottlerDefaultPriority, "Default priority assigned to queries that lack priority information") fs.Var(currentConfig.TxThrottlerTabletTypes, "tx-throttler-tablet-types", "A comma-separated list of tablet types. Only tablets of this type are monitored for replication lag by the transaction throttler. Supported types are replica and/or rdonly.") + fs.BoolVar(¤tConfig.TxThrottlerDryRun, "tx-throttler-dry-run", defaultConfig.TxThrottlerDryRun, "If present, the transaction throttler only records metrics about requests received and throttled, but does not actually throttle any requests.") fs.DurationVar(¤tConfig.TxThrottlerTopoRefreshInterval, "tx-throttler-topo-refresh-interval", time.Minute*5, "The rate that the transaction throttler will refresh the topology to find cells.") fs.BoolVar(&enableHotRowProtection, "enable_hot_row_protection", false, "If true, incoming transactions for the same row (range) will be queued and cannot consume all txpool slots.") @@ -366,6 +367,7 @@ type TabletConfig struct { TxThrottlerDefaultPriority int `json:"-"` TxThrottlerTabletTypes *topoproto.TabletTypeListFlag `json:"-"` TxThrottlerTopoRefreshInterval time.Duration `json:"-"` + TxThrottlerDryRun bool `json:"-"` EnableTableGC bool `json:"-"` // can be turned off programmatically by tests @@ -831,6 +833,7 @@ var defaultConfig = TabletConfig{ TxThrottlerHealthCheckCells: []string{}, TxThrottlerDefaultPriority: sqlparser.MaxPriorityValue, // This leads to all queries being candidates to throttle TxThrottlerTabletTypes: &topoproto.TabletTypeListFlag{topodatapb.TabletType_REPLICA}, + TxThrottlerDryRun: false, TxThrottlerTopoRefreshInterval: time.Minute * 5, TransactionLimitConfig: defaultTransactionLimitConfig(), diff --git a/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go b/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go index 0bc8c34a69b..a119b703d62 100644 --- a/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go +++ b/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go @@ -148,7 +148,7 @@ type txThrottler struct { // state holds an open transaction throttler state. It is nil // if the TransactionThrottler is closed. - state *txThrottlerState + state txThrottlerState target *querypb.Target topoServer *topo.Server @@ -170,6 +170,10 @@ type txThrottlerConfig struct { // returns false. enabled bool + // if dryRun is true, the txThrottler will run only on monitoring mode, meaning that it will increase counters for + // total and actually throttled requests, but it will not actually return that a transaction should be throttled. + dryRun bool + throttlerConfig *throttlerdatapb.Configuration // healthCheckCells stores the cell names in which running vttablets will be monitored for // replication lag. @@ -182,8 +186,14 @@ type txThrottlerConfig struct { topoRefreshInterval time.Duration } -// txThrottlerState holds the state of an open TxThrottler object. -type txThrottlerState struct { +type txThrottlerState interface { + deallocateResources() + StatsUpdate(tabletStats *discovery.TabletHealth) + throttle() bool +} + +// txThrottlerStateImpl holds the state of an open TxThrottler object. +type txThrottlerStateImpl struct { config *txThrottlerConfig txThrottler *txThrottler @@ -221,6 +231,7 @@ func NewTxThrottler(env tabletenv.Env, topoServer *topo.Server) TxThrottler { throttlerConfig = &txThrottlerConfig{ enabled: true, healthCheckCells: healthCheckCells, + dryRun: env.Config().TxThrottlerDryRun, tabletTypes: tabletTypes, throttlerConfig: env.Config().TxThrottlerConfig.Get(), topoRefreshInterval: env.Config().TxThrottlerTopoRefreshInterval, @@ -299,10 +310,10 @@ func (t *txThrottler) Throttle(priority int, workload string) (result bool) { t.requestsThrottled.Add(workload, 1) } - return result + return result && !t.config.dryRun } -func newTxThrottlerState(txThrottler *txThrottler, config *txThrottlerConfig, target *querypb.Target) (*txThrottlerState, error) { +func newTxThrottlerState(txThrottler *txThrottler, config *txThrottlerConfig, target *querypb.Target) (txThrottlerState, error) { maxReplicationLagModuleConfig := throttler.MaxReplicationLagModuleConfig{Configuration: config.throttlerConfig} t, err := throttlerFactory( @@ -319,7 +330,7 @@ func newTxThrottlerState(txThrottler *txThrottler, config *txThrottlerConfig, ta t.Close() return nil, err } - state := &txThrottlerState{ + state := &txThrottlerStateImpl{ config: config, healthCheckCells: config.healthCheckCells, throttler: t, @@ -342,7 +353,7 @@ func newTxThrottlerState(txThrottler *txThrottler, config *txThrottlerConfig, ta return state, nil } -func (ts *txThrottlerState) initHealthCheckStream(topoServer *topo.Server, target *querypb.Target) { +func (ts *txThrottlerStateImpl) initHealthCheckStream(topoServer *topo.Server, target *querypb.Target) { ts.healthCheck = healthCheckFactory(topoServer, target.Cell, ts.healthCheckCells) ts.healthCheckChan = ts.healthCheck.Subscribe() @@ -362,7 +373,7 @@ func (ts *txThrottlerState) initHealthCheckStream(topoServer *topo.Server, targe } } -func (ts *txThrottlerState) closeHealthCheckStream() { +func (ts *txThrottlerStateImpl) closeHealthCheckStream() { if ts.healthCheck == nil { return } @@ -375,7 +386,7 @@ func (ts *txThrottlerState) closeHealthCheckStream() { ts.healthCheck.Close() } -func (ts *txThrottlerState) updateHealthCheckCells(ctx context.Context, topoServer *topo.Server, target *querypb.Target) { +func (ts *txThrottlerStateImpl) updateHealthCheckCells(ctx context.Context, topoServer *topo.Server, target *querypb.Target) { fetchCtx, cancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout) defer cancel() @@ -388,7 +399,7 @@ func (ts *txThrottlerState) updateHealthCheckCells(ctx context.Context, topoServ } } -func (ts *txThrottlerState) healthChecksProcessor(ctx context.Context, topoServer *topo.Server, target *querypb.Target) { +func (ts *txThrottlerStateImpl) healthChecksProcessor(ctx context.Context, topoServer *topo.Server, target *querypb.Target) { var cellsUpdateTicks <-chan time.Time if ts.cellsFromTopo { ticker := time.NewTicker(ts.config.topoRefreshInterval) @@ -407,7 +418,7 @@ func (ts *txThrottlerState) healthChecksProcessor(ctx context.Context, topoServe } } -func (ts *txThrottlerState) throttle() bool { +func (ts *txThrottlerStateImpl) throttle() bool { if ts.throttler == nil { log.Error("throttle called after deallocateResources was called") return false @@ -418,19 +429,19 @@ func (ts *txThrottlerState) throttle() bool { return ts.throttler.Throttle(0 /* threadId */) > 0 } -func (ts *txThrottlerState) deallocateResources() { +func (ts *txThrottlerStateImpl) deallocateResources() { // Close healthcheck and topo watchers ts.closeHealthCheckStream() ts.healthCheck = nil - // After ts.healthCheck is closed txThrottlerState.StatsUpdate() is guaranteed not + // After ts.healthCheck is closed txThrottlerStateImpl.StatsUpdate() is guaranteed not // to be executing, so we can safely close the throttler. ts.throttler.Close() ts.throttler = nil } // StatsUpdate updates the health of a tablet with the given healthcheck. -func (ts *txThrottlerState) StatsUpdate(tabletStats *discovery.TabletHealth) { +func (ts *txThrottlerStateImpl) StatsUpdate(tabletStats *discovery.TabletHealth) { if ts.config.tabletTypes == nil { return } diff --git a/go/vt/vttablet/tabletserver/txthrottler/tx_throttler_test.go b/go/vt/vttablet/tabletserver/txthrottler/tx_throttler_test.go index 4f5303aeb3f..e36d3517264 100644 --- a/go/vt/vttablet/tabletserver/txthrottler/tx_throttler_test.go +++ b/go/vt/vttablet/tabletserver/txthrottler/tx_throttler_test.go @@ -200,3 +200,54 @@ func TestNewTxThrottler(t *testing.T) { assert.Equal(t, []string{"cell1", "cell2"}, throttlerImpl.config.healthCheckCells) } } + +func TestDryRunThrottler(t *testing.T) { + config := tabletenv.NewDefaultConfig() + env := tabletenv.NewEnv(config, t.Name()) + + testCases := []struct { + Name string + txThrottlerStateShouldThrottle bool + throttlerDryRun bool + expectedResult bool + }{ + {Name: "Real run throttles when txThrottlerStateImpl says it should", txThrottlerStateShouldThrottle: true, throttlerDryRun: false, expectedResult: true}, + {Name: "Real run does not throttle when txThrottlerStateImpl says it should not", txThrottlerStateShouldThrottle: false, throttlerDryRun: false, expectedResult: false}, + {Name: "Dry run does not throttle when txThrottlerStateImpl says it should", txThrottlerStateShouldThrottle: true, throttlerDryRun: true, expectedResult: false}, + {Name: "Dry run does not throttle when txThrottlerStateImpl says it should not", txThrottlerStateShouldThrottle: false, throttlerDryRun: true, expectedResult: false}, + } + + for _, aTestCase := range testCases { + theTestCase := aTestCase + + t.Run(theTestCase.Name, func(t *testing.T) { + aTxThrottler := &txThrottler{ + config: &txThrottlerConfig{ + enabled: true, + dryRun: theTestCase.throttlerDryRun, + }, + state: &mockTxThrottlerState{shouldThrottle: theTestCase.txThrottlerStateShouldThrottle}, + throttlerRunning: env.Exporter().NewGauge("TransactionThrottlerRunning", "transaction throttler running state"), + requestsTotal: env.Exporter().NewCountersWithSingleLabel("TransactionThrottlerRequests", "transaction throttler requests", "workload"), + requestsThrottled: env.Exporter().NewCountersWithSingleLabel("TransactionThrottlerThrottled", "transaction throttler requests throttled", "workload"), + } + + assert.Equal(t, theTestCase.expectedResult, aTxThrottler.Throttle(100, "some-workload")) + }) + } +} + +type mockTxThrottlerState struct { + shouldThrottle bool +} + +func (t *mockTxThrottlerState) deallocateResources() { + +} +func (t *mockTxThrottlerState) StatsUpdate(tabletStats *discovery.TabletHealth) { + +} + +func (t *mockTxThrottlerState) throttle() bool { + return t.shouldThrottle +}