From 1f5ac88bc124a79f9bcaa31b8fb6e92229499524 Mon Sep 17 00:00:00 2001 From: Tim Vaillancourt Date: Thu, 20 Feb 2025 12:26:10 +0100 Subject: [PATCH] `slack-21.0`: backport v22 VTOrc fixes, part 1 (#610) * Move to native sqlite3 queries (#17124) Signed-off-by: Dirkjan Bussink Co-authored-by: Tim Vaillancourt * Improve efficiency of `vtorc` topo calls (#17071) Signed-off-by: Tim Vaillancourt Co-authored-by: Matt Lord * Ensure all topo read calls consider `--topo_read_concurrency` (#17276) Signed-off-by: Tim Vaillancourt * Avoid flaky topo concurrency test (#17407) Signed-off-by: Tim Vaillancourt * `vtorc`: fetch all tablets from cells once + filter during refresh (#17388) Signed-off-by: Tim Vaillancourt * Support KeyRange in `--clusters_to_watch` flag (#17604) Signed-off-by: Manan Gupta * `vtorc`: improve handling of partial cell topo results (#17718) Signed-off-by: Tim Vaillancourt * Add stats for shards watched by VTOrc Signed-off-by: Tim Vaillancourt * add more tests Signed-off-by: Tim Vaillancourt * cleanup Signed-off-by: Tim Vaillancourt * fix ineffassign Signed-off-by: Tim Vaillancourt * fix test for v21 Signed-off-by: Tim Vaillancourt * Use prefix in all vtorc check and recover logs (#17526) Signed-off-by: Eduardo J. Ortega U. <5791035+ejortegau@users.noreply.github.com> --------- Signed-off-by: Dirkjan Bussink Signed-off-by: Tim Vaillancourt Signed-off-by: Manan Gupta Signed-off-by: Eduardo J. Ortega U. <5791035+ejortegau@users.noreply.github.com> Co-authored-by: Dirkjan Bussink Co-authored-by: Matt Lord Co-authored-by: Manan Gupta <35839558+GuptaManan100@users.noreply.github.com> Co-authored-by: Eduardo J. Ortega U. <5791035+ejortegau@users.noreply.github.com> --- go/flags/endtoend/vtbackup.txt | 1 + go/flags/endtoend/vtcombo.txt | 2 +- go/flags/endtoend/vtctld.txt | 2 +- go/flags/endtoend/vtgate.txt | 2 +- go/flags/endtoend/vtorc.txt | 3 +- go/flags/endtoend/vttablet.txt | 1 + go/vt/discovery/healthcheck.go | 2 +- go/vt/discovery/topology_watcher.go | 29 +- go/vt/discovery/topology_watcher_test.go | 10 +- go/vt/external/golib/sqlutils/dialect.go | 53 --- .../external/golib/sqlutils/sqlite_dialect.go | 134 ------- .../golib/sqlutils/sqlite_dialect_test.go | 314 ---------------- go/vt/key/key.go | 13 + go/vt/log/log.go | 91 +++++ go/vt/topo/faketopo/faketopo.go | 70 +++- go/vt/topo/keyspace.go | 27 +- go/vt/topo/server.go | 13 +- go/vt/topo/shard.go | 77 +++- go/vt/topo/shard_test.go | 8 + go/vt/topo/stats_conn.go | 35 +- go/vt/topo/stats_conn_test.go | 212 ++++++----- go/vt/topo/tablet.go | 53 ++- go/vt/topo/tablet_test.go | 348 +++++++++++++++--- go/vt/vtorc/db/db.go | 29 +- go/vt/vtorc/inst/analysis_dao.go | 14 +- go/vt/vtorc/inst/audit_dao.go | 2 +- go/vt/vtorc/inst/instance_dao.go | 61 ++- go/vt/vtorc/inst/instance_dao_test.go | 65 ++-- go/vt/vtorc/inst/shard_dao.go | 37 ++ go/vt/vtorc/inst/shard_dao_test.go | 20 +- go/vt/vtorc/logic/disable_recovery.go | 2 +- go/vt/vtorc/logic/keyspace_shard_discovery.go | 80 ++-- .../logic/keyspace_shard_discovery_test.go | 33 +- go/vt/vtorc/logic/tablet_discovery.go | 215 +++++++---- go/vt/vtorc/logic/tablet_discovery_test.go | 277 ++++++++++++++ go/vt/vtorc/logic/topology_recovery.go | 144 +++++--- go/vt/vtorc/logic/topology_recovery_dao.go | 14 +- .../vtorc/logic/topology_recovery_dao_test.go | 18 +- go/vt/vtorc/logic/topology_recovery_test.go | 4 +- go/vt/vtorc/process/health.go | 2 +- 40 files changed, 1491 insertions(+), 1026 deletions(-) delete mode 100644 go/vt/external/golib/sqlutils/dialect.go delete mode 100644 
go/vt/external/golib/sqlutils/sqlite_dialect.go delete mode 100644 go/vt/external/golib/sqlutils/sqlite_dialect_test.go diff --git a/go/flags/endtoend/vtbackup.txt b/go/flags/endtoend/vtbackup.txt index bf3a9eb9690..b4405960711 100644 --- a/go/flags/endtoend/vtbackup.txt +++ b/go/flags/endtoend/vtbackup.txt @@ -231,6 +231,7 @@ Flags: --topo_global_root string the path of the global topology data in the global topology server --topo_global_server_address string the address of the global topology server --topo_implementation string the topology implementation to use + --topo_read_concurrency int Maximum concurrency of topo reads per global or local cell. (default 32) --topo_zk_auth_file string auth to use when connecting to the zk topo server, file contents should be :, e.g., digest:user:pass --topo_zk_base_timeout duration zk base timeout (see zk.Connect) (default 30s) --topo_zk_max_concurrency int maximum number of pending requests to send to a Zookeeper server. (default 64) diff --git a/go/flags/endtoend/vtcombo.txt b/go/flags/endtoend/vtcombo.txt index 01a391d0cad..5190cc4ffab 100644 --- a/go/flags/endtoend/vtcombo.txt +++ b/go/flags/endtoend/vtcombo.txt @@ -374,7 +374,7 @@ Flags: --topo_global_root string the path of the global topology data in the global topology server --topo_global_server_address string the address of the global topology server --topo_implementation string the topology implementation to use - --topo_read_concurrency int Concurrency of topo reads. (default 32) + --topo_read_concurrency int Maximum concurrency of topo reads per global or local cell. (default 32) --topo_zk_auth_file string auth to use when connecting to the zk topo server, file contents should be :, e.g., digest:user:pass --topo_zk_base_timeout duration zk base timeout (see zk.Connect) (default 30s) --topo_zk_max_concurrency int maximum number of pending requests to send to a Zookeeper server. (default 64) diff --git a/go/flags/endtoend/vtctld.txt b/go/flags/endtoend/vtctld.txt index 8b1aa6f4a92..be0f5114e79 100644 --- a/go/flags/endtoend/vtctld.txt +++ b/go/flags/endtoend/vtctld.txt @@ -164,7 +164,7 @@ Flags: --topo_global_root string the path of the global topology data in the global topology server --topo_global_server_address string the address of the global topology server --topo_implementation string the topology implementation to use - --topo_read_concurrency int Concurrency of topo reads. (default 32) + --topo_read_concurrency int Maximum concurrency of topo reads per global or local cell. (default 32) --topo_zk_auth_file string auth to use when connecting to the zk topo server, file contents should be :, e.g., digest:user:pass --topo_zk_base_timeout duration zk base timeout (see zk.Connect) (default 30s) --topo_zk_max_concurrency int maximum number of pending requests to send to a Zookeeper server. (default 64) diff --git a/go/flags/endtoend/vtgate.txt b/go/flags/endtoend/vtgate.txt index 4cb4cd34148..fde17f89c49 100644 --- a/go/flags/endtoend/vtgate.txt +++ b/go/flags/endtoend/vtgate.txt @@ -223,7 +223,7 @@ Flags: --topo_global_root string the path of the global topology data in the global topology server --topo_global_server_address string the address of the global topology server --topo_implementation string the topology implementation to use - --topo_read_concurrency int Concurrency of topo reads. (default 32) + --topo_read_concurrency int Maximum concurrency of topo reads per global or local cell. 
(default 32) --topo_zk_auth_file string auth to use when connecting to the zk topo server, file contents should be :, e.g., digest:user:pass --topo_zk_base_timeout duration zk base timeout (see zk.Connect) (default 30s) --topo_zk_max_concurrency int maximum number of pending requests to send to a Zookeeper server. (default 64) diff --git a/go/flags/endtoend/vtorc.txt b/go/flags/endtoend/vtorc.txt index 3917522a2f8..11022f8e2a7 100644 --- a/go/flags/endtoend/vtorc.txt +++ b/go/flags/endtoend/vtorc.txt @@ -24,7 +24,7 @@ Flags: --bind-address string Bind address for the server. If empty, the server will listen on all available unicast and anycast IP addresses of the local system. --catch-sigpipe catch and ignore SIGPIPE on stdout and stderr if specified --change-tablets-with-errant-gtid-to-drained Whether VTOrc should be changing the type of tablets with errant GTIDs to DRAINED - --clusters_to_watch strings Comma-separated list of keyspaces or keyspace/shards that this instance will monitor and repair. Defaults to all clusters in the topology. Example: "ks1,ks2/-80" + --clusters_to_watch strings Comma-separated list of keyspaces or keyspace/keyranges that this instance will monitor and repair. Defaults to all clusters in the topology. Example: "ks1,ks2/-80" --config string config file name --config-file string Full path of the config file (with extension) to use. If set, --config-path, --config-type, and --config-name are ignored. --config-file-not-found-handling ConfigFileNotFoundHandling Behavior when a config file is not found. (Options: error, exit, ignore, warn) (default warn) @@ -99,6 +99,7 @@ Flags: --topo_global_root string the path of the global topology data in the global topology server --topo_global_server_address string the address of the global topology server --topo_implementation string the topology implementation to use + --topo_read_concurrency int Maximum concurrency of topo reads per global or local cell. (default 32) --topo_zk_auth_file string auth to use when connecting to the zk topo server, file contents should be :, e.g., digest:user:pass --topo_zk_base_timeout duration zk base timeout (see zk.Connect) (default 30s) --topo_zk_max_concurrency int maximum number of pending requests to send to a Zookeeper server. (default 64) diff --git a/go/flags/endtoend/vttablet.txt b/go/flags/endtoend/vttablet.txt index 8be7b620469..1d10b7066de 100644 --- a/go/flags/endtoend/vttablet.txt +++ b/go/flags/endtoend/vttablet.txt @@ -376,6 +376,7 @@ Flags: --topo_global_root string the path of the global topology data in the global topology server --topo_global_server_address string the address of the global topology server --topo_implementation string the topology implementation to use + --topo_read_concurrency int Maximum concurrency of topo reads per global or local cell. (default 32) --topo_zk_auth_file string auth to use when connecting to the zk topo server, file contents should be :, e.g., digest:user:pass --topo_zk_base_timeout duration zk base timeout (see zk.Connect) (default 30s) --topo_zk_max_concurrency int maximum number of pending requests to send to a Zookeeper server. 
(default 64) diff --git a/go/vt/discovery/healthcheck.go b/go/vt/discovery/healthcheck.go index d4dfcc3ffa5..35f4ac1c2a9 100644 --- a/go/vt/discovery/healthcheck.go +++ b/go/vt/discovery/healthcheck.go @@ -376,7 +376,7 @@ func NewHealthCheck(ctx context.Context, retryDelay, healthCheckTimeout time.Dur if c == "" { continue } - topoWatchers = append(topoWatchers, NewTopologyWatcher(ctx, topoServer, hc, filters, c, refreshInterval, refreshKnownTablets, topo.DefaultConcurrency)) + topoWatchers = append(topoWatchers, NewTopologyWatcher(ctx, topoServer, hc, filters, c, refreshInterval, refreshKnownTablets)) } hc.topoWatchers = topoWatchers diff --git a/go/vt/discovery/topology_watcher.go b/go/vt/discovery/topology_watcher.go index 64346d524ad..d1e358e1aa5 100644 --- a/go/vt/discovery/topology_watcher.go +++ b/go/vt/discovery/topology_watcher.go @@ -26,16 +26,13 @@ import ( "sync" "time" - "vitess.io/vitess/go/vt/topo/topoproto" - - "vitess.io/vitess/go/vt/key" - "vitess.io/vitess/go/stats" "vitess.io/vitess/go/trace" - + "vitess.io/vitess/go/vt/key" "vitess.io/vitess/go/vt/log" - "vitess.io/vitess/go/vt/proto/topodata" + topodatapb "vitess.io/vitess/go/vt/proto/topodata" "vitess.io/vitess/go/vt/topo" + "vitess.io/vitess/go/vt/topo/topoproto" ) const ( @@ -56,7 +53,7 @@ var ( // tabletInfo is used internally by the TopologyWatcher struct. type tabletInfo struct { alias string - tablet *topodata.Tablet + tablet *topodatapb.Tablet } // TopologyWatcher polls the topology periodically for changes to @@ -70,7 +67,6 @@ type TopologyWatcher struct { cell string refreshInterval time.Duration refreshKnownTablets bool - concurrency int ctx context.Context cancelFunc context.CancelFunc // wg keeps track of all launched Go routines. @@ -92,7 +88,7 @@ type TopologyWatcher struct { // NewTopologyWatcher returns a TopologyWatcher that monitors all // the tablets in a cell, and reloads them as needed. -func NewTopologyWatcher(ctx context.Context, topoServer *topo.Server, hc HealthCheck, f TabletFilter, cell string, refreshInterval time.Duration, refreshKnownTablets bool, topoReadConcurrency int) *TopologyWatcher { +func NewTopologyWatcher(ctx context.Context, topoServer *topo.Server, hc HealthCheck, f TabletFilter, cell string, refreshInterval time.Duration, refreshKnownTablets bool) *TopologyWatcher { tw := &TopologyWatcher{ topoServer: topoServer, healthcheck: hc, @@ -100,7 +96,6 @@ func NewTopologyWatcher(ctx context.Context, topoServer *topo.Server, hc HealthC cell: cell, refreshInterval: refreshInterval, refreshKnownTablets: refreshKnownTablets, - concurrency: topoReadConcurrency, tablets: make(map[string]*tabletInfo), } tw.firstLoadChan = make(chan struct{}) @@ -112,7 +107,7 @@ func NewTopologyWatcher(ctx context.Context, topoServer *topo.Server, hc HealthC } func (tw *TopologyWatcher) getTablets() ([]*topo.TabletInfo, error) { - return tw.topoServer.GetTabletsByCell(tw.ctx, tw.cell, &topo.GetTabletsByCellOptions{Concurrency: tw.concurrency}) + return tw.topoServer.GetTabletsByCell(tw.ctx, tw.cell, nil) } // Start starts the topology watcher. @@ -271,14 +266,14 @@ func (tw *TopologyWatcher) TopoChecksum() uint32 { // to be applied as an additional filter on the list of tablets returned by its getTablets function. type TabletFilter interface { // IsIncluded returns whether tablet is included in this filter - IsIncluded(tablet *topodata.Tablet) bool + IsIncluded(tablet *topodatapb.Tablet) bool } // TabletFilters contains filters for tablets. 
type TabletFilters []TabletFilter // IsIncluded returns true if a tablet passes all filters. -func (tf TabletFilters) IsIncluded(tablet *topodata.Tablet) bool { +func (tf TabletFilters) IsIncluded(tablet *topodatapb.Tablet) bool { for _, filter := range tf { if !filter.IsIncluded(tablet) { return false @@ -299,7 +294,7 @@ type FilterByShard struct { type filterShard struct { keyspace string shard string - keyRange *topodata.KeyRange // only set if shard is also a KeyRange + keyRange *topodatapb.KeyRange // only set if shard is also a KeyRange } // NewFilterByShard creates a new FilterByShard for use by a @@ -344,7 +339,7 @@ func NewFilterByShard(filters []string) (*FilterByShard, error) { } // IsIncluded returns true iff the tablet's keyspace and shard match what we have. -func (fbs *FilterByShard) IsIncluded(tablet *topodata.Tablet) bool { +func (fbs *FilterByShard) IsIncluded(tablet *topodatapb.Tablet) bool { canonical, kr, err := topo.ValidateShardName(tablet.Shard) if err != nil { log.Errorf("Error parsing shard name %v, will ignore tablet: %v", tablet.Shard, err) @@ -384,7 +379,7 @@ func NewFilterByKeyspace(selectedKeyspaces []string) *FilterByKeyspace { } // IsIncluded returns true if the tablet's keyspace matches what we have. -func (fbk *FilterByKeyspace) IsIncluded(tablet *topodata.Tablet) bool { +func (fbk *FilterByKeyspace) IsIncluded(tablet *topodatapb.Tablet) bool { _, exist := fbk.keyspaces[tablet.Keyspace] return exist } @@ -403,7 +398,7 @@ func NewFilterByTabletTags(tabletTags map[string]string) *FilterByTabletTags { } // IsIncluded returns true if the tablet's tags match what we expect. -func (fbtg *FilterByTabletTags) IsIncluded(tablet *topodata.Tablet) bool { +func (fbtg *FilterByTabletTags) IsIncluded(tablet *topodatapb.Tablet) bool { if fbtg.tags == nil { return true } diff --git a/go/vt/discovery/topology_watcher_test.go b/go/vt/discovery/topology_watcher_test.go index cef367c9b74..89a656c0982 100644 --- a/go/vt/discovery/topology_watcher_test.go +++ b/go/vt/discovery/topology_watcher_test.go @@ -67,7 +67,7 @@ func TestStartAndCloseTopoWatcher(t *testing.T) { fhc := NewFakeHealthCheck(nil) defer fhc.Close() topologyWatcherOperations.ZeroAll() - tw := NewTopologyWatcher(context.Background(), ts, fhc, nil, "aa", 100*time.Microsecond, true, 5) + tw := NewTopologyWatcher(context.Background(), ts, fhc, nil, "aa", 100*time.Microsecond, true) done := make(chan bool, 3) result := make(chan bool, 1) @@ -127,7 +127,7 @@ func checkWatcher(t *testing.T, refreshKnownTablets bool) { logger := logutil.NewMemoryLogger() topologyWatcherOperations.ZeroAll() counts := topologyWatcherOperations.Counts() - tw := NewTopologyWatcher(context.Background(), ts, fhc, filter, "aa", 10*time.Minute, refreshKnownTablets, 5) + tw := NewTopologyWatcher(context.Background(), ts, fhc, filter, "aa", 10*time.Minute, refreshKnownTablets) counts = checkOpCounts(t, counts, map[string]int64{}) checkChecksum(t, tw, 0) @@ -421,7 +421,7 @@ func TestFilterByKeyspace(t *testing.T) { f := TabletFilters{NewFilterByKeyspace(testKeyspacesToWatch)} ts := memorytopo.NewServer(ctx, testCell) defer ts.Close() - tw := NewTopologyWatcher(context.Background(), ts, hc, f, testCell, 10*time.Minute, true, 5) + tw := NewTopologyWatcher(context.Background(), ts, hc, f, testCell, 10*time.Minute, true) for _, test := range testFilterByKeyspace { // Add a new tablet to the topology. 
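As a minimal sketch (not part of this diff; watchCell, ts, hc, and the cell name are illustrative stand-ins for an existing *topo.Server, HealthCheck, and cell), callers now construct the watcher without the old trailing topoReadConcurrency argument, since topo reads are throttled globally by the --topo_read_concurrency semaphore instead:

package example

import (
	"context"
	"time"

	"vitess.io/vitess/go/vt/discovery"
	"vitess.io/vitess/go/vt/topo"
)

// watchCell is an illustrative helper showing the updated constructor
// signature; the per-watcher concurrency parameter was removed in this patch.
func watchCell(ctx context.Context, ts *topo.Server, hc discovery.HealthCheck) *discovery.TopologyWatcher {
	tw := discovery.NewTopologyWatcher(ctx, ts, hc, nil, "zone1", 10*time.Minute, true /*refreshKnownTablets*/)
	tw.Start()
	return tw
}
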
@@ -502,7 +502,7 @@ func TestFilterByKeyspaceSkipsIgnoredTablets(t *testing.T) { topologyWatcherOperations.ZeroAll() counts := topologyWatcherOperations.Counts() f := TabletFilters{NewFilterByKeyspace(testKeyspacesToWatch)} - tw := NewTopologyWatcher(context.Background(), ts, fhc, f, "aa", 10*time.Minute, false /*refreshKnownTablets*/, 5) + tw := NewTopologyWatcher(context.Background(), ts, fhc, f, "aa", 10*time.Minute, false /*refreshKnownTablets*/) counts = checkOpCounts(t, counts, map[string]int64{}) checkChecksum(t, tw, 0) @@ -639,7 +639,7 @@ func TestGetTabletErrorDoesNotRemoveFromHealthcheck(t *testing.T) { defer fhc.Close() topologyWatcherOperations.ZeroAll() counts := topologyWatcherOperations.Counts() - tw := NewTopologyWatcher(context.Background(), ts, fhc, nil, "aa", 10*time.Minute, true, 5) + tw := NewTopologyWatcher(context.Background(), ts, fhc, nil, "aa", 10*time.Minute, true) defer tw.Stop() // Force fallback to getting tablets individually. diff --git a/go/vt/external/golib/sqlutils/dialect.go b/go/vt/external/golib/sqlutils/dialect.go deleted file mode 100644 index 8dabe57ccaf..00000000000 --- a/go/vt/external/golib/sqlutils/dialect.go +++ /dev/null @@ -1,53 +0,0 @@ -/* - Copyright 2017 GitHub Inc. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -/* - This file has been copied over from VTOrc package -*/ - -package sqlutils - -import ( - "regexp" - "strings" -) - -type regexpMap struct { - r *regexp.Regexp - replacement string -} - -func (this *regexpMap) process(text string) (result string) { - return this.r.ReplaceAllString(text, this.replacement) -} - -func rmap(regexpExpression string, replacement string) regexpMap { - return regexpMap{ - r: regexp.MustCompile(regexpSpaces(regexpExpression)), - replacement: replacement, - } -} - -func regexpSpaces(statement string) string { - return strings.Replace(statement, " ", `[\s]+`, -1) -} - -func applyConversions(statement string, conversions []regexpMap) string { - for _, rmap := range conversions { - statement = rmap.process(statement) - } - return statement -} diff --git a/go/vt/external/golib/sqlutils/sqlite_dialect.go b/go/vt/external/golib/sqlutils/sqlite_dialect.go deleted file mode 100644 index 11f0e531367..00000000000 --- a/go/vt/external/golib/sqlutils/sqlite_dialect.go +++ /dev/null @@ -1,134 +0,0 @@ -/* - Copyright 2017 GitHub Inc. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -/* - This file has been copied over from VTOrc package -*/ - -// What's this about? 
-// This is a brute-force regular-expression based conversion from MySQL syntax to sqlite3 syntax. -// It is NOT meant to be a general purpose solution and is only expected & confirmed to run on -// queries issued by orchestrator. There are known limitations to this design. -// It's not even pretty. -// In fact... -// Well, it gets the job done at this time. Call it debt. - -package sqlutils - -import ( - "regexp" -) - -var sqlite3CreateTableConversions = []regexpMap{ - rmap(`(?i) (character set|charset) [\S]+`, ``), - rmap(`(?i)int unsigned`, `int`), - rmap(`(?i)int[\s]*[(][\s]*([0-9]+)[\s]*[)] unsigned`, `int`), - rmap(`(?i)engine[\s]*=[\s]*(innodb|myisam|ndb|memory|tokudb)`, ``), - rmap(`(?i)DEFAULT CHARSET[\s]*=[\s]*[\S]+`, ``), - rmap(`(?i)[\S]*int( not null|) auto_increment`, `integer`), - rmap(`(?i)comment '[^']*'`, ``), - rmap(`(?i)after [\S]+`, ``), - rmap(`(?i)alter table ([\S]+) add (index|key) ([\S]+) (.+)`, `create index ${3}_${1} on $1 $4`), - rmap(`(?i)alter table ([\S]+) add unique (index|key) ([\S]+) (.+)`, `create unique index ${3}_${1} on $1 $4`), - rmap(`(?i)([\S]+) enum[\s]*([(].*?[)])`, `$1 text check($1 in $2)`), - rmap(`(?i)([\s\S]+[/][*] sqlite3-skip [*][/][\s\S]+)`, ``), - rmap(`(?i)timestamp default current_timestamp`, `timestamp default ('')`), - rmap(`(?i)timestamp not null default current_timestamp`, `timestamp not null default ('')`), - - rmap(`(?i)add column (.*int) not null[\s]*$`, `add column $1 not null default 0`), - rmap(`(?i)add column (.* text) not null[\s]*$`, `add column $1 not null default ''`), - rmap(`(?i)add column (.* varchar.*) not null[\s]*$`, `add column $1 not null default ''`), -} - -var sqlite3InsertConversions = []regexpMap{ - rmap(`(?i)insert ignore ([\s\S]+) on duplicate key update [\s\S]+`, `insert or ignore $1`), - rmap(`(?i)insert ignore`, `insert or ignore`), - rmap(`(?i)now[(][)]`, `datetime('now')`), - rmap(`(?i)insert into ([\s\S]+) on duplicate key update [\s\S]+`, `replace into $1`), -} - -var sqlite3GeneralConversions = []regexpMap{ - rmap(`(?i)now[(][)][\s]*[-][\s]*interval [?] ([\w]+)`, `datetime('now', printf('-%d $1', ?))`), - rmap(`(?i)now[(][)][\s]*[+][\s]*interval [?] ([\w]+)`, `datetime('now', printf('+%d $1', ?))`), - rmap(`(?i)now[(][)][\s]*[-][\s]*interval ([0-9.]+) ([\w]+)`, `datetime('now', '-${1} $2')`), - rmap(`(?i)now[(][)][\s]*[+][\s]*interval ([0-9.]+) ([\w]+)`, `datetime('now', '+${1} $2')`), - - rmap(`(?i)[=<>\s]([\S]+[.][\S]+)[\s]*[-][\s]*interval [?] ([\w]+)`, ` datetime($1, printf('-%d $2', ?))`), - rmap(`(?i)[=<>\s]([\S]+[.][\S]+)[\s]*[+][\s]*interval [?] 
([\w]+)`, ` datetime($1, printf('+%d $2', ?))`), - - rmap(`(?i)unix_timestamp[(][)]`, `strftime('%s', 'now')`), - rmap(`(?i)unix_timestamp[(]([^)]+)[)]`, `strftime('%s', $1)`), - rmap(`(?i)now[(][)]`, `datetime('now')`), - rmap(`(?i)cast[(][\s]*([\S]+) as signed[\s]*[)]`, `cast($1 as integer)`), - - rmap(`(?i)\bconcat[(][\s]*([^,)]+)[\s]*,[\s]*([^,)]+)[\s]*[)]`, `($1 || $2)`), - rmap(`(?i)\bconcat[(][\s]*([^,)]+)[\s]*,[\s]*([^,)]+)[\s]*,[\s]*([^,)]+)[\s]*[)]`, `($1 || $2 || $3)`), - - rmap(`(?i) rlike `, ` like `), - - rmap(`(?i)create index([\s\S]+)[(][\s]*[0-9]+[\s]*[)]([\s\S]+)`, `create index ${1}${2}`), - rmap(`(?i)drop index ([\S]+) on ([\S]+)`, `drop index if exists $1`), -} - -var ( - sqlite3IdentifyCreateTableStatement = regexp.MustCompile(regexpSpaces(`(?i)^[\s]*create table`)) - sqlite3IdentifyCreateIndexStatement = regexp.MustCompile(regexpSpaces(`(?i)^[\s]*create( unique|) index`)) - sqlite3IdentifyDropIndexStatement = regexp.MustCompile(regexpSpaces(`(?i)^[\s]*drop index`)) - sqlite3IdentifyAlterTableStatement = regexp.MustCompile(regexpSpaces(`(?i)^[\s]*alter table`)) - sqlite3IdentifyInsertStatement = regexp.MustCompile(regexpSpaces(`(?i)^[\s]*(insert|replace)`)) -) - -func IsInsert(statement string) bool { - return sqlite3IdentifyInsertStatement.MatchString(statement) -} - -func IsCreateTable(statement string) bool { - return sqlite3IdentifyCreateTableStatement.MatchString(statement) -} - -func IsCreateIndex(statement string) bool { - return sqlite3IdentifyCreateIndexStatement.MatchString(statement) -} - -func IsDropIndex(statement string) bool { - return sqlite3IdentifyDropIndexStatement.MatchString(statement) -} - -func IsAlterTable(statement string) bool { - return sqlite3IdentifyAlterTableStatement.MatchString(statement) -} - -func ToSqlite3CreateTable(statement string) string { - return applyConversions(statement, sqlite3CreateTableConversions) -} - -func ToSqlite3Insert(statement string) string { - return applyConversions(statement, sqlite3InsertConversions) -} - -func ToSqlite3Dialect(statement string) (translated string) { - if IsCreateTable(statement) { - return ToSqlite3CreateTable(statement) - } - if IsAlterTable(statement) { - return ToSqlite3CreateTable(statement) - } - statement = applyConversions(statement, sqlite3GeneralConversions) - if IsInsert(statement) { - return ToSqlite3Insert(statement) - } - return statement -} diff --git a/go/vt/external/golib/sqlutils/sqlite_dialect_test.go b/go/vt/external/golib/sqlutils/sqlite_dialect_test.go deleted file mode 100644 index 1298c379adf..00000000000 --- a/go/vt/external/golib/sqlutils/sqlite_dialect_test.go +++ /dev/null @@ -1,314 +0,0 @@ -/* - Copyright 2017 GitHub Inc. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -/* - This file has been copied over from VTOrc package -*/ - -package sqlutils - -import ( - "regexp" - "strings" - "testing" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -var spacesRegexp = regexp.MustCompile(`[\s]+`) - -func init() { -} - -func stripSpaces(statement string) string { - statement = strings.TrimSpace(statement) - statement = spacesRegexp.ReplaceAllString(statement, " ") - return statement -} - -func TestIsCreateTable(t *testing.T) { - require.True(t, IsCreateTable("create table t(id int)")) - require.True(t, IsCreateTable(" create table t(id int)")) - require.True(t, IsCreateTable("CREATE TABLE t(id int)")) - require.True(t, IsCreateTable(` - create table t(id int) - `)) - require.False(t, IsCreateTable("where create table t(id int)")) - require.False(t, IsCreateTable("insert")) -} - -func TestToSqlite3CreateTable(t *testing.T) { - { - statement := "create table t(id int)" - result := ToSqlite3CreateTable(statement) - require.Equal(t, result, statement) - } - { - statement := "create table t(id int, v varchar(123) CHARACTER SET ascii NOT NULL default '')" - result := ToSqlite3CreateTable(statement) - require.Equal(t, result, "create table t(id int, v varchar(123) NOT NULL default '')") - } - { - statement := "create table t(id int, v varchar ( 123 ) CHARACTER SET ascii NOT NULL default '')" - result := ToSqlite3CreateTable(statement) - require.Equal(t, result, "create table t(id int, v varchar ( 123 ) NOT NULL default '')") - } - { - statement := "create table t(i smallint unsigned)" - result := ToSqlite3CreateTable(statement) - require.Equal(t, result, "create table t(i smallint)") - } - { - statement := "create table t(i smallint(5) unsigned)" - result := ToSqlite3CreateTable(statement) - require.Equal(t, result, "create table t(i smallint)") - } - { - statement := "create table t(i smallint ( 5 ) unsigned)" - result := ToSqlite3CreateTable(statement) - require.Equal(t, result, "create table t(i smallint)") - } -} - -func TestToSqlite3AlterTable(t *testing.T) { - { - statement := ` - ALTER TABLE - database_instance - ADD COLUMN sql_delay INT UNSIGNED NOT NULL AFTER replica_lag_seconds - ` - result := stripSpaces(ToSqlite3Dialect(statement)) - require.Equal(t, result, stripSpaces(` - ALTER TABLE - database_instance - add column sql_delay int not null default 0 - `)) - } - { - statement := ` - ALTER TABLE - database_instance - ADD INDEX source_host_port_idx (source_host, source_port) - ` - result := stripSpaces(ToSqlite3Dialect(statement)) - require.Equal(t, result, stripSpaces(` - create index - source_host_port_idx_database_instance - on database_instance (source_host, source_port) - `)) - } - { - statement := ` - ALTER TABLE - topology_recovery - ADD KEY last_detection_idx (last_detection_id) - ` - result := stripSpaces(ToSqlite3Dialect(statement)) - require.Equal(t, result, stripSpaces(` - create index - last_detection_idx_topology_recovery - on topology_recovery (last_detection_id) - `)) - } - -} - -func TestCreateIndex(t *testing.T) { - { - statement := ` - create index - source_host_port_idx_database_instance - on database_instance (source_host(128), source_port) - ` - result := stripSpaces(ToSqlite3Dialect(statement)) - require.Equal(t, result, stripSpaces(` - create index - source_host_port_idx_database_instance - on database_instance (source_host, source_port) - `)) - } -} - -func TestIsInsert(t *testing.T) { - require.True(t, IsInsert("insert into t")) - require.True(t, IsInsert("insert ignore into t")) - 
require.True(t, IsInsert(` - insert ignore into t - `)) - require.False(t, IsInsert("where create table t(id int)")) - require.False(t, IsInsert("create table t(id int)")) - require.True(t, IsInsert(` - insert into - cluster_domain_name (cluster_name, domain_name, last_registered) - values - (?, ?, datetime('now')) - on duplicate key update - domain_name=values(domain_name), - last_registered=values(last_registered) - `)) -} - -func TestToSqlite3Insert(t *testing.T) { - { - statement := ` - insert into - cluster_domain_name (cluster_name, domain_name, last_registered) - values - (?, ?, datetime('now')) - on duplicate key update - domain_name=values(domain_name), - last_registered=values(last_registered) - ` - result := stripSpaces(ToSqlite3Dialect(statement)) - require.Equal(t, result, stripSpaces(` - replace into - cluster_domain_name (cluster_name, domain_name, last_registered) - values - (?, ?, datetime('now')) - `)) - } -} - -func TestToSqlite3GeneralConversions(t *testing.T) { - { - statement := "select now()" - result := ToSqlite3Dialect(statement) - require.Equal(t, result, "select datetime('now')") - } - { - statement := "select now() - interval ? second" - result := ToSqlite3Dialect(statement) - require.Equal(t, result, "select datetime('now', printf('-%d second', ?))") - } - { - statement := "select now() + interval ? minute" - result := ToSqlite3Dialect(statement) - require.Equal(t, result, "select datetime('now', printf('+%d minute', ?))") - } - { - statement := "select now() + interval 5 minute" - result := ToSqlite3Dialect(statement) - require.Equal(t, result, "select datetime('now', '+5 minute')") - } - { - statement := "select some_table.some_column + interval ? minute" - result := ToSqlite3Dialect(statement) - require.Equal(t, result, "select datetime(some_table.some_column, printf('+%d minute', ?))") - } - { - statement := "AND primary_instance.last_attempted_check <= primary_instance.last_seen + interval ? 
minute" - result := ToSqlite3Dialect(statement) - require.Equal(t, result, "AND primary_instance.last_attempted_check <= datetime(primary_instance.last_seen, printf('+%d minute', ?))") - } - { - statement := "select concat(primary_instance.port, '') as port" - result := ToSqlite3Dialect(statement) - require.Equal(t, result, "select (primary_instance.port || '') as port") - } - { - statement := "select concat( 'abc' , 'def') as s" - result := ToSqlite3Dialect(statement) - require.Equal(t, result, "select ('abc' || 'def') as s") - } - { - statement := "select concat( 'abc' , 'def', last.col) as s" - result := ToSqlite3Dialect(statement) - require.Equal(t, result, "select ('abc' || 'def' || last.col) as s") - } - { - statement := "select concat(myself.only) as s" - result := ToSqlite3Dialect(statement) - require.Equal(t, result, "select concat(myself.only) as s") - } - { - statement := "select concat(1, '2', 3, '4') as s" - result := ToSqlite3Dialect(statement) - require.Equal(t, result, "select concat(1, '2', 3, '4') as s") - } - { - statement := "select group_concat( 'abc' , 'def') as s" - result := ToSqlite3Dialect(statement) - require.Equal(t, result, "select group_concat( 'abc' , 'def') as s") - } -} - -func TestIsCreateIndex(t *testing.T) { - tests := []struct { - input string - expected bool - }{ - {"create index my_index on my_table(column);", true}, - {"CREATE INDEX my_index ON my_table(column);", true}, - {"create unique index my_index on my_table(column);", true}, - {"CREATE UNIQUE INDEX my_index ON my_table(column);", true}, - {"create index my_index on my_table(column) where condition;", true}, - {"create unique index my_index on my_table(column) where condition;", true}, - {"create table my_table(column);", false}, - {"drop index my_index on my_table;", false}, - {"alter table my_table add index my_index (column);", false}, - {"", false}, - } - - for _, test := range tests { - t.Run(test.input, func(t *testing.T) { - result := IsCreateIndex(test.input) - assert.Equal(t, test.expected, result) - }) - } -} - -func TestIsDropIndex(t *testing.T) { - tests := []struct { - input string - expected bool - }{ - {"drop index my_index on my_table;", true}, - {"DROP INDEX my_index ON my_table;", true}, - {"drop index if exists my_index on my_table;", true}, - {"DROP INDEX IF EXISTS my_index ON my_table;", true}, - {"drop table my_table;", false}, - {"create index my_index on my_table(column);", false}, - {"alter table my_table add index my_index (column);", false}, - {"", false}, - } - - for _, test := range tests { - t.Run(test.input, func(t *testing.T) { - result := IsDropIndex(test.input) - assert.Equal(t, test.expected, result) - }) - } -} - -func TestToSqlite3Dialect(t *testing.T) { - tests := []struct { - input string - expected string - }{ - {"create table my_table(id int);", "create table my_table(id int);"}, - {"alter table my_table add column new_col int;", "alter table my_table add column new_col int;"}, - {"insert into my_table values (1);", "insert into my_table values (1);"}, - {"", ""}, - } - - for _, test := range tests { - t.Run(test.input, func(t *testing.T) { - result := ToSqlite3Dialect(test.input) - assert.Equal(t, test.expected, result) - }) - } -} diff --git a/go/vt/key/key.go b/go/vt/key/key.go index dcdcda47f81..82852daa16e 100644 --- a/go/vt/key/key.go +++ b/go/vt/key/key.go @@ -90,6 +90,19 @@ func Empty(id []byte) bool { // KeyRange helper methods // +// NewKeyRange returns a KeyRange with the given start and end. +func NewKeyRange(start []byte, end []byte) *topodatapb.KeyRange { + return 
&topodatapb.KeyRange{Start: start, End: end} +} + +// NewCompleteKeyRange returns a complete key range. +func NewCompleteKeyRange() *topodatapb.KeyRange { + return &topodatapb.KeyRange{ + Start: nil, + End: nil, + } +} + // KeyRangeAdd adds two adjacent KeyRange values (in any order) into a single value. If the values are not adjacent, // it returns false. func KeyRangeAdd(a, b *topodatapb.KeyRange) (*topodatapb.KeyRange, bool) { diff --git a/go/vt/log/log.go b/go/vt/log/log.go index 79be1da464c..fb0c90bbb1c 100644 --- a/go/vt/log/log.go +++ b/go/vt/log/log.go @@ -111,3 +111,94 @@ func (lrms *logRotateMaxSize) String() string { func (lrms *logRotateMaxSize) Type() string { return "uint64" } + +type PrefixedLogger struct { + prefix string +} + +func NewPrefixedLogger(prefix string) *PrefixedLogger { + return &PrefixedLogger{prefix: prefix + ": "} +} + +func (pl *PrefixedLogger) V(level glog.Level) glog.Verbose { + return V(level) +} + +func (pl *PrefixedLogger) Flush() { + Flush() +} + +func (pl *PrefixedLogger) Info(args ...any) { + args = append([]interface{}{pl.prefix}, args...) + Info(args...) +} + +func (pl *PrefixedLogger) Infof(format string, args ...any) { + args = append([]interface{}{pl.prefix}, args...) + Infof("%s"+format, args...) +} + +func (pl *PrefixedLogger) InfoDepth(depth int, args ...any) { + args = append([]interface{}{pl.prefix}, args...) + InfoDepth(depth, args...) +} + +func (pl *PrefixedLogger) Warning(args ...any) { + args = append([]interface{}{pl.prefix}, args...) + Warning(args...) +} + +func (pl *PrefixedLogger) Warningf(format string, args ...any) { + args = append([]interface{}{pl.prefix}, args...) + Warningf("%s"+format, args...) +} + +func (pl *PrefixedLogger) WarningDepth(depth int, args ...any) { + args = append([]interface{}{pl.prefix}, args...) + WarningDepth(depth, args...) +} + +func (pl *PrefixedLogger) Error(args ...any) { + args = append([]interface{}{pl.prefix}, args...) + Error(args...) +} + +func (pl *PrefixedLogger) Errorf(format string, args ...any) { + args = append([]interface{}{pl.prefix}, args...) + Errorf("%s"+format, args...) +} + +func (pl *PrefixedLogger) ErrorDepth(depth int, args ...any) { + args = append([]interface{}{pl.prefix}, args...) + ErrorDepth(depth, args...) +} + +func (pl *PrefixedLogger) Exit(args ...any) { + args = append([]interface{}{pl.prefix}, args...) + Exit(args...) +} + +func (pl *PrefixedLogger) Exitf(format string, args ...any) { + args = append([]interface{}{pl.prefix}, args...) + Exitf("%s"+format, args...) +} + +func (pl *PrefixedLogger) ExitDepth(depth int, args ...any) { + args = append([]interface{}{pl.prefix}, args...) + ExitDepth(depth, args...) +} + +func (pl *PrefixedLogger) Fatal(args ...any) { + args = append([]interface{}{pl.prefix}, args...) + Fatal(args...) +} + +func (pl *PrefixedLogger) Fatalf(format string, args ...any) { + args = append([]interface{}{pl.prefix}, args...) + Fatalf("%s"+format, args...) +} + +func (pl *PrefixedLogger) FatalDepth(depth int, args ...any) { + args = append([]interface{}{pl.prefix}, args...) + FatalDepth(depth, args...) 
+} diff --git a/go/vt/topo/faketopo/faketopo.go b/go/vt/topo/faketopo/faketopo.go index 0c88b95e3da..c49072c043f 100644 --- a/go/vt/topo/faketopo/faketopo.go +++ b/go/vt/topo/faketopo/faketopo.go @@ -45,17 +45,26 @@ func NewFakeTopoFactory() *FakeFactory { mu: sync.Mutex{}, cells: map[string][]*FakeConn{}, } - factory.cells[topo.GlobalCell] = []*FakeConn{newFakeConnection()} + factory.cells[topo.GlobalCell] = []*FakeConn{NewFakeConnection()} return factory } // AddCell is used to add a cell to the factory. It returns the fake connection created. This connection can then be used to set get and update errors func (f *FakeFactory) AddCell(cell string) *FakeConn { - conn := newFakeConnection() + f.mu.Lock() + defer f.mu.Unlock() + conn := NewFakeConnection() f.cells[cell] = []*FakeConn{conn} return conn } +// SetCell is used to set a cell in the factory. +func (f *FakeFactory) SetCell(cell string, fakeConn *FakeConn) { + f.mu.Lock() + defer f.mu.Unlock() + f.cells[cell] = []*FakeConn{fakeConn} +} + // HasGlobalReadOnlyCell implements the Factory interface func (f *FakeFactory) HasGlobalReadOnlyCell(serverAddr, root string) bool { return false @@ -70,7 +79,7 @@ func (f *FakeFactory) Create(cell, serverAddr, root string) (topo.Conn, error) { if !ok || len(connections) == 0 { return nil, topo.NewError(topo.NoNode, cell) } - // pick the first connection and remove it from the list + // pick the first connection and remove it from the list. conn := connections[0] f.cells[cell] = connections[1:] @@ -84,15 +93,19 @@ type FakeConn struct { cell string serverAddr string - // mutex to protect all the operations + // mutex to protect all the operations. mu sync.Mutex - // getResultMap is a map storing the results for each filepath + // getResultMap is a map storing the results for each filepath. getResultMap map[string]result - // updateErrors stores whether update function call should error or not + // listResultMap is a map storing the results for each filepath prefix. + listResultMap map[string][]topo.KVInfo + // updateErrors stores whether update function call should error or not. updateErrors []updateError - // getErrors stores whether the get function call should error or not + // getErrors stores whether the get function call should error or not. getErrors []bool + // listErrors stores whether the list function call should error or not. + listErrors []bool // watches is a map of all watches for this connection to the cell keyed by the filepath. 
watches map[string][]chan *topo.WatchData @@ -105,13 +118,15 @@ type updateError struct { writePersists bool } -// newFakeConnection creates a new fake connection -func newFakeConnection() *FakeConn { +// NewFakeConnection creates a new fake connection +func NewFakeConnection() *FakeConn { return &FakeConn{ - getResultMap: map[string]result{}, - watches: map[string][]chan *topo.WatchData{}, - getErrors: []bool{}, - updateErrors: []updateError{}, + getResultMap: map[string]result{}, + listResultMap: map[string][]topo.KVInfo{}, + watches: map[string][]chan *topo.WatchData{}, + getErrors: []bool{}, + listErrors: []bool{}, + updateErrors: []updateError{}, } } @@ -122,6 +137,20 @@ func (f *FakeConn) AddGetError(shouldErr bool) { f.getErrors = append(f.getErrors, shouldErr) } +// AddListError is used to add a list error to the fake connection +func (f *FakeConn) AddListError(shouldErr bool) { + f.mu.Lock() + defer f.mu.Unlock() + f.listErrors = append(f.listErrors, shouldErr) +} + +// AddListResult is used to add a list result to the fake connection +func (f *FakeConn) AddListResult(filePathPrefix string, result []topo.KVInfo) { + f.mu.Lock() + defer f.mu.Unlock() + f.listResultMap[filePathPrefix] = result +} + // AddUpdateError is used to add an update error to the fake connection func (f *FakeConn) AddUpdateError(shouldErr bool, writePersists bool) { f.mu.Lock() @@ -261,7 +290,20 @@ func (f *FakeConn) GetVersion(ctx context.Context, filePath string, version int6 // List is part of the topo.Conn interface. func (f *FakeConn) List(ctx context.Context, filePathPrefix string) ([]topo.KVInfo, error) { - return nil, topo.NewError(topo.NoImplementation, "List not supported in fake topo") + f.mu.Lock() + defer f.mu.Unlock() + if len(f.listErrors) > 0 { + shouldErr := f.listErrors[0] + f.listErrors = f.listErrors[1:] + if shouldErr { + return nil, topo.NewError(topo.Timeout, filePathPrefix) + } + } + kvInfos, isPresent := f.listResultMap[filePathPrefix] + if !isPresent { + return nil, topo.NewError(topo.NoNode, filePathPrefix) + } + return kvInfos, nil } // Delete implements the Conn interface diff --git a/go/vt/topo/keyspace.go b/go/vt/topo/keyspace.go index dced769ca78..4e1d19ec79a 100755 --- a/go/vt/topo/keyspace.go +++ b/go/vt/topo/keyspace.go @@ -22,43 +22,26 @@ import ( "sort" "sync" - "github.com/spf13/pflag" "golang.org/x/sync/errgroup" "vitess.io/vitess/go/constants/sidecar" + "vitess.io/vitess/go/event" "vitess.io/vitess/go/sqlescape" "vitess.io/vitess/go/vt/key" - "vitess.io/vitess/go/vt/servenv" - "vitess.io/vitess/go/vt/vterrors" - - "vitess.io/vitess/go/event" "vitess.io/vitess/go/vt/log" - "vitess.io/vitess/go/vt/topo/events" - topodatapb "vitess.io/vitess/go/vt/proto/topodata" vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc" + "vitess.io/vitess/go/vt/topo/events" + "vitess.io/vitess/go/vt/vterrors" ) // This file contains keyspace utility functions. -// Default concurrency to use in order to avoid overhwelming the topo server. -var DefaultConcurrency = 32 - // shardKeySuffix is the suffix of a shard key. 
// The full key looks like this: // /vitess/global/keyspaces/customer/shards/80-/Shard const shardKeySuffix = "Shard" -func registerFlags(fs *pflag.FlagSet) { - fs.IntVar(&DefaultConcurrency, "topo_read_concurrency", DefaultConcurrency, "Concurrency of topo reads.") -} - -func init() { - servenv.OnParseFor("vtcombo", registerFlags) - servenv.OnParseFor("vtctld", registerFlags) - servenv.OnParseFor("vtgate", registerFlags) -} - // KeyspaceInfo is a meta struct that contains metadata to give the // data more context and convenience. This is the main way we interact // with a keyspace. @@ -197,7 +180,7 @@ func (ts *Server) UpdateKeyspace(ctx context.Context, ki *KeyspaceInfo) error { type FindAllShardsInKeyspaceOptions struct { // Concurrency controls the maximum number of concurrent calls to GetShard. // If <= 0, Concurrency is set to 1. - Concurrency int + Concurrency int64 } // FindAllShardsInKeyspace reads and returns all the existing shards in a @@ -211,7 +194,7 @@ func (ts *Server) FindAllShardsInKeyspace(ctx context.Context, keyspace string, opt = &FindAllShardsInKeyspaceOptions{} } if opt.Concurrency <= 0 { - opt.Concurrency = DefaultReadConcurrency + opt.Concurrency = DefaultReadConcurrency } // Unescape the keyspace name as this can e.g. come from the VSchema where diff --git a/go/vt/topo/server.go b/go/vt/topo/server.go index a0dba42006d..319d64cb077 100644 --- a/go/vt/topo/server.go +++ b/go/vt/topo/server.go @@ -49,6 +49,7 @@ import ( "sync" "github.com/spf13/pflag" + "golang.org/x/sync/semaphore" "vitess.io/vitess/go/vt/log" "vitess.io/vitess/go/vt/proto/topodata" @@ -181,6 +182,9 @@ var ( FlagBinaries = []string{"vttablet", "vtctl", "vtctld", "vtcombo", "vtgate", "vtorc", "vtbackup"} + + // Default read concurrency to use in order to avoid overwhelming the topo server. + DefaultReadConcurrency int64 = 32 ) func init() { @@ -193,6 +197,7 @@ func registerTopoFlags(fs *pflag.FlagSet) { fs.StringVar(&topoImplementation, "topo_implementation", topoImplementation, "the topology implementation to use") fs.StringVar(&topoGlobalServerAddress, "topo_global_server_address", topoGlobalServerAddress, "the address of the global topology server") fs.StringVar(&topoGlobalRoot, "topo_global_root", topoGlobalRoot, "the path of the global topology data in the global topology server") + fs.Int64Var(&DefaultReadConcurrency, "topo_read_concurrency", DefaultReadConcurrency, "Maximum concurrency of topo reads per global or local cell.") } // RegisterFactory registers a Factory for an implementation for a Server. @@ -208,11 +213,12 @@ func RegisterFactory(name string, factory Factory) { // NewWithFactory creates a new Server based on the given Factory. // It also opens the global cell connection. 
func NewWithFactory(factory Factory, serverAddress, root string) (*Server, error) { + globalReadSem := semaphore.NewWeighted(DefaultReadConcurrency) conn, err := factory.Create(GlobalCell, serverAddress, root) if err != nil { return nil, err } - conn = NewStatsConn(GlobalCell, conn) + conn = NewStatsConn(GlobalCell, conn, globalReadSem) var connReadOnly Conn if factory.HasGlobalReadOnlyCell(serverAddress, root) { @@ -220,7 +226,7 @@ func NewWithFactory(factory Factory, serverAddress, root string) (*Server, error if err != nil { return nil, err } - connReadOnly = NewStatsConn(GlobalReadOnlyCell, connReadOnly) + connReadOnly = NewStatsConn(GlobalReadOnlyCell, connReadOnly, globalReadSem) } else { connReadOnly = conn } @@ -301,7 +307,8 @@ func (ts *Server) ConnForCell(ctx context.Context, cell string) (Conn, error) { conn, err := ts.factory.Create(cell, ci.ServerAddress, ci.Root) switch { case err == nil: - conn = NewStatsConn(cell, conn) + cellReadSem := semaphore.NewWeighted(DefaultReadConcurrency) + conn = NewStatsConn(cell, conn, cellReadSem) ts.cellConns[cell] = cellConn{ci, conn} return conn, nil case IsErrType(err, NoNode): diff --git a/go/vt/topo/shard.go b/go/vt/topo/shard.go index ba3a02a68ca..0c7e7b205ee 100644 --- a/go/vt/topo/shard.go +++ b/go/vt/topo/shard.go @@ -20,7 +20,6 @@ import ( "context" "encoding/hex" "errors" - "fmt" "path" "slices" "sort" @@ -28,20 +27,20 @@ import ( "sync" "time" - "vitess.io/vitess/go/constants/sidecar" - "vitess.io/vitess/go/protoutil" - "vitess.io/vitess/go/vt/proto/vtrpc" - "vitess.io/vitess/go/vt/vterrors" + "golang.org/x/sync/errgroup" + "vitess.io/vitess/go/constants/sidecar" "vitess.io/vitess/go/event" + "vitess.io/vitess/go/protoutil" "vitess.io/vitess/go/trace" "vitess.io/vitess/go/vt/concurrency" "vitess.io/vitess/go/vt/key" "vitess.io/vitess/go/vt/log" + topodatapb "vitess.io/vitess/go/vt/proto/topodata" + "vitess.io/vitess/go/vt/proto/vtrpc" "vitess.io/vitess/go/vt/topo/events" "vitess.io/vitess/go/vt/topo/topoproto" - - topodatapb "vitess.io/vitess/go/vt/proto/topodata" + "vitess.io/vitess/go/vt/vterrors" ) const ( @@ -553,7 +552,6 @@ func (ts *Server) FindAllTabletAliasesInShardByCell(ctx context.Context, keyspac span.Annotate("shard", shard) span.Annotate("num_cells", len(cells)) defer span.Finish() - ctx = trace.NewContext(ctx, span) var err error // The caller intents to all cells @@ -597,7 +595,7 @@ func (ts *Server) FindAllTabletAliasesInShardByCell(ctx context.Context, keyspac case IsErrType(err, NoNode): // There is no shard replication for this shard in this cell. NOOP default: - rec.RecordError(vterrors.Wrap(err, fmt.Sprintf("GetShardReplication(%v, %v, %v) failed.", cell, keyspace, shard))) + rec.RecordError(vterrors.Wrapf(err, "GetShardReplication(%v, %v, %v) failed.", cell, keyspace, shard)) return } }(cell) @@ -616,6 +614,67 @@ func (ts *Server) FindAllTabletAliasesInShardByCell(ctx context.Context, keyspac return result, err } +// GetTabletsByShard returns the tablets in the given shard using all cells. +// It can return ErrPartialResult if it couldn't read all the cells, or all +// the individual tablets, in which case the result is valid, but partial. +func (ts *Server) GetTabletsByShard(ctx context.Context, keyspace, shard string) ([]*TabletInfo, error) { + return ts.GetTabletsByShardCell(ctx, keyspace, shard, nil) +} + +// GetTabletsByShardCell returns the tablets in the given shard. 
It can return +// ErrPartialResult if it couldn't read all the cells, or all the individual +// tablets, in which case the result is valid, but partial. +func (ts *Server) GetTabletsByShardCell(ctx context.Context, keyspace, shard string, cells []string) ([]*TabletInfo, error) { + span, ctx := trace.NewSpan(ctx, "topo.GetTabletsByShardCell") + span.Annotate("keyspace", keyspace) + span.Annotate("shard", shard) + span.Annotate("num_cells", len(cells)) + defer span.Finish() + var err error + + if len(cells) == 0 { + cells, err = ts.GetCellInfoNames(ctx) + if err != nil { + return nil, err + } + if len(cells) == 0 { // Nothing to do + return nil, nil + } + } + + mu := sync.Mutex{} + eg, ctx := errgroup.WithContext(ctx) + + tablets := make([]*TabletInfo, 0, len(cells)) + var kss *KeyspaceShard + if keyspace != "" { + kss = &KeyspaceShard{ + Keyspace: keyspace, + Shard: shard, + } + } + options := &GetTabletsByCellOptions{ + KeyspaceShard: kss, + } + for _, cell := range cells { + eg.Go(func() error { + t, err := ts.GetTabletsByCell(ctx, cell, options) + if err != nil { + return vterrors.Wrapf(err, "GetTabletsByCell for %v failed.", cell) + } + mu.Lock() + defer mu.Unlock() + tablets = append(tablets, t...) + return nil + }) + } + if err := eg.Wait(); err != nil { + log.Warningf("GetTabletsByShardCell(%v,%v): got partial result: %v", keyspace, shard, err) + return tablets, NewError(PartialResult, shard) + } + return tablets, nil +} + // GetTabletMapForShard returns the tablets for a shard. It can return // ErrPartialResult if it couldn't read all the cells, or all // the individual tablets, in which case the map is valid, but partial. diff --git a/go/vt/topo/shard_test.go b/go/vt/topo/shard_test.go index 6bd4aae5b62..915bcd18e3c 100644 --- a/go/vt/topo/shard_test.go +++ b/go/vt/topo/shard_test.go @@ -323,6 +323,14 @@ func TestValidateShardName(t *testing.T) { }, valid: true, }, + { + name: "-", + expectedRange: &topodatapb.KeyRange{ + Start: []byte{}, + End: []byte{}, + }, + valid: true, + }, { name: "40-80", expectedRange: &topodatapb.KeyRange{ diff --git a/go/vt/topo/stats_conn.go b/go/vt/topo/stats_conn.go index 39bc8c9bc43..ded362b9139 100644 --- a/go/vt/topo/stats_conn.go +++ b/go/vt/topo/stats_conn.go @@ -20,6 +20,8 @@ import ( "context" "time" + "golang.org/x/sync/semaphore" + "vitess.io/vitess/go/stats" "vitess.io/vitess/go/vt/proto/vtrpc" "vitess.io/vitess/go/vt/vterrors" @@ -37,6 +39,11 @@ var ( "TopologyConnErrors", "TopologyConnErrors errors per operation", []string{"Operation", "Cell"}) + + topoStatsConnReadWaitTimings = stats.NewMultiTimings( + "TopologyConnReadWaits", + "TopologyConnReadWait timings", + []string{"Operation", "Cell"}) ) const readOnlyErrorStrFormat = "cannot perform %s on %s as the topology server connection is read-only" @@ -46,14 +53,16 @@ type StatsConn struct { cell string conn Conn readOnly bool + readSem *semaphore.Weighted } // NewStatsConn returns a StatsConn -func NewStatsConn(cell string, conn Conn) *StatsConn { +func NewStatsConn(cell string, conn Conn, readSem *semaphore.Weighted) *StatsConn { return &StatsConn{ cell: cell, conn: conn, readOnly: false, + readSem: readSem, } } @@ -61,6 +70,12 @@ func NewStatsConn(cell string, conn Conn) *StatsConn { func (st *StatsConn) ListDir(ctx context.Context, dirPath string, full bool) ([]DirEntry, error) { startTime := time.Now() statsKey := []string{"ListDir", st.cell} + if err := st.readSem.Acquire(ctx, 1); err != nil { + return nil, err + } + defer st.readSem.Release(1) + 
topoStatsConnReadWaitTimings.Record(statsKey, startTime) + startTime = time.Now() // reset defer topoStatsConnTimings.Record(statsKey, startTime) res, err := st.conn.ListDir(ctx, dirPath, full) if err != nil { @@ -106,6 +121,12 @@ func (st *StatsConn) Update(ctx context.Context, filePath string, contents []byt func (st *StatsConn) Get(ctx context.Context, filePath string) ([]byte, Version, error) { startTime := time.Now() statsKey := []string{"Get", st.cell} + if err := st.readSem.Acquire(ctx, 1); err != nil { + return nil, nil, err + } + defer st.readSem.Release(1) + topoStatsConnReadWaitTimings.Record(statsKey, startTime) + startTime = time.Now() // reset defer topoStatsConnTimings.Record(statsKey, startTime) bytes, version, err := st.conn.Get(ctx, filePath) if err != nil { @@ -119,6 +140,12 @@ func (st *StatsConn) Get(ctx context.Context, filePath string) ([]byte, Version, func (st *StatsConn) GetVersion(ctx context.Context, filePath string, version int64) ([]byte, error) { startTime := time.Now() statsKey := []string{"GetVersion", st.cell} + if err := st.readSem.Acquire(ctx, 1); err != nil { + return nil, err + } + defer st.readSem.Release(1) + topoStatsConnReadWaitTimings.Record(statsKey, startTime) + startTime = time.Now() // reset defer topoStatsConnTimings.Record(statsKey, startTime) bytes, err := st.conn.GetVersion(ctx, filePath, version) if err != nil { @@ -132,6 +159,12 @@ func (st *StatsConn) GetVersion(ctx context.Context, filePath string, version in func (st *StatsConn) List(ctx context.Context, filePathPrefix string) ([]KVInfo, error) { startTime := time.Now() statsKey := []string{"List", st.cell} + if err := st.readSem.Acquire(ctx, 1); err != nil { + return nil, err + } + defer st.readSem.Release(1) + topoStatsConnReadWaitTimings.Record(statsKey, startTime) + startTime = time.Now() // reset defer topoStatsConnTimings.Record(statsKey, startTime) bytes, err := st.conn.List(ctx, filePathPrefix) if err != nil { diff --git a/go/vt/topo/stats_conn_test.go b/go/vt/topo/stats_conn_test.go index 605487697cc..9e48ec58407 100644 --- a/go/vt/topo/stats_conn_test.go +++ b/go/vt/topo/stats_conn_test.go @@ -23,11 +23,24 @@ import ( "time" "github.com/stretchr/testify/require" + "golang.org/x/sync/semaphore" "vitess.io/vitess/go/vt/proto/vtrpc" "vitess.io/vitess/go/vt/vterrors" ) +// testStatsConnReadSem is a semaphore for unit tests. +// It intentionally has a concurrency limit of '1' to +// allow semaphore contention in tests. +var testStatsConnReadSem = semaphore.NewWeighted(1) + +// testStatsConnStatsReset resets StatsConn-based stats. +func testStatsConnStatsReset() { + topoStatsConnErrors.ResetAll() + topoStatsConnReadWaitTimings.Reset() + topoStatsConnTimings.Reset() +} + // The fakeConn is a wrapper for a Conn that emits stats for every operation type fakeConn struct { v Version @@ -181,218 +194,217 @@ func (st *fakeConn) IsReadOnly() bool { return st.readOnly } +// createTestReadSemaphoreContention simulates semaphore contention on the test read semaphore. 
+func createTestReadSemaphoreContention(ctx context.Context, duration time.Duration, semAcquiredChan chan struct{}) { + if err := testStatsConnReadSem.Acquire(ctx, 1); err != nil { + panic(err) + } + defer testStatsConnReadSem.Release(1) + semAcquiredChan <- struct{}{} + time.Sleep(duration) +} + // TestStatsConnTopoListDir emits stats on ListDir func TestStatsConnTopoListDir(t *testing.T) { + testStatsConnStatsReset() + defer testStatsConnStatsReset() + conn := &fakeConn{} - statsConn := NewStatsConn("global", conn) + statsConn := NewStatsConn("global", conn, testStatsConnReadSem) ctx := context.Background() + semAcquiredChan := make(chan struct{}) + go createTestReadSemaphoreContention(ctx, 100*time.Millisecond, semAcquiredChan) + <-semAcquiredChan statsConn.ListDir(ctx, "", true) - timingCounts := topoStatsConnTimings.Counts()["ListDir.global"] - if got, want := timingCounts, int64(1); got != want { - t.Errorf("stats were not properly recorded: got = %d, want = %d", got, want) - } + require.Equal(t, int64(1), topoStatsConnTimings.Counts()["ListDir.global"]) + require.NotZero(t, topoStatsConnTimings.Time()) + + require.Equal(t, int64(1), topoStatsConnReadWaitTimings.Counts()["ListDir.global"]) + require.NotZero(t, topoStatsConnReadWaitTimings.Time()) // error is zero before getting an error - errorCount := topoStatsConnErrors.Counts()["ListDir.global"] - if got, want := errorCount, int64(0); got != want { - t.Errorf("stats were not properly recorded: got = %d, want = %d", got, want) - } + require.Zero(t, topoStatsConnErrors.Counts()["ListDir.global"]) statsConn.ListDir(ctx, "error", true) // error stats gets emitted - errorCount = topoStatsConnErrors.Counts()["ListDir.global"] - if got, want := errorCount, int64(1); got != want { - t.Errorf("stats were not properly recorded: got = %d, want = %d", got, want) - } + require.Equal(t, int64(1), topoStatsConnErrors.Counts()["ListDir.global"]) } // TestStatsConnTopoCreate emits stats on Create func TestStatsConnTopoCreate(t *testing.T) { + testStatsConnStatsReset() + defer testStatsConnStatsReset() + conn := &fakeConn{} - statsConn := NewStatsConn("global", conn) + statsConn := NewStatsConn("global", conn, testStatsConnReadSem) ctx := context.Background() statsConn.Create(ctx, "", []byte{}) - timingCounts := topoStatsConnTimings.Counts()["Create.global"] - if got, want := timingCounts, int64(1); got != want { - t.Errorf("stats were not properly recorded: got = %d, want = %d", got, want) - } + require.Equal(t, int64(1), topoStatsConnTimings.Counts()["Create.global"]) + require.NotZero(t, topoStatsConnTimings.Time()) + require.Zero(t, topoStatsConnReadWaitTimings.Time()) // error is zero before getting an error - errorCount := topoStatsConnErrors.Counts()["Create.global"] - if got, want := errorCount, int64(0); got != want { - t.Errorf("stats were not properly recorded: got = %d, want = %d", got, want) - } + require.Zero(t, topoStatsConnErrors.Counts()["Create.global"]) statsConn.Create(ctx, "error", []byte{}) // error stats gets emitted - errorCount = topoStatsConnErrors.Counts()["Create.global"] - if got, want := errorCount, int64(1); got != want { - t.Errorf("stats were not properly recorded: got = %d, want = %d", got, want) - } + require.Equal(t, int64(1), topoStatsConnErrors.Counts()["Create.global"]) } // TestStatsConnTopoUpdate emits stats on Update func TestStatsConnTopoUpdate(t *testing.T) { + testStatsConnStatsReset() + defer testStatsConnStatsReset() + conn := &fakeConn{} - statsConn := NewStatsConn("global", conn) + statsConn := 
NewStatsConn("global", conn, testStatsConnReadSem) ctx := context.Background() statsConn.Update(ctx, "", []byte{}, conn.v) - timingCounts := topoStatsConnTimings.Counts()["Update.global"] - if got, want := timingCounts, int64(1); got != want { - t.Errorf("stats were not properly recorded: got = %d, want = %d", got, want) - } + require.Equal(t, int64(1), topoStatsConnTimings.Counts()["Update.global"]) + require.NotZero(t, topoStatsConnTimings.Time()) + require.Zero(t, topoStatsConnReadWaitTimings.Time()) // error is zero before getting an error - errorCount := topoStatsConnErrors.Counts()["Update.global"] - if got, want := errorCount, int64(0); got != want { - t.Errorf("stats were not properly recorded: got = %d, want = %d", got, want) - } + require.Zero(t, topoStatsConnErrors.Counts()["Update.global"]) statsConn.Update(ctx, "error", []byte{}, conn.v) // error stats gets emitted - errorCount = topoStatsConnErrors.Counts()["Update.global"] - if got, want := errorCount, int64(1); got != want { - t.Errorf("stats were not properly recorded: got = %d, want = %d", got, want) - } + require.Equal(t, int64(1), topoStatsConnErrors.Counts()["Update.global"]) } // TestStatsConnTopoGet emits stats on Get func TestStatsConnTopoGet(t *testing.T) { + testStatsConnStatsReset() + defer testStatsConnStatsReset() + conn := &fakeConn{} - statsConn := NewStatsConn("global", conn) + statsConn := NewStatsConn("global", conn, testStatsConnReadSem) ctx := context.Background() + semAcquiredChan := make(chan struct{}) + go createTestReadSemaphoreContention(ctx, time.Millisecond*100, semAcquiredChan) + <-semAcquiredChan statsConn.Get(ctx, "") - timingCounts := topoStatsConnTimings.Counts()["Get.global"] - if got, want := timingCounts, int64(1); got != want { - t.Errorf("stats were not properly recorded: got = %d, want = %d", got, want) - } + require.Equal(t, int64(1), topoStatsConnTimings.Counts()["Get.global"]) + require.NotZero(t, topoStatsConnTimings.Time()) + + require.Equal(t, int64(1), topoStatsConnReadWaitTimings.Counts()["Get.global"]) + require.NotZero(t, topoStatsConnReadWaitTimings.Time()) // error is zero before getting an error - errorCount := topoStatsConnErrors.Counts()["Get.global"] - if got, want := errorCount, int64(0); got != want { - t.Errorf("stats were not properly recorded: got = %d, want = %d", got, want) - } + require.Zero(t, topoStatsConnErrors.Counts()["Get.global"]) statsConn.Get(ctx, "error") // error stats gets emitted - errorCount = topoStatsConnErrors.Counts()["Get.global"] - if got, want := errorCount, int64(1); got != want { - t.Errorf("stats were not properly recorded: got = %d, want = %d", got, want) - } + require.Equal(t, int64(1), topoStatsConnErrors.Counts()["Get.global"]) } // TestStatsConnTopoDelete emits stats on Delete func TestStatsConnTopoDelete(t *testing.T) { + testStatsConnStatsReset() + defer testStatsConnStatsReset() + conn := &fakeConn{} - statsConn := NewStatsConn("global", conn) + statsConn := NewStatsConn("global", conn, testStatsConnReadSem) ctx := context.Background() statsConn.Delete(ctx, "", conn.v) - timingCounts := topoStatsConnTimings.Counts()["Delete.global"] - if got, want := timingCounts, int64(1); got != want { - t.Errorf("stats were not properly recorded: got = %d, want = %d", got, want) - } + require.Equal(t, int64(1), topoStatsConnTimings.Counts()["Delete.global"]) + require.NotZero(t, topoStatsConnTimings.Time()) + require.Zero(t, topoStatsConnReadWaitTimings.Time()) // error is zero before getting an error - errorCount := 
topoStatsConnErrors.Counts()["Delete.global"] - if got, want := errorCount, int64(0); got != want { - t.Errorf("stats were not properly recorded: got = %d, want = %d", got, want) - } + require.Zero(t, topoStatsConnErrors.Counts()["Delete.global"]) statsConn.Delete(ctx, "error", conn.v) // error stats gets emitted - errorCount = topoStatsConnErrors.Counts()["Delete.global"] - if got, want := errorCount, int64(1); got != want { - t.Errorf("stats were not properly recorded: got = %d, want = %d", got, want) - } + require.Equal(t, int64(1), topoStatsConnErrors.Counts()["Delete.global"]) } // TestStatsConnTopoLock emits stats on Lock func TestStatsConnTopoLock(t *testing.T) { + testStatsConnStatsReset() + defer testStatsConnStatsReset() + conn := &fakeConn{} - statsConn := NewStatsConn("global", conn) + statsConn := NewStatsConn("global", conn, testStatsConnReadSem) ctx := context.Background() statsConn.Lock(ctx, "", "") - timingCounts := topoStatsConnTimings.Counts()["Lock.global"] - require.Equal(t, timingCounts, int64(1)) + require.Equal(t, int64(1), topoStatsConnTimings.Counts()["Lock.global"]) + require.NotZero(t, topoStatsConnTimings.Time()) + require.Zero(t, topoStatsConnReadWaitTimings.Time()) statsConn.LockWithTTL(ctx, "", "", time.Second) - timingCounts = topoStatsConnTimings.Counts()["LockWithTTL.global"] - require.Equal(t, timingCounts, int64(1)) + require.Equal(t, int64(1), topoStatsConnTimings.Counts()["LockWithTTL.global"]) statsConn.LockName(ctx, "", "") - timingCounts = topoStatsConnTimings.Counts()["LockName.global"] - require.Equal(t, timingCounts, int64(1)) + require.Equal(t, int64(1), topoStatsConnTimings.Counts()["LockName.global"]) // Error is zero before getting an error. - errorCount := topoStatsConnErrors.Counts()["Lock.global"] - require.Equal(t, errorCount, int64(0)) + require.Zero(t, topoStatsConnErrors.Counts()["Lock.global"]) statsConn.Lock(ctx, "error", "") // Error stats gets emitted. 
- errorCount = topoStatsConnErrors.Counts()["Lock.global"] - require.Equal(t, errorCount, int64(1)) + require.Equal(t, int64(1), topoStatsConnErrors.Counts()["Lock.global"]) } // TestStatsConnTopoWatch emits stats on Watch func TestStatsConnTopoWatch(t *testing.T) { + testStatsConnStatsReset() + defer testStatsConnStatsReset() + conn := &fakeConn{} - statsConn := NewStatsConn("global", conn) + statsConn := NewStatsConn("global", conn, testStatsConnReadSem) ctx := context.Background() statsConn.Watch(ctx, "") - timingCounts := topoStatsConnTimings.Counts()["Watch.global"] - if got, want := timingCounts, int64(1); got != want { - t.Errorf("stats were not properly recorded: got = %d, want = %d", got, want) - } - + require.Equal(t, int64(1), topoStatsConnTimings.Counts()["Watch.global"]) + require.NotZero(t, topoStatsConnTimings.Time()) + require.Zero(t, topoStatsConnReadWaitTimings.Time()) } // TestStatsConnTopoNewLeaderParticipation emits stats on NewLeaderParticipation func TestStatsConnTopoNewLeaderParticipation(t *testing.T) { + testStatsConnStatsReset() + defer testStatsConnStatsReset() + conn := &fakeConn{} - statsConn := NewStatsConn("global", conn) + statsConn := NewStatsConn("global", conn, testStatsConnReadSem) _, _ = statsConn.NewLeaderParticipation("", "") - timingCounts := topoStatsConnTimings.Counts()["NewLeaderParticipation.global"] - if got, want := timingCounts, int64(1); got != want { - t.Errorf("stats were not properly recorded: got = %d, want = %d", got, want) - } + require.Equal(t, int64(1), topoStatsConnTimings.Counts()["NewLeaderParticipation.global"]) + require.NotZero(t, topoStatsConnTimings.Time()) + require.Zero(t, topoStatsConnReadWaitTimings.Time()) // error is zero before getting an error - errorCount := topoStatsConnErrors.Counts()["NewLeaderParticipation.global"] - if got, want := errorCount, int64(0); got != want { - t.Errorf("stats were not properly recorded: got = %d, want = %d", got, want) - } + require.Zero(t, topoStatsConnErrors.Counts()["NewLeaderParticipation.global"]) _, _ = statsConn.NewLeaderParticipation("error", "") // error stats gets emitted - errorCount = topoStatsConnErrors.Counts()["NewLeaderParticipation.global"] - if got, want := errorCount, int64(1); got != want { - t.Errorf("stats were not properly recorded: got = %d, want = %d", got, want) - } + require.Equal(t, int64(1), topoStatsConnErrors.Counts()["NewLeaderParticipation.global"]) } // TestStatsConnTopoClose emits stats on Close func TestStatsConnTopoClose(t *testing.T) { + testStatsConnStatsReset() + defer testStatsConnStatsReset() + conn := &fakeConn{} - statsConn := NewStatsConn("global", conn) + statsConn := NewStatsConn("global", conn, testStatsConnReadSem) statsConn.Close() - timingCounts := topoStatsConnTimings.Counts()["Close.global"] - if got, want := timingCounts, int64(1); got != want { - t.Errorf("stats were not properly recorded: got = %d, want = %d", got, want) - } + require.Equal(t, int64(1), topoStatsConnTimings.Counts()["Close.global"]) + require.NotZero(t, topoStatsConnTimings.Time()) + require.Zero(t, topoStatsConnReadWaitTimings.Time()) } diff --git a/go/vt/topo/tablet.go b/go/vt/topo/tablet.go index 482f782e775..f2c9ab81d67 100644 --- a/go/vt/topo/tablet.go +++ b/go/vt/topo/tablet.go @@ -24,21 +24,17 @@ import ( "sync" "time" - "golang.org/x/sync/semaphore" - - "vitess.io/vitess/go/protoutil" - "vitess.io/vitess/go/vt/key" - "vitess.io/vitess/go/event" "vitess.io/vitess/go/netutil" + "vitess.io/vitess/go/protoutil" "vitess.io/vitess/go/trace" + 
"vitess.io/vitess/go/vt/key" "vitess.io/vitess/go/vt/log" - "vitess.io/vitess/go/vt/proto/vtrpc" - "vitess.io/vitess/go/vt/vterrors" - topodatapb "vitess.io/vitess/go/vt/proto/topodata" + "vitess.io/vitess/go/vt/proto/vtrpc" "vitess.io/vitess/go/vt/topo/events" "vitess.io/vitess/go/vt/topo/topoproto" + "vitess.io/vitess/go/vt/vterrors" ) // IsTrivialTypeChange returns if this db type be trivially reassigned @@ -234,8 +230,9 @@ func (ts *Server) GetTabletAliasesByCell(ctx context.Context, cell string) ([]*t // GetTabletsByCellOptions controls the behavior of // Server.FindAllShardsInKeyspace. type GetTabletsByCellOptions struct { - // Concurrency controls the maximum number of concurrent calls to GetTablet. - Concurrency int + // KeyspaceShard is the optional keyspace/shard that tablets must match. + // An empty shard value will match all shards in the keyspace. + KeyspaceShard *KeyspaceShard } // GetTabletsByCell returns all the tablets in the cell. @@ -263,15 +260,27 @@ func (ts *Server) GetTabletsByCell(ctx context.Context, cellAlias string, opt *G return nil, err } - tablets := make([]*TabletInfo, len(listResults)) + var capHint int + if opt != nil && opt.KeyspaceShard == nil { + capHint = len(listResults) + } + + tablets := make([]*TabletInfo, 0, capHint) for n := range listResults { tablet := &topodatapb.Tablet{} if err := tablet.UnmarshalVT(listResults[n].Value); err != nil { return nil, err } - tablets[n] = &TabletInfo{Tablet: tablet, version: listResults[n].Version} + if opt != nil && opt.KeyspaceShard != nil && opt.KeyspaceShard.Keyspace != "" { + if opt.KeyspaceShard.Keyspace != tablet.Keyspace { + continue + } + if opt.KeyspaceShard.Shard != "" && opt.KeyspaceShard.Shard != tablet.Shard { + continue + } + } + tablets = append(tablets, &TabletInfo{Tablet: tablet, version: listResults[n].Version}) } - return tablets, nil } @@ -482,12 +491,6 @@ func (ts *Server) GetTabletMap(ctx context.Context, tabletAliases []*topodatapb. returnErr error ) - concurrency := DefaultConcurrency - if opt != nil && opt.Concurrency > 0 { - concurrency = opt.Concurrency - } - var sem = semaphore.NewWeighted(int64(concurrency)) - for _, tabletAlias := range tabletAliases { if tabletAlias == nil { return nil, vterrors.Errorf(vtrpc.Code_INVALID_ARGUMENT, "nil tablet alias in list") @@ -495,19 +498,7 @@ func (ts *Server) GetTabletMap(ctx context.Context, tabletAliases []*topodatapb. wg.Add(1) go func(tabletAlias *topodatapb.TabletAlias) { defer wg.Done() - if err := sem.Acquire(ctx, 1); err != nil { - // Only happens if context is cancelled. - mu.Lock() - defer mu.Unlock() - log.Warningf("%v: %v", tabletAlias, err) - // We only need to set this on the first error. - if returnErr == nil { - returnErr = NewError(PartialResult, tabletAlias.GetCell()) - } - return - } tabletInfo, err := ts.GetTablet(ctx, tabletAlias) - sem.Release(1) mu.Lock() defer mu.Unlock() if err != nil { diff --git a/go/vt/topo/tablet_test.go b/go/vt/topo/tablet_test.go index 3a0153a11b5..1c242e8778b 100644 --- a/go/vt/topo/tablet_test.go +++ b/go/vt/topo/tablet_test.go @@ -17,8 +17,10 @@ limitations under the License. package topo_test import ( + "cmp" "context" "errors" + "slices" "testing" "github.com/stretchr/testify/assert" @@ -34,43 +36,291 @@ import ( // GetTabletsByCell first tries to get all the tablets using List. // If the response is too large, we will get an error, and fall back to one tablet at a time. 
func TestServerGetTabletsByCell(t *testing.T) { + const cell = "zone1" + const keyspace = "keyspace" + const shard = "shard" + tests := []struct { - name string - tablets int - opt *topo.GetTabletsByCellOptions - listError error + name string + createShardTablets int + expectedTablets []*topodatapb.Tablet + opt *topo.GetTabletsByCellOptions + listError error + keyspaceShards []*topo.KeyspaceShard }{ { - name: "negative concurrency", - tablets: 1, + name: "negative concurrency", + keyspaceShards: []*topo.KeyspaceShard{ + {Keyspace: keyspace, Shard: shard}, + }, + createShardTablets: 1, + expectedTablets: []*topodatapb.Tablet{ + { + Alias: &topodatapb.TabletAlias{ + Cell: cell, + Uid: uint32(1), + }, + Hostname: "host1", + PortMap: map[string]int32{ + "vt": int32(1), + }, + Keyspace: keyspace, + Shard: shard, + }, + }, // Ensure this doesn't panic. - opt: &topo.GetTabletsByCellOptions{Concurrency: -1}, }, { - name: "single", - tablets: 1, + name: "single", + keyspaceShards: []*topo.KeyspaceShard{ + {Keyspace: keyspace, Shard: shard}, + }, + createShardTablets: 1, + expectedTablets: []*topodatapb.Tablet{ + { + Alias: &topodatapb.TabletAlias{ + Cell: cell, + Uid: uint32(1), + }, + Hostname: "host1", + PortMap: map[string]int32{ + "vt": int32(1), + }, + Keyspace: keyspace, + Shard: shard, + }, + }, // Make sure the defaults apply as expected. opt: nil, }, { name: "multiple", - // should work with more than 1 tablet - tablets: 32, - opt: &topo.GetTabletsByCellOptions{Concurrency: 8}, + keyspaceShards: []*topo.KeyspaceShard{ + {Keyspace: keyspace, Shard: shard}, + }, + // Should work with more than 1 tablet + createShardTablets: 4, + expectedTablets: []*topodatapb.Tablet{ + { + Alias: &topodatapb.TabletAlias{ + Cell: cell, + Uid: uint32(1), + }, + Hostname: "host1", + PortMap: map[string]int32{ + "vt": int32(1), + }, + Keyspace: keyspace, + Shard: shard, + }, + { + Alias: &topodatapb.TabletAlias{ + Cell: cell, + Uid: uint32(2), + }, + Hostname: "host1", + PortMap: map[string]int32{ + "vt": int32(2), + }, + Keyspace: keyspace, + Shard: shard, + }, + { + Alias: &topodatapb.TabletAlias{ + Cell: cell, + Uid: uint32(3), + }, + Hostname: "host1", + PortMap: map[string]int32{ + "vt": int32(3), + }, + Keyspace: keyspace, + Shard: shard, + }, + { + Alias: &topodatapb.TabletAlias{ + Cell: cell, + Uid: uint32(4), + }, + Hostname: "host1", + PortMap: map[string]int32{ + "vt": int32(4), + }, + Keyspace: keyspace, + Shard: shard, + }, + }, }, { name: "multiple with list error", - // should work with more than 1 tablet when List returns an error - tablets: 32, - opt: &topo.GetTabletsByCellOptions{Concurrency: 8}, + keyspaceShards: []*topo.KeyspaceShard{ + {Keyspace: keyspace, Shard: shard}, + }, + // Should work with more than 1 tablet when List returns an error + createShardTablets: 4, + expectedTablets: []*topodatapb.Tablet{ + { + Alias: &topodatapb.TabletAlias{ + Cell: cell, + Uid: uint32(1), + }, + Hostname: "host1", + PortMap: map[string]int32{ + "vt": int32(1), + }, + Keyspace: keyspace, + Shard: shard, + }, + { + Alias: &topodatapb.TabletAlias{ + Cell: cell, + Uid: uint32(2), + }, + Hostname: "host1", + PortMap: map[string]int32{ + "vt": int32(2), + }, + Keyspace: keyspace, + Shard: shard, + }, + { + Alias: &topodatapb.TabletAlias{ + Cell: cell, + Uid: uint32(3), + }, + Hostname: "host1", + PortMap: map[string]int32{ + "vt": int32(3), + }, + Keyspace: keyspace, + Shard: shard, + }, + { + Alias: &topodatapb.TabletAlias{ + Cell: cell, + Uid: uint32(4), + }, + Hostname: "host1", + PortMap: map[string]int32{ 
+ "vt": int32(4), + }, + Keyspace: keyspace, + Shard: shard, + }, + }, listError: topo.NewError(topo.ResourceExhausted, ""), }, + { + name: "filtered by keyspace and shard", + keyspaceShards: []*topo.KeyspaceShard{ + {Keyspace: keyspace, Shard: shard}, + {Keyspace: "filtered", Shard: "-"}, + }, + // Should create 2 tablets in 2 different shards (4 total) + // but only a single shard is returned + createShardTablets: 2, + expectedTablets: []*topodatapb.Tablet{ + { + Alias: &topodatapb.TabletAlias{ + Cell: cell, + Uid: uint32(1), + }, + Hostname: "host1", + PortMap: map[string]int32{ + "vt": int32(1), + }, + Keyspace: keyspace, + Shard: shard, + }, + { + Alias: &topodatapb.TabletAlias{ + Cell: cell, + Uid: uint32(2), + }, + Hostname: "host1", + PortMap: map[string]int32{ + "vt": int32(2), + }, + Keyspace: keyspace, + Shard: shard, + }, + }, + opt: &topo.GetTabletsByCellOptions{ + KeyspaceShard: &topo.KeyspaceShard{ + Keyspace: keyspace, + Shard: shard, + }, + }, + }, + { + name: "filtered by keyspace and no shard", + keyspaceShards: []*topo.KeyspaceShard{ + {Keyspace: keyspace, Shard: shard}, + {Keyspace: keyspace, Shard: shard + "2"}, + }, + // Should create 2 tablets in 2 different shards (4 total) + // in the same keyspace and both shards are returned due to + // empty shard + createShardTablets: 2, + expectedTablets: []*topodatapb.Tablet{ + { + Alias: &topodatapb.TabletAlias{ + Cell: cell, + Uid: uint32(1), + }, + Hostname: "host1", + PortMap: map[string]int32{ + "vt": int32(1), + }, + Keyspace: keyspace, + Shard: shard, + }, + { + Alias: &topodatapb.TabletAlias{ + Cell: cell, + Uid: uint32(2), + }, + Hostname: "host1", + PortMap: map[string]int32{ + "vt": int32(2), + }, + Keyspace: keyspace, + Shard: shard, + }, + { + Alias: &topodatapb.TabletAlias{ + Cell: cell, + Uid: uint32(3), + }, + Hostname: "host1", + PortMap: map[string]int32{ + "vt": int32(3), + }, + Keyspace: keyspace, + Shard: shard + "2", + }, + { + Alias: &topodatapb.TabletAlias{ + Cell: cell, + Uid: uint32(4), + }, + Hostname: "host1", + PortMap: map[string]int32{ + "vt": int32(4), + }, + Keyspace: keyspace, + Shard: shard + "2", + }, + }, + opt: &topo.GetTabletsByCellOptions{ + KeyspaceShard: &topo.KeyspaceShard{ + Keyspace: keyspace, + Shard: "", + }, + }, + }, } - const cell = "zone1" - const keyspace = "keyspace" - const shard = "shard" - for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) @@ -84,37 +334,53 @@ func TestServerGetTabletsByCell(t *testing.T) { // Create an ephemeral keyspace and generate shard records within // the keyspace to fetch later. 
- require.NoError(t, ts.CreateKeyspace(ctx, keyspace, &topodatapb.Keyspace{})) - require.NoError(t, ts.CreateShard(ctx, keyspace, shard)) - - tablets := make([]*topo.TabletInfo, tt.tablets) + createdKeyspaces := make(map[string]bool, len(tt.keyspaceShards)) + for _, kss := range tt.keyspaceShards { + if !createdKeyspaces[kss.Keyspace] { + require.NoError(t, ts.CreateKeyspace(ctx, kss.Keyspace, &topodatapb.Keyspace{})) + createdKeyspaces[kss.Keyspace] = true + } + require.NoError(t, ts.CreateShard(ctx, kss.Keyspace, kss.Shard)) + } - for i := 0; i < tt.tablets; i++ { - tablet := &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{ - Cell: cell, - Uid: uint32(i), - }, - Hostname: "host1", - PortMap: map[string]int32{ - "vt": int32(i), - }, - Keyspace: keyspace, - Shard: shard, + var uid uint32 = 1 + for _, kss := range tt.keyspaceShards { + for i := 0; i < tt.createShardTablets; i++ { + tablet := &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: cell, + Uid: uid, + }, + Hostname: "host1", + PortMap: map[string]int32{ + "vt": int32(uid), + }, + Keyspace: kss.Keyspace, + Shard: kss.Shard, + } + require.NoError(t, ts.CreateTablet(ctx, tablet)) + uid++ } - tInfo := &topo.TabletInfo{Tablet: tablet} - tablets[i] = tInfo - require.NoError(t, ts.CreateTablet(ctx, tablet)) } // Verify that we return a complete list of tablets and that each // tablet matches what we expect. out, err := ts.GetTabletsByCell(ctx, cell, tt.opt) require.NoError(t, err) - require.Len(t, out, tt.tablets) + require.Len(t, out, len(tt.expectedTablets)) + + slices.SortFunc(out, func(i, j *topo.TabletInfo) int { + return cmp.Compare(i.Alias.Uid, j.Alias.Uid) + }) + slices.SortFunc(tt.expectedTablets, func(i, j *topodatapb.Tablet) int { + return cmp.Compare(i.Alias.Uid, j.Alias.Uid) + }) - for i, tab := range tablets { - require.Equal(t, tab.Tablet, tablets[i].Tablet) + for i, tablet := range out { + expected := tt.expectedTablets[i] + require.Equal(t, expected.Alias.String(), tablet.Alias.String()) + require.Equal(t, expected.Keyspace, tablet.Keyspace) + require.Equal(t, expected.Shard, tablet.Shard) } }) } diff --git a/go/vt/vtorc/db/db.go b/go/vt/vtorc/db/db.go index 00f5b5b2550..64143477645 100644 --- a/go/vt/vtorc/db/db.go +++ b/go/vt/vtorc/db/db.go @@ -18,7 +18,6 @@ package db import ( "database/sql" - "strings" "vitess.io/vitess/go/vt/external/golib/sqlutils" "vitess.io/vitess/go/vt/log" @@ -57,17 +56,13 @@ func OpenVTOrc() (db *sql.DB, err error) { return db, err } -func translateStatement(statement string) string { - return sqlutils.ToSqlite3Dialect(statement) -} - // registerVTOrcDeployment updates the vtorc_db_deployments table upon successful deployment func registerVTOrcDeployment(db *sql.DB) error { query := ` replace into vtorc_db_deployments ( deployed_version, deployed_timestamp ) values ( - ?, NOW() + ?, datetime('now') ) ` if _, err := execInternal(db, query, ""); err != nil { @@ -82,25 +77,12 @@ func deployStatements(db *sql.DB, queries []string) error { tx, err := db.Begin() if err != nil { log.Fatal(err.Error()) + return err } for _, query := range queries { - query = translateStatement(query) if _, err := tx.Exec(query); err != nil { - if strings.Contains(err.Error(), "syntax error") { - log.Fatalf("Cannot initiate vtorc: %+v; query=%+v", err, query) - return err - } - if !sqlutils.IsAlterTable(query) && !sqlutils.IsCreateIndex(query) && !sqlutils.IsDropIndex(query) { - log.Fatalf("Cannot initiate vtorc: %+v; query=%+v", err, query) - return err - } - if !strings.Contains(err.Error(), 
"duplicate column name") && - !strings.Contains(err.Error(), "Duplicate column name") && - !strings.Contains(err.Error(), "check that column/key exists") && - !strings.Contains(err.Error(), "already exists") && - !strings.Contains(err.Error(), "Duplicate key name") { - log.Errorf("Error initiating vtorc: %+v; query=%+v", err, query) - } + log.Fatalf("Cannot initiate vtorc: %+v; query=%+v", err, query) + return err } } if err := tx.Commit(); err != nil { @@ -135,7 +117,6 @@ func initVTOrcDB(db *sql.DB) error { // execInternal func execInternal(db *sql.DB, query string, args ...any) (sql.Result, error) { var err error - query = translateStatement(query) res, err := sqlutils.ExecNoPrepare(db, query, args...) return res, err } @@ -151,7 +132,6 @@ func ExecVTOrc(query string, args ...any) (sql.Result, error) { // QueryVTOrcRowsMap func QueryVTOrcRowsMap(query string, onRow func(sqlutils.RowMap) error) error { - query = translateStatement(query) db, err := OpenVTOrc() if err != nil { return err @@ -162,7 +142,6 @@ func QueryVTOrcRowsMap(query string, onRow func(sqlutils.RowMap) error) error { // QueryVTOrc func QueryVTOrc(query string, argsArray []any, onRow func(sqlutils.RowMap) error) error { - query = translateStatement(query) db, err := OpenVTOrc() if err != nil { return err diff --git a/go/vt/vtorc/inst/analysis_dao.go b/go/vt/vtorc/inst/analysis_dao.go index 93d3bc6a3a6..456d62a6258 100644 --- a/go/vt/vtorc/inst/analysis_dao.go +++ b/go/vt/vtorc/inst/analysis_dao.go @@ -91,13 +91,13 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna IFNULL( primary_instance.binary_log_file = database_instance_stale_binlog_coordinates.binary_log_file AND primary_instance.binary_log_pos = database_instance_stale_binlog_coordinates.binary_log_pos - AND database_instance_stale_binlog_coordinates.first_seen < NOW() - interval ? second, + AND database_instance_stale_binlog_coordinates.first_seen < datetime('now', printf('-%d second', ?)), 0 ) ) AS is_stale_binlog_coordinates, MIN( primary_instance.last_checked <= primary_instance.last_seen - and primary_instance.last_attempted_check <= primary_instance.last_seen + interval ? second + and primary_instance.last_attempted_check <= datetime(primary_instance.last_seen, printf('+%d second', ?)) ) = 1 AS is_last_check_valid, /* To be considered a primary, traditional async replication must not be present/valid AND the host should either */ /* not be a replication group member OR be the primary of the replication group */ @@ -658,7 +658,7 @@ func auditInstanceAnalysisInChangelog(tabletAlias string, analysisCode AnalysisC sqlResult, err := db.ExecVTOrc(` update database_instance_last_analysis set analysis = ?, - analysis_timestamp = now() + analysis_timestamp = datetime('now') where alias = ? and analysis != ? @@ -683,10 +683,10 @@ func auditInstanceAnalysisInChangelog(tabletAlias string, analysisCode AnalysisC if !lastAnalysisChanged { // The insert only returns more than 1 row changed if this is the first insertion. sqlResult, err := db.ExecVTOrc(` - insert ignore into database_instance_last_analysis ( + insert or ignore into database_instance_last_analysis ( alias, analysis_timestamp, analysis ) values ( - ?, now(), ? + ?, datetime('now'), ? ) `, tabletAlias, string(analysisCode), @@ -712,7 +712,7 @@ func auditInstanceAnalysisInChangelog(tabletAlias string, analysisCode AnalysisC insert into database_instance_analysis_changelog ( alias, analysis_timestamp, analysis ) values ( - ?, now(), ? + ?, datetime('now'), ? 
) `, tabletAlias, string(analysisCode), @@ -731,7 +731,7 @@ func ExpireInstanceAnalysisChangelog() error { delete from database_instance_analysis_changelog where - analysis_timestamp < now() - interval ? hour + analysis_timestamp < datetime('now', printf('-%d hour', ?)) `, config.UnseenInstanceForgetHours, ) diff --git a/go/vt/vtorc/inst/audit_dao.go b/go/vt/vtorc/inst/audit_dao.go index d048f300faf..642fb187509 100644 --- a/go/vt/vtorc/inst/audit_dao.go +++ b/go/vt/vtorc/inst/audit_dao.go @@ -60,7 +60,7 @@ func AuditOperation(auditType string, tabletAlias string, message string) error into audit ( audit_timestamp, audit_type, alias, keyspace, shard, message ) VALUES ( - NOW(), ?, ?, ?, ?, ? + datetime('now'), ?, ?, ?, ?, ? ) `, auditType, diff --git a/go/vt/vtorc/inst/instance_dao.go b/go/vt/vtorc/inst/instance_dao.go index 46be8689c0a..1ab13fb7cfa 100644 --- a/go/vt/vtorc/inst/instance_dao.go +++ b/go/vt/vtorc/inst/instance_dao.go @@ -115,7 +115,7 @@ func ExecDBWriteFunc(f func() error) error { func ExpireTableData(tableName string, timestampColumn string) error { writeFunc := func() error { _, err := db.ExecVTOrc( - fmt.Sprintf("delete from %s where %s < NOW() - INTERVAL ? DAY", tableName, timestampColumn), + fmt.Sprintf("delete from %s where %s < datetime('now', printf('-%%d DAY', ?))", tableName, timestampColumn), config.Config.AuditPurgeDays, ) return err @@ -583,9 +583,9 @@ func readInstancesByCondition(condition string, args []any, sort string) ([](*In query := fmt.Sprintf(` select *, - unix_timestamp() - unix_timestamp(last_checked) as seconds_since_last_checked, + strftime('%%s', 'now') - strftime('%%s', last_checked) as seconds_since_last_checked, ifnull(last_checked <= last_seen, 0) as is_last_check_valid, - unix_timestamp() - unix_timestamp(last_seen) as seconds_since_last_seen + strftime('%%s', 'now') - strftime('%%s', last_seen) as seconds_since_last_seen from vitess_tablet left join database_instance using (alias, hostname, port) @@ -637,11 +637,11 @@ func ReadProblemInstances(keyspace string, shard string) ([](*Instance), error) and shard LIKE (CASE WHEN ? = '' THEN '%' ELSE ? END) and ( (last_seen < last_checked) - or (unix_timestamp() - unix_timestamp(last_checked) > ?) + or (strftime('%%s', 'now') - strftime('%%s', last_checked) > ?) or (replication_sql_thread_state not in (-1 ,1)) or (replication_io_thread_state not in (-1 ,1)) - or (abs(cast(replication_lag_seconds as signed) - cast(sql_delay as signed)) > ?) - or (abs(cast(replica_lag_seconds as signed) - cast(sql_delay as signed)) > ?) + or (abs(cast(replication_lag_seconds as integer) - cast(sql_delay as integer)) > ?) + or (abs(cast(replica_lag_seconds as integer) - cast(sql_delay as integer)) > ?) or (gtid_errant != '') ) ` @@ -703,8 +703,8 @@ func ReadOutdatedInstanceKeys() ([]string, error) { WHERE CASE WHEN last_attempted_check <= last_checked - THEN last_checked < now() - interval ? second - ELSE last_checked < now() - interval ? 
second + THEN last_checked < datetime('now', printf('-%d second', ?)) + ELSE last_checked < datetime('now', printf('-%d second', ?)) END UNION SELECT @@ -733,7 +733,7 @@ func ReadOutdatedInstanceKeys() ([]string, error) { return res, err } -func mkInsertOdku(table string, columns []string, values []string, nrRows int, insertIgnore bool) (string, error) { +func mkInsert(table string, columns []string, values []string, nrRows int, insertIgnore bool) (string, error) { if len(columns) == 0 { return "", errors.New("Column list cannot be empty") } @@ -745,9 +745,9 @@ func mkInsertOdku(table string, columns []string, values []string, nrRows int, i } var q strings.Builder - var ignore string + insertStr := "REPLACE INTO" if insertIgnore { - ignore = "ignore" + insertStr = "INSERT OR IGNORE INTO" } valRow := fmt.Sprintf("(%s)", strings.Join(values, ", ")) var val strings.Builder @@ -758,26 +758,17 @@ func mkInsertOdku(table string, columns []string, values []string, nrRows int, i } col := strings.Join(columns, ", ") - var odku strings.Builder - odku.WriteString(fmt.Sprintf("%s=VALUES(%s)", columns[0], columns[0])) - for _, c := range columns[1:] { - odku.WriteString(", ") - odku.WriteString(fmt.Sprintf("%s=VALUES(%s)", c, c)) - } - - q.WriteString(fmt.Sprintf(`INSERT %s INTO %s + q.WriteString(fmt.Sprintf(`%s %s (%s) VALUES %s - ON DUPLICATE KEY UPDATE - %s `, - ignore, table, col, val.String(), odku.String())) + insertStr, table, col, val.String())) return q.String(), nil } -func mkInsertOdkuForInstances(instances []*Instance, instanceWasActuallyFound bool, updateLastSeen bool) (string, []any, error) { +func mkInsertForInstances(instances []*Instance, instanceWasActuallyFound bool, updateLastSeen bool) (string, []any, error) { if len(instances) == 0 { return "", nil, nil } @@ -858,13 +849,13 @@ func mkInsertOdkuForInstances(instances []*Instance, instanceWasActuallyFound bo for i := range columns { values[i] = "?" } - values[3] = "NOW()" // last_checked - values[4] = "NOW()" // last_attempted_check - values[5] = "1" // last_check_partial_success + values[3] = "datetime('now')" // last_checked + values[4] = "datetime('now')" // last_attempted_check + values[5] = "1" // last_check_partial_success if updateLastSeen { columns = append(columns, "last_seen") - values = append(values, "NOW()") + values = append(values, "datetime('now')") } var args []any @@ -935,7 +926,7 @@ func mkInsertOdkuForInstances(instances []*Instance, instanceWasActuallyFound bo args = append(args, instance.LastDiscoveryLatency.Nanoseconds()) } - sql, err := mkInsertOdku("database_instance", columns, values, len(instances), insertIgnore) + sql, err := mkInsert("database_instance", columns, values, len(instances), insertIgnore) if err != nil { errMsg := fmt.Sprintf("Failed to build query: %v", err) log.Errorf(errMsg) @@ -957,7 +948,7 @@ func writeManyInstances(instances []*Instance, instanceWasActuallyFound bool, up if len(writeInstances) == 0 { return nil // nothing to write } - sql, args, err := mkInsertOdkuForInstances(writeInstances, instanceWasActuallyFound, updateLastSeen) + sql, args, err := mkInsertForInstances(writeInstances, instanceWasActuallyFound, updateLastSeen) if err != nil { return err } @@ -984,7 +975,7 @@ func UpdateInstanceLastChecked(tabletAlias string, partialSuccess bool) error { update database_instance set - last_checked = NOW(), + last_checked = datetime('now'), last_check_partial_success = ? 
where alias = ?`, @@ -1013,7 +1004,7 @@ func UpdateInstanceLastAttemptedCheck(tabletAlias string) error { update database_instance set - last_attempted_check = NOW() + last_attempted_check = datetime('now') where alias = ?`, tabletAlias, @@ -1091,7 +1082,7 @@ func ForgetLongUnseenInstances() error { delete from database_instance where - last_seen < NOW() - interval ? hour`, + last_seen < datetime('now', printf('-%d hour', ?))`, config.UnseenInstanceForgetHours, ) if err != nil { @@ -1113,11 +1104,11 @@ func ForgetLongUnseenInstances() error { func SnapshotTopologies() error { writeFunc := func() error { _, err := db.ExecVTOrc(` - insert ignore into + insert or ignore into database_instance_topology_history (snapshot_unix_timestamp, alias, hostname, port, source_host, source_port, keyspace, shard, version) select - UNIX_TIMESTAMP(NOW()), + strftime('%s', 'now'), vitess_tablet.alias, vitess_tablet.hostname, vitess_tablet.port, database_instance.source_host, database_instance.source_port, vitess_tablet.keyspace, vitess_tablet.shard, database_instance.version @@ -1143,7 +1134,7 @@ func ExpireStaleInstanceBinlogCoordinates() error { writeFunc := func() error { _, err := db.ExecVTOrc(` delete from database_instance_stale_binlog_coordinates - where first_seen < NOW() - INTERVAL ? SECOND + where first_seen < datetime('now', printf('-%d second', ?)) `, expireSeconds, ) if err != nil { diff --git a/go/vt/vtorc/inst/instance_dao_test.go b/go/vt/vtorc/inst/instance_dao_test.go index 741fc48bca9..2416c1abb90 100644 --- a/go/vt/vtorc/inst/instance_dao_test.go +++ b/go/vt/vtorc/inst/instance_dao_test.go @@ -49,57 +49,48 @@ func mkTestInstances() []*Instance { return instances } -func TestMkInsertOdkuSingle(t *testing.T) { +func TestMkInsertSingle(t *testing.T) { instances := mkTestInstances() - sql, args, err := mkInsertOdkuForInstances(nil, true, true) + sql, args, err := mkInsertForInstances(nil, true, true) require.NoError(t, err) require.Equal(t, sql, "") require.Equal(t, len(args), 0) // one instance - s1 := `INSERT ignore INTO database_instance + s1 := `INSERT OR IGNORE INTO database_instance (alias, hostname, port, last_checked, last_attempted_check, last_check_partial_success, server_id, server_uuid, version, major_version, version_comment, binlog_server, read_only, binlog_format, binlog_row_image, log_bin, log_replica_updates, binary_log_file, binary_log_pos, source_host, source_port, replica_net_timeout, heartbeat_interval, replica_sql_running, replica_io_running, replication_sql_thread_state, replication_io_thread_state, has_replication_filters, supports_oracle_gtid, oracle_gtid, source_uuid, ancestry_uuid, executed_gtid_set, gtid_mode, gtid_purged, gtid_errant, mariadb_gtid, pseudo_gtid, source_log_file, read_source_log_pos, relay_source_log_file, exec_source_log_pos, relay_log_file, relay_log_pos, last_sql_error, last_io_error, replication_lag_seconds, replica_lag_seconds, sql_delay, data_center, region, physical_environment, replication_depth, is_co_primary, has_replication_credentials, allow_tls, semi_sync_enforced, semi_sync_primary_enabled, semi_sync_primary_timeout, semi_sync_primary_wait_for_replica_count, semi_sync_replica_enabled, semi_sync_primary_status, semi_sync_primary_clients, semi_sync_replica_status, last_discovery_latency, last_seen) VALUES - (?, ?, ?, NOW(), NOW(), 1, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW()) - ON DUPLICATE KEY UPDATE 
- alias=VALUES(alias), hostname=VALUES(hostname), port=VALUES(port), last_checked=VALUES(last_checked), last_attempted_check=VALUES(last_attempted_check), last_check_partial_success=VALUES(last_check_partial_success), server_id=VALUES(server_id), server_uuid=VALUES(server_uuid), version=VALUES(version), major_version=VALUES(major_version), version_comment=VALUES(version_comment), binlog_server=VALUES(binlog_server), read_only=VALUES(read_only), binlog_format=VALUES(binlog_format), binlog_row_image=VALUES(binlog_row_image), log_bin=VALUES(log_bin), log_replica_updates=VALUES(log_replica_updates), binary_log_file=VALUES(binary_log_file), binary_log_pos=VALUES(binary_log_pos), source_host=VALUES(source_host), source_port=VALUES(source_port), replica_net_timeout=VALUES(replica_net_timeout), heartbeat_interval=VALUES(heartbeat_interval), replica_sql_running=VALUES(replica_sql_running), replica_io_running=VALUES(replica_io_running), replication_sql_thread_state=VALUES(replication_sql_thread_state), replication_io_thread_state=VALUES(replication_io_thread_state), has_replication_filters=VALUES(has_replication_filters), supports_oracle_gtid=VALUES(supports_oracle_gtid), oracle_gtid=VALUES(oracle_gtid), source_uuid=VALUES(source_uuid), ancestry_uuid=VALUES(ancestry_uuid), executed_gtid_set=VALUES(executed_gtid_set), gtid_mode=VALUES(gtid_mode), gtid_purged=VALUES(gtid_purged), gtid_errant=VALUES(gtid_errant), mariadb_gtid=VALUES(mariadb_gtid), pseudo_gtid=VALUES(pseudo_gtid), source_log_file=VALUES(source_log_file), read_source_log_pos=VALUES(read_source_log_pos), relay_source_log_file=VALUES(relay_source_log_file), exec_source_log_pos=VALUES(exec_source_log_pos), relay_log_file=VALUES(relay_log_file), relay_log_pos=VALUES(relay_log_pos), last_sql_error=VALUES(last_sql_error), last_io_error=VALUES(last_io_error), replication_lag_seconds=VALUES(replication_lag_seconds), replica_lag_seconds=VALUES(replica_lag_seconds), sql_delay=VALUES(sql_delay), data_center=VALUES(data_center), region=VALUES(region), physical_environment=VALUES(physical_environment), replication_depth=VALUES(replication_depth), is_co_primary=VALUES(is_co_primary), has_replication_credentials=VALUES(has_replication_credentials), allow_tls=VALUES(allow_tls), - semi_sync_enforced=VALUES(semi_sync_enforced), semi_sync_primary_enabled=VALUES(semi_sync_primary_enabled), semi_sync_primary_timeout=VALUES(semi_sync_primary_timeout), semi_sync_primary_wait_for_replica_count=VALUES(semi_sync_primary_wait_for_replica_count), semi_sync_replica_enabled=VALUES(semi_sync_replica_enabled), semi_sync_primary_status=VALUES(semi_sync_primary_status), semi_sync_primary_clients=VALUES(semi_sync_primary_clients), semi_sync_replica_status=VALUES(semi_sync_replica_status), - last_discovery_latency=VALUES(last_discovery_latency), last_seen=VALUES(last_seen) + (?, ?, ?, datetime('now'), datetime('now'), 1, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now')) ` a1 := `zone1-i710, i710, 3306, 710, , 5.6.7, 5.6, MySQL, false, false, STATEMENT, FULL, false, false, , 0, , 0, 0, 0, false, false, 0, 0, false, false, false, , , , , , , false, false, , 0, mysql.000007, 10, , 0, , , {0 false}, {0 false}, 0, , , , 0, false, false, false, false, false, 0, 0, false, false, 0, false, 0,` - sql1, args1, err := mkInsertOdkuForInstances(instances[:1], false, true) + sql1, args1, err := mkInsertForInstances(instances[:1], false, true) 
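A note on the dialect switch running through these hunks: MySQL's NOW(), INTERVAL, and UNIX_TIMESTAMP() forms become SQLite datetime() modifiers and strftime() calls, and INSERT IGNORE / ON DUPLICATE KEY UPDATE become INSERT OR IGNORE / REPLACE INTO. A tiny runnable check of the two most common time equivalences; the mattn/go-sqlite3 driver is an assumption here, any Go SQLite driver would do:

package main

import (
	"database/sql"
	"fmt"

	_ "github.com/mattn/go-sqlite3" // assumption: any Go SQLite driver works for this demo
)

func main() {
	db, err := sql.Open("sqlite3", ":memory:")
	if err != nil {
		panic(err)
	}
	defer db.Close()

	// MySQL: NOW() - INTERVAL ? SECOND
	// SQLite: datetime('now', printf('-%d second', ?))
	var cutoff string
	if err := db.QueryRow(`SELECT datetime('now', printf('-%d second', ?))`, 30).Scan(&cutoff); err != nil {
		panic(err)
	}
	fmt.Println("30s ago:", cutoff)

	// MySQL: UNIX_TIMESTAMP()
	// SQLite: strftime('%s', 'now')
	var epoch string
	if err := db.QueryRow(`SELECT strftime('%s', 'now')`).Scan(&epoch); err != nil {
		panic(err)
	}
	fmt.Println("epoch:", epoch)
}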
require.NoError(t, err) require.Equal(t, normalizeQuery(sql1), normalizeQuery(s1)) require.Equal(t, stripSpaces(fmtArgs(args1)), stripSpaces(a1)) } -func TestMkInsertOdkuThree(t *testing.T) { +func TestMkInsertThree(t *testing.T) { instances := mkTestInstances() // three instances - s3 := `INSERT INTO database_instance + s3 := `REPLACE INTO database_instance (alias, hostname, port, last_checked, last_attempted_check, last_check_partial_success, server_id, server_uuid, version, major_version, version_comment, binlog_server, read_only, binlog_format, binlog_row_image, log_bin, log_replica_updates, binary_log_file, binary_log_pos, source_host, source_port, replica_net_timeout, heartbeat_interval, replica_sql_running, replica_io_running, replication_sql_thread_state, replication_io_thread_state, has_replication_filters, supports_oracle_gtid, oracle_gtid, source_uuid, ancestry_uuid, executed_gtid_set, gtid_mode, gtid_purged, gtid_errant, mariadb_gtid, pseudo_gtid, source_log_file, read_source_log_pos, relay_source_log_file, exec_source_log_pos, relay_log_file, relay_log_pos, last_sql_error, last_io_error, replication_lag_seconds, replica_lag_seconds, sql_delay, data_center, region, physical_environment, replication_depth, is_co_primary, has_replication_credentials, allow_tls, semi_sync_enforced, semi_sync_primary_enabled, semi_sync_primary_timeout, semi_sync_primary_wait_for_replica_count, semi_sync_replica_enabled, semi_sync_primary_status, semi_sync_primary_clients, semi_sync_replica_status, last_discovery_latency, last_seen) VALUES - (?, ?, ?, NOW(), NOW(), 1, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW()), - (?, ?, ?, NOW(), NOW(), 1, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW()), - (?, ?, ?, NOW(), NOW(), 1, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW()) - ON DUPLICATE KEY UPDATE - alias=VALUES(alias), hostname=VALUES(hostname), port=VALUES(port), last_checked=VALUES(last_checked), last_attempted_check=VALUES(last_attempted_check), last_check_partial_success=VALUES(last_check_partial_success), server_id=VALUES(server_id), server_uuid=VALUES(server_uuid), version=VALUES(version), major_version=VALUES(major_version), version_comment=VALUES(version_comment), binlog_server=VALUES(binlog_server), read_only=VALUES(read_only), binlog_format=VALUES(binlog_format), binlog_row_image=VALUES(binlog_row_image), log_bin=VALUES(log_bin), log_replica_updates=VALUES(log_replica_updates), binary_log_file=VALUES(binary_log_file), binary_log_pos=VALUES(binary_log_pos), source_host=VALUES(source_host), source_port=VALUES(source_port), replica_net_timeout=VALUES(replica_net_timeout), heartbeat_interval=VALUES(heartbeat_interval), replica_sql_running=VALUES(replica_sql_running), replica_io_running=VALUES(replica_io_running), replication_sql_thread_state=VALUES(replication_sql_thread_state), replication_io_thread_state=VALUES(replication_io_thread_state), has_replication_filters=VALUES(has_replication_filters), supports_oracle_gtid=VALUES(supports_oracle_gtid), oracle_gtid=VALUES(oracle_gtid), source_uuid=VALUES(source_uuid), ancestry_uuid=VALUES(ancestry_uuid), executed_gtid_set=VALUES(executed_gtid_set), 
gtid_mode=VALUES(gtid_mode), gtid_purged=VALUES(gtid_purged), gtid_errant=VALUES(gtid_errant), mariadb_gtid=VALUES(mariadb_gtid), pseudo_gtid=VALUES(pseudo_gtid), source_log_file=VALUES(source_log_file), read_source_log_pos=VALUES(read_source_log_pos), relay_source_log_file=VALUES(relay_source_log_file), exec_source_log_pos=VALUES(exec_source_log_pos), relay_log_file=VALUES(relay_log_file), relay_log_pos=VALUES(relay_log_pos), last_sql_error=VALUES(last_sql_error), last_io_error=VALUES(last_io_error), replication_lag_seconds=VALUES(replication_lag_seconds), replica_lag_seconds=VALUES(replica_lag_seconds), sql_delay=VALUES(sql_delay), data_center=VALUES(data_center), region=VALUES(region), - physical_environment=VALUES(physical_environment), replication_depth=VALUES(replication_depth), is_co_primary=VALUES(is_co_primary), has_replication_credentials=VALUES(has_replication_credentials), allow_tls=VALUES(allow_tls), semi_sync_enforced=VALUES(semi_sync_enforced), - semi_sync_primary_enabled=VALUES(semi_sync_primary_enabled), semi_sync_primary_timeout=VALUES(semi_sync_primary_timeout), semi_sync_primary_wait_for_replica_count=VALUES(semi_sync_primary_wait_for_replica_count), semi_sync_replica_enabled=VALUES(semi_sync_replica_enabled), semi_sync_primary_status=VALUES(semi_sync_primary_status), semi_sync_primary_clients=VALUES(semi_sync_primary_clients), semi_sync_replica_status=VALUES(semi_sync_replica_status), - last_discovery_latency=VALUES(last_discovery_latency), last_seen=VALUES(last_seen) + (?, ?, ?, datetime('now'), datetime('now'), 1, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now')), + (?, ?, ?, datetime('now'), datetime('now'), 1, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now')), + (?, ?, ?, datetime('now'), datetime('now'), 1, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now')) ` a3 := ` zone1-i710, i710, 3306, 710, , 5.6.7, 5.6, MySQL, false, false, STATEMENT, FULL, false, false, , 0, , 0, 0, 0, false, false, 0, 0, false, false, false, , , , , , , false, false, , 0, mysql.000007, 10, , 0, , , {0 false}, {0 false}, 0, , , , 0, false, false, false, false, false, 0, 0, false, false, 0, false, 0, @@ -107,7 +98,7 @@ func TestMkInsertOdkuThree(t *testing.T) { zone1-i730, i730, 3306, 730, , 5.6.7, 5.6, MySQL, false, false, STATEMENT, FULL, false, false, , 0, , 0, 0, 0, false, false, 0, 0, false, false, false, , , , , , , false, false, , 0, mysql.000007, 30, , 0, , , {0 false}, {0 false}, 0, , , , 0, false, false, false, false, false, 0, 0, false, false, 0, false, 0, ` - sql3, args3, err := mkInsertOdkuForInstances(instances[:3], true, true) + sql3, args3, err := mkInsertForInstances(instances[:3], true, true) require.NoError(t, err) require.Equal(t, normalizeQuery(sql3), normalizeQuery(s3)) require.Equal(t, stripSpaces(fmtArgs(args3)), stripSpaces(a3)) @@ -437,27 +428,27 @@ func TestReadOutdatedInstanceKeys(t *testing.T) { }{ { name: "No problems", - sql: []string{"update database_instance set last_checked = now()"}, + sql: []string{"update database_instance set last_checked = datetime('now')"}, instancesRequired: nil, }, { name: "One instance is outdated", sql: 
[]string{ - "update database_instance set last_checked = now()", - "update database_instance set last_checked = datetime(now(), '-1 hour') where alias = 'zone1-0000000100'", + "update database_instance set last_checked = datetime('now')", + "update database_instance set last_checked = datetime('now', '-1 hour') where alias = 'zone1-0000000100'", }, instancesRequired: []string{"zone1-0000000100"}, }, { name: "One instance doesn't have myql data", sql: []string{ - "update database_instance set last_checked = now()", + "update database_instance set last_checked = datetime('now')", `INSERT INTO vitess_tablet VALUES('zone1-0000000103','localhost',7706,'ks','0','zone1',2,'0001-01-01 00:00:00+00:00','');`, }, instancesRequired: []string{"zone1-0000000103"}, }, { name: "One instance doesn't have myql data and one is outdated", sql: []string{ - "update database_instance set last_checked = now()", - "update database_instance set last_checked = datetime(now(), '-1 hour') where alias = 'zone1-0000000100'", + "update database_instance set last_checked = datetime('now')", + "update database_instance set last_checked = datetime('now', '-1 hour') where alias = 'zone1-0000000100'", `INSERT INTO vitess_tablet VALUES('zone1-0000000103','localhost',7706,'ks','0','zone1',2,'0001-01-01 00:00:00+00:00','');`, }, instancesRequired: []string{"zone1-0000000103", "zone1-0000000100"}, @@ -494,10 +485,10 @@ func TestReadOutdatedInstanceKeys(t *testing.T) { errInDataCollection := db.QueryVTOrcRowsMap(`select alias, last_checked, last_attempted_check, -ROUND((JULIANDAY(now()) - JULIANDAY(last_checked)) * 86400) AS difference, +ROUND((JULIANDAY(datetime('now')) - JULIANDAY(last_checked)) * 86400) AS difference, last_attempted_check <= last_checked as use1, -last_checked < now() - interval 1500 second as is_outdated1, -last_checked < now() - interval 3000 second as is_outdated2 +last_checked < datetime('now', '-1500 second') as is_outdated1, +last_checked < datetime('now', '-3000 second') as is_outdated2 from database_instance`, func(rowMap sqlutils.RowMap) error { log.Errorf("Row in database_instance - %+v", rowMap) return nil @@ -521,12 +512,12 @@ func TestUpdateInstanceLastChecked(t *testing.T) { name: "Verify updated last checked", tabletAlias: "zone1-0000000100", partialSuccess: false, - conditionToCheck: "last_checked >= now() - interval 30 second and last_check_partial_success = false", + conditionToCheck: "last_checked >= datetime('now', '-30 second') and last_check_partial_success = false", }, { name: "Verify partial success", tabletAlias: "zone1-0000000100", partialSuccess: true, - conditionToCheck: "last_checked >= now() - interval 30 second and last_check_partial_success = true", + conditionToCheck: "last_checked >= datetime('now', '-30 second') and last_check_partial_success = true", }, { name: "Verify no error on unknown tablet", tabletAlias: "unknown tablet", @@ -572,7 +563,7 @@ func TestUpdateInstanceLastAttemptedCheck(t *testing.T) { { name: "Verify updated last checked", tabletAlias: "zone1-0000000100", - conditionToCheck: "last_attempted_check >= now() - interval 30 second", + conditionToCheck: "last_attempted_check >= datetime('now', '-30 second')", }, { name: "Verify no error on unknown tablet", tabletAlias: "unknown tablet", @@ -746,8 +737,8 @@ func TestExpireTableData(t *testing.T) { timestampColumn: "audit_timestamp", expectedRowCount: 1, insertQuery: `insert into audit (audit_id, audit_timestamp, audit_type, alias, message, keyspace, shard) values -(1, NOW() - INTERVAL 50 DAY, 
'a','a','a','a','a'), -(2, NOW() - INTERVAL 5 DAY, 'a','a','a','a','a')`, +(1, datetime('now', '-50 DAY'), 'a','a','a','a','a'), +(2, datetime('now', '-5 DAY'), 'a','a','a','a','a')`, }, { name: "ExpireRecoveryDetectionHistory", @@ -755,9 +746,9 @@ func TestExpireTableData(t *testing.T) { timestampColumn: "detection_timestamp", expectedRowCount: 2, insertQuery: `insert into recovery_detection (detection_id, detection_timestamp, alias, analysis, keyspace, shard) values -(1, NOW() - INTERVAL 3 DAY,'a','a','a','a'), -(2, NOW() - INTERVAL 5 DAY,'a','a','a','a'), -(3, NOW() - INTERVAL 15 DAY,'a','a','a','a')`, +(1, datetime('now', '-3 DAY'),'a','a','a','a'), +(2, datetime('now', '-5 DAY'),'a','a','a','a'), +(3, datetime('now', '-15 DAY'),'a','a','a','a')`, }, } for _, tt := range tests { diff --git a/go/vt/vtorc/inst/shard_dao.go b/go/vt/vtorc/inst/shard_dao.go index a90eed0f509..5501ba5d7b9 100644 --- a/go/vt/vtorc/inst/shard_dao.go +++ b/go/vt/vtorc/inst/shard_dao.go @@ -29,6 +29,30 @@ import ( // ErrShardNotFound is a fixed error message used when a shard is not found in the database. var ErrShardNotFound = errors.New("shard not found") +// ReadShardNames reads the names of vitess shards for a single keyspace. +func ReadShardNames(keyspaceName string) (shardNames []string, err error) { + shardNames = make([]string, 0) + query := `select shard from vitess_shard where keyspace = ?` + args := sqlutils.Args(keyspaceName) + err = db.QueryVTOrc(query, args, func(row sqlutils.RowMap) error { + shardNames = append(shardNames, row.GetString("shard")) + return nil + }) + return shardNames, err +} + +// ReadAllShardNames reads the names of all vitess shards by keyspace. +func ReadAllShardNames() (shardNames map[string][]string, err error) { + shardNames = make(map[string][]string) + query := `select keyspace, shard from vitess_shard` + err = db.QueryVTOrc(query, nil, func(row sqlutils.RowMap) error { + ks := row.GetString("keyspace") + shardNames[ks] = append(shardNames[ks], row.GetString("shard")) + return nil + }) + return shardNames, err +} + // ReadShardPrimaryInformation reads the vitess shard record and gets the shard primary alias and timestamp. func ReadShardPrimaryInformation(keyspaceName, shardName string) (primaryAlias string, primaryTimestamp string, err error) { if err = topo.ValidateKeyspaceName(keyspaceName); err != nil { @@ -95,3 +119,16 @@ func getShardPrimaryTermStartTimeString(shard *topo.ShardInfo) string { } return protoutil.TimeFromProto(shard.PrimaryTermStartTime).UTC().String() } + +// DeleteShard deletes a shard using a keyspace and shard name. +func DeleteShard(keyspace, shard string) error { + _, err := db.ExecVTOrc(`DELETE FROM + vitess_shard + WHERE + keyspace = ? + AND shard = ?`, + keyspace, + shard, + ) + return err +} diff --git a/go/vt/vtorc/inst/shard_dao_test.go b/go/vt/vtorc/inst/shard_dao_test.go index 84f6aef7a4a..8cf67f10ee6 100644 --- a/go/vt/vtorc/inst/shard_dao_test.go +++ b/go/vt/vtorc/inst/shard_dao_test.go @@ -28,7 +28,7 @@ import ( "vitess.io/vitess/go/vt/vtorc/db" ) -func TestSaveAndReadShard(t *testing.T) { +func TestSaveReadAndDeleteShard(t *testing.T) { // Clear the database after the test. The easiest way to do that is to run all the initialization commands again. 
defer func() { db.ClearVTOrcDatabase() @@ -93,6 +93,7 @@ func TestSaveAndReadShard(t *testing.T) { require.NoError(t, err) } + // ReadShardPrimaryInformation shardPrimaryAlias, primaryTimestamp, err := ReadShardPrimaryInformation(tt.keyspaceName, tt.shardName) if tt.err != "" { require.EqualError(t, err, tt.err) @@ -101,6 +102,23 @@ func TestSaveAndReadShard(t *testing.T) { require.NoError(t, err) require.EqualValues(t, tt.primaryAliasWanted, shardPrimaryAlias) require.EqualValues(t, tt.primaryTimestampWanted, primaryTimestamp) + + // ReadShardNames + shardNames, err := ReadShardNames(tt.keyspaceName) + require.NoError(t, err) + require.Equal(t, []string{tt.shardName}, shardNames) + + // ReadAllShardNames + allShardNames, err := ReadAllShardNames() + require.NoError(t, err) + ksShards, found := allShardNames[tt.keyspaceName] + require.True(t, found) + require.Equal(t, []string{tt.shardName}, ksShards) + + // DeleteShard + require.NoError(t, DeleteShard(tt.keyspaceName, tt.shardName)) + _, _, err = ReadShardPrimaryInformation(tt.keyspaceName, tt.shardName) + require.EqualError(t, err, ErrShardNotFound.Error()) }) } } diff --git a/go/vt/vtorc/logic/disable_recovery.go b/go/vt/vtorc/logic/disable_recovery.go index 74aa291e17a..60650798876 100644 --- a/go/vt/vtorc/logic/disable_recovery.go +++ b/go/vt/vtorc/logic/disable_recovery.go @@ -64,7 +64,7 @@ func IsRecoveryDisabled() (disabled bool, err error) { // DisableRecovery ensures recoveries are disabled globally func DisableRecovery() error { _, err := db.ExecVTOrc(` - INSERT IGNORE INTO global_recovery_disable + INSERT OR IGNORE INTO global_recovery_disable (disable_recovery) VALUES (1) `, diff --git a/go/vt/vtorc/logic/keyspace_shard_discovery.go b/go/vt/vtorc/logic/keyspace_shard_discovery.go index 0dd17cb65fd..77f3930be1e 100644 --- a/go/vt/vtorc/logic/keyspace_shard_discovery.go +++ b/go/vt/vtorc/logic/keyspace_shard_discovery.go @@ -18,20 +18,49 @@ package logic import ( "context" - "sort" - "strings" "sync" - "vitess.io/vitess/go/vt/log" + "golang.org/x/exp/maps" + "vitess.io/vitess/go/stats" + "vitess.io/vitess/go/vt/log" "vitess.io/vitess/go/vt/topo" + "vitess.io/vitess/go/vt/topo/topoproto" "vitess.io/vitess/go/vt/vtorc/inst" ) +var statsShardsWatched = stats.NewGaugesFuncWithMultiLabels("ShardsWatched", + "Keyspace/shards currently watched", + []string{"Keyspace", "Shard"}, + getShardsWatchedStats) + +// getShardsWatchedStats returns the keyspace/shards watched in a format for stats. +func getShardsWatchedStats() map[string]int64 { + shardsWatched := make(map[string]int64) + allShardNames, err := inst.ReadAllShardNames() + if err != nil { + log.Errorf("Failed to read all shard names: %+v", err) + return shardsWatched + } + for ks, shards := range allShardNames { + for _, shard := range shards { + shardsWatched[ks+"."+shard] = 1 + } + } + return shardsWatched +} + +// refreshAllKeyspacesAndShardsMu ensures RefreshAllKeyspacesAndShards +// is not executed concurrently. +var refreshAllKeyspacesAndShardsMu sync.Mutex + // RefreshAllKeyspacesAndShards reloads the keyspace and shard information for the keyspaces that vtorc is concerned with. 
func RefreshAllKeyspacesAndShards(ctx context.Context) error { + refreshAllKeyspacesAndShardsMu.Lock() + defer refreshAllKeyspacesAndShardsMu.Unlock() + var keyspaces []string - if len(clustersToWatch) == 0 { // all known keyspaces + if len(shardsToWatch) == 0 { // all known keyspaces ctx, cancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout) defer cancel() var err error @@ -41,26 +70,10 @@ func RefreshAllKeyspacesAndShards(ctx context.Context) error { return err } } else { - // Parse input and build list of keyspaces - for _, ks := range clustersToWatch { - if strings.Contains(ks, "/") { - // This is a keyspace/shard specification - input := strings.Split(ks, "/") - keyspaces = append(keyspaces, input[0]) - } else { - // Assume this is a keyspace - keyspaces = append(keyspaces, ks) - } - } - if len(keyspaces) == 0 { - log.Errorf("Found no keyspaces for input: %+v", clustersToWatch) - return nil - } + // Get keyspaces to watch from the list of known keyspaces. + keyspaces = maps.Keys(shardsToWatch) } - // Sort the list of keyspaces. - // The list can have duplicates because the input to clusters to watch may have multiple shards of the same keyspace - sort.Strings(keyspaces) refreshCtx, refreshCancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout) defer refreshCancel() var wg sync.WaitGroup @@ -125,6 +138,7 @@ func refreshKeyspaceHelper(ctx context.Context, keyspaceName string) error { // refreshAllShards refreshes all the shard records in the given keyspace. func refreshAllShards(ctx context.Context, keyspaceName string) error { + // get all shards for keyspace name. shardInfos, err := ts.FindAllShardsInKeyspace(ctx, keyspaceName, &topo.FindAllShardsInKeyspaceOptions{ // Fetch shard records concurrently to speed up discovery. A typical // Vitess cluster will have 1-3 vtorc instances deployed, so there is @@ -135,13 +149,35 @@ func refreshAllShards(ctx context.Context, keyspaceName string) error { log.Error(err) return err } + savedShards := make(map[string]bool, len(shardInfos)) for _, shardInfo := range shardInfos { err = inst.SaveShard(shardInfo) if err != nil { log.Error(err) return err } + savedShards[shardInfo.ShardName()] = true + } + + // delete shards that were not returned by ts.FindAllShardsInKeyspace(...), + // indicating they are stale. + shards, err := inst.ReadShardNames(keyspaceName) + if err != nil { + return err + } + for _, shard := range shards { + if savedShards[shard] { + continue + } + shardName := topoproto.KeyspaceShardString(keyspaceName, shard) + log.Infof("Forgetting shard: %s", shardName) + err = inst.DeleteShard(keyspaceName, shard) + if err != nil { + log.Errorf("Failed to delete shard %s: %+v", shardName, err) + return err + } } + return nil } diff --git a/go/vt/vtorc/logic/keyspace_shard_discovery_test.go b/go/vt/vtorc/logic/keyspace_shard_discovery_test.go index 5cbe139728b..20e32d99470 100644 --- a/go/vt/vtorc/logic/keyspace_shard_discovery_test.go +++ b/go/vt/vtorc/logic/keyspace_shard_discovery_test.go @@ -92,6 +92,8 @@ func TestRefreshAllKeyspaces(t *testing.T) { // Set clusters to watch to only watch ks1 and ks3 onlyKs1and3 := []string{"ks1/-80", "ks3/-80", "ks3/80-"} clustersToWatch = onlyKs1and3 + err := initializeShardsToWatch() + require.NoError(t, err) require.NoError(t, RefreshAllKeyspacesAndShards(context.Background())) // Verify that we only have ks1 and ks3 in vtorc's db. 
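For context on what those clustersToWatch values mean: initializeShardsToWatch, added later in this patch, parses each entry into key ranges rather than literal shard names, and a tablet is watched when some watched range contains its key range. A minimal sketch of that matching, using the same key helpers the patch itself calls; the sample ranges are assumptions for illustration:

package main

import (
	"fmt"

	"vitess.io/vitess/go/vt/key"
	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
)

func main() {
	// A "ks1/-80" style entry parses into a key range.
	watched, err := key.ParseShardingSpec("-80")
	if err != nil {
		panic(err)
	}
	// A bare "ks1" entry watches the complete key range.
	wholeKeyspace := []*topodatapb.KeyRange{key.NewCompleteKeyRange()}

	// A tablet is watched when a watched range contains its range.
	tabletRange, err := key.ParseShardingSpec("-40") // pretend this came from tablet.GetKeyRange()
	if err != nil {
		panic(err)
	}
	fmt.Println(key.KeyRangeContainsKeyRange(watched[0], tabletRange[0]))       // true: -80 contains -40
	fmt.Println(key.KeyRangeContainsKeyRange(wholeKeyspace[0], tabletRange[0])) // true: full range contains all
}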
@@ -105,6 +107,8 @@ func TestRefreshAllKeyspaces(t *testing.T) { // Set clusters to watch to watch all keyspaces clustersToWatch = nil + err = initializeShardsToWatch() + require.NoError(t, err) // Change the durability policy of ks1 reparenttestutil.SetKeyspaceDurability(ctx, t, ts, "ks1", "semi_sync") require.NoError(t, RefreshAllKeyspacesAndShards(context.Background())) @@ -118,7 +122,6 @@ func TestRefreshAllKeyspaces(t *testing.T) { verifyPrimaryAlias(t, "ks3", "80-", "zone_ks3-0000000101", "") verifyKeyspaceInfo(t, "ks4", keyspaceDurabilityTest, "") verifyPrimaryAlias(t, "ks4", "80-", "zone_ks4-0000000101", "") - } func TestRefreshKeyspace(t *testing.T) { @@ -307,3 +310,31 @@ func verifyPrimaryAlias(t *testing.T, keyspaceName, shardName string, primaryAli require.NoError(t, err) require.Equal(t, primaryAliasWanted, primaryAlias) } + +func TestRefreshAllShards(t *testing.T) { + // Store the old flags and restore on test completion + oldTs := ts + defer func() { + ts = oldTs + db.ClearVTOrcDatabase() + }() + + ctx := context.Background() + ts = memorytopo.NewServer(ctx, "zone1") + require.NoError(t, ts.CreateKeyspace(ctx, "ks1", keyspaceDurabilityNone)) + + shards := []string{"-80", "80-"} + for _, shard := range shards { + require.NoError(t, ts.CreateShard(ctx, "ks1", shard)) + } + require.NoError(t, refreshAllShards(ctx, "ks1")) + shardNames, err := inst.ReadShardNames("ks1") + require.NoError(t, err) + require.Equal(t, []string{"-80", "80-"}, shardNames) + + require.NoError(t, ts.DeleteShard(ctx, "ks1", "80-")) + require.NoError(t, refreshAllShards(ctx, "ks1")) + shardNames, err = inst.ReadShardNames("ks1") + require.NoError(t, err) + require.Equal(t, []string{"-80"}, shardNames) +} diff --git a/go/vt/vtorc/logic/tablet_discovery.go b/go/vt/vtorc/logic/tablet_discovery.go index 2218a112f14..37bea1a8733 100644 --- a/go/vt/vtorc/logic/tablet_discovery.go +++ b/go/vt/vtorc/logic/tablet_discovery.go @@ -27,20 +27,20 @@ import ( "time" "github.com/spf13/pflag" + "golang.org/x/sync/errgroup" "google.golang.org/protobuf/encoding/prototext" "google.golang.org/protobuf/proto" "vitess.io/vitess/go/vt/external/golib/sqlutils" + "vitess.io/vitess/go/vt/key" "vitess.io/vitess/go/vt/log" + topodatapb "vitess.io/vitess/go/vt/proto/topodata" "vitess.io/vitess/go/vt/topo" "vitess.io/vitess/go/vt/topo/topoproto" - "vitess.io/vitess/go/vt/topotools" "vitess.io/vitess/go/vt/vtorc/config" "vitess.io/vitess/go/vt/vtorc/db" "vitess.io/vitess/go/vt/vtorc/inst" "vitess.io/vitess/go/vt/vttablet/tmclient" - - topodatapb "vitess.io/vitess/go/vt/proto/topodata" ) var ( @@ -49,16 +49,83 @@ var ( clustersToWatch []string shutdownWaitTime = 30 * time.Second shardsLockCounter int32 + // shardsToWatch is a map storing the shards for a given keyspace that need to be watched. + // We store the key range for all the shards that we want to watch. + // This is populated by parsing `--clusters_to_watch` flag. + shardsToWatch map[string][]*topodatapb.KeyRange + // ErrNoPrimaryTablet is a fixed error message. ErrNoPrimaryTablet = errors.New("no primary tablet found") ) // RegisterFlags registers the flags required by VTOrc func RegisterFlags(fs *pflag.FlagSet) { - fs.StringSliceVar(&clustersToWatch, "clusters_to_watch", clustersToWatch, "Comma-separated list of keyspaces or keyspace/shards that this instance will monitor and repair. Defaults to all clusters in the topology. 
Example: \"ks1,ks2/-80\"") + fs.StringSliceVar(&clustersToWatch, "clusters_to_watch", clustersToWatch, "Comma-separated list of keyspaces or keyspace/keyranges that this instance will monitor and repair. Defaults to all clusters in the topology. Example: \"ks1,ks2/-80\"") + fs.DurationVar(&shutdownWaitTime, "shutdown_wait_time", shutdownWaitTime, "Maximum time to wait for VTOrc to release all the locks that it is holding before shutting down on SIGTERM") } +// initializeShardsToWatch parses the --clusters_to_watch flag value +// into a map of keyspace/shards. +func initializeShardsToWatch() error { + shardsToWatch = make(map[string][]*topodatapb.KeyRange) + if len(clustersToWatch) == 0 { + return nil + } + + for _, ks := range clustersToWatch { + if strings.Contains(ks, "/") && !strings.HasSuffix(ks, "/") { + // Validate that the keyspace/shard parses. + k, s, err := topoproto.ParseKeyspaceShard(ks) + if err != nil { + log.Errorf("Could not parse keyspace/shard %q: %+v", ks, err) + continue + } + if !key.IsValidKeyRange(s) { + return fmt.Errorf("invalid key range %q while parsing clusters to watch", s) + } + // Parse the shard name into a key range value. + keyRanges, err := key.ParseShardingSpec(s) + if err != nil { + return fmt.Errorf("could not parse shard name %q: %+v", s, err) + } + shardsToWatch[k] = append(shardsToWatch[k], keyRanges...) + } else { + // Remove the trailing slash if one exists. + ks = strings.TrimSuffix(ks, "/") + // Store the complete key range when no shard is specified. + shardsToWatch[ks] = []*topodatapb.KeyRange{key.NewCompleteKeyRange()} + } + } + + if len(shardsToWatch) == 0 { + log.Error("No keyspace/shards to watch, watching all keyspaces") + } + return nil +} + +// shouldWatchTablet checks if the given tablet is part of the watch list. +func shouldWatchTablet(tablet *topodatapb.Tablet) bool { + // If we are watching all keyspaces, then we want to watch this tablet too. + if len(shardsToWatch) == 0 { + return true + } + shardRanges, ok := shardsToWatch[tablet.GetKeyspace()] + // If we don't have the keyspace in our map, then this tablet + // doesn't need to be watched. + if !ok { + return false + } + // Get the tablet's key range, and check if + // it is part of the shard ranges we are watching. + kr := tablet.GetKeyRange() + for _, shardRange := range shardRanges { + if key.KeyRangeContainsKeyRange(shardRange, kr) { + return true + } + } + return false +} + // OpenTabletDiscovery opens the vitess topo if enabled and returns a ticker // channel for polling. func OpenTabletDiscovery() <-chan time.Time { @@ -68,6 +135,11 @@ if _, err := db.ExecVTOrc("delete from vitess_tablet"); err != nil { log.Error(err) } + // Parse --clusters_to_watch into a filter. + err := initializeShardsToWatch() + if err != nil { + log.Fatalf("Error parsing --clusters_to_watch: %v", err) + } // We refresh all information from the topo once before we start the ticks to do // it on a timer. ctx, cancel := context.WithTimeout(context.Background(), topo.RemoteOperationTimeout) @@ -78,6 +150,32 @@ return time.Tick(time.Second * time.Duration(config.Config.TopoInformationRefreshSeconds)) //nolint SA1015: using time.Tick leaks the underlying ticker } +// getAllTablets gets all tablets from all cells using a goroutine per cell. It returns a map of +// cells (string) to slices of tablets (as topo.TabletInfo) and a slice of cells (string) that +// failed to return a result.
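// Editor's note: a small, hedged illustration (not part of the patch) of the key-range
// matching that initializeShardsToWatch and shouldWatchTablet implement above. It assumes
// it is compiled inside the vitess module; key.ParseShardingSpec and
// key.KeyRangeContainsKeyRange are the same helpers the patch itself uses.
package main

import (
	"fmt"

	"vitess.io/vitess/go/vt/key"
)

func main() {
	// "--clusters_to_watch ks/-80" stores the key range [ , 0x80) for keyspace "ks".
	watched, err := key.ParseShardingSpec("-80")
	if err != nil {
		panic(err)
	}
	// A tablet serving ks/-40 lies entirely inside the watched range, so it matches.
	inside, _ := key.ParseShardingSpec("-40")
	// A tablet serving ks/60-90 straddles the 0x80 boundary, so it does not match.
	straddling, _ := key.ParseShardingSpec("60-90")
	fmt.Println(key.KeyRangeContainsKeyRange(watched[0], inside[0]))     // true
	fmt.Println(key.KeyRangeContainsKeyRange(watched[0], straddling[0])) // false
}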
+func getAllTablets(ctx context.Context, cells []string) (tabletsByCell map[string][]*topo.TabletInfo, failedCells []string) { + var mu sync.Mutex + failedCells = make([]string, 0, len(cells)) + tabletsByCell = make(map[string][]*topo.TabletInfo, len(cells)) + eg, ctx := errgroup.WithContext(ctx) + for _, cell := range cells { + eg.Go(func() error { + tablets, err := ts.GetTabletsByCell(ctx, cell, nil) + mu.Lock() + defer mu.Unlock() + if err != nil { + log.Errorf("Failed to load tablets from cell %s: %+v", cell, err) + failedCells = append(failedCells, cell) + } else { + tabletsByCell[cell] = tablets + } + return nil + }) + } + _ = eg.Wait() // always nil + return tabletsByCell, failedCells +} + // refreshAllTablets reloads the tablets from topo and discovers the ones which haven't been refreshed in a while func refreshAllTablets(ctx context.Context) error { return refreshTabletsUsing(ctx, func(tabletAlias string) { @@ -85,81 +183,48 @@ func refreshAllTablets(ctx context.Context) error { }, false /* forceRefresh */) } +// refreshTabletsUsing refreshes tablets using a provided loader. func refreshTabletsUsing(ctx context.Context, loader func(tabletAlias string), forceRefresh bool) error { - if len(clustersToWatch) == 0 { // all known clusters - ctx, cancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout) - defer cancel() - cells, err := ts.GetKnownCells(ctx) - if err != nil { - return err - } + // Get all cells. + cellsCtx, cellsCancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout) + defer cellsCancel() + cells, err := ts.GetKnownCells(cellsCtx) + if err != nil { + return err + } - refreshCtx, refreshCancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout) - defer refreshCancel() - var wg sync.WaitGroup - for _, cell := range cells { - wg.Add(1) - go func(cell string) { - defer wg.Done() - refreshTabletsInCell(refreshCtx, cell, loader, forceRefresh) - }(cell) - } - wg.Wait() - } else { - // Parse input and build list of keyspaces / shards - var keyspaceShards []*topo.KeyspaceShard - for _, ks := range clustersToWatch { - if strings.Contains(ks, "/") { - // This is a keyspace/shard specification - input := strings.Split(ks, "/") - keyspaceShards = append(keyspaceShards, &topo.KeyspaceShard{Keyspace: input[0], Shard: input[1]}) - } else { - // Assume this is a keyspace and find all shards in keyspace - ctx, cancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout) - defer cancel() - shards, err := ts.GetShardNames(ctx, ks) - if err != nil { - // Log the errr and continue - log.Errorf("Error fetching shards for keyspace: %v", ks) - continue - } - if len(shards) == 0 { - log.Errorf("Topo has no shards for ks: %v", ks) - continue - } - for _, s := range shards { - keyspaceShards = append(keyspaceShards, &topo.KeyspaceShard{Keyspace: ks, Shard: s}) + // Get all tablets from all cells. + getTabletsCtx, getTabletsCancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout) + defer getTabletsCancel() + tabletsByCell, failedCells := getAllTablets(getTabletsCtx, cells) + if len(tabletsByCell) == 0 { + log.Error("Found no cells with tablets") + return nil + } + if len(failedCells) > 0 { + log.Errorf("Got partial topo result. Failed cells: %s", strings.Join(failedCells, ", ")) + } + + // Update each cell that provided a response. This ensures only cells that provided a + // response are updated in the backend and are considered for forgetting stale tablets. 
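// Editor's note: a condensed, standalone sketch (editorial, not part of the patch) of the
// fan-out contract getAllTablets establishes above: per-cell goroutines record failures in
// a shared slice instead of returning an error, so eg.Wait() is always nil and one
// unreachable cell cannot cancel the lookups for the others. Cell names and the fetch
// function are illustrative; Go 1.22+ per-iteration loop variables are assumed, as in the patch.
package main

import (
	"fmt"
	"sync"

	"golang.org/x/sync/errgroup"
)

func main() {
	fetch := func(cell string) ([]string, error) {
		if cell == "zone3" {
			return nil, fmt.Errorf("cell %s unreachable", cell)
		}
		return []string{cell + "-0000000001"}, nil
	}
	var (
		mu            sync.Mutex
		tabletsByCell = map[string][]string{}
		failedCells   []string
		eg            errgroup.Group
	)
	for _, cell := range []string{"zone1", "zone2", "zone3"} {
		eg.Go(func() error {
			tablets, err := fetch(cell)
			mu.Lock()
			defer mu.Unlock()
			if err != nil {
				failedCells = append(failedCells, cell) // record, do not abort the group
				return nil
			}
			tabletsByCell[cell] = tablets
			return nil
		})
	}
	_ = eg.Wait() // always nil by construction
	fmt.Println(tabletsByCell, failedCells)
}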
+ for cell, tablets := range tabletsByCell { + // Filter tablets that should not be watched using func shouldWatchTablet. + matchedTablets := make([]*topo.TabletInfo, 0, len(tablets)) + func() { + for _, t := range tablets { + if shouldWatchTablet(t.Tablet) { + matchedTablets = append(matchedTablets, t) } } - } - if len(keyspaceShards) == 0 { - log.Errorf("Found no keyspaceShards for input: %+v", clustersToWatch) - return nil - } - refreshCtx, refreshCancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout) - defer refreshCancel() - var wg sync.WaitGroup - for _, ks := range keyspaceShards { - wg.Add(1) - go func(ks *topo.KeyspaceShard) { - defer wg.Done() - refreshTabletsInKeyspaceShard(refreshCtx, ks.Keyspace, ks.Shard, loader, forceRefresh, nil) - }(ks) - } - wg.Wait() - } - return nil -} + }() -func refreshTabletsInCell(ctx context.Context, cell string, loader func(tabletAlias string), forceRefresh bool) { - tablets, err := topotools.GetTabletMapForCell(ctx, ts, cell) - if err != nil { - log.Errorf("Error fetching topo info for cell %v: %v", cell, err) - return + // Refresh the filtered tablets and forget stale tablets. + query := "select alias from vitess_tablet where cell = ?" + args := sqlutils.Args(cell) + refreshTablets(matchedTablets, query, args, loader, forceRefresh, nil) } - query := "select alias from vitess_tablet where cell = ?" - args := sqlutils.Args(cell) - refreshTablets(tablets, query, args, loader, forceRefresh, nil) + + return nil } // forceRefreshAllTabletsInShard is used to refresh all the tablet's information (both MySQL information and topo records) @@ -184,7 +249,7 @@ func refreshTabletInfoOfShard(ctx context.Context, keyspace, shard string) { } func refreshTabletsInKeyspaceShard(ctx context.Context, keyspace, shard string, loader func(tabletAlias string), forceRefresh bool, tabletsToIgnore []string) { - tablets, err := ts.GetTabletMapForShard(ctx, keyspace, shard) + tablets, err := ts.GetTabletsByShard(ctx, keyspace, shard) if err != nil { log.Errorf("Error fetching tablets for keyspace/shard %v/%v: %v", keyspace, shard, err) return @@ -194,7 +259,7 @@ func refreshTabletsInKeyspaceShard(ctx context.Context, keyspace, shard string, refreshTablets(tablets, query, args, loader, forceRefresh, tabletsToIgnore) } -func refreshTablets(tablets map[string]*topo.TabletInfo, query string, args []any, loader func(tabletAlias string), forceRefresh bool, tabletsToIgnore []string) { +func refreshTablets(tablets []*topo.TabletInfo, query string, args []any, loader func(tabletAlias string), forceRefresh bool, tabletsToIgnore []string) { // Discover new tablets. 
latestInstances := make(map[string]bool) var wg sync.WaitGroup diff --git a/go/vt/vtorc/logic/tablet_discovery_test.go b/go/vt/vtorc/logic/tablet_discovery_test.go index f6a7af38382..c351fb41c0a 100644 --- a/go/vt/vtorc/logic/tablet_discovery_test.go +++ b/go/vt/vtorc/logic/tablet_discovery_test.go @@ -19,6 +19,8 @@ package logic import ( "context" "fmt" + "slices" + "strings" "sync/atomic" "testing" "time" @@ -29,9 +31,11 @@ import ( "google.golang.org/protobuf/proto" "vitess.io/vitess/go/vt/external/golib/sqlutils" + "vitess.io/vitess/go/vt/key" topodatapb "vitess.io/vitess/go/vt/proto/topodata" "vitess.io/vitess/go/vt/proto/vttime" "vitess.io/vitess/go/vt/topo" + "vitess.io/vitess/go/vt/topo/faketopo" "vitess.io/vitess/go/vt/topo/memorytopo" "vitess.io/vitess/go/vt/topo/topoproto" "vitess.io/vitess/go/vt/vtctl/grpcvtctldserver/testutil" @@ -101,6 +105,221 @@ var ( } ) +func TestShouldWatchTablet(t *testing.T) { + oldClustersToWatch := clustersToWatch + defer func() { + clustersToWatch = oldClustersToWatch + shardsToWatch = nil + }() + + testCases := []struct { + in []string + tablet *topodatapb.Tablet + expectedShouldWatch bool + }{ + { + in: []string{}, + tablet: &topodatapb.Tablet{ + Keyspace: keyspace, + Shard: shard, + }, + expectedShouldWatch: true, + }, + { + in: []string{keyspace}, + tablet: &topodatapb.Tablet{ + Keyspace: keyspace, + Shard: shard, + }, + expectedShouldWatch: true, + }, + { + in: []string{keyspace + "/-"}, + tablet: &topodatapb.Tablet{ + Keyspace: keyspace, + Shard: shard, + }, + expectedShouldWatch: true, + }, + { + in: []string{keyspace + "/" + shard}, + tablet: &topodatapb.Tablet{ + Keyspace: keyspace, + Shard: shard, + }, + expectedShouldWatch: true, + }, + { + in: []string{"ks/-70", "ks/70-"}, + tablet: &topodatapb.Tablet{ + Keyspace: "ks", + KeyRange: key.NewKeyRange([]byte{0x50}, []byte{0x70}), + }, + expectedShouldWatch: true, + }, + { + in: []string{"ks/-70", "ks/70-"}, + tablet: &topodatapb.Tablet{ + Keyspace: "ks", + KeyRange: key.NewKeyRange([]byte{0x40}, []byte{0x50}), + }, + expectedShouldWatch: true, + }, + { + in: []string{"ks/-70", "ks/70-"}, + tablet: &topodatapb.Tablet{ + Keyspace: "ks", + KeyRange: key.NewKeyRange([]byte{0x70}, []byte{0x90}), + }, + expectedShouldWatch: true, + }, + { + in: []string{"ks/-70", "ks/70-"}, + tablet: &topodatapb.Tablet{ + Keyspace: "ks", + KeyRange: key.NewKeyRange([]byte{0x60}, []byte{0x90}), + }, + expectedShouldWatch: false, + }, + { + in: []string{"ks/50-70"}, + tablet: &topodatapb.Tablet{ + Keyspace: "ks", + KeyRange: key.NewKeyRange([]byte{0x50}, []byte{0x70}), + }, + expectedShouldWatch: true, + }, + { + in: []string{"ks2/-70", "ks2/70-", "unknownKs/-", "ks/-80"}, + tablet: &topodatapb.Tablet{ + Keyspace: "ks", + KeyRange: key.NewKeyRange([]byte{0x60}, []byte{0x80}), + }, + expectedShouldWatch: true, + }, + { + in: []string{"ks2/-70", "ks2/70-", "unknownKs/-", "ks/-80"}, + tablet: &topodatapb.Tablet{ + Keyspace: "ks", + KeyRange: key.NewKeyRange([]byte{0x80}, []byte{0x90}), + }, + expectedShouldWatch: false, + }, + { + in: []string{"ks2/-70", "ks2/70-", "unknownKs/-", "ks/-80"}, + tablet: &topodatapb.Tablet{ + Keyspace: "ks", + KeyRange: key.NewKeyRange([]byte{0x90}, []byte{0xa0}), + }, + expectedShouldWatch: false, + }, + } + + for _, tt := range testCases { + t.Run(fmt.Sprintf("%v-Tablet-%v-%v", strings.Join(tt.in, ","), tt.tablet.GetKeyspace(), tt.tablet.GetShard()), func(t *testing.T) { + clustersToWatch = tt.in + err := initializeShardsToWatch() + require.NoError(t, err) + assert.Equal(t, 
tt.expectedShouldWatch, shouldWatchTablet(tt.tablet)) + }) + } +} + +// TestInitializeShardsToWatch tests that we initialize the shardsToWatch map correctly +// using the `--clusters_to_watch` flag. +func TestInitializeShardsToWatch(t *testing.T) { + oldClustersToWatch := clustersToWatch + defer func() { + clustersToWatch = oldClustersToWatch + shardsToWatch = nil + }() + + testCases := []struct { + in []string + expected map[string][]*topodatapb.KeyRange + expectedErr string + }{ + { + in: []string{}, + expected: map[string][]*topodatapb.KeyRange{}, + }, + { + in: []string{"unknownKs"}, + expected: map[string][]*topodatapb.KeyRange{ + "unknownKs": { + key.NewCompleteKeyRange(), + }, + }, + }, + { + in: []string{"test/-"}, + expected: map[string][]*topodatapb.KeyRange{ + "test": { + key.NewCompleteKeyRange(), + }, + }, + }, + { + in: []string{"test/324"}, + expectedErr: `invalid key range "324" while parsing clusters to watch`, + }, + { + in: []string{"test/0"}, + expected: map[string][]*topodatapb.KeyRange{ + "test": { + key.NewCompleteKeyRange(), + }, + }, + }, + { + in: []string{"test/-", "test2/-80", "test2/80-"}, + expected: map[string][]*topodatapb.KeyRange{ + "test": { + key.NewCompleteKeyRange(), + }, + "test2": { + key.NewKeyRange(nil, []byte{0x80}), + key.NewKeyRange([]byte{0x80}, nil), + }, + }, + }, + { + // known keyspace + in: []string{keyspace}, + expected: map[string][]*topodatapb.KeyRange{ + keyspace: { + key.NewCompleteKeyRange(), + }, + }, + }, + { + // keyspace with trailing-slash + in: []string{keyspace + "/"}, + expected: map[string][]*topodatapb.KeyRange{ + keyspace: { + key.NewCompleteKeyRange(), + }, + }, + }, + } + + for _, testCase := range testCases { + t.Run(strings.Join(testCase.in, ","), func(t *testing.T) { + defer func() { + shardsToWatch = make(map[string][]*topodatapb.KeyRange, 0) + }() + clustersToWatch = testCase.in + err := initializeShardsToWatch() + if testCase.expectedErr != "" { + require.EqualError(t, err, testCase.expectedErr) + return + } + require.NoError(t, err) + require.Equal(t, testCase.expected, shardsToWatch) + }) + } +} + func TestRefreshTabletsInKeyspaceShard(t *testing.T) { // Store the old flags and restore on test completion oldTs := ts @@ -623,3 +842,61 @@ func TestSetReplicationSource(t *testing.T) { }) } } + +func TestGetAllTablets(t *testing.T) { + tablet := &topodatapb.Tablet{ + Hostname: t.Name(), + } + tabletProto, _ := tablet.MarshalVT() + + factory := faketopo.NewFakeTopoFactory() + + // zone1 (success) + goodCell1 := faketopo.NewFakeConnection() + goodCell1.AddListResult("tablets", []topo.KVInfo{ + { + Key: []byte("zone1-00000001"), + Value: tabletProto, + }, + }) + factory.SetCell("zone1", goodCell1) + + // zone2 (success) + goodCell2 := faketopo.NewFakeConnection() + goodCell2.AddListResult("tablets", []topo.KVInfo{ + { + Key: []byte("zone2-00000002"), + Value: tabletProto, + }, + }) + factory.SetCell("zone2", goodCell2) + + // zone3 (fail) + badCell1 := faketopo.NewFakeConnection() + badCell1.AddListError(true) + factory.SetCell("zone3", badCell1) + + // zone4 (fail) + badCell2 := faketopo.NewFakeConnection() + badCell2.AddListError(true) + factory.SetCell("zone4", badCell2) + + oldTs := ts + defer func() { + ts = oldTs + }() + ctx := context.Background() + ts = faketopo.NewFakeTopoServer(ctx, factory) + + // confirm zone1 + zone2 succeeded and zone3 + zone4 failed + tabletsByCell, failedCells := getAllTablets(ctx, []string{"zone1", "zone2", "zone3", "zone4"}) + require.Len(t, tabletsByCell, 2) + slices.Sort(failedCells) 
+ require.Equal(t, []string{"zone3", "zone4"}, failedCells) + for _, tablets := range tabletsByCell { + require.Len(t, tablets, 1) + for _, tablet := range tablets { + require.Equal(t, t.Name(), tablet.Tablet.GetHostname()) + } + } +} diff --git a/go/vt/vtorc/logic/topology_recovery.go b/go/vt/vtorc/logic/topology_recovery.go index aec137a45b4..f82aad0a0c7 100644 --- a/go/vt/vtorc/logic/topology_recovery.go +++ b/go/vt/vtorc/logic/topology_recovery.go @@ -171,13 +171,15 @@ func resolveRecovery(topologyRecovery *TopologyRecovery, successorInstance *inst } // recoverPrimaryHasPrimary resets the replication on the primary instance -func recoverPrimaryHasPrimary(ctx context.Context, analysisEntry *inst.ReplicationAnalysis) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { +func recoverPrimaryHasPrimary(ctx context.Context, analysisEntry *inst.ReplicationAnalysis, logger *log.PrefixedLogger) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { topologyRecovery, err = AttemptRecoveryRegistration(analysisEntry) if topologyRecovery == nil { - _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another fixPrimaryHasPrimary.", analysisEntry.AnalyzedInstanceAlias)) + message := fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another fixPrimaryHasPrimary.", analysisEntry.AnalyzedInstanceAlias) + logger.Warning(message) + _ = AuditTopologyRecovery(topologyRecovery, message) return false, nil, err } - log.Infof("Analysis: %v, will fix incorrect primaryship on %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias) + logger.Infof("Analysis: %v, will fix incorrect primaryship on %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias) // This has to be done in the end; whether successful or not, we should mark that the recovery is done. // So that after the active period passes, we are able to run other recoveries. defer func() { @@ -187,6 +189,7 @@ func recoverPrimaryHasPrimary(ctx context.Context, analysisEntry *inst.Replicati // Read the tablet information from the database to find the shard and keyspace of the tablet analyzedTablet, err := inst.ReadTablet(analysisEntry.AnalyzedInstanceAlias) if err != nil { + logger.Errorf("Failed to read instance %s, aborting recovery", analysisEntry.AnalyzedInstanceAlias) return false, nil, err } @@ -197,19 +200,22 @@ func recoverPrimaryHasPrimary(ctx context.Context, analysisEntry *inst.Replicati // runEmergencyReparentOp runs a recovery for which we have to run ERS. Here waitForAllTablets is a boolean telling ERS whether it should wait for all the tablets // or is it okay to skip 1. 
-func runEmergencyReparentOp(ctx context.Context, analysisEntry *inst.ReplicationAnalysis, recoveryName string, waitForAllTablets bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { +func runEmergencyReparentOp(ctx context.Context, analysisEntry *inst.ReplicationAnalysis, recoveryName string, waitForAllTablets bool, logger *log.PrefixedLogger) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { // Read the tablet information from the database to find the shard and keyspace of the tablet tablet, err := inst.ReadTablet(analysisEntry.AnalyzedInstanceAlias) if err != nil { + logger.Errorf("Failed to read instance %s, aborting recovery", analysisEntry.AnalyzedInstanceAlias) return false, nil, err } topologyRecovery, err = AttemptRecoveryRegistration(analysisEntry) if topologyRecovery == nil { - _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another %v.", analysisEntry.AnalyzedInstanceAlias, recoveryName)) + message := fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another %v.", analysisEntry.AnalyzedInstanceAlias, recoveryName) + logger.Warning(message) + _ = AuditTopologyRecovery(topologyRecovery, message) return false, nil, err } - log.Infof("Analysis: %v, %v %+v", analysisEntry.Analysis, recoveryName, analysisEntry.AnalyzedInstanceAlias) + logger.Infof("Analysis: %v, %v %+v", analysisEntry.Analysis, recoveryName, analysisEntry.AnalyzedInstanceAlias) var promotedReplica *inst.Instance // This has to be done in the end; whether successful or not, we should mark that the recovery is done. // So that after the active period passes, we are able to run other recoveries. @@ -223,11 +229,11 @@ func runEmergencyReparentOp(ctx context.Context, analysisEntry *inst.Replication // we only log the warnings and errors explicitly, everything gets logged as an information message anyways in auditing topology recovery switch level { case logutilpb.Level_WARNING: - log.Warningf("ERS - %s", value) + logger.Warningf("ERS - %s", value) case logutilpb.Level_ERROR: - log.Errorf("ERS - %s", value) + logger.Errorf("ERS - %s", value) default: - log.Infof("ERS - %s", value) + logger.Infof("ERS - %s", value) } _ = AuditTopologyRecovery(topologyRecovery, value) })).ReparentShard(ctx, @@ -241,7 +247,7 @@ func runEmergencyReparentOp(ctx context.Context, analysisEntry *inst.Replication }, ) if err != nil { - log.Errorf("Error running ERS - %v", err) + logger.Errorf("Error running ERS - %v", err) } if ev != nil && ev.NewPrimary != nil { @@ -253,13 +259,13 @@ func runEmergencyReparentOp(ctx context.Context, analysisEntry *inst.Replication // recoverDeadPrimary checks a given analysis, decides whether to take action, and possibly takes action // Returns true when action was taken. -func recoverDeadPrimary(ctx context.Context, analysisEntry *inst.ReplicationAnalysis) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { - return runEmergencyReparentOp(ctx, analysisEntry, "RecoverDeadPrimary", false) +func recoverDeadPrimary(ctx context.Context, analysisEntry *inst.ReplicationAnalysis, logger *log.PrefixedLogger) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { + return runEmergencyReparentOp(ctx, analysisEntry, "RecoverDeadPrimary", false, logger) } // recoverPrimaryTabletDeleted tries to run a recovery for the case where the primary tablet has been deleted. 
-func recoverPrimaryTabletDeleted(ctx context.Context, analysisEntry *inst.ReplicationAnalysis) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { - return runEmergencyReparentOp(ctx, analysisEntry, "PrimaryTabletDeleted", true) +func recoverPrimaryTabletDeleted(ctx context.Context, analysisEntry *inst.ReplicationAnalysis, logger *log.PrefixedLogger) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { + return runEmergencyReparentOp(ctx, analysisEntry, "PrimaryTabletDeleted", true, logger) } func postErsCompletion(topologyRecovery *TopologyRecovery, analysisEntry *inst.ReplicationAnalysis, recoveryName string, promotedReplica *inst.Instance) { @@ -272,12 +278,14 @@ } // checkAndRecoverLockedSemiSyncPrimary currently takes no action for a locked semi-sync primary -func checkAndRecoverLockedSemiSyncPrimary(ctx context.Context, analysisEntry *inst.ReplicationAnalysis) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { +func checkAndRecoverLockedSemiSyncPrimary(ctx context.Context, analysisEntry *inst.ReplicationAnalysis, logger *log.PrefixedLogger) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { + logger.Warning("No actions in checkAndRecoverLockedSemiSyncPrimary") return false, nil, nil } // checkAndRecoverGenericProblem is a general-purpose recovery function -func checkAndRecoverGenericProblem(ctx context.Context, analysisEntry *inst.ReplicationAnalysis) (bool, *TopologyRecovery, error) { +func checkAndRecoverGenericProblem(ctx context.Context, analysisEntry *inst.ReplicationAnalysis, logger *log.PrefixedLogger) (bool, *TopologyRecovery, error) { + logger.Warning("No actions in checkAndRecoverGenericProblem") return false, nil, nil } @@ -367,7 +375,7 @@ func hasActionableRecovery(recoveryFunctionCode recoveryFunction) bool { // getCheckAndRecoverFunction gets the recovery function for the given code.
func getCheckAndRecoverFunction(recoveryFunctionCode recoveryFunction) ( - checkAndRecoverFunction func(ctx context.Context, analysisEntry *inst.ReplicationAnalysis) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error), + checkAndRecoverFunction func(ctx context.Context, analysisEntry *inst.ReplicationAnalysis, logger *log.PrefixedLogger) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error), ) { switch recoveryFunctionCode { case noRecoveryFunc: @@ -447,16 +455,20 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er countPendingRecoveries.Add(1) defer countPendingRecoveries.Add(-1) + logger := log.NewPrefixedLogger(fmt.Sprintf("Recovery for %s on %s/%s", analysisEntry.Analysis, analysisEntry.AnalyzedKeyspace, analysisEntry.AnalyzedShard)) + logger.Info("Starting checkAndRecover") + checkAndRecoverFunctionCode := getCheckAndRecoverFunctionCode(analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias) isActionableRecovery := hasActionableRecovery(checkAndRecoverFunctionCode) analysisEntry.IsActionableRecovery = isActionableRecovery if checkAndRecoverFunctionCode == noRecoveryFunc { + logger.Warning("No recovery strategies for problem, aborting recovery") // Unhandled problem type if analysisEntry.Analysis != inst.NoProblem { if util.ClearToLog("executeCheckAndRecoverFunction", analysisEntry.AnalyzedInstanceAlias) { - log.Warningf("executeCheckAndRecoverFunction: ignoring analysisEntry that has no action plan: %+v; tablet: %+v", - analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias) + logger.Warningf("executeCheckAndRecoverFunction: ignoring analysisEntry that has no action plan: tablet: %+v", + analysisEntry.AnalyzedInstanceAlias) } } @@ -464,24 +476,24 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er } // we have a recovery function; its execution still depends on filters if not disabled. if isActionableRecovery || util.ClearToLog("executeCheckAndRecoverFunction: detection", analysisEntry.AnalyzedInstanceAlias) { - log.Infof("executeCheckAndRecoverFunction: proceeding with %+v detection on %+v; isActionable?: %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, isActionableRecovery) + logger.Infof("executeCheckAndRecoverFunction: proceeding with %+v detection on %+v; isActionable?: %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, isActionableRecovery) } // At this point we have validated there's a failure scenario for which we have a recovery path. // Record the failure detected in the logs. err = InsertRecoveryDetection(analysisEntry) if err != nil { - log.Errorf("executeCheckAndRecoverFunction: error on inserting recovery detection record: %+v", err) + logger.Errorf("executeCheckAndRecoverFunction: error inserting recovery detection record, aborting recovery: %+v", err) return err } // Check for recovery being disabled globally if recoveryDisabledGlobally, err := IsRecoveryDisabled(); err != nil { // Unexpected. 
Shouldn't get this - log.Errorf("Unable to determine if recovery is disabled globally: %v", err) + logger.Errorf("Unable to determine if recovery is disabled globally, still attempting to recover: %v", err) } else if recoveryDisabledGlobally { - log.Infof("CheckAndRecover: Analysis: %+v, Tablet: %+v: NOT Recovering host (disabled globally)", - analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias) + logger.Infof("CheckAndRecover: Tablet: %+v: NOT Recovering host (disabled globally)", + analysisEntry.AnalyzedInstanceAlias) return err } @@ -489,6 +501,7 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er // We lock the shard here and then refresh the tablets information ctx, unlock, err := LockShard(context.Background(), analysisEntry.AnalyzedInstanceAlias, getLockAction(analysisEntry.AnalyzedInstanceAlias, analysisEntry.Analysis)) if err != nil { + logger.Errorf("Failed to lock shard, aborting recovery: %v", err) return err } defer unlock(&err) @@ -498,7 +511,7 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er // changes, we should be checking that this failure is indeed needed to be fixed. We do this after locking the shard to be sure // that the data that we use now is up-to-date. if isActionableRecovery { - log.Errorf("executeCheckAndRecoverFunction: Proceeding with %v recovery on %v validation after acquiring shard lock.", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias) + logger.Infof("executeCheckAndRecoverFunction: Proceeding with %v recovery on %v validation after acquiring shard lock.", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias) // The first step we have to do is refresh the keyspace and shard information // This is required to know if the durability policies have changed or not // If they have, then recoveries like ReplicaSemiSyncMustNotBeSet, etc won't be valid anymore. @@ -506,6 +519,7 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er // a change in the recovery we run. err = RefreshKeyspaceAndShard(analysisEntry.AnalyzedKeyspace, analysisEntry.AnalyzedShard) if err != nil { + logger.Errorf("Failed to refresh keyspace and shard, aborting recovery: %v", err) return err } // If we are about to run a cluster-wide recovery, it is imperative to first refresh all the tablets @@ -518,6 +532,7 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er } // We ignore the dead primary tablet because it is going to be unreachable. If all the other tablets aren't able to reach this tablet either, // we can proceed with the dead primary recovery. We don't need to refresh the information for this dead tablet. + logger.Info("Force refreshing all shard tablets") forceRefreshAllTabletsInShard(ctx, analysisEntry.AnalyzedKeyspace, analysisEntry.AnalyzedShard, tabletsToIgnore) } else { // If we are not running a cluster-wide recovery, then it is only concerned with the specific tablet @@ -526,66 +541,76 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er // and the host-port set on the tablet in question. // So, we only need to refresh the tablet info records (to know if the primary tablet has changed), // and the replication data of the new primary and this tablet. 
+ logger.Info("Refreshing shard tablet info") refreshTabletInfoOfShard(ctx, analysisEntry.AnalyzedKeyspace, analysisEntry.AnalyzedShard) + logger.Info("Discovering analysis instance") DiscoverInstance(analysisEntry.AnalyzedInstanceAlias, true) + logger.Info("Getting shard primary") primaryTablet, err := shardPrimary(analysisEntry.AnalyzedKeyspace, analysisEntry.AnalyzedShard) if err != nil { - log.Errorf("executeCheckAndRecoverFunction: Analysis: %+v, Tablet: %+v: error while finding the shard primary: %v", - analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, err) + logger.Errorf("executeCheckAndRecoverFunction: Tablet: %+v: error while finding the shard primary: %v", + analysisEntry.AnalyzedInstanceAlias, err) return err } primaryTabletAlias := topoproto.TabletAliasString(primaryTablet.Alias) // We can skip the refresh if we know the tablet we are looking at is the primary tablet. // This would be the case for PrimaryHasPrimary recovery. We don't need to refresh the same tablet twice. if analysisEntry.AnalyzedInstanceAlias != primaryTabletAlias { + logger.Info("Discovering primary instance") DiscoverInstance(primaryTabletAlias, true) } } alreadyFixed, err := checkIfAlreadyFixed(analysisEntry) if err != nil { - log.Errorf("executeCheckAndRecoverFunction: Analysis: %+v, Tablet: %+v: error while trying to find if the problem is already fixed: %v", - analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, err) + logger.Errorf("executeCheckAndRecoverFunction: Tablet: %+v: error while trying to find if the problem is already fixed: %v", + analysisEntry.AnalyzedInstanceAlias, err) return err } if alreadyFixed { - log.Infof("Analysis: %v on tablet %v - No longer valid, some other agent must have fixed the problem.", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias) + logger.Infof("Analysis: %v on tablet %v - No longer valid, some other agent must have fixed the problem.", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias) return nil } } // Actually attempt recovery: if isActionableRecovery || util.ClearToLog("executeCheckAndRecoverFunction: recovery", analysisEntry.AnalyzedInstanceAlias) { - log.Infof("executeCheckAndRecoverFunction: proceeding with %+v recovery on %+v; isRecoverable?: %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, isActionableRecovery) + logger.Infof("executeCheckAndRecoverFunction: proceeding with recovery on %+v; isRecoverable?: %+v", analysisEntry.AnalyzedInstanceAlias, isActionableRecovery) } - recoveryAttempted, topologyRecovery, err := getCheckAndRecoverFunction(checkAndRecoverFunctionCode)(ctx, analysisEntry) + recoveryAttempted, topologyRecovery, err := getCheckAndRecoverFunction(checkAndRecoverFunctionCode)(ctx, analysisEntry, logger) if !recoveryAttempted { + logger.Errorf("Recovery not attempted: %+v", err) return err } recoveryName := getRecoverFunctionName(checkAndRecoverFunctionCode) recoveriesCounter.Add(recoveryName, 1) if err != nil { + logger.Errorf("Failed to recover: %+v", err) recoveriesFailureCounter.Add(recoveryName, 1) } else { + logger.Info("Recovery succeeded") recoveriesSuccessfulCounter.Add(recoveryName, 1) } if topologyRecovery == nil { + logger.Error("Topology recovery is nil - recovery might have failed") return err } if b, err := json.Marshal(topologyRecovery); err == nil { - log.Infof("Topology recovery: %+v", string(b)) + logger.Infof("Topology recovery: %+v", string(b)) } else { - log.Infof("Topology recovery: %+v", topologyRecovery) + logger.Infof("Topology recovery: %+v", 
topologyRecovery) } // If we ran a cluster-wide recovery and actually attempted it, then we know that the replication state for all the tablets in this cluster // would have changed. So we can go ahead and pre-emptively refresh them. // For this refresh we don't use the same context that we used for the recovery, since that context might have expired or could expire soon // Instead we pass the background context. The call forceRefreshAllTabletsInShard handles adding a timeout to it for us. if isClusterWideRecovery(checkAndRecoverFunctionCode) { + logger.Info("Forcing refresh of all tablets post recovery") forceRefreshAllTabletsInShard(context.Background(), analysisEntry.AnalyzedKeyspace, analysisEntry.AnalyzedShard, nil) } else { // For all other recoveries, we would have changed the replication status of the analyzed tablet // so it doesn't hurt to re-read the information of this tablet, otherwise we'll requeue the same recovery // that we just completed because we would be using stale data. + logger.Infof("Force discovering problem instance %s post recovery", analysisEntry.AnalyzedInstanceAlias) DiscoverInstance(analysisEntry.AnalyzedInstanceAlias, true) } return err @@ -667,13 +692,15 @@ func postPrsCompletion(topologyRecovery *TopologyRecovery, analysisEntry *inst.R } // electNewPrimary elects a new primary when none is present. -func electNewPrimary(ctx context.Context, analysisEntry *inst.ReplicationAnalysis) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { +func electNewPrimary(ctx context.Context, analysisEntry *inst.ReplicationAnalysis, logger *log.PrefixedLogger) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { topologyRecovery, err = AttemptRecoveryRegistration(analysisEntry) if topologyRecovery == nil || err != nil { - _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another electNewPrimary.", analysisEntry.AnalyzedInstanceAlias)) + message := fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another electNewPrimary.", analysisEntry.AnalyzedInstanceAlias) + logger.Warning(message) + _ = AuditTopologyRecovery(topologyRecovery, message) return false, nil, err } - log.Infof("Analysis: %v, will elect a new primary for %v:%v", analysisEntry.Analysis, analysisEntry.ClusterDetails.Keyspace, analysisEntry.ClusterDetails.Shard) + logger.Infof("Analysis: %v, will elect a new primary for %v:%v", analysisEntry.Analysis, analysisEntry.ClusterDetails.Keyspace, analysisEntry.ClusterDetails.Shard) var promotedReplica *inst.Instance // This has to be done in the end; whether successful or not, we should mark that the recovery is done.
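// Editor's note: a minimal usage sketch (not part of the patch) of the PrefixedLogger
// that executeCheckAndRecoverFunction threads through every recovery function above.
// It assumes the vitess module context; the prefix string is illustrative.
package main

import "vitess.io/vitess/go/vt/log"

func main() {
	logger := log.NewPrefixedLogger("Recovery for DeadPrimary on commerce/-80")
	// Each call below is emitted with the prefix, so all output belonging to one
	// recovery can be grepped and correlated as a unit.
	logger.Info("Starting checkAndRecover")
	logger.Warningf("ERS - %s", "example reparent warning")
	logger.Errorf("Failed to recover: %+v", "example error")
}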
@@ -684,6 +711,7 @@ func electNewPrimary(ctx context.Context, analysisEntry *inst.ReplicationAnalysi analyzedTablet, err := inst.ReadTablet(analysisEntry.AnalyzedInstanceAlias) if err != nil { + logger.Errorf("Failed to read instance %s, aborting recovery", analysisEntry.AnalyzedInstanceAlias) return false, topologyRecovery, err } _ = AuditTopologyRecovery(topologyRecovery, "starting PlannedReparentShard for electing new primary.") @@ -694,9 +722,9 @@ func electNewPrimary(ctx context.Context, analysisEntry *inst.ReplicationAnalysi // we only log the warnings and errors explicitly, everything gets logged as an information message anyway in auditing topology recovery switch level { case logutilpb.Level_WARNING: - log.Warningf("PRS - %s", value) + logger.Warningf("PRS - %s", value) case logutilpb.Level_ERROR: - log.Errorf("PRS - %s", value) + logger.Errorf("PRS - %s", value) } _ = AuditTopologyRecovery(topologyRecovery, value) })).ReparentShard(ctx, @@ -716,13 +744,15 @@ } // fixPrimary sets the primary as read-write. -func fixPrimary(ctx context.Context, analysisEntry *inst.ReplicationAnalysis) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { +func fixPrimary(ctx context.Context, analysisEntry *inst.ReplicationAnalysis, logger *log.PrefixedLogger) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { topologyRecovery, err = AttemptRecoveryRegistration(analysisEntry) if topologyRecovery == nil { - _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another fixPrimary.", analysisEntry.AnalyzedInstanceAlias)) + message := fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another fixPrimary.", analysisEntry.AnalyzedInstanceAlias) + logger.Warning(message) + _ = AuditTopologyRecovery(topologyRecovery, message) return false, nil, err } - log.Infof("Analysis: %v, will fix primary to read-write %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias) + logger.Infof("Analysis: %v, will fix primary to read-write %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias) // This has to be done in the end; whether successful or not, we should mark that the recovery is done. // So that after the active period passes, we are able to run other recoveries. defer func() { @@ -731,12 +761,13 @@ analyzedTablet, err := inst.ReadTablet(analysisEntry.AnalyzedInstanceAlias) if err != nil { + logger.Errorf("Failed to read instance %s, aborting recovery", analysisEntry.AnalyzedInstanceAlias) return false, topologyRecovery, err } durabilityPolicy, err := inst.GetDurabilityPolicy(analyzedTablet.Keyspace) if err != nil { - log.Info("Could not read the durability policy for %v/%v", analyzedTablet.Keyspace, analyzedTablet.Shard) + logger.Infof("Could not read the durability policy for %v/%v", analyzedTablet.Keyspace, analyzedTablet.Shard) return false, topologyRecovery, err } @@ -747,13 +778,15 @@ } // fixReplica sets the replica as read-only and points it at the current primary.
-func fixReplica(ctx context.Context, analysisEntry *inst.ReplicationAnalysis) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { +func fixReplica(ctx context.Context, analysisEntry *inst.ReplicationAnalysis, logger *log.PrefixedLogger) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { topologyRecovery, err = AttemptRecoveryRegistration(analysisEntry) if topologyRecovery == nil { - _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another fixReplica.", analysisEntry.AnalyzedInstanceAlias)) + message := fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another fixReplica.", analysisEntry.AnalyzedInstanceAlias) + logger.Warning(message) + _ = AuditTopologyRecovery(topologyRecovery, message) return false, nil, err } - log.Infof("Analysis: %v, will fix replica %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias) + logger.Infof("Analysis: %v, will fix replica %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias) // This has to be done in the end; whether successful or not, we should mark that the recovery is done. // So that after the active period passes, we are able to run other recoveries. defer func() { @@ -762,24 +795,25 @@ analyzedTablet, err := inst.ReadTablet(analysisEntry.AnalyzedInstanceAlias) if err != nil { + logger.Errorf("Failed to read instance %s, aborting recovery", analysisEntry.AnalyzedInstanceAlias) return false, topologyRecovery, err } primaryTablet, err := shardPrimary(analyzedTablet.Keyspace, analyzedTablet.Shard) if err != nil { - log.Info("Could not compute primary for %v/%v", analyzedTablet.Keyspace, analyzedTablet.Shard) + logger.Infof("Could not compute primary for %v/%v", analyzedTablet.Keyspace, analyzedTablet.Shard) return false, topologyRecovery, err } durabilityPolicy, err := inst.GetDurabilityPolicy(analyzedTablet.Keyspace) if err != nil { - log.Info("Could not read the durability policy for %v/%v", analyzedTablet.Keyspace, analyzedTablet.Shard) + logger.Infof("Could not read the durability policy for %v/%v", analyzedTablet.Keyspace, analyzedTablet.Shard) return false, topologyRecovery, err } err = setReadOnly(ctx, analyzedTablet) if err != nil { - log.Info("Could not set the tablet %v to readonly - %v", analysisEntry.AnalyzedInstanceAlias, err) + logger.Infof("Could not set the tablet %v to readonly - %v", analysisEntry.AnalyzedInstanceAlias, err) return true, topologyRecovery, err } @@ -788,13 +822,15 @@ } // recoverErrantGTIDDetected changes the tablet type of a replica tablet that has errant GTIDs. -func recoverErrantGTIDDetected(ctx context.Context, analysisEntry *inst.ReplicationAnalysis) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { +func recoverErrantGTIDDetected(ctx context.Context, analysisEntry *inst.ReplicationAnalysis, logger *log.PrefixedLogger) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { topologyRecovery, err = AttemptRecoveryRegistration(analysisEntry) if topologyRecovery == nil { - _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v.
Will not issue another recoverErrantGTIDDetected.", analysisEntry.AnalyzedInstanceAlias)) + message := fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another recoverErrantGTIDDetected.", analysisEntry.AnalyzedInstanceAlias) + logger.Warning(message) + _ = AuditTopologyRecovery(topologyRecovery, message) return false, nil, err } - log.Infof("Analysis: %v, will fix tablet %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias) + logger.Infof("Analysis: %v, will fix tablet %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias) // This has to be done in the end; whether successful or not, we should mark that the recovery is done. // So that after the active period passes, we are able to run other recoveries. defer func() { @@ -808,13 +844,13 @@ primaryTablet, err := shardPrimary(analyzedTablet.Keyspace, analyzedTablet.Shard) if err != nil { - log.Info("Could not compute primary for %v/%v", analyzedTablet.Keyspace, analyzedTablet.Shard) + logger.Infof("Could not compute primary for %v/%v", analyzedTablet.Keyspace, analyzedTablet.Shard) return false, topologyRecovery, err } durabilityPolicy, err := inst.GetDurabilityPolicy(analyzedTablet.Keyspace) if err != nil { - log.Info("Could not read the durability policy for %v/%v", analyzedTablet.Keyspace, analyzedTablet.Shard) + logger.Infof("Could not read the durability policy for %v/%v", analyzedTablet.Keyspace, analyzedTablet.Shard) return false, topologyRecovery, err } diff --git a/go/vt/vtorc/logic/topology_recovery_dao.go b/go/vt/vtorc/logic/topology_recovery_dao.go index 1920da4dcd8..730e6b2a158 100644 --- a/go/vt/vtorc/logic/topology_recovery_dao.go +++ b/go/vt/vtorc/logic/topology_recovery_dao.go @@ -31,7 +31,7 @@ import ( // InsertRecoveryDetection inserts the recovery analysis that has been detected. func InsertRecoveryDetection(analysisEntry *inst.ReplicationAnalysis) error { sqlResult, err := db.ExecVTOrc(` - insert ignore + insert or ignore into recovery_detection ( alias, analysis, @@ -43,7 +43,7 @@ func InsertRecoveryDetection(analysisEntry *inst.ReplicationAnalysis) error { ?, ?, ?, - now() + datetime('now') )`, analysisEntry.AnalyzedInstanceAlias, string(analysisEntry.Analysis), @@ -66,7 +66,7 @@ func writeTopologyRecovery(topologyRecovery *TopologyRecovery) (*TopologyRecovery, error) { analysisEntry := topologyRecovery.AnalysisEntry sqlResult, err := db.ExecVTOrc(` - insert ignore + insert or ignore into topology_recovery ( recovery_id, alias, @@ -78,7 +78,7 @@ func writeTopologyRecovery(topologyRecovery *TopologyRecover ) values ( ?, ?, - NOW(), + datetime('now'), ?, ?, ?, @@ -143,7 +143,7 @@ func writeResolveRecovery(topologyRecovery *TopologyRecovery) error { is_successful = ?, successor_alias = ?, all_errors = ?, - end_recovery = NOW() + end_recovery = datetime('now') where recovery_id = ? `, topologyRecovery.IsSuccessful, @@ -237,10 +237,10 @@ func ReadRecentRecoveries(page int) ([]*TopologyRecovery, error) { // writeTopologyRecoveryStep writes down a single step in a recovery process func writeTopologyRecoveryStep(topologyRecoveryStep *TopologyRecoveryStep) error { sqlResult, err := db.ExecVTOrc(` - insert ignore + insert or ignore into topology_recovery_steps ( recovery_step_id, recovery_id, audit_at, message - ) values (?, ?, now(), ?) + ) values (?, ?, datetime('now'), ?)
`, sqlutils.NilIfZero(topologyRecoveryStep.ID), topologyRecoveryStep.RecoveryID, topologyRecoveryStep.Message, ) if err != nil { diff --git a/go/vt/vtorc/logic/topology_recovery_dao_test.go b/go/vt/vtorc/logic/topology_recovery_dao_test.go index 354af82e2b3..20dfb7e91e2 100644 --- a/go/vt/vtorc/logic/topology_recovery_dao_test.go +++ b/go/vt/vtorc/logic/topology_recovery_dao_test.go @@ -88,9 +88,9 @@ func TestExpireTableData(t *testing.T) { tableName: "recovery_detection", expectedRowCount: 2, insertQuery: `insert into recovery_detection (detection_id, detection_timestamp, alias, analysis, keyspace, shard) values -(1, NOW() - INTERVAL 3 DAY,'a','a','a','a'), -(2, NOW() - INTERVAL 5 DAY,'a','a','a','a'), -(3, NOW() - INTERVAL 15 DAY,'a','a','a','a')`, +(1, datetime('now', '-3 DAY'),'a','a','a','a'), +(2, datetime('now', '-5 DAY'),'a','a','a','a'), +(3, datetime('now', '-15 DAY'),'a','a','a','a')`, expireFunc: ExpireRecoveryDetectionHistory, }, { @@ -98,9 +98,9 @@ func TestExpireTableData(t *testing.T) { tableName: "topology_recovery", expectedRowCount: 1, insertQuery: `insert into topology_recovery (recovery_id, start_recovery, alias, analysis, keyspace, shard) values -(1, NOW() - INTERVAL 13 DAY,'a','a','a','a'), -(2, NOW() - INTERVAL 5 DAY,'a','a','a','a'), -(3, NOW() - INTERVAL 15 DAY,'a','a','a','a')`, +(1, datetime('now', '-13 DAY'),'a','a','a','a'), +(2, datetime('now', '-5 DAY'),'a','a','a','a'), +(3, datetime('now', '-15 DAY'),'a','a','a','a')`, expireFunc: ExpireTopologyRecoveryHistory, }, { @@ -108,9 +108,9 @@ func TestExpireTableData(t *testing.T) { tableName: "topology_recovery_steps", expectedRowCount: 1, insertQuery: `insert into topology_recovery_steps (recovery_step_id, audit_at, recovery_id, message) values -(1, NOW() - INTERVAL 13 DAY, 1, 'a'), -(2, NOW() - INTERVAL 5 DAY, 2, 'a'), -(3, NOW() - INTERVAL 15 DAY, 3, 'a')`, +(1, datetime('now', '-13 DAY'), 1, 'a'), +(2, datetime('now', '-5 DAY'), 2, 'a'), +(3, datetime('now', '-15 DAY'), 3, 'a')`, expireFunc: ExpireTopologyRecoveryStepsHistory, }, } diff --git a/go/vt/vtorc/logic/topology_recovery_test.go b/go/vt/vtorc/logic/topology_recovery_test.go index f7658060b95..384b3667d4f 100644 --- a/go/vt/vtorc/logic/topology_recovery_test.go +++ b/go/vt/vtorc/logic/topology_recovery_test.go @@ -20,6 +20,8 @@ import ( "context" "testing" + "vitess.io/vitess/go/vt/log" + "github.com/stretchr/testify/require" topodatapb "vitess.io/vitess/go/vt/proto/topodata" @@ -126,7 +128,7 @@ func TestElectNewPrimaryPanic(t *testing.T) { defer cancel() ts = memorytopo.NewServer(ctx, "zone1") - recoveryAttempted, _, err := electNewPrimary(context.Background(), analysisEntry) + recoveryAttempted, _, err := electNewPrimary(context.Background(), analysisEntry, log.NewPrefixedLogger("prefix")) require.True(t, recoveryAttempted) require.Error(t, err) } diff --git a/go/vt/vtorc/process/health.go b/go/vt/vtorc/process/health.go index 8e3eee7c649..99080a7f86f 100644 --- a/go/vt/vtorc/process/health.go +++ b/go/vt/vtorc/process/health.go @@ -40,7 +40,7 @@ func writeHealthToDatabase() bool { log.Error(err) return false } - sqlResult, err := db.ExecVTOrc(`insert into node_health (last_seen_active) values (now())`) + sqlResult, err := db.ExecVTOrc(`insert into node_health (last_seen_active) values (datetime('now'))`) if err != nil { log.Error(err) return false
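// Editor's note: the SQL rewrites throughout this patch follow one consistent
// MySQL-to-SQLite translation, since VTOrc's backing store is sqlite3 and the queries now
// use the native SQLite dialect. A side-by-side summary (illustrative strings only, not
// queries from the patch):
package main

import "fmt"

func main() {
	translations := [][2]string{
		{"INSERT IGNORE INTO t ...", "INSERT OR IGNORE INTO t ..."},
		{"NOW()", "datetime('now')"},
		{"NOW() - INTERVAL 3 DAY", "datetime('now', '-3 DAY')"},
	}
	for _, tr := range translations {
		fmt.Printf("%-28s -> %s\n", tr[0], tr[1])
	}
}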