diff --git a/go/vt/discovery/healthcheck.go b/go/vt/discovery/healthcheck.go index c724ddb2870..5ab5823c5ad 100644 --- a/go/vt/discovery/healthcheck.go +++ b/go/vt/discovery/healthcheck.go @@ -397,8 +397,29 @@ func (hc *HealthCheckImpl) deleteTablet(tablet *topodata.Tablet) { hc.mu.Lock() defer hc.mu.Unlock() - key := KeyFromTablet(tablet) tabletAlias := tabletAliasString(topoproto.TabletAliasString(tablet.Alias)) + defer func() { + // We want to be sure the tablet is gone from the secondary + // maps even if it's already gone from the authoritative map. + // The tablet's type also may have recently changed as well, + // so ensure that the tablet we're removing is removed from + // any possible secondary map keys: + // key: keyspace.shard.tabletType -> val: map[tabletAlias]tabletHealth + for _, tabletType := range topoproto.AllTabletTypes { + key := KeyspaceShardTabletType(fmt.Sprintf("%s.%s.%s", tablet.Keyspace, tablet.Shard, topoproto.TabletTypeLString(tabletType))) + // delete from map by keyspace.shard.tabletType + ths, ok := hc.healthData[key] + if !ok { + continue + } + delete(ths, tabletAlias) + // delete from healthy list + healthy, ok := hc.healthy[key] + if ok && len(healthy) > 0 { + hc.recomputeHealthy(key) + } + } + }() // delete from authoritative map th, ok := hc.healthByAlias[tabletAlias] if !ok { @@ -409,18 +430,6 @@ func (hc *HealthCheckImpl) deleteTablet(tablet *topodata.Tablet) { // which will call finalizeConn, which will close the connection. th.cancelFunc() delete(hc.healthByAlias, tabletAlias) - // delete from map by keyspace.shard.tabletType - ths, ok := hc.healthData[key] - if !ok { - log.Warningf("We have no health data for target: %v", key) - return - } - delete(ths, tabletAlias) - // delete from healthy list - healthy, ok := hc.healthy[key] - if ok && len(healthy) > 0 { - hc.recomputeHealthy(key) - } } func (hc *HealthCheckImpl) updateHealth(th *TabletHealth, prevTarget *query.Target, trivialUpdate bool, up bool) { diff --git a/go/vt/discovery/healthcheck_test.go b/go/vt/discovery/healthcheck_test.go index d2f6d25eaf9..bbe3e4b8d8c 100644 --- a/go/vt/discovery/healthcheck_test.go +++ b/go/vt/discovery/healthcheck_test.go @@ -681,7 +681,7 @@ func TestRemoveTablet(t *testing.T) { // there will be a first result, get and discard it <-resultChan - shr := &querypb.StreamHealthResponse{ + shrReplica := &querypb.StreamHealthResponse{ TabletAlias: tablet.Alias, Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, Serving: true, @@ -695,7 +695,7 @@ func TestRemoveTablet(t *testing.T) { Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2}, PrimaryTermStartTime: 0, }} - input <- shr + input <- shrReplica <-resultChan // check it's there a := hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}) @@ -705,6 +705,59 @@ func TestRemoveTablet(t *testing.T) { hc.RemoveTablet(tablet) a = hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}) assert.Empty(t, a, "wrong result, expected empty list") + + // Now confirm that when a tablet's type changes between when it's added to the cache + // and when it's removed, that the tablet is entirely removed from the cache since + // in the secondary maps it's keyed in part by tablet type. + // Note: we are using GetTabletStats here to check the healthData map (rather than + // the healthy map that we checked above) because that is the data structure that + // is used when printing the contents of the healthcheck cache in the /debug/status + // endpoint and in the SHOW VITESS_TABLETS; SQL command output. + + // Add the tablet back. + hc.AddTablet(tablet) + // Receive and discard the initial result. + <-resultChan + input <- shrReplica + // Confirm it's there in the cache. + a = hc.getTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}) + mustMatch(t, want, a, "unexpected result") + // Change the tablet type to RDONLY. + tablet.Type = topodatapb.TabletType_RDONLY + shrRdonly := &querypb.StreamHealthResponse{ + TabletAlias: tablet.Alias, + Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_RDONLY}, + Serving: true, + TabletExternallyReparentedTimestamp: 0, + RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 2, CpuUsage: 0.4}, + } + // Now replace it, which does a Remove and Add. The tablet should + // be removed from the cache and all its maps even though the + // tablet type had changed in-between the initial Add and Remove. + hc.ReplaceTablet(tablet, tablet) + // Confirm that the old entry is gone. + a = hc.getTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}) + assert.Empty(t, a, "wrong result, expected empty list") + // Receive and discard the initial result. + <-resultChan + input <- shrRdonly + // Confirm that the new entry is there in the cache. + want = []*TabletHealth{{ + Tablet: tablet, + Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_RDONLY}, + Serving: true, + Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 2, CpuUsage: 0.4}, + PrimaryTermStartTime: 0, + }} + a = hc.getTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_RDONLY}) + mustMatch(t, want, a, "unexpected result") + // Delete the tablet, confirm again that it's gone in both + // tablet type forms. + hc.RemoveTablet(tablet) + a = hc.getTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}) + assert.Empty(t, a, "wrong result, expected empty list") + a = hc.getTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_RDONLY}) + assert.Empty(t, a, "wrong result, expected empty list") } // TestGetHealthyTablets tests the functionality of GetHealthyTabletStats.