Skip to content

Commit

Permalink
Avoid failover delays in twemproxy reconfig when using master targets
Browse files Browse the repository at this point in the history
It seems that even though the "sentinel master <shard_name>" command
returns updated information about the master as soon as a slave is
promoted, there is an exeption with the sentinel instance that acts as
"failover leader". The failover leader is the instance that actually
performs the shard reconfigurations, and in this case, its "sentinel
master <shard_name>" command only returns updated information when all
the slaves have been reconfigured to point to the new master. This
causes delays in twemproxy reconfiguration if this is the instance being
used to "discover" the shard.
To detect this situation, add a step that checks the master address with
the command "sentinel get-master-addr-by-name <shard_name>". This
command always returns updated master information, even if we are
querying the leader sentinel instance.
  • Loading branch information
roivaz committed Oct 17, 2023
1 parent aad968e commit d9cab77
Show file tree
Hide file tree
Showing 12 changed files with 338 additions and 89 deletions.
3 changes: 2 additions & 1 deletion controllers/sentinel_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,8 @@ func (r *SentinelReconciler) reconcileStatus(ctx context.Context, instance *saas
// and rely on reconciles triggered by sentinel events to correct the situation.
masterError := &sharded.DiscoveryError_Master_SingleServerFailure{}
slaveError := &sharded.DiscoveryError_Slave_SingleServerFailure{}
if errors.As(merr, masterError) || errors.As(merr, slaveError) {
failoverInProgressError := &sharded.DiscoveryError_Slave_FailoverInProgress{}
if errors.As(merr, masterError) || errors.As(merr, slaveError) || errors.As(merr, failoverInProgressError) {
log.Error(merr, "DiscoveryError")
}

Expand Down
6 changes: 3 additions & 3 deletions examples/backend/redis-v6/redis.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,15 @@ metadata:
name: shard01
spec:
image:
tag: 6.2.7-alpine
tag: 6.2.13-alpine
---
apiVersion: saas.3scale.net/v1alpha1
kind: RedisShard
metadata:
name: shard02
spec:
image:
tag: 6.2.7-alpine
tag: 6.2.13-alpine

---
apiVersion: saas.3scale.net/v1alpha1
Expand All @@ -22,4 +22,4 @@ metadata:
spec:
slaveCount: 0
image:
tag: 6.2.7-alpine
tag: 6.2.13-alpine
2 changes: 1 addition & 1 deletion examples/backend/redis-v6/sentinel.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ metadata:
spec:
replicas: 3
image:
tag: 6.2.7-debian-11-r17
tag: 6.2.13-debian-11-r76
config:
monitoredShards:
# DNS should not be used in production. DNS is
Expand Down
56 changes: 56 additions & 0 deletions pkg/generators/twemproxyconfig/generator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,13 +92,27 @@ func TestNewGenerator(t *testing.T) {
},
InjectError: func() error { return nil },
},
redis_client.FakeResponse{
// cmd: SentinelGetMasterAddrByName (shard0)
InjectResponse: func() interface{} {
return []string{"127.0.0.1", "1000"}
},
InjectError: func() error { return nil },
},
redis_client.FakeResponse{
// cmd: SentinelMaster (shard1)
InjectResponse: func() interface{} {
return &redis_client.SentinelMasterCmdResult{Name: "shard1", IP: "127.0.0.1", Port: 5000, Flags: "master"}
},
InjectError: func() error { return nil },
},
redis_client.FakeResponse{
// cmd: SentinelGetMasterAddrByName (shard1)
InjectResponse: func() interface{} {
return []string{"127.0.0.1", "5000"}
},
InjectError: func() error { return nil },
},
),
),
log: logr.Discard(),
Expand Down Expand Up @@ -258,6 +272,13 @@ func TestNewGenerator(t *testing.T) {
},
InjectError: func() error { return nil },
},
redis_client.FakeResponse{
// cmd: SentinelGetMasterAddrByName (shard0)
InjectResponse: func() interface{} {
return []string{"127.0.0.1", "1000"}
},
InjectError: func() error { return nil },
},
redis_client.FakeResponse{
// cmd: SentinelSlaves (shard0)
InjectResponse: func() interface{} {
Expand Down Expand Up @@ -285,6 +306,13 @@ func TestNewGenerator(t *testing.T) {
},
InjectError: func() error { return nil },
},
redis_client.FakeResponse{
// cmd: SentinelGetMasterAddrByName (shard1)
InjectResponse: func() interface{} {
return []string{"127.0.0.1", "5000"}
},
InjectError: func() error { return nil },
},
redis_client.FakeResponse{
// cmd: SentinelSlaves (shard1)
InjectResponse: func() interface{} {
Expand Down Expand Up @@ -428,6 +456,13 @@ func TestNewGenerator(t *testing.T) {
},
InjectError: func() error { return nil },
},
redis_client.FakeResponse{
// cmd: SentinelGetMasterAddrByName (shard0)
InjectResponse: func() interface{} {
return []string{"127.0.0.1", "1000"}
},
InjectError: func() error { return nil },
},
redis_client.FakeResponse{
// cmd: SentinelSlaves (shard0)
InjectResponse: func() interface{} {
Expand Down Expand Up @@ -455,6 +490,13 @@ func TestNewGenerator(t *testing.T) {
},
InjectError: func() error { return nil },
},
redis_client.FakeResponse{
// cmd: SentinelGetMasterAddrByName (shard1)
InjectResponse: func() interface{} {
return []string{"127.0.0.1", "5000"}
},
InjectError: func() error { return nil },
},
redis_client.FakeResponse{
// cmd: SentinelSlaves (shard1)
InjectResponse: func() interface{} {
Expand Down Expand Up @@ -593,13 +635,27 @@ func TestNewGenerator(t *testing.T) {
},
InjectError: func() error { return nil },
},
redis_client.FakeResponse{
// cmd: SentinelGetMasterAddrByName (shard0)
InjectResponse: func() interface{} {
return []string{"127.0.0.1", "1000"}
},
InjectError: func() error { return nil },
},
redis_client.FakeResponse{
// cmd: SentinelMaster (shard1)
InjectResponse: func() interface{} {
return &redis_client.SentinelMasterCmdResult{Name: "shard1", IP: "127.0.0.1", Port: 5000, Flags: "master"}
},
InjectError: func() error { return nil },
},
redis_client.FakeResponse{
// cmd: SentinelGetMasterAddrByName (shard1)
InjectResponse: func() interface{} {
return []string{"127.0.0.1", "5000"}
},
InjectError: func() error { return nil },
},
redis_client.FakeResponse{
// cmd: SentinelSlaves (shard1)
InjectResponse: func() interface{} {
Expand Down
5 changes: 5 additions & 0 deletions pkg/redis/client/fake_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,11 @@ func (fc *FakeClient) SentinelMaster(ctx context.Context, shard string) (*Sentin
return rsp.InjectResponse().(*SentinelMasterCmdResult), rsp.InjectError()
}

func (fc *FakeClient) SentinelGetMasterAddrByName(ctx context.Context, shard string) ([]string, error) {
rsp := fc.pop()
return rsp.InjectResponse().([]string), rsp.InjectError()
}

func (fc *FakeClient) SentinelMasters(ctx context.Context) ([]interface{}, error) {
rsp := fc.pop()
return rsp.InjectResponse().([]interface{}), rsp.InjectError()
Expand Down
6 changes: 6 additions & 0 deletions pkg/redis/client/goredis_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,12 @@ func (c *GoRedisClient) SentinelMaster(ctx context.Context, shard string) (*Sent
return result, err
}

func (c *GoRedisClient) SentinelGetMasterAddrByName(ctx context.Context, shard string) ([]string, error) {

values, err := c.sentinel.GetMasterAddrByName(ctx, shard).Result()
return values, err
}

func (c *GoRedisClient) SentinelMasters(ctx context.Context) ([]interface{}, error) {

values, err := c.sentinel.Masters(ctx).Result()
Expand Down
1 change: 1 addition & 0 deletions pkg/redis/client/interface.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
type TestableInterface interface {
SentinelMaster(context.Context, string) (*SentinelMasterCmdResult, error)
SentinelMasters(context.Context) ([]interface{}, error)
SentinelGetMasterAddrByName(ctx context.Context, shard string) ([]string, error)
SentinelSlaves(context.Context, string) ([]interface{}, error)
SentinelMonitor(context.Context, string, string, string, int) error
SentinelSet(context.Context, string, string, string) error
Expand Down
14 changes: 14 additions & 0 deletions pkg/redis/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"bufio"
"context"
"net"
"strconv"
"strings"
"sync"
"time"
Expand Down Expand Up @@ -109,6 +110,19 @@ func (srv *Server) SentinelMaster(ctx context.Context, shard string) (*client.Se
return result, nil
}

func (srv *Server) SentinelGetMasterAddrByName(ctx context.Context, shard string) (string, int, error) {

values, err := srv.client.SentinelGetMasterAddrByName(ctx, shard)
if err != nil {
return "", 0, err
}
port, err := strconv.Atoi(values[1])
if err != nil {
return "", 0, err
}
return values[0], port, nil
}

func (srv *Server) SentinelMasters(ctx context.Context) ([]client.SentinelMasterCmdResult, error) {

values, err := srv.client.SentinelMasters(ctx)
Expand Down
1 change: 1 addition & 0 deletions pkg/redis/sharded/discover.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,5 +105,6 @@ func (srv *RedisServer) Discover(ctx context.Context, opts ...DiscoveryOption) e
// Discovery errors
type DiscoveryError_Sentinel_Failure struct{ error }
type DiscoveryError_Master_SingleServerFailure struct{ error }
type DiscoveryError_Slave_FailoverInProgress struct{ error }
type DiscoveryError_Slave_SingleServerFailure struct{ error }
type DiscoveryError_UnknownRole_SingleServerFailure struct{ error }
23 changes: 23 additions & 0 deletions pkg/redis/sharded/redis_shard.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@ package sharded
import (
"context"
"fmt"
"net"
"sort"
"strconv"
"strings"

"github.com/3scale/saas-operator/pkg/redis/client"
Expand Down Expand Up @@ -79,6 +81,22 @@ func (shard *Shard) Discover(ctx context.Context, sentinel *SentinelServer, opti
return append(merr, DiscoveryError_Sentinel_Failure{err})
}

// If a failover is currently in progress, only the GetMasterAddrByName command returns the
// updated address of the master. The other commands ('sentinel masters' and 'sentinel master <shard>)
// wait until all the replicas are also reconfigured to start announcing the new IP, which can cause delays
// to reconfigure the clients.
failoverInProgress := false
masterIP, masterPort, err := sentinel.SentinelGetMasterAddrByName(ctx, shard.Name)
if err != nil {
return append(merr, DiscoveryError_Sentinel_Failure{err})
}
if masterIP != sentinelMasterResult.IP || masterPort != sentinelMasterResult.Port {
failoverInProgress = true
logger.Info("failover in progress", "oldMaster", net.JoinHostPort(sentinelMasterResult.IP, strconv.Itoa(sentinelMasterResult.Port)), "newMaster", net.JoinHostPort(masterIP, strconv.Itoa(masterPort)))
sentinelMasterResult.IP = masterIP
sentinelMasterResult.Port = masterPort
}

// Get the corresponding server or add a new one if not found
srv, err := shard.GetServerByID(fmt.Sprintf("%s:%d", sentinelMasterResult.IP, sentinelMasterResult.Port))
if err != nil {
Expand Down Expand Up @@ -108,6 +126,11 @@ func (shard *Shard) Discover(ctx context.Context, sentinel *SentinelServer, opti
return merr.ErrorOrNil()
}

// don't discover slaves if a failover is in progress
if failoverInProgress {
return append(merr, DiscoveryError_Slave_FailoverInProgress{fmt.Errorf("refusing to discover slaves while a failover is in progress")})
}

// discover slaves
sentinelSlavesResult, err := sentinel.SentinelSlaves(ctx, shard.Name)
if err != nil {
Expand Down
Loading

0 comments on commit d9cab77

Please sign in to comment.