Skip to content

Commit

Permalink
Retry all servers on RPC call failure
Browse files Browse the repository at this point in the history
rpcproxy is refactored into serverlist, which prioritizes good servers
over servers that are in a remote DC or that have had a failure.
  • Loading branch information
schmichael committed Sep 23, 2016
1 parent d49dda4 commit ce76aef
Show file tree
Hide file tree
Showing 9 changed files with 384 additions and 1,899 deletions.
320 changes: 189 additions & 131 deletions client/client.go

Large diffs are not rendered by default.

779 changes: 0 additions & 779 deletions client/rpcproxy/rpcproxy.go

This file was deleted.

818 changes: 0 additions & 818 deletions client/rpcproxy/rpcproxy_test.go

This file was deleted.

84 changes: 0 additions & 84 deletions client/rpcproxy/server_endpoint.go

This file was deleted.

77 changes: 0 additions & 77 deletions client/rpcproxy/server_endpoint_test.go

This file was deleted.

107 changes: 107 additions & 0 deletions client/serverlist.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
package client

import (
"math/rand"
"net"
"sort"
"strings"
"sync"
)

// serverlist is a mutex-guarded list of server endpoints. Its methods take mu
// before reading or replacing e.
type serverlist struct {
// e is the current set of known endpoints.
e endpoints
// mu guards e.
mu sync.RWMutex
}

// newServerList allocates an empty serverlist: no endpoints and an unlocked
// mutex.
func newServerList() *serverlist {
	return new(serverlist)
}

// set replaces the server list with newe. The endpoints are stored in the
// order given; no shuffling or sorting happens here (all returns a shuffled,
// priority-sorted copy).
//
// The slice is copied before it is stored so that later writes to the
// caller's slice cannot change the list without holding s.mu. The endpoint
// pointers themselves are shared with the caller. A nil newe clears the list.
func (s *serverlist) set(newe endpoints) {
	var fresh endpoints
	if newe != nil {
		fresh = make(endpoints, len(newe))
		copy(fresh, newe)
	}

	s.mu.Lock()
	s.e = fresh
	s.mu.Unlock()
}

// all returns a copy of the full server list, shuffled and then sorted by
// priority, so endpoints that share a priority come back in random order.
// The returned slice is new, but its elements are the same *endpoint values
// held by the list.
//
// The copy is filled by indexing through rand.Perm, which is a uniform
// shuffle. (Swapping out[i] with out[perm[i]] in place is not: for a
// two-element list it always restores the original order.) The shuffle and
// sort run under the read lock because Less reads endpoint priorities, which
// failed and good modify under the write lock.
func (s *serverlist) all() endpoints {
	s.mu.RLock()
	defer s.mu.RUnlock()

	out := make(endpoints, len(s.e))

	// Randomize the order.
	for dst, src := range rand.Perm(len(s.e)) {
		out[dst] = s.e[src]
	}

	// Sort by priority.
	sort.Sort(out)
	return out
}

// failed deprioritizes a server: the stored endpoint that equals e has its
// priority incremented by one. If no stored endpoint equals e, nothing
// changes.
//
// The list's own entry is updated rather than e itself, because e may be a
// distinct value that merely compares equal to the stored endpoint; bumping
// only e would leave the list untouched.
func (s *serverlist) failed(e *endpoint) {
	s.mu.Lock()
	defer s.mu.Unlock()
	for _, cur := range s.e {
		if cur.equal(e) {
			cur.priority++
			return
		}
	}
}

// good promotes a server to the highest priority: the stored endpoint that
// equals e has its priority reset to 0. If no stored endpoint equals e,
// nothing changes.
//
// The list's own entry is updated rather than e itself, because e may be a
// distinct value that merely compares equal to the stored endpoint; resetting
// only e would leave the list untouched.
func (s *serverlist) good(e *endpoint) {
	s.mu.Lock()
	defer s.mu.Unlock()
	for _, cur := range s.e {
		if cur.equal(e) {
			cur.priority = 0
			return
		}
	}
}

// Len returns the number of endpoints. Together with Less and Swap it lets an
// endpoints value be passed to sort.Sort.
func (e endpoints) Len() int {
return len(e)
}

// Less reports whether the endpoint at index i has a strictly lower priority
// value than the one at index j. Priority is the only sort key; callers are
// expected to shuffle first so that ties end up in random order.
func (e endpoints) Less(i int, j int) bool {
	left, right := e[i], e[j]
	return left.priority < right.priority
}

// Swap exchanges the endpoints at indexes i and j in place.
func (e endpoints) Swap(i int, j int) {
	held := e[i]
	e[i] = e[j]
	e[j] = held
}

// endpoints is a list of server endpoints. Its Len, Less, and Swap methods
// order it by priority, and String renders it as a comma-separated list of
// names.
type endpoints []*endpoint

// String renders the list as the endpoint names joined by commas, in list
// order. An empty list yields the empty string.
func (e endpoints) String() string {
	names := make([]string, len(e))
	for i := range e {
		names[i] = e[i].name
	}
	return strings.Join(names, ",")
}

// endpoint describes a single server: its name, its network address, and the
// priority used to order it among other endpoints.
type endpoint struct {
name string
addr net.Addr

// priority orders endpoints when a list is sorted: 0 is the highest
// priority and larger values sort later.
priority int
}

// equal returns true if the name and addr match between two endpoints.
// Priority is ignored because the same endpoint may be added by discovery and
// heartbeating with different priorities.
//
// Addresses are compared by their Network() and String() values rather than
// with ==. Comparing net.Addr interface values with == compares pointer
// identity for pointer-backed implementations such as *net.TCPAddr, so two
// separately resolved addresses for the same server would never match. Two
// nil addresses are equal; a nil address never equals a non-nil one.
func (e *endpoint) equal(o *endpoint) bool {
	if e.name != o.name {
		return false
	}
	if e.addr == nil || o.addr == nil {
		return e.addr == nil && o.addr == nil
	}
	return e.addr.Network() == o.addr.Network() && e.addr.String() == o.addr.String()
}
80 changes: 80 additions & 0 deletions client/serverlist_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
package client

import "testing"

// TestServerList walks a serverlist through its lifecycle: an empty list, a
// populated list returned in priority order, reshuffling between calls, and
// marking endpoints.
//
// NOTE(review): this test calls s.get and s.mark, which are not among the
// serverlist methods defined in serverlist.go (set, all, failed, good) —
// confirm they exist elsewhere in the package, or port these assertions to
// failed/good.
func TestServerList(t *testing.T) {
s := newServerList()

// New lists should be empty
if e := s.get(); e != nil {
t.Fatalf("expected empty list to return nil, but received: %v", e)
}
if e := s.all(); len(e) != 0 {
t.Fatalf("expected empty list to return an empty list, but received: %+q", e)
}

// mklist builds a fresh, deliberately unsorted list on every call: one
// priority-0 endpoint ("a"), five priority-1 endpoints, and two priority-2
// endpoints. All addrs are nil.
mklist := func() endpoints {
return endpoints{
&endpoint{"b", nil, 1},
&endpoint{"c", nil, 1},
&endpoint{"g", nil, 2},
&endpoint{"d", nil, 1},
&endpoint{"e", nil, 1},
&endpoint{"f", nil, 1},
&endpoint{"h", nil, 2},
&endpoint{"a", nil, 0},
}
}
s.set(mklist())

// orig is only used as a reference for the expected length.
orig := mklist()
all := s.all()
if len(all) != len(orig) {
t.Fatalf("expected %d endpoints but only have %d", len(orig), len(all))
}

// Assert list is properly randomized+sorted: only the priority at each
// position is checked, since order within a priority is random.
for i, pri := range []int{0, 1, 1, 1, 1, 1, 2, 2} {
if all[i].priority != pri {
t.Errorf("expected endpoint %d (%+q) to be priority %d", i, all[i], pri)
}
}

// Subsequent sets should reshuffle (try multiple times as they may
// shuffle in the same order)
tries := 0
max := 3
for ; tries < max; tries++ {
s.set(mklist())
// First entry should always be the same: "a" is the only priority-0
// endpoint, so it sorts first regardless of the shuffle.
if e := s.get(); *e != *all[0] {
t.Fatalf("on try %d get returned the wrong endpoint: %+q", tries, e)
}

all2 := s.all()
if all.String() == all2.String() {
// eek, matched; try again in case we just got unlucky
continue
}
break
}
// The loop only reaches max when every attempt produced the same order.
if tries == max {
t.Fatalf("after %d attempts servers were still not random reshuffled", tries)
}

// Mark should rotate list items in place
s.mark(&endpoint{"a", nil, 0})
all3 := s.all()
if s.get().name == "a" || all3[len(all3)-1].name != "a" {
t.Fatalf("endpoint a shold have been rotated to end")
}
if len(all3) != len(all) {
t.Fatalf("marking should not have changed list length")
}

// Marking a nonexistent endpoint should do nothing
s.mark(&endpoint{})
if s.all().String() != all3.String() {
t.Fatalf("marking a non-existant endpoint alterd the list")
}
}
13 changes: 6 additions & 7 deletions command/agent/agent_endpoint.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ func (s *HTTPServer) listServers(resp http.ResponseWriter, req *http.Request) (i
return nil, CodedError(501, ErrInvalidMethod)
}

peers := s.agent.client.RPCProxy().ServerRPCAddrs()
peers := s.agent.client.GetServers()
return peers, nil
}

Expand All @@ -156,12 +156,11 @@ func (s *HTTPServer) updateServers(resp http.ResponseWriter, req *http.Request)
}

// Set the servers list into the client
for _, server := range servers {
s.agent.logger.Printf("[TRACE] Adding server %s to the client's primary server list", server)
se := client.AddPrimaryServerToRPCProxy(server)
if se == nil {
s.agent.logger.Printf("[ERR] Attempt to add server %q to client failed", server)
}
s.agent.logger.Printf("[TRACE] Adding servers %+q to the client's primary server list", servers)
if err := client.SetServers(servers); err != nil {
s.agent.logger.Printf("[ERR] Attempt to add servers %q to client failed: %v", servers, err)
//TODO is this the right error to return?
return nil, CodedError(400, err.Error())
}
return nil, nil
}
Expand Down
5 changes: 2 additions & 3 deletions nomad/pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ import (

"github.com/hashicorp/consul/tlsutil"
"github.com/hashicorp/net-rpc-msgpackrpc"
"github.com/hashicorp/nomad/client/rpcproxy"
"github.com/hashicorp/yamux"
)

Expand Down Expand Up @@ -376,9 +375,9 @@ func (p *ConnPool) RPC(region string, addr net.Addr, version int, method string,

// PingNomadServer sends a Status.Ping message to the specified server and
// returns true if healthy, false if an error occurred
func (p *ConnPool) PingNomadServer(region string, apiMajorVersion int, s *rpcproxy.ServerEndpoint) (bool, error) {
func (p *ConnPool) PingNomadServer(region string, apiMajorVersion int, s net.Addr) (bool, error) {
// Get a usable client
conn, sc, err := p.getClient(region, s.Addr, apiMajorVersion)
conn, sc, err := p.getClient(region, s, apiMajorVersion)
if err != nil {
return false, err
}
Expand Down

0 comments on commit ce76aef

Please sign in to comment.