Skip to content

Commit

Permalink
rpc: retry server busy (#3758)
Browse files Browse the repository at this point in the history
Add RPC retry with exponential back-off in case of quic `server busy`
error.
Reduce synchronous part of accept loop.

With the bump to quic-go v0.15.5 (#3732) the quic library now enforces a
max accept queue length. With our approach of every RPC is a new quic
connection, and some work being done synchronously in the accept loop
this bites on CI heavily.

With the exponential back-off and immediately spawning a go routine, we
can improve failure rate significantly.

But this should not be the final solution. Applications need to be
able to deal with requests failing. Currently, we only do one shot
testing, which might be too strict.

For investigation results, see the buildkite builds associated with
the draft PR #3750.
  • Loading branch information
oncilla authored May 11, 2020
1 parent d990b00 commit 778d62d
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 23 deletions.
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ require (
github.com/iancoleman/strcase v0.0.0-20190422225806-e506e3ef7365
github.com/inconshreveable/log15 v0.0.0-20161013181240-944cbfb97b44
github.com/kormat/fmt15 v0.0.0-20181112140556-ee69fecb2656
github.com/lucas-clemente/quic-go v0.15.5
github.com/lucas-clemente/quic-go v0.15.7
github.com/mattn/go-colorable v0.1.4 // indirect
github.com/mattn/go-isatty v0.0.8
github.com/mattn/go-sqlite3 v1.9.1-0.20180719091609-b3511bfdd742
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,8 @@ github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
github.com/lucas-clemente/quic-go v0.15.5 h1:2DR5qCVt4k1qnDmknf+idj1FDJIrkiDyS6a0uIR+UJY=
github.com/lucas-clemente/quic-go v0.15.5/go.mod h1:Myi1OyS0FOjL3not4BxT7KN29bRkcMUV5JVVFLKtDp8=
github.com/lucas-clemente/quic-go v0.15.7 h1:Pu7To5/G9JoP1mwlrcIvfV8ByPBlCzif3MCl8+1W83I=
github.com/lucas-clemente/quic-go v0.15.7/go.mod h1:Myi1OyS0FOjL3not4BxT7KN29bRkcMUV5JVVFLKtDp8=
github.com/lunixbochs/vtclean v1.0.0/go.mod h1:pHhQNgMf3btfWnGBVipUOjRYhoOsdGqdm/+2c2E2WMI=
github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ=
github.com/mailru/easyjson v0.0.0-20190312143242-1de009706dbe/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
Expand Down
14 changes: 7 additions & 7 deletions go/lib/infra/messenger/tcp/messenger.go
Original file line number Diff line number Diff line change
Expand Up @@ -203,9 +203,12 @@ func (m *Messenger) ListenAndServe() {
log.Error("[tcp-msgr] Listen error", "err", err)
return
}
if err := m.handleConn(conn); err != nil {
log.Warn("[tcp-msgr] Server handler exited with error", "err", err)
}
go func() {
defer log.HandlePanic()
if err := m.handleConn(conn); err != nil {
log.Warn("[tcp-msgr] Server handler exited with error", "err", err)
}
}()
}
}

Expand All @@ -223,10 +226,7 @@ func (m *Messenger) handleConn(conn net.Conn) error {
Message: msg,
Address: conn.RemoteAddr(),
}
go func() {
defer log.HandlePanic()
m.Handler.ServeRPC(rw, request)
}()
m.Handler.ServeRPC(rw, request)
return nil
}

Expand Down
36 changes: 25 additions & 11 deletions go/lib/infra/rpc/rpc.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,11 @@ import (
"crypto/tls"
"fmt"
"io"
mrand "math/rand"
"net"
"strings"
"sync"
"time"

quic "github.com/lucas-clemente/quic-go"
capnp "zombiezen.com/go/capnproto2"
Expand Down Expand Up @@ -73,9 +75,12 @@ func (s *Server) ListenAndServe() error {
log.Warn("[quic] server accept error", "err", err)
continue
}
if err := s.handleQUICSession(session); err != nil {
log.Warn("[quic] server handler exited with error", "err", err)
}
go func() {
defer log.HandlePanic()
if err := s.handleQUICSession(session); err != nil {
log.Warn("[quic] server handler exited with error", "err", err)
}
}()
}
}

Expand Down Expand Up @@ -120,10 +125,7 @@ func (s *Server) handleQUICSession(session quic.Session) error {
Message: msg,
Address: session.RemoteAddr(),
}
go func() {
defer log.HandlePanic()
s.Handler.ServeRPC(rw, request)
}()
s.Handler.ServeRPC(rw, request)
return nil
}

Expand All @@ -143,10 +145,22 @@ type Client struct {
func (c *Client) Request(ctx context.Context, request *Request, address net.Addr) (*Reply, error) {
addressStr := computeAddressStr(address)

session, err := quic.DialContext(ctx, c.Conn, address, addressStr,
c.TLSConfig, c.QUICConfig)
if err != nil {
return nil, err
var session quic.Session
for sleep := 2 * time.Millisecond; ctx.Err() == nil; sleep = sleep * 2 {
var err error
session, err = quic.DialContext(ctx, c.Conn, address, addressStr, c.TLSConfig, c.QUICConfig)
if err == nil {
break
}
// Unfortunately there is no better way to check the error.
// https://github.com/lucas-clemente/quic-go/issues/2441
if err.Error() != "SERVER_BUSY" {
return nil, err
}
select {
case <-time.After(sleep + time.Duration(mrand.Int()%5000)*time.Microsecond):
case <-ctx.Done():
}
}

stream, err := session.OpenStream()
Expand Down
4 changes: 2 additions & 2 deletions go_deps.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -479,8 +479,8 @@ def go_deps():
go_repository(
name = "com_github_lucas_clemente_quic_go",
importpath = "github.com/lucas-clemente/quic-go",
sum = "h1:2DR5qCVt4k1qnDmknf+idj1FDJIrkiDyS6a0uIR+UJY=",
version = "v0.15.5",
sum = "h1:Pu7To5/G9JoP1mwlrcIvfV8ByPBlCzif3MCl8+1W83I=",
version = "v0.15.7",
)
go_repository(
name = "com_github_lunixbochs_vtclean",
Expand Down

0 comments on commit 778d62d

Please sign in to comment.