Skip to content
This repository has been archived by the owner on May 26, 2022. It is now read-only.

Commit

Permalink
Merge pull request #121 from libp2p/feat/improve-errors
Browse files Browse the repository at this point in the history
dial: return a nice custom dial error
  • Loading branch information
Stebalien authored Apr 25, 2019
2 parents 94a49f1 + 755d063 commit 0fc0136
Show file tree
Hide file tree
Showing 5 changed files with 121 additions and 55 deletions.
69 changes: 69 additions & 0 deletions dial_error.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
package swarm

import (
"fmt"
"strings"

peer "github.com/libp2p/go-libp2p-peer"
ma "github.com/multiformats/go-multiaddr"
)

// maxDialDialErrors is the maximum number of dial errors we record
const maxDialDialErrors = 16

// DialError is the error type returned when dialing.
type DialError struct {
Peer peer.ID
DialErrors []TransportError
Cause error
Skipped int
}

func (e *DialError) recordErr(addr ma.Multiaddr, err error) {
if len(e.DialErrors) >= maxDialDialErrors {
e.Skipped++
return
}
e.DialErrors = append(e.DialErrors, TransportError{
Address: addr,
Cause: err,
})
}

func (e *DialError) Error() string {
var builder strings.Builder
fmt.Fprintf(&builder, "failed to dial %s:", e.Peer)
if e.Cause != nil {
fmt.Fprintf(&builder, " %s", e.Cause)
}
for _, te := range e.DialErrors {
fmt.Fprintf(&builder, "\n * [%s] %s", te.Address, te.Cause)
}
if e.Skipped > 0 {
fmt.Fprintf(&builder, "\n ... skipping %d errors ...", e.Skipped)
}
return builder.String()
}

// Unwrap implements https://godoc.org/golang.org/x/xerrors#Wrapper.
func (e *DialError) Unwrap() error {
// If we have a context error, that's the "ultimate" error.
if e.Cause != nil {
return e.Cause
}
return nil
}

var _ error = (*DialError)(nil)

// TransportError is the error returned when dialing a specific address.
type TransportError struct {
Address ma.Multiaddr
Cause error
}

func (e *TransportError) Error() string {
return fmt.Sprintf("failed to dial %s: %s", e.Address, e.Cause)
}

var _ error = (*TransportError)(nil)
28 changes: 10 additions & 18 deletions dial_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,7 @@ package swarm_test

import (
"context"
"fmt"
"net"
"regexp"
"sync"
"testing"
"time"
Expand Down Expand Up @@ -491,8 +489,8 @@ func TestDialPeerFailed(t *testing.T) {
defer closeSwarms(swarms)
testedSwarm, targetSwarm := swarms[0], swarms[1]

exceptedErrorsCount := 5
for i := 0; i < exceptedErrorsCount; i++ {
expectedErrorsCount := 5
for i := 0; i < expectedErrorsCount; i++ {
_, silentPeerAddress, silentPeerListener := newSilentPeer(t)
go acceptAndHang(silentPeerListener)
defer silentPeerListener.Close()
Expand All @@ -508,23 +506,17 @@ func TestDialPeerFailed(t *testing.T) {
t.Fatal(err)
}

// dial_test.go:508: correctly get a combined error: dial attempt failed: 10 errors occurred:
// * <peer.ID Qm*Wpwtvc> --> <peer.ID Qm*cc2FQR> (/ip4/127.0.0.1/tcp/46485) dial attempt failed: failed to negotiate security protocol: context deadline exceeded
// * <peer.ID Qm*Wpwtvc> --> <peer.ID Qm*cc2FQR> (/ip4/127.0.0.1/tcp/34881) dial attempt failed: failed to negotiate security protocol: context deadline exceeded
// dial_test.go:508: correctly get a combined error: failed to dial PEER: all dials failed
// * [/ip4/127.0.0.1/tcp/46485] failed to negotiate security protocol: context deadline exceeded
// * [/ip4/127.0.0.1/tcp/34881] failed to negotiate security protocol: context deadline exceeded
// ...

errorCountRegexpString := fmt.Sprintf("%d errors occurred", exceptedErrorsCount)
errorCountRegexp := regexp.MustCompile(errorCountRegexpString)
if !errorCountRegexp.MatchString(err.Error()) {
t.Fatalf("can't find total err count: `%s' in `%s'", errorCountRegexpString, err.Error())
dialErr, ok := err.(*DialError)
if !ok {
t.Fatalf("expected *DialError, got %T", err)
}

connectErrorsRegexpString := `\* <peer\.ID .+?> --> <peer\.ID .+?> \(.+?\) dial attempt failed:.+`
connectErrorsRegexp := regexp.MustCompile(connectErrorsRegexpString)
connectErrors := connectErrorsRegexp.FindAll([]byte(err.Error()), -1)
if len(connectErrors) != exceptedErrorsCount {
t.Fatalf("connectErrors must contain %d errros; "+
"but `%s' was found in `%s' %d times",
exceptedErrorsCount, connectErrorsRegexpString, err.Error(), len(connectErrors))
if len(dialErr.DialErrors) != expectedErrorsCount {
t.Errorf("expected %d errors, got %d", expectedErrorsCount, len(dialErr.DialErrors))
}
}
1 change: 0 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
module github.com/libp2p/go-libp2p-swarm

require (
github.com/hashicorp/go-multierror v1.0.0
github.com/ipfs/go-log v0.0.1
github.com/jbenet/goprocess v0.0.0-20160826012719-b497e2f366b8
github.com/libp2p/go-addr-util v0.0.1
Expand Down
4 changes: 0 additions & 4 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,6 @@ github.com/gxed/hashland/keccakpg v0.0.1 h1:wrk3uMNaMxbXiHibbPO4S0ymqJMm41WiudyF
github.com/gxed/hashland/keccakpg v0.0.1/go.mod h1:kRzw3HkwxFU1mpmPP8v1WyQzwdGfmKFJ6tItnhQ67kU=
github.com/gxed/hashland/murmur3 v0.0.1 h1:SheiaIt0sda5K+8FLz952/1iWS9zrnKsEJaOJu4ZbSc=
github.com/gxed/hashland/murmur3 v0.0.1/go.mod h1:KjXop02n4/ckmZSnY2+HKcLud/tcmvhST0bie/0lS48=
github.com/hashicorp/errwrap v1.0.0 h1:hLrqtEDnRye3+sgx6z4qVLNuviH3MR5aQ0ykNJa/UYA=
github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=
github.com/hashicorp/go-multierror v1.0.0 h1:iVjPR7a6H0tWELX5NxNe7bYopibicUzc7uPribsnS6o=
github.com/hashicorp/go-multierror v1.0.0/go.mod h1:dHtQlpGsu+cZNNAkkCN/P3hoUDHhCYQXV3UM06sGGrk=
github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
github.com/hpcloud/tail v1.0.0 h1:nfCOvKYfkgYP8hkirhJocXT2+zOD8yUNjXaWfTlyFKI=
github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
Expand Down
74 changes: 42 additions & 32 deletions swarm_dial.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@ import (
"sync"
"time"

"github.com/hashicorp/go-multierror"

logging "github.com/ipfs/go-log"
addrutil "github.com/libp2p/go-addr-util"
lgbl "github.com/libp2p/go-libp2p-loggables"
Expand All @@ -34,15 +32,23 @@ var (
// been dialed too frequently
ErrDialBackoff = errors.New("dial backoff")

// ErrDialFailed is returned when connecting to a peer has ultimately failed
ErrDialFailed = errors.New("dial attempt failed")

// ErrDialToSelf is returned if we attempt to dial our own peer
ErrDialToSelf = errors.New("dial to self attempted")

// ErrNoTransport is returned when we don't know a transport for the
// given multiaddr.
ErrNoTransport = errors.New("no transport for protocol")

// ErrAllDialsFailed is returned when connecting to a peer has ultimately failed
ErrAllDialsFailed = errors.New("all dials failed")

// ErrNoAddresses is returned when we fail to find any addresses for a
// peer we're trying to dial.
ErrNoAddresses = errors.New("no addresses")

// ErrNoAddresses is returned when we find addresses for a peer but
// can't use any of them.
ErrNoGoodAddresses = errors.New("no good addresses")
)

// DialAttempts governs how many times a goroutine will try to dial a given peer.
Expand Down Expand Up @@ -256,7 +262,7 @@ func (s *Swarm) doDial(ctx context.Context, p peer.ID) (*Conn, error) {
}

// ok, we failed.
return nil, fmt.Errorf("dial attempt failed: %s", err)
return nil, err
}
return conn, nil
}
Expand Down Expand Up @@ -293,11 +299,11 @@ func (s *Swarm) dial(ctx context.Context, p peer.ID) (*Conn, error) {
*/
peerAddrs := s.peers.Addrs(p)
if len(peerAddrs) == 0 {
return nil, errors.New("no addresses")
return nil, &DialError{Peer: p, Cause: ErrNoAddresses}
}
goodAddrs := s.filterKnownUndialables(peerAddrs)
if len(goodAddrs) == 0 {
return nil, errors.New("no good addresses")
return nil, &DialError{Peer: p, Cause: ErrNoGoodAddresses}
}
goodAddrsChan := make(chan ma.Multiaddr, len(goodAddrs))
for _, a := range goodAddrs {
Expand All @@ -307,10 +313,18 @@ func (s *Swarm) dial(ctx context.Context, p peer.ID) (*Conn, error) {
/////////

// try to get a connection to any addr
connC, err := s.dialAddrs(ctx, p, goodAddrsChan)
if err != nil {
logdial["error"] = err.Error()
return nil, err
connC, dialErr := s.dialAddrs(ctx, p, goodAddrsChan)
if dialErr != nil {
logdial["error"] = dialErr.Cause.Error()
if dialErr.Cause == context.Canceled {
// always prefer the "context canceled" error.
// we rely on behing able to check `err == context.Canceled`
//
// Removing this will BREAK backoff (causing us to
// backoff when canceling dials).
return nil, context.Canceled
}
return nil, dialErr
}
logdial["conn"] = logging.Metadata{
"localAddr": connC.LocalMultiaddr(),
Expand All @@ -320,7 +334,7 @@ func (s *Swarm) dial(ctx context.Context, p peer.ID) (*Conn, error) {
if err != nil {
logdial["error"] = err.Error()
connC.Close() // close the connection. didn't work out :(
return nil, err
return nil, &DialError{Peer: p, Cause: err}
}

logdial["dial"] = "success"
Expand Down Expand Up @@ -352,34 +366,31 @@ func (s *Swarm) filterKnownUndialables(addrs []ma.Multiaddr) []ma.Multiaddr {
)
}

func (s *Swarm) dialAddrs(ctx context.Context, p peer.ID, remoteAddrs <-chan ma.Multiaddr) (transport.Conn, error) {
func (s *Swarm) dialAddrs(ctx context.Context, p peer.ID, remoteAddrs <-chan ma.Multiaddr) (transport.Conn, *DialError) {
log.Debugf("%s swarm dialing %s", s.local, p)

ctx, cancel := context.WithCancel(ctx)
defer cancel() // cancel work when we exit func

// use a single response type instead of errs and conns, reduces complexity *a ton*
respch := make(chan dialResult)
var dialErrors *multierror.Error
err := new(DialError)

defer s.limiter.clearAllPeerDials(p)

var active int
dialLoop:
for remoteAddrs != nil || active > 0 {
// Check for context cancellations and/or responses first.
select {
case <-ctx.Done():
if dialError := dialErrors.ErrorOrNil(); dialError != nil {
return nil, dialError
}

return nil, ctx.Err()
break dialLoop
case resp := <-respch:
active--
if resp.Err != nil {
// Errors are normal, lots of dials will fail
log.Infof("got error on dial: %s", resp.Err)
dialErrors = multierror.Append(dialErrors, resp.Err)
err.recordErr(resp.Addr, resp.Err)
} else if resp.Conn != nil {
return resp.Conn, nil
}
Expand All @@ -400,28 +411,27 @@ func (s *Swarm) dialAddrs(ctx context.Context, p peer.ID, remoteAddrs <-chan ma.
s.limitedDial(ctx, p, addr, respch)
active++
case <-ctx.Done():
if dialError := dialErrors.ErrorOrNil(); dialError != nil {
return nil, dialError
}

return nil, ctx.Err()
break dialLoop
case resp := <-respch:
active--
if resp.Err != nil {
// Errors are normal, lots of dials will fail
log.Infof("got error on dial: %s", resp.Err)
dialErrors = multierror.Append(dialErrors, resp.Err)
err.recordErr(resp.Addr, resp.Err)
} else if resp.Conn != nil {
return resp.Conn, nil
}
}
}

if dialError := dialErrors.ErrorOrNil(); dialError != nil {
return nil, dialError
if ctxErr := ctx.Err(); ctxErr != nil {
err.Cause = ctxErr
} else if len(err.DialErrors) == 0 {
err.Cause = inet.ErrNoRemoteAddrs
} else {
err.Cause = ErrAllDialsFailed
}

return nil, inet.ErrNoRemoteAddrs
return nil, err
}

// limitedDial will start a dial to the given peer when
Expand Down Expand Up @@ -450,7 +460,7 @@ func (s *Swarm) dialAddr(ctx context.Context, p peer.ID, addr ma.Multiaddr) (tra

connC, err := tpt.Dial(ctx, addr, p)
if err != nil {
return nil, fmt.Errorf("%s --> %s (%s) dial attempt failed: %s", s.local, p, addr, err)
return nil, err
}

// Trust the transport? Yeah... right.
Expand Down

0 comments on commit 0fc0136

Please sign in to comment.