Skip to content
This repository has been archived by the owner on May 26, 2022. It is now read-only.

Commit

Permalink
dial: return a nice custom dial error
Browse files Browse the repository at this point in the history
* Limits the number of dial errors we keep
* Exposes the dial errors
* Avoids string formatting unless we actually need to.

Helps address #119
  • Loading branch information
Stebalien committed Apr 24, 2019
1 parent 94a49f1 commit 3a2e78b
Show file tree
Hide file tree
Showing 6 changed files with 105 additions and 47 deletions.
69 changes: 69 additions & 0 deletions dial_error.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
package swarm

import (
"fmt"
"strings"

peer "github.com/libp2p/go-libp2p-peer"
ma "github.com/multiformats/go-multiaddr"
)

// maxDialErrors is the maximum number of dial errors we record
const maxDialErrors = 32

// DialError is the error type returned when dialing.
type DialError struct {
Peer peer.ID
Errors []TransportError
Cause error
Skipped int
}

func (e *DialError) recordErr(addr ma.Multiaddr, err error) {
if len(e.Errors) >= maxDialErrors {
e.Skipped++
return
}
e.Errors = append(e.Errors, TransportError{
Address: addr,
Cause: err,
})
}

func (e *DialError) Error() string {
var builder strings.Builder
fmt.Fprintf(&builder, "failed to dial %s:", e.Peer)
if e.Cause != nil {
fmt.Fprintf(&builder, " %s", e.Cause)
}
for _, te := range e.Errors {
fmt.Fprintf(&builder, "\n * [%s] %s", te.Address, te.Cause)
}
if e.Skipped > 0 {
fmt.Fprintf(&builder, "\n ... skipping %d errors ...", e.Skipped)
}
return builder.String()
}

// Unwrap implements https://godoc.org/golang.org/x/xerrors#Wrapper.
func (e *DialError) Unwrap() error {
// If we have a context error, that's the "ultimate" error.
if e.Cause != nil {
return e.Cause
}
return nil
}

var _ error = (*DialError)(nil)

// TransportError is the error returned when dialing a specific address.
type TransportError struct {
Address ma.Multiaddr
Cause error
}

func (e *TransportError) Error() string {
return fmt.Sprintf("failed to dial %s: %s", e.Address, e.Cause)
}

var _ error = (*TransportError)(nil)
28 changes: 10 additions & 18 deletions dial_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,7 @@ package swarm_test

import (
"context"
"fmt"
"net"
"regexp"
"sync"
"testing"
"time"
Expand Down Expand Up @@ -491,8 +489,8 @@ func TestDialPeerFailed(t *testing.T) {
defer closeSwarms(swarms)
testedSwarm, targetSwarm := swarms[0], swarms[1]

exceptedErrorsCount := 5
for i := 0; i < exceptedErrorsCount; i++ {
expectedErrorsCount := 5
for i := 0; i < expectedErrorsCount; i++ {
_, silentPeerAddress, silentPeerListener := newSilentPeer(t)
go acceptAndHang(silentPeerListener)
defer silentPeerListener.Close()
Expand All @@ -508,23 +506,17 @@ func TestDialPeerFailed(t *testing.T) {
t.Fatal(err)
}

// dial_test.go:508: correctly get a combined error: dial attempt failed: 10 errors occurred:
// * <peer.ID Qm*Wpwtvc> --> <peer.ID Qm*cc2FQR> (/ip4/127.0.0.1/tcp/46485) dial attempt failed: failed to negotiate security protocol: context deadline exceeded
// * <peer.ID Qm*Wpwtvc> --> <peer.ID Qm*cc2FQR> (/ip4/127.0.0.1/tcp/34881) dial attempt failed: failed to negotiate security protocol: context deadline exceeded
// dial_test.go:508: correctly get a combined error: failed to dial PEER: all dials failed
// * [/ip4/127.0.0.1/tcp/46485] failed to negotiate security protocol: context deadline exceeded
// * [/ip4/127.0.0.1/tcp/34881] failed to negotiate security protocol: context deadline exceeded
// ...

errorCountRegexpString := fmt.Sprintf("%d errors occurred", exceptedErrorsCount)
errorCountRegexp := regexp.MustCompile(errorCountRegexpString)
if !errorCountRegexp.MatchString(err.Error()) {
t.Fatalf("can't find total err count: `%s' in `%s'", errorCountRegexpString, err.Error())
dialErr, ok := err.(*DialError)
if !ok {
t.Fatalf("expected *DialError, got %T", err)
}

connectErrorsRegexpString := `\* <peer\.ID .+?> --> <peer\.ID .+?> \(.+?\) dial attempt failed:.+`
connectErrorsRegexp := regexp.MustCompile(connectErrorsRegexpString)
connectErrors := connectErrorsRegexp.FindAll([]byte(err.Error()), -1)
if len(connectErrors) != exceptedErrorsCount {
t.Fatalf("connectErrors must contain %d errros; "+
"but `%s' was found in `%s' %d times",
exceptedErrorsCount, connectErrorsRegexpString, err.Error(), len(connectErrors))
if len(dialErr.Errors) != expectedErrorsCount {
t.Errorf("expected %d errors, got %d", expectedErrorsCount, len(dialErr.Errors))
}
}
1 change: 0 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
module github.com/libp2p/go-libp2p-swarm

require (
github.com/hashicorp/go-multierror v1.0.0
github.com/ipfs/go-log v0.0.1
github.com/jbenet/goprocess v0.0.0-20160826012719-b497e2f366b8
github.com/libp2p/go-addr-util v0.0.1
Expand Down
4 changes: 0 additions & 4 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,6 @@ github.com/gxed/hashland/keccakpg v0.0.1 h1:wrk3uMNaMxbXiHibbPO4S0ymqJMm41WiudyF
github.com/gxed/hashland/keccakpg v0.0.1/go.mod h1:kRzw3HkwxFU1mpmPP8v1WyQzwdGfmKFJ6tItnhQ67kU=
github.com/gxed/hashland/murmur3 v0.0.1 h1:SheiaIt0sda5K+8FLz952/1iWS9zrnKsEJaOJu4ZbSc=
github.com/gxed/hashland/murmur3 v0.0.1/go.mod h1:KjXop02n4/ckmZSnY2+HKcLud/tcmvhST0bie/0lS48=
github.com/hashicorp/errwrap v1.0.0 h1:hLrqtEDnRye3+sgx6z4qVLNuviH3MR5aQ0ykNJa/UYA=
github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=
github.com/hashicorp/go-multierror v1.0.0 h1:iVjPR7a6H0tWELX5NxNe7bYopibicUzc7uPribsnS6o=
github.com/hashicorp/go-multierror v1.0.0/go.mod h1:dHtQlpGsu+cZNNAkkCN/P3hoUDHhCYQXV3UM06sGGrk=
github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
github.com/hpcloud/tail v1.0.0 h1:nfCOvKYfkgYP8hkirhJocXT2+zOD8yUNjXaWfTlyFKI=
github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
Expand Down
3 changes: 3 additions & 0 deletions swarm.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ var log = logging.Logger("swarm2")
// ErrSwarmClosed is returned when one attempts to operate on a closed swarm.
var ErrSwarmClosed = errors.New("swarm closed")

// ErrAllDialsFailed is returned when connecting to a peer has ultimately failed
var ErrAllDialsFailed = errors.New("all dials failed")

// ErrAddrFiltered is returned when trying to register a connection to a
// filtered address. You shouldn't see this error unless some underlying
// transport is misbehaving.
Expand Down
47 changes: 23 additions & 24 deletions swarm_dial.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@ import (
"sync"
"time"

"github.com/hashicorp/go-multierror"

logging "github.com/ipfs/go-log"
addrutil "github.com/libp2p/go-addr-util"
lgbl "github.com/libp2p/go-libp2p-loggables"
Expand All @@ -34,9 +32,6 @@ var (
// been dialed too frequently
ErrDialBackoff = errors.New("dial backoff")

// ErrDialFailed is returned when connecting to a peer has ultimately failed
ErrDialFailed = errors.New("dial attempt failed")

// ErrDialToSelf is returned if we attempt to dial our own peer
ErrDialToSelf = errors.New("dial to self attempted")

Expand Down Expand Up @@ -256,7 +251,7 @@ func (s *Swarm) doDial(ctx context.Context, p peer.ID) (*Conn, error) {
}

// ok, we failed.
return nil, fmt.Errorf("dial attempt failed: %s", err)
return nil, err
}
return conn, nil
}
Expand Down Expand Up @@ -360,26 +355,23 @@ func (s *Swarm) dialAddrs(ctx context.Context, p peer.ID, remoteAddrs <-chan ma.

// use a single response type instead of errs and conns, reduces complexity *a ton*
respch := make(chan dialResult)
var dialErrors *multierror.Error
err := new(DialError)

defer s.limiter.clearAllPeerDials(p)

var active int
dialLoop:
for remoteAddrs != nil || active > 0 {
// Check for context cancellations and/or responses first.
select {
case <-ctx.Done():
if dialError := dialErrors.ErrorOrNil(); dialError != nil {
return nil, dialError
}

return nil, ctx.Err()
break dialLoop
case resp := <-respch:
active--
if resp.Err != nil {
// Errors are normal, lots of dials will fail
log.Infof("got error on dial: %s", resp.Err)
dialErrors = multierror.Append(dialErrors, resp.Err)
err.recordErr(resp.Addr, resp.Err)
} else if resp.Conn != nil {
return resp.Conn, nil
}
Expand All @@ -400,28 +392,35 @@ func (s *Swarm) dialAddrs(ctx context.Context, p peer.ID, remoteAddrs <-chan ma.
s.limitedDial(ctx, p, addr, respch)
active++
case <-ctx.Done():
if dialError := dialErrors.ErrorOrNil(); dialError != nil {
return nil, dialError
}

return nil, ctx.Err()
break dialLoop
case resp := <-respch:
active--
if resp.Err != nil {
// Errors are normal, lots of dials will fail
log.Infof("got error on dial: %s", resp.Err)
dialErrors = multierror.Append(dialErrors, resp.Err)
err.recordErr(resp.Addr, resp.Err)
} else if resp.Conn != nil {
return resp.Conn, nil
}
}
}

if dialError := dialErrors.ErrorOrNil(); dialError != nil {
return nil, dialError
if ctxErr := ctx.Err(); ctxErr != nil {
// cancellation errors take precedence as users tend to check
// against this error manually.
//
// DO NOT REMOVE THIS. Otherwise, we'll put a peer into
// "backoff" mode whenever we cancel a dial to that peer.
if ctxErr == context.Canceled {
return nil, ctxErr
}
err.Cause = ctxErr
} else if len(err.Errors) == 0 {
err.Cause = inet.ErrNoRemoteAddrs
} else {
err.Cause = ErrAllDialsFailed
}

return nil, inet.ErrNoRemoteAddrs
return nil, err
}

// limitedDial will start a dial to the given peer when
Expand Down Expand Up @@ -450,7 +449,7 @@ func (s *Swarm) dialAddr(ctx context.Context, p peer.ID, addr ma.Multiaddr) (tra

connC, err := tpt.Dial(ctx, addr, p)
if err != nil {
return nil, fmt.Errorf("%s --> %s (%s) dial attempt failed: %s", s.local, p, addr, err)
return nil, err
}

// Trust the transport? Yeah... right.
Expand Down

0 comments on commit 3a2e78b

Please sign in to comment.