From d33801b6402d9c1b2afd9ed48315a33873022a0f Mon Sep 17 00:00:00 2001 From: Alexander Polcyn Date: Tue, 20 Jul 2021 14:48:27 -0700 Subject: [PATCH] Propagate the most recent connection error when grpclb policies enter transient failure --- balancer/grpclb/grpclb.go | 6 +++- balancer/grpclb/grpclb_test.go | 59 ++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 1 deletion(-) diff --git a/balancer/grpclb/grpclb.go b/balancer/grpclb/grpclb.go index a43d8964119f..49d11d0d2e21 100644 --- a/balancer/grpclb/grpclb.go +++ b/balancer/grpclb/grpclb.go @@ -25,6 +25,7 @@ package grpclb import ( "context" "errors" + "fmt" "sync" "time" @@ -221,6 +222,7 @@ type lbBalancer struct { // when resolved address updates are received, and read in the goroutine // handling fallback. resolvedBackendAddrs []resolver.Address + connErr error // the last connection error } // regeneratePicker takes a snapshot of the balancer, and generates a picker from @@ -230,7 +232,7 @@ type lbBalancer struct { // Caller must hold lb.mu. func (lb *lbBalancer) regeneratePicker(resetDrop bool) { if lb.state == connectivity.TransientFailure { - lb.picker = &errPicker{err: balancer.ErrTransientFailure} + lb.picker = &errPicker{err: fmt.Errorf("all SubConns are in TransientFailure, last connection error: %v", lb.connErr)} return } @@ -336,6 +338,8 @@ func (lb *lbBalancer) UpdateSubConnState(sc balancer.SubConn, scs balancer.SubCo // When an address was removed by resolver, b called RemoveSubConn but // kept the sc's state in scStates. Remove state for this sc here. delete(lb.scStates, sc) + case connectivity.TransientFailure: + lb.connErr = scs.ConnectionError } // Force regenerate picker if // - this sc became ready from not-ready diff --git a/balancer/grpclb/grpclb_test.go b/balancer/grpclb/grpclb_test.go index 9cbb338c2415..d6275b657f90 100644 --- a/balancer/grpclb/grpclb_test.go +++ b/balancer/grpclb/grpclb_test.go @@ -1232,6 +1232,65 @@ func (s) TestGRPCLBPickFirst(t *testing.T) { } } +func (s) TestGRPCLBBackendConnectionErrorPropagation(t *testing.T) { + r := manual.NewBuilderWithScheme("whatever") + + // Start up an LB which will tell the client to fall back + // right away. + tss, cleanup, err := newLoadBalancer(0, "", nil) + if err != nil { + t.Fatalf("failed to create new load balancer: %v", err) + } + defer cleanup() + + // Start a standalone backend, to be used during fallback. The creds + // are intentionally misconfigured in order to simulate failure of a + // security handshake. + beLis, err := net.Listen("tcp", "localhost:0") + if err != nil { + t.Fatalf("Failed to listen %v", err) + } + defer beLis.Close() + standaloneBEs := startBackends("arbitrary.invalid.name", true, beLis) + defer stopBackends(standaloneBEs) + + creds := serverNameCheckCreds{} + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + cc, err := grpc.DialContext(ctx, r.Scheme()+":///"+beServerName, grpc.WithResolvers(r), + grpc.WithTransportCredentials(&creds), grpc.WithContextDialer(fakeNameDialer)) + if err != nil { + t.Fatalf("Failed to dial to the backend %v", err) + } + defer cc.Close() + testC := testpb.NewTestServiceClient(cc) + + r.UpdateState(resolver.State{Addresses: []resolver.Address{{ + Addr: tss.lbAddr, + Type: resolver.GRPCLB, + ServerName: lbServerName, + }, { + Addr: beLis.Addr().String(), + Type: resolver.Backend, + }}}) + + // If https://github.com/grpc/grpc-go/blob/65cabd74d8e18d7347fecd414fa8d83a00035f5f/balancer/grpclb/grpclb_test.go#L103 + // changes, then expectedErrMsg may need to be updated. + const expectedErrMsg = "received unexpected server name" + ctx, cancel = context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + var wg sync.WaitGroup + wg.Add(1) + go func() { + tss.ls.fallbackNow() + wg.Done() + }() + if _, err := testC.EmptyCall(ctx, &testpb.Empty{}); err == nil || !strings.Contains(err.Error(), expectedErrMsg) { + t.Fatalf("%v.EmptyCall(_, _) = _, %v, want _, rpc error containing substring: %q", testC, err, expectedErrMsg) + } + wg.Wait() +} + type failPreRPCCred struct{} func (failPreRPCCred) GetRequestMetadata(ctx context.Context, uri ...string) (map[string]string, error) {