Skip to content

Commit

Permalink
client: don't use Status RPC for Consul discovery (#16490) (#16568)
Browse files Browse the repository at this point in the history
In #16217 we switched clients using Consul discovery to the `Status.Members`
endpoint for getting the list of servers so that we're using the correct
address. This endpoint has an authorization gate, so this fails if the anonymous
policy doesn't have `node:read`. We also can't check the `AuthToken` for the
request for the client secret, because the client hasn't yet registered so the
server doesn't have anything to compare against.

Instead of hitting the `Status.Peers` or `Status.Members` RPC endpoint, use the
Consul response directly. Update the `registerNode` method to handle the list of
servers we get back in the response; if we get a "no servers" or "no path to
region" response we'll kick off discovery again and retry immediately rather
than waiting 15s.

Co-authored-by: Tim Gross <tgross@hashicorp.com>
  • Loading branch information
hc-github-team-nomad-core and tgross committed Mar 20, 2023
1 parent 372ca8b commit cb40332
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 45 deletions.
3 changes: 3 additions & 0 deletions .changelog/16490.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:bug
client: Fixed a bug where clients using Consul discovery to join the cluster would get permission denied errors
```
75 changes: 30 additions & 45 deletions client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -1862,7 +1862,7 @@ func (c *Client) retryRegisterNode() {
}

retryIntv := registerRetryIntv
if err == noServersErr {
if err == noServersErr || structs.IsErrNoRegionPath(err) {
c.logger.Debug("registration waiting on servers")
c.triggerDiscovery()
retryIntv = noServerRetryIntv
Expand All @@ -1889,6 +1889,11 @@ func (c *Client) registerNode() error {
return err
}

err := c.handleNodeUpdateResponse(resp)
if err != nil {
return err
}

// Update the node status to ready after we register.
c.UpdateConfig(func(c *config.Config) {
c.Node.Status = structs.NodeStatusReady
Expand All @@ -1903,6 +1908,7 @@ func (c *Client) registerNode() error {
defer c.heartbeatLock.Unlock()
c.heartbeatStop.setLastOk(time.Now())
c.heartbeatTTL = resp.HeartbeatTTL

return nil
}

Expand Down Expand Up @@ -1954,6 +1960,22 @@ func (c *Client) updateNodeStatus() error {
}
})

err := c.handleNodeUpdateResponse(resp)
if err != nil {
return fmt.Errorf("heartbeat response returned no valid servers")
}

// If there's no Leader in the response we may be talking to a partitioned
// server. Redo discovery to ensure our server list is up to date.
if resp.LeaderRPCAddr == "" {
c.triggerDiscovery()
}

c.EnterpriseClient.SetFeatures(resp.Features)
return nil
}

func (c *Client) handleNodeUpdateResponse(resp structs.NodeUpdateResponse) error {
// Update the number of nodes in the cluster so we can adjust our server
// rebalance rate.
c.servers.SetNumNodes(resp.NumNodes)
Expand All @@ -1970,20 +1992,9 @@ func (c *Client) updateNodeStatus() error {
nomadServers = append(nomadServers, e)
}
if len(nomadServers) == 0 {
return fmt.Errorf("heartbeat response returned no valid servers")
return noServersErr
}
c.servers.SetServers(nomadServers)

// Begin polling Consul if there is no Nomad leader. We could be
// heartbeating to a Nomad server that is in the minority of a
// partition of the Nomad server quorum, but this Nomad Agent still
// has connectivity to the existing majority of Nomad Servers, but
// only if it queries Consul.
if resp.LeaderRPCAddr == "" {
c.triggerDiscovery()
}

c.EnterpriseClient.SetFeatures(resp.Features)
return nil
}

Expand Down Expand Up @@ -2821,14 +2832,6 @@ func (c *Client) consulDiscoveryImpl() error {
dcs = dcs[0:helper.Min(len(dcs), datacenterQueryLimit)]
}

// Query for servers in this client's region only
region := c.Region()
rpcargs := structs.GenericRequest{
QueryOptions: structs.QueryOptions{
Region: region,
},
}

serviceName := c.GetConfig().ConsulConfig.ServerServiceName
var mErr multierror.Error
var nomadServers servers.Servers
Expand Down Expand Up @@ -2859,32 +2862,14 @@ DISCOLOOP:
continue
}

// Query the members from the region that Consul gave us, and
// extract the client-advertise RPC address from each member
var membersResp structs.ServerMembersResponse
if err := c.connPool.RPC(region, addr, "Status.Members", rpcargs, &membersResp); err != nil {
mErr.Errors = append(mErr.Errors, err)
continue
}
for _, member := range membersResp.Members {
if addrTag, ok := member.Tags["rpc_addr"]; ok {
if portTag, ok := member.Tags["port"]; ok {
addr, err := net.ResolveTCPAddr("tcp",
fmt.Sprintf("%s:%s", addrTag, portTag))
if err != nil {
mErr.Errors = append(mErr.Errors, err)
continue
}
srv := &servers.Server{Addr: addr}
nomadServers = append(nomadServers, srv)
}
}
}
srv := &servers.Server{Addr: addr}
nomadServers = append(nomadServers, srv)
}

if len(nomadServers) > 0 {
break DISCOLOOP
}
if len(nomadServers) > 0 {
break DISCOLOOP
}

}
if len(nomadServers) == 0 {
if len(mErr.Errors) > 0 {
Expand Down

0 comments on commit cb40332

Please sign in to comment.