Skip to content

Commit

Permalink
Merge pull request kubernetes#46463 from wongma7/getinstances
Browse files Browse the repository at this point in the history
Automatic merge from submit-queue (batch tested with PRs 46489, 46281, 46463, 46114, 43946)

AWS: consider instances of all states in DisksAreAttached, not just "running"

Require callers of `getInstancesByNodeNames(Cached)` to specify the states they want to filter instances by, if any. DisksAreAttached cannot fetch only "running" instances because of the following attach/detach bug we discovered:

1. Node A stops (or reboots) and stays down for x amount of time
2. Kube reschedules all pods to different nodes; the ones using ebs volumes cannot run because their volumes are still attached to node A
3. Verify volumes are attached check happens while node A is down
4. Since aws ebs bulk verify filters by running nodes, it assumes the volumes attached to node A are detached and removes them all from ASW
5. Node A comes back; its volumes are still attached to it, but the attach/detach controller has removed them all from ASW and so will never detach them, even though they are no longer desired on this node and are in fact desired elsewhere
6. Pods cannot run because their volumes are still attached to node A

So the idea here is to remove the wrong assumption that callers of `getInstancesByNodeNames(Cached)` only ever want "running" instances.

I hope this isn't too confusing, open to alternative ways of fixing the bug + making the code nice.

ping @gnufied @kubernetes/sig-storage-bugs

```release-note
Fix AWS EBS volumes not getting detached from node if routine to verify volumes are attached runs while the node is down
```
  • Loading branch information
Kubernetes Submit Queue authored May 30, 2017
2 parents 69c4a8f + 319c608 commit 222d247
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 10 deletions.
19 changes: 10 additions & 9 deletions pkg/cloudprovider/providers/aws/aws.go
Original file line number Diff line number Diff line change
Expand Up @@ -2599,7 +2599,7 @@ func (c *Cloud) EnsureLoadBalancer(clusterName string, apiService *v1.Service, n
return nil, fmt.Errorf("LoadBalancerIP cannot be specified for AWS ELB")
}

instances, err := c.getInstancesByNodeNamesCached(nodeNames(nodes))
instances, err := c.getInstancesByNodeNamesCached(nodeNames(nodes), "running")
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -3176,7 +3176,7 @@ func (c *Cloud) EnsureLoadBalancerDeleted(clusterName string, service *v1.Servic

// UpdateLoadBalancer implements LoadBalancer.UpdateLoadBalancer
func (c *Cloud) UpdateLoadBalancer(clusterName string, service *v1.Service, nodes []*v1.Node) error {
instances, err := c.getInstancesByNodeNamesCached(nodeNames(nodes))
instances, err := c.getInstancesByNodeNamesCached(nodeNames(nodes), "running")
if err != nil {
return err
}
Expand Down Expand Up @@ -3248,10 +3248,11 @@ func (c *Cloud) getInstancesByIDs(instanceIDs []*string) (map[string]*ec2.Instan
return instancesByID, nil
}

// Fetches and caches instances by node names; returns an error if any cannot be found.
// Fetches and caches instances in the given state, by node names; returns an error if any cannot be found. If no states
// are given, no state filter is used and instances of all states are fetched.
// This is implemented with a multi value filter on the node names, fetching the desired instances with a single query.
// TODO(therc): make all the caching more rational during the 1.4 timeframe
func (c *Cloud) getInstancesByNodeNamesCached(nodeNames sets.String) ([]*ec2.Instance, error) {
func (c *Cloud) getInstancesByNodeNamesCached(nodeNames sets.String, states ...string) ([]*ec2.Instance, error) {
c.mutex.Lock()
defer c.mutex.Unlock()
if nodeNames.Equal(c.lastNodeNames) {
Expand All @@ -3262,7 +3263,7 @@ func (c *Cloud) getInstancesByNodeNamesCached(nodeNames sets.String) ([]*ec2.Ins
return c.lastInstancesByNodeNames, nil
}
}
instances, err := c.getInstancesByNodeNames(nodeNames.List())
instances, err := c.getInstancesByNodeNames(nodeNames.List(), states...)

if err != nil {
return nil, err
Expand All @@ -3278,17 +3279,17 @@ func (c *Cloud) getInstancesByNodeNamesCached(nodeNames sets.String) ([]*ec2.Ins
return instances, nil
}

func (c *Cloud) getInstancesByNodeNames(nodeNames []string) ([]*ec2.Instance, error) {
func (c *Cloud) getInstancesByNodeNames(nodeNames []string, states ...string) ([]*ec2.Instance, error) {
names := aws.StringSlice(nodeNames)

nodeNameFilter := &ec2.Filter{
Name: aws.String("private-dns-name"),
Values: names,
}

filters := []*ec2.Filter{
nodeNameFilter,
newEc2Filter("instance-state-name", "running"),
filters := []*ec2.Filter{nodeNameFilter}
if len(states) > 0 {
filters = append(filters, newEc2Filter("instance-state-name", states...))
}

instances, err := c.describeInstances(filters)
Expand Down
2 changes: 1 addition & 1 deletion pkg/cloudprovider/providers/aws/aws_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1050,7 +1050,7 @@ func TestFindInstancesByNodeNameCached(t *testing.T) {
}

nodeNames := sets.NewString(nodeNameOne)
returnedInstances, errr := c.getInstancesByNodeNamesCached(nodeNames)
returnedInstances, errr := c.getInstancesByNodeNamesCached(nodeNames, "running")

if errr != nil {
t.Errorf("Failed to find instance: %v", err)
Expand Down

0 comments on commit 222d247

Please sign in to comment.