Skip to content

Commit

Permalink
driver/networking: don't recreate existing network namespaces
Browse files Browse the repository at this point in the history
  • Loading branch information
tgross committed Sep 19, 2019
1 parent 9a629c4 commit 09585ab
Show file tree
Hide file tree
Showing 12 changed files with 381 additions and 340 deletions.
9 changes: 6 additions & 3 deletions client/allocrunner/network_hook.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@ func (h *networkHook) Prerun() error {
return nil
}

spec, err := h.manager.CreateNetwork(h.alloc.ID)
spec, created, err := h.manager.CreateNetwork(h.alloc.ID)

if err != nil {
return fmt.Errorf("failed to create network for alloc: %v", err)
}
Expand All @@ -71,8 +72,10 @@ func (h *networkHook) Prerun() error {
h.setter.SetNetworkIsolation(spec)
}

if err := h.networkConfigurator.Setup(context.TODO(), h.alloc, spec); err != nil {
return fmt.Errorf("failed to configure networking for alloc: %v", err)
if created {
if err := h.networkConfigurator.Setup(context.TODO(), h.alloc, spec); err != nil {
return fmt.Errorf("failed to configure networking for alloc: %v", err)
}
}
return nil
}
Expand Down
4 changes: 2 additions & 2 deletions client/allocrunner/network_hook_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,9 @@ func TestNetworkHook_Prerun_Destroy(t *testing.T) {
destroyCalled := false
nm := &testutils.MockDriver{
MockNetworkManager: testutils.MockNetworkManager{
CreateNetworkF: func(allocID string) (*drivers.NetworkIsolationSpec, error) {
CreateNetworkF: func(allocID string) (*drivers.NetworkIsolationSpec, bool, error) {
require.Equal(t, alloc.ID, allocID)
return spec, nil
return spec, false, nil
},

DestroyNetworkF: func(allocID string, netSpec *drivers.NetworkIsolationSpec) error {
Expand Down
6 changes: 3 additions & 3 deletions client/allocrunner/network_manager_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,10 @@ func newNetworkManager(alloc *structs.Allocation, driverManager drivermanager.Ma
// defaultNetworkManager creates a network namespace for the alloc
type defaultNetworkManager struct{}

func (*defaultNetworkManager) CreateNetwork(allocID string) (*drivers.NetworkIsolationSpec, error) {
func (*defaultNetworkManager) CreateNetwork(allocID string) (*drivers.NetworkIsolationSpec, bool, error) {
netns, err := nsutil.NewNS(allocID)
if err != nil {
return nil, err
return nil, false, err
}

spec := &drivers.NetworkIsolationSpec{
Expand All @@ -101,7 +101,7 @@ func (*defaultNetworkManager) CreateNetwork(allocID string) (*drivers.NetworkIso
Labels: make(map[string]string),
}

return spec, nil
return spec, true, nil
}

func (*defaultNetworkManager) DestroyNetwork(allocID string, spec *drivers.NetworkIsolationSpec) error {
Expand Down
110 changes: 60 additions & 50 deletions drivers/docker/driver.go
Original file line number Diff line number Diff line change
Expand Up @@ -397,60 +397,26 @@ CREATE:
// If the container already exists determine whether it's already
// running or if it's dead and needs to be recreated.
if strings.Contains(strings.ToLower(createErr.Error()), "container already exists") {
containers, err := client.ListContainers(docker.ListContainersOptions{
All: true,
})

container, err := d.containerByName(config.Name)
if err != nil {
d.logger.Error("failed to query list of containers matching name", "container_name", config.Name)
return nil, recoverableErrTimeouts(fmt.Errorf("Failed to query list of containers: %s", err))
return nil, err
}

// Delete matching containers
// Adding a / infront of the container name since Docker returns the
// container names with a / pre-pended to the Nomad generated container names
containerName := "/" + config.Name
d.logger.Debug("searching for container to purge", "container_name", containerName)
for _, shimContainer := range containers {
d.logger.Debug("listed container", "names", hclog.Fmt("%+v", shimContainer.Names))
found := false
for _, name := range shimContainer.Names {
if name == containerName {
d.logger.Debug("Found container", "containter_name", containerName, "container_id", shimContainer.ID)
found = true
break
}
}

if !found {
continue
}

// Inspect the container and if the container isn't dead then return
// the container
container, err := client.InspectContainer(shimContainer.ID)
if err != nil {
err = fmt.Errorf("Failed to inspect container %s: %s", shimContainer.ID, err)

// This error is always recoverable as it could
// be caused by races between listing
// containers and this container being removed.
// See #2802
return nil, nstructs.NewRecoverableError(err, true)
}
if container != nil && container.State.Running {
return container, nil
}
if container != nil && container.State.Running {
return container, nil
}

err = client.RemoveContainer(docker.RemoveContainerOptions{
ID: container.ID,
Force: true,
})
if err != nil {
d.logger.Error("failed to purge container", "container_id", container.ID)
return nil, recoverableErrTimeouts(fmt.Errorf("Failed to purge container %s: %s", container.ID, err))
} else if err == nil {
d.logger.Info("purged container", "container_id", container.ID)
}
// Delete matching containers
err = client.RemoveContainer(docker.RemoveContainerOptions{
ID: container.ID,
Force: true,
})
if err != nil {
d.logger.Error("failed to purge container", "container_id", container.ID)
return nil, recoverableErrTimeouts(fmt.Errorf("Failed to purge container %s: %s", container.ID, err))
} else if err == nil {
d.logger.Info("purged container", "container_id", container.ID)
}

if attempted < 5 {
Expand Down Expand Up @@ -1074,6 +1040,50 @@ func (d *Driver) detectIP(c *docker.Container, driverConfig *TaskConfig) (string
return ip, auto
}

// containerByName finds a running container by name, and returns an error
// if the container is dead or can't be found.
func (d *Driver) containerByName(name string) (*docker.Container, error) {

containers, err := client.ListContainers(docker.ListContainersOptions{
All: true,
})
if err != nil {
d.logger.Error("failed to query list of containers matching name",
"container_name", name)
return nil, recoverableErrTimeouts(
fmt.Errorf("Failed to query list of containers: %s", err))
}

// container names with a / pre-pended to the Nomad generated container names
containerName := "/" + name
var shimContainer docker.APIContainers
OUTER:
for _, shimContainer = range containers {
d.logger.Debug("listed container", "names", hclog.Fmt("%+v", shimContainer.Names))
for _, name := range shimContainer.Names {
if name == containerName {
d.logger.Debug("Found container",
"container_name", containerName, "container_id", shimContainer.ID)
break OUTER
}
}
}

// Inspect the container and if the container isn't dead then return
// the container
container, err := client.InspectContainer(shimContainer.ID)
if err != nil {
err = fmt.Errorf("Failed to inspect container %s: %s", shimContainer.ID, err)

// This error is always recoverable as it could
// be caused by races between listing
// containers and this container being removed.
// See #2802
return nil, nstructs.NewRecoverableError(err, true)
}
return container, nil
}

// validateCommand validates that the command only has a single value and
// returns a user friendly error message telling them to use the passed
// argField.
Expand Down
51 changes: 34 additions & 17 deletions drivers/docker/network.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@ import (
// the parent container so tasks can configure their network mode accordingly
const dockerNetSpecLabelKey = "docker_sandbox_container_id"

func (d *Driver) CreateNetwork(allocID string) (*drivers.NetworkIsolationSpec, error) {
func (d *Driver) CreateNetwork(allocID string) (*drivers.NetworkIsolationSpec, bool, error) {
// Initialize docker API clients
client, _, err := d.dockerClients()
if err != nil {
return nil, fmt.Errorf("failed to connect to docker daemon: %s", err)
return nil, false, fmt.Errorf("failed to connect to docker daemon: %s", err)
}

repo, _ := parseDockerImage(d.config.InfraImage)
Expand All @@ -29,35 +29,52 @@ func (d *Driver) CreateNetwork(allocID string) (*drivers.NetworkIsolationSpec, e
}
_, err = d.coordinator.PullImage(d.config.InfraImage, authOptions, allocID, noopLogEventFn)
if err != nil {
return nil, err
return nil, false, err
}

config, err := d.createSandboxContainerConfig(allocID)
if err != nil {
return nil, err
return nil, false, err
}

container, err := d.createContainer(client, *config, d.config.InfraImage)
specFromContainer := func(c *docker.Container) *drivers.NetworkIsolationSpec {
return &drivers.NetworkIsolationSpec{
Mode: drivers.NetIsolationModeGroup,
Path: c.NetworkSettings.SandboxKey,
Labels: map[string]string{
dockerNetSpecLabelKey: c.ID,
},
}
}

// We want to return a flag that tells us if the container already
// existed so that callers can decide whether or not to recreate
// the task's network namespace associations.
container, err := d.containerByName(config.Name)
if err != nil {
return nil, false, err
}
if container != nil && container.State.Running {
return specFromContainer(container), false, nil
}

container, err = d.createContainer(client, *config, d.config.InfraImage)
if err != nil {
return nil, err
return nil, false, err
}

if err := d.startContainer(container); err != nil {
return nil, err
if err = d.startContainer(container); err != nil {
return nil, false, err
}

c, err := client.InspectContainer(container.ID)
// until the container is started, InspectContainer
// returns a mostly-empty struct
container, err = client.InspectContainer(container.ID)
if err != nil {
return nil, err
return nil, false, err
}

return &drivers.NetworkIsolationSpec{
Mode: drivers.NetIsolationModeGroup,
Path: c.NetworkSettings.SandboxKey,
Labels: map[string]string{
dockerNetSpecLabelKey: c.ID,
},
}, nil
return specFromContainer(container), true, nil
}

func (d *Driver) DestroyNetwork(allocID string, spec *drivers.NetworkIsolationSpec) error {
Expand Down
2 changes: 1 addition & 1 deletion e2e/connect/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ func (tc *ConnectClientStateE2ETest) TestClientRestart(f *framework.F) {
tc.jobIds = append(tc.jobIds, restartID)
}
if err != nil {
t.Skip("node cannot be restarted: %v", err)
t.Skip("node cannot be restarted", err)
}

e2eutil.RequireConsulStatus(require, consulClient,
Expand Down
6 changes: 3 additions & 3 deletions plugins/drivers/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -465,17 +465,17 @@ func (d *driverPluginClient) ExecTaskStreamingRaw(ctx context.Context,
}
}

func (d *driverPluginClient) CreateNetwork(allocID string) (*NetworkIsolationSpec, error) {
func (d *driverPluginClient) CreateNetwork(allocID string) (*NetworkIsolationSpec, bool, error) {
req := &proto.CreateNetworkRequest{
AllocId: allocID,
}

resp, err := d.client.CreateNetwork(d.doneCtx, req)
if err != nil {
return nil, grpcutils.HandleGrpcErr(err, d.doneCtx)
return nil, false, grpcutils.HandleGrpcErr(err, d.doneCtx)
}

return NetworkIsolationSpecFromProto(resp.IsolationSpec), nil
return NetworkIsolationSpecFromProto(resp.IsolationSpec), resp.Created, nil
}

func (d *driverPluginClient) DestroyNetwork(allocID string, spec *NetworkIsolationSpec) error {
Expand Down
2 changes: 1 addition & 1 deletion plugins/drivers/driver.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ type ExecOptions struct {
// network namespace for which tasks can join. This only needs to be implemented
// if the driver MUST create the network namespace
type DriverNetworkManager interface {
CreateNetwork(allocID string) (*NetworkIsolationSpec, error)
CreateNetwork(allocID string) (*NetworkIsolationSpec, bool, error)
DestroyNetwork(allocID string, spec *NetworkIsolationSpec) error
}

Expand Down
Loading

0 comments on commit 09585ab

Please sign in to comment.