Skip to content

Commit

Permalink
fix(executor): Kill injected sidecars. Fixes #5337 (#5345)
Browse files Browse the repository at this point in the history
Signed-off-by: Alex Collins <alex_collins@intuit.com>
  • Loading branch information
alexec committed Mar 16, 2021
1 parent c9d7bfc commit c2c4410
Show file tree
Hide file tree
Showing 19 changed files with 323 additions and 86 deletions.
44 changes: 44 additions & 0 deletions docs/sidecar-injection.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Sidecar Injection

Automatic (i.e. mutating webhook based) sidecar injection systems, including service meshes such as Anthos and Istio
Proxy, create a unique problem for Kubernetes workloads that run to completion.

Because sidecars are injected outside of the view of the workflow controller, the controller has no awareness of them.
It has no opportunity to rewrite the containers command (when using the Emissary Executor) and as the sidecar's process
will run as PID 1, which is protected. It can be impossible for the wait container to terminate the sidecar.

You will minimize problems by not using Istio with Argo Workflows.

See [#1282](https://github.com/argoproj/argo-workflows/issues/1282).

## How We Kill Sidecars

Kubernetes does not provide a way to kill a single container. You can delete a pod, but this kills all containers, and loses all information
and logs of that pod.

Instead, try to mimic the Kubernetes termination behaviour, which is:

1. SIGTERM PID 1
1. Wait for the pod's `terminateGracePeriodSeconds` (30s by default).
1. SIGKILL PID 1

The following are not supported:

* `preStop`
* `STOPSIGNAL`

### Support Matrix

Key:

* Any - we can kill any image
* Shell - we can only kill images with `/bin/sh` installed on them (e.g. Debian)
* None - we cannot kill these images

| Executor | Sidecar | Injected Sidecar |
|---|---|---|
| `docker` | Any | Any |
| `emissary` | Any | None |
| `k8sapi` | Shell | Shell |
| `kubelet` | Shell | Shell |
| `pns` | Any | Any |
6 changes: 6 additions & 0 deletions manifests/quick-start-minimal.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -422,6 +422,12 @@ rules:
verbs:
- get
- watch
- apiGroups:
- ""
resources:
- pods/exec
verbs:
- create
- apiGroups:
- argoproj.io
resources:
Expand Down
6 changes: 6 additions & 0 deletions manifests/quick-start-mysql.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -422,6 +422,12 @@ rules:
verbs:
- get
- watch
- apiGroups:
- ""
resources:
- pods/exec
verbs:
- create
- apiGroups:
- argoproj.io
resources:
Expand Down
6 changes: 6 additions & 0 deletions manifests/quick-start-postgres.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -422,6 +422,12 @@ rules:
verbs:
- get
- watch
- apiGroups:
- ""
resources:
- pods/exec
verbs:
- create
- apiGroups:
- argoproj.io
resources:
Expand Down
7 changes: 7 additions & 0 deletions manifests/quick-start/base/workflow-role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,13 @@ rules:
verbs:
- get
- watch
# Only needed if you are running the `k8sapi` executor.
- apiGroups:
- ""
resources:
- pods/exec
verbs:
- create
# This allows one workflow to create another.
# Not needed for the majority of use cases.
- apiGroups:
Expand Down
1 change: 1 addition & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ nav:
- configure-artifact-repository.md
- workflow-controller-configmap.md
- workflow-executors.md
- sidecar-injection.md
- default-workflow-specs.md
- offloading-large-workflows.md
- workflow-archive.md
Expand Down
8 changes: 0 additions & 8 deletions pkg/apis/workflow/v1alpha1/workflow_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -627,14 +627,6 @@ func (tmpl *Template) HasPodSpecPatch() bool {
return tmpl.PodSpecPatch != ""
}

func (tmpl *Template) GetSidecarNames() []string {
var containerNames []string
for _, s := range tmpl.Sidecars {
containerNames = append(containerNames, s.Name)
}
return containerNames
}

type Artifacts []Artifact

func (a Artifacts) GetArtifactByName(name string) *Artifact {
Expand Down
112 changes: 105 additions & 7 deletions pkg/apis/workflow/v1alpha1/workflow_types_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -617,11 +617,109 @@ func TestWorkflow_GetSemaphoreKeys(t *testing.T) {
assert.Contains(keys, "test/template1")
}

func TestTemplate_GetSidecarNames(t *testing.T) {
m := &Template{
Sidecars: []UserContainer{
{Container: corev1.Container{Name: "sidecar-0"}},
},
}
assert.ElementsMatch(t, []string{"sidecar-0"}, m.GetSidecarNames())
func TestTemplate_IsMainContainerNamed(t *testing.T) {
t.Run("Default", func(t *testing.T) {
x := &Template{}
assert.True(t, x.IsMainContainerName("main"))
})
t.Run("ContainerSet", func(t *testing.T) {
x := &Template{ContainerSet: &ContainerSetTemplate{Containers: []ContainerNode{{Container: corev1.Container{Name: "foo"}}}}}
assert.False(t, x.IsMainContainerName("main"))
assert.True(t, x.IsMainContainerName("foo"))
})
}

func TestTemplate_GetMainContainer(t *testing.T) {
t.Run("Default", func(t *testing.T) {
x := &Template{}
assert.Equal(t, []string{"main"}, x.GetMainContainerNames())
})
t.Run("ContainerSet", func(t *testing.T) {
x := &Template{ContainerSet: &ContainerSetTemplate{Containers: []ContainerNode{{Container: corev1.Container{Name: "foo"}}}}}
assert.Equal(t, []string{"foo"}, x.GetMainContainerNames())
})
}

func TestTemplate_HasSequencedContainers(t *testing.T) {
t.Run("Default", func(t *testing.T) {
x := &Template{}
assert.False(t, x.HasSequencedContainers())
})
t.Run("ContainerSet", func(t *testing.T) {
x := &Template{ContainerSet: &ContainerSetTemplate{Containers: []ContainerNode{{Dependencies: []string{""}}}}}
assert.True(t, x.HasSequencedContainers())
})
}

func TestTemplate_GetVolumeMounts(t *testing.T) {
t.Run("Default", func(t *testing.T) {
x := &Template{}
assert.Empty(t, x.GetVolumeMounts())
})
t.Run("Container", func(t *testing.T) {
x := &Template{Container: &corev1.Container{VolumeMounts: []corev1.VolumeMount{{}}}}
assert.NotEmpty(t, x.GetVolumeMounts())
})
t.Run("ContainerSet", func(t *testing.T) {
x := &Template{ContainerSet: &ContainerSetTemplate{VolumeMounts: []corev1.VolumeMount{{}}}}
assert.NotEmpty(t, x.GetVolumeMounts())
})
t.Run("Script", func(t *testing.T) {
x := &Template{Script: &ScriptTemplate{Container: corev1.Container{VolumeMounts: []corev1.VolumeMount{{}}}}}
assert.NotEmpty(t, x.GetVolumeMounts())
})
}

func TestTemplate_HasOutputs(t *testing.T) {
t.Run("Default", func(t *testing.T) {
x := &Template{}
assert.False(t, x.HasOutput())
})
t.Run("Container", func(t *testing.T) {
x := &Template{Container: &corev1.Container{}}
assert.True(t, x.HasOutput())
})
t.Run("ContainerSet", func(t *testing.T) {
t.Run("NoMain", func(t *testing.T) {
x := &Template{ContainerSet: &ContainerSetTemplate{}}
assert.False(t, x.HasOutput())
})
t.Run("Main", func(t *testing.T) {
x := &Template{ContainerSet: &ContainerSetTemplate{Containers: []ContainerNode{{Container: corev1.Container{Name: "main"}}}}}
assert.True(t, x.HasOutput())
})
})
t.Run("Script", func(t *testing.T) {
x := &Template{Script: &ScriptTemplate{}}
assert.True(t, x.HasOutput())
})
t.Run("Data", func(t *testing.T) {
x := &Template{Data: &Data{}}
assert.True(t, x.HasOutput())
})
t.Run("Resource", func(t *testing.T) {
x := &Template{Resource: &ResourceTemplate{}}
assert.False(t, x.HasOutput())
})
}

func TestTemplate_SaveLogsAsArtifact(t *testing.T) {
t.Run("Default", func(t *testing.T) {
x := &Template{}
assert.False(t, x.SaveLogsAsArtifact())
})
t.Run("IsArchiveLogs", func(t *testing.T) {
x := &Template{ArchiveLocation: &ArtifactLocation{ArchiveLogs: pointer.BoolPtr(true)}}
assert.True(t, x.SaveLogsAsArtifact())
})
t.Run("ContainerSet", func(t *testing.T) {
t.Run("NoMain", func(t *testing.T) {
x := &Template{ArchiveLocation: &ArtifactLocation{ArchiveLogs: pointer.BoolPtr(true)}, ContainerSet: &ContainerSetTemplate{}}
assert.False(t, x.SaveLogsAsArtifact())
})
t.Run("Main", func(t *testing.T) {
x := &Template{ArchiveLocation: &ArtifactLocation{ArchiveLogs: pointer.BoolPtr(true)}, ContainerSet: &ContainerSetTemplate{Containers: []ContainerNode{{Container: corev1.Container{Name: "main"}}}}}
assert.True(t, x.SaveLogsAsArtifact())
})
})
}
2 changes: 1 addition & 1 deletion test/e2e/cli_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -765,7 +765,7 @@ func (s *CLISuite) TestWorkflowRetry() {
WaitForWorkflow(fixtures.ToStart).
WaitForWorkflow(fixtures.Condition(func(wf *wfv1.Workflow) (bool, string) {
return wf.Status.AnyActiveSuspendNode(), "suspended node"
})).
}), time.Minute).
RunCli([]string{"terminate", "retry-test"}, func(t *testing.T, output string, err error) {
if assert.NoError(t, err) {
assert.Contains(t, output, "workflow retry-test terminated")
Expand Down
2 changes: 1 addition & 1 deletion test/e2e/functional_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ func (s *FunctionalSuite) TestArchiveStrategies() {
Workflow(`@testdata/archive-strategies.yaml`).
When().
SubmitWorkflow().
WaitForWorkflow().
WaitForWorkflow(time.Minute).
Then().
ExpectWorkflow(func(t *testing.T, _ *metav1.ObjectMeta, status *wfv1.WorkflowStatus) {
assert.Equal(t, wfv1.WorkflowSucceeded, status.Phase)
Expand Down
46 changes: 10 additions & 36 deletions test/e2e/signals_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ import (
"github.com/argoproj/argo-workflows/v3/test/e2e/fixtures"
)

const kill2xDuration = 70 * time.Second

// Tests the use of signals to kill containers.
// argoproj/argosay:v2 does not contain sh, so you must use argoproj/argosay:v1.
// Killing often requires SIGKILL, which is issued 30s after SIGTERM. So tests need longer (>30s) timeout.
Expand Down Expand Up @@ -92,7 +94,7 @@ func (s *SignalsSuite) TestDoNotCreatePodsUnderStopBehavior() {
assert.NoError(t, err)
assert.Regexp(t, "workflow stop-terminate-.* stopped", output)
}).
WaitForWorkflow(1 * time.Minute).
WaitForWorkflow(kill2xDuration).
Then().
ExpectWorkflow(func(t *testing.T, m *metav1.ObjectMeta, status *wfv1.WorkflowStatus) {
assert.Equal(t, wfv1.WorkflowFailed, status.Phase)
Expand All @@ -105,47 +107,19 @@ func (s *SignalsSuite) TestDoNotCreatePodsUnderStopBehavior() {
})
}

func (s *SignalsSuite) TestPropagateMaxDuration() {
s.T().Skip("too hard to get working")
func (s *SignalsSuite) TestSidecars() {
s.Given().
Workflow(`
apiVersion: argoproj.io/v1alpha1
kind: Workflow
metadata:
name: retry-backoff-2
spec:
entrypoint: retry-backoff
templates:
- name: retry-backoff
retryStrategy:
limit: 10
backoff:
duration: "1"
factor: 1
maxDuration: "10"
container:
image: argoproj/argosay:v1
command: [sh, -c]
args: ["sleep $(( {{retries}} * 40 )); exit 1"]
`).
Workflow("@testdata/sidecar-workflow.yaml").
When().
SubmitWorkflow().
WaitForWorkflow(1 * time.Minute).
Then().
ExpectWorkflow(func(t *testing.T, _ *metav1.ObjectMeta, status *wfv1.WorkflowStatus) {
assert.Contains(t, []wfv1.WorkflowPhase{wfv1.WorkflowFailed, wfv1.WorkflowError}, status.Phase)
assert.Len(t, status.Nodes, 3)
node := status.Nodes.FindByDisplayName("retry-backoff-2(1)")
if assert.NotNil(t, node) {
assert.Equal(t, wfv1.NodeFailed, node.Phase)
}
})
WaitForWorkflow(fixtures.ToBeSucceeded, kill2xDuration)
}

func (s *SignalsSuite) TestSidecars() {
// make sure Istio/Anthos and other sidecar injectors will work
func (s *SignalsSuite) TestInjectedSidecar() {
s.Need(fixtures.None(fixtures.Emissary))
s.Given().
Workflow("@testdata/sidecar-workflow.yaml").
Workflow("@testdata/sidecar-injected-workflow.yaml").
When().
SubmitWorkflow().
WaitForWorkflow(1 * time.Minute).
Expand Down
22 changes: 22 additions & 0 deletions test/e2e/testdata/sidecar-injected-workflow.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
apiVersion: argoproj.io/v1alpha1
kind: Workflow
metadata:
generateName: sidecar-injected-
spec:
entrypoint: main
podSpecPatch: |
terminationGracePeriodSeconds: 3
containers:
- name: wait
- name: main
- name: sidecar
image: argoproj/argosay:v1
command:
- sh
- -c
args:
- "sleep 999"
templates:
- name: main
container:
image: argoproj/argosay:v1
9 changes: 3 additions & 6 deletions test/e2e/testdata/sidecar-workflow.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,8 @@ spec:
- name: main
container:
image: argoproj/argosay:v1
args: [ sleep, "5s" ]
sidecars:
- name: sidecar-0
- name: sidecar
image: argoproj/argosay:v1
args: [ sleep, "999s" ]
- name: sidecar-1
image: argoproj/argosay:v1
args: [ sleep, "999s" ]
command: [ sh, -c ]
args: [ "sleep 999" ]
Loading

0 comments on commit c2c4410

Please sign in to comment.