Skip to content

Commit

Permalink
create Pod Disruption Budget for DCA and CCR deployments (#1454)
Browse files Browse the repository at this point in the history
* starting pdb stuff

* create pdb with flags to enable it in override

* cleanup

* add tests

* double check CLC features flag before adding pdb

* cleanup

* fix bug in test

* code fixes

* fix error logging

* added pdb testing

* update config doc

* remove pdb logging

---------

Co-authored-by: levan-m <116471169+levan-m@users.noreply.github.com>
  • Loading branch information
swang392 and levan-m authored Nov 8, 2024
1 parent 795158e commit 54f43d1
Show file tree
Hide file tree
Showing 14 changed files with 205 additions and 4 deletions.
5 changes: 5 additions & 0 deletions api/datadoghq/v2alpha1/datadogagent_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -1419,6 +1419,11 @@ type DatadogAgentComponentOverride struct {
// +optional
Replicas *int32 `json:"replicas,omitempty"`

// Set CreatePodDisruptionBudget to true to create a PodDisruptionBudget for this component.
// Not applicable for the Node Agent. A Cluster Agent PDB is set with 1 minimum available pod, and a Cluster Checks Runner PDB is set with 1 maximum unavailable pod.
// +optional
CreatePodDisruptionBudget *bool `json:"createPodDisruptionBudget,omitempty"`

// Set CreateRbac to false to prevent automatic creation of Role/ClusterRole for this component
// +optional
CreateRbac *bool `json:"createRbac,omitempty"`
Expand Down
5 changes: 5 additions & 0 deletions api/datadoghq/v2alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions config/crd/bases/v1/datadoghq.com_datadogagents.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3915,6 +3915,11 @@ spec:
`agent`, `cluster-agent`, `init-config`, `init-volume`, `process-agent`, `seccomp-setup`,
`security-agent`, `system-probe`, and `trace-agent`.
type: object
createPodDisruptionBudget:
description: |-
Set CreatePodDisruptionBudget to true to create a PodDisruptionBudget for this component.
Not applicable for the Node Agent. A Cluster Agent PDB is set with 1 minimum available pod, and a Cluster Checks Runner PDB is set with 1 maximum unavailable pod.
type: boolean
createRbac:
description: Set CreateRbac to false to prevent automatic creation of Role/ClusterRole for this component
type: boolean
Expand Down
1 change: 1 addition & 0 deletions docs/configuration.v2alpha1.md
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,7 @@ In the table, `spec.override.nodeAgent.image.name` and `spec.override.nodeAgent.
| [key].containers.[key].securityContext.windowsOptions.hostProcess | HostProcess determines if a container should be run as a 'Host Process' container. All of a Pod's containers must have the same effective HostProcess value (it is not allowed to have a mix of HostProcess containers and non-HostProcess containers). In addition, if HostProcess is true then HostNetwork must also be set to true. |
| [key].containers.[key].securityContext.windowsOptions.runAsUserName | The UserName in Windows to run the entrypoint of the container process. Defaults to the user specified in image metadata if unspecified. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. |
| [key].containers.[key].volumeMounts `[]object` | Specify additional volume mounts in the container. |
| [key].createPodDisruptionBudget | Set CreatePodDisruptionBudget to true to create a PodDisruptionBudget for this component. Not applicable for the Node Agent. A Cluster Agent PDB is set with 1 minimum available pod, and a Cluster Checks Runner PDB is set with 1 maximum unavailable pod. |
| [key].createRbac | Set CreateRbac to false to prevent automatic creation of Role/ClusterRole for this component |
| [key].customConfigurations `map[string]object` | CustomConfiguration allows to specify custom configuration files for `datadog.yaml`, `datadog-cluster.yaml`, `security-agent.yaml`, and `system-probe.yaml`. The content is merged with configuration generated by the Datadog Operator, with priority given to custom configuration. WARNING: It is possible to override values set in the `DatadogAgent`. |
| [key].customConfigurations.[key].configData | ConfigData corresponds to the configuration file content. |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,11 @@ func GetClusterAgentServiceName(dda metav1.Object) string {
return fmt.Sprintf("%s-%s", dda.GetName(), v2alpha1.DefaultClusterAgentResourceSuffix)
}

// GetClusterAgentPodDisruptionBudgetName returns the name of the Cluster-Agent
// PodDisruptionBudget, built from the DatadogAgent name, the Cluster-Agent
// resource suffix and a trailing "-pdb".
func GetClusterAgentPodDisruptionBudgetName(dda metav1.Object) string {
	clusterAgentName := fmt.Sprintf("%s-%s", dda.GetName(), v2alpha1.DefaultClusterAgentResourceSuffix)
	return clusterAgentName + "-pdb"
}

// GetClusterAgentName returns the Cluster-Agent name based on the DatadogAgent name
func GetClusterAgentName(dda metav1.Object) string {
return fmt.Sprintf("%s-%s", dda.GetName(), v2alpha1.DefaultClusterAgentResourceSuffix)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"strconv"
"testing"

"github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1"
datadoghqv2alpha1 "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1"
apiutils "github.com/DataDog/datadog-operator/api/utils"
"github.com/DataDog/datadog-operator/internal/controller/datadogagent/common"
Expand Down Expand Up @@ -38,6 +39,18 @@ func Test_defaultClusterAgentDeployment(t *testing.T) {

assert.Empty(t, testutils.CompareKubeResource(&deployment.Spec.Template, expectedDeployment))
}
// Test_getPodDisruptionBudget checks the name and spec of the PodDisruptionBudget
// generated for the Cluster Agent: MinAvailable is set, MaxUnavailable is not.
func Test_getPodDisruptionBudget(t *testing.T) {
	agent := &v2alpha1.DatadogAgent{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "my-datadog-agent",
			Namespace: "some-namespace",
		},
	}

	got := GetClusterAgentPodDisruptionBudget(agent)
	wantMinAvailable := intstr.FromInt(pdbMinAvailableInstances)

	assert.Equal(t, "my-datadog-agent-cluster-agent-pdb", got.Name)
	assert.Equal(t, wantMinAvailable, *got.Spec.MinAvailable)
	assert.Nil(t, got.Spec.MaxUnavailable)
}

func clusterAgentExpectedPodTemplate(dda *datadoghqv2alpha1.DatadogAgent) *corev1.PodTemplateSpec {
podTemplate := &corev1.PodTemplateSpec{
Expand Down
26 changes: 26 additions & 0 deletions internal/controller/datadogagent/component/clusteragent/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,16 @@ import (
"github.com/DataDog/datadog-operator/pkg/controller/utils/comparison"

corev1 "k8s.io/api/core/v1"
policyv1 "k8s.io/api/policy/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/apimachinery/pkg/version"
)

// pdbMinAvailableInstances is the MinAvailable value set on the Cluster-Agent
// PodDisruptionBudget.
const pdbMinAvailableInstances = 1

// GetClusterAgentService returns the Cluster-Agent service
func GetClusterAgentService(dda metav1.Object) *corev1.Service {
labels := object.GetDefaultLabels(dda, v2alpha1.DefaultClusterAgentResourceSuffix, GetClusterAgentVersion(dda))
Expand Down Expand Up @@ -53,6 +58,27 @@ func GetClusterAgentService(dda metav1.Object) *corev1.Service {
return service
}

// GetClusterAgentPodDisruptionBudget returns the Cluster-Agent PodDisruptionBudget.
//
// The budget lives in the DatadogAgent namespace, requires
// pdbMinAvailableInstances available pods, and selects pods by the
// DatadogAgent name label and the Cluster-Agent component label.
func GetClusterAgentPodDisruptionBudget(dda metav1.Object) *policyv1.PodDisruptionBudget {
	minAvailable := intstr.FromInt(pdbMinAvailableInstances)
	selector := &metav1.LabelSelector{
		MatchLabels: map[string]string{
			apicommon.AgentDeploymentNameLabelKey:      dda.GetName(),
			apicommon.AgentDeploymentComponentLabelKey: v2alpha1.DefaultClusterAgentResourceSuffix,
		},
	}

	return &policyv1.PodDisruptionBudget{
		ObjectMeta: metav1.ObjectMeta{
			Name:      GetClusterAgentPodDisruptionBudgetName(dda),
			Namespace: dda.GetNamespace(),
		},
		Spec: policyv1.PodDisruptionBudgetSpec{
			MinAvailable: &minAvailable,
			Selector:     selector,
		},
	}
}

// GetMetricsServerServiceName returns the external metrics provider service name
func GetMetricsServerServiceName(dda metav1.Object) string {
return fmt.Sprintf("%s-%s", dda.GetName(), v2alpha1.DefaultMetricsServerResourceSuffix)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ import (

appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
policyv1 "k8s.io/api/policy/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"

apicommon "github.com/DataDog/datadog-operator/api/datadoghq/common"
"github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1"
Expand All @@ -21,6 +23,10 @@ import (
"github.com/DataDog/datadog-operator/pkg/defaulting"
)

// pdbMaxUnavailableInstances is the MaxUnavailable value set on the
// Cluster-Checks-Runner PodDisruptionBudget.
const pdbMaxUnavailableInstances = 1

// GetClusterChecksRunnerName returns the Cluster-Checks-Runner name based on the DatadogAgent name
func GetClusterChecksRunnerName(dda metav1.Object) string {
return fmt.Sprintf("%s-%s", dda.GetName(), v2alpha1.DefaultClusterChecksRunnerResourceSuffix)
Expand Down Expand Up @@ -82,6 +88,30 @@ func NewDefaultClusterChecksRunnerPodTemplateSpec(dda metav1.Object) *corev1.Pod
return template
}

// GetClusterChecksRunnerPodDisruptionBudgetName returns the name of the
// Cluster-Checks-Runner PodDisruptionBudget, built from the DatadogAgent name,
// the Cluster-Checks-Runner resource suffix and a trailing "-pdb".
func GetClusterChecksRunnerPodDisruptionBudgetName(dda metav1.Object) string {
	runnerName := fmt.Sprintf("%s-%s", dda.GetName(), v2alpha1.DefaultClusterChecksRunnerResourceSuffix)
	return runnerName + "-pdb"
}

// GetClusterChecksRunnerPodDisruptionBudget returns the Cluster-Checks-Runner
// PodDisruptionBudget.
//
// The budget lives in the DatadogAgent namespace, allows at most
// pdbMaxUnavailableInstances unavailable pods, and selects pods by the
// DatadogAgent name label and the Cluster-Checks-Runner component label.
func GetClusterChecksRunnerPodDisruptionBudget(dda metav1.Object) *policyv1.PodDisruptionBudget {
	maxUnavailable := intstr.FromInt(pdbMaxUnavailableInstances)
	selector := &metav1.LabelSelector{
		MatchLabels: map[string]string{
			apicommon.AgentDeploymentNameLabelKey:      dda.GetName(),
			apicommon.AgentDeploymentComponentLabelKey: v2alpha1.DefaultClusterChecksRunnerResourceSuffix,
		},
	}

	return &policyv1.PodDisruptionBudget{
		ObjectMeta: metav1.ObjectMeta{
			Name:      GetClusterChecksRunnerPodDisruptionBudgetName(dda),
			Namespace: dda.GetNamespace(),
		},
		Spec: policyv1.PodDisruptionBudgetSpec{
			MaxUnavailable: &maxUnavailable,
			Selector:       selector,
		},
	}
}

// getDefaultServiceAccountName returns the default Cluster-Checks-Runner ServiceAccountName
func getDefaultServiceAccountName(dda metav1.Object) string {
return fmt.Sprintf("%s-%s", dda.GetName(), v2alpha1.DefaultClusterChecksRunnerResourceSuffix)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1"
"github.com/stretchr/testify/assert"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
)

func Test_getDefaultServiceAccountName(t *testing.T) {
Expand All @@ -23,3 +24,16 @@ func Test_getDefaultServiceAccountName(t *testing.T) {

assert.Equal(t, "my-datadog-agent-cluster-checks-runner", getDefaultServiceAccountName(&dda))
}

// Test_getPodDisruptionBudget checks the name and spec of the PodDisruptionBudget
// generated for the Cluster Checks Runner: MaxUnavailable is set, MinAvailable is not.
func Test_getPodDisruptionBudget(t *testing.T) {
	agent := &v2alpha1.DatadogAgent{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "my-datadog-agent",
			Namespace: "some-namespace",
		},
	}

	got := GetClusterChecksRunnerPodDisruptionBudget(agent)
	wantMaxUnavailable := intstr.FromInt(pdbMaxUnavailableInstances)

	assert.Equal(t, "my-datadog-agent-cluster-checks-runner-pdb", got.Name)
	assert.Equal(t, wantMaxUnavailable, *got.Spec.MaxUnavailable)
	assert.Nil(t, got.Spec.MinAvailable)
}
2 changes: 0 additions & 2 deletions internal/controller/datadogagent/controller_reconcile_v2.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,15 +88,13 @@ func (r *Reconciler) internalReconcileV2(ctx context.Context, request reconcile.
// Set default values for GlobalConfig and Features
instanceCopy := instance.DeepCopy()
datadoghqv2alpha1.DefaultDatadogAgent(instanceCopy)

return r.reconcileInstanceV2(ctx, reqLogger, instanceCopy)
}

func (r *Reconciler) reconcileInstanceV2(ctx context.Context, logger logr.Logger, instance *datadoghqv2alpha1.DatadogAgent) (reconcile.Result, error) {
var result reconcile.Result
newStatus := instance.Status.DeepCopy()
now := metav1.NewTime(time.Now())

features, requiredComponents := feature.BuildFeatures(instance, reconcilerOptionsToFeatureOptions(&r.options, logger))
// update list of enabled features for metrics forwarder
r.updateMetricsForwardersFeatures(instance, features)
Expand Down
49 changes: 49 additions & 0 deletions internal/controller/datadogagent/controller_v2_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,11 @@ import (
assert "github.com/stretchr/testify/require"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
policyv1 "k8s.io/api/policy/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/client-go/kubernetes/scheme"
"k8s.io/client-go/tools/record"
"sigs.k8s.io/controller-runtime/pkg/client"
Expand Down Expand Up @@ -367,6 +369,34 @@ func TestReconcileDatadogAgentV2_Reconcile(t *testing.T) {
return verifyDaemonsetContainers(c, resourcesNamespace, dsName, expectedContainers)
},
},
{
name: "DatadogAgent with PDB enabled",
fields: fields{
client: fake.NewClientBuilder().WithStatusSubresource(&appsv1.DaemonSet{}, &v2alpha1.DatadogAgent{}, &policyv1.PodDisruptionBudget{}).Build(),
scheme: s,
recorder: recorder,
},
args: args{
request: newRequest(resourcesNamespace, resourcesName),
loadFunc: func(c client.Client) {
dda := v2alpha1test.NewInitializedDatadogAgentBuilder(resourcesNamespace, resourcesName).
WithComponentOverride(v2alpha1.ClusterAgentComponentName, v2alpha1.DatadogAgentComponentOverride{
CreatePodDisruptionBudget: apiutils.NewBoolPointer(true),
}).
WithClusterChecksUseCLCEnabled(true).
WithComponentOverride(v2alpha1.ClusterChecksRunnerComponentName, v2alpha1.DatadogAgentComponentOverride{
CreatePodDisruptionBudget: apiutils.NewBoolPointer(true),
}).
Build()
_ = c.Create(context.TODO(), dda)
},
},
want: reconcile.Result{RequeueAfter: defaultRequeueDuration},
wantErr: false,
wantFunc: func(c client.Client) error {
return verifyPDB(t, c)
},
},
}

for _, tt := range tests {
Expand Down Expand Up @@ -564,3 +594,22 @@ func verifyDaemonsetNames(t *testing.T, c client.Client, resourcesNamespace, dsN
assert.Equal(t, expectedDSNames, actualDSNames)
return nil
}

// verifyPDB checks that reconciliation produced exactly two PodDisruptionBudgets:
// one for the Cluster Agent (MinAvailable 1, no MaxUnavailable) and one for the
// Cluster Checks Runner (MaxUnavailable 1, no MinAvailable).
//
// It returns an error only when listing the PDBs fails; expectation mismatches
// are reported through t.
func verifyPDB(t *testing.T, c client.Client) error {
	pdbList := policyv1.PodDisruptionBudgetList{}
	if err := c.List(context.TODO(), &pdbList); err != nil {
		return err
	}
	// Len reports the actual item count on failure, unlike a bare boolean check.
	assert.Len(t, pdbList.Items, 2)

	// Look the PDBs up by name so the checks do not depend on the order in
	// which the client happens to return the items.
	pdbsByName := make(map[string]policyv1.PodDisruptionBudget, len(pdbList.Items))
	for _, pdb := range pdbList.Items {
		pdbsByName[pdb.Name] = pdb
	}

	dcaPDB, found := pdbsByName["foo-cluster-agent-pdb"]
	assert.True(t, found, "PodDisruptionBudget foo-cluster-agent-pdb not found")
	// Guard the dereference below so a missing value fails the test instead of panicking.
	assert.NotNil(t, dcaPDB.Spec.MinAvailable)
	assert.Equal(t, intstr.FromInt(1), *dcaPDB.Spec.MinAvailable)
	assert.Nil(t, dcaPDB.Spec.MaxUnavailable)

	ccrPDB, found := pdbsByName["foo-cluster-checks-runner-pdb"]
	assert.True(t, found, "PodDisruptionBudget foo-cluster-checks-runner-pdb not found")
	assert.NotNil(t, ccrPDB.Spec.MaxUnavailable)
	assert.Equal(t, intstr.FromInt(1), *ccrPDB.Spec.MaxUnavailable)
	assert.Nil(t, ccrPDB.Spec.MinAvailable)
	return nil
}
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,6 @@ func (f *defaultFeature) Configure(dda *v2alpha1.DatadogAgent) feature.RequiredC
if dda.Spec.Global.DisableNonResourceRules != nil && *dda.Spec.Global.DisableNonResourceRules {
f.disableNonResourceRules = true
}

if dda.Spec.Global.Credentials != nil {
creds := dda.Spec.Global.Credentials

Expand Down Expand Up @@ -223,7 +222,6 @@ func (f *defaultFeature) Configure(dda *v2alpha1.DatadogAgent) feature.RequiredC
},
}
}

}

// ManageDependencies allows a feature to manage its dependencies.
Expand Down
23 changes: 23 additions & 0 deletions internal/controller/datadogagent/override/dependencies.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ import (
"k8s.io/apimachinery/pkg/util/errors"

"github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1"
componentdca "github.com/DataDog/datadog-operator/internal/controller/datadogagent/component/clusteragent"
componentccr "github.com/DataDog/datadog-operator/internal/controller/datadogagent/component/clusterchecksrunner"
"github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature"
"github.com/DataDog/datadog-operator/internal/controller/datadogagent/object"
"github.com/DataDog/datadog-operator/internal/controller/datadogagent/object/configmap"
Expand Down Expand Up @@ -42,11 +44,32 @@ func Dependencies(logger logr.Logger, manager feature.ResourceManagers, dda *v2a
// Handle custom check files
checksdCMName := fmt.Sprintf(extraChecksdConfigMapName, strings.ToLower((string(component))))
errs = append(errs, overrideExtraConfigs(logger, manager, override.ExtraChecksd, namespace, checksdCMName, false)...)

errs = append(errs, overridePodDisruptionBudget(logger, manager, dda, override.CreatePodDisruptionBudget, component)...)
}

return errs
}

// overridePodDisruptionBudget adds a PodDisruptionBudget to the dependency store
// when the component override sets CreatePodDisruptionBudget to true.
//
// Only the Cluster Agent and the Cluster Checks Runner are handled; for any
// other component, or when createPdb is nil or false, nothing is stored.
// The Cluster Checks Runner PDB is skipped only when
// Features.ClusterChecks.UseClusterChecksRunners is explicitly false.
//
// It returns the errors reported by the store, or nil when there are none.
// logger is currently unused.
func overridePodDisruptionBudget(logger logr.Logger, manager feature.ResourceManagers, dda *v2alpha1.DatadogAgent, createPdb *bool, component v2alpha1.ComponentName) (errs []error) {
	if createPdb == nil || !*createPdb {
		return nil
	}

	switch component {
	case v2alpha1.ClusterAgentComponentName:
		pdb := componentdca.GetClusterAgentPodDisruptionBudget(dda)
		if err := manager.Store().AddOrUpdate(kubernetes.PodDisruptionBudgetsKind, pdb); err != nil {
			errs = append(errs, err)
		}
	case v2alpha1.ClusterChecksRunnerComponentName:
		// Features and ClusterChecks are optional pointers; check each level
		// before dereferencing so a DatadogAgent without them does not panic.
		// An unset value at any level is handled like an unset
		// UseClusterChecksRunners flag: the PDB is still created.
		if features := dda.Spec.Features; features != nil && features.ClusterChecks != nil {
			if useRunners := features.ClusterChecks.UseClusterChecksRunners; useRunners != nil && !*useRunners {
				return nil
			}
		}
		pdb := componentccr.GetClusterChecksRunnerPodDisruptionBudget(dda)
		if err := manager.Store().AddOrUpdate(kubernetes.PodDisruptionBudgetsKind, pdb); err != nil {
			errs = append(errs, err)
		}
	}
	return errs
}

func overrideRBAC(logger logr.Logger, manager feature.ResourceManagers, override *v2alpha1.DatadogAgentComponentOverride, component v2alpha1.ComponentName, namespace string) error {
var errs []error

Expand Down
29 changes: 29 additions & 0 deletions internal/controller/datadogagent/override/dependencies_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,35 @@ func TestDependencies(t *testing.T) {
},
expectsErrors: false,
},
{
name: "override clusterAgent createPDB without errors",
dda: v2alpha1.DatadogAgent{
Spec: v2alpha1.DatadogAgentSpec{
Override: map[v2alpha1.ComponentName]*v2alpha1.DatadogAgentComponentOverride{
v2alpha1.ClusterAgentComponentName: {
CreatePodDisruptionBudget: apiutils.NewBoolPointer(true),
},
},
},
},
},
{
name: "override clusterChecksRunner createPDB without errors",
dda: v2alpha1.DatadogAgent{
Spec: v2alpha1.DatadogAgentSpec{
Override: map[v2alpha1.ComponentName]*v2alpha1.DatadogAgentComponentOverride{
v2alpha1.ClusterChecksRunnerComponentName: {
CreatePodDisruptionBudget: apiutils.NewBoolPointer(true),
},
},
Features: &v2alpha1.DatadogFeatures{
ClusterChecks: &v2alpha1.ClusterChecksFeatureConfig{
UseClusterChecksRunners: apiutils.NewBoolPointer(true),
},
},
},
},
},
}

for _, test := range tests {
Expand Down

0 comments on commit 54f43d1

Please sign in to comment.