Skip to content

Commit

Permalink
feat: add ignoreResourceUpdates to reduce controller CPU usage (#13534
Browse files Browse the repository at this point in the history
) (#13912)

* feat: ignore watched resource update

Signed-off-by: Alexandre Gaudreault <alexandre.gaudreault@logmein.com>

* add doc and CLI

Signed-off-by: Alexandre Gaudreault <alexandre.gaudreault@logmein.com>

* update doc index

Signed-off-by: Alexandre Gaudreault <alexandre.gaudreault@logmein.com>

* add command

Signed-off-by: Alexandre Gaudreault <alexandre.gaudreault@logmein.com>

* codegen

Signed-off-by: Alexandre Gaudreault <alexandre.gaudreault@logmein.com>

* revert formatting

Signed-off-by: Alexandre Gaudreault <alexandre.gaudreault@logmein.com>

* do not skip on health change

Signed-off-by: Alexandre Gaudreault <alexandre.gaudreault@logmein.com>

* update doc

Signed-off-by: Alexandre Gaudreault <alexandre.gaudreault@logmein.com>

* update logging to use context

Signed-off-by: Alexandre Gaudreault <alexandre.gaudreault@logmein.com>

* fix typos. local build broken...

Signed-off-by: Alexandre Gaudreault <alexandre.gaudreault@logmein.com>

* change after review

Signed-off-by: Alexandre Gaudreault <alexandre.gaudreault@logmein.com>

* manifestHash to string

Signed-off-by: Alexandre Gaudreault <alexandre.gaudreault@logmein.com>

* more doc

Signed-off-by: Alexandre Gaudreault <alexandre.gaudreault@logmein.com>

* example for argoproj Application

Signed-off-by: Alexandre Gaudreault <alexandre.gaudreault@logmein.com>

* add unit test for ignored logs

Signed-off-by: Alexandre Gaudreault <alexandre.gaudreault@logmein.com>

* codegen

Signed-off-by: Alexandre Gaudreault <alexandre.gaudreault@logmein.com>

* Update docs/operator-manual/reconcile.md

Co-authored-by: Michael Crenshaw <350466+crenshaw-dev@users.noreply.github.com>
Signed-off-by: Alexandre Gaudreault <alexandre.gaudreault@logmein.com>

* move hash and set log to debug

Signed-off-by: Alexandre Gaudreault <alexandre.gaudreault@logmein.com>

* Update util/settings/settings.go

Co-authored-by: Michael Crenshaw <350466+crenshaw-dev@users.noreply.github.com>
Signed-off-by: Alexandre Gaudreault <alexandre.gaudreault@logmein.com>

* Update util/settings/settings.go

Co-authored-by: Michael Crenshaw <350466+crenshaw-dev@users.noreply.github.com>
Signed-off-by: Alexandre Gaudreault <alexandre.gaudreault@logmein.com>

* feature flag

Signed-off-by: Michael Crenshaw <350466+crenshaw-dev@users.noreply.github.com>

* fix

Signed-off-by: Michael Crenshaw <350466+crenshaw-dev@users.noreply.github.com>

* less aggressive managedFields ignore rule

Signed-off-by: Michael Crenshaw <350466+crenshaw-dev@users.noreply.github.com>

* Update docs/operator-manual/reconcile.md

Co-authored-by: Alexandre Gaudreault <alexandre.gaudreault@logmein.com>

* use local settings

Signed-off-by: Michael Crenshaw <350466+crenshaw-dev@users.noreply.github.com>

* latest settings

Signed-off-by: Michael Crenshaw <350466+crenshaw-dev@users.noreply.github.com>

* safety first

Signed-off-by: Michael Crenshaw <350466+crenshaw-dev@users.noreply.github.com>

* since it's behind a feature flag, go aggressive on overrides

Signed-off-by: Michael Crenshaw <350466+crenshaw-dev@users.noreply.github.com>

---------

Signed-off-by: Alexandre Gaudreault <alexandre.gaudreault@logmein.com>
Signed-off-by: Michael Crenshaw <350466+crenshaw-dev@users.noreply.github.com>
Co-authored-by: Michael Crenshaw <350466+crenshaw-dev@users.noreply.github.com>
  • Loading branch information
agaudreault and crenshaw-dev authored Jun 25, 2023
1 parent 771012b commit 88994ea
Show file tree
Hide file tree
Showing 24 changed files with 1,509 additions and 677 deletions.
1 change: 1 addition & 0 deletions USERS.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ Currently, the following organizations are **officially** using Argo CD:
1. [Glovo](https://www.glovoapp.com)
1. [GMETRI](https://gmetri.com/)
1. [Gojek](https://www.gojek.io/)
1. [GoTo](https://www.goto.com/)
1. [GoTo Financial](https://gotofinancial.com/)
1. [Greenpass](https://www.greenpass.com.br/)
1. [Gridfuse](https://gridfuse.com/)
Expand Down
3 changes: 3 additions & 0 deletions assets/swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -8007,6 +8007,9 @@
"ignoreDifferences": {
"$ref": "#/definitions/v1alpha1OverrideIgnoreDiff"
},
"ignoreResourceUpdates": {
"$ref": "#/definitions/v1alpha1OverrideIgnoreDiff"
},
"knownTypeFields": {
"type": "array",
"items": {
Expand Down
72 changes: 72 additions & 0 deletions cmd/argocd/commands/admin/settings.go
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,7 @@ func NewResourceOverridesCommand(cmdCtx commandContext) *cobra.Command {
},
}
command.AddCommand(NewResourceIgnoreDifferencesCommand(cmdCtx))
command.AddCommand(NewResourceIgnoreResourceUpdatesCommand(cmdCtx))
command.AddCommand(NewResourceActionListCommand(cmdCtx))
command.AddCommand(NewResourceActionRunCommand(cmdCtx))
command.AddCommand(NewResourceHealthCommand(cmdCtx))
Expand Down Expand Up @@ -380,6 +381,31 @@ func executeResourceOverrideCommand(ctx context.Context, cmdCtx commandContext,
callback(res, override, overrides)
}

func executeIgnoreResourceUpdatesOverrideCommand(ctx context.Context, cmdCtx commandContext, args []string, callback func(res unstructured.Unstructured, override v1alpha1.ResourceOverride, overrides map[string]v1alpha1.ResourceOverride)) {
data, err := os.ReadFile(args[0])
errors.CheckError(err)

res := unstructured.Unstructured{}
errors.CheckError(yaml.Unmarshal(data, &res))

settingsManager, err := cmdCtx.createSettingsManager(ctx)
errors.CheckError(err)

overrides, err := settingsManager.GetIgnoreResourceUpdatesOverrides()
errors.CheckError(err)
gvk := res.GroupVersionKind()
key := gvk.Kind
if gvk.Group != "" {
key = fmt.Sprintf("%s/%s", gvk.Group, gvk.Kind)
}
override, hasOverride := overrides[key]
if !hasOverride {
_, _ = fmt.Printf("No overrides configured for '%s/%s'\n", gvk.Group, gvk.Kind)
return
}
callback(res, override, overrides)
}

func NewResourceIgnoreDifferencesCommand(cmdCtx commandContext) *cobra.Command {
var command = &cobra.Command{
Use: "ignore-differences RESOURCE_YAML_PATH",
Expand Down Expand Up @@ -430,6 +456,52 @@ argocd admin settings resource-overrides ignore-differences ./deploy.yaml --argo
return command
}

func NewResourceIgnoreResourceUpdatesCommand(cmdCtx commandContext) *cobra.Command {
var command = &cobra.Command{
Use: "ignore-resource-updates RESOURCE_YAML_PATH",
Short: "Renders fields excluded from resource updates",
Long: "Renders ignored fields using the 'ignoreResourceUpdates' setting specified in the 'resource.customizations' field of 'argocd-cm' ConfigMap",
Example: `
argocd admin settings resource-overrides ignore-resource-updates ./deploy.yaml --argocd-cm-path ./argocd-cm.yaml`,
Run: func(c *cobra.Command, args []string) {
ctx := c.Context()

if len(args) < 1 {
c.HelpFunc()(c, args)
os.Exit(1)
}

executeIgnoreResourceUpdatesOverrideCommand(ctx, cmdCtx, args, func(res unstructured.Unstructured, override v1alpha1.ResourceOverride, overrides map[string]v1alpha1.ResourceOverride) {
gvk := res.GroupVersionKind()
if len(override.IgnoreResourceUpdates.JSONPointers) == 0 && len(override.IgnoreResourceUpdates.JQPathExpressions) == 0 {
_, _ = fmt.Printf("Ignore resource updates are not configured for '%s/%s'\n", gvk.Group, gvk.Kind)
return
}

normalizer, err := normalizers.NewIgnoreNormalizer(nil, overrides)
errors.CheckError(err)

normalizedRes := res.DeepCopy()
logs := collectLogs(func() {
errors.CheckError(normalizer.Normalize(normalizedRes))
})
if logs != "" {
_, _ = fmt.Println(logs)
}

if reflect.DeepEqual(&res, normalizedRes) {
_, _ = fmt.Printf("No fields are ignored by ignoreResourceUpdates settings: \n%s\n", override.IgnoreResourceUpdates)
return
}

_, _ = fmt.Printf("Following fields are ignored:\n\n")
_ = cli.PrintDiff(res.GetName(), &res, normalizedRes)
})
},
}
return command
}

func NewResourceHealthCommand(cmdCtx commandContext) *cobra.Command {
var command = &cobra.Command{
Use: "health RESOURCE_YAML_PATH",
Expand Down
23 changes: 12 additions & 11 deletions controller/appcontroller.go
Original file line number Diff line number Diff line change
Expand Up @@ -359,17 +359,18 @@ func (ctrl *ApplicationController) handleObjectUpdated(managedByApp map[string]b
level = CompareWithRecent
}

// Additional check for debug level so we don't need to evaluate the
// format string in case of non-debug scenarios
if log.GetLevel() >= log.DebugLevel {
var resKey string
if ref.Namespace != "" {
resKey = ref.Namespace + "/" + ref.Name
} else {
resKey = "(cluster-scoped)/" + ref.Name
}
log.Debugf("Refreshing app %s for change in cluster of object %s of type %s/%s", appKey, resKey, ref.APIVersion, ref.Kind)
}
namespace := ref.Namespace
if ref.Namespace == "" {
namespace = "(cluster-scoped)"
}
log.WithFields(log.Fields{
"application": appKey,
"level": level,
"namespace": namespace,
"name": ref.Name,
"api-version": ref.APIVersion,
"kind": ref.Kind,
}).Debug("Requesting app refresh caused by object update")

ctrl.requestAppRefresh(app.QualifiedName(), &level, nil)
}
Expand Down
75 changes: 74 additions & 1 deletion controller/cache/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (
"k8s.io/client-go/tools/cache"

"github.com/argoproj/argo-cd/v2/controller/metrics"
"github.com/argoproj/argo-cd/v2/pkg/apis/application"
appv1 "github.com/argoproj/argo-cd/v2/pkg/apis/application/v1alpha1"
"github.com/argoproj/argo-cd/v2/util/argo"
"github.com/argoproj/argo-cd/v2/util/db"
Expand Down Expand Up @@ -149,6 +150,8 @@ type ResourceInfo struct {
PodInfo *PodInfo
// NodeInfo is available for nodes only
NodeInfo *NodeInfo

manifestHash string
}

func NewLiveStateCache(
Expand Down Expand Up @@ -178,6 +181,11 @@ type cacheSettings struct {
clusterSettings clustercache.Settings
appInstanceLabelKey string
trackingMethod appv1.TrackingMethod
// resourceOverrides provides a list of ignored differences to ignore watched resource updates
resourceOverrides map[string]appv1.ResourceOverride

// ignoreResourceUpdates is a flag to enable resource-ignore rules.
ignoreResourceUpdatesEnabled bool
}

type liveStateCache struct {
Expand All @@ -200,6 +208,14 @@ func (c *liveStateCache) loadCacheSettings() (*cacheSettings, error) {
if err != nil {
return nil, err
}
resourceUpdatesOverrides, err := c.settingsMgr.GetIgnoreResourceUpdatesOverrides()
if err != nil {
return nil, err
}
ignoreResourceUpdatesEnabled, err := c.settingsMgr.GetIsIgnoreResourceUpdatesEnabled()
if err != nil {
return nil, err
}
resourcesFilter, err := c.settingsMgr.GetResourcesFilter()
if err != nil {
return nil, err
Expand All @@ -212,7 +228,8 @@ func (c *liveStateCache) loadCacheSettings() (*cacheSettings, error) {
ResourceHealthOverride: lua.ResourceHealthOverrides(resourceOverrides),
ResourcesFilter: resourcesFilter,
}
return &cacheSettings{clusterSettings, appInstanceLabelKey, argo.GetTrackingMethod(c.settingsMgr)}, nil

return &cacheSettings{clusterSettings, appInstanceLabelKey, argo.GetTrackingMethod(c.settingsMgr), resourceUpdatesOverrides, ignoreResourceUpdatesEnabled}, nil
}

func asResourceNode(r *clustercache.Resource) appv1.ResourceNode {
Expand Down Expand Up @@ -309,6 +326,27 @@ func skipAppRequeuing(key kube.ResourceKey) bool {
return ignoredRefreshResources[key.Group+"/"+key.Kind]
}

func skipResourceUpdate(oldInfo, newInfo *ResourceInfo) bool {
if oldInfo == nil || newInfo == nil {
return false
}
isSameHealthStatus := (oldInfo.Health == nil && newInfo.Health == nil) || oldInfo.Health != nil && newInfo.Health != nil && oldInfo.Health.Status == newInfo.Health.Status
isSameManifest := oldInfo.manifestHash != "" && newInfo.manifestHash != "" && oldInfo.manifestHash == newInfo.manifestHash
return isSameHealthStatus && isSameManifest
}

// shouldHashManifest validates if the API resource needs to be hashed.
// If there's an app name from resource tracking, or if this is itself an app, we should generate a hash.
// Otherwise, the hashing should be skipped to save CPU time.
func shouldHashManifest(appName string, gvk schema.GroupVersionKind) bool {
// Only hash if the resource belongs to an app.
// Best - Only hash for resources that are part of an app or their dependencies
// (current) - Only hash for resources that are part of an app + all apps that might be from an ApplicationSet
// Orphan - If orphan is enabled, hash should be made on all resource of that namespace and a config to disable it
// Worst - Hash all resources watched by Argo
return appName != "" || (gvk.Group == application.Group && gvk.Kind == application.ApplicationKind)
}

// isRetryableError is a helper method to see whether an error
// returned from the dynamic client is potentially retryable.
func isRetryableError(err error) bool {
Expand Down Expand Up @@ -424,14 +462,25 @@ func (c *liveStateCache) getCluster(server string) (clustercache.ClusterCache, e
c.lock.RLock()
cacheSettings := c.cacheSettings
c.lock.RUnlock()

res.Health, _ = health.GetResourceHealth(un, cacheSettings.clusterSettings.ResourceHealthOverride)

appName := c.resourceTracking.GetAppName(un, cacheSettings.appInstanceLabelKey, cacheSettings.trackingMethod)
if isRoot && appName != "" {
res.AppName = appName
}

gvk := un.GroupVersionKind()

if cacheSettings.ignoreResourceUpdatesEnabled && shouldHashManifest(appName, gvk) {
hash, err := generateManifestHash(un, nil, cacheSettings.resourceOverrides)
if err != nil {
log.Errorf("Failed to generate manifest hash: %v", err)
} else {
res.manifestHash = hash
}
}

// edge case. we do not label CRDs, so they miss the tracking label we inject. But we still
// want the full resource to be available in our cache (to diff), so we store all CRDs
return res, res.AppName != "" || gvk.Kind == kube.CustomResourceDefinitionKind
Expand All @@ -450,6 +499,30 @@ func (c *liveStateCache) getCluster(server string) (clustercache.ClusterCache, e
} else {
ref = oldRes.Ref
}

c.lock.RLock()
cacheSettings := c.cacheSettings
c.lock.RUnlock()

if cacheSettings.ignoreResourceUpdatesEnabled && oldRes != nil && newRes != nil && skipResourceUpdate(resInfo(oldRes), resInfo(newRes)) {
// Additional check for debug level so we don't need to evaluate the
// format string in case of non-debug scenarios
if log.GetLevel() >= log.DebugLevel {
namespace := ref.Namespace
if ref.Namespace == "" {
namespace = "(cluster-scoped)"
}
log.WithFields(log.Fields{
"server": clusterCache.GetClusterInfo().Server,
"namespace": namespace,
"name": ref.Name,
"api-version": ref.APIVersion,
"kind": ref.Kind,
}).Debug("Ignoring change of object because none of the watched resource fields have changed")
}
return
}

for _, r := range []*clustercache.Resource{newRes, oldRes} {
if r == nil {
continue
Expand Down
Loading

0 comments on commit 88994ea

Please sign in to comment.