Refactor NTO to use controller runtime lib #302

Closed
4 changes: 2 additions & 2 deletions Dockerfile
@@ -1,4 +1,4 @@
FROM registry.ci.openshift.org/openshift/release:golang-1.16 AS builder
FROM registry.ci.openshift.org/openshift/release:golang-1.17 AS builder
WORKDIR /go/src/github.com/openshift/cluster-node-tuning-operator
COPY . .
RUN make build
@@ -20,7 +20,7 @@ RUN INSTALL_PKGS=" \

FROM centos:8
COPY --from=builder /go/src/github.com/openshift/cluster-node-tuning-operator/_output/cluster-node-tuning-operator /usr/bin/
COPY manifests/*.yaml manifests/image-references /manifests
COPY manifests/*.yaml manifests/image-references /manifests/
ENV APP_ROOT=/var/lib/tuned
ENV PATH=${APP_ROOT}/bin:${PATH}
ENV HOME=${APP_ROOT}
2 changes: 1 addition & 1 deletion Dockerfile.rhel8
@@ -20,7 +20,7 @@ RUN INSTALL_PKGS=" \

FROM registry.ci.openshift.org/ocp/4.10:base
COPY --from=builder /go/src/github.com/openshift/cluster-node-tuning-operator/_output/cluster-node-tuning-operator /usr/bin/
COPY manifests/*.yaml manifests/image-references /manifests
COPY manifests/*.yaml manifests/image-references /manifests/
Contributor:
I'm glad to see I'm not the only one who hit this :)

(Context: when COPY has multiple sources, Docker requires the destination to be a directory, and it must end with a trailing /.)

ENV APP_ROOT=/var/lib/tuned
ENV PATH=${APP_ROOT}/bin:${PATH}
ENV HOME=${APP_ROOT}
102 changes: 86 additions & 16 deletions cmd/cluster-node-tuning-operator/main.go
@@ -2,63 +2,133 @@ package main

import (
"flag"
"fmt"
"os"
"path/filepath"
"runtime"

apiconfigv1 "github.com/openshift/api/config/v1"
mcov1 "github.com/openshift/machine-config-operator/pkg/apis/machineconfiguration.openshift.io/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
apiruntime "k8s.io/apimachinery/pkg/runtime"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
_ "k8s.io/client-go/plugin/pkg/client/auth/gcp"
"k8s.io/klog/v2"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/cache"

tunedv1 "github.com/openshift/cluster-node-tuning-operator/pkg/apis/tuned/v1"
"github.com/openshift/cluster-node-tuning-operator/pkg/config"
"github.com/openshift/cluster-node-tuning-operator/pkg/metrics"
"github.com/openshift/cluster-node-tuning-operator/pkg/operator"
"github.com/openshift/cluster-node-tuning-operator/pkg/signals"
"github.com/openshift/cluster-node-tuning-operator/pkg/tuned"
"github.com/openshift/cluster-node-tuning-operator/pkg/util"
"github.com/openshift/cluster-node-tuning-operator/version"
)

const (
operandFilename = "openshift-tuned"
operatorFilename = "cluster-node-tuning-operator"
metricsHost = "0.0.0.0"
)

var (
boolVersion = flag.Bool("version", false, "show program version and exit")
boolLocal = flag.Bool("local", false, "local run outside a pod")
scheme = apiruntime.NewScheme()
)

func init() {
utilruntime.Must(clientgoscheme.AddToScheme(scheme))
utilruntime.Must(tunedv1.AddToScheme(scheme))
utilruntime.Must(mcov1.AddToScheme(scheme))
utilruntime.Must(apiconfigv1.Install(scheme))
}

func printVersion() {
klog.Infof("Go Version: %s", runtime.Version())
klog.Infof("Go OS/Arch: %s/%s", runtime.GOOS, runtime.GOARCH)
klog.Infof("%s Version: %s", tunedv1.TunedClusterOperatorResourceName, version.Version)
}

func main() {
const (
operandFilename string = "openshift-tuned"
operatorFilename string = "cluster-node-tuning-operator"
)
var boolVersion bool
var enableLeaderElection bool
flag.BoolVar(&boolVersion, "version", false, "show program version and exit")
flag.BoolVar(&enableLeaderElection, "enable-leader-election", true,
"Enable leader election for controller manager. "+
"Enabling this will ensure there is only one active controller manager.")

runAs := filepath.Base(os.Args[0])

stopCh := signals.SetupSignalHandler()

switch runAs {
case operatorFilename:
var metricsAddr string
flag.StringVar(&metricsAddr, "metrics-addr", fmt.Sprintf("%s:%d", metricsHost, metrics.MetricsPort),
"The address the metric endpoint binds to.")

klog.InitFlags(nil)
flag.Parse()

printVersion()

if *boolVersion {
if boolVersion {
os.Exit(0)
}

go metrics.RunServer(metrics.MetricsPort, stopCh)
metrics.RegisterVersion(version.Version)
controller, err := operator.NewController()
// We have two namespaces that we need to watch:
// 1. NTO namespace - for NTO resources
// 2. None namespace - for cluster wide resources
ntoNamespace := config.OperatorNamespace()
Contributor:
The metrics-related configuration is missing.

Contributor (Author):
At this point the metrics configuration lives in server.go, which is added as a runnable to the manager.
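For context: the manager starts anything registered via mgr.Add that implements controller-runtime's single-method Runnable interface. A minimal sketch of a metrics server in that shape — the package layout, port, and Prometheus handler are illustrative assumptions, not necessarily what NTO's server.go contains:

```go
package metrics

import (
	"context"
	"net/http"

	"github.com/prometheus/client_golang/prometheus/promhttp"
)

// Server satisfies manager.Runnable, so mgr.Add(metrics.Server{}) ties the
// HTTP server's lifecycle to the manager's.
type Server struct{}

func (s Server) Start(ctx context.Context) error {
	mux := http.NewServeMux()
	mux.Handle("/metrics", promhttp.Handler()) // assumption: Prometheus metrics

	srv := &http.Server{Addr: ":60000", Handler: mux} // illustrative port

	errCh := make(chan error, 1)
	go func() { errCh <- srv.ListenAndServe() }()

	select {
	case <-ctx.Done(): // manager is shutting down; stop serving
		return srv.Close()
	case err := <-errCh: // server failed on its own
		return err
	}
}
```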

namespaces := []string{
ntoNamespace,
metav1.NamespaceNone,
Contributor:
Probably no need, because MC and Node resources are cluster-scoped.

yanirq (Author) commented Jan 13, 2022:
I understand that this is because MultiNamespacedCacheBuilder by default creates a global cache for cluster-scoped resources. But we also need to watch Pods in all namespaces when Tuned CRs are pod-label driven. In that case, would you also recommend removing metav1.NamespaceNone from the cache builder?

Contributor:
Mm, good point; let's leave it for now.

}

restConfig := ctrl.GetConfigOrDie()
le := util.GetLeaderElectionConfig(restConfig, enableLeaderElection)
mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
NewCache: cache.MultiNamespacedCacheBuilder(namespaces),
camilamacedo86 commented Feb 25, 2022:
Why would you like to use this option? How many namespaces would you like to pass here?

Note that if you try to cache all (or nearly all) namespaces on the cluster, you will probably run into performance issues. Please check the comment: https://github.com/kubernetes-sigs/controller-runtime/blob/master/pkg/cache/multi_namespace_cache.go#L40-L46

Contributor (Author):
The main reason here was to watch cluster resources such as Nodes and Pods, and to have a distinct namespace for NTO to use for filtering.

Scheme: scheme,
LeaderElection: true,
LeaderElectionID: config.OperatorLockName,
LeaderElectionNamespace: ntoNamespace,
LeaseDuration: &le.LeaseDuration.Duration,
camilamacedo86 commented Feb 25, 2022:
Why are you defining a lease duration? See:

// LeaseDuration is the duration that non-leader candidates will
// wait to force acquire leadership. This is measured against time of
// last observed ack. Default is 15 seconds.
LeaseDuration *time.Duration

Is the default not good enough?

Contributor (Author):
This was to keep the original behavior from before the refactor.
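For reference, controller-runtime's defaults here are a 15s lease duration, 10s renew deadline, and 2s retry period. A sketch of overriding them explicitly — the literal values below are just those defaults, since the operator's real numbers come from util.GetLeaderElectionConfig and are not visible in this diff:

```go
package operator

import (
	"time"

	ctrl "sigs.k8s.io/controller-runtime"
)

// leaderElectionOptions shows the override mechanism only; NTO substitutes
// its pre-refactor timings for these controller-runtime defaults.
func leaderElectionOptions() ctrl.Options {
	lease := 15 * time.Second // default LeaseDuration
	renew := 10 * time.Second // default RenewDeadline
	retry := 2 * time.Second  // default RetryPeriod
	return ctrl.Options{
		LeaderElection: true,
		LeaseDuration:  &lease,
		RenewDeadline:  &renew,
		RetryPeriod:    &retry,
	}
}
```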

RetryPeriod: &le.RetryPeriod.Duration,
RenewDeadline: &le.RenewDeadline.Duration,
Namespace: ntoNamespace,

Reviewer:
You are passing both a Namespace and NewCache: cache.MultiNamespacedCacheBuilder(namespaces). Note that you need permissions (scope) to read/update/delete — and in this way cache — resources where the operator will be installed. Then you can:

a) Watch/cache resources in a set of namespaces: use MultiNamespacedCacheBuilder from Options to watch and manage resources in a set of namespaces.

OR

b) Watch/cache resources in a single namespace (the one where the operator will be installed) by using the Namespace option. See:

// Namespace, if specified, restricts the manager's cache to watch objects in
// the desired namespace. Defaults to all namespaces.
//
// Note: If a namespace is specified, controllers can still Watch for a
// cluster-scoped resource (e.g. Node). For namespaced resources the cache
// will only hold objects from the desired namespace.
Namespace string

Ref: https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.11.1/pkg/manager#Manager

OR

c) Do neither, which means granting cluster-scope permissions to the project; your operator will then watch/cache the whole cluster.

Be aware that the more resources and namespaces you watch and cache, the more memory and CPU you will consume.
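A minimal sketch of option (b), assuming controller-runtime v0.11: the cache holds namespaced objects from a single namespace, while cluster-scoped resources such as Nodes remain watchable. The function name is illustrative:

```go
package operator

import (
	ctrl "sigs.k8s.io/controller-runtime"
)

// newSingleNamespaceManager restricts the manager's cache to one namespace.
// Cluster-scoped objects (Node, ClusterOperator, MachineConfig) can still be
// watched and read through the same manager.
func newSingleNamespaceManager(operatorNamespace string) (ctrl.Manager, error) {
	return ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
		Namespace: operatorNamespace,
	})
}
```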

})

if err != nil {
klog.Fatal(err)
klog.Exit(err)
}

mgr.Add(metrics.Server{})
Contributor:
What is the reason for having two reconcilers? If we need them, I believe we definitely need to add a clear explanation of what each reconciler is supposed to do: brief here, detailed in their code. I often find that comments/docs make me think about what I coded and whether it is coded well -- then I often rewrite the code... One "reconciler" file is called controller.go, the other one reconciler.go -- not sure why the inconsistency. Perhaps we want to make the names a bit more "telling"? They both also include comments saying they "... reconcile a Tuned object ...". However, at least in controller.go we also sync MachineConfigs. Also, where is the split if they both "reconcile" a Tuned object?

yanirq (Author) commented Jan 27, 2022:
The file names were kept to preserve some sort of a diff (at least for the controller), but I now intend to rename:
reconciler.go -> cvo_controller.go
controller.go -> tuned_controller.go
The struct names will be changed accordingly to be clearer.

More detailed comments will be added in main to explain this, but in general:
cvo_controller reconciles requests for CVO objects. It owns the (default) Tuned and the DaemonSet it creates. It also watches all other Tuned objects and Profiles in order to update the operator's status.
tuned_controller reconciles requests for Tuned objects and performs the logic for syncing Profiles/Nodes/rendered Tuned in correlation with MachineConfigs.

I agree there are more cleanups to be made to clarify the code and to move or remove some code sections from one controller to the other. I do think that two reconcilers/controllers is the right shape for the implementation here, but I'm open to changes. (cc @cynepco3hahue)

Contributor:
> The file names were kept to preserve some sort of a diff [...] The struct names will be changed accordingly to be clearer.

It is still not clear to me why we need two controllers, but I'm not against it, as it might aid modularity and a clean separation of tasks. But the "tasks" need to be clear, agreed upon, and sensible. If we really do need two (or even more) controllers, then perhaps a controller prefix to separate them would be better, e.g. controller_whatever.go. But controller_cvo.go still seems like a strange and misleading name; we need to come up with a better one if we do split. I do not have any better suggestions at this point.

Contributor:
In general, the assumption is that a controller should work on only one type of resource. As you said, that is better from a modularity and readability perspective. So the idea is to have:

  1. an operator controller to deploy the default set of resources (the default Tuned and the default DaemonSet)
  2. a controller to handle the behavior around the Tuned resources.

camilamacedo86 commented Feb 25, 2022:
What do you think about using the SDK tool to refactor this and adopting the default layout? (See https://github.com/operator-framework/operator-sdk/tree/master/testdata/go/v3/memcached-operator.) In that case we would have one controller reconciling each API, named kind_controller; see https://github.com/operator-framework/operator-sdk/tree/master/testdata/go/v3/memcached-operator/controllers

Regarding why to have more than one controller: I'd suggest one controller for each Kind/API provided by this project; see https://sdk.operatorframework.io/docs/best-practices/common-recommendation/#avoid-a-design-solution-where-more-than-one-kind-is-reconciled-by-the-same-controller
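To make the proposed split concrete, here is a hedged sketch of how the Tuned controller wired up below might register its watches. The For/Owns choices are illustrative assumptions, not the PR's actual SetupWithManager:

```go
package operator

import (
	"context"

	appsv1 "k8s.io/api/apps/v1"
	"k8s.io/apimachinery/pkg/runtime"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"

	tunedv1 "github.com/openshift/cluster-node-tuning-operator/pkg/apis/tuned/v1"
)

// TunedReconciler mirrors the struct populated in main.go below.
type TunedReconciler struct {
	Client    client.Client
	Scheme    *runtime.Scheme
	Namespace string
}

// Reconcile would sync Profiles, rendered Tuneds and MachineConfigs (elided).
func (r *TunedReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	return ctrl.Result{}, nil
}

// SetupWithManager registers the controller with the manager: Tuned CRs are
// the primary resource; changes to owned DaemonSets requeue their owner.
func (r *TunedReconciler) SetupWithManager(mgr ctrl.Manager) error {
	return ctrl.NewControllerManagedBy(mgr).
		For(&tunedv1.Tuned{}).
		Owns(&appsv1.DaemonSet{}).
		Complete(r)
}
```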

metrics.RegisterVersion(version.Version)

if err = (&operator.Reconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Namespace: ntoNamespace,
}).SetupWithManager(mgr); err != nil {
klog.Exitf("unable to create node tuning operator controller: %v", err)
}

if err = (&operator.TunedReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Namespace: ntoNamespace,
}).SetupWithManager(mgr); err != nil {
klog.Exitf("unable to create Tuned controller: %v", err)
}

controller.BecomeLeader(stopCh)
if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
klog.Exitf("manager exited with non-zero code: %v", err)
}
case operandFilename:
tuned.Run(stopCh, boolVersion, version.Version)
stopCh := signals.SetupSignalHandler()
tuned.Run(stopCh, &boolVersion, version.Version)
default:
klog.Fatalf("application should be run as \"%s\" or \"%s\"", operatorFilename, operandFilename)
klog.Fatalf("application should be run as %q or %q", operatorFilename, operandFilename)
}
}
31 changes: 20 additions & 11 deletions go.mod
@@ -6,8 +6,8 @@ require (
github.com/coreos/ignition/v2 v2.7.0
github.com/google/uuid v1.1.2
github.com/kevinburke/go-bindata v3.16.0+incompatible
github.com/onsi/ginkgo v1.14.0
github.com/onsi/gomega v1.10.1
github.com/onsi/ginkgo v1.16.5
github.com/onsi/gomega v1.17.0
github.com/openshift/api v0.0.0-20211209135129-c58d9f695577
github.com/openshift/build-machinery-go v0.0.0-20211213093930-7e33a7eb4ce3
github.com/openshift/client-go v0.0.0-20211209144617-7385dd6338e3
@@ -24,10 +24,12 @@
k8s.io/klog v1.0.0
k8s.io/klog/v2 v2.30.0
k8s.io/utils v0.0.0-20210930125809-cb0fa318a74b
sigs.k8s.io/controller-runtime v0.11.0

sigs.k8s.io/controller-tools v0.4.0
)

require (
cloud.google.com/go v0.81.0 // indirect
github.com/PuerkitoBio/purell v1.1.1 // indirect
github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 // indirect
github.com/ajeddeloh/go-json v0.0.0-20170920214419-6a2fe990e083 // indirect
@@ -39,31 +41,31 @@
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/emicklei/go-restful v2.10.0+incompatible // indirect
github.com/evanphx/json-patch v4.12.0+incompatible // indirect
github.com/fatih/color v1.7.0 // indirect
github.com/fsnotify/fsnotify v1.4.9 // indirect
github.com/fatih/color v1.12.0 // indirect
github.com/fsnotify/fsnotify v1.5.1 // indirect
github.com/ghodss/yaml v1.0.0 // indirect
github.com/go-logr/logr v1.2.0 // indirect
github.com/go-openapi/jsonpointer v0.19.5 // indirect
github.com/go-openapi/jsonreference v0.19.5 // indirect
github.com/go-openapi/swag v0.19.14 // indirect
github.com/gobuffalo/flect v0.2.0 // indirect
github.com/gobuffalo/flect v0.2.3 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/golang/protobuf v1.5.2 // indirect
github.com/google/go-cmp v0.5.5 // indirect
github.com/google/go-cmp v0.5.6 // indirect
github.com/google/gofuzz v1.1.0 // indirect
github.com/googleapis/gnostic v0.5.5 // indirect
github.com/imdario/mergo v0.3.9 // indirect
github.com/imdario/mergo v0.3.12 // indirect
github.com/inconshreveable/mousetrap v1.0.0 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/mailru/easyjson v0.7.6 // indirect
github.com/mattn/go-colorable v0.1.2 // indirect
github.com/mattn/go-colorable v0.1.8 // indirect
github.com/mattn/go-isatty v0.0.12 // indirect
github.com/matttproud/golang_protobuf_extensions v1.0.2-0.20181231171920-c182affec369 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/nxadm/tail v1.4.4 // indirect
github.com/nxadm/tail v1.4.8 // indirect
github.com/prometheus/client_model v0.2.0 // indirect
github.com/prometheus/common v0.28.0 // indirect
github.com/prometheus/procfs v0.6.0 // indirect
@@ -73,24 +75,26 @@
golang.org/x/mod v0.4.2 // indirect
golang.org/x/net v0.0.0-20210825183410-e898025ed96a // indirect
golang.org/x/oauth2 v0.0.0-20210819190943-2bc19b11175f // indirect
golang.org/x/sys v0.0.0-20210831042530-f4d43177bf5e // indirect
golang.org/x/sys v0.0.0-20211029165221-6e7872819dc8 // indirect
golang.org/x/term v0.0.0-20210615171337-6886f2dfbf5b // indirect
golang.org/x/text v0.3.7 // indirect
golang.org/x/time v0.0.0-20210723032227-1f47c861a9ac // indirect
golang.org/x/tools v0.1.6-0.20210820212750-d4cc65f0b2ff // indirect
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
gomodules.xyz/jsonpatch/v2 v2.2.0 // indirect
google.golang.org/appengine v1.6.7 // indirect
google.golang.org/protobuf v1.27.1 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b // indirect
k8s.io/apiextensions-apiserver v0.23.0 // indirect
k8s.io/component-base v0.23.0 // indirect
k8s.io/gengo v0.0.0-20210813121822-485abfe95c7c // indirect
k8s.io/kube-openapi v0.0.0-20211115234752-e816edb12b65 // indirect
sigs.k8s.io/json v0.0.0-20211020170558-c049b76a60c6 // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.2.0 // indirect
sigs.k8s.io/yaml v1.2.0 // indirect
sigs.k8s.io/yaml v1.3.0 // indirect
)

// Pinned to kubernetes-1.23.0
@@ -105,6 +109,8 @@ replace (
k8s.io/cluster-bootstrap => k8s.io/cluster-bootstrap v0.23.0
k8s.io/code-generator => k8s.io/code-generator v0.23.0
k8s.io/component-base => k8s.io/component-base v0.23.0
k8s.io/component-helpers => k8s.io/component-helpers v0.23.0
k8s.io/controller-manager => k8s.io/controller-manager v0.23.0
k8s.io/cri-api => k8s.io/cri-api v0.23.0
k8s.io/csi-translation-lib => k8s.io/csi-translation-lib v0.23.0
k8s.io/kube-aggregator => k8s.io/kube-aggregator v0.23.0
@@ -116,5 +122,8 @@ replace (
k8s.io/kubernetes => k8s.io/kubernetes v1.23.0
k8s.io/legacy-cloud-providers => k8s.io/legacy-cloud-providers v0.23.0
k8s.io/metrics => k8s.io/metrics v0.23.0
k8s.io/mount-utils => k8s.io/mount-utils v0.23.0
k8s.io/sample-apiserver => k8s.io/sample-apiserver v0.23.0
sigs.k8s.io/controller-runtime => sigs.k8s.io/controller-runtime v0.11.0
sigs.k8s.io/controller-tools => sigs.k8s.io/controller-tools v0.7.0
Comment on lines +127 to +128:
Why do you need these replace directives? Why do you import sigs.k8s.io/controller-tools?

)