From 032812ba61a71a51896996c4f569f779fdd69957 Mon Sep 17 00:00:00 2001 From: Wei Fu Date: Thu, 27 Feb 2020 15:50:00 +0800 Subject: [PATCH] reload cni network config if there are fs change events With the Go RWMutex design, no goroutine should expect to be able to acquire a read lock until the read lock has been released, if one goroutine calls Lock. The original design is to reload cni network config on every single Status CRI gRPC call. If one RunPodSandbox request holds the read lock to allocate IP for too long, all other RunPodSandbox/StopPodSandbox requests will wait for the RunPodSandbox request to release the read lock. And the Status CRI call will fail and the kubelet becomes NotReady. Reloading cni network config at every single Status CRI call is not necessary and also brings the NotReady situation. To lower the possibility of NotReady, CRI will reload cni network config if there are any valid fs change events from the cni network config dir. Signed-off-by: Wei Fu --- pkg/server/cni.go | 100 ++++++++++++++++++++++++++++++++++ pkg/server/service.go | 24 +++++++- pkg/server/service_unix.go | 5 -- pkg/server/service_windows.go | 6 -- pkg/server/status.go | 10 ++-- 5 files changed, 128 insertions(+), 17 deletions(-) create mode 100644 pkg/server/cni.go diff --git a/pkg/server/cni.go b/pkg/server/cni.go new file mode 100644 index 000000000..d8a02deec --- /dev/null +++ b/pkg/server/cni.go @@ -0,0 +1,100 @@ +package server + +import ( + "sync" + + cni "github.com/containerd/go-cni" + "github.com/fsnotify/fsnotify" + "github.com/pkg/errors" + "github.com/sirupsen/logrus" +) + +// cniNetConfSyncer is used to reload cni network conf triggered by fs change +// events. +type cniNetConfSyncer struct { + // only used for lastSyncStatus + sync.RWMutex + lastSyncStatus error + + watcher *fsnotify.Watcher + confDir string + netPlugin cni.CNI + loadOpts []cni.CNIOpt +} + +// newCNINetConfSyncer creates cni network conf syncer. 
+func newCNINetConfSyncer(confDir string, netPlugin cni.CNI, loadOpts []cni.CNIOpt) (*cniNetConfSyncer, error) { + watcher, err := fsnotify.NewWatcher() + if err != nil { + return nil, errors.Wrap(err, "failed to create fsnotify watcher") + } + + if err := watcher.Add(confDir); err != nil { + return nil, errors.Wrapf(err, "failed to watch cni conf dir %s", confDir) + } + + syncer := &cniNetConfSyncer{ + watcher: watcher, + confDir: confDir, + netPlugin: netPlugin, + loadOpts: loadOpts, + } + + if err := syncer.netPlugin.Load(syncer.loadOpts...); err != nil { + logrus.WithError(err).Error("failed to load cni during init, please check CRI plugin status before setting up network for pods") + syncer.updateLastStatus(err) + } + return syncer, nil +} + +// syncLoop monitors any fs change events from cni conf dir and tries to reload +// cni configuration. +func (syncer *cniNetConfSyncer) syncLoop() error { + for { + select { + case event := <-syncer.watcher.Events: + // Only reload config when receiving write/rename/remove + // events + // + // FIXME(fuweid): Might only reload target cni config + // files to prevent no-ops. + if event.Op&(fsnotify.Chmod|fsnotify.Create) > 0 { + logrus.Debugf("ignore event from cni conf dir: %s", event) + continue + } + logrus.Debugf("receiving change event from cni conf dir: %s", event) + + lerr := syncer.netPlugin.Load(syncer.loadOpts...) + if lerr != nil { + logrus.WithError(lerr). + Errorf("failed to reload cni configuration after receiving fs change event(%s)", event) + } + syncer.updateLastStatus(lerr) + + case err := <-syncer.watcher.Errors: + if err != nil { + logrus.WithError(err).Error("failed to continue sync cni conf change") + return err + } + } + } +} + +// lastStatus retrieves last sync status. +func (syncer *cniNetConfSyncer) lastStatus() error { + syncer.RLock() + defer syncer.RUnlock() + return syncer.lastSyncStatus +} + +// updateLastStatus will be called after every single cni load. 
+func (syncer *cniNetConfSyncer) updateLastStatus(err error) { + syncer.Lock() + defer syncer.Unlock() + syncer.lastSyncStatus = err +} + +// stop stops watcher in the syncLoop. +func (syncer *cniNetConfSyncer) stop() error { + return syncer.watcher.Close() +} diff --git a/pkg/server/service.go b/pkg/server/service.go index b7fd1c36a..46a84fe1d 100644 --- a/pkg/server/service.go +++ b/pkg/server/service.go @@ -91,6 +91,9 @@ type criService struct { // initialized indicates whether the server is initialized. All GRPC services // should return error before the server is initialized. initialized atomic.Bool + // cniNetConfMonitor is used to reload cni network conf if there is + // any valid fs change events from cni network conf dir. + cniNetConfMonitor *cniNetConfSyncer } // NewCRIService returns a new instance of CRIService @@ -128,6 +131,11 @@ func NewCRIService(config criconfig.Config, client *containerd.Client) (CRIServi c.eventMonitor = newEventMonitor(c) + c.cniNetConfMonitor, err = newCNINetConfSyncer(c.config.NetworkPluginConfDir, c.netPlugin, c.cniLoadOptions()) + if err != nil { + return nil, errors.Wrap(err, "failed to create cni conf monitor") + } + return c, nil } @@ -169,6 +177,14 @@ func (c *criService) Run() error { ) snapshotsSyncer.start() + // Start CNI network conf syncer + logrus.Info("Start cni network conf syncer") + cniNetConfMonitorErrCh := make(chan error, 1) + go func() { + defer close(cniNetConfMonitorErrCh) + cniNetConfMonitorErrCh <- c.cniNetConfMonitor.syncLoop() + }() + // Start streaming server. logrus.Info("Start streaming server") streamServerErrCh := make(chan error) @@ -183,11 +199,12 @@ func (c *criService) Run() error { // Set the server as initialized. GRPC services could start serving traffic. c.initialized.Set() - var eventMonitorErr, streamServerErr error + var eventMonitorErr, streamServerErr, cniNetConfMonitorErr error // Stop the whole CRI service if any of the critical service exits. 
select { case eventMonitorErr = <-eventMonitorErrCh: case streamServerErr = <-streamServerErrCh: + case cniNetConfMonitorErr = <-cniNetConfMonitorErrCh: } if err := c.Close(); err != nil { return errors.Wrap(err, "failed to stop cri service") @@ -222,6 +239,9 @@ func (c *criService) Run() error { if streamServerErr != nil { return errors.Wrap(streamServerErr, "stream server error") } + if cniNetConfMonitorErr != nil { + return errors.Wrap(cniNetConfMonitorErr, "cni network conf monitor error") + } return nil } @@ -233,7 +253,7 @@ func (c *criService) Close() error { if err := c.streamServer.Stop(); err != nil { return errors.Wrap(err, "failed to stop stream server") } - return nil + return c.cniNetConfMonitor.stop() } func (c *criService) register(s *grpc.Server) error { diff --git a/pkg/server/service_unix.go b/pkg/server/service_unix.go index 51753eb78..d5c78906a 100644 --- a/pkg/server/service_unix.go +++ b/pkg/server/service_unix.go @@ -60,11 +60,6 @@ func (c *criService) initPlatform() error { return errors.Wrap(err, "failed to initialize cni") } - // Try to load the config if it exists. Just log the error if load fails - // This is not disruptive for containerd to panic - if err := c.netPlugin.Load(c.cniLoadOptions()...); err != nil { - logrus.WithError(err).Error("Failed to load cni during init, please check CRI plugin status before setting up network for pods") - } return nil } diff --git a/pkg/server/service_windows.go b/pkg/server/service_windows.go index 7b1c35f2f..acd41ace0 100644 --- a/pkg/server/service_windows.go +++ b/pkg/server/service_windows.go @@ -21,7 +21,6 @@ package server import ( cni "github.com/containerd/go-cni" "github.com/pkg/errors" - "github.com/sirupsen/logrus" ) // windowsNetworkAttachCount is the minimum number of networks the PodSandbox @@ -44,11 +43,6 @@ func (c *criService) initPlatform() error { return errors.Wrap(err, "failed to initialize cni") } - // Try to load the config if it exists. 
Just log the error if load fails - // This is not disruptive for containerd to panic - if err := c.netPlugin.Load(c.cniLoadOptions()...); err != nil { - logrus.WithError(err).Error("Failed to load cni during init, please check CRI plugin status before setting up network for pods") - } return nil } diff --git a/pkg/server/status.go b/pkg/server/status.go index af2521781..e31183636 100644 --- a/pkg/server/status.go +++ b/pkg/server/status.go @@ -41,10 +41,6 @@ func (c *criService) Status(ctx context.Context, r *runtime.StatusRequest) (*run Type: runtime.NetworkReady, Status: true, } - // Load the latest cni configuration to be in sync with the latest network configuration - if err := c.netPlugin.Load(c.cniLoadOptions()...); err != nil { - log.G(ctx).WithError(err).Errorf("Failed to load cni configuration") - } // Check the status of the cni initialization if err := c.netPlugin.Status(); err != nil { networkCondition.Status = false @@ -76,6 +72,12 @@ func (c *criService) Status(ctx context.Context, r *runtime.StatusRequest) (*run log.G(ctx).WithError(err).Errorf("Failed to marshal CNI config %v", err) } resp.Info["cniconfig"] = string(cniConfig) + + lastCNILoadStatus := "OK" + if lerr := c.cniNetConfMonitor.lastStatus(); lerr != nil { + lastCNILoadStatus = lerr.Error() + } + resp.Info["lastCNILoadStatus"] = lastCNILoadStatus } return resp, nil }