diff --git a/cmd/sedna-gm/app/controller.go b/cmd/sedna-gm/app/controller.go index 07ec6287c..52174e50c 100644 --- a/cmd/sedna-gm/app/controller.go +++ b/cmd/sedna-gm/app/controller.go @@ -18,6 +18,7 @@ package app import ( "fmt" + "os" "github.com/spf13/cobra" "github.com/spf13/pflag" @@ -27,7 +28,7 @@ import ( "k8s.io/klog/v2" "github.com/kubeedge/sedna/cmd/sedna-gm/app/options" - controller "github.com/kubeedge/sedna/pkg/globalmanager" + controller "github.com/kubeedge/sedna/pkg/globalmanager/controllers" "github.com/kubeedge/sedna/pkg/util" "github.com/kubeedge/sedna/pkg/version/verflag" ) @@ -61,8 +62,12 @@ func NewControllerCommand() *cobra.Command { if errs := config.Validate(); len(errs) > 0 { klog.Fatal(util.SpliceErrors(errs.ToAggregate().Errors())) } - c := controller.NewController(config) - c.Start() + c := controller.New(config) + err = c.Start() + if err != nil { + klog.Errorf("failed to start controller: %v", err) + os.Exit(1) + } }, } fs := cmd.Flags() diff --git a/cmd/sedna-gm/sedna-gm.go b/cmd/sedna-gm/sedna-gm.go index bce60eca2..3777a617c 100644 --- a/cmd/sedna-gm/sedna-gm.go +++ b/cmd/sedna-gm/sedna-gm.go @@ -17,7 +17,9 @@ limitations under the License. package main import ( + "math/rand" "os" + "time" "k8s.io/component-base/logs" @@ -25,6 +27,8 @@ import ( ) func main() { + rand.Seed(time.Now().UnixNano()) + command := app.NewControllerCommand() logs.InitLogs() defer logs.FlushLogs() diff --git a/pkg/globalmanager/controller.go b/pkg/globalmanager/controller.go deleted file mode 100644 index 0085fe8f1..000000000 --- a/pkg/globalmanager/controller.go +++ /dev/null @@ -1,71 +0,0 @@ -/* -Copyright 2021 The KubeEdge Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package globalmanager - -import ( - "fmt" - "os" - - "k8s.io/klog/v2" - - "github.com/kubeedge/sedna/pkg/globalmanager/config" - websocket "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer/ws" -) - -// MainController defines the main controller -type MainController struct { - Config *config.ControllerConfig -} - -// NewController creates a new main controller -func NewController(cc *config.ControllerConfig) *MainController { - config.InitConfigure(cc) - return &MainController{ - Config: cc, - } -} - -// Start starts the main controller -func (c *MainController) Start() { - type newFunc func(cfg *config.ControllerConfig) (FeatureControllerI, error) - - for _, featureFunc := range []newFunc{ - NewUpstreamController, - NewDownstreamController, - NewFederatedController, - NewJointController, - NewIncrementalJobController, - NewLifelongLearningJobController, - } { - f, _ := featureFunc(c.Config) - err := f.Start() - if err != nil { - klog.Warningf("failed to start controller %s: %+v", f.GetName(), err) - } else { - klog.Infof("started controller %s", f.GetName()) - } - } - - addr := fmt.Sprintf("%s:%d", c.Config.WebSocket.Address, c.Config.WebSocket.Port) - - ws := websocket.NewServer(addr) - err := ws.ListenAndServe() - if err != nil { - klog.Fatalf("failed to listen websocket at %s", addr) - os.Exit(1) - } -} diff --git a/pkg/globalmanager/controllers/dataset/dataset.go b/pkg/globalmanager/controllers/dataset/dataset.go new file mode 100644 index 000000000..8523057c5 --- /dev/null +++ b/pkg/globalmanager/controllers/dataset/dataset.go @@ -0,0 +1,74 @@ +/* 
+Copyright 2021 The KubeEdge Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package dataset + +import ( + "k8s.io/apimachinery/pkg/watch" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/cache" + + sednaclientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1" + "github.com/kubeedge/sedna/pkg/globalmanager/config" + "github.com/kubeedge/sedna/pkg/globalmanager/runtime" +) + +const ( + // KindName is the kind name of CR this controller controls + KindName = "Dataset" + + // Name is this controller name + Name = "Dataset" +) + +// Controller handles all dataset objects including: syncing to edge and update from edge. 
+type Controller struct { + kubeClient kubernetes.Interface + client sednaclientset.SednaV1alpha1Interface + + cfg *config.ControllerConfig + + sendToEdgeFunc runtime.DownstreamSendFunc +} + +func (c *Controller) Run(stopCh <-chan struct{}) { + // noop now +} + +// New creates a dataset controller +func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) { + c := &Controller{ + client: cc.SednaClient.SednaV1alpha1(), + kubeClient: cc.KubeClient, + } + informer := cc.SednaInformerFactory.Sedna().V1alpha1().Datasets().Informer() + informer.AddEventHandler(cache.ResourceEventHandlerFuncs{ + + AddFunc: func(obj interface{}) { + c.syncToEdge(watch.Added, obj) + }, + + UpdateFunc: func(old, cur interface{}) { + c.syncToEdge(watch.Added, cur) + }, + + DeleteFunc: func(obj interface{}) { + c.syncToEdge(watch.Deleted, obj) + }, + }) + + return c, nil +} diff --git a/pkg/globalmanager/controllers/dataset/downstream.go b/pkg/globalmanager/controllers/dataset/downstream.go new file mode 100644 index 000000000..a898fac0c --- /dev/null +++ b/pkg/globalmanager/controllers/dataset/downstream.go @@ -0,0 +1,54 @@ +/* +Copyright 2021 The KubeEdge Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package dataset + +import ( + "fmt" + + "k8s.io/apimachinery/pkg/watch" + + sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" + "github.com/kubeedge/sedna/pkg/globalmanager/runtime" +) + +// syncToEdge syncs the dataset resources +func (c *Controller) syncToEdge(eventType watch.EventType, obj interface{}) error { + dataset, ok := obj.(*sednav1.Dataset) + if !ok { + return nil + } + + // Since t.Kind may be empty, + // we need to fix the kind here if missing. + // more details at https://github.com/kubernetes/kubernetes/issues/3030 + dataset.Kind = KindName + + // Here only propagate to the nodes with non empty name + nodeName := dataset.Spec.NodeName + if len(nodeName) == 0 { + return fmt.Errorf("empty node name") + } + + runtime.InjectSecretAnnotations(c.kubeClient, dataset, dataset.Spec.CredentialName) + + return c.sendToEdgeFunc(nodeName, eventType, dataset) +} + +func (c *Controller) SetDownstreamSendFunc(f runtime.DownstreamSendFunc) error { + c.sendToEdgeFunc = f + return nil +} diff --git a/pkg/globalmanager/controllers/dataset/upstream.go b/pkg/globalmanager/controllers/dataset/upstream.go new file mode 100644 index 000000000..a1b1949e0 --- /dev/null +++ b/pkg/globalmanager/controllers/dataset/upstream.go @@ -0,0 +1,62 @@ +/* +Copyright 2021 The KubeEdge Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package dataset + +import ( + "context" + "encoding/json" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" + "github.com/kubeedge/sedna/pkg/globalmanager/runtime" +) + +// updateFromEdge syncs update from edge +func (c *Controller) updateFromEdge(name, namespace, operation string, content []byte) error { + status := sednav1.DatasetStatus{} + err := json.Unmarshal(content, &status) + if err != nil { + return err + } + + return c.updateStatus(name, namespace, status) +} + +// updateStatus updates the dataset status +func (c *Controller) updateStatus(name, namespace string, status sednav1.DatasetStatus) error { + client := c.client.Datasets(namespace) + + if status.UpdateTime == nil { + now := metav1.Now() + status.UpdateTime = &now + } + + return runtime.RetryUpdateStatus(name, namespace, func() error { + dataset, err := client.Get(context.TODO(), name, metav1.GetOptions{}) + if err != nil { + return err + } + dataset.Status = status + _, err = client.UpdateStatus(context.TODO(), dataset, metav1.UpdateOptions{}) + return err + }) +} + +func (c *Controller) SetUpstreamHandler(addFunc runtime.UpstreamHandlerAddFunc) error { + return addFunc(KindName, c.updateFromEdge) +} diff --git a/pkg/globalmanager/controllers/federatedlearning/downstream.go b/pkg/globalmanager/controllers/federatedlearning/downstream.go new file mode 100644 index 000000000..3b5f2fd22 --- /dev/null +++ b/pkg/globalmanager/controllers/federatedlearning/downstream.go @@ -0,0 +1,56 @@ +/* +Copyright 2021 The KubeEdge Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package federatedlearning + +import ( + "k8s.io/apimachinery/pkg/watch" + + sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" + "github.com/kubeedge/sedna/pkg/globalmanager/runtime" +) + +func (c *Controller) syncToEdge(eventType watch.EventType, obj interface{}) error { + job, ok := obj.(*sednav1.FederatedLearningJob) + if !ok { + return nil + } + + // Since Kind may be empty, + // we need to fix the kind here if missing. + // more details at https://github.com/kubernetes/kubernetes/issues/3030 + job.Kind = KindName + + // broadcast to all nodes specified in spec + nodeset := make(map[string]bool) + for _, trainingWorker := range job.Spec.TrainingWorkers { + // Here only propagate to the nodes with non empty name + if len(trainingWorker.Template.Spec.NodeName) > 0 { + nodeset[trainingWorker.Template.Spec.NodeName] = true + } + } + + for nodeName := range nodeset { + c.sendToEdgeFunc(nodeName, eventType, job) + } + return nil +} + +func (c *Controller) SetDownstreamSendFunc(f runtime.DownstreamSendFunc) error { + c.sendToEdgeFunc = f + + return nil +} diff --git a/pkg/globalmanager/federatedlearningjob.go b/pkg/globalmanager/controllers/federatedlearning/federatedlearningjob.go similarity index 53% rename from pkg/globalmanager/federatedlearningjob.go rename to pkg/globalmanager/controllers/federatedlearning/federatedlearningjob.go index a8ab0c489..b775b089b 100644 --- a/pkg/globalmanager/federatedlearningjob.go +++ b/pkg/globalmanager/controllers/federatedlearning/federatedlearningjob.go @@ -14,7 +14,7 @@ See the License for the specific language 
governing permissions and limitations under the License. */ -package globalmanager +package federatedlearning import ( "context" @@ -28,7 +28,7 @@ import ( utilrand "k8s.io/apimachinery/pkg/util/rand" utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/apimachinery/pkg/util/wait" - kubeinformers "k8s.io/client-go/informers" + "k8s.io/apimachinery/pkg/watch" "k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes/scheme" v1core "k8s.io/client-go/kubernetes/typed/core/v1" @@ -40,33 +40,37 @@ import ( k8scontroller "k8s.io/kubernetes/pkg/controller" sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" - clientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned" sednaclientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1" - informers "github.com/kubeedge/sedna/pkg/client/informers/externalversions" sednav1listers "github.com/kubeedge/sedna/pkg/client/listers/sedna/v1alpha1" "github.com/kubeedge/sedna/pkg/globalmanager/config" - messageContext "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer/ws" - "github.com/kubeedge/sedna/pkg/globalmanager/utils" + "github.com/kubeedge/sedna/pkg/globalmanager/runtime" ) const ( - FLJobStageAgg = "Aggregation" - FLJobStageTrain = "Training" + // KindName is the kind name of CR this controller controls + KindName = "FederatedLearningJob" + // Name is this controller name + Name = "FederatedLearning" ) -// flJobControllerKind contains the schema.GroupVersionKind for this controller type. -var flJobControllerKind = sednav1.SchemeGroupVersion.WithKind("FederatedLearningJob") +const ( + jobStageAgg = "Aggregation" + jobStageTrain = "Training" +) + +// Kind contains the schema.GroupVersionKind for this controller type. 
+var Kind = sednav1.SchemeGroupVersion.WithKind(KindName) -// FederatedController ensures that all FLJob objects have corresponding pods to +// Controller ensures that all FederatedLearningJob objects have corresponding pods to // run their configured workload. -type FederatedController struct { +type Controller struct { kubeClient kubernetes.Interface client sednaclientset.SednaV1alpha1Interface // podStoreSynced returns true if the pod store has been synced at least once. // Added as a member to the struct to allow injection for testing. podStoreSynced cache.InformerSynced - // jobStoreSynced returns true if the flJob store has been synced at least once. + // jobStoreSynced returns true if the FederatedLearningJob store has been synced at least once. // Added as a member to the struct to allow injection for testing. jobStoreSynced cache.InformerSynced @@ -82,48 +86,47 @@ type FederatedController struct { recorder record.EventRecorder cfg *config.ControllerConfig + + sendToEdgeFunc runtime.DownstreamSendFunc } -// Run the main goroutine responsible for watching and syncing jobs. -func (fc *FederatedController) Start() error { +// Run starts the main goroutine responsible for watching and syncing jobs. 
+func (c *Controller) Run(stopCh <-chan struct{}) { workers := 1 - stopCh := messageContext.Done() - go func() { - defer utilruntime.HandleCrash() - defer fc.queue.ShutDown() - klog.Infof("Starting federatedlearning job controller") - defer klog.Infof("Shutting down federatedlearning job controller") + defer utilruntime.HandleCrash() + defer c.queue.ShutDown() - if !cache.WaitForNamedCacheSync("federatedlearning job", stopCh, fc.podStoreSynced, fc.jobStoreSynced) { - klog.Errorf("failed to wait for caches to sync") + klog.Infof("Starting %s controller", Name) + defer klog.Infof("Shutting down %s controller", Name) - return - } + if !cache.WaitForNamedCacheSync(Name, stopCh, c.podStoreSynced, c.jobStoreSynced) { + klog.Errorf("failed to wait for %s caches to sync", Name) - klog.Infof("Starting federatedlearning job workers") - for i := 0; i < workers; i++ { - go wait.Until(fc.worker, time.Second, stopCh) - } + return + } - <-stopCh - }() - return nil + klog.Infof("Starting %s workers", Name) + for i := 0; i < workers; i++ { + go wait.Until(c.worker, time.Second, stopCh) + } + + <-stopCh } // enqueueByPod enqueues the FederatedLearningJob object of the specified pod. -func (fc *FederatedController) enqueueByPod(pod *v1.Pod, immediate bool) { +func (c *Controller) enqueueByPod(pod *v1.Pod, immediate bool) { controllerRef := metav1.GetControllerOf(pod) if controllerRef == nil { return } - if controllerRef.Kind != flJobControllerKind.Kind { + if controllerRef.Kind != Kind.Kind { return } - job, err := fc.jobLister.FederatedLearningJobs(pod.Namespace).Get(controllerRef.Name) + job, err := c.jobLister.FederatedLearningJobs(pod.Namespace).Get(controllerRef.Name) if err != nil { return } @@ -132,27 +135,27 @@ func (fc *FederatedController) enqueueByPod(pod *v1.Pod, immediate bool) { return } - fc.enqueueController(job, immediate) + c.enqueueController(job, immediate) } // When a pod is created, enqueue the controller that manages it and update it's expectations. 
-func (fc *FederatedController) addPod(obj interface{}) { +func (c *Controller) addPod(obj interface{}) { pod := obj.(*v1.Pod) if pod.DeletionTimestamp != nil { // on a restart of the controller, it's possible a new pod shows up in a state that // is already pending deletion. Prevent the pod from being a creation observation. - fc.deletePod(pod) + c.deletePod(pod) return } // backoff to queue when PodFailed immediate := pod.Status.Phase != v1.PodFailed - fc.enqueueByPod(pod, immediate) + c.enqueueByPod(pod, immediate) } // When a pod is updated, figure out what federatedlearning job manage it and wake them up. -func (fc *FederatedController) updatePod(old, cur interface{}) { +func (c *Controller) updatePod(old, cur interface{}) { curPod := cur.(*v1.Pod) oldPod := old.(*v1.Pod) @@ -161,11 +164,11 @@ func (fc *FederatedController) updatePod(old, cur interface{}) { return } - fc.addPod(curPod) + c.addPod(curPod) } // deletePod enqueues the FederatedLearningJob obj When a pod is deleted -func (fc *FederatedController) deletePod(obj interface{}) { +func (c *Controller) deletePod(obj interface{}) { pod, ok := obj.(*v1.Pod) // comment from https://github.com/kubernetes/kubernetes/blob/master/pkg/controller/job/job_controller.go @@ -186,13 +189,13 @@ func (fc *FederatedController) deletePod(obj interface{}) { return } } - fc.enqueueByPod(pod, true) + c.enqueueByPod(pod, true) } // obj could be an *sednav1.FederatedLearningJob, or a DeletionFinalStateUnknown marker item, // immediate tells the controller to update the status right away, and should // happen ONLY when there was a successful pod run. 
-func (fc *FederatedController) enqueueController(obj interface{}, immediate bool) { +func (c *Controller) enqueueController(obj interface{}, immediate bool) { key, err := k8scontroller.KeyFunc(obj) if err != nil { klog.Warningf("Couldn't get key for object %+v: %v", obj, err) @@ -201,43 +204,43 @@ func (fc *FederatedController) enqueueController(obj interface{}, immediate bool backoff := time.Duration(0) if !immediate { - backoff = getBackoff(fc.queue, key) + backoff = runtime.GetBackoff(c.queue, key) } - fc.queue.AddAfter(key, backoff) + c.queue.AddAfter(key, backoff) } // worker runs a worker thread that just dequeues items, processes them, and marks them done. // It enforces that the syncHandler is never invoked concurrently with the same key. -func (fc *FederatedController) worker() { - for fc.processNextWorkItem() { +func (c *Controller) worker() { + for c.processNextWorkItem() { } } -func (fc *FederatedController) processNextWorkItem() bool { - key, quit := fc.queue.Get() +func (c *Controller) processNextWorkItem() bool { + key, quit := c.queue.Get() if quit { return false } - defer fc.queue.Done(key) + defer c.queue.Done(key) - forget, err := fc.syncFLJob(key.(string)) + forget, err := c.sync(key.(string)) if err == nil { if forget { - fc.queue.Forget(key) + c.queue.Forget(key) } return true } klog.Warningf("Error syncing federatedlearning job: %v", err) - fc.queue.AddRateLimited(key) + c.queue.AddRateLimited(key) return true } -// syncFLJob will sync the flJob with the given key if it has had its expectations fulfilled, meaning +// sync will sync the FederatedLearningJob with the given key if it has had its expectations fulfilled, meaning // it did not expect to see any more of its pods created or deleted. This function is not meant to be invoked // concurrently with the same key. 
-func (fc *FederatedController) syncFLJob(key string) (bool, error) { +func (c *Controller) sync(key string) (bool, error) { startTime := time.Now() defer func() { klog.V(4).Infof("Finished syncing federatedlearning job %q (%v)", key, time.Since(startTime)) @@ -250,91 +253,96 @@ func (fc *FederatedController) syncFLJob(key string) (bool, error) { if len(ns) == 0 || len(name) == 0 { return false, fmt.Errorf("invalid federatedlearning job key %q: either namespace or name is missing", key) } - sharedFLJob, err := fc.jobLister.FederatedLearningJobs(ns).Get(name) + sharedJob, err := c.jobLister.FederatedLearningJobs(ns).Get(name) if err != nil { if errors.IsNotFound(err) { - klog.V(4).Infof("FLJob has been deleted: %v", key) + klog.V(4).Infof("%s %v has been deleted", Name, key) return true, nil } return false, err } - flJob := *sharedFLJob - // set kind for flJob in case that the kind is None - flJob.SetGroupVersionKind(sednav1.SchemeGroupVersion.WithKind("FederatedLearningJob")) - // if flJob was finished previously, we don't want to redo the termination - if IsFLJobFinished(&flJob) { + + job := *sharedJob + // set kind for FederatedLearningJob in case that the kind is None + job.SetGroupVersionKind(Kind) + + // if job was finished previously, we don't want to redo the termination + if IsJobFinished(&job) { return true, nil } - selector, _ := GenerateSelector(&flJob) - pods, err := fc.podStore.Pods(flJob.Namespace).List(selector) + + selector, _ := runtime.GenerateSelector(&job) + pods, err := c.podStore.Pods(job.Namespace).List(selector) if err != nil { return false, err } activePods := k8scontroller.FilterActivePods(pods) active := int32(len(activePods)) - succeeded, failed := getStatus(pods) - conditions := len(flJob.Status.Conditions) - // flJob first start - if flJob.Status.StartTime == nil { + succeeded, failed := countPods(pods) + conditions := len(job.Status.Conditions) + + // set StartTime when job is handled firstly + if job.Status.StartTime == nil { now := 
metav1.Now() - flJob.Status.StartTime = &now + job.Status.StartTime = &now } var manageJobErr error jobFailed := false var failureReason string var failureMessage string - phase := flJob.Status.Phase + phase := job.Status.Phase if failed > 0 { jobFailed = true failureReason = "workerFailed" - failureMessage = "the worker of FLJob failed" + failureMessage = "the worker of FederatedLearningJob failed" } if jobFailed { - flJob.Status.Conditions = append(flJob.Status.Conditions, NewFLJobCondition(sednav1.FLJobCondFailed, failureReason, failureMessage)) - flJob.Status.Phase = sednav1.FLJobFailed - fc.recorder.Event(&flJob, v1.EventTypeWarning, failureReason, failureMessage) + job.Status.Conditions = append(job.Status.Conditions, NewJobCondition(sednav1.FLJobCondFailed, failureReason, failureMessage)) + job.Status.Phase = sednav1.FLJobFailed + c.recorder.Event(&job, v1.EventTypeWarning, failureReason, failureMessage) } else { // in the First time, we create the pods if len(pods) == 0 { - active, manageJobErr = fc.createPod(&flJob) + active, manageJobErr = c.createPod(&job) } complete := false if succeeded > 0 && active == 0 { complete = true } if complete { - flJob.Status.Conditions = append(flJob.Status.Conditions, NewFLJobCondition(sednav1.FLJobCondComplete, "", "")) + job.Status.Conditions = append(job.Status.Conditions, NewJobCondition(sednav1.FLJobCondComplete, "", "")) now := metav1.Now() - flJob.Status.CompletionTime = &now - fc.recorder.Event(&flJob, v1.EventTypeNormal, "Completed", "FLJob completed") - flJob.Status.Phase = sednav1.FLJobSucceeded + job.Status.CompletionTime = &now + c.recorder.Event(&job, v1.EventTypeNormal, "Completed", "FederatedLearningJob completed") + job.Status.Phase = sednav1.FLJobSucceeded } else { - flJob.Status.Phase = sednav1.FLJobRunning + job.Status.Phase = sednav1.FLJobRunning } } forget := false // Check if the number of jobs succeeded increased since the last check. 
If yes "forget" should be true // This logic is linked to the issue: https://github.com/kubernetes/kubernetes/issues/56853 that aims to - // improve the FLJob backoff policy when parallelism > 1 and few FLJobs failed but others succeed. + // improve the job backoff policy when parallelism > 1 and few FLJobs failed but others succeed. // In this case, we should clear the backoff delay. - if flJob.Status.Succeeded < succeeded { + if job.Status.Succeeded < succeeded { forget = true } - // no need to update the flJob if the status hasn't changed since last time - if flJob.Status.Active != active || flJob.Status.Succeeded != succeeded || flJob.Status.Failed != failed || len(flJob.Status.Conditions) != conditions || flJob.Status.Phase != phase { - flJob.Status.Active = active - flJob.Status.Succeeded = succeeded - flJob.Status.Failed = failed + // no need to update the job if the status hasn't changed since last time + if job.Status.Active != active || job.Status.Succeeded != succeeded || job.Status.Failed != failed || len(job.Status.Conditions) != conditions || job.Status.Phase != phase { + job.Status.Active = active + job.Status.Succeeded = succeeded + job.Status.Failed = failed + c.updateJobStatus(&job) - if jobFailed && !IsFLJobFinished(&flJob) { - // returning an error will re-enqueue FLJob after the backoff period - return forget, fmt.Errorf("failed pod(s) detected for flJob key %q", key) + if jobFailed && !IsJobFinished(&job) { + // returning an error will re-enqueue FederatedLearningJob after the backoff period + return forget, fmt.Errorf("failed pod(s) detected for FederatedLearningJob key %q", key) } forget = true @@ -343,7 +351,7 @@ func (fc *FederatedController) syncFLJob(key string) (bool, error) { return forget, manageJobErr } -func NewFLJobCondition(conditionType sednav1.FLJobConditionType, reason, message string) sednav1.FLJobCondition { +func NewJobCondition(conditionType sednav1.FLJobConditionType, reason, message string) sednav1.FLJobCondition { return 
sednav1.FLJobCondition{ Type: conditionType, Status: v1.ConditionTrue, @@ -354,28 +362,24 @@ func NewFLJobCondition(conditionType sednav1.FLJobConditionType, reason, message } } -// getStatus returns no of succeeded and failed pods running a flJob -func getStatus(pods []*v1.Pod) (succeeded, failed int32) { +// countPods returns number of succeeded and failed pods +func countPods(pods []*v1.Pod) (succeeded, failed int32) { succeeded = int32(filterPods(pods, v1.PodSucceeded)) failed = int32(filterPods(pods, v1.PodFailed)) return } -func (fc *FederatedController) updateFLJobStatus(flJob *sednav1.FederatedLearningJob) error { - jobClient := fc.client.FederatedLearningJobs(flJob.Namespace) - var err error - for i := 0; i <= ResourceUpdateRetries; i = i + 1 { - var newFLJob *sednav1.FederatedLearningJob - newFLJob, err = jobClient.Get(context.TODO(), flJob.Name, metav1.GetOptions{}) +func (c *Controller) updateJobStatus(job *sednav1.FederatedLearningJob) error { + jobClient := c.client.FederatedLearningJobs(job.Namespace) + return runtime.RetryUpdateStatus(job.Name, job.Namespace, func() error { + newJob, err := jobClient.Get(context.TODO(), job.Name, metav1.GetOptions{}) if err != nil { - break - } - newFLJob.Status = flJob.Status - if _, err = jobClient.UpdateStatus(context.TODO(), newFLJob, metav1.UpdateOptions{}); err == nil { - break + return err } - } - return nil + newJob.Status = job.Status + _, err = jobClient.UpdateStatus(context.TODO(), newJob, metav1.UpdateOptions{}) + return err + }) } // filterPods returns pods based on their phase. 
@@ -389,7 +393,7 @@ func filterPods(pods []*v1.Pod, phase v1.PodPhase) int { return result } -func IsFLJobFinished(j *sednav1.FederatedLearningJob) bool { +func IsJobFinished(j *sednav1.FederatedLearningJob) bool { for _, c := range j.Status.Conditions { if (c.Type == sednav1.FLJobCondComplete || c.Type == sednav1.FLJobCondFailed) && c.Status == v1.ConditionTrue { return true @@ -398,12 +402,12 @@ func IsFLJobFinished(j *sednav1.FederatedLearningJob) bool { return false } -func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (active int32, err error) { +func (c *Controller) createPod(job *sednav1.FederatedLearningJob) (active int32, err error) { active = 0 ctx := context.Background() modelName := job.Spec.AggregationWorker.Model.Name - model, err := fc.client.Models(job.Namespace).Get(ctx, modelName, metav1.GetOptions{}) + model, err := c.client.Models(job.Namespace).Get(ctx, modelName, metav1.GetOptions{}) if err != nil { return active, fmt.Errorf("failed to get model %s: %w", modelName, err) @@ -412,7 +416,7 @@ func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (act secretName := model.Spec.CredentialName var modelSecret *v1.Secret if secretName != "" { - modelSecret, _ = fc.kubeClient.CoreV1().Secrets(job.Namespace).Get(context.TODO(), secretName, metav1.GetOptions{}) + modelSecret, _ = c.kubeClient.CoreV1().Secrets(job.Namespace).Get(context.TODO(), secretName, metav1.GetOptions{}) } participantsCount := strconv.Itoa(len(job.Spec.TrainingWorkers)) @@ -420,10 +424,10 @@ func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (act // deliver pod for aggregation worker aggWorker := job.Spec.AggregationWorker - // Configure container mounting and Env information by initial WorkerParam + // Configure aggregation worker's mounts and envs var aggPort int32 = 7363 - var aggWorkerParam *WorkerParam = new(WorkerParam) - aggWorkerParam.env = map[string]string{ + var aggWorkerParam runtime.WorkerParam + 
aggWorkerParam.Env = map[string]string{ "NAMESPACE": job.Namespace, "WORKER_NAME": "aggworker-" + utilrand.String(5), "JOB_NAME": job.Name, @@ -432,12 +436,12 @@ func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (act "PARTICIPANTS_COUNT": participantsCount, } - aggWorkerParam.workerType = FLJobStageAgg - aggWorkerParam.restartPolicy = v1.RestartPolicyOnFailure + aggWorkerParam.WorkerType = jobStageAgg + aggWorkerParam.RestartPolicy = v1.RestartPolicyOnFailure - aggWorkerParam.mounts = append(aggWorkerParam.mounts, - WorkerMount{ - URL: &MountURL{ + aggWorkerParam.Mounts = append(aggWorkerParam.Mounts, + runtime.WorkerMount{ + URL: &runtime.MountURL{ URL: model.Spec.URL, Secret: modelSecret, DownloadByInitializer: false, @@ -447,9 +451,9 @@ func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (act ) // create aggpod based on configured parameters - _, err = createPodWithTemplate(fc.kubeClient, job, &aggWorker.Template, aggWorkerParam) + _, err = runtime.CreatePodWithTemplate(c.kubeClient, job, &aggWorker.Template, &aggWorkerParam) if err != nil { - return active, err + return active, fmt.Errorf("failed to create aggregation worker: %w", err) } active++ @@ -458,17 +462,21 @@ func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (act // FIXME(llhuii): only the case that Spec.NodeName specified is support, // will support Spec.NodeSelector. 
- appIP, err = GetNodeIPByName(fc.kubeClient, job.Spec.AggregationWorker.Template.Spec.NodeName) + appIP, err = runtime.GetNodeIPByName(c.kubeClient, job.Spec.AggregationWorker.Template.Spec.NodeName) + if err != nil { + return active, err + } - aggServicePort, err = CreateKubernetesService(fc.kubeClient, job, FLJobStageAgg, aggPort, appIP) + aggServicePort, err = runtime.CreateKubernetesService(c.kubeClient, job, jobStageAgg, aggPort, appIP) if err != nil { return active, err } + // deliver pod for training worker - for _, trainingWorker := range job.Spec.TrainingWorkers { + for i, trainingWorker := range job.Spec.TrainingWorkers { // get dataseturl through parsing crd of dataset datasetName := trainingWorker.Dataset.Name - dataset, err := fc.client.Datasets(job.Namespace).Get(ctx, datasetName, metav1.GetOptions{}) + dataset, err := c.client.Datasets(job.Namespace).Get(ctx, datasetName, metav1.GetOptions{}) if err != nil { return active, fmt.Errorf("failed to get dataset %s: %w", datasetName, err) @@ -477,23 +485,22 @@ func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (act secretName := dataset.Spec.CredentialName var datasetSecret *v1.Secret if secretName != "" { - datasetSecret, _ = fc.kubeClient.CoreV1().Secrets(job.Namespace).Get(context.TODO(), secretName, metav1.GetOptions{}) + datasetSecret, _ = c.kubeClient.CoreV1().Secrets(job.Namespace).Get(context.TODO(), secretName, metav1.GetOptions{}) } - // Configure container mounting and Env information by initial WorkerParam - var workerParam *WorkerParam = new(WorkerParam) - - workerParam.mounts = append(workerParam.mounts, - WorkerMount{ - URL: &MountURL{ + // Configure training worker's mounts and envs + var workerParam runtime.WorkerParam + workerParam.Mounts = append(workerParam.Mounts, + runtime.WorkerMount{ + URL: &runtime.MountURL{ URL: model.Spec.URL, Secret: modelSecret, }, EnvName: "MODEL_URL", }, - WorkerMount{ - URL: &MountURL{ + runtime.WorkerMount{ + URL: 
&runtime.MountURL{ URL: dataset.Spec.URL, Secret: datasetSecret, }, @@ -501,7 +508,7 @@ func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (act }, ) - workerParam.env = map[string]string{ + workerParam.Env = map[string]string{ "AGG_PORT": strconv.Itoa(int(aggServicePort)), "AGG_IP": appIP, @@ -511,65 +518,67 @@ func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (act "NAMESPACE": job.Namespace, "MODEL_NAME": modelName, "DATASET_NAME": datasetName, - "LC_SERVER": fc.cfg.LC.Server, + "LC_SERVER": c.cfg.LC.Server, } - workerParam.workerType = TrainPodType - workerParam.hostNetwork = true - workerParam.restartPolicy = v1.RestartPolicyOnFailure - // create train pod based on configured parameters - _, err = createPodWithTemplate(fc.kubeClient, job, &trainingWorker.Template, workerParam) + workerParam.WorkerType = runtime.TrainPodType + workerParam.HostNetwork = true + workerParam.RestartPolicy = v1.RestartPolicyOnFailure + + // create training worker based on configured parameters + _, err = runtime.CreatePodWithTemplate(c.kubeClient, job, &trainingWorker.Template, &workerParam) if err != nil { - return active, err + return active, fmt.Errorf("failed to create %dth training worker: %w", i, err) } active++ } return } -func (fc *FederatedController) GetName() string { - return "FederatedLearningJobController" -} - -// NewFederatedController creates a new FederatedLearningJob controller that keeps the relevant pods -// in sync with their corresponding FFederatedLearningJob objects. 
-func NewFederatedController(cfg *config.ControllerConfig) (FeatureControllerI, error) { - namespace := cfg.Namespace - if namespace == "" { - namespace = metav1.NamespaceAll - } - kubeClient, err := utils.KubeClient() - kubecfg, _ := utils.KubeConfig() - crdclient, err := clientset.NewForConfig(kubecfg) - kubeInformerFactory := kubeinformers.NewSharedInformerFactoryWithOptions(kubeClient, time.Second*30, kubeinformers.WithNamespace(namespace)) +// New creates a new federated learning job controller that keeps the relevant pods +// in sync with their corresponding FederatedLearningJob objects. +func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) { + cfg := cc.Config - podInformer := kubeInformerFactory.Core().V1().Pods() + podInformer := cc.KubeInformerFactory.Core().V1().Pods() - jobInformerFactory := informers.NewSharedInformerFactoryWithOptions(crdclient, time.Second*30, informers.WithNamespace(namespace)) - jobInformer := jobInformerFactory.Sedna().V1alpha1().FederatedLearningJobs() + jobInformer := cc.SednaInformerFactory.Sedna().V1alpha1().FederatedLearningJobs() eventBroadcaster := record.NewBroadcaster() - eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")}) + eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: cc.KubeClient.CoreV1().Events("")}) - fc := &FederatedController{ - kubeClient: kubeClient, - client: crdclient.SednaV1alpha1(), + fc := &Controller{ + kubeClient: cc.KubeClient, + client: cc.SednaClient.SednaV1alpha1(), - queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(DefaultBackOff, MaxBackOff), "flJob"), - recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "flJob-controller"}), + queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(runtime.DefaultBackOff, runtime.MaxBackOff), Name), + recorder: eventBroadcaster.NewRecorder(scheme.Scheme, 
v1.EventSource{Component: Name + "-controller"}), cfg: cfg, } jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { fc.enqueueController(obj, true) + + // when a federated learning job is added, + // send it to edge's LC. + fc.syncToEdge(watch.Added, obj) }, UpdateFunc: func(old, cur interface{}) { fc.enqueueController(cur, true) + + // when a federated learning job is updated, + // send it to edge's LC as Added event. + fc.syncToEdge(watch.Added, cur) }, DeleteFunc: func(obj interface{}) { fc.enqueueController(obj, true) + + // when a federated learning job is deleted, + // send it to edge's LC. + fc.syncToEdge(watch.Deleted, obj) }, }) + fc.jobLister = jobInformer.Lister() fc.jobStoreSynced = jobInformer.Informer().HasSynced @@ -581,8 +590,5 @@ func NewFederatedController(cfg *config.ControllerConfig) (FeatureControllerI, e fc.podStore = podInformer.Lister() fc.podStoreSynced = podInformer.Informer().HasSynced - stopCh := make(chan struct{}) - kubeInformerFactory.Start(stopCh) - jobInformerFactory.Start(stopCh) - return fc, err + return fc, nil } diff --git a/pkg/globalmanager/controllers/federatedlearning/upstream.go b/pkg/globalmanager/controllers/federatedlearning/upstream.go new file mode 100644 index 000000000..01888a6d2 --- /dev/null +++ b/pkg/globalmanager/controllers/federatedlearning/upstream.go @@ -0,0 +1,123 @@ +/* +Copyright 2021 The KubeEdge Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package federatedlearning + +import ( + "context" + "encoding/json" + "fmt" + + sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" + "github.com/kubeedge/sedna/pkg/globalmanager/runtime" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func (c *Controller) updateModelMetrics(jobName, namespace string, metrics []sednav1.Metric) error { + var err error + job, err := c.client.FederatedLearningJobs(namespace).Get(context.TODO(), jobName, metav1.GetOptions{}) + if err != nil { + // federated crd not found + return err + } + modelName := job.Spec.AggregationWorker.Model.Name + client := c.client.Models(namespace) + + return runtime.RetryUpdateStatus(modelName, namespace, (func() error { + model, err := client.Get(context.TODO(), modelName, metav1.GetOptions{}) + if err != nil { + return err + } + + now := metav1.Now() + model.Status.UpdateTime = &now + model.Status.Metrics = metrics + _, err = client.UpdateStatus(context.TODO(), model, metav1.UpdateOptions{}) + return err + })) +} + +func (c *Controller) appendStatusCondition(name, namespace string, cond sednav1.FLJobCondition) error { + client := c.client.FederatedLearningJobs(namespace) + + return runtime.RetryUpdateStatus(name, namespace, (func() error { + job, err := client.Get(context.TODO(), name, metav1.GetOptions{}) + if err != nil { + return err + } + job.Status.Conditions = append(job.Status.Conditions, cond) + _, err = client.UpdateStatus(context.TODO(), job, metav1.UpdateOptions{}) + return err + })) +} + +// updateFromEdge updates the federated job's status +func (c *Controller) updateFromEdge(name, namespace, operation string, content []byte) (err error) { + // JobInfo defines the job information + type JobInfo struct { + // Current training round + CurrentRound int `json:"currentRound"` + UpdateTime string `json:"updateTime"` + } + + // Output defines job output information + type Output struct { + Models []runtime.Model `json:"models"` + JobInfo *JobInfo `json:"ownerInfo"` + } + + var 
status struct { + Phase string `json:"phase"` + Status string `json:"status"` + Output *Output `json:"output"` + } + + err = json.Unmarshal(content, &status) + if err != nil { + return + } + + output := status.Output + + if output != nil { + // Update the model's metrics + if len(output.Models) > 0 { + // only one model + model := output.Models[0] + metrics := runtime.ConvertMapToMetrics(model.Metrics) + if len(metrics) > 0 { + c.updateModelMetrics(name, namespace, metrics) + } + } + + jobInfo := output.JobInfo + // update job info if having any info + if jobInfo != nil && jobInfo.CurrentRound > 0 { + // Find a good place to save the progress info + // TODO: more meaningful reason/message + reason := "DoTraining" + message := fmt.Sprintf("Round %v reaches at %s", jobInfo.CurrentRound, jobInfo.UpdateTime) + cond := NewJobCondition(sednav1.FLJobCondTraining, reason, message) + c.appendStatusCondition(name, namespace, cond) + } + } + + return nil +} + +func (c *Controller) SetUpstreamHandler(addFunc runtime.UpstreamHandlerAddFunc) error { + return addFunc(KindName, c.updateFromEdge) +} diff --git a/pkg/globalmanager/controllers/incrementallearning/downstream.go b/pkg/globalmanager/controllers/incrementallearning/downstream.go new file mode 100644 index 000000000..a53da8cbe --- /dev/null +++ b/pkg/globalmanager/controllers/incrementallearning/downstream.go @@ -0,0 +1,145 @@ +/* +Copyright 2021 The KubeEdge Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package incrementallearning + +import ( + "context" + "fmt" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/watch" + "k8s.io/klog/v2" + + sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" + "github.com/kubeedge/sedna/pkg/globalmanager/runtime" +) + +// syncModelWithName will sync the model to the specified node. +// Now called when creating the incrementaljob. +func (c *Controller) syncModelWithName(nodeName, modelName, namespace string) error { + model, err := c.client.Models(namespace).Get(context.TODO(), modelName, metav1.GetOptions{}) + if err != nil { + // TODO: maybe use err.ErrStatus.Code == 404 + return fmt.Errorf("model(%s/%s) not found", namespace, modelName) + } + + // Since model.Kind may be empty, + // we need to fix the kind here if missing. + // more details at https://github.com/kubernetes/kubernetes/issues/3030 + if len(model.Kind) == 0 { + model.Kind = "Model" + } + + runtime.InjectSecretAnnotations(c.kubeClient, model, model.Spec.CredentialName) + + c.sendToEdgeFunc(nodeName, watch.Added, model) + return nil +} + +func (c *Controller) syncToEdge(eventType watch.EventType, obj interface{}) error { + job, ok := obj.(*sednav1.IncrementalLearningJob) + if !ok { + return nil + } + + // Since Kind may be empty, + // we need to fix the kind here if missing. 
+ // more details at https://github.com/kubernetes/kubernetes/issues/3030 + job.Kind = KindName + + jobConditions := job.Status.Conditions + if len(jobConditions) == 0 { + return nil + } + + dataName := job.Spec.Dataset.Name + ds, err := c.client.Datasets(job.Namespace).Get(context.TODO(), dataName, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("dataset(%s/%s) not found", job.Namespace, dataName) + } + // LC has dataset object on this node that may call dataset node + dsNodeName := ds.Spec.NodeName + + var trainNodeName string + var evalNodeName string + + ann := job.GetAnnotations() + if ann != nil { + trainNodeName = ann[runtime.AnnotationsKeyPrefix+string(sednav1.ILJobTrain)] + evalNodeName = ann[runtime.AnnotationsKeyPrefix+string(sednav1.ILJobEval)] + } + + if eventType == watch.Deleted { + // delete jobs from all LCs + for _, v := range []string{dsNodeName, trainNodeName, evalNodeName} { + if v != "" { + c.sendToEdgeFunc(v, eventType, job) + } + } + return nil + } + + latestCondition := jobConditions[len(jobConditions)-1] + currentType := latestCondition.Type + jobStage := latestCondition.Stage + + syncModelWithName := func(modelName string) { + if err := c.syncModelWithName(dsNodeName, modelName, job.Namespace); err != nil { + klog.Warningf("Error to sync model %s when sync incremental learning job %s to node %s: %v", + modelName, job.Name, dsNodeName, err) + } + } + + syncJobWithNodeName := func(nodeName string) { + if err := c.sendToEdgeFunc(nodeName, eventType, job); err != nil { + klog.Warningf("Error to sync incremental learning job %s to node %s in stage %s: %v", + job.Name, nodeName, jobStage, err) + } + } + + runtime.InjectSecretAnnotations(c.kubeClient, job, job.Spec.CredentialName) + + doJobStageEvent := func(modelName string, nodeName string) { + if currentType == sednav1.ILJobStageCondWaiting { + syncJobWithNodeName(dsNodeName) + syncModelWithName(modelName) + } else if currentType == sednav1.ILJobStageCondRunning { + if nodeName != 
"" { + syncJobWithNodeName(nodeName) + } + } else if currentType == sednav1.ILJobStageCondCompleted || currentType == sednav1.ILJobStageCondFailed { + if nodeName != dsNodeName { + // delete LC's job from nodeName that's different from dataset node when worker's status is completed or failed. + c.sendToEdgeFunc(nodeName, watch.Deleted, job) + } + } + } + + switch jobStage { + case sednav1.ILJobTrain: + doJobStageEvent(job.Spec.InitialModel.Name, trainNodeName) + case sednav1.ILJobEval: + doJobStageEvent(job.Spec.DeploySpec.Model.Name, evalNodeName) + } + + return nil +} + +func (c *Controller) SetDownstreamSendFunc(f runtime.DownstreamSendFunc) error { + c.sendToEdgeFunc = f + return nil +} diff --git a/pkg/globalmanager/incrementallearningjob.go b/pkg/globalmanager/controllers/incrementallearning/incrementallearningjob.go similarity index 59% rename from pkg/globalmanager/incrementallearningjob.go rename to pkg/globalmanager/controllers/incrementallearning/incrementallearningjob.go index 2df9d9f64..f1d792aef 100644 --- a/pkg/globalmanager/incrementallearningjob.go +++ b/pkg/globalmanager/controllers/incrementallearning/incrementallearningjob.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package globalmanager +package incrementallearning import ( "context" @@ -30,9 +30,8 @@ import ( utilrand "k8s.io/apimachinery/pkg/util/rand" utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/apimachinery/pkg/util/wait" - kubeinformers "k8s.io/client-go/informers" + "k8s.io/apimachinery/pkg/watch" "k8s.io/client-go/kubernetes" - "k8s.io/client-go/kubernetes/scheme" v1core "k8s.io/client-go/kubernetes/typed/core/v1" corelisters "k8s.io/client-go/listers/core/v1" "k8s.io/client-go/tools/cache" @@ -42,28 +41,33 @@ import ( k8scontroller "k8s.io/kubernetes/pkg/controller" sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" - clientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned" sednaclientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1" - informers "github.com/kubeedge/sedna/pkg/client/informers/externalversions" sednav1listers "github.com/kubeedge/sedna/pkg/client/listers/sedna/v1alpha1" "github.com/kubeedge/sedna/pkg/globalmanager/config" - messageContext "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer/ws" - "github.com/kubeedge/sedna/pkg/globalmanager/utils" + "github.com/kubeedge/sedna/pkg/globalmanager/runtime" ) -// ijControllerKind contains the schema.GroupVersionKind for this controller type. -var ijControllerKind = sednav1.SchemeGroupVersion.WithKind("IncrementalLearningJob") +const ( + // Name is this controller name + Name = "IncrementalLearning" -// IncrementalJobController ensures that all IncrementalLearningJob objects have corresponding pods to + // KindName is the kind name of CR this controller controls + KindName = "IncrementalLearningJob" +) + +// Kind contains the schema.GroupVersionKind for this controller type. +var Kind = sednav1.SchemeGroupVersion.WithKind(KindName) + +// Controller ensures that all IncrementalLearningJob objects have corresponding pods to // run their configured workload. 
-type IncrementalJobController struct { +type Controller struct { kubeClient kubernetes.Interface client sednaclientset.SednaV1alpha1Interface // podStoreSynced returns true if the pod store has been synced at least once. // Added as a member to the struct to allow injection for testing. podStoreSynced cache.InformerSynced - // jobStoreSynced returns true if the incrementaljob store has been synced at least once. + // jobStoreSynced returns true if the job store has been synced at least once. // Added as a member to the struct to allow injection for testing. jobStoreSynced cache.InformerSynced @@ -76,50 +80,49 @@ type IncrementalJobController struct { // IncrementalLearningJobs that need to be updated queue workqueue.RateLimitingInterface - recorder record.EventRecorder - cfg *config.ControllerConfig + + sendToEdgeFunc runtime.DownstreamSendFunc } -// Run the main goroutine responsible for watching and syncing jobs. -func (jc *IncrementalJobController) Start() error { +// Run starts the main goroutine responsible for watching and syncing jobs. 
+func (c *Controller) Run(stopCh <-chan struct{}) { + // TODO: make workers parameter workers := 1 - stopCh := messageContext.Done() - go func() { - defer utilruntime.HandleCrash() - defer jc.queue.ShutDown() - klog.Infof("Starting incrementallearning job controller") - defer klog.Infof("Shutting down incrementallearning job controller") + defer utilruntime.HandleCrash() + defer c.queue.ShutDown() - if !cache.WaitForNamedCacheSync("incrementallearningjob", stopCh, jc.podStoreSynced, jc.jobStoreSynced) { - klog.Errorf("failed to wait for caches to sync") + klog.Infof("Starting %s controller", Name) + defer klog.Infof("Shutting down %s controller", Name) - return - } - klog.Infof("Starting incrementallearning job workers") - for i := 0; i < workers; i++ { - go wait.Until(jc.worker, time.Second, stopCh) - } + if !cache.WaitForNamedCacheSync(Name, stopCh, c.podStoreSynced, c.jobStoreSynced) { + klog.Errorf("failed to wait for %s caches to sync", Name) - <-stopCh - }() - return nil + return + } + + klog.Infof("Starting %s job workers", Name) + for i := 0; i < workers; i++ { + go wait.Until(c.worker, time.Second, stopCh) + } + + <-stopCh } // enqueueByPod enqueues the jointInferenceService object of the specified pod. 
-func (jc *IncrementalJobController) enqueueByPod(pod *v1.Pod, immediate bool) { +func (c *Controller) enqueueByPod(pod *v1.Pod, immediate bool) { controllerRef := metav1.GetControllerOf(pod) if controllerRef == nil { return } - if controllerRef.Kind != ijControllerKind.Kind { + if controllerRef.Kind != Kind.Kind { return } - service, err := jc.jobLister.IncrementalLearningJobs(pod.Namespace).Get(controllerRef.Name) + service, err := c.jobLister.IncrementalLearningJobs(pod.Namespace).Get(controllerRef.Name) if err != nil { return } @@ -128,27 +131,27 @@ func (jc *IncrementalJobController) enqueueByPod(pod *v1.Pod, immediate bool) { return } - jc.enqueueController(service, immediate) + c.enqueueController(service, immediate) } // When a pod is created, enqueue the controller that manages it and update it's expectations. -func (jc *IncrementalJobController) addPod(obj interface{}) { +func (c *Controller) addPod(obj interface{}) { pod := obj.(*v1.Pod) if pod.DeletionTimestamp != nil { // on a restart of the controller, it's possible a new pod shows up in a state that // is already pending deletion. Prevent the pod from being a creation observation. - jc.deletePod(pod) + c.deletePod(pod) return } // backoff to queue when PodFailed immediate := pod.Status.Phase != v1.PodFailed - jc.enqueueByPod(pod, immediate) + c.enqueueByPod(pod, immediate) } // When a pod is updated, figure out what joint inference service manage it and wake them up. 
-func (jc *IncrementalJobController) updatePod(old, cur interface{}) { +func (c *Controller) updatePod(old, cur interface{}) { curPod := cur.(*v1.Pod) oldPod := old.(*v1.Pod) @@ -157,11 +160,11 @@ func (jc *IncrementalJobController) updatePod(old, cur interface{}) { return } - jc.addPod(curPod) + c.addPod(curPod) } // deletePod enqueues the jointinferenceservice obj When a pod is deleted -func (jc *IncrementalJobController) deletePod(obj interface{}) { +func (c *Controller) deletePod(obj interface{}) { pod, ok := obj.(*v1.Pod) // comment from https://github.com/kubernetes/kubernetes/blob/master/pkg/controller/job/job_controller.go @@ -182,13 +185,13 @@ func (jc *IncrementalJobController) deletePod(obj interface{}) { return } } - jc.enqueueByPod(pod, true) + c.enqueueByPod(pod, true) } // obj could be an *sedna.IncrementalLearningJob, or a DeletionFinalStateUnknown marker item, // immediate tells the controller to update the status right away, and should // happen ONLY when there was a successful pod run. -func (jc *IncrementalJobController) enqueueController(obj interface{}, immediate bool) { +func (c *Controller) enqueueController(obj interface{}, immediate bool) { key, err := k8scontroller.KeyFunc(obj) if err != nil { utilruntime.HandleError(fmt.Errorf("Couldn't get key for object %+v: %v", obj, err)) @@ -197,36 +200,36 @@ func (jc *IncrementalJobController) enqueueController(obj interface{}, immediate backoff := time.Duration(0) if !immediate { - backoff = getBackoff(jc.queue, key) + backoff = runtime.GetBackoff(c.queue, key) } - jc.queue.AddAfter(key, backoff) + c.queue.AddAfter(key, backoff) } // worker runs a worker thread that just dequeues items, processes them, and marks them done. // It enforces that the syncHandler is never invoked concurrently with the same key. 
-func (jc *IncrementalJobController) worker() { - for jc.processNextWorkItem() { +func (c *Controller) worker() { + for c.processNextWorkItem() { } } -func (jc *IncrementalJobController) processNextWorkItem() bool { - key, quit := jc.queue.Get() +func (c *Controller) processNextWorkItem() bool { + key, quit := c.queue.Get() if quit { return false } - defer jc.queue.Done(key) + defer c.queue.Done(key) - forget, err := jc.sync(key.(string)) + forget, err := c.sync(key.(string)) if err == nil { if forget { - jc.queue.Forget(key) + c.queue.Forget(key) } return true } utilruntime.HandleError(fmt.Errorf("Error syncing incrementallearning job: %v", err)) - jc.queue.AddRateLimited(key) + c.queue.AddRateLimited(key) return true } @@ -234,7 +237,7 @@ func (jc *IncrementalJobController) processNextWorkItem() bool { // sync will sync the incrementallearning job with the given key if it has had its expectations fulfilled, meaning // it did not expect to see any more of its pods created or deleted. This function is not meant to be invoked // concurrently with the same key. 
-func (jc *IncrementalJobController) sync(key string) (bool, error) { +func (c *Controller) sync(key string) (bool, error) { startTime := time.Now() defer func() { klog.V(4).Infof("Finished syncing incrementallearning job %q (%v)", key, time.Since(startTime)) @@ -247,7 +250,8 @@ func (jc *IncrementalJobController) sync(key string) (bool, error) { if len(ns) == 0 || len(name) == 0 { return false, fmt.Errorf("invalid incrementallearning job key %q: either namespace or name is missing", key) } - sharedIncrementalJob, err := jc.jobLister.IncrementalLearningJobs(ns).Get(name) + + sharedJob, err := c.jobLister.IncrementalLearningJobs(ns).Get(name) if err != nil { if errors.IsNotFound(err) { klog.V(4).Infof("incrementallearning job has been deleted: %v", key) @@ -255,19 +259,21 @@ func (jc *IncrementalJobController) sync(key string) (bool, error) { } return false, err } - incrementaljob := *sharedIncrementalJob - // set kind for incrementaljob in case that the kind is None - incrementaljob.SetGroupVersionKind(sednav1.SchemeGroupVersion.WithKind("IncrementalLearningJob")) - // incrementaljob first start, create pod for inference - if incrementaljob.Status.StartTime == nil { + + job := *sharedJob + // set kind in case that the kind is None + job.SetGroupVersionKind(Kind) + + // when job is handled at first, create pod for inference + if job.Status.StartTime == nil { now := metav1.Now() - incrementaljob.Status.StartTime = &now - pod := jc.getSpecifiedPods(&incrementaljob, InferencePodType) + job.Status.StartTime = &now + pod := c.getSpecifiedPods(&job, runtime.InferencePodType) if pod == nil { - err = jc.createInferPod(&incrementaljob) + err = c.createInferPod(&job) } else { if pod.Status.Phase != v1.PodRunning && pod.Status.Phase != v1.PodPending { - err = jc.createInferPod(&incrementaljob) + err = c.createInferPod(&job) } } if err != nil { @@ -275,8 +281,8 @@ func (jc *IncrementalJobController) sync(key string) (bool, error) { } } - // if incrementaljob was finished 
previously, we don't want to redo the termination - if IsIncrementalJobFinished(&incrementaljob) { + // if job was finished previously, we don't want to redo the termination + if IsJobFinished(&job) { return true, nil } @@ -284,20 +290,20 @@ func (jc *IncrementalJobController) sync(key string) (bool, error) { jobFailed := false needUpdated := false - // update conditions of incremental job - needUpdated, err = jc.updateIncrementalJobConditions(&incrementaljob) + // transit this job's state machine + needUpdated, err = c.transitJobState(&job) if err != nil { - klog.V(2).Infof("incrementallearning job %v/%v faied to be updated, err:%s", incrementaljob.Namespace, incrementaljob.Name, err) + klog.V(2).Infof("incrementallearning job %v/%v failed to be updated, err:%s", job.Namespace, job.Name, err) } if needUpdated { - if err := jc.updateIncrementalJobStatus(&incrementaljob); err != nil { + if err := c.updateJobStatus(&job); err != nil { return forget, err } - if jobFailed && !IsIncrementalJobFinished(&incrementaljob) { - // returning an error will re-enqueue IncrementalJob after the backoff period - return forget, fmt.Errorf("failed pod(s) detected for incrementaljob key %q", key) + if jobFailed && !IsJobFinished(&job) { + // returning an error will re-enqueue IncrementalLearningJob after the backoff period + return forget, fmt.Errorf("failed pod(s) detected for incrementallearning job key %q", key) } forget = true @@ -308,65 +314,60 @@ func (jc *IncrementalJobController) sync(key string) (bool, error) { // setWorkerNodeNameOfJob sets the worker nodeName of the specified job // which is used for downstream to sync job info to the specified LC located in nodeName. 
-func (jc *IncrementalJobController) setWorkerNodeNameOfJob(job *sednav1.IncrementalLearningJob, jobStage string, nodeName string) error { - key := AnnotationsKeyPrefix + jobStage +func (c *Controller) setWorkerNodeNameOfJob(job *sednav1.IncrementalLearningJob, jobStage string, nodeName string) error { + key := runtime.AnnotationsKeyPrefix + jobStage ann := job.GetAnnotations() - if ann != nil { - if ann[key] == nodeName { - // already set - return nil - } + if ann[key] == nodeName { + // already set + return nil } + dataStr := fmt.Sprintf(`{"metadata":{"annotations":{"%s":"%s"}}}`, key, nodeName) - jobClient := jc.client.IncrementalLearningJobs(job.Namespace) - var err error - for i := 0; i <= ResourceUpdateRetries; i++ { - var newJob *sednav1.IncrementalLearningJob - newJob, err = jobClient.Get(context.TODO(), job.Name, metav1.GetOptions{}) + jobClient := c.client.IncrementalLearningJobs(job.Namespace) + return runtime.RetryUpdateStatus(job.Name, job.Namespace, func() error { + newJob, err := jobClient.Get(context.TODO(), job.Name, metav1.GetOptions{}) if err != nil { - break + return err } annotations := newJob.GetAnnotations() - if annotations != nil { - if annotations[key] == nodeName { - return nil - } - } - - dataStr := fmt.Sprintf(`{"metadata":{"annotations":{"%s":"%s"}}}`, key, nodeName) - if _, err = jobClient.Patch(context.TODO(), job.Name, types.MergePatchType, []byte(dataStr), metav1.PatchOptions{}); err == nil { - break + if annotations[key] == nodeName { + return nil } - } - return err + _, err = jobClient.Patch(context.TODO(), job.Name, types.MergePatchType, []byte(dataStr), metav1.PatchOptions{}) + return err + }) } -// updateIncrementalJobConditions ensures that conditions of incrementallearning job can be changed by podstatus -func (jc *IncrementalJobController) updateIncrementalJobConditions(incrementaljob *sednav1.IncrementalLearningJob) (bool, error) { +// transitJobState transit job to next state +func (c *Controller) transitJobState(job 
*sednav1.IncrementalLearningJob) (bool, error) { var initialType sednav1.ILJobStageConditionType var latestCondition sednav1.ILJobCondition = sednav1.ILJobCondition{ Stage: sednav1.ILJobTrain, Type: initialType, } + var newConditionType sednav1.ILJobStageConditionType var needUpdated = false - jobConditions := incrementaljob.Status.Conditions + var podStatus v1.PodPhase = v1.PodUnknown var pod *v1.Pod + + jobConditions := job.Status.Conditions if len(jobConditions) > 0 { // get latest pod and pod status latestCondition = (jobConditions)[len(jobConditions)-1] - klog.V(2).Infof("incrementallearning job %v/%v latest stage %v:", incrementaljob.Namespace, incrementaljob.Name, + klog.V(2).Infof("incrementallearning job %v/%v latest stage %v:", job.Namespace, job.Name, latestCondition.Stage) - pod = jc.getSpecifiedPods(incrementaljob, string(latestCondition.Stage)) + pod = c.getSpecifiedPods(job, string(latestCondition.Stage)) if pod != nil { podStatus = pod.Status.Phase } } + jobStage := latestCondition.Stage currentType := latestCondition.Type newConditionType = currentType @@ -383,14 +384,14 @@ func (jc *IncrementalJobController) updateIncrementalJobConditions(incrementaljo // include train, eval, deploy pod var err error if jobStage == sednav1.ILJobDeploy { - err = jc.restartInferPod(incrementaljob) + err = c.restartInferPod(job) if err != nil { - klog.V(2).Infof("incrementallearning job %v/%v inference pod failed to restart, err:%s", incrementaljob.Namespace, incrementaljob.Name, err) + klog.V(2).Infof("incrementallearning job %v/%v inference pod failed to restart, err:%s", job.Namespace, job.Name, err) } else { - klog.V(2).Infof("incrementallearning job %v/%v inference pod restarts successfully", incrementaljob.Namespace, incrementaljob.Name) + klog.V(2).Infof("incrementallearning job %v/%v inference pod restarts successfully", job.Namespace, job.Name) } } else if podStatus != v1.PodPending && podStatus != v1.PodRunning { - err = jc.createPod(incrementaljob, 
jobStage) + err = c.createPod(job, jobStage) } if err != nil { return needUpdated, err @@ -406,17 +407,17 @@ func (jc *IncrementalJobController) updateIncrementalJobConditions(incrementaljo newConditionType = sednav1.ILJobStageCondRunning // add nodeName to job - if err := jc.setWorkerNodeNameOfJob(incrementaljob, string(jobStage), pod.Spec.NodeName); err != nil { + if err := c.setWorkerNodeNameOfJob(job, string(jobStage), pod.Spec.NodeName); err != nil { return needUpdated, err } } } else if podStatus == v1.PodSucceeded { // watch pod status, if pod completed, set type completed newConditionType = sednav1.ILJobStageCondCompleted - klog.V(2).Infof("incrementallearning job %v/%v %v stage completed!", incrementaljob.Namespace, incrementaljob.Name, jobStage) + klog.V(2).Infof("incrementallearning job %v/%v %v stage completed!", job.Namespace, job.Name, jobStage) } else if podStatus == v1.PodFailed { newConditionType = sednav1.ILJobStageCondFailed - klog.V(2).Infof("incrementallearning job %v/%v %v stage failed!", incrementaljob.Namespace, incrementaljob.Name, jobStage) + klog.V(2).Infof("incrementallearning job %v/%v %v stage failed!", job.Namespace, job.Name, jobStage) } case sednav1.ILJobStageCondCompleted: jobStage = getNextStage(jobStage) @@ -429,31 +430,29 @@ func (jc *IncrementalJobController) updateIncrementalJobConditions(incrementaljo default: // do nothing when given other type out of cases } - klog.V(2).Infof("incrementallearning job %v/%v, conditions: %v", incrementaljob.Namespace, incrementaljob.Name, jobConditions) + + klog.V(2).Infof("incrementallearning job %v/%v, conditions: %v", job.Namespace, job.Name, jobConditions) if latestCondition.Type != newConditionType { - incrementaljob.Status.Conditions = append(incrementaljob.Status.Conditions, NewIncrementalJobCondition(newConditionType, jobStage)) + job.Status.Conditions = append(job.Status.Conditions, NewIncrementalJobCondition(newConditionType, jobStage)) needUpdated = true - return needUpdated, nil } 
+ return needUpdated, nil } -// updateIncrementalJobStatus ensures that jobstatus can be updated rightly -func (jc *IncrementalJobController) updateIncrementalJobStatus(incrementaljob *sednav1.IncrementalLearningJob) error { - jobClient := jc.client.IncrementalLearningJobs(incrementaljob.Namespace) - var err error - for i := 0; i <= ResourceUpdateRetries; i++ { - var newIncrementalJob *sednav1.IncrementalLearningJob - newIncrementalJob, err = jobClient.Get(context.TODO(), incrementaljob.Name, metav1.GetOptions{}) +// updateJobStatus ensures that job status can be updated rightly +func (c *Controller) updateJobStatus(job *sednav1.IncrementalLearningJob) error { + jobClient := c.client.IncrementalLearningJobs(job.Namespace) + return runtime.RetryUpdateStatus(job.Name, job.Namespace, func() error { + newJob, err := jobClient.Get(context.TODO(), job.Name, metav1.GetOptions{}) if err != nil { - break + return err } - newIncrementalJob.Status = incrementaljob.Status - if _, err = jobClient.UpdateStatus(context.TODO(), newIncrementalJob, metav1.UpdateOptions{}); err == nil { - break - } - } - return err + + newJob.Status = job.Status + _, err = jobClient.UpdateStatus(context.TODO(), newJob, metav1.UpdateOptions{}) + return err + }) } func NewIncrementalJobCondition(conditionType sednav1.ILJobStageConditionType, jobStage sednav1.ILJobStage) sednav1.ILJobCondition { @@ -468,26 +467,29 @@ func NewIncrementalJobCondition(conditionType sednav1.ILJobStageConditionType, j } } -func (jc *IncrementalJobController) generatePodName(jobName string, workerType string) string { +func (c *Controller) generatePodName(jobName string, workerType string) string { return jobName + "-" + strings.ToLower(workerType) + "-" + utilrand.String(5) } -func (jc *IncrementalJobController) getSpecifiedPods(job *sednav1.IncrementalLearningJob, podType string) *v1.Pod { - if podType == "Deploy" { - podType = InferencePodType - } +func (c *Controller) getSpecifiedPods(job *sednav1.IncrementalLearningJob, 
podType string) *v1.Pod { var latestPod *v1.Pod - selector, _ := GenerateSelector(job) - pods, err := jc.podStore.Pods(job.Namespace).List(selector) + selector, _ := runtime.GenerateSelector(job) + pods, err := c.podStore.Pods(job.Namespace).List(selector) if len(pods) == 0 || err != nil { return nil } + var matchTag = false latestPod = pods[0] + + if podType == "Deploy" { + podType = runtime.InferencePodType + } + for _, pod := range pods { s := strings.Split(pod.Name, "-") - CurrentPodType := s[len(s)-2] - if (latestPod.CreationTimestamp.Before(&pod.CreationTimestamp) || latestPod.CreationTimestamp.Equal(&pod.CreationTimestamp)) && CurrentPodType == strings.ToLower(podType) { + currentPodType := s[len(s)-2] + if (latestPod.CreationTimestamp.Before(&pod.CreationTimestamp) || latestPod.CreationTimestamp.Equal(&pod.CreationTimestamp)) && currentPodType == strings.ToLower(podType) { latestPod = pod matchTag = true } @@ -498,20 +500,22 @@ func (jc *IncrementalJobController) getSpecifiedPods(job *sednav1.IncrementalLea return latestPod } -func (jc *IncrementalJobController) restartInferPod(job *sednav1.IncrementalLearningJob) error { - inferPod := jc.getSpecifiedPods(job, InferencePodType) +func (c *Controller) restartInferPod(job *sednav1.IncrementalLearningJob) error { + inferPod := c.getSpecifiedPods(job, runtime.InferencePodType) if inferPod == nil { klog.V(2).Infof("No inferpod is running in incrementallearning job %v/%v", job.Namespace, job.Name) - err := jc.createInferPod(job) + err := c.createInferPod(job) return err } + ctx := context.Background() - err := jc.kubeClient.CoreV1().Pods(job.Namespace).Delete(ctx, inferPod.Name, metav1.DeleteOptions{}) + err := c.kubeClient.CoreV1().Pods(job.Namespace).Delete(ctx, inferPod.Name, metav1.DeleteOptions{}) if err != nil { klog.Warningf("failed to delete inference pod %s for incrementallearning job %v/%v, err:%s", inferPod.Name, job.Namespace, job.Name, err) return err } - err = jc.createInferPod(job) + + err = 
c.createInferPod(job) if err != nil { klog.Warningf("failed to create inference pod %s for incrementallearning job %v/%v, err:%s", inferPod.Name, job.Namespace, job.Name, err) return err @@ -532,14 +536,14 @@ func getNextStage(currentStage sednav1.ILJobStage) sednav1.ILJobStage { } } -func IsIncrementalJobFinished(j *sednav1.IncrementalLearningJob) bool { +func IsJobFinished(j *sednav1.IncrementalLearningJob) bool { // TODO return false } -func (jc *IncrementalJobController) getSecret(namespace, name string, ownerStr string) (secret *v1.Secret, err error) { +func (c *Controller) getSecret(namespace, name string, ownerStr string) (secret *v1.Secret, err error) { if name != "" { - secret, err = jc.kubeClient.CoreV1().Secrets(namespace).Get(context.TODO(), name, metav1.GetOptions{}) + secret, err = c.kubeClient.CoreV1().Secrets(namespace).Get(context.TODO(), name, metav1.GetOptions{}) if err != nil { err = fmt.Errorf("failed to get the secret %s for %s: %w", name, @@ -549,7 +553,7 @@ func (jc *IncrementalJobController) getSecret(namespace, name string, ownerStr s return } -func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJob, podtype sednav1.ILJobStage) (err error) { +func (c *Controller) createPod(job *sednav1.IncrementalLearningJob, podtype sednav1.ILJobStage) (err error) { ctx := context.Background() var podTemplate *v1.PodTemplateSpec @@ -558,25 +562,25 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo deployModelName := job.Spec.DeploySpec.Model.Name // check initial model name - initialModel, err := jc.client.Models(job.Namespace).Get(ctx, initialModelName, metav1.GetOptions{}) + initialModel, err := c.client.Models(job.Namespace).Get(ctx, initialModelName, metav1.GetOptions{}) if err != nil { return fmt.Errorf("failed to get initial model %s: %w", initialModelName, err) } - _, err = jc.client.Models(job.Namespace).Get(ctx, deployModelName, metav1.GetOptions{}) + _, err = 
c.client.Models(job.Namespace).Get(ctx, deployModelName, metav1.GetOptions{}) if err != nil { return fmt.Errorf("failed to get deploy model %s: %w", deployModelName, err) } - dataset, err := jc.client.Datasets(job.Namespace).Get(ctx, incrementalDatasetName, metav1.GetOptions{}) + dataset, err := c.client.Datasets(job.Namespace).Get(ctx, incrementalDatasetName, metav1.GetOptions{}) if err != nil { return fmt.Errorf("failed to get dataset %s: %w", incrementalDatasetName, err) } - datasetSecret, err := jc.getSecret( + datasetSecret, err := c.getSecret( job.Namespace, dataset.Spec.CredentialName, fmt.Sprintf("dataset %s", dataset.Name), @@ -585,7 +589,7 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo return err } - jobSecret, err := jc.getSecret( + jobSecret, err := c.getSecret( job.Namespace, job.Spec.CredentialName, fmt.Sprintf("incremental job %s", job.Name), @@ -595,13 +599,14 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo } // get all url for train and eval from data in condition + var cond IncrementalCondData condDataStr := job.Status.Conditions[len(job.Status.Conditions)-1].Data klog.V(2).Infof("incrementallearning job %v/%v data condition:%s", job.Namespace, job.Name, condDataStr) - var cond IncrementalCondData (&cond).Unmarshal([]byte(condDataStr)) if cond.Input == nil { return fmt.Errorf("empty input from condData") } + dataURL := cond.Input.DataURL inputmodelURLs := cond.GetInputModelURLs() @@ -614,25 +619,26 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo originalDataURLOrIndex = dataset.Spec.URL } - var workerParam *WorkerParam = new(WorkerParam) + var workerParam runtime.WorkerParam + if podtype == sednav1.ILJobTrain { - workerParam.workerType = TrainPodType + workerParam.WorkerType = runtime.TrainPodType podTemplate = &job.Spec.TrainSpec.Template - // Env parameters for train - workerParam.env = map[string]string{ + // Env parameters for train + 
workerParam.Env = map[string]string{ "NAMESPACE": job.Namespace, "JOB_NAME": job.Name, "WORKER_NAME": "train-worker-" + utilrand.String(5), - "LC_SERVER": jc.cfg.LC.Server, + "LC_SERVER": c.cfg.LC.Server, } baseModelURL := inputmodelURLs[0] var baseModelSecret *v1.Secret if baseModelURL == initialModel.Spec.URL { - baseModelSecret, err = jc.getSecret( + baseModelSecret, err = c.getSecret( job.Namespace, initialModel.Spec.CredentialName, fmt.Sprintf("initial model %s", initialModelName), @@ -644,17 +650,17 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo baseModelSecret = jobSecret } - workerParam.mounts = append(workerParam.mounts, - WorkerMount{ - URL: &MountURL{ + workerParam.Mounts = append(workerParam.Mounts, + runtime.WorkerMount{ + URL: &runtime.MountURL{ URL: baseModelURL, Secret: baseModelSecret, DownloadByInitializer: true, }, EnvName: "BASE_MODEL_URL", }, - WorkerMount{ - URL: &MountURL{ + runtime.WorkerMount{ + URL: &runtime.MountURL{ URL: cond.Input.OutputDir, Secret: jobSecret, DownloadByInitializer: false, @@ -662,8 +668,8 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo EnvName: "MODEL_URL", }, - WorkerMount{ - URL: &MountURL{ + runtime.WorkerMount{ + URL: &runtime.MountURL{ URL: dataURL, DownloadByInitializer: true, Secret: jobSecret, @@ -672,8 +678,8 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo }, // see https://github.com/kubeedge/sedna/issues/35 - WorkerMount{ - URL: &MountURL{ + runtime.WorkerMount{ + URL: &runtime.MountURL{ Secret: datasetSecret, URL: originalDataURLOrIndex, DownloadByInitializer: true, @@ -683,23 +689,23 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo }, ) } else { + // Configure eval worker's mounts and envs podTemplate = &job.Spec.EvalSpec.Template - workerParam.workerType = "Eval" + workerParam.WorkerType = "Eval" - // Configure Env information for eval by initial WorkerParam - 
workerParam.env = map[string]string{ + workerParam.Env = map[string]string{ "NAMESPACE": job.Namespace, "JOB_NAME": job.Name, "WORKER_NAME": "eval-worker-" + utilrand.String(5), - "LC_SERVER": jc.cfg.LC.Server, + "LC_SERVER": c.cfg.LC.Server, } - var modelMountURLs []MountURL + var modelMountURLs []runtime.MountURL for _, url := range inputmodelURLs { var modelSecret *v1.Secret if url == initialModel.Spec.URL { - modelSecret, err = jc.getSecret( + modelSecret, err = c.getSecret( job.Namespace, initialModel.Spec.CredentialName, fmt.Sprintf("initial model %s", initialModelName), @@ -711,21 +717,21 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo modelSecret = jobSecret } - modelMountURLs = append(modelMountURLs, MountURL{ + modelMountURLs = append(modelMountURLs, runtime.MountURL{ URL: url, Secret: modelSecret, DownloadByInitializer: true, }) } - workerParam.mounts = append(workerParam.mounts, - WorkerMount{ + workerParam.Mounts = append(workerParam.Mounts, + runtime.WorkerMount{ URLs: modelMountURLs, Name: "models", EnvName: "MODEL_URLS", }, - WorkerMount{ - URL: &MountURL{ + runtime.WorkerMount{ + URL: &runtime.MountURL{ URL: dataURL, Secret: datasetSecret, DownloadByInitializer: true, @@ -734,8 +740,8 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo EnvName: "TEST_DATASET_URL", }, - WorkerMount{ - URL: &MountURL{ + runtime.WorkerMount{ + URL: &runtime.MountURL{ Secret: datasetSecret, URL: originalDataURLOrIndex, DownloadByInitializer: true, @@ -748,40 +754,38 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo } // set the default policy instead of Always policy - workerParam.restartPolicy = v1.RestartPolicyOnFailure - workerParam.hostNetwork = true + workerParam.RestartPolicy = v1.RestartPolicyOnFailure + workerParam.HostNetwork = true // create pod based on podtype - _, err = createPodWithTemplate(jc.kubeClient, job, podTemplate, workerParam) - if err != nil { - 
return err - } + _, err = runtime.CreatePodWithTemplate(c.kubeClient, job, podTemplate, &workerParam) return } -func (jc *IncrementalJobController) createInferPod(job *sednav1.IncrementalLearningJob) error { +func (c *Controller) createInferPod(job *sednav1.IncrementalLearningJob) error { infermodelName := job.Spec.DeploySpec.Model.Name - inferModel, err := jc.client.Models(job.Namespace).Get(context.TODO(), infermodelName, metav1.GetOptions{}) + inferModel, err := c.client.Models(job.Namespace).Get(context.TODO(), infermodelName, metav1.GetOptions{}) if err != nil { return fmt.Errorf("failed to get infer model %s: %w", infermodelName, err) } + inferModelURL := inferModel.Spec.URL - // Env parameters for edge HEMParameterJSON, _ := json.Marshal(job.Spec.DeploySpec.HardExampleMining.Parameters) HEMParameterString := string(HEMParameterJSON) - // Configure container mounting and Env information by initial WorkerParam - modelSecret, err := jc.getSecret( + modelSecret, err := c.getSecret( job.Namespace, inferModel.Spec.CredentialName, fmt.Sprintf("model %s", inferModel.Name), ) - var workerParam *WorkerParam = new(WorkerParam) - workerParam.mounts = append(workerParam.mounts, - WorkerMount{ - URL: &MountURL{ + + // Configure inference worker's mounts and envs + var workerParam runtime.WorkerParam + workerParam.Mounts = append(workerParam.Mounts, + runtime.WorkerMount{ + URL: &runtime.MountURL{ URL: inferModelURL, Secret: modelSecret, DownloadByInitializer: true, @@ -791,7 +795,7 @@ func (jc *IncrementalJobController) createInferPod(job *sednav1.IncrementalLearn }, ) - workerParam.env = map[string]string{ + workerParam.Env = map[string]string{ "NAMESPACE": job.Namespace, "JOB_NAME": job.Name, "WORKER_NAME": "inferworker-" + utilrand.String(5), @@ -799,71 +803,48 @@ func (jc *IncrementalJobController) createInferPod(job *sednav1.IncrementalLearn "HEM_NAME": job.Spec.DeploySpec.HardExampleMining.Name, "HEM_PARAMETERS": HEMParameterString, - "LC_SERVER": jc.cfg.LC.Server, 
+ "LC_SERVER": c.cfg.LC.Server, } - workerParam.workerType = InferencePodType - workerParam.hostNetwork = true + workerParam.WorkerType = runtime.InferencePodType + workerParam.HostNetwork = true - // create edge pod - _, err = createPodWithTemplate(jc.kubeClient, job, &job.Spec.DeploySpec.Template, workerParam) + // create the inference worker + _, err = runtime.CreatePodWithTemplate(c.kubeClient, job, &job.Spec.DeploySpec.Template, &workerParam) return err } -// GetName returns the name of the incrementallearning job controller -func (jc *IncrementalJobController) GetName() string { - return "IncrementalLearningJobController" -} - -// NewIncrementalJobController creates a new IncrementalJob controller that keeps the relevant pods -// in sync with their corresponding IncrementalJob objects. -func NewIncrementalJobController(cfg *config.ControllerConfig) (FeatureControllerI, error) { - namespace := cfg.Namespace - if namespace == "" { - namespace = metav1.NamespaceAll - } - kubeClient, err := utils.KubeClient() - if err != nil { - return nil, err - } - - kubecfg, err := utils.KubeConfig() - if err != nil { - return nil, err - } - crdclient, err := clientset.NewForConfig(kubecfg) - if err != nil { - return nil, err - } - - kubeInformerFactory := kubeinformers.NewSharedInformerFactoryWithOptions(kubeClient, time.Second*30, kubeinformers.WithNamespace(namespace)) +// New creates a new incremental learning job controller that keeps the relevant pods +// in sync with the corresponding IncrementalLearningJob objects. 
+func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) { + podInformer := cc.KubeInformerFactory.Core().V1().Pods() - podInformer := kubeInformerFactory.Core().V1().Pods() - - jobInformerFactory := informers.NewSharedInformerFactoryWithOptions(crdclient, time.Second*30, informers.WithNamespace(namespace)) - jobInformer := jobInformerFactory.Sedna().V1alpha1().IncrementalLearningJobs() + jobInformer := cc.SednaInformerFactory.Sedna().V1alpha1().IncrementalLearningJobs() eventBroadcaster := record.NewBroadcaster() - eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")}) + eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: cc.KubeClient.CoreV1().Events("")}) + + jc := &Controller{ + kubeClient: cc.KubeClient, + client: cc.SednaClient.SednaV1alpha1(), - jc := &IncrementalJobController{ - kubeClient: kubeClient, - client: crdclient.SednaV1alpha1(), + queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(runtime.DefaultBackOff, runtime.MaxBackOff), Name), - queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(DefaultBackOff, MaxBackOff), "incrementallearningjob"), - recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "incrementallearningjob-controller"}), - cfg: cfg, + cfg: cc.Config, } jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { jc.enqueueController(obj, true) + jc.syncToEdge(watch.Added, obj) }, UpdateFunc: func(old, cur interface{}) { jc.enqueueController(cur, true) + jc.syncToEdge(watch.Added, cur) }, DeleteFunc: func(obj interface{}) { jc.enqueueController(obj, true) + jc.syncToEdge(watch.Deleted, obj) }, }) jc.jobLister = jobInformer.Lister() @@ -877,8 +858,5 @@ func NewIncrementalJobController(cfg *config.ControllerConfig) (FeatureControlle jc.podStore = podInformer.Lister() jc.podStoreSynced = 
podInformer.Informer().HasSynced - stopCh := make(chan struct{}) - kubeInformerFactory.Start(stopCh) - jobInformerFactory.Start(stopCh) - return jc, err + return jc, nil } diff --git a/pkg/globalmanager/controllers/incrementallearning/upstream.go b/pkg/globalmanager/controllers/incrementallearning/upstream.go new file mode 100644 index 000000000..7932a0038 --- /dev/null +++ b/pkg/globalmanager/controllers/incrementallearning/upstream.go @@ -0,0 +1,162 @@ +/* +Copyright 2021 The KubeEdge Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package incrementallearning + +import ( + "context" + "encoding/json" + "fmt" + "strings" + + sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" + "github.com/kubeedge/sedna/pkg/globalmanager/runtime" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +type Model = runtime.Model + +// the data of this condition including the input/output to do the next step +type IncrementalCondData struct { + Input *struct { + // Only one model cases + Model *Model `json:"model,omitempty"` + Models []Model `json:"models,omitempty"` + + DataURL string `json:"dataURL,omitempty"` + + // the data samples reference will be stored into this URL. 
+ // The content of this url would be: + // # the first uncomment line means the directory + // s3://dataset/ + // mnist/0.jpg + // mnist/1.jpg + DataIndexURL string `json:"dataIndexURL,omitempty"` + + OutputDir string `json:"outputDir,omitempty"` + } `json:"input,omitempty"` + + Output *struct { + Model *Model `json:"model,omitempty"` + Models []Model `json:"models,omitempty"` + } `json:"output,omitempty"` +} + +func (cd *IncrementalCondData) joinModelURLs(model *Model, models []Model) []string { + var modelURLs []string + if model != nil { + modelURLs = append(modelURLs, model.GetURL()) + } else { + for _, m := range models { + modelURLs = append(modelURLs, m.GetURL()) + } + } + return modelURLs +} + +func (cd *IncrementalCondData) GetInputModelURLs() []string { + return cd.joinModelURLs(cd.Input.Model, cd.Input.Models) +} + +func (cd *IncrementalCondData) GetOutputModelURLs() []string { + return cd.joinModelURLs(cd.Output.Model, cd.Output.Models) +} + +func (cd *IncrementalCondData) Unmarshal(data []byte) error { + return json.Unmarshal(data, cd) +} + +func (cd IncrementalCondData) Marshal() ([]byte, error) { + return json.Marshal(cd) +} + +func (c *Controller) appendStatusCondition(name, namespace string, cond sednav1.ILJobCondition) error { + client := c.client.IncrementalLearningJobs(namespace) + return runtime.RetryUpdateStatus(name, namespace, (func() error { + job, err := client.Get(context.TODO(), name, metav1.GetOptions{}) + if err != nil { + return err + } + job.Status.Conditions = append(job.Status.Conditions, cond) + _, err = client.UpdateStatus(context.TODO(), job, metav1.UpdateOptions{}) + return err + })) +} + +// updateFromEdge syncs the edge updates to k8s +func (c *Controller) updateFromEdge(name, namespace, operation string, content []byte) error { + var jobStatus struct { + Phase string `json:"phase"` + Status string `json:"status"` + } + + err := json.Unmarshal(content, &jobStatus) + if err != nil { + return err + } + + // Get the condition 
data. + // Here unmarshal and marshal immediately to skip the unnecessary fields + var condData IncrementalCondData + err = json.Unmarshal(content, &condData) + if err != nil { + return err + } + condDataBytes, _ := json.Marshal(&condData) + + cond := sednav1.ILJobCondition{ + Status: v1.ConditionTrue, + LastHeartbeatTime: metav1.Now(), + LastTransitionTime: metav1.Now(), + Data: string(condDataBytes), + Message: "reported by lc", + } + + switch strings.ToLower(jobStatus.Phase) { + case "train": + cond.Stage = sednav1.ILJobTrain + case "eval": + cond.Stage = sednav1.ILJobEval + case "deploy": + cond.Stage = sednav1.ILJobDeploy + default: + return fmt.Errorf("invalid condition stage: %v", jobStatus.Phase) + } + + switch strings.ToLower(jobStatus.Status) { + case "ready": + cond.Type = sednav1.ILJobStageCondReady + case "completed": + cond.Type = sednav1.ILJobStageCondCompleted + case "failed": + cond.Type = sednav1.ILJobStageCondFailed + case "waiting": + cond.Type = sednav1.ILJobStageCondWaiting + default: + return fmt.Errorf("invalid condition type: %v", jobStatus.Status) + } + + err = c.appendStatusCondition(name, namespace, cond) + if err != nil { + return fmt.Errorf("failed to append condition, err:%+w", err) + } + return nil +} + +func (c *Controller) SetUpstreamHandler(addFunc runtime.UpstreamHandlerAddFunc) error { + return addFunc(KindName, c.updateFromEdge) +} diff --git a/pkg/globalmanager/controllers/jointinference/downstream.go b/pkg/globalmanager/controllers/jointinference/downstream.go new file mode 100644 index 000000000..99b2563d6 --- /dev/null +++ b/pkg/globalmanager/controllers/jointinference/downstream.go @@ -0,0 +1,56 @@ +/* +Copyright 2021 The KubeEdge Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package jointinference + +import ( + "fmt" + + "k8s.io/apimachinery/pkg/watch" + + sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" + "github.com/kubeedge/sedna/pkg/globalmanager/runtime" +) + +func (c *Controller) syncToEdge(eventType watch.EventType, obj interface{}) error { + joint, ok := obj.(*sednav1.JointInferenceService) + if !ok { + return nil + } + + // Since Kind may be empty, + // we need to fix the kind here if missing. + // more details at https://github.com/kubernetes/kubernetes/issues/3030 + joint.Kind = KindName + + // Here only propagate to the nodes with non empty name + // FIXME: only the case that Spec.NodeName specified is support + nodeName := joint.Spec.EdgeWorker.Template.Spec.NodeName + if len(nodeName) == 0 { + return fmt.Errorf("empty node name") + } + + if len(joint.Kind) == 0 { + joint.Kind = KindName + } + return c.sendToEdgeFunc(nodeName, eventType, joint) +} + +func (c *Controller) SetDownstreamSendFunc(f runtime.DownstreamSendFunc) error { + c.sendToEdgeFunc = f + + return nil +} diff --git a/pkg/globalmanager/jointinferenceservice.go b/pkg/globalmanager/controllers/jointinference/jointinferenceservice.go similarity index 56% rename from pkg/globalmanager/jointinferenceservice.go rename to pkg/globalmanager/controllers/jointinference/jointinferenceservice.go index 8d22fa3b7..faff1143b 100644 --- a/pkg/globalmanager/jointinferenceservice.go +++ b/pkg/globalmanager/controllers/jointinference/jointinferenceservice.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the 
License. */ -package globalmanager +package jointinference import ( "context" @@ -29,7 +29,7 @@ import ( utilrand "k8s.io/apimachinery/pkg/util/rand" utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/apimachinery/pkg/util/wait" - kubeinformers "k8s.io/client-go/informers" + "k8s.io/apimachinery/pkg/watch" "k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes/scheme" v1core "k8s.io/client-go/kubernetes/typed/core/v1" @@ -41,26 +41,32 @@ import ( k8scontroller "k8s.io/kubernetes/pkg/controller" sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" - clientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned" sednaclientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1" - informers "github.com/kubeedge/sedna/pkg/client/informers/externalversions" sednav1listers "github.com/kubeedge/sedna/pkg/client/listers/sedna/v1alpha1" "github.com/kubeedge/sedna/pkg/globalmanager/config" - messageContext "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer/ws" - "github.com/kubeedge/sedna/pkg/globalmanager/utils" + "github.com/kubeedge/sedna/pkg/globalmanager/runtime" +) + +const ( + // Name is this controller name + Name = "JointInference" + + // KindName is the kind name of CR this controller controls + KindName = "JointInferenceService" ) const ( jointInferenceForEdge = "Edge" jointInferenceForCloud = "Cloud" + bigModelPort = 5000 ) -// jointServiceControllerKind contains the schema.GroupVersionKind for this controller type. -var jointServiceControllerKind = sednav1.SchemeGroupVersion.WithKind("JointInferenceService") +// Kind contains the schema.GroupVersionKind for this controller type. +var Kind = sednav1.SchemeGroupVersion.WithKind(Name) -// JointInferenceServiceController ensures that all JointInferenceService objects +// Controller ensures that all JointInferenceService objects // have corresponding pods to run their configured workload. 
-type JointInferenceServiceController struct { +type Controller struct { kubeClient kubernetes.Interface client sednaclientset.SednaV1alpha1Interface @@ -69,7 +75,7 @@ type JointInferenceServiceController struct { // A store of pods podStore corelisters.PodLister - // serviceStoreSynced returns true if the jointinferenceservice store has been synced at least once. + // serviceStoreSynced returns true if the JointInferenceService store has been synced at least once. serviceStoreSynced cache.InformerSynced // A store of service serviceLister sednav1listers.JointInferenceServiceLister @@ -80,48 +86,47 @@ type JointInferenceServiceController struct { recorder record.EventRecorder cfg *config.ControllerConfig + + sendToEdgeFunc runtime.DownstreamSendFunc } -// Start starts the main goroutine responsible for watching and syncing services. -func (jc *JointInferenceServiceController) Start() error { +// Run starts the main goroutine responsible for watching and syncing services. +func (c *Controller) Run(stopCh <-chan struct{}) { workers := 1 - stopCh := messageContext.Done() - go func() { - defer utilruntime.HandleCrash() - defer jc.queue.ShutDown() - klog.Infof("Starting joint inference service controller") - defer klog.Infof("Shutting down joint inference service controller") + defer utilruntime.HandleCrash() + defer c.queue.ShutDown() - if !cache.WaitForNamedCacheSync("jointinferenceservice", stopCh, jc.podStoreSynced, jc.serviceStoreSynced) { - klog.Errorf("failed to wait for joint inferce service caches to sync") + klog.Infof("Starting %s controller", Name) + defer klog.Infof("Shutting down %s controller", Name) - return - } + if !cache.WaitForNamedCacheSync(Name, stopCh, c.podStoreSynced, c.serviceStoreSynced) { + klog.Errorf("failed to wait for %s caches to sync", Name) - klog.Infof("Starting joint inference service workers") - for i := 0; i < workers; i++ { - go wait.Until(jc.worker, time.Second, stopCh) - } + return + } - <-stopCh - }() - return nil + 
klog.Infof("Starting %s workers", Name) + for i := 0; i < workers; i++ { + go wait.Until(c.worker, time.Second, stopCh) + } + + <-stopCh } -// enqueueByPod enqueues the jointInferenceService object of the specified pod. -func (jc *JointInferenceServiceController) enqueueByPod(pod *v1.Pod, immediate bool) { +// enqueueByPod enqueues the JointInferenceService object of the specified pod. +func (c *Controller) enqueueByPod(pod *v1.Pod, immediate bool) { controllerRef := metav1.GetControllerOf(pod) if controllerRef == nil { return } - if controllerRef.Kind != jointServiceControllerKind.Kind { + if controllerRef.Kind != Kind.Kind { return } - service, err := jc.serviceLister.JointInferenceServices(pod.Namespace).Get(controllerRef.Name) + service, err := c.serviceLister.JointInferenceServices(pod.Namespace).Get(controllerRef.Name) if err != nil { return } @@ -130,27 +135,27 @@ func (jc *JointInferenceServiceController) enqueueByPod(pod *v1.Pod, immediate b return } - jc.enqueueController(service, immediate) + c.enqueueController(service, immediate) } // When a pod is created, enqueue the controller that manages it and update it's expectations. -func (jc *JointInferenceServiceController) addPod(obj interface{}) { +func (c *Controller) addPod(obj interface{}) { pod := obj.(*v1.Pod) if pod.DeletionTimestamp != nil { // on a restart of the controller, it's possible a new pod shows up in a state that // is already pending deletion. Prevent the pod from being a creation observation. - jc.deletePod(pod) + c.deletePod(pod) return } // backoff to queue when PodFailed immediate := pod.Status.Phase != v1.PodFailed - jc.enqueueByPod(pod, immediate) + c.enqueueByPod(pod, immediate) } // When a pod is updated, figure out what joint inference service manage it and wake them up. 
-func (jc *JointInferenceServiceController) updatePod(old, cur interface{}) { +func (c *Controller) updatePod(old, cur interface{}) { curPod := cur.(*v1.Pod) oldPod := old.(*v1.Pod) @@ -159,11 +164,11 @@ func (jc *JointInferenceServiceController) updatePod(old, cur interface{}) { return } - jc.addPod(curPod) + c.addPod(curPod) } -// deletePod enqueues the jointinferenceservice obj When a pod is deleted -func (jc *JointInferenceServiceController) deletePod(obj interface{}) { +// deletePod enqueues the JointInferenceService obj when a pod is deleted +func (c *Controller) deletePod(obj interface{}) { pod, ok := obj.(*v1.Pod) // comment from https://github.com/kubernetes/kubernetes/blob/master/pkg/controller/job/job_controller.go @@ -171,7 +176,7 @@ func (jc *JointInferenceServiceController) deletePod(obj interface{}) { // When a delete is dropped, the relist will notice a pod in the store not // in the list, leading to the insertion of a tombstone object which contains // the deleted key/value. Note that this value might be stale. If the pod - // changed labels the new jointinferenceservice will not be woken up till the periodic resync. + // changed labels the new JointInferenceService will not be woken up till the periodic resync. if !ok { tombstone, ok := obj.(cache.DeletedFinalStateUnknown) if !ok { @@ -184,13 +189,13 @@ func (jc *JointInferenceServiceController) deletePod(obj interface{}) { return } } - jc.enqueueByPod(pod, true) + c.enqueueByPod(pod, true) } // obj could be an *sednav1.JointInferenceService, or a DeletionFinalStateUnknown marker item, // immediate tells the controller to update the status right away, and should // happen ONLY when there was a successful pod run.
-func (jc *JointInferenceServiceController) enqueueController(obj interface{}, immediate bool) { +func (c *Controller) enqueueController(obj interface{}, immediate bool) { key, err := k8scontroller.KeyFunc(obj) if err != nil { klog.Warningf("Couldn't get key for object %+v: %v", obj, err) @@ -199,42 +204,42 @@ func (jc *JointInferenceServiceController) enqueueController(obj interface{}, im backoff := time.Duration(0) if !immediate { - backoff = getBackoff(jc.queue, key) + backoff = runtime.GetBackoff(c.queue, key) } - jc.queue.AddAfter(key, backoff) + c.queue.AddAfter(key, backoff) } // worker runs a worker thread that just dequeues items, processes them, and marks them done. // It enforces that the sync is never invoked concurrently with the same key. -func (jc *JointInferenceServiceController) worker() { - for jc.processNextWorkItem() { +func (c *Controller) worker() { + for c.processNextWorkItem() { } } -func (jc *JointInferenceServiceController) processNextWorkItem() bool { - key, quit := jc.queue.Get() +func (c *Controller) processNextWorkItem() bool { + key, quit := c.queue.Get() if quit { return false } - defer jc.queue.Done(key) + defer c.queue.Done(key) - forget, err := jc.sync(key.(string)) + forget, err := c.sync(key.(string)) if err == nil { if forget { - jc.queue.Forget(key) + c.queue.Forget(key) } return true } klog.Warningf("Error syncing jointinference service: %v", err) - jc.queue.AddRateLimited(key) + c.queue.AddRateLimited(key) return true } // sync will sync the jointinferenceservice with the given key. // This function is not meant to be invoked concurrently with the same key. 
-func (jc *JointInferenceServiceController) sync(key string) (bool, error) { +func (c *Controller) sync(key string) (bool, error) { startTime := time.Now() defer func() { klog.V(4).Infof("Finished syncing jointinference service %q (%v)", key, time.Since(startTime)) @@ -247,7 +252,7 @@ func (jc *JointInferenceServiceController) sync(key string) (bool, error) { if len(ns) == 0 || len(name) == 0 { return false, fmt.Errorf("invalid jointinference service key %q: either namespace or name is missing", key) } - sharedJointinferenceservice, err := jc.serviceLister.JointInferenceServices(ns).Get(name) + sharedService, err := c.serviceLister.JointInferenceServices(ns).Get(name) if err != nil { if errors.IsNotFound(err) { klog.V(4).Infof("JointInferenceService has been deleted: %v", key) @@ -256,37 +261,38 @@ func (jc *JointInferenceServiceController) sync(key string) (bool, error) { return false, err } - jointinferenceservice := *sharedJointinferenceservice + service := *sharedService - // if jointinferenceservice was finished previously, we don't want to redo the termination - if isJointinferenceserviceFinished(&jointinferenceservice) { + // if service was finished previously, we don't want to redo the termination + if isServiceFinished(&service) { return true, nil } - // set kind for jointinferenceservice in case that the kind is None + // set kind for service in case that the kind is None // more details at https://github.com/kubernetes/kubernetes/issues/3030 - jointinferenceservice.SetGroupVersionKind(jointServiceControllerKind) + service.SetGroupVersionKind(Kind) - selector, _ := GenerateSelector(&jointinferenceservice) - pods, err := jc.podStore.Pods(jointinferenceservice.Namespace).List(selector) + selector, _ := runtime.GenerateSelector(&service) + pods, err := c.podStore.Pods(service.Namespace).List(selector) if err != nil { return false, err } - klog.V(4).Infof("list jointinference service %v/%v, %v pods: %v", jointinferenceservice.Namespace, 
jointinferenceservice.Name, len(pods), pods) + klog.V(4).Infof("list jointinference service %v/%v, %v pods: %v", service.Namespace, service.Name, len(pods), pods) - latestConditionLen := len(jointinferenceservice.Status.Conditions) + latestConditionLen := len(service.Status.Conditions) - active := calcActivePodCount(pods) + active := runtime.CalcActivePodCount(pods) var failed int32 = 0 + // neededCounts means that two pods should be created successfully in a jointinference service currently // two pods consist of edge pod and cloud pod var neededCounts int32 = 2 - // jointinferenceservice first start - if jointinferenceservice.Status.StartTime == nil { + + if service.Status.StartTime == nil { now := metav1.Now() - jointinferenceservice.Status.StartTime = &now + service.Status.StartTime = &now } else { failed = neededCounts - active } @@ -298,7 +304,7 @@ func (jc *JointInferenceServiceController) sync(key string) (bool, error) { // get the latest condition type // based on that condition updated is appended, not inserted. 
- jobConditions := jointinferenceservice.Status.Conditions + jobConditions := service.Status.Conditions if len(jobConditions) > 0 { latestConditionType = (jobConditions)[len(jobConditions)-1].Type } @@ -311,12 +317,12 @@ func (jc *JointInferenceServiceController) sync(key string) (bool, error) { serviceFailed = true // TODO: get the failed worker, and knows that which worker fails, edge inference worker or cloud inference worker reason = "workerFailed" - message = "the worker of Jointinferenceservice failed" + message = "the worker of service failed" newCondtionType = sednav1.JointInferenceServiceCondFailed - jc.recorder.Event(&jointinferenceservice, v1.EventTypeWarning, reason, message) + c.recorder.Event(&service, v1.EventTypeWarning, reason, message) } else { if len(pods) == 0 { - active, manageServiceErr = jc.createWorkers(&jointinferenceservice) + active, manageServiceErr = c.createWorkers(&service) } if manageServiceErr != nil { serviceFailed = true @@ -331,20 +337,20 @@ func (jc *JointInferenceServiceController) sync(key string) (bool, error) { // if newCondtionType != latestConditionType { - jointinferenceservice.Status.Conditions = append(jointinferenceservice.Status.Conditions, NewJointInferenceServiceCondition(newCondtionType, reason, message)) + service.Status.Conditions = append(service.Status.Conditions, newServiceCondition(newCondtionType, reason, message)) } forget := false // no need to update the jointinferenceservice if the status hasn't changed since last time - if jointinferenceservice.Status.Active != active || jointinferenceservice.Status.Failed != failed || len(jointinferenceservice.Status.Conditions) != latestConditionLen { - jointinferenceservice.Status.Active = active - jointinferenceservice.Status.Failed = failed + if service.Status.Active != active || service.Status.Failed != failed || len(service.Status.Conditions) != latestConditionLen { + service.Status.Active = active + service.Status.Failed = failed - if err := 
jc.updateStatus(&jointinferenceservice); err != nil { + if err := c.updateStatus(&service); err != nil { return forget, err } - if serviceFailed && !isJointinferenceserviceFinished(&jointinferenceservice) { + if serviceFailed && !isServiceFinished(&service) { // returning an error will re-enqueue jointinferenceservice after the backoff period return forget, fmt.Errorf("failed pod(s) detected for jointinference service key %q", key) } @@ -355,8 +361,8 @@ func (jc *JointInferenceServiceController) sync(key string) (bool, error) { return forget, manageServiceErr } -// NewJointInferenceServiceCondition creates a new joint condition -func NewJointInferenceServiceCondition(conditionType sednav1.JointInferenceServiceConditionType, reason, message string) sednav1.JointInferenceServiceCondition { +// newServiceCondition creates a new joint condition +func newServiceCondition(conditionType sednav1.JointInferenceServiceConditionType, reason, message string) sednav1.JointInferenceServiceCondition { return sednav1.JointInferenceServiceCondition{ Type: conditionType, Status: v1.ConditionTrue, @@ -367,24 +373,20 @@ func NewJointInferenceServiceCondition(conditionType sednav1.JointInferenceServi } } -func (jc *JointInferenceServiceController) updateStatus(jointinferenceservice *sednav1.JointInferenceService) error { - serviceClient := jc.client.JointInferenceServices(jointinferenceservice.Namespace) - var err error - for i := 0; i <= ResourceUpdateRetries; i = i + 1 { - var newJointinferenceservice *sednav1.JointInferenceService - newJointinferenceservice, err = serviceClient.Get(context.TODO(), jointinferenceservice.Name, metav1.GetOptions{}) +func (c *Controller) updateStatus(service *sednav1.JointInferenceService) error { + client := c.client.JointInferenceServices(service.Namespace) + return runtime.RetryUpdateStatus(service.Name, service.Namespace, func() error { + newService, err := client.Get(context.TODO(), service.Name, metav1.GetOptions{}) if err != nil { - break - } - 
newJointinferenceservice.Status = jointinferenceservice.Status - if _, err = serviceClient.UpdateStatus(context.TODO(), newJointinferenceservice, metav1.UpdateOptions{}); err == nil { - break + return err } - } - return nil + newService.Status = service.Status + _, err = client.UpdateStatus(context.TODO(), newService, metav1.UpdateOptions{}) + return err + }) } -func isJointinferenceserviceFinished(j *sednav1.JointInferenceService) bool { +func isServiceFinished(j *sednav1.JointInferenceService) bool { for _, c := range j.Status.Conditions { if (c.Type == sednav1.JointInferenceServiceCondFailed) && c.Status == v1.ConditionTrue { return true @@ -393,11 +395,11 @@ func isJointinferenceserviceFinished(j *sednav1.JointInferenceService) bool { return false } -func (jc *JointInferenceServiceController) createWorkers(service *sednav1.JointInferenceService) (active int32, err error) { +func (c *Controller) createWorkers(service *sednav1.JointInferenceService) (active int32, err error) { active = 0 // create cloud worker - err = jc.createCloudWorker(service) + err = c.createCloudWorker(service) if err != nil { return active, err } @@ -406,14 +408,14 @@ func (jc *JointInferenceServiceController) createWorkers(service *sednav1.JointI // create k8s service for cloudPod // FIXME(llhuii): only the case that Spec.NodeName specified is support, // will support Spec.NodeSelector. 
- bigModelIP, err := GetNodeIPByName(jc.kubeClient, service.Spec.CloudWorker.Template.Spec.NodeName) - bigServicePort, err := CreateKubernetesService(jc.kubeClient, service, jointInferenceForCloud, bigModelPort, bigModelIP) + bigModelIP, err := runtime.GetNodeIPByName(c.kubeClient, service.Spec.CloudWorker.Template.Spec.NodeName) + bigServicePort, err := runtime.CreateKubernetesService(c.kubeClient, service, jointInferenceForCloud, bigModelPort, bigModelIP) if err != nil { return active, err } // create edge worker - err = jc.createEdgeWorker(service, bigServicePort) + err = c.createEdgeWorker(service, bigServicePort) if err != nil { return active, err } @@ -422,24 +424,24 @@ func (jc *JointInferenceServiceController) createWorkers(service *sednav1.JointI return active, err } -func (jc *JointInferenceServiceController) createCloudWorker(service *sednav1.JointInferenceService) error { +func (c *Controller) createCloudWorker(service *sednav1.JointInferenceService) error { // deliver pod for cloudworker cloudModelName := service.Spec.CloudWorker.Model.Name - cloudModel, err := jc.client.Models(service.Namespace).Get(context.Background(), cloudModelName, metav1.GetOptions{}) + cloudModel, err := c.client.Models(service.Namespace).Get(context.Background(), cloudModelName, metav1.GetOptions{}) if err != nil { return fmt.Errorf("failed to get cloud model %s: %w", cloudModelName, err) } - var workerParam WorkerParam + var workerParam runtime.WorkerParam secretName := cloudModel.Spec.CredentialName var modelSecret *v1.Secret if secretName != "" { - modelSecret, _ = jc.kubeClient.CoreV1().Secrets(service.Namespace).Get(context.TODO(), secretName, metav1.GetOptions{}) + modelSecret, _ = c.kubeClient.CoreV1().Secrets(service.Namespace).Get(context.TODO(), secretName, metav1.GetOptions{}) } - workerParam.mounts = append(workerParam.mounts, WorkerMount{ - URL: &MountURL{ + workerParam.Mounts = append(workerParam.Mounts, runtime.WorkerMount{ + URL: &runtime.MountURL{ URL: 
cloudModel.Spec.URL, Secret: modelSecret, DownloadByInitializer: true, @@ -448,7 +450,7 @@ func (jc *JointInferenceServiceController) createCloudWorker(service *sednav1.Jo EnvName: "MODEL_URL", }) - workerParam.env = map[string]string{ + workerParam.Env = map[string]string{ "NAMESPACE": service.Namespace, "SERVICE_NAME": service.Name, "WORKER_NAME": "cloudworker-" + utilrand.String(5), @@ -456,21 +458,21 @@ func (jc *JointInferenceServiceController) createCloudWorker(service *sednav1.Jo "BIG_MODEL_BIND_PORT": strconv.Itoa(int(bigModelPort)), } - workerParam.workerType = jointInferenceForCloud + workerParam.WorkerType = jointInferenceForCloud // create cloud pod - _, err = createPodWithTemplate(jc.kubeClient, + _, err = runtime.CreatePodWithTemplate(c.kubeClient, service, &service.Spec.CloudWorker.Template, &workerParam) return err } -func (jc *JointInferenceServiceController) createEdgeWorker(service *sednav1.JointInferenceService, bigServicePort int32) error { +func (c *Controller) createEdgeWorker(service *sednav1.JointInferenceService, bigServicePort int32) error { // deliver pod for edgeworker ctx := context.Background() edgeModelName := service.Spec.EdgeWorker.Model.Name - edgeModel, err := jc.client.Models(service.Namespace).Get(ctx, edgeModelName, metav1.GetOptions{}) + edgeModel, err := c.client.Models(service.Namespace).Get(ctx, edgeModelName, metav1.GetOptions{}) if err != nil { return fmt.Errorf("failed to get edge model %s: %w", edgeModelName, err) @@ -479,13 +481,13 @@ func (jc *JointInferenceServiceController) createEdgeWorker(service *sednav1.Joi secretName := edgeModel.Spec.CredentialName var modelSecret *v1.Secret if secretName != "" { - modelSecret, _ = jc.kubeClient.CoreV1().Secrets(service.Namespace).Get(context.TODO(), secretName, metav1.GetOptions{}) + modelSecret, _ = c.kubeClient.CoreV1().Secrets(service.Namespace).Get(context.TODO(), secretName, metav1.GetOptions{}) } // FIXME(llhuii): only the case that Spec.NodeName specified is support, 
// will support Spec.NodeSelector. // get bigModelIP from nodeName in cloudWorker - bigModelIP, err := GetNodeIPByName(jc.kubeClient, service.Spec.CloudWorker.Template.Spec.NodeName) + bigModelIP, err := runtime.GetNodeIPByName(c.kubeClient, service.Spec.CloudWorker.Template.Spec.NodeName) if err != nil { return fmt.Errorf("failed to get node ip: %w", err) } @@ -494,10 +496,10 @@ func (jc *JointInferenceServiceController) createEdgeWorker(service *sednav1.Joi HEMParameterJSON, _ := json.Marshal(edgeWorker.HardExampleMining.Parameters) HEMParameterString := string(HEMParameterJSON) - var workerParam WorkerParam + var workerParam runtime.WorkerParam - workerParam.mounts = append(workerParam.mounts, WorkerMount{ - URL: &MountURL{ + workerParam.Mounts = append(workerParam.Mounts, runtime.WorkerMount{ + URL: &runtime.MountURL{ URL: edgeModel.Spec.URL, Secret: modelSecret, DownloadByInitializer: true, @@ -506,7 +508,7 @@ func (jc *JointInferenceServiceController) createEdgeWorker(service *sednav1.Joi EnvName: "MODEL_URL", }) - workerParam.env = map[string]string{ + workerParam.Env = map[string]string{ "NAMESPACE": service.Namespace, "SERVICE_NAME": service.Name, "WORKER_NAME": "edgeworker-" + utilrand.String(5), @@ -517,52 +519,37 @@ func (jc *JointInferenceServiceController) createEdgeWorker(service *sednav1.Joi "HEM_NAME": edgeWorker.HardExampleMining.Name, "HEM_PARAMETERS": HEMParameterString, - "LC_SERVER": jc.cfg.LC.Server, + "LC_SERVER": c.cfg.LC.Server, } - workerParam.workerType = jointInferenceForEdge - workerParam.hostNetwork = true + workerParam.WorkerType = jointInferenceForEdge + workerParam.HostNetwork = true // create edge pod - _, err = createPodWithTemplate(jc.kubeClient, + _, err = runtime.CreatePodWithTemplate(c.kubeClient, service, &service.Spec.EdgeWorker.Template, &workerParam) return err } -// GetName returns the name of the joint inference controller -func (jc *JointInferenceServiceController) GetName() string { - return 
"JointInferenceServiceController" -} - -// NewJointController creates a new JointInferenceService controller that keeps the relevant pods +// New creates a new JointInferenceService controller that keeps the relevant pods // in sync with their corresponding JointInferenceService objects. -func NewJointController(cfg *config.ControllerConfig) (FeatureControllerI, error) { - var err error - namespace := cfg.Namespace - if namespace == "" { - namespace = metav1.NamespaceAll - } - - kubeClient, _ := utils.KubeClient() - kubecfg, _ := utils.KubeConfig() - crdclient, _ := clientset.NewForConfig(kubecfg) - kubeInformerFactory := kubeinformers.NewSharedInformerFactoryWithOptions(kubeClient, time.Second*30, kubeinformers.WithNamespace(namespace)) +func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) { + cfg := cc.Config - podInformer := kubeInformerFactory.Core().V1().Pods() + podInformer := cc.KubeInformerFactory.Core().V1().Pods() - serviceInformerFactory := informers.NewSharedInformerFactoryWithOptions(crdclient, time.Second*30, informers.WithNamespace(namespace)) - serviceInformer := serviceInformerFactory.Sedna().V1alpha1().JointInferenceServices() + serviceInformer := cc.SednaInformerFactory.Sedna().V1alpha1().JointInferenceServices() eventBroadcaster := record.NewBroadcaster() - eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")}) + eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: cc.KubeClient.CoreV1().Events("")}) - jc := &JointInferenceServiceController{ - kubeClient: kubeClient, - client: crdclient.SednaV1alpha1(), + jc := &Controller{ + kubeClient: cc.KubeClient, + client: cc.SednaClient.SednaV1alpha1(), - queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(DefaultBackOff, MaxBackOff), "jointinferenceservice"), + queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(runtime.DefaultBackOff, 
runtime.MaxBackOff), "jointinferenceservice"), recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "jointinferenceservice-controller"}), cfg: cfg, } @@ -570,14 +557,17 @@ func NewJointController(cfg *config.ControllerConfig) (FeatureControllerI, error serviceInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { jc.enqueueController(obj, true) + jc.syncToEdge(watch.Added, obj) }, UpdateFunc: func(old, cur interface{}) { jc.enqueueController(cur, true) + jc.syncToEdge(watch.Added, cur) }, DeleteFunc: func(obj interface{}) { jc.enqueueController(obj, true) + jc.syncToEdge(watch.Deleted, obj) }, }) @@ -593,8 +583,5 @@ func NewJointController(cfg *config.ControllerConfig) (FeatureControllerI, error jc.podStore = podInformer.Lister() jc.podStoreSynced = podInformer.Informer().HasSynced - stopCh := messageContext.Done() - kubeInformerFactory.Start(stopCh) - serviceInformerFactory.Start(stopCh) - return jc, err + return jc, nil } diff --git a/pkg/globalmanager/controllers/jointinference/upstream.go b/pkg/globalmanager/controllers/jointinference/upstream.go new file mode 100644 index 000000000..93d0fa7e9 --- /dev/null +++ b/pkg/globalmanager/controllers/jointinference/upstream.go @@ -0,0 +1,92 @@ +/* +Copyright 2021 The KubeEdge Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package jointinference + +import ( + "context" + "encoding/json" + "fmt" + + sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" + "github.com/kubeedge/sedna/pkg/globalmanager/runtime" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/klog/v2" +) + +func (c *Controller) updateMetrics(name, namespace string, metrics []sednav1.Metric) error { + client := c.client.JointInferenceServices(namespace) + + return runtime.RetryUpdateStatus(name, namespace, func() error { + joint, err := client.Get(context.TODO(), name, metav1.GetOptions{}) + if err != nil { + return err + } + joint.Status.Metrics = metrics + _, err = client.UpdateStatus(context.TODO(), joint, metav1.UpdateOptions{}) + return err + }) +} + +// updateFromEdge syncs the edge updates to k8s +func (c *Controller) updateFromEdge(name, namespace, operation string, content []byte) error { + // Output defines owner output information + type Output struct { + ServiceInfo map[string]interface{} `json:"ownerInfo"` + } + + var status struct { + // Phase always should be "inference" + Phase string `json:"phase"` + Status string `json:"status"` + Output *Output `json:"output"` + } + + err := json.Unmarshal(content, &status) + if err != nil { + return err + } + + // TODO: propagate status.Status to k8s + + output := status.Output + if output == nil || output.ServiceInfo == nil { + // no output info + klog.Warningf("empty status info for joint inference service %s/%s", namespace, name) + return nil + } + + info := output.ServiceInfo + + for _, ignoreTimeKey := range []string{ + "startTime", + "updateTime", + } { + delete(info, ignoreTimeKey) + } + + metrics := runtime.ConvertMapToMetrics(info) + + err = c.updateMetrics(name, namespace, metrics) + if err != nil { + return fmt.Errorf("failed to update metrics, err:%+w", err) + } + return nil +} + +func (c *Controller) SetUpstreamHandler(addFunc runtime.UpstreamHandlerAddFunc) error { + return addFunc(KindName, c.updateFromEdge) +} diff --git 
a/pkg/globalmanager/controllers/lifelonglearning/downstream.go b/pkg/globalmanager/controllers/lifelonglearning/downstream.go new file mode 100644 index 000000000..8b9ef5faa --- /dev/null +++ b/pkg/globalmanager/controllers/lifelonglearning/downstream.go @@ -0,0 +1,55 @@ +/* +Copyright 2021 The KubeEdge Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package lifelonglearning + +import ( + "fmt" + + "k8s.io/apimachinery/pkg/watch" + + sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" + "github.com/kubeedge/sedna/pkg/globalmanager/runtime" +) + +func (c *Controller) syncToEdge(eventType watch.EventType, obj interface{}) error { + job, ok := obj.(*sednav1.LifelongLearningJob) + if !ok { + return nil + } + + // Since Kind may be empty, + // we need to fix the kind here if missing. + // more details at https://github.com/kubernetes/kubernetes/issues/3030 + job.Kind = KindName + + // Here only propagate to the nodes with non-empty name + + // FIXME(llhuii): only the case that all workers having the same nodeName are supported, + // will support Spec.NodeSelector and different nodeName.
+ nodeName := job.Spec.TrainSpec.Template.Spec.NodeName + if len(nodeName) == 0 { + return fmt.Errorf("empty node name") + } + + runtime.InjectSecretAnnotations(c.kubeClient, job, job.Spec.CredentialName) + return c.sendToEdgeFunc(nodeName, eventType, job) +} + +func (c *Controller) SetDownstreamSendFunc(f runtime.DownstreamSendFunc) error { + c.sendToEdgeFunc = f + return nil +} diff --git a/pkg/globalmanager/lifelonglearningjob.go b/pkg/globalmanager/controllers/lifelonglearning/lifelonglearningjob.go similarity index 60% rename from pkg/globalmanager/lifelonglearningjob.go rename to pkg/globalmanager/controllers/lifelonglearning/lifelonglearningjob.go index 73e2efc48..a946ca7cb 100644 --- a/pkg/globalmanager/lifelonglearningjob.go +++ b/pkg/globalmanager/controllers/lifelonglearning/lifelonglearningjob.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package globalmanager +package lifelonglearning import ( "context" @@ -28,9 +28,8 @@ import ( utilrand "k8s.io/apimachinery/pkg/util/rand" utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/apimachinery/pkg/util/wait" - kubeinformers "k8s.io/client-go/informers" + "k8s.io/apimachinery/pkg/watch" "k8s.io/client-go/kubernetes" - "k8s.io/client-go/kubernetes/scheme" v1core "k8s.io/client-go/kubernetes/typed/core/v1" corelisters "k8s.io/client-go/listers/core/v1" "k8s.io/client-go/tools/cache" @@ -40,21 +39,25 @@ import ( k8scontroller "k8s.io/kubernetes/pkg/controller" sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" - clientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned" sednaclientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1" - informers "github.com/kubeedge/sedna/pkg/client/informers/externalversions" sednav1listers "github.com/kubeedge/sedna/pkg/client/listers/sedna/v1alpha1" "github.com/kubeedge/sedna/pkg/globalmanager/config" - messageContext 
"github.com/kubeedge/sedna/pkg/globalmanager/messagelayer/ws" - "github.com/kubeedge/sedna/pkg/globalmanager/utils" + "github.com/kubeedge/sedna/pkg/globalmanager/runtime" ) -// ljControllerKind contains the schema.GroupVersionKind for this controller type. -var ljControllerKind = sednav1.SchemeGroupVersion.WithKind("LifelongLearningJob") +const ( + // KindName is the kind name of CR this controller controls + KindName = "LifelongLearningJob" + // Name is this controller name + Name = "LifelongLearning" +) + +// Kind contains the schema.GroupVersionKind for this controller type. +var Kind = sednav1.SchemeGroupVersion.WithKind(KindName) -// LifelongLearningJobController ensures that all LifelongLearningJob objects have corresponding pods to +// Controller ensures that all LifelongLearningJob objects have corresponding pods to // run their configured workload. -type LifelongLearningJobController struct { +type Controller struct { kubeClient kubernetes.Interface client sednaclientset.SednaV1alpha1Interface @@ -74,50 +77,47 @@ type LifelongLearningJobController struct { // LifelongLearningJobs that need to be updated queue workqueue.RateLimitingInterface - recorder record.EventRecorder - cfg *config.ControllerConfig + + sendToEdgeFunc runtime.DownstreamSendFunc } -// Run the main goroutine responsible for watching and syncing jobs. -func (jc *LifelongLearningJobController) Start() error { +// Run starts the main goroutine responsible for watching and syncing jobs. 
+func (c *Controller) Run(stopCh <-chan struct{}) { workers := 1 - stopCh := messageContext.Done() - go func() { - defer utilruntime.HandleCrash() - defer jc.queue.ShutDown() - klog.Infof("Starting lifelonglearning job controller") - defer klog.Infof("Shutting down lifelonglearning job controller") + defer utilruntime.HandleCrash() + defer c.queue.ShutDown() - if !cache.WaitForNamedCacheSync("lifelonglearningjob", stopCh, jc.podStoreSynced, jc.jobStoreSynced) { - klog.Errorf("failed to wait for caches to sync") + klog.Infof("Starting %s controller", Name) + defer klog.Infof("Shutting down %s controller", Name) - return - } - klog.Infof("Starting lifelonglearning job workers") - for i := 0; i < workers; i++ { - go wait.Until(jc.worker, time.Second, stopCh) - } + if !cache.WaitForNamedCacheSync(Name, stopCh, c.podStoreSynced, c.jobStoreSynced) { + klog.Errorf("failed to wait for %s caches to sync", Name) - <-stopCh - }() - return nil + return + } + klog.Infof("Starting %s workers", Name) + for i := 0; i < workers; i++ { + go wait.Until(c.worker, time.Second, stopCh) + } + + <-stopCh } // enqueueByPod enqueues the lifelonglearningjob object of the specified pod. 
-func (jc *LifelongLearningJobController) enqueueByPod(pod *v1.Pod, immediate bool) { +func (c *Controller) enqueueByPod(pod *v1.Pod, immediate bool) { controllerRef := metav1.GetControllerOf(pod) if controllerRef == nil { return } - if controllerRef.Kind != ljControllerKind.Kind { + if controllerRef.Kind != Kind.Kind { return } - service, err := jc.jobLister.LifelongLearningJobs(pod.Namespace).Get(controllerRef.Name) + service, err := c.jobLister.LifelongLearningJobs(pod.Namespace).Get(controllerRef.Name) if err != nil { return } @@ -126,27 +126,27 @@ func (jc *LifelongLearningJobController) enqueueByPod(pod *v1.Pod, immediate boo return } - jc.enqueueController(service, immediate) + c.enqueueController(service, immediate) } // When a pod is created, enqueue the controller that manages it and update it's expectations. -func (jc *LifelongLearningJobController) addPod(obj interface{}) { +func (c *Controller) addPod(obj interface{}) { pod := obj.(*v1.Pod) if pod.DeletionTimestamp != nil { // on a restart of the controller, it's possible a new pod shows up in a state that // is already pending deletion. Prevent the pod from being a creation observation. - jc.deletePod(pod) + c.deletePod(pod) return } // backoff to queue when PodFailed immediate := pod.Status.Phase != v1.PodFailed - jc.enqueueByPod(pod, immediate) + c.enqueueByPod(pod, immediate) } // When a pod is updated, figure out what lifelonglearning job manage it and wake them up. 
-func (jc *LifelongLearningJobController) updatePod(old, cur interface{}) { +func (c *Controller) updatePod(old, cur interface{}) { curPod := cur.(*v1.Pod) oldPod := old.(*v1.Pod) @@ -155,11 +155,11 @@ func (jc *LifelongLearningJobController) updatePod(old, cur interface{}) { return } - jc.addPod(curPod) + c.addPod(curPod) } // deletePod enqueues the lifelonglearningjob obj When a pod is deleted -func (jc *LifelongLearningJobController) deletePod(obj interface{}) { +func (c *Controller) deletePod(obj interface{}) { pod, ok := obj.(*v1.Pod) // comment from https://github.com/kubernetes/kubernetes/blob/master/pkg/controller/job/job_controller.go @@ -180,13 +180,13 @@ func (jc *LifelongLearningJobController) deletePod(obj interface{}) { return } } - jc.enqueueByPod(pod, true) + c.enqueueByPod(pod, true) } // obj could be an *sedna.LifelongLearningJob, or a DeletionFinalStateUnknown marker item, // immediate tells the controller to update the status right away, and should // happen ONLY when there was a successful pod run. -func (jc *LifelongLearningJobController) enqueueController(obj interface{}, immediate bool) { +func (c *Controller) enqueueController(obj interface{}, immediate bool) { key, err := k8scontroller.KeyFunc(obj) if err != nil { utilruntime.HandleError(fmt.Errorf("Couldn't get key for object %+v: %v", obj, err)) @@ -195,36 +195,36 @@ func (jc *LifelongLearningJobController) enqueueController(obj interface{}, imme backoff := time.Duration(0) if !immediate { - backoff = getBackoff(jc.queue, key) + backoff = runtime.GetBackoff(c.queue, key) } - jc.queue.AddAfter(key, backoff) + c.queue.AddAfter(key, backoff) } // worker runs a worker thread that just dequeues items, processes them, and marks them done. // It enforces that the syncHandler is never invoked concurrently with the same key. 
-func (jc *LifelongLearningJobController) worker() { - for jc.processNextWorkItem() { +func (c *Controller) worker() { + for c.processNextWorkItem() { } } -func (jc *LifelongLearningJobController) processNextWorkItem() bool { - key, quit := jc.queue.Get() +func (c *Controller) processNextWorkItem() bool { + key, quit := c.queue.Get() if quit { return false } - defer jc.queue.Done(key) + defer c.queue.Done(key) - forget, err := jc.sync(key.(string)) + forget, err := c.sync(key.(string)) if err == nil { if forget { - jc.queue.Forget(key) + c.queue.Forget(key) } return true } utilruntime.HandleError(fmt.Errorf("Error syncing lifelonglearning job: %v", err)) - jc.queue.AddRateLimited(key) + c.queue.AddRateLimited(key) return true } @@ -232,7 +232,7 @@ func (jc *LifelongLearningJobController) processNextWorkItem() bool { // sync will sync the lifelonglearning job with the given key if it has had its expectations fulfilled, meaning // it did not expect to see any more of its pods created or deleted. This function is not meant to be invoked // concurrently with the same key. 
-func (jc *LifelongLearningJobController) sync(key string) (bool, error) { +func (c *Controller) sync(key string) (bool, error) { startTime := time.Now() defer func() { klog.V(4).Infof("Finished syncing lifelonglearning job %q (%v)", key, time.Since(startTime)) @@ -245,7 +245,7 @@ func (jc *LifelongLearningJobController) sync(key string) (bool, error) { if len(ns) == 0 || len(name) == 0 { return false, fmt.Errorf("invalid lifelonglearning job key %q: either namespace or name is missing", key) } - sharedLifelongLearningJob, err := jc.jobLister.LifelongLearningJobs(ns).Get(name) + sharedJob, err := c.jobLister.LifelongLearningJobs(ns).Get(name) if err != nil { if errors.IsNotFound(err) { klog.V(4).Infof("lifelonglearning job has been deleted: %v", key) @@ -253,18 +253,18 @@ func (jc *LifelongLearningJobController) sync(key string) (bool, error) { } return false, err } - lifelonglearningjob := *sharedLifelongLearningJob + job := *sharedJob // set kind for lifelonglearningjob in case that the kind is None - lifelonglearningjob.SetGroupVersionKind(sednav1.SchemeGroupVersion.WithKind("LifelongLearningJob")) + job.SetGroupVersionKind(Kind) - // lifelonglearningjob first start - if lifelonglearningjob.Status.StartTime == nil { + if job.Status.StartTime == nil { + // job is first in now := metav1.Now() - lifelonglearningjob.Status.StartTime = &now + job.Status.StartTime = &now } - // if lifelonglearningjob was finished previously, we don't want to redo the termination - if IsLifelongLearningJobFinished(&lifelonglearningjob) { + // if job was finished previously, we don't want to redo the termination + if IsJobFinished(&job) { return true, nil } @@ -272,18 +272,18 @@ func (jc *LifelongLearningJobController) sync(key string) (bool, error) { jobFailed := false needUpdated := false - // update conditions of lifelonglearning job - needUpdated, err = jc.updateLifelongLearningJobConditions(&lifelonglearningjob) + // transit this job's state machine + needUpdated, err = 
c.transitJobState(&job) if err != nil { - klog.V(2).Infof("lifelonglearning job %v/%v faied to be updated, err:%s", lifelonglearningjob.Namespace, lifelonglearningjob.Name, err) + klog.V(2).Infof("lifelonglearning job %v/%v faied to be updated, err:%s", job.Namespace, job.Name, err) } if needUpdated { - if err := jc.updateLifelongLearningJobStatus(&lifelonglearningjob); err != nil { + if err := c.updateJobStatus(&job); err != nil { return forget, err } - if jobFailed && !IsLifelongLearningJobFinished(&lifelonglearningjob) { + if jobFailed && !IsJobFinished(&job) { // returning an error will re-enqueue LifelongLearningJob after the backoff period return forget, fmt.Errorf("failed pod(s) detected for lifelonglearningjob key %q", key) } @@ -294,24 +294,25 @@ func (jc *LifelongLearningJobController) sync(key string) (bool, error) { return forget, err } -// updateLifelongLearningJobConditions ensures that conditions of lifelonglearning job can be changed by podstatus -func (jc *LifelongLearningJobController) updateLifelongLearningJobConditions(lifelonglearningjob *sednav1.LifelongLearningJob) (bool, error) { +// transitJobState transit job to next state +func (c *Controller) transitJobState(job *sednav1.LifelongLearningJob) (bool, error) { var initialType sednav1.LLJobStageConditionType var latestCondition sednav1.LLJobCondition = sednav1.LLJobCondition{ Stage: sednav1.LLJobTrain, Type: initialType, } + var newConditionType sednav1.LLJobStageConditionType - latestCondition.Stage = sednav1.LLJobTrain var needUpdated = false - jobConditions := lifelonglearningjob.Status.Conditions + var podStatus v1.PodPhase = v1.PodUnknown + jobConditions := job.Status.Conditions if len(jobConditions) > 0 { // get latest pod and pod status latestCondition = (jobConditions)[len(jobConditions)-1] - klog.V(2).Infof("lifelonglearning job %v/%v latest stage %v:", lifelonglearningjob.Namespace, lifelonglearningjob.Name, + klog.V(2).Infof("lifelonglearning job %v/%v latest stage %v:", 
job.Namespace, job.Name, latestCondition.Stage) - pod := jc.getSpecifiedPods(lifelonglearningjob, string(latestCondition.Stage)) + pod := c.getSpecifiedPods(job, string(latestCondition.Stage)) if pod != nil { podStatus = pod.Status.Phase @@ -333,14 +334,14 @@ func (jc *LifelongLearningJobController) updateLifelongLearningJobConditions(lif // include train, eval, deploy pod var err error if jobStage == sednav1.LLJobDeploy { - err = jc.restartInferPod(lifelonglearningjob) + err = c.restartInferPod(job) if err != nil { - klog.V(2).Infof("lifelonglearning job %v/%v inference pod failed to restart, err:%s", lifelonglearningjob.Namespace, lifelonglearningjob.Name, err) + klog.V(2).Infof("lifelonglearning job %v/%v inference pod failed to restart, err:%s", job.Namespace, job.Name, err) } else { - klog.V(2).Infof("lifelonglearning job %v/%v inference pod restarts successfully", lifelonglearningjob.Namespace, lifelonglearningjob.Name) + klog.V(2).Infof("lifelonglearning job %v/%v inference pod restarts successfully", job.Namespace, job.Name) } } else if podStatus != v1.PodPending && podStatus != v1.PodRunning { - err = jc.createPod(lifelonglearningjob, jobStage) + err = c.createPod(job, jobStage) } if err != nil { return needUpdated, err @@ -358,13 +359,13 @@ func (jc *LifelongLearningJobController) updateLifelongLearningJobConditions(lif } else if podStatus == v1.PodSucceeded { // watch pod status, if pod completed, set type completed newConditionType = sednav1.LLJobStageCondCompleted - klog.V(2).Infof("lifelonglearning job %v/%v %v stage completed!", lifelonglearningjob.Namespace, lifelonglearningjob.Name, jobStage) + klog.V(2).Infof("lifelonglearning job %v/%v %v stage completed!", job.Namespace, job.Name, jobStage) } else if podStatus == v1.PodFailed { newConditionType = sednav1.LLJobStageCondFailed - klog.V(2).Infof("lifelonglearning job %v/%v %v stage failed!", lifelonglearningjob.Namespace, lifelonglearningjob.Name, jobStage) + klog.V(2).Infof("lifelonglearning job 
%v/%v %v stage failed!", job.Namespace, job.Name, jobStage) } case sednav1.LLJobStageCondCompleted: - jobStage = jc.getNextStage(jobStage) + jobStage = c.getNextStage(jobStage) newConditionType = sednav1.LLJobStageCondWaiting case sednav1.LLJobStageCondFailed: @@ -374,34 +375,31 @@ func (jc *LifelongLearningJobController) updateLifelongLearningJobConditions(lif default: // do nothing when given other type out of cases } - klog.V(2).Infof("lifelonglearning job %v/%v, conditions: %v", lifelonglearningjob.Namespace, lifelonglearningjob.Name, jobConditions) + + klog.V(2).Infof("lifelonglearning job %v/%v, conditions: %v", job.Namespace, job.Name, jobConditions) if latestCondition.Type != newConditionType { - lifelonglearningjob.Status.Conditions = append(lifelonglearningjob.Status.Conditions, NewLifelongLearningJobCondition(newConditionType, jobStage)) + job.Status.Conditions = append(job.Status.Conditions, NewJobCondition(newConditionType, jobStage)) needUpdated = true return needUpdated, nil } return needUpdated, nil } -// updateLifelongLearningJobStatus ensures that jobstatus can be updated rightly -func (jc *LifelongLearningJobController) updateLifelongLearningJobStatus(lifelonglearningjob *sednav1.LifelongLearningJob) error { - jobClient := jc.client.LifelongLearningJobs(lifelonglearningjob.Namespace) - var err error - for i := 0; i <= ResourceUpdateRetries; i = i + 1 { - var newLifelongLearningJob *sednav1.LifelongLearningJob - newLifelongLearningJob, err = jobClient.Get(context.TODO(), lifelonglearningjob.Name, metav1.GetOptions{}) +// updateJobStatus ensures that jobstatus can be updated rightly +func (c *Controller) updateJobStatus(job *sednav1.LifelongLearningJob) error { + jobClient := c.client.LifelongLearningJobs(job.Namespace) + return runtime.RetryUpdateStatus(job.Name, job.Namespace, func() error { + newJob, err := jobClient.Get(context.TODO(), job.Name, metav1.GetOptions{}) if err != nil { - break + return err } - newLifelongLearningJob.Status = 
lifelonglearningjob.Status - if _, err = jobClient.UpdateStatus(context.TODO(), newLifelongLearningJob, metav1.UpdateOptions{}); err == nil { - break - } - } - return err + newJob.Status = job.Status + _, err = jobClient.UpdateStatus(context.TODO(), newJob, metav1.UpdateOptions{}) + return err + }) } -func NewLifelongLearningJobCondition(conditionType sednav1.LLJobStageConditionType, jobStage sednav1.LLJobStage) sednav1.LLJobCondition { +func NewJobCondition(conditionType sednav1.LLJobStageConditionType, jobStage sednav1.LLJobStage) sednav1.LLJobCondition { return sednav1.LLJobCondition{ Type: conditionType, Status: v1.ConditionTrue, @@ -413,17 +411,17 @@ func NewLifelongLearningJobCondition(conditionType sednav1.LLJobStageConditionTy } } -func (jc *LifelongLearningJobController) generatePodName(jobName string, workerType string) string { +func (c *Controller) generatePodName(jobName string, workerType string) string { return jobName + "-" + strings.ToLower(workerType) + "-" + utilrand.String(5) } -func (jc *LifelongLearningJobController) getSpecifiedPods(job *sednav1.LifelongLearningJob, podType string) *v1.Pod { +func (c *Controller) getSpecifiedPods(job *sednav1.LifelongLearningJob, podType string) *v1.Pod { if podType == "Deploy" { - podType = InferencePodType + podType = runtime.InferencePodType } var latestPod *v1.Pod - selector, _ := GenerateSelector(job) - pods, err := jc.podStore.Pods(job.Namespace).List(selector) + selector, _ := runtime.GenerateSelector(job) + pods, err := c.podStore.Pods(job.Namespace).List(selector) if len(pods) == 0 || err != nil { return nil } @@ -443,20 +441,20 @@ func (jc *LifelongLearningJobController) getSpecifiedPods(job *sednav1.LifelongL return latestPod } -func (jc *LifelongLearningJobController) restartInferPod(job *sednav1.LifelongLearningJob) error { - inferPod := jc.getSpecifiedPods(job, InferencePodType) +func (c *Controller) restartInferPod(job *sednav1.LifelongLearningJob) error { + inferPod := c.getSpecifiedPods(job, 
runtime.InferencePodType) if inferPod == nil { klog.V(2).Infof("No inferpod is running in lifelonglearning job %v/%v", job.Namespace, job.Name) - err := jc.createInferPod(job) + err := c.createInferPod(job) return err } ctx := context.Background() - err := jc.kubeClient.CoreV1().Pods(job.Namespace).Delete(ctx, inferPod.Name, metav1.DeleteOptions{}) + err := c.kubeClient.CoreV1().Pods(job.Namespace).Delete(ctx, inferPod.Name, metav1.DeleteOptions{}) if err != nil { klog.Warningf("failed to delete inference pod %s for lifelonglearning job %v/%v, err:%s", inferPod.Name, job.Namespace, job.Name, err) return err } - err = jc.createInferPod(job) + err = c.createInferPod(job) if err != nil { klog.Warningf("failed to create inference pod %s for lifelonglearning job %v/%v, err:%s", inferPod.Name, job.Namespace, job.Name, err) return err @@ -464,7 +462,7 @@ func (jc *LifelongLearningJobController) restartInferPod(job *sednav1.LifelongLe return nil } -func (jc *LifelongLearningJobController) getNextStage(currentStage sednav1.LLJobStage) sednav1.LLJobStage { +func (c *Controller) getNextStage(currentStage sednav1.LLJobStage) sednav1.LLJobStage { switch currentStage { case sednav1.LLJobTrain: return sednav1.LLJobEval @@ -477,9 +475,9 @@ func (jc *LifelongLearningJobController) getNextStage(currentStage sednav1.LLJob } } -func (jc *LifelongLearningJobController) getSecret(namespace, name string, ownerStr string) (secret *v1.Secret, err error) { +func (c *Controller) getSecret(namespace, name string, ownerStr string) (secret *v1.Secret, err error) { if name != "" { - secret, err = jc.kubeClient.CoreV1().Secrets(namespace).Get(context.TODO(), name, metav1.GetOptions{}) + secret, err = c.kubeClient.CoreV1().Secrets(namespace).Get(context.TODO(), name, metav1.GetOptions{}) if err != nil { err = fmt.Errorf("failed to get the secret %s for %s: %w", name, @@ -489,23 +487,23 @@ func (jc *LifelongLearningJobController) getSecret(namespace, name string, owner return } -func 
IsLifelongLearningJobFinished(j *sednav1.LifelongLearningJob) bool { +func IsJobFinished(j *sednav1.LifelongLearningJob) bool { // TODO return false } -func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearningJob, podtype sednav1.LLJobStage) (err error) { +func (c *Controller) createPod(job *sednav1.LifelongLearningJob, podtype sednav1.LLJobStage) (err error) { ctx := context.Background() var podTemplate *v1.PodTemplateSpec LLDatasetName := job.Spec.Dataset.Name - dataset, err := jc.client.Datasets(job.Namespace).Get(ctx, LLDatasetName, metav1.GetOptions{}) + dataset, err := c.client.Datasets(job.Namespace).Get(ctx, LLDatasetName, metav1.GetOptions{}) if err != nil { return fmt.Errorf("failed to get dataset %s: %w", LLDatasetName, err) } - datasetSecret, err := jc.getSecret( + datasetSecret, err := c.getSecret( job.Namespace, dataset.Spec.CredentialName, fmt.Sprintf("dataset %s", dataset.Name), @@ -514,7 +512,7 @@ func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearning return err } - jobSecret, err := jc.getSecret( + jobSecret, err := c.getSecret( job.Namespace, job.Spec.CredentialName, fmt.Sprintf("lifelonglearning job %s", job.Name), @@ -526,7 +524,7 @@ func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearning // get all url for train and eval from data in condition condDataStr := job.Status.Conditions[len(job.Status.Conditions)-1].Data klog.V(2).Infof("lifelonglearning job %v/%v data condition:%s", job.Namespace, job.Name, condDataStr) - var cond LifelongLearningCondData + var cond ConditionData (&cond).Unmarshal([]byte(condDataStr)) if cond.Input == nil { return fmt.Errorf("empty input from condData") @@ -543,25 +541,25 @@ func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearning originalDataURLOrIndex = dataset.Spec.URL } - var workerParam *WorkerParam = new(WorkerParam) + var workerParam *runtime.WorkerParam = new(runtime.WorkerParam) if podtype == sednav1.LLJobTrain 
{ - workerParam.workerType = "Train" + workerParam.WorkerType = "Train" podTemplate = &job.Spec.TrainSpec.Template // Env parameters for train - workerParam.env = map[string]string{ + workerParam.Env = map[string]string{ "NAMESPACE": job.Namespace, "JOB_NAME": job.Name, "WORKER_NAME": "train-worker-" + utilrand.String(5), - "LC_SERVER": jc.cfg.LC.Server, - "KB_SERVER": jc.cfg.KB.Server, + "LC_SERVER": c.cfg.LC.Server, + "KB_SERVER": c.cfg.KB.Server, } - workerParam.mounts = append(workerParam.mounts, - WorkerMount{ - URL: &MountURL{ + workerParam.Mounts = append(workerParam.Mounts, + runtime.WorkerMount{ + URL: &runtime.MountURL{ URL: cond.Input.OutputDir, Secret: jobSecret, DownloadByInitializer: false, @@ -569,8 +567,8 @@ func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearning EnvName: "OUTPUT_URL", }, - WorkerMount{ - URL: &MountURL{ + runtime.WorkerMount{ + URL: &runtime.MountURL{ URL: dataURL, Secret: jobSecret, DownloadByInitializer: true, @@ -579,8 +577,8 @@ func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearning }, // see https://github.com/kubeedge/sedna/issues/35 - WorkerMount{ - URL: &MountURL{ + runtime.WorkerMount{ + URL: &runtime.MountURL{ Secret: datasetSecret, URL: originalDataURLOrIndex, Indirect: dataset.Spec.URL != originalDataURLOrIndex, @@ -591,35 +589,35 @@ func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearning ) } else { podTemplate = &job.Spec.EvalSpec.Template - workerParam.workerType = "Eval" + workerParam.WorkerType = "Eval" // Configure Env information for eval by initial WorkerParam - workerParam.env = map[string]string{ + workerParam.Env = map[string]string{ "NAMESPACE": job.Namespace, "JOB_NAME": job.Name, "WORKER_NAME": "eval-worker-" + utilrand.String(5), - "LC_SERVER": jc.cfg.LC.Server, - "KB_SERVER": jc.cfg.KB.Server, + "LC_SERVER": c.cfg.LC.Server, + "KB_SERVER": c.cfg.KB.Server, } - var modelMountURLs []MountURL + var modelMountURLs []runtime.MountURL 
for _, url := range inputmodelURLs { - modelMountURLs = append(modelMountURLs, MountURL{ + modelMountURLs = append(modelMountURLs, runtime.MountURL{ URL: url, Secret: jobSecret, DownloadByInitializer: true, }) } - workerParam.mounts = append(workerParam.mounts, - WorkerMount{ + workerParam.Mounts = append(workerParam.Mounts, + runtime.WorkerMount{ URLs: modelMountURLs, Name: "models", EnvName: "MODEL_URLS", }, - WorkerMount{ - URL: &MountURL{ + runtime.WorkerMount{ + URL: &runtime.MountURL{ URL: cond.Input.OutputDir, Secret: jobSecret, DownloadByInitializer: false, @@ -627,8 +625,8 @@ func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearning EnvName: "OUTPUT_URL", }, - WorkerMount{ - URL: &MountURL{ + runtime.WorkerMount{ + URL: &runtime.MountURL{ URL: dataURL, Secret: datasetSecret, DownloadByInitializer: true, @@ -637,8 +635,8 @@ func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearning EnvName: "TEST_DATASET_URL", }, - WorkerMount{ - URL: &MountURL{ + runtime.WorkerMount{ + URL: &runtime.MountURL{ Secret: datasetSecret, URL: originalDataURLOrIndex, DownloadByInitializer: true, @@ -651,21 +649,21 @@ func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearning } // set the default policy instead of Always policy - workerParam.restartPolicy = v1.RestartPolicyOnFailure - workerParam.hostNetwork = true + workerParam.RestartPolicy = v1.RestartPolicyOnFailure + workerParam.HostNetwork = true // create pod based on podtype - _, err = createPodWithTemplate(jc.kubeClient, job, podTemplate, workerParam) + _, err = runtime.CreatePodWithTemplate(c.kubeClient, job, podTemplate, workerParam) if err != nil { return err } return } -func (jc *LifelongLearningJobController) createInferPod(job *sednav1.LifelongLearningJob) error { +func (c *Controller) createInferPod(job *sednav1.LifelongLearningJob) error { inferModelURL := strings.Join([]string{strings.TrimRight(job.Spec.OutputDir, "/"), "deploy/index.pkl"}, "/") - 
jobSecret, err := jc.getSecret( + jobSecret, err := c.getSecret( job.Namespace, job.Spec.CredentialName, fmt.Sprintf("lifelonglearning job %s", job.Name), @@ -674,10 +672,10 @@ func (jc *LifelongLearningJobController) createInferPod(job *sednav1.LifelongLea return err } - var workerParam *WorkerParam = new(WorkerParam) - workerParam.mounts = append(workerParam.mounts, - WorkerMount{ - URL: &MountURL{ + var workerParam *runtime.WorkerParam = new(runtime.WorkerParam) + workerParam.Mounts = append(workerParam.Mounts, + runtime.WorkerMount{ + URL: &runtime.MountURL{ URL: inferModelURL, Secret: jobSecret, DownloadByInitializer: false, @@ -687,75 +685,53 @@ func (jc *LifelongLearningJobController) createInferPod(job *sednav1.LifelongLea }, ) - workerParam.env = map[string]string{ + workerParam.Env = map[string]string{ "NAMESPACE": job.Namespace, "JOB_NAME": job.Name, "WORKER_NAME": "inferworker-" + utilrand.String(5), - "LC_SERVER": jc.cfg.LC.Server, + "LC_SERVER": c.cfg.LC.Server, } - workerParam.workerType = InferencePodType - workerParam.hostNetwork = true + workerParam.WorkerType = runtime.InferencePodType + workerParam.HostNetwork = true // create edge pod - _, err = createPodWithTemplate(jc.kubeClient, job, &job.Spec.DeploySpec.Template, workerParam) + _, err = runtime.CreatePodWithTemplate(c.kubeClient, job, &job.Spec.DeploySpec.Template, workerParam) return err } -// GetName returns the name of the lifelonglearning job controller -func (jc *LifelongLearningJobController) GetName() string { - return "LifelongLearningJobController" -} - -// NewLifelongLearningJobController creates a new LifelongLearningJob controller that keeps the relevant pods +// New creates a new LifelongLearningJob controller that keeps the relevant pods // in sync with their corresponding LifelongLearningJob objects. 
-func NewLifelongLearningJobController(cfg *config.ControllerConfig) (FeatureControllerI, error) { - namespace := cfg.Namespace - if namespace == "" { - namespace = metav1.NamespaceAll - } - kubeClient, err := utils.KubeClient() - if err != nil { - return nil, err - } - - kubecfg, err := utils.KubeConfig() - if err != nil { - return nil, err - } - crdclient, err := clientset.NewForConfig(kubecfg) - if err != nil { - return nil, err - } - - kubeInformerFactory := kubeinformers.NewSharedInformerFactoryWithOptions(kubeClient, time.Second*30, kubeinformers.WithNamespace(namespace)) +func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) { + cfg := cc.Config - podInformer := kubeInformerFactory.Core().V1().Pods() + podInformer := cc.KubeInformerFactory.Core().V1().Pods() - jobInformerFactory := informers.NewSharedInformerFactoryWithOptions(crdclient, time.Second*30, informers.WithNamespace(namespace)) - jobInformer := jobInformerFactory.Sedna().V1alpha1().LifelongLearningJobs() + jobInformer := cc.SednaInformerFactory.Sedna().V1alpha1().LifelongLearningJobs() eventBroadcaster := record.NewBroadcaster() - eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")}) + eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: cc.KubeClient.CoreV1().Events("")}) - jc := &LifelongLearningJobController{ - kubeClient: kubeClient, - client: crdclient.SednaV1alpha1(), - queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(DefaultBackOff, MaxBackOff), "lifelonglearningjob"), - recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "lifelonglearningjob-controller"}), + jc := &Controller{ + kubeClient: cc.KubeClient, + client: cc.SednaClient.SednaV1alpha1(), + queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(runtime.DefaultBackOff, runtime.MaxBackOff), Name), cfg: cfg, } 
jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { jc.enqueueController(obj, true) + jc.syncToEdge(watch.Added, obj) }, UpdateFunc: func(old, cur interface{}) { jc.enqueueController(cur, true) + jc.syncToEdge(watch.Added, cur) }, DeleteFunc: func(obj interface{}) { jc.enqueueController(obj, true) + jc.syncToEdge(watch.Deleted, obj) }, }) jc.jobLister = jobInformer.Lister() @@ -769,8 +745,5 @@ func NewLifelongLearningJobController(cfg *config.ControllerConfig) (FeatureCont jc.podStore = podInformer.Lister() jc.podStoreSynced = podInformer.Informer().HasSynced - stopCh := make(chan struct{}) - kubeInformerFactory.Start(stopCh) - jobInformerFactory.Start(stopCh) - return jc, err + return jc, nil } diff --git a/pkg/globalmanager/controllers/lifelonglearning/upstream.go b/pkg/globalmanager/controllers/lifelonglearning/upstream.go new file mode 100644 index 000000000..011c60ec7 --- /dev/null +++ b/pkg/globalmanager/controllers/lifelonglearning/upstream.go @@ -0,0 +1,164 @@ +/* +Copyright 2021 The KubeEdge Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package lifelonglearning + +import ( + "context" + "encoding/json" + "fmt" + "strings" + + sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/kubeedge/sedna/pkg/globalmanager/runtime" +) + +type Model = runtime.Model + +// the data of this condition including the input/output to do the next step +type ConditionData struct { + Input *struct { + // Only one model cases + Model *Model `json:"model,omitempty"` + Models []Model `json:"models,omitempty"` + + DataURL string `json:"dataURL,omitempty"` + + // the data samples reference will be stored into this URL. + // The content of this url would be: + // # the first uncomment line means the directory + // s3://dataset/ + // mnist/0.jpg + // mnist/1.jpg + DataIndexURL string `json:"dataIndexURL,omitempty"` + + OutputDir string `json:"outputDir,omitempty"` + } `json:"input,omitempty"` + + Output *struct { + Model *Model `json:"model,omitempty"` + Models []Model `json:"models,omitempty"` + } `json:"output,omitempty"` +} + +func (cd *ConditionData) joinModelURLs(model *Model, models []Model) []string { + var modelURLs []string + if model != nil { + modelURLs = append(modelURLs, model.GetURL()) + } else { + for _, m := range models { + modelURLs = append(modelURLs, m.GetURL()) + } + } + return modelURLs +} + +func (cd *ConditionData) Unmarshal(data []byte) error { + return json.Unmarshal(data, cd) +} + +func (cd ConditionData) Marshal() ([]byte, error) { + return json.Marshal(cd) +} + +func (cd *ConditionData) GetInputModelURLs() []string { + return cd.joinModelURLs(cd.Input.Model, cd.Input.Models) +} + +func (cd *ConditionData) GetOutputModelURLs() []string { + return cd.joinModelURLs(cd.Output.Model, cd.Output.Models) +} + +func (c *Controller) appendStatusCondition(name, namespace string, cond sednav1.LLJobCondition) error { + client := c.client.LifelongLearningJobs(namespace) + return runtime.RetryUpdateStatus(name, 
namespace, func() error { + job, err := client.Get(context.TODO(), name, metav1.GetOptions{}) + if err != nil { + return err + } + job.Status.Conditions = append(job.Status.Conditions, cond) + _, err = client.UpdateStatus(context.TODO(), job, metav1.UpdateOptions{}) + return err + }) +} + +// updateFromEdge syncs the edge updates to k8s +func (c *Controller) updateFromEdge(name, namespace, operation string, content []byte) error { + var jobStatus struct { + Phase string `json:"phase"` + Status string `json:"status"` + } + + err := json.Unmarshal(content, &jobStatus) + if err != nil { + return err + } + + // Get the condition data. + // Here unmarshal and marshal immediately to skip the unnecessary fields + var condData ConditionData + err = json.Unmarshal(content, &condData) + if err != nil { + return err + } + + condDataBytes, _ := json.Marshal(&condData) + + cond := sednav1.LLJobCondition{ + Status: v1.ConditionTrue, + LastHeartbeatTime: metav1.Now(), + LastTransitionTime: metav1.Now(), + Data: string(condDataBytes), + Message: "reported by lc", + } + + switch strings.ToLower(jobStatus.Phase) { + case "train": + cond.Stage = sednav1.LLJobTrain + case "eval": + cond.Stage = sednav1.LLJobEval + case "deploy": + cond.Stage = sednav1.LLJobDeploy + default: + return fmt.Errorf("invalid condition stage: %v", jobStatus.Phase) + } + + switch strings.ToLower(jobStatus.Status) { + case "ready": + cond.Type = sednav1.LLJobStageCondReady + case "completed": + cond.Type = sednav1.LLJobStageCondCompleted + case "failed": + cond.Type = sednav1.LLJobStageCondFailed + case "waiting": + cond.Type = sednav1.LLJobStageCondWaiting + default: + return fmt.Errorf("invalid condition type: %v", jobStatus.Status) + } + + err = c.appendStatusCondition(name, namespace, cond) + if err != nil { + return fmt.Errorf("failed to append condition, err:%+w", err) + } + return nil +} + +func (c *Controller) SetUpstreamHandler(addFunc runtime.UpstreamHandlerAddFunc) error { + return addFunc(KindName, 
c.updateFromEdge) +} diff --git a/pkg/globalmanager/controllers/manager.go b/pkg/globalmanager/controllers/manager.go new file mode 100644 index 000000000..42feb40ec --- /dev/null +++ b/pkg/globalmanager/controllers/manager.go @@ -0,0 +1,128 @@ +/* +Copyright 2021 The KubeEdge Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controllers + +import ( + "fmt" + "math/rand" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + kubeinformers "k8s.io/client-go/informers" + "k8s.io/klog/v2" + + clientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned" + sednainformers "github.com/kubeedge/sedna/pkg/client/informers/externalversions" + "github.com/kubeedge/sedna/pkg/globalmanager/config" + "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer" + websocket "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer/ws" + "github.com/kubeedge/sedna/pkg/globalmanager/runtime" + "github.com/kubeedge/sedna/pkg/globalmanager/utils" +) + +// Manager defines the controller manager +type Manager struct { + Config *config.ControllerConfig +} + +// New creates the controller manager +func New(cc *config.ControllerConfig) *Manager { + config.InitConfigure(cc) + return &Manager{ + Config: cc, + } +} + +func genResyncPeriod(minPeriod time.Duration) time.Duration { + factor := rand.Float64() + 1 + // [minPeriod, 2*minPeriod) + return time.Duration(factor * float64(minPeriod.Nanoseconds())) +} + +// Start starts the controllers it has managed +func (m *Manager) Start() 
error { + kubeClient, err := utils.KubeClient() + if err != nil { + return err + } + + kubecfg, err := utils.KubeConfig() + if err != nil { + return err + } + + sednaClient, err := clientset.NewForConfig(kubecfg) + if err != nil { + return err + } + + cfg := m.Config + namespace := cfg.Namespace + if namespace == "" { + namespace = metav1.NamespaceAll + } + + // TODO(llhuii): make this period configurable + minResyncPeriod := time.Second * 30 + + kubeInformerFactory := kubeinformers.NewSharedInformerFactoryWithOptions(kubeClient, genResyncPeriod(minResyncPeriod), kubeinformers.WithNamespace(namespace)) + + sednaInformerFactory := sednainformers.NewSharedInformerFactoryWithOptions(sednaClient, genResyncPeriod(minResyncPeriod), sednainformers.WithNamespace(namespace)) + + context := &runtime.ControllerContext{ + Config: m.Config, + + KubeClient: kubeClient, + KubeInformerFactory: kubeInformerFactory, + + SednaClient: sednaClient, + SednaInformerFactory: sednaInformerFactory, + } + + uc, _ := NewUpstreamController(context) + + downstreamSendFunc := messagelayer.NewContextMessageLayer().SendResourceObject + + stopCh := make(chan struct{}) + + go uc.Run(stopCh) + + for name, factory := range NewRegistry() { + f, err := factory(context) + if err != nil { + return fmt.Errorf("failed to initialize controller %s: %v", name, err) + } + f.SetDownstreamSendFunc(downstreamSendFunc) + f.SetUpstreamHandler(uc.Add) + + klog.Infof("initialized controller %s", name) + go f.Run(stopCh) + } + + kubeInformerFactory.Start(stopCh) + sednaInformerFactory.Start(stopCh) + + addr := fmt.Sprintf("%s:%d", m.Config.WebSocket.Address, m.Config.WebSocket.Port) + + ws := websocket.NewServer(addr) + err = ws.ListenAndServe() + if err != nil { + close(stopCh) + return fmt.Errorf("failed to listen websocket at %s: %v", addr, err) + } + return nil +} diff --git a/pkg/globalmanager/controllers/registry.go b/pkg/globalmanager/controllers/registry.go new file mode 100644 index 000000000..1af7db1f5 --- 
/dev/null +++ b/pkg/globalmanager/controllers/registry.go @@ -0,0 +1,40 @@ +/* +Copyright 2021 The KubeEdge Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controllers + +import ( + "github.com/kubeedge/sedna/pkg/globalmanager/controllers/dataset" + fl "github.com/kubeedge/sedna/pkg/globalmanager/controllers/federatedlearning" + il "github.com/kubeedge/sedna/pkg/globalmanager/controllers/incrementallearning" + ji "github.com/kubeedge/sedna/pkg/globalmanager/controllers/jointinference" + ll "github.com/kubeedge/sedna/pkg/globalmanager/controllers/lifelonglearning" + "github.com/kubeedge/sedna/pkg/globalmanager/runtime" +) + +type FeatureFactory = func(*runtime.ControllerContext) (runtime.FeatureControllerI, error) + +type Registry map[string]FeatureFactory + +func NewRegistry() Registry { + return Registry{ + ji.Name: ji.New, + fl.Name: fl.New, + il.Name: il.New, + ll.Name: ll.New, + dataset.Name: dataset.New, + } +} diff --git a/pkg/globalmanager/controllers/upstream.go b/pkg/globalmanager/controllers/upstream.go new file mode 100644 index 000000000..c02f2c571 --- /dev/null +++ b/pkg/globalmanager/controllers/upstream.go @@ -0,0 +1,105 @@ +/* +Copyright 2021 The KubeEdge Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controllers + +import ( + "fmt" + "strings" + + "k8s.io/klog/v2" + + "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer" + "github.com/kubeedge/sedna/pkg/globalmanager/runtime" +) + +// UpstreamController subscribes the updates from edge and syncs to k8s api server +type UpstreamController struct { + messageLayer messagelayer.MessageLayer + updateHandlers map[string]runtime.UpstreamHandler +} + +func (uc *UpstreamController) checkOperation(operation string) error { + // current only support the 'status' operation + if operation != "status" { + return fmt.Errorf("unknown operation '%s'", operation) + } + return nil +} + +// syncEdgeUpdate receives the updates from edge and syncs these to k8s. 
+func (uc *UpstreamController) syncEdgeUpdate() { + for { + select { + case <-uc.messageLayer.Done(): + klog.Info("Stop sedna upstream loop") + return + default: + } + + update, err := uc.messageLayer.ReceiveResourceUpdate() + if err == nil { + err = uc.checkOperation(update.Operation) + } + if err != nil { + klog.Warningf("Ignore update since this err: %+v", err) + continue + } + + kind := update.Kind + namespace := update.Namespace + name := update.Name + operation := update.Operation + + handler, ok := uc.updateHandlers[kind] + if ok { + err := handler(name, namespace, operation, update.Content) + if err != nil { + klog.Errorf("Error to handle %s %s/%s operation(%s): %+v", kind, namespace, name, operation, err) + } + } else { + klog.Warningf("No handler for resource kind %s", kind) + } + } +} + +// Run starts the upstream controller +func (uc *UpstreamController) Run(stopCh <-chan struct{}) { + klog.Info("Start the sedna upstream controller") + + uc.syncEdgeUpdate() + <-stopCh +} + +func (uc *UpstreamController) Add(kind string, handler runtime.UpstreamHandler) error { + kind = strings.ToLower(kind) + if _, ok := uc.updateHandlers[kind]; ok { + return fmt.Errorf("a upstream handler for kind %s already exists", kind) + } + uc.updateHandlers[kind] = handler + + return nil +} + +// NewUpstreamController creates a new Upstream controller from config +func NewUpstreamController(cc *runtime.ControllerContext) (*UpstreamController, error) { + uc := &UpstreamController{ + messageLayer: messagelayer.NewContextMessageLayer(), + updateHandlers: make(map[string]runtime.UpstreamHandler), + } + + return uc, nil +} diff --git a/pkg/globalmanager/downstream.go b/pkg/globalmanager/downstream.go deleted file mode 100644 index 5de2f8310..000000000 --- a/pkg/globalmanager/downstream.go +++ /dev/null @@ -1,388 +0,0 @@ -/* -Copyright 2021 The KubeEdge Authors. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package globalmanager - -import ( - "context" - "fmt" - "time" - - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/fields" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/watch" - "k8s.io/client-go/kubernetes" - "k8s.io/client-go/tools/cache" - "k8s.io/klog/v2" - - sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" - clientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1" - "github.com/kubeedge/sedna/pkg/globalmanager/config" - "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer" - "github.com/kubeedge/sedna/pkg/globalmanager/utils" -) - -// DownstreamController watch kubernetes api server and send the controller resource change to edge -type DownstreamController struct { - // events from watch kubernetes api server - events chan watch.Event - - cfg *config.ControllerConfig - - client *clientset.SednaV1alpha1Client - kubeClient kubernetes.Interface - - messageLayer messagelayer.MessageLayer -} - -func (dc *DownstreamController) injectSecret(obj CommonInterface, secretName string) error { - if secretName == "" { - return nil - } - - secret, err := dc.kubeClient.CoreV1().Secrets(obj.GetNamespace()).Get(context.TODO(), secretName, metav1.GetOptions{}) - if err != nil { - klog.Warningf("failed to get the secret %s: %+v", - secretName, err) - - return err - } - InjectSecretObj(obj, secret) - return err -} - -// syncDataset syncs the dataset resources 
-func (dc *DownstreamController) syncDataset(eventType watch.EventType, dataset *sednav1.Dataset) error { - // Here only propagate to the nodes with non empty name - nodeName := dataset.Spec.NodeName - if len(nodeName) == 0 { - return fmt.Errorf("empty node name") - } - dc.injectSecret(dataset, dataset.Spec.CredentialName) - - return dc.messageLayer.SendResourceObject(nodeName, eventType, dataset) -} - -// syncJointInferenceService syncs the joint-inference-service resources -func (dc *DownstreamController) syncJointInferenceService(eventType watch.EventType, joint *sednav1.JointInferenceService) error { - // Here only propagate to the nodes with non empty name - // FIXME: only the case that Spec.NodeName specified is support - nodeName := joint.Spec.EdgeWorker.Template.Spec.NodeName - if len(nodeName) == 0 { - return fmt.Errorf("empty node name") - } - - return dc.messageLayer.SendResourceObject(nodeName, eventType, joint) -} - -// syncFederatedLearningJob syncs the federated resources -func (dc *DownstreamController) syncFederatedLearningJob(eventType watch.EventType, job *sednav1.FederatedLearningJob) error { - // broadcast to all nodes specified in spec - nodeset := make(map[string]bool) - for _, trainingWorker := range job.Spec.TrainingWorkers { - // Here only propagate to the nodes with non empty name - if len(trainingWorker.Template.Spec.NodeName) > 0 { - nodeset[trainingWorker.Template.Spec.NodeName] = true - } - } - - for nodeName := range nodeset { - dc.messageLayer.SendResourceObject(nodeName, eventType, job) - } - return nil -} - -// syncModelWithName will sync the model to the specified node. -// Now called when creating the incrementaljob. 
-func (dc *DownstreamController) syncModelWithName(nodeName, modelName, namespace string) error { - model, err := dc.client.Models(namespace).Get(context.TODO(), modelName, metav1.GetOptions{}) - if err != nil { - // TODO: maybe use err.ErrStatus.Code == 404 - return fmt.Errorf("model(%s/%s) not found", namespace, modelName) - } - - // Since model.Kind may be empty, - // we need to fix the kind here if missing. - // more details at https://github.com/kubernetes/kubernetes/issues/3030 - if len(model.Kind) == 0 { - model.Kind = "Model" - } - - dc.injectSecret(model, model.Spec.CredentialName) - - dc.messageLayer.SendResourceObject(nodeName, watch.Added, model) - return nil -} - -// syncIncrementalJob syncs the incremental learning jobs -func (dc *DownstreamController) syncIncrementalJob(eventType watch.EventType, job *sednav1.IncrementalLearningJob) error { - jobConditions := job.Status.Conditions - if len(jobConditions) == 0 { - return nil - } - - dataName := job.Spec.Dataset.Name - ds, err := dc.client.Datasets(job.Namespace).Get(context.TODO(), dataName, metav1.GetOptions{}) - if err != nil { - return fmt.Errorf("dataset(%s/%s) not found", job.Namespace, dataName) - } - // LC has dataset object on this node that may call dataset node - dsNodeName := ds.Spec.NodeName - - var trainNodeName string - var evalNodeName string - - ann := job.GetAnnotations() - if ann != nil { - trainNodeName = ann[AnnotationsKeyPrefix+string(sednav1.ILJobTrain)] - evalNodeName = ann[AnnotationsKeyPrefix+string(sednav1.ILJobEval)] - } - - if eventType == watch.Deleted { - // delete jobs from all LCs - for _, v := range []string{dsNodeName, trainNodeName, evalNodeName} { - if v != "" { - dc.messageLayer.SendResourceObject(v, eventType, job) - } - } - return nil - } - - latestCondition := jobConditions[len(jobConditions)-1] - currentType := latestCondition.Type - jobStage := latestCondition.Stage - - syncModelWithName := func(modelName string) { - if err := dc.syncModelWithName(dsNodeName, 
modelName, job.Namespace); err != nil { - klog.Warningf("Error to sync model %s when sync incremental learning job %s to node %s: %v", - modelName, job.Name, dsNodeName, err) - } - } - - syncJobWithNodeName := func(nodeName string) { - if err := dc.messageLayer.SendResourceObject(nodeName, eventType, job); err != nil { - klog.Warningf("Error to sync incremental learning job %s to node %s in stage %s: %v", - job.Name, nodeName, jobStage, err) - } - } - - dc.injectSecret(job, job.Spec.CredentialName) - - doJobStageEvent := func(modelName string, nodeName string) { - if currentType == sednav1.ILJobStageCondWaiting { - syncJobWithNodeName(dsNodeName) - syncModelWithName(modelName) - } else if currentType == sednav1.ILJobStageCondRunning { - if nodeName != "" { - syncJobWithNodeName(nodeName) - } - } else if currentType == sednav1.ILJobStageCondCompleted || currentType == sednav1.ILJobStageCondFailed { - if nodeName != dsNodeName { - // delete LC's job from nodeName that's different from dataset node when worker's status is completed or failed. - dc.messageLayer.SendResourceObject(nodeName, watch.Deleted, job) - } - } - } - - switch jobStage { - case sednav1.ILJobTrain: - doJobStageEvent(job.Spec.InitialModel.Name, trainNodeName) - case sednav1.ILJobEval: - doJobStageEvent(job.Spec.DeploySpec.Model.Name, evalNodeName) - } - - return nil -} - -// syncLifelongLearningJob syncs the lifelonglearning jobs -func (dc *DownstreamController) syncLifelongLearningJob(eventType watch.EventType, job *sednav1.LifelongLearningJob) error { - // Here only propagate to the nodes with non empty name - - // FIXME(llhuii): only the case that all workers having the same nodeName are support, - // will support Spec.NodeSelector and differenect nodeName. 
- nodeName := job.Spec.TrainSpec.Template.Spec.NodeName - if len(nodeName) == 0 { - return fmt.Errorf("empty node name") - } - - dc.injectSecret(job, job.Spec.CredentialName) - dc.messageLayer.SendResourceObject(nodeName, eventType, job) - - return nil -} - -// sync defines the entrypoint of syncing all resources -func (dc *DownstreamController) sync(stopCh <-chan struct{}) { - for { - select { - case <-stopCh: - klog.Info("Stop controller downstream loop") - return - - case e := <-dc.events: - - var err error - var kind, namespace, name string - switch t := e.Object.(type) { - case (*sednav1.Dataset): - // Since t.Kind may be empty, - // we need to fix the kind here if missing. - // more details at https://github.com/kubernetes/kubernetes/issues/3030 - if len(t.Kind) == 0 { - t.Kind = "Dataset" - } - kind = t.Kind - namespace = t.Namespace - name = t.Name - err = dc.syncDataset(e.Type, t) - - case (*sednav1.JointInferenceService): - // TODO: find a good way to avoid these duplicate codes - if len(t.Kind) == 0 { - t.Kind = "JointInferenceService" - } - kind = t.Kind - namespace = t.Namespace - name = t.Name - err = dc.syncJointInferenceService(e.Type, t) - - case (*sednav1.FederatedLearningJob): - if len(t.Kind) == 0 { - t.Kind = "FederatedLearningJob" - } - kind = t.Kind - namespace = t.Namespace - name = t.Name - err = dc.syncFederatedLearningJob(e.Type, t) - - case (*sednav1.IncrementalLearningJob): - if len(t.Kind) == 0 { - t.Kind = "IncrementalLearningJob" - } - kind = t.Kind - namespace = t.Namespace - name = t.Name - err = dc.syncIncrementalJob(e.Type, t) - case (*sednav1.LifelongLearningJob): - if len(t.Kind) == 0 { - t.Kind = "LifelongLearningJob" - } - kind = t.Kind - namespace = t.Namespace - name = t.Name - err = dc.syncLifelongLearningJob(e.Type, t) - default: - klog.Warningf("object type: %T unsupported", e) - continue - } - - if err != nil { - klog.Warningf("Error to sync %s(%s/%s), err: %+v", kind, namespace, name, err) - } else { - 
klog.V(2).Infof("synced %s(%s/%s)", kind, namespace, name) - } - } - } -} - -// watch function watches the crd resources which should by synced to nodes -func (dc *DownstreamController) watch(stopCh <-chan struct{}) { - rh := cache.ResourceEventHandlerFuncs{ - AddFunc: func(obj interface{}) { - eventObj := obj.(runtime.Object) - dc.events <- watch.Event{Type: watch.Added, Object: eventObj} - }, - UpdateFunc: func(old, cur interface{}) { - // Since we don't support the spec update operation currently, - // so only status updates arrive here and NO propagation to edge. - - // Update: - // We sync it to edge when using self-built websocket, and - // this sync isn't needed when we switch out self-built websocket. - dc.events <- watch.Event{Type: watch.Added, Object: cur.(runtime.Object)} - }, - DeleteFunc: func(obj interface{}) { - eventObj := obj.(runtime.Object) - dc.events <- watch.Event{Type: watch.Deleted, Object: eventObj} - }, - } - - client := dc.client.RESTClient() - - // make this option configurable - resyncPeriod := time.Second * 60 - namespace := dc.cfg.Namespace - - // TODO: use the informer - for resourceName, object := range map[string]runtime.Object{ - "datasets": &sednav1.Dataset{}, - "jointinferenceservices": &sednav1.JointInferenceService{}, - "federatedlearningjobs": &sednav1.FederatedLearningJob{}, - "incrementallearningjobs": &sednav1.IncrementalLearningJob{}, - "lifelonglearningjobs": &sednav1.LifelongLearningJob{}, - } { - lw := cache.NewListWatchFromClient(client, resourceName, namespace, fields.Everything()) - si := cache.NewSharedInformer(lw, object, resyncPeriod) - si.AddEventHandler(rh) - go si.Run(stopCh) - } -} - -// Start starts the controller -func (dc *DownstreamController) Start() error { - stopCh := dc.messageLayer.Done() - - // watch is an asynchronous call - dc.watch(stopCh) - - // sync is a synchronous call - go dc.sync(stopCh) - - return nil -} - -// GetName returns the name of the downstream controller -func (dc 
*DownstreamController) GetName() string { - return "DownstreamController" -} - -// NewDownstreamController creates a controller DownstreamController from config -func NewDownstreamController(cfg *config.ControllerConfig) (FeatureControllerI, error) { - // TODO: make bufferSize configurable - bufferSize := 10 - events := make(chan watch.Event, bufferSize) - - crdclient, err := utils.NewCRDClient() - if err != nil { - return nil, fmt.Errorf("create crd client failed with error: %w", err) - } - - kubeClient, err := utils.KubeClient() - if err != nil { - return nil, err - } - - dc := &DownstreamController{ - cfg: cfg, - events: events, - client: crdclient, - kubeClient: kubeClient, - messageLayer: messagelayer.NewContextMessageLayer(), - } - - return dc, nil -} diff --git a/pkg/globalmanager/common.go b/pkg/globalmanager/runtime/common.go similarity index 70% rename from pkg/globalmanager/common.go rename to pkg/globalmanager/runtime/common.go index 85842b3d0..e85c15c00 100644 --- a/pkg/globalmanager/common.go +++ b/pkg/globalmanager/runtime/common.go @@ -14,10 +14,11 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package globalmanager +package runtime import ( "context" + "encoding/json" "fmt" "math" "strings" @@ -27,16 +28,14 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" "k8s.io/client-go/util/workqueue" + "k8s.io/klog/v2" + + sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" ) const ( - // DefaultBackOff is the default backoff period - DefaultBackOff = 10 * time.Second - // MaxBackOff is the max backoff period - MaxBackOff = 360 * time.Second - bigModelPort int32 = 5000 - // ResourceUpdateRetries defines times of retrying to update resource - ResourceUpdateRetries = 3 + // resourceUpdateTries defines times of trying to update resource + resourceUpdateTries = 3 ) // GetNodeIPByName get node ip by node name @@ -62,8 +61,8 @@ func GetNodeIPByName(kubeClient kubernetes.Interface, name string) (string, erro return "", fmt.Errorf("can't found node ip for node %s", name) } -// getBackoff calc the next wait time for the key -func getBackoff(queue workqueue.RateLimitingInterface, key interface{}) time.Duration { +// GetBackoff calc the next wait time for the key +func GetBackoff(queue workqueue.RateLimitingInterface, key interface{}) time.Duration { exp := queue.NumRequeues(key) if exp <= 0 { @@ -83,7 +82,7 @@ func getBackoff(queue workqueue.RateLimitingInterface, key interface{}) time.Dur return calculated } -func calcActivePodCount(pods []*v1.Pod) int32 { +func CalcActivePodCount(pods []*v1.Pod) int32 { var result int32 = 0 for _, p := range pods { if v1.PodSucceeded != p.Status.Phase && @@ -129,3 +128,35 @@ func ConvertK8SValidName(name string) string { return string(fixName) } + +// ConvertMapToMetrics converts the metric map to list of resource Metric +func ConvertMapToMetrics(metric map[string]interface{}) []sednav1.Metric { + var l []sednav1.Metric + for k, v := range metric { + var displayValue string + switch t := v.(type) { + case string: + displayValue = t + default: + // ignore the json marshal error + b, _ := 
json.Marshal(v) + displayValue = string(b) + } + + l = append(l, sednav1.Metric{Key: k, Value: displayValue}) + } + return l +} + +// RetryUpdateStatus simply retries to call the status update func +func RetryUpdateStatus(name, namespace string, updateStatusFunc func() error) error { + var err error + for try := 1; try <= resourceUpdateTries; try++ { + err = updateStatusFunc() + if err == nil { + return nil + } + klog.Warningf("Error to update %s/%s status, tried %d times: %+v", namespace, name, try, err) + } + return err +} diff --git a/pkg/globalmanager/secret_injector.go b/pkg/globalmanager/runtime/secret_injector.go similarity index 84% rename from pkg/globalmanager/secret_injector.go rename to pkg/globalmanager/runtime/secret_injector.go index 6b5577f23..8c986f419 100644 --- a/pkg/globalmanager/secret_injector.go +++ b/pkg/globalmanager/runtime/secret_injector.go @@ -14,13 +14,16 @@ See the License for the specific language governing permissions and limitations under the License. */ -package globalmanager +package runtime import ( + "context" "encoding/json" "fmt" v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" ) const ( @@ -106,11 +109,18 @@ func MergeSecretEnvs(nowE, newE []v1.EnvVar, overwrite bool) []v1.EnvVar { return nowE } -func InjectSecretObj(obj CommonInterface, secret *v1.Secret) { - if secret == nil { +func InjectSecretAnnotations(client kubernetes.Interface, obj CommonInterface, secretName string) (err error) { + if len(secretName) == 0 { + return + } + secret, err := client.CoreV1().Secrets(obj.GetNamespace()).Get(context.TODO(), secretName, metav1.GetOptions{}) + if err != nil { return } + return injectSecretObj(obj, secret) +} +func injectSecretObj(obj CommonInterface, secret *v1.Secret) (err error) { secretData := secret.GetAnnotations() for k, v := range secret.Data { @@ -127,4 +137,5 @@ func InjectSecretObj(obj CommonInterface, secret *v1.Secret) { ann[SecretAnnotationKey] = string(b) 
obj.SetAnnotations(ann) + return nil } diff --git a/pkg/globalmanager/storage_initializer_injector.go b/pkg/globalmanager/runtime/storage_initializer_injector.go similarity index 97% rename from pkg/globalmanager/storage_initializer_injector.go rename to pkg/globalmanager/runtime/storage_initializer_injector.go index e6ee0d096..f9df1af88 100644 --- a/pkg/globalmanager/storage_initializer_injector.go +++ b/pkg/globalmanager/runtime/storage_initializer_injector.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package globalmanager +package runtime import ( "net/url" @@ -179,7 +179,7 @@ func injectHostPathMount(pod *v1.Pod, workerParam *WorkerParam) { hostPathType := v1.HostPathDirectory - for _, mount := range workerParam.mounts { + for _, mount := range workerParam.Mounts { for _, m := range mount.URLs { if m.HostPath == "" { continue @@ -240,7 +240,7 @@ func injectHostPathMount(pod *v1.Pod, workerParam *WorkerParam) { func injectWorkerSecrets(pod *v1.Pod, workerParam *WorkerParam) { var secretEnvs []v1.EnvVar - for _, mount := range workerParam.mounts { + for _, mount := range workerParam.Mounts { for _, m := range mount.URLs { if m.Disable || m.DownloadByInitializer { continue @@ -259,7 +259,7 @@ func injectInitializerContainer(pod *v1.Pod, workerParam *WorkerParam) { var downloadPairs []string var secretEnvs []v1.EnvVar - for _, mount := range workerParam.mounts { + for _, mount := range workerParam.Mounts { for _, m := range mount.URLs { if m.Disable { continue @@ -345,7 +345,7 @@ func injectInitializerContainer(pod *v1.Pod, workerParam *WorkerParam) { func InjectStorageInitializer(pod *v1.Pod, workerParam *WorkerParam) { var mounts []WorkerMount // parse the mounts and environment key - for _, mount := range workerParam.mounts { + for _, mount := range workerParam.Mounts { var envPaths []string if mount.URL != nil { @@ -374,13 +374,13 @@ func InjectStorageInitializer(pod *v1.Pod, 
workerParam *WorkerParam) { } if mount.EnvName != "" { - workerParam.env[mount.EnvName] = strings.Join( + workerParam.Env[mount.EnvName] = strings.Join( envPaths, urlsFieldSep, ) } } - workerParam.mounts = mounts + workerParam.Mounts = mounts // need to call injectInitializerContainer before injectHostPathMount // since injectHostPathMount could inject volumeMount to init container diff --git a/pkg/globalmanager/runtime/types.go b/pkg/globalmanager/runtime/types.go new file mode 100644 index 000000000..4a2c075d7 --- /dev/null +++ b/pkg/globalmanager/runtime/types.go @@ -0,0 +1,103 @@ +/* +Copyright 2021 The KubeEdge Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package runtime + +import ( + "time" + + "github.com/kubeedge/sedna/pkg/globalmanager/config" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + k8sruntime "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/watch" + kubeinformers "k8s.io/client-go/informers" + "k8s.io/client-go/kubernetes" + + sednaclientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned" + sednainformers "github.com/kubeedge/sedna/pkg/client/informers/externalversions" +) + +const ( + // DefaultBackOff is the default backoff period + DefaultBackOff = 10 * time.Second + // MaxBackOff is the max backoff period + MaxBackOff = 360 * time.Second + + // TrainPodType is type of train pod + TrainPodType = "train" + // EvalPodType is type of eval pod + EvalPodType = "eval" + // InferencePodType is type of inference pod + InferencePodType = "inference" + + // AnnotationsKeyPrefix defines prefix of key in annotations + AnnotationsKeyPrefix = "sedna.io/" +) + +type Model struct { + Format string `json:"format,omitempty"` + URL string `json:"url,omitempty"` + Metrics map[string]interface{} `json:"metrics,omitempty"` +} + +func (m *Model) GetURL() string { + return m.URL +} + +// CommonInterface describes the common interface of CRs +type CommonInterface interface { + metav1.Object + schema.ObjectKind + k8sruntime.Object +} + +// UpstreamHandler is the function definition for handling the upstream updates, +// i.e.
resource updates(mainly status) from LC(running at edge) +type UpstreamHandler = func(name, namespace, operation string, content []byte) error + +// UpstreamHandlerAddFunc defines the upstream controller register function for adding handler +type UpstreamHandlerAddFunc = func(kind string, updateHandler UpstreamHandler) error + +// DownstreamSendFunc is the send function for feature controllers to sync the resource updates(spec and status) to LC +type DownstreamSendFunc = func(nodeName string, eventType watch.EventType, obj interface{}) error + +// BaseControllerI defines the interface of a controller +type BaseControllerI interface { + Run(stopCh <-chan struct{}) +} + +// FeatureControllerI defines the interface of an AI Feature controller +type FeatureControllerI interface { + BaseControllerI + + // SetDownstreamSendFunc sets up the downstream send function in the feature controller + SetDownstreamSendFunc(f DownstreamSendFunc) error + + // SetUpstreamHandler sets up the upstream handler function for the feature controller + SetUpstreamHandler(add UpstreamHandlerAddFunc) error +} + +// ControllerContext defines the context that all feature controllers share and belong to +type ControllerContext struct { + Config *config.ControllerConfig + + KubeClient kubernetes.Interface + KubeInformerFactory kubeinformers.SharedInformerFactory + + SednaClient sednaclientset.Interface + SednaInformerFactory sednainformers.SharedInformerFactory +} diff --git a/pkg/globalmanager/worker.go b/pkg/globalmanager/runtime/worker.go similarity index 89% rename from pkg/globalmanager/worker.go rename to pkg/globalmanager/runtime/worker.go index dc950faf3..df7208f41 100644 --- a/pkg/globalmanager/worker.go +++ b/pkg/globalmanager/runtime/worker.go @@ -1,4 +1,4 @@ -package globalmanager +package runtime import ( "context" @@ -27,15 +27,15 @@ type WorkerMount struct { // WorkerParam describes the system-defined parameters of worker type WorkerParam struct { - mounts []WorkerMount + Mounts
[]WorkerMount - env map[string]string - workerType string + Env map[string]string + WorkerType string // if true, force to use hostNetwork - hostNetwork bool + HostNetwork bool - restartPolicy v1.RestartPolicy + RestartPolicy v1.RestartPolicy } // generateLabels generates labels for an object @@ -109,7 +109,7 @@ func CreateKubernetesService(kubeClient kubernetes.Interface, object CommonInter func injectWorkerParam(pod *v1.Pod, workerParam *WorkerParam, object CommonInterface) { InjectStorageInitializer(pod, workerParam) - envs := createEnvVars(workerParam.env) + envs := createEnvVars(workerParam.Env) for idx := range pod.Spec.Containers { pod.Spec.Containers[idx].Env = append( pod.Spec.Containers[idx].Env, envs..., @@ -121,27 +121,27 @@ func injectWorkerParam(pod *v1.Pod, workerParam *WorkerParam, object CommonInter pod.Labels = make(map[string]string) } - for k, v := range generateLabels(object, workerParam.workerType) { + for k, v := range generateLabels(object, workerParam.WorkerType) { pod.Labels[k] = v } - pod.GenerateName = object.GetName() + "-" + strings.ToLower(workerParam.workerType) + "-" + pod.GenerateName = object.GetName() + "-" + strings.ToLower(workerParam.WorkerType) + "-" pod.Namespace = object.GetNamespace() - if workerParam.hostNetwork { + if workerParam.HostNetwork { // FIXME // force to set hostnetwork pod.Spec.HostNetwork = true } if pod.Spec.RestartPolicy == "" { - pod.Spec.RestartPolicy = workerParam.restartPolicy + pod.Spec.RestartPolicy = workerParam.RestartPolicy } } -// createPodWithTemplate creates and returns a pod object given a crd object, pod template, and workerParam -func createPodWithTemplate(client kubernetes.Interface, object CommonInterface, spec *v1.PodTemplateSpec, workerParam *WorkerParam) (*v1.Pod, error) { +// CreatePodWithTemplate creates and returns a pod object given a crd object, pod template, and workerParam +func CreatePodWithTemplate(client kubernetes.Interface, object CommonInterface, spec *v1.PodTemplateSpec, 
workerParam *WorkerParam) (*v1.Pod, error) { objectKind := object.GroupVersionKind() pod, _ := k8scontroller.GetPodFromTemplate(spec, object, metav1.NewControllerRef(object, objectKind)) injectWorkerParam(pod, workerParam, object) @@ -149,7 +149,7 @@ func createPodWithTemplate(client kubernetes.Interface, object CommonInterface, createdPod, err := client.CoreV1().Pods(object.GetNamespace()).Create(context.TODO(), pod, metav1.CreateOptions{}) objectName := object.GetNamespace() + "/" + object.GetName() if err != nil { - klog.Warningf("failed to create pod(type=%s) for %s %s, err:%s", workerParam.workerType, objectKind, objectName, err) + klog.Warningf("failed to create pod(type=%s) for %s %s, err:%s", workerParam.WorkerType, objectKind, objectName, err) return nil, err } klog.V(2).Infof("pod %s is created successfully for %s %s", createdPod.Name, objectKind, objectName) diff --git a/pkg/globalmanager/types.go b/pkg/globalmanager/types.go deleted file mode 100644 index 2fb9534be..000000000 --- a/pkg/globalmanager/types.go +++ /dev/null @@ -1,168 +0,0 @@ -/* -Copyright 2021 The KubeEdge Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package globalmanager - -import ( - "encoding/json" - - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/schema" -) - -// CommonInterface describes the commom interface of CRs -type CommonInterface interface { - metav1.Object - schema.ObjectKind - runtime.Object -} - -// FeatureControllerI defines the interface of an AI Feature controller -type FeatureControllerI interface { - Start() error - GetName() string -} - -type Model struct { - Format string `json:"format,omitempty"` - URL string `json:"url,omitempty"` - Metrics map[string]interface{} `json:"metrics,omitempty"` -} - -// the data of this condition including the input/output to do the next step -type IncrementalCondData struct { - Input *struct { - // Only one model cases - Model *Model `json:"model,omitempty"` - Models []Model `json:"models,omitempty"` - - DataURL string `json:"dataURL,omitempty"` - - // the data samples reference will be stored into this URL. 
- // The content of this url would be: - // # the first uncomment line means the directory - // s3://dataset/ - // mnist/0.jpg - // mnist/1.jpg - DataIndexURL string `json:"dataIndexURL,omitempty"` - - OutputDir string `json:"outputDir,omitempty"` - } `json:"input,omitempty"` - - Output *struct { - Model *Model `json:"model,omitempty"` - Models []Model `json:"models,omitempty"` - } `json:"output,omitempty"` -} - -const ( - // TrainPodType is type of train pod - TrainPodType = "train" - // EvalPodType is type of eval pod - EvalPodType = "eval" - // InferencePodType is type of inference pod - InferencePodType = "inference" - - // AnnotationsKeyPrefix defines prefix of key in annotations - AnnotationsKeyPrefix = "sedna.io/" -) - -func (m *Model) GetURL() string { - return m.URL -} - -func (cd *IncrementalCondData) joinModelURLs(model *Model, models []Model) []string { - var modelURLs []string - if model != nil { - modelURLs = append(modelURLs, model.GetURL()) - } else { - for _, m := range models { - modelURLs = append(modelURLs, m.GetURL()) - } - } - return modelURLs -} - -func (cd *IncrementalCondData) GetInputModelURLs() []string { - return cd.joinModelURLs(cd.Input.Model, cd.Input.Models) -} - -func (cd *IncrementalCondData) GetOutputModelURLs() []string { - return cd.joinModelURLs(cd.Output.Model, cd.Output.Models) -} - -func (cd *IncrementalCondData) Unmarshal(data []byte) error { - return json.Unmarshal(data, cd) -} - -func (cd IncrementalCondData) Marshal() ([]byte, error) { - return json.Marshal(cd) -} - -// the data of this condition including the input/output to do the next step -type LifelongLearningCondData struct { - Input *struct { - // Only one model cases - Model *Model `json:"model,omitempty"` - Models []Model `json:"models,omitempty"` - - DataURL string `json:"dataURL,omitempty"` - - // the data samples reference will be stored into this URL. 
- // The content of this url would be: - // # the first uncomment line means the directory - // s3://dataset/ - // mnist/0.jpg - // mnist/1.jpg - DataIndexURL string `json:"dataIndexURL,omitempty"` - - OutputDir string `json:"outputDir,omitempty"` - } `json:"input,omitempty"` - - Output *struct { - Model *Model `json:"model,omitempty"` - Models []Model `json:"models,omitempty"` - } `json:"output,omitempty"` -} - -func (cd *LifelongLearningCondData) joinModelURLs(model *Model, models []Model) []string { - var modelURLs []string - if model != nil { - modelURLs = append(modelURLs, model.GetURL()) - } else { - for _, m := range models { - modelURLs = append(modelURLs, m.GetURL()) - } - } - return modelURLs -} - -func (cd *LifelongLearningCondData) Unmarshal(data []byte) error { - return json.Unmarshal(data, cd) -} - -func (cd LifelongLearningCondData) Marshal() ([]byte, error) { - return json.Marshal(cd) -} - -func (cd *LifelongLearningCondData) GetInputModelURLs() []string { - return cd.joinModelURLs(cd.Input.Model, cd.Input.Models) -} - -func (cd *LifelongLearningCondData) GetOutputModelURLs() []string { - return cd.joinModelURLs(cd.Output.Model, cd.Output.Models) -} diff --git a/pkg/globalmanager/upstream.go b/pkg/globalmanager/upstream.go deleted file mode 100644 index 13d64483e..000000000 --- a/pkg/globalmanager/upstream.go +++ /dev/null @@ -1,519 +0,0 @@ -/* -Copyright 2021 The KubeEdge Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package globalmanager - -import ( - "context" - "encoding/json" - "fmt" - "strings" - - v1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/klog/v2" - - sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" - clientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1" - "github.com/kubeedge/sedna/pkg/globalmanager/config" - "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer" - "github.com/kubeedge/sedna/pkg/globalmanager/utils" -) - -// updateHandler handles the updates from LC(running at edge) to update the -// corresponding resource -type updateHandler func(namespace, name, operation string, content []byte) error - -// UpstreamController subscribes the updates from edge and syncs to k8s api server -type UpstreamController struct { - client *clientset.SednaV1alpha1Client - messageLayer messagelayer.MessageLayer - updateHandlers map[string]updateHandler -} - -const upstreamStatusUpdateRetries = 3 - -// retryUpdateStatus simply retries to call the status update func -func retryUpdateStatus(name, namespace string, updateStatusFunc func() error) error { - var err error - for retry := 0; retry <= upstreamStatusUpdateRetries; retry++ { - err = updateStatusFunc() - if err == nil { - return nil - } - klog.Warningf("Error to update %s/%s status, retried %d times: %+v", namespace, name, retry, err) - } - return err -} - -func newUnmarshalError(namespace, name, operation string, content []byte) error { - return fmt.Errorf("Unable to unmarshal content for (%s/%s) operation: '%s', content: '%+v'", namespace, name, operation, string(content)) -} - -func checkUpstreamOperation(operation string) error { - // current only support the 'status' operation - if operation != "status" { - return fmt.Errorf("unknown operation %s", operation) - } - return nil -} - -// updateDatasetStatus updates the dataset status -func (uc *UpstreamController) updateDatasetStatus(name, namespace string, status 
sednav1.DatasetStatus) error { - client := uc.client.Datasets(namespace) - - if status.UpdateTime == nil { - now := metav1.Now() - status.UpdateTime = &now - } - - return retryUpdateStatus(name, namespace, func() error { - dataset, err := client.Get(context.TODO(), name, metav1.GetOptions{}) - if err != nil { - return err - } - dataset.Status = status - _, err = client.UpdateStatus(context.TODO(), dataset, metav1.UpdateOptions{}) - return err - }) -} - -// updateDatasetFromEdge syncs update from edge -func (uc *UpstreamController) updateDatasetFromEdge(name, namespace, operation string, content []byte) error { - err := checkUpstreamOperation(operation) - if err != nil { - return err - } - - status := sednav1.DatasetStatus{} - err = json.Unmarshal(content, &status) - if err != nil { - return newUnmarshalError(namespace, name, operation, content) - } - - return uc.updateDatasetStatus(name, namespace, status) -} - -// convertToMetrics converts the metrics from LCs to resource metrics -func convertToMetrics(m map[string]interface{}) []sednav1.Metric { - var l []sednav1.Metric - for k, v := range m { - var displayValue string - switch t := v.(type) { - case string: - displayValue = t - default: - // ignore the json marshal error - b, _ := json.Marshal(v) - displayValue = string(b) - } - - l = append(l, sednav1.Metric{Key: k, Value: displayValue}) - } - return l -} - -func (uc *UpstreamController) updateJointInferenceMetrics(name, namespace string, metrics []sednav1.Metric) error { - client := uc.client.JointInferenceServices(namespace) - - return retryUpdateStatus(name, namespace, func() error { - joint, err := client.Get(context.TODO(), name, metav1.GetOptions{}) - if err != nil { - return err - } - joint.Status.Metrics = metrics - _, err = client.UpdateStatus(context.TODO(), joint, metav1.UpdateOptions{}) - return err - }) -} - -// updateJointInferenceFromEdge syncs the edge updates to k8s -func (uc *UpstreamController) updateJointInferenceFromEdge(name, namespace, 
operation string, content []byte) error { - err := checkUpstreamOperation(operation) - if err != nil { - return err - } - - // Output defines owner output information - type Output struct { - ServiceInfo map[string]interface{} `json:"ownerInfo"` - } - - var status struct { - // Phase always should be "inference" - Phase string `json:"phase"` - Status string `json:"status"` - Output *Output `json:"output"` - } - - err = json.Unmarshal(content, &status) - if err != nil { - return newUnmarshalError(namespace, name, operation, content) - } - - // TODO: propagate status.Status to k8s - - output := status.Output - if output == nil || output.ServiceInfo == nil { - // no output info - klog.Warningf("empty status info for joint inference service %s/%s", namespace, name) - return nil - } - - info := output.ServiceInfo - - for _, ignoreTimeKey := range []string{ - "startTime", - "updateTime", - } { - delete(info, ignoreTimeKey) - } - - metrics := convertToMetrics(info) - - err = uc.updateJointInferenceMetrics(name, namespace, metrics) - if err != nil { - return fmt.Errorf("failed to update metrics, err:%+w", err) - } - return nil -} - -func (uc *UpstreamController) updateModelMetrics(name, namespace string, metrics []sednav1.Metric) error { - client := uc.client.Models(namespace) - - return retryUpdateStatus(name, namespace, (func() error { - model, err := client.Get(context.TODO(), name, metav1.GetOptions{}) - if err != nil { - return err - } - - now := metav1.Now() - model.Status.UpdateTime = &now - model.Status.Metrics = metrics - _, err = client.UpdateStatus(context.TODO(), model, metav1.UpdateOptions{}) - return err - })) -} - -func (uc *UpstreamController) updateModelMetricsByFederatedName(name, namespace string, metrics []sednav1.Metric) error { - client := uc.client.FederatedLearningJobs(namespace) - var err error - federatedLearningJob, err := client.Get(context.TODO(), name, metav1.GetOptions{}) - if err != nil { - // federated crd not found - return err - } - 
modelName := federatedLearningJob.Spec.AggregationWorker.Model.Name - return uc.updateModelMetrics(modelName, namespace, metrics) -} - -func (uc *UpstreamController) appendFederatedLearningJobStatusCondition(name, namespace string, cond sednav1.FLJobCondition) error { - client := uc.client.FederatedLearningJobs(namespace) - - return retryUpdateStatus(name, namespace, (func() error { - job, err := client.Get(context.TODO(), name, metav1.GetOptions{}) - if err != nil { - return err - } - job.Status.Conditions = append(job.Status.Conditions, cond) - _, err = client.UpdateStatus(context.TODO(), job, metav1.UpdateOptions{}) - return err - })) -} - -// updateFederatedLearningJobFromEdge updates the federated job's status -func (uc *UpstreamController) updateFederatedLearningJobFromEdge(name, namespace, operation string, content []byte) (err error) { - err = checkUpstreamOperation(operation) - if err != nil { - return err - } - - // JobInfo defines the job information - type JobInfo struct { - // Current training round - CurrentRound int `json:"currentRound"` - UpdateTime string `json:"updateTime"` - } - - // Output defines job output information - type Output struct { - Models []Model `json:"models"` - JobInfo *JobInfo `json:"ownerInfo"` - } - - var status struct { - Phase string `json:"phase"` - Status string `json:"status"` - Output *Output `json:"output"` - } - - err = json.Unmarshal(content, &status) - if err != nil { - err = newUnmarshalError(namespace, name, operation, content) - return - } - - output := status.Output - - if output != nil { - // Update the model's metrics - if len(output.Models) > 0 { - // only one model - model := output.Models[0] - metrics := convertToMetrics(model.Metrics) - if len(metrics) > 0 { - uc.updateModelMetricsByFederatedName(name, namespace, metrics) - } - } - - jobInfo := output.JobInfo - // update job info if having any info - if jobInfo != nil && jobInfo.CurrentRound > 0 { - // Find a good place to save the progress info - // TODO: 
more meaningful reason/message - reason := "DoTraining" - message := fmt.Sprintf("Round %v reaches at %s", jobInfo.CurrentRound, jobInfo.UpdateTime) - cond := NewFLJobCondition(sednav1.FLJobCondTraining, reason, message) - uc.appendFederatedLearningJobStatusCondition(name, namespace, cond) - } - } - - return nil -} - -func (uc *UpstreamController) appendIncrementalLearningJobStatusCondition(name, namespace string, cond sednav1.ILJobCondition) error { - client := uc.client.IncrementalLearningJobs(namespace) - return retryUpdateStatus(name, namespace, (func() error { - job, err := client.Get(context.TODO(), name, metav1.GetOptions{}) - if err != nil { - return err - } - job.Status.Conditions = append(job.Status.Conditions, cond) - _, err = client.UpdateStatus(context.TODO(), job, metav1.UpdateOptions{}) - return err - })) -} - -// updateIncrementalLearningFromEdge syncs the edge updates to k8s -func (uc *UpstreamController) updateIncrementalLearningFromEdge(name, namespace, operation string, content []byte) error { - err := checkUpstreamOperation(operation) - if err != nil { - return err - } - var jobStatus struct { - Phase string `json:"phase"` - Status string `json:"status"` - } - - err = json.Unmarshal(content, &jobStatus) - if err != nil { - return newUnmarshalError(namespace, name, operation, content) - } - - // Get the condition data. 
- // Here unmarshal and marshal immediately to skip the unnecessary fields - var condData IncrementalCondData - err = json.Unmarshal(content, &condData) - if err != nil { - return newUnmarshalError(namespace, name, operation, content) - } - condDataBytes, _ := json.Marshal(&condData) - - cond := sednav1.ILJobCondition{ - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Now(), - LastTransitionTime: metav1.Now(), - Data: string(condDataBytes), - Message: "reported by lc", - } - - switch strings.ToLower(jobStatus.Phase) { - case "train": - cond.Stage = sednav1.ILJobTrain - case "eval": - cond.Stage = sednav1.ILJobEval - case "deploy": - cond.Stage = sednav1.ILJobDeploy - default: - return fmt.Errorf("invalid condition stage: %v", jobStatus.Phase) - } - - switch strings.ToLower(jobStatus.Status) { - case "ready": - cond.Type = sednav1.ILJobStageCondReady - case "completed": - cond.Type = sednav1.ILJobStageCondCompleted - case "failed": - cond.Type = sednav1.ILJobStageCondFailed - case "waiting": - cond.Type = sednav1.ILJobStageCondWaiting - default: - return fmt.Errorf("invalid condition type: %v", jobStatus.Status) - } - - err = uc.appendIncrementalLearningJobStatusCondition(name, namespace, cond) - if err != nil { - return fmt.Errorf("failed to append condition, err:%+w", err) - } - return nil -} - -func (uc *UpstreamController) appendLifelongLearningJobStatusCondition(name, namespace string, cond sednav1.LLJobCondition) error { - client := uc.client.LifelongLearningJobs(namespace) - return retryUpdateStatus(name, namespace, func() error { - job, err := client.Get(context.TODO(), name, metav1.GetOptions{}) - if err != nil { - return err - } - job.Status.Conditions = append(job.Status.Conditions, cond) - _, err = client.UpdateStatus(context.TODO(), job, metav1.UpdateOptions{}) - return err - }) -} - -// updateLifelongLearningJobFromEdge syncs the edge updates to k8s -func (uc *UpstreamController) updateLifelongLearningJobFromEdge(name, namespace, operation 
string, content []byte) error { - err := checkUpstreamOperation(operation) - if err != nil { - return err - } - var jobStatus struct { - Phase string `json:"phase"` - Status string `json:"status"` - } - - err = json.Unmarshal(content, &jobStatus) - if err != nil { - return newUnmarshalError(namespace, name, operation, content) - } - - // Get the condition data. - // Here unmarshal and marshal immediately to skip the unnecessary fields - var condData LifelongLearningCondData - err = json.Unmarshal(content, &condData) - if err != nil { - return newUnmarshalError(namespace, name, operation, content) - } - condDataBytes, _ := json.Marshal(&condData) - - cond := sednav1.LLJobCondition{ - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Now(), - LastTransitionTime: metav1.Now(), - Data: string(condDataBytes), - Message: "reported by lc", - } - - switch strings.ToLower(jobStatus.Phase) { - case "train": - cond.Stage = sednav1.LLJobTrain - case "eval": - cond.Stage = sednav1.LLJobEval - case "deploy": - cond.Stage = sednav1.LLJobDeploy - default: - return fmt.Errorf("invalid condition stage: %v", jobStatus.Phase) - } - - switch strings.ToLower(jobStatus.Status) { - case "ready": - cond.Type = sednav1.LLJobStageCondReady - case "completed": - cond.Type = sednav1.LLJobStageCondCompleted - case "failed": - cond.Type = sednav1.LLJobStageCondFailed - case "waiting": - cond.Type = sednav1.LLJobStageCondWaiting - default: - return fmt.Errorf("invalid condition type: %v", jobStatus.Status) - } - - err = uc.appendLifelongLearningJobStatusCondition(name, namespace, cond) - if err != nil { - return fmt.Errorf("failed to append condition, err:%+w", err) - } - return nil -} - -// syncEdgeUpdate receives the updates from edge and syncs these to k8s. 
-func (uc *UpstreamController) syncEdgeUpdate() { - for { - select { - case <-uc.messageLayer.Done(): - klog.Info("Stop sedna upstream loop") - return - default: - } - - update, err := uc.messageLayer.ReceiveResourceUpdate() - if err != nil { - klog.Warningf("Ignore update since this err: %+v", err) - continue - } - - kind := update.Kind - namespace := update.Namespace - name := update.Name - operation := update.Operation - - handler, ok := uc.updateHandlers[kind] - if ok { - err := handler(name, namespace, operation, update.Content) - if err != nil { - klog.Errorf("Error to handle %s %s/%s operation(%s): %+v", kind, namespace, name, operation, err) - } - } else { - klog.Warningf("No handler for resource kind %s", kind) - } - } -} - -// Start the upstream controller -func (uc *UpstreamController) Start() error { - klog.Info("Start the sedna upstream controller") - - go uc.syncEdgeUpdate() - return nil -} - -// GetName returns the name of the upstream controller -func (uc *UpstreamController) GetName() string { - return "UpstreamController" -} - -// NewUpstreamController creates a new Upstream controller from config -func NewUpstreamController(cfg *config.ControllerConfig) (FeatureControllerI, error) { - client, err := utils.NewCRDClient() - if err != nil { - return nil, fmt.Errorf("create crd client failed with error: %w", err) - } - uc := &UpstreamController{ - client: client, - messageLayer: messagelayer.NewContextMessageLayer(), - } - - // NOTE: current no direct model update from edge, - // model update will be triggered by the corresponding training feature - uc.updateHandlers = map[string]updateHandler{ - "dataset": uc.updateDatasetFromEdge, - "jointinferenceservice": uc.updateJointInferenceFromEdge, - "federatedlearningjob": uc.updateFederatedLearningJobFromEdge, - "incrementallearningjob": uc.updateIncrementalLearningFromEdge, - "lifelonglearningjob": uc.updateLifelongLearningJobFromEdge, - } - - return uc, nil -} diff --git 
a/pkg/localcontroller/manager/incrementallearningjob.go b/pkg/localcontroller/manager/incrementallearningjob.go index a49a98669..20166e0f2 100644 --- a/pkg/localcontroller/manager/incrementallearningjob.go +++ b/pkg/localcontroller/manager/incrementallearningjob.go @@ -31,7 +31,8 @@ import ( "github.com/kubeedge/sedna/cmd/sedna-lc/app/options" sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" - "github.com/kubeedge/sedna/pkg/globalmanager" + gmtypes "github.com/kubeedge/sedna/pkg/globalmanager/controllers/incrementallearning" + "github.com/kubeedge/sedna/pkg/globalmanager/runtime" "github.com/kubeedge/sedna/pkg/localcontroller/db" "github.com/kubeedge/sedna/pkg/localcontroller/gmclient" "github.com/kubeedge/sedna/pkg/localcontroller/storage" @@ -435,11 +436,11 @@ func newTrigger(t sednav1.Trigger) (trigger.Base, error) { func (im *IncrementalJobManager) getTrainOrEvalModel(job *IncrementalLearningJob, jobStage sednav1.ILJobStage) *ModelInfo { jobConditions := job.Status.Conditions - // TODO: globalmanager.type changes to common.type for gm and lc - var models []globalmanager.Model + // TODO: runtime.type changes to common.type for gm and lc + var models []runtime.Model for i := len(jobConditions) - 1; i >= 0; i-- { - var cond globalmanager.IncrementalCondData + var cond gmtypes.IncrementalCondData jobCond := jobConditions[i] if jobCond.Stage == sednav1.ILJobTrain && jobCond.Type == sednav1.ILJobStageCondCompleted { if err := (&cond).Unmarshal([]byte(jobCond.Data)); err != nil {