Skip to content

Commit

Permalink
feat: add event recorder event
Browse files Browse the repository at this point in the history
  • Loading branch information
googs1025 committed Apr 12, 2024
1 parent ec6aef7 commit cdb1192
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 4 deletions.
8 changes: 8 additions & 0 deletions pkg/constants/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,14 @@ const (
// The event uses the error(s) as the reason.
JobCreationFailedReason = "JobCreationFailed"

// Event reason used when a Job deletion fails.
// The event uses the error(s) as the reason.
JobDeletionFailedReason = "JobDeletionFailed"

// Event reason used when a Headless Service creation fails.
// The event uses the error(s) as the reason.
HeadlessServiceCreationFailedReason = "HeadlessServiceCreationFailed"

// Event reason and message for when the pod controller detects a violation
// of the JobSet exclusive placment policy (i.e., follower pods not colocated in
// the same topology domain as the leader pod for that Job).
Expand Down
15 changes: 11 additions & 4 deletions pkg/controllers/jobset_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,15 +124,15 @@ func (r *JobSetReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
if requeueAfter > 0 {
return ctrl.Result{RequeueAfter: requeueAfter}, nil
}
if err := r.deleteJobs(ctx, ownedJobs.active); err != nil {
if err := r.deleteJobs(ctx, &js, ownedJobs.active); err != nil {
log.Error(err, "deleting jobs")
return ctrl.Result{}, err
}
return ctrl.Result{}, nil
}

// Delete any jobs marked for deletion.
if err := r.deleteJobs(ctx, ownedJobs.delete); err != nil {
if err := r.deleteJobs(ctx, &js, ownedJobs.delete); err != nil {
log.Error(err, "deleting jobs")
return ctrl.Result{}, err
}
Expand Down Expand Up @@ -511,7 +511,7 @@ func (r *JobSetReconciler) createJobs(ctx context.Context, js *jobset.JobSet, ow
return allErrs
}

func (r *JobSetReconciler) deleteJobs(ctx context.Context, jobsForDeletion []*batchv1.Job) error {
func (r *JobSetReconciler) deleteJobs(ctx context.Context, js *jobset.JobSet, jobsForDeletion []*batchv1.Job) error {
log := ctrl.LoggerFrom(ctx)
lock := &sync.Mutex{}
var finalErrs []error
Expand All @@ -533,7 +533,13 @@ func (r *JobSetReconciler) deleteJobs(ctx context.Context, jobsForDeletion []*ba
}
log.V(2).Info("successfully deleted job", "job", klog.KObj(targetJob), "restart attempt", targetJob.Labels[targetJob.Labels[constants.RestartsKey]])
})
return errors.Join(finalErrs...)

allErrs := errors.Join(finalErrs...)
if allErrs != nil {
r.Record.Eventf(js, corev1.EventTypeWarning, constants.JobDeletionFailedReason, allErrs.Error())
return allErrs
}
return allErrs
}

// TODO: look into adopting service and updating the selector
Expand Down Expand Up @@ -576,6 +582,7 @@ func (r *JobSetReconciler) createHeadlessSvcIfNecessary(ctx context.Context, js

// Create headless service.
if err := r.Create(ctx, &headlessSvc); err != nil {
r.Record.Eventf(js, corev1.EventTypeWarning, constants.HeadlessServiceCreationFailedReason, err.Error())
return err
}
log.V(2).Info("successfully created headless service", "service", klog.KObj(&headlessSvc))
Expand Down

0 comments on commit cdb1192

Please sign in to comment.