-
Notifications
You must be signed in to change notification settings - Fork 972
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Update enqueue action, import overcommit plugin to limit pending jobs from inqueue. #1298
Conversation
d5a0744
to
e721964
Compare
@@ -655,7 +663,7 @@ func (ssn *Session) NodeOrderMapFn(task *api.TaskInfo, node *api.NodeInfo) (map[ | |||
return nodeScoreMap, priorityScore, nil | |||
} | |||
|
|||
// NodeOrderReduceFn invoke node order function of the plugins | |||
// NodeOrderReduceFn invoke nodeOrder function of the plugins |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
seperate those typo to different PRs.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
already deleted these.
8bba1de
to
5ff6c97
Compare
|
||
func (op *overcommitPlugin) OnSessionOpen(ssn *framework.Session) { | ||
overcommitStartTime := time.Now().UnixNano() | ||
op.pluginArguments.GetFloat64(&op.overCommitFactor, overCommitFactor) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
add overCommitFactor illegal check
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added overcommit-factor validity check: if input < 1, print warning logs & use default value instead.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
what happen if overcommit-factor
is an invalid number?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
what happen if
overcommit-factor
is an invalid number?
GetFloat64()
do not return err, only print warning logs, so here initializes overcommit-factor
with an illegal value -1
. If its value is not updated after GetFloat64()
, it will be processed the sam as condition 'input < 1'.
func (a Arguments) GetFloat64(ptr *float64, key string) { |
0c1a593
to
b08bcc3
Compare
Signed-off-by: jiangkaihua <jiangkaihua1@huawei.com>
b08bcc3
to
fce367d
Compare
…ion. Signed-off-by: jiangkaihua <jiangkaihua1@huawei.com>
fce367d
to
7014b84
Compare
} | ||
klog.V(4).Infof("overcommit plugin starts, overCommitFactor: %f", op.overCommitFactor) | ||
defer klog.V(4).Infof("overcommit plugin finishes, execution time: %dns", | ||
time.Now().UnixNano()-overcommitStartTime) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please remove time calculation in product, it only used for debug or analysis.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
time calculation is already removed.
op.inqueueResource.Add(jobMinReq) | ||
return true | ||
} | ||
klog.V(4).Infof("idle resource in cluster is overused, ignore job <%s/%s>", |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
change log from "idle resource in cluster is overused" to "resource in cluster is overused, not allow job <%s/%s> be inqueue"
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
modified this log.
382814a
to
07da59b
Compare
Signed-off-by: jiangkaihua <jiangkaihua1@huawei.com>
07da59b
to
763fdbf
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
/lgtm
/approve
/lgtm |
[APPROVALNOTIFIER] This PR is APPROVED This pull-request has been approved by: jiangkaihua, Thor-wl, william-wang The full list of commands accepted by this bot can be found here. The pull request process is described here
Needs approval from an approver in each of these files:
Approvers can indicate their approval by writing |
" using default value: %f", op.overCommitFactor, defaultOverCommitFactor) | ||
op.overCommitFactor = defaultOverCommitFactor | ||
} | ||
klog.V(4).Infof("Enter overcommit plugin ...") |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
move to the first line of this func.
func (op *overcommitPlugin) OnSessionOpen(ssn *framework.Session) { | ||
op.pluginArguments.GetFloat64(&op.overCommitFactor, overCommitFactor) | ||
if op.overCommitFactor < 1.0 { | ||
klog.Warningf("invalid input %f for overcommit-factor, reason: overcommit-factor cannot be less than 1,"+ |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
s/invalid/Invalid
inqueue := api.EmptyResource() | ||
for _, job := range ssn.Jobs { | ||
if job.PodGroup.Status.Phase == scheduling.PodGroupInqueue { | ||
inqueue.Add(api.NewResource(*job.PodGroup.Spec.MinResources)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
MinResources
maybe nil
inqueue.Add(api.NewResource(*job.PodGroup.Spec.MinResources)) | ||
} | ||
} | ||
op.inqueueResource = inqueue.Clone() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why clone it?
inqueue := api.EmptyResource() | ||
inqueue.Add(op.inqueueResource) | ||
if job.PodGroup.Spec.MinResources == nil { | ||
klog.V(4).Infof("job <%s/%s> is bestEffort, allow it be inqueue", job.Namespace, job.Name) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
s/job/Job
//TODO: if allow 1 more job to be inqueue beyond overcommit-factor, large job may be inqueue and create pods | ||
jobMinReq := api.NewResource(*job.PodGroup.Spec.MinResources) | ||
if inqueue.Add(jobMinReq).LessEqual(idle) { | ||
klog.V(4).Infof("sufficient resources, allow job <%s/%s> be inqueue", job.Namespace, job.Name) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ditto
op.inqueueResource.Add(jobMinReq) | ||
return true | ||
} | ||
klog.V(4).Infof("resource in cluster is overused, not allow job <%s/%s> be inqueue", |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ditto
No description provided.