Skip to content

Commit

Permalink
sched/rt: Fix double enqueue caused by rt_effective_prio
Browse files Browse the repository at this point in the history
Double enqueues in rt runqueues (list) have been reported while running
a simple test that spawns a number of threads doing a short sleep/run
pattern while being concurrently setscheduled between rt and fair class.

  WARNING: CPU: 3 PID: 2825 at kernel/sched/rt.c:1294 enqueue_task_rt+0x355/0x360
  CPU: 3 PID: 2825 Comm: setsched__13
  RIP: 0010:enqueue_task_rt+0x355/0x360
  Call Trace:
   __sched_setscheduler+0x581/0x9d0
   _sched_setscheduler+0x63/0xa0
   do_sched_setscheduler+0xa0/0x150
   __x64_sys_sched_setscheduler+0x1a/0x30
   do_syscall_64+0x33/0x40
   entry_SYSCALL_64_after_hwframe+0x44/0xae

  list_add double add: new=ffff9867cb629b40, prev=ffff9867cb629b40,
		       next=ffff98679fc67ca0.
  kernel BUG at lib/list_debug.c:31!
  invalid opcode: 0000 [#1] PREEMPT_RT SMP PTI
  CPU: 3 PID: 2825 Comm: setsched__13
  RIP: 0010:__list_add_valid+0x41/0x50
  Call Trace:
   enqueue_task_rt+0x291/0x360
   __sched_setscheduler+0x581/0x9d0
   _sched_setscheduler+0x63/0xa0
   do_sched_setscheduler+0xa0/0x150
   __x64_sys_sched_setscheduler+0x1a/0x30
   do_syscall_64+0x33/0x40
   entry_SYSCALL_64_after_hwframe+0x44/0xae

__sched_setscheduler() uses rt_effective_prio() to handle proper queuing
of priority boosted tasks that are setscheduled while being boosted.
rt_effective_prio() is however called twice per each
__sched_setscheduler() call: first directly by __sched_setscheduler()
before dequeuing the task and then by __setscheduler() to actually do
the priority change. If the priority of the pi_top_task is concurrently
being changed however, it might happen that the two calls return
different results. If, for example, the first call returned the same rt
priority the task was running at and the second one a fair priority, the
task won't be removed by the rt list (on_list still set) and then
enqueued in the fair runqueue. When eventually setscheduled back to rt
it will be seen as enqueued already and the WARNING/BUG be issued.

Fix this by calling rt_effective_prio() only once and then reusing the
return value. While at it refactor code as well for clarity. Concurrent
priority inheritance handling is still safe and will eventually converge
to a new state by following the inheritance chain(s).

Fixes: 0782e63 ("sched: Handle priority boosted tasks proper in setscheduler()")
[squashed Peterz changes; added changelog]
Reported-by: Mark Simmons <msimmons@redhat.com>
Signed-off-by: Juri Lelli <juri.lelli@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20210803104501.38333-1-juri.lelli@redhat.com
  • Loading branch information
Peter Zijlstra committed Aug 4, 2021
1 parent c500bee commit f558c2b
Showing 1 changed file with 35 additions and 55 deletions.
90 changes: 35 additions & 55 deletions kernel/sched/core.c
Original file line number Diff line number Diff line change
Expand Up @@ -1981,12 +1981,18 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
dequeue_task(rq, p, flags);
}

/*
* __normal_prio - return the priority that is based on the static prio
*/
static inline int __normal_prio(struct task_struct *p)
static inline int __normal_prio(int policy, int rt_prio, int nice)
{
return p->static_prio;
int prio;

if (dl_policy(policy))
prio = MAX_DL_PRIO - 1;
else if (rt_policy(policy))
prio = MAX_RT_PRIO - 1 - rt_prio;
else
prio = NICE_TO_PRIO(nice);

return prio;
}

/*
Expand All @@ -1998,15 +2004,7 @@ static inline int __normal_prio(struct task_struct *p)
*/
static inline int normal_prio(struct task_struct *p)
{
int prio;

if (task_has_dl_policy(p))
prio = MAX_DL_PRIO-1;
else if (task_has_rt_policy(p))
prio = MAX_RT_PRIO-1 - p->rt_priority;
else
prio = __normal_prio(p);
return prio;
return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
}

/*
Expand Down Expand Up @@ -4099,7 +4097,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
} else if (PRIO_TO_NICE(p->static_prio) < 0)
p->static_prio = NICE_TO_PRIO(0);

p->prio = p->normal_prio = __normal_prio(p);
p->prio = p->normal_prio = p->static_prio;
set_load_weight(p, false);

/*
Expand Down Expand Up @@ -6341,6 +6339,18 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag
}
EXPORT_SYMBOL(default_wake_function);

static void __setscheduler_prio(struct task_struct *p, int prio)
{
if (dl_prio(prio))
p->sched_class = &dl_sched_class;
else if (rt_prio(prio))
p->sched_class = &rt_sched_class;
else
p->sched_class = &fair_sched_class;

p->prio = prio;
}

#ifdef CONFIG_RT_MUTEXES

static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
Expand Down Expand Up @@ -6456,22 +6466,19 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
} else {
p->dl.pi_se = &p->dl;
}
p->sched_class = &dl_sched_class;
} else if (rt_prio(prio)) {
if (dl_prio(oldprio))
p->dl.pi_se = &p->dl;
if (oldprio < prio)
queue_flag |= ENQUEUE_HEAD;
p->sched_class = &rt_sched_class;
} else {
if (dl_prio(oldprio))
p->dl.pi_se = &p->dl;
if (rt_prio(oldprio))
p->rt.timeout = 0;
p->sched_class = &fair_sched_class;
}

p->prio = prio;
__setscheduler_prio(p, prio);

if (queued)
enqueue_task(rq, p, queue_flag);
Expand Down Expand Up @@ -6824,35 +6831,6 @@ static void __setscheduler_params(struct task_struct *p,
set_load_weight(p, true);
}

/* Actually do priority change: must hold pi & rq lock. */
static void __setscheduler(struct rq *rq, struct task_struct *p,
const struct sched_attr *attr, bool keep_boost)
{
/*
* If params can't change scheduling class changes aren't allowed
* either.
*/
if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)
return;

__setscheduler_params(p, attr);

/*
* Keep a potential priority boosting if called from
* sched_setscheduler().
*/
p->prio = normal_prio(p);
if (keep_boost)
p->prio = rt_effective_prio(p, p->prio);

if (dl_prio(p->prio))
p->sched_class = &dl_sched_class;
else if (rt_prio(p->prio))
p->sched_class = &rt_sched_class;
else
p->sched_class = &fair_sched_class;
}

/*
* Check the target process has a UID that matches the current process's:
*/
Expand All @@ -6873,10 +6851,8 @@ static int __sched_setscheduler(struct task_struct *p,
const struct sched_attr *attr,
bool user, bool pi)
{
int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
MAX_RT_PRIO - 1 - attr->sched_priority;
int retval, oldprio, oldpolicy = -1, queued, running;
int new_effective_prio, policy = attr->sched_policy;
int oldpolicy = -1, policy = attr->sched_policy;
int retval, oldprio, newprio, queued, running;
const struct sched_class *prev_class;
struct callback_head *head;
struct rq_flags rf;
Expand Down Expand Up @@ -7074,6 +7050,7 @@ static int __sched_setscheduler(struct task_struct *p,
p->sched_reset_on_fork = reset_on_fork;
oldprio = p->prio;

newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
if (pi) {
/*
* Take priority boosted tasks into account. If the new
Expand All @@ -7082,8 +7059,8 @@ static int __sched_setscheduler(struct task_struct *p,
* the runqueue. This will be done when the task deboost
* itself.
*/
new_effective_prio = rt_effective_prio(p, newprio);
if (new_effective_prio == oldprio)
newprio = rt_effective_prio(p, newprio);
if (newprio == oldprio)
queue_flags &= ~DEQUEUE_MOVE;
}

Expand All @@ -7096,7 +7073,10 @@ static int __sched_setscheduler(struct task_struct *p,

prev_class = p->sched_class;

__setscheduler(rq, p, attr, pi);
if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
__setscheduler_params(p, attr);
__setscheduler_prio(p, newprio);
}
__setscheduler_uclamp(p, attr);

if (queued) {
Expand Down

0 comments on commit f558c2b

Please sign in to comment.