Re: [PATCH v2 09/10] sched: Remove sched_class::pick_next_task()

From: Vincent Guittot

Date: Tue May 19 2026 - 11:28:22 EST


On Mon, 11 May 2026 at 14:07, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
>
> The reason for pick_next_task_fair() is the put/set optimization that
> avoids touching the common ancestors. However, it is possible to
> implement this in the put_prev_task() and set_next_task() calls as
> used in put_prev_set_next_task().
>
> Notably, put_prev_set_next_task() is the only site that:
>
> - calls put_prev_task() with a .next argument;
> - calls set_next_task() with .first = true.
>
> This means that put_prev_task() can determine the common hierarchy and
> stop there, and then set_next_task() can terminate where put_prev_task
> stopped.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>

Hackbench results on my Arm64 dev machine stay similars with patch 8
and 9 (unlike patch 10)

Reviewed-by: Vincent Guittot <vincent.guittot@xxxxxxxxxx>


> ---
> kernel/sched/core.c | 27 +++------
> kernel/sched/fair.c | 139 +++++++++++++++++----------------------------------
> kernel/sched/sched.h | 14 -----
> 3 files changed, 57 insertions(+), 123 deletions(-)
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -5980,16 +5980,15 @@ __pick_next_task(struct rq *rq, struct t
> if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) &&
> rq->nr_running == rq->cfs.h_nr_queued)) {
>
> - p = pick_next_task_fair(rq, prev, rf);
> + p = pick_task_fair(rq, rf);
> if (unlikely(p == RETRY_TASK))
> goto restart;
>
> /* Assume the next prioritized class is idle_sched_class */
> - if (!p) {
> + if (!p)
> p = pick_task_idle(rq, rf);
> - put_prev_set_next_task(rq, prev, p);
> - }
>
> + put_prev_set_next_task(rq, prev, p);
> return p;
> }
>
> @@ -5997,20 +5996,12 @@ __pick_next_task(struct rq *rq, struct t
> prev_balance(rq, prev, rf);
>
> for_each_active_class(class) {
> - if (class->pick_next_task) {
> - p = class->pick_next_task(rq, prev, rf);
> - if (unlikely(p == RETRY_TASK))
> - goto restart;
> - if (p)
> - return p;
> - } else {
> - p = class->pick_task(rq, rf);
> - if (unlikely(p == RETRY_TASK))
> - goto restart;
> - if (p) {
> - put_prev_set_next_task(rq, prev, p);
> - return p;
> - }
> + p = class->pick_task(rq, rf);
> + if (unlikely(p == RETRY_TASK))
> + goto restart;
> + if (p) {
> + put_prev_set_next_task(rq, prev, p);
> + return p;
> }
> }
>
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -9214,7 +9214,7 @@ static void wakeup_preempt_fair(struct r
> resched_curr_lazy(rq);
> }
>
> -static struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf)
> +struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf)
> __must_hold(__rq_lockp(rq))
> {
> struct sched_entity *se;
> @@ -9257,72 +9257,6 @@ static struct task_struct *pick_task_fai
> return NULL;
> }
>
> -static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
> -static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
> -
> -struct task_struct *
> -pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> - __must_hold(__rq_lockp(rq))
> -{
> - struct sched_entity *se;
> - struct task_struct *p;
> -
> - p = pick_task_fair(rq, rf);
> - if (unlikely(p == RETRY_TASK))
> - return p;
> - if (!p)
> - return p;
> - se = &p->se;
> -
> -#ifdef CONFIG_FAIR_GROUP_SCHED
> - if (prev->sched_class != &fair_sched_class)
> - goto simple;
> -
> - __put_prev_set_next_dl_server(rq, prev, p);
> -
> - /*
> - * Because of the set_next_buddy() in dequeue_task_fair() it is rather
> - * likely that a next task is from the same cgroup as the current.
> - *
> - * Therefore attempt to avoid putting and setting the entire cgroup
> - * hierarchy, only change the part that actually changes.
> - *
> - * Since we haven't yet done put_prev_entity and if the selected task
> - * is a different task than we started out with, try and touch the
> - * least amount of cfs_rqs.
> - */
> - if (prev != p) {
> - struct sched_entity *pse = &prev->se;
> - struct cfs_rq *cfs_rq;
> -
> - while (!(cfs_rq = is_same_group(se, pse))) {
> - int se_depth = se->depth;
> - int pse_depth = pse->depth;
> -
> - if (se_depth <= pse_depth) {
> - put_prev_entity(cfs_rq_of(pse), pse);
> - pse = parent_entity(pse);
> - }
> - if (se_depth >= pse_depth) {
> - set_next_entity(cfs_rq_of(se), se, true);
> - se = parent_entity(se);
> - }
> - }
> -
> - put_prev_entity(cfs_rq, pse);
> - set_next_entity(cfs_rq, se, true);
> -
> - __set_next_task_fair(rq, p, true);
> - }
> -
> - return p;
> -
> -simple:
> -#endif /* CONFIG_FAIR_GROUP_SCHED */
> - put_prev_set_next_task(rq, prev, p);
> - return p;
> -}
> -
> static struct task_struct *
> fair_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf)
> __must_hold(__rq_lockp(dl_se->rq))
> @@ -9346,10 +9280,33 @@ static void put_prev_task_fair(struct rq
> {
> struct sched_entity *se = &prev->se;
> struct cfs_rq *cfs_rq;
> + struct sched_entity *nse = NULL;
>
> - for_each_sched_entity(se) {
> +#ifdef CONFIG_FAIR_GROUP_SCHED
> + if (next && next->sched_class == &fair_sched_class)
> + nse = &next->se;
> +#endif
> +
> + while (se) {
> cfs_rq = cfs_rq_of(se);
> - put_prev_entity(cfs_rq, se);
> + if (!nse || cfs_rq->curr)
> + put_prev_entity(cfs_rq, se);
> +#ifdef CONFIG_FAIR_GROUP_SCHED
> + if (nse) {
> + if (is_same_group(se, nse))
> + break;
> +
> + int d = nse->depth - se->depth;
> + if (d >= 0) {
> + /* nse has equal or greater depth, ascend */
> + nse = parent_entity(nse);
> + /* if nse is the deeper, do not ascend se */
> + if (d > 0)
> + continue;
> + }
> + }
> +#endif
> + se = parent_entity(se);
> }
> }
>
> @@ -13896,10 +13853,30 @@ static void switched_to_fair(struct rq *
> }
> }
>
> -static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
> +/*
> + * Account for a task changing its policy or group.
> + *
> + * This routine is mostly called to set cfs_rq->curr field when a task
> + * migrates between groups/classes.
> + */
> +static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
> {
> struct sched_entity *se = &p->se;
>
> + for_each_sched_entity(se) {
> + struct cfs_rq *cfs_rq = cfs_rq_of(se);
> +
> + if (IS_ENABLED(CONFIG_FAIR_GROUP_SCHED) &&
> + first && cfs_rq->curr)
> + break;
> +
> + set_next_entity(cfs_rq, se, first);
> + /* ensure bandwidth has been allocated on our new cfs_rq */
> + account_cfs_rq_runtime(cfs_rq, 0);
> + }
> +
> + se = &p->se;
> +
> if (task_on_rq_queued(p)) {
> /*
> * Move the next running task to the front of the list, so our
> @@ -13919,27 +13896,6 @@ static void __set_next_task_fair(struct
> sched_fair_update_stop_tick(rq, p);
> }
>
> -/*
> - * Account for a task changing its policy or group.
> - *
> - * This routine is mostly called to set cfs_rq->curr field when a task
> - * migrates between groups/classes.
> - */
> -static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
> -{
> - struct sched_entity *se = &p->se;
> -
> - for_each_sched_entity(se) {
> - struct cfs_rq *cfs_rq = cfs_rq_of(se);
> -
> - set_next_entity(cfs_rq, se, first);
> - /* ensure bandwidth has been allocated on our new cfs_rq */
> - account_cfs_rq_runtime(cfs_rq, 0);
> - }
> -
> - __set_next_task_fair(rq, p, first);
> -}
> -
> void init_cfs_rq(struct cfs_rq *cfs_rq)
> {
> cfs_rq->tasks_timeline = RB_ROOT_CACHED;
> @@ -14251,7 +14207,6 @@ DEFINE_SCHED_CLASS(fair) = {
> .wakeup_preempt = wakeup_preempt_fair,
>
> .pick_task = pick_task_fair,
> - .pick_next_task = pick_next_task_fair,
> .put_prev_task = put_prev_task_fair,
> .set_next_task = set_next_task_fair,
>
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2555,17 +2555,6 @@ struct sched_class {
> * schedule/pick_next_task: rq->lock
> */
> struct task_struct *(*pick_task)(struct rq *rq, struct rq_flags *rf);
> - /*
> - * Optional! When implemented pick_next_task() should be equivalent to:
> - *
> - * next = pick_task();
> - * if (next) {
> - * put_prev_task(prev);
> - * set_next_task_first(next);
> - * }
> - */
> - struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev,
> - struct rq_flags *rf);
>
> /*
> * sched_change:
> @@ -2789,8 +2778,7 @@ static inline bool sched_fair_runnable(s
> return rq->cfs.nr_queued > 0;
> }
>
> -extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev,
> - struct rq_flags *rf);
> +extern struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf);
> extern struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf);
>
> #define SCA_CHECK 0x01
>
>