Re: [PATCH v29 7/9] sched: Add blocked_donor link to task for smarter mutex handoffs

From: Peter Zijlstra

Date: Tue May 19 2026 - 10:38:33 EST

On Tue, May 12, 2026 at 02:56:17AM +0000, John Stultz wrote:
> From: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
>
> Add link to the task this task is proxying for, and use it so
> the mutex owner can do an intelligent hand-off of the mutex to
> the task that the owner is running on behalf.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
> Signed-off-by: Juri Lelli <juri.lelli@xxxxxxxxxx>
> Signed-off-by: Valentin Schneider <valentin.schneider@xxxxxxx>
> Signed-off-by: Connor O'Brien <connoro@xxxxxxxxxx>
> [jstultz: This patch was split out from larger proxy patch]
> Signed-off-by: John Stultz <jstultz@xxxxxxxxxx>
> ---
> include/linux/sched.h | 1 +
> init/init_task.c | 1 +
> kernel/fork.c | 1 +
> kernel/locking/mutex.c | 43 +++++++++++++++++++++++++++++++++++++++---
> kernel/sched/core.c | 14 +++++++++++++-
> 5 files changed, 56 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index bbb183233855a..c93883ce82ee4 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1242,6 +1242,7 @@ struct task_struct {
> #endif
>
> struct mutex *blocked_on; /* lock we're blocked on */
> + struct task_struct *blocked_donor; /* task that is boosting this task */
> raw_spinlock_t blocked_lock;

The placement suggests this new field is also serialized by
blocked_lock, but that is not, in fact, the case AFAICT.

It is set in schedule(), while holding:

rq->lock
mutex->wait_lock
p->blocked_lock

But p != owner, so we don't hold owner->blocked_lock.

> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 8a223555be2e9..2226f594376d6 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -6821,7 +6821,17 @@ static void proxy_migrate_task(struct rq *rq, struct rq_flags *rf,
> * Find runnable lock owner to proxy for mutex blocked donor
> *
> * Follow the blocked-on relation:
> - * task->blocked_on -> mutex->owner -> task...
> + *
> + * ,-> task
> + * | | blocked-on
> + * | v
> + * blocked_donor | mutex
> + * | | owner
> + * | v
> + * `-- task
> + *
> + * and set the blocked_donor relation, this latter is used by the mutex
> + * code to find which (blocked) task to hand-off to.
> *
> * Lock order:
> *
> @@ -6963,6 +6973,7 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
> * rq, therefore holding @rq->lock is sufficient to
> * guarantee its existence, as per ttwu_remote().
> */
> + owner->blocked_donor = p;
> }
> WARN_ON_ONCE(owner && !owner->on_rq);
> return owner;
> @@ -7119,6 +7130,7 @@ static void __sched notrace __schedule(int sched_mode)
> clear_task_blocked_on(prev, NULL);
>
> rq_set_donor(rq, next);
> + next->blocked_donor = NULL;
> if (unlikely(next->is_blocked && next->blocked_on)) {
> next = find_proxy_task(rq, next, &rf);
> if (!next) {

Notably, blocked_donor is a back link that is specific to the current
schedule() call / donor pick cycle.

This means it is stable when either: holding rq->lock or disabling
preemption -- because when preemption is disabled.

This then brings us to the consumer side of things:

> diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
> index 09534628dc01a..0064b724ccda3 100644
> --- a/kernel/locking/mutex.c
> +++ b/kernel/locking/mutex.c
> @@ -980,7 +980,7 @@ EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible);
> static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip)
> __releases(lock)
> {
> - struct task_struct *next = NULL;
> + struct task_struct *donor, *next = NULL;
> struct mutex_waiter *waiter;
> DEFINE_WAKE_Q(wake_q);
> unsigned long owner;
> @@ -1001,6 +1001,12 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
> MUTEX_WARN_ON(__owner_task(owner) != current);
> MUTEX_WARN_ON(owner & MUTEX_FLAG_PICKUP);
>
> + if (sched_proxy_exec() && current->blocked_donor) {
> + /* force handoff if we have a blocked_donor */
> + owner = MUTEX_FLAG_HANDOFF;
> + break;
> + }
> +
> if (owner & MUTEX_FLAG_HANDOFF)
> break;
>

AFAICT this is racy since we don't have preemption disabled.

So we can observe ->blocked_donor (A) set, or (B) unset.
If (A) we can schedule() right after this (and before taking
->wait_lock) and it can be unset when we resume running this task. Or
(B), the exact opposite.

Now, (A) is harmless, because if ->blocked_donor becomes NULL, the
hand off code falls back to picking the first on the wait list and
things just get on.

*However*, (B) might be a problem, because then we will not have the
HANDOFF bit set even though there is in fact a donor we need to hand off
to.

> @@ -1013,19 +1019,50 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
> }
>
> raw_spin_lock_irqsave(&lock->wait_lock, flags);
> + raw_spin_lock(&current->blocked_lock);
> debug_mutex_unlock(lock);
> +
> + if (sched_proxy_exec()) {
> + /*
> + * If we have a task boosting current, and that task was boosting
> + * current through this lock, hand the lock to that task, as that
> + * is the highest waiter, as selected by the scheduling function.
> + */
> + donor = current->blocked_donor;
> + if (donor) {
> + struct mutex *next_lock;
> +
> + raw_spin_lock_nested(&donor->blocked_lock, SINGLE_DEPTH_NESTING);
> + next_lock = __get_task_blocked_on(donor);
> + if (next_lock == lock) {
> + next = donor;
> + __set_task_blocked_on_waking(donor, next_lock);
> + wake_q_add(&wake_q, donor);
> + current->blocked_donor = NULL;
> + }
> + raw_spin_unlock(&donor->blocked_lock);
> + }
> + }
> +
> + /*
> + * Failing that, pick first on the wait list.
> + */
> waiter = lock->first_waiter;
> - if (waiter) {
> + if (!next && waiter) {
> next = waiter->task;
>
> + raw_spin_lock_nested(&next->blocked_lock, SINGLE_DEPTH_NESTING);
> debug_mutex_wake_waiter(lock, waiter);
> - set_task_blocked_on_waking(next, lock);
> + __set_task_blocked_on_waking(next, lock);
> + raw_spin_unlock(&next->blocked_lock);
> wake_q_add(&wake_q, next);
> +
> }
>
> if (owner & MUTEX_FLAG_HANDOFF)
> __mutex_handoff(lock, next);
>
> + raw_spin_unlock(&current->blocked_lock);
> raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
> }

That is, we need this on top, no?

---
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1242,8 +1242,14 @@ struct task_struct {
#endif

struct mutex *blocked_on; /* lock we're blocked on */
- struct task_struct *blocked_donor; /* task that is boosting this task */
raw_spinlock_t blocked_lock;
+
+ /*
+ * The task that is boosting this task; a back link for the current
+ * donor stack. Set in schedule() -> find_proxy_task() and only stable
+ * under preempt_disable().
+ */
+ struct task_struct *blocked_donor;

#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
/*
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -990,6 +990,14 @@ static noinline void __sched __mutex_unl
__release(lock);

/*
+ * Ensures the proxy donor stack is stable across unlock and handoff.
+ * Specifically, it avoids the case where current->blocked_donor is
+ * NULL when it is inspected while doing the unlock, but a preemption
+ * before taking the wake_lock would make it set and a hand-off is
+ * missed.
+ */
+ guard(preempt)();
+ /*
* Release the lock before (potentially) taking the spinlock such that
* other contenders can get on with things ASAP.
*
@@ -1063,7 +1071,8 @@ static noinline void __sched __mutex_unl
__mutex_handoff(lock, next);

raw_spin_unlock(&current->blocked_lock);
- raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+ wake_up_q(&wake_q);
}

#ifndef CONFIG_DEBUG_LOCK_ALLOC