[RFC PATCH v5 8/9] sched/fair: Steal work from an overloaded CPU when CPU goes idle

From: Chen Jinghuang

Date: Fri Mar 20 2026 - 02:21:58 EST


From: Steve Sistare <steven.sistare@xxxxxxxxxx>

When a CPU has no more CFS tasks to run, and idle_balance() fails to find a
task, then attempt to steal a task from an overloaded CPU in the same LLC,
using the cfs_overload_cpus bitmap to efficiently identify candidates. To
minimize search time, steal the first migratable task that is found when
the bitmap is traversed. For fairness, search for migratable tasks on an
overloaded CPU in order of next to run.

This simple stealing yields a higher CPU utilization than idle_balance()
alone, because the search is cheap, so it may be called every time the CPU
is about to go idle. idle_balance() does more work because it searches
widely for the busiest queue, so to limit its CPU consumption, it declines
to search if the system is too busy. Simple stealing does not offload the
globally busiest queue, but it is much better than running nothing at all.

Stealing is controlled by the sched feature STEAL, which is enabled
by default. Note that all test results presented below are based on the
NO_DELAY_DEQUEUE implementation.

Stealing improves utilization with only a modest CPU overhead in scheduler
code. In the following experiment, hackbench is run with varying numbers
of groups (40 tasks per group), and the delta in /proc/schedstat is shown
for each run, averaged per CPU, augmented with these non-standard stats:

steal - number of times a task is stolen from another CPU.

X6-2: 2 socket * 40 cores * 2 hyperthreads = 160 CPUs
Intel(R) Xeon(R) Platinum 8380 CPU @ 2.30GHz
hackbench <grps> process 100000

baseline
grps time %busy sched idle wake steal
1 2.182 20.00 35876 17905 17958 0
2 2.391 39.00 67753 33808 33921 0
3 2.871 47.00 100944 48966 51538 0
4 2.928 62.00 114489 55171 59059 0
8 4.852 83.00 219907 92961 121703 0

new
grps time %busy sched idle wake steal %speedup
1 2.229 18.00 45450 22691 22751 52 -2.1
2 2.123 40.00 49975 24977 24990 6 12.6
3 2.690 61.00 56118 22641 32780 9073 6.7
4 2.828 80.00 37927 12828 24165 8442 3.5
8 4.120 95.00 85929 8613 57858 11098 17.8

Elapsed time improves by up to 17.8%, and CPU busy utilization is up
by 1 to 18%, hitting 95% at peak load.

Signed-off-by: Steve Sistare <steven.sistare@xxxxxxxxxx>
Signed-off-by: Chen Jinghuang <chenjinghuang2@xxxxxxxxxx>
---
kernel/sched/fair.c | 174 ++++++++++++++++++++++++++++++++++++++--
kernel/sched/features.h | 6 ++
2 files changed, 174 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0bf6d18dac05..500215a57392 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5092,6 +5092,9 @@ static void overload_clear(struct rq *rq)
{
struct sparsemask *overload_cpus;

+ if (!sched_feat(STEAL))
+ return;
+
rcu_read_lock();
overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
if (overload_cpus)
@@ -5103,17 +5106,29 @@ static void overload_set(struct rq *rq)
{
struct sparsemask *overload_cpus;

+ if (!sched_feat(STEAL))
+ return;
+
rcu_read_lock();
overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
if (overload_cpus)
sparsemask_set_elem(overload_cpus, rq->cpu);
rcu_read_unlock();
}
+
+static int try_steal(struct rq *this_rq, struct rq_flags *rf);
+
#else /* CONFIG_SMP */
static inline void rq_idle_stamp_update(struct rq *rq) {}
static inline void rq_idle_stamp_clear(struct rq *rq) {}
static inline void overload_clear(struct rq *rq) {}
static inline void overload_set(struct rq *rq) {}
+
+static inline int try_steal(struct rq *this_rq, struct rq_flags *rf)
+{
+ return 0;
+}
+
#endif

void __setparam_fair(struct task_struct *p, const struct sched_attr *attr)
@@ -9024,21 +9039,24 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
idle:
if (rf) {
/*
- * We must set idle_stamp _before_ calling idle_balance(), such that we
- * measure the duration of idle_balance() as idle time.
+ * We must set idle_stamp _before_ calling try_steal() or
+ * sched_balance_newidle(), such that we measure the duration
+ * as idle time.
*/
rq_idle_stamp_update(rq);

new_tasks = sched_balance_newidle(rq, rf);
+ if (new_tasks == 0)
+ new_tasks = try_steal(rq, rf);

if (new_tasks)
rq_idle_stamp_clear(rq);

/*
- * Because sched_balance_newidle() releases (and re-acquires)
- * rq->lock, it is possible for any higher priority task to
- * appear. In that case we must re-start the pick_next_entity()
- * loop.
+ * Because try_steal() and sched_balance_newidle() release
+ * (and re-acquire) rq->lock, it is possible for any higher priority
+ * task to appear. In that case we must re-start the
+ * pick_next_entity() loop.
*/
if (new_tasks < 0)
return RETRY_TASK;
@@ -13133,6 +13151,150 @@ void sched_balance_trigger(struct rq *rq)
nohz_balancer_kick(rq);
}

+/*
+ * Search the runnable tasks in @cfs_rq in order of next to run, and find
+ * the first one that can be migrated to @dst_rq. @cfs_rq is locked on entry.
+ * On success, dequeue the task from @cfs_rq and return it, else return NULL.
+ */
+static struct task_struct *
+detach_next_task(struct cfs_rq *cfs_rq, struct rq *dst_rq)
+{
+ int dst_cpu = dst_rq->cpu;
+ struct task_struct *p;
+ struct rq *rq = rq_of(cfs_rq);
+
+ lockdep_assert_rq_held(rq);
+
+ list_for_each_entry_reverse(p, &rq->cfs_tasks, se.group_node) {
+ if (can_migrate_task_llc(p, rq, dst_rq)) {
+ detach_task_steal(p, rq, dst_cpu);
+ return p;
+ }
+ }
+ return NULL;
+}
+
+/*
+ * Attempt to migrate a CFS task from @src_cpu to @dst_rq. @locked indicates
+ * whether @dst_rq is already locked on entry. This function may lock or
+ * unlock @dst_rq, and updates @locked to indicate the locked state on return.
+ * The locking protocol is based on idle_balance().
+ * Returns 1 on success and 0 on failure.
+ */
+static int steal_from(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked,
+ int src_cpu)
+{
+ struct task_struct *p;
+ struct rq_flags rf;
+ int stolen = 0;
+ int dst_cpu = dst_rq->cpu;
+ struct rq *src_rq = cpu_rq(src_cpu);
+
+ if (dst_cpu == src_cpu || src_rq->cfs.h_nr_runnable < 2)
+ return 0;
+
+ if (*locked) {
+ rq_unpin_lock(dst_rq, dst_rf);
+ raw_spin_rq_unlock(dst_rq);
+ *locked = false;
+ }
+ rq_lock_irqsave(src_rq, &rf);
+ update_rq_clock(src_rq);
+
+ if (src_rq->cfs.h_nr_runnable < 2 || !cpu_active(src_cpu))
+ p = NULL;
+ else
+ p = detach_next_task(&src_rq->cfs, dst_rq);
+
+ rq_unlock(src_rq, &rf);
+
+ if (p) {
+ raw_spin_rq_lock(dst_rq);
+ rq_repin_lock(dst_rq, dst_rf);
+ *locked = true;
+ update_rq_clock(dst_rq);
+ attach_task(dst_rq, p);
+ stolen = 1;
+ }
+ local_irq_restore(rf.flags);
+
+ return stolen;
+}
+
+/*
+ * Conservative upper bound on the max cost of a steal, in nsecs (the typical
+ * cost is 1-2 microsec). Do not steal if average idle time is less.
+ */
+#define SCHED_STEAL_COST 10000
+
+/*
+ * Try to steal a runnable CFS task from a CPU in the same LLC as @dst_rq,
+ * and migrate it to @dst_rq. rq_lock is held on entry and return, but
+ * may be dropped in between. Return 1 on success, 0 on failure, and -1
+ * if a task in a different scheduling class has become runnable on @dst_rq.
+ */
+static int try_steal(struct rq *dst_rq, struct rq_flags *dst_rf)
+{
+ int src_cpu;
+ int dst_cpu = dst_rq->cpu;
+ bool locked = true;
+ int stolen = 0;
+ struct sparsemask *overload_cpus;
+
+ if (!sched_feat(STEAL))
+ return 0;
+
+ if (!cpu_active(dst_cpu))
+ return 0;
+
+ if (dst_rq->avg_idle < SCHED_STEAL_COST)
+ return 0;
+
+ /* Get bitmap of overloaded CPUs in the same LLC as @dst_rq */
+
+ rcu_read_lock();
+ overload_cpus = rcu_dereference(dst_rq->cfs_overload_cpus);
+ if (!overload_cpus) {
+ rcu_read_unlock();
+ return 0;
+ }
+
+#ifdef CONFIG_SCHED_SMT
+ /*
+ * First try overloaded CPUs on the same core to preserve cache warmth.
+ */
+ if (static_branch_likely(&sched_smt_present)) {
+ for_each_cpu(src_cpu, cpu_smt_mask(dst_cpu)) {
+ if (sparsemask_test_elem(overload_cpus, src_cpu) &&
+ steal_from(dst_rq, dst_rf, &locked, src_cpu)) {
+ stolen = 1;
+ goto out;
+ }
+ }
+ }
+#endif /* CONFIG_SCHED_SMT */
+
+ /* Accept any suitable task in the LLC */
+
+ sparsemask_for_each(overload_cpus, dst_cpu, src_cpu) {
+ if (steal_from(dst_rq, dst_rf, &locked, src_cpu)) {
+ stolen = 1;
+ goto out;
+ }
+ }
+
+out:
+ rcu_read_unlock();
+ if (!locked) {
+ raw_spin_rq_lock(dst_rq);
+ rq_repin_lock(dst_rq, dst_rf);
+ }
+ stolen |= (dst_rq->cfs.h_nr_runnable > 0);
+ if (dst_rq->nr_running != dst_rq->cfs.h_nr_runnable)
+ stolen = -1;
+ return stolen;
+}
+
static void rq_online_fair(struct rq *rq)
{
update_sysctl();
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 136a6584be79..e8c3e19bf585 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -87,6 +87,12 @@ SCHED_FEAT(TTWU_QUEUE, true)
*/
SCHED_FEAT(SIS_UTIL, true)

+/*
+ * Steal a CFS task from another CPU when going idle.
+ * Improves CPU utilization.
+ */
+SCHED_FEAT(STEAL, true)
+
/*
* Issue a WARN when we do multiple update_rq_clock() calls
* in a single rq->lock section. Default disabled because the
--
2.34.1