Re: [Patch v4 02/22] sched/cache: Limit the scan number of CPUs when calculating task occupancy

From: Luo Gengkun

Date: Thu Apr 09 2026 - 09:31:44 EST

On 2026/4/2 5:52, Tim Chen wrote:

From: Chen Yu <yu.c.chen@xxxxxxxxx>

When NUMA balancing is enabled, the kernel currently iterates over all
online CPUs to aggregate process-wide occupancy data. On large systems,
this global scan introduces significant overhead.

To reduce scan latency, limit the search to a subset of relevant CPUs:
1. The task's preferred NUMA node.
2. The node where the task is currently running.
3. The node that contains the task's current preferred LLC.

While focusing solely on the preferred NUMA node is ideal, a
process-wide scan must remain flexible because the "preferred node"
is a per-task attribute. Different threads within the same process may
have different preferred nodes, causing the process-wide preference to
migrate. Maintaining a mask that covers both the preferred and active
running nodes ensures accuracy while significantly reducing the number of
CPUs inspected.

To address the issue of scanning overhead, there is a more targeted approach: only scanning the CPUs actually accessed by the process, and evicting these CPUs when they remain unaccessed for a specific period of time.

This significantly reduces unnecessary scanning in most scenarios. I have attached the patch below for review. Please feel free to integrate or modify these changes.

Thanks!
Luo Gengkun

From de33b29f31a882ac8c0038cac4ec659c7fdfe60d Mon Sep 17 00:00:00 2001
From: Luo Gengkun <luogengkun2@xxxxxxxxxx>
Date: Thu, 9 Apr 2026 07:22:16 +0000
Subject: [PATCH] sched/cache: Reduce the overhead of task_cache_work by only
scanning the visited cpus.

The overhead of task_cache_work is high, especially on multi-NUMA systems.
Currently, task_cache_work tries to find the pref_llc by scanning all cpus in
the system. However, most of these scans are meaningless, such as those for
cpus that have never been visited or were accessed a long time ago.

To address this problem, this patch introduces visited_cpus to track the
visited cpus and uses llc_epoch_visited_timeout to evict cpus that have
timed out.

Signed-off-by: Luo Gengkun <luogengkun2@xxxxxxxxxx>
---
include/linux/sched.h | 1 +
kernel/sched/debug.c | 2 ++
kernel/sched/fair.c | 19 ++++++++++++++++---
kernel/sched/sched.h | 1 +
4 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index dfa4bfd099c6..f2327a13fda8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2390,6 +2390,7 @@ struct sched_cache_time {

struct sched_cache_stat {
struct sched_cache_time __percpu *pcpu_sched;
+ struct cpumask visited_cpus;
raw_spinlock_t lock;
unsigned long epoch;
u64 nr_running_avg;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4469e1c152c8..da66acabff4f 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -675,6 +675,8 @@ static __init int sched_init_debug(void)
&llc_epoch_period);
debugfs_create_u32("epoch_affinity_timeout", 0644, llc,
&llc_epoch_affinity_timeout);
+ debugfs_create_u32("epoch_visited_timeout", 0644, llc,
+ &llc_epoch_visited_timeout);
debugfs_create_u32("overaggr_pct", 0644, llc,
&llc_overaggr_pct);
debugfs_create_u32("imb_pct", 0644, llc,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e4e22696a0b1..4ccabc412aa1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1285,6 +1285,7 @@ static void set_next_buddy(struct sched_entity *se);
__read_mostly unsigned int llc_aggr_tolerance = 1;
__read_mostly unsigned int llc_epoch_period = EPOCH_PERIOD;
__read_mostly unsigned int llc_epoch_affinity_timeout = EPOCH_LLC_AFFINITY_TIMEOUT;
+__read_mostly unsigned int llc_epoch_visited_timeout = EPOCH_LLC_AFFINITY_TIMEOUT;
__read_mostly unsigned int llc_imb_pct = 20;
__read_mostly unsigned int llc_overaggr_pct = 50;

@@ -1466,6 +1467,7 @@ void mm_init_sched(struct mm_struct *mm,
raw_spin_lock_init(&mm->sc_stat.lock);
mm->sc_stat.epoch = epoch;
mm->sc_stat.cpu = -1;
+ cpumask_clear(&mm->sc_stat.visited_cpus);

/*
* The update to mm->sc_stat should not be reordered
@@ -1582,6 +1584,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
pcpu_sched->runtime += delta_exec;
rq->cpu_runtime += delta_exec;
epoch = rq->cpu_epoch;
+ cpumask_set_cpu(cpu_of(rq), &mm->sc_stat.visited_cpus);
}

/*
@@ -1724,7 +1727,10 @@ static void task_cache_work(struct callback_head *work)
return;

scoped_guard (cpus_read_lock) {
- get_scan_cpumasks(cpus, p);
+ if (unlikely(llc_epoch_visited_timeout == 0))
+ get_scan_cpumasks(cpus, p);
+ else
+ cpumask_and(cpus, cpu_online_mask, &mm->sc_stat.visited_cpus);

for_each_cpu(cpu, cpus) {
/* XXX sched_cluster_active */
@@ -1736,8 +1742,15 @@ static void task_cache_work(struct callback_head *work)
continue;

for_each_cpu(i, sched_domain_span(sd)) {
- occ = fraction_mm_sched(cpu_rq(i),
- per_cpu_ptr(mm->sc_stat.pcpu_sched, i));
+ struct rq *rq = cpu_rq(i);
+ struct sched_cache_time *pcpu_sched = per_cpu_ptr(mm->sc_stat.pcpu_sched, i);
+ /* Skip the rq that has not been hit for a long time */
+ if (llc_epoch_visited_timeout && (rq->cpu_epoch - pcpu_sched->epoch) >
+ llc_epoch_visited_timeout) {
+ cpumask_clear_cpu(cpu_of(rq), &mm->sc_stat.visited_cpus);
+ continue;
+ }
+ occ = fraction_mm_sched(rq, pcpu_sched);
a_occ += occ;
if (occ > m_occ) {
m_occ = occ;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b757812725f7..93908bb17cc5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -4041,6 +4041,7 @@ extern int sysctl_sched_cache_user;
extern unsigned int llc_aggr_tolerance;
extern unsigned int llc_epoch_period;
extern unsigned int llc_epoch_affinity_timeout;
+extern unsigned int llc_epoch_visited_timeout;
extern unsigned int llc_imb_pct;
extern unsigned int llc_overaggr_pct;

--
2.34.1

Future work may integrate numa_group to further refine task aggregation.

Suggested-by: Madadi Vineeth Reddy <vineethr@xxxxxxxxxxxxx>
Signed-off-by: Chen Yu <yu.c.chen@xxxxxxxxx>
Co-developed-by: Tim Chen <tim.c.chen@xxxxxxxxxxxxxxx>
Signed-off-by: Tim Chen <tim.c.chen@xxxxxxxxxxxxxxx>
---

Notes:
v3->v4:
New patch.

kernel/sched/fair.c | 46 ++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 45 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index eb3cfb852a93..20a33900f4ea 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1431,6 +1431,50 @@ static void task_tick_cache(struct rq *rq, struct task_struct *p)
}
}
+static void get_scan_cpumasks(cpumask_var_t cpus, struct task_struct *p)
+{
+#ifdef CONFIG_NUMA_BALANCING
+ int cpu, curr_cpu, pref_nid;
+
+ if (!static_branch_likely(&sched_numa_balancing))
+ goto out;
+
+ cpu = p->mm->sc_stat.cpu;
+ curr_cpu = task_cpu(p);
+
+ /*
+ * Scanning in the preferred NUMA node is ideal. However, the NUMA
+ * preferred node is per-task rather than per-process. It is possible
+ * for different threads of the process to have distinct preferred
+ * nodes; consequently, the process-wide preferred LLC may bounce
+ * between different nodes. As a workaround, maintain the scan
+ * CPU mask to also cover the process's current preferred LLC and the
+ * current running node to mitigate the bouncing risk.
+ * TBD: numa_group should be considered during task aggregation.
+ */
+ pref_nid = p->numa_preferred_nid;
+ /* no preferred node set: fall back to copying cpu_online_mask below */
+ if (pref_nid == NUMA_NO_NODE)
+ goto out;
+
+ cpumask_or(cpus, cpus, cpumask_of_node(pref_nid));
+
+ /* also cover the node of mm->sc_stat.cpu when it is set (not -1) */
+ if (cpu != -1 && !cpumask_test_cpu(cpu, cpus))
+ cpumask_or(cpus, cpus,
+ cpumask_of_node(cpu_to_node(cpu)));
+
+ /* make sure the task's current running node is included */
+ if (!cpumask_test_cpu(curr_cpu, cpus))
+ cpumask_or(cpus, cpus, cpumask_of_node(cpu_to_node(curr_cpu)));
+
+ return;
+
+out:
+#endif
+ cpumask_copy(cpus, cpu_online_mask);
+}
+
static void task_cache_work(struct callback_head *work)
{
struct task_struct *p = current;
@@ -1451,7 +1495,7 @@ static void task_cache_work(struct callback_head *work)
return;
scoped_guard (cpus_read_lock) {
- cpumask_copy(cpus, cpu_online_mask);
+ get_scan_cpumasks(cpus, p);
for_each_cpu(cpu, cpus) {
/* XXX sched_cluster_active */