Re: [PATCH v2 4/4] sched/rt: Split cpupri_vec->cpumask to per NUMA node to reduce contention
From: Peter Zijlstra
Date: Tue Mar 24 2026 - 08:10:54 EST
On Mon, Mar 23, 2026 at 11:45:01AM -0700, Tim Chen wrote:
> On Fri, 2026-03-20 at 13:40 +0100, Peter Zijlstra wrote:
> > On Mon, Jul 21, 2025 at 02:10:26PM +0800, Pan Deng wrote:
> >
> > > This change splits `cpupri_vec->cpumask` into per-NUMA-node data to
> > > mitigate false sharing.
> >
> > So I really do think we need something here. We're running into the
> > whole cpumask contention thing on a semi regular basis.
> >
> > But somehow I doubt this is it.
> >
> > I would suggest building a radix tree like structure based on ACPIID
> > -- which is inherently suitable for this given that is exactly how
> > CPUID-0b/1f are specified.
> >
>
> Are you thinking about replacing cpumask in cpupri_vec with something like xarray?
> And a question on using ACPIID for the CPU as index instead of CPUID.
> Is it because you want to even out access in the tree?
Sorry, s/ACPI/APIC/, I keep cursing the sadist that put those two
acronyms together.
No, because I want it sorted by topology. Per virtue of CPUID-b/1f the
APIC-ID is in topology order.
Perhaps a little something like this. It will obviously only build on
x86, and then only boot for those that have <=64 CPUs in their DIE
domain.
This needs ARCH_HAS_SBM and a cpumask based fallback implementation of
sbm at the very least.
Now, I was hoping AMD EPYC would have their CCD things as a topology
level, but going by the MADT/CPUID dump I got from Boris, this is not
the case. So we need to manually insert that level and hope the APIC-ID
range is nicely setup for that, or they need to do worse things still.
I think that for things like DMR, DIE DTRT, but I've not yet seen one
up close :/
Also, I really wish all the SNC capable chips would have the SNC domains
enumerated, even if SNC is not in use. This x86 topology enumeration
stuff is such a shit show :-(
Anyway, random hackery below, it basically does a 2 level structure
where the leaf is a whole cacheline (double check the kzalloc_obj()
stuff respects alignment) and we make sure the CPUs for that leaf are
actually from the same cache domain. At least, that's the theory, see
above ranting on the glories of topology enumeration.
It boots in qemu with -smp cpus=16,sockets=2,dies=2 and appears to 'work'.
YMMV
This code is very much a PoC, treat it as such.
---
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 9cd493d467d4..24012a91ac1e 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -54,6 +54,7 @@ static inline void x86_32_probe_apic(void) { }
#endif
extern u32 cpuid_to_apicid[];
+extern u32 apicid_to_cpuid[];
#define CPU_ACPIID_INVALID U32_MAX
diff --git a/arch/x86/include/asm/sbm.h b/arch/x86/include/asm/sbm.h
new file mode 100644
index 000000000000..9a4d283347d1
--- /dev/null
+++ b/arch/x86/include/asm/sbm.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SBM_H
+#define _ASM_X86_SBM_H
+
+#include <asm/apic.h>
+
+/* Map a Linux CPU number to its SBM index (the APIC-ID on x86). */
+static __always_inline u32 arch_sbm_cpu_to_idx(unsigned int cpu)
+{
+	return cpuid_to_apicid[cpu];
+}
+
+/* Map an SBM index (APIC-ID) back to the Linux CPU number. */
+static __always_inline u32 arch_sbm_idx_to_cpu(unsigned int idx)
+{
+	return apicid_to_cpuid[idx];
+}
+
+#endif /* _ASM_X86_SBM_H */
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index eafcb1fc185a..6f3d18288600 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -48,6 +48,12 @@ DECLARE_BITMAP(phys_cpu_present_map, MAX_LOCAL_APIC) __read_mostly;
/* Used for CPU number allocation and parallel CPU bringup */
u32 cpuid_to_apicid[] __ro_after_init = { [0 ... NR_CPUS - 1] = BAD_APICID, };
+u32 apicid_to_cpuid[MAX_LOCAL_APIC] = { 0 };
+
+u32 arch_sbm_leafs __ro_after_init;
+u32 arch_sbm_shift __ro_after_init;
+u32 arch_sbm_mask __ro_after_init;
+u32 arch_sbm_bits __ro_after_init;
/* Bitmaps to mark registered APICs at each topology domain */
static struct { DECLARE_BITMAP(map, MAX_LOCAL_APIC); } apic_maps[TOPO_MAX_DOMAIN] __ro_after_init;
@@ -234,6 +240,7 @@ static __init void topo_register_apic(u32 apic_id, u32 acpi_id, bool present)
cpu = topo_get_cpunr(apic_id);
cpuid_to_apicid[cpu] = apic_id;
+ apicid_to_cpuid[apic_id] = cpu;
topo_set_cpuids(cpu, apic_id, acpi_id);
} else {
topo_info.nr_disabled_cpus++;
@@ -537,7 +544,9 @@ void __init topology_init_possible_cpus(void)
MAX_LOCAL_APIC, apicid);
if (apicid >= MAX_LOCAL_APIC)
break;
- cpuid_to_apicid[topo_info.nr_assigned_cpus++] = apicid;
+ cpu = topo_info.nr_assigned_cpus++;
+ cpuid_to_apicid[cpu] = apicid;
+ apicid_to_cpuid[apicid] = cpu;
}
for (cpu = 0; cpu < allowed; cpu++) {
@@ -551,6 +560,17 @@ void __init topology_init_possible_cpus(void)
cpu_mark_primary_thread(cpu, apicid);
set_cpu_present(cpu, test_bit(apicid, phys_cpu_present_map));
}
+
+ apicid = 0;
+ for_each_possible_cpu(cpu)
+ apicid = max(apicid, cpuid_to_apicid[cpu]);
+
+ arch_sbm_shift = x86_topo_system.dom_shifts[TOPO_DIE_DOMAIN] - 1;
+ arch_sbm_leafs = 1 + (apicid >> arch_sbm_shift);
+ arch_sbm_mask = (1 << arch_sbm_shift) - 1;
+ arch_sbm_bits = arch_sbm_shift;
+
+ pr_info("SBM: shift(%d) leafs(%d) APIC(%x)\n", arch_sbm_shift, arch_sbm_leafs, apicid);
}
/*
diff --git a/include/linux/sbm.h b/include/linux/sbm.h
new file mode 100644
index 000000000000..8beade6c0585
--- /dev/null
+++ b/include/linux/sbm.h
@@ -0,0 +1,97 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SBM_H
+#define _LINUX_SBM_H
+
+#include <linux/slab.h>
+#include <linux/bitmap.h>
+#include <linux/cpumask.h>
+#include <asm/sbm.h>
+
+/* Topology-derived geometry, set up by the architecture at boot. */
+extern unsigned int arch_sbm_leafs;	/* number of leaves */
+extern unsigned int arch_sbm_shift;	/* log2(bits per leaf) */
+extern unsigned int arch_sbm_mask;	/* bit-within-leaf mask */
+extern unsigned int arch_sbm_bits;
+
+/*
+ * NOTE(review): x86 supplies these as static __always_inline in
+ * <asm/sbm.h>; these prototypes look intended for a cpumask-based
+ * fallback implementation -- confirm the intended linkage.
+ */
+extern unsigned int arch_sbm_cpu_to_idx(unsigned int cpu);
+extern unsigned int arch_sbm_idx_to_cpu(unsigned int idx);
+
+enum sbm_type {
+	st_root = 0,
+	st_leaf,
+};
+
+/* Two-level structure: a root pointing at cacheline-sized leaves. */
+struct sbm_root {
+	enum sbm_type type;
+	unsigned int nr;
+	struct sbm_leaf *leafs[] __counted_by(nr);
+};
+
+/*
+ * Each leaf owns a whole cacheline so updates from different cache
+ * domains do not false-share.
+ */
+struct sbm_leaf {
+	enum sbm_type type;
+	unsigned long bitmap;
+} ____cacheline_aligned;
+
+/* Opaque handle; actually an sbm_root or (for nr==1) a bare sbm_leaf. */
+struct sbm {
+	enum sbm_type type;
+};
+
+extern struct sbm *sbm_alloc(void);
+extern unsigned int sbm_find_next_bit(struct sbm *sbm, int start);
+
+/*
+ * Resolve @cpu to its leaf and bit, then apply @func to that bit.
+ * @cpu is an explicit parameter so the expansion cannot silently
+ * capture an unrelated variable at the call site.
+ */
+#define __sbm_op(sbm, cpu, func) \
+({ \
+	struct sbm_leaf *leaf = (void *)(sbm); \
+	int idx = arch_sbm_cpu_to_idx(cpu); \
+	if ((sbm)->type == st_root) { \
+		struct sbm_root *root = (void *)(sbm); \
+		int nr = idx >> arch_sbm_shift; \
+		leaf = root->leafs[nr]; \
+	} \
+	int bit = idx & arch_sbm_mask; \
+	func(bit, &leaf->bitmap); \
+})
+
+static inline void sbm_cpu_set(struct sbm *sbm, int cpu)
+{
+	__sbm_op(sbm, cpu, set_bit);
+}
+
+static inline void sbm_cpu_clear(struct sbm *sbm, int cpu)
+{
+	__sbm_op(sbm, cpu, clear_bit);
+}
+
+static inline void __sbm_cpu_set(struct sbm *sbm, int cpu)
+{
+	__sbm_op(sbm, cpu, __set_bit);
+}
+
+static inline void __sbm_cpu_clear(struct sbm *sbm, int cpu)
+{
+	__sbm_op(sbm, cpu, __clear_bit);
+}
+
+static inline bool sbm_cpu_test(struct sbm *sbm, int cpu)
+{
+	return __sbm_op(sbm, cpu, test_bit);
+}
+
+/* Iterate all set bits; @idx is an SBM index, not a CPU number. */
+#define sbm_for_each_set_bit(sbm, idx) \
+	for (int idx = sbm_find_next_bit(sbm, 0); \
+	     idx >= 0; idx = sbm_find_next_bit(sbm, idx+1))
+
+#endif /* _LINUX_SBM_H */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 226509231e67..a3a423c4706e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -49,6 +49,7 @@
#include <linux/ratelimit.h>
#include <linux/task_work.h>
#include <linux/rbtree_augmented.h>
+#include <linux/sbm.h>
#include <asm/switch_to.h>
@@ -7384,7 +7385,7 @@ static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
#ifdef CONFIG_NO_HZ_COMMON
static struct {
- cpumask_var_t idle_cpus_mask;
+ struct sbm *sbm;
int has_blocked_load; /* Idle CPUS has blocked load */
int needs_update; /* Newly idle CPUs need their next_balance collated */
unsigned long next_balance; /* in jiffy units */
@@ -12615,12 +12616,11 @@ static inline int on_null_domain(struct rq *rq)
static inline int find_new_ilb(void)
{
int this_cpu = smp_processor_id();
- const struct cpumask *hk_mask;
int ilb_cpu;
- hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE);
+ sbm_for_each_set_bit(nohz.sbm, idx) {
+ ilb_cpu = arch_sbm_idx_to_cpu(idx);
- for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) {
if (ilb_cpu == this_cpu)
continue;
@@ -12685,7 +12685,7 @@ static void nohz_balancer_kick(struct rq *rq)
unsigned long now = jiffies;
struct sched_domain_shared *sds;
struct sched_domain *sd;
- int nr_busy, i, cpu = rq->cpu;
+ int nr_busy, cpu = rq->cpu;
unsigned int flags = 0;
if (unlikely(rq->idle_balance))
@@ -12713,13 +12713,6 @@ static void nohz_balancer_kick(struct rq *rq)
if (time_before(now, nohz.next_balance))
goto out;
- /*
- * None are in tickless mode and hence no need for NOHZ idle load
- * balancing
- */
- if (unlikely(cpumask_empty(nohz.idle_cpus_mask)))
- return;
-
if (rq->nr_running >= 2) {
flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto out;
@@ -12739,24 +12732,6 @@ static void nohz_balancer_kick(struct rq *rq)
}
}
- sd = rcu_dereference_all(per_cpu(sd_asym_packing, cpu));
- if (sd) {
- /*
- * When ASYM_PACKING; see if there's a more preferred CPU
- * currently idle; in which case, kick the ILB to move tasks
- * around.
- *
- * When balancing between cores, all the SMT siblings of the
- * preferred CPU must be idle.
- */
- for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
- if (sched_asym(sd, i, cpu)) {
- flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
- goto unlock;
- }
- }
- }
-
sd = rcu_dereference_all(per_cpu(sd_asym_cpucapacity, cpu));
if (sd) {
/*
@@ -12829,7 +12804,8 @@ void nohz_balance_exit_idle(struct rq *rq)
return;
rq->nohz_tick_stopped = 0;
- cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
+ if (cpumask_test_cpu(rq->cpu, housekeeping_cpumask(HK_TYPE_KERNEL_NOISE)))
+ sbm_cpu_clear(nohz.sbm, rq->cpu);
set_cpu_sd_state_busy(rq->cpu);
}
@@ -12886,7 +12862,8 @@ void nohz_balance_enter_idle(int cpu)
rq->nohz_tick_stopped = 1;
- cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
+ if (cpumask_test_cpu(rq->cpu, housekeeping_cpumask(HK_TYPE_KERNEL_NOISE)))
+ sbm_cpu_set(nohz.sbm, rq->cpu);
/*
* Ensures that if nohz_idle_balance() fails to observe our
@@ -12913,7 +12890,7 @@ static bool update_nohz_stats(struct rq *rq)
if (!rq->has_blocked_load)
return false;
- if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
+ if (!sbm_cpu_test(nohz.sbm, cpu))
return false;
if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick)))
@@ -12967,7 +12944,9 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
* Start with the next CPU after this_cpu so we will end with this_cpu and let a
* chance for other idle cpu to pull load.
*/
- for_each_cpu_wrap(balance_cpu, nohz.idle_cpus_mask, this_cpu+1) {
+ sbm_for_each_set_bit(nohz.sbm, idx) {
+ balance_cpu = arch_sbm_idx_to_cpu(idx);
+
if (!idle_cpu(balance_cpu))
continue;
@@ -14250,6 +14229,6 @@ __init void init_sched_fair_class(void)
#ifdef CONFIG_NO_HZ_COMMON
nohz.next_balance = jiffies;
nohz.next_blocked = jiffies;
- zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
+ nohz.sbm = sbm_alloc();
#endif
}
diff --git a/lib/Makefile b/lib/Makefile
index 1b9ee167517f..8d1f6b5327d5 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -40,7 +40,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
is_single_threaded.o plist.o decompress.o kobject_uevent.o \
earlycpio.o seq_buf.o siphash.o dec_and_lock.o \
nmi_backtrace.o win_minmax.o memcat_p.o \
- buildid.o objpool.o iomem_copy.o sys_info.o
+ buildid.o objpool.o iomem_copy.o sys_info.o sbm.o
lib-$(CONFIG_UNION_FIND) += union_find.o
lib-$(CONFIG_PRINTK) += dump_stack.o
diff --git a/lib/sbm.c b/lib/sbm.c
new file mode 100644
index 000000000000..167cf857cd32
--- /dev/null
+++ b/lib/sbm.c
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/sbm.h>
+
+/*
+ * Allocate a sparse bitmap sized by the topology: arch_sbm_leafs
+ * cacheline-sized leaves hanging off a small root.  When a single
+ * leaf suffices the leaf itself is returned, skipping one level
+ * of indirection on every access.
+ */
+struct sbm *sbm_alloc(void)
+{
+	unsigned int nr = arch_sbm_leafs;
+	struct sbm_root *root = kzalloc_flex(*root, leafs, nr);
+	struct sbm_leaf *leaf;
+
+	if (!root)
+		return NULL;
+
+	root->type = st_root;
+	/* leafs[] is __counted_by(nr); set nr before indexing it. */
+	root->nr = nr;
+
+	for (int i = 0; i < nr; i++) {
+		leaf = kzalloc_obj(*leaf);
+		if (!leaf)
+			goto fail;
+		leaf->type = st_leaf;
+		root->leafs[i] = leaf;
+	}
+
+	if (nr == 1) {
+		leaf = root->leafs[0];
+		kfree(root);
+		return (void *)leaf;
+	}
+
+	return (void *)root;
+
+fail:
+	/* kfree(NULL) is a no-op; unfilled slots are harmless. */
+	for (int i = 0; i < nr; i++)
+		kfree(root->leafs[i]);
+	kfree(root);
+	return NULL;
+}
+
+/*
+ * Return the index of the first set bit at or after @start, or
+ * -1 (as unsigned int) when none is set.  Indices are arch SBM
+ * indices (APIC-IDs on x86), not Linux CPU numbers.
+ */
+unsigned int sbm_find_next_bit(struct sbm *sbm, int start)
+{
+	struct sbm_leaf *leaf = (void *)sbm;
+	struct sbm_root *root = (void *)sbm;
+	int nr = start >> arch_sbm_shift;
+	int bit = start & arch_sbm_mask;
+	unsigned long tmp = 0, mask = (~0UL) << bit;
+
+	if (sbm->type == st_root) {
+		for (; nr < arch_sbm_leafs; nr++, mask = ~0UL) {
+			leaf = root->leafs[nr];
+			tmp = leaf->bitmap & mask;
+			if (tmp)
+				break;
+		}
+	} else {
+		tmp = leaf->bitmap & mask;
+	}
+
+	if (!tmp)
+		return -1;
+
+	return (nr << arch_sbm_shift) | __ffs(tmp);
+}