RE: [PATCH v2 1/4] sched/rt: Optimize cpupri_vec layout to mitigate cache line contention

From: Deng, Pan

Date: Fri Mar 27 2026 - 06:18:08 EST


>
> On Tue, Mar 24, 2026 at 09:36:14AM +0000, Deng, Pan wrote:
>
> > Regarding this patch, yes, using cacheline aligned could increase potential
> > memory usage.
> > After internal discussion, we are thinking of an alternative method to
> > mitigate the waste of memory usage, that is, using kmalloc() to allocate
> > count in a different memory space rather than placing the count and
> > cpumask together in this structure. The rationale is that, writing to
> > address pointed by the counter and reading the address from cpumask
> > is isolated in different memory space which could reduce the ratio of
> > cache false sharing, besides, kmalloc() based on slub/slab could place
> > the objects in different cache lines to reduce the cache contention.
> > The drawback of dynamic allocation counter is that, we have to maintain
> > the life cycle of the counters.
> > Could you please advise if sticking with current cache_align attribute
> > method or using kmalloc() is preferred?
>
> Well, you'd have to allocate a full cacheline anyway. If you allocate N
> 4 byte (counter) objects, there's a fair chance they end up in the same
> cacheline (its a SLAB after all) and then you're back to having a ton of
> false sharing.
>
> Anyway, for you specific workload, why isn't partitioning a viable
> solution? It would not need any kernel modifications and would get rid
> of the contention entirely.

Thank you very much for pointing this out.

We understand cpuset partitioning would eliminate the contention.
However, in managed container platforms (e.g., Kubernetes), users can
obtain RT capabilities for their workloads via CAP_SYS_NICE, but they
don't have host-level privileges to create cpuset partitions.

Besides the cacheline-alignment approach, to address both the contention and
the memory overhead, would it be possible to consider the alternative
approach as follows:

1. Use {counts[], masks[]} instead of vec[{count, mask}]

2. Separate counts[0] (CPUPRI_NORMAL), which experiences both heavy
write and read traffic.
Writes: RT task lifecycle operations (enqueue on empty runqueue,
dequeue from non-overloaded runqueue) frequently update the
normal priority count.
Reads: RT tasks searching for available CPUs scan from low to high
priority, with counts[0] being checked at the start of every
search iteration.
Thus, even if workloads used lower RT priorities (e.g., RT prio 49
instead of 99), counts[0] contention would still be heavy; it is not
specific to the prio-99 workload configuration.

3. Separate masks from counts to ensure no contention between them.

With this change, the size of struct cpupri can be reduced from 26 cache
lines to 21, which also saves memory in cpuset-partitioning scenarios.

The code change looks like this:
---
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 42c40cfdf836..1e333e6edb1e 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -64,16 +64,38 @@ static int convert_prio(int prio)
return cpupri;
}

+/*
+ * Get pointer to count for given priority index.
+ *
+ * Skip padding after counts[0] for idx > 0 to access the correct location.
+ */
+static inline atomic_t *cpupri_count(struct cpupri *cp, int idx)
+{
+ if (idx > 0)
+ idx += CPUPRI_COUNT0_PADDING;
+
+ return &cp->pri_to_cpu.counts[idx];
+}
+
+/*
+ * Get pointer to mask for given priority index.
+ */
+static inline cpumask_var_t cpupri_mask(struct cpupri *cp, int idx)
+{
+ return cp->pri_to_cpu.masks[idx];
+}
+
static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p,
struct cpumask *lowest_mask, int idx)
{
- struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
+ cpumask_var_t cpu_mask = cpupri_mask(cp, idx);
+
int skip = 0;

- if (!atomic_read(&(vec)->count))
+ if (!atomic_read(cpupri_count(cp, idx)))
skip = 1;
/*
- * When looking at the vector, we need to read the counter,
+ * When looking at the vector, we need to read the count,
* do a memory barrier, then read the mask.
*
* Note: This is still all racy, but we can deal with it.
@@ -96,18 +118,18 @@ static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p,
if (skip)
return 0;

- if (cpumask_any_and(&p->cpus_mask, vec->mask) >= nr_cpu_ids)
+ if (cpumask_any_and(&p->cpus_mask, cpu_mask) >= nr_cpu_ids)
return 0;

if (lowest_mask) {
- cpumask_and(lowest_mask, &p->cpus_mask, vec->mask);
+ cpumask_and(lowest_mask, &p->cpus_mask, cpu_mask);
cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);

/*
* We have to ensure that we have at least one bit
* still set in the array, since the map could have
* been concurrently emptied between the first and
- * second reads of vec->mask. If we hit this
+ * second reads of cpu_mask. If we hit this
* condition, simply act as though we never hit this
* priority level and continue on.
*/
@@ -227,23 +249,19 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
* cpu being missed by the priority loop in cpupri_find.
*/
if (likely(newpri != CPUPRI_INVALID)) {
- struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
-
- cpumask_set_cpu(cpu, vec->mask);
+ cpumask_set_cpu(cpu, cpupri_mask(cp, newpri));
/*
* When adding a new vector, we update the mask first,
* do a write memory barrier, and then update the count, to
* make sure the vector is visible when count is set.
*/
smp_mb__before_atomic();
- atomic_inc(&(vec)->count);
+ atomic_inc(cpupri_count(cp, newpri));
do_mb = 1;
}
if (likely(oldpri != CPUPRI_INVALID)) {
- struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
-
/*
- * Because the order of modification of the vec->count
+ * Because the order of modification of the cpu count
* is important, we must make sure that the update
* of the new prio is seen before we decrement the
* old prio. This makes sure that the loop sees
@@ -252,18 +270,18 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
* priority, as that will trigger an rt pull anyway.
*
* We only need to do a memory barrier if we updated
- * the new priority vec.
+ * cpu count or mask of the new priority.
*/
if (do_mb)
smp_mb__after_atomic();

/*
- * When removing from the vector, we decrement the counter first
+ * When removing from the vector, we decrement the count first
* do a memory barrier and then clear the mask.
*/
- atomic_dec(&(vec)->count);
+ atomic_dec(cpupri_count(cp, oldpri));
smp_mb__after_atomic();
- cpumask_clear_cpu(cpu, vec->mask);
+ cpumask_clear_cpu(cpu, cpupri_mask(cp, oldpri));
}

*currpri = newpri;
@@ -280,10 +298,8 @@ int cpupri_init(struct cpupri *cp)
int i;

for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
- struct cpupri_vec *vec = &cp->pri_to_cpu[i];
-
- atomic_set(&vec->count, 0);
- if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
+ atomic_set(cpupri_count(cp, i), 0);
+ if (!zalloc_cpumask_var(&cp->pri_to_cpu.masks[i], GFP_KERNEL))
goto cleanup;
}

@@ -298,7 +314,7 @@ int cpupri_init(struct cpupri *cp)

cleanup:
for (i--; i >= 0; i--)
- free_cpumask_var(cp->pri_to_cpu[i].mask);
+ free_cpumask_var(cpupri_mask(cp, i));
return -ENOMEM;
}

@@ -312,5 +328,5 @@ void cpupri_cleanup(struct cpupri *cp)

kfree(cp->cpu_to_pri);
for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
- free_cpumask_var(cp->pri_to_cpu[i].mask);
+ free_cpumask_var(cpupri_mask(cp, i));
}
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index d6cba0020064..9041e2ffb3f3 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -1,5 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */

+#include <linux/cache.h>
+
#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO+1)

#define CPUPRI_INVALID -1
@@ -7,13 +9,73 @@
/* values 1-99 are for RT1-RT99 priorities */
#define CPUPRI_HIGHER 100

+/*
+ * Padding to isolate counts[0] (CPUPRI_NORMAL) into its own cacheline.
+ *
+ * On 64-byte cacheline systems: (64 / 4) - 1 = 15 padding slots
+ * This places counts[0] alone in cacheline 0, counts[1..N] in subsequent lines.
+ */
+#define CPUPRI_COUNT0_PADDING ((SMP_CACHE_BYTES / sizeof(atomic_t)) - 1)
+
+/* Total count array size including padding after counts[0] */
+#define CPUPRI_COUNT_ARRAY_SIZE (CPUPRI_NR_PRIORITIES + CPUPRI_COUNT0_PADDING)
+
+/*
+ * Padding bytes to align mask vector to cacheline boundary.
+ * Ensures no false sharing between counts[] and masks[].
+ */
+#define CPUPRI_VEC_PADDING \
+ (SMP_CACHE_BYTES - \
+ (CPUPRI_COUNT_ARRAY_SIZE * sizeof(atomic_t) % SMP_CACHE_BYTES))
+
struct cpupri_vec {
- atomic_t count;
- cpumask_var_t mask;
+ /*
+ * Count vector with strategic padding to prevent false sharing.
+ *
+ * Layout (64-byte cachelines):
+ * Cacheline 0: counts[0] (CPUPRI_NORMAL) + 60 bytes padding
+ * Cacheline 1+: counts[1..100] (RT priorities 1-99, CPUPRI_HIGHER)
+ *
+ * counts[0] experiences the heaviest read and write traffic:
+ * - Write: RT task lifecycle operations (enqueue on empty runqueue,
+ *   dequeue from non-overloaded runqueue) frequently update the
+ *   normal priority count.
+ * - Read: RT tasks searching for available CPUs scan from low to high
+ *   priority, with counts[0] being checked at the start of every
+ *   search iteration.
+ * Isolating counts[0] in its own cacheline prevents contention with other
+ * priority counts during concurrent search and update operations.
+ */
+ atomic_t counts[CPUPRI_COUNT_ARRAY_SIZE];
+
+ /*
+ * Padding to separate count and mask vectors.
+ *
+ * Prevents false sharing between:
+ * - counts[] (read-write, hot path in cpupri_set)
+ * - masks[] (read-mostly, accessed in cpupri_find)
+ */
+ char padding[CPUPRI_VEC_PADDING];
+
+ /*
+ * CPU mask vector.
+ *
+ * Either stores:
+ * - Pointers to dynamically allocated cpumasks (read-mostly after init)
+ * - Inline cpumasks (if !CPUMASK_OFFSTACK)
+ */
+ cpumask_var_t masks[CPUPRI_NR_PRIORITIES];
};

struct cpupri {
- struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
+ /*
+ * Priority-to-CPU mapping.
+ *
+ * Single cpupri_vec structure containing all counts and masks,
+ * rather than 101 separate cpupri_vec elements. This reduces
+ * memory overhead from ~26 to ~21 cachelines.
+ */
+ struct cpupri_vec pri_to_cpu;
int *cpu_to_pri;
};

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 475bb5998295..2263237cdeb0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1014,7 +1014,7 @@ struct root_domain {
* one runnable RT task.
*/
cpumask_var_t rto_mask;
- struct cpupri cpupri;
+ struct cpupri cpupri ____cacheline_aligned;

/*
* NULL-terminated list of performance domains intersecting with the