[PATCH] lib/group_cpus: rotate extra groups to avoid IRQ stacking
From: Naman Jain
Date: Tue Mar 24 2026 - 04:02:58 EST
When multiple devices call group_cpus_evenly() with the same number of
groups, the cluster-aware path in __try_group_cluster_cpus() assigns
extra groups to the same set of clusters every time, producing identical
affinity masks for every caller. CPUs in clusters that receive two
groups (and thus get single-CPU dedicated masks) end up handling
interrupts from ALL devices, creating an IRQ imbalance.
For example, on a 96-CPU / 2-NUMA-node system with 24 clusters of
2 CPUs per NUMA node (48 clusters total) and 6 NVMe disks each
requesting 62 vectors (31 per node):
alloc_groups_to_nodes() distributes 31 groups across 24 clusters,
giving 7 clusters 2 groups (single-CPU mask = dedicated) and 17
clusters 1 group (2-CPU mask = shared). Because the assignment is
deterministic, all 6 disks produce the same mapping and the same 14
CPUs per node each accumulate 6 dedicated IRQs -- roughly twice the interrupt
load of other CPUs -- causing up to 11% per-disk throughput degradation
on IRQ-heavy CPUs.
Fix this by introducing a per-caller rotation offset via a static
atomic counter. After alloc_groups_to_nodes() determines each
cluster's group count, collect the extras (groups above the per-cluster
minimum), then redistribute them starting from a rotated position with
a stride of ncluster/extras so that successive callers scatter their
extra groups across different clusters. A capacity check
(cpumask_weight_and) ensures no cluster is assigned more groups than it
has CPUs, with a fallback loop for any extras that could not be placed
in the strided pass.
For systems without cluster topology, the same rotation is applied in
assign_cpus_to_groups() at the per-group level: the modular expression
(v + spread_offset) % nv->ngroups selects which groups receive an
extra CPU, replacing the previous sequential decrement.
Tested on a 96-vCPU Hyper-V VM (AMD EPYC 9V74, 48 clusters of 2, 2
NUMA nodes) with 6 NVMe data disks (63 MSI-X vectors each), running
4K random-read fio (psync, 16 jobs/disk, 120s, CPU-pinned):
- Worst-disk degradation vs average: 11% -> 5%
- Previously penalized disks: +12% IOPS, 10% lower latency
Fixes: 89802ca36c96 ("lib/group_cpus: make group CPU cluster aware")
Co-developed-by: Long Li <longli@xxxxxxxxxxxxx>
Signed-off-by: Long Li <longli@xxxxxxxxxxxxx>
Signed-off-by: Naman Jain <namjain@xxxxxxxxxxxxxxxxxxx>
---
lib/group_cpus.c | 116 ++++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 104 insertions(+), 12 deletions(-)
diff --git a/lib/group_cpus.c b/lib/group_cpus.c
index e6e18d7a49bb..241c2c9b437d 100644
--- a/lib/group_cpus.c
+++ b/lib/group_cpus.c
@@ -7,6 +7,7 @@
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/sort.h>
+#include <linux/atomic.h>
#include <linux/group_cpus.h>
#ifdef CONFIG_SMP
@@ -255,12 +256,20 @@ static void alloc_nodes_groups(unsigned int numgrps,
alloc_groups_to_nodes(numgrps, numcpus, node_groups, nr_node_ids);
}
+/*
+ * Per-caller rotation counter for group_cpus_evenly().
+ * Wrapping is harmless: the offset is only used modulo small values
+ * (ncluster or nv->ngroups), so any unsigned value works.
+ */
+static atomic_t group_spread_cnt = ATOMIC_INIT(0);
+
static void assign_cpus_to_groups(unsigned int ncpus,
struct cpumask *nmsk,
struct node_groups *nv,
struct cpumask *masks,
unsigned int *curgrp,
- unsigned int last_grp)
+ unsigned int last_grp,
+ unsigned int spread_offset)
{
unsigned int v, cpus_per_grp, extra_grps;
/* Account for rounding errors */
@@ -270,11 +279,15 @@ static void assign_cpus_to_groups(unsigned int ncpus,
for (v = 0; v < nv->ngroups; v++, *curgrp += 1) {
cpus_per_grp = ncpus / nv->ngroups;
- /* Account for extra groups to compensate rounding errors */
- if (extra_grps) {
+ /*
+ * Rotate which groups get the extra CPU so that
+ * successive callers produce different mappings,
+ * avoiding IRQ stacking when multiple devices
+ * share the same CPU topology.
+ */
+ if (extra_grps &&
+ (v + spread_offset) % nv->ngroups < extra_grps)
cpus_per_grp++;
- --extra_grps;
- }
/*
* wrapping has to be considered given 'startgrp'
@@ -361,7 +374,8 @@ static bool __try_group_cluster_cpus(unsigned int ncpus,
struct cpumask *node_cpumask,
struct cpumask *masks,
unsigned int *curgrp,
- unsigned int last_grp)
+ unsigned int last_grp,
+ unsigned int spread_offset)
{
struct node_groups *cluster_groups;
const struct cpumask **clusters;
@@ -379,6 +393,78 @@ static bool __try_group_cluster_cpus(unsigned int ncpus,
if (ncluster == 0)
goto fail_no_clusters;
+ /*
+ * Rotate which clusters receive extra groups so that different
+ * callers of group_cpus_evenly() produce different group-to-CPU
+ * mappings. Without this, all devices get identical affinity
+ * masks, causing IRQ stacking on CPUs assigned single-CPU groups.
+ *
+ * alloc_groups_to_nodes() distributes ngroups proportionally, but
+ * rounding causes some clusters to get one more group than others.
+ * The assignment is deterministic, so every device gets the same
+ * mapping. Fix: collect the "extra" groups (above the per-cluster
+ * minimum), then redistribute them starting from a rotated
+ * position. The ncpus constraint (ngroups <= ncpus) is preserved
+ * by checking each cluster's actual CPU count before adding.
+ *
+ * Note: after alloc_groups_to_nodes(), the union field holds
+ * ngroups (not ncpus), so we recompute ncpus from cluster masks.
+ */
+ if (ncluster > 1) {
+ unsigned int min_grps = UINT_MAX;
+ unsigned int total_extra = 0;
+ unsigned int start, stride;
+
+ for (i = 0; i < ncluster; i++) {
+ if (cluster_groups[i].ngroups < min_grps)
+ min_grps = cluster_groups[i].ngroups;
+ }
+
+ for (i = 0; i < ncluster; i++) {
+ if (cluster_groups[i].ngroups > min_grps) {
+ total_extra +=
+ cluster_groups[i].ngroups - min_grps;
+ cluster_groups[i].ngroups = min_grps;
+ }
+ }
+
+ /*
+ * Redistribute extras using a stride to scatter them
+ * across clusters. With stride = ncluster / extras,
+ * consecutive callers' extra sets overlap minimally
+ * (e.g. max 2 overlap for 6 callers with 24 clusters
+ * and 7 extras, vs 6 overlap with stride 1).
+ */
+ start = spread_offset % ncluster;
+ stride = (total_extra > 0 && total_extra < ncluster) ?
+ ncluster / total_extra : 1;
+
+ for (i = 0; i < ncluster && total_extra > 0; i++) {
+ unsigned int idx =
+ (start + i * stride) % ncluster;
+ unsigned int cap;
+
+ cap = cpumask_weight_and(clusters[cluster_groups[idx].id],
+ node_cpumask);
+ if (cluster_groups[idx].ngroups < cap) {
+ cluster_groups[idx].ngroups++;
+ total_extra--;
+ }
+ }
+
+ /* Fallback: place remaining extras wherever they fit */
+ for (i = 0; i < ncluster && total_extra > 0; i++) {
+ unsigned int cap;
+
+ cap = cpumask_weight_and(clusters[cluster_groups[i].id],
+ node_cpumask);
+ if (cluster_groups[i].ngroups < cap) {
+ cluster_groups[i].ngroups++;
+ total_extra--;
+ }
+ }
+ }
+
for (i = 0; i < ncluster; i++) {
struct node_groups *nv = &cluster_groups[i];
@@ -389,7 +475,8 @@ static bool __try_group_cluster_cpus(unsigned int ncpus,
continue;
WARN_ON_ONCE(nv->ngroups > nc);
- assign_cpus_to_groups(nc, nmsk, nv, masks, curgrp, last_grp);
+ assign_cpus_to_groups(nc, nmsk, nv, masks, curgrp, last_grp,
+ spread_offset);
}
ret = true;
@@ -404,7 +491,8 @@ static bool __try_group_cluster_cpus(unsigned int ncpus,
static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
cpumask_var_t *node_to_cpumask,
const struct cpumask *cpu_mask,
- struct cpumask *nmsk, struct cpumask *masks)
+ struct cpumask *nmsk, struct cpumask *masks,
+ unsigned int spread_offset)
{
unsigned int i, n, nodes, done = 0;
unsigned int last_grp = numgrps;
@@ -455,13 +543,14 @@ static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
WARN_ON_ONCE(nv->ngroups > ncpus);
if (__try_group_cluster_cpus(ncpus, nv->ngroups, nmsk,
- masks, &curgrp, last_grp)) {
+ masks, &curgrp, last_grp,
+ spread_offset)) {
done += nv->ngroups;
continue;
}
assign_cpus_to_groups(ncpus, nmsk, nv, masks, &curgrp,
- last_grp);
+ last_grp, spread_offset);
done += nv->ngroups;
}
kfree(node_groups);
@@ -488,6 +577,7 @@ static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
struct cpumask *group_cpus_evenly(unsigned int numgrps, unsigned int *nummasks)
{
unsigned int curgrp = 0, nr_present = 0, nr_others = 0;
+ unsigned int spread_offset;
cpumask_var_t *node_to_cpumask;
cpumask_var_t nmsk, npresmsk;
int ret = -ENOMEM;
@@ -510,6 +600,8 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps, unsigned int *nummasks)
if (!masks)
goto fail_node_to_cpumask;
+ spread_offset = (unsigned int)atomic_fetch_inc(&group_spread_cnt);
+
build_node_to_cpumask(node_to_cpumask);
/*
@@ -528,7 +620,7 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps, unsigned int *nummasks)
/* grouping present CPUs first */
ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
- npresmsk, nmsk, masks);
+ npresmsk, nmsk, masks, spread_offset);
if (ret < 0)
goto fail_node_to_cpumask;
nr_present = ret;
@@ -545,7 +637,7 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps, unsigned int *nummasks)
curgrp = nr_present;
cpumask_andnot(npresmsk, cpu_possible_mask, npresmsk);
ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
- npresmsk, nmsk, masks);
+ npresmsk, nmsk, masks, spread_offset);
if (ret >= 0)
nr_others = ret;
base-commit: c369299895a591d96745d6492d4888259b004a9e
--
2.43.0