Re: [PATCH v2 2/5] sched/fair: Attach sched_domain_shared to sd_asym_cpucapacity

From: Andrea Righi

Date: Tue May 19 2026 - 02:50:07 EST


Hi Prateek,

On Tue, May 19, 2026 at 11:22:32AM +0530, K Prateek Nayak wrote:
> Hello Peter, Andrea,
>
> On 5/19/2026 2:28 AM, Peter Zijlstra wrote:
> > @@@ -2775,20 -3049,16 +3107,15 @@@ build_sched_domains(const struct cpumas
> > if (!sd)
> > continue;
> >
> > + if (has_asym)
> > - asym_claimed = claim_asym_sched_domain_shared(&d, i);
> > ++ claim_asym_sched_domain_shared(&d, i);
> > +
> > /* First, find the topmost SD_SHARE_LLC domain */
> > while (sd->parent && (sd->parent->flags & SD_SHARE_LLC))
> > sd = sd->parent;
> >
> > if (sd->flags & SD_SHARE_LLC) {
> > - /*
> > - * Initialize the sd->shared for SD_SHARE_LLC unless
> > - * the asym path above already claimed it.
> > - */
> > - if (!asym_claimed)
> > - init_sched_domain_shared(&d, sd);
> > - int sd_id = cpumask_first(sched_domain_span(sd));
> > -
> > - sd->shared = *per_cpu_ptr(d.sds, sd_id);
> > - atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
> > - atomic_inc(&sd->shared->ref);
> > ++ init_sched_domain_shared(&d, sd);
>
> This will run into a small problem with "nr_idle_scan" if
> cpumask_first(sched_domain_span(sd)) is the same for both sd_asym and
> sd_llc.

Ah, good catch! When cpumask_first(asym_span) == cpumask_first(llc_span)
(big.LITTLE typical case), both sd_asym->shared and sd_llc->shared would alias
to d->sds[0].

>
> Load balancer at different domains will populate "nr_idle_scan" with
> different values and they alias to same ->shared if one isn't
> degenerated and I believe there is at least one way to hit the WARN_ON()
> from cpu_attach_domain() if the SD_ASYM_CPUCAPACITY_FULL comes before
> the last SD_SHARE_LLC domain and the latter is degenerated.
>
> How about this:
>
> (On top of queue:sched/core; Lightly tested on !ASYM_CPUCAPACITY system)
>
> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
> index fe09d3268bc9..1d2c98dca211 100644
> --- a/include/linux/sched/topology.h
> +++ b/include/linux/sched/topology.h
> @@ -67,7 +67,15 @@ struct sched_domain_shared {
> atomic_t ref;
> atomic_t nr_busy_cpus;
> int has_idle_cores;
> - int nr_idle_scan;
> + union {
> + int nr_idle_scan;
> + /*
> + * Used during allocation to claim the
> + * sched_domain_shared object at
> + * multiple levels.

I think between build and the first LB tick, readers of nr_idle_scan may observe
leftover SD_* flags in nr_idle_scan. This shouldn't be a problem and should
self-heal soon, but maybe it's worth a comment? Something like:

* Note: between build and the first periodic LB tick, which
* rewrites the union via update_idle_cpu_scan(), readers of
* nr_idle_scan may observe the transient SD_* flag value as
* the scan bound. The flag bits are small positive integers,
* so the effect is just a slightly relaxed scan bound for one
* window and self-heals on the first tick.

> + */
> + int alloc_flags;
> + };
> #ifdef CONFIG_SCHED_CACHE
> unsigned long util_avg;
> unsigned long capacity;
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index dbfd9657f897..9ebd14652e9d 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -623,6 +623,12 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc)
> } while (sg != first);
> }
>
> +static void free_sched_domain_shared(struct sched_domain_shared *sds)
> +{
> + if (sds && atomic_dec_and_test(&sds->ref))
> + kfree(sds);
> +}
> +
> static void destroy_sched_domain(struct sched_domain *sd)
> {
> /*
> @@ -631,9 +637,7 @@ static void destroy_sched_domain(struct sched_domain *sd)
> * dropping group/capacity references, freeing where none remain.
> */
> free_sched_groups(sd->groups, 1);
> -
> - if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
> - kfree(sd->shared);
> + free_sched_domain_shared(sd->shared);
>
> #ifdef CONFIG_SCHED_CACHE
> /* only the bottom sd has llc_counts array */
> @@ -755,7 +759,14 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
>
> /* Pick reference to parent->shared. */
> if (parent->shared) {
> - WARN_ON_ONCE(tmp->shared);
> + /*
> + * It is safe to free a sd->shared that
> + * has not been published yet. If a
> + * sd->shared was published, the refcount
> + * will end up being non-zero and it will
> + * not be freed here.
> + */
> + free_sched_domain_shared(tmp->shared);
> tmp->shared = parent->shared;
> parent->shared = NULL;
> }
> @@ -2916,11 +2927,40 @@ static void adjust_numa_imbalance(struct sched_domain *sd_llc)
> }
> }
>
> -static void init_sched_domain_shared(struct s_data *d, struct sched_domain *sd)
> +static void
> +init_sched_domain_shared(struct s_data *d, struct sched_domain *sd, int flags)
> {
> - int sd_id = cpumask_first(sched_domain_span(sd));
> + int cpu;
> +
> + /*
> + * Multiple domains can try to claim a shared object like
> + * SD_ASYM_CPUCAPACITY and SD_SHARE_LLC which can alias to
> + * same cpumask_first(sched_domain_span(sd)) CPU and can
> + * cause "nr_idle_scan" to be populated incorrectly during
> + * load balncing.

nit: s/balncing/balancing/

> + *
> + * Find the first CPU in sched_domain_span(sd) with an
> + * unclaimed domain (!alloc_flags) or where the alloc_flag
> + * matches the requested flag (SD_* flag)
> + */
> + for_each_cpu(cpu, sched_domain_span(sd)) {
> + struct sched_domain_shared *sds = *per_cpu_ptr(d->sds, cpu);
> +
> + /*
> + * If the domain only has single CPU, allow temporary overlap
> + * in allocation since the domains will be degenerated anyways.
> + */
> + if (!sds->alloc_flags ||
> + sd->span_weight == 1 ||
> + sds->alloc_flags == flags) {
> + sds->alloc_flags = flags;
> + sd->shared = sds;
> + break;
> + }
> + }
> +
> + BUG_ON(!sd->shared);

Unreachable in practice, but should we have a WARN_ON_ONCE() +
bail/early-return? In this way we'd fall back to using LLC's shared for
sd_balance_shared, which seems nicer than a BUG_ON().

>
> - sd->shared = *per_cpu_ptr(d->sds, sd_id);
> /*
> * nr_busy_cpus is consumed only by the NOHZ kick path via
> * sd_balance_shared; on the asym-capacity path it is initialized but
> @@ -2960,7 +3000,7 @@ static bool claim_asym_sched_domain_shared(struct s_data *d, int cpu)
> if (!sd_asym || (sd_asym->flags & SD_NUMA))
> return false;
>
> - init_sched_domain_shared(d, sd_asym);
> + init_sched_domain_shared(d, sd_asym, SD_ASYM_CPUCAPACITY);
> return true;
> }
>
> @@ -3115,7 +3155,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> sd = sd->parent;
>
> if (sd->flags & SD_SHARE_LLC) {
> - init_sched_domain_shared(&d, sd);
> + init_sched_domain_shared(&d, sd, SD_SHARE_LLC);
>
> /*
> * In presence of higher domains, adjust the
> --
> Thanks and Regards,
> Prateek
>

Thanks,
-Andrea