Re: [PATCH 1/2] sched/fair: consider hk_mask early in triggering ilb
From: Shrikanth Hegde
Date: Thu Mar 19 2026 - 09:13:59 EST
Hi Mukesh.
On 3/19/26 1:45 PM, Mukesh Kumar Chaurasiya wrote:
On Thu, Mar 19, 2026 at 12:23:13PM +0530, Shrikanth Hegde wrote:
Current code around nohz_balancer_kick and kick_ilb:
There is no usage of ilb_cpus till this point. We can avoid this if
1. Checks for nohz.idle_cpus_mask to see if idle load balance(ilb) is
needed.
2. Does a few checks to see if any conditions meet the criteria.
3. Tries to find the idle CPU. But the idle CPU found should be part of
housekeeping CPUs.
If there is no housekeeping idle CPU, then step 2 is done
unnecessarily, since 3 bails out without doing the ilb.
Fix that by making the decision early and pass it on to find_new_ilb.
Use a percpu cpumask instead of allocating it every time since this is in
the fastpath.
If flags is set to NOHZ_STATS_KICK since the time is after nohz.next_blocked
but before nohz.next_balance and there are idle CPUs which are part of
housekeeping, need to copy the same logic there too.
While there, fix the stale comments around nohz.nr_cpus
Signed-off-by: Shrikanth Hegde <sshegde@xxxxxxxxxxxxx>
---
Didn't add the fixes tag since it addresses more than stale comments.
kernel/sched/fair.c | 45 +++++++++++++++++++++++++++++++--------------
1 file changed, 31 insertions(+), 14 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b19aeaa51ebc..02cca2c7a98d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7392,6 +7392,7 @@ static inline unsigned int cfs_h_nr_delayed(struct rq *rq)
static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
+static DEFINE_PER_CPU(cpumask_var_t, kick_ilb_tmpmask);
#ifdef CONFIG_NO_HZ_COMMON
@@ -12629,15 +12630,14 @@ static inline int on_null_domain(struct rq *rq)
* - When one of the busy CPUs notices that there may be an idle rebalancing
* needed, they will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
+ *
+ * @cpus idle CPUs in HK_TYPE_KERNEL_NOISE housekeeping
*/
-static inline int find_new_ilb(void)
+static inline int find_new_ilb(struct cpumask *cpus)
{
- const struct cpumask *hk_mask;
int ilb_cpu;
- hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE);
-
- for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) {
+ for_each_cpu(ilb_cpu, cpus) {
if (ilb_cpu == smp_processor_id())
continue;
@@ -12656,7 +12656,7 @@ static inline int find_new_ilb(void)
* We pick the first idle CPU in the HK_TYPE_KERNEL_NOISE housekeeping set
* (if there is one).
*/
-static void kick_ilb(unsigned int flags)
+static void kick_ilb(unsigned int flags, struct cpumask *cpus)
{
int ilb_cpu;
@@ -12667,7 +12667,7 @@ static void kick_ilb(unsigned int flags)
if (flags & NOHZ_BALANCE_KICK)
nohz.next_balance = jiffies+1;
- ilb_cpu = find_new_ilb();
+ ilb_cpu = find_new_ilb(cpus);
if (ilb_cpu < 0)
return;
@@ -12700,6 +12700,7 @@ static void kick_ilb(unsigned int flags)
*/
static void nohz_balancer_kick(struct rq *rq)
{
+ struct cpumask *ilb_cpus = this_cpu_cpumask_var_ptr(kick_ilb_tmpmask);
unsigned long now = jiffies;
struct sched_domain_shared *sds;
struct sched_domain *sd;
@@ -12715,27 +12716,41 @@ static void nohz_balancer_kick(struct rq *rq)
*/
nohz_balance_exit_idle(rq);
+ /* ILB considers only HK_TYPE_KERNEL_NOISE housekeeping CPUs */
+
if (READ_ONCE(nohz.has_blocked_load) &&
- time_after(now, READ_ONCE(nohz.next_blocked)))
+ time_after(now, READ_ONCE(nohz.next_blocked))) {
flags = NOHZ_STATS_KICK;
+ cpumask_and(ilb_cpus, nohz.idle_cpus_mask,
+ housekeeping_cpumask(HK_TYPE_KERNEL_NOISE));
+ }
/*
- * Most of the time system is not 100% busy. i.e nohz.nr_cpus > 0
- * Skip the read if time is not due.
+ * Most of the time system is not 100% busy. i.e there are idle
+ * housekeeping CPUs.
+ *
+ * So, Skip the reading idle_cpus_mask if time is not due.
*
* If none are in tickless mode, there maybe a narrow window
* (28 jiffies, HZ=1000) where flags maybe set and kick_ilb called.
* But idle load balancing is not done as find_new_ilb fails.
- * That's very rare. So read nohz.nr_cpus only if time is due.
+ * That's very rare. So check (idle_cpus_mask & HK_TYPE_KERNEL_NOISE)
+ * only if time is due.
+ *
*/
if (time_before(now, nohz.next_balance))
goto out;
+ /* Avoid the double computation */
+ if (flags != NOHZ_STATS_KICK)
+ cpumask_and(ilb_cpus, nohz.idle_cpus_mask,
+ housekeeping_cpumask(HK_TYPE_KERNEL_NOISE));
+
condition and get the ilb_cpus here itself instead of earlier.
No, there is. Here is why:
struct cpumask *ilb_cpus = this_cpu_cpumask_var_ptr(kick_ilb_tmpmask) << this is just a variable.
if (READ_ONCE(nohz.has_blocked_load) && time_after(now, READ_ONCE(nohz.next_blocked)))
flags = NOHZ_STATS_KICK
if (time_before(now, nohz.next_balance))
goto out;
If there are idle cpus, nohz.has_blocked_load=1 on idle entry which
could be after previous nohz idle balance. After 32 jiffies time now points
after next_blocked. But nohz.next_balance is typically set to 60 jiffies.
So, it goes to out with flags set and that passes ilb_cpus which is not set yet.
Hence both places setting the ilb_cpu is necessary.
I kept it at both places and added the flags check since it is difficult to
predict the movement of nohz.next_balance and nohz.next_blocked since there
are multiple CPUs involved which may be doing idle entry/exit. On the first
tick after idle exit, nohz_balancer_kick would be called.
Rest LGTM.
/*
* None are in tickless mode and hence no need for NOHZ idle load
* balancing
*/
- if (unlikely(cpumask_empty(nohz.idle_cpus_mask)))
+ if (unlikely(cpumask_empty(ilb_cpus)))
return;
if (rq->nr_running >= 2) {
@@ -12767,7 +12782,7 @@ static void nohz_balancer_kick(struct rq *rq)
* When balancing between cores, all the SMT siblings of the
* preferred CPU must be idle.
*/
- for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
+ for_each_cpu_and(i, sched_domain_span(sd), ilb_cpus) {
if (sched_asym(sd, i, cpu)) {
flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
@@ -12820,7 +12835,7 @@ static void nohz_balancer_kick(struct rq *rq)
flags |= NOHZ_NEXT_KICK;
if (flags)
- kick_ilb(flags);
+ kick_ilb(flags, ilb_cpus);
}
static void set_cpu_sd_state_busy(int cpu)
@@ -14253,6 +14268,8 @@ __init void init_sched_fair_class(void)
zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i));
zalloc_cpumask_var_node(&per_cpu(should_we_balance_tmpmask, i),
GFP_KERNEL, cpu_to_node(i));
+ zalloc_cpumask_var_node(&per_cpu(kick_ilb_tmpmask, i),
+ GFP_KERNEL, cpu_to_node(i));
#ifdef CONFIG_CFS_BANDWIDTH
INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i));
--
2.43.0
Thank you for going through the patch.