Re: [RFC patch v3 01/20] sched: Cache aware load-balancing

From: Chen, Yu C
Date: Fri Jul 04 2025 - 04:47:13 EST


Hi Shrikanth,

On 7/4/2025 3:29 AM, Shrikanth Hegde wrote:


Hi Tim, Chen,
I am skimming through the series and will try to go through it in the coming days.


Thanks for your interest in this change.


One of the many things on the eternal todo list has been finishing the
below hackery.

It is an attempt at modelling cache affinity -- and while the patch
really only targets LLC, it could very well be extended to also apply to
clusters (L2). Specifically any case of multiple cache domains inside a
node.

Anyway, I wrote this about a year ago, and I mentioned this at the
recent OSPM conf where Gautham and Prateek expressed interest in playing
with this code.

So here goes, very rough and largely unproven code ahead :-)

It applies to current tip/master, but I know it will fail the __percpu
validation that sits in -next, although that shouldn't be terribly hard
to fix up.

As is, it only computes a CPU inside the LLC that has the highest recent
runtime; this CPU is then used in the wake-up path to steer towards this
LLC and in task_hot() to limit migrations away from it.

More elaborate things could be done, notably there is an XXX in there
somewhere about finding the best LLC inside a NODE (interaction with
NUMA_BALANCING).

Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---

[snip...]

+
  #ifdef CONFIG_RSEQ
      struct rseq __user *rseq;
      u32 rseq_len;
diff --git a/init/Kconfig b/init/Kconfig
index bf3a920064be..e2509127b6f9 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -953,6 +953,10 @@ config NUMA_BALANCING
        This system will be inactive on UMA systems.
+config SCHED_CACHE
+    bool "Cache aware scheduler"
+    default y
+

Should it depend on EXPERT?
IMO this could add quite a bit of overhead, so maybe it should default to n?


I would leave this to Peter and Tim to decide.
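For reference, the suggestion would amount to something like the following
(illustration only; the help text is made up, not from the series):

config SCHED_CACHE
    bool "Cache aware scheduler" if EXPERT
    default n
    help
      Track per-mm cache occupancy and bias wakeups towards the LLC a
      process has recently been running on. If unsure, say N.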

  config NUMA_BALANCING_DEFAULT_ENABLED
      bool "Automatically enable NUMA aware memory/task placement"
      default y
diff --git a/kernel/fork.c b/kernel/fork.c
index 168681fc4b25..da1387823b9e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c

[snip]

+#ifdef CONFIG_SCHED_CACHE
+
+/*
+ * XXX numbers come from a place the sun don't shine -- probably wants to be SD
+ * tunable or so.
+ */
+#define EPOCH_PERIOD    (HZ/100)    /* 10 ms */
+#define EPOCH_OLD    5        /* 50 ms */

Have these been converted into tunables? I didn't spot that in the series.


OK, they could be added to debugfs.
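Something along these lines, perhaps (untested sketch; the knob names and
the hook in sched_init_debug() are placeholders, not part of the series):

unsigned int sched_cache_epoch_period __read_mostly = HZ / 100; /* 10 ms */
unsigned int sched_cache_epoch_old __read_mostly    = 5;        /* 50 ms */

/* e.g. in kernel/sched/debug.c:sched_init_debug(), next to the existing knobs */
debugfs_create_u32("cache_epoch_period", 0644, debugfs_sched,
           &sched_cache_epoch_period);
debugfs_create_u32("cache_epoch_old", 0644, debugfs_sched,
           &sched_cache_epoch_old);

The EPOCH_PERIOD/EPOCH_OLD users would then read these variables instead of
the compile-time constants.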


+
+static inline void __update_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched)
+{
+    lockdep_assert_held(&rq->cpu_epoch_lock);
+
+    unsigned long n, now = jiffies;
+    long delta = now - rq->cpu_epoch_next;
+
+    if (delta > 0) {
+        n = (delta + EPOCH_PERIOD - 1) / EPOCH_PERIOD;
+        rq->cpu_epoch += n;
+        rq->cpu_epoch_next += n * EPOCH_PERIOD;
+        __shr_u64(&rq->cpu_runtime, n);

Another doubt I had: does this occupancy metric still work when the CPU
bandwidth controller is active? A 50% occupancy may mean something
different when the CPU bandwidth limit is set to 50%.


Even if cgroup throttling is enabled, the 50% might still indicate that
the occupancy on that CPU is real, just that the task is probably less
"cache-hot".

+    }
+
+    n = rq->cpu_epoch - pcpu_sched->epoch;
+    if (n) {
+        pcpu_sched->epoch += n;
+        __shr_u64(&pcpu_sched->runtime, n);
+    }
+}
+
+static unsigned long fraction_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched)
+{
+    guard(raw_spinlock_irqsave)(&rq->cpu_epoch_lock);
+
+    __update_mm_sched(rq, pcpu_sched);
+
+    /*
+     * Runtime is a geometric series (r=0.5) and as such will sum to twice
+     * the accumulation period; this means the multiplication here should
+     * not overflow.
+     */
+    return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1);
+}
+
+static inline
+void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
+{
+    struct mm_struct *mm = p->mm;
+    struct mm_sched *pcpu_sched;
+    unsigned long epoch;
+
+    /*
+     * init_task and kthreads don't be having no mm
+     */
+    if (!mm || !mm->pcpu_sched)
+        return;
+
+    pcpu_sched = this_cpu_ptr(p->mm->pcpu_sched);
+
+    scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) {
+        __update_mm_sched(rq, pcpu_sched);
+        pcpu_sched->runtime += delta_exec;
+        rq->cpu_runtime += delta_exec;
+        epoch = rq->cpu_epoch;
+    }
+
+    /*
+     * If this task hasn't hit task_cache_work() for a while, invalidate
+     * its preferred state.
+     */
+    if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_OLD) {
+        mm->mm_sched_cpu = -1;
+        pcpu_sched->occ = -1;
+    }
+}
+
+static void task_tick_cache(struct rq *rq, struct task_struct *p)
+{
+    struct callback_head *work = &p->cache_work;
+    struct mm_struct *mm = p->mm;
+
+    if (!mm || !mm->pcpu_sched)
+        return;
+
+    if (mm->mm_sched_epoch == rq->cpu_epoch)
+        return;
+
+    guard(raw_spinlock)(&mm->mm_sched_lock);
+
+    if (mm->mm_sched_epoch == rq->cpu_epoch)
+        return;
+
+    if (work->next == work) {
+        task_work_add(p, work, TWA_RESUME);
+        WRITE_ONCE(mm->mm_sched_epoch, rq->cpu_epoch);
+    }
+}
+
+static void task_cache_work(struct callback_head *work)
+{
+    struct task_struct *p = current;
+    struct mm_struct *mm = p->mm;
+    unsigned long m_a_occ = 0;
+    int cpu, m_a_cpu = -1;
+    cpumask_var_t cpus;
+
+    WARN_ON_ONCE(work != &p->cache_work);
+
+    work->next = work;
+
+    if (p->flags & PF_EXITING)
+        return;
+
+    if (!alloc_cpumask_var(&cpus, GFP_KERNEL))
+        return;
+
+    scoped_guard (cpus_read_lock) {
+        cpumask_copy(cpus, cpu_online_mask);
+

As pointed out already, this is going to be costly on multi-NUMA systems. Any cross-NUMA access of
per-CPU data is going to add system bus bandwidth overhead, and having this happen at the tick could be costly.


Yes, we are trying to reduce the overhead of the CPU scan, although the
scan does not happen at every tick.
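For example (an untested idea, not something this series implements), the
scan could be confined to the LLCs on the task's current node, so that the
per-CPU reads stay node-local:

    scoped_guard (cpus_read_lock) {
        /* untested sketch: only walk LLCs on the local node */
        cpumask_and(cpus, cpu_online_mask,
                cpumask_of_node(numa_node_id()));
        /* ... rest of task_cache_work() unchanged ... */
    }

This interacts with the XXX Peter mentioned about finding the best LLC
inside a node and with NUMA_BALANCING, so it is only meant to illustrate
how the cross-NUMA traffic could be bounded.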

Also, taking cpus_read_lock() does a preempt_disable(); could this add up to a long preempt-off section?

cpus_read_lock() only disables preemption for a short time, I suppose? If
it cannot get the lock, it re-enables preemption and goes to sleep.
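Roughly, paraphrasing the current implementation (kernel/cpu.c and
include/linux/percpu-rwsem.h; details differ between kernel versions), the
read side is:

void cpus_read_lock(void)
{
    percpu_down_read(&cpu_hotplug_lock);
}

static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
{
    might_sleep();
    preempt_disable();
    if (likely(rcu_sync_is_idle(&sem->rss)))
        this_cpu_inc(*sem->read_count);    /* fast path */
    else
        __percpu_down_read(sem, false);    /* slow path drops preemption before sleeping */
    preempt_enable();
}

So preemption is only disabled across the per-CPU counter update; when a
hotplug writer is pending, the slow path re-enables preemption before it
blocks.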

We need to measure the time it takes on a large system. Will try and get back with that number.

OK, looking forward to it.


+        for_each_cpu(cpu, cpus) {
+            /* XXX sched_cluster_active */
+            struct sched_domain *sd = per_cpu(sd_llc, cpu);
+            unsigned long occ, m_occ = 0, a_occ = 0;
+            int m_cpu = -1, nr = 0, i;
+
+            for_each_cpu(i, sched_domain_span(sd)) {
+                occ = fraction_mm_sched(cpu_rq(i),
+                            per_cpu_ptr(mm->pcpu_sched, i));
+                a_occ += occ;
+                if (occ > m_occ) {
+                    m_occ = occ;
+                    m_cpu = i;
+                }
+                nr++;
+                trace_printk("(%d) occ: %ld m_occ: %ld m_cpu: %d nr: %d\n",
+                         per_cpu(sd_llc_id, i), occ, m_occ, m_cpu, nr);
+            }
+
+            a_occ /= nr;
+            if (a_occ > m_a_occ) {
+                m_a_occ = a_occ;
+                m_a_cpu = m_cpu;
+            }
+
+            trace_printk("(%d) a_occ: %ld m_a_occ: %ld\n",
+                     per_cpu(sd_llc_id, cpu), a_occ, m_a_occ);
+
+            for_each_cpu(i, sched_domain_span(sd)) {
+                /* XXX threshold ? */
+                per_cpu_ptr(mm->pcpu_sched, i)->occ = a_occ;
+            }
+
+            cpumask_andnot(cpus, cpus, sched_domain_span(sd));
+        }
+    }
+
+    /*
+     * If the max average cache occupancy is 'small' we don't care.
+     */
+    if (m_a_occ < (NICE_0_LOAD >> EPOCH_OLD))
+        m_a_cpu = -1;
+
+    mm->mm_sched_cpu = m_a_cpu;
+
+    free_cpumask_var(cpus);
+}
+
+void init_sched_mm(struct task_struct *p)
+{
+    struct callback_head *work = &p->cache_work;
+    init_task_work(work, task_cache_work);
+    work->next = work;
+}
+
+#else
+
+static inline void account_mm_sched(struct rq *rq, struct task_struct *p,
+                    s64 delta_exec) { }
+
+
+void init_sched_mm(struct task_struct *p) { }
+
+static void task_tick_cache(struct rq *rq, struct task_struct *p) { }
+
+#endif
+
+static inline
+void update_curr_task(struct rq *rq, struct task_struct *p, s64 delta_exec)
  {
      trace_sched_stat_runtime(p, delta_exec);
      account_group_exec_runtime(p, delta_exec);
+    account_mm_sched(rq, p, delta_exec);
      cgroup_account_cputime(p, delta_exec);
  }

AFAIU, this works and cares only about SCHED_NORMAL, but update_curr_task()
is called from update_curr_common() for RT/DL as well. Maybe avoid it for those?


OK, will fix it.
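One straightforward option (untested sketch; the actual fix may end up
looking different) would be to skip the accounting for non-fair tasks in
update_curr_task():

static inline
void update_curr_task(struct rq *rq, struct task_struct *p, s64 delta_exec)
{
    trace_sched_stat_runtime(p, delta_exec);
    account_group_exec_runtime(p, delta_exec);
    /* cache-occupancy tracking is only used by the fair class */
    if (p->sched_class == &fair_sched_class)
        account_mm_sched(rq, p, delta_exec);
    cgroup_account_cputime(p, delta_exec);
}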

@@ -1215,7 +1434,7 @@ s64 update_curr_common(struct rq *rq)
      delta_exec = update_curr_se(rq, &donor->se);
      if (likely(delta_exec > 0))
-        update_curr_task(donor, delta_exec);
+        update_curr_task(rq, donor, delta_exec);
      return delta_exec;
  }

[snip...]

      check_update_overutilized_status(task_rq(curr));
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 47972f34ea70..d16ccd66ca07 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1171,6 +1171,12 @@ struct rq {
      u64            clock_pelt_idle_copy;
      u64            clock_idle_copy;
  #endif
+#ifdef CONFIG_SCHED_CACHE
+    raw_spinlock_t        cpu_epoch_lock;
+    u64            cpu_runtime;
+    unsigned long        cpu_epoch;
+    unsigned long        cpu_epoch_next;
+#endif

Maybe these can go to their own cacheline?


Sure. Do you mean there is a risk of false sharing, where these *_epoch
fields could end up on the same cache line as either atomic_t nr_iowait
or u64 clock_idle_copy?
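If that is the concern, something like the following (untested) might be
what you have in mind:

#ifdef CONFIG_SCHED_CACHE
    /* keep the epoch bookkeeping away from neighbouring hot fields */
    raw_spinlock_t        cpu_epoch_lock ____cacheline_aligned;
    u64            cpu_runtime;
    unsigned long        cpu_epoch;
    unsigned long        cpu_epoch_next;
#endif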

thanks,
Chenyu