[PATCH v3 15/20] sched/core: Introduce a simple steal monitor

From: Shrikanth Hegde

Date: Thu May 14 2026 - 11:42:26 EST


Start with a simple steal monitor.

It is meant to look at steal time and decide whether to reduce or
increase the preferred CPUs.

It has:
- a work function that periodically performs the steal time calculations
  and decision making.
- a temporary cpumask for use in the work function. This avoids
  allocating a cpumask on every periodic run.
- low and high thresholds for steal time.
- a sampling period to control the frequency of steal time calculations.
- a cache of the previous decision, to avoid oscillations.

Signed-off-by: Shrikanth Hegde <sshegde@xxxxxxxxxxxxx>
---
include/linux/sched.h | 13 +++++++++++++
kernel/sched/core.c | 24 ++++++++++++++++++++++++
kernel/sched/sched.h | 3 +++
3 files changed, 40 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index dcfb57c90850..ee5f19a96118 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2516,4 +2516,17 @@ extern void migrate_enable(void);

DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())

+#ifdef CONFIG_PREFERRED_CPU
+struct steal_monitor_t { /* state of the periodic steal-time monitor */
+ struct work_struct work; /* runs sched_steal_detection_work() */
+ cpumask_var_t tmp_mask; /* scratch mask for the work function, allocated at init */
+ ktime_t prev_time; /* ktime_get() value stored by the last work run */
+ u64 prev_steal; /* presumably steal at the last sample; unused in this patch */
+ int previous_decision; /* cached last decision, to avoid oscillations */
+ unsigned int low_threshold; /* in 1/100ths of a percent; init sets 200 (2%) */
+ unsigned int high_threshold; /* same units; init sets 500 (5%) */
+ unsigned int sampling_period_ms; /* init sets 1000, i.e. once per second */
+};
+#endif
+
#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1533a44d1a6b..907c6b38460b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9102,6 +9102,8 @@ void __init sched_init(void)

preempt_dynamic_init();

+ sched_init_steal_monitor();
+
scheduler_running = 1;
}

@@ -11351,4 +11353,26 @@ void sched_push_current_non_preferred_cpu(struct rq *rq)
push_task, this_cpu_ptr(&npc_push_task_work));
local_irq_restore(flags);
}
+
+struct steal_monitor_t steal_mon; /* set up by sched_init_steal_monitor() */
+
+void sched_init_steal_monitor(void)
+{
+ INIT_WORK(&steal_mon.work, sched_steal_detection_work);
+ zalloc_cpumask_var(&steal_mon.tmp_mask, GFP_KERNEL); /* NOTE(review): result unchecked */
+ steal_mon.low_threshold = 200; /* 2% steal time, in 1/100ths of a percent */
+ steal_mon.high_threshold = 500; /* 5% steal time, same units */
+ steal_mon.sampling_period_ms = 1000; /* once per second */
+}
+
+/* Skeleton only: stores the current time in prev_time; later patches add more. */
+void sched_steal_detection_work(struct work_struct *work)
+{
+ struct steal_monitor_t *sm = container_of(work, struct steal_monitor_t, work);
+ ktime_t now;
+
+ /* Record the current time as prev_time for the next iteration. */
+ now = ktime_get();
+ sm->prev_time = now;
+}
#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bcc65c8b4ac6..d674f8e8e854 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -4143,8 +4143,11 @@ static inline bool task_has_preferred_cpus(struct task_struct *p)
DECLARE_STATIC_KEY_FALSE(__sched_sm_enable);

void sched_push_current_non_preferred_cpu(struct rq *rq);
+void sched_init_steal_monitor(void);
+void sched_steal_detection_work(struct work_struct *work);
#else /* !CONFIG_PREFERRED_CPU */
static inline void sched_push_current_non_preferred_cpu(struct rq *rq) { }
+static inline void sched_init_steal_monitor(void) { }
#endif

#endif /* _KERNEL_SCHED_SCHED_H */
--
2.47.3