Re: [RFC PATCH 2/2] thermal/cpufreq_cooling: Use idle_time to get cpu_load when scx_enabled
From: Qais Yousef
Date: Mon Mar 23 2026 - 21:47:07 EST
On 03/20/26 19:31, Xuewen Yan wrote:
> From: Di Shen <di.shen@xxxxxxxxxx>
>
> Recently, while enabling sched-ext debugging, we observed abnormal behavior
> in our thermal power_allocator’s temperature control.
> Through debugging, we found that the CPU util was too low, causing
> the CPU frequency to remain unrestricted.
>
> This issue stems from the fact that in the sched_cpu_util() function,
> when scx is enabled, cpu_util_cfs becomes zero. As a result,
> the thermal subsystem perceives an extremely low CPU utilization,
> which degrades the effectiveness of the power_allocator’s control.
>
> However, the scx_cpuperf_target() reflects the targeted performance,
> not the utilisation. We couldn't use it.
>
> Until a perfect solution is found, using idle_time to get the cpu load
> might be a better approach.
>
> Co-developed-by: Xuewen Yan <xuewen.yan@xxxxxxxxxx>
> Signed-off-by: Xuewen Yan <xuewen.yan@xxxxxxxxxx>
> Signed-off-by: Di Shen <di.shen@xxxxxxxxxx>
> ---
> Previous discussion:
> https://lore.kernel.org/all/5a5d565b-33ac-4d5c-b0dd-1353324a6117@xxxxxxx/
>
> ---
> drivers/thermal/cpufreq_cooling.c | 54 ++++++++++++++++++++-----------
> 1 file changed, 35 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c
> index d030dbeb2973..e8fa70a95d00 100644
> --- a/drivers/thermal/cpufreq_cooling.c
> +++ b/drivers/thermal/cpufreq_cooling.c
> @@ -24,6 +24,9 @@
> #include <linux/units.h>
>
> #include "thermal_trace.h"
> +#ifdef CONFIG_SCHED_CLASS_EXT
> +#include "../../kernel/sched/sched.h"
> +#endif
This is a terrible include
>
> /*
> * Cooling state <-> CPUFreq frequency
> @@ -72,7 +75,7 @@ struct cpufreq_cooling_device {
> struct em_perf_domain *em;
> struct cpufreq_policy *policy;
> struct thermal_cooling_device_ops cooling_ops;
> -#ifndef CONFIG_SMP
> +#if !defined(CONFIG_SMP) || defined(CONFIG_SCHED_CLASS_EXT)
> struct time_in_idle *idle_time;
> #endif
> struct freq_qos_request qos_req;
> @@ -147,23 +150,9 @@ static u32 cpu_power_to_freq(struct cpufreq_cooling_device *cpufreq_cdev,
> return freq;
> }
>
> -/**
> - * get_load() - get load for a cpu
> - * @cpufreq_cdev: struct cpufreq_cooling_device for the cpu
> - * @cpu: cpu number
> - *
> - * Return: The average load of cpu @cpu in percentage since this
> - * function was last called.
> - */
> -#ifdef CONFIG_SMP
> -static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu)
> -{
> - unsigned long util = sched_cpu_util(cpu);
> -
> - return (util * 100) / arch_scale_cpu_capacity(cpu);
> -}
> -#else /* !CONFIG_SMP */
> -static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu)
> +#if !defined(CONFIG_SMP) || defined(CONFIG_SCHED_CLASS_EXT)
> +static u32 get_load_from_idle_time(struct cpufreq_cooling_device *cpufreq_cdev,
> + int cpu)
> {
> u32 load;
> u64 now, now_idle, delta_time, delta_idle;
> @@ -183,8 +172,35 @@ static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu)
>
> return load;
> }
> -#endif /* CONFIG_SMP */
> +#endif /* !defined(CONFIG_SMP) || defined(CONFIG_SCHED_CLASS_EXT) */
More ugly ifdefs
>
> +/**
> + * get_load() - get load for a cpu
> + * @cpufreq_cdev: struct cpufreq_cooling_device for the cpu
> + * @cpu: cpu number
> + *
> + * Return: The average load of cpu @cpu in percentage since this
> + * function was last called.
> + */
> +#ifndef CONFIG_SMP
> +static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu,
> + int cpu_idx)
> +{
> + return get_load_from_idle_time(cpufreq_cdev, cpu, cpu_idx);
> +}
> +#else /* CONFIG_SMP */
> +static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu)
> +{
> + unsigned long util;
> +
> +#ifdef CONFIG_SCHED_CLASS_EXT
> + if (scx_enabled())
> + return get_load_from_idle_time(cpufreq_cdev, cpu);
> +#endif
Instead of this scx-specific hack, wouldn't it be better to implement this as
a special operation mode? But then this raises the question: do we actually
need sched_cpu_util() at all if everything can be done based on idle time, or
can we just remove the dependency on sched_cpu_util()?
Ifdefing based on scx is a nasty hack; this can be done better, most likely by
decoupling the dependency on util if the idle time is truly enough. If it is
not enough, then I am not sure this will solve any problem.
> + util = sched_cpu_util(cpu);
> + return (util * 100) / arch_scale_cpu_capacity(cpu);
> +}
> +#endif /* !CONFIG_SMP */
> /**
> * get_dynamic_power() - calculate the dynamic power
> * @cpufreq_cdev: &cpufreq_cooling_device for this cdev
> --
> 2.25.1
>