[PATCH 4/9] s390/percpu: Use new percpu code section for arch_this_cpu_add()
From: Heiko Carstens
Date: Tue Mar 17 2026 - 15:59:50 EST
Convert arch_this_cpu_add() to make use of the new percpu code section
infrastructure.
With this change, the text size of the kernel image is reduced by ~76kb
(defconfig). In addition, more than 5300 generated calls to
preempt_schedule_notrace() within the kernel image (modules not counted) are removed.
With:
DEFINE_PER_CPU(long, foo);
void bar(long a) { this_cpu_add(foo, a); }
Old arch_this_cpu_add() looks like this:
00000000000000c0 <bar>:
c0: c0 04 00 00 00 00 jgnop c0 <bar>
c6: eb 01 03 a8 00 6a asi 936,1
cc: c4 18 00 00 00 00 lgrl %r1,cc <bar+0xc>
ce: R_390_GOTENT foo+0x2
d2: e3 10 03 b8 00 08 ag %r1,952
d8: eb 22 10 00 00 e8 laag %r2,%r2,0(%r1)
de: eb ff 03 a8 00 6e alsi 936,-1
e4: a7 a4 00 05 jhe ee <bar+0x2e>
e8: c0 f4 00 00 00 00 jg e8 <bar+0x28>
ea: R_390_PC32DBL __s390_indirect_jump_r14+0x2
ee: c0 f4 00 00 00 00 jg ee <bar+0x2e>
f0: R_390_PLT32DBL preempt_schedule_notrace+0x2
New arch_this_cpu_add() looks like this:
00000000000000c0 <bar>:
c0: c0 04 00 00 00 00 jgnop c0 <bar>
c6: c4 38 00 00 00 00 lgrl %r3,c6 <bar+0x6>
c8: R_390_GOTENT foo+0x2
cc: b9 04 00 43 lgr %r4,%r3
d0: eb 00 43 c0 00 52 mviy 960(%r4),0
d6: e3 40 03 b8 00 08 ag %r4,952
dc: eb 52 40 00 00 e8 laag %r5,%r2,0(%r4)
e2: eb 00 03 c0 00 52 mviy 960,0
e8: c0 f4 00 00 00 00 jg e8 <bar+0x28>
ea: R_390_PC32DBL __s390_indirect_jump_r14+0x2
Note that the conditional call to preempt_schedule_notrace() is removed.
Signed-off-by: Heiko Carstens <hca@xxxxxxxxxxxxx>
---
arch/s390/include/asm/percpu.h | 65 ++++++++++++++++++++++------------
1 file changed, 43 insertions(+), 22 deletions(-)
diff --git a/arch/s390/include/asm/percpu.h b/arch/s390/include/asm/percpu.h
index 05eb91428b42..c92b41efbc47 100644
--- a/arch/s390/include/asm/percpu.h
+++ b/arch/s390/include/asm/percpu.h
@@ -127,28 +127,49 @@
#else /* MARCH_HAS_Z196_FEATURES */
-#define arch_this_cpu_add(pcp, val, op1, op2, szcast) \
-{ \
- typedef typeof(pcp) pcp_op_T__; \
- pcp_op_T__ val__ = (val); \
- pcp_op_T__ old__, *ptr__; \
- preempt_disable_notrace(); \
- ptr__ = raw_cpu_ptr(&(pcp)); \
- if (__builtin_constant_p(val__) && \
- ((szcast)val__ > -129) && ((szcast)val__ < 128)) { \
- asm volatile( \
- op2 " %[ptr__],%[val__]" \
- : [ptr__] "+Q" (*ptr__) \
- : [val__] "i" ((szcast)val__) \
- : "cc"); \
- } else { \
- asm volatile( \
- op1 " %[old__],%[val__],%[ptr__]" \
- : [old__] "=d" (old__), [ptr__] "+Q" (*ptr__) \
- : [val__] "d" (val__) \
- : "cc"); \
- } \
- preempt_enable_notrace(); \
+#define arch_this_cpu_add(pcp, val, op1, op2, szcast) \
+{ \
+ unsigned long lc_pcpr, lc_pcpo; \
+ typedef typeof(pcp) pcp_op_T__; \
+ pcp_op_T__ val__ = (val); \
+ pcp_op_T__ old__, *ptr__; \
+ \
+ lc_pcpr = offsetof(struct lowcore, percpu_register); \
+ lc_pcpo = offsetof(struct lowcore, percpu_offset); \
+ ptr__ = &(pcp); \
+ if (__builtin_constant_p(val__) && \
+ ((szcast)val__ > -129) && ((szcast)val__ < 128)) { \
+ asm volatile( \
+ MVIY_PERCPU("%[disppcpr]", "%[dispaltpcpr]", "%[ptr__]")\
+ AG_ALT("%[disppcpo]", "%[dispaltpcpo]", "%[ptr__]") \
+ op2 " 0(%[ptr__]),%[val__]\n" \
+ MVIY_ALT("%[disppcpr]", "%[dispaltpcpr]", "%%r0") \
+ : [ptr__] "+&a" (ptr__), "+m" (*ptr__), \
+ "=m" (((struct lowcore *)0)->percpu_register) \
+ : [val__] "i" ((szcast)val__), \
+ [disppcpr] "i" (lc_pcpr), \
+ [disppcpo] "i" (lc_pcpo), \
+ [dispaltpcpr] "i" (lc_pcpr + LOWCORE_ALT_ADDRESS), \
+ [dispaltpcpo] "i" (lc_pcpo + LOWCORE_ALT_ADDRESS), \
+ "m" (((struct lowcore *)0)->percpu_offset) \
+ : "cc"); \
+ } else { \
+ asm volatile( \
+ MVIY_PERCPU("%[disppcpr]", "%[dispaltpcpr]", "%[ptr__]")\
+ AG_ALT("%[disppcpo]", "%[dispaltpcpo]", "%[ptr__]") \
+ op1 " %[old__],%[val__],0(%[ptr__])\n" \
+ MVIY_ALT("%[disppcpr]", "%[dispaltpcpr]", "%%r0") \
+ : [old__] "=&d" (old__), \
+ [ptr__] "+&a" (ptr__), "+m" (*ptr__), \
+ "=m" (((struct lowcore *)0)->percpu_register) \
+ : [val__] "d" (val__), \
+ [disppcpr] "i" (lc_pcpr), \
+ [disppcpo] "i" (lc_pcpo), \
+ [dispaltpcpr] "i" (lc_pcpr + LOWCORE_ALT_ADDRESS), \
+ [dispaltpcpo] "i" (lc_pcpo + LOWCORE_ALT_ADDRESS), \
+ "m" (((struct lowcore *)0)->percpu_offset) \
+ : "cc"); \
+ } \
}
#define this_cpu_add_4(pcp, val) arch_this_cpu_add(pcp, val, "laa", "asi", int)
--
2.51.0