[PATCH v3] Faster Arm64 __arch_copy_from_user and __arch_copy_to_user

From: Qi Xi

Date: Mon Mar 16 2026 - 08:43:39 EST


Based on Ben Niu's "Faster Arm64 __arch_copy_from_user and
__arch_copy_to_user" patch [1], this implementation further optimizes
and simplifies user space copies by:

1. Limiting the optimization scope to copies of >=128 bytes, where the
PAN-dependent fast path pays off. For copies of <128 bytes, the
implementation uses unprivileged instructions uniformly, simplifying the
code and reducing maintenance cost.
2. Adding "arm64.nopan" cmdline support using the standard idreg-override
framework, allowing runtime PAN disable without building separate
CONFIG_ARM64_PAN=y/n kernels as required by Ben Niu's version.
The implementation maintains separate paths for PAN-enabled (using
unprivileged ldtr/sttr) and PAN-disabled (using standard ldp/stp), with
runtime selection via ALTERNATIVE() at the large copy loop entry.
3. Retaining the critical path optimization from the original patch:
reducing pointer update instructions through manual batch updates,
processing 64 bytes per iteration with only one pair of add instructions.

Performance improvements measured on Kunpeng 920 with PAN disabled:

The ku_copy microbenchmark [2] (a kernel module that measures
copy_to/from_user throughput across various sizes by copying 1GB of
data in each test):
copy_to_user throughput change (positive = improvement):
128B: +0.9% 256B: +10.3% 512B: +23.3% 1024B: +38.1%
2048B: +56.2% 4096B: +68.5% 8192B: +74.8% 16384B: +79.7%
32768B: +80.7% 65536B: +81.3% 131072B: +77.3% 262144B: +77.9%
copy_from_user throughput change:
128B: +2.0% 256B: +7.5% 512B: +20.3% 1024B: +28.4%
2048B: +38.1% 4096B: +39.6% 8192B: +41.5% 16384B: +42.3%
32768B: +42.2% 65536B: +44.8% 131072B: +70.3% 262144B: +71.0%

Real-world workloads:
- RocksDB read-write mixed workload:
Overall throughput improved by 2%.
copy_to_user hotspot reduced from 3.3% to 2.7% of total CPU cycles.
copy_from_user hotspot reduced from 2.25% to 0.85% of total CPU cycles.

- BRPC rdma_performance (server side, baidu_std protocol over TCP):
copy_to_user accounts for ~11.5% of total CPU cycles.
After optimization, server CPU utilization reduced from 64% to 62%
(2% absolute improvement, equivalent to ~17% reduction in
copy_to_user overhead).

[1] https://lore.kernel.org/all/20251018052237.1368504-2-benniu@xxxxxxxx/
[2] https://github.com/mcfi/benchmark/tree/main/ku_copy

Co-developed-by: Ben Niu <benniu@xxxxxxxx>
Signed-off-by: Ben Niu <benniu@xxxxxxxx>
Signed-off-by: Jinjiang Tu <tujinjiang@xxxxxxxxxx>
Signed-off-by: Qi Xi <xiqi2@xxxxxxxxxx>
---
Changes in v3:
- Limit the optimization scope to copies of >=128 bytes.
- Use idreg-override for PAN runtime selection with "arm64.nopan" cmdline.
---
arch/arm64/include/asm/asm-uaccess.h | 22 ++----
arch/arm64/kernel/pi/idreg-override.c | 2 +
arch/arm64/lib/copy_from_user.S | 17 +++-
arch/arm64/lib/copy_template.S | 108 +++++++++++++++++++-------
arch/arm64/lib/copy_to_user.S | 17 +++-
5 files changed, 114 insertions(+), 52 deletions(-)

diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h
index 9148f5a31968..198a05d478fc 100644
--- a/arch/arm64/include/asm/asm-uaccess.h
+++ b/arch/arm64/include/asm/asm-uaccess.h
@@ -70,27 +70,21 @@ alternative_else_nop_endif
* This is complicated as there is no post-increment or pair versions of the
* unprivileged instructions, and USER() only works for single instructions.
*/
- .macro user_ldp l, reg1, reg2, addr, post_inc
-8888: ldtr \reg1, [\addr];
-8889: ldtr \reg2, [\addr, #8];
- add \addr, \addr, \post_inc;
+ .macro user_ldst l, inst, reg, addr, post_inc
+8888: \inst \reg, [\addr];
+ add \addr, \addr, \post_inc;

_asm_extable_uaccess 8888b, \l;
- _asm_extable_uaccess 8889b, \l;
.endm

- .macro user_stp l, reg1, reg2, addr, post_inc
-8888: sttr \reg1, [\addr];
-8889: sttr \reg2, [\addr, #8];
- add \addr, \addr, \post_inc;
+ .macro user_ldst_index l, inst, reg, addr, val
+8888: \inst \reg, [\addr, \val];

- _asm_extable_uaccess 8888b,\l;
- _asm_extable_uaccess 8889b,\l;
+ _asm_extable_uaccess 8888b, \l;
.endm

- .macro user_ldst l, inst, reg, addr, post_inc
-8888: \inst \reg, [\addr];
- add \addr, \addr, \post_inc;
+ .macro user_ldst_pair_index l, inst, reg1, reg2, addr, val
+8888: \inst \reg1, \reg2, [\addr, \val];

_asm_extable_uaccess 8888b, \l;
.endm
diff --git a/arch/arm64/kernel/pi/idreg-override.c b/arch/arm64/kernel/pi/idreg-override.c
index bc57b290e5e7..ac26f1f3aad4 100644
--- a/arch/arm64/kernel/pi/idreg-override.c
+++ b/arch/arm64/kernel/pi/idreg-override.c
@@ -64,6 +64,7 @@ static const struct ftr_set_desc mmfr1 __prel64_initconst = {
.override = &id_aa64mmfr1_override,
.fields = {
FIELD("vh", ID_AA64MMFR1_EL1_VH_SHIFT, mmfr1_vh_filter),
+ FIELD("pan", ID_AA64MMFR1_EL1_PAN_SHIFT, NULL),
{}
},
};
@@ -249,6 +250,7 @@ static const struct {
{ "arm64.nolva", "id_aa64mmfr2.varange=0" },
{ "arm64.no32bit_el0", "id_aa64pfr0.el0=1" },
{ "arm64.nompam", "id_aa64pfr0.mpam=0 id_aa64pfr1.mpam_frac=0" },
+ { "arm64.nopan", "id_aa64mmfr1.pan=0" },
};

static int __init parse_hexdigit(const char *p, u64 *v)
diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
index 400057d607ec..1f578c4d0ae6 100644
--- a/arch/arm64/lib/copy_from_user.S
+++ b/arch/arm64/lib/copy_from_user.S
@@ -44,12 +44,21 @@
str \reg, [\ptr], \val
.endm

- .macro ldp1 reg1, reg2, ptr, val
- user_ldp 9997f, \reg1, \reg2, \ptr, \val
+ .macro ldp_unpriv reg1, reg2, ptr, val
+ user_ldst_index 9997f, ldtr, \reg1, \ptr, \val
+ user_ldst_index 9997f, ldtr, \reg2, \ptr, \val + 8
.endm

- .macro stp1 reg1, reg2, ptr, val
- stp \reg1, \reg2, [\ptr], \val
+ .macro stp_unpriv reg1, reg2, ptr, val
+ stp \reg1, \reg2, [\ptr, \val]
+ .endm
+
+ .macro ldp_priv reg1, reg2, ptr, val
+ user_ldst_pair_index 9997f, ldp, \reg1, \reg2, \ptr, \val
+ .endm
+
+ .macro stp_priv reg1, reg2, ptr, val
+ stp \reg1, \reg2, [\ptr, \val]
.endm

.macro cpy1 dst, src, count
diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S
index 7f2f5a0e2fb9..5ef6dc9bf7d8 100644
--- a/arch/arm64/lib/copy_template.S
+++ b/arch/arm64/lib/copy_template.S
@@ -97,14 +97,20 @@ alternative_else_nop_endif
cmp tmp1w, #0x20
b.eq 1f
b.lt 2f
- ldp1 A_l, A_h, src, #16
- stp1 A_l, A_h, dst, #16
+ ldp_unpriv A_l, A_h, src, #0
+ stp_unpriv A_l, A_h, dst, #0
+ add src, src, #16
+ add dst, dst, #16
1:
- ldp1 A_l, A_h, src, #16
- stp1 A_l, A_h, dst, #16
+ ldp_unpriv A_l, A_h, src, #0
+ stp_unpriv A_l, A_h, dst, #0
+ add src, src, #16
+ add dst, dst, #16
2:
- ldp1 A_l, A_h, src, #16
- stp1 A_l, A_h, dst, #16
+ ldp_unpriv A_l, A_h, src, #0
+ stp_unpriv A_l, A_h, dst, #0
+ add src, src, #16
+ add dst, dst, #16
.Ltiny15:
/*
* Prefer to break one ldp/stp into several load/store to access
@@ -142,14 +148,16 @@ alternative_else_nop_endif
* Less than 128 bytes to copy, so handle 64 here and then jump
* to the tail.
*/
- ldp1 A_l, A_h, src, #16
- stp1 A_l, A_h, dst, #16
- ldp1 B_l, B_h, src, #16
- ldp1 C_l, C_h, src, #16
- stp1 B_l, B_h, dst, #16
- stp1 C_l, C_h, dst, #16
- ldp1 D_l, D_h, src, #16
- stp1 D_l, D_h, dst, #16
+ ldp_unpriv A_l, A_h, src, #0
+ stp_unpriv A_l, A_h, dst, #0
+ ldp_unpriv B_l, B_h, src, #16
+ ldp_unpriv C_l, C_h, src, #32
+ stp_unpriv B_l, B_h, dst, #16
+ stp_unpriv C_l, C_h, dst, #32
+ ldp_unpriv D_l, D_h, src, #48
+ stp_unpriv D_l, D_h, dst, #48
+ add src, src, #64
+ add dst, dst, #64

tst count, #0x3f
b.ne .Ltail63
@@ -161,30 +169,70 @@ alternative_else_nop_endif
*/
.p2align L1_CACHE_SHIFT
.Lcpy_body_large:
+ /* Runtime PAN decision for large copies */
+ ALTERNATIVE("b .Llarge_pan_disabled", "b .Llarge_pan_enabled", ARM64_HAS_PAN)
+
+.Llarge_pan_enabled:
+ /* PAN enabled: user-side accesses use unprivileged ldtr/sttr */
/* pre-get 64 bytes data. */
- ldp1 A_l, A_h, src, #16
- ldp1 B_l, B_h, src, #16
- ldp1 C_l, C_h, src, #16
- ldp1 D_l, D_h, src, #16
+ ldp_unpriv A_l, A_h, src, #0
+ ldp_unpriv B_l, B_h, src, #16
+ ldp_unpriv C_l, C_h, src, #32
+ ldp_unpriv D_l, D_h, src, #48
+ add src, src, #64
+1:
+ /*
+ * interlace the load of next 64 bytes data block with store of the last
+ * loaded 64 bytes data.
+ */
+ stp_unpriv A_l, A_h, dst, #0
+ ldp_unpriv A_l, A_h, src, #0
+ stp_unpriv B_l, B_h, dst, #16
+ ldp_unpriv B_l, B_h, src, #16
+ stp_unpriv C_l, C_h, dst, #32
+ ldp_unpriv C_l, C_h, src, #32
+ stp_unpriv D_l, D_h, dst, #48
+ ldp_unpriv D_l, D_h, src, #48
+ add dst, dst, #64
+ add src, src, #64
+ subs count, count, #64
+ b.ge 1b
+ b .Llarge_done
+
+.Llarge_pan_disabled:
+ /* PAN disabled: user-side accesses may use regular ldp/stp */
+ /* pre-load 64 bytes of data */
+ ldp_priv A_l, A_h, src, #0
+ ldp_priv B_l, B_h, src, #16
+ ldp_priv C_l, C_h, src, #32
+ ldp_priv D_l, D_h, src, #48
+ add src, src, #64
1:
/*
* interlace the load of next 64 bytes data block with store of the last
* loaded 64 bytes data.
*/
- stp1 A_l, A_h, dst, #16
- ldp1 A_l, A_h, src, #16
- stp1 B_l, B_h, dst, #16
- ldp1 B_l, B_h, src, #16
- stp1 C_l, C_h, dst, #16
- ldp1 C_l, C_h, src, #16
- stp1 D_l, D_h, dst, #16
- ldp1 D_l, D_h, src, #16
+ stp_priv A_l, A_h, dst, #0
+ ldp_priv A_l, A_h, src, #0
+ stp_priv B_l, B_h, dst, #16
+ ldp_priv B_l, B_h, src, #16
+ stp_priv C_l, C_h, dst, #32
+ ldp_priv C_l, C_h, src, #32
+ stp_priv D_l, D_h, dst, #48
+ ldp_priv D_l, D_h, src, #48
+ add dst, dst, #64
+ add src, src, #64
subs count, count, #64
b.ge 1b
- stp1 A_l, A_h, dst, #16
- stp1 B_l, B_h, dst, #16
- stp1 C_l, C_h, dst, #16
- stp1 D_l, D_h, dst, #16
+
+.Llarge_done:
+ /* Store the final 64-byte block. stp_unpriv is correct for both */
+ /* paths: sttr also works to user memory when PAN is disabled. */
+ stp_unpriv A_l, A_h, dst, #0
+ stp_unpriv B_l, B_h, dst, #16
+ stp_unpriv C_l, C_h, dst, #32
+ stp_unpriv D_l, D_h, dst, #48
+ add dst, dst, #64

tst count, #0x3f
b.ne .Ltail63
diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
index 819f2e3fc7a9..9738ae96c823 100644
--- a/arch/arm64/lib/copy_to_user.S
+++ b/arch/arm64/lib/copy_to_user.S
@@ -43,12 +43,21 @@
user_ldst 9997f, sttr, \reg, \ptr, \val
.endm

- .macro ldp1 reg1, reg2, ptr, val
- ldp \reg1, \reg2, [\ptr], \val
+ .macro ldp_unpriv reg1, reg2, ptr, val
+ ldp \reg1, \reg2, [\ptr, \val]
.endm

- .macro stp1 reg1, reg2, ptr, val
- user_stp 9997f, \reg1, \reg2, \ptr, \val
+ .macro stp_unpriv reg1, reg2, ptr, val
+ user_ldst_index 9997f, sttr, \reg1, \ptr, \val
+ user_ldst_index 9997f, sttr, \reg2, \ptr, \val + 8
+ .endm
+
+ .macro ldp_priv reg1, reg2, ptr, val
+ ldp \reg1, \reg2, [\ptr, \val]
+ .endm
+
+ .macro stp_priv reg1, reg2, ptr, val
+ user_ldst_pair_index 9997f, stp, \reg1, \reg2, \ptr, \val
.endm

.macro cpy1 dst, src, count
--
2.33.0