[PATCH v4 7/8] x86/string: extend memcpy_flushcache() fixed-size fastpaths

From: Li Zhe

Date: Wed Jun 03 2026 - 04:08:41 EST


Small constant-sized flushcache copies currently inline only the 4, 8,
and 16-byte cases. Larger constant sizes fall back to
__memcpy_flushcache() even when the destination is naturally aligned.

Factor the movnti sequences into 4/8/16/32/64-byte helpers and extend
the inline fastpath coverage to the additional constant sizes 32, 48,
64, 80, and 96 bytes when the destination is 8-byte aligned. Keep the
existing 4/8/16-byte cases handled directly in memcpy_flushcache() so
they do not pick up the extra alignment gating used by the larger
fixed-size helpers.

Because memcpy_streaming() maps aligned transfers to
memcpy_flushcache(), these additional fixed-size cases also stay on
the inline movnti path for that helper.

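Issue the stores in ascending address order so write-combining sees a
forward stream. The helpers use asm volatile with a "memory" clobber,
which the previous open-coded asm did not, so the compiler cannot
reorder the stores. Keep all other sizes on __memcpy_flushcache(), and
return immediately for zero-length copies before any size dispatch.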

Signed-off-by: Li Zhe <lizhe.67@xxxxxxxxxxxxx>
---
arch/x86/include/asm/string_64.h | 122 ++++++++++++++++++++++++++-----
1 file changed, 104 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index aee63108577f..16d1aac2da24 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -83,24 +83,6 @@ int strcmp(const char *cs, const char *ct);
#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
#define __HAVE_ARCH_MEMCPY_FLUSHCACHE 1
void __memcpy_flushcache(void *dst, const void *src, size_t cnt);
-static __always_inline void memcpy_flushcache(void *dst, const void *src, size_t cnt)
-{
- if (__builtin_constant_p(cnt)) {
- switch (cnt) {
- case 4:
- asm ("movntil %1, %0" : "=m"(*(u32 *)dst) : "r"(*(u32 *)src));
- return;
- case 8:
- asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src));
- return;
- case 16:
- asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src));
- asm ("movntiq %1, %0" : "=m"(*(u64 *)(dst + 8)) : "r"(*(u64 *)(src + 8)));
- return;
- }
- }
- __memcpy_flushcache(dst, src, cnt);
-}

/*
* Only map memcpy_streaming() to memcpy_flushcache() when the destination
@@ -115,6 +97,110 @@ static __always_inline bool memcpy_flushcache_nt_safe(const void *dst,
return cnt && IS_ALIGNED(d, 8) && IS_ALIGNED(cnt, 4);
}

+/* Store the 32-bit word at @src to @dst with one movnti (non-temporal store). */
+static __always_inline void memcpy_flushcache_4(void *dst, const void *src)
+{
+	u32 val = *(const u32 *)src;
+
+	asm volatile("movntil %1, %0" : "=m"(*(u32 *)dst) : "r"(val) : "memory");
+}
+
+/* Store the 64-bit word at @src to @dst with one movnti (non-temporal store). */
+static __always_inline void memcpy_flushcache_8(void *dst, const void *src)
+{
+	u64 val = *(const u64 *)src;
+
+	asm volatile("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(val) : "memory");
+}
+
+/* Copy 16 bytes as two 8-byte movnti stores, lower address first. */
+static __always_inline void memcpy_flushcache_16(void *dst, const void *src)
+{
+	memcpy_flushcache_8(dst, src);
+	memcpy_flushcache_8((char *)dst + 8, (const char *)src + 8);
+}
+
+/* Copy 32 bytes as two 16-byte halves, lower address first. */
+static __always_inline void memcpy_flushcache_32(void *dst, const void *src)
+{
+	memcpy_flushcache_16(dst, src);
+	memcpy_flushcache_16((char *)dst + 16, (const char *)src + 16);
+}
+
+/* Copy 64 bytes as two 32-byte halves, lower address first. */
+static __always_inline void memcpy_flushcache_64(void *dst, const void *src)
+{
+	memcpy_flushcache_32(dst, src);
+	memcpy_flushcache_32((char *)dst + 32, (const char *)src + 32);
+}
+
+/*
+ * Inline movnti copy for the constant sizes 32, 48, 64, 80 and 96 bytes.
+ *
+ * Returns true when the copy was done here. Returns false, without storing
+ * anything, when @dst is not 8-byte aligned or @cnt is any other size; the
+ * caller must then use another path. The 4/8/16-byte cases are handled in
+ * memcpy_flushcache() so that they skip this alignment check.
+ */
+static __always_inline bool memcpy_flushcache_large(void *dst, const void *src,
+						    size_t cnt)
+{
+	char *dptr = dst;
+	const char *sptr = src;
+
+	if (!IS_ALIGNED((unsigned long)dst, 8))
+		return false;
+
+	switch (cnt) {
+	case 32:
+		memcpy_flushcache_32(dptr, sptr);
+		return true;
+	case 48:
+		memcpy_flushcache_32(dptr, sptr);
+		memcpy_flushcache_16(dptr + 32, sptr + 32);
+		return true;
+	case 64:
+		memcpy_flushcache_64(dptr, sptr);
+		return true;
+	case 80:
+		memcpy_flushcache_64(dptr, sptr);
+		memcpy_flushcache_16(dptr + 64, sptr + 64);
+		return true;
+	case 96:
+		memcpy_flushcache_64(dptr, sptr);
+		memcpy_flushcache_32(dptr + 64, sptr + 64);
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Copy @cnt bytes from @src to @dst. Constant sizes of 4, 8 and 16 bytes,
+ * and the sizes accepted by memcpy_flushcache_large(), are copied inline;
+ * everything else goes to __memcpy_flushcache(). A zero @cnt does nothing.
+ */
+static __always_inline void memcpy_flushcache(void *dst, const void *src,
+					      size_t cnt)
+{
+	if (!cnt)
+		return;
+
+	if (!__builtin_constant_p(cnt)) {
+		__memcpy_flushcache(dst, src, cnt);
+		return;
+	}
+
+	if (cnt == 4)
+		memcpy_flushcache_4(dst, src);
+	else if (cnt == 8)
+		memcpy_flushcache_8(dst, src);
+	else if (cnt == 16)
+		memcpy_flushcache_16(dst, src);
+	else if (!memcpy_flushcache_large(dst, src, cnt))
+		__memcpy_flushcache(dst, src, cnt);
+}
+
#define __HAVE_ARCH_MEMCPY_STREAMING 1
static __always_inline void memcpy_streaming(void *dst, const void *src,
size_t cnt)
--
2.20.1