[patch 8/8] x86/vdso: Implement __vdso_futex_robust_try_unlock()

From: Thomas Gleixner

Date: Mon Mar 16 2026 - 13:14:46 EST


When the FUTEX_ROBUST_UNLOCK mechanism is used for unlocking (PI-)futexes,
then the unlock sequence in userspace looks like this:

1) robust_list_set_op_pending(mutex);
2) robust_list_remove(mutex);

lval = gettid();
3) if (atomic_try_cmpxchg(&mutex->lock, lval, 0))
4) robust_list_clear_op_pending();
else
5) sys_futex(OP,...FUTEX_ROBUST_UNLOCK);

That still leaves a minimal race window between #3 and #4 where the mutex
could be acquired by some other task which observes that it is the last
user and:

1) unmaps the mutex memory
2) maps a different file, which ends up covering the same address

When the original task then exits before reaching #4, the kernel robust
list handling observes the pending op entry and tries to fix up user space.

In case that the newly mapped data contains the TID of the exiting thread
at the address of the mutex/futex the kernel will set the owner died bit in
that memory and therefore corrupt unrelated data.

Provide a VDSO function which exposes the critical section window in the
VDSO symbol table. The resulting addresses are updated in the task's mm
when the VDSO is (re)map()'ed.

The core code detects when a task was interrupted within the critical
section and is about to deliver a signal. It then invokes an architecture
specific function which determines whether the pending op pointer has to be
cleared or not. The assembly sequence for the non COMPAT case is:

mov %esi,%eax // Load TID into EAX
xor %ecx,%ecx // Set ECX to 0
lock cmpxchg %ecx,(%rdi) // Try the TID -> 0 transition
.Lstart:
jnz .Lend
movq $0x0,(%rdx) // Clear list_op_pending
.Lend:
ret

So the decision can be simply based on the ZF state in regs->flags.

If COMPAT is enabled then the try_unlock() function needs to take the size
bit in the OP pointer into account, which makes it slightly more complex:

mov %esi,%eax // Load TID into EAX
mov %rdx,%rsi // Get the op pointer
xor %ecx,%ecx // Set ECX to 0
and $0xfffffffffffffffe,%rsi // Clear the size bit
lock cmpxchg %ecx,(%rdi) // Try the TID -> 0 transition
.Lstart:
jnz .Lend
.Lsuccess:
testl $0x1,%edx // Test the size bit in the op pointer
jz .Lop64 // Not set: 64-bit
movl $0x0,(%rsi) // Clear 32-bit
jmp .Lend
.Lop64:
movq $0x0,(%rsi) // Clear 64-bit
.Lend:
ret

The decision function has to check whether regs->ip is in the success
portion as the size bit test obviously modifies ZF too. If it is before
.Lsuccess then ZF contains the cmpxchg() result. If it's at or after
.Lsuccess then the pointer has to be cleared.

The original pointer with the size bit is preserved in RDX so the fixup can
utilize the existing clearing mechanism, which is used by sys_futex().

Arguably this could be avoided by providing separate functions and making
the IP range for the quick check in the exit to user path cover the whole
text section which contains the two functions. But that's not a win at all
because:

1) User space needs to handle the two variants instead of just
relying on a bit which can be saved in the mutex at
initialization time.

2) The fixup decision function has then to evaluate which code path is
used. That just adds more symbols and range checking for no real
value.

The unlock function is inspired by an idea from Mathieu Desnoyers.

Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxx>
Link: https://lore.kernel.org/20260311185409.1988269-1-mathieu.desnoyers@xxxxxxxxxxxx
---
arch/x86/Kconfig | 1
arch/x86/entry/vdso/common/vfutex.c | 72 +++++++++++++++++++++++++++++++
arch/x86/entry/vdso/vdso32/Makefile | 5 +-
arch/x86/entry/vdso/vdso32/vdso32.lds.S | 6 ++
arch/x86/entry/vdso/vdso32/vfutex.c | 1
arch/x86/entry/vdso/vdso64/Makefile | 7 +--
arch/x86/entry/vdso/vdso64/vdso64.lds.S | 6 ++
arch/x86/entry/vdso/vdso64/vdsox32.lds.S | 6 ++
arch/x86/entry/vdso/vdso64/vfutex.c | 1
arch/x86/include/asm/futex_robust.h | 44 ++++++++++++++++++
10 files changed, 144 insertions(+), 5 deletions(-)

--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -237,6 +237,7 @@ config X86
select HAVE_EFFICIENT_UNALIGNED_ACCESS
select HAVE_EISA if X86_32
select HAVE_EXIT_THREAD
+ select HAVE_FUTEX_ROBUST_UNLOCK
select HAVE_GENERIC_TIF_BITS
select HAVE_GUP_FAST
select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE
--- /dev/null
+++ b/arch/x86/entry/vdso/common/vfutex.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <vdso/futex.h>
+
+/*
+ * Compat enabled kernels have to take the size bit into account to support the
+ * mixed size use case of gaming emulators. Contrary to the kernel robust unlock
+ * mechanism all of this does not test for the 32-bit modifier in 32-bit VDSOs
+ * and in compat disabled kernels. User space can keep the pieces.
+ */
+#if defined(CONFIG_X86_64) && !defined(BUILD_VDSO32_64)
+
+#ifdef CONFIG_COMPAT
+
+# define ASM_CLEAR_PTR \
+ " testl $1, %k[pop] \n" /* Size bit is in the pointer, not in memory */ \
+ " jz .Lop64 \n" \
+ " movl $0, (%[pad]) \n" \
+ " jmp __vdso_futex_robust_try_unlock_cs_end \n" \
+ ".Lop64: \n" \
+ " movq $0, (%[pad]) \n"
+
+# define ASM_PAD_CONSTRAINT ,[pad] "S" (((unsigned long)pop) & ~0x1UL)
+
+#else /* CONFIG_COMPAT */
+
+# define ASM_CLEAR_PTR \
+ " movq $0, (%[pop]) \n"
+
+# define ASM_PAD_CONSTRAINT
+
+#endif /* !CONFIG_COMPAT */
+
+#else /* CONFIG_X86_64 && !BUILD_VDSO32_64 */
+
+# define ASM_CLEAR_PTR \
+ " movl $0, (%[pad]) \n"
+
+# define ASM_PAD_CONSTRAINT ,[pad] "S" (((unsigned long)pop) & ~0x1UL)
+
+#endif /* !CONFIG_X86_64 || BUILD_VDSO32_64 */
+
+uint32_t __vdso_futex_robust_try_unlock(uint32_t *lock, uint32_t tid, void *pop)
+{
+ asm volatile (
+ ".global __vdso_futex_robust_try_unlock_cs_start \n"
+ ".global __vdso_futex_robust_try_unlock_cs_success \n"
+ ".global __vdso_futex_robust_try_unlock_cs_end \n"
+ " \n"
+ " lock cmpxchgl %[val], (%[ptr]) \n"
+ " \n"
+ "__vdso_futex_robust_try_unlock_cs_start: \n"
+ " \n"
+ " jnz __vdso_futex_robust_try_unlock_cs_end \n"
+ " \n"
+ "__vdso_futex_robust_try_unlock_cs_success: \n"
+ " \n"
+ ASM_CLEAR_PTR
+ " \n"
+ "__vdso_futex_robust_try_unlock_cs_end: \n"
+ : [tid] "+a" (tid)
+ : [ptr] "D" (lock),
+ [pop] "d" (pop),
+ [val] "r" (0)
+ ASM_PAD_CONSTRAINT
+ : "memory"
+ );
+
+ return tid;
+}
+
+uint32_t futex_robust_try_unlock(uint32_t *, uint32_t, void *)
+ __attribute__((weak, alias("__vdso_futex_robust_try_unlock")));
--- a/arch/x86/entry/vdso/vdso32/Makefile
+++ b/arch/x86/entry/vdso/vdso32/Makefile
@@ -7,8 +7,9 @@
vdsos-y := 32

# Files to link into the vDSO:
-vobjs-y := note.o vclock_gettime.o vgetcpu.o
-vobjs-y += system_call.o sigreturn.o
+vobjs-y := note.o vclock_gettime.o vgetcpu.o
+vobjs-y += system_call.o sigreturn.o
+vobjs-$(CONFIG_FUTEX_ROBUST_UNLOCK) += vfutex.o

# Compilation flags
flags-y := -DBUILD_VDSO32 -m32 -mregparm=0
--- a/arch/x86/entry/vdso/vdso32/vdso32.lds.S
+++ b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
@@ -30,6 +30,12 @@ VERSION
__vdso_clock_gettime64;
__vdso_clock_getres_time64;
__vdso_getcpu;
+#ifdef CONFIG_FUTEX_ROBUST_UNLOCK
+ __vdso_futex_robust_try_unlock;
+ __vdso_futex_robust_try_unlock_cs_start;
+ __vdso_futex_robust_try_unlock_cs_success;
+ __vdso_futex_robust_try_unlock_cs_end;
+#endif
};

LINUX_2.5 {
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/vfutex.c
@@ -0,0 +1 @@
+#include "common/vfutex.c"
--- a/arch/x86/entry/vdso/vdso64/Makefile
+++ b/arch/x86/entry/vdso/vdso64/Makefile
@@ -8,9 +8,10 @@ vdsos-y := 64
vdsos-$(CONFIG_X86_X32_ABI) += x32

# Files to link into the vDSO:
-vobjs-y := note.o vclock_gettime.o vgetcpu.o
-vobjs-y += vgetrandom.o vgetrandom-chacha.o
-vobjs-$(CONFIG_X86_SGX) += vsgx.o
+vobjs-y := note.o vclock_gettime.o vgetcpu.o
+vobjs-y += vgetrandom.o vgetrandom-chacha.o
+vobjs-$(CONFIG_X86_SGX) += vsgx.o
+vobjs-$(CONFIG_FUTEX_ROBUST_UNLOCK) += vfutex.o

# Compilation flags
flags-y := -DBUILD_VDSO64 -m64 -mcmodel=small
--- a/arch/x86/entry/vdso/vdso64/vdso64.lds.S
+++ b/arch/x86/entry/vdso/vdso64/vdso64.lds.S
@@ -32,6 +32,12 @@ VERSION {
#endif
getrandom;
__vdso_getrandom;
+#ifdef CONFIG_FUTEX_ROBUST_UNLOCK
+ __vdso_futex_robust_try_unlock;
+ __vdso_futex_robust_try_unlock_cs_start;
+ __vdso_futex_robust_try_unlock_cs_success;
+ __vdso_futex_robust_try_unlock_cs_end;
+#endif
local: *;
};
}
--- a/arch/x86/entry/vdso/vdso64/vdsox32.lds.S
+++ b/arch/x86/entry/vdso/vdso64/vdsox32.lds.S
@@ -22,6 +22,12 @@ VERSION {
__vdso_getcpu;
__vdso_time;
__vdso_clock_getres;
+#ifdef CONFIG_FUTEX_ROBUST_UNLOCK
+ __vdso_futex_robust_try_unlock;
+ __vdso_futex_robust_try_unlock_cs_start;
+ __vdso_futex_robust_try_unlock_cs_success;
+ __vdso_futex_robust_try_unlock_cs_end;
+#endif
local: *;
};
}
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso64/vfutex.c
@@ -0,0 +1 @@
+#include "common/vfutex.c"
--- /dev/null
+++ b/arch/x86/include/asm/futex_robust.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_FUTEX_ROBUST_H
+#define _ASM_X86_FUTEX_ROBUST_H
+
+#include <asm/ptrace.h>
+
+static __always_inline bool x86_futex_needs_robust_unlock_fixup(struct pt_regs *regs)
+{
+ /*
+ * This is tricky in the compat case as it has to take the size check
+ * into account. See the ASM magic in the VDSO vfutex code, which is
+ * conditional on CONFIG_COMPAT. If compat is disabled or this is a
+ * 32-bit kernel then ZF is authoritative no matter what.
+ */
+ if (!IS_ENABLED(CONFIG_X86_64) || !IS_ENABLED(CONFIG_COMPAT))
+ return !!(regs->flags & X86_EFLAGS_ZF);
+
+ /*
+ * For the compat case, the core code already established that regs->ip
+ * is >= cs_start and < cs_end. Now check whether it is at the
+ * conditional jump which checks the cmpxchg() or if it succeeded and
+ * does the size check, which obviously modifies ZF too.
+ */
+ if (regs->ip >= current->mm->futex.unlock_cs_success_ip)
+ return true;
+ /*
+ * It's at the jnz right after the cmpxchg(). ZF tells whether this
+ * succeeded or not.
+ */
+ return !!(regs->flags & X86_EFLAGS_ZF);
+}
+
+#define arch_futex_needs_robust_unlock_fixup(regs) \
+ x86_futex_needs_robust_unlock_fixup(regs)
+
+static __always_inline void __user *x86_futex_robust_unlock_get_pop(struct pt_regs *regs)
+{
+ return (void __user *)regs->dx;
+}
+
+#define arch_futex_robust_unlock_get_pop(regs) \
+ x86_futex_robust_unlock_get_pop(regs)
+
+#endif /* _ASM_X86_FUTEX_ROBUST_H */