[PATCH v2] fhandle: fix UAF due to unlocked ->mnt_ns read in may_decode_fh()

From: Jann Horn

Date: Wed Jun 03 2026 - 15:36:49 EST

may_decode_fh() accesses mount::mnt_ns without holding any locks; that
means the mount can concurrently be unmounted, and the mnt_namespace can
concurrently be freed after an RCU grace period.

This race can happens as follows, assuming that the mount point was
created by open_tree(..., OPEN_TREE_CLONE):

thread 1 thread 2 RCU
__do_sys_open_by_handle_at
do_handle_open
handle_to_path
may_decode_fh
is_mounted
[mount::mnt_ns access]
[mount::mnt_ns access]
__do_sys_close
fput_close_sync
__fput
dissolve_on_fput
umount_tree
class_namespace_excl_destructor
namespace_unlock
free_mnt_ns
mnt_ns_tree_remove
call_rcu(mnt_ns_release_rcu)
mnt_ns_release_rcu
mnt_ns_release
kfree
[mnt_namespace::user_ns access] **UAF**

Fix it by taking rcu_read_lock() around the mount::mnt_ns access, like
in __prepend_path().
Additionally, document the semantics of mount::mnt_ns, and use WRITE_ONCE()
for writers that can race with lockless readers.

This bug is unreachable unless one of the following is set:

- CONFIG_PREEMPTION
- CONFIG_RCU_STRICT_GRACE_PERIOD

because it requires an RCU grace period to happen during a syscall without
an explicit preemption.

This doesn't seem to have interesting security impact; worst-case, it could
leak the result of an integer comparison to userspace (from the level
check in cap_capable()), cause an endless loop, or crash the kernel by
dereferencing an invalid address.

Fixes: 620c266f3949 ("fhandle: relax open_by_handle_at() permission checks")
Cc: stable@xxxxxxxxxxxxxxx
Signed-off-by: Jann Horn <jannh@xxxxxxxxxx>
---
I used custom tooling to force this race condition to occur and check
that it leads to a KASAN splat - let me know if you want me to create a
kernel patch to force the race condition and a reproducer you can run.

I remember Christian asking me for feedback on the patch that introduced
the bug, and I missed the bug because I didn't realize what the semantics
of mount::mnt_ns are...
---
Changes in v2:
- improve comment on mnt_ns semantics based on discussion with viro@
- Link to v1: https://patch.msgid.link/20260603-vfs-fhandle-uaf-fix-v1-1-ff64ee367e4d@xxxxxxxxxx
---
fs/fhandle.c | 16 ++++++++++++++--
fs/mount.h | 10 +++++++++-
fs/namespace.c | 6 +++---
3 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/fs/fhandle.c b/fs/fhandle.c
index 642e3d569497..1ca7eb3a6cb5 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -285,6 +285,19 @@ static int do_handle_to_path(struct file_handle *handle, struct path *path,
return 0;
}

+static bool capable_wrt_mount(struct mount *mount)
+{
+ struct mnt_namespace *mnt_ns;
+
+ /*
+ * For ->mnt_ns access.
+ * The following READ_ONCE() is semantically rcu_dereference().
+ */
+ guard(rcu)();
+ mnt_ns = READ_ONCE(mount->mnt_ns);
+ return ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN);
+}
+
static inline int may_decode_fh(struct handle_to_path_ctx *ctx,
unsigned int o_flags)
{
@@ -320,8 +333,7 @@ static inline int may_decode_fh(struct handle_to_path_ctx *ctx,
if (ns_capable(root->mnt->mnt_sb->s_user_ns, CAP_SYS_ADMIN))
ctx->flags = HANDLE_CHECK_PERMS;
else if (is_mounted(root->mnt) &&
- ns_capable(real_mount(root->mnt)->mnt_ns->user_ns,
- CAP_SYS_ADMIN) &&
+ capable_wrt_mount(real_mount(root->mnt)) &&
!has_locked_children(real_mount(root->mnt), root->dentry))
ctx->flags = HANDLE_CHECK_PERMS | HANDLE_CHECK_SUBTREE;
else
diff --git a/fs/mount.h b/fs/mount.h
index e0816c11a198..5c120f8361bd 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -71,7 +71,15 @@ struct mount {
struct hlist_head mnt_slave_list;/* list of slave mounts */
struct hlist_node mnt_slave; /* slave list entry */
struct mount *mnt_master; /* slave is on master->mnt_slave_list */
- struct mnt_namespace *mnt_ns; /* containing namespace */
+ /*
+ * Containing namespace (active or deactivating, non-refcounted).
+ * Normally protected by namespace_sem.
+ * Can also be accessed locklessly under RCU. RCU readers can't rely on
+ * the namespace still being active, but implicitly hold a passive
+ * reference (because an RCU delay happens between a namespace being
+ * deactivated and the corresponding passive refcount drop).
+ */
+ struct mnt_namespace *mnt_ns;
struct mountpoint *mnt_mp; /* where is it mounted */
union {
struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */
diff --git a/fs/namespace.c b/fs/namespace.c
index fe919abd2f01..f5905f4ec560 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1079,7 +1079,7 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
bool mnt_first_node = true, mnt_last_node = true;

WARN_ON(mnt_ns_attached(mnt));
- mnt->mnt_ns = ns;
+ WRITE_ONCE(mnt->mnt_ns, ns);
while (*link) {
parent = *link;
if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique) {
@@ -1434,7 +1434,7 @@ EXPORT_SYMBOL(mntget);
void mnt_make_shortterm(struct vfsmount *mnt)
{
if (mnt)
- real_mount(mnt)->mnt_ns = NULL;
+ WRITE_ONCE(real_mount(mnt)->mnt_ns, NULL);
}

/**
@@ -1806,7 +1806,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
ns->nr_mounts--;
__touch_mnt_namespace(ns);
}
- p->mnt_ns = NULL;
+ WRITE_ONCE(p->mnt_ns, NULL);
if (how & UMOUNT_SYNC)
p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;

---
base-commit: ba3e43a9e601636f5edb54e259a74f96ca3b8fd8
change-id: 20260603-vfs-fhandle-uaf-fix-32279d5b2758

Best regards,
--
Jann Horn <jannh@xxxxxxxxxx>