[PATCH v2] mm/madvise: prefer VMA lock for MADV_REMOVE
From: jiang.kun2
Date: Fri Apr 10 2026 - 04:08:00 EST
From: Jiang Kun <jiang.kun2@xxxxxxxxxx>
Make MADV_REMOVE prefer the per-VMA read lock for single-VMA, local-mm,
non-UFFD-armed ranges, avoiding mmap_lock contention for such ranges.
However, calling into the filesystem while holding vm_lock (VMA lock) can
create lock ordering issues. syzbot reported a possible deadlock in
blkdev_fallocate() when vfs_fallocate() is called under vm_lock.
Avoid this by taking an extra reference to the file and then dropping the
VMA lock before invoking vfs_fallocate(). Keep the existing mmap_lock
fallback path and its userfaultfd coordination unchanged.
Repeated benchmark runs show no regression in the uncontended case, and show
benefit once mmap_lock contention is introduced.
Link: https://ci.syzbot.org/series/30acb9df-ca55-4cbf-81ed-89b84da8edc1
Link: https://lore.kernel.org/all/aWcZCwz__qwwKbxw@xxxxxxxxxxxxxxxxxxxx/
Signed-off-by: Jiang Kun <jiang.kun2@xxxxxxxxxx>
Signed-off-by: Yaxin Wang <wang.yaxin@xxxxxxxxxx>
---
mm/madvise.c | 21 +++++++++++++++++----
1 file changed, 17 insertions(+), 4 deletions(-)
diff --git a/mm/madvise.c b/mm/madvise.c
index 69708e953cf5..0932579bccb4 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1008,8 +1008,6 @@ static long madvise_remove(struct madvise_behavior *madv_behavior)
unsigned long start = madv_behavior->range.start;
unsigned long end = madv_behavior->range.end;
- mark_mmap_lock_dropped(madv_behavior);
-
if (vma->vm_flags & VM_LOCKED)
return -EINVAL;
@@ -1025,6 +1023,20 @@ static long madvise_remove(struct madvise_behavior *madv_behavior)
offset = (loff_t)(start - vma->vm_start)
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+ /* Avoid calling into the filesystem while holding a VMA lock. */
+ if (madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK) {
+ get_file(f);
+ vma_end_read(vma);
+ madv_behavior->vma = NULL;
+ error = vfs_fallocate(f,
+ FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+ offset, end - start);
+ fput(f);
+ return error;
+ }
+
+ mark_mmap_lock_dropped(madv_behavior);
+
/*
* Filesystem's fallocate may need to take i_rwsem. We need to
* explicitly grab a reference because the vma (and hence the
@@ -1677,7 +1689,8 @@ int madvise_walk_vmas(struct madvise_behavior *madv_behavior)
if (madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK &&
try_vma_read_lock(madv_behavior)) {
error = madvise_vma_behavior(madv_behavior);
- vma_end_read(madv_behavior->vma);
+ if (madv_behavior->vma)
+ vma_end_read(madv_behavior->vma);
return error;
}
@@ -1746,7 +1759,6 @@ static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavi
return MADVISE_NO_LOCK;
switch (madv_behavior->behavior) {
- case MADV_REMOVE:
case MADV_WILLNEED:
case MADV_COLD:
case MADV_PAGEOUT:
@@ -1754,6 +1766,7 @@ static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavi
case MADV_POPULATE_WRITE:
case MADV_COLLAPSE:
return MADVISE_MMAP_READ_LOCK;
+ case MADV_REMOVE:
case MADV_GUARD_INSTALL:
case MADV_GUARD_REMOVE:
case MADV_DONTNEED:
--
2.53.0