Re: [PATCH 2/2] fs/proc/task_mmu: read proc/pid/smaps_rollup under per-vma lock
From: Suren Baghdasaryan
Date: Fri Jun 05 2026 - 22:01:24 EST
On Fri, Jun 5, 2026 at 6:57 PM Suren Baghdasaryan <surenb@xxxxxxxxxx> wrote:
>
> proc/pid/smaps_rollup can be read using the combination of RCU and
> VMA read locks, similar to proc/pid/{maps|smaps|numa_maps}. RCU is
> required to safely traverse the VMA tree and VMA lock stabilizes the
> VMA being processed and the pagetable walk.
> Note that we have to keep the logic to drop mmap_lock on contention
> because even when using per-VMA locks we might have to fall back to
> holding the mmap_lock.
>
> Running Paul's contention benchmark [1] shows considerable improvement
> both in median and in the worst case latencies:
>
> Execution command: run-proc-vs-map.sh --nsamples 20 --rawdata -- \
> --busyduration 2 --procfile smaps_rollup
>
> Baseline:
Forgot to add the names for the columns:
# Median Minimum Maximum
> 0.174 0.161 2.553
> 0.174 0.164 2.663
> 0.174 0.165 2.664
> 0.174 0.166 2.679
> 0.174 0.167 2.691
> 0.174 0.168 2.704
> 0.174 0.169 2.729
> 0.174 0.172 2.741
> 0.174 0.174 2.745
> 0.174 0.174 2.755
> 0.174 0.175 2.790
> 0.174 0.177 2.809
> 0.174 0.179 3.096
> 0.174 0.183 3.144
> 0.174 0.184 3.158
> 0.174 0.185 3.175
> 0.174 0.185 4.568
> 0.174 0.198 4.821
> 0.174 0.214 5.143
> 0.174 0.251 5.220
>
> Patched:
>
> 0.007 0.007 1.952
> 0.007 0.007 1.955
> 0.007 0.007 1.955
> 0.007 0.007 1.955
> 0.007 0.007 1.957
> 0.007 0.007 1.969
> 0.007 0.007 2.065
> 0.007 0.007 2.075
> 0.007 0.007 2.146
> 0.007 0.007 2.195
> 0.007 0.007 2.223
> 0.007 0.007 2.259
> 0.007 0.007 2.488
> 0.007 0.007 2.562
> 0.007 0.007 2.599
> 0.007 0.007 2.697
> 0.007 0.007 3.030
> 0.007 0.007 3.075
> 0.007 0.007 3.145
> 0.007 0.007 3.225
>
> [1] https://github.com/paulmckrcu/proc-mmap_sem-test
>
> Signed-off-by: Suren Baghdasaryan <surenb@xxxxxxxxxx>
> ---
> fs/proc/task_mmu.c | 134 ++++++++++++++++++++-------------------------
> 1 file changed, 59 insertions(+), 75 deletions(-)
>
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 023422fcee12..c2bd9f5bbbcd 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -233,6 +233,16 @@ static inline void reacquire_rcu(struct proc_maps_private *priv)
> vma_iter_set(&priv->iter, priv->lock_ctx.locked_vma->vm_end);
> }
>
> +static inline bool is_mmap_lock_contended(struct proc_maps_private *priv)
> +{
> + struct proc_maps_locking_ctx *lock_ctx = &priv->lock_ctx;
> +
> + if (!lock_ctx->mmap_locked)
> + return false;
> +
> + return !!mmap_lock_is_contended(lock_ctx->mm);
> +}
> +
> #else /* CONFIG_PER_VMA_LOCK */
>
> static inline int lock_ctx_mm(struct proc_maps_locking_ctx *lock_ctx)
> @@ -268,6 +278,11 @@ static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
> return false;
> }
>
> +static inline bool is_mmap_lock_contended(struct proc_maps_private *priv)
> +{
> + return !!mmap_lock_is_contended(priv->lock_ctx.mm);
> +}
> +
> static inline void drop_rcu(struct proc_maps_private *priv) {}
> static inline void reacquire_rcu(struct proc_maps_private *priv) {}
>
> @@ -1486,12 +1501,15 @@ static int show_smap(struct seq_file *m, void *v)
> static int show_smaps_rollup(struct seq_file *m, void *v)
> {
> struct proc_maps_private *priv = m->private;
> + struct proc_maps_locking_ctx *lock_ctx = &priv->lock_ctx;
> + struct mm_struct *mm = lock_ctx->mm;
> struct mem_size_stats mss = {};
> - struct mm_struct *mm = priv->lock_ctx.mm;
> struct vm_area_struct *vma;
> - unsigned long vma_start = 0, last_vma_end = 0;
> + unsigned long vma_start = 0;
> + unsigned long last_vma_end = 0;
> + loff_t pos = 0;
> int ret = 0;
> - VMA_ITERATOR(vmi, mm, 0);
> +
>
> priv->task = get_proc_task(priv->inode);
> if (!priv->task)
> @@ -1502,89 +1520,55 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
> goto out_put_task;
> }
>
> - ret = lock_ctx_mm(&priv->lock_ctx);
> + hold_task_mempolicy(priv);
> + ret = lock_vma_range(m, lock_ctx);
> if (ret)
> goto out_put_mm;
>
> - hold_task_mempolicy(priv);
> - vma = vma_next(&vmi);
> -
> - if (unlikely(!vma))
> + vma_iter_init(&priv->iter, mm, 0);
> + vma = proc_get_vma(m, &pos);
> + if (unlikely(!vma) || vma == get_gate_vma(priv->lock_ctx.mm))
> goto empty_set;
>
> + if (IS_ERR(vma)) {
> + ret = PTR_ERR(vma);
> + goto out_unlock;
> + }
> +
> vma_start = vma->vm_start;
> - do {
> - smap_gather_stats(priv, vma, &mss, 0);
> + while (vma) {
> + if (IS_ERR(vma)) {
> + ret = PTR_ERR(vma);
> + goto out_unlock;
> + }
> +
> + if (vma == get_gate_vma(priv->lock_ctx.mm))
> + break;
> +
> + /*
> + * If after retaking mmap_lock, already reported VMA grew or
> + * merged with the next one, then iterate from last_vma_end.
> + */
> + smap_gather_stats(priv, vma, &mss,
> + vma->vm_start < last_vma_end ? last_vma_end : 0);
> last_vma_end = vma->vm_end;
>
> /*
> * Release mmap_lock temporarily if someone wants to
> - * access it for write request.
> + * take it for write request.
> */
> - if (mmap_lock_is_contended(mm)) {
> - vma_iter_invalidate(&vmi);
> - unlock_ctx_mm(&priv->lock_ctx);
> - ret = lock_ctx_mm(&priv->lock_ctx);
> - if (ret) {
> - release_task_mempolicy(priv);
> + if (is_mmap_lock_contended(priv)) {
> + unlock_vma_range(&priv->lock_ctx);
> + ret = lock_vma_range(m, lock_ctx);
> + if (ret)
> goto out_put_mm;
> - }
> -
> - /*
> - * After dropping the lock, there are four cases to
> - * consider. See the following example for explanation.
> - *
> - * +------+------+-----------+
> - * | VMA1 | VMA2 | VMA3 |
> - * +------+------+-----------+
> - * | | | |
> - * 4k 8k 16k 400k
> - *
> - * Suppose we drop the lock after reading VMA2 due to
> - * contention, then we get:
> - *
> - * last_vma_end = 16k
> - *
> - * 1) VMA2 is freed, but VMA3 exists:
> - *
> - * vma_next(vmi) will return VMA3.
> - * In this case, just continue from VMA3.
> - *
> - * 2) VMA2 still exists:
> - *
> - * vma_next(vmi) will return VMA3.
> - * In this case, just continue from VMA3.
> - *
> - * 3) No more VMAs can be found:
> - *
> - * vma_next(vmi) will return NULL.
> - * No more things to do, just break.
> - *
> - * 4) (last_vma_end - 1) is the middle of a vma (VMA'):
> - *
> - * vma_next(vmi) will return VMA' whose range
> - * contains last_vma_end.
> - * Iterate VMA' from last_vma_end.
> - */
> - vma = vma_next(&vmi);
> - /* Case 3 above */
> - if (!vma)
> - break;
> -
> - /* Case 1 and 2 above */
> - if (vma->vm_start >= last_vma_end) {
> - smap_gather_stats(priv, vma, &mss, 0);
> - last_vma_end = vma->vm_end;
> - continue;
> - }
>
> - /* Case 4 above */
> - if (vma->vm_end > last_vma_end) {
> - smap_gather_stats(priv, vma, &mss, last_vma_end);
> - last_vma_end = vma->vm_end;
> - }
> + /* Resume from the last position. */
> + pos = last_vma_end;
> + vma_iter_init(&priv->iter, mm, pos);
> }
> - } for_each_vma(vmi, vma);
> + vma = proc_get_vma(m, &pos);
> + }
>
> empty_set:
> show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0);
> @@ -1593,10 +1577,10 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
>
> __show_smap(m, &mss, true);
>
> - release_task_mempolicy(priv);
> - unlock_ctx_mm(&priv->lock_ctx);
> -
> +out_unlock:
> + unlock_vma_range(&priv->lock_ctx);
> out_put_mm:
> + release_task_mempolicy(priv);
> mmput(mm);
> out_put_task:
> put_task_struct(priv->task);
> --
> 2.54.0.1032.g2f8565e1d1-goog
>