Re: [PATCH 2/3] mm/memcontrol: disable demotion in memcg direct reclaim
From: Yosry Ahmed
Date: Tue Mar 17 2026 - 19:44:53 EST
On Tue, Mar 17, 2026 at 4:07 PM Bing Jiao <bingjiao@xxxxxxxxxx> wrote:
>
> NUMA demotion counts towards reclaim targets in shrink_folio_list(), but
> it does not reduce the total memory usage of a memcg. In memcg direct
> reclaim paths (e.g., charge-triggered or manual limit writes), where
> demotion is allowed, this leads to "fake progress" where the reclaim
> loop concludes it has satisfied the memory request without actually
> reducing the cgroup's charge.
>
> This could result in inefficient reclaim loops, CPU waste, moving all
> pages to far-tier nodes, and potentially premature OOM kills when the
> cgroup is under memory pressure but demotion is still possible.
>
> Introduce the MEMCG_RECLAIM_NO_DEMOTION flag to disable demotion in
> these memcg-specific reclaim paths. This ensures that reclaim
> progress is only counted when memory is actually freed or swapped out.
See the discussion at
https://lore.kernel.org/linux-mm/20250909012141.1467-1-cuishw@xxxxxxxxxx/
and the commits/threads it refers to.
>
> Signed-off-by: Bing Jiao <bingjiao@xxxxxxxxxx>
> ---
> include/linux/swap.h | 1 +
> mm/memcontrol-v1.c | 10 ++++++++--
> mm/memcontrol.c | 16 +++++++++++-----
> mm/vmscan.c | 1 +
> 4 files changed, 21 insertions(+), 7 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 7a09df6977a5..e83897a6dc72 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -356,6 +356,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone
>
> #define MEMCG_RECLAIM_MAY_SWAP (1 << 1)
> #define MEMCG_RECLAIM_PROACTIVE (1 << 2)
> +#define MEMCG_RECLAIM_NO_DEMOTION (1 << 3)
> #define MIN_SWAPPINESS 0
> #define MAX_SWAPPINESS 200
>
> diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
> index 433bba9dfe71..3cb600e28e5b 100644
> --- a/mm/memcontrol-v1.c
> +++ b/mm/memcontrol-v1.c
> @@ -1466,6 +1466,10 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
> int ret;
> bool limits_invariant;
> struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
> + unsigned int reclaim_options = MEMCG_RECLAIM_NO_DEMOTION;
> +
> + if (!memsw)
> + reclaim_options |= MEMCG_RECLAIM_MAY_SWAP;
>
> do {
> if (signal_pending(current)) {
> @@ -1500,7 +1504,7 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
> }
>
> if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
> - memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, NULL)) {
> + reclaim_options, NULL)) {
> ret = -EBUSY;
> break;
> }
> @@ -1520,6 +1524,8 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
> static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
> {
> int nr_retries = MAX_RECLAIM_RETRIES;
> + unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
> + MEMCG_RECLAIM_NO_DEMOTION;
>
> /* we call try-to-free pages for make this cgroup empty */
> lru_add_drain_all();
> @@ -1532,7 +1538,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
> return -EINTR;
>
> if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
> - MEMCG_RECLAIM_MAY_SWAP, NULL))
> + reclaim_options, NULL))
> nr_retries--;
> }
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 303ac622d22d..fcf1cd0da643 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2287,6 +2287,8 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
> gfp_t gfp_mask)
> {
> unsigned long nr_reclaimed = 0;
> + unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
> + MEMCG_RECLAIM_NO_DEMOTION;
>
> do {
> unsigned long pflags;
> @@ -2300,7 +2302,7 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
> psi_memstall_enter(&pflags);
> nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
> gfp_mask,
> - MEMCG_RECLAIM_MAY_SWAP,
> + reclaim_options,
> NULL);
> psi_memstall_leave(&pflags);
> } while ((memcg = parent_mem_cgroup(memcg)) &&
> @@ -2572,7 +2574,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
> /* Avoid the refill and flush of the older stock */
> batch = nr_pages;
>
> - reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
> + reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_NO_DEMOTION;
> if (!do_memsw_account() ||
> page_counter_try_charge(&memcg->memsw, batch, &counter)) {
> if (page_counter_try_charge(&memcg->memory, batch, &counter))
> @@ -2610,7 +2612,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
>
> psi_memstall_enter(&pflags);
> nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
> - gfp_mask, reclaim_options, NULL);
> + gfp_mask, reclaim_options, NULL);
> psi_memstall_leave(&pflags);
>
> if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
> @@ -4638,6 +4640,8 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
> {
> struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> unsigned int nr_retries = MAX_RECLAIM_RETRIES;
> + unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
> + MEMCG_RECLAIM_NO_DEMOTION;
> bool drained = false;
> unsigned long high;
> int err;
> @@ -4669,7 +4673,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
> }
>
> reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
> - GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL);
> + GFP_KERNEL, reclaim_options, NULL);
>
> if (!reclaimed && !nr_retries--)
> break;
> @@ -4690,6 +4694,8 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
> {
> struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
> + unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
> + MEMCG_RECLAIM_NO_DEMOTION;
> bool drained = false;
> unsigned long max;
> int err;
> @@ -4721,7 +4727,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
>
> if (nr_reclaims) {
> if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
> - GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL))
> + GFP_KERNEL, reclaim_options, NULL))
> nr_reclaims--;
> continue;
> }
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 33287ba4a500..7a8617ba1748 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -6809,6 +6809,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
> .may_unmap = 1,
> .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
> .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
> + .no_demotion = !!(reclaim_options & MEMCG_RECLAIM_NO_DEMOTION),
> };
> /*
> * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
> --
> 2.53.0.851.ga537e3e6e9-goog
>