[PATCH v2 2/2] cgroup/dmem: add dmem.memcg control file for double-charging to memcg

From: Eric Chanudet

Date: Tue May 19 2026 - 12:07:25 EST


Add a root-only cgroupfs file "dmem.memcg" that lets an administrator
configure whether allocations in a dmem region should also be charged to
the memory controller.

To handle inheritance, dmem declares a depends_on dependency on the
memory controller when CONFIG_MEMCG is enabled.

Double-charging is disabled by default. Once a charge is attempted, the
setting is locked by a small 4-state machine (off, on, locked off,
locked on) to prevent inconsistent accounting.

The memcg to charge is derived from the pool's cgroup, since the pool
holds a reference to the dmem cgroup state that keeps the cgroup alive
until it gets uncharged.

Signed-off-by: Eric Chanudet <echanude@xxxxxxxxxx>
---
Documentation/admin-guide/cgroup-v2.rst | 23 +++++
kernel/cgroup/dmem.c | 158 +++++++++++++++++++++++++++++++-
2 files changed, 178 insertions(+), 3 deletions(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 6efd0095ed995b1550317662bc1b56c7a7f3db23..1d2fa55ddf0faa17baa916a8914d3033e8e42359 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -2828,6 +2828,29 @@ DMEM Interface Files
drm/0000:03:00.0/vram0 12550144
drm/0000:03:00.0/stolen 8650752

+ dmem.memcg
+ A readwrite nested-keyed file that exists only on the root
+ cgroup. It configures whether allocations in a dmem region
+ should also be charged to the memory controller.
+
+ Upon the first charge attempt to a region, its setting can no longer be
+ changed and is reported as "[true|false] (locked)". Writes to a locked
+ region are silently ignored.
+
+ Charges to the memory controller are visible in ``memory.stat`` as the
+ ``dmem`` entry, reported in bytes.
+
+ An example read output follows::
+
+ drm/0000:03:00.0/vram0 false
+ drm/0000:03:00.0/stolen false (locked)
+
+ Writing uses the same nested-keyed format::
+
+ echo "drm/0000:03:00.0/vram0 true" > dmem.memcg
+
+ This file is only available when the kernel is built with
+ ``CONFIG_MEMCG``.
+
HugeTLB
-------

diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c
index 1ab1fb47f2711ecc60dd13e611a8a4920b48f3e9..e07b20b8025c528f190f84c76b088cb8a32a7f5e 100644
--- a/kernel/cgroup/dmem.c
+++ b/kernel/cgroup/dmem.c
@@ -17,6 +17,14 @@
#include <linux/refcount.h>
#include <linux/rculist.h>
#include <linux/slab.h>
+#include <linux/memcontrol.h>
+
+/*
+ * Per-region state of the memcg double-charging setting.
+ *
+ * The two unlocked states can be toggled through the dmem.memcg file. The
+ * first charge attempt on a region moves it to the matching locked state,
+ * after which the setting no longer changes.
+ */
+enum dmem_memcg_status {
+ DMEM_MEMCG_OFF, /* not charged to memcg, still configurable */
+ DMEM_MEMCG_ON, /* charged to memcg, still configurable */
+ DMEM_MEMCG_LOCKED_OFF, /* not charged to memcg, setting frozen */
+ DMEM_MEMCG_LOCKED_ON, /* charged to memcg, setting frozen */
+};

struct dmem_cgroup_region {
/**
@@ -51,6 +59,14 @@ struct dmem_cgroup_region {
* No new pools should be added to the region afterwards.
*/
bool unregistered;
+
+ /**
+ * @memcg_status: Whether allocations in this region should also charge memcg.
+ * One of DMEM_MEMCG_OFF/DMEM_MEMCG_ON while still configurable, or
+ * DMEM_MEMCG_LOCKED_OFF/DMEM_MEMCG_LOCKED_ON once frozen by the first charge
+ * attempt. Transitions to a locked state are one-way.
+ */
+ atomic_t memcg_status;
};

struct dmemcg_state {
@@ -609,6 +625,34 @@ get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region *region)
return pool;
}

+/*
+ * Freeze a region's memcg setting and report whether memcg must be charged.
+ *
+ * An unlocked state is atomically replaced by its locked counterpart; an
+ * already locked state is left untouched. Returns true when the resulting
+ * state is DMEM_MEMCG_LOCKED_ON, false otherwise (including on an
+ * unrecognised state value, which also triggers a one-time warning).
+ */
+static bool apply_memcg_charge(atomic_t *status)
+{
+	int cur = atomic_read(status);
+
+	while (true) {
+		int locked;
+		int seen;
+
+		/* Already frozen: nothing to transition, just report. */
+		if (cur == DMEM_MEMCG_LOCKED_OFF)
+			return false;
+		if (cur == DMEM_MEMCG_LOCKED_ON)
+			return true;
+
+		if (cur == DMEM_MEMCG_OFF) {
+			locked = DMEM_MEMCG_LOCKED_OFF;
+		} else if (cur == DMEM_MEMCG_ON) {
+			locked = DMEM_MEMCG_LOCKED_ON;
+		} else {
+			WARN_ONCE(1, "Invalid memcg_status (%#x).\n", cur);
+			return false;
+		}
+
+		/*
+		 * Lost the race if the value changed under us; retry with the
+		 * value the cmpxchg actually observed.
+		 */
+		seen = atomic_cmpxchg(status, cur, locked);
+		if (seen == cur)
+			return locked == DMEM_MEMCG_LOCKED_ON;
+		cur = seen;
+	}
+}
+
/**
* dmem_cgroup_uncharge() - Uncharge a pool.
* @pool: Pool to uncharge.
@@ -624,6 +668,12 @@ void dmem_cgroup_uncharge(struct dmem_cgroup_pool_state *pool, u64 size)
return;

page_counter_uncharge(&pool->cnt, size);
+
+ if (atomic_read(&pool->region->memcg_status) == DMEM_MEMCG_LOCKED_ON &&
+ !WARN_ON_ONCE(size > (u64)UINT_MAX << PAGE_SHIFT))
+ mem_cgroup_dmem_uncharge(pool->cs->css.cgroup,
+ PAGE_ALIGN(size) >> PAGE_SHIFT);
+
css_put(&pool->cs->css);
dmemcg_pool_put(pool);
}
@@ -655,6 +705,8 @@ int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size,
struct dmemcg_state *cg;
struct dmem_cgroup_pool_state *pool;
struct page_counter *fail;
+ unsigned long nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ bool charge_memcg;
int ret;

*ret_pool = NULL;
@@ -670,7 +722,28 @@ int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size,
pool = get_cg_pool_unlocked(cg, region);
if (IS_ERR(pool)) {
ret = PTR_ERR(pool);
- goto err;
+ goto err_css_put;
+ }
+
+ charge_memcg = apply_memcg_charge(&region->memcg_status);
+ if (charge_memcg) {
+ /* mem_cgroup_dmem_charge limitation from try_charge_memcg */
+ if (size > (u64)UINT_MAX << PAGE_SHIFT) {
+ ret = -EINVAL;
+ dmemcg_pool_put(pool);
+ goto err_css_put;
+ }
+
+ if (!mem_cgroup_dmem_charge(pool->cs->css.cgroup, nr_pages,
+ GFP_KERNEL)) {
+ /*
+ * No dmem_cgroup_state_evict_valuable() could help,
+ * there's no ret_limit_pool to return.
+ */
+ ret = -ENOMEM;
+ dmemcg_pool_put(pool);
+ goto err_css_put;
+ }
}

if (!page_counter_try_charge(&pool->cnt, size, &fail)) {
@@ -681,14 +754,17 @@ int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size,
}
dmemcg_pool_put(pool);
ret = -EAGAIN;
- goto err;
+ goto err_uncharge_memcg;
}

/* On success, reference from get_current_dmemcs is transferred to *ret_pool */
*ret_pool = pool;
return 0;

-err:
+err_uncharge_memcg:
+ if (charge_memcg)
+ mem_cgroup_dmem_uncharge(pool->cs->css.cgroup, nr_pages);
+err_css_put:
css_put(&cg->css);
return ret;
}
@@ -845,6 +921,71 @@ static ssize_t dmem_cgroup_region_max_write(struct kernfs_open_file *of,
return dmemcg_limit_write(of, buf, nbytes, off, set_resource_max);
}

+#ifdef CONFIG_MEMCG
+/*
+ * Show handler for "dmem.memcg": prints one "<region name> <state>" line per
+ * registered region, walking the region list under the RCU read lock.
+ * Always returns 0.
+ */
+static int dmem_cgroup_memcg_show(struct seq_file *sf, void *v)
+{
+	struct dmem_cgroup_region *cur;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(cur, &dmem_cgroup_regions, region_node) {
+		const char *label;
+
+		switch (atomic_read(&cur->memcg_status)) {
+		case DMEM_MEMCG_ON:
+			label = "true";
+			break;
+		case DMEM_MEMCG_OFF:
+			label = "false";
+			break;
+		case DMEM_MEMCG_LOCKED_ON:
+			label = "true (locked)";
+			break;
+		case DMEM_MEMCG_LOCKED_OFF:
+			label = "false (locked)";
+			break;
+		default:
+			label = "(invalid)";
+			break;
+		}
+
+		seq_printf(sf, "%s %s\n", cur->name, label);
+	}
+	rcu_read_unlock();
+
+	return 0;
+}
+
+/*
+ * Write handler for "dmem.memcg".
+ *
+ * @buf holds newline-separated "<region name> <bool>" entries and is
+ * tokenised in place. Blank lines are skipped. Each entry flips the region
+ * between DMEM_MEMCG_OFF and DMEM_MEMCG_ON; a region in a locked state is
+ * left unchanged without reporting an error.
+ *
+ * Returns @nbytes on success, -EINVAL when the value is missing or is not a
+ * boolean, -ENODEV when the region name is not found. Entries processed
+ * before a failing one stay applied.
+ */
+static ssize_t dmem_cgroup_memcg_write(struct kernfs_open_file *of, char *buf,
+				       size_t nbytes, loff_t off)
+{
+	char *next = buf;
+
+	while (next) {
+		struct dmem_cgroup_region *region;
+		char *line = next;
+		char *key;
+		bool enable;
+
+		/* Terminate the current line and remember where the next one starts. */
+		next = strchr(line, '\n');
+		if (next) {
+			*next = '\0';
+			next++;
+		}
+
+		line = strstrip(line);
+		if (line[0] == '\0')
+			continue;
+
+		/* After strsep(), line points at the value, or is NULL if none was given. */
+		key = strsep(&line, " \t");
+		if (key[0] == '\0')
+			continue;
+
+		if (!line || line[0] == '\0')
+			return -EINVAL;
+
+		if (kstrtobool(line, &enable))
+			return -EINVAL;
+
+		rcu_read_lock();
+		region = dmemcg_get_region_by_name(key);
+		rcu_read_unlock();
+		if (!region)
+			return -ENODEV;
+
+		/* Only the opposite unlocked state matches; locked regions are skipped. */
+		if (enable)
+			atomic_cmpxchg(&region->memcg_status,
+				       DMEM_MEMCG_OFF, DMEM_MEMCG_ON);
+		else
+			atomic_cmpxchg(&region->memcg_status,
+				       DMEM_MEMCG_ON, DMEM_MEMCG_OFF);
+
+		/* Drop the reference presumably taken by dmemcg_get_region_by_name(). */
+		kref_put(&region->ref, dmemcg_free_region);
+	}
+
+	return nbytes;
+}
+#endif
+
static struct cftype files[] = {
{
.name = "capacity",
@@ -873,6 +1014,14 @@ static struct cftype files[] = {
.seq_show = dmem_cgroup_region_max_show,
.flags = CFTYPE_NOT_ON_ROOT,
},
+#ifdef CONFIG_MEMCG
+ {
+ .name = "memcg",
+ .write = dmem_cgroup_memcg_write,
+ .seq_show = dmem_cgroup_memcg_show,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+#endif
{ } /* Zero entry terminates. */
};

@@ -882,4 +1031,7 @@ struct cgroup_subsys dmem_cgrp_subsys = {
.css_offline = dmemcs_offline,
.legacy_cftypes = files,
.dfl_cftypes = files,
+#ifdef CONFIG_MEMCG
+ .depends_on = 1 << memory_cgrp_id,
+#endif
};

--
2.52.0