Re: [RFC PATCH v3 1/4] mm/zsmalloc: introduce deferred free framework with callback ops

From: Wenchao Hao

Date: Sat May 09 2026 - 04:48:20 EST

On Sat, May 9, 2026 at 8:29 AM Nhat Pham <nphamcs@xxxxxxxxx> wrote:
>
> On Thu, May 7, 2026 at 11:08 PM Wenchao Hao <haowenchao22@xxxxxxxxx> wrote:
> >
> > Add a per-cpu deferred free mechanism to zsmalloc with a callback
> > interface that lets callers (zram, zswap) customize push and drain
> > behavior.
> >
> > Each CPU owns a single-page buffer. The hot path (zs_free_deferred)
> > writes a value into the current CPU's buffer via the push callback
> > with preemption disabled — no locks, no atomics. When the buffer
> > fills, it is swapped with a fresh page from a pre-allocated page
> > pool and the full page is queued to a WQ_UNBOUND worker for drain.
> >
> > The drain worker invokes the drain callback which performs the actual
> > expensive work (zs_free, slot_free, etc.) in batch, away from the
> > original hot path.
> >
> > Page pool management:
> > - Pool is pre-allocated at enable time (ZS_DEFERRED_POOL_SIZE pages)
> > - Full buffers are drained and returned to the pool
> > - If no free page is available when buffer is full, the push falls
> > back to synchronous processing by the caller
> >
> > Signed-off-by: Wenchao Hao <haowenchao@xxxxxxxxxx>
> > ---
> > +#define ZS_DEFERRED_POOL_SIZE (256 * 1024 / PAGE_SIZE)
>
> Seems oddly specific? :) And this doesn't quite scale with number of
> CPUs, or memory size?
>

256K holds the deferred metadata for ~128MB zswap or ~256MB zram
entries, which matches what a killed process typically has swapped
out. Pages sitting in the pool are memory that can't be used
elsewhere, so I didn't want it to grow with RAM/CPU. Happy to
parameterize it if you'd prefer.

> > +
> > +struct zs_deferred_percpu {
> > + unsigned int count;
> > + void *buf;
> > +};
> > +
> > struct zs_pool {
> > const char *name;
> >
> > @@ -217,6 +224,18 @@ struct zs_pool {
> > /* protect zspage migration/compaction */
> > rwlock_t lock;
> > atomic_t compaction_in_progress;
> > +
> > + /* per-cpu deferred free */
> > + const struct zs_deferred_ops *deferred_ops;
> > + void *deferred_private;
> > + struct zs_deferred_percpu __percpu *deferred;
> > + struct work_struct deferred_work;
> > + struct workqueue_struct *deferred_wq;
> > + struct list_head deferred_pool;
> > + unsigned int deferred_pool_count;
> > + spinlock_t deferred_pool_lock;
> > + struct list_head deferred_drain_list;
> > + spinlock_t deferred_drain_lock;
> > };
> >
> > static inline void zpdesc_set_first(struct zpdesc *zpdesc)
> > @@ -1416,6 +1435,171 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
> > }
> > EXPORT_SYMBOL_GPL(zs_free);
> >
> > +static struct page *deferred_pool_get(struct zs_pool *pool)
> > +{
> > + struct page *page = NULL;
> > +
> > + spin_lock(&pool->deferred_pool_lock);
> > + if (!list_empty(&pool->deferred_pool)) {
> > + page = list_first_entry(&pool->deferred_pool, struct page, lru);
> > + list_del(&page->lru);
> > + pool->deferred_pool_count--;
> > + }
> > + spin_unlock(&pool->deferred_pool_lock);
> > + return page;
> > +}
> > +
> > +static void deferred_pool_put(struct zs_pool *pool, struct page *page)
> > +{
> > + spin_lock(&pool->deferred_pool_lock);
> > + list_add_tail(&page->lru, &pool->deferred_pool);
> > + pool->deferred_pool_count++;
> > + spin_unlock(&pool->deferred_pool_lock);
> > +}
> > +
> > +static void zs_deferred_work_fn(struct work_struct *work)
> > +{
> > + struct zs_pool *pool = container_of(work, struct zs_pool, deferred_work);
> > + struct page *page;
> > +
> > + while (true) {
> > + unsigned int count;
> > +
> > + spin_lock(&pool->deferred_drain_lock);
> > + if (list_empty(&pool->deferred_drain_list)) {
> > + spin_unlock(&pool->deferred_drain_lock);
> > + break;
> > + }
> > + page = list_first_entry(&pool->deferred_drain_list,
> > + struct page, lru);
> > + list_del(&page->lru);
> > + count = page_private(page);
> > + spin_unlock(&pool->deferred_drain_lock);
> > +
> > + pool->deferred_ops->drain(pool->deferred_private,
> > + page_address(page), count);
> > + deferred_pool_put(pool, page);
> > + cond_resched();
> > + }
> > +}
> > +
> > +bool zs_free_deferred(struct zs_pool *pool, unsigned long value)
> > +{
> > + struct zs_deferred_percpu *def;
> > + struct page *new_page, *full_page;
> > + enum zs_push_ret ret;
> > +
> > + if (!pool->deferred)
> > + return false;
> > +
> > + def = get_cpu_ptr(pool->deferred);
> > +
> > + ret = pool->deferred_ops->push(def->buf, def->count, value);
> > + if (ret == ZS_PUSH_OK) {
> > + def->count++;
> > + put_cpu_ptr(pool->deferred);
> > + return true;
> > + }
> > +
> > + if (ret == ZS_PUSH_FULL_QUEUED)
> > + def->count++;
> > +
> > + new_page = deferred_pool_get(pool);
> > + if (new_page) {
> > + full_page = virt_to_page(def->buf);
> > + set_page_private(full_page, def->count);
> > + def->buf = page_address(new_page);
> > + def->count = 0;
> > +
> > + if (ret == ZS_PUSH_FULL) {
> > + pool->deferred_ops->push(def->buf, 0, value);
> > + def->count = 1;
> > + }
> > + put_cpu_ptr(pool->deferred);
> > +
> > + spin_lock(&pool->deferred_drain_lock);
> > + list_add_tail(&full_page->lru, &pool->deferred_drain_list);
> > + spin_unlock(&pool->deferred_drain_lock);
> > + queue_work(pool->deferred_wq, &pool->deferred_work);
> > + return true;
> > + }
> > + put_cpu_ptr(pool->deferred);
> > +
> > + /* ret==2: value already queued, will be drained eventually */
> > + if (ret == 2)
>
> == 2? :)
>

Will replace it with ZS_PUSH_FULL_QUEUED if v4 still has
this logic.

Thanks,
Wenchao