[PATCH] bpf: Always defer local storage free

From: Andrea Righi

Date: Mon Mar 16 2026 - 18:29:06 EST


bpf_task_storage_delete() can be invoked from contexts that hold a raw
spinlock, such as sched_ext's ops.exit_task() callback, which runs
with the rq lock held.

The delete path eventually calls bpf_selem_unlink(), which frees the
element via bpf_selem_free_list() -> bpf_selem_free(). For task storage
with use_kmalloc_nolock, call_rcu_tasks_trace() is used, which is not
safe from raw spinlock context, triggering the following:

=============================
[ BUG: Invalid wait context ]
7.0.0-rc1-virtme #1 Not tainted
-----------------------------
(udev-worker)/115 is trying to lock:
ffffffffa6970dd0 (rcu_tasks_trace_srcu_struct_srcu_usage.lock){....}-{3:3}, at: spin_lock_irqsave_ssp_contention+0x54/0x90
other info that might help us debug this:
context-{5:5}
3 locks held by (udev-worker)/115:
#0: ffff8e16c634ce58 (&p->pi_lock){-.-.}-{2:2}, at: _task_rq_lock+0x2c/0x100
#1: ffff8e16fbdbdae0 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock_nested+0x24/0xb0
#2: ffffffffa6971b60 (rcu_read_lock){....}-{1:3}, at: __bpf_prog_enter+0x64/0x110
...
Sched_ext: cosmos_1.0.7_g780e898fc_dirty_x86_64_unknown_linux_gnu (enabled+all), task: runnable_at=-2ms
Call Trace:
dump_stack_lvl+0x6f/0xb0
__lock_acquire+0xf86/0x1de0
lock_acquire+0xcf/0x310
_raw_spin_lock_irqsave+0x39/0x60
spin_lock_irqsave_ssp_contention+0x54/0x90
srcu_gp_start_if_needed+0x2a7/0x490
bpf_selem_unlink+0x24b/0x590
bpf_task_storage_delete+0x3a/0x90
bpf_prog_3b623b4be76cfb86_scx_pmu_task_fini+0x26/0x2a
bpf_prog_4b1530d9d9852432_cosmos_exit_task+0x1d/0x1f
bpf__sched_ext_ops_exit_task+0x4b/0xa7
__scx_disable_and_exit_task+0x10a/0x200
scx_disable_and_exit_task+0xe/0x60

Fix by moving the elements and storage to per-CPU deferred-free lists
and processing them from a workqueue scheduled via call_rcu(), so the
actual deallocation always happens in process context, outside the raw
spinlock critical section.

Fixes: f484f4a3e058 ("bpf: Replace bpf memory allocator with kmalloc_nolock() in local storage")
Signed-off-by: Andrea Righi <arighi@xxxxxxxxxx>
---
include/linux/bpf_local_storage.h | 1 +
kernel/bpf/bpf_local_storage.c | 96 +++++++++++++++++++++++++++++--
2 files changed, 93 insertions(+), 4 deletions(-)

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 8157e8da61d40..7e348a5c6b85d 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -105,6 +105,7 @@ struct bpf_local_storage {
u64 mem_charge; /* Copy of mem charged to owner. Protected by "lock" */
refcount_t owner_refcnt;/* Used to pin owner when map_free is uncharging */
bool use_kmalloc_nolock;
+ struct hlist_node deferred_free_node; /* Used for deferred free */
};

/* U16_MAX is much more than enough for sk local storage
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 9c96a4477f81a..0fbf6029e1361 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -14,9 +14,26 @@
#include <linux/rcupdate.h>
#include <linux/rcupdate_trace.h>
#include <linux/rcupdate_wait.h>
+#include <linux/workqueue.h>

#define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)

+static DEFINE_PER_CPU(struct hlist_head, bpf_deferred_selem_free_list);
+static DEFINE_PER_CPU(struct hlist_head, bpf_deferred_storage_free_list);
+static DEFINE_PER_CPU(atomic_t, bpf_deferred_free_pending);
+
+struct bpf_deferred_free_rcu {
+ struct rcu_head rcu;
+ int cpu;
+};
+static DEFINE_PER_CPU(struct bpf_deferred_free_rcu, bpf_deferred_free_rcu);
+
+struct bpf_deferred_free_work {
+ struct work_struct work;
+ int cpu;
+};
+static DEFINE_PER_CPU(struct bpf_deferred_free_work, bpf_deferred_free_work);
+
static struct bpf_local_storage_map_bucket *
select_bucket(struct bpf_local_storage_map *smap,
struct bpf_local_storage *local_storage)
@@ -260,6 +277,80 @@ static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now)
bpf_selem_free(selem, reuse_now);
}

+static void bpf_deferred_free_work_fn(struct work_struct *work)
+{
+ struct bpf_deferred_free_work *deferred_work =
+ container_of(work, struct bpf_deferred_free_work, work);
+ int cpu = deferred_work->cpu;
+ struct hlist_head *selem_list = per_cpu_ptr(&bpf_deferred_selem_free_list, cpu);
+ struct hlist_head *storage_list = per_cpu_ptr(&bpf_deferred_storage_free_list, cpu);
+ struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage *local_storage;
+ struct hlist_node *n;
+
+ atomic_set(per_cpu_ptr(&bpf_deferred_free_pending, cpu), 0);
+
+ hlist_for_each_entry_safe(selem, n, selem_list, free_node) {
+ hlist_del_init(&selem->free_node);
+ bpf_selem_free(selem, true);
+ }
+
+ hlist_for_each_entry_safe(local_storage, n, storage_list, deferred_free_node) {
+ hlist_del_init(&local_storage->deferred_free_node);
+ bpf_local_storage_free(local_storage, true);
+ }
+}
+
+static void bpf_deferred_free_rcu_callback(struct rcu_head *rcu)
+{
+ struct bpf_deferred_free_rcu *deferred =
+ container_of(rcu, struct bpf_deferred_free_rcu, rcu);
+ int cpu = deferred->cpu;
+ struct bpf_deferred_free_work *work = per_cpu_ptr(&bpf_deferred_free_work, cpu);
+
+ work->cpu = cpu;
+ queue_work_on(cpu, system_wq, &work->work);
+}
+
+static void bpf_selem_unlink_defer_free(struct hlist_head *selem_free_list,
+ struct bpf_local_storage *local_storage,
+ bool free_local_storage)
+{
+ struct bpf_local_storage_elem *s;
+ struct hlist_node *n;
+ struct hlist_head *deferred_selem = this_cpu_ptr(&bpf_deferred_selem_free_list);
+ struct hlist_head *deferred_storage = this_cpu_ptr(&bpf_deferred_storage_free_list);
+ struct bpf_deferred_free_rcu *deferred_rcu = this_cpu_ptr(&bpf_deferred_free_rcu);
+
+ hlist_for_each_entry_safe(s, n, selem_free_list, free_node) {
+ hlist_del(&s->free_node);
+ hlist_add_head(&s->free_node, deferred_selem);
+ }
+
+ if (free_local_storage)
+ hlist_add_head(&local_storage->deferred_free_node, deferred_storage);
+
+ if (atomic_cmpxchg(this_cpu_ptr(&bpf_deferred_free_pending), 0, 1) == 0) {
+ deferred_rcu->cpu = smp_processor_id();
+ call_rcu(&deferred_rcu->rcu, bpf_deferred_free_rcu_callback);
+ }
+}
+
+static int __init bpf_local_storage_deferred_free_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ INIT_HLIST_HEAD(per_cpu_ptr(&bpf_deferred_selem_free_list, cpu));
+ INIT_HLIST_HEAD(per_cpu_ptr(&bpf_deferred_storage_free_list, cpu));
+ atomic_set(per_cpu_ptr(&bpf_deferred_free_pending, cpu), 0);
+ INIT_WORK(&per_cpu(bpf_deferred_free_work, cpu).work,
+ bpf_deferred_free_work_fn);
+ }
+ return 0;
+}
+subsys_initcall(bpf_local_storage_deferred_free_init);
+
static void bpf_selem_unlink_storage_nolock_misc(struct bpf_local_storage_elem *selem,
struct bpf_local_storage_map *smap,
struct bpf_local_storage *local_storage,
@@ -419,10 +510,7 @@ int bpf_selem_unlink(struct bpf_local_storage_elem *selem)
out:
raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);

- bpf_selem_free_list(&selem_free_list, false);
-
- if (free_local_storage)
- bpf_local_storage_free(local_storage, false);
+ bpf_selem_unlink_defer_free(&selem_free_list, local_storage, free_local_storage);

return err;
}
--
2.53.0