[PATCH] io_uring/io-wq: avoid repeated task_work scans during teardown
From: Fengnan Chang
Date: Tue May 19 2026 - 23:18:54 EST
We hit hard-lockup reports from iou-wrk threads stuck in
task_work_cancel_match() during io-wq teardown in a syzkaller test.
The root cause is that teardown repeatedly rescans the submitter task's
full task_work list under pi_lock, once per matched item.
Two spots are problematic:
1) io_wq_cancel_tw_create() loops calling task_work_cancel_match() to
remove worker-creation callbacks one at a time. Each call re-walks
the entire list from scratch while holding pi_lock.
2) io_worker_exit() unconditionally scans the submitter task_work list
for its own create_work, even when it never queued one. With many
workers exiting simultaneously against a large unrelated task_work
list, this adds up fast.
Fix (1) by adding task_work_cancel_match_all(), which unlinks all matching
callbacks in a single traversal, and then iterating the returned list
locally. It uses the same try_cmpxchg() synchronisation as before and
stops at the work_exited sentinel.
Fix (2) by skipping the cancel entirely unless create_state indicates a
pending create_work. Since create_state is exclusively owned via
test_and_set_bit_lock, at most one callback can be queued per worker, so
the cancel is also simplified from a loop to a single call.
With this fix the reproducer (FIFO-open + MSG_RING SEND_FD stress) no
longer triggers hard-lockup reports, and task_work_cancel_match samples
drop to microseconds.
Fixes: c80ca4707d1a ("io-wq: cancel task_work on exit only targeting the current 'wq'")
Fixes: 1d5f5ea7cb7d ("io-wq: remove worker to owner tw dependency")
Signed-off-by: Fengnan Chang <changfengnan@xxxxxxxxxxxxx>
---
include/linux/task_work.h | 3 +++
io_uring/io-wq.c | 23 +++++++++++-------
kernel/task_work.c | 51 +++++++++++++++++++++++++++++++++++++++
3 files changed, 68 insertions(+), 9 deletions(-)
diff --git a/include/linux/task_work.h b/include/linux/task_work.h
index 0646804860ff1..fb39d18c7c1fe 100644
--- a/include/linux/task_work.h
+++ b/include/linux/task_work.h
@@ -31,6 +31,9 @@ int task_work_add(struct task_struct *task, struct callback_head *twork,
struct callback_head *task_work_cancel_match(struct task_struct *task,
bool (*match)(struct callback_head *, void *data), void *data);
+struct callback_head *
+task_work_cancel_match_all(struct task_struct *task,
+ bool (*match)(struct callback_head *, void *data), void *data);
struct callback_head *task_work_cancel_func(struct task_struct *, task_work_func_t);
bool task_work_cancel(struct task_struct *task, struct callback_head *cb);
void task_work_run(void);
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 7a9f94a0ce6f2..58144bd5891fa 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -234,13 +234,15 @@ static void io_worker_exit(struct io_worker *worker)
struct io_wq *wq = worker->wq;
struct io_wq_acct *acct = io_wq_get_acct(worker);
- while (1) {
- struct callback_head *cb = task_work_cancel_match(wq->task,
- io_task_worker_match, worker);
-
- if (!cb)
- break;
- io_worker_cancel_cb(worker);
+ if (test_bit(0, &worker->create_state)) {
+ /*
+ * create_state is exclusively owned via test_and_set_bit_lock,
+ * so at most one create_work can be pending per worker; a
+ * single cancel attempt is sufficient here.
+ */
+ if (task_work_cancel_match(wq->task, io_task_worker_match,
+ worker))
+ io_worker_cancel_cb(worker);
}
io_worker_release(worker);
@@ -1319,11 +1321,13 @@ void io_wq_exit_start(struct io_wq *wq)
static void io_wq_cancel_tw_create(struct io_wq *wq)
{
- struct callback_head *cb;
+ struct callback_head *cb, *next;
- while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) {
+ cb = task_work_cancel_match_all(wq->task, io_task_work_match, wq);
+ while (cb) {
struct io_worker *worker;
+ next = cb->next;
worker = container_of(cb, struct io_worker, create_work);
io_worker_cancel_cb(worker);
/*
@@ -1332,6 +1336,7 @@ static void io_wq_cancel_tw_create(struct io_wq *wq)
*/
if (cb->func == create_worker_cont)
kfree(worker);
+ cb = next;
}
}
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 0f7519f8e7c93..c133f6988e844 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -143,6 +143,57 @@ task_work_cancel_match(struct task_struct *task,
return work;
}
+/**
+ * task_work_cancel_match_all - cancel all pending works matching @match
+ * @task: the task which should execute the work
+ * @match: match function to call
+ * @data: data to be passed in to match function
+ *
+ * Removes all currently queued matching works in a single traversal under
+ * task->pi_lock, so teardown paths that cancel many callbacks of the same
+ * class need not rescan the whole list once per callback. The walk stops at
+ * the end of the list or at the work_exited sentinel.
+ *
+ * RETURNS:
+ * Head of a NULL-terminated list of the removed works, linked through ->next
+ * in their original queue order, or NULL if no work matched.
+ */
+struct callback_head *
+task_work_cancel_match_all(struct task_struct *task,
+ bool (*match)(struct callback_head *, void *data),
+ void *data)
+{
+ struct callback_head **pprev = &task->task_works;
+ struct callback_head *work, *next;
+ struct callback_head *head = NULL, **tail = &head;
+ unsigned long flags;
+
+ if (likely(!task_work_pending(task)))
+ return NULL; /* task_work_pending() reports nothing; skip pi_lock */
+
+ raw_spin_lock_irqsave(&task->pi_lock, flags);
+ work = READ_ONCE(*pprev);
+ while (work && work != &work_exited) {
+ next = READ_ONCE(work->next);
+ if (!match(work, data)) {
+ pprev = &work->next; /* no match: keep it and advance the link */
+ work = next;
+ continue;
+ }
+
+ if (!try_cmpxchg(pprev, &work, next))
+ continue; /* *pprev changed; work now holds its current value */
+
+ work->next = NULL; /* keep the result list NULL-terminated */
+ *tail = work; /* append at the tail to preserve queue order */
+ tail = &work->next;
+ work = next;
+ }
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+
+ return head;
+}
+
static bool task_work_func_match(struct callback_head *cb, void *data)
{
return cb->func == data;
--
2.39.5 (Apple Git-154)