[PATCH] md/raid5: cleanup reshape stripes when too many devices fail

From: Chen Cheng

Date: Mon May 18 2026 - 08:35:27 EST

From: Chen Cheng <chencheng@xxxxxxxxx>

When a raid5/6 reshape is in progress and the array loses more than
max_degraded devices, raid5_error() sets MD_BROKEN and MD_RECOVERY_INTR,
but the reshape stripes reshape_request() handed out are never released.
The "s.failed > conf->max_degraded" branch of handle_stripe() calls
handle_failed_stripe() / handle_failed_sync() for user IO and resync,
but has no equivalent for the expand case, so three kinds of stripes
leak conf->reshape_stripes and mddev->recovery_active:

1. Destination stripes with skipped_disk == 0: STRIPE_EXPANDING +
STRIPE_EXPAND_READY set, but on a broken array the normal
completion at "s.expanded && !reconstruct_state && s.locked == 0"
may never fire.

2. Destination stripes with skipped_disk == 1: only STRIPE_EXPANDING
set, no STRIPE_HANDLE. They sit idle in the cache waiting for
source data that can no longer be read; handle_stripe() is never
called on them directly.

3. Source stripes (STRIPE_EXPAND_SOURCE) hit the failure branch but
the bit is never cleared and the destinations they feed are never
released.

md_do_sync() exits its main loop on MD_RECOVERY_INTR but then blocks
forever at

wait_event(mddev->recovery_wait,
!atomic_read(&mddev->recovery_active));

A concurrent "echo frozen > sync_action" then blocks in
stop_sync_thread() waiting for MD_RECOVERY_RUNNING to clear, and the
array becomes unstoppable without a reboot.

Reproducer:

DEVS=(/dev/sdb /dev/sdc /dev/sdd /dev/sde /dev/sdf)
for i in 0 1 2 3 4; do
s=$(blockdev --getsz ${DEVS[$i]})
dmsetup create dust$i --table "0 $s dust ${DEVS[$i]} 0 4096"
dmsetup message dust$i 0 quiet
done
mdadm -C /dev/md0 -e 1.2 -l 5 -n 4 -c 64 --assume-clean \
/dev/mapper/dust{0..3}
for b in $(seq 0 8191); do
dmsetup message dust0 0 addbadblock $b
dmsetup message dust1 0 addbadblock $b
done
mdadm --manage /dev/md0 --add /dev/mapper/dust4
mdadm --grow /dev/md0 -n 5 --backup-file=/tmp/grow.backup &
while [[ $(cat /sys/block/md0/md/sync_action) != reshape ]]; do
sleep 0.1
done
dmsetup message dust0 0 enable
dmsetup message dust1 0 enable
sleep 5
echo frozen > /sys/block/md0/md/sync_action # hangs forever

Before the fix, the two tasks deadlock against each other:

task:md0_reshape state:D
schedule
md_do_sync.cold+0x818/0xc25 # wait_event(recovery_wait,
md_thread # !recovery_active)
kthread

task:bash state:D
schedule
stop_sync_thread+0x1a3/0x350 # wait_event(resync_wait,
action_store # !MD_RECOVERY_RUNNING)
md_attr_store
kernfs_fop_write_iter
vfs_write
ksys_write

After the fix handle_stripe() releases the leaked reshape stripes via
the new handle_failed_reshape(), recovery_active drains to zero,
md_do_sync() prints

md/raid:md0: Cannot continue operation (2/5 failed).
md: md0: reshape interrupted.

clears MD_RECOVERY_RUNNING and returns; the "echo frozen" write
returns in <1s; "mdadm --stop /dev/md0" completes normally and no
task is left in D state.

Fix it by adding handle_failed_reshape(), called from handle_stripe()
when the failure branch fires on a reshape stripe. If sh is a
destination, the helper drops STRIPE_EXPANDING / STRIPE_EXPAND_READY,
decrements conf->reshape_stripes, wakes wait_for_reshape and calls
md_done_sync() to return the sectors reshape_request() accounted on
recovery_active. If sh is a source, the helper drops
STRIPE_EXPAND_SOURCE and walks sh's non-parity data disks using the
same raid5_compute_blocknr() / raid5_compute_sector() mapping
handle_stripe_expansion() uses to forward data, looks up each matching
destination with R5_GAS_NOBLOCK | R5_GAS_NOQUIESCE and applies the
destination cleanup to it.

Signed-off-by: Chen Cheng <chencheng@xxxxxxxxx>
---
drivers/md/raid5.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 91 insertions(+)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 0d76e82f4506..f7d159b46a01 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4594,10 +4594,96 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
}
/* done submitting copies, wait for them to complete */
async_tx_quiesce(&tx);
}

+/*
+ * handle_failed_reshape - drop reshape state when too many devices have failed
+ *
+ * Called from handle_stripe() in the "s.failed > conf->max_degraded" branch
+ * when sh is participating in a reshape. raid5_error() has set MD_BROKEN
+ * and MD_RECOVERY_INTR); The reshape stripes that reshape_request() handed out
+ * must be released, otherwise they leak conf->reshape_stripes and
+ * mddev->recovery_active, and md_do_sync() hangs forever at
+ * wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)).
+ *
+ * Three kinds of stripes can reach this path:
+ *
+ * 1. Destination stripes with skipped_disk = 0 in reshape_request()
+ * - the new stripe maps entirely past the old array end, so its
+ * blocks are zero-filled in place without any source read.
+ * STRIPE_EXPANDING, STRIPE_EXPAND_READY and STRIPE_HANDLE are all set,
+ * handle_stripe() sees them with s.expanded == 1.
+ *
+ * 2. Destination stripes with skipped_disk = 1 - the new stripe
+ * overlaps existing data and still needs source blocks copied in by
+ * handle_stripe_expansion(). Only STRIPE_EXPANDING is set, *not*
+ * STRIPE_HANDLE, so they sit idle in the stripe cache until a successful
+ * source expand re-handles them. In the failure path no one ever does,
+ * so handle_stripe() will never see them on its own; they are cleaned up
+ * from the source side in step (b) below.
+ *
+ * 3. Source stripes (STRIPE_EXPAND_SOURCE) - reach handle_stripe() via the
+ * read-error path once the source members start returning EIO and
+ * raid5_error() marks them Faulty.
+ *
+ * Handling:
+ *
+ * (a) If STRIPE_EXPANDING is set on sh, clear it together with
+ * STRIPE_EXPAND_READY, atomic_dec conf->reshape_stripes, wake
+ * wait_for_reshape and md_done_sync(RAID5_STRIPE_SECTORS) to return
+ * the sectors reshape_request() accounted on recovery_active.
+ *
+ * (b) If STRIPE_EXPAND_SOURCE is set on sh, clear it and walk sh's
+ * non-parity disks the same way handle_stripe_expansion() does
+ * (raid5_compute_blocknr previous=1 -> raid5_compute_sector previous=0)
+ * to find each destination, look it up with
+ * R5_GAS_NOBLOCK | R5_GAS_NOQUIESCE and apply step (a) to it.
+ * A NULL lookup means the destination never contributed to
+ * reshape_stripes - nothing to release.
+ */
+static void handle_failed_reshape(struct r5conf *conf, struct stripe_head *sh)
+{
+ int i;
+
+ if (test_and_clear_bit(STRIPE_EXPANDING, &sh->state)) {
+ atomic_dec(&conf->reshape_stripes);
+ wake_up(&conf->wait_for_reshape);
+ md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf));
+ }
+
+ clear_bit(STRIPE_EXPAND_READY, &sh->state);
+
+ if (test_and_clear_bit(STRIPE_EXPAND_SOURCE, &sh->state)) {
+ for (i = 0; i < sh->disks; i++) {
+ int dd_idx;
+ struct stripe_head *sh2;
+ sector_t bn, sec;
+
+ if (i == sh->pd_idx)
+ continue;
+ if (conf->level == 6 && i == sh->qd_idx)
+ continue;
+
+ bn = raid5_compute_blocknr(sh, i, 1);
+ sec = raid5_compute_sector(conf, bn, 0, &dd_idx, NULL);
+ sh2 = raid5_get_active_stripe(conf, NULL, sec,
+ R5_GAS_NOBLOCK | R5_GAS_NOQUIESCE);
+ if (!sh2)
+ continue;
+ if (test_and_clear_bit(STRIPE_EXPANDING, &sh2->state)) {
+ atomic_dec(&conf->reshape_stripes);
+ wake_up(&conf->wait_for_reshape);
+ md_done_sync(conf->mddev,
+ RAID5_STRIPE_SECTORS(conf));
+ }
+ clear_bit(STRIPE_EXPAND_READY, &sh2->state);
+ raid5_release_stripe(sh2);
+ }
+ }
+}
+
static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
{
struct r5conf *conf = sh->raid_conf;
int disks = sh->disks;
struct r5dev *dev;
@@ -5001,10 +5087,15 @@ static void handle_stripe(struct stripe_head *sh)
break_stripe_batch_list(sh, 0);
if (s.to_read+s.to_write+s.written)
handle_failed_stripe(conf, sh, &s, disks);
if (s.syncing + s.replacing)
handle_failed_sync(conf, sh, &s);
+ if (s.expanding || s.expanded) {
+ handle_failed_reshape(conf, sh);
+ s.expanding = 0;
+ s.expanded = 0;
+ }
}

/* Now we check to see if any write operations have recently
* completed
*/
--
2.54.0