Re: [PATCH v4 14/23] ext4: implement partial block zero range path using iomap

From: Ojaswin Mujoo

Date: Wed May 27 2026 - 09:14:20 EST

On Mon, May 11, 2026 at 03:23:34PM +0800, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@xxxxxxxxxx>
>
> Introduce a new iomap_ops instance, ext4_iomap_zero_ops, along with
> ext4_iomap_block_zero_range() to implement block zeroing via the iomap
> infrastructure for ext4.
>
> ext4_iomap_block_zero_range() calls iomap_zero_range() with
> ext4_iomap_zero_begin() as the callback. The callback locates and zeros
> out either a mapped partial block or a dirty, unwritten partial block.
>
> Important constraints:
>
> Zeroing out under an active journal handle can cause deadlock, because
> the order of acquiring the folio lock and starting a handle is
> inconsistent with the iomap writeback path.
>
> Therefore, ext4_iomap_block_zero_range():
> - Must NOT be called under an active handle.
> - Cannot rely on data=ordered mode to ensure zeroed data persistence
> before updating i_disksize (for the cases of post-EOF append write,
> post-EOF fallocate, and truncate up). In subsequent patches, we will
> address this by synchronizing commit I/O without waiting for
> completion, and updating i_disksize to i_size only after the zeroed
> data has been written back.
>
> Signed-off-by: Zhang Yi <yi.zhang@xxxxxxxxxx>

Looks good in itself. Feel free to add:

Reviewed-by: Ojaswin Mujoo <ojaswin@xxxxxxxxxxxxx>

Regards,
Ojaswin

> ---
> fs/ext4/inode.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 92 insertions(+)
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index c6fe42d012fc..e0dae2501292 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4101,6 +4101,51 @@ static int ext4_iomap_buffered_da_write_end(struct inode *inode, loff_t offset,
> return 0;
> }
>
> +static int ext4_iomap_zero_begin(struct inode *inode,
> + loff_t offset, loff_t length, unsigned int flags,
> + struct iomap *iomap, struct iomap *srcmap)
> +{
> + struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
> + struct ext4_map_blocks map;
> + u8 blkbits = inode->i_blkbits;
> + unsigned int iomap_flags = 0;
> + int ret;
> +
> + ret = ext4_emergency_state(inode->i_sb);
> + if (unlikely(ret))
> + return ret;
> +
> + if (WARN_ON_ONCE(!(flags & IOMAP_ZERO)))
> + return -EINVAL;
> +
> + ret = ext4_iomap_map_blocks(inode, offset, length, NULL, &map);
> + if (ret < 0)
> + return ret;
> +
> + /*
> + * Look up dirty folios for unwritten mappings within EOF. Providing
> + * this bypasses the flush iomap uses to trigger extent conversion
> + * when unwritten mappings have dirty pagecache in need of zeroing.
> + */
> + if (map.m_flags & EXT4_MAP_UNWRITTEN) {
> + loff_t start = ((loff_t)map.m_lblk) << blkbits;
> + loff_t end = ((loff_t)map.m_lblk + map.m_len) << blkbits;
> +
> + iomap_fill_dirty_folios(iter, &start, end, &iomap_flags);
> + if ((start >> blkbits) < map.m_lblk + map.m_len)
> + map.m_len = (start >> blkbits) - map.m_lblk;
> + }
> +
> + ext4_set_iomap(inode, iomap, &map, offset, length, flags);
> + iomap->flags |= iomap_flags;
> +
> + return 0;
> +}
> +
> +static const struct iomap_ops ext4_iomap_zero_ops = {
> + .iomap_begin = ext4_iomap_zero_begin,
> +};
> +
> /*
> * Since we always allocate unwritten extents, there is no need for
> * iomap_end to clean up allocated blocks on a short write.
> @@ -4616,6 +4661,47 @@ static int ext4_block_journalled_zero_range(struct inode *inode, loff_t from,
> return err;
> }
>
> +static int ext4_block_iomap_zero_range(struct inode *inode, loff_t from,
> + loff_t length, bool *did_zero,
> + bool *zero_written)
> +{
> + int ret;
> +
> + /*
> + * Zeroing out under an active handle can cause deadlock since
> + * the order of acquiring the folio lock and starting a handle is
> + * inconsistent with the iomap writeback procedure.
> + */
> + if (WARN_ON_ONCE(ext4_handle_valid(journal_current_handle())))
> + return -EINVAL;
> +
> + /* The zeroing scope should not extend across a block. */
> + if (WARN_ON_ONCE((from >> inode->i_blkbits) !=
> + ((from + length - 1) >> inode->i_blkbits)))
> + return -EINVAL;
> +
> + if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS) &&
> + !(inode_state_read_once(inode) & (I_NEW | I_FREEING)))
> + WARN_ON_ONCE(!inode_is_locked(inode) &&
> + !rwsem_is_locked(&inode->i_mapping->invalidate_lock));
> +
> + ret = iomap_zero_range(inode, from, length, did_zero,
> + &ext4_iomap_zero_ops, &ext4_iomap_write_ops,
> + NULL);
> + if (ret)
> + return ret;
> +
> + /*
> + * TODO: The iomap does not distinguish between different types of
> + * zeroing and always sets zero_written if a zeroing operation is
> + * performed, which may result in unnecessary order operations.
> + */
> + if (did_zero && zero_written)
> + *zero_written = *did_zero;
> +
> + return 0;
> +}
> +
> /*
> * Zeros out a mapping of length 'length' starting from file offset
> * 'from'. The range to be zero'd must be contained with in one block.
> @@ -4642,6 +4728,9 @@ static int ext4_block_zero_range(struct inode *inode,
> } else if (ext4_should_journal_data(inode)) {
> return ext4_block_journalled_zero_range(inode, from, length,
> did_zero);
> + } else if (ext4_inode_buffered_iomap(inode)) {
> + return ext4_block_iomap_zero_range(inode, from, length,
> + did_zero, zero_written);
> }
> return ext4_block_do_zero_range(inode, from, length, did_zero,
> zero_written);
> @@ -4682,6 +4771,9 @@ int ext4_block_zero_eof(struct inode *inode, loff_t from, loff_t end)
> * truncating up or performing an append write, because there might be
> * exposing stale on-disk data which may caused by concurrent post-EOF
> * mmap write during folio writeback.
> + *
> + * TODO: In the iomap path, handle this by updating i_disksize to
> + * i_size after the zeroed data has been written back.
> */
> if (ext4_should_order_data(inode) &&
> did_zero && zero_written && !IS_DAX(inode)) {
> --
> 2.52.0
>