Patchwork [v2,2/10] xfs: Add support FALLOC_FL_INSERT_RANGE for fallocate

login
register
mail settings
Submitter Namjae Jeon
Date May 8, 2014, 10:26 a.m.
Message ID <003801cf6aa7$f1f87b70$d5e97250$@samsung.com>
Download mbox | patch
Permalink /patch/346982/
State Superseded
Headers show

Comments

Namjae Jeon - May 8, 2014, 10:26 a.m.
This patch implements fallocate's FALLOC_FL_INSERT_RANGE for XFS.

1) Make sure that both offset and len are block size aligned.
2) Update the i_size of inode by len bytes.
3) Compute the file's logical block number against offset. If the computed
   block number is not the starting block of the extent, split the extent
   such that the block number is the starting block of the extent.
4) Shift all the extents which are lying between [offset, last allocated extent]
   towards right by len bytes. This step will make a hole of len bytes
   at offset.
5) Allocate unwritten extents for the hole created in step 4.

Signed-off-by: Namjae Jeon <namjae.jeon@samsung.com>
Signed-off-by: Ashish Sangwan <a.sangwan@samsung.com>
---
 fs/xfs/xfs_bmap.c      | 372 ++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/xfs/xfs_bmap.h      |   9 +-
 fs/xfs/xfs_bmap_util.c | 129 ++++++++++++++++-
 fs/xfs/xfs_bmap_util.h |   2 +
 fs/xfs/xfs_file.c      |  37 ++++-
 fs/xfs/xfs_trace.h     |   1 +
 6 files changed, 545 insertions(+), 5 deletions(-)
Brian Foster - May 9, 2014, 3:24 p.m.
On Thu, May 08, 2014 at 07:26:16PM +0900, Namjae Jeon wrote:
> This patch implements fallocate's FALLOC_FL_INSERT_RANGE for XFS.
> 
> 1) Make sure that both offset and len are block size aligned.
> 2) Update the i_size of inode by len bytes.
> 3) Compute the file's logical block number against offset. If the computed
>    block number is not the starting block of the extent, split the extent
>    such that the block number is the starting block of the extent.
> 4) Shift all the extents which are lying bewteen [offset, last allocated extent]
>    towards right by len bytes. This step will make a hole of len bytes
>    at offset.
> 5) Allocate unwritten extents for the hole created in step 4.
> 
> Signed-off-by: Namjae Jeon <namjae.jeon@samsung.com>
> Signed-off-by: Ashish Sangwan <a.sangwan@samsung.com>
> ---
>  fs/xfs/xfs_bmap.c      | 372 ++++++++++++++++++++++++++++++++++++++++++++++++-
>  fs/xfs/xfs_bmap.h      |   9 +-
>  fs/xfs/xfs_bmap_util.c | 129 ++++++++++++++++-
>  fs/xfs/xfs_bmap_util.h |   2 +
>  fs/xfs/xfs_file.c      |  37 ++++-
>  fs/xfs/xfs_trace.h     |   1 +
>  6 files changed, 545 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
> index 1ff0da6..e24aa14 100644
> --- a/fs/xfs/xfs_bmap.c
> +++ b/fs/xfs/xfs_bmap.c
> @@ -5419,7 +5419,7 @@ error0:
>   * into, this will be considered invalid operation and we abort immediately.
>   */
>  int
> -xfs_bmap_shift_extents(
> +xfs_bmap_shift_extents_left(
>  	struct xfs_trans	*tp,
>  	struct xfs_inode	*ip,
>  	int			*done,
> @@ -5449,7 +5449,7 @@ xfs_bmap_shift_extents(
>  	    (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
>  	     XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
>  	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
> -		XFS_ERROR_REPORT("xfs_bmap_shift_extents",
> +		XFS_ERROR_REPORT("xfs_bmap_shift_extents_left",
>  				 XFS_ERRLEVEL_LOW, mp);
>  		return XFS_ERROR(EFSCORRUPTED);
>  	}
> @@ -5606,3 +5606,371 @@ del_cursor:
>  	xfs_trans_log_inode(tp, ip, logflags);
>  	return error;
>  }
> +
> +/*
> + * Splits an extent into two extents at split_fsb block that it is
> + * the first block of the current_ext. @current_ext is a target extent
> + * to be splitted. @split_fsb is a block where the extents is spliited.
> + * If split_fsb lies in a hole or the first block of extents, just return 0.
> + */
> +STATIC int
> +xfs_bmap_split_extent_at(
> +	struct xfs_trans	*tp,
> +	struct xfs_inode	*ip,
> +	xfs_fileoff_t		split_fsb,
> +	xfs_extnum_t		*current_ext,
> +	xfs_fsblock_t		*firstfsb,
> +	struct xfs_bmap_free	*free_list)
> +{
> +	int				whichfork = XFS_DATA_FORK;
> +	struct xfs_btree_cur		*cur;
> +	struct xfs_bmbt_rec_host	*gotp;
> +	struct xfs_bmbt_irec		got;
> +	struct xfs_bmbt_irec		new; /* splitted extent */
> +	struct xfs_mount		*mp = ip->i_mount;
> +	struct xfs_ifork		*ifp;
> +	xfs_fsblock_t			gotblkcnt; /* new block count for got */
> +	int				error = 0;
> +	int				logflags;
> +	int				i = 0;
> +
> +	if (unlikely(XFS_TEST_ERROR(
> +	    (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
> +	     XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
> +	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
> +		XFS_ERROR_REPORT("xfs_bmap_split_extent_at",
> +				 XFS_ERRLEVEL_LOW, mp);
> +		return XFS_ERROR(EFSCORRUPTED);
> +	}
> +
> +	if (XFS_FORCED_SHUTDOWN(mp))
> +		return XFS_ERROR(EIO);
> +
> +	ASSERT(current_ext != NULL);
> +
> +	ifp = XFS_IFORK_PTR(ip, whichfork);
> +	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
> +		/* Read in all the extents */
> +		error = xfs_iread_extents(tp, ip, whichfork);
> +		if (error)
> +			return error;
> +	}
> +
> +	gotp = xfs_iext_bno_to_ext(ifp, split_fsb, current_ext);
> +	/*
> +	 * gotp can be null in 2 cases: 1) if there are no extents
> +	 * or 2) split_fsb lies in a hole beyond which there are
> +	 * no extents. Either way, we are done.
> +	 */
> +	if (!gotp)
> +		return 0;
> +
> +	xfs_bmbt_get_all(gotp, &got);
> +
> +	/*
> +	 * Check split_fsb lies in a hole or the start boundary offset
> +	 * of the extent.
> +	 */
> +	if (got.br_startoff >= split_fsb)
> +		return 0;
> +
> +	gotblkcnt = split_fsb - got.br_startoff;
> +	new.br_startoff = split_fsb;
> +	new.br_startblock = got.br_startblock + gotblkcnt;
> +	new.br_blockcount = got.br_blockcount - gotblkcnt;
> +	new.br_state = got.br_state;
> +
> +	/* We are going to change core inode */
> +	logflags = XFS_ILOG_CORE;
> +
> +	if (ifp->if_flags & XFS_IFBROOT) {
> +		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
> +		cur->bc_private.b.firstblock = *firstfsb;
> +		cur->bc_private.b.flist = free_list;
> +		cur->bc_private.b.flags = 0;
> +	} else {
> +		cur = NULL;
> +		logflags |= XFS_ILOG_DEXT;
> +	}
> +
> +	if (cur) {
> +		error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
> +					   got.br_startblock,
> +					   got.br_blockcount,
> +					   &i);
> +		if (error)
> +			goto del_cursor;
> +		XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
> +	}
> +
> +	xfs_bmbt_set_blockcount(gotp, gotblkcnt);
> +	got.br_blockcount = gotblkcnt;
> +	if (cur) {
> +		error = xfs_bmbt_update(cur, got.br_startoff,
> +					got.br_startblock,
> +					got.br_blockcount,
> +					got.br_state);
> +		if (error)
> +			goto del_cursor;
> +	}
> +
> +	/* Add new extent */
> +	(*current_ext)++;
> +	xfs_iext_insert(ip, *current_ext, 1, &new, 0);
> +	XFS_IFORK_NEXT_SET(ip, whichfork,
> +		XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
> +
> +	if (cur) {
> +		error = xfs_bmbt_lookup_eq(cur, new.br_startoff,
> +					   new.br_startblock, new.br_blockcount,
> +					   &i);
> +		if (error)
> +			goto del_cursor;
> +		XFS_WANT_CORRUPTED_GOTO(i == 0, del_cursor);
> +		cur->bc_rec.b.br_state = new.br_state;
> +
> +		error = xfs_btree_insert(cur, &i);
> +		if (error)
> +			goto del_cursor;
> +		XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
> +	}
> +
> +	/*
> +	 * Convert to a btree if necessary.
> +	 */
> +	if (xfs_bmap_needs_btree(ip, whichfork)) {
> +		int tmp_logflags; /* partial log flag return val */
> +
> +		ASSERT(cur == NULL);
> +		error = xfs_bmap_extents_to_btree(tp, ip, firstfsb, free_list,
> +				&cur, 0, &tmp_logflags, whichfork);
> +		logflags |= tmp_logflags;
> +	}
> +
> +del_cursor:
> +	if (cur)
> +		xfs_btree_del_cursor(cur,
> +			error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
> +	xfs_trans_log_inode(tp, ip, logflags);
> +	return error;
> +}
> +
> +int
> +xfs_bmap_split_extent(
> +	struct xfs_inode	*ip,
> +	xfs_fileoff_t		split_fsb,
> +	xfs_extnum_t		*split_ext)
> +{
> +	struct xfs_mount        *mp = ip->i_mount;
> +	struct xfs_trans	*tp;
> +	struct xfs_bmap_free	free_list;
> +	xfs_fsblock_t		firstfsb;
> +	int			committed;
> +	int			error;
> +
> +	tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
> +	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
> +			XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
> +
> +	if (error) {
> +		/*
> +		 * Free the transaction structure.
> +		 */
> +		ASSERT(XFS_FORCED_SHUTDOWN(mp));

As in the other patch, we're attempting to reserve fs blocks for the
transaction, so ENOSPC is a possibility that I think the assert should
accommodate.

> +		xfs_trans_cancel(tp, 0);
> +		return error;
> +	}
> +
> +	xfs_ilock(ip, XFS_ILOCK_EXCL);
> +	error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
> +			ip->i_gdquot, ip->i_pdquot,
> +			XFS_DIOSTRAT_SPACE_RES(mp, 0), 0,
> +			XFS_QMOPT_RES_REGBLKS);
> +	if (error)
> +		goto error1;
> +
> +	xfs_trans_ijoin(tp, ip, 0);
> +	xfs_bmap_init(&free_list, &firstfsb);
> +
> +	error = xfs_bmap_split_extent_at(tp, ip, split_fsb, split_ext,
> +					 &firstfsb, &free_list);
> +	if (error)
> +		goto error0;
> +
> +	error = xfs_bmap_finish(&tp, &free_list, &committed);
> +	if (error)
> +		goto error0;
> +
> +	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
> +	xfs_iunlock(ip, XFS_ILOCK_EXCL);
> +
> +	return error;
> +error0:
> +	xfs_bmap_cancel(&free_list);
> +error1:
> +	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
> +	xfs_iunlock(ip, XFS_ILOCK_EXCL);
> +	return error;
> +}
> +
> +/*
> + * Shift extent records to the right to make a hole.
> + * The maximum number of extents to be shifted in a single operation
> + * is @num_exts, and @current_ext keeps track of the current extent
> + * index we have shifted. @offset_shift_fsb is the length by which each
> + * extent is shifted. @end_ext is the last extent to be shifted.
> + */
> +int
> +xfs_bmap_shift_extents_right(
> +	struct xfs_trans	*tp,
> +	struct xfs_inode	*ip,
> +	int			*done,
> +	xfs_fileoff_t		offset_shift_fsb,
> +	xfs_extnum_t		*current_ext,
> +	xfs_extnum_t		end_ext,
> +	xfs_fsblock_t		*firstblock,
> +	struct xfs_bmap_free	*flist,
> +	int			num_exts)
> +{
> +	struct xfs_mount		*mp = ip->i_mount;
> +	struct xfs_btree_cur		*cur;
> +	struct xfs_bmbt_rec_host	*gotp;
> +	struct xfs_bmbt_irec		got;
> +	struct xfs_bmbt_irec            right;
> +	xfs_ifork_t			*ifp;
> +	xfs_fileoff_t			startoff;
> +	xfs_filblks_t			blockcount = 0;
> +	xfs_extnum_t			last_extent;
> +	int				error = 0;
> +	int				i;
> +	int				whichfork = XFS_DATA_FORK;
> +	int				logflags;
> +
> +	if (unlikely(XFS_TEST_ERROR(
> +	    (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
> +	     XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
> +	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
> +		XFS_ERROR_REPORT("xfs_bmap_shift_extents_right",
> +				 XFS_ERRLEVEL_LOW, mp);
> +		return XFS_ERROR(EFSCORRUPTED);
> +	}
> +
> +	if (XFS_FORCED_SHUTDOWN(mp))
> +		return XFS_ERROR(EIO);
> +
> +	ASSERT(current_ext != NULL);
> +
> +	/* We are going to change core inode */
> +	logflags = XFS_ILOG_CORE;
> +	ifp = XFS_IFORK_PTR(ip, whichfork);
> +
> +	if (ifp->if_flags & XFS_IFBROOT) {
> +		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
> +		cur->bc_private.b.firstblock = *firstblock;
> +		cur->bc_private.b.flist = flist;
> +		cur->bc_private.b.flags = 0;
> +	} else {
> +		cur = NULL;
> +		logflags |= XFS_ILOG_DEXT;
> +	}
> +
> +	/* start shifting extents to right */
> +	while (num_exts-- > 0) {
> +		blockcount = 0;
> +
> +		if (*current_ext < end_ext) {
> +			*done = 1;
> +			break;
> +		}
> +
> +		gotp = xfs_iext_get_ext(ifp, *current_ext);
> +		xfs_bmbt_get_all(gotp, &got);
> +		startoff = got.br_startoff + offset_shift_fsb;
> +
> +		/*
> +		 * Before shifting extent into hole, make sure that the hole
> +		 * is large enough to accomodate the shift. This checking has
> +		 * to be performed for all except the last extent.
> +		 */
> +		last_extent = (ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) - 1;
> +		if (last_extent != *current_ext) {
> +			xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
> +						*current_ext + 1), &right);
> +			if (startoff + got.br_blockcount > right.br_startoff) {
> +				error = XFS_ERROR(EINVAL);
> +				if (error)
> +					goto del_cursor;
> +			}
> +		}
> +
> +		/* Check if we can merge 2 adjacent extents */
> +		if (last_extent != *current_ext &&
> +		    right.br_startoff == startoff + got.br_blockcount &&
> +		    right.br_startblock ==
> +				got.br_startblock + got.br_blockcount &&
> +		    right.br_state == got.br_state &&
> +		    right.br_blockcount + got.br_blockcount <= MAXEXTLEN) {
> +			blockcount = right.br_blockcount + got.br_blockcount;
> +
> +			/* Make cursor point to the extent we will update */

The comment could be more clear about what we're doing in this case. For
example:

/*
 * Merge the current extent with the extent to the right. Remove the right
 * extent, calculate a new block count for the current extent to cover the range
 * of both and decrement the number of extents in the fork.
 */

I'd also move the comment before the blockcount calculation.

> +			if (cur) {
> +				error = xfs_bmbt_lookup_eq(cur,
> +							   right.br_startoff,
> +							   right.br_startblock,
> +							   right.br_blockcount,
> +							   &i);
> +				if (error)
> +					goto del_cursor;
> +				XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
> +			}
> +
> +			xfs_iext_remove(ip, *current_ext + 1, 1, 0);
> +			if (cur) {
> +				error = xfs_btree_delete(cur, &i);
> +				if (error)
> +					goto del_cursor;
> +				XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
> +			}
> +			XFS_IFORK_NEXT_SET(ip, whichfork,
> +					XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
> +
> +		}
> +
> +		if (cur) {
> +			error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
> +						   got.br_startblock,
> +						   got.br_blockcount,
> +						   &i);
> +			if (error)
> +				goto del_cursor;
> +			XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
> +		}
> +
> +		if (got.br_blockcount < blockcount) {
> +			xfs_bmbt_set_blockcount(gotp, blockcount);
> +			got.br_blockcount = blockcount;
> +		}

How about just 'if (blockcount)' so the algorithm is clear?

> +
> +
> +		xfs_bmbt_set_startoff(gotp, startoff);
> +		got.br_startoff = startoff;
> +
> +		if (cur) {
> +			error = xfs_bmbt_update(cur, got.br_startoff,
> +						got.br_startblock,
> +						got.br_blockcount,
> +						got.br_state);
> +			if (error)
> +				goto del_cursor;
> +		}
> +
> +		(*current_ext)--;
> +	}
> +
> +del_cursor:
> +	if (cur)
> +		xfs_btree_del_cursor(cur,
> +			error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
> +	xfs_trans_log_inode(tp, ip, logflags);
> +	return error;
> +}
> diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
> index 38ba36e..af05899 100644
> --- a/fs/xfs/xfs_bmap.h
> +++ b/fs/xfs/xfs_bmap.h
> @@ -179,10 +179,17 @@ int	xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
>  int	xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
>  		xfs_extnum_t num);
>  uint	xfs_default_attroffset(struct xfs_inode *ip);
> -int	xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
> +int	xfs_bmap_shift_extents_left(struct xfs_trans *tp, struct xfs_inode *ip,
>  		int *done, xfs_fileoff_t start_fsb,
>  		xfs_fileoff_t offset_shift_fsb, xfs_extnum_t *current_ext,
>  		xfs_fsblock_t *firstblock, struct xfs_bmap_free	*flist,
>  		int num_exts);
> +int	xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset,
> +		xfs_extnum_t *split_ext);
> +int	xfs_bmap_shift_extents_right(struct xfs_trans *tp, struct xfs_inode *ip,
> +		int *done, xfs_fsblock_t offset_shift_fsb,
> +		xfs_extnum_t *current_ext, xfs_extnum_t end_ext,
> +		xfs_fsblock_t *firstblock, struct xfs_bmap_free *flist,
> +		int num_exts);
>  
>  #endif	/* __XFS_BMAP_H__ */
> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
> index 296160b..5a56f5b 100644
> --- a/fs/xfs/xfs_bmap_util.c
> +++ b/fs/xfs/xfs_bmap_util.c
> @@ -1550,7 +1550,7 @@ xfs_collapse_file_space(
>  		 * We are using the write transaction in which max 2 bmbt
>  		 * updates are allowed
>  		 */
> -		error = xfs_bmap_shift_extents(tp, ip, &done, start_fsb,
> +		error = xfs_bmap_shift_extents_left(tp, ip, &done, start_fsb,
>  					       shift_fsb, &current_ext,
>  					       &first_block, &free_list,
>  					       XFS_BMAP_MAX_SHIFT_EXTENTS);
> @@ -1574,6 +1574,133 @@ out:
>  }
>  
>  /*
> + * xfs_insert_file_space()
> + *	This routine allocate disk space and shift extent for the given file.
> + *	The first thing we do is to sync dirty data and invalidate page cache
> + *	over the region on which insert range is working. And split an extent
> + *	to two extents at given offset by calling xfs_bmap_split_extent.
> + *	And shift all extent records which are laying between [offset,
> + *	last allocated extent] to the right to reserve hole range. Lastly
> + *	allocate an unwritten extent in hole range created by shifting extents.
> + *
> + * RETURNS:
> + *	0 on success
> + *	errno on error
> + *
> + */
> +int
> +xfs_insert_file_space(
> +	struct xfs_inode	*ip,
> +	loff_t			offset,
> +	loff_t			len)
> +{
> +	struct xfs_mount	*mp = ip->i_mount;
> +	struct xfs_trans	*tp;
> +	struct xfs_bmap_free	free_list;
> +	xfs_fsblock_t		first_block;
> +	xfs_ifork_t		*ifp;
> +	int			done = 0;
> +	int			committed;
> +	int			error;
> +	uint			rounding;
> +	xfs_fileoff_t		start_fsb;
> +	xfs_fileoff_t		shift_fsb;
> +	xfs_extnum_t		split_ext;
> +	xfs_extnum_t		current_ext = 0;
> +	xfs_off_t		ioffset;
> +
> +	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
> +	trace_xfs_insert_file_space(ip);
> +
> +	error = xfs_qm_dqattach(ip, 0);
> +	if (error)
> +		return error;
> +
> +	/* wait for the completion of any pending DIOs */
> +	inode_dio_wait(VFS_I(ip));
> +
> +	rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
> +	ioffset = offset & ~(rounding - 1);
> +	error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
> +			ioffset, -1);
> +	if (error)
> +		return error;
> +
> +	truncate_pagecache_range(VFS_I(ip), ioffset, -1);
> +
> +	start_fsb = XFS_B_TO_FSB(mp, offset);
> +	shift_fsb = XFS_B_TO_FSB(mp, len);
> +
> +	error = xfs_bmap_split_extent(ip, start_fsb, &split_ext);
> +	if (error)
> +		return error;
> +
> +	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
> +	current_ext = (ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) - 1;
> +	while (!error && !done) {
> +		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
> +		/*
> +		 * We would need to reserve permanent block for transaction.
> +		 * This will come into picture when after shifting extent into
> +		 * hole we found that adjacent extents can be merged which
> +		 * may lead to freeing of a block during record update.
> +		 */
> +		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
> +				XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
> +		if (error) {
> +			ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
> +			xfs_trans_cancel(tp, 0);
> +			break;
> +		}
> +
> +		xfs_ilock(ip, XFS_ILOCK_EXCL);
> +		error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
> +				ip->i_gdquot, ip->i_pdquot,
> +				XFS_DIOSTRAT_SPACE_RES(mp, 0), 0,
> +				XFS_QMOPT_RES_REGBLKS);
> +		if (error)
> +			goto error1;
> +
> +		xfs_trans_ijoin(tp, ip, 0);
> +
> +		xfs_bmap_init(&free_list, &first_block);
> +
> +		/*
> +		 * We are using the write transaction in which max 2 bmbt
> +		 * updates are allowed
> +		 */
> +		error = xfs_bmap_shift_extents_right(tp, ip, &done, shift_fsb,
> +						&current_ext, split_ext,
> +						&first_block, &free_list,
> +						XFS_BMAP_MAX_SHIFT_EXTENTS);
> +		if (error)
> +			goto error0;
> +
> +		error = xfs_bmap_finish(&tp, &free_list, &committed);
> +		if (error)
> +			goto error0;
> +
> +		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
> +		xfs_iunlock(ip, XFS_ILOCK_EXCL);
> +		if (error)
> +			goto out;
> +	}
> +
> +	/* Add unwritten extent in a hole range. */
> +	error = xfs_alloc_file_space(ip, offset, len, XFS_BMAPI_PREALLOC);
> +
> +out:
> +	return error;
> +
> +error0:
> +	xfs_bmap_cancel(&free_list);
> +error1:
> +	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
> +	xfs_iunlock(ip, XFS_ILOCK_EXCL);
> +	return error;
> +}
> +
> +/*
>   * We need to check that the format of the data fork in the temporary inode is
>   * valid for the target inode before doing the swap. This is not a problem with
>   * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
> diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
> index 935ed2b..d62ab4b 100644
> --- a/fs/xfs/xfs_bmap_util.h
> +++ b/fs/xfs/xfs_bmap_util.h
> @@ -101,6 +101,8 @@ int	xfs_zero_file_space(struct xfs_inode *ip, xfs_off_t offset,
>  			    xfs_off_t len);
>  int	xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
>  				xfs_off_t len);
> +int	xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
> +				xfs_off_t len);
>  
>  /* EOF block manipulation functions */
>  bool	xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 97855c5..392b029 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -760,7 +760,8 @@ xfs_file_fallocate(
>  	if (!S_ISREG(inode->i_mode))
>  		return -EINVAL;
>  	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
> -		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
> +		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
> +		     FALLOC_FL_INSERT_RANGE))
>  		return -EOPNOTSUPP;
>  
>  	xfs_ilock(ip, XFS_IOLOCK_EXCL);
> @@ -790,6 +791,40 @@ xfs_file_fallocate(
>  		error = xfs_collapse_file_space(ip, offset, len);
>  		if (error)
>  			goto out_unlock;
> +	} else if (mode & FALLOC_FL_INSERT_RANGE) {
> +		unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
> +		struct iattr iattr;
> +
> +		if (offset & blksize_mask || len & blksize_mask) {
> +			error = -EINVAL;
> +			goto out_unlock;
> +		}
> +
> +		/* Check for wrap through zero */
> +		if (inode->i_size + len > inode->i_sb->s_maxbytes) {
> +			error = -EFBIG;
> +			goto out_unlock;
> +		}
> +
> +		/* Offset should be less than i_size */
> +		if (offset >= i_size_read(inode)) {
> +			error = -EINVAL;
> +			goto out_unlock;
> +		}
> +
> +		/*
> +		 * The first thing we do is to expand file to
> +		 * avoid data loss if there is error while shifting
> +		 */
> +		iattr.ia_valid = ATTR_SIZE;
> +		iattr.ia_size = i_size_read(inode) + len;
> +		error = xfs_setattr_size(ip, &iattr);
> +		if (error)
> +			goto out_unlock;

I don't necessarily know that it's problematic to do the setattr before
the bmap fixup. We'll have a chance for partial completion of this
operation either way. But I'm not a fan of the code duplication here.
This also still skips the time update in the event of insert space
failure, though perhaps that's not such a big deal if we're returning an
error.

I think it would be better to leave things organized as before and
introduce an error2 variable and a &nrshifts or some such parameter to
xfs_insert_file_space() that initializes to 0 and returns the number of
record shifts. The caller can then decide whether it's appropriate to
break out immediately or do the inode size update and return the error.

Perhaps not the cleanest thing in the world, but also not the first
place we would use 'error2' to manage error priorities (grep around for
it)...

Brian

> +
> +		error = xfs_insert_file_space(ip, offset, len);
> +		if (error)
> +			goto out_unlock;
>  	} else {
>  		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
>  		    offset + len > i_size_read(inode)) {
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index 152f827..8943c9f 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -663,6 +663,7 @@ DEFINE_INODE_EVENT(xfs_alloc_file_space);
>  DEFINE_INODE_EVENT(xfs_free_file_space);
>  DEFINE_INODE_EVENT(xfs_zero_file_space);
>  DEFINE_INODE_EVENT(xfs_collapse_file_space);
> +DEFINE_INODE_EVENT(xfs_insert_file_space);
>  DEFINE_INODE_EVENT(xfs_readdir);
>  #ifdef CONFIG_XFS_POSIX_ACL
>  DEFINE_INODE_EVENT(xfs_get_acl);
> -- 
> 1.7.11-rc0
> 
> _______________________________________________
> xfs mailing list
> xfs@oss.sgi.com
> http://oss.sgi.com/mailman/listinfo/xfs
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Namjae Jeon - May 12, 2014, 9:42 a.m.
> > +xfs_bmap_split_extent(
> > +	struct xfs_inode	*ip,
> > +	xfs_fileoff_t		split_fsb,
> > +	xfs_extnum_t		*split_ext)
> > +{
> > +	struct xfs_mount        *mp = ip->i_mount;
> > +	struct xfs_trans	*tp;
> > +	struct xfs_bmap_free	free_list;
> > +	xfs_fsblock_t		firstfsb;
> > +	int			committed;
> > +	int			error;
> > +
> > +	tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
> > +	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
> > +			XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
> > +
> > +	if (error) {
> > +		/*
> > +		 * Free the transaction structure.
> > +		 */
> > +		ASSERT(XFS_FORCED_SHUTDOWN(mp));
> 
Hi, Brian.
> As in the other patch, we're attempting to reserve fs blocks for the
> transaction, so ENOSPC is a possibility that I think the assert should
> accommodate.
How about removing the ASSERT completely, as suggested by Dave
in the other thread?

> 
> > +		xfs_trans_cancel(tp, 0);
> > +		return error;
> > +	}
> > +

> > +
> > +		/*
> > +		 * Before shifting extent into hole, make sure that the hole
> > +		 * is large enough to accomodate the shift. This checking has
> > +		 * to be performed for all except the last extent.
> > +		 */
> > +		last_extent = (ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) - 1;
> > +		if (last_extent != *current_ext) {
> > +			xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
> > +						*current_ext + 1), &right);
> > +			if (startoff + got.br_blockcount > right.br_startoff) {
> > +				error = XFS_ERROR(EINVAL);
> > +				if (error)
> > +					goto del_cursor;
> > +			}
> > +		}
> > +
> > +		/* Check if we can merge 2 adjacent extents */
> > +		if (last_extent != *current_ext &&
> > +		    right.br_startoff == startoff + got.br_blockcount &&
> > +		    right.br_startblock ==
> > +				got.br_startblock + got.br_blockcount &&
> > +		    right.br_state == got.br_state &&
> > +		    right.br_blockcount + got.br_blockcount <= MAXEXTLEN) {
> > +			blockcount = right.br_blockcount + got.br_blockcount;
> > +
> > +			/* Make cursor point to the extent we will update */
> 
> The comment could be more clear about what we're doing in this case. For
> example:
> 
> /*
>  * Merge the current extent with the extent to the right. Remove the right
>  * extent, calculate a new block count for the current extent to cover the range
>  * of both and decrement the number of extents in the fork.
>  */
> 
> I'd also move the comment before the blockcount calculation.
Okay, I will add it as you suggested.
> 
> > +			if (cur) {
> > +				error = xfs_bmbt_lookup_eq(cur,
> > +							   right.br_startoff,
> > +							   right.br_startblock,
> > +							   right.br_blockcount,
> > +							   &i);
> > +				if (error)
> > +					goto del_cursor;
> > +				XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
> > +			}
> > +
> > +			xfs_iext_remove(ip, *current_ext + 1, 1, 0);
> > +			if (cur) {
> > +				error = xfs_btree_delete(cur, &i);
> > +				if (error)
> > +					goto del_cursor;
> > +				XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
> > +			}
> > +			XFS_IFORK_NEXT_SET(ip, whichfork,
> > +					XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
> > +
> > +		}
> > +
> > +		if (cur) {
> > +			error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
> > +						   got.br_startblock,
> > +						   got.br_blockcount,
> > +						   &i);
> > +			if (error)
> > +				goto del_cursor;
> > +			XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
> > +		}
> > +
> > +		if (got.br_blockcount < blockcount) {
> > +			xfs_bmbt_set_blockcount(gotp, blockcount);
> > +			got.br_blockcount = blockcount;
> > +		}
> 
> How about just 'if (blockcount)' so the algorithm is clear?
yes, more clear.
> 
> > +
> > +
> > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> > index 97855c5..392b029 100644
> > --- a/fs/xfs/xfs_file.c
> > +++ b/fs/xfs/xfs_file.c
> > @@ -760,7 +760,8 @@ xfs_file_fallocate(
> >  	if (!S_ISREG(inode->i_mode))
> >  		return -EINVAL;
> >  	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
> > -		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
> > +		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
> > +		     FALLOC_FL_INSERT_RANGE))
> >  		return -EOPNOTSUPP;
> >
> >  	xfs_ilock(ip, XFS_IOLOCK_EXCL);
> > @@ -790,6 +791,40 @@ xfs_file_fallocate(
> >  		error = xfs_collapse_file_space(ip, offset, len);
> >  		if (error)
> >  			goto out_unlock;
> > +	} else if (mode & FALLOC_FL_INSERT_RANGE) {
> > +		unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
> > +		struct iattr iattr;
> > +
> > +		if (offset & blksize_mask || len & blksize_mask) {
> > +			error = -EINVAL;
> > +			goto out_unlock;
> > +		}
> > +
> > +		/* Check for wrap through zero */
> > +		if (inode->i_size + len > inode->i_sb->s_maxbytes) {
> > +			error = -EFBIG;
> > +			goto out_unlock;
> > +		}
> > +
> > +		/* Offset should be less than i_size */
> > +		if (offset >= i_size_read(inode)) {
> > +			error = -EINVAL;
> > +			goto out_unlock;
> > +		}
> > +
> > +		/*
> > +		 * The first thing we do is to expand file to
> > +		 * avoid data loss if there is error while shifting
> > +		 */
> > +		iattr.ia_valid = ATTR_SIZE;
> > +		iattr.ia_size = i_size_read(inode) + len;
> > +		error = xfs_setattr_size(ip, &iattr);
> > +		if (error)
> > +			goto out_unlock;
> 
> I don't necessarily know that it's problematic to do the setattr before
> the bmap fixup. We'll have a chance for partial completion of this
> operation either way. But I'm not a fan of the code duplication here.
> This also still skips the time update in the event of insert space
> failure, though perhaps that's not such a big deal if we're returning an
> error.
> 
> I think it would be better to leave things organized as before and
> introduce an error2 variable and a &nrshifts or some such parameter to
> xfs_insert_file_space() that initializes to 0 and returns the number of
> record shifts. The caller can then decide whether it's appropriate to
> break out immediately or do the inode size update and return the error.
> 
> Perhaps not the cleanest thing in the world, but also not the first
> place we would use 'error2' to manage error priorities (grep around for
> it)...
Yes, right. I also thought of that sequence at first. But we should consider
the sudden power-off and device-unplug cases during extent shifting.
If there is a sudden power failure while we are in the middle of shifting
extents, the user can still think that data is lost, as we won't get any
chance to update the file size in these cases.
Updating the file size before the shifting operation starts will prevent this.

Thanks.
> 
> Brian
> 
> > +
> > +		error = xfs_insert_file_space(ip, offset, len);
> > +		if (error)
> > +			goto out_unlock;
> >  	} else {
> >  		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
> >  		    offset + len > i_size_read(inode)) {

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Brian Foster - May 12, 2014, 11:25 a.m.
On Mon, May 12, 2014 at 06:42:37PM +0900, Namjae Jeon wrote:
> 
> > > +xfs_bmap_split_extent(
> > > +	struct xfs_inode	*ip,
> > > +	xfs_fileoff_t		split_fsb,
> > > +	xfs_extnum_t		*split_ext)
> > > +{
> > > +	struct xfs_mount        *mp = ip->i_mount;
> > > +	struct xfs_trans	*tp;
> > > +	struct xfs_bmap_free	free_list;
> > > +	xfs_fsblock_t		firstfsb;
> > > +	int			committed;
> > > +	int			error;
> > > +
> > > +	tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
> > > +	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
> > > +			XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
> > > +
> > > +	if (error) {
> > > +		/*
> > > +		 * Free the transaction structure.
> > > +		 */
> > > +		ASSERT(XFS_FORCED_SHUTDOWN(mp));
> > 
> Hi, Brian.
> > As in the other patch, we're attempting to reserve fs blocks for the
> > transaction, so ENOSPC is a possibility that I think the assert should
> > accommodate.
> How about removing the ASSERT completely as suggessted by Dave
> in other thread?
> 

Yeah, that works too. If Dave prefers to just remove these asserts
that's fine with me. I just wanted to make sure we aren't adding
spurious asserts for valid failures.

> > 
...
> > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> > > index 97855c5..392b029 100644
> > > --- a/fs/xfs/xfs_file.c
> > > +++ b/fs/xfs/xfs_file.c
> > > @@ -760,7 +760,8 @@ xfs_file_fallocate(
> > >  	if (!S_ISREG(inode->i_mode))
> > >  		return -EINVAL;
> > >  	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
> > > -		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
> > > +		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
> > > +		     FALLOC_FL_INSERT_RANGE))
> > >  		return -EOPNOTSUPP;
> > >
> > >  	xfs_ilock(ip, XFS_IOLOCK_EXCL);
> > > @@ -790,6 +791,40 @@ xfs_file_fallocate(
> > >  		error = xfs_collapse_file_space(ip, offset, len);
> > >  		if (error)
> > >  			goto out_unlock;
> > > +	} else if (mode & FALLOC_FL_INSERT_RANGE) {
> > > +		unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
> > > +		struct iattr iattr;
> > > +
> > > +		if (offset & blksize_mask || len & blksize_mask) {
> > > +			error = -EINVAL;
> > > +			goto out_unlock;
> > > +		}
> > > +
> > > +		/* Check for wrap through zero */
> > > +		if (inode->i_size + len > inode->i_sb->s_maxbytes) {
> > > +			error = -EFBIG;
> > > +			goto out_unlock;
> > > +		}
> > > +
> > > +		/* Offset should be less than i_size */
> > > +		if (offset >= i_size_read(inode)) {
> > > +			error = -EINVAL;
> > > +			goto out_unlock;
> > > +		}
> > > +
> > > +		/*
> > > +		 * The first thing we do is to expand file to
> > > +		 * avoid data loss if there is error while shifting
> > > +		 */
> > > +		iattr.ia_valid = ATTR_SIZE;
> > > +		iattr.ia_size = i_size_read(inode) + len;
> > > +		error = xfs_setattr_size(ip, &iattr);
> > > +		if (error)
> > > +			goto out_unlock;
> > 
> > I don't necessarily know that it's problematic to do the setattr before
> > the bmap fixup. We'll have a chance for partial completion of this
> > operation either way. But I'm not a fan of the code duplication here.
> > This also still skips the time update in the event of insert space
> > failure, though perhaps that's not such a big deal if we're returning an
> > error.
> > 
> > I think it would be better to leave things organized as before and
> > introduce an error2 variable and a &nrshifts or some such parameter to
> > xfs_insert_file_space() that initializes to 0 and returns the number of
> > record shifts. The caller can then decide whether it's appropriate to
> > break out immediately or do the inode size update and return the error.
> > 
> > Perhaps not the cleanest thing in the world, but also not the first
> > place we would use 'error2' to manage error priorities (grep around for
> > it)...
> Yes, Right. I also thought such sequence at first. But we should consider
> sudden power off and unplug device case during shifting extent.
> While we are in the middle of shifitng extents and if there is sudden
> power failure user can still think that data is lost as we won't get any
> chance to update the file size in these cases.
> Updating file size before the shifitng operation can start will prevent this.
> 
> Thanks.

Hmm, fair point. That seems less critical to me than the general error
sequence, but if we want to handle that case I think we could still fix
the duplication in xfs_file_fallocate(). We could possibly factor out
the common bits (update time and set size) into a helper, or what seems
a bit cleaner on first thought, move the bulk of the (mode &
FALLOC_FL_INSERT_RANGE) block to after the common part. Then the
function looks something like this:

	...
	xfs_ilock();

	/* pre-inode fixup ops */
	if (mode & ...) {
		...
	} else if (mode & FALLOC_FL_INSERT_RANGE) {
		/* comment as to what's going on here :) */

		/* error checks */

		new_size = ...;
		do_file_insert = 1;
	}
	...
	xfs_trans_ichgtime();
	xfs_setattr_size();
	...

	/*
	 * Some operations are performed after the inode size is updated. For
	 * example, insert range expands the address space of the file, shifts
	 * all subsequent extents over and allocates space into the hole.
	 * Updating the size first ensures that shifted extents aren't left
	 * hanging past EOF in the event of a crash or failure.
	 */
	if (do_file_insert) {
		/* alloc space */
		...
	}
	...

That seems a bit cleaner to me, but I'm not wedded to it. Thoughts? It
might be worth soliciting some other thoughts/ideas before reworking it.
Thanks.

Brian

> > 
> > Brian
> > 
> > > +
> > > +		error = xfs_insert_file_space(ip, offset, len);
> > > +		if (error)
> > > +			goto out_unlock;
> > >  	} else {
> > >  		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
> > >  		    offset + len > i_size_read(inode)) {
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Namjae Jeon - May 13, 2014, 1:23 a.m.
> 
> On Mon, May 12, 2014 at 06:42:37PM +0900, Namjae Jeon wrote:
> >
> > > > +xfs_bmap_split_extent(
> > > > +	struct xfs_inode	*ip,
> > > > +	xfs_fileoff_t		split_fsb,
> > > > +	xfs_extnum_t		*split_ext)
> > > > +{
> > > > +	struct xfs_mount        *mp = ip->i_mount;
> > > > +	struct xfs_trans	*tp;
> > > > +	struct xfs_bmap_free	free_list;
> > > > +	xfs_fsblock_t		firstfsb;
> > > > +	int			committed;
> > > > +	int			error;
> > > > +
> > > > +	tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
> > > > +	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
> > > > +			XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
> > > > +
> > > > +	if (error) {
> > > > +		/*
> > > > +		 * Free the transaction structure.
> > > > +		 */
> > > > +		ASSERT(XFS_FORCED_SHUTDOWN(mp));
> > >
> > Hi, Brian.
> > > As in the other patch, we're attempting to reserve fs blocks for the
> > > transaction, so ENOSPC is a possibility that I think the assert should
> > > accommodate.
> > How about removing the ASSERT completely, as suggested by Dave
> > in the other thread?
> >
> 
> Yeah, that works too. If Dave prefers to just remove these asserts
> that's fine with me. I just wanted to make sure we aren't adding
> spurious asserts for valid failures.
Okay.
> 
> > >
> ...
> > > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> > > > index 97855c5..392b029 100644
> > > > --- a/fs/xfs/xfs_file.c
> > > > +++ b/fs/xfs/xfs_file.c
> > > > @@ -760,7 +760,8 @@ xfs_file_fallocate(
> > > >  	if (!S_ISREG(inode->i_mode))
> > > >  		return -EINVAL;
> > > >  	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
> > > > -		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
> > > > +		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
> > > > +		     FALLOC_FL_INSERT_RANGE))
> > > >  		return -EOPNOTSUPP;
> > > >
> > > >  	xfs_ilock(ip, XFS_IOLOCK_EXCL);
> > > > @@ -790,6 +791,40 @@ xfs_file_fallocate(
> > > >  		error = xfs_collapse_file_space(ip, offset, len);
> > > >  		if (error)
> > > >  			goto out_unlock;
> > > > +	} else if (mode & FALLOC_FL_INSERT_RANGE) {
> > > > +		unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
> > > > +		struct iattr iattr;
> > > > +
> > > > +		if (offset & blksize_mask || len & blksize_mask) {
> > > > +			error = -EINVAL;
> > > > +			goto out_unlock;
> > > > +		}
> > > > +
> > > > +		/* Check for wrap through zero */
> > > > +		if (inode->i_size + len > inode->i_sb->s_maxbytes) {
> > > > +			error = -EFBIG;
> > > > +			goto out_unlock;
> > > > +		}
> > > > +
> > > > +		/* Offset should be less than i_size */
> > > > +		if (offset >= i_size_read(inode)) {
> > > > +			error = -EINVAL;
> > > > +			goto out_unlock;
> > > > +		}
> > > > +
> > > > +		/*
> > > > +		 * The first thing we do is to expand file to
> > > > +		 * avoid data loss if there is error while shifting
> > > > +		 */
> > > > +		iattr.ia_valid = ATTR_SIZE;
> > > > +		iattr.ia_size = i_size_read(inode) + len;
> > > > +		error = xfs_setattr_size(ip, &iattr);
> > > > +		if (error)
> > > > +			goto out_unlock;
> > >
> > > I don't necessarily know that it's problematic to do the setattr before
> > > the bmap fixup. We'll have a chance for partial completion of this
> > > operation either way. But I'm not a fan of the code duplication here.
> > > This also still skips the time update in the event of insert space
> > > failure, though perhaps that's not such a big deal if we're returning an
> > > error.
> > >
> > > I think it would be better to leave things organized as before and
> > > introduce an error2 variable and a &nrshifts or some such parameter to
> > > xfs_insert_file_space() that initializes to 0 and returns the number of
> > > record shifts. The caller can then decide whether it's appropriate to
> > > break out immediately or do the inode size update and return the error.
> > >
> > > Perhaps not the cleanest thing in the world, but also not the first
> > > place we would use 'error2' to manage error priorities (grep around for
> > > it)...
> > Yes, right. I also thought of such a sequence at first. But we should consider
> > the sudden power-off and device-unplug cases while shifting extents.
> > If there is a sudden power failure while we are in the middle of shifting
> > extents, the user can still think that data is lost, as we won't get any
> > chance to update the file size in these cases.
> > Updating the file size before the shifting operation starts will prevent this.
> >
> > Thanks.
> 
> Hmm, fair point. That seems less critical to me than the general error
> sequence, but if we want to handle that case I think we could still fix
> the duplication in xfs_file_fallocate(). We could possibly factor out
> the common bits (update time and set size) into a helper, or what seems
> a bit cleaner on first thought, move the bulk of the (mode &
> FALLOC_FL_INSERT_RANGE) block to after the common part. Then the
> function looks something like this:
> 
> 	...
> 	xfs_ilock();
> 
> 	/* pre-inode fixup ops */
> 	if (mode & ...) {
> 		...
> 	} else if (mode & FALLOC_FL_INSERT_RANGE) {
> 		/* comment as to what's going on here :) */
> 
> 		/* error checks */
> 
> 		new_size = ...;
> 		do_file_insert = 1;
> 	}
> 	...
> 	xfs_trans_ichgtime();
> 	xfs_setattr_size();
> 	...
> 
> 	/*
> 	 * Some operations are performed after the inode size is updated. For
> 	 * example, insert range expands the address space of the file, shifts
> 	 * all subsequent extents over and allocates space into the hole.
> 	 * Updating the size first ensures that shifted extents aren't left
> 	 * hanging past EOF in the event of a crash or failure.
> 	 */
> 	if (do_file_insert) {
> 		/* alloc space */
> 		...
> 	}
> 	...
> 
> That seems a bit cleaner to me, but I'm not wedded to it. Thoughts? It
> might be worth soliciting some other thoughts/ideas before reworking it.
> Thanks.
Okay, I agree with your opinion.
And I would like to get some feedback from Dave before reworking.

Thanks for your valuable review!

> 
> Brian
> 
> > >
> > > Brian
> > >
> > > > +
> > > > +		error = xfs_insert_file_space(ip, offset, len);
> > > > +		if (error)
> > > > +			goto out_unlock;
> > > >  	} else {
> > > >  		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
> > > >  		    offset + len > i_size_read(inode)) {
> >

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 1ff0da6..e24aa14 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5419,7 +5419,7 @@  error0:
  * into, this will be considered invalid operation and we abort immediately.
  */
 int
-xfs_bmap_shift_extents(
+xfs_bmap_shift_extents_left(
 	struct xfs_trans	*tp,
 	struct xfs_inode	*ip,
 	int			*done,
@@ -5449,7 +5449,7 @@  xfs_bmap_shift_extents(
 	    (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
 	     XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
 	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
-		XFS_ERROR_REPORT("xfs_bmap_shift_extents",
+		XFS_ERROR_REPORT("xfs_bmap_shift_extents_left",
 				 XFS_ERRLEVEL_LOW, mp);
 		return XFS_ERROR(EFSCORRUPTED);
 	}
@@ -5606,3 +5606,371 @@  del_cursor:
 	xfs_trans_log_inode(tp, ip, logflags);
 	return error;
 }
+
+/*
+ * Split an extent into two extents at block split_fsb, so that split_fsb
+ * becomes the first block of the new extent. @current_ext is the target
+ * extent to be split. @split_fsb is the block at which the extent is split.
+ * If split_fsb lies in a hole or at the start of an extent, just return 0.
+ */
+STATIC int
+xfs_bmap_split_extent_at(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		split_fsb,
+	xfs_extnum_t		*current_ext,
+	xfs_fsblock_t		*firstfsb,
+	struct xfs_bmap_free	*free_list)
+{
+	int				whichfork = XFS_DATA_FORK;
+	struct xfs_btree_cur		*cur;
+	struct xfs_bmbt_rec_host	*gotp;
+	struct xfs_bmbt_irec		got;
+	struct xfs_bmbt_irec		new; /* splitted extent */
+	struct xfs_mount		*mp = ip->i_mount;
+	struct xfs_ifork		*ifp;
+	xfs_fsblock_t			gotblkcnt; /* new block count for got */
+	int				error = 0;
+	int				logflags;
+	int				i = 0;
+
+	if (unlikely(XFS_TEST_ERROR(
+	    (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+	     XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
+	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+		XFS_ERROR_REPORT("xfs_bmap_split_extent_at",
+				 XFS_ERRLEVEL_LOW, mp);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	ASSERT(current_ext != NULL);
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+		/* Read in all the extents */
+		error = xfs_iread_extents(tp, ip, whichfork);
+		if (error)
+			return error;
+	}
+
+	gotp = xfs_iext_bno_to_ext(ifp, split_fsb, current_ext);
+	/*
+	 * gotp can be null in 2 cases: 1) if there are no extents
+	 * or 2) split_fsb lies in a hole beyond which there are
+	 * no extents. Either way, we are done.
+	 */
+	if (!gotp)
+		return 0;
+
+	xfs_bmbt_get_all(gotp, &got);
+
+	/*
+	 * Check split_fsb lies in a hole or the start boundary offset
+	 * of the extent.
+	 */
+	if (got.br_startoff >= split_fsb)
+		return 0;
+
+	gotblkcnt = split_fsb - got.br_startoff;
+	new.br_startoff = split_fsb;
+	new.br_startblock = got.br_startblock + gotblkcnt;
+	new.br_blockcount = got.br_blockcount - gotblkcnt;
+	new.br_state = got.br_state;
+
+	/* We are going to change core inode */
+	logflags = XFS_ILOG_CORE;
+
+	if (ifp->if_flags & XFS_IFBROOT) {
+		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+		cur->bc_private.b.firstblock = *firstfsb;
+		cur->bc_private.b.flist = free_list;
+		cur->bc_private.b.flags = 0;
+	} else {
+		cur = NULL;
+		logflags |= XFS_ILOG_DEXT;
+	}
+
+	if (cur) {
+		error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
+					   got.br_startblock,
+					   got.br_blockcount,
+					   &i);
+		if (error)
+			goto del_cursor;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+	}
+
+	xfs_bmbt_set_blockcount(gotp, gotblkcnt);
+	got.br_blockcount = gotblkcnt;
+	if (cur) {
+		error = xfs_bmbt_update(cur, got.br_startoff,
+					got.br_startblock,
+					got.br_blockcount,
+					got.br_state);
+		if (error)
+			goto del_cursor;
+	}
+
+	/* Add new extent */
+	(*current_ext)++;
+	xfs_iext_insert(ip, *current_ext, 1, &new, 0);
+	XFS_IFORK_NEXT_SET(ip, whichfork,
+		XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
+
+	if (cur) {
+		error = xfs_bmbt_lookup_eq(cur, new.br_startoff,
+					   new.br_startblock, new.br_blockcount,
+					   &i);
+		if (error)
+			goto del_cursor;
+		XFS_WANT_CORRUPTED_GOTO(i == 0, del_cursor);
+		cur->bc_rec.b.br_state = new.br_state;
+
+		error = xfs_btree_insert(cur, &i);
+		if (error)
+			goto del_cursor;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+	}
+
+	/*
+	 * Convert to a btree if necessary.
+	 */
+	if (xfs_bmap_needs_btree(ip, whichfork)) {
+		int tmp_logflags; /* partial log flag return val */
+
+		ASSERT(cur == NULL);
+		error = xfs_bmap_extents_to_btree(tp, ip, firstfsb, free_list,
+				&cur, 0, &tmp_logflags, whichfork);
+		logflags |= tmp_logflags;
+	}
+
+del_cursor:
+	if (cur)
+		xfs_btree_del_cursor(cur,
+			error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+	xfs_trans_log_inode(tp, ip, logflags);
+	return error;
+}
+
+int
+xfs_bmap_split_extent(
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		split_fsb,
+	xfs_extnum_t		*split_ext)
+{
+	struct xfs_mount        *mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	struct xfs_bmap_free	free_list;
+	xfs_fsblock_t		firstfsb;
+	int			committed;
+	int			error;
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
+			XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
+
+	if (error) {
+		/*
+		 * Free the transaction structure.
+		 */
+		ASSERT(XFS_FORCED_SHUTDOWN(mp));
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
+			ip->i_gdquot, ip->i_pdquot,
+			XFS_DIOSTRAT_SPACE_RES(mp, 0), 0,
+			XFS_QMOPT_RES_REGBLKS);
+	if (error)
+		goto error1;
+
+	xfs_trans_ijoin(tp, ip, 0);
+	xfs_bmap_init(&free_list, &firstfsb);
+
+	error = xfs_bmap_split_extent_at(tp, ip, split_fsb, split_ext,
+					 &firstfsb, &free_list);
+	if (error)
+		goto error0;
+
+	error = xfs_bmap_finish(&tp, &free_list, &committed);
+	if (error)
+		goto error0;
+
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	return error;
+error0:
+	xfs_bmap_cancel(&free_list);
+error1:
+	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return error;
+}
+
+/*
+ * Shift extent records to the right to make a hole.
+ * The maximum number of extents to be shifted in a single operation
+ * is @num_exts, and @current_ext keeps track of the current extent
+ * index we have shifted. @offset_shift_fsb is the length by which each
+ * extent is shifted. @end_ext is the last extent to be shifted.
+ */
+int
+xfs_bmap_shift_extents_right(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	int			*done,
+	xfs_fileoff_t		offset_shift_fsb,
+	xfs_extnum_t		*current_ext,
+	xfs_extnum_t		end_ext,
+	xfs_fsblock_t		*firstblock,
+	struct xfs_bmap_free	*flist,
+	int			num_exts)
+{
+	struct xfs_mount		*mp = ip->i_mount;
+	struct xfs_btree_cur		*cur;
+	struct xfs_bmbt_rec_host	*gotp;
+	struct xfs_bmbt_irec		got;
+	struct xfs_bmbt_irec            right;
+	xfs_ifork_t			*ifp;
+	xfs_fileoff_t			startoff;
+	xfs_filblks_t			blockcount = 0;
+	xfs_extnum_t			last_extent;
+	int				error = 0;
+	int				i;
+	int				whichfork = XFS_DATA_FORK;
+	int				logflags;
+
+	if (unlikely(XFS_TEST_ERROR(
+	    (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+	     XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
+	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+		XFS_ERROR_REPORT("xfs_bmap_shift_extents_right",
+				 XFS_ERRLEVEL_LOW, mp);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	ASSERT(current_ext != NULL);
+
+	/* We are going to change core inode */
+	logflags = XFS_ILOG_CORE;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+
+	if (ifp->if_flags & XFS_IFBROOT) {
+		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+		cur->bc_private.b.firstblock = *firstblock;
+		cur->bc_private.b.flist = flist;
+		cur->bc_private.b.flags = 0;
+	} else {
+		cur = NULL;
+		logflags |= XFS_ILOG_DEXT;
+	}
+
+	/* start shifting extents to right */
+	while (num_exts-- > 0) {
+		blockcount = 0;
+
+		if (*current_ext < end_ext) {
+			*done = 1;
+			break;
+		}
+
+		gotp = xfs_iext_get_ext(ifp, *current_ext);
+		xfs_bmbt_get_all(gotp, &got);
+		startoff = got.br_startoff + offset_shift_fsb;
+
+		/*
+		 * Before shifting extent into hole, make sure that the hole
+		 * is large enough to accommodate the shift. This check has
+		 * to be performed for all except the last extent.
+		 */
+		last_extent = (ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) - 1;
+		if (last_extent != *current_ext) {
+			xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
+						*current_ext + 1), &right);
+			if (startoff + got.br_blockcount > right.br_startoff) {
+				error = XFS_ERROR(EINVAL);
+				if (error)
+					goto del_cursor;
+			}
+		}
+
+		/* Check if we can merge 2 adjacent extents */
+		if (last_extent != *current_ext &&
+		    right.br_startoff == startoff + got.br_blockcount &&
+		    right.br_startblock ==
+				got.br_startblock + got.br_blockcount &&
+		    right.br_state == got.br_state &&
+		    right.br_blockcount + got.br_blockcount <= MAXEXTLEN) {
+			blockcount = right.br_blockcount + got.br_blockcount;
+
+			/* Make cursor point to the extent we will update */
+			if (cur) {
+				error = xfs_bmbt_lookup_eq(cur,
+							   right.br_startoff,
+							   right.br_startblock,
+							   right.br_blockcount,
+							   &i);
+				if (error)
+					goto del_cursor;
+				XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+			}
+
+			xfs_iext_remove(ip, *current_ext + 1, 1, 0);
+			if (cur) {
+				error = xfs_btree_delete(cur, &i);
+				if (error)
+					goto del_cursor;
+				XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+			}
+			XFS_IFORK_NEXT_SET(ip, whichfork,
+					XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+
+		}
+
+		if (cur) {
+			error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
+						   got.br_startblock,
+						   got.br_blockcount,
+						   &i);
+			if (error)
+				goto del_cursor;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+		}
+
+		if (got.br_blockcount < blockcount) {
+			xfs_bmbt_set_blockcount(gotp, blockcount);
+			got.br_blockcount = blockcount;
+		}
+
+
+		xfs_bmbt_set_startoff(gotp, startoff);
+		got.br_startoff = startoff;
+
+		if (cur) {
+			error = xfs_bmbt_update(cur, got.br_startoff,
+						got.br_startblock,
+						got.br_blockcount,
+						got.br_state);
+			if (error)
+				goto del_cursor;
+		}
+
+		(*current_ext)--;
+	}
+
+del_cursor:
+	if (cur)
+		xfs_btree_del_cursor(cur,
+			error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+	xfs_trans_log_inode(tp, ip, logflags);
+	return error;
+}
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 38ba36e..af05899 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -179,10 +179,17 @@  int	xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
 int	xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
 		xfs_extnum_t num);
 uint	xfs_default_attroffset(struct xfs_inode *ip);
-int	xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
+int	xfs_bmap_shift_extents_left(struct xfs_trans *tp, struct xfs_inode *ip,
 		int *done, xfs_fileoff_t start_fsb,
 		xfs_fileoff_t offset_shift_fsb, xfs_extnum_t *current_ext,
 		xfs_fsblock_t *firstblock, struct xfs_bmap_free	*flist,
 		int num_exts);
+int	xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset,
+		xfs_extnum_t *split_ext);
+int	xfs_bmap_shift_extents_right(struct xfs_trans *tp, struct xfs_inode *ip,
+		int *done, xfs_fsblock_t offset_shift_fsb,
+		xfs_extnum_t *current_ext, xfs_extnum_t end_ext,
+		xfs_fsblock_t *firstblock, struct xfs_bmap_free *flist,
+		int num_exts);
 
 #endif	/* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 296160b..5a56f5b 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1550,7 +1550,7 @@  xfs_collapse_file_space(
 		 * We are using the write transaction in which max 2 bmbt
 		 * updates are allowed
 		 */
-		error = xfs_bmap_shift_extents(tp, ip, &done, start_fsb,
+		error = xfs_bmap_shift_extents_left(tp, ip, &done, start_fsb,
 					       shift_fsb, &current_ext,
 					       &first_block, &free_list,
 					       XFS_BMAP_MAX_SHIFT_EXTENTS);
@@ -1574,6 +1574,133 @@  out:
 }
 
 /*
+ * xfs_insert_file_space()
+ *	This routine allocates disk space and shifts extents for the given file.
+ *	The first thing we do is to sync dirty data and invalidate page cache
+ *	over the region on which insert range is working. Then split an extent
+ *	into two extents at the given offset by calling xfs_bmap_split_extent,
+ *	and shift all extent records lying between [offset, last allocated
+ *	extent] to the right to make room for the hole. Lastly, allocate an
+ *	unwritten extent in the hole range created by shifting extents.
+ *
+ * RETURNS:
+ *	0 on success
+ *	errno on error
+ *
+ */
+int
+xfs_insert_file_space(
+	struct xfs_inode	*ip,
+	loff_t			offset,
+	loff_t			len)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	struct xfs_bmap_free	free_list;
+	xfs_fsblock_t		first_block;
+	xfs_ifork_t		*ifp;
+	int			done = 0;
+	int			committed;
+	int			error;
+	uint			rounding;
+	xfs_fileoff_t		start_fsb;
+	xfs_fileoff_t		shift_fsb;
+	xfs_extnum_t		split_ext;
+	xfs_extnum_t		current_ext = 0;
+	xfs_off_t		ioffset;
+
+	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+	trace_xfs_insert_file_space(ip);
+
+	error = xfs_qm_dqattach(ip, 0);
+	if (error)
+		return error;
+
+	/* wait for the completion of any pending DIOs */
+	inode_dio_wait(VFS_I(ip));
+
+	rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
+	ioffset = offset & ~(rounding - 1);
+	error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+			ioffset, -1);
+	if (error)
+		return error;
+
+	truncate_pagecache_range(VFS_I(ip), ioffset, -1);
+
+	start_fsb = XFS_B_TO_FSB(mp, offset);
+	shift_fsb = XFS_B_TO_FSB(mp, len);
+
+	error = xfs_bmap_split_extent(ip, start_fsb, &split_ext);
+	if (error)
+		return error;
+
+	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+	current_ext = (ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) - 1;
+	while (!error && !done) {
+		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+		/*
+		 * We need to reserve permanent blocks for the transaction.
+		 * This matters when, after shifting an extent into the hole,
+		 * we find that adjacent extents can be merged, which may
+		 * lead to freeing of a block during record update.
+		 */
+		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
+				XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
+		if (error) {
+			ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+			xfs_trans_cancel(tp, 0);
+			break;
+		}
+
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
+				ip->i_gdquot, ip->i_pdquot,
+				XFS_DIOSTRAT_SPACE_RES(mp, 0), 0,
+				XFS_QMOPT_RES_REGBLKS);
+		if (error)
+			goto error1;
+
+		xfs_trans_ijoin(tp, ip, 0);
+
+		xfs_bmap_init(&free_list, &first_block);
+
+		/*
+		 * We are using the write transaction in which max 2 bmbt
+		 * updates are allowed
+		 */
+		error = xfs_bmap_shift_extents_right(tp, ip, &done, shift_fsb,
+						&current_ext, split_ext,
+						&first_block, &free_list,
+						XFS_BMAP_MAX_SHIFT_EXTENTS);
+		if (error)
+			goto error0;
+
+		error = xfs_bmap_finish(&tp, &free_list, &committed);
+		if (error)
+			goto error0;
+
+		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		if (error)
+			goto out;
+	}
+
+	/* Add unwritten extent in a hole range. */
+	error = xfs_alloc_file_space(ip, offset, len, XFS_BMAPI_PREALLOC);
+
+out:
+	return error;
+
+error0:
+	xfs_bmap_cancel(&free_list);
+error1:
+	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return error;
+}
+
+/*
  * We need to check that the format of the data fork in the temporary inode is
  * valid for the target inode before doing the swap. This is not a problem with
  * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 935ed2b..d62ab4b 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -101,6 +101,8 @@  int	xfs_zero_file_space(struct xfs_inode *ip, xfs_off_t offset,
 			    xfs_off_t len);
 int	xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
 				xfs_off_t len);
+int	xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
+				xfs_off_t len);
 
 /* EOF block manipulation functions */
 bool	xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 97855c5..392b029 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -760,7 +760,8 @@  xfs_file_fallocate(
 	if (!S_ISREG(inode->i_mode))
 		return -EINVAL;
 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
-		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
+		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
+		     FALLOC_FL_INSERT_RANGE))
 		return -EOPNOTSUPP;
 
 	xfs_ilock(ip, XFS_IOLOCK_EXCL);
@@ -790,6 +791,40 @@  xfs_file_fallocate(
 		error = xfs_collapse_file_space(ip, offset, len);
 		if (error)
 			goto out_unlock;
+	} else if (mode & FALLOC_FL_INSERT_RANGE) {
+		unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
+		struct iattr iattr;
+
+		if (offset & blksize_mask || len & blksize_mask) {
+			error = -EINVAL;
+			goto out_unlock;
+		}
+
+		/* Check for wrap through zero */
+		if (inode->i_size + len > inode->i_sb->s_maxbytes) {
+			error = -EFBIG;
+			goto out_unlock;
+		}
+
+		/* Offset should be less than i_size */
+		if (offset >= i_size_read(inode)) {
+			error = -EINVAL;
+			goto out_unlock;
+		}
+
+		/*
+		 * The first thing we do is to expand file to
+		 * avoid data loss if there is error while shifting
+		 */
+		iattr.ia_valid = ATTR_SIZE;
+		iattr.ia_size = i_size_read(inode) + len;
+		error = xfs_setattr_size(ip, &iattr);
+		if (error)
+			goto out_unlock;
+
+		error = xfs_insert_file_space(ip, offset, len);
+		if (error)
+			goto out_unlock;
 	} else {
 		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
 		    offset + len > i_size_read(inode)) {
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 152f827..8943c9f 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -663,6 +663,7 @@  DEFINE_INODE_EVENT(xfs_alloc_file_space);
 DEFINE_INODE_EVENT(xfs_free_file_space);
 DEFINE_INODE_EVENT(xfs_zero_file_space);
 DEFINE_INODE_EVENT(xfs_collapse_file_space);
+DEFINE_INODE_EVENT(xfs_insert_file_space);
 DEFINE_INODE_EVENT(xfs_readdir);
 #ifdef CONFIG_XFS_POSIX_ACL
 DEFINE_INODE_EVENT(xfs_get_acl);