diff mbox

[RESEND,2/10] xfs: Add support FALLOC_FL_COLLAPSE_RANGE for fallocate

Message ID 1391319851-3169-1-git-send-email-linkinjeon@gmail.com
State Superseded, archived
Headers show

Commit Message

Namjae Jeon Feb. 2, 2014, 5:44 a.m. UTC
From: Namjae Jeon <namjae.jeon@samsung.com>

Add support FALLOC_FL_COLLAPSE_RANGE for fallocate.

Signed-off-by: Namjae Jeon <namjae.jeon@samsung.com>
Signed-off-by: Ashish Sangwan <a.sangwan@samsung.com>
---
 fs/xfs/xfs_bmap.c      |  195 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_bmap.h      |    5 ++
 fs/xfs/xfs_bmap_util.c |   99 ++++++++++++++++++++++++
 fs/xfs/xfs_bmap_util.h |    2 +
 fs/xfs/xfs_file.c      |   19 ++++-
 fs/xfs/xfs_trace.h     |    1 +
 6 files changed, 319 insertions(+), 2 deletions(-)

Comments

Dave Chinner Feb. 10, 2014, 11:32 p.m. UTC | #1
On Sun, Feb 02, 2014 at 02:44:11PM +0900, Namjae Jeon wrote:
> From: Namjae Jeon <namjae.jeon@samsung.com>
> 
> Add support FALLOC_FL_COLLAPSE_RANGE for fallocate.
> 
> Signed-off-by: Namjae Jeon <namjae.jeon@samsung.com>
> Signed-off-by: Ashish Sangwan <a.sangwan@samsung.com>

A more detailed description would be nice for the change logs.
.....

> +	while (nexts++ < num_exts &&
> +	       *current_ext <  XFS_IFORK_NEXTENTS(ip, whichfork)) {
> +
> +		gotp = xfs_iext_get_ext(ifp, *current_ext);
> +		xfs_bmbt_get_all(gotp, &got);
> +		startoff = got.br_startoff - offset_shift_fsb;
> +
> +		/*
> +		 * Before shifting extent into hole, make sure that the hole
> +		 * is large enough to accomodate the shift.
> +		 */
> +		if (*current_ext) {
> +			xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
> +						*current_ext - 1), &left);
> +
> +			if (startoff < left.br_startoff + left.br_blockcount)
> +				error = XFS_ERROR(EINVAL);
> +
> +		} else if (startoff > xfs_bmbt_get_startoff(gotp)) {
> +			/* Hole is at the start but not large enough */
> +			error = XFS_ERROR(EINVAL);
> +		}

This second branch seems wrong to me:

	startoff = got.br_startoff - offset_shift_fsb;
and
	got.br_startoff = xfs_bmbt_get_startoff(gotp)).

I'm not 100% sure what you are trying to check in this case -
perhaps some basic ascii art to describe the two cases is in order
here:

	left	hole		got
	+-------+hhhhhhhhhhhhhhh+---------+
	LS      LE              GS        GE
		HS              HE

The first is checking that GS - offset_shift_fsb is greater than LE.
i.e the shift doesn't overrun the hole betwenn LE and GS.

	left	hole		got
	+-------+hhhhhhhhhhhhhhh+---------+
	LS      LE              GS        GE
		HS              HE
	+-------+hhhhhhh+---------+
	LS      LE      GS'       GE'
		HS      HE'

The second I can't visualise from the code or comment....


> +
> +		if (error)
> +			goto del_cursor;
> +
> +		if (cur) {
> +			error = xfs_bmbt_lookup_eq(cur,
> +					got.br_startoff,
> +					got.br_startblock,
> +					got.br_blockcount,
> +					&i);

Whitespace comment - a more compact form is the typical XFS
convention if it will fit in 80 columns:

			error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
						   got.br_startblock,
						   got.br_blockcount, &i);

> +			if (error)
> +				goto del_cursor;
> +			XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
> +		}
> +
> +		/* Check if we can merge 2 adjacent extents */
> +		if (*current_ext &&
> +		    left.br_startoff + left.br_blockcount == startoff &&
> +		    left.br_startblock + left.br_blockcount ==
> +				got.br_startblock &&
> +		    left.br_state == got.br_state &&
> +		    left.br_blockcount + got.br_blockcount <= MAXEXTLEN) {
> +			blockcount = left.br_blockcount +
> +				xfs_bmbt_get_blockcount(gotp);

				got.br_blockcount?

> +			xfs_iext_remove(ip, *current_ext, 1, 0);
> +			if (cur) {
> +				error = xfs_btree_delete(cur, &i);
> +				if (error)
> +					goto del_cursor;
> +				XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
> +			}
> +			XFS_IFORK_NEXT_SET(ip, whichfork,
> +				XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
> +			gotp = xfs_iext_get_ext(ifp, --*current_ext);
> +			xfs_bmbt_get_all(gotp, &got);
> +
> +			/* Make cursor point to the extent we will update */
> +			if (cur) {
> +				error = xfs_bmbt_lookup_eq(cur,
> +				got.br_startoff,
> +				got.br_startblock,
> +				got.br_blockcount,
> +				&i);

whitespace.

> +				if (error)
> +					goto del_cursor;
> +				XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
> +			}
> +
> +			xfs_bmbt_set_blockcount(gotp, blockcount);
> +			got.br_blockcount = blockcount;
> +			goto bmbt_update;
> +		}
> +
> +		/* We have to update the startoff */
> +		xfs_bmbt_set_startoff(gotp, startoff);
> +		got.br_startoff = startoff;
> +
> +bmbt_update:

Use an } else {} for this, and the goto can be removed.

....
>  /*
> + * xfs_collapse_file_space()
> + *      This routine frees disk space and shift extent for the given file.
> + *
> + * RETURNS:
> + *      0 on success
> + *      errno on error
> + *
> + */
> +int
> +xfs_collapse_file_space(
> +	struct xfs_inode	*ip,
> +	xfs_off_t		offset,
> +	xfs_off_t		len)
> +{
> +	int			done = 0;
> +	struct xfs_mount	*mp = ip->i_mount;
> +	struct xfs_trans	*tp;
> +	int			error;
> +	xfs_extnum_t		current_ext = 0;
> +	struct xfs_bmap_free	free_list;
> +	xfs_fsblock_t		first_block;
> +	int			committed;
> +	xfs_fileoff_t		start_fsb;
> +	xfs_fileoff_t		shift_fsb;
> +
> +	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
> +
> +	trace_xfs_collapse_file_space(ip);
> +
> +	start_fsb = XFS_B_TO_FSB(mp, offset + len);
> +	shift_fsb = XFS_B_TO_FSB(mp, len);
> +
> +	/*
> +	 * The first thing we do is to free data blocks in the specified range
> +	 * by calling xfs_free_file_space(). It would also sync dirty data
> +	 * and invalidate page cache over the region on which collapse range
> +	 * is working.
> +	 */

This can probably go in the function header as part of describing
the overall algorithm the code is using.

> +	error = xfs_free_file_space(ip, offset, len);
> +	if (error)
> +		return error;
> +
> +	while (!error && !done) {
> +		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
> +		tp->t_flags |= XFS_TRANS_RESERVE;
> +		/*
> +		 * We would need to reserve permanent block for transaction.
> +		 * This will come into picture when after shifting extent into
> +		 * hole we found that adjacent extents can be merged which
> +		 * may lead to freeing of a block during record update.
> +		 */
> +		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
> +				XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
> +		if (error) {
> +			ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
> +			xfs_trans_cancel(tp, 0);
> +			break;
> +		}
> +
> +		xfs_ilock(ip, XFS_ILOCK_EXCL);
> +		error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
> +				ip->i_gdquot, ip->i_pdquot,
> +				XFS_DIOSTRAT_SPACE_RES(mp, 0), 0,
> +				XFS_QMOPT_RES_REGBLKS);
> +		if (error)
> +			goto out;
> +
> +		xfs_trans_ijoin(tp, ip, 0);
> +
> +		xfs_bmap_init(&free_list, &first_block);
> +
> +		/*
> +		 * We are using the write transaction in which max 2 bmbt
> +		 * updates are allowed
> +		 */

Right, but you've only reserved blocks for a single BMBT split
through XFS_DIOSTRAT_SPACE_RES(). In cases of allocation, adjacent
offset allocation is guaranteed to only require one split at most
and hence it's safe from that perspective. However, I can see how a
shift can require a split on the first extent move, and a merge on
the second. e.g:


		left		middle		right
before		maxrecs		minrecs + 1	minrecs
first shift	maxrecs + 1	minrecs		minrecs
		split
		maxrecs / 2	minrecs		minrecs
second shift
		maxrecs/2 + 1	minrecs - 1	minrecs
				merge		merge
				minrecs*2 - 1	(freed)

So the question is whether the transaction reservations (both log
space and block allocation requirements) are covered.

> +		error = xfs_bmap_shift_extents(tp, ip, &done, start_fsb,
> +				shift_fsb, &current_ext,
> +				&first_block, &free_list, 2);

And that should really have a #define associated with it. ie.:

#define XFS_BMAP_MAX_SHIFT_EXTENTS	2

And document the constraints that lead to that number with the
definition.

Overall, all I'm really looking for here is sufficient comments to
document the constraints the code is operating under. Functionally
the code looks correct (apart from the branch above I can't work
out). Mostly I just want to make sure that in a couple of
years time I don't have to work it all out from first principles
again. ;)

Cheers,

Dave.
Namjae Jeon Feb. 11, 2014, 9:45 a.m. UTC | #2
Hi Dave.
2014-02-11 8:32 GMT+09:00, Dave Chinner <david@fromorbit.com>:
> On Sun, Feb 02, 2014 at 02:44:11PM +0900, Namjae Jeon wrote:
>> From: Namjae Jeon <namjae.jeon@samsung.com>
>>
>> Add support FALLOC_FL_COLLAPSE_RANGE for fallocate.
>>
>> Signed-off-by: Namjae Jeon <namjae.jeon@samsung.com>
>> Signed-off-by: Ashish Sangwan <a.sangwan@samsung.com>
>
> A more detailed description would be nice for the change logs.
> .....
Okay, I will update it on next version.
>
>> +	while (nexts++ < num_exts &&
>> +	       *current_ext <  XFS_IFORK_NEXTENTS(ip, whichfork)) {
>> +
>> +		gotp = xfs_iext_get_ext(ifp, *current_ext);
>> +		xfs_bmbt_get_all(gotp, &got);
>> +		startoff = got.br_startoff - offset_shift_fsb;
>> +
>> +		/*
>> +		 * Before shifting extent into hole, make sure that the hole
>> +		 * is large enough to accomodate the shift.
>> +		 */
>> +		if (*current_ext) {
>> +			xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
>> +						*current_ext - 1), &left);
>> +
>> +			if (startoff < left.br_startoff + left.br_blockcount)
>> +				error = XFS_ERROR(EINVAL);
>> +
>> +		} else if (startoff > xfs_bmbt_get_startoff(gotp)) {
>> +			/* Hole is at the start but not large enough */
>> +			error = XFS_ERROR(EINVAL);
>> +		}
>
> This second branch seems wrong to me:
>
> 	startoff = got.br_startoff - offset_shift_fsb;
> and
> 	got.br_startoff = xfs_bmbt_get_startoff(gotp)).
>
> I'm not 100% sure what you are trying to check in this case -
> perhaps some basic ascii art to describe the two cases is in order
> here:
>
> 	left	hole		got
> 	+-------+hhhhhhhhhhhhhhh+---------+
> 	LS      LE              GS        GE
> 		HS              HE
>
> The first is checking that GS - offset_shift_fsb is greater than LE.
> i.e the shift doesn't overrun the hole betwenn LE and GS.
>
> 	left	hole		got
> 	+-------+hhhhhhhhhhhhhhh+---------+
> 	LS      LE              GS        GE
> 		HS              HE
> 	+-------+hhhhhhh+---------+
> 	LS      LE      GS'       GE'
> 		HS      HE'
>
> The second I can't visualise from the code or comment....
When we shift first extent, *current_ext  will be 0. So we need to
check that offset_shift_fsb ( Number of blocks to be shifted ) should
be less than starting offset of the first extent.
So, code will be changed more clearly like this.

 + else if (offset_shift_fsb > got.br_startoff) {
 + /* Hole is at the start but not large enough */
 + error = XFS_ERROR(EINVAL);
 + }
And will update comment more clearly.
>
>
>> +
>> +		if (error)
>> +			goto del_cursor;
>> +
>> +		if (cur) {
>> +			error = xfs_bmbt_lookup_eq(cur,
>> +					got.br_startoff,
>> +					got.br_startblock,
>> +					got.br_blockcount,
>> +					&i);
>
> Whitespace comment - a more compact form is the typical XFS
> convention if it will fit in 80 columns:
Okay. I will fix it.
>
> 			error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
> 						   got.br_startblock,
> 						   got.br_blockcount, &i);
>
>> +			if (error)
>> +				goto del_cursor;
>> +			XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
>> +		}
>> +
>> +		/* Check if we can merge 2 adjacent extents */
>> +		if (*current_ext &&
>> +		    left.br_startoff + left.br_blockcount == startoff &&
>> +		    left.br_startblock + left.br_blockcount ==
>> +				got.br_startblock &&
>> +		    left.br_state == got.br_state &&
>> +		    left.br_blockcount + got.br_blockcount <= MAXEXTLEN) {
>> +			blockcount = left.br_blockcount +
>> +				xfs_bmbt_get_blockcount(gotp);
>
> 				got.br_blockcount?
Right. will fix it.
>
>> +			xfs_iext_remove(ip, *current_ext, 1, 0);
>> +			if (cur) {
>> +				error = xfs_btree_delete(cur, &i);
>> +				if (error)
>> +					goto del_cursor;
>> +				XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
>> +			}
>> +			XFS_IFORK_NEXT_SET(ip, whichfork,
>> +				XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
>> +			gotp = xfs_iext_get_ext(ifp, --*current_ext);
>> +			xfs_bmbt_get_all(gotp, &got);
>> +
>> +			/* Make cursor point to the extent we will update */
>> +			if (cur) {
>> +				error = xfs_bmbt_lookup_eq(cur,
>> +				got.br_startoff,
>> +				got.br_startblock,
>> +				got.br_blockcount,
>> +				&i);
>
> whitespace.
Okay.
>
>> +				if (error)
>> +					goto del_cursor;
>> +				XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
>> +			}
>> +
>> +			xfs_bmbt_set_blockcount(gotp, blockcount);
>> +			got.br_blockcount = blockcount;
>> +			goto bmbt_update;
>> +		}
>> +
>> +		/* We have to update the startoff */
>> +		xfs_bmbt_set_startoff(gotp, startoff);
>> +		got.br_startoff = startoff;
>> +
>> +bmbt_update:
>
> Use an } else {} for this, and the goto can be removed.
Okay, I will change.
>
> ....
>>  /*
>> + * xfs_collapse_file_space()
>> + *      This routine frees disk space and shift extent for the given
>> file.
>> + *
>> + * RETURNS:
>> + *      0 on success
>> + *      errno on error
>> + *
>> + */
>> +int
>> +xfs_collapse_file_space(
>> +	struct xfs_inode	*ip,
>> +	xfs_off_t		offset,
>> +	xfs_off_t		len)
>> +{
>> +	int			done = 0;
>> +	struct xfs_mount	*mp = ip->i_mount;
>> +	struct xfs_trans	*tp;
>> +	int			error;
>> +	xfs_extnum_t		current_ext = 0;
>> +	struct xfs_bmap_free	free_list;
>> +	xfs_fsblock_t		first_block;
>> +	int			committed;
>> +	xfs_fileoff_t		start_fsb;
>> +	xfs_fileoff_t		shift_fsb;
>> +
>> +	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
>> +
>> +	trace_xfs_collapse_file_space(ip);
>> +
>> +	start_fsb = XFS_B_TO_FSB(mp, offset + len);
>> +	shift_fsb = XFS_B_TO_FSB(mp, len);
>> +
>> +	/*
>> +	 * The first thing we do is to free data blocks in the specified range
>> +	 * by calling xfs_free_file_space(). It would also sync dirty data
>> +	 * and invalidate page cache over the region on which collapse range
>> +	 * is working.
>> +	 */
>
> This can probably go in the function header as part of describing
> the overall algorithm the code is using.
Okay.

>
>> +	error = xfs_free_file_space(ip, offset, len);
>> +	if (error)
>> +		return error;
>> +
>> +	while (!error && !done) {
>> +		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
>> +		tp->t_flags |= XFS_TRANS_RESERVE;
>> +		/*
>> +		 * We would need to reserve permanent block for transaction.
>> +		 * This will come into picture when after shifting extent into
>> +		 * hole we found that adjacent extents can be merged which
>> +		 * may lead to freeing of a block during record update.
>> +		 */
>> +		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
>> +				XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
>> +		if (error) {
>> +			ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
>> +			xfs_trans_cancel(tp, 0);
>> +			break;
>> +		}
>> +
>> +		xfs_ilock(ip, XFS_ILOCK_EXCL);
>> +		error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
>> +				ip->i_gdquot, ip->i_pdquot,
>> +				XFS_DIOSTRAT_SPACE_RES(mp, 0), 0,
>> +				XFS_QMOPT_RES_REGBLKS);
>> +		if (error)
>> +			goto out;
>> +
>> +		xfs_trans_ijoin(tp, ip, 0);
>> +
>> +		xfs_bmap_init(&free_list, &first_block);
>> +
>> +		/*
>> +		 * We are using the write transaction in which max 2 bmbt
>> +		 * updates are allowed
>> +		 */
>
> Right, but you've only reserved blocks for a single BMBT split
> through XFS_DIOSTRAT_SPACE_RES(). In cases of allocation, adjacent
> offset allocation is guaranteed to only require one split at most
> and hence it's safe from that perspective. However, I can see how a
> shift can require a split on the first extent move, and a merge on
> the second. e.g:
>
>
> 		left		middle		right
> before		maxrecs		minrecs + 1	minrecs
> first shift	maxrecs + 1	minrecs		minrecs
> 		split
> 		maxrecs / 2	minrecs		minrecs
> second shift
> 		maxrecs/2 + 1	minrecs - 1	minrecs
> 				merge		merge
> 				minrecs*2 - 1	(freed)
>
> So the question is whether the transaction reservations (both log
> space and block allocation requirements) are covered.
Yes, As you said, we could require two splits for extent move and
merge in some cases.
So, I will change number of shift extents with decraring define you
pointed like this.
#define XFS_BMAP_MAX_SHIFT_EXTENTS      1

>
>> +		error = xfs_bmap_shift_extents(tp, ip, &done, start_fsb,
>> +				shift_fsb, &current_ext,
>> +				&first_block, &free_list, 2);
>
> And that should really have a #define associated with it. ie.:
>
> #define XFS_BMAP_MAX_SHIFT_EXTENTS	2
>
> And document the constraints that lead to that number with the
> definition.
>
> Overall, all I'm really looking for here is sufficient comments to
> document the constraints the code is operating under. Functionally
> the code looks correct (apart from the branch above I can't work
> out). Mostly I just want to make sure that in a couple of
> years time I don't have to work it all out from first principles
> again. ;)
Okay, I will add sufficient comments to maintain easily this change.

Thanks for your review!
>
> Cheers,
>
> Dave.
>
> --
> Dave Chinner
> david@fromorbit.com
>
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 3ef11b2..aba3fc9 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5358,3 +5358,198 @@  error0:
 	}
 	return error;
 }
+
+/*
+ * Shift extent records to the left to cover a hole.
+ *
+ * The maximum number of extents to be shifted in a single operation
+ * is @num_exts, and @current_ext keeps track of the current extent
+ * index we have shifted. @offset_shift_fsb is the length by which each
+ * extent is shifted. If there is no hole to shift the extents
+ * into, this will be considered invalid operation and we abort immediately.
+ */
+int
+xfs_bmap_shift_extents(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	int			*done,
+	xfs_fileoff_t		start_fsb,
+	xfs_fileoff_t		offset_shift_fsb,
+	xfs_extnum_t		*current_ext,
+	xfs_fsblock_t		*firstblock,
+	struct xfs_bmap_free	*flist,
+	int			num_exts)
+{
+	struct xfs_btree_cur		*cur;
+	struct xfs_bmbt_rec_host	*gotp;
+	struct xfs_bmbt_irec            got;
+	struct xfs_bmbt_irec		left;
+	struct xfs_mount		*mp = ip->i_mount;
+	struct xfs_ifork		*ifp;
+	xfs_extnum_t			nexts = 0;
+	xfs_fileoff_t			startoff;
+	int				error = 0;
+	int				i;
+	int				whichfork = XFS_DATA_FORK;
+	int				logflags;
+	xfs_filblks_t			blockcount = 0;
+
+	if (unlikely(XFS_TEST_ERROR(
+	    (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+	     XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
+	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+		XFS_ERROR_REPORT("xfs_bmap_shift_extents",
+				 XFS_ERRLEVEL_LOW, mp);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	ASSERT(current_ext != NULL);
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+
+	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+		/* Read in all the extents */
+		error = xfs_iread_extents(tp, ip, whichfork);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * If *current_ext is 0, we would need to lookup the extent
+	 * from where we would start shifting and store it in gotp.
+	 */
+	if (!*current_ext) {
+		gotp = xfs_iext_bno_to_ext(ifp, start_fsb, current_ext);
+		/*
+		 * gotp can be null in 2 cases: 1) if there are no extents
+		 * or 2) start_fsb lies in a hole beyond which there are
+		 * no extents. Either way, we are done.
+		 */
+		if (!gotp) {
+			*done = 1;
+			return 0;
+		}
+	}
+
+	/* We are going to change core inode */
+	logflags = XFS_ILOG_CORE;
+
+	if (ifp->if_flags & XFS_IFBROOT) {
+		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+		cur->bc_private.b.firstblock = *firstblock;
+		cur->bc_private.b.flist = flist;
+		cur->bc_private.b.flags = 0;
+	} else {
+		cur = NULL;
+		logflags |= XFS_ILOG_DEXT;
+	}
+
+	while (nexts++ < num_exts &&
+	       *current_ext <  XFS_IFORK_NEXTENTS(ip, whichfork)) {
+
+		gotp = xfs_iext_get_ext(ifp, *current_ext);
+		xfs_bmbt_get_all(gotp, &got);
+		startoff = got.br_startoff - offset_shift_fsb;
+
+		/*
+		 * Before shifting extent into hole, make sure that the hole
+		 * is large enough to accomodate the shift.
+		 */
+		if (*current_ext) {
+			xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
+						*current_ext - 1), &left);
+
+			if (startoff < left.br_startoff + left.br_blockcount)
+				error = XFS_ERROR(EINVAL);
+
+		} else if (startoff > xfs_bmbt_get_startoff(gotp)) {
+			/* Hole is at the start but not large enough */
+			error = XFS_ERROR(EINVAL);
+		}
+
+		if (error)
+			goto del_cursor;
+
+		if (cur) {
+			error = xfs_bmbt_lookup_eq(cur,
+					got.br_startoff,
+					got.br_startblock,
+					got.br_blockcount,
+					&i);
+			if (error)
+				goto del_cursor;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+		}
+
+		/* Check if we can merge 2 adjacent extents */
+		if (*current_ext &&
+		    left.br_startoff + left.br_blockcount == startoff &&
+		    left.br_startblock + left.br_blockcount ==
+				got.br_startblock &&
+		    left.br_state == got.br_state &&
+		    left.br_blockcount + got.br_blockcount <= MAXEXTLEN) {
+			blockcount = left.br_blockcount +
+				xfs_bmbt_get_blockcount(gotp);
+			xfs_iext_remove(ip, *current_ext, 1, 0);
+			if (cur) {
+				error = xfs_btree_delete(cur, &i);
+				if (error)
+					goto del_cursor;
+				XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+			}
+			XFS_IFORK_NEXT_SET(ip, whichfork,
+				XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+			gotp = xfs_iext_get_ext(ifp, --*current_ext);
+			xfs_bmbt_get_all(gotp, &got);
+
+			/* Make cursor point to the extent we will update */
+			if (cur) {
+				error = xfs_bmbt_lookup_eq(cur,
+				got.br_startoff,
+				got.br_startblock,
+				got.br_blockcount,
+				&i);
+				if (error)
+					goto del_cursor;
+				XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+			}
+
+			xfs_bmbt_set_blockcount(gotp, blockcount);
+			got.br_blockcount = blockcount;
+			goto bmbt_update;
+		}
+
+		/* We have to update the startoff */
+		xfs_bmbt_set_startoff(gotp, startoff);
+		got.br_startoff = startoff;
+
+bmbt_update:
+		if (cur) {
+			error = xfs_bmbt_update(cur,
+					got.br_startoff,
+					got.br_startblock,
+					got.br_blockcount,
+					got.br_state);
+			if (error)
+				goto del_cursor;
+		}
+
+		(*current_ext)++;
+	}
+
+	/* Check if we are done */
+	if (*current_ext ==  XFS_IFORK_NEXTENTS(ip, whichfork))
+		*done = 1;
+
+del_cursor:
+	if (cur)
+		xfs_btree_del_cursor(cur,
+			error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+
+	xfs_trans_log_inode(tp, ip, logflags);
+
+	return error;
+}
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 33b41f3..7bfe67c 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -169,5 +169,10 @@  int	xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
 int	xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
 		xfs_extnum_t num);
 uint	xfs_default_attroffset(struct xfs_inode *ip);
+int	xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
+		int *done, xfs_fileoff_t start_fsb,
+		xfs_fileoff_t offset_shift_fsb, xfs_extnum_t *current_ext,
+		xfs_fsblock_t *firstblock, struct xfs_bmap_free	*flist,
+		int num_exts);
 
 #endif	/* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 5887e41..7b62f5d 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1446,6 +1446,105 @@  out:
 }
 
 /*
+ * xfs_collapse_file_space()
+ *      This routine frees disk space and shift extent for the given file.
+ *
+ * RETURNS:
+ *      0 on success
+ *      errno on error
+ *
+ */
+int
+xfs_collapse_file_space(
+	struct xfs_inode	*ip,
+	xfs_off_t		offset,
+	xfs_off_t		len)
+{
+	int			done = 0;
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	int			error;
+	xfs_extnum_t		current_ext = 0;
+	struct xfs_bmap_free	free_list;
+	xfs_fsblock_t		first_block;
+	int			committed;
+	xfs_fileoff_t		start_fsb;
+	xfs_fileoff_t		shift_fsb;
+
+	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+
+	trace_xfs_collapse_file_space(ip);
+
+	start_fsb = XFS_B_TO_FSB(mp, offset + len);
+	shift_fsb = XFS_B_TO_FSB(mp, len);
+
+	/*
+	 * The first thing we do is to free data blocks in the specified range
+	 * by calling xfs_free_file_space(). It would also sync dirty data
+	 * and invalidate page cache over the region on which collapse range
+	 * is working.
+	 */
+
+	error = xfs_free_file_space(ip, offset, len);
+	if (error)
+		return error;
+
+	while (!error && !done) {
+		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+		tp->t_flags |= XFS_TRANS_RESERVE;
+		/*
+		 * We would need to reserve permanent block for transaction.
+		 * This will come into picture when after shifting extent into
+		 * hole we found that adjacent extents can be merged which
+		 * may lead to freeing of a block during record update.
+		 */
+		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
+				XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
+		if (error) {
+			ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+			xfs_trans_cancel(tp, 0);
+			break;
+		}
+
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
+				ip->i_gdquot, ip->i_pdquot,
+				XFS_DIOSTRAT_SPACE_RES(mp, 0), 0,
+				XFS_QMOPT_RES_REGBLKS);
+		if (error)
+			goto out;
+
+		xfs_trans_ijoin(tp, ip, 0);
+
+		xfs_bmap_init(&free_list, &first_block);
+
+		/*
+		 * We are using the write transaction in which max 2 bmbt
+		 * updates are allowed
+		 */
+		error = xfs_bmap_shift_extents(tp, ip, &done, start_fsb,
+				shift_fsb, &current_ext,
+				&first_block, &free_list, 2);
+		if (error)
+			goto out;
+
+		error = xfs_bmap_finish(&tp, &free_list, &committed);
+		if (error)
+			goto out;
+
+		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	}
+
+	return error;
+
+out:
+	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return error;
+}
+
+/*
  * We need to check that the format of the data fork in the temporary inode is
  * valid for the target inode before doing the swap. This is not a problem with
  * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 900747b..935ed2b 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -99,6 +99,8 @@  int	xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
 			    xfs_off_t len);
 int	xfs_zero_file_space(struct xfs_inode *ip, xfs_off_t offset,
 			    xfs_off_t len);
+int	xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
+				xfs_off_t len);
 
 /* EOF block manipulation functions */
 bool	xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 52c91e1..accb25b 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -820,7 +820,8 @@  xfs_file_fallocate(
 
 	if (!S_ISREG(inode->i_mode))
 		return -EINVAL;
-	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+		     FALLOC_FL_COLLAPSE_RANGE))
 		return -EOPNOTSUPP;
 
 	xfs_ilock(ip, XFS_IOLOCK_EXCL);
@@ -828,6 +829,20 @@  xfs_file_fallocate(
 		error = xfs_free_file_space(ip, offset, len);
 		if (error)
 			goto out_unlock;
+	} else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
+		unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
+
+		if (offset & blksize_mask || len & blksize_mask) {
+			error = -EINVAL;
+			goto out_unlock;
+		}
+
+		ASSERT(offset + len < i_size_read(inode));
+		new_size = i_size_read(inode) - len;
+
+		error = xfs_collapse_file_space(ip, offset, len);
+		if (error)
+			goto out_unlock;
 	} else {
 		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
 		    offset + len > i_size_read(inode)) {
@@ -856,7 +871,7 @@  xfs_file_fallocate(
 	if (ip->i_d.di_mode & S_IXGRP)
 		ip->i_d.di_mode &= ~S_ISGID;
 
-	if (!(mode & FALLOC_FL_PUNCH_HOLE))
+	if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE)))
 		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
 
 	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 425dfa4..a4ae41c 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -603,6 +603,7 @@  DEFINE_INODE_EVENT(xfs_readlink);
 DEFINE_INODE_EVENT(xfs_inactive_symlink);
 DEFINE_INODE_EVENT(xfs_alloc_file_space);
 DEFINE_INODE_EVENT(xfs_free_file_space);
+DEFINE_INODE_EVENT(xfs_collapse_file_space);
 DEFINE_INODE_EVENT(xfs_readdir);
 #ifdef CONFIG_XFS_POSIX_ACL
 DEFINE_INODE_EVENT(xfs_get_acl);