Patchwork [RESEND,2/7] xfs: add support FALLOC_FL_COLLAPSE_RANGE for fallocate

login
register
mail settings
Submitter NamJae Jeon
Date Oct. 6, 2013, 8:13 p.m.
Message ID <1381090388-2761-1-git-send-email-linkinjeon@gmail.com>
Download mbox | patch
Permalink /patch/280910/
State Superseded
Headers show

Comments

NamJae Jeon - Oct. 6, 2013, 8:13 p.m.
From: Namjae Jeon <namjae.jeon@samsung.com>

Add support FALLOC_FL_COLLAPSE_RANGE for fallocate.

Signed-off-by: Namjae Jeon <namjae.jeon@samsung.com>
Signed-off-by: Ashish Sangwan <a.sangwan@samsung.com>
---
 fs/xfs/xfs_bmap.c      |  174 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_bmap.h      |    3 +
 fs/xfs/xfs_bmap_util.c |   96 ++++++++++++++++++++++++++
 fs/xfs/xfs_bmap_util.h |    2 +
 fs/xfs/xfs_file.c      |   20 ++++--
 fs/xfs/xfs_fs.h        |    6 ++
 6 files changed, 296 insertions(+), 5 deletions(-)
Dave Chinner - Oct. 10, 2013, 12:51 a.m.
On Mon, Oct 07, 2013 at 05:13:08AM +0900, Namjae Jeon wrote:
> From: Namjae Jeon <namjae.jeon@samsung.com>
> 
> Add support FALLOC_FL_COLLAPSE_RANGE for fallocate.
> 
> Signed-off-by: Namjae Jeon <namjae.jeon@samsung.com>
> Signed-off-by: Ashish Sangwan <a.sangwan@samsung.com>
> ---
>  fs/xfs/xfs_bmap.c      |  174 ++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_bmap.h      |    3 +
>  fs/xfs/xfs_bmap_util.c |   96 ++++++++++++++++++++++++++
>  fs/xfs/xfs_bmap_util.h |    2 +
>  fs/xfs/xfs_file.c      |   20 ++++--
>  fs/xfs/xfs_fs.h        |    6 ++
>  6 files changed, 296 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
> index 92b8309..c12358e 100644
> --- a/fs/xfs/xfs_bmap.c
> +++ b/fs/xfs/xfs_bmap.c
> @@ -5356,3 +5356,177 @@ error0:
>  	}
>  	return error;
>  }
> +
> +/*
> + * Update extents by shifting them downwards into a hole.
> + * At max count number of extents will be shifted and *current_ext
> + * is the extent number which is currently being shifted.
> + * This function will return error if the hole is not present
> + * while shifting extents. On success, 0 is returned.
> + */

/*
 * Shift extent records to the left to cover a hole.
 *
 * The maximum number of extents to be shifted in a single operation
 * is @count, and @current_ext keeps track of the current extent
 * index we have shifted. If there is no hole to shift the extents
 * into, then we abort immediately.
 */
> +int
> +xfs_bmap_shift_extents(
> +	struct xfs_trans	*tp,
> +	struct xfs_inode	*ip,
> +	int			*done,
> +	xfs_fileoff_t		start_fsb,
> +	xfs_fileoff_t		shift,

Shift means ...? Number of extents to shift, a length, a number of
block, or something else?

> +	xfs_extnum_t		*current_ext,
> +	xfs_fsblock_t		*firstblock,
> +	struct xfs_bmap_free	*flist,
> +	int			count)

if count is the number of extents to shift, then it should be named
"num_exts" or something similar to describe what it is a count of.

> +{
> +	struct xfs_btree_cur		*cur;
> +	struct xfs_bmbt_rec_host	*gotp;
> +	struct xfs_bmbt_irec		left;
> +	struct xfs_mount		*mp = ip->i_mount;
> +	struct xfs_ifork		*ifp;
> +	xfs_extnum_t			nexts = 0;
> +	xfs_fileoff_t			startoff;
> +	int				error = 0;
> +	int				i;
> +	int				whichfork = XFS_DATA_FORK;
> +	int				state;
> +	int				logflags;
> +	xfs_filblks_t			blockcount = 0;
> +
> +	if (unlikely(XFS_TEST_ERROR(
> +	    (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
> +	     XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
> +	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
> +		XFS_ERROR_REPORT("xfs_bmap_shift_extents",
> +				 XFS_ERRLEVEL_LOW, mp);
> +		return XFS_ERROR(EFSCORRUPTED);
> +	}
> +
> +	if (XFS_FORCED_SHUTDOWN(mp))
> +		return XFS_ERROR(EIO);
> +
> +	ifp = XFS_IFORK_PTR(ip, whichfork);
> +
> +	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
> +		/* Read in all the extents */
> +		error = xfs_iread_extents(tp, ip, whichfork);
> +		if (error)
> +			return error;
> +	}
> +
> +	if (!*current_ext) {

I had to do a double take on that, because I thought it was checking
for a null pointer at first. It's not, so at the start of the
function:

	ASSERT(current_ext != NULL);

secondly, it's checking for a zero count, so make it clear in this
case:

	if (*current_ext == 0) {
	....
> +		gotp = xfs_iext_bno_to_ext(ifp, start_fsb, current_ext);
> +		/*
> +		 * gotp can be null in 2 cases: 1) if there are no extents
> +		 * or 2) start_fsb lies in a hole beyond which there are
> +		 * no extents. Either way, we are done.
> +		 */
> +		if (!gotp) {
> +			*done = 1;
> +			return 0;
> +		}

What does "gotp" mean in this context? Yes, it's the extent we got
from a lookup, but what extent is that? Is it the extent we are
shifting, the extent we are shifting it up against, or something
else?

> +	}
> +
> +	/* We are going to change core inode */
> +	logflags = XFS_ILOG_CORE;
> +
> +	if (ifp->if_flags & XFS_IFBROOT) {
> +		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
> +		cur->bc_private.b.firstblock = *firstblock;
> +		cur->bc_private.b.flist = flist;
> +		cur->bc_private.b.flags = 0;
> +		}
> +	else {
> +		cur = NULL;
> +		logflags |= XFS_ILOG_DEXT;
> +	}
> +
> +	while (nexts++ < count &&
> +	       *current_ext <  XFS_IFORK_NEXTENTS(ip, whichfork)) {
> +		state = 0;
> +
> +		gotp = xfs_iext_get_ext(ifp, *current_ext);
> +		startoff = xfs_bmbt_get_startoff(gotp);
> +		startoff -= shift;

		xfs_bmbt_get_all(gotp, &got);

and then you can drop all the xfs_bmbt_get*() wrappers.

> +
> +		/*
> +		 * Before shifting extent into hole, make sure that the hole
> +		 * is large enough to accomodate the shift.
> +		 */
> +		if (*current_ext) {
> +			state |= BMAP_LEFT_VALID;
> +			xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
> +						*current_ext - 1), &left);
> +
> +			if (isnullstartblock(left.br_startblock))
> +				state |= BMAP_LEFT_DELAY;
> +
> +			if (startoff < left.br_startoff + left.br_blockcount)
> +				error = XFS_ERROR(EFSCORRUPTED);

Why is the filesystem corrupted if the shift we asked for is too
large for the hole in the file? I haven't seen any checks before
this that guarantee that the hole is big enough for the shift...

> +
> +		} else if (startoff > xfs_bmbt_get_startoff(gotp))
> +			/* Hole is at the start but not large enough */
> +			error = XFS_ERROR(EFSCORRUPTED);

Same question....

> +
> +		if (error)
> +			goto del_cursor;
> +
> +		/* Check if we can merge 2 adjacent extents */
> +		if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
> +		    left.br_startoff + left.br_blockcount == startoff &&
> +		    left.br_startblock + left.br_blockcount ==
> +		    xfs_bmbt_get_startblock(gotp) &&
> +		    xfs_bmbt_get_state(gotp) == left.br_state &&
> +		    left.br_blockcount + xfs_bmbt_get_blockcount(gotp) <=
> +		    MAXEXTLEN) {

The indenting needs work here - whitespace gives lots of context
that is missing here:

		if ((state & BMAP_LEFT_VALID) &&
		    !(state & BMAP_LEFT_DELAY) &&
		    left.br_startoff + left.br_blockcount == startoff &&
		    left.br_startblock + left.br_blockcount ==
				    xfs_bmbt_get_startblock(gotp) &&
		    xfs_bmbt_get_state(gotp) == left.br_state &&
		    left.br_blockcount + xfs_bmbt_get_blockcount(gotp) <=
				    MAXEXTLEN) {

And it can be simplified, too:

		if ((state & BMAP_LEFT_VALID) &&
		    !(state & BMAP_LEFT_DELAY) &&

is exactly the same as:

		if (state == BMAP_LEFT_VALID &&

> +			blockcount =
> +			left.br_blockcount + xfs_bmbt_get_blockcount(gotp);
> +			state |= BMAP_LEFT_CONTIG;
> +			xfs_iext_remove(ip, *current_ext, 1, 0);
> +			XFS_IFORK_NEXT_SET(ip, whichfork
> +				XFS_IFORK_NEXTENTS(ip, whichfork) - 1);

Ok, so you remove and extent from the in-memory tree, but I don't
see where you remove it from the on-disk btree.

> +			gotp = xfs_iext_get_ext(ifp, --*current_ext);

			xfs_bmbt_get_all(gotp, &got);

> +		}
> +
> +		if (cur) {
> +			error = xfs_bmbt_lookup_eq(cur,
> +					xfs_bmbt_get_startoff(gotp),
> +					xfs_bmbt_get_startblock(gotp),
> +					xfs_bmbt_get_blockcount(gotp),
> +					&i);
> +			if (error)
> +				goto del_cursor;
> +			XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
> +		}

This needs to be done before merging extents so the cursor points at
the record that needs to be deleted from the btree when you merge
the extent records. i.e. you need to completely separate the extent
merge case from the update case for both the in-memory extent tree
update and the on-disk btree update....

> +
>  	return xfs_trans_commit(tp, 0);
>  }
>  
> +
> +/*
> + * xfs_collapse_file_space: Implements the FALLOC_FL_COLLAPSE_SPACE flag.
> + */
> +int
> +xfs_collapse_file_space(
> +	struct xfs_inode	*ip,
> +	loff_t			offset,
> +	loff_t			len,
> +	int			attr_flags)
> +{
> +	int			done = 0;
> +	struct xfs_mount	*mp = ip->i_mount;
> +	uint			resblks;
> +	struct xfs_trans	*tp;
> +	int			error;
> +	xfs_extnum_t		current_ext = 0;
> +	struct xfs_bmap_free	free_list;
> +	xfs_fsblock_t		first_block;
> +	int			committed;
> +	xfs_fileoff_t	start_fsb = XFS_B_TO_FSB(mp, offset + len);
> +	xfs_fileoff_t	shift_fsb = XFS_B_TO_FSB(mp, len);
> +
> +	resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);

Why do we need a stack variable for this?

> +
> +	/*
> +	 * The first thing we do is to free data blocks in the specified range
> +	 * by calling xfs_free_file_space(). It would also sync dirty data
> +	 * and invalidate page cache over the region on which collapse range
> +	 * is working.
> +	 */
> +
> +	error = xfs_free_file_space(ip, offset, len, attr_flags);
> +	if (error)
> +		return error;

This separation of punching the hole and collapsing the range means
that the operation is not atomic w.r.t. concurrent IO, truncate or
other hole punch/preallocate operations if the XFS_IOLOCK_EXCL is
not held. Hence we need to ensure this operation is executed with
the correct locks held by the caller, and the correct flags passed
into the function. That is, we need these asserts before doing
anything else in this function:

	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
	ASSERT((attr_flags & XFS_ATTR_NOLOCK) == XFS_ATTR_NOLOCK);

This makes it clear that there's a bug in the function's locking in
the "out" case....

> +	while (!error && !done) {
> +		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
> +		tp->t_flags |= XFS_TRANS_RESERVE;
> +		/*
> +		 * We would need to reserve permanent block for transaction.
> +		 * This will come into picture when after shifting extent into
> +		 * hole we found that adjacent extents can be merged which
> +		 * may lead to freeing of a block during record update.
> +		 */
> +		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0);
> +		if (error) {
> +			ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
> +			xfs_trans_cancel(tp, 0);
> +			break;
> +		}
> +
> +		xfs_ilock(ip, XFS_ILOCK_EXCL);
> +		error = xfs_trans_reserve_quota(tp, mp,
> +				ip->i_udquot, ip->i_gdquot, ip->i_pdquot,
> +				resblks, 0, XFS_QMOPT_RES_REGBLKS);
> +		if (error)
> +			goto out;
> +
> +		xfs_trans_ijoin(tp, ip, 0);
> +
> +		xfs_bmap_init(&free_list, &first_block);
> +
> +		/*
> +		 * We are using the write transaction in which max 2 bmbt
> +		 * updates are allowed
> +		 */
> +		error = xfs_bmap_shift_extents(tp, ip, &done, start_fsb,
> +				shift_fsb, &current_ext,
> +				&first_block, &free_list, 2);
> +		if (error)
> +			goto out;
> +
> +		error = xfs_bmap_finish(&tp, &free_list, &committed);
> +		if (error)
> +			goto out;
> +
> +		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
> +		xfs_iunlock(ip, XFS_ILOCK_EXCL);
> +	}
> +
> +	return error;
> +
> +out:
> +	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
> +	xfs_iunlock(ip, XFS_IOLOCK_EXCL);

That should be XFS_ILOCK_EXCL....

> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 818c623..9c9c1ff 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -807,7 +807,8 @@ xfs_file_fallocate(
>  	int		cmd = XFS_IOC_RESVSP;
>  	int		attr_flags = XFS_ATTR_NOLOCK;
>  
> -	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
> +	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
> +		     FALLOC_FL_COLLAPSE_RANGE))
>  		return -EOPNOTSUPP;
>  
>  	bf.l_whence = 0;
> @@ -819,10 +820,19 @@ xfs_file_fallocate(
>  	if (mode & FALLOC_FL_PUNCH_HOLE)
>  		cmd = XFS_IOC_UNRESVSP;
>  
> -	/* check the new inode size is valid before allocating */
> -	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
> -	    offset + len > i_size_read(inode)) {
> +	/* Shrink size in case of FALLOC_FL_COLLAPSE_RANGE */
> +	if (mode & FALLOC_FL_COLLAPSE_RANGE) {
> +		cmd = XFS_COLLAPSE_RANGE;
> +		if ((offset + len) > i_size_read(inode))
> +			new_size = offset;

That's an illegal case according to the higher layers. Don't handle
it here, replace it with:

		ASSERT(offset + len < i_size_read(inode));

> +		else
> +			new_size = i_size_read(inode) - len;

> +	} else if (!(mode & FALLOC_FL_KEEP_SIZE) &&
> +	    offset + len > i_size_read(inode))
>  		new_size = offset + len;
> +
> +	/* check the new inode size is valid before allocating */
> +	if (new_size || mode & FALLOC_FL_COLLAPSE_RANGE) {

That's a bit ugly.

	if (new_size != i_size_read(inode)) {
		....

would be better, and it handles the case of the new size being zero.

>  		error = inode_newsize_ok(inode, new_size);
>  		if (error)
>  			goto out_unlock;
> @@ -836,7 +846,7 @@ xfs_file_fallocate(
>  		goto out_unlock;
>  
>  	/* Change file size if needed */
> -	if (new_size) {
> +	if (new_size ||  mode & FALLOC_FL_COLLAPSE_RANGE) {
>  		struct iattr iattr;
>  
>  		iattr.ia_valid = ATTR_SIZE;

Same again.


> diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
> index 1edb5cc..99f5244 100644
> --- a/fs/xfs/xfs_fs.h
> +++ b/fs/xfs/xfs_fs.h
> @@ -516,6 +516,12 @@ typedef struct xfs_swapext
>  #define XFS_IOC_GETBMAPX	_IOWR('X', 56, struct getbmap)
>  #define XFS_IOC_ZERO_RANGE	_IOW ('X', 57, struct xfs_flock64)
>  #define XFS_IOC_FREE_EOFBLOCKS	_IOR ('X', 58, struct xfs_eofblocks)
> +/*
> + * Although there is no ioctl implemented yet, we reserve an ioctl number for
> + * representing collapse range operation to avoid any possible collision in
> + * switch case of xfs_change_file_space.
> + */
> +#define XFS_COLLAPSE_RANGE	_IOW('X', 59, struct xfs_flock64)

XFS_IOC_COLLAPSE_RANGE.

Cheers,

Dave.
NamJae Jeon - Oct. 10, 2013, 7 a.m.
>
> /*
>  * Shift extent records to the left to cover a hole.
>  *
>  * The maximum number of extents to be shifted in a single operation
>  * is @count, and @current_ext keeps track of the current extent
>  * index we have shifted. If there is no hole to shift the extents
>  * into, then we abort immediately.
>  */
Thanks for your help. I will change this comment instead of original one.

>> +int
>> +xfs_bmap_shift_extents(
>> +     struct xfs_trans        *tp,
>> +     struct xfs_inode        *ip,
>> +     int                     *done,
>> +     xfs_fileoff_t           start_fsb,
>> +     xfs_fileoff_t           shift,
>
> Shift means ...? Number of extents to shift, a length, a number of
> block, or something else?
Ah, yes, shift_len would be a more proper name

>
>> +     xfs_extnum_t            *current_ext,
>> +     xfs_fsblock_t           *firstblock,
>> +     struct xfs_bmap_free    *flist,
>> +     int                     count)
>
> if count is the number of extents to shift, then it should be named
> "num_exts" or something similar to describe what it is a count of.
Right, I will change num_exts.

>
>> +{
>> +     struct xfs_btree_cur            *cur;
>> +     struct xfs_bmbt_rec_host        *gotp;
>> +     struct xfs_bmbt_irec            left;
>> +     struct xfs_mount                *mp = ip->i_mount;
>> +     struct xfs_ifork                *ifp;
>> +     xfs_extnum_t                    nexts = 0;
>> +     xfs_fileoff_t                   startoff;
>> +     int                             error = 0;
>> +     int                             i;
>> +     int                             whichfork = XFS_DATA_FORK;
>> +     int                             state;
>> +     int                             logflags;
>> +     xfs_filblks_t                   blockcount = 0;
>> +
>> +     if (unlikely(XFS_TEST_ERROR(
>> +         (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
>> +          XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
>> +          mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
>> +             XFS_ERROR_REPORT("xfs_bmap_shift_extents",
>> +                              XFS_ERRLEVEL_LOW, mp);
>> +             return XFS_ERROR(EFSCORRUPTED);
>> +     }
>> +
>> +     if (XFS_FORCED_SHUTDOWN(mp))
>> +             return XFS_ERROR(EIO);
>> +
>> +     ifp = XFS_IFORK_PTR(ip, whichfork);
>> +
>> +     if (!(ifp->if_flags & XFS_IFEXTENTS)) {
>> +             /* Read in all the extents */
>> +             error = xfs_iread_extents(tp, ip, whichfork);
>> +             if (error)
>> +                     return error;
>> +     }
>> +
>> +     if (!*current_ext) {
>
> I had to do a double take on that, because I thought it was checking
> for a null pointer at first. It's not, so at the start of the
> function:
>
>         ASSERT(current_ext != NULL);
>
> secondly, it's checking for a zero count, so make it clear in this
> case:
>
>         if (*current_ext == 0) {
Okay, I will update like this.

>         ....
>> +             gotp = xfs_iext_bno_to_ext(ifp, start_fsb, current_ext);
>> +             /*
>> +              * gotp can be null in 2 cases: 1) if there are no extents
>> +              * or 2) start_fsb lies in a hole beyond which there are
>> +              * no extents. Either way, we are done.
>> +              */
>> +             if (!gotp) {
>> +                     *done = 1;
>> +                     return 0;
>> +             }
>
> What does "gotp" mean in this context? Yes, it's the extent we got
> from a lookup, but what extent is that? Is it the extent we are
> shifting, the extent we are shifting it up against, or something
> else?
>
>> +     }
>> +
>> +     /* We are going to change core inode */
>> +     logflags = XFS_ILOG_CORE;
>> +
>> +     if (ifp->if_flags & XFS_IFBROOT) {
>> +             cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
>> +             cur->bc_private.b.firstblock = *firstblock;
>> +             cur->bc_private.b.flist = flist;
>> +             cur->bc_private.b.flags = 0;
>> +             }
>> +     else {
>> +             cur = NULL;
>> +             logflags |= XFS_ILOG_DEXT;
>> +     }
>> +
>> +     while (nexts++ < count &&
>> +            *current_ext <  XFS_IFORK_NEXTENTS(ip, whichfork)) {
>> +             state = 0;
>> +
>> +             gotp = xfs_iext_get_ext(ifp, *current_ext);
>> +             startoff = xfs_bmbt_get_startoff(gotp);
>> +             startoff -= shift;
>
>                 xfs_bmbt_get_all(gotp, &got);
>
> and then you can drop all the xfs_bmbt_get*() wrappers.
Okay, I will check it.

>
>> +
>> +             /*
>> +              * Before shifting extent into hole, make sure that the hole
>> +              * is large enough to accomodate the shift.
>> +              */
>> +             if (*current_ext) {
>> +                     state |= BMAP_LEFT_VALID;
>> +                     xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
>> +                                             *current_ext - 1), &left);
>> +
>> +                     if (isnullstartblock(left.br_startblock))
>> +                             state |= BMAP_LEFT_DELAY;
>> +
>> +                     if (startoff < left.br_startoff + left.br_blockcount)
>> +                             error = XFS_ERROR(EFSCORRUPTED);
>
> Why is the filesystem corrupted if the shift we asked for is too
> large for the hole in the file? I haven't seen any checks before
> this that guarantee that the hole is big enough for the shift...

we call xfs_free_file_space to free enough blocks for shifting.
If still the space is not big enough will it be considered as fs corrupted?
What error could we return in this case?

>
>> +
>> +             } else if (startoff > xfs_bmbt_get_startoff(gotp))
>> +                     /* Hole is at the start but not large enough */
>> +                     error = XFS_ERROR(EFSCORRUPTED);
>
> Same question....
>
>> +
>> +             if (error)
>> +                     goto del_cursor;
>> +
>> +             /* Check if we can merge 2 adjacent extents */
>> +             if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
>> +                 left.br_startoff + left.br_blockcount == startoff &&
>> +                 left.br_startblock + left.br_blockcount ==
>> +                 xfs_bmbt_get_startblock(gotp) &&
>> +                 xfs_bmbt_get_state(gotp) == left.br_state &&
>> +                 left.br_blockcount + xfs_bmbt_get_blockcount(gotp) <=
>> +                 MAXEXTLEN) {
>
> The indenting needs work here - whitespace gives lots of context
> that is missing here:
>
>                 if ((state & BMAP_LEFT_VALID) &&
>                     !(state & BMAP_LEFT_DELAY) &&
>                     left.br_startoff + left.br_blockcount == startoff &&
>                     left.br_startblock + left.br_blockcount ==
>                                     xfs_bmbt_get_startblock(gotp) &&
>                     xfs_bmbt_get_state(gotp) == left.br_state &&
>                     left.br_blockcount + xfs_bmbt_get_blockcount(gotp) <=
>                                     MAXEXTLEN) {
>
> And it can be simplified, too:
>
>                 if ((state & BMAP_LEFT_VALID) &&
>                     !(state & BMAP_LEFT_DELAY) &&
>
> is exactly the same as:
>
>                 if (state == BMAP_LEFT_VALID &&
Right. I will update your points.

>
>> +                     blockcount =
>> +                     left.br_blockcount + xfs_bmbt_get_blockcount(gotp);
>> +                     state |= BMAP_LEFT_CONTIG;
>> +                     xfs_iext_remove(ip, *current_ext, 1, 0);
>> +                     XFS_IFORK_NEXT_SET(ip, whichfork
>> +                             XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
>
> Ok, so you remove and extent from the in-memory tree, but I don't
> see where you remove it from the on-disk btree.

Okay, I will add code to remove on-disk btree also.
>
>> +                     gotp = xfs_iext_get_ext(ifp, --*current_ext);
>
>                         xfs_bmbt_get_all(gotp, &got);
>
>> +             }
>> +
>> +             if (cur) {
>> +                     error = xfs_bmbt_lookup_eq(cur,
>> +                                     xfs_bmbt_get_startoff(gotp),
>> +                                     xfs_bmbt_get_startblock(gotp),
>> +                                     xfs_bmbt_get_blockcount(gotp),
>> +                                     &i);
>> +                     if (error)
>> +                             goto del_cursor;
>> +                     XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
>> +             }
>
> This needs to be done before merging extents so the cursor points at
> the record that needs to be deleted from the btree when you merge
> the extent records. i.e. you need to completely separate the extent
> merge case from the update case for both the in-memory extent tree
> update and the on-disk btree update....
Okay.

>
>> +
>>       return xfs_trans_commit(tp, 0);
>>  }
>>
>> +
>> +/*
>> + * xfs_collapse_file_space: Implements the FALLOC_FL_COLLAPSE_SPACE flag.
>> + */
>> +int
>> +xfs_collapse_file_space(
>> +     struct xfs_inode        *ip,
>> +     loff_t                  offset,
>> +     loff_t                  len,
>> +     int                     attr_flags)
>> +{
>> +     int                     done = 0;
>> +     struct xfs_mount        *mp = ip->i_mount;
>> +     uint                    resblks;
>> +     struct xfs_trans        *tp;
>> +     int                     error;
>> +     xfs_extnum_t            current_ext = 0;
>> +     struct xfs_bmap_free    free_list;
>> +     xfs_fsblock_t           first_block;
>> +     int                     committed;
>> +     xfs_fileoff_t   start_fsb = XFS_B_TO_FSB(mp, offset + len);
>> +     xfs_fileoff_t   shift_fsb = XFS_B_TO_FSB(mp, len);
>> +
>> +     resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
>
> Why do we need a stack variable for this?
Ah. I will directly use it instead of stack varable.

>
>> +
>> +     /*
>> +      * The first thing we do is to free data blocks in the specified range
>> +      * by calling xfs_free_file_space(). It would also sync dirty data
>> +      * and invalidate page cache over the region on which collapse range
>> +      * is working.
>> +      */
>> +
>> +     error = xfs_free_file_space(ip, offset, len, attr_flags);
>> +     if (error)
>> +             return error;
>
> This separation of punching the hole and collapsing the range means
> that the operation is not atomic w.r.t. concurrent IO, truncate or
> other hole punch/preallocate operations if the XFS_IOLOCK_EXCL is
> not held. Hence we need to ensure this operation is executed with
> the correct locks held by the caller, and the correct flags passed
> into the function. That is, we need these asserts before doing
> anything else in this function:
>
>         ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
>         ASSERT((attr_flags & XFS_ATTR_NOLOCK) == XFS_ATTR_NOLOCK);
>
> This makes it clear that there's a bug in the function's locking in
> the "out" case....
>
Yes, right. I will check.

>> +     while (!error && !done) {
>> +             tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
>> +             tp->t_flags |= XFS_TRANS_RESERVE;
>> +             /*
>> +              * We would need to reserve permanent block for transaction.
>> +              * This will come into picture when after shifting extent into
>> +              * hole we found that adjacent extents can be merged which
>> +              * may lead to freeing of a block during record update.
>> +              */
>> +             error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0);
>> +             if (error) {
>> +                     ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
>> +                     xfs_trans_cancel(tp, 0);
>> +                     break;
>> +             }
>> +
>> +             xfs_ilock(ip, XFS_ILOCK_EXCL);
>> +             error = xfs_trans_reserve_quota(tp, mp,
>> +                             ip->i_udquot, ip->i_gdquot, ip->i_pdquot,
>> +                             resblks, 0, XFS_QMOPT_RES_REGBLKS);
>> +             if (error)
>> +                     goto out;
>> +
>> +             xfs_trans_ijoin(tp, ip, 0);
>> +
>> +             xfs_bmap_init(&free_list, &first_block);
>> +
>> +             /*
>> +              * We are using the write transaction in which max 2 bmbt
>> +              * updates are allowed
>> +              */
>> +             error = xfs_bmap_shift_extents(tp, ip, &done, start_fsb,
>> +                             shift_fsb, &current_ext,
>> +                             &first_block, &free_list, 2);
>> +             if (error)
>> +                     goto out;
>> +
>> +             error = xfs_bmap_finish(&tp, &free_list, &committed);
>> +             if (error)
>> +                     goto out;
>> +
>> +             error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
>> +             xfs_iunlock(ip, XFS_ILOCK_EXCL);
>> +     }
>> +
>> +     return error;
>> +
>> +out:
>> +     xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
>> +     xfs_iunlock(ip, XFS_IOLOCK_EXCL);
>
> That should be XFS_ILOCK_EXCL....
Yes :)

>
>> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
>> index 818c623..9c9c1ff 100644
>> --- a/fs/xfs/xfs_file.c
>> +++ b/fs/xfs/xfs_file.c
>> @@ -807,7 +807,8 @@ xfs_file_fallocate(
>>       int             cmd = XFS_IOC_RESVSP;
>>       int             attr_flags = XFS_ATTR_NOLOCK;
>>
>> -     if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
>> +     if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
>> +                  FALLOC_FL_COLLAPSE_RANGE))
>>               return -EOPNOTSUPP;
>>
>>       bf.l_whence = 0;
>> @@ -819,10 +820,19 @@ xfs_file_fallocate(
>>       if (mode & FALLOC_FL_PUNCH_HOLE)
>>               cmd = XFS_IOC_UNRESVSP;
>>
>> -     /* check the new inode size is valid before allocating */
>> -     if (!(mode & FALLOC_FL_KEEP_SIZE) &&
>> -         offset + len > i_size_read(inode)) {
>> +     /* Shrink size in case of FALLOC_FL_COLLAPSE_RANGE */
>> +     if (mode & FALLOC_FL_COLLAPSE_RANGE) {
>> +             cmd = XFS_COLLAPSE_RANGE;
>> +             if ((offset + len) > i_size_read(inode))
>> +                     new_size = offset;
>
> That's an illegal case according to the higher layers. Don't handle
> it here, replace it with:
>
>                 ASSERT(offset + len < i_size_read(inode));
Okay.

>
>> +             else
>> +                     new_size = i_size_read(inode) - len;
>
>> +     } else if (!(mode & FALLOC_FL_KEEP_SIZE) &&
>> +         offset + len > i_size_read(inode))
>>               new_size = offset + len;
>> +
>> +     /* check the new inode size is valid before allocating */
>> +     if (new_size || mode & FALLOC_FL_COLLAPSE_RANGE) {
>
> That's a bit ugly.
>
>         if (new_size != i_size_read(inode)) {
>                 ....
>
> would be better, and it handles the case of the new size being zero.
Right. Will update it.

>
>>               error = inode_newsize_ok(inode, new_size);
>>               if (error)
>>                       goto out_unlock;
>> @@ -836,7 +846,7 @@ xfs_file_fallocate(
>>               goto out_unlock;
>>
>>       /* Change file size if needed */
>> -     if (new_size) {
>> +     if (new_size ||  mode & FALLOC_FL_COLLAPSE_RANGE) {
>>               struct iattr iattr;
>>
>>               iattr.ia_valid = ATTR_SIZE;
>
> Same again.
okay.

>
>
>> diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
>> index 1edb5cc..99f5244 100644
>> --- a/fs/xfs/xfs_fs.h
>> +++ b/fs/xfs/xfs_fs.h
>> @@ -516,6 +516,12 @@ typedef struct xfs_swapext
>>  #define XFS_IOC_GETBMAPX     _IOWR('X', 56, struct getbmap)
>>  #define XFS_IOC_ZERO_RANGE   _IOW ('X', 57, struct xfs_flock64)
>>  #define XFS_IOC_FREE_EOFBLOCKS       _IOR ('X', 58, struct xfs_eofblocks)
>> +/*
>> + * Although there is no ioctl implemented yet, we reserve an ioctl number for
>> + * representing collapse range operation to avoid any possible collision in
>> + * switch case of xfs_change_file_space.
>> + */
>> +#define XFS_COLLAPSE_RANGE   _IOW('X', 59, struct xfs_flock64)
>
> XFS_IOC_COLLAPSE_RANGE.
Okay.

Thanks for review!
>
> Cheers,
>
> Dave.
> --
> Dave Chinner
> david@fromorbit.com
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Chinner - Oct. 10, 2013, 9:23 p.m.
On Thu, Oct 10, 2013 at 04:00:13PM +0900, Namjae Jeon wrote:
> >
> > /*
> >  * Shift extent records to the left to cover a hole.
> >  *
> >  * The maximum number of extents to be shifted in a single operation
> >  * is @count, and @current_ext keeps track of the current extent
> >  * index we have shifted. If there is no hole to shift the extents
> >  * into, then we abort immediately.
> >  */
> Thanks for your help. I will change this comment instead of original one.
> 
> >> +int
> >> +xfs_bmap_shift_extents(
> >> +     struct xfs_trans        *tp,
> >> +     struct xfs_inode        *ip,
> >> +     int                     *done,
> >> +     xfs_fileoff_t           start_fsb,
> >> +     xfs_fileoff_t           shift,
> >
> > Shift means ...? Number of extents to shift, a length, a number of
> > block, or something else?
> Ah, yes, shift_len would be a more proper name

I'm not sure that's a lot better. What are we shifting? We are
shifting the offset of the blocks, right? And the unit is in fsb?
So perhaps offset_shift_fsb, and add that to the description of the
function above?

> >> +             /*
> >> +              * Before shifting extent into hole, make sure that the hole
> >> +              * is large enough to accomodate the shift.
> >> +              */
> >> +             if (*current_ext) {
> >> +                     state |= BMAP_LEFT_VALID;
> >> +                     xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
> >> +                                             *current_ext - 1), &left);
> >> +
> >> +                     if (isnullstartblock(left.br_startblock))
> >> +                             state |= BMAP_LEFT_DELAY;
> >> +
> >> +                     if (startoff < left.br_startoff + left.br_blockcount)
> >> +                             error = XFS_ERROR(EFSCORRUPTED);
> >
> > Why is the filesystem corrupted if the shift we asked for is too
> > large for the hole in the file? I haven't seen any checks before
> > this that guarantee that the hole is big enough for the shift...
> 
> we call xfs_free_file_space to free enough blocks for shifting.
> If still the space is not big enough will it be considered as fs corrupted?
> What error could we return in this case?

Hole punching rounds inwards, and the amount of rounding is not
necessarily the nearest filesystem block. Again it's the block size
smaller than page size case that will trip you over here, as the
rounding  when punching holes will be done to page size, not
filesystem block size. Hence it's entirely possible that your
calculated shift start and lengths don't match the size of the hole
that was punched.

That doesn't mean there was a corruption - just that the hole wasn't
the size and shape that was expected....

Cheers,

Dave.
NamJae Jeon - Oct. 14, 2013, 3:30 a.m.
2013/10/11 Dave Chinner <david@fromorbit.com>:
> On Thu, Oct 10, 2013 at 04:00:13PM +0900, Namjae Jeon wrote:
>> >
>> > /*
>> >  * Shift extent records to the left to cover a hole.
>> >  *
>> >  * The maximum number of extents to be shifted in a single operation
>> >  * is @count, and @current_ext keeps track of the current extent
>> >  * index we have shifted. If there is no hole to shift the extents
>> >  * into, then we abort immediately.
>> >  */
>> Thanks for your help. I will change this comment instead of original one.
>>
>> >> +int
>> >> +xfs_bmap_shift_extents(
>> >> +     struct xfs_trans        *tp,
>> >> +     struct xfs_inode        *ip,
>> >> +     int                     *done,
>> >> +     xfs_fileoff_t           start_fsb,
>> >> +     xfs_fileoff_t           shift,
>> >
>> > Shift means ...? Number of extents to shift, a length, a number of
>> > block, or something else?
>> Ah, yes, shift_len would be a more proper name
>
> I'm not sure that's a lot better. What are we shifting? We are
> shifting the offset of the blocks, right? And the unit is in fsb?
> So perhaps offset_shift_fsb, and add that to the description of the
> function above?
Okay, I will change it as your suggestion.

>
>> >> +             /*
>> >> +              * Before shifting extent into hole, make sure that the hole
>> >> +              * is large enough to accomodate the shift.
>> >> +              */
>> >> +             if (*current_ext) {
>> >> +                     state |= BMAP_LEFT_VALID;
>> >> +                     xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
>> >> +                                             *current_ext - 1), &left);
>> >> +
>> >> +                     if (isnullstartblock(left.br_startblock))
>> >> +                             state |= BMAP_LEFT_DELAY;
>> >> +
>> >> +                     if (startoff < left.br_startoff + left.br_blockcount)
>> >> +                             error = XFS_ERROR(EFSCORRUPTED);
>> >
>> > Why is the filesystem corrupted if the shift we asked for is too
>> > large for the hole in the file? I haven't seen any checks before
>> > this that guarantee that the hole is big enough for the shift...
>>
>> we call xfs_free_file_space to free enough blocks for shifting.
>> If still the space is not big enough will it be considered as fs corrupted?
>> What error could we return in this case?
>
> Hole punching rounds inwards, and the amount of rounding is not
> necessarily the nearest filesystem block. Again it's the block size
> smaller than page size case that will trip you over here, as the
> rounding  when punching holes will be done to page size, not
> filesystem block size. Hence it's entirely possible that your
> calculated shift start and lengths don't match the size of the hole
> that was punched.
>
> That doesn't mean there was a corruption - just that the hole wasn't
> the size and shape that was expected....
Right. I will consider your points.

Thanks very much for your valuable comments.
>
> Cheers,
>
> Dave.
> --
> Dave Chinner
> david@fromorbit.com
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 92b8309..c12358e 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5356,3 +5356,177 @@  error0:
 	}
 	return error;
 }
+
+/*
+ * Update extents by shifting them downwards into a hole.
+ * At max count number of extents will be shifted and *current_ext
+ * is the extent number which is currently being shifted.
+ * This function will return error if the hole is not present
+ * while shifting extents. On success, 0 is returned.
+ */
+int
+xfs_bmap_shift_extents(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	int			*done,
+	xfs_fileoff_t		start_fsb,
+	xfs_fileoff_t		shift,
+	xfs_extnum_t		*current_ext,
+	xfs_fsblock_t		*firstblock,
+	struct xfs_bmap_free	*flist,
+	int			count)
+{
+	struct xfs_btree_cur		*cur;
+	struct xfs_bmbt_rec_host	*gotp;
+	struct xfs_bmbt_irec		left;
+	struct xfs_mount		*mp = ip->i_mount;
+	struct xfs_ifork		*ifp;
+	xfs_extnum_t			nexts = 0;
+	xfs_fileoff_t			startoff;
+	int				error = 0;
+	int				i;
+	int				whichfork = XFS_DATA_FORK;
+	int				state;
+	int				logflags;
+	xfs_filblks_t			blockcount = 0;
+
+	if (unlikely(XFS_TEST_ERROR(
+	    (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+	     XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
+	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+		XFS_ERROR_REPORT("xfs_bmap_shift_extents",
+				 XFS_ERRLEVEL_LOW, mp);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+
+	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+		/* Read in all the extents */
+		error = xfs_iread_extents(tp, ip, whichfork);
+		if (error)
+			return error;
+	}
+
+	if (!*current_ext) {
+		gotp = xfs_iext_bno_to_ext(ifp, start_fsb, current_ext);
+		/*
+		 * gotp can be null in 2 cases: 1) if there are no extents
+		 * or 2) start_fsb lies in a hole beyond which there are
+		 * no extents. Either way, we are done.
+		 */
+		if (!gotp) {
+			*done = 1;
+			return 0;
+		}
+	}
+
+	/* We are going to change core inode */
+	logflags = XFS_ILOG_CORE;
+
+	if (ifp->if_flags & XFS_IFBROOT) {
+		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+		cur->bc_private.b.firstblock = *firstblock;
+		cur->bc_private.b.flist = flist;
+		cur->bc_private.b.flags = 0;
+		}
+	else {
+		cur = NULL;
+		logflags |= XFS_ILOG_DEXT;
+	}
+
+	while (nexts++ < count &&
+	       *current_ext <  XFS_IFORK_NEXTENTS(ip, whichfork)) {
+		state = 0;
+
+		gotp = xfs_iext_get_ext(ifp, *current_ext);
+		startoff = xfs_bmbt_get_startoff(gotp);
+		startoff -= shift;
+
+		/*
+		 * Before shifting extent into hole, make sure that the hole
+		 * is large enough to accomodate the shift.
+		 */
+		if (*current_ext) {
+			state |= BMAP_LEFT_VALID;
+			xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
+						*current_ext - 1), &left);
+
+			if (isnullstartblock(left.br_startblock))
+				state |= BMAP_LEFT_DELAY;
+
+			if (startoff < left.br_startoff + left.br_blockcount)
+				error = XFS_ERROR(EFSCORRUPTED);
+
+		} else if (startoff > xfs_bmbt_get_startoff(gotp))
+			/* Hole is at the start but not large enough */
+			error = XFS_ERROR(EFSCORRUPTED);
+
+		if (error)
+			goto del_cursor;
+
+		/* Check if we can merge 2 adjacent extents */
+		if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
+		    left.br_startoff + left.br_blockcount == startoff &&
+		    left.br_startblock + left.br_blockcount ==
+		    xfs_bmbt_get_startblock(gotp) &&
+		    xfs_bmbt_get_state(gotp) == left.br_state &&
+		    left.br_blockcount + xfs_bmbt_get_blockcount(gotp) <=
+		    MAXEXTLEN) {
+			blockcount =
+			left.br_blockcount + xfs_bmbt_get_blockcount(gotp);
+			state |= BMAP_LEFT_CONTIG;
+			xfs_iext_remove(ip, *current_ext, 1, 0);
+			XFS_IFORK_NEXT_SET(ip, whichfork,
+				XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+			gotp = xfs_iext_get_ext(ifp, --*current_ext);
+		}
+
+		if (cur) {
+			error = xfs_bmbt_lookup_eq(cur,
+					xfs_bmbt_get_startoff(gotp),
+					xfs_bmbt_get_startblock(gotp),
+					xfs_bmbt_get_blockcount(gotp),
+					&i);
+			if (error)
+				goto del_cursor;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+		}
+
+		if (state & BMAP_LEFT_CONTIG) {
+			/* We have to update extent block count */
+			xfs_bmbt_set_blockcount(gotp, blockcount);
+		} else {
+			/* We have to update the startoff */
+			xfs_bmbt_set_startoff(gotp, startoff);
+		}
+
+		if (cur) {
+			error = xfs_bmbt_update(cur,
+						xfs_bmbt_get_startoff(gotp),
+						xfs_bmbt_get_startblock(gotp),
+						xfs_bmbt_get_blockcount(gotp),
+						xfs_bmbt_get_state(gotp));
+			if (error)
+				goto del_cursor;
+		}
+
+		(*current_ext)++;
+	}
+
+	/* Check if we are done */
+	if (*current_ext ==  XFS_IFORK_NEXTENTS(ip, whichfork))
+		*done = 1;
+
+del_cursor:
+	if (cur)
+		xfs_btree_del_cursor(cur,
+			error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+
+	xfs_trans_log_inode(tp, ip, logflags);
+
+	return error;
+}
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 33b41f3..b16ebfa 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -169,5 +169,8 @@  int	xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
 int	xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
 		xfs_extnum_t num);
 uint	xfs_default_attroffset(struct xfs_inode *ip);
+int	xfs_bmap_shift_extents(struct xfs_trans *, struct xfs_inode *,
+			int *, xfs_fileoff_t, xfs_fileoff_t, xfs_extnum_t *,
+			xfs_fsblock_t *, struct xfs_bmap_free *, int);
 
 #endif	/* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 541d59f..57f045e 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1556,6 +1556,7 @@  xfs_change_file_space(
 	case XFS_IOC_RESVSP64:
 	case XFS_IOC_UNRESVSP:
 	case XFS_IOC_UNRESVSP64:
+	case XFS_COLLAPSE_RANGE:
 		if (bf->l_len <= 0)
 			return XFS_ERROR(EINVAL);
 		break;
@@ -1638,6 +1639,12 @@  xfs_change_file_space(
 
 		clrprealloc = 1;
 		break;
+	case XFS_COLLAPSE_RANGE:
+		error = xfs_collapse_file_space(ip, startoffset, bf->l_len,
+						attr_flags);
+		if (error)
+			return error;
+		break;
 
 	default:
 		ASSERT(0);
@@ -1683,6 +1690,95 @@  xfs_change_file_space(
 	return xfs_trans_commit(tp, 0);
 }
 
+
+/*
+ * xfs_collapse_file_space: Implements the FALLOC_FL_COLLAPSE_SPACE flag.
+ */
+int
+xfs_collapse_file_space(
+	struct xfs_inode	*ip,
+	loff_t			offset,
+	loff_t			len,
+	int			attr_flags)
+{
+	int			done = 0;
+	struct xfs_mount	*mp = ip->i_mount;
+	uint			resblks;
+	struct xfs_trans	*tp;
+	int			error;
+	xfs_extnum_t		current_ext = 0;
+	struct xfs_bmap_free	free_list;
+	xfs_fsblock_t		first_block;
+	int			committed;
+	xfs_fileoff_t	start_fsb = XFS_B_TO_FSB(mp, offset + len);
+	xfs_fileoff_t	shift_fsb = XFS_B_TO_FSB(mp, len);
+
+	resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+
+	/*
+	 * The first thing we do is to free data blocks in the specified range
+	 * by calling xfs_free_file_space(). It would also sync dirty data
+	 * and invalidate page cache over the region on which collapse range
+	 * is working.
+	 */
+
+	error = xfs_free_file_space(ip, offset, len, attr_flags);
+	if (error)
+		return error;
+
+	while (!error && !done) {
+		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+		tp->t_flags |= XFS_TRANS_RESERVE;
+		/*
+		 * We would need to reserve permanent block for transaction.
+		 * This will come into picture when after shifting extent into
+		 * hole we found that adjacent extents can be merged which
+		 * may lead to freeing of a block during record update.
+		 */
+		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0);
+		if (error) {
+			ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+			xfs_trans_cancel(tp, 0);
+			break;
+		}
+
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		error = xfs_trans_reserve_quota(tp, mp,
+				ip->i_udquot, ip->i_gdquot, ip->i_pdquot,
+				resblks, 0, XFS_QMOPT_RES_REGBLKS);
+		if (error)
+			goto out;
+
+		xfs_trans_ijoin(tp, ip, 0);
+
+		xfs_bmap_init(&free_list, &first_block);
+
+		/*
+		 * We are using the write transaction in which max 2 bmbt
+		 * updates are allowed
+		 */
+		error = xfs_bmap_shift_extents(tp, ip, &done, start_fsb,
+				shift_fsb, &current_ext,
+				&first_block, &free_list, 2);
+		if (error)
+			goto out;
+
+		error = xfs_bmap_finish(&tp, &free_list, &committed);
+		if (error)
+			goto out;
+
+		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	}
+
+	return error;
+
+out:
+	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+	return error;
+}
+
 /*
  * We need to check that the format of the data fork in the temporary inode is
  * valid for the target inode before doing the swap. This is not a problem with
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 0612609..588d29d 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -97,6 +97,8 @@  int	xfs_change_file_space(struct xfs_inode *ip, int cmd,
 			      xfs_flock64_t *bf, xfs_off_t offset,
 			      int attr_flags);
 
+int xfs_collapse_file_space(struct xfs_inode *, loff_t, loff_t, int);
+
 /* EOF block manipulation functions */
 bool	xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
 int	xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 818c623..9c9c1ff 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -807,7 +807,8 @@  xfs_file_fallocate(
 	int		cmd = XFS_IOC_RESVSP;
 	int		attr_flags = XFS_ATTR_NOLOCK;
 
-	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+		     FALLOC_FL_COLLAPSE_RANGE))
 		return -EOPNOTSUPP;
 
 	bf.l_whence = 0;
@@ -819,10 +820,19 @@  xfs_file_fallocate(
 	if (mode & FALLOC_FL_PUNCH_HOLE)
 		cmd = XFS_IOC_UNRESVSP;
 
-	/* check the new inode size is valid before allocating */
-	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
-	    offset + len > i_size_read(inode)) {
+	/* Shrink size in case of FALLOC_FL_COLLAPSE_RANGE */
+	if (mode & FALLOC_FL_COLLAPSE_RANGE) {
+		cmd = XFS_COLLAPSE_RANGE;
+		if ((offset + len) > i_size_read(inode))
+			new_size = offset;
+		else
+			new_size = i_size_read(inode) - len;
+	} else if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+	    offset + len > i_size_read(inode))
 		new_size = offset + len;
+
+	/* check the new inode size is valid before allocating */
+	if (new_size || mode & FALLOC_FL_COLLAPSE_RANGE) {
 		error = inode_newsize_ok(inode, new_size);
 		if (error)
 			goto out_unlock;
@@ -836,7 +846,7 @@  xfs_file_fallocate(
 		goto out_unlock;
 
 	/* Change file size if needed */
-	if (new_size) {
+	if (new_size ||  mode & FALLOC_FL_COLLAPSE_RANGE) {
 		struct iattr iattr;
 
 		iattr.ia_valid = ATTR_SIZE;
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 1edb5cc..99f5244 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -516,6 +516,12 @@  typedef struct xfs_swapext
 #define XFS_IOC_GETBMAPX	_IOWR('X', 56, struct getbmap)
 #define XFS_IOC_ZERO_RANGE	_IOW ('X', 57, struct xfs_flock64)
 #define XFS_IOC_FREE_EOFBLOCKS	_IOR ('X', 58, struct xfs_eofblocks)
+/*
+ * Although there is no ioctl implemented yet, we reserve an ioctl number for
+ * representing collapse range operation to avoid any possible collision in
+ * switch case of xfs_change_file_space.
+ */
+#define XFS_COLLAPSE_RANGE	_IOW('X', 59, struct xfs_flock64)
 
 /*
  * ioctl commands that replace IRIX syssgi()'s