Patchwork [2/3] xfs: Implement FALLOC_FL_COLLAPSE_RANGE

login
register
mail settings
Submitter NamJae Jeon
Date July 31, 2013, 2:42 p.m.
Message ID <1375281734-15874-1-git-send-email-linkinjeon@gmail.com>
Download mbox | patch
Permalink /patch/263728/
State Rejected
Headers show

Comments

NamJae Jeon - July 31, 2013, 2:42 p.m.
From: Namjae Jeon <namjae.jeon@samsung.com>

New fallocate flag FALLOC_FL_COLLAPSE_RANGE implementation for XFS.

Signed-off-by: Namjae Jeon <namjae.jeon@samsung.com>
Signed-off-by: Ashish Sangwan <a.sangwan@samsung.com>
---
 fs/xfs/xfs_bmap.c     |   92 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_bmap.h     |    3 ++
 fs/xfs/xfs_file.c     |   26 ++++++++++++--
 fs/xfs/xfs_iops.c     |   35 +++++++++++++++++++
 fs/xfs/xfs_vnodeops.c |   62 +++++++++++++++++++++++++++++++++
 fs/xfs/xfs_vnodeops.h |    2 ++
 6 files changed, 217 insertions(+), 3 deletions(-)
Ben Myers - July 31, 2013, 9:11 p.m.
Hey Namjae,

On Wed, Jul 31, 2013 at 11:42:14PM +0900, Namjae Jeon wrote:
> From: Namjae Jeon <namjae.jeon@samsung.com>
> 
> New fallocate flag FALLOC_FL_COLLAPSE_RANGE implementation for XFS.
> 
> Signed-off-by: Namjae Jeon <namjae.jeon@samsung.com>
> Signed-off-by: Ashish Sangwan <a.sangwan@samsung.com>
    
Very cool feature!  ;) 

I have a couple initial suggestions/questions:

> ---
>  fs/xfs/xfs_bmap.c     |   92 +++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_bmap.h     |    3 ++
>  fs/xfs/xfs_file.c     |   26 ++++++++++++--
>  fs/xfs/xfs_iops.c     |   35 +++++++++++++++++++
>  fs/xfs/xfs_vnodeops.c |   62 +++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_vnodeops.h |    2 ++
>  6 files changed, 217 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
> index 05c698c..ae677b1 100644
> --- a/fs/xfs/xfs_bmap.c
> +++ b/fs/xfs/xfs_bmap.c
> @@ -6145,3 +6145,95 @@ next_block:
>  
>  	return error;
>  }
> +
> +/*
> + * xfs_update_logical()
> + *	Updates starting logical block of extents by subtracting
> + *	shift from them. At max XFS_LINEAR_EXTS number extents
> + *	will be updated and *current_ext is the extent number which
> + *	is currently being updated.
> + */
> +int
> +xfs_update_logical(
> +	xfs_trans_t		*tp,
> +	struct xfs_inode	*ip,
> +	int			*done,
> +	xfs_fileoff_t		start_fsb,
> +	xfs_fileoff_t		shift,
> +	xfs_extnum_t		*current_ext)

Could we find a better name for this function?  Maybe something like
xfs_shift_extent_startblocks or xfs_shift_extent_offsets?

Also, is there also a case for being able to shift extent offsets upward in the
file's address space so that you can splice in a chunk of data?  For that I
think maybe you'd want to be able to shift on sub-block boundaries too, and
there would be some copying and zeroing involved on the boundary block.  Not sure.

> +{
> +	xfs_btree_cur_t		*cur;
> +	xfs_bmbt_rec_host_t	*gotp;
> +	xfs_mount_t		*mp;
> +	xfs_ifork_t		*ifp;
> +	xfs_extnum_t		nexts = 0;
> +	xfs_fileoff_t		startoff;
> +	int			error = 0;
> +	int			i;
> +
> +	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
> +	mp = ip->i_mount;
> +
> +	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
> +		(error = xfs_iread_extents(tp, ip, XFS_DATA_FORK)))
> +		return error;
> +
> +	if (!*current_ext) {
> +		gotp = xfs_iext_bno_to_ext(ifp, start_fsb, current_ext);
> +		/*
> +		 * gotp can be null in 2 cases: 1) if there are no extents
> +		 * or 2) start_fsb lies in a hole beyond which there are
> +		 * no extents. Either way, we are done.
> +		 */
> +		if (!gotp) {
> +			*done = 1;
> +			return 0;
> +		}
> +	}
> +
> +	if (ifp->if_flags & XFS_IFBROOT)
> +		cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
> +	else
> +		cur = NULL;
> +
> +	while (nexts++ < XFS_LINEAR_EXTS &&
> +	       *current_ext <  XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK)) {
> +		gotp = xfs_iext_get_ext(ifp, (*current_ext)++);
> +		startoff = xfs_bmbt_get_startoff(gotp);
> +		if (cur) {
> +			if ((error = xfs_bmbt_lookup_eq(cur,
> +					startoff,
> +					xfs_bmbt_get_startblock(gotp),
> +					xfs_bmbt_get_blockcount(gotp),
> +					&i)))
> +				goto del_cursor;
> +			XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
> +		}
> +		startoff -= shift;
> +		xfs_bmbt_set_startoff(gotp, startoff);
> +		if (cur) {
> +			error = xfs_bmbt_update(cur, startoff,
> +						xfs_bmbt_get_startblock(gotp),
> +						xfs_bmbt_get_blockcount(gotp),
> +						xfs_bmbt_get_state(gotp));
> +			if (error)
> +				goto del_cursor;
> +		}
> +	}
> +
> +	/* Check if we are done */
> +	if (*current_ext ==  XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK))
> +		*done = 1;
> +
> +del_cursor:
> +	if (cur) {
> +		if (!error)
> +			cur->bc_private.b.allocated = 0;
> +		 xfs_btree_del_cursor(cur,
> +				error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
> +	}
> +
> +	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE | XFS_ILOG_DEXT);
> +
> +	return error;
> +}
> diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
> index 1cf1292..f923734 100644
> --- a/fs/xfs/xfs_bmap.h
> +++ b/fs/xfs/xfs_bmap.h
> @@ -204,6 +204,9 @@ int	xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
>  int	xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
>  		xfs_extnum_t num);
>  uint	xfs_default_attroffset(struct xfs_inode *ip);
> +int	xfs_update_logical(xfs_trans_t *tp, struct xfs_inode *ip, int *done,
> +		xfs_fileoff_t start_fsb, xfs_fileoff_t shift,
> +		xfs_extnum_t *current_ext);
>  
>  #ifdef __KERNEL__
>  /* bmap to userspace formatter - copy to user & advance pointer */
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index de3dc98..7d871bc 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -817,7 +817,12 @@ xfs_file_fallocate(
>  	int		cmd = XFS_IOC_RESVSP;
>  	int		attr_flags = XFS_ATTR_NOLOCK;
>  
> -	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
> +	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
> +		     FALLOC_FL_COLLAPSE_RANGE))
> +		return -EOPNOTSUPP;
> +
> +	/* not just yet for rt inodes */
> +	if ((mode & FALLOC_FL_COLLAPSE_RANGE) && XFS_IS_REALTIME_INODE(ip))
>  		return -EOPNOTSUPP;
>  
>  	bf.l_whence = 0;
> @@ -826,11 +831,11 @@ xfs_file_fallocate(
>  
>  	xfs_ilock(ip, XFS_IOLOCK_EXCL);
>  
> -	if (mode & FALLOC_FL_PUNCH_HOLE)
> +	if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE))
>  		cmd = XFS_IOC_UNRESVSP;
>  
>  	/* check the new inode size is valid before allocating */
> -	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
> +	if (!(mode & (FALLOC_FL_KEEP_SIZE | FALLOC_FL_COLLAPSE_RANGE)) &&
>  	    offset + len > i_size_read(inode)) {
>  		new_size = offset + len;
>  		error = inode_newsize_ok(inode, new_size);
> @@ -845,6 +850,21 @@ xfs_file_fallocate(
>  	if (error)
>  		goto out_unlock;
>  
> +	if (mode & FALLOC_FL_COLLAPSE_RANGE) {
> +		error = -xfs_collapse_file_space(ip, offset + len, len);
> +		if (error || offset >= i_size_read(inode))
> +			goto out_unlock;
> +
> +		/* Shrink size in case of FALLOC_FL_COLLAPSE_RANGE */
> +		if ((offset + len) > i_size_read(inode))
> +			new_size = offset;
> +		else
> +			new_size = i_size_read(inode) - len;
> +		error = -xfs_update_file_size(ip, new_size);
> +
> +		goto out_unlock;
> +	}
> +

Since you're not implementing an XFS_IOC_COLLAPSE_RANGE flag for the
xfs_change_file_space interface, it might be cleaner not to use change_space at
all... e.g. call xfs_free_file_space directly from xfs_collapse_file_space or
in the above conditional (making the conditional exclusive with the
change_space call).

Alternatively, you could implement this fully through xfs_change_file_space.

Either way I think it would be best for it to be all or nothing with respect to
the xfs_change_file_space interface, and my preference is that you implement
this through xfs_collapse_file_space interface just as the rest of these
operations are implemented in xfs.

>  	/* Change file size if needed */
>  	if (new_size) {
>  		struct iattr iattr;
> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> index 96dda62..16b20f1 100644
> --- a/fs/xfs/xfs_iops.c
> +++ b/fs/xfs/xfs_iops.c
> @@ -1236,3 +1236,38 @@ xfs_setup_inode(
>  
>  	unlock_new_inode(inode);
>  }
> +
> +/*
> + * xfs_update_file_size()
> + *	Just a simple disk size and time update
> + */
> +int
> +xfs_update_file_size(
> +	struct xfs_inode        *ip,
> +	loff_t			newsize)
> +{
> +	xfs_mount_t		*mp = ip->i_mount;
> +	struct inode		*inode = VFS_I(ip);
> +	struct xfs_trans	*tp;
> +	int			error;
> +
> +	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
> +
> +	error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
> +	if (error)
> +		return error;
> +
> +	xfs_trans_ijoin(tp, ip, 0);
> +	truncate_setsize(inode, newsize);
> +	ip->i_d.di_size = newsize;
> +
> +	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
> +
> +	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
> +
> +	if (mp->m_flags & XFS_MOUNT_WSYNC)
> +		xfs_trans_set_sync(tp);
> +
> +	return xfs_trans_commit(tp, 0);
> +
> +}

Did you consider xfs_setattr_size for this?

Thanks,
	Ben
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Chinner - Aug. 1, 2013, 1:18 a.m.
On Wed, Jul 31, 2013 at 11:42:14PM +0900, Namjae Jeon wrote:
> From: Namjae Jeon <namjae.jeon@samsung.com>
> 
> New fallocate flag FALLOC_FL_COLLAPSE_RANGE implementation for XFS.

A good start, but there's lots of work needed to make this safe for
general use.

> Signed-off-by: Namjae Jeon <namjae.jeon@samsung.com>
> Signed-off-by: Ashish Sangwan <a.sangwan@samsung.com>
> ---
>  fs/xfs/xfs_bmap.c     |   92 +++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_bmap.h     |    3 ++
>  fs/xfs/xfs_file.c     |   26 ++++++++++++--
>  fs/xfs/xfs_iops.c     |   35 +++++++++++++++++++
>  fs/xfs/xfs_vnodeops.c |   62 +++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_vnodeops.h |    2 ++
>  6 files changed, 217 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
> index 05c698c..ae677b1 100644
> --- a/fs/xfs/xfs_bmap.c
> +++ b/fs/xfs/xfs_bmap.c
> @@ -6145,3 +6145,95 @@ next_block:
>  
>  	return error;
>  }
> +
> +/*
> + * xfs_update_logical()
> + *	Updates starting logical block of extents by subtracting
> + *	shift from them. At max XFS_LINEAR_EXTS number extents
> + *	will be updated and *current_ext is the extent number which
> + *	is currently being updated.

Single space after the "*" in the comments. Also, format them out to
80 columns.

> + */
> +int
> +xfs_update_logical(
> +	xfs_trans_t		*tp,
> +	struct xfs_inode	*ip,
> +	int			*done,
> +	xfs_fileoff_t		start_fsb,
> +	xfs_fileoff_t		shift,
> +	xfs_extnum_t		*current_ext)

This belongs in xfs_bmap_btree.c, named something like
xfs_bmbt_shift_rec_offsets().

Also, not typedefs for structures, please. (i.e. struct xfs_trans).


> +{
> +	xfs_btree_cur_t		*cur;

	struct xfs_btree_cur	*cur;

And for all the other structures, too.

> +	xfs_bmbt_rec_host_t	*gotp;
> +	xfs_mount_t		*mp;
> +	xfs_ifork_t		*ifp;
> +	xfs_extnum_t		nexts = 0;
> +	xfs_fileoff_t		startoff;
> +	int			error = 0;
> +	int			i;
> +
> +	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
> +	mp = ip->i_mount;

Do the assigment in the declaration on mp.

	struct xfs_mount	*mp = ip->i_mount;

> +
> +	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
> +		(error = xfs_iread_extents(tp, ip, XFS_DATA_FORK)))
> +		return error;

Hmmmm - that rings alarm bells.

> +
> +	if (!*current_ext) {
> +		gotp = xfs_iext_bno_to_ext(ifp, start_fsb, current_ext);
> +		/*
> +		 * gotp can be null in 2 cases: 1) if there are no extents
> +		 * or 2) start_fsb lies in a hole beyond which there are
> +		 * no extents. Either way, we are done.
> +		 */
> +		if (!gotp) {
> +			*done = 1;
> +			return 0;
> +		}
> +	}

So, you do a lookup on an extent index, which returns a incore
extent record.

> +
> +	if (ifp->if_flags & XFS_IFBROOT)
> +		cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
> +	else
> +		cur = NULL;
> +
> +	while (nexts++ < XFS_LINEAR_EXTS &&

What has XFS_LINEAR_EXTS got to do with how many extents can be moved
in a single transaction at a time?

> +	       *current_ext <  XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK)) {
> +		gotp = xfs_iext_get_ext(ifp, (*current_ext)++);
> +		startoff = xfs_bmbt_get_startoff(gotp);
> +		if (cur) {
> +			if ((error = xfs_bmbt_lookup_eq(cur,
> +					startoff,
> +					xfs_bmbt_get_startblock(gotp),
> +					xfs_bmbt_get_blockcount(gotp),
> +					&i)))

Please don't put assignments inside if() statements where possible.
i.e.
			error = xfs_bmbt_lookup_eq(cur, ....
			if (error)

Is the normal way of doing this.

> +				goto del_cursor;
> +			XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
> +		}
> +		startoff -= shift;
> +		xfs_bmbt_set_startoff(gotp, startoff);

So, we've looked up and extent, and reduced it's offset by shift.

> +		if (cur) {
> +			error = xfs_bmbt_update(cur, startoff,
> +						xfs_bmbt_get_startblock(gotp),
> +						xfs_bmbt_get_blockcount(gotp),
> +						xfs_bmbt_get_state(gotp));
> +			if (error)
> +				goto del_cursor;

And then we update the btree record in place.

Ok, major alarm bells are going off at this point. Is the record
still in the right place? Where did you validate that the shift
downwards didn't overlap the previous extent in the tree?

What happens if the start or end of the range to be shifted partially
overlaps an extent? The shift can only be done into a hole in the
file offset, so any attempt to shift downwards that overlaps an
existing extent is immediately invalid and indicates something is
wrong (pssobily corruption).

And if the end of the range being shifted partially overlaps an
extent, then that extent needs to be split - it requires two records
in the tree to track the part that was shifted and the part that was
not.

> +		}
> +	}
> +
> +	/* Check if we are done */
> +	if (*current_ext ==  XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK))
> +		*done = 1;
> +
> +del_cursor:
> +	if (cur) {
> +		if (!error)
> +			cur->bc_private.b.allocated = 0;

Um, why?

> -	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
> +	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
> +		     FALLOC_FL_COLLAPSE_RANGE))
> +		return -EOPNOTSUPP;
> +
> +	/* not just yet for rt inodes */
> +	if ((mode & FALLOC_FL_COLLAPSE_RANGE) && XFS_IS_REALTIME_INODE(ip))
>  		return -EOPNOTSUPP;

Why not? The extent manipulations are identical....

> @@ -845,6 +850,21 @@ xfs_file_fallocate(
>  	if (error)
>  		goto out_unlock;
>  
> +	if (mode & FALLOC_FL_COLLAPSE_RANGE) {
> +		error = -xfs_collapse_file_space(ip, offset + len, len);
> +		if (error || offset >= i_size_read(inode))
> +			goto out_unlock;
> +
> +		/* Shrink size in case of FALLOC_FL_COLLAPSE_RANGE */
> +		if ((offset + len) > i_size_read(inode))
> +			new_size = offset;
> +		else
> +			new_size = i_size_read(inode) - len;
> +		error = -xfs_update_file_size(ip, new_size);
> +
> +		goto out_unlock;
> +	}

This needs to vector through xfs_change_file_space() - it already
has code for doing offset/range validity checks, attaching
appropriate dquots for accounting, post-op file size and timestamp
updates, etc.

> +
> +/*
> + * xfs_update_file_size()
> + *	Just a simple disk size and time update
> + */
> +int
> +xfs_update_file_size(
> +	struct xfs_inode        *ip,
> +	loff_t			newsize)

This function should be nuked from orbit. I stopped looking at it
when the bug count got past 5. If you use xfs_change_file_space,
it's not necessary, either.

> diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
> index dc730ac..95b46e7 100644
> --- a/fs/xfs/xfs_vnodeops.c
> +++ b/fs/xfs/xfs_vnodeops.c
> @@ -1868,3 +1868,65 @@ xfs_change_file_space(
>  		xfs_trans_set_sync(tp);
>  	return xfs_trans_commit(tp, 0);
>  }
> +
> +/*
> + * xfs_collapse_file_space()
> + *	This implements the fallocate collapse range functionlaity.
> + *	It removes the hole from file by shifting all the extents which
> + *	lies beyond start block.
> + */
> +int
> +xfs_collapse_file_space(
> +	xfs_inode_t	*ip,
> +	loff_t		start,
> +	loff_t		shift)
> +{
> +	int		done = 0;
> +	xfs_mount_t	*mp = ip->i_mount;
> +	uint		resblks;
> +	xfs_trans_t	*tp;
> +	int		error = 0;
> +	xfs_extnum_t	current_ext = 0;
> +	xfs_fileoff_t	start_fsb = XFS_B_TO_FSB(mp, start);
> +	xfs_fileoff_t	shift_fsb = XFS_B_TO_FSB(mp, shift);
> +	resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
> +
> +	while (!error && !done) {
> +		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
> +		tp->t_flags |= XFS_TRANS_RESERVE;
> +		error = xfs_trans_reserve(tp, resblks, XFS_WRITE_LOG_RES(mp),
> +					  0, XFS_TRANS_PERM_LOG_RES,
> +					  XFS_WRITE_LOG_COUNT);

Why a permanent log reservation for a single, non-nested transaction?

> +		if (error) {
> +			ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
> +			xfs_trans_cancel(tp, 0);
> +			break;
> +		}
> +
> +		xfs_ilock(ip, XFS_ILOCK_EXCL);
> +
> +		error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
> +						ip->i_gdquot, ip->i_pdquot,
> +						resblks, 0,
> +						XFS_QMOPT_RES_REGBLKS);
> +		if (error)
> +			goto out;
> +
> +		xfs_trans_ijoin(tp, ip, 0);
> +
> +		error = xfs_update_logical(tp, ip, &done, start_fsb,
> +					   shift_fsb, &current_ext);
> +		if (error)
> +			goto out;
> +
> +		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
> +		xfs_iunlock(ip, XFS_ILOCK_EXCL);
> +	}

Where are you punching out the blocks in the range we are shifting
the down into? i.e. before you can do this shift operation, you have
to ensure that the range being shifted into has no extents in it.
IOWs, the first operation that needs to be done is a hole punch of
the range being removed - we need xfs_free_file_space() call on the
range first.

And as Ted pointed out, we also need to invalidate the page cache
over the range being moved.

Cheers,

Dave.
NamJae Jeon - Aug. 1, 2013, 5:33 a.m.
2013/8/1, Dave Chinner <david@fromorbit.com>:
> On Wed, Jul 31, 2013 at 11:42:14PM +0900, Namjae Jeon wrote:
>> From: Namjae Jeon <namjae.jeon@samsung.com>
>>
>> New fallocate flag FALLOC_FL_COLLAPSE_RANGE implementation for XFS.
>
Hi Dave.
> A good start, but there's lots of work needed to make this safe for
> general use.
First, Thanks for your advice and help.

>
>> Signed-off-by: Namjae Jeon <namjae.jeon@samsung.com>
>> Signed-off-by: Ashish Sangwan <a.sangwan@samsung.com>
>> ---
>>  fs/xfs/xfs_bmap.c     |   92
>> +++++++++++++++++++++++++++++++++++++++++++++++++
>>  fs/xfs/xfs_bmap.h     |    3 ++
>>  fs/xfs/xfs_file.c     |   26 ++++++++++++--
>>  fs/xfs/xfs_iops.c     |   35 +++++++++++++++++++
>>  fs/xfs/xfs_vnodeops.c |   62 +++++++++++++++++++++++++++++++++
>>  fs/xfs/xfs_vnodeops.h |    2 ++
>>  6 files changed, 217 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
>> index 05c698c..ae677b1 100644
>> --- a/fs/xfs/xfs_bmap.c
>> +++ b/fs/xfs/xfs_bmap.c
>> @@ -6145,3 +6145,95 @@ next_block:
>>
>>  	return error;
>>  }
>> +
>> +/*
>> + * xfs_update_logical()
>> + *	Updates starting logical block of extents by subtracting
>> + *	shift from them. At max XFS_LINEAR_EXTS number extents
>> + *	will be updated and *current_ext is the extent number which
>> + *	is currently being updated.
>
> Single space after the "*" in the comments. Also, format them out to
> 80 columns.
>
Single space after * followed by a tab gives checkpath warning:
WARNING: please, no space before tabs
#29: FILE: fs/xfs/xfs_bmap.c:6151:
+ * ^IUpdates starting logical block of extents by subtracting$

>> + */
>> +int
>> +xfs_update_logical(
>> +	xfs_trans_t		*tp,
>> +	struct xfs_inode	*ip,
>> +	int			*done,
>> +	xfs_fileoff_t		start_fsb,
>> +	xfs_fileoff_t		shift,
>> +	xfs_extnum_t		*current_ext)
>
> This belongs in xfs_bmap_btree.c, named something like
> xfs_bmbt_shift_rec_offsets().
Okay.

>
> Also, not typedefs for structures, please. (i.e. struct xfs_trans).
Okay.

>
>
>> +{
>> +	xfs_btree_cur_t		*cur;
>
> 	struct xfs_btree_cur	*cur;
>
> And for all the other structures, too.
Okay.

>
>> +	xfs_bmbt_rec_host_t	*gotp;
>> +	xfs_mount_t		*mp;
>> +	xfs_ifork_t		*ifp;
>> +	xfs_extnum_t		nexts = 0;
>> +	xfs_fileoff_t		startoff;
>> +	int			error = 0;
>> +	int			i;
>> +
>> +	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
>> +	mp = ip->i_mount;
>
> Do the assigment in the declaration on mp.
Okay.

>
> 	struct xfs_mount	*mp = ip->i_mount;
>
>> +
>> +	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
>> +		(error = xfs_iread_extents(tp, ip, XFS_DATA_FORK)))
>> +		return error;
>
> Hmmmm - that rings alarm bells.
ok, we will remove the alarm.

>
>> +
>> +	if (!*current_ext) {
>> +		gotp = xfs_iext_bno_to_ext(ifp, start_fsb, current_ext);
>> +		/*
>> +		 * gotp can be null in 2 cases: 1) if there are no extents
>> +		 * or 2) start_fsb lies in a hole beyond which there are
>> +		 * no extents. Either way, we are done.
>> +		 */
>> +		if (!gotp) {
>> +			*done = 1;
>> +			return 0;
>> +		}
>> +	}
>
> So, you do a lookup on an extent index, which returns a incore
> extent record.
>
>> +
>> +	if (ifp->if_flags & XFS_IFBROOT)
>> +		cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
>> +	else
>> +		cur = NULL;
>> +
>> +	while (nexts++ < XFS_LINEAR_EXTS &&
>
> What has XFS_LINEAR_EXTS got to do with how many extents can be moved
> in a single transaction at a time?
We are also not sure how many extents to move in 1 transaction.
xfs_bunmapi deletes 3 extents in 1 iteration.
But deletion and updation are 2 different tasks.
BTW, we tested this patch with 10,000 extents and it is working fine.
Could you help us out here? What number would be appropriate?

>
>> +	       *current_ext <  XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK)) {
>> +		gotp = xfs_iext_get_ext(ifp, (*current_ext)++);
>> +		startoff = xfs_bmbt_get_startoff(gotp);
>> +		if (cur) {
>> +			if ((error = xfs_bmbt_lookup_eq(cur,
>> +					startoff,
>> +					xfs_bmbt_get_startblock(gotp),
>> +					xfs_bmbt_get_blockcount(gotp),
>> +					&i)))
>
> Please don't put assignments inside if() statements where possible.
> i.e.
> 			error = xfs_bmbt_lookup_eq(cur, ....
> 			if (error)
>
> Is the normal way of doing this.
Okay:)

>
>> +				goto del_cursor;
>> +			XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
>> +		}
>> +		startoff -= shift;
>> +		xfs_bmbt_set_startoff(gotp, startoff);
>
> So, we've looked up and extent, and reduced it's offset by shift.
>
>> +		if (cur) {
>> +			error = xfs_bmbt_update(cur, startoff,
>> +						xfs_bmbt_get_startblock(gotp),
>> +						xfs_bmbt_get_blockcount(gotp),
>> +						xfs_bmbt_get_state(gotp));
>> +			if (error)
>> +				goto del_cursor;
>
> And then we update the btree record in place.
>
> Ok, major alarm bells are going off at this point. Is the record
> still in the right place? Where did you validate that the shift
> downwards didn't overlap the previous extent in the tree?
>
> What happens if the start or end of the range to be shifted partially
> overlaps an extent? The shift can only be done into a hole in the
> file offset, so any attempt to shift downwards that overlaps an
> existing extent is immediately invalid and indicates something is
> wrong (pssobily corruption).
>
> And if the end of the range being shifted partially overlaps an
> extent, then that extent needs to be split - it requires two records
> in the tree to track the part that was shifted and the part that was
> not.
Your concern is correct.
There can be 2 approach of updating the extent tree.

Approach1: (The way truncate works): We start from the last allocated
extent and move towards the last offset which was punched out.
It mean start updating extents from right side and move towards left
and update btree in between
Approach2: We start from the extent just after the punched area and
move towards the last extent of the file.
It means moving from left to right and update btree in between.
When we used approach 1, we were getting btree corruption but there
seems to be no problem in approach2.
And, that holds true even in the case where 2 extents have _exactly_
same starting offset and length.
So, in approach2, attempt to shift extent that overlaps an existing
extent is working correctly without requiring any extent split.
we will check the reason for it.

>
>> +		}
>> +	}
>> +
>> +	/* Check if we are done */
>> +	if (*current_ext ==  XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK))
>> +		*done = 1;
>> +
>> +del_cursor:
>> +	if (cur) {
>> +		if (!error)
>> +			cur->bc_private.b.allocated = 0;
>
> Um, why?
Okay, will remove.
>
>> -	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
>> +	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
>> +		     FALLOC_FL_COLLAPSE_RANGE))
>> +		return -EOPNOTSUPP;
>> +
>> +	/* not just yet for rt inodes */
>> +	if ((mode & FALLOC_FL_COLLAPSE_RANGE) && XFS_IS_REALTIME_INODE(ip))
>>  		return -EOPNOTSUPP;
>
> Why not? The extent manipulations are identical....
Okay, Agree. we will remove this check in next release.
>
>> @@ -845,6 +850,21 @@ xfs_file_fallocate(
>>  	if (error)
>>  		goto out_unlock;
>>
>> +	if (mode & FALLOC_FL_COLLAPSE_RANGE) {
>> +		error = -xfs_collapse_file_space(ip, offset + len, len);
>> +		if (error || offset >= i_size_read(inode))
>> +			goto out_unlock;
>> +
>> +		/* Shrink size in case of FALLOC_FL_COLLAPSE_RANGE */
>> +		if ((offset + len) > i_size_read(inode))
>> +			new_size = offset;
>> +		else
>> +			new_size = i_size_read(inode) - len;
>> +		error = -xfs_update_file_size(ip, new_size);
>> +
>> +		goto out_unlock;
>> +	}
>
> This needs to vector through xfs_change_file_space() - it already
> has code for doing offset/range validity checks, attaching
> appropriate dquots for accounting, post-op file size and timestamp
> updates, etc.
It already is going through xfs_change_file_space(). Check this=>
@@ -826,11 +831,11 @@ xfs_file_fallocate(

        xfs_ilock(ip, XFS_IOLOCK_EXCL);

-       if (mode & FALLOC_FL_PUNCH_HOLE)
+       if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE))
                cmd = XFS_IOC_UNRESVSP;
So rightly said, we do not need to do validity checks etc for collapse range.

>
>> +
>> +/*
>> + * xfs_update_file_size()
>> + *	Just a simple disk size and time update
>> + */
>> +int
>> +xfs_update_file_size(
>> +	struct xfs_inode        *ip,
>> +	loff_t			newsize)
>
> This function should be nuked from orbit. I stopped looking at it
> when the bug count got past 5. If you use xfs_change_file_space,
> it's not necessary, either.
we are using xfs_change_file_space(). See my comment above. :)
But, xfs_change_file_space does not change logical file size.
Neither can we use xfs_setattr, because it will truncate the
preallocated extents beyond EOF.

>
>> diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
>> index dc730ac..95b46e7 100644
>> --- a/fs/xfs/xfs_vnodeops.c
>> +++ b/fs/xfs/xfs_vnodeops.c
>> @@ -1868,3 +1868,65 @@ xfs_change_file_space(
>>  		xfs_trans_set_sync(tp);
>>  	return xfs_trans_commit(tp, 0);
>>  }
>> +
>> +/*
>> + * xfs_collapse_file_space()
>> + *	This implements the fallocate collapse range functionlaity.
>> + *	It removes the hole from file by shifting all the extents which
>> + *	lies beyond start block.
>> + */
>> +int
>> +xfs_collapse_file_space(
>> +	xfs_inode_t	*ip,
>> +	loff_t		start,
>> +	loff_t		shift)
>> +{
>> +	int		done = 0;
>> +	xfs_mount_t	*mp = ip->i_mount;
>> +	uint		resblks;
>> +	xfs_trans_t	*tp;
>> +	int		error = 0;
>> +	xfs_extnum_t	current_ext = 0;
>> +	xfs_fileoff_t	start_fsb = XFS_B_TO_FSB(mp, start);
>> +	xfs_fileoff_t	shift_fsb = XFS_B_TO_FSB(mp, shift);
>> +	resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
>> +
>> +	while (!error && !done) {
>> +		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
>> +		tp->t_flags |= XFS_TRANS_RESERVE;
>> +		error = xfs_trans_reserve(tp, resblks, XFS_WRITE_LOG_RES(mp),
>> +					  0, XFS_TRANS_PERM_LOG_RES,
>> +					  XFS_WRITE_LOG_COUNT);
>
> Why a permanent log reservation for a single, non-nested transaction?
XFS transaction log reservation code is becoming our major problem.
Could you suggest a proper way?

>
>> +		if (error) {
>> +			ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
>> +			xfs_trans_cancel(tp, 0);
>> +			break;
>> +		}
>> +
>> +		xfs_ilock(ip, XFS_ILOCK_EXCL);
>> +
>> +		error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
>> +						ip->i_gdquot, ip->i_pdquot,
>> +						resblks, 0,
>> +						XFS_QMOPT_RES_REGBLKS);
>> +		if (error)
>> +			goto out;
>> +
>> +		xfs_trans_ijoin(tp, ip, 0);
>> +
>> +		error = xfs_update_logical(tp, ip, &done, start_fsb,
>> +					   shift_fsb, &current_ext);
>> +		if (error)
>> +			goto out;
>> +
>> +		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
>> +		xfs_iunlock(ip, XFS_ILOCK_EXCL);
>> +	}
>
> Where are you punching out the blocks in the range we are shifting
> the down into? i.e. before you can do this shift operation, you have
We are calling xfs_free_file_space via xfs_change_file_space.

> to ensure that the range being shifted into has no extents in it.
> IOWs, the first operation that needs to be done is a hole punch of
> the range being removed - we need xfs_free_file_space() call on the
> range first.
>
> And as Ted pointed out, we also need to invalidate the page cache
> over the range being moved.
xfs_free_file_space does that for us.

Thanks Dave!
>
> Cheers,
>
> Dave.
> --
> Dave Chinner
> david@fromorbit.com
>
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
NamJae Jeon - Aug. 1, 2013, 6:20 a.m.
2013/8/1, Ben Myers <bpm@sgi.com>:
> Hey Namjae,
>
> On Wed, Jul 31, 2013 at 11:42:14PM +0900, Namjae Jeon wrote:
>> From: Namjae Jeon <namjae.jeon@samsung.com>
>>
>> New fallocate flag FALLOC_FL_COLLAPSE_RANGE implementation for XFS.
>>
>> Signed-off-by: Namjae Jeon <namjae.jeon@samsung.com>
>> Signed-off-by: Ashish Sangwan <a.sangwan@samsung.com>
>
Hi Ben.
> Very cool feature!  ;)
Thank you :)
>
> I have a couple initial suggestions/questions:
>
>> ---
>>  fs/xfs/xfs_bmap.c     |   92
>> +++++++++++++++++++++++++++++++++++++++++++++++++
>>  fs/xfs/xfs_bmap.h     |    3 ++
>>  fs/xfs/xfs_file.c     |   26 ++++++++++++--
>>  fs/xfs/xfs_iops.c     |   35 +++++++++++++++++++
>>  fs/xfs/xfs_vnodeops.c |   62 +++++++++++++++++++++++++++++++++
>>  fs/xfs/xfs_vnodeops.h |    2 ++
>>  6 files changed, 217 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
>> index 05c698c..ae677b1 100644
>> --- a/fs/xfs/xfs_bmap.c
>> +++ b/fs/xfs/xfs_bmap.c
>> @@ -6145,3 +6145,95 @@ next_block:
>>
>>  	return error;
>>  }
>> +
>> +/*
>> + * xfs_update_logical()
>> + *	Updates starting logical block of extents by subtracting
>> + *	shift from them. At max XFS_LINEAR_EXTS number extents
>> + *	will be updated and *current_ext is the extent number which
>> + *	is currently being updated.
>> + */
>> +int
>> +xfs_update_logical(
>> +	xfs_trans_t		*tp,
>> +	struct xfs_inode	*ip,
>> +	int			*done,
>> +	xfs_fileoff_t		start_fsb,
>> +	xfs_fileoff_t		shift,
>> +	xfs_extnum_t		*current_ext)
>
> Could we find a better name for this function?  Maybe something like
> xfs_shift_extent_startblocks or xfs_shift_extent_offsets?
Sure, Dave suggessted  xfs_bmbt_shift_rec_offsets().
>
> Also, is there also a case for being able to shift extent offsets upward in
> the
> file's address space so that you can splice in a chunk of data?  For that I
> think maybe you'd want to be able to shift on sub-block boundaries too, and
> there would be some copying and zeroing involved on the boundary block.  Not
> sure.
Currently no such case where we need to shift extents upward.
Also, no need to zero out anything as the offsets are fs block size aligned.
However, for Ext4, previously we submitted this ioctl which moves
extent blocks between files.
https://lkml.org/lkml/2013/6/23/10
This ioctl moves a range of data blocks from one file and appends them
at the end of other file.
This (appending at the end) seems a limitation, as we could easily
insert metadata blocks anywhere in the extent tree of second file.
This operation of inserting new extents in the middle will surely
require the extents of the second file to be moved upward.
We are planning to work on it.

>
>> +{
>> +	xfs_btree_cur_t		*cur;
>> +	xfs_bmbt_rec_host_t	*gotp;
>> +	xfs_mount_t		*mp;
>> +	xfs_ifork_t		*ifp;
>> +	xfs_extnum_t		nexts = 0;
>> +	xfs_fileoff_t		startoff;
>> +	int			error = 0;
>> +	int			i;
>> +
>> +	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
>> +	mp = ip->i_mount;
>> +
>> +	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
>> +		(error = xfs_iread_extents(tp, ip, XFS_DATA_FORK)))
>> +		return error;
>> +
>> +	if (!*current_ext) {
>> +		gotp = xfs_iext_bno_to_ext(ifp, start_fsb, current_ext);
>> +		/*
>> +		 * gotp can be null in 2 cases: 1) if there are no extents
>> +		 * or 2) start_fsb lies in a hole beyond which there are
>> +		 * no extents. Either way, we are done.
>> +		 */
>> +		if (!gotp) {
>> +			*done = 1;
>> +			return 0;
>> +		}
>> +	}
>> +
>> +	if (ifp->if_flags & XFS_IFBROOT)
>> +		cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
>> +	else
>> +		cur = NULL;
>> +
>> +	while (nexts++ < XFS_LINEAR_EXTS &&
>> +	       *current_ext <  XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK)) {
>> +		gotp = xfs_iext_get_ext(ifp, (*current_ext)++);
>> +		startoff = xfs_bmbt_get_startoff(gotp);
>> +		if (cur) {
>> +			if ((error = xfs_bmbt_lookup_eq(cur,
>> +					startoff,
>> +					xfs_bmbt_get_startblock(gotp),
>> +					xfs_bmbt_get_blockcount(gotp),
>> +					&i)))
>> +				goto del_cursor;
>> +			XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
>> +		}
>> +		startoff -= shift;
>> +		xfs_bmbt_set_startoff(gotp, startoff);
>> +		if (cur) {
>> +			error = xfs_bmbt_update(cur, startoff,
>> +						xfs_bmbt_get_startblock(gotp),
>> +						xfs_bmbt_get_blockcount(gotp),
>> +						xfs_bmbt_get_state(gotp));
>> +			if (error)
>> +				goto del_cursor;
>> +		}
>> +	}
>> +
>> +	/* Check if we are done */
>> +	if (*current_ext ==  XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK))
>> +		*done = 1;
>> +
>> +del_cursor:
>> +	if (cur) {
>> +		if (!error)
>> +			cur->bc_private.b.allocated = 0;
>> +		 xfs_btree_del_cursor(cur,
>> +				error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
>> +	}
>> +
>> +	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE | XFS_ILOG_DEXT);
>> +
>> +	return error;
>> +}
>> diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
>> index 1cf1292..f923734 100644
>> --- a/fs/xfs/xfs_bmap.h
>> +++ b/fs/xfs/xfs_bmap.h
>> @@ -204,6 +204,9 @@ int	xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode
>> *ip,
>>  int	xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
>>  		xfs_extnum_t num);
>>  uint	xfs_default_attroffset(struct xfs_inode *ip);
>> +int	xfs_update_logical(xfs_trans_t *tp, struct xfs_inode *ip, int *done,
>> +		xfs_fileoff_t start_fsb, xfs_fileoff_t shift,
>> +		xfs_extnum_t *current_ext);
>>
>>  #ifdef __KERNEL__
>>  /* bmap to userspace formatter - copy to user & advance pointer */
>> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
>> index de3dc98..7d871bc 100644
>> --- a/fs/xfs/xfs_file.c
>> +++ b/fs/xfs/xfs_file.c
>> @@ -817,7 +817,12 @@ xfs_file_fallocate(
>>  	int		cmd = XFS_IOC_RESVSP;
>>  	int		attr_flags = XFS_ATTR_NOLOCK;
>>
>> -	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
>> +	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
>> +		     FALLOC_FL_COLLAPSE_RANGE))
>> +		return -EOPNOTSUPP;
>> +
>> +	/* not just yet for rt inodes */
>> +	if ((mode & FALLOC_FL_COLLAPSE_RANGE) && XFS_IS_REALTIME_INODE(ip))
>>  		return -EOPNOTSUPP;
>>
>>  	bf.l_whence = 0;
>> @@ -826,11 +831,11 @@ xfs_file_fallocate(
>>
>>  	xfs_ilock(ip, XFS_IOLOCK_EXCL);
>>
>> -	if (mode & FALLOC_FL_PUNCH_HOLE)
>> +	if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE))
>>  		cmd = XFS_IOC_UNRESVSP;
>>
>>  	/* check the new inode size is valid before allocating */
>> -	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
>> +	if (!(mode & (FALLOC_FL_KEEP_SIZE | FALLOC_FL_COLLAPSE_RANGE)) &&
>>  	    offset + len > i_size_read(inode)) {
>>  		new_size = offset + len;
>>  		error = inode_newsize_ok(inode, new_size);
>> @@ -845,6 +850,21 @@ xfs_file_fallocate(
>>  	if (error)
>>  		goto out_unlock;
>>
>> +	if (mode & FALLOC_FL_COLLAPSE_RANGE) {
>> +		error = -xfs_collapse_file_space(ip, offset + len, len);
>> +		if (error || offset >= i_size_read(inode))
>> +			goto out_unlock;
>> +
>> +		/* Shrink size in case of FALLOC_FL_COLLAPSE_RANGE */
>> +		if ((offset + len) > i_size_read(inode))
>> +			new_size = offset;
>> +		else
>> +			new_size = i_size_read(inode) - len;
>> +		error = -xfs_update_file_size(ip, new_size);
>> +
>> +		goto out_unlock;
>> +	}
>> +
>
> Since you're not implementing an XFS_IOC_COLLAPSE_RANGE flag for the
> xfs_change_file_space interface, it might be cleaner not to use change_space
> at
> all... e.g. call xfs_free_file_space directly from xfs_collapse_file_space
> or
> in the above conditional (making the conditional exclusive with the
> change_space call).
>
> Alternatively, you could implement this fully through
> xfs_change_file_space.
>
> Either way I think it would be best for it to be all or nothing with respect
> to
> the xfs_change_file_space interface, and my preference is that you
> implement
> this through xfs_collapse_file_space interface just as the rest of these
> operations are implemented in xfs.
OK, we will try to call xfs_change_file_space interface from within
collapse space.
All or nothing approach makes sense, but its just for avoiding code
duplicacy we decided to uses XFS_IOC_UNRESVSP.

>
>>  	/* Change file size if needed */
>>  	if (new_size) {
>>  		struct iattr iattr;
>> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
>> index 96dda62..16b20f1 100644
>> --- a/fs/xfs/xfs_iops.c
>> +++ b/fs/xfs/xfs_iops.c
>> @@ -1236,3 +1236,38 @@ xfs_setup_inode(
>>
>>  	unlock_new_inode(inode);
>>  }
>> +
>> +/*
>> + * xfs_update_file_size()
>> + *	Just a simple disk size and time update
>> + */
>> +int
>> +xfs_update_file_size(
>> +	struct xfs_inode        *ip,
>> +	loff_t			newsize)
>> +{
>> +	xfs_mount_t		*mp = ip->i_mount;
>> +	struct inode		*inode = VFS_I(ip);
>> +	struct xfs_trans	*tp;
>> +	int			error;
>> +
>> +	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
>> +
>> +	error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
>> +	if (error)
>> +		return error;
>> +
>> +	xfs_trans_ijoin(tp, ip, 0);
>> +	truncate_setsize(inode, newsize);
>> +	ip->i_d.di_size = newsize;
>> +
>> +	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
>> +
>> +	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
>> +
>> +	if (mp->m_flags & XFS_MOUNT_WSYNC)
>> +		xfs_trans_set_sync(tp);
>> +
>> +	return xfs_trans_commit(tp, 0);
>> +
>> +}
>
> Did you consider xfs_setattr_size for this?
Yeah we did, but it will also truncate any preallocated extent beyond EOF.

Thanks for review!
>
> Thanks,
> 	Ben
>
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig - Aug. 1, 2013, 8:24 a.m.
Instead of adding more mess to change_file_space it might be a good idea
to pull my

  "refactor the preallocation and hole punching code"

series from December in first.

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Chinner - Aug. 2, 2013, 2:44 a.m.
On Thu, Aug 01, 2013 at 01:24:02AM -0700, Christoph Hellwig wrote:
> Instead of adding more mess to change_file_space it might be a good idea
> to pull my
> 
>   "refactor the preallocation and hole punching code"
> 
> series from December in first.

You mean this one:

http://oss.sgi.com/archives/xfs/2012-12/msg00124.html

Yeah, probably makes sense to do this. I'll have a look at porting
it forwards to my current tree as xfs_vnodeops.c has gone away in
that series...

Cheers,

Dave.
Dave Chinner - Aug. 2, 2013, 3:47 a.m.
On Thu, Aug 01, 2013 at 02:33:09PM +0900, Namjae Jeon wrote:
> 2013/8/1, Dave Chinner <david@fromorbit.com>:
> > On Wed, Jul 31, 2013 at 11:42:14PM +0900, Namjae Jeon wrote:
> >> From: Namjae Jeon <namjae.jeon@samsung.com>
> >>
> >> New fallocate flag FALLOC_FL_COLLAPSE_RANGE implementation for XFS.
> >
> Hi Dave.
> > A good start, but there's lots of work needed to make this safe for
> > general use.
> First, Thanks for your advice and help.
> 
> >
> >> Signed-off-by: Namjae Jeon <namjae.jeon@samsung.com>
> >> Signed-off-by: Ashish Sangwan <a.sangwan@samsung.com>
> >> ---
> >>  fs/xfs/xfs_bmap.c     |   92
> >> +++++++++++++++++++++++++++++++++++++++++++++++++
> >>  fs/xfs/xfs_bmap.h     |    3 ++
> >>  fs/xfs/xfs_file.c     |   26 ++++++++++++--
> >>  fs/xfs/xfs_iops.c     |   35 +++++++++++++++++++
> >>  fs/xfs/xfs_vnodeops.c |   62 +++++++++++++++++++++++++++++++++
> >>  fs/xfs/xfs_vnodeops.h |    2 ++
> >>  6 files changed, 217 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
> >> index 05c698c..ae677b1 100644
> >> --- a/fs/xfs/xfs_bmap.c
> >> +++ b/fs/xfs/xfs_bmap.c
> >> @@ -6145,3 +6145,95 @@ next_block:
> >>
> >>  	return error;
> >>  }
> >> +
> >> +/*
> >> + * xfs_update_logical()
> >> + *	Updates starting logical block of extents by subtracting
> >> + *	shift from them. At max XFS_LINEAR_EXTS number extents
> >> + *	will be updated and *current_ext is the extent number which
> >> + *	is currently being updated.
> >
> > Single space after the "*" in the comments. Also, format them out to
> > 80 columns.
> >
> Single space after * followed by a tab gives checkpath warning:
> WARNING: please, no space before tabs
> #29: FILE: fs/xfs/xfs_bmap.c:6151:
> + * ^IUpdates starting logical block of extents by subtracting$

Get rid of the tabs altogether. The comment doesn't nee dthe
function name in it, and it doesn't need indenting. i.e.:

/*
 * Update the starting logical block of extents by subtracting shift from them.
 * At max XFS_LINEAR_EXTS number extents will be updated and *current_ext is
 * the extent number which is currently being updated.
 */


> > 	struct xfs_mount	*mp = ip->i_mount;
> >
> >> +
> >> +	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
> >> +		(error = xfs_iread_extents(tp, ip, XFS_DATA_FORK)))
> >> +		return error;
> >
> > Hmmmm - that rings alarm bells.
> ok, we will remove the alarm.

The reason it rings alarm bells is that this indicates that we have
an extent manipulation function being done without any of the usual
checks that the inode is in a format that we can do this
operation, or that it is correctly locked, or that the filesyste is
not shut down, etc. Have a look at all the other functions that call
xfs_iread_extents() in xfs_bmap.c....


> >> +
> >> +	if (!*current_ext) {
> >> +		gotp = xfs_iext_bno_to_ext(ifp, start_fsb, current_ext);
> >> +		/*
> >> +		 * gotp can be null in 2 cases: 1) if there are no extents
> >> +		 * or 2) start_fsb lies in a hole beyond which there are
> >> +		 * no extents. Either way, we are done.
> >> +		 */
> >> +		if (!gotp) {
> >> +			*done = 1;
> >> +			return 0;
> >> +		}
> >> +	}
> >
> > So, you do a lookup on an extent index, which returns a incore
> > extent record.
> >
> >> +
> >> +	if (ifp->if_flags & XFS_IFBROOT)
> >> +		cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
> >> +	else
> >> +		cur = NULL;
> >> +
> >> +	while (nexts++ < XFS_LINEAR_EXTS &&
> >
> > What has XFS_LINEAR_EXTS got to do with how many extents can be moved
> > in a single transaction at a time?
> We are also not sure how many extents to move in 1 transaction.
> xfs_bunmapi deletes 3 extents in 1 iteration.

No, it removes as many as the caller asks it to. The caller defines
that number because the caller is the one that reserved the
transaction space for the removal.

> But deletion and updation are 2 different tasks.

Yes, they are.

> BTW, we tested this patch with 10,000 extents and it is working fine.
> Could you help us out here? What number would be appropriate?

How many btree modifications can a record update do? Can it cause a
btree split? If it can cause a btree split, then we nee dto do
allocation and reserve enough blocks in the transaction reservation
for the split.

So, if we can split the bmbt, then allocation can cause the free
space btrees to split as well, so we need to reserve blocks for
those tree splits, too.

And if each record update can potentially cause a bmbt split and
multiple free space btree splits, then how many records should we
update in a single transaction? IOWs, it's no different to hole
punching, truncation, *allocation*, etc.

You used the write transaction reservation, which is defined as:

/*
 * In a write transaction we can allocate a maximum of 2
 * extents.  This gives:
 *    the inode getting the new extents: inode size
 *    the inode's bmap btree: max depth * block size
 *    the agfs of the ags from which the extents are allocated: 2 * sector
 *    the superblock free block counter: sector size
 *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
 * And the bmap_finish transaction can free bmap blocks in a join:
 *    the agfs of the ags containing the blocks: 2 * sector size
 *    the agfls of the ags containing the blocks: 2 * sector size
 *    the super block free block counter: sector size
 *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
 */


And that defines exactly how many bmbt updates you are
allowed to do in that transaction: 2 extents.

> >
> >> +				goto del_cursor;
> >> +			XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
> >> +		}
> >> +		startoff -= shift;
> >> +		xfs_bmbt_set_startoff(gotp, startoff);
> >
> > So, we've looked up and extent, and reduced it's offset by shift.
> >
> >> +		if (cur) {
> >> +			error = xfs_bmbt_update(cur, startoff,
> >> +						xfs_bmbt_get_startblock(gotp),
> >> +						xfs_bmbt_get_blockcount(gotp),
> >> +						xfs_bmbt_get_state(gotp));
> >> +			if (error)
> >> +				goto del_cursor;
> >
> > And then we update the btree record in place.
> >
> > Ok, major alarm bells are going off at this point. Is the record
> > still in the right place? Where did you validate that the shift
> > downwards didn't overlap the previous extent in the tree?
> >
> > What happens if the start or end of the range to be shifted partially
> > overlaps an extent? The shift can only be done into a hole in the
> > file offset, so any attempt to shift downwards that overlaps an
> > existing extent is immediately invalid and indicates something is
> > wrong (pssobily corruption).
> >
> > And if the end of the range being shifted partially overlaps an
> > extent, then that extent needs to be split - it requires two records
> > in the tree to track the part that was shifted and the part that was
> > not.
> Your concern is correct.
> There can be 2 approach of updating the extent tree.
> 
> Approach1: (The way truncate works): We start from the last allocated
> extent and move towards the last offset which was punched out.
> It mean start updating extents from right side and move towards left
> and update btree in between
> Approach2: We start from the extent just after the punched area and
> move towards the last extent of the file.

Yes, I can see *how* you are doing the shifting - I don't need you
to explain that. What I'm pointing out is that you are not
checking that it is safe to make this modification. i.e. where do
you guarantee that there is a hole in the location that you are
shifting the extents into?

FWIW, you don't check to see if the first shifted extent can be
merged with the extent that is to the left of it as a result of the
shift. If it is adjacent and contiguous and the result is a single
extent that doesn'te xceed the maximum supported by the filesystem,
then they should be merged and the old record removed.

> >> +	if (cur) {
> >> +		if (!error)
> >> +			cur->bc_private.b.allocated = 0;
> >
> > Um, why?
> Okay, will remove.

I'm asking you to explain why you put it there. Don't remove code
that might be necessary just because a hard question was asked....

> >> @@ -845,6 +850,21 @@ xfs_file_fallocate(
> >>  	if (error)
> >>  		goto out_unlock;
> >>
> >> +	if (mode & FALLOC_FL_COLLAPSE_RANGE) {
> >> +		error = -xfs_collapse_file_space(ip, offset + len, len);
> >> +		if (error || offset >= i_size_read(inode))
> >> +			goto out_unlock;
> >> +
> >> +		/* Shrink size in case of FALLOC_FL_COLLAPSE_RANGE */
> >> +		if ((offset + len) > i_size_read(inode))
> >> +			new_size = offset;
> >> +		else
> >> +			new_size = i_size_read(inode) - len;
> >> +		error = -xfs_update_file_size(ip, new_size);
> >> +
> >> +		goto out_unlock;
> >> +	}
> >
> > This needs to vector through xfs_change_file_space() - it already
> > has code for doing offset/range validity checks, attaching
> > appropriate dquots for accounting, post-op file size and timestamp
> > updates, etc.
> It already is going through xfs_change_file_space(). Check this=>

No it isn't - this is calling xfs_collapse_file_space from
xfs_file_fallocate(). That is not going through
xfs_change_file_space().

Oh, I see now, this hunk is *after* the xfs_change_file_space()
call.  That's nasty - it's a layering violation, pure and simple.

No wonder I thought that no hole punching was done, it's triggered
by a non-obvious flag set and so hidden three layers away from the
xfs_collapse_file_space() call that acts on the hole that was
punched.  This code *must* go through the same code paths as the
other fallocate operations so it is obvious how it interacts with
everything.

> >> +
> >> +/*
> >> + * xfs_update_file_size()
> >> + *	Just a simple disk size and time update
> >> + */
> >> +int
> >> +xfs_update_file_size(
> >> +	struct xfs_inode        *ip,
> >> +	loff_t			newsize)
> >
> > This function should be nuked from orbit. I stopped looking at it
> > when the bug count got past 5. If you use xfs_change_file_space,
> > it's not necessary, either.
> we are using xfs_change_file_space(). See my comment above. :)

Yes, badly. See my comment above. :)

> But, xfs_change_file_space does not change logical file size.
> Neither can we use xfs_setattr, because it will truncate the
> preallocated extents beyond EOF.

And the problem with that is?

Seriously, if you are chopping chunks out of a file and moving
extents around in it, you aren't going to be writing to it while
that is happening. For NLE workflows, this manipulation happens
after the entire stream is written. If you collapse the range while
the video is being written, you are going to end up with big
chunks of zeroes in the file as you the write() has a file position
way beyond the new EOF....

> >> diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
> >> index dc730ac..95b46e7 100644
> >> --- a/fs/xfs/xfs_vnodeops.c
> >> +++ b/fs/xfs/xfs_vnodeops.c
> >> @@ -1868,3 +1868,65 @@ xfs_change_file_space(
> >>  		xfs_trans_set_sync(tp);
> >>  	return xfs_trans_commit(tp, 0);
> >>  }
> >> +
> >> +/*
> >> + * xfs_collapse_file_space()
> >> + *	This implements the fallocate collapse range functionlaity.
> >> + *	It removes the hole from file by shifting all the extents which
> >> + *	lies beyond start block.
> >> + */
> >> +int
> >> +xfs_collapse_file_space(
> >> +	xfs_inode_t	*ip,
> >> +	loff_t		start,
> >> +	loff_t		shift)
> >> +{
> >> +	int		done = 0;
> >> +	xfs_mount_t	*mp = ip->i_mount;
> >> +	uint		resblks;
> >> +	xfs_trans_t	*tp;
> >> +	int		error = 0;
> >> +	xfs_extnum_t	current_ext = 0;
> >> +	xfs_fileoff_t	start_fsb = XFS_B_TO_FSB(mp, start);
> >> +	xfs_fileoff_t	shift_fsb = XFS_B_TO_FSB(mp, shift);
> >> +	resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
> >> +
> >> +	while (!error && !done) {
> >> +		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
> >> +		tp->t_flags |= XFS_TRANS_RESERVE;
> >> +		error = xfs_trans_reserve(tp, resblks, XFS_WRITE_LOG_RES(mp),
> >> +					  0, XFS_TRANS_PERM_LOG_RES,
> >> +					  XFS_WRITE_LOG_COUNT);
> >
> > Why a permanent log reservation for a single, non-nested transaction?
> XFS transaction log reservation code is becoming our major problem.
> Could you suggest a proper way?

Permanent log transactions are only needed for transactions that
commit multiple times between reservations. You are doing:

	tp = alloc()
	reserve(tp, XFS_WRITE_LOG_RES, XFS_TRANS_PERM_LOG_RES, XFS_WRITE_LOG_COUNT)
	commit(tp, XFS_TRANS_RELEASE_LOG_RES)

It's a single transaction. Permanent log transactions are used for
multi-stage, rolling transactions that can be broken up into
multiple atomic operations, so as freeing multiple extents from a
file:

	tp = alloc()
	reserve(tp, XFS_WRITE_LOG_RES, XFS_TRANS_PERM_LOG_RES, XFS_WRITE_LOG_COUNT)
	.....
	tp2 = dup(tp)
	commit(tp)
	reserve(tp2, XFS_WRITE_LOG_RES, XFS_TRANS_PERM_LOG_RES, XFS_WRITE_LOG_COUNT)
	....
	commit(tp2, XFS_TRANS_PERM_LOG_RES).


The dup/reserve/commit loop keeps the same transaction context
across the whole operation and allows them to make continual forward
progress.

Hmmmm. In looking at this, I notice the update case that uses a
btree cursor doesn't have an the flist/firstblock initialised.
That'll cause an oops if a block is ever allocated or freed in a
record update. That would also indicate that the above does indeed
need a permanent log transaction and probably needs to be structured
similar to xfs_itruncate_extents with
xfs_bmap_init/xfs_bmap_finish() and a rolling transaction just in
case we end up modifying the btree.

Cheers,

Dave.
NamJae Jeon - Aug. 4, 2013, 8:29 a.m.
>
>> >> +	if (cur) {
>> >> +		if (!error)
>> >> +			cur->bc_private.b.allocated = 0;
>> >
>> > Um, why?
>> Okay, will remove.
>
> I'm asking you to explain why you put it there. Don't remove code
> that might be necessary just because a hard question was asked....
We copied this code from xfs_bunmapi. And we realize that why it is
there because xfs_bunmapi may split the extents, which could require
block allocation for btree, but I think that is not necessary here
because updating starting offsets of extents would not reqire a btree
split and allocation.
>
>> >> @@ -845,6 +850,21 @@ xfs_file_fallocate(
>> >>  	if (error)
>> >>  		goto out_unlock;
>> >>
>> >> +	if (mode & FALLOC_FL_COLLAPSE_RANGE) {
>> >> +		error = -xfs_collapse_file_space(ip, offset + len, len);
>> >> +		if (error || offset >= i_size_read(inode))
>> >> +			goto out_unlock;
>> >> +
>> >> +		/* Shrink size in case of FALLOC_FL_COLLAPSE_RANGE */
>> >> +		if ((offset + len) > i_size_read(inode))
>> >> +			new_size = offset;
>> >> +		else
>> >> +			new_size = i_size_read(inode) - len;
>> >> +		error = -xfs_update_file_size(ip, new_size);
>> >> +
>> >> +		goto out_unlock;
>> >> +	}
>> >
>> > This needs to vector through xfs_change_file_space() - it already
>> > has code for doing offset/range validity checks, attaching
>> > appropriate dquots for accounting, post-op file size and timestamp
>> > updates, etc.
>> It already is going through xfs_change_file_space(). Check this=>
>
> No it isn't - this is calling xfs_collapse_file_space from
> xfs_file_fallocate(). That is not going through
> xfs_change_file_space().
>
> Oh, I see now, this hunk is *after* the xfs_change_file_space()
> call.  That's nasty - it's a layering violation, pure and simple.
>
> No wonder I thought that no hole punching was done, it's triggered
> by a non-obvious flag set and so hidden three layers away from the
> xfs_collapse_file_space() call that acts on the hole that was
> punched.  This code *must* go through the same code paths as the
> other fallocate operations so it is obvious how it interacts with
> everything.
>
>> >> +
>> >> +/*
>> >> + * xfs_update_file_size()
>> >> + *	Just a simple disk size and time update
>> >> + */
>> >> +int
>> >> +xfs_update_file_size(
>> >> +	struct xfs_inode        *ip,
>> >> +	loff_t			newsize)
>> >
>> > This function should be nuked from orbit. I stopped looking at it
>> > when the bug count got past 5. If you use xfs_change_file_space,
>> > it's not necessary, either.
>> we are using xfs_change_file_space(). See my comment above. :)
>
> Yes, badly. See my comment above. :)
>
>> But, xfs_change_file_space does not change logical file size.
>> Neither can we use xfs_setattr, because it will truncate the
>> preallocated extents beyond EOF.
>
> And the problem with that is?
>
> Seriously, if you are chopping chunks out of a file and moving
> extents around in it, you aren't going to be writing to it while
> that is happening. For NLE workflows, this manipulation happens
> after the entire stream is written. If you collapse the range while
> the video is being written, you are going to end up with big
> chunks of zeroes in the file as you the write() has a file position
> way beyond the new EOF....
>
>> >> diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
>> >> index dc730ac..95b46e7 100644
>> >> --- a/fs/xfs/xfs_vnodeops.c
>> >> +++ b/fs/xfs/xfs_vnodeops.c
>> >> @@ -1868,3 +1868,65 @@ xfs_change_file_space(
>> >>  		xfs_trans_set_sync(tp);
>> >>  	return xfs_trans_commit(tp, 0);
>> >>  }
>> >> +
>> >> +/*
>> >> + * xfs_collapse_file_space()
>> >> + *	This implements the fallocate collapse range functionlaity.
>> >> + *	It removes the hole from file by shifting all the extents which
>> >> + *	lies beyond start block.
>> >> + */
>> >> +int
>> >> +xfs_collapse_file_space(
>> >> +	xfs_inode_t	*ip,
>> >> +	loff_t		start,
>> >> +	loff_t		shift)
>> >> +{
>> >> +	int		done = 0;
>> >> +	xfs_mount_t	*mp = ip->i_mount;
>> >> +	uint		resblks;
>> >> +	xfs_trans_t	*tp;
>> >> +	int		error = 0;
>> >> +	xfs_extnum_t	current_ext = 0;
>> >> +	xfs_fileoff_t	start_fsb = XFS_B_TO_FSB(mp, start);
>> >> +	xfs_fileoff_t	shift_fsb = XFS_B_TO_FSB(mp, shift);
>> >> +	resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
>> >> +
>> >> +	while (!error && !done) {
>> >> +		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
>> >> +		tp->t_flags |= XFS_TRANS_RESERVE;
>> >> +		error = xfs_trans_reserve(tp, resblks, XFS_WRITE_LOG_RES(mp),
>> >> +					  0, XFS_TRANS_PERM_LOG_RES,
>> >> +					  XFS_WRITE_LOG_COUNT);
>> >
>> > Why a permanent log reservation for a single, non-nested transaction?
>> XFS transaction log reservation code is becoming our major problem.
>> Could you suggest a proper way?
>
> Permanent log transactions are only needed for transactions that
> commit multiple times between reservations. You are doing:
>
> 	tp = alloc()
> 	reserve(tp, XFS_WRITE_LOG_RES, XFS_TRANS_PERM_LOG_RES,
> XFS_WRITE_LOG_COUNT)
> 	commit(tp, XFS_TRANS_RELEASE_LOG_RES)
>
> It's a single transaction. Permanent log transactions are used for
> multi-stage, rolling transactions that can be broken up into
> multiple atomic operations, so as freeing multiple extents from a
> file:
>
> 	tp = alloc()
> 	reserve(tp, XFS_WRITE_LOG_RES, XFS_TRANS_PERM_LOG_RES,
> XFS_WRITE_LOG_COUNT)
> 	.....
> 	tp2 = dup(tp)
> 	commit(tp)
> 	reserve(tp2, XFS_WRITE_LOG_RES, XFS_TRANS_PERM_LOG_RES,
> XFS_WRITE_LOG_COUNT)
> 	....
> 	commit(tp2, XFS_TRANS_PERM_LOG_RES).
>
>
> The dup/reserve/commit loop keeps the same transaction context
> across the whole operation and allows them to make continual forward
> progress.
>
> Hmmmm. In looking at this, I notice the update case that uses a
> btree cursor doesn't have an the flist/firstblock initialised.
> That'll cause an oops if a block is ever allocated or freed in a
> record update. That would also indicate that the above does indeed
> need a permanent log transaction and probably needs to be structured
> similar to xfs_itruncate_extents with
> xfs_bmap_init/xfs_bmap_finish() and a rolling transaction just in
> case we end up modifying the btree.
Ok, we noted all your points and understand that a lot of work is
really needed to make it stable. we will try to implement them in next
version of patch. Really thanks for your time and help.

>
> Cheers,
>
> Dave.
> --
> Dave Chinner
> david@fromorbit.com
>
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 05c698c..ae677b1 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -6145,3 +6145,95 @@  next_block:
 
 	return error;
 }
+
+/*
+ * xfs_update_logical()
+ *	Updates starting logical block of extents by subtracting
+ *	shift from them. At max XFS_LINEAR_EXTS number extents
+ *	will be updated and *current_ext is the extent number which
+ *	is currently being updated.
+ */
+int
+xfs_update_logical(
+	xfs_trans_t		*tp,
+	struct xfs_inode	*ip,
+	int			*done,
+	xfs_fileoff_t		start_fsb,
+	xfs_fileoff_t		shift,
+	xfs_extnum_t		*current_ext)
+{
+	xfs_btree_cur_t		*cur;
+	xfs_bmbt_rec_host_t	*gotp;
+	xfs_mount_t		*mp;
+	xfs_ifork_t		*ifp;
+	xfs_extnum_t		nexts = 0;
+	xfs_fileoff_t		startoff;
+	int			error = 0;
+	int			i;
+
+	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+	mp = ip->i_mount;
+
+	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+		(error = xfs_iread_extents(tp, ip, XFS_DATA_FORK)))
+		return error;
+
+	if (!*current_ext) {
+		gotp = xfs_iext_bno_to_ext(ifp, start_fsb, current_ext);
+		/*
+		 * gotp can be null in 2 cases: 1) if there are no extents
+		 * or 2) start_fsb lies in a hole beyond which there are
+		 * no extents. Either way, we are done.
+		 */
+		if (!gotp) {
+			*done = 1;
+			return 0;
+		}
+	}
+
+	if (ifp->if_flags & XFS_IFBROOT)
+		cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
+	else
+		cur = NULL;
+
+	while (nexts++ < XFS_LINEAR_EXTS &&
+	       *current_ext <  XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK)) {
+		gotp = xfs_iext_get_ext(ifp, (*current_ext)++);
+		startoff = xfs_bmbt_get_startoff(gotp);
+		if (cur) {
+			if ((error = xfs_bmbt_lookup_eq(cur,
+					startoff,
+					xfs_bmbt_get_startblock(gotp),
+					xfs_bmbt_get_blockcount(gotp),
+					&i)))
+				goto del_cursor;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+		}
+		startoff -= shift;
+		xfs_bmbt_set_startoff(gotp, startoff);
+		if (cur) {
+			error = xfs_bmbt_update(cur, startoff,
+						xfs_bmbt_get_startblock(gotp),
+						xfs_bmbt_get_blockcount(gotp),
+						xfs_bmbt_get_state(gotp));
+			if (error)
+				goto del_cursor;
+		}
+	}
+
+	/* Check if we are done */
+	if (*current_ext ==  XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK))
+		*done = 1;
+
+del_cursor:
+	if (cur) {
+		if (!error)
+			cur->bc_private.b.allocated = 0;
+		 xfs_btree_del_cursor(cur,
+				error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+	}
+
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE | XFS_ILOG_DEXT);
+
+	return error;
+}
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 1cf1292..f923734 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -204,6 +204,9 @@  int	xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
 int	xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
 		xfs_extnum_t num);
 uint	xfs_default_attroffset(struct xfs_inode *ip);
+int	xfs_update_logical(xfs_trans_t *tp, struct xfs_inode *ip, int *done,
+		xfs_fileoff_t start_fsb, xfs_fileoff_t shift,
+		xfs_extnum_t *current_ext);
 
 #ifdef __KERNEL__
 /* bmap to userspace formatter - copy to user & advance pointer */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index de3dc98..7d871bc 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -817,7 +817,12 @@  xfs_file_fallocate(
 	int		cmd = XFS_IOC_RESVSP;
 	int		attr_flags = XFS_ATTR_NOLOCK;
 
-	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+		     FALLOC_FL_COLLAPSE_RANGE))
+		return -EOPNOTSUPP;
+
+	/* not just yet for rt inodes */
+	if ((mode & FALLOC_FL_COLLAPSE_RANGE) && XFS_IS_REALTIME_INODE(ip))
 		return -EOPNOTSUPP;
 
 	bf.l_whence = 0;
@@ -826,11 +831,11 @@  xfs_file_fallocate(
 
 	xfs_ilock(ip, XFS_IOLOCK_EXCL);
 
-	if (mode & FALLOC_FL_PUNCH_HOLE)
+	if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE))
 		cmd = XFS_IOC_UNRESVSP;
 
 	/* check the new inode size is valid before allocating */
-	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+	if (!(mode & (FALLOC_FL_KEEP_SIZE | FALLOC_FL_COLLAPSE_RANGE)) &&
 	    offset + len > i_size_read(inode)) {
 		new_size = offset + len;
 		error = inode_newsize_ok(inode, new_size);
@@ -845,6 +850,21 @@  xfs_file_fallocate(
 	if (error)
 		goto out_unlock;
 
+	if (mode & FALLOC_FL_COLLAPSE_RANGE) {
+		error = -xfs_collapse_file_space(ip, offset + len, len);
+		if (error || offset >= i_size_read(inode))
+			goto out_unlock;
+
+		/* Shrink size in case of FALLOC_FL_COLLAPSE_RANGE */
+		if ((offset + len) > i_size_read(inode))
+			new_size = offset;
+		else
+			new_size = i_size_read(inode) - len;
+		error = -xfs_update_file_size(ip, new_size);
+
+		goto out_unlock;
+	}
+
 	/* Change file size if needed */
 	if (new_size) {
 		struct iattr iattr;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 96dda62..16b20f1 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1236,3 +1236,38 @@  xfs_setup_inode(
 
 	unlock_new_inode(inode);
 }
+
+/*
+ * xfs_update_file_size()
+ *	Just a simple disk size and time update
+ */
+int
+xfs_update_file_size(
+	struct xfs_inode        *ip,
+	loff_t			newsize)
+{
+	xfs_mount_t		*mp = ip->i_mount;
+	struct inode		*inode = VFS_I(ip);
+	struct xfs_trans	*tp;
+	int			error;
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
+
+	error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
+	if (error)
+		return error;
+
+	xfs_trans_ijoin(tp, ip, 0);
+	truncate_setsize(inode, newsize);
+	ip->i_d.di_size = newsize;
+
+	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
+
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+	if (mp->m_flags & XFS_MOUNT_WSYNC)
+		xfs_trans_set_sync(tp);
+
+	return xfs_trans_commit(tp, 0);
+
+}
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index dc730ac..95b46e7 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1868,3 +1868,65 @@  xfs_change_file_space(
 		xfs_trans_set_sync(tp);
 	return xfs_trans_commit(tp, 0);
 }
+
+/*
+ * xfs_collapse_file_space()
+ *	This implements the fallocate collapse range functionlaity.
+ *	It removes the hole from file by shifting all the extents which
+ *	lies beyond start block.
+ */
+int
+xfs_collapse_file_space(
+	xfs_inode_t	*ip,
+	loff_t		start,
+	loff_t		shift)
+{
+	int		done = 0;
+	xfs_mount_t	*mp = ip->i_mount;
+	uint		resblks;
+	xfs_trans_t	*tp;
+	int		error = 0;
+	xfs_extnum_t	current_ext = 0;
+	xfs_fileoff_t	start_fsb = XFS_B_TO_FSB(mp, start);
+	xfs_fileoff_t	shift_fsb = XFS_B_TO_FSB(mp, shift);
+	resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+
+	while (!error && !done) {
+		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+		tp->t_flags |= XFS_TRANS_RESERVE;
+		error = xfs_trans_reserve(tp, resblks, XFS_WRITE_LOG_RES(mp),
+					  0, XFS_TRANS_PERM_LOG_RES,
+					  XFS_WRITE_LOG_COUNT);
+		if (error) {
+			ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+			xfs_trans_cancel(tp, 0);
+			break;
+		}
+
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+		error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
+						ip->i_gdquot, ip->i_pdquot,
+						resblks, 0,
+						XFS_QMOPT_RES_REGBLKS);
+		if (error)
+			goto out;
+
+		xfs_trans_ijoin(tp, ip, 0);
+
+		error = xfs_update_logical(tp, ip, &done, start_fsb,
+					   shift_fsb, &current_ext);
+		if (error)
+			goto out;
+
+		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	}
+
+	return error;
+
+out:
+	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return error;
+}
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 38c67c3..a9684c9 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -51,5 +51,7 @@  int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
 int xfs_iozero(struct xfs_inode *, loff_t, size_t);
 int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
 int xfs_free_eofblocks(struct xfs_mount *, struct xfs_inode *, bool);
+int xfs_collapse_file_space(struct xfs_inode *, loff_t, loff_t);
+int xfs_update_file_size(struct xfs_inode *, loff_t);
 
 #endif /* _XFS_VNODEOPS_H */