diff mbox

[CFT] Saner error handling in skb_copy_datagram_iter() et.al.

Message ID 20170218000214.GA19777@ZenIV.linux.org.uk
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

Al Viro Feb. 18, 2017, 12:02 a.m. UTC
On Fri, Feb 17, 2017 at 05:03:15PM +0000, Al Viro wrote:
> On Fri, Feb 17, 2017 at 10:54:20AM -0500, David Miller wrote:
> > From: Al Viro <viro@ZenIV.linux.org.uk>
> > Date: Tue, 14 Feb 2017 01:33:06 +0000
> > 
> > > OK...  Remaining interesting question is whether it adds a noticeable
> > > overhead.  Could somebody try it on assorted benchmarks and see if
> > > it slows the things down?  The patch in question follows:
> > 
> > That's about a 40 byte copy onto the stack for each invocation of this
> > thing.  You can benchmark all you want, but it's clear that this is
> > non-trivial amount of work and will take some operations from fitting
> > in the cache to not doing so for sure.
> 
> In principle, that could be reduced a bit (32 bytes - ->type is never
> changed, so we don't need to restore it), but that's not much of an improvement...

Actually, I've a better solution.  Namely, analogue of iov_iter_advance()
for going backwards.  The restriction is that you should never unroll
further than where you've initially started *or* have the iovec, etc.
array modified under you.  For iovec/kvec/bio_vec it's trivial, for pipe -
a bit more convoluted, but still doable.  Then net/core/datagram.c stuff
could simply use iov_iter_unroll() in case of error - all we need to keep
track of is how much had we copied and that's easy to do.

The patch below is completely untested, but if it works it should avoid
buggering the fast paths at all, still giving the same semantics re
reverting ->msg_iter both on EINVAL and EFAULT.  Comments?

Comments

Christian Lamparter Feb. 19, 2017, 7:19 p.m. UTC | #1
On Saturday, February 18, 2017 12:02:14 AM CET Al Viro wrote:
> On Fri, Feb 17, 2017 at 05:03:15PM +0000, Al Viro wrote:
> > On Fri, Feb 17, 2017 at 10:54:20AM -0500, David Miller wrote:
> > > From: Al Viro <viro@ZenIV.linux.org.uk>
> > > Date: Tue, 14 Feb 2017 01:33:06 +0000
> > > 
> > > > OK...  Remaining interesting question is whether it adds a noticeable
> > > > overhead.  Could somebody try it on assorted benchmarks and see if
> > > > it slows the things down?  The patch in question follows:
> > > 
> > > That's about a 40 byte copy onto the stack for each invocation of this
> > > thing.  You can benchmark all you want, but it's clear that this is
> > > non-trivial amount of work and will take some operations from fitting
> > > in the cache to not doing so for sure.
> > 
> > In principle, that could be reduced a bit (32 bytes - ->type is never
> > changed, so we don't need to restore it), but that's not much of an improvement...
> 
> Actually, I've a better solution.  Namely, analogue of iov_iter_advance()
> for going backwards.  The restriction is that you should never unroll
> further than where you've initially started *or* have the iovec, etc.
> array modified under you.  For iovec/kvec/bio_vec it's trivial, for pipe -
> a bit more convoluted, but still doable.  Then net/core/datagram.c stuff
> could simply use iov_iter_unroll() in case of error - all we need to keep
> track of is how much had we copied and that's easy to do.
> 
> The patch below is completely untested, but if it works it should avoid
> buggering the fast paths at all, still giving the same semantics re
> reverting ->msg_iter both on EINVAL and EFAULT.  Comments?
I've tested it. It also fixes the corruption issue I can reproduce
with my setup:
# tc qdisc add dev eth0 root netem corrupt 0.1%
(and the dlbug code)

So, for what's worth:
Tested-by: Christian Lamparter <chunkeey@googlemail.com>

> diff --git a/include/linux/uio.h b/include/linux/uio.h
> index 804e34c6f981..237965348bc2 100644
> --- a/include/linux/uio.h
> +++ b/include/linux/uio.h
> @@ -39,7 +39,10 @@ struct iov_iter {
>  	};
>  	union {
>  		unsigned long nr_segs;
> -		int idx;
> +		struct {
> +			int idx;
> +			int start_idx;
> +		};
>  	};
>  };
>  
> @@ -81,6 +84,7 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to);
>  size_t iov_iter_copy_from_user_atomic(struct page *page,
>  		struct iov_iter *i, unsigned long offset, size_t bytes);
>  void iov_iter_advance(struct iov_iter *i, size_t bytes);
> +void iov_iter_unroll(struct iov_iter *i, size_t bytes);
>  int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes);
>  size_t iov_iter_single_seg_count(const struct iov_iter *i);
>  size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
> diff --git a/lib/iov_iter.c b/lib/iov_iter.c
> index e68604ae3ced..afdaca37f5c9 100644
> --- a/lib/iov_iter.c
> +++ b/lib/iov_iter.c
> @@ -786,6 +786,68 @@ void iov_iter_advance(struct iov_iter *i, size_t size)
>  }
>  EXPORT_SYMBOL(iov_iter_advance);
>  
> +void iov_iter_unroll(struct iov_iter *i, size_t unroll)
> +{
> +	if (!unroll)
> +		return;
> +	i->count += unroll;
> +	if (unlikely(i->type & ITER_PIPE)) {
> +		struct pipe_inode_info *pipe = i->pipe;
> +		int idx = i->idx;
> +		size_t off = i->iov_offset;
> +		while (1) {
> +			size_t n = off - pipe->bufs[idx].offset;
> +			if (unroll < n) {
> +				off -= (n - unroll);
> +				break;
> +			}
> +			unroll -= n;
> +			if (!unroll && idx == i->start_idx) {
> +				off = 0;
> +				break;
> +			}
> +			if (!idx--)
> +				idx = pipe->buffers - 1;
> +			off = pipe->bufs[idx].offset + pipe->bufs[idx].len;
> +		}
> +		i->iov_offset = off;
> +		i->idx = idx;
> +		pipe_truncate(i);
> +		return;
> +	}
> +	if (unroll <= i->iov_offset) {
> +		i->iov_offset -= unroll;
> +		return;
> +	}
> +	unroll -= i->iov_offset;
> +	if (i->type & ITER_BVEC) {
> +		const struct bio_vec *bvec = i->bvec;
> +		while (1) {
> +			size_t n = (--bvec)->bv_len;
> +			i->nr_segs++;
> +			if (unroll <= n) {
> +				i->bvec = bvec;
> +				i->iov_offset = n - unroll;
> +				return;
> +			}
> +			unroll -= n;
> +		}
> +	} else { /* same logics for iovec and kvec */
> +		const struct iovec *iov = i->iov;
> +		while (1) {
> +			size_t n = (--iov)->iov_len;
> +			i->nr_segs++;
> +			if (unroll <= n) {
> +				i->iov = iov;
> +				i->iov_offset = n - unroll;
> +				return;
> +			}
> +			unroll -= n;
> +		}
> +	}
> +}
> +EXPORT_SYMBOL(iov_iter_unroll);
> +
>  /*
>   * Return the count of just the current iov_iter segment.
>   */
> @@ -839,6 +901,7 @@ void iov_iter_pipe(struct iov_iter *i, int direction,
>  	i->idx = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
>  	i->iov_offset = 0;
>  	i->count = count;
> +	i->start_idx = i->idx;
>  }
>  EXPORT_SYMBOL(iov_iter_pipe);
>  
> diff --git a/net/core/datagram.c b/net/core/datagram.c
> index 662bea587165..63c7353a6ba2 100644
> --- a/net/core/datagram.c
> +++ b/net/core/datagram.c
> @@ -394,7 +394,7 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
>  			   struct iov_iter *to, int len)
>  {
>  	int start = skb_headlen(skb);
> -	int i, copy = start - offset;
> +	int i, copy = start - offset, start_off = offset, n;
>  	struct sk_buff *frag_iter;
>  
>  	trace_skb_copy_datagram_iovec(skb, len);
> @@ -403,11 +403,12 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
>  	if (copy > 0) {
>  		if (copy > len)
>  			copy = len;
> -		if (copy_to_iter(skb->data + offset, copy, to) != copy)
> +		n = copy_to_iter(skb->data + offset, copy, to);
> +		offset += n;
> +		if (n != copy)
>  			goto short_copy;
>  		if ((len -= copy) == 0)
>  			return 0;
> -		offset += copy;
>  	}
>  
>  	/* Copy paged appendix. Hmm... why does this look so complicated? */
> @@ -421,13 +422,14 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
>  		if ((copy = end - offset) > 0) {
>  			if (copy > len)
>  				copy = len;
> -			if (copy_page_to_iter(skb_frag_page(frag),
> +			n = copy_page_to_iter(skb_frag_page(frag),
>  					      frag->page_offset + offset -
> -					      start, copy, to) != copy)
> +					      start, copy, to);
> +			offset += n;
> +			if (n != copy)
>  				goto short_copy;
>  			if (!(len -= copy))
>  				return 0;
> -			offset += copy;
>  		}
>  		start = end;
>  	}
> @@ -459,6 +461,7 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
>  	 */
>  
>  fault:
> +	iov_iter_unroll(to, offset - start_off);
>  	return -EFAULT;
>  
>  short_copy:
> @@ -609,7 +612,7 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
>  				      __wsum *csump)
>  {
>  	int start = skb_headlen(skb);
> -	int i, copy = start - offset;
> +	int i, copy = start - offset, start_off = offset;
>  	struct sk_buff *frag_iter;
>  	int pos = 0;
>  	int n;
> @@ -619,11 +622,11 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
>  		if (copy > len)
>  			copy = len;
>  		n = csum_and_copy_to_iter(skb->data + offset, copy, csump, to);
> +		offset += n;
>  		if (n != copy)
>  			goto fault;
>  		if ((len -= copy) == 0)
>  			return 0;
> -		offset += copy;
>  		pos = copy;
>  	}
>  
> @@ -645,12 +648,12 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
>  						  offset - start, copy,
>  						  &csum2, to);
>  			kunmap(page);
> +			offset += n;
>  			if (n != copy)
>  				goto fault;
>  			*csump = csum_block_add(*csump, csum2, pos);
>  			if (!(len -= copy))
>  				return 0;
> -			offset += copy;
>  			pos += copy;
>  		}
>  		start = end;
> @@ -683,6 +686,7 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
>  		return 0;
>  
>  fault:
> +	iov_iter_unroll(to, offset - start_off);
>  	return -EFAULT;
>  }
>  
> @@ -767,6 +771,7 @@ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb,
>  	}
>  	return 0;
>  csum_error:
> +	iov_iter_unroll(&msg->msg_iter, chunk);
>  	return -EINVAL;
>  fault:
>  	return -EFAULT;
>
David Miller Feb. 20, 2017, 3:14 p.m. UTC | #2
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Sat, 18 Feb 2017 00:02:14 +0000

> On Fri, Feb 17, 2017 at 05:03:15PM +0000, Al Viro wrote:
>> On Fri, Feb 17, 2017 at 10:54:20AM -0500, David Miller wrote:
>> > From: Al Viro <viro@ZenIV.linux.org.uk>
>> > Date: Tue, 14 Feb 2017 01:33:06 +0000
>> > 
>> > > OK...  Remaining interesting question is whether it adds a noticeable
>> > > overhead.  Could somebody try it on assorted benchmarks and see if
>> > > it slows the things down?  The patch in question follows:
>> > 
>> > That's about a 40 byte copy onto the stack for each invocation of this
>> > thing.  You can benchmark all you want, but it's clear that this is
>> > non-trivial amount of work and will take some operations from fitting
>> > in the cache to not doing so for sure.
>> 
>> In principle, that could be reduced a bit (32 bytes - ->type is never
>> changed, so we don't need to restore it), but that's not much of an improvement...
> 
> Actually, I've a better solution.  Namely, analogue of iov_iter_advance()
> for going backwards.  The restriction is that you should never unroll
> further than where you've initially started *or* have the iovec, etc.
> array modified under you.  For iovec/kvec/bio_vec it's trivial, for pipe -
> a bit more convoluted, but still doable.  Then net/core/datagram.c stuff
> could simply use iov_iter_unroll() in case of error - all we need to keep
> track of is how much had we copied and that's easy to do.
> 
> The patch below is completely untested, but if it works it should avoid
> buggering the fast paths at all, still giving the same semantics re
> reverting ->msg_iter both on EINVAL and EFAULT.  Comments?

This looks a lot better to me.
David Laight Feb. 21, 2017, 1:25 p.m. UTC | #3
From: Al Viro
> Sent: 18 February 2017 00:02
...
> Actually, I've a better solution.  Namely, analogue of iov_iter_advance()
> for going backwards.  The restriction is that you should never unroll
> further than where you've initially started *or* have the iovec, etc.
> array modified under you
...
> +void iov_iter_unroll(struct iov_iter *i, size_t unroll)
...

I'm sure there is a better name, maybe iov_iter_backup()?

	David
diff mbox

Patch

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 804e34c6f981..237965348bc2 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -39,7 +39,10 @@  struct iov_iter {
 	};
 	union {
 		unsigned long nr_segs;
-		int idx;
+		struct {
+			int idx;
+			int start_idx;
+		};
 	};
 };
 
@@ -81,6 +84,7 @@  unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to);
 size_t iov_iter_copy_from_user_atomic(struct page *page,
 		struct iov_iter *i, unsigned long offset, size_t bytes);
 void iov_iter_advance(struct iov_iter *i, size_t bytes);
+void iov_iter_unroll(struct iov_iter *i, size_t bytes);
 int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes);
 size_t iov_iter_single_seg_count(const struct iov_iter *i);
 size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index e68604ae3ced..afdaca37f5c9 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -786,6 +786,68 @@  void iov_iter_advance(struct iov_iter *i, size_t size)
 }
 EXPORT_SYMBOL(iov_iter_advance);
 
+void iov_iter_unroll(struct iov_iter *i, size_t unroll)
+{
+	if (!unroll)
+		return;
+	i->count += unroll;
+	if (unlikely(i->type & ITER_PIPE)) {
+		struct pipe_inode_info *pipe = i->pipe;
+		int idx = i->idx;
+		size_t off = i->iov_offset;
+		while (1) {
+			size_t n = off - pipe->bufs[idx].offset;
+			if (unroll < n) {
+				off -= (n - unroll);
+				break;
+			}
+			unroll -= n;
+			if (!unroll && idx == i->start_idx) {
+				off = 0;
+				break;
+			}
+			if (!idx--)
+				idx = pipe->buffers - 1;
+			off = pipe->bufs[idx].offset + pipe->bufs[idx].len;
+		}
+		i->iov_offset = off;
+		i->idx = idx;
+		pipe_truncate(i);
+		return;
+	}
+	if (unroll <= i->iov_offset) {
+		i->iov_offset -= unroll;
+		return;
+	}
+	unroll -= i->iov_offset;
+	if (i->type & ITER_BVEC) {
+		const struct bio_vec *bvec = i->bvec;
+		while (1) {
+			size_t n = (--bvec)->bv_len;
+			i->nr_segs++;
+			if (unroll <= n) {
+				i->bvec = bvec;
+				i->iov_offset = n - unroll;
+				return;
+			}
+			unroll -= n;
+		}
+	} else { /* same logics for iovec and kvec */
+		const struct iovec *iov = i->iov;
+		while (1) {
+			size_t n = (--iov)->iov_len;
+			i->nr_segs++;
+			if (unroll <= n) {
+				i->iov = iov;
+				i->iov_offset = n - unroll;
+				return;
+			}
+			unroll -= n;
+		}
+	}
+}
+EXPORT_SYMBOL(iov_iter_unroll);
+
 /*
  * Return the count of just the current iov_iter segment.
  */
@@ -839,6 +901,7 @@  void iov_iter_pipe(struct iov_iter *i, int direction,
 	i->idx = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
 	i->iov_offset = 0;
 	i->count = count;
+	i->start_idx = i->idx;
 }
 EXPORT_SYMBOL(iov_iter_pipe);
 
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 662bea587165..63c7353a6ba2 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -394,7 +394,7 @@  int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
 			   struct iov_iter *to, int len)
 {
 	int start = skb_headlen(skb);
-	int i, copy = start - offset;
+	int i, copy = start - offset, start_off = offset, n;
 	struct sk_buff *frag_iter;
 
 	trace_skb_copy_datagram_iovec(skb, len);
@@ -403,11 +403,12 @@  int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
 	if (copy > 0) {
 		if (copy > len)
 			copy = len;
-		if (copy_to_iter(skb->data + offset, copy, to) != copy)
+		n = copy_to_iter(skb->data + offset, copy, to);
+		offset += n;
+		if (n != copy)
 			goto short_copy;
 		if ((len -= copy) == 0)
 			return 0;
-		offset += copy;
 	}
 
 	/* Copy paged appendix. Hmm... why does this look so complicated? */
@@ -421,13 +422,14 @@  int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
 		if ((copy = end - offset) > 0) {
 			if (copy > len)
 				copy = len;
-			if (copy_page_to_iter(skb_frag_page(frag),
+			n = copy_page_to_iter(skb_frag_page(frag),
 					      frag->page_offset + offset -
-					      start, copy, to) != copy)
+					      start, copy, to);
+			offset += n;
+			if (n != copy)
 				goto short_copy;
 			if (!(len -= copy))
 				return 0;
-			offset += copy;
 		}
 		start = end;
 	}
@@ -459,6 +461,7 @@  int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
 	 */
 
 fault:
+	iov_iter_unroll(to, offset - start_off);
 	return -EFAULT;
 
 short_copy:
@@ -609,7 +612,7 @@  static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
 				      __wsum *csump)
 {
 	int start = skb_headlen(skb);
-	int i, copy = start - offset;
+	int i, copy = start - offset, start_off = offset;
 	struct sk_buff *frag_iter;
 	int pos = 0;
 	int n;
@@ -619,11 +622,11 @@  static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
 		if (copy > len)
 			copy = len;
 		n = csum_and_copy_to_iter(skb->data + offset, copy, csump, to);
+		offset += n;
 		if (n != copy)
 			goto fault;
 		if ((len -= copy) == 0)
 			return 0;
-		offset += copy;
 		pos = copy;
 	}
 
@@ -645,12 +648,12 @@  static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
 						  offset - start, copy,
 						  &csum2, to);
 			kunmap(page);
+			offset += n;
 			if (n != copy)
 				goto fault;
 			*csump = csum_block_add(*csump, csum2, pos);
 			if (!(len -= copy))
 				return 0;
-			offset += copy;
 			pos += copy;
 		}
 		start = end;
@@ -683,6 +686,7 @@  static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
 		return 0;
 
 fault:
+	iov_iter_unroll(to, offset - start_off);
 	return -EFAULT;
 }
 
@@ -767,6 +771,7 @@  int skb_copy_and_csum_datagram_msg(struct sk_buff *skb,
 	}
 	return 0;
 csum_error:
+	iov_iter_unroll(&msg->msg_iter, chunk);
 	return -EINVAL;
 fault:
 	return -EFAULT;