diff mbox

[09/13] netvm: Set PF_MEMALLOC as appropriate during SKB processing

Message ID 1303803414-5937-10-git-send-email-mgorman@suse.de
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

Mel Gorman April 26, 2011, 7:36 a.m. UTC
In order to make sure pfmemalloc packets receive all memory needed to proceed,
ensure processing of pfmemalloc SKBs happens under PF_MEMALLOC. This is
limited to a subset of protocols that are expected to be used for writing
to swap. Taps are not allowed to use PF_MEMALLOC as these are expected to
communicate with userspace processes which could be paged out.

[a.p.zijlstra@chello.nl: Ideas taken from various patches]
[jslaby@suse.cz: Lock imbalance fix]
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 include/net/sock.h |    5 +++++
 net/core/dev.c     |   52 ++++++++++++++++++++++++++++++++++++++++++++++++----
 net/core/sock.c    |   16 ++++++++++++++++
 3 files changed, 69 insertions(+), 4 deletions(-)

Comments

NeilBrown April 26, 2011, 12:21 p.m. UTC | #1
On Tue, 26 Apr 2011 08:36:50 +0100 Mel Gorman <mgorman@suse.de> wrote:

> diff --git a/net/core/dev.c b/net/core/dev.c
> index 3871bf6..2d79a20 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3095,6 +3095,27 @@ static void vlan_on_bond_hook(struct sk_buff *skb)
>  	}
>  }
>  
> +/*
> + * Limit which protocols can use the PFMEMALLOC reserves to those that are
> + * expected to be used for communication with swap.
> + */
> +static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
> +{
> +	if (skb_pfmemalloc(skb))
> +		switch (skb->protocol) {
> +		case __constant_htons(ETH_P_ARP):
> +		case __constant_htons(ETH_P_IP):
> +		case __constant_htons(ETH_P_IPV6):
> +		case __constant_htons(ETH_P_8021Q):
> +			break;
> +
> +		default:
> +			return false;
> +		}
> +
> +	return true;
> +}

This sort of thing really bugs me :-)
Neither the comment nor the function name actually describe what the function
is doing.  The function is checking *2* things.
   is_pfmemalloc_skb_or_pfmemalloc_protocol()
might be a more correct name, but is too verbose.

I would prefer the skb_pfmemalloc test were removed from here and ....

> +	if (!skb_pfmemalloc_protocol(skb))
> +		goto drop;
> +

...added here so this becomes:

      if (!skb_pfmemalloc(skb) && !skb_pfmemalloc_protocol(skb))
                goto drop;

which actually makes sense.

Thanks,
NeilBrown

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mel Gorman April 26, 2011, 2:10 p.m. UTC | #2
On Tue, Apr 26, 2011 at 10:21:57PM +1000, NeilBrown wrote:
> On Tue, 26 Apr 2011 08:36:50 +0100 Mel Gorman <mgorman@suse.de> wrote:
> 
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index 3871bf6..2d79a20 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -3095,6 +3095,27 @@ static void vlan_on_bond_hook(struct sk_buff *skb)
> >  	}
> >  }
> >  
> > +/*
> > + * Limit which protocols can use the PFMEMALLOC reserves to those that are
> > + * expected to be used for communication with swap.
> > + */
> > +static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
> > +{
> > +	if (skb_pfmemalloc(skb))
> > +		switch (skb->protocol) {
> > +		case __constant_htons(ETH_P_ARP):
> > +		case __constant_htons(ETH_P_IP):
> > +		case __constant_htons(ETH_P_IPV6):
> > +		case __constant_htons(ETH_P_8021Q):
> > +			break;
> > +
> > +		default:
> > +			return false;
> > +		}
> > +
> > +	return true;
> > +}
> 
> This sort of thing really bugs me :-)
> Neither the comment nor the function name actually describe what the function
> is doing.  The function is checking *2* things.
>    is_pfmemalloc_skb_or_pfmemalloc_protocol()
> might be a more correct name, but is too verbose.
> 
> I would prefer the skb_pfmemalloc test were removed from here and ....
> 
> > +	if (!skb_pfmemalloc_protocol(skb))
> > +		goto drop;
> > +
> 
> ...added here so this becomes:
> 
>       if (!skb_pfmemalloc(skb) && !skb_pfmemalloc_protocol(skb))
>                 goto drop;
> 
> which actually makes sense.
> 

Moving the check is neater but that check should be

	if (skb_pfmemalloc(skb) && !skb_pfmemalloc_protocol(skb))

? It's only if the skb was allocated from emergency reserves that we
need to consider dropping it to make way for other packets to be
received.
NeilBrown April 26, 2011, 11:22 p.m. UTC | #3
On Tue, 26 Apr 2011 15:10:48 +0100 Mel Gorman <mgorman@suse.de> wrote:

> On Tue, Apr 26, 2011 at 10:21:57PM +1000, NeilBrown wrote:
> > On Tue, 26 Apr 2011 08:36:50 +0100 Mel Gorman <mgorman@suse.de> wrote:
> > 
> > > diff --git a/net/core/dev.c b/net/core/dev.c
> > > index 3871bf6..2d79a20 100644
> > > --- a/net/core/dev.c
> > > +++ b/net/core/dev.c
> > > @@ -3095,6 +3095,27 @@ static void vlan_on_bond_hook(struct sk_buff *skb)
> > >  	}
> > >  }
> > >  
> > > +/*
> > > + * Limit which protocols can use the PFMEMALLOC reserves to those that are
> > > + * expected to be used for communication with swap.
> > > + */
> > > +static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
> > > +{
> > > +	if (skb_pfmemalloc(skb))
> > > +		switch (skb->protocol) {
> > > +		case __constant_htons(ETH_P_ARP):
> > > +		case __constant_htons(ETH_P_IP):
> > > +		case __constant_htons(ETH_P_IPV6):
> > > +		case __constant_htons(ETH_P_8021Q):
> > > +			break;
> > > +
> > > +		default:
> > > +			return false;
> > > +		}
> > > +
> > > +	return true;
> > > +}
> > 
> > This sort of thing really bugs me :-)
> > Neither the comment nor the function name actually describe what the function
> > is doing.  The function is checking *2* things.
> >    is_pfmemalloc_skb_or_pfmemalloc_protocol()
> > might be a more correct name, but is too verbose.
> > 
> > I would prefer the skb_pfmemalloc test were removed from here and ....
> > 
> > > +	if (!skb_pfmemalloc_protocol(skb))
> > > +		goto drop;
> > > +
> > 
> > ...added here so this becomes:
> > 
> >       if (!skb_pfmemalloc(skb) && !skb_pfmemalloc_protocol(skb))
> >                 goto drop;
> > 
> > which actually makes sense.
> > 
> 
> Moving the check is neater but that check should be
> 
> 	if (skb_pfmemalloc(skb) && !skb_pfmemalloc_protocol(skb))
> 
> ? It's only if the skb was allocated from emergency reserves that we
> need to consider dropping it to make way for other packets to be
> received.
> 

Correct.  I got my Boolean algebra all confused. Sorry 'bout that.

NeilBrown
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/net/sock.h b/include/net/sock.h
index e3aaa88..e928880 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -668,8 +668,13 @@  static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *s
 	return 0;
 }
 
+extern int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb);
+
 static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 {
+	if (skb_pfmemalloc(skb))
+		return __sk_backlog_rcv(sk, skb);
+
 	return sk->sk_backlog_rcv(sk, skb);
 }
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 3871bf6..2d79a20 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3095,6 +3095,27 @@  static void vlan_on_bond_hook(struct sk_buff *skb)
 	}
 }
 
+/*
+ * Limit which protocols can use the PFMEMALLOC reserves to those that are
+ * expected to be used for communication with swap.
+ */
+static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
+{
+	if (skb_pfmemalloc(skb))
+		switch (skb->protocol) {
+		case __constant_htons(ETH_P_ARP):
+		case __constant_htons(ETH_P_IP):
+		case __constant_htons(ETH_P_IPV6):
+		case __constant_htons(ETH_P_8021Q):
+			break;
+
+		default:
+			return false;
+		}
+
+	return true;
+}
+
 static int __netif_receive_skb(struct sk_buff *skb)
 {
 	struct packet_type *ptype, *pt_prev;
@@ -3104,15 +3125,28 @@  static int __netif_receive_skb(struct sk_buff *skb)
 	bool deliver_exact = false;
 	int ret = NET_RX_DROP;
 	__be16 type;
+	unsigned long pflags = current->flags;
 
 	if (!netdev_tstamp_prequeue)
 		net_timestamp_check(skb);
 
 	trace_netif_receive_skb(skb);
 
+	/*
+	 * PFMEMALLOC skbs are special, they should
+	 * - be delivered to SOCK_MEMALLOC sockets only
+	 * - stay away from userspace
+	 * - have bounded memory usage
+	 *
+	 * Use PF_MEMALLOC as this saves us from propagating the allocation
+	 * context down to all allocation sites.
+	 */
+	if (skb_pfmemalloc(skb))
+		current->flags |= PF_MEMALLOC;
+
 	/* if we've gotten here through NAPI, check netpoll */
 	if (netpoll_receive_skb(skb))
-		return NET_RX_DROP;
+		goto out;
 
 	if (!skb->skb_iif)
 		skb->skb_iif = skb->dev->ifindex;
@@ -3143,6 +3177,9 @@  another_round:
 	}
 #endif
 
+	if (skb_pfmemalloc(skb))
+		goto skip_taps;
+
 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
 		if (!ptype->dev || ptype->dev == skb->dev) {
 			if (pt_prev)
@@ -3151,13 +3188,17 @@  another_round:
 		}
 	}
 
+skip_taps:
 #ifdef CONFIG_NET_CLS_ACT
 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
 	if (!skb)
-		goto out;
+		goto unlock;
 ncls:
 #endif
 
+	if (!skb_pfmemalloc_protocol(skb))
+		goto drop;
+
 	rx_handler = rcu_dereference(skb->dev->rx_handler);
 	if (rx_handler) {
 		if (pt_prev) {
@@ -3166,7 +3207,7 @@  ncls:
 		}
 		switch (rx_handler(&skb)) {
 		case RX_HANDLER_CONSUMED:
-			goto out;
+			goto unlock;
 		case RX_HANDLER_ANOTHER:
 			goto another_round;
 		case RX_HANDLER_EXACT:
@@ -3210,6 +3251,7 @@  ncls:
 	if (pt_prev) {
 		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 	} else {
+drop:
 		atomic_long_inc(&skb->dev->rx_dropped);
 		kfree_skb(skb);
 		/* Jamal, now you will not able to escape explaining
@@ -3218,8 +3260,10 @@  ncls:
 		ret = NET_RX_DROP;
 	}
 
-out:
+unlock:
 	rcu_read_unlock();
+out:
+	tsk_restore_flags(current, pflags, PF_MEMALLOC);
 	return ret;
 }
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 8308609..ac36807 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -245,6 +245,22 @@  void sk_clear_memalloc(struct sock *sk)
 }
 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 
+int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	int ret;
+	unsigned long pflags = current->flags;
+
+	/* these should have been dropped before queueing */
+	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
+
+	current->flags |= PF_MEMALLOC;
+	ret = sk->sk_backlog_rcv(sk, skb);
+	tsk_restore_flags(current, pflags, PF_MEMALLOC);
+
+	return ret;
+}
+EXPORT_SYMBOL(__sk_backlog_rcv);
+
 #if defined(CONFIG_CGROUPS) && !defined(CONFIG_NET_CLS_CGROUP)
 int net_cls_subsys_id = -1;
 EXPORT_SYMBOL_GPL(net_cls_subsys_id);