Patchwork [v4,2/2] macvtap: Implement multiqueue for macvtap driver

login
register
mail settings
Submitter Krishna Kumar
Date Aug. 4, 2010, 10:55 a.m.
Message ID <20100804105525.22212.45090.sendpatchset@krkumar2.in.ibm.com>
Download mbox | patch
Permalink /patch/60844/
State Superseded
Delegated to: David Miller
Headers show

Comments

Krishna Kumar - Aug. 4, 2010, 10:55 a.m.
From: Krishna Kumar <krkumar2@in.ibm.com>

Implement multiqueue facility for macvtap driver. The idea is that
a macvtap device can be opened multiple times and the fd's can be
used to register eg, as backend for vhost.

Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
---
 drivers/net/macvtap.c      |   89 ++++++++++++++++++++++++++++-------
 include/linux/if_macvlan.h |    9 +++
 2 files changed, 80 insertions(+), 18 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Changli Gao - Aug. 4, 2010, 11:37 a.m.
On Wed, Aug 4, 2010 at 6:55 PM, Krishna Kumar <krkumar2@in.ibm.com> wrote:
> From: Krishna Kumar <krkumar2@in.ibm.com>
>
> Implement multiqueue facility for macvtap driver. The idea is that
> a macvtap device can be opened multiple times and the fd's can be
> used to register eg, as backend for vhost.
>
> Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
> ---
>  drivers/net/macvtap.c      |   89 ++++++++++++++++++++++++++++-------
>  include/linux/if_macvlan.h |    9 +++
>  2 files changed, 80 insertions(+), 18 deletions(-)
>
> diff -ruNp org/drivers/net/macvtap.c new/drivers/net/macvtap.c
> --- org/drivers/net/macvtap.c   2010-07-28 15:10:10.000000000 +0530
> +++ new/drivers/net/macvtap.c   2010-08-04 16:19:32.000000000 +0530
> @@ -84,26 +84,45 @@ static const struct proto_ops macvtap_so
>  static DEFINE_SPINLOCK(macvtap_lock);
>
>  /*
> - * Choose the next free queue, for now there is only one
> + * get_slot: return a [unused/occupied] slot in vlan->taps[]:
> + *     - if 'q' is NULL, return the first empty slot;
> + *     - otherwise, return the slot this pointer occupies.
>  */
> +static int get_slot(struct macvlan_dev *vlan, struct macvtap_queue *q)
> +{
> +       int i;
> +
> +       for (i = 0; i < MAX_MACVTAP_QUEUES; i++) {
> +               if (rcu_dereference(vlan->taps[i]) == q)
> +                       return i;
> +       }
> +
> +       /* Should never happen */
> +       BUG_ON(1);
> +}
> +
>  static int macvtap_set_queue(struct net_device *dev, struct file *file,
>                                struct macvtap_queue *q)
>  {
>        struct macvlan_dev *vlan = netdev_priv(dev);
> +       int index;
>        int err = -EBUSY;
>
>        spin_lock(&macvtap_lock);
> -       if (rcu_dereference(vlan->tap))
> +       if (vlan->numvtaps == MAX_MACVTAP_QUEUES)
>                goto out;
>
>        err = 0;
> +       index = get_slot(vlan, NULL);
>        rcu_assign_pointer(q->vlan, vlan);
> -       rcu_assign_pointer(vlan->tap, q);
> +       rcu_assign_pointer(vlan->taps[index], q);
>        sock_hold(&q->sk);
>
>        q->file = file;
>        file->private_data = q;
>
> +       vlan->numvtaps++;
> +
>  out:
>        spin_unlock(&macvtap_lock);
>        return err;
> @@ -124,9 +143,12 @@ static void macvtap_put_queue(struct mac
>        spin_lock(&macvtap_lock);
>        vlan = rcu_dereference(q->vlan);
>        if (vlan) {
> -               rcu_assign_pointer(vlan->tap, NULL);
> +               int index = get_slot(vlan, q);
> +
> +               rcu_assign_pointer(vlan->taps[index], NULL);
>                rcu_assign_pointer(q->vlan, NULL);
>                sock_put(&q->sk);
> +               --vlan->numvtaps;
>        }
>
>        spin_unlock(&macvtap_lock);
> @@ -136,39 +158,72 @@ static void macvtap_put_queue(struct mac
>  }
>
>  /*
> - * Since we only support one queue, just dereference the pointer.
> + * Select a queue based on the rxq of the device on which this packet
> + * arrived. If the incoming device is not mq, calculate a flow hash to
> + * select a queue. vlan->numvtaps is cached in case it reduces during
> + * the execution of this function.
>  */
>  static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
>                                               struct sk_buff *skb)
>  {
>        struct macvlan_dev *vlan = netdev_priv(dev);
> +       struct macvtap_queue *tap = NULL;
> +       int numvtaps = vlan->numvtaps;
> +       __u32 rxq;
> +
> +       if (!numvtaps)
> +               goto out;
> +
> +       if (likely(skb_rx_queue_recorded(skb))) {
> +               rxq = skb_get_rx_queue(skb);
> +
> +               while (unlikely(rxq >= numvtaps))
> +                       rxq -= numvtaps;
>
> -       return rcu_dereference(vlan->tap);
> +               tap = rcu_dereference(vlan->taps[rxq]);
> +               if (tap)
> +                       goto out;
> +       }
> +
> +       rxq = skb_get_rxhash(skb);
> +       if (!rxq)
> +               rxq = smp_processor_id();
> +
> +       tap = rcu_dereference(vlan->taps[rxq & (numvtaps - 1)]);
> +

numvtaps maybe not power of 2. So some queue maybe can't be used.
You'd better maintain a online queue map and get the index as
get_rps_cpu() dones.

((u64)rxhash * numvtaps) >> 32

for the sbks, we can't get a valid rxhash, we'd better pass them to a
specified queue, such as queue 0.

> +out:
> +       return tap;
>  }
>
>  /*
>  * The net_device is going away, give up the reference
> - * that it holds on the queue (all the queues one day)
> - * and safely set the pointer from the queues to NULL.
> + * that it holds on all queues and safely set the pointer
> + * from the queues to NULL.
>  */
>  static void macvtap_del_queues(struct net_device *dev)
>  {
>        struct macvlan_dev *vlan = netdev_priv(dev);
> -       struct macvtap_queue *q;
> +       struct macvtap_queue *q, *qlist[MAX_MACVTAP_QUEUES];
> +       int i, j = 0;
>
> +       /* macvtap_put_queue can free some slots, so go through all slots */
>        spin_lock(&macvtap_lock);
> -       q = rcu_dereference(vlan->tap);
> -       if (!q) {
> -               spin_unlock(&macvtap_lock);
> -               return;
> +       for (i = 0; i < MAX_MACVTAP_QUEUES && vlan->numvtaps; i++) {
> +               q = rcu_dereference(vlan->taps[i]);
> +               if (q) {
> +                       qlist[j++] = q;
> +                       rcu_assign_pointer(vlan->taps[i], NULL);
> +                       rcu_assign_pointer(q->vlan, NULL);
> +                       vlan->numvtaps--;
> +               }
>        }
> -
> -       rcu_assign_pointer(vlan->tap, NULL);
> -       rcu_assign_pointer(q->vlan, NULL);
> +       BUG_ON(vlan->numvtaps != 0);
>        spin_unlock(&macvtap_lock);
>
>        synchronize_rcu();
> -       sock_put(&q->sk);
> +
> +       for (--j; j >= 0; j--)
> +               sock_put(&qlist[j]->sk);
>  }
>
>  /*
> diff -ruNp org/include/linux/if_macvlan.h new/include/linux/if_macvlan.h
> --- org/include/linux/if_macvlan.h      2010-07-28 15:10:10.000000000 +0530
> +++ new/include/linux/if_macvlan.h      2010-08-04 16:19:32.000000000 +0530
> @@ -40,6 +40,12 @@ struct macvlan_rx_stats {
>        unsigned long           rx_errors;
>  };
>
> +/*
> + * Maximum times a macvtap device can be opened. This can be used to
> + * configure the number of receive queue, e.g. for multiqueue virtio.
> + */
> +#define MAX_MACVTAP_QUEUES     (NR_CPUS < 16 ? NR_CPUS : 16)
> +
>  struct macvlan_dev {
>        struct net_device       *dev;
>        struct list_head        list;
> @@ -50,7 +56,8 @@ struct macvlan_dev {
>        enum macvlan_mode       mode;
>        int (*receive)(struct sk_buff *skb);
>        int (*forward)(struct net_device *dev, struct sk_buff *skb);
> -       struct macvtap_queue    *tap;
> +       struct macvtap_queue    *taps[MAX_MACVTAP_QUEUES];
> +       int                     numvtaps;
>  };
>
>  static inline void macvlan_count_rx(const struct macvlan_dev *vlan,
>
Krishna Kumar - Aug. 4, 2010, 1:25 p.m.
Changli Gao <xiaosuo@gmail.com> wrote on 08/04/2010 05:07:23 PM:

Thanks for your review and comments!

> >  static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
> >                                               struct sk_buff *skb)
> >  {
> >        struct macvlan_dev *vlan = netdev_priv(dev);
> > +       struct macvtap_queue *tap = NULL;
> > +       int numvtaps = vlan->numvtaps;
> > +       __u32 rxq;
> > +
> > +       if (!numvtaps)
> > +               goto out;
> > +
> > +       if (likely(skb_rx_queue_recorded(skb))) {
> > +               rxq = skb_get_rx_queue(skb);
> > +
> > +               while (unlikely(rxq >= numvtaps))
> > +                       rxq -= numvtaps;
> >
> > -       return rcu_dereference(vlan->tap);
> > +               tap = rcu_dereference(vlan->taps[rxq]);
> > +               if (tap)
> > +                       goto out;
> > +       }
> > +
> > +       rxq = skb_get_rxhash(skb);
> > +       if (!rxq)
> > +               rxq = smp_processor_id();
> > +
> > +       tap = rcu_dereference(vlan->taps[rxq & (numvtaps - 1)]);
> > +
>
> numvtaps maybe not power of 2. So some queue maybe can't be used.
> You'd better maintain a online queue map and get the index as
> get_rps_cpu() dones.
>
> ((u64)rxhash * numvtaps) >> 32
>
> for the sbks, we can't get a valid rxhash, we'd better pass them to a
> specified queue, such as queue 0.

macvtap *with* mq support would be used with mq devices - you
open multiple queues on the macvtap device depending on the
number of queues for the physical device. So since this is an
unlikely case (as can be seen in the patch, and I guess I
should add another "likely" to the "if (tap)" check since fd's
should not be closed), I guess a simple % can be used. Does
the following sound reasonable?

1. Use % to find the slot.
2. If slot is null - I don't want to handle this since I think
   it is better to return NULL if some fd's were closed by user.
   Typically this should never happen since fd's are opened and
   passed to vhost for setting up the backend. So if they are
   closed, then I think NULL is OK.

Arnd, please let me know what you would also suggest.

- KK

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Changli Gao - Aug. 4, 2010, 1:42 p.m.
On Wed, Aug 4, 2010 at 9:25 PM, Krishna Kumar2 <krkumar2@in.ibm.com> wrote:
> Changli Gao <xiaosuo@gmail.com> wrote on 08/04/2010 05:07:23 PM:
>
> Thanks for your review and comments!
>
>> >  static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
>> >                                               struct sk_buff *skb)
>> >  {
>> >        struct macvlan_dev *vlan = netdev_priv(dev);
>> > +       struct macvtap_queue *tap = NULL;
>> > +       int numvtaps = vlan->numvtaps;
>> > +       __u32 rxq;
>> > +
>> > +       if (!numvtaps)
>> > +               goto out;
>> > +
>> > +       if (likely(skb_rx_queue_recorded(skb))) {
>> > +               rxq = skb_get_rx_queue(skb);
>> > +
>> > +               while (unlikely(rxq >= numvtaps))
>> > +                       rxq -= numvtaps;
>> >
>> > -       return rcu_dereference(vlan->tap);
>> > +               tap = rcu_dereference(vlan->taps[rxq]);
>> > +               if (tap)
>> > +                       goto out;
>> > +       }
>> > +
>> > +       rxq = skb_get_rxhash(skb);
>> > +       if (!rxq)
>> > +               rxq = smp_processor_id();
>> > +
>> > +       tap = rcu_dereference(vlan->taps[rxq & (numvtaps - 1)]);
>> > +
>>
>> numvtaps maybe not power of 2. So some queue maybe can't be used.
>> You'd better maintain a online queue map and get the index as
>> get_rps_cpu() dones.
>>
>> ((u64)rxhash * numvtaps) >> 32
>>
>> for the sbks, we can't get a valid rxhash, we'd better pass them to a
>> specified queue, such as queue 0.
>
> macvtap *with* mq support would be used with mq devices - you
> open multiple queues on the macvtap device depending on the
> number of queues for the physical device. So since this is an
> unlikely case (as can be seen in the patch, and I guess I
> should add another "likely" to the "if (tap)" check since fd's
> should not be closed), I guess a simple % can be used. Does
> the following sound reasonable?
>
> 1. Use % to find the slot.

It is slower than the method used by get_rps_cpu().

> 2. If slot is null - I don't want to handle this since I think
>   it is better to return NULL if some fd's were closed by user.
>   Typically this should never happen since fd's are opened and
>   passed to vhost for setting up the backend. So if they are
>   closed, then I think NULL is OK.
>
> Arnd, please let me know what you would also suggest.
>
> - KK
>
>
Arnd Bergmann - Aug. 4, 2010, 2:54 p.m.
On Wednesday 04 August 2010, Krishna Kumar2 wrote:
> 1. Use % to find the slot.
> 2. If slot is null - I don't want to handle this since I think
>    it is better to return NULL if some fd's were closed by user.
>    Typically this should never happen since fd's are opened and
>    passed to vhost for setting up the backend. So if they are
>    closed, then I think NULL is OK.
> 

Having some of the file descriptors closed is not something that
we should optimize for, but then again it makes sense to still
keep going, mostly for consistency reasons: One day we may
have a use case for a dynamically growing and shrinking set of
queues. Falling back to the first queue is probably ok, we can
always add optimizations when this becomes a real use case.

	Arnd
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff -ruNp org/drivers/net/macvtap.c new/drivers/net/macvtap.c
--- org/drivers/net/macvtap.c	2010-07-28 15:10:10.000000000 +0530
+++ new/drivers/net/macvtap.c	2010-08-04 16:19:32.000000000 +0530
@@ -84,26 +84,45 @@  static const struct proto_ops macvtap_so
 static DEFINE_SPINLOCK(macvtap_lock);
 
 /*
- * Choose the next free queue, for now there is only one
+ * get_slot: return a [unused/occupied] slot in vlan->taps[]:
+ *	- if 'q' is NULL, return the first empty slot;
+ *	- otherwise, return the slot this pointer occupies.
  */
+static int get_slot(struct macvlan_dev *vlan, struct macvtap_queue *q)
+{
+	int i;
+
+	for (i = 0; i < MAX_MACVTAP_QUEUES; i++) {
+		if (rcu_dereference(vlan->taps[i]) == q)
+			return i;
+	}
+
+	/* Should never happen */
+	BUG_ON(1);
+}
+
 static int macvtap_set_queue(struct net_device *dev, struct file *file,
 				struct macvtap_queue *q)
 {
 	struct macvlan_dev *vlan = netdev_priv(dev);
+	int index;
 	int err = -EBUSY;
 
 	spin_lock(&macvtap_lock);
-	if (rcu_dereference(vlan->tap))
+	if (vlan->numvtaps == MAX_MACVTAP_QUEUES)
 		goto out;
 
 	err = 0;
+	index = get_slot(vlan, NULL);
 	rcu_assign_pointer(q->vlan, vlan);
-	rcu_assign_pointer(vlan->tap, q);
+	rcu_assign_pointer(vlan->taps[index], q);
 	sock_hold(&q->sk);
 
 	q->file = file;
 	file->private_data = q;
 
+	vlan->numvtaps++;
+
 out:
 	spin_unlock(&macvtap_lock);
 	return err;
@@ -124,9 +143,12 @@  static void macvtap_put_queue(struct mac
 	spin_lock(&macvtap_lock);
 	vlan = rcu_dereference(q->vlan);
 	if (vlan) {
-		rcu_assign_pointer(vlan->tap, NULL);
+		int index = get_slot(vlan, q);
+
+		rcu_assign_pointer(vlan->taps[index], NULL);
 		rcu_assign_pointer(q->vlan, NULL);
 		sock_put(&q->sk);
+		--vlan->numvtaps;
 	}
 
 	spin_unlock(&macvtap_lock);
@@ -136,39 +158,72 @@  static void macvtap_put_queue(struct mac
 }
 
 /*
- * Since we only support one queue, just dereference the pointer.
+ * Select a queue based on the rxq of the device on which this packet
+ * arrived. If the incoming device is not mq, calculate a flow hash to
+ * select a queue. vlan->numvtaps is cached in case it reduces during
+ * the execution of this function.
  */
 static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
 					       struct sk_buff *skb)
 {
 	struct macvlan_dev *vlan = netdev_priv(dev);
+	struct macvtap_queue *tap = NULL;
+	int numvtaps = vlan->numvtaps;
+	__u32 rxq;
+
+	if (!numvtaps)
+		goto out;
+
+	if (likely(skb_rx_queue_recorded(skb))) {
+		rxq = skb_get_rx_queue(skb);
+
+		while (unlikely(rxq >= numvtaps))
+			rxq -= numvtaps;
 
-	return rcu_dereference(vlan->tap);
+		tap = rcu_dereference(vlan->taps[rxq]);
+		if (tap)
+			goto out;
+	}
+
+	rxq = skb_get_rxhash(skb);
+	if (!rxq)
+		rxq = smp_processor_id();
+
+	tap = rcu_dereference(vlan->taps[rxq & (numvtaps - 1)]);
+
+out:
+	return tap;
 }
 
 /*
  * The net_device is going away, give up the reference
- * that it holds on the queue (all the queues one day)
- * and safely set the pointer from the queues to NULL.
+ * that it holds on all queues and safely set the pointer
+ * from the queues to NULL.
  */
 static void macvtap_del_queues(struct net_device *dev)
 {
 	struct macvlan_dev *vlan = netdev_priv(dev);
-	struct macvtap_queue *q;
+	struct macvtap_queue *q, *qlist[MAX_MACVTAP_QUEUES];
+	int i, j = 0;
 
+	/* macvtap_put_queue can free some slots, so go through all slots */
 	spin_lock(&macvtap_lock);
-	q = rcu_dereference(vlan->tap);
-	if (!q) {
-		spin_unlock(&macvtap_lock);
-		return;
+	for (i = 0; i < MAX_MACVTAP_QUEUES && vlan->numvtaps; i++) {
+		q = rcu_dereference(vlan->taps[i]);
+		if (q) {
+			qlist[j++] = q;
+			rcu_assign_pointer(vlan->taps[i], NULL);
+			rcu_assign_pointer(q->vlan, NULL);
+			vlan->numvtaps--;
+		}
 	}
-
-	rcu_assign_pointer(vlan->tap, NULL);
-	rcu_assign_pointer(q->vlan, NULL);
+	BUG_ON(vlan->numvtaps != 0);
 	spin_unlock(&macvtap_lock);
 
 	synchronize_rcu();
-	sock_put(&q->sk);
+
+	for (--j; j >= 0; j--)
+		sock_put(&qlist[j]->sk);
 }
 
 /*
diff -ruNp org/include/linux/if_macvlan.h new/include/linux/if_macvlan.h
--- org/include/linux/if_macvlan.h	2010-07-28 15:10:10.000000000 +0530
+++ new/include/linux/if_macvlan.h	2010-08-04 16:19:32.000000000 +0530
@@ -40,6 +40,12 @@  struct macvlan_rx_stats {
 	unsigned long		rx_errors;
 };
 
+/*
+ * Maximum times a macvtap device can be opened. This can be used to
+ * configure the number of receive queue, e.g. for multiqueue virtio.
+ */
+#define MAX_MACVTAP_QUEUES	(NR_CPUS < 16 ? NR_CPUS : 16)
+
 struct macvlan_dev {
 	struct net_device	*dev;
 	struct list_head	list;
@@ -50,7 +56,8 @@  struct macvlan_dev {
 	enum macvlan_mode	mode;
 	int (*receive)(struct sk_buff *skb);
 	int (*forward)(struct net_device *dev, struct sk_buff *skb);
-	struct macvtap_queue	*tap;
+	struct macvtap_queue	*taps[MAX_MACVTAP_QUEUES];
+	int			numvtaps;
 };
 
 static inline void macvlan_count_rx(const struct macvlan_dev *vlan,