
[RFC,net-next,3/3] virtio-net: Add accelerated RFS support

Message ID 1389795654-28381-4-git-send-email-zwu.kernel@gmail.com
State RFC, archived
Delegated to: David Miller

Commit Message

Zhiyong Wu Jan. 15, 2014, 2:20 p.m. UTC
From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>

Accelerated RFS is used to select the RX queue.

Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
---
 drivers/net/virtio_net.c |   56 +++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 55 insertions(+), 1 deletions(-)

Comments

Ben Hutchings Jan. 16, 2014, 9:31 p.m. UTC | #1
On Wed, 2014-01-15 at 22:20 +0800, Zhi Yong Wu wrote:
[...]
> +static int virtnet_init_rx_cpu_rmap(struct virtnet_info *vi)
> +{
> +	int rc = 0;
> +
> +#ifdef CONFIG_RFS_ACCEL
> +	struct virtio_device *vdev = vi->vdev;
> +	unsigned int irq;
> +	int i;
> +
> +	if (!vi->affinity_hint_set)
> +		goto out;
> +
> +	vi->dev->rx_cpu_rmap = alloc_irq_cpu_rmap(vi->max_queue_pairs);
> +	if (!vi->dev->rx_cpu_rmap) {
> +		rc = -ENOMEM;
> +		goto out;
> +	}
> +
> +	for (i = 0; i < vi->max_queue_pairs; i++) {
> +		irq = virtqueue_get_vq_irq(vdev, vi->rq[i].vq);
> +		if (irq == -1)
> +			goto failed;

Jumping into an if-statement is confusing.  Also do you really want to
return 0 in this case?

Otherwise this looks fine.

Ben.

> +		rc = irq_cpu_rmap_add(vi->dev->rx_cpu_rmap, irq);
> +		if (rc) {
> +failed:
> +			virtnet_free_irq_cpu_rmap(vi);
> +			goto out;
> +		}
> +	}
> +out:
> +#endif
> +	return rc;
> +}
[...]
Zhiyong Wu Jan. 16, 2014, 10 p.m. UTC | #2
On Fri, Jan 17, 2014 at 5:31 AM, Ben Hutchings
<bhutchings@solarflare.com> wrote:
> On Wed, 2014-01-15 at 22:20 +0800, Zhi Yong Wu wrote:
> [...]
>> +static int virtnet_init_rx_cpu_rmap(struct virtnet_info *vi)
>> +{
>> +     int rc = 0;
>> +
>> +#ifdef CONFIG_RFS_ACCEL
>> +     struct virtio_device *vdev = vi->vdev;
>> +     unsigned int irq;
>> +     int i;
>> +
>> +     if (!vi->affinity_hint_set)
>> +             goto out;
>> +
>> +     vi->dev->rx_cpu_rmap = alloc_irq_cpu_rmap(vi->max_queue_pairs);
>> +     if (!vi->dev->rx_cpu_rmap) {
>> +             rc = -ENOMEM;
>> +             goto out;
>> +     }
>> +
>> +     for (i = 0; i < vi->max_queue_pairs; i++) {
>> +             irq = virtqueue_get_vq_irq(vdev, vi->rq[i].vq);
>> +             if (irq == -1)
>> +                     goto failed;
>
> Jumping into an if-statement is confusing.  Also do you really want to
> return 0 in this case?
No. If it fails to get the irq, I want it to exit as soon as possible;
otherwise irq_cpu_rmap_add() would be invoked with an invalid irq
argument.

By the way, have you thought about whether it makes sense to add aRFS
support to virtio_net? For [patch 2/3], what do you think of the
missing pieces I listed?
As for how the indirection table is implemented in the sfc NIC, do you
have any doc you could share with me? Thanks.

>
> Otherwise this looks fine.
>
> Ben.
>
>> +             rc = irq_cpu_rmap_add(vi->dev->rx_cpu_rmap, irq);
>> +             if (rc) {
>> +failed:
>> +                     virtnet_free_irq_cpu_rmap(vi);
>> +                     goto out;
>> +             }
>> +     }
>> +out:
>> +#endif
>> +     return rc;
>> +}
> [...]
>
> --
> Ben Hutchings, Staff Engineer, Solarflare
> Not speaking for my employer; that's the marketing department's job.
> They asked us to note that Solarflare product names are trademarked.
>
Ben Hutchings Jan. 16, 2014, 11:16 p.m. UTC | #3
On Fri, 2014-01-17 at 06:00 +0800, Zhi Yong Wu wrote:
> On Fri, Jan 17, 2014 at 5:31 AM, Ben Hutchings
> <bhutchings@solarflare.com> wrote:
> > On Wed, 2014-01-15 at 22:20 +0800, Zhi Yong Wu wrote:
> > [...]
> >> +static int virtnet_init_rx_cpu_rmap(struct virtnet_info *vi)
> >> +{
> >> +     int rc = 0;
> >> +
> >> +#ifdef CONFIG_RFS_ACCEL
> >> +     struct virtio_device *vdev = vi->vdev;
> >> +     unsigned int irq;
> >> +     int i;
> >> +
> >> +     if (!vi->affinity_hint_set)
> >> +             goto out;
> >> +
> >> +     vi->dev->rx_cpu_rmap = alloc_irq_cpu_rmap(vi->max_queue_pairs);
> >> +     if (!vi->dev->rx_cpu_rmap) {
> >> +             rc = -ENOMEM;
> >> +             goto out;
> >> +     }
> >> +
> >> +     for (i = 0; i < vi->max_queue_pairs; i++) {
> >> +             irq = virtqueue_get_vq_irq(vdev, vi->rq[i].vq);
> >> +             if (irq == -1)
> >> +                     goto failed;
> >
> > Jumping into an if-statement is confusing.  Also do you really want to
> > return 0 in this case?
> No, If it fail to get irq, i want it to exit as soon as possible,
> otherwise it will cause irq_cpu_rmap_add() to be invoked with one
> incorrect argument irq.

Well currently this goto does result in returning 0, as rc has not been
changed after its initialisation to 0.

> By the way, do you have thought about if it makes sense to add aRFS
> support to virtio_net? For [patch 2/3], what do you think of those
> missing stuff listed by me?
> For how indirect table is implemented in sfc NIC, do you have any doc
> to share with  me? thanks.

Going through that list:

> 1.)  guest virtio_net driver should have one filter table and its
> entries can be expired periodically;

In sfc, we keep a count of how many entries have been inserted in each NAPI
context.  Whenever the NAPI poll function is about to call
napi_complete() and the count for that context has reached a trigger
level, it will scan some quota of filter entries for expiry.
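
A minimal sketch of that expiry pass, under assumed names (arfs_flow, the
trigger/quota constants and the per-NAPI bookkeeping are all invented here,
not sfc's real code); the stack helper a driver would lean on is
rps_may_expire_flow():

#include <linux/netdevice.h>

#define ARFS_EXPIRE_TRIGGER	128	/* insertions before we bother scanning */
#define ARFS_EXPIRE_QUOTA	32	/* entries examined per scan */

struct arfs_flow {			/* hypothetical per-flow state */
	bool	in_use;
	u16	rxq_index;
	u32	flow_id;
	u16	filter_id;
};

struct arfs_napi_state {		/* hypothetical per-NAPI-context state */
	unsigned int	insert_count;	/* filters added since last scan */
	unsigned int	scan_pos;	/* where the previous scan stopped */
};

/* Called just before napi_complete() on the RX path. */
static void arfs_expire_some(struct net_device *dev,
			     struct arfs_napi_state *st,
			     struct arfs_flow *table, unsigned int size)
{
	unsigned int n;

	if (st->insert_count < ARFS_EXPIRE_TRIGGER)
		return;

	for (n = 0; n < ARFS_EXPIRE_QUOTA; n++) {
		struct arfs_flow *f = &table[st->scan_pos];

		if (f->in_use &&
		    rps_may_expire_flow(dev, f->rxq_index,
					f->flow_id, f->filter_id)) {
			/* tell the hardware/backend to drop this filter here */
			f->in_use = false;
		}
		st->scan_pos = (st->scan_pos + 1) % size;
	}
	st->insert_count = 0;
}

Bounding the scan to a fixed quota keeps the work off the RX fast path while
still letting stale filters age out over a few polls.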

> 2.)  guest virtio_net driver should pass rx queue index and filter
> info down to the emulated virtio_net NIC in QEMU.
> 3.) the emulated virtio_net NIC should have its indirect table to
> store the flow to rx queue mapping.
> 4.) the emulated virtio_net NIC should classify the rx packet to
> selected queue by applying the filter.

I think the most efficient way to do this would be to put a hash table
in some shared memory that both guest and host can read and write.  The
virtio control path would only be used to set up and tear down the
table.  I don't know whether virtio allows for that.
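
As a rough illustration (purely hypothetical; no such layout exists in the
virtio spec), the shared table could be as simple as a power-of-two array of
hash-to-queue slots, with the control path only negotiating its location and
size:

#include <linux/types.h>

/* Hypothetical guest/host shared steering table; all names invented. */
struct virtio_net_steer_entry {
	__le32 flow_hash;	/* rxhash of the flow */
	__le16 rxq_index;	/* guest RX queue to deliver to */
	__le16 flags;		/* e.g. a VALID bit */
};

struct virtio_net_steer_table {
	__le32 size;		/* number of entries, power of two */
	struct virtio_net_steer_entry entries[];
};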

However, to take advantage of ARFS on a physical net driver, it would be
necessary to send a control request for part 2.

> 5.) update virtio spec.
> Do i miss anything? If yes, please correct me.
> For 3.) and 4.), do you have any doc about how they are implemented in
> physical NICs? e.g. mlx4_en or sfc, etc.

The Programmer's Reference Manuals for Solarflare controllers are only
available under NDA.  I can describe the hardware filtering briefly, but
actually I don't think it's very relevant to virtio_net.

There is a typical RSS hash indirection table (128 entries), but for
ARFS we use a different RX filter table which has 8K entries
(RX_FILTER_TBL0 on SFC4000/SFC9000 family).

Solarflare controllers support user-level networking, which requires
perfect filtering to deliver each application's flows into that
application's dedicated RX queue(s).  Lookups in this larger filter
table are still hash-based, but each entry specifies a TCP/IP or UDP/IP
4-tuple or local 2-tuple to match.  ARFS uses the 4-tuple type only.

To allow for hash collisions, a secondary hash function generates an
increment to be added to the initial table index repeatedly for hash
chaining.  There is a control register which tells the controller the
maximum hash chain length to search for each IP filter type; after this
it will fall back to checking MAC filters and then default filters.
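
In code, the lookup described above amounts to open addressing with a
hash-derived stride; this is a sketch under assumed types and hash choices,
not Solarflare's implementation:

#include <linux/jhash.h>
#include <linux/string.h>

struct flow_key {			/* parsed TCP/UDP 4-tuple (assumed) */
	__be32 saddr, daddr;
	__be16 sport, dport;
};

struct filter_entry {			/* assumed table entry */
	struct flow_key key;
	u16 rxq_index;
	bool in_use;
};

static u32 hash1(const struct flow_key *k)
{
	return jhash(k, sizeof(*k), 0);			/* initial bucket */
}

static u32 hash2(const struct flow_key *k)
{
	return jhash(k, sizeof(*k), 0x9e3779b9) | 1;	/* odd probe stride */
}

static int filter_lookup(const struct filter_entry *tbl, u32 size,
			 const struct flow_key *k, u32 max_chain)
{
	u32 idx = hash1(k) & (size - 1);	/* size is a power of two */
	u32 step = hash2(k) & (size - 1);	/* stays odd, so all slots reachable */
	u32 depth;

	for (depth = 0; depth < max_chain; depth++) {
		const struct filter_entry *e = &tbl[idx];

		if (e->in_use && !memcmp(&e->key, k, sizeof(*k)))
			return e->rxq_index;	/* hit: steer to this queue */
		idx = (idx + step) & (size - 1);
	}
	return -1;	/* give up; fall back to MAC/default filters */
}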

On the SFC9100 family, filter updates and lookups are implemented by
firmware and the driver doesn't manage the filter table itself, but I
know it is still a hash table of perfect filters.

For ARFS, perfect filtering is not needed.  I think it would be
preferable to use a fairly big hash table and make insertion fail in
case of a collision.  Since the backend for virtio_net will do RX queue
selection in software, the entire table of queue indices should fit into
its L1 cache.
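
Building on the hypothetical shared table sketched earlier, the guest-side
insert and the host-side lookup would then be trivial (again only a sketch;
the flag, error codes and return conventions are illustrative):

#include <linux/kernel.h>
#include <linux/errno.h>

#define VNET_STEER_VALID	0x0001

/* Guest side: called from the driver's ndo_rx_flow_steer() path.
 * Fails on collision instead of chaining, as suggested above. */
static int steer_insert(struct virtio_net_steer_table *tbl, u32 rxhash, u16 rxq)
{
	u32 idx = rxhash & (le32_to_cpu(tbl->size) - 1);
	struct virtio_net_steer_entry *e = &tbl->entries[idx];

	if ((le16_to_cpu(e->flags) & VNET_STEER_VALID) &&
	    e->flow_hash != cpu_to_le32(rxhash))
		return -EBUSY;			/* collision: leave flow to RSS */

	e->flow_hash = cpu_to_le32(rxhash);
	e->rxq_index = cpu_to_le16(rxq);
	e->flags = cpu_to_le16(VNET_STEER_VALID);
	return idx;				/* filter id for later expiry */
}

/* Host side: software RX queue selection for an incoming packet. */
static int steer_lookup(const struct virtio_net_steer_table *tbl, u32 rxhash)
{
	u32 idx = rxhash & (le32_to_cpu(tbl->size) - 1);
	const struct virtio_net_steer_entry *e = &tbl->entries[idx];

	if ((le16_to_cpu(e->flags) & VNET_STEER_VALID) &&
	    e->flow_hash == cpu_to_le32(rxhash))
		return le16_to_cpu(e->rxq_index);
	return -1;				/* no match: fall back to RSS/default */
}

Failing on collision keeps both insert and lookup to a single slot, so the
host's per-packet queue selection stays a couple of loads.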

Ben.
Zhiyong Wu Jan. 17, 2014, 4:54 p.m. UTC | #4
On Fri, Jan 17, 2014 at 7:16 AM, Ben Hutchings
<bhutchings@solarflare.com> wrote:
> On Fri, 2014-01-17 at 06:00 +0800, Zhi Yong Wu wrote:
>> On Fri, Jan 17, 2014 at 5:31 AM, Ben Hutchings
>> <bhutchings@solarflare.com> wrote:
>> > On Wed, 2014-01-15 at 22:20 +0800, Zhi Yong Wu wrote:
>> > [...]
>> >> +static int virtnet_init_rx_cpu_rmap(struct virtnet_info *vi)
>> >> +{
>> >> +     int rc = 0;
>> >> +
>> >> +#ifdef CONFIG_RFS_ACCEL
>> >> +     struct virtio_device *vdev = vi->vdev;
>> >> +     unsigned int irq;
>> >> +     int i;
>> >> +
>> >> +     if (!vi->affinity_hint_set)
>> >> +             goto out;
>> >> +
>> >> +     vi->dev->rx_cpu_rmap = alloc_irq_cpu_rmap(vi->max_queue_pairs);
>> >> +     if (!vi->dev->rx_cpu_rmap) {
>> >> +             rc = -ENOMEM;
>> >> +             goto out;
>> >> +     }
>> >> +
>> >> +     for (i = 0; i < vi->max_queue_pairs; i++) {
>> >> +             irq = virtqueue_get_vq_irq(vdev, vi->rq[i].vq);
>> >> +             if (irq == -1)
>> >> +                     goto failed;
>> >
>> > Jumping into an if-statement is confusing.  Also do you really want to
>> > return 0 in this case?
>> No, If it fail to get irq, i want it to exit as soon as possible,
>> otherwise it will cause irq_cpu_rmap_add() to be invoked with one
>> incorrect argument irq.
>
> Well currently this goto does result in returning 0, as rc has not been
> changed after its initialisation to 0.
Yes, this goto will result in returning 0. I am still thinking about
whether that is appropriate, but the current code definitely has a bug
here. I had assumed that when the NIC doesn't have MSI-X enabled and
CONFIG_RFS_ACCEL is defined, the aRFS cpu_rmap table would simply not
be allocated here and aRFS would be ignored automatically, but this
will cause set_rps_cpu() to hit a memory issue.
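
One possible restructuring that avoids the jump into the if-statement and
returns a real error when an RX virtqueue has no MSI-X vector might look
like the following; it is only a sketch against the posted patch, using the
virtqueue_get_vq_irq() helper added earlier in this series, and whether
probe should treat a missing vector as fatal (or instead free the rmap and
carry on without aRFS) is still the open question above:

static int virtnet_init_rx_cpu_rmap(struct virtnet_info *vi)
{
	int rc = 0;
#ifdef CONFIG_RFS_ACCEL
	int i, irq;

	if (!vi->affinity_hint_set)
		return 0;

	vi->dev->rx_cpu_rmap = alloc_irq_cpu_rmap(vi->max_queue_pairs);
	if (!vi->dev->rx_cpu_rmap)
		return -ENOMEM;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		irq = virtqueue_get_vq_irq(vi->vdev, vi->rq[i].vq);
		if (irq == -1) {
			/* no MSI-X vector for this vq; alternatively free
			 * the rmap and return 0 to run without aRFS */
			rc = -EINVAL;
			goto failed;
		}

		rc = irq_cpu_rmap_add(vi->dev->rx_cpu_rmap, irq);
		if (rc)
			goto failed;
	}
	return 0;

failed:
	virtnet_free_irq_cpu_rmap(vi);
#endif
	return rc;
}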

>
>> By the way, do you have thought about if it makes sense to add aRFS
>> support to virtio_net? For [patch 2/3], what do you think of those
>> missing stuff listed by me?
>> For how indirect table is implemented in sfc NIC, do you have any doc
>> to share with  me? thanks.
>
> Going through that list:
>
>> 1.)  guest virtio_net driver should have one filter table and its
>> entries can be expired periodically;
>
> In sfc, we keep a count how many entries have been inserted in each NAPI
> context.  Whenever the NAPI poll function is about to call
> napi_complete() and the count for that context has reached a trigger
> level, it will scan some quota of filter entries for expiry.
Thanks for sharing that.
>
>> 2.)  guest virtio_net driver should pass rx queue index and filter
>> info down to the emulated virtio_net NIC in QEMU.
>> 3.) the emulated virtio_net NIC should have its indirect table to
>> store the flow to rx queue mapping.
>> 4.) the emulated virtio_net NIC should classify the rx packet to
>> selected queue by applying the filter.
>
> I think the most efficient way to do this would be to put a hash table
> in some shared memory that both guest and host can read and write.  The
> virtio control path would only be used to set up and tear down the
> table.  I don't know whether virtio allows for that.
Yes, I agree with you, but I don't know what Stefan thinks of it. CC
stefanha; he is the virtio expert. Stefan, can you give some advice
about this?

>
> However, to take advantage of ARFS on a physical net driver, it would be
> necessary to send a control request for part 2.
aRFS on a physical net driver? Which physical net driver do you mean? I
thought that in order to enable aRFS, the guest virtio_net driver should
send a control request to its emulated virtio_net NIC.
>
>> 5.) update virtio spec.
>> Do i miss anything? If yes, please correct me.
>> For 3.) and 4.), do you have any doc about how they are implemented in
>> physical NICs? e.g. mlx4_en or sfc, etc.
>
> The Programmer's Reference Manuals for Solarflare controllers are only
> available under NDA.  I can describe the hardware filtering briefly, but
> actually I don't think it's very relevant to virtio_net.
>
> There is a typical RSS hash indirection table (128 entries), but for
> ARFS we use a different RX filter table which has 8K entries
> (RX_FILTER_TBL0 on SFC4000/SFC9000 family).
>
> Solarflare controllers support user-level networking, which requires
> perfect filtering to deliver each application's flows into that
> application's dedicated RX queue(s).  Lookups in this larger filter
> table are still hash-based, but each entry specifies a TCP/IP or UDP/IP
> 4-tuple or local 2-tuple to match.  ARFS uses the 4-tuple type only.
>
> To allow for hash collisions, a secondary hash function generates an
> increment to be added to the initial table index repeatedly for hash
> chaining.  There is a control register which tells the controller the
> maximum hash chain length to search for each IP filter type; after this
> it will fall back to checking MAC filters and then default filters.
>
> On the SFC9100 family, filter updates and lookups are implemented by
> firmware and the driver doesn't manage the filter table itself, but I
> know it is still a hash table of perfect filters.
>
> For ARFS, perfect filtering is not needed.  I think it would be
> preferable to use a fairly big hash table and make insertion fail in
> case of a collision.  Since the backend for virtio_net will do RX queue
> selection in software, the entire table of queue indices should fit into
> its L1 cache.
That is a very detailed explanation, thanks a lot for sharing it. It is
very helpful to me. I will come back and read it carefully once we
reach agreement on whether aRFS should be integrated into virtio_net
and what its big picture is.

>
> Ben.
>
> --
> Ben Hutchings, Staff Engineer, Solarflare
> Not speaking for my employer; that's the marketing department's job.
> They asked us to note that Solarflare product names are trademarked.
>
Ben Hutchings Jan. 17, 2014, 5:20 p.m. UTC | #5
On Sat, 2014-01-18 at 00:54 +0800, Zhi Yong Wu wrote:
> On Fri, Jan 17, 2014 at 7:16 AM, Ben Hutchings
> <bhutchings@solarflare.com> wrote:
[...]
> > However, to take advantage of ARFS on a physical net driver, it would be
> > necessary to send a control request for part 2.
> aRFS on a physical net driver? What is this physical net driver? I
> thought that in order to enable aRFS, guest virtio_net driver should
> send a control request to its emulated virtio_net NIC.
[...]

If the backend is connected to a macvlan device on top of a physical net
device that supports ARFS, then there is further potential for improving
performance by steering to the best physical RX queue and CPU as well as
the best virtio_net RX queue and vCPU.

Ben.
Tom Herbert Jan. 18, 2014, 4:59 a.m. UTC | #6
Ben,

I've never quite understood why flow management in aRFS has to be done
with separate messages, and if I recall correctly this erodes the
performance gains to a large extent. It seems like we should be able
to piggyback, on a TX descriptor for a connection, information about
the RX side of that connection, namely the rxhash and queue mapping.
State creation would be implicit in seeing a new rxhash value, and
teardown might be accomplished with a separate flag on the final TX
packet of the connection (this would need some additional logic in the
stack). Is this method not feasible in either NICs or virtio-net?
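
Applied to virtio-net, that idea could be pictured as an extended per-packet
TX header carrying the flow's RX-side hint; the layout below is purely
illustrative, with invented field names, and nothing like it exists in the
virtio spec:

#include <linux/virtio_net.h>

/* Hypothetical extension of the per-packet header: the host would create
 * or refresh steering state when it sees a new rxhash, and a FIN-style
 * flag on the last TX packet could tear the state down. */
#define VIRTIO_NET_HDR_STEER_F_TEARDOWN	0x1

struct virtio_net_hdr_steer {
	struct virtio_net_hdr hdr;	/* existing virtio-net TX header */
	__le32 rxhash;			/* hash identifying the flow's RX side */
	__le16 rxq_index;		/* guest RX queue the flow should land on */
	__le16 steer_flags;		/* e.g. VIRTIO_NET_HDR_STEER_F_TEARDOWN */
};

The host would then learn mappings from normal TX traffic instead of a
separate control-queue request.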

On Fri, Jan 17, 2014 at 9:20 AM, Ben Hutchings
<bhutchings@solarflare.com> wrote:
> On Sat, 2014-01-18 at 00:54 +0800, Zhi Yong Wu wrote:
>> On Fri, Jan 17, 2014 at 7:16 AM, Ben Hutchings
>> <bhutchings@solarflare.com> wrote:
> [...]
>> > However, to take advantage of ARFS on a physical net driver, it would be
>> > necessary to send a control request for part 2.
>> aRFS on a physical net driver? What is this physical net driver? I
>> thought that in order to enable aRFS, guest virtio_net driver should
>> send a control request to its emulated virtio_net NIC.
> [...]
>
> If the backend is connected to a macvlan device on top of a physical net
> device that supports ARFS, then there is further potential for improving
> performance by steering to the best physical RX queue and CPU as well as
> the best virtio_net RX queue and vCPU.
>
> Ben.
>
> --
> Ben Hutchings, Staff Engineer, Solarflare
> Not speaking for my employer; that's the marketing department's job.
> They asked us to note that Solarflare product names are trademarked.
>
Ben Hutchings Jan. 18, 2014, 2:19 p.m. UTC | #7
On Fri, 2014-01-17 at 20:59 -0800, Tom Herbert wrote:
> Ben,
> 
> I've never quite understood why flow management in aRFS has to be done
> with separate messages, and if I recall this seems to mitigate
> performance gains to a large extent. It seems like we should be able
> to piggyback on a TX descriptor for a connection information about the
> RX side for that connection, namely the rxhash and queue mapping.
> State creation should be implicit by just seeing a new rxhash value,
> tear down might be accomplished with a separate flag on the final TX
> packet on the connection (this would need some additional logic in the
> stack). Is this method not feasible in either NICs or virtio-net?

Well that's roughly how Flow Director works, isn't it?  So it is
feasible on at least one NIC!  It might be possible to implement
something like that in firmware on the SFC9100 (with the filter based on
the following packet headers, not a hash), but I don't know.  As for
other vendors - I have no idea.

Inserting filters from the receive path seemed like a natural extension
of the software RFS implementation.  And it means that the hardware
filters are inserted a little earlier (no need to transmit another
packet), but maybe that doesn't matter much.

Ben.

Patch

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 046421c..9f4cfa7 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -26,6 +26,7 @@ 
 #include <linux/if_vlan.h>
 #include <linux/slab.h>
 #include <linux/cpu.h>
+#include <linux/cpu_rmap.h>
 
 static int napi_weight = NAPI_POLL_WEIGHT;
 module_param(napi_weight, int, 0444);
@@ -1554,6 +1555,49 @@  err:
 	return ret;
 }
 
+static void virtnet_free_irq_cpu_rmap(struct virtnet_info *vi)
+{
+#ifdef CONFIG_RFS_ACCEL
+	free_irq_cpu_rmap(vi->dev->rx_cpu_rmap);
+	vi->dev->rx_cpu_rmap = NULL;
+#endif
+}
+
+static int virtnet_init_rx_cpu_rmap(struct virtnet_info *vi)
+{
+	int rc = 0;
+
+#ifdef CONFIG_RFS_ACCEL
+	struct virtio_device *vdev = vi->vdev;
+	unsigned int irq;
+	int i;
+
+	if (!vi->affinity_hint_set)
+		goto out;
+
+	vi->dev->rx_cpu_rmap = alloc_irq_cpu_rmap(vi->max_queue_pairs);
+	if (!vi->dev->rx_cpu_rmap) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		irq = virtqueue_get_vq_irq(vdev, vi->rq[i].vq);
+		if (irq == -1)
+			goto failed;
+
+		rc = irq_cpu_rmap_add(vi->dev->rx_cpu_rmap, irq);
+		if (rc) {
+failed:
+			virtnet_free_irq_cpu_rmap(vi);
+			goto out;
+		}
+	}
+out:
+#endif
+	return rc;
+}
+
 static int virtnet_probe(struct virtio_device *vdev)
 {
 	int i, err;
@@ -1671,10 +1715,16 @@  static int virtnet_probe(struct virtio_device *vdev)
 	netif_set_real_num_tx_queues(dev, vi->curr_queue_pairs);
 	netif_set_real_num_rx_queues(dev, vi->curr_queue_pairs);
 
+	err = virtnet_init_rx_cpu_rmap(vi);
+	if (err) {
+		pr_debug("virtio_net: initializing rx cpu rmap failed\n");
+		goto free_vqs;
+	}
+
 	err = register_netdev(dev);
 	if (err) {
 		pr_debug("virtio_net: registering device failed\n");
-		goto free_vqs;
+		goto free_cpu_rmap;
 	}
 
 	/* Last of all, set up some receive buffers. */
@@ -1714,6 +1764,8 @@  static int virtnet_probe(struct virtio_device *vdev)
 free_recv_bufs:
 	free_receive_bufs(vi);
 	unregister_netdev(dev);
+free_cpu_rmap:
+	virtnet_free_irq_cpu_rmap(vi);
 free_vqs:
 	cancel_delayed_work_sync(&vi->refill);
 	virtnet_del_vqs(vi);
@@ -1751,6 +1803,8 @@  static void virtnet_remove(struct virtio_device *vdev)
 
 	unregister_netdev(vi->dev);
 
+	virtnet_free_irq_cpu_rmap(vi);
+
 	remove_vq_common(vi);
 	if (vi->alloc_frag.page)
 		put_page(vi->alloc_frag.page);