diff mbox

[net-next,09/10] bnx2x: Added FW GRO bridging support

Message ID 1358176310-31504-10-git-send-email-yuvalmin@broadcom.com
State Accepted, archived
Delegated to: David Miller
Headers show

Commit Message

Yuval Mintz Jan. 14, 2013, 3:11 p.m. UTC
Since submit 621b4d6 the bnx2x driver support FW GRO.
However, when using the device with GRO enabled in bridging
scenarios throughput is very low, as the bridge expects all
incoming packets to be passed with CHECKSUM_PARTIAL -
a demand which is satisfied by the SW GRO implementation,
but was missed in the bnx2x driver implementation (which returned
CHECKSUM_UNNECESSARY).

Now, given that the traffic is supported by FW GRO (TCP/IP),
the bnx2x driver calculates the pseudo checksum by itself,
passing skbs with CHECKSUM_PARTIAL and giving a much better
throughput when receiving GRO traffic.

Signed-off-by: Yuval Mintz <yuvalmin@broadcom.com>
Signed-off-by: Ariel Elior <ariele@broadcom.com>
Signed-off-by: Eilon Greenstein <eilong@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 54 ++++++++++++++++++++++++-
 1 file changed, 52 insertions(+), 2 deletions(-)

Comments

Eric Dumazet Jan. 14, 2013, 5:22 p.m. UTC | #1
On Mon, 2013-01-14 at 17:11 +0200, Yuval Mintz wrote:
> Since submit 621b4d6 the bnx2x driver support FW GRO.
> However, when using the device with GRO enabled in bridging
> scenarios throughput is very low, as the bridge expects all
> incoming packets to be passed with CHECKSUM_PARTIAL -
> a demand which is satisfied by the SW GRO implementation,
> but was missed in the bnx2x driver implementation (which returned
> CHECKSUM_UNNECESSARY).
> 
> Now, given that the traffic is supported by FW GRO (TCP/IP),
> the bnx2x driver calculates the pseudo checksum by itself,
> passing skbs with CHECKSUM_PARTIAL and giving a much better
> throughput when receiving GRO traffic.
> 
> Signed-off-by: Yuval Mintz <yuvalmin@broadcom.com>
> Signed-off-by: Ariel Elior <ariele@broadcom.com>
> Signed-off-by: Eilon Greenstein <eilong@broadcom.com>
> ---
>  drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 54 ++++++++++++++++++++++++-
>  1 file changed, 52 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
> index a6f4140..963eb2d 100644
> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
> @@ -21,6 +21,7 @@
>  #include <linux/if_vlan.h>
>  #include <linux/interrupt.h>
>  #include <linux/ip.h>
> +#include <net/tcp.h>
>  #include <net/ipv6.h>
>  #include <net/ip6_checksum.h>
>  #include <linux/prefetch.h>
> @@ -506,7 +507,7 @@ static int bnx2x_fill_frag_skb(struct bnx2x *bp, struct bnx2x_fastpath *fp,
>  					tpa_info->parsing_flags, len_on_bd);
>  
>  		/* set for GRO */
> -		if (fp->mode == TPA_MODE_GRO)
> +		if (fp->mode == TPA_MODE_GRO && skb_shinfo(skb)->gso_size)
>  			skb_shinfo(skb)->gso_type =
>  			    (GET_FLAG(tpa_info->parsing_flags,
>  				      PARSING_FLAGS_OVER_ETHERNET_PROTOCOL) ==
> @@ -595,6 +596,55 @@ static void *bnx2x_frag_alloc(const struct bnx2x_fastpath *fp)
>  }
>  
> 
> +#ifdef CONFIG_INET
> +static void bnx2x_gro_ip_csum(struct bnx2x *bp, struct sk_buff *skb)
> +{
> +	const struct iphdr *iph = ip_hdr(skb);
> +	struct tcphdr *th;
> +
> +	skb_set_transport_header(skb, sizeof(struct iphdr));
> +	th = tcp_hdr(skb);
> +
> +	th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
> +				  iph->saddr, iph->daddr, 0);
> +}
> +
> +static void bnx2x_gro_ipv6_csum(struct bnx2x *bp, struct sk_buff *skb)
> +{
> +	struct ipv6hdr *iph = ipv6_hdr(skb);
> +	struct tcphdr *th;
> +
> +	skb_set_transport_header(skb, sizeof(struct ipv6hdr));
> +	th = tcp_hdr(skb);
> +
> +	th->check = ~tcp_v6_check(skb->len - skb_transport_offset(skb),
> +				  &iph->saddr, &iph->daddr, 0);
> +}
> +#endif
> +
> +static void bnx2x_gro_receive(struct bnx2x *bp, struct bnx2x_fastpath *fp,
> +			       struct sk_buff *skb)
> +{
> +#ifdef CONFIG_INET
> +	if (fp->mode == TPA_MODE_GRO && skb_shinfo(skb)->gso_size) {
> +		skb_set_network_header(skb, 0);
> +		switch (be16_to_cpu(skb->protocol)) {
> +		case ETH_P_IP:
> +			bnx2x_gro_ip_csum(bp, skb);
> +			break;
> +		case ETH_P_IPV6:
> +			bnx2x_gro_ipv6_csum(bp, skb);
> +			break;
> +		default:
> +			BNX2X_ERR("FW GRO supports only IPv4/IPv6, not 0x%04x\n",
> +				  be16_to_cpu(skb->protocol));
> +		}
> +		tcp_gro_complete(skb);

This looks weird to me. This should be called by GRO stack only.

What is the value of gso_segs ?



> +	}
> +#endif
> +	napi_gro_receive(&fp->napi, skb);
> +}
> +
>  static void bnx2x_tpa_stop(struct bnx2x *bp, struct bnx2x_fastpath *fp,
>  			   struct bnx2x_agg_info *tpa_info,
>  			   u16 pages,
> @@ -648,7 +698,7 @@ static void bnx2x_tpa_stop(struct bnx2x *bp, struct bnx2x_fastpath *fp,
>  					 skb, cqe, cqe_idx)) {
>  			if (tpa_info->parsing_flags & PARSING_FLAGS_VLAN)
>  				__vlan_hwaccel_put_tag(skb, tpa_info->vlan_tag);
> -			napi_gro_receive(&fp->napi, skb);
> +			bnx2x_gro_receive(bp, fp, skb);
>  		} else {
>  			DP(NETIF_MSG_RX_STATUS,
>  			   "Failed to allocate new pages - dropping packet!\n");


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet Jan. 14, 2013, 6:44 p.m. UTC | #2
On Mon, 2013-01-14 at 09:22 -0800, Eric Dumazet wrote:

> This looks weird to me. This should be called by GRO stack only.
> 
> What is the value of gso_segs ?
> 
> 

The reason I am pointing this out is the recent change in commit
1def9238d4aa2146924994aa4b7dc861f03b9362
(net_sched: more precise pkt_len computation)

bnx2x not setting gso_segs means that qdisc accounting on ingress is
completely wrong.



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yuval Mintz Jan. 15, 2013, 7:28 a.m. UTC | #3
On 01/14/2013 08:44 PM, Eric Dumazet wrote:
> On Mon, 2013-01-14 at 09:22 -0800, Eric Dumazet wrote:
> 
> What is the value of gso_segs ?
> 
> The reason I am pointing this out is the recent change in commit
> 1def9238d4aa2146924994aa4b7dc861f03b9362
> (net_sched: more precise pkt_len computation)
> 
> bnx2x not setting gso_segs means that qdisc accounting on ingress is
> completely wrong.

Hi Eric,

First I just want to state that you're totally correct about the gso_segs -
bnx2x is not setting it correctly (it's currently totally omitted), and so
it would incorrectly affect the accounting.

However, notice this behaviour was not introduced in this patch -
Since 621b4d6 bnx2x is using FW GRO, overriding the kernel's GRO implementation:
As the bnx2x driver is supplied with the aggregated packet from its FW,
the bnx2x passes a value in the `gso_size' field of its skb, causing
`skb_is_gso' to return `true'.
This will cause the aggregated skb to override the GRO machinations
(in `dev_gro_receive'), overriding all calls to `gro_receive' and thus also
the call to `skb_gro_receive' which whould have incremented `gso_segs'.

This patch actually tries to correct said behaviour, obviously failing
with the gso_segs but still improving the current state of bnx2x GRO
in bridging scenarios.

>> This looks weird to me. This should be called by GRO stack only.

I think this is the main question - we could try and implement this
inside the network-core itself, but as said behaviour is unique to the
bnx2x driver (correct me if I'm wrong, but I'm unaware of any other
driver which does GRO without the kernel GRO implementation), the
solution is specially tailored for the bnx2x driver.

We could either:
  1. Continue with this patch, later sending a patch correcting gso_segs,
     as this is not a new issue.
  2. Send a V2 patch-series which will also set gso_segs correctly.
  3. Send a V2 patch-series which omits this patch, and later send an RFC
     for some kernel implementation which fixes the issue.

Your thoughts on this matter will be greatly appreciated.

Thanks,
Yuval

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yuval Mintz Jan. 15, 2013, 2:02 p.m. UTC | #4
>>> bnx2x not setting gso_segs means that qdisc accounting on ingress is
>>> completely wrong.
>>
>> Notice this behaviour was not introduced in this patch -
>>
>>	...
>>
>> We could either:
>>   1. Continue with this patch, later sending a patch correcting gso_segs,
>>      as this is not a new issue.
>>   2. Send a V2 patch-series which will also set gso_segs correctly.
> 
> I am fine with any solution, as long as we fix the problem.

Eric - Thanks.
Just to be certain - gso_segs should hold the number of non-aggregated packets
contained in the skb's frags, right?


Dave -

Considering Eric's response, following with option (1) or (2) seems like the
right way to go.

Do you want a new patch series which will include a fix for this,
or will you accept a later fix that sets the gso_segs correctly?

(considering this issue is not introduced in this patch,
merely isn't being solved by it)

Thanks,
Yuval Mintz

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet Jan. 15, 2013, 2:39 p.m. UTC | #5
On Tue, 2013-01-15 at 09:28 +0200, Yuval Mintz wrote:
> On 01/14/2013 08:44 PM, Eric Dumazet wrote:
> > On Mon, 2013-01-14 at 09:22 -0800, Eric Dumazet wrote:
> > 
> > What is the value of gso_segs ?
> > 
> > The reason I am pointing this out is the recent change in commit
> > 1def9238d4aa2146924994aa4b7dc861f03b9362
> > (net_sched: more precise pkt_len computation)
> > 
> > bnx2x not setting gso_segs means that qdisc accounting on ingress is
> > completely wrong.
> 
> Hi Eric,
> 
> First I just want to state that you're totally correct about the gso_segs -
> bnx2x is not setting it correctly (it's currently totally omitted), and so
> it would incorrectly affect the accounting.
> 
> However, notice this behaviour was not introduced in this patch -
> Since 621b4d6 bnx2x is using FW GRO, overriding the kernel's GRO implementation:
> As the bnx2x driver is supplied with the aggregated packet from its FW,
> the bnx2x passes a value in the `gso_size' field of its skb, causing
> `skb_is_gso' to return `true'.
> This will cause the aggregated skb to override the GRO machinations
> (in `dev_gro_receive'), overriding all calls to `gro_receive' and thus also
> the call to `skb_gro_receive' which whould have incremented `gso_segs'.
> 
> This patch actually tries to correct said behaviour, obviously failing
> with the gso_segs but still improving the current state of bnx2x GRO
> in bridging scenarios.
> 
> >> This looks weird to me. This should be called by GRO stack only.
> 
> I think this is the main question - we could try and implement this
> inside the network-core itself, but as said behaviour is unique to the
> bnx2x driver (correct me if I'm wrong, but I'm unaware of any other
> driver which does GRO without the kernel GRO implementation), the
> solution is specially tailored for the bnx2x driver.
> 
> We could either:
>   1. Continue with this patch, later sending a patch correcting gso_segs,
>      as this is not a new issue.
>   2. Send a V2 patch-series which will also set gso_segs correctly.
>   3. Send a V2 patch-series which omits this patch, and later send an RFC
>      for some kernel implementation which fixes the issue.
> 
> Your thoughts on this matter will be greatly appreciated.

I am fine with any solution, as long as we fix the problem.

If GRO is done by the FW/driver instead of core network stack, we should
make sure :

- transport_header is set correctly (your patch seems to do that)
- gso_segs is computed (this could be done in core network, but this
adds yet another conditional test in th fast path, and it seems only
bnx2x would need this)

Thanks


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet Jan. 15, 2013, 3:07 p.m. UTC | #6
On Tue, 2013-01-15 at 16:02 +0200, Yuval Mintz wrote:
> >>> bnx2x not setting gso_segs means that qdisc accounting on ingress is
> >>> completely wrong.
> >>
> >> Notice this behaviour was not introduced in this patch -
> >>
> >>	...
> >>
> >> We could either:
> >>   1. Continue with this patch, later sending a patch correcting gso_segs,
> >>      as this is not a new issue.
> >>   2. Send a V2 patch-series which will also set gso_segs correctly.
> > 
> > I am fine with any solution, as long as we fix the problem.
> 
> Eric - Thanks.
> Just to be certain - gso_segs should hold the number of non-aggregated packets
> contained in the skb's frags, right?

Thats right

If the FW doesnt provide the info, you could for example do :

shinfo->gso_segs = DIV_ROUND_UP(skb->len - hdr_len, shinfo->gso_size);

(hdr_len being the length of all headers (ethernet+network+TCP))



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller Jan. 15, 2013, 8:09 p.m. UTC | #7
From: "Yuval Mintz" <yuvalmin@broadcom.com>
Date: Tue, 15 Jan 2013 16:02:32 +0200

> Considering Eric's response, following with option (1) or (2) seems like the
> right way to go.
> 
> Do you want a new patch series which will include a fix for this,
> or will you accept a later fix that sets the gso_segs correctly?
> 
> (considering this issue is not introduced in this patch,
> merely isn't being solved by it)

I've merged your patch series as-is to net-next, please work on fixing
the problems Eric has pointed out.

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet Jan. 16, 2013, 4:56 a.m. UTC | #8
On Tue, 2013-01-15 at 15:09 -0500, David Miller wrote:

> 
> I've merged your patch series as-is to net-next, please work on fixing
> the problems Eric has pointed out.

I have two fixes :

One in net/core/dev.c you probably can apply after usual review

One for bnx2x, I guess Broadcom guys need to check it.



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index a6f4140..963eb2d 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -21,6 +21,7 @@ 
 #include <linux/if_vlan.h>
 #include <linux/interrupt.h>
 #include <linux/ip.h>
+#include <net/tcp.h>
 #include <net/ipv6.h>
 #include <net/ip6_checksum.h>
 #include <linux/prefetch.h>
@@ -506,7 +507,7 @@  static int bnx2x_fill_frag_skb(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 					tpa_info->parsing_flags, len_on_bd);
 
 		/* set for GRO */
-		if (fp->mode == TPA_MODE_GRO)
+		if (fp->mode == TPA_MODE_GRO && skb_shinfo(skb)->gso_size)
 			skb_shinfo(skb)->gso_type =
 			    (GET_FLAG(tpa_info->parsing_flags,
 				      PARSING_FLAGS_OVER_ETHERNET_PROTOCOL) ==
@@ -595,6 +596,55 @@  static void *bnx2x_frag_alloc(const struct bnx2x_fastpath *fp)
 }
 
 
+#ifdef CONFIG_INET
+static void bnx2x_gro_ip_csum(struct bnx2x *bp, struct sk_buff *skb)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	struct tcphdr *th;
+
+	skb_set_transport_header(skb, sizeof(struct iphdr));
+	th = tcp_hdr(skb);
+
+	th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
+				  iph->saddr, iph->daddr, 0);
+}
+
+static void bnx2x_gro_ipv6_csum(struct bnx2x *bp, struct sk_buff *skb)
+{
+	struct ipv6hdr *iph = ipv6_hdr(skb);
+	struct tcphdr *th;
+
+	skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+	th = tcp_hdr(skb);
+
+	th->check = ~tcp_v6_check(skb->len - skb_transport_offset(skb),
+				  &iph->saddr, &iph->daddr, 0);
+}
+#endif
+
+static void bnx2x_gro_receive(struct bnx2x *bp, struct bnx2x_fastpath *fp,
+			       struct sk_buff *skb)
+{
+#ifdef CONFIG_INET
+	if (fp->mode == TPA_MODE_GRO && skb_shinfo(skb)->gso_size) {
+		skb_set_network_header(skb, 0);
+		switch (be16_to_cpu(skb->protocol)) {
+		case ETH_P_IP:
+			bnx2x_gro_ip_csum(bp, skb);
+			break;
+		case ETH_P_IPV6:
+			bnx2x_gro_ipv6_csum(bp, skb);
+			break;
+		default:
+			BNX2X_ERR("FW GRO supports only IPv4/IPv6, not 0x%04x\n",
+				  be16_to_cpu(skb->protocol));
+		}
+		tcp_gro_complete(skb);
+	}
+#endif
+	napi_gro_receive(&fp->napi, skb);
+}
+
 static void bnx2x_tpa_stop(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 			   struct bnx2x_agg_info *tpa_info,
 			   u16 pages,
@@ -648,7 +698,7 @@  static void bnx2x_tpa_stop(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 					 skb, cqe, cqe_idx)) {
 			if (tpa_info->parsing_flags & PARSING_FLAGS_VLAN)
 				__vlan_hwaccel_put_tag(skb, tpa_info->vlan_tag);
-			napi_gro_receive(&fp->napi, skb);
+			bnx2x_gro_receive(bp, fp, skb);
 		} else {
 			DP(NETIF_MSG_RX_STATUS,
 			   "Failed to allocate new pages - dropping packet!\n");