diff mbox

bnx2x: add support for receive hashing

Message ID alpine.DEB.1.00.1004222249400.27016@pokey.mtv.corp.google.com
State Accepted, archived
Delegated to: David Miller
Headers show

Commit Message

Tom Herbert April 23, 2010, 5:54 a.m. UTC
Add support to bnx2x to extract Toeplitz hash out of the receive descriptor
for use in skb->rxhash.

Signed-off-by: Tom Herbert <therbert@google.com>
---
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

David Miller April 23, 2010, 7:11 a.m. UTC | #1
From: Tom Herbert <therbert@google.com>
Date: Thu, 22 Apr 2010 22:54:16 -0700 (PDT)

> Add support to bnx2x to extract Toeplitz hash out of the receive descriptor
> for use in skb->rxhash.
> 
> Signed-off-by: Tom Herbert <therbert@google.com>

Sweeeeet.

Applied, thanks Tom.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet April 26, 2010, 5:20 p.m. UTC | #2
Le jeudi 22 avril 2010 à 22:54 -0700, Tom Herbert a écrit :
> Add support to bnx2x to extract Toeplitz hash out of the receive descriptor
> for use in skb->rxhash.
> 
> Signed-off-by: Tom Herbert <therbert@google.com>
> ---
> diff --git a/drivers/net/bnx2x.h b/drivers/net/bnx2x.h
> index 0819530..8bd2368 100644
> --- a/drivers/net/bnx2x.h
> +++ b/drivers/net/bnx2x.h
> @@ -1330,7 +1330,7 @@ static inline u32 reg_poll(struct bnx2x *bp, u32 reg, u32 expected, int ms,
>  		AEU_INPUTS_ATTN_BITS_MCP_LATCHED_UMP_TX_PARITY | \
>  		AEU_INPUTS_ATTN_BITS_MCP_LATCHED_SCPAD_PARITY)
>  
> -#define MULTI_FLAGS(bp) \
> +#define RSS_FLAGS(bp) \
>  		(TSTORM_ETH_FUNCTION_COMMON_CONFIG_RSS_IPV4_CAPABILITY | \
>  		 TSTORM_ETH_FUNCTION_COMMON_CONFIG_RSS_IPV4_TCP_CAPABILITY | \
>  		 TSTORM_ETH_FUNCTION_COMMON_CONFIG_RSS_IPV6_CAPABILITY | \
> diff --git a/drivers/net/bnx2x_main.c b/drivers/net/bnx2x_main.c
> index 0c6dba2..613f727 100644
> --- a/drivers/net/bnx2x_main.c
> +++ b/drivers/net/bnx2x_main.c
> @@ -1582,7 +1582,7 @@ static int bnx2x_rx_int(struct bnx2x_fastpath *fp, int budget)
>  		struct sw_rx_bd *rx_buf = NULL;
>  		struct sk_buff *skb;
>  		union eth_rx_cqe *cqe;
> -		u8 cqe_fp_flags;
> +		u8 cqe_fp_flags, cqe_fp_status_flags;
>  		u16 len, pad;
>  
>  		comp_ring_cons = RCQ_BD(sw_comp_cons);
> @@ -1598,6 +1598,7 @@ static int bnx2x_rx_int(struct bnx2x_fastpath *fp, int budget)
>  
>  		cqe = &fp->rx_comp_ring[comp_ring_cons];
>  		cqe_fp_flags = cqe->fast_path_cqe.type_error_flags;
> +		cqe_fp_status_flags = cqe->fast_path_cqe.status_flags;
>  
>  		DP(NETIF_MSG_RX_STATUS, "CQE type %x  err %x  status %x"
>  		   "  queue %x  vlan %x  len %u\n", CQE_TYPE(cqe_fp_flags),
> @@ -1727,6 +1728,12 @@ reuse_rx:
>  
>  			skb->protocol = eth_type_trans(skb, bp->dev);
>  
> +			if ((bp->dev->features & ETH_FLAG_RXHASH) &&
> +			    (cqe_fp_status_flags &
> +			     ETH_FAST_PATH_RX_CQE_RSS_HASH_FLG))
> +				skb->rxhash = le32_to_cpu(
> +				    cqe->fast_path_cqe.rss_hash_result);
> +
>  			skb->ip_summed = CHECKSUM_NONE;
>  			if (bp->rx_csum) {
>  				if (likely(BNX2X_RX_CSUM_OK(cqe)))
> @@ -5750,10 +5757,10 @@ static void bnx2x_init_internal_func(struct bnx2x *bp)
>  	u32 offset;
>  	u16 max_agg_size;
>  
> -	if (is_multi(bp)) {
> -		tstorm_config.config_flags = MULTI_FLAGS(bp);
> +	tstorm_config.config_flags = RSS_FLAGS(bp);
> +
> +	if (is_multi(bp))
>  		tstorm_config.rss_result_mask = MULTI_MASK;
> -	}
>  
>  	/* Enable TPA if needed */
>  	if (bp->flags & TPA_ENABLE_FLAG)
> @@ -6629,10 +6636,8 @@ static int bnx2x_init_common(struct bnx2x *bp)
>  	bnx2x_init_block(bp, PBF_BLOCK, COMMON_STAGE);
>  
>  	REG_WR(bp, SRC_REG_SOFT_RST, 1);
> -	for (i = SRC_REG_KEYRSS0_0; i <= SRC_REG_KEYRSS1_9; i += 4) {
> -		REG_WR(bp, i, 0xc0cac01a);
> -		/* TODO: replace with something meaningful */
> -	}
> +	for (i = SRC_REG_KEYRSS0_0; i <= SRC_REG_KEYRSS1_9; i += 4)
> +		REG_WR(bp, i, random32());
>  	bnx2x_init_block(bp, SRCH_BLOCK, COMMON_STAGE);
>  #ifdef BCM_CNIC
>  	REG_WR(bp, SRC_REG_KEYSEARCH_0, 0x63285672);
> @@ -11001,6 +11006,11 @@ static int bnx2x_set_flags(struct net_device *dev, u32 data)
>  		changed = 1;
>  	}
>  
> +	if (data & ETH_FLAG_RXHASH)
> +		dev->features |= NETIF_F_RXHASH;
> +	else
> +		dev->features &= ~NETIF_F_RXHASH;
> +
>  	if (changed && netif_running(dev)) {
>  		bnx2x_nic_unload(bp, UNLOAD_NORMAL);
>  		rc = bnx2x_nic_load(bp, LOAD_NORMAL);
> --

Hi Tom

I tested this rxhash feature on my bnx2x card, using latest net-next-2.6
and appropriate ethtool

ethtool -k eth1 rxhash on

Then I used my pktgen script, to flood machine with flows with udp dst
port varying between 4000 and 4015.

Software generated rxhash is OK (16 different values).

But with bnx2x provided hash, all skb were hashed to same rxhash
value :(

What are the specs of this hardware hash ?
It seems to not care of udp source/destination ports.

Also, should'nt we feed same values for the seeds on different nics ?

for (i = SRC_REG_KEYRSS0_0; i <= SRC_REG_KEYRSS1_9; i += 4)
	REG_WR(bp, i, random32());

Thanks


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tom Herbert April 26, 2010, 5:38 p.m. UTC | #3
> Hi Tom
>
> I tested this rxhash feature on my bnx2x card, using latest net-next-2.6
> and appropriate ethtool
>
> ethtool -k eth1 rxhash on
>
> Then I used my pktgen script, to flood machine with flows with udp dst
> port varying between 4000 and 4015.
>
> Software generated rxhash is OK (16 different values).
>
> But with bnx2x provided hash, all skb were hashed to same rxhash
> value :(
>
> What are the specs of this hardware hash ?
> It seems to not care of udp source/destination ports.
>

It would appear that way :-(.  I was going to ping Broadcom folks to
see if there's support for UDP.

> Also, should'nt we feed same values for the seeds on different nics ?
>
> for (i = SRC_REG_KEYRSS0_0; i <= SRC_REG_KEYRSS1_9; i += 4)
>        REG_WR(bp, i, random32());
>

Yes, that is reasonable.

> Thanks
>
>
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ben Hutchings April 26, 2010, 5:47 p.m. UTC | #4
On Mon, 2010-04-26 at 10:38 -0700, Tom Herbert wrote:
> > Hi Tom
> >
> > I tested this rxhash feature on my bnx2x card, using latest net-next-2.6
> > and appropriate ethtool
> >
> > ethtool -k eth1 rxhash on
> >
> > Then I used my pktgen script, to flood machine with flows with udp dst
> > port varying between 4000 and 4015.
> >
> > Software generated rxhash is OK (16 different values).
> >
> > But with bnx2x provided hash, all skb were hashed to same rxhash
> > value :(
> >
> > What are the specs of this hardware hash ?
> > It seems to not care of udp source/destination ports.
> >
> 
> It would appear that way :-(.  I was going to ping Broadcom folks to
> see if there's support for UDP.
[...]

Unfortunately the widely-implemented Toeplitz hash functions are defined
only for TCP/IPv4, TCP/IPv6, IPv4 and IPv6.

Ben.
David Miller April 26, 2010, 5:56 p.m. UTC | #5
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Mon, 26 Apr 2010 18:47:52 +0100

> Unfortunately the widely-implemented Toeplitz hash functions are defined
> only for TCP/IPv4, TCP/IPv6, IPv4 and IPv6.

What a complete and utter waste all of this is then.

Defining the hash as TCP only is about as stupid as making hw checksum
offload be TCP only.

Really, I'm completely stunned.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller April 26, 2010, 6:04 p.m. UTC | #6
From: Tom Herbert <therbert@google.com>
Date: Mon, 26 Apr 2010 10:38:27 -0700

> It would appear that way :-(.  I was going to ping Broadcom folks to
> see if there's support for UDP.

I'm pretty sure there isn't at this point.

We'll need to elide setting ->rxhash for non-TCP packets.  I bet that
the ETH_FAST_PATH_RX_CQE_RSS_HASH_TYPE field might be usable to making
this decision, but if not in the worst case we'll need to parse the
VLAN/ETH and IP4/IP6 headers to figure out the protocol.

Damn, I'm so pissed off about this.  This ruins everything!

How damn hard is it to add two 16-bit ports to the hash regardless of
protocol?
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tom Herbert April 26, 2010, 6:19 p.m. UTC | #7
> I'm pretty sure there isn't at this point.
>
> We'll need to elide setting ->rxhash for non-TCP packets.  I bet that
> the ETH_FAST_PATH_RX_CQE_RSS_HASH_TYPE field might be usable to making
> this decision, but if not in the worst case we'll need to parse the
> VLAN/ETH and IP4/IP6 headers to figure out the protocol.
>
> Damn, I'm so pissed off about this.  This ruins everything!
>
> How damn hard is it to add two 16-bit ports to the hash regardless of
> protocol?
>
Fair question.

This also hits RSS/multiqueue. In a netperf RR test, 500 streams
between my two 16 core AMDs:  TCP 970K tps, UDP 370K tps.  I'm
surprised they didn't catch that in some benchmarks...
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller April 26, 2010, 6:22 p.m. UTC | #8
From: Tom Herbert <therbert@google.com>
Date: Mon, 26 Apr 2010 11:19:05 -0700

> This also hits RSS/multiqueue. In a netperf RR test, 500 streams
> between my two 16 core AMDs:  TCP 970K tps, UDP 370K tps.  I'm
> surprised they didn't catch that in some benchmarks...

Meanwhile, these NIC vendors seem to have all the time in the world to
add iSCSI, RDMA and all the other stateful offload junk into their
firmware and silicon.

Yet they can't hash ports if the protocol is not TCP?  Beyond
baffling...
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rick Jones April 26, 2010, 8:19 p.m. UTC | #9
David Miller wrote:
> From: Tom Herbert <therbert@google.com>
> Date: Mon, 26 Apr 2010 11:19:05 -0700
> 
> 
>>This also hits RSS/multiqueue. In a netperf RR test, 500 streams
>>between my two 16 core AMDs:  TCP 970K tps, UDP 370K tps.  I'm
>>surprised they didn't catch that in some benchmarks...
> 
> 
> Meanwhile, these NIC vendors seem to have all the time in the world to
> add iSCSI, RDMA and all the other stateful offload junk into their
> firmware and silicon.
> 
> Yet they can't hash ports if the protocol is not TCP?  Beyond
> baffling...

As a networking guy I can see why it seems baffling, but stepping out of myself 
and thinking like the customers with whom I've interacted over the years, it is 
not baffling at all.

By and large, customers do not do anything "substantial" with UDP.  NFS went to 
TCP mounts 99 times out of 10 many years ago, leaving DNS as about the only 
thing left*. So, customers will not be chomping at the bit for improved UDP 
scalability/performance.  They would though, be jumping up and down demanding 
iSCSI performance and by implication all that comes along for the ride.

rick jones

* And even there, one of the biggest pushes is trying to make TCP "transaction 
friendly" to deal with DNS messages becoming larger than typical MTUs.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller April 26, 2010, 8:40 p.m. UTC | #10
From: Rick Jones <rick.jones2@hp.com>
Date: Mon, 26 Apr 2010 13:19:31 -0700

> As a networking guy I can see why it seems baffling, but stepping out
> of myself and thinking like the customers with whom I've interacted
> over the years, it is not baffling at all.

<sarcasm>
And hey nobody is using SCTP either, that's right, nobody...
</sarcasm>

Look, don't try to defend this abomination of a situation with some
"customers only use TCP" argument.  It only makes the situation look
even more absurd.  

Furthermore, people test system scalability using tools like pktgen,
which surprise surprise generates streams of UDP packets.  Most
hardware based scalability testers spew UDP too.

Everything in the world points to "this toeplitz hash situation is
stupid an inexcusable."

If UDP isn't used by anyone, then you tell me why the checksum engines
of all of these chips can handle them just fine.  Maybe the guy who
works on the checksum logic blocks doesn't talk to the guy who works
on the hashing ones?  Maybe the checksum guy can find the ports in a
UDP packet, but the hashing dude can't locate them?

What the heck do you think people use for various forms of media
streaming?  They often use UDP and it has to scale, and they'd like to
move to DCCP at some point too which is another argument for a fully
protocol agnostic hash.

Why do you think Eric Dumazet gives a crap about UDP scalability and
is constantly testing it?  What about VOIP?  H.323, RTP, etc.?

Some of these cards can even statelessly offload UDP fragmentation
too, in silicon, not even in firmware.  What's their excuse for
screwing up the hash?

Look, this is a complete joke from every angle, at least admit that
fact.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rick Jones April 26, 2010, 8:48 p.m. UTC | #11
David Miller wrote:
> From: Rick Jones <rick.jones2@hp.com>
> Date: Mon, 26 Apr 2010 13:19:31 -0700
> 
> 
>>As a networking guy I can see why it seems baffling, but stepping out
>>of myself and thinking like the customers with whom I've interacted
>>over the years, it is not baffling at all.
> 
> 
> <sarcasm>
> And hey nobody is using SCTP either, that's right, nobody...
> </sarcasm>
> 
> Look, don't try to defend this abomination of a situation with some
> "customers only use TCP" argument.  It only makes the situation look
> even more absurd.  

Do not confuse explanation with endorsement.

rick
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller April 26, 2010, 8:53 p.m. UTC | #12
From: Rick Jones <rick.jones2@hp.com>
Date: Mon, 26 Apr 2010 13:48:22 -0700

> Do not confuse explanation with endorsement.

Ok, fair enough.

But I don't see even the "other perspective" argument being even
valid.  Big shops still use UDP and it has to scale.

Or have they made multicast magically start working with TCP so
they can us it to do trades on the NASDAQ?
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
stephen hemminger April 26, 2010, 8:58 p.m. UTC | #13
On Mon, 26 Apr 2010 13:40:51 -0700 (PDT)
David Miller <davem@davemloft.net> wrote:

> From: Rick Jones <rick.jones2@hp.com>
> Date: Mon, 26 Apr 2010 13:19:31 -0700
> 
> > As a networking guy I can see why it seems baffling, but stepping out
> > of myself and thinking like the customers with whom I've interacted
> > over the years, it is not baffling at all.
> 
> <sarcasm>
> And hey nobody is using SCTP either, that's right, nobody...
> </sarcasm>
> 
> Look, don't try to defend this abomination of a situation with some
> "customers only use TCP" argument.  It only makes the situation look
> even more absurd.  
> 
> Furthermore, people test system scalability using tools like pktgen,
> which surprise surprise generates streams of UDP packets.  Most
> hardware based scalability testers spew UDP too.
> 
> Everything in the world points to "this toeplitz hash situation is
> stupid an inexcusable."
> 
> If UDP isn't used by anyone, then you tell me why the checksum engines
> of all of these chips can handle them just fine.  Maybe the guy who
> works on the checksum logic blocks doesn't talk to the guy who works
> on the hashing ones?  Maybe the checksum guy can find the ports in a
> UDP packet, but the hashing dude can't locate them?
> 
> What the heck do you think people use for various forms of media
> streaming?  They often use UDP and it has to scale, and they'd like to
> move to DCCP at some point too which is another argument for a fully
> protocol agnostic hash.
> 
> Why do you think Eric Dumazet gives a crap about UDP scalability and
> is constantly testing it?  What about VOIP?  H.323, RTP, etc.?
> 
> Some of these cards can even statelessly offload UDP fragmentation
> too, in silicon, not even in firmware.  What's their excuse for
> screwing up the hash?
> 
> Look, this is a complete joke from every angle, at least admit that
> fact.

I think it is fair to blame Microsoft for this as well. The vendors
follow what Msft tells them to do with NDIS spec. It looks like
IPV6 didn't make it in until the NDIS6.2 (Win7) spec.
jamal April 26, 2010, 9:11 p.m. UTC | #14
On Mon, 2010-04-26 at 13:40 -0700, David Miller wrote:

> Why do you think Eric Dumazet gives a crap about UDP scalability and
> is constantly testing it?  What about VOIP?  H.323, RTP, etc.?

This is big of course. All the SIP/RTP stuff is largely UDP. Most P2P
control is via UDP. I think i did read somewhere on measurement taken
in backbones showing consistent rise of UDP:TCP ratio (need to search
my bookmarks). 
It borders on lunacy to assume TCP only. Time to get an open source NIC
out there and ignore these big players? ;->

cheers,
jamal

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rick Jones April 26, 2010, 9:12 p.m. UTC | #15
David Miller wrote:
> From: Rick Jones <rick.jones2@hp.com>
> Date: Mon, 26 Apr 2010 13:48:22 -0700
> 
>>Do not confuse explanation with endorsement.
> 
> Ok, fair enough.
> 
> But I don't see even the "other perspective" argument being even
> valid.  Big shops still use UDP and it has to scale.

Preface - I too think it is massively stupid to ignore anything but TCP/IPv4, 
and unwise to ignore IPv6 and so on, but there is a very real reason why one of 
my email signatures reads:

"The road to hell is paved with business decisions"

> Or have they made multicast magically start working with TCP so
> they can us it to do trades on the NASDAQ?

No. How many NIC chips can NASDAQ be expected to move? 0.1%? or even 1% of the 
NIC chip market?

How many more NIC chips are in places where someone says "You sold me on 
iSCSI/FCoE/whatnot, why can't I get 'link-rate'  to/from iSCSI storage/whatnot?!"

The NIC designer is there with his finance guys breathing down his neck shouting 
"ROI Uber Alles!" and "Your budget is only this many monetary units!"  The 
system designers at the system vendors are hearing the same things from their 
own finance guys, have certain schedules, which then has them going to the NIC 
firms, who want to sell chips to the system guys "You have to be ready to ship 
by this date and your chip has to sell for no more than this."

Lather, rinse, repeat a few times and you get compromises on top of compromises.

Sometimes I think it is a wonder any of it actually works at all...

rick jones
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
jamal April 26, 2010, 9:14 p.m. UTC | #16
On Mon, 2010-04-26 at 17:11 -0400, jamal wrote:
> I think i did read somewhere on measurement taken
> in backbones showing consistent rise of UDP:TCP ratio (need to search
> my bookmarks). 

This one:
http://www.caida.org/research/traffic-analysis/tcpudpratio/

just read the conclusion.

cheers,
jamal


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Brian Bloniarz April 26, 2010, 11:27 p.m. UTC | #17
Rick Jones wrote:
> By and large, customers do not do anything "substantial" with UDP.  NFS
> went to TCP mounts 99 times out of 10 many years ago, leaving DNS as
> about the only thing left*. So, customers will not be chomping at the
> bit for improved UDP scalability/performance.  They would though, be
> jumping up and down demanding iSCSI performance and by implication all
> that comes along for the ride.

I work for a finance firm, and I've met a lot of guys who are
chomping at the bit for improved UDP performance. It's a
huge huge deal in our field. I'm nothing more than one customer
but I have a pretty negative reaction to this sentiment.

I'm also one of the people who occasionally bugs Eric Dumazet
about it :)

One saving grace for the financial multicast case: I think it's
pretty typical for the dest IP to be different for each flow
(different multicast groups). I confirmed on my multi-queue
bnx2 NIC (BCM5709) that those are spread out across the queues,
presumably the hash is similar.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Brian Bloniarz April 27, 2010, 1:37 p.m. UTC | #18
David Miller wrote:
> How damn hard is it to add two 16-bit ports to the hash regardless of
> protocol?
>   
Come to think of it, for UDP the hash must ignore
the srcport and srcaddr, because a single bound
socket is going to wildcard both those fields.

So the best we can hope for is for the hash to
include destport and destaddr? From looking at
my BCM5709's multiq behavior, I think broadcom's
hash includes the destaddr but not the destport.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet April 27, 2010, 2:30 p.m. UTC | #19
Le mardi 27 avril 2010 à 09:37 -0400, Brian Bloniarz a écrit :
> David Miller wrote:
> > How damn hard is it to add two 16-bit ports to the hash regardless of
> > protocol?
> >   
> Come to think of it, for UDP the hash must ignore
> the srcport and srcaddr, because a single bound
> socket is going to wildcard both those fields.
> 

For your application maybe ;)

Here, I have thousand of RTP flows to big mediagateways, so the
(srcaddr, dstaddr) is shared by all these flows.

Tom Herbert also wants a threaded DNS server.

For UDP, we could have a bitmap (system level ?) to say if a particular
destination port wants multi-cpu (RPS) spreading or not, even if the NIC
decided to use a single queue to submit frames to the host (ie mask the
src_addr and src_port). In this case, RFS on non conected UDP sockets
could be activated as well.

Or just a global sysctl to be able to mask the src_addr and/or src_port
in our software rxhash.




> So the best we can hope for is for the hash to
> include destport and destaddr? From looking at
> my BCM5709's multiq behavior, I think broadcom's
> hash includes the destaddr but not the destport.


Toepliz hash for UDP is a hash(src_addr, dst_addr)

Both src port and dst port are ignored.


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Brian Bloniarz April 27, 2010, 3:44 p.m. UTC | #20
Eric Dumazet wrote:
> Le mardi 27 avril 2010 à 09:37 -0400, Brian Bloniarz a écrit :
>> David Miller wrote:
>>> How damn hard is it to add two 16-bit ports to the hash regardless of
>>> protocol?
>>>   
>> Come to think of it, for UDP the hash must ignore
>> the srcport and srcaddr, because a single bound
>> socket is going to wildcard both those fields.
>>
> 
> For your application maybe ;)
> 
> Here, I have thousand of RTP flows to big mediagateways, so the
> (srcaddr, dstaddr) is shared by all these flows.
> 
> Tom Herbert also wants a threaded DNS server.
> 
> For UDP, we could have a bitmap (system level ?) to say if a particular
> destination port wants multi-cpu (RPS) spreading or not, even if the NIC
> decided to use a single queue to submit frames to the host (ie mask the
> src_addr and src_port). In this case, RFS on non conected UDP sockets
> could be activated as well.
> 
> Or just a global sysctl to be able to mask the src_addr and/or src_port
> in our software rxhash.

This is all good to know. Here are 3 other alternatives/suggestions:

1) Leave things as they are, if it really hurts that badly
for our workload we can just disable RPS entirely.
2) Add a global sysctl to disable RPS for datagram-based protocols.
3) Pluggable SW rxhashes :)

I haven't benchmarked anything for our workloads yet, so
I can't say for sure it's even a big issue.

Has anyone benchmarked RPS + single-threaded DNS servers? It'd
be a pessimization in that case, right? Even the multi-threaded
DNS server wouldn't be workable unless there was something like
the soreuseport patch you & Tom had been been discussing.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller April 27, 2010, 4:51 p.m. UTC | #21
From: Brian Bloniarz <bmb@athenacr.com>
Date: Tue, 27 Apr 2010 09:37:11 -0400

> David Miller wrote:
>> How damn hard is it to add two 16-bit ports to the hash regardless of
>> protocol?
>>   
> Come to think of it, for UDP the hash must ignore
> the srcport and srcaddr, because a single bound
> socket is going to wildcard both those fields.

For load distribution we don't care if the local socket is wildcard
bounded on source.

It's going to be fully specified in the packet, and that's enough.

Sure, for full RFS some amends might be necessary in this area, but
for RPS and adapter based hw steering, using all of the ports is
entirely sufficient.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Brian Bloniarz April 27, 2010, 5:02 p.m. UTC | #22
David Miller wrote:
> From: Brian Bloniarz <bmb@athenacr.com>
> Date: Tue, 27 Apr 2010 09:37:11 -0400
> 
>> David Miller wrote:
>>> How damn hard is it to add two 16-bit ports to the hash regardless of
>>> protocol?
>>>   
>> Come to think of it, for UDP the hash must ignore
>> the srcport and srcaddr, because a single bound
>> socket is going to wildcard both those fields.
> 
> For load distribution we don't care if the local socket is wildcard
> bounded on source.
> 
> It's going to be fully specified in the packet, and that's enough.

Maybe I'm misunderstanding... won't it distribute the
packet handling load to multiple cores, but then all
those cores will contend trying to deliver those packets
to the single socket?

I was assuming that this'd be a net loss over just doing
all the protocol handling on a single core. I haven't
done any benchmarks yet.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller April 27, 2010, 5:06 p.m. UTC | #23
From: Brian Bloniarz <bmb@athenacr.com>
Date: Tue, 27 Apr 2010 13:02:08 -0400

> Maybe I'm misunderstanding... won't it distribute the
> packet handling load to multiple cores, but then all
> those cores will contend trying to deliver those packets
> to the single socket?
> 
> I was assuming that this'd be a net loss over just doing
> all the protocol handling on a single core. I haven't
> done any benchmarks yet.

Whether it's a new loss depends upon the application.

Also, on the non-application side f.e. a router or firewall, this is
exactly the behavior you want.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet April 27, 2010, 5:13 p.m. UTC | #24
Le mardi 27 avril 2010 à 09:51 -0700, David Miller a écrit :
> From: Brian Bloniarz <bmb@athenacr.com>
> Date: Tue, 27 Apr 2010 09:37:11 -0400
> 
> > David Miller wrote:
> >> How damn hard is it to add two 16-bit ports to the hash regardless of
> >> protocol?
> >>   
> > Come to think of it, for UDP the hash must ignore
> > the srcport and srcaddr, because a single bound
> > socket is going to wildcard both those fields.
> 
> For load distribution we don't care if the local socket is wildcard
> bounded on source.
> 
> It's going to be fully specified in the packet, and that's enough.
> 
> Sure, for full RFS some amends might be necessary in this area, but
> for RPS and adapter based hw steering, using all of the ports is
> entirely sufficient.

Well well well...

I was doing some pktgen tests, with :

 pgset "src_min 192.168.0.10"
 pgset "src_max 192.168.0.110"
 pgset "dst_min 192.168.0.2"
 pgset "dst_max 192.168.0.2"
 pgset "udp_dst_min 4000"
 pgset "udp_dst_max 4000"

So I simulate 100 remote IPS bombarding a single port on target machine.
pktgen injects about 930.000 pps

sofirq of my target received on cpu0, and RPS spread packets to 7 other
cpus.

And my receiver is stuck (he can read about 50 pps !!!)


As soon as I disable rps, my receiver can catch 850.000 pps


RPS OFF: perf top of cpu 0

------------------------------------------------------------------------------------------------------------------------------
   PerfTop:    1001 irqs/sec  kernel:100.0% [1000Hz cycles],  (all, cpu: 0)
------------------------------------------------------------------------------------------------------------------------------

             samples  pcnt function               DSO
             _______ _____ ______________________ _______

              385.00 10.2% __udp4_lib_lookup      vmlinux
              322.00  8.5% ip_route_input         vmlinux
              312.00  8.3% sock_queue_rcv_skb     vmlinux
              262.00  6.9% do_raw_spin_lock       vmlinux
              251.00  6.6% __alloc_skb            vmlinux
              239.00  6.3% sock_put               vmlinux
              207.00  5.5% eth_type_trans         vmlinux
              202.00  5.4% __slab_alloc           vmlinux
              159.00  4.2% __kmalloc_track_caller vmlinux
              149.00  3.9% __sk_mem_schedule      vmlinux
              125.00  3.3% kmem_cache_alloc       vmlinux
              116.00  3.1% ipt_do_table           vmlinux
              115.00  3.0% do_raw_read_lock       vmlinux
               71.00  1.9% tg3_poll_work          vmlinux
               65.00  1.7% __netdev_alloc_skb     vmlinux
               64.00  1.7% skb_pull               vmlinux
               58.00  1.5% ip_rcv                 vmlinux
               58.00  1.5% __slab_free            vmlinux
               53.00  1.4% udp_queue_rcv_skb      vmlinux
               47.00  1.2% nf_iterate             vmlinux
               44.00  1.2% __netif_receive_skb    vmlinux
               29.00  0.8% sock_def_readable      vmlinux
               28.00  0.7% do_raw_spin_unlock     vmlinux
               26.00  0.7% kfree                  vmlinux
               25.00  0.7% __udp4_lib_rcv         vmlinux
               24.00  0.6% ip_rcv_finish          vmlinux
               24.00  0.6% __list_add             vmlinux


RPS, on, a perf top of a slave CPU :

------------------------------------------------------------------------------------------------------------------------------
   PerfTop:    1000 irqs/sec  kernel:100.0% [1000Hz cycles],  (all, cpu: 1)
------------------------------------------------------------------------------------------------------------------------------

             samples  pcnt function            DSO
             _______ _____ ___________________ _______

             2411.00 62.0% do_raw_spin_lock    vmlinux
              690.00 17.7% delay_tsc           vmlinux
              234.00  6.0% __udp4_lib_lookup   vmlinux
              174.00  4.5% sock_put            vmlinux
               72.00  1.9% ip_rcv              vmlinux
               51.00  1.3% __netif_receive_skb vmlinux
               43.00  1.1% do_raw_spin_unlock  vmlinux
               39.00  1.0% __delay             vmlinux
               38.00  1.0% sock_queue_rcv_skb  vmlinux
               36.00  0.9% udp_queue_rcv_skb   vmlinux
               31.00  0.8% ip_route_input      vmlinux
               15.00  0.4% __slab_free         vmlinux
               12.00  0.3% ipt_do_table        vmlinux
               11.00  0.3% skb_release_data    vmlinux
                7.00  0.2% kfree               vmlinux
                5.00  0.1% nf_iterate          vmlinux

So we have a BIG problem :

All cpus are fighting to get the socket lock,
and very litle progress is done.

Note this problem has nothing to do with RPS, we could have 
it with multiqueue as well.

Oh well...



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller April 27, 2010, 5:20 p.m. UTC | #25
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 27 Apr 2010 19:13:59 +0200

> So we have a BIG problem :
> 
> All cpus are fighting to get the socket lock,
> and very litle progress is done.
> 
> Note this problem has nothing to do with RPS, we could have 
> it with multiqueue as well.
> 
> Oh well...

Indeed, a huge issue, in that we haven't converted the UDP hash over
to RCU yet :-)

But because of the transient bind nature of UDP there are still a bunch
of cases that won't even cure.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tom Herbert April 27, 2010, 5:31 p.m. UTC | #26
> So we have a BIG problem :
>
> All cpus are fighting to get the socket lock,
> and very litle progress is done.
>
> Note this problem has nothing to do with RPS, we could have
> it with multiqueue as well.
>

This is the problem that we are addressing with so_reuseport!

> Oh well...
>
>
>
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet April 27, 2010, 5:36 p.m. UTC | #27
Le mardi 27 avril 2010 à 10:31 -0700, Tom Herbert a écrit :

> This is the problem that we are addressing with so_reuseport!

How standard applications are protected against a DDOS ?



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet April 27, 2010, 5:37 p.m. UTC | #28
Le mardi 27 avril 2010 à 10:20 -0700, David Miller a écrit :

> 
> Indeed, a huge issue, in that we haven't converted the UDP hash over
> to RCU yet :-)
> 

I am not sure what you mean, UDP hash _is_ RCU converted ;)

> But because of the transient bind nature of UDP there are still a bunch
> of cases that won't even cure.
> --

We might use the ticket spinlock paradigm to let writers go in parallel
and let the user the socket lock

Instead of having the bh_lock_sock() to protect receive_queue *and*
backlog, writers get a unique slot in a table, that 'user' can handle
later.

Or serialize writers (before they try to bh_lock_sock()) with a
dedicated lock, so that user has 50% chances to get the sock lock,
contending with at most one writer.



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eilon Greenstein April 27, 2010, 6:31 p.m. UTC | #29
On Mon, 2010-04-26 at 14:12 -0700, Rick Jones wrote:
> David Miller wrote:
> > From: Rick Jones <rick.jones2@hp.com>
> > Date: Mon, 26 Apr 2010 13:48:22 -0700
> > 
> >>Do not confuse explanation with endorsement.
> > 
> > Ok, fair enough.
> > 
> > But I don't see even the "other perspective" argument being even
> > valid.  Big shops still use UDP and it has to scale.
> 
> Preface - I too think it is massively stupid to ignore anything but TCP/IPv4, 
> and unwise to ignore IPv6 and so on, but there is a very real reason why one of 
> my email signatures reads:
> 
> "The road to hell is paved with business decisions"
> 
> > Or have they made multicast magically start working with TCP so
> > they can us it to do trades on the NASDAQ?
> 
> No. How many NIC chips can NASDAQ be expected to move? 0.1%? or even 1% of the 
> NIC chip market?
> 
> How many more NIC chips are in places where someone says "You sold me on 
> iSCSI/FCoE/whatnot, why can't I get 'link-rate'  to/from iSCSI storage/whatnot?!"
> 
> The NIC designer is there with his finance guys breathing down his neck shouting 
> "ROI Uber Alles!" and "Your budget is only this many monetary units!"  The 
> system designers at the system vendors are hearing the same things from their 
> own finance guys, have certain schedules, which then has them going to the NIC 
> firms, who want to sell chips to the system guys "You have to be ready to ship 
> by this date and your chip has to sell for no more than this."
> 
> Lather, rinse, repeat a few times and you get compromises on top of compromises.
> 
> Sometimes I think it is a wonder any of it actually works at all...
> 
> rick jones

Though the thread is going in a different direction now, I just wanted
to clarify two things:
- yes, the 57710 and 57711 only handle the IP (src+dst) for UDP toeplitz
hash. We all agree that it is much better to address the UDP ports as
well, but I think Rick Jones explained the process very well - thank you
Rick. Just to add one more (lame) excuse: the HW was designed before new
NAPI was introduced and it complies with the requirements from Redmond
- the next generation (57712) which we already sample does (finally)
support it. We are working on a patch series to enhance the bnx2x to
support this device now.

Eilon



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet April 27, 2010, 7:30 p.m. UTC | #30
Le mardi 27 avril 2010 à 21:31 +0300, Eilon Greenstein a écrit :

> Though the thread is going in a different direction now, I just wanted
> to clarify two things:
> - yes, the 57710 and 57711 only handle the IP (src+dst) for UDP toeplitz
> hash. We all agree that it is much better to address the UDP ports as
> well, but I think Rick Jones explained the process very well - thank you
> Rick. Just to add one more (lame) excuse: the HW was designed before new
> NAPI was introduced and it complies with the requirements from Redmond
> - the next generation (57712) which we already sample does (finally)
> support it. We are working on a patch series to enhance the bnx2x to
> support this device now.
> 

Thanks Eilon !


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vladislav Zolotarov July 4, 2010, 4:36 p.m. UTC | #31
Tom, could u, pls., explain what did u mean by taking the (RSS) flags configuration out of RSS "if"? To recall "if(is_multi(bp))" is true iff RSS is enabled.

Thanks,
vlad

> @@ -5750,10 +5757,10 @@ static void bnx2x_init_internal_func(struct bnx2x
> *bp)
>  	u32 offset;
>  	u16 max_agg_size;
> 
> -	if (is_multi(bp)) {
> -		tstorm_config.config_flags = MULTI_FLAGS(bp);
> +	tstorm_config.config_flags = RSS_FLAGS(bp);
> +
> +	if (is_multi(bp))
>  		tstorm_config.rss_result_mask = MULTI_MASK;
> -	}
> 
>  	/* Enable TPA if needed */
>  	if (bp->flags & TPA_ENABLE_FLAG)


> -----Original Message-----
> From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org] On
> Behalf Of Tom Herbert
> Sent: Friday, April 23, 2010 8:54 AM
> To: davem@davemloft.net; netdev@vger.kernel.org
> Subject: [PATCH] bnx2x: add support for receive hashing
> 
> Add support to bnx2x to extract Toeplitz hash out of the receive descriptor
> for use in skb->rxhash.
> 
> Signed-off-by: Tom Herbert <therbert@google.com>
> ---
> diff --git a/drivers/net/bnx2x.h b/drivers/net/bnx2x.h
> index 0819530..8bd2368 100644
> --- a/drivers/net/bnx2x.h
> +++ b/drivers/net/bnx2x.h
> @@ -1330,7 +1330,7 @@ static inline u32 reg_poll(struct bnx2x *bp, u32 reg,
> u32 expected, int ms,
>  		AEU_INPUTS_ATTN_BITS_MCP_LATCHED_UMP_TX_PARITY | \
>  		AEU_INPUTS_ATTN_BITS_MCP_LATCHED_SCPAD_PARITY)
> 
> -#define MULTI_FLAGS(bp) \
> +#define RSS_FLAGS(bp) \
>  		(TSTORM_ETH_FUNCTION_COMMON_CONFIG_RSS_IPV4_CAPABILITY | \
>  		 TSTORM_ETH_FUNCTION_COMMON_CONFIG_RSS_IPV4_TCP_CAPABILITY | \
>  		 TSTORM_ETH_FUNCTION_COMMON_CONFIG_RSS_IPV6_CAPABILITY | \
> diff --git a/drivers/net/bnx2x_main.c b/drivers/net/bnx2x_main.c
> index 0c6dba2..613f727 100644
> --- a/drivers/net/bnx2x_main.c
> +++ b/drivers/net/bnx2x_main.c
> @@ -1582,7 +1582,7 @@ static int bnx2x_rx_int(struct bnx2x_fastpath *fp, int
> budget)
>  		struct sw_rx_bd *rx_buf = NULL;
>  		struct sk_buff *skb;
>  		union eth_rx_cqe *cqe;
> -		u8 cqe_fp_flags;
> +		u8 cqe_fp_flags, cqe_fp_status_flags;
>  		u16 len, pad;
> 
>  		comp_ring_cons = RCQ_BD(sw_comp_cons);
> @@ -1598,6 +1598,7 @@ static int bnx2x_rx_int(struct bnx2x_fastpath *fp, int
> budget)
> 
>  		cqe = &fp->rx_comp_ring[comp_ring_cons];
>  		cqe_fp_flags = cqe->fast_path_cqe.type_error_flags;
> +		cqe_fp_status_flags = cqe->fast_path_cqe.status_flags;
> 
>  		DP(NETIF_MSG_RX_STATUS, "CQE type %x  err %x  status %x"
>  		   "  queue %x  vlan %x  len %u\n", CQE_TYPE(cqe_fp_flags),
> @@ -1727,6 +1728,12 @@ reuse_rx:
> 
>  			skb->protocol = eth_type_trans(skb, bp->dev);
> 
> +			if ((bp->dev->features & ETH_FLAG_RXHASH) &&
> +			    (cqe_fp_status_flags &
> +			     ETH_FAST_PATH_RX_CQE_RSS_HASH_FLG))
> +				skb->rxhash = le32_to_cpu(
> +				    cqe->fast_path_cqe.rss_hash_result);
> +
>  			skb->ip_summed = CHECKSUM_NONE;
>  			if (bp->rx_csum) {
>  				if (likely(BNX2X_RX_CSUM_OK(cqe)))
> @@ -5750,10 +5757,10 @@ static void bnx2x_init_internal_func(struct bnx2x
> *bp)
>  	u32 offset;
>  	u16 max_agg_size;
> 
> -	if (is_multi(bp)) {
> -		tstorm_config.config_flags = MULTI_FLAGS(bp);
> +	tstorm_config.config_flags = RSS_FLAGS(bp);
> +
> +	if (is_multi(bp))
>  		tstorm_config.rss_result_mask = MULTI_MASK;
> -	}
> 
>  	/* Enable TPA if needed */
>  	if (bp->flags & TPA_ENABLE_FLAG)
> @@ -6629,10 +6636,8 @@ static int bnx2x_init_common(struct bnx2x *bp)
>  	bnx2x_init_block(bp, PBF_BLOCK, COMMON_STAGE);
> 
>  	REG_WR(bp, SRC_REG_SOFT_RST, 1);
> -	for (i = SRC_REG_KEYRSS0_0; i <= SRC_REG_KEYRSS1_9; i += 4) {
> -		REG_WR(bp, i, 0xc0cac01a);
> -		/* TODO: replace with something meaningful */
> -	}
> +	for (i = SRC_REG_KEYRSS0_0; i <= SRC_REG_KEYRSS1_9; i += 4)
> +		REG_WR(bp, i, random32());
>  	bnx2x_init_block(bp, SRCH_BLOCK, COMMON_STAGE);
>  #ifdef BCM_CNIC
>  	REG_WR(bp, SRC_REG_KEYSEARCH_0, 0x63285672);
> @@ -11001,6 +11006,11 @@ static int bnx2x_set_flags(struct net_device *dev,
> u32 data)
>  		changed = 1;
>  	}
> 
> +	if (data & ETH_FLAG_RXHASH)
> +		dev->features |= NETIF_F_RXHASH;
> +	else
> +		dev->features &= ~NETIF_F_RXHASH;
> +
>  	if (changed && netif_running(dev)) {
>  		bnx2x_nic_unload(bp, UNLOAD_NORMAL);
>  		rc = bnx2x_nic_load(bp, LOAD_NORMAL);
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vladislav Zolotarov July 4, 2010, 4:46 p.m. UTC | #32
Is there any reason not to fill skb->rxhash for LROed packets?

Thanks,
vlad

> -----Original Message-----
> From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org] On
> Behalf Of Vladislav Zolotarov
> Sent: Sunday, July 04, 2010 7:36 PM
> To: Tom Herbert
> Cc: netdev@vger.kernel.org
> Subject: RE: [PATCH] bnx2x: add support for receive hashing
> 
> Tom, could u, pls., explain what did u mean by taking the (RSS) flags
> configuration out of RSS "if"? To recall "if(is_multi(bp))" is true iff RSS
> is enabled.
> 
> Thanks,
> vlad
> 
> > @@ -5750,10 +5757,10 @@ static void bnx2x_init_internal_func(struct bnx2x
> > *bp)
> >  	u32 offset;
> >  	u16 max_agg_size;
> >
> > -	if (is_multi(bp)) {
> > -		tstorm_config.config_flags = MULTI_FLAGS(bp);
> > +	tstorm_config.config_flags = RSS_FLAGS(bp);
> > +
> > +	if (is_multi(bp))
> >  		tstorm_config.rss_result_mask = MULTI_MASK;
> > -	}
> >
> >  	/* Enable TPA if needed */
> >  	if (bp->flags & TPA_ENABLE_FLAG)
> 
> 
> > -----Original Message-----
> > From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org] On
> > Behalf Of Tom Herbert
> > Sent: Friday, April 23, 2010 8:54 AM
> > To: davem@davemloft.net; netdev@vger.kernel.org
> > Subject: [PATCH] bnx2x: add support for receive hashing
> >
> > Add support to bnx2x to extract Toeplitz hash out of the receive descriptor
> > for use in skb->rxhash.
> >
> > Signed-off-by: Tom Herbert <therbert@google.com>
> > ---
> > diff --git a/drivers/net/bnx2x.h b/drivers/net/bnx2x.h
> > index 0819530..8bd2368 100644
> > --- a/drivers/net/bnx2x.h
> > +++ b/drivers/net/bnx2x.h
> > @@ -1330,7 +1330,7 @@ static inline u32 reg_poll(struct bnx2x *bp, u32 reg,
> > u32 expected, int ms,
> >  		AEU_INPUTS_ATTN_BITS_MCP_LATCHED_UMP_TX_PARITY | \
> >  		AEU_INPUTS_ATTN_BITS_MCP_LATCHED_SCPAD_PARITY)
> >
> > -#define MULTI_FLAGS(bp) \
> > +#define RSS_FLAGS(bp) \
> >  		(TSTORM_ETH_FUNCTION_COMMON_CONFIG_RSS_IPV4_CAPABILITY | \
> >  		 TSTORM_ETH_FUNCTION_COMMON_CONFIG_RSS_IPV4_TCP_CAPABILITY | \
> >  		 TSTORM_ETH_FUNCTION_COMMON_CONFIG_RSS_IPV6_CAPABILITY | \
> > diff --git a/drivers/net/bnx2x_main.c b/drivers/net/bnx2x_main.c
> > index 0c6dba2..613f727 100644
> > --- a/drivers/net/bnx2x_main.c
> > +++ b/drivers/net/bnx2x_main.c
> > @@ -1582,7 +1582,7 @@ static int bnx2x_rx_int(struct bnx2x_fastpath *fp,
> int
> > budget)
> >  		struct sw_rx_bd *rx_buf = NULL;
> >  		struct sk_buff *skb;
> >  		union eth_rx_cqe *cqe;
> > -		u8 cqe_fp_flags;
> > +		u8 cqe_fp_flags, cqe_fp_status_flags;
> >  		u16 len, pad;
> >
> >  		comp_ring_cons = RCQ_BD(sw_comp_cons);
> > @@ -1598,6 +1598,7 @@ static int bnx2x_rx_int(struct bnx2x_fastpath *fp,
> int
> > budget)
> >
> >  		cqe = &fp->rx_comp_ring[comp_ring_cons];
> >  		cqe_fp_flags = cqe->fast_path_cqe.type_error_flags;
> > +		cqe_fp_status_flags = cqe->fast_path_cqe.status_flags;
> >
> >  		DP(NETIF_MSG_RX_STATUS, "CQE type %x  err %x  status %x"
> >  		   "  queue %x  vlan %x  len %u\n", CQE_TYPE(cqe_fp_flags),
> > @@ -1727,6 +1728,12 @@ reuse_rx:
> >
> >  			skb->protocol = eth_type_trans(skb, bp->dev);
> >
> > +			if ((bp->dev->features & ETH_FLAG_RXHASH) &&
> > +			    (cqe_fp_status_flags &
> > +			     ETH_FAST_PATH_RX_CQE_RSS_HASH_FLG))
> > +				skb->rxhash = le32_to_cpu(
> > +				    cqe->fast_path_cqe.rss_hash_result);
> > +
> >  			skb->ip_summed = CHECKSUM_NONE;
> >  			if (bp->rx_csum) {
> >  				if (likely(BNX2X_RX_CSUM_OK(cqe)))
> > @@ -5750,10 +5757,10 @@ static void bnx2x_init_internal_func(struct bnx2x
> > *bp)
> >  	u32 offset;
> >  	u16 max_agg_size;
> >
> > -	if (is_multi(bp)) {
> > -		tstorm_config.config_flags = MULTI_FLAGS(bp);
> > +	tstorm_config.config_flags = RSS_FLAGS(bp);
> > +
> > +	if (is_multi(bp))
> >  		tstorm_config.rss_result_mask = MULTI_MASK;
> > -	}
> >
> >  	/* Enable TPA if needed */
> >  	if (bp->flags & TPA_ENABLE_FLAG)
> > @@ -6629,10 +6636,8 @@ static int bnx2x_init_common(struct bnx2x *bp)
> >  	bnx2x_init_block(bp, PBF_BLOCK, COMMON_STAGE);
> >
> >  	REG_WR(bp, SRC_REG_SOFT_RST, 1);
> > -	for (i = SRC_REG_KEYRSS0_0; i <= SRC_REG_KEYRSS1_9; i += 4) {
> > -		REG_WR(bp, i, 0xc0cac01a);
> > -		/* TODO: replace with something meaningful */
> > -	}
> > +	for (i = SRC_REG_KEYRSS0_0; i <= SRC_REG_KEYRSS1_9; i += 4)
> > +		REG_WR(bp, i, random32());
> >  	bnx2x_init_block(bp, SRCH_BLOCK, COMMON_STAGE);
> >  #ifdef BCM_CNIC
> >  	REG_WR(bp, SRC_REG_KEYSEARCH_0, 0x63285672);
> > @@ -11001,6 +11006,11 @@ static int bnx2x_set_flags(struct net_device *dev,
> > u32 data)
> >  		changed = 1;
> >  	}
> >
> > +	if (data & ETH_FLAG_RXHASH)
> > +		dev->features |= NETIF_F_RXHASH;
> > +	else
> > +		dev->features &= ~NETIF_F_RXHASH;
> > +
> >  	if (changed && netif_running(dev)) {
> >  		bnx2x_nic_unload(bp, UNLOAD_NORMAL);
> >  		rc = bnx2x_nic_load(bp, LOAD_NORMAL);
> > --
> > To unsubscribe from this list: send the line "unsubscribe netdev" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vladislav Zolotarov July 6, 2010, 7:16 a.m. UTC | #33
Let me rephrase: the part of your patch below enables RSS flow in our FW even if there is only one HW queue and I wonder why? 
To refresh:
	1) FW won't provide Toeplitz hash on CQE if RSS is not enabled.
	2) There can be one HW queue in only 2 cases:
		- There is only one CPU in the system. In that case I wonder if u have anything to do with Toeplitz hash on the skb at all.
		- User has explicitly requested 1 HW queue with module parameter (num_queues=1). In that case if u r going to use the RSS hash on the skb means that u r actually going to do SW RSS instead of HW RSS, which sounds strange to me.

So, Herbert, could u, pls., explain, what was your original idea about these code lines?

Thanks,
vlad

> -----Original Message-----
> From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org] On
> Behalf Of Vladislav Zolotarov
> Sent: Sunday, July 04, 2010 7:36 PM
> To: Tom Herbert
> Cc: netdev@vger.kernel.org
> Subject: RE: [PATCH] bnx2x: add support for receive hashing
> 
> Tom, could u, pls., explain what did u mean by taking the (RSS) flags
> configuration out of RSS "if"? To recall "if(is_multi(bp))" is true iff RSS
> is enabled.
> 
> Thanks,
> vlad
> 
> > @@ -5750,10 +5757,10 @@ static void bnx2x_init_internal_func(struct bnx2x
> > *bp)
> >  	u32 offset;
> >  	u16 max_agg_size;
> >
> > -	if (is_multi(bp)) {
> > -		tstorm_config.config_flags = MULTI_FLAGS(bp);
> > +	tstorm_config.config_flags = RSS_FLAGS(bp);
> > +
> > +	if (is_multi(bp))
> >  		tstorm_config.rss_result_mask = MULTI_MASK;
> > -	}
> >
> >  	/* Enable TPA if needed */
> >  	if (bp->flags & TPA_ENABLE_FLAG)
> 
> 
> > -----Original Message-----
> > From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org] On
> > Behalf Of Tom Herbert
> > Sent: Friday, April 23, 2010 8:54 AM
> > To: davem@davemloft.net; netdev@vger.kernel.org
> > Subject: [PATCH] bnx2x: add support for receive hashing
> >
> > Add support to bnx2x to extract Toeplitz hash out of the receive descriptor
> > for use in skb->rxhash.
> >
> > Signed-off-by: Tom Herbert <therbert@google.com>
> > ---
> > diff --git a/drivers/net/bnx2x.h b/drivers/net/bnx2x.h
> > index 0819530..8bd2368 100644
> > --- a/drivers/net/bnx2x.h
> > +++ b/drivers/net/bnx2x.h
> > @@ -1330,7 +1330,7 @@ static inline u32 reg_poll(struct bnx2x *bp, u32 reg,
> > u32 expected, int ms,
> >  		AEU_INPUTS_ATTN_BITS_MCP_LATCHED_UMP_TX_PARITY | \
> >  		AEU_INPUTS_ATTN_BITS_MCP_LATCHED_SCPAD_PARITY)
> >
> > -#define MULTI_FLAGS(bp) \
> > +#define RSS_FLAGS(bp) \
> >  		(TSTORM_ETH_FUNCTION_COMMON_CONFIG_RSS_IPV4_CAPABILITY | \
> >  		 TSTORM_ETH_FUNCTION_COMMON_CONFIG_RSS_IPV4_TCP_CAPABILITY | \
> >  		 TSTORM_ETH_FUNCTION_COMMON_CONFIG_RSS_IPV6_CAPABILITY | \
> > diff --git a/drivers/net/bnx2x_main.c b/drivers/net/bnx2x_main.c
> > index 0c6dba2..613f727 100644
> > --- a/drivers/net/bnx2x_main.c
> > +++ b/drivers/net/bnx2x_main.c
> > @@ -1582,7 +1582,7 @@ static int bnx2x_rx_int(struct bnx2x_fastpath *fp,
> int
> > budget)
> >  		struct sw_rx_bd *rx_buf = NULL;
> >  		struct sk_buff *skb;
> >  		union eth_rx_cqe *cqe;
> > -		u8 cqe_fp_flags;
> > +		u8 cqe_fp_flags, cqe_fp_status_flags;
> >  		u16 len, pad;
> >
> >  		comp_ring_cons = RCQ_BD(sw_comp_cons);
> > @@ -1598,6 +1598,7 @@ static int bnx2x_rx_int(struct bnx2x_fastpath *fp,
> int
> > budget)
> >
> >  		cqe = &fp->rx_comp_ring[comp_ring_cons];
> >  		cqe_fp_flags = cqe->fast_path_cqe.type_error_flags;
> > +		cqe_fp_status_flags = cqe->fast_path_cqe.status_flags;
> >
> >  		DP(NETIF_MSG_RX_STATUS, "CQE type %x  err %x  status %x"
> >  		   "  queue %x  vlan %x  len %u\n", CQE_TYPE(cqe_fp_flags),
> > @@ -1727,6 +1728,12 @@ reuse_rx:
> >
> >  			skb->protocol = eth_type_trans(skb, bp->dev);
> >
> > +			if ((bp->dev->features & ETH_FLAG_RXHASH) &&
> > +			    (cqe_fp_status_flags &
> > +			     ETH_FAST_PATH_RX_CQE_RSS_HASH_FLG))
> > +				skb->rxhash = le32_to_cpu(
> > +				    cqe->fast_path_cqe.rss_hash_result);
> > +
> >  			skb->ip_summed = CHECKSUM_NONE;
> >  			if (bp->rx_csum) {
> >  				if (likely(BNX2X_RX_CSUM_OK(cqe)))
> > @@ -5750,10 +5757,10 @@ static void bnx2x_init_internal_func(struct bnx2x
> > *bp)
> >  	u32 offset;
> >  	u16 max_agg_size;
> >
> > -	if (is_multi(bp)) {
> > -		tstorm_config.config_flags = MULTI_FLAGS(bp);
> > +	tstorm_config.config_flags = RSS_FLAGS(bp);
> > +
> > +	if (is_multi(bp))
> >  		tstorm_config.rss_result_mask = MULTI_MASK;
> > -	}
> >
> >  	/* Enable TPA if needed */
> >  	if (bp->flags & TPA_ENABLE_FLAG)
> > @@ -6629,10 +6636,8 @@ static int bnx2x_init_common(struct bnx2x *bp)
> >  	bnx2x_init_block(bp, PBF_BLOCK, COMMON_STAGE);
> >
> >  	REG_WR(bp, SRC_REG_SOFT_RST, 1);
> > -	for (i = SRC_REG_KEYRSS0_0; i <= SRC_REG_KEYRSS1_9; i += 4) {
> > -		REG_WR(bp, i, 0xc0cac01a);
> > -		/* TODO: replace with something meaningful */
> > -	}
> > +	for (i = SRC_REG_KEYRSS0_0; i <= SRC_REG_KEYRSS1_9; i += 4)
> > +		REG_WR(bp, i, random32());
> >  	bnx2x_init_block(bp, SRCH_BLOCK, COMMON_STAGE);
> >  #ifdef BCM_CNIC
> >  	REG_WR(bp, SRC_REG_KEYSEARCH_0, 0x63285672);
> > @@ -11001,6 +11006,11 @@ static int bnx2x_set_flags(struct net_device *dev,
> > u32 data)
> >  		changed = 1;
> >  	}
> >
> > +	if (data & ETH_FLAG_RXHASH)
> > +		dev->features |= NETIF_F_RXHASH;
> > +	else
> > +		dev->features &= ~NETIF_F_RXHASH;
> > +
> >  	if (changed && netif_running(dev)) {
> >  		bnx2x_nic_unload(bp, UNLOAD_NORMAL);
> >  		rc = bnx2x_nic_load(bp, LOAD_NORMAL);
> > --
> > To unsubscribe from this list: send the line "unsubscribe netdev" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tom Herbert July 7, 2010, 7:17 p.m. UTC | #34
On Tue, Jul 6, 2010 at 12:16 AM, Vladislav Zolotarov <vladz@broadcom.com> wrote:
> Let me rephrase: the part of your patch below enables RSS flow in our FW even if there is only one HW queue and I wonder why?
> To refresh:
>        1) FW won't provide Toeplitz hash on CQE if RSS is not enabled.
>        2) There can be one HW queue in only 2 cases:
>                - There is only one CPU in the system. In that case I wonder if u have anything to do with Toeplitz hash on the skb at all.
>                - User has explicitly requested 1 HW queue with module parameter (num_queues=1). In that case if u r going to use the RSS hash on the skb means that u r actually going to do SW RSS instead of HW RSS, which sounds strange to me.

It might not be so strange.  Previously, we has hit a firmware bug in
bnx2x that was make multi-queue not perform well under some loads, so
we had disabled it for a while... it is a valid configuration we have
used.

>
> So, Herbert, could u, pls., explain, what was your original idea about these code lines?
>

It is to enable the device to provide the RSS hash in RX descriptor.
The hash severs two purposes now, it's used internally in the device
to perform RSS table lookup and also value in RX descriptor.  The
latter does not require multi-queue.  Strictly speaking, on a single
processor system without multqueue, it would be true that enabling the
RX hash on bnx2x is currently superfluous (notwithstanding some other
use of the hash might be implemented).

> Thanks,
> vlad
>
>> -----Original Message-----
>> From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org] On
>> Behalf Of Vladislav Zolotarov
>> Sent: Sunday, July 04, 2010 7:36 PM
>> To: Tom Herbert
>> Cc: netdev@vger.kernel.org
>> Subject: RE: [PATCH] bnx2x: add support for receive hashing
>>
>> Tom, could u, pls., explain what did u mean by taking the (RSS) flags
>> configuration out of RSS "if"? To recall "if(is_multi(bp))" is true iff RSS
>> is enabled.
>>
>> Thanks,
>> vlad
>>
>> > @@ -5750,10 +5757,10 @@ static void bnx2x_init_internal_func(struct bnx2x
>> > *bp)
>> >     u32 offset;
>> >     u16 max_agg_size;
>> >
>> > -   if (is_multi(bp)) {
>> > -           tstorm_config.config_flags = MULTI_FLAGS(bp);
>> > +   tstorm_config.config_flags = RSS_FLAGS(bp);
>> > +
>> > +   if (is_multi(bp))
>> >             tstorm_config.rss_result_mask = MULTI_MASK;
>> > -   }
>> >
>> >     /* Enable TPA if needed */
>> >     if (bp->flags & TPA_ENABLE_FLAG)
>>
>>
>> > -----Original Message-----
>> > From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org] On
>> > Behalf Of Tom Herbert
>> > Sent: Friday, April 23, 2010 8:54 AM
>> > To: davem@davemloft.net; netdev@vger.kernel.org
>> > Subject: [PATCH] bnx2x: add support for receive hashing
>> >
>> > Add support to bnx2x to extract Toeplitz hash out of the receive descriptor
>> > for use in skb->rxhash.
>> >
>> > Signed-off-by: Tom Herbert <therbert@google.com>
>> > ---
>> > diff --git a/drivers/net/bnx2x.h b/drivers/net/bnx2x.h
>> > index 0819530..8bd2368 100644
>> > --- a/drivers/net/bnx2x.h
>> > +++ b/drivers/net/bnx2x.h
>> > @@ -1330,7 +1330,7 @@ static inline u32 reg_poll(struct bnx2x *bp, u32 reg,
>> > u32 expected, int ms,
>> >             AEU_INPUTS_ATTN_BITS_MCP_LATCHED_UMP_TX_PARITY | \
>> >             AEU_INPUTS_ATTN_BITS_MCP_LATCHED_SCPAD_PARITY)
>> >
>> > -#define MULTI_FLAGS(bp) \
>> > +#define RSS_FLAGS(bp) \
>> >             (TSTORM_ETH_FUNCTION_COMMON_CONFIG_RSS_IPV4_CAPABILITY | \
>> >              TSTORM_ETH_FUNCTION_COMMON_CONFIG_RSS_IPV4_TCP_CAPABILITY | \
>> >              TSTORM_ETH_FUNCTION_COMMON_CONFIG_RSS_IPV6_CAPABILITY | \
>> > diff --git a/drivers/net/bnx2x_main.c b/drivers/net/bnx2x_main.c
>> > index 0c6dba2..613f727 100644
>> > --- a/drivers/net/bnx2x_main.c
>> > +++ b/drivers/net/bnx2x_main.c
>> > @@ -1582,7 +1582,7 @@ static int bnx2x_rx_int(struct bnx2x_fastpath *fp,
>> int
>> > budget)
>> >             struct sw_rx_bd *rx_buf = NULL;
>> >             struct sk_buff *skb;
>> >             union eth_rx_cqe *cqe;
>> > -           u8 cqe_fp_flags;
>> > +           u8 cqe_fp_flags, cqe_fp_status_flags;
>> >             u16 len, pad;
>> >
>> >             comp_ring_cons = RCQ_BD(sw_comp_cons);
>> > @@ -1598,6 +1598,7 @@ static int bnx2x_rx_int(struct bnx2x_fastpath *fp,
>> int
>> > budget)
>> >
>> >             cqe = &fp->rx_comp_ring[comp_ring_cons];
>> >             cqe_fp_flags = cqe->fast_path_cqe.type_error_flags;
>> > +           cqe_fp_status_flags = cqe->fast_path_cqe.status_flags;
>> >
>> >             DP(NETIF_MSG_RX_STATUS, "CQE type %x  err %x  status %x"
>> >                "  queue %x  vlan %x  len %u\n", CQE_TYPE(cqe_fp_flags),
>> > @@ -1727,6 +1728,12 @@ reuse_rx:
>> >
>> >                     skb->protocol = eth_type_trans(skb, bp->dev);
>> >
>> > +                   if ((bp->dev->features & ETH_FLAG_RXHASH) &&
>> > +                       (cqe_fp_status_flags &
>> > +                        ETH_FAST_PATH_RX_CQE_RSS_HASH_FLG))
>> > +                           skb->rxhash = le32_to_cpu(
>> > +                               cqe->fast_path_cqe.rss_hash_result);
>> > +
>> >                     skb->ip_summed = CHECKSUM_NONE;
>> >                     if (bp->rx_csum) {
>> >                             if (likely(BNX2X_RX_CSUM_OK(cqe)))
>> > @@ -5750,10 +5757,10 @@ static void bnx2x_init_internal_func(struct bnx2x
>> > *bp)
>> >     u32 offset;
>> >     u16 max_agg_size;
>> >
>> > -   if (is_multi(bp)) {
>> > -           tstorm_config.config_flags = MULTI_FLAGS(bp);
>> > +   tstorm_config.config_flags = RSS_FLAGS(bp);
>> > +
>> > +   if (is_multi(bp))
>> >             tstorm_config.rss_result_mask = MULTI_MASK;
>> > -   }
>> >
>> >     /* Enable TPA if needed */
>> >     if (bp->flags & TPA_ENABLE_FLAG)
>> > @@ -6629,10 +6636,8 @@ static int bnx2x_init_common(struct bnx2x *bp)
>> >     bnx2x_init_block(bp, PBF_BLOCK, COMMON_STAGE);
>> >
>> >     REG_WR(bp, SRC_REG_SOFT_RST, 1);
>> > -   for (i = SRC_REG_KEYRSS0_0; i <= SRC_REG_KEYRSS1_9; i += 4) {
>> > -           REG_WR(bp, i, 0xc0cac01a);
>> > -           /* TODO: replace with something meaningful */
>> > -   }
>> > +   for (i = SRC_REG_KEYRSS0_0; i <= SRC_REG_KEYRSS1_9; i += 4)
>> > +           REG_WR(bp, i, random32());
>> >     bnx2x_init_block(bp, SRCH_BLOCK, COMMON_STAGE);
>> >  #ifdef BCM_CNIC
>> >     REG_WR(bp, SRC_REG_KEYSEARCH_0, 0x63285672);
>> > @@ -11001,6 +11006,11 @@ static int bnx2x_set_flags(struct net_device *dev,
>> > u32 data)
>> >             changed = 1;
>> >     }
>> >
>> > +   if (data & ETH_FLAG_RXHASH)
>> > +           dev->features |= NETIF_F_RXHASH;
>> > +   else
>> > +           dev->features &= ~NETIF_F_RXHASH;
>> > +
>> >     if (changed && netif_running(dev)) {
>> >             bnx2x_nic_unload(bp, UNLOAD_NORMAL);
>> >             rc = bnx2x_nic_load(bp, LOAD_NORMAL);
>> > --
>> > To unsubscribe from this list: send the line "unsubscribe netdev" in
>> > the body of a message to majordomo@vger.kernel.org
>> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe netdev" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
>
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vladislav Zolotarov July 8, 2010, 8:40 a.m. UTC | #35
> -----Original Message-----
> From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org] On
> Behalf Of Tom Herbert
> Sent: Wednesday, July 07, 2010 10:18 PM
> To: Vladislav Zolotarov
> Cc: netdev@vger.kernel.org
> Subject: Re: [PATCH] bnx2x: add support for receive hashing
> 
> On Tue, Jul 6, 2010 at 12:16 AM, Vladislav Zolotarov <vladz@broadcom.com>
> wrote:
> > Let me rephrase: the part of your patch below enables RSS flow in our FW
> even if there is only one HW queue and I wonder why?
> > To refresh:
> >        1) FW won't provide Toeplitz hash on CQE if RSS is not enabled.
> >        2) There can be one HW queue in only 2 cases:
> >                - There is only one CPU in the system. In that case I wonder
> if u have anything to do with Toeplitz hash on the skb at all.
> >                - User has explicitly requested 1 HW queue with module
> parameter (num_queues=1). In that case if u r going to use the RSS hash on
> the skb means that u r actually going to do SW RSS instead of HW RSS, which
> sounds strange to me.
> 
> It might not be so strange.  Previously, we has hit a firmware bug in
> bnx2x that was make multi-queue not perform well under some loads, so
> we had disabled it for a while... it is a valid configuration we have
> used.

Tom, according to our FW guys there are no known RSS issues since FW 5.0.x versions. Currently upstream has 5.2.13 FW so, if u know about any issue with the current upstream bnx2x FW, pls., let us know ASAP. Pls., include the relevant performance data. We would also like to know if u see that the FW bug u've mentioned has been resolved in the current upstream bnx2x FW so that we could remove these lines unless u have any other reason to leave them.

Thanks,
vlad

> 
> >
> > So, Herbert, could u, pls., explain, what was your original idea about
> these code lines?
> >
> 
> It is to enable the device to provide the RSS hash in RX descriptor.
> The hash severs two purposes now, it's used internally in the device
> to perform RSS table lookup and also value in RX descriptor.  The
> latter does not require multi-queue.  Strictly speaking, on a single
> processor system without multqueue, it would be true that enabling the
> RX hash on bnx2x is currently superfluous (notwithstanding some other
> use of the hash might be implemented).
> 
> > Thanks,
> > vlad
> >
> >> -----Original Message-----
> >> From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org]
> On
> >> Behalf Of Vladislav Zolotarov
> >> Sent: Sunday, July 04, 2010 7:36 PM
> >> To: Tom Herbert
> >> Cc: netdev@vger.kernel.org
> >> Subject: RE: [PATCH] bnx2x: add support for receive hashing
> >>
> >> Tom, could u, pls., explain what did u mean by taking the (RSS) flags
> >> configuration out of RSS "if"? To recall "if(is_multi(bp))" is true iff
> RSS
> >> is enabled.
> >>
> >> Thanks,
> >> vlad
> >>
> >> > @@ -5750,10 +5757,10 @@ static void bnx2x_init_internal_func(struct
> bnx2x
> >> > *bp)
> >> >     u32 offset;
> >> >     u16 max_agg_size;
> >> >
> >> > -   if (is_multi(bp)) {
> >> > -           tstorm_config.config_flags = MULTI_FLAGS(bp);
> >> > +   tstorm_config.config_flags = RSS_FLAGS(bp);
> >> > +
> >> > +   if (is_multi(bp))
> >> >             tstorm_config.rss_result_mask = MULTI_MASK;
> >> > -   }
> >> >
> >> >     /* Enable TPA if needed */
> >> >     if (bp->flags & TPA_ENABLE_FLAG)
> >>
> >>
> >> > -----Original Message-----
> >> > From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org]
> On
> >> > Behalf Of Tom Herbert
> >> > Sent: Friday, April 23, 2010 8:54 AM
> >> > To: davem@davemloft.net; netdev@vger.kernel.org
> >> > Subject: [PATCH] bnx2x: add support for receive hashing
> >> >
> >> > Add support to bnx2x to extract Toeplitz hash out of the receive
> descriptor
> >> > for use in skb->rxhash.
> >> >
> >> > Signed-off-by: Tom Herbert <therbert@google.com>
> >> > ---
> >> > diff --git a/drivers/net/bnx2x.h b/drivers/net/bnx2x.h
> >> > index 0819530..8bd2368 100644
> >> > --- a/drivers/net/bnx2x.h
> >> > +++ b/drivers/net/bnx2x.h
> >> > @@ -1330,7 +1330,7 @@ static inline u32 reg_poll(struct bnx2x *bp, u32
> reg,
> >> > u32 expected, int ms,
> >> >             AEU_INPUTS_ATTN_BITS_MCP_LATCHED_UMP_TX_PARITY | \
> >> >             AEU_INPUTS_ATTN_BITS_MCP_LATCHED_SCPAD_PARITY)
> >> >
> >> > -#define MULTI_FLAGS(bp) \
> >> > +#define RSS_FLAGS(bp) \
> >> >             (TSTORM_ETH_FUNCTION_COMMON_CONFIG_RSS_IPV4_CAPABILITY | \
> >> >              TSTORM_ETH_FUNCTION_COMMON_CONFIG_RSS_IPV4_TCP_CAPABILITY |
> \
> >> >              TSTORM_ETH_FUNCTION_COMMON_CONFIG_RSS_IPV6_CAPABILITY | \
> >> > diff --git a/drivers/net/bnx2x_main.c b/drivers/net/bnx2x_main.c
> >> > index 0c6dba2..613f727 100644
> >> > --- a/drivers/net/bnx2x_main.c
> >> > +++ b/drivers/net/bnx2x_main.c
> >> > @@ -1582,7 +1582,7 @@ static int bnx2x_rx_int(struct bnx2x_fastpath *fp,
> >> int
> >> > budget)
> >> >             struct sw_rx_bd *rx_buf = NULL;
> >> >             struct sk_buff *skb;
> >> >             union eth_rx_cqe *cqe;
> >> > -           u8 cqe_fp_flags;
> >> > +           u8 cqe_fp_flags, cqe_fp_status_flags;
> >> >             u16 len, pad;
> >> >
> >> >             comp_ring_cons = RCQ_BD(sw_comp_cons);
> >> > @@ -1598,6 +1598,7 @@ static int bnx2x_rx_int(struct bnx2x_fastpath *fp,
> >> int
> >> > budget)
> >> >
> >> >             cqe = &fp->rx_comp_ring[comp_ring_cons];
> >> >             cqe_fp_flags = cqe->fast_path_cqe.type_error_flags;
> >> > +           cqe_fp_status_flags = cqe->fast_path_cqe.status_flags;
> >> >
> >> >             DP(NETIF_MSG_RX_STATUS, "CQE type %x  err %x  status %x"
> >> >                "  queue %x  vlan %x  len %u\n", CQE_TYPE(cqe_fp_flags),
> >> > @@ -1727,6 +1728,12 @@ reuse_rx:
> >> >
> >> >                     skb->protocol = eth_type_trans(skb, bp->dev);
> >> >
> >> > +                   if ((bp->dev->features & ETH_FLAG_RXHASH) &&
> >> > +                       (cqe_fp_status_flags &
> >> > +                        ETH_FAST_PATH_RX_CQE_RSS_HASH_FLG))
> >> > +                           skb->rxhash = le32_to_cpu(
> >> > +                               cqe->fast_path_cqe.rss_hash_result);
> >> > +
> >> >                     skb->ip_summed = CHECKSUM_NONE;
> >> >                     if (bp->rx_csum) {
> >> >                             if (likely(BNX2X_RX_CSUM_OK(cqe)))
> >> > @@ -5750,10 +5757,10 @@ static void bnx2x_init_internal_func(struct
> bnx2x
> >> > *bp)
> >> >     u32 offset;
> >> >     u16 max_agg_size;
> >> >
> >> > -   if (is_multi(bp)) {
> >> > -           tstorm_config.config_flags = MULTI_FLAGS(bp);
> >> > +   tstorm_config.config_flags = RSS_FLAGS(bp);
> >> > +
> >> > +   if (is_multi(bp))
> >> >             tstorm_config.rss_result_mask = MULTI_MASK;
> >> > -   }
> >> >
> >> >     /* Enable TPA if needed */
> >> >     if (bp->flags & TPA_ENABLE_FLAG)
> >> > @@ -6629,10 +6636,8 @@ static int bnx2x_init_common(struct bnx2x *bp)
> >> >     bnx2x_init_block(bp, PBF_BLOCK, COMMON_STAGE);
> >> >
> >> >     REG_WR(bp, SRC_REG_SOFT_RST, 1);
> >> > -   for (i = SRC_REG_KEYRSS0_0; i <= SRC_REG_KEYRSS1_9; i += 4) {
> >> > -           REG_WR(bp, i, 0xc0cac01a);
> >> > -           /* TODO: replace with something meaningful */
> >> > -   }
> >> > +   for (i = SRC_REG_KEYRSS0_0; i <= SRC_REG_KEYRSS1_9; i += 4)
> >> > +           REG_WR(bp, i, random32());
> >> >     bnx2x_init_block(bp, SRCH_BLOCK, COMMON_STAGE);
> >> >  #ifdef BCM_CNIC
> >> >     REG_WR(bp, SRC_REG_KEYSEARCH_0, 0x63285672);
> >> > @@ -11001,6 +11006,11 @@ static int bnx2x_set_flags(struct net_device
> *dev,
> >> > u32 data)
> >> >             changed = 1;
> >> >     }
> >> >
> >> > +   if (data & ETH_FLAG_RXHASH)
> >> > +           dev->features |= NETIF_F_RXHASH;
> >> > +   else
> >> > +           dev->features &= ~NETIF_F_RXHASH;
> >> > +
> >> >     if (changed && netif_running(dev)) {
> >> >             bnx2x_nic_unload(bp, UNLOAD_NORMAL);
> >> >             rc = bnx2x_nic_load(bp, LOAD_NORMAL);
> >> > --
> >> > To unsubscribe from this list: send the line "unsubscribe netdev" in
> >> > the body of a message to majordomo@vger.kernel.org
> >> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> >>
> >>
> >> --
> >> To unsubscribe from this list: send the line "unsubscribe netdev" in
> >> the body of a message to majordomo@vger.kernel.org
> >> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> >
> >
> >
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller July 11, 2010, 2:12 a.m. UTC | #36
From: Tom Herbert <therbert@google.com>
Date: Wed, 7 Jul 2010 12:17:31 -0700

> It is to enable the device to provide the RSS hash in RX descriptor.
> The hash severs two purposes now, it's used internally in the device
> to perform RSS table lookup and also value in RX descriptor.  The
> latter does not require multi-queue.  Strictly speaking, on a single
> processor system without multqueue, it would be true that enabling the
> RX hash on bnx2x is currently superfluous (notwithstanding some other
> use of the hash might be implemented).

We intend to use the card provided RSS hash to optimize GSO
flow comparisons at some point.

There are other possible uses as well.

Therefore even in a single RX queue configuration, the driver
should provide the hash if it can.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vladislav Zolotarov July 11, 2010, 10:02 a.m. UTC | #37
> -----Original Message-----
> From: David Miller [mailto:davem@davemloft.net]
> Sent: Sunday, July 11, 2010 5:13 AM
> To: therbert@google.com
> Cc: Vladislav Zolotarov; netdev@vger.kernel.org
> Subject: Re: [PATCH] bnx2x: add support for receive hashing
> 
> From: Tom Herbert <therbert@google.com>
> Date: Wed, 7 Jul 2010 12:17:31 -0700
> 
> > It is to enable the device to provide the RSS hash in RX descriptor.
> > The hash severs two purposes now, it's used internally in the device
> > to perform RSS table lookup and also value in RX descriptor.  The
> > latter does not require multi-queue.  Strictly speaking, on a single
> > processor system without multqueue, it would be true that enabling the
> > RX hash on bnx2x is currently superfluous (notwithstanding some other
> > use of the hash might be implemented).
> 
> We intend to use the card provided RSS hash to optimize GSO
> flow comparisons at some point.

Could u explain what did u mean here, pls.? GSO is a Tx flow while RSS hash is generated for the ingress traffic.

> 
> There are other possible uses as well.

Could u elaborate, pls?

> 
> Therefore even in a single RX queue configuration, the driver
> should provide the hash if it can.

Providing an RSS hash adds an additional work to our FW and HW. We would like to understand why do we need to pay this penalty. And surely we don't want to pay it for a possible use. We'd prefer to pay it when it's really needed.

Thanks,
vlad


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vladislav Zolotarov July 11, 2010, 1:16 p.m. UTC | #38
Dave, it's obvious that there a demand for a new HW/FW configuration from our side - "rx hash enable" which is currently tightly coupled with the RSS capability. As long as RSS flow in our FW includes a few more things apart from just creating an RSS hash and as long as there are flows (even hypothetical) that demand the RSS hash regardless the RSS itself we started to work on separation of these two features from FW perspective. This of course will demand a new FW version but once we have it we'll be able to be more specific in HW configuration and have a cleaner code.

Technically, our FW may provide the Rx HASH always and in a current driver configuration this is what it does.
I wonder if the driver always provides the HW RX HASH in the skb->rxhash regardless the value of NETIF_F_RXHASH bit in a netdev->features will it cause any harm? If not we can get rid of two extra conditionals in the bnx2x_rx_int().

Thanks,
vlad

> -----Original Message-----
> From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org] On
> Behalf Of Vladislav Zolotarov
> Sent: Sunday, July 11, 2010 1:02 PM
> To: David Miller; therbert@google.com
> Cc: netdev@vger.kernel.org
> Subject: RE: [PATCH] bnx2x: add support for receive hashing
> 
> 
> 
> > -----Original Message-----
> > From: David Miller [mailto:davem@davemloft.net]
> > Sent: Sunday, July 11, 2010 5:13 AM
> > To: therbert@google.com
> > Cc: Vladislav Zolotarov; netdev@vger.kernel.org
> > Subject: Re: [PATCH] bnx2x: add support for receive hashing
> >
> > From: Tom Herbert <therbert@google.com>
> > Date: Wed, 7 Jul 2010 12:17:31 -0700
> >
> > > It is to enable the device to provide the RSS hash in RX descriptor.
> > > The hash severs two purposes now, it's used internally in the device
> > > to perform RSS table lookup and also value in RX descriptor.  The
> > > latter does not require multi-queue.  Strictly speaking, on a single
> > > processor system without multqueue, it would be true that enabling the
> > > RX hash on bnx2x is currently superfluous (notwithstanding some other
> > > use of the hash might be implemented).
> >
> > We intend to use the card provided RSS hash to optimize GSO
> > flow comparisons at some point.
> 
> Could u explain what did u mean here, pls.? GSO is a Tx flow while RSS hash
> is generated for the ingress traffic.
> 
> >
> > There are other possible uses as well.
> 
> Could u elaborate, pls?
> 
> >
> > Therefore even in a single RX queue configuration, the driver
> > should provide the hash if it can.
> 
> Providing an RSS hash adds an additional work to our FW and HW. We would like
> to understand why do we need to pay this penalty. And surely we don't want to
> pay it for a possible use. We'd prefer to pay it when it's really needed.
> 
> Thanks,
> vlad
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet July 11, 2010, 4:45 p.m. UTC | #39
Le dimanche 11 juillet 2010 à 06:16 -0700, Vladislav Zolotarov a écrit :
> Dave, it's obvious that there a demand for a new HW/FW configuration
> from our side - "rx hash enable" which is currently tightly coupled
> with the RSS capability. As long as RSS flow in our FW includes a few
> more things apart from just creating an RSS hash and as long as there
> are flows (even hypothetical) that demand the RSS hash regardless the
> RSS itself we started to work on separation of these two features from
> FW perspective. This of course will demand a new FW version but once
> we have it we'll be able to be more specific in HW configuration and
> have a cleaner code.
> 
> Technically, our FW may provide the Rx HASH always and in a current
> driver configuration this is what it does.
> I wonder if the driver always provides the HW RX HASH in the
> skb->rxhash regardless the value of NETIF_F_RXHASH bit in a
> netdev->features will it cause any harm? If not we can get rid of two
> extra conditionals in the bnx2x_rx_int().

Hi

Please dont top-post on netdev, thanks.

NETIF_RX_HASH bit is needed so that we can disable skb->rxhash from a
particular NIC if we know the hardware provided rxhash is not fulfilling
our needs (We prefer spend some cpu cycles to recompute a software
rxhash).

Software one for example deliver same rxhash values for both ways of a
TCP flow, it can help conntracking for example.

The conditional in driver rx is cheap, since the condition is the same
for all packets and CPU can predicts the branch.



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vladislav Zolotarov July 11, 2010, 5:22 p.m. UTC | #40
> -----Original Message-----

> From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org] On

> Behalf Of Eric Dumazet

> Sent: Sunday, July 11, 2010 7:45 PM

> To: Vladislav Zolotarov

> Cc: David Miller; therbert@google.com; netdev@vger.kernel.org

> Subject: RE: [PATCH] bnx2x: add support for receive hashing

> 

> Le dimanche 11 juillet 2010 à 06:16 -0700, Vladislav Zolotarov a écrit :

> > Dave, it's obvious that there a demand for a new HW/FW configuration

> > from our side - "rx hash enable" which is currently tightly coupled

> > with the RSS capability. As long as RSS flow in our FW includes a few

> > more things apart from just creating an RSS hash and as long as there

> > are flows (even hypothetical) that demand the RSS hash regardless the

> > RSS itself we started to work on separation of these two features from

> > FW perspective. This of course will demand a new FW version but once

> > we have it we'll be able to be more specific in HW configuration and

> > have a cleaner code.

> >

> > Technically, our FW may provide the Rx HASH always and in a current

> > driver configuration this is what it does.

> > I wonder if the driver always provides the HW RX HASH in the

> > skb->rxhash regardless the value of NETIF_F_RXHASH bit in a

> > netdev->features will it cause any harm? If not we can get rid of two

> > extra conditionals in the bnx2x_rx_int().

> 

> Hi

> 

> Please dont top-post on netdev, thanks.


This discussion is directly related to Tom's patch that's why I'm posting on this thread. 

> 

> NETIF_RX_HASH bit is needed so that we can disable skb->rxhash from a

> particular NIC if we know the hardware provided rxhash is not fulfilling

> our needs (We prefer spend some cpu cycles to recompute a software

> rxhash).

> 

> Software one for example deliver same rxhash values for both ways of a

> TCP flow, it can help conntracking for example.


I understand that, in that case the proper implementation would be to check the netdev->features when u decide to calculate the SW rxhash, isn't it?

> 

> The conditional in driver rx is cheap, since the condition is the same

> for all packets and CPU can predicts the branch.


Not exactly. Our FW/HW won't provide the rxhash for none-IP packets setting the hash CQE field to zero and clearing the ETH_FAST_PATH_RX_CQE_RSS_HASH_FLG in the CQE statsu_flags (for ARPs for instance). Branch prediction is nice but considering my previous remark why do we need this branch at all?

> 

> 

> 

> --

> To unsubscribe from this list: send the line "unsubscribe netdev" in

> the body of a message to majordomo@vger.kernel.org

> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet July 11, 2010, 5:41 p.m. UTC | #41
Le dimanche 11 juillet 2010 à 10:22 -0700, Vladislav Zolotarov a écrit :
> 
> > -----Original Message-----
> > From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org] On
> > Behalf Of Eric Dumazet
> > Sent: Sunday, July 11, 2010 7:45 PM
> > To: Vladislav Zolotarov
> > Cc: David Miller; therbert@google.com; netdev@vger.kernel.org
> > Subject: RE: [PATCH] bnx2x: add support for receive hashing
> > 
> > Le dimanche 11 juillet 2010 à 06:16 -0700, Vladislav Zolotarov a écrit :
> > > Dave, it's obvious that there a demand for a new HW/FW configuration
> > > from our side - "rx hash enable" which is currently tightly coupled
> > > with the RSS capability. As long as RSS flow in our FW includes a few
> > > more things apart from just creating an RSS hash and as long as there
> > > are flows (even hypothetical) that demand the RSS hash regardless the
> > > RSS itself we started to work on separation of these two features from
> > > FW perspective. This of course will demand a new FW version but once
> > > we have it we'll be able to be more specific in HW configuration and
> > > have a cleaner code.
> > >
> > > Technically, our FW may provide the Rx HASH always and in a current
> > > driver configuration this is what it does.
> > > I wonder if the driver always provides the HW RX HASH in the
> > > skb->rxhash regardless the value of NETIF_F_RXHASH bit in a
> > > netdev->features will it cause any harm? If not we can get rid of two
> > > extra conditionals in the bnx2x_rx_int().
> > 
> > Hi
> > 
> > Please dont top-post on netdev, thanks.
> 
> This discussion is directly related to Tom's patch that's why I'm posting on this thread. 
> 

Thats not the question. I am saying "dont top-post", not "dont change
the subject"

> > 
> > NETIF_RX_HASH bit is needed so that we can disable skb->rxhash from a
> > particular NIC if we know the hardware provided rxhash is not fulfilling
> > our needs (We prefer spend some cpu cycles to recompute a software
> > rxhash).
> > 
> > Software one for example deliver same rxhash values for both ways of a
> > TCP flow, it can help conntracking for example.
> 
> I understand that, in that case the proper implementation would be to check the netdev->features when u decide to calculate the SW rxhash, isn't it?
> 

Nope, please check get_rps_cpus()

We only do :

if (skb->rxhash)
	goto got_hash; /* Skip hash computation on packet header */

That means if skb->rxhash is set, we dont force a recompute.

Your suggestion would move a test from device driver into
get_rps_cpus() ?

Given RPS is more targeted to old devices (not able to provide rxhash),
I am not sure it brings anything.

> > 
> > The conditional in driver rx is cheap, since the condition is the same
> > for all packets and CPU can predicts the branch.
> 
> Not exactly. Our FW/HW won't provide the rxhash for none-IP packets
> setting the hash CQE field to zero and clearing the
> ETH_FAST_PATH_RX_CQE_RSS_HASH_FLG in the CQE statsu_flags (for ARPs
> for instance). Branch prediction is nice but considering my previous
> remark why do we need this branch at all?

Please limit your lines to 70 chars

We want drivers to set skb->rxhash only if allowed to.

If you feel this is bad for your firmware/hardware/driver, dont set
skb->rxhash.



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vladislav Zolotarov July 11, 2010, 6:18 p.m. UTC | #42
> -----Original Message-----

> From: Eric Dumazet [mailto:eric.dumazet@gmail.com]

> Sent: Sunday, July 11, 2010 8:42 PM

> To: Vladislav Zolotarov

> Cc: David Miller; therbert@google.com; netdev@vger.kernel.org

> Subject: RE: [PATCH] bnx2x: add support for receive hashing

> 

> Le dimanche 11 juillet 2010 à 10:22 -0700, Vladislav Zolotarov a écrit :

> >

> > > -----Original Message-----

> > > From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org]

> On

> > > Behalf Of Eric Dumazet

> > > Sent: Sunday, July 11, 2010 7:45 PM

> > > To: Vladislav Zolotarov

> > > Cc: David Miller; therbert@google.com; netdev@vger.kernel.org

> > > Subject: RE: [PATCH] bnx2x: add support for receive hashing

> > >

> > > Le dimanche 11 juillet 2010 à 06:16 -0700, Vladislav Zolotarov a écrit :

> > > > Dave, it's obvious that there a demand for a new HW/FW configuration

> > > > from our side - "rx hash enable" which is currently tightly coupled

> > > > with the RSS capability. As long as RSS flow in our FW includes a few

> > > > more things apart from just creating an RSS hash and as long as there

> > > > are flows (even hypothetical) that demand the RSS hash regardless the

> > > > RSS itself we started to work on separation of these two features from

> > > > FW perspective. This of course will demand a new FW version but once

> > > > we have it we'll be able to be more specific in HW configuration and

> > > > have a cleaner code.

> > > >

> > > > Technically, our FW may provide the Rx HASH always and in a current

> > > > driver configuration this is what it does.

> > > > I wonder if the driver always provides the HW RX HASH in the

> > > > skb->rxhash regardless the value of NETIF_F_RXHASH bit in a

> > > > netdev->features will it cause any harm? If not we can get rid of two

> > > > extra conditionals in the bnx2x_rx_int().

> > >

> > > Hi

> > >

> > > Please dont top-post on netdev, thanks.

> >

> > This discussion is directly related to Tom's patch that's why I'm posting

> on this thread.

> >

> 

> Thats not the question. I am saying "dont top-post", not "dont change

> the subject"

> 

> > >

> > > NETIF_RX_HASH bit is needed so that we can disable skb->rxhash from a

> > > particular NIC if we know the hardware provided rxhash is not fulfilling

> > > our needs (We prefer spend some cpu cycles to recompute a software

> > > rxhash).

> > >

> > > Software one for example deliver same rxhash values for both ways of a

> > > TCP flow, it can help conntracking for example.

> >

> > I understand that, in that case the proper implementation would be to check

> the netdev->features when u decide to calculate the SW rxhash, isn't it?

> >

> 

> Nope, please check get_rps_cpus()

> 

> We only do :

> 

> if (skb->rxhash)

> 	goto got_hash; /* Skip hash computation on packet header */

> 

> That means if skb->rxhash is set, we dont force a recompute.


Ok. No, prob. In that case we just need to check the netdev->feature in
the bnx2x_rx_int(). Checking on CQE flags is a not needed.
I'll post a patch shortly. 

Thanks,
vlad

> 

> Your suggestion would move a test from device driver into

> get_rps_cpus() ?

> 

> Given RPS is more targeted to old devices (not able to provide rxhash),

> I am not sure it brings anything.

> 

> > >

> > > The conditional in driver rx is cheap, since the condition is the same

> > > for all packets and CPU can predicts the branch.

> >

> > Not exactly. Our FW/HW won't provide the rxhash for none-IP packets

> > setting the hash CQE field to zero and clearing the

> > ETH_FAST_PATH_RX_CQE_RSS_HASH_FLG in the CQE statsu_flags (for ARPs

> > for instance). Branch prediction is nice but considering my previous

> > remark why do we need this branch at all?

> 

> Please limit your lines to 70 chars

> 

> We want drivers to set skb->rxhash only if allowed to.

> 

> If you feel this is bad for your firmware/hardware/driver, dont set

 skb->rxhash.
> 

> 

>
David Miller July 11, 2010, 10:34 p.m. UTC | #43
From: "Vladislav Zolotarov" <vladz@broadcom.com>
Date: Sun, 11 Jul 2010 03:02:17 -0700

>> We intend to use the card provided RSS hash to optimize GSO
>> flow comparisons at some point.
> 
> Could u explain what did u mean here, pls.? GSO is a Tx flow while
> RSS hash is generated for the ingress traffic.

I of course meant GRO, not GSO.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/net/bnx2x.h b/drivers/net/bnx2x.h
index 0819530..8bd2368 100644
--- a/drivers/net/bnx2x.h
+++ b/drivers/net/bnx2x.h
@@ -1330,7 +1330,7 @@  static inline u32 reg_poll(struct bnx2x *bp, u32 reg, u32 expected, int ms,
 		AEU_INPUTS_ATTN_BITS_MCP_LATCHED_UMP_TX_PARITY | \
 		AEU_INPUTS_ATTN_BITS_MCP_LATCHED_SCPAD_PARITY)
 
-#define MULTI_FLAGS(bp) \
+#define RSS_FLAGS(bp) \
 		(TSTORM_ETH_FUNCTION_COMMON_CONFIG_RSS_IPV4_CAPABILITY | \
 		 TSTORM_ETH_FUNCTION_COMMON_CONFIG_RSS_IPV4_TCP_CAPABILITY | \
 		 TSTORM_ETH_FUNCTION_COMMON_CONFIG_RSS_IPV6_CAPABILITY | \
diff --git a/drivers/net/bnx2x_main.c b/drivers/net/bnx2x_main.c
index 0c6dba2..613f727 100644
--- a/drivers/net/bnx2x_main.c
+++ b/drivers/net/bnx2x_main.c
@@ -1582,7 +1582,7 @@  static int bnx2x_rx_int(struct bnx2x_fastpath *fp, int budget)
 		struct sw_rx_bd *rx_buf = NULL;
 		struct sk_buff *skb;
 		union eth_rx_cqe *cqe;
-		u8 cqe_fp_flags;
+		u8 cqe_fp_flags, cqe_fp_status_flags;
 		u16 len, pad;
 
 		comp_ring_cons = RCQ_BD(sw_comp_cons);
@@ -1598,6 +1598,7 @@  static int bnx2x_rx_int(struct bnx2x_fastpath *fp, int budget)
 
 		cqe = &fp->rx_comp_ring[comp_ring_cons];
 		cqe_fp_flags = cqe->fast_path_cqe.type_error_flags;
+		cqe_fp_status_flags = cqe->fast_path_cqe.status_flags;
 
 		DP(NETIF_MSG_RX_STATUS, "CQE type %x  err %x  status %x"
 		   "  queue %x  vlan %x  len %u\n", CQE_TYPE(cqe_fp_flags),
@@ -1727,6 +1728,12 @@  reuse_rx:
 
 			skb->protocol = eth_type_trans(skb, bp->dev);
 
+			if ((bp->dev->features & ETH_FLAG_RXHASH) &&
+			    (cqe_fp_status_flags &
+			     ETH_FAST_PATH_RX_CQE_RSS_HASH_FLG))
+				skb->rxhash = le32_to_cpu(
+				    cqe->fast_path_cqe.rss_hash_result);
+
 			skb->ip_summed = CHECKSUM_NONE;
 			if (bp->rx_csum) {
 				if (likely(BNX2X_RX_CSUM_OK(cqe)))
@@ -5750,10 +5757,10 @@  static void bnx2x_init_internal_func(struct bnx2x *bp)
 	u32 offset;
 	u16 max_agg_size;
 
-	if (is_multi(bp)) {
-		tstorm_config.config_flags = MULTI_FLAGS(bp);
+	tstorm_config.config_flags = RSS_FLAGS(bp);
+
+	if (is_multi(bp))
 		tstorm_config.rss_result_mask = MULTI_MASK;
-	}
 
 	/* Enable TPA if needed */
 	if (bp->flags & TPA_ENABLE_FLAG)
@@ -6629,10 +6636,8 @@  static int bnx2x_init_common(struct bnx2x *bp)
 	bnx2x_init_block(bp, PBF_BLOCK, COMMON_STAGE);
 
 	REG_WR(bp, SRC_REG_SOFT_RST, 1);
-	for (i = SRC_REG_KEYRSS0_0; i <= SRC_REG_KEYRSS1_9; i += 4) {
-		REG_WR(bp, i, 0xc0cac01a);
-		/* TODO: replace with something meaningful */
-	}
+	for (i = SRC_REG_KEYRSS0_0; i <= SRC_REG_KEYRSS1_9; i += 4)
+		REG_WR(bp, i, random32());
 	bnx2x_init_block(bp, SRCH_BLOCK, COMMON_STAGE);
 #ifdef BCM_CNIC
 	REG_WR(bp, SRC_REG_KEYSEARCH_0, 0x63285672);
@@ -11001,6 +11006,11 @@  static int bnx2x_set_flags(struct net_device *dev, u32 data)
 		changed = 1;
 	}
 
+	if (data & ETH_FLAG_RXHASH)
+		dev->features |= NETIF_F_RXHASH;
+	else
+		dev->features &= ~NETIF_F_RXHASH;
+
 	if (changed && netif_running(dev)) {
 		bnx2x_nic_unload(bp, UNLOAD_NORMAL);
 		rc = bnx2x_nic_load(bp, LOAD_NORMAL);