diff mbox series

[RFC,02/13] net: dsa: implement a central TX reallocation procedure

Message ID 20201017213611.2557565-3-vladimir.oltean@nxp.com
State RFC
Delegated to: David Miller
Headers show
Series Generic TX reallocation for DSA | expand

Commit Message

Vladimir Oltean Oct. 17, 2020, 9:36 p.m. UTC
At the moment, taggers are left with the task of ensuring that the skb
headers are writable (which they aren't, if the frames were cloned for
TX timestamping, for flooding by the bridge, etc), and that there is
enough space in the skb data area for the DSA tag to be pushed.

Moreover, the life of tail taggers is even harder, because they need to
ensure that short frames have enough padding, a problem that normal
taggers don't have.

The principle of the DSA framework is that everything except for the
most intimate hardware specifics (like in this case, the actual packing
of the DSA tag bits) should be done inside the core, to avoid having
code paths that are very rarely tested.

So provide a TX reallocation procedure that should cover the known needs
of DSA today.

Note that this patch also gives the network stack a good hint about the
headroom/tailroom it's going to need. Up till now it wasn't doing that.
So the reallocation procedure should really be there only for the
exceptional cases, and for cloned packets which need to be unshared.
The tx_reallocs counter should prove that.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
---
 net/dsa/slave.c | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

Comments

Vladimir Oltean Oct. 17, 2020, 10:01 p.m. UTC | #1
On Sun, Oct 18, 2020 at 12:36:00AM +0300, Vladimir Oltean wrote:
> diff --git a/net/dsa/slave.c b/net/dsa/slave.c
> index d4326940233c..790f5c8deb13 100644
> --- a/net/dsa/slave.c
> +++ b/net/dsa/slave.c
> @@ -548,6 +548,36 @@ netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev)
>  }
>  EXPORT_SYMBOL_GPL(dsa_enqueue_skb);
>  
> +static int dsa_realloc_skb(struct sk_buff *skb, struct net_device *dev)

I forgot to actually pad the skb here, if it's a tail tagger, silly me.
The following changes should do the trick.

> +{
> +	struct net_device *master = dsa_slave_to_master(dev);

The addition of master->needed_headroom and master->needed_tailroom used
to be here, that's why this unused variable is here.

> +	struct dsa_slave_priv *p = netdev_priv(dev);
> +	struct dsa_slave_stats *e;
> +	int headroom, tailroom;
	int padlen = 0, err;
> +
> +	headroom = dev->needed_headroom;
> +	tailroom = dev->needed_tailroom;
> +	/* For tail taggers, we need to pad short frames ourselves, to ensure
> +	 * that the tail tag does not fail at its role of being at the end of
> +	 * the packet, once the master interface pads the frame.
> +	 */
> +	if (unlikely(tailroom && skb->len < ETH_ZLEN))
> +		tailroom += ETH_ZLEN - skb->len;
		~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		padlen = ETH_ZLEN - skb->len;
	tailroom += padlen;
> +
> +	if (likely(skb_headroom(skb) >= headroom &&
> +		   skb_tailroom(skb) >= tailroom) &&
> +		   !skb_cloned(skb))
> +		/* No reallocation needed, yay! */
> +		return 0;
> +
> +	e = this_cpu_ptr(p->extra_stats);
> +	u64_stats_update_begin(&e->syncp);
> +	e->tx_reallocs++;
> +	u64_stats_update_end(&e->syncp);
> +
> +	return pskb_expand_head(skb, headroom, tailroom, GFP_ATOMIC);
	~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	err = pskb_expand_head(skb, headroom, tailroom, GFP_ATOMIC);
	if (err < 0 || !padlen)
		return err;

	return __skb_put_padto(skb, padlen, false);
> +}
> +
>  static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
>  {
>  	struct dsa_slave_priv *p = netdev_priv(dev);
> @@ -567,6 +597,11 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
>  	 */
>  	dsa_skb_tx_timestamp(p, skb);
>  
> +	if (dsa_realloc_skb(skb, dev)) {
> +		kfree_skb(skb);
> +		return NETDEV_TX_OK;
> +	}
> +
>  	/* Transmit function may have to reallocate the original SKB,
>  	 * in which case it must have freed it. Only free it here on error.
>  	 */
> @@ -1802,6 +1837,14 @@ int dsa_slave_create(struct dsa_port *port)
>  	slave_dev->netdev_ops = &dsa_slave_netdev_ops;
>  	if (ds->ops->port_max_mtu)
>  		slave_dev->max_mtu = ds->ops->port_max_mtu(ds, port->index);
> +	/* Try to save one extra realloc later in the TX path (in the master)
> +	 * by also inheriting the master's needed headroom and tailroom.
> +	 * The 8021q driver also does this.
> +	 */

Also, this comment is bogus given the current code. It should be removed
from here, and...

> +	if (cpu_dp->tag_ops->tail_tag)
> +		slave_dev->needed_tailroom = cpu_dp->tag_ops->overhead;
> +	else
> +		slave_dev->needed_headroom = cpu_dp->tag_ops->overhead;
...put here, along with:
	slave_dev->needed_headroom += master->needed_headroom;
	slave_dev->needed_tailroom += master->needed_tailroom;
>  	SET_NETDEV_DEVTYPE(slave_dev, &dsa_type);
>  
>  	netdev_for_each_tx_queue(slave_dev, dsa_slave_set_lockdep_class_one,
> -- 
> 2.25.1
>
Florian Fainelli Oct. 17, 2020, 10:11 p.m. UTC | #2
On 10/17/2020 3:01 PM, Vladimir Oltean wrote:
> On Sun, Oct 18, 2020 at 12:36:00AM +0300, Vladimir Oltean wrote:
>> diff --git a/net/dsa/slave.c b/net/dsa/slave.c
>> index d4326940233c..790f5c8deb13 100644
>> --- a/net/dsa/slave.c
>> +++ b/net/dsa/slave.c
>> @@ -548,6 +548,36 @@ netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev)
>>   }
>>   EXPORT_SYMBOL_GPL(dsa_enqueue_skb);
>>   
>> +static int dsa_realloc_skb(struct sk_buff *skb, struct net_device *dev)
> 
> I forgot to actually pad the skb here, if it's a tail tagger, silly me.
> The following changes should do the trick.
> 
>> +{
>> +	struct net_device *master = dsa_slave_to_master(dev);
> 
> The addition of master->needed_headroom and master->needed_tailroom used
> to be here, that's why this unused variable is here.
> 
>> +	struct dsa_slave_priv *p = netdev_priv(dev);
>> +	struct dsa_slave_stats *e;
>> +	int headroom, tailroom;
> 	int padlen = 0, err;
>> +
>> +	headroom = dev->needed_headroom;
>> +	tailroom = dev->needed_tailroom;
>> +	/* For tail taggers, we need to pad short frames ourselves, to ensure
>> +	 * that the tail tag does not fail at its role of being at the end of
>> +	 * the packet, once the master interface pads the frame.
>> +	 */
>> +	if (unlikely(tailroom && skb->len < ETH_ZLEN))
>> +		tailroom += ETH_ZLEN - skb->len;
> 		~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> 		padlen = ETH_ZLEN - skb->len;
> 	tailroom += padlen;
>> +
>> +	if (likely(skb_headroom(skb) >= headroom &&
>> +		   skb_tailroom(skb) >= tailroom) &&
>> +		   !skb_cloned(skb))
>> +		/* No reallocation needed, yay! */
>> +		return 0;
>> +
>> +	e = this_cpu_ptr(p->extra_stats);
>> +	u64_stats_update_begin(&e->syncp);
>> +	e->tx_reallocs++;
>> +	u64_stats_update_end(&e->syncp);
>> +
>> +	return pskb_expand_head(skb, headroom, tailroom, GFP_ATOMIC);
> 	~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> 	err = pskb_expand_head(skb, headroom, tailroom, GFP_ATOMIC);
> 	if (err < 0 || !padlen)
> 		return err;
> 
> 	return __skb_put_padto(skb, padlen, false);
>> +}
>> +
>>   static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
>>   {
>>   	struct dsa_slave_priv *p = netdev_priv(dev);
>> @@ -567,6 +597,11 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
>>   	 */
>>   	dsa_skb_tx_timestamp(p, skb);
>>   
>> +	if (dsa_realloc_skb(skb, dev)) {
>> +		kfree_skb(skb);
>> +		return NETDEV_TX_OK;
>> +	}
>> +
>>   	/* Transmit function may have to reallocate the original SKB,
>>   	 * in which case it must have freed it. Only free it here on error.
>>   	 */
>> @@ -1802,6 +1837,14 @@ int dsa_slave_create(struct dsa_port *port)
>>   	slave_dev->netdev_ops = &dsa_slave_netdev_ops;
>>   	if (ds->ops->port_max_mtu)
>>   		slave_dev->max_mtu = ds->ops->port_max_mtu(ds, port->index);
>> +	/* Try to save one extra realloc later in the TX path (in the master)
>> +	 * by also inheriting the master's needed headroom and tailroom.
>> +	 * The 8021q driver also does this.
>> +	 */
> 
> Also, this comment is bogus given the current code. It should be removed
> from here, and...
> 
>> +	if (cpu_dp->tag_ops->tail_tag)
>> +		slave_dev->needed_tailroom = cpu_dp->tag_ops->overhead;
>> +	else
>> +		slave_dev->needed_headroom = cpu_dp->tag_ops->overhead;
> ...put here, along with:
> 	slave_dev->needed_headroom += master->needed_headroom;
> 	slave_dev->needed_tailroom += master->needed_tailroom;

Not positive you need that because you may be account for more head or 
tail room than necessary.

For instance with tag_brcm.c and systemport.c we need 4 bytes of head 
room for the Broadcom tag and an additional 8 bytes for pushing the 
transmit status block descriptor in front of the Ethernet frame about to 
be transmitted. These additional 8 bytes are a requirement of the DSA 
master here and exist regardless of DSA being used, but we should not be 
propagating them to the DSA slave.
Vladimir Oltean Oct. 17, 2020, 10:17 p.m. UTC | #3
On Sat, Oct 17, 2020 at 03:11:52PM -0700, Florian Fainelli wrote:
> > 	slave_dev->needed_headroom += master->needed_headroom;
> > 	slave_dev->needed_tailroom += master->needed_tailroom;
> 
> Not positive you need that because you may be account for more head or tail
> room than necessary.
> 
> For instance with tag_brcm.c and systemport.c we need 4 bytes of head room
> for the Broadcom tag and an additional 8 bytes for pushing the transmit
> status block descriptor in front of the Ethernet frame about to be
> transmitted. These additional 8 bytes are a requirement of the DSA master
> here and exist regardless of DSA being used, but we should not be
> propagating them to the DSA slave.

And that's exactly what I'm trying to do here, do you see any problem
with it? Basically I'm telling the network stack to allocate skbs with
large enough headroom and tailroom so that reallocations will not be
necessary for its entire TX journey. Not in DSA and not in the
systemport either. That's the exact reason why the VLAN driver does this
too, as far as I understand. Doing this trick also has the benefit that
it works with stacked DSA devices too. The real master has a headroom
of, say, 16 bytes, the first-level switch has 16 bytes, and the
second-level switch has 16 more bytes. So when you inject an skb into
the second-level switch (the one with the user ports that applications
will use), the skb will be reallocated only once, with a new headroom of
16 * 3 bytes, instead of potentially 3 times (incrementally, first for
16, then for 32, then for 48). Am I missing something?
Vladimir Oltean Oct. 17, 2020, 10:31 p.m. UTC | #4
On Sun, Oct 18, 2020 at 01:01:04AM +0300, Vladimir Oltean wrote:
> > +	return pskb_expand_head(skb, headroom, tailroom, GFP_ATOMIC);
> 	~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> 	err = pskb_expand_head(skb, headroom, tailroom, GFP_ATOMIC);
> 	if (err < 0 || !padlen)
> 		return err;
> 
> 	return __skb_put_padto(skb, padlen, false);

Oops, another one here. Should be:

	return __skb_put_padto(skb, skb->len + padlen, false);
> > +}
Vladimir Oltean Oct. 18, 2020, 12:13 a.m. UTC | #5
On Sun, Oct 18, 2020 at 01:31:20AM +0300, Vladimir Oltean wrote:
> On Sun, Oct 18, 2020 at 01:01:04AM +0300, Vladimir Oltean wrote:
> > > +	return pskb_expand_head(skb, headroom, tailroom, GFP_ATOMIC);
> > 	~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> > 	err = pskb_expand_head(skb, headroom, tailroom, GFP_ATOMIC);
> > 	if (err < 0 || !padlen)
> > 		return err;
> > 
> > 	return __skb_put_padto(skb, padlen, false);
> 
> Oops, another one here. Should be:
> 
> 	return __skb_put_padto(skb, skb->len + padlen, false);
> > > +}

Last one for today. This should actually be correct now, and not
allocate double the needed headroom size.

static int dsa_realloc_skb(struct sk_buff *skb, struct net_device *dev)
{
	struct dsa_slave_priv *p = netdev_priv(dev);
	struct dsa_slave_stats *e;
	int needed_headroom;
	int needed_tailroom;
	int padlen = 0, err;

	needed_headroom = dev->needed_headroom;
	needed_tailroom = dev->needed_tailroom;
	/* For tail taggers, we need to pad short frames ourselves, to ensure
	 * that the tail tag does not fail at its role of being at the end of
	 * the packet, once the master interface pads the frame.
	 */
	if (unlikely(needed_tailroom && skb->len < ETH_ZLEN))
		padlen = ETH_ZLEN - skb->len;
	needed_tailroom += padlen;
	needed_headroom -= skb_headroom(skb);
	needed_tailroom -= skb_tailroom(skb);

	if (likely(needed_headroom <= 0 && needed_tailroom <= 0 &&
		   !skb_cloned(skb)))
		/* No reallocation needed, yay! */
		return 0;

	e = this_cpu_ptr(p->extra_stats);
	u64_stats_update_begin(&e->syncp);
	e->tx_reallocs++;
	u64_stats_update_end(&e->syncp);

	err = pskb_expand_head(skb, max(needed_headroom, 0),
			       max(needed_tailroom, 0), GFP_ATOMIC);
	if (err < 0 || !padlen)
		return err;

	return __skb_put_padto(skb, ETH_ZLEN, false);
}
Florian Fainelli Oct. 18, 2020, 12:37 a.m. UTC | #6
On 10/17/2020 3:17 PM, Vladimir Oltean wrote:
> On Sat, Oct 17, 2020 at 03:11:52PM -0700, Florian Fainelli wrote:
>>> 	slave_dev->needed_headroom += master->needed_headroom;
>>> 	slave_dev->needed_tailroom += master->needed_tailroom;
>>
>> Not positive you need that because you may be account for more head or tail
>> room than necessary.
>>
>> For instance with tag_brcm.c and systemport.c we need 4 bytes of head room
>> for the Broadcom tag and an additional 8 bytes for pushing the transmit
>> status block descriptor in front of the Ethernet frame about to be
>> transmitted. These additional 8 bytes are a requirement of the DSA master
>> here and exist regardless of DSA being used, but we should not be
>> propagating them to the DSA slave.
> 
> And that's exactly what I'm trying to do here, do you see any problem
> with it? Basically I'm telling the network stack to allocate skbs with
> large enough headroom and tailroom so that reallocations will not be
> necessary for its entire TX journey. Not in DSA and not in the
> systemport either. That's the exact reason why the VLAN driver does this
> too, as far as I understand. Doing this trick also has the benefit that
> it works with stacked DSA devices too. The real master has a headroom
> of, say, 16 bytes, the first-level switch has 16 bytes, and the
> second-level switch has 16 more bytes. So when you inject an skb into
> the second-level switch (the one with the user ports that applications
> will use), the skb will be reallocated only once, with a new headroom of
> 16 * 3 bytes, instead of potentially 3 times (incrementally, first for
> 16, then for 32, then for 48). Am I missing something?
> 

That is fine with me, given that we can resolve most of the TX path 
ahead of time, I suppose we should indeed take advantage of that 
knowledge. Thanks!
Christian Eggers Oct. 18, 2020, 10:36 a.m. UTC | #7
Hi Vladimir,

thank you very much for bringing some progress into this.

I tried to test on top of latest net-next, but I currently get a linker error (not  related to this):
arch/arm/vfp/vfphw.o: in function `vfp_support_entry':
arch/arm/vfp/vfphw.S:85:(.text+0xa): relocation truncated to fit: R_ARM_THM_JUMP19 against symbol `vfp_kmode_exception' defined in .text.unlikely section in arch/arm/vfp/vfpmodule.o

So continued testing of your series (with all updates) and my PTP work on top
of net-next from 2020-10-14.

On Sunday, 18 October 2020, 02:13:27 CEST, Vladimir Oltean wrote:
> Last one for today. This should actually be correct now, and not
> allocate double the needed headroom size.
> 
> static int dsa_realloc_skb(struct sk_buff *skb, struct net_device *dev)
> {
> 	struct dsa_slave_priv *p = netdev_priv(dev);
> 	struct dsa_slave_stats *e;
> 	int needed_headroom;
> 	int needed_tailroom;
> 	int padlen = 0, err;
> 
> 	needed_headroom = dev->needed_headroom;
> 	needed_tailroom = dev->needed_tailroom;
Debugging shows that these values are correct for my device (0/5).

> 	/* For tail taggers, we need to pad short frames ourselves, to ensure
> 	 * that the tail tag does not fail at its role of being at the end of
> 	 * the packet, once the master interface pads the frame.
> 	 */
> 	if (unlikely(needed_tailroom && skb->len < ETH_ZLEN))
Can "needed_tailroom" be used equivalently for dsa_device_ops::tail_tag?

> 		padlen = ETH_ZLEN - skb->len;
> 	needed_tailroom += padlen;
> 	needed_headroom -= skb_headroom(skb);
> 	needed_tailroom -= skb_tailroom(skb);
> 
> 	if (likely(needed_headroom <= 0 && needed_tailroom <= 0 &&
> 		   !skb_cloned(skb)))
> 		/* No reallocation needed, yay! */
> 		return 0;
> 
> 	e = this_cpu_ptr(p->extra_stats);
> 	u64_stats_update_begin(&e->syncp);
> 	e->tx_reallocs++;
> 	u64_stats_update_end(&e->syncp);
> 
> 	err = pskb_expand_head(skb, max(needed_headroom, 0),
> 			       max(needed_tailroom, 0), GFP_ATOMIC);
You may remove the second max() statement (around needed_tailroom). This would
size the reallocated skb more exactly to the size actually required an may save
some RAM (already tested too).

Alternatively I suggest moving the max() statements up in order to completely
avoid negative values for needed_headroom / needed_tailroom:

	needed_headroom = max(needed_headroom - skb_headroom(skb), 0);
	needed_tailroom = max(needed_tailroom - skb_tailroom(skb), 0);

	if (likely(needed_headroom == 0 && needed_tailroom == 0 &&
		   !skb_cloned(skb)))
		/* No reallocation needed, yay! */
		return 0;
	...
	err = pskb_expand_head(skb, needed_headroom, needed_tailroom, GFP_ATOMIC);

> 	if (err < 0 || !padlen)
> 		return err;
This is correct but looks misleading for me. What about...
	if (err < 0)
		return err;

	return padlen ? __skb_put_padto(skb, ETH_ZLEN, false) : 0;


FYI two dumps of a padded skb (before/after) dsa_realloc_skb():
[ 1983.621180] old:skb len=58 headroom=2 headlen=58 tailroom=68
[ 1983.621180] mac=(2,14) net=(16,-1) trans=-1
[ 1983.621180] shinfo(txflags=1 nr_frags=0 gso(size=0 type=0 segs=0))
[ 1983.621180] csum(0x0 ip_summed=0 complete_sw=0 valid=0 level=0)
[ 1983.621180] hash(0x0 sw=0 l4=0) proto=0x88f7 pkttype=0 iif=0
[ 1983.635575] old:dev name=lan0 feat=0x0x0002000000005220
[ 1983.638205] old:sk family=17 type=3 proto=0
[ 1983.640323] old:skb linear:   00000000: 01 1b 19 00 00 00 ee 97 1f aa 93 21 88 f7 01 02
[ 1983.644416] old:skb linear:   00000010: 00 2c 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[ 1983.648656] old:skb linear:   00000020: 00 00 ee 97 1f ff fe aa 93 21 00 01 06 79 01 7f
[ 1983.652726] old:skb linear:   00000030: 00 00 00 00 00 00 00 00 00 00

[ 1983.656040] new:skb len=60 headroom=2 headlen=60 tailroom=66
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ok
[ 1983.656040] mac=(2,14) net=(16,-1) trans=-1
[ 1983.656040] shinfo(txflags=1 nr_frags=0 gso(size=0 type=0 segs=0))
[ 1983.656040] csum(0x0 ip_summed=0 complete_sw=0 valid=0 level=0)
[ 1983.656040] hash(0x0 sw=0 l4=0) proto=0x88f7 pkttype=0 iif=0
[ 1983.670427] new:dev name=lan0 feat=0x0x0002000000005220
[ 1983.673082] new:sk family=17 type=3 proto=0
[ 1983.675233] new:skb linear:   00000000: 01 1b 19 00 00 00 ee 97 1f aa 93 21 88 f7 01 02
[ 1983.679329] new:skb linear:   00000010: 00 2c 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[ 1983.683411] new:skb linear:   00000020: 00 00 ee 97 1f ff fe aa 93 21 00 01 06 79 01 7f
[ 1983.687506] new:skb linear:   00000030: 00 00 00 00 00 00 00 00 00 00 00 00

Tested-by: Christian Eggers <ceggers@arri.de>  # For tail taggers only

Best regards
Christian Eggers
Vladimir Oltean Oct. 18, 2020, 11:42 a.m. UTC | #8
On Sun, Oct 18, 2020 at 12:36:03PM +0200, Christian Eggers wrote:
> >       /* For tail taggers, we need to pad short frames ourselves, to ensure
> >        * that the tail tag does not fail at its role of being at the end of
> >        * the packet, once the master interface pads the frame.
> >        */
> >       if (unlikely(needed_tailroom && skb->len < ETH_ZLEN))
> Can "needed_tailroom" be used equivalently for dsa_device_ops::tail_tag?

For what I care, it's "equivalent enough".
Since needed_tailroom comes from slave->needed_tailroom + master->needed_tailroom,
there might be a situation when slave->needed_tailroom is 0, but padding
is performed nonetheless. I am not sure that this is something that will
occur in practice. If you grep drivers/net/ethernet, only Sun Virtual Network
devices set dev->needed_tailroom. I would prefer avoiding to touch any
other cache line, and not duplicating the tail_tag or overhead members
if I can avoid it. If it becomes a problem, I'll make that change.

> >               padlen = ETH_ZLEN - skb->len;
> >       needed_tailroom += padlen;
> >       needed_headroom -= skb_headroom(skb);
> >       needed_tailroom -= skb_tailroom(skb);
> >
> >       if (likely(needed_headroom <= 0 && needed_tailroom <= 0 &&
> >                  !skb_cloned(skb)))
> >               /* No reallocation needed, yay! */
> >               return 0;
> >
> >       e = this_cpu_ptr(p->extra_stats);
> >       u64_stats_update_begin(&e->syncp);
> >       e->tx_reallocs++;
> >       u64_stats_update_end(&e->syncp);
> >
> >       err = pskb_expand_head(skb, max(needed_headroom, 0),
> >                              max(needed_tailroom, 0), GFP_ATOMIC);
> You may remove the second max() statement (around needed_tailroom). This would
> size the reallocated skb more exactly to the size actually required an may save
> some RAM (already tested too).

Please explain more. needed_tailroom can be negative, why should I
shrink the tailroom?

> Alternatively I suggest moving the max() statements up in order to completely
> avoid negative values for needed_headroom / needed_tailroom:
> 
>         needed_headroom = max(needed_headroom - skb_headroom(skb), 0);
>         needed_tailroom = max(needed_tailroom - skb_tailroom(skb), 0);
> 
>         if (likely(needed_headroom == 0 && needed_tailroom == 0 &&
>                    !skb_cloned(skb)))
>                 /* No reallocation needed, yay! */
>                 return 0;
>         ...
>         err = pskb_expand_head(skb, needed_headroom, needed_tailroom, GFP_ATOMIC);
> 

Ok, this looks better, thanks.

> >       if (err < 0 || !padlen)
> >               return err;
> This is correct but looks misleading for me. What about...
>         if (err < 0)
>                 return err;
> 
>         return padlen ? __skb_put_padto(skb, ETH_ZLEN, false) : 0;
> 

Ok, I suppose it can be misleading. Will do this even if it's one more
branch. It's in the unlikely path anyway.

> FYI two dumps of a padded skb (before/after) dsa_realloc_skb():
> [ 1983.621180] old:skb len=58 headroom=2 headlen=58 tailroom=68
> [ 1983.621180] mac=(2,14) net=(16,-1) trans=-1
> [ 1983.621180] shinfo(txflags=1 nr_frags=0 gso(size=0 type=0 segs=0))
> [ 1983.621180] csum(0x0 ip_summed=0 complete_sw=0 valid=0 level=0)
> [ 1983.621180] hash(0x0 sw=0 l4=0) proto=0x88f7 pkttype=0 iif=0
> [ 1983.635575] old:dev name=lan0 feat=0x0x0002000000005220
> [ 1983.638205] old:sk family=17 type=3 proto=0
> [ 1983.640323] old:skb linear:   00000000: 01 1b 19 00 00 00 ee 97 1f aa 93 21 88 f7 01 02
> [ 1983.644416] old:skb linear:   00000010: 00 2c 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> [ 1983.648656] old:skb linear:   00000020: 00 00 ee 97 1f ff fe aa 93 21 00 01 06 79 01 7f
> [ 1983.652726] old:skb linear:   00000030: 00 00 00 00 00 00 00 00 00 00
> 
> [ 1983.656040] new:skb len=60 headroom=2 headlen=60 tailroom=66
> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ok
> [ 1983.656040] mac=(2,14) net=(16,-1) trans=-1
> [ 1983.656040] shinfo(txflags=1 nr_frags=0 gso(size=0 type=0 segs=0))
> [ 1983.656040] csum(0x0 ip_summed=0 complete_sw=0 valid=0 level=0)
> [ 1983.656040] hash(0x0 sw=0 l4=0) proto=0x88f7 pkttype=0 iif=0
> [ 1983.670427] new:dev name=lan0 feat=0x0x0002000000005220
> [ 1983.673082] new:sk family=17 type=3 proto=0
> [ 1983.675233] new:skb linear:   00000000: 01 1b 19 00 00 00 ee 97 1f aa 93 21 88 f7 01 02
> [ 1983.679329] new:skb linear:   00000010: 00 2c 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> [ 1983.683411] new:skb linear:   00000020: 00 00 ee 97 1f ff fe aa 93 21 00 01 06 79 01 7f
> [ 1983.687506] new:skb linear:   00000030: 00 00 00 00 00 00 00 00 00 00 00 00

The dumps look ok, and in line with what I saw.
For what it's worth, anybody can test with any tagger, you don' need
dedicated hardware. You just need to replace the value returned by your
dsa_switch_ops::get_tag_protocol method. This is enough to get skb dumps.
For more complicated things like ensuring 1588 timestamping
works, it won't be enough, of course, so your testing is still very
valuable to ensure that keeps working for you (it does for me).

> 
> Tested-by: Christian Eggers <ceggers@arri.de>  # For tail taggers only

Thanks, I'll resend this in about 2 weeks. In the meantime I'll update
this branch:
https://github.com/vladimiroltean/linux/commits/dsa-tx-realloc
Christian Eggers Oct. 18, 2020, 11:59 a.m. UTC | #9
On Sunday, 18 October 2020, 13:42:06 CEST, Vladimir Oltean wrote:
> On Sun, Oct 18, 2020 at 12:36:03PM +0200, Christian Eggers wrote:
> > >       err = pskb_expand_head(skb, max(needed_headroom, 0),
> > >       
> > >                              max(needed_tailroom, 0), GFP_ATOMIC);
> > 
> > You may remove the second max() statement (around needed_tailroom). This
> > would size the reallocated skb more exactly to the size actually required
> > an may save some RAM (already tested too).
> 
> Please explain more. needed_tailroom can be negative, why should I
> shrink the tailroom?
Because it will not be required anymore. This may lead to smaller memory 
allocations or the excess tailroom can be reused for headroom if needed. If 
none of both applies, the tailroom will not be changed.

regards
Christian
Vladimir Oltean Oct. 18, 2020, 12:15 p.m. UTC | #10
On Sun, Oct 18, 2020 at 01:59:43PM +0200, Christian Eggers wrote:
> On Sunday, 18 October 2020, 13:42:06 CEST, Vladimir Oltean wrote:
> > On Sun, Oct 18, 2020 at 12:36:03PM +0200, Christian Eggers wrote:
> > > >       err = pskb_expand_head(skb, max(needed_headroom, 0),
> > > >
> > > >                              max(needed_tailroom, 0), GFP_ATOMIC);
> > >
> > > You may remove the second max() statement (around needed_tailroom). This
> > > would size the reallocated skb more exactly to the size actually required
> > > an may save some RAM (already tested too).
> >
> > Please explain more. needed_tailroom can be negative, why should I
> > shrink the tailroom?
> Because it will not be required anymore. This may lead to smaller memory
> allocations or the excess tailroom can be reused for headroom if needed. If
> none of both applies, the tailroom will not be changed.

Understand now, you're talking about the case where the tailroom in the
skb is already larger than what we estimate the packet will need through
its entire journey in the TX path. I still won't shrink it though, I'll
keep using the second approach you suggested.
David Laight Oct. 19, 2020, 8:33 a.m. UTC | #11
From: Florian Fainelli>
> Sent: 17 October 2020 23:12
..
> Not positive you need that because you may be account for more head or
> tail room than necessary.
> 
> For instance with tag_brcm.c and systemport.c we need 4 bytes of head
> room for the Broadcom tag and an additional 8 bytes for pushing the
> transmit status block descriptor in front of the Ethernet frame about to
> be transmitted. These additional 8 bytes are a requirement of the DSA
> master here and exist regardless of DSA being used, but we should not be
> propagating them to the DSA slave.

Is it possible to send the extra bytes from a separate buffer fragment?
The entire area could be allocated (coherent) when the rings are
allocated.
That would save having to modify the skb at all.

Even if some bytes of the frame header need 'adjusting' transmitting
from a copy may be faster - especially on systems with an iommu.

Many (many) moons ago we found the cutoff point for copying frames
on a system with an iommu to be around 1k bytes.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Vladimir Oltean Oct. 19, 2020, 10:30 a.m. UTC | #12
On Mon, Oct 19, 2020 at 08:33:27AM +0000, David Laight wrote:
> Is it possible to send the extra bytes from a separate buffer fragment?
> The entire area could be allocated (coherent) when the rings are
> allocated.
> That would save having to modify the skb at all.
> 
> Even if some bytes of the frame header need 'adjusting' transmitting
> from a copy may be faster - especially on systems with an iommu.
> 
> Many (many) moons ago we found the cutoff point for copying frames
> on a system with an iommu to be around 1k bytes.

Please help me understand better how to implement what you're suggesting.
DSA switches have 3 places where they might insert a tag:
1. Between the source MAC address and the EtherType (this is the most
   common)
2. Before the destination MAC address
3. Before the FCS

I imagine that the most common scenario (1) is also the most difficult
to implement using fragments, since I would need to split the Ethernet
header from the rest of the skb data area, which might defeat the
purpose.

Also, simply integrating these 3 code paths into something generic will
bring challenges of its own.

Lastly, I fully expect the buffers to have proper headroom and tailroom
now (if they don't, then it's worth investigating what's the code path
that doesn't observe our dev->needed_headroom and dev->needed_tailroom).
The reallocation code is there just for clones (and as far as I
understand, fragments won't save us from the need of reallocating the
data areas of clones), and for short frames with DSA switches in case
(3).
David Laight Oct. 19, 2020, 11:14 a.m. UTC | #13
From: Vladimir Oltean
> Sent: 19 October 2020 11:31
> 
> On Mon, Oct 19, 2020 at 08:33:27AM +0000, David Laight wrote:
> > Is it possible to send the extra bytes from a separate buffer fragment?
> > The entire area could be allocated (coherent) when the rings are
> > allocated.
> > That would save having to modify the skb at all.
> >
> > Even if some bytes of the frame header need 'adjusting' transmitting
> > from a copy may be faster - especially on systems with an iommu.
> >
> > Many (many) moons ago we found the cutoff point for copying frames
> > on a system with an iommu to be around 1k bytes.
> 
> Please help me understand better how to implement what you're suggesting.
> DSA switches have 3 places where they might insert a tag:
> 1. Between the source MAC address and the EtherType (this is the most
>    common)
> 2. Before the destination MAC address
> 3. Before the FCS
> 
> I imagine that the most common scenario (1) is also the most difficult
> to implement using fragments, since I would need to split the Ethernet
> header from the rest of the skb data area, which might defeat the
> purpose.
> 
> Also, simply integrating these 3 code paths into something generic will
> bring challenges of its own.
> 
> Lastly, I fully expect the buffers to have proper headroom and tailroom
> now (if they don't, then it's worth investigating what's the code path
> that doesn't observe our dev->needed_headroom and dev->needed_tailroom).
> The reallocation code is there just for clones (and as far as I
> understand, fragments won't save us from the need of reallocating the
> data areas of clones), and for short frames with DSA switches in case
> (3).

If the skb are 'cloned' then I suspect you can't even put the MAC addresses
into the skb because they might be being transmitted on another interface.

OTOH TCP will have cloned the skb for retransmit so may ensure than there
isn't (still) a second reference (from an old transmit) before doing the
transmit - in which case you can write into the head/tail space.

Hmmm... I was thinking you were doing something for a specific driver.
But it looks more like it depends on what the interface is connected to.

If the MAC addresses (and ethertype) can be written into the skb head
space then it must be valid to rewrite the header containing the tag.
(With the proviso that none of the MAC drivers try to decode it again.)

One thing I have noticed is that the size of the 'headroom' for UDP
and RAWIP (where I was looking) packets depends on the final 'dev'.
This means that you can't copy the frame into an skb until you know
the 'dev' - but that might depend on the frame data.
This is a 'catch-22' problem.

I actually wonder how much the headroom varies.
It might be worth having a system-wide 'headroom' value.
A few extra bytes aren't really going to make any difference.

That might make it far less likely that there isn't the available
headroom for the tag - in which case you can just log once and discard.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Vladimir Oltean Oct. 19, 2020, 11:41 a.m. UTC | #14
On Mon, Oct 19, 2020 at 11:14:07AM +0000, David Laight wrote:
> If the skb are 'cloned' then I suspect you can't even put the MAC addresses
> into the skb because they might be being transmitted on another interface.

No, that's not right. The bridge floods frames by cloning them, and L2
forwarding doesn't need to modify the L2 header in any way. So when you
have a bridge upper, you will get cloned skbs with a valid MAC header in
them.

> OTOH TCP will have cloned the skb for retransmit so may ensure than there
> isn't (still) a second reference (from an old transmit) before doing the
> transmit - in which case you can write into the head/tail space.

In the case of TCP, I suppose the DSA layer will never see cloned skbs
for the reason you mention, true.

> Hmmm... I was thinking you were doing something for a specific driver.
> But it looks more like it depends on what the interface is connected to.

I'm not sure what you mean here.

> If the MAC addresses (and ethertype) can be written into the skb head
> space then it must be valid to rewrite the header containing the tag.
> (With the proviso that none of the MAC drivers try to decode it again.)

This phrase is almost correct.
Hardware TX timestamping also clones the skb, because the clone needs to
be ultimately reinjected into the socket's error queue, for the user
space program to retrieve the timestamp via a cmsg.

> One thing I have noticed is that the size of the 'headroom' for UDP
> and RAWIP (where I was looking) packets depends on the final 'dev'.
> This means that you can't copy the frame into an skb until you know
> the 'dev' - but that might depend on the frame data.
> This is a 'catch-22' problem.
> I actually wonder how much the headroom varies.
> It might be worth having a system-wide 'headroom' value.
> A few extra bytes aren't really going to make any difference.
>
> That might make it far less likely that there isn't the available
> headroom for the tag - in which case you can just log once and discard.

Again, the case of the bridge, and of TX timestamping, and of tail
taggers that need to perform padding, is enough that we need to
reallocate skbs, and can't just drop them when they're not writable or
there isn't enough head/tailroom.
Andrew Lunn Oct. 19, 2020, 12:19 p.m. UTC | #15
On Mon, Oct 19, 2020 at 10:30:47AM +0000, Vladimir Oltean wrote:
> On Mon, Oct 19, 2020 at 08:33:27AM +0000, David Laight wrote:
> > Is it possible to send the extra bytes from a separate buffer fragment?
> > The entire area could be allocated (coherent) when the rings are
> > allocated.
> > That would save having to modify the skb at all.
> > 
> > Even if some bytes of the frame header need 'adjusting' transmitting
> > from a copy may be faster - especially on systems with an iommu.
> > 
> > Many (many) moons ago we found the cutoff point for copying frames
> > on a system with an iommu to be around 1k bytes.
> 
> Please help me understand better how to implement what you're suggesting.
> DSA switches have 3 places where they might insert a tag:
> 1. Between the source MAC address and the EtherType (this is the most
>    common)
> 2. Before the destination MAC address
> 3. Before the FCS
> 
> I imagine that the most common scenario (1) is also the most difficult
> to implement using fragments, since I would need to split the Ethernet
> header from the rest of the skb data area, which might defeat the
> purpose.

We also have length issues. Most scatter/gather DMA engines require
the fragments are multiple of 4 bytes. Only the last segment does not
have this length restriction. And some of the DSA tag headers are 2
bytes, or 1 byte. So some master devices are going to have to convert
the fragments back to a linear buffer.

    Andrew
diff mbox series

Patch

diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index d4326940233c..790f5c8deb13 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -548,6 +548,36 @@  netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev)
 }
 EXPORT_SYMBOL_GPL(dsa_enqueue_skb);
 
+static int dsa_realloc_skb(struct sk_buff *skb, struct net_device *dev)
+{
+	struct net_device *master = dsa_slave_to_master(dev);
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct dsa_slave_stats *e;
+	int headroom, tailroom;
+
+	headroom = dev->needed_headroom;
+	tailroom = dev->needed_tailroom;
+	/* For tail taggers, we need to pad short frames ourselves, to ensure
+	 * that the tail tag does not fail at its role of being at the end of
+	 * the packet, once the master interface pads the frame.
+	 */
+	if (unlikely(tailroom && skb->len < ETH_ZLEN))
+		tailroom += ETH_ZLEN - skb->len;
+
+	if (likely(skb_headroom(skb) >= headroom &&
+		   skb_tailroom(skb) >= tailroom) &&
+		   !skb_cloned(skb))
+		/* No reallocation needed, yay! */
+		return 0;
+
+	e = this_cpu_ptr(p->extra_stats);
+	u64_stats_update_begin(&e->syncp);
+	e->tx_reallocs++;
+	u64_stats_update_end(&e->syncp);
+
+	return pskb_expand_head(skb, headroom, tailroom, GFP_ATOMIC);
+}
+
 static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
@@ -567,6 +597,11 @@  static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
 	 */
 	dsa_skb_tx_timestamp(p, skb);
 
+	if (dsa_realloc_skb(skb, dev)) {
+		kfree_skb(skb);
+		return NETDEV_TX_OK;
+	}
+
 	/* Transmit function may have to reallocate the original SKB,
 	 * in which case it must have freed it. Only free it here on error.
 	 */
@@ -1802,6 +1837,14 @@  int dsa_slave_create(struct dsa_port *port)
 	slave_dev->netdev_ops = &dsa_slave_netdev_ops;
 	if (ds->ops->port_max_mtu)
 		slave_dev->max_mtu = ds->ops->port_max_mtu(ds, port->index);
+	/* Try to save one extra realloc later in the TX path (in the master)
+	 * by also inheriting the master's needed headroom and tailroom.
+	 * The 8021q driver also does this.
+	 */
+	if (cpu_dp->tag_ops->tail_tag)
+		slave_dev->needed_tailroom = cpu_dp->tag_ops->overhead;
+	else
+		slave_dev->needed_headroom = cpu_dp->tag_ops->overhead;
 	SET_NETDEV_DEVTYPE(slave_dev, &dsa_type);
 
 	netdev_for_each_tx_queue(slave_dev, dsa_slave_set_lockdep_class_one,