
[net-next,4/5] veth: introduce more xdp counters

Message ID 0763c17646523acb4dc15aaec01decb4efe11eac.1584635611.git.lorenzo@kernel.org
State Accepted
Delegated to: David Miller
Series: add more xdp stats to veth driver

Commit Message

Lorenzo Bianconi March 19, 2020, 4:41 p.m. UTC
Introduce xdp_xmit counter in order to distinguish between XDP_TX and
ndo_xdp_xmit stats. Introduce the following ethtool counters:
- rx_xdp_tx
- rx_xdp_tx_errors
- tx_xdp_xmit
- tx_xdp_xmit_errors
- rx_xdp_redirect

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 drivers/net/veth.c | 42 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 35 insertions(+), 7 deletions(-)
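
For illustration, with this patch applied the new per-queue counters show up in
"ethtool -S" output with names built from veth_rq_stats_desc below (the queue
index and the zero values here are made up):

rx_queue_0_xdp_packets: 0
rx_queue_0_xdp_bytes: 0
rx_queue_0_rx_drops: 0
rx_queue_0_rx_xdp_redirect: 0
rx_queue_0_rx_xdp_drops: 0
rx_queue_0_rx_xdp_tx: 0
rx_queue_0_rx_xdp_tx_errors: 0
rx_queue_0_tx_xdp_xmit: 0
rx_queue_0_tx_xdp_xmit_errors: 0

The naming of these entries is what much of the review discussion below is about.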

Comments

Toshiaki Makita March 20, 2020, 1:22 p.m. UTC | #1
On 2020/03/20 1:41, Lorenzo Bianconi wrote:
> Introduce xdp_xmit counter in order to distinguish between XDP_TX and
> ndo_xdp_xmit stats. Introduce the following ethtool counters:
> - rx_xdp_tx
> - rx_xdp_tx_errors
> - tx_xdp_xmit
> - tx_xdp_xmit_errors
> - rx_xdp_redirect

Thank you for working on this!

> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> ---
...
> @@ -395,7 +404,8 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
>   	}
>   
>   	rcv_priv = netdev_priv(rcv);
> -	rq = &rcv_priv->rq[veth_select_rxq(rcv)];
> +	qidx = veth_select_rxq(rcv);
> +	rq = &rcv_priv->rq[qidx];
>   	/* Non-NULL xdp_prog ensures that xdp_ring is initialized on receive
>   	 * side. This means an XDP program is loaded on the peer and the peer
>   	 * device is up.
> @@ -424,6 +434,17 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
>   	if (flags & XDP_XMIT_FLUSH)
>   		__veth_xdp_flush(rq);
>   
> +	rq = &priv->rq[qidx];

I think there is no guarantee that this rq exists. Qidx is less than 
rcv->real_num_rx_queues, but not necessarily less than 
dev->real_num_rx_queues.
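
For reference, the rx queue selection helper used here is roughly:

static int veth_select_rxq(struct net_device *dev)
{
	return smp_processor_id() % dev->real_num_rx_queues;
}

so qidx is taken modulo the peer's real_num_rx_queues, which can be out of
range for dev's own rq array when the peer has more rx queues than dev.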

> +	u64_stats_update_begin(&rq->stats.syncp);

So this can cause a NULL pointer dereference.

Toshiaki Makita

> +	if (ndo_xmit) {
> +		rq->stats.vs.xdp_xmit += n - drops;
> +		rq->stats.vs.xdp_xmit_err += drops;
> +	} else {
> +		rq->stats.vs.xdp_tx += n - drops;
> +		rq->stats.vs.xdp_tx_err += drops;
> +	}
> +	u64_stats_update_end(&rq->stats.syncp);
> +
>   	if (likely(!drops)) {
>   		rcu_read_unlock();
>   		return n;
> @@ -437,11 +458,17 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
>   	return ret;
>   }
>   
> +static int veth_ndo_xdp_xmit(struct net_device *dev, int n,
> +			     struct xdp_frame **frames, u32 flags)
> +{
> +	return veth_xdp_xmit(dev, n, frames, flags, true);
> +}
> +
>   static void veth_xdp_flush_bq(struct net_device *dev, struct veth_xdp_tx_bq *bq)
>   {
>   	int sent, i, err = 0;
>   
> -	sent = veth_xdp_xmit(dev, bq->count, bq->q, 0);
> +	sent = veth_xdp_xmit(dev, bq->count, bq->q, 0, false);
>   	if (sent < 0) {
>   		err = sent;
>   		sent = 0;
> @@ -753,6 +780,7 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
>   	}
>   
>   	u64_stats_update_begin(&rq->stats.syncp);
> +	rq->stats.vs.xdp_redirect += stats->xdp_redirect;
>   	rq->stats.vs.xdp_bytes += stats->xdp_bytes;
>   	rq->stats.vs.xdp_drops += stats->xdp_drops;
>   	rq->stats.vs.rx_drops += stats->rx_drops;
> @@ -1172,7 +1200,7 @@ static const struct net_device_ops veth_netdev_ops = {
>   	.ndo_features_check	= passthru_features_check,
>   	.ndo_set_rx_headroom	= veth_set_rx_headroom,
>   	.ndo_bpf		= veth_xdp,
> -	.ndo_xdp_xmit		= veth_xdp_xmit,
> +	.ndo_xdp_xmit		= veth_ndo_xdp_xmit,
>   };
>   
>   #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
>
Lorenzo Bianconi March 20, 2020, 1:37 p.m. UTC | #2
> On 2020/03/20 1:41, Lorenzo Bianconi wrote:
> > Introduce xdp_xmit counter in order to distinguish between XDP_TX and
> > ndo_xdp_xmit stats. Introduce the following ethtool counters:
> > - rx_xdp_tx
> > - rx_xdp_tx_errors
> > - tx_xdp_xmit
> > - tx_xdp_xmit_errors
> > - rx_xdp_redirect
> 
> Thank you for working on this!
> 
> > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> > ---
> ...
> > @@ -395,7 +404,8 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
> >   	}
> >   	rcv_priv = netdev_priv(rcv);
> > -	rq = &rcv_priv->rq[veth_select_rxq(rcv)];
> > +	qidx = veth_select_rxq(rcv);
> > +	rq = &rcv_priv->rq[qidx];
> >   	/* Non-NULL xdp_prog ensures that xdp_ring is initialized on receive
> >   	 * side. This means an XDP program is loaded on the peer and the peer
> >   	 * device is up.
> > @@ -424,6 +434,17 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
> >   	if (flags & XDP_XMIT_FLUSH)
> >   		__veth_xdp_flush(rq);
> > +	rq = &priv->rq[qidx];
> 
> I think there is no guarantee that this rq exists. Qidx is less than
> rcv->real_num_rx_queues, but not necessarily less than
> dev->real_num_rx_queues.
> 
> > +	u64_stats_update_begin(&rq->stats.syncp);
> 
> So this can cause a NULL pointer dereference.

oh right, thanks for spotting this.
I think we can recompute qidx for tx netdevice in this case, doing something
like:

qidx = veth_select_rxq(dev);
rq = &priv->rq[qidx];

what do you think?

Regards,
Lorenzo

> 
> Toshiaki Makita
> 
> > +	if (ndo_xmit) {
> > +		rq->stats.vs.xdp_xmit += n - drops;
> > +		rq->stats.vs.xdp_xmit_err += drops;
> > +	} else {
> > +		rq->stats.vs.xdp_tx += n - drops;
> > +		rq->stats.vs.xdp_tx_err += drops;
> > +	}
> > +	u64_stats_update_end(&rq->stats.syncp);
> > +
> >   	if (likely(!drops)) {
> >   		rcu_read_unlock();
> >   		return n;
> > @@ -437,11 +458,17 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
> >   	return ret;
> >   }
> > +static int veth_ndo_xdp_xmit(struct net_device *dev, int n,
> > +			     struct xdp_frame **frames, u32 flags)
> > +{
> > +	return veth_xdp_xmit(dev, n, frames, flags, true);
> > +}
> > +
> >   static void veth_xdp_flush_bq(struct net_device *dev, struct veth_xdp_tx_bq *bq)
> >   {
> >   	int sent, i, err = 0;
> > -	sent = veth_xdp_xmit(dev, bq->count, bq->q, 0);
> > +	sent = veth_xdp_xmit(dev, bq->count, bq->q, 0, false);
> >   	if (sent < 0) {
> >   		err = sent;
> >   		sent = 0;
> > @@ -753,6 +780,7 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
> >   	}
> >   	u64_stats_update_begin(&rq->stats.syncp);
> > +	rq->stats.vs.xdp_redirect += stats->xdp_redirect;
> >   	rq->stats.vs.xdp_bytes += stats->xdp_bytes;
> >   	rq->stats.vs.xdp_drops += stats->xdp_drops;
> >   	rq->stats.vs.rx_drops += stats->rx_drops;
> > @@ -1172,7 +1200,7 @@ static const struct net_device_ops veth_netdev_ops = {
> >   	.ndo_features_check	= passthru_features_check,
> >   	.ndo_set_rx_headroom	= veth_set_rx_headroom,
> >   	.ndo_bpf		= veth_xdp,
> > -	.ndo_xdp_xmit		= veth_xdp_xmit,
> > +	.ndo_xdp_xmit		= veth_ndo_xdp_xmit,
> >   };
> >   #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
> >
Toshiaki Makita March 21, 2020, 1:38 p.m. UTC | #3
On 2020/03/20 22:37, Lorenzo Bianconi wrote:
>> On 2020/03/20 1:41, Lorenzo Bianconi wrote:
>>> Introduce xdp_xmit counter in order to distinguish between XDP_TX and
>>> ndo_xdp_xmit stats. Introduce the following ethtool counters:
>>> - rx_xdp_tx
>>> - rx_xdp_tx_errors
>>> - tx_xdp_xmit
>>> - tx_xdp_xmit_errors
>>> - rx_xdp_redirect
>>
>> Thank you for working on this!
>>
>>> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
>>> ---
>> ...
>>> @@ -395,7 +404,8 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
>>>    	}
>>>    	rcv_priv = netdev_priv(rcv);
>>> -	rq = &rcv_priv->rq[veth_select_rxq(rcv)];
>>> +	qidx = veth_select_rxq(rcv);
>>> +	rq = &rcv_priv->rq[qidx];
>>>    	/* Non-NULL xdp_prog ensures that xdp_ring is initialized on receive
>>>    	 * side. This means an XDP program is loaded on the peer and the peer
>>>    	 * device is up.
>>> @@ -424,6 +434,17 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
>>>    	if (flags & XDP_XMIT_FLUSH)
>>>    		__veth_xdp_flush(rq);
>>> +	rq = &priv->rq[qidx];
>>
>> I think there is no guarantee that this rq exists. Qidx is less than
>> rcv->real_num_rx_queues, but not necessarily less than
>> dev->real_num_rx_queues.
>>
>>> +	u64_stats_update_begin(&rq->stats.syncp);
>>
>> So this can cause a NULL pointer dereference.
> 
> oh right, thanks for spotting this.
> I think we can recompute qidx for tx netdevice in this case, doing something
> like:
> 
> qidx = veth_select_rxq(dev);
> rq = &priv->rq[qidx];
> 
> what do you think?

This would not cause NULL pointer deref, but I wonder what counters 
you've added mean.

- rx_xdp_redirect, rx_xdp_drops, rx_xdp_tx

These counters' names will be rx_queue_[i]_rx_xdp_[redirect|drops|tx].
"rx_" in their names looks redundant.
Also it looks like there is no "rx[i]_xdp_tx" counter, but there is
"rx[i]_xdp_tx_xmit" in mlx5, according to this page:
https://community.mellanox.com/s/article/understanding-mlx5-ethtool-counters

- tx_xdp_xmit, tx_xdp_xmit_errors

These counters' names will be rx_queue_[i]_tx_xdp_[xmit|xmit_errors].
Are these rx counters or tx counters?
If tx, currently there is no storage to store these tx counters so 
adding these would not be simple.
If rx, I guess you should use peer rx queue counters instead of dev rx 
queue counters.

Toshiaki Makita
Lorenzo Bianconi March 21, 2020, 2:30 p.m. UTC | #4
> On 2020/03/20 22:37, Lorenzo Bianconi wrote:
> > > On 2020/03/20 1:41, Lorenzo Bianconi wrote:
> > > > Introduce xdp_xmit counter in order to distinguish between XDP_TX and
> > > > ndo_xdp_xmit stats. Introduce the following ethtool counters:
> > > > - rx_xdp_tx
> > > > - rx_xdp_tx_errors
> > > > - tx_xdp_xmit
> > > > - tx_xdp_xmit_errors
> > > > - rx_xdp_redirect
> > > 
> > > Thank you for working on this!
> > > 
> > > > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> > > > ---
> > > ...
> > > > @@ -395,7 +404,8 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
> > > >    	}
> > > >    	rcv_priv = netdev_priv(rcv);
> > > > -	rq = &rcv_priv->rq[veth_select_rxq(rcv)];
> > > > +	qidx = veth_select_rxq(rcv);
> > > > +	rq = &rcv_priv->rq[qidx];
> > > >    	/* Non-NULL xdp_prog ensures that xdp_ring is initialized on receive
> > > >    	 * side. This means an XDP program is loaded on the peer and the peer
> > > >    	 * device is up.
> > > > @@ -424,6 +434,17 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
> > > >    	if (flags & XDP_XMIT_FLUSH)
> > > >    		__veth_xdp_flush(rq);
> > > > +	rq = &priv->rq[qidx];
> > > 
> > > I think there is no guarantee that this rq exists. Qidx is less than
> > > rcv->real_num_rx_queues, but not necessarily less than
> > > dev->real_num_rx_queues.
> > > 
> > > > +	u64_stats_update_begin(&rq->stats.syncp);
> > > 
> > > So this can cause a NULL pointer dereference.
> > 
> > oh right, thanks for spotting this.
> > I think we can recompute qidx for tx netdevice in this case, doing something
> > like:
> > 
> > qidx = veth_select_rxq(dev);
> > rq = &priv->rq[qidx];
> > 
> > what do you think?
> 
> This would not cause NULL pointer deref, but I wonder what counters you've
> added mean.
> 
> - rx_xdp_redirect, rx_xdp_drops, rx_xdp_tx
> 
> These counters' names will be rx_queue_[i]_rx_xdp_[redirect|drops|tx].
> "rx_" in their names looks redundant.

yes, we can drop the "rx" prefix in the stats name for them.

> Also it looks like there is no "rx[i]_xdp_tx" counter, but there is
> "rx[i]_xdp_tx_xmit" in mlx5, according to this page:
> https://community.mellanox.com/s/article/understanding-mlx5-ethtool-counters

rx[i]_xdp_tx_xmit and rx_xdp_tx are the same; we decided to use rx_xdp_tx for
it since it seems clearer.

> 
> - tx_xdp_xmit, tx_xdp_xmit_errors
> 
> These counters' names will be rx_queue_[i]_tx_xdp_[xmit|xmit_errors].
> Are these rx counters or tx counters?

tx_xdp_xmit[_errors] are used to count ndo_xdp_xmit stats, so they are tx counters.
I reused veth_stats for them just for convenience. Probably we can show them without
the rx_queue prefix so it is clear they are transmitted by the current device.
Another approach would be to create a per_cpu struct to collect all tx stats.
What do you think?
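
A minimal sketch of that per_cpu alternative, just to make it concrete (all
names invented):

struct veth_tx_stats {
	u64			xdp_xmit;
	u64			xdp_xmit_err;
	struct u64_stats_sync	syncp;
};

/* in struct veth_priv: struct veth_tx_stats __percpu *tx_stats;
 * veth_xdp_xmit() would update this_cpu_ptr(priv->tx_stats) and
 * veth_get_ethtool_stats() would sum the counters over all CPUs.
 */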

Regards,
Lorenzo

> If tx, currently there is no storage to store these tx counters so adding
> these would not be simple.
> If rx, I guess you should use peer rx queue counters instead of dev rx queue
> counters.
> 
> Toshiaki Makita
Toshiaki Makita March 22, 2020, 2:29 p.m. UTC | #5
On 2020/03/21 23:30, Lorenzo Bianconi wrote:
>> On 2020/03/20 22:37, Lorenzo Bianconi wrote:
>>>> On 2020/03/20 1:41, Lorenzo Bianconi wrote:
>>>>> Introduce xdp_xmit counter in order to distinguish between XDP_TX and
>>>>> ndo_xdp_xmit stats. Introduce the following ethtool counters:
>>>>> - rx_xdp_tx
>>>>> - rx_xdp_tx_errors
>>>>> - tx_xdp_xmit
>>>>> - tx_xdp_xmit_errors
>>>>> - rx_xdp_redirect
>>>>
>>>> Thank you for working on this!
>>>>
>>>>> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
>>>>> ---
>>>> ...
>>>>> @@ -395,7 +404,8 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
>>>>>     	}
>>>>>     	rcv_priv = netdev_priv(rcv);
>>>>> -	rq = &rcv_priv->rq[veth_select_rxq(rcv)];
>>>>> +	qidx = veth_select_rxq(rcv);
>>>>> +	rq = &rcv_priv->rq[qidx];
>>>>>     	/* Non-NULL xdp_prog ensures that xdp_ring is initialized on receive
>>>>>     	 * side. This means an XDP program is loaded on the peer and the peer
>>>>>     	 * device is up.
>>>>> @@ -424,6 +434,17 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
>>>>>     	if (flags & XDP_XMIT_FLUSH)
>>>>>     		__veth_xdp_flush(rq);
>>>>> +	rq = &priv->rq[qidx];
>>>>
>>>> I think there is no guarantee that this rq exists. Qidx is less than
>>>> rcv->real_num_rx_queues, but not necessarily less than
>>>> dev->real_num_rx_queues.
>>>>
>>>>> +	u64_stats_update_begin(&rq->stats.syncp);
>>>>
>>>> So this can cause a NULL pointer dereference.
>>>
>>> oh right, thanks for spotting this.
>>> I think we can recompute qidx for tx netdevice in this case, doing something
>>> like:
>>>
>>> qidx = veth_select_rxq(dev);
>>> rq = &priv->rq[qidx];
>>>
>>> what do you think?
>>
>> This would not cause NULL pointer deref, but I wonder what counters you've
>> added mean.
>>
>> - rx_xdp_redirect, rx_xdp_drops, rx_xdp_tx
>>
>> These counters' names will be rx_queue_[i]_rx_xdp_[redirect|drops|tx].
>> "rx_" in their names looks redundant.
> 
> yes, we can drop the "rx" prefix in the stats name for them.
> 
>> Also it looks like there is no "rx[i]_xdp_tx" counter, but there is
>> "rx[i]_xdp_tx_xmit" in mlx5, according to this page:
>> https://community.mellanox.com/s/article/understanding-mlx5-ethtool-counters
> 
> rx[i]_xdp_tx_xmit and rx_xdp_tx are the same; we decided to use rx_xdp_tx for
> it since it seems clearer.

OK.

>> - tx_xdp_xmit, tx_xdp_xmit_errors
>>
>> These counters' names will be rx_queue_[i]_tx_xdp_[xmit|xmit_errors].
>> Are these rx counters or tx counters?
> 
> tx_xdp_xmit[_errors] are used to count ndo_xdp_xmit stats, so they are tx counters.
> I reused veth_stats for them just for convenience. Probably we can show them without
> the rx_queue prefix so it is clear they are transmitted by the current device.
> Another approach would be to create a per_cpu struct to collect all tx stats.
> What do you think?

As veth_xdp_xmit really does not use a tx queue but selects the peer rxq directly,
per_cpu sounds more appropriate than per-queue.
One concern is consistency. Per-queue rx stats and per-cpu tx stats (or only a sum
of them?) look inconsistent.
One alternative way is to change the queue selection logic in veth_xdp_xmit and
select a txq instead of an rxq. Then select the peer rxq from the txq, like
veth_xmit. Accounting per-queue tx stats is possible only when we can determine
which txq is used.

Something like this:

static int veth_select_txq(struct net_device *dev)
{
	return smp_processor_id() % dev->real_num_tx_queues;
}

static int veth_xdp_xmit(struct net_device *dev, int n,
			 struct xdp_frame **frames, u32 flags)
{
	...
	txq = veth_select_txq(dev);
	rcv_rxq = txq; // 1-to-1 mapping from txq to peer rxq
	// Note: when XDP is enabled on rcv, this condition is always false
	if (rcv_rxq >= rcv->real_num_rx_queues)
		return -ENXIO;
	rcv_priv = netdev_priv(rcv);
	rq = &rcv_priv->rq[rcv_rxq];
	...
	// account txq stats in some way here
}

Thoughts?

Toshiaki Makita
Lorenzo Bianconi March 23, 2020, 5:31 p.m. UTC | #6
> On 2020/03/21 23:30, Lorenzo Bianconi wrote:
> > > On 2020/03/20 22:37, Lorenzo Bianconi wrote:
> > > > > On 2020/03/20 1:41, Lorenzo Bianconi wrote:

[...]

> As veth_xdp_xmit really does not use a tx queue but selects the peer rxq directly,
> per_cpu sounds more appropriate than per-queue.
> One concern is consistency. Per-queue rx stats and per-cpu tx stats (or only
> a sum of them?) look inconsistent.
> One alternative way is to change the queue selection logic in veth_xdp_xmit
> and select a txq instead of an rxq. Then select the peer rxq from the txq, like
> veth_xmit. Accounting per-queue tx stats is possible only when we can
> determine which txq is used.
> 
> Something like this:
> 
> static int veth_select_txq(struct net_device *dev)
> {
> 	return smp_processor_id() % dev->real_num_tx_queues;
> }
> 
> static int veth_xdp_xmit(struct net_device *dev, int n,
> 			 struct xdp_frame **frames, u32 flags)
> {
> 	...
> 	txq = veth_select_txq(dev);
> 	rcv_rxq = txq; // 1-to-1 mapping from txq to peer rxq
> 	// Note: when XDP is enabled on rcv, this condition is always false
> 	if (rcv_rxq >= rcv->real_num_rx_queues)
> 		return -ENXIO;
> 	rcv_priv = netdev_priv(rcv);
> 	rq = &rcv_priv->rq[rcv_rxq];
> 	...
> 	// account txq stats in some way here
> }

actually I have a different idea: what about accounting tx stats on the peer rx
queue as a result of XDP_TX or ndo_xdp_xmit, and properly reporting this info in
the ethtool stats? In this way we do not have any locking issue and we still use
the per-queue stats approach. Could you please take a look at the following patch?

Regards,
Lorenzo

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index b6505a6c7102..f2acd2ee6287 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -92,17 +92,22 @@ struct veth_q_stat_desc {
 static const struct veth_q_stat_desc veth_rq_stats_desc[] = {
 	{ "xdp_packets",	VETH_RQ_STAT(xdp_packets) },
 	{ "xdp_bytes",		VETH_RQ_STAT(xdp_bytes) },
-	{ "rx_drops",		VETH_RQ_STAT(rx_drops) },
-	{ "rx_xdp_redirect",	VETH_RQ_STAT(xdp_redirect) },
-	{ "rx_xdp_drops",	VETH_RQ_STAT(xdp_drops) },
-	{ "rx_xdp_tx",		VETH_RQ_STAT(xdp_tx) },
-	{ "rx_xdp_tx_errors",	VETH_RQ_STAT(xdp_tx_err) },
-	{ "tx_xdp_xmit",	VETH_RQ_STAT(xdp_xmit) },
-	{ "tx_xdp_xmit_errors",	VETH_RQ_STAT(xdp_xmit_err) },
+	{ "drops",		VETH_RQ_STAT(rx_drops) },
+	{ "xdp_redirect",	VETH_RQ_STAT(xdp_redirect) },
+	{ "xdp_drops",		VETH_RQ_STAT(xdp_drops) },
 };
 
 #define VETH_RQ_STATS_LEN	ARRAY_SIZE(veth_rq_stats_desc)
 
+static const struct veth_q_stat_desc veth_tq_stats_desc[] = {
+	{ "xdp_tx",		VETH_RQ_STAT(xdp_tx) },
+	{ "xdp_tx_errors",	VETH_RQ_STAT(xdp_tx_err) },
+	{ "xdp_xmit",		VETH_RQ_STAT(xdp_xmit) },
+	{ "xdp_xmit_errors",	VETH_RQ_STAT(xdp_xmit_err) },
+};
+
+#define VETH_TQ_STATS_LEN	ARRAY_SIZE(veth_tq_stats_desc)
+
 static struct {
 	const char string[ETH_GSTRING_LEN];
 } ethtool_stats_keys[] = {
@@ -142,6 +147,14 @@ static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
 				p += ETH_GSTRING_LEN;
 			}
 		}
+		for (i = 0; i < dev->real_num_tx_queues; i++) {
+			for (j = 0; j < VETH_TQ_STATS_LEN; j++) {
+				snprintf(p, ETH_GSTRING_LEN,
+					 "tx_queue_%u_%.18s",
+					 i, veth_tq_stats_desc[j].desc);
+				p += ETH_GSTRING_LEN;
+			}
+		}
 		break;
 	}
 }
@@ -151,7 +164,8 @@ static int veth_get_sset_count(struct net_device *dev, int sset)
 	switch (sset) {
 	case ETH_SS_STATS:
 		return ARRAY_SIZE(ethtool_stats_keys) +
-		       VETH_RQ_STATS_LEN * dev->real_num_rx_queues;
+		       VETH_RQ_STATS_LEN * dev->real_num_rx_queues +
+		       VETH_TQ_STATS_LEN * dev->real_num_tx_queues;
 	default:
 		return -EOPNOTSUPP;
 	}
@@ -160,7 +174,7 @@ static int veth_get_sset_count(struct net_device *dev, int sset)
 static void veth_get_ethtool_stats(struct net_device *dev,
 		struct ethtool_stats *stats, u64 *data)
 {
-	struct veth_priv *priv = netdev_priv(dev);
+	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
 	struct net_device *peer = rtnl_dereference(priv->peer);
 	int i, j, idx;
 
@@ -181,6 +195,26 @@ static void veth_get_ethtool_stats(struct net_device *dev,
 		} while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start));
 		idx += VETH_RQ_STATS_LEN;
 	}
+
+	if (!peer)
+		return;
+
+	rcv_priv = netdev_priv(peer);
+	for (i = 0; i < peer->real_num_rx_queues; i++) {
+		const struct veth_rq_stats *rq_stats = &rcv_priv->rq[i].stats;
+		const void *stats_base = (void *)&rq_stats->vs;
+		unsigned int start, tx_idx;
+		size_t offset;
+
+		tx_idx = (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN;
+		do {
+			start = u64_stats_fetch_begin_irq(&rq_stats->syncp);
+			for (j = 0; j < VETH_TQ_STATS_LEN; j++) {
+				offset = veth_tq_stats_desc[j].offset;
+				data[tx_idx + idx + j] += *(u64 *)(stats_base + offset);
+			}
+		} while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start));
+	}
 }
 
 static const struct ethtool_ops veth_ethtool_ops = {
@@ -340,8 +374,7 @@ static void veth_get_stats64(struct net_device *dev,
 	tot->tx_packets = packets;
 
 	veth_stats_rx(&rx, dev);
-	tot->tx_dropped += rx.xdp_xmit_err + rx.xdp_tx_err;
-	tot->rx_dropped = rx.rx_drops;
+	tot->rx_dropped = rx.rx_drops + rx.xdp_tx_err + rx.xdp_xmit_err;
 	tot->rx_bytes = rx.xdp_bytes;
 	tot->rx_packets = rx.xdp_packets;
 
@@ -353,7 +386,7 @@ static void veth_get_stats64(struct net_device *dev,
 		tot->rx_packets += packets;
 
 		veth_stats_rx(&rx, peer);
-		tot->rx_dropped += rx.xdp_xmit_err + rx.xdp_tx_err;
+		tot->tx_dropped += rx.xdp_xmit_err + rx.xdp_tx_err;
 		tot->tx_bytes += rx.xdp_bytes;
 		tot->tx_packets += rx.xdp_packets;
 	}
@@ -394,9 +427,9 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
 			 u32 flags, bool ndo_xmit)
 {
 	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
-	unsigned int qidx, max_len;
 	struct net_device *rcv;
 	int i, ret, drops = n;
+	unsigned int max_len;
 	struct veth_rq *rq;
 
 	rcu_read_lock();
@@ -414,8 +447,7 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
 	}
 
 	rcv_priv = netdev_priv(rcv);
-	qidx = veth_select_rxq(rcv);
-	rq = &rcv_priv->rq[qidx];
+	rq = &rcv_priv->rq[veth_select_rxq(rcv)];
 	/* Non-NULL xdp_prog ensures that xdp_ring is initialized on receive
 	 * side. This means an XDP program is loaded on the peer and the peer
 	 * device is up.
@@ -446,7 +478,6 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
 
 	ret = n - drops;
 drop:
-	rq = &priv->rq[qidx];
 	u64_stats_update_begin(&rq->stats.syncp);
 	if (ndo_xmit) {
 		rq->stats.vs.xdp_xmit += n - drops;

> 
> Thoughts?
> 
> Toshiaki Makita
Toshiaki Makita March 24, 2020, 2:21 p.m. UTC | #7
On 2020/03/24 2:31, Lorenzo Bianconi wrote:
>> On 2020/03/21 23:30, Lorenzo Bianconi wrote:
>>>> On 2020/03/20 22:37, Lorenzo Bianconi wrote:
>>>>>> On 2020/03/20 1:41, Lorenzo Bianconi wrote:
> 
> [...]
> 
>> As veth_xdp_xmit really does not use a tx queue but selects the peer rxq directly,
>> per_cpu sounds more appropriate than per-queue.
>> One concern is consistency. Per-queue rx stats and per-cpu tx stats (or only
>> a sum of them?) look inconsistent.
>> One alternative way is to change the queue selection logic in veth_xdp_xmit
>> and select a txq instead of an rxq. Then select the peer rxq from the txq, like
>> veth_xmit. Accounting per-queue tx stats is possible only when we can
>> determine which txq is used.
>>
>> Something like this:
>>
>> static int veth_select_txq(struct net_device *dev)
>> {
>> 	return smp_processor_id() % dev->real_num_tx_queues;
>> }
>>
>> static int veth_xdp_xmit(struct net_device *dev, int n,
>> 			 struct xdp_frame **frames, u32 flags)
>> {
>> 	...
>> 	txq = veth_select_txq(dev);
>> 	rcv_rxq = txq; // 1-to-1 mapping from txq to peer rxq
>> 	// Note: when XDP is enabled on rcv, this condition is always false
>> 	if (rcv_rxq >= rcv->real_num_rx_queues)
>> 		return -ENXIO;
>> 	rcv_priv = netdev_priv(rcv);
>> 	rq = &rcv_priv->rq[rcv_rxq];
>> 	...
>> 	// account txq stats in some way here
>> }
> 
> actually I have a different idea: what about accounting tx stats on the peer rx
> queue as a result of XDP_TX or ndo_xdp_xmit, and properly reporting this info in
> the ethtool stats? In this way we do not have any locking issue and we still use
> the per-queue stats approach. Could you please take a look at the following patch?

Thanks I think your idea is nice.

> diff --git a/drivers/net/veth.c b/drivers/net/veth.c
> index b6505a6c7102..f2acd2ee6287 100644
> --- a/drivers/net/veth.c
> +++ b/drivers/net/veth.c
> @@ -92,17 +92,22 @@ struct veth_q_stat_desc {
>   static const struct veth_q_stat_desc veth_rq_stats_desc[] = {
>   	{ "xdp_packets",	VETH_RQ_STAT(xdp_packets) },
>   	{ "xdp_bytes",		VETH_RQ_STAT(xdp_bytes) },
> -	{ "rx_drops",		VETH_RQ_STAT(rx_drops) },
> -	{ "rx_xdp_redirect",	VETH_RQ_STAT(xdp_redirect) },
> -	{ "rx_xdp_drops",	VETH_RQ_STAT(xdp_drops) },
> -	{ "rx_xdp_tx",		VETH_RQ_STAT(xdp_tx) },
> -	{ "rx_xdp_tx_errors",	VETH_RQ_STAT(xdp_tx_err) },
> -	{ "tx_xdp_xmit",	VETH_RQ_STAT(xdp_xmit) },
> -	{ "tx_xdp_xmit_errors",	VETH_RQ_STAT(xdp_xmit_err) },
> +	{ "drops",		VETH_RQ_STAT(rx_drops) },
> +	{ "xdp_redirect",	VETH_RQ_STAT(xdp_redirect) },
> +	{ "xdp_drops",		VETH_RQ_STAT(xdp_drops) },
>   };
>   
>   #define VETH_RQ_STATS_LEN	ARRAY_SIZE(veth_rq_stats_desc)
>   
> +static const struct veth_q_stat_desc veth_tq_stats_desc[] = {
> +	{ "xdp_tx",		VETH_RQ_STAT(xdp_tx) },
> +	{ "xdp_tx_errors",	VETH_RQ_STAT(xdp_tx_err) },

I'm wondering why xdp_tx is not accounted in the rx queue stats?
You can count xdp_tx/tx_error somewhere in the rx path, like veth_xdp_flush_bq().
xdp_redirect and xdp_drops are similar actions and are in rx stats.

> +	{ "xdp_xmit",		VETH_RQ_STAT(xdp_xmit) },
> +	{ "xdp_xmit_errors",	VETH_RQ_STAT(xdp_xmit_err) },
> +};
> +
> +#define VETH_TQ_STATS_LEN	ARRAY_SIZE(veth_tq_stats_desc)
> +
>   static struct {
>   	const char string[ETH_GSTRING_LEN];
>   } ethtool_stats_keys[] = {
> @@ -142,6 +147,14 @@ static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
>   				p += ETH_GSTRING_LEN;
>   			}
>   		}
> +		for (i = 0; i < dev->real_num_tx_queues; i++) {
> +			for (j = 0; j < VETH_TQ_STATS_LEN; j++) {
> +				snprintf(p, ETH_GSTRING_LEN,
> +					 "tx_queue_%u_%.18s",
> +					 i, veth_tq_stats_desc[j].desc);
> +				p += ETH_GSTRING_LEN;
> +			}
> +		}
>   		break;
>   	}
>   }
> @@ -151,7 +164,8 @@ static int veth_get_sset_count(struct net_device *dev, int sset)
>   	switch (sset) {
>   	case ETH_SS_STATS:
>   		return ARRAY_SIZE(ethtool_stats_keys) +
> -		       VETH_RQ_STATS_LEN * dev->real_num_rx_queues;
> +		       VETH_RQ_STATS_LEN * dev->real_num_rx_queues +
> +		       VETH_TQ_STATS_LEN * dev->real_num_tx_queues;
>   	default:
>   		return -EOPNOTSUPP;
>   	}
> @@ -160,7 +174,7 @@ static int veth_get_sset_count(struct net_device *dev, int sset)
>   static void veth_get_ethtool_stats(struct net_device *dev,
>   		struct ethtool_stats *stats, u64 *data)
>   {
> -	struct veth_priv *priv = netdev_priv(dev);
> +	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
>   	struct net_device *peer = rtnl_dereference(priv->peer);
>   	int i, j, idx;
>   
> @@ -181,6 +195,26 @@ static void veth_get_ethtool_stats(struct net_device *dev,
>   		} while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start));
>   		idx += VETH_RQ_STATS_LEN;
>   	}
> +
> +	if (!peer)
> +		return;
> +
> +	rcv_priv = netdev_priv(peer);
> +	for (i = 0; i < peer->real_num_rx_queues; i++) {
> +		const struct veth_rq_stats *rq_stats = &rcv_priv->rq[i].stats;
> +		const void *stats_base = (void *)&rq_stats->vs;
> +		unsigned int start, tx_idx;
> +		size_t offset;
> +
> +		tx_idx = (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN;

I'm a bit concerned that this can fold multiple rx queue counters into one tx 
counter. But I cannot think of a better idea provided that we want to align XDP 
stats between drivers. So I'm OK with this.

Toshiaki Makita

> +		do {
> +			start = u64_stats_fetch_begin_irq(&rq_stats->syncp);
> +			for (j = 0; j < VETH_TQ_STATS_LEN; j++) {
> +				offset = veth_tq_stats_desc[j].offset;
> +				data[tx_idx + idx + j] += *(u64 *)(stats_base + offset);
> +			}
> +		} while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start));
> +	}
>   }
>   
>   static const struct ethtool_ops veth_ethtool_ops = {
> @@ -340,8 +374,7 @@ static void veth_get_stats64(struct net_device *dev,
>   	tot->tx_packets = packets;
>   
>   	veth_stats_rx(&rx, dev);
> -	tot->tx_dropped += rx.xdp_xmit_err + rx.xdp_tx_err;
> -	tot->rx_dropped = rx.rx_drops;
> +	tot->rx_dropped = rx.rx_drops + rx.xdp_tx_err + rx.xdp_xmit_err;
>   	tot->rx_bytes = rx.xdp_bytes;
>   	tot->rx_packets = rx.xdp_packets;
>   
> @@ -353,7 +386,7 @@ static void veth_get_stats64(struct net_device *dev,
>   		tot->rx_packets += packets;
>   
>   		veth_stats_rx(&rx, peer);
> -		tot->rx_dropped += rx.xdp_xmit_err + rx.xdp_tx_err;
> +		tot->tx_dropped += rx.xdp_xmit_err + rx.xdp_tx_err;
>   		tot->tx_bytes += rx.xdp_bytes;
>   		tot->tx_packets += rx.xdp_packets;
>   	}
> @@ -394,9 +427,9 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
>   			 u32 flags, bool ndo_xmit)
>   {
>   	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
> -	unsigned int qidx, max_len;
>   	struct net_device *rcv;
>   	int i, ret, drops = n;
> +	unsigned int max_len;
>   	struct veth_rq *rq;
>   
>   	rcu_read_lock();
> @@ -414,8 +447,7 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
>   	}
>   
>   	rcv_priv = netdev_priv(rcv);
> -	qidx = veth_select_rxq(rcv);
> -	rq = &rcv_priv->rq[qidx];
> +	rq = &rcv_priv->rq[veth_select_rxq(rcv)];
>   	/* Non-NULL xdp_prog ensures that xdp_ring is initialized on receive
>   	 * side. This means an XDP program is loaded on the peer and the peer
>   	 * device is up.
> @@ -446,7 +478,6 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
>   
>   	ret = n - drops;
>   drop:
> -	rq = &priv->rq[qidx];
>   	u64_stats_update_begin(&rq->stats.syncp);
>   	if (ndo_xmit) {
>   		rq->stats.vs.xdp_xmit += n - drops;
> 
>>
>> Thoughts?
>>
>> Toshiaki Makita
Lorenzo Bianconi March 24, 2020, 2:36 p.m. UTC | #8
[...]

> > > }
> > 
> > actually I have a different idea: what about accounting tx stats on the peer rx
> > queue as a result of XDP_TX or ndo_xdp_xmit, and properly reporting this info in
> > the ethtool stats? In this way we do not have any locking issue and we still use
> > the per-queue stats approach. Could you please take a look at the following patch?
> 
> Thanks I think your idea is nice.
> 
> > diff --git a/drivers/net/veth.c b/drivers/net/veth.c
> > index b6505a6c7102..f2acd2ee6287 100644
> > --- a/drivers/net/veth.c
> > +++ b/drivers/net/veth.c
> > @@ -92,17 +92,22 @@ struct veth_q_stat_desc {
> >   static const struct veth_q_stat_desc veth_rq_stats_desc[] = {
> >   	{ "xdp_packets",	VETH_RQ_STAT(xdp_packets) },
> >   	{ "xdp_bytes",		VETH_RQ_STAT(xdp_bytes) },
> > -	{ "rx_drops",		VETH_RQ_STAT(rx_drops) },
> > -	{ "rx_xdp_redirect",	VETH_RQ_STAT(xdp_redirect) },
> > -	{ "rx_xdp_drops",	VETH_RQ_STAT(xdp_drops) },
> > -	{ "rx_xdp_tx",		VETH_RQ_STAT(xdp_tx) },
> > -	{ "rx_xdp_tx_errors",	VETH_RQ_STAT(xdp_tx_err) },
> > -	{ "tx_xdp_xmit",	VETH_RQ_STAT(xdp_xmit) },
> > -	{ "tx_xdp_xmit_errors",	VETH_RQ_STAT(xdp_xmit_err) },
> > +	{ "drops",		VETH_RQ_STAT(rx_drops) },
> > +	{ "xdp_redirect",	VETH_RQ_STAT(xdp_redirect) },
> > +	{ "xdp_drops",		VETH_RQ_STAT(xdp_drops) },
> >   };
> >   #define VETH_RQ_STATS_LEN	ARRAY_SIZE(veth_rq_stats_desc)
> > +static const struct veth_q_stat_desc veth_tq_stats_desc[] = {
> > +	{ "xdp_tx",		VETH_RQ_STAT(xdp_tx) },
> > +	{ "xdp_tx_errors",	VETH_RQ_STAT(xdp_tx_err) },
> 
> I'm wondering why xdp_tx is not accounted in the rx queue stats?
> You can count xdp_tx/tx_error somewhere in the rx path, like veth_xdp_flush_bq().
> xdp_redirect and xdp_drops are similar actions and are in rx stats.

Hi,

thanks for the review :)

I moved the accounting of xdp_tx/tx_error into veth_xdp_xmit for two reasons:
1- veth_xdp_tx in veth_xdp_rcv_one or veth_xdp_rcv_skb returns an error
   for XDP_TX only if the xdp_frame pointer is invalid (see the snippet
   below), but the packet can be discarded in veth_xdp_xmit if the device
   is 'under pressure' (and this can be a problem since in XDP there are
   no queueing mechanisms so far)
2- to be symmetric with ndo_xdp_xmit
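
For reference, the XDP_TX error path mentioned in 1- is roughly:

static int veth_xdp_tx(struct net_device *dev, struct xdp_buff *xdp,
		       struct veth_xdp_tx_bq *bq)
{
	struct xdp_frame *frame = convert_to_xdp_frame(xdp);

	/* the only failure here is a frame that cannot be converted */
	if (unlikely(!frame))
		return -EOVERFLOW;

	if (unlikely(bq->count == VETH_XDP_TX_BULK_SIZE))
		veth_xdp_flush_bq(dev, bq);

	bq->q[bq->count++] = frame;

	return 0;
}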

> 
> > +	{ "xdp_xmit",		VETH_RQ_STAT(xdp_xmit) },
> > +	{ "xdp_xmit_errors",	VETH_RQ_STAT(xdp_xmit_err) },
> > +};
> > +
> > +#define VETH_TQ_STATS_LEN	ARRAY_SIZE(veth_tq_stats_desc)
> > +
> >   static struct {
> >   	const char string[ETH_GSTRING_LEN];
> >   } ethtool_stats_keys[] = {
> > @@ -142,6 +147,14 @@ static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
> >   				p += ETH_GSTRING_LEN;
> >   			}
> >   		}
> > +		for (i = 0; i < dev->real_num_tx_queues; i++) {
> > +			for (j = 0; j < VETH_TQ_STATS_LEN; j++) {
> > +				snprintf(p, ETH_GSTRING_LEN,
> > +					 "tx_queue_%u_%.18s",
> > +					 i, veth_tq_stats_desc[j].desc);
> > +				p += ETH_GSTRING_LEN;
> > +			}
> > +		}
> >   		break;
> >   	}
> >   }
> > @@ -151,7 +164,8 @@ static int veth_get_sset_count(struct net_device *dev, int sset)
> >   	switch (sset) {
> >   	case ETH_SS_STATS:
> >   		return ARRAY_SIZE(ethtool_stats_keys) +
> > -		       VETH_RQ_STATS_LEN * dev->real_num_rx_queues;
> > +		       VETH_RQ_STATS_LEN * dev->real_num_rx_queues +
> > +		       VETH_TQ_STATS_LEN * dev->real_num_tx_queues;
> >   	default:
> >   		return -EOPNOTSUPP;
> >   	}
> > @@ -160,7 +174,7 @@ static int veth_get_sset_count(struct net_device *dev, int sset)
> >   static void veth_get_ethtool_stats(struct net_device *dev,
> >   		struct ethtool_stats *stats, u64 *data)
> >   {
> > -	struct veth_priv *priv = netdev_priv(dev);
> > +	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
> >   	struct net_device *peer = rtnl_dereference(priv->peer);
> >   	int i, j, idx;
> > @@ -181,6 +195,26 @@ static void veth_get_ethtool_stats(struct net_device *dev,
> >   		} while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start));
> >   		idx += VETH_RQ_STATS_LEN;
> >   	}
> > +
> > +	if (!peer)
> > +		return;
> > +
> > +	rcv_priv = netdev_priv(peer);
> > +	for (i = 0; i < peer->real_num_rx_queues; i++) {
> > +		const struct veth_rq_stats *rq_stats = &rcv_priv->rq[i].stats;
> > +		const void *stats_base = (void *)&rq_stats->vs;
> > +		unsigned int start, tx_idx;
> > +		size_t offset;
> > +
> > +		tx_idx = (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN;
> 
> I'm a bit concerned that this can fold multiple rx queue counters into one
> tx counter. But I cannot think of a better idea provided that we want to
> align XDP stats between drivers. So I'm OK with this.

Since peer->real_num_rx_queues can be greater than dev->real_num_tx_queues,
right? E.g. with peer->real_num_rx_queues == 4 and dev->real_num_tx_queues == 2,
peer rxq 0 and rxq 2 would fold into the same tx_queue_0 counters. IIUC the
only guarantee we have is:

peer->real_num_tx_queues < dev->real_num_rx_queues

If you are fine with that approach, I will post a patch before net-next
closure.

Regards,
Lorenzo


> 
> Toshiaki Makita
> 
> > +		do {
> > +			start = u64_stats_fetch_begin_irq(&rq_stats->syncp);
> > +			for (j = 0; j < VETH_TQ_STATS_LEN; j++) {
> > +				offset = veth_tq_stats_desc[j].offset;
> > +				data[tx_idx + idx + j] += *(u64 *)(stats_base + offset);
> > +			}
> > +		} while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start));
> > +	}
> >   }
> >   static const struct ethtool_ops veth_ethtool_ops = {
> > @@ -340,8 +374,7 @@ static void veth_get_stats64(struct net_device *dev,
> >   	tot->tx_packets = packets;
> >   	veth_stats_rx(&rx, dev);
> > -	tot->tx_dropped += rx.xdp_xmit_err + rx.xdp_tx_err;
> > -	tot->rx_dropped = rx.rx_drops;
> > +	tot->rx_dropped = rx.rx_drops + rx.xdp_tx_err + rx.xdp_xmit_err;
> >   	tot->rx_bytes = rx.xdp_bytes;
> >   	tot->rx_packets = rx.xdp_packets;
> > @@ -353,7 +386,7 @@ static void veth_get_stats64(struct net_device *dev,
> >   		tot->rx_packets += packets;
> >   		veth_stats_rx(&rx, peer);
> > -		tot->rx_dropped += rx.xdp_xmit_err + rx.xdp_tx_err;
> > +		tot->tx_dropped += rx.xdp_xmit_err + rx.xdp_tx_err;
> >   		tot->tx_bytes += rx.xdp_bytes;
> >   		tot->tx_packets += rx.xdp_packets;
> >   	}
> > @@ -394,9 +427,9 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
> >   			 u32 flags, bool ndo_xmit)
> >   {
> >   	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
> > -	unsigned int qidx, max_len;
> >   	struct net_device *rcv;
> >   	int i, ret, drops = n;
> > +	unsigned int max_len;
> >   	struct veth_rq *rq;
> >   	rcu_read_lock();
> > @@ -414,8 +447,7 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
> >   	}
> >   	rcv_priv = netdev_priv(rcv);
> > -	qidx = veth_select_rxq(rcv);
> > -	rq = &rcv_priv->rq[qidx];
> > +	rq = &rcv_priv->rq[veth_select_rxq(rcv)];
> >   	/* Non-NULL xdp_prog ensures that xdp_ring is initialized on receive
> >   	 * side. This means an XDP program is loaded on the peer and the peer
> >   	 * device is up.
> > @@ -446,7 +478,6 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
> >   	ret = n - drops;
> >   drop:
> > -	rq = &priv->rq[qidx];
> >   	u64_stats_update_begin(&rq->stats.syncp);
> >   	if (ndo_xmit) {
> >   		rq->stats.vs.xdp_xmit += n - drops;
> > 
> > > 
> > > Thoughts?
> > > 
> > > Toshiaki Makita
Toshiaki Makita March 25, 2020, 1:08 p.m. UTC | #9
On 2020/03/24 23:36, Lorenzo Bianconi wrote:
> 
> [...]
> 
>>>> }
>>>
>>> actually I have a different idea: what about accounting tx stats on the peer rx
>>> queue as a result of XDP_TX or ndo_xdp_xmit, and properly reporting this info in
>>> the ethtool stats? In this way we do not have any locking issue and we still use
>>> the per-queue stats approach. Could you please take a look at the following patch?
>>
>> Thanks I think your idea is nice.
>>
>>> diff --git a/drivers/net/veth.c b/drivers/net/veth.c
>>> index b6505a6c7102..f2acd2ee6287 100644
>>> --- a/drivers/net/veth.c
>>> +++ b/drivers/net/veth.c
>>> @@ -92,17 +92,22 @@ struct veth_q_stat_desc {
>>>    static const struct veth_q_stat_desc veth_rq_stats_desc[] = {
>>>    	{ "xdp_packets",	VETH_RQ_STAT(xdp_packets) },
>>>    	{ "xdp_bytes",		VETH_RQ_STAT(xdp_bytes) },
>>> -	{ "rx_drops",		VETH_RQ_STAT(rx_drops) },
>>> -	{ "rx_xdp_redirect",	VETH_RQ_STAT(xdp_redirect) },
>>> -	{ "rx_xdp_drops",	VETH_RQ_STAT(xdp_drops) },
>>> -	{ "rx_xdp_tx",		VETH_RQ_STAT(xdp_tx) },
>>> -	{ "rx_xdp_tx_errors",	VETH_RQ_STAT(xdp_tx_err) },
>>> -	{ "tx_xdp_xmit",	VETH_RQ_STAT(xdp_xmit) },
>>> -	{ "tx_xdp_xmit_errors",	VETH_RQ_STAT(xdp_xmit_err) },
>>> +	{ "drops",		VETH_RQ_STAT(rx_drops) },
>>> +	{ "xdp_redirect",	VETH_RQ_STAT(xdp_redirect) },
>>> +	{ "xdp_drops",		VETH_RQ_STAT(xdp_drops) },
>>>    };
>>>    #define VETH_RQ_STATS_LEN	ARRAY_SIZE(veth_rq_stats_desc)
>>> +static const struct veth_q_stat_desc veth_tq_stats_desc[] = {
>>> +	{ "xdp_tx",		VETH_RQ_STAT(xdp_tx) },
>>> +	{ "xdp_tx_errors",	VETH_RQ_STAT(xdp_tx_err) },
>>
>> I'm wondering why xdp_tx is not accounted in the rx queue stats?
>> You can count xdp_tx/tx_error somewhere in the rx path, like veth_xdp_flush_bq().
>> xdp_redirect and xdp_drops are similar actions and are in rx stats.
> 
> Hi,
> 
> thanks for the review :)
> 
> I moved the accounting of xdp_tx/tx_error into veth_xdp_xmit for two reasons:
> 1- veth_xdp_tx in veth_xdp_rcv_one or veth_xdp_rcv_skb returns an error
>     for XDP_TX only if the xdp_frame pointer is invalid, but the packet can be
>     discarded in veth_xdp_xmit if the device is 'under pressure' (and this can
>     be a problem since in XDP there are no queueing mechanisms so far)

Right, but you can track the discards in veth_xdp_flush_bq().

> 2- to be symmetric with ndo_xdp_xmit

I thought consistency between drivers is more important. What about other drivers?

Toshiaki Makita

> 
>>
>>> +	{ "xdp_xmit",		VETH_RQ_STAT(xdp_xmit) },
>>> +	{ "xdp_xmit_errors",	VETH_RQ_STAT(xdp_xmit_err) },
>>> +};
>>> +
>>> +#define VETH_TQ_STATS_LEN	ARRAY_SIZE(veth_tq_stats_desc)
>>> +
>>>    static struct {
>>>    	const char string[ETH_GSTRING_LEN];
>>>    } ethtool_stats_keys[] = {
>>> @@ -142,6 +147,14 @@ static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
>>>    				p += ETH_GSTRING_LEN;
>>>    			}
>>>    		}
>>> +		for (i = 0; i < dev->real_num_tx_queues; i++) {
>>> +			for (j = 0; j < VETH_TQ_STATS_LEN; j++) {
>>> +				snprintf(p, ETH_GSTRING_LEN,
>>> +					 "tx_queue_%u_%.18s",
>>> +					 i, veth_tq_stats_desc[j].desc);
>>> +				p += ETH_GSTRING_LEN;
>>> +			}
>>> +		}
>>>    		break;
>>>    	}
>>>    }
>>> @@ -151,7 +164,8 @@ static int veth_get_sset_count(struct net_device *dev, int sset)
>>>    	switch (sset) {
>>>    	case ETH_SS_STATS:
>>>    		return ARRAY_SIZE(ethtool_stats_keys) +
>>> -		       VETH_RQ_STATS_LEN * dev->real_num_rx_queues;
>>> +		       VETH_RQ_STATS_LEN * dev->real_num_rx_queues +
>>> +		       VETH_TQ_STATS_LEN * dev->real_num_tx_queues;
>>>    	default:
>>>    		return -EOPNOTSUPP;
>>>    	}
>>> @@ -160,7 +174,7 @@ static int veth_get_sset_count(struct net_device *dev, int sset)
>>>    static void veth_get_ethtool_stats(struct net_device *dev,
>>>    		struct ethtool_stats *stats, u64 *data)
>>>    {
>>> -	struct veth_priv *priv = netdev_priv(dev);
>>> +	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
>>>    	struct net_device *peer = rtnl_dereference(priv->peer);
>>>    	int i, j, idx;
>>> @@ -181,6 +195,26 @@ static void veth_get_ethtool_stats(struct net_device *dev,
>>>    		} while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start));
>>>    		idx += VETH_RQ_STATS_LEN;
>>>    	}
>>> +
>>> +	if (!peer)
>>> +		return;
>>> +
>>> +	rcv_priv = netdev_priv(peer);
>>> +	for (i = 0; i < peer->real_num_rx_queues; i++) {
>>> +		const struct veth_rq_stats *rq_stats = &rcv_priv->rq[i].stats;
>>> +		const void *stats_base = (void *)&rq_stats->vs;
>>> +		unsigned int start, tx_idx;
>>> +		size_t offset;
>>> +
>>> +		tx_idx = (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN;
>>
>> I'm a bit concerned that this can fold multiple rx queue counters into one
>> tx counter. But I cannot think of a better idea provided that we want to
>> align XDP stats between drivers. So I'm OK with this.
> 
> Since peer->real_num_rx_queues can be greater than dev->real_num_tx_queues,
> right? IIUC the only guarantee we have is:
> 
> peer->real_num_tx_queues < dev->real_num_rx_queues
> 
> If you are fine with that approach, I will post a patch before net-next
> closure.
> 
> Regards,
> Lorenzo
> 
> 
>>
>> Toshiaki Makita
>>
>>> +		do {
>>> +			start = u64_stats_fetch_begin_irq(&rq_stats->syncp);
>>> +			for (j = 0; j < VETH_TQ_STATS_LEN; j++) {
>>> +				offset = veth_tq_stats_desc[j].offset;
>>> +				data[tx_idx + idx + j] += *(u64 *)(stats_base + offset);
>>> +			}
>>> +		} while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start));
>>> +	}
>>>    }
>>>    static const struct ethtool_ops veth_ethtool_ops = {
>>> @@ -340,8 +374,7 @@ static void veth_get_stats64(struct net_device *dev,
>>>    	tot->tx_packets = packets;
>>>    	veth_stats_rx(&rx, dev);
>>> -	tot->tx_dropped += rx.xdp_xmit_err + rx.xdp_tx_err;
>>> -	tot->rx_dropped = rx.rx_drops;
>>> +	tot->rx_dropped = rx.rx_drops + rx.xdp_tx_err + rx.xdp_xmit_err;
>>>    	tot->rx_bytes = rx.xdp_bytes;
>>>    	tot->rx_packets = rx.xdp_packets;
>>> @@ -353,7 +386,7 @@ static void veth_get_stats64(struct net_device *dev,
>>>    		tot->rx_packets += packets;
>>>    		veth_stats_rx(&rx, peer);
>>> -		tot->rx_dropped += rx.xdp_xmit_err + rx.xdp_tx_err;
>>> +		tot->tx_dropped += rx.xdp_xmit_err + rx.xdp_tx_err;
>>>    		tot->tx_bytes += rx.xdp_bytes;
>>>    		tot->tx_packets += rx.xdp_packets;
>>>    	}
>>> @@ -394,9 +427,9 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
>>>    			 u32 flags, bool ndo_xmit)
>>>    {
>>>    	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
>>> -	unsigned int qidx, max_len;
>>>    	struct net_device *rcv;
>>>    	int i, ret, drops = n;
>>> +	unsigned int max_len;
>>>    	struct veth_rq *rq;
>>>    	rcu_read_lock();
>>> @@ -414,8 +447,7 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
>>>    	}
>>>    	rcv_priv = netdev_priv(rcv);
>>> -	qidx = veth_select_rxq(rcv);
>>> -	rq = &rcv_priv->rq[qidx];
>>> +	rq = &rcv_priv->rq[veth_select_rxq(rcv)];
>>>    	/* Non-NULL xdp_prog ensures that xdp_ring is initialized on receive
>>>    	 * side. This means an XDP program is loaded on the peer and the peer
>>>    	 * device is up.
>>> @@ -446,7 +478,6 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
>>>    	ret = n - drops;
>>>    drop:
>>> -	rq = &priv->rq[qidx];
>>>    	u64_stats_update_begin(&rq->stats.syncp);
>>>    	if (ndo_xmit) {
>>>    		rq->stats.vs.xdp_xmit += n - drops;
>>>
>>>>
>>>> Thoughts?
>>>>
>>>> Toshiaki Makita
Lorenzo Bianconi March 25, 2020, 2:53 p.m. UTC | #10
> On 2020/03/24 23:36, Lorenzo Bianconi wrote:

[...]

> > I moved the accounting of xdp_tx/tx_error into veth_xdp_xmit for two reasons:
> > 1- veth_xdp_tx in veth_xdp_rcv_one or veth_xdp_rcv_skb returns an error
> >     for XDP_TX only if the xdp_frame pointer is invalid, but the packet can be
> >     discarded in veth_xdp_xmit if the device is 'under pressure' (and this can
> >     be a problem since in XDP there are no queueing mechanisms so far)
> 
> Right, but you can track the discards in veth_xdp_flush_bq().
> 

ack, I guess it is just a matter of passing veth_rq to veth_xdp_flush_bq and
accounting XDP_TX there, something like the sketch below. I will post a patch.
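
Something like this (untested sketch, assuming veth_rq keeps its dev
back-pointer so veth_xdp_flush_bq can take a veth_rq instead of a net_device):

static void veth_xdp_flush_bq(struct veth_rq *rq, struct veth_xdp_tx_bq *bq)
{
	int sent, i, err = 0;

	sent = veth_xdp_xmit(rq->dev, bq->count, bq->q, 0, false);
	if (sent < 0) {
		err = sent;
		sent = 0;
		for (i = 0; i < bq->count; i++)
			xdp_return_frame(bq->q[i]);
	}
	trace_xdp_bulk_tx(rq->dev, sent, bq->count - sent, err);

	/* account XDP_TX on the local rq here instead of in veth_xdp_xmit() */
	u64_stats_update_begin(&rq->stats.syncp);
	rq->stats.vs.xdp_tx += sent;
	rq->stats.vs.xdp_tx_err += bq->count - sent;
	u64_stats_update_end(&rq->stats.syncp);

	bq->count = 0;
}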

Regards,
Lorenzo

> > 2- to be symmetric with ndo_xdp_xmit
> 
> I thought consistency between drivers is more important. What about other drivers?
> 
> Toshiaki Makita
> 
> > 
> > > 
> > > > +	{ "xdp_xmit",		VETH_RQ_STAT(xdp_xmit) },
> > > > +	{ "xdp_xmit_errors",	VETH_RQ_STAT(xdp_xmit_err) },
> > > > +};
> > > > +
> > > > +#define VETH_TQ_STATS_LEN	ARRAY_SIZE(veth_tq_stats_desc)
> > > > +
> > > >    static struct {
> > > >    	const char string[ETH_GSTRING_LEN];
> > > >    } ethtool_stats_keys[] = {
> > > > @@ -142,6 +147,14 @@ static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
> > > >    				p += ETH_GSTRING_LEN;
> > > >    			}
> > > >    		}
> > > > +		for (i = 0; i < dev->real_num_tx_queues; i++) {
> > > > +			for (j = 0; j < VETH_TQ_STATS_LEN; j++) {
> > > > +				snprintf(p, ETH_GSTRING_LEN,
> > > > +					 "tx_queue_%u_%.18s",
> > > > +					 i, veth_tq_stats_desc[j].desc);
> > > > +				p += ETH_GSTRING_LEN;
> > > > +			}
> > > > +		}
> > > >    		break;
> > > >    	}
> > > >    }
> > > > @@ -151,7 +164,8 @@ static int veth_get_sset_count(struct net_device *dev, int sset)
> > > >    	switch (sset) {
> > > >    	case ETH_SS_STATS:
> > > >    		return ARRAY_SIZE(ethtool_stats_keys) +
> > > > -		       VETH_RQ_STATS_LEN * dev->real_num_rx_queues;
> > > > +		       VETH_RQ_STATS_LEN * dev->real_num_rx_queues +
> > > > +		       VETH_TQ_STATS_LEN * dev->real_num_tx_queues;
> > > >    	default:
> > > >    		return -EOPNOTSUPP;
> > > >    	}
> > > > @@ -160,7 +174,7 @@ static int veth_get_sset_count(struct net_device *dev, int sset)
> > > >    static void veth_get_ethtool_stats(struct net_device *dev,
> > > >    		struct ethtool_stats *stats, u64 *data)
> > > >    {
> > > > -	struct veth_priv *priv = netdev_priv(dev);
> > > > +	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
> > > >    	struct net_device *peer = rtnl_dereference(priv->peer);
> > > >    	int i, j, idx;
> > > > @@ -181,6 +195,26 @@ static void veth_get_ethtool_stats(struct net_device *dev,
> > > >    		} while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start));
> > > >    		idx += VETH_RQ_STATS_LEN;
> > > >    	}
> > > > +
> > > > +	if (!peer)
> > > > +		return;
> > > > +
> > > > +	rcv_priv = netdev_priv(peer);
> > > > +	for (i = 0; i < peer->real_num_rx_queues; i++) {
> > > > +		const struct veth_rq_stats *rq_stats = &rcv_priv->rq[i].stats;
> > > > +		const void *stats_base = (void *)&rq_stats->vs;
> > > > +		unsigned int start, tx_idx;
> > > > +		size_t offset;
> > > > +
> > > > +		tx_idx = (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN;
> > > 
> > > I'm a bit concerned that this can fold multiple rx queue counters into one
> > > tx counter. But I cannot think of a better idea provided that we want to
> > > align XDP stats between drivers. So I'm OK with this.
> > 
> > Since peer->real_num_rx_queues can be greater than dev->real_num_tx_queues,
> > right? IIUC the only guarantee we have is:
> > 
> > peer->real_num_tx_queues < dev->real_num_rx_queues
> > 
> > If you are fine with that approach, I will post a patch before net-next
> > closure.
> > 
> > Regards,
> > Lorenzo
> > 
> > 
> > > 
> > > Toshiaki Makita
> > > 
> > > > +		do {
> > > > +			start = u64_stats_fetch_begin_irq(&rq_stats->syncp);
> > > > +			for (j = 0; j < VETH_TQ_STATS_LEN; j++) {
> > > > +				offset = veth_tq_stats_desc[j].offset;
> > > > +				data[tx_idx + idx + j] += *(u64 *)(stats_base + offset);
> > > > +			}
> > > > +		} while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start));
> > > > +	}
> > > >    }
> > > >    static const struct ethtool_ops veth_ethtool_ops = {
> > > > @@ -340,8 +374,7 @@ static void veth_get_stats64(struct net_device *dev,
> > > >    	tot->tx_packets = packets;
> > > >    	veth_stats_rx(&rx, dev);
> > > > -	tot->tx_dropped += rx.xdp_xmit_err + rx.xdp_tx_err;
> > > > -	tot->rx_dropped = rx.rx_drops;
> > > > +	tot->rx_dropped = rx.rx_drops + rx.xdp_tx_err + rx.xdp_xmit_err;
> > > >    	tot->rx_bytes = rx.xdp_bytes;
> > > >    	tot->rx_packets = rx.xdp_packets;
> > > > @@ -353,7 +386,7 @@ static void veth_get_stats64(struct net_device *dev,
> > > >    		tot->rx_packets += packets;
> > > >    		veth_stats_rx(&rx, peer);
> > > > -		tot->rx_dropped += rx.xdp_xmit_err + rx.xdp_tx_err;
> > > > +		tot->tx_dropped += rx.xdp_xmit_err + rx.xdp_tx_err;
> > > >    		tot->tx_bytes += rx.xdp_bytes;
> > > >    		tot->tx_packets += rx.xdp_packets;
> > > >    	}
> > > > @@ -394,9 +427,9 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
> > > >    			 u32 flags, bool ndo_xmit)
> > > >    {
> > > >    	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
> > > > -	unsigned int qidx, max_len;
> > > >    	struct net_device *rcv;
> > > >    	int i, ret, drops = n;
> > > > +	unsigned int max_len;
> > > >    	struct veth_rq *rq;
> > > >    	rcu_read_lock();
> > > > @@ -414,8 +447,7 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
> > > >    	}
> > > >    	rcv_priv = netdev_priv(rcv);
> > > > -	qidx = veth_select_rxq(rcv);
> > > > -	rq = &rcv_priv->rq[qidx];
> > > > +	rq = &rcv_priv->rq[veth_select_rxq(rcv)];
> > > >    	/* Non-NULL xdp_prog ensures that xdp_ring is initialized on receive
> > > >    	 * side. This means an XDP program is loaded on the peer and the peer
> > > >    	 * device is up.
> > > > @@ -446,7 +478,6 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
> > > >    	ret = n - drops;
> > > >    drop:
> > > > -	rq = &priv->rq[qidx];
> > > >    	u64_stats_update_begin(&rq->stats.syncp);
> > > >    	if (ndo_xmit) {
> > > >    		rq->stats.vs.xdp_xmit += n - drops;
> > > > 
> > > > > 
> > > > > Thoughts?
> > > > > 
> > > > > Toshiaki Makita

Patch

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 2307696d4897..093b55acedb1 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -44,6 +44,9 @@  struct veth_stats {
 	u64	xdp_redirect;
 	u64	xdp_drops;
 	u64	xdp_tx;
+	u64	xdp_tx_err;
+	u64	xdp_xmit;
+	u64	xdp_xmit_err;
 };
 
 struct veth_rq_stats {
@@ -89,8 +92,13 @@  struct veth_q_stat_desc {
 static const struct veth_q_stat_desc veth_rq_stats_desc[] = {
 	{ "xdp_packets",	VETH_RQ_STAT(xdp_packets) },
 	{ "xdp_bytes",		VETH_RQ_STAT(xdp_bytes) },
-	{ "xdp_drops",		VETH_RQ_STAT(xdp_drops) },
 	{ "rx_drops",		VETH_RQ_STAT(rx_drops) },
+	{ "rx_xdp_redirect",	VETH_RQ_STAT(xdp_redirect) },
+	{ "rx_xdp_drops",	VETH_RQ_STAT(xdp_drops) },
+	{ "rx_xdp_tx",		VETH_RQ_STAT(xdp_tx) },
+	{ "rx_xdp_tx_errors",	VETH_RQ_STAT(xdp_tx_err) },
+	{ "tx_xdp_xmit",	VETH_RQ_STAT(xdp_xmit) },
+	{ "tx_xdp_xmit_errors",	VETH_RQ_STAT(xdp_xmit_err) },
 };
 
 #define VETH_RQ_STATS_LEN	ARRAY_SIZE(veth_rq_stats_desc)
@@ -129,7 +137,7 @@  static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
 		for (i = 0; i < dev->real_num_rx_queues; i++) {
 			for (j = 0; j < VETH_RQ_STATS_LEN; j++) {
 				snprintf(p, ETH_GSTRING_LEN,
-					 "rx_queue_%u_%.11s",
+					 "rx_queue_%u_%.18s",
 					 i, veth_rq_stats_desc[j].desc);
 				p += ETH_GSTRING_LEN;
 			}
@@ -374,12 +382,13 @@  static int veth_select_rxq(struct net_device *dev)
 }
 
 static int veth_xdp_xmit(struct net_device *dev, int n,
-			 struct xdp_frame **frames, u32 flags)
+			 struct xdp_frame **frames,
+			 u32 flags, bool ndo_xmit)
 {
 	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
+	unsigned int qidx, max_len;
 	struct net_device *rcv;
 	int i, ret, drops = n;
-	unsigned int max_len;
 	struct veth_rq *rq;
 
 	rcu_read_lock();
@@ -395,7 +404,8 @@  static int veth_xdp_xmit(struct net_device *dev, int n,
 	}
 
 	rcv_priv = netdev_priv(rcv);
-	rq = &rcv_priv->rq[veth_select_rxq(rcv)];
+	qidx = veth_select_rxq(rcv);
+	rq = &rcv_priv->rq[qidx];
 	/* Non-NULL xdp_prog ensures that xdp_ring is initialized on receive
 	 * side. This means an XDP program is loaded on the peer and the peer
 	 * device is up.
@@ -424,6 +434,17 @@  static int veth_xdp_xmit(struct net_device *dev, int n,
 	if (flags & XDP_XMIT_FLUSH)
 		__veth_xdp_flush(rq);
 
+	rq = &priv->rq[qidx];
+	u64_stats_update_begin(&rq->stats.syncp);
+	if (ndo_xmit) {
+		rq->stats.vs.xdp_xmit += n - drops;
+		rq->stats.vs.xdp_xmit_err += drops;
+	} else {
+		rq->stats.vs.xdp_tx += n - drops;
+		rq->stats.vs.xdp_tx_err += drops;
+	}
+	u64_stats_update_end(&rq->stats.syncp);
+
 	if (likely(!drops)) {
 		rcu_read_unlock();
 		return n;
@@ -437,11 +458,17 @@  static int veth_xdp_xmit(struct net_device *dev, int n,
 	return ret;
 }
 
+static int veth_ndo_xdp_xmit(struct net_device *dev, int n,
+			     struct xdp_frame **frames, u32 flags)
+{
+	return veth_xdp_xmit(dev, n, frames, flags, true);
+}
+
 static void veth_xdp_flush_bq(struct net_device *dev, struct veth_xdp_tx_bq *bq)
 {
 	int sent, i, err = 0;
 
-	sent = veth_xdp_xmit(dev, bq->count, bq->q, 0);
+	sent = veth_xdp_xmit(dev, bq->count, bq->q, 0, false);
 	if (sent < 0) {
 		err = sent;
 		sent = 0;
@@ -753,6 +780,7 @@  static int veth_xdp_rcv(struct veth_rq *rq, int budget,
 	}
 
 	u64_stats_update_begin(&rq->stats.syncp);
+	rq->stats.vs.xdp_redirect += stats->xdp_redirect;
 	rq->stats.vs.xdp_bytes += stats->xdp_bytes;
 	rq->stats.vs.xdp_drops += stats->xdp_drops;
 	rq->stats.vs.rx_drops += stats->rx_drops;
@@ -1172,7 +1200,7 @@  static const struct net_device_ops veth_netdev_ops = {
 	.ndo_features_check	= passthru_features_check,
 	.ndo_set_rx_headroom	= veth_set_rx_headroom,
 	.ndo_bpf		= veth_xdp,
-	.ndo_xdp_xmit		= veth_xdp_xmit,
+	.ndo_xdp_xmit		= veth_ndo_xdp_xmit,
 };
 
 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \