diff mbox series

[ovs-dev,v7,1/3] netdev: Add rxq callback function rxq_length()

Message ID 1516067473-29286-2-git-send-email-jan.scheurich@ericsson.com
State Superseded
Delegated to: Ian Stokes
Headers show
Series dpif-netdev: Detailed PMD performance metrics and supervision | expand

Commit Message

Jan Scheurich Jan. 16, 2018, 1:51 a.m. UTC
If implememented, this function returns the number of packets in an rx
queue of the netdev. If not implemented, it returns -1.

This function will be used in the upcoming commit for PMD performance
metrics to supervise the rx queue fill level for DPDK vhostuser ports.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
---
 lib/netdev-bsd.c      |  1 +
 lib/netdev-dpdk.c     | 36 +++++++++++++++++++++++++++++++-----
 lib/netdev-dummy.c    |  1 +
 lib/netdev-linux.c    |  1 +
 lib/netdev-provider.h |  3 +++
 lib/netdev-vport.c    |  1 +
 lib/netdev.c          |  9 +++++++++
 lib/netdev.h          |  1 +
 8 files changed, 48 insertions(+), 5 deletions(-)

Comments

Ilya Maximets Jan. 17, 2018, 10:47 a.m. UTC | #1
On 16.01.2018 04:51, Jan Scheurich wrote:
> If implememented, this function returns the number of packets in an rx
> queue of the netdev. If not implemented, it returns -1.

To be conform with other netdev functions it should return meaningful
error codes. As 'rte_eth_rx_queue_count' could return different errors
like -EINVAL or -ENOTSUP, 'netdev_rxq_length' itself should return
-EOPNOTSUPP if not implemented.

> 
> This function will be used in the upcoming commit for PMD performance
> metrics to supervise the rx queue fill level for DPDK vhostuser ports.
> 
> Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
> ---
>  lib/netdev-bsd.c      |  1 +
>  lib/netdev-dpdk.c     | 36 +++++++++++++++++++++++++++++++-----
>  lib/netdev-dummy.c    |  1 +
>  lib/netdev-linux.c    |  1 +
>  lib/netdev-provider.h |  3 +++
>  lib/netdev-vport.c    |  1 +
>  lib/netdev.c          |  9 +++++++++
>  lib/netdev.h          |  1 +
>  8 files changed, 48 insertions(+), 5 deletions(-)
> 
> diff --git a/lib/netdev-bsd.c b/lib/netdev-bsd.c
> index 05974c1..8d1771e 100644
> --- a/lib/netdev-bsd.c
> +++ b/lib/netdev-bsd.c
> @@ -1546,6 +1546,7 @@ netdev_bsd_update_flags(struct netdev *netdev_, enum netdev_flags off,
>      netdev_bsd_rxq_recv,                             \
>      netdev_bsd_rxq_wait,                             \
>      netdev_bsd_rxq_drain,                            \
> +    NULL, /* rxq_length */                           \
>                                                       \
>      NO_OFFLOAD_API                                   \
>  }
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index ccda3fc..4200556 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -1839,6 +1839,27 @@ netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch)
>      return 0;
>  }
>  
> +static int
> +netdev_dpdk_vhost_rxq_length(struct netdev_rxq *rxq)
> +{
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
> +    int qid = rxq->queue_id;
> +

We must make all the checks as in rxq_recv() function before calling
'rte_vhost_rx_queue_count'. Otherwise we may crash here if device
will be occasionally disconnected:

    int vid = netdev_dpdk_get_vid(dev);

    if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured
                     || !(dev->flags & NETDEV_UP))) {
        return -EAGAIN;
    }

Not sure about -EAGAIN, but we need to return some negative errno.

> +    /* The DPDK API returns a uint32_t which often has invalid bits in the
> +     * upper 16-bits. Need to restrict the value uint16_t. */
> +    return rte_vhost_rx_queue_count(netdev_dpdk_get_vid(dev),
> +                                    qid * VIRTIO_QNUM + VIRTIO_TXQ)
> +                & UINT16_MAX;
> +}
> +
> +static int
> +netdev_dpdk_rxq_length(struct netdev_rxq *rxq)
> +{
> +    struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
> +

Same here:

    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);

    if (OVS_UNLIKELY(!(dev->flags & NETDEV_UP))) {                               
        return -EAGAIN;                                                           
    }

> +    return rte_eth_rx_queue_count(rx->port_id, rxq->queue_id);
> +}
> +
>  static inline int
>  netdev_dpdk_qos_run(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
>                      int cnt, bool may_steal)
> @@ -3580,7 +3601,7 @@ unlock:
>                            GET_CARRIER, GET_STATS,			  \
>                            GET_CUSTOM_STATS,					  \
>                            GET_FEATURES, GET_STATUS,           \
> -                          RECONFIGURE, RXQ_RECV)              \
> +                          RECONFIGURE, RXQ_RECV, RXQ_LENGTH)  \
>  {                                                             \
>      NAME,                                                     \
>      true,                       /* is_pmd */                  \
> @@ -3649,6 +3670,7 @@ unlock:
>      RXQ_RECV,                                                 \
>      NULL,                       /* rx_wait */                 \
>      NULL,                       /* rxq_drain */               \
> +    RXQ_LENGTH,                                               \
>      NO_OFFLOAD_API                                            \
>  }
>  
> @@ -3667,7 +3689,8 @@ static const struct netdev_class dpdk_class =
>          netdev_dpdk_get_features,
>          netdev_dpdk_get_status,
>          netdev_dpdk_reconfigure,
> -        netdev_dpdk_rxq_recv);
> +        netdev_dpdk_rxq_recv,
> +        netdev_dpdk_rxq_length);
>  
>  static const struct netdev_class dpdk_ring_class =
>      NETDEV_DPDK_CLASS(
> @@ -3684,7 +3707,8 @@ static const struct netdev_class dpdk_ring_class =
>          netdev_dpdk_get_features,
>          netdev_dpdk_get_status,
>          netdev_dpdk_reconfigure,
> -        netdev_dpdk_rxq_recv);
> +        netdev_dpdk_rxq_recv,
> +        NULL);
>  
>  static const struct netdev_class dpdk_vhost_class =
>      NETDEV_DPDK_CLASS(
> @@ -3701,7 +3725,8 @@ static const struct netdev_class dpdk_vhost_class =
>          NULL,
>          NULL,
>          netdev_dpdk_vhost_reconfigure,
> -        netdev_dpdk_vhost_rxq_recv);
> +        netdev_dpdk_vhost_rxq_recv,
> +        netdev_dpdk_vhost_rxq_length);
>  static const struct netdev_class dpdk_vhost_client_class =
>      NETDEV_DPDK_CLASS(
>          "dpdkvhostuserclient",
> @@ -3717,7 +3742,8 @@ static const struct netdev_class dpdk_vhost_client_class =
>          NULL,
>          NULL,
>          netdev_dpdk_vhost_client_reconfigure,
> -        netdev_dpdk_vhost_rxq_recv);
> +        netdev_dpdk_vhost_rxq_recv,
> +        netdev_dpdk_vhost_rxq_length);
>  
>  void
>  netdev_dpdk_register(void)
> diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c
> index 4246af3..7e2c0a2 100644
> --- a/lib/netdev-dummy.c
> +++ b/lib/netdev-dummy.c
> @@ -1457,6 +1457,7 @@ netdev_dummy_update_flags(struct netdev *netdev_,
>      netdev_dummy_rxq_recv,                                      \
>      netdev_dummy_rxq_wait,                                      \
>      netdev_dummy_rxq_drain,                                     \
> +    NULL,                       /* rxq_length */                \
>                                                                  \
>      NO_OFFLOAD_API                                              \
>  }
> diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
> index 37143b8..8b19890 100644
> --- a/lib/netdev-linux.c
> +++ b/lib/netdev-linux.c
> @@ -2890,6 +2890,7 @@ netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
>      netdev_linux_rxq_recv,                                      \
>      netdev_linux_rxq_wait,                                      \
>      netdev_linux_rxq_drain,                                     \
> +    NULL,                       /* rxq_length */                \
>                                                                  \
>      FLOW_OFFLOAD_API                                            \
>  }
> diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h
> index 25bd671..297644a 100644
> --- a/lib/netdev-provider.h
> +++ b/lib/netdev-provider.h
> @@ -801,6 +801,9 @@ struct netdev_class {
>      /* Discards all packets waiting to be received from 'rx'. */
>      int (*rxq_drain)(struct netdev_rxq *rx);
>  
> +    /* Retrieve the number of packets present in an rx queue. */

Comment should be extended. See below.

In addition we need to mark here that function is not thread-safe and
could not be used while device reconfiguration.

> +    int (*rxq_length)(struct netdev_rxq *rx);
> +
>      /* ## -------------------------------- ## */
>      /* ## netdev flow offloading functions ## */
>      /* ## -------------------------------- ## */
> diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c
> index 478ed90..1e7bc96 100644
> --- a/lib/netdev-vport.c
> +++ b/lib/netdev-vport.c
> @@ -944,6 +944,7 @@ netdev_vport_get_ifindex(const struct netdev *netdev_)
>      NULL,                   /* rx_recv */                   \
>      NULL,                   /* rx_wait */                   \
>      NULL,                   /* rx_drain */                  \
> +    NULL,                   /* rx_length */                 \
>                                                              \
>      NETDEV_FLOW_OFFLOAD_API
>  
> diff --git a/lib/netdev.c b/lib/netdev.c
> index be05dc6..063c318 100644
> --- a/lib/netdev.c
> +++ b/lib/netdev.c
> @@ -724,6 +724,15 @@ netdev_rxq_drain(struct netdev_rxq *rx)
>              : 0);
>  }
>  
> +/* Retrieve the number of packets present in an rx queue. */

Comment should clearly declare what kind of result should be treated
as an error, and what is the result in case of success. You may use
description for 'netdev_get_ifindex' as a reference.
Something like:

/* Returns the number of packets present in an rx queue, if successful, as a
 * positive number.  On failure, returns a negative errno value.
 *
 * Some network devices may not implement support for this function.  In such
 * cases this function will always return -EOPNOTSUPP. */ 

> +int
> +netdev_rxq_length(struct netdev_rxq *rx)
> +{
> +    return (rx->netdev->netdev_class->rxq_length
> +            ? rx->netdev->netdev_class->rxq_length(rx)
> +            : -1);
> +}
> +
>  /* Configures the number of tx queues of 'netdev'. Returns 0 if successful,
>   * otherwise a positive errno value.
>   *
> diff --git a/lib/netdev.h b/lib/netdev.h
> index ff1b604..edd41b1 100644
> --- a/lib/netdev.h
> +++ b/lib/netdev.h
> @@ -178,6 +178,7 @@ int netdev_rxq_get_queue_id(const struct netdev_rxq *);
>  int netdev_rxq_recv(struct netdev_rxq *rx, struct dp_packet_batch *);
>  void netdev_rxq_wait(struct netdev_rxq *);
>  int netdev_rxq_drain(struct netdev_rxq *);
> +int netdev_rxq_length(struct netdev_rxq *rx);

argument's name not needed here.

>  
>  /* Packet transmission. */
>  int netdev_send(struct netdev *, int qid, struct dp_packet_batch *,
>
Jan Scheurich Jan. 17, 2018, 11:21 p.m. UTC | #2
Thanks for the review. Answers inline.
Regards, Jan


> From: Ilya Maximets [mailto:i.maximets@samsung.com]
> Sent: Wednesday, 17 January, 2018 11:47
> Subject: Re: [PATCH v7 1/3] netdev: Add rxq callback function rxq_length()
> 
> On 16.01.2018 04:51, Jan Scheurich wrote:
> > If implememented, this function returns the number of packets in an rx
> > queue of the netdev. If not implemented, it returns -1.
> 
> To be conform with other netdev functions it should return meaningful
> error codes. As 'rte_eth_rx_queue_count' could return different errors
> like -EINVAL or -ENOTSUP, 'netdev_rxq_length' itself should return
> -EOPNOTSUPP if not implemented.

OK.

> 
> >
> > This function will be used in the upcoming commit for PMD performance
> > metrics to supervise the rx queue fill level for DPDK vhostuser ports.
> >
> > Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
> > ---
> >  lib/netdev-bsd.c      |  1 +
> >  lib/netdev-dpdk.c     | 36 +++++++++++++++++++++++++++++++-----
> >  lib/netdev-dummy.c    |  1 +
> >  lib/netdev-linux.c    |  1 +
> >  lib/netdev-provider.h |  3 +++
> >  lib/netdev-vport.c    |  1 +
> >  lib/netdev.c          |  9 +++++++++
> >  lib/netdev.h          |  1 +
> >  8 files changed, 48 insertions(+), 5 deletions(-)
> >
> > diff --git a/lib/netdev-bsd.c b/lib/netdev-bsd.c
> > index 05974c1..8d1771e 100644
> > --- a/lib/netdev-bsd.c
> > +++ b/lib/netdev-bsd.c
> > @@ -1546,6 +1546,7 @@ netdev_bsd_update_flags(struct netdev *netdev_, enum netdev_flags off,
> >      netdev_bsd_rxq_recv,                             \
> >      netdev_bsd_rxq_wait,                             \
> >      netdev_bsd_rxq_drain,                            \
> > +    NULL, /* rxq_length */                           \
> >                                                       \
> >      NO_OFFLOAD_API                                   \
> >  }
> > diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> > index ccda3fc..4200556 100644
> > --- a/lib/netdev-dpdk.c
> > +++ b/lib/netdev-dpdk.c
> > @@ -1839,6 +1839,27 @@ netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch)
> >      return 0;
> >  }
> >
> > +static int
> > +netdev_dpdk_vhost_rxq_length(struct netdev_rxq *rxq)
> > +{
> > +    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
> > +    int qid = rxq->queue_id;
> > +
> 
> We must make all the checks as in rxq_recv() function before calling
> 'rte_vhost_rx_queue_count'. Otherwise we may crash here if device
> will be occasionally disconnected:
> 
>     int vid = netdev_dpdk_get_vid(dev);
> 
>     if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured
>                      || !(dev->flags & NETDEV_UP))) {
>         return -EAGAIN;
>     }
> 
> Not sure about -EAGAIN, but we need to return some negative errno.

OK. Not necessary for our use case, as it will only be called by the PMD after having received a full batch of 32 packets, but in general I agree those checks are needed.

> 
> > +    /* The DPDK API returns a uint32_t which often has invalid bits in the
> > +     * upper 16-bits. Need to restrict the value uint16_t. */
> > +    return rte_vhost_rx_queue_count(netdev_dpdk_get_vid(dev),
> > +                                    qid * VIRTIO_QNUM + VIRTIO_TXQ)
> > +                & UINT16_MAX;
> > +}
> > +
> > +static int
> > +netdev_dpdk_rxq_length(struct netdev_rxq *rxq)
> > +{
> > +    struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
> > +
> 
> Same here:
> 
>     struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
> 
>     if (OVS_UNLIKELY(!(dev->flags & NETDEV_UP))) {
>         return -EAGAIN;
>     }
> 
> > +    return rte_eth_rx_queue_count(rx->port_id, rxq->queue_id);
> > +}
> > +
> >  static inline int
> >  netdev_dpdk_qos_run(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
> >                      int cnt, bool may_steal)
> > @@ -3580,7 +3601,7 @@ unlock:
> >                            GET_CARRIER, GET_STATS,			  \
> >                            GET_CUSTOM_STATS,					  \
> >                            GET_FEATURES, GET_STATUS,           \
> > -                          RECONFIGURE, RXQ_RECV)              \
> > +                          RECONFIGURE, RXQ_RECV, RXQ_LENGTH)  \
> >  {                                                             \
> >      NAME,                                                     \
> >      true,                       /* is_pmd */                  \
> > @@ -3649,6 +3670,7 @@ unlock:
> >      RXQ_RECV,                                                 \
> >      NULL,                       /* rx_wait */                 \
> >      NULL,                       /* rxq_drain */               \
> > +    RXQ_LENGTH,                                               \
> >      NO_OFFLOAD_API                                            \
> >  }
> >
> > @@ -3667,7 +3689,8 @@ static const struct netdev_class dpdk_class =
> >          netdev_dpdk_get_features,
> >          netdev_dpdk_get_status,
> >          netdev_dpdk_reconfigure,
> > -        netdev_dpdk_rxq_recv);
> > +        netdev_dpdk_rxq_recv,
> > +        netdev_dpdk_rxq_length);
> >
> >  static const struct netdev_class dpdk_ring_class =
> >      NETDEV_DPDK_CLASS(
> > @@ -3684,7 +3707,8 @@ static const struct netdev_class dpdk_ring_class =
> >          netdev_dpdk_get_features,
> >          netdev_dpdk_get_status,
> >          netdev_dpdk_reconfigure,
> > -        netdev_dpdk_rxq_recv);
> > +        netdev_dpdk_rxq_recv,
> > +        NULL);
> >
> >  static const struct netdev_class dpdk_vhost_class =
> >      NETDEV_DPDK_CLASS(
> > @@ -3701,7 +3725,8 @@ static const struct netdev_class dpdk_vhost_class =
> >          NULL,
> >          NULL,
> >          netdev_dpdk_vhost_reconfigure,
> > -        netdev_dpdk_vhost_rxq_recv);
> > +        netdev_dpdk_vhost_rxq_recv,
> > +        netdev_dpdk_vhost_rxq_length);
> >  static const struct netdev_class dpdk_vhost_client_class =
> >      NETDEV_DPDK_CLASS(
> >          "dpdkvhostuserclient",
> > @@ -3717,7 +3742,8 @@ static const struct netdev_class dpdk_vhost_client_class =
> >          NULL,
> >          NULL,
> >          netdev_dpdk_vhost_client_reconfigure,
> > -        netdev_dpdk_vhost_rxq_recv);
> > +        netdev_dpdk_vhost_rxq_recv,
> > +        netdev_dpdk_vhost_rxq_length);
> >
> >  void
> >  netdev_dpdk_register(void)
> > diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c
> > index 4246af3..7e2c0a2 100644
> > --- a/lib/netdev-dummy.c
> > +++ b/lib/netdev-dummy.c
> > @@ -1457,6 +1457,7 @@ netdev_dummy_update_flags(struct netdev *netdev_,
> >      netdev_dummy_rxq_recv,                                      \
> >      netdev_dummy_rxq_wait,                                      \
> >      netdev_dummy_rxq_drain,                                     \
> > +    NULL,                       /* rxq_length */                \
> >                                                                  \
> >      NO_OFFLOAD_API                                              \
> >  }
> > diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
> > index 37143b8..8b19890 100644
> > --- a/lib/netdev-linux.c
> > +++ b/lib/netdev-linux.c
> > @@ -2890,6 +2890,7 @@ netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
> >      netdev_linux_rxq_recv,                                      \
> >      netdev_linux_rxq_wait,                                      \
> >      netdev_linux_rxq_drain,                                     \
> > +    NULL,                       /* rxq_length */                \
> >                                                                  \
> >      FLOW_OFFLOAD_API                                            \
> >  }
> > diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h
> > index 25bd671..297644a 100644
> > --- a/lib/netdev-provider.h
> > +++ b/lib/netdev-provider.h
> > @@ -801,6 +801,9 @@ struct netdev_class {
> >      /* Discards all packets waiting to be received from 'rx'. */
> >      int (*rxq_drain)(struct netdev_rxq *rx);
> >
> > +    /* Retrieve the number of packets present in an rx queue. */
> 
> Comment should be extended. See below.
> 
> In addition we need to mark here that function is not thread-safe and
> could not be used while device reconfiguration. 

I'm curious: is there any difference regarding thread-safeness and use during device configuration compared to the other netdev rxq functions? Why would it be necessary to list these constraints for this function but not the others? 

And why should this read-only function be a priori thread-unsafe? And in which sense: concurrent invocations of this function? Or concurrent invocation of this function with rxq_recv() and rxq_drain()?

Should the netdev.h and the netdev_provider.h not rather have a general disclaimer that the rxq functions are only to be called from a single thread assigned "polling" the rx queue, or alternatively be protected by a lock on the netdev user side?

> 
> > +    int (*rxq_length)(struct netdev_rxq *rx);
> > +
> >      /* ## -------------------------------- ## */
> >      /* ## netdev flow offloading functions ## */
> >      /* ## -------------------------------- ## */
> > diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c
> > index 478ed90..1e7bc96 100644
> > --- a/lib/netdev-vport.c
> > +++ b/lib/netdev-vport.c
> > @@ -944,6 +944,7 @@ netdev_vport_get_ifindex(const struct netdev *netdev_)
> >      NULL,                   /* rx_recv */                   \
> >      NULL,                   /* rx_wait */                   \
> >      NULL,                   /* rx_drain */                  \
> > +    NULL,                   /* rx_length */                 \
> >                                                              \
> >      NETDEV_FLOW_OFFLOAD_API
> >
> > diff --git a/lib/netdev.c b/lib/netdev.c
> > index be05dc6..063c318 100644
> > --- a/lib/netdev.c
> > +++ b/lib/netdev.c
> > @@ -724,6 +724,15 @@ netdev_rxq_drain(struct netdev_rxq *rx)
> >              : 0);
> >  }
> >
> > +/* Retrieve the number of packets present in an rx queue. */
> 
> Comment should clearly declare what kind of result should be treated
> as an error, and what is the result in case of success. You may use
> description for 'netdev_get_ifindex' as a reference.
> Something like:
> 
> /* Returns the number of packets present in an rx queue, if successful, as a
>  * positive number.  On failure, returns a negative errno value.
>  *
>  * Some network devices may not implement support for this function.  In such
>  * cases this function will always return -EOPNOTSUPP. */

OK.
> 
> > +int
> > +netdev_rxq_length(struct netdev_rxq *rx)
> > +{
> > +    return (rx->netdev->netdev_class->rxq_length
> > +            ? rx->netdev->netdev_class->rxq_length(rx)
> > +            : -1);
> > +}
> > +
> >  /* Configures the number of tx queues of 'netdev'. Returns 0 if successful,
> >   * otherwise a positive errno value.
> >   *
> > diff --git a/lib/netdev.h b/lib/netdev.h
> > index ff1b604..edd41b1 100644
> > --- a/lib/netdev.h
> > +++ b/lib/netdev.h
> > @@ -178,6 +178,7 @@ int netdev_rxq_get_queue_id(const struct netdev_rxq *);
> >  int netdev_rxq_recv(struct netdev_rxq *rx, struct dp_packet_batch *);
> >  void netdev_rxq_wait(struct netdev_rxq *);
> >  int netdev_rxq_drain(struct netdev_rxq *);
> > +int netdev_rxq_length(struct netdev_rxq *rx);
> 
> argument's name not needed here.

OK.

> 
> >
> >  /* Packet transmission. */
> >  int netdev_send(struct netdev *, int qid, struct dp_packet_batch *,
> >
Ilya Maximets Jan. 18, 2018, 6:17 a.m. UTC | #3
On 18.01.2018 02:21, Jan Scheurich wrote:
> Thanks for the review. Answers inline.
> Regards, Jan
> 
> 
>> From: Ilya Maximets [mailto:i.maximets@samsung.com]
>> Sent: Wednesday, 17 January, 2018 11:47
>> Subject: Re: [PATCH v7 1/3] netdev: Add rxq callback function rxq_length()
>>
>> On 16.01.2018 04:51, Jan Scheurich wrote:
>>> If implememented, this function returns the number of packets in an rx
>>> queue of the netdev. If not implemented, it returns -1.
>>
>> To be conform with other netdev functions it should return meaningful
>> error codes. As 'rte_eth_rx_queue_count' could return different errors
>> like -EINVAL or -ENOTSUP, 'netdev_rxq_length' itself should return
>> -EOPNOTSUPP if not implemented.
> 
> OK.
> 
>>
>>>
>>> This function will be used in the upcoming commit for PMD performance
>>> metrics to supervise the rx queue fill level for DPDK vhostuser ports.
>>>
>>> Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
>>> ---
>>>  lib/netdev-bsd.c      |  1 +
>>>  lib/netdev-dpdk.c     | 36 +++++++++++++++++++++++++++++++-----
>>>  lib/netdev-dummy.c    |  1 +
>>>  lib/netdev-linux.c    |  1 +
>>>  lib/netdev-provider.h |  3 +++
>>>  lib/netdev-vport.c    |  1 +
>>>  lib/netdev.c          |  9 +++++++++
>>>  lib/netdev.h          |  1 +
>>>  8 files changed, 48 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/lib/netdev-bsd.c b/lib/netdev-bsd.c
>>> index 05974c1..8d1771e 100644
>>> --- a/lib/netdev-bsd.c
>>> +++ b/lib/netdev-bsd.c
>>> @@ -1546,6 +1546,7 @@ netdev_bsd_update_flags(struct netdev *netdev_, enum netdev_flags off,
>>>      netdev_bsd_rxq_recv,                             \
>>>      netdev_bsd_rxq_wait,                             \
>>>      netdev_bsd_rxq_drain,                            \
>>> +    NULL, /* rxq_length */                           \
>>>                                                       \
>>>      NO_OFFLOAD_API                                   \
>>>  }
>>> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
>>> index ccda3fc..4200556 100644
>>> --- a/lib/netdev-dpdk.c
>>> +++ b/lib/netdev-dpdk.c
>>> @@ -1839,6 +1839,27 @@ netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch)
>>>      return 0;
>>>  }
>>>
>>> +static int
>>> +netdev_dpdk_vhost_rxq_length(struct netdev_rxq *rxq)
>>> +{
>>> +    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
>>> +    int qid = rxq->queue_id;
>>> +
>>
>> We must make all the checks as in rxq_recv() function before calling
>> 'rte_vhost_rx_queue_count'. Otherwise we may crash here if device
>> will be occasionally disconnected:
>>
>>     int vid = netdev_dpdk_get_vid(dev);
>>
>>     if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured
>>                      || !(dev->flags & NETDEV_UP))) {
>>         return -EAGAIN;
>>     }
>>
>> Not sure about -EAGAIN, but we need to return some negative errno.
> 
> OK. Not necessary for our use case, as it will only be called by the PMD after having received a full batch of 32 packets, but in general I agree those checks are needed.

It's necessary because vhost device could be disconnected between
rxq_recv() and rxq_length(). In this case we will call rte_vhost_rx_queue_count()
with vid == -1. This will produce access to the random memory inside
dpdk and likely a segmentation fault.

See commit daf22bf7a826 ("netdev-dpdk: Fix calling vhost API with negative vid.")
for a example of a similar issue. And I'm taking this opportunity to recall that
you should retrieve the vid only once.

> 
>>
>>> +    /* The DPDK API returns a uint32_t which often has invalid bits in the
>>> +     * upper 16-bits. Need to restrict the value uint16_t. */
>>> +    return rte_vhost_rx_queue_count(netdev_dpdk_get_vid(dev),
>>> +                                    qid * VIRTIO_QNUM + VIRTIO_TXQ)
>>> +                & UINT16_MAX;
>>> +}
>>> +
>>> +static int
>>> +netdev_dpdk_rxq_length(struct netdev_rxq *rxq)
>>> +{
>>> +    struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
>>> +
>>
>> Same here:
>>
>>     struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
>>
>>     if (OVS_UNLIKELY(!(dev->flags & NETDEV_UP))) {
>>         return -EAGAIN;
>>     }
>>
>>> +    return rte_eth_rx_queue_count(rx->port_id, rxq->queue_id);
>>> +}
>>> +
>>>  static inline int
>>>  netdev_dpdk_qos_run(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
>>>                      int cnt, bool may_steal)
>>> @@ -3580,7 +3601,7 @@ unlock:
>>>                            GET_CARRIER, GET_STATS,			  \
>>>                            GET_CUSTOM_STATS,					  \
>>>                            GET_FEATURES, GET_STATUS,           \
>>> -                          RECONFIGURE, RXQ_RECV)              \
>>> +                          RECONFIGURE, RXQ_RECV, RXQ_LENGTH)  \
>>>  {                                                             \
>>>      NAME,                                                     \
>>>      true,                       /* is_pmd */                  \
>>> @@ -3649,6 +3670,7 @@ unlock:
>>>      RXQ_RECV,                                                 \
>>>      NULL,                       /* rx_wait */                 \
>>>      NULL,                       /* rxq_drain */               \
>>> +    RXQ_LENGTH,                                               \
>>>      NO_OFFLOAD_API                                            \
>>>  }
>>>
>>> @@ -3667,7 +3689,8 @@ static const struct netdev_class dpdk_class =
>>>          netdev_dpdk_get_features,
>>>          netdev_dpdk_get_status,
>>>          netdev_dpdk_reconfigure,
>>> -        netdev_dpdk_rxq_recv);
>>> +        netdev_dpdk_rxq_recv,
>>> +        netdev_dpdk_rxq_length);
>>>
>>>  static const struct netdev_class dpdk_ring_class =
>>>      NETDEV_DPDK_CLASS(
>>> @@ -3684,7 +3707,8 @@ static const struct netdev_class dpdk_ring_class =
>>>          netdev_dpdk_get_features,
>>>          netdev_dpdk_get_status,
>>>          netdev_dpdk_reconfigure,
>>> -        netdev_dpdk_rxq_recv);
>>> +        netdev_dpdk_rxq_recv,
>>> +        NULL);
>>>
>>>  static const struct netdev_class dpdk_vhost_class =
>>>      NETDEV_DPDK_CLASS(
>>> @@ -3701,7 +3725,8 @@ static const struct netdev_class dpdk_vhost_class =
>>>          NULL,
>>>          NULL,
>>>          netdev_dpdk_vhost_reconfigure,
>>> -        netdev_dpdk_vhost_rxq_recv);
>>> +        netdev_dpdk_vhost_rxq_recv,
>>> +        netdev_dpdk_vhost_rxq_length);
>>>  static const struct netdev_class dpdk_vhost_client_class =
>>>      NETDEV_DPDK_CLASS(
>>>          "dpdkvhostuserclient",
>>> @@ -3717,7 +3742,8 @@ static const struct netdev_class dpdk_vhost_client_class =
>>>          NULL,
>>>          NULL,
>>>          netdev_dpdk_vhost_client_reconfigure,
>>> -        netdev_dpdk_vhost_rxq_recv);
>>> +        netdev_dpdk_vhost_rxq_recv,
>>> +        netdev_dpdk_vhost_rxq_length);
>>>
>>>  void
>>>  netdev_dpdk_register(void)
>>> diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c
>>> index 4246af3..7e2c0a2 100644
>>> --- a/lib/netdev-dummy.c
>>> +++ b/lib/netdev-dummy.c
>>> @@ -1457,6 +1457,7 @@ netdev_dummy_update_flags(struct netdev *netdev_,
>>>      netdev_dummy_rxq_recv,                                      \
>>>      netdev_dummy_rxq_wait,                                      \
>>>      netdev_dummy_rxq_drain,                                     \
>>> +    NULL,                       /* rxq_length */                \
>>>                                                                  \
>>>      NO_OFFLOAD_API                                              \
>>>  }
>>> diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
>>> index 37143b8..8b19890 100644
>>> --- a/lib/netdev-linux.c
>>> +++ b/lib/netdev-linux.c
>>> @@ -2890,6 +2890,7 @@ netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
>>>      netdev_linux_rxq_recv,                                      \
>>>      netdev_linux_rxq_wait,                                      \
>>>      netdev_linux_rxq_drain,                                     \
>>> +    NULL,                       /* rxq_length */                \
>>>                                                                  \
>>>      FLOW_OFFLOAD_API                                            \
>>>  }
>>> diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h
>>> index 25bd671..297644a 100644
>>> --- a/lib/netdev-provider.h
>>> +++ b/lib/netdev-provider.h
>>> @@ -801,6 +801,9 @@ struct netdev_class {
>>>      /* Discards all packets waiting to be received from 'rx'. */
>>>      int (*rxq_drain)(struct netdev_rxq *rx);
>>>
>>> +    /* Retrieve the number of packets present in an rx queue. */
>>
>> Comment should be extended. See below.
>>
>> In addition we need to mark here that function is not thread-safe and
>> could not be used while device reconfiguration. 
> 
> I'm curious: is there any difference regarding thread-safeness and use during device configuration compared to the other netdev rxq functions? Why would it be necessary to list these constraints for this function but not the others? 

Thread safety in general and particulary for netdev_rxq_*() functions described in
corresponding section at the top of lib/netdev.h . Thread safety of netdev_send()
in details described near to send() function definition in lib/netdev-provider.h
because it has it's own thread-safe related argument.

Comment near to reconfigure() in lib/netdev-provider.h states that we must not
use rxq_recv() and send() simultaneously with it.

I think, we should not describe the thread-safety and dependency with reconfigure()
for rxq_length() in it's own comment, but update comments listed above with this
new function.

> 
> And why should this read-only function be a priori thread-unsafe? And in which sense: concurrent invocations of this function? Or concurrent invocation of this function with rxq_recv() and rxq_drain()?

At least it could return wrong result in case of concurrent invocation with rxq_recv().

> 
> Should the netdev.h and the netdev_provider.h not rather have a general disclaimer that the rxq functions are only to be called from a single thread assigned "polling" the rx queue, or alternatively be protected by a lock on the netdev user side?

lib/netdev.h already has general thread-safety disclaimer.

> 
>>
>>> +    int (*rxq_length)(struct netdev_rxq *rx);
>>> +
>>>      /* ## -------------------------------- ## */
>>>      /* ## netdev flow offloading functions ## */
>>>      /* ## -------------------------------- ## */
>>> diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c
>>> index 478ed90..1e7bc96 100644
>>> --- a/lib/netdev-vport.c
>>> +++ b/lib/netdev-vport.c
>>> @@ -944,6 +944,7 @@ netdev_vport_get_ifindex(const struct netdev *netdev_)
>>>      NULL,                   /* rx_recv */                   \
>>>      NULL,                   /* rx_wait */                   \
>>>      NULL,                   /* rx_drain */                  \
>>> +    NULL,                   /* rx_length */                 \
>>>                                                              \
>>>      NETDEV_FLOW_OFFLOAD_API
>>>
>>> diff --git a/lib/netdev.c b/lib/netdev.c
>>> index be05dc6..063c318 100644
>>> --- a/lib/netdev.c
>>> +++ b/lib/netdev.c
>>> @@ -724,6 +724,15 @@ netdev_rxq_drain(struct netdev_rxq *rx)
>>>              : 0);
>>>  }
>>>
>>> +/* Retrieve the number of packets present in an rx queue. */
>>
>> Comment should clearly declare what kind of result should be treated
>> as an error, and what is the result in case of success. You may use
>> description for 'netdev_get_ifindex' as a reference.
>> Something like:
>>
>> /* Returns the number of packets present in an rx queue, if successful, as a
>>  * positive number.  On failure, returns a negative errno value.
>>  *
>>  * Some network devices may not implement support for this function.  In such
>>  * cases this function will always return -EOPNOTSUPP. */
> 
> OK.
>>
>>> +int
>>> +netdev_rxq_length(struct netdev_rxq *rx)
>>> +{
>>> +    return (rx->netdev->netdev_class->rxq_length
>>> +            ? rx->netdev->netdev_class->rxq_length(rx)
>>> +            : -1);
>>> +}
>>> +
>>>  /* Configures the number of tx queues of 'netdev'. Returns 0 if successful,
>>>   * otherwise a positive errno value.
>>>   *
>>> diff --git a/lib/netdev.h b/lib/netdev.h
>>> index ff1b604..edd41b1 100644
>>> --- a/lib/netdev.h
>>> +++ b/lib/netdev.h
>>> @@ -178,6 +178,7 @@ int netdev_rxq_get_queue_id(const struct netdev_rxq *);
>>>  int netdev_rxq_recv(struct netdev_rxq *rx, struct dp_packet_batch *);
>>>  void netdev_rxq_wait(struct netdev_rxq *);
>>>  int netdev_rxq_drain(struct netdev_rxq *);
>>> +int netdev_rxq_length(struct netdev_rxq *rx);
>>
>> argument's name not needed here.
> 
> OK.
> 
>>
>>>
>>>  /* Packet transmission. */
>>>  int netdev_send(struct netdev *, int qid, struct dp_packet_batch *,
>>>
Billy O'Mahony Jan. 18, 2018, 10:51 a.m. UTC | #4
> -----Original Message-----
> From: Ilya Maximets [mailto:i.maximets@samsung.com]
> Sent: Thursday, January 18, 2018 6:18 AM
> To: Jan Scheurich <jan.scheurich@ericsson.com>; dev@openvswitch.org
> Cc: ktraynor@redhat.com; Stokes, Ian <ian.stokes@intel.com>; O Mahony,
> Billy <billy.o.mahony@intel.com>
> Subject: Re: [PATCH v7 1/3] netdev: Add rxq callback function rxq_length()
> 
> On 18.01.2018 02:21, Jan Scheurich wrote:
> > Thanks for the review. Answers inline.
> > Regards, Jan
> >
> >
> >> From: Ilya Maximets [mailto:i.maximets@samsung.com]
> >> Sent: Wednesday, 17 January, 2018 11:47
> >> Subject: Re: [PATCH v7 1/3] netdev: Add rxq callback function
> >> rxq_length()
> >>
> >> On 16.01.2018 04:51, Jan Scheurich wrote:
> >>> If implememented, this function returns the number of packets in an
> >>> rx queue of the netdev. If not implemented, it returns -1.
> >>
> >> To be conform with other netdev functions it should return meaningful
> >> error codes. As 'rte_eth_rx_queue_count' could return different
> >> errors like -EINVAL or -ENOTSUP, 'netdev_rxq_length' itself should
> >> return -EOPNOTSUPP if not implemented.
> >
> > OK.
> >
> >>
> >>>
> >>> This function will be used in the upcoming commit for PMD
> >>> performance metrics to supervise the rx queue fill level for DPDK
> vhostuser ports.
> >>>
> >>> Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
> >>> ---
> >>>  lib/netdev-bsd.c      |  1 +
> >>>  lib/netdev-dpdk.c     | 36 +++++++++++++++++++++++++++++++-----
> >>>  lib/netdev-dummy.c    |  1 +
> >>>  lib/netdev-linux.c    |  1 +
> >>>  lib/netdev-provider.h |  3 +++
> >>>  lib/netdev-vport.c    |  1 +
> >>>  lib/netdev.c          |  9 +++++++++
> >>>  lib/netdev.h          |  1 +
> >>>  8 files changed, 48 insertions(+), 5 deletions(-)
> >>>
> >>> diff --git a/lib/netdev-bsd.c b/lib/netdev-bsd.c index
> >>> 05974c1..8d1771e 100644
> >>> --- a/lib/netdev-bsd.c
> >>> +++ b/lib/netdev-bsd.c
> >>> @@ -1546,6 +1546,7 @@ netdev_bsd_update_flags(struct netdev
> *netdev_, enum netdev_flags off,
> >>>      netdev_bsd_rxq_recv,                             \
> >>>      netdev_bsd_rxq_wait,                             \
> >>>      netdev_bsd_rxq_drain,                            \
> >>> +    NULL, /* rxq_length */                           \
> >>>                                                       \
> >>>      NO_OFFLOAD_API                                   \
> >>>  }
> >>> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index
> >>> ccda3fc..4200556 100644
> >>> --- a/lib/netdev-dpdk.c
> >>> +++ b/lib/netdev-dpdk.c
> >>> @@ -1839,6 +1839,27 @@ netdev_dpdk_rxq_recv(struct netdev_rxq
> *rxq, struct dp_packet_batch *batch)
> >>>      return 0;
> >>>  }
> >>>
> >>> +static int
> >>> +netdev_dpdk_vhost_rxq_length(struct netdev_rxq *rxq) {
> >>> +    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
> >>> +    int qid = rxq->queue_id;
> >>> +
> >>
> >> We must make all the checks as in rxq_recv() function before calling
> >> 'rte_vhost_rx_queue_count'. Otherwise we may crash here if device
> >> will be occasionally disconnected:
> >>
> >>     int vid = netdev_dpdk_get_vid(dev);
> >>
> >>     if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured
> >>                      || !(dev->flags & NETDEV_UP))) {
> >>         return -EAGAIN;
> >>     }
> >>
> >> Not sure about -EAGAIN, but we need to return some negative errno.
> >
> > OK. Not necessary for our use case, as it will only be called by the PMD
> after having received a full batch of 32 packets, but in general I agree those
> checks are needed.
> 
> It's necessary because vhost device could be disconnected between
> rxq_recv() and rxq_length(). In this case we will call
> rte_vhost_rx_queue_count() with vid == -1. This will produce access to the
> random memory inside dpdk and likely a segmentation fault.
> 
> See commit daf22bf7a826 ("netdev-dpdk: Fix calling vhost API with negative
> vid.") for a example of a similar issue. And I'm taking this opportunity to recall
> that you should retrieve the vid only once.

 [[BO'M]] Is there not also the possibility that the vhost device gets disconnected between the call to get_vid() and rxq_recv()? 

Also, given these required calls to get_vid (which afaik requires some slow memory fencing) wouldn't that argue for the original approach where the rxq len is returned from rxq_recv(). As the call to rxq_length()  would be made once per batch once the queue is not being drained rxq_recv() the overhead could be significant.

> 
> >
> >>
> >>> +    /* The DPDK API returns a uint32_t which often has invalid bits in the
> >>> +     * upper 16-bits. Need to restrict the value uint16_t. */
> >>> +    return rte_vhost_rx_queue_count(netdev_dpdk_get_vid(dev),
> >>> +                                    qid * VIRTIO_QNUM + VIRTIO_TXQ)
> >>> +                & UINT16_MAX;
> >>> +}
> >>> +
> >>> +static int
> >>> +netdev_dpdk_rxq_length(struct netdev_rxq *rxq) {
> >>> +    struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
> >>> +
> >>
> >> Same here:
> >>
> >>     struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
> >>
> >>     if (OVS_UNLIKELY(!(dev->flags & NETDEV_UP))) {
> >>         return -EAGAIN;
> >>     }
> >>
> >>> +    return rte_eth_rx_queue_count(rx->port_id, rxq->queue_id); }
> >>> +
> >>>  static inline int
> >>>  netdev_dpdk_qos_run(struct netdev_dpdk *dev, struct rte_mbuf
> **pkts,
> >>>                      int cnt, bool may_steal) @@ -3580,7 +3601,7 @@
> >>> unlock:
> >>>                            GET_CARRIER, GET_STATS,			  \
> >>>                            GET_CUSTOM_STATS,
> 	  \
> >>>                            GET_FEATURES, GET_STATUS,           \
> >>> -                          RECONFIGURE, RXQ_RECV)              \
> >>> +                          RECONFIGURE, RXQ_RECV, RXQ_LENGTH)  \
> >>>  {                                                             \
> >>>      NAME,                                                     \
> >>>      true,                       /* is_pmd */                  \
> >>> @@ -3649,6 +3670,7 @@ unlock:
> >>>      RXQ_RECV,                                                 \
> >>>      NULL,                       /* rx_wait */                 \
> >>>      NULL,                       /* rxq_drain */               \
> >>> +    RXQ_LENGTH,                                               \
> >>>      NO_OFFLOAD_API                                            \
> >>>  }
> >>>
> >>> @@ -3667,7 +3689,8 @@ static const struct netdev_class dpdk_class =
> >>>          netdev_dpdk_get_features,
> >>>          netdev_dpdk_get_status,
> >>>          netdev_dpdk_reconfigure,
> >>> -        netdev_dpdk_rxq_recv);
> >>> +        netdev_dpdk_rxq_recv,
> >>> +        netdev_dpdk_rxq_length);
> >>>
> >>>  static const struct netdev_class dpdk_ring_class =
> >>>      NETDEV_DPDK_CLASS(
> >>> @@ -3684,7 +3707,8 @@ static const struct netdev_class
> dpdk_ring_class =
> >>>          netdev_dpdk_get_features,
> >>>          netdev_dpdk_get_status,
> >>>          netdev_dpdk_reconfigure,
> >>> -        netdev_dpdk_rxq_recv);
> >>> +        netdev_dpdk_rxq_recv,
> >>> +        NULL);
> >>>
> >>>  static const struct netdev_class dpdk_vhost_class =
> >>>      NETDEV_DPDK_CLASS(
> >>> @@ -3701,7 +3725,8 @@ static const struct netdev_class
> dpdk_vhost_class =
> >>>          NULL,
> >>>          NULL,
> >>>          netdev_dpdk_vhost_reconfigure,
> >>> -        netdev_dpdk_vhost_rxq_recv);
> >>> +        netdev_dpdk_vhost_rxq_recv,
> >>> +        netdev_dpdk_vhost_rxq_length);
> >>>  static const struct netdev_class dpdk_vhost_client_class =
> >>>      NETDEV_DPDK_CLASS(
> >>>          "dpdkvhostuserclient",
> >>> @@ -3717,7 +3742,8 @@ static const struct netdev_class
> dpdk_vhost_client_class =
> >>>          NULL,
> >>>          NULL,
> >>>          netdev_dpdk_vhost_client_reconfigure,
> >>> -        netdev_dpdk_vhost_rxq_recv);
> >>> +        netdev_dpdk_vhost_rxq_recv,
> >>> +        netdev_dpdk_vhost_rxq_length);
> >>>
> >>>  void
> >>>  netdev_dpdk_register(void)
> >>> diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c index
> >>> 4246af3..7e2c0a2 100644
> >>> --- a/lib/netdev-dummy.c
> >>> +++ b/lib/netdev-dummy.c
> >>> @@ -1457,6 +1457,7 @@ netdev_dummy_update_flags(struct netdev
> *netdev_,
> >>>      netdev_dummy_rxq_recv,                                      \
> >>>      netdev_dummy_rxq_wait,                                      \
> >>>      netdev_dummy_rxq_drain,                                     \
> >>> +    NULL,                       /* rxq_length */                \
> >>>                                                                  \
> >>>      NO_OFFLOAD_API                                              \
> >>>  }
> >>> diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index
> >>> 37143b8..8b19890 100644
> >>> --- a/lib/netdev-linux.c
> >>> +++ b/lib/netdev-linux.c
> >>> @@ -2890,6 +2890,7 @@ netdev_linux_update_flags(struct netdev
> *netdev_, enum netdev_flags off,
> >>>      netdev_linux_rxq_recv,                                      \
> >>>      netdev_linux_rxq_wait,                                      \
> >>>      netdev_linux_rxq_drain,                                     \
> >>> +    NULL,                       /* rxq_length */                \
> >>>                                                                  \
> >>>      FLOW_OFFLOAD_API                                            \
> >>>  }
> >>> diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h index
> >>> 25bd671..297644a 100644
> >>> --- a/lib/netdev-provider.h
> >>> +++ b/lib/netdev-provider.h
> >>> @@ -801,6 +801,9 @@ struct netdev_class {
> >>>      /* Discards all packets waiting to be received from 'rx'. */
> >>>      int (*rxq_drain)(struct netdev_rxq *rx);
> >>>
> >>> +    /* Retrieve the number of packets present in an rx queue. */
> >>
> >> Comment should be extended. See below.
> >>
> >> In addition we need to mark here that function is not thread-safe and
> >> could not be used while device reconfiguration.
> >
> > I'm curious: is there any difference regarding thread-safeness and use
> during device configuration compared to the other netdev rxq functions?
> Why would it be necessary to list these constraints for this function but not
> the others?
> 
> Thread safety in general and particulary for netdev_rxq_*() functions
> described in corresponding section at the top of lib/netdev.h . Thread safety
> of netdev_send() in details described near to send() function definition in
> lib/netdev-provider.h because it has it's own thread-safe related argument.
> 
> Comment near to reconfigure() in lib/netdev-provider.h states that we must
> not use rxq_recv() and send() simultaneously with it.
> 
> I think, we should not describe the thread-safety and dependency with
> reconfigure() for rxq_length() in it's own comment, but update comments
> listed above with this new function.
> 
> >
> > And why should this read-only function be a priori thread-unsafe? And in
> which sense: concurrent invocations of this function? Or concurrent
> invocation of this function with rxq_recv() and rxq_drain()?
> 
> At least it could return wrong result in case of concurrent invocation with
> rxq_recv().
> 
> >
> > Should the netdev.h and the netdev_provider.h not rather have a general
> disclaimer that the rxq functions are only to be called from a single thread
> assigned "polling" the rx queue, or alternatively be protected by a lock on the
> netdev user side?
> 
> lib/netdev.h already has general thread-safety disclaimer.
> 
> >
> >>
> >>> +    int (*rxq_length)(struct netdev_rxq *rx);
> >>> +
> >>>      /* ## -------------------------------- ## */
> >>>      /* ## netdev flow offloading functions ## */
> >>>      /* ## -------------------------------- ## */ diff --git
> >>> a/lib/netdev-vport.c b/lib/netdev-vport.c index 478ed90..1e7bc96
> >>> 100644
> >>> --- a/lib/netdev-vport.c
> >>> +++ b/lib/netdev-vport.c
> >>> @@ -944,6 +944,7 @@ netdev_vport_get_ifindex(const struct netdev
> *netdev_)
> >>>      NULL,                   /* rx_recv */                   \
> >>>      NULL,                   /* rx_wait */                   \
> >>>      NULL,                   /* rx_drain */                  \
> >>> +    NULL,                   /* rx_length */                 \
> >>>                                                              \
> >>>      NETDEV_FLOW_OFFLOAD_API
> >>>
> >>> diff --git a/lib/netdev.c b/lib/netdev.c index be05dc6..063c318
> >>> 100644
> >>> --- a/lib/netdev.c
> >>> +++ b/lib/netdev.c
> >>> @@ -724,6 +724,15 @@ netdev_rxq_drain(struct netdev_rxq *rx)
> >>>              : 0);
> >>>  }
> >>>
> >>> +/* Retrieve the number of packets present in an rx queue. */
> >>
> >> Comment should clearly declare what kind of result should be treated
> >> as an error, and what is the result in case of success. You may use
> >> description for 'netdev_get_ifindex' as a reference.
> >> Something like:
> >>
> >> /* Returns the number of packets present in an rx queue, if
> >> successful, as a
> >>  * positive number.  On failure, returns a negative errno value.
> >>  *
> >>  * Some network devices may not implement support for this function.
> >> In such
> >>  * cases this function will always return -EOPNOTSUPP. */
> >
> > OK.
> >>
> >>> +int
> >>> +netdev_rxq_length(struct netdev_rxq *rx) {
> >>> +    return (rx->netdev->netdev_class->rxq_length
> >>> +            ? rx->netdev->netdev_class->rxq_length(rx)
> >>> +            : -1);
> >>> +}
> >>> +
> >>>  /* Configures the number of tx queues of 'netdev'. Returns 0 if
> successful,
> >>>   * otherwise a positive errno value.
> >>>   *
> >>> diff --git a/lib/netdev.h b/lib/netdev.h index ff1b604..edd41b1
> >>> 100644
> >>> --- a/lib/netdev.h
> >>> +++ b/lib/netdev.h
> >>> @@ -178,6 +178,7 @@ int netdev_rxq_get_queue_id(const struct
> >>> netdev_rxq *);  int netdev_rxq_recv(struct netdev_rxq *rx, struct
> >>> dp_packet_batch *);  void netdev_rxq_wait(struct netdev_rxq *);  int
> >>> netdev_rxq_drain(struct netdev_rxq *);
> >>> +int netdev_rxq_length(struct netdev_rxq *rx);
> >>
> >> argument's name not needed here.
> >
> > OK.
> >
> >>
> >>>
> >>>  /* Packet transmission. */
> >>>  int netdev_send(struct netdev *, int qid, struct dp_packet_batch *,
> >>>
Ilya Maximets Jan. 18, 2018, 11:30 a.m. UTC | #5
On 18.01.2018 13:51, O Mahony, Billy wrote:
> 
> 
>> -----Original Message-----
>> From: Ilya Maximets [mailto:i.maximets@samsung.com]
>> Sent: Thursday, January 18, 2018 6:18 AM
>> To: Jan Scheurich <jan.scheurich@ericsson.com>; dev@openvswitch.org
>> Cc: ktraynor@redhat.com; Stokes, Ian <ian.stokes@intel.com>; O Mahony,
>> Billy <billy.o.mahony@intel.com>
>> Subject: Re: [PATCH v7 1/3] netdev: Add rxq callback function rxq_length()
>>
>> On 18.01.2018 02:21, Jan Scheurich wrote:
>>> Thanks for the review. Answers inline.
>>> Regards, Jan
>>>
>>>
>>>> From: Ilya Maximets [mailto:i.maximets@samsung.com]
>>>> Sent: Wednesday, 17 January, 2018 11:47
>>>> Subject: Re: [PATCH v7 1/3] netdev: Add rxq callback function
>>>> rxq_length()
>>>>
>>>> On 16.01.2018 04:51, Jan Scheurich wrote:
>>>>> If implememented, this function returns the number of packets in an
>>>>> rx queue of the netdev. If not implemented, it returns -1.
>>>>
>>>> To be conform with other netdev functions it should return meaningful
>>>> error codes. As 'rte_eth_rx_queue_count' could return different
>>>> errors like -EINVAL or -ENOTSUP, 'netdev_rxq_length' itself should
>>>> return -EOPNOTSUPP if not implemented.
>>>
>>> OK.
>>>
>>>>
>>>>>
>>>>> This function will be used in the upcoming commit for PMD
>>>>> performance metrics to supervise the rx queue fill level for DPDK
>> vhostuser ports.
>>>>>
>>>>> Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
>>>>> ---
>>>>>  lib/netdev-bsd.c      |  1 +
>>>>>  lib/netdev-dpdk.c     | 36 +++++++++++++++++++++++++++++++-----
>>>>>  lib/netdev-dummy.c    |  1 +
>>>>>  lib/netdev-linux.c    |  1 +
>>>>>  lib/netdev-provider.h |  3 +++
>>>>>  lib/netdev-vport.c    |  1 +
>>>>>  lib/netdev.c          |  9 +++++++++
>>>>>  lib/netdev.h          |  1 +
>>>>>  8 files changed, 48 insertions(+), 5 deletions(-)
>>>>>
>>>>> diff --git a/lib/netdev-bsd.c b/lib/netdev-bsd.c index
>>>>> 05974c1..8d1771e 100644
>>>>> --- a/lib/netdev-bsd.c
>>>>> +++ b/lib/netdev-bsd.c
>>>>> @@ -1546,6 +1546,7 @@ netdev_bsd_update_flags(struct netdev
>> *netdev_, enum netdev_flags off,
>>>>>      netdev_bsd_rxq_recv,                             \
>>>>>      netdev_bsd_rxq_wait,                             \
>>>>>      netdev_bsd_rxq_drain,                            \
>>>>> +    NULL, /* rxq_length */                           \
>>>>>                                                       \
>>>>>      NO_OFFLOAD_API                                   \
>>>>>  }
>>>>> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index
>>>>> ccda3fc..4200556 100644
>>>>> --- a/lib/netdev-dpdk.c
>>>>> +++ b/lib/netdev-dpdk.c
>>>>> @@ -1839,6 +1839,27 @@ netdev_dpdk_rxq_recv(struct netdev_rxq
>> *rxq, struct dp_packet_batch *batch)
>>>>>      return 0;
>>>>>  }
>>>>>
>>>>> +static int
>>>>> +netdev_dpdk_vhost_rxq_length(struct netdev_rxq *rxq) {
>>>>> +    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
>>>>> +    int qid = rxq->queue_id;
>>>>> +
>>>>
>>>> We must make all the checks as in rxq_recv() function before calling
>>>> 'rte_vhost_rx_queue_count'. Otherwise we may crash here if device
>>>> will be occasionally disconnected:
>>>>
>>>>     int vid = netdev_dpdk_get_vid(dev);
>>>>
>>>>     if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured
>>>>                      || !(dev->flags & NETDEV_UP))) {
>>>>         return -EAGAIN;
>>>>     }
>>>>
>>>> Not sure about -EAGAIN, but we need to return some negative errno.
>>>
>>> OK. Not necessary for our use case, as it will only be called by the PMD
>> after having received a full batch of 32 packets, but in general I agree those
>> checks are needed.
>>
>> It's necessary because vhost device could be disconnected between
>> rxq_recv() and rxq_length(). In this case we will call
>> rte_vhost_rx_queue_count() with vid == -1. This will produce access to the
>> random memory inside dpdk and likely a segmentation fault.
>>
>> See commit daf22bf7a826 ("netdev-dpdk: Fix calling vhost API with negative
>> vid.") for a example of a similar issue. And I'm taking this opportunity to recall
>> that you should retrieve the vid only once.
> 
>  [[BO'M]] Is there not also the possibility that the vhost device gets disconnected between the call to get_vid() and rxq_recv()? 

You mean disconnect between netdev_dpdk_get_vid(dev) and rte_vhost_dequeue_burst(vid) ?
There is no issue in this case, because 'destroy_device()' will wait for other threads to
quiesce. This means that device structure inside dpdk will not be freed while we're inside
netdev_rxq_recv(). We can safely call any rte_vhost API for the old vid until device not
freed inside dpdk.

> 
> Also, given these required calls to get_vid (which afaik requires some slow memory fencing) wouldn't that argue for the original approach where the rxq len is returned from rxq_recv(). As the call to rxq_length()  would be made once per batch once the queue is not being drained rxq_recv() the overhead could be significant.

I'm not sure (I hope that Jan tested the performance of this version), but I feel that
'rte_vhost_rx_queue_count()' is more heavy operation.

> 
>>
>>>
>>>>
>>>>> +    /* The DPDK API returns a uint32_t which often has invalid bits in the
>>>>> +     * upper 16-bits. Need to restrict the value uint16_t. */
>>>>> +    return rte_vhost_rx_queue_count(netdev_dpdk_get_vid(dev),
>>>>> +                                    qid * VIRTIO_QNUM + VIRTIO_TXQ)
>>>>> +                & UINT16_MAX;
>>>>> +}
>>>>> +
>>>>> +static int
>>>>> +netdev_dpdk_rxq_length(struct netdev_rxq *rxq) {
>>>>> +    struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
>>>>> +
>>>>
>>>> Same here:
>>>>
>>>>     struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
>>>>
>>>>     if (OVS_UNLIKELY(!(dev->flags & NETDEV_UP))) {
>>>>         return -EAGAIN;
>>>>     }
>>>>
>>>>> +    return rte_eth_rx_queue_count(rx->port_id, rxq->queue_id); }
>>>>> +
>>>>>  static inline int
>>>>>  netdev_dpdk_qos_run(struct netdev_dpdk *dev, struct rte_mbuf
>> **pkts,
>>>>>                      int cnt, bool may_steal) @@ -3580,7 +3601,7 @@
>>>>> unlock:
>>>>>                            GET_CARRIER, GET_STATS,			  \
>>>>>                            GET_CUSTOM_STATS,
>> 	  \
>>>>>                            GET_FEATURES, GET_STATUS,           \
>>>>> -                          RECONFIGURE, RXQ_RECV)              \
>>>>> +                          RECONFIGURE, RXQ_RECV, RXQ_LENGTH)  \
>>>>>  {                                                             \
>>>>>      NAME,                                                     \
>>>>>      true,                       /* is_pmd */                  \
>>>>> @@ -3649,6 +3670,7 @@ unlock:
>>>>>      RXQ_RECV,                                                 \
>>>>>      NULL,                       /* rx_wait */                 \
>>>>>      NULL,                       /* rxq_drain */               \
>>>>> +    RXQ_LENGTH,                                               \
>>>>>      NO_OFFLOAD_API                                            \
>>>>>  }
>>>>>
>>>>> @@ -3667,7 +3689,8 @@ static const struct netdev_class dpdk_class =
>>>>>          netdev_dpdk_get_features,
>>>>>          netdev_dpdk_get_status,
>>>>>          netdev_dpdk_reconfigure,
>>>>> -        netdev_dpdk_rxq_recv);
>>>>> +        netdev_dpdk_rxq_recv,
>>>>> +        netdev_dpdk_rxq_length);
>>>>>
>>>>>  static const struct netdev_class dpdk_ring_class =
>>>>>      NETDEV_DPDK_CLASS(
>>>>> @@ -3684,7 +3707,8 @@ static const struct netdev_class
>> dpdk_ring_class =
>>>>>          netdev_dpdk_get_features,
>>>>>          netdev_dpdk_get_status,
>>>>>          netdev_dpdk_reconfigure,
>>>>> -        netdev_dpdk_rxq_recv);
>>>>> +        netdev_dpdk_rxq_recv,
>>>>> +        NULL);
>>>>>
>>>>>  static const struct netdev_class dpdk_vhost_class =
>>>>>      NETDEV_DPDK_CLASS(
>>>>> @@ -3701,7 +3725,8 @@ static const struct netdev_class
>> dpdk_vhost_class =
>>>>>          NULL,
>>>>>          NULL,
>>>>>          netdev_dpdk_vhost_reconfigure,
>>>>> -        netdev_dpdk_vhost_rxq_recv);
>>>>> +        netdev_dpdk_vhost_rxq_recv,
>>>>> +        netdev_dpdk_vhost_rxq_length);
>>>>>  static const struct netdev_class dpdk_vhost_client_class =
>>>>>      NETDEV_DPDK_CLASS(
>>>>>          "dpdkvhostuserclient",
>>>>> @@ -3717,7 +3742,8 @@ static const struct netdev_class
>> dpdk_vhost_client_class =
>>>>>          NULL,
>>>>>          NULL,
>>>>>          netdev_dpdk_vhost_client_reconfigure,
>>>>> -        netdev_dpdk_vhost_rxq_recv);
>>>>> +        netdev_dpdk_vhost_rxq_recv,
>>>>> +        netdev_dpdk_vhost_rxq_length);
>>>>>
>>>>>  void
>>>>>  netdev_dpdk_register(void)
>>>>> diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c index
>>>>> 4246af3..7e2c0a2 100644
>>>>> --- a/lib/netdev-dummy.c
>>>>> +++ b/lib/netdev-dummy.c
>>>>> @@ -1457,6 +1457,7 @@ netdev_dummy_update_flags(struct netdev
>> *netdev_,
>>>>>      netdev_dummy_rxq_recv,                                      \
>>>>>      netdev_dummy_rxq_wait,                                      \
>>>>>      netdev_dummy_rxq_drain,                                     \
>>>>> +    NULL,                       /* rxq_length */                \
>>>>>                                                                  \
>>>>>      NO_OFFLOAD_API                                              \
>>>>>  }
>>>>> diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index
>>>>> 37143b8..8b19890 100644
>>>>> --- a/lib/netdev-linux.c
>>>>> +++ b/lib/netdev-linux.c
>>>>> @@ -2890,6 +2890,7 @@ netdev_linux_update_flags(struct netdev
>> *netdev_, enum netdev_flags off,
>>>>>      netdev_linux_rxq_recv,                                      \
>>>>>      netdev_linux_rxq_wait,                                      \
>>>>>      netdev_linux_rxq_drain,                                     \
>>>>> +    NULL,                       /* rxq_length */                \
>>>>>                                                                  \
>>>>>      FLOW_OFFLOAD_API                                            \
>>>>>  }
>>>>> diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h index
>>>>> 25bd671..297644a 100644
>>>>> --- a/lib/netdev-provider.h
>>>>> +++ b/lib/netdev-provider.h
>>>>> @@ -801,6 +801,9 @@ struct netdev_class {
>>>>>      /* Discards all packets waiting to be received from 'rx'. */
>>>>>      int (*rxq_drain)(struct netdev_rxq *rx);
>>>>>
>>>>> +    /* Retrieve the number of packets present in an rx queue. */
>>>>
>>>> Comment should be extended. See below.
>>>>
>>>> In addition we need to mark here that function is not thread-safe and
>>>> could not be used while device reconfiguration.
>>>
>>> I'm curious: is there any difference regarding thread-safeness and use
>> during device configuration compared to the other netdev rxq functions?
>> Why would it be necessary to list these constraints for this function but not
>> the others?
>>
>> Thread safety in general and particulary for netdev_rxq_*() functions
>> described in corresponding section at the top of lib/netdev.h . Thread safety
>> of netdev_send() in details described near to send() function definition in
>> lib/netdev-provider.h because it has it's own thread-safe related argument.
>>
>> Comment near to reconfigure() in lib/netdev-provider.h states that we must
>> not use rxq_recv() and send() simultaneously with it.
>>
>> I think, we should not describe the thread-safety and dependency with
>> reconfigure() for rxq_length() in it's own comment, but update comments
>> listed above with this new function.
>>
>>>
>>> And why should this read-only function be a priori thread-unsafe? And in
>> which sense: concurrent invocations of this function? Or concurrent
>> invocation of this function with rxq_recv() and rxq_drain()?
>>
>> At least it could return wrong result in case of concurrent invocation with
>> rxq_recv().
>>
>>>
>>> Should the netdev.h and the netdev_provider.h not rather have a general
>> disclaimer that the rxq functions are only to be called from a single thread
>> assigned "polling" the rx queue, or alternatively be protected by a lock on the
>> netdev user side?
>>
>> lib/netdev.h already has general thread-safety disclaimer.
>>
>>>
>>>>
>>>>> +    int (*rxq_length)(struct netdev_rxq *rx);
>>>>> +
>>>>>      /* ## -------------------------------- ## */
>>>>>      /* ## netdev flow offloading functions ## */
>>>>>      /* ## -------------------------------- ## */ diff --git
>>>>> a/lib/netdev-vport.c b/lib/netdev-vport.c index 478ed90..1e7bc96
>>>>> 100644
>>>>> --- a/lib/netdev-vport.c
>>>>> +++ b/lib/netdev-vport.c
>>>>> @@ -944,6 +944,7 @@ netdev_vport_get_ifindex(const struct netdev
>> *netdev_)
>>>>>      NULL,                   /* rx_recv */                   \
>>>>>      NULL,                   /* rx_wait */                   \
>>>>>      NULL,                   /* rx_drain */                  \
>>>>> +    NULL,                   /* rx_length */                 \
>>>>>                                                              \
>>>>>      NETDEV_FLOW_OFFLOAD_API
>>>>>
>>>>> diff --git a/lib/netdev.c b/lib/netdev.c index be05dc6..063c318
>>>>> 100644
>>>>> --- a/lib/netdev.c
>>>>> +++ b/lib/netdev.c
>>>>> @@ -724,6 +724,15 @@ netdev_rxq_drain(struct netdev_rxq *rx)
>>>>>              : 0);
>>>>>  }
>>>>>
>>>>> +/* Retrieve the number of packets present in an rx queue. */
>>>>
>>>> Comment should clearly declare what kind of result should be treated
>>>> as an error, and what is the result in case of success. You may use
>>>> description for 'netdev_get_ifindex' as a reference.
>>>> Something like:
>>>>
>>>> /* Returns the number of packets present in an rx queue, if
>>>> successful, as a
>>>>  * positive number.  On failure, returns a negative errno value.
>>>>  *
>>>>  * Some network devices may not implement support for this function.
>>>> In such
>>>>  * cases this function will always return -EOPNOTSUPP. */
>>>
>>> OK.
>>>>
>>>>> +int
>>>>> +netdev_rxq_length(struct netdev_rxq *rx) {
>>>>> +    return (rx->netdev->netdev_class->rxq_length
>>>>> +            ? rx->netdev->netdev_class->rxq_length(rx)
>>>>> +            : -1);
>>>>> +}
>>>>> +
>>>>>  /* Configures the number of tx queues of 'netdev'. Returns 0 if
>> successful,
>>>>>   * otherwise a positive errno value.
>>>>>   *
>>>>> diff --git a/lib/netdev.h b/lib/netdev.h index ff1b604..edd41b1
>>>>> 100644
>>>>> --- a/lib/netdev.h
>>>>> +++ b/lib/netdev.h
>>>>> @@ -178,6 +178,7 @@ int netdev_rxq_get_queue_id(const struct
>>>>> netdev_rxq *);  int netdev_rxq_recv(struct netdev_rxq *rx, struct
>>>>> dp_packet_batch *);  void netdev_rxq_wait(struct netdev_rxq *);  int
>>>>> netdev_rxq_drain(struct netdev_rxq *);
>>>>> +int netdev_rxq_length(struct netdev_rxq *rx);
>>>>
>>>> argument's name not needed here.
>>>
>>> OK.
>>>
>>>>
>>>>>
>>>>>  /* Packet transmission. */
>>>>>  int netdev_send(struct netdev *, int qid, struct dp_packet_batch *,
>>>>>
Jan Scheurich Jan. 18, 2018, 4:58 p.m. UTC | #6
> >>>
> >>> OK. Not necessary for our use case, as it will only be called by the PMD
> >> after having received a full batch of 32 packets, but in general I agree those
> >> checks are needed.
> >>
> >> It's necessary because vhost device could be disconnected between
> >> rxq_recv() and rxq_length(). In this case we will call
> >> rte_vhost_rx_queue_count() with vid == -1. This will produce access to the
> >> random memory inside dpdk and likely a segmentation fault.
> >>
> >> See commit daf22bf7a826 ("netdev-dpdk: Fix calling vhost API with negative
> >> vid.") for a example of a similar issue. And I'm taking this opportunity to recall
> >> that you should retrieve the vid only once.
> >
> >  [[BO'M]] Is there not also the possibility that the vhost device gets disconnected between the call to get_vid() and rxq_recv()?
> 
> You mean disconnect between netdev_dpdk_get_vid(dev) and rte_vhost_dequeue_burst(vid) ?
> There is no issue in this case, because 'destroy_device()' will wait for other threads to
> quiesce. This means that device structure inside dpdk will not be freed while we're inside
> netdev_rxq_recv(). We can safely call any rte_vhost API for the old vid until device not
> freed inside dpdk.
> 
> >
> > Also, given these required calls to get_vid (which afaik requires some slow memory fencing) wouldn't that argue for the original
> approach where the rxq len is returned from rxq_recv(). As the call to rxq_length()  would be made once per batch once the queue is not
> being drained rxq_recv() the overhead could be significant.
> 
> I'm not sure (I hope that Jan tested the performance of this version), but I feel that
> 'rte_vhost_rx_queue_count()' is more heavy operation.

I have not done any performance tests yet with the new netdev_rxq_length() call after adding the vid and other checks. The actual 'rte_vhost_rx_queue_count()' is very lightweight. So if the memory fencing for get_vid() is expensive it might mean a hit. (Note: The rte_eth_queue_count() functions for physical ixgbe queues was much more costly).

Thinking about it, I would tend to agree with Billy that it seems simpler as well as more accurate and efficient to let the caller provide an optional output parameter "uint32_t *rxq_len" in rqx_recv() if they are interested in the queue fill level and retrieve both in one atomic operation, so that we avoid the duplicate vid checks.

Can we agree on that?

BR, Jan
Billy O'Mahony Jan. 19, 2018, 9:53 a.m. UTC | #7
> -----Original Message-----
> From: Jan Scheurich [mailto:jan.scheurich@ericsson.com]
> Sent: Thursday, January 18, 2018 4:59 PM
> To: Ilya Maximets <i.maximets@samsung.com>; O Mahony, Billy
> <billy.o.mahony@intel.com>; dev@openvswitch.org
> Cc: ktraynor@redhat.com; Stokes, Ian <ian.stokes@intel.com>
> Subject: RE: [PATCH v7 1/3] netdev: Add rxq callback function rxq_length()
> 
> > >>>
> > >>> OK. Not necessary for our use case, as it will only be called by
> > >>> the PMD
> > >> after having received a full batch of 32 packets, but in general I
> > >> agree those checks are needed.
> > >>
> > >> It's necessary because vhost device could be disconnected between
> > >> rxq_recv() and rxq_length(). In this case we will call
> > >> rte_vhost_rx_queue_count() with vid == -1. This will produce access
> > >> to the random memory inside dpdk and likely a segmentation fault.
> > >>
> > >> See commit daf22bf7a826 ("netdev-dpdk: Fix calling vhost API with
> > >> negative
> > >> vid.") for a example of a similar issue. And I'm taking this
> > >> opportunity to recall that you should retrieve the vid only once.
> > >
> > >  [[BO'M]] Is there not also the possibility that the vhost device gets
> disconnected between the call to get_vid() and rxq_recv()?
> >
> > You mean disconnect between netdev_dpdk_get_vid(dev) and
> rte_vhost_dequeue_burst(vid) ?
> > There is no issue in this case, because 'destroy_device()' will wait
> > for other threads to quiesce. This means that device structure inside
> > dpdk will not be freed while we're inside netdev_rxq_recv(). We can
> > safely call any rte_vhost API for the old vid until device not freed inside
> dpdk.
> >
> > >
> > > Also, given these required calls to get_vid (which afaik requires
> > > some slow memory fencing) wouldn't that argue for the original
> > approach where the rxq len is returned from rxq_recv(). As the call to
> > rxq_length()  would be made once per batch once the queue is not being
> drained rxq_recv() the overhead could be significant.
> >
> > I'm not sure (I hope that Jan tested the performance of this version),
> > but I feel that 'rte_vhost_rx_queue_count()' is more heavy operation.
> 
> I have not done any performance tests yet with the new
> netdev_rxq_length() call after adding the vid and other checks. The actual
> 'rte_vhost_rx_queue_count()' is very lightweight. So if the memory fencing
> for get_vid() is expensive it might mean a hit. (Note: The
> rte_eth_queue_count() functions for physical ixgbe queues was much more
> costly).
> 
> Thinking about it, I would tend to agree with Billy that it seems simpler as well
> as more accurate and efficient to let the caller provide an optional output
> parameter "uint32_t *rxq_len" in rqx_recv() if they are interested in the
> queue fill level and retrieve both in one atomic operation, so that we avoid
> the duplicate vid checks.
[[BO'M]] I like the idea of supplying the pointer which makes it optional and so gets around other issues we discussed - such as a client may only be interested in rxq len for certain netdev types or may know a priori that it's an expensive operation for other netdev types. 
> 
> Can we agree on that?
> 
> BR, Jan
Jan Scheurich Jan. 19, 2018, 10:16 a.m. UTC | #8
> > > > Also, given these required calls to get_vid (which afaik requires
> > > > some slow memory fencing) wouldn't that argue for the original
> > > approach where the rxq len is returned from rxq_recv(). As the call to
> > > rxq_length()  would be made once per batch once the queue is not being
> > drained rxq_recv() the overhead could be significant.
> > >
> > > I'm not sure (I hope that Jan tested the performance of this version),
> > > but I feel that 'rte_vhost_rx_queue_count()' is more heavy operation.
> >
> > I have not done any performance tests yet with the new
> > netdev_rxq_length() call after adding the vid and other checks. The actual
> > 'rte_vhost_rx_queue_count()' is very lightweight. So if the memory fencing
> > for get_vid() is expensive it might mean a hit. (Note: The
> > rte_eth_queue_count() functions for physical ixgbe queues was much more
> > costly).
> >
> > Thinking about it, I would tend to agree with Billy that it seems simpler as well
> > as more accurate and efficient to let the caller provide an optional output
> > parameter "uint32_t *rxq_len" in rqx_recv() if they are interested in the
> > queue fill level and retrieve both in one atomic operation, so that we avoid
> > the duplicate vid checks.
> [[BO'M]] I like the idea of supplying the pointer which makes it optional and so gets around other issues we discussed - such as a client
> may only be interested in rxq len for certain netdev types or may know a priori that it's an expensive operation for other netdev types.

Thanks. I am preparing v8 with this change. The optional output parameter will return the remaining
queue length after receiving the packet batch. That seems a meaningful metric for the rxq_recv() API
as it would also allow the caller to decide whether or not to continue polling (e.g. in the context
of the priority polling scheme).

BR, Jan

> >
> > Can we agree on that?
> >
> > BR, Jan
diff mbox series

Patch

diff --git a/lib/netdev-bsd.c b/lib/netdev-bsd.c
index 05974c1..8d1771e 100644
--- a/lib/netdev-bsd.c
+++ b/lib/netdev-bsd.c
@@ -1546,6 +1546,7 @@  netdev_bsd_update_flags(struct netdev *netdev_, enum netdev_flags off,
     netdev_bsd_rxq_recv,                             \
     netdev_bsd_rxq_wait,                             \
     netdev_bsd_rxq_drain,                            \
+    NULL, /* rxq_length */                           \
                                                      \
     NO_OFFLOAD_API                                   \
 }
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index ccda3fc..4200556 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -1839,6 +1839,27 @@  netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch)
     return 0;
 }
 
+static int
+netdev_dpdk_vhost_rxq_length(struct netdev_rxq *rxq)
+{
+    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
+    int qid = rxq->queue_id;
+
+    /* The DPDK API returns a uint32_t which often has invalid bits in the
+     * upper 16-bits. Need to restrict the value uint16_t. */
+    return rte_vhost_rx_queue_count(netdev_dpdk_get_vid(dev),
+                                    qid * VIRTIO_QNUM + VIRTIO_TXQ)
+                & UINT16_MAX;
+}
+
+static int
+netdev_dpdk_rxq_length(struct netdev_rxq *rxq)
+{
+    struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
+
+    return rte_eth_rx_queue_count(rx->port_id, rxq->queue_id);
+}
+
 static inline int
 netdev_dpdk_qos_run(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
                     int cnt, bool may_steal)
@@ -3580,7 +3601,7 @@  unlock:
                           GET_CARRIER, GET_STATS,			  \
                           GET_CUSTOM_STATS,					  \
                           GET_FEATURES, GET_STATUS,           \
-                          RECONFIGURE, RXQ_RECV)              \
+                          RECONFIGURE, RXQ_RECV, RXQ_LENGTH)  \
 {                                                             \
     NAME,                                                     \
     true,                       /* is_pmd */                  \
@@ -3649,6 +3670,7 @@  unlock:
     RXQ_RECV,                                                 \
     NULL,                       /* rx_wait */                 \
     NULL,                       /* rxq_drain */               \
+    RXQ_LENGTH,                                               \
     NO_OFFLOAD_API                                            \
 }
 
@@ -3667,7 +3689,8 @@  static const struct netdev_class dpdk_class =
         netdev_dpdk_get_features,
         netdev_dpdk_get_status,
         netdev_dpdk_reconfigure,
-        netdev_dpdk_rxq_recv);
+        netdev_dpdk_rxq_recv,
+        netdev_dpdk_rxq_length);
 
 static const struct netdev_class dpdk_ring_class =
     NETDEV_DPDK_CLASS(
@@ -3684,7 +3707,8 @@  static const struct netdev_class dpdk_ring_class =
         netdev_dpdk_get_features,
         netdev_dpdk_get_status,
         netdev_dpdk_reconfigure,
-        netdev_dpdk_rxq_recv);
+        netdev_dpdk_rxq_recv,
+        NULL);
 
 static const struct netdev_class dpdk_vhost_class =
     NETDEV_DPDK_CLASS(
@@ -3701,7 +3725,8 @@  static const struct netdev_class dpdk_vhost_class =
         NULL,
         NULL,
         netdev_dpdk_vhost_reconfigure,
-        netdev_dpdk_vhost_rxq_recv);
+        netdev_dpdk_vhost_rxq_recv,
+        netdev_dpdk_vhost_rxq_length);
 static const struct netdev_class dpdk_vhost_client_class =
     NETDEV_DPDK_CLASS(
         "dpdkvhostuserclient",
@@ -3717,7 +3742,8 @@  static const struct netdev_class dpdk_vhost_client_class =
         NULL,
         NULL,
         netdev_dpdk_vhost_client_reconfigure,
-        netdev_dpdk_vhost_rxq_recv);
+        netdev_dpdk_vhost_rxq_recv,
+        netdev_dpdk_vhost_rxq_length);
 
 void
 netdev_dpdk_register(void)
diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c
index 4246af3..7e2c0a2 100644
--- a/lib/netdev-dummy.c
+++ b/lib/netdev-dummy.c
@@ -1457,6 +1457,7 @@  netdev_dummy_update_flags(struct netdev *netdev_,
     netdev_dummy_rxq_recv,                                      \
     netdev_dummy_rxq_wait,                                      \
     netdev_dummy_rxq_drain,                                     \
+    NULL,                       /* rxq_length */                \
                                                                 \
     NO_OFFLOAD_API                                              \
 }
diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index 37143b8..8b19890 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -2890,6 +2890,7 @@  netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
     netdev_linux_rxq_recv,                                      \
     netdev_linux_rxq_wait,                                      \
     netdev_linux_rxq_drain,                                     \
+    NULL,                       /* rxq_length */                \
                                                                 \
     FLOW_OFFLOAD_API                                            \
 }
diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h
index 25bd671..297644a 100644
--- a/lib/netdev-provider.h
+++ b/lib/netdev-provider.h
@@ -801,6 +801,9 @@  struct netdev_class {
     /* Discards all packets waiting to be received from 'rx'. */
     int (*rxq_drain)(struct netdev_rxq *rx);
 
+    /* Retrieve the number of packets present in an rx queue. */
+    int (*rxq_length)(struct netdev_rxq *rx);
+
     /* ## -------------------------------- ## */
     /* ## netdev flow offloading functions ## */
     /* ## -------------------------------- ## */
diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c
index 478ed90..1e7bc96 100644
--- a/lib/netdev-vport.c
+++ b/lib/netdev-vport.c
@@ -944,6 +944,7 @@  netdev_vport_get_ifindex(const struct netdev *netdev_)
     NULL,                   /* rx_recv */                   \
     NULL,                   /* rx_wait */                   \
     NULL,                   /* rx_drain */                  \
+    NULL,                   /* rx_length */                 \
                                                             \
     NETDEV_FLOW_OFFLOAD_API
 
diff --git a/lib/netdev.c b/lib/netdev.c
index be05dc6..063c318 100644
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -724,6 +724,15 @@  netdev_rxq_drain(struct netdev_rxq *rx)
             : 0);
 }
 
+/* Retrieve the number of packets present in an rx queue. */
+int
+netdev_rxq_length(struct netdev_rxq *rx)
+{
+    return (rx->netdev->netdev_class->rxq_length
+            ? rx->netdev->netdev_class->rxq_length(rx)
+            : -1);
+}
+
 /* Configures the number of tx queues of 'netdev'. Returns 0 if successful,
  * otherwise a positive errno value.
  *
diff --git a/lib/netdev.h b/lib/netdev.h
index ff1b604..edd41b1 100644
--- a/lib/netdev.h
+++ b/lib/netdev.h
@@ -178,6 +178,7 @@  int netdev_rxq_get_queue_id(const struct netdev_rxq *);
 int netdev_rxq_recv(struct netdev_rxq *rx, struct dp_packet_batch *);
 void netdev_rxq_wait(struct netdev_rxq *);
 int netdev_rxq_drain(struct netdev_rxq *);
+int netdev_rxq_length(struct netdev_rxq *rx);
 
 /* Packet transmission. */
 int netdev_send(struct netdev *, int qid, struct dp_packet_batch *,