
[ovs-dev,5/6] netdev-dpdk: Add netdev_dpdk_vhost_txq_flush function.

Message ID 1497815785-47222-6-git-send-email-bhanuprakash.bodireddy@intel.com
State Superseded

Commit Message

Bodireddy, Bhanuprakash June 18, 2017, 7:56 p.m. UTC
Add netdev_dpdk_vhost_txq_flush(), which flushes packets queued on vHost User
port queues. Also add netdev_dpdk_vhost_tx_burst(), which
uses rte_vhost_enqueue_burst() to enqueue a burst of packets on vHost User
ports.

Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>
Signed-off-by: Antonio Fischetti <antonio.fischetti@intel.com>
Co-authored-by: Antonio Fischetti <antonio.fischetti@intel.com>
Acked-by: Eelco Chaudron <echaudro@redhat.com>
---
 lib/netdev-dpdk.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 72 insertions(+), 4 deletions(-)

Comments

Ilya Maximets June 23, 2017, 12:10 p.m. UTC | #1
> Add netdev_dpdk_vhost_txq_flush(), which flushes packets queued on vHost User
> port queues. Also add netdev_dpdk_vhost_tx_burst(), which
> uses rte_vhost_enqueue_burst() to enqueue a burst of packets on vHost User
> ports.
> 
> Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>
> Signed-off-by: Antonio Fischetti <antonio.fischetti@intel.com>
> Co-authored-by: Antonio Fischetti <antonio.fischetti@intel.com>
> Acked-by: Eelco Chaudron <echaudro@redhat.com>
> ---
>  lib/netdev-dpdk.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 72 insertions(+), 4 deletions(-)
> 
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index 50a9a2c..47343e8 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -307,12 +307,22 @@ struct dpdk_tx_queue {
>                                      * pmd threads (see 'concurrent_txq'). */
>      int map;                       /* Mapping of configured vhost-user queues
>                                      * to enabled by guest. */
> -    int dpdk_pkt_cnt;              /* Number of buffered packets waiting to
> +    union {
> +        int dpdk_pkt_cnt;          /* Number of buffered packets waiting to
>                                        be sent on DPDK tx queue. */
> -    struct rte_mbuf *dpdk_burst_pkts[INTERIM_QUEUE_BURST_THRESHOLD];
> +        int vhost_pkt_cnt;         /* Number of buffered packets waiting to
> +                                      be sent on vhost port. */
> +    };
> +
> +    union {
> +        struct rte_mbuf *dpdk_burst_pkts[INTERIM_QUEUE_BURST_THRESHOLD];
>                                     /* Intermediate queue where packets can
>                                      * be buffered to amortize the cost of MMIO
>                                      * writes. */
> +        struct dp_packet *vhost_burst_pkts[INTERIM_QUEUE_BURST_THRESHOLD];
> +                                   /* Intermediate queue where packets can
> +                                    * be buffered for vhost ports. */
> +    };
>  };
>  
>  /* dpdk has no way to remove dpdk ring ethernet devices
> @@ -1719,6 +1729,63 @@ netdev_dpdk_vhost_update_tx_counters(struct netdev_stats *stats,
>      }
>  }
>  
> +static int
> +netdev_dpdk_vhost_tx_burst(struct netdev_dpdk *dev, int qid)
> +{
> +    struct dpdk_tx_queue *txq = &dev->tx_q[qid];
> +    struct rte_mbuf **cur_pkts = (struct rte_mbuf **)txq->vhost_burst_pkts;
> +
> +    int tx_vid = netdev_dpdk_get_vid(dev);
> +    int tx_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
> +    uint32_t sent = 0;
> +    uint32_t retries = 0;
> +    uint32_t sum, total_pkts;
> +
> +    total_pkts = sum = txq->vhost_pkt_cnt;
> +    do {
> +        uint32_t ret;
> +        ret = rte_vhost_enqueue_burst(tx_vid, tx_qid, &cur_pkts[sent], sum);
> +        if (OVS_UNLIKELY(!ret)) {
> +            /* No packets enqueued - do not retry. */
> +            break;
> +        } else {
> +            /* Packets have been sent. */
> +            sent += ret;
> +
> +            /* 'sum' packets have to be retransmitted. */
> +            sum -= ret;
> +        }
> +    } while (sum && (retries++ < VHOST_ENQ_RETRY_NUM));
> +
> +    for (int i = 0; i < total_pkts; i++) {
> +        dp_packet_delete(txq->vhost_burst_pkts[i]);
> +    }
> +
> +    /* Reset pkt count */
> +    txq->vhost_pkt_cnt = 0;
> +
> +    /* 'sum' refers to packets dropped */
> +    return sum;
> +}
> +
> +/* Flush the txq if there are any packets available.
> + * dynamic_txqs/concurrent_txq is disabled for vHost User ports as
> + * 'OVS_VHOST_MAX_QUEUE_NUM' txqs are preallocated.
> + */

This comment is completely untrue. You may ignore 'concurrent_txq'
because you *must* lock the queue in any case because of dynamic
txq remapping inside netdev-dpdk. You must take the spinlock for
the 'dev->tx_q[qid % netdev->n_txq].map' before sending packets.

In current implementation you're able to call send and flush
simultaneously for the same queue from different threads because
'flush' doesn't care about queue remapping.

See '__netdev_dpdk_vhost_send' and 'netdev_dpdk_remap_txqs' for
detail.
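
For reference, the send path does roughly the following (a paraphrased sketch
of '__netdev_dpdk_vhost_send' from this era of lib/netdev-dpdk.c, trimmed to
the mapping and locking logic; not the verbatim source):

--------------------__netdev_dpdk_vhost_send() (paraphrased sketch)--------------------
static void
__netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
                         struct dp_packet **pkts, int cnt)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    /* Remap the configured txq to a queue actually enabled by the guest. */
    qid = dev->tx_q[qid % netdev->n_txq].map;

    if (OVS_UNLIKELY(!is_vhost_running(dev) || qid < 0
                     || !(dev->flags & NETDEV_UP))) {
        /* Device or queue unusable: account the packets as drops. */
        rte_spinlock_lock(&dev->stats_lock);
        dev->stats.tx_dropped += cnt;
        rte_spinlock_unlock(&dev->stats_lock);
        goto out;
    }

    /* Lock unconditionally: 'map' can be changed by vring_state_changed()
     * at any time, so two pmd threads may land on the same underlying
     * queue regardless of 'concurrent_txq'. */
    rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
    /* ... rte_vhost_enqueue_burst() retry loop elided ... */
    rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);

out:
    /* Enqueueing copies the packets into guest memory, so the originals
     * are always freed here. */
    for (int i = 0; i < cnt; i++) {
        dp_packet_delete(pkts[i]);
    }
}
----------------------------------------------------------------------------------------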

Additionally, the flushing logic will be broken in case of txq remapping,
because you may have a different underlying queue each time you
try to send or flush.

Have you ever tested this with multiqueue vhost?
With disabling/enabling queues inside the guest?

Best regards, Ilya Maximets.

> +static int
> +netdev_dpdk_vhost_txq_flush(struct netdev *netdev, int qid,
> +                            bool concurrent_txq OVS_UNUSED)
> +{
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +    struct dpdk_tx_queue *txq = &dev->tx_q[qid];
> +
> +    if (OVS_LIKELY(txq->vhost_pkt_cnt)) {
> +        netdev_dpdk_vhost_tx_burst(dev, qid);
> +    }
> +
> +    return 0;
> +}
> +
>  static void
>  __netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
>                           struct dp_packet **pkts, int cnt)
> @@ -3432,7 +3499,8 @@ static const struct netdev_class dpdk_vhost_class =
>          NULL,
>          netdev_dpdk_vhost_reconfigure,
>          netdev_dpdk_vhost_rxq_recv,
> -        NULL);
> +        netdev_dpdk_vhost_txq_flush);
> +
>  static const struct netdev_class dpdk_vhost_client_class =
>      NETDEV_DPDK_CLASS(
>          "dpdkvhostuserclient",
> @@ -3448,7 +3516,7 @@ static const struct netdev_class dpdk_vhost_client_class =
>          NULL,
>          netdev_dpdk_vhost_client_reconfigure,
>          netdev_dpdk_vhost_rxq_recv,
> -        NULL);
> +        netdev_dpdk_vhost_txq_flush);
>  
>  void
>  netdev_dpdk_register(void)
> -- 
> 2.4.11
Bodireddy, Bhanuprakash June 25, 2017, 9:52 p.m. UTC | #2
>> +
>> +/* Flush the txq if there are any packets available.
>> + * dynamic_txqs/concurrent_txq is disabled for vHost User ports as
>> + * 'OVS_VHOST_MAX_QUEUE_NUM' txqs are preallocated.
>> + */
>
>This comment is completely untrue. You may ignore 'concurrent_txq'
>because you *must* lock the queue in any case because of dynamic txq
>remapping inside netdev-dpdk. You must take the spinlock for the
>'dev->tx_q[qid % netdev->n_txq].map' before sending packets.

Thanks for catching this; the lock should be taken before flushing the queue. Below is the new logic with spinlocks.

/* Flush the txq if there are any packets available. */
static int
netdev_dpdk_vhost_txq_flush(struct netdev *netdev, int qid,
                            bool concurrent_txq OVS_UNUSED)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct dpdk_tx_queue *txq;

    qid = dev->tx_q[qid % netdev->n_txq].map;

    txq = &dev->tx_q[qid];
    if (OVS_LIKELY(txq->vhost_pkt_cnt)) {
        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
        netdev_dpdk_vhost_tx_burst(dev, qid);
        rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
    }

    return 0;
}

>
>In current implementation you're able to call send and flush simultaneously
>for the same queue from different threads because 'flush' doesn't care about
>queue remapping.
>
>See '__netdev_dpdk_vhost_send' and 'netdev_dpdk_remap_txqs' for detail.
>
>Additionally, the flushing logic will be broken in case of txq remapping,
>because you may have a different underlying queue each time you try to send
>or flush.

I remember you raised this point earlier. To handle this case, 'last_used_qid' was introduced in tx_port.
With this we can track any change in qid and make sure the packets are flushed on the old queue. This logic is in patch 3/6 of this series.
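
For reference, the idea from patch 3/6 looks roughly like this (a paraphrased
sketch; 'last_used_qid' is the real field, but the surrounding names and the
'netdev_txq_flush' call are approximations of that patch, not verbatim code):

----------------------------dpif-netdev.c (paraphrased sketch)----------------------------
struct tx_port {
    struct dp_netdev_port *port;
    int qid;                    /* Queue id used for transmission (XPS). */
    int last_used_qid;          /* Queue id packets were last enqueued on,
                                 * or -1 if nothing was enqueued yet. */
    long long last_used;
    struct hmap_node node;
};

/* On the send path, before enqueueing on a (possibly remapped) tx queue: */
if (p->last_used_qid != -1 && p->last_used_qid != tx_qid) {
    /* XPS assigned a different queue: flush whatever is still buffered
     * on the old one so those packets don't get stuck. */
    netdev_txq_flush(p->port->netdev, p->last_used_qid, concurrent_txq);
}
p->last_used_qid = tx_qid;
--------------------------------------------------------------------------------------------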

>
>Have you ever tested this with multiqueue vhost?
>With disabling/enabling queues inside the guest?

I did basic sanity testing with vhost multiqueue to verify throughput and to check that the flush logic works at a low rate (1 packet at a time sent to each VM) on multiple VMs.
When you say queue 'enable/disable' in the guest, are you referring to using 'ethtool -L <iface> combined <channel no>'?
If so, I did this by configuring 5 rxqs (DPDK and vhost User) and changing the channel numbers, and verified with the testpmd app again for the flushing scenarios. I didn't test with kernel forwarding here.

Regards,
Bhanuprakash.
Ilya Maximets June 26, 2017, 5:40 a.m. UTC | #3
On 26.06.2017 00:52, Bodireddy, Bhanuprakash wrote:
>>> +
>>> +/* Flush the txq if there are any packets available.
>>> + * dynamic_txqs/concurrent_txq is disabled for vHost User ports as
>>> + * 'OVS_VHOST_MAX_QUEUE_NUM' txqs are preallocated.
>>> + */
>>
>> This comment is completely untrue. You may ignore 'concurrent_txq'
>> because you *must* lock the queue in any case because of dynamic txq
>> remapping inside netdev-dpdk. You must take the spinlock for the
>> 'dev->tx_q[qid % netdev->n_txq].map' before sending packets.
> 
> Thanks for catching this; the lock should be taken before flushing the queue. Below is the new logic with spinlocks.
> 
> /* Flush the txq if there are any packets available. */
> static int
> netdev_dpdk_vhost_txq_flush(struct netdev *netdev, int qid,
>                             bool concurrent_txq OVS_UNUSED)
> {
>     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>     struct dpdk_tx_queue *txq;
> 
>     qid = dev->tx_q[qid % netdev->n_txq].map;
> 
>     txq = &dev->tx_q[qid];
>     if (OVS_LIKELY(txq->vhost_pkt_cnt)) {
>         rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
>         netdev_dpdk_vhost_tx_burst(dev, qid);
>         rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
>     }
> 
>     return 0;
> }
> 
>>
>> In current implementation you're able to call send and flush simultaneously
>> for the same queue from different threads because 'flush' doesn't care about
>> queue remapping.
>>
>> See '__netdev_dpdk_vhost_send' and 'netdev_dpdk_remap_txqs' for detail.
>>
>> Additionally, the flushing logic will be broken in case of txq remapping,
>> because you may have a different underlying queue each time you try to send
>> or flush.
> 
> I remember you raised this point earlier. To handle this case, 'last_used_qid' was introduced in tx_port.
> With this we can track any change in qid and make sure the packets are flushed on the old queue.
> This logic is in patch 3/6 of this series.

I'm talking about txq remapping inside netdev-dpdk, not about XPS.
You're trying to flush 'qid = tx_q[...].map', but the 'map'
value can change at any time because of enabling/disabling
vrings inside the guest. Refer to the 'vring_state_changed()'
callback, which triggers 'netdev_dpdk_remap_txqs()', as I already
mentioned. It's actually the reason why we're using unconditional
locking for vhost-user ports, ignoring the 'concurrent_txq' value.

>> Have you ever tested this with multiqueue vhost?
>> With disabling/enabling queues inside the guest?
> 
> I did basic sanity testing with vhost multiqueue to verify throughput and to check that the flush logic works at a low rate (1 packet at a time sent to each VM) on multiple VMs.
> When you say queue 'enable/disable' in the guest, are you referring to using 'ethtool -L <iface> combined <channel no>'?
> If so, I did this by configuring 5 rxqs (DPDK and vhost User) and changing the channel numbers, and verified with the testpmd app again for the flushing scenarios. I didn't test with kernel forwarding here.
> 
> Regards,
> Bhanuprakash.
>
Bodireddy, Bhanuprakash June 27, 2017, 8:31 p.m. UTC | #4
>On 26.06.2017 00:52, Bodireddy, Bhanuprakash wrote:
>>>> +
>>>> +/* Flush the txq if there are any packets available.
>>>> + * dynamic_txqs/concurrent_txq is disabled for vHost User ports as
>>>> + * 'OVS_VHOST_MAX_QUEUE_NUM' txqs are preallocated.
>>>> + */
>>>
>>> This comment is completely untrue. You may ignore 'concurrent_txq'
>>> because you *must* lock the queue in any case because of dynamic txq
>>> remapping inside netdev-dpdk. You must take the spinlock for the
>>> 'dev->tx_q[qid % netdev->n_txq].map' before sending packets.
>>
>> Thanks for catching this; the lock should be taken before flushing the
>> queue. Below is the new logic with spinlocks.
>>
>> /* Flush the txq if there are any packets available. */
>> static int
>> netdev_dpdk_vhost_txq_flush(struct netdev *netdev, int qid,
>>                             bool concurrent_txq OVS_UNUSED)
>> {
>>     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>>     struct dpdk_tx_queue *txq;
>>
>>     qid = dev->tx_q[qid % netdev->n_txq].map;
>>
>>     txq = &dev->tx_q[qid];
>>     if (OVS_LIKELY(txq->vhost_pkt_cnt)) {
>>         rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
>>         netdev_dpdk_vhost_tx_burst(dev, qid);
>>         rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
>>     }
>>
>>     return 0;
>> }
>>
>>>
>>> In current implementation you're able to call send and flush
>>> simultaneously for the same queue from different threads because
>>> 'flush' doesn't care about queue remapping.
>>>
>>> See '__netdev_dpdk_vhost_send' and 'netdev_dpdk_remap_txqs' for
>detail.
>>>
>>> Additionally, the flushing logic will be broken in case of txq remapping,
>>> because you may have a different underlying queue each time you try
>>> to send or flush.
>>
>> I remember you raised this point earlier. To handle this case, 'last_used_qid'
>> was introduced in tx_port.
>> With this we can track any change in qid and make sure the packets are
>> flushed on the old queue.
>> This logic is in patch 3/6 of this series.
>
>I'm talking about txq remapping inside netdev-dpdk, not about XPS.
>You're trying to flush 'qid = tx_q[...].map', but the 'map'
>value can change at any time because of enabling/disabling vrings inside
>the guest. Refer to the 'vring_state_changed()' callback, which triggers
>'netdev_dpdk_remap_txqs()', as I already mentioned.
>It's actually the reason why we're using unconditional locking for vhost-user
>ports, ignoring the 'concurrent_txq' value.

I spent some time looking at the above mentioned functions and see that 'qid = tx_q[...].map' is updated by the vhost thread as part of the callback function. This gets triggered only when queues are enabled/disabled in the guest. I was using 'ethtool -L' in the guest for testing this.
As you rightly mentioned, the flushing logic breaks and the packets may potentially get stuck in the queue. During my testing I found a corner case that poses a problem for the patch (steps in order):

- Multiqueue is enabled on the guest with 2 rx,tx queues.
- In the PVP case, traffic is sent to the VM and the packets are sent on 'queue 1' of the vHostUser port.
- The PMD thread keeps queueing packets on the vhostuser port queue [using netdev_dpdk_vhost_send()].
- Meanwhile the user changes the queue configuration using 'ethtool -L ens3 combined 1', enabling only one queue and disabling the other.
- The callback function gets triggered and disables queue 1.
- 'tx_q[...].map' is updated to '0' [queue 0] - the default queue.

- Meanwhile the PMD thread reads the map, finds that tx_q[..].map == '0', and flushes the packets in queue 0.
This is the problem, as the packets enqueued earlier on queue 1 were not flushed.

How about the below fix?
- Before disabling a queue (qid = OVS_VHOST_QUEUE_DISABLED), flush the packets remaining in that queue from vring_state_changed().

--------------------------vring_state_changed()------------------------------------
if (strncmp(ifname, dev->vhost_id, IF_NAME_SZ) == 0) {
            if (enable) {
                dev->tx_q[qid].map = qid;
            } else {
                /* If the queue is disabled in the guest, the corresponding qid
                 * map should be set to OVS_VHOST_QUEUE_DISABLED(-2).
                 *
                 * The packets that were queued in 'qid' can be potentially
                 * stuck and should be flushed before it is disabled.
                 */
                netdev_dpdk_vhost_txq_flush(&dev->up, dev->tx_q[qid].map, 0);
                dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
            }
            netdev_dpdk_remap_txqs(dev);
            exists = true;
            ovs_mutex_unlock(&dev->mutex);
            break;
        }
        ovs_mutex_unlock(&dev->mutex);
----------------------------------------------------------------------------------------------

Also update the txq_flush function to return immediately if the queue is disabled (qid == OVS_VHOST_QUEUE_DISABLED):

-----------------------------netdev_dpdk_vhost_txq_flush()-----------------------------
netdev_dpdk_vhost_txq_flush(struct netdev *netdev, int qid,
                            bool concurrent_txq OVS_UNUSED)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct dpdk_tx_queue *txq;

    qid = dev->tx_q[qid % netdev->n_txq].map;

    /* The queue may have been disabled in the guest and its qid set to
     * OVS_VHOST_QUEUE_DISABLED. */
    if (OVS_UNLIKELY(qid < 0)) {
        return 0;
    }

    txq = &dev->tx_q[qid];
    if (OVS_LIKELY(txq->vhost_pkt_cnt)) {
        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
        netdev_dpdk_vhost_tx_burst(dev, qid);
        rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
    }

    return 0;
}
--------------------------------------------------------------------------------------------------

Regards,
Bhanuprakash.

>
>>> Have you ever tested this with multiqueue vhost?
>>> With disabling/enabling queues inside the guest?
>>
>> I did basic sanity testing with vhost multiqueue to verify throughput and to
>> check that the flush logic works at a low rate (1 packet at a time sent to
>> each VM) on multiple VMs.
>> When you say queue 'enable/disable' in the guest, are you referring to using
>> 'ethtool -L <iface> combined <channel no>'?
>> If so, I did this by configuring 5 rxqs (DPDK and vhost User) and changing
>> the channel numbers, and verified with the testpmd app again for the flushing
>> scenarios. I didn't test with kernel forwarding here.
>>
>> Regards,
>> Bhanuprakash.
>>
Ilya Maximets June 28, 2017, 7:10 a.m. UTC | #5
On 27.06.2017 23:31, Bodireddy, Bhanuprakash wrote:
>> On 26.06.2017 00:52, Bodireddy, Bhanuprakash wrote:
>>>>> +
>>>>> +/* Flush the txq if there are any packets available.
>>>>> + * dynamic_txqs/concurrent_txq is disabled for vHost User ports as
>>>>> + * 'OVS_VHOST_MAX_QUEUE_NUM' txqs are preallocated.
>>>>> + */
>>>>
>>>> This comment is completely untrue. You may ignore 'concurrent_txq'
>>>> because you *must* lock the queue in any case because of dynamic txq
>>>> remapping inside netdev-dpdk. You must take the spinlock for the
>>>> 'dev->tx_q[qid % netdev->n_txq].map' before sending packets.
>>>
>>> Thanks for catching this; the lock should be taken before flushing the
>>> queue. Below is the new logic with spinlocks.
>>>
>>> /* Flush the txq if there are any packets available. */
>>> static int
>>> netdev_dpdk_vhost_txq_flush(struct netdev *netdev, int qid,
>>>                             bool concurrent_txq OVS_UNUSED)
>>> {
>>>     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>>>     struct dpdk_tx_queue *txq;
>>>
>>>     qid = dev->tx_q[qid % netdev->n_txq].map;
>>>
>>>     txq = &dev->tx_q[qid];
>>>     if (OVS_LIKELY(txq->vhost_pkt_cnt)) {
>>>         rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
>>>         netdev_dpdk_vhost_tx_burst(dev, qid);
>>>         rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
>>>     }
>>>
>>>     return 0;
>>> }
>>>
>>>>
>>>> In current implementation you're able to call send and flush
>>>> simultaneously for the same queue from different threads because
>>>> 'flush' doesn't care about queue remapping.
>>>>
>>>> See '__netdev_dpdk_vhost_send' and 'netdev_dpdk_remap_txqs' for
>> detail.
>>>>
>>>> Additionally, the flushing logic will be broken in case of txq remapping,
>>>> because you may have a different underlying queue each time you try
>>>> to send or flush.
>>>
>>> I remember you raised this point earlier. To handle this case, 'last_used_qid'
>>> was introduced in tx_port.
>>> With this we can track any change in qid and make sure the packets are
>>> flushed on the old queue.
>>> This logic is in patch 3/6 of this series.
>>
>> I'm talking about txq remapping inside netdev-dpdk, not about XPS.
>> You're trying to flush 'qid = tx_q[...].map', but the 'map'
>> value can change at any time because of enabling/disabling vrings inside
>> the guest. Refer to the 'vring_state_changed()' callback, which triggers
>> 'netdev_dpdk_remap_txqs()', as I already mentioned.
>> It's actually the reason why we're using unconditional locking for vhost-user
>> ports, ignoring the 'concurrent_txq' value.
> 
> I spent some time looking at the above mentioned functions and see that 'qid = tx_q[...].map' is updated by the vhost thread as part of the callback function. This gets triggered only when queues are enabled/disabled in the guest. I was using 'ethtool -L' in the guest for testing this.
> As you rightly mentioned, the flushing logic breaks and the packets may potentially get stuck in the queue. During my testing I found a corner case that poses a problem for the patch (steps in order):
> 
> - Multiqueue is enabled on the guest with 2 rx,tx queues.
> - In the PVP case, traffic is sent to the VM and the packets are sent on 'queue 1' of the vHostUser port.
> - The PMD thread keeps queueing packets on the vhostuser port queue [using netdev_dpdk_vhost_send()].
> - Meanwhile the user changes the queue configuration using 'ethtool -L ens3 combined 1', enabling only one queue and disabling the other.
> - The callback function gets triggered and disables queue 1.
> - 'tx_q[...].map' is updated to '0' [queue 0] - the default queue.
> 
> - Meanwhile the PMD thread reads the map, finds that tx_q[..].map == '0', and flushes the packets in queue 0.
> This is the problem, as the packets enqueued earlier on queue 1 were not flushed.
> 
> How about the below fix?
> - Before disabling a queue (qid = OVS_VHOST_QUEUE_DISABLED), flush the packets remaining in that queue from vring_state_changed().

It's not right to send packets to queues we're going to disable in OvS. We're
just responding to a notification; the real queue was already effectively disabled
on the guest side and the packets will not be delivered anyway, so we should not
try to write to a disabled queue from the OvS point of view.
I think it's better to just drop them here.
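
I.e., something along these lines in 'vring_state_changed()' (only a sketch
of the suggested behaviour, not tested code; 'vhost_pkt_cnt' and
'vhost_burst_pkts' are the fields this patch adds):

--------------------------vring_state_changed() (sketch)--------------------------
} else {
    struct dpdk_tx_queue *txq = &dev->tx_q[qid];

    /* The guest has already stopped servicing this vring, so anything
     * still buffered on it can only be dropped, not flushed. */
    rte_spinlock_lock(&txq->tx_lock);
    if (txq->vhost_pkt_cnt) {
        for (int i = 0; i < txq->vhost_pkt_cnt; i++) {
            dp_packet_delete(txq->vhost_burst_pkts[i]);
        }
        rte_spinlock_lock(&dev->stats_lock);
        dev->stats.tx_dropped += txq->vhost_pkt_cnt;
        rte_spinlock_unlock(&dev->stats_lock);
        txq->vhost_pkt_cnt = 0;
    }
    rte_spinlock_unlock(&txq->tx_lock);

    dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
}
------------------------------------------------------------------------------------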

> 
> --------------------------vring_state_changed()------------------------------------
> if (strncmp(ifname, dev->vhost_id, IF_NAME_SZ) == 0) {
>             if (enable) {
>                 dev->tx_q[qid].map = qid;
>             } else {
>                 /* If the queue is disabled in the guest, the corresponding qid
>                  * map should be set to OVS_VHOST_QUEUE_DISABLED(-2).
>                  *
>                  * The packets that were queued in 'qid' can be potentially
>                  * stuck and should be flushed before it is disabled.
>                  */
>                 netdev_dpdk_vhost_txq_flush(&dev->up, dev->tx_q[qid].map, 0);

It doesn't matter because of my explanation above, but this should be just 'qid',
not 'dev->tx_q[qid].map'; otherwise you may pass a negative value to the flush
function and remap it again to nowhere.

>                 dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
>             }
>             netdev_dpdk_remap_txqs(dev);
>             exists = true;
>             ovs_mutex_unlock(&dev->mutex);
>             break;
>         }
>         ovs_mutex_unlock(&dev->mutex);
> ----------------------------------------------------------------------------------------------
> 
> Also update the txq_flush function to return immediately if the queue is disabled (qid == OVS_VHOST_QUEUE_DISABLED):

Hm. That is not the only issue here. You actually need the same checks
as in the send function:

    if (OVS_UNLIKELY(!is_vhost_running(dev) || qid < 0
                     || !(dev->flags & NETDEV_UP))) {
        rte_spinlock_lock(&dev->stats_lock);
        dev->stats.tx_dropped += cnt;
        rte_spinlock_unlock(&dev->stats_lock);
        goto out;
    }

Otherwise you may try to send packets in the middle of an initialization
or reconfiguration process, where the device may not have a 'vid' or
initialized queue ids, and even the mempool may not be allocated at that
point.

> 
> -----------------------------netdev_dpdk_vhost_txq_flush()-----------------------------
> netdev_dpdk_vhost_txq_flush(struct netdev *netdev, int qid,
>                             bool concurrent_txq OVS_UNUSED)
> {
>     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>     struct dpdk_tx_queue *txq;
> 
>     qid = dev->tx_q[qid % netdev->n_txq].map;
> 
>     /* The queue may have been disabled in the guest and its qid set to
>      * OVS_VHOST_QUEUE_DISABLED. */
>     if (OVS_UNLIKELY(qid < 0)) {
>         return 0;
>     }
> 
>     txq = &dev->tx_q[qid];
>     if (OVS_LIKELY(txq->vhost_pkt_cnt)) {
>         rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
>         netdev_dpdk_vhost_tx_burst(dev, qid);
>         rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
>     }
> 
>     return 0;
> }
> --------------------------------------------------------------------------------------------------
> 
> Regards,
> Bhanuprakash.
> 
>>
>>>> Have you ever tested this with multiqueue vhost?
>>>> With disabling/enabling queues inside the guest?
>>>
>>> I did basic sanity testing with vhost multiqueue to verify throughput and to
>>> check that the flush logic works at a low rate (1 packet at a time sent to
>>> each VM) on multiple VMs.
>>> When you say queue 'enable/disable' in the guest, are you referring to using
>>> 'ethtool -L <iface> combined <channel no>'?
>>> If so, I did this by configuring 5 rxqs (DPDK and vhost User) and changing
>>> the channel numbers, and verified with the testpmd app again for the flushing
>>> scenarios. I didn't test with kernel forwarding here.
>>>
>>> Regards,
>>> Bhanuprakash.
>>>
Bodireddy, Bhanuprakash June 28, 2017, 9:20 p.m. UTC | #6
>
>On 27.06.2017 23:31, Bodireddy, Bhanuprakash wrote:
>>> On 26.06.2017 00:52, Bodireddy, Bhanuprakash wrote:
>>>>>> +
>>>>>> +/* Flush the txq if there are any packets available.
>>>>>> + * dynamic_txqs/concurrent_txq is disabled for vHost User ports as
>>>>>> + * 'OVS_VHOST_MAX_QUEUE_NUM' txqs are preallocated.
>>>>>> + */
>>>>>
>>>>> This comment is completely untrue. You may ignore 'concurrent_txq'
>>>>> because you *must* lock the queue in any case because of dynamic
>>>>> txq remapping inside netdev-dpdk. You must take the spinlock for
>>>>> the 'dev->tx_q[qid % netdev->n_txq].map' before sending packets.
>>>>
>>>> Thanks for catching this; the lock should be taken before
>>>> flushing the queue. Below is the new logic with spinlocks.
>>>>
>>>> /* Flush the txq if there are any packets available. */
>>>> static int
>>>> netdev_dpdk_vhost_txq_flush(struct netdev *netdev, int qid,
>>>>                             bool concurrent_txq OVS_UNUSED)
>>>> {
>>>>     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>>>>     struct dpdk_tx_queue *txq;
>>>>
>>>>     qid = dev->tx_q[qid % netdev->n_txq].map;
>>>>
>>>>     txq = &dev->tx_q[qid];
>>>>     if (OVS_LIKELY(txq->vhost_pkt_cnt)) {
>>>>         rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
>>>>         netdev_dpdk_vhost_tx_burst(dev, qid);
>>>>         rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
>>>>     }
>>>>
>>>>     return 0;
>>>> }
>>>>
>>>>>
>>>>> In current implementation you're able to call send and flush
>>>>> simultaneously for the same queue from different threads because
>>>>> 'flush' doesn't care about queue remapping.
>>>>>
>>>>> See '__netdev_dpdk_vhost_send' and 'netdev_dpdk_remap_txqs' for
>>> detail.
>>>>>
>>>>> Additionally, the flushing logic will be broken in case of txq
>>>>> remapping, because you may have a different underlying queue each
>>>>> time you try to send or flush.
>>>>
>>>> I remember you raised this point earlier. To handle this case,
>>>> 'last_used_qid' was introduced in tx_port.
>>>> With this we can track any change in qid and make sure the packets
>>>> are flushed on the old queue.
>>>> This logic is in patch 3/6 of this series.
>>>
>>> I'm talking about txq remapping inside netdev-dpdk, not about XPS.
>>> You're trying to flush 'qid = tx_q[...].map', but the 'map'
>>> value can change at any time because of enabling/disabling vrings
>>> inside the guest. Refer to the 'vring_state_changed()' callback, which
>>> triggers 'netdev_dpdk_remap_txqs()', as I already mentioned.
>>> It's actually the reason why we're using unconditional locking for
>>> vhost-user ports, ignoring the 'concurrent_txq' value.
>>
>> I spent some time looking at the above mentioned functions and see that
>> 'qid = tx_q[...].map' is updated by the vhost thread as part of the callback
>> function. This gets triggered only when queues are enabled/disabled in the
>> guest. I was using 'ethtool -L' in the guest for testing this.
>> As you rightly mentioned, the flushing logic breaks and the packets may
>> potentially get stuck in the queue. During my testing I found a corner case
>> that poses a problem for the patch (steps in order):
>>
>> - Multiqueue is enabled on the guest with 2 rx,tx queues.
>> - In the PVP case, traffic is sent to the VM and the packets are sent on
>> 'queue 1' of the vHostUser port.
>> - The PMD thread keeps queueing packets on the vhostuser port queue
>> [using netdev_dpdk_vhost_send()].
>> - Meanwhile the user changes the queue configuration using 'ethtool -L ens3
>> combined 1', enabling only one queue and disabling the other.
>> - The callback function gets triggered and disables queue 1.
>> - 'tx_q[...].map' is updated to '0' [queue 0] - the default queue.
>>
>> - Meanwhile the PMD thread reads the map, finds that tx_q[..].map == '0',
>> and flushes the packets in queue 0.
>> This is the problem, as the packets enqueued earlier on queue 1 were not
>> flushed.
>>
>> How about the below fix?
>> - Before disabling a queue (qid = OVS_VHOST_QUEUE_DISABLED), flush
>> the packets remaining in that queue from vring_state_changed().
>
>It's not right to send packets to queues we're going to disable in OvS. We're
>just responding to a notification; the real queue was already effectively
>disabled on the guest side and the packets will not be delivered anyway, so we
>should not try to write to a disabled queue from the OvS point of view.
>I think it's better to just drop them here.

I was in doubt about how to handle this. I initially dropped the packets, but then thought there might be a small time window before the queue gets disabled, and so invoked the flush function here.
No worries, I will drop the packets.

>
>>
>> --------------------------vring_state_changed()------------------------------------
>> if (strncmp(ifname, dev->vhost_id, IF_NAME_SZ) == 0) {
>>             if (enable) {
>>                 dev->tx_q[qid].map = qid;
>>             } else {
>>                 /* If the queue is disabled in the guest, the corresponding qid
>>                  * map should be set to OVS_VHOST_QUEUE_DISABLED(-2).
>>                  *
>>                  * The packets that were queued in 'qid' can be potentially
>>                  * stuck and should be flushed before it is disabled.
>>                  */
>>                 netdev_dpdk_vhost_txq_flush(&dev->up, dev->tx_q[qid].map, 0);
>
>It doesn't matter because of my explanation above, but this should be just
>'qid', not 'dev->tx_q[qid].map'; otherwise you may pass a negative value to the
>flush function and remap it again to nowhere.

OK.
>
>>                 dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
>>             }
>>             netdev_dpdk_remap_txqs(dev);
>>             exists = true;
>>             ovs_mutex_unlock(&dev->mutex);
>>             break;
>>         }
>>         ovs_mutex_unlock(&dev->mutex);
>> ----------------------------------------------------------------------------------------------
>>
>> Also update the txq_flush function to return immediately if the queue is
>> disabled (qid == OVS_VHOST_QUEUE_DISABLED):
>
>Hm. That is not the only issue here. You actually need the same checks as
>in the send function:

OK, similar to what we do in vhost_send().

>
>     if (OVS_UNLIKELY(!is_vhost_running(dev) || qid < 0
>                      || !(dev->flags & NETDEV_UP))) {
>         rte_spinlock_lock(&dev->stats_lock);
>         dev->stats.tx_dropped += cnt;
>         rte_spinlock_unlock(&dev->stats_lock);
>         goto out;
>     }

Instead of the goto, I will free the packets right there, if there are any.
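
I.e., the guard in the flush function could look something like this (a
sketch only, not the final code):

-----------------------------netdev_dpdk_vhost_txq_flush() guard (sketch)-----------------------------
    qid = dev->tx_q[qid % netdev->n_txq].map;

    if (OVS_UNLIKELY(!is_vhost_running(dev) || qid < 0
                     || !(dev->flags & NETDEV_UP))) {
        /* Free any buffered packets right here instead of 'goto out',
         * and account them as drops. */
        if (qid >= 0 && dev->tx_q[qid].vhost_pkt_cnt) {
            struct dpdk_tx_queue *txq = &dev->tx_q[qid];

            for (int i = 0; i < txq->vhost_pkt_cnt; i++) {
                dp_packet_delete(txq->vhost_burst_pkts[i]);
            }
            rte_spinlock_lock(&dev->stats_lock);
            dev->stats.tx_dropped += txq->vhost_pkt_cnt;
            rte_spinlock_unlock(&dev->stats_lock);
            txq->vhost_pkt_cnt = 0;
        }
        return 0;
    }
-------------------------------------------------------------------------------------------------------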

>
>Otherwise you may try to send packets in the middle of an initialization or
>reconfiguration process, where the device may not have a 'vid' or initialized
>queue ids, and even the mempool may not be allocated at that point.

Completely agree.

- Bhanuprakash.

>
>>
>> -----------------------------netdev_dpdk_vhost_txq_flush()-----------------------------
>> netdev_dpdk_vhost_txq_flush(struct netdev *netdev, int qid,
>>                             bool concurrent_txq OVS_UNUSED)
>> {
>>     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>>     struct dpdk_tx_queue *txq;
>>
>>     qid = dev->tx_q[qid % netdev->n_txq].map;
>>
>>     /* The queue may have been disabled in the guest and its qid set to
>>      * OVS_VHOST_QUEUE_DISABLED. */
>>     if (OVS_UNLIKELY(qid < 0)) {
>>         return 0;
>>     }
>>
>>     txq = &dev->tx_q[qid];
>>     if (OVS_LIKELY(txq->vhost_pkt_cnt)) {
>>         rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
>>         netdev_dpdk_vhost_tx_burst(dev, qid);
>>         rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
>>     }
>>
>>     return 0;
>> }
>> --------------------------------------------------------------------------------------------------
>>
>> Regards,
>> Bhanuprakash.
>>
>>>
>>>>> Have you ever tested this with multiqueue vhost?
>>>>> With disabling/enabling queues inside the guest?
>>>>
>>>> I did basic sanity testing with vhost multiqueue to verify
>>>> throughput and to check that the flush logic works at a low rate
>>>> (1 packet at a time sent to each VM) on multiple VMs.
>>>> When you say queue 'enable/disable' in the guest, are you referring
>>>> to using 'ethtool -L <iface> combined <channel no>'?
>>>> If so, I did this by configuring 5 rxqs (DPDK and vhost User) and
>>>> changing the channel numbers, and verified with the testpmd app again
>>>> for the flushing scenarios. I didn't test with kernel forwarding here.
>>>>
>>>> Regards,
>>>> Bhanuprakash.
>>>>

Patch

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 50a9a2c..47343e8 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -307,12 +307,22 @@  struct dpdk_tx_queue {
                                     * pmd threads (see 'concurrent_txq'). */
     int map;                       /* Mapping of configured vhost-user queues
                                     * to enabled by guest. */
-    int dpdk_pkt_cnt;              /* Number of buffered packets waiting to
+    union {
+        int dpdk_pkt_cnt;          /* Number of buffered packets waiting to
                                       be sent on DPDK tx queue. */
-    struct rte_mbuf *dpdk_burst_pkts[INTERIM_QUEUE_BURST_THRESHOLD];
+        int vhost_pkt_cnt;         /* Number of buffered packets waiting to
+                                      be sent on vhost port. */
+    };
+
+    union {
+        struct rte_mbuf *dpdk_burst_pkts[INTERIM_QUEUE_BURST_THRESHOLD];
                                    /* Intermediate queue where packets can
                                     * be buffered to amortize the cost of MMIO
                                     * writes. */
+        struct dp_packet *vhost_burst_pkts[INTERIM_QUEUE_BURST_THRESHOLD];
+                                   /* Intermediate queue where packets can
+                                    * be buffered for vhost ports. */
+    };
 };
 
 /* dpdk has no way to remove dpdk ring ethernet devices
@@ -1719,6 +1729,63 @@  netdev_dpdk_vhost_update_tx_counters(struct netdev_stats *stats,
     }
 }
 
+static int
+netdev_dpdk_vhost_tx_burst(struct netdev_dpdk *dev, int qid)
+{
+    struct dpdk_tx_queue *txq = &dev->tx_q[qid];
+    struct rte_mbuf **cur_pkts = (struct rte_mbuf **)txq->vhost_burst_pkts;
+
+    int tx_vid = netdev_dpdk_get_vid(dev);
+    int tx_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
+    uint32_t sent = 0;
+    uint32_t retries = 0;
+    uint32_t sum, total_pkts;
+
+    total_pkts = sum = txq->vhost_pkt_cnt;
+    do {
+        uint32_t ret;
+        ret = rte_vhost_enqueue_burst(tx_vid, tx_qid, &cur_pkts[sent], sum);
+        if (OVS_UNLIKELY(!ret)) {
+            /* No packets enqueued - do not retry. */
+            break;
+        } else {
+            /* Packets have been sent. */
+            sent += ret;
+
+            /* 'sum' packets have to be retransmitted. */
+            sum -= ret;
+        }
+    } while (sum && (retries++ < VHOST_ENQ_RETRY_NUM));
+
+    for (int i = 0; i < total_pkts; i++) {
+        dp_packet_delete(txq->vhost_burst_pkts[i]);
+    }
+
+    /* Reset pkt count */
+    txq->vhost_pkt_cnt = 0;
+
+    /* 'sum' refers to packets dropped */
+    return sum;
+}
+
+/* Flush the txq if there are any packets available.
+ * dynamic_txqs/concurrent_txq is disabled for vHost User ports as
+ * 'OVS_VHOST_MAX_QUEUE_NUM' txqs are preallocated.
+ */
+static int
+netdev_dpdk_vhost_txq_flush(struct netdev *netdev, int qid,
+                            bool concurrent_txq OVS_UNUSED)
+{
+    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
+    struct dpdk_tx_queue *txq = &dev->tx_q[qid];
+
+    if (OVS_LIKELY(txq->vhost_pkt_cnt)) {
+        netdev_dpdk_vhost_tx_burst(dev, qid);
+    }
+
+    return 0;
+}
+
 static void
 __netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
                          struct dp_packet **pkts, int cnt)
@@ -3432,7 +3499,8 @@  static const struct netdev_class dpdk_vhost_class =
         NULL,
         netdev_dpdk_vhost_reconfigure,
         netdev_dpdk_vhost_rxq_recv,
-        NULL);
+        netdev_dpdk_vhost_txq_flush);
+
 static const struct netdev_class dpdk_vhost_client_class =
     NETDEV_DPDK_CLASS(
         "dpdkvhostuserclient",
@@ -3448,7 +3516,7 @@  static const struct netdev_class dpdk_vhost_client_class =
         NULL,
         netdev_dpdk_vhost_client_reconfigure,
         netdev_dpdk_vhost_rxq_recv,
-        NULL);
+        netdev_dpdk_vhost_txq_flush);
 
 void
 netdev_dpdk_register(void)