
[ovs-dev,dpdk-latest] netdev-dpdk: add custom vhost statistics to count IRQs

Message ID 20191015112013.12369.42639.stgit@netdev64
State Superseded

Commit Message

Eelco Chaudron Oct. 15, 2019, 11:20 a.m. UTC
When the DPDK vhost library executes an eventfd_write() call,
i.e. wakes up the guest, a new callback is invoked.

This patch adds the callback to count the number of
interrupts sent to the VM, i.e. to track how many times
interrupts were generated.

This might be of interest to find out whether system calls
are being made in the DPDK fast path.

The custom statistics can be seen with the following command:

  ovs-ofctl -O OpenFlow14 dump-ports <bridge> {<port>}

Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
---
 lib/netdev-dpdk.c |   27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

Comments

Ilya Maximets Oct. 16, 2019, 1:03 p.m. UTC | #1
On 15.10.2019 13:20, Eelco Chaudron wrote:
> When the DPDK vhost library executes an eventfd_write() call,
> i.e. wakes up the guest, a new callback is invoked.
> 
> This patch adds the callback to count the number of
> interrupts sent to the VM, i.e. to track how many times
> interrupts were generated.
> 
> This might be of interest to find out whether system calls
> are being made in the DPDK fast path.
> 
> The custom statistics can be seen with the following command:
> 
>    ovs-ofctl -O OpenFlow14 dump-ports <bridge> {<port>}
> 
> Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
> ---
>   lib/netdev-dpdk.c |   27 ++++++++++++++++++++++++++-
>   1 file changed, 26 insertions(+), 1 deletion(-)
> 
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index ba92e89..7ebbcdc 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -165,6 +165,8 @@ static int new_device(int vid);
>   static void destroy_device(int vid);
>   static int vring_state_changed(int vid, uint16_t queue_id, int enable);
>   static void destroy_connection(int vid);
> +static void vhost_guest_notified(int vid);
> +
>   static const struct vhost_device_ops virtio_net_device_ops =
>   {
>       .new_device =  new_device,
> @@ -173,6 +175,7 @@ static const struct vhost_device_ops virtio_net_device_ops =
>       .features_changed = NULL,
>       .new_connection = NULL,
>       .destroy_connection = destroy_connection,
> +    .guest_notified = vhost_guest_notified,
>   };
>   
>   enum { DPDK_RING_SIZE = 256 };
> @@ -416,6 +419,8 @@ struct netdev_dpdk {
>           struct netdev_stats stats;
>           /* Custom stat for retries when unable to transmit. */
>           uint64_t tx_retries;
> +        /* Custom stat counting number of times a vhost remote was woken up */
> +        uint64_t vhost_irqs;
>           /* Protects stats */
>           rte_spinlock_t stats_lock;
>           /* 4 pad bytes here. */
> @@ -2826,7 +2831,8 @@ netdev_dpdk_vhost_get_custom_stats(const struct netdev *netdev,
>       int i;
>   
>   #define VHOST_CSTATS \
> -    VHOST_CSTAT(tx_retries)
> +    VHOST_CSTAT(tx_retries) \
> +    VHOST_CSTAT(vhost_irqs)
>   
>   #define VHOST_CSTAT(NAME) + 1
>       custom_stats->size = VHOST_CSTATS;
> @@ -3746,6 +3752,25 @@ destroy_connection(int vid)
>       }
>   }
>   
> +static
> +void vhost_guest_notified(int vid)
> +{
> +    struct netdev_dpdk *dev;
> +
> +    ovs_mutex_lock(&dpdk_mutex);
> +    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
> +        if (netdev_dpdk_get_vid(dev) == vid) {
> +            ovs_mutex_lock(&dev->mutex);
> +            rte_spinlock_lock(&dev->stats_lock);
> +            dev->vhost_irqs++;
> +            rte_spinlock_unlock(&dev->stats_lock);
> +            ovs_mutex_unlock(&dev->mutex);
> +            break;
> +        }
> +    }
> +    ovs_mutex_unlock(&dpdk_mutex);

So, for every single eventfd_write() we're taking the global mutex,
traversing the list of all devices, taking one more mutex and finally
taking the spinlock, just to increment a single counter.
I think it's too much.

Wouldn't it significantly affect interrupt-based virtio drivers in
terms of performance?  How frequently will 2 vhost-user ports lock
each other on the global device mutex?  How frequently will 2 PMD
threads lock each other on rx/tx to different vrings of the same
vhost port?

I see that the 'guest_notified' patch is already in DPDK master, but
wouldn't it be better to accumulate these statistics inside DPDK
and return them with some new API like rte_vhost_get_vring_xstats()
if required?  I don't see any use case for this callback besides
counting some stats.

For the current patch I could only suggest converting it to a simple
coverage counter, like it is done in the 'vhost_tx_contention' patch here:
https://patchwork.ozlabs.org/patch/1175849/
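
For reference, a minimal sketch of what such a coverage-counter variant
could look like, assuming the COVERAGE_DEFINE()/COVERAGE_INC() macros from
OVS's lib/coverage.h (the counter name below is illustrative):

    COVERAGE_DEFINE(vhost_notification);

    static void
    vhost_guest_notified(int vid OVS_UNUSED)
    {
        /* No per-device lookup and no locks: just bump a global,
         * per-thread-aggregated coverage counter. */
        COVERAGE_INC(vhost_notification);
    }

Such a counter would show up in 'ovs-appctl coverage/show' rather than in
the per-port custom statistics.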

Best regards, Ilya Maximets.
Eelco Chaudron Oct. 25, 2019, 1:51 p.m. UTC | #2
On 16 Oct 2019, at 15:03, Ilya Maximets wrote:

> On 15.10.2019 13:20, Eelco Chaudron wrote:
>> When the DPDK vhost library executes an eventfd_write() call,
>> i.e. wakes up the guest, a new callback is invoked.
>>
>> This patch adds the callback to count the number of
>> interrupts sent to the VM, i.e. to track how many times
>> interrupts were generated.
>>
>> This might be of interest to find out whether system calls
>> are being made in the DPDK fast path.
>>
>> The custom statistics can be seen with the following command:
>>
>>    ovs-ofctl -O OpenFlow14 dump-ports <bridge> {<port>}
>>
>> Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
>> ---
>>   lib/netdev-dpdk.c |   27 ++++++++++++++++++++++++++-
>>   1 file changed, 26 insertions(+), 1 deletion(-)
>>
>> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
>> index ba92e89..7ebbcdc 100644
>> --- a/lib/netdev-dpdk.c
>> +++ b/lib/netdev-dpdk.c
>> @@ -165,6 +165,8 @@ static int new_device(int vid);
>>   static void destroy_device(int vid);
>>   static int vring_state_changed(int vid, uint16_t queue_id, int 
>> enable);
>>   static void destroy_connection(int vid);
>> +static void vhost_guest_notified(int vid);
>> +
>>   static const struct vhost_device_ops virtio_net_device_ops =
>>   {
>>       .new_device =  new_device,
>> @@ -173,6 +175,7 @@ static const struct vhost_device_ops 
>> virtio_net_device_ops =
>>       .features_changed = NULL,
>>       .new_connection = NULL,
>>       .destroy_connection = destroy_connection,
>> +    .guest_notified = vhost_guest_notified,
>>   };
>>    enum { DPDK_RING_SIZE = 256 };
>> @@ -416,6 +419,8 @@ struct netdev_dpdk {
>>           struct netdev_stats stats;
>>           /* Custom stat for retries when unable to transmit. */
>>           uint64_t tx_retries;
>> +        /* Custom stat counting number of times a vhost remote was 
>> woken up */
>> +        uint64_t vhost_irqs;
>>           /* Protects stats */
>>           rte_spinlock_t stats_lock;
>>           /* 4 pad bytes here. */
>> @@ -2826,7 +2831,8 @@ netdev_dpdk_vhost_get_custom_stats(const struct 
>> netdev *netdev,
>>       int i;
>>    #define VHOST_CSTATS \
>> -    VHOST_CSTAT(tx_retries)
>> +    VHOST_CSTAT(tx_retries) \
>> +    VHOST_CSTAT(vhost_irqs)
>>    #define VHOST_CSTAT(NAME) + 1
>>       custom_stats->size = VHOST_CSTATS;
>> @@ -3746,6 +3752,25 @@ destroy_connection(int vid)
>>       }
>>   }
>>  +static
>> +void vhost_guest_notified(int vid)
>> +{
>> +    struct netdev_dpdk *dev;
>> +
>> +    ovs_mutex_lock(&dpdk_mutex);
>> +    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
>> +        if (netdev_dpdk_get_vid(dev) == vid) {
>> +            ovs_mutex_lock(&dev->mutex);
>> +            rte_spinlock_lock(&dev->stats_lock);
>> +            dev->vhost_irqs++;
>> +            rte_spinlock_unlock(&dev->stats_lock);
>> +            ovs_mutex_unlock(&dev->mutex);
>> +            break;
>> +        }
>> +    }
>> +    ovs_mutex_unlock(&dpdk_mutex);
>
> So, for every single eventfd_write() we're taking the global mutex,
> traversing the list of all devices, taking one more mutex and finally
> taking the spinlock, just to increment a single counter.
> I think it's too much.

Yes, you are right, this might be a bit of overkill…

> Wouldn't it significantly affect interrupt-based virtio drivers in
> terms of performance?  How frequently will 2 vhost-user ports lock
> each other on the global device mutex?  How frequently will 2 PMD
> threads lock each other on rx/tx to different vrings of the same
> vhost port?

I used iperf3 and it did not show any negative effects (maybe I should
add more than 4 physical queues; I had one VM queue), but the results
also show large deviations.

> I see that the 'guest_notified' patch is already in DPDK master, but
> wouldn't it be better to accumulate these statistics inside DPDK
> and return them with some new API like rte_vhost_get_vring_xstats()
> if required?  I don't see any use case for this callback besides
> counting some stats.

I agree; however, the vhost library has no internal counters (only the
PMD implementation, which we do not use), hence they preferred the
callback.

> For the current patch I could only suggest converting it to a simple
> coverage counter, like it is done in the 'vhost_tx_contention' patch here:
> https://patchwork.ozlabs.org/patch/1175849/

This is what I’ll do (and will send a patch soon).  Although from a
user perspective a per-vhost-user counter would make more sense, as it
could easily indicate which VM is configured wrongly…

Cheers,

Eelco
Ilya Maximets Oct. 25, 2019, 3:06 p.m. UTC | #3
On 25.10.2019 15:51, Eelco Chaudron wrote:
> 
> 
> On 16 Oct 2019, at 15:03, Ilya Maximets wrote:
> 
>> On 15.10.2019 13:20, Eelco Chaudron wrote:
>>> When the DPDK vhost library executes an eventfd_write() call,
>>> i.e. wakes up the guest, a new callback is invoked.
>>>
>>> This patch adds the callback to count the number of
>>> interrupts sent to the VM, i.e. to track how many times
>>> interrupts were generated.
>>>
>>> This might be of interest to find out whether system calls
>>> are being made in the DPDK fast path.
>>>
>>> The custom statistics can be seen with the following command:
>>>
>>>    ovs-ofctl -O OpenFlow14 dump-ports <bridge> {<port>}
>>>
>>> Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
>>> ---
>>>   lib/netdev-dpdk.c |   27 ++++++++++++++++++++++++++-
>>>   1 file changed, 26 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
>>> index ba92e89..7ebbcdc 100644
>>> --- a/lib/netdev-dpdk.c
>>> +++ b/lib/netdev-dpdk.c
>>> @@ -165,6 +165,8 @@ static int new_device(int vid);
>>>   static void destroy_device(int vid);
>>>   static int vring_state_changed(int vid, uint16_t queue_id, int enable);
>>>   static void destroy_connection(int vid);
>>> +static void vhost_guest_notified(int vid);
>>> +
>>>   static const struct vhost_device_ops virtio_net_device_ops =
>>>   {
>>>       .new_device =  new_device,
>>> @@ -173,6 +175,7 @@ static const struct vhost_device_ops virtio_net_device_ops =
>>>       .features_changed = NULL,
>>>       .new_connection = NULL,
>>>       .destroy_connection = destroy_connection,
>>> +    .guest_notified = vhost_guest_notified,
>>>   };
>>>    enum { DPDK_RING_SIZE = 256 };
>>> @@ -416,6 +419,8 @@ struct netdev_dpdk {
>>>           struct netdev_stats stats;
>>>           /* Custom stat for retries when unable to transmit. */
>>>           uint64_t tx_retries;
>>> +        /* Custom stat counting number of times a vhost remote was woken up */
>>> +        uint64_t vhost_irqs;
>>>           /* Protects stats */
>>>           rte_spinlock_t stats_lock;
>>>           /* 4 pad bytes here. */
>>> @@ -2826,7 +2831,8 @@ netdev_dpdk_vhost_get_custom_stats(const struct netdev *netdev,
>>>       int i;
>>>    #define VHOST_CSTATS \
>>> -    VHOST_CSTAT(tx_retries)
>>> +    VHOST_CSTAT(tx_retries) \
>>> +    VHOST_CSTAT(vhost_irqs)
>>>    #define VHOST_CSTAT(NAME) + 1
>>>       custom_stats->size = VHOST_CSTATS;
>>> @@ -3746,6 +3752,25 @@ destroy_connection(int vid)
>>>       }
>>>   }
>>>  +static
>>> +void vhost_guest_notified(int vid)
>>> +{
>>> +    struct netdev_dpdk *dev;
>>> +
>>> +    ovs_mutex_lock(&dpdk_mutex);
>>> +    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
>>> +        if (netdev_dpdk_get_vid(dev) == vid) {
>>> +            ovs_mutex_lock(&dev->mutex);
>>> +            rte_spinlock_lock(&dev->stats_lock);
>>> +            dev->vhost_irqs++;
>>> +            rte_spinlock_unlock(&dev->stats_lock);
>>> +            ovs_mutex_unlock(&dev->mutex);
>>> +            break;
>>> +        }
>>> +    }
>>> +    ovs_mutex_unlock(&dpdk_mutex);
>>
>> So, for every single eventfd_write() we're taking the global mutex,
>> traversing the list of all devices, taking one more mutex and finally
>> taking the spinlock, just to increment a single counter.
>> I think it's too much.
> 
> Yes, you are right, this might be a bit of overkill…
> 
>> Wouldn't it significantly affect interrupt-based virtio drivers in
>> terms of performance?  How frequently will 2 vhost-user ports lock
>> each other on the global device mutex?  How frequently will 2 PMD
>> threads lock each other on rx/tx to different vrings of the same
>> vhost port?
> 
> I used iperf3 and it did not show any negative effects (maybe I should add more than 4 physical queues; I had one VM queue), but the results also show large deviations.

From my experience, I'd say that iperf via the kernel virtio driver is not
able to saturate a PMD thread.  The threads are likely relaxed and have time
to wait on the mutex.  Also, you probably have only a couple of ports, so the
critical section is not large enough to produce frequent interlocking.

I'm just pointing out that to count the number of eventfd syscalls we're
probably making 2 other syscalls, and probably sleeping in them.

For testing purposes, I'd suggest adding 10-20 (100?) dummy PMD ports (e.g.
vhost without VMs) and assigning them to different threads.  Probably also
more threads in iperf for traffic generation, and switching to small UDP
packets.  You could also replace lock() with trylock() for dpdk_mutex and
count contentions in the same way as it is done for tx_lock in David's patch.
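
As a rough sketch of that trylock idea, assuming ovs_mutex_trylock() (which
returns 0 on success and EBUSY otherwise) and the COVERAGE_* macros from
lib/coverage.h; the counter name is made up for illustration:

    COVERAGE_DEFINE(vhost_irq_mutex_contention);

    static void
    vhost_guest_notified(int vid)
    {
        struct netdev_dpdk *dev;

        if (ovs_mutex_trylock(&dpdk_mutex)) {
            /* Somebody else holds the global mutex: record the
             * contention, then fall back to a blocking lock. */
            COVERAGE_INC(vhost_irq_mutex_contention);
            ovs_mutex_lock(&dpdk_mutex);
        }
        LIST_FOR_EACH (dev, list_node, &dpdk_list) {
            if (netdev_dpdk_get_vid(dev) == vid) {
                rte_spinlock_lock(&dev->stats_lock);
                dev->vhost_irqs++;
                rte_spinlock_unlock(&dev->stats_lock);
                break;
            }
        }
        ovs_mutex_unlock(&dpdk_mutex);
    }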

> 
>> I see that the 'guest_notified' patch is already in DPDK master, but
>> wouldn't it be better to accumulate these statistics inside DPDK
>> and return them with some new API like rte_vhost_get_vring_xstats()
>> if required?  I don't see any use case for this callback besides
>> counting some stats.
> 
> I agree; however, the vhost library has no internal counters (only the PMD implementation, which we do not use), hence they preferred the callback.

One more possible issue is that this not-so-useful callback hijacked
the reserved space in the structure, and we could suffer in the
future when adding new callbacks due to ABI breakage.
(In the context of the DPDK API/ABI stability discussions.)

> 
>> For the current patch I could only suggest converting it to a simple
>> coverage counter, like it is done in the 'vhost_tx_contention' patch here:
>> https://patchwork.ozlabs.org/patch/1175849/
> 
> This is what I’ll do (and will send a patch soon).  Although from a user perspective a per-vhost-user counter would make more sense, as it could easily indicate which VM is configured wrongly…

Sure.  I think that we need to re-imagine the concept and implementation of
pmd-perf-stats in a "coverage"-like way, as I suggested previously while
discussing the initial pmd-perf patches, to count stats like vhost queue
length, tx contentions and this one.  At least, this should not be too hard
to do on a per-PMD basis.  Per-netdev might be trickier, but I believe that
it is possible.

For now we have architectural limitations that we have to live with.

Best regards, Ilya Maximets.
Eelco Chaudron Oct. 28, 2019, 10:14 a.m. UTC | #4
On 25 Oct 2019, at 17:06, Ilya Maximets wrote:

> On 25.10.2019 15:51, Eelco Chaudron wrote:

<SNIP>

>>>>  +static
>>>> +void vhost_guest_notified(int vid)
>>>> +{
>>>> +    struct netdev_dpdk *dev;
>>>> +
>>>> +    ovs_mutex_lock(&dpdk_mutex);
>>>> +    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
>>>> +        if (netdev_dpdk_get_vid(dev) == vid) {
>>>> +            ovs_mutex_lock(&dev->mutex);
>>>> +            rte_spinlock_lock(&dev->stats_lock);
>>>> +            dev->vhost_irqs++;
>>>> +            rte_spinlock_unlock(&dev->stats_lock);
>>>> +            ovs_mutex_unlock(&dev->mutex);
>>>> +            break;
>>>> +        }
>>>> +    }
>>>> +    ovs_mutex_unlock(&dpdk_mutex);
>>>
>>> So, for every single eventfd_write() we're taking the global mutex,
>>> traversing the list of all devices, taking one more mutex and finally
>>> taking the spinlock, just to increment a single counter.
>>> I think it's too much.
>>
>> Yes, you are right, this might be a bit of overkill…
>>
>>> Wouldn't it significantly affect interrupt-based virtio drivers in
>>> terms of performance?  How frequently will 2 vhost-user ports lock
>>> each other on the global device mutex?  How frequently will 2 PMD
>>> threads lock each other on rx/tx to different vrings of the same
>>> vhost port?
>>
>> I used iperf3 and it did not show any negative effects (maybe I should
>> add more than 4 physical queues; I had one VM queue), but the results
>> also show large deviations.
>
> From my experience, I'd say that iperf via the kernel virtio driver is
> not able to saturate a PMD thread.  The threads are likely relaxed and
> have time to wait on the mutex.  Also, you probably have only a couple
> of ports, so the critical section is not large enough to produce
> frequent interlocking.
>
> I'm just pointing out that to count the number of eventfd syscalls
> we're probably making 2 other syscalls, and probably sleeping in them.

Yes, I agree, and I have sent out a v2 using a simple coverage counter.

> For testing purposes, I'd suggest adding 10-20 (100?) dummy PMD ports
> (e.g. vhost without VMs) and assigning them to different threads.
> Probably also more threads in iperf for traffic generation, and
> switching to small UDP packets.  You could also replace lock() with
> trylock() for dpdk_mutex and count contentions in the same way as it
> is done for tx_lock in David's patch.

The trylock would have been a good way to see if I hit this lock
contention.  I was already testing with small UDP packets, but used
iperf as I needed a kernel-based tool, not a poll-mode one based on
DPDK.

>>
>>> I see that the 'guest_notified' patch is already in DPDK master, but
>>> wouldn't it be better to accumulate these statistics inside DPDK
>>> and return them with some new API like rte_vhost_get_vring_xstats()
>>> if required?  I don't see any use case for this callback besides
>>> counting some stats.
>>
>> I agree; however, the vhost library has no internal counters (only
>> the PMD implementation, which we do not use), hence they preferred
>> the callback.
>
> One more possible issue is that this not-so-useful callback hijacked
> the reserved space in the structure, and we could suffer in the
> future when adding new callbacks due to ABI breakage.
> (In the context of the DPDK API/ABI stability discussions.)
>
>>
>>> For the current patch I could only suggest converting it to a simple
>>> coverage counter, like it is done in the 'vhost_tx_contention' patch
>>> here: https://patchwork.ozlabs.org/patch/1175849/
>>
>> This is what I’ll do (and will send a patch soon).  Although from a
>> user perspective a per-vhost-user counter would make more sense, as it
>> could easily indicate which VM is configured wrongly…
>
> Sure.  I think that we need to re-imagine the concept and
> implementation of pmd-perf-stats in a "coverage"-like way, as I
> suggested previously while discussing the initial pmd-perf patches, to
> count stats like vhost queue length, tx contentions and this one.  At
> least, this should not be too hard to do on a per-PMD basis.
> Per-netdev might be trickier, but I believe that it is possible.

This might be something that should be added.  For per-netdev counters we
could use atomics, assuming the counters are for exception cases (or make
them per-CPU), and get rid of any locking.
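
A minimal sketch of that idea, assuming the atomic_uint64_t type and the
relaxed atomic helpers from OVS's lib/ovs-atomic.h (the field and function
names are illustrative):

    /* In struct netdev_dpdk, instead of a plain uint64_t guarded by
     * stats_lock:
     *     atomic_uint64_t vhost_irqs;
     */

    static void
    netdev_dpdk_count_vhost_irq(struct netdev_dpdk *dev)
    {
        uint64_t orig;

        /* Relaxed ordering is enough for a pure statistics counter. */
        atomic_add_relaxed(&dev->vhost_irqs, 1, &orig);
    }

    /* Reading it back, e.g. in netdev_dpdk_vhost_get_custom_stats():
     *     uint64_t val;
     *     atomic_read_relaxed(&dev->vhost_irqs, &val);
     */

This removes stats_lock and dev->mutex from the increment path, but looking
up the device by vid would still need some lockless structure, as mentioned
below.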

> For now we have architectural limitations that we have to live with.
>
> Best regards, Ilya Maximets.
Ilya Maximets Oct. 28, 2019, 12:53 p.m. UTC | #5
On 28.10.2019 11:14, Eelco Chaudron wrote:
> 
> 
> On 25 Oct 2019, at 17:06, Ilya Maximets wrote:
> 
>> On 25.10.2019 15:51, Eelco Chaudron wrote:
> 
> <SNIP>
> 
>>>>>  +static
>>>>> +void vhost_guest_notified(int vid)
>>>>> +{
>>>>> +    struct netdev_dpdk *dev;
>>>>> +
>>>>> +    ovs_mutex_lock(&dpdk_mutex);
>>>>> +    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
>>>>> +        if (netdev_dpdk_get_vid(dev) == vid) {
>>>>> +            ovs_mutex_lock(&dev->mutex);
>>>>> +            rte_spinlock_lock(&dev->stats_lock);
>>>>> +            dev->vhost_irqs++;
>>>>> +            rte_spinlock_unlock(&dev->stats_lock);
>>>>> +            ovs_mutex_unlock(&dev->mutex);
>>>>> +            break;
>>>>> +        }
>>>>> +    }
>>>>> +    ovs_mutex_unlock(&dpdk_mutex);
>>>>
>>>> So, for every single eventfd_write() we're taking the global mutex,
>>>> traversing the list of all devices, taking one more mutex and finally
>>>> taking the spinlock, just to increment a single counter.
>>>> I think it's too much.
>>>
>>> Yes, you are right, this might be a bit of overkill…
>>>
>>>> Wouldn't it significantly affect interrupt-based virtio drivers in
>>>> terms of performance?  How frequently will 2 vhost-user ports lock
>>>> each other on the global device mutex?  How frequently will 2 PMD
>>>> threads lock each other on rx/tx to different vrings of the same
>>>> vhost port?
>>>
>>> I used iperf3 and it did not show any negative effects (maybe I should add more than 4 physical queues; I had one VM queue), but the results also show large deviations.
>>
>> From my experience, I'd say that iperf via the kernel virtio driver is not
>> able to saturate a PMD thread.  The threads are likely relaxed and have
>> time to wait on the mutex.  Also, you probably have only a couple of ports,
>> so the critical section is not large enough to produce frequent
>> interlocking.
>>
>> I'm just pointing out that to count the number of eventfd syscalls we're
>> probably making 2 other syscalls, and probably sleeping in them.
> 
> Yes, I agree, and I have sent out a v2 using a simple coverage counter.
> 
>> For testing purposes, I'd suggest adding 10-20 (100?) dummy PMD ports (e.g.
>> vhost without VMs) and assigning them to different threads.  Probably also
>> more threads in iperf for traffic generation, and switching to small UDP
>> packets.  You could also replace lock() with trylock() for dpdk_mutex and
>> count contentions in the same way as it is done for tx_lock in David's patch.
> 
> The trylock would have been a good way to see if I hit this lock contention.
> I was already testing with small UDP packets, but used iperf as I needed a
> kernel-based tool, not a poll-mode one based on DPDK.
> 
>>>
>>>> I see that the 'guest_notified' patch is already in DPDK master, but
>>>> wouldn't it be better to accumulate these statistics inside DPDK
>>>> and return them with some new API like rte_vhost_get_vring_xstats()
>>>> if required?  I don't see any use case for this callback besides
>>>> counting some stats.
>>>
>>> I agree; however, the vhost library has no internal counters (only the PMD implementation, which we do not use), hence they preferred the callback.
>>
>> One more possible issue is that this not-so-useful callback hijacked
>> the reserved space in the structure, and we could suffer in the
>> future when adding new callbacks due to ABI breakage.
>> (In the context of the DPDK API/ABI stability discussions.)
>>
>>>
>>>> For the current patch I could only suggest converting it to a simple
>>>> coverage counter, like it is done in the 'vhost_tx_contention' patch here:
>>>> https://patchwork.ozlabs.org/patch/1175849/
>>>
>>> This is what I’ll do (and will send a patch soon).  Although from a user perspective a per-vhost-user counter would make more sense, as it could easily indicate which VM is configured wrongly…
>>
>> Sure.  I think that we need to re-imagine the concept and implementation of
>> pmd-perf-stats in a "coverage"-like way, as I suggested previously while
>> discussing the initial pmd-perf patches, to count stats like vhost queue
>> length, tx contentions and this one.  At least, this should not be too hard
>> to do on a per-PMD basis.  Per-netdev might be trickier, but I believe that
>> it is possible.
> 
> This might be something that should be added.  For per-netdev counters we could use atomics, assuming the counters are for exception cases (or make them per-CPU), and get rid of any locking.

Lockless netdev stats have been on my to-do list for a few months already,
since all these patches about packet drops started to appear.

However, for vhost we would also need some lockless (e.g. RCU-protected)
structure to obtain the netdev by vid.
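
A minimal sketch of such a lookup, assuming OVS's RCU-friendly concurrent
hash map from lib/cmap.h and hash_int() from lib/hash.h; the map and node
names below are made up for illustration:

    static struct cmap vhost_vid_map = CMAP_INITIALIZER;

    struct vhost_vid_node {
        struct cmap_node node;     /* In 'vhost_vid_map', hashed on vid. */
        int vid;
        struct netdev_dpdk *dev;
    };

    /* Lock-free for readers; entries would be inserted/removed (with
     * RCU-deferred freeing) from new_device()/destroy_device(). */
    static struct netdev_dpdk *
    netdev_dpdk_lookup_by_vid(int vid)
    {
        struct vhost_vid_node *n;

        CMAP_FOR_EACH_WITH_HASH (n, node, hash_int(vid, 0), &vhost_vid_map) {
            if (n->vid == vid) {
                return n->dev;
            }
        }
        return NULL;
    }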

The key point for stats like tx_contentions or random syscall counters is
that I don't want them to be part of the regular (even custom) network
statistics, because they are not network statistics.  So, this needs to be
accounted in a different way, e.g. in pmd-perf-stats or some similar
netdev-perf-stats implemented on the same re-worked pmd-perf infrastructure.

Best regards, Ilya Maximets.

Patch

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index ba92e89..7ebbcdc 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -165,6 +165,8 @@  static int new_device(int vid);
 static void destroy_device(int vid);
 static int vring_state_changed(int vid, uint16_t queue_id, int enable);
 static void destroy_connection(int vid);
+static void vhost_guest_notified(int vid);
+
 static const struct vhost_device_ops virtio_net_device_ops =
 {
     .new_device =  new_device,
@@ -173,6 +175,7 @@  static const struct vhost_device_ops virtio_net_device_ops =
     .features_changed = NULL,
     .new_connection = NULL,
     .destroy_connection = destroy_connection,
+    .guest_notified = vhost_guest_notified,
 };
 
 enum { DPDK_RING_SIZE = 256 };
@@ -416,6 +419,8 @@  struct netdev_dpdk {
         struct netdev_stats stats;
         /* Custom stat for retries when unable to transmit. */
         uint64_t tx_retries;
+        /* Custom stat counting number of times a vhost remote was woken up */
+        uint64_t vhost_irqs;
         /* Protects stats */
         rte_spinlock_t stats_lock;
         /* 4 pad bytes here. */
@@ -2826,7 +2831,8 @@  netdev_dpdk_vhost_get_custom_stats(const struct netdev *netdev,
     int i;
 
 #define VHOST_CSTATS \
-    VHOST_CSTAT(tx_retries)
+    VHOST_CSTAT(tx_retries) \
+    VHOST_CSTAT(vhost_irqs)
 
 #define VHOST_CSTAT(NAME) + 1
     custom_stats->size = VHOST_CSTATS;
@@ -3746,6 +3752,25 @@  destroy_connection(int vid)
     }
 }
 
+static
+void vhost_guest_notified(int vid)
+{
+    struct netdev_dpdk *dev;
+
+    ovs_mutex_lock(&dpdk_mutex);
+    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
+        if (netdev_dpdk_get_vid(dev) == vid) {
+            ovs_mutex_lock(&dev->mutex);
+            rte_spinlock_lock(&dev->stats_lock);
+            dev->vhost_irqs++;
+            rte_spinlock_unlock(&dev->stats_lock);
+            ovs_mutex_unlock(&dev->mutex);
+            break;
+        }
+    }
+    ovs_mutex_unlock(&dpdk_mutex);
+}
+
 /*
  * Retrieve the DPDK virtio device ID (vid) associated with a vhostuser
  * or vhostuserclient netdev.