Patchwork [4/5] virtio-net: Introduce a new bottom half packet TX

Submitter Alex Williamson
Date Aug. 27, 2010, 10:37 p.m.
Message ID <20100827223736.2696.16854.stgit@s20.home>
Download mbox | patch
Permalink /patch/62882/
State New

Comments

Alex Williamson - Aug. 27, 2010, 10:37 p.m.
Based on a patch from Mark McLoughlin, this patch introduces a new
bottom half packet transmitter that avoids the latency imposed by
the tx_timer approach.  Rather than scheduling a timer when a TX
packet comes in, schedule a bottom half to be run from the iothread.
The bottom half handler first attempts to flush the queue with
notification disabled (this is where we could race with a guest
without txburst).  If we flush a full burst, reschedule immediately.
If we flush less than a full burst, try to re-enable notification.
To avoid a race with TXs that may have occurred, we must then
flush again.  If we find some packets to send, the guest is probably
active, so we can reschedule again.

tx_timer and tx_bh are mutually exclusive, so we can re-use the
tx_waiting flag to indicate that one or the other needs to be set up.
This allows us to seamlessly migrate between timer and bh TX
handling.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

 hw/virtio-net.c |   81 ++++++++++++++++++++++++++++++++++++++++++++++---------
 1 files changed, 68 insertions(+), 13 deletions(-)
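
(In outline, the bottom-half handler described above follows a
disable/flush/re-check pattern.  The condensed C sketch below mirrors
virtio_net_tx_bh() from the patch at the bottom of this page; it is not
a drop-in replacement and omits the DRIVER_OK guard.)

    /* Condensed sketch of the bottom-half TX flow; see
     * virtio_net_tx_bh() in the patch for the full version. */
    static void tx_bh_flow(VirtIONet *n)
    {
        int32_t ret;

        n->tx_waiting = 0;

        /* The first flush runs with guest notification still disabled. */
        ret = virtio_net_flush_tx(n, n->tx_vq);
        if (ret == -EBUSY) {
            return;                /* tx_complete re-enables notification */
        }
        if (ret >= n->tx_burst) {
            /* Full burst: assume more packets are coming. */
            qemu_bh_schedule(n->tx_bh);
            n->tx_waiting = 1;
            return;
        }

        /* Short burst: re-enable notification, then flush once more to
         * catch packets the guest queued while notification was off. */
        virtio_queue_set_notification(n->tx_vq, 1);
        if (virtio_net_flush_tx(n, n->tx_vq) > 0) {
            virtio_queue_set_notification(n->tx_vq, 0);
            qemu_bh_schedule(n->tx_bh);
            n->tx_waiting = 1;
        }
    }
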
Michael S. Tsirkin - Aug. 31, 2010, 8:14 p.m.
On Fri, Aug 27, 2010 at 04:37:36PM -0600, Alex Williamson wrote:
> Based on a patch from Mark McLoughlin, this patch introduces a new
> bottom half packet transmitter that avoids the latency imposed by
> the tx_timer approach.  Rather than scheduling a timer when a TX
> packet comes in, schedule a bottom half to be run from the iothread.
> The bottom half handler first attempts to flush the queue with
> notification disabled (this is where we could race with a guest
> without txburst).  If we flush a full burst, reschedule immediately.
> If we flush less than a full burst, try to re-enable notification.
> To avoid a race with TXs that may have occurred, we must then
> flush again.  If we find some packets to send, the guest is probably
> active, so we can reschedule again.
> 
> tx_timer and tx_bh are mutually exclusive, so we can re-use the
> tx_waiting flag to indicate that one or the other needs to be set up.
> This allows us to seamlessly migrate between timer and bh TX
> handling.
> 
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> ---
> 
>  hw/virtio-net.c |   81 ++++++++++++++++++++++++++++++++++++++++++++++---------
>  1 files changed, 68 insertions(+), 13 deletions(-)
> 
> diff --git a/hw/virtio-net.c b/hw/virtio-net.c
> index 8b652f2..3288c77 100644
> --- a/hw/virtio-net.c
> +++ b/hw/virtio-net.c
> @@ -36,6 +36,7 @@ typedef struct VirtIONet
>      VirtQueue *ctrl_vq;
>      NICState *nic;
>      QEMUTimer *tx_timer;
> +    QEMUBH *tx_bh;
>      uint32_t tx_timeout;
>      int32_t tx_burst;
>      int tx_waiting;
> @@ -704,16 +705,25 @@ static void virtio_net_handle_tx(VirtIODevice *vdev, VirtQueue *vq)
>  {
>      VirtIONet *n = to_virtio_net(vdev);
>  
> -    if (n->tx_waiting) {
> -        virtio_queue_set_notification(vq, 1);
> -        qemu_del_timer(n->tx_timer);
> -        n->tx_waiting = 0;
> -        virtio_net_flush_tx(n, vq);
> +    if (n->tx_timer) {
> +        if (n->tx_waiting) {
> +            virtio_queue_set_notification(vq, 1);
> +            qemu_del_timer(n->tx_timer);
> +            n->tx_waiting = 0;
> +            virtio_net_flush_tx(n, vq);
> +        } else {
> +            qemu_mod_timer(n->tx_timer,
> +                           qemu_get_clock(vm_clock) + n->tx_timeout);
> +            n->tx_waiting = 1;
> +            virtio_queue_set_notification(vq, 0);
> +        }
>      } else {
> -        qemu_mod_timer(n->tx_timer,
> -                       qemu_get_clock(vm_clock) + n->tx_timeout);
> +        if (unlikely(n->tx_waiting)) {
> +            return;
> +        }
> +        virtio_queue_set_notification(n->tx_vq, 0);
> +        qemu_bh_schedule(n->tx_bh);
>          n->tx_waiting = 1;
> -        virtio_queue_set_notification(vq, 0);
>      }
>  }
>  
> @@ -731,6 +741,41 @@ static void virtio_net_tx_timer(void *opaque)
>      virtio_net_flush_tx(n, n->tx_vq);
>  }
>  
> +static void virtio_net_tx_bh(void *opaque)
> +{
> +    VirtIONet *n = opaque;
> +    int32_t ret;
> +
> +    n->tx_waiting = 0;
> +
> +    /* Just in case the driver is not ready any more */
> +    if (unlikely(!(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK)))
> +        return;
> +
> +    ret = virtio_net_flush_tx(n, n->tx_vq);
> +    if (ret == -EBUSY) {
> +        return; /* Notification re-enable handled by tx_complete */
> +    }
> +
> +    /* If we flush a full burst of packets, assume there are
> +     * more coming and immediately reschedule */
> +    if (ret >= n->tx_burst) {
> +        qemu_bh_schedule(n->tx_bh);
> +        n->tx_waiting = 1;
> +        return;
> +    }
> +
> +    /* If less than a full burst, re-enable notification and flush
> +     * anything that may have come in while we weren't looking.  If
> +     * we find something, assume the guest is still active and reschedule */
> +    virtio_queue_set_notification(n->tx_vq, 1);
> +    if (virtio_net_flush_tx(n, n->tx_vq) > 0) {

Shouldn't this be virtio_net_flush_tx(n, n->tx_vq) >= n->tx_burst?
If we get less than tx_burst, the ring is empty now so no need to
reschedule.
Right?

Alex Williamson - Aug. 31, 2010, 8:33 p.m.
On Tue, 2010-08-31 at 23:14 +0300, Michael S. Tsirkin wrote:
> On Fri, Aug 27, 2010 at 04:37:36PM -0600, Alex Williamson wrote:
> > +    /* If less than a full burst, re-enable notification and flush
> > +     * anything that may have come in while we weren't looking.  If
> > +     * we find something, assume the guest is still active and reschedule */
> > +    virtio_queue_set_notification(n->tx_vq, 1);
> > +    if (virtio_net_flush_tx(n, n->tx_vq) > 0) {
> 
> Shouldn't this be virtio_net_flush_tx(n, n->tx_vq) >= n->tx_burst?
> If we get less than tx_burst, the ring is empty now so no need to
> reschedule.
> Right?

I suppose it depends on how aggressive we want to be.  If the guest put
something on the queue between the first flush and this one, then it
might be actively transmitting, and if we want to optimize latency, we
anticipate that it might continue to transmit and re-schedule.  This is
taken straight from markmc's rhel5 patch.  I wouldn't argue that it's
wrong to not reschedule here, but it's clearly less aggressive.  Thanks,

Alex
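
(Schematically, the two re-check policies under discussion differ only
in the final condition; both fragments below reuse the helpers from the
patch.)

    /* As in the patch (aggressive): reschedule if the re-check
     * flushed anything at all. */
    if (virtio_net_flush_tx(n, n->tx_vq) > 0) {
        virtio_queue_set_notification(n->tx_vq, 0);
        qemu_bh_schedule(n->tx_bh);
        n->tx_waiting = 1;
    }

    /* As suggested above (less aggressive): reschedule only if the
     * re-check flushed another full burst; a shorter result means the
     * ring is now empty, so wait for the guest's next notification. */
    if (virtio_net_flush_tx(n, n->tx_vq) >= n->tx_burst) {
        virtio_queue_set_notification(n->tx_vq, 0);
        qemu_bh_schedule(n->tx_bh);
        n->tx_waiting = 1;
    }
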

Michael S. Tsirkin - Sept. 1, 2010, 9:47 a.m.
On Tue, Aug 31, 2010 at 02:33:46PM -0600, Alex Williamson wrote:
> On Tue, 2010-08-31 at 23:14 +0300, Michael S. Tsirkin wrote:
> > On Fri, Aug 27, 2010 at 04:37:36PM -0600, Alex Williamson wrote:
> > > +    /* If less than a full burst, re-enable notification and flush
> > > +     * anything that may have come in while we weren't looking.  If
> > > +     * we find something, assume the guest is still active and reschedule */
> > > +    virtio_queue_set_notification(n->tx_vq, 1);
> > > +    if (virtio_net_flush_tx(n, n->tx_vq) > 0) {
> > 
> > Shouldn't this be virtio_net_flush_tx(n, n->tx_vq) >= n->tx_burst?
> > If we get less than tx_burst, the ring is empty now so no need to
> > reschedule.
> > Right?
> 
> I suppose it depends on how aggressive we want to be.  If the guest put
> something on the queue between the first flush and this one, then it
> might be actively transmitting, and if we want to optimize latency, we
> anticipate that it might continue to transmit and re-schedule.  This is
> taken straight from markmc's rhel5 patch.  I wouldn't argue that it's
> wrong to not reschedule here, but it's clearly less aggressive.  Thanks,
> 
> Alex

I'm a bit concerned that we are aggressive but not consistently
aggressive.  For example, if the guest adds a packet before we disable
notification, we do not reschedule the bh, but if it adds a packet
after this, we do. If we get 255 packets, then another 255 packets,
we poll without rescheduling an extra bh; if we get 255*2 packets in
one go, we reschedule.

I think it might cause jitter in performance, where e.g. slowing the
guest down a bit suddenly speeds up networking.

It might be better to be consistent: always poll at most 256 entries;
if we get all of them, reschedule; if we get x < 256, enable
notification and poll again; if we then get 256 - x entries,
reschedule; if we get fewer, stop polling.
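
(A hypothetical sketch of that consistent scheme, reusing the patch's
helpers.  Since virtio_net_flush_tx() always polls up to tx_burst, the
second flush can only approximate the 256 - x budget.)

    /* Hypothetical sketch of the consistent polling proposed above.
     * Spend a fixed budget of tx_burst entries per bh invocation and
     * reschedule only when the budget is exhausted. */
    static void tx_bh_consistent(VirtIONet *n)
    {
        int32_t done;

        n->tx_waiting = 0;

        done = virtio_net_flush_tx(n, n->tx_vq);  /* polls up to tx_burst */
        if (done == -EBUSY) {
            return;                /* tx_complete re-enables notification */
        }
        if (done >= n->tx_burst) {
            /* Whole budget used in one go: reschedule. */
            qemu_bh_schedule(n->tx_bh);
            n->tx_waiting = 1;
            return;
        }

        /* Budget not exhausted: enable notification and poll again,
         * counting the second flush against the remaining budget. */
        virtio_queue_set_notification(n->tx_vq, 1);
        if (virtio_net_flush_tx(n, n->tx_vq) >= n->tx_burst - done) {
            virtio_queue_set_notification(n->tx_vq, 0);
            qemu_bh_schedule(n->tx_bh);
            n->tx_waiting = 1;
        }
        /* Fewer than (tx_burst - done) entries: stop polling. */
    }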


Patch

diff --git a/hw/virtio-net.c b/hw/virtio-net.c
index 8b652f2..3288c77 100644
--- a/hw/virtio-net.c
+++ b/hw/virtio-net.c
@@ -36,6 +36,7 @@  typedef struct VirtIONet
     VirtQueue *ctrl_vq;
     NICState *nic;
     QEMUTimer *tx_timer;
+    QEMUBH *tx_bh;
     uint32_t tx_timeout;
     int32_t tx_burst;
     int tx_waiting;
@@ -704,16 +705,25 @@  static void virtio_net_handle_tx(VirtIODevice *vdev, VirtQueue *vq)
 {
     VirtIONet *n = to_virtio_net(vdev);
 
-    if (n->tx_waiting) {
-        virtio_queue_set_notification(vq, 1);
-        qemu_del_timer(n->tx_timer);
-        n->tx_waiting = 0;
-        virtio_net_flush_tx(n, vq);
+    if (n->tx_timer) {
+        if (n->tx_waiting) {
+            virtio_queue_set_notification(vq, 1);
+            qemu_del_timer(n->tx_timer);
+            n->tx_waiting = 0;
+            virtio_net_flush_tx(n, vq);
+        } else {
+            qemu_mod_timer(n->tx_timer,
+                           qemu_get_clock(vm_clock) + n->tx_timeout);
+            n->tx_waiting = 1;
+            virtio_queue_set_notification(vq, 0);
+        }
     } else {
-        qemu_mod_timer(n->tx_timer,
-                       qemu_get_clock(vm_clock) + n->tx_timeout);
+        if (unlikely(n->tx_waiting)) {
+            return;
+        }
+        virtio_queue_set_notification(n->tx_vq, 0);
+        qemu_bh_schedule(n->tx_bh);
         n->tx_waiting = 1;
-        virtio_queue_set_notification(vq, 0);
     }
 }
 
@@ -731,6 +741,41 @@  static void virtio_net_tx_timer(void *opaque)
     virtio_net_flush_tx(n, n->tx_vq);
 }
 
+static void virtio_net_tx_bh(void *opaque)
+{
+    VirtIONet *n = opaque;
+    int32_t ret;
+
+    n->tx_waiting = 0;
+
+    /* Just in case the driver is not ready any more */
+    if (unlikely(!(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK)))
+        return;
+
+    ret = virtio_net_flush_tx(n, n->tx_vq);
+    if (ret == -EBUSY) {
+        return; /* Notification re-enable handled by tx_complete */
+    }
+
+    /* If we flush a full burst of packets, assume there are
+     * more coming and immediately reschedule */
+    if (ret >= n->tx_burst) {
+        qemu_bh_schedule(n->tx_bh);
+        n->tx_waiting = 1;
+        return;
+    }
+
+    /* If less than a full burst, re-enable notification and flush
+     * anything that may have come in while we weren't looking.  If
+     * we find something, assume the guest is still active and reschedule */
+    virtio_queue_set_notification(n->tx_vq, 1);
+    if (virtio_net_flush_tx(n, n->tx_vq) > 0) {
+        virtio_queue_set_notification(n->tx_vq, 0);
+        qemu_bh_schedule(n->tx_bh);
+        n->tx_waiting = 1;
+    }
+}
+
 static void virtio_net_save(QEMUFile *f, void *opaque)
 {
     VirtIONet *n = opaque;
@@ -850,8 +895,12 @@  static int virtio_net_load(QEMUFile *f, void *opaque, int version_id)
     n->mac_table.first_multi = i;
 
     if (n->tx_waiting) {
-        qemu_mod_timer(n->tx_timer,
-                       qemu_get_clock(vm_clock) + n->tx_timeout);
+        if (n->tx_timer) {
+            qemu_mod_timer(n->tx_timer,
+                           qemu_get_clock(vm_clock) + n->tx_timeout);
+        } else {
+            qemu_bh_schedule(n->tx_bh);
+        }
     }
     return 0;
 }
@@ -939,9 +988,9 @@  VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf,
 
     qemu_format_nic_info_str(&n->nic->nc, conf->macaddr.a);
 
-    n->tx_timer = qemu_new_timer(vm_clock, virtio_net_tx_timer, n);
     n->tx_waiting = 0;
     if (txtimer) {
+        n->tx_timer = qemu_new_timer(vm_clock, virtio_net_tx_timer, n);
         if (txtimer == 1) {
             /* For convenience, 1 = "on" = predefined default, anything else
              * specifies an actual timeout value */
@@ -949,6 +998,8 @@  VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf,
         } else {
             n->tx_timeout = txtimer;
         }
+    } else {
+        n->tx_bh = qemu_bh_new(virtio_net_tx_bh, n);
     }
     n->tx_burst = txburst;
     n->mergeable_rx_bufs = 0;
@@ -982,8 +1033,12 @@  void virtio_net_exit(VirtIODevice *vdev)
     qemu_free(n->mac_table.macs);
     qemu_free(n->vlans);
 
-    qemu_del_timer(n->tx_timer);
-    qemu_free_timer(n->tx_timer);
+    if (n->tx_timer) {
+        qemu_del_timer(n->tx_timer);
+        qemu_free_timer(n->tx_timer);
+    } else {
+        qemu_bh_delete(n->tx_bh);
+    }
 
     virtio_cleanup(&n->vdev);
     qemu_del_vlan_client(&n->nic->nc);