
[v2] e1000e: using bottom half to send packets

Message ID 20200720164535.76559-1-liq3ea@163.com
State New
Series [v2] e1000e: using bottom half to send packets

Commit Message

Li Qiang July 20, 2020, 4:45 p.m. UTC
Alexander Bulekov reported a UAF bug related to e1000e packet transmission.

-->https://bugs.launchpad.net/qemu/+bug/1886362

This happens because the guest can trigger an e1000e packet transmission
whose data address points into e1000e's own MMIO region. When the e1000e
then performs DMA, it writes that MMIO region again, triggering
re-entrancy and finally causing the UAF.

Paolo suggested using a bottom half whenever MMIO is doing complicated
things, here:
-->https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg03342.html

Reference here:
'The easiest solution is to delay processing of descriptors to a bottom
half whenever MMIO is doing something complicated.  This is also better
for latency because it will free the vCPU thread more quickly and leave
the work to the I/O thread.'

This patch fixes this UAF.

Signed-off-by: Li Qiang <liq3ea@163.com>
---
Changes since v1:
Per Jason's review here:
-- https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg05368.html
1. Cancel and schedule the tx bh when the VM is stopped or resumed
2. Add a tx_burst property for e1000e to throttle the bh execution
3. Add a tx_waiting flag to record whether the bh is pending or not
Don't take the BQL in the tx_bh handler: by the time tx_bh runs, the BQL
is already acquired.

 hw/net/e1000e.c      |   6 +++
 hw/net/e1000e_core.c | 106 ++++++++++++++++++++++++++++++++++---------
 hw/net/e1000e_core.h |   8 ++++
 3 files changed, 98 insertions(+), 22 deletions(-)
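
The fix follows the usual "defer MMIO-triggered work to a bottom half"
pattern. A minimal sketch of the pattern itself (illustrative only: the
MyDeviceState type and my_device_* names are hypothetical, not part of
this patch):

    #include "qemu/osdep.h"
    #include "qemu/main-loop.h" /* QEMUBH, qemu_bh_new(), qemu_bh_schedule() */

    typedef struct MyDeviceState {
        QEMUBH *tx_bh;
        /* ... registers, rings ... */
    } MyDeviceState;

    static void my_device_tx_bh(void *opaque)
    {
        MyDeviceState *s = opaque;
        /* Runs later in the I/O thread, outside the MMIO dispatch path,
         * so a descriptor whose buffer address points back into our own
         * MMIO region can no longer re-enter the handler mid-execution. */
        /* ... walk the tx ring, DMA the payloads, raise the interrupt ... */
    }

    static void my_device_mmio_write(void *opaque, hwaddr addr,
                                     uint64_t val, unsigned size)
    {
        MyDeviceState *s = opaque;
        /* Record the doorbell write and defer the heavy lifting. */
        qemu_bh_schedule(s->tx_bh);
    }

    /* at realize time: s->tx_bh = qemu_bh_new(my_device_tx_bh, s); */

The diff below also adds an x-txburst property, so the burst limit should
presumably be tunable on the command line, e.g.
'-device e1000e,x-txburst=128'.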

Comments

Jason Wang July 21, 2020, 2:03 a.m. UTC | #1
On 2020/7/21 12:45 AM, Li Qiang wrote:
> Alexander Bulekov reported a UAF bug related to e1000e packet transmission.
[...]
>
> diff --git a/hw/net/e1000e.c b/hw/net/e1000e.c
> index fda34518c9..24e35a78bf 100644
> --- a/hw/net/e1000e.c
> +++ b/hw/net/e1000e.c
> @@ -77,10 +77,14 @@ typedef struct E1000EState {
>   
>       bool disable_vnet;
>   
> +    int32_t tx_burst;
> +
>       E1000ECore core;
>   
>   } E1000EState;
>   
> +#define TX_BURST 256
> +
>   #define E1000E_MMIO_IDX     0
>   #define E1000E_FLASH_IDX    1
>   #define E1000E_IO_IDX       2
> @@ -263,6 +267,7 @@ static void e1000e_core_realize(E1000EState *s)
>   {
>       s->core.owner = &s->parent_obj;
>       s->core.owner_nic = s->nic;
> +    s->core.tx_burst = s->tx_burst;
>   }
>   
>   static void
> @@ -665,6 +670,7 @@ static Property e1000e_properties[] = {
>                           e1000e_prop_subsys_ven, uint16_t),
>       DEFINE_PROP_SIGNED("subsys", E1000EState, subsys, 0,
>                           e1000e_prop_subsys, uint16_t),
> +    DEFINE_PROP_INT32("x-txburst", E1000EState, tx_burst, TX_BURST),
>       DEFINE_PROP_END_OF_LIST(),
>   };
>   
> diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c
> index bcd186cac5..0a38a50cca 100644
> --- a/hw/net/e1000e_core.c
> +++ b/hw/net/e1000e_core.c
> @@ -909,19 +909,18 @@ e1000e_rx_ring_init(E1000ECore *core, E1000E_RxRing *rxr, int idx)
>       rxr->i      = &i[idx];
>   }
>   
> -static void
> -e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr)
> +static int32_t
> +e1000e_start_xmit(struct e1000e_tx *q)
>   {
> +    E1000ECore *core = q->core;
>       dma_addr_t base;
>       struct e1000_tx_desc desc;
> -    bool ide = false;
> -    const E1000E_RingInfo *txi = txr->i;
> -    uint32_t cause = E1000_ICS_TXQE;
> +    const E1000E_RingInfo *txi;
> +    E1000E_TxRing txr;
> +    int32_t num_packets = 0;
>   
> -    if (!(core->mac[TCTL] & E1000_TCTL_EN)) {
> -        trace_e1000e_tx_disabled();
> -        return;
> -    }
> +    e1000e_tx_ring_init(core, &txr, q - &core->tx[0]);
> +    txi = txr.i;
>   
>       while (!e1000e_ring_empty(core, txi)) {
>           base = e1000e_ring_head_descr(core, txi);
> @@ -931,15 +930,17 @@ e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr)
>           trace_e1000e_tx_descr((void *)(intptr_t)desc.buffer_addr,
>                                 desc.lower.data, desc.upper.data);
>   
> -        e1000e_process_tx_desc(core, txr->tx, &desc, txi->idx);
> -        cause |= e1000e_txdesc_writeback(core, base, &desc, &ide, txi->idx);
> +        e1000e_process_tx_desc(core, txr.tx, &desc, txi->idx);
> +        q->cause |= e1000e_txdesc_writeback(core, base, &desc,
> +                                            &q->ide, txi->idx);
>   
>           e1000e_ring_advance(core, txi, 1);
> +        if (++num_packets >= core->tx_burst) {
> +            break;
> +        }
>       }
>   
> -    if (!ide || !e1000e_intrmgr_delay_tx_causes(core, &cause)) {
> -        e1000e_set_interrupt_cause(core, cause);
> -    }
> +    return num_packets;
>   }
>   
>   static bool
> @@ -2423,32 +2424,41 @@ e1000e_set_dbal(E1000ECore *core, int index, uint32_t val)
>   static void
>   e1000e_set_tctl(E1000ECore *core, int index, uint32_t val)
>   {
> -    E1000E_TxRing txr;
>       core->mac[index] = val;
>   
>       if (core->mac[TARC0] & E1000_TARC_ENABLE) {
> -        e1000e_tx_ring_init(core, &txr, 0);
> -        e1000e_start_xmit(core, &txr);
> +        if (core->tx[0].tx_waiting) {
> +            return;
> +        }
> +        core->tx[0].tx_waiting = 1;
> +        if (!core->vm_running) {
> +            return;
> +        }
> +        qemu_bh_schedule(core->tx[0].tx_bh);
>       }
>   
>       if (core->mac[TARC1] & E1000_TARC_ENABLE) {
> -        e1000e_tx_ring_init(core, &txr, 1);
> -        e1000e_start_xmit(core, &txr);
> +        if (core->tx[1].tx_waiting) {
> +            return;
> +        }
> +        core->tx[1].tx_waiting = 1;
> +        if (!core->vm_running) {
> +            return;
> +        }
> +        qemu_bh_schedule(core->tx[1].tx_bh);
>       }
>   }
>   
>   static void
>   e1000e_set_tdt(E1000ECore *core, int index, uint32_t val)
>   {
> -    E1000E_TxRing txr;
>       int qidx = e1000e_mq_queue_idx(TDT, index);
>       uint32_t tarc_reg = (qidx == 0) ? TARC0 : TARC1;
>   
>       core->mac[index] = val & 0xffff;
>   
>       if (core->mac[tarc_reg] & E1000_TARC_ENABLE) {
> -        e1000e_tx_ring_init(core, &txr, qidx);
> -        e1000e_start_xmit(core, &txr);
> +        qemu_bh_schedule(core->tx[qidx].tx_bh);
>       }
>   }
>   
> @@ -3315,10 +3325,56 @@ e1000e_vm_state_change(void *opaque, int running, RunState state)
>           trace_e1000e_vm_state_running();
>           e1000e_intrmgr_resume(core);
>           e1000e_autoneg_resume(core);
> +        core->vm_running = 1;
> +
> +        for (int i = 0; i < E1000E_NUM_QUEUES; i++) {
> +            qemu_bh_schedule(core->tx[i].tx_bh);


I guess the reason for the unconditional scheduling of the bh is to make
sure tx works after live migration, since we don't migrate tx_waiting.

If yes, better add a comment here. And do we need to clear tx_waiting here?
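
(A sketch of such a comment, on top of the v2 hunk quoted above; the
wording is only a suggestion:)

    for (int i = 0; i < E1000E_NUM_QUEUES; i++) {
        /* tx_waiting is not migrated, so schedule the bh unconditionally
         * to resume any tx that was pending when the VM was migrated or
         * stopped. */
        qemu_bh_schedule(core->tx[i].tx_bh);
    }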


> +        }
> +
>       } else {
>           trace_e1000e_vm_state_stopped();
> +
> +        for (int i = 0; i < E1000E_NUM_QUEUES; i++) {
> +            qemu_bh_cancel(core->tx[i].tx_bh);
> +        }
> +
>           e1000e_autoneg_pause(core);
>           e1000e_intrmgr_pause(core);
> +        core->vm_running = 0;
> +    }
> +}
> +
> +
> +static void e1000e_core_tx_bh(void *opaque)
> +{
> +    struct e1000e_tx *q = opaque;
> +    E1000ECore *core = q->core;
> +    int32_t ret;
> +
> +    if (!core->vm_running) {
> +        assert(q->tx_waiting);
> +        return;
> +    }
> +
> +    q->tx_waiting = 0;
> +
> +    if (!(core->mac[TCTL] & E1000_TCTL_EN)) {
> +        trace_e1000e_tx_disabled();
> +        return;
> +    }
> +
> +    q->cause = E1000_ICS_TXQE;
> +    q->ide = false;
> +
> +    ret = e1000e_start_xmit(q);
> +    if (ret >= core->tx_burst) {
> +        qemu_bh_schedule(q->tx_bh);
> +        q->tx_waiting = 1;
> +        return;
> +    }
> +
> +    if (!q->ide || !e1000e_intrmgr_delay_tx_causes(core, &q->cause)) {
> +        e1000e_set_interrupt_cause(core, q->cause);


So I think this will cause some delay in interrupt delivery. It looks
like it's better to leave setting the ICS in e1000e_start_xmit().


>       }
>   }
>   
> @@ -3334,12 +3390,15 @@ e1000e_core_pci_realize(E1000ECore     *core,
>                                          e1000e_autoneg_timer, core);
>       e1000e_intrmgr_pci_realize(core);
>   
> +    core->vm_running = runstate_is_running();
>       core->vmstate =
>           qemu_add_vm_change_state_handler(e1000e_vm_state_change, core);
>   
>       for (i = 0; i < E1000E_NUM_QUEUES; i++) {
>           net_tx_pkt_init(&core->tx[i].tx_pkt, core->owner,
>                           E1000E_MAX_TX_FRAGS, core->has_vnet);
> +        core->tx[i].core = core;
> +        core->tx[i].tx_bh = qemu_bh_new(e1000e_core_tx_bh, &core->tx[i]);
>       }
>   
>       net_rx_pkt_init(&core->rx_pkt, core->has_vnet);
> @@ -3367,6 +3426,9 @@ e1000e_core_pci_uninit(E1000ECore *core)
>       for (i = 0; i < E1000E_NUM_QUEUES; i++) {
>           net_tx_pkt_reset(core->tx[i].tx_pkt);
>           net_tx_pkt_uninit(core->tx[i].tx_pkt);
> +        qemu_bh_cancel(core->tx[i].tx_bh);
> +        qemu_bh_delete(core->tx[i].tx_bh);
> +        core->tx[i].tx_bh = NULL;
>       }
>   
>       net_rx_pkt_uninit(core->rx_pkt);
> diff --git a/hw/net/e1000e_core.h b/hw/net/e1000e_core.h
> index aee32f7e48..0c16dce3a6 100644
> --- a/hw/net/e1000e_core.h
> +++ b/hw/net/e1000e_core.h
> @@ -77,10 +77,18 @@ struct E1000Core {
>           unsigned char sum_needed;
>           bool cptse;
>           struct NetTxPkt *tx_pkt;
> +        QEMUBH *tx_bh;
> +        uint32_t tx_waiting;
> +        uint32_t cause;
> +        bool ide;
> +        E1000ECore *core;
>       } tx[E1000E_NUM_QUEUES];
>   
>       struct NetRxPkt *rx_pkt;
>   
> +    int32_t tx_burst;
> +
> +    bool vm_running;
>       bool has_vnet;
>       int max_queue_num;
>   


Do we need to cancel the bh and reset tx_waiting in e1000e_core_reset()?

Thanks
Li Qiang July 21, 2020, 4:33 a.m. UTC | #2
Jason Wang <jasowang@redhat.com> wrote on Tue, Jul 21, 2020 at 10:03 AM:
>
> On 2020/7/21 12:45 AM, Li Qiang wrote:
[...]
> > @@ -3315,10 +3325,56 @@ e1000e_vm_state_change(void *opaque, int running, RunState state)
> >           trace_e1000e_vm_state_running();
> >           e1000e_intrmgr_resume(core);
> >           e1000e_autoneg_resume(core);
> > +        core->vm_running = 1;
> > +
> > +        for (int i = 0; i < E1000E_NUM_QUEUES; i++) {
> > +            qemu_bh_schedule(core->tx[i].tx_bh);
>
>
> I guess the reason for the unconditional scheduling of the bh is to make
> sure tx works after live migration, since we don't migrate tx_waiting.
>
> If yes, better add a comment here.

Ok will do in next revision.

> And do we need to clear tx_waiting here?

I think there is no need, as the tx_bh handler will do this.

[...]
> > +    q->cause = E1000_ICS_TXQE;
> > +    q->ide = false;
> > +
> > +    ret = e1000e_start_xmit(q);
> > +    if (ret >= core->tx_burst) {
> > +        qemu_bh_schedule(q->tx_bh);
> > +        q->tx_waiting = 1;
> > +        return;
> > +    }
> > +
> > +    if (!q->ide || !e1000e_intrmgr_delay_tx_causes(core, &q->cause)) {
> > +        e1000e_set_interrupt_cause(core, q->cause);
>
>
> So I think this will cause some delay in interrupt delivery.

Exactly: if a tx burst occurs, the interrupt won't be triggered in the
first tx_bh run.
As we give the vCPU more time, maybe this is acceptable?

> It looks like it's better to leave setting the ICS in e1000e_start_xmit().

I think letting e1000e_start_xmit just send packets is cleaner.
Leaving the ICS setting in it will not improve performance as far as I can see.
What's your idea here for leaving it in e1000e_start_xmit?

[...]
> Do we need to cancel the bh and reset tx_waiting in e1000e_core_reset()?

I think we do. But why doesn't virtio-net do this? Maybe it is also
missing this?


Thanks,
Li Qiang

Jason Wang July 21, 2020, 5:30 a.m. UTC | #3
On 2020/7/21 12:33 PM, Li Qiang wrote:
> Jason Wang <jasowang@redhat.com> wrote on Tue, Jul 21, 2020 at 10:03 AM:
>> On 2020/7/21 12:45 AM, Li Qiang wrote:
[...]
>>
>> I guess the reason for the unconditional scheduling of the bh is to make
>> sure tx works after live migration, since we don't migrate tx_waiting.
>>
>> If yes, better add a comment here.
> Ok will do in next revision.
>
> And do we need to clear tx_waiting here?
>
> I think there is no need, as the tx_bh handler will do this.


Yes.


[...]
>>> +    q->cause = E1000_ICS_TXQE;
>>> +    q->ide = false;
>>> +
>>> +    ret = e1000e_start_xmit(q);
>>> +    if (ret >= core->tx_burst) {
>>> +        qemu_bh_schedule(q->tx_bh);
>>> +        q->tx_waiting = 1;
>>> +        return;
>>> +    }
>>> +
>>> +    if (!q->ide || !e1000e_intrmgr_delay_tx_causes(core, &q->cause)) {
>>> +        e1000e_set_interrupt_cause(core, q->cause);
>>
>> So I think this will cause some delay in interrupt delivery.
> Exactly: if a tx burst occurs, the interrupt won't be triggered in the
> first tx_bh run.
> As we give the vCPU more time, maybe this is acceptable?


I think not; e1000e has its own interrupt delay mechanism, let's keep it
working as much as possible.


>
>> It looks like it's better to leave setting the ICS in e1000e_start_xmit().
> I think letting e1000e_start_xmit just send packets is cleaner.
> Leaving the ICS setting in it will not improve performance as far as I can see.
> What's your idea here for leaving it in e1000e_start_xmit?


I just worry about the delay of the interrupt if you do it in
e1000e_core_tx_bh(); I'd rather let e1000e_start_xmit() return the number
of packets transmitted and keep most of its code untouched.


[...]
>> Do we need to cancel the bh and reset tx_waiting in e1000e_core_reset()?
> I think we do. But why doesn't virtio-net do this? Maybe it is also
> missing this?


Ok, so I think the current code is probably OK.

virtio-net checks DRIVER_OK and the code here checks TCTL_EN.

Thanks


Li Qiang July 21, 2020, 5:59 a.m. UTC | #4
Jason Wang <jasowang@redhat.com> wrote on Tue, Jul 21, 2020 at 1:30 PM:
>
> On 2020/7/21 12:33 PM, Li Qiang wrote:
[...]
> >>
> >> I guess the reason for the unconditional scheduling of the bh is to make
> >> sure tx works after live migration, since we don't migrate tx_waiting.
> >>
> >> If yes, better add a comment here.
> > Ok will do in next revision.
> >
> > And do we need to clear tx_waiting here?
> >
> > I think there is no need, as the tx_bh handler will do this.
>
>
> Yes.
[...]
> >>> +    if (!q->ide || !e1000e_intrmgr_delay_tx_causes(core, &q->cause)) {
> >>> +        e1000e_set_interrupt_cause(core, q->cause);
> >>
> >> So I think this will cause some delay in interrupt delivery.
> > Exactly: if a tx burst occurs, the interrupt won't be triggered in the
> > first tx_bh run.
> > As we give the vCPU more time, maybe this is acceptable?
>
> I think not; e1000e has its own interrupt delay mechanism, let's keep it
> working as much as possible.
>
> >> It looks like it's better to leave setting the ICS in e1000e_start_xmit().
> > I think letting e1000e_start_xmit just send packets is cleaner.
> > Leaving the ICS setting in it will not improve performance as far as I can see.
> > What's your idea here for leaving it in e1000e_start_xmit?
>
> I just worry about the delay of the interrupt if you do it in
> e1000e_core_tx_bh(); I'd rather let e1000e_start_xmit() return the number
> of packets transmitted and keep most of its code untouched.

Sorry, let me make sure I got your meaning.
'Set the ICS' means 'set the interrupt cause', i.e. this code:

+    if (!q->ide || !e1000e_intrmgr_delay_tx_causes(core, &q->cause)) {
+        e1000e_set_interrupt_cause(core, q->cause);

Do you mean we should leave it in 'e1000e_start_xmit'? That is my
understanding.
If so, we should also move the following code into it:

+    if (ret >= core->tx_burst) {
+        qemu_bh_schedule(q->tx_bh);
+        q->tx_waiting = 1;
+        return;
+    }

Then the tx_bh handler would just call 'e1000e_start_xmit'. Right?
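
Roughly, I imagine the v3 shape like this (a sketch on top of the v2 code
quoted above, not the final patch; the descriptor read elided from the
quoted hunk stays as in the current code):

    static void
    e1000e_start_xmit(struct e1000e_tx *q)
    {
        E1000ECore *core = q->core;
        dma_addr_t base;
        struct e1000_tx_desc desc;
        const E1000E_RingInfo *txi;
        E1000E_TxRing txr;
        int32_t num_packets = 0;

        e1000e_tx_ring_init(core, &txr, q - &core->tx[0]);
        txi = txr.i;

        while (!e1000e_ring_empty(core, txi)) {
            base = e1000e_ring_head_descr(core, txi);
            /* ... read 'desc' from guest memory, as in the current code ... */
            e1000e_process_tx_desc(core, txr.tx, &desc, txi->idx);
            q->cause |= e1000e_txdesc_writeback(core, base, &desc,
                                                &q->ide, txi->idx);
            e1000e_ring_advance(core, txi, 1);
            if (++num_packets >= core->tx_burst) {
                /* Yield; continue in the next bh run. */
                q->tx_waiting = 1;
                qemu_bh_schedule(q->tx_bh);
                return;
            }
        }

        /* Interrupt moderation stays next to the transmit loop, so the
         * device's own interrupt-delay mechanism keeps working. */
        if (!q->ide || !e1000e_intrmgr_delay_tx_causes(core, &q->cause)) {
            e1000e_set_interrupt_cause(core, q->cause);
        }
    }

e1000e_core_tx_bh() would then keep only the vm_running/TCTL_EN checks,
the cause/ide initialization, and the call to e1000e_start_xmit(q).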



[...]
> >> Do we need to cancel the bh and reset tx_waiting in e1000e_core_reset()?
> > I think we do. But why doesn't virtio-net do this? Maybe it is also
> > missing this?
>
>
> Ok, so I think the current code is probably OK.

So my next revision will do the following:
1. Add a comment for the tx_bh schedule when the VM resumes.
2. Move some code back into 'e1000e_start_xmit'.
3. Cancel the tx_bh and reset tx_waiting in e1000e_core_reset (sketch below).

If I lost or misunderstood anything, please let me know.
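
(Item 3 might look roughly like this — a sketch on top of v2, assuming
e1000e_core_reset() keeps its existing shape:)

    static void e1000e_core_reset(E1000ECore *core)   /* existing function */
    {
        int i;

        /* ... existing reset work ... */

        for (i = 0; i < E1000E_NUM_QUEUES; i++) {
            /* Drop any deferred transmit; the guest has to re-enable
             * tx after reset anyway. */
            qemu_bh_cancel(core->tx[i].tx_bh);
            core->tx[i].tx_waiting = 0;
        }
    }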

Thanks,
Li Qiang

>
> virtio-net checks DRIVER_OK and the code here checks TCTL_EN.
>
> Thanks
>
Jason Wang July 21, 2020, 6:05 a.m. UTC | #5
On 2020/7/21 1:59 PM, Li Qiang wrote:
> Jason Wang <jasowang@redhat.com> wrote on Tue, Jul 21, 2020 at 1:30 PM:
>>
>> On 2020/7/21 12:33 PM, Li Qiang wrote:
[...]
>> I just worry about the delay of the interrupt if you do it in
>> e1000e_core_tx_bh(); I'd rather let e1000e_start_xmit() return the number
>> of packets transmitted and keep most of its code untouched.
> Sorry here let's make sure I got your meaning.
> 'The set ics' is meaning 'set interrupt cause' , in code:
>
> +    if (!q->ide || !e1000e_intrmgr_delay_tx_causes(core, &q->cause)) {
> +        e1000e_set_interrupt_cause(core, q->cause);
>
> Do you mean we should leave it in the 'e1000e_start_xmit'? My
> understanding is this.
> if it is this, we should also add following code in it:
>
> +    if (ret >= core->tx_burst) {
> +        qemu_bh_schedule(q->tx_bh);
> +        q->tx_waiting = 1;
> +        return;
> +    }
>
> Then the tx_bh handler would just call e1000e_start_xmit. Right?


This should work.
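
For reference, here is a minimal sketch of the shape agreed on above
(hypothetical v3 code, not part of this v2 patch). It reuses the helpers
from the hunks below and keeps the ICS handling inside
e1000e_start_xmit(), so the burst rescheduling cannot delay the
interrupt:

static int32_t
e1000e_start_xmit(struct e1000e_tx *q)
{
    E1000ECore *core = q->core;
    uint32_t cause = E1000_ICS_TXQE;
    bool ide = false;
    int32_t num_packets = 0;
    const E1000E_RingInfo *txi;
    E1000E_TxRing txr;

    e1000e_tx_ring_init(core, &txr, q - &core->tx[0]);
    txi = txr.i;

    while (!e1000e_ring_empty(core, txi)) {
        /* ... fetch, trace and process one descriptor, updating cause
         * and ide via e1000e_txdesc_writeback(), as in the hunk below ... */
        e1000e_ring_advance(core, txi, 1);
        if (++num_packets >= core->tx_burst) {
            /* Burst limit hit: defer the rest to the next bh run,
             * but still raise the interrupt for what was sent. */
            q->tx_waiting = 1;
            qemu_bh_schedule(q->tx_bh);
            break;
        }
    }

    /* The ICS handling stays here, so the existing interrupt delay
     * mechanism keeps working per transmitted batch. */
    if (!ide || !e1000e_intrmgr_delay_tx_causes(core, &cause)) {
        e1000e_set_interrupt_cause(core, cause);
    }

    return num_packets;
}

With this shape, e1000e_core_tx_bh() shrinks to the vm_running and
TCTL_EN checks plus a single e1000e_start_xmit() call, and the cause and
ide fields can drop back out of struct e1000e_tx.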


>
>
>
>>
>>>>>         }
>>>>>     }
>>>>>
>>>>> @@ -3334,12 +3390,15 @@ e1000e_core_pci_realize(E1000ECore     *core,
>>>>>                                            e1000e_autoneg_timer, core);
>>>>>         e1000e_intrmgr_pci_realize(core);
>>>>>
>>>>> +    core->vm_running = runstate_is_running();
>>>>>         core->vmstate =
>>>>>             qemu_add_vm_change_state_handler(e1000e_vm_state_change, core);
>>>>>
>>>>>         for (i = 0; i < E1000E_NUM_QUEUES; i++) {
>>>>>             net_tx_pkt_init(&core->tx[i].tx_pkt, core->owner,
>>>>>                             E1000E_MAX_TX_FRAGS, core->has_vnet);
>>>>> +        core->tx[i].core = core;
>>>>> +        core->tx[i].tx_bh = qemu_bh_new(e1000e_core_tx_bh, &core->tx[i]);
>>>>>         }
>>>>>
>>>>>         net_rx_pkt_init(&core->rx_pkt, core->has_vnet);
>>>>> @@ -3367,6 +3426,9 @@ e1000e_core_pci_uninit(E1000ECore *core)
>>>>>         for (i = 0; i < E1000E_NUM_QUEUES; i++) {
>>>>>             net_tx_pkt_reset(core->tx[i].tx_pkt);
>>>>>             net_tx_pkt_uninit(core->tx[i].tx_pkt);
>>>>> +        qemu_bh_cancel(core->tx[i].tx_bh);
>>>>> +        qemu_bh_delete(core->tx[i].tx_bh);
>>>>> +        core->tx[i].tx_bh = NULL;
>>>>>         }
>>>>>
>>>>>         net_rx_pkt_uninit(core->rx_pkt);
>>>>> diff --git a/hw/net/e1000e_core.h b/hw/net/e1000e_core.h
>>>>> index aee32f7e48..0c16dce3a6 100644
>>>>> --- a/hw/net/e1000e_core.h
>>>>> +++ b/hw/net/e1000e_core.h
>>>>> @@ -77,10 +77,18 @@ struct E1000Core {
>>>>>             unsigned char sum_needed;
>>>>>             bool cptse;
>>>>>             struct NetTxPkt *tx_pkt;
>>>>> +        QEMUBH *tx_bh;
>>>>> +        uint32_t tx_waiting;
>>>>> +        uint32_t cause;
>>>>> +        bool ide;
>>>>> +        E1000ECore *core;
>>>>>         } tx[E1000E_NUM_QUEUES];
>>>>>
>>>>>         struct NetRxPkt *rx_pkt;
>>>>>
>>>>> +    int32_t tx_burst;
>>>>> +
>>>>> +    bool vm_running;
>>>>>         bool has_vnet;
>>>>>         int max_queue_num;
>>>>>
>>>> Do we need to cancel the bh and reset tx_waiting in e1000e_core_reset()?
>>> I think we need to. But why doesn't virtio-net do this? Maybe it is
>>> also missing this?
>>
>> Ok, so I think the current code is probably OK.
> So my next revision will do the following:
> 1. Add a comment explaining the tx_bh scheduling on VM resume.
> 2. Keep the ICS setting and burst rescheduling in e1000e_start_xmit.
> 3. Cancel the tx_bh and reset tx_waiting in e1000e_core_reset.
>
> If I missed or misunderstood anything, please let me know.


Your understanding is correct.

Thanks


>
> Thanks,
> Li Qiang
>
>> virtio-net checks DRIVER_OK and the code here checks TCTL_EN.
>>
>> Thanks
>>
>>
>>> Thanks,
>>> Li Qiang
>>>
>>>> Thanks
>>>>

Patch

diff --git a/hw/net/e1000e.c b/hw/net/e1000e.c
index fda34518c9..24e35a78bf 100644
--- a/hw/net/e1000e.c
+++ b/hw/net/e1000e.c
@@ -77,10 +77,14 @@  typedef struct E1000EState {
 
     bool disable_vnet;
 
+    int32_t tx_burst;
+
     E1000ECore core;
 
 } E1000EState;
 
+#define TX_BURST 256
+
 #define E1000E_MMIO_IDX     0
 #define E1000E_FLASH_IDX    1
 #define E1000E_IO_IDX       2
@@ -263,6 +267,7 @@  static void e1000e_core_realize(E1000EState *s)
 {
     s->core.owner = &s->parent_obj;
     s->core.owner_nic = s->nic;
+    s->core.tx_burst = s->tx_burst;
 }
 
 static void
@@ -665,6 +670,7 @@  static Property e1000e_properties[] = {
                         e1000e_prop_subsys_ven, uint16_t),
     DEFINE_PROP_SIGNED("subsys", E1000EState, subsys, 0,
                         e1000e_prop_subsys, uint16_t),
+    DEFINE_PROP_INT32("x-txburst", E1000EState, tx_burst, TX_BURST),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c
index bcd186cac5..0a38a50cca 100644
--- a/hw/net/e1000e_core.c
+++ b/hw/net/e1000e_core.c
@@ -909,19 +909,18 @@  e1000e_rx_ring_init(E1000ECore *core, E1000E_RxRing *rxr, int idx)
     rxr->i      = &i[idx];
 }
 
-static void
-e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr)
+static int32_t
+e1000e_start_xmit(struct e1000e_tx *q)
 {
+    E1000ECore *core = q->core;
     dma_addr_t base;
     struct e1000_tx_desc desc;
-    bool ide = false;
-    const E1000E_RingInfo *txi = txr->i;
-    uint32_t cause = E1000_ICS_TXQE;
+    const E1000E_RingInfo *txi;
+    E1000E_TxRing txr;
+    int32_t num_packets = 0;
 
-    if (!(core->mac[TCTL] & E1000_TCTL_EN)) {
-        trace_e1000e_tx_disabled();
-        return;
-    }
+    e1000e_tx_ring_init(core, &txr, q - &core->tx[0]);
+    txi = txr.i;
 
     while (!e1000e_ring_empty(core, txi)) {
         base = e1000e_ring_head_descr(core, txi);
@@ -931,15 +930,17 @@  e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr)
         trace_e1000e_tx_descr((void *)(intptr_t)desc.buffer_addr,
                               desc.lower.data, desc.upper.data);
 
-        e1000e_process_tx_desc(core, txr->tx, &desc, txi->idx);
-        cause |= e1000e_txdesc_writeback(core, base, &desc, &ide, txi->idx);
+        e1000e_process_tx_desc(core, txr.tx, &desc, txi->idx);
+        q->cause |= e1000e_txdesc_writeback(core, base, &desc,
+                                            &q->ide, txi->idx);
 
         e1000e_ring_advance(core, txi, 1);
+        if (++num_packets >= core->tx_burst) {
+            break;
+        }
     }
 
-    if (!ide || !e1000e_intrmgr_delay_tx_causes(core, &cause)) {
-        e1000e_set_interrupt_cause(core, cause);
-    }
+    return num_packets;
 }
 
 static bool
@@ -2423,32 +2424,41 @@  e1000e_set_dbal(E1000ECore *core, int index, uint32_t val)
 static void
 e1000e_set_tctl(E1000ECore *core, int index, uint32_t val)
 {
-    E1000E_TxRing txr;
     core->mac[index] = val;
 
     if (core->mac[TARC0] & E1000_TARC_ENABLE) {
-        e1000e_tx_ring_init(core, &txr, 0);
-        e1000e_start_xmit(core, &txr);
+        if (core->tx[0].tx_waiting) {
+            return;
+        }
+        core->tx[0].tx_waiting = 1;
+        if (!core->vm_running) {
+            return;
+        }
+        qemu_bh_schedule(core->tx[0].tx_bh);
     }
 
     if (core->mac[TARC1] & E1000_TARC_ENABLE) {
-        e1000e_tx_ring_init(core, &txr, 1);
-        e1000e_start_xmit(core, &txr);
+        if (core->tx[1].tx_waiting) {
+            return;
+        }
+        core->tx[1].tx_waiting = 1;
+        if (!core->vm_running) {
+            return;
+        }
+        qemu_bh_schedule(core->tx[1].tx_bh);
     }
 }
 
 static void
 e1000e_set_tdt(E1000ECore *core, int index, uint32_t val)
 {
-    E1000E_TxRing txr;
     int qidx = e1000e_mq_queue_idx(TDT, index);
     uint32_t tarc_reg = (qidx == 0) ? TARC0 : TARC1;
 
     core->mac[index] = val & 0xffff;
 
     if (core->mac[tarc_reg] & E1000_TARC_ENABLE) {
-        e1000e_tx_ring_init(core, &txr, qidx);
-        e1000e_start_xmit(core, &txr);
+        qemu_bh_schedule(core->tx[qidx].tx_bh);
     }
 }
 
@@ -3315,10 +3325,56 @@  e1000e_vm_state_change(void *opaque, int running, RunState state)
         trace_e1000e_vm_state_running();
         e1000e_intrmgr_resume(core);
         e1000e_autoneg_resume(core);
+        core->vm_running = 1;
+
+        for (int i = 0; i < E1000E_NUM_QUEUES; i++) {
+            qemu_bh_schedule(core->tx[i].tx_bh);
+        }
+
     } else {
         trace_e1000e_vm_state_stopped();
+
+        for (int i = 0; i < E1000E_NUM_QUEUES; i++) {
+            qemu_bh_cancel(core->tx[i].tx_bh);
+        }
+
         e1000e_autoneg_pause(core);
         e1000e_intrmgr_pause(core);
+        core->vm_running = 0;
+    }
+}
+
+
+static void e1000e_core_tx_bh(void *opaque)
+{
+    struct e1000e_tx *q = opaque;
+    E1000ECore *core = q->core;
+    int32_t ret;
+
+    if (!core->vm_running) {
+        assert(q->tx_waiting);
+        return;
+    }
+
+    q->tx_waiting = 0;
+
+    if (!(core->mac[TCTL] & E1000_TCTL_EN)) {
+        trace_e1000e_tx_disabled();
+        return;
+    }
+
+    q->cause = E1000_ICS_TXQE;
+    q->ide = false;
+
+    ret = e1000e_start_xmit(q);
+    if (ret >= core->tx_burst) {
+        qemu_bh_schedule(q->tx_bh);
+        q->tx_waiting = 1;
+        return;
+    }
+
+    if (!q->ide || !e1000e_intrmgr_delay_tx_causes(core, &q->cause)) {
+        e1000e_set_interrupt_cause(core, q->cause);
     }
 }
 
@@ -3334,12 +3390,15 @@  e1000e_core_pci_realize(E1000ECore     *core,
                                        e1000e_autoneg_timer, core);
     e1000e_intrmgr_pci_realize(core);
 
+    core->vm_running = runstate_is_running();
     core->vmstate =
         qemu_add_vm_change_state_handler(e1000e_vm_state_change, core);
 
     for (i = 0; i < E1000E_NUM_QUEUES; i++) {
         net_tx_pkt_init(&core->tx[i].tx_pkt, core->owner,
                         E1000E_MAX_TX_FRAGS, core->has_vnet);
+        core->tx[i].core = core;
+        core->tx[i].tx_bh = qemu_bh_new(e1000e_core_tx_bh, &core->tx[i]);
     }
 
     net_rx_pkt_init(&core->rx_pkt, core->has_vnet);
@@ -3367,6 +3426,9 @@  e1000e_core_pci_uninit(E1000ECore *core)
     for (i = 0; i < E1000E_NUM_QUEUES; i++) {
         net_tx_pkt_reset(core->tx[i].tx_pkt);
         net_tx_pkt_uninit(core->tx[i].tx_pkt);
+        qemu_bh_cancel(core->tx[i].tx_bh);
+        qemu_bh_delete(core->tx[i].tx_bh);
+        core->tx[i].tx_bh = NULL;
     }
 
     net_rx_pkt_uninit(core->rx_pkt);
diff --git a/hw/net/e1000e_core.h b/hw/net/e1000e_core.h
index aee32f7e48..0c16dce3a6 100644
--- a/hw/net/e1000e_core.h
+++ b/hw/net/e1000e_core.h
@@ -77,10 +77,18 @@  struct E1000Core {
         unsigned char sum_needed;
         bool cptse;
         struct NetTxPkt *tx_pkt;
+        QEMUBH *tx_bh;
+        uint32_t tx_waiting;
+        uint32_t cause;
+        bool ide;
+        E1000ECore *core;
     } tx[E1000E_NUM_QUEUES];
 
     struct NetRxPkt *rx_pkt;
 
+    int32_t tx_burst;
+
+    bool vm_running;
     bool has_vnet;
     int max_queue_num;
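
As a usage note, x-txburst is registered as an ordinary qdev property
above, so the burst limit should be tunable per device. A hypothetical
invocation (the netdev name and the value are examples only):

    qemu-system-x86_64 \
        -netdev user,id=net0 \
        -device e1000e,netdev=net0,x-txburst=128

As with virtio-net's x-txburst, a larger value lets one bh run drain
more descriptors at the cost of holding the I/O thread longer.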