Message ID | 20200720164535.76559-1-liq3ea@163.com
---|---
State | New
Series | [v2] e1000e: using bottom half to send packets
On 2020/7/21 上午12:45, Li Qiang wrote: > Alexander Bulekov reported a UAF bug related e1000e packets send. > > -->https://bugs.launchpad.net/qemu/+bug/1886362 > > This is because the guest trigger a e1000e packet send and set the > data's address to e1000e's MMIO address. So when the e1000e do DMA > it will write the MMIO again and trigger re-entrancy and finally > causes this UAF. > > Paolo suggested to use a bottom half whenever MMIO is doing complicate > things in here: > -->https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg03342.html > > Reference here: > 'The easiest solution is to delay processing of descriptors to a bottom > half whenever MMIO is doing something complicated. This is also better > for latency because it will free the vCPU thread more quickly and leave > the work to the I/O thread.' > > This patch fixes this UAF. > > Signed-off-by: Li Qiang <liq3ea@163.com> > --- > Change since v1: > Per Jason's review here: > -- https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg05368.html > 1. Cancel and schedule the tx bh when VM is stopped or resume > 2. Add a tx_burst for e1000e configuration to throttle the bh execution > 3. Add a tx_waiting to record whether the bh is pending or not > Don't use BQL in the tx_bh handler as when tx_bh is executed, the BQL is > acquired. > > hw/net/e1000e.c | 6 +++ > hw/net/e1000e_core.c | 106 ++++++++++++++++++++++++++++++++++--------- > hw/net/e1000e_core.h | 8 ++++ > 3 files changed, 98 insertions(+), 22 deletions(-) > > diff --git a/hw/net/e1000e.c b/hw/net/e1000e.c > index fda34518c9..24e35a78bf 100644 > --- a/hw/net/e1000e.c > +++ b/hw/net/e1000e.c > @@ -77,10 +77,14 @@ typedef struct E1000EState { > > bool disable_vnet; > > + int32_t tx_burst; > + > E1000ECore core; > > } E1000EState; > > +#define TX_BURST 256 > + > #define E1000E_MMIO_IDX 0 > #define E1000E_FLASH_IDX 1 > #define E1000E_IO_IDX 2 > @@ -263,6 +267,7 @@ static void e1000e_core_realize(E1000EState *s) > { > s->core.owner = &s->parent_obj; > s->core.owner_nic = s->nic; > + s->core.tx_burst = s->tx_burst; > } > > static void > @@ -665,6 +670,7 @@ static Property e1000e_properties[] = { > e1000e_prop_subsys_ven, uint16_t), > DEFINE_PROP_SIGNED("subsys", E1000EState, subsys, 0, > e1000e_prop_subsys, uint16_t), > + DEFINE_PROP_INT32("x-txburst", E1000EState, tx_burst, TX_BURST), > DEFINE_PROP_END_OF_LIST(), > }; > > diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c > index bcd186cac5..0a38a50cca 100644 > --- a/hw/net/e1000e_core.c > +++ b/hw/net/e1000e_core.c > @@ -909,19 +909,18 @@ e1000e_rx_ring_init(E1000ECore *core, E1000E_RxRing *rxr, int idx) > rxr->i = &i[idx]; > } > > -static void > -e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr) > +static int32_t > +e1000e_start_xmit(struct e1000e_tx *q) > { > + E1000ECore *core = q->core; > dma_addr_t base; > struct e1000_tx_desc desc; > - bool ide = false; > - const E1000E_RingInfo *txi = txr->i; > - uint32_t cause = E1000_ICS_TXQE; > + const E1000E_RingInfo *txi; > + E1000E_TxRing txr; > + int32_t num_packets = 0; > > - if (!(core->mac[TCTL] & E1000_TCTL_EN)) { > - trace_e1000e_tx_disabled(); > - return; > - } > + e1000e_tx_ring_init(core, &txr, q - &core->tx[0]); > + txi = txr.i; > > while (!e1000e_ring_empty(core, txi)) { > base = e1000e_ring_head_descr(core, txi); > @@ -931,15 +930,17 @@ e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr) > trace_e1000e_tx_descr((void *)(intptr_t)desc.buffer_addr, > desc.lower.data, desc.upper.data); > > - e1000e_process_tx_desc(core, txr->tx, &desc, 
txi->idx); > - cause |= e1000e_txdesc_writeback(core, base, &desc, &ide, txi->idx); > + e1000e_process_tx_desc(core, txr.tx, &desc, txi->idx); > + q->cause |= e1000e_txdesc_writeback(core, base, &desc, > + &q->ide, txi->idx); > > e1000e_ring_advance(core, txi, 1); > + if (++num_packets >= core->tx_burst) { > + break; > + } > } > > - if (!ide || !e1000e_intrmgr_delay_tx_causes(core, &cause)) { > - e1000e_set_interrupt_cause(core, cause); > - } > + return num_packets; > } > > static bool > @@ -2423,32 +2424,41 @@ e1000e_set_dbal(E1000ECore *core, int index, uint32_t val) > static void > e1000e_set_tctl(E1000ECore *core, int index, uint32_t val) > { > - E1000E_TxRing txr; > core->mac[index] = val; > > if (core->mac[TARC0] & E1000_TARC_ENABLE) { > - e1000e_tx_ring_init(core, &txr, 0); > - e1000e_start_xmit(core, &txr); > + if (core->tx[0].tx_waiting) { > + return; > + } > + core->tx[0].tx_waiting = 1; > + if (!core->vm_running) { > + return; > + } > + qemu_bh_schedule(core->tx[0].tx_bh); > } > > if (core->mac[TARC1] & E1000_TARC_ENABLE) { > - e1000e_tx_ring_init(core, &txr, 1); > - e1000e_start_xmit(core, &txr); > + if (core->tx[1].tx_waiting) { > + return; > + } > + core->tx[1].tx_waiting = 1; > + if (!core->vm_running) { > + return; > + } > + qemu_bh_schedule(core->tx[1].tx_bh); > } > } > > static void > e1000e_set_tdt(E1000ECore *core, int index, uint32_t val) > { > - E1000E_TxRing txr; > int qidx = e1000e_mq_queue_idx(TDT, index); > uint32_t tarc_reg = (qidx == 0) ? TARC0 : TARC1; > > core->mac[index] = val & 0xffff; > > if (core->mac[tarc_reg] & E1000_TARC_ENABLE) { > - e1000e_tx_ring_init(core, &txr, qidx); > - e1000e_start_xmit(core, &txr); > + qemu_bh_schedule(core->tx[qidx].tx_bh); > } > } > > @@ -3315,10 +3325,56 @@ e1000e_vm_state_change(void *opaque, int running, RunState state) > trace_e1000e_vm_state_running(); > e1000e_intrmgr_resume(core); > e1000e_autoneg_resume(core); > + core->vm_running = 1; > + > + for (int i = 0; i < E1000E_NUM_QUEUES; i++) { > + qemu_bh_schedule(core->tx[i].tx_bh); I guess the reason for the unconditional scheduling of bh is to make sure tx work after live migration since we don't migrate tx_waiting. If yes, better add a comment here. And do we need to clear tx_waiting here? > + } > + > } else { > trace_e1000e_vm_state_stopped(); > + > + for (int i = 0; i < E1000E_NUM_QUEUES; i++) { > + qemu_bh_cancel(core->tx[i].tx_bh); > + } > + > e1000e_autoneg_pause(core); > e1000e_intrmgr_pause(core); > + core->vm_running = 0; > + } > +} > + > + > +static void e1000e_core_tx_bh(void *opaque) > +{ > + struct e1000e_tx *q = opaque; > + E1000ECore *core = q->core; > + int32_t ret; > + > + if (!core->vm_running) { > + assert(q->tx_waiting); > + return; > + } > + > + q->tx_waiting = 0; > + > + if (!(core->mac[TCTL] & E1000_TCTL_EN)) { > + trace_e1000e_tx_disabled(); > + return; > + } > + > + q->cause = E1000_ICS_TXQE; > + q->ide = false; > + > + ret = e1000e_start_xmit(q); > + if (ret >= core->tx_burst) { > + qemu_bh_schedule(q->tx_bh); > + q->tx_waiting = 1; > + return; > + } > + > + if (!q->ide || !e1000e_intrmgr_delay_tx_causes(core, &q->cause)) { > + e1000e_set_interrupt_cause(core, q->cause); So I think this will cause some delay of the interrupt delivering. It looks to be it's better to leave the set ics in e1000e_start_xmit(). 
> } > } > > @@ -3334,12 +3390,15 @@ e1000e_core_pci_realize(E1000ECore *core, > e1000e_autoneg_timer, core); > e1000e_intrmgr_pci_realize(core); > > + core->vm_running = runstate_is_running(); > core->vmstate = > qemu_add_vm_change_state_handler(e1000e_vm_state_change, core); > > for (i = 0; i < E1000E_NUM_QUEUES; i++) { > net_tx_pkt_init(&core->tx[i].tx_pkt, core->owner, > E1000E_MAX_TX_FRAGS, core->has_vnet); > + core->tx[i].core = core; > + core->tx[i].tx_bh = qemu_bh_new(e1000e_core_tx_bh, &core->tx[i]); > } > > net_rx_pkt_init(&core->rx_pkt, core->has_vnet); > @@ -3367,6 +3426,9 @@ e1000e_core_pci_uninit(E1000ECore *core) > for (i = 0; i < E1000E_NUM_QUEUES; i++) { > net_tx_pkt_reset(core->tx[i].tx_pkt); > net_tx_pkt_uninit(core->tx[i].tx_pkt); > + qemu_bh_cancel(core->tx[i].tx_bh); > + qemu_bh_delete(core->tx[i].tx_bh); > + core->tx[i].tx_bh = NULL; > } > > net_rx_pkt_uninit(core->rx_pkt); > diff --git a/hw/net/e1000e_core.h b/hw/net/e1000e_core.h > index aee32f7e48..0c16dce3a6 100644 > --- a/hw/net/e1000e_core.h > +++ b/hw/net/e1000e_core.h > @@ -77,10 +77,18 @@ struct E1000Core { > unsigned char sum_needed; > bool cptse; > struct NetTxPkt *tx_pkt; > + QEMUBH *tx_bh; > + uint32_t tx_waiting; > + uint32_t cause; > + bool ide; > + E1000ECore *core; > } tx[E1000E_NUM_QUEUES]; > > struct NetRxPkt *rx_pkt; > > + int32_t tx_burst; > + > + bool vm_running; > bool has_vnet; > int max_queue_num; > Do we need to cancel the bh and reset tx_waiting in e1000e_core_reset()? Thanks
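For reference, the comment being asked for above the unconditional scheduling in e1000e_vm_state_change() could look like this (a sketch with assumed wording, not the actual follow-up patch):

        for (int i = 0; i < E1000E_NUM_QUEUES; i++) {
            /*
             * tx_waiting is not migrated, so reschedule the tx bottom
             * halves unconditionally here; a transmit that was still
             * pending when the VM was stopped or migrated is resumed
             * this way.
             */
            qemu_bh_schedule(core->tx[i].tx_bh);
        }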
Jason Wang <jasowang@redhat.com> 于2020年7月21日周二 上午10:03写道: > > > On 2020/7/21 上午12:45, Li Qiang wrote: > > Alexander Bulekov reported a UAF bug related e1000e packets send. > > > > -->https://bugs.launchpad.net/qemu/+bug/1886362 > > > > This is because the guest trigger a e1000e packet send and set the > > data's address to e1000e's MMIO address. So when the e1000e do DMA > > it will write the MMIO again and trigger re-entrancy and finally > > causes this UAF. > > > > Paolo suggested to use a bottom half whenever MMIO is doing complicate > > things in here: > > -->https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg03342.html > > > > Reference here: > > 'The easiest solution is to delay processing of descriptors to a bottom > > half whenever MMIO is doing something complicated. This is also better > > for latency because it will free the vCPU thread more quickly and leave > > the work to the I/O thread.' > > > > This patch fixes this UAF. > > > > Signed-off-by: Li Qiang <liq3ea@163.com> > > --- > > Change since v1: > > Per Jason's review here: > > -- https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg05368.html > > 1. Cancel and schedule the tx bh when VM is stopped or resume > > 2. Add a tx_burst for e1000e configuration to throttle the bh execution > > 3. Add a tx_waiting to record whether the bh is pending or not > > Don't use BQL in the tx_bh handler as when tx_bh is executed, the BQL is > > acquired. > > > > hw/net/e1000e.c | 6 +++ > > hw/net/e1000e_core.c | 106 ++++++++++++++++++++++++++++++++++--------- > > hw/net/e1000e_core.h | 8 ++++ > > 3 files changed, 98 insertions(+), 22 deletions(-) > > > > diff --git a/hw/net/e1000e.c b/hw/net/e1000e.c > > index fda34518c9..24e35a78bf 100644 > > --- a/hw/net/e1000e.c > > +++ b/hw/net/e1000e.c > > @@ -77,10 +77,14 @@ typedef struct E1000EState { > > > > bool disable_vnet; > > > > + int32_t tx_burst; > > + > > E1000ECore core; > > > > } E1000EState; > > > > +#define TX_BURST 256 > > + > > #define E1000E_MMIO_IDX 0 > > #define E1000E_FLASH_IDX 1 > > #define E1000E_IO_IDX 2 > > @@ -263,6 +267,7 @@ static void e1000e_core_realize(E1000EState *s) > > { > > s->core.owner = &s->parent_obj; > > s->core.owner_nic = s->nic; > > + s->core.tx_burst = s->tx_burst; > > } > > > > static void > > @@ -665,6 +670,7 @@ static Property e1000e_properties[] = { > > e1000e_prop_subsys_ven, uint16_t), > > DEFINE_PROP_SIGNED("subsys", E1000EState, subsys, 0, > > e1000e_prop_subsys, uint16_t), > > + DEFINE_PROP_INT32("x-txburst", E1000EState, tx_burst, TX_BURST), > > DEFINE_PROP_END_OF_LIST(), > > }; > > > > diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c > > index bcd186cac5..0a38a50cca 100644 > > --- a/hw/net/e1000e_core.c > > +++ b/hw/net/e1000e_core.c > > @@ -909,19 +909,18 @@ e1000e_rx_ring_init(E1000ECore *core, E1000E_RxRing *rxr, int idx) > > rxr->i = &i[idx]; > > } > > > > -static void > > -e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr) > > +static int32_t > > +e1000e_start_xmit(struct e1000e_tx *q) > > { > > + E1000ECore *core = q->core; > > dma_addr_t base; > > struct e1000_tx_desc desc; > > - bool ide = false; > > - const E1000E_RingInfo *txi = txr->i; > > - uint32_t cause = E1000_ICS_TXQE; > > + const E1000E_RingInfo *txi; > > + E1000E_TxRing txr; > > + int32_t num_packets = 0; > > > > - if (!(core->mac[TCTL] & E1000_TCTL_EN)) { > > - trace_e1000e_tx_disabled(); > > - return; > > - } > > + e1000e_tx_ring_init(core, &txr, q - &core->tx[0]); > > + txi = txr.i; > > > > while (!e1000e_ring_empty(core, txi)) { > > base 
= e1000e_ring_head_descr(core, txi); > > @@ -931,15 +930,17 @@ e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr) > > trace_e1000e_tx_descr((void *)(intptr_t)desc.buffer_addr, > > desc.lower.data, desc.upper.data); > > > > - e1000e_process_tx_desc(core, txr->tx, &desc, txi->idx); > > - cause |= e1000e_txdesc_writeback(core, base, &desc, &ide, txi->idx); > > + e1000e_process_tx_desc(core, txr.tx, &desc, txi->idx); > > + q->cause |= e1000e_txdesc_writeback(core, base, &desc, > > + &q->ide, txi->idx); > > > > e1000e_ring_advance(core, txi, 1); > > + if (++num_packets >= core->tx_burst) { > > + break; > > + } > > } > > > > - if (!ide || !e1000e_intrmgr_delay_tx_causes(core, &cause)) { > > - e1000e_set_interrupt_cause(core, cause); > > - } > > + return num_packets; > > } > > > > static bool > > @@ -2423,32 +2424,41 @@ e1000e_set_dbal(E1000ECore *core, int index, uint32_t val) > > static void > > e1000e_set_tctl(E1000ECore *core, int index, uint32_t val) > > { > > - E1000E_TxRing txr; > > core->mac[index] = val; > > > > if (core->mac[TARC0] & E1000_TARC_ENABLE) { > > - e1000e_tx_ring_init(core, &txr, 0); > > - e1000e_start_xmit(core, &txr); > > + if (core->tx[0].tx_waiting) { > > + return; > > + } > > + core->tx[0].tx_waiting = 1; > > + if (!core->vm_running) { > > + return; > > + } > > + qemu_bh_schedule(core->tx[0].tx_bh); > > } > > > > if (core->mac[TARC1] & E1000_TARC_ENABLE) { > > - e1000e_tx_ring_init(core, &txr, 1); > > - e1000e_start_xmit(core, &txr); > > + if (core->tx[1].tx_waiting) { > > + return; > > + } > > + core->tx[1].tx_waiting = 1; > > + if (!core->vm_running) { > > + return; > > + } > > + qemu_bh_schedule(core->tx[1].tx_bh); > > } > > } > > > > static void > > e1000e_set_tdt(E1000ECore *core, int index, uint32_t val) > > { > > - E1000E_TxRing txr; > > int qidx = e1000e_mq_queue_idx(TDT, index); > > uint32_t tarc_reg = (qidx == 0) ? TARC0 : TARC1; > > > > core->mac[index] = val & 0xffff; > > > > if (core->mac[tarc_reg] & E1000_TARC_ENABLE) { > > - e1000e_tx_ring_init(core, &txr, qidx); > > - e1000e_start_xmit(core, &txr); > > + qemu_bh_schedule(core->tx[qidx].tx_bh); > > } > > } > > > > @@ -3315,10 +3325,56 @@ e1000e_vm_state_change(void *opaque, int running, RunState state) > > trace_e1000e_vm_state_running(); > > e1000e_intrmgr_resume(core); > > e1000e_autoneg_resume(core); > > + core->vm_running = 1; > > + > > + for (int i = 0; i < E1000E_NUM_QUEUES; i++) { > > + qemu_bh_schedule(core->tx[i].tx_bh); > > > I guess the reason for the unconditional scheduling of bh is to make > sure tx work after live migration since we don't migrate tx_waiting. > > If yes, better add a comment here. Ok will do in next revision. And do we need to clear tx_waiting here? I think there is no need as the tx_bh handler will do this. 
> > > > + } > > + > > } else { > > trace_e1000e_vm_state_stopped(); > > + > > + for (int i = 0; i < E1000E_NUM_QUEUES; i++) { > > + qemu_bh_cancel(core->tx[i].tx_bh); > > + } > > + > > e1000e_autoneg_pause(core); > > e1000e_intrmgr_pause(core); > > + core->vm_running = 0; > > + } > > +} > > + > > + > > +static void e1000e_core_tx_bh(void *opaque) > > +{ > > + struct e1000e_tx *q = opaque; > > + E1000ECore *core = q->core; > > + int32_t ret; > > + > > + if (!core->vm_running) { > > + assert(q->tx_waiting); > > + return; > > + } > > + > > + q->tx_waiting = 0; > > + > > + if (!(core->mac[TCTL] & E1000_TCTL_EN)) { > > + trace_e1000e_tx_disabled(); > > + return; > > + } > > + > > + q->cause = E1000_ICS_TXQE; > > + q->ide = false; > > + > > + ret = e1000e_start_xmit(q); > > + if (ret >= core->tx_burst) { > > + qemu_bh_schedule(q->tx_bh); > > + q->tx_waiting = 1; > > + return; > > + } > > + > > + if (!q->ide || !e1000e_intrmgr_delay_tx_causes(core, &q->cause)) { > > + e1000e_set_interrupt_cause(core, q->cause); > > > So I think this will cause some delay of the interrupt delivering. Exactly. if tx burst occurs, the interrupt won't be triggered in the first tx_bh handler. As we give the vcpu more time and this may acceptable? It > looks to be it's better to leave the set ics in e1000e_start_xmit(). I think let e1000e_start_xmit just send packets is more clean. Leave setting ics in it will not improve the performance as far as I can see. What's your idea here to leave it in e1000e_start_xmit? > > > > } > > } > > > > @@ -3334,12 +3390,15 @@ e1000e_core_pci_realize(E1000ECore *core, > > e1000e_autoneg_timer, core); > > e1000e_intrmgr_pci_realize(core); > > > > + core->vm_running = runstate_is_running(); > > core->vmstate = > > qemu_add_vm_change_state_handler(e1000e_vm_state_change, core); > > > > for (i = 0; i < E1000E_NUM_QUEUES; i++) { > > net_tx_pkt_init(&core->tx[i].tx_pkt, core->owner, > > E1000E_MAX_TX_FRAGS, core->has_vnet); > > + core->tx[i].core = core; > > + core->tx[i].tx_bh = qemu_bh_new(e1000e_core_tx_bh, &core->tx[i]); > > } > > > > net_rx_pkt_init(&core->rx_pkt, core->has_vnet); > > @@ -3367,6 +3426,9 @@ e1000e_core_pci_uninit(E1000ECore *core) > > for (i = 0; i < E1000E_NUM_QUEUES; i++) { > > net_tx_pkt_reset(core->tx[i].tx_pkt); > > net_tx_pkt_uninit(core->tx[i].tx_pkt); > > + qemu_bh_cancel(core->tx[i].tx_bh); > > + qemu_bh_delete(core->tx[i].tx_bh); > > + core->tx[i].tx_bh = NULL; > > } > > > > net_rx_pkt_uninit(core->rx_pkt); > > diff --git a/hw/net/e1000e_core.h b/hw/net/e1000e_core.h > > index aee32f7e48..0c16dce3a6 100644 > > --- a/hw/net/e1000e_core.h > > +++ b/hw/net/e1000e_core.h > > @@ -77,10 +77,18 @@ struct E1000Core { > > unsigned char sum_needed; > > bool cptse; > > struct NetTxPkt *tx_pkt; > > + QEMUBH *tx_bh; > > + uint32_t tx_waiting; > > + uint32_t cause; > > + bool ide; > > + E1000ECore *core; > > } tx[E1000E_NUM_QUEUES]; > > > > struct NetRxPkt *rx_pkt; > > > > + int32_t tx_burst; > > + > > + bool vm_running; > > bool has_vnet; > > int max_queue_num; > > > > > Do we need to cancel the bh and reset tx_waiting in e1000e_core_reset()? I think we need. But why the virtio net doesn't do this? Maybe it also lack of this? Thanks, Li Qiang > > Thanks >
On 2020/7/21 下午12:33, Li Qiang wrote: > Jason Wang <jasowang@redhat.com> 于2020年7月21日周二 上午10:03写道: >> >> On 2020/7/21 上午12:45, Li Qiang wrote: >>> Alexander Bulekov reported a UAF bug related e1000e packets send. >>> >>> -->https://bugs.launchpad.net/qemu/+bug/1886362 >>> >>> This is because the guest trigger a e1000e packet send and set the >>> data's address to e1000e's MMIO address. So when the e1000e do DMA >>> it will write the MMIO again and trigger re-entrancy and finally >>> causes this UAF. >>> >>> Paolo suggested to use a bottom half whenever MMIO is doing complicate >>> things in here: >>> -->https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg03342.html >>> >>> Reference here: >>> 'The easiest solution is to delay processing of descriptors to a bottom >>> half whenever MMIO is doing something complicated. This is also better >>> for latency because it will free the vCPU thread more quickly and leave >>> the work to the I/O thread.' >>> >>> This patch fixes this UAF. >>> >>> Signed-off-by: Li Qiang <liq3ea@163.com> >>> --- >>> Change since v1: >>> Per Jason's review here: >>> -- https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg05368.html >>> 1. Cancel and schedule the tx bh when VM is stopped or resume >>> 2. Add a tx_burst for e1000e configuration to throttle the bh execution >>> 3. Add a tx_waiting to record whether the bh is pending or not >>> Don't use BQL in the tx_bh handler as when tx_bh is executed, the BQL is >>> acquired. >>> >>> hw/net/e1000e.c | 6 +++ >>> hw/net/e1000e_core.c | 106 ++++++++++++++++++++++++++++++++++--------- >>> hw/net/e1000e_core.h | 8 ++++ >>> 3 files changed, 98 insertions(+), 22 deletions(-) >>> >>> diff --git a/hw/net/e1000e.c b/hw/net/e1000e.c >>> index fda34518c9..24e35a78bf 100644 >>> --- a/hw/net/e1000e.c >>> +++ b/hw/net/e1000e.c >>> @@ -77,10 +77,14 @@ typedef struct E1000EState { >>> >>> bool disable_vnet; >>> >>> + int32_t tx_burst; >>> + >>> E1000ECore core; >>> >>> } E1000EState; >>> >>> +#define TX_BURST 256 >>> + >>> #define E1000E_MMIO_IDX 0 >>> #define E1000E_FLASH_IDX 1 >>> #define E1000E_IO_IDX 2 >>> @@ -263,6 +267,7 @@ static void e1000e_core_realize(E1000EState *s) >>> { >>> s->core.owner = &s->parent_obj; >>> s->core.owner_nic = s->nic; >>> + s->core.tx_burst = s->tx_burst; >>> } >>> >>> static void >>> @@ -665,6 +670,7 @@ static Property e1000e_properties[] = { >>> e1000e_prop_subsys_ven, uint16_t), >>> DEFINE_PROP_SIGNED("subsys", E1000EState, subsys, 0, >>> e1000e_prop_subsys, uint16_t), >>> + DEFINE_PROP_INT32("x-txburst", E1000EState, tx_burst, TX_BURST), >>> DEFINE_PROP_END_OF_LIST(), >>> }; >>> >>> diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c >>> index bcd186cac5..0a38a50cca 100644 >>> --- a/hw/net/e1000e_core.c >>> +++ b/hw/net/e1000e_core.c >>> @@ -909,19 +909,18 @@ e1000e_rx_ring_init(E1000ECore *core, E1000E_RxRing *rxr, int idx) >>> rxr->i = &i[idx]; >>> } >>> >>> -static void >>> -e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr) >>> +static int32_t >>> +e1000e_start_xmit(struct e1000e_tx *q) >>> { >>> + E1000ECore *core = q->core; >>> dma_addr_t base; >>> struct e1000_tx_desc desc; >>> - bool ide = false; >>> - const E1000E_RingInfo *txi = txr->i; >>> - uint32_t cause = E1000_ICS_TXQE; >>> + const E1000E_RingInfo *txi; >>> + E1000E_TxRing txr; >>> + int32_t num_packets = 0; >>> >>> - if (!(core->mac[TCTL] & E1000_TCTL_EN)) { >>> - trace_e1000e_tx_disabled(); >>> - return; >>> - } >>> + e1000e_tx_ring_init(core, &txr, q - &core->tx[0]); >>> + txi = txr.i; >>> >>> while 
(!e1000e_ring_empty(core, txi)) { >>> base = e1000e_ring_head_descr(core, txi); >>> @@ -931,15 +930,17 @@ e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr) >>> trace_e1000e_tx_descr((void *)(intptr_t)desc.buffer_addr, >>> desc.lower.data, desc.upper.data); >>> >>> - e1000e_process_tx_desc(core, txr->tx, &desc, txi->idx); >>> - cause |= e1000e_txdesc_writeback(core, base, &desc, &ide, txi->idx); >>> + e1000e_process_tx_desc(core, txr.tx, &desc, txi->idx); >>> + q->cause |= e1000e_txdesc_writeback(core, base, &desc, >>> + &q->ide, txi->idx); >>> >>> e1000e_ring_advance(core, txi, 1); >>> + if (++num_packets >= core->tx_burst) { >>> + break; >>> + } >>> } >>> >>> - if (!ide || !e1000e_intrmgr_delay_tx_causes(core, &cause)) { >>> - e1000e_set_interrupt_cause(core, cause); >>> - } >>> + return num_packets; >>> } >>> >>> static bool >>> @@ -2423,32 +2424,41 @@ e1000e_set_dbal(E1000ECore *core, int index, uint32_t val) >>> static void >>> e1000e_set_tctl(E1000ECore *core, int index, uint32_t val) >>> { >>> - E1000E_TxRing txr; >>> core->mac[index] = val; >>> >>> if (core->mac[TARC0] & E1000_TARC_ENABLE) { >>> - e1000e_tx_ring_init(core, &txr, 0); >>> - e1000e_start_xmit(core, &txr); >>> + if (core->tx[0].tx_waiting) { >>> + return; >>> + } >>> + core->tx[0].tx_waiting = 1; >>> + if (!core->vm_running) { >>> + return; >>> + } >>> + qemu_bh_schedule(core->tx[0].tx_bh); >>> } >>> >>> if (core->mac[TARC1] & E1000_TARC_ENABLE) { >>> - e1000e_tx_ring_init(core, &txr, 1); >>> - e1000e_start_xmit(core, &txr); >>> + if (core->tx[1].tx_waiting) { >>> + return; >>> + } >>> + core->tx[1].tx_waiting = 1; >>> + if (!core->vm_running) { >>> + return; >>> + } >>> + qemu_bh_schedule(core->tx[1].tx_bh); >>> } >>> } >>> >>> static void >>> e1000e_set_tdt(E1000ECore *core, int index, uint32_t val) >>> { >>> - E1000E_TxRing txr; >>> int qidx = e1000e_mq_queue_idx(TDT, index); >>> uint32_t tarc_reg = (qidx == 0) ? TARC0 : TARC1; >>> >>> core->mac[index] = val & 0xffff; >>> >>> if (core->mac[tarc_reg] & E1000_TARC_ENABLE) { >>> - e1000e_tx_ring_init(core, &txr, qidx); >>> - e1000e_start_xmit(core, &txr); >>> + qemu_bh_schedule(core->tx[qidx].tx_bh); >>> } >>> } >>> >>> @@ -3315,10 +3325,56 @@ e1000e_vm_state_change(void *opaque, int running, RunState state) >>> trace_e1000e_vm_state_running(); >>> e1000e_intrmgr_resume(core); >>> e1000e_autoneg_resume(core); >>> + core->vm_running = 1; >>> + >>> + for (int i = 0; i < E1000E_NUM_QUEUES; i++) { >>> + qemu_bh_schedule(core->tx[i].tx_bh); >> >> I guess the reason for the unconditional scheduling of bh is to make >> sure tx work after live migration since we don't migrate tx_waiting. >> >> If yes, better add a comment here. > Ok will do in next revision. > > And do we need to clear tx_waiting here? > > I think there is no need as the tx_bh handler will do this. Yes. 
> >> >>> + } >>> + >>> } else { >>> trace_e1000e_vm_state_stopped(); >>> + >>> + for (int i = 0; i < E1000E_NUM_QUEUES; i++) { >>> + qemu_bh_cancel(core->tx[i].tx_bh); >>> + } >>> + >>> e1000e_autoneg_pause(core); >>> e1000e_intrmgr_pause(core); >>> + core->vm_running = 0; >>> + } >>> +} >>> + >>> + >>> +static void e1000e_core_tx_bh(void *opaque) >>> +{ >>> + struct e1000e_tx *q = opaque; >>> + E1000ECore *core = q->core; >>> + int32_t ret; >>> + >>> + if (!core->vm_running) { >>> + assert(q->tx_waiting); >>> + return; >>> + } >>> + >>> + q->tx_waiting = 0; >>> + >>> + if (!(core->mac[TCTL] & E1000_TCTL_EN)) { >>> + trace_e1000e_tx_disabled(); >>> + return; >>> + } >>> + >>> + q->cause = E1000_ICS_TXQE; >>> + q->ide = false; >>> + >>> + ret = e1000e_start_xmit(q); >>> + if (ret >= core->tx_burst) { >>> + qemu_bh_schedule(q->tx_bh); >>> + q->tx_waiting = 1; >>> + return; >>> + } >>> + >>> + if (!q->ide || !e1000e_intrmgr_delay_tx_causes(core, &q->cause)) { >>> + e1000e_set_interrupt_cause(core, q->cause); >> >> So I think this will cause some delay of the interrupt delivering. > Exactly. if tx burst occurs, the interrupt won't be triggered in the > first tx_bh handler. > As we give the vcpu more time and this may acceptable? I think not, e1000e has its own interrupt delay mechanism, let's keep it work as much as possible. > > It >> looks to be it's better to leave the set ics in e1000e_start_xmit(). > I think let e1000e_start_xmit just send packets is more clean. > Leave setting ics in it will not improve the performance as far as I can see. > What's your idea here to leave it in e1000e_start_xmit? I just worry about the delay of the interrupt if you do it in e1000e_core_tx_bh() and just let e1000e_start_xmit() return the number of packets transmitted and keep most of its code untouched. > >> >>> } >>> } >>> >>> @@ -3334,12 +3390,15 @@ e1000e_core_pci_realize(E1000ECore *core, >>> e1000e_autoneg_timer, core); >>> e1000e_intrmgr_pci_realize(core); >>> >>> + core->vm_running = runstate_is_running(); >>> core->vmstate = >>> qemu_add_vm_change_state_handler(e1000e_vm_state_change, core); >>> >>> for (i = 0; i < E1000E_NUM_QUEUES; i++) { >>> net_tx_pkt_init(&core->tx[i].tx_pkt, core->owner, >>> E1000E_MAX_TX_FRAGS, core->has_vnet); >>> + core->tx[i].core = core; >>> + core->tx[i].tx_bh = qemu_bh_new(e1000e_core_tx_bh, &core->tx[i]); >>> } >>> >>> net_rx_pkt_init(&core->rx_pkt, core->has_vnet); >>> @@ -3367,6 +3426,9 @@ e1000e_core_pci_uninit(E1000ECore *core) >>> for (i = 0; i < E1000E_NUM_QUEUES; i++) { >>> net_tx_pkt_reset(core->tx[i].tx_pkt); >>> net_tx_pkt_uninit(core->tx[i].tx_pkt); >>> + qemu_bh_cancel(core->tx[i].tx_bh); >>> + qemu_bh_delete(core->tx[i].tx_bh); >>> + core->tx[i].tx_bh = NULL; >>> } >>> >>> net_rx_pkt_uninit(core->rx_pkt); >>> diff --git a/hw/net/e1000e_core.h b/hw/net/e1000e_core.h >>> index aee32f7e48..0c16dce3a6 100644 >>> --- a/hw/net/e1000e_core.h >>> +++ b/hw/net/e1000e_core.h >>> @@ -77,10 +77,18 @@ struct E1000Core { >>> unsigned char sum_needed; >>> bool cptse; >>> struct NetTxPkt *tx_pkt; >>> + QEMUBH *tx_bh; >>> + uint32_t tx_waiting; >>> + uint32_t cause; >>> + bool ide; >>> + E1000ECore *core; >>> } tx[E1000E_NUM_QUEUES]; >>> >>> struct NetRxPkt *rx_pkt; >>> >>> + int32_t tx_burst; >>> + >>> + bool vm_running; >>> bool has_vnet; >>> int max_queue_num; >>> >> >> Do we need to cancel the bh and reset tx_waiting in e1000e_core_reset()? > I think we need. But why the virtio net doesn't do this? Maybe it also > lack of this? 
Ok, so I think the current code is probably OK. virtio-net check DRIVER_OK and the code here check TCTL_EN. Thanks > > Thanks, > Li Qiang > >> Thanks >>
Jason Wang <jasowang@redhat.com> 于2020年7月21日周二 下午1:30写道: > > > On 2020/7/21 下午12:33, Li Qiang wrote: > > Jason Wang <jasowang@redhat.com> 于2020年7月21日周二 上午10:03写道: > >> > >> On 2020/7/21 上午12:45, Li Qiang wrote: > >>> Alexander Bulekov reported a UAF bug related e1000e packets send. > >>> > >>> -->https://bugs.launchpad.net/qemu/+bug/1886362 > >>> > >>> This is because the guest trigger a e1000e packet send and set the > >>> data's address to e1000e's MMIO address. So when the e1000e do DMA > >>> it will write the MMIO again and trigger re-entrancy and finally > >>> causes this UAF. > >>> > >>> Paolo suggested to use a bottom half whenever MMIO is doing complicate > >>> things in here: > >>> -->https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg03342.html > >>> > >>> Reference here: > >>> 'The easiest solution is to delay processing of descriptors to a bottom > >>> half whenever MMIO is doing something complicated. This is also better > >>> for latency because it will free the vCPU thread more quickly and leave > >>> the work to the I/O thread.' > >>> > >>> This patch fixes this UAF. > >>> > >>> Signed-off-by: Li Qiang <liq3ea@163.com> > >>> --- > >>> Change since v1: > >>> Per Jason's review here: > >>> -- https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg05368.html > >>> 1. Cancel and schedule the tx bh when VM is stopped or resume > >>> 2. Add a tx_burst for e1000e configuration to throttle the bh execution > >>> 3. Add a tx_waiting to record whether the bh is pending or not > >>> Don't use BQL in the tx_bh handler as when tx_bh is executed, the BQL is > >>> acquired. > >>> > >>> hw/net/e1000e.c | 6 +++ > >>> hw/net/e1000e_core.c | 106 ++++++++++++++++++++++++++++++++++--------- > >>> hw/net/e1000e_core.h | 8 ++++ > >>> 3 files changed, 98 insertions(+), 22 deletions(-) > >>> > >>> diff --git a/hw/net/e1000e.c b/hw/net/e1000e.c > >>> index fda34518c9..24e35a78bf 100644 > >>> --- a/hw/net/e1000e.c > >>> +++ b/hw/net/e1000e.c > >>> @@ -77,10 +77,14 @@ typedef struct E1000EState { > >>> > >>> bool disable_vnet; > >>> > >>> + int32_t tx_burst; > >>> + > >>> E1000ECore core; > >>> > >>> } E1000EState; > >>> > >>> +#define TX_BURST 256 > >>> + > >>> #define E1000E_MMIO_IDX 0 > >>> #define E1000E_FLASH_IDX 1 > >>> #define E1000E_IO_IDX 2 > >>> @@ -263,6 +267,7 @@ static void e1000e_core_realize(E1000EState *s) > >>> { > >>> s->core.owner = &s->parent_obj; > >>> s->core.owner_nic = s->nic; > >>> + s->core.tx_burst = s->tx_burst; > >>> } > >>> > >>> static void > >>> @@ -665,6 +670,7 @@ static Property e1000e_properties[] = { > >>> e1000e_prop_subsys_ven, uint16_t), > >>> DEFINE_PROP_SIGNED("subsys", E1000EState, subsys, 0, > >>> e1000e_prop_subsys, uint16_t), > >>> + DEFINE_PROP_INT32("x-txburst", E1000EState, tx_burst, TX_BURST), > >>> DEFINE_PROP_END_OF_LIST(), > >>> }; > >>> > >>> diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c > >>> index bcd186cac5..0a38a50cca 100644 > >>> --- a/hw/net/e1000e_core.c > >>> +++ b/hw/net/e1000e_core.c > >>> @@ -909,19 +909,18 @@ e1000e_rx_ring_init(E1000ECore *core, E1000E_RxRing *rxr, int idx) > >>> rxr->i = &i[idx]; > >>> } > >>> > >>> -static void > >>> -e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr) > >>> +static int32_t > >>> +e1000e_start_xmit(struct e1000e_tx *q) > >>> { > >>> + E1000ECore *core = q->core; > >>> dma_addr_t base; > >>> struct e1000_tx_desc desc; > >>> - bool ide = false; > >>> - const E1000E_RingInfo *txi = txr->i; > >>> - uint32_t cause = E1000_ICS_TXQE; > >>> + const E1000E_RingInfo *txi; > 
>>> + E1000E_TxRing txr; > >>> + int32_t num_packets = 0; > >>> > >>> - if (!(core->mac[TCTL] & E1000_TCTL_EN)) { > >>> - trace_e1000e_tx_disabled(); > >>> - return; > >>> - } > >>> + e1000e_tx_ring_init(core, &txr, q - &core->tx[0]); > >>> + txi = txr.i; > >>> > >>> while (!e1000e_ring_empty(core, txi)) { > >>> base = e1000e_ring_head_descr(core, txi); > >>> @@ -931,15 +930,17 @@ e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr) > >>> trace_e1000e_tx_descr((void *)(intptr_t)desc.buffer_addr, > >>> desc.lower.data, desc.upper.data); > >>> > >>> - e1000e_process_tx_desc(core, txr->tx, &desc, txi->idx); > >>> - cause |= e1000e_txdesc_writeback(core, base, &desc, &ide, txi->idx); > >>> + e1000e_process_tx_desc(core, txr.tx, &desc, txi->idx); > >>> + q->cause |= e1000e_txdesc_writeback(core, base, &desc, > >>> + &q->ide, txi->idx); > >>> > >>> e1000e_ring_advance(core, txi, 1); > >>> + if (++num_packets >= core->tx_burst) { > >>> + break; > >>> + } > >>> } > >>> > >>> - if (!ide || !e1000e_intrmgr_delay_tx_causes(core, &cause)) { > >>> - e1000e_set_interrupt_cause(core, cause); > >>> - } > >>> + return num_packets; > >>> } > >>> > >>> static bool > >>> @@ -2423,32 +2424,41 @@ e1000e_set_dbal(E1000ECore *core, int index, uint32_t val) > >>> static void > >>> e1000e_set_tctl(E1000ECore *core, int index, uint32_t val) > >>> { > >>> - E1000E_TxRing txr; > >>> core->mac[index] = val; > >>> > >>> if (core->mac[TARC0] & E1000_TARC_ENABLE) { > >>> - e1000e_tx_ring_init(core, &txr, 0); > >>> - e1000e_start_xmit(core, &txr); > >>> + if (core->tx[0].tx_waiting) { > >>> + return; > >>> + } > >>> + core->tx[0].tx_waiting = 1; > >>> + if (!core->vm_running) { > >>> + return; > >>> + } > >>> + qemu_bh_schedule(core->tx[0].tx_bh); > >>> } > >>> > >>> if (core->mac[TARC1] & E1000_TARC_ENABLE) { > >>> - e1000e_tx_ring_init(core, &txr, 1); > >>> - e1000e_start_xmit(core, &txr); > >>> + if (core->tx[1].tx_waiting) { > >>> + return; > >>> + } > >>> + core->tx[1].tx_waiting = 1; > >>> + if (!core->vm_running) { > >>> + return; > >>> + } > >>> + qemu_bh_schedule(core->tx[1].tx_bh); > >>> } > >>> } > >>> > >>> static void > >>> e1000e_set_tdt(E1000ECore *core, int index, uint32_t val) > >>> { > >>> - E1000E_TxRing txr; > >>> int qidx = e1000e_mq_queue_idx(TDT, index); > >>> uint32_t tarc_reg = (qidx == 0) ? TARC0 : TARC1; > >>> > >>> core->mac[index] = val & 0xffff; > >>> > >>> if (core->mac[tarc_reg] & E1000_TARC_ENABLE) { > >>> - e1000e_tx_ring_init(core, &txr, qidx); > >>> - e1000e_start_xmit(core, &txr); > >>> + qemu_bh_schedule(core->tx[qidx].tx_bh); > >>> } > >>> } > >>> > >>> @@ -3315,10 +3325,56 @@ e1000e_vm_state_change(void *opaque, int running, RunState state) > >>> trace_e1000e_vm_state_running(); > >>> e1000e_intrmgr_resume(core); > >>> e1000e_autoneg_resume(core); > >>> + core->vm_running = 1; > >>> + > >>> + for (int i = 0; i < E1000E_NUM_QUEUES; i++) { > >>> + qemu_bh_schedule(core->tx[i].tx_bh); > >> > >> I guess the reason for the unconditional scheduling of bh is to make > >> sure tx work after live migration since we don't migrate tx_waiting. > >> > >> If yes, better add a comment here. > > Ok will do in next revision. > > > > And do we need to clear tx_waiting here? > > > > I think there is no need as the tx_bh handler will do this. > > > Yes. 
> > > > > >> > >>> + } > >>> + > >>> } else { > >>> trace_e1000e_vm_state_stopped(); > >>> + > >>> + for (int i = 0; i < E1000E_NUM_QUEUES; i++) { > >>> + qemu_bh_cancel(core->tx[i].tx_bh); > >>> + } > >>> + > >>> e1000e_autoneg_pause(core); > >>> e1000e_intrmgr_pause(core); > >>> + core->vm_running = 0; > >>> + } > >>> +} > >>> + > >>> + > >>> +static void e1000e_core_tx_bh(void *opaque) > >>> +{ > >>> + struct e1000e_tx *q = opaque; > >>> + E1000ECore *core = q->core; > >>> + int32_t ret; > >>> + > >>> + if (!core->vm_running) { > >>> + assert(q->tx_waiting); > >>> + return; > >>> + } > >>> + > >>> + q->tx_waiting = 0; > >>> + > >>> + if (!(core->mac[TCTL] & E1000_TCTL_EN)) { > >>> + trace_e1000e_tx_disabled(); > >>> + return; > >>> + } > >>> + > >>> + q->cause = E1000_ICS_TXQE; > >>> + q->ide = false; > >>> + > >>> + ret = e1000e_start_xmit(q); > >>> + if (ret >= core->tx_burst) { > >>> + qemu_bh_schedule(q->tx_bh); > >>> + q->tx_waiting = 1; > >>> + return; > >>> + } > >>> + > >>> + if (!q->ide || !e1000e_intrmgr_delay_tx_causes(core, &q->cause)) { > >>> + e1000e_set_interrupt_cause(core, q->cause); > >> > >> So I think this will cause some delay of the interrupt delivering. > > Exactly. if tx burst occurs, the interrupt won't be triggered in the > > first tx_bh handler. > > As we give the vcpu more time and this may acceptable? > > > I think not, e1000e has its own interrupt delay mechanism, let's keep it > work as much as possible. > > > > > > It > >> looks to be it's better to leave the set ics in e1000e_start_xmit(). > > I think let e1000e_start_xmit just send packets is more clean. > > Leave setting ics in it will not improve the performance as far as I can see. > > What's your idea here to leave it in e1000e_start_xmit? > > > I just worry about the delay of the interrupt if you do it in > e1000e_core_tx_bh() and just let e1000e_start_xmit() return the number > of packets transmitted and keep most of its code untouched. Sorry here let's make sure I got your meaning. 'The set ics' is meaning 'set interrupt cause' , in code: + if (!q->ide || !e1000e_intrmgr_delay_tx_causes(core, &q->cause)) { + e1000e_set_interrupt_cause(core, q->cause); Do you mean we should leave it in the 'e1000e_start_xmit'? My understanding is this. if it is this, we should also add following code in it: + if (ret >= core->tx_burst) { + qemu_bh_schedule(q->tx_bh); + q->tx_waiting = 1; + return; + } Then the tx_bh handler just call 'e1000e_start_xmit'. Right? 
> > > > > >> > >>> } > >>> } > >>> > >>> @@ -3334,12 +3390,15 @@ e1000e_core_pci_realize(E1000ECore *core, > >>> e1000e_autoneg_timer, core); > >>> e1000e_intrmgr_pci_realize(core); > >>> > >>> + core->vm_running = runstate_is_running(); > >>> core->vmstate = > >>> qemu_add_vm_change_state_handler(e1000e_vm_state_change, core); > >>> > >>> for (i = 0; i < E1000E_NUM_QUEUES; i++) { > >>> net_tx_pkt_init(&core->tx[i].tx_pkt, core->owner, > >>> E1000E_MAX_TX_FRAGS, core->has_vnet); > >>> + core->tx[i].core = core; > >>> + core->tx[i].tx_bh = qemu_bh_new(e1000e_core_tx_bh, &core->tx[i]); > >>> } > >>> > >>> net_rx_pkt_init(&core->rx_pkt, core->has_vnet); > >>> @@ -3367,6 +3426,9 @@ e1000e_core_pci_uninit(E1000ECore *core) > >>> for (i = 0; i < E1000E_NUM_QUEUES; i++) { > >>> net_tx_pkt_reset(core->tx[i].tx_pkt); > >>> net_tx_pkt_uninit(core->tx[i].tx_pkt); > >>> + qemu_bh_cancel(core->tx[i].tx_bh); > >>> + qemu_bh_delete(core->tx[i].tx_bh); > >>> + core->tx[i].tx_bh = NULL; > >>> } > >>> > >>> net_rx_pkt_uninit(core->rx_pkt); > >>> diff --git a/hw/net/e1000e_core.h b/hw/net/e1000e_core.h > >>> index aee32f7e48..0c16dce3a6 100644 > >>> --- a/hw/net/e1000e_core.h > >>> +++ b/hw/net/e1000e_core.h > >>> @@ -77,10 +77,18 @@ struct E1000Core { > >>> unsigned char sum_needed; > >>> bool cptse; > >>> struct NetTxPkt *tx_pkt; > >>> + QEMUBH *tx_bh; > >>> + uint32_t tx_waiting; > >>> + uint32_t cause; > >>> + bool ide; > >>> + E1000ECore *core; > >>> } tx[E1000E_NUM_QUEUES]; > >>> > >>> struct NetRxPkt *rx_pkt; > >>> > >>> + int32_t tx_burst; > >>> + > >>> + bool vm_running; > >>> bool has_vnet; > >>> int max_queue_num; > >>> > >> > >> Do we need to cancel the bh and reset tx_waiting in e1000e_core_reset()? > > I think we need. But why the virtio net doesn't do this? Maybe it also > > lack of this? > > > Ok, so I think the current code is probably OK. So my next revision will do following: 1. add comment for the tx_bh schdule when VM resume. 2. Leave some code in the 'e1000e_start_xmit' 3. cancel the tx_bh and reset tx_waiting in e1000e_core_reset If I lost/misunderstanding anything please let me know. Thanks, Li Qiang > > virtio-net check DRIVER_OK and the code here check TCTL_EN. > > Thanks > > > > > > Thanks, > > Li Qiang > > > >> Thanks > >> >
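For item 3 of that list, a sketch of what cancelling the bottom halves and clearing tx_waiting on device reset could look like; the helper name e1000e_reset_tx_bh is hypothetical and only illustrates the idea, it is not taken from the patch:

static void e1000e_reset_tx_bh(E1000ECore *core)
{
    for (int i = 0; i < E1000E_NUM_QUEUES; i++) {
        qemu_bh_cancel(core->tx[i].tx_bh);
        core->tx[i].tx_waiting = 0;
    }
}

e1000e_core_reset() would then call e1000e_reset_tx_bh(core) before reinitialising the rings.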
On 2020/7/21 下午1:59, Li Qiang wrote: > Jason Wang <jasowang@redhat.com> 于2020年7月21日周二 下午1:30写道: >> >> On 2020/7/21 下午12:33, Li Qiang wrote: >>> Jason Wang <jasowang@redhat.com> 于2020年7月21日周二 上午10:03写道: >>>> On 2020/7/21 上午12:45, Li Qiang wrote: >>>>> Alexander Bulekov reported a UAF bug related e1000e packets send. >>>>> >>>>> -->https://bugs.launchpad.net/qemu/+bug/1886362 >>>>> >>>>> This is because the guest trigger a e1000e packet send and set the >>>>> data's address to e1000e's MMIO address. So when the e1000e do DMA >>>>> it will write the MMIO again and trigger re-entrancy and finally >>>>> causes this UAF. >>>>> >>>>> Paolo suggested to use a bottom half whenever MMIO is doing complicate >>>>> things in here: >>>>> -->https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg03342.html >>>>> >>>>> Reference here: >>>>> 'The easiest solution is to delay processing of descriptors to a bottom >>>>> half whenever MMIO is doing something complicated. This is also better >>>>> for latency because it will free the vCPU thread more quickly and leave >>>>> the work to the I/O thread.' >>>>> >>>>> This patch fixes this UAF. >>>>> >>>>> Signed-off-by: Li Qiang <liq3ea@163.com> >>>>> --- >>>>> Change since v1: >>>>> Per Jason's review here: >>>>> -- https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg05368.html >>>>> 1. Cancel and schedule the tx bh when VM is stopped or resume >>>>> 2. Add a tx_burst for e1000e configuration to throttle the bh execution >>>>> 3. Add a tx_waiting to record whether the bh is pending or not >>>>> Don't use BQL in the tx_bh handler as when tx_bh is executed, the BQL is >>>>> acquired. >>>>> >>>>> hw/net/e1000e.c | 6 +++ >>>>> hw/net/e1000e_core.c | 106 ++++++++++++++++++++++++++++++++++--------- >>>>> hw/net/e1000e_core.h | 8 ++++ >>>>> 3 files changed, 98 insertions(+), 22 deletions(-) >>>>> >>>>> diff --git a/hw/net/e1000e.c b/hw/net/e1000e.c >>>>> index fda34518c9..24e35a78bf 100644 >>>>> --- a/hw/net/e1000e.c >>>>> +++ b/hw/net/e1000e.c >>>>> @@ -77,10 +77,14 @@ typedef struct E1000EState { >>>>> >>>>> bool disable_vnet; >>>>> >>>>> + int32_t tx_burst; >>>>> + >>>>> E1000ECore core; >>>>> >>>>> } E1000EState; >>>>> >>>>> +#define TX_BURST 256 >>>>> + >>>>> #define E1000E_MMIO_IDX 0 >>>>> #define E1000E_FLASH_IDX 1 >>>>> #define E1000E_IO_IDX 2 >>>>> @@ -263,6 +267,7 @@ static void e1000e_core_realize(E1000EState *s) >>>>> { >>>>> s->core.owner = &s->parent_obj; >>>>> s->core.owner_nic = s->nic; >>>>> + s->core.tx_burst = s->tx_burst; >>>>> } >>>>> >>>>> static void >>>>> @@ -665,6 +670,7 @@ static Property e1000e_properties[] = { >>>>> e1000e_prop_subsys_ven, uint16_t), >>>>> DEFINE_PROP_SIGNED("subsys", E1000EState, subsys, 0, >>>>> e1000e_prop_subsys, uint16_t), >>>>> + DEFINE_PROP_INT32("x-txburst", E1000EState, tx_burst, TX_BURST), >>>>> DEFINE_PROP_END_OF_LIST(), >>>>> }; >>>>> >>>>> diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c >>>>> index bcd186cac5..0a38a50cca 100644 >>>>> --- a/hw/net/e1000e_core.c >>>>> +++ b/hw/net/e1000e_core.c >>>>> @@ -909,19 +909,18 @@ e1000e_rx_ring_init(E1000ECore *core, E1000E_RxRing *rxr, int idx) >>>>> rxr->i = &i[idx]; >>>>> } >>>>> >>>>> -static void >>>>> -e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr) >>>>> +static int32_t >>>>> +e1000e_start_xmit(struct e1000e_tx *q) >>>>> { >>>>> + E1000ECore *core = q->core; >>>>> dma_addr_t base; >>>>> struct e1000_tx_desc desc; >>>>> - bool ide = false; >>>>> - const E1000E_RingInfo *txi = txr->i; >>>>> - uint32_t cause = E1000_ICS_TXQE; >>>>> 
+ const E1000E_RingInfo *txi; >>>>> + E1000E_TxRing txr; >>>>> + int32_t num_packets = 0; >>>>> >>>>> - if (!(core->mac[TCTL] & E1000_TCTL_EN)) { >>>>> - trace_e1000e_tx_disabled(); >>>>> - return; >>>>> - } >>>>> + e1000e_tx_ring_init(core, &txr, q - &core->tx[0]); >>>>> + txi = txr.i; >>>>> >>>>> while (!e1000e_ring_empty(core, txi)) { >>>>> base = e1000e_ring_head_descr(core, txi); >>>>> @@ -931,15 +930,17 @@ e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr) >>>>> trace_e1000e_tx_descr((void *)(intptr_t)desc.buffer_addr, >>>>> desc.lower.data, desc.upper.data); >>>>> >>>>> - e1000e_process_tx_desc(core, txr->tx, &desc, txi->idx); >>>>> - cause |= e1000e_txdesc_writeback(core, base, &desc, &ide, txi->idx); >>>>> + e1000e_process_tx_desc(core, txr.tx, &desc, txi->idx); >>>>> + q->cause |= e1000e_txdesc_writeback(core, base, &desc, >>>>> + &q->ide, txi->idx); >>>>> >>>>> e1000e_ring_advance(core, txi, 1); >>>>> + if (++num_packets >= core->tx_burst) { >>>>> + break; >>>>> + } >>>>> } >>>>> >>>>> - if (!ide || !e1000e_intrmgr_delay_tx_causes(core, &cause)) { >>>>> - e1000e_set_interrupt_cause(core, cause); >>>>> - } >>>>> + return num_packets; >>>>> } >>>>> >>>>> static bool >>>>> @@ -2423,32 +2424,41 @@ e1000e_set_dbal(E1000ECore *core, int index, uint32_t val) >>>>> static void >>>>> e1000e_set_tctl(E1000ECore *core, int index, uint32_t val) >>>>> { >>>>> - E1000E_TxRing txr; >>>>> core->mac[index] = val; >>>>> >>>>> if (core->mac[TARC0] & E1000_TARC_ENABLE) { >>>>> - e1000e_tx_ring_init(core, &txr, 0); >>>>> - e1000e_start_xmit(core, &txr); >>>>> + if (core->tx[0].tx_waiting) { >>>>> + return; >>>>> + } >>>>> + core->tx[0].tx_waiting = 1; >>>>> + if (!core->vm_running) { >>>>> + return; >>>>> + } >>>>> + qemu_bh_schedule(core->tx[0].tx_bh); >>>>> } >>>>> >>>>> if (core->mac[TARC1] & E1000_TARC_ENABLE) { >>>>> - e1000e_tx_ring_init(core, &txr, 1); >>>>> - e1000e_start_xmit(core, &txr); >>>>> + if (core->tx[1].tx_waiting) { >>>>> + return; >>>>> + } >>>>> + core->tx[1].tx_waiting = 1; >>>>> + if (!core->vm_running) { >>>>> + return; >>>>> + } >>>>> + qemu_bh_schedule(core->tx[1].tx_bh); >>>>> } >>>>> } >>>>> >>>>> static void >>>>> e1000e_set_tdt(E1000ECore *core, int index, uint32_t val) >>>>> { >>>>> - E1000E_TxRing txr; >>>>> int qidx = e1000e_mq_queue_idx(TDT, index); >>>>> uint32_t tarc_reg = (qidx == 0) ? TARC0 : TARC1; >>>>> >>>>> core->mac[index] = val & 0xffff; >>>>> >>>>> if (core->mac[tarc_reg] & E1000_TARC_ENABLE) { >>>>> - e1000e_tx_ring_init(core, &txr, qidx); >>>>> - e1000e_start_xmit(core, &txr); >>>>> + qemu_bh_schedule(core->tx[qidx].tx_bh); >>>>> } >>>>> } >>>>> >>>>> @@ -3315,10 +3325,56 @@ e1000e_vm_state_change(void *opaque, int running, RunState state) >>>>> trace_e1000e_vm_state_running(); >>>>> e1000e_intrmgr_resume(core); >>>>> e1000e_autoneg_resume(core); >>>>> + core->vm_running = 1; >>>>> + >>>>> + for (int i = 0; i < E1000E_NUM_QUEUES; i++) { >>>>> + qemu_bh_schedule(core->tx[i].tx_bh); >>>> I guess the reason for the unconditional scheduling of bh is to make >>>> sure tx work after live migration since we don't migrate tx_waiting. >>>> >>>> If yes, better add a comment here. >>> Ok will do in next revision. >>> >>> And do we need to clear tx_waiting here? >>> >>> I think there is no need as the tx_bh handler will do this. >> >> Yes. 
>> >> >>>>> + } >>>>> + >>>>> } else { >>>>> trace_e1000e_vm_state_stopped(); >>>>> + >>>>> + for (int i = 0; i < E1000E_NUM_QUEUES; i++) { >>>>> + qemu_bh_cancel(core->tx[i].tx_bh); >>>>> + } >>>>> + >>>>> e1000e_autoneg_pause(core); >>>>> e1000e_intrmgr_pause(core); >>>>> + core->vm_running = 0; >>>>> + } >>>>> +} >>>>> + >>>>> + >>>>> +static void e1000e_core_tx_bh(void *opaque) >>>>> +{ >>>>> + struct e1000e_tx *q = opaque; >>>>> + E1000ECore *core = q->core; >>>>> + int32_t ret; >>>>> + >>>>> + if (!core->vm_running) { >>>>> + assert(q->tx_waiting); >>>>> + return; >>>>> + } >>>>> + >>>>> + q->tx_waiting = 0; >>>>> + >>>>> + if (!(core->mac[TCTL] & E1000_TCTL_EN)) { >>>>> + trace_e1000e_tx_disabled(); >>>>> + return; >>>>> + } >>>>> + >>>>> + q->cause = E1000_ICS_TXQE; >>>>> + q->ide = false; >>>>> + >>>>> + ret = e1000e_start_xmit(q); >>>>> + if (ret >= core->tx_burst) { >>>>> + qemu_bh_schedule(q->tx_bh); >>>>> + q->tx_waiting = 1; >>>>> + return; >>>>> + } >>>>> + >>>>> + if (!q->ide || !e1000e_intrmgr_delay_tx_causes(core, &q->cause)) { >>>>> + e1000e_set_interrupt_cause(core, q->cause); >>>> So I think this will cause some delay of the interrupt delivering. >>> Exactly. if tx burst occurs, the interrupt won't be triggered in the >>> first tx_bh handler. >>> As we give the vcpu more time and this may acceptable? >> >> I think not, e1000e has its own interrupt delay mechanism, let's keep it >> work as much as possible. >> >> >>> It >>>> looks to be it's better to leave the set ics in e1000e_start_xmit(). >>> I think let e1000e_start_xmit just send packets is more clean. >>> Leave setting ics in it will not improve the performance as far as I can see. >>> What's your idea here to leave it in e1000e_start_xmit? >> >> I just worry about the delay of the interrupt if you do it in >> e1000e_core_tx_bh() and just let e1000e_start_xmit() return the number >> of packets transmitted and keep most of its code untouched. > Sorry here let's make sure I got your meaning. > 'The set ics' is meaning 'set interrupt cause' , in code: > > + if (!q->ide || !e1000e_intrmgr_delay_tx_causes(core, &q->cause)) { > + e1000e_set_interrupt_cause(core, q->cause); > > Do you mean we should leave it in the 'e1000e_start_xmit'? My > understanding is this. > if it is this, we should also add following code in it: > > + if (ret >= core->tx_burst) { > + qemu_bh_schedule(q->tx_bh); > + q->tx_waiting = 1; > + return; > + } > > Then the tx_bh handler just call 'e1000e_start_xmit'. Right? This should work. 
> > > >> >>>>> } >>>>> } >>>>> >>>>> @@ -3334,12 +3390,15 @@ e1000e_core_pci_realize(E1000ECore *core, >>>>> e1000e_autoneg_timer, core); >>>>> e1000e_intrmgr_pci_realize(core); >>>>> >>>>> + core->vm_running = runstate_is_running(); >>>>> core->vmstate = >>>>> qemu_add_vm_change_state_handler(e1000e_vm_state_change, core); >>>>> >>>>> for (i = 0; i < E1000E_NUM_QUEUES; i++) { >>>>> net_tx_pkt_init(&core->tx[i].tx_pkt, core->owner, >>>>> E1000E_MAX_TX_FRAGS, core->has_vnet); >>>>> + core->tx[i].core = core; >>>>> + core->tx[i].tx_bh = qemu_bh_new(e1000e_core_tx_bh, &core->tx[i]); >>>>> } >>>>> >>>>> net_rx_pkt_init(&core->rx_pkt, core->has_vnet); >>>>> @@ -3367,6 +3426,9 @@ e1000e_core_pci_uninit(E1000ECore *core) >>>>> for (i = 0; i < E1000E_NUM_QUEUES; i++) { >>>>> net_tx_pkt_reset(core->tx[i].tx_pkt); >>>>> net_tx_pkt_uninit(core->tx[i].tx_pkt); >>>>> + qemu_bh_cancel(core->tx[i].tx_bh); >>>>> + qemu_bh_delete(core->tx[i].tx_bh); >>>>> + core->tx[i].tx_bh = NULL; >>>>> } >>>>> >>>>> net_rx_pkt_uninit(core->rx_pkt); >>>>> diff --git a/hw/net/e1000e_core.h b/hw/net/e1000e_core.h >>>>> index aee32f7e48..0c16dce3a6 100644 >>>>> --- a/hw/net/e1000e_core.h >>>>> +++ b/hw/net/e1000e_core.h >>>>> @@ -77,10 +77,18 @@ struct E1000Core { >>>>> unsigned char sum_needed; >>>>> bool cptse; >>>>> struct NetTxPkt *tx_pkt; >>>>> + QEMUBH *tx_bh; >>>>> + uint32_t tx_waiting; >>>>> + uint32_t cause; >>>>> + bool ide; >>>>> + E1000ECore *core; >>>>> } tx[E1000E_NUM_QUEUES]; >>>>> >>>>> struct NetRxPkt *rx_pkt; >>>>> >>>>> + int32_t tx_burst; >>>>> + >>>>> + bool vm_running; >>>>> bool has_vnet; >>>>> int max_queue_num; >>>>> >>>> Do we need to cancel the bh and reset tx_waiting in e1000e_core_reset()? >>> I think we need. But why the virtio net doesn't do this? Maybe it also >>> lack of this? >> >> Ok, so I think the current code is probably OK. > So my next revision will do following: > 1. add comment for the tx_bh schdule when VM resume. > 2. Leave some code in the 'e1000e_start_xmit' > 3. cancel the tx_bh and reset tx_waiting in e1000e_core_reset > > If I lost/misunderstanding anything please let me know. Your understanding is correct. Thanks > > Thanks, > Li Qiang > >> virtio-net check DRIVER_OK and the code here check TCTL_EN. >> >> Thanks >> >> >>> Thanks, >>> Li Qiang >>> >>>> Thanks >>>>
diff --git a/hw/net/e1000e.c b/hw/net/e1000e.c
index fda34518c9..24e35a78bf 100644
--- a/hw/net/e1000e.c
+++ b/hw/net/e1000e.c
@@ -77,10 +77,14 @@ typedef struct E1000EState {
 
     bool disable_vnet;
 
+    int32_t tx_burst;
+
     E1000ECore core;
 
 } E1000EState;
 
+#define TX_BURST 256
+
 #define E1000E_MMIO_IDX     0
 #define E1000E_FLASH_IDX    1
 #define E1000E_IO_IDX       2
@@ -263,6 +267,7 @@ static void e1000e_core_realize(E1000EState *s)
 {
     s->core.owner = &s->parent_obj;
     s->core.owner_nic = s->nic;
+    s->core.tx_burst = s->tx_burst;
 }
 
 static void
@@ -665,6 +670,7 @@ static Property e1000e_properties[] = {
                        e1000e_prop_subsys_ven, uint16_t),
     DEFINE_PROP_SIGNED("subsys", E1000EState, subsys, 0,
                        e1000e_prop_subsys, uint16_t),
+    DEFINE_PROP_INT32("x-txburst", E1000EState, tx_burst, TX_BURST),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c
index bcd186cac5..0a38a50cca 100644
--- a/hw/net/e1000e_core.c
+++ b/hw/net/e1000e_core.c
@@ -909,19 +909,18 @@ e1000e_rx_ring_init(E1000ECore *core, E1000E_RxRing *rxr, int idx)
     rxr->i = &i[idx];
 }
 
-static void
-e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr)
+static int32_t
+e1000e_start_xmit(struct e1000e_tx *q)
 {
+    E1000ECore *core = q->core;
     dma_addr_t base;
     struct e1000_tx_desc desc;
-    bool ide = false;
-    const E1000E_RingInfo *txi = txr->i;
-    uint32_t cause = E1000_ICS_TXQE;
+    const E1000E_RingInfo *txi;
+    E1000E_TxRing txr;
+    int32_t num_packets = 0;
 
-    if (!(core->mac[TCTL] & E1000_TCTL_EN)) {
-        trace_e1000e_tx_disabled();
-        return;
-    }
+    e1000e_tx_ring_init(core, &txr, q - &core->tx[0]);
+    txi = txr.i;
 
     while (!e1000e_ring_empty(core, txi)) {
         base = e1000e_ring_head_descr(core, txi);
@@ -931,15 +930,17 @@ e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr)
         trace_e1000e_tx_descr((void *)(intptr_t)desc.buffer_addr,
                               desc.lower.data, desc.upper.data);
 
-        e1000e_process_tx_desc(core, txr->tx, &desc, txi->idx);
-        cause |= e1000e_txdesc_writeback(core, base, &desc, &ide, txi->idx);
+        e1000e_process_tx_desc(core, txr.tx, &desc, txi->idx);
+        q->cause |= e1000e_txdesc_writeback(core, base, &desc,
+                                            &q->ide, txi->idx);
 
         e1000e_ring_advance(core, txi, 1);
+        if (++num_packets >= core->tx_burst) {
+            break;
+        }
     }
 
-    if (!ide || !e1000e_intrmgr_delay_tx_causes(core, &cause)) {
-        e1000e_set_interrupt_cause(core, cause);
-    }
+    return num_packets;
 }
 
 static bool
@@ -2423,32 +2424,41 @@ e1000e_set_dbal(E1000ECore *core, int index, uint32_t val)
 static void
 e1000e_set_tctl(E1000ECore *core, int index, uint32_t val)
 {
-    E1000E_TxRing txr;
     core->mac[index] = val;
 
     if (core->mac[TARC0] & E1000_TARC_ENABLE) {
-        e1000e_tx_ring_init(core, &txr, 0);
-        e1000e_start_xmit(core, &txr);
+        if (core->tx[0].tx_waiting) {
+            return;
+        }
+        core->tx[0].tx_waiting = 1;
+        if (!core->vm_running) {
+            return;
+        }
+        qemu_bh_schedule(core->tx[0].tx_bh);
     }
 
     if (core->mac[TARC1] & E1000_TARC_ENABLE) {
-        e1000e_tx_ring_init(core, &txr, 1);
-        e1000e_start_xmit(core, &txr);
+        if (core->tx[1].tx_waiting) {
+            return;
+        }
+        core->tx[1].tx_waiting = 1;
+        if (!core->vm_running) {
+            return;
+        }
+        qemu_bh_schedule(core->tx[1].tx_bh);
     }
 }
 
 static void
 e1000e_set_tdt(E1000ECore *core, int index, uint32_t val)
 {
-    E1000E_TxRing txr;
     int qidx = e1000e_mq_queue_idx(TDT, index);
     uint32_t tarc_reg = (qidx == 0) ? TARC0 : TARC1;
 
     core->mac[index] = val & 0xffff;
 
     if (core->mac[tarc_reg] & E1000_TARC_ENABLE) {
-        e1000e_tx_ring_init(core, &txr, qidx);
-        e1000e_start_xmit(core, &txr);
+        qemu_bh_schedule(core->tx[qidx].tx_bh);
     }
 }
 
@@ -3315,10 +3325,56 @@ e1000e_vm_state_change(void *opaque, int running, RunState state)
         trace_e1000e_vm_state_running();
         e1000e_intrmgr_resume(core);
         e1000e_autoneg_resume(core);
+        core->vm_running = 1;
+
+        for (int i = 0; i < E1000E_NUM_QUEUES; i++) {
+            qemu_bh_schedule(core->tx[i].tx_bh);
+        }
+
     } else {
         trace_e1000e_vm_state_stopped();
+
+        for (int i = 0; i < E1000E_NUM_QUEUES; i++) {
+            qemu_bh_cancel(core->tx[i].tx_bh);
+        }
+
         e1000e_autoneg_pause(core);
         e1000e_intrmgr_pause(core);
+        core->vm_running = 0;
+    }
+}
+
+
+static void e1000e_core_tx_bh(void *opaque)
+{
+    struct e1000e_tx *q = opaque;
+    E1000ECore *core = q->core;
+    int32_t ret;
+
+    if (!core->vm_running) {
+        assert(q->tx_waiting);
+        return;
+    }
+
+    q->tx_waiting = 0;
+
+    if (!(core->mac[TCTL] & E1000_TCTL_EN)) {
+        trace_e1000e_tx_disabled();
+        return;
+    }
+
+    q->cause = E1000_ICS_TXQE;
+    q->ide = false;
+
+    ret = e1000e_start_xmit(q);
+    if (ret >= core->tx_burst) {
+        qemu_bh_schedule(q->tx_bh);
+        q->tx_waiting = 1;
+        return;
+    }
+
+    if (!q->ide || !e1000e_intrmgr_delay_tx_causes(core, &q->cause)) {
+        e1000e_set_interrupt_cause(core, q->cause);
     }
 }
 
@@ -3334,12 +3390,15 @@ e1000e_core_pci_realize(E1000ECore *core,
                                        e1000e_autoneg_timer, core);
     e1000e_intrmgr_pci_realize(core);
 
+    core->vm_running = runstate_is_running();
     core->vmstate =
         qemu_add_vm_change_state_handler(e1000e_vm_state_change, core);
 
     for (i = 0; i < E1000E_NUM_QUEUES; i++) {
         net_tx_pkt_init(&core->tx[i].tx_pkt, core->owner,
                         E1000E_MAX_TX_FRAGS, core->has_vnet);
+        core->tx[i].core = core;
+        core->tx[i].tx_bh = qemu_bh_new(e1000e_core_tx_bh, &core->tx[i]);
     }
 
     net_rx_pkt_init(&core->rx_pkt, core->has_vnet);
@@ -3367,6 +3426,9 @@ e1000e_core_pci_uninit(E1000ECore *core)
     for (i = 0; i < E1000E_NUM_QUEUES; i++) {
         net_tx_pkt_reset(core->tx[i].tx_pkt);
         net_tx_pkt_uninit(core->tx[i].tx_pkt);
+        qemu_bh_cancel(core->tx[i].tx_bh);
+        qemu_bh_delete(core->tx[i].tx_bh);
+        core->tx[i].tx_bh = NULL;
     }
 
     net_rx_pkt_uninit(core->rx_pkt);
diff --git a/hw/net/e1000e_core.h b/hw/net/e1000e_core.h
index aee32f7e48..0c16dce3a6 100644
--- a/hw/net/e1000e_core.h
+++ b/hw/net/e1000e_core.h
@@ -77,10 +77,18 @@ struct E1000Core {
         unsigned char sum_needed;
         bool cptse;
         struct NetTxPkt *tx_pkt;
+        QEMUBH *tx_bh;
+        uint32_t tx_waiting;
+        uint32_t cause;
+        bool ide;
+        E1000ECore *core;
     } tx[E1000E_NUM_QUEUES];
 
     struct NetRxPkt *rx_pkt;
 
+    int32_t tx_burst;
+
+    bool vm_running;
     bool has_vnet;
     int max_queue_num;
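The new x-txburst property caps how many descriptors a single bottom-half run will process (TX_BURST, i.e. 256, by default). As a usage sketch, it can be set like any other device property on the command line; the netdev id below is only an example:

    -netdev user,id=net0 \
    -device e1000e,netdev=net0,x-txburst=128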
Alexander Bulekov reported a use-after-free (UAF) bug in the e1000e
packet-send path:

-->https://bugs.launchpad.net/qemu/+bug/1886362

The guest triggers an e1000e packet send and points the packet data's
address at e1000e's own MMIO region. When e1000e then performs the DMA,
it writes that MMIO region again, re-entering the device emulation and
finally causing the UAF.

Paolo suggested using a bottom half whenever MMIO is doing complicated
things, here:

-->https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg03342.html

Reference:
'The easiest solution is to delay processing of descriptors to a bottom
half whenever MMIO is doing something complicated. This is also better
for latency because it will free the vCPU thread more quickly and leave
the work to the I/O thread.'

This patch fixes the UAF.

Signed-off-by: Li Qiang <liq3ea@163.com>
---
Changes since v1, per Jason's review here:
-- https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg05368.html
1. Cancel and schedule the tx bh when the VM is stopped or resumed
2. Add a tx_burst property to e1000e to throttle the bh execution
3. Add a tx_waiting flag to record whether the bh is pending or not
The tx_bh handler does not take the BQL itself, because the BQL is
already held when the bh runs.

 hw/net/e1000e.c      |   6 +++
 hw/net/e1000e_core.c | 106 ++++++++++++++++++++++++++++++++++---------
 hw/net/e1000e_core.h |   8 ++++
 3 files changed, 98 insertions(+), 22 deletions(-)
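Stripped of the e1000e specifics, the fix follows the pattern Paolo describes: the MMIO write handler no longer walks the descriptor ring (and so never issues the DMA that can re-enter the device); it only schedules a bottom half, and the ring is drained later from the main loop. Below is a minimal sketch of that pattern with hypothetical mydev_* names, using only the BH API that the patch itself uses:

typedef struct MyDevState {
    QEMUBH *tx_bh;
    uint32_t tx_waiting;
    /* device registers, rings, ... */
} MyDevState;

static void mydev_tx_bh(void *opaque)
{
    MyDevState *s = opaque;

    s->tx_waiting = 0;
    /*
     * Walk the tx ring and do the DMA here, outside the MMIO dispatch
     * path, so a descriptor that points back into our own MMIO BAR can
     * no longer re-enter the register handlers mid-update.
     */
}

static void mydev_mmio_write(void *opaque, hwaddr addr,
                             uint64_t val, unsigned size)
{
    MyDevState *s = opaque;

    /* ... update the register selected by addr ... */

    if (!s->tx_waiting) {          /* transmit was kicked: defer it */
        s->tx_waiting = 1;
        qemu_bh_schedule(s->tx_bh);
    }
}

/* at realize time: s->tx_bh = qemu_bh_new(mydev_tx_bh, s); */

This also matches the latency argument quoted above: the vCPU returns from the MMIO write immediately, and the descriptor processing runs later in the I/O thread.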