Patchwork [RFC] updated e1000 mitigation patch

login
register
mail settings
Submitter Luigi Rizzo
Date Dec. 27, 2012, 10:06 a.m.
Message ID <20121227100658.GB48891@onelab2.iet.unipi.it>
Download mbox | patch
Permalink /patch/208294/
State New
Headers show

Comments

Luigi Rizzo - Dec. 27, 2012, 10:06 a.m.
Before submitting a proper patch, I'd like to hear feedback on the
following proposed change to hw/e1000.c to properly implement
interrupt mitigation. 
This is joint work with Vincenzo Maffione and Giuseppe Lettieri (in Cc),
and is a followup to a similar patch i posted in july

https://lists.gnu.org/archive/html/qemu-devel/2012-07/msg03195.html


The patch models three of the five (sic!) e1000 register that control
moderation, and uses qemu timers for that (the minimum delay for
these timers affects the fidelity of the emulation).

Right now there is a static variable that controls whether the
feature is enabled. I would probably like to make it a parameter
accessible from the command line in qemu, possibly extending it to
override the mitigation delay set by the device driver.

Right now we reached transmit speeds in the order of 2-300Kpps
(if matched with a proper guest device driver optimization, see
https://groups.google.com/forum/?fromgroups=#!topic/mailing.freebsd.emulator/xQHR_hleCuc )

Some performance data using a FreeBSD guest, for udp transmissions:

					KVM		QEMU
standard KVM, standard driver            20 Kpps	 6.3 Kpps
modified KVM, standard driver            37 Kpps	28 Kpps
modified KVM, modified driver           200 Kpps	34 Kpps
 
As you can see, on kvm this change gives a 5x speedup to the tx path,
which combines nicely with the 2x speedup that comes from supporting
interrupt mitigation alone in the hypervisor. Without kvm (or kqemu ?)
the benefits are much lower, as the guest becomes too slow.

cheers
luigi
Blue Swirl - Dec. 28, 2012, 5:48 p.m.
On Thu, Dec 27, 2012 at 10:06 AM, Luigi Rizzo <rizzo@iet.unipi.it> wrote:
> Before submitting a proper patch, I'd like to hear feedback on the
> following proposed change to hw/e1000.c to properly implement
> interrupt mitigation.
> This is joint work with Vincenzo Maffione and Giuseppe Lettieri (in Cc),
> and is a followup to a similar patch i posted in july
>
> https://lists.gnu.org/archive/html/qemu-devel/2012-07/msg03195.html
>
>
> The patch models three of the five (sic!) e1000 register that control
> moderation, and uses qemu timers for that (the minimum delay for
> these timers affects the fidelity of the emulation).
>
> Right now there is a static variable that controls whether the
> feature is enabled. I would probably like to make it a parameter
> accessible from the command line in qemu, possibly extending it to
> override the mitigation delay set by the device driver.
>
> Right now we reached transmit speeds in the order of 2-300Kpps
> (if matched with a proper guest device driver optimization, see
> https://groups.google.com/forum/?fromgroups=#!topic/mailing.freebsd.emulator/xQHR_hleCuc )
>
> Some performance data using a FreeBSD guest, for udp transmissions:
>
>                                         KVM             QEMU
> standard KVM, standard driver            20 Kpps         6.3 Kpps
> modified KVM, standard driver            37 Kpps        28 Kpps
> modified KVM, modified driver           200 Kpps        34 Kpps
>
> As you can see, on kvm this change gives a 5x speedup to the tx path,
> which combines nicely with the 2x speedup that comes from supporting
> interrupt mitigation alone in the hypervisor. Without kvm (or kqemu ?)
> the benefits are much lower, as the guest becomes too slow.
>
> cheers
> luigi
>
> diff -urp qemu-1.3.0-orig/hw/e1000.c qemu-1.3.0/hw/e1000.c
> --- qemu-1.3.0-orig/hw/e1000.c  2012-12-03 20:37:05.000000000 +0100
> +++ qemu-1.3.0/hw/e1000.c       2012-12-27 09:47:16.000000000 +0100
> @@ -35,6 +35,8 @@
>
>  #include "e1000_hw.h"
>
> +static int mit_on = 1; /* interrupt mitigation enable */
> +
>  #define E1000_DEBUG
>
>  #ifdef E1000_DEBUG
> @@ -129,6 +131,9 @@ typedef struct E1000State_st {
>      } eecd_state;
>
>      QEMUTimer *autoneg_timer;
> +    QEMUTimer *mit_timer;      // handle for the timer
> +    uint32_t mit_timer_on;     // mitigation timer active
> +    uint32_t mit_cause;                // pending interrupt cause
>  } E1000State;
>
>  #define        defreg(x)       x = (E1000_##x>>2)
> @@ -144,6 +149,7 @@ enum {
>      defreg(TPR),       defreg(TPT),    defreg(TXDCTL), defreg(WUFC),
>      defreg(RA),                defreg(MTA),    defreg(CRCERRS),defreg(VFTA),
>      defreg(VET),
> +    defreg(RDTR),      defreg(RADV),   defreg(TADV),   defreg(ITR),
>  };
>
>  static void
> @@ -639,6 +645,68 @@ static uint64_t tx_desc_base(E1000State
>      return (bah << 32) + bal;
>  }
>
> +/* helper function, 0 means the value is not set */
> +static inline void
> +mit_update_delay(uint32_t *curr, uint32_t value)
> +{
> +    if (value && (*curr == 0 || value < *curr))

Missing braces, please read CODING_STYLE and check the patches with
scripts/checkpatch.pl.

Also in several places below.

> +       *curr = value;
> +}
> +
> +/*
> + * If necessary, rearm the timer and post an interrupt.
> + * Called at the end of tx/rx routines (mit_timer_on == 0),
> + * and when the timer fires (mit_timer_on == 1).
> + * We provide a partial implementation of interrupt mitigation,
> + * emulating only RADV, TADV and ITR (lower 16 bits, 1024ns units for
> + * RADV and TADV, 256ns units for ITR). RDTR is only used to enable RADV;
> + * relative timers based on TIDV and RDTR are not implemented.
> + */
> +static void
> +mit_rearm_and_int(void *opaque)
> +{
> +    E1000State *s = opaque;
> +    uint32_t mit_delay = 0;
> +
> +    /*
> +     * Clear the flag. It is only set when the callback fires,
> +     * and we need to clear it anyways.
> +     */
> +    s->mit_timer_on = 0;
> +    if (s->mit_cause == 0) /* no events pending, we are done */
> +       return;
> +    /*
> +     * Compute the next mitigation delay according to pending interrupts
> +     * and the current values of RADV (provided RDTR!=0), TADV and ITR.
> +     * Then rearm the timer.
> +     */
> +    if (s->mit_cause & (E1000_ICR_TXQE | E1000_ICR_TXDW))
> +       mit_update_delay(&mit_delay, s->mac_reg[TADV] * 4);
> +    if (s->mac_reg[RDTR] && (s->mit_cause & E1000_ICS_RXT0))
> +       mit_update_delay(&mit_delay, s->mac_reg[RADV] * 4);
> +    mit_update_delay(&mit_delay, s->mac_reg[ITR]);
> +
> +    if (mit_delay) {
> +       s->mit_timer_on = 1;
> +       qemu_mod_timer(s->mit_timer,
> +               qemu_get_clock_ns(vm_clock) + mit_delay * 256);
> +    }
> +    set_ics(s, 0, s->mit_cause);
> +    s->mit_cause = 0;
> +}
> +
> +static void
> +mit_set_ics(E1000State *s, uint32_t cause)
> +{
> +    if (mit_on == 0) {
> +       set_ics(s, 0, cause);
> +       return;
> +    }
> +    s->mit_cause |= cause;
> +    if (!s->mit_timer_on)
> +       mit_rearm_and_int(s);
> +}
> +
>  static void
>  start_xmit(E1000State *s)
>  {
> @@ -676,7 +744,7 @@ start_xmit(E1000State *s)
>              break;
>          }
>      }
> -    set_ics(s, 0, cause);
> +    mit_set_ics(s, cause);
>  }
>
>  static int
> @@ -894,7 +962,7 @@ e1000_receive(NetClientState *nc, const
>          s->rxbuf_min_shift)
>          n |= E1000_ICS_RXDMT0;
>
> -    set_ics(s, 0, n);
> +    mit_set_ics(s, n);
>
>      return size;
>  }
> @@ -999,6 +1067,7 @@ static uint32_t (*macreg_readops[])(E100
>      getreg(RDH),       getreg(RDT),    getreg(VET),    getreg(ICS),
>      getreg(TDBAL),     getreg(TDBAH),  getreg(RDBAH),  getreg(RDBAL),
>      getreg(TDLEN),     getreg(RDLEN),
> +    getreg(RDTR),      getreg(RADV),   getreg(TADV),   getreg(ITR),
>
>      [TOTH] = mac_read_clr8,    [TORH] = mac_read_clr8, [GPRC] = mac_read_clr4,
>      [GPTC] = mac_read_clr4,    [TPR] = mac_read_clr4,  [TPT] = mac_read_clr4,
> @@ -1015,6 +1084,8 @@ static void (*macreg_writeops[])(E1000St
>      putreg(PBA),       putreg(EERD),   putreg(SWSM),   putreg(WUFC),
>      putreg(TDBAL),     putreg(TDBAH),  putreg(TXDCTL), putreg(RDBAH),
>      putreg(RDBAL),     putreg(LEDCTL), putreg(VET),
> +    [RDTR] = set_16bit,        [RADV] = set_16bit,     [TADV] = set_16bit,
> +    [ITR] = set_16bit,
>      [TDLEN] = set_dlen,        [RDLEN] = set_dlen,     [TCTL] = set_tctl,
>      [TDT] = set_tctl,  [MDIC] = set_mdic,      [ICS] = set_ics,
>      [TDH] = set_16bit, [RDH] = set_16bit,      [RDT] = set_rdt,
> @@ -1286,6 +1357,9 @@ static int pci_e1000_init(PCIDevice *pci
>      add_boot_device_path(d->conf.bootindex, &pci_dev->qdev, "/ethernet-phy@0");
>
>      d->autoneg_timer = qemu_new_timer_ms(vm_clock, e1000_autoneg_timer, d);
> +    d->mit_cause = 0;
> +    d->mit_timer_on = 0;
> +    d->mit_timer = qemu_new_timer_ns(vm_clock, mit_rearm_and_int, d);
>
>      return 0;
>  }
>
Stefan Hajnoczi - Jan. 10, 2013, 12:25 p.m.
On Thu, Dec 27, 2012 at 11:06:58AM +0100, Luigi Rizzo wrote:
> diff -urp qemu-1.3.0-orig/hw/e1000.c qemu-1.3.0/hw/e1000.c
> --- qemu-1.3.0-orig/hw/e1000.c	2012-12-03 20:37:05.000000000 +0100
> +++ qemu-1.3.0/hw/e1000.c	2012-12-27 09:47:16.000000000 +0100
> @@ -35,6 +35,8 @@
>  
>  #include "e1000_hw.h"
>  
> +static int mit_on = 1;	/* interrupt mitigation enable */

If you want to make this optional then please put it inside E1000State
so it can be toggled per NIC.

If you don't want to make it optional then please post results for
recent Linux and Windows guests to show that out-of-the-box performance
is not degraded.

>  #define E1000_DEBUG
>  
>  #ifdef E1000_DEBUG
> @@ -129,6 +131,9 @@ typedef struct E1000State_st {
>      } eecd_state;
>  
>      QEMUTimer *autoneg_timer;
> +    QEMUTimer *mit_timer;	// handle for the timer
> +    uint32_t mit_timer_on;	// mitigation timer active
> +    uint32_t mit_cause;		// pending interrupt cause

Live migration support is required so that pending interrupts are
migrated.  We cannot lose interrupts across live migration.

I would migrate mit_cause but not mit_timer_on.  When loading (resuming
from live migration), look at mit_cause and raise any interrupts.

Stefan
Luigi Rizzo - Jan. 11, 2013, 2:53 p.m.
On Thu, Jan 10, 2013 at 01:25:48PM +0100, Stefan Hajnoczi wrote:
> On Thu, Dec 27, 2012 at 11:06:58AM +0100, Luigi Rizzo wrote:
> > diff -urp qemu-1.3.0-orig/hw/e1000.c qemu-1.3.0/hw/e1000.c
> > --- qemu-1.3.0-orig/hw/e1000.c	2012-12-03 20:37:05.000000000 +0100
> > +++ qemu-1.3.0/hw/e1000.c	2012-12-27 09:47:16.000000000 +0100
> > @@ -35,6 +35,8 @@
> >  
> >  #include "e1000_hw.h"
> >  
> > +static int mit_on = 1;	/* interrupt mitigation enable */
> 
> If you want to make this optional then please put it inside E1000State
> so it can be toggled per NIC.

what is the simplest way to add NIC-specific options ?
I have added one line to e1000_properties, as below

 static Property e1000_properties[] = {
     DEFINE_NIC_PROPERTIES(E1000State, conf),
+    DEFINE_PROP_UINT32("mit_on, E1000State, mit_on, 0),
     DEFINE_PROP_END_OF_LIST(),
 };

and this way i can do recognise this on the command line
	qemu ... -device e1000,mit_on=1 ...

but i do not know how to set the property for the NIC i am using.
Specifically, i normally run qemu with "-net nic,model=1000"
(leaving the nic unconnected to the host network, so i can
test the tx path without being constrained by the backend's speed)

Any suggestion ?

thanks
luigi
Stefan Hajnoczi - Jan. 14, 2013, 8:58 a.m.
On Fri, Jan 11, 2013 at 03:53:50PM +0100, Luigi Rizzo wrote:
> On Thu, Jan 10, 2013 at 01:25:48PM +0100, Stefan Hajnoczi wrote:
> > On Thu, Dec 27, 2012 at 11:06:58AM +0100, Luigi Rizzo wrote:
> > > diff -urp qemu-1.3.0-orig/hw/e1000.c qemu-1.3.0/hw/e1000.c
> > > --- qemu-1.3.0-orig/hw/e1000.c	2012-12-03 20:37:05.000000000 +0100
> > > +++ qemu-1.3.0/hw/e1000.c	2012-12-27 09:47:16.000000000 +0100
> > > @@ -35,6 +35,8 @@
> > >  
> > >  #include "e1000_hw.h"
> > >  
> > > +static int mit_on = 1;	/* interrupt mitigation enable */
> > 
> > If you want to make this optional then please put it inside E1000State
> > so it can be toggled per NIC.
> 
> what is the simplest way to add NIC-specific options ?
> I have added one line to e1000_properties, as below
> 
>  static Property e1000_properties[] = {
>      DEFINE_NIC_PROPERTIES(E1000State, conf),
> +    DEFINE_PROP_UINT32("mit_on, E1000State, mit_on, 0),
>      DEFINE_PROP_END_OF_LIST(),
>  };
> 
> and this way i can do recognise this on the command line
> 	qemu ... -device e1000,mit_on=1 ...
> 
> but i do not know how to set the property for the NIC i am using.
> Specifically, i normally run qemu with "-net nic,model=1000"
> (leaving the nic unconnected to the host network, so i can
> test the tx path without being constrained by the backend's speed)

Short answer: -device e1000,mit_on=1,vlan=2

That puts the e1000 on its own hub (QEMU "VLAN") and not connected to
the outside world.  This is equivalent what you get with "-net
nic,model=e1000" except you can specify device properties like "mit_on".

Long answer:

"-net nic" is handled in net_init_nic().  Unfortunately this goes via
the intermediate NICInfo struct instead of directly to the e1000
properties.  You'd need to plumb the option through NICInfo, but that's
ugly since NICInfo is not e1000-specific and other NICs don't have the
option.

I'm not sure if it's possible to get rid of NICInfo and the nd_table[]
global but I hope so.

For the time being I'd avoid using device-specific options with "-net
nic".

Stefan

Patch

diff -urp qemu-1.3.0-orig/hw/e1000.c qemu-1.3.0/hw/e1000.c
--- qemu-1.3.0-orig/hw/e1000.c	2012-12-03 20:37:05.000000000 +0100
+++ qemu-1.3.0/hw/e1000.c	2012-12-27 09:47:16.000000000 +0100
@@ -35,6 +35,8 @@ 
 
 #include "e1000_hw.h"
 
+static int mit_on = 1;	/* interrupt mitigation enable */
+
 #define E1000_DEBUG
 
 #ifdef E1000_DEBUG
@@ -129,6 +131,9 @@  typedef struct E1000State_st {
     } eecd_state;
 
     QEMUTimer *autoneg_timer;
+    QEMUTimer *mit_timer;	// handle for the timer
+    uint32_t mit_timer_on;	// mitigation timer active
+    uint32_t mit_cause;		// pending interrupt cause
 } E1000State;
 
 #define	defreg(x)	x = (E1000_##x>>2)
@@ -144,6 +149,7 @@  enum {
     defreg(TPR),	defreg(TPT),	defreg(TXDCTL),	defreg(WUFC),
     defreg(RA),		defreg(MTA),	defreg(CRCERRS),defreg(VFTA),
     defreg(VET),
+    defreg(RDTR),	defreg(RADV),	defreg(TADV),	defreg(ITR),
 };
 
 static void
@@ -639,6 +645,68 @@  static uint64_t tx_desc_base(E1000State
     return (bah << 32) + bal;
 }
 
+/* helper function, 0 means the value is not set */
+static inline void
+mit_update_delay(uint32_t *curr, uint32_t value)
+{
+    if (value && (*curr == 0 || value < *curr))
+	*curr = value;
+}
+
+/*
+ * If necessary, rearm the timer and post an interrupt.
+ * Called at the end of tx/rx routines (mit_timer_on == 0),
+ * and when the timer fires (mit_timer_on == 1).
+ * We provide a partial implementation of interrupt mitigation,
+ * emulating only RADV, TADV and ITR (lower 16 bits, 1024ns units for
+ * RADV and TADV, 256ns units for ITR). RDTR is only used to enable RADV;
+ * relative timers based on TIDV and RDTR are not implemented.
+ */
+static void
+mit_rearm_and_int(void *opaque)
+{
+    E1000State *s = opaque;
+    uint32_t mit_delay = 0;
+
+    /*
+     * Clear the flag. It is only set when the callback fires,
+     * and we need to clear it anyways.
+     */
+    s->mit_timer_on = 0;
+    if (s->mit_cause == 0) /* no events pending, we are done */
+	return;
+    /*
+     * Compute the next mitigation delay according to pending interrupts
+     * and the current values of RADV (provided RDTR!=0), TADV and ITR.
+     * Then rearm the timer.
+     */
+    if (s->mit_cause & (E1000_ICR_TXQE | E1000_ICR_TXDW))
+	mit_update_delay(&mit_delay, s->mac_reg[TADV] * 4);
+    if (s->mac_reg[RDTR] && (s->mit_cause & E1000_ICS_RXT0))
+	mit_update_delay(&mit_delay, s->mac_reg[RADV] * 4);
+    mit_update_delay(&mit_delay, s->mac_reg[ITR]);  
+
+    if (mit_delay) {
+	s->mit_timer_on = 1;
+	qemu_mod_timer(s->mit_timer,
+		qemu_get_clock_ns(vm_clock) + mit_delay * 256);
+    }
+    set_ics(s, 0, s->mit_cause);
+    s->mit_cause = 0;
+}
+
+static void
+mit_set_ics(E1000State *s, uint32_t cause)
+{
+    if (mit_on == 0) {
+	set_ics(s, 0, cause);
+	return;
+    }
+    s->mit_cause |= cause;
+    if (!s->mit_timer_on)
+	mit_rearm_and_int(s);
+}
+
 static void
 start_xmit(E1000State *s)
 {
@@ -676,7 +744,7 @@  start_xmit(E1000State *s)
             break;
         }
     }
-    set_ics(s, 0, cause);
+    mit_set_ics(s, cause);
 }
 
 static int
@@ -894,7 +962,7 @@  e1000_receive(NetClientState *nc, const
         s->rxbuf_min_shift)
         n |= E1000_ICS_RXDMT0;
 
-    set_ics(s, 0, n);
+    mit_set_ics(s, n);
 
     return size;
 }
@@ -999,6 +1067,7 @@  static uint32_t (*macreg_readops[])(E100
     getreg(RDH),	getreg(RDT),	getreg(VET),	getreg(ICS),
     getreg(TDBAL),	getreg(TDBAH),	getreg(RDBAH),	getreg(RDBAL),
     getreg(TDLEN),	getreg(RDLEN),
+    getreg(RDTR),	getreg(RADV),	getreg(TADV),	getreg(ITR),
 
     [TOTH] = mac_read_clr8,	[TORH] = mac_read_clr8,	[GPRC] = mac_read_clr4,
     [GPTC] = mac_read_clr4,	[TPR] = mac_read_clr4,	[TPT] = mac_read_clr4,
@@ -1015,6 +1084,8 @@  static void (*macreg_writeops[])(E1000St
     putreg(PBA),	putreg(EERD),	putreg(SWSM),	putreg(WUFC),
     putreg(TDBAL),	putreg(TDBAH),	putreg(TXDCTL),	putreg(RDBAH),
     putreg(RDBAL),	putreg(LEDCTL), putreg(VET),
+    [RDTR] = set_16bit,	[RADV] = set_16bit,	[TADV] = set_16bit,
+    [ITR] = set_16bit,
     [TDLEN] = set_dlen,	[RDLEN] = set_dlen,	[TCTL] = set_tctl,
     [TDT] = set_tctl,	[MDIC] = set_mdic,	[ICS] = set_ics,
     [TDH] = set_16bit,	[RDH] = set_16bit,	[RDT] = set_rdt,
@@ -1286,6 +1357,9 @@  static int pci_e1000_init(PCIDevice *pci
     add_boot_device_path(d->conf.bootindex, &pci_dev->qdev, "/ethernet-phy@0");
 
     d->autoneg_timer = qemu_new_timer_ms(vm_clock, e1000_autoneg_timer, d);
+    d->mit_cause = 0;
+    d->mit_timer_on = 0;
+    d->mit_timer = qemu_new_timer_ns(vm_clock, mit_rearm_and_int, d);
 
     return 0;
 }