Patchwork e1000: add interrupt mitigation support

login
register
mail settings
Submitter Vincenzo Maffione
Date July 25, 2013, 1:25 p.m.
Message ID <CA+_eA9hJEWkVVTmVSG8Qcj2PsMTz+MpZNga0eSYTS7MuNrgnVQ@mail.gmail.com>
Download mbox | patch
Permalink /patch/261693/
State New
Headers show

Comments

Vincenzo Maffione - July 25, 2013, 1:25 p.m.
From e500c2fde6015def020166b7aac91d053d361d1e Mon Sep 17 00:00:00 2001
From: Vincenzo Maffione <v.maffione@gmail.com>
Date: Thu, 25 Jul 2013 15:14:24 +0200
Subject: [PATCH] e1000: add interrupt mitigation support

This patch partially implements the e1000 interrupt mitigation mechanisms.
Using a single QEMUTimer, it emulates the ITR register (which is the newer
mitigation register, recommended by Intel) and approximately emulates
RADV and TADV registers. TIDV and RDTR register functionalities are not
emulated (RDTR is only used to validate RADV, according to the e1000 specs).

RADV, TADV, TIDV and RDTR registers make up the older e1000 mitigation
mechanism and would need a timer each to be completely emulated. However,
a single timer has been used in order to reach a good compromise between
emulation accuracy and simplicity/efficiency.

The implemented mechanism can be enabled or disabled by specifying the
e1000-specific boolean command-line parameter "mit", e.g.

    qemu-system-x86_64 -device e1000,mit=on,... ...

For more information, see the Software developer's manual at
http://download.intel.com/design/network/manuals/8254x_GBe_SDM.pdf.

Signed-off-by: Vincenzo Maffione <v.maffione@gmail.com>
---
 hw/net/e1000.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 96 insertions(+), 3 deletions(-)

     [GPTC] = mac_read_clr4,    [TPR] = mac_read_clr4,    [TPT] = mac_read_clr4,
@@ -1069,6 +1143,8 @@ static void (*macreg_writeops[])(E1000State *, int, uint32_t) = {
     [TDH] = set_16bit,    [RDH] = set_16bit,    [RDT] = set_rdt,
     [IMC] = set_imc,    [IMS] = set_ims,    [ICR] = set_icr,
     [EECD] = set_eecd,    [RCTL] = set_rx_control, [CTRL] = set_ctrl,
+    [RDTR] = set_16bit, [RADV] = set_16bit,     [TADV] = set_16bit,
+    [ITR] = set_16bit,
     [RA ... RA+31] = &mac_writereg,
     [MTA ... MTA+127] = &mac_writereg,
     [VFTA ... VFTA+127] = &mac_writereg,
@@ -1171,6 +1247,11 @@ static int e1000_post_load(void *opaque, int version_id)
     E1000State *s = opaque;
     NetClientState *nc = qemu_get_queue(s->nic);

+    /* If the mitigation timer was active, emulate a timeout now. */
+    if (s->mit_timer_on) {
+        e1000_mit_timer(s);
+    }
+
     /* nc.link_down can't be migrated, so infer link_down according
      * to link status bit in mac_reg[STATUS].
      * Alternatively, restart link negotiation if it was in progress. */
@@ -1263,9 +1344,17 @@ static const VMStateDescription vmstate_e1000 = {
         VMSTATE_UINT32(mac_reg[TXDCTL], E1000State),
         VMSTATE_UINT32(mac_reg[WUFC], E1000State),
         VMSTATE_UINT32(mac_reg[VET], E1000State),
+        VMSTATE_UINT32(mac_reg[RDTR], E1000State),
+        VMSTATE_UINT32(mac_reg[RADV], E1000State),
+        VMSTATE_UINT32(mac_reg[TADV], E1000State),
+        VMSTATE_UINT32(mac_reg[ITR], E1000State),
         VMSTATE_UINT32_SUB_ARRAY(mac_reg, E1000State, RA, 32),
         VMSTATE_UINT32_SUB_ARRAY(mac_reg, E1000State, MTA, 128),
         VMSTATE_UINT32_SUB_ARRAY(mac_reg, E1000State, VFTA, 128),
+        VMSTATE_BOOL(mit_timer_on, E1000State),
+        VMSTATE_BOOL(mit_irq_level, E1000State),
+        VMSTATE_BOOL(mit_on, E1000State),
+        VMSTATE_UINT32(mit_ide, E1000State),
         VMSTATE_END_OF_LIST()
     }
 };
@@ -1316,6 +1405,8 @@ pci_e1000_uninit(PCIDevice *dev)

     qemu_del_timer(d->autoneg_timer);
     qemu_free_timer(d->autoneg_timer);
+    qemu_del_timer(d->mit_timer);
+    qemu_free_timer(d->mit_timer);
     memory_region_destroy(&d->mmio);
     memory_region_destroy(&d->io);
     qemu_del_nic(d->nic);
@@ -1371,6 +1462,7 @@ static int pci_e1000_init(PCIDevice *pci_dev)
     add_boot_device_path(d->conf.bootindex, dev, "/ethernet-phy@0");

     d->autoneg_timer = qemu_new_timer_ms(vm_clock, e1000_autoneg_timer, d);
+    d->mit_timer = qemu_new_timer_ns(vm_clock, e1000_mit_timer, d);

     return 0;
 }
@@ -1385,6 +1477,7 @@ static Property e1000_properties[] = {
     DEFINE_NIC_PROPERTIES(E1000State, conf),
     DEFINE_PROP_BIT("autonegotiation", E1000State,
                     compat_flags, E1000_FLAG_AUTONEG_BIT, true),
+    DEFINE_PROP_BOOL("mit", E1000State, mit_on, true),
     DEFINE_PROP_END_OF_LIST(),
 };
Michael S. Tsirkin - July 25, 2013, 3:18 p.m.
On Thu, Jul 25, 2013 at 03:25:32PM +0200, Vincenzo Maffione wrote:
> From e500c2fde6015def020166b7aac91d053d361d1e Mon Sep 17 00:00:00 2001
> From: Vincenzo Maffione <v.maffione@gmail.com>
> Date: Thu, 25 Jul 2013 15:14:24 +0200
> Subject: [PATCH] e1000: add interrupt mitigation support
> 
> This patch partially implements the e1000 interrupt mitigation mechanisms.
> Using a single QEMUTimer, it emulates the ITR register (which is the newer
> mitigation register, recommended by Intel) and approximately emulates
> RADV and TADV registers. TIDV and RDTR register functionalities are not
> emulated (RDTR is only used to validate RADV, according to the e1000 specs).
> 
> RADV, TADV, TIDV and RDTR registers make up the older e1000 mitigation
> mechanism and would need a timer each to be completely emulated. However,
> a single timer has been used in order to reach a good compromise between
> emulation accuracy and simplicity/efficiency.
> 
> The implemented mechanism can be enabled/disabled specifying the command
> line e1000-specific boolean parameter "mit", e.g.
> 
>     qemu-system-x86_64 -device e1000,mit=on,... ...
> 
> For more information, see the Software developer's manual at
> http://download.intel.com/design/network/manuals/8254x_GBe_SDM.pdf.
> 
> Signed-off-by: Vincenzo Maffione <v.maffione@gmail.com>

You'll also need to disable this new mechanism when
-M pc-1.5 is specified.

> ---
>  hw/net/e1000.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 96 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/net/e1000.c b/hw/net/e1000.c
> index b952d8d..9c573ab 100644
> --- a/hw/net/e1000.c
> +++ b/hw/net/e1000.c
> @@ -135,6 +135,12 @@ typedef struct E1000State_st {
> 
>      QEMUTimer *autoneg_timer;
> 
> +    QEMUTimer *mit_timer;      /* Mitigation timer. */
> +    bool mit_timer_on;         /* Mitigation timer is running. */
> +    bool mit_irq_level;        /* Tracks interrupt pin level. */
> +    bool mit_on;               /* Mitigation enabled. */
> +    uint32_t mit_ide;          /* Tracks E1000_TXD_CMD_IDE bit. */
> +
>  /* Compatibility flags for migration to/from qemu 1.3.0 and older */
>  #define E1000_FLAG_AUTONEG_BIT 0
>  #define E1000_FLAG_AUTONEG (1 << E1000_FLAG_AUTONEG_BIT)
> @@ -158,7 +164,8 @@ enum {
>      defreg(TORH),    defreg(TORL),    defreg(TOTH),    defreg(TOTL),
>      defreg(TPR),    defreg(TPT),    defreg(TXDCTL),    defreg(WUFC),
>      defreg(RA),        defreg(MTA),    defreg(CRCERRS),defreg(VFTA),
> -    defreg(VET),
> +    defreg(VET),        defreg(RDTR),   defreg(RADV),   defreg(TADV),
> +    defreg(ITR),
>  };
> 
>  static void
> @@ -245,10 +252,21 @@ static const uint32_t mac_reg_init[] = {
>                  E1000_MANC_RMCP_EN,
>  };
> 
> +/* Helper function, *curr == 0 means the value is not set */
> +static inline void
> +mit_update_delay(uint32_t *curr, uint32_t value)
> +{
> +    if (value && (*curr == 0 || value < *curr)) {
> +        *curr = value;
> +    }
> +}
> +
>  static void
>  set_interrupt_cause(E1000State *s, int index, uint32_t val)
>  {
>      PCIDevice *d = PCI_DEVICE(s);
> +    uint32_t pending_ints;
> +    uint32_t mit_delay;
> 
>      if (val && (E1000_DEVID >= E1000_DEV_ID_82547EI_MOBILE)) {
>          /* Only for 8257x */
> @@ -266,7 +284,57 @@ set_interrupt_cause(E1000State *s, int index, uint32_t val)
>       */
>      s->mac_reg[ICS] = val;
> 
> -    qemu_set_irq(d->irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0);
> +    pending_ints = (s->mac_reg[IMS] & s->mac_reg[ICR]);
> +    if (!s->mit_irq_level && pending_ints) {
> +        /*
> +         * Here we detect a potential raising edge. We postpone raising the
> +         * interrupt line if we are inside the mitigation delay window
> +         * (s->mit_timer_on == 1).
> +         * We provide a partial implementation of interrupt mitigation,
> +         * emulating only RADV, TADV and ITR (lower 16 bits, 1024ns units for
> +         * RADV and TADV, 256ns units for ITR). RDTR is only used to enable
> +         * RADV; relative timers based on TIDV and RDTR are not implemented.
> +         */
> +        if (s->mit_timer_on) {
> +            return;
> +        }
> +        if (s->mit_on) {
> +            /* Compute the next mitigation delay according to pending
> +             * interrupts and the current values of RADV (provided
> +             * RDTR!=0), TADV and ITR.
> +             * Then rearm the timer.
> +             */
> +            mit_delay = 0;
> +            if (s->mit_ide &&
> +                    (pending_ints & (E1000_ICR_TXQE | E1000_ICR_TXDW))) {
> +                mit_update_delay(&mit_delay, s->mac_reg[TADV] * 4);
> +            }
> +            if (s->mac_reg[RDTR] && (pending_ints & E1000_ICS_RXT0)) {
> +                mit_update_delay(&mit_delay, s->mac_reg[RADV] * 4);
> +            }
> +            mit_update_delay(&mit_delay, s->mac_reg[ITR]);
> +
> +            if (mit_delay) {
> +                s->mit_timer_on = 1;
> +                qemu_mod_timer(s->mit_timer,
> +                        qemu_get_clock_ns(vm_clock) + mit_delay * 256);
> +            }
> +            s->mit_ide = 0;
> +        }
> +    }
> +
> +    s->mit_irq_level = (pending_ints != 0);
> +    qemu_set_irq(d->irq[0], s->mit_irq_level);
> +}
> +
> +static void
> +e1000_mit_timer(void *opaque)
> +{
> +    E1000State *s = opaque;
> +
> +    s->mit_timer_on = 0;
> +    /* Call set_interrupt_cause to update the irq level (if necessary). */
> +    set_interrupt_cause(s, 0, s->mac_reg[ICR]);
>  }
> 
>  static void
> @@ -307,6 +375,10 @@ static void e1000_reset(void *opaque)
>      int i;
> 
>      qemu_del_timer(d->autoneg_timer);
> +    qemu_del_timer(d->mit_timer);
> +    d->mit_timer_on = 0;
> +    d->mit_irq_level = 0;
> +    d->mit_ide = 0;
>      memset(d->phy_reg, 0, sizeof d->phy_reg);
>      memmove(d->phy_reg, phy_reg_init, sizeof phy_reg_init);
>      memset(d->mac_reg, 0, sizeof d->mac_reg);
> @@ -572,6 +644,7 @@ process_tx_desc(E1000State *s, struct e1000_tx_desc *dp)
>      struct e1000_context_desc *xp = (struct e1000_context_desc *)dp;
>      struct e1000_tx *tp = &s->tx;
> 
> +    s->mit_ide |= (txd_lower & E1000_TXD_CMD_IDE);
>      if (dtype == E1000_TXD_CMD_DEXT) {    // context descriptor
>          op = le32_to_cpu(xp->cmd_and_length);
>          tp->ipcss = xp->lower_setup.ip_fields.ipcss;
> @@ -1047,7 +1120,8 @@ static uint32_t (*macreg_readops[])(E1000State *, int) = {
>      getreg(TORL),    getreg(TOTL),    getreg(IMS),    getreg(TCTL),
>      getreg(RDH),    getreg(RDT),    getreg(VET),    getreg(ICS),
>      getreg(TDBAL),    getreg(TDBAH),    getreg(RDBAH),    getreg(RDBAL),
> -    getreg(TDLEN),    getreg(RDLEN),
> +    getreg(TDLEN),      getreg(RDLEN),  getreg(RDTR),   getreg(RADV),
> +    getreg(TADV),       getreg(ITR),
> 
>      [TOTH] = mac_read_clr8,    [TORH] = mac_read_clr8,    [GPRC] =
> mac_read_clr4,
>      [GPTC] = mac_read_clr4,    [TPR] = mac_read_clr4,    [TPT] = mac_read_clr4,
> @@ -1069,6 +1143,8 @@ static void (*macreg_writeops[])(E1000State *,
> int, uint32_t) = {
>      [TDH] = set_16bit,    [RDH] = set_16bit,    [RDT] = set_rdt,
>      [IMC] = set_imc,    [IMS] = set_ims,    [ICR] = set_icr,
>      [EECD] = set_eecd,    [RCTL] = set_rx_control, [CTRL] = set_ctrl,
> +    [RDTR] = set_16bit, [RADV] = set_16bit,     [TADV] = set_16bit,
> +    [ITR] = set_16bit,
>      [RA ... RA+31] = &mac_writereg,
>      [MTA ... MTA+127] = &mac_writereg,
>      [VFTA ... VFTA+127] = &mac_writereg,
> @@ -1171,6 +1247,11 @@ static int e1000_post_load(void *opaque, int version_id)
>      E1000State *s = opaque;
>      NetClientState *nc = qemu_get_queue(s->nic);
> 
> +    /* If the mitigation timer was active, emulate a timeout now. */
> +    if (s->mit_timer_on) {
> +        e1000_mit_timer(s);
> +    }
> +
>      /* nc.link_down can't be migrated, so infer link_down according
>       * to link status bit in mac_reg[STATUS].
>       * Alternatively, restart link negotiation if it was in progress. */
> @@ -1263,9 +1344,17 @@ static const VMStateDescription vmstate_e1000 = {
>          VMSTATE_UINT32(mac_reg[TXDCTL], E1000State),
>          VMSTATE_UINT32(mac_reg[WUFC], E1000State),
>          VMSTATE_UINT32(mac_reg[VET], E1000State),
> +        VMSTATE_UINT32(mac_reg[RDTR], E1000State),
> +        VMSTATE_UINT32(mac_reg[RADV], E1000State),
> +        VMSTATE_UINT32(mac_reg[TADV], E1000State),
> +        VMSTATE_UINT32(mac_reg[ITR], E1000State),
>          VMSTATE_UINT32_SUB_ARRAY(mac_reg, E1000State, RA, 32),
>          VMSTATE_UINT32_SUB_ARRAY(mac_reg, E1000State, MTA, 128),
>          VMSTATE_UINT32_SUB_ARRAY(mac_reg, E1000State, VFTA, 128),
> +        VMSTATE_BOOL(mit_timer_on, E1000State),
> +        VMSTATE_BOOL(mit_irq_level, E1000State),
> +        VMSTATE_BOOL(mit_on, E1000State),
> +        VMSTATE_UINT32(mit_ide, E1000State),
>          VMSTATE_END_OF_LIST()
>      }
>  };

This will break cross-version migration. These new fields need to be
conditional on the new flag actually being enabled.

> @@ -1316,6 +1405,8 @@ pci_e1000_uninit(PCIDevice *dev)
> 
>      qemu_del_timer(d->autoneg_timer);
>      qemu_free_timer(d->autoneg_timer);
> +    qemu_del_timer(d->mit_timer);
> +    qemu_free_timer(d->mit_timer);
>      memory_region_destroy(&d->mmio);
>      memory_region_destroy(&d->io);
>      qemu_del_nic(d->nic);
> @@ -1371,6 +1462,7 @@ static int pci_e1000_init(PCIDevice *pci_dev)
>      add_boot_device_path(d->conf.bootindex, dev, "/ethernet-phy@0");
> 
>      d->autoneg_timer = qemu_new_timer_ms(vm_clock, e1000_autoneg_timer, d);
> +    d->mit_timer = qemu_new_timer_ns(vm_clock, e1000_mit_timer, d);
> 
>      return 0;
>  }
> @@ -1385,6 +1477,7 @@ static Property e1000_properties[] = {
>      DEFINE_NIC_PROPERTIES(E1000State, conf),
>      DEFINE_PROP_BIT("autonegotiation", E1000State,
>                      compat_flags, E1000_FLAG_AUTONEG_BIT, true),
> +    DEFINE_PROP_BOOL("mit", E1000State, mit_on, true),

How about a bit in compat_flags? That's why we invented this field.

>      DEFINE_PROP_END_OF_LIST(),
>  };
> 
> -- 
> 1.8.3.3

Patch

diff --git a/hw/net/e1000.c b/hw/net/e1000.c
index b952d8d..9c573ab 100644
--- a/hw/net/e1000.c
+++ b/hw/net/e1000.c
@@ -135,6 +135,12 @@  typedef struct E1000State_st {

     QEMUTimer *autoneg_timer;

+    QEMUTimer *mit_timer;      /* Mitigation timer. */
+    bool mit_timer_on;         /* Mitigation timer is running. */
+    bool mit_irq_level;        /* Tracks interrupt pin level. */
+    bool mit_on;               /* Mitigation enabled. */
+    uint32_t mit_ide;          /* Tracks E1000_TXD_CMD_IDE bit. */
+
 /* Compatibility flags for migration to/from qemu 1.3.0 and older */
 #define E1000_FLAG_AUTONEG_BIT 0
 #define E1000_FLAG_AUTONEG (1 << E1000_FLAG_AUTONEG_BIT)
@@ -158,7 +164,8 @@  enum {
     defreg(TORH),    defreg(TORL),    defreg(TOTH),    defreg(TOTL),
     defreg(TPR),    defreg(TPT),    defreg(TXDCTL),    defreg(WUFC),
     defreg(RA),        defreg(MTA),    defreg(CRCERRS),defreg(VFTA),
-    defreg(VET),
+    defreg(VET),        defreg(RDTR),   defreg(RADV),   defreg(TADV),
+    defreg(ITR),
 };

 static void
@@ -245,10 +252,21 @@  static const uint32_t mac_reg_init[] = {
                 E1000_MANC_RMCP_EN,
 };

+/* Helper function, *curr == 0 means the value is not set */
+static inline void
+mit_update_delay(uint32_t *curr, uint32_t value)
+{
+    if (value && (*curr == 0 || value < *curr)) {
+        *curr = value;
+    }
+}
+
 static void
 set_interrupt_cause(E1000State *s, int index, uint32_t val)
 {
     PCIDevice *d = PCI_DEVICE(s);
+    uint32_t pending_ints;
+    uint32_t mit_delay;

     if (val && (E1000_DEVID >= E1000_DEV_ID_82547EI_MOBILE)) {
         /* Only for 8257x */
@@ -266,7 +284,57 @@  set_interrupt_cause(E1000State *s, int index, uint32_t val)
      */
     s->mac_reg[ICS] = val;

-    qemu_set_irq(d->irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0);
+    pending_ints = (s->mac_reg[IMS] & s->mac_reg[ICR]);
+    if (!s->mit_irq_level && pending_ints) {
+        /*
+         * Here we detect a potential rising edge. We postpone raising the
+         * interrupt line if we are inside the mitigation delay window
+         * (s->mit_timer_on == 1).
+         * We provide a partial implementation of interrupt mitigation,
+         * emulating only RADV, TADV and ITR (lower 16 bits, 1024ns units for
+         * RADV and TADV, 256ns units for ITR). RDTR is only used to enable
+         * RADV; relative timers based on TIDV and RDTR are not implemented.
+         */
+        if (s->mit_timer_on) {
+            return;
+        }
+        if (s->mit_on) {
+            /* Compute the next mitigation delay according to pending
+             * interrupts and the current values of RADV (provided
+             * RDTR!=0), TADV and ITR.
+             * Then rearm the timer.
+             */
+            mit_delay = 0;
+            if (s->mit_ide &&
+                    (pending_ints & (E1000_ICR_TXQE | E1000_ICR_TXDW))) {
+                mit_update_delay(&mit_delay, s->mac_reg[TADV] * 4);
+            }
+            if (s->mac_reg[RDTR] && (pending_ints & E1000_ICS_RXT0)) {
+                mit_update_delay(&mit_delay, s->mac_reg[RADV] * 4);
+            }
+            mit_update_delay(&mit_delay, s->mac_reg[ITR]);
+
+            if (mit_delay) {
+                s->mit_timer_on = 1;
+                qemu_mod_timer(s->mit_timer,
+                        qemu_get_clock_ns(vm_clock) + mit_delay * 256);
+            }
+            s->mit_ide = 0;
+        }
+    }
+
+    s->mit_irq_level = (pending_ints != 0);
+    qemu_set_irq(d->irq[0], s->mit_irq_level);
+}
+
+static void
+e1000_mit_timer(void *opaque)
+{
+    E1000State *s = opaque;
+
+    s->mit_timer_on = 0;
+    /* Call set_interrupt_cause to update the irq level (if necessary). */
+    set_interrupt_cause(s, 0, s->mac_reg[ICR]);
 }

 static void
@@ -307,6 +375,10 @@  static void e1000_reset(void *opaque)
     int i;

     qemu_del_timer(d->autoneg_timer);
+    qemu_del_timer(d->mit_timer);
+    d->mit_timer_on = 0;
+    d->mit_irq_level = 0;
+    d->mit_ide = 0;
     memset(d->phy_reg, 0, sizeof d->phy_reg);
     memmove(d->phy_reg, phy_reg_init, sizeof phy_reg_init);
     memset(d->mac_reg, 0, sizeof d->mac_reg);
@@ -572,6 +644,7 @@  process_tx_desc(E1000State *s, struct e1000_tx_desc *dp)
     struct e1000_context_desc *xp = (struct e1000_context_desc *)dp;
     struct e1000_tx *tp = &s->tx;

+    s->mit_ide |= (txd_lower & E1000_TXD_CMD_IDE);
     if (dtype == E1000_TXD_CMD_DEXT) {    // context descriptor
         op = le32_to_cpu(xp->cmd_and_length);
         tp->ipcss = xp->lower_setup.ip_fields.ipcss;
@@ -1047,7 +1120,8 @@  static uint32_t (*macreg_readops[])(E1000State *, int) = {
     getreg(TORL),    getreg(TOTL),    getreg(IMS),    getreg(TCTL),
     getreg(RDH),    getreg(RDT),    getreg(VET),    getreg(ICS),
     getreg(TDBAL),    getreg(TDBAH),    getreg(RDBAH),    getreg(RDBAL),
-    getreg(TDLEN),    getreg(RDLEN),
+    getreg(TDLEN),      getreg(RDLEN),  getreg(RDTR),   getreg(RADV),
+    getreg(TADV),       getreg(ITR),

     [TOTH] = mac_read_clr8,    [TORH] = mac_read_clr8,    [GPRC] = mac_read_clr4,