diff mbox

[v2,2/9] pcie: helper functions for pcie extended capability.

Message ID 20d8ef39308fd49bf95d83485402fc20499674d3.1283931134.git.yamahata@valinux.co.jp
State New
Headers show

Commit Message

Isaku Yamahata Sept. 8, 2010, 7:39 a.m. UTC
This patch implements helper functions for pci express extended capability.
NOTE: presence detection depends on pci_qdev_init() change.
      PCIExpressDevice::aer_log_max is in PCIDevice for device property.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
---
 Makefile.objs |    1 +
 hw/pci.h      |   24 +
 hw/pcie.c     | 1668 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 hw/pcie.h     |  186 +++++++
 qemu-common.h |    1 +
 5 files changed, 1880 insertions(+), 0 deletions(-)
 create mode 100644 hw/pcie.c
 create mode 100644 hw/pcie.h

Comments

Michael S. Tsirkin Sept. 8, 2010, 10:31 a.m. UTC | #1
On Wed, Sep 08, 2010 at 04:39:35PM +0900, Isaku Yamahata wrote:
> This patch implements helper functions for pci express extended capability.
> NOTE: presence detection depends on pci_qdev_init() change.
>       PCIExpressDevice::aer_log_max is in PCIDevice for device property.
> 
> Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>

Thanks!

Started looking into this. I didn't finish before I ran out of time,
but since I'll be offline for several days, here's what I have so far.



> ---
>  Makefile.objs |    1 +
>  hw/pci.h      |   24 +
>  hw/pcie.c     | 1668 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  hw/pcie.h     |  186 +++++++
>  qemu-common.h |    1 +
>  5 files changed, 1880 insertions(+), 0 deletions(-)
>  create mode 100644 hw/pcie.c
>  create mode 100644 hw/pcie.h
> 
> diff --git a/Makefile.objs b/Makefile.objs
> index 5f5a4c5..eeb5134 100644
> --- a/Makefile.objs
> +++ b/Makefile.objs
> @@ -186,6 +186,7 @@ hw-obj-$(CONFIG_PIIX4) += piix4.o
>  # PCI watchdog devices
>  hw-obj-y += wdt_i6300esb.o
>  
> +hw-obj-y += pcie.o
>  hw-obj-y += msix.o msi.o
>  
>  # PCI network cards
> diff --git a/hw/pci.h b/hw/pci.h
> index 296c7ba..bccab3a 100644
> --- a/hw/pci.h
> +++ b/hw/pci.h
> @@ -9,6 +9,8 @@
>  /* PCI includes legacy ISA access.  */
>  #include "isa.h"
>  
> +#include "pcie.h"
> +
>  /* PCI bus */
>  
>  #define PCI_DEVFN(slot, func)   ((((slot) & 0x1f) << 3) | ((func) & 0x07))
> @@ -172,6 +174,12 @@ struct PCIDevice {
>      /* Offset of MSI capability in config space */
>      uint8_t msi_cap;
>  
> +    /* PCI Express */
> +    PCIExpressDevice *exp;
> +    /* Theoretically this belongs to  PCIExpressDevice.
> +       However it is here for property and save/load */
> +    struct pcie_aer_log aer_log;
> +

Hmm. Let's embed PCIExpressDevice here instead of a pointer to it,
and move aer_log into it? As a bonus, pcie_ APIs could get the express device
(if you like; I'm not sure it's a good idea, but it's possible).

>      /* Location of option rom */
>      char *romfile;
>      ram_addr_t rom_offset;
> @@ -367,6 +375,22 @@ static inline uint32_t pci_config_size(const PCIDevice *d)
>      return pci_is_express(d) ? PCIE_CONFIG_SPACE_SIZE : PCI_CONFIG_SPACE_SIZE;
>  }
>  
> +
> +/* These are pci express specific, so should belong to pcie.h.
> +   they're here to avoid header inclusion error. */

What is the 'header inclusion error' we are trying to avoid?

> +static inline uint8_t pci_pcie_cap(const PCIDevice *d)
> +{
> +    return d->exp ? d->exp->exp_cap : 0;
> +}
> +
> +/* AER */
> +static inline uint16_t pcie_aer_cap(const PCIDevice *d)
> +{
> +    assert(d->exp);
> +    return d->exp->aer_cap;
> +}
> +
> +
>  /* These are not pci specific. Should move into a separate header.
>   * Only pci.c uses them, so keep them here for now.
>   */
> diff --git a/hw/pcie.c b/hw/pcie.c
> new file mode 100644
> index 0000000..1f24c2a
> --- /dev/null
> +++ b/hw/pcie.c
> @@ -0,0 +1,1668 @@
> +/*
> + * pcie.c
> + *
> + * Copyright (c) 2010 Isaku Yamahata <yamahata at valinux co jp>
> + *                    VA Linux Systems Japan K.K.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "sysemu.h"
> +#include "pci_bridge.h"
> +#include "pcie.h"
> +#include "msix.h"
> +#include "msi.h"
> +#include "pci_internals.h"
> +
> +//#define DEBUG_PCIE
> +#ifdef DEBUG_PCIE
> +# define PCIE_DPRINTF(fmt, ...)                                         \
> +    fprintf(stderr, "%s:%d " fmt, __func__, __LINE__, ## __VA_ARGS__)
> +#else
> +# define PCIE_DPRINTF(fmt, ...) do {} while (0)
> +#endif
> +#define PCIE_DEV_PRINTF(dev, fmt, ...)                                  \
> +    PCIE_DPRINTF("%s:%x "fmt, (dev)->name, (dev)->devfn, ## __VA_ARGS__)
> +
> +static inline const char *pcie_hp_event_name(enum PCIExpressHotPlugEvent event)
> +{
> +    switch (event) {
> +    case PCI_EXP_HP_EV_ABP:
> +        return "attention button pushed";
> +    case PCI_EXP_HP_EV_PDC:
> +        return "present detection changed";
> +    case PCI_EXP_HP_EV_CCI:
> +        return "command completed";
> +    default:
> +        break;
> +    }
> +    return "Unknown event";
> +}
> +
> +static void pcie_aer_clear_error(PCIDevice *dev);
> +static void pcie_aer_root_notify(PCIDevice *dev, bool trigger, int level);
> +static AER_ERR_MSG_RESULT
> +pcie_aer_errmsg_alldev(PCIDevice *dev, const struct pcie_aer_err_msg *msg);
> +static AER_ERR_MSG_RESULT
> +pcie_aer_errmsg_vbridge(PCIDevice *dev, const struct pcie_aer_err_msg *msg);
> +
> +/***************************************************************************
> + * pci express capability helper functions
> + */
> +
> +#define PCI_EXP_VER2_SIZEOF     0x3c    /* express capability of version 2 */
> +
> +/* PCI_EXP_FLAGS */
> +#define PCI_EXP_FLAGS_VER2      2       /* for now, supports only version 2 */
> +#define PCI_EXP_FLAGS_IRQ_SHIFT 9

Please define this as (ffs(PCI_EXP_FLAGS_IRQ) - 1)
or open-code.

> +#define PCI_EXP_FLAGS_IRQ_REG(irq)      (((irq) << PCI_EXP_FLAGS_IRQ_SHIFT) & PCI_EXP_FLAGS_IRQ)

line too long. Do we need the & PCI_EXP_FLAGS_IRQ?
I think it would be cleaner to open-code this.

> +#define PCI_EXP_FLAGS_TYPE_SHIFT        4


Please define this as (ffs(PCI_EXP_FLAGS_TYPE) - 1)
or open-code.

> +
> +/* PCI_EXP_LINK{CAP, STA} */
> +/* link speed */
> +#define PCI_EXP_LNK_LS_25               1
> +
> +#define PCI_EXP_LNK_MLW_SHIFT           4
> +#define PCI_EXP_LNK_MLW_1               (1 << PCI_EXP_LNK_MLW_SHIFT)
> +
> +/* PCI_EXP_LINKCAP */
> +#define PCI_EXP_LNKCAP_ASPMS_SHIFT      10
> +#define PCI_EXP_LNKCAP_ASPMS_0S         (1 << PCI_EXP_LNKCAP_ASPMS_SHIFT)
> +
> +#define PCI_EXP_LNKCAP_PN_SHIFT         24
> +#define PCI_EXP_LNKCAP_PN_REG(pn)       (((pn) << PCI_EXP_LNKCAP_PN_SHIFT) & PCI_EXP_LNKCAP_PN)
> +
> +#define PCI_EXP_SLTCAP_PSN_SHIFT        19
> +#define PCI_EXP_SLTCAP_PSN_REG(slot)    (((slot) << PCI_EXP_SLTCAP_PSN_SHIFT) & PCI_EXP_SLTCAP_PSN)
> +
> +#define PCI_EXP_SLTCTL_AIC_SHIFT        6
> +#define PCI_EXP_SLTCTL_AIC_ON           (PCI_EXP_HP_IND_ON << PCI_EXP_SLTCTL_AIC_SHIFT)
> +#define PCI_EXP_SLTCTL_AIC_BLINK        (PCI_EXP_HP_IND_BLINK << PCI_EXP_SLTCTL_AIC_SHIFT)
> +#define PCI_EXP_SLTCTL_AIC_OFF          (PCI_EXP_HP_IND_OFF << PCI_EXP_SLTCTL_AIC_SHIFT)
> +
> +#define PCI_EXP_SLTCTL_PIC_SHIFT        8
> +#define PCI_EXP_SLTCTL_PIC_ON           (PCI_EXP_HP_IND_ON << PCI_EXP_SLTCTL_PIC_SHIFT)
> +#define PCI_EXP_SLTCTL_PIC_BLINK        (PCI_EXP_HP_IND_BLINK << PCI_EXP_SLTCTL_PIC_SHIFT)
> +#define PCI_EXP_SLTCTL_PIC_OFF          (PCI_EXP_HP_IND_OFF << PCI_EXP_SLTCTL_PIC_SHIFT)
> +
> +#define PCI_EXP_DEVCAP2_EFF             0x100000
> +#define PCI_EXP_DEVCAP2_EETLPP          0x200000
> +
> +#define PCI_EXP_DEVCTL2_EETLPPB         0x80

Many constants here to audit.
Let's define _SHIFT macros as ffs, _REG macros
don't really need the & masking, and when we
remove it it becomes small enough to open-code.
Also some macros seem unused?

> +
> +static void pcie_notify(PCIDevice *dev, uint16_t vector,
> +                        bool trigger, int level)
> +{
> +    /* masking/masking interrupt is handled by upper layer.
> +     * i.e. msix_notify() for MSI-X
> +     *      msi_notify()  for MSI
> +     *      pci_set_irq() for INTx
> +     */

So this will send another interrupt when level is 0?

> +    PCIE_DEV_PRINTF(dev, "noitfy vector %d tirgger:%d level:%d\n",
> +                    vector, trigger, level);
> +    if (msix_enabled(dev)) {
> +        if (trigger) {
> +            msix_notify(dev, vector);
> +        }
> +    } else if (msi_enabled(dev)) {
> +        if (trigger){
> +            msi_notify(dev, vector);
> +        }
> +    } else  {

two spaces before {

> +        qemu_set_irq(dev->irq[0], level);
> +    }
> +}
> +
> +static inline uint32_t pcie_written_val_long(uint32_t addr, uint32_t val,
> +                                             uint32_t pos)
> +{
> +    if (addr >= pos) {
> +        val <<= addr - pos;
> +    } else {
> +        val >>= pos - addr;
> +    }
> +    return val;
> +}
> +

Note that the above is undefined if |pos - addr| >= 32 (a shift count
that is not smaller than the width of uint32_t).

> +static inline uint16_t pcie_written_val_word(uint32_t addr, uint32_t val,
> +                                             uint32_t pos)
> +{
> +    return pcie_written_val_long(addr, val, pos) & 0xffff;
> +}
> +

& 0xffff is not needed here.

The two functions above are not clearly named.
They have nothing to do with writing: they just extract a
long/word from an address/value pair.

> +/*
> + * RW1C: Write-1-to-clear
> + * regiger      written val        result
> + * 0            0               => 0
> + * 1            0               => 1
> + * 0            1               => 0
> + * 1            1               => 0
> + */
> +static inline void pcie_w1c_long(PCIDevice *d, uint32_t pos, uint32_t mask,
> +                                 uint32_t addr, uint32_t val)
> +{
> +    uint32_t written = pcie_written_val_long(addr, val, pos) & mask;
> +    uint32_t reg = pci_get_long(d->config + pos);
> +    reg &= ~written;
> +    pci_set_long(d->config + pos, reg);
> +}
> +
> +static inline void pcie_w1c_word(PCIDevice *d, uint32_t pos, uint16_t mask,
> +                                 uint32_t addr, uint32_t val)
> +{
> +    uint16_t written = pcie_written_val_word(addr, val, pos) & mask;
> +    uint16_t reg = pci_get_word(d->config + pos);
> +    reg &= ~written;
> +    pci_set_word(d->config + pos, reg);
> +}
> +

So the SERR bit support IMO belongs in pci. And this means the W1C
inline functions need to move there.

pci.c implemented this in a simpler way, by shifting
val by 8 bits (one byte) each time. Can we find a way to do it
in a similar way? I'll try to think about it.


> +int pci_pcie_cap_init(PCIDevice *dev,
> +                      uint8_t offset, uint8_t type, uint8_t port)

I think we should have a single external
pcie_init
that would init everything, and this one
should be static.

> +{
> +    int exp_cap;
> +    uint8_t *pcie_cap;
> +
> +    assert(pci_is_express(dev));
> +    dev->exp = qemu_mallocz(sizeof(*dev->exp));
> +
> +    exp_cap = pci_add_capability(dev, PCI_CAP_ID_EXP, offset,
> +                                 PCI_EXP_VER2_SIZEOF);
> +    if (exp_cap < 0) {
> +        qemu_free(dev->exp);
> +        dev->exp = NULL;
> +        return exp_cap;
> +    }
> +    dev->exp->exp_cap = exp_cap;
> +    /* dev->cap_present |= QEMU_PCI_CAP_EXPRESS; */ /* already done in pci_qdev_init() */
> +
> +    pcie_cap = dev->config + pci_pcie_cap(dev);
> +
> +    /* capability register
> +       interrupt message number defaults to 0 */
> +    pci_set_word(pcie_cap + PCI_EXP_FLAGS,
> +                 ((type << PCI_EXP_FLAGS_TYPE_SHIFT) & PCI_EXP_FLAGS_TYPE) |
> +                 PCI_EXP_FLAGS_VER2);
> +
> +    /* device capability register
> +     * table 7-12:
> +     * roll based error reporting bit must be set by all
> +     * Functions conforming to the ECN, PCI Express Base
> +     * Specification, Revision 1.1., or subsequent PCI Express Base
> +     * Specification revisions.
> +     */
> +    pci_set_long(pcie_cap + PCI_EXP_DEVCAP, PCI_EXP_DEVCAP_RBER);
> +
> +    pci_set_long(pcie_cap + PCI_EXP_LNKCAP,
> +                 PCI_EXP_LNKCAP_PN_REG(port) |
> +                 PCI_EXP_LNKCAP_ASPMS_0S |
> +                 PCI_EXP_LNK_MLW_1 |
> +                 PCI_EXP_LNK_LS_25);
> +
> +    pci_set_word(pcie_cap + PCI_EXP_LNKSTA,
> +                 PCI_EXP_LNK_MLW_1 | PCI_EXP_LNK_LS_25);
> +
> +    pci_set_long(pcie_cap + PCI_EXP_DEVCAP2,
> +                 PCI_EXP_DEVCAP2_EFF | PCI_EXP_DEVCAP2_EETLPP);
> +
> +    pci_set_word(dev->wmask + exp_cap, PCI_EXP_DEVCTL2_EETLPPB);
> +    return exp_cap;
> +}
> +
> +int pci_pcie_cap_exit(PCIDevice *dev)
> +{
> +    /* pci_del_capability(dev, PCI_CAP_ID_EXP, PCI_EXP_VER2_SIZEOF); */
> +    qemu_free(dev->exp);
> +    return 0;
> +}
> +
> +uint8_t pcie_cap_get_type(const PCIDevice *dev)
> +{
> +    uint32_t pos = pci_pcie_cap(dev);
> +    assert(pos > 0);
> +    return (pci_get_word(dev->config + pos + PCI_EXP_FLAGS) &
> +            PCI_EXP_FLAGS_TYPE) >> PCI_EXP_FLAGS_TYPE_SHIFT;
> +}
> +
> +/* MSI/MSI-X */
> +/* pci express interrupt message number */
> +void pcie_cap_flags_set_vector(PCIDevice *dev, uint8_t vector)
> +{
> +    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
> +    uint16_t tmp;
> +
> +    assert(vector <= 32);
> +    tmp = pci_get_word(pcie_cap + PCI_EXP_FLAGS);
> +    tmp &= ~PCI_EXP_FLAGS_IRQ;
> +    tmp |= PCI_EXP_FLAGS_IRQ_REG(vector);
> +    pci_set_word(pcie_cap + PCI_EXP_FLAGS, tmp);
> +}
> +
> +uint8_t pcie_cap_flags_get_vector(PCIDevice *dev)
> +{
> +    return (pci_get_word(dev->config + pci_pcie_cap(dev) + PCI_EXP_FLAGS) &
> +            PCI_EXP_FLAGS_IRQ) >> PCI_EXP_FLAGS_IRQ_SHIFT;
> +}
> +
> +static void pcie_cap_notify(PCIDevice *dev, bool trigger, int level)
> +{
> +    pcie_notify(dev, pcie_cap_flags_get_vector(dev), trigger, level);
> +}
> +

better opencoded.

> +void pcie_cap_deverr_init(PCIDevice *dev)
> +{
> +    uint32_t pos = pci_pcie_cap(dev);
> +    uint8_t *pcie_cap = dev->config + pos;
> +    uint8_t *pcie_wmask = dev->wmask + pos;
> +
> +    pci_set_long(pcie_cap + PCI_EXP_DEVCAP,
> +                 pci_get_long(pcie_cap + PCI_EXP_DEVCAP) |
> +                 PCI_EXP_DEVCAP_RBER);
> +
> +    pci_set_long(pcie_wmask + PCI_EXP_DEVCTL,
> +                 pci_get_long(pcie_wmask + PCI_EXP_DEVCTL) |
> +                 PCI_EXP_DEVCTL_CERE | PCI_EXP_DEVCTL_NFERE |
> +                 PCI_EXP_DEVCTL_FERE | PCI_EXP_DEVCTL_URRE);
> +}
> +
> +void pcie_cap_deverr_reset(PCIDevice *dev)
> +{
> +    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
> +    pci_set_long(pcie_cap + PCI_EXP_DEVCTL,
> +                 pci_get_long(pcie_cap + PCI_EXP_DEVCTL) &
> +                 ~(PCI_EXP_DEVCTL_CERE | PCI_EXP_DEVCTL_NFERE |
> +                   PCI_EXP_DEVCTL_FERE | PCI_EXP_DEVCTL_URRE));
> +}
> +
> +void pcie_cap_deverr_write_config(PCIDevice *dev,
> +                                  uint32_t addr, uint32_t val, int len)
> +{
> +    uint32_t pos = pci_pcie_cap(dev);
> +    if (ranges_overlap(addr, len, pos + PCI_EXP_DEVSTA, 4)) {
> +        /* RW1C */
> +        pcie_w1c_long(dev, pos + PCI_EXP_DEVSTA,
> +                      PCI_EXP_DEVSTA_CED | PCI_EXP_DEVSTA_NFED |
> +                      PCI_EXP_DEVSTA_URD | PCI_EXP_DEVSTA_URD,
> +                      addr, val);
> +    }
> +}
> +
> +/*
> + * events: PCI_EXP_HP_EV_xxx
> + * status: bit or of PCI_EXP_SLTSTA_xxx
> + */
> +static void pcie_cap_slot_event(PCIDevice *dev,
> +                                enum PCIExpressHotPlugEvent events,
> +                                uint16_t status)
> +{
> +    bool trigger = false;
> +    int level = 0;
> +    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
> +    uint16_t sltctl = pci_get_word(pcie_cap + PCI_EXP_SLTCTL);
> +    uint16_t sltsta = pci_get_word(pcie_cap + PCI_EXP_SLTSTA);
> +
> +    PCIE_DEV_PRINTF(dev,
> +                    "sltctl: 0x%0x2 sltsta: 0x%02x event:%x %s status:%d\n",
> +                    sltctl, sltsta,
> +                    events, pcie_hp_event_name(events), status);
> +    events &= PCI_EXP_HP_EV_SUPPORTED;
> +    if ((sltctl & PCI_EXP_SLTCTL_HPIE) && (sltctl & events) &&
> +        ((sltsta ^ events) & events) /* 0 -> 1 */) {
> +        trigger = true;
> +    }

else trigger = false and do not initialize

> +
> +    if (events & PCI_EXP_HP_EV_PDC) {
> +        sltsta &= ~PCI_EXP_SLTSTA_PDS;
> +        sltsta |= (status & PCI_EXP_SLTSTA_PDS);
> +    }
> +    sltsta |= events;
> +    pci_set_word(pcie_cap + PCI_EXP_SLTSTA, sltsta);
> +    PCIE_DEV_PRINTF(dev, "sltsta -> %02xn", sltsta);
> +
> +    if ((sltctl & PCI_EXP_SLTCTL_HPIE) && (sltsta & PCI_EXP_HP_EV_SUPPORTED)) {
> +        level = 1;
> +    }

else level = 0
and don't initialize would be clearer.

> +
> +    pcie_cap_notify(dev, trigger, level);
> +}
> +
> +static int pcie_cap_slot_hotplug(DeviceState *qdev,
> +                                 PCIDevice *pci_dev, int state)
> +{
> +    PCIDevice *d = DO_UPCAST(PCIDevice, qdev, qdev);
> +    uint8_t *pcie_cap = d->config + pci_pcie_cap(d);
> +    uint16_t sltsta = pci_get_word(pcie_cap + PCI_EXP_SLTSTA);
> +
> +    if (!pci_dev->qdev.hotplugged) {
> +        assert(state); /* this case only happens machine creation. */
> +        sltsta |= PCI_EXP_SLTSTA_PDS;
> +        pci_set_word(pcie_cap + PCI_EXP_SLTSTA, sltsta);
> +        return 0;
> +    }
> +
> +    PCIE_DEV_PRINTF(pci_dev, "hotplug state: %d\n", state);
> +    if (sltsta & PCI_EXP_SLTSTA_EIS) {
> +        /* the slot is electromechanically locked. */
> +        return -EBUSY;
> +    }
> +
> +    if (state) {
> +        if (PCI_FUNC(pci_dev->devfn) == 0) {
> +            /* event is per slot. Not per function
> +             * only generates event for function = 0.
> +             * When hot plug, populate functions > 0
> +             * and then add function = 0 last.
> +             */
> +            pcie_cap_slot_event(d, PCI_EXP_HP_EV_PDC, PCI_EXP_SLTSTA_PDS);
> +        }
> +    } else {
> +        PCIBridge *br;
> +        PCIBus *bus;
> +        DeviceState *next;
> +        if (PCI_FUNC(pci_dev->devfn) != 0) {
> +            /* event is per slot. Not per function.
> +               accepts function = 0 only. */
> +            return -EINVAL;
> +        }
> +
> +        /* zap all functions. */
> +        br = DO_UPCAST(PCIBridge, dev, d);
> +        bus = pci_bridge_get_sec_bus(br);
> +        QLIST_FOREACH_SAFE(qdev, &bus->qbus.children, sibling, next) {
> +            qdev_free(qdev);
> +        }
> +
> +        pcie_cap_slot_event(d, PCI_EXP_HP_EV_PDC, 0);
> +    }
> +    return 0;
> +}
> +
> +/* pci express slot for pci express root/downstream port
> +   PCI express capability slot registers */
> +void pcie_cap_slot_init(PCIDevice *dev, uint16_t slot)
> +{
> +    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
> +    uint8_t *pcie_wmask = dev->wmask + pci_pcie_cap(dev);
> +    uint32_t tmp;
> +
> +    pci_set_word(pcie_cap + PCI_EXP_FLAGS,
> +                 pci_get_word(pcie_cap + PCI_EXP_FLAGS) | PCI_EXP_FLAGS_SLOT);
> +
> +    tmp = pci_get_long(pcie_cap + PCI_EXP_SLTCAP);
> +    tmp &= PCI_EXP_SLTCAP_PSN;
> +    tmp |=
> +        PCI_EXP_SLTCAP_PSN_REG(slot) |
> +        PCI_EXP_SLTCAP_EIP |
> +        PCI_EXP_SLTCAP_HPS |
> +        PCI_EXP_SLTCAP_HPC |
> +        PCI_EXP_SLTCAP_PIP |
> +        PCI_EXP_SLTCAP_AIP |
> +        PCI_EXP_SLTCAP_ABP;
> +    pci_set_long(pcie_cap + PCI_EXP_SLTCAP, tmp);
> +
> +    tmp = pci_get_word(pcie_cap + PCI_EXP_SLTCTL);
> +    tmp &= ~(PCI_EXP_SLTCTL_PIC | PCI_EXP_SLTCTL_AIC);
> +    tmp |= PCI_EXP_SLTCTL_PIC_OFF | PCI_EXP_SLTCTL_AIC_OFF;
> +    pci_set_word(pcie_cap + PCI_EXP_SLTCTL, tmp);
> +    pci_set_word(pcie_wmask + PCI_EXP_SLTCTL,
> +                 pci_get_word(pcie_wmask + PCI_EXP_SLTCTL) |
> +                 PCI_EXP_SLTCTL_PIC |
> +                 PCI_EXP_SLTCTL_AIC |
> +                 PCI_EXP_SLTCTL_HPIE |
> +                 PCI_EXP_SLTCTL_CCIE |
> +                 PCI_EXP_SLTCTL_PDCE |
> +                 PCI_EXP_SLTCTL_ABPE);
> +
> +    pci_bus_hotplug(pci_bridge_get_sec_bus(DO_UPCAST(PCIBridge, dev, dev)),
> +                    pcie_cap_slot_hotplug, &dev->qdev);
> +}
> +
> +void pcie_cap_slot_reset(PCIDevice *dev)
> +{
> +    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
> +    uint32_t tmp;
> +
> +    PCIE_DEV_PRINTF(dev, "reset\n");
> +
> +    tmp = pci_get_word(pcie_cap + PCI_EXP_SLTCTL);
> +    tmp &= ~(PCI_EXP_SLTCTL_EIC |
> +             PCI_EXP_SLTCTL_PIC |
> +             PCI_EXP_SLTCTL_AIC |
> +             PCI_EXP_SLTCTL_HPIE |
> +             PCI_EXP_SLTCTL_CCIE |
> +             PCI_EXP_SLTCTL_PDCE |
> +             PCI_EXP_SLTCTL_ABPE);
> +    tmp |= PCI_EXP_SLTCTL_PIC_OFF | PCI_EXP_SLTCTL_AIC_OFF;
> +    pci_set_word(pcie_cap + PCI_EXP_SLTCTL, tmp);
> +
> +    tmp = pci_get_word(pcie_cap + PCI_EXP_SLTSTA);
> +    tmp &= ~(PCI_EXP_SLTSTA_EIS | /* by reset, the lock is released */
> +             PCI_EXP_SLTSTA_CC |
> +             PCI_EXP_SLTSTA_PDC |
> +             PCI_EXP_SLTSTA_ABP);
> +    pci_set_word(pcie_cap + PCI_EXP_SLTSTA, tmp);
> +}
> +
> +void pcie_cap_slot_write_config(PCIDevice *dev,
> +                                uint32_t addr, uint32_t val, int len,
> +                                uint16_t sltctl_prev)
> +{
> +    uint32_t pos = pci_pcie_cap(dev);
> +    uint8_t *pcie_cap = dev->config + pos;
> +    uint16_t sltctl = pci_get_word(pcie_cap + PCI_EXP_SLTCTL);
> +    uint16_t sltsta = pci_get_word(pcie_cap + PCI_EXP_SLTSTA);
> +
> +    PCIE_DEV_PRINTF(dev,
> +                    "addr: 0x%x val: 0x%x len: %d\n"
> +                    "\tsltctl_prev: 0x%02x sltctl: 0x%02x sltsta 0x%02x\n",
> +                    addr, val, len, sltctl_prev, sltctl, sltsta);
> +    /* SLTSTA: process SLTSTA before SLTCTL to avoid spurious interrupt */
> +    if (ranges_overlap(addr, len, pos + PCI_EXP_SLTSTA, 2)) {
> +        /* RW1C */
> +        pcie_w1c_word(dev, pos + PCI_EXP_SLTSTA, PCI_EXP_HP_EV_SUPPORTED,
> +                      addr, val);
> +        sltsta = pci_get_word(pcie_cap + PCI_EXP_SLTSTA);
> +
> +        /* write to stlsta results in clearing bits,
> +           so new interrupts won't be generated. */
> +        PCIE_DEV_PRINTF(dev, "sltsta -> 0x%02x\n", sltsta);
> +    }
> +
> +    /* SLTCTL */
> +    if (ranges_overlap(addr, len, pos + PCI_EXP_SLTCTL, 2)) {
> +        PCIE_DEV_PRINTF(dev, "sltctl: 0x%02x -> 0x%02x\n",
> +                        sltctl_prev, sltctl);
> +        if (pcie_written_val_word(addr, val, pos + PCI_EXP_SLTCTL) &
> +            PCI_EXP_SLTCTL_EIC) {
> +            /* toggle PCI_EXP_SLTSTA_EIS */
> +            sltsta = (sltsta & ~PCI_EXP_SLTSTA_EIS) |
> +                ((sltsta ^ PCI_EXP_SLTSTA_EIS) & PCI_EXP_SLTSTA_EIS);
> +            pci_set_word(pcie_cap + PCI_EXP_SLTSTA, sltsta);
> +            PCIE_DEV_PRINTF(dev, "PCI_EXP_SLTCTL_EIC: sltsta -> 0x%02x\n",
> +                            sltsta);
> +        }
> +
> +        if (sltctl & PCI_EXP_SLTCTL_HPIE) {
> +            bool trigger = false;
> +            int level = 0;
> +
> +            if (((sltctl_prev ^ sltctl) & sltctl) & PCI_EXP_HP_EV_SUPPORTED) {
> +                /* 0 -> 1 */
> +                trigger = true;
> +            }

else trigger = false and do not initialize.

> +            if ((sltctl & sltsta) & PCI_EXP_HP_EV_SUPPORTED) {
> +                level = 1;
> +            }

else level = 0 and avoid init would be clearer

> +            pcie_cap_notify(dev, trigger, level);
> +        }
> +
> +        /* command completed.
> +           unlike real hardware, command completes instantaneously */
> +#define PCI_EXP_SLTCTL_SUPPORTED        \
> +            (PCI_EXP_SLTCTL_ABPE |      \
> +             PCI_EXP_SLTCTL_PDCE |      \
> +             PCI_EXP_SLTCTL_CCIE |      \
> +             PCI_EXP_SLTCTL_HPIE |      \
> +             PCI_EXP_SLTCTL_AIC |       \
> +             PCI_EXP_SLTCTL_PCC |       \
> +             PCI_EXP_SLTCTL_EIC)
> +        if ( 1 /* (sltctl_prev ^ sltctl) & PCI_EXP_SLTCTL_SUPPORTED */ ) {

what does the above comment mean?

> +            /* set command completed bit */
> +            pcie_cap_slot_event(dev, PCI_EXP_HP_EV_CCI, 0);
> +        }
> +    }
> +}
> +
> +void pcie_cap_slot_push_attention_button(PCIDevice *dev)
> +{
> +    pcie_cap_slot_event(dev, PCI_EXP_HP_EV_ABP, 0);
> +}
> +
> +/* root control/capabilities/status. PME isn't emulated for now */
> +void pcie_cap_root_init(PCIDevice *dev)
> +{
> +    uint8_t pos = pci_pcie_cap(dev);
> +    pci_set_word(dev->wmask + pos + PCI_EXP_RTCTL,
> +                 PCI_EXP_RTCTL_SECEE | PCI_EXP_RTCTL_SENFEE |
> +                 PCI_EXP_RTCTL_SEFEE);
> +}
> +
> +void pcie_cap_root_reset(PCIDevice *dev)
> +{
> +    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
> +    pci_set_word(pcie_cap + PCI_EXP_RTCTL, 0);
> +}
> +
> +/* function level reset(FLR) */
> +void pcie_cap_flr_init(PCIDevice *dev, pcie_flr_fn flr)
> +{
> +    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
> +    pci_set_word(pcie_cap + PCI_EXP_DEVCAP,
> +                 pci_get_word(pcie_cap + PCI_EXP_DEVCAP) | PCI_EXP_DEVCAP_FLR);
> +    dev->exp->flr = flr;
> +}
> +
> +void pcie_cap_flr_write_config(PCIDevice *dev,
> +                               uint32_t addr, uint32_t val, int len)
> +{
> +    uint32_t pos = pci_pcie_cap(dev);
> +    if (ranges_overlap(addr, len, pos + PCI_EXP_DEVCTL, 2)) {
> +        uint16_t val16 = pcie_written_val_word(addr, val,
> +                                               pos + PCI_EXP_DEVCTL);
> +        if ((val16 & PCI_EXP_DEVCTL_BCR_FLR) && dev->exp->flr) {
> +            dev->exp->flr(dev);
> +        }
> +    }
> +}
> +
> +
> +/* Alternative Routing-ID Interpretation (ARI) */
> +/* ari forwarding support for down stream port */
> +void pcie_cap_ari_init(PCIDevice *dev)
> +{
> +    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
> +    uint8_t *pcie_wmask = dev->wmask + pci_pcie_cap(dev);
> +
> +    pci_set_long(pcie_cap + PCI_EXP_DEVCAP2,
> +                 pci_get_long(pcie_cap + PCI_EXP_DEVCAP2) |
> +                 PCI_EXP_DEVCAP2_ARI);
> +
> +    pci_set_long(pcie_wmask + PCI_EXP_DEVCTL2,
> +                 pci_get_long(pcie_wmask + PCI_EXP_DEVCTL2) |
> +                 PCI_EXP_DEVCTL2_ARI);
> +}
> +
> +void pcie_cap_ari_reset(PCIDevice *dev)
> +{
> +    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
> +
> +    pci_set_long(pcie_cap + PCI_EXP_DEVCTL2,
> +                 pci_get_long(pcie_cap + PCI_EXP_DEVCTL2) &
> +                 ~PCI_EXP_DEVCTL2_ARI);
> +}
> +
> +bool pcie_cap_is_ari_enabled(const PCIDevice *dev)
> +{
> +    if (!pci_is_express(dev)) {
> +        return false;
> +    }
> +    if (!pci_pcie_cap(dev)) {
> +        return false;
> +    }
> +
> +    return pci_get_long(dev->config + pci_pcie_cap(dev) + PCI_EXP_DEVCTL2) &
> +        PCI_EXP_DEVCTL2_ARI;
> +}
> +
> +/**************************************************************************
> + * pci express extended capability allocation functions
> + * uint16_t ext_cap_id (16 bit)
> + * uint8_t cap_ver (4 bit)
> + * uint16_t cap_offset (12 bit)
> + * uint16_t ext_cap_size
> + */
> +
> +#define PCI_EXT_CAP_VER_SHIFT   16
> +#define PCI_EXT_CAP_NEXT_MASK   0xfff00000

Should be 0xffc << PCI_EXT_CAP_NEXT_SHIFT ?
Pls add a TODO to try and add this to pci_regs.h in linux.

> +#define PCI_EXT_CAP_NEXT_SHIFT  20
> +
> +#define PCI_EXT_CAP(id, ver, next) ((id) | ((ver) << PCI_EXT_CAP_VER_SHIFT) | ((next) << PCI_EXT_CAP_NEXT_SHIFT))

line too long. I think this will cause integer overflow
if next is uint16_t and has high bit set.
Better make this an inline function with uint32_t arguments.

> +
> +#define PCI_EXT_CAP_ALIGN       4
> +#define PCI_EXT_CAP_ALIGNUP(x)  (((x) + PCI_EXT_CAP_ALIGN - 1) & ~(PCI_EXT_CAP_ALIGN - 1))

line too long

> +
> +static int16_t pcie_ext_cap_find_space(PCIDevice *dev, uint16_t size)
> +{
> +    uint16_t offset = PCI_CONFIG_SPACE_SIZE;
> +    uint16_t i = offset;
> +
> +    while (i < PCIE_CONFIG_SPACE_SIZE - size) {
> +        if (dev->used[i]) {
> +            offset = PCI_EXT_CAP_ALIGNUP(i + 1);
> +            i = offset;
> +            continue;
> +        } else if (i - offset + 1 == size) {
> +            return offset;
> +        }
> +
> +        ++i;
> +    }
> +
> +    return 0;
> +}
> +

So here, is this allocator worth it?
Migration requires that offsets never change, so
let's precompute them and have the caller pass them in explicitly
(the same way the offset is passed to pci_add_capability()).

> +static uint16_t pcie_find_ext_capability_list(PCIDevice *dev, uint16_t cap_id,
> +                                              uint16_t *prev_p)

Maybe rename to
static uint16_t pcie_find_capability_list(PCIDevice *dev, uint16_t cap_id,

The pcie_ prefix already makes it clear that this handles extended
capabilities, since the pci_ one handles regular caps.

> +{
> +    int ttl;
> +
> +    uint16_t prev = 0;
> +    uint16_t next = PCI_CONFIG_SPACE_SIZE;
> +    uint32_t header = pci_get_long(dev->config + next);
> +
> +    if (!header) {
> +        return 0;
> +    }
> +
> +    /* minimum 8 bytes per capability */
> +    ttl = (PCIE_CONFIG_SPACE_SIZE - PCI_CONFIG_SPACE_SIZE) / 8;
> +
> +    while (ttl-- > 0) {
> +        if (PCI_EXT_CAP_ID(header) == cap_id) {
> +            break;
> +        }
> +
> +        prev = next;
> +        next = PCI_EXT_CAP_NEXT(header);
> +        if (next < PCI_CONFIG_SPACE_SIZE) {
> +            return 0;
> +        }
> +        header = pci_get_long(dev->config + prev);
> +    }
> +
> +    if (!ttl) {
> +        return 0;
> +    }
> +    if (prev_p) {
> +        *prev_p = prev;
> +    }
> +    return next;
> +}

I don't yet understand this code.
This seems to do in 38 lines what the pci.c equivalent did in 18?
What exactly is ttl? Why do we need assumptions on capability size?

The following is a copy from pci.c, with PCI macros replaced by PCI_EXT
ones. Would this work? If not what did I miss?

/*
 * Walk the extended capability list, whose first entry sits at
 * PCI_CONFIG_SPACE_SIZE, looking for cap_id.
 *
 * Returns the offset of the matching capability, or 0 if the end of the
 * list is reached.  If prev_p is non-NULL it receives the offset of the
 * capability visited just before the returned position: the predecessor of
 * the match (0 if the match is the first entry), or the last capability in
 * the list when nothing matched.
 *
 * Unlike the pci.c version this uses 16 bit offsets and IDs, because
 * extended capability offsets start at PCI_CONFIG_SPACE_SIZE and do not fit
 * in a uint8_t, and it examines the first entry itself, since for extended
 * capabilities the list head is a capability header rather than a pointer.
 */
static uint16_t pcie_find_capability_list(PCIDevice *pdev, uint16_t cap_id,
                                          uint16_t *prev_p)
{
    uint16_t prev = 0;
    uint16_t next = PCI_CONFIG_SPACE_SIZE;

    /* a next pointer below PCI_CONFIG_SPACE_SIZE terminates the list */
    while (next >= PCI_CONFIG_SPACE_SIZE) {
        uint32_t header = pci_get_long(pdev->config + next);

        if (PCI_EXT_CAP_ID(header) == cap_id) {
            break;
        }
        prev = next;
        next = PCI_EXT_CAP_NEXT(header);
    }

    if (next < PCI_CONFIG_SPACE_SIZE) {
        next = 0;
    }
    if (prev_p) {
        *prev_p = prev;
    }
    return next;
}

> +uint16_t pcie_find_ext_capability(PCIDevice *dev, uint16_t cap_id)
> +{
> +    return pcie_find_ext_capability_list(dev, cap_id, NULL);
> +}
> +
> +/*
> + * Rewrite the next-capability field of the extended capability header at
> + * @pos so that it holds @next, leaving the other header bits untouched.
> + * @next must be PCI_EXT_CAP_ALIGN aligned.
> + */
> +static void pcie_ext_cap_set_next(PCIDevice *dev, uint16_t pos, uint16_t next)
> +{
> +    /* the header is read and written as a full dword; a 16 bit local would
> +     * truncate it and the write-back would clear the upper half */
> +    uint32_t header = pci_get_long(dev->config + pos);
> +    assert(!(next & (PCI_EXT_CAP_ALIGN - 1)));
> +    /* widen before shifting so the shift is done in unsigned 32 bit */
> +    header = (header & ~PCI_EXT_CAP_NEXT_MASK) |
> +        (((uint32_t)next << PCI_EXT_CAP_NEXT_SHIFT) & PCI_EXT_CAP_NEXT_MASK);
> +    pci_set_long(dev->config + pos, header);
> +}
> +
> +static void pcie_allocate_ext_capability(PCIDevice *dev,
> +                                         uint16_t cap_id, uint8_t cap_ver,
> +                                         uint16_t offset, uint16_t size)
> +{
> +    uint32_t header;
> +    uint16_t next;
> +
> +    assert(offset < offset + size);
> +    assert(offset + size < PCIE_CONFIG_SPACE_SIZE);
> +    assert(size >= 8);
> +
> +    if (offset == PCI_CONFIG_SPACE_SIZE) {

Let's make offset == 0 have special meaning, same as pci please.

> +        header = pci_get_long(dev->config + offset);
> +        next = PCI_EXT_CAP_NEXT(header);
> +    } else {

I think we can reuse the find_list routine (e.g. pass 0 as cap_id)?

> +        /* find last ext cap */
> +        int ttl = (PCIE_CONFIG_SPACE_SIZE - PCI_CONFIG_SPACE_SIZE) / 8;
> +        uint16_t pos = PCI_CONFIG_SPACE_SIZE;
> +        while (ttl-- > 0) {
> +            header = pci_get_long(dev->config + pos);
> +            if (PCI_EXT_CAP_NEXT(header) < PCI_CONFIG_SPACE_SIZE) {
> +                break;
> +            }
> +
> +            pos = PCI_EXT_CAP_NEXT(header);
> +        }
> +
> +        assert(ttl > 0); /* since it is known that [offset, offset + size]
> +                            is unused, so ttl shouldn't be zero */
> +        pcie_ext_cap_set_next(dev, pos, offset);
> +        next = 0;
> +    }
> +    pci_set_long(dev->config + offset, PCI_EXT_CAP(cap_id, cap_ver, next));
> +
> +    memset(dev->used + offset, 0xFF, size);
> +    /* Make capability read-only by default */
> +    memset(dev->wmask + offset, 0, size);
> +    /* Check capability by default */
> +    memset(dev->cmask + offset, 0xFF, size);
> +}
> +
> +int pcie_add_ext_capability(PCIDevice *dev,
> +                            uint16_t cap_id, uint8_t cap_ver, uint16_t size)

this should be static

> +{
> +    uint16_t offset = pcie_ext_cap_find_space(dev, size);
> +
> +    if (!offset) {
> +        return -ENOSPC;
> +    }
> +
> +    pcie_allocate_ext_capability(dev, cap_id, cap_ver, offset, size);
> +    return offset;
> +}
> +
> +int pcie_append_ext_capability(PCIDevice *dev,
> +                               uint16_t cap_id, uint8_t cap_ver,
> +                               uint16_t offset, uint16_t size)

rename this pcie_add_capability

> +{
> +    uint16_t i;
> +
> +    if (!offset) {
> +        return pcie_add_ext_capability(dev, cap_id, cap_ver, size);

open-code this call and remove pcie_add_ext_capability function.

> +    }
> +
> +    assert(offset < offset + size);
> +    assert(offset + size < PCIE_CONFIG_SPACE_SIZE);
> +    assert(size >= 8);
> +
> +    for (i = offset; i < offset + size; ++i) {
> +        if (dev->used[i]) {
> +            return -EBUSY;
> +        }
> +    }
> +
> +    pcie_allocate_ext_capability(dev, cap_id, cap_ver, offset, size);
> +    return offset;
> +}
> +
> +void pcie_del_ext_capability(PCIDevice *dev, uint16_t cap_id, uint16_t size)

rename pcie_del_capability

> +{
> +    uint16_t prev;
> +    uint16_t offset = pcie_find_ext_capability_list(dev, cap_id, &prev);
> +    uint32_t header;
> +
> +    if (!offset) {
> +        return;
> +    }
> +
> +    header = pci_get_long(dev->config + offset);
> +    if (prev) {
> +        pcie_ext_cap_set_next(dev, prev, PCI_EXT_CAP_NEXT(header));
> +    } else {
> +        /* move up next ext cap to PCI_CONFIG_SPACE_SIZE? */
> +        assert(offset == PCI_CONFIG_SPACE_SIZE);
> +        pci_set_long(dev->config + offset,
> +                     PCI_EXT_CAP(0, 0, PCI_EXT_CAP_NEXT(header)));
> +    }
> +
> +    /* Make capability writeable again */
> +    memset(dev->wmask + offset, 0xff, size);
> +    /* Clear cmask as device-specific registers can't be checked */
> +    memset(dev->cmask + offset, 0, size);
> +    memset(dev->used + offset, 0, size);
> +}
> +
> +void pcie_reserve_ext_capability(PCIDevice *dev,
> +                                 uint16_t offset, uint16_t size)
> +{
> +    memset(dev->used + offset, 0xff, size);
> +}
> +
> +/**************************************************************************
> + * pci express extended capability helper functions
> + */

Split out ari.c and aer.c? Just a thought ...

> +/* ARI */
> +#define PCI_ARI_VER     1
> +#define PCI_ARI_SIZEOF  8
> +
> +int pcie_ari_init(PCIDevice *dev, uint16_t offset, uint16_t nextfn)
> +{
> +    int pos;
> +    pos = pcie_append_ext_capability(dev, PCI_EXT_CAP_ID_ARI, PCI_ARI_VER,
> +                                     offset, PCI_ARI_SIZEOF);
> +    if (pos < 0) {
> +        return pos;
> +    }
> +
> +    pci_set_long(dev->config + pos + PCI_ARI_CAP, PCI_ARI_CAP_NFN(nextfn));
> +    return pos;
> +}
> +
> +/* AER */
> +#define PCI_ERR_VER                     2
> +#define PCI_ERR_SIZEOF                  0x48
> +
> +#define PCI_ERR_UNC_SDN                 0x00000020      /* surprise down */
> +#define PCI_ERR_UNC_ACSV                0x00200000      /* ACS Violation */
> +#define PCI_ERR_UNC_INTN                0x00400000      /* Internal Error */
> +#define PCI_ERR_UNC_MCBTLP              0x00800000      /* MC Blocked TLP */
> +#define PCI_ERR_UNC_ATOP_EBLOCKED       0x01000000      /* atomic op egress blocked */
> +#define PCI_ERR_UNC_TLP_PRF_BLOCKED     0x02000000      /* TLP Prefix Blocked */
> +#define PCI_ERR_UNC_SUPPORTED           (PCI_ERR_UNC_DLP |              \
> +                                         PCI_ERR_UNC_SDN |              \
> +                                         PCI_ERR_UNC_POISON_TLP |       \
> +                                         PCI_ERR_UNC_FCP |              \
> +                                         PCI_ERR_UNC_COMP_TIME |        \
> +                                         PCI_ERR_UNC_COMP_ABORT |       \
> +                                         PCI_ERR_UNC_UNX_COMP |         \
> +                                         PCI_ERR_UNC_RX_OVER |          \
> +                                         PCI_ERR_UNC_MALF_TLP |         \
> +                                         PCI_ERR_UNC_ECRC |             \
> +                                         PCI_ERR_UNC_UNSUP |            \
> +                                         PCI_ERR_UNC_ACSV |             \
> +                                         PCI_ERR_UNC_INTN |             \
> +                                         PCI_ERR_UNC_MCBTLP |           \
> +                                         PCI_ERR_UNC_ATOP_EBLOCKED |    \
> +                                         PCI_ERR_UNC_TLP_PRF_BLOCKED)
> +
> +#define PCI_ERR_UNC_SEVERITY_DEFAULT    (PCI_ERR_UNC_DLP |              \
> +                                         PCI_ERR_UNC_SDN |              \
> +                                         PCI_ERR_UNC_FCP |              \
> +                                         PCI_ERR_UNC_RX_OVER |          \
> +                                         PCI_ERR_UNC_MALF_TLP |         \
> +                                         PCI_ERR_UNC_INTN)
> +
> +#define PCI_ERR_COR_ADV_NONFATAL        0x00002000      /* Advisory Non-Fatal */
> +#define PCI_ERR_COR_INTERNAL            0x00004000      /* Corrected Internal */
> +#define PCI_ERR_COR_HL_OVERFLOW         0x00008000      /* Header Log Overflow */
> +#define PCI_ERR_COR_SUPPORTED           (PCI_ERR_COR_RCVR |             \
> +                                         PCI_ERR_COR_BAD_TLP |          \
> +                                         PCI_ERR_COR_BAD_DLLP |         \
> +                                         PCI_ERR_COR_REP_ROLL |         \
> +                                         PCI_ERR_COR_REP_TIMER |        \
> +                                         PCI_ERR_COR_ADV_NONFATAL |     \
> +                                         PCI_ERR_COR_INTERNAL |         \
> +                                         PCI_ERR_COR_HL_OVERFLOW)
> +#define PCI_ERR_COR_MASK_DEFAULT        (PCI_ERR_COR_ADV_NONFATAL |     \
> +                                         PCI_ERR_COR_INTERNAL |         \
> +                                         PCI_ERR_COR_HL_OVERFLOW)
> +
> +
> +#define PCI_ERR_CAP_FEP_MASK            0x0000001f
> +#define PCI_ERR_CAP_MHRC                0x00000200
> +#define PCI_ERR_CAP_MHRE                0x00000400
> +#define PCI_ERR_CAP_TLP                 0x00000800
> +
> +#define PCI_ERR_TLP_PREFIX_LOG          0x38
> +
> +/* From 6.2.7 Error Listing and Rules. Table 6-2, 6-3 and 6-4 */
> +static enum PCIE_AER_SEVERITY pcie_aer_uncor_default_severity(uint32_t status)
> +{
> +    switch (status) {
> +    case PCI_ERR_UNC_INTN:
> +    case PCI_ERR_UNC_DLP:
> +    case PCI_ERR_UNC_SDN:
> +    case PCI_ERR_UNC_RX_OVER:
> +    case PCI_ERR_UNC_FCP:
> +    case PCI_ERR_UNC_MALF_TLP:
> +        return AER_ERR_FATAL;
> +    case PCI_ERR_UNC_POISON_TLP:
> +    case PCI_ERR_UNC_ECRC:
> +    case PCI_ERR_UNC_UNSUP:
> +    case PCI_ERR_UNC_COMP_TIME:
> +    case PCI_ERR_UNC_COMP_ABORT:
> +    case PCI_ERR_UNC_UNX_COMP:
> +    case PCI_ERR_UNC_ACSV:
> +    case PCI_ERR_UNC_MCBTLP:
> +    case PCI_ERR_UNC_ATOP_EBLOCKED:
> +    case PCI_ERR_UNC_TLP_PRF_BLOCKED:
> +        return AER_ERR_NONFATAL;
> +    default:
> +        break;
> +    }
> +    abort();
> +    return AER_ERR_FATAL;
> +}
> +
> +static uint32_t pcie_aer_log_next(uint32_t i, uint32_t max)
> +{
> +    return (i + 1) % max;
> +}
> +
> +static bool pcie_aer_log_empty_index(uint32_t producer, uint32_t consumer)
> +{
> +    return producer == consumer;
> +}
> +
> +static bool pcie_aer_log_empty(struct pcie_aer_log *aer_log)
> +{
> +    return pcie_aer_log_empty_index(aer_log->producer, aer_log->consumer);
> +}
> +
> +static bool pcie_aer_log_full(struct pcie_aer_log *aer_log)
> +{
> +    return pcie_aer_log_next(aer_log->producer, aer_log->log_max) ==
> +        aer_log->consumer;
> +}
> +
> +static uint32_t pcie_aer_log_add(struct pcie_aer_log *aer_log)
> +{
> +    uint32_t i = aer_log->producer;
> +    aer_log->producer = pcie_aer_log_next(aer_log->producer, aer_log->log_max);
> +    return i;
> +}
> +
> +static uint32_t pcie_aer_log_del(struct pcie_aer_log *aer_log)
> +{
> +    uint32_t i = aer_log->consumer;
> +    aer_log->consumer = pcie_aer_log_next(aer_log->consumer, aer_log->log_max);
> +    return i;
> +}
> +
> +static int pcie_aer_log_add_err(struct pcie_aer_log *aer_log,
> +                                const struct pcie_aer_err *err)
> +{
> +    uint32_t i;
> +    if (pcie_aer_log_full(aer_log)) {
> +        return -1;
> +    }
> +    i = pcie_aer_log_add(aer_log);
> +    memcpy(&aer_log->log[i], err, sizeof(*err));
> +    return 0;
> +}
> +
> +static const struct pcie_aer_err*
> +pcie_aer_log_del_err(struct pcie_aer_log *aer_log)
> +{
> +    uint32_t i;
> +    assert(!pcie_aer_log_empty(aer_log));
> +    i = pcie_aer_log_del(aer_log);
> +    return &aer_log->log[i];
> +}
> +
> +static void pcie_aer_log_clear_all_err(struct pcie_aer_log *aer_log)
> +{
> +    aer_log->producer = 0;
> +    aer_log->consumer = 0;
> +}
> +
> +int pcie_aer_init(PCIDevice *dev, uint16_t offset)
> +{
> +    int pos;
> +    PCIExpressDevice *exp;
> +
> +    pci_set_word(dev->wmask + PCI_COMMAND,
> +                 pci_get_word(dev->wmask + PCI_COMMAND) | PCI_COMMAND_SERR);
> +
> +    pos = pcie_append_ext_capability(dev, PCI_EXT_CAP_ID_ERR, PCI_ERR_VER,
> +                                     offset, PCI_ERR_SIZEOF);
> +    if (pos < 0) {
> +        return pos;
> +    }
> +    exp = dev->exp;
> +    exp->aer_cap = pos;
> +    if (dev->aer_log.log_max == PCIE_AER_LOG_MAX_UNSET) {
> +        dev->aer_log.log_max = PCIE_AER_LOG_MAX_DEFAULT;
> +    }
> +    if (dev->aer_log.log_max > PCIE_AER_LOG_MAX_MAX) {
> +        dev->aer_log.log_max = PCIE_AER_LOG_MAX_MAX;
> +    }
> +    dev->aer_log.log =
> +        qemu_mallocz(sizeof(dev->aer_log.log[0]) * dev->aer_log.log_max);
> +
> +    pci_set_long(dev->wmask + pos + PCI_ERR_UNCOR_MASK,
> +                 PCI_ERR_UNC_SUPPORTED);
> +
> +    pci_set_long(dev->config + pos + PCI_ERR_UNCOR_SEVER,
> +                 PCI_ERR_UNC_SEVERITY_DEFAULT);
> +    pci_set_long(dev->wmask + pos + PCI_ERR_UNCOR_SEVER,
> +                 PCI_ERR_UNC_SUPPORTED);
> +
> +    pci_set_long(dev->config + pos + PCI_ERR_COR_MASK,
> +                 PCI_ERR_COR_MASK_DEFAULT);
> +    pci_set_long(dev->wmask + pos + PCI_ERR_COR_MASK,
> +                 PCI_ERR_COR_SUPPORTED);
> +
> +    /* capabilities and control. multiple header logging is supported */
> +    if (dev->aer_log.log_max > 0) {
> +        pci_set_long(dev->config + pos + PCI_ERR_CAP,
> +                     PCI_ERR_CAP_ECRC_GENC | PCI_ERR_CAP_ECRC_CHKC |
> +                     PCI_ERR_CAP_MHRC);
> +        pci_set_long(dev->wmask + pos + PCI_ERR_CAP,
> +                     PCI_ERR_CAP_ECRC_GENE | PCI_ERR_CAP_ECRC_CHKE |
> +                     PCI_ERR_CAP_MHRE);
> +    } else {
> +        pci_set_long(dev->config + pos + PCI_ERR_CAP,
> +                     PCI_ERR_CAP_ECRC_GENC | PCI_ERR_CAP_ECRC_CHKC);
> +        pci_set_long(dev->wmask + pos + PCI_ERR_CAP,
> +                     PCI_ERR_CAP_ECRC_GENE | PCI_ERR_CAP_ECRC_CHKE);
> +    }
> +
> +    switch (pcie_cap_get_type(dev)) {
> +    case PCI_EXP_TYPE_ROOT_PORT:
> +        /* this case will be set by pcie_aer_root_init() */
> +        /* fallthrough */
> +    case PCI_EXP_TYPE_DOWNSTREAM:
> +    case PCI_EXP_TYPE_UPSTREAM:
> +        pci_set_word(dev->wmask + PCI_BRIDGE_CONTROL,
> +                     pci_get_word(dev->wmask + PCI_BRIDGE_CONTROL) |
> +                     PCI_BRIDGE_CTL_SERR);
> +        exp->aer_errmsg = pcie_aer_errmsg_vbridge;
> +        break;
> +    default:
> +        exp->aer_errmsg = pcie_aer_errmsg_alldev;
> +        break;
> +    }
> +    return pos;
> +}
> +
> +/*
> + * Tear down what pcie_aer_init() set up: remove the AER extended capability
> + * and release the error log buffer.
> + */
> +void pcie_aer_exit(PCIDevice *dev)
> +{
> +    /* AER is an extended capability, so it has to be removed through the
> +     * extended capability list, not the legacy pci_del_capability() */
> +    pcie_del_ext_capability(dev, PCI_EXT_CAP_ID_ERR, PCI_ERR_SIZEOF);
> +    qemu_free(dev->aer_log.log);
> +    dev->aer_log.log = NULL;
> +}
> +
> +/* Multiple Header recording isn't implemented. Is it wanted? */
> +void pcie_aer_write_config(PCIDevice *dev,
> +                           uint32_t addr, uint32_t val, int len)
> +{
> +    uint32_t pos = dev->exp->aer_cap;
> +
> +    /* PCI_STATUS_SIG_SYSTEM_ERROR */
> +    if (ranges_overlap(addr, len, PCI_STATUS, 2)) {
> +        pcie_w1c_word(dev, PCI_STATUS, PCI_STATUS_SIG_SYSTEM_ERROR, addr, val);
> +    }
> +
> +    /* uncorrectable */
> +    if (ranges_overlap(addr, len, pos + PCI_ERR_UNCOR_STATUS, 4)) {
> +        uint32_t written =
> +            pcie_written_val_long(addr, val, pos + PCI_ERR_UNCOR_STATUS) &
> +            PCI_ERR_UNC_SUPPORTED;
> +        uint32_t uncorsta =
> +            pci_get_long(dev->config + pos + PCI_ERR_UNCOR_STATUS);
> +        uint32_t errcap = pci_get_long(dev->config + pos + PCI_ERR_CAP);
> +        uint32_t first_error = (1 << PCI_ERR_CAP_FEP(errcap));
> +
> +        if ((uncorsta & first_error) && (written & first_error)) {
> +            pcie_aer_clear_error(dev);
> +        }
> +        if (!(errcap & PCI_ERR_CAP_MHRE)) {
> +            /* RW1CS */
> +            pcie_w1c_long(dev, pos + PCI_ERR_UNCOR_STATUS,
> +                          PCI_ERR_UNC_SUPPORTED, addr, val);
> +        }
> +    }
> +
> +    /* correctable */
> +    if (ranges_overlap(addr, len, pos + PCI_ERR_COR_STATUS, 4)) {
> +        /* RW1CS */
> +        pcie_w1c_long(dev, pos + PCI_ERR_COR_STATUS, PCI_ERR_COR_SUPPORTED,
> +                      addr, val);
> +    }
> +
> +    /* capability & control */
> +    if (ranges_overlap(addr, len, pos + PCI_ERR_CAP, 4)) {
> +        uint32_t err_cap = pci_get_long(dev->config + pos + PCI_ERR_CAP);
> +        if (!(err_cap & PCI_ERR_CAP_MHRE)) {
> +            pcie_aer_log_clear_all_err(&dev->aer_log);
> +        }
> +    }
> +}
> +
> +#define PCI_SEC_STATUS_RCV_SYSTEM_ERROR         0x4000
> +
> +/*
> + * Config write hook for the bridge part of AER handling: applies
> + * write-1-to-clear semantics to PCI_SEC_STATUS_RCV_SYSTEM_ERROR in the
> + * secondary status register when the write [addr, addr + len) touches it.
> + */
> +void pcie_aer_write_config_vbridge(PCIDevice *dev,
> +                                   uint32_t addr, uint32_t val, int len)
> +{
> +    /* PCI_SEC_STATUS_RCV_SYSTEM_ERROR */
> +    /* the guard must test the register that is being updated below */
> +    if (ranges_overlap(addr, len, PCI_SEC_STATUS, 2)) {
> +        pcie_w1c_word(dev, PCI_SEC_STATUS, PCI_SEC_STATUS_RCV_SYSTEM_ERROR,
> +                      addr, val);
> +    }
> +}
> +
> +static inline void pcie_aer_errmsg(PCIDevice *dev,
> +                                   const struct pcie_aer_err_msg *msg)
> +{
> +    assert(dev->exp);
> +    assert(dev->exp->aer_errmsg);
> +    dev->exp->aer_errmsg(dev, msg);
> +}
> +
> +static AER_ERR_MSG_RESULT
> +pcie_aer_errmsg_alldev(PCIDevice *dev, const struct pcie_aer_err_msg *msg)
> +{
> +    uint16_t cmd = pci_get_word(dev->config + PCI_COMMAND);
> +    bool transmit1 =
> +        pcie_aer_err_msg_is_uncor(msg) && (cmd & PCI_COMMAND_SERR);
> +    uint32_t pos = pci_pcie_cap(dev);
> +    uint32_t devctl = pci_get_word(dev->config + pos + PCI_EXP_DEVCTL);
> +    bool transmit2 = msg->severity & devctl;
> +    PCIDevice *parent_port;
> +
> +    if (transmit1) {
> +        if (pcie_aer_err_msg_is_uncor(msg)) {
> +            /* Signaled System Error */
> +            uint8_t *status = dev->config + PCI_STATUS;
> +            pci_set_word(status,
> +                         pci_get_word(status) | PCI_STATUS_SIG_SYSTEM_ERROR);
> +        }
> +    }
> +
> +    if (!(transmit1 || transmit2)) {
> +        return AER_ERR_MSG_MASKED;
> +    }
> +
> +    /* send up error message */
> +    if (pci_is_express(dev) &&
> +        pcie_cap_get_type(dev) == PCI_EXP_TYPE_ROOT_PORT) {
> +        /* Root port notify system itself,
> +           or send the error message to root complex event collector. */
> +        /*
> +         * if root port is associated to event collector, set
> +         * parent_port = root complex event collector
> +         * For now root complex event collector isn't supported.
> +         */
> +        parent_port = NULL;
> +    } else {
> +        parent_port = pci_bridge_get_device(dev->bus);
> +    }
> +    if (parent_port) {
> +        if (!pci_is_express(parent_port)) {
> +            /* What to do? */
> +            return AER_ERR_MSG_MASKED;
> +        }
> +        pcie_aer_errmsg(parent_port, msg);
> +    }
> +    return AER_ERR_MSG_SENT;
> +}
> +
> +static AER_ERR_MSG_RESULT
> +pcie_aer_errmsg_vbridge(PCIDevice *dev, const struct pcie_aer_err_msg *msg)
> +{
> +    uint16_t bridge_control = pci_get_word(dev->config + PCI_BRIDGE_CONTROL);
> +
> +    if (pcie_aer_err_msg_is_uncor(msg)) {
> +        /* Received System Error */
> +        uint8_t *sec_status = dev->config + PCI_SEC_STATUS;
> +        pci_set_word(sec_status,
> +                     pci_get_word(sec_status) |
> +                     PCI_SEC_STATUS_RCV_SYSTEM_ERROR);
> +    }
> +
> +    if (!(bridge_control & PCI_BRIDGE_CTL_SERR)) {
> +        return AER_ERR_MSG_MASKED;
> +    }
> +    return pcie_aer_errmsg_alldev(dev, msg);
> +}
> +
> +static AER_ERR_MSG_RESULT
> +pcie_aer_errmsg_root_port(PCIDevice *dev, const struct pcie_aer_err_msg *msg)
> +{
> +    AER_ERR_MSG_RESULT ret;
> +    uint16_t cmd;
> +    uint8_t *aer_cap;
> +    uint32_t root_cmd;
> +    uint32_t root_sta;
> +    bool trigger;
> +
> +    ret = pcie_aer_errmsg_vbridge(dev, msg);
> +    if (ret != AER_ERR_MSG_SENT) {
> +        return ret;
> +    }
> +
> +    ret = AER_ERR_MSG_MASKED;
> +    cmd = pci_get_word(dev->config + PCI_COMMAND);
> +    aer_cap = dev->config + pcie_aer_cap(dev);
> +    root_cmd = pci_get_long(aer_cap + PCI_ERR_ROOT_COMMAND);
> +    root_sta = pci_get_long(aer_cap + PCI_ERR_ROOT_STATUS);
> +    trigger = false;
> +
> +    if (cmd & PCI_COMMAND_SERR) {
> +        /* System Error. Platform Specific */
> +        /* ret = AER_ERR_MSG_SENT; */
> +    }
> +
> +    /* Errro Message Received: Root Error Status register */
> +    switch (msg->severity) {
> +    case AER_ERR_COR:
> +        if (root_sta & PCI_ERR_ROOT_COR_RCV) {
> +            root_sta |= PCI_ERR_ROOT_MULTI_COR_RCV;
> +        } else {
> +            if (root_cmd & PCI_ERR_ROOT_CMD_COR_EN) {
> +                trigger = true;
> +            }
> +            pci_set_word(aer_cap + PCI_ERR_ROOT_COR_SRC, msg->source_id);
> +        }
> +        root_sta |= PCI_ERR_ROOT_COR_RCV;
> +        break;
> +    case AER_ERR_NONFATAL:
> +        if (!(root_sta & PCI_ERR_ROOT_NONFATAL_RCV) &&
> +            root_cmd & PCI_ERR_ROOT_CMD_NONFATAL_EN) {
> +            trigger = true;
> +        }
> +        root_sta |= PCI_ERR_ROOT_NONFATAL_RCV;
> +        break;
> +    case AER_ERR_FATAL:
> +        if (!(root_sta & PCI_ERR_ROOT_FATAL_RCV) &&
> +            root_cmd & PCI_ERR_ROOT_CMD_FATAL_EN) {
> +            trigger = true;
> +        }
> +        if (!(root_sta & PCI_ERR_ROOT_UNCOR_RCV)) {
> +            root_sta |= PCI_ERR_ROOT_FIRST_FATAL;
> +        }
> +        root_sta |= PCI_ERR_ROOT_FATAL_RCV;
> +        break;
> +    }
> +    if (pcie_aer_err_msg_is_uncor(msg)) {
> +        if (root_sta & PCI_ERR_ROOT_UNCOR_RCV) {
> +            root_sta |= PCI_ERR_ROOT_MULTI_UNCOR_RCV;
> +        } else {
> +            pci_set_word(aer_cap + PCI_ERR_ROOT_SRC, msg->source_id);
> +        }
> +        root_sta |= PCI_ERR_ROOT_UNCOR_RCV;
> +    }
> +    pci_set_long(aer_cap + PCI_ERR_ROOT_STATUS, root_sta);
> +
> +    if (root_cmd & msg->severity) {
> +        /* Error Interrupt(INTx or MSI) */
> +        pcie_aer_root_notify(dev, trigger, 1);
> +        ret = AER_ERR_MSG_SENT;
> +    }
> +    return ret;
> +}
> +
> +static void pcie_aer_update_log(PCIDevice *dev, const struct pcie_aer_err *err)
> +{
> +    uint8_t *aer_cap = dev->config + pcie_aer_cap(dev);
> +    uint8_t first_bit = ffsl(err->status) - 1;
> +    uint32_t errcap = pci_get_long(aer_cap + PCI_ERR_CAP);
> +    int i;
> +    uint32_t dw;
> +
> +    errcap &= ~(PCI_ERR_CAP_FEP_MASK | PCI_ERR_CAP_TLP);
> +    errcap |= PCI_ERR_CAP_FEP(first_bit);
> +
> +    if (err->flags & PCIE_AER_ERR_HEADER_VALID) {
> +        for (i = 0; i < ARRAY_SIZE(err->header); ++i) {
> +            /* 7.10.8 Header Log Register */
> +            cpu_to_be32wu(&dw, err->header[i]);
> +            memcpy(aer_cap + PCI_ERR_HEADER_LOG + sizeof(err->header[0]) * i,
> +                   &dw, sizeof(dw));
> +        }
> +    } else {
> +        assert(!(err->flags & PCIE_AER_ERR_TLP_PRESENT));
> +        memset(aer_cap + PCI_ERR_HEADER_LOG, 0, sizeof(err->header));
> +    }
> +
> +    if ((err->flags & PCIE_AER_ERR_TLP_PRESENT) &&
> +        (pci_get_long(dev->config + pci_pcie_cap(dev) + PCI_EXP_DEVCTL2) &
> +         PCI_EXP_DEVCAP2_EETLPP)) {
> +        for (i = 0; i < ARRAY_SIZE(err->prefix); ++i) {
> +            /* 7.10.12 tlp prefix log register */
> +            cpu_to_be32wu(&dw, err->prefix[i]);
> +            memcpy(aer_cap + PCI_ERR_TLP_PREFIX_LOG +
> +                   sizeof(err->prefix[0]) * i, &dw, sizeof(dw));
> +        }
> +        errcap |= PCI_ERR_CAP_TLP;
> +    } else {
> +        memset(aer_cap + PCI_ERR_TLP_PREFIX_LOG, 0, sizeof(err->prefix));
> +    }
> +    pci_set_long(aer_cap + PCI_ERR_CAP, errcap);
> +}
> +
> +static void pcie_aer_clear_log(PCIDevice *dev)
> +{
> +    struct pcie_aer_err *err;
> +    uint8_t *aer_cap = dev->config + pcie_aer_cap(dev);
> +    uint32_t errcap = pci_get_long(aer_cap + PCI_ERR_CAP);
> +
> +    errcap &= ~(PCI_ERR_CAP_FEP_MASK | PCI_ERR_CAP_TLP);
> +    pci_set_long(aer_cap + PCI_ERR_CAP, errcap);
> +
> +    memset(aer_cap + PCI_ERR_HEADER_LOG, 0, sizeof(err->header));
> +    memset(aer_cap + PCI_ERR_TLP_PREFIX_LOG, 0, sizeof(err->prefix));
> +}
> +
> +static int pcie_aer_record_error(PCIDevice *dev,
> +                                 const struct pcie_aer_err *err)
> +{
> +    uint8_t *aer_cap = dev->config + pcie_aer_cap(dev);
> +    uint32_t errcap = pci_get_long(aer_cap + PCI_ERR_CAP);
> +    int fep = PCI_ERR_CAP_FEP(errcap);
> +
> +    if (errcap & PCI_ERR_CAP_MHRE &&
> +        (pci_get_long(aer_cap + PCI_ERR_UNCOR_STATUS) & (1ULL << fep))) {
> +        /*  Not first error. queue error */
> +        if (pcie_aer_log_add_err(&dev->aer_log, err) < 0) {
> +            /* overflow */
> +            return -1;
> +        }
> +        return 0;
> +    }
> +
> +    pcie_aer_update_log(dev, err);
> +    return 0;
> +}
> +
> +static void pcie_aer_clear_error(PCIDevice *dev)
> +{
> +    uint8_t *aer_cap = dev->config + pcie_aer_cap(dev);
> +    uint32_t errcap = pci_get_long(aer_cap + PCI_ERR_CAP);
> +    uint32_t old_err = (1UL << PCI_ERR_CAP_FEP(errcap));
> +    struct pcie_aer_log *aer_log = &dev->aer_log;
> +    const struct pcie_aer_err *err;
> +    uint32_t consumer;
> +
> +    if (!(errcap & PCI_ERR_CAP_MHRE) || pcie_aer_log_empty(aer_log)) {
> +        pcie_aer_clear_log(dev);
> +        pci_set_long(aer_cap + PCI_ERR_UNCOR_STATUS,
> +                     pci_get_long(aer_cap + PCI_ERR_UNCOR_STATUS) & ~old_err);
> +        return;
> +    }
> +
> +    /* if no same error is queued, clear bit in uncorrectable error status */
> +    for (consumer = dev->aer_log.consumer;
> +         !pcie_aer_log_empty_index(dev->aer_log.producer, consumer);
> +         consumer = pcie_aer_log_next(consumer, dev->aer_log.log_max)) {
> +        if (dev->aer_log.log[consumer].status & old_err) {
> +            old_err = 0;
> +            break;
> +        }
> +    }
> +    if (old_err) {
> +        pci_set_long(aer_cap + PCI_ERR_UNCOR_STATUS,
> +                     pci_get_long(aer_cap + PCI_ERR_UNCOR_STATUS) & ~old_err);
> +    }
> +
> +    err = pcie_aer_log_del_err(aer_log);
> +    pcie_aer_update_log(dev, err);
> +}
> +
> +/*
> + * non-Function specific error must be recorded in all functions.
> + * It is the responsibility of the caller of this function.
> + * It is also caller's responsibility to determine which function should
> + * report the error.
> + *
> + * 6.2.4 Error Logging
> + * 6.2.5 Sequence of Device Error Signaling and Logging Operations
> + * table 6-2: Flowchart Showing Sequence of Device Error Signaling and Logging
> + *            Operations
> + *
> + * Although this implementation can be shortened/optimized, this is kept
> + * parallel to table 6-2.
> + */
> +void pcie_aer_inject_error(PCIDevice *dev, const struct pcie_aer_err *err)
> +{
> +    uint8_t *exp_cap;
> +    uint8_t *aer_cap = NULL;
> +    uint32_t devctl = 0;
> +    uint32_t devsta = 0;
> +    uint32_t status = err->status;
> +    uint32_t mask;
> +    bool is_unsupported_request =
> +        (!(err->flags & PCIE_AER_ERR_IS_CORRECTABLE) &&
> +         err->status == PCI_ERR_UNC_UNSUP);
> +    bool is_advisory_nonfatal = false;  /* for advisory non-fatal error */
> +    uint32_t uncor_status = 0;          /* for advisory non-fatal error */
> +    struct pcie_aer_err_msg msg;
> +    int is_header_log_overflowed = 0;
> +
> +    if (!pci_is_express(dev)) {
> +        /* What to do? */
> +        return;
> +    }
> +
> +    if (err->flags & PCIE_AER_ERR_IS_CORRECTABLE) {
> +        status &= PCI_ERR_COR_SUPPORTED;
> +    } else {
> +        status &= PCI_ERR_UNC_SUPPORTED;
> +    }
> +    if (!status || status & (status - 1)) {
> +        /* invalid status bit. one and only one bit must be set */
> +        return;
> +    }
> +
> +    exp_cap = dev->config + pci_pcie_cap(dev);
> +    if (dev->exp->aer_cap) {
> +        aer_cap = dev->config + pcie_aer_cap(dev);
> +        devctl = pci_get_long(exp_cap + PCI_EXP_DEVCTL);
> +        devsta = pci_get_long(exp_cap + PCI_EXP_DEVSTA);
> +    }
> +    if (err->flags & PCIE_AER_ERR_IS_CORRECTABLE) {
> +    correctable_error:
> +        devsta |= PCI_EXP_DEVSTA_CED;
> +        if (is_unsupported_request) {
> +            devsta |= PCI_EXP_DEVSTA_URD;
> +        }
> +        pci_set_word(exp_cap + PCI_EXP_DEVSTA, devsta);
> +
> +        if (aer_cap) {
> +            pci_set_long(aer_cap + PCI_ERR_COR_STATUS,
> +                         pci_get_long(aer_cap + PCI_ERR_COR_STATUS) | status);
> +            mask = pci_get_long(aer_cap + PCI_ERR_COR_MASK);
> +            if (mask & status) {
> +                return;
> +            }
> +            if (is_advisory_nonfatal) {
> +                uint32_t uncor_mask =
> +                    pci_get_long(aer_cap + PCI_ERR_UNCOR_MASK);
> +                if (!(uncor_mask & uncor_status)) {
> +                    is_header_log_overflowed = pcie_aer_record_error(dev, err);
> +                }
> +                pci_set_long(aer_cap + PCI_ERR_UNCOR_STATUS,
> +                             pci_get_long(aer_cap + PCI_ERR_UNCOR_STATUS) |
> +                             uncor_status);
> +            }
> +        }
> +
> +        if (is_unsupported_request && !(devctl & PCI_EXP_DEVCTL_URRE)) {
> +            return;
> +        }
> +        if (!(devctl & PCI_EXP_DEVCTL_CERE)) {
> +            return;
> +        }
> +        msg.severity = AER_ERR_COR;
> +    } else {
> +        bool is_fatal =
> +            (pcie_aer_uncor_default_severity(status) == AER_ERR_FATAL);
> +        uint16_t cmd;
> +
> +        if (aer_cap) {
> +            is_fatal = status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER);
> +        }
> +        if (!is_fatal && (err->flags & PCIE_AER_ERR_MAYBE_ADVISORY)) {
> +            is_advisory_nonfatal = true;
> +            uncor_status = status;
> +            status = PCI_ERR_COR_ADV_NONFATAL;
> +            goto correctable_error;
> +        }
> +        if (is_fatal) {
> +            devsta |= PCI_EXP_DEVSTA_FED;
> +        } else {
> +            devsta |= PCI_EXP_DEVSTA_NFED;
> +        }
> +        if (is_unsupported_request) {
> +            devsta |= PCI_EXP_DEVSTA_URD;
> +        }
> +        pci_set_long(exp_cap + PCI_EXP_DEVSTA, devsta);
> +
> +        if (aer_cap) {
> +            mask = pci_get_long(aer_cap + PCI_ERR_UNCOR_MASK);
> +            if (mask & status) {
> +                pci_set_long(aer_cap + PCI_ERR_UNCOR_STATUS,
> +                             pci_get_long(aer_cap + PCI_ERR_UNCOR_STATUS) |
> +                             status);
> +                return;
> +            }
> +
> +            is_header_log_overflowed = pcie_aer_record_error(dev, err);
> +            pci_set_long(aer_cap + PCI_ERR_UNCOR_STATUS,
> +                         pci_get_long(aer_cap + PCI_ERR_UNCOR_STATUS) |
> +                         status);
> +        }
> +
> +        cmd = pci_get_word(dev->config + PCI_COMMAND);
> +        if (is_unsupported_request &&
> +            !(devctl & PCI_EXP_DEVCTL_URRE) && !(cmd & PCI_COMMAND_SERR)) {
> +            return;
> +        }
> +        if (is_fatal) {
> +            if (!((cmd & PCI_COMMAND_SERR) ||
> +                  (devctl & PCI_EXP_DEVCTL_FERE))) {
> +                return;
> +            }
> +            msg.severity = AER_ERR_FATAL;
> +        } else {
> +            if (!((cmd & PCI_COMMAND_SERR) ||
> +                  (devctl & PCI_EXP_DEVCTL_NFERE))) {
> +                return;
> +            }
> +            msg.severity = AER_ERR_NONFATAL;
> +        }
> +    }
> +
> +    /* send up error message */
> +    msg.source_id = err->source_id;
> +    pcie_aer_errmsg(dev, &msg);
> +
> +    if (is_header_log_overflowed) {
> +        struct pcie_aer_err header_log_overflow = {
> +            .status = PCI_ERR_COR_HL_OVERFLOW,
> +            .flags = PCIE_AER_ERR_IS_CORRECTABLE,
> +            .header = {0, 0, 0, 0},
> +            .prefix = {0, 0, 0, 0},
> +        };
> +        pcie_aer_inject_error(dev, &header_log_overflow);
> +    }
> +}
> +
> +/* aer root error command/status */
> +#define PCI_ERR_ROOT_CMD_EN_MASK        (PCI_ERR_ROOT_CMD_COR_EN |      \
> +                                         PCI_ERR_ROOT_CMD_NONFATAL_EN | \
> +                                         PCI_ERR_ROOT_CMD_FATAL_EN)
> +
> +#define PCI_ERR_ROOT_IRQ_SHIFT          26
> +#define PCI_ERR_ROOT_IRQ                0xf8000000
> +#define PCI_ERR_ROOT_STATUS_REPORT_MASK (PCI_ERR_ROOT_COR_RCV |         \
> +                                         PCI_ERR_ROOT_MULTI_COR_RCV |   \
> +                                         PCI_ERR_ROOT_UNCOR_RCV |       \
> +                                         PCI_ERR_ROOT_MULTI_UNCOR_RCV | \
> +                                         PCI_ERR_ROOT_FIRST_FATAL |     \
> +                                         PCI_ERR_ROOT_NONFATAL_RCV |    \
> +                                         PCI_ERR_ROOT_FATAL_RCV)
> +
> +void pcie_aer_root_set_vector(PCIDevice *dev, uint8_t vector)
> +{
> +    uint8_t *aer_cap = dev->config + pcie_aer_cap(dev);
> +    uint32_t root_status = pci_get_long(aer_cap + PCI_ERR_ROOT_STATUS);
> +    root_status &= ~PCI_ERR_ROOT_IRQ;
> +    root_status |=
> +        (((uint32_t)vector) << PCI_ERR_ROOT_IRQ_SHIFT) & PCI_ERR_ROOT_IRQ;
> +    pci_set_long(aer_cap + PCI_ERR_ROOT_STATUS, root_status);
> +}
> +
> +static uint8_t pcie_aer_root_get_vector(PCIDevice *dev)
> +{
> +    uint8_t *aer_cap = dev->config + pcie_aer_cap(dev);
> +    uint32_t root_status = pci_get_long(aer_cap + PCI_ERR_ROOT_STATUS);
> +    return (root_status & PCI_ERR_ROOT_IRQ) >> PCI_ERR_ROOT_IRQ_SHIFT;
> +}
> +
> +static void pcie_aer_root_notify(PCIDevice *dev, bool trigger, int level)
> +{
> +    pcie_notify(dev, pcie_aer_root_get_vector(dev), trigger, level);
> +}
> +
> +void pcie_aer_root_init(PCIDevice *dev)
> +{
> +    uint16_t pos = pcie_aer_cap(dev);
> +
> +    pci_set_long(dev->wmask + pos + PCI_ERR_ROOT_COMMAND,
> +                 PCI_ERR_ROOT_CMD_EN_MASK);
> +    dev->exp->aer_errmsg = pcie_aer_errmsg_root_port;
> +}
> +
> +void pcie_aer_root_reset(PCIDevice *dev)
> +{
> +    uint8_t* aer_cap = dev->config + pcie_aer_cap(dev);
> +
> +    pci_set_long(aer_cap + PCI_ERR_ROOT_COMMAND, 0);
> +
> +    /*
> +     * Advanced Error Interrupt Message Number in Root Error Status Register
> +     * must be updated by chip dependent code.
> +     */
> +}
> +
> +static bool pcie_aer_root_does_trigger(uint32_t cmd, uint32_t sta)
> +{
> +    return
> +        ((cmd & PCI_ERR_ROOT_CMD_COR_EN) && (sta & PCI_ERR_ROOT_COR_RCV)) ||
> +        ((cmd & PCI_ERR_ROOT_CMD_NONFATAL_EN) &&
> +         (sta & PCI_ERR_ROOT_NONFATAL_RCV)) ||
> +        ((cmd & PCI_ERR_ROOT_CMD_FATAL_EN) && (sta & PCI_ERR_ROOT_FATAL_RCV));
> +}
> +
> +void pcie_aer_root_write_config(PCIDevice *dev,
> +                                uint32_t addr, uint32_t val, int len,
> +                                uint32_t root_cmd_prev)
> +{
> +    uint16_t pos = pcie_aer_cap(dev);
> +    uint8_t *aer_cap = dev->config + pos;
> +    uint32_t root_status;
> +
> +    if (ranges_overlap(addr, len, pos + PCI_ERR_ROOT_STATUS, 4)) {
> +        /* RW1CS */
> +        pcie_w1c_long(dev, pos + PCI_ERR_ROOT_STATUS,
> +                      PCI_ERR_ROOT_STATUS_REPORT_MASK, addr, val);
> +    }
> +
> +    /* root command */
> +    if (ranges_overlap(addr, len, pos + PCI_ERR_ROOT_COMMAND, 4)) {
> +        uint32_t root_cmd = pci_get_long(aer_cap + PCI_ERR_ROOT_COMMAND);
> +        if (root_cmd & PCI_ERR_ROOT_CMD_EN_MASK) {
> +            bool trigger = false;
> +            int level = 0;
> +            uint32_t root_cmd_set = (root_cmd_prev ^ root_cmd) & root_cmd;
> +
> +            /* 0 -> 1 */
> +            root_status = pci_get_long(aer_cap + PCI_ERR_ROOT_STATUS);
> +            if (pcie_aer_root_does_trigger(root_cmd_set, root_status)) {
> +                trigger = true;
> +            }
> +            if (pcie_aer_root_does_trigger(root_cmd, root_status)) {
> +                level = 1;
> +            }
> +            pcie_aer_root_notify(dev, trigger, level);
> +        }
> +    }
> +}
> +
> +static const VMStateDescription vmstate_pcie_aer_err = {
> +    .name = "PCIE_AER_ERROR",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .minimum_version_id_old = 1,
> +    .fields     = (VMStateField[]) {
> +        VMSTATE_UINT32(status, struct pcie_aer_err),
> +        VMSTATE_UINT16(source_id, struct pcie_aer_err),
> +        VMSTATE_UINT16(flags, struct pcie_aer_err),
> +        VMSTATE_UINT32_ARRAY(header, struct pcie_aer_err, 4),
> +        VMSTATE_UINT32_ARRAY(prefix, struct pcie_aer_err, 4),
> +        VMSTATE_END_OF_LIST()
> +    }
> +};
> +
> +#define VMSTATE_PCIE_AER_ERRS(_field, _state, _field_num, _vmsd, _type) { \
> +    .name       = (stringify(_field)),                                    \
> +    .version_id = 0,                                                      \
> +    .num_offset = vmstate_offset_value(_state, _field_num, uint16_t),     \
> +    .size       = sizeof(_type),                                          \
> +    .vmsd       = &(_vmsd),                                               \
> +    .flags      = VMS_POINTER | VMS_VARRAY_UINT16 | VMS_STRUCT,           \
> +    .offset     = vmstate_offset_pointer(_state, _field, _type),          \
> +}
> +
> +const VMStateDescription vmstate_pcie_aer_log = {
> +    .name = "PCIE_AER_ERROR_LOG",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .minimum_version_id_old = 1,
> +    .fields     = (VMStateField[]) {
> +        VMSTATE_UINT32(producer, struct pcie_aer_log),
> +        VMSTATE_UINT32(consumer, struct pcie_aer_log),
> +        VMSTATE_UINT16(log_max, struct pcie_aer_log),
> +        VMSTATE_PCIE_AER_ERRS(log, struct pcie_aer_log, log_max,
> +                              vmstate_pcie_aer_err, struct pcie_aer_err),
> +        VMSTATE_END_OF_LIST()
> +    }
> +};
> diff --git a/hw/pcie.h b/hw/pcie.h
> new file mode 100644
> index 0000000..07f42c6
> --- /dev/null
> +++ b/hw/pcie.h
> @@ -0,0 +1,186 @@
> +/*
> + * pcie.h
> + *
> + * Copyright (c) 2010 Isaku Yamahata <yamahata at valinux co jp>
> + *                    VA Linux Systems Japan K.K.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#ifndef QEMU_PCIE_H
> +#define QEMU_PCIE_H
> +
> +#include "hw.h"
> +
> +enum PCIExpressIndicator {
> +    /*
> +     * Indicator states for the attention and power indicators.
> +     * Spelled in hex: the 0b... binary-literal syntax is a GCC extension
> +     * and is not accepted by every compiler version.
> +     */
> +    PCI_EXP_HP_IND_RESERVED     = 0x0,
> +    PCI_EXP_HP_IND_ON           = 0x1,
> +    PCI_EXP_HP_IND_BLINK        = 0x2,
> +    PCI_EXP_HP_IND_OFF          = 0x3,
> +};
> +
> +enum PCIExpressHotPlugEvent {
> +    /* the bits match the bits in Slot Control/Status registers.
> +     * PCI_EXP_HP_EV_xxx = PCI_EXP_SLTCTL_xxxE = PCI_EXP_SLTSTA_xxx
> +     *
> +     * Values are written in hex rather than with the 0b... binary-literal
> +     * syntax, which is a GCC extension.
> +     */
> +    PCI_EXP_HP_EV_ABP   = 0x01,         /* attention button pressed */
> +    PCI_EXP_HP_EV_PDC   = 0x08,         /* presence detect changed */
> +    PCI_EXP_HP_EV_CCI   = 0x10,         /* command completed */
> +
> +    /* supported event mask: ABP | PDC | CCI */
> +    PCI_EXP_HP_EV_SUPPORTED     = 0x19,
> +    /* events not listed aren't supported */
> +};
> +
> +typedef void (*pcie_flr_fn)(PCIDevice *dev);
> +
> +struct pcie_aer_err_msg;
> +enum AER_ERR_MSG_RESULT {
> +    AER_ERR_MSG_MASKED,
> +    AER_ERR_MSG_SENT,
> +};
> +typedef enum AER_ERR_MSG_RESULT AER_ERR_MSG_RESULT;
> +typedef AER_ERR_MSG_RESULT (*pcie_aer_errmsg_fn)(PCIDevice *dev, const struct pcie_aer_err_msg *msg);
> +
> +struct PCIExpressDevice {
> +    /* Offset of express capability in config space */
> +    uint8_t exp_cap;
> +
> +    /* FLR */
> +    pcie_flr_fn flr;
> +
> +    /* AER */
> +    uint16_t aer_cap;
> +    pcie_aer_errmsg_fn aer_errmsg;
> +};
> +
> +struct pcie_aer_log {
> +    uint32_t producer;
> +    uint32_t consumer;
> +
> +#define PCIE_AER_LOG_MAX_DEFAULT        8
> +#define PCIE_AER_LOG_MAX_MAX            128 /* what is appropriate? */
> +#define PCIE_AER_LOG_MAX_UNSET          (~(uint16_t)0)
> +    uint16_t log_max;
> +
> +    struct pcie_aer_err *log;
> +};

This is not how types are supposed to be named in qemu, is it?
Please use either all-lower-case type names without a typedef
(this violates CodingStyle, but a lot of code does it)
or mixed-case names with a typedef, as CodingStyle wants.

> +
> +extern const VMStateDescription vmstate_pcie_aer_log;
> +
> +/* PCI express capability helper functions */
> +int pci_pcie_cap_init(PCIDevice *dev,
> +                      uint8_t offset, uint8_t type, uint8_t port);
> +int pci_pcie_cap_exit(PCIDevice *dev);
> +uint8_t pcie_cap_get_type(const PCIDevice *dev);
> +void pcie_cap_flags_set_vector(PCIDevice *dev, uint8_t vector);
> +uint8_t pcie_cap_flags_get_vector(PCIDevice *dev);
> +
> +void pcie_cap_deverr_init(PCIDevice *dev);
> +void pcie_cap_deverr_reset(PCIDevice *dev);
> +void pcie_cap_deverr_write_config(PCIDevice *dev,
> +                                  uint32_t addr, uint32_t val, int len);
> +
> +void pcie_cap_slot_init(PCIDevice *dev, uint16_t slot);
> +void pcie_cap_slot_reset(PCIDevice *dev);
> +void pcie_cap_slot_write_config(PCIDevice *dev,
> +                                uint32_t addr, uint32_t val, int len,
> +                                uint16_t sltctl_prev);
> +void pcie_cap_slot_push_attention_button(PCIDevice *dev);
> +
> +void pcie_cap_root_init(PCIDevice *dev);
> +void pcie_cap_root_reset(PCIDevice *dev);
> +
> +void pcie_cap_flr_init(PCIDevice *dev, pcie_flr_fn flr);
> +void pcie_cap_flr_write_config(PCIDevice *dev,
> +                           uint32_t addr, uint32_t val, int len);
> +
> +void pcie_cap_ari_init(PCIDevice *dev);
> +void pcie_cap_ari_reset(PCIDevice *dev);
> +bool pcie_cap_is_ari_enabled(const PCIDevice *dev);
> +
> +/* PCI express extended capability helper functions */
> +uint16_t pcie_find_ext_capability(PCIDevice *dev, uint16_t cap_id);
> +int pcie_add_ext_capability(PCIDevice *dev,
> +                            uint16_t cap_id, uint8_t cap_ver, uint16_t size);
> +int pcie_append_ext_capability(PCIDevice *dev,
> +                               uint16_t cap_id, uint8_t cap_ver,
> +                               uint16_t offset, uint16_t size);
> +void pcie_del_ext_capability(PCIDevice *dev, uint16_t cap_id, uint16_t size);
> +void pcie_reserve_ext_capability(PCIDevice *dev,
> +                                 uint16_t offset, uint16_t size);
> +
> +int pcie_ari_init(PCIDevice *dev, uint16_t offset, uint16_t nextfn);
> +
> +/* PCI express extended capabilities */
> +
> +/* AER */
> +/* aer error severity */
> +enum PCIE_AER_SEVERITY {
> +    /* those value are same as
> +     * Root error command register in aer extended cap and
> +     * root control register in pci express cap.
> +     */
> +    AER_ERR_COR         = 0x1,
> +    AER_ERR_NONFATAL    = 0x2,
> +    AER_ERR_FATAL       = 0x4,
> +};
> +
> +/* aer error message: error signaling message has only error severity and
> +   source id. See 2.2.8.3 error signaling messages */
> +struct pcie_aer_err_msg {
> +    enum PCIE_AER_SEVERITY severity;
> +    uint16_t source_id; /* bdf */
> +};
> +
> +static inline bool
> +pcie_aer_err_msg_is_uncor(const struct pcie_aer_err_msg *msg)
> +{
> +    return msg->severity == AER_ERR_NONFATAL || msg->severity == AER_ERR_FATAL;
> +}
> +
> +/* error */
> +struct pcie_aer_err {
> +    uint32_t status;    /* error status bits */
> +    uint16_t source_id; /* bdf */
> +
> +#define PCIE_AER_ERR_IS_CORRECTABLE     0x1     /* correctable/uncorrectable */
> +#define PCIE_AER_ERR_MAYBE_ADVISORY     0x2     /* maybe advisory non-fatal */
> +#define PCIE_AER_ERR_HEADER_VALID       0x4     /* TLP header is logged */
> +#define PCIE_AER_ERR_TLP_PRESENT        0x8     /* TLP Prefix is logged */
> +    uint16_t flags;
> +
> +    uint32_t header[4]; /* TLP header */
> +    uint32_t prefix[4]; /* TLP header prefix */
> +};
> +
> +int pcie_aer_init(PCIDevice *dev, uint16_t offset);
> +void pcie_aer_exit(PCIDevice *dev);
> +void pcie_aer_write_config(PCIDevice *dev,
> +                           uint32_t addr, uint32_t val, int len);
> +void pcie_aer_write_config_vbridge(PCIDevice *dev,
> +                                   uint32_t addr, uint32_t val, int len);
> +
> +/* aer root port */
> +void pcie_aer_root_set_vector(PCIDevice *dev, uint8_t vector);
> +void pcie_aer_root_init(PCIDevice *dev);
> +void pcie_aer_root_reset(PCIDevice *dev);
> +void pcie_aer_root_write_config(PCIDevice *dev,
> +                                uint32_t addr, uint32_t val, int len,
> +                                uint32_t root_cmd_prev);
> +
> +/* error injection */
> +void pcie_aer_inject_error(PCIDevice *dev, const struct pcie_aer_err *err);
> +
> +#endif /* QEMU_PCIE_H */
> diff --git a/qemu-common.h b/qemu-common.h
> index d735235..6d9ee26 100644
> --- a/qemu-common.h
> +++ b/qemu-common.h
> @@ -219,6 +219,7 @@ typedef struct PCIHostState PCIHostState;
>  typedef struct PCIExpressHost PCIExpressHost;
>  typedef struct PCIBus PCIBus;
>  typedef struct PCIDevice PCIDevice;
> +typedef struct PCIExpressDevice PCIExpressDevice;
>  typedef struct PCIBridge PCIBridge;
>  typedef struct SerialState SerialState;
>  typedef struct IRQState *qemu_irq;
> -- 
> 1.7.1.1
Wei Xu Sept. 8, 2010, 5:38 p.m. UTC | #2
Isaku:

For the binary constants below, I recommend changing them to hex (0x...)
to achieve maximum compatibility across gcc versions:

> +
> +enum PCIExpressIndicator {
> +    /* for attention and power indicator */
> +    PCI_EXP_HP_IND_RESERVED     = 0b00,
> +    PCI_EXP_HP_IND_ON           = 0b01,
> +    PCI_EXP_HP_IND_BLINK        = 0b10,
> +    PCI_EXP_HP_IND_OFF          = 0b11,
> +};
> +
> +enum PCIExpressHotPlugEvent {
> +    /* the bits match the bits in Slot Control/Status registers.
> +     * PCI_EXP_HP_EV_xxx = PCI_EXP_SLTCTL_xxxE = PCI_EXP_SLTSTA_xxx
> +     */
> +    PCI_EXP_HP_EV_ABP   = 0b00001,      /* attention button preseed */
> +    PCI_EXP_HP_EV_PDC   = 0b01000,      /* presence detect changed */
> +    PCI_EXP_HP_EV_CCI   = 0b10000,      /* command completed */
> +

Wei Xu



On 9/8/10 12:39 AM, "Isaku Yamahata" <yamahata@valinux.co.jp> wrote:

> This patch implements helper functions for pci express extended capability.
> NOTE: presence detection depends on pci_qdev_init() change.
>       PCIExpressDevice::aer_log_max is in PCIDevice for device property.
> 
> Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
> ---
>  Makefile.objs |    1 +
>  hw/pci.h      |   24 +
>  hw/pcie.c     | 1668
> +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  hw/pcie.h     |  186 +++++++
>  qemu-common.h |    1 +
>  5 files changed, 1880 insertions(+), 0 deletions(-)
>  create mode 100644 hw/pcie.c
>  create mode 100644 hw/pcie.h
> 
> diff --git a/Makefile.objs b/Makefile.objs
> index 5f5a4c5..eeb5134 100644
> --- a/Makefile.objs
> +++ b/Makefile.objs
> @@ -186,6 +186,7 @@ hw-obj-$(CONFIG_PIIX4) += piix4.o
>  # PCI watchdog devices
>  hw-obj-y += wdt_i6300esb.o
>  
> +hw-obj-y += pcie.o
>  hw-obj-y += msix.o msi.o
>  
>  # PCI network cards
> diff --git a/hw/pci.h b/hw/pci.h
> index 296c7ba..bccab3a 100644
> --- a/hw/pci.h
> +++ b/hw/pci.h
> @@ -9,6 +9,8 @@
>  /* PCI includes legacy ISA access.  */
>  #include "isa.h"
>  
> +#include "pcie.h"
> +
>  /* PCI bus */
>  
>  #define PCI_DEVFN(slot, func)   ((((slot) & 0x1f) << 3) | ((func) & 0x07))
> @@ -172,6 +174,12 @@ struct PCIDevice {
>      /* Offset of MSI capability in config space */
>      uint8_t msi_cap;
>  
> +    /* PCI Express */
> +    PCIExpressDevice *exp;
> +    /* Theoretically this belongs to  PCIExpressDevice.
> +       However it is here for property and save/load */
> +    struct pcie_aer_log aer_log;
> +
>      /* Location of option rom */
>      char *romfile;
>      ram_addr_t rom_offset;
> @@ -367,6 +375,22 @@ static inline uint32_t pci_config_size(const PCIDevice
> *d)
>      return pci_is_express(d) ? PCIE_CONFIG_SPACE_SIZE :
> PCI_CONFIG_SPACE_SIZE;
>  }
>  
> +
> +/* These are pci express specific, so should belong to pcie.h.
> +   they're here to avoid header inclusion error. */
> +static inline uint8_t pci_pcie_cap(const PCIDevice *d)
> +{
> +    return d->exp ? d->exp->exp_cap : 0;
> +}
> +
> +/* AER */
> +static inline uint16_t pcie_aer_cap(const PCIDevice *d)
> +{
> +    assert(d->exp);
> +    return d->exp->aer_cap;
> +}
> +
> +
>  /* These are not pci specific. Should move into a separate header.
>   * Only pci.c uses them, so keep them here for now.
>   */
> diff --git a/hw/pcie.c b/hw/pcie.c
> new file mode 100644
> index 0000000..1f24c2a
> --- /dev/null
> +++ b/hw/pcie.c
> @@ -0,0 +1,1668 @@
> +/*
> + * pcie.c
> + *
> + * Copyright (c) 2010 Isaku Yamahata <yamahata at valinux co jp>
> + *                    VA Linux Systems Japan K.K.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "sysemu.h"
> +#include "pci_bridge.h"
> +#include "pcie.h"
> +#include "msix.h"
> +#include "msi.h"
> +#include "pci_internals.h"
> +
> +//#define DEBUG_PCIE
> +#ifdef DEBUG_PCIE
> +# define PCIE_DPRINTF(fmt, ...)                                         \
> +    fprintf(stderr, "%s:%d " fmt, __func__, __LINE__, ## __VA_ARGS__)
> +#else
> +# define PCIE_DPRINTF(fmt, ...) do {} while (0)
> +#endif
> +#define PCIE_DEV_PRINTF(dev, fmt, ...)                                  \
> +    PCIE_DPRINTF("%s:%x "fmt, (dev)->name, (dev)->devfn, ## __VA_ARGS__)
> +
> +static inline const char *pcie_hp_event_name(enum PCIExpressHotPlugEvent
> event)
> +{
> +    switch (event) {
> +    case PCI_EXP_HP_EV_ABP:
> +        return "attention button pushed";
> +    case PCI_EXP_HP_EV_PDC:
> +        return "present detection changed";
> +    case PCI_EXP_HP_EV_CCI:
> +        return "command completed";
> +    default:
> +        break;
> +    }
> +    return "Unknown event";
> +}
> +
> +static void pcie_aer_clear_error(PCIDevice *dev);
> +static void pcie_aer_root_notify(PCIDevice *dev, bool trigger, int level);
> +static AER_ERR_MSG_RESULT
> +pcie_aer_errmsg_alldev(PCIDevice *dev, const struct pcie_aer_err_msg *msg);
> +static AER_ERR_MSG_RESULT
> +pcie_aer_errmsg_vbridge(PCIDevice *dev, const struct pcie_aer_err_msg *msg);
> +
> +/**************************************************************************
> + * pci express capability helper functions
> + */
> +
> +#define PCI_EXP_VER2_SIZEOF     0x3c    /* express capability of version 2 */
> +
> +/* PCI_EXP_FLAGS */
> +#define PCI_EXP_FLAGS_VER2      2       /* for now, supports only version 2
> */
> +#define PCI_EXP_FLAGS_IRQ_SHIFT 9
> +#define PCI_EXP_FLAGS_IRQ_REG(irq)      (((irq) << PCI_EXP_FLAGS_IRQ_SHIFT) &
> PCI_EXP_FLAGS_IRQ)
> +#define PCI_EXP_FLAGS_TYPE_SHIFT        4
> +
> +/* PCI_EXP_LINK{CAP, STA} */
> +/* link speed */
> +#define PCI_EXP_LNK_LS_25               1
> +
> +#define PCI_EXP_LNK_MLW_SHIFT           4
> +#define PCI_EXP_LNK_MLW_1               (1 << PCI_EXP_LNK_MLW_SHIFT)
> +
> +/* PCI_EXP_LINKCAP */
> +#define PCI_EXP_LNKCAP_ASPMS_SHIFT      10
> +#define PCI_EXP_LNKCAP_ASPMS_0S         (1 << PCI_EXP_LNKCAP_ASPMS_SHIFT)
> +
> +#define PCI_EXP_LNKCAP_PN_SHIFT         24
> +#define PCI_EXP_LNKCAP_PN_REG(pn)       (((pn) << PCI_EXP_LNKCAP_PN_SHIFT) &
> PCI_EXP_LNKCAP_PN)
> +
> +#define PCI_EXP_SLTCAP_PSN_SHIFT        19
> +#define PCI_EXP_SLTCAP_PSN_REG(slot)    (((slot) << PCI_EXP_SLTCAP_PSN_SHIFT)
> & PCI_EXP_SLTCAP_PSN)
> +
> +#define PCI_EXP_SLTCTL_AIC_SHIFT        6
> +#define PCI_EXP_SLTCTL_AIC_ON           (PCI_EXP_HP_IND_ON <<
> PCI_EXP_SLTCTL_AIC_SHIFT)
> +#define PCI_EXP_SLTCTL_AIC_BLINK        (PCI_EXP_HP_IND_BLINK <<
> PCI_EXP_SLTCTL_AIC_SHIFT)
> +#define PCI_EXP_SLTCTL_AIC_OFF          (PCI_EXP_HP_IND_OFF <<
> PCI_EXP_SLTCTL_AIC_SHIFT)
> +
> +#define PCI_EXP_SLTCTL_PIC_SHIFT        8
> +#define PCI_EXP_SLTCTL_PIC_ON           (PCI_EXP_HP_IND_ON <<
> PCI_EXP_SLTCTL_PIC_SHIFT)
> +#define PCI_EXP_SLTCTL_PIC_BLINK        (PCI_EXP_HP_IND_BLINK <<
> PCI_EXP_SLTCTL_PIC_SHIFT)
> +#define PCI_EXP_SLTCTL_PIC_OFF          (PCI_EXP_HP_IND_OFF <<
> PCI_EXP_SLTCTL_PIC_SHIFT)
> +
> +#define PCI_EXP_DEVCAP2_EFF             0x100000
> +#define PCI_EXP_DEVCAP2_EETLPP          0x200000
> +
> +#define PCI_EXP_DEVCTL2_EETLPPB         0x80
> +
> +static void pcie_notify(PCIDevice *dev, uint16_t vector,
> +                        bool trigger, int level)
> +{
> +    /* masking/masking interrupt is handled by upper layer.
> +     * i.e. msix_notify() for MSI-X
> +     *      msi_notify()  for MSI
> +     *      pci_set_irq() for INTx
> +     */
> +    PCIE_DEV_PRINTF(dev, "noitfy vector %d tirgger:%d level:%d\n",
> +                    vector, trigger, level);
> +    if (msix_enabled(dev)) {
> +        if (trigger) {
> +            msix_notify(dev, vector);
> +        }
> +    } else if (msi_enabled(dev)) {
> +        if (trigger){
> +            msi_notify(dev, vector);
> +        }
> +    } else  {
> +        qemu_set_irq(dev->irq[0], level);
> +    }
> +}
> +
> +/*
> + * Re-align a value written to config space so that it lines up with the
> + * 32-bit register located at @pos.
> + *
> + * @addr: byte address at which the write started
> + * @val:  value that was written (its least significant byte is at @addr)
> + * @pos:  byte address of the register of interest
> + *
> + * Returns @val shifted so that bit 0 of the result corresponds to bit 0 of
> + * the register at @pos.  @addr and @pos are byte addresses, so the shift is
> + * eight bits per byte of distance.  When the write starts four or more
> + * bytes away from @pos, no written bit can land in the low 32 bits, and 0 is
> + * returned (this also keeps the shift count below the width of the type).
> + */
> +static inline uint32_t pcie_written_val_long(uint32_t addr, uint32_t val,
> +                                             uint32_t pos)
> +{
> +    uint32_t bytes;
> +
> +    if (addr >= pos) {
> +        bytes = addr - pos;
> +        if (bytes >= sizeof(uint32_t)) {
> +            return 0;
> +        }
> +        return val << (bytes * 8);
> +    }
> +
> +    bytes = pos - addr;
> +    if (bytes >= sizeof(uint32_t)) {
> +        return 0;
> +    }
> +    return val >> (bytes * 8);
> +}
> +
> +static inline uint16_t pcie_written_val_word(uint32_t addr, uint32_t val,
> +                                             uint32_t pos)
> +{
> +    return pcie_written_val_long(addr, val, pos) & 0xffff;
> +}
> +
> +/*
> + * RW1C: Write-1-to-clear
> + * register     written val        result
> + * 0            0               => 0
> + * 1            0               => 1
> + * 0            1               => 0
> + * 1            1               => 0
> + */
> +static inline void pcie_w1c_long(PCIDevice *d, uint32_t pos, uint32_t mask,
> +                                 uint32_t addr, uint32_t val)
> +{
> +    uint32_t written = pcie_written_val_long(addr, val, pos) & mask;
> +    uint32_t reg = pci_get_long(d->config + pos);
> +    reg &= ~written;
> +    pci_set_long(d->config + pos, reg);
> +}
> +
> +static inline void pcie_w1c_word(PCIDevice *d, uint32_t pos, uint16_t mask,
> +                                 uint32_t addr, uint32_t val)
> +{
> +    uint16_t written = pcie_written_val_word(addr, val, pos) & mask;
> +    uint16_t reg = pci_get_word(d->config + pos);
> +    reg &= ~written;
> +    pci_set_word(d->config + pos, reg);
> +}
> +
> +/*
> + * Add a version 2 PCI Express capability to @dev and fill in its
> + * read-only registers.
> + *
> + * @dev:    device to add the capability to; must be an express device
> + * @offset: config space offset passed on to pci_add_capability()
> + * @type:   device/port type stored in the capability flags register
> + * @port:   port number stored in the link capabilities register
> + *
> + * Allocates dev->exp.  Returns the offset of the capability as reported by
> + * pci_add_capability(); if that call returns a negative value, dev->exp is
> + * released again and the negative value is returned.
> + */
> +int pci_pcie_cap_init(PCIDevice *dev,
> +                      uint8_t offset, uint8_t type, uint8_t port)
> +{
> +    int exp_cap;
> +    uint8_t *pcie_cap;
> +
> +    assert(pci_is_express(dev));
> +    dev->exp = qemu_mallocz(sizeof(*dev->exp));
> +
> +    exp_cap = pci_add_capability(dev, PCI_CAP_ID_EXP, offset,
> +                                 PCI_EXP_VER2_SIZEOF);
> +    if (exp_cap < 0) {
> +        qemu_free(dev->exp);
> +        dev->exp = NULL;
> +        return exp_cap;
> +    }
> +    dev->exp->exp_cap = exp_cap;
> +    /* dev->cap_present |= QEMU_PCI_CAP_EXPRESS; */
> +    /* already done in pci_qdev_init() */
> +
> +    pcie_cap = dev->config + pci_pcie_cap(dev);
> +
> +    /* capability register
> +       interrupt message number defaults to 0 */
> +    pci_set_word(pcie_cap + PCI_EXP_FLAGS,
> +                 ((type << PCI_EXP_FLAGS_TYPE_SHIFT) & PCI_EXP_FLAGS_TYPE) |
> +                 PCI_EXP_FLAGS_VER2);
> +
> +    /* device capability register
> +     * table 7-12:
> +     * roll based error reporting bit must be set by all
> +     * Functions conforming to the ECN, PCI Express Base
> +     * Specification, Revision 1.1., or subsequent PCI Express Base
> +     * Specification revisions.
> +     */
> +    pci_set_long(pcie_cap + PCI_EXP_DEVCAP, PCI_EXP_DEVCAP_RBER);
> +
> +    pci_set_long(pcie_cap + PCI_EXP_LNKCAP,
> +                 PCI_EXP_LNKCAP_PN_REG(port) |
> +                 PCI_EXP_LNKCAP_ASPMS_0S |
> +                 PCI_EXP_LNK_MLW_1 |
> +                 PCI_EXP_LNK_LS_25);
> +
> +    pci_set_word(pcie_cap + PCI_EXP_LNKSTA,
> +                 PCI_EXP_LNK_MLW_1 | PCI_EXP_LNK_LS_25);
> +
> +    pci_set_long(pcie_cap + PCI_EXP_DEVCAP2,
> +                 PCI_EXP_DEVCAP2_EFF | PCI_EXP_DEVCAP2_EETLPP);
> +
> +    /*
> +     * The writable bit belongs to the Device Control 2 register, so the
> +     * mask has to be set at that register's offset inside the capability,
> +     * not at the start of the capability (which holds the capability ID
> +     * and next pointer).
> +     */
> +    pci_set_word(dev->wmask + exp_cap + PCI_EXP_DEVCTL2,
> +                 PCI_EXP_DEVCTL2_EETLPPB);
> +    return exp_cap;
> +}
> +
> +/*
> + * Release the PCI Express state allocated by pci_pcie_cap_init().
> + * Always returns 0.
> + */
> +int pci_pcie_cap_exit(PCIDevice *dev)
> +{
> +    /* pci_del_capability(dev, PCI_CAP_ID_EXP, PCI_EXP_VER2_SIZEOF); */
> +    qemu_free(dev->exp);
> +    /* pci_pcie_cap() tests dev->exp for NULL; don't leave it dangling */
> +    dev->exp = NULL;
> +    return 0;
> +}
> +
> +uint8_t pcie_cap_get_type(const PCIDevice *dev)
> +{
> +    uint32_t pos = pci_pcie_cap(dev);
> +    assert(pos > 0);
> +    return (pci_get_word(dev->config + pos + PCI_EXP_FLAGS) &
> +            PCI_EXP_FLAGS_TYPE) >> PCI_EXP_FLAGS_TYPE_SHIFT;
> +}
> +
> +/* MSI/MSI-X */
> +/* pci express interrupt message number */
> +/*
> + * Store @vector in the interrupt message number field of the PCI Express
> + * capability flags register of @dev.
> + *
> + * The assert rejects vectors of 32 and above.  This assumes the
> + * PCI_EXP_FLAGS_IRQ field is five bits wide (TODO confirm against the
> + * register header); if so, PCI_EXP_FLAGS_IRQ_REG() would silently truncate
> + * 32 to 0.
> + */
> +void pcie_cap_flags_set_vector(PCIDevice *dev, uint8_t vector)
> +{
> +    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
> +    uint16_t tmp;
> +
> +    assert(vector < 32);
> +    tmp = pci_get_word(pcie_cap + PCI_EXP_FLAGS);
> +    tmp &= ~PCI_EXP_FLAGS_IRQ;
> +    tmp |= PCI_EXP_FLAGS_IRQ_REG(vector);
> +    pci_set_word(pcie_cap + PCI_EXP_FLAGS, tmp);
> +}
> +
> +uint8_t pcie_cap_flags_get_vector(PCIDevice *dev)
> +{
> +    return (pci_get_word(dev->config + pci_pcie_cap(dev) + PCI_EXP_FLAGS) &
> +            PCI_EXP_FLAGS_IRQ) >> PCI_EXP_FLAGS_IRQ_SHIFT;
> +}
> +
> +static void pcie_cap_notify(PCIDevice *dev, bool trigger, int level)
> +{
> +    pcie_notify(dev, pcie_cap_flags_get_vector(dev), trigger, level);
> +}
> +
> +void pcie_cap_deverr_init(PCIDevice *dev)
> +{
> +    uint32_t pos = pci_pcie_cap(dev);
> +    uint8_t *pcie_cap = dev->config + pos;
> +    uint8_t *pcie_wmask = dev->wmask + pos;
> +
> +    pci_set_long(pcie_cap + PCI_EXP_DEVCAP,
> +                 pci_get_long(pcie_cap + PCI_EXP_DEVCAP) |
> +                 PCI_EXP_DEVCAP_RBER);
> +
> +    pci_set_long(pcie_wmask + PCI_EXP_DEVCTL,
> +                 pci_get_long(pcie_wmask + PCI_EXP_DEVCTL) |
> +                 PCI_EXP_DEVCTL_CERE | PCI_EXP_DEVCTL_NFERE |
> +                 PCI_EXP_DEVCTL_FERE | PCI_EXP_DEVCTL_URRE);
> +}
> +
> +void pcie_cap_deverr_reset(PCIDevice *dev)
> +{
> +    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
> +    pci_set_long(pcie_cap + PCI_EXP_DEVCTL,
> +                 pci_get_long(pcie_cap + PCI_EXP_DEVCTL) &
> +                 ~(PCI_EXP_DEVCTL_CERE | PCI_EXP_DEVCTL_NFERE |
> +                   PCI_EXP_DEVCTL_FERE | PCI_EXP_DEVCTL_URRE));
> +}
> +
> +/*
> + * Config-write hook for the device status error bits.
> + *
> + * When the write of @len bytes at @addr overlaps the device status
> + * register, apply write-1-to-clear semantics to the four "error detected"
> + * bits: correctable, non-fatal, fatal and unsupported request.
> + */
> +void pcie_cap_deverr_write_config(PCIDevice *dev,
> +                                  uint32_t addr, uint32_t val, int len)
> +{
> +    uint32_t pos = pci_pcie_cap(dev);
> +    if (ranges_overlap(addr, len, pos + PCI_EXP_DEVSTA, 4)) {
> +        /* RW1C */
> +        pcie_w1c_long(dev, pos + PCI_EXP_DEVSTA,
> +                      PCI_EXP_DEVSTA_CED | PCI_EXP_DEVSTA_NFED |
> +                      PCI_EXP_DEVSTA_FED | PCI_EXP_DEVSTA_URD,
> +                      addr, val);
> +    }
> +}
> +
> +/*
> + * Record hot plug event(s) in the slot status register and notify the guest.
> + *
> + * events: PCI_EXP_HP_EV_xxx; bits outside PCI_EXP_HP_EV_SUPPORTED are dropped
> + * status: bit or of PCI_EXP_SLTSTA_xxx; only the presence detect state bit
> + *         is used, and only when events contains PCI_EXP_HP_EV_PDC
> + *
> + * trigger is set when hot plug interrupts are enabled, the event is enabled
> + * in slot control, and its status bit goes from 0 to 1.  level is 1 while
> + * hot plug interrupts are enabled and any supported status bit is set.
> + */
> +static void pcie_cap_slot_event(PCIDevice *dev,
> +                                enum PCIExpressHotPlugEvent events,
> +                                uint16_t status)
> +{
> +    bool trigger = false;
> +    int level = 0;
> +    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
> +    uint16_t sltctl = pci_get_word(pcie_cap + PCI_EXP_SLTCTL);
> +    uint16_t sltsta = pci_get_word(pcie_cap + PCI_EXP_SLTSTA);
> +
> +    PCIE_DEV_PRINTF(dev,
> +                    "sltctl: 0x%02x sltsta: 0x%02x event:%x %s status:%d\n",
> +                    sltctl, sltsta,
> +                    events, pcie_hp_event_name(events), status);
> +    events &= PCI_EXP_HP_EV_SUPPORTED;
> +    if ((sltctl & PCI_EXP_SLTCTL_HPIE) && (sltctl & events) &&
> +        ((sltsta ^ events) & events) /* 0 -> 1 */) {
> +        trigger = true;
> +    }
> +
> +    if (events & PCI_EXP_HP_EV_PDC) {
> +        /* presence detect state follows the caller-supplied status */
> +        sltsta &= ~PCI_EXP_SLTSTA_PDS;
> +        sltsta |= (status & PCI_EXP_SLTSTA_PDS);
> +    }
> +    sltsta |= events;
> +    pci_set_word(pcie_cap + PCI_EXP_SLTSTA, sltsta);
> +    PCIE_DEV_PRINTF(dev, "sltsta -> %02x\n", sltsta);
> +
> +    if ((sltctl & PCI_EXP_SLTCTL_HPIE) &&
> +        (sltsta & PCI_EXP_HP_EV_SUPPORTED)) {
> +        level = 1;
> +    }
> +
> +    pcie_cap_notify(dev, trigger, level);
> +}
> +
> +static int pcie_cap_slot_hotplug(DeviceState *qdev,
> +                                 PCIDevice *pci_dev, int state)
> +{
> +    PCIDevice *d = DO_UPCAST(PCIDevice, qdev, qdev);
> +    uint8_t *pcie_cap = d->config + pci_pcie_cap(d);
> +    uint16_t sltsta = pci_get_word(pcie_cap + PCI_EXP_SLTSTA);
> +
> +    if (!pci_dev->qdev.hotplugged) {
> +        assert(state); /* this case only happens machine creation. */
> +        sltsta |= PCI_EXP_SLTSTA_PDS;
> +        pci_set_word(pcie_cap + PCI_EXP_SLTSTA, sltsta);
> +        return 0;
> +    }
> +
> +    PCIE_DEV_PRINTF(pci_dev, "hotplug state: %d\n", state);
> +    if (sltsta & PCI_EXP_SLTSTA_EIS) {
> +        /* the slot is electromechanically locked. */
> +        return -EBUSY;
> +    }
> +
> +    if (state) {
> +        if (PCI_FUNC(pci_dev->devfn) == 0) {
> +            /* event is per slot. Not per function
> +             * only generates event for function = 0.
> +             * When hot plug, populate functions > 0
> +             * and then add function = 0 last.
> +             */
> +            pcie_cap_slot_event(d, PCI_EXP_HP_EV_PDC, PCI_EXP_SLTSTA_PDS);
> +        }
> +    } else {
> +        PCIBridge *br;
> +        PCIBus *bus;
> +        DeviceState *next;
> +        if (PCI_FUNC(pci_dev->devfn) != 0) {
> +            /* event is per slot. Not per function.
> +               accepts function = 0 only. */
> +            return -EINVAL;
> +        }
> +
> +        /* zap all functions. */
> +        br = DO_UPCAST(PCIBridge, dev, d);
> +        bus = pci_bridge_get_sec_bus(br);
> +        QLIST_FOREACH_SAFE(qdev, &bus->qbus.children, sibling, next) {
> +            qdev_free(qdev);
> +        }
> +
> +        pcie_cap_slot_event(d, PCI_EXP_HP_EV_PDC, 0);
> +    }
> +    return 0;
> +}
> +
> +/*
> + * pci express slot for pci express root/downstream port.
> + * Initializes the slot registers of the PCI express capability:
> + * marks the slot as implemented, fills in slot capabilities with the
> + * given slot number, turns both indicators off, opens the writable
> + * slot control bits and registers the hot plug callback on the
> + * secondary bus.
> + */
> +void pcie_cap_slot_init(PCIDevice *dev, uint16_t slot)
> +{
> +    uint32_t cap_off = pci_pcie_cap(dev);
> +    uint8_t *exp_cap = dev->config + cap_off;
> +    uint8_t *exp_wmask = dev->wmask + cap_off;
> +    PCIBridge *bridge = DO_UPCAST(PCIBridge, dev, dev);
> +    uint32_t flags;
> +    uint32_t sltcap;
> +    uint32_t sltctl;
> +    uint32_t sltctl_wmask;
> +
> +    /* slot implemented */
> +    flags = pci_get_word(exp_cap + PCI_EXP_FLAGS) | PCI_EXP_FLAGS_SLOT;
> +    pci_set_word(exp_cap + PCI_EXP_FLAGS, flags);
> +
> +    /* slot capabilities */
> +    sltcap = pci_get_long(exp_cap + PCI_EXP_SLTCAP) & PCI_EXP_SLTCAP_PSN;
> +    sltcap |= PCI_EXP_SLTCAP_PSN_REG(slot);
> +    sltcap |= PCI_EXP_SLTCAP_EIP | PCI_EXP_SLTCAP_HPS | PCI_EXP_SLTCAP_HPC;
> +    sltcap |= PCI_EXP_SLTCAP_PIP | PCI_EXP_SLTCAP_AIP | PCI_EXP_SLTCAP_ABP;
> +    pci_set_long(exp_cap + PCI_EXP_SLTCAP, sltcap);
> +
> +    /* slot control: power/attention indicators start as "off" */
> +    sltctl = pci_get_word(exp_cap + PCI_EXP_SLTCTL);
> +    sltctl &= ~(PCI_EXP_SLTCTL_PIC | PCI_EXP_SLTCTL_AIC);
> +    sltctl |= PCI_EXP_SLTCTL_PIC_OFF | PCI_EXP_SLTCTL_AIC_OFF;
> +    pci_set_word(exp_cap + PCI_EXP_SLTCTL, sltctl);
> +
> +    /* bits of slot control the guest may write */
> +    sltctl_wmask = pci_get_word(exp_wmask + PCI_EXP_SLTCTL);
> +    sltctl_wmask |= PCI_EXP_SLTCTL_PIC | PCI_EXP_SLTCTL_AIC;
> +    sltctl_wmask |= PCI_EXP_SLTCTL_HPIE | PCI_EXP_SLTCTL_CCIE;
> +    sltctl_wmask |= PCI_EXP_SLTCTL_PDCE | PCI_EXP_SLTCTL_ABPE;
> +    pci_set_word(exp_wmask + PCI_EXP_SLTCTL, sltctl_wmask);
> +
> +    pci_bus_hotplug(pci_bridge_get_sec_bus(bridge),
> +                    pcie_cap_slot_hotplug, &dev->qdev);
> +}
> +
> +/*
> + * Reset the slot control/status registers: interrupt enables and the
> + * electromechanical interlock control are cleared, both indicators are
> + * set to "off" and the latched status events are dropped.
> + */
> +void pcie_cap_slot_reset(PCIDevice *dev)
> +{
> +    uint8_t *exp_cap = dev->config + pci_pcie_cap(dev);
> +    const uint32_t sltctl_clear =
> +        PCI_EXP_SLTCTL_EIC |
> +        PCI_EXP_SLTCTL_PIC |
> +        PCI_EXP_SLTCTL_AIC |
> +        PCI_EXP_SLTCTL_HPIE |
> +        PCI_EXP_SLTCTL_CCIE |
> +        PCI_EXP_SLTCTL_PDCE |
> +        PCI_EXP_SLTCTL_ABPE;
> +    const uint32_t sltsta_clear =
> +        PCI_EXP_SLTSTA_EIS | /* by reset, the lock is released */
> +        PCI_EXP_SLTSTA_CC |
> +        PCI_EXP_SLTSTA_PDC |
> +        PCI_EXP_SLTSTA_ABP;
> +    uint32_t sltctl;
> +    uint32_t sltsta;
> +
> +    PCIE_DEV_PRINTF(dev, "reset\n");
> +
> +    sltctl = pci_get_word(exp_cap + PCI_EXP_SLTCTL) & ~sltctl_clear;
> +    sltctl |= PCI_EXP_SLTCTL_PIC_OFF | PCI_EXP_SLTCTL_AIC_OFF;
> +    pci_set_word(exp_cap + PCI_EXP_SLTCTL, sltctl);
> +
> +    sltsta = pci_get_word(exp_cap + PCI_EXP_SLTSTA) & ~sltsta_clear;
> +    pci_set_word(exp_cap + PCI_EXP_SLTSTA, sltsta);
> +}
> +
> +/*
> + * Post-process a config space write that may touch the slot status
> + * (SLTSTA) or slot control (SLTCTL) register of the express capability.
> + *
> + * addr/val/len describe the config write.  sltctl_prev is compared
> + * against the current SLTCTL value to detect 0 -> 1 transitions of the
> + * event enable bits (presumably the caller samples it before the write
> + * is applied -- confirm against the callers).
> + */
> +void pcie_cap_slot_write_config(PCIDevice *dev,
> +                                uint32_t addr, uint32_t val, int len,
> +                                uint16_t sltctl_prev)
> +{
> +    uint32_t pos = pci_pcie_cap(dev);
> +    uint8_t *pcie_cap = dev->config + pos;
> +    uint16_t sltctl = pci_get_word(pcie_cap + PCI_EXP_SLTCTL);
> +    uint16_t sltsta = pci_get_word(pcie_cap + PCI_EXP_SLTSTA);
> +
> +    PCIE_DEV_PRINTF(dev,
> +                    "addr: 0x%x val: 0x%x len: %d\n"
> +                    "\tsltctl_prev: 0x%02x sltctl: 0x%02x sltsta 0x%02x\n",
> +                    addr, val, len, sltctl_prev, sltctl, sltsta);
> +    /* SLTSTA: process SLTSTA before SLTCTL to avoid spurious interrupt */
> +    if (ranges_overlap(addr, len, pos + PCI_EXP_SLTSTA, 2)) {
> +        /* RW1C */
> +        pcie_w1c_word(dev, pos + PCI_EXP_SLTSTA, PCI_EXP_HP_EV_SUPPORTED,
> +                      addr, val);
> +        sltsta = pci_get_word(pcie_cap + PCI_EXP_SLTSTA);
> +
> +        /* write to sltsta results in clearing bits,
> +           so new interrupts won't be generated. */
> +        PCIE_DEV_PRINTF(dev, "sltsta -> 0x%02x\n", sltsta);
> +    }
> +
> +    /* SLTCTL */
> +    if (ranges_overlap(addr, len, pos + PCI_EXP_SLTCTL, 2)) {
> +        PCIE_DEV_PRINTF(dev, "sltctl: 0x%02x -> 0x%02x\n",
> +                        sltctl_prev, sltctl);
> +        if (pcie_written_val_word(addr, val, pos + PCI_EXP_SLTCTL) &
> +            PCI_EXP_SLTCTL_EIC) {
> +            /* toggle PCI_EXP_SLTSTA_EIS
> +               (the expression below is equivalent to sltsta ^ EIS) */
> +            sltsta = (sltsta & ~PCI_EXP_SLTSTA_EIS) |
> +                ((sltsta ^ PCI_EXP_SLTSTA_EIS) & PCI_EXP_SLTSTA_EIS);
> +            pci_set_word(pcie_cap + PCI_EXP_SLTSTA, sltsta);
> +            PCIE_DEV_PRINTF(dev, "PCI_EXP_SLTCTL_EIC: sltsta -> 0x%02x\n",
> +                            sltsta);
> +        }
> +
> +        if (sltctl & PCI_EXP_SLTCTL_HPIE) {
> +            bool trigger = false;
> +            int level = 0;
> +
> +            if (((sltctl_prev ^ sltctl) & sltctl) & PCI_EXP_HP_EV_SUPPORTED)
> {
> +                /* 0 -> 1 */
> +                trigger = true;
> +            }
> +            if ((sltctl & sltsta) & PCI_EXP_HP_EV_SUPPORTED) {
> +                level = 1;
> +            }
> +            pcie_cap_notify(dev, trigger, level);
> +        }
> +
> +        /* command completed.
> +           unlike real hardware, command completes instantaneously */
> +#define PCI_EXP_SLTCTL_SUPPORTED        \
> +            (PCI_EXP_SLTCTL_ABPE |      \
> +             PCI_EXP_SLTCTL_PDCE |      \
> +             PCI_EXP_SLTCTL_CCIE |      \
> +             PCI_EXP_SLTCTL_HPIE |      \
> +             PCI_EXP_SLTCTL_AIC |       \
> +             PCI_EXP_SLTCTL_PCC |       \
> +             PCI_EXP_SLTCTL_EIC)
> +        /* NOTE(review): the condition is hard-wired to 1, so every write
> +           overlapping SLTCTL raises the command completed event and
> +           PCI_EXP_SLTCTL_SUPPORTED is currently unused. */
> +        if ( 1 /* (sltctl_prev ^ sltctl) & PCI_EXP_SLTCTL_SUPPORTED */ ) {
> +            /* set command completed bit */
> +            pcie_cap_slot_event(dev, PCI_EXP_HP_EV_CCI, 0);
> +        }
> +    }
> +}
> +
> +/* Raise the attention button pressed (ABP) hot plug event on the slot. */
> +void pcie_cap_slot_push_attention_button(PCIDevice *dev)
> +{
> +    pcie_cap_slot_event(dev, PCI_EXP_HP_EV_ABP, 0);
> +}
> +
> +/*
> + * root control/capabilities/status. PME isn't emulated for now.
> + * Only the three system-error-on-*-error enable bits of root control
> + * are made writable.
> + */
> +void pcie_cap_root_init(PCIDevice *dev)
> +{
> +    uint8_t exp_off = pci_pcie_cap(dev);
> +    uint8_t *rtctl_wmask = dev->wmask + exp_off + PCI_EXP_RTCTL;
> +
> +    pci_set_word(rtctl_wmask,
> +                 PCI_EXP_RTCTL_SECEE |
> +                 PCI_EXP_RTCTL_SENFEE |
> +                 PCI_EXP_RTCTL_SEFEE);
> +}
> +
> +/* Clear the whole root control register of the express capability. */
> +void pcie_cap_root_reset(PCIDevice *dev)
> +{
> +    pci_set_word(dev->config + pci_pcie_cap(dev) + PCI_EXP_RTCTL, 0);
> +}
> +
> +/*
> + * function level reset(FLR)
> + *
> + * Advertises FLR in the device capabilities register and remembers the
> + * callback that pcie_cap_flr_write_config() invokes when the guest
> + * initiates a function level reset.
> + *
> + * Device capabilities is a 32 bit register, so it is accessed with the
> + * long accessors: a 16 bit read-modify-write would drop any capability
> + * flag that lives in the upper half of the register.
> + */
> +void pcie_cap_flr_init(PCIDevice *dev, pcie_flr_fn flr)
> +{
> +    uint8_t *devcap = dev->config + pci_pcie_cap(dev) + PCI_EXP_DEVCAP;
> +
> +    pci_set_long(devcap, pci_get_long(devcap) | PCI_EXP_DEVCAP_FLR);
> +    dev->exp->flr = flr;
> +}
> +
> +/*
> + * Config write hook for FLR: when the write covers device control and
> + * sets the initiate-FLR bit, call the registered flr callback (if any).
> + */
> +void pcie_cap_flr_write_config(PCIDevice *dev,
> +                               uint32_t addr, uint32_t val, int len)
> +{
> +    uint32_t devctl_off = pci_pcie_cap(dev) + PCI_EXP_DEVCTL;
> +    uint16_t written;
> +
> +    if (!ranges_overlap(addr, len, devctl_off, 2)) {
> +        return;
> +    }
> +
> +    written = pcie_written_val_word(addr, val, devctl_off);
> +    if (!(written & PCI_EXP_DEVCTL_BCR_FLR)) {
> +        return;
> +    }
> +    if (dev->exp->flr) {
> +        dev->exp->flr(dev);
> +    }
> +}
> +
> +
> +/*
> + * Alternative Routing-ID Interpretation (ARI)
> + * ari forwarding support for down stream port:
> + * advertise ARI in device capabilities 2 and make the ARI bit of
> + * device control 2 writable.
> + */
> +void pcie_cap_ari_init(PCIDevice *dev)
> +{
> +    uint32_t exp_off = pci_pcie_cap(dev);
> +    uint8_t *devcap2 = dev->config + exp_off + PCI_EXP_DEVCAP2;
> +    uint8_t *devctl2_wmask = dev->wmask + exp_off + PCI_EXP_DEVCTL2;
> +
> +    pci_set_long(devcap2, pci_get_long(devcap2) | PCI_EXP_DEVCAP2_ARI);
> +    pci_set_long(devctl2_wmask,
> +                 pci_get_long(devctl2_wmask) | PCI_EXP_DEVCTL2_ARI);
> +}
> +
> +/* Clear the ARI bit in device control 2. */
> +void pcie_cap_ari_reset(PCIDevice *dev)
> +{
> +    uint8_t *devctl2 = dev->config + pci_pcie_cap(dev) + PCI_EXP_DEVCTL2;
> +    uint32_t value = pci_get_long(devctl2);
> +
> +    value &= ~PCI_EXP_DEVCTL2_ARI;
> +    pci_set_long(devctl2, value);
> +}
> +
> +/*
> + * Returns true when dev is an express device that has an express
> + * capability and the ARI bit is set in its device control 2 register.
> + */
> +bool pcie_cap_is_ari_enabled(const PCIDevice *dev)
> +{
> +    uint32_t devctl2;
> +
> +    if (!pci_is_express(dev) || !pci_pcie_cap(dev)) {
> +        return false;
> +    }
> +
> +    devctl2 = pci_get_long(dev->config + pci_pcie_cap(dev) +
> +                           PCI_EXP_DEVCTL2);
> +    return devctl2 & PCI_EXP_DEVCTL2_ARI;
> +}
> +
> +/**************************************************************************
> + * pci express extended capability allocation functions
> + * uint16_t ext_cap_id (16 bit)
> + * uint8_t cap_ver (4 bit)
> + * uint16_t cap_offset (12 bit)
> + * uint16_t ext_cap_size
> + */
> +
> +#define PCI_EXT_CAP_VER_SHIFT   16
> +#define PCI_EXT_CAP_NEXT_MASK   0xfff00000
> +#define PCI_EXT_CAP_NEXT_SHIFT  20
> +
> +/*
> + * Build an extended capability header from id, version and next offset.
> + * The operands are widened to uint32_t before shifting: callers pass
> + * uint8_t/uint16_t values which would otherwise be promoted to (signed)
> + * int, and shifting a next offset >= 0x800 left by 20 overflows int.
> + */
> +#define PCI_EXT_CAP(id, ver, next)                              \
> +    ((uint32_t)(id) |                                           \
> +     ((uint32_t)(ver) << PCI_EXT_CAP_VER_SHIFT) |               \
> +     ((uint32_t)(next) << PCI_EXT_CAP_NEXT_SHIFT))
> +
> +/* extended capabilities are dword aligned */
> +#define PCI_EXT_CAP_ALIGN       4
> +#define PCI_EXT_CAP_ALIGNUP(x)                                  \
> +    (((x) + PCI_EXT_CAP_ALIGN - 1) & ~(PCI_EXT_CAP_ALIGN - 1))
> +
> +/*
> + * Search the extended config space for `size' consecutive bytes that are
> + * not marked in dev->used, starting at PCI_CONFIG_SPACE_SIZE.
> + * Returns the (PCI_EXT_CAP_ALIGN aligned) start offset of the first such
> + * region, or 0 when none is found.
> + *
> + * NOTE(review): the loop bound is applied to the scan index i rather
> + * than to the candidate start offset, so regions near the end of the
> + * config space are never reported -- conservative, confirm if intended.
> + */
> +static int16_t pcie_ext_cap_find_space(PCIDevice *dev, uint16_t size)
> +{
> +    uint16_t offset = PCI_CONFIG_SPACE_SIZE;
> +    uint16_t i = offset;
> +
> +    while (i < PCIE_CONFIG_SPACE_SIZE - size) {
> +        if (dev->used[i]) {
> +            /* region is interrupted: restart after the used byte */
> +            offset = PCI_EXT_CAP_ALIGNUP(i + 1);
> +            i = offset;
> +            continue;
> +        } else if (i - offset + 1 == size) {
> +            /* [offset, i] is free and exactly `size' bytes long */
> +            return offset;
> +        }
> +
> +        ++i;
> +    }
> +
> +    return 0;
> +}
> +
> +/*
> + * Walk the extended capability list looking for cap_id.
> + *
> + * Returns the config space offset of the matching capability, or 0 when
> + * the list is empty, the id is not present, or the list is longer than
> + * can legally fit (loop protection).
> + * When a match is found and prev_p is not NULL, *prev_p receives the
> + * offset of the capability preceding the match (0 if the match is the
> + * first entry).  *prev_p is left untouched when nothing is found.
> + */
> +static uint16_t pcie_find_ext_capability_list(PCIDevice *dev,
> +                                              uint16_t cap_id,
> +                                              uint16_t *prev_p)
> +{
> +    /* minimum 8 bytes per capability */
> +    int ttl = (PCIE_CONFIG_SPACE_SIZE - PCI_CONFIG_SPACE_SIZE) / 8;
> +    uint16_t prev = 0;
> +    uint16_t next = PCI_CONFIG_SPACE_SIZE;
> +    uint32_t header = pci_get_long(dev->config + next);
> +
> +    if (!header) {
> +        return 0;
> +    }
> +
> +    while (ttl-- > 0) {
> +        if (PCI_EXT_CAP_ID(header) == cap_id) {
> +            if (prev_p) {
> +                *prev_p = prev;
> +            }
> +            return next;
> +        }
> +
> +        prev = next;
> +        next = PCI_EXT_CAP_NEXT(header);
> +        if (next < PCI_CONFIG_SPACE_SIZE) {
> +            /* end of list */
> +            return 0;
> +        }
> +        /* fetch the header of the capability we just advanced to */
> +        header = pci_get_long(dev->config + next);
> +    }
> +
> +    /* ttl exhausted: malformed (looping) list */
> +    return 0;
> +}
> +
> +/* Look up cap_id via pcie_find_ext_capability_list() and return its
> +   result; the previous-entry output is not requested (NULL). */
> +uint16_t pcie_find_ext_capability(PCIDevice *dev, uint16_t cap_id)
> +{
> +    return pcie_find_ext_capability_list(dev, cap_id, NULL);
> +}
> +
> +/*
> + * Replace the next-capability pointer in the extended capability header
> + * at `pos' with `next', keeping the other header bits unchanged.
> + * `next' must be PCI_EXT_CAP_ALIGN aligned.
> + */
> +static void pcie_ext_cap_set_next(PCIDevice *dev, uint16_t pos,
> +                                  uint16_t next)
> +{
> +    /* the header is a full 32 bit register; a narrower local would
> +       truncate away the bits above bit 15, including the next pointer */
> +    uint32_t header = pci_get_long(dev->config + pos);
> +    uint32_t next_bits = (uint32_t)next << PCI_EXT_CAP_NEXT_SHIFT;
> +
> +    assert(!(next & (PCI_EXT_CAP_ALIGN - 1)));
> +    header = (header & ~PCI_EXT_CAP_NEXT_MASK) |
> +        (next_bits & PCI_EXT_CAP_NEXT_MASK);
> +    pci_set_long(dev->config + pos, header);
> +}
> +
> +/*
> + * Write the header of a new extended capability at `offset' and mark
> + * [offset, offset + size) as used, read-only and compared (cmask).
> + * A capability placed at PCI_CONFIG_SPACE_SIZE keeps the next pointer
> + * already stored there; any other offset is linked after the current
> + * last capability of the list.
> + */
> +static void pcie_allocate_ext_capability(PCIDevice *dev,
> +                                         uint16_t cap_id, uint8_t cap_ver,
> +                                         uint16_t offset, uint16_t size)
> +{
> +    uint32_t header;
> +    uint16_t next = 0;
> +
> +    assert(offset < offset + size);
> +    assert(offset + size < PCIE_CONFIG_SPACE_SIZE);
> +    assert(size >= 8);
> +
> +    if (offset != PCI_CONFIG_SPACE_SIZE) {
> +        /* find last ext cap */
> +        uint16_t tail = PCI_CONFIG_SPACE_SIZE;
> +        int ttl;
> +
> +        for (ttl = (PCIE_CONFIG_SPACE_SIZE - PCI_CONFIG_SPACE_SIZE) / 8;
> +             ttl-- > 0;
> +             tail = PCI_EXT_CAP_NEXT(header)) {
> +            header = pci_get_long(dev->config + tail);
> +            if (PCI_EXT_CAP_NEXT(header) < PCI_CONFIG_SPACE_SIZE) {
> +                break;
> +            }
> +        }
> +
> +        assert(ttl > 0); /* since it is known that [offset, offset + size]
> +                            is unused, so ttl shouldn't be zero */
> +        pcie_ext_cap_set_next(dev, tail, offset);
> +    } else {
> +        /* first slot of the list: inherit its current next pointer */
> +        header = pci_get_long(dev->config + offset);
> +        next = PCI_EXT_CAP_NEXT(header);
> +    }
> +    pci_set_long(dev->config + offset, PCI_EXT_CAP(cap_id, cap_ver, next));
> +
> +    memset(dev->used + offset, 0xFF, size);
> +    /* Make capability read-only by default */
> +    memset(dev->wmask + offset, 0, size);
> +    /* Check capability by default */
> +    memset(dev->cmask + offset, 0xFF, size);
> +}
> +
> +/*
> + * Allocate an extended capability of `size' bytes wherever free space
> + * is found.  Returns its offset, or -ENOSPC when nothing fits.
> + */
> +int pcie_add_ext_capability(PCIDevice *dev,
> +                            uint16_t cap_id, uint8_t cap_ver, uint16_t size)
> +{
> +    uint16_t where = pcie_ext_cap_find_space(dev, size);
> +
> +    if (where) {
> +        pcie_allocate_ext_capability(dev, cap_id, cap_ver, where, size);
> +        return where;
> +    }
> +    return -ENOSPC;
> +}
> +
> +/*
> + * Allocate an extended capability at a caller chosen offset.
> + * offset == 0 means "anywhere" and defers to pcie_add_ext_capability().
> + * Returns the offset on success, -EBUSY when any byte of the requested
> + * range is already in use.
> + */
> +int pcie_append_ext_capability(PCIDevice *dev,
> +                               uint16_t cap_id, uint8_t cap_ver,
> +                               uint16_t offset, uint16_t size)
> +{
> +    uint16_t byte;
> +
> +    if (offset == 0) {
> +        return pcie_add_ext_capability(dev, cap_id, cap_ver, size);
> +    }
> +
> +    assert(offset < offset + size);
> +    assert(offset + size < PCIE_CONFIG_SPACE_SIZE);
> +    assert(size >= 8);
> +
> +    byte = offset;
> +    while (byte < offset + size) {
> +        if (dev->used[byte]) {
> +            return -EBUSY;
> +        }
> +        ++byte;
> +    }
> +
> +    pcie_allocate_ext_capability(dev, cap_id, cap_ver, offset, size);
> +    return offset;
> +}
> +
> +/*
> + * Unlink the extended capability cap_id from the list and release the
> + * `size' bytes it occupied.  Does nothing when the id is not found.
> + */
> +void pcie_del_ext_capability(PCIDevice *dev, uint16_t cap_id, uint16_t size)
> +{
> +    uint16_t before;
> +    uint16_t where = pcie_find_ext_capability_list(dev, cap_id, &before);
> +    uint16_t after;
> +
> +    if (!where) {
> +        return;
> +    }
> +
> +    after = PCI_EXT_CAP_NEXT(pci_get_long(dev->config + where));
> +    if (!before) {
> +        /* move up next ext cap to PCI_CONFIG_SPACE_SIZE? */
> +        assert(where == PCI_CONFIG_SPACE_SIZE);
> +        /* head of the list: leave an id 0 placeholder that only
> +           carries the next pointer */
> +        pci_set_long(dev->config + where, PCI_EXT_CAP(0, 0, after));
> +    } else {
> +        pcie_ext_cap_set_next(dev, before, after);
> +    }
> +
> +    /* Make capability writeable again */
> +    memset(dev->wmask + where, 0xff, size);
> +    /* Clear cmask as device-specific registers can't be checked */
> +    memset(dev->cmask + where, 0, size);
> +    memset(dev->used + where, 0, size);
> +}
> +
> +/* Mark [offset, offset + size) as used in dev->used; no capability
> +   header is written and the capability list is not touched. */
> +void pcie_reserve_ext_capability(PCIDevice *dev,
> +                                 uint16_t offset, uint16_t size)
> +{
> +    memset(dev->used + offset, 0xff, size);
> +}
> +
> +/**************************************************************************
> + * pci express extended capability helper functions
> + */
> +
> +/* ARI */
> +#define PCI_ARI_VER     1
> +#define PCI_ARI_SIZEOF  8
> +
> +/*
> + * Add the ARI extended capability at `offset' (0: anywhere) and store
> + * nextfn as the next function number.  Returns the capability offset or
> + * a negative errno from the allocation.
> + */
> +int pcie_ari_init(PCIDevice *dev, uint16_t offset, uint16_t nextfn)
> +{
> +    int ari_off = pcie_append_ext_capability(dev, PCI_EXT_CAP_ID_ARI,
> +                                             PCI_ARI_VER, offset,
> +                                             PCI_ARI_SIZEOF);
> +
> +    if (ari_off >= 0) {
> +        pci_set_long(dev->config + ari_off + PCI_ARI_CAP,
> +                     PCI_ARI_CAP_NFN(nextfn));
> +    }
> +    return ari_off;
> +}
> +
> +/* AER */
> +#define PCI_ERR_VER                     2
> +#define PCI_ERR_SIZEOF                  0x48
> +
> +#define PCI_ERR_UNC_SDN                 0x00000020      /* surprise down */
> +#define PCI_ERR_UNC_ACSV                0x00200000      /* ACS Violation */
> +#define PCI_ERR_UNC_INTN                0x00400000      /* Internal Error */
> +#define PCI_ERR_UNC_MCBTLP              0x00800000      /* MC Blocked TLP */
> +#define PCI_ERR_UNC_ATOP_EBLOCKED       0x01000000      /* atomic op egress blocked */
> +#define PCI_ERR_UNC_TLP_PRF_BLOCKED     0x02000000      /* TLP Prefix Blocked */
> +#define PCI_ERR_UNC_SUPPORTED           (PCI_ERR_UNC_DLP |              \
> +                                         PCI_ERR_UNC_SDN |              \
> +                                         PCI_ERR_UNC_POISON_TLP |       \
> +                                         PCI_ERR_UNC_FCP |              \
> +                                         PCI_ERR_UNC_COMP_TIME |        \
> +                                         PCI_ERR_UNC_COMP_ABORT |       \
> +                                         PCI_ERR_UNC_UNX_COMP |         \
> +                                         PCI_ERR_UNC_RX_OVER |          \
> +                                         PCI_ERR_UNC_MALF_TLP |         \
> +                                         PCI_ERR_UNC_ECRC |             \
> +                                         PCI_ERR_UNC_UNSUP |            \
> +                                         PCI_ERR_UNC_ACSV |             \
> +                                         PCI_ERR_UNC_INTN |             \
> +                                         PCI_ERR_UNC_MCBTLP |           \
> +                                         PCI_ERR_UNC_ATOP_EBLOCKED |    \
> +                                         PCI_ERR_UNC_TLP_PRF_BLOCKED)
> +
> +#define PCI_ERR_UNC_SEVERITY_DEFAULT    (PCI_ERR_UNC_DLP |              \
> +                                         PCI_ERR_UNC_SDN |              \
> +                                         PCI_ERR_UNC_FCP |              \
> +                                         PCI_ERR_UNC_RX_OVER |          \
> +                                         PCI_ERR_UNC_MALF_TLP |         \
> +                                         PCI_ERR_UNC_INTN)
> +
> +#define PCI_ERR_COR_ADV_NONFATAL        0x00002000      /* Advisory Non-Fatal */
> +#define PCI_ERR_COR_INTERNAL            0x00004000      /* Corrected Internal */
> +#define PCI_ERR_COR_HL_OVERFLOW         0x00008000      /* Header Log Overflow */
> +#define PCI_ERR_COR_SUPPORTED           (PCI_ERR_COR_RCVR |             \
> +                                         PCI_ERR_COR_BAD_TLP |          \
> +                                         PCI_ERR_COR_BAD_DLLP |         \
> +                                         PCI_ERR_COR_REP_ROLL |         \
> +                                         PCI_ERR_COR_REP_TIMER |        \
> +                                         PCI_ERR_COR_ADV_NONFATAL |     \
> +                                         PCI_ERR_COR_INTERNAL |         \
> +                                         PCI_ERR_COR_HL_OVERFLOW)
> +#define PCI_ERR_COR_MASK_DEFAULT        (PCI_ERR_COR_ADV_NONFATAL |     \
> +                                         PCI_ERR_COR_INTERNAL |         \
> +                                         PCI_ERR_COR_HL_OVERFLOW)
> +
> +
> +#define PCI_ERR_CAP_FEP_MASK            0x0000001f
> +#define PCI_ERR_CAP_MHRC                0x00000200
> +#define PCI_ERR_CAP_MHRE                0x00000400
> +#define PCI_ERR_CAP_TLP                 0x00000800
> +
> +#define PCI_ERR_TLP_PREFIX_LOG          0x38
> +
> +/*
> + * From 6.2.7 Error Listing and Rules. Table 6-2, 6-3 and 6-4
> + * Maps exactly one uncorrectable error status bit to its default
> + * severity.  Any other value aborts.
> + */
> +static enum PCIE_AER_SEVERITY
> +pcie_aer_uncor_default_severity(uint32_t status)
> +{
> +    enum PCIE_AER_SEVERITY severity = AER_ERR_FATAL;
> +
> +    switch (status) {
> +    /* fatal by default */
> +    case PCI_ERR_UNC_DLP:
> +    case PCI_ERR_UNC_SDN:
> +    case PCI_ERR_UNC_FCP:
> +    case PCI_ERR_UNC_RX_OVER:
> +    case PCI_ERR_UNC_MALF_TLP:
> +    case PCI_ERR_UNC_INTN:
> +        severity = AER_ERR_FATAL;
> +        break;
> +    /* non-fatal by default */
> +    case PCI_ERR_UNC_POISON_TLP:
> +    case PCI_ERR_UNC_COMP_TIME:
> +    case PCI_ERR_UNC_COMP_ABORT:
> +    case PCI_ERR_UNC_UNX_COMP:
> +    case PCI_ERR_UNC_ECRC:
> +    case PCI_ERR_UNC_UNSUP:
> +    case PCI_ERR_UNC_ACSV:
> +    case PCI_ERR_UNC_MCBTLP:
> +    case PCI_ERR_UNC_ATOP_EBLOCKED:
> +    case PCI_ERR_UNC_TLP_PRF_BLOCKED:
> +        severity = AER_ERR_NONFATAL;
> +        break;
> +    default:
> +        /* not a single supported status bit */
> +        abort();
> +    }
> +    return severity;
> +}
> +
> +/* Index following i in a ring of max entries (wraps back to 0). */
> +static uint32_t pcie_aer_log_next(uint32_t i, uint32_t max)
> +{
> +    uint32_t successor = i + 1;
> +
> +    return successor % max;
> +}
> +
> +/* The ring is empty when producer and consumer indexes coincide. */
> +static bool pcie_aer_log_empty_index(uint32_t producer, uint32_t consumer)
> +{
> +    if (producer != consumer) {
> +        return false;
> +    }
> +    return true;
> +}
> +
> +static bool pcie_aer_log_empty(struct pcie_aer_log *aer_log)
> +{
> +    return pcie_aer_log_empty_index(aer_log->producer, aer_log->consumer);
> +}
> +
> +/* True when advancing the producer would make it catch the consumer. */
> +static bool pcie_aer_log_full(struct pcie_aer_log *aer_log)
> +{
> +    uint32_t after_producer =
> +        pcie_aer_log_next(aer_log->producer, aer_log->log_max);
> +
> +    return after_producer == aer_log->consumer;
> +}
> +
> +/* Advance the producer index and return the slot it pointed at. */
> +static uint32_t pcie_aer_log_add(struct pcie_aer_log *aer_log)
> +{
> +    uint32_t slot = aer_log->producer;
> +
> +    aer_log->producer = pcie_aer_log_next(slot, aer_log->log_max);
> +    return slot;
> +}
> +
> +/* Advance the consumer index and return the slot it pointed at. */
> +static uint32_t pcie_aer_log_del(struct pcie_aer_log *aer_log)
> +{
> +    uint32_t slot = aer_log->consumer;
> +
> +    aer_log->consumer = pcie_aer_log_next(slot, aer_log->log_max);
> +    return slot;
> +}
> +
> +/*
> + * Queue a copy of *err.  Returns 0 on success, -1 when the ring is full
> + * (nothing is stored in that case).
> + */
> +static int pcie_aer_log_add_err(struct pcie_aer_log *aer_log,
> +                                const struct pcie_aer_err *err)
> +{
> +    uint32_t slot;
> +
> +    if (!pcie_aer_log_full(aer_log)) {
> +        slot = pcie_aer_log_add(aer_log);
> +        memcpy(&aer_log->log[slot], err, sizeof(*err));
> +        return 0;
> +    }
> +    return -1;
> +}
> +
> +/*
> + * Dequeue the oldest error and return a pointer to its slot inside the
> + * ring.  The ring must not be empty.
> + */
> +static const struct pcie_aer_err*
> +pcie_aer_log_del_err(struct pcie_aer_log *aer_log)
> +{
> +    assert(!pcie_aer_log_empty(aer_log));
> +    return &aer_log->log[pcie_aer_log_del(aer_log)];
> +}
> +
> +static void pcie_aer_log_clear_all_err(struct pcie_aer_log *aer_log)
> +{
> +    aer_log->producer = 0;
> +    aer_log->consumer = 0;
> +}
> +
> +/*
> + * Add the AER extended capability at `offset' (0: anywhere), allocate
> + * the error log ring and set up the default values and write masks of
> + * the AER registers.
> + * Returns the capability offset, or the negative value returned by
> + * pcie_append_ext_capability() on failure.
> + */
> +int pcie_aer_init(PCIDevice *dev, uint16_t offset)
> +{
> +    int pos;
> +    PCIExpressDevice *exp;
> +
> +    /* SERR# enable becomes guest writable */
> +    pci_set_word(dev->wmask + PCI_COMMAND,
> +                 pci_get_word(dev->wmask + PCI_COMMAND) | PCI_COMMAND_SERR);
> +
> +    pos = pcie_append_ext_capability(dev, PCI_EXT_CAP_ID_ERR, PCI_ERR_VER,
> +                                     offset, PCI_ERR_SIZEOF);
> +    if (pos < 0) {
> +        return pos;
> +    }
> +    exp = dev->exp;
> +    exp->aer_cap = pos;
> +    /* log_max: apply the default when unset, then clamp to the maximum */
> +    if (dev->aer_log.log_max == PCIE_AER_LOG_MAX_UNSET) {
> +        dev->aer_log.log_max = PCIE_AER_LOG_MAX_DEFAULT;
> +    }
> +    if (dev->aer_log.log_max > PCIE_AER_LOG_MAX_MAX) {
> +        dev->aer_log.log_max = PCIE_AER_LOG_MAX_MAX;
> +    }
> +    dev->aer_log.log =
> +        qemu_mallocz(sizeof(dev->aer_log.log[0]) * dev->aer_log.log_max);
> +
> +    pci_set_long(dev->wmask + pos + PCI_ERR_UNCOR_MASK,
> +                 PCI_ERR_UNC_SUPPORTED);
> +
> +    pci_set_long(dev->config + pos + PCI_ERR_UNCOR_SEVER,
> +                 PCI_ERR_UNC_SEVERITY_DEFAULT);
> +    pci_set_long(dev->wmask + pos + PCI_ERR_UNCOR_SEVER,
> +                 PCI_ERR_UNC_SUPPORTED);
> +
> +    pci_set_long(dev->config + pos + PCI_ERR_COR_MASK,
> +                 PCI_ERR_COR_MASK_DEFAULT);
> +    pci_set_long(dev->wmask + pos + PCI_ERR_COR_MASK,
> +                 PCI_ERR_COR_SUPPORTED);
> +
> +    /* capabilities and control. multiple header logging is supported */
> +    if (dev->aer_log.log_max > 0) {
> +        pci_set_long(dev->config + pos + PCI_ERR_CAP,
> +                     PCI_ERR_CAP_ECRC_GENC | PCI_ERR_CAP_ECRC_CHKC |
> +                     PCI_ERR_CAP_MHRC);
> +        pci_set_long(dev->wmask + pos + PCI_ERR_CAP,
> +                     PCI_ERR_CAP_ECRC_GENE | PCI_ERR_CAP_ECRC_CHKE |
> +                     PCI_ERR_CAP_MHRE);
> +    } else {
> +        /* no log ring: multiple header recording is neither advertised
> +           nor enableable */
> +        pci_set_long(dev->config + pos + PCI_ERR_CAP,
> +                     PCI_ERR_CAP_ECRC_GENC | PCI_ERR_CAP_ECRC_CHKC);
> +        pci_set_long(dev->wmask + pos + PCI_ERR_CAP,
> +                     PCI_ERR_CAP_ECRC_GENE | PCI_ERR_CAP_ECRC_CHKE);
> +    }
> +
> +    /* pick the error message handler by port type */
> +    switch (pcie_cap_get_type(dev)) {
> +    case PCI_EXP_TYPE_ROOT_PORT:
> +        /* this case will be set by pcie_aer_root_init() */
> +        /* fallthrough */
> +    case PCI_EXP_TYPE_DOWNSTREAM:
> +    case PCI_EXP_TYPE_UPSTREAM:
> +        pci_set_word(dev->wmask + PCI_BRIDGE_CONTROL,
> +                     pci_get_word(dev->wmask + PCI_BRIDGE_CONTROL) |
> +                     PCI_BRIDGE_CTL_SERR);
> +        exp->aer_errmsg = pcie_aer_errmsg_vbridge;
> +        break;
> +    default:
> +        exp->aer_errmsg = pcie_aer_errmsg_alldev;
> +        break;
> +    }
> +    return pos;
> +}
> +
> +/*
> + * Tear down what pcie_aer_init() set up: remove the AER extended
> + * capability and free the error log ring.
> + *
> + * AER lives in the extended capability list, so it must be removed with
> + * pcie_del_ext_capability(); the legacy capability helper would look up
> + * PCI_EXT_CAP_ID_ERR in the standard capability list instead.
> + */
> +void pcie_aer_exit(PCIDevice *dev)
> +{
> +    pcie_del_ext_capability(dev, PCI_EXT_CAP_ID_ERR, PCI_ERR_SIZEOF);
> +    qemu_free(dev->aer_log.log);
> +}
> +
> +/*
> + * Config write hook for the AER capability.
> + * Multiple Header recording isn't implemented. Is it wanted?
> + *
> + * Handles the write-1-to-clear bits of PCI_STATUS (signaled system
> + * error), uncorrectable/correctable error status, and drops the queued
> + * error log when multiple header recording is not enabled.
> + */
> +void pcie_aer_write_config(PCIDevice *dev,
> +                           uint32_t addr, uint32_t val, int len)
> +{
> +    uint32_t pos = dev->exp->aer_cap;
> +
> +    /* PCI_STATUS_SIG_SYSTEM_ERROR */
> +    if (ranges_overlap(addr, len, PCI_STATUS, 2)) {
> +        pcie_w1c_word(dev, PCI_STATUS, PCI_STATUS_SIG_SYSTEM_ERROR,
> +                      addr, val);
> +    }
> +
> +    /* uncorrectable */
> +    if (ranges_overlap(addr, len, pos + PCI_ERR_UNCOR_STATUS, 4)) {
> +        uint32_t written =
> +            pcie_written_val_long(addr, val, pos + PCI_ERR_UNCOR_STATUS) &
> +            PCI_ERR_UNC_SUPPORTED;
> +        uint32_t uncorsta =
> +            pci_get_long(dev->config + pos + PCI_ERR_UNCOR_STATUS);
> +        uint32_t errcap = pci_get_long(dev->config + pos + PCI_ERR_CAP);
> +        /* unsigned constant: the first error pointer is a 5 bit field,
> +           and shifting a signed 1 by 31 is undefined */
> +        uint32_t first_error = 1U << PCI_ERR_CAP_FEP(errcap);
> +
> +        /* guest acknowledges the error the first error pointer
> +           currently refers to */
> +        if ((uncorsta & first_error) && (written & first_error)) {
> +            pcie_aer_clear_error(dev);
> +        }
> +        if (!(errcap & PCI_ERR_CAP_MHRE)) {
> +            /* RW1CS */
> +            pcie_w1c_long(dev, pos + PCI_ERR_UNCOR_STATUS,
> +                          PCI_ERR_UNC_SUPPORTED, addr, val);
> +        }
> +    }
> +
> +    /* correctable */
> +    if (ranges_overlap(addr, len, pos + PCI_ERR_COR_STATUS, 4)) {
> +        /* RW1CS */
> +        pcie_w1c_long(dev, pos + PCI_ERR_COR_STATUS, PCI_ERR_COR_SUPPORTED,
> +                      addr, val);
> +    }
> +
> +    /* capability & control */
> +    if (ranges_overlap(addr, len, pos + PCI_ERR_CAP, 4)) {
> +        uint32_t err_cap = pci_get_long(dev->config + pos + PCI_ERR_CAP);
> +        if (!(err_cap & PCI_ERR_CAP_MHRE)) {
> +            pcie_aer_log_clear_all_err(&dev->aer_log);
> +        }
> +    }
> +}
> +
> +#define PCI_SEC_STATUS_RCV_SYSTEM_ERROR         0x4000
> +
> +/*
> + * Config write hook for the bridge part of AER: the received system
> + * error bit of the secondary status register is write-1-to-clear.
> + * The overlap test must look at the register that is actually cleared
> + * (PCI_SEC_STATUS), not at the primary PCI_STATUS.
> + */
> +void pcie_aer_write_config_vbridge(PCIDevice *dev,
> +                                   uint32_t addr, uint32_t val, int len)
> +{
> +    /* PCI_SEC_STATUS_RCV_SYSTEM_ERROR */
> +    if (ranges_overlap(addr, len, PCI_SEC_STATUS, 2)) {
> +        pcie_w1c_word(dev, PCI_SEC_STATUS, PCI_SEC_STATUS_RCV_SYSTEM_ERROR,
> +                      addr, val);
> +    }
> +}
> +
> +/*
> + * Deliver msg to dev through its registered aer_errmsg handler.
> + * Both dev->exp and the handler must be set.  The handler's result is
> + * discarded here.
> + */
> +static inline void pcie_aer_errmsg(PCIDevice *dev,
> +                                   const struct pcie_aer_err_msg *msg)
> +{
> +    assert(dev->exp);
> +    assert(dev->exp->aer_errmsg);
> +    dev->exp->aer_errmsg(dev, msg);
> +}
> +
> +/*
> + * Error message handling common to all devices.
> + * Decides whether the message is transmitted upstream (SERR# enable in
> + * the command register for uncorrectable errors, or the reporting enable
> + * bits in device control), sets the signaled system error status bit,
> + * and forwards the message to the parent port when there is one.
> + * Returns AER_ERR_MSG_MASKED when the message is not transmitted or the
> + * parent is not an express device, AER_ERR_MSG_SENT otherwise.
> + */
> +static AER_ERR_MSG_RESULT
> +pcie_aer_errmsg_alldev(PCIDevice *dev, const struct pcie_aer_err_msg *msg)
> +{
> +    uint16_t cmd = pci_get_word(dev->config + PCI_COMMAND);
> +    bool transmit1 =
> +        pcie_aer_err_msg_is_uncor(msg) && (cmd & PCI_COMMAND_SERR);
> +    uint32_t pos = pci_pcie_cap(dev);
> +    uint32_t devctl = pci_get_word(dev->config + pos + PCI_EXP_DEVCTL);
> +    /* NOTE(review): relies on the severity values lining up with the
> +       error reporting enable bits of device control -- confirm against
> +       the enum definition. */
> +    bool transmit2 = msg->severity & devctl;
> +    PCIDevice *parent_port;
> +
> +    if (transmit1) {
> +        if (pcie_aer_err_msg_is_uncor(msg)) {
> +            /* Signaled System Error */
> +            uint8_t *status = dev->config + PCI_STATUS;
> +            pci_set_word(status,
> +                         pci_get_word(status) | PCI_STATUS_SIG_SYSTEM_ERROR);
> +        }
> +    }
> +
> +    if (!(transmit1 || transmit2)) {
> +        return AER_ERR_MSG_MASKED;
> +    }
> +
> +    /* send up error message */
> +    if (pci_is_express(dev) &&
> +        pcie_cap_get_type(dev) == PCI_EXP_TYPE_ROOT_PORT) {
> +        /* Root port notify system itself,
> +           or send the error message to root complex event collector. */
> +        /*
> +         * if root port is associated to event collector, set
> +         * parent_port = root complex event collector
> +         * For now root complex event collector isn't supported.
> +         */
> +        parent_port = NULL;
> +    } else {
> +        parent_port = pci_bridge_get_device(dev->bus);
> +    }
> +    if (parent_port) {
> +        if (!pci_is_express(parent_port)) {
> +            /* What to do? */
> +            return AER_ERR_MSG_MASKED;
> +        }
> +        pcie_aer_errmsg(parent_port, msg);
> +    }
> +    return AER_ERR_MSG_SENT;
> +}
> +
> +/*
> + * Error message handling for ports that contain a virtual bridge.
> + * Uncorrectable messages latch "received system error" in secondary
> + * status; the message is then passed to the common handler only when
> + * SERR is enabled in bridge control, otherwise it is reported as masked.
> + */
> +static AER_ERR_MSG_RESULT
> +pcie_aer_errmsg_vbridge(PCIDevice *dev, const struct pcie_aer_err_msg *msg)
> +{
> +    uint16_t brctl = pci_get_word(dev->config + PCI_BRIDGE_CONTROL);
> +    uint8_t *sec_status = dev->config + PCI_SEC_STATUS;
> +
> +    if (pcie_aer_err_msg_is_uncor(msg)) {
> +        /* Received System Error */
> +        uint16_t latched = pci_get_word(sec_status);
> +
> +        latched |= PCI_SEC_STATUS_RCV_SYSTEM_ERROR;
> +        pci_set_word(sec_status, latched);
> +    }
> +
> +    if (brctl & PCI_BRIDGE_CTL_SERR) {
> +        return pcie_aer_errmsg_alldev(dev, msg);
> +    }
> +    return AER_ERR_MSG_MASKED;
> +}
> +
> +/*
> + * Error message handling for a root port.
> + * Runs the virtual bridge handling first; when the message got through,
> + * records it in the root error status register (and the error source
> + * identification registers) and raises the error interrupt if the
> + * matching bit is set in the root error command register.
> + */
> +static AER_ERR_MSG_RESULT
> +pcie_aer_errmsg_root_port(PCIDevice *dev, const struct pcie_aer_err_msg *msg)
> +{
> +    AER_ERR_MSG_RESULT ret;
> +    uint16_t cmd;
> +    uint8_t *aer_cap;
> +    uint32_t root_cmd;
> +    uint32_t root_sta;
> +    bool trigger;
> +
> +    ret = pcie_aer_errmsg_vbridge(dev, msg);
> +    if (ret != AER_ERR_MSG_SENT) {
> +        return ret;
> +    }
> +
> +    ret = AER_ERR_MSG_MASKED;
> +    cmd = pci_get_word(dev->config + PCI_COMMAND);
> +    aer_cap = dev->config + pcie_aer_cap(dev);
> +    root_cmd = pci_get_long(aer_cap + PCI_ERR_ROOT_COMMAND);
> +    root_sta = pci_get_long(aer_cap + PCI_ERR_ROOT_STATUS);
> +    trigger = false;
> +
> +    if (cmd & PCI_COMMAND_SERR) {
> +        /* System Error. Platform Specific */
> +        /* ret = AER_ERR_MSG_SENT; */
> +    }
> +
> +    /* Error Message Received: Root Error Status register */
> +    switch (msg->severity) {
> +    case AER_ERR_COR:
> +        if (root_sta & PCI_ERR_ROOT_COR_RCV) {
> +            root_sta |= PCI_ERR_ROOT_MULTI_COR_RCV;
> +        } else {
> +            /* first correctable error: remember its source */
> +            if (root_cmd & PCI_ERR_ROOT_CMD_COR_EN) {
> +                trigger = true;
> +            }
> +            pci_set_word(aer_cap + PCI_ERR_ROOT_COR_SRC, msg->source_id);
> +        }
> +        root_sta |= PCI_ERR_ROOT_COR_RCV;
> +        break;
> +    case AER_ERR_NONFATAL:
> +        if (!(root_sta & PCI_ERR_ROOT_NONFATAL_RCV) &&
> +            root_cmd & PCI_ERR_ROOT_CMD_NONFATAL_EN) {
> +            trigger = true;
> +        }
> +        root_sta |= PCI_ERR_ROOT_NONFATAL_RCV;
> +        break;
> +    case AER_ERR_FATAL:
> +        if (!(root_sta & PCI_ERR_ROOT_FATAL_RCV) &&
> +            root_cmd & PCI_ERR_ROOT_CMD_FATAL_EN) {
> +            trigger = true;
> +        }
> +        if (!(root_sta & PCI_ERR_ROOT_UNCOR_RCV)) {
> +            root_sta |= PCI_ERR_ROOT_FIRST_FATAL;
> +        }
> +        root_sta |= PCI_ERR_ROOT_FATAL_RCV;
> +        break;
> +    }
> +    if (pcie_aer_err_msg_is_uncor(msg)) {
> +        if (root_sta & PCI_ERR_ROOT_UNCOR_RCV) {
> +            root_sta |= PCI_ERR_ROOT_MULTI_UNCOR_RCV;
> +        } else {
> +            /* first uncorrectable error: remember its source */
> +            pci_set_word(aer_cap + PCI_ERR_ROOT_SRC, msg->source_id);
> +        }
> +        root_sta |= PCI_ERR_ROOT_UNCOR_RCV;
> +    }
> +    pci_set_long(aer_cap + PCI_ERR_ROOT_STATUS, root_sta);
> +
> +    /* NOTE(review): assumes the severity values coincide with the
> +       reporting enable bits of the root error command register --
> +       confirm against the enum definition. */
> +    if (root_cmd & msg->severity) {
> +        /* Error Interrupt(INTx or MSI) */
> +        pcie_aer_root_notify(dev, trigger, 1);
> +        ret = AER_ERR_MSG_SENT;
> +    }
> +    return ret;
> +}
> +
> +/*
> + * Expose err through the AER registers: set the first error pointer to
> + * the lowest set bit of err->status and fill (or zero) the header log
> + * and the TLP prefix log.
> + *
> + * The TLP prefix log is only filled when the error carries a prefix and
> + * the device advertises End-End TLP prefix support.  That is a
> + * capability bit, so it is read from device capabilities 2.
> + */
> +static void pcie_aer_update_log(PCIDevice *dev,
> +                                const struct pcie_aer_err *err)
> +{
> +    uint8_t *aer_cap = dev->config + pcie_aer_cap(dev);
> +    uint8_t first_bit = ffsl(err->status) - 1;
> +    uint32_t errcap = pci_get_long(aer_cap + PCI_ERR_CAP);
> +    int i;
> +    uint32_t dw;
> +
> +    errcap &= ~(PCI_ERR_CAP_FEP_MASK | PCI_ERR_CAP_TLP);
> +    errcap |= PCI_ERR_CAP_FEP(first_bit);
> +
> +    if (err->flags & PCIE_AER_ERR_HEADER_VALID) {
> +        for (i = 0; i < ARRAY_SIZE(err->header); ++i) {
> +            /* 7.10.8 Header Log Register */
> +            cpu_to_be32wu(&dw, err->header[i]);
> +            memcpy(aer_cap + PCI_ERR_HEADER_LOG + sizeof(err->header[0]) * i,
> +                   &dw, sizeof(dw));
> +        }
> +    } else {
> +        /* a prefix without a valid header makes no sense */
> +        assert(!(err->flags & PCIE_AER_ERR_TLP_PRESENT));
> +        memset(aer_cap + PCI_ERR_HEADER_LOG, 0, sizeof(err->header));
> +    }
> +
> +    if ((err->flags & PCIE_AER_ERR_TLP_PRESENT) &&
> +        (pci_get_long(dev->config + pci_pcie_cap(dev) + PCI_EXP_DEVCAP2) &
> +         PCI_EXP_DEVCAP2_EETLPP)) {
> +        for (i = 0; i < ARRAY_SIZE(err->prefix); ++i) {
> +            /* 7.10.12 tlp prefix log register */
> +            cpu_to_be32wu(&dw, err->prefix[i]);
> +            memcpy(aer_cap + PCI_ERR_TLP_PREFIX_LOG +
> +                   sizeof(err->prefix[0]) * i, &dw, sizeof(dw));
> +        }
> +        errcap |= PCI_ERR_CAP_TLP;
> +    } else {
> +        memset(aer_cap + PCI_ERR_TLP_PREFIX_LOG, 0, sizeof(err->prefix));
> +    }
> +    pci_set_long(aer_cap + PCI_ERR_CAP, errcap);
> +}
> +
> +/*
> + * Forget the currently reported error: clear the first-error field and the
> + * TLP-prefix flag in PCI_ERR_CAP and zero both log register areas.
> + */
> +static void pcie_aer_clear_log(PCIDevice *dev)
> +{
> +    uint8_t *aer_cap = dev->config + pcie_aer_cap(dev);
> +    uint32_t errcap = pci_get_long(aer_cap + PCI_ERR_CAP) &
> +                      ~(PCI_ERR_CAP_FEP_MASK | PCI_ERR_CAP_TLP);
> +
> +    pci_set_long(aer_cap + PCI_ERR_CAP, errcap);
> +
> +    /* the log areas are as large as the arrays in struct pcie_aer_err */
> +    memset(aer_cap + PCI_ERR_HEADER_LOG, 0,
> +           sizeof(((struct pcie_aer_err *)NULL)->header));
> +    memset(aer_cap + PCI_ERR_TLP_PREFIX_LOG, 0,
> +           sizeof(((struct pcie_aer_err *)NULL)->prefix));
> +}
> +
> +/*
> + * Record @err for @dev.
> + *
> + * When PCI_ERR_CAP_MHRE is set and the error selected by the first-error
> + * field is still flagged in the uncorrectable status register, @err is
> + * queued in dev->aer_log.  Otherwise it is written straight to the log
> + * registers.
> + *
> + * Returns 0 on success, -1 when the queue rejects the error (overflow).
> + */
> +static int pcie_aer_record_error(PCIDevice *dev,
> +                                 const struct pcie_aer_err *err)
> +{
> +    uint8_t *aer_cap = dev->config + pcie_aer_cap(dev);
> +    uint32_t errcap = pci_get_long(aer_cap + PCI_ERR_CAP);
> +    int fep = PCI_ERR_CAP_FEP(errcap);
> +    bool first_error_pending =
> +        (errcap & PCI_ERR_CAP_MHRE) &&
> +        (pci_get_long(aer_cap + PCI_ERR_UNCOR_STATUS) & (1ULL << fep));
> +
> +    if (!first_error_pending) {
> +        pcie_aer_update_log(dev, err);
> +        return 0;
> +    }
> +
> +    /* Not first error. queue error */
> +    return pcie_aer_log_add_err(&dev->aer_log, err) < 0 ? -1 : 0;
> +}
> +
> +/*
> + * Retire the error selected by the first-error field of PCI_ERR_CAP.
> + *
> + * If multiple header recording is off or nothing is queued, the log is
> + * cleared and the matching uncorrectable status bit is dropped.  Otherwise
> + * the status bit is dropped only when no queued error carries the same bit,
> + * and the next queued error is loaded into the log registers.
> + */
> +static void pcie_aer_clear_error(PCIDevice *dev)
> +{
> +    uint8_t *aer_cap = dev->config + pcie_aer_cap(dev);
> +    uint8_t *uncor_status = aer_cap + PCI_ERR_UNCOR_STATUS;
> +    uint32_t errcap = pci_get_long(aer_cap + PCI_ERR_CAP);
> +    uint32_t first_err = (1UL << PCI_ERR_CAP_FEP(errcap));
> +    struct pcie_aer_log *queue = &dev->aer_log;
> +    bool still_queued = false;
> +    uint32_t idx;
> +
> +    if (!(errcap & PCI_ERR_CAP_MHRE) || pcie_aer_log_empty(queue)) {
> +        pcie_aer_clear_log(dev);
> +        pci_set_long(uncor_status, pci_get_long(uncor_status) & ~first_err);
> +        return;
> +    }
> +
> +    /* if no same error is queued, clear bit in uncorrectable error status */
> +    idx = queue->consumer;
> +    while (!pcie_aer_log_empty_index(queue->producer, idx)) {
> +        if (queue->log[idx].status & first_err) {
> +            still_queued = true;
> +            break;
> +        }
> +        idx = pcie_aer_log_next(idx, queue->log_max);
> +    }
> +    if (!still_queued) {
> +        pci_set_long(uncor_status, pci_get_long(uncor_status) & ~first_err);
> +    }
> +
> +    /* promote the oldest queued error to the log registers */
> +    pcie_aer_update_log(dev, pcie_aer_log_del_err(queue));
> +}
> +
> +/*
> + * Inject the error described by @err into @dev: update the Device Status
> + * and (when present) AER status/log registers, then send an error message
> + * upstream if reporting is enabled.
> + *
> + * A non-function-specific error must be recorded in all functions.
> + * That is the responsibility of the caller of this function.
> + * It is also the caller's responsibility to determine which function should
> + * report the error.
> + *
> + * 6.2.4 Error Logging
> + * 6.2.5 Sequence of Device Error Signaling and Logging Operations
> + * table 6-2: Flowchart Showing Sequence of Device Error Signaling and
> + *            Logging Operations
> + *
> + * Although this implementation can be shortened/optimized, it is kept
> + * parallel to table 6-2.
> + *
> + * err->status must have exactly one supported bit set; otherwise the call
> + * is silently ignored.
> + */
> +void pcie_aer_inject_error(PCIDevice *dev, const struct pcie_aer_err *err)
> +{
> +    uint8_t *exp_cap;
> +    uint8_t *aer_cap = NULL;
> +    uint32_t devctl;
> +    uint32_t devsta;
> +    uint32_t status = err->status;
> +    uint32_t mask;
> +    bool is_unsupported_request =
> +        (!(err->flags & PCIE_AER_ERR_IS_CORRECTABLE) &&
> +         err->status == PCI_ERR_UNC_UNSUP);
> +    bool is_advisory_nonfatal = false;  /* for advisory non-fatal error */
> +    uint32_t uncor_status = 0;          /* for advisory non-fatal error */
> +    struct pcie_aer_err_msg msg;
> +    int is_header_log_overflowed = 0;
> +
> +    if (!pci_is_express(dev)) {
> +        /* What to do? */
> +        return;
> +    }
> +
> +    if (err->flags & PCIE_AER_ERR_IS_CORRECTABLE) {
> +        status &= PCI_ERR_COR_SUPPORTED;
> +    } else {
> +        status &= PCI_ERR_UNC_SUPPORTED;
> +    }
> +    if (!status || status & (status - 1)) {
> +        /* invalid status bit. one and only one bit must be set */
> +        return;
> +    }
> +
> +    exp_cap = dev->config + pci_pcie_cap(dev);
> +    /*
> +     * Device Control and Device Status are 16-bit registers that exist
> +     * whether or not AER is implemented.  Read them unconditionally with
> +     * the word accessors, so that the write-back below neither drops status
> +     * bits that were already set nor touches the neighbouring registers.
> +     */
> +    devctl = pci_get_word(exp_cap + PCI_EXP_DEVCTL);
> +    devsta = pci_get_word(exp_cap + PCI_EXP_DEVSTA);
> +    if (dev->exp->aer_cap) {
> +        aer_cap = dev->config + pcie_aer_cap(dev);
> +    }
> +    if (err->flags & PCIE_AER_ERR_IS_CORRECTABLE) {
> +    correctable_error:
> +        /* also entered from below for advisory non-fatal errors */
> +        devsta |= PCI_EXP_DEVSTA_CED;
> +        if (is_unsupported_request) {
> +            devsta |= PCI_EXP_DEVSTA_URD;
> +        }
> +        pci_set_word(exp_cap + PCI_EXP_DEVSTA, devsta);
> +
> +        if (aer_cap) {
> +            pci_set_long(aer_cap + PCI_ERR_COR_STATUS,
> +                         pci_get_long(aer_cap + PCI_ERR_COR_STATUS) |
> +                         status);
> +            mask = pci_get_long(aer_cap + PCI_ERR_COR_MASK);
> +            if (mask & status) {
> +                return;
> +            }
> +            if (is_advisory_nonfatal) {
> +                uint32_t uncor_mask =
> +                    pci_get_long(aer_cap + PCI_ERR_UNCOR_MASK);
> +                if (!(uncor_mask & uncor_status)) {
> +                    is_header_log_overflowed =
> +                        pcie_aer_record_error(dev, err);
> +                }
> +                pci_set_long(aer_cap + PCI_ERR_UNCOR_STATUS,
> +                             pci_get_long(aer_cap + PCI_ERR_UNCOR_STATUS) |
> +                             uncor_status);
> +            }
> +        }
> +
> +        if (is_unsupported_request && !(devctl & PCI_EXP_DEVCTL_URRE)) {
> +            return;
> +        }
> +        if (!(devctl & PCI_EXP_DEVCTL_CERE)) {
> +            return;
> +        }
> +        msg.severity = AER_ERR_COR;
> +    } else {
> +        bool is_fatal =
> +            (pcie_aer_uncor_default_severity(status) == AER_ERR_FATAL);
> +        uint16_t cmd;
> +
> +        if (aer_cap) {
> +            /* the severity register overrides the default severity */
> +            is_fatal = status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER);
> +        }
> +        if (!is_fatal && (err->flags & PCIE_AER_ERR_MAYBE_ADVISORY)) {
> +            /* report as the correctable "advisory non-fatal" error */
> +            is_advisory_nonfatal = true;
> +            uncor_status = status;
> +            status = PCI_ERR_COR_ADV_NONFATAL;
> +            goto correctable_error;
> +        }
> +        if (is_fatal) {
> +            devsta |= PCI_EXP_DEVSTA_FED;
> +        } else {
> +            devsta |= PCI_EXP_DEVSTA_NFED;
> +        }
> +        if (is_unsupported_request) {
> +            devsta |= PCI_EXP_DEVSTA_URD;
> +        }
> +        /* 16-bit register: a long write would spill into the next one */
> +        pci_set_word(exp_cap + PCI_EXP_DEVSTA, devsta);
> +
> +        if (aer_cap) {
> +            mask = pci_get_long(aer_cap + PCI_ERR_UNCOR_MASK);
> +            if (mask & status) {
> +                /* masked: only the status bit is recorded */
> +                pci_set_long(aer_cap + PCI_ERR_UNCOR_STATUS,
> +                             pci_get_long(aer_cap + PCI_ERR_UNCOR_STATUS) |
> +                             status);
> +                return;
> +            }
> +
> +            is_header_log_overflowed = pcie_aer_record_error(dev, err);
> +            pci_set_long(aer_cap + PCI_ERR_UNCOR_STATUS,
> +                         pci_get_long(aer_cap + PCI_ERR_UNCOR_STATUS) |
> +                         status);
> +        }
> +
> +        cmd = pci_get_word(dev->config + PCI_COMMAND);
> +        if (is_unsupported_request &&
> +            !(devctl & PCI_EXP_DEVCTL_URRE) && !(cmd & PCI_COMMAND_SERR)) {
> +            return;
> +        }
> +        if (is_fatal) {
> +            if (!((cmd & PCI_COMMAND_SERR) ||
> +                  (devctl & PCI_EXP_DEVCTL_FERE))) {
> +                return;
> +            }
> +            msg.severity = AER_ERR_FATAL;
> +        } else {
> +            if (!((cmd & PCI_COMMAND_SERR) ||
> +                  (devctl & PCI_EXP_DEVCTL_NFERE))) {
> +                return;
> +            }
> +            msg.severity = AER_ERR_NONFATAL;
> +        }
> +    }
> +
> +    /* send up error message */
> +    msg.source_id = err->source_id;
> +    pcie_aer_errmsg(dev, &msg);
> +
> +    if (is_header_log_overflowed) {
> +        /* the queue was full: report that as a correctable error too */
> +        struct pcie_aer_err header_log_overflow = {
> +            .status = PCI_ERR_COR_HL_OVERFLOW,
> +            .flags = PCIE_AER_ERR_IS_CORRECTABLE,
> +            .header = {0, 0, 0, 0},
> +            .prefix = {0, 0, 0, 0},
> +        };
> +        pcie_aer_inject_error(dev, &header_log_overflow);
> +    }
> +}
> +
> +/* aer root error command/status */
> +
> +/* all error reporting enable bits of the Root Error Command register */
> +#define PCI_ERR_ROOT_CMD_EN_MASK        (PCI_ERR_ROOT_CMD_COR_EN |      \
> +                                         PCI_ERR_ROOT_CMD_NONFATAL_EN | \
> +                                         PCI_ERR_ROOT_CMD_FATAL_EN)
> +
> +/*
> + * Interrupt message number field of the Root Error Status register.
> + * The mask 0xf8000000 covers bits 31:27, so the field starts at bit 27;
> + * a smaller shift would lose the low bit of the vector.
> + */
> +#define PCI_ERR_ROOT_IRQ_SHIFT          27
> +#define PCI_ERR_ROOT_IRQ                0xf8000000
> +
> +/* Root Error Status bits that are handled as write-1-to-clear */
> +#define PCI_ERR_ROOT_STATUS_REPORT_MASK (PCI_ERR_ROOT_COR_RCV |         \
> +                                         PCI_ERR_ROOT_MULTI_COR_RCV |   \
> +                                         PCI_ERR_ROOT_UNCOR_RCV |       \
> +                                         PCI_ERR_ROOT_MULTI_UNCOR_RCV | \
> +                                         PCI_ERR_ROOT_FIRST_FATAL |     \
> +                                         PCI_ERR_ROOT_NONFATAL_RCV |    \
> +                                         PCI_ERR_ROOT_FATAL_RCV)
> +
> +/*
> + * Store @vector in the interrupt message number field of the Root Error
> + * Status register, leaving all other bits of the register untouched.
> + */
> +void pcie_aer_root_set_vector(PCIDevice *dev, uint8_t vector)
> +{
> +    uint8_t *status_reg =
> +        dev->config + pcie_aer_cap(dev) + PCI_ERR_ROOT_STATUS;
> +    uint32_t field =
> +        (((uint32_t)vector) << PCI_ERR_ROOT_IRQ_SHIFT) & PCI_ERR_ROOT_IRQ;
> +    uint32_t others = pci_get_long(status_reg) & ~PCI_ERR_ROOT_IRQ;
> +
> +    pci_set_long(status_reg, others | field);
> +}
> +
> +static uint8_t pcie_aer_root_get_vector(PCIDevice *dev)
> +{
> +    uint8_t *aer_cap = dev->config + pcie_aer_cap(dev);
> +    uint32_t root_status = pci_get_long(aer_cap + PCI_ERR_ROOT_STATUS);
> +    return (root_status & PCI_ERR_ROOT_IRQ) >> PCI_ERR_ROOT_IRQ_SHIFT;
> +}
> +
> +/* Forward @trigger/@level to pcie_notify() using the AER root vector. */
> +static void pcie_aer_root_notify(PCIDevice *dev, bool trigger, int level)
> +{
> +    uint8_t vector = pcie_aer_root_get_vector(dev);
> +
> +    pcie_notify(dev, vector, trigger, level);
> +}
> +
> +/*
> + * Root port setup: make the enable bits of the Root Error Command register
> + * writable and install the root port error message handler.
> + */
> +void pcie_aer_root_init(PCIDevice *dev)
> +{
> +    uint8_t *cmd_wmask =
> +        dev->wmask + pcie_aer_cap(dev) + PCI_ERR_ROOT_COMMAND;
> +
> +    pci_set_long(cmd_wmask, PCI_ERR_ROOT_CMD_EN_MASK);
> +    dev->exp->aer_errmsg = pcie_aer_errmsg_root_port;
> +}
> +
> +void pcie_aer_root_reset(PCIDevice *dev)
> +{
> +    uint8_t* aer_cap = dev->config + pcie_aer_cap(dev);
> +
> +    pci_set_long(aer_cap + PCI_ERR_ROOT_COMMAND, 0);
> +
> +    /*
> +     * Advanced Error Interrupt Message Number in Root Error Status Register
> +     * must be updated by chip dependent code.
> +     */
> +}
> +
> +/*
> + * Return true when at least one error class is both enabled in the root
> + * command value @cmd and flagged as received in the root status value @sta.
> + */
> +static bool pcie_aer_root_does_trigger(uint32_t cmd, uint32_t sta)
> +{
> +    bool cor = (cmd & PCI_ERR_ROOT_CMD_COR_EN) &&
> +               (sta & PCI_ERR_ROOT_COR_RCV);
> +    bool nonfatal = (cmd & PCI_ERR_ROOT_CMD_NONFATAL_EN) &&
> +                    (sta & PCI_ERR_ROOT_NONFATAL_RCV);
> +    bool fatal = (cmd & PCI_ERR_ROOT_CMD_FATAL_EN) &&
> +                 (sta & PCI_ERR_ROOT_FATAL_RCV);
> +
> +    return cor || nonfatal || fatal;
> +}
> +
> +/*
> + * Config write hook for the root port AER registers.
> + *
> + * @addr, @val, @len describe the config space write; @root_cmd_prev is the
> + * Root Error Command value from before the write.  A write touching Root
> + * Error Status is handled as write-1-to-clear.  A write touching Root Error
> + * Command, with at least one enable bit set afterwards, re-evaluates the
> + * interrupt: "trigger" for newly enabled classes that already have a
> + * pending status bit, "level" for any enabled class with a pending bit.
> + */
> +void pcie_aer_root_write_config(PCIDevice *dev,
> +                                uint32_t addr, uint32_t val, int len,
> +                                uint32_t root_cmd_prev)
> +{
> +    uint16_t pos = pcie_aer_cap(dev);
> +    uint8_t *aer_cap = dev->config + pos;
> +    uint32_t root_cmd;
> +    uint32_t newly_enabled;
> +    uint32_t root_status;
> +    bool trigger;
> +    int level;
> +
> +    if (ranges_overlap(addr, len, pos + PCI_ERR_ROOT_STATUS, 4)) {
> +        /* RW1CS */
> +        pcie_w1c_long(dev, pos + PCI_ERR_ROOT_STATUS,
> +                      PCI_ERR_ROOT_STATUS_REPORT_MASK, addr, val);
> +    }
> +
> +    /* root command */
> +    if (!ranges_overlap(addr, len, pos + PCI_ERR_ROOT_COMMAND, 4)) {
> +        return;
> +    }
> +    root_cmd = pci_get_long(aer_cap + PCI_ERR_ROOT_COMMAND);
> +    if (!(root_cmd & PCI_ERR_ROOT_CMD_EN_MASK)) {
> +        return;
> +    }
> +
> +    /* bits that went 0 -> 1 */
> +    newly_enabled = root_cmd & ~root_cmd_prev;
> +    root_status = pci_get_long(aer_cap + PCI_ERR_ROOT_STATUS);
> +    trigger = pcie_aer_root_does_trigger(newly_enabled, root_status);
> +    level = pcie_aer_root_does_trigger(root_cmd, root_status) ? 1 : 0;
> +    pcie_aer_root_notify(dev, trigger, level);
> +}
> +
> +static const VMStateDescription vmstate_pcie_aer_err = { /* one struct pcie_aer_err record */
> +    .name = "PCIE_AER_ERROR",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .minimum_version_id_old = 1,
> +    .fields     = (VMStateField[]) { /* every member of struct pcie_aer_err, in declaration order */
> +        VMSTATE_UINT32(status, struct pcie_aer_err),
> +        VMSTATE_UINT16(source_id, struct pcie_aer_err),
> +        VMSTATE_UINT16(flags, struct pcie_aer_err),
> +        VMSTATE_UINT32_ARRAY(header, struct pcie_aer_err, 4), /* header[4] */
> +        VMSTATE_UINT32_ARRAY(prefix, struct pcie_aer_err, 4), /* prefix[4] */
> +        VMSTATE_END_OF_LIST()
> +    }
> +};
> +
> +#define VMSTATE_PCIE_AER_ERRS(_field, _state, _field_num, _vmsd, _type) { \
> +    .name       = (stringify(_field)),         /* field name as a string */ \
> +    .version_id = 0,                                                      \
> +    .num_offset = vmstate_offset_value(_state, _field_num, uint16_t),     \
> +    .size       = sizeof(_type),               /* size of one element */  \
> +    .vmsd       = &(_vmsd),                    /* layout of one element */ \
> +    .flags      = VMS_POINTER | VMS_VARRAY_UINT16 | VMS_STRUCT,           \
> +    .offset     = vmstate_offset_pointer(_state, _field, _type),          \
> +}
> +
> +const VMStateDescription vmstate_pcie_aer_log = { /* exported; declared extern in pcie.h */
> +    .name = "PCIE_AER_ERROR_LOG",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .minimum_version_id_old = 1,
> +    .fields     = (VMStateField[]) {
> +        VMSTATE_UINT32(producer, struct pcie_aer_log),
> +        VMSTATE_UINT32(consumer, struct pcie_aer_log),
> +        VMSTATE_UINT16(log_max, struct pcie_aer_log), /* listed before log, whose element count it supplies */
> +        VMSTATE_PCIE_AER_ERRS(log, struct pcie_aer_log, log_max,
> +                              vmstate_pcie_aer_err, struct pcie_aer_err),
> +        VMSTATE_END_OF_LIST()
> +    }
> +};
> diff --git a/hw/pcie.h b/hw/pcie.h
> new file mode 100644
> index 0000000..07f42c6
> --- /dev/null
> +++ b/hw/pcie.h
> @@ -0,0 +1,186 @@
> +/*
> + * pcie.h
> + *
> + * Copyright (c) 2010 Isaku Yamahata <yamahata at valinux co jp>
> + *                    VA Linux Systems Japan K.K.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#ifndef QEMU_PCIE_H
> +#define QEMU_PCIE_H
> +
> +#include "hw.h"
> +
> +/*
> + * Two-bit indicator states.  Written in hex rather than 0b... because
> + * binary literals are a GCC extension that older compilers reject.
> + */
> +enum PCIExpressIndicator {
> +    /* for attention and power indicator */
> +    PCI_EXP_HP_IND_RESERVED     = 0x0,
> +    PCI_EXP_HP_IND_ON           = 0x1,
> +    PCI_EXP_HP_IND_BLINK        = 0x2,
> +    PCI_EXP_HP_IND_OFF          = 0x3,
> +};
> +
> +/*
> + * Hot-plug events.  Written in hex rather than 0b... because binary
> + * literals are a GCC extension that older compilers reject.
> + */
> +enum PCIExpressHotPlugEvent {
> +    /* the bits match the bits in Slot Control/Status registers.
> +     * PCI_EXP_HP_EV_xxx = PCI_EXP_SLTCTL_xxxE = PCI_EXP_SLTSTA_xxx
> +     */
> +    PCI_EXP_HP_EV_ABP   = 0x01,         /* attention button pressed */
> +    PCI_EXP_HP_EV_PDC   = 0x08,         /* presence detect changed */
> +    PCI_EXP_HP_EV_CCI   = 0x10,         /* command completed */
> +
> +    PCI_EXP_HP_EV_SUPPORTED     = 0x19,          /* supported event mask  */
> +    /* events not listed aren't supported */
> +};
> +
> +typedef void (*pcie_flr_fn)(PCIDevice *dev);
> +
> +/* defined further down in this header; only a pointer is needed here */
> +struct pcie_aer_err_msg;
> +
> +/* result reported by an AER error message handler */
> +typedef enum AER_ERR_MSG_RESULT {
> +    AER_ERR_MSG_MASKED = 0,
> +    AER_ERR_MSG_SENT = 1,
> +} AER_ERR_MSG_RESULT;
> +
> +/* handler that processes one error message on behalf of @dev */
> +typedef AER_ERR_MSG_RESULT (*pcie_aer_errmsg_fn)(
> +    PCIDevice *dev, const struct pcie_aer_err_msg *msg);
> +
> +struct PCIExpressDevice {
> +    /* Offset of express capability in config space */
> +    uint8_t exp_cap;
> +
> +    /* FLR: callback of type pcie_flr_fn, which takes the PCIDevice */
> +    pcie_flr_fn flr;
> +
> +    /* AER: offset of the AER extended capability in config space
> +     * (pcie_aer_inject_error() treats 0 as "no AER capability"),
> +     * followed by the error message handler; pcie_aer_root_init()
> +     * installs the root port handler here. */
> +    uint16_t aer_cap;
> +    pcie_aer_errmsg_fn aer_errmsg;
> +};
> +
> +struct pcie_aer_log {
> +    uint32_t producer;  /* queue index; compared with consumer to detect an empty queue */
> +    uint32_t consumer;  /* queue index where walking the queued errors starts */
> +
> +#define PCIE_AER_LOG_MAX_DEFAULT        8
> +#define PCIE_AER_LOG_MAX_MAX            128 /* what is appropriate? */
> +#define PCIE_AER_LOG_MAX_UNSET          (~(uint16_t)0)
> +    uint16_t log_max;   /* passed to pcie_aer_log_next() when advancing an index; also the migrated element count of log */
> +
> +    struct pcie_aer_err *log;   /* array of queued errors, indexed by producer/consumer */
> +};
> +
> +extern const VMStateDescription vmstate_pcie_aer_log;
> +
> +/* PCI express capability helper functions */
> +int pci_pcie_cap_init(PCIDevice *dev,
> +                      uint8_t offset, uint8_t type, uint8_t port);
> +int pci_pcie_cap_exit(PCIDevice *dev);
> +uint8_t pcie_cap_get_type(const PCIDevice *dev);
> +void pcie_cap_flags_set_vector(PCIDevice *dev, uint8_t vector);
> +uint8_t pcie_cap_flags_get_vector(PCIDevice *dev);
> +
> +void pcie_cap_deverr_init(PCIDevice *dev);
> +void pcie_cap_deverr_reset(PCIDevice *dev);
> +void pcie_cap_deverr_write_config(PCIDevice *dev,
> +                                  uint32_t addr, uint32_t val, int len);
> +
> +void pcie_cap_slot_init(PCIDevice *dev, uint16_t slot);
> +void pcie_cap_slot_reset(PCIDevice *dev);
> +void pcie_cap_slot_write_config(PCIDevice *dev,
> +                                uint32_t addr, uint32_t val, int len,
> +                                uint16_t sltctl_prev);
> +void pcie_cap_slot_push_attention_button(PCIDevice *dev);
> +
> +void pcie_cap_root_init(PCIDevice *dev);
> +void pcie_cap_root_reset(PCIDevice *dev);
> +
> +void pcie_cap_flr_init(PCIDevice *dev, pcie_flr_fn flr);
> +void pcie_cap_flr_write_config(PCIDevice *dev,
> +                           uint32_t addr, uint32_t val, int len);
> +
> +void pcie_cap_ari_init(PCIDevice *dev);
> +void pcie_cap_ari_reset(PCIDevice *dev);
> +bool pcie_cap_is_ari_enabled(const PCIDevice *dev);
> +
> +/* PCI express extended capability helper functions */
> +uint16_t pcie_find_ext_capability(PCIDevice *dev, uint16_t cap_id);
> +int pcie_add_ext_capability(PCIDevice *dev,
> +                            uint16_t cap_id, uint8_t cap_ver, uint16_t size);
> +int pcie_append_ext_capability(PCIDevice *dev,
> +                               uint16_t cap_id, uint8_t cap_ver,
> +                               uint16_t offset, uint16_t size);
> +void pcie_del_ext_capability(PCIDevice *dev, uint16_t cap_id, uint16_t size);
> +void pcie_reserve_ext_capability(PCIDevice *dev,
> +                                 uint16_t offset, uint16_t size);
> +
> +int pcie_ari_init(PCIDevice *dev, uint16_t offset, uint16_t nextfn);
> +
> +/* PCI express extended capabilities */
> +
> +/* AER */
> +/* aer error severity */
> +enum PCIE_AER_SEVERITY {
> +    /* These values are the same as the corresponding bits in the
> +     * Root Error Command register of the AER extended capability and in
> +     * the Root Control register of the PCI Express capability, so a
> +     * severity can be ANDed directly with those register values.
> +     */
> +    AER_ERR_COR         = 0x1,
> +    AER_ERR_NONFATAL    = 0x2,
> +    AER_ERR_FATAL       = 0x4,
> +};
> +
> +/* AER error message: an error signaling message carries only the error
> +   severity and the source id. See 2.2.8.3 Error Signaling Messages. */
> +struct pcie_aer_err_msg {
> +    enum PCIE_AER_SEVERITY severity;
> +    uint16_t source_id; /* bdf */
> +};
> +
> +static inline bool
> +pcie_aer_err_msg_is_uncor(const struct pcie_aer_err_msg *msg)
> +{
> +    return msg->severity == AER_ERR_NONFATAL || msg->severity == 
> AER_ERR_FATAL;
> +}
> +
> +/* description of one error, as passed to pcie_aer_inject_error() */
> +struct pcie_aer_err {
> +    uint32_t status;    /* error status bits; injection requires exactly one bit set */
> +    uint16_t source_id; /* bdf; copied into the error message that is sent */
> +
> +#define PCIE_AER_ERR_IS_CORRECTABLE     0x1     /* set: correctable, clear: uncorrectable */
> +#define PCIE_AER_ERR_MAYBE_ADVISORY     0x2     /* maybe advisory non-fatal */
> +#define PCIE_AER_ERR_HEADER_VALID       0x4     /* TLP header is logged */
> +#define PCIE_AER_ERR_TLP_PRESENT        0x8     /* TLP Prefix is logged */
> +    uint16_t flags;     /* combination of the PCIE_AER_ERR_xxx bits above */
> +
> +    uint32_t header[4]; /* TLP header */
> +    uint32_t prefix[4]; /* TLP header prefix */
> +};
> +
> +int pcie_aer_init(PCIDevice *dev, uint16_t offset);
> +void pcie_aer_exit(PCIDevice *dev);
> +void pcie_aer_write_config(PCIDevice *dev,
> +                           uint32_t addr, uint32_t val, int len);
> +void pcie_aer_write_config_vbridge(PCIDevice *dev,
> +                                   uint32_t addr, uint32_t val, int len);
> +
> +/* aer root port */
> +void pcie_aer_root_set_vector(PCIDevice *dev, uint8_t vector);
> +void pcie_aer_root_init(PCIDevice *dev);
> +void pcie_aer_root_reset(PCIDevice *dev);
> +void pcie_aer_root_write_config(PCIDevice *dev,
> +                                uint32_t addr, uint32_t val, int len,
> +                                uint32_t root_cmd_prev);
> +
> +/* error injection */
> +void pcie_aer_inject_error(PCIDevice *dev, const struct pcie_aer_err *err);
> +
> +#endif /* QEMU_PCIE_H */
> diff --git a/qemu-common.h b/qemu-common.h
> index d735235..6d9ee26 100644
> --- a/qemu-common.h
> +++ b/qemu-common.h
> @@ -219,6 +219,7 @@ typedef struct PCIHostState PCIHostState;
>  typedef struct PCIExpressHost PCIExpressHost;
>  typedef struct PCIBus PCIBus;
>  typedef struct PCIDevice PCIDevice;
> +typedef struct PCIExpressDevice PCIExpressDevice;
>  typedef struct PCIBridge PCIBridge;
>  typedef struct SerialState SerialState;
>  typedef struct IRQState *qemu_irq;
Blue Swirl Sept. 12, 2010, 7:49 a.m. UTC | #3
On Wed, Sep 8, 2010 at 5:38 PM, Wei Xu <wexu2@cisco.com> wrote:
> Isaku:
>
> For binary constants below, to achieve max compatibility with gcc versions,
> I recommend to change to hex (0x...):

Yes, binary constants were only added to GCC 4.3.x. Since they are
also GCC extensions with no obvious way to circumvent their use (as
with GCC attributes), they shouldn't be used.
Michael S. Tsirkin Sept. 12, 2010, 1:26 p.m. UTC | #4
On Wed, Sep 08, 2010 at 04:39:35PM +0900, Isaku Yamahata wrote:
> +#define PCI_EXP_SLTCTL_AIC_SHIFT        6
> +#define PCI_EXP_SLTCTL_AIC_ON           (PCI_EXP_HP_IND_ON << PCI_EXP_SLTCTL_AIC_SHIFT)
> +#define PCI_EXP_SLTCTL_AIC_BLINK        (PCI_EXP_HP_IND_BLINK << PCI_EXP_SLTCTL_AIC_SHIFT)
> +#define PCI_EXP_SLTCTL_AIC_OFF          (PCI_EXP_HP_IND_OFF << PCI_EXP_SLTCTL_AIC_SHIFT)
> +
> +#define PCI_EXP_SLTCTL_PIC_SHIFT        8
> +#define PCI_EXP_SLTCTL_PIC_ON           (PCI_EXP_HP_IND_ON << PCI_EXP_SLTCTL_PIC_SHIFT)
> +#define PCI_EXP_SLTCTL_PIC_BLINK        (PCI_EXP_HP_IND_BLINK << PCI_EXP_SLTCTL_PIC_SHIFT)
> +#define PCI_EXP_SLTCTL_PIC_OFF          (PCI_EXP_HP_IND_OFF << PCI_EXP_SLTCTL_PIC_SHIFT)

It might be better to simply define the 6 macros we are using directly.
The duplication here is minimal, and I guess it will be easier to get
them into Linux this way.
Isaku Yamahata Sept. 15, 2010, 5:50 a.m. UTC | #5
On Wed, Sep 08, 2010 at 01:31:22PM +0300, Michael S. Tsirkin wrote:
> > +
> > +static void pcie_notify(PCIDevice *dev, uint16_t vector,
> > +                        bool trigger, int level)
> > +{
> > +    /* masking/unmasking interrupt is handled by upper layer.
> > +     * i.e. msix_notify() for MSI-X
> > +     *      msi_notify()  for MSI
> > +     *      pci_set_irq() for INTx
> > +     */
> 
> So this will send another interrupt when level is 0?

Yes. The condition that triggers MSI-X/MSI can be different from
the one that asserts INTx as you can see it in the following code.
trigger and level are set independently.
Michael S. Tsirkin Sept. 15, 2010, 1:05 p.m. UTC | #6
On Wed, Sep 15, 2010 at 02:50:01PM +0900, Isaku Yamahata wrote:
> On Wed, Sep 08, 2010 at 01:31:22PM +0300, Michael S. Tsirkin wrote:
> > > +
> > > +static void pcie_notify(PCIDevice *dev, uint16_t vector,
> > > +                        bool trigger, int level)
> > > +{
> > > +    /* masking/unmasking interrupt is handled by upper layer.
> > > +     * i.e. msix_notify() for MSI-X
> > > +     *      msi_notify()  for MSI
> > > +     *      pci_set_irq() for INTx
> > > +     */
> > 
> > So this will send another interrupt when level is 0?
> 
> Yes. The condition that triggers MSI-X/MSI can be different from
> the one that asserts INTx as you can see it in the following code.
> trigger and level are set independently.

Looks like a bug ... but we are better off splitting this
to assert/deassert case as I suggested separately, anyway.

> -- 
> yamahata
Isaku Yamahata Sept. 19, 2010, 4:11 a.m. UTC | #7
On Wed, Sep 15, 2010 at 03:05:13PM +0200, Michael S. Tsirkin wrote:
> On Wed, Sep 15, 2010 at 02:50:01PM +0900, Isaku Yamahata wrote:
> > On Wed, Sep 08, 2010 at 01:31:22PM +0300, Michael S. Tsirkin wrote:
> > > > +
> > > > +static void pcie_notify(PCIDevice *dev, uint16_t vector,
> > > > +                        bool trigger, int level)
> > > > +{
> > > > +    /* masking/unmasking interrupt is handled by upper layer.
> > > > +     * i.e. msix_notify() for MSI-X
> > > > +     *      msi_notify()  for MSI
> > > > +     *      pci_set_irq() for INTx
> > > > +     */
> > > 
> > > So this will send another interrupt when level is 0?
> > 
> > Yes. The condition that triggers MSI-X/MSI can be different from
> > the one that asserts INTx as you can see it in the following code.
> > trigger and level are set independently.
> 
> Looks like a bug ...

No. It can and the spec requires it. The mode of INTx and MSI is exclusive.
I think that it's quite reasonable to assume the basic knowledge
of express. For example

From 6.7.3.4. Software Notification of Hot-Plug Events

> If the Port is enabled for level-triggered interrupt signaling using
> the INTx messages, the virtualization INTx wire must be asserted whenever
> and as long as the following conditions are satisfied:

and the list of conditions..

> If the Port is enabled for edge-triggered interrupt signaling using
> MSI or MSI-X, an interrupt message must be sent every time the logical
> AND of the following conditions transitions from FALSE to TRUE:

and the list of conditions.
Michael S. Tsirkin Sept. 19, 2010, 11:51 a.m. UTC | #8
On Sun, Sep 19, 2010 at 01:11:21PM +0900, Isaku Yamahata wrote:
> On Wed, Sep 15, 2010 at 03:05:13PM +0200, Michael S. Tsirkin wrote:
> > On Wed, Sep 15, 2010 at 02:50:01PM +0900, Isaku Yamahata wrote:
> > > On Wed, Sep 08, 2010 at 01:31:22PM +0300, Michael S. Tsirkin wrote:
> > > > > +
> > > > > +static void pcie_notify(PCIDevice *dev, uint16_t vector,
> > > > > +                        bool trigger, int level)
> > > > > +{
> > > > > +    /* masking/unmasking interrupt is handled by upper layer.
> > > > > +     * i.e. msix_notify() for MSI-X
> > > > > +     *      msi_notify()  for MSI
> > > > > +     *      pci_set_irq() for INTx
> > > > > +     */
> > > > 
> > > > So this will send another interrupt when level is 0?
> > > 
> > > Yes. The condition that triggers MSI-X/MSI can be different from
> > > the one that asserts INTx as you can see it in the following code.
> > > trigger and level are set independently.
> > 
> > Looks like a bug ...
> 
> No. It can and the spec requires it. The mode of INTx and MSI is exclusive.
> I think that it's quite reasonable to assume the basic knowledge
> of express. For example
> 
> From 6.7.3.4. Software Notification of Hot-Plug Events
> 
> > If the Port is enabled for level-triggered interrupt signaling using
> > the INTx messages, the virtualization INTx wire must be asserted whenever
> > and as long as the following conditions are satisfied:
> 
> and the list of conditions..
> 
> > If the Port is enabled for edge-triggered interrupt signaling using
> > MSI or MSI-X, an interrupt message must be sent every time the logical
> > AND of the following conditions transitions from FALSE to TRUE:
> 
> and the list of conditions.

I guess I just don't seem to be able to map the code to spec.
I don't understand what trigger and level are.

I think it would become clearer if we had two functions,
assert and deassert, instead of attempting to encode
it all as level and trigger.

Deassert would simply do nothing for msi/msix.

> yamahata
diff mbox

Patch

diff --git a/Makefile.objs b/Makefile.objs
index 5f5a4c5..eeb5134 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -186,6 +186,7 @@  hw-obj-$(CONFIG_PIIX4) += piix4.o
 # PCI watchdog devices
 hw-obj-y += wdt_i6300esb.o
 
+hw-obj-y += pcie.o
 hw-obj-y += msix.o msi.o
 
 # PCI network cards
diff --git a/hw/pci.h b/hw/pci.h
index 296c7ba..bccab3a 100644
--- a/hw/pci.h
+++ b/hw/pci.h
@@ -9,6 +9,8 @@ 
 /* PCI includes legacy ISA access.  */
 #include "isa.h"
 
+#include "pcie.h"
+
 /* PCI bus */
 
 #define PCI_DEVFN(slot, func)   ((((slot) & 0x1f) << 3) | ((func) & 0x07))
@@ -172,6 +174,12 @@  struct PCIDevice {
     /* Offset of MSI capability in config space */
     uint8_t msi_cap;
 
+    /* PCI Express */
+    PCIExpressDevice *exp;
+    /* Theoretically this belongs to  PCIExpressDevice.
+       However it is here for property and save/load */
+    struct pcie_aer_log aer_log;
+
     /* Location of option rom */
     char *romfile;
     ram_addr_t rom_offset;
@@ -367,6 +375,22 @@  static inline uint32_t pci_config_size(const PCIDevice *d)
     return pci_is_express(d) ? PCIE_CONFIG_SPACE_SIZE : PCI_CONFIG_SPACE_SIZE;
 }
 
+
+/* These are pci express specific, so should belong to pcie.h.
+   they're here to avoid header inclusion error. */
+/* offset of the PCI Express capability, or 0 when d has no express state */
+static inline uint8_t pci_pcie_cap(const PCIDevice *d)
+{
+    if (!d->exp) {
+        return 0;
+    }
+    return d->exp->exp_cap;
+}
+
+/* AER: offset of the AER capability; d must have express state */
+static inline uint16_t pcie_aer_cap(const PCIDevice *d)
+{
+    const PCIExpressDevice *exp;
+
+    assert(d->exp);
+    exp = d->exp;
+    return exp->aer_cap;
+}
+
+
 /* These are not pci specific. Should move into a separate header.
  * Only pci.c uses them, so keep them here for now.
  */
diff --git a/hw/pcie.c b/hw/pcie.c
new file mode 100644
index 0000000..1f24c2a
--- /dev/null
+++ b/hw/pcie.c
@@ -0,0 +1,1668 @@ 
+/*
+ * pcie.c
+ *
+ * Copyright (c) 2010 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "sysemu.h"
+#include "pci_bridge.h"
+#include "pcie.h"
+#include "msix.h"
+#include "msi.h"
+#include "pci_internals.h"
+
+//#define DEBUG_PCIE
+#ifdef DEBUG_PCIE
+# define PCIE_DPRINTF(fmt, ...)                                         \
+    fprintf(stderr, "%s:%d " fmt, __func__, __LINE__, ## __VA_ARGS__)
+#else
+# define PCIE_DPRINTF(fmt, ...) do {} while (0)
+#endif
+#define PCIE_DEV_PRINTF(dev, fmt, ...)                                  \
+    PCIE_DPRINTF("%s:%x "fmt, (dev)->name, (dev)->devfn, ## __VA_ARGS__)
+
+static inline const char *pcie_hp_event_name(enum PCIExpressHotPlugEvent event)
+{
+    switch (event) {
+    case PCI_EXP_HP_EV_ABP:
+        return "attention button pushed";
+    case PCI_EXP_HP_EV_PDC:
+        return "present detection changed";
+    case PCI_EXP_HP_EV_CCI:
+        return "command completed";
+    default:
+        break;
+    }
+    return "Unknown event";
+}
+
+static void pcie_aer_clear_error(PCIDevice *dev);
+static void pcie_aer_root_notify(PCIDevice *dev, bool trigger, int level);
+static AER_ERR_MSG_RESULT
+pcie_aer_errmsg_alldev(PCIDevice *dev, const struct pcie_aer_err_msg *msg);
+static AER_ERR_MSG_RESULT
+pcie_aer_errmsg_vbridge(PCIDevice *dev, const struct pcie_aer_err_msg *msg);
+
+/***************************************************************************
+ * pci express capability helper functions
+ */
+
+#define PCI_EXP_VER2_SIZEOF     0x3c    /* express capability of version 2 */
+
+/* PCI_EXP_FLAGS */
+#define PCI_EXP_FLAGS_VER2      2       /* for now, supports only version 2 */
+#define PCI_EXP_FLAGS_IRQ_SHIFT 9
+#define PCI_EXP_FLAGS_IRQ_REG(irq)      (((irq) << PCI_EXP_FLAGS_IRQ_SHIFT) & PCI_EXP_FLAGS_IRQ)
+#define PCI_EXP_FLAGS_TYPE_SHIFT        4
+
+/* PCI_EXP_LINK{CAP, STA} */
+/* link speed */
+#define PCI_EXP_LNK_LS_25               1
+
+#define PCI_EXP_LNK_MLW_SHIFT           4
+#define PCI_EXP_LNK_MLW_1               (1 << PCI_EXP_LNK_MLW_SHIFT)
+
+/* PCI_EXP_LINKCAP */
+#define PCI_EXP_LNKCAP_ASPMS_SHIFT      10
+#define PCI_EXP_LNKCAP_ASPMS_0S         (1 << PCI_EXP_LNKCAP_ASPMS_SHIFT)
+
+#define PCI_EXP_LNKCAP_PN_SHIFT         24
+#define PCI_EXP_LNKCAP_PN_REG(pn)       (((pn) << PCI_EXP_LNKCAP_PN_SHIFT) & PCI_EXP_LNKCAP_PN)
+
+#define PCI_EXP_SLTCAP_PSN_SHIFT        19
+#define PCI_EXP_SLTCAP_PSN_REG(slot)    (((slot) << PCI_EXP_SLTCAP_PSN_SHIFT) & PCI_EXP_SLTCAP_PSN)
+
+#define PCI_EXP_SLTCTL_AIC_SHIFT        6
+#define PCI_EXP_SLTCTL_AIC_ON           (PCI_EXP_HP_IND_ON << PCI_EXP_SLTCTL_AIC_SHIFT)
+#define PCI_EXP_SLTCTL_AIC_BLINK        (PCI_EXP_HP_IND_BLINK << PCI_EXP_SLTCTL_AIC_SHIFT)
+#define PCI_EXP_SLTCTL_AIC_OFF          (PCI_EXP_HP_IND_OFF << PCI_EXP_SLTCTL_AIC_SHIFT)
+
+#define PCI_EXP_SLTCTL_PIC_SHIFT        8
+#define PCI_EXP_SLTCTL_PIC_ON           (PCI_EXP_HP_IND_ON << PCI_EXP_SLTCTL_PIC_SHIFT)
+#define PCI_EXP_SLTCTL_PIC_BLINK        (PCI_EXP_HP_IND_BLINK << PCI_EXP_SLTCTL_PIC_SHIFT)
+#define PCI_EXP_SLTCTL_PIC_OFF          (PCI_EXP_HP_IND_OFF << PCI_EXP_SLTCTL_PIC_SHIFT)
+
+#define PCI_EXP_DEVCAP2_EFF             0x100000
+#define PCI_EXP_DEVCAP2_EETLPP          0x200000
+
+#define PCI_EXP_DEVCTL2_EETLPPB         0x80
+
+/*
+ * Deliver a PCI Express interrupt through whichever mechanism is active.
+ *
+ * @vector:  message number handed to msix_notify()/msi_notify()
+ * @trigger: true when a new message should be sent (MSI-X/MSI only)
+ * @level:   level driven on dev->irq[0] when neither MSI-X nor MSI is enabled
+ */
+static void pcie_notify(PCIDevice *dev, uint16_t vector,
+                        bool trigger, int level)
+{
+    /* masking/unmasking interrupt is handled by upper layer.
+     * i.e. msix_notify() for MSI-X
+     *      msi_notify()  for MSI
+     *      pci_set_irq() for INTx
+     */
+    PCIE_DEV_PRINTF(dev, "notify vector %d trigger:%d level:%d\n",
+                    vector, trigger, level);
+    if (msix_enabled(dev)) {
+        if (trigger) {
+            msix_notify(dev, vector);
+        }
+    } else if (msi_enabled(dev)) {
+        if (trigger) {
+            msi_notify(dev, vector);
+        }
+    } else {
+        qemu_set_irq(dev->irq[0], level);
+    }
+}
+
+/*
+ * Align the value of a config write to a register.
+ *
+ * @addr: byte offset in config space where the write starts
+ * @val:  value being written at @addr
+ * @pos:  byte offset in config space of the register of interest
+ *
+ * Returns @val as seen from @pos.  @addr and @pos are byte offsets, so the
+ * value moves by 8 bits per byte of distance; a distance of 4 bytes or more
+ * leaves no written bit inside the 32-bit window and yields 0.
+ */
+static inline uint32_t pcie_written_val_long(uint32_t addr, uint32_t val,
+                                             uint32_t pos)
+{
+    uint32_t bytes = addr >= pos ? addr - pos : pos - addr;
+
+    if (bytes >= sizeof(val)) {
+        /* shifting by >= 32 is undefined; nothing overlaps anyway */
+        return 0;
+    }
+    if (addr >= pos) {
+        val <<= bytes * 8;
+    } else {
+        val >>= bytes * 8;
+    }
+    return val;
+}
+
+static inline uint16_t pcie_written_val_word(uint32_t addr, uint32_t val,
+                                             uint32_t pos)
+{
+    return pcie_written_val_long(addr, val, pos) & 0xffff;
+}
+
+/*
+ * RW1C: Write-1-to-clear
+ * register     written val        result
+ * 0            0               => 0
+ * 1            0               => 1
+ * 0            1               => 0
+ * 1            1               => 0
+ *
+ * Clears, in the 32-bit register at config offset @pos, every bit of @mask
+ * that was written as 1.  @addr and @val describe the config write;
+ * pcie_written_val_long() aligns @val to the register before masking.
+ */
+static inline void pcie_w1c_long(PCIDevice *d, uint32_t pos, uint32_t mask,
+                                 uint32_t addr, uint32_t val)
+{
+    uint32_t written = pcie_written_val_long(addr, val, pos) & mask;
+    uint32_t reg = pci_get_long(d->config + pos);
+    reg &= ~written;
+    pci_set_long(d->config + pos, reg);
+}
+
+static inline void pcie_w1c_word(PCIDevice *d, uint32_t pos, uint16_t mask,
+                                 uint32_t addr, uint32_t val)
+{
+    uint16_t written = pcie_written_val_word(addr, val, pos) & mask;
+    uint16_t reg = pci_get_word(d->config + pos);
+    reg &= ~written;
+    pci_set_word(d->config + pos, reg);
+}
+
+/*
+ * Allocate the PCI Express capability (version 2 layout) and fill in its
+ * default register values.
+ *
+ * @offset: config space offset passed on to pci_add_capability()
+ * @type:   value stored in the type field of PCI_EXP_FLAGS
+ * @port:   value stored in the port number field of PCI_EXP_LNKCAP
+ *
+ * Returns the capability offset, or the negative value reported by
+ * pci_add_capability(); in the latter case dev->exp is freed and set to NULL.
+ */
+int pci_pcie_cap_init(PCIDevice *dev,
+                      uint8_t offset, uint8_t type, uint8_t port)
+{
+    int exp_cap;
+    uint8_t *pcie_cap;
+
+    assert(pci_is_express(dev));
+    dev->exp = qemu_mallocz(sizeof(*dev->exp));
+
+    exp_cap = pci_add_capability(dev, PCI_CAP_ID_EXP, offset,
+                                 PCI_EXP_VER2_SIZEOF);
+    if (exp_cap < 0) {
+        qemu_free(dev->exp);
+        dev->exp = NULL;
+        return exp_cap;
+    }
+    dev->exp->exp_cap = exp_cap;
+    /* dev->cap_present |= QEMU_PCI_CAP_EXPRESS; */ /* already done in pci_qdev_init() */
+
+    pcie_cap = dev->config + pci_pcie_cap(dev);
+
+    /* capability register
+       interrupt message number defaults to 0 */
+    pci_set_word(pcie_cap + PCI_EXP_FLAGS,
+                 ((type << PCI_EXP_FLAGS_TYPE_SHIFT) & PCI_EXP_FLAGS_TYPE) |
+                 PCI_EXP_FLAGS_VER2);
+
+    /* device capability register
+     * table 7-12:
+     * roll based error reporting bit must be set by all
+     * Functions conforming to the ECN, PCI Express Base
+     * Specification, Revision 1.1., or subsequent PCI Express Base
+     * Specification revisions.
+     */
+    pci_set_long(pcie_cap + PCI_EXP_DEVCAP, PCI_EXP_DEVCAP_RBER);
+
+    /* widen port before the 24-bit shift so it is not done in signed int */
+    pci_set_long(pcie_cap + PCI_EXP_LNKCAP,
+                 PCI_EXP_LNKCAP_PN_REG((uint32_t)port) |
+                 PCI_EXP_LNKCAP_ASPMS_0S |
+                 PCI_EXP_LNK_MLW_1 |
+                 PCI_EXP_LNK_LS_25);
+
+    pci_set_word(pcie_cap + PCI_EXP_LNKSTA,
+                 PCI_EXP_LNK_MLW_1 | PCI_EXP_LNK_LS_25);
+
+    pci_set_long(pcie_cap + PCI_EXP_DEVCAP2,
+                 PCI_EXP_DEVCAP2_EFF | PCI_EXP_DEVCAP2_EETLPP);
+
+    /* PCI_EXP_DEVCTL2_EETLPPB is a Device Control 2 bit, so the write mask
+       belongs at the DEVCTL2 offset, not at the start of the capability */
+    pci_set_word(dev->wmask + exp_cap + PCI_EXP_DEVCTL2,
+                 PCI_EXP_DEVCTL2_EETLPPB);
+    return exp_cap;
+}
+
+/*
+ * Release the PCI Express state allocated by pci_pcie_cap_init().
+ * Always returns 0.
+ */
+int pci_pcie_cap_exit(PCIDevice *dev)
+{
+    /* pci_del_capability(dev, PCI_CAP_ID_EXP, PCI_EXP_VER2_SIZEOF); */
+    qemu_free(dev->exp);
+    /* pci_pcie_cap() tests dev->exp, so do not leave it dangling */
+    dev->exp = NULL;
+    return 0;
+}
+
+uint8_t pcie_cap_get_type(const PCIDevice *dev)
+{
+    uint32_t pos = pci_pcie_cap(dev);
+    assert(pos > 0);
+    return (pci_get_word(dev->config + pos + PCI_EXP_FLAGS) &
+            PCI_EXP_FLAGS_TYPE) >> PCI_EXP_FLAGS_TYPE_SHIFT;
+}
+
+/* MSI/MSI-X */
+/* pci express interrupt message number */
+/*
+ * Store @vector in the interrupt message number field of PCI_EXP_FLAGS.
+ * The field is 5 bits wide, so only 0..31 can be represented; 32 would be
+ * silently masked to 0 by PCI_EXP_FLAGS_IRQ_REG().
+ */
+void pcie_cap_flags_set_vector(PCIDevice *dev, uint8_t vector)
+{
+    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
+    uint16_t tmp;
+
+    assert(vector < 32);
+    tmp = pci_get_word(pcie_cap + PCI_EXP_FLAGS);
+    tmp &= ~PCI_EXP_FLAGS_IRQ;
+    tmp |= PCI_EXP_FLAGS_IRQ_REG(vector);
+    pci_set_word(pcie_cap + PCI_EXP_FLAGS, tmp);
+}
+
+uint8_t pcie_cap_flags_get_vector(PCIDevice *dev)
+{
+    return (pci_get_word(dev->config + pci_pcie_cap(dev) + PCI_EXP_FLAGS) &
+            PCI_EXP_FLAGS_IRQ) >> PCI_EXP_FLAGS_IRQ_SHIFT;
+}
+
+static void pcie_cap_notify(PCIDevice *dev, bool trigger, int level)
+{
+    pcie_notify(dev, pcie_cap_flags_get_vector(dev), trigger, level);
+}
+
+void pcie_cap_deverr_init(PCIDevice *dev)
+{
+    uint32_t pos = pci_pcie_cap(dev);
+    uint8_t *pcie_cap = dev->config + pos;
+    uint8_t *pcie_wmask = dev->wmask + pos;
+
+    pci_set_long(pcie_cap + PCI_EXP_DEVCAP,
+                 pci_get_long(pcie_cap + PCI_EXP_DEVCAP) |
+                 PCI_EXP_DEVCAP_RBER);
+
+    pci_set_long(pcie_wmask + PCI_EXP_DEVCTL,
+                 pci_get_long(pcie_wmask + PCI_EXP_DEVCTL) |
+                 PCI_EXP_DEVCTL_CERE | PCI_EXP_DEVCTL_NFERE |
+                 PCI_EXP_DEVCTL_FERE | PCI_EXP_DEVCTL_URRE);
+}
+
+void pcie_cap_deverr_reset(PCIDevice *dev)
+{
+    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
+    pci_set_long(pcie_cap + PCI_EXP_DEVCTL,
+                 pci_get_long(pcie_cap + PCI_EXP_DEVCTL) &
+                 ~(PCI_EXP_DEVCTL_CERE | PCI_EXP_DEVCTL_NFERE |
+                   PCI_EXP_DEVCTL_FERE | PCI_EXP_DEVCTL_URRE));
+}
+
+/*
+ * Config write hook for the device status error bits.
+ * The correctable, non-fatal, fatal and unsupported-request "detected"
+ * bits of PCI_EXP_DEVSTA are write-1-to-clear.
+ */
+void pcie_cap_deverr_write_config(PCIDevice *dev,
+                                  uint32_t addr, uint32_t val, int len)
+{
+    uint32_t pos = pci_pcie_cap(dev);
+    /* Device Status is a 16-bit register; do not touch its neighbour */
+    if (ranges_overlap(addr, len, pos + PCI_EXP_DEVSTA, 2)) {
+        /* RW1C */
+        pcie_w1c_word(dev, pos + PCI_EXP_DEVSTA,
+                      PCI_EXP_DEVSTA_CED | PCI_EXP_DEVSTA_NFED |
+                      PCI_EXP_DEVSTA_FED | PCI_EXP_DEVSTA_URD,
+                      addr, val);
+    }
+}
+
+/*
+ * Record a hot plug event in the slot status register and notify the guest.
+ *
+ * events: PCI_EXP_HP_EV_xxx
+ * status: bit or of PCI_EXP_SLTSTA_xxx
+ *
+ * A new message is triggered when hot plug interrupts are enabled, the
+ * event is enabled in slot control and its status bit goes 0 -> 1.
+ * The level is asserted while hot plug interrupts are enabled and an
+ * enabled event is pending, matching pcie_cap_slot_write_config().
+ */
+static void pcie_cap_slot_event(PCIDevice *dev,
+                                enum PCIExpressHotPlugEvent events,
+                                uint16_t status)
+{
+    bool trigger = false;
+    int level = 0;
+    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
+    uint16_t sltctl = pci_get_word(pcie_cap + PCI_EXP_SLTCTL);
+    uint16_t sltsta = pci_get_word(pcie_cap + PCI_EXP_SLTSTA);
+
+    PCIE_DEV_PRINTF(dev,
+                    "sltctl: 0x%02x sltsta: 0x%02x event:%x %s status:%d\n",
+                    sltctl, sltsta,
+                    events, pcie_hp_event_name(events), status);
+    events &= PCI_EXP_HP_EV_SUPPORTED;
+    if ((sltctl & PCI_EXP_SLTCTL_HPIE) && (sltctl & events) &&
+        ((sltsta ^ events) & events) /* 0 -> 1 */) {
+        trigger = true;
+    }
+
+    if (events & PCI_EXP_HP_EV_PDC) {
+        sltsta &= ~PCI_EXP_SLTSTA_PDS;
+        sltsta |= (status & PCI_EXP_SLTSTA_PDS);
+    }
+    sltsta |= events;
+    pci_set_word(pcie_cap + PCI_EXP_SLTSTA, sltsta);
+    PCIE_DEV_PRINTF(dev, "sltsta -> %02x\n", sltsta);
+
+    /* only events whose enable bit is set may keep the line asserted */
+    if ((sltctl & PCI_EXP_SLTCTL_HPIE) &&
+        (sltctl & sltsta & PCI_EXP_HP_EV_SUPPORTED)) {
+        level = 1;
+    }
+
+    pcie_cap_notify(dev, trigger, level);
+}
+
+static int pcie_cap_slot_hotplug(DeviceState *qdev,
+                                 PCIDevice *pci_dev, int state)
+{
+    PCIDevice *d = DO_UPCAST(PCIDevice, qdev, qdev);
+    uint8_t *pcie_cap = d->config + pci_pcie_cap(d);
+    uint16_t sltsta = pci_get_word(pcie_cap + PCI_EXP_SLTSTA);
+
+    if (!pci_dev->qdev.hotplugged) {
+        assert(state); /* this case only happens machine creation. */
+        sltsta |= PCI_EXP_SLTSTA_PDS;
+        pci_set_word(pcie_cap + PCI_EXP_SLTSTA, sltsta);
+        return 0;
+    }
+
+    PCIE_DEV_PRINTF(pci_dev, "hotplug state: %d\n", state);
+    if (sltsta & PCI_EXP_SLTSTA_EIS) {
+        /* the slot is electromechanically locked. */
+        return -EBUSY;
+    }
+
+    if (state) {
+        if (PCI_FUNC(pci_dev->devfn) == 0) {
+            /* event is per slot. Not per function
+             * only generates event for function = 0.
+             * When hot plug, populate functions > 0
+             * and then add function = 0 last.
+             */
+            pcie_cap_slot_event(d, PCI_EXP_HP_EV_PDC, PCI_EXP_SLTSTA_PDS);
+        }
+    } else {
+        PCIBridge *br;
+        PCIBus *bus;
+        DeviceState *next;
+        if (PCI_FUNC(pci_dev->devfn) != 0) {
+            /* event is per slot. Not per function.
+               accepts function = 0 only. */
+            return -EINVAL;
+        }
+
+        /* zap all functions. */
+        br = DO_UPCAST(PCIBridge, dev, d);
+        bus = pci_bridge_get_sec_bus(br);
+        QLIST_FOREACH_SAFE(qdev, &bus->qbus.children, sibling, next) {
+            qdev_free(qdev);
+        }
+
+        pcie_cap_slot_event(d, PCI_EXP_HP_EV_PDC, 0);
+    }
+    return 0;
+}
+
+/*
+ * pci express slot for pci express root/downstream port
+ * PCI express capability slot registers
+ *
+ * Marks the port as having a slot, programs the slot capabilities with
+ * physical slot number @slot, turns both indicators off, makes the
+ * indicator and hot plug enable bits of slot control guest-writable and
+ * registers pcie_cap_slot_hotplug() on the secondary bus.
+ */
+void pcie_cap_slot_init(PCIDevice *dev, uint16_t slot)
+{
+    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
+    uint8_t *pcie_wmask = dev->wmask + pci_pcie_cap(dev);
+    uint32_t tmp;
+
+    pci_set_word(pcie_cap + PCI_EXP_FLAGS,
+                 pci_get_word(pcie_cap + PCI_EXP_FLAGS) | PCI_EXP_FLAGS_SLOT);
+
+    tmp = pci_get_long(pcie_cap + PCI_EXP_SLTCAP);
+    /* drop the old slot number, keep every other capability bit */
+    tmp &= ~PCI_EXP_SLTCAP_PSN;
+    tmp |=
+        PCI_EXP_SLTCAP_PSN_REG((uint32_t)slot) | /* shift as unsigned */
+        PCI_EXP_SLTCAP_EIP |
+        PCI_EXP_SLTCAP_HPS |
+        PCI_EXP_SLTCAP_HPC |
+        PCI_EXP_SLTCAP_PIP |
+        PCI_EXP_SLTCAP_AIP |
+        PCI_EXP_SLTCAP_ABP;
+    pci_set_long(pcie_cap + PCI_EXP_SLTCAP, tmp);
+
+    tmp = pci_get_word(pcie_cap + PCI_EXP_SLTCTL);
+    tmp &= ~(PCI_EXP_SLTCTL_PIC | PCI_EXP_SLTCTL_AIC);
+    tmp |= PCI_EXP_SLTCTL_PIC_OFF | PCI_EXP_SLTCTL_AIC_OFF;
+    pci_set_word(pcie_cap + PCI_EXP_SLTCTL, tmp);
+    pci_set_word(pcie_wmask + PCI_EXP_SLTCTL,
+                 pci_get_word(pcie_wmask + PCI_EXP_SLTCTL) |
+                 PCI_EXP_SLTCTL_PIC |
+                 PCI_EXP_SLTCTL_AIC |
+                 PCI_EXP_SLTCTL_HPIE |
+                 PCI_EXP_SLTCTL_CCIE |
+                 PCI_EXP_SLTCTL_PDCE |
+                 PCI_EXP_SLTCTL_ABPE);
+
+    pci_bus_hotplug(pci_bridge_get_sec_bus(DO_UPCAST(PCIBridge, dev, dev)),
+                    pcie_cap_slot_hotplug, &dev->qdev);
+}
+
+void pcie_cap_slot_reset(PCIDevice *dev)
+{
+    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
+    uint32_t tmp;
+
+    PCIE_DEV_PRINTF(dev, "reset\n");
+
+    tmp = pci_get_word(pcie_cap + PCI_EXP_SLTCTL);
+    tmp &= ~(PCI_EXP_SLTCTL_EIC |
+             PCI_EXP_SLTCTL_PIC |
+             PCI_EXP_SLTCTL_AIC |
+             PCI_EXP_SLTCTL_HPIE |
+             PCI_EXP_SLTCTL_CCIE |
+             PCI_EXP_SLTCTL_PDCE |
+             PCI_EXP_SLTCTL_ABPE);
+    tmp |= PCI_EXP_SLTCTL_PIC_OFF | PCI_EXP_SLTCTL_AIC_OFF;
+    pci_set_word(pcie_cap + PCI_EXP_SLTCTL, tmp);
+
+    tmp = pci_get_word(pcie_cap + PCI_EXP_SLTSTA);
+    tmp &= ~(PCI_EXP_SLTSTA_EIS | /* by reset, the lock is released */
+             PCI_EXP_SLTSTA_CC |
+             PCI_EXP_SLTSTA_PDC |
+             PCI_EXP_SLTSTA_ABP);
+    pci_set_word(pcie_cap + PCI_EXP_SLTSTA, tmp);
+}
+
+void pcie_cap_slot_write_config(PCIDevice *dev,
+                                uint32_t addr, uint32_t val, int len,
+                                uint16_t sltctl_prev)
+{
+    uint32_t pos = pci_pcie_cap(dev);
+    uint8_t *pcie_cap = dev->config + pos;
+    uint16_t sltctl = pci_get_word(pcie_cap + PCI_EXP_SLTCTL);
+    uint16_t sltsta = pci_get_word(pcie_cap + PCI_EXP_SLTSTA);
+
+    PCIE_DEV_PRINTF(dev,
+                    "addr: 0x%x val: 0x%x len: %d\n"
+                    "\tsltctl_prev: 0x%02x sltctl: 0x%02x sltsta 0x%02x\n",
+                    addr, val, len, sltctl_prev, sltctl, sltsta);
+    /* SLTSTA: process SLTSTA before SLTCTL to avoid spurious interrupt */
+    if (ranges_overlap(addr, len, pos + PCI_EXP_SLTSTA, 2)) {
+        /* RW1C */
+        pcie_w1c_word(dev, pos + PCI_EXP_SLTSTA, PCI_EXP_HP_EV_SUPPORTED,
+                      addr, val);
+        sltsta = pci_get_word(pcie_cap + PCI_EXP_SLTSTA);
+
+        /* write to stlsta results in clearing bits,
+           so new interrupts won't be generated. */
+        PCIE_DEV_PRINTF(dev, "sltsta -> 0x%02x\n", sltsta);
+    }
+
+    /* SLTCTL */
+    if (ranges_overlap(addr, len, pos + PCI_EXP_SLTCTL, 2)) {
+        PCIE_DEV_PRINTF(dev, "sltctl: 0x%02x -> 0x%02x\n",
+                        sltctl_prev, sltctl);
+        if (pcie_written_val_word(addr, val, pos + PCI_EXP_SLTCTL) &
+            PCI_EXP_SLTCTL_EIC) {
+            /* toggle PCI_EXP_SLTSTA_EIS */
+            sltsta = (sltsta & ~PCI_EXP_SLTSTA_EIS) |
+                ((sltsta ^ PCI_EXP_SLTSTA_EIS) & PCI_EXP_SLTSTA_EIS);
+            pci_set_word(pcie_cap + PCI_EXP_SLTSTA, sltsta);
+            PCIE_DEV_PRINTF(dev, "PCI_EXP_SLTCTL_EIC: sltsta -> 0x%02x\n",
+                            sltsta);
+        }
+
+        if (sltctl & PCI_EXP_SLTCTL_HPIE) {
+            bool trigger = false;
+            int level = 0;
+
+            if (((sltctl_prev ^ sltctl) & sltctl) & PCI_EXP_HP_EV_SUPPORTED) {
+                /* 0 -> 1 */
+                trigger = true;
+            }
+            if ((sltctl & sltsta) & PCI_EXP_HP_EV_SUPPORTED) {
+                level = 1;
+            }
+            pcie_cap_notify(dev, trigger, level);
+        }
+
+        /* command completed.
+           unlike real hardware, command completes instantaneously */
+#define PCI_EXP_SLTCTL_SUPPORTED        \
+            (PCI_EXP_SLTCTL_ABPE |      \
+             PCI_EXP_SLTCTL_PDCE |      \
+             PCI_EXP_SLTCTL_CCIE |      \
+             PCI_EXP_SLTCTL_HPIE |      \
+             PCI_EXP_SLTCTL_AIC |       \
+             PCI_EXP_SLTCTL_PCC |       \
+             PCI_EXP_SLTCTL_EIC)
+        if ( 1 /* (sltctl_prev ^ sltctl) & PCI_EXP_SLTCTL_SUPPORTED */ ) {
+            /* set command completed bit */
+            pcie_cap_slot_event(dev, PCI_EXP_HP_EV_CCI, 0);
+        }
+    }
+}
+
+void pcie_cap_slot_push_attention_button(PCIDevice *dev)
+{
+    pcie_cap_slot_event(dev, PCI_EXP_HP_EV_ABP, 0);
+}
+
+/* root control/capabilities/status. PME isn't emulated for now */
+void pcie_cap_root_init(PCIDevice *dev)
+{
+    uint8_t pos = pci_pcie_cap(dev);
+    pci_set_word(dev->wmask + pos + PCI_EXP_RTCTL,
+                 PCI_EXP_RTCTL_SECEE | PCI_EXP_RTCTL_SENFEE |
+                 PCI_EXP_RTCTL_SEFEE);
+}
+
+void pcie_cap_root_reset(PCIDevice *dev)
+{
+    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
+    pci_set_word(pcie_cap + PCI_EXP_RTCTL, 0);
+}
+
+/*
+ * function level reset(FLR)
+ *
+ * Advertise FLR support in the device capabilities register and remember
+ * the callback that pcie_cap_flr_write_config() invokes.
+ */
+void pcie_cap_flr_init(PCIDevice *dev, pcie_flr_fn flr)
+{
+    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
+    /* PCI_EXP_DEVCAP is a 32-bit register (accessed as a long elsewhere in
+       this file); word accessors would drop a capability bit above bit 15 */
+    pci_set_long(pcie_cap + PCI_EXP_DEVCAP,
+                 pci_get_long(pcie_cap + PCI_EXP_DEVCAP) | PCI_EXP_DEVCAP_FLR);
+    dev->exp->flr = flr;
+}
+
+void pcie_cap_flr_write_config(PCIDevice *dev,
+                               uint32_t addr, uint32_t val, int len)
+{
+    uint32_t pos = pci_pcie_cap(dev);
+    if (ranges_overlap(addr, len, pos + PCI_EXP_DEVCTL, 2)) {
+        uint16_t val16 = pcie_written_val_word(addr, val,
+                                               pos + PCI_EXP_DEVCTL);
+        if ((val16 & PCI_EXP_DEVCTL_BCR_FLR) && dev->exp->flr) {
+            dev->exp->flr(dev);
+        }
+    }
+}
+
+
+/* Alternative Routing-ID Interpretation (ARI) */
+/* ari forwarding support for down stream port */
+void pcie_cap_ari_init(PCIDevice *dev)
+{
+    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
+    uint8_t *pcie_wmask = dev->wmask + pci_pcie_cap(dev);
+
+    pci_set_long(pcie_cap + PCI_EXP_DEVCAP2,
+                 pci_get_long(pcie_cap + PCI_EXP_DEVCAP2) |
+                 PCI_EXP_DEVCAP2_ARI);
+
+    pci_set_long(pcie_wmask + PCI_EXP_DEVCTL2,
+                 pci_get_long(pcie_wmask + PCI_EXP_DEVCTL2) |
+                 PCI_EXP_DEVCTL2_ARI);
+}
+
+void pcie_cap_ari_reset(PCIDevice *dev)
+{
+    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
+
+    pci_set_long(pcie_cap + PCI_EXP_DEVCTL2,
+                 pci_get_long(pcie_cap + PCI_EXP_DEVCTL2) &
+                 ~PCI_EXP_DEVCTL2_ARI);
+}
+
+bool pcie_cap_is_ari_enabled(const PCIDevice *dev)
+{
+    if (!pci_is_express(dev)) {
+        return false;
+    }
+    if (!pci_pcie_cap(dev)) {
+        return false;
+    }
+
+    return pci_get_long(dev->config + pci_pcie_cap(dev) + PCI_EXP_DEVCTL2) &
+        PCI_EXP_DEVCTL2_ARI;
+}
+
+/**************************************************************************
+ * pci express extended capability allocation functions
+ * uint16_t ext_cap_id (16 bit)
+ * uint8_t cap_ver (4 bit)
+ * uint16_t cap_offset (12 bit)
+ * uint16_t ext_cap_size
+ */
+
+#define PCI_EXT_CAP_VER_SHIFT   16
+#define PCI_EXT_CAP_NEXT_MASK   0xfff00000
+#define PCI_EXT_CAP_NEXT_SHIFT  20
+
+#define PCI_EXT_CAP(id, ver, next) ((id) | ((ver) << PCI_EXT_CAP_VER_SHIFT) | ((next) << PCI_EXT_CAP_NEXT_SHIFT))
+
+#define PCI_EXT_CAP_ALIGN       4
+#define PCI_EXT_CAP_ALIGNUP(x)  (((x) + PCI_EXT_CAP_ALIGN - 1) & ~(PCI_EXT_CAP_ALIGN - 1))
+
+static int16_t pcie_ext_cap_find_space(PCIDevice *dev, uint16_t size)
+{
+    uint16_t offset = PCI_CONFIG_SPACE_SIZE;
+    uint16_t i = offset;
+
+    while (i < PCIE_CONFIG_SPACE_SIZE - size) {
+        if (dev->used[i]) {
+            offset = PCI_EXT_CAP_ALIGNUP(i + 1);
+            i = offset;
+            continue;
+        } else if (i - offset + 1 == size) {
+            return offset;
+        }
+
+        ++i;
+    }
+
+    return 0;
+}
+
+/*
+ * Walk the extended capability list looking for @cap_id.
+ *
+ * Returns the config space offset of the matching capability, or 0 when it
+ * is not present.  On success, and if @prev_p is not NULL, *prev_p receives
+ * the offset of the preceding capability (0 when the match is the first
+ * entry at PCI_CONFIG_SPACE_SIZE).
+ */
+static uint16_t pcie_find_ext_capability_list(PCIDevice *dev, uint16_t cap_id,
+                                              uint16_t *prev_p)
+{
+    int ttl;
+
+    uint16_t prev = 0;
+    uint16_t next = PCI_CONFIG_SPACE_SIZE;
+    uint32_t header = pci_get_long(dev->config + next);
+
+    if (!header) {
+        return 0;
+    }
+
+    /* minimum 8 bytes per capability; bounds the walk on a looping list */
+    ttl = (PCIE_CONFIG_SPACE_SIZE - PCI_CONFIG_SPACE_SIZE) / 8;
+
+    while (ttl-- > 0) {
+        /* read the header of the entry being examined, not the previous */
+        header = pci_get_long(dev->config + next);
+        if (PCI_EXT_CAP_ID(header) == cap_id) {
+            if (prev_p) {
+                *prev_p = prev;
+            }
+            return next;
+        }
+
+        prev = next;
+        next = PCI_EXT_CAP_NEXT(header);
+        if (next < PCI_CONFIG_SPACE_SIZE) {
+            /* end of list */
+            return 0;
+        }
+    }
+
+    /* ttl exhausted without a match */
+    return 0;
+}
+
+uint16_t pcie_find_ext_capability(PCIDevice *dev, uint16_t cap_id)
+{
+    return pcie_find_ext_capability_list(dev, cap_id, NULL);
+}
+
+/*
+ * Rewrite the next-capability field of the extended capability header at
+ * @pos so that it points to @next.  @next must be PCI_EXT_CAP_ALIGN aligned.
+ */
+static void pcie_ext_cap_set_next(PCIDevice *dev, uint16_t pos, uint16_t next)
+{
+    /* the header is 32 bits wide and the next field sits in its top bits,
+       so neither the header nor the shifted offset may be held in 16 bits */
+    uint32_t header = pci_get_long(dev->config + pos);
+    assert(!(next & (PCI_EXT_CAP_ALIGN - 1)));
+    header = (header & ~PCI_EXT_CAP_NEXT_MASK) |
+        (((uint32_t)next << PCI_EXT_CAP_NEXT_SHIFT) & PCI_EXT_CAP_NEXT_MASK);
+    pci_set_long(dev->config + pos, header);
+}
+
+static void pcie_allocate_ext_capability(PCIDevice *dev,
+                                         uint16_t cap_id, uint8_t cap_ver,
+                                         uint16_t offset, uint16_t size)
+{
+    uint32_t header;
+    uint16_t next;
+
+    assert(offset < offset + size);
+    assert(offset + size < PCIE_CONFIG_SPACE_SIZE);
+    assert(size >= 8);
+
+    if (offset == PCI_CONFIG_SPACE_SIZE) {
+        header = pci_get_long(dev->config + offset);
+        next = PCI_EXT_CAP_NEXT(header);
+    } else {
+        /* find last ext cap */
+        int ttl = (PCIE_CONFIG_SPACE_SIZE - PCI_CONFIG_SPACE_SIZE) / 8;
+        uint16_t pos = PCI_CONFIG_SPACE_SIZE;
+        while (ttl-- > 0) {
+            header = pci_get_long(dev->config + pos);
+            if (PCI_EXT_CAP_NEXT(header) < PCI_CONFIG_SPACE_SIZE) {
+                break;
+            }
+
+            pos = PCI_EXT_CAP_NEXT(header);
+        }
+
+        assert(ttl > 0); /* since it is known that [offset, offset + size]
+                            is unused, so ttl shouldn't be zero */
+        pcie_ext_cap_set_next(dev, pos, offset);
+        next = 0;
+    }
+    pci_set_long(dev->config + offset, PCI_EXT_CAP(cap_id, cap_ver, next));
+
+    memset(dev->used + offset, 0xFF, size);
+    /* Make capability read-only by default */
+    memset(dev->wmask + offset, 0, size);
+    /* Check capability by default */
+    memset(dev->cmask + offset, 0xFF, size);
+}
+
+int pcie_add_ext_capability(PCIDevice *dev,
+                            uint16_t cap_id, uint8_t cap_ver, uint16_t size)
+{
+    uint16_t offset = pcie_ext_cap_find_space(dev, size);
+
+    if (!offset) {
+        return -ENOSPC;
+    }
+
+    pcie_allocate_ext_capability(dev, cap_id, cap_ver, offset, size);
+    return offset;
+}
+
+int pcie_append_ext_capability(PCIDevice *dev,
+                               uint16_t cap_id, uint8_t cap_ver,
+                               uint16_t offset, uint16_t size)
+{
+    uint16_t i;
+
+    if (!offset) {
+        return pcie_add_ext_capability(dev, cap_id, cap_ver, size);
+    }
+
+    assert(offset < offset + size);
+    assert(offset + size < PCIE_CONFIG_SPACE_SIZE);
+    assert(size >= 8);
+
+    for (i = offset; i < offset + size; ++i) {
+        if (dev->used[i]) {
+            return -EBUSY;
+        }
+    }
+
+    pcie_allocate_ext_capability(dev, cap_id, cap_ver, offset, size);
+    return offset;
+}
+
+void pcie_del_ext_capability(PCIDevice *dev, uint16_t cap_id, uint16_t size)
+{
+    uint16_t prev;
+    uint16_t offset = pcie_find_ext_capability_list(dev, cap_id, &prev);
+    uint32_t header;
+
+    if (!offset) {
+        return;
+    }
+
+    header = pci_get_long(dev->config + offset);
+    if (prev) {
+        pcie_ext_cap_set_next(dev, prev, PCI_EXT_CAP_NEXT(header));
+    } else {
+        /* move up next ext cap to PCI_CONFIG_SPACE_SIZE? */
+        assert(offset == PCI_CONFIG_SPACE_SIZE);
+        pci_set_long(dev->config + offset,
+                     PCI_EXT_CAP(0, 0, PCI_EXT_CAP_NEXT(header)));
+    }
+
+    /* Make capability writeable again */
+    memset(dev->wmask + offset, 0xff, size);
+    /* Clear cmask as device-specific registers can't be checked */
+    memset(dev->cmask + offset, 0, size);
+    memset(dev->used + offset, 0, size);
+}
+
+void pcie_reserve_ext_capability(PCIDevice *dev,
+                                 uint16_t offset, uint16_t size)
+{
+    memset(dev->used + offset, 0xff, size);
+}
+
+/**************************************************************************
+ * pci express extended capability helper functions
+ */
+
+/* ARI */
+#define PCI_ARI_VER     1
+#define PCI_ARI_SIZEOF  8
+
+int pcie_ari_init(PCIDevice *dev, uint16_t offset, uint16_t nextfn)
+{
+    int pos;
+    pos = pcie_append_ext_capability(dev, PCI_EXT_CAP_ID_ARI, PCI_ARI_VER,
+                                     offset, PCI_ARI_SIZEOF);
+    if (pos < 0) {
+        return pos;
+    }
+
+    pci_set_long(dev->config + pos + PCI_ARI_CAP, PCI_ARI_CAP_NFN(nextfn));
+    return pos;
+}
+
+/* AER */
+#define PCI_ERR_VER                     2
+#define PCI_ERR_SIZEOF                  0x48
+
+#define PCI_ERR_UNC_SDN                 0x00000020      /* surprise down */
+#define PCI_ERR_UNC_ACSV                0x00200000      /* ACS Violation */
+#define PCI_ERR_UNC_INTN                0x00400000      /* Internal Error */
+#define PCI_ERR_UNC_MCBTLP              0x00800000      /* MC Blocked TLP */
+#define PCI_ERR_UNC_ATOP_EBLOCKED       0x01000000      /* atomic op egress blocked */
+#define PCI_ERR_UNC_TLP_PRF_BLOCKED     0x02000000      /* TLP Prefix Blocked */
+#define PCI_ERR_UNC_SUPPORTED           (PCI_ERR_UNC_DLP |              \
+                                         PCI_ERR_UNC_SDN |              \
+                                         PCI_ERR_UNC_POISON_TLP |       \
+                                         PCI_ERR_UNC_FCP |              \
+                                         PCI_ERR_UNC_COMP_TIME |        \
+                                         PCI_ERR_UNC_COMP_ABORT |       \
+                                         PCI_ERR_UNC_UNX_COMP |         \
+                                         PCI_ERR_UNC_RX_OVER |          \
+                                         PCI_ERR_UNC_MALF_TLP |         \
+                                         PCI_ERR_UNC_ECRC |             \
+                                         PCI_ERR_UNC_UNSUP |            \
+                                         PCI_ERR_UNC_ACSV |             \
+                                         PCI_ERR_UNC_INTN |             \
+                                         PCI_ERR_UNC_MCBTLP |           \
+                                         PCI_ERR_UNC_ATOP_EBLOCKED |    \
+                                         PCI_ERR_UNC_TLP_PRF_BLOCKED)
+
+#define PCI_ERR_UNC_SEVERITY_DEFAULT    (PCI_ERR_UNC_DLP |              \
+                                         PCI_ERR_UNC_SDN |              \
+                                         PCI_ERR_UNC_FCP |              \
+                                         PCI_ERR_UNC_RX_OVER |          \
+                                         PCI_ERR_UNC_MALF_TLP |         \
+                                         PCI_ERR_UNC_INTN)
+
+#define PCI_ERR_COR_ADV_NONFATAL        0x00002000      /* Advisory Non-Fatal */
+#define PCI_ERR_COR_INTERNAL            0x00004000      /* Corrected Internal */
+#define PCI_ERR_COR_HL_OVERFLOW         0x00008000      /* Header Log Overflow */
+#define PCI_ERR_COR_SUPPORTED           (PCI_ERR_COR_RCVR |             \
+                                         PCI_ERR_COR_BAD_TLP |          \
+                                         PCI_ERR_COR_BAD_DLLP |         \
+                                         PCI_ERR_COR_REP_ROLL |         \
+                                         PCI_ERR_COR_REP_TIMER |        \
+                                         PCI_ERR_COR_ADV_NONFATAL |     \
+                                         PCI_ERR_COR_INTERNAL |         \
+                                         PCI_ERR_COR_HL_OVERFLOW)
+#define PCI_ERR_COR_MASK_DEFAULT        (PCI_ERR_COR_ADV_NONFATAL |     \
+                                         PCI_ERR_COR_INTERNAL |         \
+                                         PCI_ERR_COR_HL_OVERFLOW)
+
+
+#define PCI_ERR_CAP_FEP_MASK            0x0000001f
+#define PCI_ERR_CAP_MHRC                0x00000200
+#define PCI_ERR_CAP_MHRE                0x00000400
+#define PCI_ERR_CAP_TLP                 0x00000800
+
+#define PCI_ERR_TLP_PREFIX_LOG          0x38
+
+/* From 6.2.7 Error Listing and Rules. Table 6-2, 6-3 and 6-4 */
+static enum PCIE_AER_SEVERITY pcie_aer_uncor_default_severity(uint32_t status)
+{
+    switch (status) {
+    case PCI_ERR_UNC_INTN:
+    case PCI_ERR_UNC_DLP:
+    case PCI_ERR_UNC_SDN:
+    case PCI_ERR_UNC_RX_OVER:
+    case PCI_ERR_UNC_FCP:
+    case PCI_ERR_UNC_MALF_TLP:
+        return AER_ERR_FATAL;
+    case PCI_ERR_UNC_POISON_TLP:
+    case PCI_ERR_UNC_ECRC:
+    case PCI_ERR_UNC_UNSUP:
+    case PCI_ERR_UNC_COMP_TIME:
+    case PCI_ERR_UNC_COMP_ABORT:
+    case PCI_ERR_UNC_UNX_COMP:
+    case PCI_ERR_UNC_ACSV:
+    case PCI_ERR_UNC_MCBTLP:
+    case PCI_ERR_UNC_ATOP_EBLOCKED:
+    case PCI_ERR_UNC_TLP_PRF_BLOCKED:
+        return AER_ERR_NONFATAL;
+    default:
+        break;
+    }
+    abort();
+    return AER_ERR_FATAL;
+}
+
+/*
+ * Return the ring-buffer index that follows i, wrapping around at max.
+ *
+ * max is the number of slots in the ring.  A ring with no slots (max == 0,
+ * which pcie_aer_init() permits) has no valid successor; 0 is returned
+ * instead of evaluating "% 0", which is undefined behavior.
+ */
+static uint32_t pcie_aer_log_next(uint32_t i, uint32_t max)
+{
+    if (max == 0) {
+        return 0;
+    }
+    return (i + 1) % max;
+}
+
+/* A producer/consumer index pair denotes an empty ring when both coincide. */
+static bool pcie_aer_log_empty_index(uint32_t producer, uint32_t consumer)
+{
+    bool is_empty = (consumer == producer);
+
+    return is_empty;
+}
+
+static bool pcie_aer_log_empty(struct pcie_aer_log *aer_log)
+{
+    return pcie_aer_log_empty_index(aer_log->producer, aer_log->consumer);
+}
+
+/*
+ * True when advancing the producer would make it collide with the consumer,
+ * i.e. no further error can be queued.
+ */
+static bool pcie_aer_log_full(struct pcie_aer_log *aer_log)
+{
+    uint32_t after_producer =
+        pcie_aer_log_next(aer_log->producer, aer_log->log_max);
+
+    return after_producer == aer_log->consumer;
+}
+
+/*
+ * Claim the slot at the producer index and advance the producer.
+ * Returns the claimed slot index.  Does not check for a full ring.
+ */
+static uint32_t pcie_aer_log_add(struct pcie_aer_log *aer_log)
+{
+    const uint32_t slot = aer_log->producer;
+
+    aer_log->producer = pcie_aer_log_next(slot, aer_log->log_max);
+    return slot;
+}
+
+/*
+ * Release the slot at the consumer index and advance the consumer.
+ * Returns the released slot index.  Does not check for an empty ring.
+ */
+static uint32_t pcie_aer_log_del(struct pcie_aer_log *aer_log)
+{
+    const uint32_t slot = aer_log->consumer;
+
+    aer_log->consumer = pcie_aer_log_next(slot, aer_log->log_max);
+    return slot;
+}
+
+/*
+ * Queue a copy of err in aer_log.
+ * Returns 0 on success, -1 when the log is full (nothing is stored).
+ */
+static int pcie_aer_log_add_err(struct pcie_aer_log *aer_log,
+                                const struct pcie_aer_err *err)
+{
+    uint32_t slot;
+
+    if (!pcie_aer_log_full(aer_log)) {
+        slot = pcie_aer_log_add(aer_log);
+        memcpy(&aer_log->log[slot], err, sizeof(*err));
+        return 0;
+    }
+    return -1;
+}
+
+/*
+ * Dequeue the oldest queued error.  The log must not be empty.
+ * The returned pointer refers to the slot inside the log buffer itself.
+ */
+static const struct pcie_aer_err*
+pcie_aer_log_del_err(struct pcie_aer_log *aer_log)
+{
+    uint32_t slot;
+
+    assert(!pcie_aer_log_empty(aer_log));
+    slot = pcie_aer_log_del(aer_log);
+    return aer_log->log + slot;
+}
+
+static void pcie_aer_log_clear_all_err(struct pcie_aer_log *aer_log)
+{
+    aer_log->producer = 0;
+    aer_log->consumer = 0;
+}
+
+/*
+ * Add the AER extended capability to dev at config offset "offset" and set
+ * up its initial register values, write masks, error log buffer and
+ * error-message handler.
+ *
+ * Returns the value of pcie_append_ext_capability(): the capability
+ * position, or a negative value on failure (in which case only the
+ * PCI_COMMAND write mask has been touched).
+ */
+int pcie_aer_init(PCIDevice *dev, uint16_t offset)
+{
+    PCIExpressDevice *exp;
+    uint32_t errcap = PCI_ERR_CAP_ECRC_GENC | PCI_ERR_CAP_ECRC_CHKC;
+    uint32_t errcap_wmask = PCI_ERR_CAP_ECRC_GENE | PCI_ERR_CAP_ECRC_CHKE;
+    uint16_t command_wmask;
+    int pos;
+
+    /* make the SERR enable bit of the command register writable */
+    command_wmask = pci_get_word(dev->wmask + PCI_COMMAND);
+    pci_set_word(dev->wmask + PCI_COMMAND, command_wmask | PCI_COMMAND_SERR);
+
+    pos = pcie_append_ext_capability(dev, PCI_EXT_CAP_ID_ERR, PCI_ERR_VER,
+                                     offset, PCI_ERR_SIZEOF);
+    if (pos < 0) {
+        return pos;
+    }
+    exp = dev->exp;
+    exp->aer_cap = pos;
+
+    /* log size: default when unset, otherwise clamped to the maximum */
+    if (dev->aer_log.log_max == PCIE_AER_LOG_MAX_UNSET) {
+        dev->aer_log.log_max = PCIE_AER_LOG_MAX_DEFAULT;
+    } else if (dev->aer_log.log_max > PCIE_AER_LOG_MAX_MAX) {
+        dev->aer_log.log_max = PCIE_AER_LOG_MAX_MAX;
+    }
+    dev->aer_log.log =
+        qemu_mallocz(dev->aer_log.log_max * sizeof(dev->aer_log.log[0]));
+
+    /* uncorrectable error mask and severity */
+    pci_set_long(dev->wmask + pos + PCI_ERR_UNCOR_MASK,
+                 PCI_ERR_UNC_SUPPORTED);
+
+    pci_set_long(dev->config + pos + PCI_ERR_UNCOR_SEVER,
+                 PCI_ERR_UNC_SEVERITY_DEFAULT);
+    pci_set_long(dev->wmask + pos + PCI_ERR_UNCOR_SEVER,
+                 PCI_ERR_UNC_SUPPORTED);
+
+    /* correctable error mask */
+    pci_set_long(dev->config + pos + PCI_ERR_COR_MASK,
+                 PCI_ERR_COR_MASK_DEFAULT);
+    pci_set_long(dev->wmask + pos + PCI_ERR_COR_MASK,
+                 PCI_ERR_COR_SUPPORTED);
+
+    /*
+     * capabilities and control. multiple header logging is advertised
+     * (and its enable bit made writable) only when there is a log
+     */
+    if (dev->aer_log.log_max > 0) {
+        errcap |= PCI_ERR_CAP_MHRC;
+        errcap_wmask |= PCI_ERR_CAP_MHRE;
+    }
+    pci_set_long(dev->config + pos + PCI_ERR_CAP, errcap);
+    pci_set_long(dev->wmask + pos + PCI_ERR_CAP, errcap_wmask);
+
+    switch (pcie_cap_get_type(dev)) {
+    case PCI_EXP_TYPE_ROOT_PORT:
+        /* the handler is replaced later by pcie_aer_root_init() */
+        /* fallthrough */
+    case PCI_EXP_TYPE_DOWNSTREAM:
+    case PCI_EXP_TYPE_UPSTREAM:
+        /* ports: make SERR enable in bridge control writable as well */
+        pci_set_word(dev->wmask + PCI_BRIDGE_CONTROL,
+                     pci_get_word(dev->wmask + PCI_BRIDGE_CONTROL) |
+                     PCI_BRIDGE_CTL_SERR);
+        exp->aer_errmsg = pcie_aer_errmsg_vbridge;
+        break;
+    default:
+        exp->aer_errmsg = pcie_aer_errmsg_alldev;
+        break;
+    }
+    return pos;
+}
+
+/*
+ * Undo pcie_aer_init(): remove the AER extended capability from dev and
+ * release the error log buffer.
+ */
+void pcie_aer_exit(PCIDevice *dev)
+{
+    /*
+     * AER was added with pcie_append_ext_capability(), so it has to be
+     * removed with the matching extended-capability helper.
+     * PCI_EXT_CAP_ID_ERR is an extended capability ID and must not be handed
+     * to pci_del_capability(), which takes legacy capability IDs.
+     */
+    pcie_del_ext_capability(dev, PCI_EXT_CAP_ID_ERR, PCI_ERR_SIZEOF);
+    qemu_free(dev->aer_log.log);
+    /* do not leave a dangling pointer behind */
+    dev->aer_log.log = NULL;
+}
+
+/*
+ * Config-space write hook for the AER related registers of dev.
+ *
+ * addr/val/len describe the guest write.  Handles the write-1-to-clear
+ * semantics of Signaled System Error in PCI_STATUS and of the uncorrectable
+ * and correctable error status registers, and drops all queued errors when
+ * multiple header recording is found disabled after a write to the
+ * capabilities and control register.
+ *
+ * Original author's note: Multiple Header recording isn't implemented.
+ * Is it wanted?
+ */
+void pcie_aer_write_config(PCIDevice *dev,
+                           uint32_t addr, uint32_t val, int len)
+{
+    uint32_t pos = dev->exp->aer_cap;
+
+    /* PCI_STATUS_SIG_SYSTEM_ERROR */
+    if (ranges_overlap(addr, len, PCI_STATUS, 2)) {
+        pcie_w1c_word(dev, PCI_STATUS, PCI_STATUS_SIG_SYSTEM_ERROR, addr, val);
+    }
+
+    /* uncorrectable */
+    if (ranges_overlap(addr, len, pos + PCI_ERR_UNCOR_STATUS, 4)) {
+        uint32_t written =
+            pcie_written_val_long(addr, val, pos + PCI_ERR_UNCOR_STATUS) &
+            PCI_ERR_UNC_SUPPORTED;
+        uint32_t uncorsta =
+            pci_get_long(dev->config + pos + PCI_ERR_UNCOR_STATUS);
+        uint32_t errcap = pci_get_long(dev->config + pos + PCI_ERR_CAP);
+        /*
+         * Unsigned shift: the first error pointer is a 5 bit field, and
+         * shifting a signed 1 into bit 31 would be undefined behavior.
+         */
+        uint32_t first_error = (1U << PCI_ERR_CAP_FEP(errcap));
+
+        /* the guest acknowledges the error the header log belongs to */
+        if ((uncorsta & first_error) && (written & first_error)) {
+            pcie_aer_clear_error(dev);
+        }
+        /*
+         * Without multiple header recording every written bit is cleared.
+         * With it enabled, only pcie_aer_clear_error() above clears bits.
+         */
+        if (!(errcap & PCI_ERR_CAP_MHRE)) {
+            /* RW1CS */
+            pcie_w1c_long(dev, pos + PCI_ERR_UNCOR_STATUS,
+                          PCI_ERR_UNC_SUPPORTED, addr, val);
+        }
+    }
+
+    /* correctable */
+    if (ranges_overlap(addr, len, pos + PCI_ERR_COR_STATUS, 4)) {
+        /* RW1CS */
+        pcie_w1c_long(dev, pos + PCI_ERR_COR_STATUS, PCI_ERR_COR_SUPPORTED,
+                      addr, val);
+    }
+
+    /* capability & control */
+    if (ranges_overlap(addr, len, pos + PCI_ERR_CAP, 4)) {
+        uint32_t err_cap = pci_get_long(dev->config + pos + PCI_ERR_CAP);
+        if (!(err_cap & PCI_ERR_CAP_MHRE)) {
+            /* queued errors are meaningless once recording is disabled */
+            pcie_aer_log_clear_all_err(&dev->aer_log);
+        }
+    }
+}
+
+#define PCI_SEC_STATUS_RCV_SYSTEM_ERROR         0x4000
+
+void pcie_aer_write_config_vbridge(PCIDevice *dev,
+                                   uint32_t addr, uint32_t val, int len)
+{
+    /* PCI_SEC_STATUS_RCV_SYSTEM_ERROR */
+    if (ranges_overlap(addr, len, PCI_STATUS, 2)) {
+        pcie_w1c_word(dev, PCI_SEC_STATUS, PCI_SEC_STATUS_RCV_SYSTEM_ERROR,
+                      addr, val);
+    }
+}
+
+/*
+ * Hand the error message msg to the error-message handler installed on dev.
+ * dev must be a PCI express device with a handler set.
+ */
+static inline void pcie_aer_errmsg(PCIDevice *dev,
+                                   const struct pcie_aer_err_msg *msg)
+{
+    pcie_aer_errmsg_fn handler;
+
+    assert(dev->exp);
+    assert(dev->exp->aer_errmsg);
+    handler = dev->exp->aer_errmsg;
+    handler(dev, msg);
+}
+
+/*
+ * Error message handling common to all devices.
+ *
+ * An uncorrectable message with SERR enabled in the command register sets
+ * Signaled System Error.  The message is forwarded to the parent port when
+ * either that condition holds or the Device Control register enables the
+ * message's severity; otherwise it is masked.
+ *
+ * Returns AER_ERR_MSG_MASKED when the message is dropped here (including a
+ * non-express parent), AER_ERR_MSG_SENT otherwise.
+ */
+static AER_ERR_MSG_RESULT
+pcie_aer_errmsg_alldev(PCIDevice *dev, const struct pcie_aer_err_msg *msg)
+{
+    uint8_t *exp_cap = dev->config + pci_pcie_cap(dev);
+    uint16_t command = pci_get_word(dev->config + PCI_COMMAND);
+    uint32_t devctl = pci_get_word(exp_cap + PCI_EXP_DEVCTL);
+    bool serr_enabled =
+        pcie_aer_err_msg_is_uncor(msg) && (command & PCI_COMMAND_SERR);
+    /* severity values coincide with the reporting enable bits of devctl */
+    bool devctl_enabled = msg->severity & devctl;
+    PCIDevice *parent_port = NULL;
+
+    if (serr_enabled) {
+        /* Signaled System Error */
+        uint8_t *status = dev->config + PCI_STATUS;
+        pci_set_word(status,
+                     pci_get_word(status) | PCI_STATUS_SIG_SYSTEM_ERROR);
+    }
+
+    if (!serr_enabled && !devctl_enabled) {
+        return AER_ERR_MSG_MASKED;
+    }
+
+    /* send up error message */
+    /*
+     * A root port notifies the system itself, or would send the error
+     * message to a root complex event collector.  Root complex event
+     * collectors aren't supported for now, so a root port has no parent
+     * to forward to.
+     */
+    if (!(pci_is_express(dev) &&
+          pcie_cap_get_type(dev) == PCI_EXP_TYPE_ROOT_PORT)) {
+        parent_port = pci_bridge_get_device(dev->bus);
+    }
+    if (!parent_port) {
+        return AER_ERR_MSG_SENT;
+    }
+    if (!pci_is_express(parent_port)) {
+        /* What to do? */
+        return AER_ERR_MSG_MASKED;
+    }
+    pcie_aer_errmsg(parent_port, msg);
+    return AER_ERR_MSG_SENT;
+}
+
+/*
+ * Error message handling for virtual bridges (ports).
+ *
+ * An uncorrectable message sets Received System Error in the secondary
+ * status register.  The message is masked unless SERR is enabled in the
+ * bridge control register; otherwise the common handling takes over.
+ */
+static AER_ERR_MSG_RESULT
+pcie_aer_errmsg_vbridge(PCIDevice *dev, const struct pcie_aer_err_msg *msg)
+{
+    uint16_t bridge_control;
+
+    if (pcie_aer_err_msg_is_uncor(msg)) {
+        /* Received System Error */
+        uint8_t *sec_status = dev->config + PCI_SEC_STATUS;
+        uint16_t sta = pci_get_word(sec_status);
+
+        pci_set_word(sec_status, sta | PCI_SEC_STATUS_RCV_SYSTEM_ERROR);
+    }
+
+    bridge_control = pci_get_word(dev->config + PCI_BRIDGE_CONTROL);
+    if (bridge_control & PCI_BRIDGE_CTL_SERR) {
+        return pcie_aer_errmsg_alldev(dev, msg);
+    }
+    return AER_ERR_MSG_MASKED;
+}
+
+/*
+ * Error message handling for root ports.
+ *
+ * After the virtual bridge handling let the message through, the message is
+ * recorded in the Root Error Status register (and the error source
+ * registers for the first message of a kind) and, if the Root Error Command
+ * register enables its severity, the root port is notified.
+ *
+ * Returns the bridge handler's result when that is not AER_ERR_MSG_SENT;
+ * otherwise AER_ERR_MSG_SENT if a notification was issued and
+ * AER_ERR_MSG_MASKED if not.
+ */
+static AER_ERR_MSG_RESULT
+pcie_aer_errmsg_root_port(PCIDevice *dev, const struct pcie_aer_err_msg *msg)
+{
+    AER_ERR_MSG_RESULT ret;
+    uint16_t cmd;
+    uint8_t *aer_cap;
+    uint32_t root_cmd;
+    uint32_t root_sta;
+    bool trigger;
+
+    ret = pcie_aer_errmsg_vbridge(dev, msg);
+    if (ret != AER_ERR_MSG_SENT) {
+        return ret;
+    }
+
+    ret = AER_ERR_MSG_MASKED;
+    cmd = pci_get_word(dev->config + PCI_COMMAND);
+    aer_cap = dev->config + pcie_aer_cap(dev);
+    root_cmd = pci_get_long(aer_cap + PCI_ERR_ROOT_COMMAND);
+    root_sta = pci_get_long(aer_cap + PCI_ERR_ROOT_STATUS);
+    trigger = false;
+
+    /* placeholder: system error signaling is not implemented here */
+    if (cmd & PCI_COMMAND_SERR) {
+        /* System Error. Platform Specific */
+        /* ret = AER_ERR_MSG_SENT; */
+    }
+
+    /* Error Message Received: Root Error Status register */
+    /*
+     * trigger is set only for the first message of a severity (its
+     * "received" bit was still clear) whose reporting is enabled.
+     * NOTE(review): there is no default case; a severity other than the
+     * three below leaves root_sta untouched.
+     */
+    switch (msg->severity) {
+    case AER_ERR_COR:
+        if (root_sta & PCI_ERR_ROOT_COR_RCV) {
+            /* a correctable error is already pending: flag multiple */
+            root_sta |= PCI_ERR_ROOT_MULTI_COR_RCV;
+        } else {
+            if (root_cmd & PCI_ERR_ROOT_CMD_COR_EN) {
+                trigger = true;
+            }
+            /* the source id is recorded for the first message only */
+            pci_set_word(aer_cap + PCI_ERR_ROOT_COR_SRC, msg->source_id);
+        }
+        root_sta |= PCI_ERR_ROOT_COR_RCV;
+        break;
+    case AER_ERR_NONFATAL:
+        if (!(root_sta & PCI_ERR_ROOT_NONFATAL_RCV) &&
+            root_cmd & PCI_ERR_ROOT_CMD_NONFATAL_EN) {
+            trigger = true;
+        }
+        root_sta |= PCI_ERR_ROOT_NONFATAL_RCV;
+        break;
+    case AER_ERR_FATAL:
+        if (!(root_sta & PCI_ERR_ROOT_FATAL_RCV) &&
+            root_cmd & PCI_ERR_ROOT_CMD_FATAL_EN) {
+            trigger = true;
+        }
+        /* fatal error arriving before any other uncorrectable error */
+        if (!(root_sta & PCI_ERR_ROOT_UNCOR_RCV)) {
+            root_sta |= PCI_ERR_ROOT_FIRST_FATAL;
+        }
+        root_sta |= PCI_ERR_ROOT_FATAL_RCV;
+        break;
+    }
+    /* bookkeeping shared by fatal and non-fatal messages */
+    if (pcie_aer_err_msg_is_uncor(msg)) {
+        if (root_sta & PCI_ERR_ROOT_UNCOR_RCV) {
+            root_sta |= PCI_ERR_ROOT_MULTI_UNCOR_RCV;
+        } else {
+            pci_set_word(aer_cap + PCI_ERR_ROOT_SRC, msg->source_id);
+        }
+        root_sta |= PCI_ERR_ROOT_UNCOR_RCV;
+    }
+    pci_set_long(aer_cap + PCI_ERR_ROOT_STATUS, root_sta);
+
+    /* severity values coincide with the enable bits of the root command */
+    if (root_cmd & msg->severity) {
+        /* Error Interrupt(INTx or MSI) */
+        pcie_aer_root_notify(dev, trigger, 1);
+        ret = AER_ERR_MSG_SENT;
+    }
+    return ret;
+}
+
+/*
+ * Make err the error described by the AER log registers of dev: set the
+ * first error pointer to the lowest status bit of err and fill (or zero)
+ * the header log and TLP prefix log registers.
+ *
+ * The TLP prefix log is filled only when err carries a prefix and the
+ * device advertises End-End TLP Prefix support.
+ */
+static void pcie_aer_update_log(PCIDevice *dev, const struct pcie_aer_err *err)
+{
+    uint8_t *aer_cap = dev->config + pcie_aer_cap(dev);
+    uint8_t first_bit = ffsl(err->status) - 1;
+    uint32_t errcap = pci_get_long(aer_cap + PCI_ERR_CAP);
+    int i;
+    uint32_t dw;
+
+    errcap &= ~(PCI_ERR_CAP_FEP_MASK | PCI_ERR_CAP_TLP);
+    errcap |= PCI_ERR_CAP_FEP(first_bit);
+
+    if (err->flags & PCIE_AER_ERR_HEADER_VALID) {
+        for (i = 0; i < ARRAY_SIZE(err->header); ++i) {
+            /* 7.10.8 Header Log Register */
+            /* each dword is stored big endian, through a bounce variable */
+            cpu_to_be32wu(&dw, err->header[i]);
+            memcpy(aer_cap + PCI_ERR_HEADER_LOG + sizeof(err->header[0]) * i,
+                   &dw, sizeof(dw));
+        }
+    } else {
+        /* a prefix without a header makes no sense */
+        assert(!(err->flags & PCIE_AER_ERR_TLP_PRESENT));
+        memset(aer_cap + PCI_ERR_HEADER_LOG, 0, sizeof(err->header));
+    }
+
+    /*
+     * PCI_EXP_DEVCAP2_EETLPP is a bit of the Device Capabilities 2 register,
+     * so that is the register to test; reading Device Control 2 would test
+     * an unrelated control bit.
+     */
+    if ((err->flags & PCIE_AER_ERR_TLP_PRESENT) &&
+        (pci_get_long(dev->config + pci_pcie_cap(dev) + PCI_EXP_DEVCAP2) &
+         PCI_EXP_DEVCAP2_EETLPP)) {
+        for (i = 0; i < ARRAY_SIZE(err->prefix); ++i) {
+            /* 7.10.12 tlp prefix log register */
+            cpu_to_be32wu(&dw, err->prefix[i]);
+            memcpy(aer_cap + PCI_ERR_TLP_PREFIX_LOG +
+                   sizeof(err->prefix[0]) * i, &dw, sizeof(dw));
+        }
+        errcap |= PCI_ERR_CAP_TLP;
+    } else {
+        memset(aer_cap + PCI_ERR_TLP_PREFIX_LOG, 0, sizeof(err->prefix));
+    }
+    pci_set_long(aer_cap + PCI_ERR_CAP, errcap);
+}
+
+/*
+ * Reset the AER log registers of dev: clear the first error pointer and the
+ * TLP-prefix-present flag, and zero the header and TLP prefix logs.
+ */
+static void pcie_aer_clear_log(PCIDevice *dev)
+{
+    /* only used to size the log registers; never dereferenced */
+    const struct pcie_aer_err *layout = NULL;
+    uint8_t *aer_cap = dev->config + pcie_aer_cap(dev);
+    uint32_t errcap = pci_get_long(aer_cap + PCI_ERR_CAP);
+
+    pci_set_long(aer_cap + PCI_ERR_CAP,
+                 errcap & ~(PCI_ERR_CAP_FEP_MASK | PCI_ERR_CAP_TLP));
+
+    memset(aer_cap + PCI_ERR_HEADER_LOG, 0, sizeof(layout->header));
+    memset(aer_cap + PCI_ERR_TLP_PREFIX_LOG, 0, sizeof(layout->prefix));
+}
+
+/*
+ * Record err for dev.
+ *
+ * When multiple header recording is enabled and the error the first error
+ * pointer refers to is still pending, err is queued in the error log.
+ * Otherwise err is written to the log registers directly.
+ *
+ * Returns 0 on success, -1 when the error log overflowed.
+ */
+static int pcie_aer_record_error(PCIDevice *dev,
+                                 const struct pcie_aer_err *err)
+{
+    uint8_t *aer_cap = dev->config + pcie_aer_cap(dev);
+    uint32_t errcap = pci_get_long(aer_cap + PCI_ERR_CAP);
+    int fep = PCI_ERR_CAP_FEP(errcap);
+    bool must_queue =
+        (errcap & PCI_ERR_CAP_MHRE) &&
+        (pci_get_long(aer_cap + PCI_ERR_UNCOR_STATUS) & (1ULL << fep));
+
+    if (!must_queue) {
+        pcie_aer_update_log(dev, err);
+        return 0;
+    }
+
+    /*  Not first error. queue error; a negative result means overflow */
+    return pcie_aer_log_add_err(&dev->aer_log, err) < 0 ? -1 : 0;
+}
+
+/*
+ * The error the first error pointer refers to has been acknowledged.
+ *
+ * Without multiple header recording, or with nothing queued, the log
+ * registers are reset and the status bit is cleared.  Otherwise the status
+ * bit is cleared only if no queued error has the same bit, and the oldest
+ * queued error becomes the one described by the log registers.
+ */
+static void pcie_aer_clear_error(PCIDevice *dev)
+{
+    struct pcie_aer_log *aer_log = &dev->aer_log;
+    uint8_t *aer_cap = dev->config + pcie_aer_cap(dev);
+    uint8_t *uncor_status = aer_cap + PCI_ERR_UNCOR_STATUS;
+    uint32_t errcap = pci_get_long(aer_cap + PCI_ERR_CAP);
+    uint32_t first_err = (1UL << PCI_ERR_CAP_FEP(errcap));
+    bool still_queued = false;
+    uint32_t idx;
+
+    if (!(errcap & PCI_ERR_CAP_MHRE) || pcie_aer_log_empty(aer_log)) {
+        pcie_aer_clear_log(dev);
+        pci_set_long(uncor_status, pci_get_long(uncor_status) & ~first_err);
+        return;
+    }
+
+    /* if no same error is queued, clear bit in uncorrectable error status */
+    for (idx = aer_log->consumer;
+         !pcie_aer_log_empty_index(aer_log->producer, idx);
+         idx = pcie_aer_log_next(idx, aer_log->log_max)) {
+        if (aer_log->log[idx].status & first_err) {
+            still_queued = true;
+            break;
+        }
+    }
+    if (!still_queued) {
+        pci_set_long(uncor_status, pci_get_long(uncor_status) & ~first_err);
+    }
+
+    /* promote the oldest queued error to the log registers */
+    pcie_aer_update_log(dev, pcie_aer_log_del_err(aer_log));
+}
+
+/*
+ * Inject the error err into dev.
+ *
+ * A non-function-specific error must be recorded in all functions.
+ * That is the responsibility of the caller of this function.
+ * It is also the caller's responsibility to determine which function should
+ * report the error.
+ *
+ * 6.2.4 Error Logging
+ * 6.2.5 Sequence of Device Error Signaling and Logging Operations
+ * figure 6-2: Flowchart Showing Sequence of Device Error Signaling and
+ *             Logging Operations
+ *
+ * Although this implementation can be shortened/optimized, this is kept
+ * parallel to the flowchart.
+ *
+ * err->status must have exactly one supported bit set; anything else is
+ * silently ignored, as is a device that is not PCI express.
+ */
+void pcie_aer_inject_error(PCIDevice *dev, const struct pcie_aer_err *err)
+{
+    uint8_t *exp_cap;
+    uint8_t *aer_cap = NULL;
+    uint16_t devctl = 0;
+    uint16_t devsta = 0;
+    uint32_t status = err->status;
+    uint32_t mask;
+    bool is_unsupported_request =
+        (!(err->flags & PCIE_AER_ERR_IS_CORRECTABLE) &&
+         err->status == PCI_ERR_UNC_UNSUP);
+    bool is_advisory_nonfatal = false;  /* for advisory non-fatal error */
+    uint32_t uncor_status = 0;          /* for advisory non-fatal error */
+    struct pcie_aer_err_msg msg;
+    int is_header_log_overflowed = 0;
+
+    if (!pci_is_express(dev)) {
+        /* What to do? */
+        return;
+    }
+
+    if (err->flags & PCIE_AER_ERR_IS_CORRECTABLE) {
+        status &= PCI_ERR_COR_SUPPORTED;
+    } else {
+        status &= PCI_ERR_UNC_SUPPORTED;
+    }
+    if (!status || status & (status - 1)) {
+        /* invalid status bit. one and only one bit must be set */
+        return;
+    }
+
+    exp_cap = dev->config + pci_pcie_cap(dev);
+    /*
+     * Device Control and Device Status are 16 bit registers, so they are
+     * accessed as words; a 32 bit write to Device Status would spill into
+     * the register that follows it.
+     * Device Status is read even without an AER capability so that the
+     * write-back below keeps the bits that are already set.
+     */
+    devsta = pci_get_word(exp_cap + PCI_EXP_DEVSTA);
+    if (dev->exp->aer_cap) {
+        aer_cap = dev->config + pcie_aer_cap(dev);
+        /*
+         * NOTE(review): without an AER capability devctl stays 0, so only
+         * SERR in the command register can enable a message; confirm that
+         * this is intended.
+         */
+        devctl = pci_get_word(exp_cap + PCI_EXP_DEVCTL);
+    }
+    if (err->flags & PCIE_AER_ERR_IS_CORRECTABLE) {
+        /* also entered from below for advisory non-fatal errors */
+    correctable_error:
+        devsta |= PCI_EXP_DEVSTA_CED;
+        if (is_unsupported_request) {
+            devsta |= PCI_EXP_DEVSTA_URD;
+        }
+        pci_set_word(exp_cap + PCI_EXP_DEVSTA, devsta);
+
+        if (aer_cap) {
+            pci_set_long(aer_cap + PCI_ERR_COR_STATUS,
+                         pci_get_long(aer_cap + PCI_ERR_COR_STATUS) | status);
+            mask = pci_get_long(aer_cap + PCI_ERR_COR_MASK);
+            if (mask & status) {
+                return;
+            }
+            if (is_advisory_nonfatal) {
+                /* log the underlying uncorrectable error unless masked */
+                uint32_t uncor_mask =
+                    pci_get_long(aer_cap + PCI_ERR_UNCOR_MASK);
+                if (!(uncor_mask & uncor_status)) {
+                    is_header_log_overflowed = pcie_aer_record_error(dev, err);
+                }
+                pci_set_long(aer_cap + PCI_ERR_UNCOR_STATUS,
+                             pci_get_long(aer_cap + PCI_ERR_UNCOR_STATUS) |
+                             uncor_status);
+            }
+        }
+
+        if (is_unsupported_request && !(devctl & PCI_EXP_DEVCTL_URRE)) {
+            return;
+        }
+        if (!(devctl & PCI_EXP_DEVCTL_CERE)) {
+            return;
+        }
+        msg.severity = AER_ERR_COR;
+    } else {
+        bool is_fatal =
+            (pcie_aer_uncor_default_severity(status) == AER_ERR_FATAL);
+        uint16_t cmd;
+
+        if (aer_cap) {
+            /* with AER the severity register overrides the default */
+            is_fatal = status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER);
+        }
+        if (!is_fatal && (err->flags & PCIE_AER_ERR_MAYBE_ADVISORY)) {
+            /* report as the correctable "advisory non-fatal" error */
+            is_advisory_nonfatal = true;
+            uncor_status = status;
+            status = PCI_ERR_COR_ADV_NONFATAL;
+            goto correctable_error;
+        }
+        if (is_fatal) {
+            devsta |= PCI_EXP_DEVSTA_FED;
+        } else {
+            devsta |= PCI_EXP_DEVSTA_NFED;
+        }
+        if (is_unsupported_request) {
+            devsta |= PCI_EXP_DEVSTA_URD;
+        }
+        pci_set_word(exp_cap + PCI_EXP_DEVSTA, devsta);
+
+        if (aer_cap) {
+            mask = pci_get_long(aer_cap + PCI_ERR_UNCOR_MASK);
+            if (mask & status) {
+                /* masked: the status bit is set, nothing is logged or sent */
+                pci_set_long(aer_cap + PCI_ERR_UNCOR_STATUS,
+                             pci_get_long(aer_cap + PCI_ERR_UNCOR_STATUS) |
+                             status);
+                return;
+            }
+
+            /*
+             * record before setting the status bit: recording looks at the
+             * status register to tell whether an earlier error is pending
+             */
+            is_header_log_overflowed = pcie_aer_record_error(dev, err);
+            pci_set_long(aer_cap + PCI_ERR_UNCOR_STATUS,
+                         pci_get_long(aer_cap + PCI_ERR_UNCOR_STATUS) |
+                         status);
+        }
+
+        cmd = pci_get_word(dev->config + PCI_COMMAND);
+        if (is_unsupported_request &&
+            !(devctl & PCI_EXP_DEVCTL_URRE) && !(cmd & PCI_COMMAND_SERR)) {
+            return;
+        }
+        if (is_fatal) {
+            if (!((cmd & PCI_COMMAND_SERR) ||
+                  (devctl & PCI_EXP_DEVCTL_FERE))) {
+                return;
+            }
+            msg.severity = AER_ERR_FATAL;
+        } else {
+            if (!((cmd & PCI_COMMAND_SERR) ||
+                  (devctl & PCI_EXP_DEVCTL_NFERE))) {
+                return;
+            }
+            msg.severity = AER_ERR_NONFATAL;
+        }
+    }
+
+    /* send up error message */
+    msg.source_id = err->source_id;
+    pcie_aer_errmsg(dev, &msg);
+
+    if (is_header_log_overflowed) {
+        /* report the overflow itself as a correctable error of this device */
+        struct pcie_aer_err header_log_overflow = {
+            .status = PCI_ERR_COR_HL_OVERFLOW,
+            .source_id = err->source_id,
+            .flags = PCIE_AER_ERR_IS_CORRECTABLE,
+            .header = {0, 0, 0, 0},
+            .prefix = {0, 0, 0, 0},
+        };
+        pcie_aer_inject_error(dev, &header_log_overflow);
+    }
+}
+
+/* aer root error command/status */
+/* all reporting enable bits of the root error command register */
+#define PCI_ERR_ROOT_CMD_EN_MASK        (PCI_ERR_ROOT_CMD_COR_EN |      \
+                                         PCI_ERR_ROOT_CMD_NONFATAL_EN | \
+                                         PCI_ERR_ROOT_CMD_FATAL_EN)
+
+/*
+ * Interrupt message number field of the root error status register.
+ * The shift must be the position of the lowest set bit of the mask (bit 27
+ * for 0xf8000000); otherwise the lowest bit of a vector is masked away and
+ * a vector that is set does not read back unchanged.
+ */
+#define PCI_ERR_ROOT_IRQ_SHIFT          27
+#define PCI_ERR_ROOT_IRQ                0xf8000000
+/* status bits of the root error status register that are write-1-to-clear */
+#define PCI_ERR_ROOT_STATUS_REPORT_MASK (PCI_ERR_ROOT_COR_RCV |         \
+                                         PCI_ERR_ROOT_MULTI_COR_RCV |   \
+                                         PCI_ERR_ROOT_UNCOR_RCV |       \
+                                         PCI_ERR_ROOT_MULTI_UNCOR_RCV | \
+                                         PCI_ERR_ROOT_FIRST_FATAL |     \
+                                         PCI_ERR_ROOT_NONFATAL_RCV |    \
+                                         PCI_ERR_ROOT_FATAL_RCV)
+
+/*
+ * Store vector in the interrupt message number field of the root error
+ * status register of dev, leaving the other bits untouched.
+ */
+void pcie_aer_root_set_vector(PCIDevice *dev, uint8_t vector)
+{
+    uint8_t *reg = dev->config + pcie_aer_cap(dev) + PCI_ERR_ROOT_STATUS;
+    uint32_t irq_field =
+        (((uint32_t)vector) << PCI_ERR_ROOT_IRQ_SHIFT) & PCI_ERR_ROOT_IRQ;
+    uint32_t other_bits = pci_get_long(reg) & ~PCI_ERR_ROOT_IRQ;
+
+    pci_set_long(reg, other_bits | irq_field);
+}
+
+/*
+ * Read the interrupt message number field of the root error status
+ * register of dev.
+ */
+static uint8_t pcie_aer_root_get_vector(PCIDevice *dev)
+{
+    uint8_t *reg = dev->config + pcie_aer_cap(dev) + PCI_ERR_ROOT_STATUS;
+    uint32_t irq_field = pci_get_long(reg) & PCI_ERR_ROOT_IRQ;
+
+    return irq_field >> PCI_ERR_ROOT_IRQ_SHIFT;
+}
+
+/* Notify through pcie_notify() using the AER interrupt vector of dev. */
+static void pcie_aer_root_notify(PCIDevice *dev, bool trigger, int level)
+{
+    uint8_t vector = pcie_aer_root_get_vector(dev);
+
+    pcie_notify(dev, vector, trigger, level);
+}
+
+/*
+ * Root port specific AER setup: make the reporting enable bits of the root
+ * error command register writable and install the root port error-message
+ * handler.
+ */
+void pcie_aer_root_init(PCIDevice *dev)
+{
+    uint8_t *root_cmd_wmask =
+        dev->wmask + pcie_aer_cap(dev) + PCI_ERR_ROOT_COMMAND;
+
+    pci_set_long(root_cmd_wmask, PCI_ERR_ROOT_CMD_EN_MASK);
+    dev->exp->aer_errmsg = pcie_aer_errmsg_root_port;
+}
+
+/*
+ * Reset the root port specific AER state: the root error command register
+ * is zeroed.
+ *
+ * The Advanced Error Interrupt Message Number in the Root Error Status
+ * Register is not touched here; it must be updated by chip dependent code.
+ */
+void pcie_aer_root_reset(PCIDevice *dev)
+{
+    uint8_t *root_cmd = dev->config + pcie_aer_cap(dev) + PCI_ERR_ROOT_COMMAND;
+
+    pci_set_long(root_cmd, 0);
+}
+
+/*
+ * True when at least one severity has both its reporting enable bit set in
+ * cmd and its "received" bit set in sta.
+ */
+static bool pcie_aer_root_does_trigger(uint32_t cmd, uint32_t sta)
+{
+    if ((cmd & PCI_ERR_ROOT_CMD_COR_EN) && (sta & PCI_ERR_ROOT_COR_RCV)) {
+        return true;
+    }
+    if ((cmd & PCI_ERR_ROOT_CMD_NONFATAL_EN) &&
+        (sta & PCI_ERR_ROOT_NONFATAL_RCV)) {
+        return true;
+    }
+    return (cmd & PCI_ERR_ROOT_CMD_FATAL_EN) && (sta & PCI_ERR_ROOT_FATAL_RCV);
+}
+
+/*
+ * Config-space write hook for the root port specific AER registers of dev.
+ *
+ * addr/val/len describe the guest write; root_cmd_prev is the value the
+ * root error command register had before the write.
+ *
+ * Applies write-1-to-clear to the root error status register.  After a
+ * write to the root error command register that leaves any reporting
+ * enabled, the root port is notified: trigger is set when a newly enabled
+ * severity already has its status bit set, level when any enabled severity
+ * has.
+ */
+void pcie_aer_root_write_config(PCIDevice *dev,
+                                uint32_t addr, uint32_t val, int len,
+                                uint32_t root_cmd_prev)
+{
+    uint16_t pos = pcie_aer_cap(dev);
+    uint8_t *aer_cap = dev->config + pos;
+    uint32_t root_cmd;
+    uint32_t root_status;
+    uint32_t newly_enabled;
+
+    if (ranges_overlap(addr, len, pos + PCI_ERR_ROOT_STATUS, 4)) {
+        /* RW1CS */
+        pcie_w1c_long(dev, pos + PCI_ERR_ROOT_STATUS,
+                      PCI_ERR_ROOT_STATUS_REPORT_MASK, addr, val);
+    }
+
+    /* root command */
+    if (!ranges_overlap(addr, len, pos + PCI_ERR_ROOT_COMMAND, 4)) {
+        return;
+    }
+    root_cmd = pci_get_long(aer_cap + PCI_ERR_ROOT_COMMAND);
+    if (!(root_cmd & PCI_ERR_ROOT_CMD_EN_MASK)) {
+        return;
+    }
+
+    /* bits that went 0 -> 1 */
+    newly_enabled = (root_cmd_prev ^ root_cmd) & root_cmd;
+    root_status = pci_get_long(aer_cap + PCI_ERR_ROOT_STATUS);
+    pcie_aer_root_notify(
+        dev,
+        pcie_aer_root_does_trigger(newly_enabled, root_status),
+        pcie_aer_root_does_trigger(root_cmd, root_status) ? 1 : 0);
+}
+
+/*
+ * Migration description of one struct pcie_aer_err: every member of the
+ * structure is listed, in declaration order.
+ */
+static const VMStateDescription vmstate_pcie_aer_err = {
+    .name = "PCIE_AER_ERROR",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .minimum_version_id_old = 1,
+    .fields     = (VMStateField[]) {
+        VMSTATE_UINT32(status, struct pcie_aer_err),
+        VMSTATE_UINT16(source_id, struct pcie_aer_err),
+        VMSTATE_UINT16(flags, struct pcie_aer_err),
+        /* 4 matches the array sizes in struct pcie_aer_err */
+        VMSTATE_UINT32_ARRAY(header, struct pcie_aer_err, 4),
+        VMSTATE_UINT32_ARRAY(prefix, struct pcie_aer_err, 4),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+/*
+ * VMStateField initializer for the error array of the AER log.
+ *
+ *   _field      member of _state pointing to the array (elements of _type)
+ *   _field_num  uint16_t member of _state holding the element count
+ *   _vmsd       VMStateDescription of one element
+ *
+ * Going by the flag names: a pointer to a variable length array of
+ * structures whose length is a 16 bit field.
+ */
+#define VMSTATE_PCIE_AER_ERRS(_field, _state, _field_num, _vmsd, _type) { \
+    .name       = (stringify(_field)),                                    \
+    .version_id = 0,                                                      \
+    .num_offset = vmstate_offset_value(_state, _field_num, uint16_t),     \
+    .size       = sizeof(_type),                                          \
+    .vmsd       = &(_vmsd),                                               \
+    .flags      = VMS_POINTER | VMS_VARRAY_UINT16 | VMS_STRUCT,           \
+    .offset     = vmstate_offset_pointer(_state, _field, _type),          \
+}
+
+/*
+ * Migration description of struct pcie_aer_log: the ring indices, the
+ * number of slots and the slots themselves.
+ *
+ * NOTE(review): log_max is part of the stream while the log buffer is
+ * allocated by pcie_aer_init(); confirm that source and destination are
+ * guaranteed to use the same log_max.
+ */
+const VMStateDescription vmstate_pcie_aer_log = {
+    .name = "PCIE_AER_ERROR_LOG",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .minimum_version_id_old = 1,
+    .fields     = (VMStateField[]) {
+        VMSTATE_UINT32(producer, struct pcie_aer_log),
+        VMSTATE_UINT32(consumer, struct pcie_aer_log),
+        VMSTATE_UINT16(log_max, struct pcie_aer_log),
+        /* log_max elements of struct pcie_aer_err behind the log pointer */
+        VMSTATE_PCIE_AER_ERRS(log, struct pcie_aer_log, log_max,
+                              vmstate_pcie_aer_err, struct pcie_aer_err),
+        VMSTATE_END_OF_LIST()
+    }
+};
diff --git a/hw/pcie.h b/hw/pcie.h
new file mode 100644
index 0000000..07f42c6
--- /dev/null
+++ b/hw/pcie.h
@@ -0,0 +1,186 @@ 
+/*
+ * pcie.h
+ *
+ * Copyright (c) 2010 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef QEMU_PCIE_H
+#define QEMU_PCIE_H
+
+#include "hw.h"
+
+/*
+ * States of a hot plug indicator.
+ * The values are written in hexadecimal: binary literals (0b...) are a
+ * compiler extension and not part of C99/C11.
+ */
+enum PCIExpressIndicator {
+    /* for attention and power indicator */
+    PCI_EXP_HP_IND_RESERVED     = 0x0,
+    PCI_EXP_HP_IND_ON           = 0x1,
+    PCI_EXP_HP_IND_BLINK        = 0x2,
+    PCI_EXP_HP_IND_OFF          = 0x3,
+};
+
+/*
+ * Hot plug events.
+ * The values are written in hexadecimal: binary literals (0b...) are a
+ * compiler extension and not part of C99/C11.
+ */
+enum PCIExpressHotPlugEvent {
+    /* the bits match the bits in Slot Control/Status registers.
+     * PCI_EXP_HP_EV_xxx = PCI_EXP_SLTCTL_xxxE = PCI_EXP_SLTSTA_xxx
+     */
+    PCI_EXP_HP_EV_ABP   = 0x01,         /* attention button pressed */
+    PCI_EXP_HP_EV_PDC   = 0x08,         /* presence detect changed */
+    PCI_EXP_HP_EV_CCI   = 0x10,         /* command completed */
+
+    /* supported event mask: ABP | PDC | CCI */
+    PCI_EXP_HP_EV_SUPPORTED     = 0x19,
+    /* events not listed aren't supported */
+};
+
+/* FLR callback of a device; registered with pcie_cap_flr_init() */
+typedef void (*pcie_flr_fn)(PCIDevice *dev);
+
+struct pcie_aer_err_msg;
+
+/* result of handing an AER error message to a device's handler */
+typedef enum AER_ERR_MSG_RESULT {
+    AER_ERR_MSG_MASKED,
+    AER_ERR_MSG_SENT,
+} AER_ERR_MSG_RESULT;
+
+/* per-device AER error message handler */
+typedef AER_ERR_MSG_RESULT (*pcie_aer_errmsg_fn)(
+    PCIDevice *dev, const struct pcie_aer_err_msg *msg);
+
+/* PCI express specific state of a PCI device */
+struct PCIExpressDevice {
+    /* Offset of express capability in config space */
+    uint8_t exp_cap;
+
+    /* FLR */
+    pcie_flr_fn flr;
+
+    /* AER */
+    /* offset of the AER extended capability; 0 is treated as "no AER" */
+    uint16_t aer_cap;
+    /* error message handler; set by pcie_aer_init()/pcie_aer_root_init() */
+    pcie_aer_errmsg_fn aer_errmsg;
+};
+
+/*
+ * Ring buffer of AER errors that wait for their turn in the log registers.
+ * The ring is empty when producer == consumer.
+ */
+struct pcie_aer_log {
+    uint32_t producer;  /* index of the next slot to fill */
+    uint32_t consumer;  /* index of the oldest queued error */
+
+#define PCIE_AER_LOG_MAX_DEFAULT        8
+#define PCIE_AER_LOG_MAX_MAX            128 /* what is appropriate? */
+/*
+ * "not configured" marker for log_max.  It is spelled as a plain constant:
+ * (~(uint16_t)0) is evaluated after integer promotion and yields the int -1,
+ * which never compares equal to a uint16_t member.
+ */
+#define PCIE_AER_LOG_MAX_UNSET          0xffff
+    uint16_t log_max;   /* number of slots in log */
+
+    struct pcie_aer_err *log;   /* log_max slots, allocated by pcie_aer_init */
+};
+
+/* migration description of struct pcie_aer_log, defined in pcie.c */
+extern const VMStateDescription vmstate_pcie_aer_log;
+
+/* PCI express capability helper functions */
+/* capability setup/teardown, device/port type and interrupt vector */
+int pci_pcie_cap_init(PCIDevice *dev,
+                      uint8_t offset, uint8_t type, uint8_t port);
+int pci_pcie_cap_exit(PCIDevice *dev);
+uint8_t pcie_cap_get_type(const PCIDevice *dev);
+void pcie_cap_flags_set_vector(PCIDevice *dev, uint8_t vector);
+uint8_t pcie_cap_flags_get_vector(PCIDevice *dev);
+
+/* device error reporting */
+void pcie_cap_deverr_init(PCIDevice *dev);
+void pcie_cap_deverr_reset(PCIDevice *dev);
+void pcie_cap_deverr_write_config(PCIDevice *dev,
+                                  uint32_t addr, uint32_t val, int len);
+
+/*
+ * slot / hot plug
+ * sltctl_prev: presumably the Slot Control value before the write
+ * (TODO confirm against the implementation)
+ */
+void pcie_cap_slot_init(PCIDevice *dev, uint16_t slot);
+void pcie_cap_slot_reset(PCIDevice *dev);
+void pcie_cap_slot_write_config(PCIDevice *dev,
+                                uint32_t addr, uint32_t val, int len,
+                                uint16_t sltctl_prev);
+void pcie_cap_slot_push_attention_button(PCIDevice *dev);
+
+/* root port */
+void pcie_cap_root_init(PCIDevice *dev);
+void pcie_cap_root_reset(PCIDevice *dev);
+
+/* FLR; flr is the callback stored for the device */
+void pcie_cap_flr_init(PCIDevice *dev, pcie_flr_fn flr);
+void pcie_cap_flr_write_config(PCIDevice *dev,
+                           uint32_t addr, uint32_t val, int len);
+
+/* ARI */
+void pcie_cap_ari_init(PCIDevice *dev);
+void pcie_cap_ari_reset(PCIDevice *dev);
+bool pcie_cap_is_ari_enabled(const PCIDevice *dev);
+
+/* PCI express extended capability helper functions */
+/*
+ * cap_id/cap_ver identify the extended capability, offset is its position
+ * in config space and size its length in bytes.
+ * pcie_append_ext_capability() returns a negative value on failure, the
+ * position otherwise (this is how pcie_aer_init() uses it).
+ */
+uint16_t pcie_find_ext_capability(PCIDevice *dev, uint16_t cap_id);
+int pcie_add_ext_capability(PCIDevice *dev,
+                            uint16_t cap_id, uint8_t cap_ver, uint16_t size);
+int pcie_append_ext_capability(PCIDevice *dev,
+                               uint16_t cap_id, uint8_t cap_ver,
+                               uint16_t offset, uint16_t size);
+void pcie_del_ext_capability(PCIDevice *dev, uint16_t cap_id, uint16_t size);
+void pcie_reserve_ext_capability(PCIDevice *dev,
+                                 uint16_t offset, uint16_t size);
+
+/* ARI extended capability */
+int pcie_ari_init(PCIDevice *dev, uint16_t offset, uint16_t nextfn);
+
+/* PCI express extended capabilities */
+
+/* AER */
+/* aer error severity */
+enum PCIE_AER_SEVERITY {
+    /* These values are the same as the corresponding enable bits of the
+     * Root Error Command register in the AER extended capability and of the
+     * Root Control register in the PCI express capability.  The code relies
+     * on this: it masks a severity directly against such registers.
+     */
+    AER_ERR_COR         = 0x1,
+    AER_ERR_NONFATAL    = 0x2,
+    AER_ERR_FATAL       = 0x4,
+};
+
+/* aer error message: an error signaling message carries only the error
+   severity and the source id. See 2.2.8.3 error signaling messages */
+struct pcie_aer_err_msg {
+    enum PCIE_AER_SEVERITY severity;
+    uint16_t source_id; /* bdf */
+};
+
+/* True for the two uncorrectable severities, non-fatal and fatal. */
+static inline bool
+pcie_aer_err_msg_is_uncor(const struct pcie_aer_err_msg *msg)
+{
+    switch (msg->severity) {
+    case AER_ERR_NONFATAL:
+    case AER_ERR_FATAL:
+        return true;
+    default:
+        return false;
+    }
+}
+
+/* error: describes one error to inject and, when queued, to log */
+struct pcie_aer_err {
+    /* error status bits; injection requires exactly one supported bit */
+    uint32_t status;    /* error status bits */
+    uint16_t source_id; /* bdf */
+
+#define PCIE_AER_ERR_IS_CORRECTABLE     0x1     /* correctable/uncorrectable */
+#define PCIE_AER_ERR_MAYBE_ADVISORY     0x2     /* maybe advisory non-fatal */
+#define PCIE_AER_ERR_HEADER_VALID       0x4     /* TLP header is logged */
+#define PCIE_AER_ERR_TLP_PRESENT        0x8     /* TLP Prefix is logged */
+    /* combination of the PCIE_AER_ERR_xxx bits above */
+    uint16_t flags;
+
+    uint32_t header[4]; /* TLP header */
+    uint32_t prefix[4]; /* TLP header prefix */
+};
+
+/*
+ * AER capability setup/teardown and config write hooks.
+ * pcie_aer_init() returns the capability position or a negative value.
+ * addr/val/len of the write hooks describe the guest's config write.
+ */
+int pcie_aer_init(PCIDevice *dev, uint16_t offset);
+void pcie_aer_exit(PCIDevice *dev);
+void pcie_aer_write_config(PCIDevice *dev,
+                           uint32_t addr, uint32_t val, int len);
+void pcie_aer_write_config_vbridge(PCIDevice *dev,
+                                   uint32_t addr, uint32_t val, int len);
+
+/* aer root port */
+/* root_cmd_prev: Root Error Command register value before the write */
+void pcie_aer_root_set_vector(PCIDevice *dev, uint8_t vector);
+void pcie_aer_root_init(PCIDevice *dev);
+void pcie_aer_root_reset(PCIDevice *dev);
+void pcie_aer_root_write_config(PCIDevice *dev,
+                                uint32_t addr, uint32_t val, int len,
+                                uint32_t root_cmd_prev);
+
+/* error injection */
+void pcie_aer_inject_error(PCIDevice *dev, const struct pcie_aer_err *err);
+
+#endif /* QEMU_PCIE_H */
diff --git a/qemu-common.h b/qemu-common.h
index d735235..6d9ee26 100644
--- a/qemu-common.h
+++ b/qemu-common.h
@@ -219,6 +219,7 @@  typedef struct PCIHostState PCIHostState;
 typedef struct PCIExpressHost PCIExpressHost;
 typedef struct PCIBus PCIBus;
 typedef struct PCIDevice PCIDevice;
+typedef struct PCIExpressDevice PCIExpressDevice;
 typedef struct PCIBridge PCIBridge;
 typedef struct SerialState SerialState;
 typedef struct IRQState *qemu_irq;