Patchwork [v6,08/12] pcie/aer: helper functions for pcie aer capability

login
register
mail settings
Submitter Isaku Yamahata
Date Oct. 20, 2010, 8:18 a.m.
Message ID <b3216e946c77598d47c76c9b8cdb6d46dbeeea73.1287562197.git.yamahata@valinux.co.jp>
Download mbox | patch
Permalink /patch/68396/
State New
Headers show

Comments

Isaku Yamahata - Oct. 20, 2010, 8:18 a.m.
This patch implements helper functions for pcie aer capability
which will be used later.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
---
Chnages v5 -> v6:
- cleaned up pcie_aer_write_config().
- enum definition.

Changes v4 -> v5:
- use pci_xxx_test_and_xxx_mask()
- rewrote PCIDevice::written bits.
- eliminated pcie_aer_notify()
- introduced PCIExpressDevice::aer_intx

Changes v3 -> v4:
- various naming fixes.
- use pci bit operation helper function
- eliminate errmsg function pointer
- replace pci_shift_xxx() with PCIDevice::written
- uncorrect error status register.
- dropped pcie_aer_cap()

Changes v2 -> v3:
- split out from pcie.[ch] to pcie_aer.[ch] to make the files sorter.
- embeded PCIExpressDevice into PCIDevice.
- CodingStyle fix
---
 Makefile.objs |    2 +-
 hw/pcie.h     |   14 +
 hw/pcie_aer.c |  780 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 hw/pcie_aer.h |  105 ++++++++
 qemu-common.h |    3 +
 5 files changed, 903 insertions(+), 1 deletions(-)
 create mode 100644 hw/pcie_aer.c
 create mode 100644 hw/pcie_aer.h
Michael S. Tsirkin - Oct. 20, 2010, 9:56 a.m.
On Wed, Oct 20, 2010 at 05:18:57PM +0900, Isaku Yamahata wrote:
> This patch implements helper functions for pcie aer capability
> which will be used later.
> 
> Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>

Some style comments and a couple of minor bugs.

> ---
> Chnages v5 -> v6:
> - cleaned up pcie_aer_write_config().
> - enum definition.
> 
> Changes v4 -> v5:
> - use pci_xxx_test_and_xxx_mask()
> - rewrote PCIDevice::written bits.
> - eliminated pcie_aer_notify()
> - introduced PCIExpressDevice::aer_intx
> 
> Changes v3 -> v4:
> - various naming fixes.
> - use pci bit operation helper function
> - eliminate errmsg function pointer
> - replace pci_shift_xxx() with PCIDevice::written
> - uncorrect error status register.
> - dropped pcie_aer_cap()
> 
> Changes v2 -> v3:
> - split out from pcie.[ch] to pcie_aer.[ch] to make the files sorter.
> - embeded PCIExpressDevice into PCIDevice.
> - CodingStyle fix
> ---
>  Makefile.objs |    2 +-
>  hw/pcie.h     |   14 +
>  hw/pcie_aer.c |  780 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  hw/pcie_aer.h |  105 ++++++++
>  qemu-common.h |    3 +
>  5 files changed, 903 insertions(+), 1 deletions(-)
>  create mode 100644 hw/pcie_aer.c
>  create mode 100644 hw/pcie_aer.h
> 
> diff --git a/Makefile.objs b/Makefile.objs
> index 138e545..48f98f3 100644
> --- a/Makefile.objs
> +++ b/Makefile.objs
> @@ -187,7 +187,7 @@ hw-obj-$(CONFIG_PIIX4) += piix4.o
>  # PCI watchdog devices
>  hw-obj-y += wdt_i6300esb.o
>  
> -hw-obj-y += pcie.o pcie_port.o
> +hw-obj-y += pcie.o pcie_aer.o pcie_port.o
>  hw-obj-y += msix.o msi.o
>  
>  # PCI network cards
> diff --git a/hw/pcie.h b/hw/pcie.h
> index 2871e27..415a680 100644
> --- a/hw/pcie.h
> +++ b/hw/pcie.h
> @@ -24,6 +24,7 @@
>  #include "hw.h"
>  #include "pci_regs.h"
>  #include "pcie_regs.h"
> +#include "pcie_aer.h"
>  
>  typedef enum {
>      /* for attention and power indicator */
> @@ -74,6 +75,19 @@ struct PCIExpressDevice {
>                                   * also initialize it when loaded as
>                                   * appropreately.
>                                   */
> +
> +    /* AER */
> +    uint16_t aer_cap;
> +    PCIEAERLog aer_log;
> +    unsigned int aer_intx;      /* INTx for error reporting
> +                                 * default is 0 = INTA#
> +                                 * If the chip wants to use other interrupt
> +                                 * line, initialize this member with the
> +                                 * desired number.
> +                                 * If the chip dynamically changes this member,
> +                                 * also initialize it when loaded as
> +                                 * appropreately.
> +                                 */
>  };
>  
>  /* PCI express capability helper functions */
> diff --git a/hw/pcie_aer.c b/hw/pcie_aer.c
> new file mode 100644
> index 0000000..b8cede3
> --- /dev/null
> +++ b/hw/pcie_aer.c
> @@ -0,0 +1,780 @@
> +/*
> + * pcie_aer.c
> + *
> + * Copyright (c) 2010 Isaku Yamahata <yamahata at valinux co jp>
> + *                    VA Linux Systems Japan K.K.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "sysemu.h"
> +#include "pci_bridge.h"
> +#include "pcie.h"
> +#include "msix.h"
> +#include "msi.h"
> +#include "pci_internals.h"
> +#include "pcie_regs.h"
> +
> +//#define DEBUG_PCIE
> +#ifdef DEBUG_PCIE
> +# define PCIE_DPRINTF(fmt, ...)                                         \
> +    fprintf(stderr, "%s:%d " fmt, __func__, __LINE__, ## __VA_ARGS__)
> +#else
> +# define PCIE_DPRINTF(fmt, ...) do {} while (0)
> +#endif
> +#define PCIE_DEV_PRINTF(dev, fmt, ...)                                  \
> +    PCIE_DPRINTF("%s:%x "fmt, (dev)->name, (dev)->devfn, ## __VA_ARGS__)
> +
> +static void pcie_aer_clear_error(PCIDevice *dev);
> +static uint8_t pcie_aer_root_get_vector(PCIDevice *dev);
> +static AERMsgResult
> +pcie_aer_msg_alldev(PCIDevice *dev, const PCIEAERMsg *msg);
> +static AERMsgResult
> +pcie_aer_msg_vbridge(PCIDevice *dev, const PCIEAERMsg *msg);
> +static AERMsgResult
> +pcie_aer_msg_root_port(PCIDevice *dev, const PCIEAERMsg *msg);

Just make the above bool, e.g. true if sent?

Also don't forward declare static functions. Just order them properly in the
file.

> +
> +/* From 6.2.7 Error Listing and Rules. Table 6-2, 6-3 and 6-4 */
> +static PCIEAERSeverity pcie_aer_uncor_default_severity(uint32_t status)
> +{
> +    switch (status) {
> +    case PCI_ERR_UNC_INTN:
> +    case PCI_ERR_UNC_DLP:
> +    case PCI_ERR_UNC_SDN:
> +    case PCI_ERR_UNC_RX_OVER:
> +    case PCI_ERR_UNC_FCP:
> +    case PCI_ERR_UNC_MALF_TLP:
> +        return AER_ERR_FATAL;
> +    case PCI_ERR_UNC_POISON_TLP:
> +    case PCI_ERR_UNC_ECRC:
> +    case PCI_ERR_UNC_UNSUP:
> +    case PCI_ERR_UNC_COMP_TIME:
> +    case PCI_ERR_UNC_COMP_ABORT:
> +    case PCI_ERR_UNC_UNX_COMP:
> +    case PCI_ERR_UNC_ACSV:
> +    case PCI_ERR_UNC_MCBTLP:
> +    case PCI_ERR_UNC_ATOP_EBLOCKED:
> +    case PCI_ERR_UNC_TLP_PRF_BLOCKED:
> +        return AER_ERR_NONFATAL;
> +    default:
> +        break;
> +    }
> +    abort();
> +    return AER_ERR_FATAL;
> +}
> +
> +static uint32_t aer_log_next(uint32_t i, uint32_t max)
> +{
> +    return (i + 1) % max;
> +}
> +

Pass in PCIEAERLog* as 1st parameter instead of max.
This way we get it right.

> +static bool aer_log_empty_index(uint32_t producer, uint32_t consumer)
> +{
> +    return producer == consumer;
> +}
> +

This one is better opencoded.

> +static bool aer_log_empty(PCIEAERLog *aer_log)
> +{
> +    return aer_log_empty_index(aer_log->producer, aer_log->consumer);
> +}
> +
> +static bool aer_log_full(PCIEAERLog *aer_log)
> +{
> +    return aer_log_next(aer_log->producer, aer_log->log_max) ==
> +        aer_log->consumer;
> +}
> +
> +static uint32_t aer_log_add(PCIEAERLog *aer_log)
> +{
> +    uint32_t i = aer_log->producer;
> +    aer_log->producer = aer_log_next(aer_log->producer, aer_log->log_max);
> +    return i;
> +}
> +
> +static uint32_t aer_log_del(PCIEAERLog *aer_log)
> +{
> +    uint32_t i = aer_log->consumer;
> +    aer_log->consumer = aer_log_next(aer_log->consumer, aer_log->log_max);
> +    return i;
> +}


Please just use 'int' or 'unsigned' instead of uint32_t if you simply
want to say 'a number'.  Using specific length makes it impossible to
say where you *really* want a value.

> +
> +static int aer_log_add_err(PCIEAERLog *aer_log, const PCIEAERErr *err)
> +{
> +    uint32_t i;
> +    if (aer_log_full(aer_log)) {
> +        return -1;
> +    }
> +    i = aer_log_add(aer_log);
> +    memcpy(&aer_log->log[i], err, sizeof(*err));

sizeof is not a function, you don't need ()

> +    return 0;
> +}
> +
> +static const PCIEAERErr* aer_log_del_err(PCIEAERLog *aer_log)
> +{
> +    uint32_t i;
> +    assert(!aer_log_empty(aer_log));
> +    i = aer_log_del(aer_log);
> +    return &aer_log->log[i];
> +}
> +
> +static void aer_log_clear_all_err(PCIEAERLog *aer_log)
> +{
> +    aer_log->producer = 0;
> +    aer_log->consumer = 0;
> +}
> +
> +void pcie_aer_init(PCIDevice *dev, uint16_t offset)
> +{
> +    PCIExpressDevice *exp;
> +
> +    pci_word_test_and_set_mask(dev->wmask + PCI_COMMAND, PCI_COMMAND_SERR);
> +    pci_word_test_and_set_mask(dev->w1cmask + PCI_STATUS,
> +                               PCI_STATUS_SIG_SYSTEM_ERROR);
> +
> +    pcie_add_capability(dev, PCI_EXT_CAP_ID_ERR, PCI_ERR_VER,
> +                        offset, PCI_ERR_SIZEOF);
> +    exp = &dev->exp;
> +    exp->aer_cap = offset;
> +    if (dev->exp.aer_log.log_max == PCIE_AER_LOG_MAX_UNSET) {
> +        dev->exp.aer_log.log_max = PCIE_AER_LOG_MAX_DEFAULT;
> +    }
> +    if (dev->exp.aer_log.log_max > PCIE_AER_LOG_MAX_MAX) {
> +        dev->exp.aer_log.log_max = PCIE_AER_LOG_MAX_MAX;
> +    }
> +    dev->exp.aer_log.log = qemu_mallocz(sizeof(dev->exp.aer_log.log[0]) *
> +                                        dev->exp.aer_log.log_max);
> +
> +    /* On reset PCI_ERR_CAP_MHRE is disabled
> +     * PCI_ERR_CAP_MHRE is RWS so that reset doesn't affect related
> +     * registers
> +     */
> +    pci_set_long(dev->w1cmask + offset + PCI_ERR_UNCOR_STATUS,
> +                 PCI_ERR_UNC_SUPPORTED);
> +
> +    pci_set_long(dev->config + offset + PCI_ERR_UNCOR_SEVER,
> +                 PCI_ERR_UNC_SEVERITY_DEFAULT);
> +    pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_SEVER,
> +                 PCI_ERR_UNC_SUPPORTED);
> +
> +    pci_long_test_and_set_mask(dev->w1cmask + offset + PCI_ERR_COR_STATUS,
> +                               PCI_ERR_COR_STATUS);
> +
> +    pci_set_long(dev->config + offset + PCI_ERR_COR_MASK,
> +                 PCI_ERR_COR_MASK_DEFAULT);
> +    pci_set_long(dev->wmask + offset + PCI_ERR_COR_MASK,
> +                 PCI_ERR_COR_SUPPORTED);
> +
> +    /* capabilities and control. multiple header logging is supported */
> +    if (dev->exp.aer_log.log_max > 0) {
> +        pci_set_long(dev->config + offset + PCI_ERR_CAP,
> +                     PCI_ERR_CAP_ECRC_GENC | PCI_ERR_CAP_ECRC_CHKC |
> +                     PCI_ERR_CAP_MHRC);
> +        pci_set_long(dev->wmask + offset + PCI_ERR_CAP,
> +                     PCI_ERR_CAP_ECRC_GENE | PCI_ERR_CAP_ECRC_CHKE |
> +                     PCI_ERR_CAP_MHRE);
> +    } else {
> +        pci_set_long(dev->config + offset + PCI_ERR_CAP,
> +                     PCI_ERR_CAP_ECRC_GENC | PCI_ERR_CAP_ECRC_CHKC);
> +        pci_set_long(dev->wmask + offset + PCI_ERR_CAP,
> +                     PCI_ERR_CAP_ECRC_GENE | PCI_ERR_CAP_ECRC_CHKE);
> +    }
> +
> +    switch (pcie_cap_get_type(dev)) {
> +    case PCI_EXP_TYPE_ROOT_PORT:
> +        /* this case will be set by pcie_aer_root_init() */
> +        /* fallthrough */
> +    case PCI_EXP_TYPE_DOWNSTREAM:
> +    case PCI_EXP_TYPE_UPSTREAM:
> +        pci_word_test_and_set_mask(dev->wmask + PCI_BRIDGE_CONTROL,
> +                                   PCI_BRIDGE_CTL_SERR);
> +        pci_long_test_and_set_mask(dev->w1cmask + PCI_STATUS,
> +                                   PCI_SEC_STATUS_RCV_SYSTEM_ERROR);
> +        break;
> +    default:
> +        /* nothing */
> +        break;
> +    }
> +}
> +
> +void pcie_aer_exit(PCIDevice *dev)
> +{
> +    qemu_free(dev->exp.aer_log.log);
> +}
> +
> +void pcie_aer_write_config(PCIDevice *dev,
> +                           uint32_t addr, uint32_t val, int len,
> +                           uint32_t uncorsta_old)
> +{
> +    uint32_t pos = dev->exp.aer_cap;

dev->config + pos would be a better variable to use.

> +    uint32_t errcap = pci_get_long(dev->config + pos + PCI_ERR_CAP);
> +    uint32_t first_error = 1U << PCI_ERR_CAP_FEP(errcap);
> +    uint32_t uncorsta = pci_get_long(dev->config + pos + PCI_ERR_UNCOR_STATUS);
> +
> +    /* uncorrectable error */
> +    if ((uncorsta_old & first_error) && !(uncorsta & first_error)) {
> +        /* the bit that corresponds to the first error is cleared */
> +        pcie_aer_clear_error(dev);

So why not just call this unconditionally?

> +    } else if (errcap & PCI_ERR_CAP_MHRE) {
> +        /* When PCI_ERR_CAP_MHRE is enabled and the first error isn't cleared
> +         * nothing should happen. So we have to revert the modification to
> +         * the register.
> +         */
> +        pci_set_long(dev->config + pos + PCI_ERR_UNCOR_STATUS, uncorsta_old);

Why do we need uncorsta_old? Why can't we get the info from the log?

> +    }
> +
> +    /* capability & control */
> +    if (ranges_overlap(addr, len, pos + PCI_ERR_CAP, 4)) {

I can not figure this out. What does this range check do?
And why use capability number as offset?

> +        if (!(errcap & PCI_ERR_CAP_MHRE)) {

also checking twise is ugly.

> +            aer_log_clear_all_err(&dev->exp.aer_log);
> +        }
> +    }
> +}
> +
> +static inline void pcie_aer_msg(PCIDevice *dev, const PCIEAERMsg *msg)
> +{
> +    uint8_t type;
> +    AERMsgResult result;
> +
> +    assert(pci_is_express(dev));
> +
> +    type = pcie_cap_get_type(dev);
> +    if (type == PCI_EXP_TYPE_ROOT_PORT ||
> +        type == PCI_EXP_TYPE_UPSTREAM ||
> +        type == PCI_EXP_TYPE_DOWNSTREAM) {
> +        result = pcie_aer_msg_vbridge(dev, msg);
> +        if (result != AER_MSG_SENT) {
> +            return;
> +        }
> +    }
> +    result = pcie_aer_msg_alldev(dev, msg);
> +    if (type == PCI_EXP_TYPE_ROOT_PORT && result == AER_MSG_SENT) {
> +        pcie_aer_msg_root_port(dev, msg);
> +    }
> +}
> +
> +static AERMsgResult
> +pcie_aer_msg_alldev(PCIDevice *dev, const PCIEAERMsg *msg)
> +{
> +    uint16_t cmd = pci_get_word(dev->config + PCI_COMMAND);
> +    bool transmit1 =
> +        pcie_aer_msg_is_uncor(msg) && (cmd & PCI_COMMAND_SERR);
> +    uint32_t devctl = pci_get_word(dev->config +
> +                                   dev->exp.exp_cap + PCI_EXP_DEVCTL);

Word is uint16, so devctl should be too.

> +    bool transmit2 = msg->severity & devctl;
> +    PCIDevice *parent_port;
> +
> +    if (transmit1) {
> +        if (pcie_aer_msg_is_uncor(msg)) {
> +            /* Signaled System Error */
> +            pci_word_test_and_set_mask(dev->config + PCI_STATUS,
> +                                       PCI_STATUS_SIG_SYSTEM_ERROR);
> +        }
> +    }
> +
> +    if (!(transmit1 || transmit2)) {
> +        return AER_MSG_MASKED;
> +    }
> +
> +    /* send up error message */
> +    if (pci_is_express(dev) &&
> +        pcie_cap_get_type(dev) == PCI_EXP_TYPE_ROOT_PORT) {
> +        /* Root port notify system itself,
> +           or send the error message to root complex event collector. */
> +        /*
> +         * if root port is associated to event collector, set
> +         * parent_port = root complex event collector
> +         * For now root complex event collector isn't supported.
> +         */
> +        parent_port = NULL;
> +    } else {
> +        parent_port = pci_bridge_get_device(dev->bus);
> +    }
> +    if (parent_port) {
> +        if (!pci_is_express(parent_port)) {
> +            /* What to do? */
> +            return AER_MSG_MASKED;
> +        }
> +        pcie_aer_msg(parent_port, msg);
> +    }
> +    return AER_MSG_SENT;
> +}
> +
> +static AERMsgResult
> +pcie_aer_msg_vbridge(PCIDevice *dev, const PCIEAERMsg *msg)
> +{
> +    uint16_t bridge_control = pci_get_word(dev->config + PCI_BRIDGE_CONTROL);
> +
> +    if (pcie_aer_msg_is_uncor(msg)) {
> +        /* Received System Error */
> +        pci_word_test_and_set_mask(dev->config + PCI_SEC_STATUS,
> +                                   PCI_SEC_STATUS_RCV_SYSTEM_ERROR);
> +    }
> +
> +    if (!(bridge_control & PCI_BRIDGE_CTL_SERR)) {
> +        return AER_MSG_MASKED;
> +    }
> +    return AER_MSG_SENT;
> +}
> +
> +static AERMsgResult
> +pcie_aer_msg_root_port(PCIDevice *dev, const PCIEAERMsg *msg)
> +{
> +    AERMsgResult ret;
> +    uint16_t cmd;
> +    uint8_t *aer_cap;
> +    uint32_t root_cmd;
> +    uint32_t root_sta;
> +    bool msi_trigger;
> +
> +    ret = AER_MSG_MASKED;
> +    cmd = pci_get_word(dev->config + PCI_COMMAND);
> +    aer_cap = dev->config + dev->exp.aer_cap;
> +    root_cmd = pci_get_long(aer_cap + PCI_ERR_ROOT_COMMAND);
> +    root_sta = pci_get_long(aer_cap + PCI_ERR_ROOT_STATUS);
> +    msi_trigger = false;
> +
> +    if (cmd & PCI_COMMAND_SERR) {
> +        /* System Error. Platform Specific */
> +        /* ret = AER_MSG_SENT; */
> +    }
> +
> +    /* Errro Message Received: Root Error Status register */
> +    switch (msg->severity) {
> +    case AER_ERR_COR:
> +        if (root_sta & PCI_ERR_ROOT_COR_RCV) {
> +            root_sta |= PCI_ERR_ROOT_MULTI_COR_RCV;
> +        } else {
> +            if (root_cmd & PCI_ERR_ROOT_CMD_COR_EN) {
> +                msi_trigger = true;
> +            }
> +            pci_set_word(aer_cap + PCI_ERR_ROOT_COR_SRC, msg->source_id);
> +        }
> +        root_sta |= PCI_ERR_ROOT_COR_RCV;
> +        break;
> +    case AER_ERR_NONFATAL:
> +        if (!(root_sta & PCI_ERR_ROOT_NONFATAL_RCV) &&
> +            root_cmd & PCI_ERR_ROOT_CMD_NONFATAL_EN) {
> +            msi_trigger = true;
> +        }
> +        root_sta |= PCI_ERR_ROOT_NONFATAL_RCV;
> +        break;
> +    case AER_ERR_FATAL:
> +        if (!(root_sta & PCI_ERR_ROOT_FATAL_RCV) &&
> +            root_cmd & PCI_ERR_ROOT_CMD_FATAL_EN) {
> +            msi_trigger = true;
> +        }
> +        if (!(root_sta & PCI_ERR_ROOT_UNCOR_RCV)) {
> +            root_sta |= PCI_ERR_ROOT_FIRST_FATAL;
> +        }
> +        root_sta |= PCI_ERR_ROOT_FATAL_RCV;
> +        break;
> +    }
> +    if (pcie_aer_msg_is_uncor(msg)) {
> +        if (root_sta & PCI_ERR_ROOT_UNCOR_RCV) {
> +            root_sta |= PCI_ERR_ROOT_MULTI_UNCOR_RCV;
> +        } else {
> +            pci_set_word(aer_cap + PCI_ERR_ROOT_SRC, msg->source_id);
> +        }
> +        root_sta |= PCI_ERR_ROOT_UNCOR_RCV;
> +    }
> +    pci_set_long(aer_cap + PCI_ERR_ROOT_STATUS, root_sta);
> +
> +    if (root_cmd & msg->severity) {
> +        /* 6.2.4.1.2 Interrupt Generation */
> +        if (pci_msi_enabled(dev)) {
> +            pci_msi_notify(dev, pcie_aer_root_get_vector(dev));
> +        } else {
> +            qemu_set_irq(dev->irq[dev->exp.aer_intx], 1);
> +        }
> +        ret = AER_MSG_SENT;
> +    }
> +    return ret;
> +}
> +
> +static void pcie_aer_update_log(PCIDevice *dev, const PCIEAERErr *err)
> +{
> +    uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
> +    uint8_t first_bit = ffsl(err->status) - 1;

What guarantees err->status is not 0 here?

> +    uint32_t errcap = pci_get_long(aer_cap + PCI_ERR_CAP);
> +    int i;
> +    uint32_t dw;
> +
> +    errcap &= ~(PCI_ERR_CAP_FEP_MASK | PCI_ERR_CAP_TLP);
> +    errcap |= PCI_ERR_CAP_FEP(first_bit);
> +
> +    if (err->flags & PCIE_AER_ERR_HEADER_VALID) {
> +        for (i = 0; i < ARRAY_SIZE(err->header); ++i) {
> +            /* 7.10.8 Header Log Register */
> +            cpu_to_be32wu(&dw, err->header[i]);
> +            memcpy(aer_cap + PCI_ERR_HEADER_LOG + sizeof(err->header[0]) * i,
> +                   &dw, sizeof(dw));

memcpy into config space is almost sure to be wrong on BE hosts.
Please fill all fields with appropriate pci_ macros.
This also ensures you don't depend on specific layout for correctness.

sizeof(err->header) is confusing here. Please just use a macro.

> +        }
> +    } else {
> +        assert(!(err->flags & PCIE_AER_ERR_TLP_PRESENT));
> +        memset(aer_cap + PCI_ERR_HEADER_LOG, 0, sizeof(err->header));

sizeof(err->header) is confusing here. Please just use a macro.

> +    }
> +
> +    if ((err->flags & PCIE_AER_ERR_TLP_PRESENT) &&
> +        (pci_get_long(dev->config + dev->exp.exp_cap + PCI_EXP_DEVCTL2) &
> +         PCI_EXP_DEVCAP2_EETLPP)) {
> +        for (i = 0; i < ARRAY_SIZE(err->prefix); ++i) {
> +            /* 7.10.12 tlp prefix log register */
> +            cpu_to_be32wu(&dw, err->prefix[i]);
> +            memcpy(aer_cap + PCI_ERR_TLP_PREFIX_LOG +
> +                   sizeof(err->prefix[0]) * i, &dw, sizeof(dw));

as above

> +        }
> +        errcap |= PCI_ERR_CAP_TLP;
> +    } else {
> +        memset(aer_cap + PCI_ERR_TLP_PREFIX_LOG, 0, sizeof(err->prefix));
> +    }
> +    pci_set_long(aer_cap + PCI_ERR_CAP, errcap);
> +}
> +
> +static void pcie_aer_clear_log(PCIDevice *dev)
> +{
> +    PCIEAERErr *err;
> +    uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
> +
> +    pci_long_test_and_clear_mask(aer_cap + PCI_ERR_CAP,
> +                                 PCI_ERR_CAP_FEP_MASK | PCI_ERR_CAP_TLP);
> +
> +    memset(aer_cap + PCI_ERR_HEADER_LOG, 0, sizeof(err->header));
> +    memset(aer_cap + PCI_ERR_TLP_PREFIX_LOG, 0, sizeof(err->prefix));

So err is in fact unused?

This assumes that layout for PCIEAERErr matches config space.
This will lead with time to attempts to copy structure directly.
This is wrong e.g. because of endianness. Please don't do
these tricks: just a macro defining size will be enough.

> +}
> +
> +static int pcie_aer_record_error(PCIDevice *dev,
> +                                 const PCIEAERErr *err)
> +{
> +    uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
> +    uint32_t errcap = pci_get_long(aer_cap + PCI_ERR_CAP);
> +    int fep = PCI_ERR_CAP_FEP(errcap);
> +
> +    if (errcap & PCI_ERR_CAP_MHRE &&
> +        (pci_get_long(aer_cap + PCI_ERR_UNCOR_STATUS) & (1ULL << fep))) {

get_long returns 32 bit
1ULL gives you 64 bit
So why bother with it?
Looking at code fep is 0 to 31, so 1U would be what you want.

> +        /*  Not first error. queue error */
> +        if (aer_log_add_err(&dev->exp.aer_log, err) < 0) {
> +            /* overflow */
> +            return -1;
> +        }
> +        return 0;
> +    }
> +
> +    pcie_aer_update_log(dev, err);
> +    return 0;
> +}
> +
> +static void pcie_aer_clear_error(PCIDevice *dev)
> +{
> +    uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
> +    uint32_t errcap = pci_get_long(aer_cap + PCI_ERR_CAP);
> +    PCIEAERLog *aer_log = &dev->exp.aer_log;
> +    const PCIEAERErr *err;
> +    uint32_t consumer;
> +
> +    if (!(errcap & PCI_ERR_CAP_MHRE) || aer_log_empty(aer_log)) {
> +        pcie_aer_clear_log(dev);
> +        return;
> +    }
> +
> +    /*
> +     * If more errors are queued, set corresponding bits in uncorrectable
> +     * error status.
> +     * We emulates uncorrectable error status register as W1CS.

We emulate

> +     * So set bit in uncorrectable error status here again for multiple
> +     * error recording support.
> +     *
> +     * 6.2.4.2 Multiple Error Handling(Advanced Error Reporting Capability)
> +     */
> +    for (consumer = dev->exp.aer_log.consumer;
> +         !aer_log_empty_index(dev->exp.aer_log.producer, consumer);
> +         consumer = aer_log_next(consumer, dev->exp.aer_log.log_max)) {
> +        pci_long_test_and_set_mask(aer_cap + PCI_ERR_UNCOR_STATUS,
> +                                   dev->exp.aer_log.log[consumer].status);
> +    }
> +
> +    err = aer_log_del_err(aer_log);
> +    pcie_aer_update_log(dev, err);
> +}
> +
> +/*
> + * non-Function specific error must be recorded in all functions.
> + * It is the responsibility of the caller of this function.
> + * It is also caller's responsiblity to determine which function should
> + * report the rerror.
> + *
> + * 6.2.4 Error Logging
> + * 6.2.5 Sqeucne of Device Error Signaling and Logging Operations

typo?

> + * table 6-2: Flowchard Showing Sequence of Device Error Signaling and Logging
> + *            Operations
> + *
> + * Although this implementation can be shortened/optimized, this is kept
> + * parallel to table 6-2.

Yes but this does not mean this can not be split up
into subfunctions.

> + */
> +void pcie_aer_inject_error(PCIDevice *dev, const PCIEAERErr *err)
> +{
> +    uint8_t *exp_cap;
> +    uint8_t *aer_cap = NULL;
> +    uint32_t devctl = 0;
> +    uint32_t devsta = 0;
> +    uint32_t status = err->status;
> +    uint32_t mask;
> +    bool is_unsupported_request =
> +        (!(err->flags & PCIE_AER_ERR_IS_CORRECTABLE) &&
> +         err->status == PCI_ERR_UNC_UNSUP);
> +    bool is_advisory_nonfatal = false;  /* for advisory non-fatal error */
> +    uint32_t uncor_status = 0;          /* for advisory non-fatal error */
> +    PCIEAERMsg msg;
> +    int is_header_log_overflowed = 0;
> +
> +    if (!pci_is_express(dev)) {
> +        /* What to do? */

This is used from command, right?
So return error?

> +        return;
> +    }
> +
> +    if (err->flags & PCIE_AER_ERR_IS_CORRECTABLE) {
> +        status &= PCI_ERR_COR_SUPPORTED;
> +    } else {
> +        status &= PCI_ERR_UNC_SUPPORTED;
> +    }
> +    if (!status || status & (status - 1)) {

So you want to rely on & being higher precedence than && or not?
I prefer when we do, use less (), but the rest of the file seems
to use (). Let's be consistent.

> +        /* invalid status bit. one and only one bit must be set */

Pls put comments on same or above lines as the code, not after :)

> +        return;
So return error?
> +    }
> +
> +    exp_cap = dev->config + dev->exp.exp_cap;
> +    if (dev->exp.aer_cap) {
> +        aer_cap = dev->config + dev->exp.aer_cap;
> +        devctl = pci_get_long(exp_cap + PCI_EXP_DEVCTL);
> +        devsta = pci_get_long(exp_cap + PCI_EXP_DEVSTA);
> +    }
> +    if (err->flags & PCIE_AER_ERR_IS_CORRECTABLE) {
> +    correctable_error:

Please, don't use goto to implement logic.
Split correctable/uncorrectable into subfunctions and call them any
number of times.

> +        devsta |= PCI_EXP_DEVSTA_CED;
> +        if (is_unsupported_request) {
> +            devsta |= PCI_EXP_DEVSTA_URD;
> +        }
> +        pci_set_word(exp_cap + PCI_EXP_DEVSTA, devsta);
> +
> +        if (aer_cap) {
> +            pci_long_test_and_set_mask(aer_cap + PCI_ERR_COR_STATUS, status);
> +            mask = pci_get_long(aer_cap + PCI_ERR_COR_MASK);
> +            if (mask & status) {
> +                return;
> +            }
> +            if (is_advisory_nonfatal) {
> +                uint32_t uncor_mask =
> +                    pci_get_long(aer_cap + PCI_ERR_UNCOR_MASK);
> +                if (!(uncor_mask & uncor_status)) {
> +                    is_header_log_overflowed = pcie_aer_record_error(dev, err);
> +                }
> +                pci_long_test_and_set_mask(aer_cap + PCI_ERR_UNCOR_STATUS,
> +                                           uncor_status);
> +            }
> +        }
> +
> +        if (is_unsupported_request && !(devctl & PCI_EXP_DEVCTL_URRE)) {
> +            return;
> +        }
> +        if (!(devctl & PCI_EXP_DEVCTL_CERE)) {
> +            return;
> +        }
> +        msg.severity = AER_ERR_COR;
> +    } else {
> +        bool is_fatal =
> +            (pcie_aer_uncor_default_severity(status) == AER_ERR_FATAL);

Don't need (); also 

> +        uint16_t cmd;
> +
> +        if (aer_cap) {
> +            is_fatal = status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER);
> +        }
> +        if (!is_fatal && (err->flags & PCIE_AER_ERR_MAYBE_ADVISORY)) {
> +            is_advisory_nonfatal = true;
> +            uncor_status = status;
> +            status = PCI_ERR_COR_ADV_NONFATAL;
> +            goto correctable_error;
> +        }
> +        if (is_fatal) {
> +            devsta |= PCI_EXP_DEVSTA_FED;
> +        } else {
> +            devsta |= PCI_EXP_DEVSTA_NFED;
> +        }
> +        if (is_unsupported_request) {
> +            devsta |= PCI_EXP_DEVSTA_URD;
> +        }
> +        pci_set_long(exp_cap + PCI_EXP_DEVSTA, devsta);
> +
> +        if (aer_cap) {
> +            mask = pci_get_long(aer_cap + PCI_ERR_UNCOR_MASK);
> +            if (mask & status) {
> +                pci_long_test_and_set_mask(aer_cap + PCI_ERR_UNCOR_STATUS,
> +                                           status);
> +                return;
> +            }
> +
> +            is_header_log_overflowed = pcie_aer_record_error(dev, err);
> +            pci_long_test_and_set_mask(aer_cap + PCI_ERR_UNCOR_STATUS, status);
> +        }
> +
> +        cmd = pci_get_word(dev->config + PCI_COMMAND);
> +        if (is_unsupported_request &&
> +            !(devctl & PCI_EXP_DEVCTL_URRE) && !(cmd & PCI_COMMAND_SERR)) {
> +            return;
> +        }
> +        if (is_fatal) {
> +            if (!((cmd & PCI_COMMAND_SERR) ||
> +                  (devctl & PCI_EXP_DEVCTL_FERE))) {
> +                return;
> +            }
> +            msg.severity = AER_ERR_FATAL;
> +        } else {
> +            if (!((cmd & PCI_COMMAND_SERR) ||
> +                  (devctl & PCI_EXP_DEVCTL_NFERE))) {
> +                return;
> +            }
> +            msg.severity = AER_ERR_NONFATAL;
> +        }
> +    }
> +
> +    /* send up error message */
> +    msg.source_id = err->source_id;
> +    pcie_aer_msg(dev, &msg);
> +
> +    if (is_header_log_overflowed) {
> +        PCIEAERErr header_log_overflow = {
> +            .status = PCI_ERR_COR_HL_OVERFLOW,
> +            .flags = PCIE_AER_ERR_IS_CORRECTABLE,
> +            .header = {0, 0, 0, 0},
> +            .prefix = {0, 0, 0, 0},

You don't need the 0,0,0 thing. Whatever you don't specify is 0.

> +        };
> +        pcie_aer_inject_error(dev, &header_log_overflow);
> +    }
> +}
> +
> +void pcie_aer_root_set_vector(PCIDevice *dev, uint8_t vector)

Just get vector as unsigned, you won't have to cast it down below.

> +{
> +    uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
> +    assert(vector < PCI_ERR_ROOT_IRQ_MAX);
> +    pci_long_test_and_clear_mask(aer_cap + PCI_ERR_ROOT_STATUS,
> +                                 PCI_ERR_ROOT_IRQ);
> +    pci_long_test_and_set_mask(aer_cap + PCI_ERR_ROOT_STATUS,
> +                               ((uint32_t)vector) << PCI_ERR_ROOT_IRQ_SHIFT);
> +}
> +
> +static uint8_t pcie_aer_root_get_vector(PCIDevice *dev)
> +{
> +    uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
> +    uint32_t root_status = pci_get_long(aer_cap + PCI_ERR_ROOT_STATUS);
> +    return (root_status & PCI_ERR_ROOT_IRQ) >> PCI_ERR_ROOT_IRQ_SHIFT;
> +}
> +
> +void pcie_aer_root_init(PCIDevice *dev)
> +{
> +    uint16_t pos = dev->exp.aer_cap;
> +
> +    pci_set_long(dev->wmask + pos + PCI_ERR_ROOT_COMMAND,
> +                 PCI_ERR_ROOT_CMD_EN_MASK);
> +    pci_set_long(dev->w1cmask + pos + PCI_ERR_ROOT_STATUS,
> +                 PCI_ERR_ROOT_STATUS_REPORT_MASK);
> +}
> +
> +void pcie_aer_root_reset(PCIDevice *dev)
> +{
> +    uint8_t* aer_cap = dev->config + dev->exp.aer_cap;
> +
> +    pci_set_long(aer_cap + PCI_ERR_ROOT_COMMAND, 0);
> +
> +    /*
> +     * Advanced Error Interrupt Message Number in Root Error Status Register
> +     * must be updated by chip dependent code because it's chip dependent
> +     * which number is used.
> +     */
> +}
> +
> +static bool pcie_aer_root_does_trigger(uint32_t cmd, uint32_t status)
> +{
> +    return
> +        ((cmd & PCI_ERR_ROOT_CMD_COR_EN) && (status & PCI_ERR_ROOT_COR_RCV)) ||
> +        ((cmd & PCI_ERR_ROOT_CMD_NONFATAL_EN) &&
> +         (status & PCI_ERR_ROOT_NONFATAL_RCV)) ||
> +        ((cmd & PCI_ERR_ROOT_CMD_FATAL_EN) &&
> +         (status & PCI_ERR_ROOT_FATAL_RCV));
> +}
> +
> +void pcie_aer_root_write_config(PCIDevice *dev,
> +                                uint32_t addr, uint32_t val, int len,
> +                                uint32_t root_cmd_prev)
> +{
> +    uint16_t pos = dev->exp.aer_cap;
> +    uint8_t *aer_cap = dev->config + pos;
> +
> +    /* root command register */
> +    uint32_t root_cmd = pci_get_long(aer_cap + PCI_ERR_ROOT_COMMAND);
> +    if (root_cmd & PCI_ERR_ROOT_CMD_EN_MASK) {
> +        /* 6.2.4.1.2 Interrupt Generation */
> +
> +        /* 0 -> 1 */
> +        uint32_t root_cmd_set = (root_cmd_prev ^ root_cmd) & root_cmd;
> +        uint32_t root_status = pci_get_long(aer_cap + PCI_ERR_ROOT_STATUS);
> +
> +        if (pci_msi_enabled(dev)) {
> +            if (pcie_aer_root_does_trigger(root_cmd_set, root_status)) {
> +                pci_msi_notify(dev, pcie_aer_root_get_vector(dev));
> +            }
> +        } else {
> +            int int_level = pcie_aer_root_does_trigger(root_cmd, root_status);
> +            qemu_set_irq(dev->irq[dev->exp.aer_intx], int_level);
> +        }
> +    }
> +}
> +
> +static const VMStateDescription vmstate_pcie_aer_err = {
> +    .name = "PCIE_AER_ERROR",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .minimum_version_id_old = 1,
> +    .fields     = (VMStateField[]) {
> +        VMSTATE_UINT32(status, PCIEAERErr),
> +        VMSTATE_UINT16(source_id, PCIEAERErr),
> +        VMSTATE_UINT16(flags, PCIEAERErr),
> +        VMSTATE_UINT32_ARRAY(header, PCIEAERErr, 4),
> +        VMSTATE_UINT32_ARRAY(prefix, PCIEAERErr, 4),
> +        VMSTATE_END_OF_LIST()
> +    }
> +};
> +
> +#define VMSTATE_PCIE_AER_ERRS(_field, _state, _field_num, _vmsd, _type) { \
> +    .name       = (stringify(_field)),                                    \
> +    .version_id = 0,                                                      \
> +    .num_offset = vmstate_offset_value(_state, _field_num, uint16_t),     \
> +    .size       = sizeof(_type),                                          \
> +    .vmsd       = &(_vmsd),                                               \
> +    .flags      = VMS_POINTER | VMS_VARRAY_UINT16 | VMS_STRUCT,           \
> +    .offset     = vmstate_offset_pointer(_state, _field, _type),          \
> +}
> +
> +const VMStateDescription vmstate_pcie_aer_log = {
> +    .name = "PCIE_AER_ERROR_LOG",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .minimum_version_id_old = 1,
> +    .fields     = (VMStateField[]) {
> +        VMSTATE_UINT32(producer, PCIEAERLog),
> +        VMSTATE_UINT32(consumer, PCIEAERLog),
> +        VMSTATE_UINT16(log_max, PCIEAERLog),
> +        VMSTATE_PCIE_AER_ERRS(log, PCIEAERLog, log_max,
> +                              vmstate_pcie_aer_err, PCIEAERErr),
> +        VMSTATE_END_OF_LIST()
> +    }
> +};
> +
> diff --git a/hw/pcie_aer.h b/hw/pcie_aer.h
> new file mode 100644
> index 0000000..db9f87c
> --- /dev/null
> +++ b/hw/pcie_aer.h
> @@ -0,0 +1,105 @@
> +/*
> + * pcie_aer.h
> + *
> + * Copyright (c) 2010 Isaku Yamahata <yamahata at valinux co jp>
> + *                    VA Linux Systems Japan K.K.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#ifndef QEMU_PCIE_AER_H
> +#define QEMU_PCIE_AER_H
> +
> +#include "hw.h"
> +
> +/* definitions which PCIExpressDevice uses */
> +typedef enum {
> +    AER_MSG_MASKED,
> +    AER_MSG_SENT,
> +} AERMsgResult;


We don't really need this, especially not in an external header.

> +
> +/* AER log */
> +struct PCIEAERLog {
> +    /* This structure is saved/loaded.
> +       So explicitly size them instead of unsigned int */
> +    uint32_t producer;
> +    uint32_t consumer;
> +
> +#define PCIE_AER_LOG_MAX_DEFAULT        8
> +#define PCIE_AER_LOG_MAX_MAX            128 /* what is appropriate? */
> +#define PCIE_AER_LOG_MAX_UNSET          0xffff
> +    uint16_t log_max;
> +
> +    PCIEAERErr *log;    /* ringed buffer */
> +};
> +
> +/* aer error severity */
> +typedef enum {
> +    /* those value are same as
> +     * Root error command register in aer extended cap and
> +     * root control register in pci express cap.
> +     */
> +    AER_ERR_COR         = PCI_ERR_ROOT_CMD_COR_EN,
> +    AER_ERR_NONFATAL    = PCI_ERR_ROOT_CMD_NONFATAL_EN,
> +    AER_ERR_FATAL       = PCI_ERR_ROOT_CMD_FATAL_EN,
> +} PCIEAERSeverity;
> +

I think we are better off without the enum above.
It just adds 2 ways to get the same value and
thre will always be confusion.
And C compiler does not give us type checking anyway.

> +/* aer error message: error signaling message has only error sevirity and
> +   source id. See 2.2.8.3 error signaling messages */
> +struct PCIEAERMsg {
> +    PCIEAERSeverity severity;
> +    uint16_t source_id; /* bdf */
> +};
> +
> +static inline bool
> +pcie_aer_msg_is_uncor(const PCIEAERMsg *msg)
> +{
> +    return msg->severity == AER_ERR_NONFATAL || msg->severity == AER_ERR_FATAL;
> +}
> +
> +/* error */
> +struct PCIEAERErr {
> +    uint32_t status;    /* error status bits */
> +    uint16_t source_id; /* bdf */
> +
> +#define PCIE_AER_ERR_IS_CORRECTABLE     0x1     /* correctable/uncorrectable */
> +#define PCIE_AER_ERR_MAYBE_ADVISORY     0x2     /* maybe advisory non-fatal */
> +#define PCIE_AER_ERR_HEADER_VALID       0x4     /* TLP header is logged */
> +#define PCIE_AER_ERR_TLP_PRESENT        0x8     /* TLP Prefix is logged */
> +    uint16_t flags;
> +
> +    uint32_t header[4]; /* TLP header */
> +    uint32_t prefix[4]; /* TLP header prefix */
> +};
> +
> +extern const VMStateDescription vmstate_pcie_aer_log;
> +
> +void pcie_aer_init(PCIDevice *dev, uint16_t offset);
> +void pcie_aer_exit(PCIDevice *dev);
> +void pcie_aer_write_config(PCIDevice *dev,
> +                           uint32_t addr, uint32_t val, int len,
> +                           uint32_t uncorsta_prev);
> +
> +/* aer root port */
> +void pcie_aer_root_set_vector(PCIDevice *dev, uint8_t vector);
> +void pcie_aer_root_init(PCIDevice *dev);
> +void pcie_aer_root_reset(PCIDevice *dev);
> +void pcie_aer_root_write_config(PCIDevice *dev,
> +                                uint32_t addr, uint32_t val, int len,
> +                                uint32_t root_cmd_prev);
> +
> +/* error injection */
> +void pcie_aer_inject_error(PCIDevice *dev, const PCIEAERErr *err);
> +
> +#endif /* QEMU_PCIE_AER_H */
> diff --git a/qemu-common.h b/qemu-common.h
> index b97b16e..63ee8c3 100644
> --- a/qemu-common.h
> +++ b/qemu-common.h
> @@ -221,6 +221,9 @@ typedef struct PCIBus PCIBus;
>  typedef struct PCIDevice PCIDevice;
>  typedef struct PCIExpressDevice PCIExpressDevice;
>  typedef struct PCIBridge PCIBridge;
> +typedef struct PCIEAERMsg PCIEAERMsg;
> +typedef struct PCIEAERLog PCIEAERLog;
> +typedef struct PCIEAERErr PCIEAERErr;
>  typedef struct PCIEPort PCIEPort;
>  typedef struct PCIESlot PCIESlot;
>  typedef struct SerialState SerialState;
> -- 
> 1.7.1.1
Isaku Yamahata - Oct. 21, 2010, 5:15 a.m.
Thank you for detailed review.

On Wed, Oct 20, 2010 at 11:56:16AM +0200, Michael S. Tsirkin wrote:
> > +static uint32_t aer_log_del(PCIEAERLog *aer_log)
> > +{
> > +    uint32_t i = aer_log->consumer;
> > +    aer_log->consumer = aer_log_next(aer_log->consumer, aer_log->log_max);
> > +    return i;
> > +}
> 
> 
> Please just use 'int' or 'unsigned' instead of uint32_t if you simply
> want to say 'a number'.  Using specific length makes it impossible to
> say where you *really* want a value.

PCIEAERLog is saved/loaded. So explicit sized number is chosen.
Michael S. Tsirkin - Oct. 21, 2010, 8:07 a.m.
On Thu, Oct 21, 2010 at 02:15:24PM +0900, Isaku Yamahata wrote:
> Thank you for detailed review.
> 
> On Wed, Oct 20, 2010 at 11:56:16AM +0200, Michael S. Tsirkin wrote:
> > > +static uint32_t aer_log_del(PCIEAERLog *aer_log)
> > > +{
> > > +    uint32_t i = aer_log->consumer;
> > > +    aer_log->consumer = aer_log_next(aer_log->consumer, aer_log->log_max);
> > > +    return i;
> > > +}
> > 
> > 
> > Please just use 'int' or 'unsigned' instead of uint32_t if you simply
> > want to say 'a number'.  Using specific length makes it impossible to
> > say where you *really* want a value.
> 
> PCIEAERLog is saved/loaded. So explicit sized number is chosen.

I didn't notice. But consumer/producer are not guest visible, are they?
If yes I think it's a mistake to save/load them, tying us to
a specific implementation: just calculate and save the # of
valid entries in the log.

Also I put the comment here but it is a general one: pls go over the
code, and just take a look at what types you use all over and think
whether size really matters. In most places it does not, it just must be
big enough, so use int or unsigned there.  It will be much harder for
others to do so later.

> -- 
> yamahata
Isaku Yamahata - Oct. 21, 2010, 11:53 p.m.
On Thu, Oct 21, 2010 at 10:07:01AM +0200, Michael S. Tsirkin wrote:
> On Thu, Oct 21, 2010 at 02:15:24PM +0900, Isaku Yamahata wrote:
> > Thank you for detailed review.
> > 
> > On Wed, Oct 20, 2010 at 11:56:16AM +0200, Michael S. Tsirkin wrote:
> > > > +static uint32_t aer_log_del(PCIEAERLog *aer_log)
> > > > +{
> > > > +    uint32_t i = aer_log->consumer;
> > > > +    aer_log->consumer = aer_log_next(aer_log->consumer, aer_log->log_max);
> > > > +    return i;
> > > > +}
> > > 
> > > 
> > > Please just use 'int' or 'unsigned' instead of uint32_t if you simply
> > > want to say 'a number'.  Using specific length makes it impossible to
> > > say where you *really* want a value.
> > 
> > PCIEAERLog is saved/loaded. So explicit sized number is chosen.
> 
> I didn't notice. But consumer/producer are not guest visible, are they?
> If yes I think it's a mistake to save/load them, tying us to
> a specific implementation: just calculate and save the # of
> valid entries in the log.

ACIEAERLog and PCIEAERErr themselves are the implementation specific
internal states which is not visible to guest.
So there is no point of arguing that consumer/producer are 
specific to implementation.


> Also I put the comment here but it is a general one: pls go over the
> code, and just take a look at what types you use all over and think
> whether size really matters. In most places it does not, it just must be
> big enough, so use int or unsigned there.  It will be much harder for
> others to do so later.

Will do.
Michael S. Tsirkin - Oct. 22, 2010, 9:27 a.m.
On Fri, Oct 22, 2010 at 08:53:15AM +0900, Isaku Yamahata wrote:
> On Thu, Oct 21, 2010 at 10:07:01AM +0200, Michael S. Tsirkin wrote:
> > On Thu, Oct 21, 2010 at 02:15:24PM +0900, Isaku Yamahata wrote:
> > > Thank you for detailed review.
> > > 
> > > On Wed, Oct 20, 2010 at 11:56:16AM +0200, Michael S. Tsirkin wrote:
> > > > > +static uint32_t aer_log_del(PCIEAERLog *aer_log)
> > > > > +{
> > > > > +    uint32_t i = aer_log->consumer;
> > > > > +    aer_log->consumer = aer_log_next(aer_log->consumer, aer_log->log_max);
> > > > > +    return i;
> > > > > +}
> > > > 
> > > > 
> > > > Please just use 'int' or 'unsigned' instead of uint32_t if you simply
> > > > want to say 'a number'.  Using specific length makes it impossible to
> > > > say where you *really* want a value.
> > > 
> > > PCIEAERLog is saved/loaded. So explicit sized number is chosen.
> > 
> > I didn't notice. But consumer/producer are not guest visible, are they?
> > If yes I think it's a mistake to save/load them, tying us to
> > a specific implementation: just calculate and save the # of
> > valid entries in the log.
> 
> ACIEAERLog and PCIEAERErr themselves are the implementation specific
> internal states which is not visible to guest.
> So there is no point of arguing that consumer/producer are 
> specific to implementation.

Well the errors can be read out by the guest, so they are
potentially guest visible. Thus I think the only thing
we want to migrate is the list of errors logged.

> > Also I put the comment here but it is a general one: pls go over the
> > code, and just take a look at what types you use all over and think
> > whether size really matters. In most places it does not, it just must be
> > big enough, so use int or unsigned there.  It will be much harder for
> > others to do so later.
> 
> Will do.
> -- 
> yamahata
Markus Armbruster - Oct. 22, 2010, 11:28 a.m.
"Michael S. Tsirkin" <mst@redhat.com> writes:

> On Wed, Oct 20, 2010 at 05:18:57PM +0900, Isaku Yamahata wrote:
>> This patch implements helper functions for pcie aer capability
>> which will be used later.
>> 
>> Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
>
> Some style comments and a couple of minor bugs.
[...]
>> +
>> +static int aer_log_add_err(PCIEAERLog *aer_log, const PCIEAERErr *err)
>> +{
>> +    uint32_t i;
>> +    if (aer_log_full(aer_log)) {
>> +        return -1;
>> +    }
>> +    i = aer_log_add(aer_log);
>> +    memcpy(&aer_log->log[i], err, sizeof(*err));
>
> sizeof is not a function, you don't need ()

Matter of taste.  Yes, it's an operator, but it's as function-like as
operators get.

For what it's worth, grep says roughly 98% of our sizeof uses are
followed by '('.  I doubt the parenthesis are required for grouping that
often.

[...]

Patch

diff --git a/Makefile.objs b/Makefile.objs
index 138e545..48f98f3 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -187,7 +187,7 @@  hw-obj-$(CONFIG_PIIX4) += piix4.o
 # PCI watchdog devices
 hw-obj-y += wdt_i6300esb.o
 
-hw-obj-y += pcie.o pcie_port.o
+hw-obj-y += pcie.o pcie_aer.o pcie_port.o
 hw-obj-y += msix.o msi.o
 
 # PCI network cards
diff --git a/hw/pcie.h b/hw/pcie.h
index 2871e27..415a680 100644
--- a/hw/pcie.h
+++ b/hw/pcie.h
@@ -24,6 +24,7 @@ 
 #include "hw.h"
 #include "pci_regs.h"
 #include "pcie_regs.h"
+#include "pcie_aer.h"
 
 typedef enum {
     /* for attention and power indicator */
@@ -74,6 +75,19 @@  struct PCIExpressDevice {
                                  * also initialize it when loaded as
                                  * appropreately.
                                  */
+
+    /* AER */
+    uint16_t aer_cap;
+    PCIEAERLog aer_log;
+    unsigned int aer_intx;      /* INTx for error reporting
+                                 * default is 0 = INTA#
+                                 * If the chip wants to use other interrupt
+                                 * line, initialize this member with the
+                                 * desired number.
+                                 * If the chip dynamically changes this member,
+                                 * also initialize it when loaded as
+                                 * appropreately.
+                                 */
 };
 
 /* PCI express capability helper functions */
diff --git a/hw/pcie_aer.c b/hw/pcie_aer.c
new file mode 100644
index 0000000..b8cede3
--- /dev/null
+++ b/hw/pcie_aer.c
@@ -0,0 +1,780 @@ 
+/*
+ * pcie_aer.c
+ *
+ * Copyright (c) 2010 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "sysemu.h"
+#include "pci_bridge.h"
+#include "pcie.h"
+#include "msix.h"
+#include "msi.h"
+#include "pci_internals.h"
+#include "pcie_regs.h"
+
+//#define DEBUG_PCIE
+#ifdef DEBUG_PCIE
+# define PCIE_DPRINTF(fmt, ...)                                         \
+    fprintf(stderr, "%s:%d " fmt, __func__, __LINE__, ## __VA_ARGS__)
+#else
+# define PCIE_DPRINTF(fmt, ...) do {} while (0)
+#endif
+#define PCIE_DEV_PRINTF(dev, fmt, ...)                                  \
+    PCIE_DPRINTF("%s:%x "fmt, (dev)->name, (dev)->devfn, ## __VA_ARGS__)
+
+static void pcie_aer_clear_error(PCIDevice *dev);
+static uint8_t pcie_aer_root_get_vector(PCIDevice *dev);
+static AERMsgResult
+pcie_aer_msg_alldev(PCIDevice *dev, const PCIEAERMsg *msg);
+static AERMsgResult
+pcie_aer_msg_vbridge(PCIDevice *dev, const PCIEAERMsg *msg);
+static AERMsgResult
+pcie_aer_msg_root_port(PCIDevice *dev, const PCIEAERMsg *msg);
+
+/* From 6.2.7 Error Listing and Rules. Table 6-2, 6-3 and 6-4 */
+static PCIEAERSeverity pcie_aer_uncor_default_severity(uint32_t status)
+{
+    switch (status) {
+    case PCI_ERR_UNC_INTN:
+    case PCI_ERR_UNC_DLP:
+    case PCI_ERR_UNC_SDN:
+    case PCI_ERR_UNC_RX_OVER:
+    case PCI_ERR_UNC_FCP:
+    case PCI_ERR_UNC_MALF_TLP:
+        return AER_ERR_FATAL;
+    case PCI_ERR_UNC_POISON_TLP:
+    case PCI_ERR_UNC_ECRC:
+    case PCI_ERR_UNC_UNSUP:
+    case PCI_ERR_UNC_COMP_TIME:
+    case PCI_ERR_UNC_COMP_ABORT:
+    case PCI_ERR_UNC_UNX_COMP:
+    case PCI_ERR_UNC_ACSV:
+    case PCI_ERR_UNC_MCBTLP:
+    case PCI_ERR_UNC_ATOP_EBLOCKED:
+    case PCI_ERR_UNC_TLP_PRF_BLOCKED:
+        return AER_ERR_NONFATAL;
+    default:
+        break;
+    }
+    abort();
+    return AER_ERR_FATAL;
+}
+
+static uint32_t aer_log_next(uint32_t i, uint32_t max)
+{
+    return (i + 1) % max;
+}
+
+static bool aer_log_empty_index(uint32_t producer, uint32_t consumer)
+{
+    return producer == consumer;
+}
+
+static bool aer_log_empty(PCIEAERLog *aer_log)
+{
+    return aer_log_empty_index(aer_log->producer, aer_log->consumer);
+}
+
+static bool aer_log_full(PCIEAERLog *aer_log)
+{
+    return aer_log_next(aer_log->producer, aer_log->log_max) ==
+        aer_log->consumer;
+}
+
+static uint32_t aer_log_add(PCIEAERLog *aer_log)
+{
+    uint32_t i = aer_log->producer;
+    aer_log->producer = aer_log_next(aer_log->producer, aer_log->log_max);
+    return i;
+}
+
+static uint32_t aer_log_del(PCIEAERLog *aer_log)
+{
+    uint32_t i = aer_log->consumer;
+    aer_log->consumer = aer_log_next(aer_log->consumer, aer_log->log_max);
+    return i;
+}
+
+static int aer_log_add_err(PCIEAERLog *aer_log, const PCIEAERErr *err)
+{
+    uint32_t i;
+    if (aer_log_full(aer_log)) {
+        return -1;
+    }
+    i = aer_log_add(aer_log);
+    memcpy(&aer_log->log[i], err, sizeof(*err));
+    return 0;
+}
+
+static const PCIEAERErr* aer_log_del_err(PCIEAERLog *aer_log)
+{
+    uint32_t i;
+    assert(!aer_log_empty(aer_log));
+    i = aer_log_del(aer_log);
+    return &aer_log->log[i];
+}
+
+static void aer_log_clear_all_err(PCIEAERLog *aer_log)
+{
+    aer_log->producer = 0;
+    aer_log->consumer = 0;
+}
+
+void pcie_aer_init(PCIDevice *dev, uint16_t offset)
+{
+    PCIExpressDevice *exp;
+
+    pci_word_test_and_set_mask(dev->wmask + PCI_COMMAND, PCI_COMMAND_SERR);
+    pci_word_test_and_set_mask(dev->w1cmask + PCI_STATUS,
+                               PCI_STATUS_SIG_SYSTEM_ERROR);
+
+    pcie_add_capability(dev, PCI_EXT_CAP_ID_ERR, PCI_ERR_VER,
+                        offset, PCI_ERR_SIZEOF);
+    exp = &dev->exp;
+    exp->aer_cap = offset;
+    if (dev->exp.aer_log.log_max == PCIE_AER_LOG_MAX_UNSET) {
+        dev->exp.aer_log.log_max = PCIE_AER_LOG_MAX_DEFAULT;
+    }
+    if (dev->exp.aer_log.log_max > PCIE_AER_LOG_MAX_MAX) {
+        dev->exp.aer_log.log_max = PCIE_AER_LOG_MAX_MAX;
+    }
+    dev->exp.aer_log.log = qemu_mallocz(sizeof(dev->exp.aer_log.log[0]) *
+                                        dev->exp.aer_log.log_max);
+
+    /* On reset PCI_ERR_CAP_MHRE is disabled
+     * PCI_ERR_CAP_MHRE is RWS so that reset doesn't affect related
+     * registers
+     */
+    pci_set_long(dev->w1cmask + offset + PCI_ERR_UNCOR_STATUS,
+                 PCI_ERR_UNC_SUPPORTED);
+
+    pci_set_long(dev->config + offset + PCI_ERR_UNCOR_SEVER,
+                 PCI_ERR_UNC_SEVERITY_DEFAULT);
+    pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_SEVER,
+                 PCI_ERR_UNC_SUPPORTED);
+
+    pci_long_test_and_set_mask(dev->w1cmask + offset + PCI_ERR_COR_STATUS,
+                               PCI_ERR_COR_STATUS);
+
+    pci_set_long(dev->config + offset + PCI_ERR_COR_MASK,
+                 PCI_ERR_COR_MASK_DEFAULT);
+    pci_set_long(dev->wmask + offset + PCI_ERR_COR_MASK,
+                 PCI_ERR_COR_SUPPORTED);
+
+    /* capabilities and control. multiple header logging is supported */
+    if (dev->exp.aer_log.log_max > 0) {
+        pci_set_long(dev->config + offset + PCI_ERR_CAP,
+                     PCI_ERR_CAP_ECRC_GENC | PCI_ERR_CAP_ECRC_CHKC |
+                     PCI_ERR_CAP_MHRC);
+        pci_set_long(dev->wmask + offset + PCI_ERR_CAP,
+                     PCI_ERR_CAP_ECRC_GENE | PCI_ERR_CAP_ECRC_CHKE |
+                     PCI_ERR_CAP_MHRE);
+    } else {
+        pci_set_long(dev->config + offset + PCI_ERR_CAP,
+                     PCI_ERR_CAP_ECRC_GENC | PCI_ERR_CAP_ECRC_CHKC);
+        pci_set_long(dev->wmask + offset + PCI_ERR_CAP,
+                     PCI_ERR_CAP_ECRC_GENE | PCI_ERR_CAP_ECRC_CHKE);
+    }
+
+    switch (pcie_cap_get_type(dev)) {
+    case PCI_EXP_TYPE_ROOT_PORT:
+        /* this case will be set by pcie_aer_root_init() */
+        /* fallthrough */
+    case PCI_EXP_TYPE_DOWNSTREAM:
+    case PCI_EXP_TYPE_UPSTREAM:
+        pci_word_test_and_set_mask(dev->wmask + PCI_BRIDGE_CONTROL,
+                                   PCI_BRIDGE_CTL_SERR);
+        pci_long_test_and_set_mask(dev->w1cmask + PCI_STATUS,
+                                   PCI_SEC_STATUS_RCV_SYSTEM_ERROR);
+        break;
+    default:
+        /* nothing */
+        break;
+    }
+}
+
+void pcie_aer_exit(PCIDevice *dev)
+{
+    qemu_free(dev->exp.aer_log.log);
+}
+
+void pcie_aer_write_config(PCIDevice *dev,
+                           uint32_t addr, uint32_t val, int len,
+                           uint32_t uncorsta_old)
+{
+    uint32_t pos = dev->exp.aer_cap;
+    uint32_t errcap = pci_get_long(dev->config + pos + PCI_ERR_CAP);
+    uint32_t first_error = 1U << PCI_ERR_CAP_FEP(errcap);
+    uint32_t uncorsta = pci_get_long(dev->config + pos + PCI_ERR_UNCOR_STATUS);
+
+    /* uncorrectable error */
+    if ((uncorsta_old & first_error) && !(uncorsta & first_error)) {
+        /* the bit that corresponds to the first error is cleared */
+        pcie_aer_clear_error(dev);
+    } else if (errcap & PCI_ERR_CAP_MHRE) {
+        /* When PCI_ERR_CAP_MHRE is enabled and the first error isn't cleared
+         * nothing should happen. So we have to revert the modification to
+         * the register.
+         */
+        pci_set_long(dev->config + pos + PCI_ERR_UNCOR_STATUS, uncorsta_old);
+    }
+
+    /* capability & control */
+    if (ranges_overlap(addr, len, pos + PCI_ERR_CAP, 4)) {
+        if (!(errcap & PCI_ERR_CAP_MHRE)) {
+            aer_log_clear_all_err(&dev->exp.aer_log);
+        }
+    }
+}
+
+static inline void pcie_aer_msg(PCIDevice *dev, const PCIEAERMsg *msg)
+{
+    uint8_t type;
+    AERMsgResult result;
+
+    assert(pci_is_express(dev));
+
+    type = pcie_cap_get_type(dev);
+    if (type == PCI_EXP_TYPE_ROOT_PORT ||
+        type == PCI_EXP_TYPE_UPSTREAM ||
+        type == PCI_EXP_TYPE_DOWNSTREAM) {
+        result = pcie_aer_msg_vbridge(dev, msg);
+        if (result != AER_MSG_SENT) {
+            return;
+        }
+    }
+    result = pcie_aer_msg_alldev(dev, msg);
+    if (type == PCI_EXP_TYPE_ROOT_PORT && result == AER_MSG_SENT) {
+        pcie_aer_msg_root_port(dev, msg);
+    }
+}
+
+static AERMsgResult
+pcie_aer_msg_alldev(PCIDevice *dev, const PCIEAERMsg *msg)
+{
+    uint16_t cmd = pci_get_word(dev->config + PCI_COMMAND);
+    bool transmit1 =
+        pcie_aer_msg_is_uncor(msg) && (cmd & PCI_COMMAND_SERR);
+    uint32_t devctl = pci_get_word(dev->config +
+                                   dev->exp.exp_cap + PCI_EXP_DEVCTL);
+    bool transmit2 = msg->severity & devctl;
+    PCIDevice *parent_port;
+
+    if (transmit1) {
+        if (pcie_aer_msg_is_uncor(msg)) {
+            /* Signaled System Error */
+            pci_word_test_and_set_mask(dev->config + PCI_STATUS,
+                                       PCI_STATUS_SIG_SYSTEM_ERROR);
+        }
+    }
+
+    if (!(transmit1 || transmit2)) {
+        return AER_MSG_MASKED;
+    }
+
+    /* send up error message */
+    if (pci_is_express(dev) &&
+        pcie_cap_get_type(dev) == PCI_EXP_TYPE_ROOT_PORT) {
+        /* Root port notify system itself,
+           or send the error message to root complex event collector. */
+        /*
+         * if root port is associated to event collector, set
+         * parent_port = root complex event collector
+         * For now root complex event collector isn't supported.
+         */
+        parent_port = NULL;
+    } else {
+        parent_port = pci_bridge_get_device(dev->bus);
+    }
+    if (parent_port) {
+        if (!pci_is_express(parent_port)) {
+            /* What to do? */
+            return AER_MSG_MASKED;
+        }
+        pcie_aer_msg(parent_port, msg);
+    }
+    return AER_MSG_SENT;
+}
+
+static AERMsgResult
+pcie_aer_msg_vbridge(PCIDevice *dev, const PCIEAERMsg *msg)
+{
+    uint16_t bridge_control = pci_get_word(dev->config + PCI_BRIDGE_CONTROL);
+
+    if (pcie_aer_msg_is_uncor(msg)) {
+        /* Received System Error */
+        pci_word_test_and_set_mask(dev->config + PCI_SEC_STATUS,
+                                   PCI_SEC_STATUS_RCV_SYSTEM_ERROR);
+    }
+
+    if (!(bridge_control & PCI_BRIDGE_CTL_SERR)) {
+        return AER_MSG_MASKED;
+    }
+    return AER_MSG_SENT;
+}
+
+static AERMsgResult
+pcie_aer_msg_root_port(PCIDevice *dev, const PCIEAERMsg *msg)
+{
+    AERMsgResult ret;
+    uint16_t cmd;
+    uint8_t *aer_cap;
+    uint32_t root_cmd;
+    uint32_t root_sta;
+    bool msi_trigger;
+
+    ret = AER_MSG_MASKED;
+    cmd = pci_get_word(dev->config + PCI_COMMAND);
+    aer_cap = dev->config + dev->exp.aer_cap;
+    root_cmd = pci_get_long(aer_cap + PCI_ERR_ROOT_COMMAND);
+    root_sta = pci_get_long(aer_cap + PCI_ERR_ROOT_STATUS);
+    msi_trigger = false;
+
+    if (cmd & PCI_COMMAND_SERR) {
+        /* System Error. Platform Specific */
+        /* ret = AER_MSG_SENT; */
+    }
+
+    /* Errro Message Received: Root Error Status register */
+    switch (msg->severity) {
+    case AER_ERR_COR:
+        if (root_sta & PCI_ERR_ROOT_COR_RCV) {
+            root_sta |= PCI_ERR_ROOT_MULTI_COR_RCV;
+        } else {
+            if (root_cmd & PCI_ERR_ROOT_CMD_COR_EN) {
+                msi_trigger = true;
+            }
+            pci_set_word(aer_cap + PCI_ERR_ROOT_COR_SRC, msg->source_id);
+        }
+        root_sta |= PCI_ERR_ROOT_COR_RCV;
+        break;
+    case AER_ERR_NONFATAL:
+        if (!(root_sta & PCI_ERR_ROOT_NONFATAL_RCV) &&
+            root_cmd & PCI_ERR_ROOT_CMD_NONFATAL_EN) {
+            msi_trigger = true;
+        }
+        root_sta |= PCI_ERR_ROOT_NONFATAL_RCV;
+        break;
+    case AER_ERR_FATAL:
+        if (!(root_sta & PCI_ERR_ROOT_FATAL_RCV) &&
+            root_cmd & PCI_ERR_ROOT_CMD_FATAL_EN) {
+            msi_trigger = true;
+        }
+        if (!(root_sta & PCI_ERR_ROOT_UNCOR_RCV)) {
+            root_sta |= PCI_ERR_ROOT_FIRST_FATAL;
+        }
+        root_sta |= PCI_ERR_ROOT_FATAL_RCV;
+        break;
+    }
+    if (pcie_aer_msg_is_uncor(msg)) {
+        if (root_sta & PCI_ERR_ROOT_UNCOR_RCV) {
+            root_sta |= PCI_ERR_ROOT_MULTI_UNCOR_RCV;
+        } else {
+            pci_set_word(aer_cap + PCI_ERR_ROOT_SRC, msg->source_id);
+        }
+        root_sta |= PCI_ERR_ROOT_UNCOR_RCV;
+    }
+    pci_set_long(aer_cap + PCI_ERR_ROOT_STATUS, root_sta);
+
+    if (root_cmd & msg->severity) {
+        /* 6.2.4.1.2 Interrupt Generation */
+        if (pci_msi_enabled(dev)) {
+            pci_msi_notify(dev, pcie_aer_root_get_vector(dev));
+        } else {
+            qemu_set_irq(dev->irq[dev->exp.aer_intx], 1);
+        }
+        ret = AER_MSG_SENT;
+    }
+    return ret;
+}
+
+static void pcie_aer_update_log(PCIDevice *dev, const PCIEAERErr *err)
+{
+    uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
+    uint8_t first_bit = ffsl(err->status) - 1;
+    uint32_t errcap = pci_get_long(aer_cap + PCI_ERR_CAP);
+    int i;
+    uint32_t dw;
+
+    errcap &= ~(PCI_ERR_CAP_FEP_MASK | PCI_ERR_CAP_TLP);
+    errcap |= PCI_ERR_CAP_FEP(first_bit);
+
+    if (err->flags & PCIE_AER_ERR_HEADER_VALID) {
+        for (i = 0; i < ARRAY_SIZE(err->header); ++i) {
+            /* 7.10.8 Header Log Register */
+            cpu_to_be32wu(&dw, err->header[i]);
+            memcpy(aer_cap + PCI_ERR_HEADER_LOG + sizeof(err->header[0]) * i,
+                   &dw, sizeof(dw));
+        }
+    } else {
+        assert(!(err->flags & PCIE_AER_ERR_TLP_PRESENT));
+        memset(aer_cap + PCI_ERR_HEADER_LOG, 0, sizeof(err->header));
+    }
+
+    if ((err->flags & PCIE_AER_ERR_TLP_PRESENT) &&
+        (pci_get_long(dev->config + dev->exp.exp_cap + PCI_EXP_DEVCTL2) &
+         PCI_EXP_DEVCAP2_EETLPP)) {
+        for (i = 0; i < ARRAY_SIZE(err->prefix); ++i) {
+            /* 7.10.12 tlp prefix log register */
+            cpu_to_be32wu(&dw, err->prefix[i]);
+            memcpy(aer_cap + PCI_ERR_TLP_PREFIX_LOG +
+                   sizeof(err->prefix[0]) * i, &dw, sizeof(dw));
+        }
+        errcap |= PCI_ERR_CAP_TLP;
+    } else {
+        memset(aer_cap + PCI_ERR_TLP_PREFIX_LOG, 0, sizeof(err->prefix));
+    }
+    pci_set_long(aer_cap + PCI_ERR_CAP, errcap);
+}
+
+static void pcie_aer_clear_log(PCIDevice *dev)
+{
+    PCIEAERErr *err;
+    uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
+
+    pci_long_test_and_clear_mask(aer_cap + PCI_ERR_CAP,
+                                 PCI_ERR_CAP_FEP_MASK | PCI_ERR_CAP_TLP);
+
+    memset(aer_cap + PCI_ERR_HEADER_LOG, 0, sizeof(err->header));
+    memset(aer_cap + PCI_ERR_TLP_PREFIX_LOG, 0, sizeof(err->prefix));
+}
+
+static int pcie_aer_record_error(PCIDevice *dev,
+                                 const PCIEAERErr *err)
+{
+    uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
+    uint32_t errcap = pci_get_long(aer_cap + PCI_ERR_CAP);
+    int fep = PCI_ERR_CAP_FEP(errcap);
+
+    if (errcap & PCI_ERR_CAP_MHRE &&
+        (pci_get_long(aer_cap + PCI_ERR_UNCOR_STATUS) & (1ULL << fep))) {
+        /*  Not first error. queue error */
+        if (aer_log_add_err(&dev->exp.aer_log, err) < 0) {
+            /* overflow */
+            return -1;
+        }
+        return 0;
+    }
+
+    pcie_aer_update_log(dev, err);
+    return 0;
+}
+
+static void pcie_aer_clear_error(PCIDevice *dev)
+{
+    uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
+    uint32_t errcap = pci_get_long(aer_cap + PCI_ERR_CAP);
+    PCIEAERLog *aer_log = &dev->exp.aer_log;
+    const PCIEAERErr *err;
+    uint32_t consumer;
+
+    if (!(errcap & PCI_ERR_CAP_MHRE) || aer_log_empty(aer_log)) {
+        pcie_aer_clear_log(dev);
+        return;
+    }
+
+    /*
+     * If more errors are queued, set corresponding bits in uncorrectable
+     * error status.
+     * We emulates uncorrectable error status register as W1CS.
+     * So set bit in uncorrectable error status here again for multiple
+     * error recording support.
+     *
+     * 6.2.4.2 Multiple Error Handling(Advanced Error Reporting Capability)
+     */
+    for (consumer = dev->exp.aer_log.consumer;
+         !aer_log_empty_index(dev->exp.aer_log.producer, consumer);
+         consumer = aer_log_next(consumer, dev->exp.aer_log.log_max)) {
+        pci_long_test_and_set_mask(aer_cap + PCI_ERR_UNCOR_STATUS,
+                                   dev->exp.aer_log.log[consumer].status);
+    }
+
+    err = aer_log_del_err(aer_log);
+    pcie_aer_update_log(dev, err);
+}
+
+/*
+ * non-Function specific error must be recorded in all functions.
+ * It is the responsibility of the caller of this function.
+ * It is also caller's responsiblity to determine which function should
+ * report the rerror.
+ *
+ * 6.2.4 Error Logging
+ * 6.2.5 Sqeucne of Device Error Signaling and Logging Operations
+ * table 6-2: Flowchard Showing Sequence of Device Error Signaling and Logging
+ *            Operations
+ *
+ * Although this implementation can be shortened/optimized, this is kept
+ * parallel to table 6-2.
+ */
+void pcie_aer_inject_error(PCIDevice *dev, const PCIEAERErr *err)
+{
+    uint8_t *exp_cap;
+    uint8_t *aer_cap = NULL;
+    uint32_t devctl = 0;
+    uint32_t devsta = 0;
+    uint32_t status = err->status;
+    uint32_t mask;
+    bool is_unsupported_request =
+        (!(err->flags & PCIE_AER_ERR_IS_CORRECTABLE) &&
+         err->status == PCI_ERR_UNC_UNSUP);
+    bool is_advisory_nonfatal = false;  /* for advisory non-fatal error */
+    uint32_t uncor_status = 0;          /* for advisory non-fatal error */
+    PCIEAERMsg msg;
+    int is_header_log_overflowed = 0;
+
+    if (!pci_is_express(dev)) {
+        /* What to do? */
+        return;
+    }
+
+    if (err->flags & PCIE_AER_ERR_IS_CORRECTABLE) {
+        status &= PCI_ERR_COR_SUPPORTED;
+    } else {
+        status &= PCI_ERR_UNC_SUPPORTED;
+    }
+    if (!status || status & (status - 1)) {
+        /* invalid status bit. one and only one bit must be set */
+        return;
+    }
+
+    exp_cap = dev->config + dev->exp.exp_cap;
+    if (dev->exp.aer_cap) {
+        aer_cap = dev->config + dev->exp.aer_cap;
+        devctl = pci_get_long(exp_cap + PCI_EXP_DEVCTL);
+        devsta = pci_get_long(exp_cap + PCI_EXP_DEVSTA);
+    }
+    if (err->flags & PCIE_AER_ERR_IS_CORRECTABLE) {
+    correctable_error:
+        devsta |= PCI_EXP_DEVSTA_CED;
+        if (is_unsupported_request) {
+            devsta |= PCI_EXP_DEVSTA_URD;
+        }
+        pci_set_word(exp_cap + PCI_EXP_DEVSTA, devsta);
+
+        if (aer_cap) {
+            pci_long_test_and_set_mask(aer_cap + PCI_ERR_COR_STATUS, status);
+            mask = pci_get_long(aer_cap + PCI_ERR_COR_MASK);
+            if (mask & status) {
+                return;
+            }
+            if (is_advisory_nonfatal) {
+                uint32_t uncor_mask =
+                    pci_get_long(aer_cap + PCI_ERR_UNCOR_MASK);
+                if (!(uncor_mask & uncor_status)) {
+                    is_header_log_overflowed = pcie_aer_record_error(dev, err);
+                }
+                pci_long_test_and_set_mask(aer_cap + PCI_ERR_UNCOR_STATUS,
+                                           uncor_status);
+            }
+        }
+
+        if (is_unsupported_request && !(devctl & PCI_EXP_DEVCTL_URRE)) {
+            return;
+        }
+        if (!(devctl & PCI_EXP_DEVCTL_CERE)) {
+            return;
+        }
+        msg.severity = AER_ERR_COR;
+    } else {
+        bool is_fatal =
+            (pcie_aer_uncor_default_severity(status) == AER_ERR_FATAL);
+        uint16_t cmd;
+
+        if (aer_cap) {
+            is_fatal = status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER);
+        }
+        if (!is_fatal && (err->flags & PCIE_AER_ERR_MAYBE_ADVISORY)) {
+            is_advisory_nonfatal = true;
+            uncor_status = status;
+            status = PCI_ERR_COR_ADV_NONFATAL;
+            goto correctable_error;
+        }
+        if (is_fatal) {
+            devsta |= PCI_EXP_DEVSTA_FED;
+        } else {
+            devsta |= PCI_EXP_DEVSTA_NFED;
+        }
+        if (is_unsupported_request) {
+            devsta |= PCI_EXP_DEVSTA_URD;
+        }
+        pci_set_long(exp_cap + PCI_EXP_DEVSTA, devsta);
+
+        if (aer_cap) {
+            mask = pci_get_long(aer_cap + PCI_ERR_UNCOR_MASK);
+            if (mask & status) {
+                pci_long_test_and_set_mask(aer_cap + PCI_ERR_UNCOR_STATUS,
+                                           status);
+                return;
+            }
+
+            is_header_log_overflowed = pcie_aer_record_error(dev, err);
+            pci_long_test_and_set_mask(aer_cap + PCI_ERR_UNCOR_STATUS, status);
+        }
+
+        cmd = pci_get_word(dev->config + PCI_COMMAND);
+        if (is_unsupported_request &&
+            !(devctl & PCI_EXP_DEVCTL_URRE) && !(cmd & PCI_COMMAND_SERR)) {
+            return;
+        }
+        if (is_fatal) {
+            if (!((cmd & PCI_COMMAND_SERR) ||
+                  (devctl & PCI_EXP_DEVCTL_FERE))) {
+                return;
+            }
+            msg.severity = AER_ERR_FATAL;
+        } else {
+            if (!((cmd & PCI_COMMAND_SERR) ||
+                  (devctl & PCI_EXP_DEVCTL_NFERE))) {
+                return;
+            }
+            msg.severity = AER_ERR_NONFATAL;
+        }
+    }
+
+    /* send up error message */
+    msg.source_id = err->source_id;
+    pcie_aer_msg(dev, &msg);
+
+    if (is_header_log_overflowed) {
+        PCIEAERErr header_log_overflow = {
+            .status = PCI_ERR_COR_HL_OVERFLOW,
+            .flags = PCIE_AER_ERR_IS_CORRECTABLE,
+            .header = {0, 0, 0, 0},
+            .prefix = {0, 0, 0, 0},
+        };
+        pcie_aer_inject_error(dev, &header_log_overflow);
+    }
+}
+
+void pcie_aer_root_set_vector(PCIDevice *dev, uint8_t vector)
+{
+    uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
+    assert(vector < PCI_ERR_ROOT_IRQ_MAX);
+    pci_long_test_and_clear_mask(aer_cap + PCI_ERR_ROOT_STATUS,
+                                 PCI_ERR_ROOT_IRQ);
+    pci_long_test_and_set_mask(aer_cap + PCI_ERR_ROOT_STATUS,
+                               ((uint32_t)vector) << PCI_ERR_ROOT_IRQ_SHIFT);
+}
+
+static uint8_t pcie_aer_root_get_vector(PCIDevice *dev)
+{
+    uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
+    uint32_t root_status = pci_get_long(aer_cap + PCI_ERR_ROOT_STATUS);
+    return (root_status & PCI_ERR_ROOT_IRQ) >> PCI_ERR_ROOT_IRQ_SHIFT;
+}
+
+void pcie_aer_root_init(PCIDevice *dev)
+{
+    uint16_t pos = dev->exp.aer_cap;
+
+    pci_set_long(dev->wmask + pos + PCI_ERR_ROOT_COMMAND,
+                 PCI_ERR_ROOT_CMD_EN_MASK);
+    pci_set_long(dev->w1cmask + pos + PCI_ERR_ROOT_STATUS,
+                 PCI_ERR_ROOT_STATUS_REPORT_MASK);
+}
+
+void pcie_aer_root_reset(PCIDevice *dev)
+{
+    uint8_t* aer_cap = dev->config + dev->exp.aer_cap;
+
+    pci_set_long(aer_cap + PCI_ERR_ROOT_COMMAND, 0);
+
+    /*
+     * Advanced Error Interrupt Message Number in Root Error Status Register
+     * must be updated by chip dependent code because it's chip dependent
+     * which number is used.
+     */
+}
+
+static bool pcie_aer_root_does_trigger(uint32_t cmd, uint32_t status)
+{
+    return
+        ((cmd & PCI_ERR_ROOT_CMD_COR_EN) && (status & PCI_ERR_ROOT_COR_RCV)) ||
+        ((cmd & PCI_ERR_ROOT_CMD_NONFATAL_EN) &&
+         (status & PCI_ERR_ROOT_NONFATAL_RCV)) ||
+        ((cmd & PCI_ERR_ROOT_CMD_FATAL_EN) &&
+         (status & PCI_ERR_ROOT_FATAL_RCV));
+}
+
+void pcie_aer_root_write_config(PCIDevice *dev,
+                                uint32_t addr, uint32_t val, int len,
+                                uint32_t root_cmd_prev)
+{
+    uint16_t pos = dev->exp.aer_cap;
+    uint8_t *aer_cap = dev->config + pos;
+
+    /* root command register */
+    uint32_t root_cmd = pci_get_long(aer_cap + PCI_ERR_ROOT_COMMAND);
+    if (root_cmd & PCI_ERR_ROOT_CMD_EN_MASK) {
+        /* 6.2.4.1.2 Interrupt Generation */
+
+        /* 0 -> 1 */
+        uint32_t root_cmd_set = (root_cmd_prev ^ root_cmd) & root_cmd;
+        uint32_t root_status = pci_get_long(aer_cap + PCI_ERR_ROOT_STATUS);
+
+        if (pci_msi_enabled(dev)) {
+            if (pcie_aer_root_does_trigger(root_cmd_set, root_status)) {
+                pci_msi_notify(dev, pcie_aer_root_get_vector(dev));
+            }
+        } else {
+            int int_level = pcie_aer_root_does_trigger(root_cmd, root_status);
+            qemu_set_irq(dev->irq[dev->exp.aer_intx], int_level);
+        }
+    }
+}
+
+static const VMStateDescription vmstate_pcie_aer_err = {
+    .name = "PCIE_AER_ERROR",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .minimum_version_id_old = 1,
+    .fields     = (VMStateField[]) {
+        VMSTATE_UINT32(status, PCIEAERErr),
+        VMSTATE_UINT16(source_id, PCIEAERErr),
+        VMSTATE_UINT16(flags, PCIEAERErr),
+        VMSTATE_UINT32_ARRAY(header, PCIEAERErr, 4),
+        VMSTATE_UINT32_ARRAY(prefix, PCIEAERErr, 4),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+#define VMSTATE_PCIE_AER_ERRS(_field, _state, _field_num, _vmsd, _type) { \
+    .name       = (stringify(_field)),                                    \
+    .version_id = 0,                                                      \
+    .num_offset = vmstate_offset_value(_state, _field_num, uint16_t),     \
+    .size       = sizeof(_type),                                          \
+    .vmsd       = &(_vmsd),                                               \
+    .flags      = VMS_POINTER | VMS_VARRAY_UINT16 | VMS_STRUCT,           \
+    .offset     = vmstate_offset_pointer(_state, _field, _type),          \
+}
+
+const VMStateDescription vmstate_pcie_aer_log = {
+    .name = "PCIE_AER_ERROR_LOG",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .minimum_version_id_old = 1,
+    .fields     = (VMStateField[]) {
+        VMSTATE_UINT32(producer, PCIEAERLog),
+        VMSTATE_UINT32(consumer, PCIEAERLog),
+        VMSTATE_UINT16(log_max, PCIEAERLog),
+        VMSTATE_PCIE_AER_ERRS(log, PCIEAERLog, log_max,
+                              vmstate_pcie_aer_err, PCIEAERErr),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
diff --git a/hw/pcie_aer.h b/hw/pcie_aer.h
new file mode 100644
index 0000000..db9f87c
--- /dev/null
+++ b/hw/pcie_aer.h
@@ -0,0 +1,105 @@ 
+/*
+ * pcie_aer.h
+ *
+ * Copyright (c) 2010 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef QEMU_PCIE_AER_H
+#define QEMU_PCIE_AER_H
+
+#include "hw.h"
+
+/* definitions which PCIExpressDevice uses */
+typedef enum {
+    AER_MSG_MASKED,
+    AER_MSG_SENT,
+} AERMsgResult;
+
+/* AER log */
+struct PCIEAERLog {
+    /* This structure is saved/loaded.
+       So explicitly size them instead of unsigned int */
+    uint32_t producer;
+    uint32_t consumer;
+
+#define PCIE_AER_LOG_MAX_DEFAULT        8
+#define PCIE_AER_LOG_MAX_MAX            128 /* what is appropriate? */
+#define PCIE_AER_LOG_MAX_UNSET          0xffff
+    uint16_t log_max;
+
+    PCIEAERErr *log;    /* ringed buffer */
+};
+
+/* aer error severity */
+typedef enum {
+    /* those value are same as
+     * Root error command register in aer extended cap and
+     * root control register in pci express cap.
+     */
+    AER_ERR_COR         = PCI_ERR_ROOT_CMD_COR_EN,
+    AER_ERR_NONFATAL    = PCI_ERR_ROOT_CMD_NONFATAL_EN,
+    AER_ERR_FATAL       = PCI_ERR_ROOT_CMD_FATAL_EN,
+} PCIEAERSeverity;
+
+/* aer error message: error signaling message has only error sevirity and
+   source id. See 2.2.8.3 error signaling messages */
+struct PCIEAERMsg {
+    PCIEAERSeverity severity;
+    uint16_t source_id; /* bdf */
+};
+
+static inline bool
+pcie_aer_msg_is_uncor(const PCIEAERMsg *msg)
+{
+    return msg->severity == AER_ERR_NONFATAL || msg->severity == AER_ERR_FATAL;
+}
+
+/* error */
+struct PCIEAERErr {
+    uint32_t status;    /* error status bits */
+    uint16_t source_id; /* bdf */
+
+#define PCIE_AER_ERR_IS_CORRECTABLE     0x1     /* correctable/uncorrectable */
+#define PCIE_AER_ERR_MAYBE_ADVISORY     0x2     /* maybe advisory non-fatal */
+#define PCIE_AER_ERR_HEADER_VALID       0x4     /* TLP header is logged */
+#define PCIE_AER_ERR_TLP_PRESENT        0x8     /* TLP Prefix is logged */
+    uint16_t flags;
+
+    uint32_t header[4]; /* TLP header */
+    uint32_t prefix[4]; /* TLP header prefix */
+};
+
+extern const VMStateDescription vmstate_pcie_aer_log;
+
+void pcie_aer_init(PCIDevice *dev, uint16_t offset);
+void pcie_aer_exit(PCIDevice *dev);
+void pcie_aer_write_config(PCIDevice *dev,
+                           uint32_t addr, uint32_t val, int len,
+                           uint32_t uncorsta_prev);
+
+/* aer root port */
+void pcie_aer_root_set_vector(PCIDevice *dev, uint8_t vector);
+void pcie_aer_root_init(PCIDevice *dev);
+void pcie_aer_root_reset(PCIDevice *dev);
+void pcie_aer_root_write_config(PCIDevice *dev,
+                                uint32_t addr, uint32_t val, int len,
+                                uint32_t root_cmd_prev);
+
+/* error injection */
+void pcie_aer_inject_error(PCIDevice *dev, const PCIEAERErr *err);
+
+#endif /* QEMU_PCIE_AER_H */
diff --git a/qemu-common.h b/qemu-common.h
index b97b16e..63ee8c3 100644
--- a/qemu-common.h
+++ b/qemu-common.h
@@ -221,6 +221,9 @@  typedef struct PCIBus PCIBus;
 typedef struct PCIDevice PCIDevice;
 typedef struct PCIExpressDevice PCIExpressDevice;
 typedef struct PCIBridge PCIBridge;
+typedef struct PCIEAERMsg PCIEAERMsg;
+typedef struct PCIEAERLog PCIEAERLog;
+typedef struct PCIEAERErr PCIEAERErr;
 typedef struct PCIEPort PCIEPort;
 typedef struct PCIESlot PCIESlot;
 typedef struct SerialState SerialState;