diff mbox

[v3,05/13] pcie: helper functions for pcie capability and extended capability.

Message ID 114f4c5b90a68aea5df7e633c31d67c72d0239e4.1284528424.git.yamahata@valinux.co.jp
State New
Headers show

Commit Message

Isaku Yamahata Sept. 15, 2010, 5:38 a.m. UTC
This patch implements helper functions for pci express capability
and pci express extended capability allocation.
NOTE: presence detection depends on pci_qdev_init() change.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>

---
Changes v2 -> v3:
- don't use 0b gcc extension. use 0x instead.
- split out constants into pcie_regs.h for linux merge.
- export some helpers for pcie-aer split.
- split out aer helper functions from pcie.c to pcie_aer.c
- embed PCIExpressDevice into PCIDevice.
---
 Makefile.objs |    1 +
 hw/pci.h      |   12 +
 hw/pcie.c     |  638 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 hw/pcie.h     |   96 +++++++++
 qemu-common.h |    1 +
 5 files changed, 748 insertions(+), 0 deletions(-)
 create mode 100644 hw/pcie.c
 create mode 100644 hw/pcie.h

Comments

Michael S. Tsirkin Sept. 15, 2010, 12:43 p.m. UTC | #1
On Wed, Sep 15, 2010 at 02:38:18PM +0900, Isaku Yamahata wrote:
> This patch implements helper functions for pci express capability
> and pci express extended capability allocation.
> NOTE: presence detection depends on pci_qdev_init() change.
> 
> Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
> 
> ---
> Changes v2 -> v3:
> - don't use 0b gcc extension. use 0x instead.
> - split out constants into pcie_regs.h for linux merge.
> - export some helpers for pcie-aer split.
> - split out aer helper functions from pcie.c to pcie_aer.c
> - embed PCIExpressDevice into PCIDevice.
> ---
>  Makefile.objs |    1 +
>  hw/pci.h      |   12 +
>  hw/pcie.c     |  638 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  hw/pcie.h     |   96 +++++++++
>  qemu-common.h |    1 +
>  5 files changed, 748 insertions(+), 0 deletions(-)
>  create mode 100644 hw/pcie.c
>  create mode 100644 hw/pcie.h
> 
> diff --git a/Makefile.objs b/Makefile.objs
> index 5f5a4c5..eeb5134 100644
> --- a/Makefile.objs
> +++ b/Makefile.objs
> @@ -186,6 +186,7 @@ hw-obj-$(CONFIG_PIIX4) += piix4.o
>  # PCI watchdog devices
>  hw-obj-y += wdt_i6300esb.o
>  
> +hw-obj-y += pcie.o
>  hw-obj-y += msix.o msi.o
>  
>  # PCI network cards
> diff --git a/hw/pci.h b/hw/pci.h
> index 630631b..19e85f5 100644
> --- a/hw/pci.h
> +++ b/hw/pci.h
> @@ -9,6 +9,8 @@
>  /* PCI includes legacy ISA access.  */
>  #include "isa.h"
>  
> +#include "pcie.h"
> +
>  /* PCI bus */
>  
>  #define PCI_DEVFN(slot, func)   ((((slot) & 0x1f) << 3) | ((func) & 0x07))
> @@ -175,6 +177,9 @@ struct PCIDevice {
>      /* Offset of MSI capability in config space */
>      uint8_t msi_cap;
>  
> +    /* PCI Express */
> +    PCIExpressDevice exp;
> +
>      /* Location of option rom */
>      char *romfile;
>      ram_addr_t rom_offset;
> @@ -389,6 +394,13 @@ static inline uint32_t pci_config_size(const PCIDevice *d)
>      return pci_is_express(d) ? PCIE_CONFIG_SPACE_SIZE : PCI_CONFIG_SPACE_SIZE;
>  }
>  
> +/* These are pci express specific, so should belong to pcie.h.
> +   they're here to avoid mutual header dependency. */
> +static inline uint8_t pci_pcie_cap(const PCIDevice *d)
> +{
> +    return pci_is_express(d) ? d->exp.exp_cap : 0;
> +}
> +

This one seems useless: 0 is not right for how you use
this function. Just use the field directly?

>  /* These are not pci specific. Should move into a separate header.
>   * Only pci.c uses them, so keep them here for now.
>   */
> diff --git a/hw/pcie.c b/hw/pcie.c
> new file mode 100644
> index 0000000..a6f396b
> --- /dev/null
> +++ b/hw/pcie.c
> @@ -0,0 +1,638 @@
> +/*
> + * pcie.c
> + *
> + * Copyright (c) 2010 Isaku Yamahata <yamahata at valinux co jp>
> + *                    VA Linux Systems Japan K.K.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "sysemu.h"
> +#include "pci_bridge.h"
> +#include "pcie.h"
> +#include "msix.h"
> +#include "msi.h"
> +#include "pci_internals.h"
> +#include "pcie_regs.h"
> +
> +//#define DEBUG_PCIE
> +#ifdef DEBUG_PCIE
> +# define PCIE_DPRINTF(fmt, ...)                                         \
> +    fprintf(stderr, "%s:%d " fmt, __func__, __LINE__, ## __VA_ARGS__)
> +#else
> +# define PCIE_DPRINTF(fmt, ...) do {} while (0)
> +#endif
> +#define PCIE_DEV_PRINTF(dev, fmt, ...)                                  \
> +    PCIE_DPRINTF("%s:%x "fmt, (dev)->name, (dev)->devfn, ## __VA_ARGS__)
> +
> +static inline const char *pcie_hp_event_name(enum PCIExpressHotPlugEvent event)
> +{
> +    switch (event) {
> +    case PCI_EXP_HP_EV_ABP:
> +        return "attention button pushed";
> +    case PCI_EXP_HP_EV_PDC:
> +        return "present detection changed";
> +    case PCI_EXP_HP_EV_CCI:
> +        return "command completed";
> +    default:
> +        break;
> +    }
> +    return "Unknown event";
> +}
> +

Nice, but so much code just for debug? print the code out inx hex ...

> +/***************************************************************************
> + * pci express capability helper functions
> + */
> +void pcie_notify(PCIDevice *dev, uint16_t vector, bool trigger, int level)

Why is this not static? It makes sense for internal stuff possibly,
but I think functions will need to know what to do: they can't
treat msi/msix/irq identically anyway.

The API seems confusing, I think this is what is creating
code for you. Specifically level = 0 does not notify at all.
So I think we need two:
1. pcie_assert_interrupt which sends msi or sets level to 1
2. pcie_deassert_interrupt which sets level to 0, or nothing
   for non msi.

Then below you can e.g.
if (!sltctrl) {
	pcie_deassert(...);
	return;
}

> +{
> +    /* masking/masking interrupt is handled by upper layer.
> +     * i.e. msix_notify() for MSI-X
> +     *      msi_notify()  for MSI
> +     *      pci_set_irq() for INTx
> +     */
> +    PCIE_DEV_PRINTF(dev, "noitfy vector %d tirgger:%d level:%d\n",

typo

> +                    vector, trigger, level);
> +    if (msix_enabled(dev)) {
> +        if (trigger) {
> +            msix_notify(dev, vector);
> +        }
> +    } else if (msi_enabled(dev)) {
> +        if (trigger){
> +            msi_notify(dev, vector);
> +        }
> +    } else {
> +        qemu_set_irq(dev->irq[0], level);

always 0? really? This is INTA# - is this what the spec says?

> +    }
> +}
> +
> +int pcie_cap_init(PCIDevice *dev, uint8_t offset, uint8_t type, uint8_t port)
> +{
> +    int exp_cap;
> +    uint8_t *pcie_cap;
> +
> +    assert(pci_is_express(dev));
> +
> +    exp_cap = pci_add_capability(dev, PCI_CAP_ID_EXP, offset,
> +                                 PCI_EXP_VER2_SIZEOF);
> +    if (exp_cap < 0) {
> +        return exp_cap;
> +    }
> +    dev->exp.exp_cap = exp_cap;
> +
> +    /* already done in pci_qdev_init() */
> +    assert(dev->cap_present & QEMU_PCI_CAP_EXPRESS);

Hmm. Why do we set this flag in qdev init but do the
rest of it here?

> +
> +    pcie_cap = dev->config + pci_pcie_cap(dev);

come on, just use exp_cap.

> +
> +    /* capability register
> +       interrupt message number defaults to 0 */
> +    pci_set_word(pcie_cap + PCI_EXP_FLAGS,
> +                 ((type << PCI_EXP_FLAGS_TYPE_SHIFT) & PCI_EXP_FLAGS_TYPE) |
> +                 PCI_EXP_FLAGS_VER2);
> +
> +    /* device capability register
> +     * table 7-12:
> +     * roll based error reporting bit must be set by all
> +     * Functions conforming to the ECN, PCI Express Base
> +     * Specification, Revision 1.1., or subsequent PCI Express Base
> +     * Specification revisions.
> +     */
> +    pci_set_long(pcie_cap + PCI_EXP_DEVCAP, PCI_EXP_DEVCAP_RBER);
> +
> +    pci_set_long(pcie_cap + PCI_EXP_LNKCAP,
> +                 (port << PCI_EXP_LNKCAP_PN_SHIFT) |
> +                 PCI_EXP_LNKCAP_ASPMS_0S |
> +                 PCI_EXP_LNK_MLW_1 |
> +                 PCI_EXP_LNK_LS_25);
> +
> +    pci_set_word(pcie_cap + PCI_EXP_LNKSTA,
> +                 PCI_EXP_LNK_MLW_1 | PCI_EXP_LNK_LS_25);
> +
> +    pci_set_long(pcie_cap + PCI_EXP_DEVCAP2,
> +                 PCI_EXP_DEVCAP2_EFF | PCI_EXP_DEVCAP2_EETLPP);
> +
> +    pci_set_word(dev->wmask + exp_cap, PCI_EXP_DEVCTL2_EETLPPB);
> +    return exp_cap;
> +}
> +
> +void pcie_cap_exit(PCIDevice *dev)
> +{
> +    pci_del_capability(dev, PCI_CAP_ID_EXP, PCI_EXP_VER2_SIZEOF);
> +}
> +
> +uint8_t pcie_cap_get_type(const PCIDevice *dev)
> +{
> +    uint32_t pos = pci_pcie_cap(dev);
> +    assert(pos > 0);
> +    return (pci_get_word(dev->config + pos + PCI_EXP_FLAGS) &
> +            PCI_EXP_FLAGS_TYPE) >> PCI_EXP_FLAGS_TYPE_SHIFT;
> +}
> +
> +/* MSI/MSI-X */
> +/* pci express interrupt message number */
> +void pcie_cap_flags_set_vector(PCIDevice *dev, uint8_t vector)
> +{
> +    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
> +    uint16_t tmp;
> +
> +    assert(vector <= 32);
> +    tmp = pci_get_word(pcie_cap + PCI_EXP_FLAGS);
> +    tmp &= ~PCI_EXP_FLAGS_IRQ;
> +    tmp |= vector << PCI_EXP_FLAGS_IRQ_SHIFT;
> +    pci_set_word(pcie_cap + PCI_EXP_FLAGS, tmp);
> +}
> +
> +uint8_t pcie_cap_flags_get_vector(PCIDevice *dev)
> +{
> +    return (pci_get_word(dev->config + pci_pcie_cap(dev) + PCI_EXP_FLAGS) &
> +            PCI_EXP_FLAGS_IRQ) >> PCI_EXP_FLAGS_IRQ_SHIFT;
> +}
> +
> +void pcie_cap_deverr_init(PCIDevice *dev)
> +{
> +    uint32_t pos = pci_pcie_cap(dev);
> +    uint8_t *pcie_cap = dev->config + pos;
> +    uint8_t *pcie_wmask = dev->wmask + pos;
> +    uint8_t *pcie_w1cmask = dev->wmask + pos;
> +
> +    pci_set_long(pcie_cap + PCI_EXP_DEVCAP,
> +                 pci_get_long(pcie_cap + PCI_EXP_DEVCAP) |
> +                 PCI_EXP_DEVCAP_RBER);
> +
> +    pci_set_long(pcie_wmask + PCI_EXP_DEVCTL,
> +                 pci_get_long(pcie_wmask + PCI_EXP_DEVCTL) |
> +                 PCI_EXP_DEVCTL_CERE | PCI_EXP_DEVCTL_NFERE |
> +                 PCI_EXP_DEVCTL_FERE | PCI_EXP_DEVCTL_URRE);
> +
> +    pci_set_long(pcie_w1cmask + PCI_EXP_DEVSTA,
> +                 pci_get_long(pcie_w1cmask + PCI_EXP_DEVSTA) |
> +                 PCI_EXP_DEVSTA_CED | PCI_EXP_DEVSTA_NFED |
> +                 PCI_EXP_DEVSTA_URD | PCI_EXP_DEVSTA_URD);
> +}
> +
> +void pcie_cap_deverr_reset(PCIDevice *dev)
> +{
> +    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
> +    pci_set_long(pcie_cap + PCI_EXP_DEVCTL,
> +                 pci_get_long(pcie_cap + PCI_EXP_DEVCTL) &
> +                 ~(PCI_EXP_DEVCTL_CERE | PCI_EXP_DEVCTL_NFERE |
> +                   PCI_EXP_DEVCTL_FERE | PCI_EXP_DEVCTL_URRE));
> +}
> +
> +/*
> + * events: PCI_EXP_HP_EV_xxx
> + * status: bit or of PCI_EXP_SLTSTA_xxx

or of?

also - make status the right enum then?


Could you replace the comment above with one describing what this
is supposed to do?
Why can't users simply call pcie_assert_interrupt directly?
The below comments are based on an incomplete understanding
of the below function.

> + */
> +static void pcie_cap_slot_event(PCIDevice *dev,
> +                                enum PCIExpressHotPlugEvent events,
> +                                uint16_t status)
> +{

Most of the code seems to simply validate inputs. But why?
You always pass in valid numbers
also events -> event, you always pass a single one.
It looks like it'll be simpler if you just assume
a single event, and move the status bit
tweaking outside of this function, it is
only useful for PDS ...

The we will get a straightforward

> +    bool trigger;
> +    int level;
> +    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
> +    uint16_t sltctl = pci_get_word(pcie_cap + PCI_EXP_SLTCTL);
> +    uint16_t sltsta = pci_get_word(pcie_cap + PCI_EXP_SLTSTA);
> +
> +    PCIE_DEV_PRINTF(dev,
> +                    "sltctl: 0x%0x2 sltsta: 0x%02x event:%x %s status:%d\n",
> +                    sltctl, sltsta,
> +                    events, pcie_hp_event_name(events), status);
> +    events &= PCI_EXP_HP_EV_SUPPORTED;
> +    if ((sltctl & PCI_EXP_SLTCTL_HPIE) && (sltctl & events) &&
> +        ((sltsta ^ events) & events) /* 0 -> 1 */) {
> +        trigger = true;
> +    } else {
> +        trigger = false;
> +    }
> +
> +    if (events & PCI_EXP_HP_EV_PDC) {
> +        sltsta &= ~PCI_EXP_SLTSTA_PDS;
> +        sltsta |= (status & PCI_EXP_SLTSTA_PDS);
> +    }
> +    sltsta |= events;
> +    pci_set_word(pcie_cap + PCI_EXP_SLTSTA, sltsta);
> +    PCIE_DEV_PRINTF(dev, "sltsta -> %02xn", sltsta);
> +
> +    if ((sltctl & PCI_EXP_SLTCTL_HPIE) && (sltsta & PCI_EXP_HP_EV_SUPPORTED)) {
> +        level = 1;
> +    } else {
> +        level = 0;
> +    }


you can replace if with assignment here and elsewhere:
	level = (sltctl & PCI_EXP_SLTCTL_HPIE) && (sltsta & PCI_EXP_HP_EV_SUPPORTED);


> +
> +    pcie_notify(dev, pcie_cap_flags_get_vector(dev), trigger, level);
> +}
> +
> +static int pcie_cap_slot_hotplug(DeviceState *qdev,
> +                                 PCIDevice *pci_dev, int state)
> +{
> +    PCIDevice *d = DO_UPCAST(PCIDevice, qdev, qdev);
> +    uint8_t *pcie_cap = d->config + pci_pcie_cap(d);
> +    uint16_t sltsta = pci_get_word(pcie_cap + PCI_EXP_SLTSTA);
> +
> +    if (!pci_dev->qdev.hotplugged) {
> +        assert(state); /* this case only happens machine creation. */

at machine creation?

> +        sltsta |= PCI_EXP_SLTSTA_PDS;
> +        pci_set_word(pcie_cap + PCI_EXP_SLTSTA, sltsta);
> +        return 0;
> +    }
> +
> +    PCIE_DEV_PRINTF(pci_dev, "hotplug state: %d\n", state);
> +    if (sltsta & PCI_EXP_SLTSTA_EIS) {
> +        /* the slot is electromechanically locked. */

We'll need to produce some error here.

> +        return -EBUSY;
> +    }
> +
> +    if (state) {
> +        if (PCI_FUNC(pci_dev->devfn) == 0) {
> +            /* event is per slot. Not per function
> +             * only generates event for function = 0.
> +             * When hot plug, populate functions > 0
> +             * and then add function = 0 last.
> +             */
> +            pcie_cap_slot_event(d, PCI_EXP_HP_EV_PDC, PCI_EXP_SLTSTA_PDS);
> +        }
> +    } else {
> +        PCIBridge *br;
> +        PCIBus *bus;
> +        DeviceState *next;
> +        if (PCI_FUNC(pci_dev->devfn) != 0) {
> +            /* event is per slot. Not per function.
> +               accepts function = 0 only. */
> +            return -EINVAL;

Can user or guest trigger this?
If yes print an error.
IF no, assert.

> +        }
> +
> +        /* zap all functions. */
> +        br = DO_UPCAST(PCIBridge, dev, d);
> +        bus = pci_bridge_get_sec_bus(br);
> +        QLIST_FOREACH_SAFE(qdev, &bus->qbus.children, sibling, next) {
> +            qdev_free(qdev);
> +        }
> +
> +        pcie_cap_slot_event(d, PCI_EXP_HP_EV_PDC, 0);
> +    }
> +    return 0;
> +}
> +
> +/* pci express slot for pci express root/downstream port
> +   PCI express capability slot registers */
> +void pcie_cap_slot_init(PCIDevice *dev, uint16_t slot)
> +{
> +    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
> +    uint8_t *pcie_wmask = dev->wmask + pci_pcie_cap(dev);
> +    uint8_t *pcie_w1cmask = dev->w1cmask + pci_pcie_cap(dev);
> +    uint32_t tmp;
> +
> +    pci_set_word(pcie_cap + PCI_EXP_FLAGS,
> +                 pci_get_word(pcie_cap + PCI_EXP_FLAGS) | PCI_EXP_FLAGS_SLOT);
> +
> +    tmp = pci_get_long(pcie_cap + PCI_EXP_SLTCAP);
> +    tmp &= PCI_EXP_SLTCAP_PSN;
> +    tmp |=
> +        (slot << PCI_EXP_SLTCAP_PSN_SHIFT) |
> +        PCI_EXP_SLTCAP_EIP |
> +        PCI_EXP_SLTCAP_HPS |
> +        PCI_EXP_SLTCAP_HPC |
> +        PCI_EXP_SLTCAP_PIP |
> +        PCI_EXP_SLTCAP_AIP |
> +        PCI_EXP_SLTCAP_ABP;
> +    pci_set_long(pcie_cap + PCI_EXP_SLTCAP, tmp);
> +
> +    tmp = pci_get_word(pcie_cap + PCI_EXP_SLTCTL);
> +    tmp &= ~(PCI_EXP_SLTCTL_PIC | PCI_EXP_SLTCTL_AIC);
> +    tmp |= PCI_EXP_SLTCTL_PIC_OFF | PCI_EXP_SLTCTL_AIC_OFF;
> +    pci_set_word(pcie_cap + PCI_EXP_SLTCTL, tmp);
> +    pci_set_word(pcie_wmask + PCI_EXP_SLTCTL,
> +                 pci_get_word(pcie_wmask + PCI_EXP_SLTCTL) |
> +                 PCI_EXP_SLTCTL_PIC |
> +                 PCI_EXP_SLTCTL_AIC |
> +                 PCI_EXP_SLTCTL_HPIE |
> +                 PCI_EXP_SLTCTL_CCIE |
> +                 PCI_EXP_SLTCTL_PDCE |
> +                 PCI_EXP_SLTCTL_ABPE);
> +
> +    pci_set_word(pcie_w1cmask + PCI_EXP_SLTSTA,
> +                 pci_get_word(pcie_w1cmask + PCI_EXP_SLTSTA) |
> +                 PCI_EXP_HP_EV_SUPPORTED);
> +
> +    pci_bus_hotplug(pci_bridge_get_sec_bus(DO_UPCAST(PCIBridge, dev, dev)),
> +                    pcie_cap_slot_hotplug, &dev->qdev);
> +}
> +
> +void pcie_cap_slot_reset(PCIDevice *dev)
> +{
> +    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
> +    uint32_t tmp;
> +
> +    PCIE_DEV_PRINTF(dev, "reset\n");
> +
> +    tmp = pci_get_word(pcie_cap + PCI_EXP_SLTCTL);
> +    tmp &= ~(PCI_EXP_SLTCTL_EIC |
> +             PCI_EXP_SLTCTL_PIC |
> +             PCI_EXP_SLTCTL_AIC |
> +             PCI_EXP_SLTCTL_HPIE |
> +             PCI_EXP_SLTCTL_CCIE |
> +             PCI_EXP_SLTCTL_PDCE |
> +             PCI_EXP_SLTCTL_ABPE);
> +    tmp |= PCI_EXP_SLTCTL_PIC_OFF | PCI_EXP_SLTCTL_AIC_OFF;
> +    pci_set_word(pcie_cap + PCI_EXP_SLTCTL, tmp);
> +
> +    tmp = pci_get_word(pcie_cap + PCI_EXP_SLTSTA);
> +    tmp &= ~(PCI_EXP_SLTSTA_EIS | /* by reset, the lock is released */
> +             PCI_EXP_SLTSTA_CC |
> +             PCI_EXP_SLTSTA_PDC |
> +             PCI_EXP_SLTSTA_ABP);
> +    pci_set_word(pcie_cap + PCI_EXP_SLTSTA, tmp);
> +}
> +
> +void pcie_cap_slot_write_config(PCIDevice *dev,
> +                                uint32_t addr, uint32_t val, int len,
> +                                uint16_t sltctl_prev)
> +{
> +    uint32_t pos = pci_pcie_cap(dev);
> +    uint8_t *pcie_cap = dev->config + pos;
> +    uint16_t sltctl = pci_get_word(pcie_cap + PCI_EXP_SLTCTL);
> +    uint16_t sltsta = pci_get_word(pcie_cap + PCI_EXP_SLTSTA);
> +
> +    PCIE_DEV_PRINTF(dev,
> +                    "addr: 0x%x val: 0x%x len: %d\n"
> +                    "\tsltctl_prev: 0x%02x sltctl: 0x%02x sltsta 0x%02x\n",
> +                    addr, val, len, sltctl_prev, sltctl, sltsta);
> +    /* SLTSTA: process SLTSTA before SLTCTL to avoid spurious interrupt */
> +    if (ranges_overlap(addr, len, pos + PCI_EXP_SLTSTA, 2)) {
> +        sltsta = pci_get_word(pcie_cap + PCI_EXP_SLTSTA);
> +
> +        /* write to stlsta results in clearing bits,
> +           so new interrupts won't be generated. */
> +        PCIE_DEV_PRINTF(dev, "sltsta -> 0x%02x\n", sltsta);
> +    }
> +
> +    /* SLTCTL */
> +    if (ranges_overlap(addr, len, pos + PCI_EXP_SLTCTL, 2)) {
> +        PCIE_DEV_PRINTF(dev, "sltctl: 0x%02x -> 0x%02x\n",
> +                        sltctl_prev, sltctl);
> +        if (pci_shift_word(addr, val, pos + PCI_EXP_SLTCTL) &

This is too complex. Just make the bit writeable,
test and clear, then change status.

> +            PCI_EXP_SLTCTL_EIC) {
> +            /* toggle PCI_EXP_SLTSTA_EIS */
> +            sltsta = (sltsta & ~PCI_EXP_SLTSTA_EIS) |
> +                ((sltsta ^ PCI_EXP_SLTSTA_EIS) & PCI_EXP_SLTSTA_EIS);

You mean
		sltsta ^= PCI_EXP_SLTSTA_EIS
?

> +            pci_set_word(pcie_cap + PCI_EXP_SLTSTA, sltsta);
> +            PCIE_DEV_PRINTF(dev, "PCI_EXP_SLTCTL_EIC: sltsta -> 0x%02x\n",
> +                            sltsta);
> +        }
> +
> +        if (sltctl & PCI_EXP_SLTCTL_HPIE) {
> +            bool trigger;
> +            int level;
> +
> +            if (((sltctl_prev ^ sltctl) & sltctl) & PCI_EXP_HP_EV_SUPPORTED) {
kill extra () they are not helpful:
            if ((sltctl_prev ^ sltctl) & sltctl & PCI_EXP_HP_EV_SUPPORTED)


> +                /* 0 -> 1 */
> +                trigger = true;
> +            } else {
> +                trigger = false;
> +            }
> +            if ((sltctl & sltsta) & PCI_EXP_HP_EV_SUPPORTED) {

kill extra () they are not helpful

> +                level = 1;
> +            } else {
> +                level = 0;
> +            }

What is this trying to implement?


> +            pcie_notify(dev, pcie_cap_flags_get_vector(dev), trigger, level);
> +        }
> +
> +        if (!((sltctl_prev ^ sltctl) & PCI_EXP_SLTCTL_SUPPORTED)) {
> +            PCIE_DEV_PRINTF(dev,
> +                            "sprious command completion slctl 0x%x -> 0x%x\n",
> +                            sltctl_prev, sltctl);
> +        }
> +
> +        /* command completion.
> +         * Real hardware might take a while to complete
> +         * requested command because physical movement would be involved
> +         * like locking the electromechanical lock.
> +         * However in our case, command is completed instantaneously above,
> +         * so send a command completion event right now.
> +         */
> +        /* set command completed bit */
> +        pcie_cap_slot_event(dev, PCI_EXP_HP_EV_CCI, 0);
> +    }
> +}
> +
> +void pcie_cap_slot_push_attention_button(PCIDevice *dev)
> +{
> +    pcie_cap_slot_event(dev, PCI_EXP_HP_EV_ABP, 0);
> +}
> +
> +/* root control/capabilities/status. PME isn't emulated for now */
> +void pcie_cap_root_init(PCIDevice *dev)
> +{
> +    uint8_t pos = pci_pcie_cap(dev);
> +    pci_set_word(dev->wmask + pos + PCI_EXP_RTCTL,
> +                 PCI_EXP_RTCTL_SECEE | PCI_EXP_RTCTL_SENFEE |
> +                 PCI_EXP_RTCTL_SEFEE);
> +}
> +
> +void pcie_cap_root_reset(PCIDevice *dev)
> +{
> +    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
> +    pci_set_word(pcie_cap + PCI_EXP_RTCTL, 0);
> +}
> +
> +/* function level reset(FLR) */
> +void pcie_cap_flr_init(PCIDevice *dev, pcie_flr_fn flr)
> +{
> +    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
> +    pci_set_word(pcie_cap + PCI_EXP_DEVCAP,
> +                 pci_get_word(pcie_cap + PCI_EXP_DEVCAP) | PCI_EXP_DEVCAP_FLR);
> +    dev->exp.flr = flr;
> +}
> +
> +void pcie_cap_flr_write_config(PCIDevice *dev,
> +                               uint32_t addr, uint32_t val, int len)
> +{
> +    uint32_t pos = pci_pcie_cap(dev);
> +    if (ranges_overlap(addr, len, pos + PCI_EXP_DEVCTL, 2)) {
> +        uint16_t val16 = pci_shift_word(addr, val, pos + PCI_EXP_DEVCTL);
> +        if ((val16 & PCI_EXP_DEVCTL_BCR_FLR) && dev->exp.flr) {
> +            dev->exp.flr(dev);
> +        }
> +    }

Just make FLR writeable, and clear it after calling reset.
This will also make it possible for devices to detect
that they are reset because of FLR.

> +}
> +
> +/* Alternative Routing-ID Interpretation (ARI) */
> +/* ari forwarding support for down stream port */
> +void pcie_cap_ari_init(PCIDevice *dev)
> +{
> +    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
> +    uint8_t *pcie_wmask = dev->wmask + pci_pcie_cap(dev);
> +
> +    pci_set_long(pcie_cap + PCI_EXP_DEVCAP2,
> +                 pci_get_long(pcie_cap + PCI_EXP_DEVCAP2) |
> +                 PCI_EXP_DEVCAP2_ARI);
> +
> +    pci_set_long(pcie_wmask + PCI_EXP_DEVCTL2,
> +                 pci_get_long(pcie_wmask + PCI_EXP_DEVCTL2) |
> +                 PCI_EXP_DEVCTL2_ARI);
> +}
> +
> +void pcie_cap_ari_reset(PCIDevice *dev)
> +{
> +    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
> +
> +    pci_set_long(pcie_cap + PCI_EXP_DEVCTL2,
> +                 pci_get_long(pcie_cap + PCI_EXP_DEVCTL2) &
> +                 ~PCI_EXP_DEVCTL2_ARI);
> +}
> +
> +bool pcie_cap_is_ari_enabled(const PCIDevice *dev)
> +{
> +    if (!pci_is_express(dev)) {
> +        return false;
> +    }
> +    if (!pci_pcie_cap(dev)) {
> +        return false;
> +    }
> +
> +    return pci_get_long(dev->config + pci_pcie_cap(dev) + PCI_EXP_DEVCTL2) &
> +        PCI_EXP_DEVCTL2_ARI;
> +}
> +
> +/**************************************************************************
> + * pci express extended capability allocation functions
> + * uint16_t ext_cap_id (16 bit)
> + * uint8_t cap_ver (4 bit)
> + * uint16_t cap_offset (12 bit)
> + * uint16_t ext_cap_size
> + */
> +
> +#define PCI_EXT_CAP_NO_ID       ((uint16_t)0)   /* 0 is reserved cap id.
> +                                                 * use internally to find the
> +                                                 * last capability in the
> +                                                 * linked list
> +                                                 */


better remove the macro and move the comment to where
it's used.

> +
> +static uint16_t pcie_find_capability_list(PCIDevice *dev, uint16_t cap_id,
> +                                          uint16_t *prev_p)
> +{
> +    uint16_t prev = 0;
> +    uint16_t next = PCI_CONFIG_SPACE_SIZE;
> +    uint32_t header = pci_get_long(dev->config + next);

next -> PCI_CONFIG_SPACE_SIZE in line above.

> +
> +    if (!header) {
> +        /* no extended capability */
> +        next = 0;
> +        goto out;
> +    }
> +
> +    while (next) {

will be clearer as a for loop?
	for (next = PCI_CONFIG_SPACE_SIZE; next;
	 (prev = next),(next = PCI_EXT_CAP_NEXT(header)))

> +        assert(next >= PCI_CONFIG_SPACE_SIZE);
> +        assert(next <= PCIE_CONFIG_SPACE_SIZE - 8);
> +
> +        header = pci_get_long(dev->config + next);
> +        if (PCI_EXT_CAP_ID(header) == cap_id) {
> +            break;
> +        }
> +        prev = next;
> +        next = PCI_EXT_CAP_NEXT(header);
> +    }
> +
> +out:
> +    if (prev_p) {
> +        *prev_p = prev;
> +    }
> +    return next;
> +}
> +
> +uint16_t pcie_find_capability(PCIDevice *dev, uint16_t cap_id)
> +{
> +    return pcie_find_capability_list(dev, cap_id, NULL);
> +}
> +
> +static void pcie_ext_cap_set_next(PCIDevice *dev, uint16_t pos, uint16_t next)
> +{
> +    uint16_t header = pci_get_long(dev->config + pos);
> +    assert(!(next & (PCI_EXT_CAP_ALIGN - 1)));
> +    header = (header & ~PCI_EXT_CAP_NEXT_MASK) |
> +        ((next << PCI_EXT_CAP_NEXT_SHIFT) & PCI_EXT_CAP_NEXT_MASK);
> +    pci_set_long(dev->config + pos, header);
> +}
> +
> +/*
> + * caller must supply valid (offset, size) * such that the range shouldn't
> + * overlap with other capability or other registers.
> + * This function doesn't check it.
> + */
> +void pcie_add_capability(PCIDevice *dev,
> +                         uint16_t cap_id, uint8_t cap_ver,
> +                         uint16_t offset, uint16_t size)
> +{
> +    uint32_t header;
> +    uint16_t next;
> +
> +    assert(offset >= PCI_CONFIG_SPACE_SIZE);
> +    assert(offset < offset + size);
> +    assert(offset + size < PCIE_CONFIG_SPACE_SIZE);
> +    assert(size >= 8);
> +    assert(pci_is_express(dev));
> +
> +    if (offset == PCI_CONFIG_SPACE_SIZE) {
> +        header = pci_get_long(dev->config + offset);
> +        next = PCI_EXT_CAP_NEXT(header);
> +    } else {
> +        uint16_t prev;
> +        next = pcie_find_capability_list(dev, PCI_EXT_CAP_NO_ID, &prev);
> +        assert(next == 0);
> +        pcie_ext_cap_set_next(dev, prev, offset);
> +    }
> +    pci_set_long(dev->config + offset, PCI_EXT_CAP(cap_id, cap_ver, next));
> +
> +    /* Make capability read-only by default */
> +    memset(dev->wmask + offset, 0, size);
> +    /* Check capability by default */
> +    memset(dev->cmask + offset, 0xFF, size);
> +}
> +
> +void pcie_del_capability(PCIDevice *dev, uint16_t cap_id, uint16_t size)
> +{
> +    uint16_t prev;
> +    uint16_t offset = pcie_find_capability_list(dev, cap_id, &prev);
> +    uint32_t header;
> +
> +    assert(offset >= PCI_CONFIG_SPACE_SIZE);

Should be assert(offset). This is what we return on error, right?

> +    header = pci_get_long(dev->config + offset);
> +    if (prev) {
> +        pcie_ext_cap_set_next(dev, prev, PCI_EXT_CAP_NEXT(header));
> +    } else {
> +        /* move up next ext cap to PCI_CONFIG_SPACE_SIZE? */

Since we don't now, add assert that next is 0.

> +        assert(offset == PCI_CONFIG_SPACE_SIZE);
> +        pci_set_long(dev->config + offset,
> +                     PCI_EXT_CAP(0, 0, PCI_EXT_CAP_NEXT(header)));
> +    }
> +
> +    /* Make those registers read-only reserved zero */

So you make them readonly in both add and delete?
delete should revert add: let's put the
masks back the way they were: writeable.

> +    memset(dev->config + offset, 0, size);
> +    memset(dev->wmask + offset, 0, size);
> +    /* Clear cmask as device-specific registers can't be checked */
> +    memset(dev->cmask + offset, 0, size);
> +}
> +
> +/**************************************************************************
> + * pci express extended capability helper functions
> + */
> +
> +/* ARI */
> +void pcie_ari_init(PCIDevice *dev, uint16_t offset, uint16_t nextfn)
> +{
> +    pcie_add_capability(dev, PCI_EXT_CAP_ID_ARI, PCI_ARI_VER,
> +                        offset, PCI_ARI_SIZEOF);
> +    pci_set_long(dev->config + offset + PCI_ARI_CAP, PCI_ARI_CAP_NFN(nextfn));
> +}
> diff --git a/hw/pcie.h b/hw/pcie.h
> new file mode 100644
> index 0000000..37713dc
> --- /dev/null
> +++ b/hw/pcie.h
> @@ -0,0 +1,96 @@
> +/*
> + * pcie.h
> + *
> + * Copyright (c) 2010 Isaku Yamahata <yamahata at valinux co jp>
> + *                    VA Linux Systems Japan K.K.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#ifndef QEMU_PCIE_H
> +#define QEMU_PCIE_H
> +
> +#include "hw.h"
> +#include "pcie_regs.h"
> +
> +enum PCIExpressIndicator {
> +    /* for attention and power indicator */
> +    PCI_EXP_HP_IND_RESERVED     = PCI_EXP_SLTCTL_IND_RESERVED,
> +    PCI_EXP_HP_IND_ON           = PCI_EXP_SLTCTL_IND_ON,
> +    PCI_EXP_HP_IND_BLINK        = PCI_EXP_SLTCTL_IND_BLINK,
> +    PCI_EXP_HP_IND_OFF          = PCI_EXP_SLTCTL_IND_OFF,
> +};
> +
> +enum PCIExpressHotPlugEvent {

this is not how we name types, I think?
Either typedef enum {} PCIExpressHotPlugEvent, or
enum pcie_hot_plug_event { }.
Same for other types.

> +    /* the bits match the bits in Slot Control/Status registers.
> +     * PCI_EXP_HP_EV_xxx = PCI_EXP_SLTCTL_xxxE = PCI_EXP_SLTSTA_xxx

Is it important that they match?
We don't assume this in code, do we?

> +     */
> +    PCI_EXP_HP_EV_ABP           = 0x01,         /* attention button preseed */

typo

> +    PCI_EXP_HP_EV_PDC           = 0x08,         /* presence detect changed */
> +    PCI_EXP_HP_EV_CCI           = 0x10,         /* command completed */
> +
> +    PCI_EXP_HP_EV_SUPPORTED     = 0x19,         /* supported event mask  */

Gave me pause until I saw
	PCI_EXP_HP_EV_SUPPORTED = (PCI_EXP_HP_EV_ABP | PCI_EXP_HP_EV_PDC | PCI_EXP_HP_EV_CCI)
so make this explicit?

Also - non supported bits would always be readonly, right?
So why do we need this and all the masking?
I think we should be able to get away with checking
the whole register is not 0, and get rid of this?


> +    /* events not listed aren't supported */
> +};
> +
> +typedef void (*pcie_flr_fn)(PCIDevice *dev);

Is flr special?  Can't we use the generic reset handlers?
If not why?

> +
> +struct PCIExpressDevice {
> +    /* Offset of express capability in config space */
> +    uint8_t exp_cap;
> +
> +    /* FLR */
> +    pcie_flr_fn flr;
> +};
> +
> +void pcie_notify(PCIDevice *dev, uint16_t vector, bool trigger, int level);
> +
> +/* PCI express capability helper functions */
> +int pcie_cap_init(PCIDevice *dev, uint8_t offset, uint8_t type, uint8_t port);
> +void pcie_cap_exit(PCIDevice *dev);
> +uint8_t pcie_cap_get_type(const PCIDevice *dev);
> +void pcie_cap_flags_set_vector(PCIDevice *dev, uint8_t vector);
> +uint8_t pcie_cap_flags_get_vector(PCIDevice *dev);
> +
> +void pcie_cap_deverr_init(PCIDevice *dev);
> +void pcie_cap_deverr_reset(PCIDevice *dev);
> +
> +void pcie_cap_slot_init(PCIDevice *dev, uint16_t slot);
> +void pcie_cap_slot_reset(PCIDevice *dev);
> +void pcie_cap_slot_write_config(PCIDevice *dev,
> +                                uint32_t addr, uint32_t val, int len,
> +                                uint16_t sltctl_prev);
> +void pcie_cap_slot_push_attention_button(PCIDevice *dev);
> +
> +void pcie_cap_root_init(PCIDevice *dev);
> +void pcie_cap_root_reset(PCIDevice *dev);
> +
> +void pcie_cap_flr_init(PCIDevice *dev, pcie_flr_fn flr);
> +void pcie_cap_flr_write_config(PCIDevice *dev,
> +                           uint32_t addr, uint32_t val, int len);
> +
> +void pcie_cap_ari_init(PCIDevice *dev);
> +void pcie_cap_ari_reset(PCIDevice *dev);
> +bool pcie_cap_is_ari_enabled(const PCIDevice *dev);
> +
> +/* PCI express extended capability helper functions */
> +uint16_t pcie_find_capability(PCIDevice *dev, uint16_t cap_id);
> +void pcie_add_capability(PCIDevice *dev,
> +                         uint16_t cap_id, uint8_t cap_ver,
> +                         uint16_t offset, uint16_t size);
> +void pcie_del_capability(PCIDevice *dev, uint16_t cap_id, uint16_t size);
> +
> +void pcie_ari_init(PCIDevice *dev, uint16_t offset, uint16_t nextfn);
> +
> +#endif /* QEMU_PCIE_H */
> diff --git a/qemu-common.h b/qemu-common.h
> index d735235..6d9ee26 100644
> --- a/qemu-common.h
> +++ b/qemu-common.h
> @@ -219,6 +219,7 @@ typedef struct PCIHostState PCIHostState;
>  typedef struct PCIExpressHost PCIExpressHost;
>  typedef struct PCIBus PCIBus;
>  typedef struct PCIDevice PCIDevice;
> +typedef struct PCIExpressDevice PCIExpressDevice;
>  typedef struct PCIBridge PCIBridge;
>  typedef struct SerialState SerialState;
>  typedef struct IRQState *qemu_irq;
> -- 
> 1.7.1.1
Isaku Yamahata Sept. 19, 2010, 4:56 a.m. UTC | #2
On Wed, Sep 15, 2010 at 02:43:10PM +0200, Michael S. Tsirkin wrote:
> > +/***************************************************************************
> > + * pci express capability helper functions
> > + */
> > +void pcie_notify(PCIDevice *dev, uint16_t vector, bool trigger, int level)
> 
> Why is this not static? It makes sense for internal stuff possibly,
> but I think functions will need to know what to do: they can't
> treat msi/msix/irq identically anyway.

The aer code which I split out uses it.


> The API seems confusing, I think this is what is creating
> code for you. Specifically level = 0 does not notify at all.
> So I think we need two:
> 1. pcie_assert_interrupt which sends msi or sets level to 1
> 2. pcie_deassert_interrupt which sets level to 0, or nothing
>    for non msi.
> 
> Then below you can e.g.
> if (!sltctrl) {
> 	pcie_deassert(...);
> 	return;
> }

As I already mentioned in the other mail, when to assert MSI
can be different from INTx. The express spec utilizes it.
For example hot plug, aer, and so on.


> > +                    vector, trigger, level);
> > +    if (msix_enabled(dev)) {
> > +        if (trigger) {
> > +            msix_notify(dev, vector);
> > +        }
> > +    } else if (msi_enabled(dev)) {
> > +        if (trigger){
> > +            msi_notify(dev, vector);
> > +        }
> > +    } else {
> > +        qemu_set_irq(dev->irq[0], level);
> 
> always 0? really? This is INTA# - is this what the spec says?

It depends on each device implementation. So I just picked INTA#.
Okay, I'll make it customizable by using property.


> > +    }
> > +}
> > +
> > +int pcie_cap_init(PCIDevice *dev, uint8_t offset, uint8_t type, uint8_t port)
> > +{
> > +    int exp_cap;
> > +    uint8_t *pcie_cap;
> > +
> > +    assert(pci_is_express(dev));
> > +
> > +    exp_cap = pci_add_capability(dev, PCI_CAP_ID_EXP, offset,
> > +                                 PCI_EXP_VER2_SIZEOF);
> > +    if (exp_cap < 0) {
> > +        return exp_cap;
> > +    }
> > +    dev->exp.exp_cap = exp_cap;
> > +
> > +    /* already done in pci_qdev_init() */
> > +    assert(dev->cap_present & QEMU_PCI_CAP_EXPRESS);
> 
> Hmm. Why do we set this flag in qdev init but do the
> rest of it here?

pci_qdev_init() needs to know it for allocating config array.


> > +        return -EBUSY;
> > +    }
> > +
> > +    if (state) {
> > +        if (PCI_FUNC(pci_dev->devfn) == 0) {
> > +            /* event is per slot. Not per function
> > +             * only generates event for function = 0.
> > +             * When hot plug, populate functions > 0
> > +             * and then add function = 0 last.
> > +             */
> > +            pcie_cap_slot_event(d, PCI_EXP_HP_EV_PDC, PCI_EXP_SLTSTA_PDS);
> > +        }
> > +    } else {
> > +        PCIBridge *br;
> > +        PCIBus *bus;
> > +        DeviceState *next;
> > +        if (PCI_FUNC(pci_dev->devfn) != 0) {
> > +            /* event is per slot. Not per function.
> > +               accepts function = 0 only. */
> > +            return -EINVAL;
> 
> Can user or guest trigger this?
> If yes print an error.
> IF no, assert.

Not yet. When multi function device hot plug is supported in some sense,
it will be triggered in some way. The code is just place holder until then.
Some TODO comment should have been there.


> > +                level = 1;
> > +            } else {
> > +                level = 0;
> > +            }
> 
> What is this trying to implement?

hot plut event notification. Please refer to
6.7. PCI Express Hot-Plug Support
6.7.3. PCI Express Hot-Plug Events
6.7.3.4. Software Notification of Hot-Plug Events

> > +        assert(offset == PCI_CONFIG_SPACE_SIZE);
> > +        pci_set_long(dev->config + offset,
> > +                     PCI_EXT_CAP(0, 0, PCI_EXT_CAP_NEXT(header)));
> > +    }
> > +
> > +    /* Make those registers read-only reserved zero */
> 
> So you make them readonly in both add and delete?
> delete should revert add: let's put the
> masks back the way they were: writeable.

In fact zeroing in add is redundant, but I added it following msix code.

The usage model is
- At first the registers are unused, so should be read only zero.
- add capability
  (zeroing is redundant)
- the device specific code sets config/mask/cmask/w1cmask as it likes
- del capability
  This makes the register unused, i.e. read only zero.

Maybe it's possible to make zeroing them caller's responsible.


> > +    /* the bits match the bits in Slot Control/Status registers.
> > +     * PCI_EXP_HP_EV_xxx = PCI_EXP_SLTCTL_xxxE = PCI_EXP_SLTSTA_xxx
> 
> Is it important that they match?
> We don't assume this in code, do we?

Yes, we're using it when setting stlsta.


> > +    /* events not listed aren't supported */
> > +};
> > +
> > +typedef void (*pcie_flr_fn)(PCIDevice *dev);
> 
> Is flr special?  Can't we use the generic reset handlers?
> If not why?

Reset(cold reset/warm reset) in generic sense corresponds to
conventional reset in express sense which corresponds to PCI RST#.
On the other hand FLR is different from the conventional one.

Cited from the spec

6.6. PCI Express Reset - Rules
6.6.1. Conventional Reset
 Conventional Reset includes all reset mechanisms other than Function
 Level Reset.
6.6.2. Function-Level Reset (FLR)

Most devices would implement FLR as just calling something like
qdev_reset. But the spec differentiates FLR from conventional reset,
so the generic pcie layer should do.
Michael S. Tsirkin Sept. 19, 2010, 11:45 a.m. UTC | #3
On Sun, Sep 19, 2010 at 01:56:23PM +0900, Isaku Yamahata wrote:
> On Wed, Sep 15, 2010 at 02:43:10PM +0200, Michael S. Tsirkin wrote:
> > > +/***************************************************************************
> > > + * pci express capability helper functions
> > > + */
> > > +void pcie_notify(PCIDevice *dev, uint16_t vector, bool trigger, int level)
> > 
> > Why is this not static? It makes sense for internal stuff possibly,
> > but I think functions will need to know what to do: they can't
> > treat msi/msix/irq identically anyway.
> 
> The aer code which I split out uses it.

Move it there?

> > The API seems confusing, I think this is what is creating
> > code for you. Specifically level = 0 does not notify at all.
> > So I think we need two:
> > 1. pcie_assert_interrupt which sends msi or sets level to 1
> > 2. pcie_deassert_interrupt which sets level to 0, or nothing
> >    for non msi.
> > 
> > Then below you can e.g.
> > if (!sltctrl) {
> > 	pcie_deassert(...);
> > 	return;
> > }
> 
> As I already mentioned in the other mail, when to assert MSI
> can be different from INTx. The express spec utilizes it.
> For example hot plug, aer, and so on.

My comment really has to do with the API.
The API that gets level and trigger values is confusing.
Two functions assert and deassert would be clearer.

> 
> > > +                    vector, trigger, level);
> > > +    if (msix_enabled(dev)) {
> > > +        if (trigger) {
> > > +            msix_notify(dev, vector);
> > > +        }
> > > +    } else if (msi_enabled(dev)) {
> > > +        if (trigger){
> > > +            msi_notify(dev, vector);
> > > +        }
> > > +    } else {
> > > +        qemu_set_irq(dev->irq[0], level);
> > 
> > always 0? really? This is INTA# - is this what the spec says?
> 
> It depends on each device implementation. So I just picked INTA#.
> Okay, I'll make it customizable by using property.

This might not even be static.
What if device can assert multiple INTx# pins?
I think a saner way is just to expose this in the API.

> 
> > > +    }
> > > +}
> > > +
> > > +int pcie_cap_init(PCIDevice *dev, uint8_t offset, uint8_t type, uint8_t port)
> > > +{
> > > +    int exp_cap;
> > > +    uint8_t *pcie_cap;
> > > +
> > > +    assert(pci_is_express(dev));
> > > +
> > > +    exp_cap = pci_add_capability(dev, PCI_CAP_ID_EXP, offset,
> > > +                                 PCI_EXP_VER2_SIZEOF);
> > > +    if (exp_cap < 0) {
> > > +        return exp_cap;
> > > +    }
> > > +    dev->exp.exp_cap = exp_cap;
> > > +
> > > +    /* already done in pci_qdev_init() */
> > > +    assert(dev->cap_present & QEMU_PCI_CAP_EXPRESS);
> > 
> > Hmm. Why do we set this flag in qdev init but do the
> > rest of it here?
> 
> pci_qdev_init() needs to know it for allocating config array.
> 

Can we do everything in qdev init function?

> > > +        return -EBUSY;
> > > +    }
> > > +
> > > +    if (state) {
> > > +        if (PCI_FUNC(pci_dev->devfn) == 0) {
> > > +            /* event is per slot. Not per function
> > > +             * only generates event for function = 0.
> > > +             * When hot plug, populate functions > 0
> > > +             * and then add function = 0 last.
> > > +             */
> > > +            pcie_cap_slot_event(d, PCI_EXP_HP_EV_PDC, PCI_EXP_SLTSTA_PDS);
> > > +        }
> > > +    } else {
> > > +        PCIBridge *br;
> > > +        PCIBus *bus;
> > > +        DeviceState *next;
> > > +        if (PCI_FUNC(pci_dev->devfn) != 0) {
> > > +            /* event is per slot. Not per function.
> > > +               accepts function = 0 only. */
> > > +            return -EINVAL;
> > 
> > Can user or guest trigger this?
> > If yes print an error.
> > IF no, assert.
> 
> Not yet. When multi function device hot plug is supported in some sense,
> it will be triggered in some way. The code is just place holder until then.
> Some TODO comment should have been there.

+ assert for now.

> 
> > > +                level = 1;
> > > +            } else {
> > > +                level = 0;
> > > +            }
> > 
> > What is this trying to implement?
> 
> hot plut event notification. Please refer to
> 6.7. PCI Express Hot-Plug Support
> 6.7.3. PCI Express Hot-Plug Events
> 6.7.3.4. Software Notification of Hot-Plug Events
> 
> > > +        assert(offset == PCI_CONFIG_SPACE_SIZE);
> > > +        pci_set_long(dev->config + offset,
> > > +                     PCI_EXT_CAP(0, 0, PCI_EXT_CAP_NEXT(header)));
> > > +    }
> > > +
> > > +    /* Make those registers read-only reserved zero */
> > 
> > So you make them readonly in both add and delete?
> > delete should revert add: let's put the
> > masks back the way they were: writeable.
> 
> In fact zeroing in add is redundant, but I added it following msix code.

It is not redundand there as registers are writeable by default:
    memset(dev->wmask + PCI_CONFIG_HEADER_SIZE, 0xff,
           config_size - PCI_CONFIG_HEADER_SIZE);


> 
> The usage model is
> - At first the registers are unused, so should be read only zero.

You can't know they are unused: in PCI spec everything
outside capability list is vendor specific so it
is writeable to let drivers do their thing.

> - add capability
>   (zeroing is redundant)
> - the device specific code sets config/mask/cmask/w1cmask as it likes
> - del capability
>   This makes the register unused, i.e. read only zero.

as above.

> Maybe it's possible to make zeroing them caller's responsible.
> 
> 
> > > +    /* the bits match the bits in Slot Control/Status registers.
> > > +     * PCI_EXP_HP_EV_xxx = PCI_EXP_SLTCTL_xxxE = PCI_EXP_SLTSTA_xxx
> > 
> > Is it important that they match?
> > We don't assume this in code, do we?
> 
> Yes, we're using it when setting stlsta.

Then they should be defined one through the other,
or just use one set of macros and have a comment in code
explaining that layout is identical.

> 
> > > +    /* events not listed aren't supported */
> > > +};
> > > +
> > > +typedef void (*pcie_flr_fn)(PCIDevice *dev);
> > 
> > Is flr special?  Can't we use the generic reset handlers?
> > If not why?
> 
> Reset(cold reset/warm reset) in generic sense corresponds to
> conventional reset in express sense which corresponds to PCI RST#.
> On the other hand FLR is different from the conventional one.
> 
> Cited from the spec
> 
> 6.6. PCI Express Reset - Rules
> 6.6.1. Conventional Reset
>  Conventional Reset includes all reset mechanisms other than Function
>  Level Reset.
> 6.6.2. Function-Level Reset (FLR)
> 
> Most devices would implement FLR as just calling something like
> qdev_reset. But the spec differentiates FLR from conventional reset,
> so the generic pcie layer should do.

I am not sure I agree. If most devices don't care, or
behave almost identically, it's ok to just call qdev_reset:
devices can find out that FLR is in progress by looking
at the config space (bit will be set there) - and we can
add a helper pcie_flr_in_progress() to test it.

This way most devices will work out of box.
Only if behaviour is mostly different would it make sense
to have a completely separate reset path.
What happens with devices you implemented?

> -- 
> yamahata
Isaku Yamahata Sept. 24, 2010, 2:24 a.m. UTC | #4
On Sun, Sep 19, 2010 at 01:45:33PM +0200, Michael S. Tsirkin wrote:
> On Sun, Sep 19, 2010 at 01:56:23PM +0900, Isaku Yamahata wrote:
> > On Wed, Sep 15, 2010 at 02:43:10PM +0200, Michael S. Tsirkin wrote:
> > > > +/***************************************************************************
> > > > + * pci express capability helper functions
> > > > + */
> > > > +void pcie_notify(PCIDevice *dev, uint16_t vector, bool trigger, int level)
> > > 
> > > Why is this not static? It makes sense for internal stuff possibly,
> > > but I think functions will need to know what to do: they can't
> > > treat msi/msix/irq identically anyway.
> > 
> > The aer code which I split out uses it.
> 
> Move it there?

_Both_ pcie.c and pcie_aer.c use it.


> > > > +        assert(offset == PCI_CONFIG_SPACE_SIZE);
> > > > +        pci_set_long(dev->config + offset,
> > > > +                     PCI_EXT_CAP(0, 0, PCI_EXT_CAP_NEXT(header)));
> > > > +    }
> > > > +
> > > > +    /* Make those registers read-only reserved zero */
> > > 
> > > So you make them readonly in both add and delete?
> > > delete should revert add: let's put the
> > > masks back the way they were: writeable.
> > 
> > In fact zeroing in add is redundant, but I added it following msix code.
> 
> It is not redundand there as registers are writeable by default:
>     memset(dev->wmask + PCI_CONFIG_HEADER_SIZE, 0xff,
>            config_size - PCI_CONFIG_HEADER_SIZE);
> 
> 
> > 
> > The usage model is
> > - At first the registers are unused, so should be read only zero.
> 
> You can't know they are unused: in PCI spec everything
> outside capability list is vendor specific so it
> is writeable to let drivers do their thing.

So why PCIDevice::wmask is zero when initialized by do_pci_register_device()?
Anyway pcie_capability_del() is only called via PCIDeviceInfo::exit,
so I don't see any reason why manipulating the capability linked
list just before destroying PCIDevice.
Let's eliminate pcie_capability_del() at all.


> > > > +    /* events not listed aren't supported */
> > > > +};
> > > > +
> > > > +typedef void (*pcie_flr_fn)(PCIDevice *dev);
> > > 
> > > Is flr special?  Can't we use the generic reset handlers?
> > > If not why?
> > 
> > Reset(cold reset/warm reset) in generic sense corresponds to
> > conventional reset in express sense which corresponds to PCI RST#.
> > On the other hand FLR is different from the conventional one.
> > 
> > Cited from the spec
> > 
> > 6.6. PCI Express Reset - Rules
> > 6.6.1. Conventional Reset
> >  Conventional Reset includes all reset mechanisms other than Function
> >  Level Reset.
> > 6.6.2. Function-Level Reset (FLR)
> > 
> > Most devices would implement FLR as just calling something like
> > qdev_reset. But the spec differentiates FLR from conventional reset,
> > so the generic pcie layer should do.
> 
> I am not sure I agree. If most devices don't care, or
> behave almost identically, it's ok to just call qdev_reset:
> devices can find out that FLR is in progress by looking
> at the config space (bit will be set there) - and we can
> add a helper pcie_flr_in_progress() to test it.
> 
> This way most devices will work out of box.
> Only if behaviour is mostly different would it make sense
> to have a completely separate reset path.
> What happens with devices you implemented?

With either approach, we can implement FLR.
The above approach will eventually result in passing passing parameter,
somthing like reset_kind, to callbacks. The argument tells what kind of
reset is triggered. cold reset, warm reset and many bus/device specific
resets. In fact it was my first approach.
But when we discussed on qdev reset() with Anthony, he claimed that handling
reset type should be handled by introducing other APIs because
it would just bloat qdev reset callback function with big switch.
So I introduced new flr callback.
Michael S. Tsirkin Sept. 26, 2010, 12:32 p.m. UTC | #5
On Fri, Sep 24, 2010 at 11:24:50AM +0900, Isaku Yamahata wrote:
> On Sun, Sep 19, 2010 at 01:45:33PM +0200, Michael S. Tsirkin wrote:
> > On Sun, Sep 19, 2010 at 01:56:23PM +0900, Isaku Yamahata wrote:
> > > On Wed, Sep 15, 2010 at 02:43:10PM +0200, Michael S. Tsirkin wrote:
> > > > > +/***************************************************************************
> > > > > + * pci express capability helper functions
> > > > > + */
> > > > > +void pcie_notify(PCIDevice *dev, uint16_t vector, bool trigger, int level)
> > > > 
> > > > Why is this not static? It makes sense for internal stuff possibly,
> > > > but I think functions will need to know what to do: they can't
> > > > treat msi/msix/irq identically anyway.
> > > 
> > > The aer code which I split out uses it.
> > 
> > Move it there?
> 
> _Both_ pcie.c and pcie_aer.c use it.
> 
> 
> > > > > +        assert(offset == PCI_CONFIG_SPACE_SIZE);
> > > > > +        pci_set_long(dev->config + offset,
> > > > > +                     PCI_EXT_CAP(0, 0, PCI_EXT_CAP_NEXT(header)));
> > > > > +    }
> > > > > +
> > > > > +    /* Make those registers read-only reserved zero */
> > > > 
> > > > So you make them readonly in both add and delete?
> > > > delete should revert add: let's put the
> > > > masks back the way they were: writeable.
> > > 
> > > In fact zeroing in add is redundant, but I added it following msix code.
> > 
> > It is not redundand there as registers are writeable by default:
> >     memset(dev->wmask + PCI_CONFIG_HEADER_SIZE, 0xff,
> >            config_size - PCI_CONFIG_HEADER_SIZE);
> > 
> > 
> > > 
> > > The usage model is
> > > - At first the registers are unused, so should be read only zero.
> > 
> > You can't know they are unused: in PCI spec everything
> > outside capability list is vendor specific so it
> > is writeable to let drivers do their thing.
> 
> So why PCIDevice::wmask is zero when initialized by do_pci_register_device()?

I think it isn't: only the header is 0: we have in pci_init_wmask
    memset(dev->wmask + PCI_CONFIG_HEADER_SIZE, 0xff,
           config_size - PCI_CONFIG_HEADER_SIZE);
what am I missing?

> Anyway pcie_capability_del() is only called via PCIDeviceInfo::exit,
> so I don't see any reason why manipulating the capability linked
> list just before destroying PCIDevice.
> Let's eliminate pcie_capability_del() at all.

OK.

> > > > > +    /* events not listed aren't supported */
> > > > > +};
> > > > > +
> > > > > +typedef void (*pcie_flr_fn)(PCIDevice *dev);
> > > > 
> > > > Is flr special?  Can't we use the generic reset handlers?
> > > > If not why?
> > > 
> > > Reset(cold reset/warm reset) in generic sense corresponds to
> > > conventional reset in express sense which corresponds to PCI RST#.
> > > On the other hand FLR is different from the conventional one.
> > > 
> > > Cited from the spec
> > > 
> > > 6.6. PCI Express Reset - Rules
> > > 6.6.1. Conventional Reset
> > >  Conventional Reset includes all reset mechanisms other than Function
> > >  Level Reset.
> > > 6.6.2. Function-Level Reset (FLR)
> > > 
> > > Most devices would implement FLR as just calling something like
> > > qdev_reset. But the spec differentiates FLR from conventional reset,
> > > so the generic pcie layer should do.
> > 
> > I am not sure I agree. If most devices don't care, or
> > behave almost identically, it's ok to just call qdev_reset:
> > devices can find out that FLR is in progress by looking
> > at the config space (bit will be set there) - and we can
> > add a helper pcie_flr_in_progress() to test it.
> > 
> > This way most devices will work out of box.
> > Only if behaviour is mostly different would it make sense
> > to have a completely separate reset path.
> > What happens with devices you implemented?
> 
> With either approach, we can implement FLR.
> The above approach will eventually result in passing passing parameter,
> somthing like reset_kind, to callbacks. The argument tells what kind of
> reset is triggered. cold reset, warm reset and many bus/device specific
> resets. In fact it was my first approach.
> But when we discussed on qdev reset() with Anthony, he claimed that handling
> reset type should be handled by introducing other APIs because
> it would just bloat qdev reset callback function with big switch.
> So I introduced new flr callback.

So I think this addresses this in a different way: we don't need a
parameter as devices can check state to see what reset is in progress.
Most of them do not need to bother, so we expect that there will not be any
giant switch statements.

But - you are writing the code after all so you tell us whether the reset code
is mostly same or mostly different: if it's same we should probably
reuse existing callbacks, to avoid boilerplate code
in devices, if it is different maybe add a new one.

Anthony, makes sense?

> -- 
> yamahata
diff mbox

Patch

diff --git a/Makefile.objs b/Makefile.objs
index 5f5a4c5..eeb5134 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -186,6 +186,7 @@  hw-obj-$(CONFIG_PIIX4) += piix4.o
 # PCI watchdog devices
 hw-obj-y += wdt_i6300esb.o
 
+hw-obj-y += pcie.o
 hw-obj-y += msix.o msi.o
 
 # PCI network cards
diff --git a/hw/pci.h b/hw/pci.h
index 630631b..19e85f5 100644
--- a/hw/pci.h
+++ b/hw/pci.h
@@ -9,6 +9,8 @@ 
 /* PCI includes legacy ISA access.  */
 #include "isa.h"
 
+#include "pcie.h"
+
 /* PCI bus */
 
 #define PCI_DEVFN(slot, func)   ((((slot) & 0x1f) << 3) | ((func) & 0x07))
@@ -175,6 +177,9 @@  struct PCIDevice {
     /* Offset of MSI capability in config space */
     uint8_t msi_cap;
 
+    /* PCI Express */
+    PCIExpressDevice exp;
+
     /* Location of option rom */
     char *romfile;
     ram_addr_t rom_offset;
@@ -389,6 +394,13 @@  static inline uint32_t pci_config_size(const PCIDevice *d)
     return pci_is_express(d) ? PCIE_CONFIG_SPACE_SIZE : PCI_CONFIG_SPACE_SIZE;
 }
 
+/* These are pci express specific, so should belong to pcie.h.
+   they're here to avoid mutual header dependency. */
+static inline uint8_t pci_pcie_cap(const PCIDevice *d)
+{
+    return pci_is_express(d) ? d->exp.exp_cap : 0;
+}
+
 /* These are not pci specific. Should move into a separate header.
  * Only pci.c uses them, so keep them here for now.
  */
diff --git a/hw/pcie.c b/hw/pcie.c
new file mode 100644
index 0000000..a6f396b
--- /dev/null
+++ b/hw/pcie.c
@@ -0,0 +1,638 @@ 
+/*
+ * pcie.c
+ *
+ * Copyright (c) 2010 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "sysemu.h"
+#include "pci_bridge.h"
+#include "pcie.h"
+#include "msix.h"
+#include "msi.h"
+#include "pci_internals.h"
+#include "pcie_regs.h"
+
+//#define DEBUG_PCIE
+#ifdef DEBUG_PCIE
+# define PCIE_DPRINTF(fmt, ...)                                         \
+    fprintf(stderr, "%s:%d " fmt, __func__, __LINE__, ## __VA_ARGS__)
+#else
+# define PCIE_DPRINTF(fmt, ...) do {} while (0)
+#endif
+#define PCIE_DEV_PRINTF(dev, fmt, ...)                                  \
+    PCIE_DPRINTF("%s:%x "fmt, (dev)->name, (dev)->devfn, ## __VA_ARGS__)
+
+static inline const char *pcie_hp_event_name(enum PCIExpressHotPlugEvent event)
+{
+    switch (event) {
+    case PCI_EXP_HP_EV_ABP:
+        return "attention button pushed";
+    case PCI_EXP_HP_EV_PDC:
+        return "present detection changed";
+    case PCI_EXP_HP_EV_CCI:
+        return "command completed";
+    default:
+        break;
+    }
+    return "Unknown event";
+}
+
+/***************************************************************************
+ * pci express capability helper functions
+ */
+void pcie_notify(PCIDevice *dev, uint16_t vector, bool trigger, int level)
+{
+    /* masking/masking interrupt is handled by upper layer.
+     * i.e. msix_notify() for MSI-X
+     *      msi_notify()  for MSI
+     *      pci_set_irq() for INTx
+     */
+    PCIE_DEV_PRINTF(dev, "noitfy vector %d tirgger:%d level:%d\n",
+                    vector, trigger, level);
+    if (msix_enabled(dev)) {
+        if (trigger) {
+            msix_notify(dev, vector);
+        }
+    } else if (msi_enabled(dev)) {
+        if (trigger){
+            msi_notify(dev, vector);
+        }
+    } else {
+        qemu_set_irq(dev->irq[0], level);
+    }
+}
+
+int pcie_cap_init(PCIDevice *dev, uint8_t offset, uint8_t type, uint8_t port)
+{
+    int exp_cap;
+    uint8_t *pcie_cap;
+
+    assert(pci_is_express(dev));
+
+    exp_cap = pci_add_capability(dev, PCI_CAP_ID_EXP, offset,
+                                 PCI_EXP_VER2_SIZEOF);
+    if (exp_cap < 0) {
+        return exp_cap;
+    }
+    dev->exp.exp_cap = exp_cap;
+
+    /* already done in pci_qdev_init() */
+    assert(dev->cap_present & QEMU_PCI_CAP_EXPRESS);
+
+    pcie_cap = dev->config + pci_pcie_cap(dev);
+
+    /* capability register
+       interrupt message number defaults to 0 */
+    pci_set_word(pcie_cap + PCI_EXP_FLAGS,
+                 ((type << PCI_EXP_FLAGS_TYPE_SHIFT) & PCI_EXP_FLAGS_TYPE) |
+                 PCI_EXP_FLAGS_VER2);
+
+    /* device capability register
+     * table 7-12:
+     * roll based error reporting bit must be set by all
+     * Functions conforming to the ECN, PCI Express Base
+     * Specification, Revision 1.1., or subsequent PCI Express Base
+     * Specification revisions.
+     */
+    pci_set_long(pcie_cap + PCI_EXP_DEVCAP, PCI_EXP_DEVCAP_RBER);
+
+    pci_set_long(pcie_cap + PCI_EXP_LNKCAP,
+                 (port << PCI_EXP_LNKCAP_PN_SHIFT) |
+                 PCI_EXP_LNKCAP_ASPMS_0S |
+                 PCI_EXP_LNK_MLW_1 |
+                 PCI_EXP_LNK_LS_25);
+
+    pci_set_word(pcie_cap + PCI_EXP_LNKSTA,
+                 PCI_EXP_LNK_MLW_1 | PCI_EXP_LNK_LS_25);
+
+    pci_set_long(pcie_cap + PCI_EXP_DEVCAP2,
+                 PCI_EXP_DEVCAP2_EFF | PCI_EXP_DEVCAP2_EETLPP);
+
+    pci_set_word(dev->wmask + exp_cap, PCI_EXP_DEVCTL2_EETLPPB);
+    return exp_cap;
+}
+
+void pcie_cap_exit(PCIDevice *dev)
+{
+    pci_del_capability(dev, PCI_CAP_ID_EXP, PCI_EXP_VER2_SIZEOF);
+}
+
+uint8_t pcie_cap_get_type(const PCIDevice *dev)
+{
+    uint32_t pos = pci_pcie_cap(dev);
+    assert(pos > 0);
+    return (pci_get_word(dev->config + pos + PCI_EXP_FLAGS) &
+            PCI_EXP_FLAGS_TYPE) >> PCI_EXP_FLAGS_TYPE_SHIFT;
+}
+
+/* MSI/MSI-X */
+/* pci express interrupt message number */
+void pcie_cap_flags_set_vector(PCIDevice *dev, uint8_t vector)
+{
+    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
+    uint16_t tmp;
+
+    assert(vector <= 32);
+    tmp = pci_get_word(pcie_cap + PCI_EXP_FLAGS);
+    tmp &= ~PCI_EXP_FLAGS_IRQ;
+    tmp |= vector << PCI_EXP_FLAGS_IRQ_SHIFT;
+    pci_set_word(pcie_cap + PCI_EXP_FLAGS, tmp);
+}
+
+uint8_t pcie_cap_flags_get_vector(PCIDevice *dev)
+{
+    return (pci_get_word(dev->config + pci_pcie_cap(dev) + PCI_EXP_FLAGS) &
+            PCI_EXP_FLAGS_IRQ) >> PCI_EXP_FLAGS_IRQ_SHIFT;
+}
+
+void pcie_cap_deverr_init(PCIDevice *dev)
+{
+    uint32_t pos = pci_pcie_cap(dev);
+    uint8_t *pcie_cap = dev->config + pos;
+    uint8_t *pcie_wmask = dev->wmask + pos;
+    uint8_t *pcie_w1cmask = dev->wmask + pos;
+
+    pci_set_long(pcie_cap + PCI_EXP_DEVCAP,
+                 pci_get_long(pcie_cap + PCI_EXP_DEVCAP) |
+                 PCI_EXP_DEVCAP_RBER);
+
+    pci_set_long(pcie_wmask + PCI_EXP_DEVCTL,
+                 pci_get_long(pcie_wmask + PCI_EXP_DEVCTL) |
+                 PCI_EXP_DEVCTL_CERE | PCI_EXP_DEVCTL_NFERE |
+                 PCI_EXP_DEVCTL_FERE | PCI_EXP_DEVCTL_URRE);
+
+    pci_set_long(pcie_w1cmask + PCI_EXP_DEVSTA,
+                 pci_get_long(pcie_w1cmask + PCI_EXP_DEVSTA) |
+                 PCI_EXP_DEVSTA_CED | PCI_EXP_DEVSTA_NFED |
+                 PCI_EXP_DEVSTA_URD | PCI_EXP_DEVSTA_URD);
+}
+
+void pcie_cap_deverr_reset(PCIDevice *dev)
+{
+    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
+    pci_set_long(pcie_cap + PCI_EXP_DEVCTL,
+                 pci_get_long(pcie_cap + PCI_EXP_DEVCTL) &
+                 ~(PCI_EXP_DEVCTL_CERE | PCI_EXP_DEVCTL_NFERE |
+                   PCI_EXP_DEVCTL_FERE | PCI_EXP_DEVCTL_URRE));
+}
+
+/*
+ * events: PCI_EXP_HP_EV_xxx
+ * status: bit or of PCI_EXP_SLTSTA_xxx
+ */
+static void pcie_cap_slot_event(PCIDevice *dev,
+                                enum PCIExpressHotPlugEvent events,
+                                uint16_t status)
+{
+    bool trigger;
+    int level;
+    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
+    uint16_t sltctl = pci_get_word(pcie_cap + PCI_EXP_SLTCTL);
+    uint16_t sltsta = pci_get_word(pcie_cap + PCI_EXP_SLTSTA);
+
+    PCIE_DEV_PRINTF(dev,
+                    "sltctl: 0x%0x2 sltsta: 0x%02x event:%x %s status:%d\n",
+                    sltctl, sltsta,
+                    events, pcie_hp_event_name(events), status);
+    events &= PCI_EXP_HP_EV_SUPPORTED;
+    if ((sltctl & PCI_EXP_SLTCTL_HPIE) && (sltctl & events) &&
+        ((sltsta ^ events) & events) /* 0 -> 1 */) {
+        trigger = true;
+    } else {
+        trigger = false;
+    }
+
+    if (events & PCI_EXP_HP_EV_PDC) {
+        sltsta &= ~PCI_EXP_SLTSTA_PDS;
+        sltsta |= (status & PCI_EXP_SLTSTA_PDS);
+    }
+    sltsta |= events;
+    pci_set_word(pcie_cap + PCI_EXP_SLTSTA, sltsta);
+    PCIE_DEV_PRINTF(dev, "sltsta -> %02xn", sltsta);
+
+    if ((sltctl & PCI_EXP_SLTCTL_HPIE) && (sltsta & PCI_EXP_HP_EV_SUPPORTED)) {
+        level = 1;
+    } else {
+        level = 0;
+    }
+
+    pcie_notify(dev, pcie_cap_flags_get_vector(dev), trigger, level);
+}
+
+static int pcie_cap_slot_hotplug(DeviceState *qdev,
+                                 PCIDevice *pci_dev, int state)
+{
+    PCIDevice *d = DO_UPCAST(PCIDevice, qdev, qdev);
+    uint8_t *pcie_cap = d->config + pci_pcie_cap(d);
+    uint16_t sltsta = pci_get_word(pcie_cap + PCI_EXP_SLTSTA);
+
+    if (!pci_dev->qdev.hotplugged) {
+        assert(state); /* this case only happens machine creation. */
+        sltsta |= PCI_EXP_SLTSTA_PDS;
+        pci_set_word(pcie_cap + PCI_EXP_SLTSTA, sltsta);
+        return 0;
+    }
+
+    PCIE_DEV_PRINTF(pci_dev, "hotplug state: %d\n", state);
+    if (sltsta & PCI_EXP_SLTSTA_EIS) {
+        /* the slot is electromechanically locked. */
+        return -EBUSY;
+    }
+
+    if (state) {
+        if (PCI_FUNC(pci_dev->devfn) == 0) {
+            /* event is per slot. Not per function
+             * only generates event for function = 0.
+             * When hot plug, populate functions > 0
+             * and then add function = 0 last.
+             */
+            pcie_cap_slot_event(d, PCI_EXP_HP_EV_PDC, PCI_EXP_SLTSTA_PDS);
+        }
+    } else {
+        PCIBridge *br;
+        PCIBus *bus;
+        DeviceState *next;
+        if (PCI_FUNC(pci_dev->devfn) != 0) {
+            /* event is per slot. Not per function.
+               accepts function = 0 only. */
+            return -EINVAL;
+        }
+
+        /* zap all functions. */
+        br = DO_UPCAST(PCIBridge, dev, d);
+        bus = pci_bridge_get_sec_bus(br);
+        QLIST_FOREACH_SAFE(qdev, &bus->qbus.children, sibling, next) {
+            qdev_free(qdev);
+        }
+
+        pcie_cap_slot_event(d, PCI_EXP_HP_EV_PDC, 0);
+    }
+    return 0;
+}
+
+/* pci express slot for pci express root/downstream port
+   PCI express capability slot registers */
+void pcie_cap_slot_init(PCIDevice *dev, uint16_t slot)
+{
+    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
+    uint8_t *pcie_wmask = dev->wmask + pci_pcie_cap(dev);
+    uint8_t *pcie_w1cmask = dev->w1cmask + pci_pcie_cap(dev);
+    uint32_t tmp;
+
+    pci_set_word(pcie_cap + PCI_EXP_FLAGS,
+                 pci_get_word(pcie_cap + PCI_EXP_FLAGS) | PCI_EXP_FLAGS_SLOT);
+
+    tmp = pci_get_long(pcie_cap + PCI_EXP_SLTCAP);
+    tmp &= PCI_EXP_SLTCAP_PSN;
+    tmp |=
+        (slot << PCI_EXP_SLTCAP_PSN_SHIFT) |
+        PCI_EXP_SLTCAP_EIP |
+        PCI_EXP_SLTCAP_HPS |
+        PCI_EXP_SLTCAP_HPC |
+        PCI_EXP_SLTCAP_PIP |
+        PCI_EXP_SLTCAP_AIP |
+        PCI_EXP_SLTCAP_ABP;
+    pci_set_long(pcie_cap + PCI_EXP_SLTCAP, tmp);
+
+    tmp = pci_get_word(pcie_cap + PCI_EXP_SLTCTL);
+    tmp &= ~(PCI_EXP_SLTCTL_PIC | PCI_EXP_SLTCTL_AIC);
+    tmp |= PCI_EXP_SLTCTL_PIC_OFF | PCI_EXP_SLTCTL_AIC_OFF;
+    pci_set_word(pcie_cap + PCI_EXP_SLTCTL, tmp);
+    pci_set_word(pcie_wmask + PCI_EXP_SLTCTL,
+                 pci_get_word(pcie_wmask + PCI_EXP_SLTCTL) |
+                 PCI_EXP_SLTCTL_PIC |
+                 PCI_EXP_SLTCTL_AIC |
+                 PCI_EXP_SLTCTL_HPIE |
+                 PCI_EXP_SLTCTL_CCIE |
+                 PCI_EXP_SLTCTL_PDCE |
+                 PCI_EXP_SLTCTL_ABPE);
+
+    pci_set_word(pcie_w1cmask + PCI_EXP_SLTSTA,
+                 pci_get_word(pcie_w1cmask + PCI_EXP_SLTSTA) |
+                 PCI_EXP_HP_EV_SUPPORTED);
+
+    pci_bus_hotplug(pci_bridge_get_sec_bus(DO_UPCAST(PCIBridge, dev, dev)),
+                    pcie_cap_slot_hotplug, &dev->qdev);
+}
+
+void pcie_cap_slot_reset(PCIDevice *dev)
+{
+    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
+    uint32_t tmp;
+
+    PCIE_DEV_PRINTF(dev, "reset\n");
+
+    tmp = pci_get_word(pcie_cap + PCI_EXP_SLTCTL);
+    tmp &= ~(PCI_EXP_SLTCTL_EIC |
+             PCI_EXP_SLTCTL_PIC |
+             PCI_EXP_SLTCTL_AIC |
+             PCI_EXP_SLTCTL_HPIE |
+             PCI_EXP_SLTCTL_CCIE |
+             PCI_EXP_SLTCTL_PDCE |
+             PCI_EXP_SLTCTL_ABPE);
+    tmp |= PCI_EXP_SLTCTL_PIC_OFF | PCI_EXP_SLTCTL_AIC_OFF;
+    pci_set_word(pcie_cap + PCI_EXP_SLTCTL, tmp);
+
+    tmp = pci_get_word(pcie_cap + PCI_EXP_SLTSTA);
+    tmp &= ~(PCI_EXP_SLTSTA_EIS | /* by reset, the lock is released */
+             PCI_EXP_SLTSTA_CC |
+             PCI_EXP_SLTSTA_PDC |
+             PCI_EXP_SLTSTA_ABP);
+    pci_set_word(pcie_cap + PCI_EXP_SLTSTA, tmp);
+}
+
+void pcie_cap_slot_write_config(PCIDevice *dev,
+                                uint32_t addr, uint32_t val, int len,
+                                uint16_t sltctl_prev)
+{
+    uint32_t pos = pci_pcie_cap(dev);
+    uint8_t *pcie_cap = dev->config + pos;
+    uint16_t sltctl = pci_get_word(pcie_cap + PCI_EXP_SLTCTL);
+    uint16_t sltsta = pci_get_word(pcie_cap + PCI_EXP_SLTSTA);
+
+    PCIE_DEV_PRINTF(dev,
+                    "addr: 0x%x val: 0x%x len: %d\n"
+                    "\tsltctl_prev: 0x%02x sltctl: 0x%02x sltsta 0x%02x\n",
+                    addr, val, len, sltctl_prev, sltctl, sltsta);
+    /* SLTSTA: process SLTSTA before SLTCTL to avoid spurious interrupt */
+    if (ranges_overlap(addr, len, pos + PCI_EXP_SLTSTA, 2)) {
+        sltsta = pci_get_word(pcie_cap + PCI_EXP_SLTSTA);
+
+        /* write to stlsta results in clearing bits,
+           so new interrupts won't be generated. */
+        PCIE_DEV_PRINTF(dev, "sltsta -> 0x%02x\n", sltsta);
+    }
+
+    /* SLTCTL */
+    if (ranges_overlap(addr, len, pos + PCI_EXP_SLTCTL, 2)) {
+        PCIE_DEV_PRINTF(dev, "sltctl: 0x%02x -> 0x%02x\n",
+                        sltctl_prev, sltctl);
+        if (pci_shift_word(addr, val, pos + PCI_EXP_SLTCTL) &
+            PCI_EXP_SLTCTL_EIC) {
+            /* toggle PCI_EXP_SLTSTA_EIS */
+            sltsta = (sltsta & ~PCI_EXP_SLTSTA_EIS) |
+                ((sltsta ^ PCI_EXP_SLTSTA_EIS) & PCI_EXP_SLTSTA_EIS);
+            pci_set_word(pcie_cap + PCI_EXP_SLTSTA, sltsta);
+            PCIE_DEV_PRINTF(dev, "PCI_EXP_SLTCTL_EIC: sltsta -> 0x%02x\n",
+                            sltsta);
+        }
+
+        if (sltctl & PCI_EXP_SLTCTL_HPIE) {
+            bool trigger;
+            int level;
+
+            if (((sltctl_prev ^ sltctl) & sltctl) & PCI_EXP_HP_EV_SUPPORTED) {
+                /* 0 -> 1 */
+                trigger = true;
+            } else {
+                trigger = false;
+            }
+            if ((sltctl & sltsta) & PCI_EXP_HP_EV_SUPPORTED) {
+                level = 1;
+            } else {
+                level = 0;
+            }
+            pcie_notify(dev, pcie_cap_flags_get_vector(dev), trigger, level);
+        }
+
+        if (!((sltctl_prev ^ sltctl) & PCI_EXP_SLTCTL_SUPPORTED)) {
+            PCIE_DEV_PRINTF(dev,
+                            "sprious command completion slctl 0x%x -> 0x%x\n",
+                            sltctl_prev, sltctl);
+        }
+
+        /* command completion.
+         * Real hardware might take a while to complete
+         * requested command because physical movement would be involved
+         * like locking the electromechanical lock.
+         * However in our case, command is completed instantaneously above,
+         * so send a command completion event right now.
+         */
+        /* set command completed bit */
+        pcie_cap_slot_event(dev, PCI_EXP_HP_EV_CCI, 0);
+    }
+}
+
+void pcie_cap_slot_push_attention_button(PCIDevice *dev)
+{
+    pcie_cap_slot_event(dev, PCI_EXP_HP_EV_ABP, 0);
+}
+
+/* root control/capabilities/status. PME isn't emulated for now */
+void pcie_cap_root_init(PCIDevice *dev)
+{
+    uint8_t pos = pci_pcie_cap(dev);
+    pci_set_word(dev->wmask + pos + PCI_EXP_RTCTL,
+                 PCI_EXP_RTCTL_SECEE | PCI_EXP_RTCTL_SENFEE |
+                 PCI_EXP_RTCTL_SEFEE);
+}
+
+void pcie_cap_root_reset(PCIDevice *dev)
+{
+    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
+    pci_set_word(pcie_cap + PCI_EXP_RTCTL, 0);
+}
+
+/* function level reset(FLR) */
+void pcie_cap_flr_init(PCIDevice *dev, pcie_flr_fn flr)
+{
+    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
+    pci_set_word(pcie_cap + PCI_EXP_DEVCAP,
+                 pci_get_word(pcie_cap + PCI_EXP_DEVCAP) | PCI_EXP_DEVCAP_FLR);
+    dev->exp.flr = flr;
+}
+
+void pcie_cap_flr_write_config(PCIDevice *dev,
+                               uint32_t addr, uint32_t val, int len)
+{
+    uint32_t pos = pci_pcie_cap(dev);
+    if (ranges_overlap(addr, len, pos + PCI_EXP_DEVCTL, 2)) {
+        uint16_t val16 = pci_shift_word(addr, val, pos + PCI_EXP_DEVCTL);
+        if ((val16 & PCI_EXP_DEVCTL_BCR_FLR) && dev->exp.flr) {
+            dev->exp.flr(dev);
+        }
+    }
+}
+
+/* Alternative Routing-ID Interpretation (ARI) */
+/* ari forwarding support for down stream port */
+void pcie_cap_ari_init(PCIDevice *dev)
+{
+    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
+    uint8_t *pcie_wmask = dev->wmask + pci_pcie_cap(dev);
+
+    pci_set_long(pcie_cap + PCI_EXP_DEVCAP2,
+                 pci_get_long(pcie_cap + PCI_EXP_DEVCAP2) |
+                 PCI_EXP_DEVCAP2_ARI);
+
+    pci_set_long(pcie_wmask + PCI_EXP_DEVCTL2,
+                 pci_get_long(pcie_wmask + PCI_EXP_DEVCTL2) |
+                 PCI_EXP_DEVCTL2_ARI);
+}
+
+void pcie_cap_ari_reset(PCIDevice *dev)
+{
+    uint8_t *pcie_cap = dev->config + pci_pcie_cap(dev);
+
+    pci_set_long(pcie_cap + PCI_EXP_DEVCTL2,
+                 pci_get_long(pcie_cap + PCI_EXP_DEVCTL2) &
+                 ~PCI_EXP_DEVCTL2_ARI);
+}
+
+bool pcie_cap_is_ari_enabled(const PCIDevice *dev)
+{
+    if (!pci_is_express(dev)) {
+        return false;
+    }
+    if (!pci_pcie_cap(dev)) {
+        return false;
+    }
+
+    return pci_get_long(dev->config + pci_pcie_cap(dev) + PCI_EXP_DEVCTL2) &
+        PCI_EXP_DEVCTL2_ARI;
+}
+
+/**************************************************************************
+ * pci express extended capability allocation functions
+ * uint16_t ext_cap_id (16 bit)
+ * uint8_t cap_ver (4 bit)
+ * uint16_t cap_offset (12 bit)
+ * uint16_t ext_cap_size
+ */
+
+#define PCI_EXT_CAP_NO_ID       ((uint16_t)0)   /* 0 is reserved cap id.
+                                                 * use internally to find the
+                                                 * last capability in the
+                                                 * linked list
+                                                 */
+
+static uint16_t pcie_find_capability_list(PCIDevice *dev, uint16_t cap_id,
+                                          uint16_t *prev_p)
+{
+    uint16_t prev = 0;
+    uint16_t next = PCI_CONFIG_SPACE_SIZE;
+    uint32_t header = pci_get_long(dev->config + next);
+
+    if (!header) {
+        /* no extended capability */
+        next = 0;
+        goto out;
+    }
+
+    while (next) {
+        assert(next >= PCI_CONFIG_SPACE_SIZE);
+        assert(next <= PCIE_CONFIG_SPACE_SIZE - 8);
+
+        header = pci_get_long(dev->config + next);
+        if (PCI_EXT_CAP_ID(header) == cap_id) {
+            break;
+        }
+        prev = next;
+        next = PCI_EXT_CAP_NEXT(header);
+    }
+
+out:
+    if (prev_p) {
+        *prev_p = prev;
+    }
+    return next;
+}
+
+uint16_t pcie_find_capability(PCIDevice *dev, uint16_t cap_id)
+{
+    return pcie_find_capability_list(dev, cap_id, NULL);
+}
+
+static void pcie_ext_cap_set_next(PCIDevice *dev, uint16_t pos, uint16_t next)
+{
+    uint16_t header = pci_get_long(dev->config + pos);
+    assert(!(next & (PCI_EXT_CAP_ALIGN - 1)));
+    header = (header & ~PCI_EXT_CAP_NEXT_MASK) |
+        ((next << PCI_EXT_CAP_NEXT_SHIFT) & PCI_EXT_CAP_NEXT_MASK);
+    pci_set_long(dev->config + pos, header);
+}
+
+/*
+ * caller must supply valid (offset, size) * such that the range shouldn't
+ * overlap with other capability or other registers.
+ * This function doesn't check it.
+ */
+void pcie_add_capability(PCIDevice *dev,
+                         uint16_t cap_id, uint8_t cap_ver,
+                         uint16_t offset, uint16_t size)
+{
+    uint32_t header;
+    uint16_t next;
+
+    assert(offset >= PCI_CONFIG_SPACE_SIZE);
+    assert(offset < offset + size);
+    assert(offset + size < PCIE_CONFIG_SPACE_SIZE);
+    assert(size >= 8);
+    assert(pci_is_express(dev));
+
+    if (offset == PCI_CONFIG_SPACE_SIZE) {
+        header = pci_get_long(dev->config + offset);
+        next = PCI_EXT_CAP_NEXT(header);
+    } else {
+        uint16_t prev;
+        next = pcie_find_capability_list(dev, PCI_EXT_CAP_NO_ID, &prev);
+        assert(next == 0);
+        pcie_ext_cap_set_next(dev, prev, offset);
+    }
+    pci_set_long(dev->config + offset, PCI_EXT_CAP(cap_id, cap_ver, next));
+
+    /* Make capability read-only by default */
+    memset(dev->wmask + offset, 0, size);
+    /* Check capability by default */
+    memset(dev->cmask + offset, 0xFF, size);
+}
+
+void pcie_del_capability(PCIDevice *dev, uint16_t cap_id, uint16_t size)
+{
+    uint16_t prev;
+    uint16_t offset = pcie_find_capability_list(dev, cap_id, &prev);
+    uint32_t header;
+
+    assert(offset >= PCI_CONFIG_SPACE_SIZE);
+    header = pci_get_long(dev->config + offset);
+    if (prev) {
+        pcie_ext_cap_set_next(dev, prev, PCI_EXT_CAP_NEXT(header));
+    } else {
+        /* move up next ext cap to PCI_CONFIG_SPACE_SIZE? */
+        assert(offset == PCI_CONFIG_SPACE_SIZE);
+        pci_set_long(dev->config + offset,
+                     PCI_EXT_CAP(0, 0, PCI_EXT_CAP_NEXT(header)));
+    }
+
+    /* Make those registers read-only reserved zero */
+    memset(dev->config + offset, 0, size);
+    memset(dev->wmask + offset, 0, size);
+    /* Clear cmask as device-specific registers can't be checked */
+    memset(dev->cmask + offset, 0, size);
+}
+
+/**************************************************************************
+ * pci express extended capability helper functions
+ */
+
+/* ARI */
+void pcie_ari_init(PCIDevice *dev, uint16_t offset, uint16_t nextfn)
+{
+    pcie_add_capability(dev, PCI_EXT_CAP_ID_ARI, PCI_ARI_VER,
+                        offset, PCI_ARI_SIZEOF);
+    pci_set_long(dev->config + offset + PCI_ARI_CAP, PCI_ARI_CAP_NFN(nextfn));
+}
diff --git a/hw/pcie.h b/hw/pcie.h
new file mode 100644
index 0000000..37713dc
--- /dev/null
+++ b/hw/pcie.h
@@ -0,0 +1,96 @@ 
+/*
+ * pcie.h
+ *
+ * Copyright (c) 2010 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef QEMU_PCIE_H
+#define QEMU_PCIE_H
+
+#include "hw.h"
+#include "pcie_regs.h"
+
+enum PCIExpressIndicator {
+    /* for attention and power indicator */
+    PCI_EXP_HP_IND_RESERVED     = PCI_EXP_SLTCTL_IND_RESERVED,
+    PCI_EXP_HP_IND_ON           = PCI_EXP_SLTCTL_IND_ON,
+    PCI_EXP_HP_IND_BLINK        = PCI_EXP_SLTCTL_IND_BLINK,
+    PCI_EXP_HP_IND_OFF          = PCI_EXP_SLTCTL_IND_OFF,
+};
+
+enum PCIExpressHotPlugEvent {
+    /* the bits match the bits in Slot Control/Status registers.
+     * PCI_EXP_HP_EV_xxx = PCI_EXP_SLTCTL_xxxE = PCI_EXP_SLTSTA_xxx
+     */
+    PCI_EXP_HP_EV_ABP           = 0x01,         /* attention button preseed */
+    PCI_EXP_HP_EV_PDC           = 0x08,         /* presence detect changed */
+    PCI_EXP_HP_EV_CCI           = 0x10,         /* command completed */
+
+    PCI_EXP_HP_EV_SUPPORTED     = 0x19,         /* supported event mask  */
+    /* events not listed aren't supported */
+};
+
+typedef void (*pcie_flr_fn)(PCIDevice *dev);
+
+struct PCIExpressDevice {
+    /* Offset of express capability in config space */
+    uint8_t exp_cap;
+
+    /* FLR */
+    pcie_flr_fn flr;
+};
+
+void pcie_notify(PCIDevice *dev, uint16_t vector, bool trigger, int level);
+
+/* PCI express capability helper functions */
+int pcie_cap_init(PCIDevice *dev, uint8_t offset, uint8_t type, uint8_t port);
+void pcie_cap_exit(PCIDevice *dev);
+uint8_t pcie_cap_get_type(const PCIDevice *dev);
+void pcie_cap_flags_set_vector(PCIDevice *dev, uint8_t vector);
+uint8_t pcie_cap_flags_get_vector(PCIDevice *dev);
+
+void pcie_cap_deverr_init(PCIDevice *dev);
+void pcie_cap_deverr_reset(PCIDevice *dev);
+
+void pcie_cap_slot_init(PCIDevice *dev, uint16_t slot);
+void pcie_cap_slot_reset(PCIDevice *dev);
+void pcie_cap_slot_write_config(PCIDevice *dev,
+                                uint32_t addr, uint32_t val, int len,
+                                uint16_t sltctl_prev);
+void pcie_cap_slot_push_attention_button(PCIDevice *dev);
+
+void pcie_cap_root_init(PCIDevice *dev);
+void pcie_cap_root_reset(PCIDevice *dev);
+
+void pcie_cap_flr_init(PCIDevice *dev, pcie_flr_fn flr);
+void pcie_cap_flr_write_config(PCIDevice *dev,
+                           uint32_t addr, uint32_t val, int len);
+
+void pcie_cap_ari_init(PCIDevice *dev);
+void pcie_cap_ari_reset(PCIDevice *dev);
+bool pcie_cap_is_ari_enabled(const PCIDevice *dev);
+
+/* PCI express extended capability helper functions */
+uint16_t pcie_find_capability(PCIDevice *dev, uint16_t cap_id);
+void pcie_add_capability(PCIDevice *dev,
+                         uint16_t cap_id, uint8_t cap_ver,
+                         uint16_t offset, uint16_t size);
+void pcie_del_capability(PCIDevice *dev, uint16_t cap_id, uint16_t size);
+
+void pcie_ari_init(PCIDevice *dev, uint16_t offset, uint16_t nextfn);
+
+#endif /* QEMU_PCIE_H */
diff --git a/qemu-common.h b/qemu-common.h
index d735235..6d9ee26 100644
--- a/qemu-common.h
+++ b/qemu-common.h
@@ -219,6 +219,7 @@  typedef struct PCIHostState PCIHostState;
 typedef struct PCIExpressHost PCIExpressHost;
 typedef struct PCIBus PCIBus;
 typedef struct PCIDevice PCIDevice;
+typedef struct PCIExpressDevice PCIExpressDevice;
 typedef struct PCIBridge PCIBridge;
 typedef struct SerialState SerialState;
 typedef struct IRQState *qemu_irq;