Patchwork pci: Length-align config space accesses

login
register
mail settings
Submitter Jan Kiszka
Date July 19, 2011, 9:39 p.m.
Message ID <4E25F976.8010200@web.de>
Download mbox | patch
Permalink /patch/105548/
State New
Headers show

Comments

Jan Kiszka - July 19, 2011, 9:39 p.m.
From: Jan Kiszka <jan.kiszka@siemens.com>

Introduce pci_config_read/write helpers to split up config space
accesses that are not length-aligned. This particularly avoids that each
and every device needs to check for config space overruns. Also move the
access length assertion to the new helpers.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
---

This will also simplify my cap refactorings for the device-assignment
code in qemu-kvm.

 hw/pci.c       |   43 +++++++++++++++++++++++++++++++++++++++----
 hw/pci.h       |    5 +++++
 hw/pci_host.c  |   14 ++++++--------
 hw/pcie_host.c |   12 ++++++------
 4 files changed, 56 insertions(+), 18 deletions(-)
Isaku Yamahata - July 20, 2011, noon
Hi. This clean up looks good basically.
But when conventional pci device is accessed via MMCONFIG area,
addr &= addr_mask doesn't work as expected.
The config area of [256, 4K) of conventional pci should have no effect.

thanks,

On Tue, Jul 19, 2011 at 11:39:02PM +0200, Jan Kiszka wrote:
> From: Jan Kiszka <jan.kiszka@siemens.com>
> 
> Introduce pci_config_read/write helpers to split up config space
> accesses that are not length-aligned. This particularly avoids that each
> and every device needs to check for config space overruns. Also move the
> access length assertion to the new helpers.
> 
> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> ---
> 
> This will also simplify my cap refactorings for the device-assignment
> code in qemu-kvm.
> 
>  hw/pci.c       |   43 +++++++++++++++++++++++++++++++++++++++----
>  hw/pci.h       |    5 +++++
>  hw/pci_host.c  |   14 ++++++--------
>  hw/pcie_host.c |   12 ++++++------
>  4 files changed, 56 insertions(+), 18 deletions(-)
> 
> diff --git a/hw/pci.c b/hw/pci.c
> index b904a4e..6957ece 100644
> --- a/hw/pci.c
> +++ b/hw/pci.c
> @@ -1104,12 +1104,48 @@ static void pci_update_irq_disabled(PCIDevice *d, int was_irq_disabled)
>      }
>  }
>  
> +void pci_config_write(PCIDevice *pci_dev, uint32_t addr, uint32_t addr_mask,
> +                      uint32_t val, uint32_t len)
> +{
> +    assert(len == 1 || len == 2 || len == 4);
> +
> +    addr &= addr_mask;
> +
> +    if (addr & (len - 1)) {
> +        len >>= 1;
> +        pci_config_write(pci_dev, addr, addr_mask, val, len);
> +        pci_config_write(pci_dev, addr + len, addr_mask,
> +                         val >> (len * 8), len);
> +    } else {
> +        pci_dev->config_write(pci_dev, addr, val, len);
> +    }
> +}
> +
> +uint32_t pci_config_read(PCIDevice *pci_dev, uint32_t addr, uint32_t addr_mask,
> +                         uint32_t len)
> +{
> +    uint32_t val;
> +
> +    assert(len == 1 || len == 2 || len == 4);
> +
> +    addr &= addr_mask;
> +
> +    if (addr & (len - 1)) {
> +        len >>= 1;
> +        val = pci_config_read(pci_dev, addr, addr_mask, len);
> +        val |= pci_config_read(pci_dev, addr + len,
> +                               addr_mask, len) << (len * 8);
> +    } else {
> +        val = pci_dev->config_read(pci_dev, addr, len);
> +    }
> +    return val;
> +}
> +
>  uint32_t pci_default_read_config(PCIDevice *d,
>                                   uint32_t address, int len)
>  {
>      uint32_t val = 0;
> -    assert(len == 1 || len == 2 || len == 4);
> -    len = MIN(len, pci_config_size(d) - address);
> +
>      memcpy(&val, d->config + address, len);
>      return le32_to_cpu(val);
>  }
> @@ -1117,9 +1153,8 @@ uint32_t pci_default_read_config(PCIDevice *d,
>  void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val, int l)
>  {
>      int i, was_irq_disabled = pci_irq_disabled(d);
> -    uint32_t config_size = pci_config_size(d);
>  
> -    for (i = 0; i < l && addr + i < config_size; val >>= 8, ++i) {
> +    for (i = 0; i < l; val >>= 8, ++i) {
>          uint8_t wmask = d->wmask[addr + i];
>          uint8_t w1cmask = d->w1cmask[addr + i];
>          assert(!(wmask & w1cmask));
> diff --git a/hw/pci.h b/hw/pci.h
> index c220745..3d5b39c 100644
> --- a/hw/pci.h
> +++ b/hw/pci.h
> @@ -482,4 +482,9 @@ static inline uint32_t pci_config_size(const PCIDevice *d)
>      return pci_is_express(d) ? PCIE_CONFIG_SPACE_SIZE : PCI_CONFIG_SPACE_SIZE;
>  }
>  
> +void pci_config_write(PCIDevice *pci_dev, uint32_t addr, uint32_t addr_mask,
> +                      uint32_t val, uint32_t len);
> +uint32_t pci_config_read(PCIDevice *pci_dev, uint32_t addr, uint32_t addr_mask,
> +                         uint32_t len);
> +
>  #endif
> diff --git a/hw/pci_host.c b/hw/pci_host.c
> index 728e2d4..db888fb 100644
> --- a/hw/pci_host.c
> +++ b/hw/pci_host.c
> @@ -50,30 +50,28 @@ static inline PCIDevice *pci_dev_find_by_addr(PCIBus *bus, uint32_t addr)
>  void pci_data_write(PCIBus *s, uint32_t addr, uint32_t val, int len)
>  {
>      PCIDevice *pci_dev = pci_dev_find_by_addr(s, addr);
> -    uint32_t config_addr = addr & (PCI_CONFIG_SPACE_SIZE - 1);
>  
> -    if (!pci_dev)
> +    if (!pci_dev) {
>          return;
> +    }
>  
>      PCI_DPRINTF("%s: %s: addr=%02" PRIx32 " val=%08" PRIx32 " len=%d\n",
> -                __func__, pci_dev->name, config_addr, val, len);
> -    pci_dev->config_write(pci_dev, config_addr, val, len);
> +                __func__, pci_dev->name, addr, val, len);
> +    pci_config_write(pci_dev, addr, PCI_CONFIG_SPACE_SIZE - 1, val, len);
>  }
>  
>  uint32_t pci_data_read(PCIBus *s, uint32_t addr, int len)
>  {
>      PCIDevice *pci_dev = pci_dev_find_by_addr(s, addr);
> -    uint32_t config_addr = addr & (PCI_CONFIG_SPACE_SIZE - 1);
>      uint32_t val;
>  
> -    assert(len == 1 || len == 2 || len == 4);
>      if (!pci_dev) {
>          return ~0x0;
>      }
>  
> -    val = pci_dev->config_read(pci_dev, config_addr, len);
> +    val = pci_config_read(pci_dev, addr, PCI_CONFIG_SPACE_SIZE - 1, len);
>      PCI_DPRINTF("%s: %s: addr=%02"PRIx32" val=%08"PRIx32" len=%d\n",
> -                __func__, pci_dev->name, config_addr, val, len);
> +                __func__, pci_dev->name, addr, val, len);
>  
>      return val;
>  }
> diff --git a/hw/pcie_host.c b/hw/pcie_host.c
> index b749865..e10d7a3 100644
> --- a/hw/pcie_host.c
> +++ b/hw/pcie_host.c
> @@ -57,22 +57,22 @@ static void pcie_mmcfg_data_write(PCIBus *s,
>  {
>      PCIDevice *pci_dev = pcie_dev_find_by_mmcfg_addr(s, mmcfg_addr);
>  
> -    if (!pci_dev)
> +    if (!pci_dev) {
>          return;
> -
> -    pci_dev->config_write(pci_dev,
> -                          PCIE_MMCFG_CONFOFFSET(mmcfg_addr), val, len);
> +    }
> +    pci_config_write(pci_dev, PCIE_MMCFG_CONFOFFSET(mmcfg_addr),
> +                     PCIE_MMCFG_CONFOFFSET_MASK, val, len);
>  }
>  
>  static uint32_t pcie_mmcfg_data_read(PCIBus *s, uint32_t addr, int len)
>  {
>      PCIDevice *pci_dev = pcie_dev_find_by_mmcfg_addr(s, addr);
>  
> -    assert(len == 1 || len == 2 || len == 4);
>      if (!pci_dev) {
>          return ~0x0;
>      }
> -    return pci_dev->config_read(pci_dev, PCIE_MMCFG_CONFOFFSET(addr), len);
> +    return pci_config_read(pci_dev, PCIE_MMCFG_CONFOFFSET(addr),
> +                           PCIE_MMCFG_CONFOFFSET_MASK, len);
>  }
>  
>  static void pcie_mmcfg_data_writeb(void *opaque,
> -- 
> 1.7.3.4
>
Jan Kiszka - July 20, 2011, 12:15 p.m.
On 2011-07-20 14:00, Isaku Yamahata wrote:
> Hi. This clean up looks good basically.

Oops, forgot to cc you. Sorry.

> But when conventional pci device is accessed via MMCONFIG area,
> addr &= addr_mask doesn't work as expected.
> The config area of [256, 4K) of conventional pci should have no effect.

Mmh, I see. Looks like we need to split accesses at this boundary and
executed them separately.

Jan
Michael S. Tsirkin - July 20, 2011, 2:22 p.m.
On Tue, Jul 19, 2011 at 11:39:02PM +0200, Jan Kiszka wrote:
> From: Jan Kiszka <jan.kiszka@siemens.com>
> 
> Introduce pci_config_read/write helpers to split up config space
> accesses that are not length-aligned. This particularly avoids that each
> and every device needs to check for config space overruns. Also move the
> access length assertion to the new helpers.
> 
> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>

Nice, this is possible now that everyone uses pci_host.
Some comments below.

> ---
> 
> This will also simplify my cap refactorings for the device-assignment
> code in qemu-kvm.
> 
>  hw/pci.c       |   43 +++++++++++++++++++++++++++++++++++++++----
>  hw/pci.h       |    5 +++++
>  hw/pci_host.c  |   14 ++++++--------
>  hw/pcie_host.c |   12 ++++++------
>  4 files changed, 56 insertions(+), 18 deletions(-)
> 
> diff --git a/hw/pci.c b/hw/pci.c
> index b904a4e..6957ece 100644
> --- a/hw/pci.c
> +++ b/hw/pci.c
> @@ -1104,12 +1104,48 @@ static void pci_update_irq_disabled(PCIDevice *d, int was_irq_disabled)
>      }
>  }
>  
> +void pci_config_write(PCIDevice *pci_dev, uint32_t addr, uint32_t addr_mask,
> +                      uint32_t val, uint32_t len)

Let's put these in pci_host.h/pci_host.c, and prefix in a way
so we don't want devices to use this mistakenly.
Also please add a comment here saying what this does.
Maybe also note how, if some system device ever does
need misaligned accesses, it will need some special hacks.

> +{
> +    assert(len == 1 || len == 2 || len == 4);
> +
> +    addr &= addr_mask;

A 4 byte access at address 255 would wrap around to 0 with this,
while previously we ignored high bits on write and returned
0 on read.

I'd say let's check config size. With some more comments
this gets:

	if (addr >= pci_config_size(pci_dev)) {
		return;
	}

	if (likely(!(addr & (len - 1)))) {
	       pci_dev->config_write(pci_dev, addr, val, len);
	       return;
	}

	/* Handle unaligned access: split it to two accesses each
	 * half the length of the original one. */
	len = len / 2;
	pci_host_config_write(pci_dev, addr, addr_mask, val, len);
	pci_host_config_write(pci_dev, addr + len, config_size, val >> (len * 8), len);

I am kind of unhappy with adding recursion here but
it does seem to be the least of evils, and
it is rare (and can't happen on the pc).

> +
> +    if (addr & (len - 1)) {

Let's add an unlikely here.

> +        len >>= 1;

I know it's the same but I prefer len /= 2;

> +        pci_config_write(pci_dev, addr, addr_mask, val, len);
> +        pci_config_write(pci_dev, addr + len, addr_mask,
> +                         val >> (len * 8), len);
> +    } else {
> +        pci_dev->config_write(pci_dev, addr, val, len);
> +    }
> +}
> +
> +uint32_t pci_config_read(PCIDevice *pci_dev, uint32_t addr, uint32_t addr_mask,
> +                         uint32_t len)
> +{
> +    uint32_t val;
> +
> +    assert(len == 1 || len == 2 || len == 4);
> +
> +    addr &= addr_mask;
> +
> +    if (addr & (len - 1)) {
> +        len >>= 1;
> +        val = pci_config_read(pci_dev, addr, addr_mask, len);
> +        val |= pci_config_read(pci_dev, addr + len,
> +                               addr_mask, len) << (len * 8);
> +    } else {
> +        val = pci_dev->config_read(pci_dev, addr, len);
> +    }
> +    return val;
> +}
> +
>  uint32_t pci_default_read_config(PCIDevice *d,
>                                   uint32_t address, int len)
>  {
>      uint32_t val = 0;
> -    assert(len == 1 || len == 2 || len == 4);
> -    len = MIN(len, pci_config_size(d) - address);
> +
>      memcpy(&val, d->config + address, len);
>      return le32_to_cpu(val);
>  }
> @@ -1117,9 +1153,8 @@ uint32_t pci_default_read_config(PCIDevice *d,
>  void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val, int l)
>  {
>      int i, was_irq_disabled = pci_irq_disabled(d);
> -    uint32_t config_size = pci_config_size(d);
>  
> -    for (i = 0; i < l && addr + i < config_size; val >>= 8, ++i) {
> +    for (i = 0; i < l; val >>= 8, ++i) {
>          uint8_t wmask = d->wmask[addr + i];
>          uint8_t w1cmask = d->w1cmask[addr + i];
>          assert(!(wmask & w1cmask));
> diff --git a/hw/pci.h b/hw/pci.h
> index c220745..3d5b39c 100644
> --- a/hw/pci.h
> +++ b/hw/pci.h
> @@ -482,4 +482,9 @@ static inline uint32_t pci_config_size(const PCIDevice *d)
>      return pci_is_express(d) ? PCIE_CONFIG_SPACE_SIZE : PCI_CONFIG_SPACE_SIZE;
>  }
>  
> +void pci_config_write(PCIDevice *pci_dev, uint32_t addr, uint32_t addr_mask,
> +                      uint32_t val, uint32_t len);
> +uint32_t pci_config_read(PCIDevice *pci_dev, uint32_t addr, uint32_t addr_mask,
> +                         uint32_t len);
> +
>  #endif
> diff --git a/hw/pci_host.c b/hw/pci_host.c
> index 728e2d4..db888fb 100644
> --- a/hw/pci_host.c
> +++ b/hw/pci_host.c
> @@ -50,30 +50,28 @@ static inline PCIDevice *pci_dev_find_by_addr(PCIBus *bus, uint32_t addr)
>  void pci_data_write(PCIBus *s, uint32_t addr, uint32_t val, int len)
>  {
>      PCIDevice *pci_dev = pci_dev_find_by_addr(s, addr);
> -    uint32_t config_addr = addr & (PCI_CONFIG_SPACE_SIZE - 1);
>  
> -    if (!pci_dev)
> +    if (!pci_dev) {
>          return;
> +    }
>  
>      PCI_DPRINTF("%s: %s: addr=%02" PRIx32 " val=%08" PRIx32 " len=%d\n",
> -                __func__, pci_dev->name, config_addr, val, len);
> -    pci_dev->config_write(pci_dev, config_addr, val, len);
> +                __func__, pci_dev->name, addr, val, len);
> +    pci_config_write(pci_dev, addr, PCI_CONFIG_SPACE_SIZE - 1, val, len);
>  }
>  
>  uint32_t pci_data_read(PCIBus *s, uint32_t addr, int len)
>  {
>      PCIDevice *pci_dev = pci_dev_find_by_addr(s, addr);
> -    uint32_t config_addr = addr & (PCI_CONFIG_SPACE_SIZE - 1);
>      uint32_t val;
>  
> -    assert(len == 1 || len == 2 || len == 4);
>      if (!pci_dev) {
>          return ~0x0;
>      }
>  
> -    val = pci_dev->config_read(pci_dev, config_addr, len);
> +    val = pci_config_read(pci_dev, addr, PCI_CONFIG_SPACE_SIZE - 1, len);
>      PCI_DPRINTF("%s: %s: addr=%02"PRIx32" val=%08"PRIx32" len=%d\n",
> -                __func__, pci_dev->name, config_addr, val, len);
> +                __func__, pci_dev->name, addr, val, len);
>  
>      return val;
>  }
> diff --git a/hw/pcie_host.c b/hw/pcie_host.c
> index b749865..e10d7a3 100644
> --- a/hw/pcie_host.c
> +++ b/hw/pcie_host.c
> @@ -57,22 +57,22 @@ static void pcie_mmcfg_data_write(PCIBus *s,
>  {
>      PCIDevice *pci_dev = pcie_dev_find_by_mmcfg_addr(s, mmcfg_addr);
>  
> -    if (!pci_dev)
> +    if (!pci_dev) {
>          return;
> -
> -    pci_dev->config_write(pci_dev,
> -                          PCIE_MMCFG_CONFOFFSET(mmcfg_addr), val, len);
> +    }
> +    pci_config_write(pci_dev, PCIE_MMCFG_CONFOFFSET(mmcfg_addr),
> +                     PCIE_MMCFG_CONFOFFSET_MASK, val, len);
>  }
>  
>  static uint32_t pcie_mmcfg_data_read(PCIBus *s, uint32_t addr, int len)
>  {
>      PCIDevice *pci_dev = pcie_dev_find_by_mmcfg_addr(s, addr);
>  
> -    assert(len == 1 || len == 2 || len == 4);
>      if (!pci_dev) {
>          return ~0x0;
>      }
> -    return pci_dev->config_read(pci_dev, PCIE_MMCFG_CONFOFFSET(addr), len);
> +    return pci_config_read(pci_dev, PCIE_MMCFG_CONFOFFSET(addr),
> +                           PCIE_MMCFG_CONFOFFSET_MASK, len);
>  }
>  
>  static void pcie_mmcfg_data_writeb(void *opaque,
> -- 
> 1.7.3.4
Jan Kiszka - July 20, 2011, 2:27 p.m.
On 2011-07-20 14:15, Jan Kiszka wrote:
> On 2011-07-20 14:00, Isaku Yamahata wrote:
>> Hi. This clean up looks good basically.
> 
> Oops, forgot to cc you. Sorry.
> 
>> But when conventional pci device is accessed via MMCONFIG area,
>> addr &= addr_mask doesn't work as expected.
>> The config area of [256, 4K) of conventional pci should have no effect.
> 
> Mmh, I see. Looks like we need to split accesses at this boundary and
> executed them separately.

Nope, no such issue: we already automatically split up accesses that
span the legacy/extended boundary. Just like so far, legacy config space
handlers have to filter out requests that address regions >= 256.

Jan
Jan Kiszka - July 20, 2011, 2:36 p.m.
On 2011-07-20 16:22, Michael S. Tsirkin wrote:
> On Tue, Jul 19, 2011 at 11:39:02PM +0200, Jan Kiszka wrote:
>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>
>> Introduce pci_config_read/write helpers to split up config space
>> accesses that are not length-aligned. This particularly avoids that each
>> and every device needs to check for config space overruns. Also move the
>> access length assertion to the new helpers.
>>
>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> 
> Nice, this is possible now that everyone uses pci_host.
> Some comments below.
> 
>> ---
>>
>> This will also simplify my cap refactorings for the device-assignment
>> code in qemu-kvm.
>>
>>  hw/pci.c       |   43 +++++++++++++++++++++++++++++++++++++++----
>>  hw/pci.h       |    5 +++++
>>  hw/pci_host.c  |   14 ++++++--------
>>  hw/pcie_host.c |   12 ++++++------
>>  4 files changed, 56 insertions(+), 18 deletions(-)
>>
>> diff --git a/hw/pci.c b/hw/pci.c
>> index b904a4e..6957ece 100644
>> --- a/hw/pci.c
>> +++ b/hw/pci.c
>> @@ -1104,12 +1104,48 @@ static void pci_update_irq_disabled(PCIDevice *d, int was_irq_disabled)
>>      }
>>  }
>>  
>> +void pci_config_write(PCIDevice *pci_dev, uint32_t addr, uint32_t addr_mask,
>> +                      uint32_t val, uint32_t len)
> 
> Let's put these in pci_host.h/pci_host.c, and prefix in a way
> so we don't want devices to use this mistakenly.

I didn't find direct relations between pcie_host.c and pci_host.c in the
current code, so went for pci.c - but no problem to change.

pci_host_config_read/write?

> Also please add a comment here saying what this does.
> Maybe also note how, if some system device ever does
> need misaligned accesses, it will need some special hacks.

OK.

> 
>> +{
>> +    assert(len == 1 || len == 2 || len == 4);
>> +
>> +    addr &= addr_mask;
> 
> A 4 byte access at address 255 would wrap around to 0 with this,
> while previously we ignored high bits on write and returned
> 0 on read.

The question is rather what the spec or real hw demand from us. Do you
have any insights on this? I didn't find hints which behavior is more
compliant. It looks like such access patterns are deep into bug land anyway.

> 
> I'd say let's check config size. With some more comments
> this gets:
> 
> 	if (addr >= pci_config_size(pci_dev)) {
> 		return;
> 	}
> 
> 	if (likely(!(addr & (len - 1)))) {
> 	       pci_dev->config_write(pci_dev, addr, val, len);
> 	       return;
> 	}
> 
> 	/* Handle unaligned access: split it to two accesses each
> 	 * half the length of the original one. */
> 	len = len / 2;
> 	pci_host_config_write(pci_dev, addr, addr_mask, val, len);
> 	pci_host_config_write(pci_dev, addr + len, config_size, val >> (len * 8), len);
> 
> I am kind of unhappy with adding recursion here but
> it does seem to be the least of evils, and
> it is rare (and can't happen on the pc).

+ recursion depth is naturally limited.

> 
>> +
>> +    if (addr & (len - 1)) {
> 
> Let's add an unlikely here.
> 
>> +        len >>= 1;
> 
> I know it's the same but I prefer len /= 2;

OK for both.

Thanks,
Jan
Michael S. Tsirkin - July 20, 2011, 3:47 p.m.
On Wed, Jul 20, 2011 at 04:36:51PM +0200, Jan Kiszka wrote:
> > A 4 byte access at address 255 would wrap around to 0 with this,
> > while previously we ignored high bits on write and returned
> > 0 on read.
> 
> The question is rather what the spec or real hw demand from us. Do you
> have any insights on this? I didn't find hints which behavior is more
> compliant. It looks like such access patterns are deep into bug land anyway.

For the express side, we don't need to support accesses that cross a 4
byte boundary - but guests might be able to generate them.
Just in case, I'd prefer keeping the current behaviour WRT that.
Paul, do you know which guests can do this?

However, as far as I can tell accesses within a dword can have
any bytes enabled. So our interface for passing that info to
devices isn't all that good, and I'm not 100% sure splitting
up to length aligned transactions is better.
Maybe passing in a byte_enable mask is a better idea.


For example, as far as I an tell sysfs let you write/read
any length (it splits it up internally ATM) so
it seems somewhat useless to do that work in
userspace as well.
We will, however, have to see how does the code look in the end.
Isaku Yamahata - July 20, 2011, 4:17 p.m.
On Wed, Jul 20, 2011 at 04:27:08PM +0200, Jan Kiszka wrote:
> On 2011-07-20 14:15, Jan Kiszka wrote:
> > On 2011-07-20 14:00, Isaku Yamahata wrote:
> >> Hi. This clean up looks good basically.
> > 
> > Oops, forgot to cc you. Sorry.
> > 
> >> But when conventional pci device is accessed via MMCONFIG area,
> >> addr &= addr_mask doesn't work as expected.
> >> The config area of [256, 4K) of conventional pci should have no effect.
> > 
> > Mmh, I see. Looks like we need to split accesses at this boundary and
> > executed them separately.
> 
> Nope, no such issue: we already automatically split up accesses that
> span the legacy/extended boundary. Just like so far, legacy config space
> handlers have to filter out requests that address regions >= 256.

For example, when accessing to offset 257 of conventional pci device,
the access is routed to offset 1 due to the masking.
Such overwrapping isn't correct.
Jan Kiszka - July 20, 2011, 4:18 p.m.
On 2011-07-20 18:17, Isaku Yamahata wrote:
> On Wed, Jul 20, 2011 at 04:27:08PM +0200, Jan Kiszka wrote:
>> On 2011-07-20 14:15, Jan Kiszka wrote:
>>> On 2011-07-20 14:00, Isaku Yamahata wrote:
>>>> Hi. This clean up looks good basically.
>>>
>>> Oops, forgot to cc you. Sorry.
>>>
>>>> But when conventional pci device is accessed via MMCONFIG area,
>>>> addr &= addr_mask doesn't work as expected.
>>>> The config area of [256, 4K) of conventional pci should have no effect.
>>>
>>> Mmh, I see. Looks like we need to split accesses at this boundary and
>>> executed them separately.
>>
>> Nope, no such issue: we already automatically split up accesses that
>> span the legacy/extended boundary. Just like so far, legacy config space
>> handlers have to filter out requests that address regions >= 256.
> 
> For example, when accessing to offset 257 of conventional pci device,
> the access is routed to offset 1 due to the masking.
> Such overwrapping isn't correct.

No, it isn't routed like that. The mask used via mmio is 0xfff.

Jan
Michael S. Tsirkin - July 20, 2011, 4:19 p.m.
On Thu, Jul 21, 2011 at 01:17:02AM +0900, Isaku Yamahata wrote:
> On Wed, Jul 20, 2011 at 04:27:08PM +0200, Jan Kiszka wrote:
> > On 2011-07-20 14:15, Jan Kiszka wrote:
> > > On 2011-07-20 14:00, Isaku Yamahata wrote:
> > >> Hi. This clean up looks good basically.
> > > 
> > > Oops, forgot to cc you. Sorry.
> > > 
> > >> But when conventional pci device is accessed via MMCONFIG area,
> > >> addr &= addr_mask doesn't work as expected.
> > >> The config area of [256, 4K) of conventional pci should have no effect.
> > > 
> > > Mmh, I see. Looks like we need to split accesses at this boundary and
> > > executed them separately.
> > 
> > Nope, no such issue: we already automatically split up accesses that
> > span the legacy/extended boundary. Just like so far, legacy config space
> > handlers have to filter out requests that address regions >= 256.
> 
> For example, when accessing to offset 257 of conventional pci device,
> the access is routed to offset 1 due to the masking.
> Such overwrapping isn't correct.

Can guest trigger this on some systems? I think it can't on a pc, right?

> -- 
> yamahata
Isaku Yamahata - July 20, 2011, 4:33 p.m.
On Wed, Jul 20, 2011 at 06:18:43PM +0200, Jan Kiszka wrote:
> On 2011-07-20 18:17, Isaku Yamahata wrote:
> > On Wed, Jul 20, 2011 at 04:27:08PM +0200, Jan Kiszka wrote:
> >> On 2011-07-20 14:15, Jan Kiszka wrote:
> >>> On 2011-07-20 14:00, Isaku Yamahata wrote:
> >>>> Hi. This clean up looks good basically.
> >>>
> >>> Oops, forgot to cc you. Sorry.
> >>>
> >>>> But when conventional pci device is accessed via MMCONFIG area,
> >>>> addr &= addr_mask doesn't work as expected.
> >>>> The config area of [256, 4K) of conventional pci should have no effect.
> >>>
> >>> Mmh, I see. Looks like we need to split accesses at this boundary and
> >>> executed them separately.
> >>
> >> Nope, no such issue: we already automatically split up accesses that
> >> span the legacy/extended boundary. Just like so far, legacy config space
> >> handlers have to filter out requests that address regions >= 256.
> > 
> > For example, when accessing to offset 257 of conventional pci device,
> > the access is routed to offset 1 due to the masking.
> > Such overwrapping isn't correct.
> 
> No, it isn't routed like that. The mask used via mmio is 0xfff.

Ah sorry, you're right.
So pci_default_{read, write}_config() need to check it.
Michael S. Tsirkin - July 20, 2011, 4:45 p.m.
On Wed, Jul 20, 2011 at 06:18:43PM +0200, Jan Kiszka wrote:
> On 2011-07-20 18:17, Isaku Yamahata wrote:
> > On Wed, Jul 20, 2011 at 04:27:08PM +0200, Jan Kiszka wrote:
> >> On 2011-07-20 14:15, Jan Kiszka wrote:
> >>> On 2011-07-20 14:00, Isaku Yamahata wrote:
> >>>> Hi. This clean up looks good basically.
> >>>
> >>> Oops, forgot to cc you. Sorry.
> >>>
> >>>> But when conventional pci device is accessed via MMCONFIG area,
> >>>> addr &= addr_mask doesn't work as expected.
> >>>> The config area of [256, 4K) of conventional pci should have no effect.
> >>>
> >>> Mmh, I see. Looks like we need to split accesses at this boundary and
> >>> executed them separately.
> >>
> >> Nope, no such issue: we already automatically split up accesses that
> >> span the legacy/extended boundary. Just like so far, legacy config space
> >> handlers have to filter out requests that address regions >= 256.
> > 
> > For example, when accessing to offset 257 of conventional pci device,
> > the access is routed to offset 1 due to the masking.
> > Such overwrapping isn't correct.
> 
> No, it isn't routed like that. The mask used via mmio is 0xfff.
> 
> Jan

I thought about this some more, I'd like to see how devices
are going to benefit. Any examples?
If not, is it easier to simply make this logic
part of dev assignment?


> -- 
> Siemens AG, Corporate Technology, CT T DE IT 1
> Corporate Competence Center Embedded Linux
Jan Kiszka - July 20, 2011, 5:10 p.m.
On 2011-07-20 18:45, Michael S. Tsirkin wrote:
> On Wed, Jul 20, 2011 at 06:18:43PM +0200, Jan Kiszka wrote:
>> On 2011-07-20 18:17, Isaku Yamahata wrote:
>>> On Wed, Jul 20, 2011 at 04:27:08PM +0200, Jan Kiszka wrote:
>>>> On 2011-07-20 14:15, Jan Kiszka wrote:
>>>>> On 2011-07-20 14:00, Isaku Yamahata wrote:
>>>>>> Hi. This clean up looks good basically.
>>>>>
>>>>> Oops, forgot to cc you. Sorry.
>>>>>
>>>>>> But when conventional pci device is accessed via MMCONFIG area,
>>>>>> addr &= addr_mask doesn't work as expected.
>>>>>> The config area of [256, 4K) of conventional pci should have no effect.
>>>>>
>>>>> Mmh, I see. Looks like we need to split accesses at this boundary and
>>>>> executed them separately.
>>>>
>>>> Nope, no such issue: we already automatically split up accesses that
>>>> span the legacy/extended boundary. Just like so far, legacy config space
>>>> handlers have to filter out requests that address regions >= 256.
>>>
>>> For example, when accessing to offset 257 of conventional pci device,
>>> the access is routed to offset 1 due to the masking.
>>> Such overwrapping isn't correct.
>>
>> No, it isn't routed like that. The mask used via mmio is 0xfff.
>>
>> Jan
> 
> I thought about this some more, I'd like to see how devices
> are going to benefit. Any examples?

No in-tree device currently gets beyond the "write default, check range"
pattern.

> If not, is it easier to simply make this logic
> part of dev assignment?

That's a question of clean interfaces. Not checking at the core means
exposing invalid accesses to the callbacks.

The logic is required anyway, so better put it in a central place. E.g.
if the Xen people push their device assignment as well, we will suddenly
have more than one user.

Jan
Michael S. Tsirkin - July 20, 2011, 8:16 p.m.
On Wed, Jul 20, 2011 at 07:10:57PM +0200, Jan Kiszka wrote:
> On 2011-07-20 18:45, Michael S. Tsirkin wrote:
> > On Wed, Jul 20, 2011 at 06:18:43PM +0200, Jan Kiszka wrote:
> >> On 2011-07-20 18:17, Isaku Yamahata wrote:
> >>> On Wed, Jul 20, 2011 at 04:27:08PM +0200, Jan Kiszka wrote:
> >>>> On 2011-07-20 14:15, Jan Kiszka wrote:
> >>>>> On 2011-07-20 14:00, Isaku Yamahata wrote:
> >>>>>> Hi. This clean up looks good basically.
> >>>>>
> >>>>> Oops, forgot to cc you. Sorry.
> >>>>>
> >>>>>> But when conventional pci device is accessed via MMCONFIG area,
> >>>>>> addr &= addr_mask doesn't work as expected.
> >>>>>> The config area of [256, 4K) of conventional pci should have no effect.
> >>>>>
> >>>>> Mmh, I see. Looks like we need to split accesses at this boundary and
> >>>>> executed them separately.
> >>>>
> >>>> Nope, no such issue: we already automatically split up accesses that
> >>>> span the legacy/extended boundary. Just like so far, legacy config space
> >>>> handlers have to filter out requests that address regions >= 256.
> >>>
> >>> For example, when accessing to offset 257 of conventional pci device,
> >>> the access is routed to offset 1 due to the masking.
> >>> Such overwrapping isn't correct.
> >>
> >> No, it isn't routed like that. The mask used via mmio is 0xfff.
> >>
> >> Jan
> > 
> > I thought about this some more, I'd like to see how devices
> > are going to benefit. Any examples?
> 
> No in-tree device currently gets beyond the "write default, check range"
> pattern.
> 
> > If not, is it easier to simply make this logic
> > part of dev assignment?
> 
> That's a question of clean interfaces. Not checking at the core means
> exposing invalid accesses to the callbacks.

That's the point: at least the express spec seems to explicitly allow
any accesses within a dword, and disallow accesses crossing dwords.
And there's no way to create such on the PCI bus either.

So it makes sense to handle cross-dword accesses in a special way
(possibly disallow them altogether, need to check non-pc systems
for this).

But non-aligned access within a dword is legal according to the spec.

> The logic is required anyway, so better put it in a central place. E.g.
> if the Xen people push their device assignment as well, we will suddenly
> have more than one user.
> 
> Jan

That's the question: why is the logic required?

Patch

diff --git a/hw/pci.c b/hw/pci.c
index b904a4e..6957ece 100644
--- a/hw/pci.c
+++ b/hw/pci.c
@@ -1104,12 +1104,48 @@  static void pci_update_irq_disabled(PCIDevice *d, int was_irq_disabled)
     }
 }
 
+void pci_config_write(PCIDevice *pci_dev, uint32_t addr, uint32_t addr_mask,
+                      uint32_t val, uint32_t len)
+{
+    assert(len == 1 || len == 2 || len == 4);
+
+    addr &= addr_mask;
+
+    if (addr & (len - 1)) {
+        len >>= 1;
+        pci_config_write(pci_dev, addr, addr_mask, val, len);
+        pci_config_write(pci_dev, addr + len, addr_mask,
+                         val >> (len * 8), len);
+    } else {
+        pci_dev->config_write(pci_dev, addr, val, len);
+    }
+}
+
+uint32_t pci_config_read(PCIDevice *pci_dev, uint32_t addr, uint32_t addr_mask,
+                         uint32_t len)
+{
+    uint32_t val;
+
+    assert(len == 1 || len == 2 || len == 4);
+
+    addr &= addr_mask;
+
+    if (addr & (len - 1)) {
+        len >>= 1;
+        val = pci_config_read(pci_dev, addr, addr_mask, len);
+        val |= pci_config_read(pci_dev, addr + len,
+                               addr_mask, len) << (len * 8);
+    } else {
+        val = pci_dev->config_read(pci_dev, addr, len);
+    }
+    return val;
+}
+
 uint32_t pci_default_read_config(PCIDevice *d,
                                  uint32_t address, int len)
 {
     uint32_t val = 0;
-    assert(len == 1 || len == 2 || len == 4);
-    len = MIN(len, pci_config_size(d) - address);
+
     memcpy(&val, d->config + address, len);
     return le32_to_cpu(val);
 }
@@ -1117,9 +1153,8 @@  uint32_t pci_default_read_config(PCIDevice *d,
 void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val, int l)
 {
     int i, was_irq_disabled = pci_irq_disabled(d);
-    uint32_t config_size = pci_config_size(d);
 
-    for (i = 0; i < l && addr + i < config_size; val >>= 8, ++i) {
+    for (i = 0; i < l; val >>= 8, ++i) {
         uint8_t wmask = d->wmask[addr + i];
         uint8_t w1cmask = d->w1cmask[addr + i];
         assert(!(wmask & w1cmask));
diff --git a/hw/pci.h b/hw/pci.h
index c220745..3d5b39c 100644
--- a/hw/pci.h
+++ b/hw/pci.h
@@ -482,4 +482,9 @@  static inline uint32_t pci_config_size(const PCIDevice *d)
     return pci_is_express(d) ? PCIE_CONFIG_SPACE_SIZE : PCI_CONFIG_SPACE_SIZE;
 }
 
+void pci_config_write(PCIDevice *pci_dev, uint32_t addr, uint32_t addr_mask,
+                      uint32_t val, uint32_t len);
+uint32_t pci_config_read(PCIDevice *pci_dev, uint32_t addr, uint32_t addr_mask,
+                         uint32_t len);
+
 #endif
diff --git a/hw/pci_host.c b/hw/pci_host.c
index 728e2d4..db888fb 100644
--- a/hw/pci_host.c
+++ b/hw/pci_host.c
@@ -50,30 +50,28 @@  static inline PCIDevice *pci_dev_find_by_addr(PCIBus *bus, uint32_t addr)
 void pci_data_write(PCIBus *s, uint32_t addr, uint32_t val, int len)
 {
     PCIDevice *pci_dev = pci_dev_find_by_addr(s, addr);
-    uint32_t config_addr = addr & (PCI_CONFIG_SPACE_SIZE - 1);
 
-    if (!pci_dev)
+    if (!pci_dev) {
         return;
+    }
 
     PCI_DPRINTF("%s: %s: addr=%02" PRIx32 " val=%08" PRIx32 " len=%d\n",
-                __func__, pci_dev->name, config_addr, val, len);
-    pci_dev->config_write(pci_dev, config_addr, val, len);
+                __func__, pci_dev->name, addr, val, len);
+    pci_config_write(pci_dev, addr, PCI_CONFIG_SPACE_SIZE - 1, val, len);
 }
 
 uint32_t pci_data_read(PCIBus *s, uint32_t addr, int len)
 {
     PCIDevice *pci_dev = pci_dev_find_by_addr(s, addr);
-    uint32_t config_addr = addr & (PCI_CONFIG_SPACE_SIZE - 1);
     uint32_t val;
 
-    assert(len == 1 || len == 2 || len == 4);
     if (!pci_dev) {
         return ~0x0;
     }
 
-    val = pci_dev->config_read(pci_dev, config_addr, len);
+    val = pci_config_read(pci_dev, addr, PCI_CONFIG_SPACE_SIZE - 1, len);
     PCI_DPRINTF("%s: %s: addr=%02"PRIx32" val=%08"PRIx32" len=%d\n",
-                __func__, pci_dev->name, config_addr, val, len);
+                __func__, pci_dev->name, addr, val, len);
 
     return val;
 }
diff --git a/hw/pcie_host.c b/hw/pcie_host.c
index b749865..e10d7a3 100644
--- a/hw/pcie_host.c
+++ b/hw/pcie_host.c
@@ -57,22 +57,22 @@  static void pcie_mmcfg_data_write(PCIBus *s,
 {
     PCIDevice *pci_dev = pcie_dev_find_by_mmcfg_addr(s, mmcfg_addr);
 
-    if (!pci_dev)
+    if (!pci_dev) {
         return;
-
-    pci_dev->config_write(pci_dev,
-                          PCIE_MMCFG_CONFOFFSET(mmcfg_addr), val, len);
+    }
+    pci_config_write(pci_dev, PCIE_MMCFG_CONFOFFSET(mmcfg_addr),
+                     PCIE_MMCFG_CONFOFFSET_MASK, val, len);
 }
 
 static uint32_t pcie_mmcfg_data_read(PCIBus *s, uint32_t addr, int len)
 {
     PCIDevice *pci_dev = pcie_dev_find_by_mmcfg_addr(s, addr);
 
-    assert(len == 1 || len == 2 || len == 4);
     if (!pci_dev) {
         return ~0x0;
     }
-    return pci_dev->config_read(pci_dev, PCIE_MMCFG_CONFOFFSET(addr), len);
+    return pci_config_read(pci_dev, PCIE_MMCFG_CONFOFFSET(addr),
+                           PCIE_MMCFG_CONFOFFSET_MASK, len);
 }
 
 static void pcie_mmcfg_data_writeb(void *opaque,