diff mbox

[v4,3/7] Implement fw_cfg DMA interface

Message ID 1443701819-13855-4-git-send-email-markmb@redhat.com
State New
Headers show

Commit Message

Marc Marí Oct. 1, 2015, 12:16 p.m. UTC
Based on the specifications on docs/specs/fw_cfg.txt

This interface is an addon. The old interface can still be used as usual.

Based on Gerd Hoffman's initial implementation.

Signed-off-by: Marc Marí <markmb@redhat.com>
---
 hw/arm/virt.c             |   2 +-
 hw/nvram/fw_cfg.c         | 231 +++++++++++++++++++++++++++++++++++++++++++---
 include/hw/nvram/fw_cfg.h |  16 +++-
 3 files changed, 233 insertions(+), 16 deletions(-)

Comments

Laszlo Ersek Oct. 1, 2015, 2:36 p.m. UTC | #1
This looks good to me. Thanks for addressing my v3 request.

I have some new remarks here. I feel *really* bad for not finding them
earlier. (If you get tired of working on this series, I could pick it up
and try to shepherd it further.)

On 10/01/15 14:16, Marc Marí wrote:
> Based on the specifications on docs/specs/fw_cfg.txt
> 
> This interface is an addon. The old interface can still be used as usual.
> 
> Based on Gerd Hoffman's initial implementation.
> 
> Signed-off-by: Marc Marí <markmb@redhat.com>
> ---
>  hw/arm/virt.c             |   2 +-
>  hw/nvram/fw_cfg.c         | 231 +++++++++++++++++++++++++++++++++++++++++++---
>  include/hw/nvram/fw_cfg.h |  16 +++-
>  3 files changed, 233 insertions(+), 16 deletions(-)
> 
> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
> index d25d6cf..7ae984f 100644
> --- a/hw/arm/virt.c
> +++ b/hw/arm/virt.c
> @@ -683,7 +683,7 @@ static void create_fw_cfg(const VirtBoardInfo *vbi)
>      hwaddr size = vbi->memmap[VIRT_FW_CFG].size;
>      char *nodename;
>  
> -    fw_cfg_init_mem_wide(base + 8, base, 8);
> +    fw_cfg_init_mem_wide(base + 8, base, 8, 0, NULL);
>  
>      nodename = g_strdup_printf("/fw-cfg@%" PRIx64, base);
>      qemu_fdt_add_subnode(vbi->fdt, nodename);
> diff --git a/hw/nvram/fw_cfg.c b/hw/nvram/fw_cfg.c
> index 658f8c4..59933b3 100644
> --- a/hw/nvram/fw_cfg.c
> +++ b/hw/nvram/fw_cfg.c
> @@ -23,6 +23,7 @@
>   */
>  #include "hw/hw.h"
>  #include "sysemu/sysemu.h"
> +#include "sysemu/dma.h"
>  #include "hw/isa/isa.h"
>  #include "hw/nvram/fw_cfg.h"
>  #include "hw/sysbus.h"
> @@ -30,7 +31,7 @@
>  #include "qemu/error-report.h"
>  #include "qemu/config-file.h"
>  
> -#define FW_CFG_SIZE 2
> +#define FW_CFG_CTL_SIZE 2
>  #define FW_CFG_NAME "fw_cfg"
>  #define FW_CFG_PATH "/machine/" FW_CFG_NAME
>  
> @@ -42,6 +43,16 @@
>  #define FW_CFG_IO(obj)  OBJECT_CHECK(FWCfgIoState,  (obj), TYPE_FW_CFG_IO)
>  #define FW_CFG_MEM(obj) OBJECT_CHECK(FWCfgMemState, (obj), TYPE_FW_CFG_MEM)
>  
> +/* FW_CFG_VERSION bits */
> +#define FW_CFG_VERSION      0x01
> +#define FW_CFG_VERSION_DMA  0x02
> +
> +/* FW_CFG_DMA_CONTROL bits */
> +#define FW_CFG_DMA_CTL_ERROR   0x01
> +#define FW_CFG_DMA_CTL_READ    0x02
> +#define FW_CFG_DMA_CTL_SKIP    0x04
> +#define FW_CFG_DMA_CTL_SELECT  0x08
> +
>  typedef struct FWCfgEntry {
>      uint32_t len;
>      uint8_t *data;
> @@ -59,6 +70,10 @@ struct FWCfgState {
>      uint16_t cur_entry;
>      uint32_t cur_offset;
>      Notifier machine_ready;
> +
> +    bool dma_enabled;
> +    AddressSpace *dma_as;
> +    dma_addr_t dma_addr;
>  };
>  
>  struct FWCfgIoState {
> @@ -66,8 +81,8 @@ struct FWCfgIoState {
>      FWCfgState parent_obj;
>      /*< public >*/
>  
> -    MemoryRegion comb_iomem;
> -    uint32_t iobase;
> +    MemoryRegion comb_iomem, dma_iomem;
> +    uint32_t iobase, dma_iobase;
>  };
>  
>  struct FWCfgMemState {
> @@ -75,7 +90,7 @@ struct FWCfgMemState {
>      FWCfgState parent_obj;
>      /*< public >*/
>  
> -    MemoryRegion ctl_iomem, data_iomem;
> +    MemoryRegion ctl_iomem, data_iomem, dma_iomem;
>      uint32_t data_width;
>      MemoryRegionOps wide_data_ops;
>  };

(1) I *think* the new "dma_iomem" field, of type MemoryRegion, could be
moved up to the parent struct FWCfgEntry, from both FWCfgMemState and
FWCfgIoState. (And the references in the rest of the code could be updated.)

(

Independently, some loud thinking, mostly for myself: I've always been
surprised by the difference between (a) FWCfgIoState *carrying*
"dma_iobase" as a field -- and a property! --, and (b) FWCfgMemState
*not* carrying the same as a field -- nor as a property.

I think I finally understand this difference now. It is all rooted in
the difference between the internal APIs sysbus_add_io() and
sysbus_init_mmio(). Both of these are called from the device realize
functions, but the first (sysbus_add_io()) wants the IO port address at
once, whereas the second (sysbus_init_mmio()) doesn't want the address
-- the actual mapping (sysbus_mmio_map()) is delayed to board code; the
device code doesn't want to be aware of it.

And this ripples to the top. Because sysbus_add_io() wants the IO port
address, we must pass that address to the device realize function. And
for that, we need a device property -- "dma_iobase". This is not new, it
just follows the example of the preexistent "iobase" field / property.

Whereas, in the sysbus_init_mmio() case, we can keep the MMIO address
private to the board code; the realize function need not know the
address. However, the realize function does need to know the *fact* that
we're going to do DMA. Given that we must maintain this fact (in
"FWCfgState.dma_enabled") anyway, for other -- e.g. migration subsection
-- purposes as well, it makes sense to expose that same field of the
parent struct as a property, so we can set it in the memory mapped case
*before* the realize function looks at it.

I feel better now, thanks for listening.

)

Then,

> @@ -292,6 +307,119 @@ static void fw_cfg_data_mem_write(void *opaque, hwaddr addr,
>      } while (i);
>  }
>  
> +static void fw_cfg_dma_transfer(FWCfgState *s)
> +{
> +    dma_addr_t len;
> +    FWCfgDmaAccess dma;
> +    int arch;
> +    FWCfgEntry *e;
> +    int read;
> +    dma_addr_t dma_addr;
> +
> +    /* Reset the address before the next access */
> +    dma_addr = s->dma_addr;
> +    s->dma_addr = 0;
> +
> +    dma.address = ldq_be_dma(s->dma_as,
> +                            dma_addr + offsetof(FWCfgDmaAccess, address));
> +    dma.length = ldl_be_dma(s->dma_as,
> +                            dma_addr + offsetof(FWCfgDmaAccess, length));
> +    dma.control = ldl_be_dma(s->dma_as,
> +                            dma_addr + offsetof(FWCfgDmaAccess, control));
> +
> +    if (dma.control & FW_CFG_DMA_CTL_SELECT) {
> +        fw_cfg_select(s, dma.control >> 16);
> +    }
> +
> +    arch = !!(s->cur_entry & FW_CFG_ARCH_LOCAL);
> +    e = &s->entries[arch][s->cur_entry & FW_CFG_ENTRY_MASK];
> +
> +    if (dma.control & FW_CFG_DMA_CTL_READ) {
> +        read = 1;
> +    } else if (dma.control & FW_CFG_DMA_CTL_SKIP) {
> +        read = 0;
> +    } else {
> +        dma.length = 0;

I can see you addressed Kevin's comment here.


> +    }
> +
> +    dma.control = 0;
> +
> +    while (dma.length > 0 && !(dma.control & FW_CFG_DMA_CTL_ERROR)) {
> +        if (s->cur_entry == FW_CFG_INVALID || !e->data ||
> +                                s->cur_offset >= e->len) {
> +            len = dma.length;
> +
> +            /* If the access is not a read access, it will be a skip access,
> +             * tested before.
> +             */
> +            if (read) {
> +                if (dma_memory_set(s->dma_as, dma.address, 0, len)) {
> +                    dma.control |= FW_CFG_DMA_CTL_ERROR;
> +                }
> +            }
> +
> +        } else {
> +            if (dma.length <= (e->len - s->cur_offset)) {
> +                len = dma.length;
> +            } else {
> +                len = (e->len - s->cur_offset);
> +            }
> +
> +            if (e->read_callback) {
> +                e->read_callback(e->callback_opaque, s->cur_offset);
> +            }
> +
> +            /* If the access is not a read access, it will be a skip access,
> +             * tested before.
> +             */
> +            if (read) {
> +                if (dma_memory_write(s->dma_as, dma.address,
> +                                    &e->data[s->cur_offset], len)) {
> +                    dma.control |= FW_CFG_DMA_CTL_ERROR;
> +                }
> +            }
> +
> +            s->cur_offset += len;
> +        }
> +
> +        dma.address += len;
> +        dma.length  -= len;
> +
> +    }
> +
> +    stl_be_dma(s->dma_as, dma_addr + offsetof(FWCfgDmaAccess, control),
> +                dma.control);
> +
> +    trace_fw_cfg_read(s, 0);
> +}

Seems OK to me.

> +
> +static void fw_cfg_dma_mem_write(void *opaque, hwaddr addr,
> +                                 uint64_t value, unsigned size)
> +{
> +    FWCfgState *s = opaque;
> +
> +    if (size == 4) {
> +        if (addr == 0) {
> +            /* FWCfgDmaAccess high address */
> +            s->dma_addr = value << 32;
> +        } else if (addr == 4) {
> +            /* FWCfgDmaAccess low address */
> +            s->dma_addr |= value;
> +            fw_cfg_dma_transfer(s);
> +        }
> +    } else if (size == 8 && addr == 0) {
> +        s->dma_addr = value;
> +        fw_cfg_dma_transfer(s);
> +    }
> +}

Seems to match the zeroing of s->dma_addr in fw_cfg_dma_transfer(). Good.

> +
> +static bool fw_cfg_dma_mem_valid(void *opaque, hwaddr addr,
> +                                  unsigned size, bool is_write)
> +{
> +    return is_write && ((size == 4 && (addr == 0 || addr == 4)) ||
> +                        (size == 8 && addr == 0));
> +}
> +
>  static bool fw_cfg_data_mem_valid(void *opaque, hwaddr addr,
>                                    unsigned size, bool is_write)
>  {
> @@ -359,6 +487,12 @@ static const MemoryRegionOps fw_cfg_comb_mem_ops = {
>      .valid.accepts = fw_cfg_comb_valid,
>  };
>  
> +static const MemoryRegionOps fw_cfg_dma_mem_ops = {
> +    .write = fw_cfg_dma_mem_write,
> +    .endianness = DEVICE_BIG_ENDIAN,
> +    .valid.accepts = fw_cfg_dma_mem_valid,
> +};

(2) Okay. This is somewhat important, and *completely* non-intuitive,
unfortunately.

Without setting *both*

    .valid.max_access_size = 8,
    .impl.max_access_size = 8,

here, the memory subsystem will split up all 8-byte wide accesses (from
the guest side) to two 4-byte wide calls to fw_cfg_dma_mem_write()).

Those calls do satisfy the ordering logic in fw_cfg_dma_mem_write(), but
nonetheless, the lack of the above setting makes the following code in
fw_cfg_dma_mem_write() dead:

> +    } else if (size == 8 && addr == 0) {
> +        s->dma_addr = value;
> +        fw_cfg_dma_transfer(s);
> +    }

(I verified this claim with gdb on aarch64.)

So, please initialize both of the above fields to 8.

> +
>  static void fw_cfg_reset(DeviceState *d)
>  {
>      FWCfgState *s = FW_CFG(d);
> @@ -399,6 +533,22 @@ static bool is_version_1(void *opaque, int version_id)
>      return version_id == 1;
>  }
>  
> +static bool fw_cfg_dma_enabled(void *opaque)
> +{
> +    FWCfgState *s = opaque;
> +
> +    return s->dma_enabled;
> +}
> +
> +static VMStateDescription vmstate_fw_cfg_dma = {
> +    .name = "fw_cfg/dma",
> +    .needed = fw_cfg_dma_enabled,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_UINT64(dma_addr, FWCfgState),
> +        VMSTATE_END_OF_LIST()
> +    },
> +};

Looks good to me. All fields that come from the command line (ie.
management layer) need not / must not be part of the migration stream.
And all data that is programmed by the guest, must. Here, "dma_addr" is
the only such item. Okay.

> +
>  static const VMStateDescription vmstate_fw_cfg = {
>      .name = "fw_cfg",
>      .version_id = 2,
> @@ -408,6 +558,10 @@ static const VMStateDescription vmstate_fw_cfg = {
>          VMSTATE_UINT16_HACK(cur_offset, FWCfgState, is_version_1),
>          VMSTATE_UINT32_V(cur_offset, FWCfgState, 2),
>          VMSTATE_END_OF_LIST()
> +    },
> +    .subsections = (const VMStateDescription*[]) {
> +        &vmstate_fw_cfg_dma,
> +        NULL,
>      }
>  };
>  
> @@ -593,7 +747,6 @@ static void fw_cfg_init1(DeviceState *dev)
>      qdev_init_nofail(dev);
>  
>      fw_cfg_add_bytes(s, FW_CFG_SIGNATURE, (char *)"QEMU", 4);
> -    fw_cfg_add_i32(s, FW_CFG_ID, 1);
>      fw_cfg_add_bytes(s, FW_CFG_UUID, qemu_uuid, 16);
>      fw_cfg_add_i16(s, FW_CFG_NOGRAPHIC, (uint16_t)(display_type == DT_NOGRAPHIC));
>      fw_cfg_add_i16(s, FW_CFG_NB_CPUS, (uint16_t)smp_cpus);

This is called from fw_cfg_init_io() and fw_cfg_init_mem_wide().

The former is renamed to fw_cfg_init_io_dma() -- and gets a wrapper
under the original name --, and sets FW_CFG_ID expliticly.

The latter sets FW_CFG_ID expliticly.

Okay.


> @@ -605,25 +758,52 @@ static void fw_cfg_init1(DeviceState *dev)
>      qemu_add_machine_init_done_notifier(&s->machine_ready);
>  }
>  
> -FWCfgState *fw_cfg_init_io(uint32_t iobase)
> +FWCfgState *fw_cfg_init_io_dma(uint32_t iobase, uint32_t dma_iobase,
> +                                AddressSpace *dma_as)
>  {
>      DeviceState *dev;
> +    FWCfgState *s;
> +    uint32_t version = FW_CFG_VERSION;
>  
>      dev = qdev_create(NULL, TYPE_FW_CFG_IO);
>      qdev_prop_set_uint32(dev, "iobase", iobase);
> +    qdev_prop_set_uint32(dev, "dma_iobase", dma_iobase);
> +
>      fw_cfg_init1(dev);
> +    s = FW_CFG(dev);
> +
> +    if (dma_as) {
> +        /* 64 bits for the address field */
> +        s->dma_as = dma_as;
> +        s->dma_enabled = true;
> +        s->dma_addr = 0;
> +
> +        version |= FW_CFG_VERSION_DMA;
> +    }
>  
> -    return FW_CFG(dev);
> +    fw_cfg_add_i32(s, FW_CFG_ID, version);
> +
> +    return s;
>  }
>  
> -FWCfgState *fw_cfg_init_mem_wide(hwaddr ctl_addr, hwaddr data_addr,
> -                                 uint32_t data_width)
> +FWCfgState *fw_cfg_init_io(uint32_t iobase)
> +{
> +    return fw_cfg_init_io_dma(iobase, 0, NULL);
> +}
> +
> +FWCfgState *fw_cfg_init_mem_wide(hwaddr ctl_addr,
> +                                 hwaddr data_addr, uint32_t data_width,
> +                                 hwaddr dma_addr, AddressSpace *dma_as)
>  {
>      DeviceState *dev;
>      SysBusDevice *sbd;
> +    FWCfgState *s;
> +    uint32_t version = FW_CFG_VERSION;
> +    bool dma_enabled = dma_addr && dma_as;
>  
>      dev = qdev_create(NULL, TYPE_FW_CFG_MEM);
>      qdev_prop_set_uint32(dev, "data_width", data_width);
> +    qdev_prop_set_bit(dev, "dma_enabled", dma_enabled);
>  
>      fw_cfg_init1(dev);
>  
> @@ -631,13 +811,25 @@ FWCfgState *fw_cfg_init_mem_wide(hwaddr ctl_addr, hwaddr data_addr,
>      sysbus_mmio_map(sbd, 0, ctl_addr);
>      sysbus_mmio_map(sbd, 1, data_addr);
>  
> -    return FW_CFG(dev);
> +    s = FW_CFG(dev);
> +
> +    if (dma_enabled) {
> +        s->dma_as = dma_as;
> +        s->dma_addr = 0;
> +        sysbus_mmio_map(sbd, 2, dma_addr);
> +        version |= FW_CFG_VERSION_DMA;
> +    }
> +
> +    fw_cfg_add_i32(s, FW_CFG_ID, version);
> +
> +    return s;
>  }
>  
>  FWCfgState *fw_cfg_init_mem(hwaddr ctl_addr, hwaddr data_addr)
>  {
>      return fw_cfg_init_mem_wide(ctl_addr, data_addr,
> -                                fw_cfg_data_mem_ops.valid.max_access_size);
> +                                fw_cfg_data_mem_ops.valid.max_access_size,
> +                                0, NULL);
>  }
>  
>  
> @@ -664,6 +856,7 @@ static const TypeInfo fw_cfg_info = {
>  
>  static Property fw_cfg_io_properties[] = {
>      DEFINE_PROP_UINT32("iobase", FWCfgIoState, iobase, -1),
> +    DEFINE_PROP_UINT32("dma_iobase", FWCfgIoState, dma_iobase, -1),
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> @@ -673,8 +866,12 @@ static void fw_cfg_io_realize(DeviceState *dev, Error **errp)
>      SysBusDevice *sbd = SYS_BUS_DEVICE(dev);
>  
>      memory_region_init_io(&s->comb_iomem, OBJECT(s), &fw_cfg_comb_mem_ops,
> -                          FW_CFG(s), "fwcfg", FW_CFG_SIZE);
> +                          FW_CFG(s), "fwcfg", FW_CFG_CTL_SIZE);
>      sysbus_add_io(sbd, s->iobase, &s->comb_iomem);
> +
> +    memory_region_init_io(&s->dma_iomem, OBJECT(s), &fw_cfg_dma_mem_ops,
> +                          FW_CFG(s), "fwcfg.dma", sizeof(dma_addr_t));
> +    sysbus_add_io(sbd, s->dma_iobase, &s->dma_iomem);
>  }

(3) Hmmmm. I think this should be made conditional. sysbus_add_io() maps
the region into IO port space immediately. Callers of fw_cfg_init_io()
should *not* reach sysbus_add_io(); it makes no sense to map the DMA
addr register at IO port 0.

(And then you can omit memory_region_init_io() as well, if dma_iobase is
zero.)

The rest of the code looks fine to me.

Again, I apologize for sucking this much at timely reviews lately. If
you fix (2) and (3) above -- optionally: (1) as well --, then you'll
have my R-b.

If you've lost your patience, I can pick up this series. :)

Thank you
Laszlo


>  
>  static void fw_cfg_io_class_init(ObjectClass *klass, void *data)
> @@ -695,6 +892,8 @@ static const TypeInfo fw_cfg_io_info = {
>  
>  static Property fw_cfg_mem_properties[] = {
>      DEFINE_PROP_UINT32("data_width", FWCfgMemState, data_width, -1),
> +    DEFINE_PROP_BOOL("dma_enabled", FWCfgMemState, parent_obj.dma_enabled,
> +                     false),
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> @@ -705,7 +904,7 @@ static void fw_cfg_mem_realize(DeviceState *dev, Error **errp)
>      const MemoryRegionOps *data_ops = &fw_cfg_data_mem_ops;
>  
>      memory_region_init_io(&s->ctl_iomem, OBJECT(s), &fw_cfg_ctl_mem_ops,
> -                          FW_CFG(s), "fwcfg.ctl", FW_CFG_SIZE);
> +                          FW_CFG(s), "fwcfg.ctl", FW_CFG_CTL_SIZE);
>      sysbus_init_mmio(sbd, &s->ctl_iomem);
>  
>      if (s->data_width > data_ops->valid.max_access_size) {
> @@ -723,6 +922,12 @@ static void fw_cfg_mem_realize(DeviceState *dev, Error **errp)
>      memory_region_init_io(&s->data_iomem, OBJECT(s), data_ops, FW_CFG(s),
>                            "fwcfg.data", data_ops->valid.max_access_size);
>      sysbus_init_mmio(sbd, &s->data_iomem);
> +
> +    if (FW_CFG(s)->dma_enabled) {
> +        memory_region_init_io(&s->dma_iomem, OBJECT(s), &fw_cfg_dma_mem_ops,
> +                              FW_CFG(s), "fwcfg.dma", sizeof(dma_addr_t));
> +        sysbus_init_mmio(sbd, &s->dma_iomem);
> +    }
>  }
>  
>  static void fw_cfg_mem_class_init(ObjectClass *klass, void *data)
> diff --git a/include/hw/nvram/fw_cfg.h b/include/hw/nvram/fw_cfg.h
> index e60d3ca..ee0cd8a 100644
> --- a/include/hw/nvram/fw_cfg.h
> +++ b/include/hw/nvram/fw_cfg.h
> @@ -61,6 +61,15 @@ typedef struct FWCfgFiles {
>      FWCfgFile f[];
>  } FWCfgFiles;
>  
> +/* Control as first field allows for different structures selected by this
> + * field, which might be useful in the future
> + */
> +typedef struct FWCfgDmaAccess {
> +    uint32_t control;
> +    uint32_t length;
> +    uint64_t address;
> +} QEMU_PACKED FWCfgDmaAccess;
> +
>  typedef void (*FWCfgCallback)(void *opaque, uint8_t *data);
>  typedef void (*FWCfgReadCallback)(void *opaque, uint32_t offset);
>  
> @@ -77,10 +86,13 @@ void fw_cfg_add_file_callback(FWCfgState *s, const char *filename,
>                                void *data, size_t len);
>  void *fw_cfg_modify_file(FWCfgState *s, const char *filename, void *data,
>                           size_t len);
> +FWCfgState *fw_cfg_init_io_dma(uint32_t iobase, uint32_t dma_iobase,
> +                                AddressSpace *dma_as);
>  FWCfgState *fw_cfg_init_io(uint32_t iobase);
>  FWCfgState *fw_cfg_init_mem(hwaddr ctl_addr, hwaddr data_addr);
> -FWCfgState *fw_cfg_init_mem_wide(hwaddr ctl_addr, hwaddr data_addr,
> -                                 uint32_t data_width);
> +FWCfgState *fw_cfg_init_mem_wide(hwaddr ctl_addr,
> +                                 hwaddr data_addr, uint32_t data_width,
> +                                 hwaddr dma_addr, AddressSpace *dma_as);
>  
>  FWCfgState *fw_cfg_find(void);
>  
>
Marc Marí Oct. 1, 2015, 3:52 p.m. UTC | #2
On Thu, 1 Oct 2015 16:36:07 +0200
Laszlo Ersek <lersek@redhat.com> wrote:

> This looks good to me. Thanks for addressing my v3 request.
> 
> I have some new remarks here. I feel *really* bad for not finding them
> earlier. (If you get tired of working on this series, I could pick it
> up and try to shepherd it further.)

I'll continue with it, don't worry. Thanks for your comments.

Marc

> 
> On 10/01/15 14:16, Marc Marí wrote:
> > Based on the specifications on docs/specs/fw_cfg.txt
> > 
> > This interface is an addon. The old interface can still be used as
> > usual.
> > 
> > Based on Gerd Hoffman's initial implementation.
> > 
> > Signed-off-by: Marc Marí <markmb@redhat.com>
> > ---
> >  hw/arm/virt.c             |   2 +-
> >  hw/nvram/fw_cfg.c         | 231
> > +++++++++++++++++++++++++++++++++++++++++++---
> > include/hw/nvram/fw_cfg.h |  16 +++- 3 files changed, 233
> > insertions(+), 16 deletions(-)
> > 
> > diff --git a/hw/arm/virt.c b/hw/arm/virt.c
> > index d25d6cf..7ae984f 100644
> > --- a/hw/arm/virt.c
> > +++ b/hw/arm/virt.c
> > @@ -683,7 +683,7 @@ static void create_fw_cfg(const VirtBoardInfo
> > *vbi) hwaddr size = vbi->memmap[VIRT_FW_CFG].size;
> >      char *nodename;
> >  
> > -    fw_cfg_init_mem_wide(base + 8, base, 8);
> > +    fw_cfg_init_mem_wide(base + 8, base, 8, 0, NULL);
> >  
> >      nodename = g_strdup_printf("/fw-cfg@%" PRIx64, base);
> >      qemu_fdt_add_subnode(vbi->fdt, nodename);
> > diff --git a/hw/nvram/fw_cfg.c b/hw/nvram/fw_cfg.c
> > index 658f8c4..59933b3 100644
> > --- a/hw/nvram/fw_cfg.c
> > +++ b/hw/nvram/fw_cfg.c
> > @@ -23,6 +23,7 @@
> >   */
> >  #include "hw/hw.h"
> >  #include "sysemu/sysemu.h"
> > +#include "sysemu/dma.h"
> >  #include "hw/isa/isa.h"
> >  #include "hw/nvram/fw_cfg.h"
> >  #include "hw/sysbus.h"
> > @@ -30,7 +31,7 @@
> >  #include "qemu/error-report.h"
> >  #include "qemu/config-file.h"
> >  
> > -#define FW_CFG_SIZE 2
> > +#define FW_CFG_CTL_SIZE 2
> >  #define FW_CFG_NAME "fw_cfg"
> >  #define FW_CFG_PATH "/machine/" FW_CFG_NAME
> >  
> > @@ -42,6 +43,16 @@
> >  #define FW_CFG_IO(obj)  OBJECT_CHECK(FWCfgIoState,  (obj),
> > TYPE_FW_CFG_IO) #define FW_CFG_MEM(obj) OBJECT_CHECK(FWCfgMemState,
> > (obj), TYPE_FW_CFG_MEM) 
> > +/* FW_CFG_VERSION bits */
> > +#define FW_CFG_VERSION      0x01
> > +#define FW_CFG_VERSION_DMA  0x02
> > +
> > +/* FW_CFG_DMA_CONTROL bits */
> > +#define FW_CFG_DMA_CTL_ERROR   0x01
> > +#define FW_CFG_DMA_CTL_READ    0x02
> > +#define FW_CFG_DMA_CTL_SKIP    0x04
> > +#define FW_CFG_DMA_CTL_SELECT  0x08
> > +
> >  typedef struct FWCfgEntry {
> >      uint32_t len;
> >      uint8_t *data;
> > @@ -59,6 +70,10 @@ struct FWCfgState {
> >      uint16_t cur_entry;
> >      uint32_t cur_offset;
> >      Notifier machine_ready;
> > +
> > +    bool dma_enabled;
> > +    AddressSpace *dma_as;
> > +    dma_addr_t dma_addr;
> >  };
> >  
> >  struct FWCfgIoState {
> > @@ -66,8 +81,8 @@ struct FWCfgIoState {
> >      FWCfgState parent_obj;
> >      /*< public >*/
> >  
> > -    MemoryRegion comb_iomem;
> > -    uint32_t iobase;
> > +    MemoryRegion comb_iomem, dma_iomem;
> > +    uint32_t iobase, dma_iobase;
> >  };
> >  
> >  struct FWCfgMemState {
> > @@ -75,7 +90,7 @@ struct FWCfgMemState {
> >      FWCfgState parent_obj;
> >      /*< public >*/
> >  
> > -    MemoryRegion ctl_iomem, data_iomem;
> > +    MemoryRegion ctl_iomem, data_iomem, dma_iomem;
> >      uint32_t data_width;
> >      MemoryRegionOps wide_data_ops;
> >  };
> 
> (1) I *think* the new "dma_iomem" field, of type MemoryRegion, could
> be moved up to the parent struct FWCfgEntry, from both FWCfgMemState
> and FWCfgIoState. (And the references in the rest of the code could
> be updated.)
> 
> (
> 
> Independently, some loud thinking, mostly for myself: I've always been
> surprised by the difference between (a) FWCfgIoState *carrying*
> "dma_iobase" as a field -- and a property! --, and (b) FWCfgMemState
> *not* carrying the same as a field -- nor as a property.
> 
> I think I finally understand this difference now. It is all rooted in
> the difference between the internal APIs sysbus_add_io() and
> sysbus_init_mmio(). Both of these are called from the device realize
> functions, but the first (sysbus_add_io()) wants the IO port address
> at once, whereas the second (sysbus_init_mmio()) doesn't want the
> address -- the actual mapping (sysbus_mmio_map()) is delayed to board
> code; the device code doesn't want to be aware of it.
> 
> And this ripples to the top. Because sysbus_add_io() wants the IO port
> address, we must pass that address to the device realize function. And
> for that, we need a device property -- "dma_iobase". This is not new,
> it just follows the example of the preexistent "iobase" field /
> property.
> 
> Whereas, in the sysbus_init_mmio() case, we can keep the MMIO address
> private to the board code; the realize function need not know the
> address. However, the realize function does need to know the *fact*
> that we're going to do DMA. Given that we must maintain this fact (in
> "FWCfgState.dma_enabled") anyway, for other -- e.g. migration
> subsection -- purposes as well, it makes sense to expose that same
> field of the parent struct as a property, so we can set it in the
> memory mapped case *before* the realize function looks at it.
> 
> I feel better now, thanks for listening.
> 
> )
> 
> Then,
> 
> > @@ -292,6 +307,119 @@ static void fw_cfg_data_mem_write(void
> > *opaque, hwaddr addr, } while (i);
> >  }
> >  
> > +static void fw_cfg_dma_transfer(FWCfgState *s)
> > +{
> > +    dma_addr_t len;
> > +    FWCfgDmaAccess dma;
> > +    int arch;
> > +    FWCfgEntry *e;
> > +    int read;
> > +    dma_addr_t dma_addr;
> > +
> > +    /* Reset the address before the next access */
> > +    dma_addr = s->dma_addr;
> > +    s->dma_addr = 0;
> > +
> > +    dma.address = ldq_be_dma(s->dma_as,
> > +                            dma_addr + offsetof(FWCfgDmaAccess,
> > address));
> > +    dma.length = ldl_be_dma(s->dma_as,
> > +                            dma_addr + offsetof(FWCfgDmaAccess,
> > length));
> > +    dma.control = ldl_be_dma(s->dma_as,
> > +                            dma_addr + offsetof(FWCfgDmaAccess,
> > control)); +
> > +    if (dma.control & FW_CFG_DMA_CTL_SELECT) {
> > +        fw_cfg_select(s, dma.control >> 16);
> > +    }
> > +
> > +    arch = !!(s->cur_entry & FW_CFG_ARCH_LOCAL);
> > +    e = &s->entries[arch][s->cur_entry & FW_CFG_ENTRY_MASK];
> > +
> > +    if (dma.control & FW_CFG_DMA_CTL_READ) {
> > +        read = 1;
> > +    } else if (dma.control & FW_CFG_DMA_CTL_SKIP) {
> > +        read = 0;
> > +    } else {
> > +        dma.length = 0;
> 
> I can see you addressed Kevin's comment here.
> 
> 
> > +    }
> > +
> > +    dma.control = 0;
> > +
> > +    while (dma.length > 0 && !(dma.control &
> > FW_CFG_DMA_CTL_ERROR)) {
> > +        if (s->cur_entry == FW_CFG_INVALID || !e->data ||
> > +                                s->cur_offset >= e->len) {
> > +            len = dma.length;
> > +
> > +            /* If the access is not a read access, it will be a
> > skip access,
> > +             * tested before.
> > +             */
> > +            if (read) {
> > +                if (dma_memory_set(s->dma_as, dma.address, 0,
> > len)) {
> > +                    dma.control |= FW_CFG_DMA_CTL_ERROR;
> > +                }
> > +            }
> > +
> > +        } else {
> > +            if (dma.length <= (e->len - s->cur_offset)) {
> > +                len = dma.length;
> > +            } else {
> > +                len = (e->len - s->cur_offset);
> > +            }
> > +
> > +            if (e->read_callback) {
> > +                e->read_callback(e->callback_opaque,
> > s->cur_offset);
> > +            }
> > +
> > +            /* If the access is not a read access, it will be a
> > skip access,
> > +             * tested before.
> > +             */
> > +            if (read) {
> > +                if (dma_memory_write(s->dma_as, dma.address,
> > +                                    &e->data[s->cur_offset], len))
> > {
> > +                    dma.control |= FW_CFG_DMA_CTL_ERROR;
> > +                }
> > +            }
> > +
> > +            s->cur_offset += len;
> > +        }
> > +
> > +        dma.address += len;
> > +        dma.length  -= len;
> > +
> > +    }
> > +
> > +    stl_be_dma(s->dma_as, dma_addr + offsetof(FWCfgDmaAccess,
> > control),
> > +                dma.control);
> > +
> > +    trace_fw_cfg_read(s, 0);
> > +}
> 
> Seems OK to me.
> 
> > +
> > +static void fw_cfg_dma_mem_write(void *opaque, hwaddr addr,
> > +                                 uint64_t value, unsigned size)
> > +{
> > +    FWCfgState *s = opaque;
> > +
> > +    if (size == 4) {
> > +        if (addr == 0) {
> > +            /* FWCfgDmaAccess high address */
> > +            s->dma_addr = value << 32;
> > +        } else if (addr == 4) {
> > +            /* FWCfgDmaAccess low address */
> > +            s->dma_addr |= value;
> > +            fw_cfg_dma_transfer(s);
> > +        }
> > +    } else if (size == 8 && addr == 0) {
> > +        s->dma_addr = value;
> > +        fw_cfg_dma_transfer(s);
> > +    }
> > +}
> 
> Seems to match the zeroing of s->dma_addr in fw_cfg_dma_transfer().
> Good.
> 
> > +
> > +static bool fw_cfg_dma_mem_valid(void *opaque, hwaddr addr,
> > +                                  unsigned size, bool is_write)
> > +{
> > +    return is_write && ((size == 4 && (addr == 0 || addr == 4)) ||
> > +                        (size == 8 && addr == 0));
> > +}
> > +
> >  static bool fw_cfg_data_mem_valid(void *opaque, hwaddr addr,
> >                                    unsigned size, bool is_write)
> >  {
> > @@ -359,6 +487,12 @@ static const MemoryRegionOps
> > fw_cfg_comb_mem_ops = { .valid.accepts = fw_cfg_comb_valid,
> >  };
> >  
> > +static const MemoryRegionOps fw_cfg_dma_mem_ops = {
> > +    .write = fw_cfg_dma_mem_write,
> > +    .endianness = DEVICE_BIG_ENDIAN,
> > +    .valid.accepts = fw_cfg_dma_mem_valid,
> > +};
> 
> (2) Okay. This is somewhat important, and *completely* non-intuitive,
> unfortunately.
> 
> Without setting *both*
> 
>     .valid.max_access_size = 8,
>     .impl.max_access_size = 8,
> 
> here, the memory subsystem will split up all 8-byte wide accesses
> (from the guest side) to two 4-byte wide calls to
> fw_cfg_dma_mem_write()).
> 
> Those calls do satisfy the ordering logic in fw_cfg_dma_mem_write(),
> but nonetheless, the lack of the above setting makes the following
> code in fw_cfg_dma_mem_write() dead:
> 
> > +    } else if (size == 8 && addr == 0) {
> > +        s->dma_addr = value;
> > +        fw_cfg_dma_transfer(s);
> > +    }
> 
> (I verified this claim with gdb on aarch64.)
> 
> So, please initialize both of the above fields to 8.
> 
> > +
> >  static void fw_cfg_reset(DeviceState *d)
> >  {
> >      FWCfgState *s = FW_CFG(d);
> > @@ -399,6 +533,22 @@ static bool is_version_1(void *opaque, int
> > version_id) return version_id == 1;
> >  }
> >  
> > +static bool fw_cfg_dma_enabled(void *opaque)
> > +{
> > +    FWCfgState *s = opaque;
> > +
> > +    return s->dma_enabled;
> > +}
> > +
> > +static VMStateDescription vmstate_fw_cfg_dma = {
> > +    .name = "fw_cfg/dma",
> > +    .needed = fw_cfg_dma_enabled,
> > +    .fields = (VMStateField[]) {
> > +        VMSTATE_UINT64(dma_addr, FWCfgState),
> > +        VMSTATE_END_OF_LIST()
> > +    },
> > +};
> 
> Looks good to me. All fields that come from the command line (ie.
> management layer) need not / must not be part of the migration stream.
> And all data that is programmed by the guest, must. Here, "dma_addr"
> is the only such item. Okay.
> 
> > +
> >  static const VMStateDescription vmstate_fw_cfg = {
> >      .name = "fw_cfg",
> >      .version_id = 2,
> > @@ -408,6 +558,10 @@ static const VMStateDescription vmstate_fw_cfg
> > = { VMSTATE_UINT16_HACK(cur_offset, FWCfgState, is_version_1),
> >          VMSTATE_UINT32_V(cur_offset, FWCfgState, 2),
> >          VMSTATE_END_OF_LIST()
> > +    },
> > +    .subsections = (const VMStateDescription*[]) {
> > +        &vmstate_fw_cfg_dma,
> > +        NULL,
> >      }
> >  };
> >  
> > @@ -593,7 +747,6 @@ static void fw_cfg_init1(DeviceState *dev)
> >      qdev_init_nofail(dev);
> >  
> >      fw_cfg_add_bytes(s, FW_CFG_SIGNATURE, (char *)"QEMU", 4);
> > -    fw_cfg_add_i32(s, FW_CFG_ID, 1);
> >      fw_cfg_add_bytes(s, FW_CFG_UUID, qemu_uuid, 16);
> >      fw_cfg_add_i16(s, FW_CFG_NOGRAPHIC, (uint16_t)(display_type ==
> > DT_NOGRAPHIC)); fw_cfg_add_i16(s, FW_CFG_NB_CPUS,
> > (uint16_t)smp_cpus);
> 
> This is called from fw_cfg_init_io() and fw_cfg_init_mem_wide().
> 
> The former is renamed to fw_cfg_init_io_dma() -- and gets a wrapper
> under the original name --, and sets FW_CFG_ID expliticly.
> 
> The latter sets FW_CFG_ID expliticly.
> 
> Okay.
> 
> 
> > @@ -605,25 +758,52 @@ static void fw_cfg_init1(DeviceState *dev)
> >      qemu_add_machine_init_done_notifier(&s->machine_ready);
> >  }
> >  
> > -FWCfgState *fw_cfg_init_io(uint32_t iobase)
> > +FWCfgState *fw_cfg_init_io_dma(uint32_t iobase, uint32_t
> > dma_iobase,
> > +                                AddressSpace *dma_as)
> >  {
> >      DeviceState *dev;
> > +    FWCfgState *s;
> > +    uint32_t version = FW_CFG_VERSION;
> >  
> >      dev = qdev_create(NULL, TYPE_FW_CFG_IO);
> >      qdev_prop_set_uint32(dev, "iobase", iobase);
> > +    qdev_prop_set_uint32(dev, "dma_iobase", dma_iobase);
> > +
> >      fw_cfg_init1(dev);
> > +    s = FW_CFG(dev);
> > +
> > +    if (dma_as) {
> > +        /* 64 bits for the address field */
> > +        s->dma_as = dma_as;
> > +        s->dma_enabled = true;
> > +        s->dma_addr = 0;
> > +
> > +        version |= FW_CFG_VERSION_DMA;
> > +    }
> >  
> > -    return FW_CFG(dev);
> > +    fw_cfg_add_i32(s, FW_CFG_ID, version);
> > +
> > +    return s;
> >  }
> >  
> > -FWCfgState *fw_cfg_init_mem_wide(hwaddr ctl_addr, hwaddr data_addr,
> > -                                 uint32_t data_width)
> > +FWCfgState *fw_cfg_init_io(uint32_t iobase)
> > +{
> > +    return fw_cfg_init_io_dma(iobase, 0, NULL);
> > +}
> > +
> > +FWCfgState *fw_cfg_init_mem_wide(hwaddr ctl_addr,
> > +                                 hwaddr data_addr, uint32_t
> > data_width,
> > +                                 hwaddr dma_addr, AddressSpace
> > *dma_as) {
> >      DeviceState *dev;
> >      SysBusDevice *sbd;
> > +    FWCfgState *s;
> > +    uint32_t version = FW_CFG_VERSION;
> > +    bool dma_enabled = dma_addr && dma_as;
> >  
> >      dev = qdev_create(NULL, TYPE_FW_CFG_MEM);
> >      qdev_prop_set_uint32(dev, "data_width", data_width);
> > +    qdev_prop_set_bit(dev, "dma_enabled", dma_enabled);
> >  
> >      fw_cfg_init1(dev);
> >  
> > @@ -631,13 +811,25 @@ FWCfgState *fw_cfg_init_mem_wide(hwaddr
> > ctl_addr, hwaddr data_addr, sysbus_mmio_map(sbd, 0, ctl_addr);
> >      sysbus_mmio_map(sbd, 1, data_addr);
> >  
> > -    return FW_CFG(dev);
> > +    s = FW_CFG(dev);
> > +
> > +    if (dma_enabled) {
> > +        s->dma_as = dma_as;
> > +        s->dma_addr = 0;
> > +        sysbus_mmio_map(sbd, 2, dma_addr);
> > +        version |= FW_CFG_VERSION_DMA;
> > +    }
> > +
> > +    fw_cfg_add_i32(s, FW_CFG_ID, version);
> > +
> > +    return s;
> >  }
> >  
> >  FWCfgState *fw_cfg_init_mem(hwaddr ctl_addr, hwaddr data_addr)
> >  {
> >      return fw_cfg_init_mem_wide(ctl_addr, data_addr,
> > -
> > fw_cfg_data_mem_ops.valid.max_access_size);
> > +
> > fw_cfg_data_mem_ops.valid.max_access_size,
> > +                                0, NULL);
> >  }
> >  
> >  
> > @@ -664,6 +856,7 @@ static const TypeInfo fw_cfg_info = {
> >  
> >  static Property fw_cfg_io_properties[] = {
> >      DEFINE_PROP_UINT32("iobase", FWCfgIoState, iobase, -1),
> > +    DEFINE_PROP_UINT32("dma_iobase", FWCfgIoState, dma_iobase, -1),
> >      DEFINE_PROP_END_OF_LIST(),
> >  };
> >  
> > @@ -673,8 +866,12 @@ static void fw_cfg_io_realize(DeviceState
> > *dev, Error **errp) SysBusDevice *sbd = SYS_BUS_DEVICE(dev);
> >  
> >      memory_region_init_io(&s->comb_iomem, OBJECT(s),
> > &fw_cfg_comb_mem_ops,
> > -                          FW_CFG(s), "fwcfg", FW_CFG_SIZE);
> > +                          FW_CFG(s), "fwcfg", FW_CFG_CTL_SIZE);
> >      sysbus_add_io(sbd, s->iobase, &s->comb_iomem);
> > +
> > +    memory_region_init_io(&s->dma_iomem, OBJECT(s),
> > &fw_cfg_dma_mem_ops,
> > +                          FW_CFG(s), "fwcfg.dma",
> > sizeof(dma_addr_t));
> > +    sysbus_add_io(sbd, s->dma_iobase, &s->dma_iomem);
> >  }
> 
> (3) Hmmmm. I think this should be made conditional. sysbus_add_io()
> maps the region into IO port space immediately. Callers of
> fw_cfg_init_io() should *not* reach sysbus_add_io(); it makes no
> sense to map the DMA addr register at IO port 0.
> 
> (And then you can omit memory_region_init_io() as well, if dma_iobase
> is zero.)
> 
> The rest of the code looks fine to me.
> 
> Again, I apologize for sucking this much at timely reviews lately. If
> you fix (2) and (3) above -- optionally: (1) as well --, then you'll
> have my R-b.
> 
> If you've lost your patience, I can pick up this series. :)
> 
> Thank you
> Laszlo
> 
> 
> >  
> >  static void fw_cfg_io_class_init(ObjectClass *klass, void *data)
> > @@ -695,6 +892,8 @@ static const TypeInfo fw_cfg_io_info = {
> >  
> >  static Property fw_cfg_mem_properties[] = {
> >      DEFINE_PROP_UINT32("data_width", FWCfgMemState, data_width,
> > -1),
> > +    DEFINE_PROP_BOOL("dma_enabled", FWCfgMemState,
> > parent_obj.dma_enabled,
> > +                     false),
> >      DEFINE_PROP_END_OF_LIST(),
> >  };
> >  
> > @@ -705,7 +904,7 @@ static void fw_cfg_mem_realize(DeviceState
> > *dev, Error **errp) const MemoryRegionOps *data_ops =
> > &fw_cfg_data_mem_ops; 
> >      memory_region_init_io(&s->ctl_iomem, OBJECT(s),
> > &fw_cfg_ctl_mem_ops,
> > -                          FW_CFG(s), "fwcfg.ctl", FW_CFG_SIZE);
> > +                          FW_CFG(s), "fwcfg.ctl", FW_CFG_CTL_SIZE);
> >      sysbus_init_mmio(sbd, &s->ctl_iomem);
> >  
> >      if (s->data_width > data_ops->valid.max_access_size) {
> > @@ -723,6 +922,12 @@ static void fw_cfg_mem_realize(DeviceState
> > *dev, Error **errp) memory_region_init_io(&s->data_iomem,
> > OBJECT(s), data_ops, FW_CFG(s), "fwcfg.data",
> > data_ops->valid.max_access_size); sysbus_init_mmio(sbd,
> > &s->data_iomem); +
> > +    if (FW_CFG(s)->dma_enabled) {
> > +        memory_region_init_io(&s->dma_iomem, OBJECT(s),
> > &fw_cfg_dma_mem_ops,
> > +                              FW_CFG(s), "fwcfg.dma",
> > sizeof(dma_addr_t));
> > +        sysbus_init_mmio(sbd, &s->dma_iomem);
> > +    }
> >  }
> >  
> >  static void fw_cfg_mem_class_init(ObjectClass *klass, void *data)
> > diff --git a/include/hw/nvram/fw_cfg.h b/include/hw/nvram/fw_cfg.h
> > index e60d3ca..ee0cd8a 100644
> > --- a/include/hw/nvram/fw_cfg.h
> > +++ b/include/hw/nvram/fw_cfg.h
> > @@ -61,6 +61,15 @@ typedef struct FWCfgFiles {
> >      FWCfgFile f[];
> >  } FWCfgFiles;
> >  
> > +/* Control as first field allows for different structures selected
> > by this
> > + * field, which might be useful in the future
> > + */
> > +typedef struct FWCfgDmaAccess {
> > +    uint32_t control;
> > +    uint32_t length;
> > +    uint64_t address;
> > +} QEMU_PACKED FWCfgDmaAccess;
> > +
> >  typedef void (*FWCfgCallback)(void *opaque, uint8_t *data);
> >  typedef void (*FWCfgReadCallback)(void *opaque, uint32_t offset);
> >  
> > @@ -77,10 +86,13 @@ void fw_cfg_add_file_callback(FWCfgState *s,
> > const char *filename, void *data, size_t len);
> >  void *fw_cfg_modify_file(FWCfgState *s, const char *filename, void
> > *data, size_t len);
> > +FWCfgState *fw_cfg_init_io_dma(uint32_t iobase, uint32_t
> > dma_iobase,
> > +                                AddressSpace *dma_as);
> >  FWCfgState *fw_cfg_init_io(uint32_t iobase);
> >  FWCfgState *fw_cfg_init_mem(hwaddr ctl_addr, hwaddr data_addr);
> > -FWCfgState *fw_cfg_init_mem_wide(hwaddr ctl_addr, hwaddr data_addr,
> > -                                 uint32_t data_width);
> > +FWCfgState *fw_cfg_init_mem_wide(hwaddr ctl_addr,
> > +                                 hwaddr data_addr, uint32_t
> > data_width,
> > +                                 hwaddr dma_addr, AddressSpace
> > *dma_as); 
> >  FWCfgState *fw_cfg_find(void);
> >  
> > 
>
Peter Maydell Oct. 1, 2015, 5:18 p.m. UTC | #3
On 1 October 2015 at 15:36, Laszlo Ersek <lersek@redhat.com> wrote:
> I think I finally understand this difference now. It is all rooted in
> the difference between the internal APIs sysbus_add_io() and
> sysbus_init_mmio(). Both of these are called from the device realize
> functions, but the first (sysbus_add_io()) wants the IO port address at
> once, whereas the second (sysbus_init_mmio()) doesn't want the address
> -- the actual mapping (sysbus_mmio_map()) is delayed to board code; the
> device code doesn't want to be aware of it.

Yes. The sysbus_add_io() API is firmly wedded to the x86 I/O port
concept and to the idea that devices are at fixed I/O port addresses
which don't depend on what board they're in (because the few
non-x86 systems that set up some kind of "IO port" abstraction
are generally doing it to look more x86-like).

That said, sysbus_add_io() is one of those odd functions which
we use half a dozen times in the whole codebase and which leaves
me wondering if it ought to be refactored to work differently
(eg split into "declare IO ports" and "map IO ports into IO space"
like the mmio functions)...

thanks
-- PMM
Laszlo Ersek Oct. 1, 2015, 7:20 p.m. UTC | #4
On 10/01/15 19:18, Peter Maydell wrote:
> On 1 October 2015 at 15:36, Laszlo Ersek <lersek@redhat.com> wrote:
>> I think I finally understand this difference now. It is all rooted in
>> the difference between the internal APIs sysbus_add_io() and
>> sysbus_init_mmio(). Both of these are called from the device realize
>> functions, but the first (sysbus_add_io()) wants the IO port address at
>> once, whereas the second (sysbus_init_mmio()) doesn't want the address
>> -- the actual mapping (sysbus_mmio_map()) is delayed to board code; the
>> device code doesn't want to be aware of it.
> 
> Yes. The sysbus_add_io() API is firmly wedded to the x86 I/O port
> concept and to the idea that devices are at fixed I/O port addresses
> which don't depend on what board they're in (because the few
> non-x86 systems that set up some kind of "IO port" abstraction
> are generally doing it to look more x86-like).
> 
> That said, sysbus_add_io() is one of those odd functions which
> we use half a dozen times in the whole codebase and which leaves
> me wondering if it ought to be refactored to work differently
> (eg split into "declare IO ports" and "map IO ports into IO space"
> like the mmio functions)...

I had the same idea looming in the back of my mind, but I wouldn't know
how to attack the refactoring (plus I wouldn't want to delay Marc's work
with it -- after all the function is not being introduced by this
series), so I didn't bring it up. :P

Thanks!
Laszlo
Stefan Hajnoczi Oct. 6, 2015, 2:44 p.m. UTC | #5
On Thu, Oct 01, 2015 at 02:16:55PM +0200, Marc Marí wrote:
> @@ -292,6 +307,119 @@ static void fw_cfg_data_mem_write(void *opaque, hwaddr addr,
>      } while (i);
>  }
>  
> +static void fw_cfg_dma_transfer(FWCfgState *s)
> +{
> +    dma_addr_t len;
> +    FWCfgDmaAccess dma;
> +    int arch;
> +    FWCfgEntry *e;
> +    int read;
> +    dma_addr_t dma_addr;
> +
> +    /* Reset the address before the next access */
> +    dma_addr = s->dma_addr;
> +    s->dma_addr = 0;
> +
> +    dma.address = ldq_be_dma(s->dma_as,
> +                            dma_addr + offsetof(FWCfgDmaAccess, address));
> +    dma.length = ldl_be_dma(s->dma_as,
> +                            dma_addr + offsetof(FWCfgDmaAccess, length));
> +    dma.control = ldl_be_dma(s->dma_as,
> +                            dma_addr + offsetof(FWCfgDmaAccess, control));

ldq_be_dma() doesn't report errors.  If dma_addr is invalid the return
value could be anything.  Memory corruption inside the guest is possible
if the address/length/control values happen to cause a memory read
operation!

Instead, please use:

if (dma_memory_read(s->dma_as, dma_addr, &dma, sizeof(dma))) {
    stl_be_dma(s->dma_as, dma_addr + offsetof(FWCfgDmaAccess, control),
               FW_CFG_DMA_CTL_ERROR);
    return;
}

dma.address = be64_to_cpu(dma.address);
dma.length = be32_to_cpu(dma.length);
dma.control = be32_to_cpu(dma.control);

> +
> +    if (dma.control & FW_CFG_DMA_CTL_SELECT) {
> +        fw_cfg_select(s, dma.control >> 16);
> +    }
> +
> +    arch = !!(s->cur_entry & FW_CFG_ARCH_LOCAL);
> +    e = &s->entries[arch][s->cur_entry & FW_CFG_ENTRY_MASK];
> +
> +    if (dma.control & FW_CFG_DMA_CTL_READ) {
> +        read = 1;
> +    } else if (dma.control & FW_CFG_DMA_CTL_SKIP) {
> +        read = 0;
> +    } else {
> +        dma.length = 0;
> +    }
> +
> +    dma.control = 0;
> +
> +    while (dma.length > 0 && !(dma.control & FW_CFG_DMA_CTL_ERROR)) {
> +        if (s->cur_entry == FW_CFG_INVALID || !e->data ||
> +                                s->cur_offset >= e->len) {
> +            len = dma.length;
> +
> +            /* If the access is not a read access, it will be a skip access,
> +             * tested before.
> +             */
> +            if (read) {
> +                if (dma_memory_set(s->dma_as, dma.address, 0, len)) {
> +                    dma.control |= FW_CFG_DMA_CTL_ERROR;
> +                }
> +            }
> +
> +        } else {
> +            if (dma.length <= (e->len - s->cur_offset)) {
> +                len = dma.length;
> +            } else {
> +                len = (e->len - s->cur_offset);
> +            }
> +
> +            if (e->read_callback) {
> +                e->read_callback(e->callback_opaque, s->cur_offset);
> +            }
> +
> +            /* If the access is not a read access, it will be a skip access,
> +             * tested before.
> +             */
> +            if (read) {
> +                if (dma_memory_write(s->dma_as, dma.address,
> +                                    &e->data[s->cur_offset], len)) {
> +                    dma.control |= FW_CFG_DMA_CTL_ERROR;
> +                }
> +            }
> +
> +            s->cur_offset += len;
> +        }
> +
> +        dma.address += len;
> +        dma.length  -= len;

I thought these fields are written back to guest memory?  For example,
so the guest knows how many bytes were read before the error occurred.
Peter Maydell Oct. 6, 2015, 2:53 p.m. UTC | #6
On 6 October 2015 at 15:44, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> On Thu, Oct 01, 2015 at 02:16:55PM +0200, Marc Marí wrote:
>> @@ -292,6 +307,119 @@ static void fw_cfg_data_mem_write(void *opaque, hwaddr addr,
>>      } while (i);
>>  }
>>
>> +static void fw_cfg_dma_transfer(FWCfgState *s)
>> +{
>> +    dma_addr_t len;
>> +    FWCfgDmaAccess dma;
>> +    int arch;
>> +    FWCfgEntry *e;
>> +    int read;
>> +    dma_addr_t dma_addr;
>> +
>> +    /* Reset the address before the next access */
>> +    dma_addr = s->dma_addr;
>> +    s->dma_addr = 0;
>> +
>> +    dma.address = ldq_be_dma(s->dma_as,
>> +                            dma_addr + offsetof(FWCfgDmaAccess, address));
>> +    dma.length = ldl_be_dma(s->dma_as,
>> +                            dma_addr + offsetof(FWCfgDmaAccess, length));
>> +    dma.control = ldl_be_dma(s->dma_as,
>> +                            dma_addr + offsetof(FWCfgDmaAccess, control));
>
> ldq_be_dma() doesn't report errors.  If dma_addr is invalid the return
> value could be anything.  Memory corruption inside the guest is possible
> if the address/length/control values happen to cause a memory read
> operation!

We discussed this in a previous revision. IMHO if the guest has
passed us a bogus dma_addr it should expect memory corruption.
We only need to be sure we don't allow a VM escape.

> Instead, please use:
>
> if (dma_memory_read(s->dma_as, dma_addr, &dma, sizeof(dma))) {
>     stl_be_dma(s->dma_as, dma_addr + offsetof(FWCfgDmaAccess, control),
>                FW_CFG_DMA_CTL_ERROR);

If the guest handed us a bad dma_addr then this write will also
be bogus and could corrupt the guest's memory.

thanks
-- PMM
Marc Marí Oct. 6, 2015, 2:54 p.m. UTC | #7
On Tue, 6 Oct 2015 15:44:53 +0100
Stefan Hajnoczi <stefanha@gmail.com> wrote:

> On Thu, Oct 01, 2015 at 02:16:55PM +0200, Marc Marí wrote:
> > @@ -292,6 +307,119 @@ static void fw_cfg_data_mem_write(void
> > *opaque, hwaddr addr, } while (i);
> >  }
> >  
> > +static void fw_cfg_dma_transfer(FWCfgState *s)
> > +{
> > +    dma_addr_t len;
> > +    FWCfgDmaAccess dma;
> > +    int arch;
> > +    FWCfgEntry *e;
> > +    int read;
> > +    dma_addr_t dma_addr;
> > +
> > +    /* Reset the address before the next access */
> > +    dma_addr = s->dma_addr;
> > +    s->dma_addr = 0;
> > +
> > +    dma.address = ldq_be_dma(s->dma_as,
> > +                            dma_addr + offsetof(FWCfgDmaAccess,
> > address));
> > +    dma.length = ldl_be_dma(s->dma_as,
> > +                            dma_addr + offsetof(FWCfgDmaAccess,
> > length));
> > +    dma.control = ldl_be_dma(s->dma_as,
> > +                            dma_addr + offsetof(FWCfgDmaAccess,
> > control));
> 
> ldq_be_dma() doesn't report errors.  If dma_addr is invalid the return
> value could be anything.  Memory corruption inside the guest is
> possible if the address/length/control values happen to cause a
> memory read operation!
> 
> Instead, please use:
> 
> if (dma_memory_read(s->dma_as, dma_addr, &dma, sizeof(dma))) {
>     stl_be_dma(s->dma_as, dma_addr + offsetof(FWCfgDmaAccess,
> control), FW_CFG_DMA_CTL_ERROR);
>     return;
> }
> 
> dma.address = be64_to_cpu(dma.address);
> dma.length = be32_to_cpu(dma.length);
> dma.control = be32_to_cpu(dma.control);
> 
> > +
> > +    if (dma.control & FW_CFG_DMA_CTL_SELECT) {
> > +        fw_cfg_select(s, dma.control >> 16);
> > +    }
> > +
> > +    arch = !!(s->cur_entry & FW_CFG_ARCH_LOCAL);
> > +    e = &s->entries[arch][s->cur_entry & FW_CFG_ENTRY_MASK];
> > +
> > +    if (dma.control & FW_CFG_DMA_CTL_READ) {
> > +        read = 1;
> > +    } else if (dma.control & FW_CFG_DMA_CTL_SKIP) {
> > +        read = 0;
> > +    } else {
> > +        dma.length = 0;
> > +    }
> > +
> > +    dma.control = 0;
> > +
> > +    while (dma.length > 0 && !(dma.control &
> > FW_CFG_DMA_CTL_ERROR)) {
> > +        if (s->cur_entry == FW_CFG_INVALID || !e->data ||
> > +                                s->cur_offset >= e->len) {
> > +            len = dma.length;
> > +
> > +            /* If the access is not a read access, it will be a
> > skip access,
> > +             * tested before.
> > +             */
> > +            if (read) {
> > +                if (dma_memory_set(s->dma_as, dma.address, 0,
> > len)) {
> > +                    dma.control |= FW_CFG_DMA_CTL_ERROR;
> > +                }
> > +            }
> > +
> > +        } else {
> > +            if (dma.length <= (e->len - s->cur_offset)) {
> > +                len = dma.length;
> > +            } else {
> > +                len = (e->len - s->cur_offset);
> > +            }
> > +
> > +            if (e->read_callback) {
> > +                e->read_callback(e->callback_opaque,
> > s->cur_offset);
> > +            }
> > +
> > +            /* If the access is not a read access, it will be a
> > skip access,
> > +             * tested before.
> > +             */
> > +            if (read) {
> > +                if (dma_memory_write(s->dma_as, dma.address,
> > +                                    &e->data[s->cur_offset], len))
> > {
> > +                    dma.control |= FW_CFG_DMA_CTL_ERROR;
> > +                }
> > +            }
> > +
> > +            s->cur_offset += len;
> > +        }
> > +
> > +        dma.address += len;
> > +        dma.length  -= len;
> 
> I thought these fields are written back to guest memory?  For example,
> so the guest knows how many bytes were read before the error occurred.

This was proposed here:

http://lists.gnu.org/archive/html/qemu-devel/2015-08/msg04001.html

I also don't see much benefit into knowing how many bytes were read. If
the guest is trying to read an invalid entry or past the end of that
entry, the memory will be filled with zeros. The only moment when an
error would be reported is when there's some problem in the mapping.
And this problem is bad enough to just abort the whole operation.

Regards
Marc
Stefan Hajnoczi Oct. 8, 2015, 9:07 a.m. UTC | #8
On Tue, Oct 06, 2015 at 03:53:33PM +0100, Peter Maydell wrote:
> On 6 October 2015 at 15:44, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> > On Thu, Oct 01, 2015 at 02:16:55PM +0200, Marc Marí wrote:
> >> @@ -292,6 +307,119 @@ static void fw_cfg_data_mem_write(void *opaque, hwaddr addr,
> >>      } while (i);
> >>  }
> >>
> >> +static void fw_cfg_dma_transfer(FWCfgState *s)
> >> +{
> >> +    dma_addr_t len;
> >> +    FWCfgDmaAccess dma;
> >> +    int arch;
> >> +    FWCfgEntry *e;
> >> +    int read;
> >> +    dma_addr_t dma_addr;
> >> +
> >> +    /* Reset the address before the next access */
> >> +    dma_addr = s->dma_addr;
> >> +    s->dma_addr = 0;
> >> +
> >> +    dma.address = ldq_be_dma(s->dma_as,
> >> +                            dma_addr + offsetof(FWCfgDmaAccess, address));
> >> +    dma.length = ldl_be_dma(s->dma_as,
> >> +                            dma_addr + offsetof(FWCfgDmaAccess, length));
> >> +    dma.control = ldl_be_dma(s->dma_as,
> >> +                            dma_addr + offsetof(FWCfgDmaAccess, control));
> >
> > ldq_be_dma() doesn't report errors.  If dma_addr is invalid the return
> > value could be anything.  Memory corruption inside the guest is possible
> > if the address/length/control values happen to cause a memory read
> > operation!
> 
> We discussed this in a previous revision. IMHO if the guest has
> passed us a bogus dma_addr it should expect memory corruption.
> We only need to be sure we don't allow a VM escape.

Even if the guest-visible behavior doesn't matter, Valgrind won't like
this.  ldq_be_dma() reads from uninitialized stack memory:

#define DEFINE_LDST_DMA(_lname, _sname, _bits, _end) \
    static inline uint##_bits##_t ld##_lname##_##_end##_dma(AddressSpace *as, \
                                                            dma_addr_t addr) \
    {                                                                   \
        uint##_bits##_t val;                                            \
        dma_memory_read(as, addr, &val, (_bits) / 8);                   \
        return _end##_bits##_to_cpu(val);                               \
    }

Bad QEMU, bad userspace process :).

I think we really need to check the error and at least return early.

> > Instead, please use:
> >
> > if (dma_memory_read(s->dma_as, dma_addr, &dma, sizeof(dma))) {
> >     stl_be_dma(s->dma_as, dma_addr + offsetof(FWCfgDmaAccess, control),
> >                FW_CFG_DMA_CTL_ERROR);
> 
> If the guest handed us a bad dma_addr then this write will also
> be bogus and could corrupt the guest's memory.

That's fine because it's not a random address - it's the address that
the guest gave us.

Stefan
Marc Marí Oct. 8, 2015, 10:01 a.m. UTC | #9
On Thu, 8 Oct 2015 10:07:34 +0100
Stefan Hajnoczi <stefanha@gmail.com> wrote:

> On Tue, Oct 06, 2015 at 03:53:33PM +0100, Peter Maydell wrote:
> > On 6 October 2015 at 15:44, Stefan Hajnoczi <stefanha@gmail.com>
> > wrote:
> > > On Thu, Oct 01, 2015 at 02:16:55PM +0200, Marc Marí wrote:
> > >> @@ -292,6 +307,119 @@ static void fw_cfg_data_mem_write(void
> > >> *opaque, hwaddr addr, } while (i);
> > >>  }
> > >>
> > >> +static void fw_cfg_dma_transfer(FWCfgState *s)
> > >> +{
> > >> +    dma_addr_t len;
> > >> +    FWCfgDmaAccess dma;
> > >> +    int arch;
> > >> +    FWCfgEntry *e;
> > >> +    int read;
> > >> +    dma_addr_t dma_addr;
> > >> +
> > >> +    /* Reset the address before the next access */
> > >> +    dma_addr = s->dma_addr;
> > >> +    s->dma_addr = 0;
> > >> +
> > >> +    dma.address = ldq_be_dma(s->dma_as,
> > >> +                            dma_addr + offsetof(FWCfgDmaAccess,
> > >> address));
> > >> +    dma.length = ldl_be_dma(s->dma_as,
> > >> +                            dma_addr + offsetof(FWCfgDmaAccess,
> > >> length));
> > >> +    dma.control = ldl_be_dma(s->dma_as,
> > >> +                            dma_addr + offsetof(FWCfgDmaAccess,
> > >> control));
> > >
> > > ldq_be_dma() doesn't report errors.  If dma_addr is invalid the
> > > return value could be anything.  Memory corruption inside the
> > > guest is possible if the address/length/control values happen to
> > > cause a memory read operation!
> > 
> > We discussed this in a previous revision. IMHO if the guest has
> > passed us a bogus dma_addr it should expect memory corruption.
> > We only need to be sure we don't allow a VM escape.
> 
> Even if the guest-visible behavior doesn't matter, Valgrind won't like
> this.  ldq_be_dma() reads from uninitialized stack memory:
> 
> #define DEFINE_LDST_DMA(_lname, _sname, _bits, _end) \
>     static inline uint##_bits##_t
> ld##_lname##_##_end##_dma(AddressSpace *as, \ dma_addr_t addr) \
>     {                                                                   \
>         uint##_bits##_t
> val;                                            \ dma_memory_read(as,
> addr, &val, (_bits) / 8);                   \ return
> _end##_bits##_to_cpu(val);                               \ }
> 
> Bad QEMU, bad userspace process :).
> 
> I think we really need to check the error and at least return early.

It doesn't hurt to check the error. I'll add it.

Thanks
Marc

> > > Instead, please use:
> > >
> > > if (dma_memory_read(s->dma_as, dma_addr, &dma, sizeof(dma))) {
> > >     stl_be_dma(s->dma_as, dma_addr + offsetof(FWCfgDmaAccess,
> > > control), FW_CFG_DMA_CTL_ERROR);
> > 
> > If the guest handed us a bad dma_addr then this write will also
> > be bogus and could corrupt the guest's memory.
> 
> That's fine because it's not a random address - it's the address that
> the guest gave us.
> 
> Stefan
diff mbox

Patch

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index d25d6cf..7ae984f 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -683,7 +683,7 @@  static void create_fw_cfg(const VirtBoardInfo *vbi)
     hwaddr size = vbi->memmap[VIRT_FW_CFG].size;
     char *nodename;
 
-    fw_cfg_init_mem_wide(base + 8, base, 8);
+    fw_cfg_init_mem_wide(base + 8, base, 8, 0, NULL);
 
     nodename = g_strdup_printf("/fw-cfg@%" PRIx64, base);
     qemu_fdt_add_subnode(vbi->fdt, nodename);
diff --git a/hw/nvram/fw_cfg.c b/hw/nvram/fw_cfg.c
index 658f8c4..59933b3 100644
--- a/hw/nvram/fw_cfg.c
+++ b/hw/nvram/fw_cfg.c
@@ -23,6 +23,7 @@ 
  */
 #include "hw/hw.h"
 #include "sysemu/sysemu.h"
+#include "sysemu/dma.h"
 #include "hw/isa/isa.h"
 #include "hw/nvram/fw_cfg.h"
 #include "hw/sysbus.h"
@@ -30,7 +31,7 @@ 
 #include "qemu/error-report.h"
 #include "qemu/config-file.h"
 
-#define FW_CFG_SIZE 2
+#define FW_CFG_CTL_SIZE 2
 #define FW_CFG_NAME "fw_cfg"
 #define FW_CFG_PATH "/machine/" FW_CFG_NAME
 
@@ -42,6 +43,16 @@ 
 #define FW_CFG_IO(obj)  OBJECT_CHECK(FWCfgIoState,  (obj), TYPE_FW_CFG_IO)
 #define FW_CFG_MEM(obj) OBJECT_CHECK(FWCfgMemState, (obj), TYPE_FW_CFG_MEM)
 
+/* FW_CFG_VERSION bits */
+#define FW_CFG_VERSION      0x01
+#define FW_CFG_VERSION_DMA  0x02
+
+/* FW_CFG_DMA_CONTROL bits */
+#define FW_CFG_DMA_CTL_ERROR   0x01
+#define FW_CFG_DMA_CTL_READ    0x02
+#define FW_CFG_DMA_CTL_SKIP    0x04
+#define FW_CFG_DMA_CTL_SELECT  0x08
+
 typedef struct FWCfgEntry {
     uint32_t len;
     uint8_t *data;
@@ -59,6 +70,10 @@  struct FWCfgState {
     uint16_t cur_entry;
     uint32_t cur_offset;
     Notifier machine_ready;
+
+    bool dma_enabled;
+    AddressSpace *dma_as;
+    dma_addr_t dma_addr;
 };
 
 struct FWCfgIoState {
@@ -66,8 +81,8 @@  struct FWCfgIoState {
     FWCfgState parent_obj;
     /*< public >*/
 
-    MemoryRegion comb_iomem;
-    uint32_t iobase;
+    MemoryRegion comb_iomem, dma_iomem;
+    uint32_t iobase, dma_iobase;
 };
 
 struct FWCfgMemState {
@@ -75,7 +90,7 @@  struct FWCfgMemState {
     FWCfgState parent_obj;
     /*< public >*/
 
-    MemoryRegion ctl_iomem, data_iomem;
+    MemoryRegion ctl_iomem, data_iomem, dma_iomem;
     uint32_t data_width;
     MemoryRegionOps wide_data_ops;
 };
@@ -292,6 +307,119 @@  static void fw_cfg_data_mem_write(void *opaque, hwaddr addr,
     } while (i);
 }
 
+static void fw_cfg_dma_transfer(FWCfgState *s)
+{
+    dma_addr_t len;
+    FWCfgDmaAccess dma;
+    int arch;
+    FWCfgEntry *e;
+    int read;
+    dma_addr_t dma_addr;
+
+    /* Reset the address before the next access */
+    dma_addr = s->dma_addr;
+    s->dma_addr = 0;
+
+    dma.address = ldq_be_dma(s->dma_as,
+                            dma_addr + offsetof(FWCfgDmaAccess, address));
+    dma.length = ldl_be_dma(s->dma_as,
+                            dma_addr + offsetof(FWCfgDmaAccess, length));
+    dma.control = ldl_be_dma(s->dma_as,
+                            dma_addr + offsetof(FWCfgDmaAccess, control));
+
+    if (dma.control & FW_CFG_DMA_CTL_SELECT) {
+        fw_cfg_select(s, dma.control >> 16);
+    }
+
+    arch = !!(s->cur_entry & FW_CFG_ARCH_LOCAL);
+    e = &s->entries[arch][s->cur_entry & FW_CFG_ENTRY_MASK];
+
+    if (dma.control & FW_CFG_DMA_CTL_READ) {
+        read = 1;
+    } else if (dma.control & FW_CFG_DMA_CTL_SKIP) {
+        read = 0;
+    } else {
+        dma.length = 0;
+    }
+
+    dma.control = 0;
+
+    while (dma.length > 0 && !(dma.control & FW_CFG_DMA_CTL_ERROR)) {
+        if (s->cur_entry == FW_CFG_INVALID || !e->data ||
+                                s->cur_offset >= e->len) {
+            len = dma.length;
+
+            /* If the access is not a read access, it will be a skip access,
+             * tested before.
+             */
+            if (read) {
+                if (dma_memory_set(s->dma_as, dma.address, 0, len)) {
+                    dma.control |= FW_CFG_DMA_CTL_ERROR;
+                }
+            }
+
+        } else {
+            if (dma.length <= (e->len - s->cur_offset)) {
+                len = dma.length;
+            } else {
+                len = (e->len - s->cur_offset);
+            }
+
+            if (e->read_callback) {
+                e->read_callback(e->callback_opaque, s->cur_offset);
+            }
+
+            /* If the access is not a read access, it will be a skip access,
+             * tested before.
+             */
+            if (read) {
+                if (dma_memory_write(s->dma_as, dma.address,
+                                    &e->data[s->cur_offset], len)) {
+                    dma.control |= FW_CFG_DMA_CTL_ERROR;
+                }
+            }
+
+            s->cur_offset += len;
+        }
+
+        dma.address += len;
+        dma.length  -= len;
+
+    }
+
+    stl_be_dma(s->dma_as, dma_addr + offsetof(FWCfgDmaAccess, control),
+                dma.control);
+
+    trace_fw_cfg_read(s, 0);
+}
+
+static void fw_cfg_dma_mem_write(void *opaque, hwaddr addr,
+                                 uint64_t value, unsigned size)
+{
+    FWCfgState *s = opaque;
+
+    if (size == 4) {
+        if (addr == 0) {
+            /* FWCfgDmaAccess high address */
+            s->dma_addr = value << 32;
+        } else if (addr == 4) {
+            /* FWCfgDmaAccess low address */
+            s->dma_addr |= value;
+            fw_cfg_dma_transfer(s);
+        }
+    } else if (size == 8 && addr == 0) {
+        s->dma_addr = value;
+        fw_cfg_dma_transfer(s);
+    }
+}
+
+static bool fw_cfg_dma_mem_valid(void *opaque, hwaddr addr,
+                                  unsigned size, bool is_write)
+{
+    return is_write && ((size == 4 && (addr == 0 || addr == 4)) ||
+                        (size == 8 && addr == 0));
+}
+
 static bool fw_cfg_data_mem_valid(void *opaque, hwaddr addr,
                                   unsigned size, bool is_write)
 {
@@ -359,6 +487,12 @@  static const MemoryRegionOps fw_cfg_comb_mem_ops = {
     .valid.accepts = fw_cfg_comb_valid,
 };
 
+static const MemoryRegionOps fw_cfg_dma_mem_ops = {
+    .write = fw_cfg_dma_mem_write,
+    .endianness = DEVICE_BIG_ENDIAN,
+    .valid.accepts = fw_cfg_dma_mem_valid,
+};
+
 static void fw_cfg_reset(DeviceState *d)
 {
     FWCfgState *s = FW_CFG(d);
@@ -399,6 +533,22 @@  static bool is_version_1(void *opaque, int version_id)
     return version_id == 1;
 }
 
+static bool fw_cfg_dma_enabled(void *opaque)
+{
+    FWCfgState *s = opaque;
+
+    return s->dma_enabled;
+}
+
+static VMStateDescription vmstate_fw_cfg_dma = {
+    .name = "fw_cfg/dma",
+    .needed = fw_cfg_dma_enabled,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT64(dma_addr, FWCfgState),
+        VMSTATE_END_OF_LIST()
+    },
+};
+
 static const VMStateDescription vmstate_fw_cfg = {
     .name = "fw_cfg",
     .version_id = 2,
@@ -408,6 +558,10 @@  static const VMStateDescription vmstate_fw_cfg = {
         VMSTATE_UINT16_HACK(cur_offset, FWCfgState, is_version_1),
         VMSTATE_UINT32_V(cur_offset, FWCfgState, 2),
         VMSTATE_END_OF_LIST()
+    },
+    .subsections = (const VMStateDescription*[]) {
+        &vmstate_fw_cfg_dma,
+        NULL,
     }
 };
 
@@ -593,7 +747,6 @@  static void fw_cfg_init1(DeviceState *dev)
     qdev_init_nofail(dev);
 
     fw_cfg_add_bytes(s, FW_CFG_SIGNATURE, (char *)"QEMU", 4);
-    fw_cfg_add_i32(s, FW_CFG_ID, 1);
     fw_cfg_add_bytes(s, FW_CFG_UUID, qemu_uuid, 16);
     fw_cfg_add_i16(s, FW_CFG_NOGRAPHIC, (uint16_t)(display_type == DT_NOGRAPHIC));
     fw_cfg_add_i16(s, FW_CFG_NB_CPUS, (uint16_t)smp_cpus);
@@ -605,25 +758,52 @@  static void fw_cfg_init1(DeviceState *dev)
     qemu_add_machine_init_done_notifier(&s->machine_ready);
 }
 
-FWCfgState *fw_cfg_init_io(uint32_t iobase)
+FWCfgState *fw_cfg_init_io_dma(uint32_t iobase, uint32_t dma_iobase,
+                                AddressSpace *dma_as)
 {
     DeviceState *dev;
+    FWCfgState *s;
+    uint32_t version = FW_CFG_VERSION;
 
     dev = qdev_create(NULL, TYPE_FW_CFG_IO);
     qdev_prop_set_uint32(dev, "iobase", iobase);
+    qdev_prop_set_uint32(dev, "dma_iobase", dma_iobase);
+
     fw_cfg_init1(dev);
+    s = FW_CFG(dev);
+
+    if (dma_as) {
+        /* 64 bits for the address field */
+        s->dma_as = dma_as;
+        s->dma_enabled = true;
+        s->dma_addr = 0;
+
+        version |= FW_CFG_VERSION_DMA;
+    }
 
-    return FW_CFG(dev);
+    fw_cfg_add_i32(s, FW_CFG_ID, version);
+
+    return s;
 }
 
-FWCfgState *fw_cfg_init_mem_wide(hwaddr ctl_addr, hwaddr data_addr,
-                                 uint32_t data_width)
+FWCfgState *fw_cfg_init_io(uint32_t iobase)
+{
+    return fw_cfg_init_io_dma(iobase, 0, NULL);
+}
+
+FWCfgState *fw_cfg_init_mem_wide(hwaddr ctl_addr,
+                                 hwaddr data_addr, uint32_t data_width,
+                                 hwaddr dma_addr, AddressSpace *dma_as)
 {
     DeviceState *dev;
     SysBusDevice *sbd;
+    FWCfgState *s;
+    uint32_t version = FW_CFG_VERSION;
+    bool dma_enabled = dma_addr && dma_as;
 
     dev = qdev_create(NULL, TYPE_FW_CFG_MEM);
     qdev_prop_set_uint32(dev, "data_width", data_width);
+    qdev_prop_set_bit(dev, "dma_enabled", dma_enabled);
 
     fw_cfg_init1(dev);
 
@@ -631,13 +811,25 @@  FWCfgState *fw_cfg_init_mem_wide(hwaddr ctl_addr, hwaddr data_addr,
     sysbus_mmio_map(sbd, 0, ctl_addr);
     sysbus_mmio_map(sbd, 1, data_addr);
 
-    return FW_CFG(dev);
+    s = FW_CFG(dev);
+
+    if (dma_enabled) {
+        s->dma_as = dma_as;
+        s->dma_addr = 0;
+        sysbus_mmio_map(sbd, 2, dma_addr);
+        version |= FW_CFG_VERSION_DMA;
+    }
+
+    fw_cfg_add_i32(s, FW_CFG_ID, version);
+
+    return s;
 }
 
 FWCfgState *fw_cfg_init_mem(hwaddr ctl_addr, hwaddr data_addr)
 {
     return fw_cfg_init_mem_wide(ctl_addr, data_addr,
-                                fw_cfg_data_mem_ops.valid.max_access_size);
+                                fw_cfg_data_mem_ops.valid.max_access_size,
+                                0, NULL);
 }
 
 
@@ -664,6 +856,7 @@  static const TypeInfo fw_cfg_info = {
 
 static Property fw_cfg_io_properties[] = {
     DEFINE_PROP_UINT32("iobase", FWCfgIoState, iobase, -1),
+    DEFINE_PROP_UINT32("dma_iobase", FWCfgIoState, dma_iobase, -1),
     DEFINE_PROP_END_OF_LIST(),
 };
 
@@ -673,8 +866,12 @@  static void fw_cfg_io_realize(DeviceState *dev, Error **errp)
     SysBusDevice *sbd = SYS_BUS_DEVICE(dev);
 
     memory_region_init_io(&s->comb_iomem, OBJECT(s), &fw_cfg_comb_mem_ops,
-                          FW_CFG(s), "fwcfg", FW_CFG_SIZE);
+                          FW_CFG(s), "fwcfg", FW_CFG_CTL_SIZE);
     sysbus_add_io(sbd, s->iobase, &s->comb_iomem);
+
+    memory_region_init_io(&s->dma_iomem, OBJECT(s), &fw_cfg_dma_mem_ops,
+                          FW_CFG(s), "fwcfg.dma", sizeof(dma_addr_t));
+    sysbus_add_io(sbd, s->dma_iobase, &s->dma_iomem);
 }
 
 static void fw_cfg_io_class_init(ObjectClass *klass, void *data)
@@ -695,6 +892,8 @@  static const TypeInfo fw_cfg_io_info = {
 
 static Property fw_cfg_mem_properties[] = {
     DEFINE_PROP_UINT32("data_width", FWCfgMemState, data_width, -1),
+    DEFINE_PROP_BOOL("dma_enabled", FWCfgMemState, parent_obj.dma_enabled,
+                     false),
     DEFINE_PROP_END_OF_LIST(),
 };
 
@@ -705,7 +904,7 @@  static void fw_cfg_mem_realize(DeviceState *dev, Error **errp)
     const MemoryRegionOps *data_ops = &fw_cfg_data_mem_ops;
 
     memory_region_init_io(&s->ctl_iomem, OBJECT(s), &fw_cfg_ctl_mem_ops,
-                          FW_CFG(s), "fwcfg.ctl", FW_CFG_SIZE);
+                          FW_CFG(s), "fwcfg.ctl", FW_CFG_CTL_SIZE);
     sysbus_init_mmio(sbd, &s->ctl_iomem);
 
     if (s->data_width > data_ops->valid.max_access_size) {
@@ -723,6 +922,12 @@  static void fw_cfg_mem_realize(DeviceState *dev, Error **errp)
     memory_region_init_io(&s->data_iomem, OBJECT(s), data_ops, FW_CFG(s),
                           "fwcfg.data", data_ops->valid.max_access_size);
     sysbus_init_mmio(sbd, &s->data_iomem);
+
+    if (FW_CFG(s)->dma_enabled) {
+        memory_region_init_io(&s->dma_iomem, OBJECT(s), &fw_cfg_dma_mem_ops,
+                              FW_CFG(s), "fwcfg.dma", sizeof(dma_addr_t));
+        sysbus_init_mmio(sbd, &s->dma_iomem);
+    }
 }
 
 static void fw_cfg_mem_class_init(ObjectClass *klass, void *data)
diff --git a/include/hw/nvram/fw_cfg.h b/include/hw/nvram/fw_cfg.h
index e60d3ca..ee0cd8a 100644
--- a/include/hw/nvram/fw_cfg.h
+++ b/include/hw/nvram/fw_cfg.h
@@ -61,6 +61,15 @@  typedef struct FWCfgFiles {
     FWCfgFile f[];
 } FWCfgFiles;
 
+/* Control as first field allows for different structures selected by this
+ * field, which might be useful in the future
+ */
+typedef struct FWCfgDmaAccess {
+    uint32_t control;
+    uint32_t length;
+    uint64_t address;
+} QEMU_PACKED FWCfgDmaAccess;
+
 typedef void (*FWCfgCallback)(void *opaque, uint8_t *data);
 typedef void (*FWCfgReadCallback)(void *opaque, uint32_t offset);
 
@@ -77,10 +86,13 @@  void fw_cfg_add_file_callback(FWCfgState *s, const char *filename,
                               void *data, size_t len);
 void *fw_cfg_modify_file(FWCfgState *s, const char *filename, void *data,
                          size_t len);
+FWCfgState *fw_cfg_init_io_dma(uint32_t iobase, uint32_t dma_iobase,
+                                AddressSpace *dma_as);
 FWCfgState *fw_cfg_init_io(uint32_t iobase);
 FWCfgState *fw_cfg_init_mem(hwaddr ctl_addr, hwaddr data_addr);
-FWCfgState *fw_cfg_init_mem_wide(hwaddr ctl_addr, hwaddr data_addr,
-                                 uint32_t data_width);
+FWCfgState *fw_cfg_init_mem_wide(hwaddr ctl_addr,
+                                 hwaddr data_addr, uint32_t data_width,
+                                 hwaddr dma_addr, AddressSpace *dma_as);
 
 FWCfgState *fw_cfg_find(void);