diff mbox series

[v3,4/4] virtio: Enable IOMMU

Message ID 20191120235801.4928-5-aik@ozlabs.ru
State Superseded
Headers show
Series virtio: Enable iommu_platform=on | expand

Commit Message

Alexey Kardashevskiy Nov. 20, 2019, 11:58 p.m. UTC
When QEMU is started with iommu_platform=on, the guest driver must accept
it or the device will fail.

This enables IOMMU support for virtio-net, -scsi, -block, -serial, -9p
devices. -serial and -9p are only compile tested though.

For virtio-net we map all RX buffers once and TX when xmit() is called
and unmap older pages when we are about to reuse the VQ descriptor.
As all other devices are synchronous, we unmap IOMMU pages right after
completion of a transaction.

This depends on QEMU's:
https://patchwork.ozlabs.org/patch/1194067/

Copied-bits-and-pieces-from: Michael Roth <mdroth@linux.vnet.ibm.com>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
Changes:
v2:
* added Mike's fs/dma-instance-function.fs
* total rework
---
 board-qemu/slof/Makefile         |  1 +
 lib/libvirtio/virtio.h           |  5 ++
 lib/libvirtio/virtio-9p.c        |  4 ++
 lib/libvirtio/virtio-blk.c       |  4 ++
 lib/libvirtio/virtio-net.c       |  5 +-
 lib/libvirtio/virtio-scsi.c      |  5 ++
 lib/libvirtio/virtio-serial.c    | 12 +++--
 lib/libvirtio/virtio.c           | 82 +++++++++++++++++++++++++++++++-
 board-qemu/slof/OF.fs            |  3 ++
 slof/fs/dma-instance-function.fs | 28 +++++++++++
 10 files changed, 143 insertions(+), 6 deletions(-)
 create mode 100644 slof/fs/dma-instance-function.fs

Comments

Michael Roth Nov. 22, 2019, 12:09 a.m. UTC | #1
Quoting Alexey Kardashevskiy (2019-11-20 17:58:01)
> When QEMU is started with iommu_platform=on, the guest driver must accept
> it or the device will fail.
> 
> This enables IOMMU support for virtio-net, -scsi, -block, -serial, -9p
> devices. -serial and -9p are only compile tested though.
> 
> For virtio-net we map all RX buffers once and TX when xmit() is called
> and unmap older pages when we are about to reuse the VQ descriptor.
> As all other devices are synchronous, we unmap IOMMU pages right after
> completion of a transaction.
> 
> This depends on QEMU's:
> https://patchwork.ozlabs.org/patch/1194067/
> 
> Copied-bits-and-pieces-from: Michael Roth <mdroth@linux.vnet.ibm.com>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
> Changes:
> v2:
> * added Mike's fs/dma-instance-function.fs
> * total rework
> ---
>  board-qemu/slof/Makefile         |  1 +
>  lib/libvirtio/virtio.h           |  5 ++
>  lib/libvirtio/virtio-9p.c        |  4 ++
>  lib/libvirtio/virtio-blk.c       |  4 ++
>  lib/libvirtio/virtio-net.c       |  5 +-
>  lib/libvirtio/virtio-scsi.c      |  5 ++
>  lib/libvirtio/virtio-serial.c    | 12 +++--
>  lib/libvirtio/virtio.c           | 82 +++++++++++++++++++++++++++++++-
>  board-qemu/slof/OF.fs            |  3 ++
>  slof/fs/dma-instance-function.fs | 28 +++++++++++
>  10 files changed, 143 insertions(+), 6 deletions(-)
>  create mode 100644 slof/fs/dma-instance-function.fs
> 
> diff --git a/board-qemu/slof/Makefile b/board-qemu/slof/Makefile
> index 2263e751bde9..d7ed2d7a6f18 100644
> --- a/board-qemu/slof/Makefile
> +++ b/board-qemu/slof/Makefile
> @@ -99,6 +99,7 @@ OF_FFS_FILES = \
>         $(SLOFCMNDIR)/fs/graphics.fs \
>         $(SLOFCMNDIR)/fs/generic-disk.fs \
>         $(SLOFCMNDIR)/fs/dma-function.fs \
> +       $(SLOFCMNDIR)/fs/dma-instance-function.fs \
>         $(SLOFCMNDIR)/fs/pci-device.fs \
>         $(SLOFCMNDIR)/fs/pci-bridge.fs \
>         $(SLOFCMNDIR)/fs/pci-properties.fs \
> diff --git a/lib/libvirtio/virtio.h b/lib/libvirtio/virtio.h
> index 4927a97f9f5f..250120b3112b 100644
> --- a/lib/libvirtio/virtio.h
> +++ b/lib/libvirtio/virtio.h
> @@ -29,6 +29,7 @@
>  #define VIRTIO_F_RING_INDIRECT_DESC    BIT(28)
>  #define VIRTIO_F_RING_EVENT_IDX                BIT(29)
>  #define VIRTIO_F_VERSION_1             BIT(32)
> +#define VIRTIO_F_IOMMU_PLATFORM         BIT(33)
> 
>  #define VIRTIO_TIMEOUT                 5000 /* 5 sec timeout */
> 
> @@ -84,6 +85,8 @@ struct vqs {
>         struct vring_desc *desc;
>         struct vring_avail *avail;
>         struct vring_used *used;
> +       void **desc_gpas; /* to get gpa from desc->addr (which is ioba) */
> +       uint64_t bus_desc;
>  };
> 
>  struct virtio_device {
> @@ -109,6 +112,8 @@ extern struct vring_used *virtio_get_vring_used(struct virtio_device *dev, int q
>  extern void virtio_fill_desc(struct vqs *vq, int id, uint64_t features,
>                               uint64_t addr, uint32_t len,
>                               uint16_t flags, uint16_t next);
> +extern void virtio_free_desc(struct vqs *vq, int id, uint64_t features);
> +void *virtio_desc_addr(struct virtio_device *vdev, int queue, int id);
>  extern struct vqs *virtio_queue_init_vq(struct virtio_device *dev, unsigned int id);
>  extern void virtio_queue_term_vq(struct virtio_device *dev, struct vqs *vq, unsigned int id);
> 
> diff --git a/lib/libvirtio/virtio-9p.c b/lib/libvirtio/virtio-9p.c
> index 426069fe9509..76078612b06e 100644
> --- a/lib/libvirtio/virtio-9p.c
> +++ b/lib/libvirtio/virtio-9p.c
> @@ -129,6 +129,10 @@ static int virtio_9p_transact(void *opaque, uint8_t *tx, int tx_size, uint8_t *r
>                 // do something better
>                 mb();
>         }
> +
> +       virtio_free_desc(vq, id, dev->features);
> +       virtio_free_desc(vq, id + 1, dev->features);
> +
>         if (i == 0) {
>                 return -1;
>         }
> diff --git a/lib/libvirtio/virtio-blk.c b/lib/libvirtio/virtio-blk.c
> index a0dadbb0d6a8..0363038e559d 100644
> --- a/lib/libvirtio/virtio-blk.c
> +++ b/lib/libvirtio/virtio-blk.c
> @@ -195,6 +195,10 @@ virtioblk_transfer(struct virtio_device *dev, char *buf, uint64_t blocknum,
>                         break;
>         }
> 
> +       virtio_free_desc(vq, id, dev->features);
> +       virtio_free_desc(vq, id + 1, dev->features);
> +       virtio_free_desc(vq, id + 2, dev->features);
> +
>         if (status == 0)
>                 return cnt;
> 
> diff --git a/lib/libvirtio/virtio-net.c b/lib/libvirtio/virtio-net.c
> index ae67883020ef..5a0d19088527 100644
> --- a/lib/libvirtio/virtio-net.c
> +++ b/lib/libvirtio/virtio-net.c
> @@ -255,6 +255,9 @@ static int virtionet_xmit(struct virtio_net *vnet, char *buf, int len)
>         idx = virtio_modern16_to_cpu(vdev, vq_tx->avail->idx);
>         id = (idx * 2) % vq_tx->size;
> 
> +       virtio_free_desc(vq_tx, id, vdev->features);
> +       virtio_free_desc(vq_tx, id + 1, vdev->features);
> +
>         /* Set up virtqueue descriptor for header */
>         virtio_fill_desc(vq_tx, id, vdev->features, (uint64_t)nethdr,
>                          net_hdr_size, VRING_DESC_F_NEXT, id + 1);
> @@ -317,7 +320,7 @@ static int virtionet_receive(struct virtio_net *vnet, char *buf, int maxlen)
>  #endif
> 
>         /* Copy data to destination buffer */
> -       memcpy(buf, (void *)virtio_modern64_to_cpu(vdev, vq_rx->desc[id].addr), len);
> +       memcpy(buf, virtio_desc_addr(vdev, VQ_RX, id), len);
> 
>         /* Move indices to next entries */
>         last_rx_idx = last_rx_idx + 1;
> diff --git a/lib/libvirtio/virtio-scsi.c b/lib/libvirtio/virtio-scsi.c
> index 251661e8d466..44573d10e24c 100644
> --- a/lib/libvirtio/virtio-scsi.c
> +++ b/lib/libvirtio/virtio-scsi.c
> @@ -81,6 +81,11 @@ int virtioscsi_send(struct virtio_device *dev,
>                         break;
>         }
> 
> +       virtio_free_desc(vq, id, dev->features);
> +       virtio_free_desc(vq, id + 1, dev->features);
> +       if (!(buf == NULL || buf_len == 0))
> +               virtio_free_desc(vq, id + 2, dev->features);
> +
>         return 0;
>  }
> 
> diff --git a/lib/libvirtio/virtio-serial.c b/lib/libvirtio/virtio-serial.c
> index 78ab0e25d7db..66fbfe8b6cc6 100644
> --- a/lib/libvirtio/virtio-serial.c
> +++ b/lib/libvirtio/virtio-serial.c
> @@ -108,7 +108,7 @@ void virtio_serial_shutdown(struct virtio_device *dev)
> 
>  int virtio_serial_putchar(struct virtio_device *dev, char c)
>  {
> -       int id;
> +       int id, ret;
>         uint32_t vq_size, time;
>         volatile uint16_t *current_used_idx;
>         uint16_t last_used_idx, avail_idx;
> @@ -135,17 +135,21 @@ int virtio_serial_putchar(struct virtio_device *dev, char c)
>         virtio_queue_notify(dev, TX_Q);
> 
>         /* Wait for host to consume the descriptor */
> +       ret = 1;
>         time = SLOF_GetTimer() + VIRTIO_TIMEOUT;
>         while (*current_used_idx == last_used_idx) {
>                 // do something better
>                 mb();
>                 if (time < SLOF_GetTimer()) {
>                         printf("virtio_serial_putchar failed! \n");
> -                       return 0;
> +                       ret = 0;
> +                       break;
>                 }
>         }
> 
> -       return 1;
> +       virtio_free_desc(vq, id, dev->features);
> +
> +       return ret;
>  }
> 
>  char virtio_serial_getchar(struct virtio_device *dev)
> @@ -165,7 +169,7 @@ char virtio_serial_getchar(struct virtio_device *dev)
>                 % vq_rx->size;
> 
>         /* Copy data to destination buffer */
> -       memcpy(buf, (void *)virtio_modern64_to_cpu(dev, vq_rx->desc[id - 1].addr), RX_ELEM_SIZE);
> +       memcpy(buf, virtio_desc_addr(dev, RX_Q, id - 1), RX_ELEM_SIZE);
> 
>         /* Move indices to next entries */
>         last_rx_idx = last_rx_idx + 1;
> diff --git a/lib/libvirtio/virtio.c b/lib/libvirtio/virtio.c
> index fff0c7e54c19..d4db5f656888 100644
> --- a/lib/libvirtio/virtio.c
> +++ b/lib/libvirtio/virtio.c
> @@ -273,6 +273,17 @@ void virtio_fill_desc(struct vqs *vq, int id, uint64_t features,
>         next %= vq->size;
> 
>         if (features & VIRTIO_F_VERSION_1) {
> +               if (features & VIRTIO_F_IOMMU_PLATFORM) {
> +                       void *gpa = (void *) addr;
> +
> +                       if (!vq->desc_gpas) {
> +                               fprintf(stderr, "IOMMU setup has not been done!\n");
> +                               return;
> +                       }
> +
> +                       addr = SLOF_dma_map_in(gpa, len, 0);
> +                       vq->desc_gpas[id] = gpa;
> +               }
>                 desc->addr = cpu_to_le64(addr);
>                 desc->len = cpu_to_le32(len);
>                 desc->flags = cpu_to_le16(flags);
> @@ -285,6 +296,32 @@ void virtio_fill_desc(struct vqs *vq, int id, uint64_t features,
>         }
>  }
> 
> +void virtio_free_desc(struct vqs *vq, int id, uint64_t features)
> +{
> +       struct vring_desc *desc;
> +
> +       id %= vq->size;
> +       desc = &vq->desc[id];
> +
> +       if (features & VIRTIO_F_VERSION_1) {
> +               if (features & VIRTIO_F_IOMMU_PLATFORM) {
> +                       SLOF_dma_map_out(le64_to_cpu(desc->addr),
> +                                        0, le32_to_cpu(desc->len));
> +                       vq->desc_gpas[id] = NULL;
> +               }
> +       }
> +}

On a 4.18 RHEL8 kernel I was unable to boot from virtio-blk, it complains the
disk isn't bootable and the H_PUT_TCE traces show:

 qemu-system-ppc-19019 [045] .... 786844.067356: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x1000,0xe46b003,0xdc73350
 qemu-system-ppc-19019 [045] .... 786844.067358: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
 qemu-system-ppc-19019 [045] .... 786844.067361: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x2000,0xe46c003,0xdc73350
 qemu-system-ppc-19019 [045] .... 786844.067362: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
 qemu-system-ppc-19019 [041] .... 786844.070532: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x3000,0xe67b003,0xdc73490
 qemu-system-ppc-19019 [041] .... 786844.070533: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
 qemu-system-ppc-19019 [041] .... 786844.070817: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x4000,0xe469003,0xdc73490
 qemu-system-ppc-19019 [041] .... 786844.070818: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
 qemu-system-ppc-19019 [041] .... 786844.071102: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x5000,0xe67a003,0xdc73490
 qemu-system-ppc-19019 [041] .... 786844.071103: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
 qemu-system-ppc-19019 [041] .... 786844.071123: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
 qemu-system-ppc-19019 [041] .... 786844.073532: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x3228,0x0,0xdc73488
 qemu-system-ppc-19019 [041] .... 786844.073534: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_PARAMETER

If I disable in-kernel TCE acceleration, or use TCG, it seems to work
fine. If I add the following hack it seems to fix things:

  @@ -305,7 +307,8 @@ void virtio_free_desc(struct vqs *vq, int id, uint64_t features)
   
          if (features & VIRTIO_F_VERSION_1) {
                  if (features & VIRTIO_F_IOMMU_PLATFORM) {
  -                       SLOF_dma_map_out(le64_to_cpu(desc->addr),
  +                       fprintf(stderr, "mapping out desc->addr: %llx\n", le64_to_cpu(desc->addr));
  +                       SLOF_dma_map_out(le64_to_cpu(desc->addr) & ~(0xfff),
                                           0, le32_to_cpu(desc->len));
                          vq->desc_gpas[id] = NULL;
                  }

I guess the kernel is a bit stricter about unmapping unaligned TCE entries? If
so, maybe we should align automatically in dma-map-out?

> +
> +void *virtio_desc_addr(struct virtio_device *vdev, int queue, int id)
> +{
> +       struct vqs *vq = &vdev->vq[queue];
> +
> +       if (vq->desc_gpas)
> +               return vq->desc_gpas[id];
> +
> +       return (void *) virtio_modern64_to_cpu(vdev, vq->desc[id].addr);
> +}
> +
>  /**
>   * Reset virtio device
>   */

<snip>

> diff --git a/board-qemu/slof/OF.fs b/board-qemu/slof/OF.fs
> index a85f6c558e67..3e117ad03e09 100644
> --- a/board-qemu/slof/OF.fs
> +++ b/board-qemu/slof/OF.fs
> @@ -143,6 +143,9 @@ check-for-nvramrc
> 
>  8a0 cp
> 
> +\ For DMA functions used by client/package instances.
> +#include "dma-instance-function.fs"
> +
>  \ The client interface.
>  #include "client.fs"
>  \ ELF binary file format.
> diff --git a/slof/fs/dma-instance-function.fs b/slof/fs/dma-instance-function.fs
> new file mode 100644
> index 000000000000..057181be0f44
> --- /dev/null
> +++ b/slof/fs/dma-instance-function.fs
> @@ -0,0 +1,28 @@
> +\ ****************************************************************************/
> +\ * Copyright (c) 2011 IBM Corporation

I copy/pasted the license header and forgot to update, should probably change
to 2019.

Also, if you think you need it:

Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>

> +\ * All rights reserved.
> +\ * This program and the accompanying materials
> +\ * are made available under the terms of the BSD License
> +\ * which accompanies this distribution, and is available at
> +\ * http://www.opensource.org/licenses/bsd-license.php
> +\ *
> +\ * Contributors:
> +\ *     IBM Corporation - initial implementation
> +\ ****************************************************************************/
> +
> +\ DMA memory allocation functions
> +: dma-alloc ( size -- virt )
> +   s" dma-alloc" $call-parent
> +;
> +
> +: dma-free ( virt size -- )
> +   s" dma-free" $call-parent
> +;
> +
> +: dma-map-in ( virt size cacheable? -- devaddr )
> +   s" dma-map-in" $call-parent
> +;
> +
> +: dma-map-out ( virt devaddr size -- )
> +   s" dma-map-out" $call-parent
> +;
> -- 
> 2.17.1
> 
> _______________________________________________
> SLOF mailing list
> SLOF@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/slof
David Gibson Nov. 22, 2019, 1:11 a.m. UTC | #2
On Thu, Nov 21, 2019 at 06:09:56PM -0600, Michael Roth wrote:
> Quoting Alexey Kardashevskiy (2019-11-20 17:58:01)
> > When QEMU is started with iommu_platform=on, the guest driver must accept
> > it or the device will fail.
> > 
> > This enables IOMMU support for virtio-net, -scsi, -block, -serial, -9p
> > devices. -serial and -9p are only compile tested though.
> > 
> > For virtio-net we map all RX buffers once and TX when xmit() is called
> > and unmap older pages when we are about to reuse the VQ descriptor.
> > As all other devices are synchronous, we unmap IOMMU pages right after
> > completion of a transaction.
> > 
> > This depends on QEMU's:
> > https://patchwork.ozlabs.org/patch/1194067/
> > 
> > Copied-bits-and-pieces-from: Michael Roth <mdroth@linux.vnet.ibm.com>
> > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> > ---
> > Changes:
> > v2:
> > * added Mike's fs/dma-instance-function.fs
> > * total rework
> > ---
> >  board-qemu/slof/Makefile         |  1 +
> >  lib/libvirtio/virtio.h           |  5 ++
> >  lib/libvirtio/virtio-9p.c        |  4 ++
> >  lib/libvirtio/virtio-blk.c       |  4 ++
> >  lib/libvirtio/virtio-net.c       |  5 +-
> >  lib/libvirtio/virtio-scsi.c      |  5 ++
> >  lib/libvirtio/virtio-serial.c    | 12 +++--
> >  lib/libvirtio/virtio.c           | 82 +++++++++++++++++++++++++++++++-
> >  board-qemu/slof/OF.fs            |  3 ++
> >  slof/fs/dma-instance-function.fs | 28 +++++++++++
> >  10 files changed, 143 insertions(+), 6 deletions(-)
> >  create mode 100644 slof/fs/dma-instance-function.fs
> > 
> > diff --git a/board-qemu/slof/Makefile b/board-qemu/slof/Makefile
> > index 2263e751bde9..d7ed2d7a6f18 100644
> > --- a/board-qemu/slof/Makefile
> > +++ b/board-qemu/slof/Makefile
> > @@ -99,6 +99,7 @@ OF_FFS_FILES = \
> >         $(SLOFCMNDIR)/fs/graphics.fs \
> >         $(SLOFCMNDIR)/fs/generic-disk.fs \
> >         $(SLOFCMNDIR)/fs/dma-function.fs \
> > +       $(SLOFCMNDIR)/fs/dma-instance-function.fs \
> >         $(SLOFCMNDIR)/fs/pci-device.fs \
> >         $(SLOFCMNDIR)/fs/pci-bridge.fs \
> >         $(SLOFCMNDIR)/fs/pci-properties.fs \
> > diff --git a/lib/libvirtio/virtio.h b/lib/libvirtio/virtio.h
> > index 4927a97f9f5f..250120b3112b 100644
> > --- a/lib/libvirtio/virtio.h
> > +++ b/lib/libvirtio/virtio.h
> > @@ -29,6 +29,7 @@
> >  #define VIRTIO_F_RING_INDIRECT_DESC    BIT(28)
> >  #define VIRTIO_F_RING_EVENT_IDX                BIT(29)
> >  #define VIRTIO_F_VERSION_1             BIT(32)
> > +#define VIRTIO_F_IOMMU_PLATFORM         BIT(33)
> > 
> >  #define VIRTIO_TIMEOUT                 5000 /* 5 sec timeout */
> > 
> > @@ -84,6 +85,8 @@ struct vqs {
> >         struct vring_desc *desc;
> >         struct vring_avail *avail;
> >         struct vring_used *used;
> > +       void **desc_gpas; /* to get gpa from desc->addr (which is ioba) */
> > +       uint64_t bus_desc;
> >  };
> > 
> >  struct virtio_device {
> > @@ -109,6 +112,8 @@ extern struct vring_used *virtio_get_vring_used(struct virtio_device *dev, int q
> >  extern void virtio_fill_desc(struct vqs *vq, int id, uint64_t features,
> >                               uint64_t addr, uint32_t len,
> >                               uint16_t flags, uint16_t next);
> > +extern void virtio_free_desc(struct vqs *vq, int id, uint64_t features);
> > +void *virtio_desc_addr(struct virtio_device *vdev, int queue, int id);
> >  extern struct vqs *virtio_queue_init_vq(struct virtio_device *dev, unsigned int id);
> >  extern void virtio_queue_term_vq(struct virtio_device *dev, struct vqs *vq, unsigned int id);
> > 
> > diff --git a/lib/libvirtio/virtio-9p.c b/lib/libvirtio/virtio-9p.c
> > index 426069fe9509..76078612b06e 100644
> > --- a/lib/libvirtio/virtio-9p.c
> > +++ b/lib/libvirtio/virtio-9p.c
> > @@ -129,6 +129,10 @@ static int virtio_9p_transact(void *opaque, uint8_t *tx, int tx_size, uint8_t *r
> >                 // do something better
> >                 mb();
> >         }
> > +
> > +       virtio_free_desc(vq, id, dev->features);
> > +       virtio_free_desc(vq, id + 1, dev->features);
> > +
> >         if (i == 0) {
> >                 return -1;
> >         }
> > diff --git a/lib/libvirtio/virtio-blk.c b/lib/libvirtio/virtio-blk.c
> > index a0dadbb0d6a8..0363038e559d 100644
> > --- a/lib/libvirtio/virtio-blk.c
> > +++ b/lib/libvirtio/virtio-blk.c
> > @@ -195,6 +195,10 @@ virtioblk_transfer(struct virtio_device *dev, char *buf, uint64_t blocknum,
> >                         break;
> >         }
> > 
> > +       virtio_free_desc(vq, id, dev->features);
> > +       virtio_free_desc(vq, id + 1, dev->features);
> > +       virtio_free_desc(vq, id + 2, dev->features);
> > +
> >         if (status == 0)
> >                 return cnt;
> > 
> > diff --git a/lib/libvirtio/virtio-net.c b/lib/libvirtio/virtio-net.c
> > index ae67883020ef..5a0d19088527 100644
> > --- a/lib/libvirtio/virtio-net.c
> > +++ b/lib/libvirtio/virtio-net.c
> > @@ -255,6 +255,9 @@ static int virtionet_xmit(struct virtio_net *vnet, char *buf, int len)
> >         idx = virtio_modern16_to_cpu(vdev, vq_tx->avail->idx);
> >         id = (idx * 2) % vq_tx->size;
> > 
> > +       virtio_free_desc(vq_tx, id, vdev->features);
> > +       virtio_free_desc(vq_tx, id + 1, vdev->features);
> > +
> >         /* Set up virtqueue descriptor for header */
> >         virtio_fill_desc(vq_tx, id, vdev->features, (uint64_t)nethdr,
> >                          net_hdr_size, VRING_DESC_F_NEXT, id + 1);
> > @@ -317,7 +320,7 @@ static int virtionet_receive(struct virtio_net *vnet, char *buf, int maxlen)
> >  #endif
> > 
> >         /* Copy data to destination buffer */
> > -       memcpy(buf, (void *)virtio_modern64_to_cpu(vdev, vq_rx->desc[id].addr), len);
> > +       memcpy(buf, virtio_desc_addr(vdev, VQ_RX, id), len);
> > 
> >         /* Move indices to next entries */
> >         last_rx_idx = last_rx_idx + 1;
> > diff --git a/lib/libvirtio/virtio-scsi.c b/lib/libvirtio/virtio-scsi.c
> > index 251661e8d466..44573d10e24c 100644
> > --- a/lib/libvirtio/virtio-scsi.c
> > +++ b/lib/libvirtio/virtio-scsi.c
> > @@ -81,6 +81,11 @@ int virtioscsi_send(struct virtio_device *dev,
> >                         break;
> >         }
> > 
> > +       virtio_free_desc(vq, id, dev->features);
> > +       virtio_free_desc(vq, id + 1, dev->features);
> > +       if (!(buf == NULL || buf_len == 0))
> > +               virtio_free_desc(vq, id + 2, dev->features);
> > +
> >         return 0;
> >  }
> > 
> > diff --git a/lib/libvirtio/virtio-serial.c b/lib/libvirtio/virtio-serial.c
> > index 78ab0e25d7db..66fbfe8b6cc6 100644
> > --- a/lib/libvirtio/virtio-serial.c
> > +++ b/lib/libvirtio/virtio-serial.c
> > @@ -108,7 +108,7 @@ void virtio_serial_shutdown(struct virtio_device *dev)
> > 
> >  int virtio_serial_putchar(struct virtio_device *dev, char c)
> >  {
> > -       int id;
> > +       int id, ret;
> >         uint32_t vq_size, time;
> >         volatile uint16_t *current_used_idx;
> >         uint16_t last_used_idx, avail_idx;
> > @@ -135,17 +135,21 @@ int virtio_serial_putchar(struct virtio_device *dev, char c)
> >         virtio_queue_notify(dev, TX_Q);
> > 
> >         /* Wait for host to consume the descriptor */
> > +       ret = 1;
> >         time = SLOF_GetTimer() + VIRTIO_TIMEOUT;
> >         while (*current_used_idx == last_used_idx) {
> >                 // do something better
> >                 mb();
> >                 if (time < SLOF_GetTimer()) {
> >                         printf("virtio_serial_putchar failed! \n");
> > -                       return 0;
> > +                       ret = 0;
> > +                       break;
> >                 }
> >         }
> > 
> > -       return 1;
> > +       virtio_free_desc(vq, id, dev->features);
> > +
> > +       return ret;
> >  }
> > 
> >  char virtio_serial_getchar(struct virtio_device *dev)
> > @@ -165,7 +169,7 @@ char virtio_serial_getchar(struct virtio_device *dev)
> >                 % vq_rx->size;
> > 
> >         /* Copy data to destination buffer */
> > -       memcpy(buf, (void *)virtio_modern64_to_cpu(dev, vq_rx->desc[id - 1].addr), RX_ELEM_SIZE);
> > +       memcpy(buf, virtio_desc_addr(dev, RX_Q, id - 1), RX_ELEM_SIZE);
> > 
> >         /* Move indices to next entries */
> >         last_rx_idx = last_rx_idx + 1;
> > diff --git a/lib/libvirtio/virtio.c b/lib/libvirtio/virtio.c
> > index fff0c7e54c19..d4db5f656888 100644
> > --- a/lib/libvirtio/virtio.c
> > +++ b/lib/libvirtio/virtio.c
> > @@ -273,6 +273,17 @@ void virtio_fill_desc(struct vqs *vq, int id, uint64_t features,
> >         next %= vq->size;
> > 
> >         if (features & VIRTIO_F_VERSION_1) {
> > +               if (features & VIRTIO_F_IOMMU_PLATFORM) {
> > +                       void *gpa = (void *) addr;
> > +
> > +                       if (!vq->desc_gpas) {
> > +                               fprintf(stderr, "IOMMU setup has not been done!\n");
> > +                               return;
> > +                       }
> > +
> > +                       addr = SLOF_dma_map_in(gpa, len, 0);
> > +                       vq->desc_gpas[id] = gpa;
> > +               }
> >                 desc->addr = cpu_to_le64(addr);
> >                 desc->len = cpu_to_le32(len);
> >                 desc->flags = cpu_to_le16(flags);
> > @@ -285,6 +296,32 @@ void virtio_fill_desc(struct vqs *vq, int id, uint64_t features,
> >         }
> >  }
> > 
> > +void virtio_free_desc(struct vqs *vq, int id, uint64_t features)
> > +{
> > +       struct vring_desc *desc;
> > +
> > +       id %= vq->size;
> > +       desc = &vq->desc[id];
> > +
> > +       if (features & VIRTIO_F_VERSION_1) {
> > +               if (features & VIRTIO_F_IOMMU_PLATFORM) {
> > +                       SLOF_dma_map_out(le64_to_cpu(desc->addr),
> > +                                        0, le32_to_cpu(desc->len));
> > +                       vq->desc_gpas[id] = NULL;
> > +               }
> > +       }
> > +}
> 
> On a 4.18 RHEL8 kernel I was unable to boot from virtio-blk, it complains the
> disk isn't bootable and the H_PUT_TCE traces show:
> 
>  qemu-system-ppc-19019 [045] .... 786844.067356: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x1000,0xe46b003,0xdc73350
>  qemu-system-ppc-19019 [045] .... 786844.067358: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
>  qemu-system-ppc-19019 [045] .... 786844.067361: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x2000,0xe46c003,0xdc73350
>  qemu-system-ppc-19019 [045] .... 786844.067362: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
>  qemu-system-ppc-19019 [041] .... 786844.070532: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x3000,0xe67b003,0xdc73490
>  qemu-system-ppc-19019 [041] .... 786844.070533: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
>  qemu-system-ppc-19019 [041] .... 786844.070817: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x4000,0xe469003,0xdc73490
>  qemu-system-ppc-19019 [041] .... 786844.070818: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
>  qemu-system-ppc-19019 [041] .... 786844.071102: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x5000,0xe67a003,0xdc73490
>  qemu-system-ppc-19019 [041] .... 786844.071103: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
>  qemu-system-ppc-19019 [041] .... 786844.071123: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
>  qemu-system-ppc-19019 [041] .... 786844.073532: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x3228,0x0,0xdc73488
>  qemu-system-ppc-19019 [041] .... 786844.073534: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_PARAMETER
> 
> If I disable in-kernel TCE acceleration, or use TCG, it seems to work
> fine. If I add the following hack it seems to fix things:
> 
>   @@ -305,7 +307,8 @@ void virtio_free_desc(struct vqs *vq, int id, uint64_t features)
>    
>           if (features & VIRTIO_F_VERSION_1) {
>                   if (features & VIRTIO_F_IOMMU_PLATFORM) {
>   -                       SLOF_dma_map_out(le64_to_cpu(desc->addr),
>   +                       fprintf(stderr, "mapping out desc->addr: %llx\n", le64_to_cpu(desc->addr));
>   +                       SLOF_dma_map_out(le64_to_cpu(desc->addr) & ~(0xfff),
>                                            0, le32_to_cpu(desc->len));
>                           vq->desc_gpas[id] = NULL;
>                   }
> 
> I guess the kernel is a bit stricter about unmapping unaligned TCE entries? If
> so, maybe we should align automatically in dma-map-out?

Yes, I think we should.

It's probably also a good idea to match the qemu and kernel behaviour
to each other.  PAPR doesn't seem to specifically say what to do about
unaligned IOBAs, but I'm inclined to make qemu stricter rather than
making the kernel less strict.  Any counter opinions?
Alexey Kardashevskiy Nov. 22, 2019, 1:49 a.m. UTC | #3
On 22/11/2019 11:09, Michael Roth wrote:
> Quoting Alexey Kardashevskiy (2019-11-20 17:58:01)
>> When QEMU is started with iommu_platform=on, the guest driver must accept
>> it or the device will fail.
>>
>> This enables IOMMU support for virtio-net, -scsi, -block, -serial, -9p
>> devices. -serial and -9p are only compile tested though.
>>
>> For virtio-net we map all RX buffers once and TX when xmit() is called
>> and unmap older pages when we are about to reuse the VQ descriptor.
>> As all other devices are synchronous, we unmap IOMMU pages right after
>> completion of a transaction.
>>
>> This depends on QEMU's:
>> https://patchwork.ozlabs.org/patch/1194067/
>>
>> Copied-bits-and-pieces-from: Michael Roth <mdroth@linux.vnet.ibm.com>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>> Changes:
>> v2:
>> * added Mike's fs/dma-instance-function.fs
>> * total rework
>> ---
>>  board-qemu/slof/Makefile         |  1 +
>>  lib/libvirtio/virtio.h           |  5 ++
>>  lib/libvirtio/virtio-9p.c        |  4 ++
>>  lib/libvirtio/virtio-blk.c       |  4 ++
>>  lib/libvirtio/virtio-net.c       |  5 +-
>>  lib/libvirtio/virtio-scsi.c      |  5 ++
>>  lib/libvirtio/virtio-serial.c    | 12 +++--
>>  lib/libvirtio/virtio.c           | 82 +++++++++++++++++++++++++++++++-
>>  board-qemu/slof/OF.fs            |  3 ++
>>  slof/fs/dma-instance-function.fs | 28 +++++++++++
>>  10 files changed, 143 insertions(+), 6 deletions(-)
>>  create mode 100644 slof/fs/dma-instance-function.fs
>>
>> diff --git a/board-qemu/slof/Makefile b/board-qemu/slof/Makefile
>> index 2263e751bde9..d7ed2d7a6f18 100644
>> --- a/board-qemu/slof/Makefile
>> +++ b/board-qemu/slof/Makefile
>> @@ -99,6 +99,7 @@ OF_FFS_FILES = \
>>         $(SLOFCMNDIR)/fs/graphics.fs \
>>         $(SLOFCMNDIR)/fs/generic-disk.fs \
>>         $(SLOFCMNDIR)/fs/dma-function.fs \
>> +       $(SLOFCMNDIR)/fs/dma-instance-function.fs \
>>         $(SLOFCMNDIR)/fs/pci-device.fs \
>>         $(SLOFCMNDIR)/fs/pci-bridge.fs \
>>         $(SLOFCMNDIR)/fs/pci-properties.fs \
>> diff --git a/lib/libvirtio/virtio.h b/lib/libvirtio/virtio.h
>> index 4927a97f9f5f..250120b3112b 100644
>> --- a/lib/libvirtio/virtio.h
>> +++ b/lib/libvirtio/virtio.h
>> @@ -29,6 +29,7 @@
>>  #define VIRTIO_F_RING_INDIRECT_DESC    BIT(28)
>>  #define VIRTIO_F_RING_EVENT_IDX                BIT(29)
>>  #define VIRTIO_F_VERSION_1             BIT(32)
>> +#define VIRTIO_F_IOMMU_PLATFORM         BIT(33)
>>
>>  #define VIRTIO_TIMEOUT                 5000 /* 5 sec timeout */
>>
>> @@ -84,6 +85,8 @@ struct vqs {
>>         struct vring_desc *desc;
>>         struct vring_avail *avail;
>>         struct vring_used *used;
>> +       void **desc_gpas; /* to get gpa from desc->addr (which is ioba) */
>> +       uint64_t bus_desc;
>>  };
>>
>>  struct virtio_device {
>> @@ -109,6 +112,8 @@ extern struct vring_used *virtio_get_vring_used(struct virtio_device *dev, int q
>>  extern void virtio_fill_desc(struct vqs *vq, int id, uint64_t features,
>>                               uint64_t addr, uint32_t len,
>>                               uint16_t flags, uint16_t next);
>> +extern void virtio_free_desc(struct vqs *vq, int id, uint64_t features);
>> +void *virtio_desc_addr(struct virtio_device *vdev, int queue, int id);
>>  extern struct vqs *virtio_queue_init_vq(struct virtio_device *dev, unsigned int id);
>>  extern void virtio_queue_term_vq(struct virtio_device *dev, struct vqs *vq, unsigned int id);
>>
>> diff --git a/lib/libvirtio/virtio-9p.c b/lib/libvirtio/virtio-9p.c
>> index 426069fe9509..76078612b06e 100644
>> --- a/lib/libvirtio/virtio-9p.c
>> +++ b/lib/libvirtio/virtio-9p.c
>> @@ -129,6 +129,10 @@ static int virtio_9p_transact(void *opaque, uint8_t *tx, int tx_size, uint8_t *r
>>                 // do something better
>>                 mb();
>>         }
>> +
>> +       virtio_free_desc(vq, id, dev->features);
>> +       virtio_free_desc(vq, id + 1, dev->features);
>> +
>>         if (i == 0) {
>>                 return -1;
>>         }
>> diff --git a/lib/libvirtio/virtio-blk.c b/lib/libvirtio/virtio-blk.c
>> index a0dadbb0d6a8..0363038e559d 100644
>> --- a/lib/libvirtio/virtio-blk.c
>> +++ b/lib/libvirtio/virtio-blk.c
>> @@ -195,6 +195,10 @@ virtioblk_transfer(struct virtio_device *dev, char *buf, uint64_t blocknum,
>>                         break;
>>         }
>>
>> +       virtio_free_desc(vq, id, dev->features);
>> +       virtio_free_desc(vq, id + 1, dev->features);
>> +       virtio_free_desc(vq, id + 2, dev->features);
>> +
>>         if (status == 0)
>>                 return cnt;
>>
>> diff --git a/lib/libvirtio/virtio-net.c b/lib/libvirtio/virtio-net.c
>> index ae67883020ef..5a0d19088527 100644
>> --- a/lib/libvirtio/virtio-net.c
>> +++ b/lib/libvirtio/virtio-net.c
>> @@ -255,6 +255,9 @@ static int virtionet_xmit(struct virtio_net *vnet, char *buf, int len)
>>         idx = virtio_modern16_to_cpu(vdev, vq_tx->avail->idx);
>>         id = (idx * 2) % vq_tx->size;
>>
>> +       virtio_free_desc(vq_tx, id, vdev->features);
>> +       virtio_free_desc(vq_tx, id + 1, vdev->features);
>> +
>>         /* Set up virtqueue descriptor for header */
>>         virtio_fill_desc(vq_tx, id, vdev->features, (uint64_t)nethdr,
>>                          net_hdr_size, VRING_DESC_F_NEXT, id + 1);
>> @@ -317,7 +320,7 @@ static int virtionet_receive(struct virtio_net *vnet, char *buf, int maxlen)
>>  #endif
>>
>>         /* Copy data to destination buffer */
>> -       memcpy(buf, (void *)virtio_modern64_to_cpu(vdev, vq_rx->desc[id].addr), len);
>> +       memcpy(buf, virtio_desc_addr(vdev, VQ_RX, id), len);
>>
>>         /* Move indices to next entries */
>>         last_rx_idx = last_rx_idx + 1;
>> diff --git a/lib/libvirtio/virtio-scsi.c b/lib/libvirtio/virtio-scsi.c
>> index 251661e8d466..44573d10e24c 100644
>> --- a/lib/libvirtio/virtio-scsi.c
>> +++ b/lib/libvirtio/virtio-scsi.c
>> @@ -81,6 +81,11 @@ int virtioscsi_send(struct virtio_device *dev,
>>                         break;
>>         }
>>
>> +       virtio_free_desc(vq, id, dev->features);
>> +       virtio_free_desc(vq, id + 1, dev->features);
>> +       if (!(buf == NULL || buf_len == 0))
>> +               virtio_free_desc(vq, id + 2, dev->features);
>> +
>>         return 0;
>>  }
>>
>> diff --git a/lib/libvirtio/virtio-serial.c b/lib/libvirtio/virtio-serial.c
>> index 78ab0e25d7db..66fbfe8b6cc6 100644
>> --- a/lib/libvirtio/virtio-serial.c
>> +++ b/lib/libvirtio/virtio-serial.c
>> @@ -108,7 +108,7 @@ void virtio_serial_shutdown(struct virtio_device *dev)
>>
>>  int virtio_serial_putchar(struct virtio_device *dev, char c)
>>  {
>> -       int id;
>> +       int id, ret;
>>         uint32_t vq_size, time;
>>         volatile uint16_t *current_used_idx;
>>         uint16_t last_used_idx, avail_idx;
>> @@ -135,17 +135,21 @@ int virtio_serial_putchar(struct virtio_device *dev, char c)
>>         virtio_queue_notify(dev, TX_Q);
>>
>>         /* Wait for host to consume the descriptor */
>> +       ret = 1;
>>         time = SLOF_GetTimer() + VIRTIO_TIMEOUT;
>>         while (*current_used_idx == last_used_idx) {
>>                 // do something better
>>                 mb();
>>                 if (time < SLOF_GetTimer()) {
>>                         printf("virtio_serial_putchar failed! \n");
>> -                       return 0;
>> +                       ret = 0;
>> +                       break;
>>                 }
>>         }
>>
>> -       return 1;
>> +       virtio_free_desc(vq, id, dev->features);
>> +
>> +       return ret;
>>  }
>>
>>  char virtio_serial_getchar(struct virtio_device *dev)
>> @@ -165,7 +169,7 @@ char virtio_serial_getchar(struct virtio_device *dev)
>>                 % vq_rx->size;
>>
>>         /* Copy data to destination buffer */
>> -       memcpy(buf, (void *)virtio_modern64_to_cpu(dev, vq_rx->desc[id - 1].addr), RX_ELEM_SIZE);
>> +       memcpy(buf, virtio_desc_addr(dev, RX_Q, id - 1), RX_ELEM_SIZE);
>>
>>         /* Move indices to next entries */
>>         last_rx_idx = last_rx_idx + 1;
>> diff --git a/lib/libvirtio/virtio.c b/lib/libvirtio/virtio.c
>> index fff0c7e54c19..d4db5f656888 100644
>> --- a/lib/libvirtio/virtio.c
>> +++ b/lib/libvirtio/virtio.c
>> @@ -273,6 +273,17 @@ void virtio_fill_desc(struct vqs *vq, int id, uint64_t features,
>>         next %= vq->size;
>>
>>         if (features & VIRTIO_F_VERSION_1) {
>> +               if (features & VIRTIO_F_IOMMU_PLATFORM) {
>> +                       void *gpa = (void *) addr;
>> +
>> +                       if (!vq->desc_gpas) {
>> +                               fprintf(stderr, "IOMMU setup has not been done!\n");
>> +                               return;
>> +                       }
>> +
>> +                       addr = SLOF_dma_map_in(gpa, len, 0);
>> +                       vq->desc_gpas[id] = gpa;
>> +               }
>>                 desc->addr = cpu_to_le64(addr);
>>                 desc->len = cpu_to_le32(len);
>>                 desc->flags = cpu_to_le16(flags);
>> @@ -285,6 +296,32 @@ void virtio_fill_desc(struct vqs *vq, int id, uint64_t features,
>>         }
>>  }
>>
>> +void virtio_free_desc(struct vqs *vq, int id, uint64_t features)
>> +{
>> +       struct vring_desc *desc;
>> +
>> +       id %= vq->size;
>> +       desc = &vq->desc[id];
>> +
>> +       if (features & VIRTIO_F_VERSION_1) {
>> +               if (features & VIRTIO_F_IOMMU_PLATFORM) {
>> +                       SLOF_dma_map_out(le64_to_cpu(desc->addr),
>> +                                        0, le32_to_cpu(desc->len));
>> +                       vq->desc_gpas[id] = NULL;
>> +               }
>> +       }
>> +}
> 
> On a 4.18 RHEL8 kernel I was unable to boot from virtio-blk, it complains the
> disk isn't bootable and the H_PUT_TCE traces show:
> 
>  qemu-system-ppc-19019 [045] .... 786844.067356: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x1000,0xe46b003,0xdc73350
>  qemu-system-ppc-19019 [045] .... 786844.067358: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
>  qemu-system-ppc-19019 [045] .... 786844.067361: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x2000,0xe46c003,0xdc73350
>  qemu-system-ppc-19019 [045] .... 786844.067362: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
>  qemu-system-ppc-19019 [041] .... 786844.070532: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x3000,0xe67b003,0xdc73490
>  qemu-system-ppc-19019 [041] .... 786844.070533: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
>  qemu-system-ppc-19019 [041] .... 786844.070817: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x4000,0xe469003,0xdc73490
>  qemu-system-ppc-19019 [041] .... 786844.070818: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
>  qemu-system-ppc-19019 [041] .... 786844.071102: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x5000,0xe67a003,0xdc73490
>  qemu-system-ppc-19019 [041] .... 786844.071103: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
>  qemu-system-ppc-19019 [041] .... 786844.071123: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
>  qemu-system-ppc-19019 [041] .... 786844.073532: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x3228,0x0,0xdc73488
>  qemu-system-ppc-19019 [041] .... 786844.073534: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_PARAMETER
> 
> If I disable in-kernel TCE acceleration, or use TCG, it seems to work
> fine. If I add the following hack it seems to fix things:
> 
>   @@ -305,7 +307,8 @@ void virtio_free_desc(struct vqs *vq, int id, uint64_t features)
>    
>           if (features & VIRTIO_F_VERSION_1) {
>                   if (features & VIRTIO_F_IOMMU_PLATFORM) {
>   -                       SLOF_dma_map_out(le64_to_cpu(desc->addr),
>   +                       fprintf(stderr, "mapping out desc->addr: %llx\n", le64_to_cpu(desc->addr));
>   +                       SLOF_dma_map_out(le64_to_cpu(desc->addr) & ~(0xfff),
>                                            0, le32_to_cpu(desc->len));
>                           vq->desc_gpas[id] = NULL;
>                   }
> 
> I guess the kernel is a bit stricter about unmapping unaligned TCE entries? If
> so, maybe we should align automatically in dma-map-out?


I thought I did that as well in 1/4, I'll double check. Hmmm.


> 
>> +
>> +void *virtio_desc_addr(struct virtio_device *vdev, int queue, int id)
>> +{
>> +       struct vqs *vq = &vdev->vq[queue];
>> +
>> +       if (vq->desc_gpas)
>> +               return vq->desc_gpas[id];
>> +
>> +       return (void *) virtio_modern64_to_cpu(vdev, vq->desc[id].addr);
>> +}
>> +
>>  /**
>>   * Reset virtio device
>>   */
> 
> <snip>
> 
>> diff --git a/board-qemu/slof/OF.fs b/board-qemu/slof/OF.fs
>> index a85f6c558e67..3e117ad03e09 100644
>> --- a/board-qemu/slof/OF.fs
>> +++ b/board-qemu/slof/OF.fs
>> @@ -143,6 +143,9 @@ check-for-nvramrc
>>
>>  8a0 cp
>>
>> +\ For DMA functions used by client/package instances.
>> +#include "dma-instance-function.fs"
>> +
>>  \ The client interface.
>>  #include "client.fs"
>>  \ ELF binary file format.
>> diff --git a/slof/fs/dma-instance-function.fs b/slof/fs/dma-instance-function.fs
>> new file mode 100644
>> index 000000000000..057181be0f44
>> --- /dev/null
>> +++ b/slof/fs/dma-instance-function.fs
>> @@ -0,0 +1,28 @@
>> +\ ****************************************************************************/
>> +\ * Copyright (c) 2011 IBM Corporation
> 
> I copy/pasted the license header and forgot to update, should probably change
> to 2019.
> 
> Also, if you think you need it:
> 
> Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>

Yes I do, thanks!




>> +\ * All rights reserved.
>> +\ * This program and the accompanying materials
>> +\ * are made available under the terms of the BSD License
>> +\ * which accompanies this distribution, and is available at
>> +\ * http://www.opensource.org/licenses/bsd-license.php
>> +\ *
>> +\ * Contributors:
>> +\ *     IBM Corporation - initial implementation
>> +\ ****************************************************************************/
>> +
>> +\ DMA memory allocation functions
>> +: dma-alloc ( size -- virt )
>> +   s" dma-alloc" $call-parent
>> +;
>> +
>> +: dma-free ( virt size -- )
>> +   s" dma-free" $call-parent
>> +;
>> +
>> +: dma-map-in ( virt size cacheable? -- devaddr )
>> +   s" dma-map-in" $call-parent
>> +;
>> +
>> +: dma-map-out ( virt devaddr size -- )
>> +   s" dma-map-out" $call-parent
>> +;
>> -- 
>> 2.17.1
>>
>> _______________________________________________
>> SLOF mailing list
>> SLOF@lists.ozlabs.org
>> https://lists.ozlabs.org/listinfo/slof
Alexey Kardashevskiy Nov. 22, 2019, 3:14 a.m. UTC | #4
On 22/11/2019 12:49, Alexey Kardashevskiy wrote:
> 
> 
> On 22/11/2019 11:09, Michael Roth wrote:
>> Quoting Alexey Kardashevskiy (2019-11-20 17:58:01)
>>> When QEMU is started with iommu_platform=on, the guest driver must accept
>>> it or the device will fail.
>>>
>>> This enables IOMMU support for virtio-net, -scsi, -block, -serial, -9p
>>> devices. -serial and -9p are only compile tested though.
>>>
>>> For virtio-net we map all RX buffers once and TX when xmit() is called
>>> and unmap older pages when we are about to reuse the VQ descriptor.
>>> As all other devices are synchronous, we unmap IOMMU pages right after
>>> completion of a transaction.
>>>
>>> This depends on QEMU's:
>>> https://patchwork.ozlabs.org/patch/1194067/
>>>
>>> Copied-bits-and-pieces-from: Michael Roth <mdroth@linux.vnet.ibm.com>
>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>> ---
>>> Changes:
>>> v2:
>>> * added Mike's fs/dma-instance-function.fs
>>> * total rework
>>> ---
>>>  board-qemu/slof/Makefile         |  1 +
>>>  lib/libvirtio/virtio.h           |  5 ++
>>>  lib/libvirtio/virtio-9p.c        |  4 ++
>>>  lib/libvirtio/virtio-blk.c       |  4 ++
>>>  lib/libvirtio/virtio-net.c       |  5 +-
>>>  lib/libvirtio/virtio-scsi.c      |  5 ++
>>>  lib/libvirtio/virtio-serial.c    | 12 +++--
>>>  lib/libvirtio/virtio.c           | 82 +++++++++++++++++++++++++++++++-
>>>  board-qemu/slof/OF.fs            |  3 ++
>>>  slof/fs/dma-instance-function.fs | 28 +++++++++++
>>>  10 files changed, 143 insertions(+), 6 deletions(-)
>>>  create mode 100644 slof/fs/dma-instance-function.fs
>>>
>>> diff --git a/board-qemu/slof/Makefile b/board-qemu/slof/Makefile
>>> index 2263e751bde9..d7ed2d7a6f18 100644
>>> --- a/board-qemu/slof/Makefile
>>> +++ b/board-qemu/slof/Makefile
>>> @@ -99,6 +99,7 @@ OF_FFS_FILES = \
>>>         $(SLOFCMNDIR)/fs/graphics.fs \
>>>         $(SLOFCMNDIR)/fs/generic-disk.fs \
>>>         $(SLOFCMNDIR)/fs/dma-function.fs \
>>> +       $(SLOFCMNDIR)/fs/dma-instance-function.fs \
>>>         $(SLOFCMNDIR)/fs/pci-device.fs \
>>>         $(SLOFCMNDIR)/fs/pci-bridge.fs \
>>>         $(SLOFCMNDIR)/fs/pci-properties.fs \
>>> diff --git a/lib/libvirtio/virtio.h b/lib/libvirtio/virtio.h
>>> index 4927a97f9f5f..250120b3112b 100644
>>> --- a/lib/libvirtio/virtio.h
>>> +++ b/lib/libvirtio/virtio.h
>>> @@ -29,6 +29,7 @@
>>>  #define VIRTIO_F_RING_INDIRECT_DESC    BIT(28)
>>>  #define VIRTIO_F_RING_EVENT_IDX                BIT(29)
>>>  #define VIRTIO_F_VERSION_1             BIT(32)
>>> +#define VIRTIO_F_IOMMU_PLATFORM         BIT(33)
>>>
>>>  #define VIRTIO_TIMEOUT                 5000 /* 5 sec timeout */
>>>
>>> @@ -84,6 +85,8 @@ struct vqs {
>>>         struct vring_desc *desc;
>>>         struct vring_avail *avail;
>>>         struct vring_used *used;
>>> +       void **desc_gpas; /* to get gpa from desc->addr (which is ioba) */
>>> +       uint64_t bus_desc;
>>>  };
>>>
>>>  struct virtio_device {
>>> @@ -109,6 +112,8 @@ extern struct vring_used *virtio_get_vring_used(struct virtio_device *dev, int q
>>>  extern void virtio_fill_desc(struct vqs *vq, int id, uint64_t features,
>>>                               uint64_t addr, uint32_t len,
>>>                               uint16_t flags, uint16_t next);
>>> +extern void virtio_free_desc(struct vqs *vq, int id, uint64_t features);
>>> +void *virtio_desc_addr(struct virtio_device *vdev, int queue, int id);
>>>  extern struct vqs *virtio_queue_init_vq(struct virtio_device *dev, unsigned int id);
>>>  extern void virtio_queue_term_vq(struct virtio_device *dev, struct vqs *vq, unsigned int id);
>>>
>>> diff --git a/lib/libvirtio/virtio-9p.c b/lib/libvirtio/virtio-9p.c
>>> index 426069fe9509..76078612b06e 100644
>>> --- a/lib/libvirtio/virtio-9p.c
>>> +++ b/lib/libvirtio/virtio-9p.c
>>> @@ -129,6 +129,10 @@ static int virtio_9p_transact(void *opaque, uint8_t *tx, int tx_size, uint8_t *r
>>>                 // do something better
>>>                 mb();
>>>         }
>>> +
>>> +       virtio_free_desc(vq, id, dev->features);
>>> +       virtio_free_desc(vq, id + 1, dev->features);
>>> +
>>>         if (i == 0) {
>>>                 return -1;
>>>         }
>>> diff --git a/lib/libvirtio/virtio-blk.c b/lib/libvirtio/virtio-blk.c
>>> index a0dadbb0d6a8..0363038e559d 100644
>>> --- a/lib/libvirtio/virtio-blk.c
>>> +++ b/lib/libvirtio/virtio-blk.c
>>> @@ -195,6 +195,10 @@ virtioblk_transfer(struct virtio_device *dev, char *buf, uint64_t blocknum,
>>>                         break;
>>>         }
>>>
>>> +       virtio_free_desc(vq, id, dev->features);
>>> +       virtio_free_desc(vq, id + 1, dev->features);
>>> +       virtio_free_desc(vq, id + 2, dev->features);
>>> +
>>>         if (status == 0)
>>>                 return cnt;
>>>
>>> diff --git a/lib/libvirtio/virtio-net.c b/lib/libvirtio/virtio-net.c
>>> index ae67883020ef..5a0d19088527 100644
>>> --- a/lib/libvirtio/virtio-net.c
>>> +++ b/lib/libvirtio/virtio-net.c
>>> @@ -255,6 +255,9 @@ static int virtionet_xmit(struct virtio_net *vnet, char *buf, int len)
>>>         idx = virtio_modern16_to_cpu(vdev, vq_tx->avail->idx);
>>>         id = (idx * 2) % vq_tx->size;
>>>
>>> +       virtio_free_desc(vq_tx, id, vdev->features);
>>> +       virtio_free_desc(vq_tx, id + 1, vdev->features);
>>> +
>>>         /* Set up virtqueue descriptor for header */
>>>         virtio_fill_desc(vq_tx, id, vdev->features, (uint64_t)nethdr,
>>>                          net_hdr_size, VRING_DESC_F_NEXT, id + 1);
>>> @@ -317,7 +320,7 @@ static int virtionet_receive(struct virtio_net *vnet, char *buf, int maxlen)
>>>  #endif
>>>
>>>         /* Copy data to destination buffer */
>>> -       memcpy(buf, (void *)virtio_modern64_to_cpu(vdev, vq_rx->desc[id].addr), len);
>>> +       memcpy(buf, virtio_desc_addr(vdev, VQ_RX, id), len);
>>>
>>>         /* Move indices to next entries */
>>>         last_rx_idx = last_rx_idx + 1;
>>> diff --git a/lib/libvirtio/virtio-scsi.c b/lib/libvirtio/virtio-scsi.c
>>> index 251661e8d466..44573d10e24c 100644
>>> --- a/lib/libvirtio/virtio-scsi.c
>>> +++ b/lib/libvirtio/virtio-scsi.c
>>> @@ -81,6 +81,11 @@ int virtioscsi_send(struct virtio_device *dev,
>>>                         break;
>>>         }
>>>
>>> +       virtio_free_desc(vq, id, dev->features);
>>> +       virtio_free_desc(vq, id + 1, dev->features);
>>> +       if (!(buf == NULL || buf_len == 0))
>>> +               virtio_free_desc(vq, id + 2, dev->features);
>>> +
>>>         return 0;
>>>  }
>>>
>>> diff --git a/lib/libvirtio/virtio-serial.c b/lib/libvirtio/virtio-serial.c
>>> index 78ab0e25d7db..66fbfe8b6cc6 100644
>>> --- a/lib/libvirtio/virtio-serial.c
>>> +++ b/lib/libvirtio/virtio-serial.c
>>> @@ -108,7 +108,7 @@ void virtio_serial_shutdown(struct virtio_device *dev)
>>>
>>>  int virtio_serial_putchar(struct virtio_device *dev, char c)
>>>  {
>>> -       int id;
>>> +       int id, ret;
>>>         uint32_t vq_size, time;
>>>         volatile uint16_t *current_used_idx;
>>>         uint16_t last_used_idx, avail_idx;
>>> @@ -135,17 +135,21 @@ int virtio_serial_putchar(struct virtio_device *dev, char c)
>>>         virtio_queue_notify(dev, TX_Q);
>>>
>>>         /* Wait for host to consume the descriptor */
>>> +       ret = 1;
>>>         time = SLOF_GetTimer() + VIRTIO_TIMEOUT;
>>>         while (*current_used_idx == last_used_idx) {
>>>                 // do something better
>>>                 mb();
>>>                 if (time < SLOF_GetTimer()) {
>>>                         printf("virtio_serial_putchar failed! \n");
>>> -                       return 0;
>>> +                       ret = 0;
>>> +                       break;
>>>                 }
>>>         }
>>>
>>> -       return 1;
>>> +       virtio_free_desc(vq, id, dev->features);
>>> +
>>> +       return ret;
>>>  }
>>>
>>>  char virtio_serial_getchar(struct virtio_device *dev)
>>> @@ -165,7 +169,7 @@ char virtio_serial_getchar(struct virtio_device *dev)
>>>                 % vq_rx->size;
>>>
>>>         /* Copy data to destination buffer */
>>> -       memcpy(buf, (void *)virtio_modern64_to_cpu(dev, vq_rx->desc[id - 1].addr), RX_ELEM_SIZE);
>>> +       memcpy(buf, virtio_desc_addr(dev, RX_Q, id - 1), RX_ELEM_SIZE);
>>>
>>>         /* Move indices to next entries */
>>>         last_rx_idx = last_rx_idx + 1;
>>> diff --git a/lib/libvirtio/virtio.c b/lib/libvirtio/virtio.c
>>> index fff0c7e54c19..d4db5f656888 100644
>>> --- a/lib/libvirtio/virtio.c
>>> +++ b/lib/libvirtio/virtio.c
>>> @@ -273,6 +273,17 @@ void virtio_fill_desc(struct vqs *vq, int id, uint64_t features,
>>>         next %= vq->size;
>>>
>>>         if (features & VIRTIO_F_VERSION_1) {
>>> +               if (features & VIRTIO_F_IOMMU_PLATFORM) {
>>> +                       void *gpa = (void *) addr;
>>> +
>>> +                       if (!vq->desc_gpas) {
>>> +                               fprintf(stderr, "IOMMU setup has not been done!\n");
>>> +                               return;
>>> +                       }
>>> +
>>> +                       addr = SLOF_dma_map_in(gpa, len, 0);
>>> +                       vq->desc_gpas[id] = gpa;
>>> +               }
>>>                 desc->addr = cpu_to_le64(addr);
>>>                 desc->len = cpu_to_le32(len);
>>>                 desc->flags = cpu_to_le16(flags);
>>> @@ -285,6 +296,32 @@ void virtio_fill_desc(struct vqs *vq, int id, uint64_t features,
>>>         }
>>>  }
>>>
>>> +void virtio_free_desc(struct vqs *vq, int id, uint64_t features)
>>> +{
>>> +       struct vring_desc *desc;
>>> +
>>> +       id %= vq->size;
>>> +       desc = &vq->desc[id];
>>> +
>>> +       if (features & VIRTIO_F_VERSION_1) {
>>> +               if (features & VIRTIO_F_IOMMU_PLATFORM) {
>>> +                       SLOF_dma_map_out(le64_to_cpu(desc->addr),
>>> +                                        0, le32_to_cpu(desc->len));
>>> +                       vq->desc_gpas[id] = NULL;
>>> +               }
>>> +       }
>>> +}
>>
>> On a 4.18 RHEL8 kernel I was unable to boot from virtio-blk, it complains the
>> disk isn't bootable and the H_PUT_TCE traces show:
>>
>>  qemu-system-ppc-19019 [045] .... 786844.067356: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x1000,0xe46b003,0xdc73350
>>  qemu-system-ppc-19019 [045] .... 786844.067358: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
>>  qemu-system-ppc-19019 [045] .... 786844.067361: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x2000,0xe46c003,0xdc73350
>>  qemu-system-ppc-19019 [045] .... 786844.067362: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
>>  qemu-system-ppc-19019 [041] .... 786844.070532: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x3000,0xe67b003,0xdc73490
>>  qemu-system-ppc-19019 [041] .... 786844.070533: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
>>  qemu-system-ppc-19019 [041] .... 786844.070817: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x4000,0xe469003,0xdc73490
>>  qemu-system-ppc-19019 [041] .... 786844.070818: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
>>  qemu-system-ppc-19019 [041] .... 786844.071102: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x5000,0xe67a003,0xdc73490
>>  qemu-system-ppc-19019 [041] .... 786844.071103: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
>>  qemu-system-ppc-19019 [041] .... 786844.071123: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
>>  qemu-system-ppc-19019 [041] .... 786844.073532: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x3228,0x0,0xdc73488
>>  qemu-system-ppc-19019 [041] .... 786844.073534: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_PARAMETER
>>
>> If I disable in-kernel TCE acceleration, or use TCG, it seems to work
>> fine. If I add the following hack it seems to fix things:
>>
>>   @@ -305,7 +307,8 @@ void virtio_free_desc(struct vqs *vq, int id, uint64_t features)
>>    
>>           if (features & VIRTIO_F_VERSION_1) {
>>                   if (features & VIRTIO_F_IOMMU_PLATFORM) {
>>   -                       SLOF_dma_map_out(le64_to_cpu(desc->addr),
>>   +                       fprintf(stderr, "mapping out desc->addr: %llx\n", le64_to_cpu(desc->addr));
>>   +                       SLOF_dma_map_out(le64_to_cpu(desc->addr) & ~(0xfff),
>>                                            0, le32_to_cpu(desc->len));
>>                           vq->desc_gpas[id] = NULL;
>>                   }
>>
>> I guess the kernel is a bit stricter about unmapping unaligned TCE entries? If
>> so, maybe we should align automatically in dma-map-out?
> 
> 
> I thought I did that as well in 1/4, I'll double check. Hmmm.


v2 of these patches did not align (or, trunc, really) in dma-map-out but
v3 do, it is "swap dma-trunc swap" in dma-map-out of 1/4. SO I am really
confused now.

It is true that checking differs between qemu's and kvm's h_put_tce
handlers, I added this (see below) to my qemu and it still works, can
you please try in your tree and capture the spapr_iommu_pci_put traces?
Thanks,


diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
index 3d3bcc86496a..feae27bf11fa 100644
--- a/hw/ppc/spapr_iommu.c
+++ b/hw/ppc/spapr_iommu.c
@@ -564,6 +564,9 @@ static target_ulong h_put_tce(PowerPCCPU *cpu,
SpaprMachineState *spapr,
     if (tcet) {
         hwaddr page_mask = IOMMU_PAGE_MASK(tcet->page_shift);

+        if (ioba & ~page_mask)
+            abort();
+
David Gibson Nov. 22, 2019, 3:35 a.m. UTC | #5
On Fri, Nov 22, 2019 at 02:14:10PM +1100, Alexey Kardashevskiy wrote:
> 
> 
> On 22/11/2019 12:49, Alexey Kardashevskiy wrote:
> > 
> > 
> > On 22/11/2019 11:09, Michael Roth wrote:
> >> Quoting Alexey Kardashevskiy (2019-11-20 17:58:01)
> >>> When QEMU is started with iommu_platform=on, the guest driver must accept
> >>> it or the device will fail.
> >>>
> >>> This enables IOMMU support for virtio-net, -scsi, -block, -serial, -9p
> >>> devices. -serial and -9p are only compile tested though.
> >>>
> >>> For virtio-net we map all RX buffers once and TX when xmit() is called
> >>> and unmap older pages when we are about to reuse the VQ descriptor.
> >>> As all other devices are synchronous, we unmap IOMMU pages right after
> >>> completion of a transaction.
> >>>
> >>> This depends on QEMU's:
> >>> https://patchwork.ozlabs.org/patch/1194067/
> >>>
> >>> Copied-bits-and-pieces-from: Michael Roth <mdroth@linux.vnet.ibm.com>
> >>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>> ---
> >>> Changes:
> >>> v2:
> >>> * added Mike's fs/dma-instance-function.fs
> >>> * total rework
> >>> ---
> >>>  board-qemu/slof/Makefile         |  1 +
> >>>  lib/libvirtio/virtio.h           |  5 ++
> >>>  lib/libvirtio/virtio-9p.c        |  4 ++
> >>>  lib/libvirtio/virtio-blk.c       |  4 ++
> >>>  lib/libvirtio/virtio-net.c       |  5 +-
> >>>  lib/libvirtio/virtio-scsi.c      |  5 ++
> >>>  lib/libvirtio/virtio-serial.c    | 12 +++--
> >>>  lib/libvirtio/virtio.c           | 82 +++++++++++++++++++++++++++++++-
> >>>  board-qemu/slof/OF.fs            |  3 ++
> >>>  slof/fs/dma-instance-function.fs | 28 +++++++++++
> >>>  10 files changed, 143 insertions(+), 6 deletions(-)
> >>>  create mode 100644 slof/fs/dma-instance-function.fs
> >>>
> >>> diff --git a/board-qemu/slof/Makefile b/board-qemu/slof/Makefile
> >>> index 2263e751bde9..d7ed2d7a6f18 100644
> >>> --- a/board-qemu/slof/Makefile
> >>> +++ b/board-qemu/slof/Makefile
> >>> @@ -99,6 +99,7 @@ OF_FFS_FILES = \
> >>>         $(SLOFCMNDIR)/fs/graphics.fs \
> >>>         $(SLOFCMNDIR)/fs/generic-disk.fs \
> >>>         $(SLOFCMNDIR)/fs/dma-function.fs \
> >>> +       $(SLOFCMNDIR)/fs/dma-instance-function.fs \
> >>>         $(SLOFCMNDIR)/fs/pci-device.fs \
> >>>         $(SLOFCMNDIR)/fs/pci-bridge.fs \
> >>>         $(SLOFCMNDIR)/fs/pci-properties.fs \
> >>> diff --git a/lib/libvirtio/virtio.h b/lib/libvirtio/virtio.h
> >>> index 4927a97f9f5f..250120b3112b 100644
> >>> --- a/lib/libvirtio/virtio.h
> >>> +++ b/lib/libvirtio/virtio.h
> >>> @@ -29,6 +29,7 @@
> >>>  #define VIRTIO_F_RING_INDIRECT_DESC    BIT(28)
> >>>  #define VIRTIO_F_RING_EVENT_IDX                BIT(29)
> >>>  #define VIRTIO_F_VERSION_1             BIT(32)
> >>> +#define VIRTIO_F_IOMMU_PLATFORM         BIT(33)
> >>>
> >>>  #define VIRTIO_TIMEOUT                 5000 /* 5 sec timeout */
> >>>
> >>> @@ -84,6 +85,8 @@ struct vqs {
> >>>         struct vring_desc *desc;
> >>>         struct vring_avail *avail;
> >>>         struct vring_used *used;
> >>> +       void **desc_gpas; /* to get gpa from desc->addr (which is ioba) */
> >>> +       uint64_t bus_desc;
> >>>  };
> >>>
> >>>  struct virtio_device {
> >>> @@ -109,6 +112,8 @@ extern struct vring_used *virtio_get_vring_used(struct virtio_device *dev, int q
> >>>  extern void virtio_fill_desc(struct vqs *vq, int id, uint64_t features,
> >>>                               uint64_t addr, uint32_t len,
> >>>                               uint16_t flags, uint16_t next);
> >>> +extern void virtio_free_desc(struct vqs *vq, int id, uint64_t features);
> >>> +void *virtio_desc_addr(struct virtio_device *vdev, int queue, int id);
> >>>  extern struct vqs *virtio_queue_init_vq(struct virtio_device *dev, unsigned int id);
> >>>  extern void virtio_queue_term_vq(struct virtio_device *dev, struct vqs *vq, unsigned int id);
> >>>
> >>> diff --git a/lib/libvirtio/virtio-9p.c b/lib/libvirtio/virtio-9p.c
> >>> index 426069fe9509..76078612b06e 100644
> >>> --- a/lib/libvirtio/virtio-9p.c
> >>> +++ b/lib/libvirtio/virtio-9p.c
> >>> @@ -129,6 +129,10 @@ static int virtio_9p_transact(void *opaque, uint8_t *tx, int tx_size, uint8_t *r
> >>>                 // do something better
> >>>                 mb();
> >>>         }
> >>> +
> >>> +       virtio_free_desc(vq, id, dev->features);
> >>> +       virtio_free_desc(vq, id + 1, dev->features);
> >>> +
> >>>         if (i == 0) {
> >>>                 return -1;
> >>>         }
> >>> diff --git a/lib/libvirtio/virtio-blk.c b/lib/libvirtio/virtio-blk.c
> >>> index a0dadbb0d6a8..0363038e559d 100644
> >>> --- a/lib/libvirtio/virtio-blk.c
> >>> +++ b/lib/libvirtio/virtio-blk.c
> >>> @@ -195,6 +195,10 @@ virtioblk_transfer(struct virtio_device *dev, char *buf, uint64_t blocknum,
> >>>                         break;
> >>>         }
> >>>
> >>> +       virtio_free_desc(vq, id, dev->features);
> >>> +       virtio_free_desc(vq, id + 1, dev->features);
> >>> +       virtio_free_desc(vq, id + 2, dev->features);
> >>> +
> >>>         if (status == 0)
> >>>                 return cnt;
> >>>
> >>> diff --git a/lib/libvirtio/virtio-net.c b/lib/libvirtio/virtio-net.c
> >>> index ae67883020ef..5a0d19088527 100644
> >>> --- a/lib/libvirtio/virtio-net.c
> >>> +++ b/lib/libvirtio/virtio-net.c
> >>> @@ -255,6 +255,9 @@ static int virtionet_xmit(struct virtio_net *vnet, char *buf, int len)
> >>>         idx = virtio_modern16_to_cpu(vdev, vq_tx->avail->idx);
> >>>         id = (idx * 2) % vq_tx->size;
> >>>
> >>> +       virtio_free_desc(vq_tx, id, vdev->features);
> >>> +       virtio_free_desc(vq_tx, id + 1, vdev->features);
> >>> +
> >>>         /* Set up virtqueue descriptor for header */
> >>>         virtio_fill_desc(vq_tx, id, vdev->features, (uint64_t)nethdr,
> >>>                          net_hdr_size, VRING_DESC_F_NEXT, id + 1);
> >>> @@ -317,7 +320,7 @@ static int virtionet_receive(struct virtio_net *vnet, char *buf, int maxlen)
> >>>  #endif
> >>>
> >>>         /* Copy data to destination buffer */
> >>> -       memcpy(buf, (void *)virtio_modern64_to_cpu(vdev, vq_rx->desc[id].addr), len);
> >>> +       memcpy(buf, virtio_desc_addr(vdev, VQ_RX, id), len);
> >>>
> >>>         /* Move indices to next entries */
> >>>         last_rx_idx = last_rx_idx + 1;
> >>> diff --git a/lib/libvirtio/virtio-scsi.c b/lib/libvirtio/virtio-scsi.c
> >>> index 251661e8d466..44573d10e24c 100644
> >>> --- a/lib/libvirtio/virtio-scsi.c
> >>> +++ b/lib/libvirtio/virtio-scsi.c
> >>> @@ -81,6 +81,11 @@ int virtioscsi_send(struct virtio_device *dev,
> >>>                         break;
> >>>         }
> >>>
> >>> +       virtio_free_desc(vq, id, dev->features);
> >>> +       virtio_free_desc(vq, id + 1, dev->features);
> >>> +       if (!(buf == NULL || buf_len == 0))
> >>> +               virtio_free_desc(vq, id + 2, dev->features);
> >>> +
> >>>         return 0;
> >>>  }
> >>>
> >>> diff --git a/lib/libvirtio/virtio-serial.c b/lib/libvirtio/virtio-serial.c
> >>> index 78ab0e25d7db..66fbfe8b6cc6 100644
> >>> --- a/lib/libvirtio/virtio-serial.c
> >>> +++ b/lib/libvirtio/virtio-serial.c
> >>> @@ -108,7 +108,7 @@ void virtio_serial_shutdown(struct virtio_device *dev)
> >>>
> >>>  int virtio_serial_putchar(struct virtio_device *dev, char c)
> >>>  {
> >>> -       int id;
> >>> +       int id, ret;
> >>>         uint32_t vq_size, time;
> >>>         volatile uint16_t *current_used_idx;
> >>>         uint16_t last_used_idx, avail_idx;
> >>> @@ -135,17 +135,21 @@ int virtio_serial_putchar(struct virtio_device *dev, char c)
> >>>         virtio_queue_notify(dev, TX_Q);
> >>>
> >>>         /* Wait for host to consume the descriptor */
> >>> +       ret = 1;
> >>>         time = SLOF_GetTimer() + VIRTIO_TIMEOUT;
> >>>         while (*current_used_idx == last_used_idx) {
> >>>                 // do something better
> >>>                 mb();
> >>>                 if (time < SLOF_GetTimer()) {
> >>>                         printf("virtio_serial_putchar failed! \n");
> >>> -                       return 0;
> >>> +                       ret = 0;
> >>> +                       break;
> >>>                 }
> >>>         }
> >>>
> >>> -       return 1;
> >>> +       virtio_free_desc(vq, id, dev->features);
> >>> +
> >>> +       return ret;
> >>>  }
> >>>
> >>>  char virtio_serial_getchar(struct virtio_device *dev)
> >>> @@ -165,7 +169,7 @@ char virtio_serial_getchar(struct virtio_device *dev)
> >>>                 % vq_rx->size;
> >>>
> >>>         /* Copy data to destination buffer */
> >>> -       memcpy(buf, (void *)virtio_modern64_to_cpu(dev, vq_rx->desc[id - 1].addr), RX_ELEM_SIZE);
> >>> +       memcpy(buf, virtio_desc_addr(dev, RX_Q, id - 1), RX_ELEM_SIZE);
> >>>
> >>>         /* Move indices to next entries */
> >>>         last_rx_idx = last_rx_idx + 1;
> >>> diff --git a/lib/libvirtio/virtio.c b/lib/libvirtio/virtio.c
> >>> index fff0c7e54c19..d4db5f656888 100644
> >>> --- a/lib/libvirtio/virtio.c
> >>> +++ b/lib/libvirtio/virtio.c
> >>> @@ -273,6 +273,17 @@ void virtio_fill_desc(struct vqs *vq, int id, uint64_t features,
> >>>         next %= vq->size;
> >>>
> >>>         if (features & VIRTIO_F_VERSION_1) {
> >>> +               if (features & VIRTIO_F_IOMMU_PLATFORM) {
> >>> +                       void *gpa = (void *) addr;
> >>> +
> >>> +                       if (!vq->desc_gpas) {
> >>> +                               fprintf(stderr, "IOMMU setup has not been done!\n");
> >>> +                               return;
> >>> +                       }
> >>> +
> >>> +                       addr = SLOF_dma_map_in(gpa, len, 0);
> >>> +                       vq->desc_gpas[id] = gpa;
> >>> +               }
> >>>                 desc->addr = cpu_to_le64(addr);
> >>>                 desc->len = cpu_to_le32(len);
> >>>                 desc->flags = cpu_to_le16(flags);
> >>> @@ -285,6 +296,32 @@ void virtio_fill_desc(struct vqs *vq, int id, uint64_t features,
> >>>         }
> >>>  }
> >>>
> >>> +void virtio_free_desc(struct vqs *vq, int id, uint64_t features)
> >>> +{
> >>> +       struct vring_desc *desc;
> >>> +
> >>> +       id %= vq->size;
> >>> +       desc = &vq->desc[id];
> >>> +
> >>> +       if (features & VIRTIO_F_VERSION_1) {
> >>> +               if (features & VIRTIO_F_IOMMU_PLATFORM) {
> >>> +                       SLOF_dma_map_out(le64_to_cpu(desc->addr),
> >>> +                                        0, le32_to_cpu(desc->len));
> >>> +                       vq->desc_gpas[id] = NULL;
> >>> +               }
> >>> +       }
> >>> +}
> >>
> >> On a 4.18 RHEL8 kernel I was unable to boot from virtio-blk, it complains the
> >> disk isn't bootable and the H_PUT_TCE traces show:
> >>
> >>  qemu-system-ppc-19019 [045] .... 786844.067356: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x1000,0xe46b003,0xdc73350
> >>  qemu-system-ppc-19019 [045] .... 786844.067358: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
> >>  qemu-system-ppc-19019 [045] .... 786844.067361: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x2000,0xe46c003,0xdc73350
> >>  qemu-system-ppc-19019 [045] .... 786844.067362: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
> >>  qemu-system-ppc-19019 [041] .... 786844.070532: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x3000,0xe67b003,0xdc73490
> >>  qemu-system-ppc-19019 [041] .... 786844.070533: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
> >>  qemu-system-ppc-19019 [041] .... 786844.070817: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x4000,0xe469003,0xdc73490
> >>  qemu-system-ppc-19019 [041] .... 786844.070818: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
> >>  qemu-system-ppc-19019 [041] .... 786844.071102: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x5000,0xe67a003,0xdc73490
> >>  qemu-system-ppc-19019 [041] .... 786844.071103: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
> >>  qemu-system-ppc-19019 [041] .... 786844.071123: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
> >>  qemu-system-ppc-19019 [041] .... 786844.073532: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x3228,0x0,0xdc73488

Note here .......................................................................................................^^^^^^

> >>  qemu-system-ppc-19019 [041] .... 786844.073534: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_PARAMETER
> >>
> >> If I disable in-kernel TCE acceleration, or use TCG, it seems to work
> >> fine. If I add the following hack it seems to fix things:
> >>
> >>   @@ -305,7 +307,8 @@ void virtio_free_desc(struct vqs *vq, int id, uint64_t features)
> >>    
> >>           if (features & VIRTIO_F_VERSION_1) {
> >>                   if (features & VIRTIO_F_IOMMU_PLATFORM) {
> >>   -                       SLOF_dma_map_out(le64_to_cpu(desc->addr),
> >>   +                       fprintf(stderr, "mapping out desc->addr: %llx\n", le64_to_cpu(desc->addr));
> >>   +                       SLOF_dma_map_out(le64_to_cpu(desc->addr) & ~(0xfff),
> >>                                            0, le32_to_cpu(desc->len));
> >>                           vq->desc_gpas[id] = NULL;
> >>                   }
> >>
> >> I guess the kernel is a bit stricter about unmapping unaligned TCE entries? If
> >> so, maybe we should align automatically in dma-map-out?
> > 
> > 
> > I thought I did that as well in 1/4, I'll double check. Hmmm.
> 
> 
> v2 of these patches did not align (or, trunc, really) in dma-map-out but
> v3 do, it is "swap dma-trunc swap" in dma-map-out of 1/4. SO I am really
> confused now.

Huh, so am I.  Because somehow or other we are seeing a call to
H_PUT_TCE with an unaligned IOBA in Mike's trace above.

> 
> It is true that checking differs between qemu's and kvm's h_put_tce
> handlers, I added this (see below) to my qemu and it still works, can
> you please try in your tree and capture the spapr_iommu_pci_put traces?
> Thanks,
> 
> 
> diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
> index 3d3bcc86496a..feae27bf11fa 100644
> --- a/hw/ppc/spapr_iommu.c
> +++ b/hw/ppc/spapr_iommu.c
> @@ -564,6 +564,9 @@ static target_ulong h_put_tce(PowerPCCPU *cpu,
> SpaprMachineState *spapr,
>      if (tcet) {
>          hwaddr page_mask = IOMMU_PAGE_MASK(tcet->page_shift);
> 
> +        if (ioba & ~page_mask)
> +            abort();
> +
> 
> 
> 
>
Michael Roth Nov. 22, 2019, 2:42 p.m. UTC | #6
Quoting Alexey Kardashevskiy (2019-11-21 21:14:10)
> 
> 
> On 22/11/2019 12:49, Alexey Kardashevskiy wrote:
> > 
> > 
> > On 22/11/2019 11:09, Michael Roth wrote:
> >> Quoting Alexey Kardashevskiy (2019-11-20 17:58:01)
> >>> When QEMU is started with iommu_platform=on, the guest driver must accept
> >>> it or the device will fail.
> >>>
> >>> This enables IOMMU support for virtio-net, -scsi, -block, -serial, -9p
> >>> devices. -serial and -9p are only compile tested though.
> >>>
> >>> For virtio-net we map all RX buffers once and TX when xmit() is called
> >>> and unmap older pages when we are about to reuse the VQ descriptor.
> >>> As all other devices are synchronous, we unmap IOMMU pages right after
> >>> completion of a transaction.
> >>>
> >>> This depends on QEMU's:
> >>> https://patchwork.ozlabs.org/patch/1194067/
> >>>
> >>> Copied-bits-and-pieces-from: Michael Roth <mdroth@linux.vnet.ibm.com>
> >>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>> ---
> >>> Changes:
> >>> v2:
> >>> * added Mike's fs/dma-instance-function.fs
> >>> * total rework
> >>> ---
> >>>  board-qemu/slof/Makefile         |  1 +
> >>>  lib/libvirtio/virtio.h           |  5 ++
> >>>  lib/libvirtio/virtio-9p.c        |  4 ++
> >>>  lib/libvirtio/virtio-blk.c       |  4 ++
> >>>  lib/libvirtio/virtio-net.c       |  5 +-
> >>>  lib/libvirtio/virtio-scsi.c      |  5 ++
> >>>  lib/libvirtio/virtio-serial.c    | 12 +++--
> >>>  lib/libvirtio/virtio.c           | 82 +++++++++++++++++++++++++++++++-
> >>>  board-qemu/slof/OF.fs            |  3 ++
> >>>  slof/fs/dma-instance-function.fs | 28 +++++++++++
> >>>  10 files changed, 143 insertions(+), 6 deletions(-)
> >>>  create mode 100644 slof/fs/dma-instance-function.fs
> >>>
> >>> diff --git a/board-qemu/slof/Makefile b/board-qemu/slof/Makefile
> >>> index 2263e751bde9..d7ed2d7a6f18 100644
> >>> --- a/board-qemu/slof/Makefile
> >>> +++ b/board-qemu/slof/Makefile
> >>> @@ -99,6 +99,7 @@ OF_FFS_FILES = \
> >>>         $(SLOFCMNDIR)/fs/graphics.fs \
> >>>         $(SLOFCMNDIR)/fs/generic-disk.fs \
> >>>         $(SLOFCMNDIR)/fs/dma-function.fs \
> >>> +       $(SLOFCMNDIR)/fs/dma-instance-function.fs \
> >>>         $(SLOFCMNDIR)/fs/pci-device.fs \
> >>>         $(SLOFCMNDIR)/fs/pci-bridge.fs \
> >>>         $(SLOFCMNDIR)/fs/pci-properties.fs \
> >>> diff --git a/lib/libvirtio/virtio.h b/lib/libvirtio/virtio.h
> >>> index 4927a97f9f5f..250120b3112b 100644
> >>> --- a/lib/libvirtio/virtio.h
> >>> +++ b/lib/libvirtio/virtio.h
> >>> @@ -29,6 +29,7 @@
> >>>  #define VIRTIO_F_RING_INDIRECT_DESC    BIT(28)
> >>>  #define VIRTIO_F_RING_EVENT_IDX                BIT(29)
> >>>  #define VIRTIO_F_VERSION_1             BIT(32)
> >>> +#define VIRTIO_F_IOMMU_PLATFORM         BIT(33)
> >>>
> >>>  #define VIRTIO_TIMEOUT                 5000 /* 5 sec timeout */
> >>>
> >>> @@ -84,6 +85,8 @@ struct vqs {
> >>>         struct vring_desc *desc;
> >>>         struct vring_avail *avail;
> >>>         struct vring_used *used;
> >>> +       void **desc_gpas; /* to get gpa from desc->addr (which is ioba) */
> >>> +       uint64_t bus_desc;
> >>>  };
> >>>
> >>>  struct virtio_device {
> >>> @@ -109,6 +112,8 @@ extern struct vring_used *virtio_get_vring_used(struct virtio_device *dev, int q
> >>>  extern void virtio_fill_desc(struct vqs *vq, int id, uint64_t features,
> >>>                               uint64_t addr, uint32_t len,
> >>>                               uint16_t flags, uint16_t next);
> >>> +extern void virtio_free_desc(struct vqs *vq, int id, uint64_t features);
> >>> +void *virtio_desc_addr(struct virtio_device *vdev, int queue, int id);
> >>>  extern struct vqs *virtio_queue_init_vq(struct virtio_device *dev, unsigned int id);
> >>>  extern void virtio_queue_term_vq(struct virtio_device *dev, struct vqs *vq, unsigned int id);
> >>>
> >>> diff --git a/lib/libvirtio/virtio-9p.c b/lib/libvirtio/virtio-9p.c
> >>> index 426069fe9509..76078612b06e 100644
> >>> --- a/lib/libvirtio/virtio-9p.c
> >>> +++ b/lib/libvirtio/virtio-9p.c
> >>> @@ -129,6 +129,10 @@ static int virtio_9p_transact(void *opaque, uint8_t *tx, int tx_size, uint8_t *r
> >>>                 // do something better
> >>>                 mb();
> >>>         }
> >>> +
> >>> +       virtio_free_desc(vq, id, dev->features);
> >>> +       virtio_free_desc(vq, id + 1, dev->features);
> >>> +
> >>>         if (i == 0) {
> >>>                 return -1;
> >>>         }
> >>> diff --git a/lib/libvirtio/virtio-blk.c b/lib/libvirtio/virtio-blk.c
> >>> index a0dadbb0d6a8..0363038e559d 100644
> >>> --- a/lib/libvirtio/virtio-blk.c
> >>> +++ b/lib/libvirtio/virtio-blk.c
> >>> @@ -195,6 +195,10 @@ virtioblk_transfer(struct virtio_device *dev, char *buf, uint64_t blocknum,
> >>>                         break;
> >>>         }
> >>>
> >>> +       virtio_free_desc(vq, id, dev->features);
> >>> +       virtio_free_desc(vq, id + 1, dev->features);
> >>> +       virtio_free_desc(vq, id + 2, dev->features);
> >>> +
> >>>         if (status == 0)
> >>>                 return cnt;
> >>>
> >>> diff --git a/lib/libvirtio/virtio-net.c b/lib/libvirtio/virtio-net.c
> >>> index ae67883020ef..5a0d19088527 100644
> >>> --- a/lib/libvirtio/virtio-net.c
> >>> +++ b/lib/libvirtio/virtio-net.c
> >>> @@ -255,6 +255,9 @@ static int virtionet_xmit(struct virtio_net *vnet, char *buf, int len)
> >>>         idx = virtio_modern16_to_cpu(vdev, vq_tx->avail->idx);
> >>>         id = (idx * 2) % vq_tx->size;
> >>>
> >>> +       virtio_free_desc(vq_tx, id, vdev->features);
> >>> +       virtio_free_desc(vq_tx, id + 1, vdev->features);
> >>> +
> >>>         /* Set up virtqueue descriptor for header */
> >>>         virtio_fill_desc(vq_tx, id, vdev->features, (uint64_t)nethdr,
> >>>                          net_hdr_size, VRING_DESC_F_NEXT, id + 1);
> >>> @@ -317,7 +320,7 @@ static int virtionet_receive(struct virtio_net *vnet, char *buf, int maxlen)
> >>>  #endif
> >>>
> >>>         /* Copy data to destination buffer */
> >>> -       memcpy(buf, (void *)virtio_modern64_to_cpu(vdev, vq_rx->desc[id].addr), len);
> >>> +       memcpy(buf, virtio_desc_addr(vdev, VQ_RX, id), len);
> >>>
> >>>         /* Move indices to next entries */
> >>>         last_rx_idx = last_rx_idx + 1;
> >>> diff --git a/lib/libvirtio/virtio-scsi.c b/lib/libvirtio/virtio-scsi.c
> >>> index 251661e8d466..44573d10e24c 100644
> >>> --- a/lib/libvirtio/virtio-scsi.c
> >>> +++ b/lib/libvirtio/virtio-scsi.c
> >>> @@ -81,6 +81,11 @@ int virtioscsi_send(struct virtio_device *dev,
> >>>                         break;
> >>>         }
> >>>
> >>> +       virtio_free_desc(vq, id, dev->features);
> >>> +       virtio_free_desc(vq, id + 1, dev->features);
> >>> +       if (!(buf == NULL || buf_len == 0))
> >>> +               virtio_free_desc(vq, id + 2, dev->features);
> >>> +
> >>>         return 0;
> >>>  }
> >>>
> >>> diff --git a/lib/libvirtio/virtio-serial.c b/lib/libvirtio/virtio-serial.c
> >>> index 78ab0e25d7db..66fbfe8b6cc6 100644
> >>> --- a/lib/libvirtio/virtio-serial.c
> >>> +++ b/lib/libvirtio/virtio-serial.c
> >>> @@ -108,7 +108,7 @@ void virtio_serial_shutdown(struct virtio_device *dev)
> >>>
> >>>  int virtio_serial_putchar(struct virtio_device *dev, char c)
> >>>  {
> >>> -       int id;
> >>> +       int id, ret;
> >>>         uint32_t vq_size, time;
> >>>         volatile uint16_t *current_used_idx;
> >>>         uint16_t last_used_idx, avail_idx;
> >>> @@ -135,17 +135,21 @@ int virtio_serial_putchar(struct virtio_device *dev, char c)
> >>>         virtio_queue_notify(dev, TX_Q);
> >>>
> >>>         /* Wait for host to consume the descriptor */
> >>> +       ret = 1;
> >>>         time = SLOF_GetTimer() + VIRTIO_TIMEOUT;
> >>>         while (*current_used_idx == last_used_idx) {
> >>>                 // do something better
> >>>                 mb();
> >>>                 if (time < SLOF_GetTimer()) {
> >>>                         printf("virtio_serial_putchar failed! \n");
> >>> -                       return 0;
> >>> +                       ret = 0;
> >>> +                       break;
> >>>                 }
> >>>         }
> >>>
> >>> -       return 1;
> >>> +       virtio_free_desc(vq, id, dev->features);
> >>> +
> >>> +       return ret;
> >>>  }
> >>>
> >>>  char virtio_serial_getchar(struct virtio_device *dev)
> >>> @@ -165,7 +169,7 @@ char virtio_serial_getchar(struct virtio_device *dev)
> >>>                 % vq_rx->size;
> >>>
> >>>         /* Copy data to destination buffer */
> >>> -       memcpy(buf, (void *)virtio_modern64_to_cpu(dev, vq_rx->desc[id - 1].addr), RX_ELEM_SIZE);
> >>> +       memcpy(buf, virtio_desc_addr(dev, RX_Q, id - 1), RX_ELEM_SIZE);
> >>>
> >>>         /* Move indices to next entries */
> >>>         last_rx_idx = last_rx_idx + 1;
> >>> diff --git a/lib/libvirtio/virtio.c b/lib/libvirtio/virtio.c
> >>> index fff0c7e54c19..d4db5f656888 100644
> >>> --- a/lib/libvirtio/virtio.c
> >>> +++ b/lib/libvirtio/virtio.c
> >>> @@ -273,6 +273,17 @@ void virtio_fill_desc(struct vqs *vq, int id, uint64_t features,
> >>>         next %= vq->size;
> >>>
> >>>         if (features & VIRTIO_F_VERSION_1) {
> >>> +               if (features & VIRTIO_F_IOMMU_PLATFORM) {
> >>> +                       void *gpa = (void *) addr;
> >>> +
> >>> +                       if (!vq->desc_gpas) {
> >>> +                               fprintf(stderr, "IOMMU setup has not been done!\n");
> >>> +                               return;
> >>> +                       }
> >>> +
> >>> +                       addr = SLOF_dma_map_in(gpa, len, 0);
> >>> +                       vq->desc_gpas[id] = gpa;
> >>> +               }
> >>>                 desc->addr = cpu_to_le64(addr);
> >>>                 desc->len = cpu_to_le32(len);
> >>>                 desc->flags = cpu_to_le16(flags);
> >>> @@ -285,6 +296,32 @@ void virtio_fill_desc(struct vqs *vq, int id, uint64_t features,
> >>>         }
> >>>  }
> >>>
> >>> +void virtio_free_desc(struct vqs *vq, int id, uint64_t features)
> >>> +{
> >>> +       struct vring_desc *desc;
> >>> +
> >>> +       id %= vq->size;
> >>> +       desc = &vq->desc[id];
> >>> +
> >>> +       if (features & VIRTIO_F_VERSION_1) {
> >>> +               if (features & VIRTIO_F_IOMMU_PLATFORM) {
> >>> +                       SLOF_dma_map_out(le64_to_cpu(desc->addr),
> >>> +                                        0, le32_to_cpu(desc->len));
> >>> +                       vq->desc_gpas[id] = NULL;
> >>> +               }
> >>> +       }
> >>> +}
> >>
> >> On a 4.18 RHEL8 kernel I was unable to boot from virtio-blk, it complains the
> >> disk isn't bootable and the H_PUT_TCE traces show:
> >>
> >>  qemu-system-ppc-19019 [045] .... 786844.067356: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x1000,0xe46b003,0xdc73350
> >>  qemu-system-ppc-19019 [045] .... 786844.067358: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
> >>  qemu-system-ppc-19019 [045] .... 786844.067361: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x2000,0xe46c003,0xdc73350
> >>  qemu-system-ppc-19019 [045] .... 786844.067362: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
> >>  qemu-system-ppc-19019 [041] .... 786844.070532: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x3000,0xe67b003,0xdc73490
> >>  qemu-system-ppc-19019 [041] .... 786844.070533: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
> >>  qemu-system-ppc-19019 [041] .... 786844.070817: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x4000,0xe469003,0xdc73490
> >>  qemu-system-ppc-19019 [041] .... 786844.070818: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
> >>  qemu-system-ppc-19019 [041] .... 786844.071102: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x5000,0xe67a003,0xdc73490
> >>  qemu-system-ppc-19019 [041] .... 786844.071103: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
> >>  qemu-system-ppc-19019 [041] .... 786844.071123: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
> >>  qemu-system-ppc-19019 [041] .... 786844.073532: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x3228,0x0,0xdc73488
> >>  qemu-system-ppc-19019 [041] .... 786844.073534: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_PARAMETER
> >>
> >> If I disable in-kernel TCE acceleration, or use TCG, it seems to work
> >> fine. If I add the following hack it seems to fix things:
> >>
> >>   @@ -305,7 +307,8 @@ void virtio_free_desc(struct vqs *vq, int id, uint64_t features)
> >>    
> >>           if (features & VIRTIO_F_VERSION_1) {
> >>                   if (features & VIRTIO_F_IOMMU_PLATFORM) {
> >>   -                       SLOF_dma_map_out(le64_to_cpu(desc->addr),
> >>   +                       fprintf(stderr, "mapping out desc->addr: %llx\n", le64_to_cpu(desc->addr));
> >>   +                       SLOF_dma_map_out(le64_to_cpu(desc->addr) & ~(0xfff),
> >>                                            0, le32_to_cpu(desc->len));
> >>                           vq->desc_gpas[id] = NULL;
> >>                   }
> >>
> >> I guess the kernel is a bit stricter about unmapping unaligned TCE entries? If
> >> so, maybe we should align automatically in dma-map-out?
> > 
> > 
> > I thought I did that as well in 1/4, I'll double check. Hmmm.
> 
> 
> v2 of these patches did not align (or, trunc, really) in dma-map-out but
> v3 do, it is "swap dma-trunc swap" in dma-map-out of 1/4. SO I am really
> confused now.

Yah sorry I think the issue was on my end. I thought:

  https://github.com/aik/SLOF/commits/virtio-iommu
  
reflected this series, but it does contain an older version of the
dma-map* patch:

  index f136324..be529c9 100644
  --- a/board-qemu/slof/pci-phb.fs
  +++ b/board-qemu/slof/pci-phb.fs
  @@ -146,6 +146,7 @@ setup-puid
   \ grub does not align allocated addresses to the size so when mapping,
   \ we might ask bm-alloc for an extra IOMMU page
   : dma-align ( size virt -- aligned-size ) tce-mask and + tce-ps #aligned ;
  +: dma-trunc ( addr -- addr&~fff) tce-mask not and ;
  
   : dma-map-in  ( virt size cachable? -- devaddr )
      phb-debug? IF cr ." dma-map-in called: " .s cr THEN
  @@ -163,7 +164,7 @@ setup-puid
      swap                             ( size dev-addr virt )
      2dup tce-mask and or >r         \ add page offset to the return value
  
  -   tce-mask not and 3 OR            \ Trunk virt and add read and write perm
  +   dma-trunc 3 OR                   \ Truncate and add read and write perm
      rot                             ( virt dev-addr size r: dev-addr )
      0
      ?DO
  @@ -179,10 +180,10 @@ setup-puid
   : dma-map-out  ( virt devaddr size -- )
      phb-debug? IF cr ." dma-map-out called: " .s cr THEN
      (init-dma-window-vars)
  -   rot drop            ( devaddr size )
  +   rot drop                        ( devaddr size )
      over dma-align
  -   2dup swap tce-mask not and swap bm-handle -rot
  -   bm-free
  +   swap dma-trunc swap             ( devaddr-trunc size-extended )
  +   2dup bm-handle -rot bm-free
      0
      ?DO
          dup 0 dma-window-liobn -rot


I haven't been able to re-test yet since something seems to have gone
awry with the test system last night, but I assume that's what the issue
was. Sorry for the confusion.

> 
> It is true that checking differs between qemu's and kvm's h_put_tce
> handlers, I added this (see below) to my qemu and it still works, can
> you please try in your tree and capture the spapr_iommu_pci_put traces?
> Thanks,
> 
> 
> diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
> index 3d3bcc86496a..feae27bf11fa 100644
> --- a/hw/ppc/spapr_iommu.c
> +++ b/hw/ppc/spapr_iommu.c
> @@ -564,6 +564,9 @@ static target_ulong h_put_tce(PowerPCCPU *cpu,
> SpaprMachineState *spapr,
>      if (tcet) {
>          hwaddr page_mask = IOMMU_PAGE_MASK(tcet->page_shift);
> 
> +        if (ioba & ~page_mask)
> +            abort();
> +
> 
> 
> 
> 
> -- 
> Alexey
Alexey Kardashevskiy Nov. 28, 2019, 12:15 a.m. UTC | #7
On 23/11/2019 01:42, Michael Roth wrote:
> Quoting Alexey Kardashevskiy (2019-11-21 21:14:10)
>>
>>
>> On 22/11/2019 12:49, Alexey Kardashevskiy wrote:
>>>
>>>
>>> On 22/11/2019 11:09, Michael Roth wrote:
>>>> Quoting Alexey Kardashevskiy (2019-11-20 17:58:01)
>>>>> When QEMU is started with iommu_platform=on, the guest driver must accept
>>>>> it or the device will fail.
>>>>>
>>>>> This enables IOMMU support for virtio-net, -scsi, -block, -serial, -9p
>>>>> devices. -serial and -9p are only compile tested though.
>>>>>
>>>>> For virtio-net we map all RX buffers once and TX when xmit() is called
>>>>> and unmap older pages when we are about to reuse the VQ descriptor.
>>>>> As all other devices are synchronous, we unmap IOMMU pages right after
>>>>> completion of a transaction.
>>>>>
>>>>> This depends on QEMU's:
>>>>> https://patchwork.ozlabs.org/patch/1194067/
>>>>>
>>>>> Copied-bits-and-pieces-from: Michael Roth <mdroth@linux.vnet.ibm.com>
>>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>>> ---
>>>>> Changes:
>>>>> v2:
>>>>> * added Mike's fs/dma-instance-function.fs
>>>>> * total rework
>>>>> ---
>>>>>  board-qemu/slof/Makefile         |  1 +
>>>>>  lib/libvirtio/virtio.h           |  5 ++
>>>>>  lib/libvirtio/virtio-9p.c        |  4 ++
>>>>>  lib/libvirtio/virtio-blk.c       |  4 ++
>>>>>  lib/libvirtio/virtio-net.c       |  5 +-
>>>>>  lib/libvirtio/virtio-scsi.c      |  5 ++
>>>>>  lib/libvirtio/virtio-serial.c    | 12 +++--
>>>>>  lib/libvirtio/virtio.c           | 82 +++++++++++++++++++++++++++++++-
>>>>>  board-qemu/slof/OF.fs            |  3 ++
>>>>>  slof/fs/dma-instance-function.fs | 28 +++++++++++
>>>>>  10 files changed, 143 insertions(+), 6 deletions(-)
>>>>>  create mode 100644 slof/fs/dma-instance-function.fs
>>>>>
>>>>> diff --git a/board-qemu/slof/Makefile b/board-qemu/slof/Makefile
>>>>> index 2263e751bde9..d7ed2d7a6f18 100644
>>>>> --- a/board-qemu/slof/Makefile
>>>>> +++ b/board-qemu/slof/Makefile
>>>>> @@ -99,6 +99,7 @@ OF_FFS_FILES = \
>>>>>         $(SLOFCMNDIR)/fs/graphics.fs \
>>>>>         $(SLOFCMNDIR)/fs/generic-disk.fs \
>>>>>         $(SLOFCMNDIR)/fs/dma-function.fs \
>>>>> +       $(SLOFCMNDIR)/fs/dma-instance-function.fs \
>>>>>         $(SLOFCMNDIR)/fs/pci-device.fs \
>>>>>         $(SLOFCMNDIR)/fs/pci-bridge.fs \
>>>>>         $(SLOFCMNDIR)/fs/pci-properties.fs \
>>>>> diff --git a/lib/libvirtio/virtio.h b/lib/libvirtio/virtio.h
>>>>> index 4927a97f9f5f..250120b3112b 100644
>>>>> --- a/lib/libvirtio/virtio.h
>>>>> +++ b/lib/libvirtio/virtio.h
>>>>> @@ -29,6 +29,7 @@
>>>>>  #define VIRTIO_F_RING_INDIRECT_DESC    BIT(28)
>>>>>  #define VIRTIO_F_RING_EVENT_IDX                BIT(29)
>>>>>  #define VIRTIO_F_VERSION_1             BIT(32)
>>>>> +#define VIRTIO_F_IOMMU_PLATFORM         BIT(33)
>>>>>
>>>>>  #define VIRTIO_TIMEOUT                 5000 /* 5 sec timeout */
>>>>>
>>>>> @@ -84,6 +85,8 @@ struct vqs {
>>>>>         struct vring_desc *desc;
>>>>>         struct vring_avail *avail;
>>>>>         struct vring_used *used;
>>>>> +       void **desc_gpas; /* to get gpa from desc->addr (which is ioba) */
>>>>> +       uint64_t bus_desc;
>>>>>  };
>>>>>
>>>>>  struct virtio_device {
>>>>> @@ -109,6 +112,8 @@ extern struct vring_used *virtio_get_vring_used(struct virtio_device *dev, int q
>>>>>  extern void virtio_fill_desc(struct vqs *vq, int id, uint64_t features,
>>>>>                               uint64_t addr, uint32_t len,
>>>>>                               uint16_t flags, uint16_t next);
>>>>> +extern void virtio_free_desc(struct vqs *vq, int id, uint64_t features);
>>>>> +void *virtio_desc_addr(struct virtio_device *vdev, int queue, int id);
>>>>>  extern struct vqs *virtio_queue_init_vq(struct virtio_device *dev, unsigned int id);
>>>>>  extern void virtio_queue_term_vq(struct virtio_device *dev, struct vqs *vq, unsigned int id);
>>>>>
>>>>> diff --git a/lib/libvirtio/virtio-9p.c b/lib/libvirtio/virtio-9p.c
>>>>> index 426069fe9509..76078612b06e 100644
>>>>> --- a/lib/libvirtio/virtio-9p.c
>>>>> +++ b/lib/libvirtio/virtio-9p.c
>>>>> @@ -129,6 +129,10 @@ static int virtio_9p_transact(void *opaque, uint8_t *tx, int tx_size, uint8_t *r
>>>>>                 // do something better
>>>>>                 mb();
>>>>>         }
>>>>> +
>>>>> +       virtio_free_desc(vq, id, dev->features);
>>>>> +       virtio_free_desc(vq, id + 1, dev->features);
>>>>> +
>>>>>         if (i == 0) {
>>>>>                 return -1;
>>>>>         }
>>>>> diff --git a/lib/libvirtio/virtio-blk.c b/lib/libvirtio/virtio-blk.c
>>>>> index a0dadbb0d6a8..0363038e559d 100644
>>>>> --- a/lib/libvirtio/virtio-blk.c
>>>>> +++ b/lib/libvirtio/virtio-blk.c
>>>>> @@ -195,6 +195,10 @@ virtioblk_transfer(struct virtio_device *dev, char *buf, uint64_t blocknum,
>>>>>                         break;
>>>>>         }
>>>>>
>>>>> +       virtio_free_desc(vq, id, dev->features);
>>>>> +       virtio_free_desc(vq, id + 1, dev->features);
>>>>> +       virtio_free_desc(vq, id + 2, dev->features);
>>>>> +
>>>>>         if (status == 0)
>>>>>                 return cnt;
>>>>>
>>>>> diff --git a/lib/libvirtio/virtio-net.c b/lib/libvirtio/virtio-net.c
>>>>> index ae67883020ef..5a0d19088527 100644
>>>>> --- a/lib/libvirtio/virtio-net.c
>>>>> +++ b/lib/libvirtio/virtio-net.c
>>>>> @@ -255,6 +255,9 @@ static int virtionet_xmit(struct virtio_net *vnet, char *buf, int len)
>>>>>         idx = virtio_modern16_to_cpu(vdev, vq_tx->avail->idx);
>>>>>         id = (idx * 2) % vq_tx->size;
>>>>>
>>>>> +       virtio_free_desc(vq_tx, id, vdev->features);
>>>>> +       virtio_free_desc(vq_tx, id + 1, vdev->features);
>>>>> +
>>>>>         /* Set up virtqueue descriptor for header */
>>>>>         virtio_fill_desc(vq_tx, id, vdev->features, (uint64_t)nethdr,
>>>>>                          net_hdr_size, VRING_DESC_F_NEXT, id + 1);
>>>>> @@ -317,7 +320,7 @@ static int virtionet_receive(struct virtio_net *vnet, char *buf, int maxlen)
>>>>>  #endif
>>>>>
>>>>>         /* Copy data to destination buffer */
>>>>> -       memcpy(buf, (void *)virtio_modern64_to_cpu(vdev, vq_rx->desc[id].addr), len);
>>>>> +       memcpy(buf, virtio_desc_addr(vdev, VQ_RX, id), len);
>>>>>
>>>>>         /* Move indices to next entries */
>>>>>         last_rx_idx = last_rx_idx + 1;
>>>>> diff --git a/lib/libvirtio/virtio-scsi.c b/lib/libvirtio/virtio-scsi.c
>>>>> index 251661e8d466..44573d10e24c 100644
>>>>> --- a/lib/libvirtio/virtio-scsi.c
>>>>> +++ b/lib/libvirtio/virtio-scsi.c
>>>>> @@ -81,6 +81,11 @@ int virtioscsi_send(struct virtio_device *dev,
>>>>>                         break;
>>>>>         }
>>>>>
>>>>> +       virtio_free_desc(vq, id, dev->features);
>>>>> +       virtio_free_desc(vq, id + 1, dev->features);
>>>>> +       if (!(buf == NULL || buf_len == 0))
>>>>> +               virtio_free_desc(vq, id + 2, dev->features);
>>>>> +
>>>>>         return 0;
>>>>>  }
>>>>>
>>>>> diff --git a/lib/libvirtio/virtio-serial.c b/lib/libvirtio/virtio-serial.c
>>>>> index 78ab0e25d7db..66fbfe8b6cc6 100644
>>>>> --- a/lib/libvirtio/virtio-serial.c
>>>>> +++ b/lib/libvirtio/virtio-serial.c
>>>>> @@ -108,7 +108,7 @@ void virtio_serial_shutdown(struct virtio_device *dev)
>>>>>
>>>>>  int virtio_serial_putchar(struct virtio_device *dev, char c)
>>>>>  {
>>>>> -       int id;
>>>>> +       int id, ret;
>>>>>         uint32_t vq_size, time;
>>>>>         volatile uint16_t *current_used_idx;
>>>>>         uint16_t last_used_idx, avail_idx;
>>>>> @@ -135,17 +135,21 @@ int virtio_serial_putchar(struct virtio_device *dev, char c)
>>>>>         virtio_queue_notify(dev, TX_Q);
>>>>>
>>>>>         /* Wait for host to consume the descriptor */
>>>>> +       ret = 1;
>>>>>         time = SLOF_GetTimer() + VIRTIO_TIMEOUT;
>>>>>         while (*current_used_idx == last_used_idx) {
>>>>>                 // do something better
>>>>>                 mb();
>>>>>                 if (time < SLOF_GetTimer()) {
>>>>>                         printf("virtio_serial_putchar failed! \n");
>>>>> -                       return 0;
>>>>> +                       ret = 0;
>>>>> +                       break;
>>>>>                 }
>>>>>         }
>>>>>
>>>>> -       return 1;
>>>>> +       virtio_free_desc(vq, id, dev->features);
>>>>> +
>>>>> +       return ret;
>>>>>  }
>>>>>
>>>>>  char virtio_serial_getchar(struct virtio_device *dev)
>>>>> @@ -165,7 +169,7 @@ char virtio_serial_getchar(struct virtio_device *dev)
>>>>>                 % vq_rx->size;
>>>>>
>>>>>         /* Copy data to destination buffer */
>>>>> -       memcpy(buf, (void *)virtio_modern64_to_cpu(dev, vq_rx->desc[id - 1].addr), RX_ELEM_SIZE);
>>>>> +       memcpy(buf, virtio_desc_addr(dev, RX_Q, id - 1), RX_ELEM_SIZE);
>>>>>
>>>>>         /* Move indices to next entries */
>>>>>         last_rx_idx = last_rx_idx + 1;
>>>>> diff --git a/lib/libvirtio/virtio.c b/lib/libvirtio/virtio.c
>>>>> index fff0c7e54c19..d4db5f656888 100644
>>>>> --- a/lib/libvirtio/virtio.c
>>>>> +++ b/lib/libvirtio/virtio.c
>>>>> @@ -273,6 +273,17 @@ void virtio_fill_desc(struct vqs *vq, int id, uint64_t features,
>>>>>         next %= vq->size;
>>>>>
>>>>>         if (features & VIRTIO_F_VERSION_1) {
>>>>> +               if (features & VIRTIO_F_IOMMU_PLATFORM) {
>>>>> +                       void *gpa = (void *) addr;
>>>>> +
>>>>> +                       if (!vq->desc_gpas) {
>>>>> +                               fprintf(stderr, "IOMMU setup has not been done!\n");
>>>>> +                               return;
>>>>> +                       }
>>>>> +
>>>>> +                       addr = SLOF_dma_map_in(gpa, len, 0);
>>>>> +                       vq->desc_gpas[id] = gpa;
>>>>> +               }
>>>>>                 desc->addr = cpu_to_le64(addr);
>>>>>                 desc->len = cpu_to_le32(len);
>>>>>                 desc->flags = cpu_to_le16(flags);
>>>>> @@ -285,6 +296,32 @@ void virtio_fill_desc(struct vqs *vq, int id, uint64_t features,
>>>>>         }
>>>>>  }
>>>>>
>>>>> +void virtio_free_desc(struct vqs *vq, int id, uint64_t features)
>>>>> +{
>>>>> +       struct vring_desc *desc;
>>>>> +
>>>>> +       id %= vq->size;
>>>>> +       desc = &vq->desc[id];
>>>>> +
>>>>> +       if (features & VIRTIO_F_VERSION_1) {
>>>>> +               if (features & VIRTIO_F_IOMMU_PLATFORM) {
>>>>> +                       SLOF_dma_map_out(le64_to_cpu(desc->addr),
>>>>> +                                        0, le32_to_cpu(desc->len));
>>>>> +                       vq->desc_gpas[id] = NULL;
>>>>> +               }
>>>>> +       }
>>>>> +}
>>>>
>>>> On a 4.18 RHEL8 kernel I was unable to boot from virtio-blk, it complains the
>>>> disk isn't bootable and the H_PUT_TCE traces show:
>>>>
>>>>  qemu-system-ppc-19019 [045] .... 786844.067356: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x1000,0xe46b003,0xdc73350
>>>>  qemu-system-ppc-19019 [045] .... 786844.067358: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
>>>>  qemu-system-ppc-19019 [045] .... 786844.067361: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x2000,0xe46c003,0xdc73350
>>>>  qemu-system-ppc-19019 [045] .... 786844.067362: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
>>>>  qemu-system-ppc-19019 [041] .... 786844.070532: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x3000,0xe67b003,0xdc73490
>>>>  qemu-system-ppc-19019 [041] .... 786844.070533: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
>>>>  qemu-system-ppc-19019 [041] .... 786844.070817: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x4000,0xe469003,0xdc73490
>>>>  qemu-system-ppc-19019 [041] .... 786844.070818: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
>>>>  qemu-system-ppc-19019 [041] .... 786844.071102: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x5000,0xe67a003,0xdc73490
>>>>  qemu-system-ppc-19019 [041] .... 786844.071103: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
>>>>  qemu-system-ppc-19019 [041] .... 786844.071123: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
>>>>  qemu-system-ppc-19019 [041] .... 786844.073532: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x3228,0x0,0xdc73488
>>>>  qemu-system-ppc-19019 [041] .... 786844.073534: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_PARAMETER
>>>>
>>>> If I disable in-kernel TCE acceleration, or use TCG, it seems to work
>>>> fine. If I add the following hack it seems to fix things:
>>>>
>>>>   @@ -305,7 +307,8 @@ void virtio_free_desc(struct vqs *vq, int id, uint64_t features)
>>>>    
>>>>           if (features & VIRTIO_F_VERSION_1) {
>>>>                   if (features & VIRTIO_F_IOMMU_PLATFORM) {
>>>>   -                       SLOF_dma_map_out(le64_to_cpu(desc->addr),
>>>>   +                       fprintf(stderr, "mapping out desc->addr: %llx\n", le64_to_cpu(desc->addr));
>>>>   +                       SLOF_dma_map_out(le64_to_cpu(desc->addr) & ~(0xfff),
>>>>                                            0, le32_to_cpu(desc->len));
>>>>                           vq->desc_gpas[id] = NULL;
>>>>                   }
>>>>
>>>> I guess the kernel is a bit stricter about unmapping unaligned TCE entries? If
>>>> so, maybe we should align automatically in dma-map-out?
>>>
>>>
>>> I thought I did that as well in 1/4, I'll double check. Hmmm.
>>
>>
>> v2 of these patches did not align (or, trunc, really) in dma-map-out but
>> v3 do, it is "swap dma-trunc swap" in dma-map-out of 1/4. SO I am really
>> confused now.
> 
> Yah sorry I think the issue was on my end. I thought:
> 
>   https://github.com/aik/SLOF/commits/virtio-iommu


My bad, I should have pushed it out since I started pushing them out.

The question now is if we are good to go and I can push these out and
send SLOF blob update so people can actually start testing it? Thanks,


>   
> reflected this series, but it does contain an older version of the
> dma-map* patch:
> 
>   index f136324..be529c9 100644
>   --- a/board-qemu/slof/pci-phb.fs
>   +++ b/board-qemu/slof/pci-phb.fs
>   @@ -146,6 +146,7 @@ setup-puid
>    \ grub does not align allocated addresses to the size so when mapping,
>    \ we might ask bm-alloc for an extra IOMMU page
>    : dma-align ( size virt -- aligned-size ) tce-mask and + tce-ps #aligned ;
>   +: dma-trunc ( addr -- addr&~fff) tce-mask not and ;
>   
>    : dma-map-in  ( virt size cachable? -- devaddr )
>       phb-debug? IF cr ." dma-map-in called: " .s cr THEN
>   @@ -163,7 +164,7 @@ setup-puid
>       swap                             ( size dev-addr virt )
>       2dup tce-mask and or >r         \ add page offset to the return value
>   
>   -   tce-mask not and 3 OR            \ Trunk virt and add read and write perm
>   +   dma-trunc 3 OR                   \ Truncate and add read and write perm
>       rot                             ( virt dev-addr size r: dev-addr )
>       0
>       ?DO
>   @@ -179,10 +180,10 @@ setup-puid
>    : dma-map-out  ( virt devaddr size -- )
>       phb-debug? IF cr ." dma-map-out called: " .s cr THEN
>       (init-dma-window-vars)
>   -   rot drop            ( devaddr size )
>   +   rot drop                        ( devaddr size )
>       over dma-align
>   -   2dup swap tce-mask not and swap bm-handle -rot
>   -   bm-free
>   +   swap dma-trunc swap             ( devaddr-trunc size-extended )
>   +   2dup bm-handle -rot bm-free
>       0
>       ?DO
>           dup 0 dma-window-liobn -rot
> 
> 
> I haven't been able to re-test yet since something seems to have gone
> awry with the test system last night, but I assume that's what the issue
> was. Sorry for the confusion.
> 
>>
>> It is true that checking differs between qemu's and kvm's h_put_tce
>> handlers, I added this (see below) to my qemu and it still works, can
>> you please try in your tree and capture the spapr_iommu_pci_put traces?
>> Thanks,
>>
>>
>> diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
>> index 3d3bcc86496a..feae27bf11fa 100644
>> --- a/hw/ppc/spapr_iommu.c
>> +++ b/hw/ppc/spapr_iommu.c
>> @@ -564,6 +564,9 @@ static target_ulong h_put_tce(PowerPCCPU *cpu,
>> SpaprMachineState *spapr,
>>      if (tcet) {
>>          hwaddr page_mask = IOMMU_PAGE_MASK(tcet->page_shift);
>>
>> +        if (ioba & ~page_mask)
>> +            abort();
>> +
>>
>>
>>
>>
>> -- 
>> Alexey
Michael Roth Dec. 3, 2019, 2:49 a.m. UTC | #8
Quoting Alexey Kardashevskiy (2019-11-27 18:15:03)
> 
> 
> On 23/11/2019 01:42, Michael Roth wrote:
> > Quoting Alexey Kardashevskiy (2019-11-21 21:14:10)
> >>
> >>
> >> On 22/11/2019 12:49, Alexey Kardashevskiy wrote:
> >>>
> >>>
> >>> On 22/11/2019 11:09, Michael Roth wrote:
> >>>> Quoting Alexey Kardashevskiy (2019-11-20 17:58:01)
> >>>>> When QEMU is started with iommu_platform=on, the guest driver must accept
> >>>>> it or the device will fail.
> >>>>>
> >>>>> This enables IOMMU support for virtio-net, -scsi, -block, -serial, -9p
> >>>>> devices. -serial and -9p are only compile tested though.
> >>>>>
> >>>>> For virtio-net we map all RX buffers once and TX when xmit() is called
> >>>>> and unmap older pages when we are about to reuse the VQ descriptor.
> >>>>> As all other devices are synchronous, we unmap IOMMU pages right after
> >>>>> completion of a transaction.
> >>>>>
> >>>>> This depends on QEMU's:
> >>>>> https://patchwork.ozlabs.org/patch/1194067/
> >>>>>
> >>>>> Copied-bits-and-pieces-from: Michael Roth <mdroth@linux.vnet.ibm.com>
> >>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>>>> ---
> >>>>> Changes:
> >>>>> v2:
> >>>>> * added Mike's fs/dma-instance-function.fs
> >>>>> * total rework
> >>>>> ---
> >>>>>  board-qemu/slof/Makefile         |  1 +
> >>>>>  lib/libvirtio/virtio.h           |  5 ++
> >>>>>  lib/libvirtio/virtio-9p.c        |  4 ++
> >>>>>  lib/libvirtio/virtio-blk.c       |  4 ++
> >>>>>  lib/libvirtio/virtio-net.c       |  5 +-
> >>>>>  lib/libvirtio/virtio-scsi.c      |  5 ++
> >>>>>  lib/libvirtio/virtio-serial.c    | 12 +++--
> >>>>>  lib/libvirtio/virtio.c           | 82 +++++++++++++++++++++++++++++++-
> >>>>>  board-qemu/slof/OF.fs            |  3 ++
> >>>>>  slof/fs/dma-instance-function.fs | 28 +++++++++++
> >>>>>  10 files changed, 143 insertions(+), 6 deletions(-)
> >>>>>  create mode 100644 slof/fs/dma-instance-function.fs
> >>>>>
> >>>>> diff --git a/board-qemu/slof/Makefile b/board-qemu/slof/Makefile
> >>>>> index 2263e751bde9..d7ed2d7a6f18 100644
> >>>>> --- a/board-qemu/slof/Makefile
> >>>>> +++ b/board-qemu/slof/Makefile
> >>>>> @@ -99,6 +99,7 @@ OF_FFS_FILES = \
> >>>>>         $(SLOFCMNDIR)/fs/graphics.fs \
> >>>>>         $(SLOFCMNDIR)/fs/generic-disk.fs \
> >>>>>         $(SLOFCMNDIR)/fs/dma-function.fs \
> >>>>> +       $(SLOFCMNDIR)/fs/dma-instance-function.fs \
> >>>>>         $(SLOFCMNDIR)/fs/pci-device.fs \
> >>>>>         $(SLOFCMNDIR)/fs/pci-bridge.fs \
> >>>>>         $(SLOFCMNDIR)/fs/pci-properties.fs \
> >>>>> diff --git a/lib/libvirtio/virtio.h b/lib/libvirtio/virtio.h
> >>>>> index 4927a97f9f5f..250120b3112b 100644
> >>>>> --- a/lib/libvirtio/virtio.h
> >>>>> +++ b/lib/libvirtio/virtio.h
> >>>>> @@ -29,6 +29,7 @@
> >>>>>  #define VIRTIO_F_RING_INDIRECT_DESC    BIT(28)
> >>>>>  #define VIRTIO_F_RING_EVENT_IDX                BIT(29)
> >>>>>  #define VIRTIO_F_VERSION_1             BIT(32)
> >>>>> +#define VIRTIO_F_IOMMU_PLATFORM         BIT(33)
> >>>>>
> >>>>>  #define VIRTIO_TIMEOUT                 5000 /* 5 sec timeout */
> >>>>>
> >>>>> @@ -84,6 +85,8 @@ struct vqs {
> >>>>>         struct vring_desc *desc;
> >>>>>         struct vring_avail *avail;
> >>>>>         struct vring_used *used;
> >>>>> +       void **desc_gpas; /* to get gpa from desc->addr (which is ioba) */
> >>>>> +       uint64_t bus_desc;
> >>>>>  };
> >>>>>
> >>>>>  struct virtio_device {
> >>>>> @@ -109,6 +112,8 @@ extern struct vring_used *virtio_get_vring_used(struct virtio_device *dev, int q
> >>>>>  extern void virtio_fill_desc(struct vqs *vq, int id, uint64_t features,
> >>>>>                               uint64_t addr, uint32_t len,
> >>>>>                               uint16_t flags, uint16_t next);
> >>>>> +extern void virtio_free_desc(struct vqs *vq, int id, uint64_t features);
> >>>>> +void *virtio_desc_addr(struct virtio_device *vdev, int queue, int id);
> >>>>>  extern struct vqs *virtio_queue_init_vq(struct virtio_device *dev, unsigned int id);
> >>>>>  extern void virtio_queue_term_vq(struct virtio_device *dev, struct vqs *vq, unsigned int id);
> >>>>>
> >>>>> diff --git a/lib/libvirtio/virtio-9p.c b/lib/libvirtio/virtio-9p.c
> >>>>> index 426069fe9509..76078612b06e 100644
> >>>>> --- a/lib/libvirtio/virtio-9p.c
> >>>>> +++ b/lib/libvirtio/virtio-9p.c
> >>>>> @@ -129,6 +129,10 @@ static int virtio_9p_transact(void *opaque, uint8_t *tx, int tx_size, uint8_t *r
> >>>>>                 // do something better
> >>>>>                 mb();
> >>>>>         }
> >>>>> +
> >>>>> +       virtio_free_desc(vq, id, dev->features);
> >>>>> +       virtio_free_desc(vq, id + 1, dev->features);
> >>>>> +
> >>>>>         if (i == 0) {
> >>>>>                 return -1;
> >>>>>         }
> >>>>> diff --git a/lib/libvirtio/virtio-blk.c b/lib/libvirtio/virtio-blk.c
> >>>>> index a0dadbb0d6a8..0363038e559d 100644
> >>>>> --- a/lib/libvirtio/virtio-blk.c
> >>>>> +++ b/lib/libvirtio/virtio-blk.c
> >>>>> @@ -195,6 +195,10 @@ virtioblk_transfer(struct virtio_device *dev, char *buf, uint64_t blocknum,
> >>>>>                         break;
> >>>>>         }
> >>>>>
> >>>>> +       virtio_free_desc(vq, id, dev->features);
> >>>>> +       virtio_free_desc(vq, id + 1, dev->features);
> >>>>> +       virtio_free_desc(vq, id + 2, dev->features);
> >>>>> +
> >>>>>         if (status == 0)
> >>>>>                 return cnt;
> >>>>>
> >>>>> diff --git a/lib/libvirtio/virtio-net.c b/lib/libvirtio/virtio-net.c
> >>>>> index ae67883020ef..5a0d19088527 100644
> >>>>> --- a/lib/libvirtio/virtio-net.c
> >>>>> +++ b/lib/libvirtio/virtio-net.c
> >>>>> @@ -255,6 +255,9 @@ static int virtionet_xmit(struct virtio_net *vnet, char *buf, int len)
> >>>>>         idx = virtio_modern16_to_cpu(vdev, vq_tx->avail->idx);
> >>>>>         id = (idx * 2) % vq_tx->size;
> >>>>>
> >>>>> +       virtio_free_desc(vq_tx, id, vdev->features);
> >>>>> +       virtio_free_desc(vq_tx, id + 1, vdev->features);
> >>>>> +
> >>>>>         /* Set up virtqueue descriptor for header */
> >>>>>         virtio_fill_desc(vq_tx, id, vdev->features, (uint64_t)nethdr,
> >>>>>                          net_hdr_size, VRING_DESC_F_NEXT, id + 1);
> >>>>> @@ -317,7 +320,7 @@ static int virtionet_receive(struct virtio_net *vnet, char *buf, int maxlen)
> >>>>>  #endif
> >>>>>
> >>>>>         /* Copy data to destination buffer */
> >>>>> -       memcpy(buf, (void *)virtio_modern64_to_cpu(vdev, vq_rx->desc[id].addr), len);
> >>>>> +       memcpy(buf, virtio_desc_addr(vdev, VQ_RX, id), len);
> >>>>>
> >>>>>         /* Move indices to next entries */
> >>>>>         last_rx_idx = last_rx_idx + 1;
> >>>>> diff --git a/lib/libvirtio/virtio-scsi.c b/lib/libvirtio/virtio-scsi.c
> >>>>> index 251661e8d466..44573d10e24c 100644
> >>>>> --- a/lib/libvirtio/virtio-scsi.c
> >>>>> +++ b/lib/libvirtio/virtio-scsi.c
> >>>>> @@ -81,6 +81,11 @@ int virtioscsi_send(struct virtio_device *dev,
> >>>>>                         break;
> >>>>>         }
> >>>>>
> >>>>> +       virtio_free_desc(vq, id, dev->features);
> >>>>> +       virtio_free_desc(vq, id + 1, dev->features);
> >>>>> +       if (!(buf == NULL || buf_len == 0))
> >>>>> +               virtio_free_desc(vq, id + 2, dev->features);
> >>>>> +
> >>>>>         return 0;
> >>>>>  }
> >>>>>
> >>>>> diff --git a/lib/libvirtio/virtio-serial.c b/lib/libvirtio/virtio-serial.c
> >>>>> index 78ab0e25d7db..66fbfe8b6cc6 100644
> >>>>> --- a/lib/libvirtio/virtio-serial.c
> >>>>> +++ b/lib/libvirtio/virtio-serial.c
> >>>>> @@ -108,7 +108,7 @@ void virtio_serial_shutdown(struct virtio_device *dev)
> >>>>>
> >>>>>  int virtio_serial_putchar(struct virtio_device *dev, char c)
> >>>>>  {
> >>>>> -       int id;
> >>>>> +       int id, ret;
> >>>>>         uint32_t vq_size, time;
> >>>>>         volatile uint16_t *current_used_idx;
> >>>>>         uint16_t last_used_idx, avail_idx;
> >>>>> @@ -135,17 +135,21 @@ int virtio_serial_putchar(struct virtio_device *dev, char c)
> >>>>>         virtio_queue_notify(dev, TX_Q);
> >>>>>
> >>>>>         /* Wait for host to consume the descriptor */
> >>>>> +       ret = 1;
> >>>>>         time = SLOF_GetTimer() + VIRTIO_TIMEOUT;
> >>>>>         while (*current_used_idx == last_used_idx) {
> >>>>>                 // do something better
> >>>>>                 mb();
> >>>>>                 if (time < SLOF_GetTimer()) {
> >>>>>                         printf("virtio_serial_putchar failed! \n");
> >>>>> -                       return 0;
> >>>>> +                       ret = 0;
> >>>>> +                       break;
> >>>>>                 }
> >>>>>         }
> >>>>>
> >>>>> -       return 1;
> >>>>> +       virtio_free_desc(vq, id, dev->features);
> >>>>> +
> >>>>> +       return ret;
> >>>>>  }
> >>>>>
> >>>>>  char virtio_serial_getchar(struct virtio_device *dev)
> >>>>> @@ -165,7 +169,7 @@ char virtio_serial_getchar(struct virtio_device *dev)
> >>>>>                 % vq_rx->size;
> >>>>>
> >>>>>         /* Copy data to destination buffer */
> >>>>> -       memcpy(buf, (void *)virtio_modern64_to_cpu(dev, vq_rx->desc[id - 1].addr), RX_ELEM_SIZE);
> >>>>> +       memcpy(buf, virtio_desc_addr(dev, RX_Q, id - 1), RX_ELEM_SIZE);
> >>>>>
> >>>>>         /* Move indices to next entries */
> >>>>>         last_rx_idx = last_rx_idx + 1;
> >>>>> diff --git a/lib/libvirtio/virtio.c b/lib/libvirtio/virtio.c
> >>>>> index fff0c7e54c19..d4db5f656888 100644
> >>>>> --- a/lib/libvirtio/virtio.c
> >>>>> +++ b/lib/libvirtio/virtio.c
> >>>>> @@ -273,6 +273,17 @@ void virtio_fill_desc(struct vqs *vq, int id, uint64_t features,
> >>>>>         next %= vq->size;
> >>>>>
> >>>>>         if (features & VIRTIO_F_VERSION_1) {
> >>>>> +               if (features & VIRTIO_F_IOMMU_PLATFORM) {
> >>>>> +                       void *gpa = (void *) addr;
> >>>>> +
> >>>>> +                       if (!vq->desc_gpas) {
> >>>>> +                               fprintf(stderr, "IOMMU setup has not been done!\n");
> >>>>> +                               return;
> >>>>> +                       }
> >>>>> +
> >>>>> +                       addr = SLOF_dma_map_in(gpa, len, 0);
> >>>>> +                       vq->desc_gpas[id] = gpa;
> >>>>> +               }
> >>>>>                 desc->addr = cpu_to_le64(addr);
> >>>>>                 desc->len = cpu_to_le32(len);
> >>>>>                 desc->flags = cpu_to_le16(flags);
> >>>>> @@ -285,6 +296,32 @@ void virtio_fill_desc(struct vqs *vq, int id, uint64_t features,
> >>>>>         }
> >>>>>  }
> >>>>>
> >>>>> +void virtio_free_desc(struct vqs *vq, int id, uint64_t features)
> >>>>> +{
> >>>>> +       struct vring_desc *desc;
> >>>>> +
> >>>>> +       id %= vq->size;
> >>>>> +       desc = &vq->desc[id];
> >>>>> +
> >>>>> +       if (features & VIRTIO_F_VERSION_1) {
> >>>>> +               if (features & VIRTIO_F_IOMMU_PLATFORM) {
> >>>>> +                       SLOF_dma_map_out(le64_to_cpu(desc->addr),
> >>>>> +                                        0, le32_to_cpu(desc->len));
> >>>>> +                       vq->desc_gpas[id] = NULL;
> >>>>> +               }
> >>>>> +       }
> >>>>> +}
> >>>>
> >>>> On a 4.18 RHEL8 kernel I was unable to boot from virtio-blk, it complains the
> >>>> disk isn't bootable and the H_PUT_TCE traces show:
> >>>>
> >>>>  qemu-system-ppc-19019 [045] .... 786844.067356: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x1000,0xe46b003,0xdc73350
> >>>>  qemu-system-ppc-19019 [045] .... 786844.067358: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
> >>>>  qemu-system-ppc-19019 [045] .... 786844.067361: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x2000,0xe46c003,0xdc73350
> >>>>  qemu-system-ppc-19019 [045] .... 786844.067362: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
> >>>>  qemu-system-ppc-19019 [041] .... 786844.070532: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x3000,0xe67b003,0xdc73490
> >>>>  qemu-system-ppc-19019 [041] .... 786844.070533: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
> >>>>  qemu-system-ppc-19019 [041] .... 786844.070817: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x4000,0xe469003,0xdc73490
> >>>>  qemu-system-ppc-19019 [041] .... 786844.070818: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
> >>>>  qemu-system-ppc-19019 [041] .... 786844.071102: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x5000,0xe67a003,0xdc73490
> >>>>  qemu-system-ppc-19019 [041] .... 786844.071103: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
> >>>>  qemu-system-ppc-19019 [041] .... 786844.071123: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
> >>>>  qemu-system-ppc-19019 [041] .... 786844.073532: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x3228,0x0,0xdc73488
> >>>>  qemu-system-ppc-19019 [041] .... 786844.073534: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_PARAMETER
> >>>>
> >>>> If I disable in-kernel TCE acceleration, or use TCG, it seems to work
> >>>> fine. If I add the following hack it seems to fix things:
> >>>>
> >>>>   @@ -305,7 +307,8 @@ void virtio_free_desc(struct vqs *vq, int id, uint64_t features)
> >>>>    
> >>>>           if (features & VIRTIO_F_VERSION_1) {
> >>>>                   if (features & VIRTIO_F_IOMMU_PLATFORM) {
> >>>>   -                       SLOF_dma_map_out(le64_to_cpu(desc->addr),
> >>>>   +                       fprintf(stderr, "mapping out desc->addr: %llx\n", le64_to_cpu(desc->addr));
> >>>>   +                       SLOF_dma_map_out(le64_to_cpu(desc->addr) & ~(0xfff),
> >>>>                                            0, le32_to_cpu(desc->len));
> >>>>                           vq->desc_gpas[id] = NULL;
> >>>>                   }
> >>>>
> >>>> I guess the kernel is a bit stricter about unmapping unaligned TCE entries? If
> >>>> so, maybe we should align automatically in dma-map-out?
> >>>
> >>>
> >>> I thought I did that as well in 1/4, I'll double check. Hmmm.
> >>
> >>
> >> v2 of these patches did not align (or, trunc, really) in dma-map-out but
> >> v3 do, it is "swap dma-trunc swap" in dma-map-out of 1/4. SO I am really
> >> confused now.
> > 
> > Yah sorry I think the issue was on my end. I thought:
> > 
> >   https://github.com/aik/SLOF/commits/virtio-iommu
> 
> 
> My bad, I should have pushed it out since I started pushing them out.
> 
> The question now is if we are good to go and I can push these out and
> send SLOF blob update so people can actually start testing it? Thanks,

System came back after clearing GARD record, re-tested disk boot for
virtio-blk/virtio-scsi, and netboot via virtio-net, with iommu on/off and
everything seems to be working.

Still need to verify in an actual uv-capable system, but in terms of base
IOMMU enablement the series looks good:

Tested-by: Michael Roth <mdroth@linux.vnet.ibm.com>

> 
> 
> >   
> > reflected this series, but it does contain an older version of the
> > dma-map* patch:
> > 
> >   index f136324..be529c9 100644
> >   --- a/board-qemu/slof/pci-phb.fs
> >   +++ b/board-qemu/slof/pci-phb.fs
> >   @@ -146,6 +146,7 @@ setup-puid
> >    \ grub does not align allocated addresses to the size so when mapping,
> >    \ we might ask bm-alloc for an extra IOMMU page
> >    : dma-align ( size virt -- aligned-size ) tce-mask and + tce-ps #aligned ;
> >   +: dma-trunc ( addr -- addr&~fff) tce-mask not and ;
> >   
> >    : dma-map-in  ( virt size cachable? -- devaddr )
> >       phb-debug? IF cr ." dma-map-in called: " .s cr THEN
> >   @@ -163,7 +164,7 @@ setup-puid
> >       swap                             ( size dev-addr virt )
> >       2dup tce-mask and or >r         \ add page offset to the return value
> >   
> >   -   tce-mask not and 3 OR            \ Trunk virt and add read and write perm
> >   +   dma-trunc 3 OR                   \ Truncate and add read and write perm
> >       rot                             ( virt dev-addr size r: dev-addr )
> >       0
> >       ?DO
> >   @@ -179,10 +180,10 @@ setup-puid
> >    : dma-map-out  ( virt devaddr size -- )
> >       phb-debug? IF cr ." dma-map-out called: " .s cr THEN
> >       (init-dma-window-vars)
> >   -   rot drop            ( devaddr size )
> >   +   rot drop                        ( devaddr size )
> >       over dma-align
> >   -   2dup swap tce-mask not and swap bm-handle -rot
> >   -   bm-free
> >   +   swap dma-trunc swap             ( devaddr-trunc size-extended )
> >   +   2dup bm-handle -rot bm-free
> >       0
> >       ?DO
> >           dup 0 dma-window-liobn -rot
> > 
> > 
> > I haven't been able to re-test yet since something seems to have gone
> > awry with the test system last night, but I assume that's what the issue
> > was. Sorry for the confusion.
> > 
> >>
> >> It is true that checking differs between qemu's and kvm's h_put_tce
> >> handlers, I added this (see below) to my qemu and it still works, can
> >> you please try in your tree and capture the spapr_iommu_pci_put traces?
> >> Thanks,
> >>
> >>
> >> diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
> >> index 3d3bcc86496a..feae27bf11fa 100644
> >> --- a/hw/ppc/spapr_iommu.c
> >> +++ b/hw/ppc/spapr_iommu.c
> >> @@ -564,6 +564,9 @@ static target_ulong h_put_tce(PowerPCCPU *cpu,
> >> SpaprMachineState *spapr,
> >>      if (tcet) {
> >>          hwaddr page_mask = IOMMU_PAGE_MASK(tcet->page_shift);
> >>
> >> +        if (ioba & ~page_mask)
> >> +            abort();
> >> +
> >>
> >>
> >>
> >>
> >> -- 
> >> Alexey
> 
> -- 
> Alexey
Alexey Kardashevskiy Dec. 3, 2019, 3:11 a.m. UTC | #9
On 03/12/2019 13:49, Michael Roth wrote:
> Quoting Alexey Kardashevskiy (2019-11-27 18:15:03)
>>
>>
>> On 23/11/2019 01:42, Michael Roth wrote:
>>> Quoting Alexey Kardashevskiy (2019-11-21 21:14:10)
>>>>
>>>>
>>>> On 22/11/2019 12:49, Alexey Kardashevskiy wrote:
>>>>>
>>>>>
>>>>> On 22/11/2019 11:09, Michael Roth wrote:
>>>>>> Quoting Alexey Kardashevskiy (2019-11-20 17:58:01)
>>>>>>> When QEMU is started with iommu_platform=on, the guest driver must accept
>>>>>>> it or the device will fail.
>>>>>>>
>>>>>>> This enables IOMMU support for virtio-net, -scsi, -block, -serial, -9p
>>>>>>> devices. -serial and -9p are only compile tested though.
>>>>>>>
>>>>>>> For virtio-net we map all RX buffers once and TX when xmit() is called
>>>>>>> and unmap older pages when we are about to reuse the VQ descriptor.
>>>>>>> As all other devices are synchronous, we unmap IOMMU pages right after
>>>>>>> completion of a transaction.
>>>>>>>
>>>>>>> This depends on QEMU's:
>>>>>>> https://patchwork.ozlabs.org/patch/1194067/
>>>>>>>
>>>>>>> Copied-bits-and-pieces-from: Michael Roth <mdroth@linux.vnet.ibm.com>
>>>>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>>>>> ---
>>>>>>> Changes:
>>>>>>> v2:
>>>>>>> * added Mike's fs/dma-instance-function.fs
>>>>>>> * total rework
>>>>>>> ---
>>>>>>>  board-qemu/slof/Makefile         |  1 +
>>>>>>>  lib/libvirtio/virtio.h           |  5 ++
>>>>>>>  lib/libvirtio/virtio-9p.c        |  4 ++
>>>>>>>  lib/libvirtio/virtio-blk.c       |  4 ++
>>>>>>>  lib/libvirtio/virtio-net.c       |  5 +-
>>>>>>>  lib/libvirtio/virtio-scsi.c      |  5 ++
>>>>>>>  lib/libvirtio/virtio-serial.c    | 12 +++--
>>>>>>>  lib/libvirtio/virtio.c           | 82 +++++++++++++++++++++++++++++++-
>>>>>>>  board-qemu/slof/OF.fs            |  3 ++
>>>>>>>  slof/fs/dma-instance-function.fs | 28 +++++++++++
>>>>>>>  10 files changed, 143 insertions(+), 6 deletions(-)
>>>>>>>  create mode 100644 slof/fs/dma-instance-function.fs
>>>>>>>
>>>>>>> diff --git a/board-qemu/slof/Makefile b/board-qemu/slof/Makefile
>>>>>>> index 2263e751bde9..d7ed2d7a6f18 100644
>>>>>>> --- a/board-qemu/slof/Makefile
>>>>>>> +++ b/board-qemu/slof/Makefile
>>>>>>> @@ -99,6 +99,7 @@ OF_FFS_FILES = \
>>>>>>>         $(SLOFCMNDIR)/fs/graphics.fs \
>>>>>>>         $(SLOFCMNDIR)/fs/generic-disk.fs \
>>>>>>>         $(SLOFCMNDIR)/fs/dma-function.fs \
>>>>>>> +       $(SLOFCMNDIR)/fs/dma-instance-function.fs \
>>>>>>>         $(SLOFCMNDIR)/fs/pci-device.fs \
>>>>>>>         $(SLOFCMNDIR)/fs/pci-bridge.fs \
>>>>>>>         $(SLOFCMNDIR)/fs/pci-properties.fs \
>>>>>>> diff --git a/lib/libvirtio/virtio.h b/lib/libvirtio/virtio.h
>>>>>>> index 4927a97f9f5f..250120b3112b 100644
>>>>>>> --- a/lib/libvirtio/virtio.h
>>>>>>> +++ b/lib/libvirtio/virtio.h
>>>>>>> @@ -29,6 +29,7 @@
>>>>>>>  #define VIRTIO_F_RING_INDIRECT_DESC    BIT(28)
>>>>>>>  #define VIRTIO_F_RING_EVENT_IDX                BIT(29)
>>>>>>>  #define VIRTIO_F_VERSION_1             BIT(32)
>>>>>>> +#define VIRTIO_F_IOMMU_PLATFORM         BIT(33)
>>>>>>>
>>>>>>>  #define VIRTIO_TIMEOUT                 5000 /* 5 sec timeout */
>>>>>>>
>>>>>>> @@ -84,6 +85,8 @@ struct vqs {
>>>>>>>         struct vring_desc *desc;
>>>>>>>         struct vring_avail *avail;
>>>>>>>         struct vring_used *used;
>>>>>>> +       void **desc_gpas; /* to get gpa from desc->addr (which is ioba) */
>>>>>>> +       uint64_t bus_desc;
>>>>>>>  };
>>>>>>>
>>>>>>>  struct virtio_device {
>>>>>>> @@ -109,6 +112,8 @@ extern struct vring_used *virtio_get_vring_used(struct virtio_device *dev, int q
>>>>>>>  extern void virtio_fill_desc(struct vqs *vq, int id, uint64_t features,
>>>>>>>                               uint64_t addr, uint32_t len,
>>>>>>>                               uint16_t flags, uint16_t next);
>>>>>>> +extern void virtio_free_desc(struct vqs *vq, int id, uint64_t features);
>>>>>>> +void *virtio_desc_addr(struct virtio_device *vdev, int queue, int id);
>>>>>>>  extern struct vqs *virtio_queue_init_vq(struct virtio_device *dev, unsigned int id);
>>>>>>>  extern void virtio_queue_term_vq(struct virtio_device *dev, struct vqs *vq, unsigned int id);
>>>>>>>
>>>>>>> diff --git a/lib/libvirtio/virtio-9p.c b/lib/libvirtio/virtio-9p.c
>>>>>>> index 426069fe9509..76078612b06e 100644
>>>>>>> --- a/lib/libvirtio/virtio-9p.c
>>>>>>> +++ b/lib/libvirtio/virtio-9p.c
>>>>>>> @@ -129,6 +129,10 @@ static int virtio_9p_transact(void *opaque, uint8_t *tx, int tx_size, uint8_t *r
>>>>>>>                 // do something better
>>>>>>>                 mb();
>>>>>>>         }
>>>>>>> +
>>>>>>> +       virtio_free_desc(vq, id, dev->features);
>>>>>>> +       virtio_free_desc(vq, id + 1, dev->features);
>>>>>>> +
>>>>>>>         if (i == 0) {
>>>>>>>                 return -1;
>>>>>>>         }
>>>>>>> diff --git a/lib/libvirtio/virtio-blk.c b/lib/libvirtio/virtio-blk.c
>>>>>>> index a0dadbb0d6a8..0363038e559d 100644
>>>>>>> --- a/lib/libvirtio/virtio-blk.c
>>>>>>> +++ b/lib/libvirtio/virtio-blk.c
>>>>>>> @@ -195,6 +195,10 @@ virtioblk_transfer(struct virtio_device *dev, char *buf, uint64_t blocknum,
>>>>>>>                         break;
>>>>>>>         }
>>>>>>>
>>>>>>> +       virtio_free_desc(vq, id, dev->features);
>>>>>>> +       virtio_free_desc(vq, id + 1, dev->features);
>>>>>>> +       virtio_free_desc(vq, id + 2, dev->features);
>>>>>>> +
>>>>>>>         if (status == 0)
>>>>>>>                 return cnt;
>>>>>>>
>>>>>>> diff --git a/lib/libvirtio/virtio-net.c b/lib/libvirtio/virtio-net.c
>>>>>>> index ae67883020ef..5a0d19088527 100644
>>>>>>> --- a/lib/libvirtio/virtio-net.c
>>>>>>> +++ b/lib/libvirtio/virtio-net.c
>>>>>>> @@ -255,6 +255,9 @@ static int virtionet_xmit(struct virtio_net *vnet, char *buf, int len)
>>>>>>>         idx = virtio_modern16_to_cpu(vdev, vq_tx->avail->idx);
>>>>>>>         id = (idx * 2) % vq_tx->size;
>>>>>>>
>>>>>>> +       virtio_free_desc(vq_tx, id, vdev->features);
>>>>>>> +       virtio_free_desc(vq_tx, id + 1, vdev->features);
>>>>>>> +
>>>>>>>         /* Set up virtqueue descriptor for header */
>>>>>>>         virtio_fill_desc(vq_tx, id, vdev->features, (uint64_t)nethdr,
>>>>>>>                          net_hdr_size, VRING_DESC_F_NEXT, id + 1);
>>>>>>> @@ -317,7 +320,7 @@ static int virtionet_receive(struct virtio_net *vnet, char *buf, int maxlen)
>>>>>>>  #endif
>>>>>>>
>>>>>>>         /* Copy data to destination buffer */
>>>>>>> -       memcpy(buf, (void *)virtio_modern64_to_cpu(vdev, vq_rx->desc[id].addr), len);
>>>>>>> +       memcpy(buf, virtio_desc_addr(vdev, VQ_RX, id), len);
>>>>>>>
>>>>>>>         /* Move indices to next entries */
>>>>>>>         last_rx_idx = last_rx_idx + 1;
>>>>>>> diff --git a/lib/libvirtio/virtio-scsi.c b/lib/libvirtio/virtio-scsi.c
>>>>>>> index 251661e8d466..44573d10e24c 100644
>>>>>>> --- a/lib/libvirtio/virtio-scsi.c
>>>>>>> +++ b/lib/libvirtio/virtio-scsi.c
>>>>>>> @@ -81,6 +81,11 @@ int virtioscsi_send(struct virtio_device *dev,
>>>>>>>                         break;
>>>>>>>         }
>>>>>>>
>>>>>>> +       virtio_free_desc(vq, id, dev->features);
>>>>>>> +       virtio_free_desc(vq, id + 1, dev->features);
>>>>>>> +       if (!(buf == NULL || buf_len == 0))
>>>>>>> +               virtio_free_desc(vq, id + 2, dev->features);
>>>>>>> +
>>>>>>>         return 0;
>>>>>>>  }
>>>>>>>
>>>>>>> diff --git a/lib/libvirtio/virtio-serial.c b/lib/libvirtio/virtio-serial.c
>>>>>>> index 78ab0e25d7db..66fbfe8b6cc6 100644
>>>>>>> --- a/lib/libvirtio/virtio-serial.c
>>>>>>> +++ b/lib/libvirtio/virtio-serial.c
>>>>>>> @@ -108,7 +108,7 @@ void virtio_serial_shutdown(struct virtio_device *dev)
>>>>>>>
>>>>>>>  int virtio_serial_putchar(struct virtio_device *dev, char c)
>>>>>>>  {
>>>>>>> -       int id;
>>>>>>> +       int id, ret;
>>>>>>>         uint32_t vq_size, time;
>>>>>>>         volatile uint16_t *current_used_idx;
>>>>>>>         uint16_t last_used_idx, avail_idx;
>>>>>>> @@ -135,17 +135,21 @@ int virtio_serial_putchar(struct virtio_device *dev, char c)
>>>>>>>         virtio_queue_notify(dev, TX_Q);
>>>>>>>
>>>>>>>         /* Wait for host to consume the descriptor */
>>>>>>> +       ret = 1;
>>>>>>>         time = SLOF_GetTimer() + VIRTIO_TIMEOUT;
>>>>>>>         while (*current_used_idx == last_used_idx) {
>>>>>>>                 // do something better
>>>>>>>                 mb();
>>>>>>>                 if (time < SLOF_GetTimer()) {
>>>>>>>                         printf("virtio_serial_putchar failed! \n");
>>>>>>> -                       return 0;
>>>>>>> +                       ret = 0;
>>>>>>> +                       break;
>>>>>>>                 }
>>>>>>>         }
>>>>>>>
>>>>>>> -       return 1;
>>>>>>> +       virtio_free_desc(vq, id, dev->features);
>>>>>>> +
>>>>>>> +       return ret;
>>>>>>>  }
>>>>>>>
>>>>>>>  char virtio_serial_getchar(struct virtio_device *dev)
>>>>>>> @@ -165,7 +169,7 @@ char virtio_serial_getchar(struct virtio_device *dev)
>>>>>>>                 % vq_rx->size;
>>>>>>>
>>>>>>>         /* Copy data to destination buffer */
>>>>>>> -       memcpy(buf, (void *)virtio_modern64_to_cpu(dev, vq_rx->desc[id - 1].addr), RX_ELEM_SIZE);
>>>>>>> +       memcpy(buf, virtio_desc_addr(dev, RX_Q, id - 1), RX_ELEM_SIZE);
>>>>>>>
>>>>>>>         /* Move indices to next entries */
>>>>>>>         last_rx_idx = last_rx_idx + 1;
>>>>>>> diff --git a/lib/libvirtio/virtio.c b/lib/libvirtio/virtio.c
>>>>>>> index fff0c7e54c19..d4db5f656888 100644
>>>>>>> --- a/lib/libvirtio/virtio.c
>>>>>>> +++ b/lib/libvirtio/virtio.c
>>>>>>> @@ -273,6 +273,17 @@ void virtio_fill_desc(struct vqs *vq, int id, uint64_t features,
>>>>>>>         next %= vq->size;
>>>>>>>
>>>>>>>         if (features & VIRTIO_F_VERSION_1) {
>>>>>>> +               if (features & VIRTIO_F_IOMMU_PLATFORM) {
>>>>>>> +                       void *gpa = (void *) addr;
>>>>>>> +
>>>>>>> +                       if (!vq->desc_gpas) {
>>>>>>> +                               fprintf(stderr, "IOMMU setup has not been done!\n");
>>>>>>> +                               return;
>>>>>>> +                       }
>>>>>>> +
>>>>>>> +                       addr = SLOF_dma_map_in(gpa, len, 0);
>>>>>>> +                       vq->desc_gpas[id] = gpa;
>>>>>>> +               }
>>>>>>>                 desc->addr = cpu_to_le64(addr);
>>>>>>>                 desc->len = cpu_to_le32(len);
>>>>>>>                 desc->flags = cpu_to_le16(flags);
>>>>>>> @@ -285,6 +296,32 @@ void virtio_fill_desc(struct vqs *vq, int id, uint64_t features,
>>>>>>>         }
>>>>>>>  }
>>>>>>>
>>>>>>> +void virtio_free_desc(struct vqs *vq, int id, uint64_t features)
>>>>>>> +{
>>>>>>> +       struct vring_desc *desc;
>>>>>>> +
>>>>>>> +       id %= vq->size;
>>>>>>> +       desc = &vq->desc[id];
>>>>>>> +
>>>>>>> +       if (features & VIRTIO_F_VERSION_1) {
>>>>>>> +               if (features & VIRTIO_F_IOMMU_PLATFORM) {
>>>>>>> +                       SLOF_dma_map_out(le64_to_cpu(desc->addr),
>>>>>>> +                                        0, le32_to_cpu(desc->len));
>>>>>>> +                       vq->desc_gpas[id] = NULL;
>>>>>>> +               }
>>>>>>> +       }
>>>>>>> +}
>>>>>>
>>>>>> On a 4.18 RHEL8 kernel I was unable to boot from virtio-blk, it complains the
>>>>>> disk isn't bootable and the H_PUT_TCE traces show:
>>>>>>
>>>>>>  qemu-system-ppc-19019 [045] .... 786844.067356: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x1000,0xe46b003,0xdc73350
>>>>>>  qemu-system-ppc-19019 [045] .... 786844.067358: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
>>>>>>  qemu-system-ppc-19019 [045] .... 786844.067361: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x2000,0xe46c003,0xdc73350
>>>>>>  qemu-system-ppc-19019 [045] .... 786844.067362: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
>>>>>>  qemu-system-ppc-19019 [041] .... 786844.070532: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x3000,0xe67b003,0xdc73490
>>>>>>  qemu-system-ppc-19019 [041] .... 786844.070533: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
>>>>>>  qemu-system-ppc-19019 [041] .... 786844.070817: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x4000,0xe469003,0xdc73490
>>>>>>  qemu-system-ppc-19019 [041] .... 786844.070818: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
>>>>>>  qemu-system-ppc-19019 [041] .... 786844.071102: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x5000,0xe67a003,0xdc73490
>>>>>>  qemu-system-ppc-19019 [041] .... 786844.071103: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
>>>>>>  qemu-system-ppc-19019 [041] .... 786844.071123: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
>>>>>>  qemu-system-ppc-19019 [041] .... 786844.073532: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x3228,0x0,0xdc73488
>>>>>>  qemu-system-ppc-19019 [041] .... 786844.073534: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_PARAMETER
>>>>>>
>>>>>> If I disable in-kernel TCE acceleration, or use TCG, it seems to work
>>>>>> fine. If I add the following hack it seems to fix things:
>>>>>>
>>>>>>   @@ -305,7 +307,8 @@ void virtio_free_desc(struct vqs *vq, int id, uint64_t features)
>>>>>>    
>>>>>>           if (features & VIRTIO_F_VERSION_1) {
>>>>>>                   if (features & VIRTIO_F_IOMMU_PLATFORM) {
>>>>>>   -                       SLOF_dma_map_out(le64_to_cpu(desc->addr),
>>>>>>   +                       fprintf(stderr, "mapping out desc->addr: %llx\n", le64_to_cpu(desc->addr));
>>>>>>   +                       SLOF_dma_map_out(le64_to_cpu(desc->addr) & ~(0xfff),
>>>>>>                                            0, le32_to_cpu(desc->len));
>>>>>>                           vq->desc_gpas[id] = NULL;
>>>>>>                   }
>>>>>>
>>>>>> I guess the kernel is a bit stricter about unmapping unaligned TCE entries? If
>>>>>> so, maybe we should align automatically in dma-map-out?
>>>>>
>>>>>
>>>>> I thought I did that as well in 1/4, I'll double check. Hmmm.
>>>>
>>>>
>>>> v2 of these patches did not align (or, trunc, really) in dma-map-out but
>>>> v3 do, it is "swap dma-trunc swap" in dma-map-out of 1/4. SO I am really
>>>> confused now.
>>>
>>> Yah sorry I think the issue was on my end. I thought:
>>>
>>>   https://github.com/aik/SLOF/commits/virtio-iommu
>>
>>
>> My bad, I should have pushed it out since I started pushing them out.
>>
>> The question now is if we are good to go and I can push these out and
>> send SLOF blob update so people can actually start testing it? Thanks,
> 
> System came back after clearing GARD record, re-tested disk boot for
> virtio-blk/virtio-scsi, and netboot via virtio-net, with iommu on/off and
> everything seems to be working.
> 
> Still need to verify in an actual uv-capable system, but in terms of base
> IOMMU enablement the series looks good:
> 
> Tested-by: Michael Roth <mdroth@linux.vnet.ibm.com>


Oh great, thanks!

Are going to give it some more review or I just push it out and send
slof blob update to qemu?


David, how much do you want to see many rb's for these?


Thanks,



> 
>>
>>
>>>   
>>> reflected this series, but it does contain an older version of the
>>> dma-map* patch:
>>>
>>>   index f136324..be529c9 100644
>>>   --- a/board-qemu/slof/pci-phb.fs
>>>   +++ b/board-qemu/slof/pci-phb.fs
>>>   @@ -146,6 +146,7 @@ setup-puid
>>>    \ grub does not align allocated addresses to the size so when mapping,
>>>    \ we might ask bm-alloc for an extra IOMMU page
>>>    : dma-align ( size virt -- aligned-size ) tce-mask and + tce-ps #aligned ;
>>>   +: dma-trunc ( addr -- addr&~fff) tce-mask not and ;
>>>   
>>>    : dma-map-in  ( virt size cachable? -- devaddr )
>>>       phb-debug? IF cr ." dma-map-in called: " .s cr THEN
>>>   @@ -163,7 +164,7 @@ setup-puid
>>>       swap                             ( size dev-addr virt )
>>>       2dup tce-mask and or >r         \ add page offset to the return value
>>>   
>>>   -   tce-mask not and 3 OR            \ Trunk virt and add read and write perm
>>>   +   dma-trunc 3 OR                   \ Truncate and add read and write perm
>>>       rot                             ( virt dev-addr size r: dev-addr )
>>>       0
>>>       ?DO
>>>   @@ -179,10 +180,10 @@ setup-puid
>>>    : dma-map-out  ( virt devaddr size -- )
>>>       phb-debug? IF cr ." dma-map-out called: " .s cr THEN
>>>       (init-dma-window-vars)
>>>   -   rot drop            ( devaddr size )
>>>   +   rot drop                        ( devaddr size )
>>>       over dma-align
>>>   -   2dup swap tce-mask not and swap bm-handle -rot
>>>   -   bm-free
>>>   +   swap dma-trunc swap             ( devaddr-trunc size-extended )
>>>   +   2dup bm-handle -rot bm-free
>>>       0
>>>       ?DO
>>>           dup 0 dma-window-liobn -rot
>>>
>>>
>>> I haven't been able to re-test yet since something seems to have gone
>>> awry with the test system last night, but I assume that's what the issue
>>> was. Sorry for the confusion.
>>>
>>>>
>>>> It is true that checking differs between qemu's and kvm's h_put_tce
>>>> handlers, I added this (see below) to my qemu and it still works, can
>>>> you please try in your tree and capture the spapr_iommu_pci_put traces?
>>>> Thanks,
>>>>
>>>>
>>>> diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
>>>> index 3d3bcc86496a..feae27bf11fa 100644
>>>> --- a/hw/ppc/spapr_iommu.c
>>>> +++ b/hw/ppc/spapr_iommu.c
>>>> @@ -564,6 +564,9 @@ static target_ulong h_put_tce(PowerPCCPU *cpu,
>>>> SpaprMachineState *spapr,
>>>>      if (tcet) {
>>>>          hwaddr page_mask = IOMMU_PAGE_MASK(tcet->page_shift);
>>>>
>>>> +        if (ioba & ~page_mask)
>>>> +            abort();
>>>> +
>>>>
>>>>
>>>>
>>>>
>>>> -- 
>>>> Alexey
>>
>> -- 
>> Alexey
diff mbox series

Patch

diff --git a/board-qemu/slof/Makefile b/board-qemu/slof/Makefile
index 2263e751bde9..d7ed2d7a6f18 100644
--- a/board-qemu/slof/Makefile
+++ b/board-qemu/slof/Makefile
@@ -99,6 +99,7 @@  OF_FFS_FILES = \
 	$(SLOFCMNDIR)/fs/graphics.fs \
 	$(SLOFCMNDIR)/fs/generic-disk.fs \
 	$(SLOFCMNDIR)/fs/dma-function.fs \
+	$(SLOFCMNDIR)/fs/dma-instance-function.fs \
 	$(SLOFCMNDIR)/fs/pci-device.fs \
 	$(SLOFCMNDIR)/fs/pci-bridge.fs \
 	$(SLOFCMNDIR)/fs/pci-properties.fs \
diff --git a/lib/libvirtio/virtio.h b/lib/libvirtio/virtio.h
index 4927a97f9f5f..250120b3112b 100644
--- a/lib/libvirtio/virtio.h
+++ b/lib/libvirtio/virtio.h
@@ -29,6 +29,7 @@ 
 #define VIRTIO_F_RING_INDIRECT_DESC	BIT(28)
 #define VIRTIO_F_RING_EVENT_IDX		BIT(29)
 #define VIRTIO_F_VERSION_1		BIT(32)
+#define VIRTIO_F_IOMMU_PLATFORM         BIT(33)
 
 #define VIRTIO_TIMEOUT		        5000 /* 5 sec timeout */
 
@@ -84,6 +85,8 @@  struct vqs {
 	struct vring_desc *desc;
 	struct vring_avail *avail;
 	struct vring_used *used;
+	void **desc_gpas; /* to get gpa from desc->addr (which is ioba) */
+	uint64_t bus_desc;
 };
 
 struct virtio_device {
@@ -109,6 +112,8 @@  extern struct vring_used *virtio_get_vring_used(struct virtio_device *dev, int q
 extern void virtio_fill_desc(struct vqs *vq, int id, uint64_t features,
                              uint64_t addr, uint32_t len,
                              uint16_t flags, uint16_t next);
+extern void virtio_free_desc(struct vqs *vq, int id, uint64_t features);
+void *virtio_desc_addr(struct virtio_device *vdev, int queue, int id);
 extern struct vqs *virtio_queue_init_vq(struct virtio_device *dev, unsigned int id);
 extern void virtio_queue_term_vq(struct virtio_device *dev, struct vqs *vq, unsigned int id);
 
diff --git a/lib/libvirtio/virtio-9p.c b/lib/libvirtio/virtio-9p.c
index 426069fe9509..76078612b06e 100644
--- a/lib/libvirtio/virtio-9p.c
+++ b/lib/libvirtio/virtio-9p.c
@@ -129,6 +129,10 @@  static int virtio_9p_transact(void *opaque, uint8_t *tx, int tx_size, uint8_t *r
 		// do something better
 		mb();
 	}
+
+	virtio_free_desc(vq, id, dev->features);
+	virtio_free_desc(vq, id + 1, dev->features);
+
 	if (i == 0) {
 		return -1;
 	}
diff --git a/lib/libvirtio/virtio-blk.c b/lib/libvirtio/virtio-blk.c
index a0dadbb0d6a8..0363038e559d 100644
--- a/lib/libvirtio/virtio-blk.c
+++ b/lib/libvirtio/virtio-blk.c
@@ -195,6 +195,10 @@  virtioblk_transfer(struct virtio_device *dev, char *buf, uint64_t blocknum,
 			break;
 	}
 
+	virtio_free_desc(vq, id, dev->features);
+	virtio_free_desc(vq, id + 1, dev->features);
+	virtio_free_desc(vq, id + 2, dev->features);
+
 	if (status == 0)
 		return cnt;
 
diff --git a/lib/libvirtio/virtio-net.c b/lib/libvirtio/virtio-net.c
index ae67883020ef..5a0d19088527 100644
--- a/lib/libvirtio/virtio-net.c
+++ b/lib/libvirtio/virtio-net.c
@@ -255,6 +255,9 @@  static int virtionet_xmit(struct virtio_net *vnet, char *buf, int len)
 	idx = virtio_modern16_to_cpu(vdev, vq_tx->avail->idx);
 	id = (idx * 2) % vq_tx->size;
 
+	virtio_free_desc(vq_tx, id, vdev->features);
+	virtio_free_desc(vq_tx, id + 1, vdev->features);
+
 	/* Set up virtqueue descriptor for header */
 	virtio_fill_desc(vq_tx, id, vdev->features, (uint64_t)nethdr,
 			 net_hdr_size, VRING_DESC_F_NEXT, id + 1);
@@ -317,7 +320,7 @@  static int virtionet_receive(struct virtio_net *vnet, char *buf, int maxlen)
 #endif
 
 	/* Copy data to destination buffer */
-	memcpy(buf, (void *)virtio_modern64_to_cpu(vdev, vq_rx->desc[id].addr), len);
+	memcpy(buf, virtio_desc_addr(vdev, VQ_RX, id), len);
 
 	/* Move indices to next entries */
 	last_rx_idx = last_rx_idx + 1;
diff --git a/lib/libvirtio/virtio-scsi.c b/lib/libvirtio/virtio-scsi.c
index 251661e8d466..44573d10e24c 100644
--- a/lib/libvirtio/virtio-scsi.c
+++ b/lib/libvirtio/virtio-scsi.c
@@ -81,6 +81,11 @@  int virtioscsi_send(struct virtio_device *dev,
 			break;
 	}
 
+	virtio_free_desc(vq, id, dev->features);
+	virtio_free_desc(vq, id + 1, dev->features);
+	if (!(buf == NULL || buf_len == 0))
+		virtio_free_desc(vq, id + 2, dev->features);
+
 	return 0;
 }
 
diff --git a/lib/libvirtio/virtio-serial.c b/lib/libvirtio/virtio-serial.c
index 78ab0e25d7db..66fbfe8b6cc6 100644
--- a/lib/libvirtio/virtio-serial.c
+++ b/lib/libvirtio/virtio-serial.c
@@ -108,7 +108,7 @@  void virtio_serial_shutdown(struct virtio_device *dev)
 
 int virtio_serial_putchar(struct virtio_device *dev, char c)
 {
-	int id;
+	int id, ret;
 	uint32_t vq_size, time;
 	volatile uint16_t *current_used_idx;
 	uint16_t last_used_idx, avail_idx;
@@ -135,17 +135,21 @@  int virtio_serial_putchar(struct virtio_device *dev, char c)
 	virtio_queue_notify(dev, TX_Q);
 
 	/* Wait for host to consume the descriptor */
+	ret = 1;
 	time = SLOF_GetTimer() + VIRTIO_TIMEOUT;
 	while (*current_used_idx == last_used_idx) {
 		// do something better
 		mb();
 		if (time < SLOF_GetTimer()) {
 			printf("virtio_serial_putchar failed! \n");
-			return 0;
+			ret = 0;
+			break;
 		}
 	}
 
-	return 1;
+	virtio_free_desc(vq, id, dev->features);
+
+	return ret;
 }
 
 char virtio_serial_getchar(struct virtio_device *dev)
@@ -165,7 +169,7 @@  char virtio_serial_getchar(struct virtio_device *dev)
 		% vq_rx->size;
 
 	/* Copy data to destination buffer */
-	memcpy(buf, (void *)virtio_modern64_to_cpu(dev, vq_rx->desc[id - 1].addr), RX_ELEM_SIZE);
+	memcpy(buf, virtio_desc_addr(dev, RX_Q, id - 1), RX_ELEM_SIZE);
 
 	/* Move indices to next entries */
 	last_rx_idx = last_rx_idx + 1;
diff --git a/lib/libvirtio/virtio.c b/lib/libvirtio/virtio.c
index fff0c7e54c19..d4db5f656888 100644
--- a/lib/libvirtio/virtio.c
+++ b/lib/libvirtio/virtio.c
@@ -273,6 +273,17 @@  void virtio_fill_desc(struct vqs *vq, int id, uint64_t features,
 	next %= vq->size;
 
 	if (features & VIRTIO_F_VERSION_1) {
+		if (features & VIRTIO_F_IOMMU_PLATFORM) {
+			void *gpa = (void *) addr;
+
+			if (!vq->desc_gpas) {
+				fprintf(stderr, "IOMMU setup has not been done!\n");
+				return;
+			}
+
+			addr = SLOF_dma_map_in(gpa, len, 0);
+			vq->desc_gpas[id] = gpa;
+		}
 		desc->addr = cpu_to_le64(addr);
 		desc->len = cpu_to_le32(len);
 		desc->flags = cpu_to_le16(flags);
@@ -285,6 +296,32 @@  void virtio_fill_desc(struct vqs *vq, int id, uint64_t features,
 	}
 }
 
+void virtio_free_desc(struct vqs *vq, int id, uint64_t features)
+{
+	struct vring_desc *desc;
+
+	id %= vq->size;
+	desc = &vq->desc[id];
+
+	if (features & VIRTIO_F_VERSION_1) {
+		if (features & VIRTIO_F_IOMMU_PLATFORM) {
+			SLOF_dma_map_out(le64_to_cpu(desc->addr),
+					 0, le32_to_cpu(desc->len));
+			vq->desc_gpas[id] = NULL;
+		}
+	}
+}
+
+void *virtio_desc_addr(struct virtio_device *vdev, int queue, int id)
+{
+	struct vqs *vq = &vdev->vq[queue];
+
+	if (vq->desc_gpas)
+		return vq->desc_gpas[id];
+
+	return (void *) virtio_modern64_to_cpu(vdev, vq->desc[id].addr);
+}
+
 /**
  * Reset virtio device
  */
@@ -326,6 +363,19 @@  static void virtio_set_qaddr(struct virtio_device *dev, int queue, unsigned long
 		uint64_t q_used;
 		uint32_t q_size = virtio_get_qsize(dev, queue);
 
+		if (dev->features & VIRTIO_F_IOMMU_PLATFORM) {
+			unsigned long cb;
+
+			cb = q_size * sizeof(struct vring_desc);
+			cb += sizeof(struct vring_avail) + sizeof(uint16_t) * q_size;
+			cb = VQ_ALIGN(cb);
+			cb += sizeof(struct vring_used) + sizeof(uint16_t) * q_size;
+			cb = VQ_ALIGN(cb);
+			q_desc = SLOF_dma_map_in((void *)q_desc, cb, 0);
+
+			dev->vq[queue].bus_desc = q_desc;
+		}
+
 		virtio_pci_write64(dev->common.addr + offset_of(struct virtio_dev_common, q_desc), q_desc);
 		q_avail = q_desc + q_size * sizeof(struct vring_desc);
 		virtio_pci_write64(dev->common.addr + offset_of(struct virtio_dev_common, q_avail), q_avail);
@@ -373,14 +423,41 @@  struct vqs *virtio_queue_init_vq(struct virtio_device *dev, unsigned int id)
 
 	vq->avail->flags = virtio_cpu_to_modern16(dev, VRING_AVAIL_F_NO_INTERRUPT);
 	vq->avail->idx = 0;
+	if (dev->features & VIRTIO_F_IOMMU_PLATFORM)
+		vq->desc_gpas = SLOF_alloc_mem_aligned(
+			vq->size * sizeof(vq->desc_gpas[0]), 4096);
 
 	return vq;
 }
 
 void virtio_queue_term_vq(struct virtio_device *dev, struct vqs *vq, unsigned int id)
 {
-	if (vq->desc)
+	if (vq->desc_gpas) {
+		int i;
+
+		for (i = 0; i < vq->size; ++i)
+			virtio_free_desc(vq, i, dev->features);
+
+		memset(vq->desc_gpas, 0, vq->size * sizeof(vq->desc_gpas[0]));
+		SLOF_free_mem(vq->desc_gpas,
+			vq->size * sizeof(vq->desc_gpas[0]));
+	}
+	if (vq->desc) {
+		if (dev->features & VIRTIO_F_IOMMU_PLATFORM) {
+			unsigned long cb;
+			uint32_t q_size = virtio_get_qsize(dev, vq->id);
+
+			cb = q_size * sizeof(struct vring_desc);
+			cb += sizeof(struct vring_avail) + sizeof(uint16_t) * q_size;
+			cb = VQ_ALIGN(cb);
+			cb += sizeof(struct vring_used) + sizeof(uint16_t) * q_size;
+			cb = VQ_ALIGN(cb);
+
+			SLOF_dma_map_out(vq->bus_desc, 0, cb);
+		}
+
 		SLOF_free_mem(vq->desc, virtio_vring_size(vq->size));
+	}
 	memset(vq, 0, sizeof(*vq));
 }
 
@@ -474,6 +551,9 @@  int virtio_negotiate_guest_features(struct virtio_device *dev, uint64_t features
 		return -1;
 	}
 
+	if (host_features & VIRTIO_F_IOMMU_PLATFORM)
+		features |= VIRTIO_F_IOMMU_PLATFORM;
+
 	virtio_set_guest_features(dev,  features);
 	host_features = virtio_get_host_features(dev);
 	if ((host_features & features) != features) {
diff --git a/board-qemu/slof/OF.fs b/board-qemu/slof/OF.fs
index a85f6c558e67..3e117ad03e09 100644
--- a/board-qemu/slof/OF.fs
+++ b/board-qemu/slof/OF.fs
@@ -143,6 +143,9 @@  check-for-nvramrc
 
 8a0 cp
 
+\ For DMA functions used by client/package instances.
+#include "dma-instance-function.fs"
+
 \ The client interface.
 #include "client.fs"
 \ ELF binary file format.
diff --git a/slof/fs/dma-instance-function.fs b/slof/fs/dma-instance-function.fs
new file mode 100644
index 000000000000..057181be0f44
--- /dev/null
+++ b/slof/fs/dma-instance-function.fs
@@ -0,0 +1,28 @@ 
+\ ****************************************************************************/
+\ * Copyright (c) 2011 IBM Corporation
+\ * All rights reserved.
+\ * This program and the accompanying materials
+\ * are made available under the terms of the BSD License
+\ * which accompanies this distribution, and is available at
+\ * http://www.opensource.org/licenses/bsd-license.php
+\ *
+\ * Contributors:
+\ *     IBM Corporation - initial implementation
+\ ****************************************************************************/
+
+\ DMA memory allocation functions
+: dma-alloc ( size -- virt )
+   s" dma-alloc" $call-parent
+;
+
+: dma-free ( virt size -- )
+   s" dma-free" $call-parent
+;
+
+: dma-map-in ( virt size cacheable? -- devaddr )
+   s" dma-map-in" $call-parent
+;
+
+: dma-map-out ( virt devaddr size -- )
+   s" dma-map-out" $call-parent
+;