Patchwork [21/26] Implement TCE translation for sPAPR VIO

login
register
mail settings
Submitter David Gibson
Date March 16, 2011, 4:56 a.m.
Message ID <1300251423-6715-22-git-send-email-david@gibson.dropbear.id.au>
Download mbox | patch
Permalink /patch/87163/
State New
Headers show

Comments

David Gibson - March 16, 2011, 4:56 a.m.
From: Ben Herrenschmidt <benh@kernel.crashing.org>

This patch implements the necessary infrastructure and hypercalls for
sPAPR's TCE (Translation Control Entry) IOMMU mechanism.  This is necessary
for all virtual IO devices which do DMA (i.e. nearly all of them).

Signed-off-by: Ben Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: David Gibson <dwg@au1.ibm.com>
---
 hw/spapr.c     |    3 +-
 hw/spapr_vio.c |  232 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 hw/spapr_vio.h |   32 ++++++++
 3 files changed, 266 insertions(+), 1 deletions(-)
Alexander Graf - March 16, 2011, 4:03 p.m.
On 03/16/2011 05:56 AM, David Gibson wrote:
> From: Ben Herrenschmidt<benh@kernel.crashing.org>
>
> This patch implements the necessary infrastructure and hypercalls for
> sPAPR's TCE (Translation Control Entry) IOMMU mechanism.  This is necessary
> for all virtual IO devices which do DMA (i.e. nearly all of them).
>
> Signed-off-by: Ben Herrenschmidt<benh@kernel.crashing.org>
> Signed-off-by: David Gibson<dwg@au1.ibm.com>
> ---
>   hw/spapr.c     |    3 +-
>   hw/spapr_vio.c |  232 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>   hw/spapr_vio.h |   32 ++++++++
>   3 files changed, 266 insertions(+), 1 deletions(-)
>
> diff --git a/hw/spapr.c b/hw/spapr.c
> index e7f8864..a362889 100644
> --- a/hw/spapr.c
> +++ b/hw/spapr.c
> @@ -62,7 +62,8 @@ static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize,
>       uint32_t start_prop = cpu_to_be32(initrd_base);
>       uint32_t end_prop = cpu_to_be32(initrd_base + initrd_size);
>       uint32_t pft_size_prop[] = {0, cpu_to_be32(hash_shift)};
> -    char hypertas_prop[] = "hcall-pft\0hcall-term\0hcall-dabr\0hcall-interrupt";
> +    char hypertas_prop[] = "hcall-pft\0hcall-term\0hcall-dabr\0hcall-interrupt"
> +        "\0hcall-tce";
>       uint32_t interrupt_server_ranges_prop[] = {0, cpu_to_be32(smp_cpus)};
>       int i;
>       char *modelname;
> diff --git a/hw/spapr_vio.c b/hw/spapr_vio.c
> index 45edd94..37cf51e 100644
> --- a/hw/spapr_vio.c
> +++ b/hw/spapr_vio.c
> @@ -37,6 +37,7 @@
>   #endif /* CONFIG_FDT */
>
>   /* #define DEBUG_SPAPR */
> +/* #define DEBUG_TCE */
>
>   #ifdef DEBUG_SPAPR
>   #define dprintf(fmt, ...) \
> @@ -114,6 +115,28 @@ static int vio_make_devnode(VIOsPAPRDevice *dev,
>               return ret;
>       }
>
> +    if (dev->rtce_window_size) {
> +        uint32_t dma_prop[] = {cpu_to_be32(dev->reg),
> +                               0, 0,
> +                               0, cpu_to_be32(dev->rtce_window_size)};
> +
> +        ret = fdt_setprop_cell(fdt, node_off, "ibm,#dma-address-cells", 2);
> +        if (ret<  0) {
> +            return ret;
> +        }
> +
> +        ret = fdt_setprop_cell(fdt, node_off, "ibm,#dma-size-cells", 2);
> +        if (ret<  0) {
> +            return ret;
> +        }
> +
> +        ret = fdt_setprop(fdt, node_off, "ibm,my-dma-window", dma_prop,
> +                          sizeof(dma_prop));
> +        if (ret<  0) {
> +            return ret;
> +        }
> +    }
> +
>       if (info->devnode) {
>           ret = (info->devnode)(dev, fdt, node_off);
>           if (ret<  0) {
> @@ -125,6 +148,210 @@ static int vio_make_devnode(VIOsPAPRDevice *dev,
>   }
>   #endif /* CONFIG_FDT */
>
> +/*
> + * RTCE handling
> + */
> +
> +static void rtce_init(VIOsPAPRDevice *dev)
> +{
> +    size_t size = (dev->rtce_window_size>>  SPAPR_VIO_TCE_PAGE_SHIFT)
> +        * sizeof(VIOsPAPR_RTCE);
> +
> +    if (size) {
> +        dev->rtce_table = qemu_mallocz(size);
> +    }
> +}
> +
> +static target_ulong h_put_tce(CPUState *env, sPAPREnvironment *spapr,
> +                              target_ulong opcode, target_ulong *args)
> +{
> +    target_ulong liobn = args[0];
> +    target_ulong ioba = args[1];
> +    target_ulong tce = args[2];
> +    VIOsPAPRDevice *dev = spapr_vio_find_by_reg(spapr->vio_bus, liobn);
> +    VIOsPAPR_RTCE *rtce;
> +
> +    if (!dev) {
> +        fprintf(stderr, "spapr_vio_put_tce on non-existent LIOBN "
> +                TARGET_FMT_lx "\n",
> +                liobn);
> +        return H_PARAMETER;
> +    }
> +
> +    ioba&= ~(SPAPR_VIO_TCE_PAGE_SIZE - 1);
> +
> +#ifdef DEBUG_TCE
> +    fprintf(stderr, "spapr_vio_put_tce on %s  ioba 0x" TARGET_FMT_lx
> +            "  TCE 0x" TARGET_FMT_lx "\n", dev->qdev.id, ioba, tce);
> +#endif
> +
> +    if (ioba>= dev->rtce_window_size) {
> +        fprintf(stderr, "spapr_vio_put_tce on out-of-boards IOBA 0x" TARGET_FMT_lx "\n",
> +                ioba);
> +        return H_PARAMETER;
> +    }
> +
> +    rtce = dev->rtce_table + (ioba>>  SPAPR_VIO_TCE_PAGE_SHIFT);
> +    rtce->tce = tce;
> +
> +    return H_SUCCESS;
> +}
> +
> +int spapr_vio_check_tces(VIOsPAPRDevice *dev, target_ulong ioba,
> +                         target_ulong len, enum VIOsPAPR_TCEAccess access)
> +{
> +    int start, end, i;
> +
> +    start = ioba>>  SPAPR_VIO_TCE_PAGE_SHIFT;
> +    end = (ioba + len - 1)>>  SPAPR_VIO_TCE_PAGE_SHIFT;
> +
> +    for (i = start; i<= end; i++) {
> +        if ((dev->rtce_table[i].tce&  access) != access) {
> +            fprintf(stderr, "FAIL on %d\n", i);
> +            return -1;
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +/* XX Might want to special case KVM for speed ? */

XXX

> +int spapr_tce_dma_write(VIOsPAPRDevice *dev, uint64_t taddr, const void *buf,
> +                        uint32_t size)
> +{
> +#ifdef DEBUG_TCE
> +    fprintf(stderr, "spapr_tce_dma_write taddr=0x%llx size=0x%x\n",
> +            (unsigned long long)taddr, size);
> +#endif
> +
> +    while(size) {
> +        uint64_t tce;
> +        uint32_t lsize;
> +        uint64_t txaddr;
> +
> +        /* Check if we are in bound */
> +        if (taddr>= dev->rtce_window_size) {
> +            fprintf(stderr, "spapr_tce_dma_write out of bounds\n");
> +            return -H_DEST_PARM;
> +        }
> +        tce = dev->rtce_table[taddr>>  SPAPR_VIO_TCE_PAGE_SHIFT].tce;
> +
> +        /* How much til end of page ? */
> +        lsize = MIN(size, ((~taddr)&  SPAPR_VIO_TCE_PAGE_MASK) + 1);
> +
> +        /* Check TCE */
> +        if (!(tce&  2))

Braces

> +            return -H_DEST_PARM;
> +
> +        /* Translate */
> +        txaddr = (tce&  ~SPAPR_VIO_TCE_PAGE_MASK) | (taddr&  SPAPR_VIO_TCE_PAGE_MASK);
> +
> +#ifdef DEBUG_TCE
> +        fprintf(stderr, " ->  write to txaddr=0x%llx, size=0x%x\n",
> +                (unsigned long long)txaddr, lsize);
> +#endif
> +
> +        /* Do it */
> +        cpu_physical_memory_write(txaddr, buf, lsize);
> +        buf += lsize;
> +        taddr += lsize;
> +        size -= lsize;
> +    }
> +    return 0;
> +}
> +
> +/* XX Might want to special case KVM for speed ? */

XXX

> +int spapr_tce_dma_zero(VIOsPAPRDevice *dev, uint64_t taddr, uint32_t size)
> +{
> +    uint8_t *zeroes;
> +
> +#ifdef DEBUG_TCE
> +    fprintf(stderr, "spapr_tce_dma_zero taddr=0x%llx size=0x%x\n",
> +            (unsigned long long)taddr, size);
> +#endif
> +
> +    /* FIXME: do this better... */
> +    zeroes = alloca(size);
> +    memset(zeroes, 0, size);

You sure that zeroes is still alive during the call? If I were a 
compiler, I'd probably optimize the return away so that it'd end up 
being a simple branch to spapr_tce_dma_write - coincidentally 
invalidating the stack that zeroes is on.


Alex
Benjamin Herrenschmidt - March 16, 2011, 8:05 p.m.
On Wed, 2011-03-16 at 17:03 +0100, Alexander Graf wrote:
> 
> > +int spapr_tce_dma_zero(VIOsPAPRDevice *dev, uint64_t taddr,
> uint32_t size)
> > +{
> > +    uint8_t *zeroes;
> > +
> > +#ifdef DEBUG_TCE
> > +    fprintf(stderr, "spapr_tce_dma_zero taddr=0x%llx size=0x%x\n",
> > +            (unsigned long long)taddr, size);
> > +#endif
> > +
> > +    /* FIXME: do this better... */
> > +    zeroes = alloca(size);
> > +    memset(zeroes, 0, size);
> 
> You sure that zeroes is still alive during the call? If I were a 
> compiler, I'd probably optimize the return away so that it'd end up 
> being a simple branch to spapr_tce_dma_write - coincidentally 
> invalidating the stack that zeroes is on.

Ugh ? How would this ever be legal for a compiler to do that ?

Ben.
Anthony Liguori - March 16, 2011, 8:21 p.m.
On 03/16/2011 03:05 PM, Benjamin Herrenschmidt wrote:
> On Wed, 2011-03-16 at 17:03 +0100, Alexander Graf wrote:
>>> +int spapr_tce_dma_zero(VIOsPAPRDevice *dev, uint64_t taddr,
>> uint32_t size)
>>> +{
>>> +    uint8_t *zeroes;
>>> +
>>> +#ifdef DEBUG_TCE
>>> +    fprintf(stderr, "spapr_tce_dma_zero taddr=0x%llx size=0x%x\n",
>>> +            (unsigned long long)taddr, size);
>>> +#endif
>>> +
>>> +    /* FIXME: do this better... */
>>> +    zeroes = alloca(size);
>>> +    memset(zeroes, 0, size);
>> You sure that zeroes is still alive during the call? If I were a
>> compiler, I'd probably optimize the return away so that it'd end up
>> being a simple branch to spapr_tce_dma_write - coincidentally
>> invalidating the stack that zeroes is on.
> Ugh ? How would this ever be legal for a compiler to do that ?

Yeah, the compiler can't do that.  The return of alloca() is valid as 
long as the stack frame is valid.  Inlining doesn't change that.
Regards,

Anthony Liguori

> Ben.
>
>
>
Anthony Liguori - March 16, 2011, 8:22 p.m.
On 03/16/2011 03:05 PM, Benjamin Herrenschmidt wrote:
> On Wed, 2011-03-16 at 17:03 +0100, Alexander Graf wrote:
>>> +int spapr_tce_dma_zero(VIOsPAPRDevice *dev, uint64_t taddr,
>> uint32_t size)
>>> +{
>>> +    uint8_t *zeroes;
>>> +
>>> +#ifdef DEBUG_TCE
>>> +    fprintf(stderr, "spapr_tce_dma_zero taddr=0x%llx size=0x%x\n",
>>> +            (unsigned long long)taddr, size);
>>> +#endif
>>> +
>>> +    /* FIXME: do this better... */
>>> +    zeroes = alloca(size);
>>> +    memset(zeroes, 0, size);
>> You sure that zeroes is still alive during the call? If I were a
>> compiler, I'd probably optimize the return away so that it'd end up
>> being a simple branch to spapr_tce_dma_write - coincidentally
>> invalidating the stack that zeroes is on.
> Ugh ? How would this ever be legal for a compiler to do that ?

But BTW, if you're already being evil and using alloca, it's a whole lot 
nicer to just do:

uint8_t zeros[size];

Regards,

Anthony Liguori

> Ben.
>
>
>
Benjamin Herrenschmidt - March 16, 2011, 8:36 p.m.
On Wed, 2011-03-16 at 15:22 -0500, Anthony Liguori wrote:
> 
> But BTW, if you're already being evil and using alloca, it's a whole
> lot 
> nicer to just do:
> 
> uint8_t zeros[size];

Right. I didn't write that bit of the code so I'll let David fix it
but it does indeed look nicer. Eventually, we -could- I suppose make
some of these faster since all we really need is to poke at the guest
memory and I'm sure we can do it directly one way or another :-)

Cheers,
Ben.
Anthony Liguori - March 16, 2011, 10:20 p.m.
On 03/15/2011 11:56 PM, David Gibson wrote:
> From: Ben Herrenschmidt<benh@kernel.crashing.org>
>
> This patch implements the necessary infrastructure and hypercalls for
> sPAPR's TCE (Translation Control Entry) IOMMU mechanism.  This is necessary
> for all virtual IO devices which do DMA (i.e. nearly all of them).
>
> Signed-off-by: Ben Herrenschmidt<benh@kernel.crashing.org>
> Signed-off-by: David Gibson<dwg@au1.ibm.com>
> ---
>   hw/spapr.c     |    3 +-
>   hw/spapr_vio.c |  232 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>   hw/spapr_vio.h |   32 ++++++++
>   3 files changed, 266 insertions(+), 1 deletions(-)
>
> diff --git a/hw/spapr.c b/hw/spapr.c
> index e7f8864..a362889 100644
> --- a/hw/spapr.c
> +++ b/hw/spapr.c
> @@ -62,7 +62,8 @@ static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize,
>       uint32_t start_prop = cpu_to_be32(initrd_base);
>       uint32_t end_prop = cpu_to_be32(initrd_base + initrd_size);
>       uint32_t pft_size_prop[] = {0, cpu_to_be32(hash_shift)};
> -    char hypertas_prop[] = "hcall-pft\0hcall-term\0hcall-dabr\0hcall-interrupt";
> +    char hypertas_prop[] = "hcall-pft\0hcall-term\0hcall-dabr\0hcall-interrupt"
> +        "\0hcall-tce";
>       uint32_t interrupt_server_ranges_prop[] = {0, cpu_to_be32(smp_cpus)};
>       int i;
>       char *modelname;
> diff --git a/hw/spapr_vio.c b/hw/spapr_vio.c
> index 45edd94..37cf51e 100644
> --- a/hw/spapr_vio.c
> +++ b/hw/spapr_vio.c
> @@ -37,6 +37,7 @@
>   #endif /* CONFIG_FDT */
>
>   /* #define DEBUG_SPAPR */
> +/* #define DEBUG_TCE */
>
>   #ifdef DEBUG_SPAPR
>   #define dprintf(fmt, ...) \
> @@ -114,6 +115,28 @@ static int vio_make_devnode(VIOsPAPRDevice *dev,
>               return ret;
>       }
>
> +    if (dev->rtce_window_size) {
> +        uint32_t dma_prop[] = {cpu_to_be32(dev->reg),
> +                               0, 0,
> +                               0, cpu_to_be32(dev->rtce_window_size)};
> +
> +        ret = fdt_setprop_cell(fdt, node_off, "ibm,#dma-address-cells", 2);
> +        if (ret<  0) {
> +            return ret;
> +        }
> +
> +        ret = fdt_setprop_cell(fdt, node_off, "ibm,#dma-size-cells", 2);
> +        if (ret<  0) {
> +            return ret;
> +        }
> +
> +        ret = fdt_setprop(fdt, node_off, "ibm,my-dma-window", dma_prop,
> +                          sizeof(dma_prop));
> +        if (ret<  0) {
> +            return ret;
> +        }
> +    }
> +
>       if (info->devnode) {
>           ret = (info->devnode)(dev, fdt, node_off);
>           if (ret<  0) {
> @@ -125,6 +148,210 @@ static int vio_make_devnode(VIOsPAPRDevice *dev,
>   }
>   #endif /* CONFIG_FDT */
>
> +/*
> + * RTCE handling
> + */
> +
> +static void rtce_init(VIOsPAPRDevice *dev)
> +{
> +    size_t size = (dev->rtce_window_size>>  SPAPR_VIO_TCE_PAGE_SHIFT)
> +        * sizeof(VIOsPAPR_RTCE);
> +
> +    if (size) {
> +        dev->rtce_table = qemu_mallocz(size);
> +    }
> +}
> +
> +static target_ulong h_put_tce(CPUState *env, sPAPREnvironment *spapr,
> +                              target_ulong opcode, target_ulong *args)
> +{
> +    target_ulong liobn = args[0];
> +    target_ulong ioba = args[1];
> +    target_ulong tce = args[2];
> +    VIOsPAPRDevice *dev = spapr_vio_find_by_reg(spapr->vio_bus, liobn);
> +    VIOsPAPR_RTCE *rtce;
> +
> +    if (!dev) {
> +        fprintf(stderr, "spapr_vio_put_tce on non-existent LIOBN "
> +                TARGET_FMT_lx "\n",
> +                liobn);

You generally want to avoid guest triggered fprintfs as it can be 
exploited in scenarios where qemu's stdout is logged to disk (libvirt).  
We usually wrap this in a DPRINTF() of some sort.

Regards,

Anthony Liguori
David Gibson - March 17, 2011, 1:43 a.m.
On Wed, Mar 16, 2011 at 05:03:54PM +0100, Alexander Graf wrote:
> On 03/16/2011 05:56 AM, David Gibson wrote:
> >From: Ben Herrenschmidt<benh@kernel.crashing.org>
[snip]
> >+/* XX Might want to special case KVM for speed ? */
> 
> XXX

Comment removed.  In fact, we've now implemented the KVM acceleration
in a later series of patches.
David Gibson - March 18, 2011, 1:58 a.m.
On Wed, Mar 16, 2011 at 05:20:53PM -0500, Anthony Liguori wrote:
> On 03/15/2011 11:56 PM, David Gibson wrote:
> >From: Ben Herrenschmidt<benh@kernel.crashing.org>
[snip]
> >+static target_ulong h_put_tce(CPUState *env, sPAPREnvironment *spapr,
> >+                              target_ulong opcode, target_ulong *args)
> >+{
> >+    target_ulong liobn = args[0];
> >+    target_ulong ioba = args[1];
> >+    target_ulong tce = args[2];
> >+    VIOsPAPRDevice *dev = spapr_vio_find_by_reg(spapr->vio_bus, liobn);
> >+    VIOsPAPR_RTCE *rtce;
> >+
> >+    if (!dev) {
> >+        fprintf(stderr, "spapr_vio_put_tce on non-existent LIOBN "
> >+                TARGET_FMT_lx "\n",
> >+                liobn);
> 
> You generally want to avoid guest triggered fprintfs as it can be
> exploited in scenarios where qemu's stdout is logged to disk
> (libvirt).  We usually wrap this in a DPRINTF() of some sort.

Ah, good point.  I've gone through and audited for this sort of thing.

Patch

diff --git a/hw/spapr.c b/hw/spapr.c
index e7f8864..a362889 100644
--- a/hw/spapr.c
+++ b/hw/spapr.c
@@ -62,7 +62,8 @@  static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize,
     uint32_t start_prop = cpu_to_be32(initrd_base);
     uint32_t end_prop = cpu_to_be32(initrd_base + initrd_size);
     uint32_t pft_size_prop[] = {0, cpu_to_be32(hash_shift)};
-    char hypertas_prop[] = "hcall-pft\0hcall-term\0hcall-dabr\0hcall-interrupt";
+    char hypertas_prop[] = "hcall-pft\0hcall-term\0hcall-dabr\0hcall-interrupt"
+        "\0hcall-tce";
     uint32_t interrupt_server_ranges_prop[] = {0, cpu_to_be32(smp_cpus)};
     int i;
     char *modelname;
diff --git a/hw/spapr_vio.c b/hw/spapr_vio.c
index 45edd94..37cf51e 100644
--- a/hw/spapr_vio.c
+++ b/hw/spapr_vio.c
@@ -37,6 +37,7 @@ 
 #endif /* CONFIG_FDT */
 
 /* #define DEBUG_SPAPR */
+/* #define DEBUG_TCE */
 
 #ifdef DEBUG_SPAPR
 #define dprintf(fmt, ...) \
@@ -114,6 +115,28 @@  static int vio_make_devnode(VIOsPAPRDevice *dev,
             return ret;
     }
 
+    if (dev->rtce_window_size) {
+        uint32_t dma_prop[] = {cpu_to_be32(dev->reg),
+                               0, 0,
+                               0, cpu_to_be32(dev->rtce_window_size)};
+
+        ret = fdt_setprop_cell(fdt, node_off, "ibm,#dma-address-cells", 2);
+        if (ret < 0) {
+            return ret;
+        }
+
+        ret = fdt_setprop_cell(fdt, node_off, "ibm,#dma-size-cells", 2);
+        if (ret < 0) {
+            return ret;
+        }
+
+        ret = fdt_setprop(fdt, node_off, "ibm,my-dma-window", dma_prop,
+                          sizeof(dma_prop));
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
     if (info->devnode) {
         ret = (info->devnode)(dev, fdt, node_off);
         if (ret < 0) {
@@ -125,6 +148,210 @@  static int vio_make_devnode(VIOsPAPRDevice *dev,
 }
 #endif /* CONFIG_FDT */
 
+/*
+ * RTCE handling
+ */
+
+static void rtce_init(VIOsPAPRDevice *dev)
+{
+    /* One RTCE per TCE page of the window; no window means no table. */
+    size_t nb_pages = dev->rtce_window_size >> SPAPR_VIO_TCE_PAGE_SHIFT;
+
+    if (nb_pages != 0) {
+        dev->rtce_table = qemu_mallocz(nb_pages * sizeof(VIOsPAPR_RTCE));
+    }
+}
+
+static target_ulong h_put_tce(CPUState *env, sPAPREnvironment *spapr,
+                              target_ulong opcode, target_ulong *args)
+{
+    /* H_PUT_TCE arguments: LIOBN, I/O bus address, TCE value to store */
+    target_ulong liobn = args[0];
+    target_ulong ioba = args[1];
+    target_ulong tce = args[2];
+    VIOsPAPRDevice *dev = spapr_vio_find_by_reg(spapr->vio_bus, liobn);
+    VIOsPAPR_RTCE *rtce;
+
+    /* Guest-triggerable diagnostics are debug-only: stderr may be logged */
+    if (!dev) {
+#ifdef DEBUG_TCE
+        fprintf(stderr, "spapr_vio_put_tce on non-existent LIOBN "
+                TARGET_FMT_lx "\n", liobn);
+#endif
+        return H_PARAMETER;
+    }
+    ioba &= ~(SPAPR_VIO_TCE_PAGE_SIZE - 1);
+#ifdef DEBUG_TCE
+    fprintf(stderr, "spapr_vio_put_tce on %s  ioba 0x" TARGET_FMT_lx
+            "  TCE 0x" TARGET_FMT_lx "\n", dev->qdev.id, ioba, tce);
+#endif
+    if (ioba >= dev->rtce_window_size) {
+#ifdef DEBUG_TCE
+        fprintf(stderr, "spapr_vio_put_tce on out-of-bounds IOBA 0x"
+                TARGET_FMT_lx "\n", ioba);
+#endif
+        return H_PARAMETER;
+    }
+    rtce = dev->rtce_table + (ioba >> SPAPR_VIO_TCE_PAGE_SHIFT);
+    rtce->tce = tce;
+    return H_SUCCESS;
+}
+
+int spapr_vio_check_tces(VIOsPAPRDevice *dev, target_ulong ioba,
+                         target_ulong len, enum VIOsPAPR_TCEAccess access)
+{
+    uint64_t i, last;
+
+    /* Out-of-window ranges would index past rtce_table: reject them */
+    if (ioba >= dev->rtce_window_size || len > dev->rtce_window_size - ioba) {
+        return -1;
+    }
+    last = (ioba + (len ? len - 1 : 0)) >> SPAPR_VIO_TCE_PAGE_SHIFT;
+    for (i = ioba >> SPAPR_VIO_TCE_PAGE_SHIFT; i <= last; i++) {
+        if ((dev->rtce_table[i].tce & access) != access) {
+            return -1;
+        }
+    }
+    return 0;
+}
+
+/* DMA 'size' bytes from buf to TCE address taddr; 0 or -H_DEST_PARM. */
+int spapr_tce_dma_write(VIOsPAPRDevice *dev, uint64_t taddr, const void *buf,
+                        uint32_t size)
+{
+#ifdef DEBUG_TCE
+    fprintf(stderr, "spapr_tce_dma_write taddr=0x%llx size=0x%x\n",
+            (unsigned long long)taddr, size);
+#endif
+    while (size) {
+        uint64_t tce;
+        uint32_t lsize;
+        uint64_t txaddr;
+
+        /* Outside the window there is no TCE to consult */
+        if (taddr >= dev->rtce_window_size) {
+#ifdef DEBUG_TCE
+            fprintf(stderr, "spapr_tce_dma_write out of bounds\n");
+#endif
+            return -H_DEST_PARM;
+        }
+        tce = dev->rtce_table[taddr >> SPAPR_VIO_TCE_PAGE_SHIFT].tce;
+
+        /* Never cross a TCE page: the next page may map elsewhere */
+        lsize = MIN(size, ((~taddr) & SPAPR_VIO_TCE_PAGE_MASK) + 1);
+
+        /* The page must be mapped writable */
+        if (!(tce & SPAPR_TCE_WO)) {
+            return -H_DEST_PARM;
+        }
+
+        /* Translate: page frame from the TCE, offset from taddr */
+        txaddr = (tce & ~SPAPR_VIO_TCE_PAGE_MASK)
+            | (taddr & SPAPR_VIO_TCE_PAGE_MASK);
+#ifdef DEBUG_TCE
+        fprintf(stderr, " -> write to txaddr=0x%llx, size=0x%x\n",
+                (unsigned long long)txaddr, lsize);
+#endif
+        cpu_physical_memory_write(txaddr, buf, lsize);
+        buf += lsize;
+        taddr += lsize;
+        size -= lsize;
+    }
+    return 0;
+}
+
+/* Zero 'size' bytes of guest memory at TCE address taddr. */
+int spapr_tce_dma_zero(VIOsPAPRDevice *dev, uint64_t taddr, uint32_t size)
+{
+    /* Bounded chunks: a caller-sized alloca() could overflow the stack */
+    static const uint8_t zeroes[SPAPR_VIO_TCE_PAGE_SIZE];
+    int rc = 0;
+
+    while (size && rc == 0) {
+        uint32_t chunk = MIN(size, sizeof(zeroes));
+        rc = spapr_tce_dma_write(dev, taddr, zeroes, chunk);
+        taddr += chunk;
+        size -= chunk;
+    }
+    return rc;
+}
+
+void stb_tce(VIOsPAPRDevice *dev, uint64_t taddr, uint8_t val)
+{
+    (void)spapr_tce_dma_write(dev, taddr, &val, sizeof(val)); /* unchecked */
+}
+
+void sth_tce(VIOsPAPRDevice *dev, uint64_t taddr, uint16_t val)
+{
+    uint16_t raw = tswap16(val); /* presumably target byte order; confirm */
+    spapr_tce_dma_write(dev, taddr, &raw, sizeof(raw)); /* result dropped */
+}
+
+
+void stw_tce(VIOsPAPRDevice *dev, uint64_t taddr, uint32_t val)
+{
+    uint32_t raw = tswap32(val); /* presumably target byte order; confirm */
+    spapr_tce_dma_write(dev, taddr, &raw, sizeof(raw)); /* result dropped */
+}
+
+void stq_tce(VIOsPAPRDevice *dev, uint64_t taddr, uint64_t val)
+{
+    uint64_t raw = tswap64(val); /* presumably target byte order; confirm */
+    spapr_tce_dma_write(dev, taddr, &raw, sizeof(raw)); /* result dropped */
+}
+
+int spapr_tce_dma_read(VIOsPAPRDevice *dev, uint64_t taddr, void *buf,
+                       uint32_t size)
+{
+    /* DMA 'size' bytes from TCE address taddr into buf; <0 on failure */
+#ifdef DEBUG_TCE
+    fprintf(stderr, "spapr_tce_dma_read taddr=0x%llx size=0x%x\n",
+            (unsigned long long)taddr, size);
+#endif
+    while (size) {
+        uint64_t tce;
+        uint32_t lsize;
+        uint64_t txaddr;
+
+        /* Outside the window there is no TCE to consult */
+        if (taddr >= dev->rtce_window_size) {
+#ifdef DEBUG_TCE
+            fprintf(stderr, "spapr_tce_dma_read out of bounds\n");
+#endif
+            return -H_DEST_PARM;
+        }
+        tce = dev->rtce_table[taddr >> SPAPR_VIO_TCE_PAGE_SHIFT].tce;
+        /* Never cross a TCE page: the next page may map elsewhere */
+        lsize = MIN(size, ((~taddr) & SPAPR_VIO_TCE_PAGE_MASK) + 1);
+
+        /* Page must be readable; errors are negative as in the write path */
+        if (!(tce & SPAPR_TCE_RO)) {
+            return -H_DEST_PARM;
+        }
+
+        /* Translate: page frame from the TCE, offset from taddr */
+        txaddr = (tce & ~SPAPR_VIO_TCE_PAGE_MASK)
+            | (taddr & SPAPR_VIO_TCE_PAGE_MASK);
+#ifdef DEBUG_TCE
+        fprintf(stderr, " -> read from txaddr=0x%llx, size=0x%x\n",
+                (unsigned long long)txaddr, lsize);
+#endif
+        cpu_physical_memory_read(txaddr, buf, lsize);
+        buf += lsize;
+        taddr += lsize;
+        size -= lsize;
+    }
+    return H_SUCCESS;
+}
+
+uint64_t ldq_tce(VIOsPAPRDevice *dev, uint64_t taddr)
+{
+    uint64_t val = 0;
+    /* A failed DMA read leaves val untouched: yield 0, not stack garbage */
+    spapr_tce_dma_read(dev, taddr, &val, sizeof(val));
+    return tswap64(val);
+}
+
 static int spapr_vio_busdev_init(DeviceState *dev, DeviceInfo *info)
 {
     VIOsPAPRDeviceInfo *_info = (VIOsPAPRDeviceInfo *)info;
@@ -137,6 +364,8 @@  static int spapr_vio_busdev_init(DeviceState *dev, DeviceInfo *info)
 
     _dev->qdev.id = id;
 
+    rtce_init(_dev);
+
     return _info->init(_dev);
 }
 
@@ -190,6 +419,9 @@  VIOsPAPRBus *spapr_vio_bus_init(void)
     /* hcall-vio */
     spapr_register_hypercall(H_VIO_SIGNAL, h_vio_signal);
 
+    /* hcall-tce */
+    spapr_register_hypercall(H_PUT_TCE, h_put_tce);
+
     for (_info = device_info_list; _info; _info = _info->next) {
         VIOsPAPRDeviceInfo *info = (VIOsPAPRDeviceInfo *)_info;
 
diff --git a/hw/spapr_vio.h b/hw/spapr_vio.h
index 2013927..1b15d3e 100644
--- a/hw/spapr_vio.h
+++ b/hw/spapr_vio.h
@@ -21,12 +21,29 @@ 
  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  */
 
+#define SPAPR_VIO_TCE_PAGE_SHIFT	12 /* log2 of the TCE page size */
+#define SPAPR_VIO_TCE_PAGE_SIZE		(1ULL << SPAPR_VIO_TCE_PAGE_SHIFT) /* 4096 bytes */
+#define SPAPR_VIO_TCE_PAGE_MASK		(SPAPR_VIO_TCE_PAGE_SIZE - 1) /* offset-within-page bits */
+
+enum VIOsPAPR_TCEAccess {
+    SPAPR_TCE_FAULT = 0, /* neither access bit set */
+    SPAPR_TCE_RO = 1, /* bit 0: the bit spapr_tce_dma_read() tests */
+    SPAPR_TCE_WO = 2, /* bit 1: the bit spapr_tce_dma_write() tests */
+    SPAPR_TCE_RW = 3, /* both bits (RO | WO) */
+};
+
+typedef struct VIOsPAPR_RTCE {
+    uint64_t tce; /* low bits: access flags; bits above the page mask: frame */
+} VIOsPAPR_RTCE;
+
 typedef struct VIOsPAPRDevice {
     DeviceState qdev;
     uint32_t reg;
     qemu_irq qirq;
     uint32_t vio_irq_num;
     target_ulong signal_state;
+    uint32_t rtce_window_size; /* DMA window size in bytes; 0 means no table */
+    VIOsPAPR_RTCE *rtce_table; /* one entry per TCE page of the window */
 } VIOsPAPRDevice;
 
 typedef struct VIOsPAPRBus {
@@ -49,6 +66,21 @@  extern int spapr_populate_vdevice(VIOsPAPRBus *bus, void *fdt);
 
 extern int spapr_vio_signal(VIOsPAPRDevice *dev, target_ulong mode);
 
+int spapr_vio_check_tces(VIOsPAPRDevice *dev, target_ulong ioba,
+                         target_ulong len,
+                         enum VIOsPAPR_TCEAccess access);
+
+int spapr_tce_dma_read(VIOsPAPRDevice *dev, uint64_t taddr,
+                       void *buf, uint32_t size);
+int spapr_tce_dma_write(VIOsPAPRDevice *dev, uint64_t taddr,
+                        const void *buf, uint32_t size);
+int spapr_tce_dma_zero(VIOsPAPRDevice *dev, uint64_t taddr, uint32_t size);
+void stb_tce(VIOsPAPRDevice *dev, uint64_t taddr, uint8_t val);
+void sth_tce(VIOsPAPRDevice *dev, uint64_t taddr, uint16_t val);
+void stw_tce(VIOsPAPRDevice *dev, uint64_t taddr, uint32_t val);
+void stq_tce(VIOsPAPRDevice *dev, uint64_t taddr, uint64_t val);
+uint64_t ldq_tce(VIOsPAPRDevice *dev, uint64_t taddr);
+
 void vty_putchars(VIOsPAPRDevice *sdev, uint8_t *buf, int len);
 void spapr_vty_create(VIOsPAPRBus *bus,
                       uint32_t reg, CharDriverState *chardev,