
[RFC/PATCH] Add a memory barrier to guest memory access functions

Message ID 1337215928.30558.28.camel@pasglop
State New

Commit Message

Benjamin Herrenschmidt May 17, 2012, 12:52 a.m. UTC
The emulated devices can run simultaneously with the guest, so
we need to be careful with the ordering of loads and stores they
do to guest system memory, which need to be observed in the
right order by the guest operating system.

In order to avoid unnecessary overhead on i386, we define a new
barrier dma_mb() which is a full barrier on powerpc and a nop
on i386 and x86_64 (see the comment I added in the code).

This barrier is then added to qemu_get_ram_ptr(), which is easier
than sprinkling it into all the functions that provide guest
access and are all more or less open coded.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---

Discussion: So the other option is to do it only in
cpu_physical_memory_rw() and leave the responsibility to use explicit
barriers to the callers of ld*/st* accessors. For example virtio already
does it in a few places explicitly.

 exec.c         |   11 +++++++++++
 qemu-barrier.h |   29 ++++++++++++++++++++++++++---
 2 files changed, 37 insertions(+), 3 deletions(-)
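
To illustrate the ordering problem being addressed, here is a minimal sketch of a device model completing a DMA (illustration only, not part of the patch: the helper, guest addresses and completion-flag protocol are made up; dma_mb(), cpu_physical_memory_write() and cpu_to_le32() are existing interfaces, and the usual QEMU-internal headers are assumed):

/* Illustration only: a device model writes a payload into guest RAM and
 * then sets a completion flag that the guest polls.  On a weakly ordered
 * host such as POWER, the flag could become visible before the payload
 * unless a barrier orders the two stores; dma_mb() provides exactly that
 * ordering (which this patch makes implicit by placing it in
 * qemu_get_ram_ptr()). */
static void example_complete_dma(target_phys_addr_t buf_pa,
                                 target_phys_addr_t flag_pa,
                                 const uint8_t *payload, int len)
{
    uint32_t done = cpu_to_le32(1);

    /* 1. copy the payload into guest memory */
    cpu_physical_memory_write(buf_pa, payload, len);

    /* 2. make sure the payload is visible before the flag */
    dma_mb();

    /* 3. publish the completion flag the guest is polling */
    cpu_physical_memory_write(flag_pa, (const uint8_t *)&done, sizeof(done));
}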

Comments

Anthony Liguori May 17, 2012, 2:28 a.m. UTC | #1
On 05/16/2012 07:52 PM, Benjamin Herrenschmidt wrote:
> The emulated devices can run simultaneously with the guest, so
> we need to be careful with the ordering of loads and stores they
> do to guest system memory, which need to be observed in the
> right order by the guest operating system.
>
> In order to avoid unnecessary overhead on i386, we define a new
> barrier dma_mb() which is a full barrier on powerpc and a nop
> on i386 and x86_64 (see the comment I added in the code).
>
> This barrier is then added to qemu_get_ram_ptr(), which is easier
> than sprinkling it into all the functions that provide guest
> access and are all more or less open coded.
>
> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> ---
>
> Discussion: So the other option is to do it only in
> cpu_physical_memory_rw() and leave the responsibility to use explicit
> barriers to the callers of ld*/st* accessors. For example virtio already
> does it in a few places explicitly.
>
>   exec.c         |   11 +++++++++++
>   qemu-barrier.h |   29 ++++++++++++++++++++++++++---
>   2 files changed, 37 insertions(+), 3 deletions(-)
>
> diff --git a/exec.c b/exec.c
> index 40cf52d..fc857b6 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -25,6 +25,7 @@
>   #endif
>
>   #include "qemu-common.h"
> +#include "qemu-barrier.h"
>   #include "cpu.h"
>   #include "tcg.h"
>   #include "hw/hw.h"
> @@ -2794,6 +2795,9 @@ void *qemu_get_ram_ptr(ram_addr_t addr)
>   {
>       RAMBlock *block;
>
> +    /* We ensure ordering for all DMA transactions */
> +    dma_mb();
> +

I get being conservative, but I don't think this makes a lot of sense.  There 
are cases where the return of this function is cached (like the VGA ram area). 
I think it would make more sense if you explicitly put a barrier after write 
operations.

Regards,

Anthony Liguori
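
To make the cached-pointer concern concrete, a minimal sketch (the VGA-style caching below is invented for illustration, not the actual hw/vga.c code):

/* The dma_mb() added above runs only when the pointer is looked up, so a
 * device that caches the host pointer gets no ordering on later stores. */
static uint8_t *cached_vram;                       /* hypothetical cached mapping */

static void example_map_vram(ram_addr_t vram_offset)
{
    cached_vram = qemu_get_ram_ptr(vram_offset);   /* dma_mb() happens here, once */
}

static void example_store_pixel(size_t offset, uint8_t val)
{
    cached_vram[offset] = val;   /* no barrier on this path: if ordering against
                                  * other guest-visible stores matters, the device
                                  * itself must issue one (e.g. smp_wmb()) */
}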

>       QLIST_FOREACH(block,&ram_list.blocks, next) {
>           if (addr - block->offset<  block->length) {
>               /* Move this entry to to start of the list.  */
> @@ -2830,6 +2834,9 @@ void *qemu_safe_ram_ptr(ram_addr_t addr)
>   {
>       RAMBlock *block;
>
> +    /* We ensure ordering for all DMA transactions */
> +    dma_mb();
> +
>       QLIST_FOREACH(block,&ram_list.blocks, next) {
>           if (addr - block->offset<  block->length) {
>               if (xen_enabled()) {
> @@ -2861,6 +2868,10 @@ void *qemu_ram_ptr_length(ram_addr_t addr, ram_addr_t *size)
>       if (*size == 0) {
>           return NULL;
>       }
> +
> +    /* We ensure ordering for all DMA transactions */
> +    dma_mb();
> +
>       if (xen_enabled()) {
>           return xen_map_cache(addr, *size, 1);
>       } else {
> diff --git a/qemu-barrier.h b/qemu-barrier.h
> index 7e11197..8c62683 100644
> --- a/qemu-barrier.h
> +++ b/qemu-barrier.h
> @@ -23,7 +23,21 @@
>   #define smp_mb() __sync_synchronize()
>   #else
>   #define smp_mb() asm volatile("lock; addl $0,0(%%esp) " ::: "memory")
> -#endif
> + #endif
> +
> +/*
> + * DMA barrier is used to order accesses from qemu devices to
> + * guest memory, in order to make them appear to the guest in
> + * program order.
> + *
> + * We assume that we never use non-temporal accesses for such
> + * DMA and so don't need anything other than a compiler barrier.
> + *
> + * If some devices use weakly ordered SSE load/store instructions
> + * then those devices will be responsible for using the appropriate
> + * barriers as well.
> + */
> +#define dma_mb()    barrier()
>
>   #elif defined(__x86_64__)
>
> @@ -31,6 +45,9 @@
>   #define smp_rmb()   barrier()
>   #define smp_mb() asm volatile("mfence" ::: "memory")
>
> +/* Same comment as i386 */
> +#define dma_mb()    barrier()
> +
>   #elif defined(_ARCH_PPC)
>
>   /*
> @@ -46,8 +63,13 @@
>   #define smp_rmb()   asm volatile("sync" ::: "memory")
>   #endif
>
> -#define smp_mb()   asm volatile("sync" ::: "memory")
> +#define smp_mb()    asm volatile("sync" ::: "memory")
>
> +/*
> + * We use a full barrier for DMA, which encompasses the full
> + * requirements of the PCI ordering model.
> + */
> +#define dma_mb()    smp_mb()
>   #else
>
>   /*
> @@ -57,8 +79,9 @@
>    * be overkill for wmb() and rmb().
>    */
>   #define smp_wmb()   __sync_synchronize()
> -#define smp_mb()   __sync_synchronize()
> +#define smp_mb()    __sync_synchronize()
>   #define smp_rmb()   __sync_synchronize()
> +#define dma_mb()    __sync_synchronize()
>
>   #endif
>
>
>
Benjamin Herrenschmidt May 17, 2012, 2:44 a.m. UTC | #2
On Wed, 2012-05-16 at 21:28 -0500, Anthony Liguori wrote:

> > @@ -2794,6 +2795,9 @@ void *qemu_get_ram_ptr(ram_addr_t addr)
> >   {
> >       RAMBlock *block;
> >
> > +    /* We ensure ordering for all DMA transactions */
> > +    dma_mb();
> > +
> 
> I get being conservative, but I don't think this makes a lot of sense.  There 
> are cases where the return of this function is cached (like the VGA ram area). 
> I think it would make more sense if you explicitly put a barrier after write 
> operations.

Well, it depends ... sure, something that caches the result is akin to
map/unmap and responsible for doing its own barriers between accesses.
However, as a whole, this means that an entire map/unmap section is
ordered with respect to surrounding accesses, which is actually not a bad idea.

Anyway, I'll post a different patch that adds the barrier more
selectively to:

 - cpu_physical_memory_rw  (that's the obvious main one)
 - cpu_physical_memory_write_rom (probably overkill but
   not a fast path so no big deal)
 - ld*_* and st*_* (or do you think these should require
   explicit barriers in the callers ?)

Note that with the above, cpu_physical_memory_map and unmap will
imply a barrier when using bounce buffers; it would make sense to also
provide the same barrier when not.

That does actually make sense for the same reason explained above,
ie, when those are used for a DMA transfer via async IO, that guarantees
ordering of the "block" vs. surrounding accesses even if accesses within
the actual map/unmap region are not ordered vs. each other.

Any objection ?

Cheers,
Ben.
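
For the map/unmap case described above, a rough sketch of the async-IO pattern in question (the completion helper and its arguments are hypothetical; cpu_physical_memory_map()/unmap() are the existing interfaces and the usual QEMU headers are assumed):

/* A block-layer DMA completion copies data through a mapped guest buffer.
 * Stores into the mapped region are not ordered among themselves, but a
 * barrier implied at map and unmap time would order the whole transfer
 * against surrounding guest-visible accesses, as Ben suggests. */
static void example_dma_read_complete(target_phys_addr_t guest_pa,
                                      const uint8_t *data,
                                      target_phys_addr_t len)
{
    target_phys_addr_t maplen = len;
    void *host = cpu_physical_memory_map(guest_pa, &maplen, 1 /* is_write */);

    if (!host || maplen < len) {
        /* bounce-buffer and short-mapping handling elided */
        return;
    }
    memcpy(host, data, len);                          /* unordered plain stores */
    cpu_physical_memory_unmap(host, maplen, 1, len);  /* a barrier here would
                                                         publish them */
}
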
David Gibson May 17, 2012, 3:35 a.m. UTC | #3
On Wed, May 16, 2012 at 09:28:39PM -0500, Anthony Liguori wrote:
> On 05/16/2012 07:52 PM, Benjamin Herrenschmidt wrote:
[snip]
> >@@ -2794,6 +2795,9 @@ void *qemu_get_ram_ptr(ram_addr_t addr)
> >  {
> >      RAMBlock *block;
> >
> >+    /* We ensure ordering for all DMA transactions */
> >+    dma_mb();
> >+
> 
> I get being conservative, but I don't think this makes a lot of
> sense.  There are cases where the return of this function is cached
> (like the VGA ram area). I think it would make more sense if you
> explicitly put a barrier after write operations.

I tend to agree.  I think the barriers should be in
cpu_physical_memory_rw() and the st*_phys() functions.
Anthony Liguori May 17, 2012, 10:09 p.m. UTC | #4
On 05/16/2012 09:44 PM, Benjamin Herrenschmidt wrote:
> On Wed, 2012-05-16 at 21:28 -0500, Anthony Liguori wrote:
>
>>> @@ -2794,6 +2795,9 @@ void *qemu_get_ram_ptr(ram_addr_t addr)
>>>    {
>>>        RAMBlock *block;
>>>
>>> +    /* We ensure ordering for all DMA transactions */
>>> +    dma_mb();
>>> +
>>
>> I get being conservative, but I don't think this makes a lot of sense.  There
>> are cases where the return of this function is cached (like the VGA ram area).
>> I think it would make more sense if you explicitly put a barrier after write
>> operations.
>
> Well, it depends ... sure something that caches the result is akin to
> map/unmap and responsible for doing its own barriers between accesses,
> however as a whole, this means that an entire map/unmap section is
> ordered surrounding accesses which is actually not a bad idea.
>
> Anyway, I'll post a different patch that adds the barrier more
> selectively to:
>
>   - cpu_physical_memory_rw  (that's the obvious main one)
>   - cpu_physical_memory_write_rom (probably overkill but
>     not a fast path so no big deal)
>   - ld*_* and st*_* (or do you think these should require
>     explicit barriers in the callers ?)

ld/st should not ever be used by device emulation because they use a concept of 
"target endianness" that doesn't exist for devices.

So no, I don't think you need to put barriers there as they should only be used 
by VCPU emulation code.

> Note that with the above, cpu_physical_memory_map and unmap will
> imply a barrier when using bounce buffers, it would make sense to also
> provide the same barrier when not.
>
> That does actually make sense for the same reason explained above,
> ie, when those are used for a DMA transfer via async IO, that guarantees
> ordering of the "block" vs. surrounding accesses even if accesses within
> the actual map/unmap region are not ordered vs. each other.
>
> Any objection ?

I think so.  I'd like to see a better comment about barrier usage at the top of 
the file or something like that too.

Regards,

Anthony Liguori
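
For what it's worth, the kind of file-level note being asked for might look something like the following (the wording is only a guess at what such a comment could say, not taken from any actual patch):

/*
 * Ordering of guest memory accesses from device emulation:
 *
 * Device models run concurrently with guest vCPUs, so stores they perform
 * to guest RAM must become visible to the guest in a well-defined order
 * (essentially the PCI ordering model).  cpu_physical_memory_rw() and the
 * other accessors in this file imply a dma_mb() per access; code that maps
 * guest memory and then accesses it directly (cpu_physical_memory_map(),
 * cached pointers from qemu_get_ram_ptr()) is responsible for its own
 * barriers between those accesses.
 */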

>
> Cheers,
> Ben.
>
>
>
David Gibson May 18, 2012, 1:04 a.m. UTC | #5
On Thu, May 17, 2012 at 05:09:22PM -0500, Anthony Liguori wrote:
> On 05/16/2012 09:44 PM, Benjamin Herrenschmidt wrote:
> >On Wed, 2012-05-16 at 21:28 -0500, Anthony Liguori wrote:
> >
> >>>@@ -2794,6 +2795,9 @@ void *qemu_get_ram_ptr(ram_addr_t addr)
> >>>   {
> >>>       RAMBlock *block;
> >>>
> >>>+    /* We ensure ordering for all DMA transactions */
> >>>+    dma_mb();
> >>>+
> >>
> >>I get being conservative, but I don't think this makes a lot of sense.  There
> >>are cases where the return of this function is cached (like the VGA ram area).
> >>I think it would make more sense if you explicitly put a barrier after write
> >>operations.
> >
> >Well, it depends ... sure something that caches the result is akin to
> >map/unmap and responsible for doing its own barriers between accesses,
> >however as a whole, this means that an entire map/unmap section is
> >ordered surrounding accesses which is actually not a bad idea.
> >
> >Anyway, I'll post a different patch that adds the barrier more
> >selectively to:
> >
> >  - cpu_physical_memory_rw  (that's the obvious main one)
> >  - cpu_physical_memory_write_rom (probably overkill but
> >    not a fast path so no big deal)
> >  - ld*_* and st*_* (or do you think these should require
> >    explicit barriers in the callers ?)
> 
> ld/st should not ever be used by device emulation because they use a
> concept of "target endianness" that doesn't exist for devices.

Ah, but there are the explicit endian versions, which were used
routinely until I replaced a lot of them with ld*_dma().
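
As a concrete example of the endianness point: a device's register and descriptor layout has a byte order fixed by the device specification, so a device model wants an explicitly sized, explicitly endian access rather than the target-endian ld*/st* helpers. A minimal sketch (the descriptor layout and helper are invented; cpu_physical_memory_read() and le32_to_cpu() are existing interfaces, and explicit-endian accessors such as ldl_le_phys() serve the same purpose):

/* A (hypothetical) device whose descriptor format is little-endian by
 * specification: the device model must use the device's byte order, not
 * the "target endianness" that ldl_phys()/stl_phys() apply. */
static uint32_t example_read_desc_len(target_phys_addr_t desc_pa)
{
    uint32_t le_len;

    /* offset 8 of the descriptor is assumed to hold a LE32 length field */
    cpu_physical_memory_read(desc_pa + 8, (uint8_t *)&le_len, sizeof(le_len));
    return le32_to_cpu(le_len);
}
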
Benjamin Herrenschmidt May 18, 2012, 1:16 a.m. UTC | #6
On Thu, 2012-05-17 at 17:09 -0500, Anthony Liguori wrote:

> ld/st should not ever be used by device emulation because they use a concept of 
> "target endianness" that doesn't exist for devices.

Hrm, there's a bit of both: some of them even have explicit endianness
arguments, and some of them are definitely used by bits of hw/*.

The virtio core uses them directly as well, but then virtio also does
explicit barriers (though I need to double-check whether it does enough
of them in all the right places).
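
For reference, the explicit-barrier pattern virtio relies on looks roughly like the sketch below (a simplified stand-in, not the actual hw/virtio.c code; endianness handling is omitted):

#define EXAMPLE_RING_SIZE 256

struct example_used_elem {
    uint32_t id;
    uint32_t len;
};

/* The device fills in a used-ring element and only then bumps the used
 * index the guest reads; smp_wmb() keeps the two publications ordered. */
static void example_virtqueue_push(struct example_used_elem *ring,
                                   uint16_t *used_idx,
                                   uint32_t id, uint32_t len)
{
    uint16_t idx = *used_idx;

    ring[idx % EXAMPLE_RING_SIZE].id  = id;
    ring[idx % EXAMPLE_RING_SIZE].len = len;
    smp_wmb();                 /* element must be visible before the index */
    *used_idx = idx + 1;
}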

> So no, I don't think you need to put barriers there as they should only be used 
> by VCPU emulation code.

I'm ok to leave them alone for now, I'll give a quick look at the
various callers.

> > Note that with the above, cpu_physical_memory_map and unmap will
> > imply a barrier when using bounce buffers, it would make sense to also
> > provide the same barrier when not.
> >
> > That does actually make sense for the same reason explained above,
> > ie, when those are used for a DMA transfer via async IO, that guarantees
> > ordering of the "block" vs. surrounding accesses even if accesses within
> > the actual map/unmap region are not ordered vs. each other.
> >
> > Any objection ?
> 
> I think so.  I'd like to see a better comment about barrier usage at the top of 
> the file or something like that too.

Ok.

Cheers,
Ben.

> Regards,
> 
> Anthony Liguori
> 
> >
> > Cheers,
> > Ben.
> >
> >
> >

Patch

diff --git a/exec.c b/exec.c
index 40cf52d..fc857b6 100644
--- a/exec.c
+++ b/exec.c
@@ -25,6 +25,7 @@ 
 #endif
 
 #include "qemu-common.h"
+#include "qemu-barrier.h"
 #include "cpu.h"
 #include "tcg.h"
 #include "hw/hw.h"
@@ -2794,6 +2795,9 @@  void *qemu_get_ram_ptr(ram_addr_t addr)
 {
     RAMBlock *block;
 
+    /* We ensure ordering for all DMA transactions */
+    dma_mb();
+
     QLIST_FOREACH(block, &ram_list.blocks, next) {
         if (addr - block->offset < block->length) {
             /* Move this entry to to start of the list.  */
@@ -2830,6 +2834,9 @@  void *qemu_safe_ram_ptr(ram_addr_t addr)
 {
     RAMBlock *block;
 
+    /* We ensure ordering for all DMA transactions */
+    dma_mb();
+
     QLIST_FOREACH(block, &ram_list.blocks, next) {
         if (addr - block->offset < block->length) {
             if (xen_enabled()) {
@@ -2861,6 +2868,10 @@  void *qemu_ram_ptr_length(ram_addr_t addr, ram_addr_t *size)
     if (*size == 0) {
         return NULL;
     }
+
+    /* We ensure ordering for all DMA transactions */
+    dma_mb();
+
     if (xen_enabled()) {
         return xen_map_cache(addr, *size, 1);
     } else {
diff --git a/qemu-barrier.h b/qemu-barrier.h
index 7e11197..8c62683 100644
--- a/qemu-barrier.h
+++ b/qemu-barrier.h
@@ -23,7 +23,21 @@ 
 #define smp_mb() __sync_synchronize()
 #else
 #define smp_mb() asm volatile("lock; addl $0,0(%%esp) " ::: "memory")
-#endif
+ #endif
+
+/*
+ * DMA barrier is used to order accesses from qemu devices to
+ * guest memory, in order to make them appear to the guest in
+ * program order.
+ *
+ * We assume that we never use non-temporal accesses for such
+ * DMA and so don't need anything other than a compiler barrier.
+ *
+ * If some devices use weakly ordered SSE load/store instructions
+ * then those devices will be responsible for using the appropriate
+ * barriers as well.
+ */
+#define dma_mb()    barrier()
 
 #elif defined(__x86_64__)
 
@@ -31,6 +45,9 @@ 
 #define smp_rmb()   barrier()
 #define smp_mb() asm volatile("mfence" ::: "memory")
 
+/* Same comment as i386 */
+#define dma_mb()    barrier()
+
 #elif defined(_ARCH_PPC)
 
 /*
@@ -46,8 +63,13 @@ 
 #define smp_rmb()   asm volatile("sync" ::: "memory")
 #endif
 
-#define smp_mb()   asm volatile("sync" ::: "memory")
+#define smp_mb()    asm volatile("sync" ::: "memory")
 
+/*
+ * We use a full barrier for DMA, which encompasses the full
+ * requirements of the PCI ordering model.
+ */
+#define dma_mb()    smp_mb()
 #else
 
 /*
@@ -57,8 +79,9 @@ 
  * be overkill for wmb() and rmb().
  */
 #define smp_wmb()   __sync_synchronize()
-#define smp_mb()   __sync_synchronize()
+#define smp_mb()    __sync_synchronize()
 #define smp_rmb()   __sync_synchronize()
+#define dma_mb()    __sync_synchronize()
 
 #endif