
[v2,02/12] util: add atomic64

Message ID 20180910232752.31565-3-cota@braap.org
State New
Series i386 + x86_64 mttcg

Commit Message

Emilio Cota Sept. 10, 2018, 11:27 p.m. UTC
This introduces read/set accessors for int64_t and uint64_t.
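
For example, a caller that today does a plain 64-bit load/store from
several threads would switch to the new accessors (illustrative sketch
only; the struct and helpers below are made up, the atomic_* calls are
the ones added by this patch):

    #include "qemu/atomic.h"

    typedef struct Counter {
        uint64_t ticks; /* written and read from multiple threads */
    } Counter;

    static uint64_t counter_get(const Counter *c)
    {
        /* atomic even on hosts without native 64-bit atomics */
        return atomic_read_u64(&c->ticks);
    }

    static void counter_set(Counter *c, uint64_t val)
    {
        atomic_set_u64(&c->ticks, val);
    }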

Signed-off-by: Emilio G. Cota <cota@braap.org>
---
 include/qemu/atomic.h | 34 ++++++++++++++++++
 util/atomic64.c       | 83 +++++++++++++++++++++++++++++++++++++++++++
 util/cacheinfo.c      |  3 ++
 util/Makefile.objs    |  1 +
 4 files changed, 121 insertions(+)
 create mode 100644 util/atomic64.c

Comments

Richard Henderson Sept. 11, 2018, 12:43 p.m. UTC | #1
On 09/10/2018 04:27 PM, Emilio G. Cota wrote:
> +#define GEN_READ(name, type)                    \
> +    type name(const type *ptr)                  \
> +    {                                           \
> +        QemuSpin *lock = addr_to_lock(ptr);     \
> +        type ret;                               \
> +                                                \
> +        qemu_spin_lock(lock);                   \
> +        ret = *ptr;                             \
> +        qemu_spin_unlock(lock);                 \
> +        return ret;                             \
> +    }
> +
> +GEN_READ(atomic_read_i64, int64_t)
> +GEN_READ(atomic_read_u64, uint64_t)

Is there really a good reason to have two external
functions instead of having one of them inline and
perform a cast?

Is this any better than using libatomic?


r~
Emilio Cota Sept. 11, 2018, 8:43 p.m. UTC | #2
On Tue, Sep 11, 2018 at 05:43:38 -0700, Richard Henderson wrote:
> On 09/10/2018 04:27 PM, Emilio G. Cota wrote:
> > +#define GEN_READ(name, type)                    \
> > +    type name(const type *ptr)                  \
> > +    {                                           \
> > +        QemuSpin *lock = addr_to_lock(ptr);     \
> > +        type ret;                               \
> > +                                                \
> > +        qemu_spin_lock(lock);                   \
> > +        ret = *ptr;                             \
> > +        qemu_spin_unlock(lock);                 \
> > +        return ret;                             \
> > +    }
> > +
> > +GEN_READ(atomic_read_i64, int64_t)
> > +GEN_READ(atomic_read_u64, uint64_t)
> 
> Is there really a good reason to have two external
> functions instead of having one of them inline and
> perform a cast?

Not really. I can do a follow-up patch if you want me to.
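
Something like the following, I suppose (untested sketch: only the u64
versions stay as external functions in atomic64.c, and the i64 ones
become inline casts in the header; signed/unsigned variants of the same
type may alias each other, so the casts are fine):

    uint64_t atomic_read_u64(const uint64_t *ptr);
    void atomic_set_u64(uint64_t *ptr, uint64_t val);

    static inline int64_t atomic_read_i64(const int64_t *ptr)
    {
        return atomic_read_u64((const uint64_t *)ptr);
    }

    static inline void atomic_set_i64(int64_t *ptr, int64_t val)
    {
        atomic_set_u64((uint64_t *)ptr, val);
    }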

> Is this any better than using libatomic?

I didn't think of using libatomic. I just checked the source
code and it's quite similar:
- It uses 64 locks instead of 16 ($page_size/$cache_line,
  but these are hard-coded for POSIX as 4096/64, respectively)
- We compute the cacheline size and corresponding padding
  at run-time, which is a little better than the above.
- The locks are implemented as pthread_mutex instead of
  spinlocks. I think spinlocks make more sense here because
  we do not expect huge contention (systems without
  CONFIG_ATOMIC64 have few cores); for libatomic it makes
  sense to use mutexes because it might be used in many-core
  machines.

So yes, we could have used libatomic. If you feel strongly
about it, I can give it a shot.
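
For reference, the libatomic route would basically mean relying on the
compiler's __atomic built-ins and linking with -latomic; on hosts with
no native 64-bit atomics the compiler emits library calls that
libatomic serializes with its own internal locks. Roughly (just a
sketch, not something this series does):

    static inline uint64_t atomic_read_u64(const uint64_t *ptr)
    {
        /* may become an out-of-line libatomic call on such hosts */
        return __atomic_load_n(ptr, __ATOMIC_RELAXED);
    }

    static inline void atomic_set_u64(uint64_t *ptr, uint64_t val)
    {
        __atomic_store_n(ptr, val, __ATOMIC_RELAXED);
    }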

Thanks,

		Emilio
Murilo Opsfelder Araújo Sept. 18, 2018, 1:23 p.m. UTC | #3
On Tue, Sep 11, 2018 at 04:43:04PM -0400, Emilio G. Cota wrote:
> On Tue, Sep 11, 2018 at 05:43:38 -0700, Richard Henderson wrote:
> > On 09/10/2018 04:27 PM, Emilio G. Cota wrote:
> > > +#define GEN_READ(name, type)                    \
> > > +    type name(const type *ptr)                  \
> > > +    {                                           \
> > > +        QemuSpin *lock = addr_to_lock(ptr);     \
> > > +        type ret;                               \
> > > +                                                \
> > > +        qemu_spin_lock(lock);                   \
> > > +        ret = *ptr;                             \
> > > +        qemu_spin_unlock(lock);                 \
> > > +        return ret;                             \
> > > +    }
> > > +
> > > +GEN_READ(atomic_read_i64, int64_t)
> > > +GEN_READ(atomic_read_u64, uint64_t)
> >
> > Is there really a good reason to have two external
> > functions instead of having one of them inline and
> > perform a cast?
>
> Not really. I can do a follow-up patch if you want me to.
>
> > Is this any better than using libatomic?
>
> I didn't think of using libatomic. I just checked the source
> code and it's quite similar:
> - It uses 64 locks instead of 16 ($page_size/$cache_line,
>   but these are hard-coded for POSIX as 4096/64, respectively)
> - We compute the cacheline size and corresponding padding
>   at run-time, which is a little better than the above.
> - The locks are implemented as pthread_mutex instead of
>   spinlocks. I think spinlocks make more sense here because
>   we do not expect huge contention (systems without
>   CONFIG_ATOMIC64 have few cores); for libatomic it makes
>   sense to use mutexes because it might be used in many-core
>   machines.
>
> So yes, we could have used libatomic. If you feel strongly
> about it, I can give it a shot.

Would allowing the user to choose between libatomic and Emilio's
implementation at configure time be a bad idea?

One could pass, for example, --with-libatomic to the configure script to
use libatomic instead of Emilio's implementation, which could be the
default implementation - or vice versa.

Murilo
Peter Maydell Sept. 18, 2018, 3:55 p.m. UTC | #4
On 11 September 2018 at 21:43, Emilio G. Cota <cota@braap.org> wrote:
> On Tue, Sep 11, 2018 at 05:43:38 -0700, Richard Henderson wrote:
>> Is this any better than using libatomic?
>
> I didn't think of using libatomic. I just checked the source
> code and it's quite similar:
> - It uses 64 locks instead of 16 ($page_size/$cache_line,
>   but these are hard-coded for POSIX as 4096/64, respectively)
> - We compute the cacheline size and corresponding padding
>   at run-time, which is a little better than the above.
> - The locks are implemented as pthread_mutex instead of
>   spinlocks. I think spinlocks make more sense here because
>   we do not expect huge contention (systems without
>   CONFIG_ATOMIC64 have few cores); for libatomic it makes
>   sense to use mutexes because it might be used in many-core
>   machines.
>
> So yes, we could have used libatomic. If you feel strongly
> about it, I can give it a shot.

I guess the question I have is:
 * will we need (now or in future) to emit atomic accesses
   into generated TCG code that need to interoperate with
   the atomic accesses we do from C code ?
 * if so, does libatomic provide a mechanism for doing that?

(maybe there's nothing you can do except to call out from
the generated code to C code anyway)

thanks
-- PMM
Emilio Cota Sept. 18, 2018, 6:42 p.m. UTC | #5
On Tue, Sep 18, 2018 at 16:55:56 +0100, Peter Maydell wrote:
> On 11 September 2018 at 21:43, Emilio G. Cota <cota@braap.org> wrote:
> > On Tue, Sep 11, 2018 at 05:43:38 -0700, Richard Henderson wrote:
> >> Is this any better than using libatomic?
> >
> > I didn't think of using libatomic. I just checked the source
> > code and it's quite similar:
> > - It uses 64 locks instead of 16 ($page_size/$cache_line,
> >   but these are hard-coded for POSIX as 4096/64, respectively)
> > - We compute the cacheline size and corresponding padding
> >   at run-time, which is a little better than the above.
> > - The locks are implemented as pthread_mutex instead of
> >   spinlocks. I think spinlocks make more sense here because
> >   we do not expect huge contention (systems without
> >   CONFIG_ATOMIC64 have few cores); for libatomic it makes
> >   sense to use mutexes because it might be used in many-core
> >   machines.
> >
> > So yes, we could have used libatomic. If you feel strongly
> > about it, I can give it a shot.
> 
> I guess the question I have is:
>  * will we need (now or in future) to emit atomic accesses
>    into generated TCG code that need to interoperate with
>    the atomic accesses we do from C code ?
>  * if so, does libatomic provide a mechanism for doing that?
>
> (maybe there's nothing you can do except to call out from
> the generated code to C code anyway)

We already have these. For instance:

- sTLB lookups can happen concurrently with invalidations
  to the same sTLB from another core (via tlb_reset_dirty)

- icount_decr is written to by cpu_exit, and is read
  at the beginning of each TB (with a tcg_gen_ld_i32 generated
  by gen_tb_start).

The latter is always a 32-bit access, so all hosts can generate
it atomically. The former can be a 64-bit access if the guest
is 64-bit, which cannot be done if !CONFIG_ATOMIC64. But we
already disable mttcg for those combinations (we define
 TCG_OVERSIZED_GUEST for this).

If we wanted to support mttcg even for oversized guests,
then libatomic or our "atomic64" implementation could help,
but we'd have to insert a lock on every guest load/store
(either via the TCG backend or via a C helper). I do not think
we have enough users of these hosts to warrant support for this.
Moreover, these systems are unlikely to have many cores, and
therefore are unlikely to benefit much from mttcg.
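
Purely as an illustration of the C-helper option (the helper names
below are made up; nothing like this is added by the series), the
generated code would call out instead of emitting a plain 64-bit host
load/store, once the guest address has been translated to a host
pointer:

    uint64_t helper_ld64_locked(const uint64_t *haddr)
    {
        /* goes through the same spinlock array as atomic64.c */
        return atomic_read_u64(haddr);
    }

    void helper_st64_locked(uint64_t *haddr, uint64_t val)
    {
        atomic_set_u64(haddr, val);
    }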

Thanks,

		Emilio
Emilio Cota Sept. 18, 2018, 6:49 p.m. UTC | #6
On Tue, Sep 18, 2018 at 10:23:32 -0300, Murilo Opsfelder Araujo wrote:
> On Tue, Sep 11, 2018 at 04:43:04PM -0400, Emilio G. Cota wrote:
> > On Tue, Sep 11, 2018 at 05:43:38 -0700, Richard Henderson wrote:
> > > On 09/10/2018 04:27 PM, Emilio G. Cota wrote:
> > > > +#define GEN_READ(name, type)                    \
> > > > +    type name(const type *ptr)                  \
> > > > +    {                                           \
> > > > +        QemuSpin *lock = addr_to_lock(ptr);     \
> > > > +        type ret;                               \
> > > > +                                                \
> > > > +        qemu_spin_lock(lock);                   \
> > > > +        ret = *ptr;                             \
> > > > +        qemu_spin_unlock(lock);                 \
> > > > +        return ret;                             \
> > > > +    }
> > > > +
> > > > +GEN_READ(atomic_read_i64, int64_t)
> > > > +GEN_READ(atomic_read_u64, uint64_t)
> > >
> > > Is there really a good reason to have two external
> > > functions instead of having one of them inline and
> > > perform a cast?
> >
> > Not really. I can do a follow-up patch if you want me to.
> >
> > > Is this any better than using libatomic?
> >
> > I didn't think of using libatomic. I just checked the source
> > code and it's quite similar:
> > - It uses 64 locks instead of 16 ($page_size/$cache_line,
> >   but these are hard-coded for POSIX as 4096/64, respectively)
> > - We compute the cacheline size and corresponding padding
> >   at run-time, which is a little better than the above.
> > - The locks are implemented as pthread_mutex instead of
> >   spinlocks. I think spinlocks make more sense here because
> >   we do not expect huge contention (systems without
> >   CONFIG_ATOMIC64 have few cores); for libatomic it makes
> >   sense to use mutexes because it might be used in many-core
> >   machines.
> >
> > So yes, we could have used libatomic. If you feel strongly
> > about it, I can give it a shot.
> 
> Would allowing the user to choose between libatomic and Emilio's
> implementation at configure time be a bad idea?
> 
> One could pass, for example, --with-libatomic to the configure script to
> use libatomic instead of Emilio's implementation, which could be the
> default implementation - or vice versa.

If we decide to use libatomic then there's no need to keep
our own atomic64. As I said in another message, both
implementations are very similar.

libatomic supports more operations, but at the moment
we only need 64-bit atomic_read/write.

Thanks,

		Emilio
Peter Maydell Sept. 18, 2018, 7:04 p.m. UTC | #7
On 18 September 2018 at 19:42, Emilio G. Cota <cota@braap.org> wrote:
> We already have these. For instance:
>
> - sTLB lookups can happen concurrently with invalidations
>   to the same sTLB from another core (via tlb_reset_dirty)
>
> - icount_decr is written to by cpu_exit, and is read
>   at the beginning of each TB (with a tcg_gen_ld_i32 generated
>   by gen_tb_start).
>
> The latter is always a 32-bit access, so all hosts can generate
> it atomically. The former can be a 64-bit access if the guest
> is 64-bit, which cannot be done if !CONFIG_ATOMIC64. But we
> already disable mttcg for those combinations (we define
>  TCG_OVERSIZED_GUEST for this).

Does libatomic give us a firm guarantee that for 32-bit
types it will definitely produce an inline atomic access
insn that will interwork with what we're using? At the
moment our guard against this going wrong is that we don't
link against libatomic (so we get a compile failure if the
compiler decides to do something weird and out of line).

thanks
-- PMM
Richard Henderson Sept. 18, 2018, 7:23 p.m. UTC | #8
On 9/18/18 12:04 PM, Peter Maydell wrote:
> Does libatomic give us a firm guarantee that for 32-bit
> types it will definitely produce an inline atomic access
> insn that will interwork with what we're using? At the
> moment our guard against this going wrong is that we don't
> link against libatomic (so we get a compile failure if the
> compiler decides to do something weird and out of line).

No, it does not.


r~

Patch

diff --git a/include/qemu/atomic.h b/include/qemu/atomic.h
index 9ed39effd3..c34c2f78c4 100644
--- a/include/qemu/atomic.h
+++ b/include/qemu/atomic.h
@@ -450,4 +450,38 @@ 
     _oldn;                                                              \
 })
 
+/* Abstractions to access atomically (i.e. "once") i64/u64 variables */
+#ifdef CONFIG_ATOMIC64
+static inline int64_t atomic_read_i64(const int64_t *ptr)
+{
+    /* use __nocheck because sizeof(void *) might be < sizeof(u64) */
+    return atomic_read__nocheck(ptr);
+}
+
+static inline uint64_t atomic_read_u64(const uint64_t *ptr)
+{
+    return atomic_read__nocheck(ptr);
+}
+
+static inline void atomic_set_i64(int64_t *ptr, int64_t val)
+{
+    atomic_set__nocheck(ptr, val);
+}
+
+static inline void atomic_set_u64(uint64_t *ptr, uint64_t val)
+{
+    atomic_set__nocheck(ptr, val);
+}
+
+static inline void atomic64_init(void)
+{
+}
+#else /* !CONFIG_ATOMIC64 */
+int64_t  atomic_read_i64(const int64_t *ptr);
+uint64_t atomic_read_u64(const uint64_t *ptr);
+void atomic_set_i64(int64_t *ptr, int64_t val);
+void atomic_set_u64(uint64_t *ptr, uint64_t val);
+void atomic64_init(void);
+#endif /* !CONFIG_ATOMIC64 */
+
 #endif /* QEMU_ATOMIC_H */
diff --git a/util/atomic64.c b/util/atomic64.c
new file mode 100644
index 0000000000..b198a6c9c8
--- /dev/null
+++ b/util/atomic64.c
@@ -0,0 +1,83 @@ 
+/*
+ * Copyright (C) 2018, Emilio G. Cota <cota@braap.org>
+ *
+ * License: GNU GPL, version 2 or later.
+ *   See the COPYING file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include "qemu/atomic.h"
+#include "qemu/thread.h"
+
+#ifdef CONFIG_ATOMIC64
+#error This file must only be compiled if !CONFIG_ATOMIC64
+#endif
+
+/*
+ * When !CONFIG_ATOMIC64, we serialize both reads and writes with spinlocks.
+ * We use an array of spinlocks, with padding computed at run-time based on
+ * the host's dcache line size.
+ * We point to the array with a void * to simplify the padding's computation.
+ * Each spinlock is located every lock_size bytes.
+ */
+static void *lock_array;
+static size_t lock_size;
+
+/*
+ * Systems without CONFIG_ATOMIC64 are unlikely to have many cores, so we use a
+ * small array of locks.
+ */
+#define NR_LOCKS 16
+
+static QemuSpin *addr_to_lock(const void *addr)
+{
+    uintptr_t a = (uintptr_t)addr;
+    uintptr_t idx;
+
+    idx = a >> qemu_dcache_linesize_log;
+    idx ^= (idx >> 8) ^ (idx >> 16);
+    idx &= NR_LOCKS - 1;
+    return lock_array + idx * lock_size;
+}
+
+#define GEN_READ(name, type)                    \
+    type name(const type *ptr)                  \
+    {                                           \
+        QemuSpin *lock = addr_to_lock(ptr);     \
+        type ret;                               \
+                                                \
+        qemu_spin_lock(lock);                   \
+        ret = *ptr;                             \
+        qemu_spin_unlock(lock);                 \
+        return ret;                             \
+    }
+
+GEN_READ(atomic_read_i64, int64_t)
+GEN_READ(atomic_read_u64, uint64_t)
+#undef GEN_READ
+
+#define GEN_SET(name, type)                     \
+    void name(type *ptr, type val)              \
+    {                                           \
+        QemuSpin *lock = addr_to_lock(ptr);     \
+                                                \
+        qemu_spin_lock(lock);                   \
+        *ptr = val;                             \
+        qemu_spin_unlock(lock);                 \
+    }
+
+GEN_SET(atomic_set_i64, int64_t)
+GEN_SET(atomic_set_u64, uint64_t)
+#undef GEN_SET
+
+void atomic64_init(void)
+{
+    int i;
+
+    lock_size = ROUND_UP(sizeof(QemuSpin), qemu_dcache_linesize);
+    lock_array = qemu_memalign(qemu_dcache_linesize, lock_size * NR_LOCKS);
+    for (i = 0; i < NR_LOCKS; i++) {
+        QemuSpin *lock = lock_array + i * lock_size;
+
+        qemu_spin_init(lock);
+    }
+}
diff --git a/util/cacheinfo.c b/util/cacheinfo.c
index 57c7d58159..6c6cbdb25d 100644
--- a/util/cacheinfo.c
+++ b/util/cacheinfo.c
@@ -8,6 +8,7 @@ 
 
 #include "qemu/osdep.h"
 #include "qemu/host-utils.h"
+#include "qemu/atomic.h"
 
 int qemu_icache_linesize = 0;
 int qemu_icache_linesize_log;
@@ -179,4 +180,6 @@  static void __attribute__((constructor)) init_cache_info(void)
     qemu_icache_linesize_log = 31 - clz32(isize);
     qemu_dcache_linesize = dsize;
     qemu_dcache_linesize_log = 31 - clz32(dsize);
+
+    atomic64_init();
 }
diff --git a/util/Makefile.objs b/util/Makefile.objs
index 0e88899011..0820923c18 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -3,6 +3,7 @@  util-obj-y += bufferiszero.o
 util-obj-y += lockcnt.o
 util-obj-y += aiocb.o async.o aio-wait.o thread-pool.o qemu-timer.o
 util-obj-y += main-loop.o iohandler.o
+util-obj-$(call lnot,$(CONFIG_ATOMIC64)) += atomic64.o
 util-obj-$(CONFIG_POSIX) += aio-posix.o
 util-obj-$(CONFIG_POSIX) += compatfd.o
 util-obj-$(CONFIG_POSIX) += event_notifier-posix.o