
Remove catomics

Message ID AM5PR0801MB166869326D5273AA950568BA83AC9@AM5PR0801MB1668.eurprd08.prod.outlook.com
State New
Series: Remove catomics

Commit Message

Wilco Dijkstra June 16, 2022, 10:01 a.m. UTC
The catomics are not supported on most targets and are only used in a few places which are not
performance critical, so replace all uses with more standard atomics.
Replace uses of catomic_add, catomic_increment, catomic_decrement and catomic_exchange_and_add with
atomic_fetch_add_relaxed which maps to a standard compiler builtin. Relaxed memory ordering is
correct for simple counters since they only need atomicity.

Passes build-many-glibcs and the regression tests on AArch64.
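
A quick standalone illustration of that mapping (the counter name below is made up, not one
taken from the patch): atomic_fetch_add_relaxed boils down to the GCC/Clang __atomic_fetch_add
builtin with __ATOMIC_RELAXED ordering, which is all a plain statistics counter needs.

  #include <stdio.h>

  static unsigned long counter;   /* hypothetical stand-in for a stats counter */

  int
  main (void)
  {
    /* Roughly what atomic_fetch_add_relaxed (&counter, 1) expands to.  */
    unsigned long old = __atomic_fetch_add (&counter, 1, __ATOMIC_RELAXED);
    printf ("old=%lu new=%lu\n", old, counter);
    return 0;
  }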

---

Comments

Adhemerval Zanella Netto June 16, 2022, 8:06 p.m. UTC | #1
> On 16 Jun 2022, at 03:01, Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
> 
> 
> The catomics are not supported on most targets and are only used in a few places which are not
> performance critical, so replace all uses with more standard atomics.
> Replace uses of catomic_add, catomic_increment, catomic_decrement and catomic_exchange_and_add with
> atomic_fetch_add_relaxed which maps to a standard compiler builtin. Relaxed memory ordering is
> correct for simple counters since they only need atomicity.
> 
> Passes build-many-glibcs and the regression tests on AArch64.

LGTM, thanks. Some comments below, but I can send a cleanup patch if you prefer.

Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>

> 
> ---
> 
> diff --git a/elf/dl-fptr.c b/elf/dl-fptr.c
> index 6645a260b809ecd521796e0d1adee56b3e0bd993..d6e63b807b597b886562657da2d007fc9053be72 100644
> --- a/elf/dl-fptr.c
> +++ b/elf/dl-fptr.c
> @@ -40,7 +40,7 @@
> 
> #ifndef COMPARE_AND_SWAP
> # define COMPARE_AND_SWAP(ptr, old, new) \
> -  (catomic_compare_and_exchange_bool_acq (ptr, new, old) == 0)
> +  (atomic_compare_and_exchange_bool_acq (ptr, new, old) == 0)
> #endif

I think we can also remove this macro; only ia64 redefines it, to the old compiler builtin
(__sync_bool_compare_and_swap).

> 
> ElfW(Addr) _dl_boot_fptr_table [ELF_MACHINE_BOOT_FPTR_TABLE_LEN];
> diff --git a/elf/dl-profile.c b/elf/dl-profile.c
> index ec57e3a96552ae6460c22a0fcc819b85d486c0da..0af1f577d2d695d08edce9e13d9b39f77911b1d5 100644
> --- a/elf/dl-profile.c
> +++ b/elf/dl-profile.c
> @@ -548,24 +548,24 @@ _dl_mcount (ElfW(Addr) frompc, ElfW(Addr) selfpc)
> 	      size_t newfromidx;
> 	      to_index = (data[narcs].self_pc
> 			  / (HASHFRACTION * sizeof (*tos)));
> -	      newfromidx = catomic_exchange_and_add (&fromidx, 1) + 1;
> +	      newfromidx = atomic_fetch_add_relaxed (&fromidx, 1) + 1;
> 	      froms[newfromidx].here = &data[narcs];
> 	      froms[newfromidx].link = tos[to_index];
> 	      tos[to_index] = newfromidx;
> -	      catomic_increment (&narcs);
> +	      atomic_fetch_add_relaxed (&narcs, 1);
> 	    }
> 
> 	  /* If we still have no entry stop searching and insert.  */
> 	  if (*topcindex == 0)
> 	    {
> -	      unsigned int newarc = catomic_exchange_and_add (narcsp, 1);
> +	      unsigned int newarc = atomic_fetch_add_relaxed (narcsp, 1);
> 
> 	      /* In rare cases it could happen that all entries in FROMS are
> 		 occupied.  So we cannot count this anymore.  */
> 	      if (newarc >= fromlimit)
> 		goto done;
> 
> -	      *topcindex = catomic_exchange_and_add (&fromidx, 1) + 1;
> +	      *topcindex = atomic_fetch_add_relaxed (&fromidx, 1) + 1;
> 	      fromp = &froms[*topcindex];
> 
> 	      fromp->here = &data[newarc];
> @@ -573,7 +573,7 @@ _dl_mcount (ElfW(Addr) frompc, ElfW(Addr) selfpc)
> 	      data[newarc].self_pc = selfpc;
> 	      data[newarc].count = 0;
> 	      fromp->link = 0;
> -	      catomic_increment (&narcs);
> +	      atomic_fetch_add_relaxed (&narcs, 1);
> 
> 	      break;
> 	    }
> @@ -586,7 +586,7 @@ _dl_mcount (ElfW(Addr) frompc, ElfW(Addr) selfpc)
>     }
> 
>   /* Increment the counter.  */
> -  catomic_increment (&fromp->here->count);
> +  atomic_fetch_add_relaxed (&fromp->here->count, 1);
> 
>  done:
>   ;

Ok.

> diff --git a/include/atomic.h b/include/atomic.h
> index 2cb52c9cfd894308b97b97a04dd574b2287bf1b2..264db9a0b7619ff6520f84a19c53c1eb9a3b42a3 100644
> --- a/include/atomic.h
> +++ b/include/atomic.h
> @@ -24,13 +24,6 @@
>    - atomic arithmetic and logic operation on memory.  They all
>      have the prefix "atomic_".
> 
> -   - conditionally atomic operations of the same kinds.  These
> -     always behave identical but can be faster when atomicity
> -     is not really needed since only one thread has access to
> -     the memory location.  In that case the code is slower in
> -     the multi-thread case.  The interfaces have the prefix
> -     "catomic_".
> -
>    - support functions like barriers.  They also have the prefix
>      "atomic_".
> 

Ok.

> @@ -93,29 +86,6 @@
> #endif
> 
> 
> -#ifndef catomic_compare_and_exchange_val_acq
> -# ifdef __arch_c_compare_and_exchange_val_32_acq
> -#  define catomic_compare_and_exchange_val_acq(mem, newval, oldval) \
> -  __atomic_val_bysize (__arch_c_compare_and_exchange_val,acq,		      \
> -		       mem, newval, oldval)
> -# else
> -#  define catomic_compare_and_exchange_val_acq(mem, newval, oldval) \
> -  atomic_compare_and_exchange_val_acq (mem, newval, oldval)
> -# endif
> -#endif
> -
> -
> -#ifndef catomic_compare_and_exchange_val_rel
> -# ifndef atomic_compare_and_exchange_val_rel
> -#  define catomic_compare_and_exchange_val_rel(mem, newval, oldval)	      \
> -  catomic_compare_and_exchange_val_acq (mem, newval, oldval)
> -# else
> -#  define catomic_compare_and_exchange_val_rel(mem, newval, oldval)	      \
> -  atomic_compare_and_exchange_val_rel (mem, newval, oldval)
> -# endif
> -#endif
> -
> -
> #ifndef atomic_compare_and_exchange_val_rel
> # define atomic_compare_and_exchange_val_rel(mem, newval, oldval)	      \
>   atomic_compare_and_exchange_val_acq (mem, newval, oldval)
> @@ -141,23 +111,6 @@
> #endif
> 
> 
> -#ifndef catomic_compare_and_exchange_bool_acq
> -# ifdef __arch_c_compare_and_exchange_bool_32_acq
> -#  define catomic_compare_and_exchange_bool_acq(mem, newval, oldval) \
> -  __atomic_bool_bysize (__arch_c_compare_and_exchange_bool,acq,		      \
> -		        mem, newval, oldval)
> -# else
> -#  define catomic_compare_and_exchange_bool_acq(mem, newval, oldval) \
> -  ({ /* Cannot use __oldval here, because macros later in this file might     \
> -	call this macro with __oldval argument.	 */			      \
> -     __typeof (oldval) __atg4_old = (oldval);				      \
> -     catomic_compare_and_exchange_val_acq (mem, newval, __atg4_old)	      \
> -       != __atg4_old;							      \
> -  })
> -# endif
> -#endif
> -
> -
> /* Store NEWVALUE in *MEM and return the old value.  */
> #ifndef atomic_exchange_acq
> # define atomic_exchange_acq(mem, newvalue) \

Ok.

> @@ -212,24 +165,6 @@
>   atomic_exchange_and_add_acq(mem, value)
> #endif
> 
> -#ifndef catomic_exchange_and_add
> -# define catomic_exchange_and_add(mem, value) \
> -  ({ __typeof (*(mem)) __atg7_oldv;					      \
> -     __typeof (mem) __atg7_memp = (mem);				      \
> -     __typeof (*(mem)) __atg7_value = (value);				      \
> -									      \
> -     do									      \
> -       __atg7_oldv = *__atg7_memp;					      \
> -     while (__builtin_expect						      \
> -	    (catomic_compare_and_exchange_bool_acq (__atg7_memp,	      \
> -						    __atg7_oldv		      \
> -						    + __atg7_value,	      \
> -						    __atg7_oldv), 0));	      \
> -									      \
> -     __atg7_oldv; })
> -#endif
> -
> -
> #ifndef atomic_max
> # define atomic_max(mem, value) \
>   do {									      \
> @@ -246,25 +181,6 @@
>   } while (0)
> #endif
> 
> -
> -#ifndef catomic_max
> -# define catomic_max(mem, value) \
> -  do {									      \
> -    __typeof (*(mem)) __atg9_oldv;					      \
> -    __typeof (mem) __atg9_memp = (mem);					      \
> -    __typeof (*(mem)) __atg9_value = (value);				      \
> -    do {								      \
> -      __atg9_oldv = *__atg9_memp;					      \
> -      if (__atg9_oldv >= __atg9_value)					      \
> -	break;								      \
> -    } while (__builtin_expect						      \
> -	     (catomic_compare_and_exchange_bool_acq (__atg9_memp,	      \
> -						     __atg9_value,	      \
> -						     __atg9_oldv), 0));	      \
> -  } while (0)
> -#endif
> -
> -
> #ifndef atomic_min
> # define atomic_min(mem, value) \
>   do {									      \
> @@ -288,32 +204,16 @@
> #endif
> 
> 
> -#ifndef catomic_add
> -# define catomic_add(mem, value) \
> -  (void) catomic_exchange_and_add ((mem), (value))
> -#endif
> -
> -
> #ifndef atomic_increment
> # define atomic_increment(mem) atomic_add ((mem), 1)
> #endif
> 
> 
> -#ifndef catomic_increment
> -# define catomic_increment(mem) catomic_add ((mem), 1)
> -#endif
> -
> -
> #ifndef atomic_increment_val
> # define atomic_increment_val(mem) (atomic_exchange_and_add ((mem), 1) + 1)
> #endif
> 
> 
> -#ifndef catomic_increment_val
> -# define catomic_increment_val(mem) (catomic_exchange_and_add ((mem), 1) + 1)
> -#endif
> -
> -
> /* Add one to *MEM and return true iff it's now zero.  */
> #ifndef atomic_increment_and_test
> # define atomic_increment_and_test(mem) \
> @@ -326,21 +226,11 @@
> #endif
> 
> 
> -#ifndef catomic_decrement
> -# define catomic_decrement(mem) catomic_add ((mem), -1)
> -#endif
> -
> -
> #ifndef atomic_decrement_val
> # define atomic_decrement_val(mem) (atomic_exchange_and_add ((mem), -1) - 1)
> #endif
> 
> 
> -#ifndef catomic_decrement_val
> -# define catomic_decrement_val(mem) (catomic_exchange_and_add ((mem), -1) - 1)
> -#endif
> -
> -
> /* Subtract 1 from *MEM and return true iff it's now zero.  */
> #ifndef atomic_decrement_and_test
> # define atomic_decrement_and_test(mem) \
> @@ -421,22 +311,6 @@
>   } while (0)
> #endif
> 
> -#ifndef catomic_and
> -# define catomic_and(mem, mask) \
> -  do {									      \
> -    __typeof (*(mem)) __atg20_old;					      \
> -    __typeof (mem) __atg20_memp = (mem);				      \
> -    __typeof (*(mem)) __atg20_mask = (mask);				      \
> -									      \
> -    do									      \
> -      __atg20_old = (*__atg20_memp);					      \
> -    while (__builtin_expect						      \
> -	   (catomic_compare_and_exchange_bool_acq (__atg20_memp,	      \
> -						   __atg20_old & __atg20_mask,\
> -						   __atg20_old), 0));	      \
> -  } while (0)
> -#endif
> -
> /* Atomically *mem &= mask and return the old value of *mem.  */
> #ifndef atomic_and_val
> # define atomic_and_val(mem, mask) \
> @@ -471,22 +345,6 @@
>   } while (0)
> #endif
> 
> -#ifndef catomic_or
> -# define catomic_or(mem, mask) \
> -  do {									      \
> -    __typeof (*(mem)) __atg18_old;					      \
> -    __typeof (mem) __atg18_memp = (mem);				      \
> -    __typeof (*(mem)) __atg18_mask = (mask);				      \
> -									      \
> -    do									      \
> -      __atg18_old = (*__atg18_memp);					      \
> -    while (__builtin_expect						      \
> -	   (catomic_compare_and_exchange_bool_acq (__atg18_memp,	      \
> -						   __atg18_old | __atg18_mask,\
> -						   __atg18_old), 0));	      \
> -  } while (0)
> -#endif
> -
> /* Atomically *mem |= mask and return the old value of *mem.  */
> #ifndef atomic_or_val
> # define atomic_or_val(mem, mask) \

Ok.

> diff --git a/malloc/arena.c b/malloc/arena.c
> index 0a684a720d9648953073bd7d35faca100762c031..7c74a18381a4be5fe6bcb94b38a62dbfa6b674f4 100644
> --- a/malloc/arena.c
> +++ b/malloc/arena.c
> @@ -958,11 +958,11 @@ arena_get2 (size_t size, mstate avoid_arena)
>          enough address space to create that many arenas.  */
>       if (__glibc_unlikely (n <= narenas_limit - 1))
>         {
> -          if (catomic_compare_and_exchange_bool_acq (&narenas, n + 1, n))
> +          if (atomic_compare_and_exchange_bool_acq (&narenas, n + 1, n))
>             goto repeat;
>           a = _int_new_arena (size);
> 	  if (__glibc_unlikely (a == NULL))
> -            catomic_decrement (&narenas);
> +            atomic_fetch_add_relaxed (&narenas, -1);
>         }
>       else
>         a = reused_arena (avoid_arena);

Ok.

> diff --git a/malloc/malloc.c b/malloc/malloc.c
> index 09e5ff2bce5d79b3754687db3aef314640d336eb..a335cd23e18ee09027a8e7a71d808bfa1343e06d 100644
> --- a/malloc/malloc.c
> +++ b/malloc/malloc.c
> @@ -2486,11 +2486,11 @@ sysmalloc_mmap (INTERNAL_SIZE_T nb, size_t pagesize, int extra_flags, mstate av)
>     }
> 
>   /* update statistics */
> -  int new = atomic_exchange_and_add (&mp_.n_mmaps, 1) + 1;
> +  int new = atomic_fetch_add_relaxed (&mp_.n_mmaps, 1) + 1;
>   atomic_max (&mp_.max_n_mmaps, new);
> 
>   unsigned long sum;
> -  sum = atomic_exchange_and_add (&mp_.mmapped_mem, size) + size;
> +  sum = atomic_fetch_add_relaxed (&mp_.mmapped_mem, size) + size;
>   atomic_max (&mp_.max_mmapped_mem, sum);
> 
>   check_chunk (av, p);
> @@ -3059,8 +3059,8 @@ munmap_chunk (mchunkptr p)
>       || __glibc_unlikely (!powerof2 (mem & (pagesize - 1))))
>     malloc_printerr ("munmap_chunk(): invalid pointer");
> 
> -  atomic_decrement (&mp_.n_mmaps);
> -  atomic_add (&mp_.mmapped_mem, -total_size);
> +  atomic_fetch_add_relaxed (&mp_.n_mmaps, -1);
> +  atomic_fetch_add_relaxed (&mp_.mmapped_mem, -total_size);
> 
>   /* If munmap failed the process virtual memory address space is in a
>      bad shape.  Just leave the block hanging around, the process will
> @@ -3110,7 +3110,7 @@ mremap_chunk (mchunkptr p, size_t new_size)
>   set_head (p, (new_size - offset) | IS_MMAPPED);
> 
>   INTERNAL_SIZE_T new;
> -  new = atomic_exchange_and_add (&mp_.mmapped_mem, new_size - size - offset)
> +  new = atomic_fetch_add_relaxed (&mp_.mmapped_mem, new_size - size - offset)
>         + new_size - size - offset;
>   atomic_max (&mp_.max_mmapped_mem, new);
>   return p;
> @@ -3832,7 +3832,7 @@ _int_malloc (mstate av, size_t bytes)
>       if (__glibc_unlikely (pp != NULL && misaligned_chunk (pp)))       \
> 	malloc_printerr ("malloc(): unaligned fastbin chunk detected"); \
>     }							\
> -  while ((pp = catomic_compare_and_exchange_val_acq (fb, pp, victim)) \
> +  while ((pp = atomic_compare_and_exchange_val_acq (fb, pp, victim)) \
> 	 != victim);					\
> 
>   if ((unsigned long) (nb) <= (unsigned long) (get_max_fast ()))
> @@ -4550,7 +4550,7 @@ _int_free (mstate av, mchunkptr p, int have_lock)
> 	  old2 = old;
> 	  p->fd = PROTECT_PTR (&p->fd, old);
> 	}
> -      while ((old = catomic_compare_and_exchange_val_rel (fb, p, old2))
> +      while ((old = atomic_compare_and_exchange_val_rel (fb, p, old2))
> 	     != old2);
> 
>     /* Check that size of fastbin chunk at the top is the same as

Ok.

> diff --git a/malloc/memusage.c b/malloc/memusage.c
> index f30906dffb2731c104ea375af48f59c65bcc7c9c..74712834fa8b96fb2d9589d34b34ab07d05a84ca 100644
> --- a/malloc/memusage.c
> +++ b/malloc/memusage.c
> @@ -148,8 +148,8 @@ update_data (struct header *result, size_t len, size_t old_len)
> 
>   /* Compute current heap usage and compare it with the maximum value.  */
>   size_t heap
> -    = catomic_exchange_and_add (&current_heap, len - old_len) + len - old_len;
> -  catomic_max (&peak_heap, heap);
> +    = atomic_fetch_add_relaxed (&current_heap, len - old_len) + len - old_len;
> +  atomic_max (&peak_heap, heap);
> 
>   /* Compute current stack usage and compare it with the maximum
>      value.  The base stack pointer might not be set if this is not
> @@ -172,15 +172,15 @@ update_data (struct header *result, size_t len, size_t old_len)
>     start_sp = sp;
>   size_t current_stack = start_sp - sp;
> #endif
> -  catomic_max (&peak_stack, current_stack);
> +  atomic_max (&peak_stack, current_stack);
> 
>   /* Add up heap and stack usage and compare it with the maximum value.  */
> -  catomic_max (&peak_total, heap + current_stack);
> +  atomic_max (&peak_total, heap + current_stack);
> 
>   /* Store the value only if we are writing to a file.  */
>   if (fd != -1)
>     {
> -      uint32_t idx = catomic_exchange_and_add (&buffer_cnt, 1);
> +      uint32_t idx = atomic_fetch_add_relaxed (&buffer_cnt, 1);
>       if (idx + 1 >= 2 * buffer_size)
>         {
>           /* We try to reset the counter to the correct range.  If
> @@ -188,7 +188,7 @@ update_data (struct header *result, size_t len, size_t old_len)
>              counter it does not matter since that thread will take
>              care of the correction.  */
>           uint32_t reset = (idx + 1) % (2 * buffer_size);
> -          catomic_compare_and_exchange_val_acq (&buffer_cnt, reset, idx + 1);
> +          atomic_compare_and_exchange_val_acq (&buffer_cnt, reset, idx + 1);
>           if (idx >= 2 * buffer_size)
>             idx = reset - 1;
>         }
> @@ -362,24 +362,24 @@ malloc (size_t len)
>     return (*mallocp)(len);
> 
>   /* Keep track of number of calls.  */
> -  catomic_increment (&calls[idx_malloc]);
> +  atomic_fetch_add_relaxed (&calls[idx_malloc], 1);
>   /* Keep track of total memory consumption for `malloc'.  */
> -  catomic_add (&total[idx_malloc], len);
> +  atomic_fetch_add_relaxed (&total[idx_malloc], len);
>   /* Keep track of total memory requirement.  */
> -  catomic_add (&grand_total, len);
> +  atomic_fetch_add_relaxed (&grand_total, len);
>   /* Remember the size of the request.  */
>   if (len < 65536)
> -    catomic_increment (&histogram[len / 16]);
> +    atomic_fetch_add_relaxed (&histogram[len / 16], 1);
>   else
> -    catomic_increment (&large);
> +    atomic_fetch_add_relaxed (&large, 1);
>   /* Total number of calls of any of the functions.  */
> -  catomic_increment (&calls_total);
> +  atomic_fetch_add_relaxed (&calls_total, 1);
> 
>   /* Do the real work.  */
>   result = (struct header *) (*mallocp)(len + sizeof (struct header));
>   if (result == NULL)
>     {
> -      catomic_increment (&failed[idx_malloc]);
> +      atomic_fetch_add_relaxed (&failed[idx_malloc], 1);
>       return NULL;
>     }
> 
> @@ -430,21 +430,21 @@ realloc (void *old, size_t len)
>     }
> 
>   /* Keep track of number of calls.  */
> -  catomic_increment (&calls[idx_realloc]);
> +  atomic_fetch_add_relaxed (&calls[idx_realloc], 1);
>   if (len > old_len)
>     {
>       /* Keep track of total memory consumption for `realloc'.  */
> -      catomic_add (&total[idx_realloc], len - old_len);
> +      atomic_fetch_add_relaxed (&total[idx_realloc], len - old_len);
>       /* Keep track of total memory requirement.  */
> -      catomic_add (&grand_total, len - old_len);
> +      atomic_fetch_add_relaxed (&grand_total, len - old_len);
>     }
> 
>   if (len == 0 && old != NULL)
>     {
>       /* Special case.  */
> -      catomic_increment (&realloc_free);
> +      atomic_fetch_add_relaxed (&realloc_free, 1);
>       /* Keep track of total memory freed using `free'.  */
> -      catomic_add (&total[idx_free], real->length);
> +      atomic_fetch_add_relaxed (&total[idx_free], real->length);
> 
>       /* Update the allocation data and write out the records if necessary.  */
>       update_data (NULL, 0, old_len);
> @@ -457,26 +457,26 @@ realloc (void *old, size_t len)
> 
>   /* Remember the size of the request.  */
>   if (len < 65536)
> -    catomic_increment (&histogram[len / 16]);
> +    atomic_fetch_add_relaxed (&histogram[len / 16], 1);
>   else
> -    catomic_increment (&large);
> +    atomic_fetch_add_relaxed (&large, 1);
>   /* Total number of calls of any of the functions.  */
> -  catomic_increment (&calls_total);
> +  atomic_fetch_add_relaxed (&calls_total, 1);
> 
>   /* Do the real work.  */
>   result = (struct header *) (*reallocp)(real, len + sizeof (struct header));
>   if (result == NULL)
>     {
> -      catomic_increment (&failed[idx_realloc]);
> +      atomic_fetch_add_relaxed (&failed[idx_realloc], 1);
>       return NULL;
>     }
> 
>   /* Record whether the reduction/increase happened in place.  */
>   if (real == result)
> -    catomic_increment (&inplace);
> +    atomic_fetch_add_relaxed (&inplace, 1);
>   /* Was the buffer increased?  */
>   if (old_len > len)
> -    catomic_increment (&decreasing);
> +    atomic_fetch_add_relaxed (&decreasing, 1);
> 
>   /* Update the allocation data and write out the records if necessary.  */
>   update_data (result, len, old_len);
> @@ -508,16 +508,16 @@ calloc (size_t n, size_t len)
>     return (*callocp)(n, len);
> 
>   /* Keep track of number of calls.  */
> -  catomic_increment (&calls[idx_calloc]);
> +  atomic_fetch_add_relaxed (&calls[idx_calloc], 1);
>   /* Keep track of total memory consumption for `calloc'.  */
> -  catomic_add (&total[idx_calloc], size);
> +  atomic_fetch_add_relaxed (&total[idx_calloc], size);
>   /* Keep track of total memory requirement.  */
> -  catomic_add (&grand_total, size);
> +  atomic_fetch_add_relaxed (&grand_total, size);
>   /* Remember the size of the request.  */
>   if (size < 65536)
> -    catomic_increment (&histogram[size / 16]);
> +    atomic_fetch_add_relaxed (&histogram[size / 16], 1);
>   else
> -    catomic_increment (&large);
> +    atomic_fetch_add_relaxed (&large, 1);
>   /* Total number of calls of any of the functions.  */
>   ++calls_total;
> 
> @@ -525,7 +525,7 @@ calloc (size_t n, size_t len)
>   result = (struct header *) (*mallocp)(size + sizeof (struct header));
>   if (result == NULL)
>     {
> -      catomic_increment (&failed[idx_calloc]);
> +      atomic_fetch_add_relaxed (&failed[idx_calloc], 1);
>       return NULL;
>     }
> 
> @@ -563,7 +563,7 @@ free (void *ptr)
>   /* `free (NULL)' has no effect.  */
>   if (ptr == NULL)
>     {
> -      catomic_increment (&calls[idx_free]);
> +      atomic_fetch_add_relaxed (&calls[idx_free], 1);
>       return;
>     }
> 

Ok.

> @@ -577,9 +577,9 @@ free (void *ptr)
>     }
> 
>   /* Keep track of number of calls.  */
> -  catomic_increment (&calls[idx_free]);
> +  atomic_fetch_add_relaxed (&calls[idx_free], 1);
>   /* Keep track of total memory freed using `free'.  */
> -  catomic_add (&total[idx_free], real->length);
> +  atomic_fetch_add_relaxed (&total[idx_free], real->length);
> 
>   /* Update the allocation data and write out the records if necessary.  */
>   update_data (NULL, 0, real->length);
> @@ -614,22 +614,22 @@ mmap (void *start, size_t len, int prot, int flags, int fd, off_t offset)
>                  ? idx_mmap_a : prot & PROT_WRITE ? idx_mmap_w : idx_mmap_r);
> 
>       /* Keep track of number of calls.  */
> -      catomic_increment (&calls[idx]);
> +      atomic_fetch_add_relaxed (&calls[idx], 1);
>       /* Keep track of total memory consumption for `malloc'.  */
> -      catomic_add (&total[idx], len);
> +      atomic_fetch_add_relaxed (&total[idx], len);
>       /* Keep track of total memory requirement.  */
> -      catomic_add (&grand_total, len);
> +      atomic_fetch_add_relaxed (&grand_total, len);
>       /* Remember the size of the request.  */
>       if (len < 65536)
> -        catomic_increment (&histogram[len / 16]);
> +        atomic_fetch_add_relaxed (&histogram[len / 16], 1);
>       else
> -        catomic_increment (&large);
> +        atomic_fetch_add_relaxed (&large, 1);
>       /* Total number of calls of any of the functions.  */
> -      catomic_increment (&calls_total);
> +      atomic_fetch_add_relaxed (&calls_total, 1);
> 
>       /* Check for failures.  */
>       if (result == NULL)
> -        catomic_increment (&failed[idx]);
> +        atomic_fetch_add_relaxed (&failed[idx], 1);
>       else if (idx == idx_mmap_w)
>         /* Update the allocation data and write out the records if
>            necessary.  Note the first parameter is NULL which means
> @@ -667,22 +667,22 @@ mmap64 (void *start, size_t len, int prot, int flags, int fd, off64_t offset)
>                  ? idx_mmap_a : prot & PROT_WRITE ? idx_mmap_w : idx_mmap_r);
> 
>       /* Keep track of number of calls.  */
> -      catomic_increment (&calls[idx]);
> +      atomic_fetch_add_relaxed (&calls[idx], 1);
>       /* Keep track of total memory consumption for `malloc'.  */
> -      catomic_add (&total[idx], len);
> +      atomic_fetch_add_relaxed (&total[idx], len);
>       /* Keep track of total memory requirement.  */
> -      catomic_add (&grand_total, len);
> +      atomic_fetch_add_relaxed (&grand_total, len);
>       /* Remember the size of the request.  */
>       if (len < 65536)
> -        catomic_increment (&histogram[len / 16]);
> +        atomic_fetch_add_relaxed (&histogram[len / 16], 1);
>       else
> -        catomic_increment (&large);
> +        atomic_fetch_add_relaxed (&large, 1);
>       /* Total number of calls of any of the functions.  */
> -      catomic_increment (&calls_total);
> +      atomic_fetch_add_relaxed (&calls_total, 1);
> 
>       /* Check for failures.  */
>       if (result == NULL)
> -        catomic_increment (&failed[idx]);
> +        atomic_fetch_add_relaxed (&failed[idx], 1);
>       else if (idx == idx_mmap_w)
>         /* Update the allocation data and write out the records if
>            necessary.  Note the first parameter is NULL which means

Ok.

> @@ -722,33 +722,33 @@ mremap (void *start, size_t old_len, size_t len, int flags, ...)
>   if (!not_me && trace_mmap)
>     {
>       /* Keep track of number of calls.  */
> -      catomic_increment (&calls[idx_mremap]);
> +      atomic_fetch_add_relaxed (&calls[idx_mremap], 1);
>       if (len > old_len)
>         {
>           /* Keep track of total memory consumption for `malloc'.  */
> -          catomic_add (&total[idx_mremap], len - old_len);
> +          atomic_fetch_add_relaxed (&total[idx_mremap], len - old_len);
>           /* Keep track of total memory requirement.  */
> -          catomic_add (&grand_total, len - old_len);
> +          atomic_fetch_add_relaxed (&grand_total, len - old_len);
>         }
>       /* Remember the size of the request.  */
>       if (len < 65536)
> -        catomic_increment (&histogram[len / 16]);
> +        atomic_fetch_add_relaxed (&histogram[len / 16], 1);
>       else
> -        catomic_increment (&large);
> +        atomic_fetch_add_relaxed (&large, 1);
>       /* Total number of calls of any of the functions.  */
> -      catomic_increment (&calls_total);
> +      atomic_fetch_add_relaxed (&calls_total, 1);
> 
>       /* Check for failures.  */
>       if (result == NULL)
> -        catomic_increment (&failed[idx_mremap]);
> +        atomic_fetch_add_relaxed (&failed[idx_mremap], 1);
>       else
>         {
>           /* Record whether the reduction/increase happened in place.  */
>           if (start == result)
> -            catomic_increment (&inplace_mremap);
> +            atomic_fetch_add_relaxed (&inplace_mremap, 1);
>           /* Was the buffer increased?  */
>           if (old_len > len)
> -            catomic_increment (&decreasing_mremap);
> +            atomic_fetch_add_relaxed (&decreasing_mremap, 1);
> 
>           /* Update the allocation data and write out the records if
>              necessary.  Note the first parameter is NULL which means
> @@ -783,19 +783,19 @@ munmap (void *start, size_t len)
>   if (!not_me && trace_mmap)
>     {
>       /* Keep track of number of calls.  */
> -      catomic_increment (&calls[idx_munmap]);
> +      atomic_fetch_add_relaxed (&calls[idx_munmap], 1);
> 
>       if (__glibc_likely (result == 0))
>         {
>           /* Keep track of total memory freed using `free'.  */
> -          catomic_add (&total[idx_munmap], len);
> +          atomic_fetch_add_relaxed (&total[idx_munmap], len);
> 
>           /* Update the allocation data and write out the records if
>              necessary.  */
>           update_data (NULL, 0, len);
>         }
>       else
> -        catomic_increment (&failed[idx_munmap]);
> +        atomic_fetch_add_relaxed (&failed[idx_munmap], 1);
>     }
> 
>   return result;

Ok.

> diff --git a/manual/memory.texi b/manual/memory.texi
> index 23a039c57e60c81787252d935e3b309fd8290902..5cb1dbd281006148f23cfa38c5703fb79089ba78 100644
> --- a/manual/memory.texi
> +++ b/manual/memory.texi
> @@ -354,7 +354,7 @@ this function is in @file{stdlib.h}.
> @c that's protected by list_lock; next_free is only modified while
> @c list_lock is held too.  All other data members of an arena, as well
> @c as the metadata of the memory areas assigned to it, are only modified
> -@c while holding the arena's mutex (fastbin pointers use catomic ops
> +@c while holding the arena's mutex (fastbin pointers use atomic ops
> @c because they may be modified by free without taking the arena's
> @c lock).  Some reassurance was needed for fastbins, for it wasn't clear
> @c how they were initialized.  It turns out they are always
> @@ -383,7 +383,7 @@ this function is in @file{stdlib.h}.
> @c     mutex_lock (arena lock) dup @asulock @aculock [returns locked]
> @c    __get_nprocs ext ok @acsfd
> @c    NARENAS_FROM_NCORES ok
> -@c    catomic_compare_and_exchange_bool_acq ok
> +@c    atomic_compare_and_exchange_bool_acq ok
> @c    _int_new_arena ok @asulock @aculock @acsmem
> @c     new_heap ok @acsmem
> @c      mmap ok @acsmem
> @@ -397,7 +397,7 @@ this function is in @file{stdlib.h}.
> @c     mutex_lock (list_lock) dup @asulock @aculock
> @c     atomic_write_barrier ok
> @c     mutex_unlock (list_lock) @aculock
> -@c    catomic_decrement ok
> +@c    atomic_decrement ok
> @c    reused_arena @asulock @aculock
> @c      reads&writes next_to_use and iterates over arena next without guards
> @c      those are harmless as long as we don't drop arenas from the
> @@ -414,7 +414,7 @@ this function is in @file{stdlib.h}.
> @c   get_max_fast ok
> @c   fastbin_index ok
> @c   fastbin ok
> -@c   catomic_compare_and_exhange_val_acq ok
> +@c   atomic_compare_and_exhange_val_acq ok
> @c   malloc_printerr dup @mtsenv
> @c     if we get to it, we're toast already, undefined behavior must have
> @c     been invoked before
> @@ -521,10 +521,10 @@ this function is in @file{stdlib.h}.
> @c     chunk2mem dup ok
> @c     free_perturb ok
> @c     set_fastchunks ok
> -@c      catomic_and ok
> +@c      atomic_and ok
> @c     fastbin_index dup ok
> @c     fastbin dup ok
> -@c     catomic_compare_and_exchange_val_rel ok
> +@c     atomic_compare_and_exchange_val_rel ok
> @c     chunk_is_mmapped ok
> @c     contiguous dup ok
> @c     prev_inuse ok
> @@ -706,7 +706,7 @@ The prototype for this function is in @file{stdlib.h}.
> @safety{@prelim{}@mtsafe{}@asunsafe{@asulock{}}@acunsafe{@aculock{} @acsfd{} @acsmem{}}}
> @c __libc_free @asulock @aculock @acsfd @acsmem
> @c   releasing memory into fastbins modifies the arena without taking
> -@c   its mutex, but catomic operations ensure safety.  If two (or more)
> +@c   its mutex, but atomic operations ensure safety.  If two (or more)
> @c   threads are running malloc and have their own arenas locked when
> @c   each gets a signal whose handler free()s large (non-fastbin-able)
> @c   blocks from each other's arena, we deadlock; this is a more general

Ok.

> diff --git a/misc/tst-atomic.c b/misc/tst-atomic.c
> index 6d681a7bfdf4f48b4c04a073ebd480326dbd3cc8..4f9d2c1a46b363d346dbc2fa0962ae196844a43a 100644
> --- a/misc/tst-atomic.c
> +++ b/misc/tst-atomic.c
> @@ -393,117 +393,6 @@ do_test (void)
>     }
> #endif
> 
> -#ifdef catomic_compare_and_exchange_val_acq
> -  mem = 24;
> -  if (catomic_compare_and_exchange_val_acq (&mem, 35, 24) != 24
> -      || mem != 35)
> -    {
> -      puts ("catomic_compare_and_exchange_val_acq test 1 failed");
> -      ret = 1;
> -    }
> -
> -  mem = 12;
> -  if (catomic_compare_and_exchange_val_acq (&mem, 10, 15) != 12
> -      || mem != 12)
> -    {
> -      puts ("catomic_compare_and_exchange_val_acq test 2 failed");
> -      ret = 1;
> -    }
> -
> -  mem = -15;
> -  if (catomic_compare_and_exchange_val_acq (&mem, -56, -15) != -15
> -      || mem != -56)
> -    {
> -      puts ("catomic_compare_and_exchange_val_acq test 3 failed");
> -      ret = 1;
> -    }
> -
> -  mem = -1;
> -  if (catomic_compare_and_exchange_val_acq (&mem, 17, 0) != -1
> -      || mem != -1)
> -    {
> -      puts ("catomic_compare_and_exchange_val_acq test 4 failed");
> -      ret = 1;
> -    }
> -#endif
> -
> -  mem = 24;
> -  if (catomic_compare_and_exchange_bool_acq (&mem, 35, 24)
> -      || mem != 35)
> -    {
> -      puts ("catomic_compare_and_exchange_bool_acq test 1 failed");
> -      ret = 1;
> -    }
> -
> -  mem = 12;
> -  if (! catomic_compare_and_exchange_bool_acq (&mem, 10, 15)
> -      || mem != 12)
> -    {
> -      puts ("catomic_compare_and_exchange_bool_acq test 2 failed");
> -      ret = 1;
> -    }
> -
> -  mem = -15;
> -  if (catomic_compare_and_exchange_bool_acq (&mem, -56, -15)
> -      || mem != -56)
> -    {
> -      puts ("catomic_compare_and_exchange_bool_acq test 3 failed");
> -      ret = 1;
> -    }
> -
> -  mem = -1;
> -  if (! catomic_compare_and_exchange_bool_acq (&mem, 17, 0)
> -      || mem != -1)
> -    {
> -      puts ("catomic_compare_and_exchange_bool_acq test 4 failed");
> -      ret = 1;
> -    }
> -
> -  mem = 2;
> -  if (catomic_exchange_and_add (&mem, 11) != 2
> -      || mem != 13)
> -    {
> -      puts ("catomic_exchange_and_add test failed");
> -      ret = 1;
> -    }
> -
> -  mem = -21;
> -  catomic_add (&mem, 22);
> -  if (mem != 1)
> -    {
> -      puts ("catomic_add test failed");
> -      ret = 1;
> -    }
> -
> -  mem = -1;
> -  catomic_increment (&mem);
> -  if (mem != 0)
> -    {
> -      puts ("catomic_increment test failed");
> -      ret = 1;
> -    }
> -
> -  mem = 2;
> -  if (catomic_increment_val (&mem) != 3)
> -    {
> -      puts ("catomic_increment_val test failed");
> -      ret = 1;
> -    }
> -
> -  mem = 17;
> -  catomic_decrement (&mem);
> -  if (mem != 16)
> -    {
> -      puts ("catomic_decrement test failed");
> -      ret = 1;
> -    }
> -
> -  if (catomic_decrement_val (&mem) != 15)
> -    {
> -      puts ("catomic_decrement_val test failed");
> -      ret = 1;
> -    }
> -
>   /* Tests for C11-like atomics.  */
>   mem = 11;
>   if (atomic_load_relaxed (&mem) != 11 || atomic_load_acquire (&mem) != 11)

Ok.

> diff --git a/sysdeps/hppa/dl-fptr.c b/sysdeps/hppa/dl-fptr.c
> index 9ed21602d6155d4b960278f8d1fac4ffa885b9d5..a814769b0b5cb492561f54f32575808e00b4dd24 100644
> --- a/sysdeps/hppa/dl-fptr.c
> +++ b/sysdeps/hppa/dl-fptr.c
> @@ -43,7 +43,7 @@
> 
> #ifndef COMPARE_AND_SWAP
> # define COMPARE_AND_SWAP(ptr, old, new) \
> -  (catomic_compare_and_exchange_bool_acq (ptr, new, old) == 0)
> +  (atomic_compare_and_exchange_bool_acq (ptr, new, old) == 0)
> #endif
> 

I think we can just remove this macro; hppa does not redefine it.

> ElfW(Addr) _dl_boot_fptr_table [ELF_MACHINE_BOOT_FPTR_TABLE_LEN];
> diff --git a/sysdeps/s390/atomic-machine.h b/sysdeps/s390/atomic-machine.h
> index e85b2ef50541c7aab6d2981180f6205d2bd681b6..6b1de51c2a30baf5554a729a80a7ce04b56fc22c 100644
> --- a/sysdeps/s390/atomic-machine.h
> +++ b/sysdeps/s390/atomic-machine.h
> @@ -70,8 +70,6 @@
>     !__atomic_compare_exchange_n (mem, (void *) &__atg2_oldval, newval,	\
> 				  1, __ATOMIC_ACQUIRE,			\
> 				  __ATOMIC_RELAXED); })
> -#define catomic_compare_and_exchange_bool_acq(mem, newval, oldval)	\
> -  atomic_compare_and_exchange_bool_acq (mem, newval, oldval)
> 
> /* Store NEWVALUE in *MEM and return the old value.  */
> #define atomic_exchange_acq(mem, newvalue)				\
> @@ -90,8 +88,6 @@
> # define atomic_exchange_and_add_rel(mem, operand)			\
>   ({ __atomic_check_size((mem));					\
>   __atomic_fetch_add ((mem), (operand), __ATOMIC_RELEASE); })
> -#define catomic_exchange_and_add(mem, value)	\
> -  atomic_exchange_and_add (mem, value)
> 
> /* Atomically *mem |= mask and return the old value of *mem.  */
> /* The gcc builtin uses load-and-or instruction on z196 zarch and higher cpus
> @@ -104,8 +100,6 @@
>   do {						\
>     atomic_or_val (mem, mask);			\
>   } while (0)
> -#define catomic_or(mem, mask)			\
> -  atomic_or (mem, mask)
> 
> /* Atomically *mem |= 1 << bit and return true if the bit was set in old value
>    of *mem.  */
> @@ -129,5 +123,3 @@
>   do {						\
>     atomic_and_val (mem, mask);			\
>   } while (0)
> -#define catomic_and(mem, mask)			\
> -  atomic_and(mem, mask)

Ok.

> diff --git a/sysdeps/unix/sysv/linux/riscv/atomic-machine.h b/sysdeps/unix/sysv/linux/riscv/atomic-machine.h
> index 9ae89e0ef12ad28319755ac51260908779b9579f..f4b2cbced828a80335887bf172fd60767cf978ac 100644
> --- a/sysdeps/unix/sysv/linux/riscv/atomic-machine.h
> +++ b/sysdeps/unix/sysv/linux/riscv/atomic-machine.h
> @@ -170,10 +170,6 @@
>   ({ typeof (*mem) __mask = (typeof (*mem))1 << (bit);    \
>      asm_amo ("amoor", ".aq", mem, __mask) & __mask; })
> 
> -# define catomic_exchange_and_add(mem, value)		\
> -  atomic_exchange_and_add (mem, value)
> -# define catomic_max(mem, value) atomic_max (mem, value)
> -
> #else /* __riscv_atomic */
> # error "ISAs that do not subsume the A extension are not supported"
> #endif /* !__riscv_atomic */

Ok.

> diff --git a/sysdeps/x86/atomic-machine.h b/sysdeps/x86/atomic-machine.h
> index f24f1c71ed718c601c71decc1ee0c4b49fdf32f8..920390f4e629106040415af4623fcd826b242da2 100644
> --- a/sysdeps/x86/atomic-machine.h
> +++ b/sysdeps/x86/atomic-machine.h
> @@ -52,52 +52,7 @@
>   (! __sync_bool_compare_and_swap (mem, oldval, newval))
> 
> 
> -#define __arch_c_compare_and_exchange_val_8_acq(mem, newval, oldval) \
> -  ({ __typeof (*mem) ret;						      \
> -     __asm __volatile ("cmpl $0, %%" SEG_REG ":%P5\n\t"			      \
> -		       "je 0f\n\t"					      \
> -		       "lock\n"						      \
> -		       "0:\tcmpxchgb %b2, %1"				      \
> -		       : "=a" (ret), "=m" (*mem)			      \
> -		       : BR_CONSTRAINT (newval), "m" (*mem), "0" (oldval),    \
> -			 "i" (offsetof (tcbhead_t, multiple_threads)));	      \
> -     ret; })
> -
> -#define __arch_c_compare_and_exchange_val_16_acq(mem, newval, oldval) \
> -  ({ __typeof (*mem) ret;						      \
> -     __asm __volatile ("cmpl $0, %%" SEG_REG ":%P5\n\t"			      \
> -		       "je 0f\n\t"					      \
> -		       "lock\n"						      \
> -		       "0:\tcmpxchgw %w2, %1"				      \
> -		       : "=a" (ret), "=m" (*mem)			      \
> -		       : BR_CONSTRAINT (newval), "m" (*mem), "0" (oldval),    \
> -			 "i" (offsetof (tcbhead_t, multiple_threads)));	      \
> -     ret; })
> -
> -#define __arch_c_compare_and_exchange_val_32_acq(mem, newval, oldval) \
> -  ({ __typeof (*mem) ret;						      \
> -     __asm __volatile ("cmpl $0, %%" SEG_REG ":%P5\n\t"			      \
> -		       "je 0f\n\t"					      \
> -		       "lock\n"						      \
> -		       "0:\tcmpxchgl %2, %1"				      \
> -		       : "=a" (ret), "=m" (*mem)			      \
> -		       : BR_CONSTRAINT (newval), "m" (*mem), "0" (oldval),    \
> -			 "i" (offsetof (tcbhead_t, multiple_threads)));       \
> -     ret; })
> -
> #ifdef __x86_64__
> -# define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval) \
> -  ({ __typeof (*mem) ret;						      \
> -     __asm __volatile ("cmpl $0, %%fs:%P5\n\t"				      \
> -		       "je 0f\n\t"					      \
> -		       "lock\n"						      \
> -		       "0:\tcmpxchgq %q2, %1"				      \
> -		       : "=a" (ret), "=m" (*mem)			      \
> -		       : "q" ((int64_t) cast_to_integer (newval)),	      \
> -			 "m" (*mem),					      \
> -			 "0" ((int64_t) cast_to_integer (oldval)),	      \
> -			 "i" (offsetof (tcbhead_t, multiple_threads)));	      \
> -     ret; })
> # define do_exchange_and_add_val_64_acq(pfx, mem, value) 0
> # define do_add_val_64_acq(pfx, mem, value) do { } while (0)
> #else
> @@ -107,13 +62,6 @@
>    such an operation.  So don't define any code for now.  If it is
>    really going to be used the code below can be used on Intel Pentium
>    and later, but NOT on i486.  */
> -# define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval) \
> -  ({ __typeof (*mem) ret = *(mem);					      \
> -     __atomic_link_error ();						      \
> -     ret = (newval);							      \
> -     ret = (oldval);							      \
> -     ret; })
> -
> # define __arch_compare_and_exchange_val_64_acq(mem, newval, oldval)	      \
>   ({ __typeof (*mem) ret = *(mem);					      \
>      __atomic_link_error ();						      \
> @@ -206,14 +154,6 @@
> #define atomic_exchange_and_add(mem, value) \
>   __sync_fetch_and_add (mem, value)
> 
> -#define __arch_exchange_and_add_cprefix \
> -  "cmpl $0, %%" SEG_REG ":%P4\n\tje 0f\n\tlock\n0:\t"
> -
> -#define catomic_exchange_and_add(mem, value) \
> -  __arch_exchange_and_add_body (__arch_exchange_and_add_cprefix, __arch_c,    \
> -				mem, value)
> -
> -
> #define __arch_add_body(lock, pfx, apfx, mem, value) \
>   do {									      \
>     if (__builtin_constant_p (value) && (value) == 1)			      \
> @@ -248,13 +188,6 @@
> # define atomic_add(mem, value) \
>   __arch_add_body (LOCK_PREFIX, atomic, __arch, mem, value)
> 
> -#define __arch_add_cprefix \
> -  "cmpl $0, %%" SEG_REG ":%P3\n\tje 0f\n\tlock\n0:\t"
> -
> -#define catomic_add(mem, value) \
> -  __arch_add_body (__arch_add_cprefix, atomic, __arch_c, mem, value)
> -
> -
> #define atomic_add_negative(mem, value) \
>   ({ unsigned char __result;						      \
>      if (sizeof (*mem) == 1)						      \
> @@ -331,13 +264,6 @@
> 
> #define atomic_increment(mem) __arch_increment_body (LOCK_PREFIX, __arch, mem)
> 
> -#define __arch_increment_cprefix \
> -  "cmpl $0, %%" SEG_REG ":%P2\n\tje 0f\n\tlock\n0:\t"
> -
> -#define catomic_increment(mem) \
> -  __arch_increment_body (__arch_increment_cprefix, __arch_c, mem)
> -
> -
> #define atomic_increment_and_test(mem) \
>   ({ unsigned char __result;						      \
>      if (sizeof (*mem) == 1)						      \
> @@ -389,13 +315,6 @@
> 
> #define atomic_decrement(mem) __arch_decrement_body (LOCK_PREFIX, __arch, mem)
> 
> -#define __arch_decrement_cprefix \
> -  "cmpl $0, %%" SEG_REG ":%P2\n\tje 0f\n\tlock\n0:\t"
> -
> -#define catomic_decrement(mem) \
> -  __arch_decrement_body (__arch_decrement_cprefix, __arch_c, mem)
> -
> -
> #define atomic_decrement_and_test(mem) \
>   ({ unsigned char __result;						      \
>      if (sizeof (*mem) == 1)						      \
> @@ -493,14 +412,8 @@
>       __atomic_link_error ();						      \
>   } while (0)
> 
> -#define __arch_cprefix \
> -  "cmpl $0, %%" SEG_REG ":%P3\n\tje 0f\n\tlock\n0:\t"
> -
> #define atomic_and(mem, mask) __arch_and_body (LOCK_PREFIX, mem, mask)
> 
> -#define catomic_and(mem, mask) __arch_and_body (__arch_cprefix, mem, mask)
> -
> -
> #define __arch_or_body(lock, mem, mask) \
>   do {									      \
>     if (sizeof (*mem) == 1)						      \
> @@ -529,8 +442,6 @@
> 
> #define atomic_or(mem, mask) __arch_or_body (LOCK_PREFIX, mem, mask)
> 
> -#define catomic_or(mem, mask) __arch_or_body (__arch_cprefix, mem, mask)
> -
> /* We don't use mfence because it is supposedly slower due to having to
>    provide stronger guarantees (e.g., regarding self-modifying code).  */
> #define atomic_full_barrier() \
> 

Ok.
Wilco Dijkstra June 17, 2022, 11:56 a.m. UTC | #2
Hi Adhemerval,

> LGTM, thanks. Some comments below, but I can send a cleanup patch if you prefer.
>
> #ifndef COMPARE_AND_SWAP
>
> I think we can also remove this macro, only ia64 redefines it to old compiler builtin
> (__sync_bool_compare_and_swap).

I removed it from ia64 and the #ifdef/#endif elsewhere. However the macro takes its arguments in a
different order than atomic_compare_and_exchange_bool_acq, so it seems best not to substitute it at
the call sites for now.

Also I noticed some dead uses of multiple_threads, so I have removed those
(in principle your single-thread patch series no longer needs to make changes
to sysdeps/x86/atomic-machine.h). Rerunning build-many-glibcs for a few arches passed.

Cheers,
Wilco

v2: Remove COMPARE_AND_SWAP ifdefs and dead uses of multiple_threads.

The catomics are not supported on most targets and are only used in a few places which are not
performance critical, so replace all uses with more standard atomics.
Replace uses of catomic_add, catomic_increment, catomic_decrement and catomic_exchange_and_add with
atomic_fetch_add_relaxed which maps to a standard compiler builtin. Relaxed memory ordering is
correct for simple counters since they only need atomicity.

---
diff --git a/elf/dl-fptr.c b/elf/dl-fptr.c
index 6645a260b809ecd521796e0d1adee56b3e0bd993..ffc83ef78bc16f36773a51da7442646f47e85b71 100644
--- a/elf/dl-fptr.c
+++ b/elf/dl-fptr.c
@@ -38,10 +38,8 @@
 # error "ELF_MACHINE_LOAD_ADDRESS is not defined."
 #endif
 
-#ifndef COMPARE_AND_SWAP
-# define COMPARE_AND_SWAP(ptr, old, new) \
-  (catomic_compare_and_exchange_bool_acq (ptr, new, old) == 0)
-#endif
+#define COMPARE_AND_SWAP(ptr, old, new) \
+  (atomic_compare_and_exchange_bool_acq (ptr, new, old) == 0)
 
 ElfW(Addr) _dl_boot_fptr_table [ELF_MACHINE_BOOT_FPTR_TABLE_LEN];
 
diff --git a/elf/dl-profile.c b/elf/dl-profile.c
index ec57e3a96552ae6460c22a0fcc819b85d486c0da..0af1f577d2d695d08edce9e13d9b39f77911b1d5 100644
--- a/elf/dl-profile.c
+++ b/elf/dl-profile.c
@@ -548,24 +548,24 @@ _dl_mcount (ElfW(Addr) frompc, ElfW(Addr) selfpc)
 	      size_t newfromidx;
 	      to_index = (data[narcs].self_pc
 			  / (HASHFRACTION * sizeof (*tos)));
-	      newfromidx = catomic_exchange_and_add (&fromidx, 1) + 1;
+	      newfromidx = atomic_fetch_add_relaxed (&fromidx, 1) + 1;
 	      froms[newfromidx].here = &data[narcs];
 	      froms[newfromidx].link = tos[to_index];
 	      tos[to_index] = newfromidx;
-	      catomic_increment (&narcs);
+	      atomic_fetch_add_relaxed (&narcs, 1);
 	    }
 
 	  /* If we still have no entry stop searching and insert.  */
 	  if (*topcindex == 0)
 	    {
-	      unsigned int newarc = catomic_exchange_and_add (narcsp, 1);
+	      unsigned int newarc = atomic_fetch_add_relaxed (narcsp, 1);
 
 	      /* In rare cases it could happen that all entries in FROMS are
 		 occupied.  So we cannot count this anymore.  */
 	      if (newarc >= fromlimit)
 		goto done;
 
-	      *topcindex = catomic_exchange_and_add (&fromidx, 1) + 1;
+	      *topcindex = atomic_fetch_add_relaxed (&fromidx, 1) + 1;
 	      fromp = &froms[*topcindex];
 
 	      fromp->here = &data[newarc];
@@ -573,7 +573,7 @@ _dl_mcount (ElfW(Addr) frompc, ElfW(Addr) selfpc)
 	      data[newarc].self_pc = selfpc;
 	      data[newarc].count = 0;
 	      fromp->link = 0;
-	      catomic_increment (&narcs);
+	      atomic_fetch_add_relaxed (&narcs, 1);
 
 	      break;
 	    }
@@ -586,7 +586,7 @@ _dl_mcount (ElfW(Addr) frompc, ElfW(Addr) selfpc)
     }
 
   /* Increment the counter.  */
-  catomic_increment (&fromp->here->count);
+  atomic_fetch_add_relaxed (&fromp->here->count, 1);
 
  done:
   ;
diff --git a/include/atomic.h b/include/atomic.h
index 2cb52c9cfd894308b97b97a04dd574b2287bf1b2..264db9a0b7619ff6520f84a19c53c1eb9a3b42a3 100644
--- a/include/atomic.h
+++ b/include/atomic.h
@@ -24,13 +24,6 @@
    - atomic arithmetic and logic operation on memory.  They all
      have the prefix "atomic_".
 
-   - conditionally atomic operations of the same kinds.  These
-     always behave identical but can be faster when atomicity
-     is not really needed since only one thread has access to
-     the memory location.  In that case the code is slower in
-     the multi-thread case.  The interfaces have the prefix
-     "catomic_".
-
    - support functions like barriers.  They also have the prefix
      "atomic_".
 
@@ -93,29 +86,6 @@
 #endif
 
 
-#ifndef catomic_compare_and_exchange_val_acq
-# ifdef __arch_c_compare_and_exchange_val_32_acq
-#  define catomic_compare_and_exchange_val_acq(mem, newval, oldval) \
-  __atomic_val_bysize (__arch_c_compare_and_exchange_val,acq,		      \
-		       mem, newval, oldval)
-# else
-#  define catomic_compare_and_exchange_val_acq(mem, newval, oldval) \
-  atomic_compare_and_exchange_val_acq (mem, newval, oldval)
-# endif
-#endif
-
-
-#ifndef catomic_compare_and_exchange_val_rel
-# ifndef atomic_compare_and_exchange_val_rel
-#  define catomic_compare_and_exchange_val_rel(mem, newval, oldval)	      \
-  catomic_compare_and_exchange_val_acq (mem, newval, oldval)
-# else
-#  define catomic_compare_and_exchange_val_rel(mem, newval, oldval)	      \
-  atomic_compare_and_exchange_val_rel (mem, newval, oldval)
-# endif
-#endif
-
-
 #ifndef atomic_compare_and_exchange_val_rel
 # define atomic_compare_and_exchange_val_rel(mem, newval, oldval)	      \
   atomic_compare_and_exchange_val_acq (mem, newval, oldval)
@@ -141,23 +111,6 @@
 #endif
 
 
-#ifndef catomic_compare_and_exchange_bool_acq
-# ifdef __arch_c_compare_and_exchange_bool_32_acq
-#  define catomic_compare_and_exchange_bool_acq(mem, newval, oldval) \
-  __atomic_bool_bysize (__arch_c_compare_and_exchange_bool,acq,		      \
-		        mem, newval, oldval)
-# else
-#  define catomic_compare_and_exchange_bool_acq(mem, newval, oldval) \
-  ({ /* Cannot use __oldval here, because macros later in this file might     \
-	call this macro with __oldval argument.	 */			      \
-     __typeof (oldval) __atg4_old = (oldval);				      \
-     catomic_compare_and_exchange_val_acq (mem, newval, __atg4_old)	      \
-       != __atg4_old;							      \
-  })
-# endif
-#endif
-
-
 /* Store NEWVALUE in *MEM and return the old value.  */
 #ifndef atomic_exchange_acq
 # define atomic_exchange_acq(mem, newvalue) \
@@ -212,24 +165,6 @@
   atomic_exchange_and_add_acq(mem, value)
 #endif
 
-#ifndef catomic_exchange_and_add
-# define catomic_exchange_and_add(mem, value) \
-  ({ __typeof (*(mem)) __atg7_oldv;					      \
-     __typeof (mem) __atg7_memp = (mem);				      \
-     __typeof (*(mem)) __atg7_value = (value);				      \
-									      \
-     do									      \
-       __atg7_oldv = *__atg7_memp;					      \
-     while (__builtin_expect						      \
-	    (catomic_compare_and_exchange_bool_acq (__atg7_memp,	      \
-						    __atg7_oldv		      \
-						    + __atg7_value,	      \
-						    __atg7_oldv), 0));	      \
-									      \
-     __atg7_oldv; })
-#endif
-
-
 #ifndef atomic_max
 # define atomic_max(mem, value) \
   do {									      \
@@ -246,25 +181,6 @@
   } while (0)
 #endif
 
-
-#ifndef catomic_max
-# define catomic_max(mem, value) \
-  do {									      \
-    __typeof (*(mem)) __atg9_oldv;					      \
-    __typeof (mem) __atg9_memp = (mem);					      \
-    __typeof (*(mem)) __atg9_value = (value);				      \
-    do {								      \
-      __atg9_oldv = *__atg9_memp;					      \
-      if (__atg9_oldv >= __atg9_value)					      \
-	break;								      \
-    } while (__builtin_expect						      \
-	     (catomic_compare_and_exchange_bool_acq (__atg9_memp,	      \
-						     __atg9_value,	      \
-						     __atg9_oldv), 0));	      \
-  } while (0)
-#endif
-
-
 #ifndef atomic_min
 # define atomic_min(mem, value) \
   do {									      \
@@ -288,32 +204,16 @@
 #endif
 
 
-#ifndef catomic_add
-# define catomic_add(mem, value) \
-  (void) catomic_exchange_and_add ((mem), (value))
-#endif
-
-
 #ifndef atomic_increment
 # define atomic_increment(mem) atomic_add ((mem), 1)
 #endif
 
 
-#ifndef catomic_increment
-# define catomic_increment(mem) catomic_add ((mem), 1)
-#endif
-
-
 #ifndef atomic_increment_val
 # define atomic_increment_val(mem) (atomic_exchange_and_add ((mem), 1) + 1)
 #endif
 
 
-#ifndef catomic_increment_val
-# define catomic_increment_val(mem) (catomic_exchange_and_add ((mem), 1) + 1)
-#endif
-
-
 /* Add one to *MEM and return true iff it's now zero.  */
 #ifndef atomic_increment_and_test
 # define atomic_increment_and_test(mem) \
@@ -326,21 +226,11 @@
 #endif
 
 
-#ifndef catomic_decrement
-# define catomic_decrement(mem) catomic_add ((mem), -1)
-#endif
-
-
 #ifndef atomic_decrement_val
 # define atomic_decrement_val(mem) (atomic_exchange_and_add ((mem), -1) - 1)
 #endif
 
 
-#ifndef catomic_decrement_val
-# define catomic_decrement_val(mem) (catomic_exchange_and_add ((mem), -1) - 1)
-#endif
-
-
 /* Subtract 1 from *MEM and return true iff it's now zero.  */
 #ifndef atomic_decrement_and_test
 # define atomic_decrement_and_test(mem) \
@@ -421,22 +311,6 @@
   } while (0)
 #endif
 
-#ifndef catomic_and
-# define catomic_and(mem, mask) \
-  do {									      \
-    __typeof (*(mem)) __atg20_old;					      \
-    __typeof (mem) __atg20_memp = (mem);				      \
-    __typeof (*(mem)) __atg20_mask = (mask);				      \
-									      \
-    do									      \
-      __atg20_old = (*__atg20_memp);					      \
-    while (__builtin_expect						      \
-	   (catomic_compare_and_exchange_bool_acq (__atg20_memp,	      \
-						   __atg20_old & __atg20_mask,\
-						   __atg20_old), 0));	      \
-  } while (0)
-#endif
-
 /* Atomically *mem &= mask and return the old value of *mem.  */
 #ifndef atomic_and_val
 # define atomic_and_val(mem, mask) \
@@ -471,22 +345,6 @@
   } while (0)
 #endif
 
-#ifndef catomic_or
-# define catomic_or(mem, mask) \
-  do {									      \
-    __typeof (*(mem)) __atg18_old;					      \
-    __typeof (mem) __atg18_memp = (mem);				      \
-    __typeof (*(mem)) __atg18_mask = (mask);				      \
-									      \
-    do									      \
-      __atg18_old = (*__atg18_memp);					      \
-    while (__builtin_expect						      \
-	   (catomic_compare_and_exchange_bool_acq (__atg18_memp,	      \
-						   __atg18_old | __atg18_mask,\
-						   __atg18_old), 0));	      \
-  } while (0)
-#endif
-
 /* Atomically *mem |= mask and return the old value of *mem.  */
 #ifndef atomic_or_val
 # define atomic_or_val(mem, mask) \
diff --git a/malloc/arena.c b/malloc/arena.c
index 0a684a720d9648953073bd7d35faca100762c031..7c74a18381a4be5fe6bcb94b38a62dbfa6b674f4 100644
--- a/malloc/arena.c
+++ b/malloc/arena.c
@@ -958,11 +958,11 @@ arena_get2 (size_t size, mstate avoid_arena)
          enough address space to create that many arenas.  */
       if (__glibc_unlikely (n <= narenas_limit - 1))
         {
-          if (catomic_compare_and_exchange_bool_acq (&narenas, n + 1, n))
+          if (atomic_compare_and_exchange_bool_acq (&narenas, n + 1, n))
             goto repeat;
           a = _int_new_arena (size);
 	  if (__glibc_unlikely (a == NULL))
-            catomic_decrement (&narenas);
+            atomic_fetch_add_relaxed (&narenas, -1);
         }
       else
         a = reused_arena (avoid_arena);
diff --git a/malloc/malloc.c b/malloc/malloc.c
index 09e5ff2bce5d79b3754687db3aef314640d336eb..a335cd23e18ee09027a8e7a71d808bfa1343e06d 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -2486,11 +2486,11 @@ sysmalloc_mmap (INTERNAL_SIZE_T nb, size_t pagesize, int extra_flags, mstate av)
     }
 
   /* update statistics */
-  int new = atomic_exchange_and_add (&mp_.n_mmaps, 1) + 1;
+  int new = atomic_fetch_add_relaxed (&mp_.n_mmaps, 1) + 1;
   atomic_max (&mp_.max_n_mmaps, new);
 
   unsigned long sum;
-  sum = atomic_exchange_and_add (&mp_.mmapped_mem, size) + size;
+  sum = atomic_fetch_add_relaxed (&mp_.mmapped_mem, size) + size;
   atomic_max (&mp_.max_mmapped_mem, sum);
 
   check_chunk (av, p);
@@ -3059,8 +3059,8 @@ munmap_chunk (mchunkptr p)
       || __glibc_unlikely (!powerof2 (mem & (pagesize - 1))))
     malloc_printerr ("munmap_chunk(): invalid pointer");
 
-  atomic_decrement (&mp_.n_mmaps);
-  atomic_add (&mp_.mmapped_mem, -total_size);
+  atomic_fetch_add_relaxed (&mp_.n_mmaps, -1);
+  atomic_fetch_add_relaxed (&mp_.mmapped_mem, -total_size);
 
   /* If munmap failed the process virtual memory address space is in a
      bad shape.  Just leave the block hanging around, the process will
@@ -3110,7 +3110,7 @@ mremap_chunk (mchunkptr p, size_t new_size)
   set_head (p, (new_size - offset) | IS_MMAPPED);
 
   INTERNAL_SIZE_T new;
-  new = atomic_exchange_and_add (&mp_.mmapped_mem, new_size - size - offset)
+  new = atomic_fetch_add_relaxed (&mp_.mmapped_mem, new_size - size - offset)
         + new_size - size - offset;
   atomic_max (&mp_.max_mmapped_mem, new);
   return p;
@@ -3832,7 +3832,7 @@ _int_malloc (mstate av, size_t bytes)
       if (__glibc_unlikely (pp != NULL && misaligned_chunk (pp)))       \
 	malloc_printerr ("malloc(): unaligned fastbin chunk detected"); \
     }							\
-  while ((pp = catomic_compare_and_exchange_val_acq (fb, pp, victim)) \
+  while ((pp = atomic_compare_and_exchange_val_acq (fb, pp, victim)) \
 	 != victim);					\
 
   if ((unsigned long) (nb) <= (unsigned long) (get_max_fast ()))
@@ -4550,7 +4550,7 @@ _int_free (mstate av, mchunkptr p, int have_lock)
 	  old2 = old;
 	  p->fd = PROTECT_PTR (&p->fd, old);
 	}
-      while ((old = catomic_compare_and_exchange_val_rel (fb, p, old2))
+      while ((old = atomic_compare_and_exchange_val_rel (fb, p, old2))
 	     != old2);
 
     /* Check that size of fastbin chunk at the top is the same as
diff --git a/malloc/memusage.c b/malloc/memusage.c
index f30906dffb2731c104ea375af48f59c65bcc7c9c..74712834fa8b96fb2d9589d34b34ab07d05a84ca 100644
--- a/malloc/memusage.c
+++ b/malloc/memusage.c
@@ -148,8 +148,8 @@ update_data (struct header *result, size_t len, size_t old_len)
 
   /* Compute current heap usage and compare it with the maximum value.  */
   size_t heap
-    = catomic_exchange_and_add (&current_heap, len - old_len) + len - old_len;
-  catomic_max (&peak_heap, heap);
+    = atomic_fetch_add_relaxed (&current_heap, len - old_len) + len - old_len;
+  atomic_max (&peak_heap, heap);
 
   /* Compute current stack usage and compare it with the maximum
      value.  The base stack pointer might not be set if this is not
@@ -172,15 +172,15 @@ update_data (struct header *result, size_t len, size_t old_len)
     start_sp = sp;
   size_t current_stack = start_sp - sp;
 #endif
-  catomic_max (&peak_stack, current_stack);
+  atomic_max (&peak_stack, current_stack);
 
   /* Add up heap and stack usage and compare it with the maximum value.  */
-  catomic_max (&peak_total, heap + current_stack);
+  atomic_max (&peak_total, heap + current_stack);
 
   /* Store the value only if we are writing to a file.  */
   if (fd != -1)
     {
-      uint32_t idx = catomic_exchange_and_add (&buffer_cnt, 1);
+      uint32_t idx = atomic_fetch_add_relaxed (&buffer_cnt, 1);
       if (idx + 1 >= 2 * buffer_size)
         {
           /* We try to reset the counter to the correct range.  If
@@ -188,7 +188,7 @@ update_data (struct header *result, size_t len, size_t old_len)
              counter it does not matter since that thread will take
              care of the correction.  */
           uint32_t reset = (idx + 1) % (2 * buffer_size);
-          catomic_compare_and_exchange_val_acq (&buffer_cnt, reset, idx + 1);
+          atomic_compare_and_exchange_val_acq (&buffer_cnt, reset, idx + 1);
           if (idx >= 2 * buffer_size)
             idx = reset - 1;
         }
@@ -362,24 +362,24 @@ malloc (size_t len)
     return (*mallocp)(len);
 
   /* Keep track of number of calls.  */
-  catomic_increment (&calls[idx_malloc]);
+  atomic_fetch_add_relaxed (&calls[idx_malloc], 1);
   /* Keep track of total memory consumption for `malloc'.  */
-  catomic_add (&total[idx_malloc], len);
+  atomic_fetch_add_relaxed (&total[idx_malloc], len);
   /* Keep track of total memory requirement.  */
-  catomic_add (&grand_total, len);
+  atomic_fetch_add_relaxed (&grand_total, len);
   /* Remember the size of the request.  */
   if (len < 65536)
-    catomic_increment (&histogram[len / 16]);
+    atomic_fetch_add_relaxed (&histogram[len / 16], 1);
   else
-    catomic_increment (&large);
+    atomic_fetch_add_relaxed (&large, 1);
   /* Total number of calls of any of the functions.  */
-  catomic_increment (&calls_total);
+  atomic_fetch_add_relaxed (&calls_total, 1);
 
   /* Do the real work.  */
   result = (struct header *) (*mallocp)(len + sizeof (struct header));
   if (result == NULL)
     {
-      catomic_increment (&failed[idx_malloc]);
+      atomic_fetch_add_relaxed (&failed[idx_malloc], 1);
       return NULL;
     }
 
@@ -430,21 +430,21 @@ realloc (void *old, size_t len)
     }
 
   /* Keep track of number of calls.  */
-  catomic_increment (&calls[idx_realloc]);
+  atomic_fetch_add_relaxed (&calls[idx_realloc], 1);
   if (len > old_len)
     {
       /* Keep track of total memory consumption for `realloc'.  */
-      catomic_add (&total[idx_realloc], len - old_len);
+      atomic_fetch_add_relaxed (&total[idx_realloc], len - old_len);
       /* Keep track of total memory requirement.  */
-      catomic_add (&grand_total, len - old_len);
+      atomic_fetch_add_relaxed (&grand_total, len - old_len);
     }
 
   if (len == 0 && old != NULL)
     {
       /* Special case.  */
-      catomic_increment (&realloc_free);
+      atomic_fetch_add_relaxed (&realloc_free, 1);
       /* Keep track of total memory freed using `free'.  */
-      catomic_add (&total[idx_free], real->length);
+      atomic_fetch_add_relaxed (&total[idx_free], real->length);
 
       /* Update the allocation data and write out the records if necessary.  */
       update_data (NULL, 0, old_len);
@@ -457,26 +457,26 @@ realloc (void *old, size_t len)
 
   /* Remember the size of the request.  */
   if (len < 65536)
-    catomic_increment (&histogram[len / 16]);
+    atomic_fetch_add_relaxed (&histogram[len / 16], 1);
   else
-    catomic_increment (&large);
+    atomic_fetch_add_relaxed (&large, 1);
   /* Total number of calls of any of the functions.  */
-  catomic_increment (&calls_total);
+  atomic_fetch_add_relaxed (&calls_total, 1);
 
   /* Do the real work.  */
   result = (struct header *) (*reallocp)(real, len + sizeof (struct header));
   if (result == NULL)
     {
-      catomic_increment (&failed[idx_realloc]);
+      atomic_fetch_add_relaxed (&failed[idx_realloc], 1);
       return NULL;
     }
 
   /* Record whether the reduction/increase happened in place.  */
   if (real == result)
-    catomic_increment (&inplace);
+    atomic_fetch_add_relaxed (&inplace, 1);
   /* Was the buffer increased?  */
   if (old_len > len)
-    catomic_increment (&decreasing);
+    atomic_fetch_add_relaxed (&decreasing, 1);
 
   /* Update the allocation data and write out the records if necessary.  */
   update_data (result, len, old_len);
@@ -508,16 +508,16 @@ calloc (size_t n, size_t len)
     return (*callocp)(n, len);
 
   /* Keep track of number of calls.  */
-  catomic_increment (&calls[idx_calloc]);
+  atomic_fetch_add_relaxed (&calls[idx_calloc], 1);
   /* Keep track of total memory consumption for `calloc'.  */
-  catomic_add (&total[idx_calloc], size);
+  atomic_fetch_add_relaxed (&total[idx_calloc], size);
   /* Keep track of total memory requirement.  */
-  catomic_add (&grand_total, size);
+  atomic_fetch_add_relaxed (&grand_total, size);
   /* Remember the size of the request.  */
   if (size < 65536)
-    catomic_increment (&histogram[size / 16]);
+    atomic_fetch_add_relaxed (&histogram[size / 16], 1);
   else
-    catomic_increment (&large);
+    atomic_fetch_add_relaxed (&large, 1);
   /* Total number of calls of any of the functions.  */
   ++calls_total;
 
@@ -525,7 +525,7 @@ calloc (size_t n, size_t len)
   result = (struct header *) (*mallocp)(size + sizeof (struct header));
   if (result == NULL)
     {
-      catomic_increment (&failed[idx_calloc]);
+      atomic_fetch_add_relaxed (&failed[idx_calloc], 1);
       return NULL;
     }
 
@@ -563,7 +563,7 @@ free (void *ptr)
   /* `free (NULL)' has no effect.  */
   if (ptr == NULL)
     {
-      catomic_increment (&calls[idx_free]);
+      atomic_fetch_add_relaxed (&calls[idx_free], 1);
       return;
     }
 
@@ -577,9 +577,9 @@ free (void *ptr)
     }
 
   /* Keep track of number of calls.  */
-  catomic_increment (&calls[idx_free]);
+  atomic_fetch_add_relaxed (&calls[idx_free], 1);
   /* Keep track of total memory freed using `free'.  */
-  catomic_add (&total[idx_free], real->length);
+  atomic_fetch_add_relaxed (&total[idx_free], real->length);
 
   /* Update the allocation data and write out the records if necessary.  */
   update_data (NULL, 0, real->length);
@@ -614,22 +614,22 @@ mmap (void *start, size_t len, int prot, int flags, int fd, off_t offset)
                  ? idx_mmap_a : prot & PROT_WRITE ? idx_mmap_w : idx_mmap_r);
 
       /* Keep track of number of calls.  */
-      catomic_increment (&calls[idx]);
+      atomic_fetch_add_relaxed (&calls[idx], 1);
       /* Keep track of total memory consumption for `malloc'.  */
-      catomic_add (&total[idx], len);
+      atomic_fetch_add_relaxed (&total[idx], len);
       /* Keep track of total memory requirement.  */
-      catomic_add (&grand_total, len);
+      atomic_fetch_add_relaxed (&grand_total, len);
       /* Remember the size of the request.  */
       if (len < 65536)
-        catomic_increment (&histogram[len / 16]);
+        atomic_fetch_add_relaxed (&histogram[len / 16], 1);
       else
-        catomic_increment (&large);
+        atomic_fetch_add_relaxed (&large, 1);
       /* Total number of calls of any of the functions.  */
-      catomic_increment (&calls_total);
+      atomic_fetch_add_relaxed (&calls_total, 1);
 
       /* Check for failures.  */
       if (result == NULL)
-        catomic_increment (&failed[idx]);
+        atomic_fetch_add_relaxed (&failed[idx], 1);
       else if (idx == idx_mmap_w)
         /* Update the allocation data and write out the records if
            necessary.  Note the first parameter is NULL which means
@@ -667,22 +667,22 @@ mmap64 (void *start, size_t len, int prot, int flags, int fd, off64_t offset)
                  ? idx_mmap_a : prot & PROT_WRITE ? idx_mmap_w : idx_mmap_r);
 
       /* Keep track of number of calls.  */
-      catomic_increment (&calls[idx]);
+      atomic_fetch_add_relaxed (&calls[idx], 1);
       /* Keep track of total memory consumption for `malloc'.  */
-      catomic_add (&total[idx], len);
+      atomic_fetch_add_relaxed (&total[idx], len);
       /* Keep track of total memory requirement.  */
-      catomic_add (&grand_total, len);
+      atomic_fetch_add_relaxed (&grand_total, len);
       /* Remember the size of the request.  */
       if (len < 65536)
-        catomic_increment (&histogram[len / 16]);
+        atomic_fetch_add_relaxed (&histogram[len / 16], 1);
       else
-        catomic_increment (&large);
+        atomic_fetch_add_relaxed (&large, 1);
       /* Total number of calls of any of the functions.  */
-      catomic_increment (&calls_total);
+      atomic_fetch_add_relaxed (&calls_total, 1);
 
       /* Check for failures.  */
       if (result == NULL)
-        catomic_increment (&failed[idx]);
+        atomic_fetch_add_relaxed (&failed[idx], 1);
       else if (idx == idx_mmap_w)
         /* Update the allocation data and write out the records if
            necessary.  Note the first parameter is NULL which means
@@ -722,33 +722,33 @@ mremap (void *start, size_t old_len, size_t len, int flags, ...)
   if (!not_me && trace_mmap)
     {
       /* Keep track of number of calls.  */
-      catomic_increment (&calls[idx_mremap]);
+      atomic_fetch_add_relaxed (&calls[idx_mremap], 1);
       if (len > old_len)
         {
           /* Keep track of total memory consumption for `malloc'.  */
-          catomic_add (&total[idx_mremap], len - old_len);
+          atomic_fetch_add_relaxed (&total[idx_mremap], len - old_len);
           /* Keep track of total memory requirement.  */
-          catomic_add (&grand_total, len - old_len);
+          atomic_fetch_add_relaxed (&grand_total, len - old_len);
         }
       /* Remember the size of the request.  */
       if (len < 65536)
-        catomic_increment (&histogram[len / 16]);
+        atomic_fetch_add_relaxed (&histogram[len / 16], 1);
       else
-        catomic_increment (&large);
+        atomic_fetch_add_relaxed (&large, 1);
       /* Total number of calls of any of the functions.  */
-      catomic_increment (&calls_total);
+      atomic_fetch_add_relaxed (&calls_total, 1);
 
       /* Check for failures.  */
       if (result == NULL)
-        catomic_increment (&failed[idx_mremap]);
+        atomic_fetch_add_relaxed (&failed[idx_mremap], 1);
       else
         {
           /* Record whether the reduction/increase happened in place.  */
           if (start == result)
-            catomic_increment (&inplace_mremap);
+            atomic_fetch_add_relaxed (&inplace_mremap, 1);
           /* Was the buffer increased?  */
           if (old_len > len)
-            catomic_increment (&decreasing_mremap);
+            atomic_fetch_add_relaxed (&decreasing_mremap, 1);
 
           /* Update the allocation data and write out the records if
              necessary.  Note the first parameter is NULL which means
@@ -783,19 +783,19 @@ munmap (void *start, size_t len)
   if (!not_me && trace_mmap)
     {
       /* Keep track of number of calls.  */
-      catomic_increment (&calls[idx_munmap]);
+      atomic_fetch_add_relaxed (&calls[idx_munmap], 1);
 
       if (__glibc_likely (result == 0))
         {
           /* Keep track of total memory freed using `free'.  */
-          catomic_add (&total[idx_munmap], len);
+          atomic_fetch_add_relaxed (&total[idx_munmap], len);
 
           /* Update the allocation data and write out the records if
              necessary.  */
           update_data (NULL, 0, len);
         }
       else
-        catomic_increment (&failed[idx_munmap]);
+        atomic_fetch_add_relaxed (&failed[idx_munmap], 1);
     }
 
   return result;
diff --git a/manual/memory.texi b/manual/memory.texi
index 23a039c57e60c81787252d935e3b309fd8290902..5cb1dbd281006148f23cfa38c5703fb79089ba78 100644
--- a/manual/memory.texi
+++ b/manual/memory.texi
@@ -354,7 +354,7 @@ this function is in @file{stdlib.h}.
 @c that's protected by list_lock; next_free is only modified while
 @c list_lock is held too.  All other data members of an arena, as well
 @c as the metadata of the memory areas assigned to it, are only modified
-@c while holding the arena's mutex (fastbin pointers use catomic ops
+@c while holding the arena's mutex (fastbin pointers use atomic ops
 @c because they may be modified by free without taking the arena's
 @c lock).  Some reassurance was needed for fastbins, for it wasn't clear
 @c how they were initialized.  It turns out they are always
@@ -383,7 +383,7 @@ this function is in @file{stdlib.h}.
 @c     mutex_lock (arena lock) dup @asulock @aculock [returns locked]
 @c    __get_nprocs ext ok @acsfd
 @c    NARENAS_FROM_NCORES ok
-@c    catomic_compare_and_exchange_bool_acq ok
+@c    atomic_compare_and_exchange_bool_acq ok
 @c    _int_new_arena ok @asulock @aculock @acsmem
 @c     new_heap ok @acsmem
 @c      mmap ok @acsmem
@@ -397,7 +397,7 @@ this function is in @file{stdlib.h}.
 @c     mutex_lock (list_lock) dup @asulock @aculock
 @c     atomic_write_barrier ok
 @c     mutex_unlock (list_lock) @aculock
-@c    catomic_decrement ok
+@c    atomic_decrement ok
 @c    reused_arena @asulock @aculock
 @c      reads&writes next_to_use and iterates over arena next without guards
 @c      those are harmless as long as we don't drop arenas from the
@@ -414,7 +414,7 @@ this function is in @file{stdlib.h}.
 @c   get_max_fast ok
 @c   fastbin_index ok
 @c   fastbin ok
-@c   catomic_compare_and_exhange_val_acq ok
+@c   atomic_compare_and_exhange_val_acq ok
 @c   malloc_printerr dup @mtsenv
 @c     if we get to it, we're toast already, undefined behavior must have
 @c     been invoked before
@@ -521,10 +521,10 @@ this function is in @file{stdlib.h}.
 @c     chunk2mem dup ok
 @c     free_perturb ok
 @c     set_fastchunks ok
-@c      catomic_and ok
+@c      atomic_and ok
 @c     fastbin_index dup ok
 @c     fastbin dup ok
-@c     catomic_compare_and_exchange_val_rel ok
+@c     atomic_compare_and_exchange_val_rel ok
 @c     chunk_is_mmapped ok
 @c     contiguous dup ok
 @c     prev_inuse ok
@@ -706,7 +706,7 @@ The prototype for this function is in @file{stdlib.h}.
 @safety{@prelim{}@mtsafe{}@asunsafe{@asulock{}}@acunsafe{@aculock{} @acsfd{} @acsmem{}}}
 @c __libc_free @asulock @aculock @acsfd @acsmem
 @c   releasing memory into fastbins modifies the arena without taking
-@c   its mutex, but catomic operations ensure safety.  If two (or more)
+@c   its mutex, but atomic operations ensure safety.  If two (or more)
 @c   threads are running malloc and have their own arenas locked when
 @c   each gets a signal whose handler free()s large (non-fastbin-able)
 @c   blocks from each other's arena, we deadlock; this is a more general
diff --git a/misc/tst-atomic.c b/misc/tst-atomic.c
index 6d681a7bfdf4f48b4c04a073ebd480326dbd3cc8..4f9d2c1a46b363d346dbc2fa0962ae196844a43a 100644
--- a/misc/tst-atomic.c
+++ b/misc/tst-atomic.c
@@ -393,117 +393,6 @@ do_test (void)
     }
 #endif
 
-#ifdef catomic_compare_and_exchange_val_acq
-  mem = 24;
-  if (catomic_compare_and_exchange_val_acq (&mem, 35, 24) != 24
-      || mem != 35)
-    {
-      puts ("catomic_compare_and_exchange_val_acq test 1 failed");
-      ret = 1;
-    }
-
-  mem = 12;
-  if (catomic_compare_and_exchange_val_acq (&mem, 10, 15) != 12
-      || mem != 12)
-    {
-      puts ("catomic_compare_and_exchange_val_acq test 2 failed");
-      ret = 1;
-    }
-
-  mem = -15;
-  if (catomic_compare_and_exchange_val_acq (&mem, -56, -15) != -15
-      || mem != -56)
-    {
-      puts ("catomic_compare_and_exchange_val_acq test 3 failed");
-      ret = 1;
-    }
-
-  mem = -1;
-  if (catomic_compare_and_exchange_val_acq (&mem, 17, 0) != -1
-      || mem != -1)
-    {
-      puts ("catomic_compare_and_exchange_val_acq test 4 failed");
-      ret = 1;
-    }
-#endif
-
-  mem = 24;
-  if (catomic_compare_and_exchange_bool_acq (&mem, 35, 24)
-      || mem != 35)
-    {
-      puts ("catomic_compare_and_exchange_bool_acq test 1 failed");
-      ret = 1;
-    }
-
-  mem = 12;
-  if (! catomic_compare_and_exchange_bool_acq (&mem, 10, 15)
-      || mem != 12)
-    {
-      puts ("catomic_compare_and_exchange_bool_acq test 2 failed");
-      ret = 1;
-    }
-
-  mem = -15;
-  if (catomic_compare_and_exchange_bool_acq (&mem, -56, -15)
-      || mem != -56)
-    {
-      puts ("catomic_compare_and_exchange_bool_acq test 3 failed");
-      ret = 1;
-    }
-
-  mem = -1;
-  if (! catomic_compare_and_exchange_bool_acq (&mem, 17, 0)
-      || mem != -1)
-    {
-      puts ("catomic_compare_and_exchange_bool_acq test 4 failed");
-      ret = 1;
-    }
-
-  mem = 2;
-  if (catomic_exchange_and_add (&mem, 11) != 2
-      || mem != 13)
-    {
-      puts ("catomic_exchange_and_add test failed");
-      ret = 1;
-    }
-
-  mem = -21;
-  catomic_add (&mem, 22);
-  if (mem != 1)
-    {
-      puts ("catomic_add test failed");
-      ret = 1;
-    }
-
-  mem = -1;
-  catomic_increment (&mem);
-  if (mem != 0)
-    {
-      puts ("catomic_increment test failed");
-      ret = 1;
-    }
-
-  mem = 2;
-  if (catomic_increment_val (&mem) != 3)
-    {
-      puts ("catomic_increment_val test failed");
-      ret = 1;
-    }
-
-  mem = 17;
-  catomic_decrement (&mem);
-  if (mem != 16)
-    {
-      puts ("catomic_decrement test failed");
-      ret = 1;
-    }
-
-  if (catomic_decrement_val (&mem) != 15)
-    {
-      puts ("catomic_decrement_val test failed");
-      ret = 1;
-    }
-
   /* Tests for C11-like atomics.  */
   mem = 11;
   if (atomic_load_relaxed (&mem) != 11 || atomic_load_acquire (&mem) != 11)
diff --git a/sysdeps/hppa/dl-fptr.c b/sysdeps/hppa/dl-fptr.c
index 9ed21602d6155d4b960278f8d1fac4ffa885b9d5..40bf5cd3b306315d8eeb6bdba2b2b46b1ea5059e 100644
--- a/sysdeps/hppa/dl-fptr.c
+++ b/sysdeps/hppa/dl-fptr.c
@@ -41,10 +41,8 @@
 # error "ELF_MACHINE_LOAD_ADDRESS is not defined."
 #endif
 
-#ifndef COMPARE_AND_SWAP
-# define COMPARE_AND_SWAP(ptr, old, new) \
-  (catomic_compare_and_exchange_bool_acq (ptr, new, old) == 0)
-#endif
+#define COMPARE_AND_SWAP(ptr, old, new) \
+  (atomic_compare_and_exchange_bool_acq (ptr, new, old) == 0)
 
 ElfW(Addr) _dl_boot_fptr_table [ELF_MACHINE_BOOT_FPTR_TABLE_LEN];
 
diff --git a/sysdeps/ia64/dl-fptr.h b/sysdeps/ia64/dl-fptr.h
index e6c34528b67e7986ac44d77eb2505889a1767e29..a3a2994db20f44dfd3280f6ad618001cbadc0539 100644
--- a/sysdeps/ia64/dl-fptr.h
+++ b/sysdeps/ia64/dl-fptr.h
@@ -22,9 +22,6 @@
 #include <ia64intrin.h>
 #include <sysdeps/generic/dl-fptr.h>
 
-#define COMPARE_AND_SWAP(ptr, old, new)	\
-  __sync_bool_compare_and_swap (ptr, old, new)
-
 /* There are currently 123 dynamic symbols in ld.so.
    ELF_MACHINE_BOOT_FPTR_TABLE_LEN needs to be at least that big.  */
 #define ELF_MACHINE_BOOT_FPTR_TABLE_LEN	200
diff --git a/sysdeps/s390/atomic-machine.h b/sysdeps/s390/atomic-machine.h
index e85b2ef50541c7aab6d2981180f6205d2bd681b6..6b1de51c2a30baf5554a729a80a7ce04b56fc22c 100644
--- a/sysdeps/s390/atomic-machine.h
+++ b/sysdeps/s390/atomic-machine.h
@@ -70,8 +70,6 @@
     !__atomic_compare_exchange_n (mem, (void *) &__atg2_oldval, newval,	\
 				  1, __ATOMIC_ACQUIRE,			\
 				  __ATOMIC_RELAXED); })
-#define catomic_compare_and_exchange_bool_acq(mem, newval, oldval)	\
-  atomic_compare_and_exchange_bool_acq (mem, newval, oldval)
 
 /* Store NEWVALUE in *MEM and return the old value.  */
 #define atomic_exchange_acq(mem, newvalue)				\
@@ -90,8 +88,6 @@
 # define atomic_exchange_and_add_rel(mem, operand)			\
   ({ __atomic_check_size((mem));					\
   __atomic_fetch_add ((mem), (operand), __ATOMIC_RELEASE); })
-#define catomic_exchange_and_add(mem, value)	\
-  atomic_exchange_and_add (mem, value)
 
 /* Atomically *mem |= mask and return the old value of *mem.  */
 /* The gcc builtin uses load-and-or instruction on z196 zarch and higher cpus
@@ -104,8 +100,6 @@
   do {						\
     atomic_or_val (mem, mask);			\
   } while (0)
-#define catomic_or(mem, mask)			\
-  atomic_or (mem, mask)
 
 /* Atomically *mem |= 1 << bit and return true if the bit was set in old value
    of *mem.  */
@@ -129,5 +123,3 @@
   do {						\
     atomic_and_val (mem, mask);			\
   } while (0)
-#define catomic_and(mem, mask)			\
-  atomic_and(mem, mask)
diff --git a/sysdeps/unix/sysv/linux/riscv/atomic-machine.h b/sysdeps/unix/sysv/linux/riscv/atomic-machine.h
index 9ae89e0ef12ad28319755ac51260908779b9579f..f4b2cbced828a80335887bf172fd60767cf978ac 100644
--- a/sysdeps/unix/sysv/linux/riscv/atomic-machine.h
+++ b/sysdeps/unix/sysv/linux/riscv/atomic-machine.h
@@ -170,10 +170,6 @@
   ({ typeof (*mem) __mask = (typeof (*mem))1 << (bit);    \
      asm_amo ("amoor", ".aq", mem, __mask) & __mask; })
 
-# define catomic_exchange_and_add(mem, value)		\
-  atomic_exchange_and_add (mem, value)
-# define catomic_max(mem, value) atomic_max (mem, value)
-
 #else /* __riscv_atomic */
 # error "ISAs that do not subsume the A extension are not supported"
 #endif /* !__riscv_atomic */
diff --git a/sysdeps/x86/atomic-machine.h b/sysdeps/x86/atomic-machine.h
index f24f1c71ed718c601c71decc1ee0c4b49fdf32f8..5a48521443367b1ca94c9f153c2a163661de789c 100644
--- a/sysdeps/x86/atomic-machine.h
+++ b/sysdeps/x86/atomic-machine.h
@@ -20,7 +20,6 @@
 #define _X86_ATOMIC_MACHINE_H 1
 
 #include <stdint.h>
-#include <tls.h>			/* For tcbhead_t.  */
 #include <libc-pointer-arith.h>		/* For cast_to_integer.  */
 
 #define LOCK_PREFIX "lock;"
@@ -52,52 +51,7 @@
   (! __sync_bool_compare_and_swap (mem, oldval, newval))
 
 
-#define __arch_c_compare_and_exchange_val_8_acq(mem, newval, oldval) \
-  ({ __typeof (*mem) ret;						      \
-     __asm __volatile ("cmpl $0, %%" SEG_REG ":%P5\n\t"			      \
-		       "je 0f\n\t"					      \
-		       "lock\n"						      \
-		       "0:\tcmpxchgb %b2, %1"				      \
-		       : "=a" (ret), "=m" (*mem)			      \
-		       : BR_CONSTRAINT (newval), "m" (*mem), "0" (oldval),    \
-			 "i" (offsetof (tcbhead_t, multiple_threads)));	      \
-     ret; })
-
-#define __arch_c_compare_and_exchange_val_16_acq(mem, newval, oldval) \
-  ({ __typeof (*mem) ret;						      \
-     __asm __volatile ("cmpl $0, %%" SEG_REG ":%P5\n\t"			      \
-		       "je 0f\n\t"					      \
-		       "lock\n"						      \
-		       "0:\tcmpxchgw %w2, %1"				      \
-		       : "=a" (ret), "=m" (*mem)			      \
-		       : BR_CONSTRAINT (newval), "m" (*mem), "0" (oldval),    \
-			 "i" (offsetof (tcbhead_t, multiple_threads)));	      \
-     ret; })
-
-#define __arch_c_compare_and_exchange_val_32_acq(mem, newval, oldval) \
-  ({ __typeof (*mem) ret;						      \
-     __asm __volatile ("cmpl $0, %%" SEG_REG ":%P5\n\t"			      \
-		       "je 0f\n\t"					      \
-		       "lock\n"						      \
-		       "0:\tcmpxchgl %2, %1"				      \
-		       : "=a" (ret), "=m" (*mem)			      \
-		       : BR_CONSTRAINT (newval), "m" (*mem), "0" (oldval),    \
-			 "i" (offsetof (tcbhead_t, multiple_threads)));       \
-     ret; })
-
 #ifdef __x86_64__
-# define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval) \
-  ({ __typeof (*mem) ret;						      \
-     __asm __volatile ("cmpl $0, %%fs:%P5\n\t"				      \
-		       "je 0f\n\t"					      \
-		       "lock\n"						      \
-		       "0:\tcmpxchgq %q2, %1"				      \
-		       : "=a" (ret), "=m" (*mem)			      \
-		       : "q" ((int64_t) cast_to_integer (newval)),	      \
-			 "m" (*mem),					      \
-			 "0" ((int64_t) cast_to_integer (oldval)),	      \
-			 "i" (offsetof (tcbhead_t, multiple_threads)));	      \
-     ret; })
 # define do_exchange_and_add_val_64_acq(pfx, mem, value) 0
 # define do_add_val_64_acq(pfx, mem, value) do { } while (0)
 #else
@@ -107,13 +61,6 @@
    such an operation.  So don't define any code for now.  If it is
    really going to be used the code below can be used on Intel Pentium
    and later, but NOT on i486.  */
-# define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval) \
-  ({ __typeof (*mem) ret = *(mem);					      \
-     __atomic_link_error ();						      \
-     ret = (newval);							      \
-     ret = (oldval);							      \
-     ret; })
-
 # define __arch_compare_and_exchange_val_64_acq(mem, newval, oldval)	      \
   ({ __typeof (*mem) ret = *(mem);					      \
      __atomic_link_error ();						      \
@@ -181,24 +128,20 @@
      if (sizeof (*mem) == 1)						      \
        __asm __volatile (lock "xaddb %b0, %1"				      \
 			 : "=q" (__result), "=m" (*mem)			      \
-			 : "0" (__addval), "m" (*mem),			      \
-			   "i" (offsetof (tcbhead_t, multiple_threads)));     \
+			 : "0" (__addval), "m" (*mem));			      \
      else if (sizeof (*mem) == 2)					      \
        __asm __volatile (lock "xaddw %w0, %1"				      \
 			 : "=r" (__result), "=m" (*mem)			      \
-			 : "0" (__addval), "m" (*mem),			      \
-			   "i" (offsetof (tcbhead_t, multiple_threads)));     \
+			 : "0" (__addval), "m" (*mem));			      \
      else if (sizeof (*mem) == 4)					      \
        __asm __volatile (lock "xaddl %0, %1"				      \
 			 : "=r" (__result), "=m" (*mem)			      \
-			 : "0" (__addval), "m" (*mem),			      \
-			   "i" (offsetof (tcbhead_t, multiple_threads)));     \
+			 : "0" (__addval), "m" (*mem));			      \
      else if (__HAVE_64B_ATOMICS)					      \
        __asm __volatile (lock "xaddq %q0, %1"				      \
 			 : "=r" (__result), "=m" (*mem)			      \
 			 : "0" ((int64_t) cast_to_integer (__addval)),     \
-			   "m" (*mem),					      \
-			   "i" (offsetof (tcbhead_t, multiple_threads)));     \
+			   "m" (*mem));					      \
      else								      \
        __result = do_exchange_and_add_val_64_acq (pfx, (mem), __addval);      \
      __result; })
@@ -206,14 +149,6 @@
 #define atomic_exchange_and_add(mem, value) \
   __sync_fetch_and_add (mem, value)
 
-#define __arch_exchange_and_add_cprefix \
-  "cmpl $0, %%" SEG_REG ":%P4\n\tje 0f\n\tlock\n0:\t"
-
-#define catomic_exchange_and_add(mem, value) \
-  __arch_exchange_and_add_body (__arch_exchange_and_add_cprefix, __arch_c,    \
-				mem, value)
-
-
 #define __arch_add_body(lock, pfx, apfx, mem, value) \
   do {									      \
     if (__builtin_constant_p (value) && (value) == 1)			      \
@@ -223,24 +158,20 @@
     else if (sizeof (*mem) == 1)					      \
       __asm __volatile (lock "addb %b1, %0"				      \
 			: "=m" (*mem)					      \
-			: IBR_CONSTRAINT (value), "m" (*mem),		      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: IBR_CONSTRAINT (value), "m" (*mem));		      \
     else if (sizeof (*mem) == 2)					      \
       __asm __volatile (lock "addw %w1, %0"				      \
 			: "=m" (*mem)					      \
-			: "ir" (value), "m" (*mem),			      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: "ir" (value), "m" (*mem));			      \
     else if (sizeof (*mem) == 4)					      \
       __asm __volatile (lock "addl %1, %0"				      \
 			: "=m" (*mem)					      \
-			: "ir" (value), "m" (*mem),			      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: "ir" (value), "m" (*mem));			      \
     else if (__HAVE_64B_ATOMICS)					      \
       __asm __volatile (lock "addq %q1, %0"				      \
 			: "=m" (*mem)					      \
 			: "ir" ((int64_t) cast_to_integer (value)),	      \
-			  "m" (*mem),					      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			  "m" (*mem));					      \
     else								      \
       do_add_val_64_acq (apfx, (mem), (value));				      \
   } while (0)
@@ -248,13 +179,6 @@
 # define atomic_add(mem, value) \
   __arch_add_body (LOCK_PREFIX, atomic, __arch, mem, value)
 
-#define __arch_add_cprefix \
-  "cmpl $0, %%" SEG_REG ":%P3\n\tje 0f\n\tlock\n0:\t"
-
-#define catomic_add(mem, value) \
-  __arch_add_body (__arch_add_cprefix, atomic, __arch_c, mem, value)
-
-
 #define atomic_add_negative(mem, value) \
   ({ unsigned char __result;						      \
      if (sizeof (*mem) == 1)						      \
@@ -308,36 +232,25 @@
     if (sizeof (*mem) == 1)						      \
       __asm __volatile (lock "incb %b0"					      \
 			: "=m" (*mem)					      \
-			: "m" (*mem),					      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: "m" (*mem));					      \
     else if (sizeof (*mem) == 2)					      \
       __asm __volatile (lock "incw %w0"					      \
 			: "=m" (*mem)					      \
-			: "m" (*mem),					      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: "m" (*mem));					      \
     else if (sizeof (*mem) == 4)					      \
       __asm __volatile (lock "incl %0"					      \
 			: "=m" (*mem)					      \
-			: "m" (*mem),					      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: "m" (*mem));					      \
     else if (__HAVE_64B_ATOMICS)					      \
       __asm __volatile (lock "incq %q0"					      \
 			: "=m" (*mem)					      \
-			: "m" (*mem),					      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: "m" (*mem));					      \
     else								      \
       do_add_val_64_acq (pfx, mem, 1);					      \
   } while (0)
 
 #define atomic_increment(mem) __arch_increment_body (LOCK_PREFIX, __arch, mem)
 
-#define __arch_increment_cprefix \
-  "cmpl $0, %%" SEG_REG ":%P2\n\tje 0f\n\tlock\n0:\t"
-
-#define catomic_increment(mem) \
-  __arch_increment_body (__arch_increment_cprefix, __arch_c, mem)
-
-
 #define atomic_increment_and_test(mem) \
   ({ unsigned char __result;						      \
      if (sizeof (*mem) == 1)						      \
@@ -366,36 +279,25 @@
     if (sizeof (*mem) == 1)						      \
       __asm __volatile (lock "decb %b0"					      \
 			: "=m" (*mem)					      \
-			: "m" (*mem),					      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: "m" (*mem));					      \
     else if (sizeof (*mem) == 2)					      \
       __asm __volatile (lock "decw %w0"					      \
 			: "=m" (*mem)					      \
-			: "m" (*mem),					      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: "m" (*mem));					      \
     else if (sizeof (*mem) == 4)					      \
       __asm __volatile (lock "decl %0"					      \
 			: "=m" (*mem)					      \
-			: "m" (*mem),					      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: "m" (*mem));					      \
     else if (__HAVE_64B_ATOMICS)					      \
       __asm __volatile (lock "decq %q0"					      \
 			: "=m" (*mem)					      \
-			: "m" (*mem),					      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: "m" (*mem));					      \
     else								      \
       do_add_val_64_acq (pfx, mem, -1);					      \
   } while (0)
 
 #define atomic_decrement(mem) __arch_decrement_body (LOCK_PREFIX, __arch, mem)
 
-#define __arch_decrement_cprefix \
-  "cmpl $0, %%" SEG_REG ":%P2\n\tje 0f\n\tlock\n0:\t"
-
-#define catomic_decrement(mem) \
-  __arch_decrement_body (__arch_decrement_cprefix, __arch_c, mem)
-
-
 #define atomic_decrement_and_test(mem) \
   ({ unsigned char __result;						      \
      if (sizeof (*mem) == 1)						      \
@@ -472,65 +374,49 @@
     if (sizeof (*mem) == 1)						      \
       __asm __volatile (lock "andb %b1, %0"				      \
 			: "=m" (*mem)					      \
-			: IBR_CONSTRAINT (mask), "m" (*mem),		      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: IBR_CONSTRAINT (mask), "m" (*mem));		      \
     else if (sizeof (*mem) == 2)					      \
       __asm __volatile (lock "andw %w1, %0"				      \
 			: "=m" (*mem)					      \
-			: "ir" (mask), "m" (*mem),			      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: "ir" (mask), "m" (*mem));			      \
     else if (sizeof (*mem) == 4)					      \
       __asm __volatile (lock "andl %1, %0"				      \
 			: "=m" (*mem)					      \
-			: "ir" (mask), "m" (*mem),			      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: "ir" (mask), "m" (*mem));			      \
     else if (__HAVE_64B_ATOMICS)					      \
       __asm __volatile (lock "andq %q1, %0"				      \
 			: "=m" (*mem)					      \
-			: "ir" (mask), "m" (*mem),			      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: "ir" (mask), "m" (*mem));			      \
     else								      \
       __atomic_link_error ();						      \
   } while (0)
 
-#define __arch_cprefix \
-  "cmpl $0, %%" SEG_REG ":%P3\n\tje 0f\n\tlock\n0:\t"
-
 #define atomic_and(mem, mask) __arch_and_body (LOCK_PREFIX, mem, mask)
 
-#define catomic_and(mem, mask) __arch_and_body (__arch_cprefix, mem, mask)
-
-
 #define __arch_or_body(lock, mem, mask) \
   do {									      \
     if (sizeof (*mem) == 1)						      \
       __asm __volatile (lock "orb %b1, %0"				      \
 			: "=m" (*mem)					      \
-			: IBR_CONSTRAINT (mask), "m" (*mem),		      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: IBR_CONSTRAINT (mask), "m" (*mem));		      \
     else if (sizeof (*mem) == 2)					      \
       __asm __volatile (lock "orw %w1, %0"				      \
 			: "=m" (*mem)					      \
-			: "ir" (mask), "m" (*mem),			      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: "ir" (mask), "m" (*mem));			      \
     else if (sizeof (*mem) == 4)					      \
       __asm __volatile (lock "orl %1, %0"				      \
 			: "=m" (*mem)					      \
-			: "ir" (mask), "m" (*mem),			      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: "ir" (mask), "m" (*mem));			      \
     else if (__HAVE_64B_ATOMICS)					      \
       __asm __volatile (lock "orq %q1, %0"				      \
 			: "=m" (*mem)					      \
-			: "ir" (mask), "m" (*mem),			      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: "ir" (mask), "m" (*mem));			      \
     else								      \
       __atomic_link_error ();						      \
   } while (0)
 
 #define atomic_or(mem, mask) __arch_or_body (LOCK_PREFIX, mem, mask)
 
-#define catomic_or(mem, mask) __arch_or_body (__arch_cprefix, mem, mask)
-
 /* We don't use mfence because it is supposedly slower due to having to
    provide stronger guarantees (e.g., regarding self-modifying code).  */
 #define atomic_full_barrier() \
Adhemerval Zanella Netto June 22, 2022, 1 p.m. UTC | #3
> On 17 Jun 2022, at 08:56, Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
> 
> Hi Adhemerval,
> 
>> LGTM, thanks. Some comments below, but I can send a cleanup patch if you prefer.
>> 
>> #ifndef COMPARE_AND_SWAP
>> 
>> I think we can also remove this macro, only ia64 redefines it to old compiler builtin
>> (__sync_bool_compare_and_swap).
> 
> I removed it from ia64 and the #ifdef/#endif elsewhere. However, the macro's arguments
> are already in the natural (ptr, old, new) order of the underlying builtin, unlike the glibc
> CAS macros, so it seems best not to substitute it directly for now.

Fair enough.
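
(Editorial aside: a minimal sketch, not taken from the patch, of the argument-order point
above.  COMPARE_AND_SWAP follows the (ptr, old, new) convention of the GCC builtin and
yields nonzero on success, whereas atomic_compare_and_exchange_bool_acq takes
(mem, newval, oldval) and yields 0 on success -- hence the "== 0" in the wrapper.  Both
variants are expressed with the standard builtin so the sketch stays self-contained.)

#include <stdint.h>

/* COMPARE_AND_SWAP style: (ptr, old, new), nonzero on success.  */
static inline int
compare_and_swap (uintptr_t *ptr, uintptr_t old, uintptr_t new)
{
  return __atomic_compare_exchange_n (ptr, &old, new, 0,
                                      __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
}

/* atomic_compare_and_exchange_bool_acq style: (mem, newval, oldval),
   0 on success.  */
static inline int
cas_bool_acq (uintptr_t *mem, uintptr_t newval, uintptr_t oldval)
{
  return !__atomic_compare_exchange_n (mem, &oldval, newval, 0,
                                       __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
}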

> 
> Also I noticed some dead uses of multiple_threads, so I have removed those
> (in principle your single-thread patch series no longer needs to make changes
> to sysdeps/x86/atomic-machine.h). Re-running build-many-glibcs for a few arches passed.

Since this patch removes an x86 optimization (sorry, I realized it after my review),
I think it would be better to circle back and first get my single-thread refactor
patches in (which fix the SINGLE_THREAD_P syscall on aarch64 and other architectures),
since that series does not change x86.

After that we can remove the unused catomic operations and make the single-thread
locking optimization generic (so we can finally remove the x86 arch-specific bits).
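
(For context, a rough and purely hypothetical C-level sketch of the x86 optimization being
dropped: the catomic_* macros skipped the lock-prefixed RMW while the process was still
single-threaded, testing the TCB's multiple_threads field from inline assembly.  The global
flag and function names below are made up for illustration; the second function is what the
patch uses everywhere instead.)

#include <stdbool.h>

extern bool multiple_threads;   /* hypothetical stand-in for tcbhead_t.multiple_threads */

/* Roughly what catomic_increment amounted to on x86.  */
static inline void
catomic_style_increment (unsigned long *counter)
{
  if (!multiple_threads)
    ++*counter;                 /* plain increment: single-thread fast path */
  else
    __atomic_fetch_add (counter, 1, __ATOMIC_RELAXED);
}

/* What atomic_fetch_add_relaxed maps to: always atomic, relaxed ordering.  */
static inline void
relaxed_increment (unsigned long *counter)
{
  __atomic_fetch_add (counter, 1, __ATOMIC_RELAXED);
}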

> 
> Cheers,
> Wilco
> 
> v2: Remove COMPARE_AND_SWAP ifdefs and dead uses of multiple_threads.
> 
> The catomics are not supported on most targets and are only used in a few places which are not
> performance critical, so replace all uses with more standard atomics.
> Replace uses of catomic_add, catomic_increment, catomic_decrement and catomic_fetch_and_add with
> atomic_fetch_add_relaxed which maps to a standard compiler builtin. Relaxed memory ordering is
> correct for simple counters since they only need atomicity.
> 
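
(Inline note: a small standalone sketch, not part of the patch, of the "only need atomicity"
point above -- for a statistics counter like those in memusage, each update must be an atomic
RMW so no increment is lost, but nothing else is published through the counter, so relaxed
ordering suffices for both the updates and the final read.)

#include <stdio.h>

static unsigned long calls_total;    /* hypothetical counter, as in memusage */

static void
note_call (void)
{
  __atomic_fetch_add (&calls_total, 1, __ATOMIC_RELAXED);
}

static void
print_report (void)
{
  printf ("total calls: %lu\n",
          __atomic_load_n (&calls_total, __ATOMIC_RELAXED));
}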
> ---
> diff --git a/elf/dl-fptr.c b/elf/dl-fptr.c
> index 6645a260b809ecd521796e0d1adee56b3e0bd993..ffc83ef78bc16f36773a51da7442646f47e85b71 100644
> --- a/elf/dl-fptr.c
> +++ b/elf/dl-fptr.c
> @@ -38,10 +38,8 @@
> # error "ELF_MACHINE_LOAD_ADDRESS is not defined."
> #endif
> 
> -#ifndef COMPARE_AND_SWAP
> -# define COMPARE_AND_SWAP(ptr, old, new) \
> -  (catomic_compare_and_exchange_bool_acq (ptr, new, old) == 0)
> -#endif
> +#define COMPARE_AND_SWAP(ptr, old, new) \
> +  (atomic_compare_and_exchange_bool_acq (ptr, new, old) == 0)
> 
> ElfW(Addr) _dl_boot_fptr_table [ELF_MACHINE_BOOT_FPTR_TABLE_LEN];
> 
> diff --git a/elf/dl-profile.c b/elf/dl-profile.c
> index ec57e3a96552ae6460c22a0fcc819b85d486c0da..0af1f577d2d695d08edce9e13d9b39f77911b1d5 100644
> --- a/elf/dl-profile.c
> +++ b/elf/dl-profile.c
> @@ -548,24 +548,24 @@ _dl_mcount (ElfW(Addr) frompc, ElfW(Addr) selfpc)
> 	      size_t newfromidx;
> 	      to_index = (data[narcs].self_pc
> 			  / (HASHFRACTION * sizeof (*tos)));
> -	      newfromidx = catomic_exchange_and_add (&fromidx, 1) + 1;
> +	      newfromidx = atomic_fetch_add_relaxed (&fromidx, 1) + 1;
> 	      froms[newfromidx].here = &data[narcs];
> 	      froms[newfromidx].link = tos[to_index];
> 	      tos[to_index] = newfromidx;
> -	      catomic_increment (&narcs);
> +	      atomic_fetch_add_relaxed (&narcs, 1);
> 	    }
> 
> 	  /* If we still have no entry stop searching and insert.  */
> 	  if (*topcindex == 0)
> 	    {
> -	      unsigned int newarc = catomic_exchange_and_add (narcsp, 1);
> +	      unsigned int newarc = atomic_fetch_add_relaxed (narcsp, 1);
> 
> 	      /* In rare cases it could happen that all entries in FROMS are
> 		 occupied.  So we cannot count this anymore.  */
> 	      if (newarc >= fromlimit)
> 		goto done;
> 
> -	      *topcindex = catomic_exchange_and_add (&fromidx, 1) + 1;
> +	      *topcindex = atomic_fetch_add_relaxed (&fromidx, 1) + 1;
> 	      fromp = &froms[*topcindex];
> 
> 	      fromp->here = &data[newarc];
> @@ -573,7 +573,7 @@ _dl_mcount (ElfW(Addr) frompc, ElfW(Addr) selfpc)
> 	      data[newarc].self_pc = selfpc;
> 	      data[newarc].count = 0;
> 	      fromp->link = 0;
> -	      catomic_increment (&narcs);
> +	      atomic_fetch_add_relaxed (&narcs, 1);
> 
> 	      break;
> 	    }
> @@ -586,7 +586,7 @@ _dl_mcount (ElfW(Addr) frompc, ElfW(Addr) selfpc)
>     }
> 
>   /* Increment the counter.  */
> -  catomic_increment (&fromp->here->count);
> +  atomic_fetch_add_relaxed (&fromp->here->count, 1);
> 
>  done:
>   ;
> diff --git a/include/atomic.h b/include/atomic.h
> index 2cb52c9cfd894308b97b97a04dd574b2287bf1b2..264db9a0b7619ff6520f84a19c53c1eb9a3b42a3 100644
> --- a/include/atomic.h
> +++ b/include/atomic.h
> @@ -24,13 +24,6 @@
>    - atomic arithmetic and logic operation on memory.  They all
>      have the prefix "atomic_".
> 
> -   - conditionally atomic operations of the same kinds.  These
> -     always behave identical but can be faster when atomicity
> -     is not really needed since only one thread has access to
> -     the memory location.  In that case the code is slower in
> -     the multi-thread case.  The interfaces have the prefix
> -     "catomic_".
> -
>    - support functions like barriers.  They also have the prefix
>      "atomic_".
> 
> @@ -93,29 +86,6 @@
> #endif
> 
> 
> -#ifndef catomic_compare_and_exchange_val_acq
> -# ifdef __arch_c_compare_and_exchange_val_32_acq
> -#  define catomic_compare_and_exchange_val_acq(mem, newval, oldval) \
> -  __atomic_val_bysize (__arch_c_compare_and_exchange_val,acq,		      \
> -		       mem, newval, oldval)
> -# else
> -#  define catomic_compare_and_exchange_val_acq(mem, newval, oldval) \
> -  atomic_compare_and_exchange_val_acq (mem, newval, oldval)
> -# endif
> -#endif
> -
> -
> -#ifndef catomic_compare_and_exchange_val_rel
> -# ifndef atomic_compare_and_exchange_val_rel
> -#  define catomic_compare_and_exchange_val_rel(mem, newval, oldval)	      \
> -  catomic_compare_and_exchange_val_acq (mem, newval, oldval)
> -# else
> -#  define catomic_compare_and_exchange_val_rel(mem, newval, oldval)	      \
> -  atomic_compare_and_exchange_val_rel (mem, newval, oldval)
> -# endif
> -#endif
> -
> -
> #ifndef atomic_compare_and_exchange_val_rel
> # define atomic_compare_and_exchange_val_rel(mem, newval, oldval)	      \
>   atomic_compare_and_exchange_val_acq (mem, newval, oldval)
> @@ -141,23 +111,6 @@
> #endif
> 
> 
> -#ifndef catomic_compare_and_exchange_bool_acq
> -# ifdef __arch_c_compare_and_exchange_bool_32_acq
> -#  define catomic_compare_and_exchange_bool_acq(mem, newval, oldval) \
> -  __atomic_bool_bysize (__arch_c_compare_and_exchange_bool,acq,		      \
> -		        mem, newval, oldval)
> -# else
> -#  define catomic_compare_and_exchange_bool_acq(mem, newval, oldval) \
> -  ({ /* Cannot use __oldval here, because macros later in this file might     \
> -	call this macro with __oldval argument.	 */			      \
> -     __typeof (oldval) __atg4_old = (oldval);				      \
> -     catomic_compare_and_exchange_val_acq (mem, newval, __atg4_old)	      \
> -       != __atg4_old;							      \
> -  })
> -# endif
> -#endif
> -
> -
> /* Store NEWVALUE in *MEM and return the old value.  */
> #ifndef atomic_exchange_acq
> # define atomic_exchange_acq(mem, newvalue) \
> @@ -212,24 +165,6 @@
>   atomic_exchange_and_add_acq(mem, value)
> #endif
> 
> -#ifndef catomic_exchange_and_add
> -# define catomic_exchange_and_add(mem, value) \
> -  ({ __typeof (*(mem)) __atg7_oldv;					      \
> -     __typeof (mem) __atg7_memp = (mem);				      \
> -     __typeof (*(mem)) __atg7_value = (value);				      \
> -									      \
> -     do									      \
> -       __atg7_oldv = *__atg7_memp;					      \
> -     while (__builtin_expect						      \
> -	    (catomic_compare_and_exchange_bool_acq (__atg7_memp,	      \
> -						    __atg7_oldv		      \
> -						    + __atg7_value,	      \
> -						    __atg7_oldv), 0));	      \
> -									      \
> -     __atg7_oldv; })
> -#endif
> -
> -
> #ifndef atomic_max
> # define atomic_max(mem, value) \
>   do {									      \
> @@ -246,25 +181,6 @@
>   } while (0)
> #endif
> 
> -
> -#ifndef catomic_max
> -# define catomic_max(mem, value) \
> -  do {									      \
> -    __typeof (*(mem)) __atg9_oldv;					      \
> -    __typeof (mem) __atg9_memp = (mem);					      \
> -    __typeof (*(mem)) __atg9_value = (value);				      \
> -    do {								      \
> -      __atg9_oldv = *__atg9_memp;					      \
> -      if (__atg9_oldv >= __atg9_value)					      \
> -	break;								      \
> -    } while (__builtin_expect						      \
> -	     (catomic_compare_and_exchange_bool_acq (__atg9_memp,	      \
> -						     __atg9_value,	      \
> -						     __atg9_oldv), 0));	      \
> -  } while (0)
> -#endif
> -
> -
> #ifndef atomic_min
> # define atomic_min(mem, value) \
>   do {									      \
> @@ -288,32 +204,16 @@
> #endif
> 
> 
> -#ifndef catomic_add
> -# define catomic_add(mem, value) \
> -  (void) catomic_exchange_and_add ((mem), (value))
> -#endif
> -
> -
> #ifndef atomic_increment
> # define atomic_increment(mem) atomic_add ((mem), 1)
> #endif
> 
> 
> -#ifndef catomic_increment
> -# define catomic_increment(mem) catomic_add ((mem), 1)
> -#endif
> -
> -
> #ifndef atomic_increment_val
> # define atomic_increment_val(mem) (atomic_exchange_and_add ((mem), 1) + 1)
> #endif
> 
> 
> -#ifndef catomic_increment_val
> -# define catomic_increment_val(mem) (catomic_exchange_and_add ((mem), 1) + 1)
> -#endif
> -
> -
> /* Add one to *MEM and return true iff it's now zero.  */
> #ifndef atomic_increment_and_test
> # define atomic_increment_and_test(mem) \
> @@ -326,21 +226,11 @@
> #endif
> 
> 
> -#ifndef catomic_decrement
> -# define catomic_decrement(mem) catomic_add ((mem), -1)
> -#endif
> -
> -
> #ifndef atomic_decrement_val
> # define atomic_decrement_val(mem) (atomic_exchange_and_add ((mem), -1) - 1)
> #endif
> 
> 
> -#ifndef catomic_decrement_val
> -# define catomic_decrement_val(mem) (catomic_exchange_and_add ((mem), -1) - 1)
> -#endif
> -
> -
> /* Subtract 1 from *MEM and return true iff it's now zero.  */
> #ifndef atomic_decrement_and_test
> # define atomic_decrement_and_test(mem) \
> @@ -421,22 +311,6 @@
>   } while (0)
> #endif
> 
> -#ifndef catomic_and
> -# define catomic_and(mem, mask) \
> -  do {									      \
> -    __typeof (*(mem)) __atg20_old;					      \
> -    __typeof (mem) __atg20_memp = (mem);				      \
> -    __typeof (*(mem)) __atg20_mask = (mask);				      \
> -									      \
> -    do									      \
> -      __atg20_old = (*__atg20_memp);					      \
> -    while (__builtin_expect						      \
> -	   (catomic_compare_and_exchange_bool_acq (__atg20_memp,	      \
> -						   __atg20_old & __atg20_mask,\
> -						   __atg20_old), 0));	      \
> -  } while (0)
> -#endif
> -
> /* Atomically *mem &= mask and return the old value of *mem.  */
> #ifndef atomic_and_val
> # define atomic_and_val(mem, mask) \
> @@ -471,22 +345,6 @@
>   } while (0)
> #endif
> 
> -#ifndef catomic_or
> -# define catomic_or(mem, mask) \
> -  do {									      \
> -    __typeof (*(mem)) __atg18_old;					      \
> -    __typeof (mem) __atg18_memp = (mem);				      \
> -    __typeof (*(mem)) __atg18_mask = (mask);				      \
> -									      \
> -    do									      \
> -      __atg18_old = (*__atg18_memp);					      \
> -    while (__builtin_expect						      \
> -	   (catomic_compare_and_exchange_bool_acq (__atg18_memp,	      \
> -						   __atg18_old | __atg18_mask,\
> -						   __atg18_old), 0));	      \
> -  } while (0)
> -#endif
> -
> /* Atomically *mem |= mask and return the old value of *mem.  */
> #ifndef atomic_or_val
> # define atomic_or_val(mem, mask) \
> diff --git a/malloc/arena.c b/malloc/arena.c
> index 0a684a720d9648953073bd7d35faca100762c031..7c74a18381a4be5fe6bcb94b38a62dbfa6b674f4 100644
> --- a/malloc/arena.c
> +++ b/malloc/arena.c
> @@ -958,11 +958,11 @@ arena_get2 (size_t size, mstate avoid_arena)
>          enough address space to create that many arenas.  */
>       if (__glibc_unlikely (n <= narenas_limit - 1))
>         {
> -          if (catomic_compare_and_exchange_bool_acq (&narenas, n + 1, n))
> +          if (atomic_compare_and_exchange_bool_acq (&narenas, n + 1, n))
>             goto repeat;
>           a = _int_new_arena (size);
> 	  if (__glibc_unlikely (a == NULL))
> -            catomic_decrement (&narenas);
> +            atomic_fetch_add_relaxed (&narenas, -1);
>         }
>       else
>         a = reused_arena (avoid_arena);
> diff --git a/malloc/malloc.c b/malloc/malloc.c
> index 09e5ff2bce5d79b3754687db3aef314640d336eb..a335cd23e18ee09027a8e7a71d808bfa1343e06d 100644
> --- a/malloc/malloc.c
> +++ b/malloc/malloc.c
> @@ -2486,11 +2486,11 @@ sysmalloc_mmap (INTERNAL_SIZE_T nb, size_t pagesize, int extra_flags, mstate av)
>     }
> 
>   /* update statistics */
> -  int new = atomic_exchange_and_add (&mp_.n_mmaps, 1) + 1;
> +  int new = atomic_fetch_add_relaxed (&mp_.n_mmaps, 1) + 1;
>   atomic_max (&mp_.max_n_mmaps, new);
> 
>   unsigned long sum;
> -  sum = atomic_exchange_and_add (&mp_.mmapped_mem, size) + size;
> +  sum = atomic_fetch_add_relaxed (&mp_.mmapped_mem, size) + size;
>   atomic_max (&mp_.max_mmapped_mem, sum);
> 
>   check_chunk (av, p);
> @@ -3059,8 +3059,8 @@ munmap_chunk (mchunkptr p)
>       || __glibc_unlikely (!powerof2 (mem & (pagesize - 1))))
>     malloc_printerr ("munmap_chunk(): invalid pointer");
> 
> -  atomic_decrement (&mp_.n_mmaps);
> -  atomic_add (&mp_.mmapped_mem, -total_size);
> +  atomic_fetch_add_relaxed (&mp_.n_mmaps, -1);
> +  atomic_fetch_add_relaxed (&mp_.mmapped_mem, -total_size);
> 
>   /* If munmap failed the process virtual memory address space is in a
>      bad shape.  Just leave the block hanging around, the process will
> @@ -3110,7 +3110,7 @@ mremap_chunk (mchunkptr p, size_t new_size)
>   set_head (p, (new_size - offset) | IS_MMAPPED);
> 
>   INTERNAL_SIZE_T new;
> -  new = atomic_exchange_and_add (&mp_.mmapped_mem, new_size - size - offset)
> +  new = atomic_fetch_add_relaxed (&mp_.mmapped_mem, new_size - size - offset)
>         + new_size - size - offset;
>   atomic_max (&mp_.max_mmapped_mem, new);
>   return p;
> @@ -3832,7 +3832,7 @@ _int_malloc (mstate av, size_t bytes)
>       if (__glibc_unlikely (pp != NULL && misaligned_chunk (pp)))       \
> 	malloc_printerr ("malloc(): unaligned fastbin chunk detected"); \
>     }							\
> -  while ((pp = catomic_compare_and_exchange_val_acq (fb, pp, victim)) \
> +  while ((pp = atomic_compare_and_exchange_val_acq (fb, pp, victim)) \
> 	 != victim);					\
> 
>   if ((unsigned long) (nb) <= (unsigned long) (get_max_fast ()))
> @@ -4550,7 +4550,7 @@ _int_free (mstate av, mchunkptr p, int have_lock)
> 	  old2 = old;
> 	  p->fd = PROTECT_PTR (&p->fd, old);
> 	}
> -      while ((old = catomic_compare_and_exchange_val_rel (fb, p, old2))
> +      while ((old = atomic_compare_and_exchange_val_rel (fb, p, old2))
> 	     != old2);
> 
>     /* Check that size of fastbin chunk at the top is the same as
> diff --git a/malloc/memusage.c b/malloc/memusage.c
> index f30906dffb2731c104ea375af48f59c65bcc7c9c..74712834fa8b96fb2d9589d34b34ab07d05a84ca 100644
> --- a/malloc/memusage.c
> +++ b/malloc/memusage.c
> @@ -148,8 +148,8 @@ update_data (struct header *result, size_t len, size_t old_len)
> 
>   /* Compute current heap usage and compare it with the maximum value.  */
>   size_t heap
> -    = catomic_exchange_and_add (&current_heap, len - old_len) + len - old_len;
> -  catomic_max (&peak_heap, heap);
> +    = atomic_fetch_add_relaxed (&current_heap, len - old_len) + len - old_len;
> +  atomic_max (&peak_heap, heap);
> 
>   /* Compute current stack usage and compare it with the maximum
>      value.  The base stack pointer might not be set if this is not
> @@ -172,15 +172,15 @@ update_data (struct header *result, size_t len, size_t old_len)
>     start_sp = sp;
>   size_t current_stack = start_sp - sp;
> #endif
> -  catomic_max (&peak_stack, current_stack);
> +  atomic_max (&peak_stack, current_stack);
> 
>   /* Add up heap and stack usage and compare it with the maximum value.  */
> -  catomic_max (&peak_total, heap + current_stack);
> +  atomic_max (&peak_total, heap + current_stack);
> 
>   /* Store the value only if we are writing to a file.  */
>   if (fd != -1)
>     {
> -      uint32_t idx = catomic_exchange_and_add (&buffer_cnt, 1);
> +      uint32_t idx = atomic_fetch_add_relaxed (&buffer_cnt, 1);
>       if (idx + 1 >= 2 * buffer_size)
>         {
>           /* We try to reset the counter to the correct range.  If
> @@ -188,7 +188,7 @@ update_data (struct header *result, size_t len, size_t old_len)
>              counter it does not matter since that thread will take
>              care of the correction.  */
>           uint32_t reset = (idx + 1) % (2 * buffer_size);
> -          catomic_compare_and_exchange_val_acq (&buffer_cnt, reset, idx + 1);
> +          atomic_compare_and_exchange_val_acq (&buffer_cnt, reset, idx + 1);
>           if (idx >= 2 * buffer_size)
>             idx = reset - 1;
>         }
> @@ -362,24 +362,24 @@ malloc (size_t len)
>     return (*mallocp)(len);
> 
>   /* Keep track of number of calls.  */
> -  catomic_increment (&calls[idx_malloc]);
> +  atomic_fetch_add_relaxed (&calls[idx_malloc], 1);
>   /* Keep track of total memory consumption for `malloc'.  */
> -  catomic_add (&total[idx_malloc], len);
> +  atomic_fetch_add_relaxed (&total[idx_malloc], len);
>   /* Keep track of total memory requirement.  */
> -  catomic_add (&grand_total, len);
> +  atomic_fetch_add_relaxed (&grand_total, len);
>   /* Remember the size of the request.  */
>   if (len < 65536)
> -    catomic_increment (&histogram[len / 16]);
> +    atomic_fetch_add_relaxed (&histogram[len / 16], 1);
>   else
> -    catomic_increment (&large);
> +    atomic_fetch_add_relaxed (&large, 1);
>   /* Total number of calls of any of the functions.  */
> -  catomic_increment (&calls_total);
> +  atomic_fetch_add_relaxed (&calls_total, 1);
> 
>   /* Do the real work.  */
>   result = (struct header *) (*mallocp)(len + sizeof (struct header));
>   if (result == NULL)
>     {
> -      catomic_increment (&failed[idx_malloc]);
> +      atomic_fetch_add_relaxed (&failed[idx_malloc], 1);
>       return NULL;
>     }
> 
> @@ -430,21 +430,21 @@ realloc (void *old, size_t len)
>     }
> 
>   /* Keep track of number of calls.  */
> -  catomic_increment (&calls[idx_realloc]);
> +  atomic_fetch_add_relaxed (&calls[idx_realloc], 1);
>   if (len > old_len)
>     {
>       /* Keep track of total memory consumption for `realloc'.  */
> -      catomic_add (&total[idx_realloc], len - old_len);
> +      atomic_fetch_add_relaxed (&total[idx_realloc], len - old_len);
>       /* Keep track of total memory requirement.  */
> -      catomic_add (&grand_total, len - old_len);
> +      atomic_fetch_add_relaxed (&grand_total, len - old_len);
>     }
> 
>   if (len == 0 && old != NULL)
>     {
>       /* Special case.  */
> -      catomic_increment (&realloc_free);
> +      atomic_fetch_add_relaxed (&realloc_free, 1);
>       /* Keep track of total memory freed using `free'.  */
> -      catomic_add (&total[idx_free], real->length);
> +      atomic_fetch_add_relaxed (&total[idx_free], real->length);
> 
>       /* Update the allocation data and write out the records if necessary.  */
>       update_data (NULL, 0, old_len);
> @@ -457,26 +457,26 @@ realloc (void *old, size_t len)
> 
>   /* Remember the size of the request.  */
>   if (len < 65536)
> -    catomic_increment (&histogram[len / 16]);
> +    atomic_fetch_add_relaxed (&histogram[len / 16], 1);
>   else
> -    catomic_increment (&large);
> +    atomic_fetch_add_relaxed (&large, 1);
>   /* Total number of calls of any of the functions.  */
> -  catomic_increment (&calls_total);
> +  atomic_fetch_add_relaxed (&calls_total, 1);
> 
>   /* Do the real work.  */
>   result = (struct header *) (*reallocp)(real, len + sizeof (struct header));
>   if (result == NULL)
>     {
> -      catomic_increment (&failed[idx_realloc]);
> +      atomic_fetch_add_relaxed (&failed[idx_realloc], 1);
>       return NULL;
>     }
> 
>   /* Record whether the reduction/increase happened in place.  */
>   if (real == result)
> -    catomic_increment (&inplace);
> +    atomic_fetch_add_relaxed (&inplace, 1);
>   /* Was the buffer increased?  */
>   if (old_len > len)
> -    catomic_increment (&decreasing);
> +    atomic_fetch_add_relaxed (&decreasing, 1);
> 
>   /* Update the allocation data and write out the records if necessary.  */
>   update_data (result, len, old_len);
> @@ -508,16 +508,16 @@ calloc (size_t n, size_t len)
>     return (*callocp)(n, len);
> 
>   /* Keep track of number of calls.  */
> -  catomic_increment (&calls[idx_calloc]);
> +  atomic_fetch_add_relaxed (&calls[idx_calloc], 1);
>   /* Keep track of total memory consumption for `calloc'.  */
> -  catomic_add (&total[idx_calloc], size);
> +  atomic_fetch_add_relaxed (&total[idx_calloc], size);
>   /* Keep track of total memory requirement.  */
> -  catomic_add (&grand_total, size);
> +  atomic_fetch_add_relaxed (&grand_total, size);
>   /* Remember the size of the request.  */
>   if (size < 65536)
> -    catomic_increment (&histogram[size / 16]);
> +    atomic_fetch_add_relaxed (&histogram[size / 16], 1);
>   else
> -    catomic_increment (&large);
> +    atomic_fetch_add_relaxed (&large, 1);
>   /* Total number of calls of any of the functions.  */
>   ++calls_total;
> 
> @@ -525,7 +525,7 @@ calloc (size_t n, size_t len)
>   result = (struct header *) (*mallocp)(size + sizeof (struct header));
>   if (result == NULL)
>     {
> -      catomic_increment (&failed[idx_calloc]);
> +      atomic_fetch_add_relaxed (&failed[idx_calloc], 1);
>       return NULL;
>     }
> 
> @@ -563,7 +563,7 @@ free (void *ptr)
>   /* `free (NULL)' has no effect.  */
>   if (ptr == NULL)
>     {
> -      catomic_increment (&calls[idx_free]);
> +      atomic_fetch_add_relaxed (&calls[idx_free], 1);
>       return;
>     }
> 
> @@ -577,9 +577,9 @@ free (void *ptr)
>     }
> 
>   /* Keep track of number of calls.  */
> -  catomic_increment (&calls[idx_free]);
> +  atomic_fetch_add_relaxed (&calls[idx_free], 1);
>   /* Keep track of total memory freed using `free'.  */
> -  catomic_add (&total[idx_free], real->length);
> +  atomic_fetch_add_relaxed (&total[idx_free], real->length);
> 
>   /* Update the allocation data and write out the records if necessary.  */
>   update_data (NULL, 0, real->length);
> @@ -614,22 +614,22 @@ mmap (void *start, size_t len, int prot, int flags, int fd, off_t offset)
>                  ? idx_mmap_a : prot & PROT_WRITE ? idx_mmap_w : idx_mmap_r);
> 
>       /* Keep track of number of calls.  */
> -      catomic_increment (&calls[idx]);
> +      atomic_fetch_add_relaxed (&calls[idx], 1);
>       /* Keep track of total memory consumption for `malloc'.  */
> -      catomic_add (&total[idx], len);
> +      atomic_fetch_add_relaxed (&total[idx], len);
>       /* Keep track of total memory requirement.  */
> -      catomic_add (&grand_total, len);
> +      atomic_fetch_add_relaxed (&grand_total, len);
>       /* Remember the size of the request.  */
>       if (len < 65536)
> -        catomic_increment (&histogram[len / 16]);
> +        atomic_fetch_add_relaxed (&histogram[len / 16], 1);
>       else
> -        catomic_increment (&large);
> +        atomic_fetch_add_relaxed (&large, 1);
>       /* Total number of calls of any of the functions.  */
> -      catomic_increment (&calls_total);
> +      atomic_fetch_add_relaxed (&calls_total, 1);
> 
>       /* Check for failures.  */
>       if (result == NULL)
> -        catomic_increment (&failed[idx]);
> +        atomic_fetch_add_relaxed (&failed[idx], 1);
>       else if (idx == idx_mmap_w)
>         /* Update the allocation data and write out the records if
>            necessary.  Note the first parameter is NULL which means
> @@ -667,22 +667,22 @@ mmap64 (void *start, size_t len, int prot, int flags, int fd, off64_t offset)
>                  ? idx_mmap_a : prot & PROT_WRITE ? idx_mmap_w : idx_mmap_r);
> 
>       /* Keep track of number of calls.  */
> -      catomic_increment (&calls[idx]);
> +      atomic_fetch_add_relaxed (&calls[idx], 1);
>       /* Keep track of total memory consumption for `malloc'.  */
> -      catomic_add (&total[idx], len);
> +      atomic_fetch_add_relaxed (&total[idx], len);
>       /* Keep track of total memory requirement.  */
> -      catomic_add (&grand_total, len);
> +      atomic_fetch_add_relaxed (&grand_total, len);
>       /* Remember the size of the request.  */
>       if (len < 65536)
> -        catomic_increment (&histogram[len / 16]);
> +        atomic_fetch_add_relaxed (&histogram[len / 16], 1);
>       else
> -        catomic_increment (&large);
> +        atomic_fetch_add_relaxed (&large, 1);
>       /* Total number of calls of any of the functions.  */
> -      catomic_increment (&calls_total);
> +      atomic_fetch_add_relaxed (&calls_total, 1);
> 
>       /* Check for failures.  */
>       if (result == NULL)
> -        catomic_increment (&failed[idx]);
> +        atomic_fetch_add_relaxed (&failed[idx], 1);
>       else if (idx == idx_mmap_w)
>         /* Update the allocation data and write out the records if
>            necessary.  Note the first parameter is NULL which means
> @@ -722,33 +722,33 @@ mremap (void *start, size_t old_len, size_t len, int flags, ...)
>   if (!not_me && trace_mmap)
>     {
>       /* Keep track of number of calls.  */
> -      catomic_increment (&calls[idx_mremap]);
> +      atomic_fetch_add_relaxed (&calls[idx_mremap], 1);
>       if (len > old_len)
>         {
>           /* Keep track of total memory consumption for `malloc'.  */
> -          catomic_add (&total[idx_mremap], len - old_len);
> +          atomic_fetch_add_relaxed (&total[idx_mremap], len - old_len);
>           /* Keep track of total memory requirement.  */
> -          catomic_add (&grand_total, len - old_len);
> +          atomic_fetch_add_relaxed (&grand_total, len - old_len);
>         }
>       /* Remember the size of the request.  */
>       if (len < 65536)
> -        catomic_increment (&histogram[len / 16]);
> +        atomic_fetch_add_relaxed (&histogram[len / 16], 1);
>       else
> -        catomic_increment (&large);
> +        atomic_fetch_add_relaxed (&large, 1);
>       /* Total number of calls of any of the functions.  */
> -      catomic_increment (&calls_total);
> +      atomic_fetch_add_relaxed (&calls_total, 1);
> 
>       /* Check for failures.  */
>       if (result == NULL)
> -        catomic_increment (&failed[idx_mremap]);
> +        atomic_fetch_add_relaxed (&failed[idx_mremap], 1);
>       else
>         {
>           /* Record whether the reduction/increase happened in place.  */
>           if (start == result)
> -            catomic_increment (&inplace_mremap);
> +            atomic_fetch_add_relaxed (&inplace_mremap, 1);
>           /* Was the buffer increased?  */
>           if (old_len > len)
> -            catomic_increment (&decreasing_mremap);
> +            atomic_fetch_add_relaxed (&decreasing_mremap, 1);
> 
>           /* Update the allocation data and write out the records if
>              necessary.  Note the first parameter is NULL which means
> @@ -783,19 +783,19 @@ munmap (void *start, size_t len)
>   if (!not_me && trace_mmap)
>     {
>       /* Keep track of number of calls.  */
> -      catomic_increment (&calls[idx_munmap]);
> +      atomic_fetch_add_relaxed (&calls[idx_munmap], 1);
> 
>       if (__glibc_likely (result == 0))
>         {
>           /* Keep track of total memory freed using `free'.  */
> -          catomic_add (&total[idx_munmap], len);
> +          atomic_fetch_add_relaxed (&total[idx_munmap], len);
> 
>           /* Update the allocation data and write out the records if
>              necessary.  */
>           update_data (NULL, 0, len);
>         }
>       else
> -        catomic_increment (&failed[idx_munmap]);
> +        atomic_fetch_add_relaxed (&failed[idx_munmap], 1);
>     }
> 
>   return result;
> diff --git a/manual/memory.texi b/manual/memory.texi
> index 23a039c57e60c81787252d935e3b309fd8290902..5cb1dbd281006148f23cfa38c5703fb79089ba78 100644
> --- a/manual/memory.texi
> +++ b/manual/memory.texi
> @@ -354,7 +354,7 @@ this function is in @file{stdlib.h}.
> @c that's protected by list_lock; next_free is only modified while
> @c list_lock is held too.  All other data members of an arena, as well
> @c as the metadata of the memory areas assigned to it, are only modified
> -@c while holding the arena's mutex (fastbin pointers use catomic ops
> +@c while holding the arena's mutex (fastbin pointers use atomic ops
> @c because they may be modified by free without taking the arena's
> @c lock).  Some reassurance was needed for fastbins, for it wasn't clear
> @c how they were initialized.  It turns out they are always
> @@ -383,7 +383,7 @@ this function is in @file{stdlib.h}.
> @c     mutex_lock (arena lock) dup @asulock @aculock [returns locked]
> @c    __get_nprocs ext ok @acsfd
> @c    NARENAS_FROM_NCORES ok
> -@c    catomic_compare_and_exchange_bool_acq ok
> +@c    atomic_compare_and_exchange_bool_acq ok
> @c    _int_new_arena ok @asulock @aculock @acsmem
> @c     new_heap ok @acsmem
> @c      mmap ok @acsmem
> @@ -397,7 +397,7 @@ this function is in @file{stdlib.h}.
> @c     mutex_lock (list_lock) dup @asulock @aculock
> @c     atomic_write_barrier ok
> @c     mutex_unlock (list_lock) @aculock
> -@c    catomic_decrement ok
> +@c    atomic_decrement ok
> @c    reused_arena @asulock @aculock
> @c      reads&writes next_to_use and iterates over arena next without guards
> @c      those are harmless as long as we don't drop arenas from the
> @@ -414,7 +414,7 @@ this function is in @file{stdlib.h}.
> @c   get_max_fast ok
> @c   fastbin_index ok
> @c   fastbin ok
> -@c   catomic_compare_and_exhange_val_acq ok
> +@c   atomic_compare_and_exhange_val_acq ok
> @c   malloc_printerr dup @mtsenv
> @c     if we get to it, we're toast already, undefined behavior must have
> @c     been invoked before
> @@ -521,10 +521,10 @@ this function is in @file{stdlib.h}.
> @c     chunk2mem dup ok
> @c     free_perturb ok
> @c     set_fastchunks ok
> -@c      catomic_and ok
> +@c      atomic_and ok
> @c     fastbin_index dup ok
> @c     fastbin dup ok
> -@c     catomic_compare_and_exchange_val_rel ok
> +@c     atomic_compare_and_exchange_val_rel ok
> @c     chunk_is_mmapped ok
> @c     contiguous dup ok
> @c     prev_inuse ok
> @@ -706,7 +706,7 @@ The prototype for this function is in @file{stdlib.h}.
> @safety{@prelim{}@mtsafe{}@asunsafe{@asulock{}}@acunsafe{@aculock{} @acsfd{} @acsmem{}}}
> @c __libc_free @asulock @aculock @acsfd @acsmem
> @c   releasing memory into fastbins modifies the arena without taking
> -@c   its mutex, but catomic operations ensure safety.  If two (or more)
> +@c   its mutex, but atomic operations ensure safety.  If two (or more)
> @c   threads are running malloc and have their own arenas locked when
> @c   each gets a signal whose handler free()s large (non-fastbin-able)
> @c   blocks from each other's arena, we deadlock; this is a more general
> diff --git a/misc/tst-atomic.c b/misc/tst-atomic.c
> index 6d681a7bfdf4f48b4c04a073ebd480326dbd3cc8..4f9d2c1a46b363d346dbc2fa0962ae196844a43a 100644
> --- a/misc/tst-atomic.c
> +++ b/misc/tst-atomic.c
> @@ -393,117 +393,6 @@ do_test (void)
>     }
> #endif
> 
> -#ifdef catomic_compare_and_exchange_val_acq
> -  mem = 24;
> -  if (catomic_compare_and_exchange_val_acq (&mem, 35, 24) != 24
> -      || mem != 35)
> -    {
> -      puts ("catomic_compare_and_exchange_val_acq test 1 failed");
> -      ret = 1;
> -    }
> -
> -  mem = 12;
> -  if (catomic_compare_and_exchange_val_acq (&mem, 10, 15) != 12
> -      || mem != 12)
> -    {
> -      puts ("catomic_compare_and_exchange_val_acq test 2 failed");
> -      ret = 1;
> -    }
> -
> -  mem = -15;
> -  if (catomic_compare_and_exchange_val_acq (&mem, -56, -15) != -15
> -      || mem != -56)
> -    {
> -      puts ("catomic_compare_and_exchange_val_acq test 3 failed");
> -      ret = 1;
> -    }
> -
> -  mem = -1;
> -  if (catomic_compare_and_exchange_val_acq (&mem, 17, 0) != -1
> -      || mem != -1)
> -    {
> -      puts ("catomic_compare_and_exchange_val_acq test 4 failed");
> -      ret = 1;
> -    }
> -#endif
> -
> -  mem = 24;
> -  if (catomic_compare_and_exchange_bool_acq (&mem, 35, 24)
> -      || mem != 35)
> -    {
> -      puts ("catomic_compare_and_exchange_bool_acq test 1 failed");
> -      ret = 1;
> -    }
> -
> -  mem = 12;
> -  if (! catomic_compare_and_exchange_bool_acq (&mem, 10, 15)
> -      || mem != 12)
> -    {
> -      puts ("catomic_compare_and_exchange_bool_acq test 2 failed");
> -      ret = 1;
> -    }
> -
> -  mem = -15;
> -  if (catomic_compare_and_exchange_bool_acq (&mem, -56, -15)
> -      || mem != -56)
> -    {
> -      puts ("catomic_compare_and_exchange_bool_acq test 3 failed");
> -      ret = 1;
> -    }
> -
> -  mem = -1;
> -  if (! catomic_compare_and_exchange_bool_acq (&mem, 17, 0)
> -      || mem != -1)
> -    {
> -      puts ("catomic_compare_and_exchange_bool_acq test 4 failed");
> -      ret = 1;
> -    }
> -
> -  mem = 2;
> -  if (catomic_exchange_and_add (&mem, 11) != 2
> -      || mem != 13)
> -    {
> -      puts ("catomic_exchange_and_add test failed");
> -      ret = 1;
> -    }
> -
> -  mem = -21;
> -  catomic_add (&mem, 22);
> -  if (mem != 1)
> -    {
> -      puts ("catomic_add test failed");
> -      ret = 1;
> -    }
> -
> -  mem = -1;
> -  catomic_increment (&mem);
> -  if (mem != 0)
> -    {
> -      puts ("catomic_increment test failed");
> -      ret = 1;
> -    }
> -
> -  mem = 2;
> -  if (catomic_increment_val (&mem) != 3)
> -    {
> -      puts ("catomic_increment_val test failed");
> -      ret = 1;
> -    }
> -
> -  mem = 17;
> -  catomic_decrement (&mem);
> -  if (mem != 16)
> -    {
> -      puts ("catomic_decrement test failed");
> -      ret = 1;
> -    }
> -
> -  if (catomic_decrement_val (&mem) != 15)
> -    {
> -      puts ("catomic_decrement_val test failed");
> -      ret = 1;
> -    }
> -
>   /* Tests for C11-like atomics.  */
>   mem = 11;
>   if (atomic_load_relaxed (&mem) != 11 || atomic_load_acquire (&mem) != 11)
> diff --git a/sysdeps/hppa/dl-fptr.c b/sysdeps/hppa/dl-fptr.c
> index 9ed21602d6155d4b960278f8d1fac4ffa885b9d5..40bf5cd3b306315d8eeb6bdba2b2b46b1ea5059e 100644
> --- a/sysdeps/hppa/dl-fptr.c
> +++ b/sysdeps/hppa/dl-fptr.c
> @@ -41,10 +41,8 @@
> # error "ELF_MACHINE_LOAD_ADDRESS is not defined."
> #endif
> 
> -#ifndef COMPARE_AND_SWAP
> -# define COMPARE_AND_SWAP(ptr, old, new) \
> -  (catomic_compare_and_exchange_bool_acq (ptr, new, old) == 0)
> -#endif
> +#define COMPARE_AND_SWAP(ptr, old, new) \
> +  (atomic_compare_and_exchange_bool_acq (ptr, new, old) == 0)
> 
> ElfW(Addr) _dl_boot_fptr_table [ELF_MACHINE_BOOT_FPTR_TABLE_LEN];
> 
> diff --git a/sysdeps/ia64/dl-fptr.h b/sysdeps/ia64/dl-fptr.h
> index e6c34528b67e7986ac44d77eb2505889a1767e29..a3a2994db20f44dfd3280f6ad618001cbadc0539 100644
> --- a/sysdeps/ia64/dl-fptr.h
> +++ b/sysdeps/ia64/dl-fptr.h
> @@ -22,9 +22,6 @@
> #include <ia64intrin.h>
> #include <sysdeps/generic/dl-fptr.h>
> 
> -#define COMPARE_AND_SWAP(ptr, old, new)	\
> -  __sync_bool_compare_and_swap (ptr, old, new)
> -
> /* There are currently 123 dynamic symbols in ld.so.
>    ELF_MACHINE_BOOT_FPTR_TABLE_LEN needs to be at least that big.  */
> #define ELF_MACHINE_BOOT_FPTR_TABLE_LEN	200
> diff --git a/sysdeps/s390/atomic-machine.h b/sysdeps/s390/atomic-machine.h
> index e85b2ef50541c7aab6d2981180f6205d2bd681b6..6b1de51c2a30baf5554a729a80a7ce04b56fc22c 100644
> --- a/sysdeps/s390/atomic-machine.h
> +++ b/sysdeps/s390/atomic-machine.h
> @@ -70,8 +70,6 @@
>     !__atomic_compare_exchange_n (mem, (void *) &__atg2_oldval, newval,	\
> 				  1, __ATOMIC_ACQUIRE,			\
> 				  __ATOMIC_RELAXED); })
> -#define catomic_compare_and_exchange_bool_acq(mem, newval, oldval)	\
> -  atomic_compare_and_exchange_bool_acq (mem, newval, oldval)
> 
> /* Store NEWVALUE in *MEM and return the old value.  */
> #define atomic_exchange_acq(mem, newvalue)				\
> @@ -90,8 +88,6 @@
> # define atomic_exchange_and_add_rel(mem, operand)			\
>   ({ __atomic_check_size((mem));					\
>   __atomic_fetch_add ((mem), (operand), __ATOMIC_RELEASE); })
> -#define catomic_exchange_and_add(mem, value)	\
> -  atomic_exchange_and_add (mem, value)
> 
> /* Atomically *mem |= mask and return the old value of *mem.  */
> /* The gcc builtin uses load-and-or instruction on z196 zarch and higher cpus
> @@ -104,8 +100,6 @@
>   do {						\
>     atomic_or_val (mem, mask);			\
>   } while (0)
> -#define catomic_or(mem, mask)			\
> -  atomic_or (mem, mask)
> 
> /* Atomically *mem |= 1 << bit and return true if the bit was set in old value
>    of *mem.  */
> @@ -129,5 +123,3 @@
>   do {						\
>     atomic_and_val (mem, mask);			\
>   } while (0)
> -#define catomic_and(mem, mask)			\
> -  atomic_and(mem, mask)
> diff --git a/sysdeps/unix/sysv/linux/riscv/atomic-machine.h b/sysdeps/unix/sysv/linux/riscv/atomic-machine.h
> index 9ae89e0ef12ad28319755ac51260908779b9579f..f4b2cbced828a80335887bf172fd60767cf978ac 100644
> --- a/sysdeps/unix/sysv/linux/riscv/atomic-machine.h
> +++ b/sysdeps/unix/sysv/linux/riscv/atomic-machine.h
> @@ -170,10 +170,6 @@
>   ({ typeof (*mem) __mask = (typeof (*mem))1 << (bit);    \
>      asm_amo ("amoor", ".aq", mem, __mask) & __mask; })
> 
> -# define catomic_exchange_and_add(mem, value)		\
> -  atomic_exchange_and_add (mem, value)
> -# define catomic_max(mem, value) atomic_max (mem, value)
> -
> #else /* __riscv_atomic */
> # error "ISAs that do not subsume the A extension are not supported"
> #endif /* !__riscv_atomic */
> diff --git a/sysdeps/x86/atomic-machine.h b/sysdeps/x86/atomic-machine.h
> index f24f1c71ed718c601c71decc1ee0c4b49fdf32f8..5a48521443367b1ca94c9f153c2a163661de789c 100644
> --- a/sysdeps/x86/atomic-machine.h
> +++ b/sysdeps/x86/atomic-machine.h
> @@ -20,7 +20,6 @@
> #define _X86_ATOMIC_MACHINE_H 1
> 
> #include <stdint.h>
> -#include <tls.h>			/* For tcbhead_t.  */
> #include <libc-pointer-arith.h>		/* For cast_to_integer.  */
> 
> #define LOCK_PREFIX "lock;"
> @@ -52,52 +51,7 @@
>   (! __sync_bool_compare_and_swap (mem, oldval, newval))
> 
> 
> -#define __arch_c_compare_and_exchange_val_8_acq(mem, newval, oldval) \
> -  ({ __typeof (*mem) ret;						      \
> -     __asm __volatile ("cmpl $0, %%" SEG_REG ":%P5\n\t"			      \
> -		       "je 0f\n\t"					      \
> -		       "lock\n"						      \
> -		       "0:\tcmpxchgb %b2, %1"				      \
> -		       : "=a" (ret), "=m" (*mem)			      \
> -		       : BR_CONSTRAINT (newval), "m" (*mem), "0" (oldval),    \
> -			 "i" (offsetof (tcbhead_t, multiple_threads)));	      \
> -     ret; })
> -
> -#define __arch_c_compare_and_exchange_val_16_acq(mem, newval, oldval) \
> -  ({ __typeof (*mem) ret;						      \
> -     __asm __volatile ("cmpl $0, %%" SEG_REG ":%P5\n\t"			      \
> -		       "je 0f\n\t"					      \
> -		       "lock\n"						      \
> -		       "0:\tcmpxchgw %w2, %1"				      \
> -		       : "=a" (ret), "=m" (*mem)			      \
> -		       : BR_CONSTRAINT (newval), "m" (*mem), "0" (oldval),    \
> -			 "i" (offsetof (tcbhead_t, multiple_threads)));	      \
> -     ret; })
> -
> -#define __arch_c_compare_and_exchange_val_32_acq(mem, newval, oldval) \
> -  ({ __typeof (*mem) ret;						      \
> -     __asm __volatile ("cmpl $0, %%" SEG_REG ":%P5\n\t"			      \
> -		       "je 0f\n\t"					      \
> -		       "lock\n"						      \
> -		       "0:\tcmpxchgl %2, %1"				      \
> -		       : "=a" (ret), "=m" (*mem)			      \
> -		       : BR_CONSTRAINT (newval), "m" (*mem), "0" (oldval),    \
> -			 "i" (offsetof (tcbhead_t, multiple_threads)));       \
> -     ret; })
> -
> #ifdef __x86_64__
> -# define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval) \
> -  ({ __typeof (*mem) ret;						      \
> -     __asm __volatile ("cmpl $0, %%fs:%P5\n\t"				      \
> -		       "je 0f\n\t"					      \
> -		       "lock\n"						      \
> -		       "0:\tcmpxchgq %q2, %1"				      \
> -		       : "=a" (ret), "=m" (*mem)			      \
> -		       : "q" ((int64_t) cast_to_integer (newval)),	      \
> -			 "m" (*mem),					      \
> -			 "0" ((int64_t) cast_to_integer (oldval)),	      \
> -			 "i" (offsetof (tcbhead_t, multiple_threads)));	      \
> -     ret; })
> # define do_exchange_and_add_val_64_acq(pfx, mem, value) 0
> # define do_add_val_64_acq(pfx, mem, value) do { } while (0)
> #else
> @@ -107,13 +61,6 @@
>    such an operation.  So don't define any code for now.  If it is
>    really going to be used the code below can be used on Intel Pentium
>    and later, but NOT on i486.  */
> -# define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval) \
> -  ({ __typeof (*mem) ret = *(mem);					      \
> -     __atomic_link_error ();						      \
> -     ret = (newval);							      \
> -     ret = (oldval);							      \
> -     ret; })
> -
> # define __arch_compare_and_exchange_val_64_acq(mem, newval, oldval)	      \
>   ({ __typeof (*mem) ret = *(mem);					      \
>      __atomic_link_error ();						      \
> @@ -181,24 +128,20 @@
>      if (sizeof (*mem) == 1)						      \
>        __asm __volatile (lock "xaddb %b0, %1"				      \
> 			 : "=q" (__result), "=m" (*mem)			      \
> -			 : "0" (__addval), "m" (*mem),			      \
> -			   "i" (offsetof (tcbhead_t, multiple_threads)));     \
> +			 : "0" (__addval), "m" (*mem));			      \
>      else if (sizeof (*mem) == 2)					      \
>        __asm __volatile (lock "xaddw %w0, %1"				      \
> 			 : "=r" (__result), "=m" (*mem)			      \
> -			 : "0" (__addval), "m" (*mem),			      \
> -			   "i" (offsetof (tcbhead_t, multiple_threads)));     \
> +			 : "0" (__addval), "m" (*mem));			      \
>      else if (sizeof (*mem) == 4)					      \
>        __asm __volatile (lock "xaddl %0, %1"				      \
> 			 : "=r" (__result), "=m" (*mem)			      \
> -			 : "0" (__addval), "m" (*mem),			      \
> -			   "i" (offsetof (tcbhead_t, multiple_threads)));     \
> +			 : "0" (__addval), "m" (*mem));			      \
>      else if (__HAVE_64B_ATOMICS)					      \
>        __asm __volatile (lock "xaddq %q0, %1"				      \
> 			 : "=r" (__result), "=m" (*mem)			      \
> 			 : "0" ((int64_t) cast_to_integer (__addval)),     \
> -			   "m" (*mem),					      \
> -			   "i" (offsetof (tcbhead_t, multiple_threads)));     \
> +			   "m" (*mem));					      \
>      else								      \
>        __result = do_exchange_and_add_val_64_acq (pfx, (mem), __addval);      \
>      __result; })
> @@ -206,14 +149,6 @@
> #define atomic_exchange_and_add(mem, value) \
>   __sync_fetch_and_add (mem, value)
> 
> -#define __arch_exchange_and_add_cprefix \
> -  "cmpl $0, %%" SEG_REG ":%P4\n\tje 0f\n\tlock\n0:\t"
> -
> -#define catomic_exchange_and_add(mem, value) \
> -  __arch_exchange_and_add_body (__arch_exchange_and_add_cprefix, __arch_c,    \
> -				mem, value)
> -
> -
> #define __arch_add_body(lock, pfx, apfx, mem, value) \
>   do {									      \
>     if (__builtin_constant_p (value) && (value) == 1)			      \
> @@ -223,24 +158,20 @@
>     else if (sizeof (*mem) == 1)					      \
>       __asm __volatile (lock "addb %b1, %0"				      \
> 			: "=m" (*mem)					      \
> -			: IBR_CONSTRAINT (value), "m" (*mem),		      \
> -			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
> +			: IBR_CONSTRAINT (value), "m" (*mem));		      \
>     else if (sizeof (*mem) == 2)					      \
>       __asm __volatile (lock "addw %w1, %0"				      \
> 			: "=m" (*mem)					      \
> -			: "ir" (value), "m" (*mem),			      \
> -			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
> +			: "ir" (value), "m" (*mem));			      \
>     else if (sizeof (*mem) == 4)					      \
>       __asm __volatile (lock "addl %1, %0"				      \
> 			: "=m" (*mem)					      \
> -			: "ir" (value), "m" (*mem),			      \
> -			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
> +			: "ir" (value), "m" (*mem));			      \
>     else if (__HAVE_64B_ATOMICS)					      \
>       __asm __volatile (lock "addq %q1, %0"				      \
> 			: "=m" (*mem)					      \
> 			: "ir" ((int64_t) cast_to_integer (value)),	      \
> -			  "m" (*mem),					      \
> -			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
> +			  "m" (*mem));					      \
>     else								      \
>       do_add_val_64_acq (apfx, (mem), (value));				      \
>   } while (0)
> @@ -248,13 +179,6 @@
> # define atomic_add(mem, value) \
>   __arch_add_body (LOCK_PREFIX, atomic, __arch, mem, value)
> 
> -#define __arch_add_cprefix \
> -  "cmpl $0, %%" SEG_REG ":%P3\n\tje 0f\n\tlock\n0:\t"
> -
> -#define catomic_add(mem, value) \
> -  __arch_add_body (__arch_add_cprefix, atomic, __arch_c, mem, value)
> -
> -
> #define atomic_add_negative(mem, value) \
>   ({ unsigned char __result;						      \
>      if (sizeof (*mem) == 1)						      \
> @@ -308,36 +232,25 @@
>     if (sizeof (*mem) == 1)						      \
>       __asm __volatile (lock "incb %b0"					      \
> 			: "=m" (*mem)					      \
> -			: "m" (*mem),					      \
> -			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
> +			: "m" (*mem));					      \
>     else if (sizeof (*mem) == 2)					      \
>       __asm __volatile (lock "incw %w0"					      \
> 			: "=m" (*mem)					      \
> -			: "m" (*mem),					      \
> -			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
> +			: "m" (*mem));					      \
>     else if (sizeof (*mem) == 4)					      \
>       __asm __volatile (lock "incl %0"					      \
> 			: "=m" (*mem)					      \
> -			: "m" (*mem),					      \
> -			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
> +			: "m" (*mem));					      \
>     else if (__HAVE_64B_ATOMICS)					      \
>       __asm __volatile (lock "incq %q0"					      \
> 			: "=m" (*mem)					      \
> -			: "m" (*mem),					      \
> -			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
> +			: "m" (*mem));					      \
>     else								      \
>       do_add_val_64_acq (pfx, mem, 1);					      \
>   } while (0)
> 
> #define atomic_increment(mem) __arch_increment_body (LOCK_PREFIX, __arch, mem)
> 
> -#define __arch_increment_cprefix \
> -  "cmpl $0, %%" SEG_REG ":%P2\n\tje 0f\n\tlock\n0:\t"
> -
> -#define catomic_increment(mem) \
> -  __arch_increment_body (__arch_increment_cprefix, __arch_c, mem)
> -
> -
> #define atomic_increment_and_test(mem) \
>   ({ unsigned char __result;						      \
>      if (sizeof (*mem) == 1)						      \
> @@ -366,36 +279,25 @@
>     if (sizeof (*mem) == 1)						      \
>       __asm __volatile (lock "decb %b0"					      \
> 			: "=m" (*mem)					      \
> -			: "m" (*mem),					      \
> -			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
> +			: "m" (*mem));					      \
>     else if (sizeof (*mem) == 2)					      \
>       __asm __volatile (lock "decw %w0"					      \
> 			: "=m" (*mem)					      \
> -			: "m" (*mem),					      \
> -			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
> +			: "m" (*mem));					      \
>     else if (sizeof (*mem) == 4)					      \
>       __asm __volatile (lock "decl %0"					      \
> 			: "=m" (*mem)					      \
> -			: "m" (*mem),					      \
> -			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
> +			: "m" (*mem));					      \
>     else if (__HAVE_64B_ATOMICS)					      \
>       __asm __volatile (lock "decq %q0"					      \
> 			: "=m" (*mem)					      \
> -			: "m" (*mem),					      \
> -			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
> +			: "m" (*mem));					      \
>     else								      \
>       do_add_val_64_acq (pfx, mem, -1);					      \
>   } while (0)
> 
> #define atomic_decrement(mem) __arch_decrement_body (LOCK_PREFIX, __arch, mem)
> 
> -#define __arch_decrement_cprefix \
> -  "cmpl $0, %%" SEG_REG ":%P2\n\tje 0f\n\tlock\n0:\t"
> -
> -#define catomic_decrement(mem) \
> -  __arch_decrement_body (__arch_decrement_cprefix, __arch_c, mem)
> -
> -
> #define atomic_decrement_and_test(mem) \
>   ({ unsigned char __result;						      \
>      if (sizeof (*mem) == 1)						      \
> @@ -472,65 +374,49 @@
>     if (sizeof (*mem) == 1)						      \
>       __asm __volatile (lock "andb %b1, %0"				      \
> 			: "=m" (*mem)					      \
> -			: IBR_CONSTRAINT (mask), "m" (*mem),		      \
> -			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
> +			: IBR_CONSTRAINT (mask), "m" (*mem));		      \
>     else if (sizeof (*mem) == 2)					      \
>       __asm __volatile (lock "andw %w1, %0"				      \
> 			: "=m" (*mem)					      \
> -			: "ir" (mask), "m" (*mem),			      \
> -			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
> +			: "ir" (mask), "m" (*mem));			      \
>     else if (sizeof (*mem) == 4)					      \
>       __asm __volatile (lock "andl %1, %0"				      \
> 			: "=m" (*mem)					      \
> -			: "ir" (mask), "m" (*mem),			      \
> -			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
> +			: "ir" (mask), "m" (*mem));			      \
>     else if (__HAVE_64B_ATOMICS)					      \
>       __asm __volatile (lock "andq %q1, %0"				      \
> 			: "=m" (*mem)					      \
> -			: "ir" (mask), "m" (*mem),			      \
> -			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
> +			: "ir" (mask), "m" (*mem));			      \
>     else								      \
>       __atomic_link_error ();						      \
>   } while (0)
> 
> -#define __arch_cprefix \
> -  "cmpl $0, %%" SEG_REG ":%P3\n\tje 0f\n\tlock\n0:\t"
> -
> #define atomic_and(mem, mask) __arch_and_body (LOCK_PREFIX, mem, mask)
> 
> -#define catomic_and(mem, mask) __arch_and_body (__arch_cprefix, mem, mask)
> -
> -
> #define __arch_or_body(lock, mem, mask) \
>   do {									      \
>     if (sizeof (*mem) == 1)						      \
>       __asm __volatile (lock "orb %b1, %0"				      \
> 			: "=m" (*mem)					      \
> -			: IBR_CONSTRAINT (mask), "m" (*mem),		      \
> -			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
> +			: IBR_CONSTRAINT (mask), "m" (*mem));		      \
>     else if (sizeof (*mem) == 2)					      \
>       __asm __volatile (lock "orw %w1, %0"				      \
> 			: "=m" (*mem)					      \
> -			: "ir" (mask), "m" (*mem),			      \
> -			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
> +			: "ir" (mask), "m" (*mem));			      \
>     else if (sizeof (*mem) == 4)					      \
>       __asm __volatile (lock "orl %1, %0"				      \
> 			: "=m" (*mem)					      \
> -			: "ir" (mask), "m" (*mem),			      \
> -			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
> +			: "ir" (mask), "m" (*mem));			      \
>     else if (__HAVE_64B_ATOMICS)					      \
>       __asm __volatile (lock "orq %q1, %0"				      \
> 			: "=m" (*mem)					      \
> -			: "ir" (mask), "m" (*mem),			      \
> -			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
> +			: "ir" (mask), "m" (*mem));			      \
>     else								      \
>       __atomic_link_error ();						      \
>   } while (0)
> 
> #define atomic_or(mem, mask) __arch_or_body (LOCK_PREFIX, mem, mask)
> 
> -#define catomic_or(mem, mask) __arch_or_body (__arch_cprefix, mem, mask)
> -
> /* We don't use mfence because it is supposedly slower due to having to
>    provide stronger guarantees (e.g., regarding self-modifying code).  */
> #define atomic_full_barrier() \
>
Wilco Dijkstra July 5, 2022, 11:16 a.m. UTC | #4
Hi Adhemerval,

> Since this patch removes a x86 optimization (sorry, I realized it after my review), 
> I think it would be better if circle back and first get my single-thread refactor
> patches in (which fixes SINGLE_THREAD_P syscall on aarch64 and other architectures)
> since it does not change x86.

It's a typical target "optimization" - not only slower but also functionally incorrect...
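
For reference, this is roughly what that "optimization" amounted to (a
hand-written C approximation, not the actual asm macros): skip the lock
prefix whenever the TCB says the process is single-threaded.

  /* Approximation of the removed x86 catomic_increment: only pay for a
     lock-prefixed instruction when other threads exist.  */
  static inline void
  catomic_style_increment (unsigned int *mem, int multiple_threads)
  {
    if (multiple_threads)
      /* Compiles to a lock-prefixed RMW on x86.  */
      __atomic_fetch_add (mem, 1, __ATOMIC_RELAXED);
    else
      /* Plain increment, atomic only within this thread.  */
      *mem += 1;
  }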

> After we can then remove the unused catomic operation and make the single-thread
> optimization locking generic (so we can finally remove x86 arch-specific bits).

I'm not sure I'm following - the catomics are not used in any locks or in
performance-critical code that could benefit from single-threaded optimizations.
In fact my patch improves performance by using much faster relaxed atomics
(since all we need is atomicity for the counter increments).
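
To make the counter case concrete, this is all such an update needs (a
minimal sketch using the compiler builtin that atomic_fetch_add_relaxed
maps to; the names are made up for illustration):

  /* A statistics counter only needs the read-modify-write to be atomic,
     not any ordering with respect to surrounding memory accesses.  */
  static unsigned long calls_total_counter;

  static inline unsigned long
  count_call (void)
  {
    return __atomic_fetch_add (&calls_total_counter, 1, __ATOMIC_RELAXED);
  }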

Single-threaded locking optimizations are completely independent of all this, and so is
your SINGLE_THREAD_P patch.

So the only concern here is rebase clashes due to your patch rewriting all the x86 code.
My point is that this is unnecessary. The catomics are useless and all of the target-specific
code can be removed since it is either unused already or will be soon after follow-up
patches. So why rewrite it all first? It just seems like a lot of work for no gain...

Cheers,
Wilco
Adhemerval Zanella Netto July 6, 2022, 12:15 p.m. UTC | #5
> On 5 Jul 2022, at 08:16, Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
> 
> Hi Adhemerval,
> 
>> Since this patch removes a x86 optimization (sorry, I realized it after my review), 
>> I think it would be better if circle back and first get my single-thread refactor
>> patches in (which fixes SINGLE_THREAD_P syscall on aarch64 and other architectures)
>> since it does not change x86.
> 
> It's a typical target "optimization" - not only slower but also functionally incorrect...

My idea is just to avoid unintentional breakage or performance regressions
on some architectures.

> 
>> After we can then remove the unused catomic operation and make the single-thread
>> optimization locking generic (so we can finally remove x86 arch-specific bits).
> 
> I'm not sure I'm following - the catomics are not used in any locks or in performance
> critical code that could benefit from single-threaded optimizations. In fact my patch
> improves performance by using much faster relaxed atomics (since all we need is 
> atomicity for the counter increments).

Initially I thought atomics were being used on x86 in the malloc code, but
while working on removing the old atomic usage I see that currently they only
make a difference on some counters, where relaxed atomics are indeed a better
solution (since you already added the single-thread path to the generic
code).

So I withdraw my objection; the patch approach is ok.  You might
need to send a newer version, or, if you prefer, I can adjust it and
add my single-thread.h removal on top of it.

> 
> Single-threaded locking optimizations are completely independent of all this, and so is
> your SINGLE_THREAD_P patch.
> 
> So the only concern here is rebase clashes due to your patch rewriting all the x86 code.
> My point is that this unnecessary. The catomics are useless and all of the target specific
> code can be removed since it is either unused already or will be soon after follow-up
> patches. So why first rewrite it all? It just seems lots of work for no gain...
> 
> Cheers,
> Wilco

Patch

diff --git a/elf/dl-fptr.c b/elf/dl-fptr.c
index 6645a260b809ecd521796e0d1adee56b3e0bd993..d6e63b807b597b886562657da2d007fc9053be72 100644
--- a/elf/dl-fptr.c
+++ b/elf/dl-fptr.c
@@ -40,7 +40,7 @@ 
 
 #ifndef COMPARE_AND_SWAP
 # define COMPARE_AND_SWAP(ptr, old, new) \
-  (catomic_compare_and_exchange_bool_acq (ptr, new, old) == 0)
+  (atomic_compare_and_exchange_bool_acq (ptr, new, old) == 0)
 #endif
 
 ElfW(Addr) _dl_boot_fptr_table [ELF_MACHINE_BOOT_FPTR_TABLE_LEN];
diff --git a/elf/dl-profile.c b/elf/dl-profile.c
index ec57e3a96552ae6460c22a0fcc819b85d486c0da..0af1f577d2d695d08edce9e13d9b39f77911b1d5 100644
--- a/elf/dl-profile.c
+++ b/elf/dl-profile.c
@@ -548,24 +548,24 @@  _dl_mcount (ElfW(Addr) frompc, ElfW(Addr) selfpc)
 	      size_t newfromidx;
 	      to_index = (data[narcs].self_pc
 			  / (HASHFRACTION * sizeof (*tos)));
-	      newfromidx = catomic_exchange_and_add (&fromidx, 1) + 1;
+	      newfromidx = atomic_fetch_add_relaxed (&fromidx, 1) + 1;
 	      froms[newfromidx].here = &data[narcs];
 	      froms[newfromidx].link = tos[to_index];
 	      tos[to_index] = newfromidx;
-	      catomic_increment (&narcs);
+	      atomic_fetch_add_relaxed (&narcs, 1);
 	    }
 
 	  /* If we still have no entry stop searching and insert.  */
 	  if (*topcindex == 0)
 	    {
-	      unsigned int newarc = catomic_exchange_and_add (narcsp, 1);
+	      unsigned int newarc = atomic_fetch_add_relaxed (narcsp, 1);
 
 	      /* In rare cases it could happen that all entries in FROMS are
 		 occupied.  So we cannot count this anymore.  */
 	      if (newarc >= fromlimit)
 		goto done;
 
-	      *topcindex = catomic_exchange_and_add (&fromidx, 1) + 1;
+	      *topcindex = atomic_fetch_add_relaxed (&fromidx, 1) + 1;
 	      fromp = &froms[*topcindex];
 
 	      fromp->here = &data[newarc];
@@ -573,7 +573,7 @@  _dl_mcount (ElfW(Addr) frompc, ElfW(Addr) selfpc)
 	      data[newarc].self_pc = selfpc;
 	      data[newarc].count = 0;
 	      fromp->link = 0;
-	      catomic_increment (&narcs);
+	      atomic_fetch_add_relaxed (&narcs, 1);
 
 	      break;
 	    }
@@ -586,7 +586,7 @@  _dl_mcount (ElfW(Addr) frompc, ElfW(Addr) selfpc)
     }
 
   /* Increment the counter.  */
-  catomic_increment (&fromp->here->count);
+  atomic_fetch_add_relaxed (&fromp->here->count, 1);
 
  done:
   ;
diff --git a/include/atomic.h b/include/atomic.h
index 2cb52c9cfd894308b97b97a04dd574b2287bf1b2..264db9a0b7619ff6520f84a19c53c1eb9a3b42a3 100644
--- a/include/atomic.h
+++ b/include/atomic.h
@@ -24,13 +24,6 @@ 
    - atomic arithmetic and logic operation on memory.  They all
      have the prefix "atomic_".
 
-   - conditionally atomic operations of the same kinds.  These
-     always behave identical but can be faster when atomicity
-     is not really needed since only one thread has access to
-     the memory location.  In that case the code is slower in
-     the multi-thread case.  The interfaces have the prefix
-     "catomic_".
-
    - support functions like barriers.  They also have the prefix
      "atomic_".
 
@@ -93,29 +86,6 @@ 
 #endif
 
 
-#ifndef catomic_compare_and_exchange_val_acq
-# ifdef __arch_c_compare_and_exchange_val_32_acq
-#  define catomic_compare_and_exchange_val_acq(mem, newval, oldval) \
-  __atomic_val_bysize (__arch_c_compare_and_exchange_val,acq,		      \
-		       mem, newval, oldval)
-# else
-#  define catomic_compare_and_exchange_val_acq(mem, newval, oldval) \
-  atomic_compare_and_exchange_val_acq (mem, newval, oldval)
-# endif
-#endif
-
-
-#ifndef catomic_compare_and_exchange_val_rel
-# ifndef atomic_compare_and_exchange_val_rel
-#  define catomic_compare_and_exchange_val_rel(mem, newval, oldval)	      \
-  catomic_compare_and_exchange_val_acq (mem, newval, oldval)
-# else
-#  define catomic_compare_and_exchange_val_rel(mem, newval, oldval)	      \
-  atomic_compare_and_exchange_val_rel (mem, newval, oldval)
-# endif
-#endif
-
-
 #ifndef atomic_compare_and_exchange_val_rel
 # define atomic_compare_and_exchange_val_rel(mem, newval, oldval)	      \
   atomic_compare_and_exchange_val_acq (mem, newval, oldval)
@@ -141,23 +111,6 @@ 
 #endif
 
 
-#ifndef catomic_compare_and_exchange_bool_acq
-# ifdef __arch_c_compare_and_exchange_bool_32_acq
-#  define catomic_compare_and_exchange_bool_acq(mem, newval, oldval) \
-  __atomic_bool_bysize (__arch_c_compare_and_exchange_bool,acq,		      \
-		        mem, newval, oldval)
-# else
-#  define catomic_compare_and_exchange_bool_acq(mem, newval, oldval) \
-  ({ /* Cannot use __oldval here, because macros later in this file might     \
-	call this macro with __oldval argument.	 */			      \
-     __typeof (oldval) __atg4_old = (oldval);				      \
-     catomic_compare_and_exchange_val_acq (mem, newval, __atg4_old)	      \
-       != __atg4_old;							      \
-  })
-# endif
-#endif
-
-
 /* Store NEWVALUE in *MEM and return the old value.  */
 #ifndef atomic_exchange_acq
 # define atomic_exchange_acq(mem, newvalue) \
@@ -212,24 +165,6 @@ 
   atomic_exchange_and_add_acq(mem, value)
 #endif
 
-#ifndef catomic_exchange_and_add
-# define catomic_exchange_and_add(mem, value) \
-  ({ __typeof (*(mem)) __atg7_oldv;					      \
-     __typeof (mem) __atg7_memp = (mem);				      \
-     __typeof (*(mem)) __atg7_value = (value);				      \
-									      \
-     do									      \
-       __atg7_oldv = *__atg7_memp;					      \
-     while (__builtin_expect						      \
-	    (catomic_compare_and_exchange_bool_acq (__atg7_memp,	      \
-						    __atg7_oldv		      \
-						    + __atg7_value,	      \
-						    __atg7_oldv), 0));	      \
-									      \
-     __atg7_oldv; })
-#endif
-
-
 #ifndef atomic_max
 # define atomic_max(mem, value) \
   do {									      \
@@ -246,25 +181,6 @@ 
   } while (0)
 #endif
 
-
-#ifndef catomic_max
-# define catomic_max(mem, value) \
-  do {									      \
-    __typeof (*(mem)) __atg9_oldv;					      \
-    __typeof (mem) __atg9_memp = (mem);					      \
-    __typeof (*(mem)) __atg9_value = (value);				      \
-    do {								      \
-      __atg9_oldv = *__atg9_memp;					      \
-      if (__atg9_oldv >= __atg9_value)					      \
-	break;								      \
-    } while (__builtin_expect						      \
-	     (catomic_compare_and_exchange_bool_acq (__atg9_memp,	      \
-						     __atg9_value,	      \
-						     __atg9_oldv), 0));	      \
-  } while (0)
-#endif
-
-
 #ifndef atomic_min
 # define atomic_min(mem, value) \
   do {									      \
@@ -288,32 +204,16 @@ 
 #endif
 
 
-#ifndef catomic_add
-# define catomic_add(mem, value) \
-  (void) catomic_exchange_and_add ((mem), (value))
-#endif
-
-
 #ifndef atomic_increment
 # define atomic_increment(mem) atomic_add ((mem), 1)
 #endif
 
 
-#ifndef catomic_increment
-# define catomic_increment(mem) catomic_add ((mem), 1)
-#endif
-
-
 #ifndef atomic_increment_val
 # define atomic_increment_val(mem) (atomic_exchange_and_add ((mem), 1) + 1)
 #endif
 
 
-#ifndef catomic_increment_val
-# define catomic_increment_val(mem) (catomic_exchange_and_add ((mem), 1) + 1)
-#endif
-
-
 /* Add one to *MEM and return true iff it's now zero.  */
 #ifndef atomic_increment_and_test
 # define atomic_increment_and_test(mem) \
@@ -326,21 +226,11 @@ 
 #endif
 
 
-#ifndef catomic_decrement
-# define catomic_decrement(mem) catomic_add ((mem), -1)
-#endif
-
-
 #ifndef atomic_decrement_val
 # define atomic_decrement_val(mem) (atomic_exchange_and_add ((mem), -1) - 1)
 #endif
 
 
-#ifndef catomic_decrement_val
-# define catomic_decrement_val(mem) (catomic_exchange_and_add ((mem), -1) - 1)
-#endif
-
-
 /* Subtract 1 from *MEM and return true iff it's now zero.  */
 #ifndef atomic_decrement_and_test
 # define atomic_decrement_and_test(mem) \
@@ -421,22 +311,6 @@ 
   } while (0)
 #endif
 
-#ifndef catomic_and
-# define catomic_and(mem, mask) \
-  do {									      \
-    __typeof (*(mem)) __atg20_old;					      \
-    __typeof (mem) __atg20_memp = (mem);				      \
-    __typeof (*(mem)) __atg20_mask = (mask);				      \
-									      \
-    do									      \
-      __atg20_old = (*__atg20_memp);					      \
-    while (__builtin_expect						      \
-	   (catomic_compare_and_exchange_bool_acq (__atg20_memp,	      \
-						   __atg20_old & __atg20_mask,\
-						   __atg20_old), 0));	      \
-  } while (0)
-#endif
-
 /* Atomically *mem &= mask and return the old value of *mem.  */
 #ifndef atomic_and_val
 # define atomic_and_val(mem, mask) \
@@ -471,22 +345,6 @@ 
   } while (0)
 #endif
 
-#ifndef catomic_or
-# define catomic_or(mem, mask) \
-  do {									      \
-    __typeof (*(mem)) __atg18_old;					      \
-    __typeof (mem) __atg18_memp = (mem);				      \
-    __typeof (*(mem)) __atg18_mask = (mask);				      \
-									      \
-    do									      \
-      __atg18_old = (*__atg18_memp);					      \
-    while (__builtin_expect						      \
-	   (catomic_compare_and_exchange_bool_acq (__atg18_memp,	      \
-						   __atg18_old | __atg18_mask,\
-						   __atg18_old), 0));	      \
-  } while (0)
-#endif
-
 /* Atomically *mem |= mask and return the old value of *mem.  */
 #ifndef atomic_or_val
 # define atomic_or_val(mem, mask) \
diff --git a/malloc/arena.c b/malloc/arena.c
index 0a684a720d9648953073bd7d35faca100762c031..7c74a18381a4be5fe6bcb94b38a62dbfa6b674f4 100644
--- a/malloc/arena.c
+++ b/malloc/arena.c
@@ -958,11 +958,11 @@  arena_get2 (size_t size, mstate avoid_arena)
          enough address space to create that many arenas.  */
       if (__glibc_unlikely (n <= narenas_limit - 1))
         {
-          if (catomic_compare_and_exchange_bool_acq (&narenas, n + 1, n))
+          if (atomic_compare_and_exchange_bool_acq (&narenas, n + 1, n))
             goto repeat;
           a = _int_new_arena (size);
 	  if (__glibc_unlikely (a == NULL))
-            catomic_decrement (&narenas);
+            atomic_fetch_add_relaxed (&narenas, -1);
         }
       else
         a = reused_arena (avoid_arena);
diff --git a/malloc/malloc.c b/malloc/malloc.c
index 09e5ff2bce5d79b3754687db3aef314640d336eb..a335cd23e18ee09027a8e7a71d808bfa1343e06d 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -2486,11 +2486,11 @@  sysmalloc_mmap (INTERNAL_SIZE_T nb, size_t pagesize, int extra_flags, mstate av)
     }
 
   /* update statistics */
-  int new = atomic_exchange_and_add (&mp_.n_mmaps, 1) + 1;
+  int new = atomic_fetch_add_relaxed (&mp_.n_mmaps, 1) + 1;
   atomic_max (&mp_.max_n_mmaps, new);
 
   unsigned long sum;
-  sum = atomic_exchange_and_add (&mp_.mmapped_mem, size) + size;
+  sum = atomic_fetch_add_relaxed (&mp_.mmapped_mem, size) + size;
   atomic_max (&mp_.max_mmapped_mem, sum);
 
   check_chunk (av, p);
@@ -3059,8 +3059,8 @@  munmap_chunk (mchunkptr p)
       || __glibc_unlikely (!powerof2 (mem & (pagesize - 1))))
     malloc_printerr ("munmap_chunk(): invalid pointer");
 
-  atomic_decrement (&mp_.n_mmaps);
-  atomic_add (&mp_.mmapped_mem, -total_size);
+  atomic_fetch_add_relaxed (&mp_.n_mmaps, -1);
+  atomic_fetch_add_relaxed (&mp_.mmapped_mem, -total_size);
 
   /* If munmap failed the process virtual memory address space is in a
      bad shape.  Just leave the block hanging around, the process will
@@ -3110,7 +3110,7 @@  mremap_chunk (mchunkptr p, size_t new_size)
   set_head (p, (new_size - offset) | IS_MMAPPED);
 
   INTERNAL_SIZE_T new;
-  new = atomic_exchange_and_add (&mp_.mmapped_mem, new_size - size - offset)
+  new = atomic_fetch_add_relaxed (&mp_.mmapped_mem, new_size - size - offset)
         + new_size - size - offset;
   atomic_max (&mp_.max_mmapped_mem, new);
   return p;
@@ -3832,7 +3832,7 @@  _int_malloc (mstate av, size_t bytes)
       if (__glibc_unlikely (pp != NULL && misaligned_chunk (pp)))       \
 	malloc_printerr ("malloc(): unaligned fastbin chunk detected"); \
     }							\
-  while ((pp = catomic_compare_and_exchange_val_acq (fb, pp, victim)) \
+  while ((pp = atomic_compare_and_exchange_val_acq (fb, pp, victim)) \
 	 != victim);					\
 
   if ((unsigned long) (nb) <= (unsigned long) (get_max_fast ()))
@@ -4550,7 +4550,7 @@  _int_free (mstate av, mchunkptr p, int have_lock)
 	  old2 = old;
 	  p->fd = PROTECT_PTR (&p->fd, old);
 	}
-      while ((old = catomic_compare_and_exchange_val_rel (fb, p, old2))
+      while ((old = atomic_compare_and_exchange_val_rel (fb, p, old2))
 	     != old2);
 
     /* Check that size of fastbin chunk at the top is the same as
diff --git a/malloc/memusage.c b/malloc/memusage.c
index f30906dffb2731c104ea375af48f59c65bcc7c9c..74712834fa8b96fb2d9589d34b34ab07d05a84ca 100644
--- a/malloc/memusage.c
+++ b/malloc/memusage.c
@@ -148,8 +148,8 @@  update_data (struct header *result, size_t len, size_t old_len)
 
   /* Compute current heap usage and compare it with the maximum value.  */
   size_t heap
-    = catomic_exchange_and_add (&current_heap, len - old_len) + len - old_len;
-  catomic_max (&peak_heap, heap);
+    = atomic_fetch_add_relaxed (&current_heap, len - old_len) + len - old_len;
+  atomic_max (&peak_heap, heap);
 
   /* Compute current stack usage and compare it with the maximum
      value.  The base stack pointer might not be set if this is not
@@ -172,15 +172,15 @@  update_data (struct header *result, size_t len, size_t old_len)
     start_sp = sp;
   size_t current_stack = start_sp - sp;
 #endif
-  catomic_max (&peak_stack, current_stack);
+  atomic_max (&peak_stack, current_stack);
 
   /* Add up heap and stack usage and compare it with the maximum value.  */
-  catomic_max (&peak_total, heap + current_stack);
+  atomic_max (&peak_total, heap + current_stack);
 
   /* Store the value only if we are writing to a file.  */
   if (fd != -1)
     {
-      uint32_t idx = catomic_exchange_and_add (&buffer_cnt, 1);
+      uint32_t idx = atomic_fetch_add_relaxed (&buffer_cnt, 1);
       if (idx + 1 >= 2 * buffer_size)
         {
           /* We try to reset the counter to the correct range.  If
@@ -188,7 +188,7 @@  update_data (struct header *result, size_t len, size_t old_len)
              counter it does not matter since that thread will take
              care of the correction.  */
           uint32_t reset = (idx + 1) % (2 * buffer_size);
-          catomic_compare_and_exchange_val_acq (&buffer_cnt, reset, idx + 1);
+          atomic_compare_and_exchange_val_acq (&buffer_cnt, reset, idx + 1);
           if (idx >= 2 * buffer_size)
             idx = reset - 1;
         }
@@ -362,24 +362,24 @@  malloc (size_t len)
     return (*mallocp)(len);
 
   /* Keep track of number of calls.  */
-  catomic_increment (&calls[idx_malloc]);
+  atomic_fetch_add_relaxed (&calls[idx_malloc], 1);
   /* Keep track of total memory consumption for `malloc'.  */
-  catomic_add (&total[idx_malloc], len);
+  atomic_fetch_add_relaxed (&total[idx_malloc], len);
   /* Keep track of total memory requirement.  */
-  catomic_add (&grand_total, len);
+  atomic_fetch_add_relaxed (&grand_total, len);
   /* Remember the size of the request.  */
   if (len < 65536)
-    catomic_increment (&histogram[len / 16]);
+    atomic_fetch_add_relaxed (&histogram[len / 16], 1);
   else
-    catomic_increment (&large);
+    atomic_fetch_add_relaxed (&large, 1);
   /* Total number of calls of any of the functions.  */
-  catomic_increment (&calls_total);
+  atomic_fetch_add_relaxed (&calls_total, 1);
 
   /* Do the real work.  */
   result = (struct header *) (*mallocp)(len + sizeof (struct header));
   if (result == NULL)
     {
-      catomic_increment (&failed[idx_malloc]);
+      atomic_fetch_add_relaxed (&failed[idx_malloc], 1);
       return NULL;
     }
 
@@ -430,21 +430,21 @@  realloc (void *old, size_t len)
     }
 
   /* Keep track of number of calls.  */
-  catomic_increment (&calls[idx_realloc]);
+  atomic_fetch_add_relaxed (&calls[idx_realloc], 1);
   if (len > old_len)
     {
       /* Keep track of total memory consumption for `realloc'.  */
-      catomic_add (&total[idx_realloc], len - old_len);
+      atomic_fetch_add_relaxed (&total[idx_realloc], len - old_len);
       /* Keep track of total memory requirement.  */
-      catomic_add (&grand_total, len - old_len);
+      atomic_fetch_add_relaxed (&grand_total, len - old_len);
     }
 
   if (len == 0 && old != NULL)
     {
       /* Special case.  */
-      catomic_increment (&realloc_free);
+      atomic_fetch_add_relaxed (&realloc_free, 1);
       /* Keep track of total memory freed using `free'.  */
-      catomic_add (&total[idx_free], real->length);
+      atomic_fetch_add_relaxed (&total[idx_free], real->length);
 
       /* Update the allocation data and write out the records if necessary.  */
       update_data (NULL, 0, old_len);
@@ -457,26 +457,26 @@  realloc (void *old, size_t len)
 
   /* Remember the size of the request.  */
   if (len < 65536)
-    catomic_increment (&histogram[len / 16]);
+    atomic_fetch_add_relaxed (&histogram[len / 16], 1);
   else
-    catomic_increment (&large);
+    atomic_fetch_add_relaxed (&large, 1);
   /* Total number of calls of any of the functions.  */
-  catomic_increment (&calls_total);
+  atomic_fetch_add_relaxed (&calls_total, 1);
 
   /* Do the real work.  */
   result = (struct header *) (*reallocp)(real, len + sizeof (struct header));
   if (result == NULL)
     {
-      catomic_increment (&failed[idx_realloc]);
+      atomic_fetch_add_relaxed (&failed[idx_realloc], 1);
       return NULL;
     }
 
   /* Record whether the reduction/increase happened in place.  */
   if (real == result)
-    catomic_increment (&inplace);
+    atomic_fetch_add_relaxed (&inplace, 1);
   /* Was the buffer increased?  */
   if (old_len > len)
-    catomic_increment (&decreasing);
+    atomic_fetch_add_relaxed (&decreasing, 1);
 
   /* Update the allocation data and write out the records if necessary.  */
   update_data (result, len, old_len);
@@ -508,16 +508,16 @@  calloc (size_t n, size_t len)
     return (*callocp)(n, len);
 
   /* Keep track of number of calls.  */
-  catomic_increment (&calls[idx_calloc]);
+  atomic_fetch_add_relaxed (&calls[idx_calloc], 1);
   /* Keep track of total memory consumption for `calloc'.  */
-  catomic_add (&total[idx_calloc], size);
+  atomic_fetch_add_relaxed (&total[idx_calloc], size);
   /* Keep track of total memory requirement.  */
-  catomic_add (&grand_total, size);
+  atomic_fetch_add_relaxed (&grand_total, size);
   /* Remember the size of the request.  */
   if (size < 65536)
-    catomic_increment (&histogram[size / 16]);
+    atomic_fetch_add_relaxed (&histogram[size / 16], 1);
   else
-    catomic_increment (&large);
+    atomic_fetch_add_relaxed (&large, 1);
   /* Total number of calls of any of the functions.  */
   ++calls_total;
 
@@ -525,7 +525,7 @@  calloc (size_t n, size_t len)
   result = (struct header *) (*mallocp)(size + sizeof (struct header));
   if (result == NULL)
     {
-      catomic_increment (&failed[idx_calloc]);
+      atomic_fetch_add_relaxed (&failed[idx_calloc], 1);
       return NULL;
     }
 
@@ -563,7 +563,7 @@  free (void *ptr)
   /* `free (NULL)' has no effect.  */
   if (ptr == NULL)
     {
-      catomic_increment (&calls[idx_free]);
+      atomic_fetch_add_relaxed (&calls[idx_free], 1);
       return;
     }
 
@@ -577,9 +577,9 @@  free (void *ptr)
     }
 
   /* Keep track of number of calls.  */
-  catomic_increment (&calls[idx_free]);
+  atomic_fetch_add_relaxed (&calls[idx_free], 1);
   /* Keep track of total memory freed using `free'.  */
-  catomic_add (&total[idx_free], real->length);
+  atomic_fetch_add_relaxed (&total[idx_free], real->length);
 
   /* Update the allocation data and write out the records if necessary.  */
   update_data (NULL, 0, real->length);
@@ -614,22 +614,22 @@  mmap (void *start, size_t len, int prot, int flags, int fd, off_t offset)
                  ? idx_mmap_a : prot & PROT_WRITE ? idx_mmap_w : idx_mmap_r);
 
       /* Keep track of number of calls.  */
-      catomic_increment (&calls[idx]);
+      atomic_fetch_add_relaxed (&calls[idx], 1);
       /* Keep track of total memory consumption for `malloc'.  */
-      catomic_add (&total[idx], len);
+      atomic_fetch_add_relaxed (&total[idx], len);
       /* Keep track of total memory requirement.  */
-      catomic_add (&grand_total, len);
+      atomic_fetch_add_relaxed (&grand_total, len);
       /* Remember the size of the request.  */
       if (len < 65536)
-        catomic_increment (&histogram[len / 16]);
+        atomic_fetch_add_relaxed (&histogram[len / 16], 1);
       else
-        catomic_increment (&large);
+        atomic_fetch_add_relaxed (&large, 1);
       /* Total number of calls of any of the functions.  */
-      catomic_increment (&calls_total);
+      atomic_fetch_add_relaxed (&calls_total, 1);
 
       /* Check for failures.  */
       if (result == NULL)
-        catomic_increment (&failed[idx]);
+        atomic_fetch_add_relaxed (&failed[idx], 1);
       else if (idx == idx_mmap_w)
         /* Update the allocation data and write out the records if
            necessary.  Note the first parameter is NULL which means
@@ -667,22 +667,22 @@  mmap64 (void *start, size_t len, int prot, int flags, int fd, off64_t offset)
                  ? idx_mmap_a : prot & PROT_WRITE ? idx_mmap_w : idx_mmap_r);
 
       /* Keep track of number of calls.  */
-      catomic_increment (&calls[idx]);
+      atomic_fetch_add_relaxed (&calls[idx], 1);
       /* Keep track of total memory consumption for `malloc'.  */
-      catomic_add (&total[idx], len);
+      atomic_fetch_add_relaxed (&total[idx], len);
       /* Keep track of total memory requirement.  */
-      catomic_add (&grand_total, len);
+      atomic_fetch_add_relaxed (&grand_total, len);
       /* Remember the size of the request.  */
       if (len < 65536)
-        catomic_increment (&histogram[len / 16]);
+        atomic_fetch_add_relaxed (&histogram[len / 16], 1);
       else
-        catomic_increment (&large);
+        atomic_fetch_add_relaxed (&large, 1);
       /* Total number of calls of any of the functions.  */
-      catomic_increment (&calls_total);
+      atomic_fetch_add_relaxed (&calls_total, 1);
 
       /* Check for failures.  */
       if (result == NULL)
-        catomic_increment (&failed[idx]);
+        atomic_fetch_add_relaxed (&failed[idx], 1);
       else if (idx == idx_mmap_w)
         /* Update the allocation data and write out the records if
            necessary.  Note the first parameter is NULL which means
@@ -722,33 +722,33 @@  mremap (void *start, size_t old_len, size_t len, int flags, ...)
   if (!not_me && trace_mmap)
     {
       /* Keep track of number of calls.  */
-      catomic_increment (&calls[idx_mremap]);
+      atomic_fetch_add_relaxed (&calls[idx_mremap], 1);
       if (len > old_len)
         {
           /* Keep track of total memory consumption for `malloc'.  */
-          catomic_add (&total[idx_mremap], len - old_len);
+          atomic_fetch_add_relaxed (&total[idx_mremap], len - old_len);
           /* Keep track of total memory requirement.  */
-          catomic_add (&grand_total, len - old_len);
+          atomic_fetch_add_relaxed (&grand_total, len - old_len);
         }
       /* Remember the size of the request.  */
       if (len < 65536)
-        catomic_increment (&histogram[len / 16]);
+        atomic_fetch_add_relaxed (&histogram[len / 16], 1);
       else
-        catomic_increment (&large);
+        atomic_fetch_add_relaxed (&large, 1);
       /* Total number of calls of any of the functions.  */
-      catomic_increment (&calls_total);
+      atomic_fetch_add_relaxed (&calls_total, 1);
 
       /* Check for failures.  */
       if (result == NULL)
-        catomic_increment (&failed[idx_mremap]);
+        atomic_fetch_add_relaxed (&failed[idx_mremap], 1);
       else
         {
           /* Record whether the reduction/increase happened in place.  */
           if (start == result)
-            catomic_increment (&inplace_mremap);
+            atomic_fetch_add_relaxed (&inplace_mremap, 1);
           /* Was the buffer increased?  */
           if (old_len > len)
-            catomic_increment (&decreasing_mremap);
+            atomic_fetch_add_relaxed (&decreasing_mremap, 1);
 
           /* Update the allocation data and write out the records if
              necessary.  Note the first parameter is NULL which means
@@ -783,19 +783,19 @@  munmap (void *start, size_t len)
   if (!not_me && trace_mmap)
     {
       /* Keep track of number of calls.  */
-      catomic_increment (&calls[idx_munmap]);
+      atomic_fetch_add_relaxed (&calls[idx_munmap], 1);
 
       if (__glibc_likely (result == 0))
         {
           /* Keep track of total memory freed using `free'.  */
-          catomic_add (&total[idx_munmap], len);
+          atomic_fetch_add_relaxed (&total[idx_munmap], len);
 
           /* Update the allocation data and write out the records if
              necessary.  */
           update_data (NULL, 0, len);
         }
       else
-        catomic_increment (&failed[idx_munmap]);
+        atomic_fetch_add_relaxed (&failed[idx_munmap], 1);
     }
 
   return result;
diff --git a/manual/memory.texi b/manual/memory.texi
index 23a039c57e60c81787252d935e3b309fd8290902..5cb1dbd281006148f23cfa38c5703fb79089ba78 100644
--- a/manual/memory.texi
+++ b/manual/memory.texi
@@ -354,7 +354,7 @@  this function is in @file{stdlib.h}.
 @c that's protected by list_lock; next_free is only modified while
 @c list_lock is held too.  All other data members of an arena, as well
 @c as the metadata of the memory areas assigned to it, are only modified
-@c while holding the arena's mutex (fastbin pointers use catomic ops
+@c while holding the arena's mutex (fastbin pointers use atomic ops
 @c because they may be modified by free without taking the arena's
 @c lock).  Some reassurance was needed for fastbins, for it wasn't clear
 @c how they were initialized.  It turns out they are always
@@ -383,7 +383,7 @@  this function is in @file{stdlib.h}.
 @c     mutex_lock (arena lock) dup @asulock @aculock [returns locked]
 @c    __get_nprocs ext ok @acsfd
 @c    NARENAS_FROM_NCORES ok
-@c    catomic_compare_and_exchange_bool_acq ok
+@c    atomic_compare_and_exchange_bool_acq ok
 @c    _int_new_arena ok @asulock @aculock @acsmem
 @c     new_heap ok @acsmem
 @c      mmap ok @acsmem
@@ -397,7 +397,7 @@  this function is in @file{stdlib.h}.
 @c     mutex_lock (list_lock) dup @asulock @aculock
 @c     atomic_write_barrier ok
 @c     mutex_unlock (list_lock) @aculock
-@c    catomic_decrement ok
+@c    atomic_decrement ok
 @c    reused_arena @asulock @aculock
 @c      reads&writes next_to_use and iterates over arena next without guards
 @c      those are harmless as long as we don't drop arenas from the
@@ -414,7 +414,7 @@  this function is in @file{stdlib.h}.
 @c   get_max_fast ok
 @c   fastbin_index ok
 @c   fastbin ok
-@c   catomic_compare_and_exhange_val_acq ok
+@c   atomic_compare_and_exhange_val_acq ok
 @c   malloc_printerr dup @mtsenv
 @c     if we get to it, we're toast already, undefined behavior must have
 @c     been invoked before
@@ -521,10 +521,10 @@  this function is in @file{stdlib.h}.
 @c     chunk2mem dup ok
 @c     free_perturb ok
 @c     set_fastchunks ok
-@c      catomic_and ok
+@c      atomic_and ok
 @c     fastbin_index dup ok
 @c     fastbin dup ok
-@c     catomic_compare_and_exchange_val_rel ok
+@c     atomic_compare_and_exchange_val_rel ok
 @c     chunk_is_mmapped ok
 @c     contiguous dup ok
 @c     prev_inuse ok
@@ -706,7 +706,7 @@  The prototype for this function is in @file{stdlib.h}.
 @safety{@prelim{}@mtsafe{}@asunsafe{@asulock{}}@acunsafe{@aculock{} @acsfd{} @acsmem{}}}
 @c __libc_free @asulock @aculock @acsfd @acsmem
 @c   releasing memory into fastbins modifies the arena without taking
-@c   its mutex, but catomic operations ensure safety.  If two (or more)
+@c   its mutex, but atomic operations ensure safety.  If two (or more)
 @c   threads are running malloc and have their own arenas locked when
 @c   each gets a signal whose handler free()s large (non-fastbin-able)
 @c   blocks from each other's arena, we deadlock; this is a more general
diff --git a/misc/tst-atomic.c b/misc/tst-atomic.c
index 6d681a7bfdf4f48b4c04a073ebd480326dbd3cc8..4f9d2c1a46b363d346dbc2fa0962ae196844a43a 100644
--- a/misc/tst-atomic.c
+++ b/misc/tst-atomic.c
@@ -393,117 +393,6 @@  do_test (void)
     }
 #endif
 
-#ifdef catomic_compare_and_exchange_val_acq
-  mem = 24;
-  if (catomic_compare_and_exchange_val_acq (&mem, 35, 24) != 24
-      || mem != 35)
-    {
-      puts ("catomic_compare_and_exchange_val_acq test 1 failed");
-      ret = 1;
-    }
-
-  mem = 12;
-  if (catomic_compare_and_exchange_val_acq (&mem, 10, 15) != 12
-      || mem != 12)
-    {
-      puts ("catomic_compare_and_exchange_val_acq test 2 failed");
-      ret = 1;
-    }
-
-  mem = -15;
-  if (catomic_compare_and_exchange_val_acq (&mem, -56, -15) != -15
-      || mem != -56)
-    {
-      puts ("catomic_compare_and_exchange_val_acq test 3 failed");
-      ret = 1;
-    }
-
-  mem = -1;
-  if (catomic_compare_and_exchange_val_acq (&mem, 17, 0) != -1
-      || mem != -1)
-    {
-      puts ("catomic_compare_and_exchange_val_acq test 4 failed");
-      ret = 1;
-    }
-#endif
-
-  mem = 24;
-  if (catomic_compare_and_exchange_bool_acq (&mem, 35, 24)
-      || mem != 35)
-    {
-      puts ("catomic_compare_and_exchange_bool_acq test 1 failed");
-      ret = 1;
-    }
-
-  mem = 12;
-  if (! catomic_compare_and_exchange_bool_acq (&mem, 10, 15)
-      || mem != 12)
-    {
-      puts ("catomic_compare_and_exchange_bool_acq test 2 failed");
-      ret = 1;
-    }
-
-  mem = -15;
-  if (catomic_compare_and_exchange_bool_acq (&mem, -56, -15)
-      || mem != -56)
-    {
-      puts ("catomic_compare_and_exchange_bool_acq test 3 failed");
-      ret = 1;
-    }
-
-  mem = -1;
-  if (! catomic_compare_and_exchange_bool_acq (&mem, 17, 0)
-      || mem != -1)
-    {
-      puts ("catomic_compare_and_exchange_bool_acq test 4 failed");
-      ret = 1;
-    }
-
-  mem = 2;
-  if (catomic_exchange_and_add (&mem, 11) != 2
-      || mem != 13)
-    {
-      puts ("catomic_exchange_and_add test failed");
-      ret = 1;
-    }
-
-  mem = -21;
-  catomic_add (&mem, 22);
-  if (mem != 1)
-    {
-      puts ("catomic_add test failed");
-      ret = 1;
-    }
-
-  mem = -1;
-  catomic_increment (&mem);
-  if (mem != 0)
-    {
-      puts ("catomic_increment test failed");
-      ret = 1;
-    }
-
-  mem = 2;
-  if (catomic_increment_val (&mem) != 3)
-    {
-      puts ("catomic_increment_val test failed");
-      ret = 1;
-    }
-
-  mem = 17;
-  catomic_decrement (&mem);
-  if (mem != 16)
-    {
-      puts ("catomic_decrement test failed");
-      ret = 1;
-    }
-
-  if (catomic_decrement_val (&mem) != 15)
-    {
-      puts ("catomic_decrement_val test failed");
-      ret = 1;
-    }
-
   /* Tests for C11-like atomics.  */
   mem = 11;
   if (atomic_load_relaxed (&mem) != 11 || atomic_load_acquire (&mem) != 11)
diff --git a/sysdeps/hppa/dl-fptr.c b/sysdeps/hppa/dl-fptr.c
index 9ed21602d6155d4b960278f8d1fac4ffa885b9d5..a814769b0b5cb492561f54f32575808e00b4dd24 100644
--- a/sysdeps/hppa/dl-fptr.c
+++ b/sysdeps/hppa/dl-fptr.c
@@ -43,7 +43,7 @@ 
 
 #ifndef COMPARE_AND_SWAP
 # define COMPARE_AND_SWAP(ptr, old, new) \
-  (catomic_compare_and_exchange_bool_acq (ptr, new, old) == 0)
+  (atomic_compare_and_exchange_bool_acq (ptr, new, old) == 0)
 #endif
 
 ElfW(Addr) _dl_boot_fptr_table [ELF_MACHINE_BOOT_FPTR_TABLE_LEN];
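
COMPARE_AND_SWAP here only needs a boolean result with acquire ordering on
success.  A rough sketch of that contract in terms of the compiler builtin
(illustrative function name; it assumes the generic acquire CAS is built on
__atomic_compare_exchange_n, as the s390 definition below is):

  #include <link.h>	/* ElfW */

  /* Return nonzero if *PTR contained OLD and was replaced by NEW.  */
  static int
  boot_table_cas (ElfW(Addr) *ptr, ElfW(Addr) old, ElfW(Addr) new)
  {
    ElfW(Addr) expected = old;
    return __atomic_compare_exchange_n (ptr, &expected, new,
					0 /* strong */,
					__ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
  }
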
diff --git a/sysdeps/s390/atomic-machine.h b/sysdeps/s390/atomic-machine.h
index e85b2ef50541c7aab6d2981180f6205d2bd681b6..6b1de51c2a30baf5554a729a80a7ce04b56fc22c 100644
--- a/sysdeps/s390/atomic-machine.h
+++ b/sysdeps/s390/atomic-machine.h
@@ -70,8 +70,6 @@ 
     !__atomic_compare_exchange_n (mem, (void *) &__atg2_oldval, newval,	\
 				  1, __ATOMIC_ACQUIRE,			\
 				  __ATOMIC_RELAXED); })
-#define catomic_compare_and_exchange_bool_acq(mem, newval, oldval)	\
-  atomic_compare_and_exchange_bool_acq (mem, newval, oldval)
 
 /* Store NEWVALUE in *MEM and return the old value.  */
 #define atomic_exchange_acq(mem, newvalue)				\
@@ -90,8 +88,6 @@ 
 # define atomic_exchange_and_add_rel(mem, operand)			\
   ({ __atomic_check_size((mem));					\
   __atomic_fetch_add ((mem), (operand), __ATOMIC_RELEASE); })
-#define catomic_exchange_and_add(mem, value)	\
-  atomic_exchange_and_add (mem, value)
 
 /* Atomically *mem |= mask and return the old value of *mem.  */
 /* The gcc builtin uses load-and-or instruction on z196 zarch and higher cpus
@@ -104,8 +100,6 @@ 
   do {						\
     atomic_or_val (mem, mask);			\
   } while (0)
-#define catomic_or(mem, mask)			\
-  atomic_or (mem, mask)
 
 /* Atomically *mem |= 1 << bit and return true if the bit was set in old value
    of *mem.  */
@@ -129,5 +123,3 @@ 
   do {						\
     atomic_and_val (mem, mask);			\
   } while (0)
-#define catomic_and(mem, mask)			\
-  atomic_and(mem, mask)
diff --git a/sysdeps/unix/sysv/linux/riscv/atomic-machine.h b/sysdeps/unix/sysv/linux/riscv/atomic-machine.h
index 9ae89e0ef12ad28319755ac51260908779b9579f..f4b2cbced828a80335887bf172fd60767cf978ac 100644
--- a/sysdeps/unix/sysv/linux/riscv/atomic-machine.h
+++ b/sysdeps/unix/sysv/linux/riscv/atomic-machine.h
@@ -170,10 +170,6 @@ 
   ({ typeof (*mem) __mask = (typeof (*mem))1 << (bit);    \
      asm_amo ("amoor", ".aq", mem, __mask) & __mask; })
 
-# define catomic_exchange_and_add(mem, value)		\
-  atomic_exchange_and_add (mem, value)
-# define catomic_max(mem, value) atomic_max (mem, value)
-
 #else /* __riscv_atomic */
 # error "ISAs that do not subsume the A extension are not supported"
 #endif /* !__riscv_atomic */
diff --git a/sysdeps/x86/atomic-machine.h b/sysdeps/x86/atomic-machine.h
index f24f1c71ed718c601c71decc1ee0c4b49fdf32f8..920390f4e629106040415af4623fcd826b242da2 100644
--- a/sysdeps/x86/atomic-machine.h
+++ b/sysdeps/x86/atomic-machine.h
@@ -52,52 +52,7 @@ 
   (! __sync_bool_compare_and_swap (mem, oldval, newval))
 
 
-#define __arch_c_compare_and_exchange_val_8_acq(mem, newval, oldval) \
-  ({ __typeof (*mem) ret;						      \
-     __asm __volatile ("cmpl $0, %%" SEG_REG ":%P5\n\t"			      \
-		       "je 0f\n\t"					      \
-		       "lock\n"						      \
-		       "0:\tcmpxchgb %b2, %1"				      \
-		       : "=a" (ret), "=m" (*mem)			      \
-		       : BR_CONSTRAINT (newval), "m" (*mem), "0" (oldval),    \
-			 "i" (offsetof (tcbhead_t, multiple_threads)));	      \
-     ret; })
-
-#define __arch_c_compare_and_exchange_val_16_acq(mem, newval, oldval) \
-  ({ __typeof (*mem) ret;						      \
-     __asm __volatile ("cmpl $0, %%" SEG_REG ":%P5\n\t"			      \
-		       "je 0f\n\t"					      \
-		       "lock\n"						      \
-		       "0:\tcmpxchgw %w2, %1"				      \
-		       : "=a" (ret), "=m" (*mem)			      \
-		       : BR_CONSTRAINT (newval), "m" (*mem), "0" (oldval),    \
-			 "i" (offsetof (tcbhead_t, multiple_threads)));	      \
-     ret; })
-
-#define __arch_c_compare_and_exchange_val_32_acq(mem, newval, oldval) \
-  ({ __typeof (*mem) ret;						      \
-     __asm __volatile ("cmpl $0, %%" SEG_REG ":%P5\n\t"			      \
-		       "je 0f\n\t"					      \
-		       "lock\n"						      \
-		       "0:\tcmpxchgl %2, %1"				      \
-		       : "=a" (ret), "=m" (*mem)			      \
-		       : BR_CONSTRAINT (newval), "m" (*mem), "0" (oldval),    \
-			 "i" (offsetof (tcbhead_t, multiple_threads)));       \
-     ret; })
-
 #ifdef __x86_64__
-# define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval) \
-  ({ __typeof (*mem) ret;						      \
-     __asm __volatile ("cmpl $0, %%fs:%P5\n\t"				      \
-		       "je 0f\n\t"					      \
-		       "lock\n"						      \
-		       "0:\tcmpxchgq %q2, %1"				      \
-		       : "=a" (ret), "=m" (*mem)			      \
-		       : "q" ((int64_t) cast_to_integer (newval)),	      \
-			 "m" (*mem),					      \
-			 "0" ((int64_t) cast_to_integer (oldval)),	      \
-			 "i" (offsetof (tcbhead_t, multiple_threads)));	      \
-     ret; })
 # define do_exchange_and_add_val_64_acq(pfx, mem, value) 0
 # define do_add_val_64_acq(pfx, mem, value) do { } while (0)
 #else
@@ -107,13 +62,6 @@ 
    such an operation.  So don't define any code for now.  If it is
    really going to be used the code below can be used on Intel Pentium
    and later, but NOT on i486.  */
-# define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval) \
-  ({ __typeof (*mem) ret = *(mem);					      \
-     __atomic_link_error ();						      \
-     ret = (newval);							      \
-     ret = (oldval);							      \
-     ret; })
-
 # define __arch_compare_and_exchange_val_64_acq(mem, newval, oldval)	      \
   ({ __typeof (*mem) ret = *(mem);					      \
      __atomic_link_error ();						      \
@@ -206,14 +154,6 @@ 
 #define atomic_exchange_and_add(mem, value) \
   __sync_fetch_and_add (mem, value)
 
-#define __arch_exchange_and_add_cprefix \
-  "cmpl $0, %%" SEG_REG ":%P4\n\tje 0f\n\tlock\n0:\t"
-
-#define catomic_exchange_and_add(mem, value) \
-  __arch_exchange_and_add_body (__arch_exchange_and_add_cprefix, __arch_c,    \
-				mem, value)
-
-
 #define __arch_add_body(lock, pfx, apfx, mem, value) \
   do {									      \
     if (__builtin_constant_p (value) && (value) == 1)			      \
@@ -248,13 +188,6 @@ 
 # define atomic_add(mem, value) \
   __arch_add_body (LOCK_PREFIX, atomic, __arch, mem, value)
 
-#define __arch_add_cprefix \
-  "cmpl $0, %%" SEG_REG ":%P3\n\tje 0f\n\tlock\n0:\t"
-
-#define catomic_add(mem, value) \
-  __arch_add_body (__arch_add_cprefix, atomic, __arch_c, mem, value)
-
-
 #define atomic_add_negative(mem, value) \
   ({ unsigned char __result;						      \
      if (sizeof (*mem) == 1)						      \
@@ -331,13 +264,6 @@ 
 
 #define atomic_increment(mem) __arch_increment_body (LOCK_PREFIX, __arch, mem)
 
-#define __arch_increment_cprefix \
-  "cmpl $0, %%" SEG_REG ":%P2\n\tje 0f\n\tlock\n0:\t"
-
-#define catomic_increment(mem) \
-  __arch_increment_body (__arch_increment_cprefix, __arch_c, mem)
-
-
 #define atomic_increment_and_test(mem) \
   ({ unsigned char __result;						      \
      if (sizeof (*mem) == 1)						      \
@@ -389,13 +315,6 @@ 
 
 #define atomic_decrement(mem) __arch_decrement_body (LOCK_PREFIX, __arch, mem)
 
-#define __arch_decrement_cprefix \
-  "cmpl $0, %%" SEG_REG ":%P2\n\tje 0f\n\tlock\n0:\t"
-
-#define catomic_decrement(mem) \
-  __arch_decrement_body (__arch_decrement_cprefix, __arch_c, mem)
-
-
 #define atomic_decrement_and_test(mem) \
   ({ unsigned char __result;						      \
      if (sizeof (*mem) == 1)						      \
@@ -493,14 +412,8 @@ 
       __atomic_link_error ();						      \
   } while (0)
 
-#define __arch_cprefix \
-  "cmpl $0, %%" SEG_REG ":%P3\n\tje 0f\n\tlock\n0:\t"
-
 #define atomic_and(mem, mask) __arch_and_body (LOCK_PREFIX, mem, mask)
 
-#define catomic_and(mem, mask) __arch_and_body (__arch_cprefix, mem, mask)
-
-
 #define __arch_or_body(lock, mem, mask) \
   do {									      \
     if (sizeof (*mem) == 1)						      \
@@ -529,8 +442,6 @@ 
 
 #define atomic_or(mem, mask) __arch_or_body (LOCK_PREFIX, mem, mask)
 
-#define catomic_or(mem, mask) __arch_or_body (__arch_cprefix, mem, mask)
-
 /* We don't use mfence because it is supposedly slower due to having to
    provide stronger guarantees (e.g., regarding self-modifying code).  */
 #define atomic_full_barrier() \
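
The removed __arch_*_cprefix asm above tests tcbhead_t.multiple_threads and
branches around the lock prefix, so single-threaded processes got the unlocked
instruction.  A rough C-level illustration of that fast path (a sketch only --
the real check is the inline cmpl/je on the thread pointer, not a function
argument):

  /* Hypothetical illustration of the single-thread fast path the catomic
     x86 macros implemented in asm.  */
  static inline void
  counter_add (unsigned long *mem, unsigned long value, int multiple_threads)
  {
    if (!multiple_threads)
      *mem += value;			/* no lock prefix needed */
    else
      __atomic_fetch_add (mem, value, __ATOMIC_RELAXED);	/* locked add */
  }
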